diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index cc625f95ff69e..658058d846bb9 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -87,3 +87,6 @@ e2c2ffbe7a1b5d9e32a2ce64279475b50c4cba5b
 
 # [lldb][nfc] Deindent ProcessGDBRemote::SetThreadStopInfo by two levels
 b32931c5b32eb0d2cf37d688b34f8548c9674c19
+
+# [libc++][NFC] Fix inconsistent quoting and spacing in our CSV files
+64946fdaf9864d8279da1c30e4d7214fe13d1427
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
index 5e4f3b536d10d..ef0d7e986fda7 100644
--- a/.github/workflows/issue-write.yml
+++ b/.github/workflows/issue-write.yml
@@ -24,14 +24,21 @@ jobs:
         github.event.workflow_run.conclusion == 'failure'
       )
     steps:
+      - name: Fetch Sources
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            .github/workflows/unprivileged-download-artifact/action.yml
+          sparse-checkout-cone-mode: false
       - name: 'Download artifact'
-        uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+        uses: ./.github/workflows/unprivileged-download-artifact
+        id: download-artifact
         with:
-          github-token: ${{ secrets.ISSUE_WRITE_DOWNLOAD_ARTIFACT }}
           run-id: ${{ github.event.workflow_run.id }}
-          name: workflow-args
+          artifact-name: workflow-args
 
       - name: 'Comment on PR'
+        if: steps.download-artifact.outputs.artifact-id != ''
         uses: actions/github-script@v7
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -144,5 +151,7 @@ jobs:
             });
 
       - name: Dump comments file
-        if: always()
+        if: >-
+          always() &&
+          steps.download-artifact.outputs.artifact-id != ''
         run: cat comments
diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml
index 0a228c41f354e..17a54be16badc 100644
--- a/.github/workflows/llvm-project-tests.yml
+++ b/.github/workflows/llvm-project-tests.yml
@@ -131,6 +131,7 @@ jobs:
                 -DCMAKE_BUILD_TYPE=Release \
                 -DLLVM_ENABLE_ASSERTIONS=ON \
                 -DLLDB_INCLUDE_TESTS=OFF \
+                -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" \
                 -DCMAKE_C_COMPILER_LAUNCHER=sccache \
                 -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
                 $extra_cmake_args \
@@ -142,8 +143,6 @@ jobs:
         env:
           LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }}
         run: |
-          # Make sure all of LLVM libraries that llvm-config needs are built.
+          # The libclc tests don't have a generated check target so all we can
+          # do is build it.
           ninja -C "$LLVM_BUILDDIR"
-          cmake -G Ninja -S libclc -B libclc-build -DLLVM_DIR="$LLVM_BUILDDIR"/lib/cmake/llvm -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl"
-          ninja -C libclc-build
-          ninja -C libclc-build test
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
new file mode 100644
index 0000000000000..355e7fee1dae0
--- /dev/null
+++ b/.github/workflows/release-asset-audit.py
@@ -0,0 +1,51 @@
+import github
+import sys
+
+def main():
+    token = sys.argv[1]
+
+    gh = github.Github(login_or_token=token)
+    repo = gh.get_repo("llvm/llvm-project")
+
+    uploaders = set(
+        [
+            "DimitryAndric",
+            "stefanp-ibm",
+            "lei137",
+            "omjavaid",
+            "nicolerabjohn",
+            "amy-kwan",
+            "mandlebug",
+            "zmodem",
+            "androm3da",
+            "tru",
+            "rovka",
+            "rorth",
+            "quinnlp",
+            "kamaub",
+            "abrisco",
+            "jakeegan",
+            "maryammo",
+            "tstellar",
+            "github-actions[bot]",
+        ]
+    )
+
+    for release in repo.get_releases():
+        print("Release:", release.title)
+        for asset in release.get_assets():
+            created_at = asset.created_at
+            updated_at = (
+                "" if asset.created_at == asset.updated_at else asset.updated_at
+            )
+            print(
+                f"{asset.name} : {asset.uploader.login} [{created_at} {updated_at}] ( {asset.download_count} )"
+            )
+            if asset.uploader.login not in uploaders:
+                with open('comment', 'w') as file:
+                    file.write(f'@{asset.uploader.login} is not a valid uploader.')
+                sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
new file mode 100644
index 0000000000000..018c5d542f32e
--- /dev/null
+++ b/.github/workflows/release-asset-audit.yml
@@ -0,0 +1,54 @@
+name: Release Asset Audit
+
+on:
+  workflow_dispatch:
+  release:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    # Run once an hour
+    - cron:  '5 * * * *'
+
+  pull_request:
+    paths:
+      - ".github/workflows/release-asset-audit.py"
+      - ".github/workflows/release-asset-audit.yml"
+
+permissions:
+  contents: read # Default everything to read-only
+
+jobs:
+  audit:
+    name: "Release Asset Audit"
+    runs-on: ubuntu-22.04
+    if: github.repository == 'llvm/llvm-project'
+    steps:
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 #v4.1.6
+      - name: "Run Audit Script"
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
+          python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN
+      - name: "File Issue"
+        if: >-
+          github.event_name != 'pull_request' &&
+          failure()
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+        with:
+          github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
+          script: |
+            var fs = require('fs');
+            var body = ''
+            if (fs.existsSync('./comment')) {
+              body = fs.readFileSync('./comment') + "\n\n";
+            }
+            body = body + `\n\nhttps://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
+
+            const issue = await github.rest.issues.create({
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               title: "Release Asset Audit Failed",
+               labels: ['infrastructure'],
+               body: body
+            });
+            console.log(issue);
diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml
new file mode 100644
index 0000000000000..73c9d96946e33
--- /dev/null
+++ b/.github/workflows/release-binaries-all.yml
@@ -0,0 +1,94 @@
+name: Release Binaries All
+
+permissions:
+  contents: read # Default everything to read-only
+
+on:
+  workflow_dispatch:
+    inputs:
+      release-version:
+        description: 'Release Version'
+        required: true
+        type: string
+      upload:
+        description: 'Upload binaries to the release page'
+        required: true
+        default: false
+        type: boolean
+
+  workflow_call:
+    inputs:
+      release-version:
+        description: 'Release Version'
+        required: true
+        type: string
+      upload:
+        description: 'Upload binaries to the release page'
+        required: true
+        default: false
+        type: boolean
+
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      # When a PR is closed, we still start this workflow, but then skip
+      # all the jobs, which makes it effectively a no-op.  The reason to
+      # do this is that it allows us to take advantage of concurrency groups
+      # to cancel in progress CI jobs whenever the PR is closed.
+      - closed
+    paths:
+      - '.github/workflows/release-binaries-all.yml'
+      - '.github/workflows/release-binaries.yml'
+      - '.github/workflows/release-binaries-setup-stage/*'
+      - '.github/workflows/release-binaries-save-stage/*'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'dispatch' }}
+  cancel-in-progress: True
+
+jobs:
+  setup-variables:
+    if: >-
+      (github.event_name != 'pull_request' || github.event.action != 'closed')
+    runs-on: ubuntu-22.04
+    outputs:
+      release-version: ${{ steps.vars.outputs.release-version }}
+      upload: ${{ steps.vars.outputs.upload }}
+    steps:
+      - shell: bash
+        id: vars
+        run: |
+          upload="${{ inputs.upload }}"
+          release_version="${{ inputs.release-version }}"
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            upload="false"
+            release_version=""
+          fi
+          echo "release-version=$release_version" >> "$GITHUB_OUTPUT"
+          echo "upload=$upload" >> "$GITHUB_OUTPUT"
+
+  release-binaries-all:
+    name: Build Release Binaries
+    needs:
+      - setup-variables
+    permissions:
+      contents: write # For release uploads
+      id-token: write     # For artifact attestations
+      attestations: write # For artifact attestations
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on:
+          - ubuntu-22.04
+          - windows-2022
+          - macos-13
+          - macos-14
+
+    uses: ./.github/workflows/release-binaries.yml
+    with:
+      release-version: "${{ needs.setup-variables.outputs.release-version }}"
+      upload: ${{ needs.setup-variables.outputs.upload == 'true'}}
+      runs-on: "${{ matrix.runs-on }}"
+
diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml
new file mode 100644
index 0000000000000..e2f3eeadd15be
--- /dev/null
+++ b/.github/workflows/release-binaries-save-stage/action.yml
@@ -0,0 +1,38 @@
+name: Save Stage
+description: >-
+  Upload the source and binary directories from a build stage so that they
+  can be re-used in the next stage.  This action is used to the release
+  binaries workflow into multiple stages to avoid the 6 hour timeout on
+  the GitHub hosted runners.
+inputs:
+  build-prefix:
+    description: "Directory containing the build directory."
+    required: true
+    type: 'string'
+
+runs:
+  using: "composite"
+  steps:
+    # We need to create an archive of the build directory, because it has too
+    # many files to upload.
+    - name: Package Build and Source Directories
+      shell: bash
+      run: |
+        # Windows does not support symlinks, so we need to dereference them.
+        tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst
+        mv ../llvm-project.tar.zst .
+        tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst
+
+    - name: Upload Stage 1 Source
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source
+        path: llvm-project.tar.zst
+        retention-days: 2
+
+    - name: Upload Stage 1 Build Dir
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build
+        path: build.tar.zst
+        retention-days: 2
diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml
new file mode 100644
index 0000000000000..f5e5db27e6595
--- /dev/null
+++ b/.github/workflows/release-binaries-setup-stage/action.yml
@@ -0,0 +1,59 @@
+name: Setup Stage
+description: >-
+  Setup the next stage of the release binaries workflow.  This sets up the
+  environment correctly for a new stage of the release binaries workflow
+  and also restores the source and build directory from the previous stage.
+
+inputs:
+  previous-artifact:
+    description: >-
+      A unique descriptor for the artifact from the previous stage.  This will
+      be used to construct the final artifact pattern, which is:
+      $RUNNER_OS-$RUNNER_ARCH-$PREVIOUS_ARTIFACT-*
+    required: false
+    type: 'string'
+
+outputs:
+  build-prefix:
+    description: "Directory containing the build directory."
+    value: ${{ steps.build-prefix.outputs.build-prefix }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+   
+    - name: Setup Windows
+      if: startsWith(runner.os, 'Windows')
+      uses: llvm/actions/setup-windows@main
+      with:
+        arch: amd64
+
+    - name: Set Build Prefix
+      id: build-prefix
+      shell: bash
+      run: |
+        build_prefix=`pwd`
+        if [ "${{ runner.os }}" = "Linux" ]; then
+          sudo chown $USER:$USER /mnt/
+          build_prefix=/mnt/
+        fi
+        echo "build-prefix=$build_prefix" >> $GITHUB_OUTPUT
+
+    - name: Download Previous Stage Artifact
+      if: ${{ inputs.previous-artifact }}
+      id: download
+      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+      with:
+        pattern: ${{ runner.os }}-${{ runner.arch }}-${{ inputs.previous-artifact }}-*
+        merge-multiple: true
+
+    - name: Unpack Artifact
+      if: ${{ steps.download.outputs.download-path }}
+      shell: bash
+      run: |
+        tar --zstd -xf llvm-project.tar.zst
+        rm llvm-project.tar.zst
+        tar --zstd -C ${{ steps.build-prefix.outputs.build-prefix}} -xf build.tar.zst
+        rm build.tar.zst
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 7de4d00334d14..b1b046dbad5f8 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -5,28 +5,38 @@ on:
     inputs:
       release-version:
         description: 'Release Version'
-        required: true
+        required: false
         type: string
       upload:
         description: 'Upload binaries to the release page'
         required: true
         default: false
         type: boolean
+      runs-on:
+        description: "Runner to use for the build"
+        required: true
+        type: choice
+        options:
+          - ubuntu-22.04
+          - windows-2022
+          - macos-13
+          - macos-14
 
   workflow_call:
     inputs:
       release-version:
         description: 'Release Version'
-        required: true
+        required: false
         type: string
       upload:
         description: 'Upload binaries to the release page'
         required: true
         default: false
         type: boolean
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    - cron:  '0 8 1 * *'
+      runs-on:
+        description: "Runner to use for the build"
+        required: true
+        type: string
 
 permissions:
   contents: read # Default everything to read-only
@@ -34,30 +44,39 @@ permissions:
 jobs:
   prepare:
     name: Prepare to build binaries
-    runs-on: ubuntu-22.04
+    runs-on: ${{ inputs.runs-on }}
     if: github.repository == 'llvm/llvm-project'
     outputs:
       release-version: ${{ steps.vars.outputs.release-version }}
       ref: ${{ steps.vars.outputs.ref }}
       upload: ${{ steps.vars.outputs.upload }}
+      target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }}
+      build-flang: ${{ steps.vars.outputs.build-flang }}
+      enable-pgo: ${{ steps.vars.outputs.enable-pgo }}
+      release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }}
+      release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }}
 
     steps:
     - name: Checkout LLVM
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 
     - name: Install Dependencies
+      shell: bash
       run: |
         pip install --require-hashes -r ./llvm/utils/git/requirements.txt
 
     - name: Check Permissions
+      if: github.event_name != 'pull_request'
       env:
         GITHUB_TOKEN: ${{ github.token }}
         USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
+      shell: bash
       run: |
         ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
 
     - name: Collect Variables
       id: vars
+      shell: bash
       # In order for the test-release.sh script to run correctly, the LLVM
       # source needs to be at the following location relative to the build dir:
       # | X.Y.Z-rcN | ./rcN/llvm-project
@@ -67,242 +86,393 @@ jobs:
       # | X.Y.Z-rcN | -rc N -test-asserts
       # | X.Y.Z     | -final
       run: |
-        tag="${{ github.ref_name }}"
         trimmed=$(echo ${{ inputs.release-version }} | xargs)
-        [[ "$trimmed" != "" ]] && tag="llvmorg-$trimmed"
-        if [ "$tag" = "main" ]; then
-          # If tag is main, then we've been triggered by a scheduled so pass so
-          # use the head commit as the tag.
-          tag=`git rev-parse HEAD`
+        if [ -n "$trimmed" ]; then
+          release_version="$trimmed"
+          ref="llvmorg-$release_version"
+        else
+          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-${{ github.sha }}"
+          ref=${{ github.sha }}
         fi
         if [ -n "${{ inputs.upload }}" ]; then
           upload="${{ inputs.upload }}"
         else
           upload="false"
         fi
-        bash .github/workflows/set-release-binary-outputs.sh "$tag" "$upload"
+        echo "release-version=$release_version">> $GITHUB_OUTPUT
+        echo "ref=$ref" >> $GITHUB_OUTPUT
+        echo "upload=$upload" >> $GITHUB_OUTPUT
+
+        release_binary_basename="LLVM-$release_version-${{ runner.os }}-${{ runner.arch }}"
+        echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
+        echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
+
+        # Detect necessary CMake flags
+        target="${{ runner.os }}-${{ runner.arch }}"
+        echo "enable-pgo=false" >> $GITHUB_OUTPUT
+        target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
+        # The macOS builds try to cross compile some libraries so we need to
+        # add extra CMake args to disable them.
+        # See https://github.com/llvm/llvm-project/issues/99767
+        if [ "${{ runner.os }}" = "macOS" ]; then
+          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
+          if [ "${{ runner.arch }}" = "ARM64" ]; then
+            arches=arm64
+          else
+            arches=x86_64
+          fi
+          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
+        fi
+
+        # x86 macOS and x86 Windows have trouble building flang, so disable it.
+        # Windows: https://github.com/llvm/llvm-project/issues/100202
+        # macOS: 'rebase opcodes terminated early at offset 1 of 80016' when building __fortran_builtins.mod
+        build_flang="true"
+
+        if [ "$target" = "Windows-X64" ]; then
+          target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS=\"clang;lld;lldb;clang-tools-extra;bolt;polly;mlir\""
+          build_flang="false"
+        fi
+
+        if [ "${{ runner.os }}" = "Windows" ]; then
+          # The build times out on Windows, so we need to disable LTO.
+          target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
+        fi
 
-  build-stage1-linux:
-    name: "Build Stage 1 Linux"
+        echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
+        echo "build-flang=$build_flang" >> $GITHUB_OUTPUT
+
+  build-stage1:
+    name: "Build Stage 1"
     needs: prepare
-    runs-on: ubuntu-22.04
     if: github.repository == 'llvm/llvm-project'
+    runs-on: ${{ inputs.runs-on }}
     steps:
+
+    - name: Checkout Actions
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        # Check out outside of working directory so the source checkout doesn't
+        # remove it.
+        path: workflows
+
+    # actions/checkout does not support paths outside of the GITHUB_WORKSPACE.
+    # Also, anything that we put inside of GITHUB_WORKSPACE will be overwritten
+    # by future actions/checkout steps.  Therefore, in order to checkout the
+    # latest actions from main, we need to first checkout out the actions inside of
+    # GITHUB_WORKSPACE (see previous step), then use actions/checkout to checkout
+    # the code being built and the move the actions from main back into GITHUB_WORKSPACE,
+    # becasue the uses on composite actions only reads workflows from inside GITHUB_WORKSPACE.
+    - shell: bash
+      run: mv workflows  ../workflows-main
+
     - name: Checkout LLVM
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         ref: ${{ needs.prepare.outputs.ref }}
 
-    - name: Install Ninja
-      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+    - name: Copy main workflows
+      shell: bash
+      run: |
+        mv ../workflows-main .
+
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows-main/.github/workflows/release-binaries-setup-stage
 
     - name: Setup sccache
       uses: hendrikmuhs/ccache-action@ca3acd2731eef11f1572ccb126356c2f9298d35e # v1.2.9
       with:
-        max-size: 250M
-        key: sccache-${{ runner.os }}-release
+        # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
+        max-size: 2G
+        key: sccache-${{ runner.os }}-${{ runner.arch }}-release
         variant: sccache
 
     - name: Build Stage 1 Clang
+      id: build
+      shell: bash
       run: |
-        sudo chown $USER:$USER /mnt/
-        cmake -G Ninja -C clang/cmake/caches/Release.cmake -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -S llvm -B /mnt/build
-        ninja -v -C /mnt/build
-
-    # We need to create an archive of the build directory, because it has too
-    # many files to upload.
-    - name: Package Build and Source Directories
-      run: |
-        tar -c . | zstd -T0 -c > llvm-project.tar.zst
-        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
-
-    - name: Upload Stage 1 Source
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      with:
-        name: stage1-source
-        path: llvm-project.tar.zst
-        retention-days: 2
-
-    - name: Upload Stage 1 Build Dir
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+        # There were some issues on the ARM64 MacOS runners with trying to build x86 object,
+        # so we need to set some extra cmake flags to disable this.
+        cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \
+            ${{ needs.prepare.outputs.target-cmake-flags }} \
+            -C clang/cmake/caches/Release.cmake \
+            -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \
+            -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \
+            -DCMAKE_C_COMPILER_LAUNCHER=sccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+        ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build
+        # There is a race condition on the MacOS builders and this command is here
+        # to help debug that when it happens.
+        ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build
+    
+    - name: Save Stage
+      uses: ./workflows-main/.github/workflows/release-binaries-save-stage
       with:
-        name: stage1-build
-        path: build.tar.zst
-        retention-days: 2
+        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
 
-  build-stage2-linux:
-    name: "Build Stage 2 Linux"
+  build-stage2:
+    name: "Build Stage 2"
     needs:
       - prepare
-      - build-stage1-linux
-    runs-on: ubuntu-22.04
+      - build-stage1
     if: github.repository == 'llvm/llvm-project'
+    runs-on: ${{ inputs.runs-on }}
     steps:
-    - name: Install Ninja
-      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
-    - name: Download Stage 1 Artifacts
-      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+    - name: Checkout Actions
+      uses: actions/checkout@v4
       with:
-        pattern: stage1-*
-        merge-multiple: true
-
-    - name: Unpack Artifacts
-      run: |
-        tar --zstd -xf llvm-project.tar.zst
-        rm llvm-project.tar.zst
-        sudo chown $USER:$USER /mnt/
-        tar --zstd -C /mnt -xf build.tar.zst
-        rm build.tar.zst
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        path: workflows
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows/.github/workflows/release-binaries-setup-stage
+      with:
+        previous-artifact: build-stage1
 
     - name: Build Stage 2
       # Re-enable once PGO builds are supported.
-      if: false
-      run: |
-        ninja -C /mnt/build stage2-instrumented
-
-    # We need to create an archive of the build directory, because it has too
-    # many files to upload.
-    - name: Save Build and Source Directories
+      if: needs.prepare.outputs.enable-pgo == 'true'
+      shell: bash
       run: |
-        tar -c . | zstd -T0 -c > llvm-project.tar.zst
-        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix}}/build stage2-instrumented
 
-    - name: Upload Stage 2 Source
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+    - name: Save Stage
+      uses: ./workflows/.github/workflows/release-binaries-save-stage
       with:
-        name: stage2-source
-        path: ${{ github.workspace }}/llvm-project.tar.zst
-        retention-days: 2
+        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
 
-    - name: Upload Stage 2 Build Dir
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+  build-stage3-clang:
+    name: "Build Stage 3 LLVM/Clang"
+    needs:
+      - prepare
+      - build-stage2
+    if: github.repository == 'llvm/llvm-project'
+    runs-on: ${{ inputs.runs-on }}
+    steps:
+    - name: Checkout Actions
+      uses: actions/checkout@v4
+      with:
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        path: workflows
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows/.github/workflows/release-binaries-setup-stage
       with:
-        name: stage2-build
-        path: ${{ github.workspace }}/build.tar.zst
-        retention-days: 2
+        previous-artifact: build-stage2
 
+    - name: Build LLVM/Clang
+      shell: bash
+      run: |
+        # There is a race condition on the MacOS builders and this command is here
+        # to help debug that when it happens.
+        ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-clang
+        # Build some of the larger binaries here too.
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \
+            clang-scan-deps \
+            modularize clangd \
+            clangd-indexer \
+            clang-check \
+            ${{ (runner.os == 'Linux' && 'clangd-fuzzer') || '' }} \
+            clang-tidy \
+            llc \
+            lli \
+            llvm-exegesis \
+            llvm-opt-fuzzer \
+            llvm-reduce \
+            llvm-lto \
+            dsymutil
+
+    - name: Save Stage
+      uses: ./workflows/.github/workflows/release-binaries-save-stage
+      with:
+        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
 
-  build-stage3-linux:
-    name: "Build Stage 3 Linux"
+  build-stage3-flang:
+    name: "Build Stage 3 Flang/MLIR/Bolt"
     needs:
       - prepare
-      - build-stage2-linux
-    outputs:
-      filename: ${{ steps.package-info.outputs.release-filename }}
-    runs-on: ubuntu-22.04-16x64
-    if: github.repository == 'llvm/llvm-project'
+      - build-stage3-clang
+    runs-on: ${{ inputs.runs-on }}
     steps:
-    - name: Install Ninja
-      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
-    - name: 'Download artifact'
-      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+    - name: Checkout Actions
+      uses: actions/checkout@v4
       with:
-        pattern: stage2-*
-        merge-multiple: true
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        path: workflows
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows/.github/workflows/release-binaries-setup-stage
+      with:
+        previous-artifact: build-stage3-clang
 
-    - name: Unpack Artifact
+    - name: Build Flang / MLIR / Bolt
+      shell: bash
       run: |
-        tar --zstd -xf llvm-project.tar.zst
-        rm llvm-project.tar.zst
-        sudo chown $USER:$USER /mnt/
-        tar --zstd -C /mnt -xf build.tar.zst
-        rm build.tar.zst
+        # Build some of the mlir tools that take a long time to link
+        if [ "${{ needs.prepare.outputs.build-flang }}" = "true" ]; then
+          ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ -j2 flang-new bbc
+        fi
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \
+            mlir-bytecode-parser-fuzzer \
+            mlir-cpu-runner \
+            mlir-lsp-server \
+            mlir-opt \
+            mlir-query \
+            mlir-reduce \
+            mlir-text-parser-fuzzer \
+            mlir-translate \
+            mlir-transform-opt \
+            mlir-cat \
+            mlir-minimal-opt \
+            mlir-minimal-opt-canonicalize \
+            mlir-pdll-lsp-server \
+            llvm-bolt \
+            llvm-bolt-heatmap
+    
+    - name: Save Stage
+      uses: ./workflows/.github/workflows/release-binaries-save-stage
+      with:
+        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
 
-    - name: Build Release Package
-      run: |
-        ninja -C /mnt/build stage2-package
+  build-stage3-all:
+    name: "Build Stage 3"
+    needs:
+      - prepare
+      - build-stage3-flang
+    runs-on: ${{ inputs.runs-on }}
+    steps:
+    - name: Checkout Actions
+      uses: actions/checkout@v4
+      with:
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        path: workflows
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows/.github/workflows/release-binaries-setup-stage
+      with:
+        previous-artifact: build-stage3-flang
 
-    - id: package-info
+    - name: Build Release Package
+      shell: bash
       run: |
-        filename="LLVM-${{ needs.prepare.outputs.release-version }}-Linux.tar.xz"
-        echo "filename=$filename" >> $GITHUB_OUTPUT
-        echo "path=/mnt/build/tools/clang/stage2-bins/$filename" >> $GITHUB_OUTPUT
+        which cmake
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package
+        # Copy Release artifact to the workspace so it is easier to upload.
+        # This is necessary, because on Windows, the build-prefix path can
+        # only be used on bash steps, because it uses the form of /d/files/
+        # and other steps expect D:\files.
+        mv ${{ steps.setup-stage.outputs.build-prefix  }}/build/tools/clang/stage2-bins/${{ needs.prepare.outputs.release-binary-filename }} .
 
     - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      if: always()
       with:
-        name: release-binary
-        path: ${{ steps.package-info.outputs.path }}
+        name: ${{ runner.os }}-${{ runner.arch }}-release-binary
+        # Due to path differences on Windows when running in bash vs running on node,
+        # we need to search for files in the current workspace.
+        path: |
+          ${{ needs.prepare.outputs.release-binary-filename }}
 
     # Clean up some build files to reduce size of artifact.
     - name: Clean Up Build Directory
+      shell: bash
       run: |
-        find /mnt/build -iname ${{ steps.package-info.outputs.filename }} -delete
+        find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname ${{ needs.prepare.outputs.release-binary-filename }} -delete
+        rm -Rf ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/_CPack_Packages
 
-    # We need to create an archive of the build directory, because it has too
-    # many files to upload.
-    - name: Save Build and Source Directories
-      run: |
-        tar -c . | zstd -T0 -c > llvm-project.tar.zst
-        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
-
-    - name: Upload Stage 3 Source
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      with:
-        name: stage3-source
-        path: llvm-project.tar.zst
-        retention-days: 2
-
-    - name: Upload Stage 3 Build Dir
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+    - name: Save Stage
+      uses: ./workflows/.github/workflows/release-binaries-save-stage
       with:
-        name: stage3-build
-        path: build.tar.zst
-        retention-days: 2
+        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
 
-  upload-release-binaries-linux:
-    name: "Upload Linux Release Binaries"
+  upload-release-binaries:
+    name: "Upload Release Binaries"
     needs:
       - prepare
-      - build-stage3-linux
-    if : ${{ needs.prepare.outputs.upload == 'true' }}
+      - build-stage3-all
+    if: >-
+      always() &&
+      github.event_name != 'pull_request' &&
+      needs.prepare.outputs.upload == 'true'
     runs-on: ubuntu-22.04
     permissions:
       contents: write # For release uploads
+      id-token: write     # For artifact attestations
+      attestations: write # For artifact attestations
 
     steps:
     - name: 'Download artifact'
       uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
       with:
-        name: release-binary
+        pattern: '*-release-binary'
+        merge-multiple: true
+
+    - name: Attest Build Provenance
+      id: provenance
+      uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
+      with:
+        subject-path: ${{ needs.prepare.outputs.release-binary-filename }}
+
+    - name: Rename attestation file
+      run:
+        mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
+
+    - name: Upload Build Provenance
+      uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
+      with:
+        name: ${{ runner.os }}-${{ runner.arch }}-release-binary-attestation
+        path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
 
     - name: Upload Release
+      shell: bash
       run: |
         sudo apt install python3-github
         ./llvm-project/llvm/utils/release/github-upload-release.py \
         --token ${{ github.token }} \
         --release ${{ needs.prepare.outputs.release-version }} \
         upload \
-        --files ${{ needs.build-stage3-linux.outputs.release-filename }}
-
+        --files ${{ needs.prepare.outputs.release-binary-filename }}*
 
-  test-stage3-linux:
-    name: "Test Stage 3 Linux"
+  test-stage3:
+    name: "Test Stage 3"
     needs:
       - prepare
-      - build-stage3-linux
-    runs-on: ubuntu-22.04
-    if: github.repository == 'llvm/llvm-project'
+      - build-stage3-all
+    if: >-
+      github.repository == 'llvm/llvm-project'
+    runs-on: ${{ inputs.runs-on }}
     steps:
-    - name: Install Ninja
-      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
-    - name: 'Download artifact'
-      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+    - name: Checkout Actions
+      uses: actions/checkout@v4
       with:
-        pattern: stage3-*
-        merge-multiple: true
-
-    - name: Unpack Artifact
-      run: |
-        tar --zstd -xf llvm-project.tar.zst
-        rm llvm-project.tar.zst
-        sudo chown $USER:$USER /mnt/
-        tar --zstd -C /mnt -xf build.tar.zst
-        rm build.tar.zst
+        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+        sparse-checkout: |
+          .github/workflows/
+        sparse-checkout-cone-mode: false
+        path: workflows
+    - name: Setup Stage
+      id: setup-stage
+      uses: ./workflows/.github/workflows/release-binaries-setup-stage
+      with:
+        previous-artifact: build-stage3-all
 
     - name: Run Tests
+      shell: bash
       run: |
-        ninja -C /mnt/build stage2-check-all
+        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-check-all
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 9c5b1a9f01709..b0c0b652f3758 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -47,7 +47,7 @@ jobs:
     steps:
       - id: inputs
         run: |
-          ref=${{ inputs.release-version || github.sha }}
+          ref=${{ (inputs.release-version && format('llvmorg-{0}', inputs.release-version)) || github.sha }}
           if [ -n "${{ inputs.release-version }}" ]; then
             export_args="-release ${{ inputs.release-version }} -final"
           else
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index 2ed56dace1d4c..7dd4c306671b7 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -81,10 +81,20 @@ jobs:
     needs:
       - validate-tag
       - release-create
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on:
+          - ubuntu-22.04
+          - windows-2022
+          - macos-13
+          - macos-14
+
     uses: ./.github/workflows/release-binaries.yml
     with:
       release-version: ${{ needs.validate-tag.outputs.release-version }}
       upload: true
+      runs-on: ${{ matrix.runs-on }}
 
   release-sources:
     name: Package Release Sources
diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml
new file mode 100644
index 0000000000000..9d8fb59a67c0e
--- /dev/null
+++ b/.github/workflows/unprivileged-download-artifact/action.yml
@@ -0,0 +1,81 @@
+name: Unprivileged Download Artifact
+description: >-
+  Download artifacts from another workflow run without using an access token.
+inputs:
+  run-id:
+    description: >-
+      The run-id for the workflow run that you want to download the artifact
+      from.  If ommitted it will download the most recently created artifact
+      from the repo with the artifact-name.
+    required: false
+  artifact-name:
+    desciption: The name of the artifact to download.
+    required: true
+
+
+outputs:
+  filename:
+    description: >-
+      The filename of the downloaded artifact or the empty string if the
+      artifact was not found.
+    value: ${{ steps.download-artifact.outputs.filename }}
+  artifact-id:
+    description: "The id of the artifact being downloaded."
+    value: ${{ steps.artifact-url.outputs.id }}
+
+
+runs:
+  using: "composite"
+  steps:
+    - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+      id: artifact-url
+      with:
+        script: |
+          var response;
+          if (!"${{ inputs.run-id }}") {
+            response = await github.rest.actions.listArtifactsForRepo({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: "${{ inputs.artifact-name }}"
+            })
+          } else {
+            response = await github.rest.actions.listWorkflowRunArtifacts({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              run_id: "${{ inputs.run-id }}",
+              name: "${{ inputs.artifact-name }}"
+            })
+          }
+
+          console.log(response)
+
+          for (artifact of response.data.artifacts) {
+            console.log(artifact);
+          }
+
+          if (response.data.artifacts.length == 0) {
+            console.log("Could not find artifact ${{ inputs.artifact-name }} for workflow run ${{ inputs.run-id }}")
+            return;
+          }
+
+          const url_response = await github.rest.actions.downloadArtifact({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            artifact_id: response.data.artifacts[0].id,
+            archive_format: "zip"
+          })
+
+          core.setOutput("url", url_response.url);
+          core.setOutput("id", response.data.artifacts[0].id);
+
+    - shell: bash
+      if: steps.artifact-url.outputs.url != ''
+      id: download-artifact
+      run: |
+        curl -L -o ${{ inputs.artifact-name }}.zip "${{ steps.artifact-url.outputs.url }}"
+        echo "filename=${{ inputs.artifact-name }}.zip" >> $GITHUB_OUTPUT
+
+    - shell: bash
+      if: steps.download-artifact.outputs.filename != ''
+      run: |
+        unzip ${{ steps.download-artifact.outputs.filename }}
diff --git a/.github/workflows/version-check.yml b/.github/workflows/version-check.yml
index 4ce6119a407f5..894e07d323ca9 100644
--- a/.github/workflows/version-check.yml
+++ b/.github/workflows/version-check.yml
@@ -27,5 +27,5 @@ jobs:
 
       - name: Version Check
         run: |
-          version=$(grep -o 'LLVM_VERSION_\(MAJOR\|MINOR\|PATCH\) [0-9]\+' llvm/CMakeLists.txt  | cut -d ' ' -f 2 | tr "\n" "." | sed 's/.$//g')
+          version=$(grep -o 'LLVM_VERSION_\(MAJOR\|MINOR\|PATCH\) [0-9]\+' cmake/Modules/LLVMVersion.cmake  | cut -d ' ' -f 2 | tr "\n" "." | sed 's/.$//g')
           .github/workflows/version-check.py "$version"
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 74907ad118d12..9f5875dd21284 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -1,6 +1,17 @@
+cmake_minimum_required(VERSION 3.20.0)
+
 set(LLVM_SUBPROJECT_TITLE "BOLT")
 
-include(ExternalProject)
+if(NOT DEFINED LLVM_COMMON_CMAKE_UTILS)
+  set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
+endif()
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
+  NO_POLICY_SCOPE)
+
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+  project(bolt)
+  set(BOLT_BUILT_STANDALONE TRUE)
+endif()
 
 set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -9,6 +20,42 @@ set(CMAKE_CXX_STANDARD 17)
 # Add path for custom modules.
 list(INSERT CMAKE_MODULE_PATH 0 "${BOLT_SOURCE_DIR}/cmake/modules")
 
+include(GNUInstallDirs)
+
+# standalone build, copied from clang
+if(BOLT_BUILT_STANDALONE)
+  set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
+  set(CMAKE_CXX_STANDARD_REQUIRED YES)
+  set(CMAKE_CXX_EXTENSIONS NO)
+
+  if(NOT MSVC_IDE)
+    set(LLVM_ENABLE_ASSERTIONS ${ENABLE_ASSERTIONS}
+      CACHE BOOL "Enable assertions")
+    # Assertions should follow llvm-config's.
+    mark_as_advanced(LLVM_ENABLE_ASSERTIONS)
+  endif()
+
+  find_package(LLVM REQUIRED HINTS "${LLVM_CMAKE_DIR}")
+  list(APPEND CMAKE_MODULE_PATH "${LLVM_DIR}")
+
+  set(LLVM_MAIN_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../llvm" CACHE PATH "Path to LLVM source tree")
+  find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR}
+    NO_DEFAULT_PATH)
+
+  # They are used as destination of target generators.
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
+
+  include(AddLLVM)
+  include(TableGen)
+  include_directories(${LLVM_INCLUDE_DIRS})
+  link_directories("${LLVM_LIBRARY_DIR}")
+
+  set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_INSTALL_BINDIR}" )
+  set( CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}/${LLVM_LIBDIR_SUFFIX}" )
+  set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}/${LLVM_LIBDIR_SUFFIX}")
+endif() # standalone
+
 # Determine default set of targets to build -- the intersection of
 # those BOLT supports and those LLVM is targeting.
 set(BOLT_TARGETS_TO_BUILD_all "AArch64;X86;RISCV")
@@ -94,6 +141,8 @@ if (BOLT_ENABLE_RUNTIME)
   if(CMAKE_SYSROOT)
     list(APPEND extra_args -DCMAKE_SYSROOT=${CMAKE_SYSROOT})
   endif()
+
+  include(ExternalProject)
   ExternalProject_Add(bolt_rt
     SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
     STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
@@ -104,6 +153,7 @@ if (BOLT_ENABLE_RUNTIME)
                -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
                -DLLVM_LIBDIR_SUFFIX=${LLVM_LIBDIR_SUFFIX}
                -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR}
+               -DBOLT_BUILT_STANDALONE=${BOLT_BUILT_STANDALONE}
                ${extra_args}
     INSTALL_COMMAND ""
     BUILD_ALWAYS True
@@ -113,6 +163,8 @@ if (BOLT_ENABLE_RUNTIME)
   add_llvm_install_targets(install-bolt_rt
     DEPENDS bolt_rt bolt
     COMPONENT bolt)
+  set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a")
+  set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a")
 endif()
 
 find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld")
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index bd6e2ec01b53e..0c8935457366d 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -88,7 +88,7 @@
 
 - `--comp-dir-override=<string>`
 
-  Overrides DW_AT_comp_dir, and provides an alterantive base location, which is
+  Overrides DW_AT_comp_dir, and provides an alternative base location, which is
   used with DW_AT_dwo_name to construct a path to *.dwo files.
 
 - `--create-debug-names-section`
@@ -113,11 +113,6 @@
 
   Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched.
 
-- `--deterministic-debuginfo`
-
-  Disables parallel execution of tasks that may produce nondeterministic debug
-  info
-
 - `--dot-tooltip-code`
 
   Add basic block instructions as tool tips on nodes
@@ -686,6 +681,10 @@
   threshold means fewer functions to process. E.g threshold of 90 means only top
   10 percent of functions with profile will be processed.
 
+- `--match-with-call-graph`
+
+  Match functions with call graph
+
 - `--memcpy1-spec=<func1,func2:cs1:cs2,func3:cs1,...>`
 
   List of functions with call sites for which to specialize memcpy() for size 1
diff --git a/bolt/docs/OptimizingClang.md b/bolt/docs/OptimizingClang.md
index ff7e71b6a76bc..685fcc2b738fa 100644
--- a/bolt/docs/OptimizingClang.md
+++ b/bolt/docs/OptimizingClang.md
@@ -49,6 +49,7 @@ $ cd ${TOPLEV}/stage3
 $ CPATH=${TOPLEV}/stage2-prof-use-lto/install/bin/
 $ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_ENABLE_PROJECTS="clang" \
     -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3/install
 $ perf record -e cycles:u -j any,u -- ninja clang
 ```
diff --git a/bolt/docs/generate_doc.py b/bolt/docs/generate_doc.py
index d8829daf677b4..763dc00b44ca3 100644
--- a/bolt/docs/generate_doc.py
+++ b/bolt/docs/generate_doc.py
@@ -45,7 +45,7 @@ def parse_bolt_options(output):
         cleaned_line = line.strip()
 
         if cleaned_line.casefold() in map(str.casefold, section_headers):
-            if prev_section != None:  # Save last option from prev section
+            if prev_section is not None:  # Save last option from prev section
                 add_info(sections, current_section, option, description)
                 option, description = None, []
 
@@ -76,7 +76,7 @@ def parse_bolt_options(output):
                 description = [descr]
                 if option.startswith("--print") or option.startswith("--time"):
                     current_section = "BOLT printing options:"
-                elif prev_section != None:
+                elif prev_section is not None:
                     current_section = prev_section
             continue
 
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 73932c4ca2fb3..5fb32a1ea6784 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -23,6 +23,7 @@
 #include "bolt/RuntimeLibs/RuntimeLibrary.h"
 #include "llvm/ADT/AddressRanges.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -32,6 +33,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCPseudoProbe.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
@@ -240,12 +242,19 @@ class BinaryContext {
   /// Function fragments to skip.
   std::unordered_set<BinaryFunction *> FragmentsToSkip;
 
+  /// Fragment equivalence classes to query belonging to the same "family" in
+  /// presence of multiple fragments/multiple parents.
+  EquivalenceClasses<const BinaryFunction *> FragmentClasses;
+
   /// The runtime library.
   std::unique_ptr<RuntimeLibrary> RtLibrary;
 
   /// DWP Context.
   std::shared_ptr<DWARFContext> DWPContext;
 
+  /// Decoded pseudo probes.
+  std::shared_ptr<MCPseudoProbeDecoder> PseudoProbeDecoder;
+
   /// A map of DWO Ids to CUs.
   using DWOIdToCUMapType = std::unordered_map<uint64_t, DWARFUnit *>;
   DWOIdToCUMapType DWOCUs;
@@ -377,6 +386,15 @@ class BinaryContext {
     RtLibrary = std::move(Lib);
   }
 
+  const MCPseudoProbeDecoder *getPseudoProbeDecoder() const {
+    return PseudoProbeDecoder.get();
+  }
+
+  void setPseudoProbeDecoder(std::shared_ptr<MCPseudoProbeDecoder> Decoder) {
+    assert(!PseudoProbeDecoder && "Cannot set pseudo probe decoder twice.");
+    PseudoProbeDecoder = Decoder;
+  }
+
   /// Return BinaryFunction containing a given \p Address or nullptr if
   /// no registered function contains the \p Address.
   ///
@@ -431,6 +449,9 @@ class BinaryContext {
     return nullptr;
   }
 
+  /// Deregister JumpTable registered at a given \p Address and delete it.
+  void deleteJumpTable(uint64_t Address);
+
   unsigned getDWARFEncodingSize(unsigned Encoding) {
     if (Encoding == dwarf::DW_EH_PE_omit)
       return 0;
@@ -1016,7 +1037,15 @@ class BinaryContext {
   ///   fragment_name == parent_name.cold(.\d+)?
   /// True if the Function is registered, false if the check failed.
   bool registerFragment(BinaryFunction &TargetFunction,
-                        BinaryFunction &Function) const;
+                        BinaryFunction &Function);
+
+  /// Return true if two functions belong to the same "family": are fragments
+  /// of one another, or fragments of the same parent, or transitively fragment-
+  /// related.
+  bool areRelatedFragments(const BinaryFunction *LHS,
+                           const BinaryFunction *RHS) const {
+    return FragmentClasses.isEquivalent(LHS, RHS);
+  }
 
   /// Add interprocedural reference for \p Function to \p Address
   void addInterproceduralReference(BinaryFunction *Function, uint64_t Address) {
@@ -1436,10 +1465,7 @@ class BinaryContext {
     std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(OS);
     std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
         *TheTriple, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
-        std::unique_ptr<MCCodeEmitter>(MCE), *STI,
-        /* RelaxAll */ false,
-        /* IncrementalLinkerCompatible */ false,
-        /* DWARFMustBeAtTheEnd */ false));
+        std::unique_ptr<MCCodeEmitter>(MCE), *STI));
     return Streamer;
   }
 
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 7d0afee4d1b78..24c7db2f5d69c 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -416,6 +416,9 @@ class BinaryFunction {
   /// different parameters by every pass.
   mutable uint64_t Hash{0};
 
+  /// Function GUID assigned externally.
+  uint64_t GUID{0};
+
   /// For PLT functions it contains a symbol associated with a function
   /// reference. It is nullptr for non-PLT functions.
   const MCSymbol *PLTSymbol{nullptr};
@@ -1790,11 +1793,6 @@ class BinaryFunction {
     return ParentFragments.contains(&Other);
   }
 
-  /// Returns if this function is a parent of \p Other function.
-  bool isParentOf(const BinaryFunction &Other) const {
-    return Fragments.contains(&Other);
-  }
-
   /// Return the child fragment form parent function
   iterator_range<FragmentsSetTy::const_iterator> getFragments() const {
     return iterator_range<FragmentsSetTy::const_iterator>(Fragments.begin(),
@@ -1804,11 +1802,6 @@ class BinaryFunction {
   /// Return the parent function for split function fragments.
   FragmentsSetTy *getParentFragments() { return &ParentFragments; }
 
-  /// Returns if this function is a parent or child of \p Other function.
-  bool isParentOrChildOf(const BinaryFunction &Other) const {
-    return isChildOf(Other) || isParentOf(Other);
-  }
-
   /// Set the profile data for the number of times the function was called.
   BinaryFunction &setExecutionCount(uint64_t Count) {
     ExecutionCount = Count;
@@ -2256,6 +2249,11 @@ class BinaryFunction {
   /// Returns the last computed hash value of the function.
   size_t getHash() const { return Hash; }
 
+  /// Returns the function GUID.
+  uint64_t getGUID() const { return GUID; }
+
+  void setGUID(uint64_t Id) { GUID = Id; }
+
   using OperandHashFuncTy =
       function_ref<typename std::string(const MCOperand &)>;
 
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index 0b840c142ed81..e5b057ea1e42b 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -127,6 +127,9 @@ class DIEBuilder {
   DWARFContext *DwarfContext{nullptr};
   DWARFUnit *SkeletonCU{nullptr};
   uint64_t UnitSize{0};
+  /// Adds separate UnitSize counter for updating DebugNames
+  /// so there is no dependency between the functions.
+  uint64_t DebugNamesUnitSize{0};
   llvm::DenseSet<uint64_t> AllProcessed;
   DWARF5AcceleratorTable &DebugNamesTable;
   // Unordered map to handle name collision if output DWO directory is
@@ -203,13 +206,16 @@ class DIEBuilder {
   /// Update references once the layout is finalized.
   void updateReferences();
 
-  /// Update the Offset and Size of DIE, populate DebugNames table.
+  /// Update the Offset and Size of DIE.
   /// Along with current CU, and DIE being processed and the new DIE offset to
   /// be updated, it takes in Parents vector that can be empty if this DIE has
   /// no parents.
-  uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die,
-                        std::optional<BOLTDWARF5AccelTableData *> Parent,
-                        uint32_t NumberParentsInChain, uint32_t &CurOffset);
+  uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
+
+  /// Populates DebugNames table.
+  void populateDebugNamesTable(DWARFUnit &CU, const DIE &Die,
+                               std::optional<BOLTDWARF5AccelTableData *> Parent,
+                               uint32_t NumberParentsInChain);
 
   void registerUnit(DWARFUnit &DU, bool NeedSort);
 
@@ -338,6 +344,9 @@ class DIEBuilder {
   /// Finish current DIE construction.
   void finish();
 
+  /// Update debug names table.
+  void updateDebugNamesTable();
+
   // Interface to edit DIE
   template <class T> T *allocateDIEValue() {
     return new (getState().DIEAlloc) T;
diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 5935ffaa46af7..6ea3b1af1024f 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -475,7 +475,8 @@ class DebugStrOffsetsWriter {
   }
 
   /// Update Str offset in .debug_str in .debug_str_offsets.
-  void updateAddressMap(uint32_t Index, uint32_t Address);
+  void updateAddressMap(uint32_t Index, uint32_t Address,
+                        const DWARFUnit &Unit);
 
   /// Get offset for given index in original .debug_str_offsets section.
   uint64_t getOffset(uint32_t Index) const { return StrOffsets[Index]; }
@@ -507,6 +508,8 @@ class DebugStrOffsetsWriter {
   std::unique_ptr<DebugStrOffsetsBufferVector> StrOffsetsBuffer;
   std::unique_ptr<raw_svector_ostream> StrOffsetsStream;
   std::map<uint32_t, uint32_t> IndexToAddressMap;
+  [[maybe_unused]]
+  DenseSet<uint64_t> DebugStrOffsetFinalized;
   SmallVector<uint32_t, 5> StrOffsets;
   std::unordered_map<uint64_t, uint64_t> ProcessedBaseOffsets;
   bool StrOffsetSectionWasModified = false;
diff --git a/bolt/include/bolt/Core/GDBIndex.h b/bolt/include/bolt/Core/GDBIndex.h
index 6604c2a11472d..0ebcf4ecfe99e 100644
--- a/bolt/include/bolt/Core/GDBIndex.h
+++ b/bolt/include/bolt/Core/GDBIndex.h
@@ -53,6 +53,14 @@ class GDBIndex {
   const GDBIndexTUEntryType &getGDBIndexTUEntryVector() const {
     return GDBIndexTUEntryVector;
   }
+
+  /// Sorts entries in GDBIndexTUEntryVector according to the TypeHash.
+  void sortGDBIndexTUEntryVector() {
+    llvm::stable_sort(GDBIndexTUEntryVector, [](const GDBIndexTUEntry &LHS,
+                                                const GDBIndexTUEntry &RHS) {
+      return LHS.TypeHash > RHS.TypeHash;
+    });
+  }
 };
 
 } // namespace bolt
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index c916c6f95751f..32eda0b283b88 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -58,6 +58,8 @@ enum class IndirectBranchType : char {
   POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
   POSSIBLE_GOTO,           /// Possibly a gcc's computed goto.
   POSSIBLE_FIXED_BRANCH,   /// Possibly an indirect branch to a fixed location.
+  POSSIBLE_PIC_FIXED_BRANCH, /// Possibly an indirect jump to a fixed entry in a
+                             /// PIC jump table.
 };
 
 class MCPlusBuilder {
@@ -1474,12 +1476,11 @@ class MCPlusBuilder {
   /// will be set to the different components of the branch.  \p MemLocInstr
   /// is the instruction that loads up the indirect function pointer.  It may
   /// or may not be same as \p Instruction.
-  virtual IndirectBranchType
-  analyzeIndirectBranch(MCInst &Instruction, InstructionIterator Begin,
-                        InstructionIterator End, const unsigned PtrSize,
-                        MCInst *&MemLocInstr, unsigned &BaseRegNum,
-                        unsigned &IndexRegNum, int64_t &DispValue,
-                        const MCExpr *&DispExpr, MCInst *&PCRelBaseOut) const {
+  virtual IndirectBranchType analyzeIndirectBranch(
+      MCInst &Instruction, InstructionIterator Begin, InstructionIterator End,
+      const unsigned PtrSize, MCInst *&MemLocInstr, unsigned &BaseRegNum,
+      unsigned &IndexRegNum, int64_t &DispValue, const MCExpr *&DispExpr,
+      MCInst *&PCRelBaseOut, MCInst *&FixedEntryLoadInst) const {
     llvm_unreachable("not implemented");
     return IndirectBranchType::UNKNOWN;
   }
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index 9dd3920dbf094..2a0514d7d9304 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -93,11 +93,36 @@ template <> struct MappingTraits<bolt::SuccessorInfo> {
   static const bool flow = true;
 };
 
+namespace bolt {
+struct PseudoProbeInfo {
+  llvm::yaml::Hex64 GUID;
+  uint64_t Index;
+  uint8_t Type;
+
+  bool operator==(const PseudoProbeInfo &Other) const {
+    return GUID == Other.GUID && Index == Other.Index;
+  }
+  bool operator!=(const PseudoProbeInfo &Other) const {
+    return !(*this == Other);
+  }
+};
+} // end namespace bolt
+
+template <> struct MappingTraits<bolt::PseudoProbeInfo> {
+  static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) {
+    YamlIO.mapRequired("guid", PI.GUID);
+    YamlIO.mapRequired("id", PI.Index);
+    YamlIO.mapRequired("type", PI.Type);
+  }
+
+  static const bool flow = true;
+};
 } // end namespace yaml
 } // end namespace llvm
 
 LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::CallSiteInfo)
 LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::SuccessorInfo)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::PseudoProbeInfo)
 
 namespace llvm {
 namespace yaml {
@@ -111,6 +136,7 @@ struct BinaryBasicBlockProfile {
   uint64_t EventCount{0};
   std::vector<CallSiteInfo> CallSites;
   std::vector<SuccessorInfo> Successors;
+  std::vector<PseudoProbeInfo> PseudoProbes;
 
   bool operator==(const BinaryBasicBlockProfile &Other) const {
     return Index == Other.Index;
@@ -132,6 +158,8 @@ template <> struct MappingTraits<bolt::BinaryBasicBlockProfile> {
                        std::vector<bolt::CallSiteInfo>());
     YamlIO.mapOptional("succ", BBP.Successors,
                        std::vector<bolt::SuccessorInfo>());
+    YamlIO.mapOptional("pseudo_probes", BBP.PseudoProbes,
+                       std::vector<bolt::PseudoProbeInfo>());
   }
 };
 
@@ -151,6 +179,8 @@ struct BinaryFunctionProfile {
   llvm::yaml::Hex64 Hash{0};
   uint64_t ExecCount{0};
   std::vector<BinaryBasicBlockProfile> Blocks;
+  llvm::yaml::Hex64 GUID{0};
+  llvm::yaml::Hex64 PseudoProbeDescHash{0};
   bool Used{false};
 };
 } // end namespace bolt
@@ -164,6 +194,9 @@ template <> struct MappingTraits<bolt::BinaryFunctionProfile> {
     YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks);
     YamlIO.mapOptional("blocks", BFP.Blocks,
                        std::vector<bolt::BinaryBasicBlockProfile>());
+    YamlIO.mapOptional("guid", BFP.GUID, (uint64_t)0);
+    YamlIO.mapOptional("pseudo_probe_desc_hash", BFP.PseudoProbeDescHash,
+                       (uint64_t)0);
   }
 };
 
diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h b/bolt/include/bolt/Profile/YAMLProfileReader.h
index 502a1ad612bb0..bd5a86fd676a5 100644
--- a/bolt/include/bolt/Profile/YAMLProfileReader.h
+++ b/bolt/include/bolt/Profile/YAMLProfileReader.h
@@ -43,6 +43,59 @@ class YAMLProfileReader : public ProfileReaderBase {
   using ProfileLookupMap =
       DenseMap<uint32_t, yaml::bolt::BinaryFunctionProfile *>;
 
+  /// A class for matching binary functions in functions in the YAML profile.
+  /// First, a call graph is constructed for both profiled and binary functions.
+  /// Then functions are hashed based on the names of their callee/caller
+  /// functions. Finally, functions are matched based on these neighbor hashes.
+  class CallGraphMatcher {
+  public:
+    /// Constructs the call graphs for binary and profiled functions and
+    /// computes neighbor hashes for binary functions.
+    CallGraphMatcher(BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP,
+                     ProfileLookupMap &IdToYAMLBF);
+
+    /// Returns the YamlBFs adjacent to the parameter YamlBF in the call graph.
+    std::optional<std::set<yaml::bolt::BinaryFunctionProfile *>>
+    getAdjacentYamlBFs(yaml::bolt::BinaryFunctionProfile &YamlBF) {
+      auto It = YamlBFAdjacencyMap.find(&YamlBF);
+      return It == YamlBFAdjacencyMap.end() ? std::nullopt
+                                            : std::make_optional(It->second);
+    }
+
+    /// Returns the binary functions with the parameter neighbor hash.
+    std::optional<std::vector<BinaryFunction *>>
+    getBFsWithNeighborHash(uint64_t NeighborHash) {
+      auto It = NeighborHashToBFs.find(NeighborHash);
+      return It == NeighborHashToBFs.end() ? std::nullopt
+                                           : std::make_optional(It->second);
+    }
+
+  private:
+    /// Adds edges to the binary function call graph given the callsites of the
+    /// parameter function.
+    void constructBFCG(BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP);
+
+    /// Using the constructed binary function call graph, computes and creates
+    /// mappings from "neighbor hash" (composed of the function names of callee
+    /// and caller functions of a function) to binary functions.
+    void computeBFNeighborHashes(BinaryContext &BC);
+
+    /// Constructs the call graph for profile functions.
+    void constructYAMLFCG(yaml::bolt::BinaryProfile &YamlBP,
+                          ProfileLookupMap &IdToYAMLBF);
+
+    /// Adjacency map for binary functions in the call graph.
+    DenseMap<BinaryFunction *, std::set<BinaryFunction *>> BFAdjacencyMap;
+
+    /// Maps neighbor hashes to binary functions.
+    DenseMap<uint64_t, std::vector<BinaryFunction *>> NeighborHashToBFs;
+
+    /// Adjacency map for profile functions in the call graph.
+    DenseMap<yaml::bolt::BinaryFunctionProfile *,
+             std::set<yaml::bolt::BinaryFunctionProfile *>>
+        YamlBFAdjacencyMap;
+  };
+
 private:
   /// Adjustments for basic samples profiles (without LBR).
   bool NormalizeByInsnCount{false};
@@ -100,6 +153,9 @@ class YAMLProfileReader : public ProfileReaderBase {
   /// Matches functions using exact hash.
   size_t matchWithHash(BinaryContext &BC);
 
+  /// Matches functions using the call graph.
+  size_t matchWithCallGraph(BinaryContext &BC);
+
   /// Matches functions with similarly named profiled functions.
   size_t matchWithNameSimilarity(BinaryContext &BC);
 
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index abd18b56113b6..deaf179140c01 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -15,7 +15,6 @@
 #include "bolt/Core/GDBIndex.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
-#include "llvm/DWP/DWP.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <cstdint>
@@ -41,13 +40,6 @@ class DWARFRewriter {
     uint64_t TypeHash;
     uint64_t TypeDIERelativeOffset;
   };
-  /// Contains information for CU or TU so we can output correct {cu, tu}-index.
-  struct UnitMeta {
-    uint64_t Offset;
-    uint64_t Length;
-    uint64_t TUHash;
-  };
-  using UnitMetaVectorType = std::vector<UnitMeta>;
 
 private:
   BinaryContext &BC;
@@ -95,9 +87,6 @@ class DWARFRewriter {
 
   std::mutex LocListDebugInfoPatchesMutex;
 
-  /// Dwo id specific its RangesBase.
-  std::unordered_map<uint64_t, uint64_t> DwoRangesBase;
-
   std::unordered_map<DWARFUnit *, uint64_t> LineTablePatchMap;
   std::unordered_map<const DWARFUnit *, uint64_t> TypeUnitRelocMap;
 
@@ -191,47 +180,12 @@ class DWARFRewriter {
   /// Update stmt_list for CUs based on the new .debug_line \p Layout.
   void updateLineTableOffsets(const MCAssembler &Asm);
 
-  uint64_t getDwoRangesBase(uint64_t DWOId) { return DwoRangesBase[DWOId]; }
-
-  void setDwoRangesBase(uint64_t DWOId, uint64_t RangesBase) {
-    DwoRangesBase[DWOId] = RangesBase;
-  }
-
   using OverriddenSectionsMap = std::unordered_map<DWARFSectionKind, StringRef>;
   /// Output .dwo files.
   void writeDWOFiles(DWARFUnit &, const OverriddenSectionsMap &,
                      const std::string &, DebugLocWriter &,
                      DebugStrOffsetsWriter &, DebugStrWriter &);
   using KnownSectionsEntry = std::pair<MCSection *, DWARFSectionKind>;
-  struct DWPState {
-    std::unique_ptr<ToolOutputFile> Out;
-    std::unique_ptr<BinaryContext> TmpBC;
-    std::unique_ptr<MCStreamer> Streamer;
-    std::unique_ptr<DWPStringPool> Strings;
-    /// Used to store String sections for .dwo files if they are being modified.
-    std::vector<std::unique_ptr<DebugBufferVector>> StrSections;
-    const MCObjectFileInfo *MCOFI = nullptr;
-    const DWARFUnitIndex *CUIndex = nullptr;
-    std::deque<SmallString<32>> UncompressedSections;
-    MapVector<uint64_t, UnitIndexEntry> IndexEntries;
-    MapVector<uint64_t, UnitIndexEntry> TypeIndexEntries;
-    StringMap<KnownSectionsEntry> KnownSections;
-    uint32_t ContributionOffsets[8] = {};
-    uint32_t IndexVersion = 2;
-    uint64_t DebugInfoSize = 0;
-    uint16_t Version = 0;
-    bool IsDWP = false;
-  };
-  /// Init .dwp file
-  void initDWPState(DWPState &);
-
-  /// Write out .dwp File
-  void finalizeDWP(DWPState &);
-
-  /// add content of dwo to .dwp file.
-  void updateDWP(DWARFUnit &, const OverriddenSectionsMap &, const UnitMeta &,
-                 UnitMetaVectorType &, DWPState &, DebugLocWriter &,
-                 DebugStrOffsetsWriter &, DebugStrWriter &);
 };
 
 } // namespace bolt
diff --git a/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h b/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
index e392029156bce..fc1db7369eb4a 100644
--- a/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
+++ b/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
@@ -58,7 +58,16 @@ class RuntimeLibrary {
   uint64_t RuntimeFiniAddress{0};
   uint64_t RuntimeStartAddress{0};
 
-  /// Get the full path to a runtime library specified by \p LibFileName.
+  /// Get the full path to a runtime library specified by \p LibFileName and \p
+  /// ToolPath.
+  static std::string getLibPathByToolPath(StringRef ToolPath,
+                                          StringRef LibFileName);
+
+  /// Get the full path to a runtime library by the install directory.
+  static std::string getLibPathByInstalled(StringRef LibFileName);
+
+  /// Gets the full path to a runtime library based on whether it exists
+  /// in the install libdir or runtime libdir.
   static std::string getLibPath(StringRef ToolPath, StringRef LibFileName);
 
   /// Load a static runtime library specified by \p LibPath.
diff --git a/bolt/lib/CMakeLists.txt b/bolt/lib/CMakeLists.txt
index 22a6be44f6458..d0f921a1a1e2c 100644
--- a/bolt/lib/CMakeLists.txt
+++ b/bolt/lib/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_compile_definitions(CMAKE_INSTALL_FULL_LIBDIR="${CMAKE_INSTALL_FULL_LIBDIR}")
+
 add_subdirectory(Core)
 add_subdirectory(Passes)
 add_subdirectory(Profile)
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 0a1f1bb9e0d20..e86075e69c05d 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -71,7 +71,7 @@ PrintMemData("print-mem-data",
 
 cl::opt<std::string> CompDirOverride(
     "comp-dir-override",
-    cl::desc("overrides DW_AT_comp_dir, and provides an alterantive base "
+    cl::desc("overrides DW_AT_comp_dir, and provides an alternative base "
              "location, which is used with DW_AT_dwo_name to construct a path "
              "to *.dwo files."),
     cl::Hidden, cl::init(""), cl::cat(BoltCategory));
@@ -646,7 +646,7 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
     const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value);
     const bool DoesBelongToFunction =
         BF.containsAddress(Value) ||
-        (TargetBF && TargetBF->isParentOrChildOf(BF));
+        (TargetBF && areRelatedFragments(TargetBF, &BF));
     if (!DoesBelongToFunction) {
       LLVM_DEBUG({
         if (!BF.containsAddress(Value)) {
@@ -839,9 +839,11 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
     assert(Address == JT->getAddress() && "unexpected non-empty jump table");
 
     // Prevent associating a jump table to a specific fragment twice.
-    // This simple check arises from the assumption: no more than 2 fragments.
-    if (JT->Parents.size() == 1 && JT->Parents[0] != &Function) {
-      assert(JT->Parents[0]->isParentOrChildOf(Function) &&
+    if (!llvm::is_contained(JT->Parents, &Function)) {
+      assert(llvm::all_of(JT->Parents,
+                          [&](const BinaryFunction *BF) {
+                            return areRelatedFragments(&Function, BF);
+                          }) &&
              "cannot re-use jump table of a different function");
       // Duplicate the entry for the parent function for easy access
       JT->Parents.push_back(&Function);
@@ -852,8 +854,8 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
         JT->print(this->outs());
       }
       Function.JumpTables.emplace(Address, JT);
-      JT->Parents[0]->setHasIndirectTargetToSplitFragment(true);
-      JT->Parents[1]->setHasIndirectTargetToSplitFragment(true);
+      for (BinaryFunction *Parent : JT->Parents)
+        Parent->setHasIndirectTargetToSplitFragment(true);
     }
 
     bool IsJumpTableParent = false;
@@ -1209,12 +1211,13 @@ void BinaryContext::generateSymbolHashes() {
 }
 
 bool BinaryContext::registerFragment(BinaryFunction &TargetFunction,
-                                     BinaryFunction &Function) const {
+                                     BinaryFunction &Function) {
   assert(TargetFunction.isFragment() && "TargetFunction must be a fragment");
   if (TargetFunction.isChildOf(Function))
     return true;
   TargetFunction.addParentFragment(Function);
   Function.addFragment(TargetFunction);
+  FragmentClasses.unionSets(&TargetFunction, &Function);
   if (!HasRelocations) {
     TargetFunction.setSimple(false);
     Function.setSimple(false);
@@ -1336,7 +1339,7 @@ void BinaryContext::processInterproceduralReferences() {
 
     if (TargetFunction) {
       if (TargetFunction->isFragment() &&
-          !TargetFunction->isChildOf(Function)) {
+          !areRelatedFragments(TargetFunction, &Function)) {
         this->errs()
             << "BOLT-WARNING: interprocedural reference between unrelated "
                "fragments: "
@@ -2367,10 +2370,7 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(VecOS);
   std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
       *TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
-      std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI,
-      /*RelaxAll=*/false,
-      /*IncrementalLinkerCompatible=*/false,
-      /*DWARFMustBeAtTheEnd=*/false));
+      std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI));
 
   Streamer->initSections(false, *STI);
 
@@ -2523,6 +2523,16 @@ BinaryFunction *BinaryContext::getBinaryFunctionAtAddress(uint64_t Address) {
   return nullptr;
 }
 
+/// Deregister JumpTable registered at a given \p Address and delete it.
+void BinaryContext::deleteJumpTable(uint64_t Address) {
+  assert(JumpTables.count(Address) && "Must have a jump table at address");
+  JumpTable *JT = JumpTables.at(Address);
+  for (BinaryFunction *Parent : JT->Parents)
+    Parent->JumpTables.erase(Address);
+  JumpTables.erase(Address);
+  delete JT;
+}
+
 DebugAddressRangesVector BinaryContext::translateModuleAddressRanges(
     const DWARFAddressRangesVector &InputRanges) const {
   DebugAddressRangesVector OutputRanges;
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 282638f9dc76f..ea09371b57e8a 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -780,6 +780,9 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
   // setting the value of the register used by the branch.
   MCInst *MemLocInstr;
 
+  // The instruction loading the fixed PIC jump table entry value.
+  MCInst *FixedEntryLoadInstr;
+
   // Address of the table referenced by MemLocInstr. Could be either an
   // array of function pointers, or a jump table.
   uint64_t ArrayStart = 0;
@@ -811,7 +814,7 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
 
   IndirectBranchType BranchType = BC.MIB->analyzeIndirectBranch(
       Instruction, Begin, Instructions.end(), PtrSize, MemLocInstr, BaseRegNum,
-      IndexRegNum, DispValue, DispExpr, PCRelBaseInstr);
+      IndexRegNum, DispValue, DispExpr, PCRelBaseInstr, FixedEntryLoadInstr);
 
   if (BranchType == IndirectBranchType::UNKNOWN && !MemLocInstr)
     return BranchType;
@@ -877,6 +880,43 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
   if (BaseRegNum == BC.MRI->getProgramCounter())
     ArrayStart += getAddress() + Offset + Size;
 
+  if (FixedEntryLoadInstr) {
+    assert(BranchType == IndirectBranchType::POSSIBLE_PIC_FIXED_BRANCH &&
+           "Invalid IndirectBranch type");
+    MCInst::iterator FixedEntryDispOperand =
+        BC.MIB->getMemOperandDisp(*FixedEntryLoadInstr);
+    assert(FixedEntryDispOperand != FixedEntryLoadInstr->end() &&
+           "Invalid memory instruction");
+    const MCExpr *FixedEntryDispExpr = FixedEntryDispOperand->getExpr();
+    const uint64_t EntryAddress = getExprValue(FixedEntryDispExpr);
+    uint64_t EntrySize = BC.getJumpTableEntrySize(JumpTable::JTT_PIC);
+    ErrorOr<int64_t> Value =
+        BC.getSignedValueAtAddress(EntryAddress, EntrySize);
+    if (!Value)
+      return IndirectBranchType::UNKNOWN;
+
+    BC.outs() << "BOLT-INFO: fixed PIC indirect branch detected in " << *this
+              << " at 0x" << Twine::utohexstr(getAddress() + Offset)
+              << " referencing data at 0x" << Twine::utohexstr(EntryAddress)
+              << " the destination value is 0x"
+              << Twine::utohexstr(ArrayStart + *Value) << '\n';
+
+    TargetAddress = ArrayStart + *Value;
+
+    // Remove spurious JumpTable at EntryAddress caused by PIC reference from
+    // the load instruction.
+    BC.deleteJumpTable(EntryAddress);
+
+    // Replace FixedEntryDispExpr used in target address calculation with outer
+    // jump table reference.
+    JumpTable *JT = BC.getJumpTableContainingAddress(ArrayStart);
+    assert(JT && "Must have a containing jump table for PIC fixed branch");
+    BC.MIB->replaceMemOperandDisp(*FixedEntryLoadInstr, JT->getFirstLabel(),
+                                  EntryAddress - ArrayStart, &*BC.Ctx);
+
+    return BranchType;
+  }
+
   LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x"
                     << Twine::utohexstr(ArrayStart) << '\n');
 
@@ -1126,6 +1166,7 @@ void BinaryFunction::handleIndirectBranch(MCInst &Instruction, uint64_t Size,
   }
   case IndirectBranchType::POSSIBLE_JUMP_TABLE:
   case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE:
+  case IndirectBranchType::POSSIBLE_PIC_FIXED_BRANCH:
     if (opts::JumpTables == JTS_NONE)
       IsSimple = false;
     break;
@@ -1878,9 +1919,11 @@ bool BinaryFunction::postProcessIndirectBranches(
         int64_t DispValue;
         const MCExpr *DispExpr;
         MCInst *PCRelBaseInstr;
+        MCInst *FixedEntryLoadInstr;
         IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
             Instr, BB.begin(), II, PtrSize, MemLocInstr, BaseRegNum,
-            IndexRegNum, DispValue, DispExpr, PCRelBaseInstr);
+            IndexRegNum, DispValue, DispExpr, PCRelBaseInstr,
+            FixedEntryLoadInstr);
         if (Type != IndirectBranchType::UNKNOWN || MemLocInstr != nullptr)
           continue;
 
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index b0f550fd77318..69cfd58a1df04 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -78,7 +78,7 @@ static void addStringHelper(DebugStrOffsetsWriter &StrOffstsWriter,
   uint32_t NewOffset = StrWriter.addString(Str);
   if (Unit.getVersion() >= 5) {
     StrOffstsWriter.updateAddressMap(DIEAttrInfo.getDIEInteger().getValue(),
-                                     NewOffset);
+                                     NewOffset, Unit);
     return;
   }
   DIEBldr.replaceValue(&Die, DIEAttrInfo.getAttribute(), DIEAttrInfo.getForm(),
@@ -461,17 +461,11 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx,
   return nullptr;
 }
 
-uint32_t
-DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
-                         std::optional<BOLTDWARF5AccelTableData *> Parent,
-                         uint32_t NumberParentsInChain, uint32_t &CurOffset) {
+uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
+                                  uint32_t &CurOffset) {
   getState().DWARFDieAddressesParsed.erase(Die.getOffset());
   uint32_t CurSize = 0;
   Die.setOffset(CurOffset);
-  std::optional<BOLTDWARF5AccelTableData *> NameEntry =
-      DebugNamesTable.addAccelTableEntry(
-          CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
-          NumberParentsInChain, Parent);
   // It is possible that an indexed debugging information entry has a parent
   // that is not indexed (for example, if its parent does not have a name
   // attribute). In such a case, a parent attribute may point to a nameless
@@ -485,18 +479,13 @@ DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
   // If Parent is nullopt and NumberParentsInChain is not zero, then forward
   // declaration was encountered in this DF traversal. Propagating nullopt for
   // Parent to children.
-  if (!Parent && NumberParentsInChain)
-    NameEntry = std::nullopt;
-  if (NameEntry)
-    ++NumberParentsInChain;
   for (DIEValue &Val : Die.values())
     CurSize += Val.sizeOf(CU.getFormParams());
   CurSize += getULEB128Size(Die.getAbbrevNumber());
   CurOffset += CurSize;
 
   for (DIE &Child : Die.children()) {
-    uint32_t ChildSize =
-        finalizeDIEs(CU, Child, NameEntry, NumberParentsInChain, CurOffset);
+    uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset);
     CurSize += ChildSize;
   }
   // for children end mark.
@@ -514,10 +503,9 @@ void DIEBuilder::finish() {
     DIE *UnitDIE = getUnitDIEbyUnit(CU);
     uint32_t HeaderSize = CU.getHeaderSize();
     uint32_t CurOffset = HeaderSize;
-    DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
     std::vector<std::optional<BOLTDWARF5AccelTableData *>> Parents;
     Parents.push_back(std::nullopt);
-    finalizeDIEs(CU, *UnitDIE, std::nullopt, 0, CurOffset);
+    finalizeDIEs(CU, *UnitDIE, CurOffset);
 
     DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
     CurUnitInfo.UnitOffset = UnitStartOffset;
@@ -548,6 +536,48 @@ void DIEBuilder::finish() {
       dbgs() << Twine::utohexstr(Address) << "\n";
     }
   }
+}
+
+void DIEBuilder::populateDebugNamesTable(
+    DWARFUnit &CU, const DIE &Die,
+    std::optional<BOLTDWARF5AccelTableData *> Parent,
+    uint32_t NumberParentsInChain) {
+  std::optional<BOLTDWARF5AccelTableData *> NameEntry =
+      DebugNamesTable.addAccelTableEntry(
+          CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
+          NumberParentsInChain, Parent);
+  if (!Parent && NumberParentsInChain)
+    NameEntry = std::nullopt;
+  if (NameEntry)
+    ++NumberParentsInChain;
+
+  for (const DIE &Child : Die.children())
+    populateDebugNamesTable(CU, Child, NameEntry, NumberParentsInChain);
+}
+
+void DIEBuilder::updateDebugNamesTable() {
+  auto finalizeDebugNamesTableForCU = [&](DWARFUnit &CU,
+                                          uint64_t &UnitStartOffset) -> void {
+    DIE *UnitDIE = getUnitDIEbyUnit(CU);
+    DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
+    populateDebugNamesTable(CU, *UnitDIE, std::nullopt, 0);
+
+    DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
+    UnitStartOffset += CurUnitInfo.UnitLength;
+  };
+
+  uint64_t TypeUnitStartOffset = 0;
+  for (DWARFUnit *CU : getState().DUList) {
+    if (!(CU->getVersion() < 5 && CU->isTypeUnit()))
+      break;
+    finalizeDebugNamesTableForCU(*CU, TypeUnitStartOffset);
+  }
+
+  for (DWARFUnit *CU : getState().DUList) {
+    if (CU->getVersion() < 5 && CU->isTypeUnit())
+      continue;
+    finalizeDebugNamesTableForCU(*CU, DebugNamesUnitSize);
+  }
   updateReferences();
 }
 
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 002f58c474346..bd8aa807f1aae 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -851,7 +851,11 @@ void DebugStrOffsetsWriter::initialize(DWARFUnit &Unit) {
         StrOffsetsSection.Data.data() + Contr->Base + Offset));
 }
 
-void DebugStrOffsetsWriter::updateAddressMap(uint32_t Index, uint32_t Address) {
+void DebugStrOffsetsWriter::updateAddressMap(uint32_t Index, uint32_t Address,
+                                             const DWARFUnit &Unit) {
+  assert(DebugStrOffsetFinalized.count(Unit.getOffset()) == 0 &&
+         "Cannot update address map since debug_str_offsets was already "
+         "finalized for this CU.");
   IndexToAddressMap[Index] = Address;
   StrOffsetSectionWasModified = true;
 }
@@ -906,6 +910,8 @@ void DebugStrOffsetsWriter::finalizeSection(DWARFUnit &Unit,
   }
 
   StrOffsetSectionWasModified = false;
+  assert(DebugStrOffsetFinalized.insert(Unit.getOffset()).second &&
+         "debug_str_offsets was already finalized for this CU.");
   clear();
 }
 
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 82bddf76d5b8c..6a46a49a983d8 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -207,7 +207,7 @@ Error BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
                "BOLT-ERROR: cannot find landing pad fragment");
         BC.addInterproceduralReference(this, Fragment->getAddress());
         BC.processInterproceduralReferences();
-        assert(isParentOrChildOf(*Fragment) &&
+        assert(BC.areRelatedFragments(this, Fragment) &&
                "BOLT-ERROR: cannot have landing pads in different functions");
         setHasIndirectTargetToSplitFragment(true);
         BC.addFragmentsToSkip(this);
diff --git a/bolt/lib/Core/GDBIndex.cpp b/bolt/lib/Core/GDBIndex.cpp
index 9e6d24167d559..c7fb4889646b4 100644
--- a/bolt/lib/Core/GDBIndex.cpp
+++ b/bolt/lib/Core/GDBIndex.cpp
@@ -23,7 +23,6 @@ void GDBIndex::updateGdbIndexSection(
     DebugARangesSectionWriter &ARangesSectionWriter) {
   if (!BC.getGdbIndexSection())
     return;
-
   // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
   // for .gdb_index section format.
 
@@ -141,7 +140,7 @@ void GDBIndex::updateGdbIndexSection(
     write64le(Buffer + 8, CUInfo.second.Length + 4);
     Buffer += 16;
   }
-
+  sortGDBIndexTUEntryVector();
   // Rewrite TU CU List, since abbrevs can be different.
   // Entry example:
   // 0: offset = 0x00000000, type_offset = 0x0000001e, type_signature =
diff --git a/bolt/lib/Passes/AsmDump.cpp b/bolt/lib/Passes/AsmDump.cpp
index 0a12eae8b5f7f..97f985d56ce64 100644
--- a/bolt/lib/Passes/AsmDump.cpp
+++ b/bolt/lib/Passes/AsmDump.cpp
@@ -139,11 +139,8 @@ void dumpFunction(const BinaryFunction &BF) {
   auto FOut = std::make_unique<formatted_raw_ostream>(OS);
   FOut->SetUnbuffered();
   std::unique_ptr<MCStreamer> AsmStreamer(
-      createAsmStreamer(*LocalCtx, std::move(FOut),
-                        /*isVerboseAsm=*/true,
-                        /*useDwarfDirectory=*/false, InstructionPrinter,
-                        std::move(MCEInstance.MCE), std::move(MAB),
-                        /*ShowInst=*/false));
+      createAsmStreamer(*LocalCtx, std::move(FOut), InstructionPrinter,
+                        std::move(MCEInstance.MCE), std::move(MAB)));
   AsmStreamer->initSections(true, *BC.STI);
   std::unique_ptr<TargetMachine> TM(BC.TheTarget->createTargetMachine(
       BC.TripleName, "", "", TargetOptions(), std::nullopt));
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index a73eb867b3777..2b5a591f4c7a2 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -386,13 +386,15 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
   JumpTableInfoType HotTargets;
   MCInst *MemLocInstr;
   MCInst *PCRelBaseOut;
+  MCInst *FixedEntryLoadInstr;
   unsigned BaseReg, IndexReg;
   int64_t DispValue;
   const MCExpr *DispExpr;
   MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
   const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
       CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
-      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);
+      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut,
+      FixedEntryLoadInstr);
 
   assert(MemLocInstr && "There should always be a load for jump tables");
   if (!MemLocInstr)
diff --git a/bolt/lib/Profile/CMakeLists.txt b/bolt/lib/Profile/CMakeLists.txt
index 39708baac4036..9aa4ba0490b0f 100644
--- a/bolt/lib/Profile/CMakeLists.txt
+++ b/bolt/lib/Profile/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMBOLTProfile
 
   LINK_COMPONENTS
   Demangle
+  MC
   Support
   TransformUtils
   )
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index ce6ec0a04ac16..a300e5b2b1dab 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -88,6 +88,7 @@ MaxSamples("max-samples",
   cl::cat(AggregatorCategory));
 
 extern cl::opt<opts::ProfileFormatKind> ProfileFormat;
+extern cl::opt<bool> ProfileUsePseudoProbes;
 extern cl::opt<std::string> SaveProfile;
 
 cl::opt<bool> ReadPreAggregated(
@@ -2298,6 +2299,9 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
 
   yaml::bolt::BinaryProfile BP;
 
+  const MCPseudoProbeDecoder *PseudoProbeDecoder =
+      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
+
   // Fill out the header info.
   BP.Header.Version = 1;
   BP.Header.FileName = std::string(BC.getFilename());
@@ -2398,6 +2402,33 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
         const unsigned BlockIndex = BlockMap.getBBIndex(BI.To.Offset);
         YamlBF.Blocks[BlockIndex].ExecCount += BI.Branches;
       }
+      if (PseudoProbeDecoder) {
+        if ((YamlBF.GUID = BF->getGUID())) {
+          const MCPseudoProbeFuncDesc *FuncDesc =
+              PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID);
+          YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash;
+        }
+        // Fetch probes belonging to all fragments
+        const AddressProbesMap &ProbeMap =
+            PseudoProbeDecoder->getAddress2ProbesMap();
+        BinaryFunction::FragmentsSetTy Fragments(BF->Fragments);
+        Fragments.insert(BF);
+        for (const BinaryFunction *F : Fragments) {
+          const uint64_t FuncAddr = F->getAddress();
+          const auto &FragmentProbes =
+              llvm::make_range(ProbeMap.lower_bound(FuncAddr),
+                               ProbeMap.lower_bound(FuncAddr + F->getSize()));
+          for (const auto &[OutputAddress, Probes] : FragmentProbes) {
+            const uint32_t InputOffset = BAT->translate(
+                FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
+            const unsigned BlockIndex = getBlock(InputOffset).second;
+            for (const MCDecodedPseudoProbe &Probe : Probes)
+              YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back(
+                  yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(),
+                                              Probe.getType()});
+          }
+        }
+      }
       // Drop blocks without a hash, won't be useful for stale matching.
       llvm::erase_if(YamlBF.Blocks,
                      [](const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index d9a6998bc5e9f..3eca5e972fa5b 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -41,15 +41,87 @@ llvm::cl::opt<bool>
     MatchProfileWithFunctionHash("match-profile-with-function-hash",
                                  cl::desc("Match profile with function hash"),
                                  cl::Hidden, cl::cat(BoltOptCategory));
+llvm::cl::opt<bool>
+    MatchWithCallGraph("match-with-call-graph",
+                       cl::desc("Match functions with call graph"), cl::Hidden,
+                       cl::cat(BoltOptCategory));
 
 llvm::cl::opt<bool> ProfileUseDFS("profile-use-dfs",
                                   cl::desc("use DFS order for YAML profile"),
                                   cl::Hidden, cl::cat(BoltOptCategory));
+
+llvm::cl::opt<bool> ProfileUsePseudoProbes(
+    "profile-use-pseudo-probes",
+    cl::desc("Use pseudo probes for profile generation and matching"),
+    cl::Hidden, cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
 namespace bolt {
 
+YAMLProfileReader::CallGraphMatcher::CallGraphMatcher(
+    BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP,
+    ProfileLookupMap &IdToYAMLBF) {
+  constructBFCG(BC, YamlBP);
+  constructYAMLFCG(YamlBP, IdToYAMLBF);
+  computeBFNeighborHashes(BC);
+}
+
+void YAMLProfileReader::CallGraphMatcher::constructBFCG(
+    BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP) {
+  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
+    for (const BinaryBasicBlock &BB : BF->blocks()) {
+      for (const MCInst &Instr : BB) {
+        if (!BC.MIB->isCall(Instr))
+          continue;
+        const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr);
+        if (!CallSymbol)
+          continue;
+        BinaryData *BD = BC.getBinaryDataByName(CallSymbol->getName());
+        if (!BD)
+          continue;
+        BinaryFunction *CalleeBF = BC.getFunctionForSymbol(BD->getSymbol());
+        if (!CalleeBF)
+          continue;
+
+        BFAdjacencyMap[CalleeBF].insert(BF);
+        BFAdjacencyMap[BF].insert(CalleeBF);
+      }
+    }
+  }
+}
+
+void YAMLProfileReader::CallGraphMatcher::computeBFNeighborHashes(
+    BinaryContext &BC) {
+  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
+    auto It = BFAdjacencyMap.find(BF);
+    if (It == BFAdjacencyMap.end())
+      continue;
+    auto &AdjacentBFs = It->second;
+    std::string HashStr;
+    for (BinaryFunction *BF : AdjacentBFs)
+      HashStr += BF->getOneName();
+    uint64_t Hash = std::hash<std::string>{}(HashStr);
+    NeighborHashToBFs[Hash].push_back(BF);
+  }
+}
+
+void YAMLProfileReader::CallGraphMatcher::constructYAMLFCG(
+    yaml::bolt::BinaryProfile &YamlBP, ProfileLookupMap &IdToYAMLBF) {
+
+  for (auto &CallerYamlBF : YamlBP.Functions) {
+    for (auto &YamlBB : CallerYamlBF.Blocks) {
+      for (auto &CallSite : YamlBB.CallSites) {
+        auto IdToYAMLBFIt = IdToYAMLBF.find(CallSite.DestId);
+        if (IdToYAMLBFIt == IdToYAMLBF.end())
+          continue;
+        YamlBFAdjacencyMap[&CallerYamlBF].insert(IdToYAMLBFIt->second);
+        YamlBFAdjacencyMap[IdToYAMLBFIt->second].insert(&CallerYamlBF);
+      }
+    }
+  }
+}
+
 bool YAMLProfileReader::isYAML(const StringRef Filename) {
   if (auto MB = MemoryBuffer::getFileOrSTDIN(Filename)) {
     StringRef Buffer = (*MB)->getBuffer();
@@ -350,7 +422,7 @@ bool YAMLProfileReader::profileMatches(
 }
 
 bool YAMLProfileReader::mayHaveProfileData(const BinaryFunction &BF) {
-  if (opts::MatchProfileWithFunctionHash)
+  if (opts::MatchProfileWithFunctionHash || opts::MatchWithCallGraph)
     return true;
   for (StringRef Name : BF.getNames())
     if (ProfileFunctionNames.contains(Name))
@@ -446,6 +518,79 @@ size_t YAMLProfileReader::matchWithLTOCommonName() {
   return MatchedWithLTOCommonName;
 }
 
+size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
+  if (!opts::MatchWithCallGraph)
+    return 0;
+
+  size_t MatchedWithCallGraph = 0;
+  CallGraphMatcher CGMatcher(BC, YamlBP, IdToYamLBF);
+
+  ItaniumPartialDemangler Demangler;
+  auto GetBaseName = [&](std::string &FunctionName) {
+    if (Demangler.partialDemangle(FunctionName.c_str()))
+      return std::string("");
+    size_t BufferSize = 1;
+    char *Buffer = static_cast<char *>(std::malloc(BufferSize));
+    char *BaseName = Demangler.getFunctionBaseName(Buffer, &BufferSize);
+    if (!BaseName) {
+      std::free(Buffer);
+      return std::string("");
+    }
+    if (Buffer != BaseName)
+      Buffer = BaseName;
+    std::string BaseNameStr(Buffer, BufferSize);
+    std::free(Buffer);
+    return BaseNameStr;
+  };
+
+  // Matches YAMLBF to BFs with neighbor hashes.
+  for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
+    if (YamlBF.Used)
+      continue;
+    auto AdjacentYamlBFsOpt = CGMatcher.getAdjacentYamlBFs(YamlBF);
+    if (!AdjacentYamlBFsOpt)
+      continue;
+    std::set<yaml::bolt::BinaryFunctionProfile *> AdjacentYamlBFs =
+        AdjacentYamlBFsOpt.value();
+    std::string AdjacentYamlBFsHashStr;
+    for (auto *AdjacentYamlBF : AdjacentYamlBFs)
+      AdjacentYamlBFsHashStr += AdjacentYamlBF->Name;
+    uint64_t Hash = std::hash<std::string>{}(AdjacentYamlBFsHashStr);
+    auto BFsWithSameHashOpt = CGMatcher.getBFsWithNeighborHash(Hash);
+    if (!BFsWithSameHashOpt)
+      continue;
+    std::vector<BinaryFunction *> BFsWithSameHash = BFsWithSameHashOpt.value();
+    // Finds the binary function with the longest common prefix to the profiled
+    // function and matches.
+    BinaryFunction *ClosestBF = nullptr;
+    size_t LCP = 0;
+    std::string YamlBFBaseName = GetBaseName(YamlBF.Name);
+    for (BinaryFunction *BF : BFsWithSameHash) {
+      if (ProfiledFunctions.count(BF))
+        continue;
+      std::string BFName = std::string(BF->getOneName());
+      std::string BFBaseName = GetBaseName(BFName);
+      size_t PrefixLength = 0;
+      size_t N = std::min(YamlBFBaseName.size(), BFBaseName.size());
+      for (size_t I = 0; I < N; ++I) {
+        if (YamlBFBaseName[I] != BFBaseName[I])
+          break;
+        ++PrefixLength;
+      }
+      if (PrefixLength >= LCP) {
+        LCP = PrefixLength;
+        ClosestBF = BF;
+      }
+    }
+    if (ClosestBF) {
+      matchProfileToFunction(YamlBF, *ClosestBF);
+      ++MatchedWithCallGraph;
+    }
+  }
+
+  return MatchedWithCallGraph;
+}
+
 size_t YAMLProfileReader::matchWithNameSimilarity(BinaryContext &BC) {
   if (opts::NameSimilarityFunctionMatchingThreshold == 0)
     return 0;
@@ -581,9 +726,14 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
     }
   }
 
+  // Map profiled function ids to names.
+  for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
+    IdToYamLBF[YamlBF.Id] = &YamlBF;
+
   const size_t MatchedWithExactName = matchWithExactName();
   const size_t MatchedWithHash = matchWithHash(BC);
   const size_t MatchedWithLTOCommonName = matchWithLTOCommonName();
+  const size_t MatchedWithCallGraph = matchWithCallGraph(BC);
   const size_t MatchedWithNameSimilarity = matchWithNameSimilarity(BC);
 
   for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs))
@@ -603,6 +753,8 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
            << " functions with hash\n";
     outs() << "BOLT-INFO: matched " << MatchedWithLTOCommonName
            << " functions with matching LTO common names\n";
+    outs() << "BOLT-INFO: matched " << MatchedWithCallGraph
+           << " functions with call graph\n";
     outs() << "BOLT-INFO: matched " << MatchedWithNameSimilarity
            << " functions with similar names\n";
   }
@@ -610,11 +762,6 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
   // Set for parseFunctionProfile().
   NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions");
   NormalizeByCalls = usesEvent("branches");
-
-  // Map profiled function ids to names.
-  for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
-    IdToYamLBF[YamlBF.Id] = &YamlBF;
-
   uint64_t NumUnused = 0;
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
     if (YamlBF.Id >= YamlProfileToFunction.size()) {
@@ -630,7 +777,8 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
 
   BC.setNumUnusedProfiledObjects(NumUnused);
 
-  if (opts::Lite && opts::MatchProfileWithFunctionHash) {
+  if (opts::Lite &&
+      (opts::MatchProfileWithFunctionHash || opts::MatchWithCallGraph)) {
     for (BinaryFunction *BF : BC.getAllBinaryFunctions())
       if (!BF->hasProfile())
         BF->setIgnored();
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 9adbfdc5ff089..84777741d611a 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -22,6 +22,7 @@
 
 namespace opts {
 extern llvm::cl::opt<bool> ProfileUseDFS;
+extern llvm::cl::opt<bool> ProfileUsePseudoProbes;
 } // namespace opts
 
 namespace llvm {
@@ -57,6 +58,8 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
                            const BoltAddressTranslation *BAT) {
   yaml::bolt::BinaryFunctionProfile YamlBF;
   const BinaryContext &BC = BF.getBinaryContext();
+  const MCPseudoProbeDecoder *PseudoProbeDecoder =
+      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
 
   const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
 
@@ -69,6 +72,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   YamlBF.Hash = BF.getHash();
   YamlBF.NumBasicBlocks = BF.size();
   YamlBF.ExecCount = BF.getKnownExecutionCount();
+  if (PseudoProbeDecoder) {
+    if ((YamlBF.GUID = BF.getGUID())) {
+      const MCPseudoProbeFuncDesc *FuncDesc =
+          PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID);
+      YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash;
+    }
+  }
 
   BinaryFunction::BasicBlockOrderType Order;
   llvm::copy(UseDFS ? BF.dfs() : BF.getLayout().blocks(),
@@ -177,6 +187,21 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
       ++BranchInfo;
     }
 
+    if (PseudoProbeDecoder) {
+      const AddressProbesMap &ProbeMap =
+          PseudoProbeDecoder->getAddress2ProbesMap();
+      const uint64_t FuncAddr = BF.getAddress();
+      const std::pair<uint64_t, uint64_t> &BlockRange =
+          BB->getInputAddressRange();
+      const auto &BlockProbes =
+          llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first),
+                           ProbeMap.lower_bound(FuncAddr + BlockRange.second));
+      for (const auto &[_, Probes] : BlockProbes)
+        for (const MCDecodedPseudoProbe &Probe : Probes)
+          YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{
+              Probe.getGuid(), Probe.getIndex(), Probe.getType()});
+    }
+
     YamlBF.Blocks.emplace_back(YamlBB);
   }
   return YamlBF;
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index aaa0e1ff4d46f..5dfef0b71cc79 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -263,6 +263,10 @@ static cl::opt<bool> CMOVConversionFlag("cmov-conversion",
                                         cl::ReallyHidden,
                                         cl::cat(BoltOptCategory));
 
+static cl::opt<bool> ShortenInstructions("shorten-instructions",
+                                         cl::desc("shorten instructions"),
+                                         cl::init(true),
+                                         cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
@@ -378,7 +382,8 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   else if (opts::Hugify)
     Manager.registerPass(std::make_unique<HugePage>(NeverPrint));
 
-  Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint));
+  Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint),
+                       opts::ShortenInstructions);
 
   Manager.registerPass(std::make_unique<RemoveNops>(NeverPrint),
                        !opts::KeepNops);
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 34993af2623bf..5d114925f59b0 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -1,7 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   Core
   DebugInfoDWARF
-  DWP
   JITLink
   MC
   Object
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 042c39a574561..98f81f44d6490 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -56,6 +57,8 @@
 #undef DEBUG_TYPE
 #define DEBUG_TYPE "bolt"
 
+static mc::RegisterMCTargetOptionsFlags MOF;
+
 static void printDie(const DWARFDie &DIE) {
   DIDumpOptions DumpOpts;
   DumpOpts.ShowForm = true;
@@ -326,22 +329,10 @@ static cl::opt<bool> KeepARanges(
         "keep or generate .debug_aranges section if .gdb_index is written"),
     cl::Hidden, cl::cat(BoltCategory));
 
-static cl::opt<bool> DeterministicDebugInfo(
-    "deterministic-debuginfo",
-    cl::desc("disables parallel execution of tasks that may produce "
-             "nondeterministic debug info"),
-    cl::init(true), cl::cat(BoltCategory));
-
 static cl::opt<std::string> DwarfOutputPath(
     "dwarf-output-path",
-    cl::desc("Path to where .dwo files or dwp file will be written out to."),
-    cl::init(""), cl::cat(BoltCategory));
-
-static cl::opt<bool>
-    WriteDWP("write-dwp",
-             cl::desc("output a single dwarf package file (dwp) instead of "
-                      "multiple non-relocatable dwarf object files (dwo)."),
-             cl::init(false), cl::cat(BoltCategory));
+    cl::desc("Path to where .dwo files will be written out to."), cl::init(""),
+    cl::cat(BoltCategory));
 
 static cl::opt<bool> CreateDebugNames(
     "create-debug-names-section",
@@ -473,23 +464,19 @@ createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
   return Streamer;
 }
 
-static DWARFRewriter::UnitMeta
-emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer, DWARFUnit &Unit) {
+static void emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer,
+                     DWARFUnit &Unit) {
   DIE *UnitDIE = DIEBldr.getUnitDIEbyUnit(Unit);
-  const DIEBuilder::DWARFUnitInfo &U = DIEBldr.getUnitInfoByDwarfUnit(Unit);
   Streamer.emitUnit(Unit, *UnitDIE);
-  uint64_t TypeHash = 0;
-  if (DWARFTypeUnit *DTU = dyn_cast_or_null<DWARFTypeUnit>(&Unit))
-    TypeHash = DTU->getTypeHash();
-  return {U.UnitOffset, U.UnitLength, TypeHash};
 }
 
-static void
-emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
-               DWARFRewriter &Rewriter, DWARFUnit &SplitCU, DWARFUnit &CU,
-               DWARFRewriter::DWPState &State, DebugLocWriter &LocWriter,
-               DebugStrOffsetsWriter &StrOffstsWriter,
-               DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection) {
+static void emitDWOBuilder(const std::string &DWOName,
+                           DIEBuilder &DWODIEBuilder, DWARFRewriter &Rewriter,
+                           DWARFUnit &SplitCU, DWARFUnit &CU,
+                           DebugLocWriter &LocWriter,
+                           DebugStrOffsetsWriter &StrOffstsWriter,
+                           DebugStrWriter &StrWriter,
+                           GDBIndex &GDBIndexSection) {
   // Populate debug_info and debug_abbrev for current dwo into StringRef.
   DWODIEBuilder.generateAbbrevs();
   DWODIEBuilder.finish();
@@ -502,28 +489,22 @@ emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
   std::unique_ptr<DIEStreamer> Streamer =
       createDIEStreamer(*TheTriple, *ObjOS, "DwoStreamerInitAug2",
                         DWODIEBuilder, GDBIndexSection);
-  DWARFRewriter::UnitMetaVectorType TUMetaVector;
-  DWARFRewriter::UnitMeta CUMI = {0, 0, 0};
   if (SplitCU.getContext().getMaxDWOVersion() >= 5) {
     for (std::unique_ptr<llvm::DWARFUnit> &CU :
          SplitCU.getContext().dwo_info_section_units()) {
       if (!CU->isTypeUnit())
         continue;
-      DWARFRewriter::UnitMeta MI =
-          emitUnit(DWODIEBuilder, *Streamer, *CU.get());
-      TUMetaVector.emplace_back(MI);
+      emitUnit(DWODIEBuilder, *Streamer, *CU.get());
     }
-    CUMI = emitUnit(DWODIEBuilder, *Streamer, SplitCU);
+    emitUnit(DWODIEBuilder, *Streamer, SplitCU);
   } else {
     for (std::unique_ptr<llvm::DWARFUnit> &CU :
          SplitCU.getContext().dwo_compile_units())
       emitUnit(DWODIEBuilder, *Streamer, *CU.get());
 
     // emit debug_types sections for dwarf4
-    for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector()) {
-      DWARFRewriter::UnitMeta MI = emitUnit(DWODIEBuilder, *Streamer, *CU);
-      TUMetaVector.emplace_back(MI);
-    }
+    for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector())
+      emitUnit(DWODIEBuilder, *Streamer, *CU);
   }
 
   Streamer->emitAbbrevs(DWODIEBuilder.getAbbrevs(),
@@ -550,12 +531,8 @@ emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
       continue;
     OverriddenSections[Kind] = Contents;
   }
-  if (opts::WriteDWP)
-    Rewriter.updateDWP(CU, OverriddenSections, CUMI, TUMetaVector, State,
-                       LocWriter, StrOffstsWriter, StrWriter);
-  else
-    Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter,
-                           StrOffstsWriter, StrWriter);
+  Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter,
+                         StrOffstsWriter, StrWriter);
 }
 
 using DWARFUnitVec = std::vector<DWARFUnit *>;
@@ -607,11 +584,6 @@ void DWARFRewriter::updateDebugInfo() {
   StrWriter = std::make_unique<DebugStrWriter>(*BC.DwCtx, false);
   StrOffstsWriter = std::make_unique<DebugStrOffsetsWriter>(BC);
 
-  if (!opts::DeterministicDebugInfo) {
-    opts::DeterministicDebugInfo = true;
-    errs() << "BOLT-WARNING: --deterministic-debuginfo is being deprecated\n";
-  }
-
   /// Stores and serializes information that will be put into the
   /// .debug_addr DWARF section.
   std::unique_ptr<DebugAddrWriter> FinalAddrWriter;
@@ -631,8 +603,8 @@ void DWARFRewriter::updateDebugInfo() {
   uint32_t CUIndex = 0;
   std::mutex AccessMutex;
   // Needs to be invoked in the same order as CUs are processed.
-  auto createRangeLocListAddressWriters =
-      [&](DWARFUnit &CU) -> DebugLocWriter * {
+  llvm::DenseMap<uint64_t, uint64_t> LocListWritersIndexByCU;
+  auto createRangeLocListAddressWriters = [&](DWARFUnit &CU) {
     std::lock_guard<std::mutex> Lock(AccessMutex);
     const uint16_t DwarfVersion = CU.getVersion();
     if (DwarfVersion >= 5) {
@@ -652,7 +624,6 @@ void DWARFRewriter::updateDebugInfo() {
         RangeListsWritersByCU[*DWOId] = std::move(DWORangeListsSectionWriter);
       }
       AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
-
     } else {
       auto AddrW =
           std::make_unique<DebugAddrWriter>(&BC, CU.getAddressByteSize());
@@ -668,84 +639,68 @@ void DWARFRewriter::updateDebugInfo() {
             std::move(LegacyRangesSectionWriterByCU);
       }
     }
-    return LocListWritersByCU[CUIndex++].get();
+    LocListWritersIndexByCU[CU.getOffset()] = CUIndex++;
   };
 
   DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
                                          *StrWriter);
   GDBIndex GDBIndexSection(BC);
-  DWPState State;
-  if (opts::WriteDWP)
-    initDWPState(State);
-  auto processUnitDIE = [&](DWARFUnit *Unit, DIEBuilder *DIEBlder) {
-    // Check if the unit is a skeleton and we need special updates for it and
-    // its matching split/DWO CU.
+  auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU,
+                            DIEBuilder &DIEBlder,
+                            DebugRangesSectionWriter &TempRangesSectionWriter,
+                            DebugAddrWriter &AddressWriter,
+                            const std::string &DWOName,
+                            const std::optional<std::string> &DwarfOutputPath,
+                            DIEBuilder &DWODIEBuilder) {
+    DWODIEBuilder.buildDWOUnit(SplitCU);
+    DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
+    DebugStrWriter DWOStrWriter((SplitCU).getContext(), true);
+    DWODIEBuilder.updateDWONameCompDirForTypes(
+        DWOStrOffstsWriter, DWOStrWriter, SplitCU, DwarfOutputPath, DWOName);
+    DebugLoclistWriter DebugLocDWoWriter(Unit, Unit.getVersion(), true,
+                                         AddressWriter);
+
+    updateUnitDebugInfo(SplitCU, DWODIEBuilder, DebugLocDWoWriter,
+                        TempRangesSectionWriter, AddressWriter);
+    DebugLocDWoWriter.finalize(DWODIEBuilder,
+                               *DWODIEBuilder.getUnitDIEbyUnit(SplitCU));
+    if (Unit.getVersion() >= 5)
+      TempRangesSectionWriter.finalizeSection();
+
+    emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit,
+                   DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+                   GDBIndexSection);
+  };
+  auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) {
     std::optional<DWARFUnit *> SplitCU;
     std::optional<uint64_t> RangesBase;
-    std::optional<uint64_t> DWOId = Unit->getDWOId();
+    std::optional<uint64_t> DWOId = Unit.getDWOId();
     if (DWOId)
       SplitCU = BC.getDWOCU(*DWOId);
-    DebugLocWriter *DebugLocWriter = createRangeLocListAddressWriters(*Unit);
-    DebugRangesSectionWriter *RangesSectionWriter =
-        Unit->getVersion() >= 5 ? RangeListsSectionWriter.get()
-                                : LegacyRangesSectionWriter.get();
-    DebugAddrWriter *AddressWriter =
-        AddressWritersByCU[Unit->getOffset()].get();
-    // Skipping CUs that failed to load.
-    if (SplitCU) {
-      DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
-                               Unit);
-      DWODIEBuilder.buildDWOUnit(**SplitCU);
-      std::string DWOName = "";
-      std::optional<std::string> DwarfOutputPath =
-          opts::DwarfOutputPath.empty()
-              ? std::nullopt
-              : std::optional<std::string>(opts::DwarfOutputPath.c_str());
-      {
-        std::lock_guard<std::mutex> Lock(AccessMutex);
-        DWOName = DIEBlder->updateDWONameCompDir(
-            *StrOffstsWriter, *StrWriter, *Unit, DwarfOutputPath, std::nullopt);
-      }
-      DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
-      DebugStrWriter DWOStrWriter((*SplitCU)->getContext(), true);
-      DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter,
-                                                 DWOStrWriter, **SplitCU,
-                                                 DwarfOutputPath, DWOName);
-      DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true,
-                                           *AddressWriter);
-      DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
-      if (Unit->getVersion() >= 5) {
-        TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
-      } else {
-        TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get();
-        RangesBase = RangesSectionWriter->getSectionOffset();
-        setDwoRangesBase(*DWOId, *RangesBase);
-      }
-
-      updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter,
-                          *TempRangesSectionWriter, *AddressWriter);
-      DebugLocDWoWriter.finalize(DWODIEBuilder,
-                                 *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU));
-      if (Unit->getVersion() >= 5)
-        TempRangesSectionWriter->finalizeSection();
-
-      emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
-                     DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
-                     GDBIndexSection);
-    }
-
-    if (Unit->getVersion() >= 5) {
-      RangesBase = RangesSectionWriter->getSectionOffset() +
+    DebugLocWriter &DebugLocWriter =
+        *LocListWritersByCU[LocListWritersIndexByCU[Unit.getOffset()]].get();
+    DebugRangesSectionWriter &RangesSectionWriter =
+        Unit.getVersion() >= 5 ? *RangeListsSectionWriter.get()
+                               : *LegacyRangesSectionWriter.get();
+    DebugAddrWriter &AddressWriter =
+        *AddressWritersByCU[Unit.getOffset()].get();
+    if (Unit.getVersion() >= 5)
+      RangeListsSectionWriter->setAddressWriter(&AddressWriter);
+    if (Unit.getVersion() >= 5) {
+      RangesBase = RangesSectionWriter.getSectionOffset() +
                    getDWARF5RngListLocListHeaderSize();
-      RangesSectionWriter->initSection(*Unit);
-      StrOffstsWriter->finalizeSection(*Unit, *DIEBlder);
+      RangesSectionWriter.initSection(Unit);
+      if (!SplitCU)
+        StrOffstsWriter->finalizeSection(Unit, DIEBlder);
+    } else if (SplitCU) {
+      RangesBase = LegacyRangesSectionWriter.get()->getSectionOffset();
     }
 
-    updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter,
-                        *AddressWriter, RangesBase);
-    DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
-    if (Unit->getVersion() >= 5)
-      RangesSectionWriter->finalizeSection();
+    updateUnitDebugInfo(Unit, DIEBlder, DebugLocWriter, RangesSectionWriter,
+                        AddressWriter, RangesBase);
+    DebugLocWriter.finalize(DIEBlder, *DIEBlder.getUnitDIEbyUnit(Unit));
+    if (Unit.getVersion() >= 5)
+      RangesSectionWriter.finalizeSection();
   };
 
   DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
@@ -760,32 +715,48 @@ void DWARFRewriter::updateDebugInfo() {
   CUOffsetMap OffsetMap =
       finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection);
 
-  const bool SingleThreadedMode =
-      opts::NoThreads || opts::DeterministicDebugInfo;
-  if (!SingleThreadedMode)
-    DIEBlder.buildCompileUnits();
-  if (SingleThreadedMode) {
-    CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
-    for (std::vector<DWARFUnit *> &Vec : PartVec) {
-      DIEBlder.buildCompileUnits(Vec);
-      for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
-        processUnitDIE(CU, &DIEBlder);
-      finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
-                           DIEBlder.getProcessedCUs(), *FinalAddrWriter);
+  CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
+  for (std::vector<DWARFUnit *> &Vec : PartVec) {
+    DIEBlder.buildCompileUnits(Vec);
+    llvm::SmallVector<std::unique_ptr<DIEBuilder>, 72> DWODIEBuildersByCU;
+    for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) {
+      createRangeLocListAddressWriters(*CU);
+      std::optional<DWARFUnit *> SplitCU;
+      std::optional<uint64_t> DWOId = CU->getDWOId();
+      if (DWOId)
+        SplitCU = BC.getDWOCU(*DWOId);
+      if (!SplitCU)
+        continue;
+      DebugAddrWriter &AddressWriter =
+          *AddressWritersByCU[CU->getOffset()].get();
+      DebugRangesSectionWriter *TempRangesSectionWriter =
+          CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get()
+                                : LegacyRangesWritersByCU[*DWOId].get();
+      std::optional<std::string> DwarfOutputPath =
+          opts::DwarfOutputPath.empty()
+              ? std::nullopt
+              : std::optional<std::string>(opts::DwarfOutputPath.c_str());
+      std::string DWOName = DIEBlder.updateDWONameCompDir(
+          *StrOffstsWriter, *StrWriter, *CU, DwarfOutputPath, std::nullopt);
+      auto DWODIEBuilderPtr = std::make_unique<DIEBuilder>(
+          BC, &(**SplitCU).getContext(), DebugNamesTable, CU);
+      DIEBuilder &DWODIEBuilder =
+          *DWODIEBuildersByCU.emplace_back(std::move(DWODIEBuilderPtr)).get();
+      if (CU->getVersion() >= 5)
+        StrOffstsWriter->finalizeSection(*CU, DIEBlder);
+      processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter,
+                     AddressWriter, DWOName, DwarfOutputPath, DWODIEBuilder);
     }
-  } else {
-    // Update unit debug info in parallel
-    ThreadPoolInterface &ThreadPool = ParallelUtilities::getThreadPool();
-    for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units())
-      ThreadPool.async(processUnitDIE, CU.get(), &DIEBlder);
-    ThreadPool.wait();
+    for (std::unique_ptr<DIEBuilder> &DWODIEBuilderPtr : DWODIEBuildersByCU)
+      DWODIEBuilderPtr->updateDebugNamesTable();
+    for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
+      processMainBinaryCU(*CU, DIEBlder);
+    finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
+                         DIEBlder.getProcessedCUs(), *FinalAddrWriter);
   }
 
   DebugNamesTable.emitAccelTable();
 
-  if (opts::WriteDWP)
-    finalizeDWP(State);
-
   finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS, OffsetMap,
                         *FinalAddrWriter);
   GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex,
@@ -1477,6 +1448,7 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
   // generate and populate abbrevs here
   DIEBlder.generateAbbrevs();
   DIEBlder.finish();
+  DIEBlder.updateDebugNamesTable();
   SmallVector<char, 20> OutBuffer;
   std::shared_ptr<raw_svector_ostream> ObjOS =
       std::make_shared<raw_svector_ostream>(OutBuffer);
@@ -1681,6 +1653,7 @@ void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder,
   }
   DIEBlder.generateAbbrevs();
   DIEBlder.finish();
+  DIEBlder.updateDebugNamesTable();
   // generate debug_info and CUMap
   for (DWARFUnit *CU : CUs) {
     emitUnit(DIEBlder, Streamer, *CU);
@@ -1831,220 +1804,6 @@ std::optional<StringRef> updateDebugData(
 
 } // namespace
 
-void DWARFRewriter::initDWPState(DWPState &State) {
-  SmallString<0> OutputNameStr;
-  StringRef OutputName;
-  if (opts::DwarfOutputPath.empty()) {
-    OutputName =
-        Twine(opts::OutputFilename).concat(".dwp").toStringRef(OutputNameStr);
-  } else {
-    StringRef ExeFileName = llvm::sys::path::filename(opts::OutputFilename);
-    OutputName = Twine(opts::DwarfOutputPath)
-                     .concat("/")
-                     .concat(ExeFileName)
-                     .concat(".dwp")
-                     .toStringRef(OutputNameStr);
-    errs() << "BOLT-WARNING: dwarf-output-path is in effect and .dwp file will "
-              "possibly be written to another location that is not the same as "
-              "the executable\n";
-  }
-  std::error_code EC;
-  State.Out =
-      std::make_unique<ToolOutputFile>(OutputName, EC, sys::fs::OF_None);
-  const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
-  State.TmpBC = createDwarfOnlyBC(*File);
-  State.Streamer = State.TmpBC->createStreamer(State.Out->os());
-  State.MCOFI = State.Streamer->getContext().getObjectFileInfo();
-  State.KnownSections = createKnownSectionsMap(*State.MCOFI);
-  MCSection *const StrSection = State.MCOFI->getDwarfStrDWOSection();
-
-  // Data Structures for DWP book keeping
-  // Size of array corresponds to the number of sections supported by DWO format
-  // in DWARF4/5.
-
-  State.Strings = std::make_unique<DWPStringPool>(*State.Streamer, StrSection);
-
-  // Setup DWP code once.
-  DWARFContext *DWOCtx = BC.getDWOContext();
-
-  if (DWOCtx) {
-    State.CUIndex = &DWOCtx->getCUIndex();
-    State.IsDWP = !State.CUIndex->getRows().empty();
-  }
-}
-
-void DWARFRewriter::finalizeDWP(DWPState &State) {
-  if (State.Version < 5) {
-    // Lie about there being no info contributions so the TU index only includes
-    // the type unit contribution for DWARF < 5. In DWARFv5 the TU index has a
-    // contribution to the info section, so we do not want to lie about it.
-    State.ContributionOffsets[0] = 0;
-  }
-  writeIndex(*State.Streamer.get(), State.MCOFI->getDwarfTUIndexSection(),
-             State.ContributionOffsets, State.TypeIndexEntries,
-             State.IndexVersion);
-
-  if (State.Version < 5) {
-    // Lie about the type contribution for DWARF < 5. In DWARFv5 the type
-    // section does not exist, so no need to do anything about this.
-    State.ContributionOffsets[getContributionIndex(DW_SECT_EXT_TYPES, 2)] = 0;
-    // Unlie about the info contribution
-    State.ContributionOffsets[0] = 1;
-  }
-  writeIndex(*State.Streamer.get(), State.MCOFI->getDwarfCUIndexSection(),
-             State.ContributionOffsets, State.IndexEntries, State.IndexVersion);
-
-  State.Streamer->finish();
-  State.Out->keep();
-}
-
-void DWARFRewriter::updateDWP(DWARFUnit &CU,
-                              const OverriddenSectionsMap &OverridenSections,
-                              const DWARFRewriter::UnitMeta &CUMI,
-                              DWARFRewriter::UnitMetaVectorType &TUMetaVector,
-                              DWPState &State, DebugLocWriter &LocWriter,
-                              DebugStrOffsetsWriter &StrOffstsWriter,
-                              DebugStrWriter &StrWriter) {
-  const uint64_t DWOId = *CU.getDWOId();
-  MCSection *const StrOffsetSection = State.MCOFI->getDwarfStrOffDWOSection();
-  assert(StrOffsetSection && "StrOffsetSection does not exist.");
-  // Skipping CUs that we failed to load.
-  std::optional<DWARFUnit *> DWOCU = BC.getDWOCU(DWOId);
-  if (!DWOCU)
-    return;
-
-  if (State.Version == 0) {
-    State.Version = CU.getVersion();
-    State.IndexVersion = State.Version < 5 ? 2 : 5;
-  } else if (State.Version != CU.getVersion()) {
-    errs() << "BOLT-ERROR: incompatible DWARF compile unit versions\n";
-    exit(1);
-  }
-
-  UnitIndexEntry CurEntry = {};
-  CurEntry.DWOName = dwarf::toString(
-      CU.getUnitDIE().find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
-      "");
-  const char *Name = CU.getUnitDIE().getShortName();
-  if (Name)
-    CurEntry.Name = Name;
-  StringRef CurStrSection;
-  StringRef CurStrOffsetSection;
-
-  // This maps each section contained in this file to its length.
-  // This information is later on used to calculate the contributions,
-  // i.e. offset and length, of each compile/type unit to a section.
-  std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLength;
-
-  const DWARFUnitIndex::Entry *CUDWOEntry = nullptr;
-  if (State.IsDWP)
-    CUDWOEntry = State.CUIndex->getFromHash(DWOId);
-
-  bool StrSectionWrittenOut = false;
-  const object::ObjectFile *DWOFile =
-      (*DWOCU)->getContext().getDWARFObj().getFile();
-
-  DebugRangeListsSectionWriter *RangeListssWriter = nullptr;
-  if (CU.getVersion() == 5) {
-    assert(RangeListsWritersByCU.count(DWOId) != 0 &&
-           "No RangeListsWriter for DWO ID.");
-    RangeListssWriter = RangeListsWritersByCU[DWOId].get();
-  }
-  auto AddType = [&](unsigned int Index, uint32_t IndexVersion, uint64_t Offset,
-                     uint64_t Length, uint64_t Hash) -> void {
-    UnitIndexEntry TUEntry = CurEntry;
-    if (IndexVersion < 5)
-      TUEntry.Contributions[0] = {};
-    TUEntry.Contributions[Index].setOffset(Offset);
-    TUEntry.Contributions[Index].setLength(Length);
-    State.ContributionOffsets[Index] +=
-        TUEntry.Contributions[Index].getLength32();
-    State.TypeIndexEntries.insert(std::make_pair(Hash, TUEntry));
-  };
-  std::unique_ptr<DebugBufferVector> StrOffsetsOutputData;
-  std::unique_ptr<DebugBufferVector> StrOutputData;
-  for (const SectionRef &Section : DWOFile->sections()) {
-    std::unique_ptr<DebugBufferVector> OutputData = nullptr;
-    StringRef SectionName = getSectionName(Section);
-    Expected<StringRef> ContentsExp = Section.getContents();
-    assert(ContentsExp && "Invalid contents.");
-    std::optional<StringRef> TOutData =
-        updateDebugData((*DWOCU)->getContext(), SectionName, *ContentsExp,
-                        State.KnownSections, *State.Streamer, *this, CUDWOEntry,
-                        DWOId, OutputData, RangeListssWriter, LocWriter,
-                        StrOffstsWriter, StrWriter, OverridenSections);
-    if (!TOutData)
-      continue;
-
-    StringRef OutData = *TOutData;
-    if (SectionName == "debug_types.dwo") {
-      State.Streamer->emitBytes(OutData);
-      continue;
-    }
-
-    if (SectionName == "debug_str.dwo") {
-      CurStrSection = OutData;
-      StrOutputData = std::move(OutputData);
-    } else {
-      // Since handleDebugDataPatching returned true, we already know this is
-      // a known section.
-      auto SectionIter = State.KnownSections.find(SectionName);
-      if (SectionIter->second.second == DWARFSectionKind::DW_SECT_STR_OFFSETS) {
-        CurStrOffsetSection = OutData;
-        StrOffsetsOutputData = std::move(OutputData);
-      } else {
-        State.Streamer->emitBytes(OutData);
-      }
-      unsigned int Index =
-          getContributionIndex(SectionIter->second.second, State.IndexVersion);
-      uint64_t Offset = State.ContributionOffsets[Index];
-      uint64_t Length = OutData.size();
-      if (CU.getVersion() >= 5 &&
-          SectionIter->second.second == DWARFSectionKind::DW_SECT_INFO) {
-        for (UnitMeta &MI : TUMetaVector)
-          MI.Offset += State.DebugInfoSize;
-
-        Offset = State.DebugInfoSize + CUMI.Offset;
-        Length = CUMI.Length;
-        State.DebugInfoSize += OutData.size();
-      }
-      CurEntry.Contributions[Index].setOffset(Offset);
-      CurEntry.Contributions[Index].setLength(Length);
-      State.ContributionOffsets[Index] +=
-          CurEntry.Contributions[Index].getLength32();
-    }
-
-    // Strings are combined in to a new string section, and de-duplicated
-    // based on hash.
-    if (!StrSectionWrittenOut && !CurStrOffsetSection.empty() &&
-        !CurStrSection.empty()) {
-      // If debug_str.dwo section was modified storing it until dwp is written
-      // out. DWPStringPool stores raw pointers to strings.
-      if (StrOutputData)
-        State.StrSections.push_back(std::move(StrOutputData));
-      writeStringsAndOffsets(*State.Streamer.get(), *State.Strings.get(),
-                             StrOffsetSection, CurStrSection,
-                             CurStrOffsetSection, CU.getVersion());
-      StrSectionWrittenOut = true;
-    }
-  }
-  CompileUnitIdentifiers CUI{DWOId, CurEntry.Name.c_str(),
-                             CurEntry.DWOName.c_str()};
-  auto P = State.IndexEntries.insert(std::make_pair(CUI.Signature, CurEntry));
-  if (!P.second) {
-    Error Err = buildDuplicateError(*P.first, CUI, "");
-    errs() << "BOLT-ERROR: " << toString(std::move(Err)) << "\n";
-    return;
-  }
-
-  // Handling TU
-  const unsigned Index = getContributionIndex(
-      State.IndexVersion < 5 ? DW_SECT_EXT_TYPES : DW_SECT_INFO,
-      State.IndexVersion);
-  for (UnitMeta &MI : TUMetaVector)
-    AddType(Index, State.IndexVersion, MI.Offset, MI.Length, MI.TUHash);
-}
-
 void DWARFRewriter::writeDWOFiles(
     DWARFUnit &CU, const OverriddenSectionsMap &OverridenSections,
     const std::string &DWOName, DebugLocWriter &LocWriter,
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 51038dbead330..886bbdbf9d686 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/LEB128.h"
+#include <memory>
 
 #undef DEBUG_TYPE
 #define DEBUG_TYPE "pseudo-probe-rewriter"
@@ -48,6 +49,7 @@ static cl::opt<PrintPseudoProbesOptions> PrintPseudoProbes(
                clEnumValN(PPP_All, "all", "enable all debugging printout")),
     cl::Hidden, cl::cat(BoltCategory));
 
+extern cl::opt<bool> ProfileUsePseudoProbes;
 } // namespace opts
 
 namespace {
@@ -72,23 +74,38 @@ class PseudoProbeRewriter final : public MetadataRewriter {
   void parsePseudoProbe();
 
   /// PseudoProbe decoder
-  MCPseudoProbeDecoder ProbeDecoder;
+  std::shared_ptr<MCPseudoProbeDecoder> ProbeDecoderPtr;
 
 public:
   PseudoProbeRewriter(BinaryContext &BC)
-      : MetadataRewriter("pseudo-probe-rewriter", BC) {}
+      : MetadataRewriter("pseudo-probe-rewriter", BC),
+        ProbeDecoderPtr(std::make_shared<MCPseudoProbeDecoder>()) {
+    BC.setPseudoProbeDecoder(ProbeDecoderPtr);
+  }
 
+  Error preCFGInitializer() override;
   Error postEmitFinalizer() override;
+
+  ~PseudoProbeRewriter() override { ProbeDecoderPtr.reset(); }
 };
 
+Error PseudoProbeRewriter::preCFGInitializer() {
+  if (opts::ProfileUsePseudoProbes)
+    parsePseudoProbe();
+
+  return Error::success();
+}
+
 Error PseudoProbeRewriter::postEmitFinalizer() {
-  parsePseudoProbe();
+  if (!opts::ProfileUsePseudoProbes)
+    parsePseudoProbe();
   updatePseudoProbes();
 
   return Error::success();
 }
 
 void PseudoProbeRewriter::parsePseudoProbe() {
+  MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
   PseudoProbeDescSection = BC.getUniqueSectionByName(".pseudo_probe_desc");
   PseudoProbeSection = BC.getUniqueSectionByName(".pseudo_probe");
 
@@ -138,9 +155,18 @@ void PseudoProbeRewriter::parsePseudoProbe() {
     ProbeDecoder.printGUID2FuncDescMap(outs());
     ProbeDecoder.printProbesForAllAddresses(outs());
   }
+
+  for (const auto &[GUID, FuncDesc] : ProbeDecoder.getGUID2FuncDescMap()) {
+    if (!FuncStartAddrs.contains(GUID))
+      continue;
+    BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]);
+    assert(BF);
+    BF->setGUID(GUID);
+  }
 }
 
 void PseudoProbeRewriter::updatePseudoProbes() {
+  MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
   // check if there is pseudo probe section decoded
   if (ProbeDecoder.getAddress2ProbesMap().empty())
     return;
@@ -241,6 +267,7 @@ void PseudoProbeRewriter::updatePseudoProbes() {
 }
 
 void PseudoProbeRewriter::encodePseudoProbes() {
+  MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
   // Buffer for new pseudo probes section
   SmallString<8> Contents;
   MCDecodedPseudoProbe *LastProbe = nullptr;
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 4ae802dc97ccd..9077869fe4955 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -669,6 +669,7 @@ Error RewriteInstance::run() {
         opts::ProfileFormat == opts::ProfileFormatKind::PF_YAML) {
       selectFunctionsToProcess();
       disassembleFunctions();
+      processMetadataPreCFG();
       buildFunctionsCFG();
     }
     processProfileData();
@@ -3130,18 +3131,24 @@ void RewriteInstance::initializeMetadataManager() {
 }
 
 void RewriteInstance::processSectionMetadata() {
+  NamedRegionTimer T("processmetadata-section", "process section metadata",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   initializeMetadataManager();
 
   MetadataManager.runSectionInitializers();
 }
 
 void RewriteInstance::processMetadataPreCFG() {
+  NamedRegionTimer T("processmetadata-precfg", "process metadata pre-CFG",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runInitializersPreCFG();
 
   processProfileDataPreCFG();
 }
 
 void RewriteInstance::processMetadataPostCFG() {
+  NamedRegionTimer T("processmetadata-postcfg", "process metadata post-CFG",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runInitializersPostCFG();
 }
 
@@ -3193,6 +3200,7 @@ void RewriteInstance::processProfileData() {
   if (opts::AggregateOnly) {
     PrintProgramStats PPS(&*BAT);
     BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*BC));
+    TimerGroup::printAll(outs());
     exit(0);
   }
 }
@@ -3535,10 +3543,14 @@ void RewriteInstance::emitAndLink() {
 }
 
 void RewriteInstance::finalizeMetadataPreEmit() {
+  NamedRegionTimer T("finalizemetadata-preemit", "finalize metadata pre-emit",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runFinalizersPreEmit();
 }
 
 void RewriteInstance::updateMetadata() {
+  NamedRegionTimer T("updatemetadata-postemit", "update metadata post-emit",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runFinalizersAfterEmit();
 
   if (opts::UpdateDebugSections) {
diff --git a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
index d114d70f2d376..026f8d35c55c6 100644
--- a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
@@ -32,10 +32,10 @@ cl::opt<bool>
                     "(which is what --hot-text relies on)."),
            cl::cat(BoltOptCategory));
 
-static cl::opt<std::string> RuntimeHugifyLib(
-    "runtime-hugify-lib",
-    cl::desc("specify file name of the runtime hugify library"),
-    cl::init("libbolt_rt_hugify.a"), cl::cat(BoltOptCategory));
+static cl::opt<std::string>
+    RuntimeHugifyLib("runtime-hugify-lib",
+                     cl::desc("specify path of the runtime hugify library"),
+                     cl::init("libbolt_rt_hugify.a"), cl::cat(BoltOptCategory));
 
 } // namespace opts
 
diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
index cd1b975be7b90..53a0c811b41d5 100644
--- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
@@ -26,7 +26,7 @@ namespace opts {
 
 cl::opt<std::string> RuntimeInstrumentationLib(
     "runtime-instrumentation-lib",
-    cl::desc("specify file name of the runtime instrumentation library"),
+    cl::desc("specify path of the runtime instrumentation library"),
     cl::init("libbolt_rt_instr.a"), cl::cat(BoltOptCategory));
 
 extern cl::opt<bool> InstrumentationFileAppendPID;
diff --git a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
index 276b034d71f96..336c6768a7f71 100644
--- a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
@@ -26,8 +26,8 @@ using namespace bolt;
 
 void RuntimeLibrary::anchor() {}
 
-std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
-                                       StringRef LibFileName) {
+std::string RuntimeLibrary::getLibPathByToolPath(StringRef ToolPath,
+                                                 StringRef LibFileName) {
   StringRef Dir = llvm::sys::path::parent_path(ToolPath);
   SmallString<128> LibPath = llvm::sys::path::parent_path(Dir);
   llvm::sys::path::append(LibPath, "lib" LLVM_LIBDIR_SUFFIX);
@@ -38,13 +38,36 @@ std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
     llvm::sys::path::append(LibPath, "lib" LLVM_LIBDIR_SUFFIX);
   }
   llvm::sys::path::append(LibPath, LibFileName);
-  if (!llvm::sys::fs::exists(LibPath)) {
-    errs() << "BOLT-ERROR: library not found: " << LibPath << "\n";
-    exit(1);
-  }
   return std::string(LibPath);
 }
 
+std::string RuntimeLibrary::getLibPathByInstalled(StringRef LibFileName) {
+  SmallString<128> LibPath(CMAKE_INSTALL_FULL_LIBDIR);
+  llvm::sys::path::append(LibPath, LibFileName);
+  return std::string(LibPath);
+}
+
+std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
+                                       StringRef LibFileName) {
+  if (llvm::sys::fs::exists(LibFileName)) {
+    return std::string(LibFileName);
+  }
+
+  std::string ByTool = getLibPathByToolPath(ToolPath, LibFileName);
+  if (llvm::sys::fs::exists(ByTool)) {
+    return ByTool;
+  }
+
+  std::string ByInstalled = getLibPathByInstalled(LibFileName);
+  if (llvm::sys::fs::exists(ByInstalled)) {
+    return ByInstalled;
+  }
+
+  errs() << "BOLT-ERROR: library not found: " << ByTool << ", " << ByInstalled
+         << ", or " << LibFileName << "\n";
+  exit(1);
+}
+
 void RuntimeLibrary::loadLibrary(StringRef LibPath, BOLTLinker &Linker,
                                  BOLTLinker::SectionsMapper MapSections) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf =
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 1a2327df1d323..f58f7857e28ae 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -852,16 +852,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Uses;
   }
 
-  IndirectBranchType analyzeIndirectBranch(
-      MCInst &Instruction, InstructionIterator Begin, InstructionIterator End,
-      const unsigned PtrSize, MCInst *&MemLocInstrOut, unsigned &BaseRegNumOut,
-      unsigned &IndexRegNumOut, int64_t &DispValueOut,
-      const MCExpr *&DispExprOut, MCInst *&PCRelBaseOut) const override {
+  IndirectBranchType
+  analyzeIndirectBranch(MCInst &Instruction, InstructionIterator Begin,
+                        InstructionIterator End, const unsigned PtrSize,
+                        MCInst *&MemLocInstrOut, unsigned &BaseRegNumOut,
+                        unsigned &IndexRegNumOut, int64_t &DispValueOut,
+                        const MCExpr *&DispExprOut, MCInst *&PCRelBaseOut,
+                        MCInst *&FixedEntryLoadInstr) const override {
     MemLocInstrOut = nullptr;
     BaseRegNumOut = AArch64::NoRegister;
     IndexRegNumOut = AArch64::NoRegister;
     DispValueOut = 0;
     DispExprOut = nullptr;
+    FixedEntryLoadInstr = nullptr;
 
     // An instruction referencing memory used by jump instruction (directly or
     // via register). This location could be an array of function pointers
diff --git a/bolt/lib/Target/AArch64/CMakeLists.txt b/bolt/lib/Target/AArch64/CMakeLists.txt
index be03e247aa96b..7e2d33e09b5a0 100644
--- a/bolt/lib/Target/AArch64/CMakeLists.txt
+++ b/bolt/lib/Target/AArch64/CMakeLists.txt
@@ -4,6 +4,18 @@ set(LLVM_LINK_COMPONENTS
   AArch64Desc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64/AArch64.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64)
+  tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
+  tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
+  tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(AArch64CommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetAArch64
   AArch64MCPlusBuilder.cpp
 
diff --git a/bolt/lib/Target/RISCV/CMakeLists.txt b/bolt/lib/Target/RISCV/CMakeLists.txt
index 7f95576063200..5d19d38717de4 100644
--- a/bolt/lib/Target/RISCV/CMakeLists.txt
+++ b/bolt/lib/Target/RISCV/CMakeLists.txt
@@ -4,6 +4,19 @@ set(LLVM_LINK_COMPONENTS
   RISCVDesc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  # tablegen, copied from llvm/lib/Target/RISCV/CMakeLists.txt
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV/RISCV.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV)
+  tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
+  tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables)
+  tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(RISCVCommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetRISCV
   RISCVMCPlusBuilder.cpp
 
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index eb3f38a0b8f4a..f8c83b09395f5 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -176,13 +176,14 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
       MCInst &Instruction, InstructionIterator Begin, InstructionIterator End,
       const unsigned PtrSize, MCInst *&MemLocInstr, unsigned &BaseRegNum,
       unsigned &IndexRegNum, int64_t &DispValue, const MCExpr *&DispExpr,
-      MCInst *&PCRelBaseOut) const override {
+      MCInst *&PCRelBaseOut, MCInst *&FixedEntryLoadInst) const override {
     MemLocInstr = nullptr;
     BaseRegNum = 0;
     IndexRegNum = 0;
     DispValue = 0;
     DispExpr = nullptr;
     PCRelBaseOut = nullptr;
+    FixedEntryLoadInst = nullptr;
 
     // Check for the following long tail call sequence:
     // 1: auipc xi, %pcrel_hi(sym)
diff --git a/bolt/lib/Target/X86/CMakeLists.txt b/bolt/lib/Target/X86/CMakeLists.txt
index 2b769bc7e7f5c..b274716e89a4c 100644
--- a/bolt/lib/Target/X86/CMakeLists.txt
+++ b/bolt/lib/Target/X86/CMakeLists.txt
@@ -5,6 +5,18 @@ set(LLVM_LINK_COMPONENTS
   X86Desc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/X86/X86.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/X86)
+  tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info -instr-info-expand-mi-operand-info=0)
+  tablegen(LLVM X86GenMnemonicTables.inc -gen-x86-mnemonic-tables -asmwriternum=1)
+  tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(X86CommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetX86
   X86MCPlusBuilder.cpp
   X86MCSymbolizer.cpp
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index e46c42533031c..63086c06d74fd 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1866,8 +1866,11 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
+  /// Analyzes PIC-style jump table code template and return identified
+  /// IndirectBranchType, MemLocInstr (all cases) and FixedEntryLoadInstr
+  /// (POSSIBLE_PIC_FIXED_BRANCH case).
   template <typename Itr>
-  std::pair<IndirectBranchType, MCInst *>
+  std::tuple<IndirectBranchType, MCInst *, MCInst *>
   analyzePICJumpTable(Itr II, Itr IE, MCPhysReg R1, MCPhysReg R2) const {
     // Analyze PIC-style jump table code template:
     //
@@ -1876,6 +1879,13 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     //    add %r2, %r1
     //    jmp *%r1
     //
+    // or a fixed indirect jump template:
+    //
+    //    movslq En(%rip), {%r2|%r1}              <- FixedEntryLoadInstr
+    //    lea PIC_JUMP_TABLE(%rip), {%r1|%r2}     <- MemLocInstr
+    //    add %r2, %r1
+    //    jmp *%r1
+    //
     // (with any irrelevant instructions in-between)
     //
     // When we call this helper we've already determined %r1 and %r2, and
@@ -1916,8 +1926,13 @@ class X86MCPlusBuilder : public MCPlusBuilder {
              MO.SegRegNum == X86::NoRegister;
     };
     LLVM_DEBUG(dbgs() << "Checking for PIC jump table\n");
-    MCInst *MemLocInstr = nullptr;
-    const MCInst *MovInstr = nullptr;
+    MCInst *FirstInstr = nullptr;
+    MCInst *SecondInstr = nullptr;
+    enum {
+      NOMATCH = 0,
+      MATCH_JUMP_TABLE,
+      MATCH_FIXED_BRANCH,
+    } MatchingState = NOMATCH;
     while (++II != IE) {
       MCInst &Instr = *II;
       const MCInstrDesc &InstrDesc = Info->get(Instr.getOpcode());
@@ -1926,68 +1941,76 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         // Ignore instructions that don't affect R1, R2 registers.
         continue;
       }
-      if (!MovInstr) {
-        // Expect to see MOV instruction.
-        if (!isMOVSX64rm32(Instr)) {
-          LLVM_DEBUG(dbgs() << "MOV instruction expected.\n");
+      const bool IsMOVSXInstr = isMOVSX64rm32(Instr);
+      const bool IsLEAInstr = isLEA64r(Instr);
+      if (MatchingState == NOMATCH) {
+        if (IsMOVSXInstr)
+          MatchingState = MATCH_JUMP_TABLE;
+        else if (IsLEAInstr)
+          MatchingState = MATCH_FIXED_BRANCH;
+        else
           break;
-        }
 
-        // Check if it's setting %r1 or %r2. In canonical form it sets %r2.
-        // If it sets %r1 - rename the registers so we have to only check
-        // a single form.
-        unsigned MovDestReg = Instr.getOperand(0).getReg();
-        if (MovDestReg != R2)
+        // Check if the first instruction is setting %r1 or %r2. In canonical
+        // form lea sets %r1 and mov sets %r2. If it's the opposite - rename so
+        // we have to only check a single form.
+        unsigned DestReg = Instr.getOperand(0).getReg();
+        MCPhysReg &ExpectReg = MatchingState == MATCH_JUMP_TABLE ? R2 : R1;
+        if (DestReg != ExpectReg)
           std::swap(R1, R2);
-        if (MovDestReg != R2) {
-          LLVM_DEBUG(dbgs() << "MOV instruction expected to set %r2\n");
+        if (DestReg != ExpectReg)
           break;
-        }
 
-        // Verify operands for MOV.
+        // Verify operands
         std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
         if (!MO)
           break;
-        if (!isIndexed(*MO, R1))
-          // POSSIBLE_PIC_JUMP_TABLE
+        if ((MatchingState == MATCH_JUMP_TABLE && isIndexed(*MO, R1)) ||
+            (MatchingState == MATCH_FIXED_BRANCH && isRIPRel(*MO)))
+          FirstInstr = &Instr;
+        else
           break;
-        MovInstr = &Instr;
       } else {
-        if (!InstrDesc.hasDefOfPhysReg(Instr, R1, *RegInfo))
+        unsigned ExpectReg = MatchingState == MATCH_JUMP_TABLE ? R1 : R2;
+        if (!InstrDesc.hasDefOfPhysReg(Instr, ExpectReg, *RegInfo))
           continue;
-        if (!isLEA64r(Instr)) {
-          LLVM_DEBUG(dbgs() << "LEA instruction expected\n");
+        if ((MatchingState == MATCH_JUMP_TABLE && !IsLEAInstr) ||
+            (MatchingState == MATCH_FIXED_BRANCH && !IsMOVSXInstr))
           break;
-        }
-        if (Instr.getOperand(0).getReg() != R1) {
-          LLVM_DEBUG(dbgs() << "LEA instruction expected to set %r1\n");
+        if (Instr.getOperand(0).getReg() != ExpectReg)
           break;
-        }
 
-        // Verify operands for LEA.
+        // Verify operands.
         std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
         if (!MO)
           break;
         if (!isRIPRel(*MO))
           break;
-        MemLocInstr = &Instr;
+        SecondInstr = &Instr;
         break;
       }
     }
 
-    if (!MemLocInstr)
-      return std::make_pair(IndirectBranchType::UNKNOWN, nullptr);
+    if (!SecondInstr)
+      return std::make_tuple(IndirectBranchType::UNKNOWN, nullptr, nullptr);
 
+    if (MatchingState == MATCH_FIXED_BRANCH) {
+      LLVM_DEBUG(dbgs() << "checking potential fixed indirect branch\n");
+      return std::make_tuple(IndirectBranchType::POSSIBLE_PIC_FIXED_BRANCH,
+                             FirstInstr, SecondInstr);
+    }
     LLVM_DEBUG(dbgs() << "checking potential PIC jump table\n");
-    return std::make_pair(IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE,
-                          MemLocInstr);
+    return std::make_tuple(IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE,
+                           SecondInstr, nullptr);
   }
 
-  IndirectBranchType analyzeIndirectBranch(
-      MCInst &Instruction, InstructionIterator Begin, InstructionIterator End,
-      const unsigned PtrSize, MCInst *&MemLocInstrOut, unsigned &BaseRegNumOut,
-      unsigned &IndexRegNumOut, int64_t &DispValueOut,
-      const MCExpr *&DispExprOut, MCInst *&PCRelBaseOut) const override {
+  IndirectBranchType
+  analyzeIndirectBranch(MCInst &Instruction, InstructionIterator Begin,
+                        InstructionIterator End, const unsigned PtrSize,
+                        MCInst *&MemLocInstrOut, unsigned &BaseRegNumOut,
+                        unsigned &IndexRegNumOut, int64_t &DispValueOut,
+                        const MCExpr *&DispExprOut, MCInst *&PCRelBaseOut,
+                        MCInst *&FixedEntryLoadInst) const override {
     // Try to find a (base) memory location from where the address for
     // the indirect branch is loaded. For X86-64 the memory will be specified
     // in the following format:
@@ -2014,6 +2037,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     IndexRegNumOut = X86::NoRegister;
     DispValueOut = 0;
     DispExprOut = nullptr;
+    FixedEntryLoadInst = nullptr;
 
     std::reverse_iterator<InstructionIterator> II(End);
     std::reverse_iterator<InstructionIterator> IE(Begin);
@@ -2046,7 +2070,8 @@ class X86MCPlusBuilder : public MCPlusBuilder {
           unsigned R2 = PrevInstr.getOperand(2).getReg();
           if (R1 == R2)
             return IndirectBranchType::UNKNOWN;
-          std::tie(Type, MemLocInstr) = analyzePICJumpTable(PrevII, IE, R1, R2);
+          std::tie(Type, MemLocInstr, FixedEntryLoadInst) =
+              analyzePICJumpTable(PrevII, IE, R1, R2);
           break;
         }
         return IndirectBranchType::UNKNOWN;
@@ -2090,6 +2115,8 @@ class X86MCPlusBuilder : public MCPlusBuilder {
       if (MO->ScaleImm != 1 || MO->BaseRegNum != RIPRegister)
         return IndirectBranchType::UNKNOWN;
       break;
+    case IndirectBranchType::POSSIBLE_PIC_FIXED_BRANCH:
+      break;
     default:
       if (MO->ScaleImm != PtrSize)
         return IndirectBranchType::UNKNOWN;
diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index d1403314274bd..c452c1fac3772 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -1,15 +1,39 @@
+find_first_existing_vc_file("${LLVM_MAIN_SRC_DIR}" llvm_vc)
+find_first_existing_vc_file("${BOLT_SOURCE_DIR}" bolt_vc)
+
+# The VC revision include that we want to generate.
+set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/VCSVersion.inc")
+
+set(generate_vcs_version_script "${LLVM_CMAKE_DIR}/GenerateVersionFromVCS.cmake")
+
+# Create custom target to generate the VC revision include.
+add_custom_command(OUTPUT "${version_inc}"
+  DEPENDS "${llvm_vc}" "${bolt_vc}" "${generate_vcs_version_script}"
+  COMMAND ${CMAKE_COMMAND} "-DNAMES=BOLT"
+                           "-DHEADER_FILE=${version_inc}"
+                           "-DBOLT_SOURCE_DIR=${BOLT_SOURCE_DIR}"
+                           "-DLLVM_VC_REPOSITORY=${llvm_vc_repository}"
+                           "-DLLVM_VC_REVISION=${llvm_vc_revision}"
+                           "-DLLVM_FORCE_VC_REVISION=${LLVM_FORCE_VC_REVISION}"
+                           "-DLLVM_FORCE_VC_REPOSITORY=${LLVM_FORCE_VC_REPOSITORY}"
+                           -P "${generate_vcs_version_script}")
+
+# Mark the generated header as being generated.
+set_source_files_properties("${version_inc}"
+  PROPERTIES GENERATED TRUE
+             HEADER_FILE_ONLY TRUE)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
 add_llvm_library(LLVMBOLTUtils
   CommandLineOpts.cpp
   Utils.cpp
-
+  ${version_inc}
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
   ${LLVM_PTHREAD_LIB}
 
-  DEPENDS
-  llvm_vcsrevision_h
-
   LINK_COMPONENTS
   Support
   )
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index 47375abb2ad3b..435a8fa9cafca 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -11,15 +11,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Utils/CommandLineOpts.h"
-#include "llvm/Support/VCSRevision.h"
+#include "VCSVersion.inc"
 
 using namespace llvm;
 
 namespace llvm {
 namespace bolt {
 const char *BoltRevision =
-#ifdef LLVM_REVISION
-    LLVM_REVISION;
+#ifdef BOLT_REVISION
+    BOLT_REVISION;
 #else
     "<unknown>";
 #endif
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 6a65f80fb9079..40f4fbc9f30d5 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -16,12 +16,19 @@ add_library(bolt_rt_instr STATIC
   instr.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
 add_library(bolt_rt_hugify STATIC
   hugify.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+
+if(NOT BOLT_BUILT_STANDALONE)
+  add_custom_command(TARGET bolt_rt_instr POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}")
+  add_custom_command(TARGET bolt_rt_hugify POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}")
+endif()
 
 set(BOLT_RT_FLAGS
   -ffreestanding
@@ -46,18 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS})
 target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
-install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
-install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
 
 if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
   add_library(bolt_rt_instr_osx STATIC
     instr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-  set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+  set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
   target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_compile_options(bolt_rt_instr_osx PRIVATE
     -target x86_64-apple-darwin19.6.0
     ${BOLT_RT_FLAGS})
-  install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+  install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+
+  if(NOT BOLT_BUILT_STANDALONE)
+    add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}")
+  endif()
 endif()
diff --git a/bolt/test/AArch64/dummy-return.s b/bolt/test/AArch64/dummy-return.s
index a446343161730..91f89dcb84762 100644
--- a/bolt/test/AArch64/dummy-return.s
+++ b/bolt/test/AArch64/dummy-return.s
@@ -1,4 +1,6 @@
-# REQUIRES: system-linux,target=aarch64{{.*}}
+# This test checks instrumentation of static binary on AArch64.
+
+# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
diff --git a/bolt/test/AArch64/update-debug-reloc.test b/bolt/test/AArch64/update-debug-reloc.test
index d57f42a3852a5..dd83229ea7143 100644
--- a/bolt/test/AArch64/update-debug-reloc.test
+++ b/bolt/test/AArch64/update-debug-reloc.test
@@ -2,7 +2,7 @@
 # update-debug-sections option.
 
 RUN: %clang %cflags -g %p/../Inputs/asm_foo.s %p/../Inputs/asm_main.c -o %t.exe
-RUN: llvm-bolt %t.exe -o %t --update-debug-sections
+RUN: llvm-bolt %t.exe -o %t --update-debug-sections 2>&1 | FileCheck %s
 
 CHECK: BOLT-INFO: Target architecture: aarch64
 CHECK-NOT: Reloc num: 10
diff --git a/bolt/test/AArch64/veneer-gold.s b/bolt/test/AArch64/veneer-gold.s
index 3b3e34ecb1a9f..275febce2b372 100644
--- a/bolt/test/AArch64/veneer-gold.s
+++ b/bolt/test/AArch64/veneer-gold.s
@@ -29,7 +29,7 @@ dummy:
 .type foo, %function
 foo:
 # CHECK: <foo>:
-# CHECK-NEXT : {{.*}} bl {{.*}} <foo2>
+# CHECK-NEXT: {{.*}} bl {{.*}} <foo2>
   bl .L2
   ret
 .size foo, .-foo
@@ -38,7 +38,7 @@ foo:
 .type foo2, %function
 foo2:
 # CHECK: <foo2>:
-# CHECK-NEXT : {{.*}} bl {{.*}} <foo2>
+# CHECK-NEXT: {{.*}} bl {{.*}} <foo2>
   bl .L2
   ret
 .size foo2, .-foo2
diff --git a/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s b/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s
index 66629a4880e64..6407964593e2d 100644
--- a/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s
+++ b/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s
@@ -6,7 +6,7 @@ main:
   jae .L4
   cmpq $0x1, %rdi
   jne .L4
-  mov .Ljt_pic+8(%rip), %rax
+  movslq .Ljt_pic+8(%rip), %rax
   lea .Ljt_pic(%rip), %rdx
   add %rdx, %rax
   jmpq *%rax
diff --git a/bolt/test/X86/debug-fission-single-convert.s b/bolt/test/X86/debug-fission-single-convert.s
index 5ea6eb8e353af..02c9290211fc0 100644
--- a/bolt/test/X86/debug-fission-single-convert.s
+++ b/bolt/test/X86/debug-fission-single-convert.s
@@ -41,19 +41,6 @@
 # CHECK-ADDR-SEC: 0x00000000: Addrs: [
 # CHECK-ADDR-SEC: 0x0000000000601000
 
-# RUN: llvm-bolt %t.exe --reorder-blocks=reverse --update-debug-sections --dwarf-output-path=%T -o %t.bolt.2.exe --write-dwp=true \
-# RUN: --always-convert-to-ranges=true
-# RUN: not llvm-dwarfdump --show-form --verbose --debug-info %t.bolt.2.exe.dwp &> %tAddrIndexTestDwp
-# RUN: cat %tAddrIndexTestDwp | FileCheck %s --check-prefix=CHECK-DWP-DEBUG
-
-# CHECK-DWP-DEBUG: DW_TAG_compile_unit [1] *
-# CHECK-DWP-DEBUG:  DW_AT_producer [DW_FORM_GNU_str_index]  (indexed (0000000a) string = "clang version 13.0.0")
-# CHECK-DWP-DEBUG:  DW_AT_language [DW_FORM_data2]  (DW_LANG_C_plus_plus)
-# CHECK-DWP-DEBUG:  DW_AT_name [DW_FORM_GNU_str_index]  (indexed (0000000b) string = "foo")
-# CHECK-DWP-DEBUG:  DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index]  (indexed (0000000c) string = "foo")
-# CHECK-DWP-DEBUG:  DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x06105e732fad3796)
-
-
 //clang++ -ffunction-sections -fno-exceptions -g -gsplit-dwarf=split -S debug-fission-simple.cpp -o debug-fission-simple.s
 static int foo = 2;
 int doStuff(int val) {
diff --git a/bolt/test/X86/debug-fission-single.s b/bolt/test/X86/debug-fission-single.s
index 4350bd9ec1815..1aa502fc9a840 100644
--- a/bolt/test/X86/debug-fission-single.s
+++ b/bolt/test/X86/debug-fission-single.s
@@ -42,18 +42,6 @@
 # CHECK-ADDR-SEC: 0x00000000: Addrs: [
 # CHECK-ADDR-SEC: 0x0000000000601000
 
-# RUN: llvm-bolt %t.exe --reorder-blocks=reverse --update-debug-sections --dwarf-output-path=%T -o %t.bolt.2.exe --write-dwp=true
-# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt.2.exe.dwp &> %tAddrIndexTestDwp
-# RUN: cat %tAddrIndexTestDwp | FileCheck %s --check-prefix=CHECK-DWP-DEBUG
-
-# CHECK-DWP-DEBUG: DW_TAG_compile_unit [1] *
-# CHECK-DWP-DEBUG:  DW_AT_producer [DW_FORM_GNU_str_index]  (indexed (0000000a) string = "clang version 13.0.0")
-# CHECK-DWP-DEBUG:  DW_AT_language [DW_FORM_data2]  (DW_LANG_C_plus_plus)
-# CHECK-DWP-DEBUG:  DW_AT_name [DW_FORM_GNU_str_index]  (indexed (0000000b) string = "foo")
-# CHECK-DWP-DEBUG:  DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index]  (indexed (0000000c) string = "foo")
-# CHECK-DWP-DEBUG:  DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x06105e732fad3796)
-
-
 //clang++ -ffunction-sections -fno-exceptions -g -gsplit-dwarf=split -S debug-fission-simple.cpp -o debug-fission-simple.s
 static int foo = 2;
 int doStuff(int val) {
diff --git a/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test
deleted file mode 100644
index d08b596ec8dd1..0000000000000
--- a/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test
+++ /dev/null
@@ -1,30 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \
-; RUN: --filetype=obj %p/Inputs/dwarf4-ftypes-split-dwarf.s -o=main.o
-; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Make sure the output .dwp file has a type information.
-
-; PRE-BOLT: DW_TAG_type_unit
-; PRE-BOLT: DW_TAG_type_unit
-
-; PRE-BOLT-DWP-TU-INDEX: version = 2, units = 2, slots = 4
-; PRE-BOLT-DWP-TU-INDEX: Index Signature
-; PRE-BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; PRE-BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
-
-; BOLT: DW_TAG_type_unit
-; BOLT: DW_TAG_type_unit
-
-; BOLT-DWP-TU-INDEX: version = 2, units = 2, slots = 4
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
diff --git a/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test
deleted file mode 100644
index 54382142afc8f..0000000000000
--- a/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test
+++ /dev/null
@@ -1,45 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo   -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-split-gdb-index-types-main.s -o main.o
-; RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-split-gdb-index-types-helper.s -o helper1.o
-; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-helper2.s -o helper2.o
-; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper1.o helper2.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types helper2.o | FileCheck -check-prefix=PRE-BOLT2 %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Test split-dwarf and monolithic TUs.
-;; Make sure the output .dwp file has a type information.
-
-; PRE-BOLT: 0x675d23e4f33235f2
-; PRE-BOLT: DW_TAG_type_unit
-; PRE-BOLT: 0x49dc260088be7e56
-; PRE-BOLT: DW_TAG_type_unit
-
-; PRE-BOLT2: 0x8f55ac73549bc003
-; PRE-BOLT2: DW_TAG_type_unit
-; PRE-BOLT2: 0xe7734af8fed0632e
-; PRE-BOLT2: DW_TAG_type_unit
-
-; BOLT: 0x675d23e4f33235f2
-; BOLT: DW_TAG_type_unit
-; BOLT: 0x49dc260088be7e56
-; BOLT: DW_TAG_type_unit
-; BOLT: 0x104ec427d2ebea6f
-; BOLT: DW_TAG_type_unit
-; BOLT: 0xb4580bc1535df1e4
-; BOLT: DW_TAG_type_unit
-; BOLT-NOT: 0x8f55ac73549bc003
-; BOLT-NOT: 0xe7734af8fed0632e
-
-; BOLT-DWP-TU-INDEX: version = 2, units = 4, slots = 8
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; BOLT-DWP-TU-INDEX-NEXT: 0xb4580bc1535df1e4
-; BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
-; BOLT-DWP-TU-INDEX-NEXT: 0x104ec427d2ebea6f
diff --git a/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test b/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
index c9b12574caa3a..6caf5870fca02 100644
--- a/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
+++ b/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
@@ -17,10 +17,10 @@
 # POSTCHECK-NEXT:         0: Offset = 0x0, Length = 0x34
 # POSTCHECK-NEXT:         1: Offset = 0x34, Length = 0x34
 # POSTCHECK:          Types CU list offset = 0x38, has 4 entries
-# POSTCHECK-NEXT:       0: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x675d23e4f33235f2
-# POSTCHECK-NEXT:       1: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0x49dc260088be7e56
-# POSTCHECK-NEXT:       2: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x104ec427d2ebea6f
-# POSTCHECK-NEXT:       3: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0xb4580bc1535df1e4
+# POSTCHECK-NEXT:       0: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0xb4580bc1535df1e4
+# POSTCHECK-NEXT:       1: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x675d23e4f33235f2
+# POSTCHECK-NEXT:       2: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0x49dc260088be7e56
+# POSTCHECK-NEXT:       3: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x104ec427d2ebea6f
 # POSTCHECK:          Address area offset = 0x98, has 2 entries
 # POSTCHECK-NEXT:         Low/High address = [0x[[#%.4x,ADDR:]],
 # POSTCHECK-SAME:           0x[[#ADDR + 0x7a]]) (Size: 0x7a), CU id = 0
diff --git a/bolt/test/X86/dwarf5-df-larger-batch-size.test b/bolt/test/X86/dwarf5-df-larger-batch-size.test
new file mode 100644
index 0000000000000..c2c5f63e07ad8
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-larger-batch-size.test
@@ -0,0 +1,28 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-other.s \
+; RUN: -split-dwarf-file=mainOther.dwo -o other.o
+; RUN: %clang %cflags main.o other.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --cu-processing-batch-size=1
+; RUN: llvm-bolt main.exe -o main-batch.exe.bolt --update-debug-sections --cu-processing-batch-size=2
+; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt
+; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo-batch.txt
+; RUN: cat %t/foo-batch.txt | FileCheck -check-prefix=BOLT-BATCH %s
+
+;; Tests that BOLT correctly handles DWO name strings with larger batch sizes.
+
+; BOLT: DW_TAG_skeleton_unit
+; BOLT: DW_AT_dwo_name [DW_FORM_strx1]  (indexed (00000001) string = "main.dwo.dwo")
+
+; BOLT: DW_TAG_skeleton_unit
+; BOLT: DW_AT_dwo_name [DW_FORM_strx1]  (indexed (00000001) string = "mainOther.dwo.dwo")
+
+; BOLT-BATCH: DW_TAG_skeleton_unit
+; BOLT-BATCH: DW_AT_dwo_name [DW_FORM_strx1]  (indexed (00000001) string = "main.dwo.dwo")
+
+; BOLT-BATCH: DW_TAG_skeleton_unit
+; BOLT-BATCH: DW_AT_dwo_name [DW_FORM_strx1]  (indexed (00000001) string = "mainOther.dwo.dwo")
diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
index a4f5ee77ab565..c8cfd82753d77 100644
--- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
+++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
@@ -72,59 +72,6 @@
 ; BOLT-NEXT: "helper.cpp"
 ; BOLT-NEXT: "helper.dwo"
 
-
-;; Tests that BOLT correctly handles updating DW_AT_dwo_name when it outputs a DWP file.
-;; Currently skipping one of Type units because it is not being de-dupped.
-;; In the tu-index this TU is not present.
-; RUN: rm main.exe.bolt
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt.dwp > logDWP.txt
-; RUN: llvm-dwarfdump --debug-str-offsets main.exe.bolt.dwp >> logDWP.txt
-; RUN: cat logDWP.txt | FileCheck -check-prefix=BOLT-DWP %s
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir  (".")
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir  (".")
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DW-NOT: DW_AT_dwo_name
-; BOLT-DWP:       Contribution size = 68, Format = DWARF32, Version = 5
-; BOLT-DWP-NEXT: "main"
-; BOLT-DWP-NEXT: "int"
-; BOLT-DWP-NEXT: "argc"
-; BOLT-DWP-NEXT: "argv"
-; BOLT-DWP-NEXT: "char"
-; BOLT-DWP-NEXT: "f2"
-; BOLT-DWP-NEXT: "."
-; BOLT-DWP-NEXT: "main.dwo.dwo"
-; BOLT-DWP-NEXT: "c1"
-; BOLT-DWP-NEXT: "Foo2"
-; BOLT-DWP-NEXT: "f3"
-; BOLT-DWP-NEXT: "c2"
-; BOLT-DWP-NEXT: "c3"
-; BOLT-DWP-NEXT: "Foo2a"
-; BOLT-DWP-NEXT: "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
-; BOLT-DWP-NEXT: "main.cpp"
-; BOLT-DWP-NEXT: Contribution size = 64, Format = DWARF32, Version = 5
-; BOLT-DWP-NEXT: "fooint"
-; BOLT-DWP-NEXT: "int"
-; BOLT-DWP-NEXT: "_Z3foov"
-; BOLT-DWP-NEXT: "foo"
-; BOLT-DWP-NEXT: "fint"
-; BOLT-DWP-NEXT: "c1"
-; BOLT-DWP-NEXT: "c2"
-; BOLT-DWP-NEXT: "Foo2Int"
-; BOLT-DWP-NEXT: "f"
-; BOLT-DWP-NEXT: "char"
-; BOLT-DWP-NEXT: "c3"
-; BOLT-DWP-NEXT: "Foo2a"
-; BOLT-DWP-NEXT: "clang version 18.0.0"
-; BOLT-DWP-NEXT: "helper.cpp"
-; BOLT-DWP-NEXT: "helper.dwo
-
 ;; Tests that BOLT correctly handles updating DW_AT_comp_dir/DW_AT_dwo_name when outptut directory is specified.
 
 ; RUN: mkdir DWOOut
diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
index 086f8f8139628..12a7f648c2325 100644
--- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
+++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
@@ -73,31 +73,6 @@
 ; BOLT-NEXT: "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
 ; BOLT-NEXT: "helper.cpp"
 
-
-;; Tests that BOLT correctly handles updating DW_AT_dwo_name when it outputs a DWP file.
-;; Currently skipping one of Type units because it is not being de-dupped.
-;; In the tu-index this TU is not present.
-; RUN: rm main.exe.bolt
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt.dwp > logDWP.txt
-; RUN: llvm-dwarfdump --debug-str-offsets main.exe.bolt.dwp >> logDWP.txt
-; RUN: cat logDWP.txt | FileCheck -check-prefix=BOLT-DWP %s
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir  (".")
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir  (".")
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_dwo_name  ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir  (".")
-; BOLT-DWP: DW_AT_dwo_name  ("helper.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_name  ("helper.cpp")
-; BOLT-DWP: DW_AT_dwo_name  ("helper.dwo.dwo")
-
 ;; Tests that BOLT correctly handles updating DW_AT_comp_dir/DW_AT_dwo_name when outptut directory is specified.
 
 ; RUN: mkdir DWOOut
diff --git a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
index 070648c042c1d..b48d6a5dc20d4 100644
--- a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
+++ b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
@@ -5,10 +5,11 @@
 # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe
 # RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s
+# RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=POSTCHECKADDR %s
 # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s
 
 ## This test checks that BOLT handles correctly backward and forward cross CU references
-## for DWARF5 and DWARF4 with -fdebug-types-section
+## for DWARF5 and DWARF4 with -fdebug-types-section and checks the address table is correct.
 
 # POSTCHECK: version = 0x0005
 # POSTCHECK: DW_TAG_type_unit
@@ -29,6 +30,15 @@
 # POSTCHECK: DW_TAG_variable [20]
 # POSTCHECK: DW_AT_type [DW_FORM_ref_addr] (0x{{[0-9a-f]+}} "Foo3a")
 
+# POSTCHECKADDR: Addrs: [
+# POSTCHECKADDR-NEXT: 0x0000000000001360
+# POSTCHECKADDR-NEXT: 0x0000000000000000
+# POSTCHECKADDR-NEXT: ]
+# POSTCHECKADDR: Addrs: [
+# POSTCHECKADDR-NEXT: 0x00000000000013e0
+# POSTCHECKADDR-NEXT: 0x0000000000000000
+# POSTCHECKADDR-NEXT: ]
+
 # POSTCHECKTU: version = 0x0004
 # POSTCHECKTU: DW_TAG_type_unit
 # POSTCHECKTU: DW_TAG_structure_type
diff --git a/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test b/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test
deleted file mode 100644
index b6e9f60bbfc70..0000000000000
--- a/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test
+++ /dev/null
@@ -1,55 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo   -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-main.s -o main.o
-; RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-helper.s -o helper1.o
-; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-helper2.s -o helper2.o
-; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper1.o helper2.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper2.o | FileCheck -check-prefix=PRE-BOLT2 %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info -r 0 main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-cu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-CU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Test split-dwarf and monolithic TUs.
-;; Make sure the output .dwp file has a type and cu information.
-
-; PRE-BOLT: Type Unit
-; PRE-BOLT-SAME: 0x675d23e4f33235f2
-; PRE-BOLT: Type Unit
-; PRE-BOLT-SAME: 0x49dc260088be7e56
-
-; PRE-BOLT2: 0x8f55ac73549bc003
-; PRE-BOLT2: DW_TAG_type_unit
-; PRE-BOLT2: 0xe7734af8fed0632e
-; PRE-BOLT2: DW_TAG_type_unit
-
-; BOLT: 0x00000000: Type Unit: length = 0x00000047
-; BOLT-SAME: 0x675d23e4f33235f2
-; BOLT: 0x0000004b: Type Unit: length = 0x0000003e
-; BOLT-SAME: 0x49dc260088be7e56
-; BOLT: 0x0000008d: Compile Unit: length = 0x00000077
-; BOLT-SAME: 0x4257354d8bb35644
-; BOLT: 0x00000108: Type Unit: length = 0x00000047
-; BOLT-SAME: 0x104ec427d2ebea6f
-; BOLT: 0x00000153: Type Unit: length = 0x0000003e
-; BOLT-SAME: 0xb4580bc1535df1e4
-; BOLT: 0x00000195: Compile Unit: length = 0x00000054
-; BOLT-SAME: 0x7738bfb5f3edfb73
-; BOLT-NOT: 0x8f55ac73549bc003
-; BOLT-NOT: 0xe7734af8fed0632e
-
-; BOLT-DWP-TU-INDEX: version = 5, units = 4, slots = 8
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 3 0x675d23e4f33235f2 [0x0000000000000000, 0x000000000000004b) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-TU-INDEX: 5 0xb4580bc1535df1e4 [0x0000000000000153, 0x0000000000000195) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
-; BOLT-DWP-TU-INDEX: 7 0x49dc260088be7e56 [0x000000000000004b, 0x000000000000008d) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-TU-INDEX: 8 0x104ec427d2ebea6f [0x0000000000000108, 0x0000000000000153) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
-
-; BOLT-DWP-CU-INDEX: version = 5, units = 2, slots = 4
-; BOLT-DWP-CU-INDEX: Index Signature
-; BOLT-DWP-CU-INDEX: 1 0x4257354d8bb35644 [0x000000000000008d, 0x0000000000000108) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-CU-INDEX: 4 0x7738bfb5f3edfb73 [0x0000000000000195, 0x00000000000001ed) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
diff --git a/bolt/test/X86/dwarf5-locexpr-referrence.test b/bolt/test/X86/dwarf5-locexpr-referrence.test
index ea73d7601b253..cc7bb27ce602e 100644
--- a/bolt/test/X86/dwarf5-locexpr-referrence.test
+++ b/bolt/test/X86/dwarf5-locexpr-referrence.test
@@ -5,8 +5,10 @@
 # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=CHECK %s
+# RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=CHECKADDR %s
 
-## This test checks that we update relative DIE references with DW_OP_convert that are in locexpr.
+## This test checks that we update relative DIE references with DW_OP_convert that are in locexpr
+## and checks the address table is correct.
 
 # CHECK: version = 0x0005
 # CHECK: DW_TAG_variable
@@ -19,3 +21,18 @@
 # CHECK-SAME: DW_OP_convert (0x00000028 -> 0x00000092)
 # CHECK-SAME: DW_OP_convert (0x0000002c -> 0x00000096)
 # CHECK: version = 0x0005
+
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001330
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: 0x0000000000001333
+# CHECKADDR-NEXT: ]
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001340
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: 0x0000000000001343
+# CHECKADDR-NEXT: ]
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001320
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: ]
diff --git a/bolt/test/X86/dwarf5-one-loclists-two-bases.test b/bolt/test/X86/dwarf5-one-loclists-two-bases.test
index 873512aad5e8d..f25f6c7a46858 100644
--- a/bolt/test/X86/dwarf5-one-loclists-two-bases.test
+++ b/bolt/test/X86/dwarf5-one-loclists-two-bases.test
@@ -34,7 +34,7 @@
 # POSTCHECK: version = 0x0005
 # POSTCHECK: DW_AT_loclists_base [DW_FORM_sec_offset]	(0x0000000c)
 # POSTCHECK: DW_AT_rnglists_base [DW_FORM_sec_offset]	(0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
 # POSTCHECK: DW_TAG_variable
 # POSTCHECK: DW_AT_location [DW_FORM_loclistx]
 # POSTCHECK-SAME: indexed (0x0)
diff --git a/bolt/test/X86/dwarf5-two-loclists.test b/bolt/test/X86/dwarf5-two-loclists.test
index 2ede02f3b76fb..a7c6351f9813c 100644
--- a/bolt/test/X86/dwarf5-two-loclists.test
+++ b/bolt/test/X86/dwarf5-two-loclists.test
@@ -45,7 +45,7 @@
 # POSTCHECK: version = 0x0005
 # POSTCHECK: DW_AT_loclists_base [DW_FORM_sec_offset]	(0x0000000c)
 # POSTCHECK: DW_AT_rnglists_base [DW_FORM_sec_offset]	(0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
 # POSTCHECK: DW_TAG_variable
 # POSTCHECK: DW_AT_location [DW_FORM_loclistx]
 # POSTCHECK-SAME: indexed (0x0)
diff --git a/bolt/test/X86/dwarf5-two-rnglists.test b/bolt/test/X86/dwarf5-two-rnglists.test
index 17cdc7643bae5..98f2e347d7673 100644
--- a/bolt/test/X86/dwarf5-two-rnglists.test
+++ b/bolt/test/X86/dwarf5-two-rnglists.test
@@ -52,7 +52,7 @@
 # POSTCHECK-NEXT: DW_AT_addr_base [DW_FORM_sec_offset]  (0x00000008)
 # POSTCHECK-NEXT: DW_AT_loclists_base [DW_FORM_sec_offset]	(0x0000000c)
 # POSTCHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset]	(0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
 # POSTCHECK: DW_TAG_subprogram
 # POSTCHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx]
 # POSTCHECK-SAME: indexed (0x1)
@@ -75,7 +75,7 @@
 # POSTCHECK-NEXT: DW_AT_addr_base [DW_FORM_sec_offset]  (0x00000030)
 # POSTCHECK-NEXT: DW_AT_loclists_base [DW_FORM_sec_offset]	(0x00000045)
 # POSTCHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset]	(0x00000035)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
 
 # POSTCHECK: DW_TAG_subprogram
 # POSTCHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx]
diff --git a/bolt/test/X86/jump-table-fixed-ref-pic.test b/bolt/test/X86/jump-table-fixed-ref-pic.test
index c8b6eda2278b9..d215c565b31e5 100644
--- a/bolt/test/X86/jump-table-fixed-ref-pic.test
+++ b/bolt/test/X86/jump-table-fixed-ref-pic.test
@@ -1,9 +1,13 @@
 ## Verify that BOLT detects fixed destination of indirect jump for PIC
 ## case.
 
-XFAIL: *
-
 RUN: %clang %cflags -no-pie %S/Inputs/jump-table-fixed-ref-pic.s -Wl,-q -o %t
-RUN: llvm-bolt %t --relocs -o %t.null 2>&1 | FileCheck %s
+RUN: llvm-bolt %t --relocs -o %t.null -print-cfg 2>&1 | FileCheck %s
+
+CHECK: BOLT-INFO: fixed PIC indirect branch detected in main {{.*}} the destination value is 0x[[#TGT:]]
+CHECK: Binary Function "main" after building cfg
 
-CHECK: BOLT-INFO: fixed indirect branch detected in main
+CHECK:      movslq ".rodata/1"+8(%rip), %rax
+CHECK-NEXT: leaq ".rodata/1"(%rip), %rdx
+CHECK-NEXT: addq %rdx, %rax
+CHECK-NEXT: jmpq *%rax # UNKNOWN CONTROL FLOW
diff --git a/bolt/test/X86/match-functions-with-call-graph.test b/bolt/test/X86/match-functions-with-call-graph.test
new file mode 100644
index 0000000000000..e826c57f35312
--- /dev/null
+++ b/bolt/test/X86/match-functions-with-call-graph.test
@@ -0,0 +1,103 @@
+## Tests blocks matching by called function names in inferStaleProfile.
+
+# REQUIRES: system-linux
+# RUN: split-file %s %t
+# RUN: %clang %cflags %t/main.cpp -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml --profile-ignore-hash -v=1 \
+# RUN:   --dyno-stats --print-cfg --infer-stale-profile=1 --match-with-call-graph 2>&1 | FileCheck %s
+
+# CHECK: BOLT-INFO: matched 1 functions with call graph
+
+#--- main.cpp
+void foo() {}
+
+void bar() {}
+
+void qux() {
+    foo();
+    bar();
+}
+
+void fred() {
+    foo();
+    qux();
+    bar();
+    bar();
+    foo();
+}
+
+int main() {
+    return 0;
+}
+
+#--- yaml
+---
+header:
+  profile-version: 1
+  binary-name:     'match-functions-with-calls-as-anchors.s.tmp.exe'
+  binary-build-id: '<unknown>'
+  profile-flags:   [ lbr ]
+  profile-origin:  branch profile reader
+  profile-events:  ''
+  dfs-order:       false
+  hash-func:       xxh3
+functions:
+  - name:            main
+    fid:             0
+    hash:            0x0000000000000001
+    exec:            1
+    nblocks:         6
+    blocks:
+      - bid:             1
+        hash:            0x0000000000000001
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+  - name:            _Z3foov
+    fid:             1
+    hash:            0x0000000000000002
+    exec:            1
+    nblocks:         6
+    blocks:
+      - bid:             1
+        hash:            0x0000000000000002
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+
+  - name:            _Z3barv
+    fid:             2
+    hash:            0x0000000000000003
+    exec:            1
+    nblocks:         6
+    blocks:
+      - bid:             1
+        hash:            0x0000000000000003
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+  - name:            _Z3quxv
+    fid:             3
+    hash:            0x0000000000000004
+    exec:            4
+    nblocks:         6
+    blocks:
+      - bid:             1
+        hash:            0x0000000000000004
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+        calls:           [ { off : 0, fid : 1, cnt : 0},
+                           { off : 0, fid : 2, cnt : 0} ]
+  - name:            _Z4bazv
+    fid:             4
+    hash:            0x0000000000000005
+    exec:            1
+    nblocks:         6
+    blocks:
+      - bid:             1
+        hash:            0x0000000000000005
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+        calls:           [ { off : 0, fid : 3, cnt : 0},
+                           { off : 0, fid : 1, cnt : 0},
+                           { off : 0, fid : 2, cnt : 0},
+                           { off : 0, fid : 1, cnt : 0},
+                           { off : 0, fid : 2, cnt : 0} ]
+...
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index 15e93b1630a66..b361551e5711e 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -1,5 +1,42 @@
 # REQUIRES: system-linux
-# RUN: llvm-bolt  %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin --print-pseudo-probes=all -o %t.bolt 2>&1 | FileCheck %s
+# RUN: llvm-bolt  %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin --print-pseudo-probes=all -o %t.bolt --lite=0 --enable-bat 2>&1 | FileCheck %s
+
+# PREAGG: B X:0 #foo# 1 0
+# PREAGG: B X:0 #bar# 1 0
+# PREAGG: B X:0 #main# 1 0
+## Check pseudo-probes in regular YAML profile (non-BOLTed binary)
+# RUN: link_fdata %s %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin %t.preagg PREAGG
+# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-use-pseudo-probes
+# RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-YAML
+## Check pseudo-probes in BAT YAML profile (BOLTed binary)
+# RUN: link_fdata %s %t.bolt %t.preagg2 PREAGG
+# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-use-pseudo-probes
+# RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
+# CHECK-YAML: name: bar
+# CHECK-YAML: - bid: 0
+# CHECK-YAML:   pseudo_probes: [ { guid: 0xE413754A191DB537, id: 1, type: 0 }, { guid: 0xE413754A191DB537, id: 4, type: 0 } ]
+# CHECK-YAML: guid: 0xE413754A191DB537
+# CHECK-YAML: pseudo_probe_desc_hash: 0x10E852DA94
+#
+# CHECK-YAML: name: foo
+# CHECK-YAML: - bid: 0
+# CHECK-YAML:   pseudo_probes: [ { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ]
+# CHECK-YAML: guid: 0x5CF8C24CDB18BDAC
+# CHECK-YAML: pseudo_probe_desc_hash: 0x200205A19C5B4
+#
+# CHECK-YAML: name: main
+# CHECK-YAML: - bid: 0
+# CHECK-YAML:   pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ]
+# CHECK-YAML: guid: 0xDB956436E78DD5FA
+# CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF
+#
+## Check that without --profile-use-pseudo-probes option, no pseudo probes are
+## generated
+# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata
+# RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT
+# CHECK-NO-OPT-NOT: pseudo_probes
+# CHECK-NO-OPT-NOT: guid
+# CHECK-NO-OPT-NOT: pseudo_probe_desc_hash
 
 CHECK: Report of decoding input pseudo probe binaries
 
diff --git a/bolt/test/X86/three-way-split-jt.s b/bolt/test/X86/three-way-split-jt.s
new file mode 100644
index 0000000000000..c4ddc1b9905d7
--- /dev/null
+++ b/bolt/test/X86/three-way-split-jt.s
@@ -0,0 +1,95 @@
+## This reproduces an issue where the function is split into three fragments
+## and all fragments access the same jump table.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -v=1 -print-only=main.warm -print-cfg  2>&1 | FileCheck %s
+
+# CHECK-DAG: BOLT-INFO: marking main.warm as a fragment of main
+# CHECK-DAG: BOLT-INFO: marking main.cold as a fragment of main
+# CHECK-DAG: BOLT-INFO: processing main.warm as a sibling of non-ignored function
+# CHECK-DAG: BOLT-INFO: processing main.cold as a sibling of non-ignored function
+# CHECK-DAG: BOLT-WARNING: Ignoring main.cold
+# CHECK-DAG: BOLT-WARNING: Ignoring main.warm
+# CHECK-DAG: BOLT-WARNING: Ignoring main
+# CHECK: BOLT-WARNING: skipped 3 functions due to cold fragments
+
+# CHECK: PIC Jump table JUMP_TABLE for function main, main.warm, main.cold
+# CHECK-NEXT:   0x0000 : __ENTRY_main@0x[[#]]
+# CHECK-NEXT:   0x0004 : __ENTRY_main@0x[[#]]
+# CHECK-NEXT:   0x0008 : __ENTRY_main.cold@0x[[#]]
+# CHECK-NEXT:   0x000c : __ENTRY_main@0x[[#]]
+  .globl main
+  .type main, %function
+  .p2align 2
+main:
+LBB0:
+  andl $0xf, %ecx
+  cmpb $0x4, %cl
+  ## exit through ret
+  ja LBB3
+
+## jump table dispatch, jumping to label indexed by val in %ecx
+LBB1:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+
+LBB2:
+  xorq %rax, %rax
+LBB3:
+  addq $0x8, %rsp
+  ret
+.size main, .-main
+
+  .globl main.warm
+  .type main.warm, %function
+  .p2align 2
+main.warm:
+LBB20:
+  andl $0xb, %ebx
+  cmpb $0x1, %cl
+  # exit through ret
+  ja LBB23
+
+## jump table dispatch, jumping to label indexed by val in %ecx
+LBB21:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+
+LBB22:
+  xorq %rax, %rax
+LBB23:
+  addq $0x8, %rsp
+  ret
+.size main.warm, .-main.warm
+
+## cold fragment is only reachable through jump table
+  .globl main.cold
+  .type main.cold, %function
+main.cold:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+LBB4:
+  callq abort
+.size main.cold, .-main.cold
+
+  .rodata
+## jmp table, entries must be R_X86_64_PC32 relocs
+  .globl JUMP_TABLE
+JUMP_TABLE:
+  .long LBB2-JUMP_TABLE
+  .long LBB3-JUMP_TABLE
+  .long LBB4-JUMP_TABLE
+  .long LBB3-JUMP_TABLE
diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py
index 3a6da210e01f0..da3ae34ba3bdd 100644
--- a/bolt/test/lit.cfg.py
+++ b/bolt/test/lit.cfg.py
@@ -92,10 +92,22 @@
 
 tool_dirs = [config.llvm_tools_dir, config.test_source_root]
 
+llvm_bolt_args = []
+
+if config.libbolt_rt_instr:
+    llvm_bolt_args.append(f"--runtime-instrumentation-lib={config.libbolt_rt_instr}")
+
+if config.libbolt_rt_hugify:
+    llvm_bolt_args.append(f"--runtime-hugify-lib={config.libbolt_rt_hugify}")
+
 tools = [
     ToolSubst("llc", unresolved="fatal"),
     ToolSubst("llvm-dwarfdump", unresolved="fatal"),
-    ToolSubst("llvm-bolt", unresolved="fatal"),
+    ToolSubst(
+        "llvm-bolt",
+        unresolved="fatal",
+        extra_args=llvm_bolt_args,
+    ),
     ToolSubst("llvm-boltdiff", unresolved="fatal"),
     ToolSubst("llvm-bolt-heatmap", unresolved="fatal"),
     ToolSubst("llvm-bat-dump", unresolved="fatal"),
diff --git a/bolt/test/lit.site.cfg.py.in b/bolt/test/lit.site.cfg.py.in
index 46cb326dfbae1..457908fc7c446 100644
--- a/bolt/test/lit.site.cfg.py.in
+++ b/bolt/test/lit.site.cfg.py.in
@@ -19,6 +19,8 @@ config.bolt_clang = "@BOLT_CLANG_EXE@"
 config.bolt_lld = "@BOLT_LLD_EXE@"
 config.targets_to_build = "@BOLT_TARGETS_TO_BUILD@"
 config.gnu_ld = "@GNU_LD_EXECUTABLE@"
+config.libbolt_rt_instr = "@LIBBOLT_RT_INSTR@"
+config.libbolt_rt_hugify = "@LIBBOLT_RT_HUGIFY@"
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
diff --git a/bolt/test/perf2bolt/lit.local.cfg b/bolt/test/perf2bolt/lit.local.cfg
index 05f41ff333b0e..4ee9ad08cc78a 100644
--- a/bolt/test/perf2bolt/lit.local.cfg
+++ b/bolt/test/perf2bolt/lit.local.cfg
@@ -1,4 +1,4 @@
 import shutil
 
-if shutil.which("perf") != None:
+if shutil.which("perf") is not None:
     config.available_features.add("perf")
\ No newline at end of file
diff --git a/bolt/test/timers.c b/bolt/test/timers.c
new file mode 100644
index 0000000000000..a34958a2a15e9
--- /dev/null
+++ b/bolt/test/timers.c
@@ -0,0 +1,22 @@
+/* This test checks timers for metadata manager phases.
+# RUN: %clang %cflags %s -o %t.exe
+# RUN: link_fdata %s %t.exe %t.fdata
+# RUN: llvm-bolt %t.exe -o %t.null --data %t.fdata -w %t.yaml --time-rewrite \
+# RUN:   2>&1 | FileCheck %s
+# RUN: link_fdata %s %t.exe %t.preagg PREAGG
+# RUN: perf2bolt %t.exe -o %t.null -p %t.preagg --pa --time-rewrite \
+# RUN:   2>&1 | FileCheck %s --check-prefix=CHECK-P2B
+
+# CHECK-DAG: update metadata post-emit
+# CHECK-DAG: process section metadata
+# CHECK-DAG: process metadata pre-CFG
+# CHECK-DAG: process metadata post-CFG
+# CHECK-DAG: finalize metadata pre-emit
+
+# CHECK-P2B-DAG: process section metadata
+# CHECK-P2B-DAG: process metadata pre-CFG
+
+# FDATA: 0 [unknown] 0 1 main 0 1 0
+# PREAGG: B X:0 #main# 1 0
+*/
+int main() { return 0; }
diff --git a/bolt/utils/docker/Dockerfile b/bolt/utils/docker/Dockerfile
index 722a07e46f9e4..c2108f7aec53c 100644
--- a/bolt/utils/docker/Dockerfile
+++ b/bolt/utils/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04 AS builder
+FROM ubuntu:24.04 AS builder
 
 ARG DEBIAN_FRONTEND=noninteractive
 ENV TZ=UTC
@@ -26,6 +26,6 @@ RUN mkdir build && \
     ninja install-llvm-bolt install-perf2bolt install-merge-fdata \
       install-llvm-boltdiff install-bolt_rt
 
-FROM ubuntu:20.04
+FROM ubuntu:24.04
 
 COPY --from=builder /home/bolt/install /usr/local
diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
index f6b5e8926f903..aef22453035c3 100644
--- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <optional>
 #include <string>
 
@@ -979,6 +980,18 @@ static llvm::Error serializeIndex(ClangDocContext &CDCtx) {
                                    "error creating index file: " +
                                        FileErr.message());
   }
+  llvm::SmallString<128> RootPath(CDCtx.OutDirectory);
+  if (llvm::sys::path::is_relative(RootPath)) {
+    llvm::sys::fs::make_absolute(RootPath);
+  }
+  // Replace the escaped characters with a forward slash. It shouldn't matter
+  // when rendering the webpage in a web browser. This helps to prevent the
+  // JavaScript from escaping characters incorrectly, and introducing  bad paths
+  // in the URLs.
+  std::string RootPathEscaped = RootPath.str().str();
+  std::replace(RootPathEscaped.begin(), RootPathEscaped.end(), '\\', '/');
+  OS << "var RootPath = \"" << RootPathEscaped << "\";\n";
+
   CDCtx.Idx.sort();
   llvm::json::OStream J(OS, 2);
   std::function<void(Index)> IndexToJSON = [&](const Index &I) {
diff --git a/clang-tools-extra/clang-doc/Mapper.cpp b/clang-tools-extra/clang-doc/Mapper.cpp
index bb8b7952980ac..6c90db03424c6 100644
--- a/clang-tools-extra/clang-doc/Mapper.cpp
+++ b/clang-tools-extra/clang-doc/Mapper.cpp
@@ -12,16 +12,28 @@
 #include "clang/AST/Comment.h"
 #include "clang/Index/USRGeneration.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Error.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Mutex.h"
 
 namespace clang {
 namespace doc {
 
+static llvm::StringSet<> USRVisited;
+static llvm::sys::Mutex USRVisitedGuard;
+
+template <typename T> bool isTypedefAnonRecord(const T *D) {
+  if (const auto *C = dyn_cast<CXXRecordDecl>(D)) {
+    return C->getTypedefNameForAnonDecl();
+  }
+  return false;
+}
+
 void MapASTVisitor::HandleTranslationUnit(ASTContext &Context) {
   TraverseDecl(Context.getTranslationUnitDecl());
 }
 
-template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
+template <typename T>
+bool MapASTVisitor::mapDecl(const T *D, bool IsDefinition) {
   // If we're looking a decl not in user files, skip this decl.
   if (D->getASTContext().getSourceManager().isInSystemHeader(D->getLocation()))
     return true;
@@ -34,6 +46,16 @@ template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
   // If there is an error generating a USR for the decl, skip this decl.
   if (index::generateUSRForDecl(D, USR))
     return true;
+  // Prevent Visiting USR twice
+  {
+    std::lock_guard<llvm::sys::Mutex> Guard(USRVisitedGuard);
+    StringRef Visited = USR.str();
+    if (USRVisited.count(Visited) && !isTypedefAnonRecord<T>(D))
+      return true;
+    // We considered a USR to be visited only when its defined
+    if (IsDefinition)
+      USRVisited.insert(Visited);
+  }
   bool IsFileInRootDir;
   llvm::SmallString<128> File =
       getFile(D, D->getASTContext(), CDCtx.SourceRoot, IsFileInRootDir);
@@ -53,30 +75,34 @@ template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
 }
 
 bool MapASTVisitor::VisitNamespaceDecl(const NamespaceDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
-bool MapASTVisitor::VisitRecordDecl(const RecordDecl *D) { return mapDecl(D); }
+bool MapASTVisitor::VisitRecordDecl(const RecordDecl *D) {
+  return mapDecl(D, D->isThisDeclarationADefinition());
+}
 
-bool MapASTVisitor::VisitEnumDecl(const EnumDecl *D) { return mapDecl(D); }
+bool MapASTVisitor::VisitEnumDecl(const EnumDecl *D) {
+  return mapDecl(D, D->isThisDeclarationADefinition());
+}
 
 bool MapASTVisitor::VisitCXXMethodDecl(const CXXMethodDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, D->isThisDeclarationADefinition());
 }
 
 bool MapASTVisitor::VisitFunctionDecl(const FunctionDecl *D) {
   // Don't visit CXXMethodDecls twice
   if (isa<CXXMethodDecl>(D))
     return true;
-  return mapDecl(D);
+  return mapDecl(D, D->isThisDeclarationADefinition());
 }
 
 bool MapASTVisitor::VisitTypedefDecl(const TypedefDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
 bool MapASTVisitor::VisitTypeAliasDecl(const TypeAliasDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
 comments::FullComment *
diff --git a/clang-tools-extra/clang-doc/Mapper.h b/clang-tools-extra/clang-doc/Mapper.h
index cedde935ab743..75c8e947c8f90 100644
--- a/clang-tools-extra/clang-doc/Mapper.h
+++ b/clang-tools-extra/clang-doc/Mapper.h
@@ -43,7 +43,7 @@ class MapASTVisitor : public clang::RecursiveASTVisitor<MapASTVisitor>,
   bool VisitTypeAliasDecl(const TypeAliasDecl *D);
 
 private:
-  template <typename T> bool mapDecl(const T *D);
+  template <typename T> bool mapDecl(const T *D, bool IsDefinition);
 
   int getLine(const NamedDecl *D, const ASTContext &Context) const;
   llvm::SmallString<128> getFile(const NamedDecl *D, const ASTContext &Context,
diff --git a/clang-tools-extra/clang-doc/assets/index.js b/clang-tools-extra/clang-doc/assets/index.js
index 49818763a4393..6a223de66f84a 100644
--- a/clang-tools-extra/clang-doc/assets/index.js
+++ b/clang-tools-extra/clang-doc/assets/index.js
@@ -1,42 +1,17 @@
-// Append using posix-style a file name or directory to Base
-function append(Base, New) {
-  if (!New)
-    return Base;
-  if (Base)
-    Base += "/";
-  Base += New;
-  return Base;
-}
-
-// Get relative path to access FilePath from CurrentDirectory
-function computeRelativePath(FilePath, CurrentDirectory) {
-  var Path = FilePath;
-  while (Path) {
-    if (CurrentDirectory == Path)
-      return FilePath.substring(Path.length + 1);
-    Path = Path.substring(0, Path.lastIndexOf("/"));
-  }
-
-  var Dir = CurrentDirectory;
-  var Result = "";
-  while (Dir) {
-    if (Dir == FilePath)
-      break;
-    Dir = Dir.substring(0, Dir.lastIndexOf("/"));
-    Result = append(Result, "..")
+function genLink(Ref) {
+  // we treat the file paths different depending on if we're
+  // serving via a http server or viewing from a local
+  var Path = window.location.protocol.startsWith("file") ?
+      `${window.location.protocol}//${window.location.host}/${Ref.Path}` :
+      `${window.location.protocol}//${RootPath}/${Ref.Path}`;
+  if (Ref.RefType === "namespace") {
+    Path = `${Path}/index.html`
+  } else if (Ref.Path === "") {
+      Path = `${Path}${Ref.Name}.html`;
+  } else {
+    Path = `${Path}/${Ref.Name}.html`;
   }
-  Result = append(Result, FilePath.substring(Dir.length))
-  return Result;
-}
-
-function genLink(Ref, CurrentDirectory) {
-  var Path = computeRelativePath(Ref.Path, CurrentDirectory);
-  if (Ref.RefType == "namespace")
-    Path = append(Path, "index.html");
-  else
-    Path = append(Path, Ref.Name + ".html")
-
-    ANode = document.createElement("a");
+  ANode = document.createElement("a");
   ANode.setAttribute("href", Path);
   var TextNode = document.createTextNode(Ref.Name);
   ANode.appendChild(TextNode);
diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
index 6198a6e0cdcc3..3363cafeded5e 100644
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -303,7 +303,6 @@ Example usage for a project using a compile commands database:
   for (auto &Group : USRToBitcode) {
     Pool.async([&]() {
       std::vector<std::unique_ptr<doc::Info>> Infos;
-
       for (auto &Bitcode : Group.getValue()) {
         llvm::BitstreamCursor Stream(Bitcode);
         doc::ClangDocBitcodeReader Reader(Stream);
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 3b14d5d158d2d..bd69bddcc6825 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -13,9 +13,12 @@
 
 import argparse
 import io
+import itertools
 import os
 import re
 import sys
+import textwrap
+
 
 # Adapts the module's CMakelist file. Returns 'True' if it could add a new
 # entry and 'False' if the entry already existed.
@@ -53,7 +56,29 @@ def adapt_cmake(module_path, check_name_camel):
 
 
 # Adds a header for the new check.
-def write_header(module_path, module, namespace, check_name, check_name_camel):
+def write_header(
+    module_path,
+    module,
+    namespace,
+    check_name,
+    check_name_camel,
+    description,
+    lang_restrict,
+):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="/// ", subsequent_indent="/// "
+        )
+    )
+    if lang_restrict:
+        override_supported = """
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+    return %s;
+  }""" % (
+            lang_restrict % {"lang": "LangOpts"}
+        )
+    else:
+        override_supported = ""
     filename = os.path.join(module_path, check_name_camel) + ".h"
     print("Creating %s..." % filename)
     with io.open(filename, "w", encoding="utf8", newline="\n") as f:
@@ -85,7 +110,7 @@ def write_header(module_path, module, namespace, check_name, check_name_camel):
 
 namespace clang::tidy::%(namespace)s {
 
-/// FIXME: Write a short description.
+%(description)s
 ///
 /// For the user-facing documentation see:
 /// http://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html
@@ -94,7 +119,7 @@ class %(check_name_camel)s : public ClangTidyCheck {
   %(check_name_camel)s(StringRef Name, ClangTidyContext *Context)
       : ClangTidyCheck(Name, Context) {}
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
-  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;%(override_supported)s
 };
 
 } // namespace clang::tidy::%(namespace)s
@@ -107,6 +132,8 @@ class %(check_name_camel)s : public ClangTidyCheck {
                 "check_name": check_name,
                 "module": module,
                 "namespace": namespace,
+                "description": wrapped_desc,
+                "override_supported": override_supported,
             }
         )
 
@@ -235,7 +262,12 @@ def adapt_module(module_path, module, check_name, check_name_camel):
 
 
 # Adds a release notes entry.
-def add_release_notes(module_path, module, check_name):
+def add_release_notes(module_path, module, check_name, description):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="  ", subsequent_indent="  "
+        )
+    )
     check_name_dashes = module + "-" + check_name
     filename = os.path.normpath(
         os.path.join(module_path, "../../docs/ReleaseNotes.rst")
@@ -281,10 +313,10 @@ def add_release_notes(module_path, module, check_name):
                             """- New :doc:`%s
   <clang-tidy/checks/%s/%s>` check.
 
-  FIXME: add release notes.
+%s
 
 """
-                            % (check_name_dashes, module, check_name)
+                            % (check_name_dashes, module, check_name, wrapped_desc)
                         )
                         note_added = True
 
@@ -292,7 +324,9 @@ def add_release_notes(module_path, module, check_name):
 
 
 # Adds a test for the check.
-def write_test(module_path, module, check_name, test_extension):
+def write_test(module_path, module, check_name, test_extension, test_standard):
+    if test_standard:
+        test_standard = f"-std={test_standard}-or-later "
     check_name_dashes = module + "-" + check_name
     filename = os.path.normpath(
         os.path.join(
@@ -309,7 +343,7 @@ def write_test(module_path, module, check_name, test_extension):
     print("Creating %s..." % filename)
     with io.open(filename, "w", encoding="utf8", newline="\n") as f:
         f.write(
-            """// RUN: %%check_clang_tidy %%s %(check_name_dashes)s %%t
+            """// RUN: %%check_clang_tidy %(standard)s%%s %(check_name_dashes)s %%t
 
 // FIXME: Add something that triggers the check here.
 void f();
@@ -324,7 +358,7 @@ def write_test(module_path, module, check_name, test_extension):
 // FIXME: Add something that doesn't trigger the check here.
 void awesome_f2();
 """
-            % {"check_name_dashes": check_name_dashes}
+            % {"check_name_dashes": check_name_dashes, "standard": test_standard}
         )
 
 
@@ -497,7 +531,10 @@ def format_link_alias(doc_file):
         if (match or (check_name.startswith("clang-analyzer-"))) and check_name:
             module = doc_file[0]
             check_file = doc_file[1].replace(".rst", "")
-            if not match or match.group(1) == "https://clang.llvm.org/docs/analyzer/checkers":
+            if (
+                not match
+                or match.group(1) == "https://clang.llvm.org/docs/analyzer/checkers"
+            ):
                 title = "Clang Static Analyzer " + check_file
                 # Preserve the anchor in checkers.html from group 2.
                 target = "" if not match else match.group(1) + ".html" + match.group(2)
@@ -515,7 +552,7 @@ def format_link_alias(doc_file):
             if target:
                 # The checker is just a redirect.
                 return (
-                        "   :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(ref_begin)s`%(title)s <%(target)s>`%(ref_end)s,%(autofix)s\n"
+                    "   :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(ref_begin)s`%(title)s <%(target)s>`%(ref_end)s,%(autofix)s\n"
                     % {
                         "check_name": check_name,
                         "module": module,
@@ -523,13 +560,14 @@ def format_link_alias(doc_file):
                         "target": target,
                         "title": title,
                         "autofix": autofix,
-                        "ref_begin" : ref_begin,
-                        "ref_end" : ref_end
-                    })
+                        "ref_begin": ref_begin,
+                        "ref_end": ref_end,
+                    }
+                )
             else:
                 # The checker is just a alias without redirect.
                 return (
-                        "   :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(title)s,%(autofix)s\n"
+                    "   :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(title)s,%(autofix)s\n"
                     % {
                         "check_name": check_name,
                         "module": module,
@@ -537,7 +575,8 @@ def format_link_alias(doc_file):
                         "target": target,
                         "title": title,
                         "autofix": autofix,
-                    })
+                    }
+                )
         return ""
 
     checks = map(format_link, doc_files)
@@ -552,8 +591,8 @@ def format_link_alias(doc_file):
                 f.write('   :header: "Name", "Offers fixes"\n\n')
                 f.writelines(checks)
                 # and the aliases
-                f.write("\n\n")
-                f.write(".. csv-table:: Aliases..\n")
+                f.write("\nCheck aliases\n-------------\n\n")
+                f.write(".. csv-table::\n")
                 f.write('   :header: "Name", "Redirect", "Offers fixes"\n\n')
                 f.writelines(checks_alias)
                 break
@@ -599,6 +638,22 @@ def main():
         "objc": "m",
         "objc++": "mm",
     }
+    cpp_language_to_requirements = {
+        "c++98": "CPlusPlus",
+        "c++11": "CPlusPlus11",
+        "c++14": "CPlusPlus14",
+        "c++17": "CPlusPlus17",
+        "c++20": "CPlusPlus20",
+        "c++23": "CPlusPlus23",
+        "c++26": "CPlusPlus26",
+    }
+    c_language_to_requirements = {
+        "c99": None,
+        "c11": "C11",
+        "c17": "C17",
+        "c23": "C23",
+        "c27": "C2Y",
+    }
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--update-docs",
@@ -609,9 +664,26 @@ def main():
         "--language",
         help="language to use for new check (defaults to c++)",
         choices=language_to_extension.keys(),
-        default="c++",
+        default=None,
         metavar="LANG",
     )
+    parser.add_argument(
+        "--description",
+        "-d",
+        help="short description of what the check does",
+        default="FIXME: Write a short description",
+        type=str,
+    )
+    parser.add_argument(
+        "--standard",
+        help="Specify a specific version of the language",
+        choices=list(
+            itertools.chain(
+                cpp_language_to_requirements.keys(), c_language_to_requirements.keys()
+            )
+        ),
+        default=None,
+    )
     parser.add_argument(
         "module",
         nargs="?",
@@ -652,12 +724,53 @@ def main():
     else:
         namespace = module
 
-    write_header(module_path, module, namespace, check_name, check_name_camel)
+    description = args.description
+    if not description.endswith("."):
+        description += "."
+
+    language = args.language
+
+    if args.standard:
+        if args.standard in cpp_language_to_requirements:
+            if language and language != "c++":
+                raise ValueError("C++ standard chosen when language is not C++")
+            language = "c++"
+        elif args.standard in c_language_to_requirements:
+            if language and language != "c":
+                raise ValueError("C standard chosen when language is not C")
+            language = "c"
+
+    if not language:
+        language = "c++"
+
+    language_restrict = None
+
+    if language == "c":
+        language_restrict = "!%(lang)s.CPlusPlus"
+        extra = c_language_to_requirements.get(args.standard, None)
+        if extra:
+            language_restrict += f" && %(lang)s.{extra}"
+    elif language == "c++":
+        language_restrict = (
+            f"%(lang)s.{cpp_language_to_requirements.get(args.standard, 'CPlusPlus')}"
+        )
+    elif language in ["objc", "objc++"]:
+        language_restrict = "%(lang)s.ObjC"
+
+    write_header(
+        module_path,
+        module,
+        namespace,
+        check_name,
+        check_name_camel,
+        description,
+        language_restrict,
+    )
     write_implementation(module_path, module, namespace, check_name_camel)
     adapt_module(module_path, module, check_name, check_name_camel)
-    add_release_notes(module_path, module, check_name)
-    test_extension = language_to_extension.get(args.language)
-    write_test(module_path, module, check_name, test_extension)
+    add_release_notes(module_path, module, check_name, description)
+    test_extension = language_to_extension.get(language)
+    write_test(module_path, module, check_name, test_extension, args.standard)
     write_docs(module_path, module, check_name)
     update_checks_list(clang_tidy_path)
     print("Done. Now it's your turn!")
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
index 955a9b94dfaf6..1da5b222c2e8f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
@@ -138,13 +138,13 @@ UnusedReturnValueCheck::UnusedReturnValueCheck(llvm::StringRef Name,
                                           "^::sigismember$;"
                                           "^::strcasecmp$;"
                                           "^::strsignal$;"
-                                          "^::ttyname"))),
+                                          "^::ttyname$"))),
       CheckedReturnTypes(utils::options::parseStringList(
-          Options.get("CheckedReturnTypes", "::std::error_code$;"
-                                            "::std::error_condition$;"
-                                            "::std::errc$;"
-                                            "::std::expected$;"
-                                            "::boost::system::error_code"))),
+          Options.get("CheckedReturnTypes", "^::std::error_code$;"
+                                            "^::std::error_condition$;"
+                                            "^::std::errc$;"
+                                            "^::std::expected$;"
+                                            "^::boost::system::error_code$"))),
       AllowCastToVoid(Options.get("AllowCastToVoid", false)) {}
 
 UnusedReturnValueCheck::UnusedReturnValueCheck(
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index ffb62b409b29b..8b5be9cd95f76 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -49,183 +49,183 @@ namespace {
 // with NULL argument and in this case the check is not applicable:
 // `mblen, mbrlen, mbrtowc, mbtowc, wctomb, wctomb_s`.
 // FIXME: The check can be improved to handle such cases.
-const llvm::StringRef CertErr33CCheckedFunctions = "::aligned_alloc;"
-                                                   "::asctime_s;"
-                                                   "::at_quick_exit;"
-                                                   "::atexit;"
-                                                   "::bsearch;"
-                                                   "::bsearch_s;"
-                                                   "::btowc;"
-                                                   "::c16rtomb;"
-                                                   "::c32rtomb;"
-                                                   "::calloc;"
-                                                   "::clock;"
-                                                   "::cnd_broadcast;"
-                                                   "::cnd_init;"
-                                                   "::cnd_signal;"
-                                                   "::cnd_timedwait;"
-                                                   "::cnd_wait;"
-                                                   "::ctime_s;"
-                                                   "::fclose;"
-                                                   "::fflush;"
-                                                   "::fgetc;"
-                                                   "::fgetpos;"
-                                                   "::fgets;"
-                                                   "::fgetwc;"
-                                                   "::fopen;"
-                                                   "::fopen_s;"
-                                                   "::fprintf;"
-                                                   "::fprintf_s;"
-                                                   "::fputc;"
-                                                   "::fputs;"
-                                                   "::fputwc;"
-                                                   "::fputws;"
-                                                   "::fread;"
-                                                   "::freopen;"
-                                                   "::freopen_s;"
-                                                   "::fscanf;"
-                                                   "::fscanf_s;"
-                                                   "::fseek;"
-                                                   "::fsetpos;"
-                                                   "::ftell;"
-                                                   "::fwprintf;"
-                                                   "::fwprintf_s;"
-                                                   "::fwrite;"
-                                                   "::fwscanf;"
-                                                   "::fwscanf_s;"
-                                                   "::getc;"
-                                                   "::getchar;"
-                                                   "::getenv;"
-                                                   "::getenv_s;"
-                                                   "::gets_s;"
-                                                   "::getwc;"
-                                                   "::getwchar;"
-                                                   "::gmtime;"
-                                                   "::gmtime_s;"
-                                                   "::localtime;"
-                                                   "::localtime_s;"
-                                                   "::malloc;"
-                                                   "::mbrtoc16;"
-                                                   "::mbrtoc32;"
-                                                   "::mbsrtowcs;"
-                                                   "::mbsrtowcs_s;"
-                                                   "::mbstowcs;"
-                                                   "::mbstowcs_s;"
-                                                   "::memchr;"
-                                                   "::mktime;"
-                                                   "::mtx_init;"
-                                                   "::mtx_lock;"
-                                                   "::mtx_timedlock;"
-                                                   "::mtx_trylock;"
-                                                   "::mtx_unlock;"
-                                                   "::printf_s;"
-                                                   "::putc;"
-                                                   "::putwc;"
-                                                   "::raise;"
-                                                   "::realloc;"
-                                                   "::remove;"
-                                                   "::rename;"
-                                                   "::scanf;"
-                                                   "::scanf_s;"
-                                                   "::setlocale;"
-                                                   "::setvbuf;"
-                                                   "::signal;"
-                                                   "::snprintf;"
-                                                   "::snprintf_s;"
-                                                   "::sprintf;"
-                                                   "::sprintf_s;"
-                                                   "::sscanf;"
-                                                   "::sscanf_s;"
-                                                   "::strchr;"
-                                                   "::strerror_s;"
-                                                   "::strftime;"
-                                                   "::strpbrk;"
-                                                   "::strrchr;"
-                                                   "::strstr;"
-                                                   "::strtod;"
-                                                   "::strtof;"
-                                                   "::strtoimax;"
-                                                   "::strtok;"
-                                                   "::strtok_s;"
-                                                   "::strtol;"
-                                                   "::strtold;"
-                                                   "::strtoll;"
-                                                   "::strtoul;"
-                                                   "::strtoull;"
-                                                   "::strtoumax;"
-                                                   "::strxfrm;"
-                                                   "::swprintf;"
-                                                   "::swprintf_s;"
-                                                   "::swscanf;"
-                                                   "::swscanf_s;"
-                                                   "::thrd_create;"
-                                                   "::thrd_detach;"
-                                                   "::thrd_join;"
-                                                   "::thrd_sleep;"
-                                                   "::time;"
-                                                   "::timespec_get;"
-                                                   "::tmpfile;"
-                                                   "::tmpfile_s;"
-                                                   "::tmpnam;"
-                                                   "::tmpnam_s;"
-                                                   "::tss_create;"
-                                                   "::tss_get;"
-                                                   "::tss_set;"
-                                                   "::ungetc;"
-                                                   "::ungetwc;"
-                                                   "::vfprintf;"
-                                                   "::vfprintf_s;"
-                                                   "::vfscanf;"
-                                                   "::vfscanf_s;"
-                                                   "::vfwprintf;"
-                                                   "::vfwprintf_s;"
-                                                   "::vfwscanf;"
-                                                   "::vfwscanf_s;"
-                                                   "::vprintf_s;"
-                                                   "::vscanf;"
-                                                   "::vscanf_s;"
-                                                   "::vsnprintf;"
-                                                   "::vsnprintf_s;"
-                                                   "::vsprintf;"
-                                                   "::vsprintf_s;"
-                                                   "::vsscanf;"
-                                                   "::vsscanf_s;"
-                                                   "::vswprintf;"
-                                                   "::vswprintf_s;"
-                                                   "::vswscanf;"
-                                                   "::vswscanf_s;"
-                                                   "::vwprintf_s;"
-                                                   "::vwscanf;"
-                                                   "::vwscanf_s;"
-                                                   "::wcrtomb;"
-                                                   "::wcschr;"
-                                                   "::wcsftime;"
-                                                   "::wcspbrk;"
-                                                   "::wcsrchr;"
-                                                   "::wcsrtombs;"
-                                                   "::wcsrtombs_s;"
-                                                   "::wcsstr;"
-                                                   "::wcstod;"
-                                                   "::wcstof;"
-                                                   "::wcstoimax;"
-                                                   "::wcstok;"
-                                                   "::wcstok_s;"
-                                                   "::wcstol;"
-                                                   "::wcstold;"
-                                                   "::wcstoll;"
-                                                   "::wcstombs;"
-                                                   "::wcstombs_s;"
-                                                   "::wcstoul;"
-                                                   "::wcstoull;"
-                                                   "::wcstoumax;"
-                                                   "::wcsxfrm;"
-                                                   "::wctob;"
-                                                   "::wctrans;"
-                                                   "::wctype;"
-                                                   "::wmemchr;"
-                                                   "::wprintf_s;"
-                                                   "::wscanf;"
-                                                   "::wscanf_s;";
+const llvm::StringRef CertErr33CCheckedFunctions = "^::aligned_alloc;"
+                                                   "^::asctime_s;"
+                                                   "^::at_quick_exit;"
+                                                   "^::atexit;"
+                                                   "^::bsearch;"
+                                                   "^::bsearch_s;"
+                                                   "^::btowc;"
+                                                   "^::c16rtomb;"
+                                                   "^::c32rtomb;"
+                                                   "^::calloc;"
+                                                   "^::clock;"
+                                                   "^::cnd_broadcast;"
+                                                   "^::cnd_init;"
+                                                   "^::cnd_signal;"
+                                                   "^::cnd_timedwait;"
+                                                   "^::cnd_wait;"
+                                                   "^::ctime_s;"
+                                                   "^::fclose;"
+                                                   "^::fflush;"
+                                                   "^::fgetc;"
+                                                   "^::fgetpos;"
+                                                   "^::fgets;"
+                                                   "^::fgetwc;"
+                                                   "^::fopen;"
+                                                   "^::fopen_s;"
+                                                   "^::fprintf;"
+                                                   "^::fprintf_s;"
+                                                   "^::fputc;"
+                                                   "^::fputs;"
+                                                   "^::fputwc;"
+                                                   "^::fputws;"
+                                                   "^::fread;"
+                                                   "^::freopen;"
+                                                   "^::freopen_s;"
+                                                   "^::fscanf;"
+                                                   "^::fscanf_s;"
+                                                   "^::fseek;"
+                                                   "^::fsetpos;"
+                                                   "^::ftell;"
+                                                   "^::fwprintf;"
+                                                   "^::fwprintf_s;"
+                                                   "^::fwrite;"
+                                                   "^::fwscanf;"
+                                                   "^::fwscanf_s;"
+                                                   "^::getc;"
+                                                   "^::getchar;"
+                                                   "^::getenv;"
+                                                   "^::getenv_s;"
+                                                   "^::gets_s;"
+                                                   "^::getwc;"
+                                                   "^::getwchar;"
+                                                   "^::gmtime;"
+                                                   "^::gmtime_s;"
+                                                   "^::localtime;"
+                                                   "^::localtime_s;"
+                                                   "^::malloc;"
+                                                   "^::mbrtoc16;"
+                                                   "^::mbrtoc32;"
+                                                   "^::mbsrtowcs;"
+                                                   "^::mbsrtowcs_s;"
+                                                   "^::mbstowcs;"
+                                                   "^::mbstowcs_s;"
+                                                   "^::memchr;"
+                                                   "^::mktime;"
+                                                   "^::mtx_init;"
+                                                   "^::mtx_lock;"
+                                                   "^::mtx_timedlock;"
+                                                   "^::mtx_trylock;"
+                                                   "^::mtx_unlock;"
+                                                   "^::printf_s;"
+                                                   "^::putc;"
+                                                   "^::putwc;"
+                                                   "^::raise;"
+                                                   "^::realloc;"
+                                                   "^::remove;"
+                                                   "^::rename;"
+                                                   "^::scanf;"
+                                                   "^::scanf_s;"
+                                                   "^::setlocale;"
+                                                   "^::setvbuf;"
+                                                   "^::signal;"
+                                                   "^::snprintf;"
+                                                   "^::snprintf_s;"
+                                                   "^::sprintf;"
+                                                   "^::sprintf_s;"
+                                                   "^::sscanf;"
+                                                   "^::sscanf_s;"
+                                                   "^::strchr;"
+                                                   "^::strerror_s;"
+                                                   "^::strftime;"
+                                                   "^::strpbrk;"
+                                                   "^::strrchr;"
+                                                   "^::strstr;"
+                                                   "^::strtod;"
+                                                   "^::strtof;"
+                                                   "^::strtoimax;"
+                                                   "^::strtok;"
+                                                   "^::strtok_s;"
+                                                   "^::strtol;"
+                                                   "^::strtold;"
+                                                   "^::strtoll;"
+                                                   "^::strtoul;"
+                                                   "^::strtoull;"
+                                                   "^::strtoumax;"
+                                                   "^::strxfrm;"
+                                                   "^::swprintf;"
+                                                   "^::swprintf_s;"
+                                                   "^::swscanf;"
+                                                   "^::swscanf_s;"
+                                                   "^::thrd_create;"
+                                                   "^::thrd_detach;"
+                                                   "^::thrd_join;"
+                                                   "^::thrd_sleep;"
+                                                   "^::time;"
+                                                   "^::timespec_get;"
+                                                   "^::tmpfile;"
+                                                   "^::tmpfile_s;"
+                                                   "^::tmpnam;"
+                                                   "^::tmpnam_s;"
+                                                   "^::tss_create;"
+                                                   "^::tss_get;"
+                                                   "^::tss_set;"
+                                                   "^::ungetc;"
+                                                   "^::ungetwc;"
+                                                   "^::vfprintf;"
+                                                   "^::vfprintf_s;"
+                                                   "^::vfscanf;"
+                                                   "^::vfscanf_s;"
+                                                   "^::vfwprintf;"
+                                                   "^::vfwprintf_s;"
+                                                   "^::vfwscanf;"
+                                                   "^::vfwscanf_s;"
+                                                   "^::vprintf_s;"
+                                                   "^::vscanf;"
+                                                   "^::vscanf_s;"
+                                                   "^::vsnprintf;"
+                                                   "^::vsnprintf_s;"
+                                                   "^::vsprintf;"
+                                                   "^::vsprintf_s;"
+                                                   "^::vsscanf;"
+                                                   "^::vsscanf_s;"
+                                                   "^::vswprintf;"
+                                                   "^::vswprintf_s;"
+                                                   "^::vswscanf;"
+                                                   "^::vswscanf_s;"
+                                                   "^::vwprintf_s;"
+                                                   "^::vwscanf;"
+                                                   "^::vwscanf_s;"
+                                                   "^::wcrtomb;"
+                                                   "^::wcschr;"
+                                                   "^::wcsftime;"
+                                                   "^::wcspbrk;"
+                                                   "^::wcsrchr;"
+                                                   "^::wcsrtombs;"
+                                                   "^::wcsrtombs_s;"
+                                                   "^::wcsstr;"
+                                                   "^::wcstod;"
+                                                   "^::wcstof;"
+                                                   "^::wcstoimax;"
+                                                   "^::wcstok;"
+                                                   "^::wcstok_s;"
+                                                   "^::wcstol;"
+                                                   "^::wcstold;"
+                                                   "^::wcstoll;"
+                                                   "^::wcstombs;"
+                                                   "^::wcstombs_s;"
+                                                   "^::wcstoul;"
+                                                   "^::wcstoull;"
+                                                   "^::wcstoumax;"
+                                                   "^::wcsxfrm;"
+                                                   "^::wctob;"
+                                                   "^::wctrans;"
+                                                   "^::wctype;"
+                                                   "^::wmemchr;"
+                                                   "^::wprintf_s;"
+                                                   "^::wscanf;"
+                                                   "^::wscanf_s;";
 
 } // namespace
 
diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
index 8b500de0c028c..e20cf6fbcb55a 100644
--- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
@@ -93,13 +93,12 @@ void ConstCorrectnessCheck::registerMatchers(MatchFinder *Finder) {
   // shall be run.
   const auto FunctionScope =
       functionDecl(
-          hasBody(
-              compoundStmt(forEachDescendant(
-                               declStmt(containsAnyDeclaration(
-                                            LocalValDecl.bind("local-value")),
-                                        unless(has(decompositionDecl())))
-                                   .bind("decl-stmt")))
-                  .bind("scope")))
+          hasBody(stmt(forEachDescendant(
+                           declStmt(containsAnyDeclaration(
+                                        LocalValDecl.bind("local-value")),
+                                    unless(has(decompositionDecl())))
+                               .bind("decl-stmt")))
+                      .bind("scope")))
           .bind("function-decl");
 
   Finder->addMatcher(FunctionScope, this);
@@ -109,7 +108,7 @@ void ConstCorrectnessCheck::registerMatchers(MatchFinder *Finder) {
 enum class VariableCategory { Value, Reference, Pointer };
 
 void ConstCorrectnessCheck::check(const MatchFinder::MatchResult &Result) {
-  const auto *LocalScope = Result.Nodes.getNodeAs<CompoundStmt>("scope");
+  const auto *LocalScope = Result.Nodes.getNodeAs<Stmt>("scope");
   const auto *Variable = Result.Nodes.getNodeAs<VarDecl>("local-value");
   const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>("function-decl");
 
@@ -198,7 +197,7 @@ void ConstCorrectnessCheck::check(const MatchFinder::MatchResult &Result) {
   }
 }
 
-void ConstCorrectnessCheck::registerScope(const CompoundStmt *LocalScope,
+void ConstCorrectnessCheck::registerScope(const Stmt *LocalScope,
                                           ASTContext *Context) {
   auto &Analyzer = ScopesCache[LocalScope];
   if (!Analyzer)
diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
index 08ffde524522a..bba060e555d00 100644
--- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
+++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
@@ -32,10 +32,10 @@ class ConstCorrectnessCheck : public ClangTidyCheck {
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
 
 private:
-  void registerScope(const CompoundStmt *LocalScope, ASTContext *Context);
+  void registerScope(const Stmt *LocalScope, ASTContext *Context);
 
   using MutationAnalyzer = std::unique_ptr<ExprMutationAnalyzer>;
-  llvm::DenseMap<const CompoundStmt *, MutationAnalyzer> ScopesCache;
+  llvm::DenseMap<const Stmt *, MutationAnalyzer> ScopesCache;
   llvm::DenseSet<SourceLocation> TemplateDiagnosticsCache;
 
   const bool AnalyzeValues;
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
index 5a4c2363bd8af..3a255c5c133f1 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
@@ -119,65 +119,72 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) {
     }
   }
 
-  const size_t Index = llvm::find(Function->parameters(), Param) -
-                       Function->parameters().begin();
+  handleConstRefFix(*Function, *Param, *Result.Context);
+}
+
+void UnnecessaryValueParamCheck::registerPPCallbacks(
+    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
+  Inserter.registerPreprocessor(PP);
+}
+
+void UnnecessaryValueParamCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
+  Options.store(Opts, "AllowedTypes",
+                utils::options::serializeStringList(AllowedTypes));
+}
+
+void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
+  MutationAnalyzerCache.clear();
+}
+
+void UnnecessaryValueParamCheck::handleConstRefFix(const FunctionDecl &Function,
+                                                   const ParmVarDecl &Param,
+                                                   ASTContext &Context) {
+  const size_t Index =
+      llvm::find(Function.parameters(), &Param) - Function.parameters().begin();
+  const bool IsConstQualified =
+      Param.getType().getCanonicalType().isConstQualified();
 
   auto Diag =
-      diag(Param->getLocation(),
+      diag(Param.getLocation(),
            "the %select{|const qualified }0parameter %1 is copied for each "
            "invocation%select{ but only used as a const reference|}0; consider "
            "making it a %select{const |}0reference")
-      << IsConstQualified << paramNameOrIndex(Param->getName(), Index);
+      << IsConstQualified << paramNameOrIndex(Param.getName(), Index);
   // Do not propose fixes when:
   // 1. the ParmVarDecl is in a macro, since we cannot place them correctly
   // 2. the function is virtual as it might break overrides
   // 3. the function is referenced outside of a call expression within the
   //    compilation unit as the signature change could introduce build errors.
   // 4. the function is an explicit template/ specialization.
-  const auto *Method = llvm::dyn_cast<CXXMethodDecl>(Function);
-  if (Param->getBeginLoc().isMacroID() || (Method && Method->isVirtual()) ||
-      isReferencedOutsideOfCallExpr(*Function, *Result.Context) ||
-      Function->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+  const auto *Method = llvm::dyn_cast<CXXMethodDecl>(&Function);
+  if (Param.getBeginLoc().isMacroID() || (Method && Method->isVirtual()) ||
+      isReferencedOutsideOfCallExpr(Function, Context) ||
+      Function.getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
     return;
-  for (const auto *FunctionDecl = Function; FunctionDecl != nullptr;
+  for (const auto *FunctionDecl = &Function; FunctionDecl != nullptr;
        FunctionDecl = FunctionDecl->getPreviousDecl()) {
     const auto &CurrentParam = *FunctionDecl->getParamDecl(Index);
-    Diag << utils::fixit::changeVarDeclToReference(CurrentParam,
-                                                   *Result.Context);
+    Diag << utils::fixit::changeVarDeclToReference(CurrentParam, Context);
     // The parameter of each declaration needs to be checked individually as to
     // whether it is const or not as constness can differ between definition and
     // declaration.
     if (!CurrentParam.getType().getCanonicalType().isConstQualified()) {
       if (std::optional<FixItHint> Fix = utils::fixit::addQualifierToVarDecl(
-              CurrentParam, *Result.Context, DeclSpec::TQ::TQ_const))
+              CurrentParam, Context, DeclSpec::TQ::TQ_const))
         Diag << *Fix;
     }
   }
 }
 
-void UnnecessaryValueParamCheck::registerPPCallbacks(
-    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
-  Inserter.registerPreprocessor(PP);
-}
-
-void UnnecessaryValueParamCheck::storeOptions(
-    ClangTidyOptions::OptionMap &Opts) {
-  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
-  Options.store(Opts, "AllowedTypes",
-                utils::options::serializeStringList(AllowedTypes));
-}
-
-void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
-  MutationAnalyzerCache.clear();
-}
-
-void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var,
+void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Param,
                                                const DeclRefExpr &CopyArgument,
-                                               const ASTContext &Context) {
+                                               ASTContext &Context) {
   auto Diag = diag(CopyArgument.getBeginLoc(),
                    "parameter %0 is passed by value and only copied once; "
                    "consider moving it to avoid unnecessary copies")
-              << &Var;
+              << &Param;
   // Do not propose fixes in macros since we cannot place them correctly.
   if (CopyArgument.getBeginLoc().isMacroID())
     return;
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
index 7250bffd20b2f..8bfd814d16357 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
@@ -33,10 +33,16 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck {
   void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void onEndOfTranslationUnit() override;
 
-private:
-  void handleMoveFix(const ParmVarDecl &Var, const DeclRefExpr &CopyArgument,
-                     const ASTContext &Context);
+protected:
+  // Create diagnostics. These are virtual so that derived classes can change
+  // behaviour.
+  virtual void handleMoveFix(const ParmVarDecl &Param,
+                             const DeclRefExpr &CopyArgument,
+                             ASTContext &Context);
+  virtual void handleConstRefFix(const FunctionDecl &Function,
+                                 const ParmVarDecl &Param, ASTContext &Context);
 
+private:
   ExprMutationAnalyzer::Memoized MutationAnalyzerCache;
   utils::IncludeInserter Inserter;
   const std::vector<StringRef> AllowedTypes;
diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
index 95a3a5165e2e8..43b69a24bdb16 100644
--- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
@@ -157,9 +157,12 @@ void NonConstParameterCheck::diagnoseNonConstParameters() {
     if (!Function)
       continue;
     unsigned Index = Par->getFunctionScopeIndex();
-    for (FunctionDecl *FnDecl : Function->redecls())
+    for (FunctionDecl *FnDecl : Function->redecls()) {
+      if (FnDecl->getNumParams() <= Index)
+        continue;
       Fixes.push_back(FixItHint::CreateInsertion(
           FnDecl->getParamDecl(Index)->getBeginLoc(), "const "));
+    }
 
     diag(Par->getLocation(), "pointer parameter '%0' can be pointer to const")
         << Par->getName() << Fixes;
diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
index 0dc35ad587362..48401ba5ea42a 100755
--- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
+++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
@@ -34,29 +34,32 @@
 http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html
 """
 
-from __future__ import print_function
-
 import argparse
+import asyncio
+from dataclasses import dataclass
 import glob
 import json
 import multiprocessing
 import os
-import queue
 import re
 import shutil
 import subprocess
 import sys
 import tempfile
-import threading
+import time
 import traceback
+from types import ModuleType
+from typing import Any, Awaitable, Callable, List, Optional, Tuple, TypeVar
+
 
+yaml: Optional[ModuleType] = None
 try:
     import yaml
 except ImportError:
     yaml = None
 
 
-def strtobool(val):
+def strtobool(val: str) -> bool:
     """Convert a string representation of truth to a bool following LLVM's CLI argument parsing."""
 
     val = val.lower()
@@ -67,11 +70,11 @@ def strtobool(val):
 
     # Return ArgumentTypeError so that argparse does not substitute its own error message
     raise argparse.ArgumentTypeError(
-        "'{}' is invalid value for boolean argument! Try 0 or 1.".format(val)
+        f"'{val}' is invalid value for boolean argument! Try 0 or 1."
     )
 
 
-def find_compilation_database(path):
+def find_compilation_database(path: str) -> str:
     """Adjusts the directory until a compilation database is found."""
     result = os.path.realpath("./")
     while not os.path.isfile(os.path.join(result, path)):
@@ -83,49 +86,43 @@ def find_compilation_database(path):
     return result
 
 
-def make_absolute(f, directory):
-    if os.path.isabs(f):
-        return f
-    return os.path.normpath(os.path.join(directory, f))
-
-
 def get_tidy_invocation(
-    f,
-    clang_tidy_binary,
-    checks,
-    tmpdir,
-    build_path,
-    header_filter,
-    allow_enabling_alpha_checkers,
-    extra_arg,
-    extra_arg_before,
-    quiet,
-    config_file_path,
-    config,
-    line_filter,
-    use_color,
-    plugins,
-    warnings_as_errors,
-    exclude_header_filter,
-    allow_no_checks,
-):
+    f: str,
+    clang_tidy_binary: str,
+    checks: str,
+    tmpdir: Optional[str],
+    build_path: str,
+    header_filter: Optional[str],
+    allow_enabling_alpha_checkers: bool,
+    extra_arg: List[str],
+    extra_arg_before: List[str],
+    quiet: bool,
+    config_file_path: str,
+    config: str,
+    line_filter: Optional[str],
+    use_color: bool,
+    plugins: List[str],
+    warnings_as_errors: Optional[str],
+    exclude_header_filter: Optional[str],
+    allow_no_checks: bool,
+) -> List[str]:
     """Gets a command line for clang-tidy."""
     start = [clang_tidy_binary]
     if allow_enabling_alpha_checkers:
         start.append("-allow-enabling-analyzer-alpha-checkers")
     if exclude_header_filter is not None:
-        start.append("--exclude-header-filter=" + exclude_header_filter)
+        start.append(f"--exclude-header-filter={exclude_header_filter}")
     if header_filter is not None:
-        start.append("-header-filter=" + header_filter)
+        start.append(f"-header-filter={header_filter}")
     if line_filter is not None:
-        start.append("-line-filter=" + line_filter)
+        start.append(f"-line-filter={line_filter}")
     if use_color is not None:
         if use_color:
             start.append("--use-color")
         else:
             start.append("--use-color=false")
     if checks:
-        start.append("-checks=" + checks)
+        start.append(f"-checks={checks}")
     if tmpdir is not None:
         start.append("-export-fixes")
         # Get a temporary file. We immediately close the handle so clang-tidy can
@@ -134,28 +131,29 @@ def get_tidy_invocation(
         os.close(handle)
         start.append(name)
     for arg in extra_arg:
-        start.append("-extra-arg=%s" % arg)
+        start.append(f"-extra-arg={arg}")
     for arg in extra_arg_before:
-        start.append("-extra-arg-before=%s" % arg)
-    start.append("-p=" + build_path)
+        start.append(f"-extra-arg-before={arg}")
+    start.append(f"-p={build_path}")
     if quiet:
         start.append("-quiet")
     if config_file_path:
-        start.append("--config-file=" + config_file_path)
+        start.append(f"--config-file={config_file_path}")
     elif config:
-        start.append("-config=" + config)
+        start.append(f"-config={config}")
     for plugin in plugins:
-        start.append("-load=" + plugin)
+        start.append(f"-load={plugin}")
     if warnings_as_errors:
-        start.append("--warnings-as-errors=" + warnings_as_errors)
+        start.append(f"--warnings-as-errors={warnings_as_errors}")
     if allow_no_checks:
         start.append("--allow-no-checks")
     start.append(f)
     return start
 
 
-def merge_replacement_files(tmpdir, mergefile):
+def merge_replacement_files(tmpdir: str, mergefile: str) -> None:
     """Merge all replacement files in a directory into a single file"""
+    assert yaml
     # The fixes suggested by clang-tidy >= 4.0.0 are given under
     # the top level key 'Diagnostics' in the output yaml files
     mergekey = "Diagnostics"
@@ -179,16 +177,14 @@ def merge_replacement_files(tmpdir, mergefile):
         open(mergefile, "w").close()
 
 
-def find_binary(arg, name, build_path):
+def find_binary(arg: str, name: str, build_path: str) -> str:
     """Get the path for a binary or exit"""
     if arg:
         if shutil.which(arg):
             return arg
         else:
             raise SystemExit(
-                "error: passed binary '{}' was not found or is not executable".format(
-                    arg
-                )
+                f"error: passed binary '{arg}' was not found or is not executable"
             )
 
     built_path = os.path.join(build_path, "bin", name)
@@ -196,66 +192,102 @@ def find_binary(arg, name, build_path):
     if binary:
         return binary
     else:
-        raise SystemExit(
-            "error: failed to find {} in $PATH or at {}".format(name, built_path)
-        )
+        raise SystemExit(f"error: failed to find {name} in $PATH or at {built_path}")
 
 
-def apply_fixes(args, clang_apply_replacements_binary, tmpdir):
+def apply_fixes(
+    args: argparse.Namespace, clang_apply_replacements_binary: str, tmpdir: str
+) -> None:
     """Calls clang-apply-fixes on a given directory."""
     invocation = [clang_apply_replacements_binary]
     invocation.append("-ignore-insert-conflict")
     if args.format:
         invocation.append("-format")
     if args.style:
-        invocation.append("-style=" + args.style)
+        invocation.append(f"-style={args.style}")
     invocation.append(tmpdir)
     subprocess.call(invocation)
 
 
-def run_tidy(args, clang_tidy_binary, tmpdir, build_path, queue, lock, failed_files):
-    """Takes filenames out of queue and runs clang-tidy on them."""
-    while True:
-        name = queue.get()
-        invocation = get_tidy_invocation(
-            name,
-            clang_tidy_binary,
-            args.checks,
-            tmpdir,
-            build_path,
-            args.header_filter,
-            args.allow_enabling_alpha_checkers,
-            args.extra_arg,
-            args.extra_arg_before,
-            args.quiet,
-            args.config_file,
-            args.config,
-            args.line_filter,
-            args.use_color,
-            args.plugins,
-            args.warnings_as_errors,
-            args.exclude_header_filter,
-            args.allow_no_checks,
-        )
+# FIXME Python 3.12: This can be simplified out with run_with_semaphore[T](...).
+T = TypeVar("T")
+
+
+async def run_with_semaphore(
+    semaphore: asyncio.Semaphore,
+    f: Callable[..., Awaitable[T]],
+    *args: Any,
+    **kwargs: Any,
+) -> T:
+    async with semaphore:
+        return await f(*args, **kwargs)
+
+
+@dataclass
+class ClangTidyResult:
+    filename: str
+    invocation: List[str]
+    returncode: int
+    stdout: str
+    stderr: str
+    elapsed: float
+
+
+async def run_tidy(
+    args: argparse.Namespace,
+    name: str,
+    clang_tidy_binary: str,
+    tmpdir: str,
+    build_path: str,
+) -> ClangTidyResult:
+    """
+    Runs clang-tidy on a single file and returns the result.
+    """
+    invocation = get_tidy_invocation(
+        name,
+        clang_tidy_binary,
+        args.checks,
+        tmpdir,
+        build_path,
+        args.header_filter,
+        args.allow_enabling_alpha_checkers,
+        args.extra_arg,
+        args.extra_arg_before,
+        args.quiet,
+        args.config_file,
+        args.config,
+        args.line_filter,
+        args.use_color,
+        args.plugins,
+        args.warnings_as_errors,
+        args.exclude_header_filter,
+        args.allow_no_checks,
+    )
 
-        proc = subprocess.Popen(
-            invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    try:
+        process = await asyncio.create_subprocess_exec(
+            *invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE
         )
-        output, err = proc.communicate()
-        if proc.returncode != 0:
-            if proc.returncode < 0:
-                msg = "%s: terminated by signal %d\n" % (name, -proc.returncode)
-                err += msg.encode("utf-8")
-            failed_files.append(name)
-        with lock:
-            sys.stdout.write(" ".join(invocation) + "\n" + output.decode("utf-8"))
-            if len(err) > 0:
-                sys.stdout.flush()
-                sys.stderr.write(err.decode("utf-8"))
-        queue.task_done()
-
-
-def main():
+        start = time.time()
+        stdout, stderr = await process.communicate()
+        end = time.time()
+    except asyncio.CancelledError:
+        process.terminate()
+        await process.wait()
+        raise
+
+    assert process.returncode is not None
+    return ClangTidyResult(
+        name,
+        invocation,
+        process.returncode,
+        stdout.decode("UTF-8"),
+        stderr.decode("UTF-8"),
+        end - start,
+    )
+
+
+async def main() -> None:
     parser = argparse.ArgumentParser(
         description="Runs clang-tidy over all files "
         "in a compilation database. Requires "
@@ -432,7 +464,7 @@ def main():
         )
 
     combine_fixes = False
-    export_fixes_dir = None
+    export_fixes_dir: Optional[str] = None
     delete_fixes_dir = False
     if args.export_fixes is not None:
         # if a directory is given, create it if it does not exist
@@ -490,10 +522,10 @@ def main():
         sys.exit(1)
 
     # Load the database and extract all files.
-    database = json.load(open(os.path.join(build_path, db_path)))
-    files = set(
-        [make_absolute(entry["file"], entry["directory"]) for entry in database]
-    )
+    with open(os.path.join(build_path, db_path)) as f:
+        database = json.load(f)
+    files = {os.path.abspath(os.path.join(e["directory"], e["file"])) for e in database}
+    number_files_in_database = len(files)
 
     # Filter source files from compilation database.
     if args.source_filter:
@@ -514,70 +546,81 @@ def main():
 
     # Build up a big regexy filter from all command line arguments.
     file_name_re = re.compile("|".join(args.files))
+    files = {f for f in files if file_name_re.search(f)}
+
+    print(
+        "Running clang-tidy for",
+        len(files),
+        "files out of",
+        number_files_in_database,
+        "in compilation database ...",
+    )
 
-    return_code = 0
-    try:
-        # Spin up a bunch of tidy-launching threads.
-        task_queue = queue.Queue(max_task)
-        # List of files with a non-zero return code.
-        failed_files = []
-        lock = threading.Lock()
-        for _ in range(max_task):
-            t = threading.Thread(
-                target=run_tidy,
-                args=(
-                    args,
-                    clang_tidy_binary,
-                    export_fixes_dir,
-                    build_path,
-                    task_queue,
-                    lock,
-                    failed_files,
-                ),
+    returncode = 0
+    semaphore = asyncio.Semaphore(max_task)
+    tasks = [
+        asyncio.create_task(
+            run_with_semaphore(
+                semaphore,
+                run_tidy,
+                args,
+                f,
+                clang_tidy_binary,
+                export_fixes_dir,
+                build_path,
             )
-            t.daemon = True
-            t.start()
-
-        # Fill the queue with files.
-        for name in files:
-            if file_name_re.search(name):
-                task_queue.put(name)
-
-        # Wait for all threads to be done.
-        task_queue.join()
-        if len(failed_files):
-            return_code = 1
-
-    except KeyboardInterrupt:
-        # This is a sad hack. Unfortunately subprocess goes
-        # bonkers with ctrl-c and we start forking merrily.
+        )
+        for f in files
+    ]
+
+    try:
+        for i, coro in enumerate(asyncio.as_completed(tasks)):
+            result = await coro
+            if result.returncode != 0:
+                returncode = 1
+                if result.returncode < 0:
+                    result.stderr += f"{result.filename}: terminated by signal {-result.returncode}\n"
+            progress = f"[{i + 1: >{len(f'{len(files)}')}}/{len(files)}]"
+            runtime = f"[{result.elapsed:.1f}s]"
+            print(f"{progress}{runtime} {' '.join(result.invocation)}")
+            if result.stdout:
+                print(result.stdout, end=("" if result.stderr else "\n"))
+            if result.stderr:
+                print(result.stderr)
+    except asyncio.CancelledError:
         print("\nCtrl-C detected, goodbye.")
+        for task in tasks:
+            task.cancel()
         if delete_fixes_dir:
+            assert export_fixes_dir
             shutil.rmtree(export_fixes_dir)
-        os.kill(0, 9)
+        return
 
     if combine_fixes:
-        print("Writing fixes to " + args.export_fixes + " ...")
+        print(f"Writing fixes to {args.export_fixes} ...")
         try:
+            assert export_fixes_dir
             merge_replacement_files(export_fixes_dir, args.export_fixes)
         except:
             print("Error exporting fixes.\n", file=sys.stderr)
             traceback.print_exc()
-            return_code = 1
+            returncode = 1
 
     if args.fix:
         print("Applying fixes ...")
         try:
+            assert export_fixes_dir
             apply_fixes(args, clang_apply_replacements_binary, export_fixes_dir)
         except:
             print("Error applying fixes.\n", file=sys.stderr)
             traceback.print_exc()
-            return_code = 1
+            returncode = 1
 
     if delete_fixes_dir:
+        assert export_fixes_dir
         shutil.rmtree(export_fixes_dir)
-    sys.exit(return_code)
+    sys.exit(returncode)
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
index fd5dadc9b01db..0cdc7d08abc99 100644
--- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
@@ -96,7 +96,7 @@ bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt,
   if (FirstStmt == SecondStmt)
     return true;
 
-  if (FirstStmt->getStmtClass() != FirstStmt->getStmtClass())
+  if (FirstStmt->getStmtClass() != SecondStmt->getStmtClass())
     return false;
 
   if (isa<Expr>(FirstStmt) && isa<Expr>(SecondStmt)) {
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
index 6ae46e2b1262a..9bfb7e2677533 100644
--- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
@@ -141,7 +141,10 @@ bool isStandardPointerConvertible(QualType From, QualType To) {
     if (RD->isCompleteDefinition() &&
         isBaseOf(From->getPointeeType().getTypePtr(),
                  To->getPointeeType().getTypePtr())) {
-      return true;
+      // If B is an inaccessible or ambiguous base class of D, a program
+      // that necessitates this conversion is ill-formed
+      return isUnambiguousPublicBaseClass(From->getPointeeType().getTypePtr(),
+                                          To->getPointeeType().getTypePtr());
     }
   }
 
@@ -375,10 +378,7 @@ bool ExceptionAnalyzer::ExceptionInfo::filterByCatch(
         isPointerOrPointerToMember(ExceptionCanTy->getTypePtr())) {
       // A standard pointer conversion not involving conversions to pointers to
       // private or protected or ambiguous classes ...
-      if (isStandardPointerConvertible(ExceptionCanTy, HandlerCanTy) &&
-          isUnambiguousPublicBaseClass(
-              ExceptionCanTy->getTypePtr()->getPointeeType().getTypePtr(),
-              HandlerCanTy->getTypePtr()->getPointeeType().getTypePtr())) {
+      if (isStandardPointerConvertible(ExceptionCanTy, HandlerCanTy)) {
         TypesToDelete.push_back(ExceptionTy);
       }
       // A function pointer conversion ...
diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
index e2daa5010e2ae..aba4d17ccd035 100644
--- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
@@ -39,12 +39,6 @@ static constexpr const char ArgName[] = "ArgName";
 
 namespace clang::tidy::utils {
 
-static bool operator==(const UseRangesCheck::Indexes &L,
-                       const UseRangesCheck::Indexes &R) {
-  return std::tie(L.BeginArg, L.EndArg, L.ReplaceArg) ==
-         std::tie(R.BeginArg, R.EndArg, R.ReplaceArg);
-}
-
 static std::string getFullPrefix(ArrayRef<UseRangesCheck::Indexes> Signature) {
   std::string Output;
   llvm::raw_string_ostream OS(Output);
@@ -54,15 +48,6 @@ static std::string getFullPrefix(ArrayRef<UseRangesCheck::Indexes> Signature) {
   return Output;
 }
 
-static llvm::hash_code hash_value(const UseRangesCheck::Indexes &Indexes) {
-  return llvm::hash_combine(Indexes.BeginArg, Indexes.EndArg,
-                            Indexes.ReplaceArg);
-}
-
-static llvm::hash_code hash_value(const UseRangesCheck::Signature &Sig) {
-  return llvm::hash_combine_range(Sig.begin(), Sig.end());
-}
-
 namespace {
 
 AST_MATCHER(Expr, hasSideEffects) {
@@ -123,24 +108,26 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes,
 }
 
 void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
-  Replaces = getReplacerMap();
+  auto Replaces = getReplacerMap();
   ReverseDescriptor = getReverseDescriptor();
   auto BeginEndNames = getFreeBeginEndMethods();
   llvm::SmallVector<StringRef, 4> BeginNames{
       llvm::make_first_range(BeginEndNames)};
   llvm::SmallVector<StringRef, 4> EndNames{
       llvm::make_second_range(BeginEndNames)};
-  llvm::DenseSet<ArrayRef<Signature>> Seen;
+  Replacers.clear();
+  llvm::DenseSet<Replacer *> SeenRepl;
   for (auto I = Replaces.begin(), E = Replaces.end(); I != E; ++I) {
-    const ArrayRef<Signature> &Signatures =
-        I->getValue()->getReplacementSignatures();
-    if (!Seen.insert(Signatures).second)
+    auto Replacer = I->getValue();
+    if (!SeenRepl.insert(Replacer.get()).second)
       continue;
-    assert(!Signatures.empty() &&
-           llvm::all_of(Signatures, [](auto Index) { return !Index.empty(); }));
+    Replacers.push_back(Replacer);
+    assert(!Replacer->getReplacementSignatures().empty() &&
+           llvm::all_of(Replacer->getReplacementSignatures(),
+                        [](auto Index) { return !Index.empty(); }));
     std::vector<StringRef> Names(1, I->getKey());
     for (auto J = std::next(I); J != E; ++J)
-      if (J->getValue()->getReplacementSignatures() == Signatures)
+      if (J->getValue() == Replacer)
         Names.push_back(J->getKey());
 
     std::vector<ast_matchers::internal::DynTypedMatcher> TotalMatchers;
@@ -148,7 +135,7 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
     // signatures in order of length(longest to shortest). This way any
     // signature that is a subset of another signature will be matched after the
     // other.
-    SmallVector<Signature> SigVec(Signatures);
+    SmallVector<Signature> SigVec(Replacer->getReplacementSignatures());
     llvm::sort(SigVec, [](auto &L, auto &R) { return R.size() < L.size(); });
     for (const auto &Signature : SigVec) {
       std::vector<ast_matchers::internal::DynTypedMatcher> Matchers;
@@ -163,7 +150,8 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
     }
     Finder->addMatcher(
         callExpr(
-            callee(functionDecl(hasAnyName(std::move(Names))).bind(FuncDecl)),
+            callee(functionDecl(hasAnyName(std::move(Names)))
+                       .bind((FuncDecl + Twine(Replacers.size() - 1).str()))),
             ast_matchers::internal::DynTypedMatcher::constructVariadic(
                 ast_matchers::internal::DynTypedMatcher::VO_AnyOf,
                 ASTNodeKind::getFromNodeKind<CallExpr>(),
@@ -205,21 +193,33 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call,
 }
 
 void UseRangesCheck::check(const MatchFinder::MatchResult &Result) {
-  const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>(FuncDecl);
-  std::string Qualified = "::" + Function->getQualifiedNameAsString();
-  auto Iter = Replaces.find(Qualified);
-  assert(Iter != Replaces.end());
+  Replacer *Replacer = nullptr;
+  const FunctionDecl *Function = nullptr;
+  for (auto [Node, Value] : Result.Nodes.getMap()) {
+    StringRef NodeStr(Node);
+    if (!NodeStr.consume_front(FuncDecl))
+      continue;
+    Function = Value.get<FunctionDecl>();
+    size_t Index;
+    if (NodeStr.getAsInteger(10, Index)) {
+      llvm_unreachable("Unable to extract replacer index");
+    }
+    assert(Index < Replacers.size());
+    Replacer = Replacers[Index].get();
+    break;
+  }
+  assert(Replacer && Function);
   SmallString<64> Buffer;
-  for (const Signature &Sig : Iter->getValue()->getReplacementSignatures()) {
+  for (const Signature &Sig : Replacer->getReplacementSignatures()) {
     Buffer.assign({BoundCall, getFullPrefix(Sig)});
     const auto *Call = Result.Nodes.getNodeAs<CallExpr>(Buffer);
     if (!Call)
       continue;
     auto Diag = createDiag(*Call);
-    if (auto ReplaceName = Iter->getValue()->getReplaceName(*Function))
+    if (auto ReplaceName = Replacer->getReplaceName(*Function))
       Diag << FixItHint::CreateReplacement(Call->getCallee()->getSourceRange(),
                                            *ReplaceName);
-    if (auto Include = Iter->getValue()->getHeaderInclusion(*Function))
+    if (auto Include = Replacer->getHeaderInclusion(*Function))
       Diag << Inserter.createIncludeInsertion(
           Result.SourceManager->getFileID(Call->getBeginLoc()), *Include);
     llvm::SmallVector<unsigned, 3> ToRemove;
diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
index 927e9694b0ec7..3a454bcf0cf07 100644
--- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
+++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
@@ -85,7 +85,7 @@ class UseRangesCheck : public ClangTidyCheck {
   std::optional<TraversalKind> getCheckTraversalKind() const override;
 
 private:
-  ReplacerMap Replaces;
+  std::vector<llvm::IntrusiveRefCntPtr<Replacer>> Replacers;
   std::optional<ReverseIteratorDescriptor> ReverseDescriptor;
   IncludeInserter Inserter;
 };
diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp
index dc5b7ec95db5f..e34706172f0bf 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.cpp
+++ b/clang-tools-extra/clangd/IncludeCleaner.cpp
@@ -401,6 +401,26 @@ computeIncludeCleanerFindings(ParsedAST &AST, bool AnalyzeAngledIncludes) {
             Ref.RT != include_cleaner::RefType::Explicit)
           return;
 
+        // Check if we have any headers with the same spelling, in edge cases
+        // like `#include_next "foo.h"`, the user can't ever include the
+        // physical foo.h, but can have a spelling that refers to it.
+        // We postpone this check because spelling a header for every usage is
+        // expensive.
+        std::string Spelling = include_cleaner::spellHeader(
+            {Providers.front(), AST.getPreprocessor().getHeaderSearchInfo(),
+             MainFile});
+        for (auto *Inc :
+             ConvertedIncludes.match(include_cleaner::Header{Spelling})) {
+          Satisfied = true;
+          auto HeaderID =
+              AST.getIncludeStructure().getID(&Inc->Resolved->getFileEntry());
+          assert(HeaderID.has_value() &&
+                 "ConvertedIncludes only contains resolved includes.");
+          Used.insert(*HeaderID);
+        }
+        if (Satisfied)
+          return;
+
         // We actually always want to map usages to their spellings, but
         // spelling locations can point into preamble section. Using these
         // offsets could lead into crashes in presence of stale preambles. Hence
diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
index 7027232460354..0ee748c1ed2d0 100644
--- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
@@ -644,6 +644,28 @@ TEST(IncludeCleaner, ResourceDirIsIgnored) {
   EXPECT_THAT(Findings.MissingIncludes, IsEmpty());
 }
 
+TEST(IncludeCleaner, DifferentHeaderSameSpelling) {
+  // `foo` is declared in foo_inner/foo.h, but there's no way to spell it
+  // directly. Make sure we don't generate unusued/missing include findings in
+  // such cases.
+  auto TU = TestTU::withCode(R"cpp(
+    #include <foo.h>
+    void baz() {
+      foo();
+    }
+  )cpp");
+  TU.AdditionalFiles["foo/foo.h"] = guard("#include_next <foo.h>");
+  TU.AdditionalFiles["foo_inner/foo.h"] = guard(R"cpp(
+    void foo();
+  )cpp");
+  TU.ExtraArgs.push_back("-Ifoo");
+  TU.ExtraArgs.push_back("-Ifoo_inner");
+
+  auto AST = TU.build();
+  auto Findings = computeIncludeCleanerFindings(AST);
+  EXPECT_THAT(Findings.UnusedIncludes, IsEmpty());
+  EXPECT_THAT(Findings.MissingIncludes, IsEmpty());
+}
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 697b514ae1572..642ad39cc0c1c 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -48,10 +48,6 @@ Major New Features
 Improvements to clangd
 ----------------------
 
-- Introduced exmperimental support for C++20 Modules. The experimental support can
-  be enabled by `-experimental-modules-support` option. It is in an early development
-  stage and may not perform efficiently in real-world scenarios.
-
 Inlay hints
 ^^^^^^^^^^^
 
@@ -73,9 +69,6 @@ Code completion
 Code actions
 ^^^^^^^^^^^^
 
-- The tweak for turning unscoped into scoped enums now removes redundant prefixes
-  from the enum values.
-
 Signature help
 ^^^^^^^^^^^^^^
 
@@ -88,23 +81,12 @@ Objective-C
 Miscellaneous
 ^^^^^^^^^^^^^
 
-- Added a boolean option `AnalyzeAngledIncludes` to `Includes` config section,
-  which allows to enable unused includes detection for all angled ("system") headers.
-  At this moment umbrella headers are not supported, so enabling this option
-  may result in false-positives.
-
 Improvements to clang-doc
 -------------------------
 
 Improvements to clang-query
 ---------------------------
 
-- Added the `file` command to dynamically load a list of commands and matchers
-  from an external file, allowing the cost of reading the compilation database
-  and building the AST to be imposed just once for faster prototyping.
-
-- Removed support for ``enable output srcloc``. Fixes #GH82591
-
 Improvements to clang-rename
 ----------------------------
 
@@ -113,423 +95,21 @@ The improvements are...
 Improvements to clang-tidy
 --------------------------
 
-- Improved :program:`run-clang-tidy.py` script. Added argument `-source-filter`
-  to filter source files from the compilation database, via a RegEx. In a
-  similar fashion to what `-header-filter` does for header files.
-
-- Improved :program:`check_clang_tidy.py` script. Added argument `-export-fixes`
-  to aid in clang-tidy and test development.
-
-- Fixed bug where big values for unsigned check options overflowed into negative values
-  when being printed with `--dump-config`.
-
-- Fixed `--verify-config` option not properly parsing checks when using the
-  literal operator in the `.clang-tidy` config.
-
-- Added argument `--exclude-header-filter` and config option `ExcludeHeaderFilterRegex`
-  to exclude headers from analysis via a RegEx.
-
-- Added argument `--allow-no-checks` to suppress "no checks enabled" error
-  when disabling all of the checks by `--checks='-*'`.
-
 New checks
 ^^^^^^^^^^
 
-- New :doc:`boost-use-ranges
-  <clang-tidy/checks/boost/use-ranges>` check.
-
-  Detects calls to standard library iterator algorithms that could be replaced
-  with a Boost ranges version instead.
-
-- New :doc:`bugprone-crtp-constructor-accessibility
-  <clang-tidy/checks/bugprone/crtp-constructor-accessibility>` check.
-
-  Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP
-  can be constructed outside itself and the derived class.
-
-- New :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
-  <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>` check.
-
-  Finds pointer arithmetic performed on classes that contain a virtual function.
-
-- New :doc:`bugprone-return-const-ref-from-parameter
-  <clang-tidy/checks/bugprone/return-const-ref-from-parameter>` check.
-
-  Detects return statements that return a constant reference parameter as constant
-  reference. This may cause use-after-free errors if the caller uses xvalues as
-  arguments.
-
-- New :doc:`bugprone-suspicious-stringview-data-usage
-  <clang-tidy/checks/bugprone/suspicious-stringview-data-usage>` check.
-
-  Identifies suspicious usages of ``std::string_view::data()`` that could lead
-  to reading out-of-bounds data due to inadequate or incorrect string null
-  termination.
-
-- New :doc:`misc-use-internal-linkage
-  <clang-tidy/checks/misc/use-internal-linkage>` check.
-
-  Detects variables and functions that can be marked as static or moved into
-  an anonymous namespace to enforce internal linkage.
-
-- New :doc:`modernize-min-max-use-initializer-list
-  <clang-tidy/checks/modernize/min-max-use-initializer-list>` check.
-
-  Replaces nested ``std::min`` and ``std::max`` calls with an initializer list
-  where applicable.
-
-- New :doc:`modernize-use-designated-initializers
-  <clang-tidy/checks/modernize/use-designated-initializers>` check.
-
-  Finds initializer lists for aggregate types that could be
-  written as designated initializers instead.
-
-- New :doc:`modernize-use-ranges
-  <clang-tidy/checks/modernize/use-ranges>` check.
-
-  Detects calls to standard library iterator algorithms that could be replaced
-  with a ranges version instead.
-
-- New :doc:`modernize-use-std-format
-  <clang-tidy/checks/modernize/use-std-format>` check.
-
-  Converts calls to ``absl::StrFormat``, or other functions via
-  configuration options, to C++20's ``std::format``, or another function
-  via a configuration option, modifying the format string appropriately and
-  removing now-unnecessary calls to ``std::string::c_str()`` and
-  ``std::string::data()``.
-
-- New :doc:`readability-enum-initial-value
-  <clang-tidy/checks/readability/enum-initial-value>` check.
-
-  Enforces consistent style for enumerators' initialization, covering three
-  styles: none, first only, or all initialized explicitly.
-
-- New :doc:`readability-math-missing-parentheses
-  <clang-tidy/checks/readability/math-missing-parentheses>` check.
-
-  Check for missing parentheses in mathematical expressions that involve
-  operators of different priorities.
-
-- New :doc:`readability-use-std-min-max
-  <clang-tidy/checks/readability/use-std-min-max>` check.
-
-  Replaces certain conditional statements with equivalent calls to
-  ``std::min`` or ``std::max``.
-
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
-- New alias :doc:`cert-ctr56-cpp <clang-tidy/checks/cert/ctr56-cpp>` to
-  :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
-  <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>`
-  was added.
-
-- New alias :doc:`cert-int09-c <clang-tidy/checks/cert/int09-c>` to
-  :doc:`readability-enum-initial-value <clang-tidy/checks/readability/enum-initial-value>`
-  was added.
-
 Changes in existing checks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Improved :doc:`bugprone-assert-side-effect
-  <clang-tidy/checks/bugprone/assert-side-effect>` check by detecting side
-  effect from calling a method with non-const reference parameters.
-
-- Improved :doc:`bugprone-assignment-in-if-condition
-  <clang-tidy/checks/bugprone/assignment-in-if-condition>` check by ignoring
-  assignments in the C++20 ``requires`` clause.
-
-- Improved :doc:`bugprone-casting-through-void
-  <clang-tidy/checks/bugprone/casting-through-void>` check by ignoring casts
-  where source is already a ``void``` pointer, making middle ``void`` pointer
-  casts bug-free.
-
-- Improved :doc:`bugprone-forwarding-reference-overload
-  <clang-tidy/checks/bugprone/forwarding-reference-overload>`
-  check to ignore deleted constructors which won't hide other overloads.
-
-- Improved :doc:`bugprone-implicit-widening-of-multiplication-result
-  <clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result>` check
-  by adding an option to ignore constant expressions of signed integer types
-  that fit in the source expression type.
-
-- Improved :doc:`bugprone-inc-dec-in-conditions
-  <clang-tidy/checks/bugprone/inc-dec-in-conditions>` check to ignore code
-  within unevaluated contexts, such as ``decltype``.
-
-- Improved :doc:`bugprone-lambda-function-name<clang-tidy/checks/bugprone/lambda-function-name>`
-  check by ignoring ``__func__`` macro in lambda captures, initializers of
-  default parameters and nested function declarations.
-
-- Improved :doc:`bugprone-multi-level-implicit-pointer-conversion
-  <clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion>` check
-  by ignoring implicit pointer conversions that are part of a cast expression.
-
-- Improved :doc:`bugprone-non-zero-enum-to-bool-conversion
-  <clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion>` check by
-  eliminating false positives resulting from direct usage of bitwise operators
-  within parentheses.
-
-- Improved :doc:`bugprone-optional-value-conversion
-  <clang-tidy/checks/bugprone/optional-value-conversion>` check by eliminating
-  false positives resulting from use of optionals in unevaluated context.
-
-- Improved :doc:`bugprone-sizeof-expression
-  <clang-tidy/checks/bugprone/sizeof-expression>` check by clarifying the
-  diagnostics, eliminating some false positives and adding a new
-  (off-by-default) option `WarnOnSizeOfPointer` that reports all
-  ``sizeof(pointer)`` expressions (except for a few that are idiomatic).
-
-- Improved :doc:`bugprone-suspicious-include
-  <clang-tidy/checks/bugprone/suspicious-include>` check by replacing the local
-  options `HeaderFileExtensions` and `ImplementationFileExtensions` by the
-  global options of the same name.
-
-- Improved :doc:`bugprone-too-small-loop-variable
-  <clang-tidy/checks/bugprone/too-small-loop-variable>` check by incorporating
-  better support for ``const`` loop boundaries.
-
-- Improved :doc:`bugprone-unused-local-non-trivial-variable
-  <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check by
-  ignoring local variable with ``[maybe_unused]`` attribute.
-
-- Improved :doc:`bugprone-unused-return-value
-  <clang-tidy/checks/bugprone/unused-return-value>` check by updating the
-  parameter `CheckedFunctions` to support regexp, avoiding false positive for
-  function with the same prefix as the default argument, e.g. ``std::unique_ptr``
-  and ``std::unique``, avoiding false positive for assignment operator overloading.
-
-- Improved :doc:`bugprone-use-after-move
-  <clang-tidy/checks/bugprone/use-after-move>` check to also handle
-  calls to ``std::forward``. Fixed sequencing of designated initializers. Fixed
-  sequencing of callees: In C++17 and later, the callee of a function is guaranteed
-  to be sequenced before the arguments, so don't warn if the use happens in the
-  callee and the move happens in one of the arguments.
-
-- Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables
-  <clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables>` check
-  with a new option `AllowInternalLinkage` to disable the warning for variables
-  with internal linkage.
-
-- Improved :doc:`cppcoreguidelines-macro-usage
-  <clang-tidy/checks/cppcoreguidelines/macro-usage>` check by ignoring macro with
-  hash preprocessing token.
-
-- Improved :doc:`cppcoreguidelines-missing-std-forward
-  <clang-tidy/checks/cppcoreguidelines/missing-std-forward>` check by no longer
-  giving false positives for deleted functions, by fixing false negatives when only
-  a few parameters are forwarded and by ignoring parameters without a name (unused
-  arguments).
-
-- Improved :doc:`cppcoreguidelines-owning-memory
-  <clang-tidy/checks/cppcoreguidelines/owning-memory>` check to properly handle
-  return type in lambdas and in nested functions.
-
-- Improved :doc:`cppcoreguidelines-prefer-member-initializer
-  <clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>` check
-  by removing enforcement of rule `C.48
-  <https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#c48-prefer-in-class-initializers-to-member-initializers-in-constructors-for-constant-initializers>`_,
-  which was deprecated since :program:`clang-tidy` 17. This rule is now covered
-  by :doc:`cppcoreguidelines-use-default-member-init
-  <clang-tidy/checks/cppcoreguidelines/use-default-member-init>`. Fixed
-  incorrect hints when using list-initialization.
-
-- Improved :doc:`cppcoreguidelines-special-member-functions
-  <clang-tidy/checks/cppcoreguidelines/special-member-functions>` check with a
-  new option `AllowImplicitlyDeletedCopyOrMove`, which removes the requirement
-  for explicit copy or move special member functions when they are already
-  implicitly deleted.
-
-- Improved :doc:`google-build-namespaces
-  <clang-tidy/checks/google/build-namespaces>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-
-- Improved :doc:`google-explicit-constructor
-  <clang-tidy/checks/google/explicit-constructor>` check to better handle
-  C++20 `explicit(bool)`.
-
-- Improved :doc:`google-global-names-in-headers
-  <clang-tidy/checks/google/global-names-in-headers>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-
-- Improved :doc:`google-runtime-int <clang-tidy/checks/google/runtime-int>`
-  check performance through optimizations.
-
-- Improved :doc:`hicpp-signed-bitwise <clang-tidy/checks/hicpp/signed-bitwise>`
-  check by ignoring false positives involving positive integer literals behind
-  implicit casts when `IgnorePositiveIntegerLiterals` is enabled.
-
-- Improved :doc:`hicpp-ignored-remove-result <clang-tidy/checks/hicpp/ignored-remove-result>`
-  check by ignoring other functions with same prefixes as the target specific
-  functions.
-
-- Improved :doc:`linuxkernel-must-check-errs
-  <clang-tidy/checks/linuxkernel/must-check-errs>` check documentation to
-  consistently use the check's proper name.
-
-- Improved :doc:`llvm-header-guard
-  <clang-tidy/checks/llvm/header-guard>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-
-- Improved :doc:`misc-const-correctness
-  <clang-tidy/checks/misc/const-correctness>` check by avoiding infinite recursion
-  for recursive functions with forwarding reference parameters and reference
-  variables which refer to themselves.
-
-- Improved :doc:`misc-definitions-in-headers
-  <clang-tidy/checks/misc/definitions-in-headers>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-  Additionally, the option `UseHeaderFileExtensions` is removed, so that the
-  check uses the `HeaderFileExtensions` option unconditionally.
-
-- Improved :doc:`misc-header-include-cycle
-  <clang-tidy/checks/misc/header-include-cycle>` check by avoiding crash for self
-  include cycles.
-
-- Improved :doc:`misc-unused-using-decls
-  <clang-tidy/checks/misc/unused-using-decls>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-
-- Improved :doc:`misc-use-anonymous-namespace
-  <clang-tidy/checks/misc/use-anonymous-namespace>` check by replacing the local
-  option `HeaderFileExtensions` by the global option of the same name.
-
-- Improved :doc:`modernize-avoid-c-arrays
-  <clang-tidy/checks/modernize/avoid-c-arrays>` check by introducing the new
-  `AllowStringArrays` option, enabling the exclusion of array types with deduced
-  length initialized from string literals.
-
-- Improved :doc:`modernize-loop-convert
-  <clang-tidy/checks/modernize/loop-convert>` check by ensuring that fix-its
-  don't remove parentheses used in ``sizeof`` calls when they have array index
-  accesses as arguments.
-
-- Improved :doc:`modernize-use-constraints
-  <clang-tidy/checks/modernize/use-constraints>` check by fixing a crash that
-  occurred in some scenarios and excluding system headers from analysis.
-
-- Improved :doc:`modernize-use-nullptr
-  <clang-tidy/checks/modernize/use-nullptr>` check to include support for C23,
-  which also has introduced the ``nullptr`` keyword.
-
-- Improved :doc:`modernize-use-override
-  <clang-tidy/checks/modernize/use-override>` check to also remove any trailing
-  whitespace when deleting the ``virtual`` keyword.
-
-- Improved :doc:`modernize-use-starts-ends-with
-  <clang-tidy/checks/modernize/use-starts-ends-with>` check to also handle
-  calls to ``compare`` method.
-
-- Improved :doc:`modernize-use-std-print
-  <clang-tidy/checks/modernize/use-std-print>` check to not crash if the
-  format string parameter of the function to be replaced is not of the
-  expected type.
-
-- Improved :doc:`modernize-use-using <clang-tidy/checks/modernize/use-using>`
-  check by adding support for detection of typedefs declared on function level.
-
-- Improved :doc:`performance-inefficient-vector-operation
-  <clang-tidy/checks/performance/inefficient-vector-operation>` fixing false
-  negatives caused by different variable definition type and variable initial
-  value type in loop initialization expression.
-
-- Improved :doc:`performance-move-const-arg
-  <clang-tidy/checks/performance/move-const-arg>` check by ignoring
-  ``std::move()`` calls when their target is used as an rvalue.
-
-- Improved :doc:`performance-unnecessary-copy-initialization
-  <clang-tidy/checks/performance/unnecessary-copy-initialization>` check by
-  detecting more cases of constant access. In particular, pointers can be
-  analyzed, so the check now handles the common patterns
-  `const auto e = (*vector_ptr)[i]` and `const auto e = vector_ptr->at(i);`.
-  Calls to mutable function where there exists a `const` overload are also
-  handled. Fix crash in the case of a non-member operator call.
-
-- Improved :doc:`performance-unnecessary-value-param
-  <clang-tidy/checks/performance/unnecessary-value-param>` check
-  detecting more cases for template functions including lambdas with ``auto``.
-  E.g., ``std::sort(a.begin(), a.end(), [](auto x, auto y) { return a > b; });``
-  will be detected for expensive to copy types.
-
-- Improved :doc:`readability-avoid-return-with-void-value
-  <clang-tidy/checks/readability/avoid-return-with-void-value>` check by adding
-  fix-its.
-
-- Improved :doc:`readability-const-return-type
-  <clang-tidy/checks/readability/const-return-type>` check to eliminate false
-  positives when returning types with const not at the top level.
-
-- Improved :doc:`readability-container-size-empty
-  <clang-tidy/checks/readability/container-size-empty>` check to prevent false
-  positives when utilizing ``size`` or ``length`` methods that accept parameter.
-  Fixed crash when facing template user defined literals.
-
-- Improved :doc:`readability-duplicate-include
-  <clang-tidy/checks/readability/duplicate-include>` check by excluding include
-  directives that form the filename using macro.
-
-- Improved :doc:`readability-else-after-return
-  <clang-tidy/checks/readability/else-after-return>` check to ignore
-  `if consteval` statements, for which the `else` branch must not be removed.
-
-- Improved :doc:`readability-identifier-naming
-  <clang-tidy/checks/readability/identifier-naming>` check in `GetConfigPerFile`
-  mode by resolving symbolic links to header files. Fixed handling of Hungarian
-  Prefix when configured to `LowerCase`. Added support for renaming designated
-  initializers. Added support for renaming macro arguments. Fixed renaming
-  conflicts arising from out-of-line member function template definitions.
-
-- Improved :doc:`readability-implicit-bool-conversion
-  <clang-tidy/checks/readability/implicit-bool-conversion>` check to provide
-  valid fix suggestions for ``static_cast`` without a preceding space and
-  fixed problem with duplicate parentheses in double implicit casts. Corrected
-  the fix suggestions for C23 and later by using C-style casts instead of
-  ``static_cast``. Fixed false positives in C++20 spaceship operator by ignoring
-  casts in implicit and defaulted functions.
-
-- Improved :doc:`readability-redundant-inline-specifier
-  <clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
-  emit warnings for static data member with an in-class initializer.
-
-- Improved :doc:`readability-redundant-member-init
-  <clang-tidy/checks/readability/redundant-member-init>` check to avoid
-  false-positives when type of the member does not match the type of the
-  initializer.
-
-- Improved :doc:`readability-static-accessed-through-instance
-  <clang-tidy/checks/readability/static-accessed-through-instance>` check to
-  support calls to overloaded operators as base expression and provide fixes to
-  expressions with side-effects.
-
-- Improved :doc:`readability-simplify-boolean-expr
-  <clang-tidy/checks/readability/simplify-boolean-expr>` check to avoid to emit
-  warning for macro when IgnoreMacro option is enabled and improve messages
-  when auto-fix does not work.
-
-- Improved :doc:`readability-static-definition-in-anonymous-namespace
-  <clang-tidy/checks/readability/static-definition-in-anonymous-namespace>`
-  check by resolving fix-it overlaps in template code by disregarding implicit
-  instances.
-
-- Improved :doc:`readability-string-compare
-  <clang-tidy/checks/readability/string-compare>` check to also detect
-  usages of ``std::string_view::compare``. Added a `StringLikeClasses` option
-  to detect usages of ``compare`` method in custom string-like classes.
-
 Removed checks
 ^^^^^^^^^^^^^^
 
-- Removed `cert-dcl21-cpp`, which was deprecated since :program:`clang-tidy` 17,
-  since the rule DCL21-CPP has been removed from the CERT guidelines.
-
 Miscellaneous
 ^^^^^^^^^^^^^
 
-- Fixed incorrect formatting in :program:`clang-apply-replacements` when no
-  `--format` option is specified. Now :program:`clang-apply-replacements`
-  applies formatting only with the option.
-
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
index 9205ba98729c4..10ae0fe3243a0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
@@ -16,23 +16,26 @@ Options
    This parameter supports regexp. The function is checked if the name
    and scope matches, with any arguments.
    By default the following functions are checked:
-   ``::std::async$, ::std::launder$, ::std::remove$, ::std::remove_if$, ::std::unique$,
-   ::std::unique_ptr::release$, ::std::basic_string::empty$, ::std::vector::empty$,
-   ::std::back_inserter$, ::std::distance$, ::std::find$, ::std::find_if$, ::std::inserter$,
-   ::std::lower_bound$, ::std::make_pair$, ::std::map::count$, ::std::map::find$,
-   ::std::map::lower_bound$, ::std::multimap::equal_range$, ::std::multimap::upper_bound$,
-   ::std::set::count$, ::std::set::find$, ::std::setfill$, ::std::setprecision$,
-   ::std::setw$, ::std::upper_bound$, ::std::vector::at$, ::bsearch$, ::ferror$,
-   ::feof$, ::isalnum$, ::isalpha$, ::isblank$, ::iscntrl$, ::isdigit$, ::isgraph$,
-   ::islower$, ::isprint$, ::ispunct$, ::isspace$, ::isupper$, ::iswalnum$, ::iswprint$,
-   ::iswspace$, ::isxdigit$, ::memchr$, ::memcmp$, ::strcmp$, ::strcoll$, ::strncmp$,
-   ::strpbrk$, ::strrchr$, ::strspn$, ::strstr$, ::wcscmp$, ::access$, ::bind$,
-   ::connect$, ::difftime$, ::dlsym$, ::fnmatch$, ::getaddrinfo$, ::getopt$,
-   ::htonl$, ::htons$, ::iconv_open$, ::inet_addr$, isascii$, isatty$, ::mmap$,
-   ::newlocale$, ::openat$, ::pathconf$, ::pthread_equal$, ::pthread_getspecific$,
-   ::pthread_mutex_trylock$, ::readdir$, ::readlink$, ::recvmsg$, ::regexec$, ::scandir$,
-   ::semget$, ::setjmp$, ::shm_open$, ::shmget$, ::sigismember$, ::strcasecmp$, ::strsignal$,
-   ::ttyname$``
+   ``^::std::async$, ^::std::launder$, ^::std::remove$, ^::std::remove_if$,
+   ^::std::unique$, ^::std::unique_ptr::release$, ^::std::basic_string::empty$,
+   ^::std::vector::empty$, ^::std::back_inserter$, ^::std::distance$,
+   ^::std::find$, ^::std::find_if$, ^::std::inserter$, ^::std::lower_bound$,
+   ^::std::make_pair$, ^::std::map::count$, ^::std::map::find$,
+   ^::std::map::lower_bound$, ^::std::multimap::equal_range$,
+   ^::std::multimap::upper_bound$, ^::std::set::count$, ^::std::set::find$,
+   ^::std::setfill$, ^::std::setprecision$, ^::std::setw$, ^::std::upper_bound$,
+   ^::std::vector::at$, ^::bsearch$, ^::ferror$, ^::feof$, ^::isalnum$,
+   ^::isalpha$, ^::isblank$, ^::iscntrl$, ^::isdigit$, ^::isgraph$, ^::islower$,
+   ^::isprint$, ^::ispunct$, ^::isspace$, ^::isupper$, ^::iswalnum$,
+   ^::iswprint$, ^::iswspace$, ^::isxdigit$, ^::memchr$, ^::memcmp$, ^::strcmp$,
+   ^::strcoll$, ^::strncmp$, ^::strpbrk$, ^::strrchr$, ^::strspn$, ^::strstr$,
+   ^::wcscmp$, ^::access$, ^::bind$, ^::connect$, ^::difftime$, ^::dlsym$,
+   ^::fnmatch$, ^::getaddrinfo$, ^::getopt$, ^::htonl$, ^::htons$,
+   ^::iconv_open$, ^::inet_addr$, isascii$, isatty$, ^::mmap$, ^::newlocale$,
+   ^::openat$, ^::pathconf$, ^::pthread_equal$, ^::pthread_getspecific$,
+   ^::pthread_mutex_trylock$, ^::readdir$, ^::readlink$, ^::recvmsg$,
+   ^::regexec$, ^::scandir$, ^::semget$, ^::setjmp$, ^::shm_open$, ^::shmget$,
+   ^::sigismember$, ^::strcasecmp$, ^::strsignal$, ^::ttyname$``
 
    - ``std::async()``. Not using the return value makes the call synchronous.
    - ``std::launder()``. Not using the return value usually means that the
@@ -54,7 +57,8 @@ Options
 
    Semicolon-separated list of function return types to check.
    By default the following function return types are checked:
-   `::std::error_code`, `::std::error_condition`, `::std::errc`, `::std::expected`, `::boost::system::error_code`
+   `^::std::error_code$`, `^::std::error_condition$`, `^::std::errc$`,
+   `^::std::expected$`, `^::boost::system::error_code$`
 
 .. option:: AllowCastToVoid
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
index e723f21f6bc60..f478598bb4e6c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
@@ -1,9 +1,13 @@
 .. title:: clang-tidy - clang-analyzer-cplusplus.Move
+.. meta::
+   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move
 
 clang-analyzer-cplusplus.Move
 =============================
 
 Find use-after-move bugs in C++.
 
-The clang-analyzer-cplusplus.Move check is an alias of
-Clang Static Analyzer cplusplus.Move.
+The `clang-analyzer-cplusplus.Move` check is an alias, please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst
new file mode 100644
index 0000000000000..9732333c61aa7
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst
@@ -0,0 +1,14 @@
+.. title:: clang-tidy - clang-analyzer-optin.taint.TaintedAlloc
+.. meta::
+   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc
+
+clang-analyzer-optin.taint.TaintedAlloc
+=======================================
+
+Check for memory allocations, where the size parameter might be a tainted
+(attacker controlled) value.
+
+The `clang-analyzer-optin.taint.TaintedAlloc` check is an alias, please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst
new file mode 100644
index 0000000000000..0a5feff8d3ca8
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst
@@ -0,0 +1,10 @@
+.. title:: clang-tidy - clang-analyzer-security.PutenvStackArray
+
+clang-analyzer-security.PutenvStackArray
+========================================
+
+Finds calls to the function 'putenv' which pass a pointer to an automatic
+(stack-allocated) array as the argument.
+
+The clang-analyzer-security.PutenvStackArray check is an alias of
+Clang Static Analyzer security.PutenvStackArray.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst
new file mode 100644
index 0000000000000..40d2d3d8b8aac
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst
@@ -0,0 +1,13 @@
+.. title:: clang-tidy - clang-analyzer-unix.BlockInCriticalSection
+.. meta::
+   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection
+
+clang-analyzer-unix.BlockInCriticalSection
+==========================================
+
+Check for calls to blocking functions inside a critical section.
+
+The `clang-analyzer-unix.BlockInCriticalSection` check is an alias, please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst
index 5783280478dc1..57c4829431e76 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst
@@ -35,7 +35,7 @@ Examples:
     int* x;
     std::unique_ptr<int> x;
     std::shared_ptr<int> x;
-    gsl::not_null<int> x;
+    gsl::not_null<int*> x;
   };
 
   // Bad, rvalue reference member
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index dd2887edb0f8d..a931ebf025a10 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -118,6 +118,7 @@ Clang-Tidy Checks
    :doc:`bugprone-not-null-terminated-result <bugprone/not-null-terminated-result>`, "Yes"
    :doc:`bugprone-optional-value-conversion <bugprone/optional-value-conversion>`, "Yes"
    :doc:`bugprone-parent-virtual-call <bugprone/parent-virtual-call>`, "Yes"
+   :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes"
    :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes"
    :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
@@ -158,7 +159,6 @@ Clang-Tidy Checks
    :doc:`bugprone-unused-raii <bugprone/unused-raii>`, "Yes"
    :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`,
    :doc:`bugprone-use-after-move <bugprone/use-after-move>`,
-   :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes"
    :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`,
    :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`,
@@ -269,7 +269,7 @@ Clang-Tidy Checks
    :doc:`misc-unused-parameters <misc/unused-parameters>`, "Yes"
    :doc:`misc-unused-using-decls <misc/unused-using-decls>`, "Yes"
    :doc:`misc-use-anonymous-namespace <misc/use-anonymous-namespace>`,
-   :doc:`misc-use-internal-linkage <misc/use-internal-linkage>`,
+   :doc:`misc-use-internal-linkage <misc/use-internal-linkage>`, "Yes"
    :doc:`modernize-avoid-bind <modernize/avoid-bind>`, "Yes"
    :doc:`modernize-avoid-c-arrays <modernize/avoid-c-arrays>`,
    :doc:`modernize-concat-nested-namespaces <modernize/concat-nested-namespaces>`, "Yes"
@@ -409,6 +409,7 @@ Check aliases
    :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`, :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cert-con36-c <cert/con36-c>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
    :doc:`cert-con54-cpp <cert/con54-cpp>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
+   :doc:`cert-ctr56-cpp <cert/ctr56-cpp>`, :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`cert-dcl03-c <cert/dcl03-c>`, :doc:`misc-static-assert <misc/static-assert>`, "Yes"
    :doc:`cert-dcl16-c <cert/dcl16-c>`, :doc:`readability-uppercase-literal-suffix <readability/uppercase-literal-suffix>`, "Yes"
    :doc:`cert-dcl37-c <cert/dcl37-c>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
@@ -448,7 +449,7 @@ Check aliases
    :doc:`clang-analyzer-core.uninitialized.UndefReturn <clang-analyzer/core.uninitialized.UndefReturn>`, `Clang Static Analyzer core.uninitialized.UndefReturn <https://clang.llvm.org/docs/analyzer/checkers.html#core-uninitialized-undefreturn>`_,
    :doc:`clang-analyzer-cplusplus.ArrayDelete <clang-analyzer/cplusplus.ArrayDelete>`, `Clang Static Analyzer cplusplus.ArrayDelete <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-arraydelete>`_,
    :doc:`clang-analyzer-cplusplus.InnerPointer <clang-analyzer/cplusplus.InnerPointer>`, `Clang Static Analyzer cplusplus.InnerPointer <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-innerpointer>`_,
-   :doc:`clang-analyzer-cplusplus.Move <clang-analyzer/cplusplus.Move>`, Clang Static Analyzer cplusplus.Move,
+   :doc:`clang-analyzer-cplusplus.Move <clang-analyzer/cplusplus.Move>`, `Clang Static Analyzer cplusplus.Move <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move>`_,
    :doc:`clang-analyzer-cplusplus.NewDelete <clang-analyzer/cplusplus.NewDelete>`, `Clang Static Analyzer cplusplus.NewDelete <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-newdelete>`_,
    :doc:`clang-analyzer-cplusplus.NewDeleteLeaks <clang-analyzer/cplusplus.NewDeleteLeaks>`, `Clang Static Analyzer cplusplus.NewDeleteLeaks <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-newdeleteleaks>`_,
    :doc:`clang-analyzer-cplusplus.PlacementNew <clang-analyzer/cplusplus.PlacementNew>`, `Clang Static Analyzer cplusplus.PlacementNew <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-placementnew>`_,
@@ -471,6 +472,7 @@ Check aliases
    :doc:`clang-analyzer-optin.performance.GCDAntipattern <clang-analyzer/optin.performance.GCDAntipattern>`, `Clang Static Analyzer optin.performance.GCDAntipattern <https://clang.llvm.org/docs/analyzer/checkers.html#optin-performance-gcdantipattern>`_,
    :doc:`clang-analyzer-optin.performance.Padding <clang-analyzer/optin.performance.Padding>`, `Clang Static Analyzer optin.performance.Padding <https://clang.llvm.org/docs/analyzer/checkers.html#optin-performance-padding>`_,
    :doc:`clang-analyzer-optin.portability.UnixAPI <clang-analyzer/optin.portability.UnixAPI>`, `Clang Static Analyzer optin.portability.UnixAPI <https://clang.llvm.org/docs/analyzer/checkers.html#optin-portability-unixapi>`_,
+   :doc:`clang-analyzer-optin.taint.TaintedAlloc <clang-analyzer/optin.taint.TaintedAlloc>`, `Clang Static Analyzer optin.taint.TaintedAlloc <https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc>`_,
    :doc:`clang-analyzer-osx.API <clang-analyzer/osx.API>`, `Clang Static Analyzer osx.API <https://clang.llvm.org/docs/analyzer/checkers.html#osx-api>`_,
    :doc:`clang-analyzer-osx.MIG <clang-analyzer/osx.MIG>`, Clang Static Analyzer osx.MIG,
    :doc:`clang-analyzer-osx.NumberObjectConversion <clang-analyzer/osx.NumberObjectConversion>`, `Clang Static Analyzer osx.NumberObjectConversion <https://clang.llvm.org/docs/analyzer/checkers.html#osx-numberobjectconversion>`_,
@@ -501,6 +503,7 @@ Check aliases
    :doc:`clang-analyzer-osx.coreFoundation.containers.OutOfBounds <clang-analyzer/osx.coreFoundation.containers.OutOfBounds>`, `Clang Static Analyzer osx.coreFoundation.containers.OutOfBounds <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-outofbounds>`_,
    :doc:`clang-analyzer-osx.coreFoundation.containers.PointerSizedValues <clang-analyzer/osx.coreFoundation.containers.PointerSizedValues>`, `Clang Static Analyzer osx.coreFoundation.containers.PointerSizedValues <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-pointersizedvalues>`_,
    :doc:`clang-analyzer-security.FloatLoopCounter <clang-analyzer/security.FloatLoopCounter>`, `Clang Static Analyzer security.FloatLoopCounter <https://clang.llvm.org/docs/analyzer/checkers.html#security-floatloopcounter>`_,
+   :doc:`clang-analyzer-security.PutenvStackArray <clang-analyzer/security.PutenvStackArray>`, Clang Static Analyzer security.PutenvStackArray,
    :doc:`clang-analyzer-security.SetgidSetuidOrder <clang-analyzer/security.SetgidSetuidOrder>`, Clang Static Analyzer security.SetgidSetuidOrder,
    :doc:`clang-analyzer-security.cert.env.InvalidPtr <clang-analyzer/security.cert.env.InvalidPtr>`, `Clang Static Analyzer security.cert.env.InvalidPtr <https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`_,
    :doc:`clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling <clang-analyzer/security.insecureAPI.DeprecatedOrUnsafeBufferHandling>`, `Clang Static Analyzer security.insecureAPI.DeprecatedOrUnsafeBufferHandling <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-deprecatedorunsafebufferhandling>`_,
@@ -517,6 +520,7 @@ Check aliases
    :doc:`clang-analyzer-security.insecureAPI.strcpy <clang-analyzer/security.insecureAPI.strcpy>`, `Clang Static Analyzer security.insecureAPI.strcpy <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-strcpy>`_,
    :doc:`clang-analyzer-security.insecureAPI.vfork <clang-analyzer/security.insecureAPI.vfork>`, `Clang Static Analyzer security.insecureAPI.vfork <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-vfork>`_,
    :doc:`clang-analyzer-unix.API <clang-analyzer/unix.API>`, `Clang Static Analyzer unix.API <https://clang.llvm.org/docs/analyzer/checkers.html#unix-api>`_,
+   :doc:`clang-analyzer-unix.BlockInCriticalSection <clang-analyzer/unix.BlockInCriticalSection>`, `Clang Static Analyzer unix.BlockInCriticalSection <https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection>`_,
    :doc:`clang-analyzer-unix.Errno <clang-analyzer/unix.Errno>`, `Clang Static Analyzer unix.Errno <https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno>`_,
    :doc:`clang-analyzer-unix.Malloc <clang-analyzer/unix.Malloc>`, `Clang Static Analyzer unix.Malloc <https://clang.llvm.org/docs/analyzer/checkers.html#unix-malloc>`_,
    :doc:`clang-analyzer-unix.MallocSizeof <clang-analyzer/unix.MallocSizeof>`, `Clang Static Analyzer unix.MallocSizeof <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mallocsizeof>`_,
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index f1cd72f877ca2..68fe79d6929f6 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -105,9 +105,20 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
              }
              if (!Satisfied && !Providers.empty() &&
                  Ref.RT == RefType::Explicit &&
-                 !HeaderFilter(Providers.front().resolvedPath()))
-               Missing.insert(spellHeader(
-                   {Providers.front(), PP.getHeaderSearchInfo(), MainFile}));
+                 !HeaderFilter(Providers.front().resolvedPath())) {
+               // Check if we have any headers with the same spelling, in edge
+               // cases like `#include_next "foo.h"`, the user can't ever
+               // include the physical foo.h, but can have a spelling that
+               // refers to it.
+               auto Spelling = spellHeader(
+                   {Providers.front(), PP.getHeaderSearchInfo(), MainFile});
+               for (const Include *I : Inc.match(Header{Spelling})) {
+                 Used.insert(I);
+                 Satisfied = true;
+               }
+               if (!Satisfied)
+                 Missing.insert(std::move(Spelling));
+             }
            });
 
   AnalysisResults Results;
diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
index 6558b68087684..5696c380758f8 100644
--- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
@@ -12,6 +12,7 @@
 #include "clang-include-cleaner/Record.h"
 #include "clang-include-cleaner/Types.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/DeclBase.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceLocation.h"
@@ -296,6 +297,31 @@ TEST_F(AnalyzeTest, ResourceDirIsIgnored) {
   EXPECT_THAT(Results.Missing, testing::IsEmpty());
 }
 
+TEST_F(AnalyzeTest, DifferentHeaderSameSpelling) {
+  Inputs.ExtraArgs.push_back("-Ifoo");
+  Inputs.ExtraArgs.push_back("-Ifoo_inner");
+  // `foo` is declared in foo_inner/foo.h, but there's no way to spell it
+  // directly. Make sure we don't generate unusued/missing include findings in
+  // such cases.
+  Inputs.Code = R"cpp(
+    #include <foo.h>
+    void baz() {
+      foo();
+    }
+  )cpp";
+  Inputs.ExtraFiles["foo/foo.h"] = guard("#include_next <foo.h>");
+  Inputs.ExtraFiles["foo_inner/foo.h"] = guard(R"cpp(
+    void foo();
+  )cpp");
+  TestAST AST(Inputs);
+  std::vector<Decl *> DeclsInTU;
+  for (auto *D : AST.context().getTranslationUnitDecl()->decls())
+    DeclsInTU.push_back(D);
+  auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor());
+  EXPECT_THAT(Results.Unused, testing::IsEmpty());
+  EXPECT_THAT(Results.Missing, testing::IsEmpty());
+}
+
 TEST(FixIncludes, Basic) {
   llvm::StringRef Code = R"cpp(#include "d.h"
 #include "a.h"
diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 51d3ac6ce6dcd..38569d824f1f0 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,3 +1,6 @@
+// See https://github.com/llvm/llvm-project/issues/97507.
+// UNSUPPORTED: target={{.*}}
+
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
 // RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
diff --git a/clang-tools-extra/test/clang-doc/test-path-abs.cpp b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
new file mode 100644
index 0000000000000..f6cce95bbea0c
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t && mkdir %t
+// RUN: clang-doc --format=html --executor=standalone %s --output=%t
+// RUN: FileCheck %s -input-file=%t/index_json.js  -check-prefix=JSON-INDEX
+// RUN: rm -rf %t
+
+// JSON-INDEX: var RootPath = "{{.*}}test-path-abs.cpp.tmp";
\ No newline at end of file
diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
index e92179ac82c6a..5e39c05f76d86 100755
--- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
+++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
@@ -205,9 +205,11 @@ def run_clang_tidy(self):
                 self.temp_file_name,
             ]
             + [
-                "-fix"
-                if self.export_fixes is None
-                else "--export-fixes=" + self.export_fixes
+                (
+                    "-fix"
+                    if self.export_fixes is None
+                    else "--export-fixes=" + self.export_fixes
+                )
             ]
             + [
                 "--checks=-*," + self.check_name,
@@ -299,19 +301,37 @@ def run(self):
             self.check_notes(clang_tidy_output)
 
 
+CPP_STANDARDS = [
+    "c++98",
+    "c++11",
+    ("c++14", "c++1y"),
+    ("c++17", "c++1z"),
+    ("c++20", "c++2a"),
+    ("c++23", "c++2b"),
+    ("c++26", "c++2c"),
+]
+C_STANDARDS = ["c99", ("c11", "c1x"), "c17", ("c23", "c2x"), "c2y"]
+
+
 def expand_std(std):
-    if std == "c++98-or-later":
-        return ["c++98", "c++11", "c++14", "c++17", "c++20", "c++23", "c++2c"]
-    if std == "c++11-or-later":
-        return ["c++11", "c++14", "c++17", "c++20", "c++23", "c++2c"]
-    if std == "c++14-or-later":
-        return ["c++14", "c++17", "c++20", "c++23", "c++2c"]
-    if std == "c++17-or-later":
-        return ["c++17", "c++20", "c++23", "c++2c"]
-    if std == "c++20-or-later":
-        return ["c++20", "c++23", "c++2c"]
-    if std == "c++23-or-later":
-        return ["c++23", "c++2c"]
+    split_std, or_later, _ = std.partition("-or-later")
+
+    if not or_later:
+        return [split_std]
+
+    for standard_list in (CPP_STANDARDS, C_STANDARDS):
+        item = next(
+            (
+                i
+                for i, v in enumerate(standard_list)
+                if (split_std in v if isinstance(v, (list, tuple)) else split_std == v)
+            ),
+            None,
+        )
+        if item is not None:
+            return [split_std] + [
+                x if isinstance(x, str) else x[0] for x in standard_list[item + 1 :]
+            ]
     return [std]
 
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
index f5e74df1621ce..26c443b139629 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
@@ -756,3 +756,21 @@ struct test_implicit_throw {
 };
 
 }}
+
+void pointer_exception_can_not_escape_with_const_void_handler() noexcept {
+  // CHECK-MESSAGES-NOT: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'pointer_exception_can_not_escape_with_const_void_handler' which should not throw exceptions
+  const int value = 42;
+  try {
+    throw &value;
+  } catch (const void *) {
+  }
+}
+
+void pointer_exception_can_not_escape_with_void_handler() noexcept {
+  // CHECK-MESSAGES-NOT: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'pointer_exception_can_not_escape_with_void_handler' which should not throw exceptions
+  int value = 42;
+  try {
+    throw &value;
+  } catch (void *) {
+  }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
index 5a5d05bb4e94e..e3864be134da3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
@@ -18,7 +18,7 @@ struct Ok {
   const int *pc;
   std::unique_ptr<int> up;
   std::shared_ptr<int> sp;
-  gsl::not_null<int> n;
+  gsl::not_null<int*> n;
 };
 
 struct ConstMember {
@@ -60,7 +60,7 @@ struct Ok2 {
   const Foo *pc;
   std::unique_ptr<Foo> up;
   std::shared_ptr<Foo> sp;
-  gsl::not_null<Foo> n;
+  gsl::not_null<Foo*> n;
 };
 
 struct ConstMember2 {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
index cb6bfcc1dccba..0d1ff0db58371 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
@@ -3,7 +3,7 @@
 // RUN:     misc-const-correctness.TransformValues: true, \
 // RUN:     misc-const-correctness.WarnPointersAsValues: false, \
 // RUN:     misc-const-correctness.TransformPointersAsValues: false \
-// RUN:   }}" -- -fno-delayed-template-parsing
+// RUN:   }}" -- -fno-delayed-template-parsing -fexceptions
 
 // ------- Provide test samples for primitive builtins ---------
 // - every 'p_*' variable is a 'potential_const_*' variable
@@ -56,6 +56,15 @@ void some_function(double np_arg0, wchar_t np_arg1) {
   np_local6--;
 }
 
+int function_try_block() try {
+  int p_local0 = 0;
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int' can be declared 'const'
+  // CHECK-FIXES: int const p_local0
+  return p_local0;
+} catch (...) {
+  return 0;
+}
+
 void nested_scopes() {
   int p_local0 = 2;
   // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int' can be declared 'const'
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
index 6596511c7a38b..69ac9954f4afa 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
@@ -7,8 +7,8 @@ template <typename T> class vector {
 public:
   using iterator = T *;
   using const_iterator = const T *;
-  using reverse_iterator = T*;
-  using reverse_const_iterator = const T*;
+  using reverse_iterator = T *;
+  using reverse_const_iterator = const T *;
 
   constexpr const_iterator begin() const;
   constexpr const_iterator end() const;
@@ -72,8 +72,8 @@ template <typename Container> constexpr auto crend(const Container &Cont) {
   return Cont.crend();
 }
 // Find
-template< class InputIt, class T >
-InputIt find( InputIt first, InputIt last, const T& value );
+template <class InputIt, class T>
+InputIt find(InputIt first, InputIt last, const T &value);
 
 // Reverse
 template <typename Iter> void reverse(Iter begin, Iter end);
@@ -82,6 +82,7 @@ template <typename Iter> void reverse(Iter begin, Iter end);
 template <class InputIt1, class InputIt2>
 bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
 
+inline namespace _V1 {
 // IsPermutation
 template <class ForwardIt1, class ForwardIt2>
 bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2);
@@ -97,9 +98,10 @@ template <class InputIt1, class InputIt2>
 bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
 
 template <class InputIt1, class InputIt2, class BinaryPred>
-bool equal(InputIt1 first1, InputIt1 last1,
-           InputIt2 first2, InputIt2 last2, BinaryPred p) {
-  // Need a definition to suppress undefined_internal_type when invoked with lambda
+bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
+           BinaryPred p) {
+  // Need a definition to suppress undefined_internal_type when invoked with
+  // lambda
   return true;
 }
 
@@ -108,6 +110,7 @@ void iota(ForwardIt first, ForwardIt last, T value);
 
 template <class ForwardIt>
 ForwardIt rotate(ForwardIt first, ForwardIt middle, ForwardIt last);
+} // namespace _V1
 
 } // namespace std
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp
index 0dffaefa213a4..7c7ae43698929 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp
@@ -2,6 +2,31 @@
 
 // CHECK-FIXES: #include <utility>
 
+namespace std {
+template <typename>
+struct remove_reference;
+
+template <typename _Tp>
+struct remove_reference {
+  typedef _Tp type;
+};
+
+template <typename _Tp>
+struct remove_reference<_Tp &> {
+  typedef _Tp type;
+};
+
+template <typename _Tp>
+struct remove_reference<_Tp &&> {
+  typedef _Tp type;
+};
+
+template <typename _Tp>
+constexpr typename std::remove_reference<_Tp>::type &&move(_Tp &&__t) {
+  return static_cast<typename std::remove_reference<_Tp>::type &&>(__t);
+}
+} // namespace std
+
 struct ExpensiveToCopyType {
   const ExpensiveToCopyType & constReference() const {
     return *this;
@@ -357,3 +382,12 @@ void fun() {
   ExpensiveToCopyType E;
   NegativeUsingConstructor S(E);
 }
+
+struct B {
+  static void bar(ExpensiveMovableType a, ExpensiveMovableType b);
+};
+
+template <typename T>
+void NegativeCallWithDependentAndNondependentArgs(ExpensiveMovableType a, T b) {
+    B::bar(std::move(a), b);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c b/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c
new file mode 100644
index 0000000000000..db50467f3dd94
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c
@@ -0,0 +1,11 @@
+// RUN: %check_clang_tidy %s readability-non-const-parameter %t
+
+static int f();
+
+int f(p)
+  int *p;
+// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: pointer parameter 'p' can be pointer to const [readability-non-const-parameter]
+// CHECK-FIXES: {{^}}  const int *p;{{$}}
+{
+    return *p;
+}
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index be024da5e005c..2038ef6045c7d 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -43,7 +43,7 @@
 Most object information is exposed using properties, when the underlying API
 call is efficient.
 """
-from __future__ import absolute_import, division, print_function
+from __future__ import annotations
 
 # TODO
 # ====
@@ -64,48 +64,80 @@
 
 from ctypes import *
 
-import collections.abc
 import os
+import sys
 from enum import Enum
 
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Optional,
+    Type as TType,
+    TypeVar,
+    TYPE_CHECKING,
+    Union as TUnion,
+)
+
+if TYPE_CHECKING:
+    from ctypes import _Pointer
+    from typing_extensions import Protocol, TypeAlias
+
+    StrPath: TypeAlias = TUnion[str, os.PathLike[str]]
+    LibFunc: TypeAlias = TUnion[
+        "tuple[str, Optional[list[Any]]]",
+        "tuple[str, Optional[list[Any]], Any]",
+        "tuple[str, Optional[list[Any]], Any, Callable[..., Any]]",
+    ]
+
+    TSeq = TypeVar("TSeq", covariant=True)
+
+    class NoSliceSequence(Protocol[TSeq]):
+        def __len__(self) -> int:
+            ...
+
+        def __getitem__(self, key: int) -> TSeq:
+            ...
+
 
 # Python 3 strings are unicode, translate them to/from utf8 for C-interop.
 class c_interop_string(c_char_p):
-    def __init__(self, p=None):
+    def __init__(self, p: str | bytes | None = None):
         if p is None:
             p = ""
         if isinstance(p, str):
             p = p.encode("utf8")
         super(c_char_p, self).__init__(p)
 
-    def __str__(self):
-        return self.value
+    def __str__(self) -> str:
+        return self.value or ""
 
     @property
-    def value(self):
-        if super(c_char_p, self).value is None:
+    def value(self) -> str | None:  # type: ignore [override]
+        val = super(c_char_p, self).value
+        if val is None:
             return None
-        return super(c_char_p, self).value.decode("utf8")
+        return val.decode("utf8")
 
     @classmethod
-    def from_param(cls, param):
+    def from_param(cls, param: str | bytes | None) -> c_interop_string:
         if isinstance(param, str):
             return cls(param)
         if isinstance(param, bytes):
             return cls(param)
         if param is None:
             # Support passing null to C functions expecting char arrays
-            return None
+            return cls(param)
         raise TypeError(
             "Cannot convert '{}' to '{}'".format(type(param).__name__, cls.__name__)
         )
 
     @staticmethod
-    def to_python_string(x, *args):
+    def to_python_string(x: c_interop_string, *args: Any) -> str | None:
         return x.value
 
 
-def b(x):
+def b(x: str | bytes) -> bytes:
     if isinstance(x, bytes):
         return x
     return x.encode("utf8")
@@ -115,9 +147,7 @@ def b(x):
 # object. This is a problem, because it means that from_parameter will see an
 # integer and pass the wrong value on platforms where int != void*. Work around
 # this by marshalling object arguments as void**.
-c_object_p = POINTER(c_void_p)
-
-callbacks = {}
+c_object_p: TType[_Pointer[Any]] = POINTER(c_void_p)
 
 ### Exception Classes ###
 
@@ -169,8 +199,11 @@ def __init__(self, enumeration, message):
 
 ### Structures and Utility Classes ###
 
+TInstance = TypeVar("TInstance")
+TResult = TypeVar("TResult")
+
 
-class CachedProperty:
+class CachedProperty(Generic[TInstance, TResult]):
     """Decorator that lazy-loads the value of a property.
 
     The first time the property is accessed, the original property function is
@@ -178,16 +211,20 @@ class CachedProperty:
     property, replacing the original method.
     """
 
-    def __init__(self, wrapped):
+    def __init__(self, wrapped: Callable[[TInstance], TResult]):
         self.wrapped = wrapped
         try:
             self.__doc__ = wrapped.__doc__
         except:
             pass
 
-    def __get__(self, instance, instance_type=None):
+    def __get__(self, instance: TInstance, instance_type: Any = None) -> TResult:
         if instance is None:
-            return self
+            property_name = self.wrapped.__name__
+            class_name = instance_type.__name__
+            raise TypeError(
+                f"'{property_name}' is not a static attribute of '{class_name}'"
+            )
 
         value = self.wrapped(instance)
         setattr(instance, self.wrapped.__name__, value)
@@ -200,13 +237,16 @@ class _CXString(Structure):
 
     _fields_ = [("spelling", c_char_p), ("free", c_int)]
 
-    def __del__(self):
+    def __del__(self) -> None:
         conf.lib.clang_disposeString(self)
 
     @staticmethod
-    def from_result(res, fn=None, args=None):
+    def from_result(res: _CXString, fn: Any = None, args: Any = None) -> str:
         assert isinstance(res, _CXString)
-        return conf.lib.clang_getCString(res)
+        pystr: str | None = conf.lib.clang_getCString(res)
+        if pystr is None:
+            return ""
+        return pystr
 
 
 class SourceLocation(Structure):
@@ -236,7 +276,7 @@ def from_position(tu, file, line, column):
         Retrieve the source location associated with a given file/line/column in
         a particular translation unit.
         """
-        return conf.lib.clang_getLocation(tu, file, line, column)
+        return conf.lib.clang_getLocation(tu, file, line, column)  # type: ignore [no-any-return]
 
     @staticmethod
     def from_offset(tu, file, offset):
@@ -246,7 +286,7 @@ def from_offset(tu, file, offset):
         file -- File instance to obtain offset from
         offset -- Integer character offset within file
         """
-        return conf.lib.clang_getLocationForOffset(tu, file, offset)
+        return conf.lib.clang_getLocationForOffset(tu, file, offset)  # type: ignore [no-any-return]
 
     @property
     def file(self):
@@ -271,10 +311,10 @@ def offset(self):
     @property
     def is_in_system_header(self):
         """Returns true if the given source location is in a system header."""
-        return conf.lib.clang_Location_isInSystemHeader(self)
+        return conf.lib.clang_Location_isInSystemHeader(self)  # type: ignore [no-any-return]
 
     def __eq__(self, other):
-        return conf.lib.clang_equalLocations(self, other)
+        return conf.lib.clang_equalLocations(self, other)  # type: ignore [no-any-return]
 
     def __ne__(self, other):
         return not self.__eq__(other)
@@ -307,7 +347,7 @@ class SourceRange(Structure):
     # object.
     @staticmethod
     def from_locations(start, end):
-        return conf.lib.clang_getRange(start, end)
+        return conf.lib.clang_getRange(start, end)  # type: ignore [no-any-return]
 
     @property
     def start(self):
@@ -315,7 +355,7 @@ def start(self):
         Return a SourceLocation representing the first character within a
         source range.
         """
-        return conf.lib.clang_getRangeStart(self)
+        return conf.lib.clang_getRangeStart(self)  # type: ignore [no-any-return]
 
     @property
     def end(self):
@@ -323,10 +363,10 @@ def end(self):
         Return a SourceLocation representing the last character within a
         source range.
         """
-        return conf.lib.clang_getRangeEnd(self)
+        return conf.lib.clang_getRangeEnd(self)  # type: ignore [no-any-return]
 
     def __eq__(self, other):
-        return conf.lib.clang_equalRanges(self, other)
+        return conf.lib.clang_equalRanges(self, other)  # type: ignore [no-any-return]
 
     def __ne__(self, other):
         return not self.__eq__(other)
@@ -389,42 +429,42 @@ def __del__(self):
 
     @property
     def severity(self):
-        return conf.lib.clang_getDiagnosticSeverity(self)
+        return conf.lib.clang_getDiagnosticSeverity(self)  # type: ignore [no-any-return]
 
     @property
     def location(self):
-        return conf.lib.clang_getDiagnosticLocation(self)
+        return conf.lib.clang_getDiagnosticLocation(self)  # type: ignore [no-any-return]
 
     @property
     def spelling(self):
-        return conf.lib.clang_getDiagnosticSpelling(self)
+        return conf.lib.clang_getDiagnosticSpelling(self)  # type: ignore [no-any-return]
 
     @property
-    def ranges(self):
+    def ranges(self) -> NoSliceSequence[SourceRange]:
         class RangeIterator:
-            def __init__(self, diag):
+            def __init__(self, diag: Diagnostic):
                 self.diag = diag
 
-            def __len__(self):
+            def __len__(self) -> int:
                 return int(conf.lib.clang_getDiagnosticNumRanges(self.diag))
 
-            def __getitem__(self, key):
+            def __getitem__(self, key: int) -> SourceRange:
                 if key >= len(self):
                     raise IndexError
-                return conf.lib.clang_getDiagnosticRange(self.diag, key)
+                return conf.lib.clang_getDiagnosticRange(self.diag, key)  # type: ignore [no-any-return]
 
         return RangeIterator(self)
 
     @property
-    def fixits(self):
+    def fixits(self) -> NoSliceSequence[FixIt]:
         class FixItIterator:
-            def __init__(self, diag):
+            def __init__(self, diag: Diagnostic):
                 self.diag = diag
 
-            def __len__(self):
+            def __len__(self) -> int:
                 return int(conf.lib.clang_getDiagnosticNumFixIts(self.diag))
 
-            def __getitem__(self, key):
+            def __getitem__(self, key: int) -> FixIt:
                 range = SourceRange()
                 value = conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range))
                 if len(value) == 0:
@@ -435,15 +475,15 @@ def __getitem__(self, key):
         return FixItIterator(self)
 
     @property
-    def children(self):
+    def children(self) -> NoSliceSequence[Diagnostic]:
         class ChildDiagnosticsIterator:
-            def __init__(self, diag):
+            def __init__(self, diag: Diagnostic):
                 self.diag_set = conf.lib.clang_getChildDiagnostics(diag)
 
-            def __len__(self):
+            def __len__(self) -> int:
                 return int(conf.lib.clang_getNumDiagnosticsInSet(self.diag_set))
 
-            def __getitem__(self, key):
+            def __getitem__(self, key: int) -> Diagnostic:
                 diag = conf.lib.clang_getDiagnosticInSet(self.diag_set, key)
                 if not diag:
                     raise IndexError
@@ -454,17 +494,17 @@ def __getitem__(self, key):
     @property
     def category_number(self):
         """The category number for this diagnostic or 0 if unavailable."""
-        return conf.lib.clang_getDiagnosticCategory(self)
+        return conf.lib.clang_getDiagnosticCategory(self)  # type: ignore [no-any-return]
 
     @property
     def category_name(self):
         """The string name of the category for this diagnostic."""
-        return conf.lib.clang_getDiagnosticCategoryText(self)
+        return conf.lib.clang_getDiagnosticCategoryText(self)  # type: ignore [no-any-return]
 
     @property
     def option(self):
         """The command-line option that enables this diagnostic."""
-        return conf.lib.clang_getDiagnosticOption(self, None)
+        return conf.lib.clang_getDiagnosticOption(self, None)  # type: ignore [no-any-return]
 
     @property
     def disable_option(self):
@@ -484,7 +524,7 @@ def format(self, options=None):
             options = conf.lib.clang_defaultDiagnosticDisplayOptions()
         if options & ~Diagnostic._FormatOptionsMask:
             raise ValueError("Invalid format options")
-        return conf.lib.clang_formatDiagnostic(self, options)
+        return conf.lib.clang_formatDiagnostic(self, options)  # type: ignore [no-any-return]
 
     def __repr__(self):
         return "<Diagnostic severity %r, location %r, spelling %r>" % (
@@ -619,39 +659,39 @@ def get_all_kinds():
 
     def is_declaration(self):
         """Test if this is a declaration kind."""
-        return conf.lib.clang_isDeclaration(self)
+        return conf.lib.clang_isDeclaration(self)  # type: ignore [no-any-return]
 
     def is_reference(self):
         """Test if this is a reference kind."""
-        return conf.lib.clang_isReference(self)
+        return conf.lib.clang_isReference(self)  # type: ignore [no-any-return]
 
     def is_expression(self):
         """Test if this is an expression kind."""
-        return conf.lib.clang_isExpression(self)
+        return conf.lib.clang_isExpression(self)  # type: ignore [no-any-return]
 
     def is_statement(self):
         """Test if this is a statement kind."""
-        return conf.lib.clang_isStatement(self)
+        return conf.lib.clang_isStatement(self)  # type: ignore [no-any-return]
 
     def is_attribute(self):
         """Test if this is an attribute kind."""
-        return conf.lib.clang_isAttribute(self)
+        return conf.lib.clang_isAttribute(self)  # type: ignore [no-any-return]
 
     def is_invalid(self):
         """Test if this is an invalid kind."""
-        return conf.lib.clang_isInvalid(self)
+        return conf.lib.clang_isInvalid(self)  # type: ignore [no-any-return]
 
     def is_translation_unit(self):
         """Test if this is a translation unit kind."""
-        return conf.lib.clang_isTranslationUnit(self)
+        return conf.lib.clang_isTranslationUnit(self)  # type: ignore [no-any-return]
 
     def is_preprocessing(self):
         """Test if this is a preprocessing kind."""
-        return conf.lib.clang_isPreprocessing(self)
+        return conf.lib.clang_isPreprocessing(self)  # type: ignore [no-any-return]
 
     def is_unexposed(self):
         """Test if this is an unexposed kind."""
-        return conf.lib.clang_isUnexposed(self)
+        return conf.lib.clang_isUnexposed(self)  # type: ignore [no-any-return]
 
 
     ###
@@ -1524,7 +1564,7 @@ def from_location(tu, location):
         return cursor
 
     def __eq__(self, other):
-        return conf.lib.clang_equalCursors(self, other)
+        return conf.lib.clang_equalCursors(self, other)  # type: ignore [no-any-return]
 
     def __ne__(self, other):
         return not self.__eq__(other)
@@ -1534,41 +1574,41 @@ def is_definition(self):
         Returns true if the declaration pointed at by the cursor is also a
         definition of that entity.
         """
-        return conf.lib.clang_isCursorDefinition(self)
+        return conf.lib.clang_isCursorDefinition(self)  # type: ignore [no-any-return]
 
     def is_const_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared 'const'.
         """
-        return conf.lib.clang_CXXMethod_isConst(self)
+        return conf.lib.clang_CXXMethod_isConst(self)  # type: ignore [no-any-return]
 
     def is_converting_constructor(self):
         """Returns True if the cursor refers to a C++ converting constructor."""
-        return conf.lib.clang_CXXConstructor_isConvertingConstructor(self)
+        return conf.lib.clang_CXXConstructor_isConvertingConstructor(self)  # type: ignore [no-any-return]
 
     def is_copy_constructor(self):
         """Returns True if the cursor refers to a C++ copy constructor."""
-        return conf.lib.clang_CXXConstructor_isCopyConstructor(self)
+        return conf.lib.clang_CXXConstructor_isCopyConstructor(self)  # type: ignore [no-any-return]
 
     def is_default_constructor(self):
         """Returns True if the cursor refers to a C++ default constructor."""
-        return conf.lib.clang_CXXConstructor_isDefaultConstructor(self)
+        return conf.lib.clang_CXXConstructor_isDefaultConstructor(self)  # type: ignore [no-any-return]
 
     def is_move_constructor(self):
         """Returns True if the cursor refers to a C++ move constructor."""
-        return conf.lib.clang_CXXConstructor_isMoveConstructor(self)
+        return conf.lib.clang_CXXConstructor_isMoveConstructor(self)  # type: ignore [no-any-return]
 
     def is_default_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared '= default'.
         """
-        return conf.lib.clang_CXXMethod_isDefaulted(self)
+        return conf.lib.clang_CXXMethod_isDefaulted(self)  # type: ignore [no-any-return]
 
     def is_deleted_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared '= delete'.
         """
-        return conf.lib.clang_CXXMethod_isDeleted(self)
+        return conf.lib.clang_CXXMethod_isDeleted(self)  # type: ignore [no-any-return]
 
     def is_copy_assignment_operator_method(self):
         """Returnrs True if the cursor refers to a copy-assignment operator.
@@ -1593,7 +1633,7 @@ class Bar {
 
         Is not.
         """
-        return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self)
+        return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self)  # type: ignore [no-any-return]
 
     def is_move_assignment_operator_method(self):
         """Returnrs True if the cursor refers to a move-assignment operator.
@@ -1618,7 +1658,7 @@ class Bar {
 
         Is not.
         """
-        return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self)
+        return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self)  # type: ignore [no-any-return]
 
     def is_explicit_method(self):
         """Determines if a C++ constructor or conversion function is
@@ -1663,41 +1703,41 @@ class Foo {
         This method will return 0 for the constructor and 1 for
         the conversion function.
         """
-        return conf.lib.clang_CXXMethod_isExplicit(self)
+        return conf.lib.clang_CXXMethod_isExplicit(self)  # type: ignore [no-any-return]
 
     def is_mutable_field(self):
         """Returns True if the cursor refers to a C++ field that is declared
         'mutable'.
         """
-        return conf.lib.clang_CXXField_isMutable(self)
+        return conf.lib.clang_CXXField_isMutable(self)  # type: ignore [no-any-return]
 
     def is_pure_virtual_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared pure virtual.
         """
-        return conf.lib.clang_CXXMethod_isPureVirtual(self)
+        return conf.lib.clang_CXXMethod_isPureVirtual(self)  # type: ignore [no-any-return]
 
     def is_static_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared 'static'.
         """
-        return conf.lib.clang_CXXMethod_isStatic(self)
+        return conf.lib.clang_CXXMethod_isStatic(self)  # type: ignore [no-any-return]
 
     def is_virtual_method(self):
         """Returns True if the cursor refers to a C++ member function or member
         function template that is declared 'virtual'.
         """
-        return conf.lib.clang_CXXMethod_isVirtual(self)
+        return conf.lib.clang_CXXMethod_isVirtual(self)  # type: ignore [no-any-return]
 
     def is_abstract_record(self):
         """Returns True if the cursor refers to a C++ record declaration
         that has pure virtual member functions.
         """
-        return conf.lib.clang_CXXRecord_isAbstract(self)
+        return conf.lib.clang_CXXRecord_isAbstract(self)  # type: ignore [no-any-return]
 
     def is_scoped_enum(self):
         """Returns True if the cursor refers to a scoped enum declaration."""
-        return conf.lib.clang_EnumDecl_isScoped(self)
+        return conf.lib.clang_EnumDecl_isScoped(self)  # type: ignore [no-any-return]
 
     def get_definition(self):
         """
@@ -1707,7 +1747,7 @@ def get_definition(self):
         """
         # TODO: Should probably check that this is either a reference or
         # declaration prior to issuing the lookup.
-        return conf.lib.clang_getCursorDefinition(self)
+        return conf.lib.clang_getCursorDefinition(self)  # type: ignore [no-any-return]
 
     def get_usr(self):
         """Return the Unified Symbol Resolution (USR) for the entity referenced
@@ -1718,13 +1758,13 @@ def get_usr(self):
         program. USRs can be compared across translation units to determine,
         e.g., when references in one translation refer to an entity defined in
         another translation unit."""
-        return conf.lib.clang_getCursorUSR(self)
+        return conf.lib.clang_getCursorUSR(self)  # type: ignore [no-any-return]
 
     def get_included_file(self):
         """Returns the File that is included by the current inclusion cursor."""
         assert self.kind == CursorKind.INCLUSION_DIRECTIVE
 
-        return conf.lib.clang_getIncludedFile(self)
+        return conf.lib.clang_getIncludedFile(self)  # type: ignore [no-any-return]
 
     @property
     def kind(self):
@@ -1994,12 +2034,12 @@ def referenced(self):
     @property
     def brief_comment(self):
         """Returns the brief comment text associated with that Cursor"""
-        return conf.lib.clang_Cursor_getBriefCommentText(self)
+        return conf.lib.clang_Cursor_getBriefCommentText(self)  # type: ignore [no-any-return]
 
     @property
     def raw_comment(self):
         """Returns the raw comment text associated with that Cursor"""
-        return conf.lib.clang_Cursor_getRawCommentText(self)
+        return conf.lib.clang_Cursor_getRawCommentText(self)  # type: ignore [no-any-return]
 
     def get_arguments(self):
         """Return an iterator for accessing the arguments of this cursor."""
@@ -2009,24 +2049,24 @@ def get_arguments(self):
 
     def get_num_template_arguments(self):
         """Returns the number of template args associated with this cursor."""
-        return conf.lib.clang_Cursor_getNumTemplateArguments(self)
+        return conf.lib.clang_Cursor_getNumTemplateArguments(self)  # type: ignore [no-any-return]
 
     def get_template_argument_kind(self, num):
         """Returns the TemplateArgumentKind for the indicated template
         argument."""
-        return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num)
+        return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num)  # type: ignore [no-any-return]
 
     def get_template_argument_type(self, num):
         """Returns the CXType for the indicated template argument."""
-        return conf.lib.clang_Cursor_getTemplateArgumentType(self, num)
+        return conf.lib.clang_Cursor_getTemplateArgumentType(self, num)  # type: ignore [no-any-return]
 
     def get_template_argument_value(self, num):
         """Returns the value of the indicated arg as a signed 64b integer."""
-        return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num)
+        return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num)  # type: ignore [no-any-return]
 
     def get_template_argument_unsigned_value(self, num):
         """Returns the value of the indicated arg as an unsigned 64b integer."""
-        return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num)
+        return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num)  # type: ignore [no-any-return]
 
     def get_children(self):
         """Return an iterator for accessing the children of this cursor."""
@@ -2042,8 +2082,8 @@ def visitor(child, parent, children):
             children.append(child)
             return 1  # continue
 
-        children = []
-        conf.lib.clang_visitChildren(self, callbacks["cursor_visit"](visitor), children)
+        children: list[Cursor] = []
+        conf.lib.clang_visitChildren(self, cursor_visit_callback(visitor), children)
         return iter(children)
 
     def walk_preorder(self):
@@ -2066,7 +2106,7 @@ def get_tokens(self):
 
     def get_field_offsetof(self):
         """Returns the offsetof the FIELD_DECL pointed by this Cursor."""
-        return conf.lib.clang_Cursor_getOffsetOfField(self)
+        return conf.lib.clang_Cursor_getOffsetOfField(self)  # type: ignore [no-any-return]
 
     def is_anonymous(self):
         """
@@ -2074,19 +2114,19 @@ def is_anonymous(self):
         """
         if self.kind == CursorKind.FIELD_DECL:
             return self.type.get_declaration().is_anonymous()
-        return conf.lib.clang_Cursor_isAnonymous(self)
+        return conf.lib.clang_Cursor_isAnonymous(self)  # type: ignore [no-any-return]
 
     def is_bitfield(self):
         """
         Check if the field is a bitfield.
         """
-        return conf.lib.clang_Cursor_isBitField(self)
+        return conf.lib.clang_Cursor_isBitField(self)  # type: ignore [no-any-return]
 
     def get_bitfield_width(self):
         """
         Retrieve the width of a bitfield.
         """
-        return conf.lib.clang_getFieldDeclBitWidth(self)
+        return conf.lib.clang_getFieldDeclBitWidth(self)  # type: ignore [no-any-return]
 
     @staticmethod
     def from_result(res, fn, args):
@@ -2223,7 +2263,7 @@ class TypeKind(BaseEnumeration):
     @property
     def spelling(self):
         """Retrieve the spelling of this TypeKind."""
-        return conf.lib.clang_getTypeKindSpelling(self.value)
+        return conf.lib.clang_getTypeKindSpelling(self.value)  # type: ignore [no-any-return]
 
     INVALID = 0
     UNEXPOSED = 1
@@ -2379,25 +2419,25 @@ def kind(self):
         """Return the kind of this type."""
         return TypeKind.from_id(self._kind_id)
 
-    def argument_types(self):
+    def argument_types(self) -> NoSliceSequence[Type]:
         """Retrieve a container for the non-variadic arguments for this type.
 
         The returned object is iterable and indexable. Each item in the
         container is a Type instance.
         """
 
-        class ArgumentsIterator(collections.abc.Sequence):
-            def __init__(self, parent):
+        class ArgumentsIterator:
+            def __init__(self, parent: Type):
                 self.parent = parent
-                self.length = None
+                self.length: int | None = None
 
-            def __len__(self):
+            def __len__(self) -> int:
                 if self.length is None:
                     self.length = conf.lib.clang_getNumArgTypes(self.parent)
 
                 return self.length
 
-            def __getitem__(self, key):
+            def __getitem__(self, key: int) -> Type:
                 # FIXME Support slice objects.
                 if not isinstance(key, int):
                     raise TypeError("Must supply a non-negative int.")
@@ -2411,7 +2451,7 @@ def __getitem__(self, key):
                         "%d > %d" % (key, len(self))
                     )
 
-                result = conf.lib.clang_getArgType(self.parent, key)
+                result: Type = conf.lib.clang_getArgType(self.parent, key)
                 if result.kind == TypeKind.INVALID:
                     raise IndexError("Argument could not be retrieved.")
 
@@ -2470,10 +2510,10 @@ def from_result(res, fn, args):
         return res
 
     def get_num_template_arguments(self):
-        return conf.lib.clang_Type_getNumTemplateArguments(self)
+        return conf.lib.clang_Type_getNumTemplateArguments(self)  # type: ignore [no-any-return]
 
     def get_template_argument_type(self, num):
-        return conf.lib.clang_Type_getTemplateArgumentAsType(self, num)
+        return conf.lib.clang_Type_getTemplateArgumentAsType(self, num)  # type: ignore [no-any-return]
 
     def get_canonical(self):
         """
@@ -2485,7 +2525,7 @@ def get_canonical(self):
         example, if 'T' is a typedef for 'int', the canonical type for
         'T' would be 'int'.
         """
-        return conf.lib.clang_getCanonicalType(self)
+        return conf.lib.clang_getCanonicalType(self)  # type: ignore [no-any-return]
 
     def is_const_qualified(self):
         """Determine whether a Type has the "const" qualifier set.
@@ -2493,7 +2533,7 @@ def is_const_qualified(self):
         This does not look through typedefs that may have added "const"
         at a different level.
         """
-        return conf.lib.clang_isConstQualifiedType(self)
+        return conf.lib.clang_isConstQualifiedType(self)  # type: ignore [no-any-return]
 
     def is_volatile_qualified(self):
         """Determine whether a Type has the "volatile" qualifier set.
@@ -2501,7 +2541,7 @@ def is_volatile_qualified(self):
         This does not look through typedefs that may have added "volatile"
         at a different level.
         """
-        return conf.lib.clang_isVolatileQualifiedType(self)
+        return conf.lib.clang_isVolatileQualifiedType(self)  # type: ignore [no-any-return]
 
     def is_restrict_qualified(self):
         """Determine whether a Type has the "restrict" qualifier set.
@@ -2509,83 +2549,83 @@ def is_restrict_qualified(self):
         This does not look through typedefs that may have added "restrict" at
         a different level.
         """
-        return conf.lib.clang_isRestrictQualifiedType(self)
+        return conf.lib.clang_isRestrictQualifiedType(self)  # type: ignore [no-any-return]
 
     def is_function_variadic(self):
         """Determine whether this function Type is a variadic function type."""
         assert self.kind == TypeKind.FUNCTIONPROTO
 
-        return conf.lib.clang_isFunctionTypeVariadic(self)
+        return conf.lib.clang_isFunctionTypeVariadic(self)  # type: ignore [no-any-return]
 
     def get_address_space(self):
-        return conf.lib.clang_getAddressSpace(self)
+        return conf.lib.clang_getAddressSpace(self)  # type: ignore [no-any-return]
 
     def get_typedef_name(self):
-        return conf.lib.clang_getTypedefName(self)
+        return conf.lib.clang_getTypedefName(self)  # type: ignore [no-any-return]
 
     def is_pod(self):
         """Determine whether this Type represents plain old data (POD)."""
-        return conf.lib.clang_isPODType(self)
+        return conf.lib.clang_isPODType(self)  # type: ignore [no-any-return]
 
     def get_pointee(self):
         """
         For pointer types, returns the type of the pointee.
         """
-        return conf.lib.clang_getPointeeType(self)
+        return conf.lib.clang_getPointeeType(self)  # type: ignore [no-any-return]
 
     def get_declaration(self):
         """
         Return the cursor for the declaration of the given type.
         """
-        return conf.lib.clang_getTypeDeclaration(self)
+        return conf.lib.clang_getTypeDeclaration(self)  # type: ignore [no-any-return]
 
     def get_result(self):
         """
         Retrieve the result type associated with a function type.
         """
-        return conf.lib.clang_getResultType(self)
+        return conf.lib.clang_getResultType(self)  # type: ignore [no-any-return]
 
     def get_array_element_type(self):
         """
         Retrieve the type of the elements of the array type.
         """
-        return conf.lib.clang_getArrayElementType(self)
+        return conf.lib.clang_getArrayElementType(self)  # type: ignore [no-any-return]
 
     def get_array_size(self):
         """
         Retrieve the size of the constant array.
         """
-        return conf.lib.clang_getArraySize(self)
+        return conf.lib.clang_getArraySize(self)  # type: ignore [no-any-return]
 
     def get_class_type(self):
         """
         Retrieve the class type of the member pointer type.
         """
-        return conf.lib.clang_Type_getClassType(self)
+        return conf.lib.clang_Type_getClassType(self)  # type: ignore [no-any-return]
 
     def get_named_type(self):
         """
         Retrieve the type named by the qualified-id.
         """
-        return conf.lib.clang_Type_getNamedType(self)
+        return conf.lib.clang_Type_getNamedType(self)  # type: ignore [no-any-return]
 
     def get_align(self):
         """
         Retrieve the alignment of the record.
         """
-        return conf.lib.clang_Type_getAlignOf(self)
+        return conf.lib.clang_Type_getAlignOf(self)  # type: ignore [no-any-return]
 
     def get_size(self):
         """
         Retrieve the size of the record.
         """
-        return conf.lib.clang_Type_getSizeOf(self)
+        return conf.lib.clang_Type_getSizeOf(self)  # type: ignore [no-any-return]
 
     def get_offset(self, fieldname):
         """
         Retrieve the offset of a field in the record.
         """
-        return conf.lib.clang_Type_getOffsetOf(self, fieldname)
+        return conf.lib.clang_Type_getOffsetOf(self, fieldname)  # type: ignore [no-any-return]
 
     def get_ref_qualifier(self):
         """
@@ -2604,10 +2644,8 @@ def visitor(field, children):
             fields.append(field)
             return 1  # continue
 
-        fields = []
-        conf.lib.clang_Type_visitFields(
-            self, callbacks["fields_visit"](visitor), fields
-        )
+        fields: list[Cursor] = []
+        conf.lib.clang_Type_visitFields(self, fields_visit_callback(visitor), fields)
         return iter(fields)
 
     def get_exception_specification_kind(self):
@@ -2622,13 +2660,13 @@ def get_exception_specification_kind(self):
     @property
     def spelling(self):
         """Retrieve the spelling of this Type."""
-        return conf.lib.clang_getTypeSpelling(self)
+        return conf.lib.clang_getTypeSpelling(self)  # type: ignore [no-any-return]
 
     def __eq__(self, other):
         if type(other) != type(self):
             return False
 
-        return conf.lib.clang_equalTypes(self, other)
+        return conf.lib.clang_equalTypes(self, other)  # type: ignore [no-any-return]
 
     def __ne__(self, other):
         return not self.__eq__(other)
@@ -2712,7 +2750,7 @@ def __repr__(self):
     def spelling(self):
         if self.__kindNumber in SpellingCache:
             return SpellingCache[self.__kindNumber]
-        return conf.lib.clang_getCompletionChunkText(self.cs, self.key)
+        return conf.lib.clang_getCompletionChunkText(self.cs, self.key)  # type: ignore [no-any-return]
 
     # We do not use @CachedProperty here, as the manual implementation is
     # apparently still significantly faster. Please profile carefully if you
@@ -2795,7 +2833,7 @@ def __len__(self):
 
     @CachedProperty
     def num_chunks(self):
-        return conf.lib.clang_getNumCompletionChunks(self.obj)
+        return conf.lib.clang_getNumCompletionChunks(self.obj)  # type: ignore [no-any-return]
 
     def __getitem__(self, key):
         if self.num_chunks <= key:
@@ -2804,7 +2842,7 @@ def __getitem__(self, key):
 
     @property
     def priority(self):
-        return conf.lib.clang_getCompletionPriority(self.obj)
+        return conf.lib.clang_getCompletionPriority(self.obj)  # type: ignore [no-any-return]
 
     @property
     def availability(self):
@@ -2814,7 +2852,7 @@ def availability(self):
     @property
     def briefComment(self):
         if conf.function_exists("clang_getCompletionBriefComment"):
-            return conf.lib.clang_getCompletionBriefComment(self.obj)
+            return conf.lib.clang_getCompletionBriefComment(self.obj)  # type: ignore [no-any-return]
         return _CXString()
 
     def __repr__(self):
@@ -2881,16 +2919,16 @@ def results(self):
         return self.ptr.contents
 
     @property
-    def diagnostics(self):
+    def diagnostics(self) -> NoSliceSequence[Diagnostic]:
         class DiagnosticsItr:
-            def __init__(self, ccr):
+            def __init__(self, ccr: CodeCompletionResults):
                 self.ccr = ccr
 
-            def __len__(self):
+            def __len__(self) -> int:
                 return int(conf.lib.clang_codeCompleteGetNumDiagnostics(self.ccr))
 
-            def __getitem__(self, key):
-                return conf.lib.clang_codeCompleteGetDiagnostic(self.ccr, key)
+            def __getitem__(self, key: int) -> Diagnostic:
+                return conf.lib.clang_codeCompleteGetDiagnostic(self.ccr, key)  # type: ignore [no-any-return]
 
         return DiagnosticsItr(self)
 
@@ -2973,6 +3011,20 @@ class TranslationUnit(ClangObject):
     # into the set of code completions returned from this translation unit.
     PARSE_INCLUDE_BRIEF_COMMENTS_IN_CODE_COMPLETION = 128
 
+    @staticmethod
+    def process_unsaved_files(unsaved_files) -> Array[_CXUnsavedFile] | None:
+        unsaved_array = None
+        if len(unsaved_files):
+            unsaved_array = (_CXUnsavedFile * len(unsaved_files))()
+            for i, (name, contents) in enumerate(unsaved_files):
+                if hasattr(contents, "read"):
+                    contents = contents.read()
+                binary_contents = b(contents)
+                unsaved_array[i].name = b(os.fspath(name))
+                unsaved_array[i].contents = binary_contents
+                unsaved_array[i].length = len(binary_contents)
+        return unsaved_array
+
     @classmethod
     def from_source(
         cls, filename, args=None, unsaved_files=None, options=0, index=None
@@ -3029,16 +3081,7 @@ def from_source(
         if len(args) > 0:
             args_array = (c_char_p * len(args))(*[b(x) for x in args])
 
-        unsaved_array = None
-        if len(unsaved_files) > 0:
-            unsaved_array = (_CXUnsavedFile * len(unsaved_files))()
-            for i, (name, contents) in enumerate(unsaved_files):
-                if hasattr(contents, "read"):
-                    contents = contents.read()
-                contents = b(contents)
-                unsaved_array[i].name = b(os.fspath(name))
-                unsaved_array[i].contents = contents
-                unsaved_array[i].length = len(contents)
+        unsaved_array = cls.process_unsaved_files(unsaved_files)
 
         ptr = conf.lib.clang_parseTranslationUnit(
             index,
@@ -3095,12 +3138,12 @@ def __del__(self):
     @property
     def cursor(self):
         """Retrieve the cursor that represents the given translation unit."""
-        return conf.lib.clang_getTranslationUnitCursor(self)
+        return conf.lib.clang_getTranslationUnitCursor(self)  # type: ignore [no-any-return]
 
     @property
     def spelling(self):
         """Get the original translation unit source file name."""
-        return conf.lib.clang_getTranslationUnitSpelling(self)
+        return conf.lib.clang_getTranslationUnitSpelling(self)  # type: ignore [no-any-return]
 
     def get_includes(self):
         """
@@ -3119,7 +3162,7 @@ def visitor(fobj, lptr, depth, includes):
         # Automatically adapt CIndex/ctype pointers to python objects
         includes = []
         conf.lib.clang_getInclusions(
-            self, callbacks["translation_unit_includes"](visitor), includes
+            self, translation_unit_includes_callback(visitor), includes
         )
 
         return iter(includes)
@@ -3187,19 +3230,19 @@ def get_extent(self, filename, locations):
         return SourceRange.from_locations(start_location, end_location)
 
     @property
-    def diagnostics(self):
+    def diagnostics(self) -> NoSliceSequence[Diagnostic]:
         """
         Return an iterable (and indexable) object containing the diagnostics.
         """
 
         class DiagIterator:
-            def __init__(self, tu):
+            def __init__(self, tu: TranslationUnit):
                 self.tu = tu
 
-            def __len__(self):
+            def __len__(self) -> int:
                 return int(conf.lib.clang_getNumDiagnostics(self.tu))
 
-            def __getitem__(self, key):
+            def __getitem__(self, key: int) -> Diagnostic:
                 diag = conf.lib.clang_getDiagnostic(self.tu, key)
                 if not diag:
                     raise IndexError
@@ -3219,16 +3262,7 @@ def reparse(self, unsaved_files=None, options=0):
         if unsaved_files is None:
             unsaved_files = []
 
-        unsaved_files_array = 0
-        if len(unsaved_files):
-            unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
-            for i, (name, contents) in enumerate(unsaved_files):
-                if hasattr(contents, "read"):
-                    contents = contents.read()
-                contents = b(contents)
-                unsaved_files_array[i].name = b(os.fspath(name))
-                unsaved_files_array[i].contents = contents
-                unsaved_files_array[i].length = len(contents)
+        unsaved_files_array = self.process_unsaved_files(unsaved_files)
         ptr = conf.lib.clang_reparseTranslationUnit(
             self, len(unsaved_files), unsaved_files_array, options
         )
@@ -3291,16 +3325,7 @@ def codeComplete(
         if unsaved_files is None:
             unsaved_files = []
 
-        unsaved_files_array = 0
-        if len(unsaved_files):
-            unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
-            for i, (name, contents) in enumerate(unsaved_files):
-                if hasattr(contents, "read"):
-                    contents = contents.read()
-                contents = b(contents)
-                unsaved_files_array[i].name = b(os.fspath(name))
-                unsaved_files_array[i].contents = contents
-                unsaved_files_array[i].length = len(contents)
+        unsaved_files_array = self.process_unsaved_files(unsaved_files)
         ptr = conf.lib.clang_codeCompleteAt(
             self,
             os.fspath(path),
@@ -3344,12 +3369,12 @@ def from_name(translation_unit, file_name):
     @property
     def name(self):
         """Return the complete file and path name of the file."""
-        return conf.lib.clang_getFileName(self)
+        return conf.lib.clang_getFileName(self)  # type: ignore [no-any-return]
 
     @property
     def time(self):
         """Return the last modification time of the file."""
-        return conf.lib.clang_getFileTime(self)
+        return conf.lib.clang_getFileTime(self)  # type: ignore [no-any-return]
 
     def __str__(self):
         return self.name
@@ -3428,12 +3453,12 @@ def __init__(self, cmd, ccmds):
     @property
     def directory(self):
         """Get the working directory for this CompileCommand"""
-        return conf.lib.clang_CompileCommand_getDirectory(self.cmd)
+        return conf.lib.clang_CompileCommand_getDirectory(self.cmd)  # type: ignore [no-any-return]
 
     @property
     def filename(self):
         """Get the working filename for this CompileCommand"""
-        return conf.lib.clang_CompileCommand_getFilename(self.cmd)
+        return conf.lib.clang_CompileCommand_getFilename(self.cmd)  # type: ignore [no-any-return]
 
     @property
     def arguments(self):
@@ -3512,7 +3537,7 @@ def getCompileCommands(self, filename):
         Get an iterable object providing all the CompileCommands available to
         build filename. Returns None if filename is not found in the database.
         """
-        return conf.lib.clang_CompilationDatabase_getCompileCommands(
+        return conf.lib.clang_CompilationDatabase_getCompileCommands(  # type: ignore [no-any-return]
             self, os.fspath(filename)
         )
 
@@ -3521,7 +3546,7 @@ def getAllCompileCommands(self):
         Get an iterable object providing all the CompileCommands available from
         the database.
         """
-        return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self)
+        return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self)  # type: ignore [no-any-return]
 
 
 class Token(Structure):
@@ -3542,7 +3567,7 @@ def spelling(self):
 
         This is the textual representation of the token in source.
         """
-        return conf.lib.clang_getTokenSpelling(self._tu, self)
+        return conf.lib.clang_getTokenSpelling(self._tu, self)  # type: ignore [no-any-return]
 
     @property
     def kind(self):
@@ -3552,12 +3577,12 @@ def kind(self):
     @property
     def location(self):
         """The SourceLocation this Token occurs at."""
-        return conf.lib.clang_getTokenLocation(self._tu, self)
+        return conf.lib.clang_getTokenLocation(self._tu, self)  # type: ignore [no-any-return]
 
     @property
     def extent(self):
         """The SourceRange this Token occupies."""
-        return conf.lib.clang_getTokenExtent(self._tu, self)
+        return conf.lib.clang_getTokenExtent(self._tu, self)  # type: ignore [no-any-return]
 
     @property
     def cursor(self):
@@ -3619,7 +3644,7 @@ def overwrite_changed_files(self):
         Returns 1 if any files were not saved successfully,
         returns 0 otherwise.
         """
-        return conf.lib.clang_CXRewriter_overwriteChangedFiles(self)
+        return conf.lib.clang_CXRewriter_overwriteChangedFiles(self)  # type: ignore [no-any-return]
 
     def write_main_file_to_stdout(self):
         """
@@ -3631,15 +3656,15 @@ def write_main_file_to_stdout(self):
 
 # Now comes the plumbing to hook up the C library.
 
-# Register callback types in common container.
-callbacks["translation_unit_includes"] = CFUNCTYPE(
+# Register callback types
+translation_unit_includes_callback = CFUNCTYPE(
     None, c_object_p, POINTER(SourceLocation), c_uint, py_object
 )
-callbacks["cursor_visit"] = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
-callbacks["fields_visit"] = CFUNCTYPE(c_int, Cursor, py_object)
+cursor_visit_callback = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
+fields_visit_callback = CFUNCTYPE(c_int, Cursor, py_object)
 
 # Functions strictly alphabetical order.
-functionList = [
+functionList: list[LibFunc] = [
     (
         "clang_annotateTokens",
         [TranslationUnit, POINTER(Token), c_uint, POINTER(Cursor)],
@@ -3809,7 +3834,7 @@ def write_main_file_to_stdout(self):
     ("clang_getIncludedFile", [Cursor], c_object_p, File.from_result),
     (
         "clang_getInclusions",
-        [TranslationUnit, callbacks["translation_unit_includes"], py_object],
+        [TranslationUnit, translation_unit_includes_callback, py_object],
     ),
     (
         "clang_getInstantiationLocation",
@@ -3894,7 +3919,7 @@ def write_main_file_to_stdout(self):
         "clang_tokenize",
         [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)],
     ),
-    ("clang_visitChildren", [Cursor, callbacks["cursor_visit"], py_object], c_uint),
+    ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint),
     ("clang_Cursor_getNumArguments", [Cursor], c_int),
     ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor, Cursor.from_result),
     ("clang_Cursor_getNumTemplateArguments", [Cursor], c_int),
@@ -3921,19 +3946,19 @@ def write_main_file_to_stdout(self):
     ("clang_Type_getSizeOf", [Type], c_longlong),
     ("clang_Type_getCXXRefQualifier", [Type], c_uint),
     ("clang_Type_getNamedType", [Type], Type, Type.from_result),
-    ("clang_Type_visitFields", [Type, callbacks["fields_visit"], py_object], c_uint),
+    ("clang_Type_visitFields", [Type, fields_visit_callback, py_object], c_uint),
 ]
 
 
 class LibclangError(Exception):
-    def __init__(self, message):
+    def __init__(self, message: str):
         self.m = message
 
-    def __str__(self):
+    def __str__(self) -> str:
         return self.m
 
 
-def register_function(lib, item, ignore_errors):
+def register_function(lib: CDLL, item: LibFunc, ignore_errors: bool) -> None:
     # A function may not exist, if these bindings are used with an older or
     # incompatible version of libclang.so.
     try:
@@ -3957,15 +3982,15 @@ def register_function(lib, item, ignore_errors):
         func.errcheck = item[3]
 
 
-def register_functions(lib, ignore_errors):
+def register_functions(lib: CDLL, ignore_errors: bool) -> None:
     """Register function prototypes with a libclang library instance.
 
     This must be called as part of library instantiation so Python knows how
     to call out to the shared library.
     """
 
-    def register(item):
-        return register_function(lib, item, ignore_errors)
+    def register(item: LibFunc) -> None:
+        register_function(lib, item, ignore_errors)
 
     for f in functionList:
         register(f)
@@ -3973,12 +3998,12 @@ def register(item):
 
 class Config:
     library_path = None
-    library_file = None
+    library_file: str | None = None
     compatibility_check = True
     loaded = False
 
     @staticmethod
-    def set_library_path(path):
+    def set_library_path(path: StrPath) -> None:
         """Set the path in which to search for libclang"""
         if Config.loaded:
             raise Exception(
@@ -3989,7 +4014,7 @@ def set_library_path(path):
         Config.library_path = os.fspath(path)
 
     @staticmethod
-    def set_library_file(filename):
+    def set_library_file(filename: StrPath) -> None:
         """Set the exact location of libclang"""
         if Config.loaded:
             raise Exception(
@@ -4000,7 +4025,7 @@ def set_library_file(filename):
         Config.library_file = os.fspath(filename)
 
     @staticmethod
-    def set_compatibility_check(check_status):
+    def set_compatibility_check(check_status: bool) -> None:
         """Perform compatibility check when loading libclang
 
         The python bindings are only tested and evaluated with the version of
@@ -4026,13 +4051,13 @@ def set_compatibility_check(check_status):
         Config.compatibility_check = check_status
 
     @CachedProperty
-    def lib(self):
+    def lib(self) -> CDLL:
         lib = self.get_cindex_library()
         register_functions(lib, not Config.compatibility_check)
         Config.loaded = True
         return lib
 
-    def get_filename(self):
+    def get_filename(self) -> str:
         if Config.library_file:
             return Config.library_file
 
@@ -4052,7 +4077,7 @@ def get_filename(self):
 
         return file
 
-    def get_cindex_library(self):
+    def get_cindex_library(self) -> CDLL:
         try:
             library = cdll.LoadLibrary(self.get_filename())
         except OSError as e:
@@ -4065,7 +4090,7 @@ def get_cindex_library(self):
 
         return library
 
-    def function_exists(self, name):
+    def function_exists(self, name: str) -> bool:
         try:
             getattr(self.lib, name)
         except AttributeError:
@@ -4077,6 +4102,7 @@ def function_exists(self, name):
 conf = Config()
 
 __all__ = [
+    "AccessSpecifier",
     "AvailabilityKind",
     "BinaryOperator",
     "Config",
@@ -4087,12 +4113,16 @@ def function_exists(self, name):
     "CursorKind",
     "Cursor",
     "Diagnostic",
+    "ExceptionSpecificationKind",
     "File",
     "FixIt",
     "Index",
     "LinkageKind",
+    "RefQualifierKind",
     "SourceLocation",
     "SourceRange",
+    "StorageClass",
+    "TemplateArgumentKind",
     "TLSKind",
     "TokenKind",
     "Token",
diff --git a/clang/bindings/python/tests/cindex/test_code_completion.py b/clang/bindings/python/tests/cindex/test_code_completion.py
index ca52fc6f73e1d..1d513dbca2536 100644
--- a/clang/bindings/python/tests/cindex/test_code_completion.py
+++ b/clang/bindings/python/tests/cindex/test_code_completion.py
@@ -53,7 +53,7 @@ def test_code_complete(self):
         expected = [
             "{'int', ResultType} | {'test1', TypedText} || Priority: 50 || Availability: Available || Brief comment: Aaa.",
             "{'void', ResultType} | {'test2', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 50 || Availability: Available || Brief comment: Bbb.",
-            "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: None",
+            "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: ",
         ]
         self.check_completion_results(cr, expected)
 
@@ -94,7 +94,7 @@ def test_code_complete_pathlike(self):
         expected = [
             "{'int', ResultType} | {'test1', TypedText} || Priority: 50 || Availability: Available || Brief comment: Aaa.",
             "{'void', ResultType} | {'test2', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 50 || Availability: Available || Brief comment: Bbb.",
-            "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: None",
+            "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: ",
         ]
         self.check_completion_results(cr, expected)
 
@@ -128,19 +128,19 @@ class Q : public P {
         cr = tu.codeComplete("fake.cpp", 12, 5, unsaved_files=files)
 
         expected = [
-            "{'const', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
-            "{'volatile', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
-            "{'operator', TypedText} || Priority: 40 || Availability: Available || Brief comment: None",
-            "{'P', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
-            "{'Q', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
+            "{'const', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+            "{'volatile', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+            "{'operator', TypedText} || Priority: 40 || Availability: Available || Brief comment: ",
+            "{'P', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+            "{'Q', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
         ]
         self.check_completion_results(cr, expected)
 
         cr = tu.codeComplete("fake.cpp", 13, 5, unsaved_files=files)
         expected = [
-            "{'P', TypedText} | {'::', Text} || Priority: 75 || Availability: Available || Brief comment: None",
-            "{'P &', ResultType} | {'operator=', TypedText} | {'(', LeftParen} | {'const P &', Placeholder} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: None",
-            "{'int', ResultType} | {'member', TypedText} || Priority: 35 || Availability: NotAccessible || Brief comment: None",
-            "{'void', ResultType} | {'~P', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: None",
+            "{'P', TypedText} | {'::', Text} || Priority: 75 || Availability: Available || Brief comment: ",
+            "{'P &', ResultType} | {'operator=', TypedText} | {'(', LeftParen} | {'const P &', Placeholder} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: ",
+            "{'int', ResultType} | {'member', TypedText} || Priority: 35 || Availability: NotAccessible || Brief comment: ",
+            "{'void', ResultType} | {'~P', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: ",
         ]
         self.check_completion_results(cr, expected)
diff --git a/clang/bindings/python/tests/cindex/test_comment.py b/clang/bindings/python/tests/cindex/test_comment.py
index 0727c6fa35d95..265c6d3d73de0 100644
--- a/clang/bindings/python/tests/cindex/test_comment.py
+++ b/clang/bindings/python/tests/cindex/test_comment.py
@@ -53,5 +53,5 @@ def test_comment(self):
         f = get_cursor(tu, "f")
         raw = f.raw_comment
         brief = f.brief_comment
-        self.assertIsNone(raw)
-        self.assertIsNone(brief)
+        self.assertEqual(raw, "")
+        self.assertEqual(brief, "")
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index 73ebd36c28496..75ceb799352f9 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -309,7 +309,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi)
   foreach(lang C;CXX;ASM)
     set(BUILTINS_${target}_CMAKE_${lang}_local_flags "--target=${target} -mthumb")
     if(${target} STREQUAL "armv8m.main-unknown-eabi")
-      set(BUILTINS_${target}_CMAKE_${lang}_local_flags "${BUILTINS_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=hard -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
+      set(BUILTINS_${target}_CMAKE_${lang}_local_flags "${BUILTINS_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
     endif()
     set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "${BUILTINS_${target}_CMAKE_${lang}_local_flags}" CACHE STRING "")
   endforeach()
@@ -327,18 +327,24 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi)
   foreach(lang C;CXX;ASM)
     # TODO: The preprocessor defines workaround various issues in libc and libc++ integration.
     # These should be addressed and removed over time.
-    set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
+    set(RUNTIMES_${target}_CMAKE_${lang}_local_flags "--target=${target} -mthumb -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1")
+    if(${target} STREQUAL "armv8m.main-unknown-eabi")
+      set(RUNTIMES_${target}_CMAKE_${lang}_local_flags "${RUNTIMES_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
+    endif()
+    set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "${RUNTIMES_${target}_CMAKE_${lang}_local_flags}" CACHE STRING "")
   endforeach()
   foreach(type SHARED;MODULE;EXE)
     set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
   endforeach()
   set(RUNTIMES_${target}_LLVM_LIBC_FULL_BUILD ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBC_ENABLE_USE_BY_CLANG ON CACHE BOOL "")
+  set(RUNTIMES_${target}_LIBC_USE_NEW_HEADER_GEN OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")
+  set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
@@ -384,11 +390,13 @@ foreach(target riscv32-unknown-elf)
   endforeach()
   set(RUNTIMES_${target}_LLVM_LIBC_FULL_BUILD ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBC_ENABLE_USE_BY_CLANG ON CACHE BOOL "")
+  set(RUNTIMES_${target}_LIBC_USE_NEW_HEADER_GEN OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")
+  set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 4d3af3ad3f403..2d2dcb9ae6798 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -67,9 +67,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
   SWIG_EXECUTABLE
   CMAKE_FIND_PACKAGE_PREFER_CONFIG
   CMAKE_SYSROOT
-  CMAKE_MODULE_LINKER_FLAGS
-  CMAKE_SHARED_LINKER_FLAGS
-  CMAKE_EXE_LINKER_FLAGS
   LLVM_WINSYSROOT
   LLVM_VFSOVERLAY
 )
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index 9e6feb479d45f..e5161dd9a27b9 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -29,9 +29,13 @@ endfunction()
 # cache file to CMake via -C. e.g.
 #
 # cmake -D LLVM_RELEASE_ENABLE_PGO=ON -C Release.cmake
+set (DEFAULT_RUNTIMES "compiler-rt;libcxx")
+if (NOT WIN32)
+  list(APPEND DEFAULT_RUNTIMES "libcxxabi" "libunwind")
+endif()
 set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "")
 set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "")
-set(LLVM_RELEASE_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
+set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "")
 set(LLVM_RELEASE_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
 # Note we don't need to add install here, since it is one of the pre-defined
 # steps.
diff --git a/clang/cmake/modules/AddClang.cmake b/clang/cmake/modules/AddClang.cmake
index 9d09be1936847..5327b5d2f0892 100644
--- a/clang/cmake/modules/AddClang.cmake
+++ b/clang/cmake/modules/AddClang.cmake
@@ -147,6 +147,7 @@ endmacro(add_clang_library)
 macro(add_clang_executable name)
   add_llvm_executable( ${name} ${ARGN} )
   set_clang_windows_version_resource_properties(${name})
+  set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
 endmacro(add_clang_executable)
 
 macro(add_clang_tool name)
@@ -181,6 +182,7 @@ macro(add_clang_tool name)
       set_property(GLOBAL APPEND PROPERTY CLANG_EXPORTS ${name})
     endif()
   endif()
+  set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
 endmacro()
 
 macro(add_clang_symlink name dest)
diff --git a/clang/docs/ClangNVLinkWrapper.rst b/clang/docs/ClangNVLinkWrapper.rst
new file mode 100644
index 0000000000000..2acdb054572f8
--- /dev/null
+++ b/clang/docs/ClangNVLinkWrapper.rst
@@ -0,0 +1,74 @@
+====================
+Clang nvlink Wrapper
+====================
+
+.. contents::
+   :local:
+
+.. _clang-nvlink-wrapper:
+
+Introduction
+============
+
+This tools works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose
+of this wrapper is to provide an interface similar to the ``ld.lld`` linker
+while still relying on NVIDIA's proprietary linker to produce the final output.
+
+``nvlink`` has a number of known quirks that make it difficult to use in a
+unified offloading setting. For example, it does not accept ``.o`` files as they
+must be named ``.cubin``. Static archives do not work, so passing a ``.a`` will
+provide a linker error. ``nvlink`` also does not support link time optimization
+and ignores many standard linker arguments. This tool works around these issues.
+
+Usage
+=====
+
+This tool can be used with the following options. Any arguments not intended
+only for the linker wrapper will be forwarded to ``nvlink``.
+
+.. code-block:: console
+
+  OVERVIEW: A utility that wraps around the NVIDIA 'nvlink' linker.
+  This enables static linking and LTO handling for NVPTX targets.
+
+  USAGE: clang-nvlink-wrapper [options] <options to passed to nvlink>
+
+  OPTIONS:
+    --arch <value>       Specify the 'sm_' name of the target architecture.
+    --cuda-path=<dir>    Set the system CUDA path
+    --dry-run            Print generated commands without running.
+    --feature <value>    Specify the '+ptx' freature to use for LTO.
+    -g                   Specify that this was a debug compile.
+    -help-hidden         Display all available options
+    -help                Display available options (--help-hidden for more)
+    -L <dir>             Add <dir> to the library search path
+    -l <libname>         Search for library <libname>
+    -mllvm <arg>         Arguments passed to LLVM, including Clang invocations,
+                         for which the '-mllvm' prefix is preserved. Use '-mllvm
+                         --help' for a list of options.
+    -o <path>            Path to file to write output
+    --plugin-opt=jobs=<value>
+                         Number of LTO codegen partitions
+    --plugin-opt=lto-partitions=<value>
+                         Number of LTO codegen partitions
+    --plugin-opt=O<O0, O1, O2, or O3>
+                         Optimization level for LTO
+    --plugin-opt=thinlto<value>
+                         Enable the thin-lto backend
+    --plugin-opt=<value> Arguments passed to LLVM, including Clang invocations,
+                         for which the '-mllvm' prefix is preserved. Use '-mllvm
+                         --help' for a list of options.
+    --save-temps         Save intermediate results
+    --version            Display the version number and exit
+    -v                   Print verbose information
+
+Example
+=======
+
+This tool is intended to be invoked when targeting the NVPTX toolchain directly
+as a cross-compiling target. This can be used to create standalone GPU
+executables with normal linking semantics similar to standard compilation.
+
+.. code-block:: console
+
+  clang --target=nvptx64-nvidia-cuda -march=native -flto=full input.c
diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst
index 29154292dc7a5..663aca1f6ddcb 100644
--- a/clang/docs/CommandGuide/clang.rst
+++ b/clang/docs/CommandGuide/clang.rst
@@ -429,7 +429,8 @@ Code Generation Options
 
     :option:`-Ofast` Enables all the optimizations from :option:`-O3` along
     with other aggressive optimizations that may violate strict compliance with
-    language standards.
+    language standards. This is deprecated in favor of :option:`-O3`
+    in combination with :option:`-ffast-math`.
 
     :option:`-Os` Like :option:`-O2` with extra optimizations to reduce code
     size.
diff --git a/clang/docs/HLSL/AvailabilityDiagnostics.rst b/clang/docs/HLSL/AvailabilityDiagnostics.rst
index bb9d02f21dde6..c2f260f268e7b 100644
--- a/clang/docs/HLSL/AvailabilityDiagnostics.rst
+++ b/clang/docs/HLSL/AvailabilityDiagnostics.rst
@@ -52,7 +52,6 @@ If the compilation target is a shader library, only availability based on shader
 
 As a result, availability based on specific shader stage will only be diagnosed in code that is reachable from a shader entry point or library export function. It also means that function bodies might be scanned multiple time. When that happens, care should be taken not to produce duplicated diagnostics.
 
-========
 Examples
 ========
 
@@ -62,7 +61,7 @@ For the example below, the ``WaveActiveCountBits`` API function became available
 The availability of ``ddx`` function depends on a shader stage. It is available for pixel shaders in shader model 2.1 and higher, for compute, mesh and amplification shaders in shader model 6.6 and higher. For any other shader stages it is not available.
 
 Compute shader example
-======================
+----------------------
 
 .. code-block:: c++
 
@@ -94,7 +93,7 @@ With strict diagnostic mode, in addition to the 2 errors above Clang will also e
    <>:7:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.5 or newer
 
 Shader library example
-======================
+----------------------
 
 .. code-block:: c++
 
diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst
index a29b6348e0b8e..4782eb3cda754 100644
--- a/clang/docs/HLSL/ExpectedDifferences.rst
+++ b/clang/docs/HLSL/ExpectedDifferences.rst
@@ -1,4 +1,4 @@
-
+===================================
 Expected Differences vs DXC and FXC
 ===================================
 
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index d09b1f892be22..de0a4fa24f319 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1546,6 +1546,7 @@ The following type trait primitives are supported by Clang. Those traits marked
 * ``__array_extent(type, dim)`` (Embarcadero):
   The ``dim``'th array bound in the type ``type``, or ``0`` if
   ``dim >= __array_rank(type)``.
+* ``__builtin_is_virtual_base_of`` (C++, GNU, Microsoft)
 * ``__can_pass_in_regs`` (C++)
   Returns whether a class can be passed in registers under the current
   ABI. This type can only be applied to unqualified class types.
diff --git a/clang/docs/MSVCCompatibility.rst b/clang/docs/MSVCCompatibility.rst
index b2486052abf9a..0b6fea597f8d3 100644
--- a/clang/docs/MSVCCompatibility.rst
+++ b/clang/docs/MSVCCompatibility.rst
@@ -154,3 +154,133 @@ a hint suggesting how to fix the problem.
 As of this writing, Clang is able to compile a simple ATL hello world
 application.  There are still issues parsing WRL headers for modern Windows 8
 apps, but they should be addressed soon.
+
+__forceinline behavior
+======================
+
+``__forceinline`` behaves like ``[[clang::always_inline]]``.
+Inlining is always attempted regardless of optimization level.
+
+This differs from MSVC where ``__forceinline`` is only respected once inline expansion is enabled
+which allows any function marked implicitly or explicitly ``inline`` or ``__forceinline`` to be expanded.
+Therefore functions marked ``__forceinline`` will be expanded when the optimization level is ``/Od`` unlike
+MSVC where ``__forceinline`` will not be expanded under ``/Od``.
+
+SIMD and instruction set intrinsic behavior
+===========================================
+
+Clang follows the GCC model for intrinsics and not the MSVC model.
+There are currently no plans to support the MSVC model.
+
+MSVC intrinsics always emit the machine instruction the intrinsic models regardless of the compile time options specified.
+For example ``__popcnt`` always emits the x86 popcnt instruction even if the compiler does not have the option enabled to emit popcnt on its own volition.
+
+There are two common cases where code that compiles with MSVC will need reworking to build on clang.
+Assume the examples are only built with `-msse2` so we do not have the intrinsics at compile time.
+
+.. code-block:: c++
+
+  unsigned PopCnt(unsigned v) {
+    if (HavePopCnt)
+      return __popcnt(v);
+    else
+      return GenericPopCnt(v);
+  }
+
+.. code-block:: c++
+
+  __m128 dot4_sse3(__m128 v0, __m128 v1) {
+    __m128 r = _mm_mul_ps(v0, v1);
+    r = _mm_hadd_ps(r, r);
+    r = _mm_hadd_ps(r, r);
+    return r;
+  }
+
+Clang expects that either you have compile time support for the target features, `-msse3` and `-mpopcnt`, you mark the function with the expected target feature or use runtime detection with an indirect call.
+
+.. code-block:: c++
+
+  __attribute__((__target__("sse3"))) __m128 dot4_sse3(__m128 v0, __m128 v1) {
+    __m128 r = _mm_mul_ps(v0, v1);
+    r = _mm_hadd_ps(r, r);
+    r = _mm_hadd_ps(r, r);
+    return r;
+  }
+
+The SSE3 dot product can be easily fixed by either building the translation unit with SSE3 support or using `__target__` to compile that specific function with SSE3 support.
+
+.. code-block:: c++
+
+  unsigned PopCnt(unsigned v) {
+    if (HavePopCnt)
+      return __popcnt(v);
+    else
+      return GenericPopCnt(v);
+  }
+
+The above ``PopCnt`` example must be changed to work with clang. If we mark the function with `__target__("popcnt")` then the compiler is free to emit popcnt at will which we do not want. While this isn't a concern in our small example it is a concern in larger functions with surrounding code around the intrinsics. Similar reasoning for compiling the translation unit with `-mpopcnt`.
+We must split each branch into its own function that can be called indirectly instead of using the intrinsic directly.
+
+.. code-block:: c++
+
+  __attribute__((__target__("popcnt"))) unsigned hwPopCnt(unsigned v) { return __popcnt(v); }
+  unsigned (*PopCnt)(unsigned) = HavePopCnt ? hwPopCnt : GenericPopCnt;
+
+.. code-block:: c++
+
+  __attribute__((__target__("popcnt"))) unsigned hwPopCnt(unsigned v) { return __popcnt(v); }
+  unsigned PopCnt(unsigned v) {
+    if (HavePopCnt)
+      return hwPopCnt(v);
+    else
+      return GenericPopCnt(v);
+  }
+
+In the above example ``hwPopCnt`` will not be inlined into ``PopCnt`` since ``PopCnt`` doesn't have the popcnt target feature.
+With a larger function that does real work the function call overhead is negligible. However in our popcnt example there is the function call
+overhead. There is no analog for this specific MSVC behavior in clang.
+
+For clang we effectively have to create the dispatch function ourselves to each specfic implementation.
+
+SIMD vector types
+=================
+
+Clang's simd vector types are builtin types and not user defined types as in MSVC. This does have some observable behavior changes.
+We will look at the x86 `__m128` type for the examples below but the statements apply to all vector types including ARM's `float32x4_t`.
+
+There are no members that can be accessed on the vector types. Vector types are not structs in clang.
+You cannot use ``__m128.m128_f32[0]`` to access the first element of the `__m128`.
+This also means struct initialization like ``__m128{ { 0.0f, 0.0f, 0.0f, 0.0f } }`` will not compile with clang.
+
+Since vector types are builtin types, clang implements operators on them natively.
+
+.. code-block:: c++
+
+  #ifdef _MSC_VER
+  __m128 operator+(__m128 a, __m128 b) { return _mm_add_ps(a, b); }
+  #endif
+
+The above code will fail to compile since overloaded 'operator+' must have at least one parameter of class or enumeration type.
+You will need to fix such code to have the check ``#if defined(_MSC_VER) && !defined(__clang__)``.
+
+Since `__m128` is not a class type in clang any overloads after a template definition will not be considered.
+
+.. code-block:: c++
+
+  template<class T>
+  void foo(T) {}
+
+  template<class T>
+  void bar(T t) {
+    foo(t);
+  }
+
+  void foo(__m128) {}
+
+  int main() {
+    bar(_mm_setzero_ps());
+  }
+
+With MSVC ``foo(__m128)`` will be selected but with clang ``foo<__m128>()`` will be selected since on clang `__m128` is a builtin type.
+
+In general the takeaway is `__m128` is a builtin type on clang while a class type on MSVC.
diff --git a/clang/docs/MemorySanitizer.rst b/clang/docs/MemorySanitizer.rst
index bcc6cc808e8ba..05e43a32b9b87 100644
--- a/clang/docs/MemorySanitizer.rst
+++ b/clang/docs/MemorySanitizer.rst
@@ -8,11 +8,18 @@ MemorySanitizer
 Introduction
 ============
 
-MemorySanitizer is a detector of uninitialized reads. It consists of a
+MemorySanitizer is a detector of uninitialized memory use. It consists of a
 compiler instrumentation module and a run-time library.
 
 Typical slowdown introduced by MemorySanitizer is **3x**.
 
+Here is a not comprehensive of list cases when MemorySanitizer will report an error:
+
+* Uninitialized value was used in a conditional branch.
+* Uninitialized pointer was used for memory accesses.
+* Uninitialized value was passed or returned from a function call, which is considered an undefined behavior. The check can be disabled with ``-fno-sanitize-memory-param-retval``.
+* Uninitialized data was passed into some libc calls.
+
 How to build
 ============
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 1c1b874273a7c..866adefd5d3c4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -37,135 +37,28 @@ These changes are ones which we think may surprise users when upgrading to
 Clang |release| because of the opportunity they pose for disruption to existing
 code bases.
 
-- Setting the deprecated CMake variable ``GCC_INSTALL_PREFIX`` (which sets the
-  default ``--gcc-toolchain=``) now leads to a fatal error.
-
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
 
-- Clang now supports raw string literals in ``-std=gnuXY`` mode as an extension in
-  C99 and later. This behaviour can also be overridden using ``-f[no-]raw-string-literals``.
-  Support of raw string literals in C++ is not affected. Fixes (#GH85703).
-
 C++ Specific Potentially Breaking Changes
 -----------------------------------------
-- Clang now diagnoses function/variable templates that shadow their own template parameters, e.g. ``template<class T> void T();``.
-  This error can be disabled via `-Wno-strict-primary-template-shadow` for compatibility with previous versions of clang.
-
-- The behavior controlled by the `-frelaxed-template-template-args` flag is now
-  on by default, and the flag is deprecated. Until the flag is finally removed,
-  it's negative spelling can be used to obtain compatibility with previous
-  versions of clang. The deprecation warning for the negative spelling can be
-  disabled with `-Wno-deprecated-no-relaxed-template-template-args`.
-
-- Clang now rejects pointer to member from parenthesized expression in unevaluated context such as ``decltype(&(foo::bar))``. (#GH40906).
-
-- Clang now performs semantic analysis for unary operators with dependent operands
-  that are known to be of non-class non-enumeration type prior to instantiation.
-
-  This change uncovered a bug in libstdc++ 14.1.0 which may cause compile failures
-  on systems using that version of libstdc++ and Clang 19, with an error that looks
-  something like this:
-
-  .. code-block:: text
-
-    <source>:4:5: error: expression is not assignable
-    4 |     ++this;
-      |     ^ ~~~~
-
-  To fix this, update libstdc++ to version 14.1.1 or greater.
 
 ABI Changes in This Version
 ---------------------------
-- Fixed Microsoft name mangling of implicitly defined variables used for thread
-  safe static initialization of static local variables. This change resolves
-  incompatibilities with code compiled by MSVC but might introduce
-  incompatibilities with code compiled by earlier versions of Clang when an
-  inline member function that contains a static local variable with a dynamic
-  initializer is declared with ``__declspec(dllimport)``. (#GH83616).
-
-- Fixed Microsoft name mangling of lifetime extended temporary objects. This
-  change corrects missing back reference registrations that could result in
-  incorrect back reference indexes and suprising demangled name results. Since
-  MSVC uses a different mangling for these objects, compatibility is not affected.
-  (#GH85423).
-
-- Fixed Microsoft calling convention for returning certain classes with a
-  templated constructor. If a class has a templated constructor, it should
-  be returned indirectly even if it meets all the other requirements for
-  returning a class in a register. This affects some uses of std::pair.
-  (#GH86384).
-
-- Fixed Microsoft calling convention when returning classes that have a deleted
-  copy assignment operator. Such a class should be returned indirectly.
-
-- Removed the global alias that was pointing to AArch64 Function Multiversioning
-  ifuncs. Its purpose was to preserve backwards compatibility when the ".ifunc"
-  suffix got removed from the name mangling. The alias interacts badly with
-  GlobalOpt (see the issue #96197).
-
-- Fixed Microsoft name mangling for auto non-type template arguments of pointer
-  type for MSVC 1920+. This change resolves incompatibilities with code compiled
-  by MSVC 1920+ but will introduce incompatibilities with code compiled by
-  earlier versions of Clang unless such code is built with the compiler option
-  `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
-
-- Fixed Microsoft name mangling for auto non-type template arguments of pointer
-  to member type for MSVC 1920+. This change resolves incompatibilities with code
-  compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by
-  earlier versions of Clang unless such code is built with the compiler option
-  `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
-  (GH#70899).
 
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
 
-- The text ast-dumper has improved printing of TemplateArguments.
-- The text decl-dumper prints template parameters' trailing requires expressions now.
-
 Clang Frontend Potentially Breaking Changes
 -------------------------------------------
-- Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead
-  use ``ArrayRef<TemplateArgument>`` to pass template arguments. Transitioning internal uses to
-  ``ArrayRef<TemplateArgument>`` reduces AST memory usage by 0.4% when compiling clang, and is
-  expected to show similar improvements on other workloads.
-
-- The ``-Wgnu-binary-literal`` diagnostic group no longer controls any
-  diagnostics. Binary literals are no longer a GNU extension, they're now a C23
-  extension which is controlled via ``-pedantic`` or ``-Wc23-extensions``. Use
-  of ``-Wno-gnu-binary-literal`` will no longer silence this pedantic warning,
-  which may break existing uses with ``-Werror``.
-
-- The normalization of 3 element target triples where ``-none-`` is the middle
-  element has changed. For example, ``armv7m-none-eabi`` previously normalized
-  to ``armv7m-none-unknown-eabi``, with ``none`` for the vendor and ``unknown``
-  for the operating system. It now normalizes to ``armv7m-unknown-none-eabi``,
-  which has ``unknown`` vendor and ``none`` operating system.
-
-  The affected triples are primarily for bare metal Arm where it is intended
-  that ``none`` means that there is no operating system. As opposed to an unknown
-  type of operating system.
-
-  This change my cause clang to not find libraries, or libraries to be built at
-  different file system locations. This can be fixed by changing your builds to
-  use the new normalized triple. However, we recommend instead getting the
-  normalized triple from clang itself, as this will make your builds more
-  robust in case of future changes::
-
-    $ clang --target=<your target triple> -print-target-triple
-    <the normalized target triple>
-
-- The ``hasTypeLoc`` AST matcher will no longer match a ``classTemplateSpecializationDecl``;
-  existing uses should switch to ``templateArgumentLoc`` or ``hasAnyTemplateArgumentLoc`` instead.
 
 Clang Python Bindings Potentially Breaking Changes
 --------------------------------------------------
-- Renamed ``CursorKind`` variant 272 from ``OMP_TEAMS_DISTRIBUTE_DIRECTIVE``
-  to ``OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE``. The previous name was incorrect, it was a duplicate
-  of variant 271.
-- Renamed ``TypeKind`` variant 162 from ``OBJCCLASS`` to ``OBJCTYPEPARAM``.
-  The previous name was incorrect, it was a duplicate of variant 28.
-- Refactored enum implementation, switching to the standard library `Enum` type.
+- Parts of the interface returning string results will now return
+  the empty string `""` when no result is available, instead of `None`.
+- Calling a property on the `CompletionChunk` or `CompletionString` class
+  statically now leads to an error, instead of returning a `CachedProperty` object
+  that is used internally. Properties are only available on instances.
 
 What's New in Clang |release|?
 ==============================
@@ -174,909 +67,126 @@ here. Generic improvements to Clang as a whole or to its underlying
 infrastructure are described first, followed by language-specific
 sections with improvements to Clang's support for those languages.
 
-- The ``\par`` documentation comment command now supports an optional
-  argument, which denotes the header of the paragraph started by
-  an instance of the ``\par`` command comment. The implementation
-  of the argument handling matches its semantics
-  `in Doxygen <https://www.doxygen.nl/manual/commands.html#cmdpar>`.
-  Namely, any text on the same line as the ``\par`` command will become
-  a header for the paragaph, and if there is no text then the command
-  will start a new paragraph.
-
 C++ Language Changes
 --------------------
-- C++17 support is now completed, with the enablement of the
-  relaxed temlate template argument matching rules introduced in P0522,
-  which was retroactively applied as a defect report.
-  While the implementation already existed since Clang 4, it was turned off by
-  default, and was controlled with the `-frelaxed-template-template-args` flag.
-  In this release, we implement provisional wording for a core defect on
-  P0522 (CWG2398), which avoids the most serious compatibility issues caused
-  by it, allowing us to enable it by default in this release.
-  The flag is now deprecated, and will be removed in the next release, but can
-  still be used to turn it off and regain compatibility with previous versions
-  (#GH36505).
-- Implemented ``_BitInt`` literal suffixes ``__wb`` or ``__WB`` as a Clang extension with ``unsigned`` modifiers also allowed. (#GH85223).
 
 C++17 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
-- Clang now exposes ``__GCC_DESTRUCTIVE_SIZE`` and ``__GCC_CONSTRUCTIVE_SIZE``
-  predefined macros to support standard library implementations of
-  ``std::hardware_destructive_interference_size`` and
-  ``std::hardware_constructive_interference_size``, respectively. These macros
-  are predefined in all C and C++ language modes. The values the macros
-  expand to are not stable between releases of Clang and do not need to match
-  the values produced by GCC, so these macros should not be used from header
-  files because they may not be stable across multiple TUs (the values may vary
-  based on compiler version as well as CPU tuning). #GH60174
 
 C++14 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
-- Sized deallocation is enabled by default in C++14 onwards. The user may specify
-  ``-fno-sized-deallocation`` to disable it if there are some regressions.
 
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
-- Clang won't perform ODR checks for decls in the global module fragment any
-  more to ease the implementation and improve the user's using experience.
-  This follows the MSVC's behavior. Users interested in testing the more strict
-  behavior can use the flag '-Xclang -fno-skip-odr-check-in-gmf'.
-  (#GH79240).
-
-- Implemented the `__is_layout_compatible` and `__is_pointer_interconvertible_base_of`
-  intrinsics to support
-  `P0466R5: Layout-compatibility and Pointer-interconvertibility Traits <https://wg21.link/P0466R5>`_.
-
-- Clang now implements [module.import]p7 fully. Clang now will import module
-  units transitively for the module units coming from the same module of the
-  current module units. Fixes #GH84002
-
-- Initial support for class template argument deduction (CTAD) for type alias
-  templates (`P1814R0 <https://wg21.link/p1814r0>`_).
-  (#GH54051).
-
-- We have sufficient confidence and experience with the concepts implementation
-  to update the ``__cpp_concepts`` macro to `202002L`. This enables
-  ``<expected>`` from libstdc++ to work correctly with Clang.
-
-- User defined constructors are allowed for copy-list-initialization with CTAD.
-  (#GH62925).
-
 C++23 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
-
-- Implemented `P2718R0: Lifetime extension in range-based for loops <https://wg21.link/P2718R0>`_. Also
-  materialize temporary object which is a prvalue in discarded-value expression.
-- Implemented `P1774R8: Portable assumptions <https://wg21.link/P1774R8>`_.
-
-- Implemented `P2448R2: Relaxing some constexpr restrictions <https://wg21.link/P2448R2>`_.
-  Note, the ``-Winvalid-constexpr`` diagnostic is now disabled in C++23 mode,
-  but can be explicitly specified to retain the old diagnostic checking
-  behavior.
-
-- Added a ``__reference_converts_from_temporary`` builtin, completing the necessary compiler support for
-  `P2255R2: Type trait to determine if a reference binds to a temporary <https://wg21.link/P2255R2>`_.
-
-- Implemented `P2797R0: Static and explicit object member functions with the same parameter-type-lists <https://wg21.link/P2797R0>`_.
-  This completes the support for "deducing this".
+- Removed the restriction to literal types in constexpr functions in C++23 mode.
 
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
-- Implemented `P2662R3 Pack Indexing <https://wg21.link/P2662R3>`_.
-
-- Implemented `P2573R2: = delete("should have a reason"); <https://wg21.link/P2573R2>`_
-
-- Implemented `P0609R3: Attributes for Structured Bindings <https://wg21.link/P0609R3>`_
-
-- Implemented `P2748R5 Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
-
-- Implemented `P2809R3: Trivial infinite loops are not Undefined Behavior <https://wg21.link/P2809R3>`_.
-
-- Implemented `P3144R2 Deleting a Pointer to an Incomplete Type Should be Ill-formed <https://wg21.link/P3144R2>`_.
-
-- Implemented `P2963R3 Ordering of constraints involving fold expressions <https://wg21.link/P2963R3>`_.
-
+- Add ``__builtin_is_virtual_base_of`` intrinsic, which supports
+  `P2985R0 A type trait for detecting virtual base classes <https://wg21.link/p2985r0>`_
 
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-- Substitute template parameter pack, when it is not explicitly specified
-  in the template parameters, but is deduced from a previous argument. (#GH78449)
-
-- Type qualifications are now ignored when evaluating layout compatibility
-  of two types.
-  (`CWG1719: Layout compatibility and cv-qualification revisited <https://cplusplus.github.io/CWG/issues/1719.html>`_).
-
-- Alignment of members is now respected when evaluating layout compatibility
-  of structs.
-  (`CWG2583: Common initial sequence should consider over-alignment <https://cplusplus.github.io/CWG/issues/2583.html>`_).
-
-- ``[[no_unique_address]]`` is now respected when evaluating layout
-  compatibility of two types.
-  (`CWG2759: [[no_unique_address] and common initial sequence  <https://cplusplus.github.io/CWG/issues/2759.html>`_).
-
-- Clang now diagnoses declarative nested-name-specifiers with pack-index-specifiers.
-  (`CWG2858: Declarative nested-name-specifiers and pack-index-specifiers <https://cplusplus.github.io/CWG/issues/2858.html>`_).
-
-- Clang now allows attributes on concepts.
-  (`CWG2428: Deprecating a concept <https://cplusplus.github.io/CWG/issues/2428.html>`_).
-
-- P0522 implementation is enabled by default in all language versions, and
-  provisional wording for CWG2398 is implemented.
-
-- Clang now performs type-only lookup for the name in ``using enum`` declaration.
-  (`CWG2877: Type-only lookup for using-enum-declarator <https://cplusplus.github.io/CWG/issues/2877.html>`_).
-
-- Clang now requires a template argument list after a template keyword.
-  (`CWG96: Syntactic disambiguation using the template keyword <https://cplusplus.github.io/CWG/issues/96.html>`_).
-
-- Clang now considers ``noexcept(typeid(expr))`` more carefully, instead of always assuming that ``std::bad_typeid`` can be thrown.
-  (`CWG2191: Incorrect result for noexcept(typeid(v)) <https://cplusplus.github.io/CWG/issues/2191.html>`_).
 
 C Language Changes
 ------------------
 
 C2y Feature Support
 ^^^^^^^^^^^^^^^^^^^
-- Clang now enables C2y mode with ``-std=c2y``. This sets ``__STDC_VERSION__``
-  to ``202400L`` so that it's greater than the value for C23. The value of this
-  macro is subject to change in the future.
 
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
-- No longer diagnose use of binary literals as an extension in C23 mode. Fixes
-  #GH72017.
-
-- Corrected parsing behavior for the ``alignas`` specifier/qualifier in C23. We
-  previously handled it as an attribute as in C++, but there are parsing
-  differences. The behavioral differences are:
-
-  .. code-block:: c
-
-     struct alignas(8) /* was accepted, now rejected */ S {
-       char alignas(8) /* was rejected, now accepted */ C;
-     };
-     int i alignas(8) /* was accepted, now rejected */ ;
-
-  Fixes (#GH81472).
-
-- Clang now generates predefined macros of the form ``__TYPE_FMTB__`` and
-  ``__TYPE_FMTb__`` (e.g., ``__UINT_FAST64_FMTB__``) in C23 mode for use with
-  macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. (#GH81896)
-
-- Clang now supports `N3018 The constexpr specifier for object definitions`
-  <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3018.htm>`_.
-
-- Properly promote bit-fields of bit-precise integer types to the field's type
-  rather than to ``int``. #GH87641
-
-- Added the ``INFINITY`` and ``NAN`` macros to Clang's ``<float.h>``
-  freestanding implementation; these macros were defined in ``<math.h>`` in C99
-  but C23 added them to ``<float.h>`` in
-  `WG14 N2848 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2848.pdf>`_.
-
-- Clang now supports `N3017 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3017.htm>`_
-  ``#embed`` - a scannable, tooling-friendly binary resource inclusion mechanism.
-
-- Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the
-  freestanding implementation of ``<float.h>`` that ships with Clang.
-
-- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings`
-  <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
-  literals are now of type ``char8_t[N]`` in C23 and expose
-  ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
-  implement the corresponding macro in ``<stdatomic.h>``.
-
-Non-comprehensive list of changes in this release
--------------------------------------------------
-
-- Added ``__builtin_readsteadycounter`` for reading fixed frequency hardware
-  counters.
-
-- ``__builtin_addc``, ``__builtin_subc``, and the other sizes of those
-  builtins are now constexpr and may be used in constant expressions.
-
-- Added ``__builtin_popcountg`` as a type-generic alternative to
-  ``__builtin_popcount{,l,ll}`` with support for any unsigned integer type. Like
-  the previous builtins, this new builtin is constexpr and may be used in
-  constant expressions.
-
-- Lambda expressions are now accepted in C++03 mode as an extension.
-
-- Added ``__builtin_clzg`` and ``__builtin_ctzg`` as type-generic alternatives
-  to ``__builtin_clz{,s,l,ll}`` and ``__builtin_ctz{,s,l,ll}`` respectively,
-  with support for any unsigned integer type. Like the previous builtins, these
-  new builtins are constexpr and may be used in constant expressions.
-
-- ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
-  like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.
-
-- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
-
-* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
-  ``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
-  floating-point mode by default. This decision can be overridden with use of
-  ``-mdaz-ftz``. This behavior now matches GCC's behavior.
-  (`#57589 <https://github.com/llvm/llvm-project/issues/57589>`_)
-
-* ``-fdenormal-fp-math=preserve-sign`` is no longer implied by ``-ffast-math``
-  on x86 systems.
-
-- Builtins ``__builtin_shufflevector()`` and ``__builtin_convertvector()`` may
-  now be used within constant expressions.
-
-- When compiling a constexpr function, Clang will check to see whether the
-  function can *never* be used in a constant expression context and issues a
-  diagnostic under the ``-Winvalid-constexpr`` diagostic flag (which defaults
-  to an error). This check can be expensive because the mere presence of a
-  function marked ``constexpr`` will cause us to undergo constant expression
-  evaluation, even if the function is not called within the translation unit
-  being compiled. Due to the expense, Clang no longer checks constexpr function
-  bodies when the function is defined in a system header file or when
-  ``-Winvalid-constexpr`` is not enabled for the function definition, which
-  should result in mild compile-time performance improvements.
-
-- Added ``__is_bitwise_cloneable`` which is used to check whether a type
-  can be safely copied by memcpy/memmove.
-
-- ``#pragma GCC diagnostic warning "-Wfoo"`` can now downgrade ``-Werror=foo``
-  errors and certain default-to-error ``-W`` diagnostics to warnings.
-
-- Support importing C++20 modules in clang-repl.
-
-- Added support for ``TypeLoc::dump()`` for easier debugging, and improved
-  textual and JSON dumping for various ``TypeLoc``-related nodes.
 
 New Compiler Flags
 ------------------
-- ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and
-  sign change.
-- ``-fsanitize=implicit-integer-conversion`` a group that replaces the previous
-  group ``-fsanitize=implicit-conversion``.
-
-- ``-Wmissing-designated-field-initializers``, grouped under ``-Wmissing-field-initializers``.
-  This diagnostic can be disabled to make ``-Wmissing-field-initializers`` behave
-  like it did before Clang 18.x. Fixes #GH56628
-
-- ``-fexperimental-modules-reduced-bmi`` enables the Reduced BMI for C++20 named modules.
-  See the document of standard C++ modules for details.
-
-- ``-fexperimental-late-parse-attributes`` enables an experimental feature to
-  allow late parsing certain attributes in specific contexts where they would
-  not normally be late parsed. Currently this allows late parsing the
-  `counted_by` attribute in C. See `Attribute Changes in Clang`_.
-
-- ``-fseparate-named-sections`` uses separate unique sections for global
-  symbols in named special sections (i.e. symbols annotated with
-  ``__attribute__((section(...)))``. This enables linker GC to collect unused
-  symbols without having to use a per-symbol section.
-
-- ``-fms-define-stdc`` and its clang-cl counterpart ``/Zc:__STDC__``.
-  Matches MSVC behaviour by defining ``__STDC__`` to ``1`` when
-  MSVC compatibility mode is used. It has no effect for C++ code.
-
-- ``-Wc++23-compat`` group was added to help migrating existing codebases
-  to C++23.
-
-- ``-Wc++2c-compat`` group was added to help migrating existing codebases
-  to upcoming C++26.
-
-- ``-fdisable-block-signature-string`` instructs clang not to emit the signature
-  string for blocks. Disabling the string can potentially break existing code
-  that relies on it. Users should carefully consider this possibiilty when using
-  the flag.
-
-- For the ARM target, added ``-Warm-interrupt-vfp-clobber`` that will emit a
-  diagnostic when an interrupt handler is declared and VFP is enabled.
 
 Deprecated Compiler Flags
 -------------------------
 
 Modified Compiler Flags
 -----------------------
-- Added a new diagnostic flag ``-Wreturn-mismatch`` which is grouped under
-  ``-Wreturn-type``, and moved some of the diagnostics previously controlled by
-  ``-Wreturn-type`` under this new flag. Fixes #GH72116.
-- ``-fsanitize=implicit-conversion`` is now a group for both
-  ``-fsanitize=implicit-integer-conversion`` and
-  ``-fsanitize=implicit-bitfield-conversion``.
-
-- Added ``-Wcast-function-type-mismatch`` under the ``-Wcast-function-type``
-  warning group. Moved the diagnostic previously controlled by
-  ``-Wcast-function-type`` to the new warning group and added
-  ``-Wcast-function-type-mismatch`` to ``-Wextra``. #GH76872
-
-  .. code-block:: c
-
-     int x(long);
-     typedef int (f2)(void*);
-     typedef int (f3)();
-
-     void func(void) {
-       // Diagnoses under -Wcast-function-type, -Wcast-function-type-mismatch,
-       // -Wcast-function-type-strict, -Wextra
-       f2 *b = (f2 *)x;
-       // Diagnoses under -Wcast-function-type, -Wcast-function-type-strict
-       f3 *c = (f3 *)x;
-     }
-
-- Carved out ``-Wformat`` warning about scoped enums into a subwarning and
-  make it controlled by ``-Wformat-pedantic``. Fixes #GH88595.
-
-- Trivial infinite loops (i.e loops with a constant controlling expresion
-  evaluating to ``true`` and an empty body such as ``while(1);``)
-  are considered infinite, even when the ``-ffinite-loop`` flag is set.
-
-- Diagnostics groups about compatibility with a particular C++ Standard version
-  now include dianostics about C++26 features that are not present in older
-  versions.
-
-- Removed the "arm interrupt calling convention" warning that was included in
-  ``-Wextra`` without its own flag. This warning suggested adding
-  ``__attribute__((interrupt))`` to functions that are called from interrupt
-  handlers to prevent clobbering VFP registers. Following this suggestion leads
-  to unpredictable behavior by causing multiple exception returns from one
-  exception. Fixes #GH34876.
 
 Removed Compiler Flags
 -------------------------
 
-- The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13.
-- ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch.
-  ``-m[no-]strict-align``, also supported by GCC, should be used instead. (#GH85350)
-
 Attribute Changes in Clang
 --------------------------
-- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
-  ``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
-  attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
-  ``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
-  and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and
-  ``z`` are optional with default value of 1.
-
-- The ``swiftasynccc`` attribute is now considered to be a Clang extension
-  rather than a language standard feature. Please use
-  ``__has_extension(swiftasynccc)`` to check the availability of this attribute
-  for the target platform instead of ``__has_feature(swiftasynccc)``. Also,
-  added a new extension query ``__has_extension(swiftcc)`` corresponding to the
-  ``__attribute__((swiftcc))`` attribute.
-
-- The ``_Nullable`` and ``_Nonnull`` family of type attributes can now apply
-  to certain C++ class types, such as smart pointers:
-  ``void useObject(std::unique_ptr<Object> _Nonnull obj);``.
-
-  This works for standard library types including ``unique_ptr``, ``shared_ptr``,
-  and ``function``. See
-  `the attribute reference documentation <https://llvm.org/docs/AttributeReference.html#nullability-attributes>`_
-  for the full list.
-
-- The ``_Nullable`` attribute can be applied to C++ class declarations:
-  ``template <class T> class _Nullable MySmartPointer {};``.
-
-  This allows the ``_Nullable`` and ``_Nonnull`` family of type attributes to
-  apply to this class.
-
-- Clang now warns that the ``exclude_from_explicit_instantiation`` attribute
-  is ignored when applied to a local class or a member thereof.
-
-- The ``clspv_libclc_builtin`` attribute has been added to allow clspv
-  (`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
-  (`OpenCL-C builtin library <https://libclc.llvm.org>`_).
-- The ``counted_by`` attribute is now allowed on pointers that are members of a
-  struct in C.
-
-- The ``counted_by`` attribute can now be late parsed in C when
-  ``-fexperimental-late-parse-attributes`` is passed but only when attribute is
-  used in the declaration attribute position. This allows using the
-  attribute on existing code where it previously impossible to do so without
-  re-ordering struct field declarations would break ABI as shown below.
-
-  .. code-block:: c
-
-     struct BufferTy {
-       /* Refering to `count` requires late parsing */
-       char* buffer __counted_by(count);
-       /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
-       size_t count;
-     };
-
-- The attributes ``sized_by``, ``counted_by_or_null`` and ``sized_by_or_null```
-  have been added as variants on ``counted_by``, each with slightly different semantics.
-  ``sized_by`` takes a byte size parameter instead of an element count, allowing pointees
-  with unknown size. The ``counted_by_or_null`` and ``sized_by_or_null`` variants are equivalent
-  to their base variants, except the pointer can be null regardless of count/size value.
-  If the pointer is null the size is effectively 0. ``sized_by_or_null`` is needed to properly
-  annotate allocator functions like ``malloc`` that return a buffer of a given byte size, but can
-  also return null.
-
-- The ``guarded_by``, ``pt_guarded_by``, ``acquired_after``, ``acquired_before``
-  attributes now support referencing struct members in C. The arguments are also
-  now late parsed when ``-fexperimental-late-parse-attributes`` is passed like
-  for ``counted_by``.
-
-- Introduced new function type attributes ``[[clang::nonblocking]]``, ``[[clang::nonallocating]]``,
-  ``[[clang::blocking]]``, and ``[[clang::allocating]]``, with GNU-style variants as well.
-  The attributes declare constraints about a function's behavior pertaining to blocking and
-  heap memory allocation.
-
-Improvements to Clang's diagnostics
------------------------------------
-- Clang now emits an error instead of a warning for ``-Wundefined-internal``
-  when compiling with `-pedantic-errors` to conform to the C standard
-
-- Clang now applies syntax highlighting to the code snippets it
-  prints.
-
-- Clang now diagnoses member template declarations with multiple declarators.
-
-- Clang now diagnoses use of the ``template`` keyword after declarative nested
-  name specifiers.
-
-- The ``-Wshorten-64-to-32`` diagnostic is now grouped under ``-Wimplicit-int-conversion`` instead
-   of ``-Wconversion``. Fixes #GH69444.
-
-- Clang now uses thousand separators when printing large numbers in integer overflow diagnostics.
-  Fixes #GH80939.
 
-- Clang now diagnoses friend declarations with an ``enum`` elaborated-type-specifier in language modes after C++98.
+- Clang now disallows more than one ``__attribute__((ownership_returns(class, idx)))`` with
+  different class names attached to one function.
 
-- Added diagnostics for C11 keywords being incompatible with language standards
-  before C11, under a new warning group: ``-Wpre-c11-compat``.
+- Introduced a new format attribute ``__attribute__((format(syslog, 1, 2)))`` from OpenBSD.
 
-- Now diagnoses an enumeration constant whose value is larger than can be
-  represented by ``unsigned long long``, which can happen with a large constant
-  using the ``wb`` or ``uwb`` suffix. The maximal underlying type is currently
-  ``unsigned long long``, but this behavior may change in the future when Clang
-  implements
-  `WG14 N3029 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3029.htm>`_.
-  (#GH69352).
+- The ``hybrid_patchable`` attribute is now supported on ARM64EC targets. It can be used to specify
+  that a function requires an additional x86-64 thunk, which may be patched at runtime.
 
-- Clang now diagnoses extraneous template parameter lists as a language extension.
-
-- Clang now diagnoses declarative nested name specifiers that name alias templates.
-
-- Clang now diagnoses lambda function expressions being implicitly cast to boolean values, under ``-Wpointer-bool-conversion``.
-  Fixes #GH82512.
-
-- Clang now provides improved warnings for the ``cleanup`` attribute to detect misuse scenarios,
-  such as attempting to call ``free`` on an unallocated object. Fixes #GH79443.
-
-- Clang no longer warns when the ``bitand`` operator is used with boolean
-  operands, distinguishing it from potential typographical errors or unintended
-  bitwise operations. Fixes #GH77601.
-
-- Clang now correctly diagnoses no arguments to a variadic macro parameter as a C23/C++20 extension.
-  Fixes #GH84495.
-
-- Clang no longer emits a ``-Wexit-time destructors`` warning on static variables explicitly
-  annotated with the ``clang::always_destroy`` attribute.
-  Fixes #GH68686, #GH86486
-
-- ``-Wmicrosoft``, ``-Wgnu``, or ``-pedantic`` is now required to diagnose C99
-  flexible array members in a union or alone in a struct. Fixes GH#84565.
-
-- Clang now no longer diagnoses type definitions in ``offsetof`` in C23 mode.
-  Fixes #GH83658.
-
-- New ``-Wformat-signedness`` diagnostic that warn if the format string requires an
-  unsigned argument and the argument is signed and vice versa.
-
-- Clang now emits ``unused argument`` warning when the -fmodule-output flag is used
-  with an input that is not of type c++-module.
-
-- Clang emits a ``-Wreturn-stack-address`` warning if a function returns a pointer or
-  reference to a struct literal. Fixes #GH8678
-
-- Clang emits a ``-Wunused-but-set-variable`` warning on C++ variables whose declaration
-  (with initializer) entirely consist the condition expression of a if/while/for construct
-  but are not actually used in the body of the if/while/for construct. Fixes #GH41447
-
-- Clang emits a diagnostic when a tentative array definition is assumed to have
-  a single element, but that diagnostic was never given a diagnostic group.
-  Added the ``-Wtentative-definition-array`` warning group to cover this.
-  Fixes #GH87766
-
-- Clang now uses the correct type-parameter-key (``class`` or ``typename``) when printing
-  template template parameter declarations.
-
-- Clang now diagnoses requires expressions with explicit object parameters.
+Improvements to Clang's diagnostics
+-----------------------------------
 
-- Clang now looks up members of the current instantiation in the template definition context
-  if the current instantiation has no dependent base classes.
+- Some template related diagnostics have been improved.
 
   .. code-block:: c++
 
-     template<typename T>
-     struct A {
-       int f() {
-         return this->x; // error: no member named 'x' in 'A<T>'
-       }
-     };
-
-- Clang emits a ``-Wparentheses`` warning for expressions with consecutive comparisons like ``x < y < z``.
-  Fixes #GH20456.
-
-- Clang no longer emits a "declared here" note for a builtin function that has no declaration in source.
-  Fixes #GH93369.
-
-- Clang now has an improved error message for captures that refer to a class member.
-  Fixes #GH94764.
+     void foo() { template <typename> int i; } // error: templates can only be declared in namespace or class scope
 
-- Clang now diagnoses unsupported class declarations for ``std::initializer_list<E>`` when they are
-  used rather than when they are needed for constant evaluation or when code is generated for them.
-  The check is now stricter to prevent crashes for some unsupported declarations (Fixes #GH95495).
-
-- Clang now diagnoses dangling cases where a pointer is assigned to a temporary
-  that will be destroyed at the end of the full expression.
-  Fixes #GH54492.
+     struct S {
+      template <typename> int i; // error: non-static data member 'i' cannot be declared as a template
+     };
 
-- Clang now shows implicit deduction guides when diagnosing overload resolution failure. #GH92393.
+- Clang now has improved diagnostics for functions with explicit 'this' parameters. Fixes #GH97878
 
-- Clang no longer emits a "no previous prototype" warning for Win32 entry points under ``-Wmissing-prototypes``.
-  Fixes #GH94366.
+- Clang now diagnoses dangling references to fields of temporary objects. Fixes #GH81589.
 
-- For the ARM target, calling an interrupt handler from another function is now an error. #GH95359.
+- Clang now diagnoses undefined behavior in constant expressions more consistently. This includes invalid shifts, and signed overflow in arithmetic.
 
-- Clang now diagnoses integer constant expressions that are folded to a constant value as an extension in more circumstances. Fixes #GH59863
+- -Wdangling-assignment-gsl is enabled by default.
 
 Improvements to Clang's time-trace
 ----------------------------------
 
-- Clang now specifies that using ``auto`` in a lambda parameter is a C++14 extension when
-  appropriate. (`#46059: <https://github.com/llvm/llvm-project/issues/46059>`_).
-
 Improvements to Coverage Mapping
 --------------------------------
 
-- Macros defined in system headers are not expanded in coverage
-  mapping. Conditional expressions in system header macros are no
-  longer taken into account for branch coverage. They can be included
-  with ``-mllvm -system-headers-coverage``.
-  (`#78920: <https://github.com/llvm/llvm-project/issues/78920>`_)
-- MC/DC Coverage has been improved.
-  (`#82448: <https://github.com/llvm/llvm-project/pull/82448>`_)
-
-  - The maximum number of conditions is no longer limited to 6. See
-    `this <SourceBasedCodeCoverage.html#mc-dc-instrumentation>` for
-    more details.
-
 Bug Fixes in This Version
 -------------------------
-- Clang's ``-Wundefined-func-template`` no longer warns on pure virtual
-  functions. (#GH74016)
-
-- Fixed missing warnings when comparing mismatched enumeration constants
-  in C (#GH29217)
-
-- Clang now accepts elaborated-type-specifiers that explicitly specialize
-  a member class template for an implicit instantiation of a class template.
-
-- Fixed missing warnings when doing bool-like conversions in C23 (#GH79435).
-- Clang's ``-Wshadow`` no longer warns when an init-capture is named the same as
-  a class field unless the lambda can capture this.
-  Fixes (#GH71976)
-
-- Clang now accepts qualified partial/explicit specializations of variable templates that
-  are not nominable in the lookup context of the specialization.
-
-- Clang now doesn't produce false-positive warning `-Wconstant-logical-operand`
-  for logical operators in C23.
-  Fixes (#GH64356).
-
-- ``__is_trivially_relocatable`` no longer returns ``false`` for volatile-qualified types.
-  Fixes (#GH77091).
-
-- Clang no longer produces a false-positive `-Wunused-variable` warning
-  for variables created through copy initialization having side-effects in C++17 and later.
-  Fixes (#GH64356) (#GH79518).
-
-- Fix value of predefined macro ``__FUNCTION__`` in MSVC compatibility mode.
-  Fixes (#GH66114).
-
-- Clang now emits errors for explicit specializations/instatiations of lambda call
-  operator.
-  Fixes (#GH83267).
-
-- Fix crash on ill-formed partial specialization with CRTP.
-  Fixes (#GH89374).
-
-- Clang now correctly generates overloads for bit-precise integer types for
-  builtin operators in C++. Fixes #GH82998.
-
-- Fix crash when destructor definition is preceded with an equals sign.
-  Fixes (#GH89544).
-
-- When performing mixed arithmetic between ``_Complex`` floating-point types and integers,
-  Clang now correctly promotes the integer to its corresponding real floating-point
-  type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated
-  as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated
-  by the C standard. This significantly improves codegen of `*` and `/` especially.
-  Fixes #GH31205.
-
-- Fixes an assertion failure on invalid code when trying to define member
-  functions in lambdas.
-
-- Fixed a regression in CTAD that a friend declaration that befriends itself may cause
-  incorrect constraint substitution. (#GH86769).
-
-- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008).
 
-- Fixed missing destructor calls when we branch from middle of an expression.
-  This could happen through a branch in stmt-expr or in an expression containing a coroutine
-  suspension. Fixes (#GH63818) (#GH88478).
-
-- Clang will no longer diagnose an erroneous non-dependent ``switch`` condition
-  during instantiation, and instead will only diagnose it once, during checking
-  of the function template.
-
-- Clang now allows the value of unroll count to be zero in ``#pragma GCC unroll`` and ``#pragma unroll``.
-  The values of 0 and 1 block any unrolling of the loop. This keeps the same behavior with GCC.
-  Fixes (`#88624 <https://github.com/llvm/llvm-project/issues/88624>`_).
-
-- Clang will no longer emit a duplicate -Wunused-value warning for an expression
-  `(A, B)` which evaluates to glvalue `B` that can be converted to non ODR-use. (#GH45783)
-
-- Clang now correctly disallows VLA type compound literals, e.g. ``(int[size]){}``,
-  as the C standard mandates. (#GH89835)
-
-- ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for
-  zero-sized arrays. Fixes (#GH54705).
-
-- Correctly reject declarations where a statement is required in C.
-  Fixes #GH92775
-
-- Fixed `static_cast` to array of unknown bound. Fixes (#GH62863).
-
-- Fixed Clang crashing when failing to perform some C++ Initialization Sequences. (#GH98102)
-
-- ``__is_trivially_equality_comparable`` no longer returns true for types which
-  have a constrained defaulted comparison operator (#GH89293).
-
-- Fixed Clang from generating dangling StringRefs when deserializing Exprs & Stmts (#GH98667)
-
-- ``__has_unique_object_representations`` correctly handles arrays of unknown bounds of
-  types by ensuring they are complete and instantiating them if needed. Fixes (#GH95311).
-
-- ``typeof_unqual`` now properly removes type qualifiers from arrays and their element types. (#GH92667)
+- Fixed the definition of ``ATOMIC_FLAG_INIT`` in ``<stdatomic.h>`` so it can
+  be used in C++.
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Fix crash when atomic builtins are called with pointer to zero-size struct (#GH90330)
-
-- Clang now allows pointee types of atomic builtin arguments to be complete template types
-  that was not instantiated elsewhere.
-
 Bug Fixes to Attribute Support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Fix crash when calling the constructor of an invalid class.
-  (#GH10518) (#GH67914) (#GH78388)
-- Fix crash when using lifetimebound attribute in function with trailing return.
-  (#GH73619)
-- Addressed an issue where constraints involving injected class types are perceived
-  distinct from its specialization types. (#GH56482)
-- Fixed a bug where variables referenced by requires-clauses inside
-  nested generic lambdas were not properly injected into the constraint scope. (#GH73418)
-- Fixed a crash where substituting into a requires-expression that refers to function
-  parameters during the equivalence determination of two constraint expressions.
-  (#GH74447)
-- Fixed deducing auto& from const int in template parameters of partial
-  specializations. (#GH77189)
-- Fix for crash when using a erroneous type in a return statement.
-  (#GH63244) (#GH79745)
-- Fixed an out-of-bounds error caused by building a recovery expression for ill-formed
-  function calls while substituting into constraints. (#GH58548)
-- Fix incorrect code generation caused by the object argument
-  of ``static operator()`` and ``static operator[]`` calls not being evaluated. (#GH67976)
-- Fix crash and diagnostic with const qualified member operator new.
-  Fixes (#GH79748)
-- Fixed a crash where substituting into a requires-expression that involves parameter packs
-  during the equivalence determination of two constraint expressions. (#GH72557)
-- Fix a crash when specializing an out-of-line member function with a default
-  parameter where we did an incorrect specialization of the initialization of
-  the default parameter. (#GH68490)
-- Fix a crash when trying to call a varargs function that also has an explicit object parameter.
-  Fixes (#GH80971)
-- Reject explicit object parameters on `new` and `delete` operators. (#GH82249)
-- Fix a crash when trying to call a varargs function that also has an explicit object parameter. (#GH80971)
-- Fixed a bug where abbreviated function templates would append their invented template parameters to
-  an empty template parameter lists.
-- Fix parsing of abominable function types inside type traits. Fixes #GH77585
-- Clang now classifies aggregate initialization in C++17 and newer as constant
-  or non-constant more accurately. Previously, only a subset of the initializer
-  elements were considered, misclassifying some initializers as constant. Partially fixes
-  #GH80510.
-- Clang now ignores top-level cv-qualifiers on function parameters in template partial orderings. (#GH75404)
-- No longer reject valid use of the ``_Alignas`` specifier when declaring a
-  local variable, which is supported as a C11 extension in C++. Previously, it
-  was only accepted at namespace scope but not at local function scope.
-- Clang no longer tries to call consteval constructors at runtime when they appear in a member initializer. (#GH82154)
-- Fix crash when using an immediate-escalated function at global scope. (#GH82258)
-- Correctly immediate-escalate lambda conversion functions. (#GH82258)
-- Fixed an issue where template parameters of a nested abbreviated generic lambda within
-  a requires-clause lie at the same depth as those of the surrounding lambda. This,
-  in turn, results in the wrong template argument substitution during constraint checking.
-  (#GH78524)
-- Clang no longer instantiates the exception specification of discarded candidate function
-  templates when determining the primary template of an explicit specialization.
-- Fixed a crash in Microsoft compatibility mode where unqualified dependent base class
-  lookup searches the bases of an incomplete class.
-- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator.
-  (#GH53815)
-- In ``__restrict``-qualified member functions, attach ``__restrict`` to the pointer type of
-  ``this`` rather than the pointee type.
-  Fixes (#GH82941), (#GH42411) and (#GH18121).
-- Clang now properly reports supported C++11 attributes when using
-  ``__has_cpp_attribute`` and parses attributes with arguments in C++03 (#GH82995)
-- Clang now properly diagnoses missing 'default' template arguments on a variety
-  of templates. Previously we were diagnosing on any non-function template
-  instead of only on class, alias, and variable templates, as last updated by
-  CWG2032. Fixes (#GH83461)
-- Fixed an issue where an attribute on a declarator would cause the attribute to
-  be destructed prematurely. This fixes a pair of Chromium that were brought to
-  our attention by an attempt to fix in (#GH77703). Fixes (#GH83385).
-- Fix evaluation of some immediate calls in default arguments.
-  Fixes (#GH80630)
-- Fixed an issue where the ``RequiresExprBody`` was involved in the lambda dependency
-  calculation. (#GH56556), (#GH82849).
-- Fix a bug where overload resolution falsely reported an ambiguity when it was comparing
-  a member-function against a non member function or a member-function with an
-  explicit object parameter against a member function with no explicit object parameter
-  when one of the function had more specialized templates. Fixes #GH82509 and #GH74494
-- Clang now supports direct lambda calls inside of a type alias template declarations.
-  This addresses (#GH70601), (#GH76674), (#GH79555), (#GH81145) and (#GH82104).
-- Allow access to a public template alias declaration that refers to friend's
-  private nested type. (#GH25708).
-- Fixed a crash in constant evaluation when trying to access a
-  captured ``this`` pointer in a lambda with an explicit object parameter.
-  Fixes (#GH80997)
-- Fix an issue where missing set friend declaration in template class instantiation.
-  Fixes (#GH84368).
-- Fixed a crash while checking constraints of a trailing requires-expression of a lambda, that the
-  expression references to an entity declared outside of the lambda. (#GH64808)
-- Clang's __builtin_bit_cast will now produce a constant value for records with empty bases. See:
-  (#GH82383)
-- Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343).
-- Fix an issue where a namespace alias could be defined using a qualified name (all name components
-  following the first `::` were ignored).
-- Fix an out-of-bounds crash when checking the validity of template partial specializations. (part of #GH86757).
-- Fix an issue caused by not handling invalid cases when substituting into the parameter mapping of a constraint. Fixes (#GH86757).
-- Fixed a bug that prevented member function templates of class templates declared with a deduced return type
-  from being explicitly specialized for a given implicit instantiation of the class template.
-- Fixed a crash when ``this`` is used in a dependent class scope function template specialization
-  that instantiates to a static member function.
-- Fix crash when inheriting from a cv-qualified type. Fixes #GH35603
-- Fix a crash when the using enum declaration uses an anonymous enumeration. Fixes (#GH86790).
-- Handled an edge case in ``getFullyPackExpandedSize`` so that we now avoid a false-positive diagnostic. (#GH84220)
-- Clang now correctly tracks type dependence of by-value captures in lambdas with an explicit
-  object parameter.
-  Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399).
-- Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329).
-- Fix a crash in requires expression with templated base class member function. Fixes (#GH84020).
-- Fix a crash caused by defined struct in a type alias template when the structure
-  has fields with dependent type. Fixes (#GH75221).
-- Fix the Itanium mangling of lambdas defined in a member of a local class (#GH88906)
-- Fixed a crash when trying to evaluate a user-defined ``static_assert`` message whose ``size()``
-  function returns a large or negative value. Fixes (#GH89407).
-- Fixed a use-after-free bug in parsing of type constraints with default arguments that involve lambdas. (#GH67235)
-- Fixed bug in which the body of a consteval lambda within a template was not parsed as within an
-  immediate function context.
-- Fix CTAD for ``std::initializer_list``. This allows ``std::initializer_list{1, 2, 3}`` to be deduced as
-  ``std::initializer_list<int>`` as intended.
-- Fix a bug on template partial specialization whose template parameter is `decltype(auto)`.
-- Fix a bug on template partial specialization with issue on deduction of nontype template parameter
-  whose type is `decltype(auto)`. Fixes (#GH68885).
-- Clang now correctly treats the noexcept-specifier of a friend function to be a complete-class context.
-- Fix an assertion failure when parsing an invalid members of an anonymous class. (#GH85447)
-- Fixed a misuse of ``UnresolvedLookupExpr`` for ill-formed templated expressions. Fixes (#GH48673), (#GH63243)
-  and (#GH88832).
-- Clang now defers all substitution into the exception specification of a function template specialization
-  until the noexcept-specifier is instantiated.
-- Fix a crash when an implicitly declared ``operator==`` function with a trailing requires-clause has its
-  constraints compared to that of another declaration.
-- Fix a bug where explicit specializations of member functions/function templates would have substitution
-  performed incorrectly when checking constraints. Fixes (#GH90349).
-- Clang now allows constrained member functions to be explicitly specialized for an implicit instantiation
-  of a class template.
-- Fix a C++23 bug in implementation of P2564R3 which evaluates immediate invocations in place
-  within initializers for variables that are usable in constant expressions or are constant
-  initialized, rather than evaluating them as a part of the larger manifestly constant evaluated
-  expression.
-- Fix a bug in access control checking due to dealyed checking of friend declaration. Fixes (#GH12361).
-- Correctly treat the compound statement of an ``if consteval`` as an immediate context. Fixes (#GH91509).
-- When partial ordering alias templates against template template parameters,
-  allow pack expansions when the alias has a fixed-size parameter list. Fixes (#GH62529).
-- Clang now ignores template parameters only used within the exception specification of candidate function
-  templates during partial ordering when deducing template arguments from a function declaration or when
-  taking the address of a function template.
-- Fix a bug with checking constrained non-type template parameters for equivalence. Fixes (#GH77377).
-- Fix a bug where the last argument was not considered when considering the most viable function for
-  explicit object argument member functions. Fixes (#GH92188).
-- Fix a C++11 crash when a non-const non-static member function is defined out-of-line with
-  the ``constexpr`` specifier. Fixes (#GH61004).
-- Clang no longer transforms dependent qualified names into implicit class member access expressions
-  until it can be determined whether the name is that of a non-static member.
-- Clang now correctly diagnoses when the current instantiation is used as an incomplete base class.
-- Clang no longer treats ``constexpr`` class scope function template specializations of non-static members
-  as implicitly ``const`` in language modes after C++11.
-- Fixed a crash when trying to emit captures in a lambda call operator with an explicit object
-  parameter that is called on a derived type of the lambda.
-  Fixes (#GH87210), (GH89541).
-- Clang no longer tries to check if an expression is immediate-escalating in an unevaluated context.
-  Fixes (#GH91308).
-- Fix a crash caused by a regression in the handling of ``source_location``
-  in dependent contexts. Fixes (#GH92680).
-- Fixed a crash when diagnosing failed conversions involving template parameter
-  packs. (#GH93076)
-- Fixed a regression introduced in Clang 18 causing a static function overloading a non-static function
-  with the same parameters not to be diagnosed. (Fixes #GH93456).
-- Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269).
-- Clang now allows ``@$``` in raw string literals. Fixes (#GH93130).
-- Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536).
-- Clang no longer models dependent NTTP arguments as ``TemplateParamObjectDecl`` s. Fixes (#GH84052).
-- Fix incorrect merging of modules which contain using declarations which shadow
-  other declarations. This could manifest as ODR checker false positives.
-  Fixes (`#80252 <https://github.com/llvm/llvm-project/issues/80252>`_)
-- Fix a regression introduced in Clang 18 causing incorrect overload resolution in the presence of functions only
-  differering by their constraints when only one of these function was variadic.
-- Fix a crash when a variable is captured by a block nested inside a lambda. (Fixes #GH93625).
-- Fixed a type constraint substitution issue involving a generic lambda expression. (#GH93821)
-- Fix a crash caused by improper use of ``__array_extent``. (#GH80474)
-- Fixed several bugs in capturing variables within unevaluated contexts. (#GH63845), (#GH67260), (#GH69307),
-  (#GH88081), (#GH89496), (#GH90669), (#GH91633) and (#GH97453).
-- Fixed a crash in constraint instantiation under nested lambdas with dependent parameters.
-- Fixed handling of brace ellison when building deduction guides. (#GH64625), (#GH83368).
-- Fixed a failed assertion when attempting to convert an integer representing the difference
-  between the addresses of two labels (a GNU extension) to a pointer within a constant expression. (#GH95366).
-- Fix immediate escalation bugs in the presence of dependent call arguments. (#GH94935)
-- Clang now diagnoses explicit specializations with storage class specifiers in all contexts.
-- Fix an assertion failure caused by parsing a lambda used as a default argument for the value of a
-  forward-declared class. (#GH93512).
-- Fixed a bug in access checking inside return-type-requirement of compound requirements. (#GH93788).
-- Fixed an assertion failure about invalid conversion when calling lambda. (#GH96205).
-- Fixed a bug where the first operand of binary ``operator&`` would be transformed as if it was the operand
-  of the address of operator. (#GH97483).
-- Fixed an assertion failure about a constant expression which is a known integer but is not
-  evaluated to an integer. (#GH96670).
-- Fixed a bug where references to lambda capture inside a ``noexcept`` specifier were not correctly
-  instantiated. (#GH95735).
-- Fixed a CTAD substitution bug involving type aliases that reference outer template parameters. (#GH94614).
-- Clang now correctly handles unexpanded packs in the template parameter list of a generic lambda expression
-  (#GH48937)
-- Fix a crash when parsing an invalid type-requirement in a requires expression. (#GH51868)
-- Fix parsing of built-in type-traits such as ``__is_pointer`` in libstdc++ headers. (#GH95598)
-- Fixed failed assertion when resolving context of defaulted comparison method outside of struct. (#GH96043).
-- Clang now diagnoses explicit object parameters in member pointers and other contexts where they should not appear.
-  Fixes (#GH85992).
-- Fixed a crash-on-invalid bug involving extraneous template parameter with concept substitution. (#GH73885)
+- Fixed a crash when an expression with a dependent ``__typeof__`` type is used as the operand of a unary operator. (#GH97646)
+- Fixed a failed assertion when checking invalid delete operator declaration. (#GH96191)
+- Fix a crash when checking destructor reference with an invalid initializer. (#GH97230)
+- Clang now correctly parses potentially declarative nested-name-specifiers in pointer-to-member declarators.
+- Fix a crash when checking the initialzier of an object that was initialized
+  with a string literal. (#GH82167)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
-- Clang now properly preserves ``FoundDecls`` within a ``ConceptReference``. (#GH82628)
-- The presence of the ``typename`` keyword is now stored in ``TemplateTemplateParmDecl``.
-- Fixed malformed AST generated for anonymous union access in templates. (#GH90842)
-- Improved preservation of qualifiers and sugar in `TemplateNames`, including
-  template keyword.
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-- Fixed an infinite recursion in ASTImporter, on return type declared inside
-  body of C++11 lambda without trailing return (#GH68775).
-- Fixed declaration name source location of instantiated function definitions (GH71161).
-- Improve diagnostic output to print an expression instead of 'no argument` when comparing Values as template arguments.
-
 Miscellaneous Clang Crashes Fixed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Do not attempt to dump the layout of dependent types or invalid declarations
-  when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684.
-- Unhandled StructuralValues in the template differ (#GH93068).
+- Fixed a crash in C due to incorrect lookup that members in nested anonymous struct/union
+  can be found as ordinary identifiers in struct/union definition. (#GH31295)
+
+- Fixed a crash caused by long chains of ``sizeof`` and other similar operators
+  that can be followed by a non-parenthesized expression. (#GH45061)
 
 OpenACC Specific Changes
 ------------------------
@@ -1090,131 +200,52 @@ AMDGPU Support
 X86 Support
 ^^^^^^^^^^^
 
-- Remove knl/knm specific ISA supports: AVX512PF, AVX512ER, PREFETCHWT1
-- Support has been removed for the AMD "3DNow!" instruction-set.
-  Neither modern AMD CPUs, nor any Intel CPUs implement these
-  instructions, and they were never widely used.
-
-  * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used.
-  * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler.
-  * The header ``<mm3dnow.h>`` is deprecated, and emits a warning if included.
-  * The 3dNow intrinsic functions have been removed: ``_m_femms``,
-    ``_m_pavgusb``, ``_m_pf2id``, ``_m_pfacc``, ``_m_pfadd``,
-    ``_m_pfcmpeq``, ``_m_pfcmpge``, ``_m_pfcmpgt``, ``_m_pfmax``,
-    ``_m_pfmin``, ``_m_pfmul``, ``_m_pfrcp``, ``_m_pfrcpit1``,
-    ``_m_pfrcpit2``, ``_m_pfrsqrt``, ``_m_pfrsqrtit1``, ``_m_pfsub``,
-    ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``,
-    ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``,
-    ``_m_pswapdsi``.
-  * The compiler builtins corresponding to each of the above
-    intrinsics have also been removed  (``__builtin_ia32_femms``, and so on).
-  * "3DNow!" instructions remain supported in assembly code, including
-    inside inline-assembly.
+- The MMX vector intrinsic functions from ``*mmintrin.h`` which
+  operate on `__m64` vectors, such as ``_mm_add_pi8``, have been
+  reimplemented to use the SSE2 instruction-set and XMM registers
+  unconditionally. These intrinsics are therefore *no longer
+  supported* if MMX is enabled without SSE2 -- either from targeting
+  CPUs from the Pentium-MMX through the Pentium 3, or explicitly via
+  passing arguments such as ``-mmmx -mno-sse2``. MMX assembly code
+  remains supported without requiring SSE2, including inside
+  inline-assembly.
+
+- The compiler builtins such as ``__builtin_ia32_paddb`` which
+  formerly implemented the above MMX intrinsic functions have been
+  removed. Any uses of these removed functions should migrate to the
+  functions defined by the ``*mmintrin.h`` headers. A mapping can be
+  found in the file ``clang/www/builtins.py``.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and
-  Armv8-M without the Main Extension. Baremetal targets should check that the
-  new default will work with their system configurations, since it requires
-  that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is
-  configured as "normal" memory. This brings Clang in-line with the default
-  settings for GCC and Arm Compiler. Aside from making Clang align with other
-  compilers, changing the default brings major performance and code size
-  improvements for most targets. We have not changed the default behavior for
-  ARMv6, but may revisit that decision in the future. Users can restore the old
-  behavior with -m[no-]unaligned-access.
-- An alias identifier (rdma) has been added for targeting the AArch64
-  Architecture Extension which uses Rounding Doubling Multiply Accumulate
-  instructions (rdm). The identifier is available on the command line as
-  a feature modifier for -march and -mcpu as well as via target attributes
-  like ``target_version`` or ``target_clones``.
-- Support has been added for the following processors (-mcpu identifiers in parenthesis):
-    * Arm Cortex-R52+ (cortex-r52plus).
-    * Arm Cortex-R82AE (cortex-r82ae).
-    * Arm Cortex-A78AE (cortex-a78ae).
-    * Arm Cortex-A520AE (cortex-a520ae).
-    * Arm Cortex-A720AE (cortex-a720ae).
-    * Arm Cortex-A725 (cortex-a725).
-    * Arm Cortex-X925 (cortex-x925).
-    * Arm Neoverse-N3 (neoverse-n3).
-    * Arm Neoverse-V3 (neoverse-v3).
-    * Arm Neoverse-V3AE (neoverse-v3ae).
-
 Android Support
 ^^^^^^^^^^^^^^^
 
 Windows Support
 ^^^^^^^^^^^^^^^
 
-- The clang-cl ``/Ot`` compiler option ("optimize for speed", also implied by
-  ``/O2``) now maps to clang's ``-O3`` optimizataztion level instead of ``-O2``.
-  Users who prefer the old behavior can use ``clang-cl /Ot /clang:-O2 ...``.
-
-- Clang-cl now supports function targets with intrinsic headers. This allows
-  for runtime feature detection of intrinsics. Previously under clang-cl
-  ``immintrin.h`` and similar intrinsic headers would only include the intrinsics
-  if building with that feature enabled at compile time, e.g. ``avxintrin.h``
-  would only be included if AVX was enabled at compile time. This was done to work
-  around include times from MSVC STL including ``intrin.h`` under clang-cl.
-  Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore all intrinsic
-  features without requiring enablement at compile time. Fixes #GH53520
-
-- Improved compile times with MSVC STL. MSVC provides ``intrin0.h`` which is a
-  header that only includes intrinsics that are used by MSVC STL to avoid the
-  use of ``intrin.h``. MSVC STL when compiled under clang uses ``intrin.h``
-  instead. Clang-cl now provides ``intrin0.h`` for the same compiler throughput
-  purposes as MSVC. Clang-cl also provides ``yvals_core.h`` to redefine
-  ``_STL_INTRIN_HEADER`` to expand to ``intrin0.h`` instead of ``intrin.h``.
-  This also means that if all intrinsic features are enabled at compile time
-  including STL headers will no longer slow down compile times since ``intrin.h``
-  is not included from MSVC STL.
-
-- When the target triple is `*-windows-msvc` strict aliasing is now disabled by default
-  to ensure compatibility with msvc. Previously strict aliasing was only disabled if the
-  driver mode was cl.
-
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
 RISC-V Support
 ^^^^^^^^^^^^^^
 
-- ``__attribute__((rvv_vector_bits(N)))`` is now supported for RVV vbool*_t types.
-- Profile names in ``-march`` option are now supported.
-- Passing empty structs/unions as arguments in C++ is now handled correctly. The behavior is similar to GCC's.
-- ``-m[no-]scalar-strict-align`` and ``-m[no-]vector-strict-align`` options have
-  been added to give separate control of whether scalar or vector misaligned
-  accesses may be created. ``-m[no-]strict-align`` applies to both scalar and
-  vector.
-
 CUDA/HIP Language Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- PTX is no longer included by default when compiling for CUDA. Using
-  ``--cuda-include-ptx=all`` will return the old behavior.
-
 CUDA Support
 ^^^^^^^^^^^^
-- Clang now supports CUDA SDK up to 12.5
 
 AIX Support
 ^^^^^^^^^^^
 
-- Introduced the ``-maix-small-local-dynamic-tls`` option to produce a faster
-  access sequence for local-dynamic TLS variables where the offset from the TLS
-  base is encoded as an immediate operand.
-  This access sequence is not used for TLS variables larger than 32KB, and is
-  currently only supported on 64-bit mode.
+NetBSD Support
+^^^^^^^^^^^^^^
 
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 
-The -mcpu=generic configuration now enables multivalue and reference-types.
-These proposals are standardized and available in all major engines. Enabling
-multivalue here only enables the language feature but does not turn on the
-multivalue ABI (this enables non-ABI uses of multivalue, like exnref).
-
 AVR Support
 ^^^^^^^^^^^
 
@@ -1224,134 +255,56 @@ DWARF Support in Clang
 Floating Point Support in Clang
 -------------------------------
 
-- Add ``__builtin__fmaf16`` builtin for floating point types.
-
 Fixed Point Support in Clang
 ----------------------------
 
-- Support fixed point precision macros according to ``7.18a.3`` of
-  `ISO/IEC TR 18037:2008 <https://standards.iso.org/ittf/PubliclyAvailableStandards/c051126_ISO_IEC_TR_18037_2008.zip>`_.
-
 AST Matchers
 ------------
 
-- Fixes a long-standing performance issue in parent map generation for
-  ancestry-based matchers such as ``hasParent`` and ``hasAncestor``, making
-  them significantly faster.
-- ``isInStdNamespace`` now supports Decl declared with ``extern "C++"``.
-- Add ``isExplicitObjectMemberFunction``.
-- Fixed ``forEachArgumentWithParam`` and ``forEachArgumentWithParamType`` to
-  not skip the explicit object parameter for operator calls.
-- Fixed captureVars assertion failure if not capturesVariables. (#GH76425)
-- ``forCallable`` now properly preserves binding on successful match. (#GH89657)
+- Fixed an issue with the `hasName` and `hasAnyName` matcher when matching
+  inline namespaces with an enclosing namespace of the same name.
 
 clang-format
 ------------
 
-- ``AlwaysBreakTemplateDeclarations`` is deprecated and renamed to
-  ``BreakTemplateDeclarations``.
-- ``AlwaysBreakAfterReturnType`` is deprecated and renamed to
-  ``BreakAfterReturnType``.
-- Handles Java switch expressions.
-- Adds ``AllowShortCaseExpressionOnASingleLine`` option.
-- Adds ``AlignCaseArrows`` suboption to ``AlignConsecutiveShortCaseStatements``.
-- Adds ``LeftWithLastLine`` suboption to ``AlignEscapedNewlines``.
-- Adds ``KeepEmptyLines`` option to deprecate ``KeepEmptyLinesAtEOF``
-  and ``KeepEmptyLinesAtTheStartOfBlocks``.
-- Add ``ExceptDoubleParentheses`` sub-option for ``SpacesInParensOptions``
-  to override addition of spaces between multiple, non-redundant parentheses
-  similar to the rules used for ``RemoveParentheses``.
-
 libclang
 --------
 
-- ``clang_getSpellingLocation`` now correctly resolves macro expansions; that
-  is, it returns the spelling location instead of the expansion location.
-
 Static Analyzer
 ---------------
 
 New features
 ^^^^^^^^^^^^
 
-- The attribute ``[[clang::suppress]]`` can now be applied to declarations.
-  (#GH80371)
-
-- Support C++23 static operator calls. (#GH84972)
+- MallocChecker now checks for ``ownership_returns(class, idx)`` and ``ownership_takes(class, idx)``
+  attributes with class names different from "malloc". Clang static analyzer now reports an error
+  if class of allocation and deallocation function mismatches.
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mismatcheddeallocator-c-c>`__.
 
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
-- Fixed crashing on loops if the loop variable was declared in switch blocks
-  but not under any case blocks if ``unroll-loops=true`` analyzer config is
-  set. (#GH68819)
-
-- Fixed a crash in ``security.cert.env.InvalidPtr`` checker when accidentally
-  matched user-defined ``strerror`` and similar library functions. (#GH88181)
-
-- Fixed a crash when storing through an address that refers to the address of
-  a label. (#GH89185)
-
-- Z3 crosschecking (aka. Z3 refutation) is now bounded, and can't consume
-  more total time than the eymbolic execution itself. (#GH97298)
-
 Improvements
 ^^^^^^^^^^^^
 
+- Improved the handling of the ``ownership_returns`` attribute. Now, Clang reports an
+  error if the attribute is attached to a function that returns a non-pointer value.
+  Fixes (#GH99501)
+
 Moved checkers
 ^^^^^^^^^^^^^^
 
-- Moved ``alpha.cplusplus.ArrayDelete`` out of the ``alpha`` package
-  to ``cplusplus.ArrayDelete``. (#GH83985)
-  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-arraydelete-c>`__.
-
-- Moved ``alpha.unix.Stream`` out of the ``alpha`` package to
-  ``unix.Stream``. (#GH89247)
-  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-stream-c>`__.
-
-- Moved ``alpha.unix.BlockInCriticalSection`` out of the ``alpha`` package to
-  ``unix.BlockInCriticalSection``. (#GH93815)
-  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection-c-c>`__.
-
-- Moved ``alpha.security.cert.pos.34c`` out of the ``alpha`` package to
-  ``security.PutenvStackArray``. (#GH92424, #GH93815)
-  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#security-putenvstackarray-c>`__.
-
-- Moved ``alpha.core.SizeofPtr`` into ``clang-tidy``
-  ``bugprone-sizeof-expression``. (#GH95118, #GH94356)
-  `Documentation <https://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-expression.html>`__.
-
 .. _release-notes-sanitizers:
 
 Sanitizers
 ----------
 
-- ``-fsanitize=signed-integer-overflow`` now instruments signed arithmetic even
-  when ``-fwrapv`` is enabled. Previously, only division checks were enabled.
-
-  Users with ``-fwrapv`` as well as a sanitizer group like
-  ``-fsanitize=undefined`` or ``-fsanitize=integer`` enabled may want to
-  manually disable potentially noisy signed integer overflow checks with
-  ``-fno-sanitize=signed-integer-overflow``
-
-- ``-fsanitize=cfi -fsanitize-cfi-cross-dso`` (cross-DSO CFI instrumentation)
-  now generates the ``__cfi_check`` function with proper target-specific
-  attributes, for example allowing unwind table generation.
-
 Python Binding Changes
 ----------------------
 
-- Exposed `CXRewriter` API as `class Rewriter`.
-- Add some missing kinds from Index.h (CursorKind: 149-156, 272-320, 420-437.
-  TemplateArgumentKind: 5-9. TypeKind: 161-175 and 178).
-- Add support for retrieving binary operator information through
-  Cursor.binary_operator().
-
 OpenMP Support
 --------------
 
-- Added support for the `[[omp::assume]]` attribute.
-
 Additional Information
 ======================
 
diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index cf0528e75e7f2..b87491910e222 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -652,6 +652,149 @@ in the future. The expected roadmap for Reduced BMIs as of Clang 19.x is:
    comes, the term BMI will refer to the Reduced BMI and the Full BMI will only
    be meaningful to build systems which elect to support two-phase compilation.
 
+Experimental Non-Cascading Changes
+----------------------------------
+
+This section is primarily for build system vendors. For end compiler users,
+if you don't want to read it all, this is helpful to reduce recompilations.
+We encourage build system vendors and end users try this out and bring feedback.
+
+Before Clang 19, a change in BMI of any (transitive) dependency would cause the
+outputs of the BMI to change. Starting with Clang 19, changes to non-direct
+dependencies should not directly affect the output BMI, unless they affect the
+results of the compilations. We expect that there are many more opportunities
+for this optimization than we currently have realized and would appreaciate 
+feedback about missed optimization opportunities. For example,
+
+.. code-block:: c++
+
+  // m-partA.cppm
+  export module m:partA;
+
+  // m-partB.cppm
+  export module m:partB;
+  export int getB() { return 44; }
+
+  // m.cppm
+  export module m;
+  export import :partA;
+  export import :partB;
+
+  // useBOnly.cppm
+  export module useBOnly;
+  import m;
+  export int B() {
+    return getB();
+  }
+
+  // Use.cc
+  import useBOnly;
+  int get() {
+    return B();
+  }
+
+To compile the project (for brevity, some commands are omitted.):
+
+.. code-block:: console
+
+  $ clang++ -std=c++20 m-partA.cppm --precompile -o m-partA.pcm
+  $ clang++ -std=c++20 m-partB.cppm --precompile -o m-partB.pcm
+  $ clang++ -std=c++20 m.cppm --precompile -o m.pcm -fprebuilt-module-path=.
+  $ clang++ -std=c++20 useBOnly.cppm --precompile -o useBOnly.pcm -fprebuilt-module-path=.
+  $ md5sum useBOnly.pcm
+  07656bf4a6908626795729295f9608da  useBOnly.pcm
+
+If the interface of ``m-partA.cppm`` is changed to:
+
+.. code-block:: c++
+
+  // m-partA.v1.cppm
+  export module m:partA;
+  export int getA() { return 43; }
+
+and the BMI for ``useBOnly`` is recompiled as in:
+
+.. code-block:: console
+
+  $ clang++ -std=c++20 m-partA.cppm --precompile -o m-partA.pcm
+  $ clang++ -std=c++20 m-partB.cppm --precompile -o m-partB.pcm
+  $ clang++ -std=c++20 m.cppm --precompile -o m.pcm -fprebuilt-module-path=.
+  $ clang++ -std=c++20 useBOnly.cppm --precompile -o useBOnly.pcm -fprebuilt-module-path=.
+  $ md5sum useBOnly.pcm
+  07656bf4a6908626795729295f9608da  useBOnly.pcm
+
+then the contents of ``useBOnly.pcm`` remain unchanged.
+Consequently, if the build system only bases recompilation decisions on directly imported modules,
+it becomes possible to skip the recompilation of ``Use.cc``.
+It should be fine because the altered interfaces do not affect ``Use.cc`` in any way;
+the changes do not cascade.
+
+When ``Clang`` generates a BMI, it records the hash values of all potentially contributory BMIs
+for the BMI being produced. This ensures that build systems are not required to consider
+transitively imported modules when deciding whether to recompile.
+
+What is considered to be a potential contributory BMIs is currently unspecified.
+However, it is a severe bug for a BMI to remain unchanged following an observable change
+that affects its consumers.
+
+Build systems may utilize this optimization by doing an update-if-changed operation to the BMI
+that is consumed from the BMI that is output by the compiler.
+
+We encourage build systems to add an experimental mode that
+reuses the cached BMI when **direct** dependencies did not change,
+even if **transitive** dependencies did change.
+
+Given there are potential compiler bugs, we recommend that build systems
+support this feature as a configurable option so that users
+can go back to the transitive change mode safely at any time.
+
+Interactions with Reduced BMI
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With reduced BMI, non-cascading changes can be more powerful. For example,
+
+.. code-block:: c++
+
+  // A.cppm
+  export module A;
+  export int a() { return 44; }
+
+  // B.cppm
+  export module B;
+  import A;
+  export int b() { return a(); }
+
+.. code-block:: console
+
+  $ clang++ -std=c++20 A.cppm -c -fmodule-output=A.pcm  -fexperimental-modules-reduced-bmi -o A.o
+  $ clang++ -std=c++20 B.cppm -c -fmodule-output=B.pcm  -fexperimental-modules-reduced-bmi -o B.o -fmodule-file=A=A.pcm
+  $ md5sum B.pcm
+  6c2bd452ca32ab418bf35cd141b060b9  B.pcm
+
+And let's change the implementation for ``A.cppm`` into:
+
+.. code-block:: c++
+
+  export module A;
+  int a_impl() { return 99; }
+  export int a() { return a_impl(); }
+
+and recompile the example:
+
+.. code-block:: console
+
+  $ clang++ -std=c++20 A.cppm -c -fmodule-output=A.pcm  -fexperimental-modules-reduced-bmi -o A.o
+  $ clang++ -std=c++20 B.cppm -c -fmodule-output=B.pcm  -fexperimental-modules-reduced-bmi -o B.o -fmodule-file=A=A.pcm
+  $ md5sum B.pcm
+  6c2bd452ca32ab418bf35cd141b060b9  B.pcm
+
+We should find the contents of ``B.pcm`` remains the same. In this case, the build system is
+allowed to skip recompilations of TUs which solely and directly depend on module ``B``.
+
+This only happens with a reduced BMI. With reduced BMIs, we won't record the function body
+of ``int b()`` in the BMI for ``B`` so that the module ``A`` doesn't contribute to the BMI of ``B``
+and we have less dependencies.
+
 Performance Tips
 ----------------
 
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 76a9aae170893..55832d20bd27a 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1703,7 +1703,13 @@ are detected:
 * Invalid 3rd ("``whence``") argument to ``fseek``.
 
 The stream operations are by this checker usually split into two cases, a success
-and a failure case. However, in the case of write operations (like ``fwrite``,
+and a failure case.
+On the success case it also assumes that the current value of ``stdout``,
+``stderr``, or ``stdin`` can't be equal to the file pointer returned by ``fopen``.
+Operations performed on ``stdout``, ``stderr``, or ``stdin`` are not checked by
+this checker in contrast to the streams opened by ``fopen``.
+
+In the case of write operations (like ``fwrite``,
 ``fprintf`` and even ``fsetpos``) this behavior could produce a large amount of
 unwanted reports on projects that don't have error checks around the write
 operations, so by default the checker assumes that write operations always succeed.
@@ -1769,9 +1775,7 @@ are assumed to succeed.)
 **Limitations**
 
 The checker does not track the correspondence between integer file descriptors
-and ``FILE *`` pointers. Operations on standard streams like ``stdin`` are not
-treated specially and are therefore often not recognized (because these streams
-are usually not opened explicitly by the program, and are global variables).
+and ``FILE *`` pointers.
 
 .. _osx-checkers:
 
@@ -2494,15 +2498,49 @@ Check for pointer arithmetic on locations other than array elements.
 
 alpha.core.PointerSub (C)
 """""""""""""""""""""""""
-Check for pointer subtractions on two pointers pointing to different memory chunks.
+Check for pointer subtractions on two pointers pointing to different memory
+chunks. According to the C standard §6.5.6 only subtraction of pointers that
+point into (or one past the end) the same array object is valid (for this
+purpose non-array variables are like arrays of size 1).
 
 .. code-block:: c
 
  void test() {
-   int x, y;
-   int d = &y - &x; // warn
+   int a, b, c[10], d[10];
+   int x = &c[3] - &c[1];
+   x = &d[4] - &c[1]; // warn: 'c' and 'd' are different arrays
+   x = (&a + 1) - &a;
+   x = &b - &a; // warn: 'a' and 'b' are different variables
+   x = (&a + 2) - &a; // warn: for a variable it is only valid to have a pointer
+                      // to one past the address of it
+   x = &c[10] - &c[0];
+   x = &c[11] - &c[0]; // warn: index larger than one past the end
+   x = &c[-1] - &c[0]; // warn: negative index
  }
 
+ struct S {
+   int x[10];
+   int y[10];
+ };
+
+ void test1() {
+   struct S a[10];
+   struct S b;
+   int d = &a[4] - &a[6];
+   d = &a[0].x[3] - &a[0].x[1];
+   d = a[0].y - a[0].x; // warn: 'S.b' and 'S.a' are different objects
+   d = (char *)&b.y - (char *)&b.x; // warn: different members of the same object
+   d = (char *)&b.y - (char *)&b; // warn: object of type S is not the same array as a member of it
+ }
+
+There may be existing applications that use code like above for calculating
+offsets of members in a structure, using pointer subtractions. This is still
+undefined behavior according to the standard and code like this can be replaced
+with the `offsetof` macro.
+
+The checker only reports cases where stack-allocated objects are involved (no
+warnings on pointers to memory allocated by `malloc`).
+
 .. _alpha-core-StackAddressAsyncEscape:
 
 alpha.core.StackAddressAsyncEscape (C)
diff --git a/clang/docs/analyzer/checkers/mismatched_deallocator_example.cpp b/clang/docs/analyzer/checkers/mismatched_deallocator_example.cpp
index 2a4103240fe81..3b7c7d99faef3 100644
--- a/clang/docs/analyzer/checkers/mismatched_deallocator_example.cpp
+++ b/clang/docs/analyzer/checkers/mismatched_deallocator_example.cpp
@@ -6,6 +6,10 @@ void test() {
 
 // C, C++
 void __attribute((ownership_returns(malloc))) *user_malloc(size_t);
+void __attribute((ownership_takes(malloc, 1))) *user_free(void *);
+
+void __attribute((ownership_returns(malloc1))) *user_malloc1(size_t);
+void __attribute((ownership_takes(malloc1, 1))) *user_free1(void *);
 
 void test() {
   int *p = (int *)user_malloc(sizeof(int));
@@ -24,6 +28,12 @@ void test() {
   realloc(p, sizeof(long)); // warn
 }
 
+// C, C++
+void test() {
+  int *p = user_malloc(10);
+  user_free1(p); // warn
+}
+
 // C, C++
 template <typename T>
 struct SimpleSmartPointer {
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index a35a867b96bd7..9bae0bd83243b 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -92,6 +92,7 @@ Using Clang Tools
    ClangFormatStyleOptions
    ClangFormattedStatus
    ClangLinkerWrapper
+   ClangNVLinkWrapper
    ClangOffloadBundler
    ClangOffloadPackager
    ClangRepl
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index fb87ad79f0748..3e37d7ca9e7ba 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2146,6 +2146,14 @@ enum CXCursorKind {
    */
   CXCursor_OMPScopeDirective = 306,
 
+  /** OpenMP reverse directive.
+   */
+  CXCursor_OMPReverseDirective = 307,
+
+  /** OpenMP interchange directive.
+   */
+  CXCursor_OMPInterchangeDirective = 308,
+
   /** OpenACC Compute Construct.
    */
   CXCursor_OpenACCComputeConstruct = 320,
diff --git a/clang/include/clang/APINotes/APINotesReader.h b/clang/include/clang/APINotes/APINotesReader.h
index 37a4ff7a69712..03657352c49a5 100644
--- a/clang/include/clang/APINotes/APINotesReader.h
+++ b/clang/include/clang/APINotes/APINotesReader.h
@@ -141,6 +141,16 @@ class APINotesReader {
                                                  ObjCSelectorRef Selector,
                                                  bool IsInstanceMethod);
 
+  /// Look for information regarding the given C++ method in the given C++ tag
+  /// context.
+  ///
+  /// \param CtxID The ID that references the parent context, i.e. a C++ tag.
+  /// \param Name The name of the C++ method we're looking for.
+  ///
+  /// \returns Information about the method, if known.
+  VersionedInfo<CXXMethodInfo> lookupCXXMethod(ContextID CtxID,
+                                               llvm::StringRef Name);
+
   /// Look for information regarding the given global variable.
   ///
   /// \param Name The name of the global variable.
@@ -166,6 +176,17 @@ class APINotesReader {
   /// \returns information about the enumerator, if known.
   VersionedInfo<EnumConstantInfo> lookupEnumConstant(llvm::StringRef Name);
 
+  /// Look for the context ID of the given C++ tag.
+  ///
+  /// \param Name The name of the tag we're looking for.
+  /// \param ParentCtx The context in which this tag is declared, e.g. a C++
+  /// namespace.
+  ///
+  /// \returns The ID, if known.
+  std::optional<ContextID>
+  lookupTagID(llvm::StringRef Name,
+              std::optional<Context> ParentCtx = std::nullopt);
+
   /// Look for information regarding the given tag
   /// (struct/union/enum/C++ class).
   ///
diff --git a/clang/include/clang/APINotes/APINotesWriter.h b/clang/include/clang/APINotes/APINotesWriter.h
index e82dbc7c9540e..e0fe5eacef725 100644
--- a/clang/include/clang/APINotes/APINotesWriter.h
+++ b/clang/include/clang/APINotes/APINotesWriter.h
@@ -78,6 +78,14 @@ class APINotesWriter {
                      bool IsInstanceMethod, const ObjCMethodInfo &Info,
                      llvm::VersionTuple SwiftVersion);
 
+  /// Add information about a specific C++ method.
+  ///
+  /// \param CtxID The context in which this method resides, i.e. a C++ tag.
+  /// \param Name The name of the method.
+  /// \param Info Information about this method.
+  void addCXXMethod(ContextID CtxID, llvm::StringRef Name,
+                    const CXXMethodInfo &Info, llvm::VersionTuple SwiftVersion);
+
   /// Add information about a global variable.
   ///
   /// \param Name The name of this global variable.
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index b389aa8d56f16..c8e5e4df25d17 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -656,6 +656,12 @@ class GlobalFunctionInfo : public FunctionInfo {
   GlobalFunctionInfo() {}
 };
 
+/// Describes API notes data for a C++ method.
+class CXXMethodInfo : public FunctionInfo {
+public:
+  CXXMethodInfo() {}
+};
+
 /// Describes API notes data for an enumerator.
 class EnumConstantInfo : public CommonEntityInfo {
 public:
@@ -789,6 +795,7 @@ enum class ContextKind : uint8_t {
   ObjCClass = 0,
   ObjCProtocol = 1,
   Namespace = 2,
+  Tag = 3,
 };
 
 struct Context {
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index aae9dfb719292..ca1a948dcce97 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -738,6 +738,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
   }
   void Deallocate(void *Ptr) const {}
 
+  llvm::StringRef backupStr(llvm::StringRef S) const {
+    char *Buf = new (*this) char[S.size()];
+    std::copy(S.begin(), S.end(), Buf);
+    return llvm::StringRef(Buf, S.size());
+  }
+
   /// Allocates a \c DeclListNode or returns one from the \c ListNodeFreeList
   /// pool.
   DeclListNode *AllocateDeclListNode(clang::NamedDecl *ND) {
@@ -1292,7 +1298,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   getPointerAuthVTablePointerDiscriminator(const CXXRecordDecl *RD);
 
   /// Return the "other" type-specific discriminator for the given type.
-  uint16_t getPointerAuthTypeDiscriminator(QualType T) const;
+  uint16_t getPointerAuthTypeDiscriminator(QualType T);
 
   /// Apply Objective-C protocol qualifiers to the given type.
   /// \param allowOnPointerType specifies if we can apply protocol
@@ -1368,7 +1374,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
                            bool AsWritten = false);
 
   /// Get a function type and produce the equivalent function type where
-  /// pointer size address spaces in the return type and parameter tyeps are
+  /// pointer size address spaces in the return type and parameter types are
   /// replaced with the default address space.
   QualType getFunctionTypeWithoutPtrSizes(QualType T);
 
diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h
index 4ffd913846575..f851decd0965c 100644
--- a/clang/include/clang/AST/ASTImporter.h
+++ b/clang/include/clang/AST/ASTImporter.h
@@ -258,7 +258,6 @@ class TypeSourceInfo;
     FoundDeclsTy findDeclsInToCtx(DeclContext *DC, DeclarationName Name);
 
     void AddToLookupTable(Decl *ToD);
-    llvm::Error ImportAttrs(Decl *ToD, Decl *FromD);
 
   protected:
     /// Can be overwritten by subclasses to implement their own import logic.
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index 616f92691ec32..0546c19ce8119 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -583,7 +583,7 @@ class ASTNodeTraverser
   void VisitCapturedDecl(const CapturedDecl *D) { Visit(D->getBody()); }
 
   void VisitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
-    for (const auto *E : D->varlists())
+    for (const auto *E : D->varlist())
       Visit(E);
   }
 
@@ -603,7 +603,7 @@ class ASTNodeTraverser
   }
 
   void VisitOMPAllocateDecl(const OMPAllocateDecl *D) {
-    for (const auto *E : D->varlists())
+    for (const auto *E : D->varlist())
       Visit(E);
     for (const auto *C : D->clauselists())
       Visit(C);
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index fb52ac804849d..bf6a5ce92d438 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -1188,10 +1188,6 @@ class CXXRecordDecl : public RecordDecl {
   ///
   /// \note This does NOT include a check for union-ness.
   bool isEmpty() const { return data().Empty; }
-  /// Marks this record as empty. This is used by DWARFASTParserClang
-  /// when parsing records with empty fields having [[no_unique_address]]
-  /// attribute
-  void markEmpty() { data().Empty = true; }
 
   void setInitMethod(bool Val) { data().HasInitMethod = Val; }
   bool hasInitMethod() const { return data().HasInitMethod; }
@@ -1210,6 +1206,13 @@ class CXXRecordDecl : public RecordDecl {
     return D.HasPublicFields || D.HasProtectedFields || D.HasPrivateFields;
   }
 
+  /// If this is a standard-layout class or union, any and all data members will
+  /// be declared in the same type.
+  ///
+  /// This retrieves the type where any fields are declared,
+  /// or the current class if there is no class with fields.
+  const CXXRecordDecl *getStandardLayoutBaseWithFields() const;
+
   /// Whether this class is polymorphic (C++ [class.virtual]),
   /// which means that the class contains or inherits a virtual function.
   bool isPolymorphic() const { return data().Polymorphic; }
diff --git a/clang/include/clang/AST/DeclOpenMP.h b/clang/include/clang/AST/DeclOpenMP.h
index e542c3c8e66b0..868662208efa1 100644
--- a/clang/include/clang/AST/DeclOpenMP.h
+++ b/clang/include/clang/AST/DeclOpenMP.h
@@ -143,10 +143,10 @@ class OMPThreadPrivateDecl final : public OMPDeclarativeDirective<Decl> {
   unsigned varlist_size() const { return Data->getNumChildren(); }
   bool varlist_empty() const { return Data->getChildren().empty(); }
 
-  varlist_range varlists() {
+  varlist_range varlist() {
     return varlist_range(varlist_begin(), varlist_end());
   }
-  varlist_const_range varlists() const {
+  varlist_const_range varlist() const {
     return varlist_const_range(varlist_begin(), varlist_end());
   }
   varlist_iterator varlist_begin() { return getVars().begin(); }
@@ -513,10 +513,10 @@ class OMPAllocateDecl final : public OMPDeclarativeDirective<Decl> {
   unsigned clauselist_size() const { return Data->getNumClauses(); }
   bool clauselist_empty() const { return Data->getClauses().empty(); }
 
-  varlist_range varlists() {
+  varlist_range varlist() {
     return varlist_range(varlist_begin(), varlist_end());
   }
-  varlist_const_range varlists() const {
+  varlist_const_range varlist() const {
     return varlist_const_range(varlist_begin(), varlist_end());
   }
   varlist_iterator varlist_begin() { return getVars().begin(); }
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index 05811c46885a3..e2383925f2318 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -4854,15 +4854,7 @@ class CXXFoldExpr : public Expr {
   CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee,
               SourceLocation LParenLoc, Expr *LHS, BinaryOperatorKind Opcode,
               SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc,
-              std::optional<unsigned> NumExpansions)
-      : Expr(CXXFoldExprClass, T, VK_PRValue, OK_Ordinary),
-        LParenLoc(LParenLoc), EllipsisLoc(EllipsisLoc), RParenLoc(RParenLoc),
-        NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) {
-    SubExprs[SubExpr::Callee] = Callee;
-    SubExprs[SubExpr::LHS] = LHS;
-    SubExprs[SubExpr::RHS] = RHS;
-    setDependence(computeDependence(this));
-  }
+              std::optional<unsigned> NumExpansions);
 
   CXXFoldExpr(EmptyShell Empty) : Expr(CXXFoldExprClass, Empty) {}
 
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 325a1baa44614..b029c72fa7d8f 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -316,10 +316,10 @@ template <class T> class OMPVarListClause : public OMPClause {
   unsigned varlist_size() const { return NumVars; }
   bool varlist_empty() const { return NumVars == 0; }
 
-  varlist_range varlists() {
+  varlist_range varlist() {
     return varlist_range(varlist_begin(), varlist_end());
   }
-  varlist_const_range varlists() const {
+  varlist_const_range varlist() const {
     return varlist_const_range(varlist_begin(), varlist_end());
   }
 
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 0879f519c683d..922b8020c321a 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -1772,10 +1772,10 @@ DEF_TRAVERSE_DECL(UsingShadowDecl, {})
 DEF_TRAVERSE_DECL(ConstructorUsingShadowDecl, {})
 
 DEF_TRAVERSE_DECL(OMPThreadPrivateDecl, {
-  for (auto *I : D->varlists()) {
+  for (auto *I : D->varlist()) {
     TRY_TO(TraverseStmt(I));
   }
- })
+})
 
 DEF_TRAVERSE_DECL(OMPRequiresDecl, {
   for (auto *C : D->clauselists()) {
@@ -1801,7 +1801,7 @@ DEF_TRAVERSE_DECL(OMPDeclareMapperDecl, {
 DEF_TRAVERSE_DECL(OMPCapturedExprDecl, { TRY_TO(TraverseVarHelper(D)); })
 
 DEF_TRAVERSE_DECL(OMPAllocateDecl, {
-  for (auto *I : D->varlists())
+  for (auto *I : D->varlist())
     TRY_TO(TraverseStmt(I));
   for (auto *C : D->clauselists())
     TRY_TO(TraverseOMPClause(C));
@@ -3047,6 +3047,12 @@ DEF_TRAVERSE_STMT(OMPTileDirective,
 DEF_TRAVERSE_STMT(OMPUnrollDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPReverseDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPInterchangeDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 DEF_TRAVERSE_STMT(OMPForDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
@@ -3561,7 +3567,7 @@ bool RecursiveASTVisitor<Derived>::VisitOMPNocontextClause(
 template <typename Derived>
 template <typename T>
 bool RecursiveASTVisitor<Derived>::VisitOMPClauseList(T *Node) {
-  for (auto *E : Node->varlists()) {
+  for (auto *E : Node->varlist()) {
     TRY_TO(TraverseStmt(E));
   }
   return true;
@@ -3935,7 +3941,7 @@ template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPAffinityClause(
     OMPAffinityClause *C) {
   TRY_TO(TraverseStmt(C->getModifier()));
-  for (Expr *E : C->varlists())
+  for (Expr *E : C->varlist())
     TRY_TO(TraverseStmt(E));
   return true;
 }
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index f735fa5643aec..f313c480f9a08 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -281,15 +281,6 @@ class OMPExecutableDirective : public Stmt {
     return Data->getClauses();
   }
 
-  /// Was this directive mapped from an another directive?
-  /// e.g. 1) omp loop bind(parallel) is mapped to OMPD_for
-  ///      2) omp loop bind(teams) is mapped to OMPD_distribute
-  ///      3) omp loop bind(thread) is mapped to OMPD_simd
-  /// It was necessary to note it down in the Directive because of
-  /// clang::TreeTransform::TransformOMPExecutableDirective() pass in
-  /// the frontend.
-  OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown;
-
 protected:
   /// Data, associated with the directive.
   OMPChildren *Data = nullptr;
@@ -354,10 +345,6 @@ class OMPExecutableDirective : public Stmt {
     return Inst;
   }
 
-  void setMappedDirective(OpenMPDirectiveKind MappedDirective) {
-    PrevMappedDirective = MappedDirective;
-  }
-
 public:
   /// Iterates over expressions/statements used in the construct.
   class used_clauses_child_iterator
@@ -611,8 +598,6 @@ class OMPExecutableDirective : public Stmt {
            "Expected directive with the associated statement.");
     return Data->getRawStmt();
   }
-
-  OpenMPDirectiveKind getMappedDirective() const { return PrevMappedDirective; }
 };
 
 /// This represents '#pragma omp parallel' directive.
@@ -1007,8 +992,9 @@ class OMPLoopTransformationDirective : public OMPLoopBasedDirective {
   Stmt *getPreInits() const;
 
   static bool classof(const Stmt *T) {
-    return T->getStmtClass() == OMPTileDirectiveClass ||
-           T->getStmtClass() == OMPUnrollDirectiveClass;
+    Stmt::StmtClass C = T->getStmtClass();
+    return C == OMPTileDirectiveClass || C == OMPUnrollDirectiveClass ||
+           C == OMPReverseDirectiveClass || C == OMPInterchangeDirectiveClass;
   }
 };
 
@@ -1619,8 +1605,7 @@ class OMPSimdDirective : public OMPLoopDirective {
                                   SourceLocation EndLoc, unsigned CollapsedNum,
                                   ArrayRef<OMPClause *> Clauses,
                                   Stmt *AssociatedStmt,
-                                  const HelperExprs &Exprs,
-                                  OpenMPDirectiveKind ParamPrevMappedDirective);
+                                  const HelperExprs &Exprs);
 
   /// Creates an empty directive with the place
   /// for \a NumClauses clauses.
@@ -1698,8 +1683,7 @@ class OMPForDirective : public OMPLoopDirective {
                                  SourceLocation EndLoc, unsigned CollapsedNum,
                                  ArrayRef<OMPClause *> Clauses,
                                  Stmt *AssociatedStmt, const HelperExprs &Exprs,
-                                 Expr *TaskRedRef, bool HasCancel,
-                                 OpenMPDirectiveKind ParamPrevMappedDirective);
+                                 Expr *TaskRedRef, bool HasCancel);
 
   /// Creates an empty directive with the place
   /// for \a NumClauses clauses.
@@ -4477,8 +4461,7 @@ class OMPDistributeDirective : public OMPLoopDirective {
   static OMPDistributeDirective *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
          unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
-         Stmt *AssociatedStmt, const HelperExprs &Exprs,
-         OpenMPDirectiveKind ParamPrevMappedDirective);
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
 
   /// Creates an empty directive with the place
   /// for \a NumClauses clauses.
@@ -5711,6 +5694,144 @@ class OMPUnrollDirective final : public OMPLoopTransformationDirective {
   }
 };
 
+/// Represents the '#pragma omp reverse' loop transformation directive.
+///
+/// \code
+/// #pragma omp reverse
+/// for (int i = 0; i < n; ++i)
+///   ...
+/// \endcode
+class OMPReverseDirective final : public OMPLoopTransformationDirective {
+  friend class ASTStmtReader;
+  friend class OMPExecutableDirective;
+
+  /// Offsets of child members.
+  enum {
+    PreInitsOffset = 0,
+    TransformedStmtOffset,
+  };
+
+  explicit OMPReverseDirective(SourceLocation StartLoc, SourceLocation EndLoc)
+      : OMPLoopTransformationDirective(OMPReverseDirectiveClass,
+                                       llvm::omp::OMPD_reverse, StartLoc,
+                                       EndLoc, 1) {}
+
+  void setPreInits(Stmt *PreInits) {
+    Data->getChildren()[PreInitsOffset] = PreInits;
+  }
+
+  void setTransformedStmt(Stmt *S) {
+    Data->getChildren()[TransformedStmtOffset] = S;
+  }
+
+public:
+  /// Create a new AST node representation for '#pragma omp reverse'.
+  ///
+  /// \param C         Context of the AST.
+  /// \param StartLoc  Location of the introducer (e.g. the 'omp' token).
+  /// \param EndLoc    Location of the directive's end (e.g. the tok::eod).
+  /// \param AssociatedStmt  The outermost associated loop.
+  /// \param TransformedStmt The loop nest after tiling, or nullptr in
+  ///                        dependent contexts.
+  /// \param PreInits   Helper preinits statements for the loop nest.
+  static OMPReverseDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits);
+
+  /// Build an empty '#pragma omp reverse' AST node for deserialization.
+  ///
+  /// \param C          Context of the AST.
+  /// \param NumClauses Number of clauses to allocate.
+  static OMPReverseDirective *CreateEmpty(const ASTContext &C);
+
+  /// Gets/sets the associated loops after the transformation, i.e. after
+  /// de-sugaring.
+  Stmt *getTransformedStmt() const {
+    return Data->getChildren()[TransformedStmtOffset];
+  }
+
+  /// Return preinits statement.
+  Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPReverseDirectiveClass;
+  }
+};
+
+/// Represents the '#pragma omp interchange' loop transformation directive.
+///
+/// \code{c}
+///   #pragma omp interchange
+///   for (int i = 0; i < m; ++i)
+///     for (int j = 0; j < n; ++j)
+///       ..
+/// \endcode
+class OMPInterchangeDirective final : public OMPLoopTransformationDirective {
+  friend class ASTStmtReader;
+  friend class OMPExecutableDirective;
+
+  /// Offsets of child members.
+  enum {
+    PreInitsOffset = 0,
+    TransformedStmtOffset,
+  };
+
+  explicit OMPInterchangeDirective(SourceLocation StartLoc,
+                                   SourceLocation EndLoc, unsigned NumLoops)
+      : OMPLoopTransformationDirective(OMPInterchangeDirectiveClass,
+                                       llvm::omp::OMPD_interchange, StartLoc,
+                                       EndLoc, NumLoops) {
+    setNumGeneratedLoops(3 * NumLoops);
+  }
+
+  void setPreInits(Stmt *PreInits) {
+    Data->getChildren()[PreInitsOffset] = PreInits;
+  }
+
+  void setTransformedStmt(Stmt *S) {
+    Data->getChildren()[TransformedStmtOffset] = S;
+  }
+
+public:
+  /// Create a new AST node representation for '#pragma omp interchange'.
+  ///
+  /// \param C         Context of the AST.
+  /// \param StartLoc  Location of the introducer (e.g. the 'omp' token).
+  /// \param EndLoc    Location of the directive's end (e.g. the tok::eod).
+  /// \param Clauses   The directive's clauses.
+  /// \param NumLoops  Number of affected loops
+  ///                  (number of items in the 'permutation' clause if present).
+  /// \param AssociatedStmt  The outermost associated loop.
+  /// \param TransformedStmt The loop nest after tiling, or nullptr in
+  ///                        dependent contexts.
+  /// \param PreInits  Helper preinits statements for the loop nest.
+  static OMPInterchangeDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         ArrayRef<OMPClause *> Clauses, unsigned NumLoops, Stmt *AssociatedStmt,
+         Stmt *TransformedStmt, Stmt *PreInits);
+
+  /// Build an empty '#pragma omp interchange' AST node for deserialization.
+  ///
+  /// \param C          Context of the AST.
+  /// \param NumClauses Number of clauses to allocate.
+  /// \param NumLoops   Number of associated loops to allocate.
+  static OMPInterchangeDirective *
+  CreateEmpty(const ASTContext &C, unsigned NumClauses, unsigned NumLoops);
+
+  /// Gets the associated loops after the transformation. This is the de-sugared
+  /// replacement or nullptr in dependent contexts.
+  Stmt *getTransformedStmt() const {
+    return Data->getChildren()[TransformedStmtOffset];
+  }
+
+  /// Return preinits statement.
+  Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPInterchangeDirectiveClass;
+  }
+};
+
 /// This represents '#pragma omp scan' directive.
 ///
 /// \code
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 924b19649115e..6339d3adff4e3 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2509,6 +2509,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFunctionNoProtoType() const { return getAs<FunctionNoProtoType>(); }
   bool isFunctionProtoType() const { return getAs<FunctionProtoType>(); }
   bool isPointerType() const;
+  bool isPointerOrReferenceType() const;
   bool isSignableType() const;
   bool isAnyPointerType() const;   // Any C pointer or ObjC object pointer
   bool isCountAttributedType() const;
@@ -4708,26 +4709,25 @@ class FunctionEffect {
   };
 
 private:
-  LLVM_PREFERRED_TYPE(Kind)
-  unsigned FKind : 3;
+  Kind FKind;
 
   // Expansion: for hypothetical TCB+types, there could be one Kind for TCB,
   // then ~16(?) bits "SubKind" to map to a specific named TCB. SubKind would
   // be considered for uniqueness.
 
 public:
-  FunctionEffect() : FKind(unsigned(Kind::None)) {}
+  FunctionEffect() : FKind(Kind::None) {}
 
-  explicit FunctionEffect(Kind K) : FKind(unsigned(K)) {}
+  explicit FunctionEffect(Kind K) : FKind(K) {}
 
   /// The kind of the effect.
-  Kind kind() const { return Kind(FKind); }
+  Kind kind() const { return FKind; }
 
   /// Return the opposite kind, for effects which have opposites.
   Kind oppositeKind() const;
 
   /// For serialization.
-  uint32_t toOpaqueInt32() const { return FKind; }
+  uint32_t toOpaqueInt32() const { return uint32_t(FKind); }
   static FunctionEffect fromOpaqueInt32(uint32_t Value) {
     return FunctionEffect(Kind(Value));
   }
@@ -8007,6 +8007,10 @@ inline bool Type::isPointerType() const {
   return isa<PointerType>(CanonicalType);
 }
 
+inline bool Type::isPointerOrReferenceType() const {
+  return isPointerType() || isReferenceType();
+}
+
 inline bool Type::isAnyPointerType() const {
   return isPointerType() || isObjCObjectPointerType();
 }
@@ -8462,7 +8466,7 @@ inline bool Type::isUndeducedType() const {
 /// Determines whether this is a type for which one can define
 /// an overloaded operator.
 inline bool Type::isOverloadableType() const {
-  if (!CanonicalType->isDependentType())
+  if (!isDependentType())
     return isRecordType() || isEnumeralType();
   return !isArrayType() && !isFunctionType() && !isAnyPointerType() &&
          !isMemberPointerType();
diff --git a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
index 420f13ce11bfd..5c64e5b094749 100644
--- a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
+++ b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
@@ -18,6 +18,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/Stmt.h"
 #include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Error.h"
@@ -27,6 +28,20 @@
 namespace clang {
 namespace dataflow {
 
+namespace internal {
+class StmtToBlockMap {
+public:
+  StmtToBlockMap(const CFG &Cfg);
+
+  const CFGBlock *lookup(const Stmt &S) const {
+    return StmtToBlock.lookup(&ignoreCFGOmittedNodes(S));
+  }
+
+private:
+  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+};
+} // namespace internal
+
 /// Holds CFG with additional information derived from it that is needed to
 /// perform dataflow analysis.
 class AdornedCFG {
@@ -48,9 +63,10 @@ class AdornedCFG {
   /// Returns the CFG that is stored in this context.
   const CFG &getCFG() const { return *Cfg; }
 
-  /// Returns a mapping from statements to basic blocks that contain them.
-  const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
-    return StmtToBlock;
+  /// Returns the basic block that contains `S`, or null if no basic block
+  /// containing `S` is found.
+  const CFGBlock *blockForStmt(const Stmt &S) const {
+    return StmtToBlock.lookup(S);
   }
 
   /// Returns whether `B` is reachable from the entry block.
@@ -73,8 +89,7 @@ class AdornedCFG {
 private:
   AdornedCFG(
       const Decl &D, std::unique_ptr<CFG> Cfg,
-      llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
-      llvm::BitVector BlockReachable,
+      internal::StmtToBlockMap StmtToBlock, llvm::BitVector BlockReachable,
       llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
       : ContainingDecl(D), Cfg(std::move(Cfg)),
         StmtToBlock(std::move(StmtToBlock)),
@@ -85,7 +100,7 @@ class AdornedCFG {
   /// The `Decl` containing the statement used to construct the CFG.
   const Decl &ContainingDecl;
   std::unique_ptr<CFG> Cfg;
-  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+  internal::StmtToBlockMap StmtToBlock;
   llvm::BitVector BlockReachable;
   llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
 };
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 50a7018872061..e6efde091871f 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -233,7 +233,7 @@ llvm::Expected<std::vector<
     std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
 runDataflowAnalysis(const AdornedCFG &ACFG, AnalysisT &Analysis,
                     const Environment &InitEnv,
-                    CFGEltCallbacks<AnalysisT> PostAnalysisCallbacks,
+                    CFGEltCallbacks<AnalysisT> PostAnalysisCallbacks = {},
                     std::int32_t MaxBlockVisits = kDefaultMaxBlockVisits) {
   CFGEltCallbacksTypeErased TypeErasedCallbacks;
   if (PostAnalysisCallbacks.Before) {
@@ -286,22 +286,6 @@ runDataflowAnalysis(const AdornedCFG &ACFG, AnalysisT &Analysis,
   return std::move(BlockStates);
 }
 
-/// Overload that takes only one post-analysis callback, which is run on the
-/// state after visiting the `CFGElement`. This is provided for backwards
-/// compatibility; new callers should call the overload taking `CFGEltCallbacks`
-/// instead.
-template <typename AnalysisT>
-llvm::Expected<std::vector<
-    std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
-runDataflowAnalysis(
-    const AdornedCFG &ACFG, AnalysisT &Analysis, const Environment &InitEnv,
-    CFGEltCallback<AnalysisT> PostAnalysisCallbackAfterElt = nullptr,
-    std::int32_t MaxBlockVisits = kDefaultMaxBlockVisits) {
-  return runDataflowAnalysis(ACFG, Analysis, InitEnv,
-                             {nullptr, PostAnalysisCallbackAfterElt},
-                             MaxBlockVisits);
-}
-
 // Create an analysis class that is derived from `DataflowAnalysis`. This is an
 // SFINAE adapter that allows us to call two different variants of constructor
 // (either with or without the optional `Environment` parameter).
diff --git a/clang/include/clang/Analysis/FlowSensitive/MapLattice.h b/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
index 16b0c978779a7..b2d147e4ae444 100644
--- a/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
@@ -49,7 +49,7 @@ template <typename Key, typename ElementLattice> class MapLattice {
 
   MapLattice() = default;
 
-  explicit MapLattice(Container C) { C = std::move(C); }
+  explicit MapLattice(Container C) : C{std::move(C)} {};
 
   // The `bottom` element is the empty map.
   static MapLattice bottom() { return MapLattice(); }
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index fcdb0cb0144b3..71180f8765c6b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -489,6 +489,9 @@ def TargetELF : TargetSpec {
 def TargetELFOrMachO : TargetSpec {
   let ObjectFormats = ["ELF", "MachO"];
 }
+def TargetWindowsArm64EC : TargetSpec {
+  let CustomCode = [{ Target.getTriple().isWindowsArm64EC() }];
+}
 
 def TargetSupportsInitPriority : TargetSpec {
   let CustomCode = [{ !Target.getTriple().isOSzOS() }];
@@ -943,7 +946,7 @@ def PatchableFunctionEntry
     : InheritableAttr,
       TargetSpecificAttr<TargetArch<
           ["aarch64", "aarch64_be", "loongarch32", "loongarch64", "riscv32",
-           "riscv64", "x86", "x86_64"]>> {
+           "riscv64", "x86", "x86_64", "ppc", "ppc64"]>> {
   let Spellings = [GCC<"patchable_function_entry">];
   let Subjects = SubjectList<[Function, ObjCMethod]>;
   let Args = [UnsignedArgument<"Count">, DefaultIntArgument<"Offset", 0>];
@@ -2989,6 +2992,17 @@ def Convergent : InheritableAttr {
   let SimpleHandler = 1;
 }
 
+def NoConvergent : InheritableAttr {
+  let Spellings = [Clang<"noconvergent">, Declspec<"noconvergent">];
+  let Subjects = SubjectList<[Function, Stmt], WarnDiag,
+                             "functions and statements">;
+  let LangOpts = [CUDA];
+  let Documentation = [NoConvergentDocs];
+  let SimpleHandler = 1;
+}
+
+def : MutualExclusions<[Convergent, NoConvergent]>;
+
 def NoInline : DeclOrStmtAttr {
   let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">,
                    CXX11<"clang", "noinline">, C23<"clang", "noinline">,
@@ -5052,6 +5066,12 @@ def SelectAny : InheritableAttr {
   let SimpleHandler = 1;
 }
 
+def HybridPatchable : InheritableAttr, TargetSpecificAttr<TargetWindowsArm64EC> {
+  let Spellings = [Declspec<"hybrid_patchable">, Clang<"hybrid_patchable">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [HybridPatchableDocs];
+}
+
 def Thread : Attr {
   let Spellings = [Declspec<"thread">];
   let LangOpts = [MicrosoftExt];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index d7f8f6c62a663..482a70949ec76 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1435,6 +1435,34 @@ Sample usage:
   }];
 }
 
+def NoConvergentDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+This attribute prevents a function from being treated as convergent, which
+means that optimizations can only move calls to that function to
+control-equivalent blocks. If a statement is marked as ``noconvergent`` and
+contains calls, it also prevents those calls from being treated as convergent.
+In other words, those calls are not restricted to only being moved to
+control-equivalent blocks.
+
+In languages following SPMD/SIMT programming model, e.g., CUDA/HIP, function
+declarations and calls are treated as convergent by default for correctness.
+This ``noconvergent`` attribute is helpful for developers to prevent them from
+being treated as convergent when it's safe.
+
+.. code-block:: c
+
+  __device__ float bar(float);
+  __device__ float foo(float) __attribute__((noconvergent)) {}
+
+  __device__ int example(void) {
+    float x;
+    [[clang::noconvergent]] x = bar(x);
+  }
+
+  }];
+}
+
 def NoSplitStackDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
@@ -4068,7 +4096,7 @@ For example:
   typedef vint8m1_t fixed_vint8m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
   #endif
 
-Creates a type ``fixed_vint8m1_t_t`` that is a fixed-length variant of
+Creates a type ``fixed_vint8m1_t`` that is a fixed-length variant of
 ``vint8m1_t`` that contains exactly 512 bits. Unlike ``vint8m1_t``, this type
 can be used in globals, structs, unions, and arrays, all of which are
 unsupported for sizeless types.
@@ -7511,7 +7539,8 @@ takes precedence over the command line option ``-fpatchable-function-entry=N,M``
 ``M`` defaults to 0 if omitted.
 
 This attribute is only supported on
-aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64 targets.
+aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64/ppc/ppc64 targets.
+For ppc/ppc64 targets, AIX is still not supported.
 }];
 }
 
@@ -7695,6 +7724,16 @@ For more information see
 or `msvc documentation <https://docs.microsoft.com/pl-pl/cpp/cpp/selectany>`_.
 }]; }
 
+def HybridPatchableDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``hybrid_patchable`` attribute declares an ARM64EC function with an additional
+x86-64 thunk, which may be patched at runtime.
+
+For more information see
+`ARM64EC ABI documentation <https://learn.microsoft.com/en-us/windows/arm/arm64ec-abi>`_.
+}]; }
+
 def WebAssemblyExportNameDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h
index 5f024b4b5fd78..cdf9dcaff7508 100644
--- a/clang/include/clang/Basic/AttributeCommonInfo.h
+++ b/clang/include/clang/Basic/AttributeCommonInfo.h
@@ -191,6 +191,12 @@ class AttributeCommonInfo {
   /// __gnu__::__attr__ will be normalized to gnu::attr).
   std::string getNormalizedFullName() const;
 
+  /// Generate a normalized full name, with syntax, scope and name.
+  static std::string
+  normalizeFullNameWithSyntax(const IdentifierInfo *Name,
+                              const IdentifierInfo *Scope,
+                              AttributeCommonInfo::Syntax SyntaxUsed);
+
   bool isDeclspecAttribute() const { return SyntaxUsed == AS_Declspec; }
   bool isMicrosoftAttribute() const { return SyntaxUsed == AS_Microsoft; }
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 9e4407eeb0126..78e59050eeb6f 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -83,11 +83,11 @@ class BitInt_Long_LongLongTemplate :
 // - _Constant: Argument has to constant-fold to an integer constant expression
 
 // __fp16 and __float128 builtin variants of libc/libm functions.
-def AcosF128 : Builtin {
-  let Spellings = ["__builtin_acosf128"];
+def AcosF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_acos"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def AcoshF128 : Builtin {
@@ -97,11 +97,11 @@ def AcoshF128 : Builtin {
   let Prototype = "__float128(__float128)";
 }
 
-def AsinF128 : Builtin {
-  let Spellings = ["__builtin_asinf128"];
+def AsinF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_asin"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def AsinhF128 : Builtin {
@@ -111,11 +111,11 @@ def AsinhF128 : Builtin {
   let Prototype = "__float128(__float128)";
 }
 
-def AtanF128 : Builtin {
-  let Spellings = ["__builtin_atanf128"];
+def AtanF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_atan"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def AtanhF128 : Builtin {
@@ -143,10 +143,10 @@ def CosF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def CoshF128 : Builtin {
-  let Spellings = ["__builtin_coshf128"];
+def CoshF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_cosh"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def ErfF128 : Builtin {
@@ -468,11 +468,11 @@ def SinF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def SinhF128 : Builtin {
-  let Spellings = ["__builtin_sinhf128"];
+def SinhF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_sinh"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def SqrtF16F128 : Builtin, F16F128MathTemplate {
@@ -489,11 +489,11 @@ def TanF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def TanhF128 : Builtin {
-  let Spellings = ["__builtin_tanhf128"];
+def TanhF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_tanh"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def TgammaF128 : Builtin {
@@ -4654,13 +4654,13 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
 
 // HLSL
 def HLSLAll : LangBuiltin<"HLSL_LANG"> {
-  let Spellings = ["__builtin_hlsl_elementwise_all"];
+  let Spellings = ["__builtin_hlsl_all"];
   let Attributes = [NoThrow, Const];
   let Prototype = "bool(...)";
 }
 
 def HLSLAny : LangBuiltin<"HLSL_LANG"> {
-  let Spellings = ["__builtin_hlsl_elementwise_any"];
+  let Spellings = ["__builtin_hlsl_any"];
   let Attributes = [NoThrow, Const];
   let Prototype = "bool(...)";
 }
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8c9233d10dab..4f30002772254 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -149,12 +149,18 @@ BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
 BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
 
 BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vcQbiiIi", "n")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vsQbiiIi", "n")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "viQbiiIi", "n")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b64, "vV2iQbiiIi", "n")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b96, "vV3iQbiiIi", "n")
-BUILTIN(__builtin_amdgcn_raw_buffer_store_b128, "vV4iQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b64, "vV2UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b96, "vV3UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_store_b128, "vV4UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b8, "UcQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b16, "UsQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b32, "UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 
 //===----------------------------------------------------------------------===//
 // Ballot builtins.
@@ -450,8 +456,12 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
 
 //===----------------------------------------------------------------------===//
 // WMMA builtins.
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 2a45f8a6582a2..df304a71e475e 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -201,6 +201,7 @@ TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hif", "nc", "half-precision")
 
 // Reference Types builtins
 // Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index a85e7918f4d7e..06ca30d65f5bd 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -47,107 +47,8 @@ TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "")
 // doesn't work in the presence of re-declaration of _mm_prefetch for windows.
 TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
 TARGET_BUILTIN(__builtin_ia32_emms, "v", "n", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmulhw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmullw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd, "V2iV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pand, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pandn, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_por, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pxor, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psraw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrad, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrawi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psradi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packsswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packssdw, "V4sV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packuswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhdq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckldq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_movntq, "vV1Oi*V1Oi", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v2si, "V2iii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v4hi, "V4sssss", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v8qi, "V8ccccccccc", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v2si, "iV2ii", "ncV:64:", "mmx")
-
-// MMX2 (MMX+SSE) intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtpi2ps, "V4fV4fV2i", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgb, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse")
-
-// MMX+SSE2
-TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpi2pd, "V2dV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_paddq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq, "V1OiV2iV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_psubq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-
-// MMX+SSSE3
-TARGET_BUILTIN(__builtin_ia32_pabsb, "V8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsd, "V2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsw, "V4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_palignr, "V8cV8cV8cIc", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pshufb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse")
 
 // SSE intrinsics.
 TARGET_BUILTIN(__builtin_ia32_comieq, "iV4fV4f", "ncV:128:", "sse")
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index be2884913355b..2e92c6ae23d7f 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -235,6 +235,7 @@ ENUM_CODEGENOPT(StructReturnConvention, StructReturnConventionKind, 2, SRCK_Defa
 
 CODEGENOPT(RelaxAll          , 1, 0) ///< Relax all machine code instructions.
 CODEGENOPT(RelaxedAliasing   , 1, 0) ///< Set when -fno-strict-aliasing is enabled.
+CODEGENOPT(PointerTBAA, 1, 0)        ///< Whether or not to use distinct TBAA tags for pointers.
 CODEGENOPT(StructPathTBAA    , 1, 0) ///< Whether or not to use struct-path TBAA.
 CODEGENOPT(NewStructPathTBAA , 1, 0) ///< Whether or not to use enhanced struct-path TBAA.
 CODEGENOPT(SaveTempLabels    , 1, 0) ///< Save temporary labels.
@@ -431,6 +432,9 @@ CODEGENOPT(ForceAAPCSBitfieldLoad, 1, 0)
 /// Assume that by-value parameters do not alias any other values.
 CODEGENOPT(PassByValueIsNoAlias, 1, 0)
 
+/// Whether to store register parameters to stack.
+CODEGENOPT(SaveRegParams, 1, 0)
+
 /// Whether to not follow the AAPCS that enforces volatile bit-field access width to be
 /// according to the field declaring type width.
 CODEGENOPT(AAPCSBitfieldWidth, 1, 1)
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index fe950ec834d2f..68b0bf529bc93 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -167,6 +167,8 @@ def err_drv_unsupported_rtlib_for_platform : Error<
   "unsupported runtime library '%0' for platform '%1'">;
 def err_drv_invalid_unwindlib_name : Error<
   "invalid unwind library name in argument '%0'">;
+def err_drv_unsupported_unwind_for_platform : Error<
+  "unsupported unwind library '%0' for platform '%1'">;
 def err_drv_incompatible_unwindlib : Error<
   "--rtlib=libgcc requires --unwindlib=libgcc">;
 def err_drv_incompatible_fp_accuracy_options : Error<
@@ -505,6 +507,10 @@ def warn_drv_deprecated_arg : Warning<
 def warn_drv_deprecated_arg_no_relaxed_template_template_args : Warning<
   "argument '-fno-relaxed-template-template-args' is deprecated">,
   InGroup<DeprecatedNoRelaxedTemplateTemplateArgs>;
+def warn_drv_deprecated_arg_ofast : Warning<
+  "argument '-Ofast' is deprecated; use '-O3 -ffast-math' for the same behavior,"
+  " or '-O3' to enable only conforming optimizations">,
+  InGroup<DeprecatedOFast>;
 def warn_drv_deprecated_custom : Warning<
   "argument '%0' is deprecated, %1">, InGroup<Deprecated>;
 def warn_drv_assuming_mfloat_abi_is : Warning<
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 6bde60b57bf10..2c4220842913a 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -288,6 +288,9 @@ def err_function_needs_feature : Error<
 let CategoryName = "Codegen ABI Check" in {
 def err_function_always_inline_attribute_mismatch : Error<
   "always_inline function %1 and its caller %0 have mismatching %2 attributes">;
+def warn_function_always_inline_attribute_mismatch : Warning<
+  "always_inline function %1 and its caller %0 have mismatching %2 attributes, "
+  "inlining may change runtime behaviour">, InGroup<AArch64SMEAttributes>;
 def err_function_always_inline_new_za : Error<
   "always_inline function %0 has new za state">;
 
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 1777843eb9973..763fa9081b418 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -103,6 +103,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
                                 EnumFloatConversion,
                                 EnumCompareConditional]>;
 def DeprecatedNoRelaxedTemplateTemplateArgs : DiagGroup<"deprecated-no-relaxed-template-template-args">;
+def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
 def ObjCSignedCharBoolImplicitIntConversion :
   DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
 def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
@@ -230,6 +231,7 @@ def Deprecated : DiagGroup<"deprecated", [DeprecatedAnonEnumEnumConversion,
                                           DeprecatedPragma,
                                           DeprecatedRegister,
                                           DeprecatedNoRelaxedTemplateTemplateArgs,
+                                          DeprecatedOFast,
                                           DeprecatedThisCapture,
                                           DeprecatedType,
                                           DeprecatedVolatile,
@@ -453,6 +455,7 @@ def LogicalNotParentheses: DiagGroup<"logical-not-parentheses">;
 def ShiftOpParentheses: DiagGroup<"shift-op-parentheses">;
 def OverloadedShiftOpParentheses: DiagGroup<"overloaded-shift-op-parentheses">;
 def DanglingAssignment: DiagGroup<"dangling-assignment">;
+def DanglingAssignmentGsl : DiagGroup<"dangling-assignment-gsl">;
 def DanglingElse: DiagGroup<"dangling-else">;
 def DanglingField : DiagGroup<"dangling-field">;
 def DanglingInitializerList : DiagGroup<"dangling-initializer-list">;
@@ -461,6 +464,7 @@ def ReturnStackAddress : DiagGroup<"return-stack-address">;
 // Name of this warning in GCC
 def : DiagGroup<"return-local-addr", [ReturnStackAddress]>;
 def Dangling : DiagGroup<"dangling", [DanglingAssignment,
+                                      DanglingAssignmentGsl,
                                       DanglingField,
                                       DanglingInitializerList,
                                       DanglingGsl,
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 12aab09f28556..f8d50d12bb935 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1260,6 +1260,9 @@ def warn_pragma_intrinsic_builtin : Warning<
 def warn_pragma_unused_expected_var : Warning<
   "expected '#pragma unused' argument to be a variable name">,
   InGroup<IgnoredPragmas>;
+// - #pragma mc_func
+def err_pragma_mc_func_not_supported :
+   Error<"#pragma mc_func is not supported">;
 // - #pragma init_seg
 def warn_pragma_init_seg_unsupported_target : Warning<
   "'#pragma init_seg' is only supported when targeting a "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 59e671caf8c68..856a8f676fa45 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1010,6 +1010,9 @@ def warn_ptrauth_auth_null_pointer :
   InGroup<PtrAuthNullPointers>;
 def err_ptrauth_string_not_literal : Error<
   "argument must be a string literal%select{| of char type}0">;
+def err_ptrauth_type_disc_undiscriminated : Error<
+  "cannot pass undiscriminated type %0 to "
+  "'__builtin_ptrauth_type_discriminator'">;
 
 def note_ptrauth_virtual_function_pointer_incomplete_arg_ret :
   Note<"cannot take an address of a virtual member function if its return or "
@@ -1017,6 +1020,9 @@ def note_ptrauth_virtual_function_pointer_incomplete_arg_ret :
 def note_ptrauth_virtual_function_incomplete_arg_ret_type :
   Note<"%0 is incomplete">;
 
+def err_ptrauth_indirect_goto_addrlabel_arithmetic : Error<
+  "%select{subtraction|addition}0 of address-of-label expressions is not "
+  "supported with ptrauth indirect gotos">;
 
 /// main()
 // static main() is not an error in C, just in C++.
@@ -3398,10 +3404,16 @@ def err_attribute_invalid_implicit_this_argument : Error<
   "%0 attribute is invalid for the implicit this argument">;
 def err_ownership_type : Error<
   "%0 attribute only applies to %select{pointer|integer}1 arguments">;
+def err_ownership_takes_return_type : Error<
+  "'ownership_returns' attribute only applies to functions that return a pointer">;
 def err_ownership_returns_index_mismatch : Error<
   "'ownership_returns' attribute index does not match; here it is %0">;
 def note_ownership_returns_index_mismatch : Note<
   "declared with index %0 here">;
+def err_ownership_takes_class_mismatch : Error<
+  "'ownership_takes' attribute class does not match; here it is '%0'">;
+def note_ownership_takes_class_mismatch : Note<
+  "declared with class '%0' here">;
 def err_format_strftime_third_parameter : Error<
   "strftime format attribute requires 3rd parameter to be 0">;
 def err_format_attribute_not : Error<"format argument not a string type">;
@@ -3566,7 +3578,7 @@ def err_attr_tlsmodel_arg : Error<"tls_model must be \"global-dynamic\", "
 
 def err_attr_codemodel_arg : Error<"code model '%0' is not supported on this target">;
 
-def err_aix_attr_unsupported_tls_model : Error<"TLS model '%0' is not yet supported on AIX">;
+def err_aix_attr_unsupported : Error<"%0 attribute is not yet supported on AIX">;
 
 def err_tls_var_aligned_over_maximum : Error<
   "alignment (%0) of thread-local variable %1 is greater than the maximum supported "
@@ -3752,6 +3764,9 @@ def err_attribute_weak_static : Error<
   "weak declaration cannot have internal linkage">;
 def err_attribute_selectany_non_extern_data : Error<
   "'selectany' can only be applied to data items with external linkage">;
+def warn_attribute_hybrid_patchable_non_extern : Warning<
+  "'hybrid_patchable' is ignored on functions without external linkage">,
+  InGroup<IgnoredAttributes>;
 def err_declspec_thread_on_thread_variable : Error<
   "'__declspec(thread)' applied to variable that already has a "
   "thread-local storage specifier">;
@@ -3885,8 +3900,6 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning<
   InGroup<AArch64SMEAttributes>, DefaultIgnore;
 def err_conflicting_attributes_arm_state : Error<
   "conflicting attributes for state '%0'">;
-def err_sme_streaming_cannot_be_multiversioned : Error<
-  "streaming function cannot be multi-versioned">;
 def err_unknown_arm_state : Error<
   "unknown state '%0'">;
 def err_missing_arm_state : Error<
@@ -5250,7 +5263,7 @@ def warn_cxx11_compat_variable_template : Warning<
   InGroup<CXXPre14Compat>, DefaultIgnore;
 def err_template_variable_noparams : Error<
   "extraneous 'template<>' in declaration of variable %0">;
-def err_template_member : Error<"member %0 declared as a template">;
+def err_template_member : Error<"non-static data member %0 cannot be declared as a template">;
 def err_member_with_template_arguments : Error<"member %0 cannot have template arguments">;
 def err_template_member_noparams : Error<
   "extraneous 'template<>' in declaration of member %0">;
@@ -7685,9 +7698,6 @@ def err_nested_non_static_member_use : Error<
 def warn_cxx98_compat_non_static_member_use : Warning<
   "use of non-static data member %0 in an unevaluated context is "
   "incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
-def err_form_ptr_to_member_from_parenthesized_expr : Error<
-  "cannot form pointer to member from a parenthesized expression; "
-  "did you mean to remove the parentheses?">;
 def err_invalid_incomplete_type_use : Error<
   "invalid use of incomplete type %0">;
 def err_builtin_func_cast_more_than_one_arg : Error<
@@ -10227,6 +10237,9 @@ def warn_dangling_lifetime_pointer : Warning<
   "object backing the pointer "
   "will be destroyed at the end of the full-expression">,
   InGroup<DanglingGsl>;
+def warn_dangling_lifetime_pointer_assignment : Warning<"object backing the "
+  "pointer %0 will be destroyed at the end of the full-expression">,
+  InGroup<DanglingAssignmentGsl>;
 def warn_new_dangling_initializer_list : Warning<
   "array backing "
   "%select{initializer list subobject of the allocated object|"
diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td
index eb27de5921d6a..51d0abbbec252 100644
--- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td
@@ -50,14 +50,14 @@ def warn_pch_vfsoverlay_mismatch : Warning<
 def note_pch_vfsoverlay_files : Note<"%select{PCH|current translation unit}0 has the following VFS overlays:\n%1">;
 def note_pch_vfsoverlay_empty : Note<"%select{PCH|current translation unit}0 has no VFS overlays">;
 
-def err_pch_version_too_old : Error<
-    "PCH file uses an older PCH format that is no longer supported">;
-def err_pch_version_too_new : Error<
-    "PCH file uses a newer PCH format that cannot be read">;
-def err_pch_different_branch : Error<
-    "PCH file built from a different branch (%0) than the compiler (%1)">;
-def err_pch_with_compiler_errors : Error<
-    "PCH file contains compiler errors">;
+def err_ast_file_version_too_old : Error<
+    "%select{PCH|module|AST}0 file '%1' uses an older PCH format that is no longer supported">;
+def err_ast_file_version_too_new : Error<
+    "%select{PCH|module|AST}0 file '%1' uses a newer PCH format that cannot be read">;
+def err_ast_file_different_branch : Error<
+    "%select{PCH|module|AST}0 file '%1' built from a different branch (%2) than the compiler (%3)">;
+def err_ast_file_with_compiler_errors : Error<
+    "%select{PCH|module|AST}0 file '%1' contains compiler errors">;
 
 def err_module_file_conflict : Error<
   "module '%0' is defined in both '%1' and '%2'">, DefaultFatal;
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 2f864ff1c0edf..dc71ef8f98692 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -108,9 +108,11 @@ FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns)
 FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination)
 FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
+FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
 FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
 FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
+FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 EXTENSION(swiftcc,
   PP.getTargetInfo().checkCallingConvention(CC_Swift) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 3ea0dfada1ecd..a68d140bf104a 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -165,9 +165,11 @@ LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library fea
 LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
 LANGOPT(PointerAuthCalls  , 1, 0, "function pointer authentication")
 LANGOPT(PointerAuthReturns, 1, 0, "return pointer authentication")
+LANGOPT(PointerAuthIndirectGotos, 1, 0, "indirect gotos pointer authentication")
 LANGOPT(PointerAuthAuthTraps, 1, 0, "pointer authentication failure traps")
 LANGOPT(PointerAuthVTPtrAddressDiscrimination, 1, 0, "incorporate address discrimination in authenticated vtable pointers")
 LANGOPT(PointerAuthVTPtrTypeDiscrimination, 1, 0, "incorporate type discrimination in authenticated vtable pointers")
+LANGOPT(PointerAuthTypeInfoVTPtrDiscrimination, 1, 0, "incorporate type and address discrimination in authenticated vtable pointers for std::type_info")
 LANGOPT(PointerAuthInitFini, 1, 0, "sign function pointers in init/fini arrays")
 BENIGN_LANGOPT(PointerAuthFunctionTypeDiscrimination, 1, 0,
                "Use type discrimination when signing function pointers")
@@ -222,7 +224,6 @@ COMPATIBLE_LANGOPT(GNUInline         , 1, 0, "GNU inline semantics")
 COMPATIBLE_LANGOPT(NoInlineDefine    , 1, 0, "__NO_INLINE__ predefined macro")
 COMPATIBLE_LANGOPT(Deprecated        , 1, 0, "__DEPRECATED predefined macro")
 COMPATIBLE_LANGOPT(FastMath          , 1, 0, "fast FP math optimizations, and __FAST_MATH__ predefined macro")
-COMPATIBLE_LANGOPT(FiniteMathOnly    , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
 COMPATIBLE_LANGOPT(UnsafeFPMath      , 1, 0, "Unsafe Floating Point Math")
 COMPATIBLE_LANGOPT(ProtectParens     , 1, 0, "optimizer honors parentheses "
                    "when floating-point expressions are evaluated")
@@ -361,7 +362,6 @@ LANGOPT(SinglePrecisionConstants , 1, 0, "treating double-precision floating poi
 LANGOPT(FastRelaxedMath , 1, 0, "OpenCL fast relaxed math")
 BENIGN_LANGOPT(CLNoSignedZero , 1, 0, "Permit Floating Point optimization without regard to signed zeros")
 COMPATIBLE_LANGOPT(CLUnsafeMath , 1, 0, "Unsafe Floating Point Math")
-COMPATIBLE_LANGOPT(CLFiniteMathOnly , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
 /// FP_CONTRACT mode (on/off/fast).
 BENIGN_ENUM_LANGOPT(DefaultFPContractMode, FPModeKind, 2, FPM_Off, "FP contraction type")
 COMPATIBLE_LANGOPT(ExpStrictFP, 1, false, "Enable experimental strict floating point")
@@ -553,6 +553,9 @@ BENIGN_LANGOPT(CheckNew, 1, 0, "Do not assume C++ operator new may not return NU
 BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1,
                "Emit diagnostics for a constexpr function body that can never "
                "be used in a constant expression.")
+
+LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C")
+
 #undef LANGOPT
 #undef COMPATIBLE_LANGOPT
 #undef BENIGN_LANGOPT
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index f46a92d5ecfd4..51084913bf102 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -107,6 +107,7 @@ OPENMP_DEVICE_MODIFIER(device_num)
 OPENMP_DEFAULTMAP_KIND(scalar)
 OPENMP_DEFAULTMAP_KIND(aggregate)
 OPENMP_DEFAULTMAP_KIND(pointer)
+OPENMP_DEFAULTMAP_KIND(all)
 
 // Modifiers for 'defaultmap' clause.
 OPENMP_DEFAULTMAP_MODIFIER(alloc)
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
index 197d63642ca6d..417b4b00648c7 100644
--- a/clang/include/clang/Basic/PointerAuthOptions.h
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -25,6 +25,11 @@ namespace clang {
 
 constexpr unsigned PointerAuthKeyNone = -1;
 
+/// Constant discriminator for std::type_info vtable pointers: 0xB1EA/45546
+/// The value is ptrauth_string_discriminator("_ZTVSt9type_info"), i.e.,
+/// the vtable type discriminator for classes derived from std::type_info.
+constexpr uint16_t StdTypeInfoVTablePointerConstantDiscrimination = 0xB1EA;
+
 class PointerAuthSchema {
 public:
   enum class Kind : unsigned {
@@ -154,6 +159,9 @@ class PointerAuthSchema {
 };
 
 struct PointerAuthOptions {
+  /// Do indirect goto label addresses need to be authenticated?
+  bool IndirectGotos = false;
+
   /// The ABI for C function pointers.
   PointerAuthSchema FunctionPointers;
 
@@ -175,6 +183,9 @@ struct PointerAuthOptions {
 
   /// The ABI for variadic C++ virtual function pointers.
   PointerAuthSchema CXXVirtualVariadicFunctionPointers;
+
+  /// The ABI for C++ member function pointers.
+  PointerAuthSchema CXXMemberFunctionPointers;
 };
 
 } // end namespace clang
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 542d94f96c7b7..4a4a8892a81a4 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -237,6 +237,8 @@ def OMPSimdDirective : StmtNode<OMPLoopDirective>;
 def OMPLoopTransformationDirective : StmtNode<OMPLoopBasedDirective, 1>;
 def OMPTileDirective : StmtNode<OMPLoopTransformationDirective>;
 def OMPUnrollDirective : StmtNode<OMPLoopTransformationDirective>;
+def OMPReverseDirective : StmtNode<OMPLoopTransformationDirective>;
+def OMPInterchangeDirective : StmtNode<OMPLoopTransformationDirective>;
 def OMPForDirective : StmtNode<OMPLoopDirective>;
 def OMPForSimdDirective : StmtNode<OMPLoopDirective>;
 def OMPSectionsDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index a4efaba796fb0..2f1bf4314e814 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -502,6 +502,7 @@ TYPE_TRAIT_1(__has_trivial_move_assign, HasTrivialMoveAssign, KEYCXX)
 TYPE_TRAIT_1(__has_trivial_move_constructor, HasTrivialMoveConstructor, KEYCXX)
 
 // GNU and MS Type Traits
+TYPE_TRAIT_2(__builtin_is_virtual_base_of, IsVirtualBaseOf, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_copy, HasNothrowCopy, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_constructor, HasNothrowConstructor, KEYCXX)
@@ -603,6 +604,8 @@ ALIAS("__is_same_as", __is_same, KEYCXX)
 KEYWORD(__private_extern__          , KEYALL)
 KEYWORD(__module_private__          , KEYALL)
 
+UNARY_EXPR_OR_TYPE_TRAIT(__builtin_ptrauth_type_discriminator, PtrAuthTypeDiscriminator, KEYALL)
+
 // Extension that will be enabled for Microsoft, Borland and PS4, but can be
 // disabled via '-fno-declspec'.
 KEYWORD(__declspec                  , 0)
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6390ba3f9fe5e..3098fa67e6a51 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -289,7 +289,7 @@ def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
                    "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> {
   let isLaneQ = 1;
 }
-let TargetGuard = "bf16" in {
+let TargetGuard = "bf16,neon" in {
   def SPLAT_BF  : WInst<"splat_lane", ".(!q)I", "bQb">;
   def SPLATQ_BF : WInst<"splat_laneq", ".(!Q)I", "bQb"> {
     let isLaneQ = 1;
@@ -323,7 +323,7 @@ def VMLSL    : SOpInst<"vmlsl", "(>Q)(>Q)..", "csiUcUsUi", OP_MLSL>;
 def VQDMULH  : SInst<"vqdmulh", "...", "siQsQi">;
 def VQRDMULH : SInst<"vqrdmulh", "...", "siQsQi">;
 
-let TargetGuard = "v8.1a" in {
+let TargetGuard = "v8.1a,neon" in {
 def VQRDMLAH : SInst<"vqrdmlah", "....", "siQsQi">;
 def VQRDMLSH : SInst<"vqrdmlsh", "....", "siQsQi">;
 }
@@ -614,7 +614,7 @@ def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">;
 def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">;
 }
 
-let TargetGuard = "v8.1a" in {
+let TargetGuard = "v8.1a,neon" in {
 def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
 def VQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "...qI", "siQsQi", OP_QRDMLSH_LN>;
 }
@@ -957,7 +957,7 @@ def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "(>Q)(>Q)QQ", "si", OP_QDMLALHi>;
 def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLALHi_N>;
 def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "(>Q)(>Q)QQ", "si", OP_QDMLSLHi>;
 def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLSLHi_N>;
-let TargetGuard = "aes" in {
+let TargetGuard = "aes,neon" in {
   def VMULL_P64    : SInst<"vmull", "(1>)11", "Pl">;
   def VMULL_HIGH_P64 : SOpInst<"vmull_high", "(1>)..", "HPl", OP_MULLHi_P64>;
 }
@@ -1091,7 +1091,7 @@ let isLaneQ = 1 in {
 def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
 def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a,neon" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> {
   let isLaneQ = 1;
 }
@@ -1122,14 +1122,14 @@ def VEXT_A64 : WInst<"vext", "...I", "dQdPlQPl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Crypto
-let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "aes" in {
+let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "aes,neon" in {
 def AESE : SInst<"vaese", "...", "QUc">;
 def AESD : SInst<"vaesd", "...", "QUc">;
 def AESMC : SInst<"vaesmc", "..", "QUc">;
 def AESIMC : SInst<"vaesimc", "..", "QUc">;
 }
 
-let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "sha2" in {
+let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "sha2,neon" in {
 def SHA1H : SInst<"vsha1h", "11", "Ui">;
 def SHA1SU1 : SInst<"vsha1su1", "...", "QUi">;
 def SHA256SU0 : SInst<"vsha256su0", "...", "QUi">;
@@ -1143,7 +1143,7 @@ def SHA256H2 : SInst<"vsha256h2", "....", "QUi">;
 def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3,neon" in {
 def BCAX : SInst<"vbcax", "....", "QUcQUsQUiQUlQcQsQiQl">;
 def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">;
 def RAX1 : SInst<"vrax1", "...", "QUl">;
@@ -1153,14 +1153,14 @@ def XAR :  SInst<"vxar", "...I", "QUl">;
 }
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3,neon" in {
 def SHA512SU0 : SInst<"vsha512su0", "...", "QUl">;
 def SHA512su1 : SInst<"vsha512su1", "....", "QUl">;
 def SHA512H : SInst<"vsha512h", "....", "QUl">;
 def SHA512H2 : SInst<"vsha512h2", "....", "QUl">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4,neon" in {
 def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">;
 def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">;
 def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">;
@@ -1170,7 +1170,7 @@ def SM3PARTW1 : SInst<"vsm3partw1", "....", "QUi">;
 def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4,neon" in {
 def SM4E : SInst<"vsm4e", "...", "QUi">;
 def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">;
 }
@@ -1227,7 +1227,7 @@ def FRINTZ_S64 : SInst<"vrnd", "..", "dQd">;
 def FRINTI_S64 : SInst<"vrndi", "..", "dQd">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a,neon" in {
 def FRINT32X_S32 : SInst<"vrnd32x", "..", "fQf">;
 def FRINT32Z_S32 : SInst<"vrnd32z", "..", "fQf">;
 def FRINT64X_S32 : SInst<"vrnd64x", "..", "fQf">;
@@ -1401,7 +1401,7 @@ def SCALAR_SQDMULH : SInst<"vqdmulh", "111", "SsSi">;
 // Scalar Integer Saturating Rounding Doubling Multiply Half High
 def SCALAR_SQRDMULH : SInst<"vqrdmulh", "111", "SsSi">;
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a,neon" in {
 ////////////////////////////////////////////////////////////////////////////////
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
 def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">;
@@ -1632,7 +1632,7 @@ def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_
   let isLaneQ = 1;
 }
 
-let TargetGuard = "v8.1a" in {
+let TargetGuard = "v8.1a,neon" in {
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
 def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>;
 def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> {
@@ -1654,7 +1654,7 @@ def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcS
 } // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)"
 
 // ARMv8.2-A FP16 vector intrinsics for A32/A64.
-let TargetGuard = "fullfp16" in {
+let TargetGuard = "fullfp16,neon" in {
 
   // ARMv8.2-A FP16 one-operand vector intrinsics.
 
@@ -1679,7 +1679,7 @@ let TargetGuard = "fullfp16" in {
   def VCVTP_U16    : SInst<"vcvtp_u16", "U.", "hQh">;
 
   // Vector rounding
-  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)", TargetGuard = "fullfp16" in {
+  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)", TargetGuard = "fullfp16,neon" in {
     def FRINTZH      : SInst<"vrnd",  "..", "hQh">;
     def FRINTNH      : SInst<"vrndn", "..", "hQh">;
     def FRINTAH      : SInst<"vrnda", "..", "hQh">;
@@ -1728,7 +1728,7 @@ let TargetGuard = "fullfp16" in {
   // Max/Min
   def VMAXH         : SInst<"vmax", "...", "hQh">;
   def VMINH         : SInst<"vmin", "...", "hQh">;
-  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)", TargetGuard = "fullfp16" in {
+  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)", TargetGuard = "fullfp16,neon" in {
     def FMAXNMH       : SInst<"vmaxnm", "...", "hQh">;
     def FMINNMH       : SInst<"vminnm", "...", "hQh">;
   }
@@ -1775,7 +1775,7 @@ def VEXTH      : WInst<"vext", "...I", "hQh">;
 def VREV64H    : WOpInst<"vrev64", "..", "hQh", OP_REV64>;
 
 // ARMv8.2-A FP16 vector intrinsics for A64 only.
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16,neon" in {
 
   // Vector rounding
   def FRINTIH      : SInst<"vrndi", "..", "hQh">;
@@ -1872,11 +1872,11 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in {
 }
 
 // v8.2-A dot product instructions.
-let TargetGuard = "dotprod" in {
+let TargetGuard = "dotprod,neon" in {
   def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">;
   def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<<q)I", "iUiQiQUi", OP_DOT_LN>;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod,neon" in {
   // Variants indexing into a 128-bit vector are A64 only.
   def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ> {
     let isLaneQ = 1;
@@ -1884,7 +1884,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml,neon" in {
   def VFMLAL_LOW  : SInst<"vfmlal_low",  ">>..", "hQh">;
   def VFMLSL_LOW  : SInst<"vfmlsl_low",  ">>..", "hQh">;
   def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">;
@@ -1909,7 +1909,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   }
 }
 
-let TargetGuard = "i8mm" in {
+let TargetGuard = "i8mm,neon" in {
   def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
 
@@ -1926,7 +1926,7 @@ let TargetGuard = "i8mm" in {
   }
 }
 
-let TargetGuard = "bf16" in {
+let TargetGuard = "bf16,neon" in {
   def VDOT_BF : SInst<"vbfdot", "..BB", "fQf">;
   def VDOT_LANE_BF : SOpInst<"vbfdot_lane", "..B(Bq)I", "fQf", OP_BFDOT_LN>;
   def VDOT_LANEQ_BF : SOpInst<"vbfdot_laneq", "..B(BQ)I", "fQf", OP_BFDOT_LNQ> {
@@ -1970,7 +1970,7 @@ multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
 }
 
 // v8.3-A Vector complex addition intrinsics
-let TargetGuard = "v8.3a,fullfp16" in {
+let TargetGuard = "v8.3a,fullfp16,neon" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
   def VCADD_ROT270_FP16  : SInst<"vcadd_rot270", "...", "h">;
   def VCADDQ_ROT90_FP16  : SInst<"vcaddq_rot90", "QQQ", "h">;
@@ -1978,7 +1978,7 @@ let TargetGuard = "v8.3a,fullfp16" in {
 
   defm VCMLA_FP16  : VCMLA_ROTS<"h", "uint32x2_t", "uint32x4_t">;
 }
-let TargetGuard = "v8.3a" in {
+let TargetGuard = "v8.3a,neon" in {
   def VCADD_ROT90   : SInst<"vcadd_rot90", "...", "f">;
   def VCADD_ROT270  : SInst<"vcadd_rot270", "...", "f">;
   def VCADDQ_ROT90  : SInst<"vcaddq_rot90", "QQQ", "f">;
@@ -1986,7 +1986,7 @@ let TargetGuard = "v8.3a" in {
 
   defm VCMLA_F32        : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a,neon" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
 
@@ -1994,7 +1994,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v
 }
 
 // V8.2-A BFloat intrinsics
-let TargetGuard = "bf16" in {
+let TargetGuard = "bf16,neon" in {
   def VCREATE_BF : NoTestOpInst<"vcreate", ".(IU>)", "b", OP_CAST> {
     let BigEndianSafe = 1;
   }
@@ -2058,14 +2058,14 @@ let TargetGuard = "bf16" in {
   def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>;
 }
 
-let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in {
+let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
   def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">;
   def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>;
   def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16",  "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>;
   def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
   def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
   def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
   def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
@@ -2077,14 +2077,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b
   def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>;
 }
 
-let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in {
+let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
   let BigEndianSafe = 1 in {
     defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES<
         "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">;
   }
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
   let BigEndianSafe = 1 in {
     defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES<
         "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">;
@@ -2092,7 +2092,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b
 }
 
 // v8.9a/v9.4a LRCPC3 intrinsics
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3,neon" in {
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index b8155c187d1bc..3b8015daee6d9 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -265,7 +265,7 @@ class Inst <string n, string p, string t, Operation o> {
   string Prototype = p;
   string Types = t;
   string ArchGuard = "";
-  string TargetGuard = "";
+  string TargetGuard = "neon";
 
   Operation Operation = o;
   bit BigEndianSafe = 0;
diff --git a/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h b/clang/include/clang/CodeGen/ObjectFilePCHContainerWriter.h
similarity index 74%
rename from clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h
rename to clang/include/clang/CodeGen/ObjectFilePCHContainerWriter.h
index 7a02d8725885a..26ee9f22258c1 100644
--- a/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h
+++ b/clang/include/clang/CodeGen/ObjectFilePCHContainerWriter.h
@@ -1,4 +1,4 @@
-//===-- CodeGen/ObjectFilePCHContainerOperations.h - ------------*- C++ -*-===//
+//===-- CodeGen/ObjectFilePCHContainerWriter.h ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -29,14 +29,6 @@ class ObjectFilePCHContainerWriter : public PCHContainerWriter {
                               std::shared_ptr<PCHBuffer> Buffer) const override;
 };
 
-/// A PCHContainerReader implementation that uses LLVM to
-/// wraps Clang modules inside a COFF, ELF, or Mach-O container.
-class ObjectFilePCHContainerReader : public PCHContainerReader {
-  ArrayRef<StringRef> getFormats() const override;
-
-  /// Returns the serialized AST inside the PCH container Buffer.
-  StringRef ExtractPCH(llvm::MemoryBufferRef Buffer) const override;
-};
 }
 
 #endif
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 1477e7f99d785..68c3ae5591e28 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -639,8 +639,9 @@ class Driver {
   /// treated before building actions or binding tools.
   ///
   /// \return Whether any compilation should be built for this
-  /// invocation.
-  bool HandleImmediateArgs(const Compilation &C);
+  /// invocation. The compilation can only be modified when
+  /// this function returns false.
+  bool HandleImmediateArgs(Compilation &C);
 
   /// ConstructAction - Construct the appropriate action to do for
   /// \p Phase on the \p Input, taking in to account arguments
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b335becf9736..c0aa9be7752f3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -940,7 +940,9 @@ def O : Joined<["-"], "O">, Group<O_Group>,
 def O_flag : Flag<["-"], "O">, Visibility<[ClangOption, CC1Option, FC1Option]>,
   Alias<O>, AliasArgs<["1"]>;
 def Ofast : Joined<["-"], "Ofast">, Group<O_Group>,
-  Visibility<[ClangOption, CC1Option, FlangOption]>;
+  Visibility<[ClangOption, CC1Option, FlangOption]>,
+  HelpText<"Deprecated; use '-O3 -ffast-math' for the same behavior,"
+  " or '-O3' to enable only conforming optimizations">;
 def P : Flag<["-"], "P">,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   Group<Preprocessor_Group>,
@@ -985,15 +987,15 @@ def Wsystem_headers_in_module_EQ : Joined<["-"], "Wsystem-headers-in-module=">,
   HelpText<"Enable -Wsystem-headers when building <module>">,
   MarshallingInfoStringVector<DiagnosticOpts<"SystemHeaderWarningsModules">>;
 def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>,
-  Visibility<[ClangOption, CC1Option]>,
+  Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
 def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>,
   Visibility<[ClangOption, CC1Option]>;
 defm invalid_constexpr : BoolWOption<"invalid-constexpr",
   LangOpts<"CheckConstexprFunctionBodies">,
   Default<!strconcat("!", cpp23.KeyPath)>,
-  NegFlag<SetFalse, [], [ClangOption, CC1Option], "Disable">,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
+  NegFlag<SetFalse, [HelpHidden], [ClangOption, CC1Option], "Disable">,
+  PosFlag<SetTrue, [HelpHidden], [ClangOption, CC1Option], "Enable">,
   BothFlags<[], [ClangOption, CC1Option], " checking of constexpr function bodies for validity within a constant expression context">>;
 def Wl_COMMA : CommaJoined<["-"], "Wl,">, Visibility<[ClangOption, FlangOption]>,
   Flags<[LinkerInput, RenderAsInput]>,
@@ -1004,7 +1006,7 @@ def Wno_nonportable_cfstrings : Joined<["-"], "Wno-nonportable-cfstrings">, Grou
   Visibility<[ClangOption, CC1Option]>;
 def Wnonportable_cfstrings : Joined<["-"], "Wnonportable-cfstrings">, Group<W_Group>,
   Visibility<[ClangOption, CC1Option]>;
-def Wno_sycl_strict : Flag<["-"], "Wno-sycl-strict">, Group<W_Group>, HelpText<"Disable warnings which enforce strict SYCL language compatibility.">;
+def Wno_sycl_strict : Flag<["-"], "Wno-sycl-strict">, Group<W_Group>, Flags<[HelpHidden]>, HelpText<"Disable warnings which enforce strict SYCL language compatibility.">;
 def Wp_COMMA : CommaJoined<["-"], "Wp,">,
   HelpText<"Pass the comma separated arguments in <arg> to the preprocessor">,
   MetaVarName<"<arg>">, Group<Preprocessor_Group>;
@@ -1168,8 +1170,7 @@ def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">,
   MarshallingInfoFlag<LangOpts<"SinglePrecisionConstants">>;
 def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>,
   Visibility<[ClangOption, CC1Option]>,
-  HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">,
-  MarshallingInfoFlag<LangOpts<"CLFiniteMathOnly">>;
+  HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
 def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>,
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"OpenCL only. Generate kernel argument metadata.">,
@@ -1216,19 +1217,19 @@ def client__name : JoinedOrSeparate<["-"], "client_name">;
 def combine : Flag<["-", "--"], "combine">, Flags<[NoXarchOption, Unsupported]>;
 def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
 def config : Joined<["--"], "config=">, Flags<[NoXarchOption]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>, MetaVarName<"<file>">,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, MetaVarName<"<file>">,
   HelpText<"Specify configuration file">;
-def : Separate<["--"], "config">, Alias<config>;
+def : Separate<["--"], "config">, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, Alias<config>;
 def no_default_config : Flag<["--"], "no-default-config">,
-  Flags<[NoXarchOption]>, Visibility<[ClangOption, CLOption, DXCOption]>,
+  Flags<[NoXarchOption]>, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"Disable loading default configuration files">;
 def config_system_dir_EQ : Joined<["--"], "config-system-dir=">,
   Flags<[NoXarchOption, HelpHidden]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"System directory for configuration files">;
 def config_user_dir_EQ : Joined<["--"], "config-user-dir=">,
   Flags<[NoXarchOption, HelpHidden]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"User directory for configuration files">;
 def coverage : Flag<["-", "--"], "coverage">, Group<Link_Group>,
   Visibility<[ClangOption, CLOption]>;
@@ -1981,6 +1982,14 @@ def fapinotes_swift_version : Joined<["-"], "fapinotes-swift-version=">,
   MetaVarName<"<version>">,
   HelpText<"Specify the Swift version to use when filtering API notes">;
 
+defm bounds_safety : BoolFOption<
+  "experimental-bounds-safety",
+  LangOpts<"BoundsSafety">, DefaultFalse,
+  PosFlag<SetTrue, [], [CC1Option], "Enable">,
+  NegFlag<SetFalse, [], [CC1Option], "Disable">,
+  BothFlags<[], [CC1Option],
+          " experimental bounds safety extension for C">>;
+
 defm addrsig : BoolFOption<"addrsig",
   CodeGenOpts<"Addrsig">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Emit">,
@@ -2679,13 +2688,12 @@ defm approx_func : BoolFOption<"approx-func", LangOpts<"ApproxFunc">, DefaultFal
            "with an approximately equivalent calculation",
            [funsafe_math_optimizations.KeyPath]>,
    NegFlag<SetFalse, [], [ClangOption, CC1Option, FC1Option, FlangOption]>>;
-defm finite_math_only : BoolFOption<"finite-math-only",
-  LangOpts<"FiniteMathOnly">, DefaultFalse,
+defm finite_math_only : BoolOptionWithoutMarshalling<"f", "finite-math-only",
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
           "Allow floating-point optimizations that "
           "assume arguments and results are not NaNs or +-inf. This defines "
           "the \\_\\_FINITE\\_MATH\\_ONLY\\_\\_ preprocessor macro.",
-          [cl_finite_math_only.KeyPath, ffast_math.KeyPath]>,
+          [ffast_math.KeyPath]>,
   NegFlag<SetFalse>>;
 defm signed_zeros : BoolFOption<"signed-zeros",
   LangOpts<"NoSignedZero">, DefaultFalse,
@@ -3302,6 +3310,10 @@ def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group
   Visibility<[ClangOption, CC1Option, CLOption]>,
   HelpText<"Implicitly search the file system for module map files.">,
   MarshallingInfoFlag<HeaderSearchOpts<"ImplicitModuleMaps">>;
+defm modulemap_allow_subdirectory_search : BoolFOption <"modulemap-allow-subdirectory-search",
+  HeaderSearchOpts<"AllowModuleMapSubdirectorySearch">, DefaultTrue,
+  PosFlag<SetTrue, [], [], "Allow to search for module maps in subdirectories of search paths">,
+  NegFlag<SetFalse>, BothFlags<[NoXarchOption], [ClangOption, CC1Option]>>;
 defm modules : BoolFOption<"modules",
   LangOpts<"Modules">, Default<fcxx_modules.KeyPath>,
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
@@ -3465,6 +3477,7 @@ def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
 def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
 def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
 def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
+def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group<f_Group>;
 def fno_temp_file : Flag<["-"], "fno-temp-file">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText<
   "Directly create compilation output files. This may lead to incorrect incremental builds if the compiler crashes">,
@@ -3605,7 +3618,7 @@ def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">, Group<f_Group>,
 def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">, Group<f_Group>,
   Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
 def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">,
-  Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option, FlangOption]>,
+  Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
 def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">,
   Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>,
@@ -3976,6 +3989,7 @@ defm strict_vtable_pointers : BoolFOption<"strict-vtable-pointers",
             " overwriting polymorphic C++ objects">,
   NegFlag<SetFalse>>;
 def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
+def fpointer_tbaa : Flag<["-"], "fpointer-tbaa">, Group<f_Group>;
 def fdriver_only : Flag<["-"], "fdriver-only">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, CLOption, DXCOption]>,
   Group<Action_Group>, HelpText<"Only run the driver.">;
@@ -4079,6 +4093,10 @@ def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Grou
   HelpText<"Minimum time granularity (in microseconds) traced by time profiler">,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
   MarshallingInfoInt<FrontendOpts<"TimeTraceGranularity">, "500u">;
+def ftime_trace_verbose : Joined<["-"], "ftime-trace-verbose">, Group<f_Group>,
+  HelpText<"Make time trace capture verbose event details (e.g. source filenames). This can increase the size of the output by 2-3 times">,
+  Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
+  MarshallingInfoFlag<FrontendOpts<"TimeTraceVerbose">>;
 def ftime_trace_EQ : Joined<["-"], "ftime-trace=">, Group<f_Group>,
   HelpText<"Similar to -ftime-trace. Specify the JSON file or a directory which will contain the JSON file">,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
@@ -4326,9 +4344,13 @@ defm ptrauth_vtable_pointer_address_discrimination :
   OptInCC1FFlag<"ptrauth-vtable-pointer-address-discrimination", "Enable address discrimination of vtable pointers">;
 defm ptrauth_vtable_pointer_type_discrimination :
   OptInCC1FFlag<"ptrauth-vtable-pointer-type-discrimination", "Enable type discrimination of vtable pointers">;
+defm ptrauth_type_info_vtable_pointer_discrimination :
+  OptInCC1FFlag<"ptrauth-type-info-vtable-pointer-discrimination", "Enable type and address discrimination of vtable pointer of std::type_info">;
 defm ptrauth_init_fini : OptInCC1FFlag<"ptrauth-init-fini", "Enable signing of function pointers in init/fini arrays">;
 defm ptrauth_function_pointer_type_discrimination : OptInCC1FFlag<"ptrauth-function-pointer-type-discrimination",
   "Enable type discrimination on C function pointers">;
+defm ptrauth_indirect_gotos : OptInCC1FFlag<"ptrauth-indirect-gotos",
+  "Enable signing and authentication of indirect goto targets">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
@@ -5162,6 +5184,11 @@ def mspe : Flag<["-"], "mspe">, Group<m_ppc_Features_Group>;
 def mno_spe : Flag<["-"], "mno-spe">, Group<m_ppc_Features_Group>;
 def mefpu2 : Flag<["-"], "mefpu2">, Group<m_ppc_Features_Group>;
 } // let Flags = [TargetSpecific]
+def msave_reg_params : Flag<["-"], "msave-reg-params">, Group<m_Group>,
+  Flags<[TargetSpecific]>,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"Save arguments passed by registers to ABI-defined stack positions">,
+  MarshallingInfoFlag<CodeGenOpts<"SaveRegParams">>;
 def mabi_EQ_quadword_atomics : Flag<["-"], "mabi=quadword-atomics">,
   Group<m_Group>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Enable quadword atomics ABI on AIX (AIX PPC64 only). Uses lqarx/stqcx. instructions.">,
@@ -6453,9 +6480,8 @@ def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group<m_x86_Feature
     HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">,  Visibility<[ClangOption, CLOption, FlangOption]>;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group<m_x86_Features_Group>,
     HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, CLOption, FlangOption]>;
-// For stability, we only add a feature to -mapxf after it passes the validation of llvm-test-suite && cpu2017 on Intel SDE.
-def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf"]>;
+def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
 def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, Group<m_Group>,
                                 HelpText<"Enable use of GPR32 in inline assembly for APX">;
 } // let Flags = [TargetSpecific]
@@ -7531,6 +7557,9 @@ def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfie
 def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">,
   HelpText<"Turn off Type Based Alias Analysis">,
   MarshallingInfoFlag<CodeGenOpts<"RelaxedAliasing">>;
+def pointer_tbaa: Flag<["-"], "pointer-tbaa">,
+  HelpText<"Turn on Type Based Alias Analysis for pointer accesses">,
+  MarshallingInfoFlag<CodeGenOpts<"PointerTBAA">>;
 def no_struct_path_tbaa : Flag<["-"], "no-struct-path-tbaa">,
   HelpText<"Turn off struct-path aware Type Based Alias Analysis">,
   MarshallingInfoNegativeFlag<CodeGenOpts<"StructPathTBAA">>;
@@ -7873,6 +7902,9 @@ def code_completion_brief_comments : Flag<["-"], "code-completion-brief-comments
 def code_completion_with_fixits : Flag<["-"], "code-completion-with-fixits">,
   HelpText<"Include code completion results which require small fix-its.">,
   MarshallingInfoFlag<FrontendOpts<"CodeCompleteOpts.IncludeFixIts">>;
+def skip_function_bodies : Flag<["-"], "skip-function-bodies">,
+  HelpText<"Skip function bodies when possible">,
+  MarshallingInfoFlag<FrontendOpts<"SkipFunctionBodies">>;
 def disable_free : Flag<["-"], "disable-free">,
   HelpText<"Disable freeing of memory on exit">,
   MarshallingInfoFlag<FrontendOpts<"DisableFree">>;
@@ -8157,10 +8189,10 @@ def mreassociate : Flag<["-"], "mreassociate">,
   MarshallingInfoFlag<LangOpts<"AllowFPReassoc">>, ImpliedByAnyOf<[funsafe_math_optimizations.KeyPath]>;
 def menable_no_nans : Flag<["-"], "menable-no-nans">,
   HelpText<"Allow optimization to assume there are no NaNs.">,
-  MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
-def menable_no_infinities : Flag<["-"], "menable-no-infs">,
+  MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
+def menable_no_infs : Flag<["-"], "menable-no-infs">,
   HelpText<"Allow optimization to assume there are no infinities.">,
-  MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
+  MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
 
 def pic_level : Separate<["-"], "pic-level">,
   HelpText<"Value for __PIC__">,
@@ -8447,6 +8479,13 @@ def source_date_epoch : Separate<["-"], "source-date-epoch">,
 
 } // let Visibility = [CC1Option]
 
+defm err_pragma_mc_func_aix : BoolFOption<"err-pragma-mc-func-aix",
+  PreprocessorOpts<"ErrorOnPragmaMcfuncOnAIX">, DefaultTrue,
+  PosFlag<SetTrue, [], [ClangOption],
+          "Treat uses of #pragma mc_func as errors">,
+  NegFlag<SetFalse,[], [ClangOption, CC1Option],
+          "Ignore uses of #pragma mc_func">>;
+
 //===----------------------------------------------------------------------===//
 // CUDA Options
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 5e5034fe01eb5..8241925c98476 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -580,6 +580,11 @@ class FrontendOptions {
   /// Minimum time granularity (in microseconds) traced by time profiler.
   unsigned TimeTraceGranularity;
 
+  /// Make time trace capture verbose event details (e.g. source filenames).
+  /// This can increase the size of the output by 2-3 times.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned TimeTraceVerbose : 1;
+
   /// Path which stores the output files for -ftime-trace
   std::string TimeTracePath;
 
@@ -601,7 +606,8 @@ class FrontendOptions {
         EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false),
         EmitSymbolGraphSymbolLabelsForTesting(false),
         EmitPrettySymbolGraphs(false), GenReducedBMI(false),
-        UseClangIRPipeline(false), TimeTraceGranularity(500) {}
+        UseClangIRPipeline(false), TimeTraceGranularity(500),
+        TimeTraceVerbose(false) {}
 
   /// getInputKindForExtension - Return the appropriate input kind for a file
   /// extension. For example, "c" would return Language::C.
diff --git a/clang/include/clang/Lex/HeaderSearchOptions.h b/clang/include/clang/Lex/HeaderSearchOptions.h
index 17635146a1614..83a95e9ad90a7 100644
--- a/clang/include/clang/Lex/HeaderSearchOptions.h
+++ b/clang/include/clang/Lex/HeaderSearchOptions.h
@@ -270,6 +270,12 @@ class HeaderSearchOptions {
   LLVM_PREFERRED_TYPE(bool)
   unsigned ModulesIncludeVFSUsage : 1;
 
+  /// Whether we should look for a module in module maps only in provided
+  /// header search paths or if we are allowed to look for module maps in
+  /// subdirectories of provided paths too.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned AllowModuleMapSubdirectorySearch : 1;
+
   HeaderSearchOptions(StringRef _Sysroot = "/")
       : Sysroot(_Sysroot), ModuleFormat("raw"), DisableModuleHash(false),
         ImplicitModuleMaps(false), ModuleMapFileHomeIsCwd(false),
@@ -285,7 +291,8 @@ class HeaderSearchOptions {
         ModulesSkipHeaderSearchPaths(false),
         ModulesSkipPragmaDiagnosticMappings(false),
         ModulesPruneNonAffectingModuleMaps(true), ModulesHashContent(false),
-        ModulesStrictContextHash(false), ModulesIncludeVFSUsage(false) {}
+        ModulesStrictContextHash(false), ModulesIncludeVFSUsage(false),
+        AllowModuleMapSubdirectorySearch(true) {}
 
   /// AddPath - Add the \p Path path to the specified \p Group list.
   void AddPath(StringRef Path, frontend::IncludeDirGroup Group,
diff --git a/clang/include/clang/Lex/PPEmbedParameters.h b/clang/include/clang/Lex/PPEmbedParameters.h
index 51bf908524e7a..c4fb8d02f6f35 100644
--- a/clang/include/clang/Lex/PPEmbedParameters.h
+++ b/clang/include/clang/Lex/PPEmbedParameters.h
@@ -75,7 +75,6 @@ struct LexEmbedParametersResult {
   std::optional<PPEmbedParameterIfEmpty> MaybeIfEmptyParam;
   std::optional<PPEmbedParameterPrefix> MaybePrefixParam;
   std::optional<PPEmbedParameterSuffix> MaybeSuffixParam;
-  SourceRange ParamRange;
   int UnrecognizedParams;
 
   size_t PrefixTokenCount() const {
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4de86884adfd4..46daa11f381af 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -1161,6 +1161,11 @@ class Preprocessor {
   /// invoked (at which point the last position is popped).
   std::vector<CachedTokensTy::size_type> BacktrackPositions;
 
+  /// Stack of cached tokens/initial number of cached tokens pairs, allowing
+  /// nested unannotated backtracks.
+  std::vector<std::pair<CachedTokensTy, CachedTokensTy::size_type>>
+      UnannotatedBacktrackTokens;
+
   /// True if \p Preprocessor::SkipExcludedConditionalBlock() is running.
   /// This is used to guard against calling this function recursively.
   ///
@@ -1723,8 +1728,16 @@ class Preprocessor {
   /// at some point after EnableBacktrackAtThisPos. If you don't, caching of
   /// tokens will continue indefinitely.
   ///
-  void EnableBacktrackAtThisPos();
+  /// \param Unannotated Whether token annotations are reverted upon calling
+  /// Backtrack().
+  void EnableBacktrackAtThisPos(bool Unannotated = false);
+
+private:
+  std::pair<CachedTokensTy::size_type, bool> LastBacktrackPos();
+
+  CachedTokensTy PopUnannotatedBacktrackTokens();
 
+public:
   /// Disable the last EnableBacktrackAtThisPos call.
   void CommitBacktrackedTokens();
 
@@ -1736,6 +1749,12 @@ class Preprocessor {
   /// caching of tokens is on.
   bool isBacktrackEnabled() const { return !BacktrackPositions.empty(); }
 
+  /// True if EnableBacktrackAtThisPos() was called and
+  /// caching of unannotated tokens is on.
+  bool isUnannotatedBacktrackEnabled() const {
+    return !UnannotatedBacktrackTokens.empty();
+  }
+
   /// Lex the next token for this preprocessor.
   void Lex(Token &Result);
 
@@ -1842,8 +1861,9 @@ class Preprocessor {
   void RevertCachedTokens(unsigned N) {
     assert(isBacktrackEnabled() &&
            "Should only be called when tokens are cached for backtracking");
-    assert(signed(CachedLexPos) - signed(N) >= signed(BacktrackPositions.back())
-         && "Should revert tokens up to the last backtrack position, not more");
+    assert(signed(CachedLexPos) - signed(N) >=
+               signed(LastBacktrackPos().first) &&
+           "Should revert tokens up to the last backtrack position, not more");
     assert(signed(CachedLexPos) - signed(N) >= 0 &&
            "Corrupted backtrack positions ?");
     CachedLexPos -= N;
diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h
index fc98c15c3fc9b..cadec5ef83dec 100644
--- a/clang/include/clang/Lex/PreprocessorOptions.h
+++ b/clang/include/clang/Lex/PreprocessorOptions.h
@@ -212,6 +212,10 @@ class PreprocessorOptions {
   /// If set, the UNIX timestamp specified by SOURCE_DATE_EPOCH.
   std::optional<uint64_t> SourceDateEpoch;
 
+  /// If set, the preprocessor reports an error when processing #pragma mc_func
+  /// on AIX.
+  bool ErrorOnPragmaMcfuncOnAIX = true;
+
 public:
   PreprocessorOptions() : PrecompiledPreambleBytes(0, false) {}
 
@@ -249,6 +253,7 @@ class PreprocessorOptions {
     PrecompiledPreambleBytes.first = 0;
     PrecompiledPreambleBytes.second = false;
     RetainExcludedConditionalBlocks = false;
+    ErrorOnPragmaMcfuncOnAIX = true;
   }
 };
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index ed28c736b9505..1518ae96e0aca 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -221,6 +221,7 @@ class Parser : public CodeCompletionHandler {
   std::unique_ptr<PragmaHandler> MaxTokensHerePragmaHandler;
   std::unique_ptr<PragmaHandler> MaxTokensTotalPragmaHandler;
   std::unique_ptr<PragmaHandler> RISCVPragmaHandler;
+  std::unique_ptr<PragmaHandler> MCFuncPragmaHandler;
 
   std::unique_ptr<CommentHandler> CommentSemaHandler;
 
@@ -1024,6 +1025,8 @@ class Parser : public CodeCompletionHandler {
   ///   ....
   ///   TPA.Revert();
   ///
+  /// If the Unannotated parameter is true, any token annotations created
+  /// during the tentative parse are reverted.
   class TentativeParsingAction {
     Parser &P;
     PreferredTypeBuilder PrevPreferredType;
@@ -1033,7 +1036,7 @@ class Parser : public CodeCompletionHandler {
     bool isActive;
 
   public:
-    explicit TentativeParsingAction(Parser &p)
+    explicit TentativeParsingAction(Parser &p, bool Unannotated = false)
         : P(p), PrevPreferredType(P.PreferredType) {
       PrevTok = P.Tok;
       PrevTentativelyDeclaredIdentifierCount =
@@ -1041,7 +1044,7 @@ class Parser : public CodeCompletionHandler {
       PrevParenCount = P.ParenCount;
       PrevBracketCount = P.BracketCount;
       PrevBraceCount = P.BraceCount;
-      P.PP.EnableBacktrackAtThisPos();
+      P.PP.EnableBacktrackAtThisPos(Unannotated);
       isActive = true;
     }
     void Commit() {
@@ -1072,13 +1075,11 @@ class Parser : public CodeCompletionHandler {
   class RevertingTentativeParsingAction
       : private Parser::TentativeParsingAction {
   public:
-    RevertingTentativeParsingAction(Parser &P)
-        : Parser::TentativeParsingAction(P) {}
+    using TentativeParsingAction::TentativeParsingAction;
+
     ~RevertingTentativeParsingAction() { Revert(); }
   };
 
-  class UnannotatedTentativeParsingAction;
-
   /// ObjCDeclContextSwitch - An object used to switch context from
   /// an objective-c decl context to its enclosing decl context and
   /// back.
@@ -1984,7 +1985,8 @@ class Parser : public CodeCompletionHandler {
       CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHasErrors,
       bool EnteringContext, bool *MayBePseudoDestructor = nullptr,
       bool IsTypename = false, const IdentifierInfo **LastII = nullptr,
-      bool OnlyNamespace = false, bool InUsingDeclaration = false);
+      bool OnlyNamespace = false, bool InUsingDeclaration = false,
+      bool Disambiguation = false);
 
   //===--------------------------------------------------------------------===//
   // C++11 5.1.2: Lambda expressions
@@ -3896,6 +3898,8 @@ class Parser : public CodeCompletionHandler {
   // __builtin_field_type, __builtin_base_type
   ExprResult ParseSYCLBuiltinType();
 
+  ExprResult ParseBuiltinPtrauthTypeDiscriminator();
+
   //===--------------------------------------------------------------------===//
   // Preprocessor code-completion pass-through
   void CodeCompleteDirective(bool InConditional) override;
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 9d8b797af6663..d6a6cee62a752 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -984,7 +984,7 @@ class Sema;
     unsigned getNumParams() const {
       if (IsSurrogate) {
         QualType STy = Surrogate->getConversionType();
-        while (STy->isPointerType() || STy->isReferenceType())
+        while (STy->isPointerOrReferenceType())
           STy = STy->getPointeeType();
         return STy->castAs<FunctionProtoType>()->getNumParams();
       }
@@ -998,7 +998,9 @@ class Sema;
   private:
     friend class OverloadCandidateSet;
     OverloadCandidate()
-        : IsSurrogate(false), IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {}
+        : IsSurrogate(false), IgnoreObjectArgument(false),
+          TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL),
+          RewriteKind(CRK_None) {}
   };
 
   /// OverloadCandidateSet - A set of overload candidates, used in C++
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 296703e931b7f..a77135ce1c2a4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2215,6 +2215,7 @@ class Sema final : public SemaBase {
     FST_FreeBSDKPrintf,
     FST_OSTrace,
     FST_OSLog,
+    FST_Syslog,
     FST_Unknown
   };
   static FormatStringType GetFormatStringType(const FormatAttr *Format);
@@ -3464,6 +3465,8 @@ class Sema final : public SemaBase {
                                     TemplateIdAnnotation *TemplateId,
                                     bool IsMemberSpecialization);
 
+  bool checkPointerAuthEnabled(SourceLocation Loc, SourceRange Range);
+
   bool checkConstantPointerAuthKey(Expr *keyExpr, unsigned &key);
 
   /// Diagnose function specifiers on a declaration of an identifier that
@@ -15090,6 +15093,44 @@ class Sema final : public SemaBase {
   void ProcessAPINotes(Decl *D);
 
   ///@}
+
+  //
+  //
+  // -------------------------------------------------------------------------
+  //
+  //
+
+  /// \name Bounds Safety
+  /// Implementations are in SemaBoundsSafety.cpp
+  ///@{
+public:
+  /// Check if applying the specified attribute variant from the "counted by"
+  /// family of attributes to FieldDecl \p FD is semantically valid. If
+  /// semantically invalid diagnostics will be emitted explaining the problems.
+  ///
+  /// \param FD The FieldDecl to apply the attribute to
+  /// \param E The count expression on the attribute
+  /// \param[out] Decls If the attribute is semantically valid \p Decls
+  ///             is populated with TypeCoupledDeclRefInfo objects, each
+  ///             describing Decls referred to in \p E.
+  /// \param CountInBytes If true the attribute is from the "sized_by" family of
+  ///                     attributes. If the false the attribute is from
+  ///                     "counted_by" family of attributes.
+  /// \param OrNull If true the attribute is from the "_or_null" suffixed family
+  ///               of attributes. If false the attribute does not have the
+  ///               suffix.
+  ///
+  /// Together \p CountInBytes and \p OrNull decide the attribute variant. E.g.
+  /// \p CountInBytes and \p OrNull both being true indicates the
+  /// `counted_by_or_null` attribute.
+  ///
+  /// \returns false iff semantically valid.
+  bool CheckCountedByAttrOnField(
+      FieldDecl *FD, Expr *E,
+      llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls, bool CountInBytes,
+      bool OrNull);
+
+  ///@}
 };
 
 DeductionFailureInfo
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 3edf1cc7c12f2..aa61dae9415e2 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -398,8 +398,7 @@ class SemaOpenMP : public SemaBase {
   StmtResult ActOnOpenMPExecutableDirective(
       OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
       OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
-      Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc,
-      OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown);
+      Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc);
   /// Called on well-formed '\#pragma omp parallel' after parsing
   /// of the  associated statement.
   StmtResult ActOnOpenMPParallelDirective(ArrayRef<OMPClause *> Clauses,
@@ -423,6 +422,15 @@ class SemaOpenMP : public SemaBase {
   StmtResult ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
                                         Stmt *AStmt, SourceLocation StartLoc,
                                         SourceLocation EndLoc);
+  /// Called on well-formed '#pragma omp reverse'.
+  StmtResult ActOnOpenMPReverseDirective(Stmt *AStmt, SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '#pragma omp interchange' after parsing of its
+  /// clauses and the associated statement.
+  StmtResult ActOnOpenMPInterchangeDirective(ArrayRef<OMPClause *> Clauses,
+                                             Stmt *AStmt,
+                                             SourceLocation StartLoc,
+                                             SourceLocation EndLoc);
   /// Called on well-formed '\#pragma omp for' after parsing
   /// of the associated statement.
   StmtResult
@@ -1421,26 +1429,6 @@ class SemaOpenMP : public SemaBase {
 
   /// All `omp assumes` we encountered so far.
   SmallVector<OMPAssumeAttr *, 4> OMPAssumeGlobal;
-
-  /// OMPD_loop is mapped to OMPD_for, OMPD_distribute or OMPD_simd depending
-  /// on the parameter of the bind clause. In the methods for the
-  /// mapped directives, check the parameters of the lastprivate clause.
-  bool checkLastPrivateForMappedDirectives(ArrayRef<OMPClause *> Clauses);
-  /// Depending on the bind clause of OMPD_loop map the directive to new
-  /// directives.
-  ///    1) loop bind(parallel) --> OMPD_for
-  ///    2) loop bind(teams) --> OMPD_distribute
-  ///    3) loop bind(thread) --> OMPD_simd
-  /// This is being handled in Sema instead of Codegen because of the need for
-  /// rigorous semantic checking in the new mapped directives.
-  bool mapLoopConstruct(llvm::SmallVector<OMPClause *> &ClausesWithoutBind,
-                        ArrayRef<OMPClause *> Clauses,
-                        OpenMPBindClauseKind &BindKind,
-                        OpenMPDirectiveKind &Kind,
-                        OpenMPDirectiveKind &PrevMappedDirective,
-                        SourceLocation StartLoc, SourceLocation EndLoc,
-                        const DeclarationNameInfo &DirName,
-                        OpenMPDirectiveKind CancelRegion);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 199dcfef28287..922fdd409d360 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1900,6 +1900,8 @@ enum StmtCode {
   STMT_OMP_SIMD_DIRECTIVE,
   STMT_OMP_TILE_DIRECTIVE,
   STMT_OMP_UNROLL_DIRECTIVE,
+  STMT_OMP_REVERSE_DIRECTIVE,
+  STMT_OMP_INTERCHANGE_DIRECTIVE,
   STMT_OMP_FOR_DIRECTIVE,
   STMT_OMP_FOR_SIMD_DIRECTIVE,
   STMT_OMP_SECTIONS_DIRECTIVE,
diff --git a/clang/include/clang/Serialization/ObjectFilePCHContainerReader.h b/clang/include/clang/Serialization/ObjectFilePCHContainerReader.h
new file mode 100644
index 0000000000000..6281a3f428869
--- /dev/null
+++ b/clang/include/clang/Serialization/ObjectFilePCHContainerReader.h
@@ -0,0 +1,25 @@
+//===-- Serialization/ObjectFilePCHContainerReader.h ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SERIALIZATION_OBJECTFILEPCHCONTAINERREADER_H
+#define LLVM_CLANG_SERIALIZATION_OBJECTFILEPCHCONTAINERREADER_H
+
+#include "clang/Frontend/PCHContainerOperations.h"
+
+namespace clang {
+/// A PCHContainerReader implementation that uses LLVM to
+/// wraps Clang modules inside a COFF, ELF, or Mach-O container.
+class ObjectFilePCHContainerReader : public PCHContainerReader {
+  ArrayRef<StringRef> getFormats() const override;
+
+  /// Returns the serialized AST inside the PCH container Buffer.
+  StringRef ExtractPCH(llvm::MemoryBufferRef Buffer) const override;
+};
+} // namespace clang
+
+#endif
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index ec5dbd28a5272..38b55a0eb0a7b 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1045,18 +1045,6 @@ def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
 
 def MmapWriteExecChecker : Checker<"MmapWriteExec">,
   HelpText<"Warn on mmap() calls that are both writable and executable">,
-  CheckerOptions<[
-    CmdLineOption<Integer,
-                  "MmapProtExec",
-                  "Specifies the value of PROT_EXEC",
-                  "0x04",
-                  Released>,
-    CmdLineOption<Integer,
-                  "MmapProtRead",
-                  "Specifies the value of PROT_READ",
-                  "0x01",
-                  Released>
-  ]>,
   Documentation<HasDocumentation>;
 
 def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
index 3a4b087257149..def2970d448d4 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
@@ -326,6 +326,10 @@ class LocAsInteger : public NonLoc {
   static bool classof(SVal V) { return V.getKind() == LocAsIntegerKind; }
 };
 
+/// The simplest example of a concrete compound value is nonloc::CompoundVal,
+/// which represents a concrete r-value of an initializer-list or a string.
+/// Internally, it contains an llvm::ImmutableList of SVal's stored inside the
+/// literal.
 class CompoundVal : public NonLoc {
   friend class ento::SValBuilder;
 
@@ -346,6 +350,36 @@ class CompoundVal : public NonLoc {
   static bool classof(SVal V) { return V.getKind() == CompoundValKind; }
 };
 
+/// While nonloc::CompoundVal covers a few simple use cases,
+/// nonloc::LazyCompoundVal is a more performant and flexible way to represent
+/// an rvalue of record type, so it shows up much more frequently during
+/// analysis. This value is an r-value that represents a snapshot of any
+/// structure "as a whole" at a given moment during the analysis. Such value is
+/// already quite far from being referred to as "concrete", as many fields
+/// inside it would be unknown or symbolic. nonloc::LazyCompoundVal operates by
+/// storing two things:
+///   * a reference to the TypedValueRegion being snapshotted (yes, it is always
+///     typed), and also
+///   * a reference to the whole Store object, obtained from the ProgramState in
+///     which the nonloc::LazyCompoundVal was created.
+///
+/// Note that the old ProgramState and its Store is kept alive during the
+/// analysis because these are immutable functional data structures and each new
+/// Store value is represented as "earlier Store" + "additional binding".
+///
+/// Essentially, nonloc::LazyCompoundVal is a performance optimization for the
+/// analyzer. Because Store is immutable, creating a nonloc::LazyCompoundVal is
+/// a very cheap operation. Note that the Store contains all region bindings in
+/// the program state, not only related to the region. Later, if necessary, such
+/// value can be unpacked -- eg. when it is assigned to another variable.
+///
+/// If you ever need to inspect the contents of the LazyCompoundVal, you can use
+/// StoreManager::iterBindings(). It'll iterate through all values in the Store,
+/// but you're only interested in the ones that belong to
+/// LazyCompoundVal::getRegion(); other bindings are immaterial.
+///
+/// NOTE: LazyCompoundVal::getRegion() itself is also immaterial (see the actual
+/// method docs for details).
 class LazyCompoundVal : public NonLoc {
   friend class ento::SValBuilder;
 
@@ -363,6 +397,18 @@ class LazyCompoundVal : public NonLoc {
   /// It might return null.
   const void *getStore() const;
 
+  /// This function itself is immaterial. It is only an implementation detail.
+  /// LazyCompoundVal represents only the rvalue, the data (known or unknown)
+  /// that *was* stored in that region *at some point in the past*. The region
+  /// should not be used for any purpose other than figuring out what part of
+  /// the frozen Store you're interested in. The value does not represent the
+  /// *current* value of that region. Sometimes it may, but this should not be
+  /// relied upon. Instead, if you want to figure out what region it represents,
+  /// you typically need to see where you got it from in the first place. The
+  /// region is absolutely not analogous to the C++ "this" pointer. It is also
+  /// not a valid way to "materialize" the prvalue into a glvalue in C++,
+  /// because the region represents the *old* storage (sometimes very old), not
+  /// the *future* storage.
   LLVM_ATTRIBUTE_RETURNS_NONNULL
   const TypedValueRegion *getRegion() const;
 
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index e3aa76df8316c..9d254dcc1c9ef 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0;
 /// API notes file minor version number.
 ///
 /// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 26; // SwiftCopyable
+const uint16_t VERSION_MINOR = 28; // nested tags
 
 const uint8_t kSwiftCopyable = 1;
 const uint8_t kSwiftNonCopyable = 2;
@@ -63,6 +63,10 @@ enum BlockID {
   /// about the method.
   OBJC_METHOD_BLOCK_ID,
 
+  /// The C++ method data block, which maps C++ (context id, method name) pairs
+  /// to information about the method.
+  CXX_METHOD_BLOCK_ID,
+
   /// The Objective-C selector data block, which maps Objective-C
   /// selector names (# of pieces, identifier IDs) to the selector ID
   /// used in other tables.
@@ -181,6 +185,20 @@ using ObjCMethodDataLayout =
                          >;
 } // namespace objc_method_block
 
+namespace cxx_method_block {
+enum {
+  CXX_METHOD_DATA = 1,
+};
+
+using CXXMethodDataLayout =
+    llvm::BCRecordLayout<CXX_METHOD_DATA, // record ID
+                         llvm::BCVBR<16>, // table offset within the blob (see
+                                          // below)
+                         llvm::BCBlob     // map from C++ (context id, name)
+                                          // tuples to C++ method information
+                         >;
+} // namespace cxx_method_block
+
 namespace objc_selector_block {
 enum {
   OBJC_SELECTOR_DATA = 1,
@@ -269,11 +287,16 @@ struct ContextTableKey {
       : parentContextID(parentContextID), contextKind(contextKind),
         contextID(contextID) {}
 
-  ContextTableKey(std::optional<Context> context, IdentifierID nameID)
-      : parentContextID(context ? context->id.Value : (uint32_t)-1),
-        contextKind(context ? static_cast<uint8_t>(context->kind)
-                            : static_cast<uint8_t>(-1)),
-        contextID(nameID) {}
+  ContextTableKey(std::optional<ContextID> ParentContextID, ContextKind Kind,
+                  uint32_t ContextID)
+      : parentContextID(ParentContextID ? ParentContextID->Value : -1),
+        contextKind(static_cast<uint8_t>(Kind)), contextID(ContextID) {}
+
+  ContextTableKey(std::optional<Context> ParentContext, ContextKind Kind,
+                  uint32_t ContextID)
+      : ContextTableKey(ParentContext ? std::make_optional(ParentContext->id)
+                                      : std::nullopt,
+                        Kind, ContextID) {}
 
   llvm::hash_code hashValue() const {
     return llvm::hash_value(
@@ -286,6 +309,32 @@ inline bool operator==(const ContextTableKey &lhs, const ContextTableKey &rhs) {
          lhs.contextKind == rhs.contextKind && lhs.contextID == rhs.contextID;
 }
 
+/// A stored Objective-C or C++ declaration, represented by the ID of its parent
+/// context, and the name of the declaration.
+struct SingleDeclTableKey {
+  uint32_t parentContextID;
+  uint32_t nameID;
+
+  SingleDeclTableKey() : parentContextID(-1), nameID(-1) {}
+
+  SingleDeclTableKey(uint32_t ParentContextID, uint32_t NameID)
+      : parentContextID(ParentContextID), nameID(NameID) {}
+
+  SingleDeclTableKey(std::optional<Context> ParentCtx, IdentifierID NameID)
+      : parentContextID(ParentCtx ? ParentCtx->id.Value
+                                  : static_cast<uint32_t>(-1)),
+        nameID(NameID) {}
+
+  llvm::hash_code hashValue() const {
+    return llvm::hash_value(std::make_pair(parentContextID, nameID));
+  }
+};
+
+inline bool operator==(const SingleDeclTableKey &lhs,
+                       const SingleDeclTableKey &rhs) {
+  return lhs.parentContextID == rhs.parentContextID && lhs.nameID == rhs.nameID;
+}
+
 } // namespace api_notes
 } // namespace clang
 
@@ -341,6 +390,29 @@ template <> struct DenseMapInfo<clang::api_notes::ContextTableKey> {
     return lhs == rhs;
   }
 };
+
+template <> struct DenseMapInfo<clang::api_notes::SingleDeclTableKey> {
+  static inline clang::api_notes::SingleDeclTableKey getEmptyKey() {
+    return clang::api_notes::SingleDeclTableKey();
+  }
+
+  static inline clang::api_notes::SingleDeclTableKey getTombstoneKey() {
+    return clang::api_notes::SingleDeclTableKey{
+        DenseMapInfo<uint32_t>::getTombstoneKey(),
+        DenseMapInfo<uint32_t>::getTombstoneKey()};
+  }
+
+  static unsigned
+  getHashValue(const clang::api_notes::SingleDeclTableKey &value) {
+    return value.hashValue();
+  }
+
+  static bool isEqual(const clang::api_notes::SingleDeclTableKey &lhs,
+                      const clang::api_notes::SingleDeclTableKey &rhs) {
+    return lhs == rhs;
+  }
+};
+
 } // namespace llvm
 
 #endif
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index 8454e092b55ac..871f782511d5f 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -429,15 +429,13 @@ class ObjCSelectorTableInfo {
 
 /// Used to deserialize the on-disk global variable table.
 class GlobalVariableTableInfo
-    : public VersionedTableInfo<GlobalVariableTableInfo, ContextTableKey,
+    : public VersionedTableInfo<GlobalVariableTableInfo, SingleDeclTableKey,
                                 GlobalVariableInfo> {
 public:
   static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) {
     auto CtxID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    auto ContextKind =
-        endian::readNext<uint8_t, llvm::endianness::little>(Data);
     auto NameID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    return {CtxID, ContextKind, NameID};
+    return {CtxID, NameID};
   }
 
   hash_value_type ComputeHash(internal_key_type Key) {
@@ -454,15 +452,13 @@ class GlobalVariableTableInfo
 
 /// Used to deserialize the on-disk global function table.
 class GlobalFunctionTableInfo
-    : public VersionedTableInfo<GlobalFunctionTableInfo, ContextTableKey,
+    : public VersionedTableInfo<GlobalFunctionTableInfo, SingleDeclTableKey,
                                 GlobalFunctionInfo> {
 public:
   static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) {
     auto CtxID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    auto ContextKind =
-        endian::readNext<uint8_t, llvm::endianness::little>(Data);
     auto NameID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    return {CtxID, ContextKind, NameID};
+    return {CtxID, NameID};
   }
 
   hash_value_type ComputeHash(internal_key_type Key) {
@@ -477,6 +473,29 @@ class GlobalFunctionTableInfo
   }
 };
 
+/// Used to deserialize the on-disk C++ method table.
+class CXXMethodTableInfo
+    : public VersionedTableInfo<CXXMethodTableInfo, SingleDeclTableKey,
+                                CXXMethodInfo> {
+public:
+  static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) {
+    auto CtxID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
+    auto NameID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
+    return {CtxID, NameID};
+  }
+
+  hash_value_type ComputeHash(internal_key_type Key) {
+    return static_cast<size_t>(Key.hashValue());
+  }
+
+  static CXXMethodInfo readUnversioned(internal_key_type Key,
+                                       const uint8_t *&Data) {
+    CXXMethodInfo Info;
+    ReadFunctionInfo(Data, Info);
+    return Info;
+  }
+};
+
 /// Used to deserialize the on-disk enumerator table.
 class EnumConstantTableInfo
     : public VersionedTableInfo<EnumConstantTableInfo, uint32_t,
@@ -501,15 +520,13 @@ class EnumConstantTableInfo
 
 /// Used to deserialize the on-disk tag table.
 class TagTableInfo
-    : public VersionedTableInfo<TagTableInfo, ContextTableKey, TagInfo> {
+    : public VersionedTableInfo<TagTableInfo, SingleDeclTableKey, TagInfo> {
 public:
   static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) {
     auto CtxID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    auto ContextKind =
-        endian::readNext<uint8_t, llvm::endianness::little>(Data);
     auto NameID =
         endian::readNext<IdentifierID, llvm::endianness::little>(Data);
-    return {CtxID, ContextKind, NameID};
+    return {CtxID, NameID};
   }
 
   hash_value_type ComputeHash(internal_key_type Key) {
@@ -563,16 +580,14 @@ class TagTableInfo
 
 /// Used to deserialize the on-disk typedef table.
 class TypedefTableInfo
-    : public VersionedTableInfo<TypedefTableInfo, ContextTableKey,
+    : public VersionedTableInfo<TypedefTableInfo, SingleDeclTableKey,
                                 TypedefInfo> {
 public:
   static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) {
     auto CtxID = endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    auto ContextKind =
-        endian::readNext<uint8_t, llvm::endianness::little>(Data);
     auto nameID =
         endian::readNext<IdentifierID, llvm::endianness::little>(Data);
-    return {CtxID, ContextKind, nameID};
+    return {CtxID, nameID};
   }
 
   hash_value_type ComputeHash(internal_key_type Key) {
@@ -638,6 +653,12 @@ class APINotesReader::Implementation {
   /// The Objective-C method table.
   std::unique_ptr<SerializedObjCMethodTable> ObjCMethodTable;
 
+  using SerializedCXXMethodTable =
+      llvm::OnDiskIterableChainedHashTable<CXXMethodTableInfo>;
+
+  /// The C++ method table.
+  std::unique_ptr<SerializedCXXMethodTable> CXXMethodTable;
+
   using SerializedObjCSelectorTable =
       llvm::OnDiskIterableChainedHashTable<ObjCSelectorTableInfo>;
 
@@ -691,6 +712,8 @@ class APINotesReader::Implementation {
                              llvm::SmallVectorImpl<uint64_t> &Scratch);
   bool readObjCMethodBlock(llvm::BitstreamCursor &Cursor,
                            llvm::SmallVectorImpl<uint64_t> &Scratch);
+  bool readCXXMethodBlock(llvm::BitstreamCursor &Cursor,
+                          llvm::SmallVectorImpl<uint64_t> &Scratch);
   bool readObjCSelectorBlock(llvm::BitstreamCursor &Cursor,
                              llvm::SmallVectorImpl<uint64_t> &Scratch);
   bool readGlobalVariableBlock(llvm::BitstreamCursor &Cursor,
@@ -1148,6 +1171,81 @@ bool APINotesReader::Implementation::readObjCMethodBlock(
   return false;
 }
 
+bool APINotesReader::Implementation::readCXXMethodBlock(
+    llvm::BitstreamCursor &Cursor, llvm::SmallVectorImpl<uint64_t> &Scratch) {
+  if (Cursor.EnterSubBlock(CXX_METHOD_BLOCK_ID))
+    return true;
+
+  llvm::Expected<llvm::BitstreamEntry> MaybeNext = Cursor.advance();
+  if (!MaybeNext) {
+    // FIXME this drops the error on the floor.
+    consumeError(MaybeNext.takeError());
+    return false;
+  }
+  llvm::BitstreamEntry Next = MaybeNext.get();
+  while (Next.Kind != llvm::BitstreamEntry::EndBlock) {
+    if (Next.Kind == llvm::BitstreamEntry::Error)
+      return true;
+
+    if (Next.Kind == llvm::BitstreamEntry::SubBlock) {
+      // Unknown sub-block, possibly for use by a future version of the
+      // API notes format.
+      if (Cursor.SkipBlock())
+        return true;
+
+      MaybeNext = Cursor.advance();
+      if (!MaybeNext) {
+        // FIXME this drops the error on the floor.
+        consumeError(MaybeNext.takeError());
+        return false;
+      }
+      Next = MaybeNext.get();
+      continue;
+    }
+
+    Scratch.clear();
+    llvm::StringRef BlobData;
+    llvm::Expected<unsigned> MaybeKind =
+        Cursor.readRecord(Next.ID, Scratch, &BlobData);
+    if (!MaybeKind) {
+      // FIXME this drops the error on the floor.
+      consumeError(MaybeKind.takeError());
+      return false;
+    }
+    unsigned Kind = MaybeKind.get();
+    switch (Kind) {
+    case cxx_method_block::CXX_METHOD_DATA: {
+      // Already saw C++ method table.
+      if (CXXMethodTable)
+        return true;
+
+      uint32_t tableOffset;
+      cxx_method_block::CXXMethodDataLayout::readRecord(Scratch, tableOffset);
+      auto base = reinterpret_cast<const uint8_t *>(BlobData.data());
+
+      CXXMethodTable.reset(SerializedCXXMethodTable::Create(
+          base + tableOffset, base + sizeof(uint32_t), base));
+      break;
+    }
+
+    default:
+      // Unknown record, possibly for use by a future version of the
+      // module format.
+      break;
+    }
+
+    MaybeNext = Cursor.advance();
+    if (!MaybeNext) {
+      // FIXME this drops the error on the floor.
+      consumeError(MaybeNext.takeError());
+      return false;
+    }
+    Next = MaybeNext.get();
+  }
+
+  return false;
+}
+
 bool APINotesReader::Implementation::readObjCSelectorBlock(
     llvm::BitstreamCursor &Cursor, llvm::SmallVectorImpl<uint64_t> &Scratch) {
   if (Cursor.EnterSubBlock(OBJC_SELECTOR_BLOCK_ID))
@@ -1700,6 +1798,14 @@ APINotesReader::APINotesReader(llvm::MemoryBuffer *InputBuffer,
       }
       break;
 
+    case CXX_METHOD_BLOCK_ID:
+      if (!HasValidControlBlock ||
+          Implementation->readCXXMethodBlock(Cursor, Scratch)) {
+        Failed = true;
+        return;
+      }
+      break;
+
     case OBJC_SELECTOR_BLOCK_ID:
       if (!HasValidControlBlock ||
           Implementation->readObjCSelectorBlock(Cursor, Scratch)) {
@@ -1919,6 +2025,23 @@ auto APINotesReader::lookupObjCMethod(ContextID CtxID, ObjCSelectorRef Selector,
   return {Implementation->SwiftVersion, *Known};
 }
 
+auto APINotesReader::lookupCXXMethod(ContextID CtxID, llvm::StringRef Name)
+    -> VersionedInfo<CXXMethodInfo> {
+  if (!Implementation->CXXMethodTable)
+    return std::nullopt;
+
+  std::optional<IdentifierID> NameID = Implementation->getIdentifier(Name);
+  if (!NameID)
+    return std::nullopt;
+
+  auto Known = Implementation->CXXMethodTable->find(
+      SingleDeclTableKey(CtxID.Value, *NameID));
+  if (Known == Implementation->CXXMethodTable->end())
+    return std::nullopt;
+
+  return {Implementation->SwiftVersion, *Known};
+}
+
 auto APINotesReader::lookupGlobalVariable(llvm::StringRef Name,
                                           std::optional<Context> Ctx)
     -> VersionedInfo<GlobalVariableInfo> {
@@ -1929,7 +2052,7 @@ auto APINotesReader::lookupGlobalVariable(llvm::StringRef Name,
   if (!NameID)
     return std::nullopt;
 
-  ContextTableKey Key(Ctx, *NameID);
+  SingleDeclTableKey Key(Ctx, *NameID);
 
   auto Known = Implementation->GlobalVariableTable->find(Key);
   if (Known == Implementation->GlobalVariableTable->end())
@@ -1948,7 +2071,7 @@ auto APINotesReader::lookupGlobalFunction(llvm::StringRef Name,
   if (!NameID)
     return std::nullopt;
 
-  ContextTableKey Key(Ctx, *NameID);
+  SingleDeclTableKey Key(Ctx, *NameID);
 
   auto Known = Implementation->GlobalFunctionTable->find(Key);
   if (Known == Implementation->GlobalFunctionTable->end())
@@ -1973,6 +2096,24 @@ auto APINotesReader::lookupEnumConstant(llvm::StringRef Name)
   return {Implementation->SwiftVersion, *Known};
 }
 
+auto APINotesReader::lookupTagID(llvm::StringRef Name,
+                                 std::optional<Context> ParentCtx)
+    -> std::optional<ContextID> {
+  if (!Implementation->ContextIDTable)
+    return std::nullopt;
+
+  std::optional<IdentifierID> TagID = Implementation->getIdentifier(Name);
+  if (!TagID)
+    return std::nullopt;
+
+  auto KnownID = Implementation->ContextIDTable->find(
+      ContextTableKey(ParentCtx, ContextKind::Tag, *TagID));
+  if (KnownID == Implementation->ContextIDTable->end())
+    return std::nullopt;
+
+  return ContextID(*KnownID);
+}
+
 auto APINotesReader::lookupTag(llvm::StringRef Name, std::optional<Context> Ctx)
     -> VersionedInfo<TagInfo> {
   if (!Implementation->TagTable)
@@ -1982,7 +2123,7 @@ auto APINotesReader::lookupTag(llvm::StringRef Name, std::optional<Context> Ctx)
   if (!NameID)
     return std::nullopt;
 
-  ContextTableKey Key(Ctx, *NameID);
+  SingleDeclTableKey Key(Ctx, *NameID);
 
   auto Known = Implementation->TagTable->find(Key);
   if (Known == Implementation->TagTable->end())
@@ -2001,7 +2142,7 @@ auto APINotesReader::lookupTypedef(llvm::StringRef Name,
   if (!NameID)
     return std::nullopt;
 
-  ContextTableKey Key(Ctx, *NameID);
+  SingleDeclTableKey Key(Ctx, *NameID);
 
   auto Known = Implementation->TypedefTable->find(Key);
   if (Known == Implementation->TypedefTable->end())
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index 4053d515ef426..2a71922746ac5 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -70,22 +70,29 @@ class APINotesWriter::Implementation {
                  llvm::SmallVector<std::pair<VersionTuple, ObjCMethodInfo>, 1>>
       ObjCMethods;
 
+  /// Information about C++ methods.
+  ///
+  /// Indexed by the context ID and name ID.
+  llvm::DenseMap<SingleDeclTableKey,
+                 llvm::SmallVector<std::pair<VersionTuple, CXXMethodInfo>, 1>>
+      CXXMethods;
+
   /// Mapping from selectors to selector ID.
   llvm::DenseMap<StoredObjCSelector, SelectorID> SelectorIDs;
 
   /// Information about global variables.
   ///
-  /// Indexed by the context ID, contextKind, identifier ID.
+  /// Indexed by the context ID, identifier ID.
   llvm::DenseMap<
-      ContextTableKey,
+      SingleDeclTableKey,
       llvm::SmallVector<std::pair<VersionTuple, GlobalVariableInfo>, 1>>
       GlobalVariables;
 
   /// Information about global functions.
   ///
-  /// Indexed by the context ID, contextKind, identifier ID.
+  /// Indexed by the context ID, identifier ID.
   llvm::DenseMap<
-      ContextTableKey,
+      SingleDeclTableKey,
       llvm::SmallVector<std::pair<VersionTuple, GlobalFunctionInfo>, 1>>
       GlobalFunctions;
 
@@ -98,15 +105,15 @@ class APINotesWriter::Implementation {
 
   /// Information about tags.
   ///
-  /// Indexed by the context ID, contextKind, identifier ID.
-  llvm::DenseMap<ContextTableKey,
+  /// Indexed by the context ID, identifier ID.
+  llvm::DenseMap<SingleDeclTableKey,
                  llvm::SmallVector<std::pair<VersionTuple, TagInfo>, 1>>
       Tags;
 
   /// Information about typedefs.
   ///
-  /// Indexed by the context ID, contextKind, identifier ID.
-  llvm::DenseMap<ContextTableKey,
+  /// Indexed by the context ID, identifier ID.
+  llvm::DenseMap<SingleDeclTableKey,
                  llvm::SmallVector<std::pair<VersionTuple, TypedefInfo>, 1>>
       Typedefs;
 
@@ -150,6 +157,7 @@ class APINotesWriter::Implementation {
   void writeContextBlock(llvm::BitstreamWriter &Stream);
   void writeObjCPropertyBlock(llvm::BitstreamWriter &Stream);
   void writeObjCMethodBlock(llvm::BitstreamWriter &Stream);
+  void writeCXXMethodBlock(llvm::BitstreamWriter &Stream);
   void writeObjCSelectorBlock(llvm::BitstreamWriter &Stream);
   void writeGlobalVariableBlock(llvm::BitstreamWriter &Stream);
   void writeGlobalFunctionBlock(llvm::BitstreamWriter &Stream);
@@ -181,6 +189,7 @@ void APINotesWriter::Implementation::writeToStream(llvm::raw_ostream &OS) {
     writeContextBlock(Stream);
     writeObjCPropertyBlock(Stream);
     writeObjCMethodBlock(Stream);
+    writeCXXMethodBlock(Stream);
     writeObjCSelectorBlock(Stream);
     writeGlobalVariableBlock(Stream);
     writeGlobalFunctionBlock(Stream);
@@ -765,6 +774,34 @@ class ObjCMethodTableInfo
     emitFunctionInfo(OS, OMI);
   }
 };
+
+/// Used to serialize the on-disk C++ method table.
+class CXXMethodTableInfo
+    : public VersionedTableInfo<CXXMethodTableInfo, SingleDeclTableKey,
+                                CXXMethodInfo> {
+public:
+  unsigned getKeyLength(key_type_ref) {
+    return sizeof(uint32_t) + sizeof(uint32_t);
+  }
+
+  void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) {
+    llvm::support::endian::Writer writer(OS, llvm::endianness::little);
+    writer.write<uint32_t>(Key.parentContextID);
+    writer.write<uint32_t>(Key.nameID);
+  }
+
+  hash_value_type ComputeHash(key_type_ref key) {
+    return static_cast<size_t>(key.hashValue());
+  }
+
+  unsigned getUnversionedInfoSize(const CXXMethodInfo &OMI) {
+    return getFunctionInfoSize(OMI);
+  }
+
+  void emitUnversionedInfo(raw_ostream &OS, const CXXMethodInfo &OMI) {
+    emitFunctionInfo(OS, OMI);
+  }
+};
 } // namespace
 
 void APINotesWriter::Implementation::writeObjCMethodBlock(
@@ -794,6 +831,33 @@ void APINotesWriter::Implementation::writeObjCMethodBlock(
   }
 }
 
+void APINotesWriter::Implementation::writeCXXMethodBlock(
+    llvm::BitstreamWriter &Stream) {
+  llvm::BCBlockRAII Scope(Stream, CXX_METHOD_BLOCK_ID, 3);
+
+  if (CXXMethods.empty())
+    return;
+
+  {
+    llvm::SmallString<4096> HashTableBlob;
+    uint32_t Offset;
+    {
+      llvm::OnDiskChainedHashTableGenerator<CXXMethodTableInfo> Generator;
+      for (auto &MD : CXXMethods)
+        Generator.insert(MD.first, MD.second);
+
+      llvm::raw_svector_ostream BlobStream(HashTableBlob);
+      // Make sure that no bucket is at offset 0
+      llvm::support::endian::write<uint32_t>(BlobStream, 0,
+                                             llvm::endianness::little);
+      Offset = Generator.Emit(BlobStream);
+    }
+
+    cxx_method_block::CXXMethodDataLayout CXXMethodData(Stream);
+    CXXMethodData.emit(Scratch, Offset, HashTableBlob);
+  }
+}
+
 namespace {
 /// Used to serialize the on-disk Objective-C selector table.
 class ObjCSelectorTableInfo {
@@ -865,18 +929,17 @@ void APINotesWriter::Implementation::writeObjCSelectorBlock(
 namespace {
 /// Used to serialize the on-disk global variable table.
 class GlobalVariableTableInfo
-    : public VersionedTableInfo<GlobalVariableTableInfo, ContextTableKey,
+    : public VersionedTableInfo<GlobalVariableTableInfo, SingleDeclTableKey,
                                 GlobalVariableInfo> {
 public:
   unsigned getKeyLength(key_type_ref) {
-    return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint32_t);
+    return sizeof(uint32_t) + sizeof(uint32_t);
   }
 
   void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) {
     llvm::support::endian::Writer writer(OS, llvm::endianness::little);
     writer.write<uint32_t>(Key.parentContextID);
-    writer.write<uint8_t>(Key.contextKind);
-    writer.write<uint32_t>(Key.contextID);
+    writer.write<uint32_t>(Key.nameID);
   }
 
   hash_value_type ComputeHash(key_type_ref Key) {
@@ -979,18 +1042,17 @@ void emitFunctionInfo(raw_ostream &OS, const FunctionInfo &FI) {
 
 /// Used to serialize the on-disk global function table.
 class GlobalFunctionTableInfo
-    : public VersionedTableInfo<GlobalFunctionTableInfo, ContextTableKey,
+    : public VersionedTableInfo<GlobalFunctionTableInfo, SingleDeclTableKey,
                                 GlobalFunctionInfo> {
 public:
   unsigned getKeyLength(key_type_ref) {
-    return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint32_t);
+    return sizeof(uint32_t) + sizeof(uint32_t);
   }
 
   void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) {
     llvm::support::endian::Writer writer(OS, llvm::endianness::little);
     writer.write<uint32_t>(Key.parentContextID);
-    writer.write<uint8_t>(Key.contextKind);
-    writer.write<uint32_t>(Key.contextID);
+    writer.write<uint32_t>(Key.nameID);
   }
 
   hash_value_type ComputeHash(key_type_ref Key) {
@@ -1091,20 +1153,20 @@ void APINotesWriter::Implementation::writeEnumConstantBlock(
 namespace {
 template <typename Derived, typename UnversionedDataType>
 class CommonTypeTableInfo
-    : public VersionedTableInfo<Derived, ContextTableKey, UnversionedDataType> {
+    : public VersionedTableInfo<Derived, SingleDeclTableKey,
+                                UnversionedDataType> {
 public:
   using key_type_ref = typename CommonTypeTableInfo::key_type_ref;
   using hash_value_type = typename CommonTypeTableInfo::hash_value_type;
 
   unsigned getKeyLength(key_type_ref) {
-    return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(IdentifierID);
+    return sizeof(uint32_t) + sizeof(IdentifierID);
   }
 
   void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) {
     llvm::support::endian::Writer writer(OS, llvm::endianness::little);
     writer.write<uint32_t>(Key.parentContextID);
-    writer.write<uint8_t>(Key.contextKind);
-    writer.write<IdentifierID>(Key.contextID);
+    writer.write<IdentifierID>(Key.nameID);
   }
 
   hash_value_type ComputeHash(key_type_ref Key) {
@@ -1346,12 +1408,20 @@ void APINotesWriter::addObjCMethod(ContextID CtxID, ObjCSelectorRef Selector,
   }
 }
 
+void APINotesWriter::addCXXMethod(ContextID CtxID, llvm::StringRef Name,
+                                  const CXXMethodInfo &Info,
+                                  VersionTuple SwiftVersion) {
+  IdentifierID NameID = Implementation->getIdentifier(Name);
+  SingleDeclTableKey Key(CtxID.Value, NameID);
+  Implementation->CXXMethods[Key].push_back({SwiftVersion, Info});
+}
+
 void APINotesWriter::addGlobalVariable(std::optional<Context> Ctx,
                                        llvm::StringRef Name,
                                        const GlobalVariableInfo &Info,
                                        VersionTuple SwiftVersion) {
   IdentifierID VariableID = Implementation->getIdentifier(Name);
-  ContextTableKey Key(Ctx, VariableID);
+  SingleDeclTableKey Key(Ctx, VariableID);
   Implementation->GlobalVariables[Key].push_back({SwiftVersion, Info});
 }
 
@@ -1360,7 +1430,7 @@ void APINotesWriter::addGlobalFunction(std::optional<Context> Ctx,
                                        const GlobalFunctionInfo &Info,
                                        VersionTuple SwiftVersion) {
   IdentifierID NameID = Implementation->getIdentifier(Name);
-  ContextTableKey Key(Ctx, NameID);
+  SingleDeclTableKey Key(Ctx, NameID);
   Implementation->GlobalFunctions[Key].push_back({SwiftVersion, Info});
 }
 
@@ -1374,7 +1444,7 @@ void APINotesWriter::addEnumConstant(llvm::StringRef Name,
 void APINotesWriter::addTag(std::optional<Context> Ctx, llvm::StringRef Name,
                             const TagInfo &Info, VersionTuple SwiftVersion) {
   IdentifierID TagID = Implementation->getIdentifier(Name);
-  ContextTableKey Key(Ctx, TagID);
+  SingleDeclTableKey Key(Ctx, TagID);
   Implementation->Tags[Key].push_back({SwiftVersion, Info});
 }
 
@@ -1382,7 +1452,7 @@ void APINotesWriter::addTypedef(std::optional<Context> Ctx,
                                 llvm::StringRef Name, const TypedefInfo &Info,
                                 VersionTuple SwiftVersion) {
   IdentifierID TypedefID = Implementation->getIdentifier(Name);
-  ContextTableKey Key(Ctx, TypedefID);
+  SingleDeclTableKey Key(Ctx, TypedefID);
   Implementation->Typedefs[Key].push_back({SwiftVersion, Info});
 }
 } // namespace api_notes
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 870b64e3b7a9b..11cccc94a15f0 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -406,6 +406,9 @@ template <> struct ScalarEnumerationTraits<EnumConvenienceAliasKind> {
 } // namespace llvm
 
 namespace {
+struct Tag;
+typedef std::vector<Tag> TagsSeq;
+
 struct Tag {
   StringRef Name;
   AvailabilityItem Availability;
@@ -420,9 +423,12 @@ struct Tag {
   std::optional<bool> FlagEnum;
   std::optional<EnumConvenienceAliasKind> EnumConvenienceKind;
   std::optional<bool> SwiftCopyable;
-};
+  FunctionsSeq Methods;
 
-typedef std::vector<Tag> TagsSeq;
+  /// Tags that are declared within the current tag. Only the tags that have
+  /// corresponding API Notes will be listed.
+  TagsSeq Tags;
+};
 } // namespace
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(Tag)
@@ -454,6 +460,8 @@ template <> struct MappingTraits<Tag> {
     IO.mapOptional("FlagEnum", T.FlagEnum);
     IO.mapOptional("EnumKind", T.EnumConvenienceKind);
     IO.mapOptional("SwiftCopyable", T.SwiftCopyable);
+    IO.mapOptional("Methods", T.Methods);
+    IO.mapOptional("Tags", T.Tags);
   }
 };
 } // namespace yaml
@@ -874,6 +882,101 @@ class YAMLConverter {
                          TheNamespace.Items, SwiftVersion);
   }
 
+  void convertFunction(const Function &Function, FunctionInfo &FI) {
+    convertAvailability(Function.Availability, FI, Function.Name);
+    FI.setSwiftPrivate(Function.SwiftPrivate);
+    FI.SwiftName = std::string(Function.SwiftName);
+    convertParams(Function.Params, FI);
+    convertNullability(Function.Nullability, Function.NullabilityOfRet, FI,
+                       Function.Name);
+    FI.ResultType = std::string(Function.ResultType);
+    FI.setRetainCountConvention(Function.RetainCountConvention);
+  }
+
+  void convertTagContext(std::optional<Context> ParentContext, const Tag &T,
+                         VersionTuple SwiftVersion) {
+    TagInfo TI;
+    std::optional<ContextID> ParentContextID =
+        ParentContext ? std::optional<ContextID>(ParentContext->id)
+                      : std::nullopt;
+    convertCommonType(T, TI, T.Name);
+
+    if ((T.SwiftRetainOp || T.SwiftReleaseOp) && !T.SwiftImportAs) {
+      emitError(llvm::Twine("should declare SwiftImportAs to use "
+                            "SwiftRetainOp and SwiftReleaseOp (for ") +
+                T.Name + ")");
+      return;
+    }
+    if (T.SwiftReleaseOp.has_value() != T.SwiftRetainOp.has_value()) {
+      emitError(llvm::Twine("should declare both SwiftReleaseOp and "
+                            "SwiftRetainOp (for ") +
+                T.Name + ")");
+      return;
+    }
+
+    if (T.SwiftImportAs)
+      TI.SwiftImportAs = T.SwiftImportAs;
+    if (T.SwiftRetainOp)
+      TI.SwiftRetainOp = T.SwiftRetainOp;
+    if (T.SwiftReleaseOp)
+      TI.SwiftReleaseOp = T.SwiftReleaseOp;
+
+    if (T.SwiftCopyable)
+      TI.setSwiftCopyable(T.SwiftCopyable);
+
+    if (T.EnumConvenienceKind) {
+      if (T.EnumExtensibility) {
+        emitError(
+            llvm::Twine("cannot mix EnumKind and EnumExtensibility (for ") +
+            T.Name + ")");
+        return;
+      }
+      if (T.FlagEnum) {
+        emitError(llvm::Twine("cannot mix EnumKind and FlagEnum (for ") +
+                  T.Name + ")");
+        return;
+      }
+      switch (*T.EnumConvenienceKind) {
+      case EnumConvenienceAliasKind::None:
+        TI.EnumExtensibility = EnumExtensibilityKind::None;
+        TI.setFlagEnum(false);
+        break;
+      case EnumConvenienceAliasKind::CFEnum:
+        TI.EnumExtensibility = EnumExtensibilityKind::Open;
+        TI.setFlagEnum(false);
+        break;
+      case EnumConvenienceAliasKind::CFOptions:
+        TI.EnumExtensibility = EnumExtensibilityKind::Open;
+        TI.setFlagEnum(true);
+        break;
+      case EnumConvenienceAliasKind::CFClosedEnum:
+        TI.EnumExtensibility = EnumExtensibilityKind::Closed;
+        TI.setFlagEnum(false);
+        break;
+      }
+    } else {
+      TI.EnumExtensibility = T.EnumExtensibility;
+      TI.setFlagEnum(T.FlagEnum);
+    }
+
+    Writer.addTag(ParentContext, T.Name, TI, SwiftVersion);
+
+    ContextInfo CI;
+    auto TagCtxID = Writer.addContext(ParentContextID, T.Name, ContextKind::Tag,
+                                      CI, SwiftVersion);
+    Context TagCtx(TagCtxID, ContextKind::Tag);
+
+    for (const auto &CXXMethod : T.Methods) {
+      CXXMethodInfo MI;
+      convertFunction(CXXMethod, MI);
+      Writer.addCXXMethod(TagCtxID, CXXMethod.Name, MI, SwiftVersion);
+    }
+
+    // Convert nested tags.
+    for (const auto &Tag : T.Tags)
+      convertTagContext(TagCtx, Tag, SwiftVersion);
+  }
+
   void convertTopLevelItems(std::optional<Context> Ctx,
                             const TopLevelItems &TLItems,
                             VersionTuple SwiftVersion) {
@@ -950,14 +1053,7 @@ class YAMLConverter {
       }
 
       GlobalFunctionInfo GFI;
-      convertAvailability(Function.Availability, GFI, Function.Name);
-      GFI.setSwiftPrivate(Function.SwiftPrivate);
-      GFI.SwiftName = std::string(Function.SwiftName);
-      convertParams(Function.Params, GFI);
-      convertNullability(Function.Nullability, Function.NullabilityOfRet, GFI,
-                         Function.Name);
-      GFI.ResultType = std::string(Function.ResultType);
-      GFI.setRetainCountConvention(Function.RetainCountConvention);
+      convertFunction(Function, GFI);
       Writer.addGlobalFunction(Ctx, Function.Name, GFI, SwiftVersion);
     }
 
@@ -988,68 +1084,7 @@ class YAMLConverter {
         continue;
       }
 
-      TagInfo TI;
-      convertCommonType(Tag, TI, Tag.Name);
-
-      if ((Tag.SwiftRetainOp || Tag.SwiftReleaseOp) && !Tag.SwiftImportAs) {
-        emitError(llvm::Twine("should declare SwiftImportAs to use "
-                              "SwiftRetainOp and SwiftReleaseOp (for ") +
-                  Tag.Name + ")");
-        continue;
-      }
-      if (Tag.SwiftReleaseOp.has_value() != Tag.SwiftRetainOp.has_value()) {
-        emitError(llvm::Twine("should declare both SwiftReleaseOp and "
-                              "SwiftRetainOp (for ") +
-                  Tag.Name + ")");
-        continue;
-      }
-
-      if (Tag.SwiftImportAs)
-        TI.SwiftImportAs = Tag.SwiftImportAs;
-      if (Tag.SwiftRetainOp)
-        TI.SwiftRetainOp = Tag.SwiftRetainOp;
-      if (Tag.SwiftReleaseOp)
-        TI.SwiftReleaseOp = Tag.SwiftReleaseOp;
-
-      if (Tag.SwiftCopyable)
-        TI.setSwiftCopyable(Tag.SwiftCopyable);
-
-      if (Tag.EnumConvenienceKind) {
-        if (Tag.EnumExtensibility) {
-          emitError(
-              llvm::Twine("cannot mix EnumKind and EnumExtensibility (for ") +
-              Tag.Name + ")");
-          continue;
-        }
-        if (Tag.FlagEnum) {
-          emitError(llvm::Twine("cannot mix EnumKind and FlagEnum (for ") +
-                    Tag.Name + ")");
-          continue;
-        }
-        switch (*Tag.EnumConvenienceKind) {
-        case EnumConvenienceAliasKind::None:
-          TI.EnumExtensibility = EnumExtensibilityKind::None;
-          TI.setFlagEnum(false);
-          break;
-        case EnumConvenienceAliasKind::CFEnum:
-          TI.EnumExtensibility = EnumExtensibilityKind::Open;
-          TI.setFlagEnum(false);
-          break;
-        case EnumConvenienceAliasKind::CFOptions:
-          TI.EnumExtensibility = EnumExtensibilityKind::Open;
-          TI.setFlagEnum(true);
-          break;
-        case EnumConvenienceAliasKind::CFClosedEnum:
-          TI.EnumExtensibility = EnumExtensibilityKind::Closed;
-          TI.setFlagEnum(false);
-          break;
-        }
-      } else {
-        TI.EnumExtensibility = Tag.EnumExtensibility;
-        TI.setFlagEnum(Tag.FlagEnum);
-      }
-
-      Writer.addTag(Ctx, Tag.Name, TI, SwiftVersion);
+      convertTagContext(Ctx, Tag, SwiftVersion);
     }
 
     // Write all typedefs.
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index 95e7ac1a3d775..d8efbe44dbecb 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -28,11 +28,9 @@ CreateUnsatisfiedConstraintRecord(const ASTContext &C,
   else {
     auto &SubstitutionDiagnostic =
         *Detail.get<std::pair<SourceLocation, StringRef> *>();
-    unsigned MessageSize = SubstitutionDiagnostic.second.size();
-    char *Mem = new (C) char[MessageSize];
-    memcpy(Mem, SubstitutionDiagnostic.second.data(), MessageSize);
+    StringRef Message = C.backupStr(SubstitutionDiagnostic.second);
     auto *NewSubstDiag = new (C) std::pair<SourceLocation, StringRef>(
-        SubstitutionDiagnostic.first, StringRef(Mem, MessageSize));
+        SubstitutionDiagnostic.first, Message);
     new (TrailingObject) UnsatisfiedConstraintRecord(NewSubstDiag);
   }
 }
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 268faefe6f6cc..a779d136b9d31 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1389,7 +1389,8 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
 #include "clang/Basic/OpenCLExtensionTypes.def"
   }
 
-  if (Target.hasAArch64SVETypes()) {
+  if (Target.hasAArch64SVETypes() ||
+      (AuxTarget && AuxTarget->hasAArch64SVETypes())) {
 #define SVE_TYPE(Name, Id, SingletonId) \
     InitBuiltinType(SingletonId, BuiltinType::Id);
 #include "clang/Basic/AArch64SVEACLETypes.def"
@@ -3234,14 +3235,16 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
     OS << "<objc_object>";
     return;
 
-  case Type::Enum:
+  case Type::Enum: {
     // C11 6.7.2.2p4:
     //   Each enumerated type shall be compatible with char, a signed integer
     //   type, or an unsigned integer type.
     //
     // So we have to treat enum types as integers.
+    QualType UnderlyingType = cast<EnumType>(T)->getDecl()->getIntegerType();
     return encodeTypeForFunctionPointerAuth(
-        Ctx, OS, cast<EnumType>(T)->getDecl()->getIntegerType());
+        Ctx, OS, UnderlyingType.isNull() ? Ctx.IntTy : UnderlyingType);
+  }
 
   case Type::FunctionNoProto:
   case Type::FunctionProto: {
@@ -3292,7 +3295,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
     return;
 
   case Type::Builtin: {
-    const auto *BTy = T->getAs<BuiltinType>();
+    const auto *BTy = T->castAs<BuiltinType>();
     switch (BTy->getKind()) {
 #define SIGNED_TYPE(Id, SingletonId)                                           \
   case BuiltinType::Id:                                                        \
@@ -3372,9 +3375,10 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
 #include "clang/Basic/RISCVVTypes.def"
       llvm_unreachable("not yet implemented");
     }
+    llvm_unreachable("should never get here");
   }
   case Type::Record: {
-    const RecordDecl *RD = T->getAs<RecordType>()->getDecl();
+    const RecordDecl *RD = T->castAs<RecordType>()->getDecl();
     const IdentifierInfo *II = RD->getIdentifier();
 
     // In C++, an immediate typedef of an anonymous struct or union
@@ -3416,7 +3420,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
   }
 }
 
-uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) const {
+uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) {
   assert(!T->isDependentType() &&
          "cannot compute type discriminator of a dependent type");
 
@@ -3426,11 +3430,13 @@ uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) const {
   if (T->isFunctionPointerType() || T->isFunctionReferenceType())
     T = T->getPointeeType();
 
-  if (T->isFunctionType())
+  if (T->isFunctionType()) {
     encodeTypeForFunctionPointerAuth(*this, Out, T);
-  else
-    llvm_unreachable(
-        "type discrimination of non-function type not implemented yet");
+  } else {
+    T = T.getUnqualifiedType();
+    std::unique_ptr<MangleContext> MC(createMangleContext());
+    MC->mangleCanonicalTypeName(T, Out);
+  }
 
   return llvm::getPointerAuthStableSipHash(Str);
 }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 1d7d52db39d10..adff21df590cb 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -359,6 +359,54 @@ namespace clang {
           Params, Importer.getToContext().getTranslationUnitDecl());
     }
 
+    template <typename TemplateParmDeclT>
+    void tryUpdateTemplateParmDeclInheritedFrom(NamedDecl *RecentParm,
+                                                NamedDecl *NewParm) {
+      if (auto *ParmT = dyn_cast<TemplateParmDeclT>(RecentParm)) {
+        if (ParmT->hasDefaultArgument()) {
+          auto *P = cast<TemplateParmDeclT>(NewParm);
+          P->removeDefaultArgument();
+          P->setInheritedDefaultArgument(Importer.ToContext, ParmT);
+        }
+      }
+    }
+
+    // Update the parameter list `NewParams` of a template declaration
+    // by "inheriting" default argument values from `RecentParams`,
+    // which is the parameter list of an earlier declaration of the
+    // same template. (Note that "inheriting" default argument values
+    // is not related to object-oriented inheritance.)
+    //
+    // In the clang AST template parameters (NonTypeTemplateParmDec,
+    // TemplateTypeParmDecl, TemplateTemplateParmDecl) have a reference to the
+    // default value, if one is specified at the first declaration. The default
+    // value can be specified only once. The template parameters of the
+    // following declarations have a reference to the original default value
+    // through the "inherited" value. This value should be set for all imported
+    // template parameters that have a previous declaration (also a previous
+    // template declaration).
+    //
+    // In the `Visit*ParmDecl` functions the default value of these template
+    // arguments is always imported. At that location the previous declaration
+    // is not easily accessible, it is not possible to call
+    // `setInheritedDefaultArgument` at that place.
+    // `updateTemplateParametersInheritedFrom` is called later when the already
+    // imported default value is erased and changed to "inherited".
+    // It is important to change the mode to "inherited" otherwise false
+    // structural in-equivalences could be detected.
+    void updateTemplateParametersInheritedFrom(
+        const TemplateParameterList &RecentParams,
+        TemplateParameterList &NewParams) {
+      for (auto [Idx, Param] : enumerate(RecentParams)) {
+        tryUpdateTemplateParmDeclInheritedFrom<NonTypeTemplateParmDecl>(
+            Param, NewParams.getParam(Idx));
+        tryUpdateTemplateParmDeclInheritedFrom<TemplateTypeParmDecl>(
+            Param, NewParams.getParam(Idx));
+        tryUpdateTemplateParmDeclInheritedFrom<TemplateTemplateParmDecl>(
+            Param, NewParams.getParam(Idx));
+      }
+    }
+
   public:
     explicit ASTNodeImporter(ASTImporter &Importer) : Importer(Importer) {}
 
@@ -2955,7 +3003,7 @@ ExpectedDecl ASTNodeImporter::VisitEnumDecl(EnumDecl *D) {
       if (auto *FoundEnum = dyn_cast<EnumDecl>(FoundDecl)) {
         if (!hasSameVisibilityContextAndLinkage(FoundEnum, D))
           continue;
-        if (IsStructuralMatch(D, FoundEnum)) {
+        if (IsStructuralMatch(D, FoundEnum, !SearchName.isEmpty())) {
           EnumDecl *FoundDef = FoundEnum->getDefinition();
           if (D->isThisDeclarationADefinition() && FoundDef)
             return Importer.MapImported(D, FoundDef);
@@ -2966,7 +3014,12 @@ ExpectedDecl ASTNodeImporter::VisitEnumDecl(EnumDecl *D) {
       }
     }
 
-    if (!ConflictingDecls.empty()) {
+    // In case of unnamed enums, we try to find an existing similar one, if none
+    // was found, perform the import always.
+    // Structural in-equivalence is not detected in this way here, but it may
+    // be found when the parent decl is imported (if the enum is part of a
+    // class). To make this totally exact a more difficult solution is needed.
+    if (SearchName && !ConflictingDecls.empty()) {
       ExpectedName NameOrErr = Importer.HandleNameConflict(
           SearchName, DC, IDNS, ConflictingDecls.data(),
           ConflictingDecls.size());
@@ -4180,12 +4233,6 @@ ExpectedDecl ASTNodeImporter::VisitFieldDecl(FieldDecl *D) {
                               D->getInClassInitStyle()))
     return ToField;
 
-  // We need [[no_unqiue_address]] attributes to be added to FieldDecl, before
-  // we add fields in CXXRecordDecl::addedMember, otherwise record will be
-  // marked as having non-zero size.
-  Err = Importer.ImportAttrs(ToField, D);
-  if (Err)
-    return std::move(Err);
   ToField->setAccess(D->getAccess());
   ToField->setLexicalDeclContext(LexicalDC);
   ToField->setImplicit(D->isImplicit());
@@ -6139,6 +6186,9 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) {
     }
 
     D2->setPreviousDecl(Recent);
+
+    updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+                                          **TemplateParamsOrErr);
   }
 
   return D2;
@@ -6453,6 +6503,9 @@ ExpectedDecl ASTNodeImporter::VisitVarTemplateDecl(VarTemplateDecl *D) {
         ToTemplated->setPreviousDecl(PrevTemplated);
     }
     ToVarTD->setPreviousDecl(Recent);
+
+    updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+                                          **TemplateParamsOrErr);
   }
 
   return ToVarTD;
@@ -6725,6 +6778,9 @@ ASTNodeImporter::VisitFunctionTemplateDecl(FunctionTemplateDecl *D) {
         TemplatedFD->setPreviousDecl(PrevTemplated);
     }
     ToFunc->setPreviousDecl(Recent);
+
+    updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+                                          *Params);
   }
 
   return ToFunc;
@@ -9400,19 +9456,6 @@ TranslationUnitDecl *ASTImporter::GetFromTU(Decl *ToD) {
   return FromDPos->second->getTranslationUnitDecl();
 }
 
-Error ASTImporter::ImportAttrs(Decl *ToD, Decl *FromD) {
-  if (!FromD->hasAttrs() || ToD->hasAttrs())
-    return Error::success();
-  for (const Attr *FromAttr : FromD->getAttrs()) {
-    auto ToAttrOrErr = Import(FromAttr);
-    if (ToAttrOrErr)
-      ToD->addAttr(*ToAttrOrErr);
-    else
-      return ToAttrOrErr.takeError();
-  }
-  return Error::success();
-}
-
 Expected<Decl *> ASTImporter::Import(Decl *FromD) {
   if (!FromD)
     return nullptr;
@@ -9546,8 +9589,15 @@ Expected<Decl *> ASTImporter::Import(Decl *FromD) {
   }
   // Make sure that ImportImpl registered the imported decl.
   assert(ImportedDecls.count(FromD) != 0 && "Missing call to MapImported?");
-  if (auto Error = ImportAttrs(ToD, FromD))
-    return std::move(Error);
+
+  if (FromD->hasAttrs())
+    for (const Attr *FromAttr : FromD->getAttrs()) {
+      auto ToAttrOrErr = Import(FromAttr);
+      if (ToAttrOrErr)
+        ToD->addAttr(*ToAttrOrErr);
+      else
+        return ToAttrOrErr.takeError();
+    }
 
   // Notify subclasses.
   Imported(FromD, ToD);
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index bc5a9206c0db2..a1f70546bde42 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -879,8 +879,6 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
       return IDNS_Ordinary;
     case Label:
       return IDNS_Label;
-    case IndirectField:
-      return IDNS_Ordinary | IDNS_Member;
 
     case Binding:
     case NonTypeTemplateParm:
@@ -918,6 +916,7 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
       return IDNS_ObjCProtocol;
 
     case Field:
+    case IndirectField:
     case ObjCAtDefsField:
     case ObjCIvar:
       return IDNS_Member;
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 72d68f39a97a5..9a3ede426e914 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -561,6 +561,42 @@ void CXXRecordDecl::addedClassSubobject(CXXRecordDecl *Subobj) {
     data().StructuralIfLiteral = false;
 }
 
+const CXXRecordDecl *CXXRecordDecl::getStandardLayoutBaseWithFields() const {
+  assert(
+      isStandardLayout() &&
+      "getStandardLayoutBaseWithFields called on a non-standard-layout type");
+#ifdef EXPENSIVE_CHECKS
+  {
+    unsigned NumberOfBasesWithFields = 0;
+    if (!field_empty())
+      ++NumberOfBasesWithFields;
+    llvm::SmallPtrSet<const CXXRecordDecl *, 8> UniqueBases;
+    forallBases([&](const CXXRecordDecl *Base) -> bool {
+      if (!Base->field_empty())
+        ++NumberOfBasesWithFields;
+      assert(
+          UniqueBases.insert(Base->getCanonicalDecl()).second &&
+          "Standard layout struct has multiple base classes of the same type");
+      return true;
+    });
+    assert(NumberOfBasesWithFields <= 1 &&
+           "Standard layout struct has fields declared in more than one class");
+  }
+#endif
+  if (!field_empty())
+    return this;
+  const CXXRecordDecl *Result = this;
+  forallBases([&](const CXXRecordDecl *Base) -> bool {
+    if (!Base->field_empty()) {
+      // This is the base where the fields are declared; return early
+      Result = Base;
+      return false;
+    }
+    return true;
+  });
+  return Result;
+}
+
 bool CXXRecordDecl::hasConstexprDestructor() const {
   auto *Dtor = getDestructor();
   return Dtor ? Dtor->isConstexpr() : defaultedDestructorIsConstexpr();
@@ -675,6 +711,9 @@ bool CXXRecordDecl::hasSubobjectAtOffsetZeroOfEmptyBaseType(
       if (!IsFirstField && !FD->isZeroSize(Ctx))
         continue;
 
+      if (FD->isInvalidDecl())
+        continue;
+
       //   -- If X is n array type, [visit the element type]
       QualType T = Ctx.getBaseElementType(FD->getType());
       if (auto *RD = T->getAsCXXRecordDecl())
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 722c7fcf0b0df..976b3a3e1eced 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -44,6 +44,12 @@ using namespace clang;
 // TemplateParameterList Implementation
 //===----------------------------------------------------------------------===//
 
+template <class TemplateParam>
+static bool
+DefaultTemplateArgumentContainsUnexpandedPack(const TemplateParam &P) {
+  return P.hasDefaultArgument() &&
+         P.getDefaultArgument().getArgument().containsUnexpandedParameterPack();
+}
 
 TemplateParameterList::TemplateParameterList(const ASTContext& C,
                                              SourceLocation TemplateLoc,
@@ -61,27 +67,30 @@ TemplateParameterList::TemplateParameterList(const ASTContext& C,
 
     bool IsPack = P->isTemplateParameterPack();
     if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(P)) {
-      if (!IsPack && NTTP->getType()->containsUnexpandedParameterPack())
+      if (!IsPack && (NTTP->getType()->containsUnexpandedParameterPack() ||
+                      DefaultTemplateArgumentContainsUnexpandedPack(*NTTP)))
         ContainsUnexpandedParameterPack = true;
       if (NTTP->hasPlaceholderTypeConstraint())
         HasConstrainedParameters = true;
     } else if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(P)) {
       if (!IsPack &&
-          TTP->getTemplateParameters()->containsUnexpandedParameterPack())
+          (TTP->getTemplateParameters()->containsUnexpandedParameterPack() ||
+           DefaultTemplateArgumentContainsUnexpandedPack(*TTP))) {
         ContainsUnexpandedParameterPack = true;
+      }
     } else if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(P)) {
-      if (const TypeConstraint *TC = TTP->getTypeConstraint()) {
-        if (TC->getImmediatelyDeclaredConstraint()
-            ->containsUnexpandedParameterPack())
-          ContainsUnexpandedParameterPack = true;
+      if (!IsPack && DefaultTemplateArgumentContainsUnexpandedPack(*TTP)) {
+        ContainsUnexpandedParameterPack = true;
+      } else if (const TypeConstraint *TC = TTP->getTypeConstraint();
+                 TC && TC->getImmediatelyDeclaredConstraint()
+                           ->containsUnexpandedParameterPack()) {
+        ContainsUnexpandedParameterPack = true;
       }
       if (TTP->hasTypeConstraint())
         HasConstrainedParameters = true;
     } else {
       llvm_unreachable("unexpected template parameter type");
     }
-    // FIXME: If a default argument contains an unexpanded parameter pack, the
-    // template parameter list does too.
   }
 
   if (HasRequiresClause) {
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index f6956f132bd93..ec2ffeb9ef44c 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -2428,7 +2428,7 @@ APValue SourceLocExpr::EvaluateInContext(const ASTContext &Ctx,
 EmbedExpr::EmbedExpr(const ASTContext &Ctx, SourceLocation Loc,
                      EmbedDataStorage *Data, unsigned Begin,
                      unsigned NumOfElements)
-    : Expr(EmbedExprClass, Ctx.UnsignedCharTy, VK_PRValue, OK_Ordinary),
+    : Expr(EmbedExprClass, Ctx.IntTy, VK_PRValue, OK_Ordinary),
       EmbedKeywordLoc(Loc), Ctx(&Ctx), Data(Data), Begin(Begin),
       NumOfElements(NumOfElements) {
   setDependence(ExprDependence::None);
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index 8d2a1b5611ccc..6212989e21737 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -152,7 +152,7 @@ bool CXXTypeidExpr::isMostDerived(ASTContext &Context) const {
   const Expr *E = getExprOperand()->IgnoreParenNoopCasts(Context);
   if (const auto *DRE = dyn_cast<DeclRefExpr>(E)) {
     QualType Ty = DRE->getDecl()->getType();
-    if (!Ty->isPointerType() && !Ty->isReferenceType())
+    if (!Ty->isPointerOrReferenceType())
       return true;
   }
 
@@ -1944,3 +1944,22 @@ CXXParenListInitExpr *CXXParenListInitExpr::CreateEmpty(ASTContext &C,
                          alignof(CXXParenListInitExpr));
   return new (Mem) CXXParenListInitExpr(Empty, NumExprs);
 }
+
+CXXFoldExpr::CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee,
+                                SourceLocation LParenLoc, Expr *LHS,
+                                BinaryOperatorKind Opcode,
+                                SourceLocation EllipsisLoc, Expr *RHS,
+                                SourceLocation RParenLoc,
+                                std::optional<unsigned> NumExpansions)
+    : Expr(CXXFoldExprClass, T, VK_PRValue, OK_Ordinary), LParenLoc(LParenLoc),
+      EllipsisLoc(EllipsisLoc), RParenLoc(RParenLoc),
+      NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) {
+  // We rely on asserted invariant to distinguish left and right folds.
+  assert(((LHS && LHS->containsUnexpandedParameterPack()) !=
+          (RHS && RHS->containsUnexpandedParameterPack())) &&
+         "Exactly one of LHS or RHS should contain an unexpanded pack");
+  SubExprs[SubExpr::Callee] = Callee;
+  SubExprs[SubExpr::LHS] = LHS;
+  SubExprs[SubExpr::RHS] = RHS;
+  setDependence(computeDependence(this));
+}
diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp
index 04061616dc5e9..7e6279f7dd25c 100644
--- a/clang/lib/AST/ExprClassification.cpp
+++ b/clang/lib/AST/ExprClassification.cpp
@@ -119,12 +119,6 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
 
     // First come the expressions that are always lvalues, unconditionally.
   case Expr::ObjCIsaExprClass:
-    // C++ [expr.prim.general]p1: A string literal is an lvalue.
-  case Expr::StringLiteralClass:
-    // @encode is equivalent to its string
-  case Expr::ObjCEncodeExprClass:
-    // __func__ and friends are too.
-  case Expr::PredefinedExprClass:
     // Property references are lvalues
   case Expr::ObjCSubscriptRefExprClass:
   case Expr::ObjCPropertyRefExprClass:
@@ -150,6 +144,26 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
   case Expr::OMPIteratorExprClass:
     return Cl::CL_LValue;
 
+    // C++ [expr.prim.general]p1: A string literal is an lvalue.
+  case Expr::StringLiteralClass:
+    // @encode is equivalent to its string
+  case Expr::ObjCEncodeExprClass:
+    // Except we special case them as prvalues when they are used to
+    // initialize a char array.
+    return E->isLValue() ? Cl::CL_LValue : Cl::CL_PRValue;
+
+    // __func__ and friends are too.
+    // The char array initialization special case also applies
+    // when they are transparent.
+  case Expr::PredefinedExprClass: {
+    auto *PE = cast<PredefinedExpr>(E);
+    const StringLiteral *SL = PE->getFunctionName();
+    if (PE->isTransparent())
+      return SL ? ClassifyInternal(Ctx, SL) : Cl::CL_LValue;
+    assert(!SL || SL->isLValue());
+    return Cl::CL_LValue;
+  }
+
     // C99 6.5.2.5p5 says that compound literals are lvalues.
     // In C++, they're prvalue temporaries, except for file-scope arrays.
   case Expr::CompoundLiteralExprClass:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3f7c2e6775e65..18fe6e4f19d4e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2401,6 +2401,10 @@ static bool CheckMemberPointerConstantExpression(EvalInfo &Info,
 /// produce an appropriate diagnostic.
 static bool CheckLiteralType(EvalInfo &Info, const Expr *E,
                              const LValue *This = nullptr) {
+  // The restriction to literal types does not exist in C++23 anymore.
+  if (Info.getLangOpts().CPlusPlus23)
+    return true;
+
   if (!E->isPRValue() || E->getType()->isLiteralType(Info.Ctx))
     return true;
 
@@ -2839,6 +2843,8 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
       // During constant-folding, a negative shift is an opposite shift. Such
       // a shift is not a constant expression.
       Info.CCEDiag(E, diag::note_constexpr_negative_shift) << RHS;
+      if (!Info.noteUndefinedBehavior())
+        return false;
       RHS = -RHS;
       goto shift_right;
     }
@@ -2849,19 +2855,23 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
     if (SA != RHS) {
       Info.CCEDiag(E, diag::note_constexpr_large_shift)
         << RHS << E->getType() << LHS.getBitWidth();
+      if (!Info.noteUndefinedBehavior())
+        return false;
     } else if (LHS.isSigned() && !Info.getLangOpts().CPlusPlus20) {
       // C++11 [expr.shift]p2: A signed left shift must have a non-negative
       // operand, and must not overflow the corresponding unsigned type.
       // C++2a [expr.shift]p2: E1 << E2 is the unique value congruent to
       // E1 x 2^E2 module 2^N.
-      if (LHS.isNegative())
+      if (LHS.isNegative()) {
         Info.CCEDiag(E, diag::note_constexpr_lshift_of_negative) << LHS;
-      else if (LHS.countl_zero() < SA)
+        if (!Info.noteUndefinedBehavior())
+          return false;
+      } else if (LHS.countl_zero() < SA) {
         Info.CCEDiag(E, diag::note_constexpr_lshift_discards);
+        if (!Info.noteUndefinedBehavior())
+          return false;
+      }
     }
-    if (Info.EvalStatus.Diag && !Info.EvalStatus.Diag->empty() &&
-        Info.getLangOpts().CPlusPlus11)
-      return false;
     Result = LHS << SA;
     return true;
   }
@@ -2875,6 +2885,8 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
       // During constant-folding, a negative shift is an opposite shift. Such a
       // shift is not a constant expression.
       Info.CCEDiag(E, diag::note_constexpr_negative_shift) << RHS;
+      if (!Info.noteUndefinedBehavior())
+        return false;
       RHS = -RHS;
       goto shift_left;
     }
@@ -2882,13 +2894,13 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
     // C++11 [expr.shift]p1: Shift width must be less than the bit width of the
     // shifted type.
     unsigned SA = (unsigned) RHS.getLimitedValue(LHS.getBitWidth()-1);
-    if (SA != RHS)
+    if (SA != RHS) {
       Info.CCEDiag(E, diag::note_constexpr_large_shift)
         << RHS << E->getType() << LHS.getBitWidth();
+      if (!Info.noteUndefinedBehavior())
+        return false;
+    }
 
-    if (Info.EvalStatus.Diag && !Info.EvalStatus.Diag->empty() &&
-        Info.getLangOpts().CPlusPlus11)
-      return false;
     Result = LHS >> SA;
     return true;
   }
@@ -12979,19 +12991,35 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
           Info.Ctx.getTargetInfo().getMaxAtomicInlineWidth();
       if (Size <= Info.Ctx.toCharUnitsFromBits(InlineWidthBits)) {
         if (BuiltinOp == Builtin::BI__c11_atomic_is_lock_free ||
-            Size == CharUnits::One() ||
-            E->getArg(1)->isNullPointerConstant(Info.Ctx,
-                                                Expr::NPC_NeverValueDependent))
-          // OK, we will inline appropriately-aligned operations of this size,
-          // and _Atomic(T) is appropriately-aligned.
+            Size == CharUnits::One())
           return Success(1, E);
 
-        QualType PointeeType = E->getArg(1)->IgnoreImpCasts()->getType()->
-          castAs<PointerType>()->getPointeeType();
-        if (!PointeeType->isIncompleteType() &&
-            Info.Ctx.getTypeAlignInChars(PointeeType) >= Size) {
-          // OK, we will inline operations on this object.
+        // If the pointer argument can be evaluated to a compile-time constant
+        // integer (or nullptr), check if that value is appropriately aligned.
+        const Expr *PtrArg = E->getArg(1);
+        Expr::EvalResult ExprResult;
+        APSInt IntResult;
+        if (PtrArg->EvaluateAsRValue(ExprResult, Info.Ctx) &&
+            ExprResult.Val.toIntegralConstant(IntResult, PtrArg->getType(),
+                                              Info.Ctx) &&
+            IntResult.isAligned(Size.getAsAlign()))
           return Success(1, E);
+
+        // Otherwise, check if the type's alignment against Size.
+        if (auto *ICE = dyn_cast<ImplicitCastExpr>(PtrArg)) {
+          // Drop the potential implicit-cast to 'const volatile void*', getting
+          // the underlying type.
+          if (ICE->getCastKind() == CK_BitCast)
+            PtrArg = ICE->getSubExpr();
+        }
+
+        if (auto PtrTy = PtrArg->getType()->getAs<PointerType>()) {
+          QualType PointeeType = PtrTy->getPointeeType();
+          if (!PointeeType->isIncompleteType() &&
+              Info.Ctx.getTypeAlignInChars(PointeeType) >= Size) {
+            // OK, we will inline operations on this object.
+            return Success(1, E);
+          }
         }
       }
     }
@@ -14068,6 +14096,12 @@ bool IntExprEvaluator::VisitUnaryExprOrTypeTraitExpr(
                      E);
   }
 
+  case UETT_PtrAuthTypeDiscriminator: {
+    if (E->getArgumentType()->isDependentType())
+      return false;
+    return Success(
+        Info.Ctx.getPointerAuthTypeDiscriminator(E->getArgumentType()), E);
+  }
   case UETT_VecStep: {
     QualType Ty = E->getTypeOfArgument();
 
diff --git a/clang/lib/AST/Interp/Boolean.h b/clang/lib/AST/Interp/Boolean.h
index 1bfb26b1b669f..23f7286036764 100644
--- a/clang/lib/AST/Interp/Boolean.h
+++ b/clang/lib/AST/Interp/Boolean.h
@@ -56,7 +56,7 @@ class Boolean final {
   APSInt toAPSInt(unsigned NumBits) const {
     return APSInt(toAPSInt().zextOrTrunc(NumBits), true);
   }
-  APValue toAPValue() const { return APValue(toAPSInt()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
   Boolean toUnsigned() const { return *this; }
 
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index a3d4c7d7392da..a01fa15dc0b7d 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -27,7 +27,9 @@ using namespace clang::interp;
 /// Similar information is available via ASTContext::BuiltinInfo,
 /// but that is not correct for our use cases.
 static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
-  return BuiltinID == Builtin::BI__builtin_classify_type;
+  return BuiltinID == Builtin::BI__builtin_classify_type ||
+         BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size ||
+         BuiltinID == Builtin::BI__builtin_constant_p;
 }
 
 Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 28c4ffd071862..258e4ed645254 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -89,6 +89,10 @@ bool InitLink::emit(Compiler<Emitter> *Ctx, const Expr *E) const {
     return Ctx->emitGetPtrLocal(Offset, E);
   case K_Decl:
     return Ctx->visitDeclRef(D, E);
+  case K_Elem:
+    if (!Ctx->emitConstUint32(Offset, E))
+      return false;
+    return Ctx->emitArrayElemPtrPopUint32(E);
   default:
     llvm_unreachable("Unhandled InitLink kind");
   }
@@ -466,6 +470,16 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (!this->visit(SubExpr))
       return false;
 
+    // Possibly diagnose casts to enum types if the target type does not
+    // have a fixed size.
+    if (Ctx.getLangOpts().CPlusPlus && CE->getType()->isEnumeralType()) {
+      if (const auto *ET = CE->getType().getCanonicalType()->getAs<EnumType>();
+          ET && !ET->getDecl()->isFixed()) {
+        if (!this->emitCheckEnumValue(*FromT, ET->getDecl(), CE))
+          return false;
+      }
+    }
+
     if (ToT == PT_IntAP)
       return this->emitCastAP(*FromT, Ctx.getBitWidth(CE->getType()), CE);
     if (ToT == PT_IntAPS)
@@ -1324,6 +1338,7 @@ bool Compiler<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
 
     auto initPrimitiveField = [=](const Record::Field *FieldToInit,
                                   const Expr *Init, PrimType T) -> bool {
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(Init));
       if (!this->visit(Init))
         return false;
 
@@ -1334,6 +1349,7 @@ bool Compiler<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
 
     auto initCompositeField = [=](const Record::Field *FieldToInit,
                                   const Expr *Init) -> bool {
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(Init));
       InitLinkScope<Emitter> ILS(this, InitLink::Field(FieldToInit->Offset));
       // Non-primitive case. Get a pointer to the field-to-initialize
       // on the stack and recurse into visitInitializer().
@@ -1544,6 +1560,7 @@ bool Compiler<Emitter>::visitArrayElemInit(unsigned ElemIndex,
     return this->emitInitElem(*T, ElemIndex, Init);
   }
 
+  InitLinkScope<Emitter> ILS(this, InitLink::Elem(ElemIndex));
   // Advance the pointer currently on the stack to the given
   // dimension.
   if (!this->emitConstUint32(ElemIndex, Init))
@@ -1686,10 +1703,8 @@ bool Compiler<Emitter>::VisitUnaryExprOrTypeTraitExpr(
   if (Kind == UETT_VectorElements) {
     if (const auto *VT = E->getTypeOfArgument()->getAs<VectorType>())
       return this->emitConst(VT->getNumElements(), E);
-
-    // FIXME: Apparently we need to catch the fact that a sizeless vector type
-    // has been passed and diagnose that (at run time).
     assert(E->getTypeOfArgument()->isSizelessVectorType());
+    return this->emitSizelessVectorElementSize(E);
   }
 
   if (Kind == UETT_VecStep) {
@@ -3184,13 +3199,9 @@ bool Compiler<Emitter>::VisitStmtExpr(const StmtExpr *E) {
     }
 
     assert(S == Result);
-    if (const Expr *ResultExpr = dyn_cast<Expr>(S)) {
-      if (DiscardResult)
-        return this->discard(ResultExpr);
+    if (const Expr *ResultExpr = dyn_cast<Expr>(S))
       return this->delegate(ResultExpr);
-    }
-
-    return this->visitStmt(S);
+    return this->emitUnsupported(E);
   }
 
   return BS.destroyLocals();
@@ -3245,6 +3256,9 @@ bool Compiler<Emitter>::visitInitializer(const Expr *E) {
   if (E->containsErrors())
     return this->emitError(E);
 
+  if (!this->checkLiteralType(E))
+    return false;
+
   OptionScope<Emitter> Scope(this, /*NewDiscardResult=*/false,
                              /*NewInitializing=*/true);
   return this->Visit(E);
@@ -3677,6 +3691,9 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD, bool Topleve
   const Expr *Init = VD->getInit();
   std::optional<PrimType> VarT = classify(VD->getType());
 
+  if (Init && Init->isValueDependent())
+    return false;
+
   if (Context::shouldBeGloballyIndexed(VD)) {
     auto checkDecl = [&]() -> bool {
       bool NeedsOp = !Toplevel && VD->isLocalVarDecl() && VD->isStaticLocal();
@@ -4078,12 +4095,7 @@ template <class Emitter>
 bool Compiler<Emitter>::VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *E) {
   SourceLocScope<Emitter> SLS(this, E);
 
-  bool Old = InitStackActive;
-  InitStackActive =
-      !(E->getUsedContext()->getDeclKind() == Decl::CXXConstructor);
-  bool Result = this->delegate(E->getExpr());
-  InitStackActive = Old;
-  return Result;
+  return this->delegate(E->getExpr());
 }
 
 template <class Emitter>
@@ -4141,10 +4153,14 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   // instance pointer of the current function frame, but e.g. to the declaration
   // currently being initialized. Here we emit the necessary instruction(s) for
   // this scenario.
+  if (!InitStackActive || !E->isImplicit())
+    return this->emitThis(E);
+
   if (InitStackActive && !InitStack.empty()) {
     unsigned StartIndex = 0;
     for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
-      if (InitStack[StartIndex].Kind != InitLink::K_Field)
+      if (InitStack[StartIndex].Kind != InitLink::K_Field &&
+          InitStack[StartIndex].Kind != InitLink::K_Elem)
         break;
     }
 
@@ -4357,6 +4373,7 @@ bool Compiler<Emitter>::visitWhileStmt(const WhileStmt *S) {
 
   if (!this->jump(CondLabel))
     return false;
+  this->fallthrough(EndLabel);
   this->emitLabel(EndLabel);
 
   return true;
@@ -4685,6 +4702,17 @@ bool Compiler<Emitter>::emitLambdaStaticInvokerBody(const CXXMethodDecl *MD) {
   return this->emitRetVoid(MD);
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::checkLiteralType(const Expr *E) {
+  if (Ctx.getLangOpts().CPlusPlus23)
+    return true;
+
+  if (!E->isPRValue() || E->getType()->isLiteralType(Ctx.getASTContext()))
+    return true;
+
+  return this->emitCheckLiteralType(E->getType().getTypePtr(), E);
+}
+
 template <class Emitter>
 bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) {
   // Classify the return type.
@@ -5226,6 +5254,10 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
           return false;
         };
 
+        // DecompositionDecls are just proxies for us.
+        if (isa<DecompositionDecl>(VD))
+          return revisit(VD);
+
         // Visit local const variables like normal.
         if ((VD->hasGlobalStorage() || VD->isLocalVarDecl() ||
              VD->isStaticDataMember()) &&
@@ -5274,8 +5306,8 @@ template <class Emitter>
 unsigned Compiler<Emitter>::collectBaseOffset(const QualType BaseType,
                                               const QualType DerivedType) {
   const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
-    if (const auto *PT = dyn_cast<PointerType>(Ty))
-      return PT->getPointeeType()->getAsCXXRecordDecl();
+    if (const auto *R = Ty->getPointeeCXXRecordDecl())
+      return R;
     return Ty->getAsCXXRecordDecl();
   };
   const CXXRecordDecl *BaseDecl = extractRecordDecl(BaseType);
diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h
index 6df723df2b444..6bc9985fe7232 100644
--- a/clang/lib/AST/Interp/Compiler.h
+++ b/clang/lib/AST/Interp/Compiler.h
@@ -33,6 +33,7 @@ template <class Emitter> class DestructorScope;
 template <class Emitter> class VariableScope;
 template <class Emitter> class DeclScope;
 template <class Emitter> class InitLinkScope;
+template <class Emitter> class InitStackScope;
 template <class Emitter> class OptionScope;
 template <class Emitter> class ArrayIndexScope;
 template <class Emitter> class SourceLocScope;
@@ -49,6 +50,7 @@ struct InitLink {
     K_Field = 1,
     K_Temp = 2,
     K_Decl = 3,
+    K_Elem = 5,
   };
 
   static InitLink This() { return InitLink{K_This}; }
@@ -67,6 +69,11 @@ struct InitLink {
     IL.D = D;
     return IL;
   }
+  static InitLink Elem(unsigned Index) {
+    InitLink IL{K_Elem};
+    IL.Offset = Index;
+    return IL;
+  }
 
   InitLink(uint8_t Kind) : Kind(Kind) {}
   template <class Emitter>
@@ -298,6 +305,7 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
   friend class DestructorScope<Emitter>;
   friend class DeclScope<Emitter>;
   friend class InitLinkScope<Emitter>;
+  friend class InitStackScope<Emitter>;
   friend class OptionScope<Emitter>;
   friend class ArrayIndexScope<Emitter>;
   friend class SourceLocScope<Emitter>;
@@ -351,6 +359,8 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
 
+  bool checkLiteralType(const Expr *E);
+
 protected:
   /// Variable to storage mapping.
   llvm::DenseMap<const ValueDecl *, Scope::Local> Locals;
@@ -612,6 +622,20 @@ template <class Emitter> class InitLinkScope final {
   Compiler<Emitter> *Ctx;
 };
 
+template <class Emitter> class InitStackScope final {
+public:
+  InitStackScope(Compiler<Emitter> *Ctx, bool Active)
+      : Ctx(Ctx), OldValue(Ctx->InitStackActive) {
+    Ctx->InitStackActive = Active;
+  }
+
+  ~InitStackScope() { this->Ctx->InitStackActive = OldValue; }
+
+private:
+  Compiler<Emitter> *Ctx;
+  bool OldValue;
+};
+
 } // namespace interp
 } // namespace clang
 
diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp
index b5e992c5a9ac1..b1e06cd4d8a4c 100644
--- a/clang/lib/AST/Interp/Context.cpp
+++ b/clang/lib/AST/Interp/Context.cpp
@@ -176,8 +176,7 @@ std::optional<PrimType> Context::classify(QualType T) const {
       T->isFunctionType())
     return PT_FnPtr;
 
-  if (T->isReferenceType() || T->isPointerType() ||
-      T->isObjCObjectPointerType())
+  if (T->isPointerOrReferenceType() || T->isObjCObjectPointerType())
     return PT_Ptr;
 
   if (const auto *AT = T->getAs<AtomicType>())
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index f7d1201f625bb..671f2c03d7e5c 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -33,7 +33,8 @@ static void dtorTy(Block *, std::byte *Ptr, const Descriptor *) {
 template <typename T>
 static void moveTy(Block *, const std::byte *Src, std::byte *Dst,
                    const Descriptor *) {
-  const auto *SrcPtr = reinterpret_cast<const T *>(Src);
+  // FIXME: Get rid of the const_cast.
+  auto *SrcPtr = reinterpret_cast<T *>(const_cast<std::byte *>(Src));
   auto *DstPtr = reinterpret_cast<T *>(Dst);
   new (DstPtr) T(std::move(*SrcPtr));
 }
@@ -162,7 +163,8 @@ static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
 }
 
 static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
-                     bool IsActive, const Descriptor *D, unsigned FieldOffset) {
+                     bool IsActive, const Descriptor *D, unsigned FieldOffset,
+                     bool IsVirtualBase) {
   assert(D);
   assert(D->ElemRecord);
 
@@ -172,13 +174,14 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
   Desc->Desc = D;
   Desc->IsInitialized = D->IsArray;
   Desc->IsBase = true;
+  Desc->IsVirtualBase = IsVirtualBase;
   Desc->IsActive = IsActive && !IsUnion;
   Desc->IsConst = IsConst || D->IsConst;
   Desc->IsFieldMutable = IsMutable || D->IsMutable;
 
   for (const auto &V : D->ElemRecord->bases())
     initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
-             V.Offset);
+             V.Offset, false);
   for (const auto &F : D->ElemRecord->fields())
     initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, IsUnion,
               F.Desc, F.Offset);
@@ -187,11 +190,11 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
 static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
                        bool IsActive, const Descriptor *D) {
   for (const auto &V : D->ElemRecord->bases())
-    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset);
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
   for (const auto &F : D->ElemRecord->fields())
     initField(B, Ptr, IsConst, IsMutable, IsActive, D->ElemRecord->isUnion(), F.Desc, F.Offset);
   for (const auto &V : D->ElemRecord->virtual_bases())
-    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset);
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
 }
 
 static void destroyField(Block *B, std::byte *Ptr, const Descriptor *D,
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 0dd97812e5a5c..0cc5d77c407e3 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -83,6 +83,8 @@ struct InlineDescriptor {
   /// Flag indicating if the field is an embedded base class.
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsBase : 1;
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned IsVirtualBase : 1;
   /// Flag indicating if the field is the active member of a union.
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsActive : 1;
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index c6c6275593007..5e3a5b9515b52 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -278,10 +278,15 @@ LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
   OS << "\n";
   OS.indent(Spaces) << "This: " << getThis() << "\n";
   OS.indent(Spaces) << "RVO: " << getRVOPtr() << "\n";
-
-  while (const InterpFrame *F = this->Caller) {
+  OS.indent(Spaces) << "Depth: " << Depth << "\n";
+  OS.indent(Spaces) << "ArgSize: " << ArgSize << "\n";
+  OS.indent(Spaces) << "Args: " << (void *)Args << "\n";
+  OS.indent(Spaces) << "FrameOffset: " << FrameOffset << "\n";
+  OS.indent(Spaces) << "FrameSize: " << (Func ? Func->getFrameSize() : 0)
+                    << "\n";
+
+  for (const InterpFrame *F = this->Caller; F; F = F->Caller) {
     F->dump(OS, Indent + 1);
-    F = F->Caller;
   }
 }
 
@@ -366,9 +371,9 @@ LLVM_DUMP_METHOD void EvaluationResult::dump() const {
 
     OS << "LValue: ";
     if (const auto *P = std::get_if<Pointer>(&Value))
-      P->toAPValue().printPretty(OS, ASTCtx, SourceType);
+      P->toAPValue(ASTCtx).printPretty(OS, ASTCtx, SourceType);
     else if (const auto *FP = std::get_if<FunctionPointer>(&Value)) // Nope
-      FP->toAPValue().printPretty(OS, ASTCtx, SourceType);
+      FP->toAPValue(ASTCtx).printPretty(OS, ASTCtx, SourceType);
     OS << "\n";
     break;
   }
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index 59e78686b78ad..08536536ac3c2 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -145,7 +145,7 @@ template <PrimType OpType> bool EvalEmitter::emitRet(const SourceInfo &Info) {
     return false;
 
   using T = typename PrimConv<OpType>::T;
-  EvalResult.setValue(S.Stk.pop<T>().toAPValue());
+  EvalResult.setValue(S.Stk.pop<T>().toAPValue(Ctx.getASTContext()));
   return true;
 }
 
@@ -169,7 +169,9 @@ template <> bool EvalEmitter::emitRet<PT_Ptr>(const SourceInfo &Info) {
       return false;
     // Never allow reading from a non-const pointer, unless the memory
     // has been created in this evaluation.
-    if (!Ptr.isConst() && Ptr.block()->getEvalID() != Ctx.getEvalID())
+    if (!Ptr.isZero() && Ptr.isBlockPointer() &&
+        Ptr.block()->getEvalID() != Ctx.getEvalID() &&
+        (!CheckLoad(S, OpPC, Ptr, AK_Read) || !Ptr.isConst()))
       return false;
 
     if (std::optional<APValue> V =
@@ -179,7 +181,7 @@ template <> bool EvalEmitter::emitRet<PT_Ptr>(const SourceInfo &Info) {
       return false;
     }
   } else {
-    EvalResult.setValue(Ptr.toAPValue());
+    EvalResult.setValue(Ptr.toAPValue(Ctx.getASTContext()));
   }
 
   return true;
@@ -283,7 +285,8 @@ void EvalEmitter::updateGlobalTemporaries() {
       APValue *Cached = Temp->getOrCreateValue(true);
 
       if (std::optional<PrimType> T = Ctx.classify(E->getType())) {
-        TYPE_SWITCH(*T, { *Cached = Ptr.deref<T>().toAPValue(); });
+        TYPE_SWITCH(
+            *T, { *Cached = Ptr.deref<T>().toAPValue(Ctx.getASTContext()); });
       } else {
         if (std::optional<APValue> APV =
                 Ptr.toRValue(Ctx, Temp->getTemporaryExpr()->getType()))
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index 0bebfd4ad984e..bdebd19af9f94 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -10,7 +10,9 @@
 #include "InterpState.h"
 #include "Record.h"
 #include "clang/AST/ExprCXX.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include <iterator>
 
 namespace clang {
 namespace interp {
@@ -21,9 +23,9 @@ APValue EvaluationResult::toAPValue() const {
   case LValue:
     // Either a pointer or a function pointer.
     if (const auto *P = std::get_if<Pointer>(&Value))
-      return P->toAPValue();
+      return P->toAPValue(Ctx->getASTContext());
     else if (const auto *FP = std::get_if<FunctionPointer>(&Value))
-      return FP->toAPValue();
+      return FP->toAPValue(Ctx->getASTContext());
     else
       llvm_unreachable("Unhandled LValue type");
     break;
@@ -46,7 +48,7 @@ std::optional<APValue> EvaluationResult::toRValue() const {
   if (const auto *P = std::get_if<Pointer>(&Value))
     return P->toRValue(*Ctx, getSourceType());
   else if (const auto *FP = std::get_if<FunctionPointer>(&Value)) // Nope
-    return FP->toAPValue();
+    return FP->toAPValue(Ctx->getASTContext());
   llvm_unreachable("Unhandled lvalue kind");
 }
 
@@ -122,19 +124,18 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc,
   }
 
   // Check Fields in all bases
-  for (const Record::Base &B : R->bases()) {
+  for (auto [I, B] : llvm::enumerate(R->bases())) {
     Pointer P = BasePtr.atField(B.Offset);
     if (!P.isInitialized()) {
       const Descriptor *Desc = BasePtr.getDeclDesc();
-      if (Desc->asDecl())
-        S.FFDiag(BasePtr.getDeclDesc()->asDecl()->getLocation(),
-                 diag::note_constexpr_uninitialized_base)
+      if (const auto *CD = dyn_cast_if_present<CXXRecordDecl>(R->getDecl())) {
+        const auto &BS = *std::next(CD->bases_begin(), I);
+        S.FFDiag(BS.getBaseTypeLoc(), diag::note_constexpr_uninitialized_base)
+            << B.Desc->getType() << BS.getSourceRange();
+      } else {
+        S.FFDiag(Desc->getLocation(), diag::note_constexpr_uninitialized_base)
             << B.Desc->getType();
-      else
-        S.FFDiag(BasePtr.getDeclDesc()->asExpr()->getExprLoc(),
-                 diag::note_constexpr_uninitialized_base)
-            << B.Desc->getType();
-
+      }
       return false;
     }
     Result &= CheckFieldsInitialized(S, Loc, P, B.R);
diff --git a/clang/lib/AST/Interp/Floating.h b/clang/lib/AST/Interp/Floating.h
index e4ac76d8509fb..114487821880f 100644
--- a/clang/lib/AST/Interp/Floating.h
+++ b/clang/lib/AST/Interp/Floating.h
@@ -69,7 +69,7 @@ class Floating final {
   APSInt toAPSInt(unsigned NumBits = 0) const {
     return APSInt(F.bitcastToAPInt());
   }
-  APValue toAPValue() const { return APValue(F); }
+  APValue toAPValue(const ASTContext &) const { return APValue(F); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
diff --git a/clang/lib/AST/Interp/FunctionPointer.h b/clang/lib/AST/Interp/FunctionPointer.h
index fc3d7a4214a72..d92cd32933fcd 100644
--- a/clang/lib/AST/Interp/FunctionPointer.h
+++ b/clang/lib/AST/Interp/FunctionPointer.h
@@ -23,11 +23,10 @@ class FunctionPointer final {
   bool Valid;
 
 public:
-  FunctionPointer(const Function *Func) : Func(Func), Valid(true) {
-    assert(Func);
-  }
+  FunctionPointer() = default;
+  FunctionPointer(const Function *Func) : Func(Func), Valid(true) {}
 
-  FunctionPointer(uintptr_t IntVal = 0, const Descriptor *Desc = nullptr)
+  FunctionPointer(uintptr_t IntVal, const Descriptor *Desc = nullptr)
       : Func(reinterpret_cast<const Function *>(IntVal)), Valid(false) {}
 
   const Function *getFunction() const { return Func; }
@@ -40,7 +39,7 @@ class FunctionPointer final {
     return Func->getDecl()->isWeak();
   }
 
-  APValue toAPValue() const {
+  APValue toAPValue(const ASTContext &) const {
     if (!Func)
       return APValue(static_cast<Expr *>(nullptr), CharUnits::Zero(), {},
                      /*OnePastTheEnd=*/false, /*IsNull=*/true);
@@ -69,7 +68,7 @@ class FunctionPointer final {
     if (!Func)
       return "nullptr";
 
-    return toAPValue().getAsString(Ctx, Func->getDecl()->getType());
+    return toAPValue(Ctx).getAsString(Ctx, Func->getDecl()->getType());
   }
 
   uint64_t getIntegerRepresentation() const {
diff --git a/clang/lib/AST/Interp/Integral.h b/clang/lib/AST/Interp/Integral.h
index db4cc9ae45b49..aafdd02676c96 100644
--- a/clang/lib/AST/Interp/Integral.h
+++ b/clang/lib/AST/Interp/Integral.h
@@ -112,7 +112,7 @@ template <unsigned Bits, bool Signed> class Integral final {
     else
       return APSInt(toAPSInt().zextOrTrunc(NumBits), !Signed);
   }
-  APValue toAPValue() const { return APValue(toAPSInt()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
   Integral<Bits, false> toUnsigned() const {
     return Integral<Bits, false>(*this);
diff --git a/clang/lib/AST/Interp/IntegralAP.h b/clang/lib/AST/Interp/IntegralAP.h
index 7464f15cdb03b..b8aa21038256c 100644
--- a/clang/lib/AST/Interp/IntegralAP.h
+++ b/clang/lib/AST/Interp/IntegralAP.h
@@ -133,7 +133,7 @@ template <bool Signed> class IntegralAP final {
     else
       return APSInt(V.zext(Bits), !Signed);
   }
-  APValue toAPValue() const { return APValue(toAPSInt()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
   bool isZero() const { return V.isZero(); }
   bool isPositive() const { return V.isNonNegative(); }
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index fb63228f8aea8..3694253ae9782 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -22,6 +22,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/StringExtras.h"
 #include <limits>
 #include <vector>
 
@@ -232,7 +233,8 @@ void cleanupAfterFunctionCall(InterpState &S, CodePtr OpPC) {
       assert(false && "Can't get arguments from that expression type");
 
     assert(NumArgs >= CurFunc->getNumWrittenParams());
-    NumVarArgs = NumArgs - CurFunc->getNumWrittenParams();
+    NumVarArgs = NumArgs - (CurFunc->getNumWrittenParams() +
+                            isa<CXXOperatorCallExpr>(CallSite));
     for (unsigned I = 0; I != NumVarArgs; ++I) {
       const Expr *A = Args[NumArgs - 1 - I];
       popArg(S, A);
@@ -332,7 +334,7 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) {
 }
 
 static bool CheckConstant(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
-  if (Ptr.isIntegralPointer())
+  if (!Ptr.isBlockPointer())
     return true;
   return CheckConstant(S, OpPC, Ptr.getDeclDesc());
 }
@@ -579,57 +581,62 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
     return false;
   }
 
-  if (!F->isConstexpr() || !F->hasBody()) {
-    const SourceLocation &Loc = S.Current->getLocation(OpPC);
-    if (S.getLangOpts().CPlusPlus11) {
-      const FunctionDecl *DiagDecl = F->getDecl();
+  if (F->isConstexpr() && F->hasBody() &&
+      (F->getDecl()->isConstexpr() || F->getDecl()->hasAttr<MSConstexprAttr>()))
+    return true;
 
-      // Invalid decls have been diagnosed before.
-      if (DiagDecl->isInvalidDecl())
-        return false;
+  // Implicitly constexpr.
+  if (F->isLambdaStaticInvoker())
+    return true;
 
-      // If this function is not constexpr because it is an inherited
-      // non-constexpr constructor, diagnose that directly.
-      const auto *CD = dyn_cast<CXXConstructorDecl>(DiagDecl);
-      if (CD && CD->isInheritingConstructor()) {
-        const auto *Inherited = CD->getInheritedConstructor().getConstructor();
-        if (!Inherited->isConstexpr())
-          DiagDecl = CD = Inherited;
-      }
+  const SourceLocation &Loc = S.Current->getLocation(OpPC);
+  if (S.getLangOpts().CPlusPlus11) {
+    const FunctionDecl *DiagDecl = F->getDecl();
 
-      // FIXME: If DiagDecl is an implicitly-declared special member function
-      // or an inheriting constructor, we should be much more explicit about why
-      // it's not constexpr.
-      if (CD && CD->isInheritingConstructor()) {
-        S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1)
+    // Invalid decls have been diagnosed before.
+    if (DiagDecl->isInvalidDecl())
+      return false;
+
+    // If this function is not constexpr because it is an inherited
+    // non-constexpr constructor, diagnose that directly.
+    const auto *CD = dyn_cast<CXXConstructorDecl>(DiagDecl);
+    if (CD && CD->isInheritingConstructor()) {
+      const auto *Inherited = CD->getInheritedConstructor().getConstructor();
+      if (!Inherited->isConstexpr())
+        DiagDecl = CD = Inherited;
+    }
+
+    // FIXME: If DiagDecl is an implicitly-declared special member function
+    // or an inheriting constructor, we should be much more explicit about why
+    // it's not constexpr.
+    if (CD && CD->isInheritingConstructor()) {
+      S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1)
           << CD->getInheritedConstructor().getConstructor()->getParent();
-        S.Note(DiagDecl->getLocation(), diag::note_declared_at);
-      } else {
-        // Don't emit anything if the function isn't defined and we're checking
-        // for a constant expression. It might be defined at the point we're
-        // actually calling it.
-        bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
-        if (!DiagDecl->isDefined() && !IsExtern &&
-            S.checkingPotentialConstantExpression())
-          return false;
+      S.Note(DiagDecl->getLocation(), diag::note_declared_at);
+    } else {
+      // Don't emit anything if the function isn't defined and we're checking
+      // for a constant expression. It might be defined at the point we're
+      // actually calling it.
+      bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
+      if (!DiagDecl->isDefined() && !IsExtern && DiagDecl->isConstexpr() &&
+          S.checkingPotentialConstantExpression())
+        return false;
 
-        // If the declaration is defined, declared 'constexpr' _and_ has a body,
-        // the below diagnostic doesn't add anything useful.
-        if (DiagDecl->isDefined() && DiagDecl->isConstexpr() &&
-            DiagDecl->hasBody())
-          return false;
+      // If the declaration is defined, declared 'constexpr' _and_ has a body,
+      // the below diagnostic doesn't add anything useful.
+      if (DiagDecl->isDefined() && DiagDecl->isConstexpr() &&
+          DiagDecl->hasBody())
+        return false;
 
-        S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1)
+      S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1)
           << DiagDecl->isConstexpr() << (bool)CD << DiagDecl;
-        S.Note(DiagDecl->getLocation(), diag::note_declared_at);
-      }
-    } else {
-      S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
+      S.Note(DiagDecl->getLocation(), diag::note_declared_at);
     }
-    return false;
+  } else {
+    S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
   }
 
-  return true;
+  return false;
 }
 
 bool CheckCallDepth(InterpState &S, CodePtr OpPC) {
@@ -722,8 +729,8 @@ bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC) {
     return true;
 
   const SourceInfo &E = S.Current->getSource(OpPC);
-  S.FFDiag(E, diag::note_constexpr_new);
-  return false;
+  S.CCEDiag(E, diag::note_constexpr_new);
+  return true;
 }
 
 bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, bool NewWasArray,
@@ -819,6 +826,106 @@ bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F,
   return true;
 }
 
+// FIXME: This is similar to code we already have in Compiler.cpp.
+// I think it makes sense to instead add the field and base destruction stuff
+// to the destructor Function itself. Then destroying a record would really
+// _just_ be calling its destructor. That would also help with the diagnostic
+// difference when the destructor or a field/base fails.
+static bool runRecordDestructor(InterpState &S, CodePtr OpPC,
+                                const Pointer &BasePtr,
+                                const Descriptor *Desc) {
+  assert(Desc->isRecord());
+  const Record *R = Desc->ElemRecord;
+  assert(R);
+
+  // Fields.
+  for (const Record::Field &Field : llvm::reverse(R->fields())) {
+    const Descriptor *D = Field.Desc;
+    if (D->isRecord()) {
+      if (!runRecordDestructor(S, OpPC, BasePtr.atField(Field.Offset), D))
+        return false;
+    } else if (D->isCompositeArray()) {
+      const Descriptor *ElemDesc = Desc->ElemDesc;
+      assert(ElemDesc->isRecord());
+      for (unsigned I = 0; I != Desc->getNumElems(); ++I) {
+        if (!runRecordDestructor(S, OpPC, BasePtr.atIndex(I).narrow(),
+                                 ElemDesc))
+          return false;
+      }
+    }
+  }
+
+  // Destructor of this record.
+  if (const CXXDestructorDecl *Dtor = R->getDestructor();
+      Dtor && !Dtor->isTrivial()) {
+    const Function *DtorFunc = S.getContext().getOrCreateFunction(Dtor);
+    if (!DtorFunc)
+      return false;
+
+    S.Stk.push<Pointer>(BasePtr);
+    if (!Call(S, OpPC, DtorFunc, 0))
+      return false;
+  }
+
+  // Bases.
+  for (const Record::Base &Base : llvm::reverse(R->bases())) {
+    if (!runRecordDestructor(S, OpPC, BasePtr.atField(Base.Offset), Base.Desc))
+      return false;
+  }
+
+  return true;
+}
+
+bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B) {
+  assert(B);
+  const Descriptor *Desc = B->getDescriptor();
+
+  if (Desc->isPrimitive() || Desc->isPrimitiveArray())
+    return true;
+
+  assert(Desc->isRecord() || Desc->isCompositeArray());
+
+  if (Desc->isCompositeArray()) {
+    const Descriptor *ElemDesc = Desc->ElemDesc;
+    assert(ElemDesc->isRecord());
+
+    Pointer RP(const_cast<Block *>(B));
+    for (unsigned I = 0; I != Desc->getNumElems(); ++I) {
+      if (!runRecordDestructor(S, OpPC, RP.atIndex(I).narrow(), ElemDesc))
+        return false;
+    }
+    return true;
+  }
+
+  assert(Desc->isRecord());
+  return runRecordDestructor(S, OpPC, Pointer(const_cast<Block *>(B)), Desc);
+}
+
+void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED,
+                       const APSInt &Value) {
+  llvm::APInt Min;
+  llvm::APInt Max;
+
+  if (S.EvaluatingDecl && !S.EvaluatingDecl->isConstexpr())
+    return;
+
+  ED->getValueRange(Max, Min);
+  --Max;
+
+  if (ED->getNumNegativeBits() &&
+      (Max.slt(Value.getSExtValue()) || Min.sgt(Value.getSExtValue()))) {
+    const SourceLocation &Loc = S.Current->getLocation(OpPC);
+    S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range)
+        << llvm::toString(Value, 10) << Min.getSExtValue() << Max.getSExtValue()
+        << ED;
+  } else if (!ED->getNumNegativeBits() && Max.ult(Value.getZExtValue())) {
+    const SourceLocation &Loc = S.Current->getLocation(OpPC);
+    S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range)
+        << llvm::toString(Value, 10) << Min.getZExtValue() << Max.getZExtValue()
+        << ED;
+  }
+}
+
 bool Interpret(InterpState &S, APValue &Result) {
   // The current stack frame when we started Interpret().
   // This is being used by the ops to determine wheter
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index b4f8c03280c85..63e9966b831db 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -39,8 +39,9 @@ namespace interp {
 using APSInt = llvm::APSInt;
 
 /// Convert a value to an APValue.
-template <typename T> bool ReturnValue(const T &V, APValue &R) {
-  R = V.toAPValue();
+template <typename T>
+bool ReturnValue(const InterpState &S, const T &V, APValue &R) {
+  R = V.toAPValue(S.getCtx());
   return true;
 }
 
@@ -152,7 +153,8 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
   if (RHS.isNegative()) {
     const SourceInfo &Loc = S.Current->getSource(OpPC);
     S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    return false;
+    if (!S.noteUndefinedBehavior())
+      return false;
   }
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
@@ -162,17 +164,24 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
     S.CCEDiag(E, diag::note_constexpr_large_shift) << Val << Ty << Bits;
-    return !(S.getEvalStatus().Diag && !S.getEvalStatus().Diag->empty() && S.getLangOpts().CPlusPlus11);
+    if (!S.noteUndefinedBehavior())
+      return false;
   }
 
   if (LHS.isSigned() && !S.getLangOpts().CPlusPlus20) {
     const Expr *E = S.Current->getExpr(OpPC);
     // C++11 [expr.shift]p2: A signed left shift must have a non-negative
     // operand, and must not overflow the corresponding unsigned type.
-    if (LHS.isNegative())
+    if (LHS.isNegative()) {
       S.CCEDiag(E, diag::note_constexpr_lshift_of_negative) << LHS.toAPSInt();
-    else if (LHS.toUnsigned().countLeadingZeros() < static_cast<unsigned>(RHS))
+      if (!S.noteUndefinedBehavior())
+        return false;
+    } else if (LHS.toUnsigned().countLeadingZeros() <
+               static_cast<unsigned>(RHS)) {
       S.CCEDiag(E, diag::note_constexpr_lshift_discards);
+      if (!S.noteUndefinedBehavior())
+        return false;
+    }
   }
 
   // C++2a [expr.shift]p2: [P0907R4]:
@@ -186,10 +195,15 @@ template <typename T>
 bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) {
   if (RHS.isZero()) {
     const auto *Op = cast<BinaryOperator>(S.Current->getExpr(OpPC));
+    if constexpr (std::is_same_v<T, Floating>) {
+      S.CCEDiag(Op, diag::note_expr_divide_by_zero)
+          << Op->getRHS()->getSourceRange();
+      return true;
+    }
+
     S.FFDiag(Op, diag::note_expr_divide_by_zero)
         << Op->getRHS()->getSourceRange();
-    if constexpr (!std::is_same_v<T, Floating>)
-      return false;
+    return false;
   }
 
   if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) {
@@ -286,7 +300,7 @@ bool Ret(InterpState &S, CodePtr &PC, APValue &Result) {
   } else {
     delete S.Current;
     S.Current = nullptr;
-    if (!ReturnValue<T>(Ret, Result))
+    if (!ReturnValue<T>(S, Ret, Result))
       return false;
   }
   return true;
@@ -1318,7 +1332,7 @@ bool InitGlobalTemp(InterpState &S, CodePtr OpPC, uint32_t I,
   const Pointer &Ptr = S.P.getGlobal(I);
 
   const T Value = S.Stk.peek<T>();
-  APValue APV = Value.toAPValue();
+  APValue APV = Value.toAPValue(S.getCtx());
   APValue *Cached = Temp->getOrCreateValue(true);
   *Cached = APV;
 
@@ -1557,7 +1571,10 @@ inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
     return false;
   if (!CheckSubobject(S, OpPC, Ptr, CSK_Base))
     return false;
-  S.Stk.push<Pointer>(Ptr.atField(Off));
+  const Pointer &Result = Ptr.atField(Off);
+  if (Result.isPastEnd())
+    return false;
+  S.Stk.push<Pointer>(Result);
   return true;
 }
 
@@ -1567,7 +1584,10 @@ inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) {
     return false;
   if (!CheckSubobject(S, OpPC, Ptr, CSK_Base))
     return false;
-  S.Stk.push<Pointer>(Ptr.atField(Off));
+  const Pointer &Result = Ptr.atField(Off);
+  if (Result.isPastEnd())
+    return false;
+  S.Stk.push<Pointer>(Result);
   return true;
 }
 
@@ -2263,8 +2283,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     // shift is not a constant expression.
     const SourceInfo &Loc = S.Current->getSource(OpPC);
     S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag &&
-        !S.getEvalStatus().Diag->empty())
+    if (!S.noteUndefinedBehavior())
       return false;
     RHS = -RHS;
     return DoShift < LT, RT,
@@ -2280,8 +2299,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // E1 x 2^E2 module 2^N.
       const SourceInfo &Loc = S.Current->getSource(OpPC);
       S.CCEDiag(Loc, diag::note_constexpr_lshift_of_negative) << LHS.toAPSInt();
-      if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag &&
-          !S.getEvalStatus().Diag->empty())
+      if (!S.noteUndefinedBehavior())
         return false;
     }
   }
@@ -2531,14 +2549,14 @@ inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func,
       if (!CheckInvoke(S, OpPC, ThisPtr))
         return false;
     }
-
-    if (S.checkingPotentialConstantExpression())
-      return false;
   }
 
   if (!CheckCallable(S, OpPC, Func))
     return false;
 
+  if (Func->hasThisPointer() && S.checkingPotentialConstantExpression())
+    return false;
+
   if (!CheckCallDepth(S, OpPC))
     return false;
 
@@ -2570,12 +2588,20 @@ inline bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
   size_t ThisOffset = ArgSize - (Func->hasRVO() ? primSize(PT_Ptr) : 0);
   Pointer &ThisPtr = S.Stk.peek<Pointer>(ThisOffset);
 
-  QualType DynamicType = ThisPtr.getDeclDesc()->getType();
-  const CXXRecordDecl *DynamicDecl;
-  if (DynamicType->isPointerType() || DynamicType->isReferenceType())
-    DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
-  else
-    DynamicDecl = ThisPtr.getDeclDesc()->getType()->getAsCXXRecordDecl();
+  const CXXRecordDecl *DynamicDecl = nullptr;
+  {
+    Pointer TypePtr = ThisPtr;
+    while (TypePtr.isBaseClass())
+      TypePtr = TypePtr.getBase();
+
+    QualType DynamicType = TypePtr.getType();
+    if (DynamicType->isPointerType() || DynamicType->isReferenceType())
+      DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
+    else
+      DynamicDecl = DynamicType->getAsCXXRecordDecl();
+  }
+  assert(DynamicDecl);
+
   const auto *StaticDecl = cast<CXXRecordDecl>(Func->getParentDecl());
   const auto *InitialFunction = cast<CXXMethodDecl>(Func->getDecl());
   const CXXMethodDecl *Overrider = S.getContext().getOverridingFunction(
@@ -2602,7 +2628,29 @@ inline bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
     }
   }
 
-  return Call(S, OpPC, Func, VarArgSize);
+  if (!Call(S, OpPC, Func, VarArgSize))
+    return false;
+
+  // Covariant return types. The return type of Overrider is a pointer
+  // or reference to a class type.
+  if (Overrider != InitialFunction &&
+      Overrider->getReturnType()->isPointerOrReferenceType() &&
+      InitialFunction->getReturnType()->isPointerOrReferenceType()) {
+    QualType OverriderPointeeType =
+        Overrider->getReturnType()->getPointeeType();
+    QualType InitialPointeeType =
+        InitialFunction->getReturnType()->getPointeeType();
+    // We've called Overrider above, but calling code expects us to return what
+    // InitialFunction returned. According to the rules for covariant return
+    // types, what InitialFunction returns needs to be a base class of what
+    // Overrider returns. So, we need to do an upcast here.
+    unsigned Offset = S.getContext().collectBaseOffset(
+        InitialPointeeType->getAsRecordDecl(),
+        OverriderPointeeType->getAsRecordDecl());
+    return GetPtrBasePop(S, OpPC, Offset);
+  }
+
+  return true;
 }
 
 inline bool CallBI(InterpState &S, CodePtr &PC, const Function *Func,
@@ -2735,6 +2783,15 @@ inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC,
   return CheckDeclRef(S, OpPC, DR);
 }
 
+inline bool SizelessVectorElementSize(InterpState &S, CodePtr OpPC) {
+  if (S.inConstantContext()) {
+    const SourceRange &ArgRange = S.Current->getRange(OpPC);
+    const Expr *E = S.Current->getExpr(OpPC);
+    S.CCEDiag(E, diag::note_constexpr_non_const_vectorelements) << ArgRange;
+  }
+  return false;
+}
+
 inline bool Assume(InterpState &S, CodePtr OpPC) {
   const auto Val = S.Stk.pop<Boolean>();
 
@@ -2774,6 +2831,20 @@ inline bool CheckNonNullArg(InterpState &S, CodePtr OpPC) {
   return false;
 }
 
+void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED,
+                       const APSInt &Value);
+
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+inline bool CheckEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED) {
+  assert(ED);
+  assert(!ED->isFixed());
+  const APSInt Val = S.Stk.peek<T>().toAPSInt();
+
+  if (S.inConstantContext())
+    diagnoseEnumValue(S, OpPC, ED, Val);
+  return true;
+}
+
 /// OldPtr -> Integer -> NewPtr.
 template <PrimType TIn, PrimType TOut>
 inline bool DecayPtr(InterpState &S, CodePtr OpPC) {
@@ -2782,6 +2853,13 @@ inline bool DecayPtr(InterpState &S, CodePtr OpPC) {
   using ToT = typename PrimConv<TOut>::T;
 
   const FromT &OldPtr = S.Stk.pop<FromT>();
+
+  if constexpr (std::is_same_v<FromT, FunctionPointer> &&
+                std::is_same_v<ToT, Pointer>) {
+    S.Stk.push<Pointer>(OldPtr.getFunction());
+    return true;
+  }
+
   S.Stk.push<ToT>(ToT(OldPtr.getIntegerRepresentation(), nullptr));
   return true;
 }
@@ -2872,8 +2950,8 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc,
   return true;
 }
 
+bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B);
 static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) {
-
   if (!CheckDynamicMemoryAllocation(S, OpPC))
     return false;
 
@@ -2904,6 +2982,10 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) {
   assert(Source);
   assert(BlockToDelete);
 
+  // Invoke destructors before deallocating the memory.
+  if (!RunDestructors(S, OpPC, BlockToDelete))
+    return false;
+
   DynamicAllocator &Allocator = S.getAllocator();
   bool WasArrayAlloc = Allocator.isArrayAllocation(Source);
   const Descriptor *BlockDesc = BlockToDelete->getDescriptor();
@@ -2918,6 +3000,39 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) {
                              BlockDesc, Source);
 }
 
+inline bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) {
+  assert(T);
+  assert(!S.getLangOpts().CPlusPlus23);
+
+  // C++1y: A constant initializer for an object o [...] may also invoke
+  // constexpr constructors for o and its subobjects even if those objects
+  // are of non-literal class types.
+  //
+  // C++11 missed this detail for aggregates, so classes like this:
+  //   struct foo_t { union { int i; volatile int j; } u; };
+  // are not (obviously) initializable like so:
+  //   __attribute__((__require_constant_initialization__))
+  //   static const foo_t x = {{0}};
+  // because "i" is a subobject with non-literal initialization (due to the
+  // volatile member of the union). See:
+  //   http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1677
+  // Therefore, we use the C++1y behavior.
+
+  if (S.EvaluatingDecl)
+    return true;
+
+  if (S.Current->getFunction() && S.Current->getFunction()->isConstructor() &&
+      S.Current->getThis().getDeclDesc()->asDecl() == S.EvaluatingDecl)
+    return true;
+
+  const Expr *E = S.Current->getExpr(OpPC);
+  if (S.getLangOpts().CPlusPlus11)
+    S.FFDiag(E, diag::note_constexpr_nonliteral) << E->getType();
+  else
+    S.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Read opcode arguments
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index 98928b3c22d7c..d7538c76e91d9 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
+#include "Compiler.h"
+#include "EvalEmitter.h"
 #include "Interp.h"
 #include "PrimType.h"
 #include "clang/AST/OSLog.h"
@@ -942,15 +944,29 @@ static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC,
       if (Ptr.isZero())
         return returnBool(true);
 
-      QualType PointeeType = Call->getArg(1)
-                                 ->IgnoreImpCasts()
-                                 ->getType()
-                                 ->castAs<PointerType>()
-                                 ->getPointeeType();
-      // OK, we will inline operations on this object.
-      if (!PointeeType->isIncompleteType() &&
-          S.getCtx().getTypeAlignInChars(PointeeType) >= Size)
-        return returnBool(true);
+      if (Ptr.isIntegralPointer()) {
+        uint64_t IntVal = Ptr.getIntegerRepresentation();
+        if (APSInt(APInt(64, IntVal, false), true).isAligned(Size.getAsAlign()))
+          return returnBool(true);
+      }
+
+      const Expr *PtrArg = Call->getArg(1);
+      // Otherwise, check if the type's alignment against Size.
+      if (const auto *ICE = dyn_cast<ImplicitCastExpr>(PtrArg)) {
+        // Drop the potential implicit-cast to 'const volatile void*', getting
+        // the underlying type.
+        if (ICE->getCastKind() == CK_BitCast)
+          PtrArg = ICE->getSubExpr();
+      }
+
+      if (auto PtrTy = PtrArg->getType()->getAs<PointerType>()) {
+        QualType PointeeType = PtrTy->getPointeeType();
+        if (!PointeeType->isIncompleteType() &&
+            S.getCtx().getTypeAlignInChars(PointeeType) >= Size) {
+          // OK, we will inline operations on this object.
+          return returnBool(true);
+        }
+      }
     }
   }
 
@@ -1113,6 +1129,76 @@ static bool interp__builtin_ptrauth_string_discriminator(
   return true;
 }
 
+// FIXME: This implementation is not complete.
+// The Compiler instance we create cannot access the current stack frame, local
+// variables, function parameters, etc. We also need protection from
+// side-effects, fatal errors, etc.
+static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
+                                       const InterpFrame *Frame,
+                                       const Function *Func,
+                                       const CallExpr *Call) {
+  const Expr *Arg = Call->getArg(0);
+  QualType ArgType = Arg->getType();
+
+  auto returnInt = [&S, Call](bool Value) -> bool {
+    pushInteger(S, Value, Call->getType());
+    return true;
+  };
+
+  // __builtin_constant_p always has one operand. The rules which gcc follows
+  // are not precisely documented, but are as follows:
+  //
+  //  - If the operand is of integral, floating, complex or enumeration type,
+  //    and can be folded to a known value of that type, it returns 1.
+  //  - If the operand can be folded to a pointer to the first character
+  //    of a string literal (or such a pointer cast to an integral type)
+  //    or to a null pointer or an integer cast to a pointer, it returns 1.
+  //
+  // Otherwise, it returns 0.
+  //
+  // FIXME: GCC also intends to return 1 for literals of aggregate types, but
+  // its support for this did not work prior to GCC 9 and is not yet well
+  // understood.
+  if (ArgType->isIntegralOrEnumerationType() || ArgType->isFloatingType() ||
+      ArgType->isAnyComplexType() || ArgType->isPointerType() ||
+      ArgType->isNullPtrType()) {
+    InterpStack Stk;
+    Compiler<EvalEmitter> C(S.Ctx, S.P, S, Stk);
+    auto Res = C.interpretExpr(Arg, /*ConvertResultToRValue=*/Arg->isGLValue());
+    if (Res.isInvalid()) {
+      C.cleanup();
+      Stk.clear();
+    }
+
+    if (!Res.isInvalid() && !Res.empty()) {
+      const APValue &LV = Res.toAPValue();
+      if (LV.isLValue()) {
+        APValue::LValueBase Base = LV.getLValueBase();
+        if (Base.isNull()) {
+          // A null base is acceptable.
+          return returnInt(true);
+        } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
+          if (!isa<StringLiteral>(E))
+            return returnInt(false);
+          return returnInt(LV.getLValueOffset().isZero());
+        } else if (Base.is<TypeInfoLValue>()) {
+          // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
+          // evaluate to true.
+          return returnInt(true);
+        } else {
+          // Any other base is not constant enough for GCC.
+          return returnInt(false);
+        }
+      }
+    }
+
+    // Otherwise, any constant value is good enough.
+    return returnInt(true);
+  }
+
+  return returnInt(false);
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
   const InterpFrame *Frame = S.Current;
@@ -1442,6 +1528,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
       return false;
     break;
 
+  case Builtin::BI__builtin_constant_p:
+    if (!interp__builtin_constant_p(S, OpPC, Frame, F, Call))
+      return false;
+    break;
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/lib/AST/Interp/MemberPointer.cpp b/clang/lib/AST/Interp/MemberPointer.cpp
index 96f63643e83c9..0c1b6edc5f7e1 100644
--- a/clang/lib/AST/Interp/MemberPointer.cpp
+++ b/clang/lib/AST/Interp/MemberPointer.cpp
@@ -60,13 +60,13 @@ FunctionPointer MemberPointer::toFunctionPointer(const Context &Ctx) const {
   return FunctionPointer(Ctx.getProgram().getFunction(cast<FunctionDecl>(Dcl)));
 }
 
-APValue MemberPointer::toAPValue() const {
+APValue MemberPointer::toAPValue(const ASTContext &ASTCtx) const {
   if (isZero())
     return APValue(static_cast<ValueDecl *>(nullptr), /*IsDerivedMember=*/false,
                    /*Path=*/{});
 
   if (hasBase())
-    return Base.toAPValue();
+    return Base.toAPValue(ASTCtx);
 
   return APValue(cast<ValueDecl>(getDecl()), /*IsDerivedMember=*/false,
                  /*Path=*/{});
diff --git a/clang/lib/AST/Interp/MemberPointer.h b/clang/lib/AST/Interp/MemberPointer.h
index f56dc530431e4..2b3be124db426 100644
--- a/clang/lib/AST/Interp/MemberPointer.h
+++ b/clang/lib/AST/Interp/MemberPointer.h
@@ -80,7 +80,7 @@ class MemberPointer final {
     return MemberPointer(Instance, this->Dcl, this->PtrOffset);
   }
 
-  APValue toAPValue() const;
+  APValue toAPValue(const ASTContext &) const;
 
   bool isZero() const { return Base.isZero() && !Dcl; }
   bool hasBase() const { return !Base.isZero(); }
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 3e69098570bd7..eeb9cb2e933a6 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -66,6 +66,8 @@ def ArgDecl : ArgType { let Name = "const Decl*"; }
 def ArgVarDecl : ArgType { let Name = "const VarDecl*"; }
 def ArgDesc : ArgType { let Name = "const Descriptor *"; }
 def ArgPrimType : ArgType { let Name = "PrimType"; }
+def ArgEnumDecl : ArgType { let Name = "const EnumDecl *"; }
+def ArgTypePtr : ArgType { let Name = "const Type *"; }
 
 //===----------------------------------------------------------------------===//
 // Classes of types instructions operate on.
@@ -389,6 +391,16 @@ def CheckDecl : Opcode {
   let Args = [ArgVarDecl];
 }
 
+def CheckEnumValue : Opcode {
+  let Args = [ArgEnumDecl];
+  let Types = [FixedSizeIntegralTypeClass];
+  let HasGroup = 1;
+}
+
+def CheckLiteralType : Opcode {
+  let Args = [ArgTypePtr];
+}
+
 // [] -> [Value]
 def GetGlobal : AccessOpcode;
 def GetGlobalUnchecked : AccessOpcode;
@@ -726,6 +738,8 @@ def InvalidDeclRef : Opcode {
   let Args = [ArgDeclRef];
 }
 
+def SizelessVectorElementSize : Opcode;
+
 def Assume : Opcode;
 
 def ArrayDecay : Opcode;
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index ff4da0fa805dc..3ac8bc2b09709 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -16,6 +16,7 @@
 #include "MemberPointer.h"
 #include "PrimType.h"
 #include "Record.h"
+#include "clang/AST/RecordLayout.h"
 
 using namespace clang;
 using namespace clang::interp;
@@ -54,7 +55,7 @@ Pointer::Pointer(Pointer &&P)
 }
 
 Pointer::~Pointer() {
-  if (isIntegralPointer())
+  if (!isBlockPointer())
     return;
 
   if (Block *Pointee = PointeeStorage.BS.Pointee) {
@@ -86,6 +87,8 @@ void Pointer::operator=(const Pointer &P) {
       PointeeStorage.BS.Pointee->addPointer(this);
   } else if (P.isIntegralPointer()) {
     PointeeStorage.Int = P.PointeeStorage.Int;
+  } else if (P.isFunctionPointer()) {
+    PointeeStorage.Fn = P.PointeeStorage.Fn;
   } else {
     assert(false && "Unhandled storage kind");
   }
@@ -114,12 +117,14 @@ void Pointer::operator=(Pointer &&P) {
       PointeeStorage.BS.Pointee->addPointer(this);
   } else if (P.isIntegralPointer()) {
     PointeeStorage.Int = P.PointeeStorage.Int;
+  } else if (P.isFunctionPointer()) {
+    PointeeStorage.Fn = P.PointeeStorage.Fn;
   } else {
     assert(false && "Unhandled storage kind");
   }
 }
 
-APValue Pointer::toAPValue() const {
+APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
   llvm::SmallVector<APValue::LValuePathEntry, 5> Path;
 
   if (isZero())
@@ -130,6 +135,8 @@ APValue Pointer::toAPValue() const {
                    CharUnits::fromQuantity(asIntPointer().Value + this->Offset),
                    Path,
                    /*IsOnePastEnd=*/false, /*IsNullPtr=*/false);
+  if (isFunctionPointer())
+    return asFunctionPointer().toAPValue(ASTCtx);
 
   // Build the lvalue base from the block.
   const Descriptor *Desc = getDeclDesc();
@@ -141,41 +148,79 @@ APValue Pointer::toAPValue() const {
   else
     llvm_unreachable("Invalid allocation type");
 
-  if (isDummy() || isUnknownSizeArray() || Desc->asExpr())
+  if (isUnknownSizeArray() || Desc->asExpr())
     return APValue(Base, CharUnits::Zero(), Path,
                    /*IsOnePastEnd=*/isOnePastEnd(), /*IsNullPtr=*/false);
 
-  // TODO: compute the offset into the object.
   CharUnits Offset = CharUnits::Zero();
 
+  auto getFieldOffset = [&](const FieldDecl *FD) -> CharUnits {
+    // This shouldn't happen, but if it does, don't crash inside
+    // getASTRecordLayout.
+    if (FD->getParent()->isInvalidDecl())
+      return CharUnits::Zero();
+    const ASTRecordLayout &Layout = ASTCtx.getASTRecordLayout(FD->getParent());
+    unsigned FieldIndex = FD->getFieldIndex();
+    return ASTCtx.toCharUnitsFromBits(Layout.getFieldOffset(FieldIndex));
+  };
+
   // Build the path into the object.
   Pointer Ptr = *this;
   while (Ptr.isField() || Ptr.isArrayElement()) {
     if (Ptr.isArrayRoot()) {
       Path.push_back(APValue::LValuePathEntry(
           {Ptr.getFieldDesc()->asDecl(), /*IsVirtual=*/false}));
+
+      if (const auto *FD = dyn_cast<FieldDecl>(Ptr.getFieldDesc()->asDecl()))
+        Offset += getFieldOffset(FD);
+
       Ptr = Ptr.getBase();
     } else if (Ptr.isArrayElement()) {
+      unsigned Index;
       if (Ptr.isOnePastEnd())
-        Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getArray().getNumElems()));
+        Index = Ptr.getArray().getNumElems();
       else
-        Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
+        Index = Ptr.getIndex();
+
+      Offset += (Index * ASTCtx.getTypeSizeInChars(Ptr.getType()));
+      Path.push_back(APValue::LValuePathEntry::ArrayIndex(Index));
       Ptr = Ptr.getArray();
     } else {
-      // TODO: figure out if base is virtual
       bool IsVirtual = false;
 
       // Create a path entry for the field.
       const Descriptor *Desc = Ptr.getFieldDesc();
       if (const auto *BaseOrMember = Desc->asDecl()) {
+        if (const auto *FD = dyn_cast<FieldDecl>(BaseOrMember)) {
+          Ptr = Ptr.getBase();
+          Offset += getFieldOffset(FD);
+        } else if (const auto *RD = dyn_cast<CXXRecordDecl>(BaseOrMember)) {
+          IsVirtual = Ptr.isVirtualBaseClass();
+          Ptr = Ptr.getBase();
+          const Record *BaseRecord = Ptr.getRecord();
+
+          const ASTRecordLayout &Layout = ASTCtx.getASTRecordLayout(
+              cast<CXXRecordDecl>(BaseRecord->getDecl()));
+          if (IsVirtual)
+            Offset += Layout.getVBaseClassOffset(RD);
+          else
+            Offset += Layout.getBaseClassOffset(RD);
+
+        } else {
+          Ptr = Ptr.getBase();
+        }
         Path.push_back(APValue::LValuePathEntry({BaseOrMember, IsVirtual}));
-        Ptr = Ptr.getBase();
         continue;
       }
       llvm_unreachable("Invalid field type");
     }
   }
 
+  // FIXME(perf): We compute the lvalue path above, but we can't supply it
+  // for dummy pointers (that causes crashes later in CheckConstantExpression).
+  if (isDummy())
+    Path.clear();
+
   // We assemble the LValuePath starting from the innermost pointer to the
   // outermost one. SO in a.b.c, the first element in Path will refer to
   // the field 'c', while later code expects it to refer to 'a'.
@@ -220,13 +265,19 @@ std::string Pointer::toDiagnosticString(const ASTContext &Ctx) const {
   if (isIntegralPointer())
     return (Twine("&(") + Twine(asIntPointer().Value + Offset) + ")").str();
 
-  return toAPValue().getAsString(Ctx, getType());
+  return toAPValue(Ctx).getAsString(Ctx, getType());
 }
 
 bool Pointer::isInitialized() const {
-  if (isIntegralPointer())
+  if (!isBlockPointer())
     return true;
 
+  if (isRoot() && PointeeStorage.BS.Base == sizeof(GlobalInlineDescriptor)) {
+    const GlobalInlineDescriptor &GD =
+        *reinterpret_cast<const GlobalInlineDescriptor *>(block()->rawData());
+    return GD.InitState == GlobalInitState::Initialized;
+  }
+
   assert(PointeeStorage.BS.Pointee &&
          "Cannot check if null pointer was initialized");
   const Descriptor *Desc = getFieldDesc();
@@ -249,23 +300,24 @@ bool Pointer::isInitialized() const {
   if (asBlockPointer().Base == 0)
     return true;
 
-  if (isRoot() && PointeeStorage.BS.Base == sizeof(GlobalInlineDescriptor)) {
-    const GlobalInlineDescriptor &GD =
-        *reinterpret_cast<const GlobalInlineDescriptor *>(block()->rawData());
-    return GD.InitState == GlobalInitState::Initialized;
-  }
-
   // Field has its bit in an inline descriptor.
   return getInlineDesc()->IsInitialized;
 }
 
 void Pointer::initialize() const {
-  if (isIntegralPointer())
+  if (!isBlockPointer())
     return;
 
   assert(PointeeStorage.BS.Pointee && "Cannot initialize null pointer");
   const Descriptor *Desc = getFieldDesc();
 
+  if (isRoot() && PointeeStorage.BS.Base == sizeof(GlobalInlineDescriptor)) {
+    GlobalInlineDescriptor &GD = *reinterpret_cast<GlobalInlineDescriptor *>(
+        asBlockPointer().Pointee->rawData());
+    GD.InitState = GlobalInitState::Initialized;
+    return;
+  }
+
   assert(Desc);
   if (Desc->isPrimitiveArray()) {
     // Primitive global arrays don't have an initmap.
@@ -294,13 +346,6 @@ void Pointer::initialize() const {
     return;
   }
 
-  if (isRoot() && PointeeStorage.BS.Base == sizeof(GlobalInlineDescriptor)) {
-    GlobalInlineDescriptor &GD = *reinterpret_cast<GlobalInlineDescriptor *>(
-        asBlockPointer().Pointee->rawData());
-    GD.InitState = GlobalInitState::Initialized;
-    return;
-  }
-
   // Field has its bit in an inline descriptor.
   assert(PointeeStorage.BS.Base != 0 &&
          "Only composite fields can be initialised");
@@ -329,10 +374,15 @@ bool Pointer::hasSameBase(const Pointer &A, const Pointer &B) {
 
   if (A.isIntegralPointer() && B.isIntegralPointer())
     return true;
+  if (A.isFunctionPointer() && B.isFunctionPointer())
+    return true;
 
   if (A.isIntegralPointer() || B.isIntegralPointer())
     return A.getSource() == B.getSource();
 
+  if (A.StorageKind != B.StorageKind)
+    return false;
+
   return A.asBlockPointer().Pointee == B.asBlockPointer().Pointee;
 }
 
@@ -344,10 +394,12 @@ bool Pointer::hasSameArray(const Pointer &A, const Pointer &B) {
 
 std::optional<APValue> Pointer::toRValue(const Context &Ctx,
                                          QualType ResultType) const {
+  const ASTContext &ASTCtx = Ctx.getASTContext();
   assert(!ResultType.isNull());
   // Method to recursively traverse composites.
   std::function<bool(QualType, const Pointer &, APValue &)> Composite;
-  Composite = [&Composite, &Ctx](QualType Ty, const Pointer &Ptr, APValue &R) {
+  Composite = [&Composite, &Ctx, &ASTCtx](QualType Ty, const Pointer &Ptr,
+                                          APValue &R) {
     if (const auto *AT = Ty->getAs<AtomicType>())
       Ty = AT->getValueType();
 
@@ -358,7 +410,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
 
     // Primitive values.
     if (std::optional<PrimType> T = Ctx.classify(Ty)) {
-      TYPE_SWITCH(*T, R = Ptr.deref<T>().toAPValue());
+      TYPE_SWITCH(*T, R = Ptr.deref<T>().toAPValue(ASTCtx));
       return true;
     }
 
@@ -375,7 +427,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
           QualType FieldTy = F.Decl->getType();
           if (FP.isActive()) {
             if (std::optional<PrimType> T = Ctx.classify(FieldTy)) {
-              TYPE_SWITCH(*T, Value = FP.deref<T>().toAPValue());
+              TYPE_SWITCH(*T, Value = FP.deref<T>().toAPValue(ASTCtx));
             } else {
               Ok &= Composite(FieldTy, FP, Value);
             }
@@ -398,7 +450,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
           APValue &Value = R.getStructField(I);
 
           if (std::optional<PrimType> T = Ctx.classify(FieldTy)) {
-            TYPE_SWITCH(*T, Value = FP.deref<T>().toAPValue());
+            TYPE_SWITCH(*T, Value = FP.deref<T>().toAPValue(ASTCtx));
           } else {
             Ok &= Composite(FieldTy, FP, Value);
           }
@@ -436,7 +488,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
         APValue &Slot = R.getArrayInitializedElt(I);
         const Pointer &EP = Ptr.atIndex(I);
         if (std::optional<PrimType> T = Ctx.classify(ElemTy)) {
-          TYPE_SWITCH(*T, Slot = EP.deref<T>().toAPValue());
+          TYPE_SWITCH(*T, Slot = EP.deref<T>().toAPValue(ASTCtx));
         } else {
           Ok &= Composite(ElemTy, EP.narrow(), Slot);
         }
@@ -475,7 +527,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
       Values.reserve(VT->getNumElements());
       for (unsigned I = 0; I != VT->getNumElements(); ++I) {
         TYPE_SWITCH(ElemT, {
-          Values.push_back(Ptr.atIndex(I).deref<T>().toAPValue());
+          Values.push_back(Ptr.atIndex(I).deref<T>().toAPValue(ASTCtx));
         });
       }
 
@@ -493,11 +545,11 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx,
 
   // We can return these as rvalues, but we can't deref() them.
   if (isZero() || isIntegralPointer())
-    return toAPValue();
+    return toAPValue(ASTCtx);
 
   // Just load primitive types.
   if (std::optional<PrimType> T = Ctx.classify(ResultType)) {
-    TYPE_SWITCH(*T, return this->deref<T>().toAPValue());
+    TYPE_SWITCH(*T, return this->deref<T>().toAPValue(ASTCtx));
   }
 
   // Return the composite type.
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index 972f55a553f6e..6b0c31358159c 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -14,6 +14,7 @@
 #define LLVM_CLANG_AST_INTERP_POINTER_H
 
 #include "Descriptor.h"
+#include "FunctionPointer.h"
 #include "InterpBlock.h"
 #include "clang/AST/ComparisonCategories.h"
 #include "clang/AST/Decl.h"
@@ -45,7 +46,7 @@ struct IntPointer {
   uint64_t Value;
 };
 
-enum class Storage { Block, Int };
+enum class Storage { Block, Int, Fn };
 
 /// A pointer to a memory block, live or dead.
 ///
@@ -96,6 +97,10 @@ class Pointer {
     PointeeStorage.Int.Value = Address;
     PointeeStorage.Int.Desc = Desc;
   }
+  Pointer(const Function *F, uint64_t Offset = 0)
+      : Offset(Offset), StorageKind(Storage::Fn) {
+    PointeeStorage.Fn = FunctionPointer(F);
+  }
   ~Pointer();
 
   void operator=(const Pointer &P);
@@ -118,7 +123,7 @@ class Pointer {
   bool operator!=(const Pointer &P) const { return !(P == *this); }
 
   /// Converts the pointer to an APValue.
-  APValue toAPValue() const;
+  APValue toAPValue(const ASTContext &ASTCtx) const;
 
   /// Converts the pointer to a string usable in diagnostics.
   std::string toDiagnosticString(const ASTContext &Ctx) const;
@@ -126,6 +131,8 @@ class Pointer {
   uint64_t getIntegerRepresentation() const {
     if (isIntegralPointer())
       return asIntPointer().Value + (Offset * elemSize());
+    if (isFunctionPointer())
+      return asFunctionPointer().getIntegerRepresentation();
     return reinterpret_cast<uint64_t>(asBlockPointer().Pointee) + Offset;
   }
 
@@ -137,6 +144,8 @@ class Pointer {
   [[nodiscard]] Pointer atIndex(uint64_t Idx) const {
     if (isIntegralPointer())
       return Pointer(asIntPointer().Value, asIntPointer().Desc, Idx);
+    if (isFunctionPointer())
+      return Pointer(asFunctionPointer().getFunction(), Idx);
 
     if (asBlockPointer().Base == RootPtrMark)
       return Pointer(asBlockPointer().Pointee, RootPtrMark,
@@ -247,18 +256,20 @@ class Pointer {
   bool isZero() const {
     if (isBlockPointer())
       return asBlockPointer().Pointee == nullptr;
+    if (isFunctionPointer())
+      return asFunctionPointer().isZero();
     assert(isIntegralPointer());
     return asIntPointer().Value == 0 && Offset == 0;
   }
   /// Checks if the pointer is live.
   bool isLive() const {
-    if (isIntegralPointer())
+    if (!isBlockPointer())
       return true;
     return asBlockPointer().Pointee && !asBlockPointer().Pointee->IsDead;
   }
   /// Checks if the item is a field in an object.
   bool isField() const {
-    if (isIntegralPointer())
+    if (!isBlockPointer())
       return false;
 
     return !isRoot() && getFieldDesc()->asDecl();
@@ -268,6 +279,8 @@ class Pointer {
   const Descriptor *getDeclDesc() const {
     if (isIntegralPointer())
       return asIntPointer().Desc;
+    if (isFunctionPointer())
+      return nullptr;
 
     assert(isBlockPointer());
     assert(asBlockPointer().Pointee);
@@ -279,7 +292,10 @@ class Pointer {
   DeclTy getSource() const {
     if (isBlockPointer())
       return getDeclDesc()->getSource();
-
+    if (isFunctionPointer()) {
+      const Function *F = asFunctionPointer().getFunction();
+      return F ? F->getDecl() : DeclTy();
+    }
     assert(isIntegralPointer());
     return asIntPointer().Desc ? asIntPointer().Desc->getSource() : DeclTy();
   }
@@ -354,6 +370,7 @@ class Pointer {
   /// Returns the offset into an array.
   unsigned getOffset() const {
     assert(Offset != PastEndMark && "invalid offset");
+    assert(isBlockPointer());
     if (asBlockPointer().Base == RootPtrMark)
       return Offset;
 
@@ -421,8 +438,14 @@ class Pointer {
     assert(isIntegralPointer());
     return PointeeStorage.Int;
   }
+  [[nodiscard]] const FunctionPointer &asFunctionPointer() const {
+    assert(isFunctionPointer());
+    return PointeeStorage.Fn;
+  }
+
   bool isBlockPointer() const { return StorageKind == Storage::Block; }
   bool isIntegralPointer() const { return StorageKind == Storage::Int; }
+  bool isFunctionPointer() const { return StorageKind == Storage::Fn; }
 
   /// Returns the record descriptor of a class.
   const Record *getRecord() const { return getFieldDesc()->ElemRecord; }
@@ -445,7 +468,7 @@ class Pointer {
   }
   /// Checks if the storage is static.
   bool isStatic() const {
-    if (isIntegralPointer())
+    if (!isBlockPointer())
       return true;
     assert(asBlockPointer().Pointee);
     return asBlockPointer().Pointee->isStatic();
@@ -469,7 +492,7 @@ class Pointer {
   }
 
   bool isWeak() const {
-    if (isIntegralPointer())
+    if (!isBlockPointer())
       return false;
 
     assert(isBlockPointer());
@@ -487,6 +510,9 @@ class Pointer {
   }
   /// Checks if a structure is a base class.
   bool isBaseClass() const { return isField() && getInlineDesc()->IsBase; }
+  bool isVirtualBaseClass() const {
+    return isField() && getInlineDesc()->IsVirtualBase;
+  }
   /// Checks if the pointer points to a dummy value.
   bool isDummy() const {
     if (!isBlockPointer())
@@ -525,8 +551,8 @@ class Pointer {
 
   /// Returns the number of elements.
   unsigned getNumElems() const {
-    if (isIntegralPointer())
-      return ~unsigned(0);
+    if (!isBlockPointer())
+      return ~0u;
     return getSize() / elemSize();
   }
 
@@ -552,7 +578,7 @@ class Pointer {
 
   /// Checks if the index is one past end.
   bool isOnePastEnd() const {
-    if (isIntegralPointer())
+    if (isIntegralPointer() || isFunctionPointer())
       return false;
 
     if (!asBlockPointer().Pointee)
@@ -689,6 +715,7 @@ class Pointer {
   union {
     BlockPointer BS;
     IntPointer Int;
+    FunctionPointer Fn;
   } PointeeStorage;
   Storage StorageKind = Storage::Int;
 };
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index a9204fa9a887f..eee446df846f5 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -5191,6 +5191,14 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity,
       Diags.Report(DiagID);
       return;
     }
+    case UETT_PtrAuthTypeDiscriminator: {
+      DiagnosticsEngine &Diags = Context.getDiags();
+      unsigned DiagID = Diags.getCustomDiagID(
+          DiagnosticsEngine::Error,
+          "cannot yet mangle __builtin_ptrauth_type_discriminator expression");
+      Diags.Report(E->getExprLoc(), DiagID);
+      return;
+    }
     case UETT_VecStep: {
       DiagnosticsEngine &Diags = Context.getDiags();
       unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
@@ -6497,7 +6505,7 @@ void CXXNameMangler::mangleValueInTemplateArg(QualType T, const APValue &V,
 
   case APValue::LValue: {
     // Proposed in https://github.com/itanium-cxx-abi/cxx-abi/issues/47.
-    assert((T->isPointerType() || T->isReferenceType()) &&
+    assert((T->isPointerOrReferenceType()) &&
            "unexpected type for LValue template arg");
 
     if (V.isNullPointer()) {
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 661b21ace5f36..c67b0fd617a38 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -337,6 +337,10 @@ class MicrosoftCXXNameMangler {
 
   const bool PointersAre64Bit;
 
+  DiagnosticBuilder Error(SourceLocation, StringRef, StringRef);
+  DiagnosticBuilder Error(SourceLocation, StringRef);
+  DiagnosticBuilder Error(StringRef);
+
 public:
   enum QualifierMangleMode { QMM_Drop, QMM_Mangle, QMM_Escape, QMM_Result };
   enum class TplArgKind { ClassNTTP, StructuralValue };
@@ -564,6 +568,31 @@ MicrosoftMangleContextImpl::shouldMangleStringLiteral(const StringLiteral *SL) {
   return true;
 }
 
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(SourceLocation loc,
+                                                 StringRef thing1,
+                                                 StringRef thing2) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 %1 yet");
+  return Diags.Report(loc, DiagID) << thing1 << thing2;
+}
+
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(SourceLocation loc,
+                                                 StringRef thingy) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 yet");
+  return Diags.Report(loc, DiagID) << thingy;
+}
+
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(StringRef thingy) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  // extra placeholders are ignored quietly when not used
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 yet");
+  return Diags.Report(DiagID) << thingy;
+}
+
 void MicrosoftCXXNameMangler::mangle(GlobalDecl GD, StringRef Prefix) {
   const NamedDecl *D = cast<NamedDecl>(GD.getDecl());
   // MSVC doesn't mangle C++ names the same way it mangles extern "C" names.
@@ -986,6 +1015,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
   case APFloat::S_Float8E5M2FNUZ:
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
+  case APFloat::S_Float8E3M4:
   case APFloat::S_FloatTF32:
   case APFloat::S_Float6E3M2FN:
   case APFloat::S_Float6E2M3FN:
@@ -1578,10 +1608,7 @@ void MicrosoftCXXNameMangler::mangleOperatorName(OverloadedOperatorKind OO,
   case OO_Spaceship: Out << "?__M"; break;
 
   case OO_Conditional: {
-    DiagnosticsEngine &Diags = Context.getDiags();
-    unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-      "cannot mangle this conditional operator yet");
-    Diags.Report(Loc, DiagID);
+    Error(Loc, "conditional operator");
     break;
   }
 
@@ -1673,11 +1700,8 @@ void MicrosoftCXXNameMangler::mangleExpression(
   }
 
   // As bad as this diagnostic is, it's better than crashing.
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot yet mangle expression type %0");
-  Diags.Report(E->getExprLoc(), DiagID) << E->getStmtClassName()
-                                        << E->getSourceRange();
+  Error(E->getExprLoc(), "expression type: ", E->getStmtClassName())
+      << E->getSourceRange();
 }
 
 void MicrosoftCXXNameMangler::mangleTemplateArgs(
@@ -1923,11 +1947,19 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
     if (WithScalarType)
       mangleType(T, SourceRange(), QMM_Escape);
 
-    // We don't know how to mangle past-the-end pointers yet.
-    if (V.isLValueOnePastTheEnd())
-      break;
-
     APValue::LValueBase Base = V.getLValueBase();
+
+    // this might not cover every case but did cover issue 97756
+    // see test CodeGen/ms_mangler_templatearg_opte
+    if (V.isLValueOnePastTheEnd()) {
+      Out << "5E";
+      auto *VD = Base.dyn_cast<const ValueDecl *>();
+      if (VD)
+        mangle(VD);
+      Out << "@";
+      return;
+    }
+
     if (!V.hasLValuePath() || V.getLValuePath().empty()) {
       // Taking the address of a complete object has a special-case mangling.
       if (Base.isNull()) {
@@ -1939,12 +1971,14 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
         mangleNumber(V.getLValueOffset().getQuantity());
       } else if (!V.hasLValuePath()) {
         // FIXME: This can only happen as an extension. Invent a mangling.
-        break;
+        Error("template argument (extension not comaptible with ms mangler)");
+        return;
       } else if (auto *VD = Base.dyn_cast<const ValueDecl*>()) {
         Out << "E";
         mangle(VD);
       } else {
-        break;
+        Error("template argument (undeclared base)");
+        return;
       }
     } else {
       if (TAK == TplArgKind::ClassNTTP && T->isPointerType())
@@ -1989,8 +2023,10 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
         Out << *I;
 
       auto *VD = Base.dyn_cast<const ValueDecl*>();
-      if (!VD)
-        break;
+      if (!VD) {
+        Error("template argument (null value decl)");
+        return;
+      }
       Out << (TAK == TplArgKind::ClassNTTP ? 'E' : '1');
       mangle(VD);
 
@@ -2105,15 +2141,16 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
     return;
   }
 
-  case APValue::AddrLabelDiff:
-  case APValue::FixedPoint:
-    break;
+  case APValue::AddrLabelDiff: {
+    Error("template argument (value type: address label diff)");
+    return;
   }
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this template argument yet");
-  Diags.Report(DiagID);
+  case APValue::FixedPoint: {
+    Error("template argument (value type: fixed point)");
+    return;
+  }
+  }
 }
 
 void MicrosoftCXXNameMangler::mangleObjCProtocol(const ObjCProtocolDecl *PD) {
@@ -2762,11 +2799,9 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
   case BuiltinType::SatULongFract:
   case BuiltinType::Ibm128:
   case BuiltinType::Float128: {
-    DiagnosticsEngine &Diags = Context.getDiags();
-    unsigned DiagID = Diags.getCustomDiagID(
-        DiagnosticsEngine::Error, "cannot mangle this built-in %0 type yet");
-    Diags.Report(Range.getBegin(), DiagID)
-        << T->getName(Context.getASTContext().getPrintingPolicy()) << Range;
+    Error(Range.getBegin(), "built-in type: ",
+          T->getName(Context.getASTContext().getPrintingPolicy()))
+        << Range;
     break;
   }
   }
@@ -3095,10 +3130,7 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC,
       return;
   }
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this calling convention yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "calling convention") << Range;
 }
 void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T,
                                                       SourceRange Range) {
@@ -3119,11 +3151,7 @@ void MicrosoftCXXNameMangler::mangleType(const UnresolvedUsingType *T,
                                          Qualifiers, SourceRange Range) {
   // Probably should be mangled as a template instantiation; need to see what
   // VC does first.
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this unresolved dependent type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "unresolved dependent type") << Range;
 }
 
 // <type>        ::= <union-type> | <struct-type> | <class-type> | <enum-type>
@@ -3230,11 +3258,8 @@ void MicrosoftCXXNameMangler::mangleArrayType(const ArrayType *T) {
       // The dependent expression has to be folded into a constant (TODO).
       const DependentSizedArrayType *DSAT =
         getASTContext().getAsDependentSizedArrayType(ElementTy);
-      DiagnosticsEngine &Diags = Context.getDiags();
-      unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-        "cannot mangle this dependent-length array yet");
-      Diags.Report(DSAT->getSizeExpr()->getExprLoc(), DiagID)
-        << DSAT->getBracketsRange();
+      Error(DSAT->getSizeExpr()->getExprLoc(), "dependent-length")
+          << DSAT->getBracketsRange();
       return;
     } else {
       break;
@@ -3274,20 +3299,12 @@ void MicrosoftCXXNameMangler::mangleType(const MemberPointerType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const TemplateTypeParmType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this template type parameter type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "template type parameter type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const SubstTemplateTypeParmPackType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this substituted parameter pack yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "substituted parameter pack") << Range;
 }
 
 // <type> ::= <pointer-type>
@@ -3438,46 +3455,27 @@ void MicrosoftCXXNameMangler::mangleType(const ExtVectorType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const DependentVectorType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "cannot mangle this dependent-sized vector type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent-sized vector type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentSizedExtVectorType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent-sized extended vector type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent-sized extended vector type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const ConstantMatrixType *T,
                                          Qualifiers quals, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-                                          "Cannot mangle this matrix type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "matrix type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentSizedMatrixType *T,
                                          Qualifiers quals, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "Cannot mangle this dependent-sized matrix type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent-sized matrix type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentAddressSpaceType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "cannot mangle this dependent address space type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent address space type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const ObjCInterfaceType *T, Qualifiers,
@@ -3547,39 +3545,23 @@ void MicrosoftCXXNameMangler::mangleType(const InjectedClassNameType *,
 
 void MicrosoftCXXNameMangler::mangleType(const TemplateSpecializationType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "template specialization type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentNameType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent name type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent name type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(
     const DependentTemplateSpecializationType *T, Qualifiers,
     SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent template specialization type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const PackExpansionType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this pack expansion yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "pack expansion") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const PackIndexingType *T,
@@ -3590,60 +3572,37 @@ void MicrosoftCXXNameMangler::mangleType(const PackIndexingType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const TypeOfType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this typeof(type) yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "typeof(type)") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const TypeOfExprType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this typeof(expression) yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "typeof(expression)") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DecltypeType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this decltype() yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "decltype()") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const UnaryTransformType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this unary transform type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "unary transform type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const AutoType *T, Qualifiers,
                                          SourceRange Range) {
   assert(T->getDeducedType().isNull() && "expecting a dependent type!");
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this 'auto' type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "'auto' type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(
     const DeducedTemplateSpecializationType *T, Qualifiers, SourceRange Range) {
   assert(T->getDeducedType().isNull() && "expecting a dependent type!");
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this deduced class template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "deduced class template specialization type")
+      << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const AtomicType *T, Qualifiers,
@@ -3717,10 +3676,7 @@ void MicrosoftCXXNameMangler::mangleType(const BitIntType *T, Qualifiers,
 
 void MicrosoftCXXNameMangler::mangleType(const DependentBitIntType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this DependentBitInt type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "DependentBitInt type") << Range;
 }
 
 // <this-adjustment> ::= <no-adjustment> | <static-adjustment> |
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index c8792941a6bb6..451a9fe9fe3d2 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -297,10 +297,11 @@ OMPParallelDirective *OMPParallelDirective::CreateEmpty(const ASTContext &C,
                                                     /*NumChildren=*/1);
 }
 
-OMPSimdDirective *OMPSimdDirective::Create(
-    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
-    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
-    const HelperExprs &Exprs, OpenMPDirectiveKind ParamPrevMappedDirective) {
+OMPSimdDirective *
+OMPSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+                         SourceLocation EndLoc, unsigned CollapsedNum,
+                         ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+                         const HelperExprs &Exprs) {
   auto *Dir = createDirective<OMPSimdDirective>(
       C, Clauses, AssociatedStmt, numLoopChildren(CollapsedNum, OMPD_simd),
       StartLoc, EndLoc, CollapsedNum);
@@ -320,7 +321,6 @@ OMPSimdDirective *OMPSimdDirective::Create(
   Dir->setDependentInits(Exprs.DependentInits);
   Dir->setFinalsConditions(Exprs.FinalsConditions);
   Dir->setPreInits(Exprs.PreInits);
-  Dir->setMappedDirective(ParamPrevMappedDirective);
   return Dir;
 }
 
@@ -336,8 +336,7 @@ OMPSimdDirective *OMPSimdDirective::CreateEmpty(const ASTContext &C,
 OMPForDirective *OMPForDirective::Create(
     const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
     unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
-    const HelperExprs &Exprs, Expr *TaskRedRef, bool HasCancel,
-    OpenMPDirectiveKind ParamPrevMappedDirective) {
+    const HelperExprs &Exprs, Expr *TaskRedRef, bool HasCancel) {
   auto *Dir = createDirective<OMPForDirective>(
       C, Clauses, AssociatedStmt, numLoopChildren(CollapsedNum, OMPD_for) + 1,
       StartLoc, EndLoc, CollapsedNum);
@@ -367,7 +366,6 @@ OMPForDirective *OMPForDirective::Create(
   Dir->setPreInits(Exprs.PreInits);
   Dir->setTaskReductionRefExpr(TaskRedRef);
   Dir->setHasCancel(HasCancel);
-  Dir->setMappedDirective(ParamPrevMappedDirective);
   return Dir;
 }
 
@@ -449,6 +447,44 @@ OMPUnrollDirective *OMPUnrollDirective::CreateEmpty(const ASTContext &C,
       SourceLocation(), SourceLocation());
 }
 
+OMPReverseDirective *
+OMPReverseDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+                            SourceLocation EndLoc, Stmt *AssociatedStmt,
+                            Stmt *TransformedStmt, Stmt *PreInits) {
+  OMPReverseDirective *Dir = createDirective<OMPReverseDirective>(
+      C, std::nullopt, AssociatedStmt, TransformedStmtOffset + 1, StartLoc,
+      EndLoc);
+  Dir->setTransformedStmt(TransformedStmt);
+  Dir->setPreInits(PreInits);
+  return Dir;
+}
+
+OMPReverseDirective *OMPReverseDirective::CreateEmpty(const ASTContext &C) {
+  return createEmptyDirective<OMPReverseDirective>(
+      C, /*NumClauses=*/0, /*HasAssociatedStmt=*/true,
+      TransformedStmtOffset + 1, SourceLocation(), SourceLocation());
+}
+
+OMPInterchangeDirective *OMPInterchangeDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    ArrayRef<OMPClause *> Clauses, unsigned NumLoops, Stmt *AssociatedStmt,
+    Stmt *TransformedStmt, Stmt *PreInits) {
+  OMPInterchangeDirective *Dir = createDirective<OMPInterchangeDirective>(
+      C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc,
+      NumLoops);
+  Dir->setTransformedStmt(TransformedStmt);
+  Dir->setPreInits(PreInits);
+  return Dir;
+}
+
+OMPInterchangeDirective *
+OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
+                                     unsigned NumLoops) {
+  return createEmptyDirective<OMPInterchangeDirective>(
+      C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1,
+      SourceLocation(), SourceLocation(), NumLoops);
+}
+
 OMPForSimdDirective *
 OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc,
                             SourceLocation EndLoc, unsigned CollapsedNum,
@@ -1531,10 +1567,11 @@ OMPParallelMaskedTaskLoopSimdDirective::CreateEmpty(const ASTContext &C,
       CollapsedNum);
 }
 
-OMPDistributeDirective *OMPDistributeDirective::Create(
-    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
-    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
-    const HelperExprs &Exprs, OpenMPDirectiveKind ParamPrevMappedDirective) {
+OMPDistributeDirective *
+OMPDistributeDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+                               SourceLocation EndLoc, unsigned CollapsedNum,
+                               ArrayRef<OMPClause *> Clauses,
+                               Stmt *AssociatedStmt, const HelperExprs &Exprs) {
   auto *Dir = createDirective<OMPDistributeDirective>(
       C, Clauses, AssociatedStmt,
       numLoopChildren(CollapsedNum, OMPD_distribute), StartLoc, EndLoc,
@@ -1563,7 +1600,6 @@ OMPDistributeDirective *OMPDistributeDirective::Create(
   Dir->setDependentInits(Exprs.DependentInits);
   Dir->setFinalsConditions(Exprs.FinalsConditions);
   Dir->setPreInits(Exprs.PreInits);
-  Dir->setMappedDirective(ParamPrevMappedDirective);
   return Dir;
 }
 
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 1b6758a432ad6..9e472a240c98a 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -763,6 +763,16 @@ void StmtPrinter::VisitOMPUnrollDirective(OMPUnrollDirective *Node) {
   PrintOMPExecutableDirective(Node);
 }
 
+void StmtPrinter::VisitOMPReverseDirective(OMPReverseDirective *Node) {
+  Indent() << "#pragma omp reverse";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) {
+  Indent() << "#pragma omp interchange";
+  PrintOMPExecutableDirective(Node);
+}
+
 void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) {
   Indent() << "#pragma omp for";
   PrintOMPExecutableDirective(Node);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index dcfb3093470c2..cf9988fcbe75c 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -624,7 +624,7 @@ void OMPClauseProfiler::VisitOMPFilterClause(const OMPFilterClause *C) {
 
 template<typename T>
 void OMPClauseProfiler::VisitOMPClauseList(T *Node) {
-  for (auto *E : Node->varlists()) {
+  for (auto *E : Node->varlist()) {
     if (E)
       Profiler->VisitStmt(E);
   }
@@ -918,7 +918,7 @@ void OMPClauseProfiler::VisitOMPUsesAllocatorsClause(
 void OMPClauseProfiler::VisitOMPAffinityClause(const OMPAffinityClause *C) {
   if (const Expr *Modifier = C->getModifier())
     Profiler->VisitStmt(Modifier);
-  for (const Expr *E : C->varlists())
+  for (const Expr *E : C->varlist())
     Profiler->VisitStmt(E);
 }
 void OMPClauseProfiler::VisitOMPOrderClause(const OMPOrderClause *C) {}
@@ -985,6 +985,15 @@ void StmtProfiler::VisitOMPUnrollDirective(const OMPUnrollDirective *S) {
   VisitOMPLoopTransformationDirective(S);
 }
 
+void StmtProfiler::VisitOMPReverseDirective(const OMPReverseDirective *S) {
+  VisitOMPLoopTransformationDirective(S);
+}
+
+void StmtProfiler::VisitOMPInterchangeDirective(
+    const OMPInterchangeDirective *S) {
+  VisitOMPLoopTransformationDirective(S);
+}
+
 void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) {
   VisitOMPLoopDirective(S);
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 8de38dc97e7cc..374f1ab1a1d98 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -75,7 +75,7 @@ bool Qualifiers::isStrictSupersetOf(Qualifiers Other) const {
 const IdentifierInfo* QualType::getBaseTypeIdentifier() const {
   const Type* ty = getTypePtr();
   NamedDecl *ND = nullptr;
-  if (ty->isPointerType() || ty->isReferenceType())
+  if (ty->isPointerOrReferenceType())
     return ty->getPointeeType().getBaseTypeIdentifier();
   else if (ty->isRecordType())
     ND = ty->castAs<RecordType>()->getDecl();
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index bf87b1aa0992a..06309d327896b 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -537,14 +537,23 @@ class PatternSet {
   /// that didn't match.
   /// Return true if there are still any patterns left.
   bool consumeNameSuffix(StringRef NodeName, bool CanSkip) {
-    for (size_t I = 0; I < Patterns.size();) {
-      if (::clang::ast_matchers::internal::consumeNameSuffix(Patterns[I].P,
-                                                             NodeName) ||
-          CanSkip) {
-        ++I;
-      } else {
-        Patterns.erase(Patterns.begin() + I);
+    if (CanSkip) {
+      // If we can skip the node, then we need to handle the case where a
+      // skipped node has the same name as its parent.
+      // namespace a { inline namespace a { class A; } }
+      // cxxRecordDecl(hasName("::a::A"))
+      // To do this, any patterns that match should be duplicated in our set,
+      // one of them with the tail removed.
+      for (size_t I = 0, E = Patterns.size(); I != E; ++I) {
+        StringRef Pattern = Patterns[I].P;
+        if (ast_matchers::internal::consumeNameSuffix(Patterns[I].P, NodeName))
+          Patterns.push_back({Pattern, Patterns[I].IsFullyQualified});
       }
+    } else {
+      llvm::erase_if(Patterns, [&NodeName](auto &Pattern) {
+        return !::clang::ast_matchers::internal::consumeNameSuffix(Pattern.P,
+                                                                   NodeName);
+      });
     }
     return !Patterns.empty();
   }
diff --git a/clang/lib/Analysis/Consumed.cpp b/clang/lib/Analysis/Consumed.cpp
index d01c7f688e8b5..63c5943242944 100644
--- a/clang/lib/Analysis/Consumed.cpp
+++ b/clang/lib/Analysis/Consumed.cpp
@@ -141,7 +141,7 @@ static bool isCallableInState(const CallableWhenAttr *CWAttr,
 }
 
 static bool isConsumableType(const QualType &QT) {
-  if (QT->isPointerType() || QT->isReferenceType())
+  if (QT->isPointerOrReferenceType())
     return false;
 
   if (const CXXRecordDecl *RD = QT->getAsCXXRecordDecl())
@@ -151,7 +151,7 @@ static bool isConsumableType(const QualType &QT) {
 }
 
 static bool isAutoCastType(const QualType &QT) {
-  if (QT->isPointerType() || QT->isReferenceType())
+  if (QT->isPointerOrReferenceType())
     return false;
 
   if (const CXXRecordDecl *RD = QT->getAsCXXRecordDecl())
@@ -186,10 +186,6 @@ static bool isTestingFunction(const FunctionDecl *FunDecl) {
   return FunDecl->hasAttr<TestTypestateAttr>();
 }
 
-static bool isPointerOrRef(QualType ParamType) {
-  return ParamType->isPointerType() || ParamType->isReferenceType();
-}
-
 static ConsumedState mapConsumableAttrState(const QualType QT) {
   assert(isConsumableType(QT));
 
@@ -648,7 +644,7 @@ bool ConsumedStmtVisitor::handleCall(const CallExpr *Call, const Expr *ObjArg,
       setStateForVarOrTmp(StateMap, PInfo, mapReturnTypestateAttrState(RT));
     else if (isRValueRef(ParamType) || isConsumableType(ParamType))
       setStateForVarOrTmp(StateMap, PInfo, consumed::CS_Consumed);
-    else if (isPointerOrRef(ParamType) &&
+    else if (ParamType->isPointerOrReferenceType() &&
              (!ParamType->getPointeeType().isConstQualified() ||
               isSetOnReadPtrType(ParamType)))
       setStateForVarOrTmp(StateMap, PInfo, consumed::CS_Unknown);
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
index 3b3782fa1db9a..6d726ae44104e 100644
--- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
+++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
@@ -404,25 +404,24 @@ ExprMutationAnalyzer::Analyzer::findDirectMutation(const Expr *Exp) {
             memberExpr(hasObjectExpression(canResolveToExpr(Exp)))),
       nonConstReferenceType());
   const auto NotInstantiated = unless(hasDeclaration(isInstantiated()));
-  const auto TypeDependentCallee =
-      callee(expr(anyOf(unresolvedLookupExpr(), unresolvedMemberExpr(),
-                        cxxDependentScopeMemberExpr(),
-                        hasType(templateTypeParmType()), isTypeDependent())));
-
-  const auto AsNonConstRefArg = anyOf(
-      callExpr(NonConstRefParam, NotInstantiated),
-      cxxConstructExpr(NonConstRefParam, NotInstantiated),
-      callExpr(TypeDependentCallee, hasAnyArgument(canResolveToExpr(Exp))),
-      cxxUnresolvedConstructExpr(hasAnyArgument(canResolveToExpr(Exp))),
-      // Previous False Positive in the following Code:
-      // `template <typename T> void f() { int i = 42; new Type<T>(i); }`
-      // Where the constructor of `Type` takes its argument as reference.
-      // The AST does not resolve in a `cxxConstructExpr` because it is
-      // type-dependent.
-      parenListExpr(hasDescendant(expr(canResolveToExpr(Exp)))),
-      // If the initializer is for a reference type, there is no cast for
-      // the variable. Values are cast to RValue first.
-      initListExpr(hasAnyInit(expr(canResolveToExpr(Exp)))));
+
+  const auto AsNonConstRefArg =
+      anyOf(callExpr(NonConstRefParam, NotInstantiated),
+            cxxConstructExpr(NonConstRefParam, NotInstantiated),
+            // If the call is type-dependent, we can't properly process any
+            // argument because required type conversions and implicit casts
+            // will be inserted only after specialization.
+            callExpr(isTypeDependent(), hasAnyArgument(canResolveToExpr(Exp))),
+            cxxUnresolvedConstructExpr(hasAnyArgument(canResolveToExpr(Exp))),
+            // Previous False Positive in the following Code:
+            // `template <typename T> void f() { int i = 42; new Type<T>(i); }`
+            // Where the constructor of `Type` takes its argument as reference.
+            // The AST does not resolve in a `cxxConstructExpr` because it is
+            // type-dependent.
+            parenListExpr(hasDescendant(expr(canResolveToExpr(Exp)))),
+            // If the initializer is for a reference type, there is no cast for
+            // the variable. Values are cast to RValue first.
+            initListExpr(hasAnyInit(expr(canResolveToExpr(Exp)))));
 
   // Captured by a lambda by reference.
   // If we're initializing a capture with 'Exp' directly then we're initializing
diff --git a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
index 255543021a998..876b5a3db5249 100644
--- a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
+++ b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
@@ -16,6 +16,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/Stmt.h"
 #include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Error.h"
@@ -96,8 +97,7 @@ static llvm::BitVector findReachableBlocks(const CFG &Cfg) {
 
 static llvm::DenseSet<const CFGBlock *>
 buildContainsExprConsumedInDifferentBlock(
-    const CFG &Cfg,
-    const llvm::DenseMap<const Stmt *, const CFGBlock *> &StmtToBlock) {
+    const CFG &Cfg, const internal::StmtToBlockMap &StmtToBlock) {
   llvm::DenseSet<const CFGBlock *> Result;
 
   auto CheckChildExprs = [&Result, &StmtToBlock](const Stmt *S,
@@ -105,7 +105,7 @@ buildContainsExprConsumedInDifferentBlock(
     for (const Stmt *Child : S->children()) {
       if (!isa_and_nonnull<Expr>(Child))
         continue;
-      const CFGBlock *ChildBlock = StmtToBlock.lookup(Child);
+      const CFGBlock *ChildBlock = StmtToBlock.lookup(*Child);
       if (ChildBlock != Block)
         Result.insert(ChildBlock);
     }
@@ -126,6 +126,13 @@ buildContainsExprConsumedInDifferentBlock(
   return Result;
 }
 
+namespace internal {
+
+StmtToBlockMap::StmtToBlockMap(const CFG &Cfg)
+    : StmtToBlock(buildStmtToBasicBlockMap(Cfg)) {}
+
+} // namespace internal
+
 llvm::Expected<AdornedCFG> AdornedCFG::build(const FunctionDecl &Func) {
   if (!Func.doesThisDeclarationHaveABody())
     return llvm::createStringError(
@@ -166,8 +173,7 @@ llvm::Expected<AdornedCFG> AdornedCFG::build(const Decl &D, Stmt &S,
         std::make_error_code(std::errc::invalid_argument),
         "CFG::buildCFG failed");
 
-  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock =
-      buildStmtToBasicBlockMap(*Cfg);
+  internal::StmtToBlockMap StmtToBlock(*Cfg);
 
   llvm::BitVector BlockReachable = findReachableBlocks(*Cfg);
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index f734168e647bd..e1f68e493f355 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -416,7 +416,7 @@ class ResultObjectVisitor : public AnalysisASTVisitor<ResultObjectVisitor> {
     // below them can initialize the same object (or part of it).
     if (isa<CXXConstructExpr>(E) || isa<CallExpr>(E) || isa<LambdaExpr>(E) ||
         isa<CXXDefaultArgExpr>(E) || isa<CXXStdInitializerListExpr>(E) ||
-        isa<AtomicExpr>(E) ||
+        isa<AtomicExpr>(E) || isa<CXXInheritedCtorInitExpr>(E) ||
         // We treat `BuiltinBitCastExpr` as an "original initializer" too as
         // it may not even be casting from a record type -- and even if it is,
         // the two objects are in general of unrelated type.
@@ -524,12 +524,21 @@ void Environment::initialize() {
           assert(VarDecl != nullptr);
           setStorageLocation(*VarDecl, createObject(*VarDecl, nullptr));
         } else if (Capture.capturesThis()) {
-          const auto *SurroundingMethodDecl =
-              cast<CXXMethodDecl>(InitialTargetFunc->getNonClosureAncestor());
-          QualType ThisPointeeType =
-              SurroundingMethodDecl->getFunctionObjectParameterType();
-          setThisPointeeStorageLocation(
-              cast<RecordStorageLocation>(createObject(ThisPointeeType)));
+          if (auto *Ancestor = InitialTargetFunc->getNonClosureAncestor()) {
+            const auto *SurroundingMethodDecl = cast<CXXMethodDecl>(Ancestor);
+            QualType ThisPointeeType =
+                SurroundingMethodDecl->getFunctionObjectParameterType();
+            setThisPointeeStorageLocation(
+                cast<RecordStorageLocation>(createObject(ThisPointeeType)));
+          } else if (auto *FieldBeingInitialized =
+                         dyn_cast<FieldDecl>(Parent->getLambdaContextDecl())) {
+            // This is in a field initializer, rather than a method.
+            setThisPointeeStorageLocation(
+                cast<RecordStorageLocation>(createObject(QualType(
+                    FieldBeingInitialized->getParent()->getTypeForDecl(), 0))));
+          } else {
+            assert(false && "Unexpected this-capturing lambda context.");
+          }
         }
       }
     } else if (MethodDecl->isImplicitObjectMemberFunction()) {
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 3c896d373a211..9c54eb16d2224 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -40,17 +40,16 @@ namespace clang {
 namespace dataflow {
 
 const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
-  auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
-  if (BlockIt == ACFG.getStmtToBlock().end()) {
+  const CFGBlock *Block = ACFG.blockForStmt(S);
+  if (Block == nullptr) {
     assert(false);
-    // Return null to avoid dereferencing the end iterator in non-assert builds.
     return nullptr;
   }
-  if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
+  if (!ACFG.isBlockReachable(*Block))
     return nullptr;
-  if (BlockIt->getSecond()->getBlockID() == CurBlockID)
+  if (Block->getBlockID() == CurBlockID)
     return &CurState.Env;
-  const auto &State = BlockToState[BlockIt->getSecond()->getBlockID()];
+  const auto &State = BlockToState[Block->getBlockID()];
   if (!(State))
     return nullptr;
   return &State->Env;
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 200682faafd6a..8afd18b315d28 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -243,10 +243,11 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     // See `NoreturnDestructorTest` for concrete examples.
     if (Block.succ_begin()->getReachableBlock() != nullptr &&
         Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) {
-      auto &StmtToBlock = AC.ACFG.getStmtToBlock();
-      auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt());
-      assert(StmtBlock != StmtToBlock.end());
-      llvm::erase(Preds, StmtBlock->getSecond());
+      const CFGBlock *StmtBlock = nullptr;
+      if (const Stmt *Terminator = Block.getTerminatorStmt())
+        StmtBlock = AC.ACFG.blockForStmt(*Terminator);
+      assert(StmtBlock != nullptr);
+      llvm::erase(Preds, StmtBlock);
     }
   }
 
diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp
index 6d03dd05ca3d2..481932ee59c8e 100644
--- a/clang/lib/Analysis/LiveVariables.cpp
+++ b/clang/lib/Analysis/LiveVariables.cpp
@@ -214,6 +214,22 @@ static void AddLiveExpr(llvm::ImmutableSet<const Expr *> &Set,
   Set = F.add(Set, LookThroughExpr(E));
 }
 
+/// Add as a live expression all individual conditions in a logical expression.
+/// For example, for the expression:
+/// "(a < b) || (c && d && ((e || f) != (g && h)))"
+/// the following expressions will be added as live:
+/// "a < b", "c", "d", "((e || f) != (g && h))"
+static void AddAllConditionalTerms(llvm::ImmutableSet<const Expr *> &Set,
+                                   llvm::ImmutableSet<const Expr *>::Factory &F,
+                                   const Expr *Cond) {
+  AddLiveExpr(Set, F, Cond);
+  if (auto const *BO = dyn_cast<BinaryOperator>(Cond->IgnoreParens());
+      BO && BO->isLogicalOp()) {
+    AddAllConditionalTerms(Set, F, BO->getLHS());
+    AddAllConditionalTerms(Set, F, BO->getRHS());
+  }
+}
+
 void TransferFunctions::Visit(Stmt *S) {
   if (observer)
     observer->observeStmt(S, currentBlock, val);
@@ -313,7 +329,27 @@ void TransferFunctions::Visit(Stmt *S) {
       AddLiveExpr(val.liveExprs, LV.ESetFact, cast<ForStmt>(S)->getCond());
       return;
     }
-
+    case Stmt::ConditionalOperatorClass: {
+      // Keep not only direct children alive, but also all the short-circuited
+      // parts of the condition. Short-circuiting evaluation may cause the
+      // conditional operator evaluation to skip the evaluation of the entire
+      // condtion expression, so the value of the entire condition expression is
+      // never computed.
+      //
+      // This makes a difference when we compare exploded nodes coming from true
+      // and false expressions with no side effects: the only difference in the
+      // state is the value of (part of) the condition.
+      //
+      // BinaryConditionalOperatorClass ('x ?: y') is not affected because it
+      // explicitly calculates the value of the entire condition expression (to
+      // possibly use as a value for the "true expr") even if it is
+      // short-circuited.
+      auto const *CO = cast<ConditionalOperator>(S);
+      AddAllConditionalTerms(val.liveExprs, LV.ESetFact, CO->getCond());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getTrueExpr());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getFalseExpr());
+      return;
+    }
   }
 
   // HACK + FIXME: What is this? One could only guess that this is an attempt to
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index 3e8c959ccee4f..cbcfefdc52549 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -97,7 +97,7 @@ static StringRef ClassifyDiagnostic(QualType VDT) {
     if (const auto *TD = TT->getDecl())
       if (const auto *CA = TD->getAttr<CapabilityAttr>())
         return ClassifyDiagnostic(CA);
-  } else if (VDT->isPointerType() || VDT->isReferenceType())
+  } else if (VDT->isPointerOrReferenceType())
     return ClassifyDiagnostic(VDT->getPointeeType());
 
   return "mutex";
diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp
index 9577277d85f81..9ce6efe6fce60 100644
--- a/clang/lib/Basic/Attributes.cpp
+++ b/clang/lib/Basic/Attributes.cpp
@@ -157,6 +157,40 @@ std::string AttributeCommonInfo::getNormalizedFullName() const {
       normalizeName(getAttrName(), getScopeName(), getSyntax()));
 }
 
+static StringRef getSyntaxName(AttributeCommonInfo::Syntax SyntaxUsed) {
+  switch (SyntaxUsed) {
+  case AttributeCommonInfo::AS_GNU:
+    return "GNU";
+  case AttributeCommonInfo::AS_CXX11:
+    return "CXX11";
+  case AttributeCommonInfo::AS_C23:
+    return "C23";
+  case AttributeCommonInfo::AS_Declspec:
+    return "Declspec";
+  case AttributeCommonInfo::AS_Microsoft:
+    return "Microsoft";
+  case AttributeCommonInfo::AS_Keyword:
+    return "Keyword";
+  case AttributeCommonInfo::AS_Pragma:
+    return "Pragma";
+  case AttributeCommonInfo::AS_ContextSensitiveKeyword:
+    return "ContextSensitiveKeyword";
+  case AttributeCommonInfo::AS_HLSLAnnotation:
+    return "HLSLAnnotation";
+  case AttributeCommonInfo::AS_Implicit:
+    return "Implicit";
+  }
+  llvm_unreachable("Invalid attribute syntax");
+}
+
+std::string AttributeCommonInfo::normalizeFullNameWithSyntax(
+    const IdentifierInfo *Name, const IdentifierInfo *ScopeName,
+    Syntax SyntaxUsed) {
+  return (Twine(getSyntaxName(SyntaxUsed)) +
+          "::" + normalizeName(Name, ScopeName, SyntaxUsed))
+      .str();
+}
+
 unsigned AttributeCommonInfo::calculateAttributeSpellingListIndex() const {
   // Both variables will be used in tablegen generated
   // attribute spell list index matching code.
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 7116e27cd9546..25a601573698e 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -90,6 +90,9 @@ static bool builtinIsSupported(const Builtin::Info &BuiltinInfo,
   /* MSMode Unsupported */
   if (!LangOpts.MicrosoftExt && (BuiltinInfo.Langs & MS_LANG))
     return false;
+  /* HLSLMode Unsupported */
+  if (!LangOpts.HLSL && (BuiltinInfo.Langs & HLSL_LANG))
+    return false;
   /* ObjC Unsupported */
   if (!LangOpts.ObjC && BuiltinInfo.Langs == OBJC_LANG)
     return false;
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index e5adc034f60c1..9331a63d91b17 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -34,8 +34,8 @@ void LangOptions::resetNonModularOptions() {
   // invocations that cannot be round-tripped to arguments.
   // FIXME: we should derive this automatically from ImpliedBy in tablegen.
   AllowFPReassoc = UnsafeFPMath;
-  NoHonorNaNs = FiniteMathOnly;
-  NoHonorInfs = FiniteMathOnly;
+  NoHonorInfs = FastMath;
+  NoHonorNaNs = FastMath;
 
   // These options do not affect AST generation.
   NoSanitizeFiles.clear();
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 766d6a8418a6a..b141e48e77e3c 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -684,7 +684,8 @@ bool clang::isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind) {
 }
 
 bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) {
-  return DKind == OMPD_tile || DKind == OMPD_unroll;
+  return DKind == OMPD_tile || DKind == OMPD_unroll || DKind == OMPD_reverse ||
+         DKind == OMPD_interchange;
 }
 
 bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) {
diff --git a/clang/lib/Basic/Sarif.cpp b/clang/lib/Basic/Sarif.cpp
index 8c144df341673..980b0ae40ae22 100644
--- a/clang/lib/Basic/Sarif.cpp
+++ b/clang/lib/Basic/Sarif.cpp
@@ -141,7 +141,7 @@ static unsigned int adjustColumnPos(FullSourceLoc Loc,
 /// @{
 
 /// \internal
-json::Object createMessage(StringRef Text) {
+static json::Object createMessage(StringRef Text) {
   return json::Object{{"text", Text.str()}};
 }
 
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 2692ddec26ff4..6ba31cc05a0d7 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -204,7 +204,8 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
 StringRef AArch64TargetInfo::getABI() const { return ABI; }
 
 bool AArch64TargetInfo::setABI(const std::string &Name) {
-  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs")
+  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs" &&
+      Name != "pauthtest")
     return false;
 
   ABI = Name;
@@ -218,6 +219,12 @@ bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
     Diags.Report(diag::err_target_unsupported_abi_with_fpu) << ABI;
     return false;
   }
+  if (getTriple().getEnvironment() == llvm::Triple::PAuthTest &&
+      getTriple().getOS() != llvm::Triple::Linux) {
+    Diags.Report(diag::err_target_unsupported_abi_for_triple)
+        << getTriple().getEnvironmentName() << getTriple().getTriple();
+    return false;
+  }
   return true;
 }
 
@@ -449,6 +456,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSVE2)
     Builder.defineMacro("__ARM_FEATURE_SVE2", "1");
 
+  if (HasSVE2p1)
+    Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
+
   if (HasSVE2 && HasSVE2AES)
     Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
 
@@ -467,8 +477,15 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   }
 
   if (HasSME2) {
-    Builder.defineMacro("__ARM_FEATURE_SME");
-    Builder.defineMacro("__ARM_FEATURE_SME2");
+    Builder.defineMacro("__ARM_FEATURE_SME", "1");
+    Builder.defineMacro("__ARM_FEATURE_SME2", "1");
+    Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1");
+  }
+
+  if (HasSME2p1) {
+    Builder.defineMacro("__ARM_FEATURE_SME", "1");
+    Builder.defineMacro("__ARM_FEATURE_SME2", "1");
+    Builder.defineMacro("__ARM_FEATURE_SME2p1", "1");
     Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1");
   }
 
@@ -739,8 +756,10 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm)
       .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
       .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
+      .Case("sve2p1", FPU & SveMode && HasSVE2p1)
       .Case("sme", HasSME)
       .Case("sme2", HasSME2)
+      .Case("sme2p1", HasSME2p1)
       .Case("sme-f64f64", HasSMEF64F64)
       .Case("sme-i16i64", HasSMEI16I64)
       .Case("sme-fa64", HasSMEFA64)
@@ -816,6 +835,13 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasFullFP16 = true;
       HasSVE2 = true;
     }
+    if (Feature == "+sve2p1") {
+      FPU |= NeonMode;
+      FPU |= SveMode;
+      HasFullFP16 = true;
+      HasSVE2 = true;
+      HasSVE2p1 = true;
+    }
     if (Feature == "+sve2-aes") {
       FPU |= NeonMode;
       FPU |= SveMode;
@@ -867,6 +893,13 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasBFloat16 = true;
       HasFullFP16 = true;
     }
+    if (Feature == "+sme2p1") {
+      HasSME = true;
+      HasSME2 = true;
+      HasSME2p1 = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
     if (Feature == "+sme-f64f64") {
       HasSME = true;
       HasSMEF64F64 = true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 71510fe289510..7bdf5a2b4106e 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -49,6 +49,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasMatMul = false;
   bool HasBFloat16 = false;
   bool HasSVE2 = false;
+  bool HasSVE2p1 = false;
   bool HasSVE2AES = false;
   bool HasSVE2SHA3 = false;
   bool HasSVE2SM4 = false;
@@ -70,6 +71,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasSME2 = false;
   bool HasSMEF64F64 = false;
   bool HasSMEI16I64 = false;
+  bool HasSME2p1 = false;
   bool HasSB = false;
   bool HasPredRes = false;
   bool HasSSBS = false;
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 75f71a337b7a4..cb3fd12c48ddb 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -200,7 +200,24 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   // Define __loongarch_arch.
   StringRef ArchName = getCPU();
-  Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  if (ArchName == "loongarch64") {
+    if (HasFeatureLSX) {
+      // TODO: As more features of the V1.1 ISA are supported, a unified "v1.1"
+      // arch feature set will be used to include all sub-features belonging to
+      // the V1.1 ISA version.
+      if (HasFeatureFrecipe)
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.1" + Twine('"'));
+      else
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.0" + Twine('"'));
+    } else {
+      Builder.defineMacro("__loongarch_arch",
+                          Twine('"') + ArchName + Twine('"'));
+    }
+  } else {
+    Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  }
 
   // Define __loongarch_tune.
   StringRef TuneCPU = getTargetOpts().TuneCPU;
@@ -216,6 +233,8 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__loongarch_simd_width", "128");
     Builder.defineMacro("__loongarch_sx", Twine(1));
   }
+  if (HasFeatureFrecipe)
+    Builder.defineMacro("__loongarch_frecipe", Twine(1));
 
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
@@ -291,6 +310,8 @@ bool LoongArchTargetInfo::handleTargetFeatures(
       HasFeatureLASX = true;
     else if (Feature == "-ual")
       HasUnalignedAccess = false;
+    else if (Feature == "+frecipe")
+      HasFeatureFrecipe = true;
   }
   return true;
 }
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 5fc223483951e..c668ca7eca047 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -29,6 +29,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
   bool HasFeatureF;
   bool HasFeatureLSX;
   bool HasFeatureLASX;
+  bool HasFeatureFrecipe;
 
 public:
   LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
@@ -37,6 +38,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
     HasFeatureF = false;
     HasFeatureLSX = false;
     HasFeatureLASX = false;
+    HasFeatureFrecipe = false;
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index 899aefa6173ac..b56e2c7ca9c49 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -173,10 +173,10 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
   // "Under /fp:precise and /fp:strict, the compiler doesn't do any mathematical
   // transformation unless the transformation is guaranteed to produce a bitwise
   // identical result."
-  const bool any_imprecise_flags =
-      Opts.FastMath || Opts.FiniteMathOnly || Opts.UnsafeFPMath ||
-      Opts.AllowFPReassoc || Opts.NoHonorNaNs || Opts.NoHonorInfs ||
-      Opts.NoSignedZero || Opts.AllowRecip || Opts.ApproxFunc;
+  const bool any_imprecise_flags = Opts.FastMath || Opts.UnsafeFPMath ||
+                                   Opts.AllowFPReassoc || Opts.NoHonorNaNs ||
+                                   Opts.NoHonorInfs || Opts.NoSignedZero ||
+                                   Opts.AllowRecip || Opts.ApproxFunc;
 
   // "Under both /fp:precise and /fp:fast, the compiler generates code intended
   // to run in the default floating-point environment."
diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 4ba4a49311d36..d8203f76a5468 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -14,6 +14,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 
 using namespace clang;
 using namespace clang::targets;
@@ -385,6 +386,8 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("_ARCH_PWR9");
   if (ArchDefs & ArchDefinePwr10)
     Builder.defineMacro("_ARCH_PWR10");
+  if (ArchDefs & ArchDefinePwr11)
+    Builder.defineMacro("_ARCH_PWR11");
   if (ArchDefs & ArchDefineA2)
     Builder.defineMacro("_ARCH_A2");
   if (ArchDefs & ArchDefineE500)
@@ -622,10 +625,17 @@ bool PPCTargetInfo::initFeatureMap(
     addP10SpecificFeatures(Features);
   }
 
-  // Future CPU should include all of the features of Power 10 as well as any
+  // Power11 includes all the same features as Power10 plus any features
+  // specific to the Power11 core.
+  if (CPU == "pwr11" || CPU == "power11") {
+    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
+    addP11SpecificFeatures(Features);
+  }
+
+  // Future CPU should include all of the features of Power 11 as well as any
   // additional features (yet to be determined) specific to it.
   if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
+    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
     addFutureSpecificFeatures(Features);
   }
 
@@ -696,6 +706,10 @@ void PPCTargetInfo::addP10SpecificFeatures(
   Features["isa-v31-instructions"] = true;
 }
 
+// Add any Power11 specific features.
+void PPCTargetInfo::addP11SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {}
+
 // Add features specific to the "Future" CPU.
 void PPCTargetInfo::addFutureSpecificFeatures(
     llvm::StringMap<bool> &Features) const {}
@@ -869,25 +883,12 @@ ArrayRef<TargetInfo::AddlRegName> PPCTargetInfo::getGCCAddlRegNames() const {
   return llvm::ArrayRef(GCCAddlRegNames);
 }
 
-static constexpr llvm::StringLiteral ValidCPUNames[] = {
-    {"generic"},     {"440"},     {"450"},    {"601"},       {"602"},
-    {"603"},         {"603e"},    {"603ev"},  {"604"},       {"604e"},
-    {"620"},         {"630"},     {"g3"},     {"7400"},      {"g4"},
-    {"7450"},        {"g4+"},     {"750"},    {"8548"},      {"970"},
-    {"g5"},          {"a2"},      {"e500"},   {"e500mc"},    {"e5500"},
-    {"power3"},      {"pwr3"},    {"power4"}, {"pwr4"},      {"power5"},
-    {"pwr5"},        {"power5x"}, {"pwr5x"},  {"power6"},    {"pwr6"},
-    {"power6x"},     {"pwr6x"},   {"power7"}, {"pwr7"},      {"power8"},
-    {"pwr8"},        {"power9"},  {"pwr9"},   {"power10"},   {"pwr10"},
-    {"powerpc"},     {"ppc"},     {"ppc32"},  {"powerpc64"}, {"ppc64"},
-    {"powerpc64le"}, {"ppc64le"}, {"future"}};
-
 bool PPCTargetInfo::isValidCPUName(StringRef Name) const {
-  return llvm::is_contained(ValidCPUNames, Name);
+  return llvm::PPC::isValidCPU(Name);
 }
 
 void PPCTargetInfo::fillValidCPUList(SmallVectorImpl<StringRef> &Values) const {
-  Values.append(std::begin(ValidCPUNames), std::end(ValidCPUNames));
+  llvm::PPC::fillValidCPUList(Values);
 }
 
 void PPCTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index b15ab6fbcf492..6d5d8dd54d013 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -44,8 +44,9 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     ArchDefinePwr8 = 1 << 12,
     ArchDefinePwr9 = 1 << 13,
     ArchDefinePwr10 = 1 << 14,
-    ArchDefineFuture = 1 << 15,
-    ArchDefineA2 = 1 << 16,
+    ArchDefinePwr11 = 1 << 15,
+    ArchDefineFuture = 1 << 16,
+    ArchDefineA2 = 1 << 17,
     ArchDefineE500 = 1 << 18
   } ArchDefineTypes;
 
@@ -166,11 +167,16 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                          ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
                          ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
                          ArchDefinePpcsq)
+              .Cases("power11", "pwr11",
+                     ArchDefinePwr11 | ArchDefinePwr10 | ArchDefinePwr9 |
+                         ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
+                         ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                         ArchDefinePpcgr | ArchDefinePpcsq)
               .Case("future",
-                    ArchDefineFuture | ArchDefinePwr10 | ArchDefinePwr9 |
-                        ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
-                        ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
-                        ArchDefinePpcgr | ArchDefinePpcsq)
+                    ArchDefineFuture | ArchDefinePwr11 | ArchDefinePwr10 |
+                        ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
+                        ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
+                        ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("8548", "e500", ArchDefineE500)
               .Default(ArchDefineNone);
     }
@@ -192,6 +198,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                  const std::vector<std::string> &FeaturesVec) const override;
 
   void addP10SpecificFeatures(llvm::StringMap<bool> &Features) const;
+  void addP11SpecificFeatures(llvm::StringMap<bool> &Features) const;
   void addFutureSpecificFeatures(llvm::StringMap<bool> &Features) const;
 
   bool handleTargetFeatures(std::vector<std::string> &Features,
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 9159162f01d1b..41d836330b38c 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -479,3 +479,9 @@ RISCVTargetInfo::checkCallingConvention(CallingConv CC) const {
     return CCCR_OK;
   }
 }
+
+bool RISCVTargetInfo::validateCpuSupports(StringRef Feature) const {
+  // Only allow extensions we have a known bit position for in the
+  // __riscv_feature_bits structure.
+  return -1 != llvm::RISCVISAInfo::getRISCVFeaturesBitPosition(Feature);
+}
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index d5df6344bedc0..626274b8fc437 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -126,6 +126,10 @@ class RISCVTargetInfo : public TargetInfo {
   std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
     return std::make_pair(32, 32);
   }
+
+  bool supportsCpuSupports() const override { return getTriple().isOSLinux(); }
+  bool supportsCpuInit() const override { return getTriple().isOSLinux(); }
+  bool validateCpuSupports(StringRef Feature) const override;
 };
 class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
 public:
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 121a2c2d795fe..18e6dbf03e00d 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -963,8 +963,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__CF__");
   if (HasZU)
     Builder.defineMacro("__ZU__");
-  // Condition here is aligned with the feature set of mapxf in Options.td
-  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF && HasCF)
+  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF &&
+      HasCF && HasZU)
     Builder.defineMacro("__APX_F__");
   if (HasEGPR && HasInlineAsmUseGPR32)
     Builder.defineMacro("__APX_INLINE_ASM_USE_GPR32__");
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index cdec41afd1a4b..ba34ab2c7f336 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -513,15 +513,6 @@ class LLVM_LIBRARY_VISIBILITY NetBSDI386TargetInfo
 public:
   NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
       : NetBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {}
-
-  LangOptions::FPEvalMethodKind getFPEvalMethod() const override {
-    VersionTuple OsVersion = getTriple().getOSVersion();
-    // New NetBSD uses the default rounding mode.
-    if (OsVersion >= VersionTuple(6, 99, 26) || OsVersion.getMajor() == 0)
-      return X86_32TargetInfo::getFPEvalMethod();
-    // NetBSD before 6.99.26 defaults to "double" rounding.
-    return LangOptions::FPEvalMethodKind::FEM_Double;
-  }
 };
 
 class LLVM_LIBRARY_VISIBILITY OpenBSDI386TargetInfo
diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h
index 35ec370a139c9..a18c7169af1eb 100644
--- a/clang/lib/CodeGen/Address.h
+++ b/clang/lib/CodeGen/Address.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_ADDRESS_H
 #define LLVM_CLANG_LIB_CODEGEN_ADDRESS_H
 
+#include "CGPointerAuthInfo.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Type.h"
 #include "llvm/ADT/PointerIntPair.h"
@@ -108,6 +109,22 @@ class RawAddress {
 
 /// Like RawAddress, an abstract representation of an aligned address, but the
 /// pointer contained in this class is possibly signed.
+///
+/// This is designed to be an IR-level abstraction, carrying just the
+/// information necessary to perform IR operations on an address like loads and
+/// stores.  In particular, it doesn't carry C type information or allow the
+/// representation of things like bit-fields; clients working at that level
+/// should generally be using `LValue`.
+///
+/// An address may be either *raw*, meaning that it's an ordinary machine
+/// pointer, or *signed*, meaning that the pointer carries an embedded
+/// pointer-authentication signature. Representing signed pointers directly in
+/// this abstraction allows the authentication to be delayed as long as possible
+/// without forcing IRGen to use totally different code paths for signed and
+/// unsigned values or to separately propagate signature information through
+/// every API that manipulates addresses. Pointer arithmetic on signed addresses
+/// (e.g. drilling down to a struct field) is accumulated into a separate offset
+/// which is applied when the address is finally accessed.
 class Address {
   friend class CGBuilderTy;
 
@@ -121,7 +138,11 @@ class Address {
 
   CharUnits Alignment;
 
-  /// Offset from the base pointer.
+  /// The ptrauth information needed to authenticate the base pointer.
+  CGPointerAuthInfo PtrAuthInfo;
+
+  /// Offset from the base pointer. This is non-null only when the base
+  /// pointer is signed.
   llvm::Value *Offset = nullptr;
 
   llvm::Value *emitRawPointerSlow(CodeGenFunction &CGF) const;
@@ -140,12 +161,14 @@ class Address {
   }
 
   Address(llvm::Value *BasePtr, llvm::Type *ElementType, CharUnits Alignment,
-          llvm::Value *Offset, KnownNonNull_t IsKnownNonNull = NotKnownNonNull)
+          CGPointerAuthInfo PtrAuthInfo, llvm::Value *Offset,
+          KnownNonNull_t IsKnownNonNull = NotKnownNonNull)
       : Pointer(BasePtr, IsKnownNonNull), ElementType(ElementType),
-        Alignment(Alignment), Offset(Offset) {}
+        Alignment(Alignment), PtrAuthInfo(PtrAuthInfo), Offset(Offset) {}
 
   Address(RawAddress RawAddr)
-      : Pointer(RawAddr.isValid() ? RawAddr.getPointer() : nullptr),
+      : Pointer(RawAddr.isValid() ? RawAddr.getPointer() : nullptr,
+                RawAddr.isValid() ? RawAddr.isKnownNonNull() : NotKnownNonNull),
         ElementType(RawAddr.isValid() ? RawAddr.getElementType() : nullptr),
         Alignment(RawAddr.isValid() ? RawAddr.getAlignment()
                                     : CharUnits::Zero()) {}
@@ -192,6 +215,9 @@ class Address {
   /// Return the IR name of the pointer value.
   llvm::StringRef getName() const { return Pointer.getPointer()->getName(); }
 
+  const CGPointerAuthInfo &getPointerAuthInfo() const { return PtrAuthInfo; }
+  void setPointerAuthInfo(const CGPointerAuthInfo &Info) { PtrAuthInfo = Info; }
+
   // This function is called only in CGBuilderBaseTy::CreateElementBitCast.
   void setElementType(llvm::Type *Ty) {
     assert(hasOffset() &&
@@ -199,6 +225,8 @@ class Address {
     ElementType = Ty;
   }
 
+  bool isSigned() const { return PtrAuthInfo.isSigned(); }
+
   /// Whether the pointer is known not to be null.
   KnownNonNull_t isKnownNonNull() const {
     assert(isValid());
@@ -215,10 +243,15 @@ class Address {
 
   llvm::Value *getOffset() const { return Offset; }
 
+  Address getResignedAddress(const CGPointerAuthInfo &NewInfo,
+                             CodeGenFunction &CGF) const;
+
   /// Return the pointer contained in this class after authenticating it and
   /// adding offset to it if necessary.
   llvm::Value *emitRawPointer(CodeGenFunction &CGF) const {
-    return getBasePointer();
+    if (!isSigned())
+      return getBasePointer();
+    return emitRawPointerSlow(CGF);
   }
 
   /// Return address with different pointer, but same element type and
@@ -240,7 +273,8 @@ class Address {
   /// alignment.
   Address withElementType(llvm::Type *ElemTy) const {
     if (!hasOffset())
-      return Address(getBasePointer(), ElemTy, getAlignment(), nullptr,
+      return Address(getBasePointer(), ElemTy, getAlignment(),
+                     getPointerAuthInfo(), /*Offset=*/nullptr,
                      isKnownNonNull());
     Address A(*this);
     A.ElementType = ElemTy;
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index 0337c95dfa3cf..4e639a976389f 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -728,7 +728,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
 
   llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
   llvm::AtomicRMWInst *RMWI =
-      CGF.Builder.CreateAtomicRMW(Op, Ptr, LoadVal1, Order, Scope);
+      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
   RMWI->setVolatile(E->isVolatile());
 
   // For __atomic_*_fetch operations, perform the operation again to
@@ -2035,6 +2035,17 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
                                            IsWeak);
 }
 
+llvm::AtomicRMWInst *
+CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
+                                   llvm::Value *Val, llvm::AtomicOrdering Order,
+                                   llvm::SyncScope::ID SSID) {
+
+  llvm::AtomicRMWInst *RMW =
+      Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
+  getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+  return RMW;
+}
+
 void CodeGenFunction::EmitAtomicUpdate(
     LValue LVal, llvm::AtomicOrdering AO,
     const llvm::function_ref<RValue(RValue)> &UpdateOp, bool IsVolatile) {
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index 0bc4fda62979c..5d59d5a4ae2c1 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -190,8 +190,8 @@ class CGBuilderTy : public CGBuilderBaseTy {
                               const llvm::Twine &Name = "") {
     if (!Addr.hasOffset())
       return Address(CreateAddrSpaceCast(Addr.getBasePointer(), Ty, Name),
-                     ElementTy, Addr.getAlignment(), nullptr,
-                     Addr.isKnownNonNull());
+                     ElementTy, Addr.getAlignment(), Addr.getPointerAuthInfo(),
+                     /*Offset=*/nullptr, Addr.isKnownNonNull());
     // Eagerly force a raw address if these is an offset.
     return RawAddress(
         CreateAddrSpaceCast(Addr.emitRawPointer(*getCGF()), Ty, Name),
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e192f5c701d8a..0ca0e8787ea08 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -63,6 +63,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
+#include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/X86TargetParser.h"
 #include <optional>
 #include <sstream>
@@ -766,7 +767,26 @@ static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
                               const CallExpr *E, llvm::Constant *calleeValue) {
   CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
   CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
-  return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
+  RValue Call =
+      CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
+
+  if (unsigned BuiltinID = FD->getBuiltinID()) {
+    // Check whether a FP math builtin function, such as BI__builtin_expf
+    ASTContext &Context = CGF.getContext();
+    bool ConstWithoutErrnoAndExceptions =
+        Context.BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
+    // Restrict to target with errno, for example, MacOS doesn't set errno.
+    // TODO: Support builtin function with complex type returned, eg: cacosh
+    if (ConstWithoutErrnoAndExceptions && CGF.CGM.getLangOpts().MathErrno &&
+        !CGF.Builder.getIsFPConstrained() && Call.isScalar()) {
+      // Emit "int" TBAA metadata on FP math libcalls.
+      clang::QualType IntTy = Context.IntTy;
+      TBAAAccessInfo TBAAInfo = CGF.CGM.getTBAAAccessInfo(IntTy);
+      Instruction *Inst = cast<llvm::Instruction>(Call.getScalarVal());
+      CGF.CGM.DecorateInstructionWithTBAA(Inst, TBAAInfo);
+    }
+  }
+  return Call;
 }
 
 /// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
@@ -1054,7 +1074,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
 
   // Build a load of the counted_by field.
   bool IsSigned = CountedByFD->getType()->isSignedIntegerType();
-  Value *CountedByInst = EmitCountedByFieldExpr(Base, FAMDecl, CountedByFD);
+  Value *CountedByInst = EmitLoadOfCountedByField(Base, FAMDecl, CountedByFD);
   if (!CountedByInst)
     return getDefaultBuiltinObjectSizeResult(Type, ResType);
 
@@ -2721,6 +2741,42 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   if (GenerateIntrinsics &&
       !(getLangOpts().SYCLIsDevice && getTarget().getTriple().isNVPTX())) {
     switch (BuiltinIDIfNoAsmLabel) {
+    case Builtin::BIacos:
+    case Builtin::BIacosf:
+    case Builtin::BIacosl:
+    case Builtin::BI__builtin_acos:
+    case Builtin::BI__builtin_acosf:
+    case Builtin::BI__builtin_acosf16:
+    case Builtin::BI__builtin_acosl:
+    case Builtin::BI__builtin_acosf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::acos, Intrinsic::experimental_constrained_acos,
+          Intrinsic::fpbuiltin_acos));
+
+    case Builtin::BIasin:
+    case Builtin::BIasinf:
+    case Builtin::BIasinl:
+    case Builtin::BI__builtin_asin:
+    case Builtin::BI__builtin_asinf:
+    case Builtin::BI__builtin_asinf16:
+    case Builtin::BI__builtin_asinl:
+    case Builtin::BI__builtin_asinf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::asin, Intrinsic::experimental_constrained_asin,
+          Intrinsic::fpbuiltin_asin));
+
+    case Builtin::BIatan:
+    case Builtin::BIatanf:
+    case Builtin::BIatanl:
+    case Builtin::BI__builtin_atan:
+    case Builtin::BI__builtin_atanf:
+    case Builtin::BI__builtin_atanf16:
+    case Builtin::BI__builtin_atanl:
+    case Builtin::BI__builtin_atanf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::atan, Intrinsic::experimental_constrained_atan,
+          Intrinsic::fpbuiltin_atan));
+
     case Builtin::BIceil:
     case Builtin::BIceilf:
     case Builtin::BIceill:
@@ -2756,6 +2812,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           *this, E, Intrinsic::cos, Intrinsic::experimental_constrained_cos,
           Intrinsic::fpbuiltin_cos));
 
+    case Builtin::BIcosh:
+    case Builtin::BIcoshf:
+    case Builtin::BIcoshl:
+    case Builtin::BI__builtin_cosh:
+    case Builtin::BI__builtin_coshf:
+    case Builtin::BI__builtin_coshf16:
+    case Builtin::BI__builtin_coshl:
+    case Builtin::BI__builtin_coshf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::cosh, Intrinsic::experimental_constrained_cosh,
+          Intrinsic::fpbuiltin_cosh));
+
     case Builtin::BIexp:
     case Builtin::BIexpf:
     case Builtin::BIexpl:
@@ -2972,6 +3040,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           *this, E, Intrinsic::sin, Intrinsic::experimental_constrained_sin,
           Intrinsic::fpbuiltin_sin));
 
+    case Builtin::BIsinh:
+    case Builtin::BIsinhf:
+    case Builtin::BIsinhl:
+    case Builtin::BI__builtin_sinh:
+    case Builtin::BI__builtin_sinhf:
+    case Builtin::BI__builtin_sinhf16:
+    case Builtin::BI__builtin_sinhl:
+    case Builtin::BI__builtin_sinhf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh,
+          Intrinsic::fpbuiltin_sinh));
+
     case Builtin::BIsqrt:
     case Builtin::BIsqrtf:
     case Builtin::BIsqrtl:
@@ -2999,6 +3079,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan,
           Intrinsic::fpbuiltin_tan));
 
+    case Builtin::BItanh:
+    case Builtin::BItanhf:
+    case Builtin::BItanhl:
+    case Builtin::BI__builtin_tanh:
+    case Builtin::BI__builtin_tanhf:
+    case Builtin::BI__builtin_tanhf16:
+    case Builtin::BI__builtin_tanhl:
+    case Builtin::BI__builtin_tanhf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::tanh, Intrinsic::experimental_constrained_tanh,
+          Intrinsic::fpbuiltin_tanh));
+
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:
@@ -5974,8 +6066,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         getTarget().getTriple().isAMDGCN() ||
         (getTarget().getTriple().isSPIRV() &&
          getTarget().getTriple().getVendor() == Triple::VendorType::AMD)) {
-      if (getLangOpts().OpenMPIsTargetDevice)
-        return EmitOpenMPDevicePrintfCallExpr(E);
       if (getTarget().getTriple().isNVPTX())
         return EmitNVPTXDevicePrintfCallExpr(E);
       if ((getTarget().getTriple().isAMDGCN() ||
@@ -14316,6 +14406,16 @@ Value *CodeGenFunction::EmitAArch64CpuInit() {
   return Builder.CreateCall(Func);
 }
 
+Value *CodeGenFunction::EmitRISCVCpuInit() {
+  llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
+  llvm::FunctionCallee Func =
+      CGM.CreateRuntimeFunction(FTy, "__init_riscv_feature_bits");
+  auto *CalleeGV = cast<llvm::GlobalValue>(Func.getCallee());
+  CalleeGV->setDSOLocal(true);
+  CalleeGV->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
+  return Builder.CreateCall(Func);
+}
+
 Value *CodeGenFunction::EmitX86CpuInit() {
   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
                                                     /*Variadic*/ false);
@@ -14368,6 +14468,42 @@ CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
   return Result;
 }
 
+Value *CodeGenFunction::EmitRISCVCpuSupports(const CallExpr *E) {
+
+  const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
+  StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
+  if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr))
+    return Builder.getFalse();
+
+  // Note: We are making an unchecked assumption that the size of the
+  // feature array is >= 1.  This holds for any version of compiler-rt
+  // which defines this interface.
+  llvm::ArrayType *ArrayOfInt64Ty = llvm::ArrayType::get(Int64Ty, 1);
+  llvm::Type *StructTy = llvm::StructType::get(Int32Ty, ArrayOfInt64Ty);
+  llvm::Constant *RISCVFeaturesBits =
+      CGM.CreateRuntimeVariable(StructTy, "__riscv_feature_bits");
+  auto *GV = cast<llvm::GlobalValue>(RISCVFeaturesBits);
+  GV->setDSOLocal(true);
+
+  auto LoadFeatureBit = [&](unsigned Index) {
+    // Create GEP then load.
+    Value *IndexVal = llvm::ConstantInt::get(Int32Ty, Index);
+    llvm::Value *GEPIndices[] = {Builder.getInt32(0), Builder.getInt32(1),
+                                 IndexVal};
+    Value *Ptr =
+        Builder.CreateInBoundsGEP(StructTy, RISCVFeaturesBits, GEPIndices);
+    Value *FeaturesBit =
+        Builder.CreateAlignedLoad(Int64Ty, Ptr, CharUnits::fromQuantity(8));
+    return FeaturesBit;
+  };
+
+  int BitPos = RISCVISAInfo::getRISCVFeaturesBitPosition(FeatureStr);
+  assert(BitPos != -1 && "validation should have rejected this feature");
+  Value *MaskV = Builder.getInt64(1ULL << BitPos);
+  Value *Bitset = Builder.CreateAnd(LoadFeatureBit(0), MaskV);
+  return Builder.CreateICmpEQ(Bitset, MaskV);
+}
+
 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
                                            const CallExpr *E) {
   if (BuiltinID == Builtin::BI__builtin_cpu_is)
@@ -14484,12 +14620,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
     // value, we should use that here instead of a zero.
     return llvm::Constant::getNullValue(ConvertType(E->getType()));
-  case X86::BI__builtin_ia32_vec_init_v8qi:
-  case X86::BI__builtin_ia32_vec_init_v4hi:
-  case X86::BI__builtin_ia32_vec_init_v2si:
-    return Builder.CreateBitCast(BuildVector(Ops),
-                                 llvm::Type::getX86_MMXTy(getLLVMContext()));
-  case X86::BI__builtin_ia32_vec_ext_v2si:
+  case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
   case X86::BI__builtin_ia32_vec_ext_v4si:
@@ -14507,6 +14638,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // Otherwise we could just do this in the header file.
     return Builder.CreateExtractElement(Ops[0], Index);
   }
+  case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v16qi:
   case X86::BI__builtin_ia32_vec_set_v8hi:
   case X86::BI__builtin_ia32_vec_set_v4si:
@@ -18352,14 +18484,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return nullptr;
 
   switch (BuiltinID) {
-  case Builtin::BI__builtin_hlsl_elementwise_all: {
+  case Builtin::BI__builtin_hlsl_all: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     return Builder.CreateIntrinsic(
         /*ReturnType=*/llvm::Type::getInt1Ty(getLLVMContext()),
         CGM.getHLSLRuntime().getAllIntrinsic(), ArrayRef<Value *>{Op0}, nullptr,
         "hlsl.all");
   }
-  case Builtin::BI__builtin_hlsl_elementwise_any: {
+  case Builtin::BI__builtin_hlsl_any: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     return Builder.CreateIntrinsic(
         /*ReturnType=*/llvm::Type::getInt1Ty(getLLVMContext()),
@@ -18433,10 +18565,10 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     if (!E->getArg(0)->getType()->hasFloatingRepresentation())
       llvm_unreachable("frac operand must have a float representation");
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/Op0->getType(), Intrinsic::dx_frac,
-        ArrayRef<Value *>{Op0}, nullptr, "dx.frac");
-  }
-  case Builtin::BI__builtin_hlsl_elementwise_isinf: {
+        /*ReturnType=*/Op0->getType(), CGM.getHLSLRuntime().getFracIntrinsic(),
+        ArrayRef<Value *>{Op0}, nullptr, "hlsl.frac");
+}
+case Builtin::BI__builtin_hlsl_elementwise_isinf: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     llvm::Type *Xty = Op0->getType();
     llvm::Type *retType = llvm::Type::getInt1Ty(this->getLLVMContext());
@@ -18827,7 +18959,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
 
     Intrinsic::ID IID;
     switch (BuiltinID) {
@@ -18836,7 +18972,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       IID = Intrinsic::amdgcn_global_load_tr_b64;
       break;
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
       IID = Intrinsic::amdgcn_global_load_tr_b128;
       break;
     }
@@ -19279,6 +19419,39 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
     return emitBuiltinWithOneOverloadedType<5>(
         *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
+    llvm::Type *RetTy = nullptr;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
+      RetTy = Int8Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
+      RetTy = Int16Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
+      RetTy = Int32Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
+      break;
+    }
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
+    return Builder.CreateCall(
+        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
+  }
   default:
     return nullptr;
   }
@@ -23489,6 +23662,13 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_extract_lane_f16x8);
     return Builder.CreateCall(Callee, {Vector, Index});
   }
+  case WebAssembly::BI__builtin_wasm_replace_lane_f16x8: {
+    Value *Vector = EmitScalarExpr(E->getArg(0));
+    Value *Index = EmitScalarExpr(E->getArg(1));
+    Value *Val = EmitScalarExpr(E->getArg(2));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_replace_lane_f16x8);
+    return Builder.CreateCall(Callee, {Vector, Index, Val});
+  }
   case WebAssembly::BI__builtin_wasm_table_get: {
     assert(E->getArg(0)->getType()->isArrayType());
     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
@@ -24158,6 +24338,12 @@ RValue CodeGenFunction::EmitIntelSYCLAllocaBuiltin(
 Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue) {
+
+  if (BuiltinID == Builtin::BI__builtin_cpu_supports)
+    return EmitRISCVCpuSupports(E);
+  if (BuiltinID == Builtin::BI__builtin_cpu_init)
+    return EmitRISCVCpuInit();
+
   SmallVector<Value *, 4> Ops;
   llvm::Type *ResultType = ConvertType(E->getType());
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index c86804f7b072e..70758e6e17ec9 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1345,75 +1345,50 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
   return CGF.Builder.CreateLoad(Tmp);
 }
 
-// Function to store a first-class aggregate into memory.  We prefer to
-// store the elements rather than the aggregate to be more friendly to
-// fast-isel.
-// FIXME: Do we need to recurse here?
-void CodeGenFunction::EmitAggregateStore(llvm::Value *Val, Address Dest,
-                                         bool DestIsVolatile) {
-  // Prefer scalar stores to first-class aggregate stores.
-  if (llvm::StructType *STy = dyn_cast<llvm::StructType>(Val->getType())) {
-    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-      Address EltPtr = Builder.CreateStructGEP(Dest, i);
-      llvm::Value *Elt = Builder.CreateExtractValue(Val, i);
-      Builder.CreateStore(Elt, EltPtr, DestIsVolatile);
-    }
-  } else {
-    Builder.CreateStore(Val, Dest, DestIsVolatile);
-  }
-}
-
-/// CreateCoercedStore - Create a store to \arg DstPtr from \arg Src,
-/// where the source and destination may have different types.  The
-/// destination is known to be aligned to \arg DstAlign bytes.
-///
-/// This safely handles the case when the src type is larger than the
-/// destination type; the upper bits of the src will be lost.
-static void CreateCoercedStore(llvm::Value *Src,
-                               Address Dst,
-                               bool DstIsVolatile,
-                               CodeGenFunction &CGF) {
-  llvm::Type *SrcTy = Src->getType();
-  llvm::Type *DstTy = Dst.getElementType();
-  if (SrcTy == DstTy) {
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
+void CodeGenFunction::CreateCoercedStore(llvm::Value *Src, Address Dst,
+                                         llvm::TypeSize DstSize,
+                                         bool DstIsVolatile) {
+  if (!DstSize)
     return;
-  }
-
-  llvm::TypeSize SrcSize = CGF.CGM.getDataLayout().getTypeAllocSize(SrcTy);
 
-  if (llvm::StructType *DstSTy = dyn_cast<llvm::StructType>(DstTy)) {
-    Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
-                                             SrcSize.getFixedValue(), CGF);
-    DstTy = Dst.getElementType();
-  }
-
-  llvm::PointerType *SrcPtrTy = llvm::dyn_cast<llvm::PointerType>(SrcTy);
-  llvm::PointerType *DstPtrTy = llvm::dyn_cast<llvm::PointerType>(DstTy);
-  if (SrcPtrTy && DstPtrTy &&
-      SrcPtrTy->getAddressSpace() != DstPtrTy->getAddressSpace()) {
-    Src = CGF.Builder.CreateAddrSpaceCast(Src, DstTy);
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
-    return;
-  }
-
-  // If the source and destination are integer or pointer types, just do an
-  // extension or truncation to the desired type.
-  if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) &&
-      (isa<llvm::IntegerType>(DstTy) || isa<llvm::PointerType>(DstTy))) {
-    Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, CGF);
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
-    return;
+  llvm::Type *SrcTy = Src->getType();
+  llvm::TypeSize SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
+
+  // GEP into structs to try to make types match.
+  // FIXME: This isn't really that useful with opaque types, but it impacts a
+  // lot of regression tests.
+  if (SrcTy != Dst.getElementType()) {
+    if (llvm::StructType *DstSTy =
+            dyn_cast<llvm::StructType>(Dst.getElementType())) {
+      assert(!SrcSize.isScalable());
+      Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
+                                               SrcSize.getFixedValue(), *this);
+    }
   }
 
-  llvm::TypeSize DstSize = CGF.CGM.getDataLayout().getTypeAllocSize(DstTy);
-
-  // If store is legal, just bitcast the src pointer.
-  if (isa<llvm::ScalableVectorType>(SrcTy) ||
-      isa<llvm::ScalableVectorType>(DstTy) ||
-      SrcSize.getFixedValue() <= DstSize.getFixedValue()) {
-    Dst = Dst.withElementType(SrcTy);
-    CGF.EmitAggregateStore(Src, Dst, DstIsVolatile);
+  if (SrcSize.isScalable() || SrcSize <= DstSize) {
+    if (SrcTy->isIntegerTy() && Dst.getElementType()->isPointerTy() &&
+        SrcSize == CGM.getDataLayout().getTypeAllocSize(Dst.getElementType())) {
+      // If the value is supposed to be a pointer, convert it before storing it.
+      Src = CoerceIntOrPtrToIntOrPtr(Src, Dst.getElementType(), *this);
+      Builder.CreateStore(Src, Dst, DstIsVolatile);
+    } else if (llvm::StructType *STy =
+                   dyn_cast<llvm::StructType>(Src->getType())) {
+      // Prefer scalar stores to first-class aggregate stores.
+      Dst = Dst.withElementType(SrcTy);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+        Address EltPtr = Builder.CreateStructGEP(Dst, i);
+        llvm::Value *Elt = Builder.CreateExtractValue(Src, i);
+        Builder.CreateStore(Elt, EltPtr, DstIsVolatile);
+      }
+    } else {
+      Builder.CreateStore(Src, Dst.withElementType(SrcTy), DstIsVolatile);
+    }
+  } else if (SrcTy->isIntegerTy()) {
+    // If the source is a simple integer, coerce it directly.
+    llvm::Type *DstIntTy = Builder.getIntNTy(DstSize.getFixedValue() * 8);
+    Src = CoerceIntOrPtrToIntOrPtr(Src, DstIntTy, *this);
+    Builder.CreateStore(Src, Dst.withElementType(DstIntTy), DstIsVolatile);
   } else {
     // Otherwise do coercion through memory. This is stupid, but
     // simple.
@@ -1425,12 +1400,12 @@ static void CreateCoercedStore(llvm::Value *Src,
     // FIXME: Assert that we aren't truncating non-padding bits when have access
     // to that information.
     RawAddress Tmp =
-        CreateTempAllocaForCoercion(CGF, SrcTy, Dst.getAlignment());
-    CGF.Builder.CreateStore(Src, Tmp);
-    CGF.Builder.CreateMemCpy(
-        Dst.emitRawPointer(CGF), Dst.getAlignment().getAsAlign(),
-        Tmp.getPointer(), Tmp.getAlignment().getAsAlign(),
-        llvm::ConstantInt::get(CGF.IntPtrTy, DstSize.getFixedValue()));
+        CreateTempAllocaForCoercion(*this, SrcTy, Dst.getAlignment());
+    Builder.CreateStore(Src, Tmp);
+    Builder.CreateMemCpy(Dst.emitRawPointer(*this),
+                         Dst.getAlignment().getAsAlign(), Tmp.getPointer(),
+                         Tmp.getAlignment().getAsAlign(),
+                         Builder.CreateTypeSize(IntPtrTy, DstSize));
   }
 }
 
@@ -2088,6 +2063,9 @@ static void getTrivialDefaultFunctionAttributes(
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
   }
 
+  if (CodeGenOpts.SaveRegParams && !AttrOnCallSite)
+    FuncAttrs.addAttribute("save-reg-params");
+
   for (StringRef Attr : CodeGenOpts.DefaultFunctionAttrs) {
     StringRef Var, Value;
     std::tie(Var, Value) = Attr.split('=');
@@ -2582,6 +2560,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
         }
       }
     }
+    // Remove 'convergent' if requested.
+    if (TargetDecl->hasAttr<NoConvergentAttr>())
+      FuncAttrs.removeAttribute(llvm::Attribute::Convergent);
   }
 
   // Add "sample-profile-suffix-elision-policy" attribute for internal linkage
@@ -3391,7 +3372,12 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         assert(NumIRArgs == 1);
         auto AI = Fn->getArg(FirstIRArg);
         AI->setName(Arg->getName() + ".coerce");
-        CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, *this);
+        CreateCoercedStore(
+            AI, Ptr,
+            llvm::TypeSize::getFixed(
+                getContext().getTypeSizeInChars(Ty).getQuantity() -
+                ArgI.getDirectOffset()),
+            /*DstIsVolatile=*/false);
       }
 
       // Match to what EmitParmDecl is expecting for this type.
@@ -5115,7 +5101,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  ReturnValueSlot ReturnValue,
                                  const CallArgList &CallArgs,
                                  llvm::CallBase **callOrInvoke, bool IsMustTail,
-                                 SourceLocation Loc) {
+                                 SourceLocation Loc,
+                                 bool IsVirtualFunctionPointerThunk) {
   // FIXME: We no longer need the types from CallArgs; lift up and simplify.
 
   assert(Callee.isOrdinary() || Callee.isVirtual());
@@ -5179,7 +5166,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   RawAddress SRetAlloca = RawAddress::invalid();
   llvm::Value *UnusedReturnSizePtr = nullptr;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
-    if (!ReturnValue.isNull()) {
+    if (IsVirtualFunctionPointerThunk && RetAI.isIndirect()) {
+      SRetPtr = makeNaturalAddressForPointer(CurFn->arg_begin() +
+                                                 IRFunctionArgs.getSRetArgNo(),
+                                             RetTy, CharUnits::fromQuantity(1));
+    } else if (!ReturnValue.isNull()) {
       SRetPtr = ReturnValue.getAddress();
     } else {
       SRetPtr = CreateMemTemp(RetTy, "tmp", &SRetAlloca);
@@ -5715,6 +5706,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     Attrs =
         Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
 
+  // Remove call-site convergent attribute if requested.
+  if (InNoConvergentAttributedStmt)
+    Attrs =
+        Attrs.removeFnAttribute(getLLVMContext(), llvm::Attribute::Convergent);
+
   // Apply some call-site-specific attributes.
   // TODO: work this into building the attribute set.
 
@@ -5992,119 +5988,127 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   CallArgs.freeArgumentMemory(*this);
 
   // Extract the return value.
-  RValue Ret = [&] {
-    switch (RetAI.getKind()) {
-    case ABIArgInfo::CoerceAndExpand: {
-      auto coercionType = RetAI.getCoerceAndExpandType();
+  RValue Ret;
 
-      Address addr = SRetPtr.withElementType(coercionType);
-
-      assert(CI->getType() == RetAI.getUnpaddedCoerceAndExpandType());
-      bool requiresExtract = isa<llvm::StructType>(CI->getType());
-
-      unsigned unpaddedIndex = 0;
-      for (unsigned i = 0, e = coercionType->getNumElements(); i != e; ++i) {
-        llvm::Type *eltType = coercionType->getElementType(i);
-        if (ABIArgInfo::isPaddingForCoerceAndExpand(eltType)) continue;
-        Address eltAddr = Builder.CreateStructGEP(addr, i);
-        llvm::Value *elt = CI;
-        if (requiresExtract)
-          elt = Builder.CreateExtractValue(elt, unpaddedIndex++);
-        else
-          assert(unpaddedIndex == 0);
-        Builder.CreateStore(elt, eltAddr);
+  // If the current function is a virtual function pointer thunk, avoid copying
+  // the return value of the musttail call to a temporary.
+  if (IsVirtualFunctionPointerThunk) {
+    Ret = RValue::get(CI);
+  } else {
+    Ret = [&] {
+      switch (RetAI.getKind()) {
+      case ABIArgInfo::CoerceAndExpand: {
+        auto coercionType = RetAI.getCoerceAndExpandType();
+
+        Address addr = SRetPtr.withElementType(coercionType);
+
+        assert(CI->getType() == RetAI.getUnpaddedCoerceAndExpandType());
+        bool requiresExtract = isa<llvm::StructType>(CI->getType());
+
+        unsigned unpaddedIndex = 0;
+        for (unsigned i = 0, e = coercionType->getNumElements(); i != e; ++i) {
+          llvm::Type *eltType = coercionType->getElementType(i);
+          if (ABIArgInfo::isPaddingForCoerceAndExpand(eltType))
+            continue;
+          Address eltAddr = Builder.CreateStructGEP(addr, i);
+          llvm::Value *elt = CI;
+          if (requiresExtract)
+            elt = Builder.CreateExtractValue(elt, unpaddedIndex++);
+          else
+            assert(unpaddedIndex == 0);
+          Builder.CreateStore(elt, eltAddr);
+        }
+        [[fallthrough]];
       }
-      [[fallthrough]];
-    }
-
-    case ABIArgInfo::InAlloca:
-    case ABIArgInfo::Indirect: {
-      RValue ret = convertTempToRValue(SRetPtr, RetTy, SourceLocation());
-      if (UnusedReturnSizePtr)
-        PopCleanupBlock();
-      return ret;
-    }
-
-    case ABIArgInfo::Ignore:
-      // If we are ignoring an argument that had a result, make sure to
-      // construct the appropriate return value for our caller.
-      return GetUndefRValue(RetTy);
 
-    case ABIArgInfo::Extend:
-    case ABIArgInfo::Direct: {
-      llvm::Type *RetIRTy = ConvertType(RetTy);
-      if (RetAI.getCoerceToType() == RetIRTy && RetAI.getDirectOffset() == 0) {
-        switch (getEvaluationKind(RetTy)) {
-        case TEK_Complex: {
-          llvm::Value *Real = Builder.CreateExtractValue(CI, 0);
-          llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
-          return RValue::getComplex(std::make_pair(Real, Imag));
-        }
-        case TEK_Aggregate: {
-          Address DestPtr = ReturnValue.getAddress();
-          bool DestIsVolatile = ReturnValue.isVolatile();
+      case ABIArgInfo::InAlloca:
+      case ABIArgInfo::Indirect: {
+        RValue ret = convertTempToRValue(SRetPtr, RetTy, SourceLocation());
+        if (UnusedReturnSizePtr)
+          PopCleanupBlock();
+        return ret;
+      }
 
-          if (!DestPtr.isValid()) {
-            DestPtr = CreateMemTemp(RetTy, "agg.tmp");
-            DestIsVolatile = false;
+      case ABIArgInfo::Ignore:
+        // If we are ignoring an argument that had a result, make sure to
+        // construct the appropriate return value for our caller.
+        return GetUndefRValue(RetTy);
+
+      case ABIArgInfo::Extend:
+      case ABIArgInfo::Direct: {
+        llvm::Type *RetIRTy = ConvertType(RetTy);
+        if (RetAI.getCoerceToType() == RetIRTy &&
+            RetAI.getDirectOffset() == 0) {
+          switch (getEvaluationKind(RetTy)) {
+          case TEK_Complex: {
+            llvm::Value *Real = Builder.CreateExtractValue(CI, 0);
+            llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
+            return RValue::getComplex(std::make_pair(Real, Imag));
+          }
+          case TEK_Aggregate:
+            break;
+          case TEK_Scalar: {
+            // If the argument doesn't match, perform a bitcast to coerce it.
+            // This can happen due to trivial type mismatches.
+            llvm::Value *V = CI;
+            if (V->getType() != RetIRTy)
+              V = Builder.CreateBitCast(V, RetIRTy);
+            return RValue::get(V);
+          }
           }
-          EmitAggregateStore(CI, DestPtr, DestIsVolatile);
-          return RValue::getAggregate(DestPtr);
         }
-        case TEK_Scalar: {
-          // If the argument doesn't match, perform a bitcast to coerce it.  This
-          // can happen due to trivial type mismatches.
+
+        // If coercing a fixed vector from a scalable vector for ABI
+        // compatibility, and the types match, use the llvm.vector.extract
+        // intrinsic to perform the conversion.
+        if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
           llvm::Value *V = CI;
-          if (V->getType() != RetIRTy)
-            V = Builder.CreateBitCast(V, RetIRTy);
-          return RValue::get(V);
-        }
+          if (auto *ScalableSrcTy =
+                  dyn_cast<llvm::ScalableVectorType>(V->getType())) {
+            if (FixedDstTy->getElementType() ==
+                ScalableSrcTy->getElementType()) {
+              llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
+              V = Builder.CreateExtractVector(FixedDstTy, V, Zero,
+                                              "cast.fixed");
+              return RValue::get(V);
+            }
+          }
         }
-        llvm_unreachable("bad evaluation kind");
-      }
 
-      // If coercing a fixed vector from a scalable vector for ABI
-      // compatibility, and the types match, use the llvm.vector.extract
-      // intrinsic to perform the conversion.
-      if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
-        llvm::Value *V = CI;
-        if (auto *ScalableSrcTy =
-                dyn_cast<llvm::ScalableVectorType>(V->getType())) {
-          if (FixedDstTy->getElementType() == ScalableSrcTy->getElementType()) {
-            llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
-            V = Builder.CreateExtractVector(FixedDstTy, V, Zero, "cast.fixed");
-            return RValue::get(V);
-          }
+        Address DestPtr = ReturnValue.getValue();
+        bool DestIsVolatile = ReturnValue.isVolatile();
+        uint64_t DestSize =
+            getContext().getTypeInfoDataSizeInChars(RetTy).Width.getQuantity();
+
+        if (!DestPtr.isValid()) {
+          DestPtr = CreateMemTemp(RetTy, "coerce");
+          DestIsVolatile = false;
+          DestSize = getContext().getTypeSizeInChars(RetTy).getQuantity();
         }
-      }
 
-      Address DestPtr = ReturnValue.getValue();
-      bool DestIsVolatile = ReturnValue.isVolatile();
+        // An empty record can overlap other data (if declared with
+        // no_unique_address); omit the store for such types - as there is no
+        // actual data to store.
+        if (!isEmptyRecord(getContext(), RetTy, true)) {
+          // If the value is offset in memory, apply the offset now.
+          Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
+          CreateCoercedStore(
+              CI, StorePtr,
+              llvm::TypeSize::getFixed(DestSize - RetAI.getDirectOffset()),
+              DestIsVolatile);
+        }
 
-      if (!DestPtr.isValid()) {
-        DestPtr = CreateMemTemp(RetTy, "coerce");
-        DestIsVolatile = false;
+        return convertTempToRValue(DestPtr, RetTy, SourceLocation());
       }
 
-      // An empty record can overlap other data (if declared with
-      // no_unique_address); omit the store for such types - as there is no
-      // actual data to store.
-      if (!isEmptyRecord(getContext(), RetTy, true)) {
-        // If the value is offset in memory, apply the offset now.
-        Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
-        CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this);
+      case ABIArgInfo::Expand:
+      case ABIArgInfo::IndirectAliased:
+        llvm_unreachable("Invalid ABI kind for return argument");
       }
 
-      return convertTempToRValue(DestPtr, RetTy, SourceLocation());
-    }
-
-    case ABIArgInfo::Expand:
-    case ABIArgInfo::IndirectAliased:
-      llvm_unreachable("Invalid ABI kind for return argument");
-    }
-
-    llvm_unreachable("Unhandled ABIArgInfo::Kind");
-  } ();
+      llvm_unreachable("Unhandled ABIArgInfo::Kind");
+    }();
+  }
 
   // Emit the assume_aligned check on the return value.
   if (Ret.isScalar() && TargetDecl) {
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 4c8a95fbb105f..c5fd90fd2487c 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1996,7 +1996,12 @@ CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method,
   if (Method->isStatic())
     return cast_or_null<llvm::DISubroutineType>(
         getOrCreateType(QualType(Func, 0), Unit));
-  return getOrCreateInstanceMethodType(Method->getThisType(), Func, Unit);
+
+  QualType ThisType;
+  if (!Method->hasCXXExplicitFunctionObjectParameter())
+    ThisType = Method->getThisType();
+
+  return getOrCreateInstanceMethodType(ThisType, Func, Unit);
 }
 
 llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType(
@@ -2028,27 +2033,31 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType(
   Elts.push_back(Args[0]);
 
   // "this" pointer is always first argument.
-  const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl();
-  if (isa<ClassTemplateSpecializationDecl>(RD)) {
-    // Create pointer type directly in this case.
-    const PointerType *ThisPtrTy = cast<PointerType>(ThisPtr);
-    uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy);
-    auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext());
-    llvm::DIType *PointeeType =
-        getOrCreateType(ThisPtrTy->getPointeeType(), Unit);
-    llvm::DIType *ThisPtrType =
-        DBuilder.createPointerType(PointeeType, Size, Align);
-    TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
-    // TODO: This and the artificial type below are misleading, the
-    // types aren't artificial the argument is, but the current
-    // metadata doesn't represent that.
-    ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
-    Elts.push_back(ThisPtrType);
-  } else {
-    llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit);
-    TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
-    ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
-    Elts.push_back(ThisPtrType);
+  // ThisPtr may be null if the member function has an explicit 'this'
+  // parameter.
+  if (!ThisPtr.isNull()) {
+    const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl();
+    if (isa<ClassTemplateSpecializationDecl>(RD)) {
+      // Create pointer type directly in this case.
+      const PointerType *ThisPtrTy = cast<PointerType>(ThisPtr);
+      uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy);
+      auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext());
+      llvm::DIType *PointeeType =
+          getOrCreateType(ThisPtrTy->getPointeeType(), Unit);
+      llvm::DIType *ThisPtrType =
+          DBuilder.createPointerType(PointeeType, Size, Align);
+      TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
+      // TODO: This and the artificial type below are misleading, the
+      // types aren't artificial the argument is, but the current
+      // metadata doesn't represent that.
+      ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
+      Elts.push_back(ThisPtrType);
+    } else {
+      llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit);
+      TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
+      ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
+      Elts.push_back(ThisPtrType);
+    }
   }
 
   // Copy rest of the arguments.
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index f1681ffa4c606..1d5be8451d050 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2845,7 +2845,7 @@ void CodeGenModule::EmitOMPRequiresDecl(const OMPRequiresDecl *D) {
 }
 
 void CodeGenModule::EmitOMPAllocateDecl(const OMPAllocateDecl *D) {
-  for (const Expr *E : D->varlists()) {
+  for (const Expr *E : D->varlist()) {
     const auto *DE = cast<DeclRefExpr>(E);
     const auto *VD = cast<VarDecl>(DE->getDecl());
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index cced2ceafe7b8..e7fc9be54874b 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1068,28 +1068,31 @@ class StructAccessBase
 
 } // end anonymous namespace
 
-using RecIndicesTy =
-    SmallVector<std::pair<const RecordDecl *, llvm::Value *>, 8>;
+using RecIndicesTy = SmallVector<llvm::Value *, 8>;
 
 static bool getGEPIndicesToField(CodeGenFunction &CGF, const RecordDecl *RD,
-                                 const FieldDecl *FD, RecIndicesTy &Indices) {
+                                 const FieldDecl *Field,
+                                 RecIndicesTy &Indices) {
   const CGRecordLayout &Layout = CGF.CGM.getTypes().getCGRecordLayout(RD);
   int64_t FieldNo = -1;
-  for (const Decl *D : RD->decls()) {
-    if (const auto *Field = dyn_cast<FieldDecl>(D)) {
-      FieldNo = Layout.getLLVMFieldNo(Field);
-      if (FD == Field) {
-        Indices.emplace_back(std::make_pair(RD, CGF.Builder.getInt32(FieldNo)));
-        return true;
-      }
+  for (const FieldDecl *FD : RD->fields()) {
+    if (!Layout.containsFieldDecl(FD))
+      // This could happen if the field has a struct type that's empty. I don't
+      // know why either.
+      continue;
+
+    FieldNo = Layout.getLLVMFieldNo(FD);
+    if (FD == Field) {
+      Indices.emplace_back(CGF.Builder.getInt32(FieldNo));
+      return true;
     }
 
-    if (const auto *Record = dyn_cast<RecordDecl>(D)) {
-      ++FieldNo;
-      if (getGEPIndicesToField(CGF, Record, FD, Indices)) {
+    QualType Ty = FD->getType();
+    if (Ty->isRecordType()) {
+      if (getGEPIndicesToField(CGF, Ty->getAsRecordDecl(), Field, Indices)) {
         if (RD->isUnion())
           FieldNo = 0;
-        Indices.emplace_back(std::make_pair(RD, CGF.Builder.getInt32(FieldNo)));
+        Indices.emplace_back(CGF.Builder.getInt32(FieldNo));
         return true;
       }
     }
@@ -1106,7 +1109,7 @@ static bool getGEPIndicesToField(CodeGenFunction &CGF, const RecordDecl *RD,
 /// - \p FAMDecl: the \p Decl for the flexible array member. It may not be
 ///   within the top-level struct.
 /// - \p CountDecl: must be within the same non-anonymous struct as \p FAMDecl.
-llvm::Value *CodeGenFunction::EmitCountedByFieldExpr(
+llvm::Value *CodeGenFunction::EmitLoadOfCountedByField(
     const Expr *Base, const FieldDecl *FAMDecl, const FieldDecl *CountDecl) {
   const RecordDecl *RD = CountDecl->getParent()->getOuterLexicalRecordContext();
 
@@ -1133,15 +1136,15 @@ llvm::Value *CodeGenFunction::EmitCountedByFieldExpr(
     return nullptr;
   }
 
-  llvm::Value *Zero = Builder.getInt32(0);
   RecIndicesTy Indices;
-
   getGEPIndicesToField(*this, RD, CountDecl, Indices);
+  if (Indices.empty())
+    return nullptr;
 
-  for (auto I = Indices.rbegin(), E = Indices.rend(); I != E; ++I)
-    Res = Builder.CreateInBoundsGEP(
-        ConvertType(QualType(I->first->getTypeForDecl(), 0)), Res,
-        {Zero, I->second}, "..counted_by.gep");
+  Indices.push_back(Builder.getInt32(0));
+  Res = Builder.CreateInBoundsGEP(
+      ConvertType(QualType(RD->getTypeForDecl(), 0)), Res,
+      RecIndicesTy(llvm::reverse(Indices)), "..counted_by.gep");
 
   return Builder.CreateAlignedLoad(ConvertType(CountDecl->getType()), Res,
                                    getIntAlign(), "..counted_by.load");
@@ -1313,7 +1316,8 @@ static Address EmitPointerWithAlignment(const Expr *E, LValueBaseInfo *BaseInfo,
         if (CE->getCastKind() == CK_AddressSpaceConversion)
           Addr = CGF.Builder.CreateAddrSpaceCast(
               Addr, CGF.ConvertType(E->getType()), ElemTy);
-        return Addr;
+        return CGF.authPointerToPointerCast(Addr, CE->getSubExpr()->getType(),
+                                            CE->getType());
       }
       break;
 
@@ -3887,6 +3891,8 @@ llvm::CallInst *CodeGenFunction::EmitTrapCall(llvm::Intrinsic::ID IntrID) {
     TrapCall->addFnAttr(A);
   }
 
+  if (InNoMergeAttributedStmt)
+    TrapCall->addFnAttr(llvm::Attribute::NoMerge);
   return TrapCall;
 }
 
@@ -4124,25 +4130,25 @@ static Address emitArraySubscriptGEP(CodeGenFunction &CGF, Address addr,
 
 /// The offset of a field from the beginning of the record.
 static bool getFieldOffsetInBits(CodeGenFunction &CGF, const RecordDecl *RD,
-                                 const FieldDecl *FD, int64_t &Offset) {
+                                 const FieldDecl *Field, int64_t &Offset) {
   ASTContext &Ctx = CGF.getContext();
   const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
   unsigned FieldNo = 0;
 
-  for (const Decl *D : RD->decls()) {
-    if (const auto *Record = dyn_cast<RecordDecl>(D))
-      if (getFieldOffsetInBits(CGF, Record, FD, Offset)) {
-        Offset += Layout.getFieldOffset(FieldNo);
-        return true;
-      }
+  for (const FieldDecl *FD : RD->fields()) {
+    if (FD == Field) {
+      Offset += Layout.getFieldOffset(FieldNo);
+      return true;
+    }
 
-    if (const auto *Field = dyn_cast<FieldDecl>(D))
-      if (FD == Field) {
+    QualType Ty = FD->getType();
+    if (Ty->isRecordType())
+      if (getFieldOffsetInBits(CGF, Ty->getAsRecordDecl(), Field, Offset)) {
         Offset += Layout.getFieldOffset(FieldNo);
         return true;
       }
 
-    if (isa<FieldDecl>(D))
+    if (!RD->isUnion())
       ++FieldNo;
   }
 
@@ -5884,6 +5890,15 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
           CGM.getLLVMContext(), {PrefixSigType, Int32Ty}, /*isPacked=*/true);
 
       llvm::Value *CalleePtr = Callee.getFunctionPointer();
+      if (CGM.getCodeGenOpts().PointerAuth.FunctionPointers) {
+        // Use raw pointer since we are using the callee pointer as data here.
+        Address Addr =
+            Address(CalleePtr, CalleePtr->getType(),
+                    CharUnits::fromQuantity(
+                        CalleePtr->getPointerAlignment(CGM.getDataLayout())),
+                    Callee.getPointerAuthInfo(), nullptr);
+        CalleePtr = Addr.emitRawPointer(*this);
+      }
 
       // On 32-bit Arm, the low bit of a function pointer indicates whether
       // it's using the Arm or Thumb instruction set. The actual first
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index e50d3c5bd9951..d9f44f4be617e 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -131,14 +131,12 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
     EnsureDest(E->getType());
 
     if (llvm::Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) {
-      Address StoreDest = Dest.getAddress();
-      // The emitted value is guaranteed to have the same size as the
-      // destination but can have a different type. Just do a bitcast in this
-      // case to avoid incorrect GEPs.
-      if (Result->getType() != StoreDest.getType())
-        StoreDest = StoreDest.withElementType(Result->getType());
-      CGF.EmitAggregateStore(Result, StoreDest,
-                             E->getType().isVolatileQualified());
+      CGF.CreateCoercedStore(
+          Result, Dest.getAddress(),
+          llvm::TypeSize::getFixed(
+              Dest.getPreferredSize(CGF.getContext(), E->getType())
+                  .getQuantity()),
+          E->getType().isVolatileQualified());
       return;
     }
     return Visit(E->getSubExpr());
@@ -2049,6 +2047,10 @@ CodeGenFunction::getOverlapForFieldInit(const FieldDecl *FD) {
   if (!FD->hasAttr<NoUniqueAddressAttr>() || !FD->getType()->isRecordType())
     return AggValueSlot::DoesNotOverlap;
 
+  // Empty fields can overlap earlier fields.
+  if (FD->getType()->getAsCXXRecordDecl()->isEmpty())
+    return AggValueSlot::MayOverlap;
+
   // If the field lies entirely within the enclosing class's nvsize, its tail
   // padding cannot overlap any already-initialized object. (The only subobjects
   // with greater addresses that might already be initialized are vbases.)
@@ -2071,6 +2073,10 @@ AggValueSlot::Overlap_t CodeGenFunction::getOverlapForBaseInit(
   if (IsVirtual)
     return AggValueSlot::MayOverlap;
 
+  // Empty bases can overlap earlier bases.
+  if (BaseRD->isEmpty())
+    return AggValueSlot::MayOverlap;
+
   // If the base class is laid out entirely within the nvsize of the derived
   // class, its tail padding cannot yet be initialized, so we can issue
   // stores at the full width of the base class.
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 4d45f6d64c1cd..828a09856099a 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -649,7 +649,6 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
 
 ComplexPairTy ComplexExprEmitter::VisitUnaryPlus(const UnaryOperator *E,
                                                  QualType PromotionType) {
-  E->hasStoredFPFeatures();
   QualType promotionTy =
       PromotionType.isNull()
           ? getPromotionType(E->getStoredFPFeaturesOrDefault(),
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 7c65fccb60855..f22321f0e738a 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -814,8 +814,7 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD,
       llvm::Constant *VTableAddressPoint =
           CGM.getCXXABI().getVTableAddressPoint(BaseSubobject(CD, Offset),
                                                 VTableClass);
-      if (auto Authentication =
-              CGM.getVTablePointerAuthentication(VTableClass)) {
+      if (auto Authentication = CGM.getVTablePointerAuthentication(CD)) {
         VTableAddressPoint = Emitter.tryEmitConstantSignedPointer(
             VTableAddressPoint, *Authentication);
         if (!VTableAddressPoint)
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index e29d9b1f0b0e1..ef1f849814d45 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2400,7 +2400,9 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
       DestLV.setTBAAInfo(TBAAAccessInfo::getMayAliasInfo());
       return EmitLoadOfLValue(DestLV, CE->getExprLoc());
     }
-    return Builder.CreateBitCast(Src, DstTy);
+
+    llvm::Value *Result = Builder.CreateBitCast(Src, DstTy);
+    return CGF.authPointerToPointerCast(Result, E->getType(), DestTy);
   }
   case CK_AddressSpaceConversion: {
     Expr::EvalResult Result;
@@ -2550,6 +2552,8 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
       if (DestTy.mayBeDynamicClass())
         IntToPtr = Builder.CreateLaunderInvariantGroup(IntToPtr);
     }
+
+    IntToPtr = CGF.authPointerToPointerCast(IntToPtr, E->getType(), DestTy);
     return IntToPtr;
   }
   case CK_PointerToIntegral: {
@@ -2565,6 +2569,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
         PtrExpr = Builder.CreateStripInvariantGroup(PtrExpr);
     }
 
+    PtrExpr = CGF.authPointerToPointerCast(PtrExpr, E->getType(), DestTy);
     return Builder.CreatePtrToInt(PtrExpr, ConvertType(DestTy));
   }
   case CK_ToVoid: {
@@ -2862,9 +2867,10 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
           isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub;
       llvm::Value *amt = llvm::ConstantFP::get(
           VMContext, llvm::APFloat(static_cast<float>(1.0)));
-      llvm::Value *old =
-          Builder.CreateAtomicRMW(aop, LV.getAddress(), amt,
-                                  llvm::AtomicOrdering::SequentiallyConsistent);
+      llvm::AtomicRMWInst *old =
+          CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt,
+                                llvm::AtomicOrdering::SequentiallyConsistent);
+
       return isPre ? Builder.CreateBinOp(op, old, amt) : old;
     }
     value = EmitLoadOfLValue(LV, E->getExprLoc());
@@ -3604,9 +3610,9 @@ LValue ScalarExprEmitter::EmitCompoundAssignLValue(
             EmitScalarConversion(OpInfo.RHS, E->getRHS()->getType(), LHSTy,
                                  E->getExprLoc()),
             LHSTy);
-        Value *OldVal = Builder.CreateAtomicRMW(
-            AtomicOp, LHSLV.getAddress(), Amt,
-            llvm::AtomicOrdering::SequentiallyConsistent);
+
+        llvm::AtomicRMWInst *OldVal =
+            CGF.emitAtomicRMWInst(AtomicOp, LHSLV.getAddress(), Amt);
 
         // Since operation is atomic, the result type is guaranteed to be the
         // same as the input in LLVM terms.
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index 1937622618b6f..eb1cdacfd46a7 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -42,28 +42,6 @@ llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
       VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
 }
 
-llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
-  const char *Name = "__llvm_omp_vprintf";
-  llvm::Module &M = CGM.getModule();
-  llvm::Type *ArgTypes[] = {llvm::PointerType::getUnqual(M.getContext()),
-                            llvm::PointerType::getUnqual(M.getContext()),
-                            llvm::Type::getInt32Ty(M.getContext())};
-  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
-      llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
-
-  if (auto *F = M.getFunction(Name)) {
-    if (F->getFunctionType() != VprintfFuncType) {
-      CGM.Error(SourceLocation(),
-                "Invalid type declaration for __llvm_omp_vprintf");
-      return nullptr;
-    }
-    return F;
-  }
-
-  return llvm::Function::Create(
-      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
-}
-
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
 // isn't particularly special; it's invoked just like a regular function).
 // vprintf takes two args: A format string, and a pointer to a buffer containing
@@ -221,10 +199,3 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
   Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
   return RValue::get(Printf);
 }
-
-RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) {
-  assert(getTarget().getTriple().isNVPTX() ||
-         getTarget().getTriple().isAMDGCN());
-  return EmitDevicePrintfCallExpr(E, this, GetOpenMPVprintfDeclaration(CGM),
-                                  true);
-}
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 4036ce711bea1..8c067f4963955 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -74,6 +74,7 @@ class CGHLSLRuntime {
 
   GENERATE_HLSL_INTRINSIC_FUNCTION(All, all)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Any, any)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(Frac, frac)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Rsqrt, rsqrt)
   GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 9e0cf1ba8fb44..a5fd660d77ac9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3861,7 +3861,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
         HasIterator = true;
         continue;
       }
-      for (const Expr *E : C->varlists()) {
+      for (const Expr *E : C->varlist()) {
         llvm::Value *Addr;
         llvm::Value *Size;
         std::tie(Addr, Size) = getPointerAndSize(CGF, E);
@@ -3894,7 +3894,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
         continue;
       OMPIteratorGeneratorScope IteratorScope(
           CGF, cast_or_null<OMPIteratorExpr>(Modifier->IgnoreParenImpCasts()));
-      for (const Expr *E : C->varlists()) {
+      for (const Expr *E : C->varlist()) {
         llvm::Value *Addr;
         llvm::Value *Size;
         std::tie(Addr, Size) = getPointerAndSize(CGF, E);
@@ -6049,11 +6049,6 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
         MinTeamsVal = MaxTeamsVal = 0;
         return nullptr;
       }
-      if (isOpenMPParallelDirective(NestedDir->getDirectiveKind()) ||
-          isOpenMPSimdDirective(NestedDir->getDirectiveKind())) {
-        MinTeamsVal = MaxTeamsVal = 1;
-        return nullptr;
-      }
       MinTeamsVal = MaxTeamsVal = 1;
       return nullptr;
     }
@@ -8194,7 +8189,7 @@ class MappableExprsHandler {
       : CurDir(&Dir), CGF(CGF) {
     // Extract firstprivate clause information.
     for (const auto *C : Dir.getClausesOfKind<OMPFirstprivateClause>())
-      for (const auto *D : C->varlists())
+      for (const auto *D : C->varlist())
         FirstPrivateDecls.try_emplace(
             cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl()), C->isImplicit());
     // Extract implicit firstprivates from uses_allocators clauses.
@@ -8874,36 +8869,21 @@ emitMappingInformation(CodeGenFunction &CGF, llvm::OpenMPIRBuilder &OMPBuilder,
                                          PLoc.getLine(), PLoc.getColumn(),
                                          SrcLocStrSize);
 }
-
 /// Emit the arrays used to pass the captures and map information to the
 /// offloading runtime library. If there is no map or capture information,
 /// return nullptr by reference.
-static void emitOffloadingArrays(
+static void emitOffloadingArraysAndArgs(
     CodeGenFunction &CGF, MappableExprsHandler::MapCombinedInfoTy &CombinedInfo,
     CGOpenMPRuntime::TargetDataInfo &Info, llvm::OpenMPIRBuilder &OMPBuilder,
-    bool IsNonContiguous = false) {
+    bool IsNonContiguous = false, bool ForEndCall = false) {
   CodeGenModule &CGM = CGF.CGM;
 
-  // Reset the array information.
-  Info.clearArrayInfo();
-  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
-
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
                          CGF.AllocaInsertPt->getIterator());
   InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
 
-  auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) {
-    return emitMappingInformation(CGF, OMPBuilder, MapExpr);
-  };
-  if (CGM.getCodeGenOpts().getDebugInfo() !=
-      llvm::codegenoptions::NoDebugInfo) {
-    CombinedInfo.Names.resize(CombinedInfo.Exprs.size());
-    llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(),
-                    FillInfoMap);
-  }
-
   auto DeviceAddrCB = [&](unsigned int I, llvm::Value *NewDecl) {
     if (const ValueDecl *DevVD = CombinedInfo.DevicePtrDecls[I]) {
       Info.CaptureDeviceAddrMap.try_emplace(DevVD, NewDecl);
@@ -8914,14 +8894,14 @@ static void emitOffloadingArrays(
     llvm::Value *MFunc = nullptr;
     if (CombinedInfo.Mappers[I]) {
       Info.HasMapper = true;
-      MFunc = CGF.CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc(
+      MFunc = CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc(
           cast<OMPDeclareMapperDecl>(CombinedInfo.Mappers[I]));
     }
     return MFunc;
   };
-  OMPBuilder.emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
-                                  /*IsNonContiguous=*/true, DeviceAddrCB,
-                                  CustomMapperCB);
+  OMPBuilder.emitOffloadingArraysAndArgs(
+      AllocaIP, CodeGenIP, Info, Info.RTArgs, CombinedInfo, IsNonContiguous,
+      ForEndCall, DeviceAddrCB, CustomMapperCB);
 }
 
 /// Check for inner distribute directive.
@@ -9486,29 +9466,14 @@ llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
   }
   return DynCGroupMem;
 }
+static void genMapInfoForCaptures(
+    MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
+    const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+    llvm::OpenMPIRBuilder &OMPBuilder,
+    llvm::DenseSet<CanonicalDeclPtr<const Decl>> &MappedVarSet,
+    MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
 
-static void emitTargetCallKernelLaunch(
-    CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
-    const OMPExecutableDirective &D,
-    llvm::SmallVectorImpl<llvm::Value *> &CapturedVars, bool RequiresOuterTask,
-    const CapturedStmt &CS, bool OffloadingMandatory,
-    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
-    llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
-    llvm::Value *&MapTypesArray, llvm::Value *&MapNamesArray,
-    llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
-                                     const OMPLoopDirective &D)>
-        SizeEmitter,
-    CodeGenFunction &CGF, CodeGenModule &CGM) {
-  llvm::OpenMPIRBuilder &OMPBuilder = OMPRuntime->getOMPBuilder();
-
-  // Fill up the arrays with all the captured variables.
-  MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
-
-  // Get mappable expression information.
-  MappableExprsHandler MEHandler(D, CGF);
   llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;
-  llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
-
   auto RI = CS.getCapturedRecordDecl()->field_begin();
   auto *CV = CapturedVars.begin();
   for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
@@ -9575,18 +9540,64 @@ static void emitTargetCallKernelLaunch(
   MEHandler.adjustMemberOfForLambdaCaptures(
       OMPBuilder, LambdaPointers, CombinedInfo.BasePointers,
       CombinedInfo.Pointers, CombinedInfo.Types);
+}
+static void
+genMapInfo(MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
+           MappableExprsHandler::MapCombinedInfoTy &CombinedInfo,
+           llvm::OpenMPIRBuilder &OMPBuilder,
+           const llvm::DenseSet<CanonicalDeclPtr<const Decl>> &SkippedVarSet =
+               llvm::DenseSet<CanonicalDeclPtr<const Decl>>()) {
+
+  CodeGenModule &CGM = CGF.CGM;
   // Map any list items in a map clause that were not captures because they
   // weren't referenced within the construct.
-  MEHandler.generateAllInfo(CombinedInfo, OMPBuilder, MappedVarSet);
+  MEHandler.generateAllInfo(CombinedInfo, OMPBuilder, SkippedVarSet);
 
+  auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) {
+    return emitMappingInformation(CGF, OMPBuilder, MapExpr);
+  };
+  if (CGM.getCodeGenOpts().getDebugInfo() !=
+      llvm::codegenoptions::NoDebugInfo) {
+    CombinedInfo.Names.resize(CombinedInfo.Exprs.size());
+    llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(),
+                    FillInfoMap);
+  }
+}
+
+static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF,
+                       const CapturedStmt &CS,
+                       llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+                       llvm::OpenMPIRBuilder &OMPBuilder,
+                       MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
+  // Get mappable expression information.
+  MappableExprsHandler MEHandler(D, CGF);
+  llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
+
+  genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, OMPBuilder,
+                        MappedVarSet, CombinedInfo);
+  genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder, MappedVarSet);
+}
+static void emitTargetCallKernelLaunch(
+    CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
+    const OMPExecutableDirective &D,
+    llvm::SmallVectorImpl<llvm::Value *> &CapturedVars, bool RequiresOuterTask,
+    const CapturedStmt &CS, bool OffloadingMandatory,
+    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
+    llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
+    llvm::Value *&MapTypesArray, llvm::Value *&MapNamesArray,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
+                                     const OMPLoopDirective &D)>
+        SizeEmitter,
+    CodeGenFunction &CGF, CodeGenModule &CGM) {
+  llvm::OpenMPIRBuilder &OMPBuilder = OMPRuntime->getOMPBuilder();
+
+  // Fill up the arrays with all the captured variables.
+  MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
   CGOpenMPRuntime::TargetDataInfo Info;
-  // Fill up the arrays and create the arguments.
-  emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
-  bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
-                   llvm::codegenoptions::NoDebugInfo;
-  OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
-                                          EmitDebug,
-                                          /*ForEndCall=*/false);
+  genMapInfo(D, CGF, CS, CapturedVars, OMPBuilder, CombinedInfo);
+
+  emitOffloadingArraysAndArgs(CGF, CombinedInfo, Info, OMPBuilder,
+                              /*IsNonContiguous=*/true, /*ForEndCall=*/false);
 
   InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
   InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
@@ -10481,22 +10492,15 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
                                           PrePostActionTy &) {
     // Fill up the arrays with all the mapped variables.
     MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
-
-    // Get map clause information.
+    CGOpenMPRuntime::TargetDataInfo Info;
     MappableExprsHandler MEHandler(D, CGF);
-    MEHandler.generateAllInfo(CombinedInfo, OMPBuilder);
+    genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder);
+    emitOffloadingArraysAndArgs(CGF, CombinedInfo, Info, OMPBuilder,
+                                /*IsNonContiguous=*/true, /*ForEndCall=*/false);
 
-    CGOpenMPRuntime::TargetDataInfo Info;
-    // Fill up the arrays and create the arguments.
-    emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder,
-                         /*IsNonContiguous=*/true);
     bool RequiresOuterTask = D.hasClausesOfKind<OMPDependClause>() ||
                              D.hasClausesOfKind<OMPNowaitClause>();
-    bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
-                     llvm::codegenoptions::NoDebugInfo;
-    OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
-                                            EmitDebug,
-                                            /*ForEndCall=*/false);
+
     InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
     InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
                                           CGF.VoidPtrTy, CGM.getPointerAlign());
@@ -11504,7 +11508,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
   }
   // Exclude vars in private clauses.
   for (const auto *C : S.getClausesOfKind<OMPPrivateClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11514,7 +11518,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPFirstprivateClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11524,7 +11528,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPLastprivateClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11534,7 +11538,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11544,7 +11548,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPLinearClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11587,7 +11591,7 @@ CGOpenMPRuntime::LastprivateConditionalRAII::LastprivateConditionalRAII(
     if (C->getKind() != OMPC_LASTPRIVATE_conditional)
       continue;
 
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       Data.DeclToUniqueName.insert(std::make_pair(
           cast<DeclRefExpr>(Ref->IgnoreParenImpCasts())->getDecl(),
           SmallString<16>(generateUniqueName(CGM, "pl_cond", Ref))));
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index f5bd4a141cc2d..8965a14d88a6f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1695,7 +1695,8 @@ void CGOpenMPRuntimeGPU::emitReduction(
                          CGF.AllocaInsertPt->getIterator());
   InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
-  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
+      CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
   llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
 
   CodeGenFunction::OMPPrivateScope Scope(CGF);
diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp
index 621d567dde721..0c63b9d6bb7e7 100644
--- a/clang/lib/CodeGen/CGPointerAuth.cpp
+++ b/clang/lib/CodeGen/CGPointerAuth.cpp
@@ -15,6 +15,7 @@
 #include "CodeGenModule.h"
 #include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/SipHash.h"
 
 using namespace clang;
@@ -165,6 +166,128 @@ CGPointerAuthInfo CodeGenModule::getPointerAuthInfoForType(QualType T) {
   return ::getPointerAuthInfoForType(*this, T);
 }
 
+static bool isZeroConstant(const llvm::Value *Value) {
+  if (const auto *CI = dyn_cast<llvm::ConstantInt>(Value))
+    return CI->isZero();
+  return false;
+}
+
+static bool equalAuthPolicies(const CGPointerAuthInfo &Left,
+                              const CGPointerAuthInfo &Right) {
+  assert((Left.isSigned() || Right.isSigned()) &&
+         "shouldn't be called if neither is signed");
+  if (Left.isSigned() != Right.isSigned())
+    return false;
+  return Left.getKey() == Right.getKey() &&
+         Left.getAuthenticationMode() == Right.getAuthenticationMode();
+}
+
+// Return the discriminator or return zero if the discriminator is null.
+static llvm::Value *getDiscriminatorOrZero(const CGPointerAuthInfo &Info,
+                                           CGBuilderTy &Builder) {
+  llvm::Value *Discriminator = Info.getDiscriminator();
+  return Discriminator ? Discriminator : Builder.getSize(0);
+}
+
+llvm::Value *
+CodeGenFunction::emitPointerAuthResignCall(llvm::Value *Value,
+                                           const CGPointerAuthInfo &CurAuth,
+                                           const CGPointerAuthInfo &NewAuth) {
+  assert(CurAuth && NewAuth);
+
+  if (CurAuth.getAuthenticationMode() !=
+          PointerAuthenticationMode::SignAndAuth ||
+      NewAuth.getAuthenticationMode() !=
+          PointerAuthenticationMode::SignAndAuth) {
+    llvm::Value *AuthedValue = EmitPointerAuthAuth(CurAuth, Value);
+    return EmitPointerAuthSign(NewAuth, AuthedValue);
+  }
+  // Convert the pointer to intptr_t before signing it.
+  auto *OrigType = Value->getType();
+  Value = Builder.CreatePtrToInt(Value, IntPtrTy);
+
+  auto *CurKey = Builder.getInt32(CurAuth.getKey());
+  auto *NewKey = Builder.getInt32(NewAuth.getKey());
+
+  llvm::Value *CurDiscriminator = getDiscriminatorOrZero(CurAuth, Builder);
+  llvm::Value *NewDiscriminator = getDiscriminatorOrZero(NewAuth, Builder);
+
+  // call i64 @llvm.ptrauth.resign(i64 %pointer,
+  //                               i32 %curKey, i64 %curDiscriminator,
+  //                               i32 %newKey, i64 %newDiscriminator)
+  auto *Intrinsic = CGM.getIntrinsic(llvm::Intrinsic::ptrauth_resign);
+  Value = EmitRuntimeCall(
+      Intrinsic, {Value, CurKey, CurDiscriminator, NewKey, NewDiscriminator});
+
+  // Convert back to the original type.
+  Value = Builder.CreateIntToPtr(Value, OrigType);
+  return Value;
+}
+
+llvm::Value *CodeGenFunction::emitPointerAuthResign(
+    llvm::Value *Value, QualType Type, const CGPointerAuthInfo &CurAuthInfo,
+    const CGPointerAuthInfo &NewAuthInfo, bool IsKnownNonNull) {
+  // Fast path: if neither schema wants a signature, we're done.
+  if (!CurAuthInfo && !NewAuthInfo)
+    return Value;
+
+  llvm::Value *Null = nullptr;
+  // If the value is obviously null, we're done.
+  if (auto *PointerValue = dyn_cast<llvm::PointerType>(Value->getType())) {
+    Null = CGM.getNullPointer(PointerValue, Type);
+  } else {
+    assert(Value->getType()->isIntegerTy());
+    Null = llvm::ConstantInt::get(IntPtrTy, 0);
+  }
+  if (Value == Null)
+    return Value;
+
+  // If both schemas sign the same way, we're done.
+  if (equalAuthPolicies(CurAuthInfo, NewAuthInfo)) {
+    const llvm::Value *CurD = CurAuthInfo.getDiscriminator();
+    const llvm::Value *NewD = NewAuthInfo.getDiscriminator();
+    if (CurD == NewD)
+      return Value;
+
+    if ((CurD == nullptr && isZeroConstant(NewD)) ||
+        (NewD == nullptr && isZeroConstant(CurD)))
+      return Value;
+  }
+
+  llvm::BasicBlock *InitBB = Builder.GetInsertBlock();
+  llvm::BasicBlock *ResignBB = nullptr, *ContBB = nullptr;
+
+  // Null pointers have to be mapped to null, and the ptrauth_resign
+  // intrinsic doesn't do that.
+  if (!IsKnownNonNull && !llvm::isKnownNonZero(Value, CGM.getDataLayout())) {
+    ContBB = createBasicBlock("resign.cont");
+    ResignBB = createBasicBlock("resign.nonnull");
+
+    auto *IsNonNull = Builder.CreateICmpNE(Value, Null);
+    Builder.CreateCondBr(IsNonNull, ResignBB, ContBB);
+    EmitBlock(ResignBB);
+  }
+
+  // Perform the auth/sign/resign operation.
+  if (!NewAuthInfo)
+    Value = EmitPointerAuthAuth(CurAuthInfo, Value);
+  else if (!CurAuthInfo)
+    Value = EmitPointerAuthSign(NewAuthInfo, Value);
+  else
+    Value = emitPointerAuthResignCall(Value, CurAuthInfo, NewAuthInfo);
+
+  // Clean up with a phi if we branched before.
+  if (ContBB) {
+    EmitBlock(ContBB);
+    auto *Phi = Builder.CreatePHI(Value->getType(), 2);
+    Phi->addIncoming(Null, InitBB);
+    Phi->addIncoming(Value, ResignBB);
+    Value = Phi;
+  }
+
+  return Value;
+}
+
 llvm::Constant *
 CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key,
                                         llvm::Constant *StorageAddress,
@@ -242,6 +365,40 @@ llvm::Constant *CodeGenModule::getFunctionPointer(GlobalDecl GD,
   return getFunctionPointer(getRawFunctionPointer(GD, Ty), FuncType);
 }
 
+CGPointerAuthInfo CodeGenModule::getMemberFunctionPointerAuthInfo(QualType FT) {
+  assert(FT->getAs<MemberPointerType>() && "MemberPointerType expected");
+  const auto &Schema = getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers;
+  if (!Schema)
+    return CGPointerAuthInfo();
+
+  assert(!Schema.isAddressDiscriminated() &&
+         "function pointers cannot use address-specific discrimination");
+
+  llvm::ConstantInt *Discriminator =
+      getPointerAuthOtherDiscriminator(Schema, GlobalDecl(), FT);
+  return CGPointerAuthInfo(Schema.getKey(), Schema.getAuthenticationMode(),
+                           /* IsIsaPointer */ false,
+                           /* AuthenticatesNullValues */ false, Discriminator);
+}
+
+llvm::Constant *CodeGenModule::getMemberFunctionPointer(llvm::Constant *Pointer,
+                                                        QualType FT) {
+  if (CGPointerAuthInfo PointerAuth = getMemberFunctionPointerAuthInfo(FT))
+    return getConstantSignedPointer(
+        Pointer, PointerAuth.getKey(), nullptr,
+        cast_or_null<llvm::ConstantInt>(PointerAuth.getDiscriminator()));
+
+  return Pointer;
+}
+
+llvm::Constant *CodeGenModule::getMemberFunctionPointer(const FunctionDecl *FD,
+                                                        llvm::Type *Ty) {
+  QualType FT = FD->getType();
+  FT = getContext().getMemberPointerType(
+      FT, cast<CXXMethodDecl>(FD)->getParent()->getTypeForDecl());
+  return getMemberFunctionPointer(getRawFunctionPointer(FD, Ty), FT);
+}
+
 std::optional<PointerAuthQualifier>
 CodeGenModule::computeVTPointerAuthentication(const CXXRecordDecl *ThisClass) {
   auto DefaultAuthentication = getCodeGenOpts().PointerAuth.CXXVTablePointers;
@@ -351,3 +508,114 @@ CodeGenModule::getVTablePointerAuthInfo(CodeGenFunction *CGF,
                            /* IsIsaPointer */ false,
                            /* AuthenticatesNullValues */ false, Discriminator);
 }
+
+llvm::Value *CodeGenFunction::authPointerToPointerCast(llvm::Value *ResultPtr,
+                                                       QualType SourceType,
+                                                       QualType DestType) {
+  CGPointerAuthInfo CurAuthInfo, NewAuthInfo;
+  if (SourceType->isSignableType())
+    CurAuthInfo = getPointerAuthInfoForType(CGM, SourceType);
+
+  if (DestType->isSignableType())
+    NewAuthInfo = getPointerAuthInfoForType(CGM, DestType);
+
+  if (!CurAuthInfo && !NewAuthInfo)
+    return ResultPtr;
+
+  // If only one side of the cast is a function pointer, then we still need to
+  // resign to handle casts to/from opaque pointers.
+  if (!CurAuthInfo && DestType->isFunctionPointerType())
+    CurAuthInfo = CGM.getFunctionPointerAuthInfo(SourceType);
+
+  if (!NewAuthInfo && SourceType->isFunctionPointerType())
+    NewAuthInfo = CGM.getFunctionPointerAuthInfo(DestType);
+
+  return emitPointerAuthResign(ResultPtr, DestType, CurAuthInfo, NewAuthInfo,
+                               /*IsKnownNonNull=*/false);
+}
+
+Address CodeGenFunction::authPointerToPointerCast(Address Ptr,
+                                                  QualType SourceType,
+                                                  QualType DestType) {
+  CGPointerAuthInfo CurAuthInfo, NewAuthInfo;
+  if (SourceType->isSignableType())
+    CurAuthInfo = getPointerAuthInfoForType(CGM, SourceType);
+
+  if (DestType->isSignableType())
+    NewAuthInfo = getPointerAuthInfoForType(CGM, DestType);
+
+  if (!CurAuthInfo && !NewAuthInfo)
+    return Ptr;
+
+  if (!CurAuthInfo && DestType->isFunctionPointerType()) {
+    // When casting a non-signed pointer to a function pointer, just set the
+    // auth info on Ptr to the assumed schema. The pointer will be resigned to
+    // the effective type when used.
+    Ptr.setPointerAuthInfo(CGM.getFunctionPointerAuthInfo(SourceType));
+    return Ptr;
+  }
+
+  if (!NewAuthInfo && SourceType->isFunctionPointerType()) {
+    NewAuthInfo = CGM.getFunctionPointerAuthInfo(DestType);
+    Ptr = Ptr.getResignedAddress(NewAuthInfo, *this);
+    Ptr.setPointerAuthInfo(CGPointerAuthInfo());
+    return Ptr;
+  }
+
+  return Ptr;
+}
+
+Address CodeGenFunction::getAsNaturalAddressOf(Address Addr,
+                                               QualType PointeeTy) {
+  CGPointerAuthInfo Info =
+      PointeeTy.isNull() ? CGPointerAuthInfo()
+                         : CGM.getPointerAuthInfoForPointeeType(PointeeTy);
+  return Addr.getResignedAddress(Info, *this);
+}
+
+Address Address::getResignedAddress(const CGPointerAuthInfo &NewInfo,
+                                    CodeGenFunction &CGF) const {
+  assert(isValid() && "pointer isn't valid");
+  CGPointerAuthInfo CurInfo = getPointerAuthInfo();
+  llvm::Value *Val;
+
+  // Nothing to do if neither the current or the new ptrauth info needs signing.
+  if (!CurInfo.isSigned() && !NewInfo.isSigned())
+    return Address(getBasePointer(), getElementType(), getAlignment(),
+                   isKnownNonNull());
+
+  assert(ElementType && "Effective type has to be set");
+  assert(!Offset && "unexpected non-null offset");
+
+  // If the current and the new ptrauth infos are the same and the offset is
+  // null, just cast the base pointer to the effective type.
+  if (CurInfo == NewInfo && !hasOffset())
+    Val = getBasePointer();
+  else
+    Val = CGF.emitPointerAuthResign(getBasePointer(), QualType(), CurInfo,
+                                    NewInfo, isKnownNonNull());
+
+  Val = CGF.Builder.CreateBitCast(Val, getType());
+  return Address(Val, getElementType(), getAlignment(), NewInfo,
+                 /*Offset=*/nullptr, isKnownNonNull());
+}
+
+llvm::Value *Address::emitRawPointerSlow(CodeGenFunction &CGF) const {
+  return CGF.getAsNaturalPointerTo(*this, QualType());
+}
+
+llvm::Value *LValue::getPointer(CodeGenFunction &CGF) const {
+  assert(isSimple());
+  return emitResignedPointer(getType(), CGF);
+}
+
+llvm::Value *LValue::emitResignedPointer(QualType PointeeTy,
+                                         CodeGenFunction &CGF) const {
+  assert(isSimple());
+  return CGF.getAsNaturalAddressOf(Addr, PointeeTy).getBasePointer();
+}
+
+llvm::Value *LValue::emitRawPointer(CodeGenFunction &CGF) const {
+  assert(isSimple());
+  return Addr.isValid() ? Addr.emitRawPointer(CGF) : nullptr;
+}
diff --git a/clang/lib/CodeGen/CGRecordLayout.h b/clang/lib/CodeGen/CGRecordLayout.h
index 6c06ad20fbe56..44e888c93108f 100644
--- a/clang/lib/CodeGen/CGRecordLayout.h
+++ b/clang/lib/CodeGen/CGRecordLayout.h
@@ -193,6 +193,10 @@ class CGRecordLayout {
     return IsZeroInitializableAsBase;
   }
 
+  bool containsFieldDecl(const FieldDecl *FD) const {
+    return FieldInfo.count(FD) != 0;
+  }
+
   /// Return llvm::StructType element number that corresponds to the
   /// field FD.
   unsigned getLLVMFieldNo(const FieldDecl *FD) const {
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 2e65e9fd26099..30b6fce5d016a 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -222,6 +222,12 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
   case Stmt::OMPUnrollDirectiveClass:
     EmitOMPUnrollDirective(cast<OMPUnrollDirective>(*S));
     break;
+  case Stmt::OMPReverseDirectiveClass:
+    EmitOMPReverseDirective(cast<OMPReverseDirective>(*S));
+    break;
+  case Stmt::OMPInterchangeDirectiveClass:
+    EmitOMPInterchangeDirective(cast<OMPInterchangeDirective>(*S));
+    break;
   case Stmt::OMPForDirectiveClass:
     EmitOMPForDirective(cast<OMPForDirective>(*S));
     break;
@@ -717,6 +723,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   bool nomerge = false;
   bool noinline = false;
   bool alwaysinline = false;
+  bool noconvergent = false;
   const CallExpr *musttail = nullptr;
 
   for (const auto *A : S.getAttrs()) {
@@ -732,6 +739,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
     case attr::AlwaysInline:
       alwaysinline = true;
       break;
+    case attr::NoConvergent:
+      noconvergent = true;
+      break;
     case attr::MustTail: {
       const Stmt *Sub = S.getSubStmt();
       const ReturnStmt *R = cast<ReturnStmt>(Sub);
@@ -750,6 +760,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge);
   SaveAndRestore save_noinline(InNoInlineAttributedStmt, noinline);
   SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline);
+  SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent);
   SaveAndRestore save_musttail(MustTailCall, musttail);
   EmitStmt(S.getSubStmt(), S.getAttrs());
 }
@@ -2459,7 +2470,8 @@ static llvm::MDNode *getAsmSrcLocInfo(const StringLiteral *Str,
 
 static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                               bool HasUnwindClobber, bool ReadOnly,
-                              bool ReadNone, bool NoMerge, const AsmStmt &S,
+                              bool ReadNone, bool NoMerge, bool NoConvergent,
+                              const AsmStmt &S,
                               const std::vector<llvm::Type *> &ResultRegTypes,
                               const std::vector<llvm::Type *> &ArgElemTypes,
                               CodeGenFunction &CGF,
@@ -2500,11 +2512,11 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                                          llvm::ConstantAsMetadata::get(Loc)));
   }
 
-  if (CGF.getLangOpts().assumeFunctionsAreConvergent())
+  if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
     // Conservatively, mark all inline asm blocks in CUDA or OpenCL as
     // convergent (meaning, they may call an intrinsically convergent op, such
     // as bar.sync, and so can't have certain optimizations applied around
-    // them).
+    // them) unless it's explicitly marked 'noconvergent'.
     Result.addFnAttr(llvm::Attribute::Convergent);
   // Extract all of the register value results from the asm.
   if (ResultRegTypes.size() == 1) {
@@ -2745,7 +2757,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
 
       if (RequiresCast) {
         unsigned Size = getContext().getTypeSize(QTy);
-        Ty = llvm::IntegerType::get(getLLVMContext(), Size);
+        if (Size)
+          Ty = llvm::IntegerType::get(getLLVMContext(), Size);
+        else
+          CGM.Error(OutExpr->getExprLoc(), "output size should not be zero");
       }
       ResultRegTypes.push_back(Ty);
       // If this output is tied to an input, and if the input is larger, then
@@ -3031,9 +3046,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   if (IsGCCAsmGoto) {
     CBR = Builder.CreateCallBr(IA, Fallthrough, Transfer, Args);
     EmitBlock(Fallthrough);
-    UpdateAsmCallInst(*CBR, HasSideEffect, false, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*CBR, HasSideEffect, /*HasUnwindClobber=*/false, ReadOnly,
+                      ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
     // Because we are emitting code top to bottom, we don't have enough
     // information at this point to know precisely whether we have a critical
     // edge. If we have outputs, split all indirect destinations.
@@ -3061,15 +3077,17 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     }
   } else if (HasUnwindClobber) {
     llvm::CallBase *Result = EmitCallOrInvoke(IA, Args, "");
-    UpdateAsmCallInst(*Result, HasSideEffect, true, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/true,
+                      ReadOnly, ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
   } else {
     llvm::CallInst *Result =
         Builder.CreateCall(IA, Args, getBundlesForFunclet(IA));
-    UpdateAsmCallInst(*Result, HasSideEffect, false, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/false,
+                      ReadOnly, ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
   }
 
   EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes,
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 08b569352fe81..1673508cfdfaa 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -44,6 +44,8 @@ using namespace llvm::omp;
 #define TTL_CODEGEN_TYPE "target-teams-loop-codegen"
 
 static const VarDecl *getBaseDecl(const Expr *Ref);
+static OpenMPDirectiveKind
+getEffectiveDirectiveKind(const OMPExecutableDirective &S);
 
 namespace {
 /// Lexical scope for OpenMP executable constructs, that handles correct codegen
@@ -111,10 +113,10 @@ class OMPLexicalScope : public CodeGenFunction::LexicalScope {
 /// for captured expressions.
 class OMPParallelScope final : public OMPLexicalScope {
   bool EmitPreInitStmt(const OMPExecutableDirective &S) {
-    OpenMPDirectiveKind Kind = S.getDirectiveKind();
-    return !(isOpenMPTargetExecutionDirective(Kind) ||
-             isOpenMPLoopBoundSharingDirective(Kind)) &&
-           isOpenMPParallelDirective(Kind);
+    OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+    return !(isOpenMPTargetExecutionDirective(EKind) ||
+             isOpenMPLoopBoundSharingDirective(EKind)) &&
+           isOpenMPParallelDirective(EKind);
   }
 
 public:
@@ -127,9 +129,9 @@ class OMPParallelScope final : public OMPLexicalScope {
 /// for captured expressions.
 class OMPTeamsScope final : public OMPLexicalScope {
   bool EmitPreInitStmt(const OMPExecutableDirective &S) {
-    OpenMPDirectiveKind Kind = S.getDirectiveKind();
-    return !isOpenMPTargetExecutionDirective(Kind) &&
-           isOpenMPTeamsDirective(Kind);
+    OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+    return !isOpenMPTargetExecutionDirective(EKind) &&
+           isOpenMPTeamsDirective(EKind);
   }
 
 public:
@@ -154,7 +156,7 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
       }
       // Mark private vars as undefs.
       for (const auto *C : LD->getClausesOfKind<OMPPrivateClause>()) {
-        for (const Expr *IRef : C->varlists()) {
+        for (const Expr *IRef : C->varlist()) {
           const auto *OrigVD =
               cast<VarDecl>(cast<DeclRefExpr>(IRef)->getDecl());
           if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) {
@@ -187,6 +189,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
       PreInits = Tile->getPreInits();
     } else if (const auto *Unroll = dyn_cast<OMPUnrollDirective>(&S)) {
       PreInits = Unroll->getPreInits();
+    } else if (const auto *Reverse = dyn_cast<OMPReverseDirective>(&S)) {
+      PreInits = Reverse->getPreInits();
+    } else if (const auto *Interchange =
+                   dyn_cast<OMPInterchangeDirective>(&S)) {
+      PreInits = Interchange->getPreInits();
     } else {
       llvm_unreachable("Unknown loop-based directive kind.");
     }
@@ -250,20 +257,20 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
           }
         }
       } else if (const auto *UDP = dyn_cast<OMPUseDevicePtrClause>(C)) {
-        for (const Expr *E : UDP->varlists()) {
+        for (const Expr *E : UDP->varlist()) {
           const Decl *D = cast<DeclRefExpr>(E)->getDecl();
           if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
             CGF.EmitVarDecl(*OED);
         }
       } else if (const auto *UDP = dyn_cast<OMPUseDeviceAddrClause>(C)) {
-        for (const Expr *E : UDP->varlists()) {
+        for (const Expr *E : UDP->varlist()) {
           const Decl *D = getBaseDecl(E);
           if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
             CGF.EmitVarDecl(*OED);
         }
       }
     }
-    if (!isOpenMPSimdDirective(S.getDirectiveKind()))
+    if (!isOpenMPSimdDirective(getEffectiveDirectiveKind(S)))
       CGF.EmitOMPPrivateClause(S, InlinedShareds);
     if (const auto *TG = dyn_cast<OMPTaskgroupDirective>(&S)) {
       if (const Expr *E = TG->getReductionRef())
@@ -304,6 +311,30 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
 
 } // namespace
 
+// The loop directive with a bind clause will be mapped to a different
+// directive with corresponding semantics.
+static OpenMPDirectiveKind
+getEffectiveDirectiveKind(const OMPExecutableDirective &S) {
+  OpenMPDirectiveKind Kind = S.getDirectiveKind();
+  if (Kind != OMPD_loop)
+    return Kind;
+
+  OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
+  if (const auto *C = S.getSingleClause<OMPBindClause>())
+    BindKind = C->getBindKind();
+
+  switch (BindKind) {
+  case OMPC_BIND_parallel:
+    return OMPD_for;
+  case OMPC_BIND_teams:
+    return OMPD_distribute;
+  case OMPC_BIND_thread:
+    return OMPD_simd;
+  default:
+    return OMPD_loop;
+  }
+}
+
 static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
                                          const OMPExecutableDirective &S,
                                          const RegionCodeGenTy &CodeGen);
@@ -639,27 +670,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
   // Build the argument list.
   bool NeedWrapperFunction =
       getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
-  FunctionArgList Args;
-  llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
-  llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
+  FunctionArgList Args, WrapperArgs;
+  llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
+      WrapperLocalAddrs;
+  llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
+      WrapperVLASizes;
   SmallString<256> Buffer;
   llvm::raw_svector_ostream Out(Buffer);
   Out << CapturedStmtInfo->getHelperName();
-  if (NeedWrapperFunction)
+
+  CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
+  llvm::Function *WrapperF = nullptr;
+  if (NeedWrapperFunction) {
+    // Emit the final kernel early to allow attributes to be added by the
+    // OpenMPI-IR-Builder.
+    FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
+                              /*RegisterCastedArgsOnly=*/true,
+                              CapturedStmtInfo->getHelperName(), Loc);
+    WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
+    WrapperF =
+        emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
+                                     WrapperCGF.CXXThisValue, WrapperFO);
     Out << "_debug__";
+  }
   FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
                      Out.str(), Loc);
-  llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
-                                                   VLASizes, CXXThisValue, FO);
+  llvm::Function *F = emitOutlinedFunctionPrologue(
+      *this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
   CodeGenFunction::OMPPrivateScope LocalScope(*this);
-  for (const auto &LocalAddrPair : LocalAddrs) {
+  for (const auto &LocalAddrPair : WrapperLocalAddrs) {
     if (LocalAddrPair.second.first) {
       LocalScope.addPrivate(LocalAddrPair.second.first,
                             LocalAddrPair.second.second);
     }
   }
   (void)LocalScope.Privatize();
-  for (const auto &VLASizePair : VLASizes)
+  for (const auto &VLASizePair : WrapperVLASizes)
     VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
   PGO.assignRegionCounters(GlobalDecl(CD), F);
   CapturedStmtInfo->EmitBody(*this, CD->getBody());
@@ -668,17 +714,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
   if (!NeedWrapperFunction)
     return F;
 
-  FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
-                            /*RegisterCastedArgsOnly=*/true,
-                            CapturedStmtInfo->getHelperName(), Loc);
-  CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
-  WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
-  Args.clear();
-  LocalAddrs.clear();
-  VLASizes.clear();
-  llvm::Function *WrapperF =
-      emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
-                                   WrapperCGF.CXXThisValue, WrapperFO);
+  // Reverse the order.
+  WrapperF->removeFromParent();
+  F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);
+
   llvm::SmallVector<llvm::Value *, 4> CallArgs;
   auto *PI = F->arg_begin();
   for (const auto *Arg : Args) {
@@ -820,20 +859,20 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
                                                 OMPPrivateScope &PrivateScope) {
   if (!HaveInsertPoint())
     return false;
-  bool DeviceConstTarget =
-      getLangOpts().OpenMPIsTargetDevice &&
-      isOpenMPTargetExecutionDirective(D.getDirectiveKind());
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
+  bool DeviceConstTarget = getLangOpts().OpenMPIsTargetDevice &&
+                           isOpenMPTargetExecutionDirective(EKind);
   bool FirstprivateIsLastprivate = false;
   llvm::DenseMap<const VarDecl *, OpenMPLastprivateModifier> Lastprivates;
   for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
-    for (const auto *D : C->varlists())
+    for (const auto *D : C->varlist())
       Lastprivates.try_emplace(
           cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl(),
           C->getKind());
   }
   llvm::DenseSet<const VarDecl *> EmittedAsFirstprivate;
   llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
-  getOpenMPCaptureRegions(CaptureRegions, D.getDirectiveKind());
+  getOpenMPCaptureRegions(CaptureRegions, EKind);
   // Force emission of the firstprivate copy if the directive does not emit
   // outlined function, like omp for, omp simd, omp distribute etc.
   bool MustEmitFirstprivateCopy =
@@ -1062,8 +1101,9 @@ bool CodeGenFunction::EmitOMPLastprivateClauseInit(
   if (!HaveInsertPoint())
     return false;
   bool HasAtLeastOneLastprivate = false;
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
   llvm::DenseSet<const VarDecl *> SIMDLCVs;
-  if (isOpenMPSimdDirective(D.getDirectiveKind())) {
+  if (isOpenMPSimdDirective(EKind)) {
     const auto *LoopDirective = cast<OMPLoopDirective>(&D);
     for (const Expr *C : LoopDirective->counters()) {
       SIMDLCVs.insert(
@@ -1073,8 +1113,7 @@ bool CodeGenFunction::EmitOMPLastprivateClauseInit(
   llvm::DenseSet<const VarDecl *> AlreadyEmittedVars;
   for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
     HasAtLeastOneLastprivate = true;
-    if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
-        !getLangOpts().OpenMPSimd)
+    if (isOpenMPTaskLoopDirective(EKind) && !getLangOpts().OpenMPSimd)
       break;
     const auto *IRef = C->varlist_begin();
     const auto *IDestRef = C->destination_exprs().begin();
@@ -1307,13 +1346,13 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     ++Count;
   }
   if (!Data.ReductionVars.empty()) {
+    OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
     Data.IsReductionWithTaskMod = true;
-    Data.IsWorksharingReduction =
-        isOpenMPWorksharingDirective(D.getDirectiveKind());
+    Data.IsWorksharingReduction = isOpenMPWorksharingDirective(EKind);
     llvm::Value *ReductionDesc = CGM.getOpenMPRuntime().emitTaskReductionInit(
         *this, D.getBeginLoc(), TaskLHSs, TaskRHSs, Data);
     const Expr *TaskRedRef = nullptr;
-    switch (D.getDirectiveKind()) {
+    switch (EKind) {
     case OMPD_parallel:
       TaskRedRef = cast<OMPParallelDirective>(D).getTaskReductionRefExpr();
       break;
@@ -1444,16 +1483,16 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
         IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
   }
   if (HasAtLeastOneReduction) {
+    OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
     if (IsReductionWithTaskMod) {
       CGM.getOpenMPRuntime().emitTaskReductionFini(
-          *this, D.getBeginLoc(),
-          isOpenMPWorksharingDirective(D.getDirectiveKind()));
+          *this, D.getBeginLoc(), isOpenMPWorksharingDirective(EKind));
     }
     bool TeamsLoopCanBeParallel = false;
     if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
       TeamsLoopCanBeParallel = TTLD->canBeParallelFor();
     bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
-                      isOpenMPParallelDirective(D.getDirectiveKind()) ||
+                      isOpenMPParallelDirective(EKind) ||
                       TeamsLoopCanBeParallel || ReductionKind == OMPD_simd;
     bool SimpleReduction = ReductionKind == OMPD_simd;
     // Emit nowait reduction if nowait clause is present or directive is a
@@ -1506,7 +1545,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
     return;
   llvm::DenseSet<CanonicalDeclPtr<const VarDecl>> PrivateDecls;
   for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1517,7 +1556,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPLastprivateClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1528,7 +1567,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
     }
   }
   for (const auto *C : S.getClausesOfKind<OMPLinearClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1543,7 +1582,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
   // Firstprivates do not return value but may be passed by reference - no need
   // to check for updated lastprivate conditional.
   for (const auto *C : S.getClausesOfKind<OMPFirstprivateClause>()) {
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       if (!Ref->getType()->isScalarType())
         continue;
       const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1910,7 +1949,8 @@ void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
   // Update the linear variables.
   // In distribute directives only loop counters may be marked as linear, no
   // need to generate the code for them.
-  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
+  if (!isOpenMPDistributeDirective(EKind)) {
     for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
       for (const Expr *UE : C->updates())
         EmitIgnoredExpr(UE);
@@ -1944,7 +1984,7 @@ void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
     OMPAfterScanBlock = createBasicBlock("omp.after.scan.bb");
     // No need to allocate inscan exit block, in simd mode it is selected in the
     // codegen for the scan directive.
-    if (D.getDirectiveKind() != OMPD_simd && !getLangOpts().OpenMPSimd)
+    if (EKind != OMPD_simd && !getLangOpts().OpenMPSimd)
       OMPScanExitBlock = createBasicBlock("omp.exit.inscan.bb");
     OMPScanDispatch = createBasicBlock("omp.inscan.dispatch");
     EmitBranch(OMPScanDispatch);
@@ -2248,7 +2288,7 @@ static void emitAlignedClause(CodeGenFunction &CGF,
           cast<llvm::ConstantInt>(CGF.EmitScalarExpr(AlignmentExpr));
       ClauseAlignment = AlignmentCI->getValue();
     }
-    for (const Expr *E : Clause->varlists()) {
+    for (const Expr *E : Clause->varlist()) {
       llvm::APInt Alignment(ClauseAlignment);
       if (Alignment == 0) {
         // OpenMP [2.8.1, Description]
@@ -2357,7 +2397,8 @@ void CodeGenFunction::EmitOMPLinearClause(
   if (!HaveInsertPoint())
     return;
   llvm::DenseSet<const VarDecl *> SIMDLCVs;
-  if (isOpenMPSimdDirective(D.getDirectiveKind())) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
+  if (isOpenMPSimdDirective(EKind)) {
     const auto *LoopDirective = cast<OMPLoopDirective>(&D);
     for (const Expr *C : LoopDirective->counters()) {
       SIMDLCVs.insert(
@@ -2366,7 +2407,7 @@ void CodeGenFunction::EmitOMPLinearClause(
   }
   for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
     auto CurPrivate = C->privates().begin();
-    for (const Expr *E : C->varlists()) {
+    for (const Expr *E : C->varlist()) {
       const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
       const auto *PrivateVD =
           cast<VarDecl>(cast<DeclRefExpr>(*CurPrivate)->getDecl());
@@ -2419,9 +2460,9 @@ void CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective &D) {
   if (const auto *C = D.getSingleClause<OMPOrderClause>())
     if (C->getKind() == OMPC_ORDER_concurrent)
       LoopStack.setParallel(/*Enable=*/true);
-  if ((D.getDirectiveKind() == OMPD_simd ||
-       (getLangOpts().OpenMPSimd &&
-        isOpenMPSimdDirective(D.getDirectiveKind()))) &&
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D);
+  if ((EKind == OMPD_simd ||
+       (getLangOpts().OpenMPSimd && isOpenMPSimdDirective(EKind))) &&
       llvm::any_of(D.getClausesOfKind<OMPReductionClause>(),
                    [](const OMPReductionClause *C) {
                      return C->getModifier() == OMPC_REDUCTION_inscan;
@@ -2508,7 +2549,8 @@ static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S,
     BodyCodeGen(CGF);
   };
   const Expr *IfCond = nullptr;
-  if (isOpenMPSimdDirective(S.getDirectiveKind())) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+  if (isOpenMPSimdDirective(EKind)) {
     for (const auto *C : S.getClausesOfKind<OMPIfClause>()) {
       if (CGF.getLangOpts().OpenMP >= 50 &&
           (C->getNameModifier() == OMPD_unknown ||
@@ -2529,21 +2571,24 @@ static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S,
 static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S,
                               PrePostActionTy &Action) {
   Action.Enter(CGF);
-  assert(isOpenMPSimdDirective(S.getDirectiveKind()) &&
-         "Expected simd directive");
   OMPLoopScope PreInitScope(CGF, S);
   // if (PreCond) {
   //   for (IV in 0..LastIteration) BODY;
   //   <Final counter/linear vars updates>;
   // }
-  //
-  if (isOpenMPDistributeDirective(S.getDirectiveKind()) ||
-      isOpenMPWorksharingDirective(S.getDirectiveKind()) ||
-      isOpenMPTaskLoopDirective(S.getDirectiveKind())) {
+
+  // The presence of lower/upper bound variable depends on the actual directive
+  // kind in the AST node. The variables must be emitted because some of the
+  // expressions associated with the loop will use them.
+  OpenMPDirectiveKind DKind = S.getDirectiveKind();
+  if (isOpenMPDistributeDirective(DKind) ||
+      isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
+      isOpenMPGenericLoopDirective(DKind)) {
     (void)EmitOMPHelperVar(CGF, cast<DeclRefExpr>(S.getLowerBoundVariable()));
     (void)EmitOMPHelperVar(CGF, cast<DeclRefExpr>(S.getUpperBoundVariable()));
   }
 
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
   // Emit: if (PreCond) - begin.
   // If the condition constant folds and can be elided, avoid emitting the
   // whole loop.
@@ -2588,7 +2633,7 @@ static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S,
         CGF, S, CGF.EmitLValue(S.getIterationVariable()));
     bool HasLastprivateClause = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
     (void)LoopScope.Privatize();
-    if (isOpenMPTargetExecutionDirective(S.getDirectiveKind()))
+    if (isOpenMPTargetExecutionDirective(EKind))
       CGF.CGM.getOpenMPRuntime().adjustTargetSpecificDataForLambdas(CGF, S);
 
     emitCommonSimdLoop(
@@ -2622,7 +2667,9 @@ static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S,
   }
 }
 
-static bool isSupportedByOpenMPIRBuilder(const OMPSimdDirective &S) {
+// Pass OMPLoopDirective (instead of OMPSimdDirective) to make this function
+// available for "loop bind(thread)", which maps to "simd".
+static bool isSimdSupportedByOpenMPIRBuilder(const OMPLoopDirective &S) {
   // Check for unsupported clauses
   for (OMPClause *C : S.clauses()) {
     // Currently only order, simdlen and safelen clauses are supported
@@ -2653,8 +2700,9 @@ static bool isSupportedByOpenMPIRBuilder(const OMPSimdDirective &S) {
   }
   return true;
 }
+
 static llvm::MapVector<llvm::Value *, llvm::Value *>
-GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
+GetAlignedMapping(const OMPLoopDirective &S, CodeGenFunction &CGF) {
   llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars;
   for (const auto *Clause : S.getClausesOfKind<OMPAlignedClause>()) {
     llvm::APInt ClauseAlignment(64, 0);
@@ -2663,7 +2711,7 @@ GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
           cast<llvm::ConstantInt>(CGF.EmitScalarExpr(AlignmentExpr));
       ClauseAlignment = AlignmentCI->getValue();
     }
-    for (const Expr *E : Clause->varlists()) {
+    for (const Expr *E : Clause->varlist()) {
       llvm::APInt Alignment(ClauseAlignment);
       if (Alignment == 0) {
         // OpenMP [2.8.1, Description]
@@ -2684,11 +2732,14 @@ GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
   return AlignedVars;
 }
 
-void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
+// Pass OMPLoopDirective (instead of OMPSimdDirective) to make this function
+// available for "loop bind(thread)", which maps to "simd".
+void emitOMPSimdDirective(const OMPLoopDirective &S, CodeGenFunction &CGF,
+                          CodeGenModule &CGM) {
   bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
+      CGM.getLangOpts().OpenMPIRBuilder && isSimdSupportedByOpenMPIRBuilder(S);
   if (UseOMPIRBuilder) {
-    auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF,
+    auto &&CodeGenIRBuilder = [&S, &CGM, UseOMPIRBuilder](CodeGenFunction &CGF,
                                                           PrePostActionTy &) {
       // Use the OpenMPIRBuilder if enabled.
       if (UseOMPIRBuilder) {
@@ -2697,30 +2748,28 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
         // Emit the associated statement and get its loop representation.
         const Stmt *Inner = S.getRawStmt();
         llvm::CanonicalLoopInfo *CLI =
-            EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
+            CGF.EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
 
         llvm::OpenMPIRBuilder &OMPBuilder =
             CGM.getOpenMPRuntime().getOMPBuilder();
         // Add SIMD specific metadata
         llvm::ConstantInt *Simdlen = nullptr;
         if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
+          RValue Len = CGF.EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
+                                       /*ignoreResult=*/true);
           auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
           Simdlen = Val;
         }
         llvm::ConstantInt *Safelen = nullptr;
         if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
+          RValue Len = CGF.EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
+                                       /*ignoreResult=*/true);
           auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
           Safelen = Val;
         }
         llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
         if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
-          if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) {
+          if (C->getKind() == OpenMPOrderClauseKind::OMPC_ORDER_concurrent) {
             Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
           }
         }
@@ -2733,27 +2782,31 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
     };
     {
       auto LPCRegion =
-          CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-      OMPLexicalScope Scope(*this, S, OMPD_unknown);
-      CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd,
+          CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S);
+      OMPLexicalScope Scope(CGF, S, OMPD_unknown);
+      CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd,
                                                   CodeGenIRBuilder);
     }
     return;
   }
 
-  ParentLoopDirectiveForScanRegion ScanRegion(*this, S);
-  OMPFirstScanLoop = true;
+  CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
+  CGF.OMPFirstScanLoop = true;
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     emitOMPSimdRegion(CGF, S, Action);
   };
   {
     auto LPCRegion =
-        CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-    OMPLexicalScope Scope(*this, S, OMPD_unknown);
-    CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
+        CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S);
+    OMPLexicalScope Scope(CGF, S, OMPD_unknown);
+    CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd, CodeGen);
   }
   // Check for outer lastprivate conditional update.
-  checkForLastprivateConditionalUpdate(*this, S);
+  checkForLastprivateConditionalUpdate(CGF, S);
+}
+
+void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
+  emitOMPSimdDirective(S, *this, CGM);
 }
 
 void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) {
@@ -2762,6 +2815,19 @@ void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) {
   EmitStmt(S.getTransformedStmt());
 }
 
+void CodeGenFunction::EmitOMPReverseDirective(const OMPReverseDirective &S) {
+  // Emit the de-sugared statement.
+  OMPTransformDirectiveScopeRAII ReverseScope(*this, &S);
+  EmitStmt(S.getTransformedStmt());
+}
+
+void CodeGenFunction::EmitOMPInterchangeDirective(
+    const OMPInterchangeDirective &S) {
+  // Emit the de-sugared statement.
+  OMPTransformDirectiveScopeRAII InterchangeScope(*this, &S);
+  EmitStmt(S.getTransformedStmt());
+}
+
 void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) {
   bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder;
 
@@ -2885,12 +2951,13 @@ void CodeGenFunction::EmitOMPOuterLoop(
   JumpDest Continue = getJumpDestInCurrentScope("omp.dispatch.inc");
   BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));
 
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
   emitCommonSimdLoop(
       *this, S,
-      [&S, IsMonotonic](CodeGenFunction &CGF, PrePostActionTy &) {
+      [&S, IsMonotonic, EKind](CodeGenFunction &CGF, PrePostActionTy &) {
         // Generate !llvm.loop.parallel metadata for loads and stores for loops
         // with dynamic/guided scheduling and without ordered clause.
-        if (!isOpenMPSimdDirective(S.getDirectiveKind())) {
+        if (!isOpenMPSimdDirective(EKind)) {
           CGF.LoopStack.setParallel(!IsMonotonic);
           if (const auto *C = S.getSingleClause<OMPOrderClause>())
             if (C->getKind() == OMPC_ORDER_concurrent)
@@ -2937,7 +3004,7 @@ void CodeGenFunction::EmitOMPOuterLoop(
       CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getEndLoc(),
                                                      LoopArgs.DKind);
   };
-  OMPCancelStack.emitExit(*this, S.getDirectiveKind(), CodeGen);
+  OMPCancelStack.emitExit(*this, EKind, CodeGen);
 }
 
 void CodeGenFunction::EmitOMPForOuterLoop(
@@ -3023,8 +3090,9 @@ void CodeGenFunction::EmitOMPForOuterLoop(
     CGOpenMPRuntime::StaticRTInput StaticInit(
         IVSize, IVSigned, Ordered, LoopArgs.IL, LoopArgs.LB, LoopArgs.UB,
         LoopArgs.ST, LoopArgs.Chunk);
-    RT.emitForStaticInit(*this, S.getBeginLoc(), S.getDirectiveKind(),
-                         ScheduleKind, StaticInit);
+    OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+    RT.emitForStaticInit(*this, S.getBeginLoc(), EKind, ScheduleKind,
+                         StaticInit);
   }
 
   auto &&CodeGenOrdered = [Ordered](CodeGenFunction &CGF, SourceLocation Loc,
@@ -3069,6 +3137,7 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop(
   const Expr *IVExpr = S.getIterationVariable();
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
 
   CGOpenMPRuntime::StaticRTInput StaticInit(
       IVSize, IVSigned, /* Ordered = */ false, LoopArgs.IL, LoopArgs.LB,
@@ -3078,7 +3147,7 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop(
   // for combined 'distribute' and 'for' the increment expression of distribute
   // is stored in DistInc. For 'distribute' alone, it is in Inc.
   Expr *IncExpr;
-  if (isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()))
+  if (isOpenMPLoopBoundSharingDirective(EKind))
     IncExpr = S.getDistInc();
   else
     IncExpr = S.getInc();
@@ -3092,20 +3161,20 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop(
   OuterLoopArgs.ST = LoopArgs.ST;
   OuterLoopArgs.IL = LoopArgs.IL;
   OuterLoopArgs.Chunk = LoopArgs.Chunk;
-  OuterLoopArgs.EUB = isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())
+  OuterLoopArgs.EUB = isOpenMPLoopBoundSharingDirective(EKind)
                           ? S.getCombinedEnsureUpperBound()
                           : S.getEnsureUpperBound();
   OuterLoopArgs.IncExpr = IncExpr;
-  OuterLoopArgs.Init = isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())
+  OuterLoopArgs.Init = isOpenMPLoopBoundSharingDirective(EKind)
                            ? S.getCombinedInit()
                            : S.getInit();
-  OuterLoopArgs.Cond = isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())
+  OuterLoopArgs.Cond = isOpenMPLoopBoundSharingDirective(EKind)
                            ? S.getCombinedCond()
                            : S.getCond();
-  OuterLoopArgs.NextLB = isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())
+  OuterLoopArgs.NextLB = isOpenMPLoopBoundSharingDirective(EKind)
                              ? S.getCombinedNextLowerBound()
                              : S.getNextLowerBound();
-  OuterLoopArgs.NextUB = isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())
+  OuterLoopArgs.NextUB = isOpenMPLoopBoundSharingDirective(EKind)
                              ? S.getCombinedNextUpperBound()
                              : S.getNextUpperBound();
   OuterLoopArgs.DKind = OMPD_distribute;
@@ -3197,11 +3266,12 @@ static void
 emitInnerParallelForWhenCombined(CodeGenFunction &CGF,
                                  const OMPLoopDirective &S,
                                  CodeGenFunction::JumpDest LoopExit) {
-  auto &&CGInlinedWorksharingLoop = [&S](CodeGenFunction &CGF,
-                                         PrePostActionTy &Action) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+  auto &&CGInlinedWorksharingLoop = [&S, EKind](CodeGenFunction &CGF,
+                                                PrePostActionTy &Action) {
     Action.Enter(CGF);
     bool HasCancel = false;
-    if (!isOpenMPSimdDirective(S.getDirectiveKind())) {
+    if (!isOpenMPSimdDirective(EKind)) {
       if (const auto *D = dyn_cast<OMPTeamsDistributeParallelForDirective>(&S))
         HasCancel = D->hasCancel();
       else if (const auto *D = dyn_cast<OMPDistributeParallelForDirective>(&S))
@@ -3210,16 +3280,14 @@ emitInnerParallelForWhenCombined(CodeGenFunction &CGF,
                    dyn_cast<OMPTargetTeamsDistributeParallelForDirective>(&S))
         HasCancel = D->hasCancel();
     }
-    CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, S.getDirectiveKind(),
-                                                     HasCancel);
+    CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, EKind, HasCancel);
     CGF.EmitOMPWorksharingLoop(S, S.getPrevEnsureUpperBound(),
                                emitDistributeParallelForInnerBounds,
                                emitDistributeParallelForDispatchBounds);
   };
 
   emitCommonOMPParallelDirective(
-      CGF, S,
-      isOpenMPSimdDirective(S.getDirectiveKind()) ? OMPD_for_simd : OMPD_for,
+      CGF, S, isOpenMPSimdDirective(EKind) ? OMPD_for_simd : OMPD_for,
       CGInlinedWorksharingLoop,
       emitDistributeParallelForDistributeInnerBoundParams);
 }
@@ -3352,6 +3420,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
 
     // Emit 'then' code.
     {
+      OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
       OMPPrivateScope LoopScope(*this);
       if (EmitOMPFirstprivateClause(S, LoopScope) || HasLinears) {
         // Emit implicit barrier to synchronize threads and avoid data races on
@@ -3369,7 +3438,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
       EmitOMPPrivateLoopCounters(S, LoopScope);
       EmitOMPLinearClause(S, LoopScope);
       (void)LoopScope.Privatize();
-      if (isOpenMPTargetExecutionDirective(S.getDirectiveKind()))
+      if (isOpenMPTargetExecutionDirective(EKind))
         CGM.getOpenMPRuntime().adjustTargetSpecificDataForLambdas(*this, S);
 
       // Detect the loop schedule kind and chunk.
@@ -3407,8 +3476,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
       bool StaticChunkedOne =
           RT.isStaticChunked(ScheduleKind.Schedule,
                              /* Chunked */ Chunk != nullptr) &&
-          HasChunkSizeOne &&
-          isOpenMPLoopBoundSharingDirective(S.getDirectiveKind());
+          HasChunkSizeOne && isOpenMPLoopBoundSharingDirective(EKind);
       bool IsMonotonic =
           Ordered ||
           (ScheduleKind.Schedule == OMPC_SCHEDULE_static &&
@@ -3424,8 +3492,8 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
             getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit"));
         emitCommonSimdLoop(
             *this, S,
-            [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-              if (isOpenMPSimdDirective(S.getDirectiveKind())) {
+            [&S, EKind](CodeGenFunction &CGF, PrePostActionTy &) {
+              if (isOpenMPSimdDirective(EKind)) {
                 CGF.EmitOMPSimdInit(S);
               } else if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
                 if (C->getKind() == OMPC_ORDER_concurrent)
@@ -3433,7 +3501,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
               }
             },
             [IVSize, IVSigned, Ordered, IL, LB, UB, ST, StaticChunkedOne, Chunk,
-             &S, ScheduleKind, LoopExit,
+             &S, ScheduleKind, LoopExit, EKind,
              &LoopScope](CodeGenFunction &CGF, PrePostActionTy &) {
               // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
               // When no chunk_size is specified, the iteration space is divided
@@ -3445,8 +3513,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
                   UB.getAddress(), ST.getAddress(),
                   StaticChunkedOne ? Chunk : nullptr);
               CGF.CGM.getOpenMPRuntime().emitForStaticInit(
-                  CGF, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind,
-                  StaticInit);
+                  CGF, S.getBeginLoc(), EKind, ScheduleKind, StaticInit);
               // UB = min(UB, GlobalUB);
               if (!StaticChunkedOne)
                 CGF.EmitIgnoredExpr(S.getEnsureUpperBound());
@@ -3481,7 +3548,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
           CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getEndLoc(),
                                                          OMPD_for);
         };
-        OMPCancelStack.emitExit(*this, S.getDirectiveKind(), CodeGen);
+        OMPCancelStack.emitExit(*this, EKind, CodeGen);
       } else {
         // Emit the outer loop, which requests its work chunk [LB..UB] from
         // runtime and runs the inner loop to process it.
@@ -3492,14 +3559,14 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
         EmitOMPForOuterLoop(ScheduleKind, IsMonotonic, S, LoopScope, Ordered,
                             LoopArguments, CGDispatchBounds);
       }
-      if (isOpenMPSimdDirective(S.getDirectiveKind())) {
+      if (isOpenMPSimdDirective(EKind)) {
         EmitOMPSimdFinal(S, [IL, &S](CodeGenFunction &CGF) {
           return CGF.Builder.CreateIsNotNull(
               CGF.EmitLoadOfScalar(IL, S.getBeginLoc()));
         });
       }
       EmitOMPReductionClauseFinal(
-          S, /*ReductionKind=*/isOpenMPSimdDirective(S.getDirectiveKind())
+          S, /*ReductionKind=*/isOpenMPSimdDirective(EKind)
                  ? /*Parallel and Simd*/ OMPD_parallel_for_simd
                  : /*Parallel only*/ OMPD_parallel);
       // Emit post-update of the reduction variables if IsLastIter != 0.
@@ -3511,7 +3578,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
       // Emit final copy of the lastprivate variables if IsLastIter != 0.
       if (HasLastprivateClause)
         EmitOMPLastprivateClauseFinal(
-            S, isOpenMPSimdDirective(S.getDirectiveKind()),
+            S, isOpenMPSimdDirective(EKind),
             Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getBeginLoc())));
       LoopScope.restoreMap();
       EmitOMPLinearClauseFinal(S, [IL, &S](CodeGenFunction &CGF) {
@@ -3807,7 +3874,8 @@ static void emitScanBasedDirective(
     auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
     CGF.EmitBlock(ExitBB);
   };
-  if (isOpenMPParallelDirective(S.getDirectiveKind())) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+  if (isOpenMPParallelDirective(EKind)) {
     CGF.CGM.getOpenMPRuntime().emitMasterRegion(CGF, CodeGen, S.getBeginLoc());
     CGF.CGM.getOpenMPRuntime().emitBarrierCall(
         CGF, S.getBeginLoc(), OMPD_unknown, /*EmitChecks=*/false,
@@ -3825,6 +3893,7 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
                                      const OMPLoopDirective &S,
                                      bool HasCancel) {
   bool HasLastprivates;
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
   if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
                    [](const OMPReductionClause *C) {
                      return C->getModifier() == OMPC_REDUCTION_inscan;
@@ -3834,9 +3903,8 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
       OMPLoopScope LoopScope(CGF, S);
       return CGF.EmitScalarExpr(S.getNumIterations());
     };
-    const auto &&FirstGen = [&S, HasCancel](CodeGenFunction &CGF) {
-      CodeGenFunction::OMPCancelStackRAII CancelRegion(
-          CGF, S.getDirectiveKind(), HasCancel);
+    const auto &&FirstGen = [&S, HasCancel, EKind](CodeGenFunction &CGF) {
+      CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, EKind, HasCancel);
       (void)CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
                                        emitForLoopBounds,
                                        emitDispatchForLoopBounds);
@@ -3844,22 +3912,20 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
       CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(),
                                                  OMPD_for);
     };
-    const auto &&SecondGen = [&S, HasCancel,
+    const auto &&SecondGen = [&S, HasCancel, EKind,
                               &HasLastprivates](CodeGenFunction &CGF) {
-      CodeGenFunction::OMPCancelStackRAII CancelRegion(
-          CGF, S.getDirectiveKind(), HasCancel);
+      CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, EKind, HasCancel);
       HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
                                                    emitForLoopBounds,
                                                    emitDispatchForLoopBounds);
     };
-    if (!isOpenMPParallelDirective(S.getDirectiveKind()))
+    if (!isOpenMPParallelDirective(EKind))
       emitScanBasedDirectiveDecls(CGF, S, NumIteratorsGen);
     emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
-    if (!isOpenMPParallelDirective(S.getDirectiveKind()))
+    if (!isOpenMPParallelDirective(EKind))
       emitScanBasedDirectiveFinals(CGF, S, NumIteratorsGen);
   } else {
-    CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, S.getDirectiveKind(),
-                                                     HasCancel);
+    CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, EKind, HasCancel);
     HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
                                                  emitForLoopBounds,
                                                  emitDispatchForLoopBounds);
@@ -3867,11 +3933,14 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
   return HasLastprivates;
 }
 
-static bool isSupportedByOpenMPIRBuilder(const OMPForDirective &S) {
-  if (S.hasCancel())
+// Pass OMPLoopDirective (instead of OMPForDirective) to make this check
+// available for "loop bind(parallel)", which maps to "for".
+static bool isForSupportedByOpenMPIRBuilder(const OMPLoopDirective &S,
+                                            bool HasCancel) {
+  if (HasCancel)
     return false;
   for (OMPClause *C : S.clauses()) {
-    if (isa<OMPNowaitClause>(C))
+    if (isa<OMPNowaitClause, OMPBindClause>(C))
       continue;
 
     if (auto *SC = dyn_cast<OMPScheduleClause>(C)) {
@@ -3916,11 +3985,14 @@ convertClauseKindToSchedKind(OpenMPScheduleClauseKind ScheduleClauseKind) {
   llvm_unreachable("Unhandled schedule kind");
 }
 
-void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
+// Pass OMPLoopDirective (instead of OMPForDirective) to make this function
+// available for "loop bind(parallel)", which maps to "for".
+void emitOMPForDirective(const OMPLoopDirective &S, CodeGenFunction &CGF,
+                         CodeGenModule &CGM, bool HasCancel) {
   bool HasLastprivates = false;
-  bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
-  auto &&CodeGen = [this, &S, &HasLastprivates,
+  bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder &&
+                         isForSupportedByOpenMPIRBuilder(S, HasCancel);
+  auto &&CodeGen = [&S, &CGM, HasCancel, &HasLastprivates,
                     UseOMPIRBuilder](CodeGenFunction &CGF, PrePostActionTy &) {
     // Use the OpenMPIRBuilder if enabled.
     if (UseOMPIRBuilder) {
@@ -3932,43 +4004,47 @@ void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
         SchedKind =
             convertClauseKindToSchedKind(SchedClause->getScheduleKind());
         if (const Expr *ChunkSizeExpr = SchedClause->getChunkSize())
-          ChunkSize = EmitScalarExpr(ChunkSizeExpr);
+          ChunkSize = CGF.EmitScalarExpr(ChunkSizeExpr);
       }
 
       // Emit the associated statement and get its loop representation.
       const Stmt *Inner = S.getRawStmt();
       llvm::CanonicalLoopInfo *CLI =
-          EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
+          CGF.EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
 
       llvm::OpenMPIRBuilder &OMPBuilder =
           CGM.getOpenMPRuntime().getOMPBuilder();
       llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
-          AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+          CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
       OMPBuilder.applyWorkshareLoop(
-          Builder.getCurrentDebugLocation(), CLI, AllocaIP, NeedsBarrier,
+          CGF.Builder.getCurrentDebugLocation(), CLI, AllocaIP, NeedsBarrier,
           SchedKind, ChunkSize, /*HasSimdModifier=*/false,
           /*HasMonotonicModifier=*/false, /*HasNonmonotonicModifier=*/false,
           /*HasOrderedClause=*/false);
       return;
     }
 
-    HasLastprivates = emitWorksharingDirective(CGF, S, S.hasCancel());
+    HasLastprivates = emitWorksharingDirective(CGF, S, HasCancel);
   };
   {
     auto LPCRegion =
-        CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-    OMPLexicalScope Scope(*this, S, OMPD_unknown);
-    CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_for, CodeGen,
-                                                S.hasCancel());
+        CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S);
+    OMPLexicalScope Scope(CGF, S, OMPD_unknown);
+    CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_for, CodeGen,
+                                                HasCancel);
   }
 
   if (!UseOMPIRBuilder) {
     // Emit an implicit barrier at the end.
     if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates)
-      CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for);
+      CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(), OMPD_for);
   }
   // Check for outer lastprivate conditional update.
-  checkForLastprivateConditionalUpdate(*this, S);
+  checkForLastprivateConditionalUpdate(CGF, S);
+}
+
+void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
+  return emitOMPForDirective(S, *this, CGM, S.hasCancel());
 }
 
 void CodeGenFunction::EmitOMPForSimdDirective(const OMPForSimdDirective &S) {
@@ -4004,7 +4080,8 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
   const Stmt *CapturedStmt = S.getInnermostCapturedStmt()->getCapturedStmt();
   const auto *CS = dyn_cast<CompoundStmt>(CapturedStmt);
   bool HasLastprivates = false;
-  auto &&CodeGen = [&S, CapturedStmt, CS,
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+  auto &&CodeGen = [&S, CapturedStmt, CS, EKind,
                     &HasLastprivates](CodeGenFunction &CGF, PrePostActionTy &) {
     const ASTContext &C = CGF.getContext();
     QualType KmpInt32Ty =
@@ -4085,7 +4162,7 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
     HasLastprivates = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
     CGF.EmitOMPReductionClauseInit(S, LoopScope);
     (void)LoopScope.Privatize();
-    if (isOpenMPTargetExecutionDirective(S.getDirectiveKind()))
+    if (isOpenMPTargetExecutionDirective(EKind))
       CGF.CGM.getOpenMPRuntime().adjustTargetSpecificDataForLambdas(CGF, S);
 
     // Emit static non-chunked loop.
@@ -4094,8 +4171,8 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
     CGOpenMPRuntime::StaticRTInput StaticInit(
         /*IVSize=*/32, /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(),
         LB.getAddress(), UB.getAddress(), ST.getAddress());
-    CGF.CGM.getOpenMPRuntime().emitForStaticInit(
-        CGF, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind, StaticInit);
+    CGF.CGM.getOpenMPRuntime().emitForStaticInit(CGF, S.getBeginLoc(), EKind,
+                                                 ScheduleKind, StaticInit);
     // UB = min(UB, GlobalUB);
     llvm::Value *UBVal = CGF.EmitLoadOfScalar(UB, S.getBeginLoc());
     llvm::Value *MinUBGlobalUB = CGF.Builder.CreateSelect(
@@ -4111,7 +4188,7 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
       CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getEndLoc(),
                                                      OMPD_sections);
     };
-    CGF.OMPCancelStack.emitExit(CGF, S.getDirectiveKind(), CodeGen);
+    CGF.OMPCancelStack.emitExit(CGF, EKind, CodeGen);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel);
     // Emit post-update of the reduction variables if IsLastIter != 0.
     emitPostUpdateForReductionClause(CGF, S, [IL, &S](CodeGenFunction &CGF) {
@@ -4132,7 +4209,7 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
     HasCancel = OSD->hasCancel();
   else if (auto *OPSD = dyn_cast<OMPParallelSectionsDirective>(&S))
     HasCancel = OPSD->hasCancel();
-  OMPCancelStackRAII CancelRegion(*this, S.getDirectiveKind(), HasCancel);
+  OMPCancelStackRAII CancelRegion(*this, EKind, HasCancel);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_sections, CodeGen,
                                               HasCancel);
   // Emit barrier for lastprivates only if 'sections' directive has 'nowait'
@@ -4252,7 +4329,7 @@ void CodeGenFunction::EmitOMPSingleDirective(const OMPSingleDirective &S) {
   // Build a list of copyprivate variables along with helper expressions
   // (<source>, <destination>, <destination>=<source> expressions)
   for (const auto *C : S.getClausesOfKind<OMPCopyprivateClause>()) {
-    CopyprivateVars.append(C->varlists().begin(), C->varlists().end());
+    CopyprivateVars.append(C->varlist_begin(), C->varlist_end());
     DestExprs.append(C->destination_exprs().begin(),
                      C->destination_exprs().end());
     SrcExprs.append(C->source_exprs().begin(), C->source_exprs().end());
@@ -4958,7 +5035,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
       auto IPriv = C->privates().begin();
       auto IRed = C->reduction_ops().begin();
       auto ITD = C->taskgroup_descriptors().begin();
-      for (const Expr *Ref : C->varlists()) {
+      for (const Expr *Ref : C->varlist()) {
         InRedVars.emplace_back(Ref);
         InRedPrivs.emplace_back(*IPriv);
         InRedOps.emplace_back(*IRed);
@@ -5010,12 +5087,12 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
     Action.Enter(CGF);
     BodyGen(CGF);
   };
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
   llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
-      S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, Data.Tied,
-      Data.NumberOfParts);
+      S, *I, *PartId, *TaskT, EKind, CodeGen, Data.Tied, Data.NumberOfParts);
   OMPLexicalScope Scope(*this, S, std::nullopt,
-                        !isOpenMPParallelDirective(S.getDirectiveKind()) &&
-                            !isOpenMPSimdDirective(S.getDirectiveKind()));
+                        !isOpenMPParallelDirective(EKind) &&
+                            !isOpenMPSimdDirective(EKind));
   TaskGen(*this, OutlinedFn, Data);
 }
 
@@ -5121,7 +5198,8 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
   }
   (void)TargetScope.Privatize();
   buildDependences(S, Data);
-  auto &&CodeGen = [&Data, &S, CS, &BodyGen, BPVD, PVD, SVD, MVD,
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
+  auto &&CodeGen = [&Data, &S, CS, &BodyGen, BPVD, PVD, SVD, MVD, EKind,
                     &InputInfo](CodeGenFunction &CGF, PrePostActionTy &Action) {
     // Set proper addresses for generated private copies.
     OMPPrivateScope Scope(CGF);
@@ -5176,7 +5254,7 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     OMPLexicalScope LexScope(CGF, S, OMPD_task, /*EmitPreInitStmt=*/false);
     auto *TL = S.getSingleClause<OMPThreadLimitClause>();
     if (CGF.CGM.getLangOpts().OpenMP >= 51 &&
-        needsTaskBasedThreadLimit(S.getDirectiveKind()) && TL) {
+        needsTaskBasedThreadLimit(EKind) && TL) {
       // Emit __kmpc_set_thread_limit() to set the thread_limit for the task
       // enclosing this target region. This will indirectly set the thread_limit
       // for every applicable construct within target region.
@@ -5186,7 +5264,7 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     BodyGen(CGF);
   };
   llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
-      S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, /*Tied=*/true,
+      S, *I, *PartId, *TaskT, EKind, CodeGen, /*Tied=*/true,
       Data.NumberOfParts);
   llvm::APInt TrueOrFalse(32, S.hasClausesOfKind<OMPNowaitClause>() ? 1 : 0);
   IntegerLiteral IfCond(getContext(), TrueOrFalse,
@@ -5201,8 +5279,9 @@ void CodeGenFunction::processInReduction(const OMPExecutableDirective &S,
                                          CodeGenFunction &CGF,
                                          const CapturedStmt *CS,
                                          OMPPrivateScope &Scope) {
+  OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(S);
   if (Data.Reductions) {
-    OpenMPDirectiveKind CapturedRegion = S.getDirectiveKind();
+    OpenMPDirectiveKind CapturedRegion = EKind;
     OMPLexicalScope LexScope(CGF, S, CapturedRegion);
     ReductionCodeGen RedCG(Data.ReductionVars, Data.ReductionVars,
                            Data.ReductionCopies, Data.ReductionOps);
@@ -5239,7 +5318,7 @@ void CodeGenFunction::processInReduction(const OMPExecutableDirective &S,
     auto IPriv = C->privates().begin();
     auto IRed = C->reduction_ops().begin();
     auto ITD = C->taskgroup_descriptors().begin();
-    for (const Expr *Ref : C->varlists()) {
+    for (const Expr *Ref : C->varlist()) {
       InRedVars.emplace_back(Ref);
       InRedPrivs.emplace_back(*IPriv);
       InRedOps.emplace_back(*IRed);
@@ -5861,13 +5940,20 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S,
   }
 }
 
-void CodeGenFunction::EmitOMPDistributeDirective(
-    const OMPDistributeDirective &S) {
+// Pass OMPLoopDirective (instead of OMPDistributeDirective) to make this
+// function available for "loop bind(teams)", which maps to "distribute".
+void emitOMPDistributeDirective(const OMPLoopDirective &S, CodeGenFunction &CGF,
+                                CodeGenModule &CGM) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitOMPDistributeLoop(S, emitOMPLoopBodyWithStopPoint, S.getInc());
   };
-  OMPLexicalScope Scope(*this, S, OMPD_unknown);
-  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen);
+  OMPLexicalScope Scope(CGF, S, OMPD_unknown);
+  CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute, CodeGen);
+}
+
+void CodeGenFunction::EmitOMPDistributeDirective(
+    const OMPDistributeDirective &S) {
+  emitOMPDistributeDirective(S, *this, CGM);
 }
 
 static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM,
@@ -6240,8 +6326,8 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
       UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, IC,
                                          X.getAddress().getElementType());
   }
-  llvm::Value *Res =
-      CGF.Builder.CreateAtomicRMW(RMWOp, X.getAddress(), UpdateVal, AO);
+  llvm::AtomicRMWInst *Res =
+      CGF.emitAtomicRMWInst(RMWOp, X.getAddress(), UpdateVal, AO);
   return std::make_pair(true, RValue::get(Res));
 }
 
@@ -7260,7 +7346,7 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause(
     const llvm::DenseMap<const ValueDecl *, llvm::Value *>
         CaptureDeviceAddrMap) {
   llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
-  for (const Expr *OrigVarIt : C.varlists()) {
+  for (const Expr *OrigVarIt : C.varlist()) {
     const auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(OrigVarIt)->getDecl());
     if (!Processed.insert(OrigVD).second)
       continue;
@@ -7311,7 +7397,7 @@ void CodeGenFunction::EmitOMPUseDeviceAddrClause(
     const llvm::DenseMap<const ValueDecl *, llvm::Value *>
         CaptureDeviceAddrMap) {
   llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
-  for (const Expr *Ref : C.varlists()) {
+  for (const Expr *Ref : C.varlist()) {
     const VarDecl *OrigVD = getBaseDecl(Ref);
     if (!Processed.insert(OrigVD).second)
       continue;
@@ -7408,13 +7494,13 @@ void CodeGenFunction::EmitOMPTargetDataDirective(
         if (CGM.getLangOpts().OMPTargetTriples.empty()) {
           // Emit helper decls of the use_device_ptr/use_device_addr clauses.
           for (const auto *C : S.getClausesOfKind<OMPUseDevicePtrClause>())
-            for (const Expr *E : C->varlists()) {
+            for (const Expr *E : C->varlist()) {
               const Decl *D = cast<DeclRefExpr>(E)->getDecl();
               if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
                 CGF.EmitVarDecl(*OED);
             }
           for (const auto *C : S.getClausesOfKind<OMPUseDeviceAddrClause>())
-            for (const Expr *E : C->varlists()) {
+            for (const Expr *E : C->varlist()) {
               const Decl *D = getBaseDecl(E);
               if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
                 CGF.EmitVarDecl(*OED);
@@ -7903,6 +7989,24 @@ void CodeGenFunction::EmitOMPTargetUpdateDirective(
 
 void CodeGenFunction::EmitOMPGenericLoopDirective(
     const OMPGenericLoopDirective &S) {
+  // Always expect a bind clause on the loop directive. It it wasn't
+  // in the source, it should have been added in sema.
+
+  OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
+  if (const auto *C = S.getSingleClause<OMPBindClause>())
+    BindKind = C->getBindKind();
+
+  switch (BindKind) {
+  case OMPC_BIND_parallel: // for
+    return emitOMPForDirective(S, *this, CGM, /*HasCancel=*/false);
+  case OMPC_BIND_teams: // distribute
+    return emitOMPDistributeDirective(S, *this, CGM);
+  case OMPC_BIND_thread: // simd
+    return emitOMPSimdDirective(S, *this, CGM);
+  case OMPC_BIND_unknown:
+    break;
+  }
+
   // Unimplemented, just inline the underlying statement for now.
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     // Emit the loop iteration variable.
@@ -8128,7 +8232,7 @@ void CodeGenFunction::EmitSimpleOMPExecutableDirective(
     if (isOpenMPTaskingDirective(D.getDirectiveKind())) {
       // Capture global firstprivates to avoid crash.
       for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>()) {
-        for (const Expr *Ref : C->varlists()) {
+        for (const Expr *Ref : C->varlist()) {
           const auto *DRE = cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
           if (!DRE)
             continue;
diff --git a/clang/lib/CodeGen/CGValue.h b/clang/lib/CodeGen/CGValue.h
index f1ba3cf95ae59..c4ec8d207d2e3 100644
--- a/clang/lib/CodeGen/CGValue.h
+++ b/clang/lib/CodeGen/CGValue.h
@@ -15,6 +15,7 @@
 #define LLVM_CLANG_LIB_CODEGEN_CGVALUE_H
 
 #include "Address.h"
+#include "CGPointerAuthInfo.h"
 #include "CodeGenTBAA.h"
 #include "EHScopeStack.h"
 #include "clang/AST/ASTContext.h"
@@ -233,9 +234,6 @@ class LValue {
   // this lvalue.
   bool Nontemporal : 1;
 
-  // The pointer is known not to be null.
-  bool IsKnownNonNull : 1;
-
   LValueBaseInfo BaseInfo;
   TBAAAccessInfo TBAAInfo;
 
@@ -263,7 +261,6 @@ class LValue {
     this->ImpreciseLifetime = false;
     this->Nontemporal = false;
     this->ThreadLocalRef = false;
-    this->IsKnownNonNull = false;
     this->BaseIvarExp = nullptr;
   }
 
@@ -349,28 +346,26 @@ class LValue {
   LValueBaseInfo getBaseInfo() const { return BaseInfo; }
   void setBaseInfo(LValueBaseInfo Info) { BaseInfo = Info; }
 
-  KnownNonNull_t isKnownNonNull() const {
-    return (KnownNonNull_t)IsKnownNonNull;
-  }
+  KnownNonNull_t isKnownNonNull() const { return Addr.isKnownNonNull(); }
   LValue setKnownNonNull() {
-    IsKnownNonNull = true;
+    Addr.setKnownNonNull();
     return *this;
   }
 
   // simple lvalue
-  llvm::Value *getPointer(CodeGenFunction &CGF) const {
-    assert(isSimple());
-    return Addr.getBasePointer();
-  }
-  llvm::Value *emitRawPointer(CodeGenFunction &CGF) const {
-    assert(isSimple());
-    return Addr.isValid() ? Addr.emitRawPointer(CGF) : nullptr;
-  }
+  llvm::Value *getPointer(CodeGenFunction &CGF) const;
+  llvm::Value *emitResignedPointer(QualType PointeeTy,
+                                   CodeGenFunction &CGF) const;
+  llvm::Value *emitRawPointer(CodeGenFunction &CGF) const;
 
   Address getAddress() const { return Addr; }
 
   void setAddress(Address address) { Addr = address; }
 
+  CGPointerAuthInfo getPointerAuthInfo() const {
+    return Addr.getPointerAuthInfo();
+  }
+
   // vector elt lvalue
   Address getVectorAddress() const {
     assert(isVectorElt());
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index dd147f45d1351..dd2f7ed021029 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -113,7 +113,7 @@ add_clang_library(clangCodeGen
   MacroPPCallbacks.cpp
   MicrosoftCXXABI.cpp
   ModuleBuilder.cpp
-  ObjectFilePCHContainerOperations.cpp
+  ObjectFilePCHContainerWriter.cpp
   PatternInit.cpp
   SanitizerMetadata.cpp
   SwiftCallingConv.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 3cc4e093d2cf8..dc9eb90079458 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -197,34 +197,46 @@ CodeGenFunction::CGFPOptionsRAII::~CGFPOptionsRAII() {
   CGF.Builder.setDefaultConstrainedRounding(OldRounding);
 }
 
-static LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T,
-                                         bool ForPointeeType,
-                                         CodeGenFunction &CGF) {
+static LValue
+makeNaturalAlignAddrLValue(llvm::Value *V, QualType T, bool ForPointeeType,
+                           bool MightBeSigned, CodeGenFunction &CGF,
+                           KnownNonNull_t IsKnownNonNull = NotKnownNonNull) {
   LValueBaseInfo BaseInfo;
   TBAAAccessInfo TBAAInfo;
   CharUnits Alignment =
       CGF.CGM.getNaturalTypeAlignment(T, &BaseInfo, &TBAAInfo, ForPointeeType);
-  Address Addr = Address(V, CGF.ConvertTypeForMem(T), Alignment);
+  Address Addr =
+      MightBeSigned
+          ? CGF.makeNaturalAddressForPointer(V, T, Alignment, false, nullptr,
+                                             nullptr, IsKnownNonNull)
+          : Address(V, CGF.ConvertTypeForMem(T), Alignment, IsKnownNonNull);
   return CGF.MakeAddrLValue(Addr, T, BaseInfo, TBAAInfo);
 }
 
-LValue CodeGenFunction::MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T) {
-  return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, *this);
+LValue
+CodeGenFunction::MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T,
+                                            KnownNonNull_t IsKnownNonNull) {
+  return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false,
+                                      /*MightBeSigned*/ true, *this,
+                                      IsKnownNonNull);
 }
 
 LValue
 CodeGenFunction::MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T) {
-  return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, *this);
+  return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true,
+                                      /*MightBeSigned*/ true, *this);
 }
 
 LValue CodeGenFunction::MakeNaturalAlignRawAddrLValue(llvm::Value *V,
                                                       QualType T) {
-  return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, *this);
+  return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false,
+                                      /*MightBeSigned*/ false, *this);
 }
 
 LValue CodeGenFunction::MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V,
                                                              QualType T) {
-  return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, *this);
+  return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true,
+                                      /*MightBeSigned*/ false, *this);
 }
 
 llvm::Type *CodeGenFunction::ConvertTypeForMem(QualType T) {
@@ -1074,6 +1086,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   const CodeGenOptions &CodeGenOpts = CGM.getCodeGenOpts();
   if (CodeGenOpts.PointerAuth.FunctionPointers)
     Fn->addFnAttr("ptrauth-calls");
+  if (CodeGenOpts.PointerAuth.IndirectGotos)
+    Fn->addFnAttr("ptrauth-indirect-gotos");
 
   // Apply xray attributes to the function (as a string, for now)
   bool AlwaysXRayAttr = false;
@@ -1181,6 +1195,9 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   if (D && D->hasAttr<NoProfileFunctionAttr>())
     Fn->addFnAttr(llvm::Attribute::NoProfile);
 
+  if (D && D->hasAttr<HybridPatchableAttr>())
+    Fn->addFnAttr(llvm::Attribute::HybridPatchable);
+
   if (D) {
     // Function attributes take precedence over command line flags.
     if (auto *A = D->getAttr<FunctionReturnThunksAttr>()) {
@@ -3174,16 +3191,8 @@ void CodeGenFunction::EmitKCFIOperandBundle(
 llvm::Value *CodeGenFunction::FormAArch64ResolverCondition(
     const MultiVersionResolverOption &RO) {
   llvm::SmallVector<StringRef, 8> CondFeatures;
-  for (const StringRef &Feature : RO.Conditions.Features) {
-    // Optimize the Function Multi Versioning resolver by creating conditions
-    // only for features that are not enabled in the target. The exception is
-    // for features whose extension instructions are executed as NOP on targets
-    // without extension support.
-    if (!getContext().getTargetInfo().hasFeature(Feature) || Feature == "bti" ||
-        Feature == "memtag" || Feature == "memtag2" || Feature == "memtag3" ||
-        Feature == "dgh")
-      CondFeatures.push_back(Feature);
-  }
+  for (const StringRef &Feature : RO.Conditions.Features)
+    CondFeatures.push_back(Feature);
   if (!CondFeatures.empty()) {
     return EmitAArch64CpuSupports(CondFeatures);
   }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index f38088e7de803..9f612c3e19b8f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -202,7 +202,7 @@ template <> struct DominatingValue<Address> {
   }
   static type restore(CodeGenFunction &CGF, saved_type value) {
     return Address(DominatingLLVMValue::restore(CGF, value.BasePtr),
-                   value.ElementType, value.Alignment,
+                   value.ElementType, value.Alignment, CGPointerAuthInfo(),
                    DominatingLLVMValue::restore(CGF, value.Offset));
   }
 };
@@ -612,6 +612,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// True if the current statement has always_inline attribute.
   bool InAlwaysInlineAttributedStmt = false;
 
+  /// True if the current statement has noconvergent attribute.
+  bool InNoConvergentAttributedStmt = false;
+
   // The CallExpr within the current statement that the musttail attribute
   // applies to.  nullptr if there is no 'musttail' on the current statement.
   const CallExpr *MustTailCall = nullptr;
@@ -2711,7 +2714,8 @@ class CodeGenFunction : public CodeGenTypeCache {
     if (Alignment.isZero())
       Alignment =
           CGM.getNaturalTypeAlignment(T, BaseInfo, TBAAInfo, ForPointeeType);
-    return Address(Ptr, ConvertTypeForMem(T), Alignment, nullptr,
+    return Address(Ptr, ConvertTypeForMem(T), Alignment,
+                   CGM.getPointerAuthInfoForPointeeType(T), /*Offset=*/nullptr,
                    IsKnownNonNull);
   }
 
@@ -2752,7 +2756,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// an l-value with the natural pointee alignment of T.
   LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T);
 
-  LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T);
+  LValue
+  MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T,
+                             KnownNonNull_t IsKnownNonNull = NotKnownNonNull);
 
   /// Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known
   /// to be unsigned.
@@ -3326,9 +3332,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   const FieldDecl *FindCountedByField(const FieldDecl *FD);
 
   /// Build an expression accessing the "counted_by" field.
-  llvm::Value *EmitCountedByFieldExpr(const Expr *Base,
-                                      const FieldDecl *FAMDecl,
-                                      const FieldDecl *CountDecl);
+  llvm::Value *EmitLoadOfCountedByField(const Expr *Base,
+                                        const FieldDecl *FAMDecl,
+                                        const FieldDecl *CountDecl);
 
   llvm::Value *EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
                                        bool isInc, bool isPre);
@@ -3839,6 +3845,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPSimdDirective(const OMPSimdDirective &S);
   void EmitOMPTileDirective(const OMPTileDirective &S);
   void EmitOMPUnrollDirective(const OMPUnrollDirective &S);
+  void EmitOMPReverseDirective(const OMPReverseDirective &S);
+  void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S);
   void EmitOMPForDirective(const OMPForDirective &S);
   void EmitOMPForSimdDirective(const OMPForSimdDirective &S);
   void EmitOMPSectionsDirective(const OMPSectionsDirective &S);
@@ -4177,6 +4185,13 @@ class CodeGenFunction : public CodeGenTypeCache {
           llvm::AtomicOrdering::SequentiallyConsistent,
       bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored());
 
+  /// Emit an atomicrmw instruction, and applying relevant metadata when
+  /// applicable.
+  llvm::AtomicRMWInst *emitAtomicRMWInst(
+      llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val,
+      llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent,
+      llvm::SyncScope::ID SSID = llvm::SyncScope::System);
+
   void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
                         const llvm::function_ref<RValue(RValue)> &UpdateOp,
                         bool IsVolatile);
@@ -4391,7 +4406,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee,
                   ReturnValueSlot ReturnValue, const CallArgList &Args,
                   llvm::CallBase **callOrInvoke, bool IsMustTail,
-                  SourceLocation Loc);
+                  SourceLocation Loc,
+                  bool IsVirtualFunctionPointerThunk = false);
   RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee,
                   ReturnValueSlot ReturnValue, const CallArgList &Args,
                   llvm::CallBase **callOrInvoke = nullptr,
@@ -4445,10 +4461,6 @@ class CodeGenFunction : public CodeGenTypeCache {
                                                CXXDtorType Type,
                                                const CXXRecordDecl *RD);
 
-  llvm::Value *getAsNaturalPointerTo(Address Addr, QualType PointeeType) {
-    return Addr.getBasePointer();
-  }
-
   bool isPointerKnownNonNull(const Expr *E);
 
   /// Create the discriminator from the storage address and the entity hash.
@@ -4458,16 +4470,36 @@ class CodeGenFunction : public CodeGenTypeCache {
                                         llvm::Value *StorageAddress,
                                         GlobalDecl SchemaDecl,
                                         QualType SchemaType);
-  llvm::Value *EmitPointerAuthSign(QualType PointeeType, llvm::Value *Pointer);
+
   llvm::Value *EmitPointerAuthSign(const CGPointerAuthInfo &Info,
                                    llvm::Value *Pointer);
+
   llvm::Value *EmitPointerAuthAuth(const CGPointerAuthInfo &Info,
                                    llvm::Value *Pointer);
 
+  llvm::Value *emitPointerAuthResign(llvm::Value *Pointer, QualType PointerType,
+                                     const CGPointerAuthInfo &CurAuthInfo,
+                                     const CGPointerAuthInfo &NewAuthInfo,
+                                     bool IsKnownNonNull);
+  llvm::Value *emitPointerAuthResignCall(llvm::Value *Pointer,
+                                         const CGPointerAuthInfo &CurInfo,
+                                         const CGPointerAuthInfo &NewInfo);
+
   void EmitPointerAuthOperandBundle(
       const CGPointerAuthInfo &Info,
       SmallVectorImpl<llvm::OperandBundleDef> &Bundles);
 
+  llvm::Value *authPointerToPointerCast(llvm::Value *ResultPtr,
+                                        QualType SourceType, QualType DestType);
+  Address authPointerToPointerCast(Address Ptr, QualType SourceType,
+                                   QualType DestType);
+
+  Address getAsNaturalAddressOf(Address Addr, QualType PointeeTy);
+
+  llvm::Value *getAsNaturalPointerTo(Address Addr, QualType PointeeType) {
+    return getAsNaturalAddressOf(Addr, PointeeType).getBasePointer();
+  }
+
   // Return the copy constructor name with the prefix "__copy_constructor_"
   // removed.
   static std::string getNonTrivialCopyConstructorStr(QualType QT,
@@ -4529,7 +4561,6 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
   RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
-  RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E);
 
   RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                          const CallExpr *E, ReturnValueSlot ReturnValue);
@@ -4697,6 +4728,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                     ReturnValueSlot ReturnValue);
 
+  llvm::Value *EmitRISCVCpuSupports(const CallExpr *E);
+  llvm::Value *EmitRISCVCpuInit();
+
   void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                       const CallExpr *E);
   void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
@@ -4852,9 +4886,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitAggFinalDestCopy(QualType Type, AggValueSlot Dest, const LValue &Src,
                             ExprValueKind SrcKind);
 
-  /// Build all the stores needed to initialize an aggregate at Dest with the
-  /// value Val.
-  void EmitAggregateStore(llvm::Value *Val, Address Dest, bool DestIsVolatile);
+  /// Create a store to \arg DstPtr from \arg Src, truncating the stored value
+  /// to at most \arg DstSize bytes.
+  void CreateCoercedStore(llvm::Value *Src, Address Dst, llvm::TypeSize DstSize,
+                          bool DstIsVolatile);
 
   /// EmitExtendGCLifetime - Given a pointer to an Objective-C object,
   /// make sure it survives garbage collection until this point.
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index df5fd474494fc..7989964a7dc3c 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -161,6 +161,8 @@ createTargetCodeGenInfo(CodeGenModule &CGM) {
       return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64);
     else if (Target.getABI() == "aapcs-soft")
       Kind = AArch64ABIKind::AAPCSSoft;
+    else if (Target.getABI() == "pauthtest")
+      Kind = AArch64ABIKind::PAuthTest;
 
     return createAArch64TargetCodeGenInfo(CGM, Kind);
   }
@@ -4193,8 +4195,7 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
     // Forward declarations are emitted lazily on first use.
     if (!FD->doesThisDeclarationHaveABody()) {
       if (!FD->doesDeclarationForceExternallyVisibleDefinition() &&
-          (!FD->isMultiVersion() ||
-           !FD->getASTContext().getTargetInfo().getTriple().isAArch64()))
+          (!FD->isMultiVersion() || !getTarget().getTriple().isAArch64()))
         return;
 
       StringRef MangledName = getMangledName(GD);
@@ -4472,6 +4473,11 @@ bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) {
     return true;
 
   const auto *F = cast<FunctionDecl>(GD.getDecl());
+  // Inline builtins declaration must be emitted. They often are fortified
+  // functions.
+  if (F->isInlineBuiltinDeclaration())
+    return true;
+
   if (CodeGenOpts.OptimizationLevel == 0 && !F->hasAttr<AlwaysInlineAttr>())
     return false;
 
@@ -4517,11 +4523,6 @@ bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) {
     }
   }
 
-  // Inline builtins declaration must be emitted. They often are fortified
-  // functions.
-  if (F->isInlineBuiltinDeclaration())
-    return true;
-
   // PR9614. Avoid cases where the source code is lying to us. An available
   // externally function should have an equivalent function somewhere else,
   // but a function that calls itself through asm label/`__builtin_` trickery is
@@ -4641,23 +4642,6 @@ llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM,
   return llvm::GlobalValue::WeakODRLinkage;
 }
 
-static FunctionDecl *createDefaultTargetVersionFrom(const FunctionDecl *FD) {
-  auto *DeclCtx = const_cast<DeclContext *>(FD->getDeclContext());
-  TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
-  StorageClass SC = FD->getStorageClass();
-  DeclarationName Name = FD->getNameInfo().getName();
-
-  FunctionDecl *NewDecl =
-      FunctionDecl::Create(FD->getASTContext(), DeclCtx, FD->getBeginLoc(),
-                           FD->getEndLoc(), Name, TInfo->getType(), TInfo, SC);
-
-  NewDecl->setIsMultiVersion();
-  NewDecl->addAttr(TargetVersionAttr::CreateImplicit(
-      NewDecl->getASTContext(), "default", NewDecl->getSourceRange()));
-
-  return NewDecl;
-}
-
 void CodeGenModule::emitMultiVersionFunctions() {
   std::vector<GlobalDecl> MVFuncsToEmit;
   MultiVersionFuncs.swap(MVFuncsToEmit);
@@ -4684,29 +4668,30 @@ void CodeGenModule::emitMultiVersionFunctions() {
       return cast<llvm::Function>(Func);
     };
 
-    bool HasDefaultDecl = !FD->isTargetVersionMultiVersion();
-    bool ShouldEmitResolver =
-        !getContext().getTargetInfo().getTriple().isAArch64();
+    // For AArch64, a resolver is only emitted if a function marked with
+    // target_version("default")) or target_clones() is present and defined
+    // in this TU. For other architectures it is always emitted.
+    bool ShouldEmitResolver = !getTarget().getTriple().isAArch64();
     SmallVector<CodeGenFunction::MultiVersionResolverOption, 10> Options;
 
     getContext().forEachMultiversionedFunctionVersion(
         FD, [&](const FunctionDecl *CurFD) {
           llvm::SmallVector<StringRef, 8> Feats;
+          bool IsDefined = CurFD->doesThisDeclarationHaveABody();
 
           if (const auto *TA = CurFD->getAttr<TargetAttr>()) {
             TA->getAddedFeatures(Feats);
             llvm::Function *Func = createFunction(CurFD);
             Options.emplace_back(Func, TA->getArchitecture(), Feats);
           } else if (const auto *TVA = CurFD->getAttr<TargetVersionAttr>()) {
-            bool HasDefaultDef = TVA->isDefaultVersion() &&
-                                 CurFD->doesThisDeclarationHaveABody();
-            HasDefaultDecl |= TVA->isDefaultVersion();
-            ShouldEmitResolver |= (CurFD->isUsed() || HasDefaultDef);
+            if (TVA->isDefaultVersion() && IsDefined)
+              ShouldEmitResolver = true;
             TVA->getFeatures(Feats);
             llvm::Function *Func = createFunction(CurFD);
             Options.emplace_back(Func, /*Architecture*/ "", Feats);
           } else if (const auto *TC = CurFD->getAttr<TargetClonesAttr>()) {
-            ShouldEmitResolver |= CurFD->doesThisDeclarationHaveABody();
+            if (IsDefined)
+              ShouldEmitResolver = true;
             for (unsigned I = 0; I < TC->featuresStrs_size(); ++I) {
               if (!TC->isFirstOfVersion(I))
                 continue;
@@ -4732,13 +4717,6 @@ void CodeGenModule::emitMultiVersionFunctions() {
     if (!ShouldEmitResolver)
       continue;
 
-    if (!HasDefaultDecl) {
-      FunctionDecl *NewFD = createDefaultTargetVersionFrom(FD);
-      llvm::Function *Func = createFunction(NewFD);
-      llvm::SmallVector<StringRef, 1> Feats;
-      Options.emplace_back(Func, /*Architecture*/ "", Feats);
-    }
-
     llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD);
     if (auto *IFunc = dyn_cast<llvm::GlobalIFunc>(ResolverConstant)) {
       ResolverConstant = IFunc->getResolver();
@@ -4787,6 +4765,14 @@ void CodeGenModule::emitMultiVersionFunctions() {
     emitMultiVersionFunctions();
 }
 
+static void replaceDeclarationWith(llvm::GlobalValue *Old,
+                                   llvm::Constant *New) {
+  assert(cast<llvm::Function>(Old)->isDeclaration() && "Not a declaration");
+  New->takeName(Old);
+  Old->replaceAllUsesWith(New);
+  Old->eraseFromParent();
+}
+
 void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) {
   const auto *FD = cast<FunctionDecl>(GD.getDecl());
   assert(FD && "Not a FunctionDecl?");
@@ -4891,12 +4877,9 @@ void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) {
     // Fix up function declarations that were created for cpu_specific before
     // cpu_dispatch was known
     if (!isa<llvm::GlobalIFunc>(IFunc)) {
-      assert(cast<llvm::Function>(IFunc)->isDeclaration());
       auto *GI = llvm::GlobalIFunc::create(DeclTy, 0, Linkage, "", ResolverFunc,
                                            &getModule());
-      GI->takeName(IFunc);
-      IFunc->replaceAllUsesWith(GI);
-      IFunc->eraseFromParent();
+      replaceDeclarationWith(IFunc, GI);
       IFunc = GI;
     }
 
@@ -4926,7 +4909,8 @@ void CodeGenModule::AddDeferredMultiVersionResolverToEmit(GlobalDecl GD) {
 }
 
 /// If a dispatcher for the specified mangled name is not in the module, create
-/// and return an llvm Function with the specified type.
+/// and return it. The dispatcher is either an llvm Function with the specified
+/// type, or a global ifunc.
 llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
   const auto *FD = cast<FunctionDecl>(GD.getDecl());
   assert(FD && "Not a FunctionDecl?");
@@ -4954,8 +4938,15 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
     ResolverName += ".resolver";
   }
 
-  // If the resolver has already been created, just return it.
-  if (llvm::GlobalValue *ResolverGV = GetGlobalValue(ResolverName))
+  // If the resolver has already been created, just return it. This lookup may
+  // yield a function declaration instead of a resolver on AArch64. That is
+  // because we didn't know whether a resolver will be generated when we first
+  // encountered a use of the symbol named after this resolver. Therefore,
+  // targets which support ifuncs should not return here unless we actually
+  // found an ifunc.
+  llvm::GlobalValue *ResolverGV = GetGlobalValue(ResolverName);
+  if (ResolverGV &&
+      (isa<llvm::GlobalIFunc>(ResolverGV) || !getTarget().supportsIFunc()))
     return ResolverGV;
 
   const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
@@ -4981,7 +4972,8 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
                                   "", Resolver, &getModule());
     GIF->setName(ResolverName);
     SetCommonAttributes(FD, GIF);
-
+    if (ResolverGV)
+      replaceDeclarationWith(ResolverGV, GIF);
     return GIF;
   }
 
@@ -4990,6 +4982,8 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
   assert(isa<llvm::GlobalValue>(Resolver) &&
          "Resolver should be created for the first time");
   SetCommonAttributes(FD, cast<llvm::GlobalValue>(Resolver));
+  if (ResolverGV)
+    replaceDeclarationWith(ResolverGV, Resolver);
   return Resolver;
 }
 
@@ -5019,6 +5013,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
     ForDefinition_t IsForDefinition) {
   const Decl *D = GD.getDecl();
 
+  std::string NameWithoutMultiVersionMangling;
   // Any attempts to use a MultiVersion function should result in retrieving
   // the iFunc instead. Name Mangling will handle the rest of the changes.
   if (const FunctionDecl *FD = cast_or_null<FunctionDecl>(D)) {
@@ -5040,14 +5035,24 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
 
     if (FD->isMultiVersion()) {
       UpdateMultiVersionNames(GD, FD, MangledName);
-      if (FD->getASTContext().getTargetInfo().getTriple().isAArch64() &&
-          !FD->isUsed())
-        AddDeferredMultiVersionResolverToEmit(GD);
-      else if (!IsForDefinition)
-        return GetOrCreateMultiVersionResolver(GD);
+      if (!IsForDefinition) {
+        // On AArch64 we do not immediatelly emit an ifunc resolver when a
+        // function is used. Instead we defer the emission until we see a
+        // default definition. In the meantime we just reference the symbol
+        // without FMV mangling (it may or may not be replaced later).
+        if (getTarget().getTriple().isAArch64()) {
+          AddDeferredMultiVersionResolverToEmit(GD);
+          NameWithoutMultiVersionMangling = getMangledNameImpl(
+              *this, GD, FD, /*OmitMultiVersionMangling=*/true);
+        } else
+          return GetOrCreateMultiVersionResolver(GD);
+      }
     }
   }
 
+  if (!NameWithoutMultiVersionMangling.empty())
+    MangledName = NameWithoutMultiVersionMangling;
+
   // Lookup the entry, lazily creating it if necessary.
   llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
   if (Entry) {
@@ -6342,7 +6347,7 @@ void CodeGenModule::EmitExternalFunctionDeclaration(const FunctionDecl *FD) {
     if (getCodeGenOpts().hasReducedDebugInfo()) {
       auto *Ty = getTypes().ConvertType(FD->getType());
       StringRef MangledName = getMangledName(FD);
-      auto *Fn = dyn_cast<llvm::Function>(
+      auto *Fn = cast<llvm::Function>(
           GetOrCreateLLVMFunction(MangledName, Ty, FD, /* ForVTable */ false));
       if (!Fn->getSubprogram())
         DI->EmitFunctionDecl(FD, FD->getLocation(), FD->getType(), Fn);
@@ -8183,7 +8188,7 @@ void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
   // Do not emit threadprivates in simd-only mode.
   if (LangOpts.OpenMP && LangOpts.OpenMPSimd)
     return;
-  for (auto RefExpr : D->varlists()) {
+  for (auto RefExpr : D->varlist()) {
     auto *VD = cast<VarDecl>(cast<DeclRefExpr>(RefExpr)->getDecl());
     bool PerformInit =
         VD->getAnyInitializer() &&
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index de27d590ebdf3..6564a029bcfe9 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -993,8 +993,16 @@ class CodeGenModule : public CodeGenTypeCache {
   llvm::Constant *getFunctionPointer(llvm::Constant *Pointer,
                                      QualType FunctionType);
 
+  llvm::Constant *getMemberFunctionPointer(const FunctionDecl *FD,
+                                           llvm::Type *Ty = nullptr);
+
+  llvm::Constant *getMemberFunctionPointer(llvm::Constant *Pointer,
+                                           QualType FT);
+
   CGPointerAuthInfo getFunctionPointerAuthInfo(QualType T);
 
+  CGPointerAuthInfo getMemberFunctionPointerAuthInfo(QualType FT);
+
   CGPointerAuthInfo getPointerAuthInfoForPointeeType(QualType type);
 
   CGPointerAuthInfo getPointerAuthInfoForType(QualType type);
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index b889d1bfe003f..b66b6234c9d3d 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -186,10 +186,56 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
     return getChar();
 
   // Handle pointers and references.
-  // TODO: Implement C++'s type "similarity" and consider dis-"similar"
-  // pointers distinct.
-  if (Ty->isPointerType() || Ty->isReferenceType())
-    return createScalarTypeNode("any pointer", getChar(), Size);
+  //
+  // C has a very strict rule for pointer aliasing. C23 6.7.6.1p2:
+  //     For two pointer types to be compatible, both shall be identically
+  //     qualified and both shall be pointers to compatible types.
+  //
+  // This rule is impractically strict; we want to at least ignore CVR
+  // qualifiers. Distinguishing by CVR qualifiers would make it UB to
+  // e.g. cast a `char **` to `const char * const *` and dereference it,
+  // which is too common and useful to invalidate. C++'s similar types
+  // rule permits qualifier differences in these nested positions; in fact,
+  // C++ even allows that cast as an implicit conversion.
+  //
+  // Other qualifiers could theoretically be distinguished, especially if
+  // they involve a significant representation difference.  We don't
+  // currently do so, however.
+  //
+  // Computing the pointee type string recursively is implicitly more
+  // forgiving than the standards require.  Effectively, we are turning
+  // the question "are these types compatible/similar" into "are
+  // accesses to these types allowed to alias".  In both C and C++,
+  // the latter question has special carve-outs for signedness
+  // mismatches that only apply at the top level.  As a result, we are
+  // allowing e.g. `int *` l-values to access `unsigned *` objects.
+  if (Ty->isPointerType() || Ty->isReferenceType()) {
+    llvm::MDNode *AnyPtr = createScalarTypeNode("any pointer", getChar(), Size);
+    if (!CodeGenOpts.PointerTBAA)
+      return AnyPtr;
+    // Compute the depth of the pointer and generate a tag of the form "p<depth>
+    // <base type tag>".
+    unsigned PtrDepth = 0;
+    do {
+      PtrDepth++;
+      Ty = Ty->getPointeeType().getTypePtr();
+    } while (Ty->isPointerType());
+    // TODO: Implement C++'s type "similarity" and consider dis-"similar"
+    // pointers distinct for non-builtin types.
+    if (isa<BuiltinType>(Ty)) {
+      llvm::MDNode *ScalarMD = getTypeInfoHelper(Ty);
+      StringRef Name =
+          cast<llvm::MDString>(
+              ScalarMD->getOperand(CodeGenOpts.NewStructPathTBAA ? 2 : 0))
+              ->getString();
+      SmallString<256> OutName("p");
+      OutName += std::to_string(PtrDepth);
+      OutName += " ";
+      OutName += Name;
+      return createScalarTypeNode(OutName, AnyPtr, Size);
+    }
+    return AnyPtr;
+  }
 
   // Accesses to arrays are accesses to objects of their element types.
   if (CodeGenOpts.NewStructPathTBAA && Ty->isArrayType())
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 63be23bf22527..5108712ca7e37 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -388,6 +388,9 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
 
   bool NeedsVTTParameter(GlobalDecl GD) override;
 
+  llvm::Constant *
+  getOrCreateVirtualFunctionPointerThunk(const CXXMethodDecl *MD);
+
   /**************************** RTTI Uniqueness ******************************/
 
 protected:
@@ -426,6 +429,9 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
                 const CXXRecordDecl *RD) override;
 
  private:
+   llvm::Constant *
+   getSignedVirtualMemberFunctionPointer(const CXXMethodDecl *MD);
+
    bool hasAnyUnusedVirtualInlineFunction(const CXXRecordDecl *RD) const {
      const auto &VtableLayout =
          CGM.getItaniumVTableContext().getVTableLayout(RD);
@@ -837,7 +843,25 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   CalleePtr->addIncoming(VirtualFn, FnVirtual);
   CalleePtr->addIncoming(NonVirtualFn, FnNonVirtual);
 
-  CGCallee Callee(FPT, CalleePtr);
+  CGPointerAuthInfo PointerAuth;
+
+  if (const auto &Schema =
+          CGM.getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers) {
+    llvm::PHINode *DiscriminatorPHI = Builder.CreatePHI(CGF.IntPtrTy, 2);
+    DiscriminatorPHI->addIncoming(llvm::ConstantInt::get(CGF.IntPtrTy, 0),
+                                  FnVirtual);
+    const auto &AuthInfo =
+        CGM.getMemberFunctionPointerAuthInfo(QualType(MPT, 0));
+    assert(Schema.getKey() == AuthInfo.getKey() &&
+           "Keys for virtual and non-virtual member functions must match");
+    auto *NonVirtualDiscriminator = AuthInfo.getDiscriminator();
+    DiscriminatorPHI->addIncoming(NonVirtualDiscriminator, FnNonVirtual);
+    PointerAuth = CGPointerAuthInfo(
+        Schema.getKey(), Schema.getAuthenticationMode(), Schema.isIsaPointer(),
+        Schema.authenticatesNullValues(), DiscriminatorPHI);
+  }
+
+  CGCallee Callee(FPT, CalleePtr, PointerAuth);
   return Callee;
 }
 
@@ -855,6 +879,25 @@ llvm::Value *ItaniumCXXABI::EmitMemberDataPointerAddress(
                                    "memptr.offset");
 }
 
+// See if it's possible to return a constant signed pointer.
+static llvm::Constant *pointerAuthResignConstant(
+    llvm::Value *Ptr, const CGPointerAuthInfo &CurAuthInfo,
+    const CGPointerAuthInfo &NewAuthInfo, CodeGenModule &CGM) {
+  const auto *CPA = dyn_cast<llvm::ConstantPtrAuth>(Ptr);
+
+  if (!CPA)
+    return nullptr;
+
+  assert(CPA->getKey()->getZExtValue() == CurAuthInfo.getKey() &&
+         CPA->getAddrDiscriminator()->isZeroValue() &&
+         CPA->getDiscriminator() == CurAuthInfo.getDiscriminator() &&
+         "unexpected key or discriminators");
+
+  return CGM.getConstantSignedPointer(
+      CPA->getPointer(), NewAuthInfo.getKey(), nullptr,
+      cast<llvm::ConstantInt>(NewAuthInfo.getDiscriminator()));
+}
+
 /// Perform a bitcast, derived-to-base, or base-to-derived member pointer
 /// conversion.
 ///
@@ -882,21 +925,63 @@ llvm::Value *
 ItaniumCXXABI::EmitMemberPointerConversion(CodeGenFunction &CGF,
                                            const CastExpr *E,
                                            llvm::Value *src) {
+  // Use constant emission if we can.
+  if (isa<llvm::Constant>(src))
+    return EmitMemberPointerConversion(E, cast<llvm::Constant>(src));
+
   assert(E->getCastKind() == CK_DerivedToBaseMemberPointer ||
          E->getCastKind() == CK_BaseToDerivedMemberPointer ||
          E->getCastKind() == CK_ReinterpretMemberPointer);
 
+  CGBuilderTy &Builder = CGF.Builder;
+  QualType DstType = E->getType();
+
+  if (DstType->isMemberFunctionPointerType()) {
+    if (const auto &NewAuthInfo =
+            CGM.getMemberFunctionPointerAuthInfo(DstType)) {
+      QualType SrcType = E->getSubExpr()->getType();
+      assert(SrcType->isMemberFunctionPointerType());
+      const auto &CurAuthInfo = CGM.getMemberFunctionPointerAuthInfo(SrcType);
+      llvm::Value *MemFnPtr = Builder.CreateExtractValue(src, 0, "memptr.ptr");
+      llvm::Type *OrigTy = MemFnPtr->getType();
+
+      llvm::BasicBlock *StartBB = Builder.GetInsertBlock();
+      llvm::BasicBlock *ResignBB = CGF.createBasicBlock("resign");
+      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("merge");
+
+      // Check whether we have a virtual offset or a pointer to a function.
+      assert(UseARMMethodPtrABI && "ARM ABI expected");
+      llvm::Value *Adj = Builder.CreateExtractValue(src, 1, "memptr.adj");
+      llvm::Constant *Ptrdiff_1 = llvm::ConstantInt::get(CGM.PtrDiffTy, 1);
+      llvm::Value *AndVal = Builder.CreateAnd(Adj, Ptrdiff_1);
+      llvm::Value *IsVirtualOffset =
+          Builder.CreateIsNotNull(AndVal, "is.virtual.offset");
+      Builder.CreateCondBr(IsVirtualOffset, MergeBB, ResignBB);
+
+      CGF.EmitBlock(ResignBB);
+      llvm::Type *PtrTy = llvm::PointerType::getUnqual(CGM.Int8Ty);
+      MemFnPtr = Builder.CreateIntToPtr(MemFnPtr, PtrTy);
+      MemFnPtr =
+          CGF.emitPointerAuthResign(MemFnPtr, SrcType, CurAuthInfo, NewAuthInfo,
+                                    isa<llvm::Constant>(src));
+      MemFnPtr = Builder.CreatePtrToInt(MemFnPtr, OrigTy);
+      llvm::Value *ResignedVal = Builder.CreateInsertValue(src, MemFnPtr, 0);
+      ResignBB = Builder.GetInsertBlock();
+
+      CGF.EmitBlock(MergeBB);
+      llvm::PHINode *NewSrc = Builder.CreatePHI(src->getType(), 2);
+      NewSrc->addIncoming(src, StartBB);
+      NewSrc->addIncoming(ResignedVal, ResignBB);
+      src = NewSrc;
+    }
+  }
+
   // Under Itanium, reinterprets don't require any additional processing.
   if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;
 
-  // Use constant emission if we can.
-  if (isa<llvm::Constant>(src))
-    return EmitMemberPointerConversion(E, cast<llvm::Constant>(src));
-
   llvm::Constant *adj = getMemberPointerAdjustment(E);
   if (!adj) return src;
 
-  CGBuilderTy &Builder = CGF.Builder;
   bool isDerivedToBase = (E->getCastKind() == CK_DerivedToBaseMemberPointer);
 
   const MemberPointerType *destTy =
@@ -934,6 +1019,34 @@ ItaniumCXXABI::EmitMemberPointerConversion(CodeGenFunction &CGF,
   return Builder.CreateInsertValue(src, dstAdj, 1);
 }
 
+static llvm::Constant *
+pointerAuthResignMemberFunctionPointer(llvm::Constant *Src, QualType DestType,
+                                       QualType SrcType, CodeGenModule &CGM) {
+  assert(DestType->isMemberFunctionPointerType() &&
+         SrcType->isMemberFunctionPointerType() &&
+         "member function pointers expected");
+  if (DestType == SrcType)
+    return Src;
+
+  const auto &NewAuthInfo = CGM.getMemberFunctionPointerAuthInfo(DestType);
+  const auto &CurAuthInfo = CGM.getMemberFunctionPointerAuthInfo(SrcType);
+
+  if (!NewAuthInfo && !CurAuthInfo)
+    return Src;
+
+  llvm::Constant *MemFnPtr = Src->getAggregateElement(0u);
+  if (MemFnPtr->getNumOperands() == 0) {
+    // src must be a pair of null pointers.
+    assert(isa<llvm::ConstantInt>(MemFnPtr) && "constant int expected");
+    return Src;
+  }
+
+  llvm::Constant *ConstPtr = pointerAuthResignConstant(
+      cast<llvm::User>(MemFnPtr)->getOperand(0), CurAuthInfo, NewAuthInfo, CGM);
+  ConstPtr = llvm::ConstantExpr::getPtrToInt(ConstPtr, MemFnPtr->getType());
+  return ConstantFoldInsertValueInstruction(Src, ConstPtr, 0);
+}
+
 llvm::Constant *
 ItaniumCXXABI::EmitMemberPointerConversion(const CastExpr *E,
                                            llvm::Constant *src) {
@@ -941,6 +1054,12 @@ ItaniumCXXABI::EmitMemberPointerConversion(const CastExpr *E,
          E->getCastKind() == CK_BaseToDerivedMemberPointer ||
          E->getCastKind() == CK_ReinterpretMemberPointer);
 
+  QualType DstType = E->getType();
+
+  if (DstType->isMemberFunctionPointerType())
+    src = pointerAuthResignMemberFunctionPointer(
+        src, DstType, E->getSubExpr()->getType(), CGM);
+
   // Under Itanium, reinterprets don't require any additional processing.
   if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;
 
@@ -1038,9 +1157,32 @@ llvm::Constant *ItaniumCXXABI::BuildMemberPointer(const CXXMethodDecl *MD,
       //   least significant bit of adj then makes exactly the same
       //   discrimination as the least significant bit of ptr does for
       //   Itanium.
-      MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset);
-      MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
-                                         2 * ThisAdjustment.getQuantity() + 1);
+
+      // We cannot use the Itanium ABI's representation for virtual member
+      // function pointers under pointer authentication because it would
+      // require us to store both the virtual offset and the constant
+      // discriminator in the pointer, which would be immediately vulnerable
+      // to attack.  Instead we introduce a thunk that does the virtual dispatch
+      // and store it as if it were a non-virtual member function.  This means
+      // that virtual function pointers may not compare equal anymore, but
+      // fortunately they aren't required to by the standard, and we do make
+      // a best-effort attempt to re-use the thunk.
+      //
+      // To support interoperation with code in which pointer authentication
+      // is disabled, derefencing a member function pointer must still handle
+      // the virtual case, but it can use a discriminator which should never
+      // be valid.
+      const auto &Schema =
+          CGM.getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers;
+      if (Schema)
+        MemPtr[0] = llvm::ConstantExpr::getPtrToInt(
+            getSignedVirtualMemberFunctionPointer(MD), CGM.PtrDiffTy);
+      else
+        MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset);
+      // Don't set the LSB of adj to 1 if pointer authentication for member
+      // function pointers is enabled.
+      MemPtr[1] = llvm::ConstantInt::get(
+          CGM.PtrDiffTy, 2 * ThisAdjustment.getQuantity() + !Schema);
     } else {
       // Itanium C++ ABI 2.3:
       //   For a virtual function, [the pointer field] is 1 plus the
@@ -1062,7 +1204,7 @@ llvm::Constant *ItaniumCXXABI::BuildMemberPointer(const CXXMethodDecl *MD,
       // function type is incomplete.
       Ty = CGM.PtrDiffTy;
     }
-    llvm::Constant *addr = CGM.GetAddrOfFunction(MD, Ty);
+    llvm::Constant *addr = CGM.getMemberFunctionPointer(MD, Ty);
 
     MemPtr[0] = llvm::ConstantExpr::getPtrToInt(addr, CGM.PtrDiffTy);
     MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
@@ -1082,8 +1224,12 @@ llvm::Constant *ItaniumCXXABI::EmitMemberPointer(const APValue &MP,
 
   CharUnits ThisAdjustment = getContext().getMemberPointerPathAdjustment(MP);
 
-  if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(MPD))
-    return BuildMemberPointer(MD, ThisAdjustment);
+  if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(MPD)) {
+    llvm::Constant *Src = BuildMemberPointer(MD, ThisAdjustment);
+    QualType SrcType = getContext().getMemberPointerType(
+        MD->getType(), MD->getParent()->getTypeForDecl());
+    return pointerAuthResignMemberFunctionPointer(Src, MPType, SrcType, CGM);
+  }
 
   CharUnits FieldOffset =
     getContext().toCharUnitsFromBits(getContext().getFieldOffset(MPD));
@@ -3189,6 +3335,78 @@ bool ItaniumCXXABI::NeedsVTTParameter(GlobalDecl GD) {
   return false;
 }
 
+llvm::Constant *
+ItaniumCXXABI::getOrCreateVirtualFunctionPointerThunk(const CXXMethodDecl *MD) {
+  SmallString<256> MethodName;
+  llvm::raw_svector_ostream Out(MethodName);
+  getMangleContext().mangleCXXName(MD, Out);
+  MethodName += "_vfpthunk_";
+  StringRef ThunkName = MethodName.str();
+  llvm::Function *ThunkFn;
+  if ((ThunkFn = cast_or_null<llvm::Function>(
+           CGM.getModule().getNamedValue(ThunkName))))
+    return ThunkFn;
+
+  const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeCXXMethodDeclaration(MD);
+  llvm::FunctionType *ThunkTy = CGM.getTypes().GetFunctionType(FnInfo);
+  llvm::GlobalValue::LinkageTypes Linkage =
+      MD->isExternallyVisible() ? llvm::GlobalValue::LinkOnceODRLinkage
+                                : llvm::GlobalValue::InternalLinkage;
+  ThunkFn =
+      llvm::Function::Create(ThunkTy, Linkage, ThunkName, &CGM.getModule());
+  if (Linkage == llvm::GlobalValue::LinkOnceODRLinkage)
+    ThunkFn->setVisibility(llvm::GlobalValue::HiddenVisibility);
+  assert(ThunkFn->getName() == ThunkName && "name was uniqued!");
+
+  CGM.SetLLVMFunctionAttributes(MD, FnInfo, ThunkFn, /*IsThunk=*/true);
+  CGM.SetLLVMFunctionAttributesForDefinition(MD, ThunkFn);
+
+  // Stack protection sometimes gets inserted after the musttail call.
+  ThunkFn->removeFnAttr(llvm::Attribute::StackProtect);
+  ThunkFn->removeFnAttr(llvm::Attribute::StackProtectStrong);
+  ThunkFn->removeFnAttr(llvm::Attribute::StackProtectReq);
+
+  // Start codegen.
+  CodeGenFunction CGF(CGM);
+  CGF.CurGD = GlobalDecl(MD);
+  CGF.CurFuncIsThunk = true;
+
+  // Build FunctionArgs.
+  FunctionArgList FunctionArgs;
+  CGF.BuildFunctionArgList(CGF.CurGD, FunctionArgs);
+
+  CGF.StartFunction(GlobalDecl(), FnInfo.getReturnType(), ThunkFn, FnInfo,
+                    FunctionArgs, MD->getLocation(), SourceLocation());
+  llvm::Value *ThisVal = loadIncomingCXXThis(CGF);
+  setCXXABIThisValue(CGF, ThisVal);
+
+  CallArgList CallArgs;
+  for (const VarDecl *VD : FunctionArgs)
+    CGF.EmitDelegateCallArg(CallArgs, VD, SourceLocation());
+
+  const FunctionProtoType *FPT = MD->getType()->getAs<FunctionProtoType>();
+  RequiredArgs Required = RequiredArgs::forPrototypePlus(FPT, /*this*/ 1);
+  const CGFunctionInfo &CallInfo =
+      CGM.getTypes().arrangeCXXMethodCall(CallArgs, FPT, Required, 0);
+  CGCallee Callee = CGCallee::forVirtual(nullptr, GlobalDecl(MD),
+                                         getThisAddress(CGF), ThunkTy);
+  llvm::CallBase *CallOrInvoke;
+  CGF.EmitCall(CallInfo, Callee, ReturnValueSlot(), CallArgs, &CallOrInvoke,
+               /*IsMustTail=*/true, SourceLocation(), true);
+  auto *Call = cast<llvm::CallInst>(CallOrInvoke);
+  Call->setTailCallKind(llvm::CallInst::TCK_MustTail);
+  if (Call->getType()->isVoidTy())
+    CGF.Builder.CreateRetVoid();
+  else
+    CGF.Builder.CreateRet(Call);
+
+  // Finish the function to maintain CodeGenFunction invariants.
+  // FIXME: Don't emit unreachable code.
+  CGF.EmitBlock(CGF.createBasicBlock());
+  CGF.FinishFunction();
+  return ThunkFn;
+}
+
 namespace {
 class ItaniumRTTIBuilder {
   CodeGenModule &CGM;  // Per-module state.
@@ -4885,6 +5103,18 @@ ItaniumCXXABI::LoadVTablePtr(CodeGenFunction &CGF, Address This,
   return {CGF.GetVTablePtr(This, CGM.Int8PtrTy, RD), RD};
 }
 
+llvm::Constant *
+ItaniumCXXABI::getSignedVirtualMemberFunctionPointer(const CXXMethodDecl *MD) {
+  const CXXMethodDecl *origMD =
+      cast<CXXMethodDecl>(CGM.getItaniumVTableContext()
+                              .findOriginalMethod(MD->getCanonicalDecl())
+                              .getDecl());
+  llvm::Constant *thunk = getOrCreateVirtualFunctionPointerThunk(origMD);
+  QualType funcType = CGM.getContext().getMemberPointerType(
+      MD->getType(), MD->getParent()->getTypeForDecl());
+  return CGM.getMemberFunctionPointer(thunk, funcType);
+}
+
 void WebAssemblyCXXABI::emitBeginCatch(CodeGenFunction &CGF,
                                        const CXXCatchStmt *C) {
   if (CGF.getTarget().hasFeature("exception-handling"))
diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp
similarity index 89%
rename from clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
rename to clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp
index ee543e40b4609..3a1f745d9ed77 100644
--- a/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
+++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp
@@ -1,4 +1,4 @@
-//===--- ObjectFilePCHContainerOperations.cpp -----------------------------===//
+//===--- ObjectFilePCHContainerWriter.cpp -----------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
+#include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "CGDebugInfo.h"
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
@@ -351,46 +351,3 @@ ObjectFilePCHContainerWriter::CreatePCHContainerGenerator(
   return std::make_unique<PCHContainerGenerator>(
       CI, MainFileName, OutputFileName, std::move(OS), Buffer);
 }
-
-ArrayRef<StringRef> ObjectFilePCHContainerReader::getFormats() const {
-  static StringRef Formats[] = {"obj", "raw"};
-  return Formats;
-}
-
-StringRef
-ObjectFilePCHContainerReader::ExtractPCH(llvm::MemoryBufferRef Buffer) const {
-  StringRef PCH;
-  auto OFOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
-  if (OFOrErr) {
-    auto &OF = OFOrErr.get();
-    bool IsCOFF = isa<llvm::object::COFFObjectFile>(*OF);
-    // Find the clang AST section in the container.
-    for (auto &Section : OF->sections()) {
-      StringRef Name;
-      if (Expected<StringRef> NameOrErr = Section.getName())
-        Name = *NameOrErr;
-      else
-        consumeError(NameOrErr.takeError());
-
-      if ((!IsCOFF && Name == "__clangast") || (IsCOFF && Name == "clangast")) {
-        if (Expected<StringRef> E = Section.getContents())
-          return *E;
-        else {
-          handleAllErrors(E.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
-            EIB.log(llvm::errs());
-          });
-          return "";
-        }
-      }
-    }
-  }
-  handleAllErrors(OFOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
-    if (EIB.convertToErrorCode() ==
-        llvm::object::object_error::invalid_file_type)
-      // As a fallback, treat the buffer as a raw AST.
-      PCH = Buffer.getBuffer();
-    else
-      EIB.log(llvm::errs());
-  });
-  return PCH;
-}
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 0925609cc74aa..8f17c053f4783 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -334,6 +334,10 @@ class TargetCodeGenInfo {
                                                  llvm::AtomicOrdering Ordering,
                                                  llvm::LLVMContext &Ctx) const;
 
+  /// Allow the target to apply other metadata to an atomic instruction
+  virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,
+                                       llvm::AtomicRMWInst &RMW) const {}
+
   /// Interface class for filling custom fields of a block literal for OpenCL.
   class TargetOpenCLBlockHelper {
   public:
@@ -437,6 +441,7 @@ enum class AArch64ABIKind {
   DarwinPCS,
   Win64,
   AAPCSSoft,
+  PAuthTest,
 };
 
 std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index b9df54b0c67c4..1dec3cd40ebd2 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -883,8 +883,10 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
 
   if (!CalleeIsStreamingCompatible &&
       (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible))
-    CGM.getDiags().Report(CallLoc,
-                          diag::err_function_always_inline_attribute_mismatch)
+    CGM.getDiags().Report(
+        CallLoc, CalleeIsStreaming
+                     ? diag::err_function_always_inline_attribute_mismatch
+                     : diag::warn_function_always_inline_attribute_mismatch)
         << Caller->getDeclName() << Callee->getDeclName() << "streaming";
   if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
     if (NewAttr->isNewZA())
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4d3275e17c386..37e6af3d4196a 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -311,6 +311,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
                                          SyncScope Scope,
                                          llvm::AtomicOrdering Ordering,
                                          llvm::LLVMContext &Ctx) const override;
+  void setTargetAtomicMetadata(CodeGenFunction &CGF,
+                               llvm::AtomicRMWInst &RMW) const override;
   llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                          llvm::Function *BlockInvokeFunc,
                                          llvm::Type *BlockTy) const override;
@@ -546,6 +548,23 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
   return Ctx.getOrInsertSyncScopeID(Name);
 }
 
+void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
+    CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const {
+  if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
+    return;
+
+  // TODO: Introduce new, more controlled options that also work for integers,
+  // and deprecate allowAMDGPUUnsafeFPAtomics.
+  llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation();
+  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
+    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
+    RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty);
+
+    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy())
+      RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty);
+  }
+}
+
 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
   return false;
 }
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index e6b63b6fe093f..4d61f51379346 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -412,13 +412,16 @@ ABIArgInfo SystemZABIInfo::classifyReturnType(QualType RetTy) const {
 }
 
 ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
+  // Handle transparent union types.
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
   // Handle the generic C++ ABI.
   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
     return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
 
   // Integers and enums are extended to full register width.
   if (isPromotableIntegerTypeForABI(Ty))
-    return ABIArgInfo::getExtend(Ty);
+    return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty));
 
   // Handle vector types and vector-like structure types.  Note that
   // as opposed to float-like structure types, we do not allow any
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index d1ce53539e8a7..c116aa6f589f0 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -24,22 +24,9 @@ bool IsX86_MMXType(llvm::Type *IRType) {
     IRType->getScalarSizeInBits() != 64;
 }
 
-static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
+static llvm::Type *X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                           StringRef Constraint,
-                                          llvm::Type* Ty) {
-  bool IsMMXCons = llvm::StringSwitch<bool>(Constraint)
-                     .Cases("y", "&y", "^Ym", true)
-                     .Default(false);
-  if (IsMMXCons && Ty->isVectorTy()) {
-    if (cast<llvm::VectorType>(Ty)->getPrimitiveSizeInBits().getFixedValue() !=
-        64) {
-      // Invalid MMX constraint
-      return nullptr;
-    }
-
-    return llvm::Type::getX86_MMXTy(CGF.getLLVMContext());
-  }
-
+                                          llvm::Type *Ty) {
   if (Constraint == "k") {
     llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
     return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
@@ -808,6 +795,10 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State,
     if (!IsWin32StructABI && isEmptyRecord(getContext(), Ty, true))
       return ABIArgInfo::getIgnore();
 
+    // Ignore 0 sized structs.
+    if (TI.Width == 0)
+      return ABIArgInfo::getIgnore();
+
     llvm::LLVMContext &LLVMContext = getVMContext();
     llvm::IntegerType *Int32 = llvm::Type::getInt32Ty(LLVMContext);
     bool NeedsPadding = false;
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f8f2ff94e2a2c..fc94b470dbd5d 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2607,7 +2607,7 @@ void Driver::HandleAutocompletions(StringRef PassedFlags) const {
   llvm::outs() << llvm::join(SuggestedCompletions, "\n") << '\n';
 }
 
-bool Driver::HandleImmediateArgs(const Compilation &C) {
+bool Driver::HandleImmediateArgs(Compilation &C) {
   // The order these options are handled in gcc is all over the place, but we
   // don't expect inconsistencies w.r.t. that to matter in practice.
 
@@ -2755,6 +2755,14 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
   if (C.getArgs().hasArg(options::OPT_print_libgcc_file_name)) {
     ToolChain::RuntimeLibType RLT = TC.GetRuntimeLibType(C.getArgs());
     const llvm::Triple Triple(TC.ComputeEffectiveClangTriple(C.getArgs()));
+    // The 'Darwin' toolchain is initialized only when its arguments are
+    // computed. Get the default arguments for OFK_None to ensure that
+    // initialization is performed before trying to access properties of
+    // the toolchain in the functions below.
+    // FIXME: Remove when darwin's toolchain is initialized during construction.
+    // FIXME: For some more esoteric targets the default toolchain is not the
+    //        correct one.
+    C.getArgsForToolChain(&TC, Triple.getArchName(), Action::OFK_None);
     RegisterEffectiveTriple TripleRAII(TC, Triple);
     switch (RLT) {
     case ToolChain::RLT_CompilerRT:
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 9d64f0d1eb754..33b493ebe4828 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1109,11 +1109,12 @@ std::string ToolChain::ComputeLLVMTriple(const ArgList &Args,
   }
   case llvm::Triple::aarch64: {
     llvm::Triple Triple = getTriple();
+    tools::aarch64::setPAuthABIInTriple(getDriver(), Args, Triple);
     if (!Triple.isOSBinFormatMachO())
-      return getTripleString();
+      return Triple.getTriple();
 
     if (Triple.isArm64e())
-      return getTripleString();
+      return Triple.getTriple();
 
     // FIXME: older versions of ld64 expect the "arm64" component in the actual
     // triple string and query it to determine whether an LTO file can be
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index b04502a57a9f7..aa51c1a8df58c 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -548,6 +548,9 @@ void AIX::addClangTargetOptions(
                   options::OPT_mtocdata))
     addTocDataOptions(Args, CC1Args, getDriver());
 
+  if (Args.hasArg(options::OPT_msave_reg_params))
+    CC1Args.push_back("-msave-reg-params");
+
   if (Args.hasFlag(options::OPT_fxl_pragma_pack,
                    options::OPT_fno_xl_pragma_pack, true))
     CC1Args.push_back("-fxl-pragma-pack");
@@ -557,6 +560,10 @@ void AIX::addClangTargetOptions(
   if (!Args.getLastArgNoClaim(options::OPT_fsized_deallocation,
                               options::OPT_fno_sized_deallocation))
     CC1Args.push_back("-fno-sized-deallocation");
+
+  if (!Args.hasFlag(options::OPT_ferr_pragma_mc_func_aix,
+                    options::OPT_fno_err_pragma_mc_func_aix, true))
+    CC1Args.push_back("-fno-err-pragma-mc-func-aix");
 }
 
 void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index e7e155b02d747..4384c10dbf3d1 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -620,19 +620,36 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                   const char *LinkingOutput) const {
   std::string Linker = getToolChain().GetLinkerPath();
   ArgStringList CmdArgs;
-  CmdArgs.push_back("--no-undefined");
-  CmdArgs.push_back("-shared");
+  if (!Args.hasArg(options::OPT_r)) {
+    CmdArgs.push_back("--no-undefined");
+    CmdArgs.push_back("-shared");
+  }
 
   addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs);
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   getToolChain().AddFilePathLibArgs(Args, CmdArgs);
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
-  if (C.getDriver().isUsingLTO())
+  if (C.getDriver().isUsingLTO()) {
     addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
                   C.getDriver().getLTOMode() == LTOK_Thin);
-  else if (Args.hasArg(options::OPT_mcpu_EQ))
+  } else if (Args.hasArg(options::OPT_mcpu_EQ)) {
     CmdArgs.push_back(Args.MakeArgString(
-        "-plugin-opt=mcpu=" + Args.getLastArgValue(options::OPT_mcpu_EQ)));
+        "-plugin-opt=mcpu=" +
+        getProcessorFromTargetID(getToolChain().getTriple(),
+                                 Args.getLastArgValue(options::OPT_mcpu_EQ))));
+  }
+
+  // Always pass the target-id features to the LTO job.
+  std::vector<StringRef> Features;
+  getAMDGPUTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
+                          Features);
+  if (!Features.empty()) {
+    CmdArgs.push_back(
+        Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
+  }
+
+  addGPULibraries(getToolChain(), Args, CmdArgs);
+
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(std::make_unique<Command>(
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index 5fbf38cdda12b..f083e40df1314 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -449,3 +449,24 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
   if (Args.getLastArg(options::OPT_mno_bti_at_return_twice))
     Features.push_back("+no-bti-at-return-twice");
 }
+
+void aarch64::setPAuthABIInTriple(const Driver &D, const ArgList &Args,
+                                  llvm::Triple &Triple) {
+  Arg *ABIArg = Args.getLastArg(options::OPT_mabi_EQ);
+  bool HasPAuthABI =
+      ABIArg ? (StringRef(ABIArg->getValue()) == "pauthtest") : false;
+
+  switch (Triple.getEnvironment()) {
+  case llvm::Triple::UnknownEnvironment:
+    if (HasPAuthABI)
+      Triple.setEnvironment(llvm::Triple::PAuthTest);
+    break;
+  case llvm::Triple::PAuthTest:
+    break;
+  default:
+    if (HasPAuthABI)
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << ABIArg->getAsString(Args) << Triple.getTriple();
+    break;
+  }
+}
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.h b/clang/lib/Driver/ToolChains/Arch/AArch64.h
index d47c402d4a42d..6d071167bd392 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.h
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.h
@@ -28,6 +28,9 @@ void getAArch64TargetFeatures(const Driver &D, const llvm::Triple &Triple,
 std::string getAArch64TargetCPU(const llvm::opt::ArgList &Args,
                                 const llvm::Triple &Triple, llvm::opt::Arg *&A);
 
+void setPAuthABIInTriple(const Driver &D, const llvm::opt::ArgList &Args,
+                         llvm::Triple &triple);
+
 } // end namespace aarch64
 } // end namespace target
 } // end namespace driver
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 4a2b9efc9ffad..1e8aac71dc9ba 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -127,6 +127,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const llvm::Triple &Triple,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
+  // Enable the `lsx` feature on 64-bit LoongArch by default.
+  if (Triple.isLoongArch64() &&
+      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
+    Features.push_back("+lsx");
+
   std::string ArchName;
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     ArchName = A->getValue();
@@ -145,9 +150,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (A->getOption().matches(options::OPT_msingle_float)) {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else /*Soft-float*/ {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     }
   } else if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) {
     StringRef FPU = A->getValue();
@@ -157,9 +164,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (FPU == "32") {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else if (FPU == "0" || FPU == "none") {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else {
       D.Diag(diag::err_drv_loongarch_invalid_mfpu_EQ) << FPU;
     }
@@ -174,6 +183,42 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     A->ignoreTargetSpecific();
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mfpu_EQ))
     A->ignoreTargetSpecific();
+  if (Arg *A = Args.getLastArgNoClaim(options::OPT_msimd_EQ))
+    A->ignoreTargetSpecific();
+
+  // Select lsx/lasx feature determined by -msimd=.
+  // Option -msimd= precedes -m[no-]lsx and -m[no-]lasx.
+  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
+    StringRef MSIMD = A->getValue();
+    if (MSIMD == "lsx") {
+      // Option -msimd=lsx depends on 64-bit FPU.
+      // -m*-float and -mfpu=none/0/32 conflict with -msimd=lsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
+      else
+        Features.push_back("+lsx");
+    } else if (MSIMD == "lasx") {
+      // Option -msimd=lasx depends on 64-bit FPU and LSX.
+      // -m*-float, -mfpu=none/0/32 and -mno-lsx conflict with -msimd=lasx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
+
+      // The command options do not contain -mno-lasx.
+      if (!Args.getLastArg(options::OPT_mno_lasx)) {
+        Features.push_back("+lsx");
+        Features.push_back("+lasx");
+      }
+    } else if (MSIMD == "none") {
+      if (llvm::find(Features, "+lsx") != Features.end())
+        Features.push_back("-lsx");
+      if (llvm::find(Features, "+lasx") != Features.end())
+        Features.push_back("-lasx");
+    } else {
+      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
+    }
+  }
 
   // Select lsx feature determined by -m[no-]lsx.
   if (const Arg *A = Args.getLastArg(options::OPT_mlsx, options::OPT_mno_lsx)) {
@@ -197,8 +242,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     if (A->getOption().matches(options::OPT_mlasx)) {
       if (llvm::find(Features, "-d") != Features.end())
         D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
       else { /*-mlasx*/
         Features.push_back("+lsx");
         Features.push_back("+lasx");
@@ -206,35 +249,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else /*-mno-lasx*/
       Features.push_back("-lasx");
   }
-
-  // Select lsx/lasx feature determined by -msimd=.
-  // Option -msimd= has lower priority than -m[no-]lsx and -m[no-]lasx.
-  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
-    StringRef MSIMD = A->getValue();
-    if (MSIMD == "lsx") {
-      // Option -msimd=lsx depends on 64-bit FPU.
-      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
-      // The previous option does not contain feature -lsx.
-      else if (llvm::find(Features, "-lsx") == Features.end())
-        Features.push_back("+lsx");
-    } else if (MSIMD == "lasx") {
-      // Option -msimd=lasx depends on 64-bit FPU and LSX.
-      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
-      // The previous option does not contain feature -lasx.
-      else if (llvm::find(Features, "-lasx") == Features.end()) {
-        Features.push_back("+lsx");
-        Features.push_back("+lasx");
-      }
-    } else if (MSIMD != "none") {
-      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
-    }
-  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
@@ -253,8 +267,14 @@ std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
 std::string loongarch::getLoongArchTargetCPU(const llvm::opt::ArgList &Args,
                                              const llvm::Triple &Triple) {
   std::string CPU;
+  std::string Arch;
   // If we have -march, use that.
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
-    CPU = A->getValue();
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+    Arch = A->getValue();
+    if (Arch == "la64v1.0" || Arch == "la64v1.1")
+      CPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64());
+    else
+      CPU = Arch;
+  }
   return postProcessTargetCPUString(CPU, Triple);
 }
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index 634c096523319..57baa186a9eb7 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -20,78 +20,6 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;
 
-static std::string getPPCGenericTargetCPU(const llvm::Triple &T) {
-  // LLVM may default to generating code for the native CPU,
-  // but, like gcc, we default to a more generic option for
-  // each architecture. (except on AIX)
-  if (T.isOSAIX())
-    return "pwr7";
-  else if (T.getArch() == llvm::Triple::ppc64le)
-    return "ppc64le";
-  else if (T.getArch() == llvm::Triple::ppc64)
-    return "ppc64";
-  else
-    return "ppc";
-}
-
-static std::string normalizeCPUName(StringRef CPUName, const llvm::Triple &T) {
-  // Clang/LLVM does not actually support code generation
-  // for the 405 CPU. However, there are uses of this CPU ID
-  // in projects that previously used GCC and rely on Clang
-  // accepting it. Clang has always ignored it and passed the
-  // generic CPU ID to the back end.
-  if (CPUName == "generic" || CPUName == "405")
-    return getPPCGenericTargetCPU(T);
-
-  if (CPUName == "native") {
-    std::string CPU = std::string(llvm::sys::getHostCPUName());
-    if (!CPU.empty() && CPU != "generic")
-      return CPU;
-    else
-      return getPPCGenericTargetCPU(T);
-  }
-
-  return llvm::StringSwitch<const char *>(CPUName)
-      .Case("common", "generic")
-      .Case("440fp", "440")
-      .Case("630", "pwr3")
-      .Case("G3", "g3")
-      .Case("G4", "g4")
-      .Case("G4+", "g4+")
-      .Case("8548", "e500")
-      .Case("G5", "g5")
-      .Case("power3", "pwr3")
-      .Case("power4", "pwr4")
-      .Case("power5", "pwr5")
-      .Case("power5x", "pwr5x")
-      .Case("power6", "pwr6")
-      .Case("power6x", "pwr6x")
-      .Case("power7", "pwr7")
-      .Case("power8", "pwr8")
-      .Case("power9", "pwr9")
-      .Case("power10", "pwr10")
-      .Case("future", "future")
-      .Case("powerpc", "ppc")
-      .Case("powerpc64", "ppc64")
-      .Case("powerpc64le", "ppc64le")
-      .Default(CPUName.data());
-}
-
-/// Get the (LLVM) name of the PowerPC cpu we are tuning for.
-std::string ppc::getPPCTuneCPU(const ArgList &Args, const llvm::Triple &T) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ))
-    return normalizeCPUName(A->getValue(), T);
-  return getPPCGenericTargetCPU(T);
-}
-
-/// Get the (LLVM) name of the PowerPC cpu we are targeting.
-std::string ppc::getPPCTargetCPU(const Driver &D, const ArgList &Args,
-                                 const llvm::Triple &T) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
-    return normalizeCPUName(A->getValue(), T);
-  return getPPCGenericTargetCPU(T);
-}
-
 const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
   return llvm::StringSwitch<const char *>(Name)
       .Case("pwr7", "-mpower7")
@@ -103,6 +31,8 @@ const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
       .Case("power9", "-mpower9")
       .Case("pwr10", "-mpower10")
       .Case("power10", "-mpower10")
+      .Case("pwr11", "-mpower11")
+      .Case("power11", "-mpower11")
       .Default("-many");
 }
 
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.h b/clang/lib/Driver/ToolChains/Arch/PPC.h
index ec5b3c8140b66..89b9af92e8ddb 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.h
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.h
@@ -35,10 +35,6 @@ enum class ReadGOTPtrMode {
 
 FloatABI getPPCFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
 
-std::string getPPCTargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
-                            const llvm::Triple &T);
-std::string getPPCTuneCPU(const llvm::opt::ArgList &Args,
-                          const llvm::Triple &T);
 const char *getPPCAsmModeForCPU(StringRef Name);
 ReadGOTPtrMode getPPCReadGOTPtrMode(const Driver &D, const llvm::Triple &Triple,
                                     const llvm::opt::ArgList &Args);
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 2f63333b732f6..dc6c8695488bb 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -266,19 +266,29 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     }
 
     bool IsNegative = Name.starts_with("no-");
+
+    bool Not64Bit = ArchType != llvm::Triple::x86_64;
+    if (Not64Bit && Name == "uintr")
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getSpelling() << Triple.getTriple();
+
     if (A->getOption().matches(options::OPT_mapx_features_EQ) ||
         A->getOption().matches(options::OPT_mno_apx_features_EQ)) {
 
+      if (Not64Bit && !IsNegative)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << StringRef(A->getSpelling().str() + "|-mapxf")
+            << Triple.getTriple();
+
       for (StringRef Value : A->getValues()) {
-        if (Value == "egpr" || Value == "push2pop2" || Value == "ppx" ||
-            Value == "ndd" || Value == "ccmp" || Value == "nf" ||
-            Value == "cf" || Value == "zu") {
-          Features.push_back(
-              Args.MakeArgString((IsNegative ? "-" : "+") + Value));
-          continue;
-        }
-        D.Diag(clang::diag::err_drv_unsupported_option_argument)
-            << A->getSpelling() << Value;
+        if (Value != "egpr" && Value != "push2pop2" && Value != "ppx" &&
+            Value != "ndd" && Value != "ccmp" && Value != "nf" &&
+            Value != "cf" && Value != "zu")
+          D.Diag(clang::diag::err_drv_unsupported_option_argument)
+              << A->getSpelling() << Value;
+
+        Features.push_back(
+            Args.MakeArgString((IsNegative ? "-" : "+") + Value));
       }
       continue;
     }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 19aec9d2284f8..3779c8b884d41 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -64,6 +64,7 @@
 #include "llvm/TargetParser/ARMTargetParserCommon.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/LoongArchTargetParser.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/RISCVTargetParser.h"
 #include <cctype>
@@ -833,10 +834,6 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
       Args.hasArg(options::OPT_coverage))
     FProfileDir = Args.getLastArg(options::OPT_fprofile_dir);
 
-  // TODO: Don't claim -c/-S to warn about -fsyntax-only -c/-S, -E -c/-S,
-  // like we warn about -fsyntax-only -E.
-  (void)(Args.hasArg(options::OPT_c) || Args.hasArg(options::OPT_S));
-
   // Put the .gcno and .gcda files (if needed) next to the primary output file,
   // or fall back to a file in the current directory for `clang -c --coverage
   // d/a.c` in the absence of -o.
@@ -1170,33 +1167,6 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
     }
   }
 
-  // If we are compiling for a GPU target we want to override the system headers
-  // with ones created by the 'libc' project if present.
-  if (!Args.hasArg(options::OPT_nostdinc) &&
-      !Args.hasArg(options::OPT_nogpuinc) &&
-      !Args.hasArg(options::OPT_nobuiltininc)) {
-    // Without an offloading language we will include these headers directly.
-    // Offloading languages will instead only use the declarations stored in
-    // the resource directory at clang/lib/Headers/llvm_libc_wrappers.
-    if ((getToolChain().getTriple().isNVPTX() ||
-         getToolChain().getTriple().isAMDGCN()) &&
-        C.getActiveOffloadKinds() == Action::OFK_None) {
-      SmallString<128> P(llvm::sys::path::parent_path(D.Dir));
-      llvm::sys::path::append(P, "include");
-      llvm::sys::path::append(P, getToolChain().getTripleString());
-      CmdArgs.push_back("-internal-isystem");
-      CmdArgs.push_back(Args.MakeArgString(P));
-    } else if (C.getActiveOffloadKinds() == Action::OFK_OpenMP) {
-      // TODO: CUDA / HIP include their own headers for some common functions
-      // implemented here. We'll need to clean those up so they do not conflict.
-      SmallString<128> P(D.ResourceDir);
-      llvm::sys::path::append(P, "include");
-      llvm::sys::path::append(P, "llvm_libc_wrappers");
-      CmdArgs.push_back("-internal-isystem");
-      CmdArgs.push_back(Args.MakeArgString(P));
-    }
-  }
-
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
@@ -1387,6 +1357,35 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
         });
   }
 
+  // If we are compiling for a GPU target we want to override the system headers
+  // with ones created by the 'libc' project if present.
+  // TODO: This should be moved to `AddClangSystemIncludeArgs` by passing the
+  //       OffloadKind as an argument.
+  if (!Args.hasArg(options::OPT_nostdinc) &&
+      !Args.hasArg(options::OPT_nogpuinc) &&
+      !Args.hasArg(options::OPT_nobuiltininc)) {
+    // Without an offloading language we will include these headers directly.
+    // Offloading languages will instead only use the declarations stored in
+    // the resource directory at clang/lib/Headers/llvm_libc_wrappers.
+    if ((getToolChain().getTriple().isNVPTX() ||
+         getToolChain().getTriple().isAMDGCN()) &&
+        C.getActiveOffloadKinds() == Action::OFK_None) {
+      SmallString<128> P(llvm::sys::path::parent_path(D.Dir));
+      llvm::sys::path::append(P, "include");
+      llvm::sys::path::append(P, getToolChain().getTripleString());
+      CmdArgs.push_back("-internal-isystem");
+      CmdArgs.push_back(Args.MakeArgString(P));
+    } else if (C.getActiveOffloadKinds() == Action::OFK_OpenMP) {
+      // TODO: CUDA / HIP include their own headers for some common functions
+      // implemented here. We'll need to clean those up so they do not conflict.
+      SmallString<128> P(D.ResourceDir);
+      llvm::sys::path::append(P, "include");
+      llvm::sys::path::append(P, "llvm_libc_wrappers");
+      CmdArgs.push_back("-internal-isystem");
+      CmdArgs.push_back(Args.MakeArgString(P));
+    }
+  }
+
   // Add system include arguments for all targets but IAMCU.
   if (!IsIAMCU)
     forAllAssociatedToolChains(C, JA, getToolChain(),
@@ -1595,6 +1594,45 @@ void AddUnalignedAccessWarning(ArgStringList &CmdArgs) {
 }
 }
 
+// Each combination of options here forms a signing schema, and in most cases
+// each signing schema is its own incompatible ABI. The default values of the
+// options represent the default signing schema.
+static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) {
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics,
+                         options::OPT_fno_ptrauth_intrinsics))
+    CC1Args.push_back("-fptrauth-intrinsics");
+
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
+                         options::OPT_fno_ptrauth_calls))
+    CC1Args.push_back("-fptrauth-calls");
+
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_returns,
+                         options::OPT_fno_ptrauth_returns))
+    CC1Args.push_back("-fptrauth-returns");
+
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps,
+                         options::OPT_fno_ptrauth_auth_traps))
+    CC1Args.push_back("-fptrauth-auth-traps");
+
+  if (!DriverArgs.hasArg(
+          options::OPT_fptrauth_vtable_pointer_address_discrimination,
+          options::OPT_fno_ptrauth_vtable_pointer_address_discrimination))
+    CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination");
+
+  if (!DriverArgs.hasArg(
+          options::OPT_fptrauth_vtable_pointer_type_discrimination,
+          options::OPT_fno_ptrauth_vtable_pointer_type_discrimination))
+    CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination");
+
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos,
+                         options::OPT_fno_ptrauth_indirect_gotos))
+    CC1Args.push_back("-fptrauth-indirect-gotos");
+
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini,
+                         options::OPT_fno_ptrauth_init_fini))
+    CC1Args.push_back("-fptrauth-init-fini");
+}
+
 static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
                                     ArgStringList &CmdArgs, bool isAArch64) {
   const Arg *A = isAArch64
@@ -1657,16 +1695,30 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
 
   CmdArgs.push_back(
       Args.MakeArgString(Twine("-msign-return-address=") + Scope));
-  if (Scope != "none")
+  if (Scope != "none") {
+    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getAsString(Args) << Triple.getTriple();
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
-  if (BranchProtectionPAuthLR)
+  }
+  if (BranchProtectionPAuthLR) {
+    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getAsString(Args) << Triple.getTriple();
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-mbranch-protection-pauth-lr")));
+  }
   if (IndirectBranches)
     CmdArgs.push_back("-mbranch-target-enforce");
-  if (GuardedControlStack)
+  // GCS is currently untested with PAuthABI, but enabling this could be allowed
+  // in future after testing with a suitable system.
+  if (GuardedControlStack) {
+    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getAsString(Args) << Triple.getTriple();
     CmdArgs.push_back("-mguarded-control-stack");
+  }
 }
 
 void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
@@ -1810,6 +1862,8 @@ void RenderAArch64ABI(const llvm::Triple &Triple, const ArgList &Args,
     ABIName = A->getValue();
   else if (Triple.isOSDarwin())
     ABIName = "darwinpcs";
+  else if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+    ABIName = "pauthtest";
   else
     ABIName = "aapcs";
 
@@ -1846,6 +1900,9 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   // Enable/disable return address signing and indirect branch targets.
   CollectARMPACBTIOptions(getToolChain(), Args, CmdArgs, true /*isAArch64*/);
 
+  if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+    handlePAuthABI(Args, CmdArgs);
+
   // Handle -msve_vector_bits=<bits>
   if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
     StringRef Val = A->getValue();
@@ -1898,11 +1955,17 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   Args.addOptInFlag(
       CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination,
       options::OPT_fno_ptrauth_vtable_pointer_type_discrimination);
+  Args.addOptInFlag(
+      CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination,
+      options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination);
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini,
                     options::OPT_fno_ptrauth_init_fini);
   Args.addOptInFlag(
       CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination,
       options::OPT_fno_ptrauth_function_pointer_type_discrimination);
+
+  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos,
+                    options::OPT_fno_ptrauth_indirect_gotos);
 }
 
 void Clang::AddLoongArchTargetArgs(const ArgList &Args,
@@ -2075,10 +2138,10 @@ void Clang::AddPPCTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
   const llvm::Triple &T = getToolChain().getTriple();
-  if (Args.getLastArg(options::OPT_mtune_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
     CmdArgs.push_back("-tune-cpu");
-    std::string CPU = ppc::getPPCTuneCPU(Args, T);
-    CmdArgs.push_back(Args.MakeArgString(CPU));
+    StringRef CPU = llvm::PPC::getNormalizedPPCTuneCPU(T, A->getValue());
+    CmdArgs.push_back(Args.MakeArgString(CPU.str()));
   }
 
   // Select the ABI to use.
@@ -2923,6 +2986,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   // If one wasn't given by the user, don't pass it here.
   StringRef FPContract;
   StringRef LastSeenFfpContractOption;
+  StringRef LastFpContractOverrideOption;
   bool SeenUnsafeMathModeOption = false;
   if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
       !JA.isOffloading(Action::OFK_HIP))
@@ -2965,6 +3029,27 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     SeenUnsafeMathModeOption = true;
   };
 
+  // Lambda to consolidate common handling for fp-contract
+  auto restoreFPContractState = [&]() {
+    // CUDA and HIP don't rely on the frontend to pass an ffp-contract option.
+    // For other targets, if the state has been changed by one of the
+    // unsafe-math umbrella options a subsequent -fno-fast-math or
+    // -fno-unsafe-math-optimizations option reverts to the last value seen for
+    // the -ffp-contract option or "on" if we have not seen the -ffp-contract
+    // option. If we have not seen an unsafe-math option or -ffp-contract,
+    // we leave the FPContract state unchanged.
+    if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
+        !JA.isOffloading(Action::OFK_HIP)) {
+      if (LastSeenFfpContractOption != "")
+        FPContract = LastSeenFfpContractOption;
+      else if (SeenUnsafeMathModeOption)
+        FPContract = "on";
+    }
+    // In this case, we're reverting to the last explicit fp-contract option
+    // or the platform default
+    LastFpContractOverrideOption = "";
+  };
+
   if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
     CmdArgs.push_back("-mlimit-float-precision");
     CmdArgs.push_back(A->getValue());
@@ -3073,7 +3158,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       AssociativeMath = false;
       ReciprocalMath = false;
       SignedZeros = true;
-      FPContract = "on";
 
       StringRef Val = A->getValue();
       if (OFastEnabled && Val != "fast") {
@@ -3090,14 +3174,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       if (Val == "fast") {
         FPModel = Val;
         applyFastMath();
+        // applyFastMath sets fp-contract="fast"
+        LastFpContractOverrideOption = "-ffp-model=fast";
       } else if (Val == "precise") {
         FPModel = Val;
         FPContract = "on";
+        LastFpContractOverrideOption = "-ffp-model=precise";
       } else if (Val == "strict") {
         StrictFPModel = true;
         FPExceptionBehavior = "strict";
         FPModel = Val;
         FPContract = "off";
+        LastFpContractOverrideOption = "-ffp-model=strict";
         TrappingMath = true;
         RoundingFPMath = true;
       } else
@@ -3181,8 +3269,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       StringRef Val = A->getValue();
       if (Val == "fast" || Val == "on" || Val == "off" ||
           Val == "fast-honor-pragmas") {
+        if (Val != FPContract && LastFpContractOverrideOption != "") {
+          D.Diag(clang::diag::warn_drv_overriding_option)
+              << LastFpContractOverrideOption
+              << Args.MakeArgString("-ffp-contract=" + Val);
+        }
+
         FPContract = Val;
         LastSeenFfpContractOption = Val;
+        LastFpContractOverrideOption = "";
       } else
         D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Val;
@@ -3261,6 +3356,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       TrappingMath = false;
       FPExceptionBehavior = "";
       FPContract = "fast";
+      LastFpContractOverrideOption = "-funsafe-math-optimizations";
       SeenUnsafeMathModeOption = true;
       break;
     case options::OPT_fno_unsafe_math_optimizations:
@@ -3268,14 +3364,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       SignedZeros = true;
       ApproxFunc = false;
-
-      if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
-          !JA.isOffloading(Action::OFK_HIP)) {
-        if (LastSeenFfpContractOption != "") {
-          FPContract = LastSeenFfpContractOption;
-        } else if (SeenUnsafeMathModeOption)
-          FPContract = "on";
-      }
+      restoreFPContractState();
       break;
 
     case options::OPT_Ofast:
@@ -3283,10 +3372,13 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       if (!OFastEnabled)
         continue;
       [[fallthrough]];
-    case options::OPT_ffast_math: {
+    case options::OPT_ffast_math:
       applyFastMath();
+      if (A->getOption().getID() == options::OPT_Ofast)
+        LastFpContractOverrideOption = "-Ofast";
+      else
+        LastFpContractOverrideOption = "-ffast-math";
       break;
-    }
     case options::OPT_fno_fast_math:
       HonorINFs = true;
       HonorNaNs = true;
@@ -3299,16 +3391,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       ApproxFunc = false;
       SignedZeros = true;
-      // -fno_fast_math restores default fpcontract handling
-      if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
-          !JA.isOffloading(Action::OFK_HIP)) {
-        if (LastSeenFfpContractOption != "") {
-          FPContract = LastSeenFfpContractOption;
-        } else if (SeenUnsafeMathModeOption)
-          FPContract = "on";
-      }
+      restoreFPContractState();
+      LastFpContractOverrideOption = "";
       break;
-    }
+    } // End switch (A->getOption().getID())
+
     // The StrictFPModel local variable is needed to report warnings
     // in the way we intend. If -ffp-model=strict has been used, we
     // want to report a warning for the next option encountered that
@@ -3326,12 +3413,17 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       else {
         StrictFPModel = false;
         FPModel = "";
+        // The warning for -ffp-contract would have been reported by the
+        // OPT_ffp_contract_EQ handler above. A special check here is needed
+        // to avoid duplicating the warning.
         auto RHS = (A->getNumValues() == 0)
                        ? A->getSpelling()
                        : Args.MakeArgString(A->getSpelling() + A->getValue());
-        if (RHS != "-ffp-model=strict")
-          D.Diag(clang::diag::warn_drv_overriding_option)
-              << "-ffp-model=strict" << RHS;
+        if (A->getSpelling() != "-ffp-contract=") {
+          if (RHS != "-ffp-model=strict")
+            D.Diag(clang::diag::warn_drv_overriding_option)
+                << "-ffp-model=strict" << RHS;
+        }
       }
     }
 
@@ -3413,26 +3505,32 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   // individual features enabled by -ffast-math instead of the option itself as
   // that's consistent with gcc's behaviour.
   if (!HonorINFs && !HonorNaNs && !MathErrno && AssociativeMath && ApproxFunc &&
-      ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath) {
+      ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath)
     CmdArgs.push_back("-ffast-math");
-    if (FPModel == "fast") {
-      if (FPContract == "fast")
-        // All set, do nothing.
-        ;
-      else if (FPContract.empty())
-        // Enable -ffp-contract=fast
-        CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast"));
-      else
-        D.Diag(clang::diag::warn_drv_overriding_option)
-            << "-ffp-model=fast"
-            << Args.MakeArgString("-ffp-contract=" + FPContract);
-    }
-  }
 
   // Handle __FINITE_MATH_ONLY__ similarly.
-  if (!HonorINFs && !HonorNaNs)
+  // The -ffinite-math-only is added to CmdArgs when !HonorINFs && !HonorNaNs.
+  // Otherwise process the Xclang arguments to determine if -menable-no-infs and
+  // -menable-no-nans are set by the user.
+  bool shouldAddFiniteMathOnly = false;
+  if (!HonorINFs && !HonorNaNs) {
+    shouldAddFiniteMathOnly = true;
+  } else {
+    bool InfValues = true;
+    bool NanValues = true;
+    for (const auto *Arg : Args.filtered(options::OPT_Xclang)) {
+      StringRef ArgValue = Arg->getValue();
+      if (ArgValue == "-menable-no-nans")
+        NanValues = false;
+      else if (ArgValue == "-menable-no-infs")
+        InfValues = false;
+    }
+    if (!NanValues && !InfValues)
+      shouldAddFiniteMathOnly = true;
+  }
+  if (shouldAddFiniteMathOnly) {
     CmdArgs.push_back("-ffinite-math-only");
-
+  }
   if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) {
     CmdArgs.push_back("-mfpmath");
     CmdArgs.push_back(A->getValue());
@@ -3801,6 +3899,11 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs,
     CmdArgs.push_back(Args.MakeArgString(CLExtStr));
   }
 
+  if (Args.hasArg(options::OPT_cl_finite_math_only)) {
+    CmdArgs.push_back("-menable-no-infs");
+    CmdArgs.push_back("-menable-no-nans");
+  }
+
   for (const auto &Arg : ForwardedArguments)
     if (const auto *A = Args.getLastArg(Arg))
       CmdArgs.push_back(Args.MakeArgString(A->getOption().getPrefixedName()));
@@ -4026,6 +4129,9 @@ static bool RenderModulesOptions(Compilation &C, const Driver &D,
                    options::OPT_fno_modules_strict_decluse, false))
     CmdArgs.push_back("-fmodules-strict-decluse");
 
+  Args.addOptOutFlag(CmdArgs, options::OPT_fmodulemap_allow_subdirectory_search,
+                     options::OPT_fno_modulemap_allow_subdirectory_search);
+
   // -fno-implicit-modules turns off implicitly compiling modules on demand.
   bool ImplicitModules = false;
   if (!Args.hasFlag(options::OPT_fimplicit_modules,
@@ -4789,11 +4895,8 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
 
   // -gdwarf-aranges turns on the emission of the aranges section in the
   // backend.
-  // Always enabled for SCE tuning.
-  bool NeedAranges = DebuggerTuning == llvm::DebuggerKind::SCE;
-  if (const Arg *A = Args.getLastArg(options::OPT_gdwarf_aranges))
-    NeedAranges = checkDebugInfoOption(A, Args, D, TC) || NeedAranges;
-  if (NeedAranges) {
+  if (const Arg *A = Args.getLastArg(options::OPT_gdwarf_aranges);
+      A && checkDebugInfoOption(A, Args, D, TC)) {
     CmdArgs.push_back("-mllvm");
     CmdArgs.push_back("-generate-arange-section");
   }
@@ -6453,6 +6556,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                      options::OPT_fno_zero_initialized_in_bss);
 
   bool OFastEnabled = isOptimizationLevelFast(Args);
+  if (OFastEnabled)
+    D.Diag(diag::warn_drv_deprecated_arg_ofast);
   // If -Ofast is the optimization level, then -fstrict-aliasing should be
   // enabled.  This alias option is being used to simplify the hasFlag logic.
   OptSpecifier StrictAliasingAliasOption =
@@ -6462,6 +6567,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (!Args.hasFlag(options::OPT_fstrict_aliasing, StrictAliasingAliasOption,
                     options::OPT_fno_strict_aliasing, !IsWindowsMSVC))
     CmdArgs.push_back("-relaxed-aliasing");
+  if (Args.hasFlag(options::OPT_fpointer_tbaa, options::OPT_fno_pointer_tbaa,
+                   false))
+    CmdArgs.push_back("-pointer-tbaa");
   if (!Args.hasFlag(options::OPT_fstruct_path_tbaa,
                     options::OPT_fno_struct_path_tbaa, true))
     CmdArgs.push_back("-no-struct-path-tbaa");
@@ -7451,7 +7559,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     StringRef S0 = A->getValue(), S = S0;
     unsigned Size, Offset = 0;
     if (!Triple.isAArch64() && !Triple.isLoongArch() && !Triple.isRISCV() &&
-        !Triple.isX86())
+        !Triple.isX86() &&
+        !(!Triple.isOSAIX() && (Triple.getArch() == llvm::Triple::ppc ||
+                                Triple.getArch() == llvm::Triple::ppc64)))
       D.Diag(diag::err_drv_unsupported_opt_for_target)
           << A->getAsString(Args) << TripleStr;
     else if (S.consumeInteger(10, Size) ||
@@ -7544,6 +7654,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (const char *Name = C.getTimeTraceFile(&JA)) {
     CmdArgs.push_back(Args.MakeArgString("-ftime-trace=" + Twine(Name)));
     Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ);
+    Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_verbose);
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) {
@@ -8479,17 +8590,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                        options::OPT_mno_amdgpu_ieee);
   }
 
-  // For all the host OpenMP offloading compile jobs we need to pass the targets
-  // information using -fopenmp-targets= option.
-  if (JA.isHostOffloading(Action::OFK_OpenMP)) {
-    SmallString<128> Targets("-fopenmp-targets=");
-
-    SmallVector<std::string, 4> Triples;
-    auto TCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
-    std::transform(TCRange.first, TCRange.second, std::back_inserter(Triples),
-                   [](auto TC) { return TC.second->getTripleString(); });
-    CmdArgs.push_back(Args.MakeArgString(Targets + llvm::join(Triples, ",")));
-  }
+  addOpenMPHostOffloadingArgs(C, JA, Args, CmdArgs);
 
   // For all the host SYCL offloading compile jobs we need to pass the targets
   // information using -fsycl-targets= option.
@@ -10515,8 +10616,7 @@ static void getTripleBasedSPIRVTransOpts(Compilation &C,
               ",+SPV_INTEL_tensor_float32_conversion"
               ",+SPV_INTEL_optnone"
               ",+SPV_KHR_non_semantic_info"
-              ",+SPV_KHR_cooperative_matrix"
-              ",+SPV_INTEL_memory_access_aliasing";
+              ",+SPV_KHR_cooperative_matrix";
   if (IsCPU)
     ExtArg += ",+SPV_INTEL_fp_max_error";
 
@@ -11320,6 +11420,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     A->claim();
   }
 
+  // If we disable the GPU C library support it needs to be forwarded to the
+  // link job.
+  if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, true))
+    CmdArgs.push_back("--device-compiler=-nolibc");
+
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 5ddec78442a94..c7555836a46c4 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 #include "llvm/TargetParser/TargetParser.h"
 #include <optional>
 
@@ -517,6 +518,22 @@ void tools::addLinkerCompressDebugSectionsOption(
   }
 }
 
+void tools::addGPULibraries(const ToolChain &TC, const llvm::opt::ArgList &Args,
+                            llvm::opt::ArgStringList &CmdArgs) {
+  if (Args.hasArg(options::OPT_nostdlib, options::OPT_r,
+                  options::OPT_nodefaultlibs, options::OPT_nolibc,
+                  options::OPT_nogpulibc))
+    return;
+
+  // If the user's toolchain has the 'include/<triple>/` path, we assume it
+  // supports the standard C libraries for the GPU and include them.
+  bool HasLibC = TC.getStdlibIncludePath().has_value();
+  if (HasLibC) {
+    CmdArgs.push_back("-lc");
+    CmdArgs.push_back("-lm");
+  }
+}
+
 void tools::AddTargetFeature(const ArgList &Args,
                              std::vector<StringRef> &Features,
                              OptSpecifier OnOpt, OptSpecifier OffOpt,
@@ -626,7 +643,10 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
   case llvm::Triple::ppcle:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
-    return ppc::getPPCTargetCPU(D, Args, T);
+    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
+      return std::string(
+          llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
+    return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
 
   case llvm::Triple::csky:
     if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
@@ -1153,42 +1173,6 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   }
 }
 
-/// Adds the '-lcgpu' and '-lmgpu' libraries to the compilation to include the
-/// LLVM C library for GPUs.
-static void addOpenMPDeviceLibC(const Compilation &C, const ArgList &Args,
-                                ArgStringList &CmdArgs) {
-  if (Args.hasArg(options::OPT_nogpulib) || Args.hasArg(options::OPT_nolibc))
-    return;
-
-  // Check the resource directory for the LLVM libc GPU declarations. If it's
-  // found we can assume that LLVM was built with support for the GPU libc.
-  SmallString<256> LibCDecls(C.getDriver().ResourceDir);
-  llvm::sys::path::append(LibCDecls, "include", "llvm_libc_wrappers",
-                          "llvm-libc-decls");
-  bool HasLibC = llvm::sys::fs::exists(LibCDecls) &&
-                 llvm::sys::fs::is_directory(LibCDecls);
-  if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC))
-    return;
-
-  SmallVector<const ToolChain *> ToolChains;
-  auto TCRange = C.getOffloadToolChains(Action::OFK_OpenMP);
-  for (auto TI = TCRange.first, TE = TCRange.second; TI != TE; ++TI)
-    ToolChains.push_back(TI->second);
-
-  if (llvm::any_of(ToolChains, [](const ToolChain *TC) {
-        return TC->getTriple().isAMDGPU();
-      })) {
-    CmdArgs.push_back("-lcgpu-amdgpu");
-    CmdArgs.push_back("-lmgpu-amdgpu");
-  }
-  if (llvm::any_of(ToolChains, [](const ToolChain *TC) {
-        return TC->getTriple().isNVPTX();
-      })) {
-    CmdArgs.push_back("-lcgpu-nvptx");
-    CmdArgs.push_back("-lmgpu-nvptx");
-  }
-}
-
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
                                         const ArgList &Args,
                                         ArgStringList &CmdArgs) {
@@ -1259,15 +1243,32 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
   if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
     CmdArgs.push_back("-lomptarget.devicertl");
 
-  if (IsOffloadingHost)
-    addOpenMPDeviceLibC(C, Args, CmdArgs);
-
   addArchSpecificRPath(TC, Args, CmdArgs);
+
   addOpenMPRuntimeLibraryPath(TC, Args, CmdArgs);
 
   return true;
 }
 
+void tools::addOpenMPHostOffloadingArgs(const Compilation &C,
+                                        const JobAction &JA,
+                                        const llvm::opt::ArgList &Args,
+                                        llvm::opt::ArgStringList &CmdArgs) {
+  if (!JA.isHostOffloading(Action::OFK_OpenMP))
+    return;
+
+  // For all the host OpenMP offloading compile jobs we need to pass the targets
+  // information using -fopenmp-targets= option.
+  constexpr llvm::StringLiteral Targets("-fopenmp-targets=");
+
+  SmallVector<std::string> Triples;
+  auto TCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
+  std::transform(TCRange.first, TCRange.second, std::back_inserter(Triples),
+                 [](auto TC) { return TC.second->getTripleString(); });
+  CmdArgs.push_back(
+      Args.MakeArgString(Twine(Targets) + llvm::join(Triples, ",")));
+}
+
 /// Add Fortran runtime libs
 void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
                                   llvm::opt::ArgStringList &CmdArgs) {
@@ -1989,8 +1990,8 @@ tools::ParsePICArgs(const ToolChain &ToolChain, const ArgList &Args) {
   return std::make_tuple(RelocM, 0U, false);
 }
 
-// `-falign-functions` indicates that the functions should be aligned to a
-// 16-byte boundary.
+// `-falign-functions` indicates that the functions should be aligned to the
+// backend's preferred alignment.
 //
 // `-falign-functions=1` is the same as `-fno-align-functions`.
 //
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index ce97a7dbefc33..33c724692dee5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -35,6 +35,9 @@ void addLinkerCompressDebugSectionsOption(const ToolChain &TC,
                                           const llvm::opt::ArgList &Args,
                                           llvm::opt::ArgStringList &CmdArgs);
 
+void addGPULibraries(const ToolChain &TC, const llvm::opt::ArgList &Args,
+                     llvm::opt::ArgStringList &CmdArgs);
+
 void claimNoWarnArgs(const llvm::opt::ArgList &Args);
 
 bool addSanitizerRuntimes(const ToolChain &TC, const llvm::opt::ArgList &Args,
@@ -121,6 +124,11 @@ bool addOpenMPRuntime(const Compilation &C, llvm::opt::ArgStringList &CmdArgs,
                       bool ForceStaticHostRuntime = false,
                       bool IsOffloadingHost = false, bool GompNeedsRT = false);
 
+/// Adds offloading options for OpenMP host compilation to \p CmdArgs.
+void addOpenMPHostOffloadingArgs(const Compilation &C, const JobAction &JA,
+                                 const llvm::opt::ArgList &Args,
+                                 llvm::opt::ArgStringList &CmdArgs);
+
 /// Adds Fortran runtime libraries to \p CmdArgs.
 void addFortranRuntimeLibs(const ToolChain &TC, const llvm::opt::ArgList &Args,
                            llvm::opt::ArgStringList &CmdArgs);
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index f3bddf04878b1..f5c182537d97c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -465,13 +465,6 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
 
-  // If we are invoking `nvlink` internally we need to output a `.cubin` file.
-  // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
-  if (!C.getInputArgs().getLastArg(options::OPT_c)) {
-    SmallString<256> Filename(Output.getFilename());
-    llvm::sys::path::replace_extension(Filename, "cubin");
-    OutputFileName = Filename.str();
-  }
   if (Output.isFilename() && OutputFileName != Output.getFilename())
     C.addTempFile(Args.MakeArgString(OutputFileName));
 
@@ -691,14 +684,24 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-v");
 
   StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
-  if (GPUArch.empty()) {
+  if (GPUArch.empty() && !C.getDriver().isUsingLTO()) {
     C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
         << getToolChain().getArchName() << getShortName();
     return;
   }
 
-  CmdArgs.push_back("-arch");
-  CmdArgs.push_back(Args.MakeArgString(GPUArch));
+  if (!GPUArch.empty()) {
+    CmdArgs.push_back("-arch");
+    CmdArgs.push_back(Args.MakeArgString(GPUArch));
+  }
+
+  if (Args.hasArg(options::OPT_ptxas_path_EQ))
+    CmdArgs.push_back(Args.MakeArgString(
+        "--pxtas-path=" + Args.getLastArgValue(options::OPT_ptxas_path_EQ)));
+
+  if (Args.hasArg(options::OPT_cuda_path_EQ))
+    CmdArgs.push_back(Args.MakeArgString(
+        "--cuda-path=" + Args.getLastArgValue(options::OPT_cuda_path_EQ)));
 
   // Add paths specified in LIBRARY_PATH environment variable as -L options.
   addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
@@ -706,6 +709,20 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   // Add standard library search paths passed on the command line.
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   getToolChain().AddFilePathLibArgs(Args, CmdArgs);
+  AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
+
+  if (C.getDriver().isUsingLTO())
+    addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
+                  C.getDriver().getLTOMode() == LTOK_Thin);
+
+  // Forward the PTX features if the nvlink-wrapper needs it.
+  std::vector<StringRef> Features;
+  getNVPTXTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
+                         Features);
+  for (StringRef Feature : Features)
+    CmdArgs.append({"--feature", Args.MakeArgString(Feature)});
+
+  addGPULibraries(getToolChain(), Args, CmdArgs);
 
   // Add paths for the default clang library path.
   SmallString<256> DefaultLibPath =
@@ -713,51 +730,12 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
-  for (const auto &II : Inputs) {
-    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
-        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
-      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
-          << getToolChain().getTripleString();
-      continue;
-    }
-
-    // The 'nvlink' application performs RDC-mode linking when given a '.o'
-    // file and device linking when given a '.cubin' file. We always want to
-    // perform device linking, so just rename any '.o' files.
-    // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
-    if (II.isFilename()) {
-      auto InputFile = getToolChain().getInputFilename(II);
-      if (llvm::sys::path::extension(InputFile) != ".cubin") {
-        // If there are no actions above this one then this is direct input and
-        // we can copy it. Otherwise the input is internal so a `.cubin` file
-        // should exist.
-        if (II.getAction() && II.getAction()->getInputs().size() == 0) {
-          const char *CubinF =
-              Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath(
-                  llvm::sys::path::stem(InputFile), "cubin"));
-          if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF)))
-            continue;
-
-          CmdArgs.push_back(CubinF);
-        } else {
-          SmallString<256> Filename(InputFile);
-          llvm::sys::path::replace_extension(Filename, "cubin");
-          CmdArgs.push_back(Args.MakeArgString(Filename));
-        }
-      } else {
-        CmdArgs.push_back(Args.MakeArgString(InputFile));
-      }
-    } else if (!II.isNothing()) {
-      II.getInputArg().renderAsInput(Args, CmdArgs);
-    }
-  }
-
   C.addCommand(std::make_unique<Command>(
       JA, *this,
       ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                           "--options-file"},
-      Args.MakeArgString(getToolChain().GetProgramPath("nvlink")), CmdArgs,
-      Inputs, Output));
+      Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")),
+      CmdArgs, Inputs, Output));
 }
 
 void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
@@ -1112,11 +1090,7 @@ std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
   if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
     return ToolChain::getInputFilename(Input);
 
-  // Replace extension for object files with cubin because nvlink relies on
-  // these particular file names.
-  SmallString<256> Filename(ToolChain::getInputFilename(Input));
-  llvm::sys::path::replace_extension(Filename, "cubin");
-  return std::string(Filename);
+  return ToolChain::getInputFilename(Input);
 }
 
 llvm::opt::DerivedArgList *
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index 70851d8356f25..9ea82bd379d54 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -182,6 +182,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
   bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
     return false;
   }
+  bool HasNativeLLVMSupport() const override { return true; }
   bool isPICDefaultForced() const override { return false; }
   bool SupportsProfiling() const override { return false; }
 
@@ -220,6 +221,8 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
     return &HostTC.getTriple();
   }
 
+  bool HasNativeLLVMSupport() const override { return false; }
+
   std::string getInputFilename(const InputInfo &Input) const override;
 
   llvm::opt::DerivedArgList *
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index f354b0974d5f2..17b6074160716 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1272,23 +1272,8 @@ unsigned DarwinClang::GetDefaultDwarfVersion() const {
 void MachO::AddLinkRuntimeLib(const ArgList &Args, ArgStringList &CmdArgs,
                               StringRef Component, RuntimeLinkOptions Opts,
                               bool IsShared) const {
-  SmallString<64> DarwinLibName = StringRef("libclang_rt.");
-  // On Darwin the builtins component is not in the library name.
-  if (Component != "builtins") {
-    DarwinLibName += Component;
-    if (!(Opts & RLO_IsEmbedded))
-      DarwinLibName += "_";
-  }
-
-  DarwinLibName += getOSLibraryNameSuffix();
-  DarwinLibName += IsShared ? "_dynamic.dylib" : ".a";
-  SmallString<128> Dir(getDriver().ResourceDir);
-  llvm::sys::path::append(Dir, "lib", "darwin");
-  if (Opts & RLO_IsEmbedded)
-    llvm::sys::path::append(Dir, "macho_embedded");
-
-  SmallString<128> P(Dir);
-  llvm::sys::path::append(P, DarwinLibName);
+  std::string P = getCompilerRT(
+      Args, Component, IsShared ? ToolChain::FT_Shared : ToolChain::FT_Static);
 
   // For now, allow missing resource libraries to support developers who may
   // not have compiler-rt checked out or integrated into their build (unless
@@ -1303,20 +1288,58 @@ void MachO::AddLinkRuntimeLib(const ArgList &Args, ArgStringList &CmdArgs,
   // rpaths. This is currently true from this place, but we need to be
   // careful if this function is ever called before user's rpaths are emitted.
   if (Opts & RLO_AddRPath) {
-    assert(DarwinLibName.ends_with(".dylib") && "must be a dynamic library");
+    assert(StringRef(P).ends_with(".dylib") && "must be a dynamic library");
 
     // Add @executable_path to rpath to support having the dylib copied with
     // the executable.
     CmdArgs.push_back("-rpath");
     CmdArgs.push_back("@executable_path");
 
-    // Add the path to the resource dir to rpath to support using the dylib
-    // from the default location without copying.
+    // Add the compiler-rt library's directory to rpath to support using the
+    // dylib from the default location without copying.
     CmdArgs.push_back("-rpath");
-    CmdArgs.push_back(Args.MakeArgString(Dir));
+    CmdArgs.push_back(Args.MakeArgString(llvm::sys::path::parent_path(P)));
   }
 }
 
+std::string MachO::getCompilerRT(const ArgList &, StringRef Component,
+                                 FileType Type) const {
+  assert(Type != ToolChain::FT_Object &&
+         "it doesn't make sense to ask for the compiler-rt library name as an "
+         "object file");
+  SmallString<64> MachOLibName = StringRef("libclang_rt");
+  // On MachO, the builtins component is not in the library name
+  if (Component != "builtins") {
+    MachOLibName += '.';
+    MachOLibName += Component;
+  }
+  MachOLibName += Type == ToolChain::FT_Shared ? "_dynamic.dylib" : ".a";
+
+  SmallString<128> FullPath(getDriver().ResourceDir);
+  llvm::sys::path::append(FullPath, "lib", "darwin", "macho_embedded",
+                          MachOLibName);
+  return std::string(FullPath);
+}
+
+std::string Darwin::getCompilerRT(const ArgList &, StringRef Component,
+                                  FileType Type) const {
+  assert(Type != ToolChain::FT_Object &&
+         "it doesn't make sense to ask for the compiler-rt library name as an "
+         "object file");
+  SmallString<64> DarwinLibName = StringRef("libclang_rt.");
+  // On Darwin, the builtins component is not in the library name
+  if (Component != "builtins") {
+    DarwinLibName += Component;
+    DarwinLibName += '_';
+  }
+  DarwinLibName += getOSLibraryNameSuffix();
+  DarwinLibName += Type == ToolChain::FT_Shared ? "_dynamic.dylib" : ".a";
+
+  SmallString<128> FullPath(getDriver().ResourceDir);
+  llvm::sys::path::append(FullPath, "lib", "darwin", DarwinLibName);
+  return std::string(FullPath);
+}
+
 StringRef Darwin::getPlatformFamily() const {
   switch (TargetPlatform) {
     case DarwinPlatformKind::MacOS:
@@ -3013,6 +3036,35 @@ void Darwin::addClangTargetOptions(
   if (!DriverArgs.hasArgNoClaim(options::OPT_fdefine_target_os_macros,
                                 options::OPT_fno_define_target_os_macros))
     CC1Args.push_back("-fdefine-target-os-macros");
+
+  // Disable subdirectory modulemap search on sufficiently recent SDKs.
+  if (SDKInfo &&
+      !DriverArgs.hasFlag(options::OPT_fmodulemap_allow_subdirectory_search,
+                          options::OPT_fno_modulemap_allow_subdirectory_search,
+                          false)) {
+    bool RequiresSubdirectorySearch;
+    VersionTuple SDKVersion = SDKInfo->getVersion();
+    switch (TargetPlatform) {
+    default:
+      RequiresSubdirectorySearch = true;
+      break;
+    case MacOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(15, 0);
+      break;
+    case IPhoneOS:
+    case TvOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(18, 0);
+      break;
+    case WatchOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(11, 0);
+      break;
+    case XROS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(2, 0);
+      break;
+    }
+    if (!RequiresSubdirectorySearch)
+      CC1Args.push_back("-fno-modulemap-allow-subdirectory-search");
+  }
 }
 
 void Darwin::addClangCC1ASTargetOptions(
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index b45279ecedeb2..2e55b49682a7e 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -223,6 +223,13 @@ class LLVM_LIBRARY_VISIBILITY MachO : public ToolChain {
     // There aren't any profiling libs for embedded targets currently.
   }
 
+  // Return the full path of the compiler-rt library on a non-Darwin MachO
+  // system. Those are under
+  // <resourcedir>/lib/darwin/macho_embedded/<...>(.dylib|.a).
+  std::string
+  getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component,
+                FileType Type = ToolChain::FT_Static) const override;
+
   /// }
   /// @name ToolChain Implementation
   /// {
@@ -356,6 +363,12 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public MachO {
   void addProfileRTLibs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const override;
 
+  // Return the full path of the compiler-rt library on a Darwin MachO system.
+  // Those are under <resourcedir>/lib/darwin/<...>(.dylib|.a).
+  std::string
+  getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component,
+                FileType Type = ToolChain::FT_Static) const override;
+
 protected:
   /// }
   /// @name Darwin specific Toolchain functions
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index c4f2375c64034..492d5a3021d9b 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -342,6 +342,9 @@ void Flang::AddAMDGPUTargetArgs(const ArgList &Args,
     StringRef Val = A->getValue();
     CmdArgs.push_back(Args.MakeArgString("-mcode-object-version=" + Val));
   }
+
+  const ToolChain &TC = getToolChain();
+  TC.addClangTargetOptions(Args, CmdArgs, Action::OffloadKind::OFK_OpenMP);
 }
 
 void Flang::addTargetOptions(const ArgList &Args,
@@ -492,6 +495,8 @@ void Flang::addOffloadOptions(Compilation &C, const InputInfoList &Inputs,
     if (Args.hasArg(options::OPT_nogpulib))
       CmdArgs.push_back("-nogpulib");
   }
+
+  addOpenMPHostOffloadingArgs(C, JA, Args, CmdArgs);
 }
 
 static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
index 12b3b99df7ca1..29781399cbab4 100644
--- a/clang/lib/Driver/ToolChains/Hexagon.cpp
+++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
@@ -366,11 +366,14 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA,
                               options::OPT_t, options::OPT_u_Group});
     AddLinkerInputs(HTC, Inputs, Args, CmdArgs, JA);
 
+    ToolChain::UnwindLibType UNW = HTC.GetUnwindLibType(Args);
+
     if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
       if (NeedsSanitizerDeps) {
         linkSanitizerRuntimeDeps(HTC, Args, CmdArgs);
 
-        CmdArgs.push_back("-lunwind");
+        if (UNW != ToolChain::UNW_None)
+          CmdArgs.push_back("-lunwind");
       }
       if (NeedsXRayDeps)
         linkXRayRuntimeDeps(HTC, Args, CmdArgs);
@@ -618,13 +621,24 @@ HexagonToolChain::~HexagonToolChain() {}
 void HexagonToolChain::AddCXXStdlibLibArgs(const ArgList &Args,
                                            ArgStringList &CmdArgs) const {
   CXXStdlibType Type = GetCXXStdlibType(Args);
+  ToolChain::UnwindLibType UNW = GetUnwindLibType(Args);
+  if (UNW != ToolChain::UNW_None && UNW != ToolChain::UNW_CompilerRT) {
+    const Arg *A = Args.getLastArg(options::OPT_unwindlib_EQ);
+    if (A) {
+      getDriver().Diag(diag::err_drv_unsupported_unwind_for_platform)
+          << A->getValue() << getTriple().normalize();
+      return;
+    }
+  }
+
   switch (Type) {
   case ToolChain::CST_Libcxx:
     CmdArgs.push_back("-lc++");
     if (Args.hasArg(options::OPT_fexperimental_library))
       CmdArgs.push_back("-lc++experimental");
     CmdArgs.push_back("-lc++abi");
-    CmdArgs.push_back("-lunwind");
+    if (UNW != ToolChain::UNW_None)
+      CmdArgs.push_back("-lunwind");
     break;
 
   case ToolChain::CST_Libstdcxx:
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 4fede7dec75e9..314101091835b 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -86,6 +86,9 @@ std::string Linux::getMultiarchTriple(const Driver &D,
   case llvm::Triple::aarch64:
     if (IsAndroid)
       return "aarch64-linux-android";
+    if (hasEffectiveTriple() &&
+        getEffectiveTriple().getEnvironment() == llvm::Triple::PAuthTest)
+      return "aarch64-linux-pauthtest";
     return "aarch64-linux-gnu";
   case llvm::Triple::aarch64_be:
     return "aarch64_be-linux-gnu";
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 974e486a0082b..a9e612c44da06 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -152,52 +152,38 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Output.getFilename());
   }
 
-  const bool UseLTO = D.isUsingLTO();
   const bool UseJMC =
       Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false);
 
   const char *LTOArgs = "";
-  auto AddCodeGenFlag = [&](Twine Flag) {
+  auto AddLTOFlag = [&](Twine Flag) {
     LTOArgs = Args.MakeArgString(Twine(LTOArgs) + " " + Flag);
   };
 
-  if (UseLTO) {
-    // We default to creating the arange section, but LTO does not. Enable it
-    // here.
-    AddCodeGenFlag("-generate-arange-section");
+  // If the linker sees bitcode objects it will perform LTO. We can't tell
+  // whether or not that will be the case at this point. So, unconditionally
+  // pass LTO options to ensure proper codegen, metadata production, etc if
+  // LTO indeed occurs.
+  if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto,
+                   true))
+    CmdArgs.push_back(D.getLTOMode() == LTOK_Thin ? "--lto=thin"
+                                                  : "--lto=full");
+  if (UseJMC)
+    AddLTOFlag("-enable-jmc-instrument");
 
-    // This tells LTO to perform JustMyCode instrumentation.
-    if (UseJMC)
-      AddCodeGenFlag("-enable-jmc-instrument");
+  if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
+    AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
 
-    if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
-      AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+  if (StringRef Threads = getLTOParallelism(Args, D); !Threads.empty())
+    AddLTOFlag(Twine("-threads=") + Threads);
 
-    StringRef Parallelism = getLTOParallelism(Args, D);
-    if (!Parallelism.empty())
-      AddCodeGenFlag(Twine("-threads=") + Parallelism);
-
-    const char *Prefix = nullptr;
-    if (D.getLTOMode() == LTOK_Thin)
-      Prefix = "-lto-thin-debug-options=";
-    else if (D.getLTOMode() == LTOK_Full)
-      Prefix = "-lto-debug-options=";
-    else
-      llvm_unreachable("new LTO mode?");
-
-    CmdArgs.push_back(Args.MakeArgString(Twine(Prefix) + LTOArgs));
-  }
+  if (*LTOArgs)
+    CmdArgs.push_back(
+        Args.MakeArgString(Twine("-lto-debug-options=") + LTOArgs));
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
     TC.addSanitizerArgs(Args, CmdArgs, "-l", "");
 
-  if (D.isUsingLTO() && Args.hasArg(options::OPT_funified_lto)) {
-    if (D.getLTOMode() == LTOK_Thin)
-      CmdArgs.push_back("--lto=thin");
-    else if (D.getLTOMode() == LTOK_Full)
-      CmdArgs.push_back("--lto=full");
-  }
-
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
                             options::OPT_s, options::OPT_t});
 
@@ -263,41 +249,38 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Output.getFilename());
   }
 
-  const bool UseLTO = D.isUsingLTO();
   const bool UseJMC =
       Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false);
 
-  auto AddCodeGenFlag = [&](Twine Flag) {
+  auto AddLTOFlag = [&](Twine Flag) {
     CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=") + Flag));
   };
 
-  if (UseLTO) {
-    // We default to creating the arange section, but LTO does not. Enable it
-    // here.
-    AddCodeGenFlag("-generate-arange-section");
+  // If the linker sees bitcode objects it will perform LTO. We can't tell
+  // whether or not that will be the case at this point. So, unconditionally
+  // pass LTO options to ensure proper codegen, metadata production, etc if
+  // LTO indeed occurs.
+  if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto,
+                   true))
+    CmdArgs.push_back(D.getLTOMode() == LTOK_Thin ? "--lto=thin"
+                                                  : "--lto=full");
 
-    // This tells LTO to perform JustMyCode instrumentation.
-    if (UseJMC)
-      AddCodeGenFlag("-enable-jmc-instrument");
+  if (UseJMC)
+    AddLTOFlag("-enable-jmc-instrument");
 
-    if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
-      AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+  if (Args.hasFlag(options::OPT_fstack_size_section,
+                   options::OPT_fno_stack_size_section, false))
+    AddLTOFlag("-stack-size-section");
 
-    StringRef Parallelism = getLTOParallelism(Args, D);
-    if (!Parallelism.empty())
-      CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=jobs=") + Parallelism));
-  }
+  if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
+    AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+
+  if (StringRef Jobs = getLTOParallelism(Args, D); !Jobs.empty())
+    AddLTOFlag(Twine("jobs=") + Jobs);
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
     TC.addSanitizerArgs(Args, CmdArgs, "-l", "");
 
-  if (D.isUsingLTO() && Args.hasArg(options::OPT_funified_lto)) {
-    if (D.getLTOMode() == LTOK_Thin)
-      CmdArgs.push_back("--lto=thin");
-    else if (D.getLTOMode() == LTOK_Full)
-      CmdArgs.push_back("--lto=full");
-  }
-
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
                             options::OPT_s, options::OPT_t});
 
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.h b/clang/lib/Driver/ToolChains/PS4CPU.h
index 0be90183c637c..a33728bce5186 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.h
+++ b/clang/lib/Driver/ToolChains/PS4CPU.h
@@ -179,7 +179,7 @@ class LLVM_LIBRARY_VISIBILITY PS5CPU : public PS4PS5Base {
                         llvm::opt::ArgStringList &CmdArgs, const char *Prefix,
                         const char *Suffix) const override;
   const char *getProfileRTLibName() const override {
-    return "libclang_rt.profile-x86_64_nosubmission.a";
+    return "libclang_rt.profile_nosubmission.a";
   }
 
 protected:
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index 60bd97e0ee987..9aacda5fd5702 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -61,6 +61,13 @@ std::string wasm::Linker::getLinkerPath(const ArgList &Args) const {
   return ToolChain.GetProgramPath(ToolChain.getDefaultLinker());
 }
 
+static bool TargetBuildsComponents(const llvm::Triple &TargetTriple) {
+  // WASIp2 and above are all based on components, so test for WASI but exclude
+  // the original `wasi` target in addition to the `wasip1` name.
+  return TargetTriple.isOSWASI() && TargetTriple.getOSName() != "wasip1" &&
+         TargetTriple.getOSName() != "wasi";
+}
+
 void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
@@ -158,46 +165,52 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
-  if (Args.hasFlag(options::OPT_wasm_opt, options::OPT_no_wasm_opt, true)) {
-    // When optimizing, if wasm-opt is available, run it.
-    std::string WasmOptPath;
-    if (Args.getLastArg(options::OPT_O_Group)) {
-      WasmOptPath = ToolChain.GetProgramPath("wasm-opt");
-      if (WasmOptPath == "wasm-opt") {
-        WasmOptPath = {};
-      }
+  // Don't use wasm-opt by default on `wasip2` as it doesn't have support for
+  // components at this time. Retain the historical default otherwise, though,
+  // of running `wasm-opt` by default.
+  bool WasmOptDefault = !TargetBuildsComponents(ToolChain.getTriple());
+  bool RunWasmOpt = Args.hasFlag(options::OPT_wasm_opt,
+                                 options::OPT_no_wasm_opt, WasmOptDefault);
+
+  // If wasm-opt is enabled and optimizations are happening look for the
+  // `wasm-opt` program. If it's not found auto-disable it.
+  std::string WasmOptPath;
+  if (RunWasmOpt && Args.getLastArg(options::OPT_O_Group)) {
+    WasmOptPath = ToolChain.GetProgramPath("wasm-opt");
+    if (WasmOptPath == "wasm-opt") {
+      WasmOptPath = {};
     }
+  }
 
-    if (!WasmOptPath.empty()) {
-      CmdArgs.push_back("--keep-section=target_features");
-    }
+  if (!WasmOptPath.empty()) {
+    CmdArgs.push_back("--keep-section=target_features");
+  }
 
-    C.addCommand(std::make_unique<Command>(JA, *this,
-                                           ResponseFileSupport::AtFileCurCP(),
-                                           Linker, CmdArgs, Inputs, Output));
-
-    if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
-      if (!WasmOptPath.empty()) {
-        StringRef OOpt = "s";
-        if (A->getOption().matches(options::OPT_O4) ||
-            A->getOption().matches(options::OPT_Ofast))
-          OOpt = "4";
-        else if (A->getOption().matches(options::OPT_O0))
-          OOpt = "0";
-        else if (A->getOption().matches(options::OPT_O))
-          OOpt = A->getValue();
-
-        if (OOpt != "0") {
-          const char *WasmOpt = Args.MakeArgString(WasmOptPath);
-          ArgStringList OptArgs;
-          OptArgs.push_back(Output.getFilename());
-          OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
-          OptArgs.push_back("-o");
-          OptArgs.push_back(Output.getFilename());
-          C.addCommand(std::make_unique<Command>(
-              JA, *this, ResponseFileSupport::AtFileCurCP(), WasmOpt, OptArgs,
-              Inputs, Output));
-        }
+  C.addCommand(std::make_unique<Command>(JA, *this,
+                                         ResponseFileSupport::AtFileCurCP(),
+                                         Linker, CmdArgs, Inputs, Output));
+
+  if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
+    if (!WasmOptPath.empty()) {
+      StringRef OOpt = "s";
+      if (A->getOption().matches(options::OPT_O4) ||
+          A->getOption().matches(options::OPT_Ofast))
+        OOpt = "4";
+      else if (A->getOption().matches(options::OPT_O0))
+        OOpt = "0";
+      else if (A->getOption().matches(options::OPT_O))
+        OOpt = A->getValue();
+
+      if (OOpt != "0") {
+        const char *WasmOpt = Args.MakeArgString(WasmOptPath);
+        ArgStringList OptArgs;
+        OptArgs.push_back(Output.getFilename());
+        OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
+        OptArgs.push_back("-o");
+        OptArgs.push_back(Output.getFilename());
+        C.addCommand(std::make_unique<Command>(
+            JA, *this, ResponseFileSupport::AtFileCurCP(), WasmOpt, OptArgs,
+            Inputs, Output));
       }
     }
   }
@@ -241,7 +254,7 @@ WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple,
 }
 
 const char *WebAssembly::getDefaultLinker() const {
-  if (getOS() == "wasip2")
+  if (TargetBuildsComponents(getTriple()))
     return "wasm-component-ld";
   return "wasm-ld";
 }
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index b07360425ca6e..df86a774ba0f4 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -803,6 +803,37 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
     return !Tok.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
                                   tok::kw_switch);
   };
+  auto IsFunctionCallParen = [](const FormatToken &Tok) {
+    return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
+           Tok.Previous->is(tok::identifier);
+  };
+  const auto IsInTemplateString = [this](const FormatToken &Tok) {
+    if (!Style.isJavaScript())
+      return false;
+    for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) {
+      if (Prev->is(TT_TemplateString) && Prev->opensScope())
+        return true;
+      if (Prev->is(TT_TemplateString) && Prev->closesScope())
+        break;
+    }
+    return false;
+  };
+  // Identifies simple (no expression) one-argument function calls.
+  const auto IsSimpleFunction = [&](const FormatToken &Tok) {
+    if (!Tok.FakeLParens.empty() && Tok.FakeLParens.back() > prec::Unknown)
+      return false;
+    const auto *Previous = Tok.Previous;
+    if (!Previous || (!Previous->isOneOf(TT_FunctionDeclarationLParen,
+                                         TT_LambdaDefinitionLParen) &&
+                      !IsFunctionCallParen(*Previous))) {
+      return true;
+    }
+    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok))
+      return true;
+    const auto *Next = Tok.Next;
+    return !Next || Next->isMemberAccess() ||
+           Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next);
+  };
   if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak ||
        Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) &&
       IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
@@ -813,10 +844,10 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       //       caaaaaaaaaaaall(
       //           caaaaaaaaaaaall(
       //               caaaaaaaaaaaaaaaaaaaaaaall(aaaaaaaaaaaaaa, aaaaaaaaa))));
-      Current.FakeLParens.size() > 0 &&
-      Current.FakeLParens.back() > prec::Unknown) {
+      !IsSimpleFunction(Current)) {
     CurrentState.NoLineBreak = true;
   }
+
   if (Previous.is(TT_TemplateString) && Previous.opensScope())
     CurrentState.NoLineBreak = true;
 
@@ -831,7 +862,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       Previous.isNot(TT_TableGenDAGArgOpenerToBreak) &&
       !(Current.MacroParent && Previous.MacroParent) &&
       (Current.isNot(TT_LineComment) ||
-       Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen))) {
+       Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) &&
+      !IsInTemplateString(Current)) {
     CurrentState.Indent = State.Column + Spaces;
     CurrentState.IsAligned = true;
   }
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index cc45d5a8c5c1e..abcedb66b57cc 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -102,6 +102,7 @@ namespace format {
   TYPE(JsTypeColon)                                                            \
   TYPE(JsTypeOperator)                                                         \
   TYPE(JsTypeOptionalQuestion)                                                 \
+  TYPE(LambdaDefinitionLParen)                                                 \
   TYPE(LambdaLBrace)                                                           \
   TYPE(LambdaLSquare)                                                          \
   TYPE(LeadingJavaAnnotation)                                                  \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index b6d6e52ccb8f8..8cd5cf2484160 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -62,6 +62,7 @@ static bool canBeObjCSelectorComponent(const FormatToken &Tok) {
 
 /// With `Left` being '(', check if we're at either `[...](` or
 /// `[...]<...>(`, where the [ opens a lambda capture list.
+// FIXME: this doesn't cover attributes/constraints before the l_paren.
 static bool isLambdaParameterList(const FormatToken *Left) {
   // Skip <...> if present.
   if (Left->Previous && Left->Previous->is(tok::greater) &&
@@ -154,8 +155,8 @@ class AnnotatingParser {
     if (NonTemplateLess.count(CurrentToken->Previous) > 0)
       return false;
 
-    const FormatToken &Previous = *CurrentToken->Previous; // The '<'.
-    if (Previous.Previous) {
+    if (const auto &Previous = *CurrentToken->Previous; // The '<'.
+        Previous.Previous) {
       if (Previous.Previous->Tok.isLiteral())
         return false;
       if (Previous.Previous->is(tok::r_brace))
@@ -175,11 +176,13 @@ class AnnotatingParser {
     FormatToken *Left = CurrentToken->Previous;
     Left->ParentBracket = Contexts.back().ContextKind;
     ScopedContextCreator ContextCreator(*this, tok::less, 12);
-
     Contexts.back().IsExpression = false;
+
+    const auto *BeforeLess = Left->Previous;
+
     // If there's a template keyword before the opening angle bracket, this is a
     // template parameter, not an argument.
-    if (Left->Previous && Left->Previous->isNot(tok::kw_template))
+    if (BeforeLess && BeforeLess->isNot(tok::kw_template))
       Contexts.back().ContextType = Context::TemplateArgument;
 
     if (Style.Language == FormatStyle::LK_Java &&
@@ -187,19 +190,24 @@ class AnnotatingParser {
       next();
     }
 
-    while (CurrentToken) {
+    for (bool SeenTernaryOperator = false; CurrentToken;) {
+      const bool InExpr = Contexts[Contexts.size() - 2].IsExpression;
       if (CurrentToken->is(tok::greater)) {
+        const auto *Next = CurrentToken->Next;
         // Try to do a better job at looking for ">>" within the condition of
         // a statement. Conservatively insert spaces between consecutive ">"
         // tokens to prevent splitting right bitshift operators and potentially
         // altering program semantics. This check is overly conservative and
         // will prevent spaces from being inserted in select nested template
         // parameter cases, but should not alter program semantics.
-        if (CurrentToken->Next && CurrentToken->Next->is(tok::greater) &&
+        if (Next && Next->is(tok::greater) &&
             Left->ParentBracket != tok::less &&
             CurrentToken->getStartOfNonWhitespace() ==
-                CurrentToken->Next->getStartOfNonWhitespace().getLocWithOffset(
-                    -1)) {
+                Next->getStartOfNonWhitespace().getLocWithOffset(-1)) {
+          return false;
+        }
+        if (InExpr && SeenTernaryOperator &&
+            (!Next || !Next->isOneOf(tok::l_paren, tok::l_brace))) {
           return false;
         }
         Left->MatchingParen = CurrentToken;
@@ -210,14 +218,14 @@ class AnnotatingParser {
         //   msg: < item: data >
         // In TT_TextProto, map<key, value> does not occur.
         if (Style.Language == FormatStyle::LK_TextProto ||
-            (Style.Language == FormatStyle::LK_Proto && Left->Previous &&
-             Left->Previous->isOneOf(TT_SelectorName, TT_DictLiteral))) {
+            (Style.Language == FormatStyle::LK_Proto && BeforeLess &&
+             BeforeLess->isOneOf(TT_SelectorName, TT_DictLiteral))) {
           CurrentToken->setType(TT_DictLiteral);
         } else {
           CurrentToken->setType(TT_TemplateCloser);
           CurrentToken->Tok.setLength(1);
         }
-        if (CurrentToken->Next && CurrentToken->Next->Tok.isLiteral())
+        if (Next && Next->Tok.isLiteral())
           return false;
         next();
         return true;
@@ -229,18 +237,21 @@ class AnnotatingParser {
       }
       if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace))
         return false;
+      const auto &Prev = *CurrentToken->Previous;
       // If a && or || is found and interpreted as a binary operator, this set
       // of angles is likely part of something like "a < b && c > d". If the
       // angles are inside an expression, the ||/&& might also be a binary
       // operator that was misinterpreted because we are parsing template
       // parameters.
       // FIXME: This is getting out of hand, write a decent parser.
-      if (CurrentToken->Previous->isOneOf(tok::pipepipe, tok::ampamp) &&
-          CurrentToken->Previous->is(TT_BinaryOperator) &&
-          Contexts[Contexts.size() - 2].IsExpression &&
-          !Line.startsWith(tok::kw_template)) {
-        return false;
+      if (InExpr && !Line.startsWith(tok::kw_template) &&
+          Prev.is(TT_BinaryOperator)) {
+        const auto Precedence = Prev.getPrecedence();
+        if (Precedence > prec::Conditional && Precedence < prec::Relational)
+          return false;
       }
+      if (Prev.is(TT_ConditionalExpr))
+        SeenTernaryOperator = true;
       updateParameterCount(Left, CurrentToken);
       if (Style.Language == FormatStyle::LK_Proto) {
         if (FormatToken *Previous = CurrentToken->getPreviousNonComment()) {
@@ -365,6 +376,7 @@ class AnnotatingParser {
       Contexts.back().IsExpression = false;
     } else if (isLambdaParameterList(&OpeningParen)) {
       // This is a parameter list of a lambda expression.
+      OpeningParen.setType(TT_LambdaDefinitionLParen);
       Contexts.back().IsExpression = false;
     } else if (OpeningParen.is(TT_RequiresExpressionLParen)) {
       Contexts.back().IsExpression = false;
@@ -2458,9 +2470,9 @@ class AnnotatingParser {
         Current.setType(TT_CastRParen);
       if (Current.MatchingParen && Current.Next &&
           !Current.Next->isBinaryOperator() &&
-          !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace,
-                                 tok::comma, tok::period, tok::arrow,
-                                 tok::coloncolon, tok::kw_noexcept)) {
+          !Current.Next->isOneOf(
+              tok::semi, tok::colon, tok::l_brace, tok::l_paren, tok::comma,
+              tok::period, tok::arrow, tok::coloncolon, tok::kw_noexcept)) {
         if (FormatToken *AfterParen = Current.MatchingParen->Next;
             AfterParen && AfterParen->isNot(tok::caret)) {
           // Make sure this isn't the return type of an Obj-C block declaration.
@@ -2616,8 +2628,10 @@ class AnnotatingParser {
       return false;
 
     // int a or auto a.
-    if (PreviousNotConst->isOneOf(tok::identifier, tok::kw_auto))
+    if (PreviousNotConst->isOneOf(tok::identifier, tok::kw_auto) &&
+        PreviousNotConst->isNot(TT_StatementAttributeLikeMacro)) {
       return true;
+    }
 
     // *a or &a or &&a.
     if (PreviousNotConst->is(TT_PointerOrReference))
@@ -6194,6 +6208,12 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
     return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf()));
   }
 
+  if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) &&
+      Right.is(TT_TrailingAnnotation) &&
+      Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) {
+    return false;
+  }
+
   // Allow breaking after a trailing annotation, e.g. after a method
   // declaration.
   if (Left.is(TT_TrailingAnnotation)) {
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index d406a531a5c0c..ed056b9dd5b75 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2554,7 +2554,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         const bool DoubleParens =
             Prev && Prev->is(tok::l_paren) && Next && Next->is(tok::r_paren);
         const auto *PrevPrev = Prev ? Prev->getPreviousNonComment() : nullptr;
-        const bool Blacklisted =
+        const bool Disallowed =
             PrevPrev &&
             (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) ||
              (SeenEqual &&
@@ -2566,7 +2566,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
              (!NestedLambdas.empty() && !NestedLambdas.back())) &&
             Prev && Prev->isOneOf(tok::kw_return, tok::kw_co_return) && Next &&
             Next->is(tok::semi);
-        if ((DoubleParens && !Blacklisted) || ReturnParens) {
+        if ((DoubleParens && !Disallowed) || ReturnParens) {
           LeftParen->Optional = true;
           FormatTok->Optional = true;
         }
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d2e8f9b47c06f..da52cddf5bb06 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1495,20 +1495,30 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
         Key::ASDA, LangOpts.PointerAuthVTPtrAddressDiscrimination,
         LangOpts.PointerAuthVTPtrTypeDiscrimination ? Discrimination::Type
                                                     : Discrimination::None);
-    Opts.CXXTypeInfoVTablePointer =
-        PointerAuthSchema(Key::ASDA, false, Discrimination::None);
+
+    if (LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
+      Opts.CXXTypeInfoVTablePointer =
+          PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
+                            StdTypeInfoVTablePointerConstantDiscrimination);
+    else
+      Opts.CXXTypeInfoVTablePointer =
+          PointerAuthSchema(Key::ASDA, false, Discrimination::None);
+
     Opts.CXXVTTVTablePointers =
         PointerAuthSchema(Key::ASDA, false, Discrimination::None);
     Opts.CXXVirtualFunctionPointers = Opts.CXXVirtualVariadicFunctionPointers =
         PointerAuthSchema(Key::ASIA, true, Discrimination::Decl);
+    Opts.CXXMemberFunctionPointers =
+        PointerAuthSchema(Key::ASIA, false, Discrimination::Type);
   }
+  Opts.IndirectGotos = LangOpts.PointerAuthIndirectGotos;
 }
 
 static void parsePointerAuthOptions(PointerAuthOptions &Opts,
                                     const LangOptions &LangOpts,
                                     const llvm::Triple &Triple,
                                     DiagnosticsEngine &Diags) {
-  if (!LangOpts.PointerAuthCalls)
+  if (!LangOpts.PointerAuthCalls && !LangOpts.PointerAuthIndirectGotos)
     return;
 
   CompilerInvocation::setDefaultPointerAuthOptions(Opts, LangOpts, Triple);
@@ -3421,12 +3431,17 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fptrauth_calls);
   if (Opts.PointerAuthReturns)
     GenerateArg(Consumer, OPT_fptrauth_returns);
+  if (Opts.PointerAuthIndirectGotos)
+    GenerateArg(Consumer, OPT_fptrauth_indirect_gotos);
   if (Opts.PointerAuthAuthTraps)
     GenerateArg(Consumer, OPT_fptrauth_auth_traps);
   if (Opts.PointerAuthVTPtrAddressDiscrimination)
     GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_address_discrimination);
   if (Opts.PointerAuthVTPtrTypeDiscrimination)
     GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_type_discrimination);
+  if (Opts.PointerAuthTypeInfoVTPtrDiscrimination)
+    GenerateArg(Consumer, OPT_fptrauth_type_info_vtable_pointer_discrimination);
+
   if (Opts.PointerAuthInitFini)
     GenerateArg(Consumer, OPT_fptrauth_init_fini);
   if (Opts.PointerAuthFunctionTypeDiscrimination)
@@ -3438,11 +3453,15 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthIntrinsics = Args.hasArg(OPT_fptrauth_intrinsics);
   Opts.PointerAuthCalls = Args.hasArg(OPT_fptrauth_calls);
   Opts.PointerAuthReturns = Args.hasArg(OPT_fptrauth_returns);
+  Opts.PointerAuthIndirectGotos = Args.hasArg(OPT_fptrauth_indirect_gotos);
   Opts.PointerAuthAuthTraps = Args.hasArg(OPT_fptrauth_auth_traps);
   Opts.PointerAuthVTPtrAddressDiscrimination =
       Args.hasArg(OPT_fptrauth_vtable_pointer_address_discrimination);
   Opts.PointerAuthVTPtrTypeDiscrimination =
       Args.hasArg(OPT_fptrauth_vtable_pointer_type_discrimination);
+  Opts.PointerAuthTypeInfoVTPtrDiscrimination =
+      Args.hasArg(OPT_fptrauth_type_info_vtable_pointer_discrimination);
+
   Opts.PointerAuthInitFini = Args.hasArg(OPT_fptrauth_init_fini);
   Opts.PointerAuthFunctionTypeDiscrimination =
       Args.hasArg(OPT_fptrauth_function_pointer_type_discrimination);
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 96a96289c35ef..87170fa26e631 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1346,7 +1346,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   if (!LangOpts.MathErrno)
     Builder.defineMacro("__NO_MATH_ERRNO__");
 
-  if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
+  if (LangOpts.FastMath || (LangOpts.NoHonorInfs && LangOpts.NoHonorNaNs))
     Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
   else
     Builder.defineMacro("__FINITE_MATH_ONLY__", "0");
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 00549416ae811..6d413cbd07c8f 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -1016,7 +1016,6 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
         *Callbacks->OS << static_cast<unsigned>(*Iter);
         PrintComma = true;
       }
-      IsStartOfLine = true;
     } else if (Tok.isAnnotation()) {
       // Ignore annotation tokens created by pragmas - the pragmas themselves
       // will be reproduced in the preprocessed output.
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e85bfc47aa5cc..a3176570a468f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -52,9 +52,12 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@@ -1486,8 +1489,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
 }
 
 /// Converts the two double-precision floating-point elements of a
@@ -1505,8 +1508,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
 }
 
 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1520,8 +1523,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
+  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
 }
 
 /// Returns the low-order element of a 128-bit vector of [2 x double] as
@@ -2108,9 +2111,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
 /// \param __b
 ///    A 64-bit integer.
 /// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
+  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
 }
 
 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@@ -2431,9 +2433,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
-                                                            __m64 __b) {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
+                                             (__v4si)__anyext128(__b)));
 }
 
 /// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2539,9 +2541,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
+  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
 }
 
 /// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4889,8 +4890,10 @@ void _mm_pause(void);
 #if defined(__cplusplus)
 } // extern "C"
 #endif
+
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 09f26a4588c14..6d86d278bc711 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -145,106 +145,106 @@ float4 acos(float4);
 
 #ifdef __HLSL_ENABLE_16_BIT
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int16_t);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int16_t2);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int16_t3);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int16_t4);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint16_t);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint16_t2);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint16_t3);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint16_t4);
 #endif
 
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(half);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(half2);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(half3);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(half4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(bool);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(bool2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(bool3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(bool4);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(float);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(float2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(float3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(float4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(int64_t4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(uint64_t4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(double);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(double2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(double3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_all)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_all)
 bool all(double4);
 
 //===----------------------------------------------------------------------===//
@@ -257,106 +257,106 @@ bool all(double4);
 
 #ifdef __HLSL_ENABLE_16_BIT
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int16_t);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int16_t2);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int16_t3);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int16_t4);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint16_t);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint16_t2);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint16_t3);
 _HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint16_t4);
 #endif
 
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(half);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(half2);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(half3);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(half4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(bool);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(bool2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(bool3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(bool4);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(float);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(float2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(float3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(float4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(int64_t4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(uint64_t4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(double);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(double2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(double3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_any)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_any)
 bool any(double4);
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 4e154e2d85935..9d1e135be63be 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -21,10 +21,33 @@ typedef int __v2si __attribute__((__vector_size__(8)));
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
+/* Unsigned types */
+typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
+typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v8qs __attribute__((__vector_size__(8)));
+
+/* SSE/SSE2 types */
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
-                 __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2                                                \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@@ -50,10 +73,10 @@ _mm_empty(void) {
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi32_si64(int __i)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+    return __extension__ (__m64)(__v2si){__i, 0};
 }
 
 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -67,10 +90,10 @@ _mm_cvtsi32_si64(int __i)
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_si32(__m64 __m)
 {
-    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+    return ((__v2si)__m)[0];
 }
 
 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -83,7 +106,7 @@ _mm_cvtsi64_si32(__m64 __m)
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
@@ -99,7 +122,7 @@ _mm_cvtsi64_m64(long long __i)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
@@ -124,10 +147,11 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_packsswb128(
+        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -149,10 +173,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+    return __trunc64(__builtin_ia32_packssdw128(
+        (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -174,10 +199,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_packuswb128(
+        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -201,10 +227,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
 ///    Bits [63:56] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+                                          4, 12, 5, 13, 6, 14, 7, 15);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -224,10 +251,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [63:48] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+                                          2, 6, 3, 7);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -245,10 +273,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -272,10 +300,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 ///    Bits [31:24] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+                                          0, 8, 1, 9, 2, 10, 3, 11);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -295,10 +324,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [31:16] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+                                          0, 4, 1, 5);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -316,10 +346,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
 }
 
 /// Adds each 8-bit integer element of the first 64-bit integer vector
@@ -337,10 +367,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
 }
 
 /// Adds each 16-bit integer element of the first 64-bit integer vector
@@ -358,10 +388,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
 }
 
 /// Adds each 32-bit integer element of the first 64-bit integer vector
@@ -379,10 +409,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
 }
 
 /// Adds, with saturation, each 8-bit signed integer element of the first
@@ -403,10 +433,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Adds, with saturation, each 16-bit signed integer element of the first
@@ -427,10 +457,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Adds, with saturation, each 8-bit unsigned integer element of the first
@@ -450,10 +480,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Adds, with saturation, each 16-bit unsigned integer element of the first
@@ -473,10 +503,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Subtracts each 8-bit integer element of the second 64-bit integer
@@ -494,10 +524,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
 }
 
 /// Subtracts each 16-bit integer element of the second 64-bit integer
@@ -515,10 +545,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
 }
 
 /// Subtracts each 32-bit integer element of the second 64-bit integer
@@ -536,10 +566,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
 }
 
 /// Subtracts, with saturation, each 8-bit signed integer element of the second
@@ -560,10 +590,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Subtracts, with saturation, each 16-bit signed integer element of the
@@ -584,10 +614,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
@@ -608,10 +638,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
@@ -632,10 +662,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -659,10 +689,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
 ///    products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
+                                               (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -680,10 +711,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
+                                              (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -701,10 +733,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
 }
 
 /// Left-shifts each 16-bit signed integer element of the first
@@ -724,10 +756,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
@@ -746,10 +779,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Left-shifts each 32-bit signed integer element of the first
@@ -769,10 +803,11 @@ _mm_slli_pi16(__m64 __m, int __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
@@ -791,10 +826,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Left-shifts the first 64-bit integer parameter by the number of bits
@@ -811,10 +847,11 @@ _mm_slli_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
+                                             (__v2di)__anyext128(__count)));
 }
 
 /// Left-shifts the first parameter, which is a 64-bit integer, by the
@@ -831,10 +868,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -855,10 +893,11 @@ _mm_slli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -878,10 +917,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -902,10 +942,11 @@ _mm_srai_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -925,10 +966,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -948,10 +990,11 @@ _mm_srai_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -970,10 +1013,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -993,10 +1037,11 @@ _mm_srli_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1015,10 +1060,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1035,10 +1081,11 @@ _mm_srli_pi32(__m64 __m, int __count)
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
+                                             (__v2di)__anyext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1056,10 +1103,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
+                                              __count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
@@ -1074,10 +1122,10 @@ _mm_srli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1095,10 +1143,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1113,10 +1161,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1131,10 +1179,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1153,10 +1201,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1175,10 +1223,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1197,10 +1245,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1219,10 +1267,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+  /* This function always performs a signed comparison, but __v8qi is a char
+     which may be signed or unsigned, so use __v8qs. */
+    return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1241,10 +1291,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1263,10 +1313,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
@@ -1276,7 +1326,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setzero_si64(void)
 {
     return __extension__ (__m64){ 0LL };
@@ -1297,10 +1347,10 @@ _mm_setzero_si64(void)
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi32(int __i1, int __i0)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+    return __extension__ (__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1320,10 +1370,10 @@ _mm_set_pi32(int __i1, int __i0)
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
-    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+    return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1351,12 +1401,12 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
 {
-    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
-                                               __b4, __b5, __b6, __b7);
+    return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
+                                         __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1372,7 +1422,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi32(int __i)
 {
     return _mm_set_pi32(__i, __i);
@@ -1391,7 +1441,7 @@ _mm_set1_pi32(int __i)
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi16(short __w)
 {
     return _mm_set_pi16(__w, __w, __w, __w);
@@ -1409,7 +1459,7 @@ _mm_set1_pi16(short __w)
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi8(char __b)
 {
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
@@ -1430,7 +1480,7 @@ _mm_set1_pi8(char __b)
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi32(int __i0, int __i1)
 {
     return _mm_set_pi32(__i1, __i0);
@@ -1453,7 +1503,7 @@ _mm_setr_pi32(int __i0, int __i1)
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
@@ -1484,14 +1534,16 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
 {
     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __anyext128
+#undef __trunc64
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Aliases for compatibility. */
 #define _m_empty _mm_empty
diff --git a/clang/lib/Headers/prfchwintrin.h b/clang/lib/Headers/prfchwintrin.h
index 8a13784543c5f..eaea5f3cf8feb 100644
--- a/clang/lib/Headers/prfchwintrin.h
+++ b/clang/lib/Headers/prfchwintrin.h
@@ -8,7 +8,7 @@
  */
 
 #if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
-#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> instead."
 #endif
 
 #ifndef __PRFCHWINTRIN_H
diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h
index 40ac6dcac2ab8..4724155b0dc79 100644
--- a/clang/lib/Headers/ptrauth.h
+++ b/clang/lib/Headers/ptrauth.h
@@ -58,6 +58,21 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
 /* Authenticating a pointer that was not signed with the given key
    and extra-data value will (likely) fail by trapping. */
 
+/* The null function pointer is always the all-zero bit pattern.
+   Signing an all-zero bit pattern will embed a (likely) non-zero
+   signature in the result, and so the result will not seem to be
+   a null function pointer.  Authenticating this value will yield
+   a null function pointer back.  However, authenticating an
+   all-zero bit pattern will probably fail, because the
+   authentication will expect a (likely) non-zero signature to
+   embedded in the value.
+
+   Because of this, if a pointer may validly be null, you should
+   check for null before attempting to authenticate it with one
+   of these intrinsics.  This is not necessary when using the
+   __ptrauth qualifier; the compiler will perform this check
+   automatically. */
+
 #if __has_feature(ptrauth_intrinsics)
 
 /* Strip the signature from a value without authenticating it.
@@ -187,6 +202,23 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
 #define ptrauth_string_discriminator(__string)                                 \
   __builtin_ptrauth_string_discriminator(__string)
 
+/* Compute a constant discriminator from the given type.
+
+   The result can be used as the second argument to
+   ptrauth_blend_discriminator or the third argument to the
+   __ptrauth qualifier.  It has type size_t.
+
+   If the type is a C++ member function pointer type, the result is
+   the discriminator used to signed member function pointers of that
+   type.  If the type is a function, function pointer, or function
+   reference type, the result is the discriminator used to sign
+   functions of that type.  It is ill-formed to use this macro with any
+   other type.
+
+   A call to this function is an integer constant expression. */
+#define ptrauth_type_discriminator(__type)                                     \
+  __builtin_ptrauth_type_discriminator(__type)
+
 /* Compute a signature for the given pair of pointer-sized values.
    The order of the arguments is significant.
 
@@ -274,6 +306,8 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
     ((ptrauth_extra_data_t)0);                                                 \
   })
 
+#define ptrauth_type_discriminator(__type) ((ptrauth_extra_data_t)0)
+
 #define ptrauth_sign_generic_data(__value, __data)                             \
   ({                                                                           \
     (void)__value;                                                             \
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 8292ab907becf..6203d7a600a23 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -20,19 +20,18 @@
  * modules.
  */
 #if defined(__MVS__) && __has_include_next(<stdarg.h>)
-#include <__stdarg_header_macro.h>
 #undef __need___va_list
 #undef __need_va_list
 #undef __need_va_arg
 #undef __need___va_copy
 #undef __need_va_copy
+#include <__stdarg_header_macro.h>
 #include_next <stdarg.h>
 
 #else
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
     !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
     !defined(__need_va_copy)
-#include <__stdarg_header_macro.h>
 #define __need___va_list
 #define __need_va_list
 #define __need_va_arg
@@ -45,6 +44,7 @@
     !defined(__STRICT_ANSI__)
 #define __need_va_copy
 #endif
+#include <__stdarg_header_macro.h>
 #endif
 
 #ifdef __need___va_list
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 2027055f38796..1991351f9e9ef 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -172,7 +172,11 @@ typedef _Atomic(uintmax_t)          atomic_uintmax_t;
 
 typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
 
+#ifdef __cplusplus
+#define ATOMIC_FLAG_INIT {false}
+#else
 #define ATOMIC_FLAG_INIT { 0 }
+#endif
 
 /* These should be provided by the libc implementation. */
 #ifdef __cplusplus
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index 8985c526e8fc5..99b275aebf5aa 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -20,7 +20,6 @@
  * modules.
  */
 #if defined(__MVS__) && __has_include_next(<stddef.h>)
-#include <__stddef_header_macro.h>
 #undef __need_ptrdiff_t
 #undef __need_size_t
 #undef __need_rsize_t
@@ -31,6 +30,7 @@
 #undef __need_max_align_t
 #undef __need_offsetof
 #undef __need_wint_t
+#include <__stddef_header_macro.h>
 #include_next <stddef.h>
 
 #else
@@ -40,7 +40,6 @@
     !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
     !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
     !defined(__need_offsetof) && !defined(__need_wint_t)
-#include <__stddef_header_macro.h>
 #define __need_ptrdiff_t
 #define __need_size_t
 /* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
@@ -49,7 +48,24 @@
 #define __need_rsize_t
 #endif
 #define __need_wchar_t
+#if !defined(__STDDEF_H) || __has_feature(modules)
+/*
+ * __stddef_null.h is special when building without modules: if __need_NULL is
+ * set, then it will unconditionally redefine NULL. To avoid stepping on client
+ * definitions of NULL, __need_NULL should only be set the first time this
+ * header is included, that is when __STDDEF_H is not defined. However, when
+ * building with modules, this header is a textual header and needs to
+ * unconditionally include __stdef_null.h to support multiple submodules
+ * exporting _Builtin_stddef.null. Take module SM with submodules A and B, whose
+ * headers both include stddef.h When SM.A builds, __STDDEF_H will be defined.
+ * When SM.B builds, the definition from SM.A will leak when building without
+ * local submodule visibility. stddef.h wouldn't include __stddef_null.h, and
+ * SM.B wouldn't import _Builtin_stddef.null, and SM.B's `export *` wouldn't
+ * export NULL as expected. When building with modules, always include
+ * __stddef_null.h so that everything works as expected.
+ */
 #define __need_NULL
+#endif
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
     defined(__cplusplus)
 #define __need_nullptr_t
@@ -65,6 +81,7 @@
 /* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
  * for compatibility, but must be explicitly requested. Therefore
  * __need_wint_t is intentionally not defined here. */
+#include <__stddef_header_macro.h>
 #endif
 
 #if defined(__need_ptrdiff_t)
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index bf8327b692d1c..bd832ce8dddfd 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -19,11 +19,13 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,ssse3,no-evex512"),                           \
-                 __min_vector_width__(64)))
+                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
@@ -37,10 +39,10 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+  return (__m64)__builtin_elementwise_abs((__v8qs)__a);
 }
 
 /// Computes the absolute value of each of the packed 8-bit signed
@@ -73,10 +75,10 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -109,10 +111,10 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+    return (__m64)__builtin_elementwise_abs((__v2si)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -177,7 +179,10 @@ _mm_abs_epi32(__m128i __a)
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
+  ((__m64)__builtin_shufflevector(                                       \
+       __builtin_ia32_psrldqi128_byteshift(                              \
+           __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
+           (n)), __extension__ (__v2di){}, 0))
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
@@ -242,10 +247,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phaddw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -265,10 +271,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_phaddd128(
+        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -317,10 +324,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phaddsw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -386,10 +394,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phsubw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -409,10 +418,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_phsubd128(
+        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -461,10 +471,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phsubsw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -525,10 +536,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                                 (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -565,10 +577,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
+                                                (__v8hi)__anyext128(__b)));
 }
 
 /// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -614,12 +627,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    1: Clear the corresponding byte in the destination. \n
 ///    0: Copy the selected source byte to the corresponding byte in the
 ///    destination. \n
-///    Bits [3:0] select the source byte to be copied.
+///    Bits [2:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_pshufb128(
+        (__v16qi)__builtin_shufflevector(
+            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
+        (__v16qi)__anyext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
@@ -720,10 +736,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
+                                              (__v16qi)__anyext128(__b)));
 }
 
 /// For each 16-bit integer in the first source operand, perform one of
@@ -746,10 +763,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
+                                              (__v8hi)__anyext128(__b)));
 }
 
 /// For each 32-bit integer in the first source operand, perform one of
@@ -772,13 +790,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
+                                              (__v4si)__anyext128(__b)));
 }
 
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #endif /* __TMMINTRIN_H */
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 1ef89de9c9f56..5b9e90e8f061c 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -35,9 +35,21 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                  __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
+#define __DEFAULT_FN_ATTRS_SSE2                                                \
   __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x)                                                           \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, 2, 3)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
+#define __zeroupper64(x)                                                       \
+  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
+                                    1, 4, 5)
 
 /// Adds the 32-bit float values in the low-order bits of the operands.
 ///
@@ -1448,10 +1460,10 @@ _mm_cvtss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts two low-order float values in a 128-bit vector of
@@ -1468,7 +1480,7 @@ _mm_cvtps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
@@ -1558,10 +1570,10 @@ _mm_cvttss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvttps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
@@ -1579,7 +1591,7 @@ _mm_cvttps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
@@ -1674,10 +1686,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
-  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+  return (__m128)__builtin_shufflevector(
+      (__v4sf)__a,
+      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
+      4, 5, 2, 3);
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -1697,7 +1712,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
@@ -2231,10 +2246,10 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    A pointer to an aligned memory location used to store the register value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(void *__p, __m64 __a)
 {
-  __builtin_ia32_movntq((__m64 *)__p, __a);
+  __builtin_nontemporal_store(__a, (__m64 *)__p);
 }
 
 /// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2296,7 +2311,7 @@ void _mm_sfence(void);
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
@@ -2342,10 +2357,10 @@ void _mm_sfence(void);
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2361,10 +2376,10 @@ _mm_max_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Compares each of the corresponding packed 16-bit integer values of
@@ -2380,10 +2395,10 @@ _mm_max_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2399,10 +2414,10 @@ _mm_min_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Takes the most significant bit from each 8-bit element in a 64-bit
@@ -2417,10 +2432,10 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing the values with bits to be extracted.
 /// \returns The most significant bit from each 8-bit element in \a __a,
 ///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_movemask_pi8(__m64 __a)
 {
-  return __builtin_ia32_pmovmskb((__v8qi)__a);
+  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
 }
 
 /// Multiplies packed 16-bit unsigned integer values and writes the
@@ -2436,10 +2451,11 @@ _mm_movemask_pi8(__m64 __a)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
+                                             (__v8hi)__anyext128(__b)));
 }
 
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@@ -2476,8 +2492,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
 ///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) \
-  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
+#define _mm_shuffle_pi16(a, n)                                                 \
+  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
+                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
+                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
 
 /// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
@@ -2502,10 +2520,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    A pointer to a 64-bit memory location that will receive the conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
-  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+  // This is complex, because we need to support the case where __p is pointing
+  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
+  // write might cause a trap where a 64-bit maskmovq would not. (Memory
+  // locations not selected by the mask bits might still cause traps.)
+  __m128i __d128  = __anyext128(__d);
+  __m128i __n128  = __zext128(__n);
+  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
+      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
+    // If there's a risk of spurious trap due to a 128-bit write, back up the
+    // pointer by 8 bytes and shift values in registers to match.
+    __p -= 8;
+    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
+    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
+  }
+
+  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
 }
 
 /// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -2521,10 +2554,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
+                                           (__v16qi)__anyext128(__b)));
 }
 
 /// Computes the rounded averages of the packed unsigned 16-bit integer
@@ -2540,10 +2574,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
+                                           (__v8hi)__anyext128(__b)));
 }
 
 /// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -2562,10 +2597,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
+                                            (__v16qi)__zext128(__b)));
 }
 
 #if defined(__cplusplus)
@@ -2846,22 +2882,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi16(__b, __a);
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hi)__a, __v4sf);
 }
 
 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
@@ -2876,21 +2900,10 @@ _mm_cvtpi16_ps(__m64 __a)
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hu)__a, __v4sf);
 }
 
 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@@ -2905,16 +2918,12 @@ _mm_cvtpu16_ps(__m64 __a)
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi8(__b, __a);
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
@@ -2930,15 +2939,12 @@ _mm_cvtpi8_ps(__m64 __a)
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the two 32-bit signed integer values from each 64-bit vector
@@ -2957,16 +2963,12 @@ _mm_cvtpu8_ps(__m64 __a)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
-  __m128 __c;
-
-  __c = _mm_setzero_ps();
-  __c = _mm_cvtpi32_ps(__c, __b);
-  __c = _mm_movelh_ps(__c, __c);
-
-  return _mm_cvtpi32_ps(__c, __a);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -2986,16 +2988,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi16(__m128 __a)
 {
-  __m64 __b, __c;
-
-  __b = _mm_cvtps_pi32(__a);
-  __a = _mm_movehl_ps(__a, __a);
-  __c = _mm_cvtps_pi32(__a);
-
-  return _mm_packs_pi32(__b, __c);
+  return __trunc64(__builtin_ia32_packssdw128(
+      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -3016,7 +3013,7 @@ _mm_cvtps_pi16(__m128 __a)
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the uppper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
@@ -3196,8 +3193,12 @@ do { \
 #define _m_psadbw _mm_sad_pu8
 #define _m_ _mm_
 
+#undef __trunc64
+#undef __zext128
+#undef __anyext128
+#undef __zeroupper64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
diff --git a/clang/lib/InstallAPI/DirectoryScanner.cpp b/clang/lib/InstallAPI/DirectoryScanner.cpp
index 8984758e7b446..03a8208c7364e 100644
--- a/clang/lib/InstallAPI/DirectoryScanner.cpp
+++ b/clang/lib/InstallAPI/DirectoryScanner.cpp
@@ -130,7 +130,8 @@ Error DirectoryScanner::scanHeaders(StringRef Path, Library &Lib,
   if (ParentPath.empty())
     ParentPath = Path;
   for (const StringRef Dir : SubDirectories)
-    return scanHeaders(Dir, Lib, Type, BasePath, ParentPath);
+    if (Error Err = scanHeaders(Dir, Lib, Type, BasePath, ParentPath))
+      return Err;
 
   return Error::success();
 }
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index b4882ab5d2236..7209a33272ef2 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -26,7 +26,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CodeGenAction.h"
 #include "clang/CodeGen/ModuleBuilder.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
+#include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
@@ -38,6 +38,7 @@
 #include "clang/Interpreter/Value.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/Orc/LLJIT.h"
 #include "llvm/IR/Module.h"
diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h
index 8df158c17d491..c7b405b486d93 100644
--- a/clang/lib/Interpreter/InterpreterUtils.h
+++ b/clang/lib/Interpreter/InterpreterUtils.h
@@ -18,7 +18,6 @@
 #include "clang/AST/TypeVisitor.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/ModuleBuilder.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 57652be8244b4..088d1cc96e3a2 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -88,8 +88,8 @@ struct Scanner {
   [[nodiscard]] dependency_directives_scan::Token &
   lexToken(const char *&First, const char *const End);
 
-  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
-                                                        const char *const End);
+  [[nodiscard]] dependency_directives_scan::Token &
+  lexIncludeFilename(const char *&First, const char *const End);
 
   void skipLine(const char *&First, const char *const End);
   void skipDirective(StringRef Name, const char *&First, const char *const End);
@@ -544,7 +544,7 @@ Scanner::lexIncludeFilename(const char *&First, const char *const End) {
 void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
   while (true) {
     const dependency_directives_scan::Token &Tok = lexToken(First, End);
-    if (Tok.is(tok::eod))
+    if (Tok.is(tok::eod) || Tok.is(tok::eof))
       break;
   }
 }
@@ -660,7 +660,18 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
   // an import.
 
   switch (*First) {
-  case ':':
+  case ':': {
+    // `module :` is never the start of a valid module declaration.
+    if (Id == "module") {
+      skipLine(First, End);
+      return false;
+    }
+    // `import:(type)name` is a valid ObjC method decl, so check one more token.
+    (void)lexToken(First, End);
+    if (!tryLexIdentifierOrSkipLine(First, End))
+      return false;
+    break;
+  }
   case '<':
   case '"':
     break;
@@ -901,7 +912,10 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
   case pp___include_macros:
   case pp_include_next:
   case pp_import:
-    lexIncludeFilename(First, End);
+    // Ignore missing filenames in include or import directives.
+    if (lexIncludeFilename(First, End).is(tok::eod)) {
+      return false;
+    }
     break;
   default:
     break;
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index c3b3064cfbf28..d2210e7e18628 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -378,20 +378,22 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
         break;
     }
 
-    // If we've already performed the exhaustive search for module maps in this
-    // search directory, don't do it again.
-    if (Dir.haveSearchedAllModuleMaps())
-      continue;
+    if (HSOpts->AllowModuleMapSubdirectorySearch) {
+      // If we've already performed the exhaustive search for module maps in
+      // this search directory, don't do it again.
+      if (Dir.haveSearchedAllModuleMaps())
+        continue;
 
-    // Load all module maps in the immediate subdirectories of this search
-    // directory if ModuleName was from @import.
-    if (AllowExtraModuleMapSearch)
-      loadSubdirectoryModuleMaps(Dir);
+      // Load all module maps in the immediate subdirectories of this search
+      // directory if ModuleName was from @import.
+      if (AllowExtraModuleMapSearch)
+        loadSubdirectoryModuleMaps(Dir);
 
-    // Look again for the module.
-    Module = ModMap.findModule(ModuleName);
-    if (Module)
-      break;
+      // Look again for the module.
+      Module = ModMap.findModule(ModuleName);
+      if (Module)
+        break;
+    }
   }
 
   return Module;
diff --git a/clang/lib/Lex/PPCaching.cpp b/clang/lib/Lex/PPCaching.cpp
index f38ff62ebf437..cbacda9d31ae2 100644
--- a/clang/lib/Lex/PPCaching.cpp
+++ b/clang/lib/Lex/PPCaching.cpp
@@ -14,6 +14,15 @@
 #include "clang/Lex/Preprocessor.h"
 using namespace clang;
 
+std::pair<Preprocessor::CachedTokensTy::size_type, bool>
+Preprocessor::LastBacktrackPos() {
+  assert(isBacktrackEnabled());
+  auto BacktrackPos = BacktrackPositions.back();
+  bool Unannotated =
+      static_cast<CachedTokensTy::difference_type>(BacktrackPos) < 0;
+  return {Unannotated ? ~BacktrackPos : BacktrackPos, Unannotated};
+}
+
 // EnableBacktrackAtThisPos - From the point that this method is called, and
 // until CommitBacktrackedTokens() or Backtrack() is called, the Preprocessor
 // keeps track of the lexed tokens so that a subsequent Backtrack() call will
@@ -22,26 +31,45 @@ using namespace clang;
 // Nested backtracks are allowed, meaning that EnableBacktrackAtThisPos can
 // be called multiple times and CommitBacktrackedTokens/Backtrack calls will
 // be combined with the EnableBacktrackAtThisPos calls in reverse order.
-void Preprocessor::EnableBacktrackAtThisPos() {
+void Preprocessor::EnableBacktrackAtThisPos(bool Unannotated) {
   assert(LexLevel == 0 && "cannot use lookahead while lexing");
-  BacktrackPositions.push_back(CachedLexPos);
+  BacktrackPositions.push_back(Unannotated ? ~CachedLexPos : CachedLexPos);
+  if (Unannotated)
+    UnannotatedBacktrackTokens.emplace_back(CachedTokens, CachedTokens.size());
   EnterCachingLexMode();
 }
 
+Preprocessor::CachedTokensTy Preprocessor::PopUnannotatedBacktrackTokens() {
+  assert(isUnannotatedBacktrackEnabled() && "missing unannotated tokens?");
+  auto [UnannotatedTokens, NumCachedToks] =
+      std::move(UnannotatedBacktrackTokens.back());
+  UnannotatedBacktrackTokens.pop_back();
+  // If another unannotated backtrack is active, propagate any tokens that were
+  // lexed (not cached) since EnableBacktrackAtThisPos was last called.
+  if (isUnannotatedBacktrackEnabled())
+    UnannotatedBacktrackTokens.back().first.append(
+        UnannotatedTokens.begin() + NumCachedToks, UnannotatedTokens.end());
+  return std::move(UnannotatedTokens);
+}
+
 // Disable the last EnableBacktrackAtThisPos call.
 void Preprocessor::CommitBacktrackedTokens() {
-  assert(!BacktrackPositions.empty()
-         && "EnableBacktrackAtThisPos was not called!");
+  assert(isBacktrackEnabled() && "EnableBacktrackAtThisPos was not called!");
+  auto [BacktrackPos, Unannotated] = LastBacktrackPos();
   BacktrackPositions.pop_back();
+  if (Unannotated)
+    PopUnannotatedBacktrackTokens();
 }
 
 // Make Preprocessor re-lex the tokens that were lexed since
 // EnableBacktrackAtThisPos() was previously called.
 void Preprocessor::Backtrack() {
-  assert(!BacktrackPositions.empty()
-         && "EnableBacktrackAtThisPos was not called!");
-  CachedLexPos = BacktrackPositions.back();
+  assert(isBacktrackEnabled() && "EnableBacktrackAtThisPos was not called!");
+  auto [BacktrackPos, Unannotated] = LastBacktrackPos();
   BacktrackPositions.pop_back();
+  CachedLexPos = BacktrackPos;
+  if (Unannotated)
+    CachedTokens = PopUnannotatedBacktrackTokens();
   recomputeCurLexerKind();
 }
 
@@ -67,6 +95,8 @@ void Preprocessor::CachingLex(Token &Result) {
     EnterCachingLexModeUnchecked();
     CachedTokens.push_back(Result);
     ++CachedLexPos;
+    if (isUnannotatedBacktrackEnabled())
+      UnannotatedBacktrackTokens.back().first.push_back(Result);
     return;
   }
 
@@ -108,6 +138,8 @@ const Token &Preprocessor::PeekAhead(unsigned N) {
   for (size_t C = CachedLexPos + N - CachedTokens.size(); C > 0; --C) {
     CachedTokens.push_back(Token());
     Lex(CachedTokens.back());
+    if (isUnannotatedBacktrackEnabled())
+      UnannotatedBacktrackTokens.back().first.push_back(CachedTokens.back());
   }
   EnterCachingLexMode();
   return CachedTokens.back();
@@ -124,7 +156,7 @@ void Preprocessor::AnnotatePreviousCachedTokens(const Token &Tok) {
   for (CachedTokensTy::size_type i = CachedLexPos; i != 0; --i) {
     CachedTokensTy::iterator AnnotBegin = CachedTokens.begin() + i-1;
     if (AnnotBegin->getLocation() == Tok.getLocation()) {
-      assert((BacktrackPositions.empty() || BacktrackPositions.back() <= i) &&
+      assert((!isBacktrackEnabled() || LastBacktrackPos().first <= i) &&
              "The backtrack pos points inside the annotated tokens!");
       // Replace the cached tokens with the single annotation token.
       if (i < CachedLexPos)
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index a53540b12dee6..4e77df9ec444c 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1137,7 +1137,9 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile,
     SeparateComponents(LookupPath, Entry, Filename, false);
     llvm::Expected<FileEntryRef> ShouldBeEntry =
         FM.getFileRef(LookupPath, OpenFile);
-    return llvm::expectedToOptional(std::move(ShouldBeEntry));
+    if (ShouldBeEntry)
+      return llvm::expectedToOptional(std::move(ShouldBeEntry));
+    llvm::consumeError(ShouldBeEntry.takeError());
   }
   return std::nullopt;
 }
@@ -3624,12 +3626,10 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) {
   LexEmbedParametersResult Result{};
   SmallVector<Token, 2> ParameterTokens;
   tok::TokenKind EndTokenKind = ForHasEmbed ? tok::r_paren : tok::eod;
-  Result.ParamRange = {CurTok.getLocation(), CurTok.getLocation()};
 
   auto DiagMismatchedBracesAndSkipToEOD =
       [&](tok::TokenKind Expected,
           std::pair<tok::TokenKind, SourceLocation> Matches) {
-        Result.ParamRange.setEnd(CurTok.getEndLoc());
         Diag(CurTok, diag::err_expected) << Expected;
         Diag(Matches.second, diag::note_matching) << Matches.first;
         if (CurTok.isNot(tok::eod))
@@ -3638,7 +3638,6 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) {
 
   auto ExpectOrDiagAndSkipToEOD = [&](tok::TokenKind Kind) {
     if (CurTok.isNot(Kind)) {
-      Result.ParamRange.setEnd(CurTok.getEndLoc());
       Diag(CurTok, diag::err_expected) << Kind;
       if (CurTok.isNot(tok::eod))
         DiscardUntilEndOfDirective(CurTok);
@@ -3872,7 +3871,6 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) {
       }
     }
   }
-  Result.ParamRange.setEnd(CurTok.getLocation());
   return Result;
 }
 
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 2ccd42d88b198..366212cb786fb 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -52,7 +52,9 @@
 #include <cstddef>
 #include <cstring>
 #include <ctime>
+#include <iomanip>
 #include <optional>
+#include <sstream>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -1602,6 +1604,16 @@ static bool isTargetVariantEnvironment(const TargetInfo &TI,
   return false;
 }
 
+#if defined(__sun__) && defined(__svr4__)
+// GCC mangles std::tm as tm for binary compatibility on Solaris (Issue
+// #33114).  We need to match this to allow the std::put_time calls to link
+// (PR #99075).
+asm("_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+    "RSt8ios_basecPKSt2tmPKcSB_ = "
+    "_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+    "RSt8ios_basecPK2tmPKcSB_");
+#endif
+
 /// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
 /// as a builtin macro, handle it and return the next token as 'Tok'.
 void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
@@ -1721,11 +1733,13 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
     Diag(Tok.getLocation(), diag::warn_pp_date_time);
     // MSVC, ICC, GCC, VisualAge C++ extension.  The generated string should be
     // of the form "Ddd Mmm dd hh::mm::ss yyyy", which is returned by asctime.
-    const char *Result;
+    std::string Result;
+    std::stringstream TmpStream;
+    TmpStream.imbue(std::locale("C"));
     if (getPreprocessorOpts().SourceDateEpoch) {
       time_t TT = *getPreprocessorOpts().SourceDateEpoch;
       std::tm *TM = std::gmtime(&TT);
-      Result = asctime(TM);
+      TmpStream << std::put_time(TM, "%a %b %e %T %Y");
     } else {
       // Get the file that we are lexing out of.  If we're currently lexing from
       // a macro, dig into the include stack.
@@ -1735,13 +1749,13 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
       if (CurFile) {
         time_t TT = CurFile->getModificationTime();
         struct tm *TM = localtime(&TT);
-        Result = asctime(TM);
-      } else {
-        Result = "??? ??? ?? ??:??:?? ????\n";
+        TmpStream << std::put_time(TM, "%a %b %e %T %Y");
       }
     }
-    // Surround the string with " and strip the trailing newline.
-    OS << '"' << StringRef(Result).drop_back() << '"';
+    Result = TmpStream.str();
+    if (Result.empty())
+      Result = "??? ??? ?? ??:??:?? ????";
+    OS << '"' << Result << '"';
     Tok.setKind(tok::string_literal);
   } else if (II == Ident__FLT_EVAL_METHOD__) {
     // __FLT_EVAL_METHOD__ is set to the default value.
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index e04ff58ae8c9d..3a8bbb72591cd 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -170,7 +170,7 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
 }
 
 Preprocessor::~Preprocessor() {
-  assert(BacktrackPositions.empty() && "EnableBacktrack/Backtrack imbalance!");
+  assert(!isBacktrackEnabled() && "EnableBacktrack/Backtrack imbalance!");
 
   IncludeMacroStack.clear();
 
@@ -988,7 +988,7 @@ void Preprocessor::LexTokensUntilEOF(std::vector<Token> *Tokens) {
 }
 
 /// Lex a header-name token (including one formed from header-name-tokens if
-/// \p AllowConcatenation is \c true).
+/// \p AllowMacroExpansion is \c true).
 ///
 /// \param FilenameTok Filled in with the next token. On success, this will
 ///        be either a header_name token. On failure, it will be whatever other
diff --git a/clang/lib/Parse/ParseAST.cpp b/clang/lib/Parse/ParseAST.cpp
index dc8df8e8f3f03..910bbac0c5920 100644
--- a/clang/lib/Parse/ParseAST.cpp
+++ b/clang/lib/Parse/ParseAST.cpp
@@ -153,7 +153,15 @@ void clang::ParseAST(Sema &S, bool PrintStats, bool SkipFunctionBodies) {
   bool HaveLexer = S.getPreprocessor().getCurrentLexer();
 
   if (HaveLexer) {
-    llvm::TimeTraceScope TimeScope("Frontend");
+    llvm::TimeTraceScope TimeScope("Frontend", [&]() {
+      llvm::TimeTraceMetadata M;
+      if (llvm::isTimeTraceVerbose()) {
+        const SourceManager &SM = S.getSourceManager();
+        if (const auto *FE = SM.getFileEntryForID(SM.getMainFileID()))
+          M.File = FE->tryGetRealPathName();
+      }
+      return M;
+    });
     P.Initialize();
     Parser::DeclGroupPtrTy ADecl;
     Sema::ModuleImportState ImportState;
diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp
index 9ccbbf9a7d5d0..b461743833c82 100644
--- a/clang/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp
@@ -1205,41 +1205,6 @@ bool Parser::ConsumeAndStoreConditional(CachedTokens &Toks) {
   return true;
 }
 
-/// A tentative parsing action that can also revert token annotations.
-class Parser::UnannotatedTentativeParsingAction : public TentativeParsingAction {
-public:
-  explicit UnannotatedTentativeParsingAction(Parser &Self,
-                                             tok::TokenKind EndKind)
-      : TentativeParsingAction(Self), Self(Self), EndKind(EndKind) {
-    // Stash away the old token stream, so we can restore it once the
-    // tentative parse is complete.
-    TentativeParsingAction Inner(Self);
-    Self.ConsumeAndStoreUntil(EndKind, Toks, true, /*ConsumeFinalToken*/false);
-    Inner.Revert();
-  }
-
-  void RevertAnnotations() {
-    Revert();
-
-    // Put back the original tokens.
-    Self.SkipUntil(EndKind, StopAtSemi | StopBeforeMatch);
-    if (Toks.size()) {
-      auto Buffer = std::make_unique<Token[]>(Toks.size());
-      std::copy(Toks.begin() + 1, Toks.end(), Buffer.get());
-      Buffer[Toks.size() - 1] = Self.Tok;
-      Self.PP.EnterTokenStream(std::move(Buffer), Toks.size(), true,
-                               /*IsReinject*/ true);
-
-      Self.Tok = Toks.front();
-    }
-  }
-
-private:
-  Parser &Self;
-  CachedTokens Toks;
-  tok::TokenKind EndKind;
-};
-
 /// ConsumeAndStoreInitializer - Consume and store the token at the passed token
 /// container until the end of the current initializer expression (either a
 /// default argument or an in-class initializer for a non-static data member).
@@ -1277,9 +1242,7 @@ bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks,
       //    syntactically-valid init-declarator-list, then this comma ends
       //    the default initializer.
       {
-        UnannotatedTentativeParsingAction PA(*this,
-                                             CIK == CIK_DefaultInitializer
-                                               ? tok::semi : tok::r_paren);
+        TentativeParsingAction TPA(*this, /*Unannotated=*/true);
         Sema::TentativeAnalysisScope Scope(Actions);
 
         TPResult Result = TPResult::Error;
@@ -1307,7 +1270,7 @@ bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks,
         // Put the token stream back and undo any annotations we performed
         // after the comma. They may reflect a different parse than the one
         // we will actually perform at the end of the class.
-        PA.RevertAnnotations();
+        TPA.Revert();
 
         // If what follows could be a declaration, it is a declaration.
         if (Result != TPResult::False && Result != TPResult::Error)
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 941009abebc43..e37c820d1b23c 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -314,64 +314,92 @@ void Parser::ParseGNUAttributes(ParsedAttributes &Attrs,
 }
 
 /// Determine whether the given attribute has an identifier argument.
-static bool attributeHasIdentifierArg(const IdentifierInfo &II) {
+static bool attributeHasIdentifierArg(const IdentifierInfo &II,
+                                      ParsedAttr::Syntax Syntax,
+                                      IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_IDENTIFIER_ARG_LIST
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
-           .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_IDENTIFIER_ARG_LIST
 }
 
 /// Determine whether the given attribute has an identifier argument.
 static ParsedAttributeArgumentsProperties
-attributeStringLiteralListArg(const llvm::Triple &T, const IdentifierInfo &II) {
+attributeStringLiteralListArg(const llvm::Triple &T, const IdentifierInfo &II,
+                              ParsedAttr::Syntax Syntax,
+                              IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_STRING_LITERAL_ARG_LIST
-  return llvm::StringSwitch<uint32_t>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<uint32_t>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
       .Default(0);
 #undef CLANG_ATTR_STRING_LITERAL_ARG_LIST
 }
 
 /// Determine whether the given attribute has a variadic identifier argument.
-static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II) {
+static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II,
+                                              ParsedAttr::Syntax Syntax,
+                                              IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
-           .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
 }
 
 /// Determine whether the given attribute treats kw_this as an identifier.
-static bool attributeTreatsKeywordThisAsIdentifier(const IdentifierInfo &II) {
+static bool attributeTreatsKeywordThisAsIdentifier(const IdentifierInfo &II,
+                                                   ParsedAttr::Syntax Syntax,
+                                                   IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
-           .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
 }
 
 /// Determine if an attribute accepts parameter packs.
-static bool attributeAcceptsExprPack(const IdentifierInfo &II) {
+static bool attributeAcceptsExprPack(const IdentifierInfo &II,
+                                     ParsedAttr::Syntax Syntax,
+                                     IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_ACCEPTS_EXPR_PACK
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
       .Default(false);
 #undef CLANG_ATTR_ACCEPTS_EXPR_PACK
 }
 
 /// Determine whether the given attribute parses a type argument.
-static bool attributeIsTypeArgAttr(const IdentifierInfo &II) {
+static bool attributeIsTypeArgAttr(const IdentifierInfo &II,
+                                   ParsedAttr::Syntax Syntax,
+                                   IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_TYPE_ARG_LIST
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
-           .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_TYPE_ARG_LIST
 }
 
 /// Determine whether the given attribute takes identifier arguments.
-static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II) {
+static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II,
+                                             ParsedAttr::Syntax Syntax,
+                                             IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
-  return (llvm::StringSwitch<uint64_t>(normalizeAttrName(II.getName()))
+  return (llvm::StringSwitch<uint64_t>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
               .Default(0)) != 0;
 #undef CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
@@ -380,9 +408,13 @@ static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II) {
 /// Determine whether the given attribute takes an identifier argument at a
 /// specific index
 static bool attributeHasStrictIdentifierArgAtIndex(const IdentifierInfo &II,
+                                                   ParsedAttr::Syntax Syntax,
+                                                   IdentifierInfo *ScopeName,
                                                    size_t argIndex) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
-  return (llvm::StringSwitch<uint64_t>(normalizeAttrName(II.getName()))
+  return (llvm::StringSwitch<uint64_t>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
               .Default(0)) &
          (1ull << argIndex);
@@ -391,11 +423,15 @@ static bool attributeHasStrictIdentifierArgAtIndex(const IdentifierInfo &II,
 
 /// Determine whether the given attribute requires parsing its arguments
 /// in an unevaluated context or not.
-static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II) {
+static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II,
+                                           ParsedAttr::Syntax Syntax,
+                                           IdentifierInfo *ScopeName) {
+  std::string FullName =
+      AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
 #define CLANG_ATTR_ARG_CONTEXT_LIST
-  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(FullName)
 #include "clang/Parse/AttrParserStringSwitches.inc"
-           .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_ARG_CONTEXT_LIST
 }
 
@@ -523,10 +559,12 @@ unsigned Parser::ParseAttributeArgsCommon(
   // Ignore the left paren location for now.
   ConsumeParen();
 
-  bool ChangeKWThisToIdent = attributeTreatsKeywordThisAsIdentifier(*AttrName);
-  bool AttributeIsTypeArgAttr = attributeIsTypeArgAttr(*AttrName);
+  bool ChangeKWThisToIdent = attributeTreatsKeywordThisAsIdentifier(
+      *AttrName, Form.getSyntax(), ScopeName);
+  bool AttributeIsTypeArgAttr =
+      attributeIsTypeArgAttr(*AttrName, Form.getSyntax(), ScopeName);
   bool AttributeHasVariadicIdentifierArg =
-      attributeHasVariadicIdentifierArg(*AttrName);
+      attributeHasVariadicIdentifierArg(*AttrName, Form.getSyntax(), ScopeName);
 
   // Interpret "kw_this" as an identifier if the attributed requests it.
   if (ChangeKWThisToIdent && Tok.is(tok::kw_this))
@@ -535,8 +573,9 @@ unsigned Parser::ParseAttributeArgsCommon(
   ArgsVector ArgExprs;
   if (Tok.is(tok::identifier)) {
     // If this attribute wants an 'identifier' argument, make it so.
-    bool IsIdentifierArg = AttributeHasVariadicIdentifierArg ||
-                           attributeHasIdentifierArg(*AttrName);
+    bool IsIdentifierArg =
+        AttributeHasVariadicIdentifierArg ||
+        attributeHasIdentifierArg(*AttrName, Form.getSyntax(), ScopeName);
     ParsedAttr::Kind AttrKind =
         ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax());
 
@@ -568,7 +607,8 @@ unsigned Parser::ParseAttributeArgsCommon(
       if (T.isUsable())
         TheParsedType = T.get();
     } else if (AttributeHasVariadicIdentifierArg ||
-               attributeHasStrictIdentifierArgs(*AttrName)) {
+               attributeHasStrictIdentifierArgs(*AttrName, Form.getSyntax(),
+                                                ScopeName)) {
       // Parse variadic identifier arg. This can either consume identifiers or
       // expressions. Variadic identifier args do not support parameter packs
       // because those are typically used for attributes with enumeration
@@ -579,8 +619,9 @@ unsigned Parser::ParseAttributeArgsCommon(
         if (ChangeKWThisToIdent && Tok.is(tok::kw_this))
           Tok.setKind(tok::identifier);
 
-        if (Tok.is(tok::identifier) && attributeHasStrictIdentifierArgAtIndex(
-                                           *AttrName, ArgExprs.size())) {
+        if (Tok.is(tok::identifier) &&
+            attributeHasStrictIdentifierArgAtIndex(
+                *AttrName, Form.getSyntax(), ScopeName, ArgExprs.size())) {
           ArgExprs.push_back(ParseIdentifierLoc());
           continue;
         }
@@ -589,7 +630,8 @@ unsigned Parser::ParseAttributeArgsCommon(
         if (Tok.is(tok::identifier)) {
           ArgExprs.push_back(ParseIdentifierLoc());
         } else {
-          bool Uneval = attributeParsedArgsUnevaluated(*AttrName);
+          bool Uneval = attributeParsedArgsUnevaluated(
+              *AttrName, Form.getSyntax(), ScopeName);
           EnterExpressionEvaluationContext Unevaluated(
               Actions,
               Uneval ? Sema::ExpressionEvaluationContext::Unevaluated
@@ -610,7 +652,8 @@ unsigned Parser::ParseAttributeArgsCommon(
       } while (TryConsumeToken(tok::comma));
     } else {
       // General case. Parse all available expressions.
-      bool Uneval = attributeParsedArgsUnevaluated(*AttrName);
+      bool Uneval = attributeParsedArgsUnevaluated(*AttrName, Form.getSyntax(),
+                                                   ScopeName);
       EnterExpressionEvaluationContext Unevaluated(
           Actions,
           Uneval ? Sema::ExpressionEvaluationContext::Unevaluated
@@ -621,7 +664,8 @@ unsigned Parser::ParseAttributeArgsCommon(
 
       ExprVector ParsedExprs;
       ParsedAttributeArgumentsProperties ArgProperties =
-          attributeStringLiteralListArg(getTargetInfo().getTriple(), *AttrName);
+          attributeStringLiteralListArg(getTargetInfo().getTriple(), *AttrName,
+                                        Form.getSyntax(), ScopeName);
       if (ParseAttributeArgumentList(*AttrName, ParsedExprs, ArgProperties)) {
         SkipUntil(tok::r_paren, StopAtSemi);
         return 0;
@@ -632,7 +676,7 @@ unsigned Parser::ParseAttributeArgsCommon(
         if (!isa<PackExpansionExpr>(ParsedExprs[I]))
           continue;
 
-        if (!attributeAcceptsExprPack(*AttrName)) {
+        if (!attributeAcceptsExprPack(*AttrName, Form.getSyntax(), ScopeName)) {
           Diag(Tok.getLocation(),
                diag::err_attribute_argument_parm_pack_not_supported)
               << AttrName;
@@ -696,7 +740,7 @@ void Parser::ParseGNUAttributeArgs(
     ParseTypeTagForDatatypeAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc,
                                      ScopeName, ScopeLoc, Form);
     return;
-  } else if (attributeIsTypeArgAttr(*AttrName)) {
+  } else if (attributeIsTypeArgAttr(*AttrName, Form.getSyntax(), ScopeName)) {
     ParseAttributeWithTypeArg(*AttrName, AttrNameLoc, Attrs, ScopeName,
                               ScopeLoc, Form);
     return;
@@ -6651,48 +6695,66 @@ void Parser::ParseDeclaratorInternal(Declarator &D,
        (Tok.is(tok::identifier) &&
         (NextToken().is(tok::coloncolon) || NextToken().is(tok::less))) ||
        Tok.is(tok::annot_cxxscope))) {
+    TentativeParsingAction TPA(*this, /*Unannotated=*/true);
     bool EnteringContext = D.getContext() == DeclaratorContext::File ||
                            D.getContext() == DeclaratorContext::Member;
     CXXScopeSpec SS;
     SS.setTemplateParamLists(D.getTemplateParameterLists());
-    ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
-                                   /*ObjectHasErrors=*/false, EnteringContext);
 
-    if (SS.isNotEmpty()) {
-      if (Tok.isNot(tok::star)) {
-        // The scope spec really belongs to the direct-declarator.
-        if (D.mayHaveIdentifier())
-          D.getCXXScopeSpec() = SS;
-        else
-          AnnotateScopeToken(SS, true);
+    if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
+                                       /*ObjectHasErrors=*/false,
+                                       /*EnteringContext=*/false,
+                                       /*MayBePseudoDestructor=*/nullptr,
+                                       /*IsTypename=*/false, /*LastII=*/nullptr,
+                                       /*OnlyNamespace=*/false,
+                                       /*InUsingDeclaration=*/false,
+                                       /*Disambiguation=*/EnteringContext) ||
+
+        SS.isEmpty() || SS.isInvalid() || !EnteringContext ||
+        Tok.is(tok::star)) {
+      TPA.Commit();
+      if (SS.isNotEmpty() && Tok.is(tok::star)) {
+        if (SS.isValid()) {
+          checkCompoundToken(SS.getEndLoc(), tok::coloncolon,
+                             CompoundToken::MemberPtr);
+        }
 
-        if (DirectDeclParser)
-          (this->*DirectDeclParser)(D);
+        SourceLocation StarLoc = ConsumeToken();
+        D.SetRangeEnd(StarLoc);
+        DeclSpec DS(AttrFactory);
+        ParseTypeQualifierListOpt(DS);
+        D.ExtendWithDeclSpec(DS);
+
+        // Recurse to parse whatever is left.
+        Actions.runWithSufficientStackSpace(D.getBeginLoc(), [&] {
+          ParseDeclaratorInternal(D, DirectDeclParser);
+        });
+
+        // Sema will have to catch (syntactically invalid) pointers into global
+        // scope. It has to catch pointers into namespace scope anyway.
+        D.AddTypeInfo(DeclaratorChunk::getMemberPointer(
+                          SS, DS.getTypeQualifiers(), StarLoc, DS.getEndLoc()),
+                      std::move(DS.getAttributes()),
+                      /*EndLoc=*/SourceLocation());
         return;
       }
+    } else {
+      TPA.Revert();
+      SS.clear();
+      ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
+                                     /*ObjectHasErrors=*/false,
+                                     /*EnteringContext=*/true);
+    }
 
-      if (SS.isValid()) {
-        checkCompoundToken(SS.getEndLoc(), tok::coloncolon,
-                           CompoundToken::MemberPtr);
-      }
+    if (SS.isNotEmpty()) {
+      // The scope spec really belongs to the direct-declarator.
+      if (D.mayHaveIdentifier())
+        D.getCXXScopeSpec() = SS;
+      else
+        AnnotateScopeToken(SS, true);
 
-      SourceLocation StarLoc = ConsumeToken();
-      D.SetRangeEnd(StarLoc);
-      DeclSpec DS(AttrFactory);
-      ParseTypeQualifierListOpt(DS);
-      D.ExtendWithDeclSpec(DS);
-
-      // Recurse to parse whatever is left.
-      Actions.runWithSufficientStackSpace(D.getBeginLoc(), [&] {
-        ParseDeclaratorInternal(D, DirectDeclParser);
-      });
-
-      // Sema will have to catch (syntactically invalid) pointers into global
-      // scope. It has to catch pointers into namespace scope anyway.
-      D.AddTypeInfo(DeclaratorChunk::getMemberPointer(
-                        SS, DS.getTypeQualifiers(), StarLoc, DS.getEndLoc()),
-                    std::move(DS.getAttributes()),
-                    /* Don't replace range end. */ SourceLocation());
+      if (DirectDeclParser)
+        (this->*DirectDeclParser)(D);
       return;
     }
   }
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index ec8d5fd0feb95..7372a2e3b4084 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -763,6 +763,9 @@ class CastExpressionIdValidator final : public CorrectionCandidateCallback {
 bool Parser::isRevertibleTypeTrait(const IdentifierInfo *II,
                                    tok::TokenKind *Kind) {
   if (RevertibleTypeTraits.empty()) {
+// Revertible type trait is a feature for backwards compatibility with older
+// standard libraries that declare their own structs with the same name as
+// the builtins listed below. New builtins should NOT be added to this list.
 #define RTT_JOIN(X, Y) X##Y
 #define REVERTIBLE_TYPE_TRAIT(Name)                                            \
   RevertibleTypeTraits[PP.getIdentifierInfo(#Name)] = RTT_JOIN(tok::kw_, Name)
@@ -790,7 +793,6 @@ bool Parser::isRevertibleTypeTrait(const IdentifierInfo *II,
     REVERTIBLE_TYPE_TRAIT(__is_fundamental);
     REVERTIBLE_TYPE_TRAIT(__is_integral);
     REVERTIBLE_TYPE_TRAIT(__is_interface_class);
-    REVERTIBLE_TYPE_TRAIT(__is_layout_compatible);
     REVERTIBLE_TYPE_TRAIT(__is_literal);
     REVERTIBLE_TYPE_TRAIT(__is_lvalue_expr);
     REVERTIBLE_TYPE_TRAIT(__is_lvalue_reference);
@@ -841,6 +843,26 @@ bool Parser::isRevertibleTypeTrait(const IdentifierInfo *II,
   return false;
 }
 
+ExprResult Parser::ParseBuiltinPtrauthTypeDiscriminator() {
+  SourceLocation Loc = ConsumeToken();
+
+  BalancedDelimiterTracker T(*this, tok::l_paren);
+  if (T.expectAndConsume())
+    return ExprError();
+
+  TypeResult Ty = ParseTypeName();
+  if (Ty.isInvalid()) {
+    SkipUntil(tok::r_paren, StopAtSemi);
+    return ExprError();
+  }
+
+  SourceLocation EndLoc = Tok.getLocation();
+  T.consumeClose();
+  return Actions.ActOnUnaryExprOrTypeTraitExpr(
+      Loc, UETT_PtrAuthTypeDiscriminator,
+      /*isType=*/true, Ty.get().getAsOpaquePtr(), SourceRange(Loc, EndLoc));
+}
+
 /// Parse a cast-expression, or, if \pisUnaryExpression is true, parse
 /// a unary-expression.
 ///
@@ -1818,6 +1840,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
     Res = ParseArrayTypeTrait();
     break;
 
+  case tok::kw___builtin_ptrauth_type_discriminator:
+    return ParseBuiltinPtrauthTypeDiscriminator();
+
   case tok::kw___is_lvalue_expr:
   case tok::kw___is_rvalue_expr:
     if (NotPrimaryExpression)
@@ -2523,7 +2548,19 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok,
       return ExprError();
     }
 
-    Operand = ParseCastExpression(UnaryExprOnly);
+    // If we're parsing a chain that consists of keywords that could be
+    // followed by a non-parenthesized expression, BalancedDelimiterTracker
+    // is not going to help when the nesting is too deep. In this corner case
+    // we continue to parse with sufficient stack space to avoid crashing.
+    if (OpTok.isOneOf(tok::kw_sizeof, tok::kw___datasizeof, tok::kw___alignof,
+                      tok::kw_alignof, tok::kw__Alignof) &&
+        Tok.isOneOf(tok::kw_sizeof, tok::kw___datasizeof, tok::kw___alignof,
+                    tok::kw_alignof, tok::kw__Alignof))
+      Actions.runWithSufficientStackSpace(Tok.getLocation(), [&] {
+        Operand = ParseCastExpression(UnaryExprOnly);
+      });
+    else
+      Operand = ParseCastExpression(UnaryExprOnly);
   } else {
     // If it starts with a '(', we know that it is either a parenthesized
     // type-name, or it is a unary-expression that starts with a compound
@@ -3697,7 +3734,7 @@ void Parser::injectEmbedTokens() {
     I += 2;
   }
   PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
-                      /*IsReinject=*/false);
+                      /*IsReinject=*/true);
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
 }
 
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 1d364f77a8146..c0eae73927cdd 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -159,8 +159,8 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType,
 bool Parser::ParseOptionalCXXScopeSpecifier(
     CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors,
     bool EnteringContext, bool *MayBePseudoDestructor, bool IsTypename,
-    const IdentifierInfo **LastII, bool OnlyNamespace,
-    bool InUsingDeclaration) {
+    const IdentifierInfo **LastII, bool OnlyNamespace, bool InUsingDeclaration,
+    bool Disambiguation) {
   assert(getLangOpts().CPlusPlus &&
          "Call sites of this function should be guarded by checking for C++");
 
@@ -528,13 +528,11 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
       UnqualifiedId TemplateName;
       TemplateName.setIdentifier(&II, Tok.getLocation());
       bool MemberOfUnknownSpecialization;
-      if (TemplateNameKind TNK = Actions.isTemplateName(getCurScope(), SS,
-                                              /*hasTemplateKeyword=*/false,
-                                                        TemplateName,
-                                                        ObjectType,
-                                                        EnteringContext,
-                                                        Template,
-                                              MemberOfUnknownSpecialization)) {
+      if (TemplateNameKind TNK = Actions.isTemplateName(
+              getCurScope(), SS,
+              /*hasTemplateKeyword=*/false, TemplateName, ObjectType,
+              EnteringContext, Template, MemberOfUnknownSpecialization,
+              Disambiguation)) {
         // If lookup didn't find anything, we treat the name as a template-name
         // anyway. C++20 requires this, and in prior language modes it improves
         // error recovery. But before we commit to this, check that we actually
@@ -557,7 +555,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
         continue;
       }
 
-      if (MemberOfUnknownSpecialization && (ObjectType || SS.isSet()) &&
+      if (MemberOfUnknownSpecialization && !Disambiguation &&
+          (ObjectType || SS.isSet()) &&
           (IsTypename || isTemplateArgumentList(1) == TPResult::True)) {
         // If we had errors before, ObjectType can be dependent even without any
         // templates. Do not report missing template keyword in that case.
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 326cd22ff9005..e975e96c5c7e4 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2885,6 +2885,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
     }
     break;
   }
+  case OMPD_reverse:
+  case OMPD_interchange:
   case OMPD_declare_target: {
     SourceLocation DTLoc = ConsumeAnyToken();
     bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end);
@@ -3835,6 +3837,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
     // Get a defaultmap modifier
     unsigned Modifier = getOpenMPSimpleClauseType(
         Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+
     // Set defaultmap modifier to unknown if it is either scalar, aggregate, or
     // pointer
     if (Modifier < OMPC_DEFAULTMAP_MODIFIER_unknown)
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index cc6f18b5b319f..aef4ddb758816 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -14,6 +14,7 @@
 #include "clang/Basic/PragmaKinds.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Lex/Token.h"
 #include "clang/Parse/LoopHint.h"
 #include "clang/Parse/ParseDiagnostic.h"
@@ -411,6 +412,19 @@ struct PragmaRISCVHandler : public PragmaHandler {
   Sema &Actions;
 };
 
+struct PragmaMCFuncHandler : public PragmaHandler {
+  PragmaMCFuncHandler(bool ReportError)
+      : PragmaHandler("mc_func"), ReportError(ReportError) {}
+  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
+                    Token &Tok) override {
+    if (ReportError)
+      PP.Diag(Tok, diag::err_pragma_mc_func_not_supported);
+  }
+
+private:
+  bool ReportError = false;
+};
+
 void markAsReinjectedForRelexing(llvm::MutableArrayRef<clang::Token> Toks) {
   for (auto &T : Toks)
     T.setFlag(clang::Token::IsReinjected);
@@ -568,6 +582,12 @@ void Parser::initializePragmaHandlers() {
     RISCVPragmaHandler = std::make_unique<PragmaRISCVHandler>(Actions);
     PP.AddPragmaHandler("clang", RISCVPragmaHandler.get());
   }
+
+  if (getTargetInfo().getTriple().isOSAIX()) {
+    MCFuncPragmaHandler = std::make_unique<PragmaMCFuncHandler>(
+        PP.getPreprocessorOpts().ErrorOnPragmaMcfuncOnAIX);
+    PP.AddPragmaHandler(MCFuncPragmaHandler.get());
+  }
 }
 
 void Parser::resetPragmaHandlers() {
@@ -702,6 +722,11 @@ void Parser::resetPragmaHandlers() {
     PP.RemovePragmaHandler("clang", RISCVPragmaHandler.get());
     RISCVPragmaHandler.reset();
   }
+
+  if (getTargetInfo().getTriple().isOSAIX()) {
+    PP.RemovePragmaHandler(MCFuncPragmaHandler.get());
+    MCFuncPragmaHandler.reset();
+  }
 }
 
 /// Handle the annotation token produced for #pragma unused(...)
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 22d38adc28ebe..bdb3fc051d0b3 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -299,6 +299,15 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes(
     goto Retry;
   }
 
+  case tok::kw_template: {
+    SourceLocation DeclEnd;
+    ParsedAttributes Attrs(AttrFactory);
+    ParseTemplateDeclarationOrSpecialization(DeclaratorContext::Block, DeclEnd,
+                                             Attrs,
+                                             getAccessSpecifierIfPresent());
+    return StmtError();
+  }
+
   case tok::kw_case:                // C99 6.8.1: labeled-statement
     return ParseCaseStatement(StmtCtx);
   case tok::kw_default:             // C99 6.8.1: labeled-statement
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index e6a2d132f9618..49d6be63bbfe8 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -45,6 +45,7 @@ add_clang_library(clangSema
   SemaAvailability.cpp
   SemaBPF.cpp
   SemaBase.cpp
+  SemaBoundsSafety.cpp
   SemaCXXScopeSpec.cpp
   SemaCast.cpp
   SemaChecking.cpp
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index 995e4cbadacfe..7389046eaddde 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CheckExprLifetime.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Sema/Initialization.h"
@@ -191,7 +192,8 @@ struct IndirectLocalPathEntry {
     TemporaryCopy,
     LambdaCaptureInit,
     GslReferenceInit,
-    GslPointerInit
+    GslPointerInit,
+    GslPointerAssignment,
   } Kind;
   Expr *E;
   union {
@@ -307,8 +309,7 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) {
   const auto *RD = FD->getParamDecl(0)->getType()->getPointeeCXXRecordDecl();
   if (!FD->isInStdNamespace() || !RD || !RD->isInStdNamespace())
     return false;
-  if (!isRecordWithAttr<PointerAttr>(QualType(RD->getTypeForDecl(), 0)) &&
-      !isRecordWithAttr<OwnerAttr>(QualType(RD->getTypeForDecl(), 0)))
+  if (!RD->hasAttr<PointerAttr>() && !RD->hasAttr<OwnerAttr>())
     return false;
   if (FD->getReturnType()->isPointerType() ||
       isRecordWithAttr<PointerAttr>(FD->getReturnType())) {
@@ -337,7 +338,8 @@ static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call,
       for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) {
         if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit)
           continue;
-        if (PE.Kind == IndirectLocalPathEntry::GslPointerInit)
+        if (PE.Kind == IndirectLocalPathEntry::GslPointerInit ||
+            PE.Kind == IndirectLocalPathEntry::GslPointerAssignment)
           return;
         break;
       }
@@ -547,6 +549,14 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path,
                                        EnableLifetimeWarnings);
   }
 
+  if (auto *M = dyn_cast<MemberExpr>(Init)) {
+    // Lifetime of a non-reference type field is same as base object.
+    if (auto *F = dyn_cast<FieldDecl>(M->getMemberDecl());
+        F && !F->getType()->isReferenceType())
+      visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true,
+                                       EnableLifetimeWarnings);
+  }
+
   if (isa<CallExpr>(Init)) {
     if (EnableLifetimeWarnings)
       handleGslAnnotatedTypes(Path, Init, Visit);
@@ -815,7 +825,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path,
   if (auto *CCE = dyn_cast<CXXConstructExpr>(Init)) {
     if (CCE->getConstructor()->isCopyOrMoveConstructor()) {
       if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(CCE->getArg(0))) {
-        // assert(false && "hit temporary copy path");
         Expr *Arg = MTE->getSubExpr();
         Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg,
                         CCE->getConstructor()});
@@ -937,6 +946,7 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I,
     case IndirectLocalPathEntry::TemporaryCopy:
     case IndirectLocalPathEntry::GslReferenceInit:
     case IndirectLocalPathEntry::GslPointerInit:
+    case IndirectLocalPathEntry::GslPointerAssignment:
       // These exist primarily to mark the path as not permitting or
       // supporting lifetime extension.
       break;
@@ -957,16 +967,20 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I,
   return E->getSourceRange();
 }
 
-static bool pathOnlyInitializesGslPointer(IndirectLocalPath &Path) {
+static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) {
   for (const auto &It : llvm::reverse(Path)) {
-    if (It.Kind == IndirectLocalPathEntry::VarInit)
-      continue;
-    if (It.Kind == IndirectLocalPathEntry::AddressOf)
-      continue;
-    if (It.Kind == IndirectLocalPathEntry::LifetimeBoundCall)
+    switch (It.Kind) {
+    case IndirectLocalPathEntry::VarInit:
+    case IndirectLocalPathEntry::AddressOf:
+    case IndirectLocalPathEntry::LifetimeBoundCall:
       continue;
-    return It.Kind == IndirectLocalPathEntry::GslPointerInit ||
-           It.Kind == IndirectLocalPathEntry::GslReferenceInit;
+    case IndirectLocalPathEntry::GslPointerInit:
+    case IndirectLocalPathEntry::GslReferenceInit:
+    case IndirectLocalPathEntry::GslPointerAssignment:
+      return true;
+    default:
+      return false;
+    }
   }
   return false;
 }
@@ -975,7 +989,8 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
                                   const InitializedEntity *InitEntity,
                                   const InitializedEntity *ExtendingEntity,
                                   LifetimeKind LK,
-                                  const AssignedEntity *AEntity, Expr *Init) {
+                                  const AssignedEntity *AEntity, Expr *Init,
+                                  bool EnableLifetimeWarnings) {
   assert((AEntity && LK == LK_Assignment) ||
          (InitEntity && LK != LK_Assignment));
   // If this entity doesn't have an interesting lifetime, don't bother looking
@@ -992,9 +1007,9 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
 
     auto *MTE = dyn_cast<MaterializeTemporaryExpr>(L);
 
-    bool IsGslPtrInitWithGslTempOwner = false;
+    bool IsGslPtrValueFromGslTempOwner = false;
     bool IsLocalGslOwner = false;
-    if (pathOnlyInitializesGslPointer(Path)) {
+    if (pathOnlyHandlesGslPointer(Path)) {
       if (isa<DeclRefExpr>(L)) {
         // We do not want to follow the references when returning a pointer
         // originating from a local owner to avoid the following false positive:
@@ -1005,13 +1020,13 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
         if (pathContainsInit(Path) || !IsLocalGslOwner)
           return false;
       } else {
-        IsGslPtrInitWithGslTempOwner =
+        IsGslPtrValueFromGslTempOwner =
             MTE && !MTE->getExtendingDecl() &&
             isRecordWithAttr<OwnerAttr>(MTE->getType());
         // Skipping a chain of initializing gsl::Pointer annotated objects.
         // We are looking only for the final source to find out if it was
         // a local or temporary owner or the address of a local variable/param.
-        if (!IsGslPtrInitWithGslTempOwner)
+        if (!IsGslPtrValueFromGslTempOwner)
           return true;
       }
     }
@@ -1030,7 +1045,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
         return false;
       }
 
-      if (IsGslPtrInitWithGslTempOwner && DiagLoc.isValid()) {
+      if (IsGslPtrValueFromGslTempOwner && DiagLoc.isValid()) {
         SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer)
             << DiagRange;
         return false;
@@ -1073,14 +1088,16 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
     }
 
     case LK_Assignment: {
-      if (!MTE)
+      if (!MTE || pathContainsInit(Path))
         return false;
       assert(shouldLifetimeExtendThroughPath(Path) ==
                  PathLifetimeKind::NoExtend &&
              "No lifetime extension for assignments");
-      if (!pathContainsInit(Path))
-        SemaRef.Diag(DiagLoc, diag::warn_dangling_pointer_assignment)
-            << AEntity->LHS << DiagRange;
+      SemaRef.Diag(DiagLoc,
+                   IsGslPtrValueFromGslTempOwner
+                       ? diag::warn_dangling_lifetime_pointer_assignment
+                       : diag::warn_dangling_pointer_assignment)
+          << AEntity->LHS << DiagRange;
       return false;
     }
     case LK_MemInitializer: {
@@ -1090,7 +1107,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
         // temporary, the program is ill-formed.
         if (auto *ExtendingDecl =
                 ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) {
-          if (IsGslPtrInitWithGslTempOwner) {
+          if (IsGslPtrValueFromGslTempOwner) {
             SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer_member)
                 << ExtendingDecl << DiagRange;
             SemaRef.Diag(ExtendingDecl->getLocation(),
@@ -1131,7 +1148,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
 
         // Suppress false positives for code like the one below:
         //   Ctor(unique_ptr<T> up) : member(*up), member2(move(up)) {}
-        if (IsLocalGslOwner && pathOnlyInitializesGslPointer(Path))
+        if (IsLocalGslOwner && pathOnlyHandlesGslPointer(Path))
           return false;
 
         auto *DRE = dyn_cast<DeclRefExpr>(L);
@@ -1159,7 +1176,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
 
     case LK_New:
       if (isa<MaterializeTemporaryExpr>(L)) {
-        if (IsGslPtrInitWithGslTempOwner)
+        if (IsGslPtrValueFromGslTempOwner)
           SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer)
               << DiagRange;
         else
@@ -1226,6 +1243,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
       case IndirectLocalPathEntry::TemporaryCopy:
       case IndirectLocalPathEntry::GslPointerInit:
       case IndirectLocalPathEntry::GslReferenceInit:
+      case IndirectLocalPathEntry::GslPointerAssignment:
         // FIXME: Consider adding a note for these.
         break;
 
@@ -1265,9 +1283,11 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
     return false;
   };
 
-  bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored(
-      diag::warn_dangling_lifetime_pointer, SourceLocation());
   llvm::SmallVector<IndirectLocalPathEntry, 8> Path;
+  if (EnableLifetimeWarnings && LK == LK_Assignment &&
+      isRecordWithAttr<PointerAttr>(AEntity->LHS->getType()))
+    Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init});
+
   if (Init->isGLValue())
     visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding,
                                           TemporaryVisitor,
@@ -1284,16 +1304,26 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity,
   auto LTResult = getEntityLifetime(&Entity);
   LifetimeKind LK = LTResult.getInt();
   const InitializedEntity *ExtendingEntity = LTResult.getPointer();
-  checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, nullptr, Init);
+  bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored(
+      diag::warn_dangling_lifetime_pointer, SourceLocation());
+  checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK,
+                        /*AEntity*/ nullptr, Init, EnableLifetimeWarnings);
 }
 
 void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity,
                        Expr *Init) {
-  if (!Entity.LHS->getType()->isPointerType()) // builtin pointer type
+  bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored(
+      diag::warn_dangling_lifetime_pointer, SourceLocation());
+  bool RunAnalysis = Entity.LHS->getType()->isPointerType() ||
+                     (EnableLifetimeWarnings &&
+                      isRecordWithAttr<PointerAttr>(Entity.LHS->getType()));
+
+  if (!RunAnalysis)
     return;
+
   checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr,
                         /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity,
-                        Init);
+                        Init, EnableLifetimeWarnings);
 }
 
 } // namespace clang::sema
diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp
index 5c2ebf51c03c0..23c7982f722b8 100644
--- a/clang/lib/Sema/ParsedAttr.cpp
+++ b/clang/lib/Sema/ParsedAttr.cpp
@@ -229,7 +229,7 @@ bool ParsedAttr::slidesFromDeclToDeclSpecLegacyBehavior() const {
     // atributes.
     return false;
 
-  assert(isStandardAttributeSyntax());
+  assert(isStandardAttributeSyntax() || isAlignas());
 
   // We have historically allowed some type attributes with standard attribute
   // syntax to slide to the decl-specifier-seq, so we have to keep supporting
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index bf7ce4b1ffc3d..8287db1afb302 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -496,7 +496,9 @@ void Sema::Initialize() {
 #include "clang/Basic/OpenCLExtensionTypes.def"
   }
 
-  if (Context.getTargetInfo().hasAArch64SVETypes()) {
+  if (Context.getTargetInfo().hasAArch64SVETypes() ||
+      (Context.getAuxTargetInfo() &&
+       Context.getAuxTargetInfo()->hasAArch64SVETypes())) {
 #define SVE_TYPE(Name, Id, SingletonId) \
     addImplicitTypedef(Name, Context.SingletonId);
 #include "clang/Basic/AArch64SVEACLETypes.def"
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 3482f3741fce6..2c49c1f64b2da 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -546,6 +546,13 @@ static void ProcessAPINotes(Sema &S, FunctionOrMethod AnyFunc,
                   Metadata);
 }
 
+/// Process API notes for a C++ method.
+static void ProcessAPINotes(Sema &S, CXXMethodDecl *Method,
+                            const api_notes::CXXMethodInfo &Info,
+                            VersionedInfoMetadata Metadata) {
+  ProcessAPINotes(S, (FunctionOrMethod)Method, Info, Metadata);
+}
+
 /// Process API notes for a global function.
 static void ProcessAPINotes(Sema &S, FunctionDecl *D,
                             const api_notes::GlobalFunctionInfo &Info,
@@ -776,6 +783,64 @@ static void ProcessVersionedAPINotes(
   }
 }
 
+static std::optional<api_notes::Context>
+UnwindNamespaceContext(DeclContext *DC, api_notes::APINotesManager &APINotes) {
+  if (auto NamespaceContext = dyn_cast<NamespaceDecl>(DC)) {
+    for (auto Reader : APINotes.findAPINotes(NamespaceContext->getLocation())) {
+      // Retrieve the context ID for the parent namespace of the decl.
+      std::stack<NamespaceDecl *> NamespaceStack;
+      {
+        for (auto CurrentNamespace = NamespaceContext; CurrentNamespace;
+             CurrentNamespace =
+                 dyn_cast<NamespaceDecl>(CurrentNamespace->getParent())) {
+          if (!CurrentNamespace->isInlineNamespace())
+            NamespaceStack.push(CurrentNamespace);
+        }
+      }
+      std::optional<api_notes::ContextID> NamespaceID;
+      while (!NamespaceStack.empty()) {
+        auto CurrentNamespace = NamespaceStack.top();
+        NamespaceStack.pop();
+        NamespaceID =
+            Reader->lookupNamespaceID(CurrentNamespace->getName(), NamespaceID);
+        if (!NamespaceID)
+          return std::nullopt;
+      }
+      if (NamespaceID)
+        return api_notes::Context(*NamespaceID,
+                                  api_notes::ContextKind::Namespace);
+    }
+  }
+  return std::nullopt;
+}
+
+static std::optional<api_notes::Context>
+UnwindTagContext(TagDecl *DC, api_notes::APINotesManager &APINotes) {
+  assert(DC && "tag context must not be null");
+  for (auto Reader : APINotes.findAPINotes(DC->getLocation())) {
+    // Retrieve the context ID for the parent tag of the decl.
+    std::stack<TagDecl *> TagStack;
+    {
+      for (auto CurrentTag = DC; CurrentTag;
+           CurrentTag = dyn_cast<TagDecl>(CurrentTag->getParent()))
+        TagStack.push(CurrentTag);
+    }
+    assert(!TagStack.empty());
+    std::optional<api_notes::Context> Ctx =
+        UnwindNamespaceContext(TagStack.top()->getDeclContext(), APINotes);
+    while (!TagStack.empty()) {
+      auto CurrentTag = TagStack.top();
+      TagStack.pop();
+      auto CtxID = Reader->lookupTagID(CurrentTag->getName(), Ctx);
+      if (!CtxID)
+        return std::nullopt;
+      Ctx = api_notes::Context(*CtxID, api_notes::ContextKind::Tag);
+    }
+    return Ctx;
+  }
+  return std::nullopt;
+}
+
 /// Process API notes that are associated with this declaration, mapping them
 /// to attributes as appropriate.
 void Sema::ProcessAPINotes(Decl *D) {
@@ -787,35 +852,8 @@ void Sema::ProcessAPINotes(Decl *D) {
       D->getDeclContext()->isNamespace() ||
       D->getDeclContext()->isExternCContext() ||
       D->getDeclContext()->isExternCXXContext()) {
-    std::optional<api_notes::Context> APINotesContext;
-    if (auto NamespaceContext = dyn_cast<NamespaceDecl>(D->getDeclContext())) {
-      for (auto Reader :
-           APINotes.findAPINotes(NamespaceContext->getLocation())) {
-        // Retrieve the context ID for the parent namespace of the decl.
-        std::stack<NamespaceDecl *> NamespaceStack;
-        {
-          for (auto CurrentNamespace = NamespaceContext; CurrentNamespace;
-               CurrentNamespace =
-                   dyn_cast<NamespaceDecl>(CurrentNamespace->getParent())) {
-            if (!CurrentNamespace->isInlineNamespace())
-              NamespaceStack.push(CurrentNamespace);
-          }
-        }
-        std::optional<api_notes::ContextID> NamespaceID;
-        while (!NamespaceStack.empty()) {
-          auto CurrentNamespace = NamespaceStack.top();
-          NamespaceStack.pop();
-          NamespaceID = Reader->lookupNamespaceID(CurrentNamespace->getName(),
-                                                  NamespaceID);
-          if (!NamespaceID)
-            break;
-        }
-        if (NamespaceID)
-          APINotesContext = api_notes::Context(
-              *NamespaceID, api_notes::ContextKind::Namespace);
-      }
-    }
-
+    std::optional<api_notes::Context> APINotesContext =
+        UnwindNamespaceContext(D->getDeclContext(), APINotes);
     // Global variables.
     if (auto VD = dyn_cast<VarDecl>(D)) {
       for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
@@ -887,6 +925,8 @@ void Sema::ProcessAPINotes(Decl *D) {
       }
 
       for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
+        if (auto ParentTag = dyn_cast<TagDecl>(Tag->getDeclContext()))
+          APINotesContext = UnwindTagContext(ParentTag, APINotes);
         auto Info = Reader->lookupTag(LookupName, APINotesContext);
         ProcessVersionedAPINotes(*this, Tag, Info);
       }
@@ -1001,4 +1041,30 @@ void Sema::ProcessAPINotes(Decl *D) {
       return;
     }
   }
+
+  if (auto TagContext = dyn_cast<TagDecl>(D->getDeclContext())) {
+    if (auto CXXMethod = dyn_cast<CXXMethodDecl>(D)) {
+      if (!isa<CXXConstructorDecl>(CXXMethod) &&
+          !isa<CXXDestructorDecl>(CXXMethod) &&
+          !isa<CXXConversionDecl>(CXXMethod) &&
+          !CXXMethod->isOverloadedOperator()) {
+        for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
+          if (auto Context = UnwindTagContext(TagContext, APINotes)) {
+            auto Info =
+                Reader->lookupCXXMethod(Context->id, CXXMethod->getName());
+            ProcessVersionedAPINotes(*this, CXXMethod, Info);
+          }
+        }
+      }
+    }
+
+    if (auto Tag = dyn_cast<TagDecl>(D)) {
+      for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
+        if (auto Context = UnwindTagContext(TagContext, APINotes)) {
+          auto Info = Reader->lookupTag(Tag->getName(), Context);
+          ProcessVersionedAPINotes(*this, Tag, Info);
+        }
+      }
+    }
+  }
 }
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index aaabd989c5c9f..b0c239678d0b0 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -117,7 +117,7 @@ void Sema::inferGslPointerAttribute(NamedDecl *ND,
   if (!Parent)
     return;
 
-  static llvm::StringSet<> Containers{
+  static const llvm::StringSet<> Containers{
       "array",
       "basic_string",
       "deque",
@@ -137,9 +137,9 @@ void Sema::inferGslPointerAttribute(NamedDecl *ND,
       "unordered_multimap",
   };
 
-  static llvm::StringSet<> Iterators{"iterator", "const_iterator",
-                                     "reverse_iterator",
-                                     "const_reverse_iterator"};
+  static const llvm::StringSet<> Iterators{"iterator", "const_iterator",
+                                           "reverse_iterator",
+                                           "const_reverse_iterator"};
 
   if (Parent->isInStdNamespace() && Iterators.count(ND->getName()) &&
       Containers.count(Parent->getName()))
@@ -165,7 +165,7 @@ void Sema::inferGslPointerAttribute(TypedefNameDecl *TD) {
 }
 
 void Sema::inferGslOwnerPointerAttribute(CXXRecordDecl *Record) {
-  static llvm::StringSet<> StdOwners{
+  static const llvm::StringSet<> StdOwners{
       "any",
       "array",
       "basic_regex",
@@ -189,10 +189,11 @@ void Sema::inferGslOwnerPointerAttribute(CXXRecordDecl *Record) {
       "unordered_multimap",
       "variant",
   };
-  static llvm::StringSet<> StdPointers{
+  static const llvm::StringSet<> StdPointers{
       "basic_string_view",
       "reference_wrapper",
       "regex_iterator",
+      "span",
   };
 
   if (!Record->getIdentifier())
@@ -216,7 +217,7 @@ void Sema::inferGslOwnerPointerAttribute(CXXRecordDecl *Record) {
 }
 
 void Sema::inferNullableClassAttribute(CXXRecordDecl *CRD) {
-  static llvm::StringSet<> Nullable{
+  static const llvm::StringSet<> Nullable{
       "auto_ptr",         "shared_ptr", "unique_ptr",         "exception_ptr",
       "coroutine_handle", "function",   "move_only_function",
   };
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index b677a1f170cd1..dd98ab1a7f3f9 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -107,6 +107,12 @@ ShouldDiagnoseAvailabilityOfDecl(Sema &S, const NamedDecl *D,
     break;
   }
 
+  // For alias templates, get the underlying declaration.
+  if (const auto *ADecl = dyn_cast<TypeAliasTemplateDecl>(D)) {
+    D = ADecl->getTemplatedDecl();
+    Result = D->getAvailability(Message);
+  }
+
   // Forward class declarations get their attributes from their definition.
   if (const auto *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) {
     if (IDecl->getDefinition()) {
diff --git a/clang/lib/Sema/SemaBoundsSafety.cpp b/clang/lib/Sema/SemaBoundsSafety.cpp
new file mode 100644
index 0000000000000..290c820938898
--- /dev/null
+++ b/clang/lib/Sema/SemaBoundsSafety.cpp
@@ -0,0 +1,193 @@
+//===-- SemaBoundsSafety.cpp - Bounds Safety specific routines-*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis functions specific to `-fbounds-safety`
+/// (Bounds Safety) and also its attributes when used without `-fbounds-safety`
+/// (e.g. `counted_by`)
+///
+//===----------------------------------------------------------------------===//
+#include "clang/Sema/Sema.h"
+
+namespace clang {
+
+static CountAttributedType::DynamicCountPointerKind
+getCountAttrKind(bool CountInBytes, bool OrNull) {
+  if (CountInBytes)
+    return OrNull ? CountAttributedType::SizedByOrNull
+                  : CountAttributedType::SizedBy;
+  return OrNull ? CountAttributedType::CountedByOrNull
+                : CountAttributedType::CountedBy;
+}
+
+static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
+  const auto *RD = FD->getParent();
+  // An unnamed struct is anonymous struct only if it's not instantiated.
+  // However, the struct may not be fully processed yet to determine
+  // whether it's anonymous or not. In that case, this function treats it as
+  // an anonymous struct and tries to find a named parent.
+  while (RD && (RD->isAnonymousStructOrUnion() ||
+                (!RD->isCompleteDefinition() && RD->getName().empty()))) {
+    const auto *Parent = dyn_cast<RecordDecl>(RD->getParent());
+    if (!Parent)
+      break;
+    RD = Parent;
+  }
+  return RD;
+}
+
+enum class CountedByInvalidPointeeTypeKind {
+  INCOMPLETE,
+  SIZELESS,
+  FUNCTION,
+  FLEXIBLE_ARRAY_MEMBER,
+  VALID,
+};
+
+bool Sema::CheckCountedByAttrOnField(
+    FieldDecl *FD, Expr *E,
+    llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls, bool CountInBytes,
+    bool OrNull) {
+  // Check the context the attribute is used in
+
+  unsigned Kind = getCountAttrKind(CountInBytes, OrNull);
+
+  if (FD->getParent()->isUnion()) {
+    Diag(FD->getBeginLoc(), diag::err_count_attr_in_union)
+        << Kind << FD->getSourceRange();
+    return true;
+  }
+
+  const auto FieldTy = FD->getType();
+  if (FieldTy->isArrayType() && (CountInBytes || OrNull)) {
+    Diag(FD->getBeginLoc(),
+         diag::err_count_attr_not_on_ptr_or_flexible_array_member)
+        << Kind << FD->getLocation() << /* suggest counted_by */ 1;
+    return true;
+  }
+  if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
+    Diag(FD->getBeginLoc(),
+         diag::err_count_attr_not_on_ptr_or_flexible_array_member)
+        << Kind << FD->getLocation() << /* do not suggest counted_by */ 0;
+    return true;
+  }
+
+  LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
+      LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
+  if (FieldTy->isArrayType() &&
+      !Decl::isFlexibleArrayMemberLike(getASTContext(), FD, FieldTy,
+                                       StrictFlexArraysLevel, true)) {
+    Diag(FD->getBeginLoc(),
+         diag::err_counted_by_attr_on_array_not_flexible_array_member)
+        << Kind << FD->getLocation();
+    return true;
+  }
+
+  CountedByInvalidPointeeTypeKind InvalidTypeKind =
+      CountedByInvalidPointeeTypeKind::VALID;
+  QualType PointeeTy;
+  int SelectPtrOrArr = 0;
+  if (FieldTy->isPointerType()) {
+    PointeeTy = FieldTy->getPointeeType();
+    SelectPtrOrArr = 0;
+  } else {
+    assert(FieldTy->isArrayType());
+    const ArrayType *AT = getASTContext().getAsArrayType(FieldTy);
+    PointeeTy = AT->getElementType();
+    SelectPtrOrArr = 1;
+  }
+  // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
+  // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
+  // when `FieldTy->isArrayType()`.
+  bool ShouldWarn = false;
+  if (PointeeTy->isIncompleteType() && !CountInBytes) {
+    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
+  } else if (PointeeTy->isSizelessType()) {
+    InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
+  } else if (PointeeTy->isFunctionType()) {
+    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
+  } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
+    if (FieldTy->isArrayType() && !getLangOpts().BoundsSafety) {
+      // This is a workaround for the Linux kernel that has already adopted
+      // `counted_by` on a FAM where the pointee is a struct with a FAM. This
+      // should be an error because computing the bounds of the array cannot be
+      // done correctly without manually traversing every struct object in the
+      // array at runtime. To allow the code to be built this error is
+      // downgraded to a warning.
+      ShouldWarn = true;
+    }
+    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
+  }
+
+  if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
+    unsigned DiagID = ShouldWarn
+                          ? diag::warn_counted_by_attr_elt_type_unknown_size
+                          : diag::err_counted_by_attr_pointee_unknown_size;
+    Diag(FD->getBeginLoc(), DiagID)
+        << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
+        << (ShouldWarn ? 1 : 0) << Kind << FD->getSourceRange();
+    return true;
+  }
+
+  // Check the expression
+
+  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+    Diag(E->getBeginLoc(), diag::err_count_attr_argument_not_integer)
+        << Kind << E->getSourceRange();
+    return true;
+  }
+
+  auto *DRE = dyn_cast<DeclRefExpr>(E);
+  if (!DRE) {
+    Diag(E->getBeginLoc(),
+         diag::err_count_attr_only_support_simple_decl_reference)
+        << Kind << E->getSourceRange();
+    return true;
+  }
+
+  auto *CountDecl = DRE->getDecl();
+  FieldDecl *CountFD = dyn_cast<FieldDecl>(CountDecl);
+  if (auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl)) {
+    CountFD = IFD->getAnonField();
+  }
+  if (!CountFD) {
+    Diag(E->getBeginLoc(), diag::err_count_attr_must_be_in_structure)
+        << CountDecl << Kind << E->getSourceRange();
+
+    Diag(CountDecl->getBeginLoc(),
+         diag::note_flexible_array_counted_by_attr_field)
+        << CountDecl << CountDecl->getSourceRange();
+    return true;
+  }
+
+  if (FD->getParent() != CountFD->getParent()) {
+    if (CountFD->getParent()->isUnion()) {
+      Diag(CountFD->getBeginLoc(), diag::err_count_attr_refer_to_union)
+          << Kind << CountFD->getSourceRange();
+      return true;
+    }
+    // Whether CountRD is an anonymous struct is not determined at this
+    // point. Thus, an additional diagnostic in case it's not anonymous struct
+    // is done later in `Parser::ParseStructDeclaration`.
+    auto *RD = GetEnclosingNamedOrTopAnonRecord(FD);
+    auto *CountRD = GetEnclosingNamedOrTopAnonRecord(CountFD);
+
+    if (RD != CountRD) {
+      Diag(E->getBeginLoc(), diag::err_count_attr_param_not_in_same_struct)
+          << CountFD << Kind << FieldTy->isArrayType() << E->getSourceRange();
+      Diag(CountFD->getBeginLoc(),
+           diag::note_flexible_array_counted_by_attr_field)
+          << CountFD << CountFD->getSourceRange();
+      return true;
+    }
+  }
+
+  Decls.push_back(TypeCoupledDeclRefInfo(CountFD, /*IsDref*/ false));
+  return false;
+}
+
+} // namespace clang
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e4df5848ff132..a2842343be968 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1481,6 +1481,18 @@ static bool BuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall,
   return false;
 }
 
+// In OpenCL, __builtin_alloca_* should return a pointer to address space
+// that corresponds to the stack address space i.e private address space.
+static void builtinAllocaAddrSpace(Sema &S, CallExpr *TheCall) {
+  QualType RT = TheCall->getType();
+  assert((RT->isPointerType() && !(RT->getPointeeType().hasAddressSpace())) &&
+         "__builtin_alloca has invalid address space");
+
+  RT = RT->getPointeeType();
+  RT = S.Context.getAddrSpaceQualType(RT, LangAS::opencl_private);
+  TheCall->setType(S.Context.getPointerType(RT));
+}
+
 namespace {
 enum PointerAuthOpKind {
   PAO_Strip,
@@ -1493,14 +1505,18 @@ enum PointerAuthOpKind {
 };
 }
 
-static bool checkPointerAuthEnabled(Sema &S, Expr *E) {
-  if (S.getLangOpts().PointerAuthIntrinsics)
+bool Sema::checkPointerAuthEnabled(SourceLocation Loc, SourceRange Range) {
+  if (getLangOpts().PointerAuthIntrinsics)
     return false;
 
-  S.Diag(E->getExprLoc(), diag::err_ptrauth_disabled) << E->getSourceRange();
+  Diag(Loc, diag::err_ptrauth_disabled) << Range;
   return true;
 }
 
+static bool checkPointerAuthEnabled(Sema &S, Expr *E) {
+  return S.checkPointerAuthEnabled(E->getExprLoc(), E->getSourceRange());
+}
+
 static bool checkPointerAuthKey(Sema &S, Expr *&Arg) {
   // Convert it to type 'int'.
   if (convertArgumentToType(S, Arg, S.Context.IntTy))
@@ -2214,6 +2230,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
   case Builtin::BI__builtin_alloca_uninitialized:
     Diag(TheCall->getBeginLoc(), diag::warn_alloca)
         << TheCall->getDirectCallee();
+    if (getLangOpts().OpenCL) {
+      builtinAllocaAddrSpace(*this, TheCall);
+    }
     break;
   case Builtin::BI__arithmetic_fence:
     if (BuiltinArithmeticFence(TheCall))
@@ -6400,7 +6419,7 @@ static const Expr *maybeConstEvalStringLiteral(ASTContext &Context,
 Sema::FormatStringType Sema::GetFormatStringType(const FormatAttr *Format) {
   return llvm::StringSwitch<FormatStringType>(Format->getType()->getName())
       .Case("scanf", FST_Scanf)
-      .Cases("printf", "printf0", FST_Printf)
+      .Cases("printf", "printf0", "syslog", FST_Printf)
       .Cases("NSString", "CFString", FST_NSString)
       .Case("strftime", FST_Strftime)
       .Case("strfmon", FST_Strfmon)
@@ -6494,6 +6513,7 @@ bool Sema::CheckFormatArguments(ArrayRef<const Expr *> Args,
     case FST_Kprintf:
     case FST_FreeBSDKPrintf:
     case FST_Printf:
+    case FST_Syslog:
       Diag(FormatLoc, diag::note_format_security_fixit)
         << FixItHint::CreateInsertion(FormatLoc, "\"%s\", ");
       break;
@@ -8230,7 +8250,7 @@ static void CheckFormatString(
 
   if (Type == Sema::FST_Printf || Type == Sema::FST_NSString ||
       Type == Sema::FST_FreeBSDKPrintf || Type == Sema::FST_OSLog ||
-      Type == Sema::FST_OSTrace) {
+      Type == Sema::FST_OSTrace || Type == Sema::FST_Syslog) {
     CheckPrintfHandler H(
         S, FExpr, OrigFormatExpr, Type, firstDataArg, numDataArgs,
         (Type == Sema::FST_NSString || Type == Sema::FST_OSTrace), Str, APK,
@@ -8572,20 +8592,46 @@ static bool IsStdFunction(const FunctionDecl *FDecl,
   return true;
 }
 
+enum class MathCheck { NaN, Inf };
+static bool IsInfOrNanFunction(StringRef calleeName, MathCheck Check) {
+  auto MatchesAny = [&](std::initializer_list<llvm::StringRef> names) {
+    return std::any_of(names.begin(), names.end(), [&](llvm::StringRef name) {
+      return calleeName == name;
+    });
+  };
+
+  switch (Check) {
+  case MathCheck::NaN:
+    return MatchesAny({"__builtin_nan", "__builtin_nanf", "__builtin_nanl",
+                       "__builtin_nanf16", "__builtin_nanf128"});
+  case MathCheck::Inf:
+    return MatchesAny({"__builtin_inf", "__builtin_inff", "__builtin_infl",
+                       "__builtin_inff16", "__builtin_inff128"});
+  }
+  llvm_unreachable("unknown MathCheck");
+}
+
 void Sema::CheckInfNaNFunction(const CallExpr *Call,
                                const FunctionDecl *FDecl) {
   FPOptions FPO = Call->getFPFeaturesInEffect(getLangOpts());
-  if ((IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered") ||
-       (Call->getBuiltinCallee() == Builtin::BI__builtin_nanf)) &&
-      FPO.getNoHonorNaNs())
+  bool HasIdentifier = FDecl->getIdentifier() != nullptr;
+  bool IsNaNOrIsUnordered =
+      IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered");
+  bool IsSpecialNaN =
+      HasIdentifier && IsInfOrNanFunction(FDecl->getName(), MathCheck::NaN);
+  if ((IsNaNOrIsUnordered || IsSpecialNaN) && FPO.getNoHonorNaNs()) {
     Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
         << 1 << 0 << Call->getSourceRange();
-  else if ((IsStdFunction(FDecl, "isinf") ||
-            (IsStdFunction(FDecl, "isfinite") ||
-             (FDecl->getIdentifier() && FDecl->getName() == "infinity"))) &&
-           FPO.getNoHonorInfs())
-    Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
-        << 0 << 0 << Call->getSourceRange();
+  } else {
+    bool IsInfOrIsFinite =
+        IsStdFunction(FDecl, "isinf") || IsStdFunction(FDecl, "isfinite");
+    bool IsInfinityOrIsSpecialInf =
+        HasIdentifier && ((FDecl->getName() == "infinity") ||
+                          IsInfOrNanFunction(FDecl->getName(), MathCheck::Inf));
+    if ((IsInfOrIsFinite || IsInfinityOrIsSpecialInf) && FPO.getNoHonorInfs())
+      Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
+          << 0 << 0 << Call->getSourceRange();
+  }
 }
 
 void Sema::CheckAbsoluteValueFunction(const CallExpr *Call,
@@ -14050,10 +14096,11 @@ void Sema::DiagnoseSelfMove(const Expr *LHSExpr, const Expr *RHSExpr,
 
 //===--- Layout compatibility ----------------------------------------------//
 
-static bool isLayoutCompatible(ASTContext &C, QualType T1, QualType T2);
+static bool isLayoutCompatible(const ASTContext &C, QualType T1, QualType T2);
 
 /// Check if two enumeration types are layout-compatible.
-static bool isLayoutCompatible(ASTContext &C, EnumDecl *ED1, EnumDecl *ED2) {
+static bool isLayoutCompatible(const ASTContext &C, const EnumDecl *ED1,
+                               const EnumDecl *ED2) {
   // C++11 [dcl.enum] p8:
   // Two enumeration types are layout-compatible if they have the same
   // underlying type.
@@ -14064,8 +14111,8 @@ static bool isLayoutCompatible(ASTContext &C, EnumDecl *ED1, EnumDecl *ED2) {
 /// Check if two fields are layout-compatible.
 /// Can be used on union members, which are exempt from alignment requirement
 /// of common initial sequence.
-static bool isLayoutCompatible(ASTContext &C, FieldDecl *Field1,
-                               FieldDecl *Field2,
+static bool isLayoutCompatible(const ASTContext &C, const FieldDecl *Field1,
+                               const FieldDecl *Field2,
                                bool AreUnionMembers = false) {
   [[maybe_unused]] const Type *Field1Parent =
       Field1->getParent()->getTypeForDecl();
@@ -14108,60 +14155,33 @@ static bool isLayoutCompatible(ASTContext &C, FieldDecl *Field1,
 
 /// Check if two standard-layout structs are layout-compatible.
 /// (C++11 [class.mem] p17)
-static bool isLayoutCompatibleStruct(ASTContext &C, RecordDecl *RD1,
-                                     RecordDecl *RD2) {
-  // If both records are C++ classes, check that base classes match.
-  if (const CXXRecordDecl *D1CXX = dyn_cast<CXXRecordDecl>(RD1)) {
-    // If one of records is a CXXRecordDecl we are in C++ mode,
-    // thus the other one is a CXXRecordDecl, too.
-    const CXXRecordDecl *D2CXX = cast<CXXRecordDecl>(RD2);
-    // Check number of base classes.
-    if (D1CXX->getNumBases() != D2CXX->getNumBases())
-      return false;
+static bool isLayoutCompatibleStruct(const ASTContext &C, const RecordDecl *RD1,
+                                     const RecordDecl *RD2) {
+  // Get to the class where the fields are declared
+  if (const CXXRecordDecl *D1CXX = dyn_cast<CXXRecordDecl>(RD1))
+    RD1 = D1CXX->getStandardLayoutBaseWithFields();
 
-    // Check the base classes.
-    for (CXXRecordDecl::base_class_const_iterator
-               Base1 = D1CXX->bases_begin(),
-           BaseEnd1 = D1CXX->bases_end(),
-              Base2 = D2CXX->bases_begin();
-         Base1 != BaseEnd1;
-         ++Base1, ++Base2) {
-      if (!isLayoutCompatible(C, Base1->getType(), Base2->getType()))
-        return false;
-    }
-  } else if (const CXXRecordDecl *D2CXX = dyn_cast<CXXRecordDecl>(RD2)) {
-    // If only RD2 is a C++ class, it should have zero base classes.
-    if (D2CXX->getNumBases() > 0)
-      return false;
-  }
+  if (const CXXRecordDecl *D2CXX = dyn_cast<CXXRecordDecl>(RD2))
+    RD2 = D2CXX->getStandardLayoutBaseWithFields();
 
   // Check the fields.
-  RecordDecl::field_iterator Field2 = RD2->field_begin(),
-                             Field2End = RD2->field_end(),
-                             Field1 = RD1->field_begin(),
-                             Field1End = RD1->field_end();
-  for ( ; Field1 != Field1End && Field2 != Field2End; ++Field1, ++Field2) {
-    if (!isLayoutCompatible(C, *Field1, *Field2))
-      return false;
-  }
-  if (Field1 != Field1End || Field2 != Field2End)
-    return false;
-
-  return true;
+  return llvm::equal(RD1->fields(), RD2->fields(),
+                     [&C](const FieldDecl *F1, const FieldDecl *F2) -> bool {
+                       return isLayoutCompatible(C, F1, F2);
+                     });
 }
 
 /// Check if two standard-layout unions are layout-compatible.
 /// (C++11 [class.mem] p18)
-static bool isLayoutCompatibleUnion(ASTContext &C, RecordDecl *RD1,
-                                    RecordDecl *RD2) {
-  llvm::SmallPtrSet<FieldDecl *, 8> UnmatchedFields;
+static bool isLayoutCompatibleUnion(const ASTContext &C, const RecordDecl *RD1,
+                                    const RecordDecl *RD2) {
+  llvm::SmallPtrSet<const FieldDecl *, 8> UnmatchedFields;
   for (auto *Field2 : RD2->fields())
     UnmatchedFields.insert(Field2);
 
   for (auto *Field1 : RD1->fields()) {
-    llvm::SmallPtrSet<FieldDecl *, 8>::iterator
-        I = UnmatchedFields.begin(),
-        E = UnmatchedFields.end();
+    auto I = UnmatchedFields.begin();
+    auto E = UnmatchedFields.end();
 
     for ( ; I != E; ++I) {
       if (isLayoutCompatible(C, Field1, *I, /*IsUnionMember=*/true)) {
@@ -14178,8 +14198,8 @@ static bool isLayoutCompatibleUnion(ASTContext &C, RecordDecl *RD1,
   return UnmatchedFields.empty();
 }
 
-static bool isLayoutCompatible(ASTContext &C, RecordDecl *RD1,
-                               RecordDecl *RD2) {
+static bool isLayoutCompatible(const ASTContext &C, const RecordDecl *RD1,
+                               const RecordDecl *RD2) {
   if (RD1->isUnion() != RD2->isUnion())
     return false;
 
@@ -14190,7 +14210,7 @@ static bool isLayoutCompatible(ASTContext &C, RecordDecl *RD1,
 }
 
 /// Check if two types are layout-compatible in C++11 sense.
-static bool isLayoutCompatible(ASTContext &C, QualType T1, QualType T2) {
+static bool isLayoutCompatible(const ASTContext &C, QualType T1, QualType T2) {
   if (T1.isNull() || T2.isNull())
     return false;
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index a315ad3373b48..44aedb416e670 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -5946,7 +5946,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
 
 static QualType getCoreType(QualType Ty) {
   do {
-    if (Ty->isPointerType() || Ty->isReferenceType())
+    if (Ty->isPointerOrReferenceType())
       Ty = Ty->getPointeeType();
     else if (Ty->isArrayType())
       Ty = Ty->castAsArrayTypeUnsafe()->getElementType();
@@ -6949,6 +6949,11 @@ static void checkAttributesAfterMerging(Sema &S, NamedDecl &ND) {
     }
   }
 
+  if (HybridPatchableAttr *Attr = ND.getAttr<HybridPatchableAttr>()) {
+    if (!ND.isExternallyVisible())
+      S.Diag(Attr->getLocation(),
+             diag::warn_attribute_hybrid_patchable_non_extern);
+  }
   if (const InheritableAttr *Attr = getDLLAttr(&ND)) {
     auto *VD = dyn_cast<VarDecl>(&ND);
     bool IsAnonymousNS = false;
@@ -7495,10 +7500,10 @@ NamedDecl *Sema::ActOnVariableDeclarator(
     tryToFixVariablyModifiedVarType(TInfo, R, D.getIdentifierLoc(),
                                     /*DiagID=*/0);
 
-  if (const AutoType *AutoT = R->getAs<AutoType>())
-    CheckConstrainedAuto(
-        AutoT,
-        TInfo->getTypeLoc().getContainedAutoTypeLoc().getConceptNameLoc());
+  if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) {
+    const AutoType *AT = TL.getTypePtr();
+    CheckConstrainedAuto(AT, TL.getConceptNameLoc());
+  }
 
   bool IsMemberSpecialization = false;
   bool IsVariableTemplateSpecialization = false;
@@ -9437,7 +9442,7 @@ static OpenCLParamType getOpenCLKernelParameterType(Sema &S, QualType PT) {
   if (PT->isDependentType())
     return InvalidKernelParam;
 
-  if (PT->isPointerType() || PT->isReferenceType()) {
+  if (PT->isPointerOrReferenceType()) {
     QualType PointeeType = PT->getPointeeType();
     if (PointeeType.getAddressSpace() == LangAS::opencl_generic ||
         PointeeType.getAddressSpace() == LangAS::opencl_private ||
@@ -10882,11 +10887,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   for (const ParmVarDecl *Param : NewFD->parameters()) {
     QualType PT = Param->getType();
 
-    if (const PipeType *PipeTy = PT->getAs<PipeType>()) {
-      QualType ElemTy = PipeTy->getElementType();
-      if (ElemTy->isReferenceType() || ElemTy->isPointerType()) {
-        Diag(Param->getTypeSpecStartLoc(), diag::err_reference_pipe_type );
-        D.setInvalidType();
+    // OpenCL 2.0 pipe restrictions forbids pipe packet types to be non-value
+    // types.
+    if (getLangOpts().getOpenCLCompatibleVersion() >= 200) {
+      if(const PipeType *PipeTy = PT->getAs<PipeType>()) {
+        QualType ElemTy = PipeTy->getElementType();
+        if (ElemTy->isPointerOrReferenceType()) {
+          Diag(Param->getTypeSpecStartLoc(), diag::err_reference_pipe_type);
+          D.setInvalidType();
+        }
       }
     }
     // WebAssembly tables can't be used as function parameters.
@@ -11108,6 +11117,9 @@ static bool AttrCompatibleWithMultiVersion(attr::Kind Kind,
   switch (Kind) {
   default:
     return false;
+  case attr::ArmLocallyStreaming:
+    return MVKind == MultiVersionKind::TargetVersion ||
+           MVKind == MultiVersionKind::TargetClones;
   case attr::Used:
     return MVKind == MultiVersionKind::Target;
   case attr::NonNull:
@@ -11244,7 +11256,21 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
     FunctionType::ExtInfo OldTypeInfo = OldType->getExtInfo();
     FunctionType::ExtInfo NewTypeInfo = NewType->getExtInfo();
 
-    if (OldTypeInfo.getCC() != NewTypeInfo.getCC())
+    const auto *OldFPT = OldFD->getType()->getAs<FunctionProtoType>();
+    const auto *NewFPT = NewFD->getType()->getAs<FunctionProtoType>();
+
+    bool ArmStreamingCCMismatched = false;
+    if (OldFPT && NewFPT) {
+      unsigned Diff =
+          OldFPT->getAArch64SMEAttributes() ^ NewFPT->getAArch64SMEAttributes();
+      // Arm-streaming, arm-streaming-compatible and non-streaming versions
+      // cannot be mixed.
+      if (Diff & (FunctionType::SME_PStateSMEnabledMask |
+                  FunctionType::SME_PStateSMCompatibleMask))
+        ArmStreamingCCMismatched = true;
+    }
+
+    if (OldTypeInfo.getCC() != NewTypeInfo.getCC() || ArmStreamingCCMismatched)
       return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << CallingConv;
 
     QualType OldReturnType = OldType->getReturnType();
@@ -11264,9 +11290,8 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
     if (!CLinkageMayDiffer && OldFD->isExternC() != NewFD->isExternC())
       return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << LanguageLinkage;
 
-    if (CheckEquivalentExceptionSpec(
-            OldFD->getType()->getAs<FunctionProtoType>(), OldFD->getLocation(),
-            NewFD->getType()->getAs<FunctionProtoType>(), NewFD->getLocation()))
+    if (CheckEquivalentExceptionSpec(OldFPT, OldFD->getLocation(), NewFPT,
+                                     NewFD->getLocation()))
       return true;
   }
   return false;
@@ -20287,8 +20312,10 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
     // be emitted, because (say) the definition could include "inline".
     const FunctionDecl *Def = FD->getDefinition();
 
-    return Def && !isDiscardableGVALinkage(
-                      getASTContext().GetGVALinkageForFunction(Def));
+    // We can't compute linkage when we skip function bodies.
+    return Def && !Def->hasSkippedBody() &&
+           !isDiscardableGVALinkage(
+               getASTContext().GetGVALinkageForFunction(Def));
   };
 
   if (LangOpts.OpenMPIsTargetDevice) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 8e120b8c43510..b29346b7b33d4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1527,6 +1527,14 @@ static void handleOwnershipAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     break;
   }
 
+  // Allow only pointers to be return type for functions with ownership_returns
+  // attribute. This matches with current OwnershipAttr::Takes semantics
+  if (K == OwnershipAttr::Returns &&
+      !getFunctionOrMethodResultType(D)->isPointerType()) {
+    S.Diag(AL.getLoc(), diag::err_ownership_takes_return_type) << AL;
+    return;
+  }
+
   IdentifierInfo *Module = AL.getArgAsIdent(0)->Ident;
 
   StringRef ModuleName = Module->getName();
@@ -1583,6 +1591,16 @@ static void handleOwnershipAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
                 << Idx.getSourceIndex() << Ex->getSourceRange();
           return;
         }
+      } else if (K == OwnershipAttr::Takes &&
+                 I->getOwnKind() == OwnershipAttr::Takes) {
+        if (I->getModule()->getName() != ModuleName) {
+          S.Diag(I->getLocation(), diag::err_ownership_takes_class_mismatch)
+              << I->getModule()->getName();
+          S.Diag(AL.getLoc(), diag::note_ownership_takes_class_mismatch)
+              << ModuleName << Ex->getSourceRange();
+
+          return;
+        }
       }
     }
     OwnershipArgs.push_back(Idx);
@@ -3083,9 +3101,6 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
       return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
              << Unsupported << None << CurFeature << TargetVersion;
   }
-  if (IsArmStreamingFunction(cast<FunctionDecl>(D),
-                             /*IncludeLocallyStreaming=*/false))
-    return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
   return false;
 }
 
@@ -3182,10 +3197,6 @@ bool Sema::checkTargetClonesAttrString(
           HasNotDefault = true;
         }
       }
-      if (IsArmStreamingFunction(cast<FunctionDecl>(D),
-                                 /*IncludeLocallyStreaming=*/false))
-        return Diag(LiteralLoc,
-                    diag::err_sme_streaming_cannot_be_multiversioned);
     } else {
       // Other targets ( currently X86 )
       if (Cur.starts_with("arch=")) {
@@ -3458,8 +3469,8 @@ static FormatAttrKind getFormatAttrKind(StringRef Format) {
       // Otherwise, check for supported formats.
       .Cases("scanf", "printf", "printf0", "strfmon", SupportedFormat)
       .Cases("cmn_err", "vcmn_err", "zcmn_err", SupportedFormat)
-      .Case("kprintf", SupportedFormat)         // OpenBSD.
-      .Case("freebsd_kprintf", SupportedFormat) // FreeBSD.
+      .Cases("kprintf", "syslog", SupportedFormat) // OpenBSD.
+      .Case("freebsd_kprintf", SupportedFormat)    // FreeBSD.
       .Case("os_trace", SupportedFormat)
       .Case("os_log", SupportedFormat)
 
@@ -5300,6 +5311,10 @@ static void handleXRayLogArgsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 
 static void handlePatchableFunctionEntryAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
+  if (S.Context.getTargetInfo().getTriple().isOSAIX()) {
+    S.Diag(AL.getLoc(), diag::err_aix_attr_unsupported) << AL;
+    return;
+  }
   uint32_t Count = 0, Offset = 0;
   if (!S.checkUInt32Argument(AL, AL.getArgAsExpr(0), Count, 0, true))
     return;
@@ -5913,181 +5928,6 @@ static void handleZeroCallUsedRegsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(ZeroCallUsedRegsAttr::Create(S.Context, Kind, AL));
 }
 
-static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
-  const auto *RD = FD->getParent();
-  // An unnamed struct is anonymous struct only if it's not instantiated.
-  // However, the struct may not be fully processed yet to determine
-  // whether it's anonymous or not. In that case, this function treats it as
-  // an anonymous struct and tries to find a named parent.
-  while (RD && (RD->isAnonymousStructOrUnion() ||
-                (!RD->isCompleteDefinition() && RD->getName().empty()))) {
-    const auto *Parent = dyn_cast<RecordDecl>(RD->getParent());
-    if (!Parent)
-      break;
-    RD = Parent;
-  }
-  return RD;
-}
-
-static CountAttributedType::DynamicCountPointerKind
-getCountAttrKind(bool CountInBytes, bool OrNull) {
-  if (CountInBytes)
-    return OrNull ? CountAttributedType::SizedByOrNull
-                  : CountAttributedType::SizedBy;
-  return OrNull ? CountAttributedType::CountedByOrNull
-                : CountAttributedType::CountedBy;
-}
-
-enum class CountedByInvalidPointeeTypeKind {
-  INCOMPLETE,
-  SIZELESS,
-  FUNCTION,
-  FLEXIBLE_ARRAY_MEMBER,
-  VALID,
-};
-
-static bool
-CheckCountedByAttrOnField(Sema &S, FieldDecl *FD, Expr *E,
-                          llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls,
-                          bool CountInBytes, bool OrNull) {
-  // Check the context the attribute is used in
-
-  unsigned Kind = getCountAttrKind(CountInBytes, OrNull);
-
-  if (FD->getParent()->isUnion()) {
-    S.Diag(FD->getBeginLoc(), diag::err_count_attr_in_union)
-        << Kind << FD->getSourceRange();
-    return true;
-  }
-
-  const auto FieldTy = FD->getType();
-  if (FieldTy->isArrayType() && (CountInBytes || OrNull)) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_count_attr_not_on_ptr_or_flexible_array_member)
-        << Kind << FD->getLocation() << /* suggest counted_by */ 1;
-    return true;
-  }
-  if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_count_attr_not_on_ptr_or_flexible_array_member)
-        << Kind << FD->getLocation() << /* do not suggest counted_by */ 0;
-    return true;
-  }
-
-  LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
-      LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-  if (FieldTy->isArrayType() &&
-      !Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FieldTy,
-                                       StrictFlexArraysLevel, true)) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_counted_by_attr_on_array_not_flexible_array_member)
-        << Kind << FD->getLocation();
-    return true;
-  }
-
-  CountedByInvalidPointeeTypeKind InvalidTypeKind =
-      CountedByInvalidPointeeTypeKind::VALID;
-  QualType PointeeTy;
-  int SelectPtrOrArr = 0;
-  if (FieldTy->isPointerType()) {
-    PointeeTy = FieldTy->getPointeeType();
-    SelectPtrOrArr = 0;
-  } else {
-    assert(FieldTy->isArrayType());
-    const ArrayType *AT = S.getASTContext().getAsArrayType(FieldTy);
-    PointeeTy = AT->getElementType();
-    SelectPtrOrArr = 1;
-  }
-  // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
-  // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
-  // when `FieldTy->isArrayType()`.
-  bool ShouldWarn = false;
-  if (PointeeTy->isIncompleteType() && !CountInBytes) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
-  } else if (PointeeTy->isSizelessType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
-  } else if (PointeeTy->isFunctionType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
-  } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
-    if (FieldTy->isArrayType()) {
-      // This is a workaround for the Linux kernel that has already adopted
-      // `counted_by` on a FAM where the pointee is a struct with a FAM. This
-      // should be an error because computing the bounds of the array cannot be
-      // done correctly without manually traversing every struct object in the
-      // array at runtime. To allow the code to be built this error is
-      // downgraded to a warning.
-      ShouldWarn = true;
-    }
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
-  }
-
-  if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
-    unsigned DiagID = ShouldWarn
-                          ? diag::warn_counted_by_attr_elt_type_unknown_size
-                          : diag::err_counted_by_attr_pointee_unknown_size;
-    S.Diag(FD->getBeginLoc(), DiagID)
-        << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
-        << (ShouldWarn ? 1 : 0) << Kind << FD->getSourceRange();
-    return true;
-  }
-
-  // Check the expression
-
-  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
-    S.Diag(E->getBeginLoc(), diag::err_count_attr_argument_not_integer)
-        << Kind << E->getSourceRange();
-    return true;
-  }
-
-  auto *DRE = dyn_cast<DeclRefExpr>(E);
-  if (!DRE) {
-    S.Diag(E->getBeginLoc(),
-           diag::err_count_attr_only_support_simple_decl_reference)
-        << Kind << E->getSourceRange();
-    return true;
-  }
-
-  auto *CountDecl = DRE->getDecl();
-  FieldDecl *CountFD = dyn_cast<FieldDecl>(CountDecl);
-  if (auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl)) {
-    CountFD = IFD->getAnonField();
-  }
-  if (!CountFD) {
-    S.Diag(E->getBeginLoc(), diag::err_count_attr_must_be_in_structure)
-        << CountDecl << Kind << E->getSourceRange();
-
-    S.Diag(CountDecl->getBeginLoc(),
-           diag::note_flexible_array_counted_by_attr_field)
-        << CountDecl << CountDecl->getSourceRange();
-    return true;
-  }
-
-  if (FD->getParent() != CountFD->getParent()) {
-    if (CountFD->getParent()->isUnion()) {
-      S.Diag(CountFD->getBeginLoc(), diag::err_count_attr_refer_to_union)
-          << Kind << CountFD->getSourceRange();
-      return true;
-    }
-    // Whether CountRD is an anonymous struct is not determined at this
-    // point. Thus, an additional diagnostic in case it's not anonymous struct
-    // is done later in `Parser::ParseStructDeclaration`.
-    auto *RD = GetEnclosingNamedOrTopAnonRecord(FD);
-    auto *CountRD = GetEnclosingNamedOrTopAnonRecord(CountFD);
-
-    if (RD != CountRD) {
-      S.Diag(E->getBeginLoc(), diag::err_count_attr_param_not_in_same_struct)
-          << CountFD << Kind << FieldTy->isArrayType() << E->getSourceRange();
-      S.Diag(CountFD->getBeginLoc(),
-             diag::note_flexible_array_counted_by_attr_field)
-          << CountFD << CountFD->getSourceRange();
-      return true;
-    }
-  }
-
-  Decls.push_back(TypeCoupledDeclRefInfo(CountFD, /*IsDref*/ false));
-  return false;
-}
-
 static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
   auto *FD = dyn_cast<FieldDecl>(D);
   assert(FD);
@@ -6120,7 +5960,7 @@ static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
   }
 
   llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
-  if (CheckCountedByAttrOnField(S, FD, CountExpr, Decls, CountInBytes, OrNull))
+  if (S.CheckCountedByAttrOnField(FD, CountExpr, Decls, CountInBytes, OrNull))
     return;
 
   QualType CAT = S.BuildCountAttributedArrayOrPointerType(
@@ -6213,7 +6053,7 @@ static void handleMSAllocatorAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   // Warn if the return type is not a pointer or reference type.
   if (auto *FD = dyn_cast<FunctionDecl>(D)) {
     QualType RetTy = FD->getReturnType();
-    if (!RetTy->isPointerType() && !RetTy->isReferenceType()) {
+    if (!RetTy->isPointerOrReferenceType()) {
       S.Diag(AL.getLoc(), diag::warn_declspec_allocator_nonpointer)
           << AL.getRange() << RetTy;
       return;
@@ -6503,9 +6343,16 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     return;
 
   // Ignore C++11 attributes on declarator chunks: they appertain to the type
-  // instead.
-  if (AL.isCXX11Attribute() && !Options.IncludeCXX11Attributes &&
-      (!IsDeclLambdaCallOperator(D) || !AL.supportsNonconformingLambdaSyntax()))
+  // instead. Note, isCXX11Attribute() will look at whether the attribute is
+  // [[]] or alignas, while isC23Attribute() will only look at [[]]. This is
+  // important for ensuring that alignas in C23 is properly handled on a
+  // structure member declaration because it is a type-specifier-qualifier in
+  // C but still applies to the declaration rather than the type.
+  if ((S.getLangOpts().CPlusPlus
+           ? AL.isCXX11Attribute() && (!IsDeclLambdaCallOperator(D) ||
+                                       !AL.supportsNonconformingLambdaSyntax())
+           : AL.isC23Attribute()) &&
+      !Options.IncludeCXX11Attributes)
     return;
 
   // Unknown attributes are automatically warned on. Target-specific attributes
@@ -7172,6 +7019,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_MSConstexpr:
     handleMSConstexprAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_HybridPatchable:
+    handleSimpleAttribute<HybridPatchableAttr>(S, D, AL);
+    break;
 
   // HLSL attributes:
   case ParsedAttr::AT_HLSLNumThreads:
@@ -7716,29 +7566,37 @@ void Sema::ProcessDeclAttributes(Scope *S, Decl *D, const Declarator &PD) {
   // Ordering of attributes can be important, so we take care to process
   // attributes in the order in which they appeared in the source code.
 
+  auto ProcessAttributesWithSliding =
+      [&](const ParsedAttributesView &Src,
+          const ProcessDeclAttributeOptions &Options) {
+        ParsedAttributesView NonSlidingAttrs;
+        for (ParsedAttr &AL : Src) {
+          // FIXME: this sliding is specific to standard attributes and should
+          // eventually be deprecated and removed as those are not intended to
+          // slide to anything.
+          if ((AL.isStandardAttributeSyntax() || AL.isAlignas()) &&
+              AL.slidesFromDeclToDeclSpecLegacyBehavior()) {
+            // Skip processing the attribute, but do check if it appertains to
+            // the declaration. This is needed for the `MatrixType` attribute,
+            // which, despite being a type attribute, defines a `SubjectList`
+            // that only allows it to be used on typedef declarations.
+            AL.diagnoseAppertainsTo(*this, D);
+          } else {
+            NonSlidingAttrs.addAtEnd(&AL);
+          }
+        }
+        ProcessDeclAttributeList(S, D, NonSlidingAttrs, Options);
+      };
+
   // First, process attributes that appeared on the declaration itself (but
   // only if they don't have the legacy behavior of "sliding" to the DeclSepc).
-  ParsedAttributesView NonSlidingAttrs;
-  for (ParsedAttr &AL : PD.getDeclarationAttributes()) {
-    if (AL.slidesFromDeclToDeclSpecLegacyBehavior()) {
-      // Skip processing the attribute, but do check if it appertains to the
-      // declaration. This is needed for the `MatrixType` attribute, which,
-      // despite being a type attribute, defines a `SubjectList` that only
-      // allows it to be used on typedef declarations.
-      AL.diagnoseAppertainsTo(*this, D);
-    } else {
-      NonSlidingAttrs.addAtEnd(&AL);
-    }
-  }
-  ProcessDeclAttributeList(S, D, NonSlidingAttrs);
+  ProcessAttributesWithSliding(PD.getDeclarationAttributes(), {});
 
   // Apply decl attributes from the DeclSpec if present.
-  if (!PD.getDeclSpec().getAttributes().empty()) {
-    ProcessDeclAttributeList(S, D, PD.getDeclSpec().getAttributes(),
-                             ProcessDeclAttributeOptions()
-                                 .WithIncludeCXX11Attributes(false)
-                                 .WithIgnoreTypeAttributes(true));
-  }
+  ProcessAttributesWithSliding(PD.getDeclSpec().getAttributes(),
+                               ProcessDeclAttributeOptions()
+                                   .WithIncludeCXX11Attributes(false)
+                                   .WithIgnoreTypeAttributes(true));
 
   // Walk the declarator structure, applying decl attributes that were in a type
   // position to the decl itself.  This handles cases like:
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 072577f7c742d..626fcb997fc1b 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -10413,7 +10413,7 @@ void Sema::checkIncorrectVTablePointerAuthenticationAttribute(
   while (1) {
     assert(PrimaryBase);
     const CXXRecordDecl *Base = nullptr;
-    for (auto BasePtr : PrimaryBase->bases()) {
+    for (const CXXBaseSpecifier &BasePtr : PrimaryBase->bases()) {
       if (!BasePtr.getType()->getAsCXXRecordDecl()->isDynamicClass())
         continue;
       Base = BasePtr.getType()->getAsCXXRecordDecl();
@@ -12276,16 +12276,15 @@ Decl *Sema::ActOnUsingEnumDeclaration(Scope *S, AccessSpecifier AS,
                                       SourceLocation EnumLoc, SourceRange TyLoc,
                                       const IdentifierInfo &II, ParsedType Ty,
                                       CXXScopeSpec *SS) {
-  assert(!SS->isInvalid() && "ScopeSpec is invalid");
+  assert(SS && !SS->isInvalid() && "ScopeSpec is invalid");
   TypeSourceInfo *TSI = nullptr;
   SourceLocation IdentLoc = TyLoc.getBegin();
   QualType EnumTy = GetTypeFromParser(Ty, &TSI);
   if (EnumTy.isNull()) {
-    Diag(IdentLoc, SS && isDependentScopeSpecifier(*SS)
+    Diag(IdentLoc, isDependentScopeSpecifier(*SS)
                        ? diag::err_using_enum_is_dependent
                        : diag::err_unknown_typename)
-        << II.getName()
-        << SourceRange(SS ? SS->getBeginLoc() : IdentLoc, TyLoc.getEnd());
+        << II.getName() << SourceRange(SS->getBeginLoc(), TyLoc.getEnd());
     return nullptr;
   }
 
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 3727261f76d2b..288b95e8c9343 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1470,6 +1470,8 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Stmt::OMPSimdDirectiveClass:
   case Stmt::OMPTileDirectiveClass:
   case Stmt::OMPUnrollDirectiveClass:
+  case Stmt::OMPReverseDirectiveClass:
+  case Stmt::OMPInterchangeDirectiveClass:
   case Stmt::OMPSingleDirectiveClass:
   case Stmt::OMPTargetDataDirectiveClass:
   case Stmt::OMPTargetDirectiveClass:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index f74bdbf728281..d287c43bec22d 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -4185,6 +4185,21 @@ static bool CheckVectorElementsTraitOperandType(Sema &S, QualType T,
   return false;
 }
 
+static bool checkPtrAuthTypeDiscriminatorOperandType(Sema &S, QualType T,
+                                                     SourceLocation Loc,
+                                                     SourceRange ArgRange) {
+  if (S.checkPointerAuthEnabled(Loc, ArgRange))
+    return true;
+
+  if (!T->isFunctionType() && !T->isFunctionPointerType() &&
+      !T->isFunctionReferenceType() && !T->isMemberFunctionPointerType()) {
+    S.Diag(Loc, diag::err_ptrauth_type_disc_undiscriminated) << T << ArgRange;
+    return true;
+  }
+
+  return false;
+}
+
 static bool CheckExtensionTraitOperandType(Sema &S, QualType T,
                                            SourceLocation Loc,
                                            SourceRange ArgRange,
@@ -4579,6 +4594,10 @@ bool Sema::CheckUnaryExprOrTypeTraitOperand(QualType ExprType,
     return CheckVectorElementsTraitOperandType(*this, ExprType, OpLoc,
                                                ExprRange);
 
+  if (ExprKind == UETT_PtrAuthTypeDiscriminator)
+    return checkPtrAuthTypeDiscriminatorOperandType(*this, ExprType, OpLoc,
+                                                    ExprRange);
+
   // Explicitly list some types as extensions.
   if (!CheckExtensionTraitOperandType(*this, ExprType, OpLoc, ExprRange,
                                       ExprKind))
@@ -5779,7 +5798,6 @@ static bool isParenthetizedAndQualifiedAddressOfExpr(Expr *Fn) {
   if (!UO || UO->getOpcode() != clang::UO_AddrOf)
     return false;
   if (auto *DRE = dyn_cast<DeclRefExpr>(UO->getSubExpr()->IgnoreParens())) {
-    assert(isa<FunctionDecl>(DRE->getDecl()) && "expected a function");
     return DRE->hasQualifier();
   }
   if (auto *OVL = dyn_cast<OverloadExpr>(UO->getSubExpr()->IgnoreParens()))
@@ -10982,6 +11000,14 @@ QualType Sema::CheckAdditionOperands(ExprResult &LHS, ExprResult &RHS,
   if (isObjCPointer && checkArithmeticOnObjCPointer(*this, Loc, PExp))
     return QualType();
 
+  // Arithmetic on label addresses is normally allowed, except when we add
+  // a ptrauth signature to the addresses.
+  if (isa<AddrLabelExpr>(PExp) && getLangOpts().PointerAuthIndirectGotos) {
+    Diag(Loc, diag::err_ptrauth_indirect_goto_addrlabel_arithmetic)
+        << /*addition*/ 1;
+    return QualType();
+  }
+
   // Check array bounds for pointer arithemtic
   CheckArrayAccess(PExp, IExp);
 
@@ -11056,6 +11082,15 @@ QualType Sema::CheckSubtractionOperands(ExprResult &LHS, ExprResult &RHS,
         checkArithmeticOnObjCPointer(*this, Loc, LHS.get()))
       return QualType();
 
+    // Arithmetic on label addresses is normally allowed, except when we add
+    // a ptrauth signature to the addresses.
+    if (isa<AddrLabelExpr>(LHS.get()) &&
+        getLangOpts().PointerAuthIndirectGotos) {
+      Diag(Loc, diag::err_ptrauth_indirect_goto_addrlabel_arithmetic)
+          << /*subtraction*/ 0;
+      return QualType();
+    }
+
     // The result type of a pointer-int computation is the pointer type.
     if (RHS.get()->getType()->isIntegerType()) {
       // Subtracting from a null pointer should produce a warning.
@@ -14190,7 +14225,14 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
       // Okay: we can take the address of a field.
       // Could be a pointer to member, though, if there is an explicit
       // scope qualifier for the class.
-      if (isa<DeclRefExpr>(op) && cast<DeclRefExpr>(op)->getQualifier()) {
+
+      // [C++26] [expr.prim.id.general]
+      // If an id-expression E denotes a non-static non-type member
+      // of some class C [...] and if E is a qualified-id, E is
+      // not the un-parenthesized operand of the unary & operator [...]
+      // the id-expression is transformed into a class member access expression.
+      if (isa<DeclRefExpr>(op) && cast<DeclRefExpr>(op)->getQualifier() &&
+          !isa<ParenExpr>(OrigOp.get())) {
         DeclContext *Ctx = dcl->getDeclContext();
         if (Ctx && Ctx->isRecord()) {
           if (dcl->getType()->isReferenceType()) {
@@ -14200,22 +14242,6 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
             return QualType();
           }
 
-          // C++11 [expr.unary.op] p4:
-          // A pointer to member is only formed when an explicit & is used and
-          // its operand is a qualified-id not enclosed in parentheses.
-          if (isa<ParenExpr>(OrigOp.get())) {
-            SourceLocation LeftParenLoc = OrigOp.get()->getBeginLoc(),
-                           RightParenLoc = OrigOp.get()->getEndLoc();
-
-            Diag(LeftParenLoc,
-                 diag::err_form_ptr_to_member_from_parenthesized_expr)
-                << SourceRange(OpLoc, RightParenLoc)
-                << FixItHint::CreateRemoval(LeftParenLoc)
-                << FixItHint::CreateRemoval(RightParenLoc);
-
-            // Continuing might lead to better error recovery.
-          }
-
           while (cast<RecordDecl>(Ctx)->isAnonymousStructOrUnion())
             Ctx = Ctx->getParent();
 
@@ -17092,7 +17118,8 @@ Sema::VerifyIntegerConstantExpression(Expr *E, llvm::APSInt *Result,
   // not a constant expression as a side-effect.
   bool Folded =
       E->EvaluateAsRValue(EvalResult, Context, /*isConstantContext*/ true) &&
-      EvalResult.Val.isInt() && !EvalResult.HasSideEffects;
+      EvalResult.Val.isInt() && !EvalResult.HasSideEffects &&
+      (!getLangOpts().CPlusPlus || !EvalResult.HasUndefinedBehavior);
 
   if (!isa<ConstantExpr>(E))
     E = ConstantExpr::Create(Context, E, EvalResult.Val);
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 2b21586e8a263..722f59f00b83b 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -711,7 +711,7 @@ getUuidAttrOfType(Sema &SemaRef, QualType QT,
                   llvm::SmallSetVector<const UuidAttr *, 1> &UuidAttrs) {
   // Optionally remove one level of pointer, reference or array indirection.
   const Type *Ty = QT.getTypePtr();
-  if (QT->isPointerType() || QT->isReferenceType())
+  if (QT->isPointerOrReferenceType())
     Ty = QT->getPointeeType().getTypePtr();
   else if (QT->isArrayType())
     Ty = Ty->getBaseElementTypeUnsafe();
@@ -3815,6 +3815,9 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal,
                                                      Overaligned, DeleteName);
     }
 
+    if (OperatorDelete->isInvalidDecl())
+      return ExprError();
+
     MarkFunctionReferenced(StartLoc, OperatorDelete);
 
     // Check access and ambiguity of destructor if we're going to call it.
@@ -6036,6 +6039,32 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI
     return cast<CXXRecordDecl>(rhsRecord->getDecl())
       ->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getDecl()));
   }
+  case BTT_IsVirtualBaseOf: {
+    const RecordType *BaseRecord = LhsT->getAs<RecordType>();
+    const RecordType *DerivedRecord = RhsT->getAs<RecordType>();
+
+    if (!BaseRecord || !DerivedRecord) {
+      DiagnoseVLAInCXXTypeTrait(Self, Lhs,
+                                tok::kw___builtin_is_virtual_base_of);
+      DiagnoseVLAInCXXTypeTrait(Self, Rhs,
+                                tok::kw___builtin_is_virtual_base_of);
+      return false;
+    }
+
+    if (BaseRecord->isUnionType() || DerivedRecord->isUnionType())
+      return false;
+
+    if (!BaseRecord->isStructureOrClassType() ||
+        !DerivedRecord->isStructureOrClassType())
+      return false;
+
+    if (Self.RequireCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT,
+                                 diag::err_incomplete_type))
+      return false;
+
+    return cast<CXXRecordDecl>(DerivedRecord->getDecl())
+        ->isVirtuallyDerivedFrom(cast<CXXRecordDecl>(BaseRecord->getDecl()));
+  }
   case BTT_IsSame:
     return Self.Context.hasSameType(LhsT, RhsT);
   case BTT_TypeCompatible: {
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 7ccecf055feed..2751c7cec2842 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -3206,9 +3206,10 @@ ExprResult SemaObjC::BuildInstanceMessage(
     }
     if (!isDesignatedInitChain) {
       const ObjCMethodDecl *InitMethod = nullptr;
+      auto *CurMD = SemaRef.getCurMethodDecl();
+      assert(CurMD && "Current method declaration should not be null");
       bool isDesignated =
-          SemaRef.getCurMethodDecl()->isDesignatedInitializerForTheInterface(
-              &InitMethod);
+          CurMD->isDesignatedInitializerForTheInterface(&InitMethod);
       assert(isDesignated && InitMethod);
       (void)isDesignated;
       Diag(SelLoc, SuperLoc.isValid() ?
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9940bc5b4a606..7724fafb253c5 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -650,7 +650,10 @@ class DiagnoseHLSLAvailability
   bool HasMatchingEnvironmentOrNone(const AvailabilityAttr *AA);
 
 public:
-  DiagnoseHLSLAvailability(Sema &SemaRef) : SemaRef(SemaRef) {}
+  DiagnoseHLSLAvailability(Sema &SemaRef)
+      : SemaRef(SemaRef),
+        CurrentShaderEnvironment(llvm::Triple::UnknownEnvironment),
+        CurrentShaderStageBit(0), ReportOnlyShaderStageIssues(false) {}
 
   // AST traversal methods
   void RunOnTranslationUnit(const TranslationUnitDecl *TU);
@@ -1014,8 +1017,8 @@ void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
 // returning an ExprError
 bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   switch (BuiltinID) {
-  case Builtin::BI__builtin_hlsl_elementwise_all:
-  case Builtin::BI__builtin_hlsl_elementwise_any: {
+  case Builtin::BI__builtin_hlsl_all:
+  case Builtin::BI__builtin_hlsl_any: {
     if (SemaRef.checkArgCount(TheCall, 1))
       return true;
     break;
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index d7a7f1f123185..be417489506e1 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -1986,6 +1986,9 @@ static bool checkDestructorReference(QualType ElementType, SourceLocation Loc,
     return false;
 
   CXXDestructorDecl *Destructor = SemaRef.LookupDestructor(CXXRD);
+  if (!Destructor)
+    return false;
+
   SemaRef.CheckDestructorAccess(Loc, Destructor,
                                 SemaRef.PDiag(diag::err_access_dtor_temp)
                                 << ElementType);
@@ -2016,7 +2019,7 @@ canInitializeArrayWithEmbedDataString(ArrayRef<Expr *> ExprList,
   if (InitType->isArrayType()) {
     const ArrayType *InitArrayType = InitType->getAsArrayTypeUnsafe();
     QualType InitElementTy = InitArrayType->getElementType();
-    QualType EmbedExprElementTy = EE->getType();
+    QualType EmbedExprElementTy = EE->getDataStringLiteral()->getType();
     const bool TypesMatch =
         Context.typesAreCompatible(InitElementTy, EmbedExprElementTy) ||
         (InitElementTy->isCharType() && EmbedExprElementTy->isCharType());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index b48640c48ed45..cabf22f97c3e6 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/PointerEmbeddedInt.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Frontend/OpenMP/OMPAssume.h"
@@ -167,10 +168,6 @@ class DSAStackTy {
     SourceLocation DefaultAttrLoc;
     DefaultmapInfo DefaultmapMap[OMPC_DEFAULTMAP_unknown + 1];
     OpenMPDirectiveKind Directive = OMPD_unknown;
-    /// GenericLoopDirective with bind clause is mapped to other directives,
-    /// like for, distribute and simd. Presently, set MappedDirective to
-    /// OMPLoop. This may also be used in a similar way for other constructs.
-    OpenMPDirectiveKind MappedDirective = OMPD_unknown;
     DeclarationNameInfo DirectiveName;
     Scope *CurScope = nullptr;
     DeclContext *Context = nullptr;
@@ -644,24 +641,6 @@ class DSAStackTy {
     const SharingMapTy *Top = getTopOfStackOrNull();
     return Top ? Top->Directive : OMPD_unknown;
   }
-  OpenMPDirectiveKind getMappedDirective() const {
-    const SharingMapTy *Top = getTopOfStackOrNull();
-    return Top ? Top->MappedDirective : OMPD_unknown;
-  }
-  void setCurrentDirective(OpenMPDirectiveKind NewDK) {
-    SharingMapTy *Top = getTopOfStackOrNull();
-    assert(Top &&
-           "Before calling setCurrentDirective Top of Stack not to be NULL.");
-    // Store the old into MappedDirective & assign argument NewDK to Directive.
-    Top->Directive = NewDK;
-  }
-  void setMappedDirective(OpenMPDirectiveKind NewDK) {
-    SharingMapTy *Top = getTopOfStackOrNull();
-    assert(Top &&
-           "Before calling setMappedDirective Top of Stack not to be NULL.");
-    // Store the old into MappedDirective & assign argument NewDK to Directive.
-    Top->MappedDirective = NewDK;
-  }
   /// Returns directive kind at specified level.
   OpenMPDirectiveKind getDirective(unsigned Level) const {
     assert(!isStackEmpty() && "No directive at specified level.");
@@ -822,7 +801,8 @@ class DSAStackTy {
       return (M == OMPC_DEFAULTMAP_MODIFIER_alloc) ||
              (M == OMPC_DEFAULTMAP_MODIFIER_to) ||
              (M == OMPC_DEFAULTMAP_MODIFIER_from) ||
-             (M == OMPC_DEFAULTMAP_MODIFIER_tofrom);
+             (M == OMPC_DEFAULTMAP_MODIFIER_tofrom) ||
+             (M == OMPC_DEFAULTMAP_MODIFIER_present);
     }
     return true;
   }
@@ -2853,7 +2833,7 @@ static void checkReductionClauses(Sema &S, DSAStackTy *Stack,
         S.Diag(InscanLoc, diag::note_omp_previous_inscan_reduction);
         continue;
       }
-      for (Expr *Ref : RC->varlists()) {
+      for (Expr *Ref : RC->varlist()) {
         assert(Ref && "NULL expr in OpenMP nontemporal clause.");
         SourceLocation ELoc;
         SourceRange ERange;
@@ -2893,7 +2873,7 @@ void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) {
     for (OMPClause *C : D->clauses()) {
       if (auto *Clause = dyn_cast<OMPLastprivateClause>(C)) {
         SmallVector<Expr *, 8> PrivateCopies;
-        for (Expr *DE : Clause->varlists()) {
+        for (Expr *DE : Clause->varlist()) {
           if (DE->isValueDependent() || DE->isTypeDependent()) {
             PrivateCopies.push_back(nullptr);
             continue;
@@ -2931,7 +2911,7 @@ void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) {
       // Finalize nontemporal clause by handling private copies, if any.
       if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
         SmallVector<Expr *, 8> PrivateRefs;
-        for (Expr *RefExpr : Clause->varlists()) {
+        for (Expr *RefExpr : Clause->varlist()) {
           assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
           SourceLocation ELoc;
           SourceRange ERange;
@@ -3101,11 +3081,11 @@ ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope,
     if (TypoCorrection Corrected =
             SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr,
                                 CCC, Sema::CTK_ErrorRecovery)) {
-      SemaRef.diagnoseTypo(Corrected,
-                           PDiag(Lookup.empty()
-                                     ? diag::err_undeclared_var_use_suggest
-                                     : diag::err_omp_expected_var_arg_suggest)
-                               << Id.getName());
+      SemaRef.diagnoseTypo(
+          Corrected,
+          SemaRef.PDiag(Lookup.empty() ? diag::err_undeclared_var_use_suggest
+                                       : diag::err_omp_expected_var_arg_suggest)
+              << Id.getName());
       VD = Corrected.getCorrectionDeclAs<VarDecl>();
     } else {
       Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use
@@ -3774,7 +3754,7 @@ class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
         !isOpenMPTaskLoopDirective(S->getDirectiveKind())) {
       for (OMPClause *C : S->clauses())
         if (auto *FC = dyn_cast<OMPFirstprivateClause>(C)) {
-          for (Expr *Ref : FC->varlists())
+          for (Expr *Ref : FC->varlist())
             Visit(Ref);
         }
     }
@@ -4407,6 +4387,8 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind,
   case OMPD_section:
   case OMPD_tile:
   case OMPD_unroll:
+  case OMPD_reverse:
+  case OMPD_interchange:
     break;
   default:
     processCapturedRegions(SemaRef, DKind, CurScope,
@@ -4749,7 +4731,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
               SemaRef.MarkDeclarationsReferencedInExpr(E);
         }
         if (auto *AC = dyn_cast<OMPAlignedClause>(C)) {
-          for (Expr *E : AC->varlists())
+          for (Expr *E : AC->varlist())
             SemaRef.MarkDeclarationsReferencedInExpr(E);
         }
       }
@@ -5331,7 +5313,7 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
              diag::warn_omp_allocate_thread_on_task_target_directive)
           << getOpenMPDirectiveName(Stack->getCurrentDirective());
     }
-    for (Expr *E : AC->varlists()) {
+    for (Expr *E : AC->varlist()) {
       SourceLocation ELoc;
       SourceRange ERange;
       Expr *SimpleRefExpr = E;
@@ -5980,127 +5962,63 @@ static bool teamsLoopCanBeParallelFor(Stmt *AStmt, Sema &SemaRef) {
   return Checker.teamsLoopCanBeParallelFor();
 }
 
-bool SemaOpenMP::mapLoopConstruct(
-    llvm::SmallVector<OMPClause *> &ClausesWithoutBind,
-    ArrayRef<OMPClause *> Clauses, OpenMPBindClauseKind &BindKind,
-    OpenMPDirectiveKind &Kind, OpenMPDirectiveKind &PrevMappedDirective,
-    SourceLocation StartLoc, SourceLocation EndLoc,
-    const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion) {
-
-  bool UseClausesWithoutBind = false;
-
-  // Restricting to "#pragma omp loop bind"
-  if (getLangOpts().OpenMP >= 50 && Kind == OMPD_loop) {
-
-    const OpenMPDirectiveKind ParentDirective = DSAStack->getParentDirective();
-
-    if (BindKind == OMPC_BIND_unknown) {
-      // Setting the enclosing teams or parallel construct for the loop
-      // directive without bind clause.
-      // [5.0:129:25-28] If the bind clause is not present on the construct and
-      // the loop construct is closely nested inside a teams or parallel
-      // construct, the binding region is the corresponding teams or parallel
-      // region. If none of those conditions hold, the binding region is not
-      // defined.
-      BindKind = OMPC_BIND_thread; // Default bind(thread) if binding is unknown
-      ArrayRef<OpenMPDirectiveKind> ParentLeafs =
-          getLeafConstructsOrSelf(ParentDirective);
-
-      if (ParentDirective == OMPD_unknown) {
-        Diag(DSAStack->getDefaultDSALocation(),
-             diag::err_omp_bind_required_on_loop);
-      } else if (ParentLeafs.back() == OMPD_parallel) {
-        BindKind = OMPC_BIND_parallel;
-      } else if (ParentLeafs.back() == OMPD_teams) {
-        BindKind = OMPC_BIND_teams;
-      }
-    } else {
-      // bind clause is present in loop directive. When the loop directive is
-      // changed to a new directive the bind clause is not used. So, we should
-      // set flag indicating to only use the clauses that aren't the
-      // bind clause.
-      UseClausesWithoutBind = true;
-    }
-
-    for (OMPClause *C : Clauses) {
-      // Spec restriction : bind(teams) and reduction not permitted.
-      if (BindKind == OMPC_BIND_teams &&
-          C->getClauseKind() == llvm::omp::Clause::OMPC_reduction)
-        Diag(DSAStack->getDefaultDSALocation(),
-             diag::err_omp_loop_reduction_clause);
-
-      // A new Vector ClausesWithoutBind, which does not contain the bind
-      // clause, for passing to new directive.
-      if (C->getClauseKind() != llvm::omp::Clause::OMPC_bind)
-        ClausesWithoutBind.push_back(C);
-    }
-
-    switch (BindKind) {
-    case OMPC_BIND_parallel:
-      Kind = OMPD_for;
-      DSAStack->setCurrentDirective(OMPD_for);
-      DSAStack->setMappedDirective(OMPD_loop);
-      PrevMappedDirective = OMPD_loop;
-      break;
-    case OMPC_BIND_teams:
-      Kind = OMPD_distribute;
-      DSAStack->setCurrentDirective(OMPD_distribute);
-      DSAStack->setMappedDirective(OMPD_loop);
-      PrevMappedDirective = OMPD_loop;
-      break;
-    case OMPC_BIND_thread:
-      Kind = OMPD_simd;
-      DSAStack->setCurrentDirective(OMPD_simd);
-      DSAStack->setMappedDirective(OMPD_loop);
-      PrevMappedDirective = OMPD_loop;
-      break;
-    case OMPC_BIND_unknown:
-      break;
-    }
-  } else if (PrevMappedDirective == OMPD_loop) {
-    /// An initial pass after recognizing all the statements is done in the
-    /// Parser when the directive OMPD_loop is mapped to OMPD_for,
-    /// OMPD_distribute or OMPD_simd. A second transform pass with call from
-    /// clang::TreeTransform::TransformOMPExecutableDirective() is done
-    /// with the Directive as one of the above mapped directive without
-    /// the bind clause. Then "PrevMappedDirective" stored in the
-    /// OMPExecutableDirective is accessed and hence this else statement.
-
-    DSAStack->setMappedDirective(OMPD_loop);
-  }
-
-  return UseClausesWithoutBind;
-}
-
 StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
     OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
     OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
-    Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc,
-    OpenMPDirectiveKind PrevMappedDirective) {
+    Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) {
   assert(isOpenMPExecutableDirective(Kind) && "Unexpected directive category");
 
   StmtResult Res = StmtError();
   OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
-  llvm::SmallVector<OMPClause *> ClausesWithoutBind;
-  bool UseClausesWithoutBind = false;
+  llvm::SmallVector<OMPClause *, 8> ClausesWithImplicit;
 
   if (const OMPBindClause *BC =
           OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
     BindKind = BC->getBindKind();
 
-  // Variable used to note down the DirectiveKind because mapLoopConstruct may
-  // change "Kind" variable, due to mapping of "omp loop" to other directives.
-  OpenMPDirectiveKind DK = Kind;
-  if (Kind == OMPD_loop || PrevMappedDirective == OMPD_loop) {
-    UseClausesWithoutBind = mapLoopConstruct(
-        ClausesWithoutBind, Clauses, BindKind, Kind, PrevMappedDirective,
-        StartLoc, EndLoc, DirName, CancelRegion);
-    DK = OMPD_loop;
+  if (Kind == OMPD_loop && BindKind == OMPC_BIND_unknown) {
+    const OpenMPDirectiveKind ParentDirective = DSAStack->getParentDirective();
+
+    // Setting the enclosing teams or parallel construct for the loop
+    // directive without bind clause.
+    // [5.0:129:25-28] If the bind clause is not present on the construct and
+    // the loop construct is closely nested inside a teams or parallel
+    // construct, the binding region is the corresponding teams or parallel
+    // region. If none of those conditions hold, the binding region is not
+    // defined.
+    BindKind = OMPC_BIND_thread; // Default bind(thread) if binding is unknown
+    ArrayRef<OpenMPDirectiveKind> ParentLeafs =
+        getLeafConstructsOrSelf(ParentDirective);
+
+    if (ParentDirective == OMPD_unknown) {
+      Diag(DSAStack->getDefaultDSALocation(),
+           diag::err_omp_bind_required_on_loop);
+    } else if (ParentLeafs.back() == OMPD_parallel) {
+      BindKind = OMPC_BIND_parallel;
+    } else if (ParentLeafs.back() == OMPD_teams) {
+      BindKind = OMPC_BIND_teams;
+    }
+
+    assert(BindKind != OMPC_BIND_unknown && "Expecting BindKind");
+
+    OMPClause *C =
+        ActOnOpenMPBindClause(BindKind, SourceLocation(), SourceLocation(),
+                              SourceLocation(), SourceLocation());
+    ClausesWithImplicit.push_back(C);
+  }
+
+  // Diagnose "loop bind(teams)" with "reduction".
+  if (Kind == OMPD_loop && BindKind == OMPC_BIND_teams) {
+    for (OMPClause *C : Clauses) {
+      if (C->getClauseKind() == OMPC_reduction)
+        Diag(DSAStack->getDefaultDSALocation(),
+             diag::err_omp_loop_reduction_clause);
+    }
   }
 
   // First check CancelRegion which is then used in checkNestingOfRegions.
   if (checkCancelRegion(SemaRef, Kind, CancelRegion, StartLoc) ||
-      checkNestingOfRegions(SemaRef, DSAStack, DK, DirName, CancelRegion,
+      checkNestingOfRegions(SemaRef, DSAStack, Kind, DirName, CancelRegion,
                             BindKind, StartLoc)) {
     return StmtError();
   }
@@ -6110,15 +6028,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
                             isOpenMPTargetDataManagementDirective(Kind)))
     Diag(StartLoc, diag::warn_hip_omp_target_directives);
 
-  llvm::SmallVector<OMPClause *, 8> ClausesWithImplicit;
   VarsWithInheritedDSAType VarsWithInheritedDSA;
   bool ErrorFound = false;
-  if (getLangOpts().OpenMP >= 50 && UseClausesWithoutBind) {
-    ClausesWithImplicit.append(ClausesWithoutBind.begin(),
-                               ClausesWithoutBind.end());
-  } else {
-    ClausesWithImplicit.append(Clauses.begin(), Clauses.end());
-  }
+  ClausesWithImplicit.append(Clauses.begin(), Clauses.end());
+
   if (AStmt && !SemaRef.CurContext->isDependentContext() &&
       isOpenMPCapturingDirective(Kind)) {
     assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
@@ -6167,14 +6080,14 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
               DMC->getDefaultmapModifierLoc();
     }
     for (unsigned VC = 0; VC < DefaultmapKindNum; ++VC) {
-      auto Kind = static_cast<OpenMPDefaultmapClauseKind>(VC);
+      auto K = static_cast<OpenMPDefaultmapClauseKind>(VC);
       for (unsigned I = 0; I < OMPC_MAP_delete; ++I) {
-        ArrayRef<Expr *> ImplicitMap = DSAChecker.getImplicitMap(
-            Kind, static_cast<OpenMPMapClauseKind>(I));
+        ArrayRef<Expr *> ImplicitMap =
+            DSAChecker.getImplicitMap(K, static_cast<OpenMPMapClauseKind>(I));
         ImplicitMaps[VC][I].append(ImplicitMap.begin(), ImplicitMap.end());
       }
       ArrayRef<OpenMPMapModifierKind> ImplicitModifier =
-          DSAChecker.getImplicitMapModifier(Kind);
+          DSAChecker.getImplicitMapModifier(K);
       ImplicitMapModifiers[VC].append(ImplicitModifier.begin(),
                                       ImplicitModifier.end());
       std::fill_n(std::back_inserter(ImplicitMapModifiersLoc[VC]),
@@ -6224,7 +6137,7 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
       SmallVector<Expr *, 4> ImplicitExprs;
       for (OMPClause *C : Clauses) {
         if (auto *RC = dyn_cast<OMPReductionClause>(C))
-          for (Expr *E : RC->varlists())
+          for (Expr *E : RC->varlist())
             if (!isa<DeclRefExpr>(E->IgnoreParenImpCasts()))
               ImplicitExprs.emplace_back(E);
       }
@@ -6248,10 +6161,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
           continue;
         CXXScopeSpec MapperIdScopeSpec;
         DeclarationNameInfo MapperId;
-        auto Kind = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
+        auto K = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
         if (OMPClause *Implicit = ActOnOpenMPMapClause(
                 nullptr, ImplicitMapModifiers[I], ImplicitMapModifiersLoc[I],
-                MapperIdScopeSpec, MapperId, Kind, /*IsMapTypeImplicit=*/true,
+                MapperIdScopeSpec, MapperId, K, /*IsMapTypeImplicit=*/true,
                 SourceLocation(), SourceLocation(), ImplicitMap,
                 OMPVarListLocTy())) {
           ClausesWithImplicit.emplace_back(Implicit);
@@ -6286,6 +6199,15 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
     Res = ActOnOpenMPUnrollDirective(ClausesWithImplicit, AStmt, StartLoc,
                                      EndLoc);
     break;
+  case OMPD_reverse:
+    assert(ClausesWithImplicit.empty() &&
+           "reverse directive does not support any clauses");
+    Res = ActOnOpenMPReverseDirective(AStmt, StartLoc, EndLoc);
+    break;
+  case OMPD_interchange:
+    Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc,
+                                          EndLoc);
+    break;
   case OMPD_for:
     Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
                                   VarsWithInheritedDSA);
@@ -7581,9 +7503,9 @@ SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG,
                               PartialDiagnostic::NullDiagnostic()),
           PartialDiagnosticAt(
               VariantRef->getExprLoc(),
-              PDiag(diag::err_omp_declare_variant_doesnt_support)),
+              SemaRef.PDiag(diag::err_omp_declare_variant_doesnt_support)),
           PartialDiagnosticAt(VariantRef->getExprLoc(),
-                              PDiag(diag::err_omp_declare_variant_diff)
+                              SemaRef.PDiag(diag::err_omp_declare_variant_diff)
                                   << FD->getLocation()),
           /*TemplatesSupported=*/true, /*ConstexprSupported=*/false,
           /*CLinkageMayDiffer=*/true))
@@ -9160,13 +9082,9 @@ static bool checkOpenMPIterationSpace(
   auto *CXXFor = dyn_cast_or_null<CXXForRangeStmt>(S);
   // Ranged for is supported only in OpenMP 5.0.
   if (!For && (SemaRef.LangOpts.OpenMP <= 45 || !CXXFor)) {
-    OpenMPDirectiveKind DK = (SemaRef.getLangOpts().OpenMP < 50 ||
-                              DSA.getMappedDirective() == OMPD_unknown)
-                                 ? DKind
-                                 : DSA.getMappedDirective();
     SemaRef.Diag(S->getBeginLoc(), diag::err_omp_not_for)
         << (CollapseLoopCountExpr != nullptr || OrderedLoopCountExpr != nullptr)
-        << getOpenMPDirectiveName(DK) << TotalNestedLoopCount
+        << getOpenMPDirectiveName(DKind) << TotalNestedLoopCount
         << (CurrentNestedLoopCount > 0) << CurrentNestedLoopCount;
     if (TotalNestedLoopCount > 1) {
       if (CollapseLoopCountExpr && OrderedLoopCountExpr)
@@ -9504,7 +9422,7 @@ static Stmt *buildPreInits(ASTContext &Context,
 /// contained DeclStmts need to be visible after the execution of the list. Used
 /// for OpenMP pre-init declarations/statements.
 static void appendFlattenedStmtList(SmallVectorImpl<Stmt *> &TargetList,
-                                     Stmt *Item) {
+                                    Stmt *Item) {
   // nullptr represents an empty list.
   if (!Item)
     return;
@@ -10321,33 +10239,13 @@ static bool checkSimdlenSafelenSpecified(Sema &S,
   return false;
 }
 
-static bool checkGenericLoopLastprivate(Sema &S, ArrayRef<OMPClause *> Clauses,
-                                        OpenMPDirectiveKind K,
-                                        DSAStackTy *Stack);
-
-bool SemaOpenMP::checkLastPrivateForMappedDirectives(
-    ArrayRef<OMPClause *> Clauses) {
-
-  // Check for syntax of lastprivate
-  // Param of the lastprivate have different meanings in the mapped directives
-  // e.g. "omp loop" Only loop iteration vars are allowed in lastprivate clause
-  //      "omp for"  lastprivate vars must be shared
-  if (getLangOpts().OpenMP >= 50 &&
-      DSAStack->getMappedDirective() == OMPD_loop &&
-      checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) {
-    return false;
-  }
-  return true;
-}
-
 StmtResult SemaOpenMP::ActOnOpenMPSimdDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
-  if (!checkLastPrivateForMappedDirectives(Clauses))
-    return StmtError();
+  CapturedStmt *CS = setBranchProtectedScope(SemaRef, OMPD_simd, AStmt);
 
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
@@ -10355,7 +10253,7 @@ StmtResult SemaOpenMP::ActOnOpenMPSimdDirective(
   // define the nested loops number.
   unsigned NestedLoopCount = checkOpenMPLoop(
       OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
-      AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B);
+      CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -10365,10 +10263,8 @@ StmtResult SemaOpenMP::ActOnOpenMPSimdDirective(
   if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();
 
-  SemaRef.setFunctionHasBranchProtectedScope();
   auto *SimdDirective = OMPSimdDirective::Create(
-      getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
-      DSAStack->getMappedDirective());
+      getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
   return SimdDirective;
 }
 
@@ -10378,9 +10274,6 @@ StmtResult SemaOpenMP::ActOnOpenMPForDirective(
   if (!AStmt)
     return StmtError();
 
-  if (!checkLastPrivateForMappedDirectives(Clauses))
-    return StmtError();
-
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
@@ -10396,8 +10289,7 @@ StmtResult SemaOpenMP::ActOnOpenMPForDirective(
 
   auto *ForDirective = OMPForDirective::Create(
       getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
-      DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion(),
-      DSAStack->getMappedDirective());
+      DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
   return ForDirective;
 }
 
@@ -10407,13 +10299,15 @@ StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective(
   if (!AStmt)
     return StmtError();
 
+  CapturedStmt *CS = setBranchProtectedScope(SemaRef, OMPD_for_simd, AStmt);
+
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
       checkOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses),
-                      getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack,
+                      getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack,
                       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
@@ -10424,7 +10318,6 @@ StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective(
   if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();
 
-  SemaRef.setFunctionHasBranchProtectedScope();
   return OMPForSimdDirective::Create(getASTContext(), StartLoc, EndLoc,
                                      NestedLoopCount, Clauses, AStmt, B);
 }
@@ -10554,7 +10447,7 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef<OMPClause *> Clauses,
   bool ErrorFound = false;
   for (OMPClause *C : Clauses) {
     if (auto *LPC = dyn_cast<OMPLastprivateClause>(C)) {
-      for (Expr *RefExpr : LPC->varlists()) {
+      for (Expr *RefExpr : LPC->varlist()) {
         SourceLocation ELoc;
         SourceRange ERange;
         Expr *SimpleRefExpr = RefExpr;
@@ -10876,14 +10769,15 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelForSimdDirective(
   if (!AStmt)
     return StmtError();
 
-  setBranchProtectedScope(SemaRef, OMPD_parallel_for_simd, AStmt);
+  CapturedStmt *CS =
+      setBranchProtectedScope(SemaRef, OMPD_parallel_for_simd, AStmt);
 
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
       checkOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses),
-                      getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack,
+                      getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack,
                       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
@@ -13233,14 +13127,17 @@ StmtResult SemaOpenMP::ActOnOpenMPTaskLoopSimdDirective(
   if (!AStmt)
     return StmtError();
 
+  CapturedStmt *CS =
+      setBranchProtectedScope(SemaRef, OMPD_taskloop_simd, AStmt);
+
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
       checkOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses),
-                      /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef,
-                      *DSAStack, VarsWithImplicitDSA, B);
+                      /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack,
+                      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -13261,7 +13158,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTaskLoopSimdDirective(
   if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();
 
-  SemaRef.setFunctionHasBranchProtectedScope();
   return OMPTaskLoopSimdDirective::Create(getASTContext(), StartLoc, EndLoc,
                                           NestedLoopCount, Clauses, AStmt, B);
 }
@@ -13348,14 +13244,17 @@ StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopSimdDirective(
   if (!AStmt)
     return StmtError();
 
+  CapturedStmt *CS =
+      setBranchProtectedScope(SemaRef, OMPD_master_taskloop_simd, AStmt);
+
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
       checkOpenMPLoop(OMPD_master_taskloop_simd, getCollapseNumberExpr(Clauses),
-                      /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef,
-                      *DSAStack, VarsWithImplicitDSA, B);
+                      /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack,
+                      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -13376,7 +13275,6 @@ StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopSimdDirective(
   if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();
 
-  SemaRef.setFunctionHasBranchProtectedScope();
   return OMPMasterTaskLoopSimdDirective::Create(
       getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
 }
@@ -13387,14 +13285,17 @@ StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopSimdDirective(
   if (!AStmt)
     return StmtError();
 
+  CapturedStmt *CS =
+      setBranchProtectedScope(SemaRef, OMPD_masked_taskloop_simd, AStmt);
+
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
       checkOpenMPLoop(OMPD_masked_taskloop_simd, getCollapseNumberExpr(Clauses),
-                      /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef,
-                      *DSAStack, VarsWithImplicitDSA, B);
+                      /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack,
+                      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -13415,7 +13316,6 @@ StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopSimdDirective(
   if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();
 
-  SemaRef.setFunctionHasBranchProtectedScope();
   return OMPMaskedTaskLoopSimdDirective::Create(
       getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
 }
@@ -13584,9 +13484,6 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeDirective(
   if (!AStmt)
     return StmtError();
 
-  if (!checkLastPrivateForMappedDirectives(Clauses))
-    return StmtError();
-
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   OMPLoopBasedDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
@@ -13603,8 +13500,7 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeDirective(
 
   SemaRef.setFunctionHasBranchProtectedScope();
   auto *DistributeDirective = OMPDistributeDirective::Create(
-      getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
-      DSAStack->getMappedDirective());
+      getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
   return DistributeDirective;
 }
 
@@ -14042,6 +13938,10 @@ bool SemaOpenMP::checkTransformableLoopNest(
           DependentPreInits = Dir->getPreInits();
         else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform))
           DependentPreInits = Dir->getPreInits();
+        else if (auto *Dir = dyn_cast<OMPReverseDirective>(Transform))
+          DependentPreInits = Dir->getPreInits();
+        else if (auto *Dir = dyn_cast<OMPInterchangeDirective>(Transform))
+          DependentPreInits = Dir->getPreInits();
         else
           llvm_unreachable("Unhandled loop transformation");
 
@@ -14660,6 +14560,345 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
                                     buildPreInits(Context, PreInits));
 }
 
+StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
+                                                   SourceLocation StartLoc,
+                                                   SourceLocation EndLoc) {
+  ASTContext &Context = getASTContext();
+  Scope *CurScope = SemaRef.getCurScope();
+
+  // Empty statement should only be possible if there already was an error.
+  if (!AStmt)
+    return StmtError();
+
+  constexpr unsigned NumLoops = 1;
+  Stmt *Body = nullptr;
+  SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
+      NumLoops);
+  SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+  if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers,
+                                  Body, OriginalInits))
+    return StmtError();
+
+  // Delay applying the transformation to when template is completely
+  // instantiated.
+  if (SemaRef.CurContext->isDependentContext())
+    return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt,
+                                       nullptr, nullptr);
+
+  assert(LoopHelpers.size() == NumLoops &&
+         "Expecting a single-dimensional loop iteration space");
+  assert(OriginalInits.size() == NumLoops &&
+         "Expecting a single-dimensional loop iteration space");
+  OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front();
+
+  // Find the loop statement.
+  Stmt *LoopStmt = nullptr;
+  collectLoopStmts(AStmt, {LoopStmt});
+
+  // Determine the PreInit declarations.
+  SmallVector<Stmt *> PreInits;
+  addLoopPreInits(Context, LoopHelper, LoopStmt, OriginalInits[0], PreInits);
+
+  auto *IterationVarRef = cast<DeclRefExpr>(LoopHelper.IterationVarRef);
+  QualType IVTy = IterationVarRef->getType();
+  uint64_t IVWidth = Context.getTypeSize(IVTy);
+  auto *OrigVar = cast<DeclRefExpr>(LoopHelper.Counters.front());
+
+  // Iteration variable SourceLocations.
+  SourceLocation OrigVarLoc = OrigVar->getExprLoc();
+  SourceLocation OrigVarLocBegin = OrigVar->getBeginLoc();
+  SourceLocation OrigVarLocEnd = OrigVar->getEndLoc();
+
+  // Locations pointing to the transformation.
+  SourceLocation TransformLoc = StartLoc;
+  SourceLocation TransformLocBegin = StartLoc;
+  SourceLocation TransformLocEnd = EndLoc;
+
+  // Internal variable names.
+  std::string OrigVarName = OrigVar->getNameInfo().getAsString();
+  SmallString<64> ForwardIVName(".forward.iv.");
+  ForwardIVName += OrigVarName;
+  SmallString<64> ReversedIVName(".reversed.iv.");
+  ReversedIVName += OrigVarName;
+
+  // LoopHelper.Updates will read the logical iteration number from
+  // LoopHelper.IterationVarRef, compute the value of the user loop counter of
+  // that logical iteration from it, then assign it to the user loop counter
+  // variable. We cannot directly use LoopHelper.IterationVarRef as the
+  // induction variable of the generated loop because it may cause an underflow:
+  // \code{.c}
+  //   for (unsigned i = 0; i < n; ++i)
+  //     body(i);
+  // \endcode
+  //
+  // Naive reversal:
+  // \code{.c}
+  //   for (unsigned i = n-1; i >= 0; --i)
+  //     body(i);
+  // \endcode
+  //
+  // Instead, we introduce a new iteration variable representing the logical
+  // iteration counter of the original loop, convert it to the logical iteration
+  // number of the reversed loop, then let LoopHelper.Updates compute the user's
+  // loop iteration variable from it.
+  // \code{.cpp}
+  //   for (auto .forward.iv = 0; .forward.iv < n; ++.forward.iv) {
+  //     auto .reversed.iv = n - .forward.iv - 1;
+  //     i = (.reversed.iv + 0) * 1;                // LoopHelper.Updates
+  //     body(i);                                   // Body
+  //   }
+  // \endcode
+
+  // Subexpressions with more than one use. One of the constraints of an AST is
+  // that every node object must appear at most once, hence we define a lambda
+  // that creates a new AST node at every use.
+  CaptureVars CopyTransformer(SemaRef);
+  auto MakeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * {
+    return AssertSuccess(
+        CopyTransformer.TransformExpr(LoopHelper.NumIterations));
+  };
+
+  // Create the iteration variable for the forward loop (from 0 to n-1).
+  VarDecl *ForwardIVDecl =
+      buildVarDecl(SemaRef, {}, IVTy, ForwardIVName, nullptr, OrigVar);
+  auto MakeForwardRef = [&SemaRef = this->SemaRef, ForwardIVDecl, IVTy,
+                         OrigVarLoc]() {
+    return buildDeclRefExpr(SemaRef, ForwardIVDecl, IVTy, OrigVarLoc);
+  };
+
+  // Iteration variable for the reversed induction variable (from n-1 downto 0):
+  // Reuse the iteration variable created by checkOpenMPLoop.
+  auto *ReversedIVDecl = cast<VarDecl>(IterationVarRef->getDecl());
+  ReversedIVDecl->setDeclName(
+      &SemaRef.PP.getIdentifierTable().get(ReversedIVName));
+
+  // For init-statement:
+  // \code{.cpp}
+  //   auto .forward.iv = 0;
+  // \endcode
+  auto *Zero = IntegerLiteral::Create(Context, llvm::APInt::getZero(IVWidth),
+                                      ForwardIVDecl->getType(), OrigVarLoc);
+  SemaRef.AddInitializerToDecl(ForwardIVDecl, Zero, /*DirectInit=*/false);
+  StmtResult Init = new (Context)
+      DeclStmt(DeclGroupRef(ForwardIVDecl), OrigVarLocBegin, OrigVarLocEnd);
+  if (!Init.isUsable())
+    return StmtError();
+
+  // Forward iv cond-expression:
+  // \code{.cpp}
+  //   .forward.iv < MakeNumIterations()
+  // \endcode
+  ExprResult Cond =
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                         MakeForwardRef(), MakeNumIterations());
+  if (!Cond.isUsable())
+    return StmtError();
+
+  // Forward incr-statement:
+  // \code{.c}
+  //   ++.forward.iv
+  // \endcode
+  ExprResult Incr = SemaRef.BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(),
+                                         UO_PreInc, MakeForwardRef());
+  if (!Incr.isUsable())
+    return StmtError();
+
+  // Reverse the forward-iv:
+  // \code{.cpp}
+  //   auto .reversed.iv = MakeNumIterations() - 1 - .forward.iv
+  // \endcode
+  auto *One = IntegerLiteral::Create(Context, llvm::APInt(IVWidth, 1), IVTy,
+                                     TransformLoc);
+  ExprResult Minus = SemaRef.BuildBinOp(CurScope, TransformLoc, BO_Sub,
+                                        MakeNumIterations(), One);
+  if (!Minus.isUsable())
+    return StmtError();
+  Minus = SemaRef.BuildBinOp(CurScope, TransformLoc, BO_Sub, Minus.get(),
+                             MakeForwardRef());
+  if (!Minus.isUsable())
+    return StmtError();
+  StmtResult InitReversed = new (Context) DeclStmt(
+      DeclGroupRef(ReversedIVDecl), TransformLocBegin, TransformLocEnd);
+  if (!InitReversed.isUsable())
+    return StmtError();
+  SemaRef.AddInitializerToDecl(ReversedIVDecl, Minus.get(),
+                               /*DirectInit=*/false);
+
+  // The new loop body.
+  SmallVector<Stmt *, 4> BodyStmts;
+  BodyStmts.reserve(LoopHelper.Updates.size() + 2 +
+                    (isa<CXXForRangeStmt>(LoopStmt) ? 1 : 0));
+  BodyStmts.push_back(InitReversed.get());
+  llvm::append_range(BodyStmts, LoopHelper.Updates);
+  if (auto *CXXRangeFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
+    BodyStmts.push_back(CXXRangeFor->getLoopVarStmt());
+  BodyStmts.push_back(Body);
+  auto *ReversedBody =
+      CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(),
+                           Body->getBeginLoc(), Body->getEndLoc());
+
+  // Finally create the reversed For-statement.
+  auto *ReversedFor = new (Context)
+      ForStmt(Context, Init.get(), Cond.get(), nullptr, Incr.get(),
+              ReversedBody, LoopHelper.Init->getBeginLoc(),
+              LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc());
+  return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt,
+                                     ReversedFor,
+                                     buildPreInits(Context, PreInits));
+}
+
+StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc) {
+  ASTContext &Context = getASTContext();
+  DeclContext *CurContext = SemaRef.CurContext;
+  Scope *CurScope = SemaRef.getCurScope();
+
+  // Empty statement should only be possible if there already was an error.
+  if (!AStmt)
+    return StmtError();
+
+  // interchange without permutation clause swaps two loops.
+  constexpr size_t NumLoops = 2;
+
+  // Verify and diagnose loop nest.
+  SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
+  Stmt *Body = nullptr;
+  SmallVector<SmallVector<Stmt *, 0>, 2> OriginalInits;
+  if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops,
+                                  LoopHelpers, Body, OriginalInits))
+    return StmtError();
+
+  // Delay interchange to when template is completely instantiated.
+  if (CurContext->isDependentContext())
+    return OMPInterchangeDirective::Create(Context, StartLoc, EndLoc, Clauses,
+                                           NumLoops, AStmt, nullptr, nullptr);
+
+  assert(LoopHelpers.size() == NumLoops &&
+         "Expecting loop iteration space dimensionaly to match number of "
+         "affected loops");
+  assert(OriginalInits.size() == NumLoops &&
+         "Expecting loop iteration space dimensionaly to match number of "
+         "affected loops");
+
+  // Decode the permutation clause.
+  constexpr uint64_t Permutation[] = {1, 0};
+
+  // Find the affected loops.
+  SmallVector<Stmt *> LoopStmts(NumLoops, nullptr);
+  collectLoopStmts(AStmt, LoopStmts);
+
+  // Collect pre-init statements on the order before the permuation.
+  SmallVector<Stmt *> PreInits;
+  for (auto I : llvm::seq<int>(NumLoops)) {
+    OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
+
+    assert(LoopHelper.Counters.size() == 1 &&
+           "Single-dimensional loop iteration space expected");
+    auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters.front());
+
+    std::string OrigVarName = OrigCntVar->getNameInfo().getAsString();
+    addLoopPreInits(Context, LoopHelper, LoopStmts[I], OriginalInits[I],
+                    PreInits);
+  }
+
+  SmallVector<VarDecl *> PermutedIndVars(NumLoops);
+  CaptureVars CopyTransformer(SemaRef);
+
+  // Create the permuted loops from the inside to the outside of the
+  // interchanged loop nest. Body of the innermost new loop is the original
+  // innermost body.
+  Stmt *Inner = Body;
+  for (auto TargetIdx : llvm::reverse(llvm::seq<int>(NumLoops))) {
+    // Get the original loop that belongs to this new position.
+    uint64_t SourceIdx = Permutation[TargetIdx];
+    OMPLoopBasedDirective::HelperExprs &SourceHelper = LoopHelpers[SourceIdx];
+    Stmt *SourceLoopStmt = LoopStmts[SourceIdx];
+    assert(SourceHelper.Counters.size() == 1 &&
+           "Single-dimensional loop iteration space expected");
+    auto *OrigCntVar = cast<DeclRefExpr>(SourceHelper.Counters.front());
+
+    // Normalized loop counter variable: From 0 to n-1, always an integer type.
+    DeclRefExpr *IterVarRef = cast<DeclRefExpr>(SourceHelper.IterationVarRef);
+    QualType IVTy = IterVarRef->getType();
+    assert(IVTy->isIntegerType() &&
+           "Expected the logical iteration counter to be an integer");
+
+    std::string OrigVarName = OrigCntVar->getNameInfo().getAsString();
+    SourceLocation OrigVarLoc = IterVarRef->getExprLoc();
+
+    // Make a copy of the NumIterations expression for each use: By the AST
+    // constraints, every expression object in a DeclContext must be unique.
+    auto MakeNumIterations = [&CopyTransformer, &SourceHelper]() -> Expr * {
+      return AssertSuccess(
+          CopyTransformer.TransformExpr(SourceHelper.NumIterations));
+    };
+
+    // Iteration variable for the permuted loop. Reuse the one from
+    // checkOpenMPLoop which will also be used to update the original loop
+    // variable.
+    SmallString<64> PermutedCntName(".permuted_");
+    PermutedCntName.append({llvm::utostr(TargetIdx), ".iv.", OrigVarName});
+    auto *PermutedCntDecl = cast<VarDecl>(IterVarRef->getDecl());
+    PermutedCntDecl->setDeclName(
+        &SemaRef.PP.getIdentifierTable().get(PermutedCntName));
+    PermutedIndVars[TargetIdx] = PermutedCntDecl;
+    auto MakePermutedRef = [this, PermutedCntDecl, IVTy, OrigVarLoc]() {
+      return buildDeclRefExpr(SemaRef, PermutedCntDecl, IVTy, OrigVarLoc);
+    };
+
+    // For init-statement:
+    // \code
+    //   auto .permuted_{target}.iv = 0
+    // \endcode
+    ExprResult Zero = SemaRef.ActOnIntegerConstant(OrigVarLoc, 0);
+    if (!Zero.isUsable())
+      return StmtError();
+    SemaRef.AddInitializerToDecl(PermutedCntDecl, Zero.get(),
+                                 /*DirectInit=*/false);
+    StmtResult InitStmt = new (Context)
+        DeclStmt(DeclGroupRef(PermutedCntDecl), OrigCntVar->getBeginLoc(),
+                 OrigCntVar->getEndLoc());
+    if (!InitStmt.isUsable())
+      return StmtError();
+
+    // For cond-expression:
+    // \code
+    //   .permuted_{target}.iv < MakeNumIterations()
+    // \endcode
+    ExprResult CondExpr =
+        SemaRef.BuildBinOp(CurScope, SourceHelper.Cond->getExprLoc(), BO_LT,
+                           MakePermutedRef(), MakeNumIterations());
+    if (!CondExpr.isUsable())
+      return StmtError();
+
+    // For incr-statement:
+    // \code
+    //   ++.tile.iv
+    // \endcode
+    ExprResult IncrStmt = SemaRef.BuildUnaryOp(
+        CurScope, SourceHelper.Inc->getExprLoc(), UO_PreInc, MakePermutedRef());
+    if (!IncrStmt.isUsable())
+      return StmtError();
+
+    SmallVector<Stmt *, 4> BodyParts(SourceHelper.Updates.begin(),
+                                     SourceHelper.Updates.end());
+    if (auto *SourceCXXFor = dyn_cast<CXXForRangeStmt>(SourceLoopStmt))
+      BodyParts.push_back(SourceCXXFor->getLoopVarStmt());
+    BodyParts.push_back(Inner);
+    Inner = CompoundStmt::Create(Context, BodyParts, FPOptionsOverride(),
+                                 Inner->getBeginLoc(), Inner->getEndLoc());
+    Inner = new (Context) ForStmt(
+        Context, InitStmt.get(), CondExpr.get(), nullptr, IncrStmt.get(), Inner,
+        SourceHelper.Init->getBeginLoc(), SourceHelper.Init->getBeginLoc(),
+        SourceHelper.Inc->getEndLoc());
+  }
+
+  return OMPInterchangeDirective::Create(Context, StartLoc, EndLoc, Clauses,
+                                         NumLoops, AStmt, Inner,
+                                         buildPreInits(Context, PreInits));
+}
+
 OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
                                                    Expr *Expr,
                                                    SourceLocation StartLoc,
@@ -18886,7 +19125,7 @@ static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
   auto CurInit = Clause.inits().begin();
   auto CurPrivate = Clause.privates().begin();
   OpenMPLinearClauseKind LinKind = Clause.getModifier();
-  for (Expr *RefExpr : Clause.varlists()) {
+  for (Expr *RefExpr : Clause.varlist()) {
     SourceLocation ELoc;
     SourceRange ERange;
     Expr *SimpleRefExpr = RefExpr;
@@ -21656,7 +21895,9 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause(
     bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) ||
                             (getLangOpts().OpenMP >= 50 && KindLoc.isInvalid());
     if (!isDefaultmapKind || !isDefaultmapModifier) {
-      StringRef KindValue = "'scalar', 'aggregate', 'pointer'";
+      StringRef KindValue = getLangOpts().OpenMP < 52
+                                ? "'scalar', 'aggregate', 'pointer'"
+                                : "'scalar', 'aggregate', 'pointer', 'all'";
       if (getLangOpts().OpenMP == 50) {
         StringRef ModifierValue = "'alloc', 'from', 'to', 'tofrom', "
                                   "'firstprivate', 'none', 'default'";
@@ -21700,7 +21941,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause(
       return nullptr;
     }
   }
-  if (Kind == OMPC_DEFAULTMAP_unknown) {
+  if (Kind == OMPC_DEFAULTMAP_unknown || Kind == OMPC_DEFAULTMAP_all) {
     // Variable category is not specified - mark all categories.
     DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_aggregate, StartLoc);
     DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_scalar, StartLoc);
@@ -21773,7 +22014,7 @@ NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName(
             SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr,
                                 CCC, Sema::CTK_ErrorRecovery)) {
       SemaRef.diagnoseTypo(Corrected,
-                           PDiag(diag::err_undeclared_var_use_suggest)
+                           SemaRef.PDiag(diag::err_undeclared_var_use_suggest)
                                << Id.getName());
       checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl());
       return nullptr;
@@ -22848,8 +23089,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause(
   if (DSAStack->getCurrentDirective() == OMPD_ordered &&
       DepType != OMPC_DOACROSS_source && DepType != OMPC_DOACROSS_sink &&
       DepType != OMPC_DOACROSS_sink_omp_cur_iteration &&
-      DepType != OMPC_DOACROSS_source_omp_cur_iteration &&
-      DepType != OMPC_DOACROSS_source) {
+      DepType != OMPC_DOACROSS_source_omp_cur_iteration) {
     Diag(DepLoc, diag::err_omp_unexpected_clause_value)
         << "'source' or 'sink'" << getOpenMPClauseName(OMPC_doacross);
     return nullptr;
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 39af562f089b2..bce0bf497a952 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CheckExprLifetime.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
 #include "clang/AST/CXXInheritance.h"
@@ -47,6 +48,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <cstdlib>
 #include <optional>
@@ -6874,10 +6876,7 @@ void Sema::AddOverloadCandidate(
   Candidate.Viable = true;
   Candidate.RewriteKind =
       CandidateSet.getRewriteInfo().getRewriteKind(Function, PO);
-  Candidate.IsSurrogate = false;
   Candidate.IsADLCandidate = IsADLCandidate;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   Candidate.ExplicitCallArguments = Args.size();
 
   // Explicit functions are not actually candidates at all if we're not
@@ -7439,8 +7438,6 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
   Candidate.Function = Method;
   Candidate.RewriteKind =
       CandidateSet.getRewriteInfo().getRewriteKind(Method, PO);
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
   Candidate.TookAddressOfOverload =
       CandidateSet.getKind() == OverloadCandidateSet::CSK_AddressOfOverloadSet;
   Candidate.ExplicitCallArguments = Args.size();
@@ -7634,7 +7631,6 @@ void Sema::AddMethodTemplateCandidate(
     Candidate.IgnoreObjectArgument =
         cast<CXXMethodDecl>(Candidate.Function)->isStatic() ||
         ObjectType.isNull();
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = Args.size();
     if (Result == TemplateDeductionResult::NonDependentConversionFailure)
       Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -7722,7 +7718,6 @@ void Sema::AddTemplateOverloadCandidate(
     Candidate.IgnoreObjectArgument =
         isa<CXXMethodDecl>(Candidate.Function) &&
         !isa<CXXConstructorDecl>(Candidate.Function);
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = Args.size();
     if (Result == TemplateDeductionResult::NonDependentConversionFailure)
       Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -7903,9 +7898,6 @@ void Sema::AddConversionCandidate(
   OverloadCandidate &Candidate = CandidateSet.addCandidate(1);
   Candidate.FoundDecl = FoundDecl;
   Candidate.Function = Conversion;
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   Candidate.FinalConversion.setAsIdentityConversion();
   Candidate.FinalConversion.setFromType(ConvType);
   Candidate.FinalConversion.setAllToTypes(ToType);
@@ -8101,9 +8093,6 @@ void Sema::AddTemplateConversionCandidate(
     Candidate.Function = FunctionTemplate->getTemplatedDecl();
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_bad_deduction;
-    Candidate.IsSurrogate = false;
-    Candidate.IgnoreObjectArgument = false;
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = 1;
     Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result,
                                                           Info);
@@ -8136,10 +8125,8 @@ void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion,
   Candidate.FoundDecl = FoundDecl;
   Candidate.Function = nullptr;
   Candidate.Surrogate = Conversion;
-  Candidate.Viable = true;
   Candidate.IsSurrogate = true;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
+  Candidate.Viable = true;
   Candidate.ExplicitCallArguments = Args.size();
 
   // Determine the implicit conversion sequence for the implicit
@@ -8345,9 +8332,6 @@ void Sema::AddBuiltinCandidate(QualType *ParamTys, ArrayRef<Expr *> Args,
   OverloadCandidate &Candidate = CandidateSet.addCandidate(Args.size());
   Candidate.FoundDecl = DeclAccessPair::make(nullptr, AS_none);
   Candidate.Function = nullptr;
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   std::copy(ParamTys, ParamTys + Args.size(), Candidate.BuiltinParamTypes);
 
   // Determine the implicit conversion sequences for each of the
@@ -10012,8 +9996,9 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name,
                                    CandEnd = CandidateSet.end();
        Cand != CandEnd; ++Cand)
     if (Cand->Function) {
-      Fns.erase(Cand->Function);
-      if (FunctionTemplateDecl *FunTmpl = Cand->Function->getPrimaryTemplate())
+      FunctionDecl *Fn = Cand->Function;
+      Fns.erase(Fn);
+      if (FunctionTemplateDecl *FunTmpl = Fn->getPrimaryTemplate())
         Fns.erase(FunTmpl);
     }
 
@@ -11175,7 +11160,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
   if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn)) {
     if (I == 0)
       isObjectArgument = true;
-    else
+    else if (!Fn->hasCXXExplicitFunctionObjectParameter())
       I--;
   }
 
@@ -11376,8 +11361,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
     }
   }
 
-  if (TakingCandidateAddress &&
-      !checkAddressOfCandidateIsAvailable(S, Cand->Function))
+  if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn))
     return;
 
   // Emit the generic diagnostic and, optionally, add the hints to it.
@@ -11403,6 +11387,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
 /// over a candidate in any candidate set.
 static bool CheckArityMismatch(Sema &S, OverloadCandidate *Cand,
                                unsigned NumArgs, bool IsAddressOf = false) {
+  assert(Cand->Function && "Candidate is required to be a function.");
   FunctionDecl *Fn = Cand->Function;
   unsigned MinParams = Fn->getMinRequiredExplicitArguments() +
                        ((IsAddressOf && !Fn->isStatic()) ? 1 : 0);
@@ -11493,8 +11478,10 @@ static void DiagnoseArityMismatch(Sema &S, NamedDecl *Found, Decl *D,
 /// Arity mismatch diagnosis specific to a function overload candidate.
 static void DiagnoseArityMismatch(Sema &S, OverloadCandidate *Cand,
                                   unsigned NumFormalArgs) {
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
   if (!CheckArityMismatch(S, Cand, NumFormalArgs, Cand->TookAddressOfOverload))
-    DiagnoseArityMismatch(S, Cand->FoundDecl, Cand->Function, NumFormalArgs,
+    DiagnoseArityMismatch(S, Cand->FoundDecl, Fn, NumFormalArgs,
                           Cand->TookAddressOfOverload);
 }
 
@@ -11794,19 +11781,22 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
 static void DiagnoseBadDeduction(Sema &S, OverloadCandidate *Cand,
                                  unsigned NumArgs,
                                  bool TakingCandidateAddress) {
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
   TemplateDeductionResult TDK = Cand->DeductionFailure.getResult();
   if (TDK == TemplateDeductionResult::TooFewArguments ||
       TDK == TemplateDeductionResult::TooManyArguments) {
     if (CheckArityMismatch(S, Cand, NumArgs))
       return;
   }
-  DiagnoseBadDeduction(S, Cand->FoundDecl, Cand->Function, // pattern
+  DiagnoseBadDeduction(S, Cand->FoundDecl, Fn, // pattern
                        Cand->DeductionFailure, NumArgs, TakingCandidateAddress);
 }
 
 /// CUDA: diagnose an invalid call across targets.
 static void DiagnoseBadTarget(Sema &S, OverloadCandidate *Cand) {
   FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true);
+  assert(Cand->Function && "Candidate must be a Function.");
   FunctionDecl *Callee = Cand->Function;
 
   CUDAFunctionTarget CallerTarget = S.CUDA().IdentifyTarget(Caller),
@@ -11864,6 +11854,7 @@ static void DiagnoseBadTarget(Sema &S, OverloadCandidate *Cand) {
 }
 
 static void DiagnoseFailedEnableIfAttr(Sema &S, OverloadCandidate *Cand) {
+  assert(Cand->Function && "Candidate must be a function");
   FunctionDecl *Callee = Cand->Function;
   EnableIfAttr *Attr = static_cast<EnableIfAttr*>(Cand->DeductionFailure.Data);
 
@@ -11873,11 +11864,13 @@ static void DiagnoseFailedEnableIfAttr(Sema &S, OverloadCandidate *Cand) {
 }
 
 static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
-  ExplicitSpecifier ES = ExplicitSpecifier::getFromDecl(Cand->Function);
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
+  ExplicitSpecifier ES = ExplicitSpecifier::getFromDecl(Fn);
   assert(ES.isExplicit() && "not an explicit candidate");
 
   unsigned Kind;
-  switch (Cand->Function->getDeclKind()) {
+  switch (Fn->getDeclKind()) {
   case Decl::Kind::CXXConstructor:
     Kind = 0;
     break;
@@ -11885,7 +11878,7 @@ static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
     Kind = 1;
     break;
   case Decl::Kind::CXXDeductionGuide:
-    Kind = Cand->Function->isImplicit() ? 0 : 2;
+    Kind = Fn->isImplicit() ? 0 : 2;
     break;
   default:
     llvm_unreachable("invalid Decl");
@@ -11895,7 +11888,7 @@ static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
   // (particularly an out-of-class definition) will typically lack the
   // 'explicit' specifier.
   // FIXME: This is probably a good thing to do for all 'candidate' notes.
-  FunctionDecl *First = Cand->Function->getFirstDecl();
+  FunctionDecl *First = Fn->getFirstDecl();
   if (FunctionDecl *Pattern = First->getTemplateInstantiationPattern())
     First = Pattern->getFirstDecl();
 
@@ -11964,6 +11957,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
                                   unsigned NumArgs,
                                   bool TakingCandidateAddress,
                                   LangAS CtorDestAS = LangAS::Default) {
+  assert(Cand->Function && "Candidate must be a function");
   FunctionDecl *Fn = Cand->Function;
   if (shouldSkipNotingLambdaConversionDecl(Fn))
     return;
@@ -11978,8 +11972,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
   // Skip implicit member functions when trying to resolve
   // the address of a an overload set for a function pointer.
   if (Cand->TookAddressOfOverload &&
-      !Cand->Function->hasCXXExplicitFunctionObjectParameter() &&
-      !Cand->Function->isStatic())
+      !Fn->hasCXXExplicitFunctionObjectParameter() && !Fn->isStatic())
     return;
 
   // Note deleted candidates, but only if they're viable.
@@ -12077,7 +12070,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
     return;
 
   case ovl_fail_addr_not_available: {
-    bool Available = checkAddressOfCandidateIsAvailable(S, Cand->Function);
+    bool Available = checkAddressOfCandidateIsAvailable(S, Fn);
     (void)Available;
     assert(!Available);
     break;
@@ -14741,10 +14734,12 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
                                 FnDecl))
           return ExprError();
 
-        // Check for a self move.
-        if (Op == OO_Equal)
+        if (Op == OO_Equal) {
+          // Check for a self move.
           DiagnoseSelfMove(Args[0], Args[1], OpLoc);
-
+          // lifetime check.
+          checkExprLifetime(*this, AssignedEntity{Args[0]}, Args[1]);
+        }
         if (ImplicitThis) {
           QualType ThisType = Context.getPointerType(ImplicitThis->getType());
           QualType ThisTypeFromDecl = Context.getPointerType(
diff --git a/clang/lib/Sema/SemaPPC.cpp b/clang/lib/Sema/SemaPPC.cpp
index 99f46b12e6968..5b764ed396ebc 100644
--- a/clang/lib/Sema/SemaPPC.cpp
+++ b/clang/lib/Sema/SemaPPC.cpp
@@ -93,7 +93,6 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
                                           unsigned BuiltinID,
                                           CallExpr *TheCall) {
   ASTContext &Context = getASTContext();
-  unsigned i = 0, l = 0, u = 0;
   bool IsTarget64Bit = TI.getTypeWidth(TI.getIntPtrType()) == 64;
   llvm::APSInt Result;
 
@@ -248,7 +247,7 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
     return BuiltinPPCMMACall(TheCall, BuiltinID, Types);
 #include "clang/Basic/BuiltinsPPC.def"
   }
-  return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u);
+  llvm_unreachable("must return from switch");
 }
 
 // Check if the given type is a non-pointer PPC MMA type. This function is used
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index f736de01e6339..b6ce0d9ecfb50 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -634,7 +634,6 @@ class CallExprFinder : public ConstEvaluatedExprVisitor<CallExprFinder> {
 
 static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                                SourceRange Range) {
-  NoMergeAttr NMA(S.Context, A);
   CallExprFinder CEF(S, St);
 
   if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
@@ -646,6 +645,19 @@ static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
   return ::new (S.Context) NoMergeAttr(S.Context, A);
 }
 
+static Attr *handleNoConvergentAttr(Sema &S, Stmt *St, const ParsedAttr &A,
+                                    SourceRange Range) {
+  CallExprFinder CEF(S, St);
+
+  if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
+    S.Diag(St->getBeginLoc(), diag::warn_attribute_ignored_no_calls_in_stmt)
+        << A;
+    return nullptr;
+  }
+
+  return ::new (S.Context) NoConvergentAttr(S.Context, A);
+}
+
 template <typename OtherAttr, int DiagIdx>
 static bool CheckStmtInlineAttr(Sema &SemaRef, const Stmt *OrigSt,
                                 const Stmt *CurSt,
@@ -1138,13 +1150,6 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
 
   unsigned UnrollFactor = 0;
   if (A.getNumArgs() == 1) {
-
-    if (A.isArgIdent(0)) {
-      S.Diag(A.getLoc(), diag::err_attribute_argument_type)
-          << A << AANT_ArgumentIntegerConstant << A.getRange();
-      return nullptr;
-    }
-
     Expr *E = A.getArgAsExpr(0);
 
     if (S.CheckLoopHintExpr(E, St->getBeginLoc(),
@@ -1239,6 +1244,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
     return handleCodeAlignAttr(S, St, A);
   case ParsedAttr::AT_MSConstexpr:
     return handleMSConstexprAttr(S, St, A, Range);
+  case ParsedAttr::AT_NoConvergent:
+    return handleNoConvergentAttr(S, St, A, Range);
   default:
     // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
     // declaration attribute is not written on a statement, but this code is
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 9d96201625389..c22e329bef2b9 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3344,6 +3344,10 @@ QualType Sema::CheckTemplateIdType(TemplateName Name,
         *this, /*PointOfInstantiation=*/TemplateLoc,
         /*Entity=*/AliasTemplate,
         /*TemplateArgs=*/TemplateArgLists.getInnermost());
+
+    // Diagnose uses of this alias.
+    (void)DiagnoseUseOfDecl(AliasTemplate, TemplateLoc);
+
     if (Inst.isInvalid())
       return QualType();
 
@@ -6715,7 +6719,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
       auto *VD = const_cast<ValueDecl *>(Base.dyn_cast<const ValueDecl *>());
       //   For a non-type template-parameter of pointer or reference type,
       //   the value of the constant expression shall not refer to
-      assert(ParamType->isPointerType() || ParamType->isReferenceType() ||
+      assert(ParamType->isPointerOrReferenceType() ||
              ParamType->isNullPtrType());
       // -- a temporary object
       // -- a string literal
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index b7b857ebf804b..db7f233dcef73 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -951,9 +951,11 @@ class PackDeductionScope {
 
     // Skip over the pack elements that were expanded into separate arguments.
     // If we partially expanded, this is the number of partial arguments.
+    // FIXME: `&& FixedNumExpansions` is a workaround for UB described in
+    // https://github.com/llvm/llvm-project/issues/100095
     if (IsPartiallyExpanded)
       PackElements += NumPartialPackArgs;
-    else if (IsExpanded)
+    else if (IsExpanded && FixedNumExpansions)
       PackElements += *FixedNumExpansions;
 
     for (auto &Pack : Packs) {
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 7dff2c8f98589..545da21183c3c 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -39,6 +39,7 @@
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Scope.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "clang/Sema/TemplateDeduction.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -241,11 +242,10 @@ NamedDecl *buildDeductionGuide(
 }
 
 // Transform a given template type parameter `TTP`.
-TemplateTypeParmDecl *
-transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
-                           TemplateTypeParmDecl *TTP,
-                           MultiLevelTemplateArgumentList &Args,
-                           unsigned NewDepth, unsigned NewIndex) {
+TemplateTypeParmDecl *transformTemplateTypeParam(
+    Sema &SemaRef, DeclContext *DC, TemplateTypeParmDecl *TTP,
+    MultiLevelTemplateArgumentList &Args, unsigned NewDepth, unsigned NewIndex,
+    bool EvaluateConstraint) {
   // TemplateTypeParmDecl's index cannot be changed after creation, so
   // substitute it directly.
   auto *NewTTP = TemplateTypeParmDecl::Create(
@@ -257,7 +257,7 @@ transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
           : std::nullopt);
   if (const auto *TC = TTP->getTypeConstraint())
     SemaRef.SubstTypeConstraint(NewTTP, TC, Args,
-                                /*EvaluateConstraint=*/true);
+                                /*EvaluateConstraint=*/EvaluateConstraint);
   if (TTP->hasDefaultArgument()) {
     TemplateArgumentLoc InstantiatedDefaultArg;
     if (!SemaRef.SubstTemplateArgument(
@@ -284,6 +284,22 @@ transformTemplateParam(Sema &SemaRef, DeclContext *DC,
   return NewParam;
 }
 
+NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC,
+                                      NamedDecl *TemplateParam,
+                                      MultiLevelTemplateArgumentList &Args,
+                                      unsigned NewIndex, unsigned NewDepth,
+                                      bool EvaluateConstraint = true) {
+  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
+    return transformTemplateTypeParam(
+        SemaRef, DC, TTP, Args, NewDepth, NewIndex,
+        /*EvaluateConstraint=*/EvaluateConstraint);
+  if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
+    return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth);
+  if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
+    return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth);
+  llvm_unreachable("Unhandled template parameter types");
+}
+
 /// Transform to convert portions of a constructor declaration into the
 /// corresponding deduction guide, per C++1z [over.match.class.deduct]p1.
 struct ConvertConstructorToDeductionGuideTransform {
@@ -358,29 +374,28 @@ struct ConvertConstructorToDeductionGuideTransform {
         Args.addOuterRetainedLevel();
         if (NestedPattern)
           Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
-        NamedDecl *NewParam = transformTemplateParameter(Param, Args);
+        auto [Depth, Index] = getDepthAndIndex(Param);
+        NamedDecl *NewParam = transformTemplateParameter(
+            SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment, Depth - 1);
         if (!NewParam)
           return nullptr;
         // Constraints require that we substitute depth-1 arguments
         // to match depths when substituted for evaluation later
-        Depth1Args.push_back(SemaRef.Context.getCanonicalTemplateArgument(
-            SemaRef.Context.getInjectedTemplateArg(NewParam)));
+        Depth1Args.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam));
 
         if (NestedPattern) {
-          TemplateDeclInstantiator Instantiator(SemaRef, DC,
-                                                OuterInstantiationArgs);
-          Instantiator.setEvaluateConstraints(false);
-          SemaRef.runWithSufficientStackSpace(NewParam->getLocation(), [&] {
-            NewParam = cast<NamedDecl>(Instantiator.Visit(NewParam));
-          });
+          auto [Depth, Index] = getDepthAndIndex(NewParam);
+          NewParam = transformTemplateParameter(
+              SemaRef, DC, NewParam, OuterInstantiationArgs, Index,
+              Depth - OuterInstantiationArgs.getNumSubstitutedLevels(),
+              /*EvaluateConstraint=*/false);
         }
 
         assert(NewParam->getTemplateDepth() == 0 &&
                "Unexpected template parameter depth");
 
         AllParams.push_back(NewParam);
-        SubstArgs.push_back(SemaRef.Context.getCanonicalTemplateArgument(
-            SemaRef.Context.getInjectedTemplateArg(NewParam)));
+        SubstArgs.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam));
       }
 
       // Substitute new template parameters into requires-clause if present.
@@ -481,25 +496,6 @@ struct ConvertConstructorToDeductionGuideTransform {
   }
 
 private:
-  /// Transform a constructor template parameter into a deduction guide template
-  /// parameter, rebuilding any internal references to earlier parameters and
-  /// renumbering as we go.
-  NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam,
-                                        MultiLevelTemplateArgumentList &Args) {
-    if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
-      return transformTemplateTypeParam(
-          SemaRef, DC, TTP, Args, TTP->getDepth() - 1,
-          Depth1IndexAdjustment + TTP->getIndex());
-    if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
-      return transformTemplateParam(SemaRef, DC, TTP, Args,
-                                    Depth1IndexAdjustment + TTP->getIndex(),
-                                    TTP->getDepth() - 1);
-    auto *NTTP = cast<NonTypeTemplateParmDecl>(TemplateParam);
-    return transformTemplateParam(SemaRef, DC, NTTP, Args,
-                                  Depth1IndexAdjustment + NTTP->getIndex(),
-                                  NTTP->getDepth() - 1);
-  }
-
   QualType transformFunctionProtoType(
       TypeLocBuilder &TLB, FunctionProtoTypeLoc TL,
       SmallVectorImpl<ParmVarDecl *> &Params,
@@ -636,26 +632,6 @@ struct ConvertConstructorToDeductionGuideTransform {
   }
 };
 
-unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) {
-  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
-    return TTP->getDepth();
-  if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
-    return TTP->getDepth();
-  if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
-    return NTTP->getDepth();
-  llvm_unreachable("Unhandled template parameter types");
-}
-
-unsigned getTemplateParameterIndex(NamedDecl *TemplateParam) {
-  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
-    return TTP->getIndex();
-  if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
-    return TTP->getIndex();
-  if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
-    return NTTP->getIndex();
-  llvm_unreachable("Unhandled template parameter types");
-}
-
 // Find all template parameters that appear in the given DeducedArgs.
 // Return the indices of the template parameters in the TemplateParams.
 SmallVector<unsigned> TemplateParamsReferencedInTemplateArgumentList(
@@ -691,8 +667,10 @@ SmallVector<unsigned> TemplateParamsReferencedInTemplateArgumentList(
 
     void MarkAppeared(NamedDecl *ND) {
       if (llvm::isa<NonTypeTemplateParmDecl, TemplateTypeParmDecl,
-                    TemplateTemplateParmDecl>(ND))
-        Mark(getTemplateParameterDepth(ND), getTemplateParameterIndex(ND));
+                    TemplateTemplateParmDecl>(ND)) {
+        auto [Depth, Index] = getDepthAndIndex(ND);
+        Mark(Depth, Index);
+      }
     }
     void Mark(unsigned Depth, unsigned Index) {
       if (Index < TemplateParamList->size() &&
@@ -724,20 +702,6 @@ bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) {
   return false;
 }
 
-NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC,
-                                      NamedDecl *TemplateParam,
-                                      MultiLevelTemplateArgumentList &Args,
-                                      unsigned NewIndex, unsigned NewDepth) {
-  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
-    return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth,
-                                      NewIndex);
-  if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
-    return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth);
-  if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
-    return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth);
-  llvm_unreachable("Unhandled template parameter types");
-}
-
 // Build the associated constraints for the alias deduction guides.
 // C++ [over.match.class.deduct]p3.3:
 //   The associated constraints ([temp.constr.decl]) are the conjunction of the
@@ -793,10 +757,10 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
     NamedDecl *NewParam = transformTemplateParameter(
         SemaRef, AliasTemplate->getDeclContext(), TP, Args,
         /*NewIndex=*/AdjustedAliasTemplateArgs.size(),
-        getTemplateParameterDepth(TP) + AdjustDepth);
+        getDepthAndIndex(TP).first + AdjustDepth);
 
-    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
-        Context.getInjectedTemplateArg(NewParam));
+    TemplateArgument NewTemplateArgument =
+        Context.getInjectedTemplateArg(NewParam);
     AdjustedAliasTemplateArgs.push_back(NewTemplateArgument);
   }
   // Template arguments used to transform the template arguments in
@@ -816,14 +780,14 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
       Args.setKind(TemplateSubstitutionKind::Rewrite);
       Args.addOuterTemplateArguments(TemplateArgsForBuildingRC);
       // Rebuild the template parameter with updated depth and index.
-      NamedDecl *NewParam = transformTemplateParameter(
-          SemaRef, F->getDeclContext(), TP, Args,
-          /*NewIndex=*/FirstUndeducedParamIdx,
-          getTemplateParameterDepth(TP) + AdjustDepth);
+      NamedDecl *NewParam =
+          transformTemplateParameter(SemaRef, F->getDeclContext(), TP, Args,
+                                     /*NewIndex=*/FirstUndeducedParamIdx,
+                                     getDepthAndIndex(TP).first + AdjustDepth);
       FirstUndeducedParamIdx += 1;
       assert(TemplateArgsForBuildingRC[Index].isNull());
-      TemplateArgsForBuildingRC[Index] = Context.getCanonicalTemplateArgument(
-          Context.getInjectedTemplateArg(NewParam));
+      TemplateArgsForBuildingRC[Index] =
+          Context.getInjectedTemplateArg(NewParam);
       continue;
     }
     TemplateArgumentLoc Input =
@@ -921,10 +885,10 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef,
       NamedDecl *NewParam = transformTemplateParameter(
           SemaRef, AliasTemplate->getDeclContext(), TP, Args,
           /*NewIndex=*/TransformedTemplateArgs.size(),
-          getTemplateParameterDepth(TP) + AdjustDepth);
+          getDepthAndIndex(TP).first + AdjustDepth);
 
-      auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
-          Context.getInjectedTemplateArg(NewParam));
+      TemplateArgument NewTemplateArgument =
+          Context.getInjectedTemplateArg(NewParam);
       TransformedTemplateArgs.push_back(NewTemplateArgument);
     }
     // Transformed the ReturnType to restore the uninstantiated depth.
@@ -1083,12 +1047,11 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
     Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
     NamedDecl *NewParam = transformTemplateParameter(
         SemaRef, AliasTemplate->getDeclContext(), TP, Args,
-        /*NewIndex=*/FPrimeTemplateParams.size(),
-        getTemplateParameterDepth(TP));
+        /*NewIndex=*/FPrimeTemplateParams.size(), getDepthAndIndex(TP).first);
     FPrimeTemplateParams.push_back(NewParam);
 
-    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
-        Context.getInjectedTemplateArg(NewParam));
+    TemplateArgument NewTemplateArgument =
+        Context.getInjectedTemplateArg(NewParam);
     TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument;
   }
   unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size();
@@ -1103,14 +1066,13 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
     Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime);
     NamedDecl *NewParam = transformTemplateParameter(
         SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(),
-        getTemplateParameterDepth(TP));
+        getDepthAndIndex(TP).first);
     FPrimeTemplateParams.push_back(NewParam);
 
     assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() &&
            "The argument must be null before setting");
     TemplateArgsForBuildingFPrime[FTemplateParamIdx] =
-        Context.getCanonicalTemplateArgument(
-            Context.getInjectedTemplateArg(NewParam));
+        Context.getInjectedTemplateArg(NewParam);
   }
 
   // To form a deduction guide f' from f, we leverage clang's instantiation
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 10e555e313de7..821dbae2aff13 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2673,16 +2673,12 @@ createSubstDiag(Sema &S, TemplateDeductionInfo &Info,
   } else {
     ErrorLoc = Info.getLocation();
   }
-  char *MessageBuf = new (S.Context) char[Message.size()];
-  std::copy(Message.begin(), Message.end(), MessageBuf);
   SmallString<128> Entity;
   llvm::raw_svector_ostream OS(Entity);
   Printer(OS);
-  char *EntityBuf = new (S.Context) char[Entity.size()];
-  std::copy(Entity.begin(), Entity.end(), EntityBuf);
-  return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{
-      StringRef(EntityBuf, Entity.size()), ErrorLoc,
-      StringRef(MessageBuf, Message.size())};
+  const ASTContext &C = S.Context;
+  return new (C) concepts::Requirement::SubstitutionDiagnostic{
+      C.backupStr(Entity), ErrorLoc, C.backupStr(Message)};
 }
 
 concepts::Requirement::SubstitutionDiagnostic *
@@ -2691,10 +2687,9 @@ concepts::createSubstDiagAt(Sema &S, SourceLocation Location,
   SmallString<128> Entity;
   llvm::raw_svector_ostream OS(Entity);
   Printer(OS);
-  char *EntityBuf = new (S.Context) char[Entity.size()];
-  llvm::copy(Entity, EntityBuf);
-  return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{
-      /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()),
+  const ASTContext &C = S.Context;
+  return new (C) concepts::Requirement::SubstitutionDiagnostic{
+      /*SubstitutedEntity=*/C.backupStr(Entity),
       /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()};
 }
 
@@ -2870,23 +2865,21 @@ TemplateInstantiator::TransformNestedRequirement(
     assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled "
                                        "by CheckConstraintSatisfaction.");
   }
+  ASTContext &C = SemaRef.Context;
   if (TransConstraint.isUsable() &&
       TransConstraint.get()->isInstantiationDependent())
-    return new (SemaRef.Context)
-        concepts::NestedRequirement(TransConstraint.get());
+    return new (C) concepts::NestedRequirement(TransConstraint.get());
   if (TransConstraint.isInvalid() || !TransConstraint.get() ||
       Satisfaction.HasSubstitutionFailure()) {
     SmallString<128> Entity;
     llvm::raw_svector_ostream OS(Entity);
     Req->getConstraintExpr()->printPretty(OS, nullptr,
                                           SemaRef.getPrintingPolicy());
-    char *EntityBuf = new (SemaRef.Context) char[Entity.size()];
-    std::copy(Entity.begin(), Entity.end(), EntityBuf);
-    return new (SemaRef.Context) concepts::NestedRequirement(
-        SemaRef.Context, StringRef(EntityBuf, Entity.size()), Satisfaction);
+    return new (C) concepts::NestedRequirement(
+        SemaRef.Context, C.backupStr(Entity), Satisfaction);
   }
-  return new (SemaRef.Context) concepts::NestedRequirement(
-      SemaRef.Context, TransConstraint.get(), Satisfaction);
+  return new (C)
+      concepts::NestedRequirement(C, TransConstraint.get(), Satisfaction);
 }
 
 TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T,
@@ -3523,11 +3516,16 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation,
     return true;
 
   llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() {
-    std::string Name;
-    llvm::raw_string_ostream OS(Name);
+    llvm::TimeTraceMetadata M;
+    llvm::raw_string_ostream OS(M.Detail);
     Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(),
                                         /*Qualified=*/true);
-    return Name;
+    if (llvm::isTimeTraceVerbose()) {
+      auto Loc = SourceMgr.getExpansionLoc(Instantiation->getLocation());
+      M.File = SourceMgr.getFilename(Loc);
+      M.Line = SourceMgr.getExpansionLineNumber(Loc);
+    }
+    return M;
   });
 
   Pattern = PatternDef;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 8731602be0db6..27c07de65378b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -331,7 +331,7 @@ instantiateDependentModeAttr(Sema &S,
 static void instantiateOMPDeclareSimdDeclAttr(
     Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
     const OMPDeclareSimdDeclAttr &Attr, Decl *New) {
-  // Allow 'this' in clauses with varlists.
+  // Allow 'this' in clauses with varlist.
   if (auto *FTD = dyn_cast<FunctionTemplateDecl>(New))
     New = FTD->getTemplatedDecl();
   auto *FD = cast<FunctionDecl>(New);
@@ -414,7 +414,7 @@ static void instantiateOMPDeclareSimdDeclAttr(
 static void instantiateOMPDeclareVariantAttr(
     Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
     const OMPDeclareVariantAttr &Attr, Decl *New) {
-  // Allow 'this' in clauses with varlists.
+  // Allow 'this' in clauses with varlist.
   if (auto *FTD = dyn_cast<FunctionTemplateDecl>(New))
     New = FTD->getTemplatedDecl();
   auto *FD = cast<FunctionDecl>(New);
@@ -4108,7 +4108,7 @@ Decl *TemplateDeclInstantiator::VisitUsingPackDecl(UsingPackDecl *D) {
 Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl(
                                      OMPThreadPrivateDecl *D) {
   SmallVector<Expr *, 5> Vars;
-  for (auto *I : D->varlists()) {
+  for (auto *I : D->varlist()) {
     Expr *Var = SemaRef.SubstExpr(I, TemplateArgs).get();
     assert(isa<DeclRefExpr>(Var) && "threadprivate arg is not a DeclRefExpr");
     Vars.push_back(Var);
@@ -4125,7 +4125,7 @@ Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl(
 
 Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) {
   SmallVector<Expr *, 5> Vars;
-  for (auto *I : D->varlists()) {
+  for (auto *I : D->varlist()) {
     Expr *Var = SemaRef.SubstExpr(I, TemplateArgs).get();
     assert(isa<DeclRefExpr>(Var) && "allocate arg is not a DeclRefExpr");
     Vars.push_back(Var);
@@ -4302,7 +4302,7 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) {
   for (OMPClause *C : D->clauselists()) {
     auto *OldC = cast<OMPMapClause>(C);
     SmallVector<Expr *, 4> NewVars;
-    for (Expr *OE : OldC->varlists()) {
+    for (Expr *OE : OldC->varlist()) {
       Expr *NE = SemaRef.SubstExpr(OE, TemplateArgs).get();
       if (!NE) {
         IsCorrect = false;
@@ -5486,11 +5486,16 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
   }
 
   llvm::TimeTraceScope TimeScope("InstantiateFunction", [&]() {
-    std::string Name;
-    llvm::raw_string_ostream OS(Name);
+    llvm::TimeTraceMetadata M;
+    llvm::raw_string_ostream OS(M.Detail);
     Function->getNameForDiagnostic(OS, getPrintingPolicy(),
                                    /*Qualified=*/true);
-    return Name;
+    if (llvm::isTimeTraceVerbose()) {
+      auto Loc = SourceMgr.getExpansionLoc(Function->getLocation());
+      M.File = SourceMgr.getFilename(Loc);
+      M.Line = SourceMgr.getExpansionLineNumber(Loc);
+    }
+    return M;
   });
 
   // If we're performing recursive template instantiation, create our own
@@ -6761,7 +6766,12 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D,
                   getTrivialTemplateArgumentLoc(UnpackedArg, QualType(), Loc));
           }
           QualType T = CheckTemplateIdType(TemplateName(TD), Loc, Args);
-          if (T.isNull())
+          // We may get a non-null type with errors, in which case
+          // `getAsCXXRecordDecl` will return `nullptr`. For instance, this
+          // happens when one of the template arguments is an invalid
+          // expression. We return early to avoid triggering the assertion
+          // about the `CodeSynthesisContext`.
+          if (T.isNull() || T->containsErrors())
             return nullptr;
           CXXRecordDecl *SubstRecord = T->getAsCXXRecordDecl();
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index e3383f3623bcd..570aa10a58019 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -6389,11 +6389,10 @@ TypeResult Sema::ActOnTypeName(Declarator &D) {
     CheckExtraCXXDefaultArguments(D);
   }
 
-  if (const AutoType *AutoT = T->getAs<AutoType>())
-    CheckConstrainedAuto(
-        AutoT,
-        TInfo->getTypeLoc().getContainedAutoTypeLoc().getConceptNameLoc());
-
+  if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) {
+    const AutoType *AT = TL.getTypePtr();
+    CheckConstrainedAuto(AT, TL.getConceptNameLoc());
+  }
   return CreateParsedType(T, TInfo);
 }
 
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index be26454ce909d..8f9057bbaf259 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -502,7 +502,6 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   switch (BuiltinID) {
   default:
     return false;
-  case X86::BI__builtin_ia32_vec_ext_v2si:
   case X86::BI__builtin_ia32_vec_ext_v2di:
   case X86::BI__builtin_ia32_vextractf128_pd256:
   case X86::BI__builtin_ia32_vextractf128_ps256:
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 34c92de087c1c..98ff398344b67 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1669,15 +1669,15 @@ class TreeTransform {
   ///
   /// By default, performs semantic analysis to build the new statement.
   /// Subclasses may override this routine to provide different behavior.
-  StmtResult RebuildOMPExecutableDirective(
-      OpenMPDirectiveKind Kind, DeclarationNameInfo DirName,
-      OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
-      Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc,
-      OpenMPDirectiveKind PrevMappedDirective = OMPD_unknown) {
+  StmtResult RebuildOMPExecutableDirective(OpenMPDirectiveKind Kind,
+                                           DeclarationNameInfo DirName,
+                                           OpenMPDirectiveKind CancelRegion,
+                                           ArrayRef<OMPClause *> Clauses,
+                                           Stmt *AStmt, SourceLocation StartLoc,
+                                           SourceLocation EndLoc) {
 
     return getSema().OpenMP().ActOnOpenMPExecutableDirective(
-        Kind, DirName, CancelRegion, Clauses, AStmt, StartLoc, EndLoc,
-        PrevMappedDirective);
+        Kind, DirName, CancelRegion, Clauses, AStmt, StartLoc, EndLoc);
   }
 
   /// Build a new OpenMP 'if' clause.
@@ -9210,8 +9210,7 @@ StmtResult TreeTransform<Derived>::TransformOMPExecutableDirective(
 
   return getDerived().RebuildOMPExecutableDirective(
       D->getDirectiveKind(), DirName, CancelRegion, TClauses,
-      AssociatedStmt.get(), D->getBeginLoc(), D->getEndLoc(),
-      D->getMappedDirective());
+      AssociatedStmt.get(), D->getBeginLoc(), D->getEndLoc());
 }
 
 template <typename Derived>
@@ -9267,6 +9266,28 @@ TreeTransform<Derived>::TransformOMPUnrollDirective(OMPUnrollDirective *D) {
   return Res;
 }
 
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPReverseDirective(OMPReverseDirective *D) {
+  DeclarationNameInfo DirName;
+  getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+      D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc());
+  StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+  getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get());
+  return Res;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPInterchangeDirective(
+    OMPInterchangeDirective *D) {
+  DeclarationNameInfo DirName;
+  getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+      D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc());
+  StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+  getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get());
+  return Res;
+}
+
 template <typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformOMPForDirective(OMPForDirective *D) {
@@ -10316,7 +10337,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPInitClause(OMPInitClause *C) {
 
   OMPInteropInfo InteropInfo(C->getIsTarget(), C->getIsTargetSync());
   InteropInfo.PreferTypes.reserve(C->varlist_size() - 1);
-  for (Expr *E : llvm::drop_begin(C->varlists())) {
+  for (Expr *E : llvm::drop_begin(C->varlist())) {
     ExprResult ER = getDerived().TransformExpr(cast<Expr>(E));
     if (ER.isInvalid())
       return nullptr;
@@ -10454,7 +10475,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPPrivateClause(OMPPrivateClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10469,7 +10490,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPFirstprivateClause(
     OMPFirstprivateClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10484,7 +10505,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPLastprivateClause(OMPLastprivateClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10500,7 +10521,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPSharedClause(OMPSharedClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10515,7 +10536,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPReductionClause(OMPReductionClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10562,7 +10583,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPTaskReductionClause(
     OMPTaskReductionClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10608,7 +10629,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPInReductionClause(OMPInReductionClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10654,7 +10675,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPLinearClause(OMPLinearClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10674,7 +10695,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPAlignedClause(OMPAlignedClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10693,7 +10714,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPCopyinClause(OMPCopyinClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10708,7 +10729,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPCopyprivateClause(OMPCopyprivateClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10722,7 +10743,7 @@ template <typename Derived>
 OMPClause *TreeTransform<Derived>::TransformOMPFlushClause(OMPFlushClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10754,7 +10775,7 @@ TreeTransform<Derived>::TransformOMPDependClause(OMPDependClause *C) {
     DepModifier = DepModRes.get();
   }
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10785,7 +10806,7 @@ bool transformOMPMappableExprListClause(
     llvm::SmallVectorImpl<Expr *> &UnresolvedMappers) {
   // Transform expressions in the list.
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = TT.getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return true;
@@ -10865,7 +10886,7 @@ TreeTransform<Derived>::TransformOMPAllocateClause(OMPAllocateClause *C) {
   }
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -10997,7 +11018,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPUseDevicePtrClause(
     OMPUseDevicePtrClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11012,7 +11033,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPUseDeviceAddrClause(
     OMPUseDeviceAddrClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11027,7 +11048,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11042,7 +11063,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPHasDeviceAddrClause(
     OMPHasDeviceAddrClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11057,7 +11078,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPNontemporalClause(OMPNontemporalClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11072,7 +11093,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPInclusiveClause(OMPInclusiveClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11087,7 +11108,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPExclusiveClause(OMPExclusiveClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
@@ -11134,7 +11155,7 @@ TreeTransform<Derived>::TransformOMPAffinityClause(OMPAffinityClause *C) {
     if (ModifierRes.isInvalid())
       return nullptr;
   }
-  for (Expr *E : C->varlists()) {
+  for (Expr *E : C->varlist()) {
     ExprResult Locator = getDerived().TransformExpr(E);
     if (Locator.isInvalid())
       continue;
@@ -11174,7 +11195,7 @@ OMPClause *
 TreeTransform<Derived>::TransformOMPDoacrossClause(OMPDoacrossClause *C) {
   llvm::SmallVector<Expr *, 16> Vars;
   Vars.reserve(C->varlist_size());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
     if (EVar.isInvalid())
       return nullptr;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 5a53d0d4ebade..a4bf5bf2f748e 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -1051,10 +1051,10 @@ IdentifierID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d)
   return Reader.getGlobalIdentifierID(F, RawID >> 1);
 }
 
-static void markIdentifierFromAST(ASTReader &Reader, IdentifierInfo &II) {
+static void markIdentifierFromAST(ASTReader &Reader, IdentifierInfo &II,
+                                  bool IsModule) {
   if (!II.isFromAST()) {
     II.setIsFromAST();
-    bool IsModule = Reader.getPreprocessor().getCurrentModule() != nullptr;
     if (isInterestingIdentifier(Reader, II, IsModule))
       II.setChangedSinceDeserialization();
   }
@@ -1080,7 +1080,8 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k,
     II = &Reader.getIdentifierTable().getOwn(k);
     KnownII = II;
   }
-  markIdentifierFromAST(Reader, *II);
+  bool IsModule = Reader.getPreprocessor().getCurrentModule() != nullptr;
+  markIdentifierFromAST(Reader, *II, IsModule);
   Reader.markIdentifierUpToDate(II);
 
   IdentifierID ID = Reader.getGlobalIdentifierID(F, RawID);
@@ -3023,8 +3024,9 @@ ASTReader::ReadControlBlock(ModuleFile &F,
     case METADATA: {
       if (Record[0] != VERSION_MAJOR && !DisableValidation) {
         if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
-          Diag(Record[0] < VERSION_MAJOR? diag::err_pch_version_too_old
-                                        : diag::err_pch_version_too_new);
+          Diag(Record[0] < VERSION_MAJOR ? diag::err_ast_file_version_too_old
+                                         : diag::err_ast_file_version_too_new)
+              << moduleKindForDiagnostic(F.Kind) << F.FileName;
         return VersionMismatch;
       }
 
@@ -3037,7 +3039,8 @@ ASTReader::ReadControlBlock(ModuleFile &F,
           return OutOfDate;
 
         if (!AllowASTWithCompilerErrors) {
-          Diag(diag::err_pch_with_compiler_errors);
+          Diag(diag::err_ast_file_with_compiler_errors)
+              << moduleKindForDiagnostic(F.Kind) << F.FileName;
           return HadErrors;
         }
       }
@@ -3060,7 +3063,9 @@ ASTReader::ReadControlBlock(ModuleFile &F,
       StringRef ASTBranch = Blob;
       if (StringRef(CurBranch) != ASTBranch && !DisableValidation) {
         if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
-          Diag(diag::err_pch_different_branch) << ASTBranch << CurBranch;
+          Diag(diag::err_ast_file_different_branch)
+              << moduleKindForDiagnostic(F.Kind) << F.FileName << ASTBranch
+              << CurBranch;
         return VersionMismatch;
       }
       break;
@@ -4543,7 +4548,7 @@ ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName, ModuleKind Type,
 
       // Mark this identifier as being from an AST file so that we can track
       // whether we need to serialize it.
-      markIdentifierFromAST(*this, *II);
+      markIdentifierFromAST(*this, *II, /*IsModule=*/true);
 
       // Associate the ID with the identifier so that the writer can reuse it.
       auto ID = Trait.ReadIdentifierID(Data + KeyDataLen.first);
@@ -4827,7 +4832,8 @@ ASTReader::ReadASTCore(StringRef FileName,
     case AST_BLOCK_ID:
       if (!HaveReadControlBlock) {
         if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
-          Diag(diag::err_pch_version_too_old);
+          Diag(diag::err_ast_file_version_too_old)
+              << moduleKindForDiagnostic(Type) << FileName;
         return VersionMismatch;
       }
 
@@ -8968,7 +8974,8 @@ IdentifierInfo *ASTReader::DecodeIdentifierInfo(IdentifierID ID) {
     auto Key = Trait.ReadKey(Data, KeyDataLen.first);
     auto &II = PP.getIdentifierTable().get(Key);
     IdentifiersLoaded[Index] = &II;
-    markIdentifierFromAST(*this,  II);
+    bool IsModule = getPreprocessor().getCurrentModule() != nullptr;
+    markIdentifierFromAST(*this, II, IsModule);
     if (DeserializationListener)
       DeserializationListener->IdentifierRead(ID, &II);
   }
@@ -9882,6 +9889,7 @@ void ASTReader::finishPendingActions() {
             // We only perform ODR checks for decls not in the explicit
             // global module fragment.
             !shouldSkipCheckingODR(FD) &&
+            !shouldSkipCheckingODR(NonConstDefn) &&
             FD->getODRHash() != NonConstDefn->getODRHash()) {
           if (!isa<CXXMethodDecl>(FD)) {
             PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn);
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 3de9d327f1b3b..31ab6c651d59f 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -821,7 +821,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
       Reader.mergeDefinitionVisibility(OldDef, ED);
       // We don't want to check the ODR hash value for declarations from global
       // module fragment.
-      if (!shouldSkipCheckingODR(ED) &&
+      if (!shouldSkipCheckingODR(ED) && !shouldSkipCheckingODR(OldDef) &&
           OldDef->getODRHash() != ED->getODRHash())
         Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED);
     } else {
@@ -2134,7 +2134,7 @@ void ASTDeclReader::MergeDefinitionData(
   }
 
   // We don't want to check ODR for decls in the global module fragment.
-  if (shouldSkipCheckingODR(MergeDD.Definition))
+  if (shouldSkipCheckingODR(MergeDD.Definition) || shouldSkipCheckingODR(D))
     return;
 
   if (D->getODRHash() != MergeDD.ODRHash) {
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 39cecf750cc4e..caaa2da3cfe53 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -795,29 +795,22 @@ void ASTStmtReader::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) {
   E->setRParenLoc(readSourceLocation());
 }
 
-static StringRef saveStrToCtx(const std::string &S, ASTContext &Ctx) {
-  char *Buf = new (Ctx) char[S.size()];
-  std::copy(S.begin(), S.end(), Buf);
-  return StringRef(Buf, S.size());
-}
-
 static ConstraintSatisfaction
 readConstraintSatisfaction(ASTRecordReader &Record) {
   ConstraintSatisfaction Satisfaction;
   Satisfaction.IsSatisfied = Record.readInt();
   Satisfaction.ContainsErrors = Record.readInt();
+  const ASTContext &C = Record.getContext();
   if (!Satisfaction.IsSatisfied) {
     unsigned NumDetailRecords = Record.readInt();
     for (unsigned i = 0; i != NumDetailRecords; ++i) {
       if (/* IsDiagnostic */Record.readInt()) {
         SourceLocation DiagLocation = Record.readSourceLocation();
-        StringRef DiagMessage =
-            saveStrToCtx(Record.readString(), Record.getContext());
+        StringRef DiagMessage = C.backupStr(Record.readString());
 
         Satisfaction.Details.emplace_back(
-            new (Record.getContext())
-                ConstraintSatisfaction::SubstitutionDiagnostic(DiagLocation,
-                                                               DiagMessage));
+            new (C) ConstraintSatisfaction::SubstitutionDiagnostic(
+                DiagLocation, DiagMessage));
       } else
         Satisfaction.Details.emplace_back(Record.readExpr());
     }
@@ -838,12 +831,10 @@ void ASTStmtReader::VisitConceptSpecializationExpr(
 
 static concepts::Requirement::SubstitutionDiagnostic *
 readSubstitutionDiagnostic(ASTRecordReader &Record) {
-  StringRef SubstitutedEntity =
-      saveStrToCtx(Record.readString(), Record.getContext());
-
+  const ASTContext &C = Record.getContext();
+  StringRef SubstitutedEntity = C.backupStr(Record.readString());
   SourceLocation DiagLoc = Record.readSourceLocation();
-  StringRef DiagMessage =
-      saveStrToCtx(Record.readString(), Record.getContext());
+  StringRef DiagMessage = C.backupStr(Record.readString());
 
   return new (Record.getContext())
       concepts::Requirement::SubstitutionDiagnostic{SubstitutedEntity, DiagLoc,
@@ -929,22 +920,21 @@ void ASTStmtReader::VisitRequiresExpr(RequiresExpr *E) {
                   std::move(*Req));
       } break;
       case concepts::Requirement::RK_Nested: {
+        ASTContext &C = Record.getContext();
         bool HasInvalidConstraint = Record.readInt();
         if (HasInvalidConstraint) {
-          StringRef InvalidConstraint =
-              saveStrToCtx(Record.readString(), Record.getContext());
-          R = new (Record.getContext()) concepts::NestedRequirement(
+          StringRef InvalidConstraint = C.backupStr(Record.readString());
+          R = new (C) concepts::NestedRequirement(
               Record.getContext(), InvalidConstraint,
               readConstraintSatisfaction(Record));
           break;
         }
         Expr *E = Record.readExpr();
         if (E->isInstantiationDependent())
-          R = new (Record.getContext()) concepts::NestedRequirement(E);
+          R = new (C) concepts::NestedRequirement(E);
         else
-          R = new (Record.getContext())
-              concepts::NestedRequirement(Record.getContext(), E,
-                                          readConstraintSatisfaction(Record));
+          R = new (C) concepts::NestedRequirement(
+              C, E, readConstraintSatisfaction(Record));
       } break;
     }
     if (!R)
@@ -2432,7 +2422,6 @@ void ASTStmtReader::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.readOMPChildren(E->Data);
   E->setLocStart(readSourceLocation());
   E->setLocEnd(readSourceLocation());
-  E->setMappedDirective(Record.readEnum<OpenMPDirectiveKind>());
 }
 
 void ASTStmtReader::VisitOMPLoopBasedDirective(OMPLoopBasedDirective *D) {
@@ -2477,6 +2466,14 @@ void ASTStmtReader::VisitOMPUnrollDirective(OMPUnrollDirective *D) {
   VisitOMPLoopTransformationDirective(D);
 }
 
+void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+}
+
+void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+}
+
 void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
   VisitOMPLoopDirective(D);
   D->setHasCancel(Record.readBool());
@@ -3500,6 +3497,22 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       break;
     }
 
+    case STMT_OMP_REVERSE_DIRECTIVE: {
+      assert(Record[ASTStmtReader::NumStmtFields] == 1 &&
+             "Reverse directive accepts only a single loop");
+      assert(Record[ASTStmtReader::NumStmtFields + 1] == 0 &&
+             "Reverse directive has no clauses");
+      S = OMPReverseDirective::CreateEmpty(Context);
+      break;
+    }
+
+    case STMT_OMP_INTERCHANGE_DIRECTIVE: {
+      unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPInterchangeDirective::CreateEmpty(Context, NumClauses, NumLoops);
+      break;
+    }
+
     case STMT_OMP_FOR_DIRECTIVE: {
       unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields];
       unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index c78d8943d6d92..f0f9d397f1717 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7220,7 +7220,7 @@ void OMPClauseWriter::VisitOMPNogroupClause(OMPNogroupClause *) {}
 
 void OMPClauseWriter::VisitOMPInitClause(OMPInitClause *C) {
   Record.push_back(C->varlist_size());
-  for (Expr *VE : C->varlists())
+  for (Expr *VE : C->varlist())
     Record.AddStmt(VE);
   Record.writeBool(C->getIsTarget());
   Record.writeBool(C->getIsTargetSync());
@@ -7266,7 +7266,7 @@ void OMPClauseWriter::VisitOMPAlignClause(OMPAlignClause *C) {
 void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     Record.AddStmt(VE);
   }
   for (auto *VE : C->private_copies()) {
@@ -7278,7 +7278,7 @@ void OMPClauseWriter::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
   Record.push_back(C->varlist_size());
   VisitOMPClauseWithPreInit(C);
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     Record.AddStmt(VE);
   }
   for (auto *VE : C->private_copies()) {
@@ -7296,7 +7296,7 @@ void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
   Record.writeEnum(C->getKind());
   Record.AddSourceLocation(C->getKindLoc());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *E : C->private_copies())
     Record.AddStmt(E);
@@ -7311,7 +7311,7 @@ void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
 void OMPClauseWriter::VisitOMPSharedClause(OMPSharedClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
 }
 
@@ -7324,7 +7324,7 @@ void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
   Record.AddSourceLocation(C->getColonLoc());
   Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
   Record.AddDeclarationNameInfo(C->getNameInfo());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *VE : C->privates())
     Record.AddStmt(VE);
@@ -7351,7 +7351,7 @@ void OMPClauseWriter::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {
   Record.AddSourceLocation(C->getColonLoc());
   Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
   Record.AddDeclarationNameInfo(C->getNameInfo());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *VE : C->privates())
     Record.AddStmt(VE);
@@ -7370,7 +7370,7 @@ void OMPClauseWriter::VisitOMPInReductionClause(OMPInReductionClause *C) {
   Record.AddSourceLocation(C->getColonLoc());
   Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
   Record.AddDeclarationNameInfo(C->getNameInfo());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *VE : C->privates())
     Record.AddStmt(VE);
@@ -7391,7 +7391,7 @@ void OMPClauseWriter::VisitOMPLinearClause(OMPLinearClause *C) {
   Record.AddSourceLocation(C->getColonLoc());
   Record.push_back(C->getModifier());
   Record.AddSourceLocation(C->getModifierLoc());
-  for (auto *VE : C->varlists()) {
+  for (auto *VE : C->varlist()) {
     Record.AddStmt(VE);
   }
   for (auto *VE : C->privates()) {
@@ -7416,7 +7416,7 @@ void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   Record.AddStmt(C->getAlignment());
 }
@@ -7424,7 +7424,7 @@ void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
 void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *E : C->source_exprs())
     Record.AddStmt(E);
@@ -7437,7 +7437,7 @@ void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
 void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *E : C->source_exprs())
     Record.AddStmt(E);
@@ -7450,7 +7450,7 @@ void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
 void OMPClauseWriter::VisitOMPFlushClause(OMPFlushClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
 }
 
@@ -7468,7 +7468,7 @@ void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) {
   Record.AddSourceLocation(C->getDependencyLoc());
   Record.AddSourceLocation(C->getColonLoc());
   Record.AddSourceLocation(C->getOmpAllMemoryLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (unsigned I = 0, E = C->getNumLoops(); I < E; ++I)
     Record.AddStmt(C->getLoopData(I));
@@ -7500,7 +7500,7 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
   Record.push_back(C->getMapType());
   Record.AddSourceLocation(C->getMapLoc());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *E : C->mapperlists())
     Record.AddStmt(E);
@@ -7523,7 +7523,7 @@ void OMPClauseWriter::VisitOMPAllocateClause(OMPAllocateClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
   Record.AddSourceLocation(C->getColonLoc());
   Record.AddStmt(C->getAllocator());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
 }
 
@@ -7596,7 +7596,7 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) {
   Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc());
   Record.AddDeclarationNameInfo(C->getMapperIdInfo());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *E : C->mapperlists())
     Record.AddStmt(E);
@@ -7626,7 +7626,7 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) {
   Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc());
   Record.AddDeclarationNameInfo(C->getMapperIdInfo());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *E : C->mapperlists())
     Record.AddStmt(E);
@@ -7649,7 +7649,7 @@ void OMPClauseWriter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
   Record.push_back(C->getTotalComponentListNum());
   Record.push_back(C->getTotalComponentsNum());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *VE : C->private_copies())
     Record.AddStmt(VE);
@@ -7673,7 +7673,7 @@ void OMPClauseWriter::VisitOMPUseDeviceAddrClause(OMPUseDeviceAddrClause *C) {
   Record.push_back(C->getTotalComponentListNum());
   Record.push_back(C->getTotalComponentsNum());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *D : C->all_decls())
     Record.AddDeclRef(D);
@@ -7693,7 +7693,7 @@ void OMPClauseWriter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
   Record.push_back(C->getTotalComponentListNum());
   Record.push_back(C->getTotalComponentsNum());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *D : C->all_decls())
     Record.AddDeclRef(D);
@@ -7713,7 +7713,7 @@ void OMPClauseWriter::VisitOMPHasDeviceAddrClause(OMPHasDeviceAddrClause *C) {
   Record.push_back(C->getTotalComponentListNum());
   Record.push_back(C->getTotalComponentsNum());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *E : C->varlists())
+  for (auto *E : C->varlist())
     Record.AddStmt(E);
   for (auto *D : C->all_decls())
     Record.AddDeclRef(D);
@@ -7765,7 +7765,7 @@ void OMPClauseWriter::VisitOMPMessageClause(OMPMessageClause *C) {
 void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (auto *E : C->private_refs())
     Record.AddStmt(E);
@@ -7774,14 +7774,14 @@ void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
 void OMPClauseWriter::VisitOMPInclusiveClause(OMPInclusiveClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
 }
 
 void OMPClauseWriter::VisitOMPExclusiveClause(OMPExclusiveClause *C) {
   Record.push_back(C->varlist_size());
   Record.AddSourceLocation(C->getLParenLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
 }
 
@@ -7810,7 +7810,7 @@ void OMPClauseWriter::VisitOMPAffinityClause(OMPAffinityClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
   Record.AddStmt(C->getModifier());
   Record.AddSourceLocation(C->getColonLoc());
-  for (Expr *E : C->varlists())
+  for (Expr *E : C->varlist())
     Record.AddStmt(E);
 }
 
@@ -7833,7 +7833,7 @@ void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
   Record.push_back(C->getDependenceType());
   Record.AddSourceLocation(C->getDependenceLoc());
   Record.AddSourceLocation(C->getColonLoc());
-  for (auto *VE : C->varlists())
+  for (auto *VE : C->varlist())
     Record.AddStmt(VE);
   for (unsigned I = 0, E = C->getNumLoops(); I < E; ++I)
     Record.AddStmt(C->getLoopData(I));
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 006dead02e486..8c78fb316c5ff 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2426,7 +2426,6 @@ void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.writeOMPChildren(E->Data);
   Record.AddSourceLocation(E->getBeginLoc());
   Record.AddSourceLocation(E->getEndLoc());
-  Record.writeEnum(E->getMappedDirective());
 }
 
 void ASTStmtWriter::VisitOMPLoopBasedDirective(OMPLoopBasedDirective *D) {
@@ -2474,6 +2473,16 @@ void ASTStmtWriter::VisitOMPUnrollDirective(OMPUnrollDirective *D) {
   Code = serialization::STMT_OMP_UNROLL_DIRECTIVE;
 }
 
+void ASTStmtWriter::VisitOMPReverseDirective(OMPReverseDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+  Code = serialization::STMT_OMP_REVERSE_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+  Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE;
+}
+
 void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
   VisitOMPLoopDirective(D);
   Record.writeBool(D->hasCancel());
diff --git a/clang/lib/Serialization/CMakeLists.txt b/clang/lib/Serialization/CMakeLists.txt
index 5a4b3a58e9c45..99c47c15a2f47 100644
--- a/clang/lib/Serialization/CMakeLists.txt
+++ b/clang/lib/Serialization/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   BitReader
   BitstreamReader
+  Object
   Support
   TargetParser
   )
@@ -21,6 +22,7 @@ add_clang_library(clangSerialization
   ModuleFileExtension.cpp
   ModuleManager.cpp
   PCHContainerOperations.cpp
+  ObjectFilePCHContainerReader.cpp
 
   ADDITIONAL_HEADERS
   ASTCommon.h
diff --git a/clang/lib/Serialization/ObjectFilePCHContainerReader.cpp b/clang/lib/Serialization/ObjectFilePCHContainerReader.cpp
new file mode 100644
index 0000000000000..59fc46960dc60
--- /dev/null
+++ b/clang/lib/Serialization/ObjectFilePCHContainerReader.cpp
@@ -0,0 +1,56 @@
+//===--- ObjectFilePCHContainerReader.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
+
+using namespace clang;
+
+ArrayRef<StringRef> ObjectFilePCHContainerReader::getFormats() const {
+  static StringRef Formats[] = {"obj", "raw"};
+  return Formats;
+}
+
+StringRef
+ObjectFilePCHContainerReader::ExtractPCH(llvm::MemoryBufferRef Buffer) const {
+  StringRef PCH;
+  auto OFOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
+  if (OFOrErr) {
+    auto &OF = OFOrErr.get();
+    bool IsCOFF = isa<llvm::object::COFFObjectFile>(*OF);
+    // Find the clang AST section in the container.
+    for (auto &Section : OF->sections()) {
+      StringRef Name;
+      if (Expected<StringRef> NameOrErr = Section.getName())
+        Name = *NameOrErr;
+      else
+        consumeError(NameOrErr.takeError());
+
+      if ((!IsCOFF && Name == "__clangast") || (IsCOFF && Name == "clangast")) {
+        if (Expected<StringRef> E = Section.getContents())
+          return *E;
+        else {
+          handleAllErrors(E.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
+            EIB.log(llvm::errs());
+          });
+          return "";
+        }
+      }
+    }
+  }
+  handleAllErrors(OFOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
+    if (EIB.convertToErrorCode() ==
+        llvm::object::object_error::invalid_file_type)
+      // As a fallback, treat the buffer as a raw AST.
+      PCH = Buffer.getBuffer();
+    else
+      EIB.log(llvm::errs());
+  });
+  return PCH;
+}
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index f82288f1099e8..3f837564cf47c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -373,14 +373,14 @@ static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
 }
 
 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
-  std::string RegName = getRegionName(Region);
-  SmallString<128> Buf;
-  llvm::raw_svector_ostream Out(Buf);
-  Out << "Access of " << RegName << " at negative byte offset";
-  if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
-    Out << ' ' << ConcreteIdx->getValue();
-  return {formatv("Out of bound access to memory preceding {0}", RegName),
-          std::string(Buf)};
+  std::string RegName = getRegionName(Region), OffsetStr = "";
+
+  if (auto ConcreteOffset = getConcreteValue(Offset))
+    OffsetStr = formatv(" {0}", ConcreteOffset);
+
+  return {
+      formatv("Out of bound access to memory preceding {0}", RegName),
+      formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
 }
 
 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
@@ -609,7 +609,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // CHECK UPPER BOUND
   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
   if (auto KnownSize = Size.getAs<NonLoc>()) {
-    // In a situation where both overflow and overflow are possible (but the
+    // In a situation where both underflow and overflow are possible (but the
     // index is either tainted or known to be invalid), the logic of this
     // checker will first assume that the offset is non-negative, and then
     // (with this additional assumption) it will detect an overflow error.
diff --git a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
index 1a75d7b52ad6e..b198b1c2ff4d1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
@@ -18,6 +18,7 @@
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
@@ -30,8 +31,40 @@ namespace {
 class BuiltinFunctionChecker : public Checker<eval::Call> {
 public:
   bool evalCall(const CallEvent &Call, CheckerContext &C) const;
+
+private:
+  // From: clang/include/clang/Basic/Builtins.def
+  // C++ standard library builtins in namespace 'std'.
+  const CallDescriptionSet BuiltinLikeStdFunctions{
+      {CDM::SimpleFunc, {"std", "addressof"}},        //
+      {CDM::SimpleFunc, {"std", "__addressof"}},      //
+      {CDM::SimpleFunc, {"std", "as_const"}},         //
+      {CDM::SimpleFunc, {"std", "forward"}},          //
+      {CDM::SimpleFunc, {"std", "forward_like"}},     //
+      {CDM::SimpleFunc, {"std", "move"}},             //
+      {CDM::SimpleFunc, {"std", "move_if_noexcept"}}, //
+  };
+
+  bool isBuiltinLikeFunction(const CallEvent &Call) const;
 };
 
+} // namespace
+
+bool BuiltinFunctionChecker::isBuiltinLikeFunction(
+    const CallEvent &Call) const {
+  const auto *FD = llvm::dyn_cast_or_null<FunctionDecl>(Call.getDecl());
+  if (!FD || FD->getNumParams() != 1)
+    return false;
+
+  if (QualType RetTy = FD->getReturnType();
+      !RetTy->isPointerType() && !RetTy->isReferenceType())
+    return false;
+
+  if (QualType ParmTy = FD->getParamDecl(0)->getType();
+      !ParmTy->isPointerType() && !ParmTy->isReferenceType())
+    return false;
+
+  return BuiltinLikeStdFunctions.contains(Call);
 }
 
 bool BuiltinFunctionChecker::evalCall(const CallEvent &Call,
@@ -44,6 +77,11 @@ bool BuiltinFunctionChecker::evalCall(const CallEvent &Call,
   const LocationContext *LCtx = C.getLocationContext();
   const Expr *CE = Call.getOriginExpr();
 
+  if (isBuiltinLikeFunction(Call)) {
+    C.addTransition(state->BindExpr(CE, LCtx, Call.getArgSVal(0)));
+    return true;
+  }
+
   switch (FD->getBuiltinID()) {
   default:
     return false;
diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index fe202c79ed620..95ec28bfd20da 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -103,14 +103,48 @@ using namespace std::placeholders;
 namespace {
 
 // Used to check correspondence between allocators and deallocators.
-enum AllocationFamily {
+enum AllocationFamilyKind {
   AF_None,
   AF_Malloc,
   AF_CXXNew,
   AF_CXXNewArray,
   AF_IfNameIndex,
   AF_Alloca,
-  AF_InnerBuffer
+  AF_InnerBuffer,
+  AF_Custom,
+};
+
+struct AllocationFamily {
+  AllocationFamilyKind Kind;
+  std::optional<StringRef> CustomName;
+
+  explicit AllocationFamily(AllocationFamilyKind AKind,
+                            std::optional<StringRef> Name = std::nullopt)
+      : Kind(AKind), CustomName(Name) {
+    assert((Kind != AF_Custom || CustomName.has_value()) &&
+           "Custom family must specify also the name");
+
+    // Preseve previous behavior when "malloc" class means AF_Malloc
+    if (Kind == AF_Custom && CustomName.value() == "malloc") {
+      Kind = AF_Malloc;
+      CustomName = std::nullopt;
+    }
+  }
+
+  bool operator==(const AllocationFamily &Other) const {
+    return std::tie(Kind, CustomName) == std::tie(Other.Kind, Other.CustomName);
+  }
+
+  bool operator!=(const AllocationFamily &Other) const {
+    return !(*this == Other);
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.AddInteger(Kind);
+
+    if (Kind == AF_Custom)
+      ID.AddString(CustomName.value());
+  }
 };
 
 } // end of anonymous namespace
@@ -158,7 +192,7 @@ class RefState {
 
   RefState(Kind k, const Stmt *s, AllocationFamily family)
       : S(s), K(k), Family(family) {
-    assert(family != AF_None);
+    assert(family.Kind != AF_None);
   }
 
 public:
@@ -194,7 +228,7 @@ class RefState {
   void Profile(llvm::FoldingSetNodeID &ID) const {
     ID.AddInteger(K);
     ID.AddPointer(S);
-    ID.AddInteger(Family);
+    Family.Profile(ID);
   }
 
   LLVM_DUMP_METHOD void dump(raw_ostream &OS) const {
@@ -899,7 +933,7 @@ class MallocBugVisitor final : public BugReporterVisitor {
     bool IsReleased =
         (RSCurr && RSCurr->isReleased()) && (!RSPrev || !RSPrev->isReleased());
     assert(!IsReleased || (isa_and_nonnull<CallExpr, CXXDeleteExpr>(Stmt)) ||
-           (!Stmt && RSCurr->getAllocationFamily() == AF_InnerBuffer));
+           (!Stmt && RSCurr->getAllocationFamily().Kind == AF_InnerBuffer));
     return IsReleased;
   }
 
@@ -1122,7 +1156,7 @@ MallocChecker::performKernelMalloc(const CallEvent &Call, CheckerContext &C,
   if (TrueState && !FalseState) {
     SVal ZeroVal = C.getSValBuilder().makeZeroVal(Ctx.CharTy);
     return MallocMemAux(C, Call, Call.getArgExpr(0), ZeroVal, TrueState,
-                        AF_Malloc);
+                        AllocationFamily(AF_Malloc));
   }
 
   return std::nullopt;
@@ -1143,7 +1177,7 @@ void MallocChecker::checkBasicAlloc(const CallEvent &Call,
                                     CheckerContext &C) const {
   ProgramStateRef State = C.getState();
   State = MallocMemAux(C, Call, Call.getArgExpr(0), UndefinedVal(), State,
-                       AF_Malloc);
+                       AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 0, State);
   C.addTransition(State);
 }
@@ -1157,7 +1191,7 @@ void MallocChecker::checkKernelMalloc(const CallEvent &Call,
     State = *MaybeState;
   else
     State = MallocMemAux(C, Call, Call.getArgExpr(0), UndefinedVal(), State,
-                         AF_Malloc);
+                         AllocationFamily(AF_Malloc));
   C.addTransition(State);
 }
 
@@ -1194,7 +1228,8 @@ void MallocChecker::checkRealloc(const CallEvent &Call, CheckerContext &C,
     return;
 
   ProgramStateRef State = C.getState();
-  State = ReallocMemAux(C, Call, ShouldFreeOnFail, State, AF_Malloc);
+  State = ReallocMemAux(C, Call, ShouldFreeOnFail, State,
+                        AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 1, State);
   C.addTransition(State);
 }
@@ -1214,7 +1249,7 @@ void MallocChecker::checkFree(const CallEvent &Call, CheckerContext &C) const {
   if (suppressDeallocationsInSuspiciousContexts(Call, C))
     return;
   State = FreeMemAux(C, Call, State, 0, false, IsKnownToBeAllocatedMemory,
-                     AF_Malloc);
+                     AllocationFamily(AF_Malloc));
   C.addTransition(State);
 }
 
@@ -1222,7 +1257,7 @@ void MallocChecker::checkAlloca(const CallEvent &Call,
                                 CheckerContext &C) const {
   ProgramStateRef State = C.getState();
   State = MallocMemAux(C, Call, Call.getArgExpr(0), UndefinedVal(), State,
-                       AF_Alloca);
+                       AllocationFamily(AF_Alloca));
   State = ProcessZeroAllocCheck(Call, 0, State);
   C.addTransition(State);
 }
@@ -1233,7 +1268,7 @@ void MallocChecker::checkStrdup(const CallEvent &Call,
   const auto *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
   if (!CE)
     return;
-  State = MallocUpdateRefState(C, CE, State, AF_Malloc);
+  State = MallocUpdateRefState(C, CE, State, AllocationFamily(AF_Malloc));
 
   C.addTransition(State);
 }
@@ -1243,8 +1278,8 @@ void MallocChecker::checkIfNameIndex(const CallEvent &Call,
   ProgramStateRef State = C.getState();
   // Should we model this differently? We can allocate a fixed number of
   // elements with zeros in the last one.
-  State =
-      MallocMemAux(C, Call, UnknownVal(), UnknownVal(), State, AF_IfNameIndex);
+  State = MallocMemAux(C, Call, UnknownVal(), UnknownVal(), State,
+                       AllocationFamily(AF_IfNameIndex));
 
   C.addTransition(State);
 }
@@ -1254,7 +1289,7 @@ void MallocChecker::checkIfFreeNameIndex(const CallEvent &Call,
   ProgramStateRef State = C.getState();
   bool IsKnownToBeAllocatedMemory = false;
   State = FreeMemAux(C, Call, State, 0, false, IsKnownToBeAllocatedMemory,
-                     AF_IfNameIndex);
+                     AllocationFamily(AF_IfNameIndex));
   C.addTransition(State);
 }
 
@@ -1275,25 +1310,26 @@ void MallocChecker::checkCXXNewOrCXXDelete(const CallEvent &Call,
   const FunctionDecl *FD = C.getCalleeDecl(CE);
   switch (FD->getOverloadedOperator()) {
   case OO_New:
-    State =
-        MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State, AF_CXXNew);
+    State = MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State,
+                         AllocationFamily(AF_CXXNew));
     State = ProcessZeroAllocCheck(Call, 0, State);
     break;
   case OO_Array_New:
     State = MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State,
-                         AF_CXXNewArray);
+                         AllocationFamily(AF_CXXNewArray));
     State = ProcessZeroAllocCheck(Call, 0, State);
     break;
   case OO_Delete:
     State = FreeMemAux(C, Call, State, 0, false, IsKnownToBeAllocatedMemory,
-                       AF_CXXNew);
+                       AllocationFamily(AF_CXXNew));
     break;
   case OO_Array_Delete:
     State = FreeMemAux(C, Call, State, 0, false, IsKnownToBeAllocatedMemory,
-                       AF_CXXNewArray);
+                       AllocationFamily(AF_CXXNewArray));
     break;
   default:
-    llvm_unreachable("not a new/delete operator");
+    assert(false && "not a new/delete operator");
+    return;
   }
 
   C.addTransition(State);
@@ -1304,7 +1340,8 @@ void MallocChecker::checkGMalloc0(const CallEvent &Call,
   ProgramStateRef State = C.getState();
   SValBuilder &svalBuilder = C.getSValBuilder();
   SVal zeroVal = svalBuilder.makeZeroVal(svalBuilder.getContext().CharTy);
-  State = MallocMemAux(C, Call, Call.getArgExpr(0), zeroVal, State, AF_Malloc);
+  State = MallocMemAux(C, Call, Call.getArgExpr(0), zeroVal, State,
+                       AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 0, State);
   C.addTransition(State);
 }
@@ -1312,8 +1349,8 @@ void MallocChecker::checkGMalloc0(const CallEvent &Call,
 void MallocChecker::checkGMemdup(const CallEvent &Call,
                                  CheckerContext &C) const {
   ProgramStateRef State = C.getState();
-  State =
-      MallocMemAux(C, Call, Call.getArgExpr(1), UnknownVal(), State, AF_Malloc);
+  State = MallocMemAux(C, Call, Call.getArgExpr(1), UnknownVal(), State,
+                       AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 1, State);
   C.addTransition(State);
 }
@@ -1323,7 +1360,8 @@ void MallocChecker::checkGMallocN(const CallEvent &Call,
   ProgramStateRef State = C.getState();
   SVal Init = UndefinedVal();
   SVal TotalSize = evalMulForBufferSize(C, Call.getArgExpr(0), Call.getArgExpr(1));
-  State = MallocMemAux(C, Call, TotalSize, Init, State, AF_Malloc);
+  State = MallocMemAux(C, Call, TotalSize, Init, State,
+                       AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 0, State);
   State = ProcessZeroAllocCheck(Call, 1, State);
   C.addTransition(State);
@@ -1335,7 +1373,8 @@ void MallocChecker::checkGMallocN0(const CallEvent &Call,
   SValBuilder &SB = C.getSValBuilder();
   SVal Init = SB.makeZeroVal(SB.getContext().CharTy);
   SVal TotalSize = evalMulForBufferSize(C, Call.getArgExpr(0), Call.getArgExpr(1));
-  State = MallocMemAux(C, Call, TotalSize, Init, State, AF_Malloc);
+  State = MallocMemAux(C, Call, TotalSize, Init, State,
+                       AllocationFamily(AF_Malloc));
   State = ProcessZeroAllocCheck(Call, 0, State);
   State = ProcessZeroAllocCheck(Call, 1, State);
   C.addTransition(State);
@@ -1365,7 +1404,8 @@ void MallocChecker::preGetdelim(const CallEvent &Call,
   // of reporting any violation of the preconditions.
   bool IsKnownToBeAllocated = false;
   State = FreeMemAux(C, Call.getArgExpr(0), Call, State, false,
-                     IsKnownToBeAllocated, AF_Malloc, false, LinePtr);
+                     IsKnownToBeAllocated, AllocationFamily(AF_Malloc), false,
+                     LinePtr);
   if (State)
     C.addTransition(State);
 }
@@ -1394,13 +1434,15 @@ void MallocChecker::checkGetdelim(const CallEvent &Call,
     return;
 
   State = setDynamicExtent(State, LinePtr->getAsRegion(), *Size, SVB);
-  C.addTransition(MallocUpdateRefState(C, CE, State, AF_Malloc, *LinePtr));
+  C.addTransition(MallocUpdateRefState(C, CE, State,
+                                       AllocationFamily(AF_Malloc), *LinePtr));
 }
 
 void MallocChecker::checkReallocN(const CallEvent &Call,
                                   CheckerContext &C) const {
   ProgramStateRef State = C.getState();
-  State = ReallocMemAux(C, Call, /*ShouldFreeOnFail=*/false, State, AF_Malloc,
+  State = ReallocMemAux(C, Call, /*ShouldFreeOnFail=*/false, State,
+                        AllocationFamily(AF_Malloc),
                         /*SuffixWithN=*/true);
   State = ProcessZeroAllocCheck(Call, 1, State);
   State = ProcessZeroAllocCheck(Call, 2, State);
@@ -1489,8 +1531,10 @@ ProgramStateRef MallocChecker::ProcessZeroAllocCheck(
     } else {
       return State;
     }
-  } else
-    llvm_unreachable("not a CallExpr or CXXNewExpr");
+  } else {
+    assert(false && "not a CallExpr or CXXNewExpr");
+    return nullptr;
+  }
 
   assert(Arg);
 
@@ -1598,7 +1642,8 @@ MallocChecker::processNewAllocation(const CXXAllocatorCall &Call,
   SVal Target = Call.getObjectUnderConstruction();
   if (Call.getOriginExpr()->isArray()) {
     if (auto SizeEx = NE->getArraySize())
-      checkTaintedness(C, Call, C.getSVal(*SizeEx), State, AF_CXXNewArray);
+      checkTaintedness(C, Call, C.getSVal(*SizeEx), State,
+                       AllocationFamily(AF_CXXNewArray));
   }
 
   State = MallocUpdateRefState(C, NE, State, Family, Target);
@@ -1611,7 +1656,8 @@ void MallocChecker::checkNewAllocator(const CXXAllocatorCall &Call,
   if (!C.wasInlined) {
     ProgramStateRef State = processNewAllocation(
         Call, C,
-        (Call.getOriginExpr()->isArray() ? AF_CXXNewArray : AF_CXXNew));
+        AllocationFamily(Call.getOriginExpr()->isArray() ? AF_CXXNewArray
+                                                         : AF_CXXNew));
     C.addTransition(State);
   }
 }
@@ -1655,10 +1701,10 @@ void MallocChecker::checkPostObjCMessage(const ObjCMethodCall &Call,
     return;
 
   bool IsKnownToBeAllocatedMemory;
-  ProgramStateRef State =
-      FreeMemAux(C, Call.getArgExpr(0), Call, C.getState(),
-                 /*Hold=*/true, IsKnownToBeAllocatedMemory, AF_Malloc,
-                 /*ReturnsNullOnFailure=*/true);
+  ProgramStateRef State = FreeMemAux(C, Call.getArgExpr(0), Call, C.getState(),
+                                     /*Hold=*/true, IsKnownToBeAllocatedMemory,
+                                     AllocationFamily(AF_Malloc),
+                                     /*ReturnsNullOnFailure=*/true);
 
   C.addTransition(State);
 }
@@ -1670,15 +1716,15 @@ MallocChecker::MallocMemReturnsAttr(CheckerContext &C, const CallEvent &Call,
   if (!State)
     return nullptr;
 
-  if (Att->getModule()->getName() != "malloc")
-    return nullptr;
+  auto attrClassName = Att->getModule()->getName();
+  auto Family = AllocationFamily(AF_Custom, attrClassName);
 
   if (!Att->args().empty()) {
     return MallocMemAux(C, Call,
                         Call.getArgExpr(Att->args_begin()->getASTIndex()),
-                        UndefinedVal(), State, AF_Malloc);
+                        UndefinedVal(), State, Family);
   }
-  return MallocMemAux(C, Call, UnknownVal(), UndefinedVal(), State, AF_Malloc);
+  return MallocMemAux(C, Call, UnknownVal(), UndefinedVal(), State, Family);
 }
 
 ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
@@ -1768,10 +1814,10 @@ ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
   unsigned Count = C.blockCount();
   SValBuilder &SVB = C.getSValBuilder();
   const LocationContext *LCtx = C.getPredecessor()->getLocationContext();
-  DefinedSVal RetVal =
-      ((Family == AF_Alloca) ? SVB.getAllocaRegionVal(CE, LCtx, Count)
-                             : SVB.getConjuredHeapSymbolVal(CE, LCtx, Count)
-                                   .castAs<DefinedSVal>());
+  DefinedSVal RetVal = ((Family.Kind == AF_Alloca)
+                            ? SVB.getAllocaRegionVal(CE, LCtx, Count)
+                            : SVB.getConjuredHeapSymbolVal(CE, LCtx, Count)
+                                  .castAs<DefinedSVal>());
   State = State->BindExpr(CE, C.getLocationContext(), RetVal);
 
   // Fill the region with the initialization value.
@@ -1781,7 +1827,7 @@ ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
   if (Size.isUndef())
     Size = UnknownVal();
 
-  checkTaintedness(C, Call, Size, State, AF_Malloc);
+  checkTaintedness(C, Call, Size, State, AllocationFamily(AF_Malloc));
 
   // Set the region's extent.
   State = setDynamicExtent(State, RetVal.getAsRegion(),
@@ -1830,8 +1876,8 @@ ProgramStateRef MallocChecker::FreeMemAttr(CheckerContext &C,
   if (!State)
     return nullptr;
 
-  if (Att->getModule()->getName() != "malloc")
-    return nullptr;
+  auto attrClassName = Att->getModule()->getName();
+  auto Family = AllocationFamily(AF_Custom, attrClassName);
 
   bool IsKnownToBeAllocated = false;
 
@@ -1839,7 +1885,7 @@ ProgramStateRef MallocChecker::FreeMemAttr(CheckerContext &C,
     ProgramStateRef StateI =
         FreeMemAux(C, Call, State, Arg.getASTIndex(),
                    Att->getOwnKind() == OwnershipAttr::Holds,
-                   IsKnownToBeAllocated, AF_Malloc);
+                   IsKnownToBeAllocated, Family);
     if (StateI)
       State = StateI;
   }
@@ -1877,6 +1923,27 @@ static bool didPreviousFreeFail(ProgramStateRef State,
   return false;
 }
 
+static void printOwnershipTakesList(raw_ostream &os, CheckerContext &C,
+                                    const Expr *E) {
+  const CallExpr *CE = dyn_cast<CallExpr>(E);
+
+  if (!CE)
+    return;
+
+  const FunctionDecl *FD = CE->getDirectCallee();
+  if (!FD)
+    return;
+
+  // Only one ownership_takes attribute is allowed.
+  for (const auto *I : FD->specific_attrs<OwnershipAttr>()) {
+    if (I->getOwnKind() != OwnershipAttr::Takes)
+      continue;
+
+    os << ", which takes ownership of '" << I->getModule()->getName() << '\'';
+    break;
+  }
+}
+
 static bool printMemFnName(raw_ostream &os, CheckerContext &C, const Expr *E) {
   if (const CallExpr *CE = dyn_cast<CallExpr>(E)) {
     // FIXME: This doesn't handle indirect calls.
@@ -1884,9 +1951,12 @@ static bool printMemFnName(raw_ostream &os, CheckerContext &C, const Expr *E) {
     if (!FD)
       return false;
 
-    os << *FD;
+    os << '\'' << *FD;
+
     if (!FD->isOverloadedOperator())
       os << "()";
+
+    os << '\'';
     return true;
   }
 
@@ -1918,26 +1988,55 @@ static bool printMemFnName(raw_ostream &os, CheckerContext &C, const Expr *E) {
 
 static void printExpectedAllocName(raw_ostream &os, AllocationFamily Family) {
 
-  switch(Family) {
-    case AF_Malloc: os << "malloc()"; return;
-    case AF_CXXNew: os << "'new'"; return;
-    case AF_CXXNewArray: os << "'new[]'"; return;
-    case AF_IfNameIndex: os << "'if_nameindex()'"; return;
-    case AF_InnerBuffer: os << "container-specific allocator"; return;
-    case AF_Alloca:
-    case AF_None: llvm_unreachable("not a deallocation expression");
+  switch (Family.Kind) {
+  case AF_Malloc:
+    os << "'malloc()'";
+    return;
+  case AF_CXXNew:
+    os << "'new'";
+    return;
+  case AF_CXXNewArray:
+    os << "'new[]'";
+    return;
+  case AF_IfNameIndex:
+    os << "'if_nameindex()'";
+    return;
+  case AF_InnerBuffer:
+    os << "container-specific allocator";
+    return;
+  case AF_Custom:
+    os << Family.CustomName.value();
+    return;
+  case AF_Alloca:
+  case AF_None:
+    assert(false && "not a deallocation expression");
   }
 }
 
 static void printExpectedDeallocName(raw_ostream &os, AllocationFamily Family) {
-  switch(Family) {
-    case AF_Malloc: os << "free()"; return;
-    case AF_CXXNew: os << "'delete'"; return;
-    case AF_CXXNewArray: os << "'delete[]'"; return;
-    case AF_IfNameIndex: os << "'if_freenameindex()'"; return;
-    case AF_InnerBuffer: os << "container-specific deallocator"; return;
-    case AF_Alloca:
-    case AF_None: llvm_unreachable("suspicious argument");
+  switch (Family.Kind) {
+  case AF_Malloc:
+    os << "'free()'";
+    return;
+  case AF_CXXNew:
+    os << "'delete'";
+    return;
+  case AF_CXXNewArray:
+    os << "'delete[]'";
+    return;
+  case AF_IfNameIndex:
+    os << "'if_freenameindex()'";
+    return;
+  case AF_InnerBuffer:
+    os << "container-specific deallocator";
+    return;
+  case AF_Custom:
+    os << "function that takes ownership of '" << Family.CustomName.value()
+       << "\'";
+    return;
+  case AF_Alloca:
+  case AF_None:
+    assert(false && "not a deallocation expression");
   }
 }
 
@@ -1991,7 +2090,7 @@ MallocChecker::FreeMemAux(CheckerContext &C, const Expr *ArgExpr,
     // code. In that case, the ZERO_SIZE_PTR defines a special value used for a
     // zero-sized memory block which is allowed to be freed, despite not being a
     // null pointer.
-    if (Family != AF_Malloc || !isArgZERO_SIZE_PTR(State, C, ArgVal))
+    if (Family.Kind != AF_Malloc || !isArgZERO_SIZE_PTR(State, C, ArgVal))
       HandleNonHeapDealloc(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
                            Family);
     return nullptr;
@@ -2041,7 +2140,7 @@ MallocChecker::FreeMemAux(CheckerContext &C, const Expr *ArgExpr,
   if (RsBase) {
 
     // Memory returned by alloca() shouldn't be freed.
-    if (RsBase->getAllocationFamily() == AF_Alloca) {
+    if (RsBase->getAllocationFamily().Kind == AF_Alloca) {
       HandleFreeAlloca(C, ArgVal, ArgExpr->getSourceRange());
       return nullptr;
     }
@@ -2119,9 +2218,10 @@ MallocChecker::FreeMemAux(CheckerContext &C, const Expr *ArgExpr,
 std::optional<MallocChecker::CheckKind>
 MallocChecker::getCheckIfTracked(AllocationFamily Family,
                                  bool IsALeakCheck) const {
-  switch (Family) {
+  switch (Family.Kind) {
   case AF_Malloc:
   case AF_Alloca:
+  case AF_Custom:
   case AF_IfNameIndex: {
     if (ChecksEnabled[CK_MallocChecker])
       return CK_MallocChecker;
@@ -2145,10 +2245,12 @@ MallocChecker::getCheckIfTracked(AllocationFamily Family,
     return std::nullopt;
   }
   case AF_None: {
-    llvm_unreachable("no family");
+    assert(false && "no family");
+    return std::nullopt;
   }
   }
-  llvm_unreachable("unhandled family");
+  assert(false && "unhandled family");
+  return std::nullopt;
 }
 
 std::optional<MallocChecker::CheckKind>
@@ -2316,11 +2418,11 @@ void MallocChecker::HandleFreeAlloca(CheckerContext &C, SVal ArgVal,
   if (ExplodedNode *N = C.generateErrorNode()) {
     if (!BT_FreeAlloca[*CheckKind])
       BT_FreeAlloca[*CheckKind].reset(new BugType(
-          CheckNames[*CheckKind], "Free alloca()", categories::MemoryError));
+          CheckNames[*CheckKind], "Free 'alloca()'", categories::MemoryError));
 
     auto R = std::make_unique<PathSensitiveBugReport>(
         *BT_FreeAlloca[*CheckKind],
-        "Memory allocated by alloca() should not be deallocated", N);
+        "Memory allocated by 'alloca()' should not be deallocated", N);
     R->markInteresting(ArgVal.getAsRegion());
     R->addRange(Range);
     C.emitReport(std::move(R));
@@ -2373,6 +2475,8 @@ void MallocChecker::HandleMismatchedDealloc(CheckerContext &C,
 
         if (printMemFnName(DeallocOs, C, DeallocExpr))
           os << ", not " << DeallocOs.str();
+
+        printOwnershipTakesList(os, C, DeallocExpr);
     }
 
     auto R = std::make_unique<PathSensitiveBugReport>(*BT_MismatchedDealloc,
@@ -2465,7 +2569,7 @@ void MallocChecker::HandleUseAfterFree(CheckerContext &C, SourceRange Range,
 
     auto R = std::make_unique<PathSensitiveBugReport>(
         *BT_UseFree[*CheckKind],
-        AF == AF_InnerBuffer
+        AF.Kind == AF_InnerBuffer
             ? "Inner pointer of container used after re/deallocation"
             : "Use of memory after it is freed",
         N);
@@ -2474,7 +2578,7 @@ void MallocChecker::HandleUseAfterFree(CheckerContext &C, SourceRange Range,
     R->addRange(Range);
     R->addVisitor<MallocBugVisitor>(Sym);
 
-    if (AF == AF_InnerBuffer)
+    if (AF.Kind == AF_InnerBuffer)
       R->addVisitor(allocation_state::getInnerPointerBRVisitor(Sym));
 
     C.emitReport(std::move(R));
@@ -2733,7 +2837,8 @@ ProgramStateRef MallocChecker::CallocMem(CheckerContext &C,
   SVal TotalSize =
       evalMulForBufferSize(C, Call.getArgExpr(0), Call.getArgExpr(1));
 
-  return MallocMemAux(C, Call, TotalSize, zeroVal, State, AF_Malloc);
+  return MallocMemAux(C, Call, TotalSize, zeroVal, State,
+                      AllocationFamily(AF_Malloc));
 }
 
 MallocChecker::LeakInfo MallocChecker::getAllocationSite(const ExplodedNode *N,
@@ -2788,7 +2893,7 @@ void MallocChecker::HandleLeak(SymbolRef Sym, ExplodedNode *N,
   assert(RS && "cannot leak an untracked symbol");
   AllocationFamily Family = RS->getAllocationFamily();
 
-  if (Family == AF_Alloca)
+  if (Family.Kind == AF_Alloca)
     return;
 
   std::optional<MallocChecker::CheckKind> CheckKind =
@@ -2915,9 +3020,10 @@ void MallocChecker::checkPreCall(const CallEvent &Call,
 
     ProgramStateRef State = C.getState();
     bool IsKnownToBeAllocated;
-    State = FreeMemAux(C, DE->getArgument(), Call, State,
-                       /*Hold*/ false, IsKnownToBeAllocated,
-                       (DE->isArrayForm() ? AF_CXXNewArray : AF_CXXNew));
+    State = FreeMemAux(
+        C, DE->getArgument(), Call, State,
+        /*Hold*/ false, IsKnownToBeAllocated,
+        AllocationFamily(DE->isArrayForm() ? AF_CXXNewArray : AF_CXXNew));
 
     C.addTransition(State);
     return;
@@ -3349,8 +3455,8 @@ ProgramStateRef MallocChecker::checkConstPointerEscape(ProgramStateRef State,
 }
 
 static bool checkIfNewOrNewArrayFamily(const RefState *RS) {
-  return (RS->getAllocationFamily() == AF_CXXNewArray ||
-          RS->getAllocationFamily() == AF_CXXNew);
+  return (RS->getAllocationFamily().Kind == AF_CXXNewArray ||
+          RS->getAllocationFamily().Kind == AF_CXXNew);
 }
 
 ProgramStateRef MallocChecker::checkPointerEscapeAux(
@@ -3431,7 +3537,7 @@ PathDiagnosticPieceRef MallocBugVisitor::VisitNode(const ExplodedNode *N,
   const Stmt *S = N->getStmtForDiagnostics();
   // When dealing with containers, we sometimes want to give a note
   // even if the statement is missing.
-  if (!S && (!RSCurr || RSCurr->getAllocationFamily() != AF_InnerBuffer))
+  if (!S && (!RSCurr || RSCurr->getAllocationFamily().Kind != AF_InnerBuffer))
     return nullptr;
 
   const LocationContext *CurrentLC = N->getLocationContext();
@@ -3483,53 +3589,55 @@ PathDiagnosticPieceRef MallocBugVisitor::VisitNode(const ExplodedNode *N,
           Sym, "Returned allocated memory");
     } else if (isReleased(RSCurr, RSPrev, S)) {
       const auto Family = RSCurr->getAllocationFamily();
-      switch (Family) {
-        case AF_Alloca:
-        case AF_Malloc:
-        case AF_CXXNew:
-        case AF_CXXNewArray:
-        case AF_IfNameIndex:
-          Msg = "Memory is released";
+      switch (Family.Kind) {
+      case AF_Alloca:
+      case AF_Malloc:
+      case AF_Custom:
+      case AF_CXXNew:
+      case AF_CXXNewArray:
+      case AF_IfNameIndex:
+        Msg = "Memory is released";
+        StackHint = std::make_unique<StackHintGeneratorForSymbol>(
+            Sym, "Returning; memory was released");
+        break;
+      case AF_InnerBuffer: {
+        const MemRegion *ObjRegion =
+            allocation_state::getContainerObjRegion(statePrev, Sym);
+        const auto *TypedRegion = cast<TypedValueRegion>(ObjRegion);
+        QualType ObjTy = TypedRegion->getValueType();
+        OS << "Inner buffer of '" << ObjTy << "' ";
+
+        if (N->getLocation().getKind() == ProgramPoint::PostImplicitCallKind) {
+          OS << "deallocated by call to destructor";
           StackHint = std::make_unique<StackHintGeneratorForSymbol>(
-              Sym, "Returning; memory was released");
-          break;
-        case AF_InnerBuffer: {
-          const MemRegion *ObjRegion =
-              allocation_state::getContainerObjRegion(statePrev, Sym);
-          const auto *TypedRegion = cast<TypedValueRegion>(ObjRegion);
-          QualType ObjTy = TypedRegion->getValueType();
-          OS << "Inner buffer of '" << ObjTy << "' ";
-
-          if (N->getLocation().getKind() == ProgramPoint::PostImplicitCallKind) {
-            OS << "deallocated by call to destructor";
-            StackHint = std::make_unique<StackHintGeneratorForSymbol>(
-                Sym, "Returning; inner buffer was deallocated");
-          } else {
-            OS << "reallocated by call to '";
-            const Stmt *S = RSCurr->getStmt();
-            if (const auto *MemCallE = dyn_cast<CXXMemberCallExpr>(S)) {
-              OS << MemCallE->getMethodDecl()->getDeclName();
-            } else if (const auto *OpCallE = dyn_cast<CXXOperatorCallExpr>(S)) {
-              OS << OpCallE->getDirectCallee()->getDeclName();
-            } else if (const auto *CallE = dyn_cast<CallExpr>(S)) {
-              auto &CEMgr = BRC.getStateManager().getCallEventManager();
-              CallEventRef<> Call =
-                  CEMgr.getSimpleCall(CallE, state, CurrentLC, {nullptr, 0});
-              if (const auto *D = dyn_cast_or_null<NamedDecl>(Call->getDecl()))
-                OS << D->getDeclName();
-              else
-                OS << "unknown";
-            }
-            OS << "'";
-            StackHint = std::make_unique<StackHintGeneratorForSymbol>(
-                Sym, "Returning; inner buffer was reallocated");
+              Sym, "Returning; inner buffer was deallocated");
+        } else {
+          OS << "reallocated by call to '";
+          const Stmt *S = RSCurr->getStmt();
+          if (const auto *MemCallE = dyn_cast<CXXMemberCallExpr>(S)) {
+            OS << MemCallE->getMethodDecl()->getDeclName();
+          } else if (const auto *OpCallE = dyn_cast<CXXOperatorCallExpr>(S)) {
+            OS << OpCallE->getDirectCallee()->getDeclName();
+          } else if (const auto *CallE = dyn_cast<CallExpr>(S)) {
+            auto &CEMgr = BRC.getStateManager().getCallEventManager();
+            CallEventRef<> Call =
+                CEMgr.getSimpleCall(CallE, state, CurrentLC, {nullptr, 0});
+            if (const auto *D = dyn_cast_or_null<NamedDecl>(Call->getDecl()))
+              OS << D->getDeclName();
+            else
+              OS << "unknown";
           }
-          Msg = OS.str();
-          break;
+          OS << "'";
+          StackHint = std::make_unique<StackHintGeneratorForSymbol>(
+              Sym, "Returning; inner buffer was reallocated");
+        }
+        Msg = OS.str();
+        break;
         }
         case AF_None:
-          llvm_unreachable("Unhandled allocation family!");
-      }
+          assert(false && "Unhandled allocation family!");
+          return nullptr;
+        }
 
       // See if we're releasing memory while inlining a destructor
       // (or one of its callees). This turns on various common
@@ -3603,7 +3711,7 @@ PathDiagnosticPieceRef MallocBugVisitor::VisitNode(const ExplodedNode *N,
   // Generate the extra diagnostic.
   PathDiagnosticLocation Pos;
   if (!S) {
-    assert(RSCurr->getAllocationFamily() == AF_InnerBuffer);
+    assert(RSCurr->getAllocationFamily().Kind == AF_InnerBuffer);
     auto PostImplCall = N->getLocation().getAs<PostImplicitCall>();
     if (!PostImplCall)
       return nullptr;
@@ -3650,7 +3758,7 @@ namespace allocation_state {
 
 ProgramStateRef
 markReleased(ProgramStateRef State, SymbolRef Sym, const Expr *Origin) {
-  AllocationFamily Family = AF_InnerBuffer;
+  AllocationFamily Family(AF_InnerBuffer);
   return State->set<RegionState>(Sym, RefState::getReleased(Family, Origin));
 }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
index cd1dd1b2fc511..4b8e5216550d9 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
@@ -21,49 +21,56 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
 
 using namespace clang;
 using namespace ento;
 
 namespace {
-class MmapWriteExecChecker : public Checker<check::PreCall> {
+class MmapWriteExecChecker
+    : public Checker<check::ASTDecl<TranslationUnitDecl>, check::PreCall> {
   CallDescription MmapFn{CDM::CLibrary, {"mmap"}, 6};
   CallDescription MprotectFn{CDM::CLibrary, {"mprotect"}, 3};
-  static int ProtWrite;
-  static int ProtExec;
-  static int ProtRead;
   const BugType BT{this, "W^X check fails, Write Exec prot flags set",
                    "Security"};
 
+  // Default values are used if definition of the flags is not found.
+  mutable int ProtRead = 0x01;
+  mutable int ProtWrite = 0x02;
+  mutable int ProtExec = 0x04;
+
 public:
+  void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &Mgr,
+                    BugReporter &BR) const;
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
-  int ProtExecOv;
-  int ProtReadOv;
 };
 }
 
-int MmapWriteExecChecker::ProtWrite = 0x02;
-int MmapWriteExecChecker::ProtExec  = 0x04;
-int MmapWriteExecChecker::ProtRead  = 0x01;
+void MmapWriteExecChecker::checkASTDecl(const TranslationUnitDecl *TU,
+                                        AnalysisManager &Mgr,
+                                        BugReporter &BR) const {
+  Preprocessor &PP = Mgr.getPreprocessor();
+  const std::optional<int> FoundProtRead = tryExpandAsInteger("PROT_READ", PP);
+  const std::optional<int> FoundProtWrite =
+      tryExpandAsInteger("PROT_WRITE", PP);
+  const std::optional<int> FoundProtExec = tryExpandAsInteger("PROT_EXEC", PP);
+  if (FoundProtRead && FoundProtWrite && FoundProtExec) {
+    ProtRead = *FoundProtRead;
+    ProtWrite = *FoundProtWrite;
+    ProtExec = *FoundProtExec;
+  }
+}
 
 void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
-                                         CheckerContext &C) const {
+                                        CheckerContext &C) const {
   if (matchesAny(Call, MmapFn, MprotectFn)) {
     SVal ProtVal = Call.getArgSVal(2);
     auto ProtLoc = ProtVal.getAs<nonloc::ConcreteInt>();
     if (!ProtLoc)
       return;
     int64_t Prot = ProtLoc->getValue().getSExtValue();
-    if (ProtExecOv != ProtExec)
-      ProtExec = ProtExecOv;
-    if (ProtReadOv != ProtRead)
-      ProtRead = ProtReadOv;
-
-    // Wrong settings
-    if (ProtRead == ProtExec)
-      return;
 
-    if ((Prot & (ProtWrite | ProtExec)) == (ProtWrite | ProtExec)) {
+    if ((Prot & ProtWrite) && (Prot & ProtExec)) {
       ExplodedNode *N = C.generateNonFatalErrorNode();
       if (!N)
         return;
@@ -80,17 +87,10 @@ void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
   }
 }
 
-void ento::registerMmapWriteExecChecker(CheckerManager &mgr) {
-  MmapWriteExecChecker *Mwec =
-      mgr.registerChecker<MmapWriteExecChecker>();
-  Mwec->ProtExecOv =
-    mgr.getAnalyzerOptions()
-      .getCheckerIntegerOption(Mwec, "MmapProtExec");
-  Mwec->ProtReadOv =
-    mgr.getAnalyzerOptions()
-      .getCheckerIntegerOption(Mwec, "MmapProtRead");
+void ento::registerMmapWriteExecChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<MmapWriteExecChecker>();
 }
 
-bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &) {
   return true;
 }
diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
index eea93a41f1384..b856b0edc6151 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
@@ -19,6 +19,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
 
 using namespace clang;
 using namespace ento;
@@ -40,6 +41,11 @@ class PointerSubChecker
       "Indexing the address of a variable with other than 1 at this place "
       "is undefined behavior.";
 
+  /// Check that an array is indexed in the allowed range that is 0 to "one
+  /// after the end". The "array" can be address of a non-array variable.
+  /// @param E Expression of the pointer subtraction.
+  /// @param ElemReg An indexed region in the subtraction expression.
+  /// @param Reg Region of the other side of the expression.
   bool checkArrayBounds(CheckerContext &C, const Expr *E,
                         const ElementRegion *ElemReg,
                         const MemRegion *Reg) const;
@@ -49,12 +55,28 @@ class PointerSubChecker
 };
 }
 
+static bool isArrayVar(const MemRegion *R) {
+  while (R) {
+    if (isa<VarRegion>(R))
+      return true;
+    if (const auto *ER = dyn_cast<ElementRegion>(R))
+      R = ER->getSuperRegion();
+    else
+      return false;
+  }
+  return false;
+}
+
 bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E,
                                          const ElementRegion *ElemReg,
                                          const MemRegion *Reg) const {
   if (!ElemReg)
     return true;
 
+  const MemRegion *SuperReg = ElemReg->getSuperRegion();
+  if (!isArrayVar(SuperReg))
+    return true;
+
   auto ReportBug = [&](const llvm::StringLiteral &Msg) {
     if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
       auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
@@ -64,10 +86,10 @@ bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E,
   };
 
   ProgramStateRef State = C.getState();
-  const MemRegion *SuperReg = ElemReg->getSuperRegion();
   SValBuilder &SVB = C.getSValBuilder();
 
   if (SuperReg == Reg) {
+    // Case like `(&x + 1) - &x`. Only 1 or 0 is allowed as index.
     if (const llvm::APSInt *I = SVB.getKnownValue(State, ElemReg->getIndex());
         I && (!I->isOne() && !I->isZero()))
       ReportBug(Msg_BadVarIndex);
@@ -121,8 +143,9 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
   if (LR == RR)
     return;
 
-  // No warning if one operand is unknown.
-  if (isa<SymbolicRegion>(LR) || isa<SymbolicRegion>(RR))
+  // No warning if one operand is unknown or resides in a region that could be
+  // equal to the other.
+  if (LR->getSymbolicBase() || RR->getSymbolicBase())
     return;
 
   const auto *ElemLR = dyn_cast<ElementRegion>(LR);
@@ -159,12 +182,16 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
     //   do_something(&a.array[5] - &b.array[5]);
     // In this case don't emit notes.
     if (DiffDeclL != DiffDeclR) {
-      if (DiffDeclL)
-        R->addNote("Array at the left-hand side of subtraction",
-                   {DiffDeclL, C.getSourceManager()});
-      if (DiffDeclR)
-        R->addNote("Array at the right-hand side of subtraction",
-                   {DiffDeclR, C.getSourceManager()});
+      auto AddNote = [&R, &C](const ValueDecl *D, StringRef SideStr) {
+        if (D) {
+          std::string Msg = llvm::formatv(
+              "{0} at the {1}-hand side of subtraction",
+              D->getType()->isArrayType() ? "Array" : "Object", SideStr);
+          R->addNote(Msg, {D, C.getSourceManager()});
+        }
+      };
+      AddNote(DiffDeclL, "left");
+      AddNote(DiffDeclR, "right");
     }
     C.emitReport(std::move(R));
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index e8d538388e56c..4454f30630e27 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -254,7 +254,8 @@ inline void assertStreamStateOpened(const StreamState *SS) {
 }
 
 class StreamChecker : public Checker<check::PreCall, eval::Call,
-                                     check::DeadSymbols, check::PointerEscape> {
+                                     check::DeadSymbols, check::PointerEscape,
+                                     check::ASTDecl<TranslationUnitDecl>> {
   BugType BT_FileNull{this, "NULL stream pointer", "Stream handling error"};
   BugType BT_UseAfterClose{this, "Closed stream", "Stream handling error"};
   BugType BT_UseAfterOpenFailed{this, "Invalid stream",
@@ -276,11 +277,21 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
                                      const CallEvent *Call,
                                      PointerEscapeKind Kind) const;
 
+  /// Finds the declarations of 'FILE *stdin, *stdout, *stderr'.
+  void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &,
+                    BugReporter &) const;
+
   const BugType *getBT_StreamEof() const { return &BT_StreamEof; }
   const BugType *getBT_IndeterminatePosition() const {
     return &BT_IndeterminatePosition;
   }
 
+  /// Assumes that the result of 'fopen' can't alias with the pointee of
+  /// 'stdin', 'stdout' or 'stderr'.
+  ProgramStateRef assumeNoAliasingWithStdStreams(ProgramStateRef State,
+                                                 DefinedSVal RetVal,
+                                                 CheckerContext &C) const;
+
   const NoteTag *constructSetEofNoteTag(CheckerContext &C,
                                         SymbolRef StreamSym) const {
     return C.getNoteTag([this, StreamSym](PathSensitiveBugReport &BR) {
@@ -451,6 +462,10 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
   /// The built-in va_list type is platform-specific
   mutable QualType VaListType;
 
+  mutable const VarDecl *StdinDecl = nullptr;
+  mutable const VarDecl *StdoutDecl = nullptr;
+  mutable const VarDecl *StderrDecl = nullptr;
+
   void evalFopen(const FnDescription *Desc, const CallEvent &Call,
                  CheckerContext &C) const;
 
@@ -600,30 +615,22 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
     });
   }
 
-  void initMacroValues(CheckerContext &C) const {
+  void initMacroValues(const Preprocessor &PP) const {
     if (EofVal)
       return;
 
-    if (const std::optional<int> OptInt =
-            tryExpandAsInteger("EOF", C.getPreprocessor()))
+    if (const std::optional<int> OptInt = tryExpandAsInteger("EOF", PP))
       EofVal = *OptInt;
     else
       EofVal = -1;
-    if (const std::optional<int> OptInt =
-            tryExpandAsInteger("SEEK_SET", C.getPreprocessor()))
+    if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_SET", PP))
       SeekSetVal = *OptInt;
-    if (const std::optional<int> OptInt =
-            tryExpandAsInteger("SEEK_END", C.getPreprocessor()))
+    if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_END", PP))
       SeekEndVal = *OptInt;
-    if (const std::optional<int> OptInt =
-            tryExpandAsInteger("SEEK_CUR", C.getPreprocessor()))
+    if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_CUR", PP))
       SeekCurVal = *OptInt;
   }
 
-  void initVaListType(CheckerContext &C) const {
-    VaListType = C.getASTContext().getBuiltinVaListType().getCanonicalType();
-  }
-
   /// Searches for the ExplodedNode where the file descriptor was acquired for
   /// StreamSym.
   static const ExplodedNode *getAcquisitionSite(const ExplodedNode *N,
@@ -865,9 +872,6 @@ static ProgramStateRef escapeArgs(ProgramStateRef State, CheckerContext &C,
 
 void StreamChecker::checkPreCall(const CallEvent &Call,
                                  CheckerContext &C) const {
-  initMacroValues(C);
-  initVaListType(C);
-
   const FnDescription *Desc = lookupFn(Call);
   if (!Desc || !Desc->PreFn)
     return;
@@ -887,6 +891,30 @@ bool StreamChecker::evalCall(const CallEvent &Call, CheckerContext &C) const {
   return C.isDifferent();
 }
 
+ProgramStateRef StreamChecker::assumeNoAliasingWithStdStreams(
+    ProgramStateRef State, DefinedSVal RetVal, CheckerContext &C) const {
+  auto assumeRetNE = [&C, RetVal](ProgramStateRef State,
+                                  const VarDecl *Var) -> ProgramStateRef {
+    if (!Var)
+      return State;
+    const auto *LCtx = C.getLocationContext();
+    auto &StoreMgr = C.getStoreManager();
+    auto &SVB = C.getSValBuilder();
+    SVal VarValue = State->getSVal(StoreMgr.getLValueVar(Var, LCtx));
+    auto NoAliasState =
+        SVB.evalBinOp(State, BO_NE, RetVal, VarValue, SVB.getConditionType())
+            .castAs<DefinedOrUnknownSVal>();
+    return State->assume(NoAliasState, true);
+  };
+
+  assert(State);
+  State = assumeRetNE(State, StdinDecl);
+  State = assumeRetNE(State, StdoutDecl);
+  State = assumeRetNE(State, StderrDecl);
+  assert(State);
+  return State;
+}
+
 void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
                               CheckerContext &C) const {
   ProgramStateRef State = C.getState();
@@ -911,6 +939,8 @@ void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
   StateNull =
       StateNull->set<StreamMap>(RetSym, StreamState::getOpenFailed(Desc));
 
+  StateNotNull = assumeNoAliasingWithStdStreams(StateNotNull, RetVal, C);
+
   C.addTransition(StateNotNull,
                   constructLeakNoteTag(C, RetSym, "Stream opened here"));
   C.addTransition(StateNull);
@@ -2017,6 +2047,38 @@ ProgramStateRef StreamChecker::checkPointerEscape(
   return State;
 }
 
+static const VarDecl *
+getGlobalStreamPointerByName(const TranslationUnitDecl *TU, StringRef VarName) {
+  ASTContext &Ctx = TU->getASTContext();
+  const auto &SM = Ctx.getSourceManager();
+  const QualType FileTy = Ctx.getFILEType();
+
+  if (FileTy.isNull())
+    return nullptr;
+
+  const QualType FilePtrTy = Ctx.getPointerType(FileTy).getCanonicalType();
+
+  auto LookupRes = TU->lookup(&Ctx.Idents.get(VarName));
+  for (const Decl *D : LookupRes) {
+    if (auto *VD = dyn_cast_or_null<VarDecl>(D)) {
+      if (SM.isInSystemHeader(VD->getLocation()) && VD->hasExternalStorage() &&
+          VD->getType().getCanonicalType() == FilePtrTy) {
+        return VD;
+      }
+    }
+  }
+  return nullptr;
+}
+
+void StreamChecker::checkASTDecl(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &) const {
+  StdinDecl = getGlobalStreamPointerByName(TU, "stdin");
+  StdoutDecl = getGlobalStreamPointerByName(TU, "stdout");
+  StderrDecl = getGlobalStreamPointerByName(TU, "stderr");
+  VaListType = TU->getASTContext().getBuiltinVaListType().getCanonicalType();
+  initMacroValues(Mgr.getPreprocessor());
+}
+
 //===----------------------------------------------------------------------===//
 // Checker registration.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
index e2002bfbe594a..d73dc40cf03fb 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
@@ -2198,7 +2198,7 @@ const Decl *PathSensitiveBugReport::getDeclWithIssue() const {
 void BasicBugReport::Profile(llvm::FoldingSetNodeID& hash) const {
   hash.AddInteger(static_cast<int>(getKind()));
   hash.AddPointer(&BT);
-  hash.AddString(Description);
+  hash.AddString(getShortDescription());
   assert(Location.isValid());
   Location.Profile(hash);
 
@@ -2213,7 +2213,7 @@ void BasicBugReport::Profile(llvm::FoldingSetNodeID& hash) const {
 void PathSensitiveBugReport::Profile(llvm::FoldingSetNodeID &hash) const {
   hash.AddInteger(static_cast<int>(getKind()));
   hash.AddPointer(&BT);
-  hash.AddString(Description);
+  hash.AddString(getShortDescription());
   PathDiagnosticLocation UL = getUniqueingLocation();
   if (UL.isValid()) {
     UL.Profile(hash);
diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 0e317ec765ec0..eba224b8ec01c 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -923,12 +923,31 @@ SVal AnyCXXConstructorCall::getCXXThisVal() const {
   return UnknownVal();
 }
 
+static bool isWithinStdNamespace(const Decl *D) {
+  const DeclContext *DC = D->getDeclContext();
+  while (DC) {
+    if (const auto *NS = dyn_cast<NamespaceDecl>(DC);
+        NS && NS->isStdNamespace())
+      return true;
+    DC = DC->getParent();
+  }
+  return false;
+}
+
 void AnyCXXConstructorCall::getExtraInvalidatedValues(ValueList &Values,
                            RegionAndSymbolInvalidationTraits *ETraits) const {
   SVal V = getCXXThisVal();
   if (SymbolRef Sym = V.getAsSymbol(true))
     ETraits->setTrait(Sym,
                       RegionAndSymbolInvalidationTraits::TK_SuppressEscape);
+
+  // Standard classes don't reinterpret-cast and modify super regions.
+  const bool IsStdClassCtor = isWithinStdNamespace(getDecl());
+  if (const MemRegion *Obj = V.getAsRegion(); Obj && IsStdClassCtor) {
+    ETraits->setTrait(
+        Obj, RegionAndSymbolInvalidationTraits::TK_DoNotInvalidateSuperRegion);
+  }
+
   Values.push_back(V);
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 33fc4d6f6c9cf..38aacd3b3b02f 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1811,7 +1811,9 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::OMPTargetTeamsDistributeParallelForDirectiveClass:
     case Stmt::OMPTargetTeamsDistributeParallelForSimdDirectiveClass:
     case Stmt::OMPTargetTeamsDistributeSimdDirectiveClass:
+    case Stmt::OMPReverseDirectiveClass:
     case Stmt::OMPTileDirectiveClass:
+    case Stmt::OMPInterchangeDirectiveClass:
     case Stmt::OMPInteropDirectiveClass:
     case Stmt::OMPDispatchDirectiveClass:
     case Stmt::OMPMaskedDirectiveClass:
diff --git a/clang/lib/Tooling/ArgumentsAdjusters.cpp b/clang/lib/Tooling/ArgumentsAdjusters.cpp
index df4c74205b087..d01c57ee69c00 100644
--- a/clang/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/clang/lib/Tooling/ArgumentsAdjusters.cpp
@@ -49,10 +49,11 @@ ArgumentsAdjuster getClangSyntaxOnlyAdjuster() {
           }))
         continue;
 
-      if (!Arg.starts_with("-fcolor-diagnostics") &&
+      if (Arg != "-c" && Arg != "-S" &&
+          !Arg.starts_with("-fcolor-diagnostics") &&
           !Arg.starts_with("-fdiagnostics-color"))
         AdjustedArgs.push_back(Args[i]);
-      // If we strip a color option, make sure we strip any preceeding `-Xclang`
+      // If we strip an option, make sure we strip any preceeding `-Xclang`
       // option as well.
       // FIXME: This should be added to most argument adjusters!
       else if (!AdjustedArgs.empty() && AdjustedArgs.back() == "-Xclang")
diff --git a/clang/lib/Tooling/DependencyScanning/CMakeLists.txt b/clang/lib/Tooling/DependencyScanning/CMakeLists.txt
index bc807e162c919..66795b0be0baa 100644
--- a/clang/lib/Tooling/DependencyScanning/CMakeLists.txt
+++ b/clang/lib/Tooling/DependencyScanning/CMakeLists.txt
@@ -19,7 +19,6 @@ add_clang_library(clangDependencyScanning
   LINK_LIBS
   clangAST
   clangBasic
-  clangCodeGen
   clangDriver
   clangFrontend
   clangLex
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 0f82f22d8b9a8..91842627b001c 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -10,7 +10,6 @@
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/DiagnosticSerialization.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
@@ -21,6 +20,7 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "clang/Tooling/DependencyScanning/DependencyScanningService.h"
 #include "clang/Tooling/DependencyScanning/ModuleDepCollector.h"
 #include "clang/Tooling/Tooling.h"
diff --git a/clang/test/APINotes/Inputs/Headers/Methods.apinotes b/clang/test/APINotes/Inputs/Headers/Methods.apinotes
new file mode 100644
index 0000000000000..618c2cb14b47f
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Methods.apinotes
@@ -0,0 +1,32 @@
+---
+Name: Methods
+Tags:
+- Name: IntWrapper
+  Methods:
+  - Name: getIncremented
+    Availability: none
+    AvailabilityMsg: "oh no"
+  - Name: getDecremented
+    Availability: none
+    AvailabilityMsg: "this should have no effect"
+- Name: Outer
+  Tags:
+  - Name: Inner
+    Methods:
+    - Name: getDecremented
+      Availability: none
+      AvailabilityMsg: "nope"
+    - Name: getIncremented
+      Availability: none
+      AvailabilityMsg: "this should have no effect"
+  Methods:
+  - Name: getDecremented
+    Availability: none
+    AvailabilityMsg: "this should have no effect"  
+Functions:
+- Name: getIncremented
+  Availability: none
+  AvailabilityMsg: "this should have no effect"
+- Name: getDecremented
+  Availability: none
+  AvailabilityMsg: "this should have no effect"
diff --git a/clang/test/APINotes/Inputs/Headers/Methods.h b/clang/test/APINotes/Inputs/Headers/Methods.h
new file mode 100644
index 0000000000000..cbb57ccd0afbd
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Methods.h
@@ -0,0 +1,19 @@
+struct IntWrapper {
+  int value;
+
+  IntWrapper getIncremented() const { return {value + 1}; }
+
+  IntWrapper operator+(const IntWrapper& RHS) const { return {value + RHS.value}; }
+};
+
+struct Outer {
+  struct Inner {
+    int value;
+
+    Inner getDecremented() const { return {value - 1}; }
+
+    bool operator==(const Inner& RHS) const {
+      return value == RHS.value;
+    }
+  };
+};
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
index e9da36787b638..b7d962e0efda6 100644
--- a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
@@ -38,6 +38,12 @@ Namespaces:
         Tags:
           - Name: char_box
             SwiftName: NestedCharBox
+            Methods:
+              - Name: methodInNestedNamespace
+                SwiftName: swiftMethodInNestedNamespace()
+            Tags:
+              - Name: inner_char_box
+                SwiftName: InnerCharBox
         Namespaces:
           - Name: Namespace1
             Tags:
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.h b/clang/test/APINotes/Inputs/Headers/Namespaces.h
index 6a79e996be86c..c7a891f205c06 100644
--- a/clang/test/APINotes/Inputs/Headers/Namespaces.h
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.h
@@ -9,6 +9,10 @@ namespace Nested1 {
 void funcInNestedNamespace(int i);
 struct char_box {
   char c;
+  void methodInNestedNamespace();
+  struct inner_char_box {
+    char c;
+  };
 };
 }
 
diff --git a/clang/test/APINotes/Inputs/Headers/module.modulemap b/clang/test/APINotes/Inputs/Headers/module.modulemap
index d515169184f4f..faf6042c78d57 100644
--- a/clang/test/APINotes/Inputs/Headers/module.modulemap
+++ b/clang/test/APINotes/Inputs/Headers/module.modulemap
@@ -24,6 +24,10 @@ module BrokenTypes {
   header "BrokenTypes.h"
 }
 
+module Methods {
+  header "Methods.h"
+}
+
 module ModuleWithWrongCase {
   header "ModuleWithWrongCase.h"
 }
diff --git a/clang/test/APINotes/methods.cpp b/clang/test/APINotes/methods.cpp
new file mode 100644
index 0000000000000..910565745bea2
--- /dev/null
+++ b/clang/test/APINotes/methods.cpp
@@ -0,0 +1,17 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x c++
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter IntWrapper::getIncremented -x c++ | FileCheck --check-prefix=CHECK-METHOD %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Outer::Inner::getDecremented -x c++ | FileCheck --check-prefix=CHECK-DEEP-METHOD %s
+
+#include "Methods.h"
+
+// CHECK-METHOD: Dumping IntWrapper::getIncremented:
+// CHECK-METHOD-NEXT: CXXMethodDecl {{.+}} getIncremented
+// CHECK-METHOD: UnavailableAttr {{.+}} <<invalid sloc>> "oh no"
+
+// CHECK-DEEP-METHOD: Dumping Outer::Inner::getDecremented:
+// CHECK-DEEP-METHOD-NEXT: CXXMethodDecl {{.+}} getDecremented
+// CHECK-DEEP-METHOD: UnavailableAttr {{.+}} <<invalid sloc>> "nope"
+
+// CHECK-METHOD-NOT: this should have no effect
+// CHECK-DEEP-METHOD-NOT: this should have no effect
diff --git a/clang/test/APINotes/namespaces.cpp b/clang/test/APINotes/namespaces.cpp
index c19eee565c2da..df108b02e6301 100644
--- a/clang/test/APINotes/namespaces.cpp
+++ b/clang/test/APINotes/namespaces.cpp
@@ -8,7 +8,9 @@
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box::inner_char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box::methodInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-METHOD-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter varInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-INLINE-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter funcInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-INLINE-NAMESPACE %s
@@ -55,6 +57,14 @@
 // CHECK-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
 // CHECK-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "NestedCharBox"
 
+// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box::inner_char_box:
+// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct inner_char_box
+// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "InnerCharBox"
+
+// CHECK-METHOD-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box::methodInNestedNamespace:
+// CHECK-METHOD-IN-NESTED-NAMESPACE-NEXT: CXXMethodDecl {{.+}} imported in Namespaces methodInNestedNamespace
+// CHECK-METHOD-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftMethodInNestedNamespace()"
+
 // CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: Dumping Namespace1::Nested1::Namespace1::char_box:
 // CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
 // CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "DeepNestedCharBox"
diff --git a/clang/test/AST/Interp/atomic.c b/clang/test/AST/Interp/atomic.c
index c5fd9ab222934..c8469d4a938b8 100644
--- a/clang/test/AST/Interp/atomic.c
+++ b/clang/test/AST/Interp/atomic.c
@@ -58,3 +58,16 @@ _Static_assert(atomic_is_lock_free((atomic_short*)0), "");
 _Static_assert(atomic_is_lock_free((atomic_int*)0), "");
 _Static_assert(atomic_is_lock_free((atomic_long*)0), "");
 _Static_assert(atomic_is_lock_free(0 + (atomic_char*)0), "");
+
+_Static_assert(__atomic_always_lock_free(1, (void*)1), "");
+_Static_assert(__atomic_always_lock_free(1, (void*)-1), "");
+_Static_assert(!__atomic_always_lock_free(4, (void*)2), "");
+_Static_assert(!__atomic_always_lock_free(4, (void*)-2), "");
+_Static_assert(__atomic_always_lock_free(4, (void*)4), "");
+_Static_assert(__atomic_always_lock_free(4, (void*)-4), "");
+
+_Static_assert(__atomic_always_lock_free(1, "string"), "");
+_Static_assert(!__atomic_always_lock_free(2, "string"), "");
+_Static_assert(__atomic_always_lock_free(2, (int[2]){}), "");
+void dummyfn();
+_Static_assert(__atomic_always_lock_free(2, dummyfn) || 1, "");
diff --git a/clang/test/AST/Interp/builtin-constant-p.cpp b/clang/test/AST/Interp/builtin-constant-p.cpp
new file mode 100644
index 0000000000000..0d222d1c96277
--- /dev/null
+++ b/clang/test/AST/Interp/builtin-constant-p.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+
+static_assert(__builtin_constant_p(12), "");
+static_assert(__builtin_constant_p(1.0), "");
+
+constexpr int I = 100;
+static_assert(__builtin_constant_p(I), "");
+static_assert(__builtin_constant_p(I + 10), "");
+static_assert(__builtin_constant_p(I + 10.0), "");
+static_assert(__builtin_constant_p(nullptr), "");
+static_assert(__builtin_constant_p(&I), ""); // both-error {{failed due to requirement}}
+static_assert(__builtin_constant_p((void)I), ""); // both-error {{failed due to requirement}}
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index a74b68bb9d89b..9b2b20773be58 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -31,3 +31,8 @@ constexpr bool assume() {
   return true;
 }
 static_assert(assume(), "");
+
+void test_builtin_os_log(void *buf, int i, const char *data) {
+  constexpr int len = __builtin_os_log_format_buffer_size("%d %{public}s %{private}.16P", i, data, data);
+  static_assert(len > 0, "Expect len > 0");
+}
diff --git a/clang/test/AST/Interp/codegen.cpp b/clang/test/AST/Interp/codegen.cpp
new file mode 100644
index 0000000000000..a5583d953d234
--- /dev/null
+++ b/clang/test/AST/Interp/codegen.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s
+
+
+int arr[2];
+// CHECK: @pastEnd = constant ptr getelementptr (i8, ptr @arr, i64 8)
+int &pastEnd = arr[2];
+
+// CHECK: @F = constant ptr @arr, align 8
+int &F = arr[0];
+
+struct S {
+  int a;
+  float c[3];
+};
+
+// CHECK: @s = global %struct.S zeroinitializer, align 4
+S s;
+// CHECK: @sp = constant ptr getelementptr (i8, ptr @s, i64 16), align 8
+float &sp = s.c[3];
+
+
+namespace BaseClassOffsets {
+  struct A { int a; };
+  struct B { int b; };
+  struct C : A, B { int c; };
+
+  extern C c;
+  // CHECK: @_ZN16BaseClassOffsets1aE = global ptr @_ZN16BaseClassOffsets1cE, align 8
+  A* a = &c;
+  // CHECK: @_ZN16BaseClassOffsets1bE = global ptr getelementptr (i8, ptr @_ZN16BaseClassOffsets1cE, i64 4), align 8
+  B* b = &c;
+}
diff --git a/clang/test/AST/Interp/constexpr-subobj-initialization.cpp b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
index 4976b165468bd..1a35994944190 100644
--- a/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
+++ b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
@@ -5,8 +5,6 @@
 /// Differences:
 ///   1) The type of the uninitialized base class is printed WITH the namespace,
 ///      i.e. 'baseclass_uninit::DelBase' instead of just 'DelBase'.
-///   2) The location is not the base specifier declaration, but the call site
-///      of the constructor.
 
 
 namespace baseclass_uninit {
@@ -14,33 +12,29 @@ struct DelBase {
   constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
 };
 
-struct Foo : DelBase {
+struct Foo : DelBase { // expected-note 2{{constructor of base class 'baseclass_uninit::DelBase' is not called}}
   constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
 };
-constexpr Foo f; // expected-error {{must be initialized by a constant expression}} \
-                 // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+constexpr Foo f; // expected-error {{must be initialized by a constant expression}}
 
 struct Bar : Foo {
   constexpr Bar() {};
 };
-constexpr Bar bar; // expected-error {{must be initialized by a constant expression}} \
-                   // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+constexpr Bar bar; // expected-error {{must be initialized by a constant expression}}
 
 struct Base {};
-struct A : Base {
+struct A : Base { // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
   constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
 };
 
-constexpr A a; // expected-error {{must be initialized by a constant expression}} \
-               // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+constexpr A a; // expected-error {{must be initialized by a constant expression}}
 
 
-struct B : Base {
+struct B : Base { // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
   constexpr B() : {} // expected-error {{expected class member or base class name}}
 };
 
-constexpr B b; // expected-error {{must be initialized by a constant expression}} \
-               // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+constexpr B b; // expected-error {{must be initialized by a constant expression}}
 } // namespace baseclass_uninit
 
 
diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp
index 82b2727bbadbb..cf2dfba079ef7 100644
--- a/clang/test/AST/Interp/cxx11.cpp
+++ b/clang/test/AST/Interp/cxx11.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=both,expected -std=c++11 %s
-// RUN: %clang_cc1 -verify=both,ref -std=c++11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=both,expected -std=c++11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify=both,ref -std=c++11 %s
 
 namespace IntOrEnum {
   const int k = 0;
@@ -62,3 +62,101 @@ namespace ReferenceToConst {
     }
   };
 }
+
+
+
+namespace GH50055 {
+// Enums without fixed underlying type
+enum E1 {e11=-4, e12=4};
+enum E2 {e21=0, e22=4};
+enum E3 {e31=-4, e32=1024};
+enum E4 {e41=0};
+// Empty but as-if it had a single enumerator with value 0
+enum EEmpty {};
+
+// Enum with fixed underlying type because the underlying type is explicitly specified
+enum EFixed : int {efixed1=-4, efixed2=4};
+// Enum with fixed underlying type because it is scoped
+enum class EScoped {escoped1=-4, escoped2=4};
+
+enum EMaxInt {emaxint1=-1, emaxint2=__INT_MAX__};
+
+enum NumberType {};
+
+E2 testDefaultArgForParam(E2 e2Param = (E2)-1) { // ok, not a constant expression context
+  E2 e2LocalInit = e2Param; // ok, not a constant expression context
+  return e2LocalInit;
+}
+
+// #include <enum-constexpr-conversion-system-header.h>
+
+void testValueInRangeOfEnumerationValues() {
+  constexpr E1 x1 = static_cast<E1>(-8);
+  constexpr E1 x2 = static_cast<E1>(8);
+  // both-error@-1 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}}
+  E1 x2b = static_cast<E1>(8); // ok, not a constant expression context
+
+  constexpr E2 x3 = static_cast<E2>(-8);
+  // both-error@-1 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}}
+  constexpr E2 x4 = static_cast<E2>(0);
+  constexpr E2 x5 = static_cast<E2>(8);
+  // both-error@-1 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}}
+
+  constexpr E3 x6 = static_cast<E3>(-2048);
+  constexpr E3 x7 = static_cast<E3>(-8);
+  constexpr E3 x8 = static_cast<E3>(0);
+  constexpr E3 x9 = static_cast<E3>(8);
+  constexpr E3 x10 = static_cast<E3>(2048);
+  // both-error@-1 {{integer value 2048 is outside the valid range of values [-2048, 2047] for the enumeration type 'E3'}}
+
+  constexpr E4 x11 = static_cast<E4>(0);
+  constexpr E4 x12 = static_cast<E4>(1);
+  constexpr E4 x13 = static_cast<E4>(2);
+  // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}}
+
+  constexpr EEmpty x14 = static_cast<EEmpty>(0);
+  constexpr EEmpty x15 = static_cast<EEmpty>(1);
+  constexpr EEmpty x16 = static_cast<EEmpty>(2);
+  // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}}
+
+  constexpr EFixed x17 = static_cast<EFixed>(100);
+  constexpr EScoped x18 = static_cast<EScoped>(100);
+
+  constexpr EMaxInt x19 = static_cast<EMaxInt>(__INT_MAX__-1);
+  constexpr EMaxInt x20 = static_cast<EMaxInt>((long)__INT_MAX__+1);
+  // both-error@-1 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}}
+
+  const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context
+}
+
+template<class T, unsigned size> struct Bitfield {
+  static constexpr T max = static_cast<T>((1 << size) - 1); // #enum
+};
+
+void testValueInRangeOfEnumerationValuesViaTemplate() {
+  Bitfield<E2, 3> good;
+  Bitfield<E2, 4> bad; // both-error@#enum {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}}
+}
+
+enum SortOrder {
+  AscendingOrder,
+  DescendingOrder
+};
+
+class A {
+  static void f(SortOrder order);
+};
+
+void A::f(SortOrder order) {
+  if (order == SortOrder(-1)) // ok, not a constant expression context
+    return;
+}
+}
+
+namespace FinalLtorDiags {
+  template<int*> struct A {}; // both-note {{template parameter is declared here}}
+  int k;
+  int *q = &k; // both-note {{declared here}}
+  A<q> c; // both-error {{non-type template argument of type 'int *' is not a constant expression}} \
+          // both-note {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
+}
diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp
index 2faacbbf70fd7..da80454b7a820 100644
--- a/clang/test/AST/Interp/cxx20.cpp
+++ b/clang/test/AST/Interp/cxx20.cpp
@@ -827,3 +827,17 @@ namespace CheckingNullPtrForInitialization {
     return x;
   }
 }
+
+namespace VariadicCallOperator {
+  class F {
+  public:
+    constexpr void operator()(int a, int b, ...) {}
+  };
+  constexpr int foo() {
+    F f;
+
+    f(1,2, 3);
+    return 1;
+  }
+  constexpr int A = foo();
+}
diff --git a/clang/test/AST/Interp/cxx2a.cpp b/clang/test/AST/Interp/cxx2a.cpp
new file mode 100644
index 0000000000000..ad021b30cfd3c
--- /dev/null
+++ b/clang/test/AST/Interp/cxx2a.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=ref,both %s
+// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter
+
+template <unsigned N>
+struct S {
+  S() requires (N==1) = default;
+  S() requires (N==2) {} // both-note {{declared here}}
+  consteval S() requires (N==3) = default;
+};
+
+consteval int aConstevalFunction() { // both-error {{consteval function never produces a constant expression}}
+  S<2> s4; // both-note {{non-constexpr constructor 'S' cannot be used in a constant expression}}
+  return 0;
+}
+/// We're NOT calling the above function. The diagnostics should appear anyway.
+
+namespace Covariant {
+  struct A {
+    virtual constexpr char f() const { return 'Z'; }
+    char a = f();
+  };
+
+  struct D : A {};
+  struct Covariant1 {
+    D d;
+    virtual const A *f() const;
+  };
+
+  struct Covariant3 : Covariant1 {
+    constexpr virtual const D *f() const { return &this->d; }
+  };
+
+  constexpr Covariant3 cb;
+  constexpr const Covariant1 *cb1 = &cb;
+  static_assert(cb1->f()->a == 'Z');
+}
diff --git a/clang/test/AST/Interp/lifetimes.cpp b/clang/test/AST/Interp/lifetimes.cpp
index d47533ab547b3..9fca54fe11120 100644
--- a/clang/test/AST/Interp/lifetimes.cpp
+++ b/clang/test/AST/Interp/lifetimes.cpp
@@ -33,3 +33,30 @@ struct S {
 constexpr int k1 = S().t; // both-error {{must be initialized by a constant expression}} \
                           // ref-note {{in call to}} \
                           // expected-note {{in call to}}
+
+
+namespace MoveFnWorks {
+  template<typename T> constexpr T &&ref(T &&t) { return (T&&)t; }
+
+  struct Buf {};
+
+  struct A {
+    constexpr A(Buf &buf) : buf(buf) { }
+    Buf &buf;
+  };
+
+  constexpr bool dtor_calls_dtor() {
+    struct B {
+      A &&d;
+      constexpr B(Buf &buf) : d(ref(A(buf))) {}
+    };
+
+    Buf buf;
+    {
+      B b(buf);
+    }
+
+    return true;
+  }
+  static_assert(dtor_calls_dtor(), "");
+}
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index 9cd65462a0af3..815fb67b9bbfc 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -1214,6 +1214,10 @@ namespace StmtExprs {
     return 76;
   }
   static_assert(foo() == 76, "");
+
+  namespace CrossFuncLabelDiff {
+    constexpr long a(bool x) { return x ? 0 : (intptr_t)&&lbl + (0 && ({lbl: 0;})); }
+  }
 }
 #endif
 
@@ -1303,3 +1307,15 @@ namespace VolatileReads {
   static_assert(b, ""); // both-error {{not an integral constant expression}} \
                         // both-note {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}}
 }
+#if __cplusplus >= 201703L
+namespace {
+  struct C {
+    int x;
+  };
+
+  template <const C *p> void f() {
+    const auto &[c] = *p;
+    &c; // both-warning {{expression result unused}}
+  }
+}
+#endif
diff --git a/clang/test/AST/Interp/new-delete.cpp b/clang/test/AST/Interp/new-delete.cpp
index 04ce3ae5f6637..7a85def784920 100644
--- a/clang/test/AST/Interp/new-delete.cpp
+++ b/clang/test/AST/Interp/new-delete.cpp
@@ -476,7 +476,80 @@ constexpr Sp ss[] = {Sp{new int{154}}}; // both-error {{must be initialized by a
                                         // both-note {{pointer to heap-allocated object}} \
                                         // both-note {{allocation performed here}}
 
+namespace DeleteRunsDtors {
+  struct InnerFoo {
+    int *mem;
+    constexpr ~InnerFoo() {
+      delete mem;
+    }
+  };
+
+  struct Foo {
+    int *a;
+    InnerFoo IF;
+
+    constexpr Foo() {
+      a = new int(13);
+      IF.mem = new int(100);
+    }
+    constexpr ~Foo() { delete a; }
+  };
+
+  constexpr int abc() {
+    Foo *F = new Foo();
+    int n = *F->a;
+    delete F;
 
+    return n;
+  }
+  static_assert(abc() == 13);
+
+  constexpr int abc2() {
+    Foo *f = new Foo[3];
+
+    delete[] f;
+
+    return 1;
+  }
+  static_assert(abc2() == 1);
+}
+
+/// FIXME: There is a slight difference in diagnostics here, because we don't
+/// create a new frame when we delete record fields or bases at all.
+namespace FaultyDtorCalledByDelete {
+  struct InnerFoo {
+    int *mem;
+    constexpr ~InnerFoo() {
+      if (mem) {
+        (void)(1/0); // both-warning {{division by zero is undefined}} \
+                     // both-note {{division by zero}}
+      }
+      delete mem;
+    }
+  };
+
+  struct Foo {
+    int *a;
+    InnerFoo IF;
+
+    constexpr Foo() {
+      a = new int(13);
+      IF.mem = new int(100);
+    }
+    constexpr ~Foo() { delete a; }
+  };
+
+  constexpr int abc() {
+    Foo *F = new Foo();
+    int n = *F->a;
+    delete F; // both-note {{in call to}} \
+              // ref-note {{in call to}}
+
+    return n;
+  }
+  static_assert(abc() == 13); // both-error {{not an integral constant expression}} \
+                              // both-note {{in call to 'abc()'}}
+}
 
 
 #else
@@ -487,4 +560,9 @@ constexpr int a() { // both-error {{never produces a constant expression}}
 }
 static_assert(a() == 1, ""); // both-error {{not an integral constant expression}} \
                              // both-note {{in call to 'a()'}}
+
+
+static_assert(true ? *new int : 4, ""); // both-error {{expression is not an integral constant expression}} \
+                                        // both-note {{read of uninitialized object is not allowed in a constant expression}}
+
 #endif
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 2fc88a0b1df6a..479c0487fecae 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1537,3 +1537,42 @@ namespace BitSet {
     Bitset()
   };
 }
+
+namespace ArrayInitChain {
+  struct StringLiteral {
+    const char *S;
+  };
+
+  struct CustomOperandVal {
+    StringLiteral Str;
+    unsigned Width;
+    unsigned Mask = Width + 1;
+  };
+
+  constexpr CustomOperandVal A[] = {
+    {},
+    {{"depctr_hold_cnt"},  12,   13},
+  };
+  static_assert(A[0].Str.S == nullptr, "");
+  static_assert(A[0].Width == 0, "");
+  static_assert(A[0].Mask == 1, "");
+
+  static_assert(A[1].Width == 12, "");
+  static_assert(A[1].Mask == 13, "");
+}
+
+#if __cplusplus >= 202002L
+namespace ctorOverrider {
+  // Ensure that we pick the right final overrider during construction.
+  struct A {
+    virtual constexpr char f() const { return 'A'; }
+    char a = f();
+  };
+
+  struct Covariant1 {
+    A d;
+  };
+
+  constexpr Covariant1 cb;
+}
+#endif
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index a088c0a7b0272..b1631f7822ce0 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -32,22 +32,24 @@ Out2<double>::AInner t(1.0);
 // CHECK-NEXT: |     | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
 // CHECK-NEXT: |     | | |-TemplateArgument type 'int'
 // CHECK-NEXT: |     | | | `-BuiltinType {{.*}} 'int'
-// CHECK-NEXT: |     | | `-TemplateArgument type 'type-parameter-1-0'
-// CHECK-NEXT: |     | |   `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
+// CHECK-NEXT: |     | | `-TemplateArgument type 'Y':'type-parameter-1-0'
+// CHECK-NEXT: |     | |   `-TemplateTypeParmType {{.*}} 'Y' dependent depth 1 index 0
+// CHECK-NEXT: |     | |     `-TemplateTypeParm {{.*}} 'Y'
 // CHECK-NEXT: |     | `-TypeTraitExpr {{.*}} 'bool' __is_deducible
 // CHECK-NEXT: |     |   |-DeducedTemplateSpecializationType {{.*}} 'Out2<double>::AInner' dependent
 // CHECK-NEXT: |     |   | `-name: 'Out2<double>::AInner'
 // CHECK-NEXT: |     |   |   `-TypeAliasTemplateDecl {{.+}} AInner{{$}}
-// CHECK-NEXT: |     |   `-ElaboratedType {{.*}} 'Inner<type-parameter-1-0>' sugar dependent
-// CHECK-NEXT: |     |     `-TemplateSpecializationType {{.*}} 'Inner<type-parameter-1-0>' dependent
+// CHECK-NEXT: |     |   `-ElaboratedType {{.*}} 'Inner<Y>' sugar dependent
+// CHECK-NEXT: |     |     `-TemplateSpecializationType {{.*}} 'Inner<Y>' dependent
 // CHECK-NEXT: |     |       |-name: 'Inner':'Out<int>::Inner' qualified
 // CHECK-NEXT: |     |       | `-ClassTemplateDecl {{.+}} Inner{{$}}
-// CHECK-NEXT: |     |       `-TemplateArgument type 'type-parameter-1-0'
-// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'type-parameter-1-0'
+// CHECK-NEXT: |     |       `-TemplateArgument type 'Y'
+// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'Y'
 // CHECK-NEXT: |     |           |-FunctionTemplate {{.*}} '<deduction guide for Inner>'
-// CHECK-NEXT: |     |           `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
-// CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (type-parameter-0-0) -> Inner<type-parameter-0-0>'
-// CHECK-NEXT: |     | `-ParmVarDecl {{.*}} 'type-parameter-0-0'
+// CHECK-NEXT: |     |           `-TemplateTypeParmType {{.*}} 'Y' dependent depth 1 index 0
+// CHECK-NEXT: |     |             `-TemplateTypeParm {{.*}} 'Y'
+// CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (Y) -> Inner<Y>'
+// CHECK-NEXT: |     | `-ParmVarDecl {{.*}} 'Y'
 // CHECK-NEXT: |     `-CXXDeductionGuideDecl {{.*}} used <deduction guide for AInner> 'auto (double) -> Inner<double>' implicit_instantiation
 // CHECK-NEXT: |       |-TemplateArgument type 'double'
 // CHECK-NEXT: |       | `-BuiltinType {{.*}} 'double'
@@ -77,8 +79,8 @@ AFoo3 afoo3{0, 1};
 // CHECK-NEXT: | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
 // CHECK-NEXT: | | |-TemplateArgument type 'int'
 // CHECK-NEXT: | | | `-BuiltinType {{.*}} 'int'
-// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-1'
-// CHECK-NEXT: | |   `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
+// CHECK-NEXT: | | `-TemplateArgument type 'V'
+// CHECK-NEXT: | |   `-TemplateTypeParmType {{.*}} 'V' dependent depth 0 index 1
 
 template <typename... T1>
 struct Foo {
@@ -88,16 +90,16 @@ struct Foo {
 template <typename...T2>
 using AFoo = Foo<T2...>;
 AFoo a(1, 2);
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (type-parameter-0-0...) -> Foo<type-parameter-0-0...>'
-// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'type-parameter-0-0...' pack
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (T2...) -> Foo<T2...>'
+// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'T2...' pack
 // CHECK-NEXT: | `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for AFoo> 'auto (int, int) -> Foo<int, int>' implicit_instantiation
 
 template <typename T>
 using BFoo = Foo<T, T>;
 BFoo b2(1.0, 2.0);
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for BFoo> 'auto (type-parameter-0-0, type-parameter-0-0) -> Foo<type-parameter-0-0, type-parameter-0-0>'
-// CHECK-NEXT: | | |-ParmVarDecl {{.*}} 'type-parameter-0-0'
-// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'type-parameter-0-0'
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for BFoo> 'auto (T, T) -> Foo<T, T>'
+// CHECK-NEXT: | | |-ParmVarDecl {{.*}} 'T'
+// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'T'
 // CHECK-NEXT: | `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for BFoo> 'auto (double, double) -> Foo<double, double>' implicit_instantiation
 
 namespace GH90209 {
diff --git a/clang/test/AST/ast-dump-openmp-for-simd.c b/clang/test/AST/ast-dump-openmp-for-simd.c
index 5dcd9175d4998..06ae226ccb17d 100644
--- a/clang/test/AST/ast-dump-openmp-for-simd.c
+++ b/clang/test/AST/ast-dump-openmp-for-simd.c
@@ -41,7 +41,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | `-CompoundStmt {{.*}} <col:22, line:7:1>
 // CHECK-NEXT: |   `-OMPForSimdDirective {{.*}} <line:4:1, col:21>
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:5:3, line:6:5>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:5:3, line:6:5>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:5:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -65,7 +65,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | `-CompoundStmt {{.*}} <col:29, line:14:1>
 // CHECK-NEXT: |   `-OMPForSimdDirective {{.*}} <line:10:1, col:21>
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:11:3, line:13:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:11:3, line:13:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:11:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -108,7 +108,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-value: Int 1
 // CHECK-NEXT: |     |   `-IntegerLiteral {{.*}} <col:31> 'int' 1
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:18:3, line:20:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:18:3, line:20:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:18:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -151,7 +151,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-value: Int 2
 // CHECK-NEXT: |     |   `-IntegerLiteral {{.*}} <col:31> 'int' 2
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:25:3, line:27:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:25:3, line:27:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:25:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -195,7 +195,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT:       | |-value: Int 2
 // CHECK-NEXT:       |   `-IntegerLiteral {{.*}} <col:31> 'int' 2
 // CHECK-NEXT:       `-CapturedStmt {{.*}} <line:32:3, line:35:9>
-// CHECK-NEXT:         |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT:         |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT:         | |-ForStmt {{.*}} <line:32:3, line:35:9>
 // CHECK-NEXT:         | | |-DeclStmt {{.*}} <line:32:8, col:17>
 // CHECK-NEXT:         | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
diff --git a/clang/test/AST/ast-dump-openmp-simd.c b/clang/test/AST/ast-dump-openmp-simd.c
index 3ccddd426d741..cffbb9065f1d6 100644
--- a/clang/test/AST/ast-dump-openmp-simd.c
+++ b/clang/test/AST/ast-dump-openmp-simd.c
@@ -41,7 +41,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | `-CompoundStmt {{.*}} <col:22, line:7:1>
 // CHECK-NEXT: |   `-OMPSimdDirective {{.*}} <line:4:1, col:17>
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:5:3, line:6:5>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:5:3, line:6:5>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:5:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -65,7 +65,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | `-CompoundStmt {{.*}} <col:29, line:14:1>
 // CHECK-NEXT: |   `-OMPSimdDirective {{.*}} <line:10:1, col:17>
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:11:3, line:13:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:11:3, line:13:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:11:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -108,7 +108,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-value: Int 1
 // CHECK-NEXT: |     |   `-IntegerLiteral {{.*}} <col:27> 'int' 1
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:18:3, line:20:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:18:3, line:20:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:18:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -151,7 +151,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-value: Int 2
 // CHECK-NEXT: |     |   `-IntegerLiteral {{.*}} <col:27> 'int' 2
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:25:3, line:27:7>
-// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |       | |-ForStmt {{.*}} <line:25:3, line:27:7>
 // CHECK-NEXT: |       | | |-DeclStmt {{.*}} <line:25:8, col:17>
 // CHECK-NEXT: |       | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -195,7 +195,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT:       | |-value: Int 2
 // CHECK-NEXT:       |   `-IntegerLiteral {{.*}} <col:27> 'int' 2
 // CHECK-NEXT:       `-CapturedStmt {{.*}} <line:32:3, line:35:9>
-// CHECK-NEXT:         |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT:         |-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT:         | |-ForStmt {{.*}} <line:32:3, line:35:9>
 // CHECK-NEXT:         | | |-DeclStmt {{.*}} <line:32:8, col:17>
 // CHECK-NEXT:         | | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
diff --git a/clang/test/AST/ast-dump-openmp-taskloop-simd.c b/clang/test/AST/ast-dump-openmp-taskloop-simd.c
index 25f9cd30ad1c1..de70ecdefd16a 100644
--- a/clang/test/AST/ast-dump-openmp-taskloop-simd.c
+++ b/clang/test/AST/ast-dump-openmp-taskloop-simd.c
@@ -43,7 +43,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     |-OMPFirstprivateClause {{.*}} <<invalid sloc>> <implicit>
 // CHECK-NEXT: |     | `-DeclRefExpr {{.*}} <line:5:23> 'int' lvalue ParmVar {{.*}} 'x' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <col:3, line:6:5>
-// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |         |-ForStmt {{.*}} <line:5:3, line:6:5>
 // CHECK-NEXT: |         | |-DeclStmt {{.*}} <line:5:8, col:17>
 // CHECK-NEXT: |         | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -80,7 +80,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-DeclRefExpr {{.*}} <line:11:23> 'int' lvalue ParmVar {{.*}} 'x' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     | `-DeclRefExpr {{.*}} <line:12:25> 'int' lvalue ParmVar {{.*}} 'y' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:11:3, line:13:7>
-// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |         |-ForStmt {{.*}} <line:11:3, line:13:7>
 // CHECK-NEXT: |         | |-DeclStmt {{.*}} <line:11:8, col:17>
 // CHECK-NEXT: |         | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -135,7 +135,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-DeclRefExpr {{.*}} <line:18:23> 'int' lvalue ParmVar {{.*}} 'x' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     | `-DeclRefExpr {{.*}} <line:19:25> 'int' lvalue ParmVar {{.*}} 'y' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:18:3, line:20:7>
-// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |         |-ForStmt {{.*}} <line:18:3, line:20:7>
 // CHECK-NEXT: |         | |-DeclStmt {{.*}} <line:18:8, col:17>
 // CHECK-NEXT: |         | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -190,7 +190,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: |     | |-DeclRefExpr {{.*}} <line:25:23> 'int' lvalue ParmVar {{.*}} 'x' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     | `-DeclRefExpr {{.*}} <line:26:25> 'int' lvalue ParmVar {{.*}} 'y' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT: |     `-CapturedStmt {{.*}} <line:25:3, line:27:7>
-// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT: |         |-ForStmt {{.*}} <line:25:3, line:27:7>
 // CHECK-NEXT: |         | |-DeclStmt {{.*}} <line:25:8, col:17>
 // CHECK-NEXT: |         | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
@@ -247,7 +247,7 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT:       | |-DeclRefExpr {{.*}} <line:33:25> 'int' lvalue ParmVar {{.*}} 'y' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT:       | `-DeclRefExpr {{.*}} <line:34:27> 'int' lvalue ParmVar {{.*}} 'z' 'int' refers_to_enclosing_variable_or_capture
 // CHECK-NEXT:       `-CapturedStmt {{.*}} <line:32:3, line:35:9>
-// CHECK-NEXT:         `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT:         `-CapturedDecl {{.*}} <<invalid sloc>> <invalid sloc> nothrow
 // CHECK-NEXT:           |-ForStmt {{.*}} <line:32:3, line:35:9>
 // CHECK-NEXT:           | |-DeclStmt {{.*}} <line:32:8, col:17>
 // CHECK-NEXT:           | | `-VarDecl {{.*}} <col:8, col:16> col:12 used i 'int' cinit
diff --git a/clang/test/AST/ast-dump-ptrauth-json.cpp b/clang/test/AST/ast-dump-ptrauth-json.cpp
new file mode 100644
index 0000000000000..125cda0cff53a
--- /dev/null
+++ b/clang/test/AST/ast-dump-ptrauth-json.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -std=c++11 -ast-dump=json %s | FileCheck %s
+
+// CHECK: "name": "__builtin_ptrauth_type_discriminator",
+
+int d = __builtin_ptrauth_type_discriminator(int());
diff --git a/clang/test/AST/attr-print-emit.cpp b/clang/test/AST/attr-print-emit.cpp
index 8c8a2b2080599..d8e62ed5f6cd1 100644
--- a/clang/test/AST/attr-print-emit.cpp
+++ b/clang/test/AST/attr-print-emit.cpp
@@ -32,8 +32,8 @@ int *aa(int i) __attribute__((alloc_align(1)));
 void ownt(int *, int *) __attribute__((ownership_takes(foo, 1, 2)));
 // CHECK: void ownh(int *, int *) __attribute__((ownership_holds(foo, 1, 2)));
 void ownh(int *, int *) __attribute__((ownership_holds(foo, 1, 2)));
-// CHECK: void ownr(int) __attribute__((ownership_returns(foo, 1)));
-void ownr(int) __attribute__((ownership_returns(foo, 1)));
+// CHECK: void *ownr(int) __attribute__((ownership_returns(foo, 1)));
+void *ownr(int) __attribute__((ownership_returns(foo, 1)));
 
 // CHECK: void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 3, 2)));
 void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 3, 2)));
@@ -65,8 +65,8 @@ class C {
   void ownt(int *, int *) __attribute__((ownership_takes(foo, 2, 3)));
   // CHECK: void ownh(int *, int *) __attribute__((ownership_holds(foo, 2, 3)));
   void ownh(int *, int *) __attribute__((ownership_holds(foo, 2, 3)));
-  // CHECK: void ownr(int) __attribute__((ownership_returns(foo, 2)));
-  void ownr(int) __attribute__((ownership_returns(foo, 2)));
+  // CHECK: void *ownr(int) __attribute__((ownership_returns(foo, 2)));
+  void *ownr(int) __attribute__((ownership_returns(foo, 2)));
 
   // CHECK: void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 4, 3)));
   void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 4, 3)));
diff --git a/clang/test/AST/explicit-base-class-move-cntr.cpp b/clang/test/AST/explicit-base-class-move-cntr.cpp
new file mode 100644
index 0000000000000..808af2fc94336
--- /dev/null
+++ b/clang/test/AST/explicit-base-class-move-cntr.cpp
@@ -0,0 +1,171 @@
+// RUN: %clang_cc1 -ast-dump=json %s | FileCheck -strict-whitespace %s
+
+struct ExplicitBase {
+  explicit ExplicitBase(const char *) { }
+  ExplicitBase(const ExplicitBase &) {}
+  ExplicitBase(ExplicitBase &&) {}
+  ExplicitBase &operator=(const ExplicitBase &) { return *this; }
+  ExplicitBase &operator=(ExplicitBase &&) { return *this; }
+  ~ExplicitBase() { }
+};
+
+struct Derived1 : ExplicitBase {};
+
+Derived1 makeDerived1() {
+// CHECK:  "kind": "FunctionDecl",
+// CHECK:  "name": "makeDerived1",
+
+// CHECK:    "kind": "CompoundStmt",
+
+// CHECK:      "kind": "ReturnStmt",
+// CHECK:        "kind": "ExprWithCleanups",
+// CHECK:        "type": {
+// CHECK-NEXT:     "qualType": "Derived1"
+// CHECK-NEXT:   },
+
+// CHECK:          "kind": "CXXFunctionalCastExpr",
+// CHECK:          "type": {
+// CHECK-NEXT:       "qualType": "Derived1"
+// CHECK-NEXT:     },
+// CHECK-NEXT:     "valueCategory": "prvalue",
+// CHECK-NEXT:     "castKind": "NoOp",
+
+// CHECK:            "kind": "CXXBindTemporaryExpr",
+// CHECK:            "type": {
+// CHECK-NEXT:         "qualType": "Derived1"
+// CHECK-NEXT:       },
+// CHECK-NEXT:       "valueCategory": "prvalue",
+
+// CHECK:              "kind": "InitListExpr",
+// CHECK:              "type": {
+// CHECK-NEXT:           "qualType": "Derived1"
+// CHECK-NEXT:         },
+// CHECK-NEXT:         "valueCategory": "prvalue",
+
+// CHECK:                "kind": "CXXConstructExpr",
+// CHECK:                "type": {
+// CHECK-NEXT:             "qualType": "ExplicitBase"
+// CHECK-NEXT:           },
+// CHECK-NEXT:           "valueCategory": "prvalue",
+// CHECK-NEXT:           "ctorType": {
+// CHECK-NEXT:             "qualType": "void (ExplicitBase &&)"
+// CHECK-NEXT:           },
+// CHECK-NEXT:           "hadMultipleCandidates": true,
+// CHECK-NEXT:           "constructionKind": "non-virtual base",
+
+// CHECK:                  "kind": "MaterializeTemporaryExpr",
+// CHECK:                  "type": {
+// CHECK-NEXT:               "qualType": "ExplicitBase"
+// CHECK-NEXT:             },
+// CHECK-NEXT:             "valueCategory": "xvalue",
+// CHECK-NEXT:             "storageDuration": "full expression",
+
+// CHECK:                    "kind": "CXXBindTemporaryExpr",
+// CHECK:                    "type": {
+// CHECK-NEXT:                 "qualType": "ExplicitBase"
+// CHECK-NEXT:               },
+// CHECK-NEXT:               "valueCategory": "prvalue",
+
+// CHECK:                      "kind": "CXXTemporaryObjectExpr",
+// CHECK:                      "type": {
+// CHECK-NEXT:                   "qualType": "ExplicitBase"
+// CHECK-NEXT:                 },
+// CHECK-NEXT:                 "valueCategory": "prvalue",
+// CHECK-NEXT:                 "ctorType": {
+// CHECK-NEXT:                   "qualType": "void (const char *)"
+// CHECK-NEXT:                 },
+// CHECK-NEXT:                 "list": true,
+// CHECK-NEXT:                 "hadMultipleCandidates": true,
+// CHECK-NEXT:                 "constructionKind": "complete",
+
+// CHECK:                        "kind": "ImplicitCastExpr",
+// CHECK:                        "type": {
+// CHECK-NEXT:                     "qualType": "const char *"
+// CHECK-NEXT:                   },
+// CHECK-NEXT:                   "valueCategory": "prvalue",
+// CHECK-NEXT:                   "castKind": "ArrayToPointerDecay",
+
+// CHECK:                          "kind": "StringLiteral",
+// CHECK:                          "type": {
+// CHECK-NEXT:                       "qualType": "const char[10]"
+// CHECK-NEXT:                     },
+// CHECK-NEXT:                     "valueCategory": "lvalue",
+// CHECK-NEXT:                     "value": "\"Move Ctor\""
+  return Derived1{ExplicitBase{"Move Ctor"}};
+}
+
+struct ImplicitBase {
+  ImplicitBase(const char *) { }
+  ImplicitBase(const ImplicitBase &) {}
+  ImplicitBase(ImplicitBase &&) {}
+  ImplicitBase &operator=(const ImplicitBase &) { return *this; }
+  ImplicitBase &operator=(ImplicitBase &&) { return *this; }
+  ~ImplicitBase() { }
+};
+
+struct Derived2 : ImplicitBase {};
+
+Derived2 makeDerived2() {
+// CHECK:  "kind": "FunctionDecl",
+// CHECK:  "name": "makeDerived2",
+
+// CHECK:    "kind": "CompoundStmt",
+
+// CHECK:      "kind": "ReturnStmt",
+
+// CHECK:        "kind": "ExprWithCleanups",
+// CHECK:        "type": {
+// CHECK-NEXT:     "qualType": "Derived2"
+// CHECK-NEXT:   },
+// CHECK-NEXT:   "valueCategory": "prvalue",
+// CHECK-NEXT:   "cleanupsHaveSideEffects": true,
+
+// CHECK:          "kind": "CXXFunctionalCastExpr",
+// CHECK:          "type": {
+// CHECK-NEXT:       "qualType": "Derived2"
+// CHECK-NEXT:     },
+// CHECK-NEXT:     "valueCategory": "prvalue",
+// CHECK-NEXT:     "castKind": "NoOp",
+
+// CHECK:            "kind": "CXXBindTemporaryExpr",
+// CHECK:            "type": {
+// CHECK-NEXT:         "qualType": "Derived2"
+// CHECK-NEXT:       },
+// CHECK-NEXT:       "valueCategory": "prvalue",
+
+// CHECK:              "kind": "InitListExpr",
+// CHECK:              "type": {
+// CHECK-NEXT:           "qualType": "Derived2"
+// CHECK-NEXT:         },
+// CHECK-NEXT:         "valueCategory": "prvalue",
+
+// CHECK:                "kind": "CXXConstructExpr",
+// CHECK:                "type": {
+// CHECK-NEXT:             "qualType": "ImplicitBase"
+// CHECK-NEXT:           },
+// CHECK-NEXT:           "valueCategory": "prvalue",
+// CHECK-NEXT:           "ctorType": {
+// CHECK-NEXT:             "qualType": "void (const char *)"
+// CHECK-NEXT:           },
+// CHECK-NEXT:           "list": true,
+// CHECK-NEXT:           "hadMultipleCandidates": true,
+// CHECK-NEXT:           "constructionKind": "non-virtual base",
+
+// CHECK:                  "kind": "ImplicitCastExpr",
+// CHECK:                  "type": {
+// CHECK-NEXT:               "qualType": "const char *"
+// CHECK-NEXT:             },
+// CHECK-NEXT:             "valueCategory": "prvalue",
+// CHECK-NEXT:             "castKind": "ArrayToPointerDecay",
+
+// CHECK:                    "kind": "StringLiteral",
+// CHECK:                    "type": {
+// CHECK-NEXT:                 "qualType": "const char[8]"
+// CHECK-NEXT:               },
+// CHECK-NEXT:               "valueCategory": "lvalue",
+// CHECK-NEXT:               "value": "\"No Ctor\""
+  return Derived2{{"No Ctor"}};
+}
+
+// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py
+// using --filters=FunctionDecl,CompoundStmt,ReturnStmt,MaterializeTemporaryExpr,CXXBindTemporaryExpr,CXXTemporaryObjectExpr,ImplicitCastExpr,StringLiteralStringLiteral
diff --git a/clang/test/Analysis/Inputs/expected-plists/plist-macros.cpp.plist b/clang/test/Analysis/Inputs/expected-plists/plist-macros.cpp.plist
index 9981fc6a1f517..13645d47f8cdc 100644
--- a/clang/test/Analysis/Inputs/expected-plists/plist-macros.cpp.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/plist-macros.cpp.plist
@@ -130,12 +130,12 @@
      </array>
      <key>depth</key><integer>0</integer>
      <key>extended_message</key>
-     <string>Memory allocated by malloc() should be deallocated by free(), not &apos;delete&apos;</string>
+     <string>Memory allocated by &apos;malloc()&apos; should be deallocated by &apos;free()&apos;, not &apos;delete&apos;</string>
      <key>message</key>
-     <string>Memory allocated by malloc() should be deallocated by free(), not &apos;delete&apos;</string>
+     <string>Memory allocated by &apos;malloc()&apos; should be deallocated by &apos;free()&apos;, not &apos;delete&apos;</string>
     </dict>
    </array>
-   <key>description</key><string>Memory allocated by malloc() should be deallocated by free(), not &apos;delete&apos;</string>
+   <key>description</key><string>Memory allocated by &apos;malloc()&apos; should be deallocated by &apos;free()&apos;, not &apos;delete&apos;</string>
    <key>category</key><string>Memory error</string>
    <key>type</key><string>Bad deallocator</string>
    <key>check_name</key><string>unix.MismatchedDeallocator</string>
diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
index 29326ec1f9280..a379a47515668 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -263,11 +263,13 @@ namespace std {
   template< class T > struct remove_reference<T&>  {typedef T type;};
   template< class T > struct remove_reference<T&&> {typedef T type;};
 
-  template<class T> 
-  typename remove_reference<T>::type&& move(T&& a) {
-    typedef typename remove_reference<T>::type&& RvalRef;
-    return static_cast<RvalRef>(a);
-  }
+  template<typename T> typename remove_reference<T>::type&& move(T&& a);
+  template <typename T> T *__addressof(T &x);
+  template <typename T> T *addressof(T &x);
+  template <typename T> const T& as_const(T& x);
+  template <typename T> T&& forward(T&& x);
+  // FIXME: Declare forward_like
+  // FIXME: Declare move_if_noexcept
 
   template< class T >
   using remove_reference_t = typename remove_reference<T>::type;
@@ -754,7 +756,7 @@ namespace std {
     template<class InputIter, class OutputIter>
     OutputIter __copy(InputIter II, InputIter IE, OutputIter OI) {
       while (II != IE)
-        *OI++ = *II++;
+        *OI++ = *II++; // #system_header_simulator_cxx_std_copy_impl_loop
 
       return OI;
     }
diff --git a/clang/test/Analysis/Malloc+MismatchedDeallocator+NewDelete.cpp b/clang/test/Analysis/Malloc+MismatchedDeallocator+NewDelete.cpp
index b5e47b3355da3..6c20b4ba53dae 100644
--- a/clang/test/Analysis/Malloc+MismatchedDeallocator+NewDelete.cpp
+++ b/clang/test/Analysis/Malloc+MismatchedDeallocator+NewDelete.cpp
@@ -24,12 +24,12 @@ void testMallocUseAfterFree() {
 
 void testMallocBadFree() {
   int i;
-  free(&i); // expected-warning{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}}
+  free(&i); // expected-warning{{Argument to 'free()' is the address of the local variable 'i', which is not memory allocated by 'malloc()'}}
 }
 
 void testMallocOffsetFree() {
   int *p = (int *)malloc(sizeof(int));
-  free(++p); // expected-warning{{Argument to free() is offset by 4 bytes from the start of memory allocated by malloc()}}
+  free(++p); // expected-warning{{Argument to 'free()' is offset by 4 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 //-----------------------------------------------------------------
@@ -37,7 +37,7 @@ void testMallocOffsetFree() {
 //-----------------------------------------------------------------
 void testMismatchedDeallocator() {
   int *x = (int *)malloc(sizeof(int));
-  delete x; // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+  delete x; // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 //----------------------------------------------------------------
@@ -69,7 +69,7 @@ void testNewBadFree() {
 
 void testNewOffsetFree() {
   int *p = new int;
-  operator delete(++p); // expected-warning{{Argument to operator delete is offset by 4 bytes from the start of memory allocated by 'new'}}
+  operator delete(++p); // expected-warning{{Argument to 'operator delete' is offset by 4 bytes from the start of memory allocated by 'new'}}
 }
 
 //----------------------------------------------------------------
@@ -88,7 +88,7 @@ void testMismatchedChangePtrThroughCall() {
 void testMismatchedChangePointeeThroughCall() {
   int *p = (int*)malloc(sizeof(int)*4);
   changePointee(p);
-  delete p; // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testShouldReportDoubleFreeNotMismatched() {
diff --git a/clang/test/Analysis/MismatchedDeallocator-checker-test.mm b/clang/test/Analysis/MismatchedDeallocator-checker-test.mm
index 013d677e515cf..ef8b24ba8de32 100644
--- a/clang/test/Analysis/MismatchedDeallocator-checker-test.mm
+++ b/clang/test/Analysis/MismatchedDeallocator-checker-test.mm
@@ -14,6 +14,19 @@
 void free(void *);
 void __attribute((ownership_takes(malloc, 1))) my_free(void *);
 
+void __attribute((ownership_returns(malloc1))) *my_malloc1(size_t);
+void __attribute((ownership_takes(malloc1, 1))) my_free1(void *);
+
+void __attribute((ownership_returns(malloc2))) *my_malloc2(size_t);
+
+// The order of these declarations are important to verify that analisys still works even
+// if there are less specific declarations of the same functions
+void __attribute((ownership_returns(malloc3))) *my_malloc3(size_t);
+void *my_malloc3(size_t);
+
+void *my_malloc4(size_t);
+void __attribute((ownership_returns(malloc4))) *my_malloc4(size_t);
+
 //---------------------------------------------------------------
 // Test if an allocation function matches deallocation function
 //---------------------------------------------------------------
@@ -21,79 +34,114 @@
 //--------------- test malloc family
 void testMalloc1() {
   int *p = (int *)malloc(sizeof(int));
-  delete p; // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testMalloc2() {
   int *p = (int *)malloc(8);
   int *q = (int *)realloc(p, 16);
-  delete q; // expected-warning{{Memory allocated by realloc() should be deallocated by free(), not 'delete'}}
+  delete q; // expected-warning{{Memory allocated by 'realloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testMalloc3() {
   int *p = (int *)calloc(1, sizeof(int));
-  delete p; // expected-warning{{Memory allocated by calloc() should be deallocated by free(), not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'calloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testMalloc4(const char *s) {
   char *p = strdup(s);
-  delete p; // expected-warning{{Memory allocated by strdup() should be deallocated by free(), not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'strdup()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testMalloc5() {
   int *p = (int *)my_malloc(sizeof(int));
-  delete p; // expected-warning{{Memory allocated by my_malloc() should be deallocated by free(), not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'my_malloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void testMalloc6() {
   int *p = (int *)malloc(sizeof(int));
-  operator delete(p); // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not operator delete}}
+  operator delete(p); // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'operator delete'}}
 }
 
 void testMalloc7() {
   int *p = (int *)malloc(sizeof(int));
-  delete[] p; // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not 'delete[]'}}
+  delete[] p; // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete[]'}}
 }
 
 void testMalloc8() {
   int *p = (int *)malloc(sizeof(int));
-  operator delete[](p); // expected-warning{{Memory allocated by malloc() should be deallocated by free(), not operator delete[]}}
+  operator delete[](p); // expected-warning{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'operator delete[]'}}
+}
+
+void testMalloc9() {
+  int *p = (int *)my_malloc(sizeof(int));
+  my_free(p); // no warning
+}
+
+void testMalloc10() {
+  int *p = (int *)my_malloc1(sizeof(int));
+  my_free1(p); // no warning
+}
+
+void testMalloc11() {
+  int *p = (int *)my_malloc1(sizeof(int));
+  my_free(p); // expected-warning{{Memory allocated by 'my_malloc1()' should be deallocated by function that takes ownership of 'malloc1', not 'my_free()', which takes ownership of 'malloc'}}
+}
+
+void testMalloc12() {
+  int *p = (int *)my_malloc2(sizeof(int));
+  my_free1(p); // expected-warning{{Memory allocated by 'my_malloc2()' should be deallocated by function that takes ownership of 'malloc2', not 'my_free1()', which takes ownership of 'malloc1'}}
+}
+
+void testMalloc13() {
+  int *p = (int *)my_malloc1(sizeof(int));
+  free(p); // expected-warning{{Memory allocated by 'my_malloc1()' should be deallocated by function that takes ownership of 'malloc1', not 'free()'}}
+}
+
+void testMalloc14() {
+  int *p = (int *)my_malloc3(sizeof(int));
+  free(p); // expected-warning{{Memory allocated by 'my_malloc3()' should be deallocated by function that takes ownership of 'malloc3', not 'free()'}}
+}
+
+void testMalloc15() {
+  int *p = (int *)my_malloc4(sizeof(int));
+  free(p); // expected-warning{{Memory allocated by 'my_malloc4()' should be deallocated by function that takes ownership of 'malloc4', not 'free()'}}
 }
 
 void testAlloca() {
   int *p = (int *)__builtin_alloca(sizeof(int));
-  delete p; // expected-warning{{Memory allocated by alloca() should not be deallocated}}
+  delete p; // expected-warning{{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 //--------------- test new family
 void testNew1() {
   int *p = new int;
-  free(p); // expected-warning{{Memory allocated by 'new' should be deallocated by 'delete', not free()}}
+  free(p); // expected-warning{{Memory allocated by 'new' should be deallocated by 'delete', not 'free()'}}
 }
 
 void testNew2() {
   int *p = (int *)operator new(0);
-  free(p); // expected-warning{{Memory allocated by operator new should be deallocated by 'delete', not free()}}
+  free(p); // expected-warning{{Memory allocated by 'operator new' should be deallocated by 'delete', not 'free()'}}
 }
 
 void testNew3() {
   int *p = new int[1];
-  free(p); // expected-warning{{Memory allocated by 'new[]' should be deallocated by 'delete[]', not free()}}
+  free(p); // expected-warning{{Memory allocated by 'new[]' should be deallocated by 'delete[]', not 'free()'}}
 }
 
 void testNew4() {
   int *p = new int;
-  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by 'new' should be deallocated by 'delete', not realloc()}}
+  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by 'new' should be deallocated by 'delete', not 'realloc()'}}
 }
 
 void testNew5() {
   int *p = (int *)operator new(0);
-  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by operator new should be deallocated by 'delete', not realloc()}}
+  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by 'operator new' should be deallocated by 'delete', not 'realloc()'}}
 }
 
 void testNew6() {
   int *p = new int[1];
-  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by 'new[]' should be deallocated by 'delete[]', not realloc()}}
+  realloc(p, sizeof(long)); // expected-warning{{Memory allocated by 'new[]' should be deallocated by 'delete[]', not 'realloc()'}}
 }
 
 int *allocInt() {
@@ -106,7 +154,7 @@ void testNew7() {
 
 void testNew8() {
   int *p = (int *)operator new(0);
-  delete[] p; // expected-warning{{Memory allocated by operator new should be deallocated by 'delete', not 'delete[]'}}
+  delete[] p; // expected-warning{{Memory allocated by 'operator new' should be deallocated by 'delete', not 'delete[]'}}
 }
 
 int *allocIntArray(unsigned c) {
@@ -120,7 +168,7 @@ void testNew9() {
 
 void testNew10() {
   int *p = (int *)operator new[](0);
-  delete p; // expected-warning{{Memory allocated by operator new[] should be deallocated by 'delete[]', not 'delete'}}
+  delete p; // expected-warning{{Memory allocated by 'operator new[]' should be deallocated by 'delete[]', not 'delete'}}
 }
 
 void testNew11(NSUInteger dataLength) {
@@ -208,7 +256,7 @@ explicit SimpleSmartPointer(T *p = 0) : ptr(p) {}
   ~SimpleSmartPointer() {
     delete ptr;
     // expected-warning@-1 {{Memory allocated by 'new[]' should be deallocated by 'delete[]', not 'delete'}}
-    // expected-warning@-2 {{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+    // expected-warning@-2 {{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
   }
 };
 
diff --git a/clang/test/Analysis/NewDelete-intersections.mm b/clang/test/Analysis/NewDelete-intersections.mm
index 6f81034ee349f..9ac471600e8b9 100644
--- a/clang/test/Analysis/NewDelete-intersections.mm
+++ b/clang/test/Analysis/NewDelete-intersections.mm
@@ -44,7 +44,7 @@ void testMallocFreeNoWarn() {
 void testDeleteMalloced() {
   int *p1 = (int *)malloc(sizeof(int));
   delete p1;
-  // mismatch-warning@-1{{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+  // mismatch-warning@-1{{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
 
   int *p2 = (int *)__builtin_alloca(sizeof(int));
   delete p2; // no warn
@@ -59,13 +59,13 @@ void testUseZeroAllocatedMalloced() {
 void testFreeOpNew() {
   void *p = operator new(0);
   free(p);
-  // mismatch-warning@-1{{Memory allocated by operator new should be deallocated by 'delete', not free()}}
+  // mismatch-warning@-1{{Memory allocated by 'operator new' should be deallocated by 'delete', not 'free()'}}
 }
 
 void testFreeNewExpr() {
   int *p = new int;
   free(p);
-  // mismatch-warning@-1{{Memory allocated by 'new' should be deallocated by 'delete', not free()}}
+  // mismatch-warning@-1{{Memory allocated by 'new' should be deallocated by 'delete', not 'free()'}}
   free(p);
 }
 
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 2a4c40005a4dc..b8dbcdd7e55af 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -9,8 +9,6 @@
 // CHECK-NEXT: alpha.clone.CloneChecker:ReportNormalClones = true
 // CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false
 // CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtExec = 0x04
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtRead = 0x01
 // CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
 // CHECK-NEXT: apply-fixits = false
 // CHECK-NEXT: assume-controlled-environment = false
diff --git a/clang/test/Analysis/builtin-functions.cpp b/clang/test/Analysis/builtin-functions.cpp
index 8719193e405c4..f7bafabd23cbc 100644
--- a/clang/test/Analysis/builtin-functions.cpp
+++ b/clang/test/Analysis/builtin-functions.cpp
@@ -1,6 +1,16 @@
 // RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,debug.ExprInspection %s -std=c++11 -verify
 // RUN: %clang_analyze_cc1 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions -analyzer-checker=core,debug.ExprInspection %s -std=c++11 -verify
 
+#include "Inputs/system-header-simulator-cxx.h"
+
+namespace std {
+// Intentionally not using an "auto" return type here, as such must also have a definition.
+template <typename T, typename U> constexpr int&& forward_like(U&& x) noexcept;
+template <typename T> const T& move_if_noexcept(T& x) noexcept;
+} // namespace std
+
+template <typename T> void clang_analyzer_dump_ref(T&&);
+template <typename T> void clang_analyzer_dump_ptr(T*);
 void clang_analyzer_eval(bool);
 void clang_analyzer_warnIfReached();
 
@@ -8,6 +18,16 @@ void testAddressof(int x) {
   clang_analyzer_eval(&x == __builtin_addressof(x)); // expected-warning{{TRUE}}
 }
 
+void testStdBuiltinLikeFunctions(int x) {
+  clang_analyzer_dump_ptr(std::addressof(x));           // expected-warning{{&x}}
+  clang_analyzer_dump_ptr(std::__addressof(x));         // expected-warning{{&x}}
+  clang_analyzer_dump_ref(std::as_const(x));            // expected-warning{{&x}}
+  clang_analyzer_dump_ref(std::forward<int &>(x));      // expected-warning{{&x}}
+  clang_analyzer_dump_ref(std::forward_like<int &>(x)); // expected-warning{{&x}}
+  clang_analyzer_dump_ref(std::move(x));                // expected-warning{{&x}}
+  clang_analyzer_dump_ref(std::move_if_noexcept(x));    // expected-warning{{&x}}
+}
+
 void testSize() {
   struct {
     int x;
diff --git a/clang/test/Analysis/call-invalidation.cpp b/clang/test/Analysis/call-invalidation.cpp
index 727217f228b05..fb2b892b31a1f 100644
--- a/clang/test/Analysis/call-invalidation.cpp
+++ b/clang/test/Analysis/call-invalidation.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s
 
+template <class T> void clang_analyzer_dump(T);
 void clang_analyzer_eval(bool);
 
 void usePointer(int * const *);
@@ -165,3 +166,117 @@ void testMixedConstNonConstCalls() {
   useFirstNonConstSecondConst(&(s2.x), &(s2.y));
   clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
 }
+
+namespace std {
+class Opaque {
+public:
+  Opaque();
+  int nested_member;
+};
+} // namespace std
+
+struct StdWrappingOpaque {
+  std::Opaque o; // first member
+  int uninit;
+};
+struct StdWrappingOpaqueSwapped {
+  int uninit; // first member
+  std::Opaque o;
+};
+
+int testStdCtorDoesNotInvalidateParentObject() {
+  StdWrappingOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: std::Opaque::ctor might initialized this
+  int y = obj.uninit; // FIXME: We should have a garbage read here. Read the details.
+  // As the first member ("obj.o") is invalidated, a conjured default binding is bound
+  // to the offset 0 within cluster "obj", and this masks every uninitialized fields
+  // that follows. We need a better store with extents to fix this.
+  return x + y;
+}
+
+int testStdCtorDoesNotInvalidateParentObjectSwapped() {
+  StdWrappingOpaqueSwapped obj;
+  int x = obj.o.nested_member; // no-garbage: std::Opaque::ctor might initialized this
+  int y = obj.uninit; // expected-warning {{Assigned value is garbage or undefined}}
+  return x + y;
+}
+
+class UserProvidedOpaque {
+public:
+  UserProvidedOpaque(); // might reinterpret_cast(this)
+  int nested_member;
+};
+
+struct WrappingUserProvidedOpaque {
+  UserProvidedOpaque o; // first member
+  int uninit;
+};
+struct WrappingUserProvidedOpaqueSwapped {
+  int uninit; // first member
+  UserProvidedOpaque o;
+};
+
+int testUserProvidedCtorInvalidatesParentObject() {
+  WrappingUserProvidedOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: UserProvidedOpaque::ctor might initialized this
+  int y = obj.uninit; // no-garbage: UserProvidedOpaque::ctor might reinterpret_cast(this) and write to the "uninit" member.
+  return x + y;
+}
+
+int testUserProvidedCtorInvalidatesParentObjectSwapped() {
+  WrappingUserProvidedOpaqueSwapped obj;
+  int x = obj.o.nested_member; // no-garbage: same as above
+  int y = obj.uninit; // no-garbage: same as above
+  return x + y;
+}
+
+struct WrappingStdWrappingOpaqueOuterInits {
+  int first = 1;
+  std::Opaque second;
+  int third = 3;
+  WrappingStdWrappingOpaqueOuterInits() {
+    clang_analyzer_dump(first); // expected-warning {{1 S32b}}
+    clang_analyzer_dump(second.nested_member); // expected-warning {{derived_}}
+    clang_analyzer_dump(third); // expected-warning {{3 S32b}}
+  }
+};
+
+struct WrappingUserProvidedOpaqueOuterInits {
+  int first = 1; // Potentially overwritten by UserProvidedOpaque::ctor
+  UserProvidedOpaque second; // Invalidates the object so far.
+  int third = 3; // Happens after UserProvidedOpaque::ctor, thus preserved!
+  WrappingUserProvidedOpaqueOuterInits() {
+    clang_analyzer_dump(first); // expected-warning {{derived_}}
+    clang_analyzer_dump(second.nested_member); // expected-warning {{derived_}}
+    clang_analyzer_dump(third); // expected-warning {{3 S32b}}
+  }
+};
+
+extern "C++" {
+namespace std {
+inline namespace v1 {
+namespace custom_ranges {
+struct Fancy {
+struct iterator {
+struct Opaque {
+  Opaque();
+  int nested_member;
+}; // struct Opaque
+}; // struct iterator
+}; // struct Fancy
+} // namespace custom_ranges
+} // namespace v1
+} // namespace std
+} // extern "C++"
+
+struct StdWrappingFancyOpaque {
+  int uninit;
+  std::custom_ranges::Fancy::iterator::Opaque o;
+};
+
+int testNestedStdNamespacesAndRecords() {
+  StdWrappingFancyOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: ctor
+  int y = obj.uninit; // expected-warning {{Assigned value is garbage or undefined}}
+  return x + y;
+}
diff --git a/clang/test/Analysis/ctor-array.cpp b/clang/test/Analysis/ctor-array.cpp
index 053669cc2aada..49412ee5a68c7 100644
--- a/clang/test/Analysis/ctor-array.cpp
+++ b/clang/test/Analysis/ctor-array.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-disable-checker=cplusplus -analyzer-config c++-inlining=constructors -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-inlining=constructors -verify %s
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -119,16 +119,6 @@ struct s5 {
 };
 
 void g1(void) {
-  // FIXME: This test requires -analyzer-disable-checker=cplusplus,
-  // because of the checker's weird behaviour in case of arrays.
-  // E.g.:
-  //        s3 *arr = new s3[4];
-  //        s3 *arr2 = new (arr + 1) s3[1];
-  //                   ^~~~~~~~~~~~~~~~~~~
-  //                   warning: 12 bytes is possibly not enough
-  //                            for array allocation which requires
-  //                            4 bytes.
-
   s5::c = 0;
   s5 *arr = new s5[4];
   new (arr + 1) s5[3];
diff --git a/clang/test/Analysis/ctor.mm b/clang/test/Analysis/ctor.mm
index fb385833df9c7..6ac9050fc29f7 100644
--- a/clang/test/Analysis/ctor.mm
+++ b/clang/test/Analysis/ctor.mm
@@ -56,8 +56,6 @@ void testNonPODCopyConstructor() {
 namespace ConstructorVirtualCalls {
   class A {
   public:
-    int *out1, *out2, *out3;
-
     virtual int get() { return 1; }
 
     A(int *out1) {
diff --git a/clang/test/Analysis/diagnostics/explicit-suppression.cpp b/clang/test/Analysis/diagnostics/explicit-suppression.cpp
index 24586e37fe207..3a904dac1c0fe 100644
--- a/clang/test/Analysis/diagnostics/explicit-suppression.cpp
+++ b/clang/test/Analysis/diagnostics/explicit-suppression.cpp
@@ -19,6 +19,6 @@ class C {
 void testCopyNull(C *I, C *E) {
   std::copy(I, E, (C *)0);
 #ifndef SUPPRESSED
-  // expected-warning@../Inputs/system-header-simulator-cxx.h:757 {{Called C++ object pointer is null}}
+  // expected-warning@#system_header_simulator_cxx_std_copy_impl_loop {{Called C++ object pointer is null}}
 #endif
 }
diff --git a/clang/test/Analysis/free.c b/clang/test/Analysis/free.c
index 50c1efdfb1309..5fd956a4ce110 100644
--- a/clang/test/Analysis/free.c
+++ b/clang/test/Analysis/free.c
@@ -13,21 +13,21 @@ void *alloca(size_t);
 void t1 (void) {
   int a[] = { 1 };
   free(a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t2 (void) {
   int a = 1;
   free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t3 (void) {
   static int a[] = { 1 };
   free(a);
-  // expected-warning@-1{{Argument to free() is the address of the static variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the static variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
@@ -42,7 +42,7 @@ void t5 (void) {
 
 void t6 (void) {
   free((void*)1000);
-  // expected-warning@-1{{Argument to free() is a constant address (1000), which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a constant address (1000), which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object '(void *)1000'}}
 }
 
@@ -58,42 +58,42 @@ void t8 (char **x) {
 void t9 (void) {
 label:
   free(&&label);
-  // expected-warning@-1{{Argument to free() is the address of the label 'label', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the label 'label', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'label'}}
 }
 
 void t10 (void) {
   free((void*)&t10);
-  // expected-warning@-1{{Argument to free() is the address of the function 't10', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function 't10', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 't10'}}
 }
 
 void t11 (void) {
   char *p = (char*)alloca(2);
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t12 (void) {
   char *p = (char*)__builtin_alloca(2);
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t13 (void) {
   free(^{return;});
-  // expected-warning@-1{{Argument to free() is a block, which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a block, which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object: block expression}}
 }
 
 void t14 (char a) {
   free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the parameter 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the parameter 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 static int someGlobal[2];
 void t15 (void) {
   free(someGlobal);
-  // expected-warning@-1{{Argument to free() is the address of the global variable 'someGlobal', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the global variable 'someGlobal', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'someGlobal'}}
 }
 
@@ -105,7 +105,7 @@ void t16 (char **x, int offset) {
 int *iptr(void);
 void t17(void) {
   free(iptr); // Oops, forgot to call iptr().
-  // expected-warning@-1{{Argument to free() is the address of the function 'iptr', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function 'iptr', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'iptr'}}
 }
 
diff --git a/clang/test/Analysis/free.cpp b/clang/test/Analysis/free.cpp
index a812a22c47d39..b7f2e49855cf1 100644
--- a/clang/test/Analysis/free.cpp
+++ b/clang/test/Analysis/free.cpp
@@ -17,42 +17,42 @@ extern "C" void *alloca(std::size_t);
 void t1a () {
   int a[] = { 1 };
   free(a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t1b () {
   int a[] = { 1 };
   std::free(a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'a'}}
 }
 
 void t2a () {
   int a = 1;
   free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t2b () {
   int a = 1;
   std::free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the local variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the local variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'a'}}
 }
 
 void t3a () {
   static int a[] = { 1 };
   free(a);
-  // expected-warning@-1{{Argument to free() is the address of the static variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the static variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t3b () {
   static int a[] = { 1 };
   std::free(a);
-  // expected-warning@-1{{Argument to free() is the address of the static variable 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the static variable 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'a'}}
 }
 
@@ -76,13 +76,13 @@ void t5b () {
 
 void t6a () {
   free((void*)1000);
-  // expected-warning@-1{{Argument to free() is a constant address (1000), which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a constant address (1000), which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object '(void *)1000'}}
 }
 
 void t6b () {
   std::free((void*)1000);
-  // expected-warning@-1{{Argument to free() is a constant address (1000), which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a constant address (1000), which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object '(void *)1000'}}
 }
 
@@ -107,95 +107,95 @@ void t8b (char **x) {
 void t9a () {
 label:
   free(&&label);
-  // expected-warning@-1{{Argument to free() is the address of the label 'label', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the label 'label', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'label'}}
 }
 
 void t9b () {
 label:
   std::free(&&label);
-  // expected-warning@-1{{Argument to free() is the address of the label 'label', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the label 'label', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'label'}}
 }
 
 void t10a () {
   free((void*)&t10a);
-  // expected-warning@-1{{Argument to free() is the address of the function 't10a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function 't10a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 't10a'}}
 }
 
 void t10b () {
   std::free((void*)&t10b);
-  // expected-warning@-1{{Argument to free() is the address of the function 't10b', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function 't10b', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 't10b'}}
 }
 
 void t11a () {
   char *p = (char*)alloca(2);
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t11b () {
   char *p = (char*)alloca(2);
-  std::free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  std::free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t12a () {
   char *p = (char*)__builtin_alloca(2);
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t12b () {
   char *p = (char*)__builtin_alloca(2);
-  std::free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  std::free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void t13a () {
   free(^{return;});
-  // expected-warning@-1{{Argument to free() is a block, which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a block, which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object: block expression}}
 }
 
 void t13b () {
   std::free(^{return;});
-  // expected-warning@-1{{Argument to free() is a block, which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is a block, which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object: block expression}}
 }
 
 void t14a () {
   free((void *)+[]{ return; });
-  // expected-warning@-1{{Argument to free() is the address of the function '__invoke', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function '__invoke', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object: lambda-to-function-pointer conversion}}
 }
 
 void t14b () {
   std::free((void *)+[]{ return; });
-  // expected-warning@-1{{Argument to free() is the address of the function '__invoke', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function '__invoke', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object: lambda-to-function-pointer conversion}}
 }
 
 void t15a (char a) {
   free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the parameter 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the parameter 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'a'}}
 }
 
 void t15b (char a) {
   std::free(&a);
-  // expected-warning@-1{{Argument to free() is the address of the parameter 'a', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the parameter 'a', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'a'}}
 }
 
 static int someGlobal[2];
 void t16a () {
   free(someGlobal);
-  // expected-warning@-1{{Argument to free() is the address of the global variable 'someGlobal', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the global variable 'someGlobal', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 'someGlobal'}}
 }
 
 void t16b () {
   std::free(someGlobal);
-  // expected-warning@-1{{Argument to free() is the address of the global variable 'someGlobal', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the global variable 'someGlobal', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call std::free on non-heap object 'someGlobal'}}
 }
 
diff --git a/clang/test/Analysis/getline-alloc.c b/clang/test/Analysis/getline-alloc.c
index 5b5c716cb605a..74a40a11b9782 100644
--- a/clang/test/Analysis/getline-alloc.c
+++ b/clang/test/Analysis/getline-alloc.c
@@ -40,7 +40,7 @@ void test_getline_alloca() {
     return;
   size_t n = 10;
   char *buffer = alloca(n);
-  getline(&buffer, &n, F1); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  getline(&buffer, &n, F1); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
   fclose(F1);
 }
 
@@ -50,7 +50,7 @@ void test_getline_invalid_ptr() {
     return;
   size_t n = 10;
   char *buffer = (char*)test_getline_invalid_ptr;
-  getline(&buffer, &n, F1); // expected-warning {{Argument to getline() is the address of the function 'test_getline_invalid_ptr', which is not memory allocated by malloc()}}
+  getline(&buffer, &n, F1); // expected-warning {{Argument to 'getline()' is the address of the function 'test_getline_invalid_ptr', which is not memory allocated by 'malloc()'}}
   fclose(F1);
 }
 
@@ -79,7 +79,7 @@ void test_getline_stack() {
   if (!F1)
     return;
 
-  getline(&ptr, &n, F1); // expected-warning {{Argument to getline() is the address of the local variable 'buffer', which is not memory allocated by malloc()}}
+  getline(&ptr, &n, F1); // expected-warning {{Argument to 'getline()' is the address of the local variable 'buffer', which is not memory allocated by 'malloc()'}}
 }
 
 void test_getline_static() {
@@ -91,5 +91,5 @@ void test_getline_static() {
   if (!F1)
     return;
 
-  getline(&ptr, &n, F1); // expected-warning {{Argument to getline() is the address of the static variable 'buffer', which is not memory allocated by malloc()}}
+  getline(&ptr, &n, F1); // expected-warning {{Argument to 'getline()' is the address of the static variable 'buffer', which is not memory allocated by 'malloc()'}}
 }
diff --git a/clang/test/Analysis/issue-94193.cpp b/clang/test/Analysis/issue-94193.cpp
new file mode 100644
index 0000000000000..97acfdd8685be
--- /dev/null
+++ b/clang/test/Analysis/issue-94193.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_analyze_cc1 %s -verify -analyzer-checker=core
+
+#include "Inputs/system-header-simulator-cxx.h"
+
+
+namespace GH94193 {
+template<typename T> class optional {
+  union {
+    char x;
+    T uvalue;
+  };
+  bool holds_value = false;
+public:
+  optional() = default;
+  optional(const optional&) = delete;
+  optional(optional&&) = delete;
+  template <typename U = T> explicit optional(U&& value) : holds_value(true) {
+    new (static_cast<void*>(std::addressof(uvalue))) T(std::forward<U>(value));
+  }
+  optional& operator=(const optional&) = delete;
+  optional& operator=(optional&&) = delete;
+  explicit operator bool() const {
+    return holds_value;
+  }
+  T& unwrap() & {
+    return uvalue; // no-warning: returns a valid value
+  }
+};
+
+int top1(int x) {
+  optional<int> opt{x}; // note: Ctor was inlined.
+  return opt.unwrap();  // no-warning: returns a valid value
+}
+
+std::string *top2() {
+  std::string a = "123";
+  // expected-warning@+2 {{address of stack memory associated with local variable 'a' returned}} diagnosed by -Wreturn-stack-address
+  // expected-warning@+1 {{Address of stack memory associated with local variable 'a' returned to caller [core.StackAddressEscape]}}
+  return std::addressof(a);
+}
+} // namespace GH94193
diff --git a/clang/test/Analysis/kmalloc-linux.c b/clang/test/Analysis/kmalloc-linux.c
index af1af24126b6a..0114c4ef000e2 100644
--- a/clang/test/Analysis/kmalloc-linux.c
+++ b/clang/test/Analysis/kmalloc-linux.c
@@ -133,5 +133,5 @@ void test_kfree_ZERO_SIZE_PTR(void) {
 
 void test_kfree_other_constant_value(void) {
   void *ptr = (void *)1;
-  kfree(ptr); // expected-warning{{Argument to kfree() is a constant address (1)}}
+  kfree(ptr); // expected-warning{{Argument to 'kfree()' is a constant address (1)}}
 }
diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp
index 16954f30129f7..c60f522588e39 100644
--- a/clang/test/Analysis/live-stmts.cpp
+++ b/clang/test/Analysis/live-stmts.cpp
@@ -193,3 +193,112 @@ void test_lambda_refcapture() {
 // CHECK-NEXT: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
+
+int logicalOpInTernary(bool b) {
+  return (b || b) ? 0 : 1;
+}
+
+// [B6 (ENTRY)]
+//    |
+//    V
+// [B5 (b || ...)]
+//   |            \
+//   |             |
+//   V             V
+// [B4 (b||b)] ? [B2 (0)] : [B3 (1)]
+//                \        /
+//                 ---|----
+//                    V
+//                   [B1] --> [B0 (EXIT)]
+//                  return
+
+// CHECK: [ B0 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B1 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B2 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK:   `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B3 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK:   `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B4 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK:   `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B5 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK:   `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B6 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
diff --git a/clang/test/Analysis/malloc-fnptr-plist.c b/clang/test/Analysis/malloc-fnptr-plist.c
index e3482980dfbf6..9e82406459234 100644
--- a/clang/test/Analysis/malloc-fnptr-plist.c
+++ b/clang/test/Analysis/malloc-fnptr-plist.c
@@ -5,7 +5,7 @@ void free(void *);
 void (*fnptr)(int);
 void foo(void) {
   free((void *)fnptr);
-  // expected-warning@-1{{Argument to free() is a function pointer}}
+  // expected-warning@-1{{Argument to 'free()' is a function pointer}}
   // expected-warning@-2{{attempt to call free on non-heap object '(void *)fnptr'}}
 }
 
diff --git a/clang/test/Analysis/malloc-std-namespace.cpp b/clang/test/Analysis/malloc-std-namespace.cpp
index d4e397bb812aa..21566ad617920 100644
--- a/clang/test/Analysis/malloc-std-namespace.cpp
+++ b/clang/test/Analysis/malloc-std-namespace.cpp
@@ -18,7 +18,7 @@ void no_leak() {
 void invalid_free() {
   int i;
   int *p = &i;
-  //expected-note@+2{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}}
-  //expected-warning@+1{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}}
+  //expected-note@+2{{Argument to 'free()' is the address of the local variable 'i', which is not memory allocated by 'malloc()'}}
+  //expected-warning@+1{{Argument to 'free()' is the address of the local variable 'i', which is not memory allocated by 'malloc()'}}
   std::free(p);
 }
diff --git a/clang/test/Analysis/malloc.c b/clang/test/Analysis/malloc.c
index 8ee29aeb324f7..9c7ca43bfbc5a 100644
--- a/clang/test/Analysis/malloc.c
+++ b/clang/test/Analysis/malloc.c
@@ -732,7 +732,7 @@ void mallocCastToFP(void) {
   free(p);
 }
 
-// This tests that malloc() buffers are undefined by default
+// This tests that 'malloc()' buffers are undefined by default
 char mallocGarbage (void) {
 	char *buf = malloc(2);
 	char result = buf[1]; // expected-warning{{undefined}}
@@ -778,17 +778,17 @@ void paramFree(int *p) {
 
 void allocaFree(void) {
   int *p = alloca(sizeof(int));
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void allocaFreeBuiltin(void) {
   int *p = __builtin_alloca(sizeof(int));
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 void allocaFreeBuiltinAlign(void) {
   int *p = __builtin_alloca_with_align(sizeof(int), 64);
-  free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}}
+  free(p); // expected-warning {{Memory allocated by 'alloca()' should not be deallocated}}
 }
 
 
@@ -1327,7 +1327,7 @@ void radar10978247_positive(int myValueSize) {
   else
     return; // expected-warning {{leak}}
 }
-// Previously this triggered a false positive because malloc() is known to
+// Previously this triggered a false positive because 'malloc()' is known to
 // return uninitialized memory and the binding of 'o' to 'p->n' was not getting
 // propertly handled. Now we report a leak.
 struct rdar11269741_a_t {
@@ -1656,26 +1656,26 @@ void testOffsetDeallocate(int *memoryBlock) {
 void testOffsetOfRegionFreed(void) {
   __int64_t * array = malloc(sizeof(__int64_t)*2);
   array += 1;
-  free(&array[0]); // expected-warning{{Argument to free() is offset by 8 bytes from the start of memory allocated by malloc()}}
+  free(&array[0]); // expected-warning{{Argument to 'free()' is offset by 8 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 void testOffsetOfRegionFreed2(void) {
   __int64_t *p = malloc(sizeof(__int64_t)*2);
   p += 1;
-  free(p); // expected-warning{{Argument to free() is offset by 8 bytes from the start of memory allocated by malloc()}}
+  free(p); // expected-warning{{Argument to 'free()' is offset by 8 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 void testOffsetOfRegionFreed3(void) {
   char *r = malloc(sizeof(char));
   r = r - 10;
-  free(r); // expected-warning {{Argument to free() is offset by -10 bytes from the start of memory allocated by malloc()}}
+  free(r); // expected-warning {{Argument to 'free()' is offset by -10 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 void testOffsetOfRegionFreedAfterFunctionCall(void) {
   int *p = malloc(sizeof(int)*2);
   p += 1;
   myfoo(p);
-  free(p); // expected-warning{{Argument to free() is offset by 4 bytes from the start of memory allocated by malloc()}}
+  free(p); // expected-warning{{Argument to 'free()' is offset by 4 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 void testFixManipulatedPointerBeforeFree(void) {
@@ -1695,7 +1695,7 @@ void freeOffsetPointerPassedToFunction(void) {
   p[1] = 0;
   p += 1;
   myfooint(*p); // not passing the pointer, only a value pointed by pointer
-  free(p); // expected-warning {{Argument to free() is offset by 8 bytes from the start of memory allocated by malloc()}}
+  free(p); // expected-warning {{Argument to 'free()' is offset by 8 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 int arbitraryInt(void);
@@ -1709,13 +1709,13 @@ void testFreeNonMallocPointerWithNoOffset(void) {
   char c;
   char *r = &c;
   r = r + 10;
-  free(r-10); // expected-warning {{Argument to free() is the address of the local variable 'c', which is not memory allocated by malloc()}}
+  free(r-10); // expected-warning {{Argument to 'free()' is the address of the local variable 'c', which is not memory allocated by 'malloc()'}}
 }
 
 void testFreeNonMallocPointerWithOffset(void) {
   char c;
   char *r = &c;
-  free(r+1); // expected-warning {{Argument to free() is the address of the local variable 'c', which is not memory allocated by malloc()}}
+  free(r+1); // expected-warning {{Argument to 'free()' is the address of the local variable 'c', which is not memory allocated by 'malloc()'}}
 }
 
 void testOffsetZeroDoubleFree(void) {
@@ -1735,14 +1735,14 @@ void testOffsetPassedToStrlenThenFree(void) {
   char * string = malloc(sizeof(char)*10);
   string += 1;
   int length = strlen(string);
-  free(string); // expected-warning {{Argument to free() is offset by 1 byte from the start of memory allocated by malloc()}}
+  free(string); // expected-warning {{Argument to 'free()' is offset by 1 byte from the start of memory allocated by 'malloc()'}}
 }
 
 void testOffsetPassedAsConst(void) {
   char * string = malloc(sizeof(char)*10);
   string += 1;
   passConstPtr(string);
-  free(string); // expected-warning {{Argument to free() is offset by 1 byte from the start of memory allocated by malloc()}}
+  free(string); // expected-warning {{Argument to 'free()' is offset by 1 byte from the start of memory allocated by 'malloc()'}}
 }
 
 char **_vectorSegments;
@@ -1842,12 +1842,12 @@ int testNoCheckerDataPropogationFromLogicalOpOperandToOpResult(void) {
 void (*fnptr)(int);
 void freeIndirectFunctionPtr(void) {
   void *p = (void *)fnptr;
-  free(p); // expected-warning {{Argument to free() is a function pointer}}
+  free(p); // expected-warning {{Argument to 'free()' is a function pointer}}
 }
 
 void freeFunctionPtr(void) {
   free((void *)fnptr);
-  // expected-warning@-1{{Argument to free() is a function pointer}}
+  // expected-warning@-1{{Argument to 'free()' is a function pointer}}
   // expected-warning@-2{{attempt to call free on non-heap object '(void *)fnptr'}}
 }
 
@@ -1905,8 +1905,8 @@ enum { BUFSIZE = 256 };
 void MEM34_C(void) {
   char buf[BUFSIZE];
   char *p = (char *)realloc(buf, 2 * BUFSIZE);
-  // expected-warning@-1{{Argument to realloc() is the address of the local \
-variable 'buf', which is not memory allocated by malloc() [unix.Malloc]}}
+  // expected-warning@-1{{Argument to 'realloc()' is the address of the local \
+variable 'buf', which is not memory allocated by 'malloc()' [unix.Malloc]}}
   if (p == NULL) {
     /* Handle error */
   }
diff --git a/clang/test/Analysis/malloc.mm b/clang/test/Analysis/malloc.mm
index 94a46d731090b..5b816a1524aec 100644
--- a/clang/test/Analysis/malloc.mm
+++ b/clang/test/Analysis/malloc.mm
@@ -89,7 +89,7 @@ void testNSStringFreeWhenDoneNO2(NSUInteger dataLength) {
 
 void testOffsetFree() {
   int *p = (int *)malloc(sizeof(int));
-  NSData *nsdata = [NSData dataWithBytesNoCopy:++p length:sizeof(int) freeWhenDone:1]; // expected-warning{{Argument to +dataWithBytesNoCopy:length:freeWhenDone: is offset by 4 bytes from the start of memory allocated by malloc()}}
+  NSData *nsdata = [NSData dataWithBytesNoCopy:++p length:sizeof(int) freeWhenDone:1]; // expected-warning{{Argument to +dataWithBytesNoCopy:length:freeWhenDone: is offset by 4 bytes from the start of memory allocated by 'malloc()'}}
 }
 
 void testRelinquished1() {
diff --git a/clang/test/Analysis/mmap-writeexec.c b/clang/test/Analysis/mmap-writeexec.c
index 8fd86ceb9d2a2..579cc75069eec 100644
--- a/clang/test/Analysis/mmap-writeexec.c
+++ b/clang/test/Analysis/mmap-writeexec.c
@@ -1,13 +1,14 @@
-// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -analyzer-config alpha.security.MmapWriteExec:MmapProtExec=1 -analyzer-config alpha.security.MmapWriteExec:MmapProtRead=4 -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
+// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
 // RUN: %clang_analyze_cc1 -triple x86_64-unknown-apple-darwin10 -analyzer-checker=alpha.security.MmapWriteExec -verify %s
 
-#define PROT_WRITE  0x02
 #ifndef USE_ALTERNATIVE_PROT_EXEC_DEFINITION
-#define PROT_EXEC   0x04
-#define PROT_READ   0x01
-#else
 #define PROT_EXEC   0x01
+#define PROT_WRITE  0x02
 #define PROT_READ   0x04
+#else
+#define PROT_EXEC   0x08
+#define PROT_WRITE  0x04
+#define PROT_READ   0x02
 #endif
 #define MAP_PRIVATE 0x0002
 #define MAP_ANON    0x1000
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 92f983d8b1561..de70e483add1c 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -17,6 +17,27 @@ int underflowWithDeref(void) {
   // expected-note@-2 {{Access of 'TenElements' at negative byte offset -4}}
 }
 
+int rng(void);
+int getIndex(void) {
+  switch (rng()) {
+    case 1: return -152;
+    case 2: return -160;
+    case 3: return -168;
+    default: return -172;
+  }
+}
+
+void gh86959(void) {
+  // Previously code like this produced many almost-identical bug reports that
+  // only differed in the offset value. Verify that now we only see one report.
+
+  // expected-note@+1 {{Entering loop body}}
+  while (rng())
+    TenElements[getIndex()] = 10;
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset -688}}
+}
+
 int scanf(const char *restrict fmt, ...);
 
 void taintedIndex(void) {
diff --git a/clang/test/Analysis/plist-macros.cpp b/clang/test/Analysis/plist-macros.cpp
index 94f8e514c8b39..d7b3c7cc1c86f 100644
--- a/clang/test/Analysis/plist-macros.cpp
+++ b/clang/test/Analysis/plist-macros.cpp
@@ -13,7 +13,7 @@ void noteOnMacro(int y) {
   mallocmemory
   y++; 
   y++;
-  delete x; // expected-warning {{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
+  delete x; // expected-warning {{Memory allocated by 'malloc()' should be deallocated by 'free()', not 'delete'}}
 }
 
 void macroIsFirstInFunction(int y) {
diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c
index 88e6dec2d172f..194c891889952 100644
--- a/clang/test/Analysis/pointer-sub.c
+++ b/clang/test/Analysis/pointer-sub.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.PointerSub -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.PointerSub -analyzer-output=text-minimal -verify %s
 
 void f1(void) {
   int x, y, z[10];
@@ -73,15 +73,15 @@ void f4(void) {
   d = a[2] - a[1]; // expected-warning{{Subtraction of two pointers that}}
 }
 
-typedef struct {
+struct S {
   int a;
   int b;
   int c[10]; // expected-note2{{Array at the right-hand side of subtraction}}
   int d[10]; // expected-note2{{Array at the left-hand side of subtraction}}
-} S;
+};
 
 void f5(void) {
-  S s;
+  struct S s;
   int y;
   int d;
 
@@ -92,18 +92,18 @@ void f5(void) {
   d = &s.d[3] - &s.c[2]; // expected-warning{{Subtraction of two pointers that}}
   d = s.d - s.c; // expected-warning{{Subtraction of two pointers that}}
 
-  S sa[10];
+  struct S sa[10];
   d = &sa[2] - &sa[1];
   d = &sa[2].a - &sa[1].b; // expected-warning{{Subtraction of two pointers that}}
 }
 
 void f6(void) {
-  long long l;
+  long long l = 2;
   char *a1 = (char *)&l;
   int d = a1[3] - l;
 
-  long long la1[3]; // expected-note{{Array at the right-hand side of subtraction}}
-  long long la2[3]; // expected-note{{Array at the left-hand side of subtraction}}
+  long long la1[3] = {1}; // expected-note{{Array at the right-hand side of subtraction}}
+  long long la2[3] = {1}; // expected-note{{Array at the left-hand side of subtraction}}
   char *pla1 = (char *)la1;
   char *pla2 = (char *)la2;
   d = pla1[1] - pla1[0];
@@ -117,6 +117,40 @@ void f7(int *p) {
 }
 
 void f8(int n) {
-  int a[10];
-  int d = a[n] - a[0];
+  int a[10] = {1};
+  int d = a[n] - a[0]; // no-warning
+}
+
+int f9(const char *p1) {
+  const char *p2 = p1;
+  --p1;
+  ++p2;
+  return p1 - p2; // no-warning
+}
+
+int f10(struct S *p1, struct S *p2) {
+  return &p1->c[5] - &p2->c[5]; // no-warning
+}
+
+struct S1 {
+  int a;
+  int b; // expected-note{{Object at the right-hand side of subtraction}}
+};
+
+int f11() {
+  struct S1 s; // expected-note{{Object at the left-hand side of subtraction}}
+  return (char *)&s - (char *)&s.b; // expected-warning{{Subtraction of two pointers that}}
+}
+
+struct S2 {
+  char *p1;
+  char *p2;
+};
+
+void init_S2(struct S2 *);
+
+int f12() {
+  struct S2 s;
+  init_S2(&s);
+  return s.p1 - s.p2; // no-warning (pointers are unknown)
 }
diff --git a/clang/test/Analysis/short-circuiting-eval.cpp b/clang/test/Analysis/short-circuiting-eval.cpp
new file mode 100644
index 0000000000000..d0f29a849ab1c
--- /dev/null
+++ b/clang/test/Analysis/short-circuiting-eval.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -verify %s
+
+int div0LogicalOpInTernary(bool b1) {
+  int y = (b1 || b1) ? 0 : 1;
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalAndArith(bool b1, int x) {
+  int y = (b1 || (x < 3)) ? 0 : 1;
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0NestedLogicalOp(bool b1) {
+  int y = (b1 && b1 || b1 && b1) ? 0 : 1;
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0TernaryInTernary(bool b) {
+  int y = ((b || b) ? false : true) ? 0 : 1;
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalOpParensInTernary(bool b1) {
+  int y = ((((b1)) || ((b1)))) ? 0 : 1;
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalOpInsideStExpr(bool b1) {
+  int y = ({1; (b1 || b1);}) ? 0 : 1;
+  // expected-warning@-1 {{expression result unused}}
+  return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0StExprInsideLogicalOp(bool b1) {
+  int y = (({1; b1;}) || ({1; b1;})) ? 0 : 1;
+  // expected-warning@-1 {{expression result unused}}
+  // expected-warning@-2 {{expression result unused}}
+  return 1 / y; // expected-warning {{Division by zero}}
+}
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index c924cbd36f759..b9a5b1ba8cd49 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -1,11 +1,11 @@
 // RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN:   -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
 // RUN: %clang_analyze_cc1 -triple=armv8-none-linux-eabi -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN:   -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
 // RUN: %clang_analyze_cc1 -triple=aarch64-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN:   -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
 // RUN: %clang_analyze_cc1 -triple=hexagon -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN:   -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
 
 #include "Inputs/system-header-simulator.h"
 #include "Inputs/system-header-simulator-for-malloc.h"
@@ -498,3 +498,35 @@ void gh_93408_regression_ZeroSized(struct ZeroSized *buffer) {
   fread(buffer, 1, 1, f); // expected-warning {{Stream pointer might be NULL}} no-crash
   fclose(f);
 }
+
+extern FILE *non_standard_stream_ptr;
+void test_fopen_does_not_alias_with_standard_streams(void) {
+  FILE *f = fopen("file", "r");
+  if (!f) return;
+  clang_analyzer_eval(f == stdin);  // expected-warning {{FALSE}} no-TRUE
+  clang_analyzer_eval(f == stdout); // expected-warning {{FALSE}} no-TRUE
+  clang_analyzer_eval(f == stderr); // expected-warning {{FALSE}} no-TRUE
+  clang_analyzer_eval(f == non_standard_stream_ptr); // expected-warning {{UNKNOWN}}
+  if (f != stdout) {
+    fclose(f);
+  }
+} // no-leak: 'fclose()' is always called because 'f' cannot be 'stdout'.
+
+void reopen_std_stream(void) {
+  FILE *oldStdout = stdout;
+  fclose(stdout);
+  FILE *fp = fopen("blah", "w");
+  if (!fp) return;
+
+  stdout = fp; // Let's make them alias.
+  clang_analyzer_eval(fp == oldStdout);     // expected-warning {{UNKNOWN}}
+  clang_analyzer_eval(fp == stdout);        // expected-warning {{TRUE}} no-FALSE
+  clang_analyzer_eval(oldStdout == stdout); // expected-warning {{UNKNOWN}}
+}
+
+void only_success_path_does_not_alias_with_stdout(void) {
+  if (stdout) return;
+  FILE *f = fopen("/tmp/foof", "r"); // no-crash
+  if (!f) return;
+  fclose(f);
+}
diff --git a/clang/test/Analysis/use-after-move.cpp b/clang/test/Analysis/use-after-move.cpp
index 33980e6ea2b8b..24d5dd8a8b3d2 100644
--- a/clang/test/Analysis/use-after-move.cpp
+++ b/clang/test/Analysis/use-after-move.cpp
@@ -570,13 +570,8 @@ void differentBranchesTest(int i) {
   {
     A a;
     a.foo() > 0 ? a.foo() : A(std::move(a)).foo();
-#ifdef DFS
-    // peaceful-note@-2 {{Assuming the condition is false}}
-    // peaceful-note@-3 {{'?' condition is false}}
-#else
-    // peaceful-note@-5 {{Assuming the condition is true}}
-    // peaceful-note@-6 {{'?' condition is true}}
-#endif
+    // peaceful-note@-1 {{Assuming the condition is true}}
+    // peaceful-note@-2 {{'?' condition is true}}
   }
   // Same thing, but with a switch statement.
   {
diff --git a/clang/test/Analysis/weak-functions.c b/clang/test/Analysis/weak-functions.c
index 26cbfb3523a92..5bdb411fcbf4f 100644
--- a/clang/test/Analysis/weak-functions.c
+++ b/clang/test/Analysis/weak-functions.c
@@ -72,7 +72,7 @@ void free(void *) __attribute__((weak_import));
 
 void t10 (void) {
   free((void*)&t10);
-  // expected-warning@-1{{Argument to free() is the address of the function 't10', which is not memory allocated by malloc()}}
+  // expected-warning@-1{{Argument to 'free()' is the address of the function 't10', which is not memory allocated by 'malloc()'}}
   // expected-warning@-2{{attempt to call free on non-heap object 't10'}}
 }
 
diff --git a/clang/test/C/C23/n3018.c b/clang/test/C/C23/n3018.c
index 0d54d53b7499f..4ad2fffbfde80 100644
--- a/clang/test/C/C23/n3018.c
+++ b/clang/test/C/C23/n3018.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c23 -verify -triple x86_64 -pedantic -Wno-conversion -Wno-constant-conversion %s
+// RUN: %clang_cc1 -std=c23 -verify -triple x86_64 -pedantic -Wno-conversion -Wno-constant-conversion -fexperimental-new-constant-interpreter %s
 
 /* WG14 N3018: Full
  * The constexpr specifier for object definitions
diff --git a/clang/test/C/C2y/n3254.c b/clang/test/C/C2y/n3254.c
index e08659cf377aa..60d068cf9980b 100644
--- a/clang/test/C/C2y/n3254.c
+++ b/clang/test/C/C2y/n3254.c
@@ -21,9 +21,9 @@ struct S {
 // CHECK-LABEL: define dso_local i32 @foo(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 1
+// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 4
 // CHECK-NEXT:    [[S_PTR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[BUFFER]], i8 0, i64 12, i1 false)
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[BUFFER]], i8 0, i64 12, i1 false)
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[S_PTR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[S_PTR]], align 8
@@ -40,13 +40,13 @@ int foo() {
 // CHECK-LABEL: define dso_local signext i8 @bar(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 1
+// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 1
-// CHECK-NEXT:    store i8 97, ptr [[C]], align 1
+// CHECK-NEXT:    store i8 97, ptr [[C]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[C2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY1]], i32 0, i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[C2]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[C2]], align 4
 // CHECK-NEXT:    ret i8 [[TMP0]]
 //
 char bar() {
@@ -58,13 +58,13 @@ char bar() {
 // CHECK-LABEL: define dso_local float @baz(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 1
+// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 2
-// CHECK-NEXT:    store float 3.000000e+00, ptr [[F]], align 1
+// CHECK-NEXT:    store float 3.000000e+00, ptr [[F]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY1]], i32 0, i32 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F2]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F2]], align 4
 // CHECK-NEXT:    ret float [[TMP0]]
 //
 float baz() {
@@ -80,9 +80,9 @@ struct T {
 // CHECK-LABEL: define dso_local signext i8 @quux(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[T:%.*]] = alloca [[STRUCT_T:%.*]], align 1
+// CHECK-NEXT:    [[T:%.*]] = alloca [[STRUCT_T:%.*]], align 4
 // CHECK-NEXT:    [[S_PTR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[T]], i8 0, i64 12, i1 false)
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[T]], i8 0, i64 12, i1 false)
 // CHECK-NEXT:    [[BUFFER:%.*]] = getelementptr inbounds [[STRUCT_T]], ptr [[T]], i32 0, i32 0
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[S_PTR]], align 8
@@ -100,10 +100,10 @@ char quux() {
 // CHECK-LABEL: define dso_local float @quibble(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 1
+// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 4
 // CHECK-NEXT:    [[T_PTR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[S_PTR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[BUFFER]], i8 0, i64 12, i1 false)
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[BUFFER]], i8 0, i64 12, i1 false)
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[T_PTR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[T_PTR]], align 8
@@ -125,13 +125,13 @@ float quibble() {
 // CHECK-LABEL: define dso_local i32 @quorble(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 1
+// CHECK-NEXT:    [[BUFFER:%.*]] = alloca [12 x i8], align 4
 // CHECK-NEXT:    [[S_PTR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[BUFFER1:%.*]] = getelementptr inbounds [[STRUCT_T:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 0
 // CHECK-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER1]], i64 0, i64 0
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY2]], i32 0, i32 0
-// CHECK-NEXT:    store i32 12, ptr [[X]], align 1
+// CHECK-NEXT:    store i32 12, ptr [[X]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY3:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0
 // CHECK-NEXT:    [[BUFFER4:%.*]] = getelementptr inbounds [[STRUCT_T]], ptr [[ARRAYDECAY3]], i32 0, i32 0
 // CHECK-NEXT:    [[ARRAYDECAY5:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER4]], i64 0, i64 0
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index 1c397f277d3c4..6832b5adcdea3 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -80,6 +80,7 @@ list(APPEND CLANG_TEST_DEPS
   clang-installapi
   clang-scan-deps
   clang-linker-wrapper
+  clang-nvlink-wrapper
   clang-offload-bundler
   clang-offload-packager
   diagtool
diff --git a/clang/test/CXX/basic/basic.types/p10.cpp b/clang/test/CXX/basic/basic.types/p10.cpp
index a543f248e5371..92d6da0035ea5 100644
--- a/clang/test/CXX/basic/basic.types/p10.cpp
+++ b/clang/test/CXX/basic/basic.types/p10.cpp
@@ -142,7 +142,7 @@ constexpr int arb(int n) { // expected-note {{declared here}}
                expected-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
 }
 constexpr long Overflow[(1 << 30) << 2]{}; // expected-warning {{requires 34 bits to represent}} \
-                                              expected-warning {{variable length array folded to constant array as an extension}} \
+                                              expected-error {{variable length array declaration not allowed at file scope}} \
                                               expected-warning {{variable length arrays in C++ are a Clang extension}} \
                                               expected-note {{signed left shift discards bits}}
 
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp
new file mode 100644
index 0000000000000..a06b107755596
--- /dev/null
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+template<typename T>
+struct A0 {
+  struct B0;
+
+  template<typename U>
+  struct C0 {
+    struct D0;
+
+    template<typename V>
+    struct E0;
+  };
+};
+
+template<typename T>
+int A0<T>::B0::* f0();
+
+template<typename T>
+int A0<T>::B1::* f1();
+
+template<typename T>
+int A0<T>::C0<int>::* f2(); // expected-error {{expected unqualified-id}}
+
+template<typename T>
+int A0<T>::C1<int>::* f3(); // expected-error {{no member named 'C1' in 'A0<T>'}}
+                            // expected-error@-1 {{expected ';' after top level declarator}}
+
+template<typename T>
+int A0<T>::template C2<int>::* f4();
+
+template<typename T>
+int A0<T>::template C0<int>::D0::* f5();
+
+template<typename T>
+int A0<T>::template C2<int>::D1::* f6();
+
+template<typename T>
+int A0<T>::template C0<int>::E0<int>::* f7(); // expected-error {{use 'template' keyword to treat 'E0' as a dependent template name}}
+                                              // expected-error@-1 {{expected unqualified-id}}
+
+template<typename T>
+int A0<T>::template C2<int>::E1<int>::* f8(); // expected-error {{no member named 'C2' in 'A0<T>'}}
+
+template<typename T>
+int A0<T>::template C0<int>::template E0<int>::* f9();
+
+template<typename T>
+int A0<T>::template C2<int>::template E1<int>::* f10();
+
+namespace TypoCorrection {
+  template<typename T>
+  struct A {
+    template<typename U>
+    struct Typo; // expected-note {{'Typo' declared here}}
+  };
+
+  template<typename T>
+  int A<T>::template typo<int>::* f();
+
+  template<typename T>
+  int A<T>::typo<int>::* g(); // expected-error {{no template named 'typo' in 'A<T>'; did you mean 'Typo'?}}
+                              // expected-error@-1 {{expected unqualified-id}}
+}
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 323e56f9c5278..adfdb738e81c9 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -3,8 +3,8 @@
 // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14,cxx17 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -480,8 +480,12 @@ namespace cwg1872 { // cwg1872: 9
   static_assert(y == 0);
 #endif
   constexpr int z = A<Z>().f();
-  // since-cxx11-error@-1 {{constexpr variable 'z' must be initialized by a constant expression}}
-  //   since-cxx11-note@-2 {{non-literal type 'A<Z>' cannot be used in a constant expression}}
+  // since-cxx11-error@-1 {{constexpr variable 'z' must be initialized by a constant expression}}a
+#if __cplusplus < 202302L
+  //   since-cxx11-note@-3 {{non-literal type 'A<Z>' cannot be used in a constant expression}}
+#else
+  //   since-cxx23-note@-5 {{cannot construct object of type 'A<cwg1872::Z>' with virtual base class in a constant expression}}
+#endif
 #endif
 }
 
diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp
index 16b8ec07fc50f..00b6bb5a865df 100644
--- a/clang/test/CXX/drs/cwg24xx.cpp
+++ b/clang/test/CXX/drs/cwg24xx.cpp
@@ -82,6 +82,15 @@ auto h() -> C auto {
   C auto foo = T();
   // expected-warning@-1 {{'C' is deprecated}}
   //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  C auto *bar = T();
+  // expected-warning@-1 {{'C' is deprecated}}
+  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  C auto &baz = T();
+  // expected-warning@-1 {{'C' is deprecated}}
+  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  C auto &&quux = T();
+  // expected-warning@-1 {{'C' is deprecated}}
+  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   return foo;
 }
 #endif
diff --git a/clang/test/CXX/drs/cwg25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp
index 0934f0cc19c6a..1c0d32fe3fdfc 100644
--- a/clang/test/CXX/drs/cwg25xx.cpp
+++ b/clang/test/CXX/drs/cwg25xx.cpp
@@ -139,7 +139,7 @@ struct D3 : B {
 #endif
 
 #if __cplusplus >= 202302L
-namespace cwg2561 { // cwg2561: no tentatively ready 2024-03-18
+namespace cwg2561 { // cwg2561: no
 struct C {
     constexpr C(auto) { }
 };
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index c77bd433d8e21..9796607a790ce 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -30,7 +30,7 @@ using U2 = decltype(&main);
 #endif
 } // namespace cwg2811
 
-namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
+namespace cwg2819 { // cwg2819: 19
 #if __cpp_constexpr >= 202306L
   constexpr void* p = nullptr;
   constexpr int* q = static_cast<int*>(p);
@@ -111,7 +111,7 @@ struct D : N::B {
 #endif
 } // namespace cwg2857
 
-namespace cwg2858 { // cwg2858: 19 tentatively ready 2024-04-05
+namespace cwg2858 { // cwg2858: 19
 
 #if __cplusplus > 202302L
 
@@ -134,23 +134,23 @@ struct A {
 
 } // namespace cwg2858
 
-namespace cwg2877 { // cwg2877: 19 tentatively ready 2024-05-31
+namespace cwg2877 { // cwg2877: 19
 #if __cplusplus >= 202002L
 enum E { x };
 void f() {
   int E;
-  using enum E;   // OK, names ::E
+  using enum E;   // OK
 }
 using F = E;
-using enum F;     // OK, designates ::E
+using enum F;     // OK
 template<class T> using EE = T;
 void g() {
-  using enum EE<E>;  // OK, designates ::E
+  using enum EE<E>;  // OK
 }
 #endif
 } // namespace cwg2877
 
-namespace cwg2881 { // cwg2881: 19 tentatively ready 2024-04-19
+namespace cwg2881 { // cwg2881: 19
 
 #if __cplusplus >= 202302L
 
@@ -216,11 +216,61 @@ void f() {
   o.decltype(L2)::operator()();
 }
 
+void f2() {
+  int x = 0;
+  auto lambda = [x] (this auto self) { return x; };
+  using Lambda = decltype(lambda);
+  struct D : private Lambda { // expected-note {{declared private here}}
+    D(Lambda l) : Lambda(l) {}
+    using Lambda::operator();
+    friend Lambda;
+  } d(lambda);
+  d(); // expected-error {{must derive publicly from the lambda}}
+}
+
+template <typename L>
+struct Private : private L {
+  using L::operator();
+  Private(L l) : L(l) {}
+};
+
+template<typename T>
+struct Indirect : T {
+  using T::operator();
+};
+
+template<typename T>
+struct Ambiguous : Indirect<T>, T { // expected-warning {{is inaccessible due to ambiguity}}
+  using Indirect<T>::operator();
+};
+
+template <typename L>
+constexpr auto f3(L l) -> decltype(Private<L>{l}()) { return l(); }
+// expected-note@-1 {{must derive publicly from the lambda}}
+
+template <typename L>
+constexpr auto f4(L l) -> decltype(Ambiguous<L>{{l}, l}()) { return l(); }
+// expected-note@-1 {{is inaccessible due to ambiguity}}
+// expected-note@-2 {{in instantiation of template class}}
+
+template<typename T>
+concept is_callable = requires(T t) { { t() }; };
+
+void g() {
+  int x = 0;
+  auto lambda = [x](this auto self) {};
+  f3(lambda); // expected-error {{no matching function for call to 'f3'}}
+  f4(lambda); // expected-error {{no matching function for call to 'f4'}}
+  // expected-note@-1 {{while substituting deduced template arguments into function template 'f4'}}
+  static_assert(!is_callable<Private<decltype(lambda)>>);
+  static_assert(!is_callable<Ambiguous<decltype(lambda)>>);
+}
+
 #endif
 
 } // namespace cwg2881
 
-namespace cwg2882 { // cwg2882: 2.7 tentatively ready 2024-05-31
+namespace cwg2882 { // cwg2882: 2.7
 struct C {
   operator void() = delete;
   // expected-warning@-1 {{conversion function converting 'cwg2882::C' to 'void' will never be used}}
@@ -232,7 +282,7 @@ void f(C c) {
 }
 } // namespace cwg2882
 
-namespace cwg2883 { // cwg2883: no tentatively ready 2024-05-31
+namespace cwg2883 { // cwg2883: no
 #if __cplusplus >= 201103L
 void f() {
   int x;
@@ -257,7 +307,7 @@ void g() {
 #endif
 } // namespace cwg2883
 
-namespace cwg2885 { // cwg2885: 16 tentatively ready 2024-05-31
+namespace cwg2885 { // cwg2885: 16 review 2024-05-31
 #if __cplusplus >= 202002L
 template <class T>
 struct A {
@@ -271,7 +321,7 @@ static_assert(!__is_trivially_constructible(B));
 #endif
 } // namespace cwg2885
 
-namespace cwg2886 { // cwg2886: 9 tentatively ready 2024-05-31
+namespace cwg2886 { // cwg2886: 9
 #if __cplusplus >= 201103L
 struct C {
   C() = default;
diff --git a/clang/test/CXX/drs/cwg6xx.cpp b/clang/test/CXX/drs/cwg6xx.cpp
index 069102d9c5975..1c56dd3907152 100644
--- a/clang/test/CXX/drs/cwg6xx.cpp
+++ b/clang/test/CXX/drs/cwg6xx.cpp
@@ -1265,7 +1265,7 @@ namespace cwg687 { // cwg687 (9 c++20, but the issue is still considered open)
 
     // This is not.
     template g<int>(a);
-    // expected-error@-1 {{expected expression}}
+    // expected-error@-1 {{expected '<' after 'template'}}
   }
 }
 
diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp
index c8cbdfcee3f4d..38bff3adf262a 100644
--- a/clang/test/CXX/drs/cwg8xx.cpp
+++ b/clang/test/CXX/drs/cwg8xx.cpp
@@ -30,3 +30,9 @@ void g(int i) {
 }
 #endif
 } // namespace cwg873
+
+// cwg882: 3.5
+#if __cplusplus >= 201103L
+int main() = delete;
+// since-cxx11-error@-1 {{'main' is not allowed to be deleted}}
+#endif
diff --git a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
index 162d59439d08e..170ca0a3f1c6b 100644
--- a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
@@ -43,18 +43,36 @@ namespace test2 {
 }
 
 namespace GH40906 {
-  struct A {
-    int val;
-    void func() {}
-  };
+struct S {
+    int x;
+    void func();
+    static_assert(__is_same_as(decltype((S::x)), int&), "");
+    static_assert(__is_same_as(decltype(&(S::x)), int*), "");
 
-  void test() {
-    decltype(&(A::val)) ptr1; // expected-error {{cannot form pointer to member from a parenthesized expression; did you mean to remove the parentheses?}}
-    int A::* ptr2 = &(A::val); // expected-error {{invalid use of non-static data member 'val'}}
+    // FIXME: provide better error messages
+    static_assert(__is_same_as(decltype((S::func)), int&), ""); // expected-error {{call to non-static member function without an object argument}}
+    static_assert(__is_same_as(decltype(&(S::func)), int*), ""); // expected-error {{call to non-static member function without an object argument}}
+};
+static_assert(__is_same_as(decltype((S::x)), int&), "");
+static_assert(__is_same_as(decltype(&(S::x)), int*), "");
+static_assert(__is_same_as(decltype((S::func)), int&), ""); // expected-error {{call to non-static member function without an object argument}}
+static_assert(__is_same_as(decltype(&(S::func)), int*), ""); // expected-error {{call to non-static member function without an object argument}}
+
+struct A { int x;};
+
+char q(int *);
+short q(int A::*);
+
+template <typename T>
+constexpr int f(char (*)[sizeof(q(&T::x))]) { return 1; }
+
+template <typename T>
+constexpr int f(char (*)[sizeof(q(&(T::x)))]) { return 2; }
+
+constexpr int g(char (*p)[sizeof(char)] = 0) { return f<A>(p); }
+constexpr int h(char (*p)[sizeof(short)] = 0) { return f<A>(p); }
+
+static_assert(g() == 2);
+static_assert(h() == 1);
 
-    // FIXME: Error messages in these cases are less than clear, we can do
-    // better.
-    int size = sizeof(&(A::func)); // expected-error {{call to non-static member function without an object argument}}
-    void (A::* ptr3)() = &(A::func); // expected-error {{call to non-static member function without an object argument}}
-  }
 }
diff --git a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
index 51489c5eac5ad..19793fe826372 100644
--- a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
+++ b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
@@ -38,3 +38,13 @@ X1<int, X1a> inst_x1a;
 X1<long, X1b> inst_x1b;
 X1<short, X1c> inst_x1c;
 X1<short, X1d> inst_x1d; // expected-error{{template template argument has different template parameters than its corresponding template template paramete}}
+
+template <int> class X2; // expected-note{{template is declared here}} \
+                         // expected-note{{template is declared here}}
+class X3 : X2<1> {}; // expected-error{{implicit instantiation of undefined template 'X2<1>'}}
+
+template <int> class X4 : X3 {
+  struct {
+    X2<1> e; // expected-error{{implicit instantiation of undefined template 'X2<1>'}}
+  } f;
+};
diff --git a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
index ed445360c4fdd..b9ff26a7620db 100644
--- a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
+++ b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
@@ -78,7 +78,7 @@ namespace std_example {
   template<class T> struct A { // expected-note {{candidate}} expected-note {{implicit deduction guide}}
     template<class U>
     A(T &&, U &&, int *); // expected-note {{[with T = int, U = int] not viable: expects an rvalue}} \
-                          // expected-note {{implicit deduction guide declared as 'template <class T, class U> A(T &&, type-parameter-0-1 &&, int *) -> A<T>'}}
+                          // expected-note {{implicit deduction guide declared as 'template <class T, class U> A(T &&, U &&, int *) -> A<T>'}}
     A(T &&, int *);       // expected-note {{requires 2}} \
                           // expected-note {{implicit deduction guide declared as 'template <class T> A(T &&, int *) -> A<T>'}}
   };
diff --git a/clang/test/CXX/temp/temp.res/p3.cpp b/clang/test/CXX/temp/temp.res/p3.cpp
index 37ab93559e369..1eda967523a59 100644
--- a/clang/test/CXX/temp/temp.res/p3.cpp
+++ b/clang/test/CXX/temp/temp.res/p3.cpp
@@ -2,7 +2,7 @@
 
 template<typename T> struct A {
   template<typename U> struct B;
-  template<typename U> using C = U; // expected-note {{here}}
+  template<typename U> using C = U;
 };
 
 struct X {
@@ -20,12 +20,10 @@ template<typename T> A<T>::C<T> f2(); // expected-warning {{missing 'typename'}}
 template<typename T> A<T>::C<X>::X(T) {}
 template<typename T> A<T>::C<X>::X::Y::Y(T) {}
 
-// FIXME: This is ill-formed
-template<typename T> int A<T>::B<T>::*f3() {}
-template<typename T> int A<T>::C<X>::*f4() {}
+template<typename T> int A<T>::B<T>::*f3() {} // expected-error {{expected unqualified-id}}
+template<typename T> int A<T>::C<X>::*f4() {} // expected-error {{expected unqualified-id}}
 
-// FIXME: This is valid
-template<typename T> int A<T>::template C<int>::*f5() {} // expected-error {{has no members}}
+template<typename T> int A<T>::template C<int>::*f5() {}
 
 template<typename T> template<typename U> struct A<T>::B {
   friend A<T>::C<T> f6(); // ok, same as 'friend T f6();'
diff --git a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
index 7a261fef27361..935dd360847bc 100644
--- a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
+++ b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
@@ -56,7 +56,7 @@ struct B {
   static int y;
 
   template<typename T>
-  int z; // expected-error {{member 'z' declared as a template}}
+  int z; // expected-error {{non-static data member 'z' cannot be declared as a template}}
 
   template<typename T>
   static int x<T*>;
@@ -65,7 +65,7 @@ struct B {
   static int y<T*>;
 
   template<typename T>
-  int x<T**>; // expected-error {{member 'x' declared as a template}}
+  int x<T**>; // expected-error {{non-static data member 'x' cannot be declared as a template}}
 
   template<>
   int x<short>;
@@ -169,7 +169,7 @@ struct D {
   static int y;
 
   template<typename U>
-  int z; // expected-error {{member 'z' declared as a template}}
+  int z; // expected-error {{non-static data member 'z' cannot be declared as a template}}
 
   template<typename U>
   static int x<U*>;
@@ -178,7 +178,7 @@ struct D {
   static int y<U*>;
 
   template<typename U>
-  int x<U**>; // expected-error {{member 'x' declared as a template}}
+  int x<U**>; // expected-error {{non-static data member 'x' cannot be declared as a template}}
 
   template<>
   int x<short>;
diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
new file mode 100644
index 0000000000000..6deff1116e1d8
--- /dev/null
+++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,SAFE %s
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -munsafe-fp-atomics -o - %s | FileCheck -check-prefixes=CHECK,UNSAFE %s
+
+// SAFE-LABEL: define dso_local float @test_float_post_inc(
+// SAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_inc()
+{
+    static _Atomic float n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_dc()
+{
+    static _Atomic float n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_dc()
+{
+    static _Atomic float n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_inc()
+{
+    static _Atomic float n;
+    return ++n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_inc()
+{
+    static _Atomic double n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_dc()
+{
+    static _Atomic double n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_dc()
+{
+    static _Atomic double n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_inc()
+{
+    static _Atomic double n;
+    return ++n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP1]]
+//
+_Float16 test__Float16_post_inc()
+{
+    static _Atomic _Float16 n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP1]]
+//
+_Float16 test__Float16_post_dc()
+{
+    static _Atomic _Float16 n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_dc()
+{
+    static _Atomic _Float16 n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_inc()
+{
+    static _Atomic _Float16 n;
+    return ++n;
+}
+//.
+// UNSAFE: [[META3]] = !{}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGen/LoongArch/align.c b/clang/test/CodeGen/LoongArch/align.c
new file mode 100644
index 0000000000000..1b171b74529d1
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/align.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +lsx -target-feature \
+// RUN:   +lasx -emit-llvm %s -o - | FileCheck %s --check-prefix=LA32
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature \
+// RUN:   +lasx -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
+
+#include <stddef.h>
+#include <stdint.h>
+
+char *s1 = "1234";
+// LA32: @.str{{.*}} ={{.*}} constant [5 x i8] c"1234\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [5 x i8] c"1234\00", align 1
+
+char *s2 = "12345678abcd";
+// LA32: @.str{{.*}} ={{.*}} constant [13 x i8] c"12345678abcd\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [13 x i8] c"12345678abcd\00", align 1
+
+char *s3 = "123456789012345678901234567890ab";
+// LA32: @.str{{.*}} ={{.*}} constant [33 x i8] c"1234{{.*}}ab\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [33 x i8] c"1234{{.*}}ab\00", align 1
+
+char *s4 = "123456789012345678901234567890123456789012345678901234567890abcdef";
+// LA32: @.str{{.*}} ={{.*}} constant [67 x i8] c"1234{{.*}}cdef\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [67 x i8] c"1234{{.*}}cdef\00", align 1
+
+int8_t a;
+// LA32: @a ={{.*}} global i8 0, align 1
+// LA64: @a ={{.*}} global i8 0, align 1
+
+int16_t b;
+// LA32: @b ={{.*}} global i16 0, align 2
+// LA64: @b ={{.*}} global i16 0, align 2
+
+int32_t c;
+// LA32: @c ={{.*}} global i32 0, align 4
+// LA64: @c ={{.*}} global i32 0, align 4
+
+int64_t d;
+// LA32: @d ={{.*}} global i64 0, align 8
+// LA64: @d ={{.*}} global i64 0, align 8
+
+intptr_t e;
+// LA32: @e ={{.*}} global i32 0, align 4
+// LA64: @e ={{.*}} global i64 0, align 8
+
+float f;
+// LA32: @f ={{.*}} global float 0.000000e+00, align 4
+// LA64: @f ={{.*}} global float 0.000000e+00, align 4
+
+double g;
+// LA32: @g ={{.*}} global double 0.000000e+00, align 8
+// LA64: @g ={{.*}} global double 0.000000e+00, align 8
+
+struct H {
+  int8_t a;
+};
+struct H h;
+// LA32: @h ={{.*}} global %struct.H zeroinitializer, align 1
+// LA64: @h ={{.*}} global %struct.H zeroinitializer, align 1
diff --git a/clang/test/CodeGen/PowerPC/save-reg-params.c b/clang/test/CodeGen/PowerPC/save-reg-params.c
new file mode 100644
index 0000000000000..d34c97a8fc45f
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/save-reg-params.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
+// RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
+// RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
+
+void bar(int);
+void foo(int x) { bar(x); }
+
+// SAVE: attributes #{{[0-9]+}} = { {{.+}} "save-reg-params" {{.+}} }
+// NOSAVE-NOT: "save-reg-params"
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt.c
index dcdfb7be46ad6..ba164258552fe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt.c
@@ -327,55 +327,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4(vuint32m8_t src, size_t vl) {
   return __riscv_vfncvt_f_xu_w_f16m4(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4(vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4(src, vl);
-}
 
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2
 // CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
@@ -537,46 +488,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4(vuint64m8_t src, size_t vl) {
   return __riscv_vfncvt_f_xu_w_f32m4(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2(vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1(vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2(vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4(vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +808,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_m(vbool4_t mask, vuint32m8_t src, size_t v
   return __riscv_vfncvt_f_xu_w_f16m4_m(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_m(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +968,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_m(vbool8_t mask, vuint64m8_t src, size_t v
   return __riscv_vfncvt_f_xu_w_f32m4_m(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_m(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm
 // CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1467,56 +1288,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm(vuint32m8_t src, size_t vl) {
   return __riscv_vfncvt_f_xu_w_f16m4_rm(src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm(vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm(src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm
 // CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1448,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm(vuint64m8_t src, size_t vl) {
   return __riscv_vfncvt_f_xu_w_f32m4_rm(src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm(vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm(vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm(vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm(vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm(src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1768,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_m(vbool4_t mask, vuint32m8_t src, size_
   return __riscv_vfncvt_f_xu_w_f16m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,43 +1928,3 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_m(vbool8_t mask, vuint64m8_t src, size_
   return __riscv_vfncvt_f_xu_w_f32m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
-}
-
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt_f_f.c
new file mode 100644
index 0000000000000..414e2bddedb6c
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfncvt_f_f.c
@@ -0,0 +1,369 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4(vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2(vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1(vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2(vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4(vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm(vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm(vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm(vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm(vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm(vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm_m(mask, src, __RISCV_FRM_RNE, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt.c
index 39e4e70edfaa8..98bdcc5fa030b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt.c
@@ -327,56 +327,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8(vuint16m4_t src, size_t vl) {
   return __riscv_vfwcvt_f_xu_v_f32m8(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2(vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1(vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2(vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4(vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8(vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1
 // CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8(vuint32m4_t src, size_t vl) {
   return __riscv_vfwcvt_f_xu_v_f64m8(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_m(vbool4_t mask, vuint16m4_t src, size_t v
   return __riscv_vfwcvt_f_xu_v_f32m8_m(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_m(vbool64_t mask, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_m(vbool32_t mask, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_m(vbool16_t mask, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_m(vbool8_t mask, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_m(vbool4_t mask, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8_m(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_m(vbool8_t mask, vuint32m4_t src, size_t v
   return __riscv_vfwcvt_f_xu_v_f64m8_m(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4_m(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8_m(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfwcvt_x_f_v_i32mf2_rm
 // CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt_f_f.c
new file mode 100644
index 0000000000000..5bff79780d926
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vfwcvt_f_f.c
@@ -0,0 +1,188 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2(vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1(vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2(vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4(vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8(vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_m(vbool64_t mask, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_m(vbool32_t mask, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_m(vbool16_t mask, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_m(vbool8_t mask, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_m(vbool4_t mask, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4_m(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8_m(mask, src, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16.c
index d7abbd756b732..5fa1d8ccd2665 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16ff.c
index 76b615ee0c04e..e72305a058d6f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vle16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei16.c
index 7c75172584e3d..7b19d8b8cfebf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei32.c
index 87d241e0f4db2..fd018ceb27e91 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei64.c
index 288c4a6ba0ff6..fe0209c9112d0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei8.c
index ef40b006d9ffa..1517f44f3d46b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei16.c
index e28fe15b3e732..cb6e295971432 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32.c
index ba04a70594012..07c906d7198ef 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei64.c
index 6852385f63cbb..4e258ce5625d5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei8.c
index e3fe02c416f04..98e33f136a1da 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei16.c
index 32c988dffa113..89a9bb51d9fbc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei32.c
index 43e430b83e8af..9346bd01e7956 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei64.c
index 23897e99620d7..0aa85eaf1f060 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei8.c
index 74bac5c5ccceb..fee229eb57599 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei16.c
index 68471decdceab..495b6a53a15d3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei32.c
index 974c3b1e2c2c1..a45577ececeb7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei64.c
index 36434dac4e48b..a55a911046ec0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei8.c
index c965f450dcb90..492b4dbd572e9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei16.c
index bb58f183afb48..4ae47c980f336 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei32.c
index 6c2ad98934b68..cee8f91a61187 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei64.c
index 5c39c277d446a..ed5a6971f0bc1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei8.c
index 044bc21bbffbc..acfdeae83af33 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei16.c
index 1e7b515423234..f3b173e97cf6e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei32.c
index 739a09830ac46..3cb1f7cda90f8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei64.c
index 9cb708221fca8..cc2a6e0f46dbf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei8.c
index 0bdc3f486a9cf..91398673a642a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei16.c
index 1e89aefeadbb1..cb6e74a3510f3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei32.c
index 0545d6e4d81c0..8a8d02ef71627 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei64.c
index 804ab4eaa959e..fb1104ec684f5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei8.c
index bf35d15d22fd6..55888b21b15c4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei16.c
index a6c3abe09788a..aafe70020e095 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei32.c
index 9c289773fe239..8d22079a9f1f3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei64.c
index 1d1622b6314ad..52cc01095e4b9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei8.c
index 314ea7021d517..f79da9ac5008b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlse16.c
index 4df96f31fd211..10ecf902d414c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16.c
index 285d9981a65ce..0bc9ffe892437 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16ff.c
index 3bc95e0364362..c211aec2da58d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32.c
index f7e4999f8feb6..1905edc5e451e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff.c
index 822aec3394772..610d89e0785bd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64.c
index 243d7458c3810..5db107d5ac8fa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64ff.c
index 18c0a997503f1..672b74623cb7d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e64ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16.c
index 3083a75e28d6f..6b6ea48ac8987 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16ff.c
index b76598f0ba8b7..183ed065936f4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg3e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16.c
index 54e34a7291606..7dc4dc29cbc53 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16ff.c
index d4cc8767a2ec3..223513ef6d041 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e32ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e32ff.c
index 17f8ecc876e64..2b2d578dbddae 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e32ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg4e32ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16.c
index ddeac49b5fac7..6c2a1300802b3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16ff.c
index 3eb6d2a623672..9a6e218270ec7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg5e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16.c
index 2dccc5dcdca41..ae7141b522440 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16ff.c
index ba4c2ff12b8bb..3cf099ed8156c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg6e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16.c
index e793ae1fad418..d47291d956cb2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16ff.c
index e6d99bb2d40e1..5499c85228316 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg7e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16.c
index eeca3596a928d..f145ec75c1ef9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16ff.c
index 3fa816e8c4754..6be6749b32c21 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg8e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e16.c
index c05d1d3e06584..465e546f241d6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg3e16.c
index 02f55e960bd01..808271bc598ba 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg4e16.c
index b8843db070310..75604bd2fab38 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg5e16.c
index ec65b79092a33..81bc04cb5fc44 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg6e16.c
index 1dea6fa39ee9e..3ddcba0de3c75 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg7e16.c
index 9bf73d4b71ba5..b997240af861a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg8e16.c
index 9d9ff88ef1945..ef40369807a0f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei16.c
index a426aa39d1094..8419892183de8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei32.c
index 7cc93d1a4dcc8..da42d627f7814 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei64.c
index 304a8837efa88..91bd6b3b8c8a9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei8.c
index b279cd3018e2c..529a1124e9399 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei16.c
index 062db9ef80274..4e0413418e4eb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32.c
index af0f75c4a9362..79f5999130ae8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei64.c
index 64d0599ff9f65..155452e397699 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei8.c
index 5a33217ef34b2..bcfe6076afbc0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei16.c
index c985dd6456aac..70eb73aebb379 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei32.c
index f58be50bee18c..9b30ed74232fd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei64.c
index e9fcd9638c0c0..083dc211f2fb4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei8.c
index 4d24b3e1dbaba..74a4f25b33b20 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei16.c
index 2c8f4013a4008..8f0d0763cac22 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei32.c
index 0b593f9f78312..3546a1462d911 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei64.c
index 03b2cee43b6e8..9a5ec01f8409f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei8.c
index 0428eb7bda333..1cae1cc1f50c0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei16.c
index eb5d743bca3c3..65b4782532c34 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei32.c
index c3329e1a150e3..a30f8931fbb07 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei64.c
index 16aedc6566138..26e385377beaa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei8.c
index 1856a1044fb30..d46a3340d8767 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei16.c
index 7e2e8d631ccd3..a1d644c7c7160 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei32.c
index 1bb269900fbb2..1b25720c8e3e0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei64.c
index c98083af14403..ebb5cc36e380d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei8.c
index 8d5b3c08669bd..0295a956d14e4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei16.c
index d169faf7f12ad..219ee239cb22d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei32.c
index 0514e317680d7..a0595eddfbd71 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei64.c
index ac6660b5be089..56de99c4aa348 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei8.c
index 1d470393be971..87a5a07f34f25 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei16.c
index 2c4e1fd23dd47..0a6b6d11ed75e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei32.c
index fededc60f1680..cb882d3468898 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei64.c
index bd071394a216b..b0e7cfa57d5ea 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei8.c
index ebebfdd7eb3de..32a8d73f12084 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmerge.c
index c52402424a81b..fb41a07cccec0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmerge.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmerge.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmv.c
index be668d8c5db90..c25719a80d4fb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmv.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vmv.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vreinterpret.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vreinterpret.c
index d141adb8337b3..31c6d11af6712 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vreinterpret.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vreinterpret.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vse16.c
index 738e8a9148035..ea4efc5f726cf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei16.c
index 857df5bef7f59..4e19c562d99d0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei32.c
index 2aab1ef35c714..ec0c718684d6f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei64.c
index ce3dba93db6f9..a1874ca82db74 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei8.c
index e09a61edb38d2..bc152e6040360 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei16.c
index be3cf941d5822..f63c698f063bd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32.c
index 5e710a2fa3e7a..3e766f8535ad7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei64.c
index 9444d32f8e8ca..a0f0c01596157 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei8.c
index eb10c05443ed1..79f5b2406ccb7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei16.c
index dd23c3222605e..bacfb5fd74a26 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei32.c
index e2ecf97081aae..0de8d95294567 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei64.c
index ec42accf3555e..fe92e2e40780a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei8.c
index 2a7ca507e7b33..f09b7e376fcd8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei16.c
index 8fa5968a05360..66911b61480e6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei32.c
index c5543eae848c8..08f6edbb08814 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei64.c
index 4618d9714b39f..9af14464edae2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei8.c
index c6caf517cbfbc..869b895a985c4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei16.c
index 3a56bae880c49..b6a7c654bf0e0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei32.c
index 1f7e4a268b235..284e57fcff355 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei64.c
index 224d32e156e96..bad29fc2482d1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei8.c
index c65e2aae8806a..224fd13603fb6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei16.c
index 6785a1ce404db..1c014eef1dc17 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei32.c
index e7ca9997bdd21..dffed80c778e1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei64.c
index 6cdcd393e51c3..eba5fb2b152bd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei8.c
index 56ae39a0ced73..68cb43a5af71d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei16.c
index 1ca056390d7b2..3a4dcc66ec93f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei32.c
index e9d93241baa0d..4b189864554ce 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei64.c
index 14b8537f9bbd7..d2ce965c8f193 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei8.c
index e9e25f61fa3e5..d4790b89a2085 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei16.c
index c1766d023d107..99ec694780004 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei32.c
index 69b69ea4fffab..8b0f21ec75ba2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei64.c
index abc4b8c9fc0ad..2d7aaf5930f4a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei8.c
index 63246b34ef375..cad9326b85319 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsse16.c
index aa72848727211..72acf08cefa16 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e16.c
index aa50433411cf2..a46fb29388366 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg3e16.c
index 2bb662dcf0094..7810857321814 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg4e16.c
index 64e2970bb7f85..d7ad5fe0795e3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg5e16.c
index 5d9e05e721ad3..369729fef0251 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg6e16.c
index 24f0a257aee01..cf4e6b2733f7d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg7e16.c
index bbec658271543..c876fa50b25ca 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg8e16.c
index dd14f3bf56f2e..a00514e87deab 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e16.c
index 51840657fd62b..dd664ac51d0af 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg3e16.c
index be91941218884..cc2622ecd5bf3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg4e16.c
index 8943ab92113bd..c53efec7dfcd2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg5e16.c
index 6d8c9d38e567a..5d37ba071baf4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg6e16.c
index 71cc5d70e9fe8..cf5565c280a8f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg7e16.c
index a5f0df94b00fa..f39e422693775 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg8e16.c
index d13667b9dcf2e..d691d64fbcfd7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei16.c
index 27d0068dabacf..32c0fab691a2f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei32.c
index 502ea760d215a..b046612a2896c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei64.c
index dba856e29ec63..1a4ac1850a314 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei8.c
index 23fa635db59ee..92737c1fa4c6c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei16.c
index c0706dabf158b..fa283f8a6357b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32.c
index 93c47d1eff81e..c5bef275632f1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei64.c
index 765b78b7aef0c..7da29bbc3ef8d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei8.c
index ef4a611335251..3ea283b974135 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei16.c
index 631d4f1700134..307b66473cd27 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei32.c
index 711ea8cc8362d..a402734ff7849 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei64.c
index c87cf5c2e27f8..5771a4aab78ff 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei8.c
index c6710598d3cff..ffd6405c09fec 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei16.c
index e4725f291adf2..f1ddac3211bde 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei32.c
index dc569b840941b..58e32a6f30601 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei64.c
index e0a9f4569c5df..74b2b929d50c7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei8.c
index 93f5ab1998088..ef4f4ef0ffb29 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei16.c
index 4574e1cf354a1..44cb1e9a9a812 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei32.c
index 6244a4f0c3832..e689873bf43f9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei64.c
index 3a92dcfc577c7..c5d0dd1b76bc2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei8.c
index 0980a91d69021..3bad6de8b12fb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei16.c
index abd8be607ff6b..cda0d96d04b82 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei32.c
index e58d76a46c0b4..f84d6e46f7e3d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei64.c
index 125be14dab509..e3b07df9bfc51 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei8.c
index a401f305d6cdc..c24145ac7be67 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei16.c
index 8df81d192619e..f452ce75de724 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei32.c
index 43f5dd46a284e..44fd0c9803085 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei64.c
index 369f3c0cd73b0..78df8958be7c0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei8.c
index 59242eb71a33b..7fa707b5c4682 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei16.c
index 6e892339dafd2..28f4623f39620 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei32.c
index 0f40c87648d07..8b8a12c63d8c3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei64.c
index 708204fced65c..ce2704f913e5c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei8.c
index 5719ffea7335d..bd61169c84ed5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt.c
index a62dd2e054542..10eb4ea7d1238 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt.c
@@ -327,56 +327,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4(vuint32m8_t src, size_t vl) {
   return __riscv_vfncvt_f(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4(vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2
 // CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4(vuint64m8_t src, size_t vl) {
   return __riscv_vfncvt_f(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2(vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1(vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2(vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4(vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_m(vbool4_t mask, vuint32m8_t src, size_t v
   return __riscv_vfncvt_f(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_m(vbool8_t mask, vuint64m8_t src, size_t v
   return __riscv_vfncvt_f(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm
 // CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1467,56 +1287,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm(vuint32m8_t src, size_t vl) {
   return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm(vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm
 // CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1447,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm(vuint64m8_t src, size_t vl) {
   return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm(vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm(vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm(vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm(vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1767,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_m(vbool4_t mask, vuint32m8_t src, size_
   return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,43 +1927,3 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_m(vbool8_t mask, vuint64m8_t src, size_
   return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
-}
-
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt_f_f.c
new file mode 100644
index 0000000000000..2c017d5fccac6
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfncvt_f_f.c
@@ -0,0 +1,369 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4(vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2(vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1(vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2(vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4(vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm(vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm(vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm(vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm(vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm(vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> poison, <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_m(vbool4_t mask, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> poison, <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_m(vbool64_t mask, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> poison, <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_m(vbool32_t mask, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> poison, <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_m(vbool16_t mask, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> poison, <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_m(vbool8_t mask, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f(mask, src, __RISCV_FRM_RNE, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt.c
index f59f986b7ad44..4dcab0dc956d3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt.c
@@ -327,56 +327,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8(vuint16m4_t src, size_t vl) {
   return __riscv_vfwcvt_f(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2(vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1(vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2(vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4(vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8(vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1
 // CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8(vuint32m4_t src, size_t vl) {
   return __riscv_vfwcvt_f(src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1(vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2(vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8(vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_m(vbool4_t mask, vuint16m4_t src, size_t v
   return __riscv_vfwcvt_f(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_m(vbool64_t mask, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_m(vbool32_t mask, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_m(vbool16_t mask, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_m(vbool8_t mask, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_m
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_m(vbool4_t mask, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_m
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_m(vbool8_t mask, vuint32m4_t src, size_t v
   return __riscv_vfwcvt_f(mask, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_m
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_m
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_m
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_m
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f(mask, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfwcvt_x_f_v_i32mf2_rm
 // CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt_f_f.c
new file mode 100644
index 0000000000000..96825ff3ab73d
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vfwcvt_f_f.c
@@ -0,0 +1,188 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2(vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1(vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2(vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4(vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8(vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1(vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2(vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4(vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8(vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> poison, <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_m(vbool64_t mask, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> poison, <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_m(vbool32_t mask, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_m(vbool16_t mask, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> poison, <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_m(vbool8_t mask, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_m
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_m(vbool4_t mask, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_m
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> poison, <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_m(vbool64_t mask, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_m
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> poison, <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_m(vbool32_t mask, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_m
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> poison, <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_m(vbool16_t mask, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_m
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> poison, <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_m(vbool8_t mask, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f(mask, src, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16.c
index e6b9b12966859..2619ab270ff3a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16ff.c
index d2a9819a10677..7b56c07da07d6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vle16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei16.c
index 5fadd983f6225..1e844dc531ee2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei32.c
index b4a3d98694d20..8e5bf2f7db245 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei64.c
index 34ab499f78bf5..57b945e01293d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei8.c
index 4b1a0b4a96936..3d330b38dbd10 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei16.c
index bdd448a2e7aba..3a7f583b5967a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei32.c
index 801ad6f4fd45d..c7d9ec9044564 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei64.c
index 539c0733e827a..ab973072be3bb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei8.c
index 602c6e9552686..02f11042afc65 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei16.c
index 892d2e39be380..f7061b87281c8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei32.c
index 3137ed3abe3e0..aee251a1b3e14 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei64.c
index 1abcc1f7d35b9..b440267822dd0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei8.c
index 0fbe8a38406a8..c322762102908 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei16.c
index 86250ece1cf5a..9c6420d484c61 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei32.c
index 20e998f2921a9..9812f060fa39e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei64.c
index 5d23db26f4d48..8f0a8e54ba075 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei8.c
index f772f190030b0..0e07251576351 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei16.c
index 5df3d1597ca20..66b8916772e7e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei32.c
index 5ffde2352624e..edd5aaaaddbf5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei64.c
index 1312733533b49..5bf228fbbe390 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei8.c
index a0de00862d4db..e5ee08be86d79 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei16.c
index 0a183183b8989..d4e92cd0b898d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei32.c
index 108bed88cd9a5..3f42dfdbf12c9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei64.c
index 42d4b1af039e4..22f0cbd3a88c2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei8.c
index 69a0e9ede8be7..5f05d592748ab 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei16.c
index b0b14540340f6..976321790fc08 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei32.c
index 604dcd84cb05c..275f12f8ab401 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei64.c
index e30f8018c05bf..cf25c8daed08d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei8.c
index 62b602ae6a81c..09a3cc183768e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei16.c
index a691eaf564a6e..b478d96d6a247 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei32.c
index 608b2457b3ecd..07e93052d056b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei64.c
index 14b94cde4be7c..c95bab7c8b865 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei8.c
index 2e2eb03c941d7..25fb1cbb07390 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vloxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlse16.c
index f2aba9a6b2849..4fdb4045769b4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16.c
index 4eb2c7102dbcf..8eb78f9c2a833 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16ff.c
index a3d0d595736ff..975ea0865e14c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg2e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16.c
index 036dd8b855ac3..6915855fa438f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16ff.c
index 57635694d5d3f..94ece8d731414 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg3e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16.c
index 39a64d0e0066b..96da31eef0822 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16ff.c
index 9017f5951b56f..6faee625460a7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg4e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16.c
index fdd31c3b3fa77..998905b8eecb9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16ff.c
index 9bc3c2659a1ff..fd2b398629bff 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg5e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16.c
index 511a9495cb113..57c4cff6a85c3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16ff.c
index 8057b94b49b37..cfeb725c1e690 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg6e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16.c
index 14a521e3d9d25..8ffff51bd2f36 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16ff.c
index b4e94e31ecf13..31da9fa31b3bc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg7e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16.c
index 5a35fc6d59b8a..4d229ac49f6e8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16ff.c
index 54e325286e459..c5dc4cff8a7b2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlseg8e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg2e16.c
index 9d23e742d0d64..2e22f0e731174 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg3e16.c
index bd59dd6c1eef1..e8cb448459545 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg4e16.c
index 44a65ac07b598..ec6ec2a9cd111 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg5e16.c
index 654fd18fcf0d3..6e77e24991af0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg6e16.c
index 8a2d8e063695e..c1b97a8ed6cbc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg7e16.c
index 2bbffe0f33301..3b7894b75a1d5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg8e16.c
index e5b67749fa0ef..1eb13dffe1c37 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vlsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei16.c
index 15d2379cb769d..59820e5572b20 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei32.c
index 6e6f472d13b92..3697e0d7014dc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei64.c
index fb3adb4a56518..e794888b7c73b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei8.c
index c2337ed01b298..853a7adb6c478 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei16.c
index f85a4d21e0d80..2d7400a03629e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei32.c
index 05b82b50b41e9..e8a933a9e3edf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei64.c
index f2b86dc6cf0ac..7537868b10d9d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei8.c
index f215f7c06be09..bcb378288e99d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei16.c
index bd01e4a26ef42..2618904bda3ba 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei32.c
index d1e29f62400bc..37de7d3d9ab6e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei64.c
index f1aa76a9910d8..9c6120d546cbe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei8.c
index 32cf171063f4e..5a7f1657a2a3a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei16.c
index f51854479df0c..bbb64b057781e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei32.c
index 0002ed1f54bf0..68a12760e8c86 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei64.c
index e0936fb0e3f7c..1cf8074143ee3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei8.c
index c6448978d08cc..1c8273f71797b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei16.c
index e86e80aad18ba..a45981209d449 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei32.c
index f6280c9315ac7..58f9f8f1377bb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei64.c
index fc5ab22010d20..256fd9f6b181d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei8.c
index 7973a137e411a..9cbbe4ba0fe73 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei16.c
index a91e920113b4d..051627ce134c9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei32.c
index 7e7324f1aa41f..d5321b8570a63 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei64.c
index fc248bce9165c..30e9944c30365 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei8.c
index e78ccf41101d2..1e15ab2b9c0a7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei16.c
index 4337a97d5a129..1acf7e77c1ee2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei32.c
index 78609f0473205..4721c74f6699b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei64.c
index 002178b126130..f69c4a40c23f9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei8.c
index 882ad94a1dec6..f950650cc3c2f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei16.c
index 42a01fdf73374..b1b60e5894f36 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei32.c
index 02591e137c960..6278d5ef47844 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei64.c
index 9ee2585b02a54..d35e436110942 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei8.c
index c9536929512d4..9592ecd4a5e25 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vluxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmerge.c
index 2ed27e6d7b467..d67aa70fc8c8a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmerge.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmerge.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmv.c
index ffd6f7dc4f577..44f34c4fe8101 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmv.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vmv.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vreinterpret.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vreinterpret.c
index e1af8ba146562..0c33d509114df 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vreinterpret.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vreinterpret.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vse16.c
index 3cb965b826840..6d15de10bdf46 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei16.c
index 1986cc90727d4..0ea2fa60849b1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei32.c
index b414a5a36d308..e79d8c592a673 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei64.c
index ba1a6dbf7edaa..a238c56f45542 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei8.c
index 8295692189bb9..8c4805270440c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei16.c
index 4c4c1b3aa2d6b..53d203ba5202b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei32.c
index c77eb7ea794aa..16e4e2b045877 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei64.c
index 066f844c40d70..b852fd81eb4ff 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei8.c
index fe8f137556acb..69abe299351ac 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei16.c
index e6df42969f01a..db106d7e31ffe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei32.c
index 5e4b7c1123a8a..23976cdac17b6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei64.c
index 8b70ad11aed26..7a1c4a5260944 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei8.c
index dbcb1a9d24542..b8fd905d9ae83 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei16.c
index c891f1ecabf07..201b0e38655a1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei32.c
index fcfc94dcf9dd3..70ac9357a2dfc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei64.c
index 2631aed62911f..c0255f3ce6735 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei8.c
index 43353a9a8f4af..27a4a520335a4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei16.c
index 751d564fcfe4c..99a2ad5085c48 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei32.c
index cac2ac580eb66..e6a39b48889dd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei64.c
index 5db0e8f43aa4b..867758cdfe351 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei8.c
index 878a396be458f..80b60b3b59353 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei16.c
index 25d812a4dcb29..5f6728ec93de0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei32.c
index 2cef48221a471..690ea92cbd7f4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei64.c
index e418542898761..9aa6577fcc9ee 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei8.c
index 3f62203eb682a..580464506d46e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei16.c
index 77f4ef8f62a5c..2931707e2489d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei32.c
index c37268c32160b..ea0d8b47c4001 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei64.c
index b0748812dae74..9144bc14042e4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei8.c
index e978b6c1654b7..9548a2abf245c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei16.c
index 385719ae80b95..9b68352389aa6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei32.c
index c8d7df8791918..6ae6c2d62420b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei64.c
index 3aec2a1c30d4d..bc36d126d92df 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei8.c
index 2c6ceee55c49c..f52c91bdc6649 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsoxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsse16.c
index c1c1dd465a864..576d8df491b36 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg2e16.c
index 495c8536001c8..fc09cfd4bf781 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg3e16.c
index 83889bc693ec6..8aa890eb4ee2d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg4e16.c
index 4f3d79d6941a2..692cb8f46e3a5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg5e16.c
index 73db6a6f10e20..ec51aa19c7155 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg6e16.c
index ea7115567c36e..4c790d52ff46e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg7e16.c
index 3a4248a6e86f6..6ac7c800b6f8c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg8e16.c
index eb31c335b3228..3778d499d2f6f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg2e16.c
index e379891e43759..fc58882b568e5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg3e16.c
index ccb28e126e757..1d17d93318327 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg4e16.c
index de3fc0718f252..c4731e71b6fbc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg5e16.c
index 5143b7bbcd69a..1b254aba03291 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg6e16.c
index 2fdef1570a3c7..c094f08e38375 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg7e16.c
index 9f3f007e6d066..4f8c7fad1c1dc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg8e16.c
index 992b84da5ba1f..8c89dfa0a818e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vssseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei16.c
index 42dd207d78a39..2d38a79950829 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei32.c
index 09f6cdf3dc66c..e0ca634229712 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei64.c
index 1f897a1b2e6f9..819fa8af5affb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei8.c
index 79d49fef32de4..23386d7ce3919 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei16.c
index 3f9a46069dc4e..42684da49e62c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei32.c
index b9b7798f36a73..bb329086b5af0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei64.c
index a5fd03f1e47f7..b79fd032f9ba8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei8.c
index f9da66db4a52b..1292fd91a1ab1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei16.c
index b4b830694e83b..aba54c0887ef8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei32.c
index 196b404da034b..c2d6fee7e279a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei64.c
index d42924709a5dd..52c2438cac544 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei8.c
index f7046b11aaae4..398f9d6ba1e42 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei16.c
index 853612483ff3f..b5ae454ef843f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei32.c
index ad2f062c0d0f7..7ea185ba9fe90 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei64.c
index 245b9041e64fb..b6e4aa5ac1fcd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei8.c
index 16334f8d4f7bc..037630051d405 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei16.c
index 2c001f00436ed..f7d9682853512 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei32.c
index fab54bfe517b2..067d7c5431aff 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei64.c
index 08fee857416a0..5f7c0ce93952d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei8.c
index 43cdd093d73fd..20535e664b05d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei16.c
index 8e8788272ddf5..7eb59526a55dc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei32.c
index 3255a5ca32606..c8b7caa75aaa8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei64.c
index 6ea66a6f7d5be..6f0b838651336 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei8.c
index 671014b152540..8a2c64c38a3fc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei16.c
index 66ec251fff8f9..0b347cf8513e8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei32.c
index b6d35f5080298..d8270260a1c9b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei64.c
index c8bf3c372c8e1..68916f17747a9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei8.c
index c527a7231f3a3..3b4f8af16ffc5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei16.c
index 52c7a8f067ad1..f11b5911c305b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei32.c
index ccb43829631b3..be8993a1990dc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei64.c
index c71edaaad735d..47e12eea323a8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei8.c
index b025649ef2cff..c9f70e2ae96a0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vsuxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt.c
index e52e5b592d3f6..cd2f074f74b98 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt.c
@@ -327,56 +327,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tu(vfloat16m4_t maskedoff, vuint32m8_t src
   return __riscv_vfncvt_f_xu_w_f16m4_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tu
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tu
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tu
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tu
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tu
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tu(vfloat32m4_t maskedoff, vuint64m8_t src
   return __riscv_vfncvt_f_xu_w_f32m4_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff,
   return __riscv_vfncvt_f_xu_w_f16m4_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff,
   return __riscv_vfncvt_f_xu_w_f32m4_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1467,56 +1287,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff
   return __riscv_vfncvt_f_xu_w_f16m4_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1447,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff
   return __riscv_vfncvt_f_xu_w_f32m4_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1767,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff,
   return __riscv_vfncvt_f_xu_w_f16m4_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,46 +1927,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff,
   return __riscv_vfncvt_f_xu_w_f32m4_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2607,56 +2247,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vuint32m8_t
   return __riscv_vfncvt_f_xu_w_f16m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tu
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2817,46 +2407,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vuint64m8_t
   return __riscv_vfncvt_f_xu_w_f32m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3177,56 +2727,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedo
   return __riscv_vfncvt_f_xu_w_f16m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3387,46 +2887,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedo
   return __riscv_vfncvt_f_xu_w_f32m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3747,56 +3207,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t masked
   return __riscv_vfncvt_f_xu_w_f16m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3957,46 +3367,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t masked
   return __riscv_vfncvt_f_xu_w_f32m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -4317,56 +3687,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedof
   return __riscv_vfncvt_f_xu_w_f16m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16mf2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m1_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f16m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -4527,43 +3847,3 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedof
   return __riscv_vfncvt_f_xu_w_f32m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32mf2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m1_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_f_w_f32m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt_f_f.c
new file mode 100644
index 0000000000000..a704525df0e51
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfncvt_f_f.c
@@ -0,0 +1,729 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tu
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tu
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tu
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tu
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tu
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tu
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16mf2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m1_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f16m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32mf2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m1_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m2_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_f_w_f32m4_rm_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt.c
index 16a26766806f7..acb88ed5f3bd8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt.c
@@ -327,56 +327,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tu(vfloat32m8_t maskedoff, vuint16m4_t src
   return __riscv_vfwcvt_f_xu_v_f32m8_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tu(vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tu(vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tu(vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tu(vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tu(vfloat64m8_t maskedoff, vuint32m4_t src
   return __riscv_vfwcvt_f_xu_v_f64m8_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tu
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tu(vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tu
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tu(vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tu
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tu(vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tu
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tu(vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff,
   return __riscv_vfwcvt_f_xu_v_f32m8_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff,
   return __riscv_vfwcvt_f_xu_v_f64m8_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tum(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tum(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tum(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1467,56 +1287,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff
   return __riscv_vfwcvt_f_xu_v_f32m8_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1447,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff
   return __riscv_vfwcvt_f_xu_v_f64m8_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tumu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tumu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tumu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1767,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff,
   return __riscv_vfwcvt_f_xu_v_f32m8_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32mf2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m1_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m4_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f32m8_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,46 +1927,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff,
   return __riscv_vfwcvt_f_xu_v_f64m8_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_mu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m1_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_mu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m2_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_mu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m4_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_f_v_f64m8_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfwcvt_x_f_v_i32mf2_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt_f_f.c
new file mode 100644
index 0000000000000..3a8cfb46c9b0e
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vfwcvt_f_f.c
@@ -0,0 +1,368 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tu(vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tu(vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tu(vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tu
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tu(vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tu
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tu(vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tu
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tu(vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tu
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tu(vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tu
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tu(vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tum(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tum(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tum(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tumu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tumu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tumu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32mf2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m1_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m4_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f32m8_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_mu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m1_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_mu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m2_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_mu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m4_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_f_v_f64m8_mu(mask, maskedoff, src, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16.c
index 81ebd741ff356..f34cf48053cc4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16ff.c
index 6c7a930bfdf9a..1d0722dbf2008 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vle16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei16.c
index 95c81e92a6c54..c042b73228b60 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei32.c
index fc4cba11d3890..0f1ac7312ad89 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei64.c
index 3a4909fe5de0b..2755a4c5e59b8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei8.c
index eb1a22c016edb..54497b607bf92 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei16.c
index ecc3943488f86..83db0fabc771a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei32.c
index c37e14b059c64..0844848e74af8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei64.c
index 1669cba208dc9..edbf69833f089 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei8.c
index fde713cfe0be3..7601c65d1dc63 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei16.c
index 1e3bc5ade6855..be97d06949b86 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei32.c
index cb25f67f7f86f..04bb11fec3551 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei64.c
index c99e2741bc3fe..71c591b780230 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei8.c
index be1132ddc0a5a..8eb9763240ce6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei16.c
index 7d776e01209c4..add1067e4a4bf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei32.c
index 932355d381f9c..e5ea6fea80df9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei64.c
index 263e48868f6c5..dbaa07e075a81 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei8.c
index 7f6d944629f92..4bf420cf5b3e2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei16.c
index 08f4d24856594..2d680aca126b0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei32.c
index 44d49e5e2c558..ca39a856d2018 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei64.c
index e8b259635dc36..0985eb069a70e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei8.c
index 27d0e843b4ebb..d09fb723d2f93 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei16.c
index b5a05193af2dc..fc1c6926710be 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei32.c
index 2081f947da874..567002ccb232f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei64.c
index 0e8888bd25f03..c3a0b42b94549 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei8.c
index 0dd8667ba76ad..0e9765c70d6d9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei16.c
index 01c461d4b619f..f73a859bae48a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei32.c
index b835c5f8cbdb9..d557943ac57d8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei64.c
index c3969d223c737..9b9d502fc96a4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei8.c
index 0b6b3424d8e2c..04c48afa37cf8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei16.c
index 3d658e3effdb3..bd27639fc498f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei32.c
index 3c91aecab77a1..5012d7e6383aa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei64.c
index 394fa602cd984..658ab532a7a5d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei8.c
index c19e4637abfe0..3c764c248f7d0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vloxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlse16.c
index 9ff1aa0003cae..469dd7d654272 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16.c
index 69e3656edc748..e28809ee7df7b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16ff.c
index 36a2ff30b3010..e10b9971d8ecf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg2e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16.c
index 1e7a689bc2327..c97bfa10ca5cd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16ff.c
index 3089f0e369c94..d54571666c367 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg3e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16.c
index 969363878108b..7c20dcc2fea6f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16ff.c
index bf2b469585dca..a9e79452772f0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg4e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16.c
index 9a393f0331ca3..0c74d19237f3c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16ff.c
index 42e788e127e72..68be4e5f18e6c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg5e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16.c
index 4fca146f8c7a0..9ed47f14726a4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16ff.c
index abdd90fbd0466..2bcae7ee412e3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg6e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16.c
index ac15d63747272..7e558a3bad6a1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16ff.c
index 3d54c04ad1458..e0262f295dca5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg7e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16.c
index 8d60f5f0bed54..7d9f54712142c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16ff.c
index fa41b017a6c8c..765bb29c87dbd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlseg8e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg2e16.c
index 7260cfa0bb3be..7cfa8002ffecd 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg3e16.c
index a4fe77b229afa..3ae4c113d96bf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg4e16.c
index eff8ed9996db1..74986a9760ccb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg5e16.c
index 16dd04e106ed2..bef42c1993cbe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg6e16.c
index 872ca88ca4796..20fff1909d9f3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg7e16.c
index 39b81c4a52841..64d580e6211e1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg8e16.c
index 5053d27ef29dc..e5349b4c1fb91 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vlsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei16.c
index 7b584c9ee79c5..19496e5cbac1b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei32.c
index 7e8eda71ff436..ba4c9a6fb33d1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei64.c
index f788ff0b67acf..3f9b85eae1bad 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei8.c
index dde110aa2fc97..d5b858276dccf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei16.c
index a62d13ab140a0..32857db63474f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei32.c
index fdcb33efd8840..bd8bfcc4fcbf0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei64.c
index fb361396f75bc..778f26a5153e1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei8.c
index 47cc216bac392..27b257e4ac316 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei16.c
index a62294be66a36..4bff4f9f62a82 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei32.c
index f5558324cd3e9..253ea1304ff76 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei64.c
index 52c80e3fa647e..c4337e936086d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei8.c
index d6d5854b9129a..4aa07e42b6208 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei16.c
index e86cabd103510..b3d57e23fff04 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei32.c
index 98d882b66a5a1..36b79dbd4d3b5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei64.c
index 9d82d8566514b..11bc4c10f31ef 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei8.c
index b26f072ffd0bd..65b39071f9a6f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei16.c
index b98d447dcd12b..dfb0faf693b0c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei32.c
index aef2f814c79a9..e7d30dcc482b5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei64.c
index e7acb1e2f1fcd..8a8ee26fc1633 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei8.c
index ec89215f1cd46..a17eb2faa5008 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei16.c
index 999b4d1fdfecd..ac39344ce1fa8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei32.c
index e80bb4ee66e17..69d6bf33e3cbe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei64.c
index 14a7749aaf959..793380c6d3510 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei8.c
index bf2edd94340ea..8570fdc150226 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei16.c
index dcd2e93cb6611..c2f7afb6983a1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei32.c
index ca4294a707a54..5e277566e5724 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei64.c
index 1fd9fc1bcdcbb..373466580ce5a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei8.c
index 8912a7e9587cc..9e5ed180900a4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei16.c
index 1b951f9d01877..176f753e40a3d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei32.c
index 4a61541dfd231..01703cf756efe 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei64.c
index 055dc860c9afc..6a782d444e6cc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei8.c
index 6488b02056a7e..595fc708663e4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vluxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmerge.c
index 75365b75e3a65..4f723c579597e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmerge.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmerge.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmv.c
index a60c19eaf7f9f..2a5a0f4f892d3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmv.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vmv.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt.c
index 5ec971affac90..f6a454556e9dc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt.c
@@ -327,56 +327,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tu(vfloat16m4_t maskedoff, vuint32m8_t src
   return __riscv_vfncvt_f_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tu
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tu
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tu
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tu
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tu
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tu(vfloat32m4_t maskedoff, vuint64m8_t src
   return __riscv_vfncvt_f_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff,
   return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,46 +967,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff,
   return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1467,56 +1287,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff
   return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1447,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff
   return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1767,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff,
   return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,46 +1927,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff,
   return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2607,56 +2247,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vuint32m8_t
   return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tu
-// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2817,46 +2407,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vuint64m8_t
   return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3177,56 +2727,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedo
   return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3387,46 +2887,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedo
   return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3747,56 +3207,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t masked
   return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -3957,46 +3367,6 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t masked
   return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_i8mf8_rm_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -4317,56 +3687,6 @@ vfloat16m4_t test_vfncvt_f_xu_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedof
   return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
-//
-vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
-//
-vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
-//
-vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfncvt_x_f_w_i32mf2_rm_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -4527,43 +3847,3 @@ vfloat32m4_t test_vfncvt_f_xu_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedof
   return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
-  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
-}
-
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt_f_f.c
new file mode 100644
index 0000000000000..88d67f0b0a882
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfncvt_f_f.c
@@ -0,0 +1,729 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tu
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tu
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tu
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tu
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tu
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tu(vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tu(vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tu
+// CHECK-RV64-SAME: (<vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tu(vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tu(vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tu(vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tu(vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tu(vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tu(vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], i64 0, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tu(vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tu(maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tum(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tum(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tum(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tum(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tum(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tum(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_tumu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_tumu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_tumu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_tumu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_tumu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_tumu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfncvt_f_f_w_f16mf4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32.i64(<vscale x 1 x half> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_vfncvt_f_f_w_f16mf4_rm_mu(vbool64_t mask, vfloat16mf4_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_vfncvt_f_f_w_f16mf2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x half> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32.i64(<vscale x 2 x half> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_vfncvt_f_f_w_f16mf2_rm_mu(vbool32_t mask, vfloat16mf2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_vfncvt_f_f_w_f16m1_rm_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x half> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32.i64(<vscale x 4 x half> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_vfncvt_f_f_w_f16m1_rm_mu(vbool16_t mask, vfloat16m1_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_vfncvt_f_f_w_f16m2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x half> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32.i64(<vscale x 8 x half> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_vfncvt_f_f_w_f16m2_rm_mu(vbool8_t mask, vfloat16m2_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_vfncvt_f_f_w_f16m4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x half> [[MASKEDOFF:%.*]], <vscale x 16 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32.i64(<vscale x 16 x half> [[MASKEDOFF]], <vscale x 16 x float> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_vfncvt_f_f_w_f16m4_rm_mu(vbool4_t mask, vfloat16m4_t maskedoff, vfloat32m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfncvt_f_f_w_f32mf2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x double> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfncvt_f_f_w_f32mf2_rm_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat64m1_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfncvt_f_f_w_f32m1_rm_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x double> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfncvt_f_f_w_f32m1_rm_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat64m2_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfncvt_f_f_w_f32m2_rm_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x double> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfncvt_f_f_w_f32m2_rm_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat64m4_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfncvt_f_f_w_f32m4_rm_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x double> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x double> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfncvt_f_f_w_f32m4_rm_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat64m8_t src, size_t vl) {
+  return __riscv_vfncvt_f_mu(mask, maskedoff, src, __RISCV_FRM_RNE, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt.c
index ecd1e7d3c7d39..327a67c133091 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt.c
@@ -327,56 +327,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tu(vfloat32m8_t maskedoff, vuint16m4_t src
   return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tu
-// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tu(vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tu(vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tu(vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tu(vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -537,46 +487,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tu(vfloat64m8_t maskedoff, vuint32m4_t src
   return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tu
-// CHECK-RV64-SAME: (<vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tu(vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tu
-// CHECK-RV64-SAME: (<vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tu(vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tu
-// CHECK-RV64-SAME: (<vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tu(vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tu
-// CHECK-RV64-SAME: (<vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 [[VL]])
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tu(vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -897,56 +807,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff,
   return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tum
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tum
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1107,45 +967,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff,
   return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tum
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tum(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tum
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tum(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tum
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tum(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tum
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
-}
 
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
@@ -1467,56 +1288,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff
   return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tumu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_tumu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -1677,46 +1448,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff
   return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tumu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tumu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tumu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tumu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tumu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tumu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tumu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_vfwcvt_f_x_v_f16mf4_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x half> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2037,56 +1768,6 @@ vfloat32m8_t test_vfwcvt_f_xu_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff,
   return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
-//
-vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
-//
-vfloat32m1_t test_vfwcvt_f_f_v_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
-//
-vfloat32m4_t test_vfwcvt_f_f_v_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_mu
-// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
-//
-vfloat32m8_t test_vfwcvt_f_f_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i64> @test_vfwcvt_x_f_v_i64m1_mu
 // CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
@@ -2247,46 +1928,6 @@ vfloat64m8_t test_vfwcvt_f_xu_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff,
   return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
 }
 
-// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_mu
-// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
-//
-vfloat64m1_t test_vfwcvt_f_f_v_f64m1_mu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_mu
-// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
-//
-vfloat64m2_t test_vfwcvt_f_f_v_f64m2_mu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_mu
-// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
-//
-vfloat64m4_t test_vfwcvt_f_f_v_f64m4_mu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
-// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_mu
-// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
-// CHECK-RV64-NEXT:  entry:
-// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
-// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
-//
-vfloat64m8_t test_vfwcvt_f_f_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
-  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
-}
-
 // CHECK-RV64-LABEL: define dso_local <vscale x 1 x i32> @test_vfwcvt_x_f_v_i32mf2_rm_tu
 // CHECK-RV64-SAME: (<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
 // CHECK-RV64-NEXT:  entry:
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt_f_f.c
new file mode 100644
index 0000000000000..202998020b359
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vfwcvt_f_f.c
@@ -0,0 +1,368 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tu
+// CHECK-RV64-SAME: (<vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tu(vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tu
+// CHECK-RV64-SAME: (<vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tu(vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tu
+// CHECK-RV64-SAME: (<vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tu(vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tu
+// CHECK-RV64-SAME: (<vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tu(vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tu
+// CHECK-RV64-SAME: (<vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tu(vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tu
+// CHECK-RV64-SAME: (<vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tu(vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tu
+// CHECK-RV64-SAME: (<vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tu(vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tu
+// CHECK-RV64-SAME: (<vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tu(vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tu
+// CHECK-RV64-SAME: (<vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tu(vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tu(maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tum(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tum(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tum(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tum(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tum
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tum(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tum
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tum(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tum
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tum(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tum
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tum(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tum
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tum(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tum(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_tumu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_tumu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_tumu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_tumu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_tumu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_tumu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_tumu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_tumu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_tumu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_tumu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_tumu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_tumu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_tumu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_tumu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_tumu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwcvt_f_f_v_f32mf2_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x float> [[MASKEDOFF:%.*]], <vscale x 1 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1f16.i64(<vscale x 1 x float> [[MASKEDOFF]], <vscale x 1 x half> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwcvt_f_f_v_f32mf2_mu(vbool64_t mask, vfloat32mf2_t maskedoff, vfloat16mf4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwcvt_f_f_v_f32m1_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> [[MASKEDOFF:%.*]], <vscale x 2 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2f16.i64(<vscale x 2 x float> [[MASKEDOFF]], <vscale x 2 x half> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwcvt_f_f_v_f32m1_mu(vbool32_t mask, vfloat32m1_t maskedoff, vfloat16mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwcvt_f_f_v_f32m2_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[MASKEDOFF:%.*]], <vscale x 4 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4f16.i64(<vscale x 4 x float> [[MASKEDOFF]], <vscale x 4 x half> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwcvt_f_f_v_f32m2_mu(vbool16_t mask, vfloat32m2_t maskedoff, vfloat16m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwcvt_f_f_v_f32m4_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x float> [[MASKEDOFF:%.*]], <vscale x 8 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8f16.i64(<vscale x 8 x float> [[MASKEDOFF]], <vscale x 8 x half> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_f32m4_mu(vbool8_t mask, vfloat32m4_t maskedoff, vfloat16m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_f32m8_mu
+// CHECK-RV64-SAME: (<vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x float> [[MASKEDOFF:%.*]], <vscale x 16 x half> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16f16.i64(<vscale x 16 x float> [[MASKEDOFF]], <vscale x 16 x half> [[SRC]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_f32m8_mu(vbool4_t mask, vfloat32m8_t maskedoff, vfloat16m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_vfwcvt_f_f_v_f64m1_mu
+// CHECK-RV64-SAME: (<vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x double> [[MASKEDOFF:%.*]], <vscale x 1 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f64.nxv1f32.i64(<vscale x 1 x double> [[MASKEDOFF]], <vscale x 1 x float> [[SRC]], <vscale x 1 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
+//
+vfloat64m1_t test_vfwcvt_f_f_v_f64m1_mu(vbool64_t mask, vfloat64m1_t maskedoff, vfloat32mf2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_vfwcvt_f_f_v_f64m2_mu
+// CHECK-RV64-SAME: (<vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x double> [[MASKEDOFF:%.*]], <vscale x 2 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f64.nxv2f32.i64(<vscale x 2 x double> [[MASKEDOFF]], <vscale x 2 x float> [[SRC]], <vscale x 2 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+vfloat64m2_t test_vfwcvt_f_f_v_f64m2_mu(vbool32_t mask, vfloat64m2_t maskedoff, vfloat32m1_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_vfwcvt_f_f_v_f64m4_mu
+// CHECK-RV64-SAME: (<vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x double> [[MASKEDOFF:%.*]], <vscale x 4 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f64.nxv4f32.i64(<vscale x 4 x double> [[MASKEDOFF]], <vscale x 4 x float> [[SRC]], <vscale x 4 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+vfloat64m4_t test_vfwcvt_f_f_v_f64m4_mu(vbool16_t mask, vfloat64m4_t maskedoff, vfloat32m2_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_vfwcvt_f_f_v_f64m8_mu
+// CHECK-RV64-SAME: (<vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x double> [[MASKEDOFF:%.*]], <vscale x 8 x float> [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f64.nxv8f32.i64(<vscale x 8 x double> [[MASKEDOFF]], <vscale x 8 x float> [[SRC]], <vscale x 8 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+vfloat64m8_t test_vfwcvt_f_f_v_f64m8_mu(vbool8_t mask, vfloat64m8_t maskedoff, vfloat32m4_t src, size_t vl) {
+  return __riscv_vfwcvt_f_mu(mask, maskedoff, src, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16.c
index dce131aaa0c4a..f7e1fc5d81f2a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16ff.c
index f03c58045559c..bfdae9a1ee139 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vle16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei16.c
index 8878764bed26f..7128f29739b25 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei32.c
index 0f44dde7a98c1..5cb048af9ba31 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei64.c
index 5483359c4c7ab..23370beef8b9d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei8.c
index 7b2a921072cff..32577b6180acc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei16.c
index 89b461a92008b..0b8692166a07f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei32.c
index 9acbfb3f7c378..6c019b624b4a1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei64.c
index a79123ce54fa9..7409e48418cb2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei8.c
index 2036206790044..716170e24d1cb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei16.c
index cfa032ad414a6..d874457ab740e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei32.c
index 25dab4b8ba536..3ba48a2a9a515 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei64.c
index 6489a05647067..ce7cacaf04da5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei8.c
index eef6e63265cdc..0941b6ad3e483 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei16.c
index dc5e103cc6432..3103c1b0f1105 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei32.c
index 561b848af1ac3..4dc9e5f480407 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei64.c
index 0d3f335c105e7..fefb90b8a6e23 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei8.c
index 06740621dea9d..70f1a47e126eb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei16.c
index c70badf3dc4a5..6c11ca833d523 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei32.c
index bfa1894c61464..4e71e118744f2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei64.c
index 8dd6889f7b860..c68c15a638004 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei8.c
index 145504c5b552e..8681c32b5cdba 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei16.c
index bf282e258fdf3..0d59ddd619623 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei32.c
index 39a28ba87b669..f42ae9605928b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei64.c
index a8e79c4bddc03..e5f6b864be775 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei8.c
index 1dea35c86531e..74033769f7114 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei16.c
index 29eaa61876361..634ba33976bb4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei32.c
index 60ea5de100e37..2a5293be5a4f1 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei64.c
index b49ed58c2c1b7..576db6adb6f35 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei8.c
index bb21a6cc985a7..9a8d7ba7558c8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei16.c
index c4d6cecfecf21..65295514609d6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei32.c
index f8b6d7436d885..8a309703d681c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei64.c
index 0db4ec702b733..705d4485b70bc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei8.c
index 54b77a2d440b2..99e11bd39dd7b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vloxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlse16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlse16.c
index 137847218513d..c0c4d832e4d3f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlse16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlse16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16.c
index 14e7842cdb8ed..b0658ab02e201 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16ff.c
index d09b80f023f36..6820030490b85 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg2e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16.c
index 3dbe9e88a235a..9568d22bd0f4a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16ff.c
index 797c14005df40..1a3d86c8fdb1e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg3e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16.c
index e66fb24e49802..9a8f8b1866659 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16ff.c
index 4658867e45dea..9b93997474448 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg4e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16.c
index b17da4099bfb9..7ad2b43935c35 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16ff.c
index 7beda5dc44424..5627ce44fc172 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg5e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16.c
index 9cb5895682233..e5c3974d30868 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16ff.c
index 206f722803d0a..0296f8c56b700 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg6e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16.c
index b73d2fe7cb7ef..fed3395a20610 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16ff.c
index f2b87726155e1..6c1732847aa94 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg7e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16.c
index 719c552f2930b..28bbbd6ef052d 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16ff.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16ff.c
index b09649fd34f92..2b9b654b8eeee 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16ff.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlseg8e16ff.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg2e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg2e16.c
index cf893e304e22f..b9e926d485b41 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg2e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg2e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg3e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg3e16.c
index 3ae596218ea56..606aa3ed518b0 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg3e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg3e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg4e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg4e16.c
index 2b349817c775f..bcb9ccf1581b6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg4e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg4e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg5e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg5e16.c
index d88daa4fc8bb1..e61b33e009c0e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg5e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg5e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg6e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg6e16.c
index 03b46d080b957..0dfd496bb958b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg6e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg6e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg7e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg7e16.c
index f08f46f00886a..c69d05d7352e8 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg7e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg7e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg8e16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg8e16.c
index 23b304a3bd99e..1377dbbfaa862 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg8e16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vlsseg8e16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei16.c
index a09087aa217f4..67f7db5265b7c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei32.c
index 519f54568b8f2..2cd1ef7c7df4b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei64.c
index a154c734a07bf..55f84a640bb1f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei8.c
index e2d5273cc8449..2c6a65a4c2851 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei16.c
index 10ede6e64a89b..f5344338395e4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei32.c
index 22610e7d4eb73..693055395619a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei64.c
index d5f640c137b12..938f29e178dd3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei8.c
index 4d4e32a75263c..ed70a912dbfd9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg2ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei16.c
index 2c201fad04fe4..e32ec5a147880 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei32.c
index a0e7f502ed823..2e105d82ac759 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei64.c
index 8bdcd630405df..507ef6f77e98c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei8.c
index c1bb3e6f56ccc..18c5da3aaaf56 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg3ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei16.c
index 7ddf1cb8a8203..5d78f73c9f06f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei32.c
index a2fcbfcf19bb1..b57f1e8af1b12 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei64.c
index 45c7218fb0c59..7e6380bf00b56 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei8.c
index 6d63e63382aac..ad4e97c6df82f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg4ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei16.c
index b03e5799c9715..090b6bdb06460 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei32.c
index 55830fac00306..ab3a9c980ac4b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei64.c
index 02e73420d722e..d2794b85e3d7b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei8.c
index fe71382caf4ca..e9dff851dd888 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg5ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei16.c
index 4bbba7ae1687b..4f4e62dc7b5aa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei32.c
index 6ebb5c2435187..0d92c170b12a9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei64.c
index 1d3720f4f8393..8dfc0f32bfe92 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei8.c
index f5178d0d688dc..6ecb54ca08f7f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg6ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei16.c
index d202d6f68f56c..8d2b7151fbbf4 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei32.c
index 07344f4ca6d46..b097573990fd6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei64.c
index 2688563e0b3e7..fa673cd236ed2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei8.c
index f09032a190a76..68a005107de36 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg7ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei16.c
index c6d7982700fd8..bc4d9b7103045 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei32.c
index 801edf325f47c..12e29355a954b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei32.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei32.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei64.c
index cec356d8166d9..077383005ad76 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei64.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei64.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei8.c
index e58722f747160..ddf8824025e6c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei8.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vluxseg8ei8.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmerge.c
index d346a8da669e7..8149be4cb2e71 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmerge.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmerge.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmv.c
index 47b68303a15b2..ac95c77340202 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmv.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vmv.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c
deleted file mode 100644
index 100699ae19ef2..0000000000000
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c
+++ /dev/null
@@ -1,216 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: riscv-registered-target
-// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
-// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
-// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
-// RUN:   FileCheck --check-prefix=CHECK-ZVFHMIN %s
-
-#include <riscv_vector.h>
-
-// CHECK-ZVFHMIN-LABEL: @test_vfncvt_f_f_w_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64(<vscale x 4 x half> poison, <vscale x 4 x float> [[SRC:%.*]], i64 7, i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) {
-  return __riscv_vfncvt_f(src, vl);
-}
-
-
-// CHECK-ZVFHMIN-LABEL: @test_vfwcvt_f_f_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64(<vscale x 4 x float> poison, <vscale x 4 x half> [[SRC:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x float> [[TMP0]]
-//
-vfloat32m2_t test_vfwcvt_f_f_v_f16m1(vfloat16m1_t src, size_t vl) {
-  return __riscv_vfwcvt_f(src, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vle16_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vle.nxv4f16.i64(<vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vle16_v_f16m1(const _Float16 *base, size_t vl) {
-  return __riscv_vle16_v_f16m1(base, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vse16_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    call void @llvm.riscv.vse.nxv4f16.i64(<vscale x 4 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret void
-//
-void test_vse16_v_f16m1(_Float16 *base, vfloat16m1_t value, size_t vl) {
-  return __riscv_vse16_v_f16m1(base, value, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vlse16_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vlse.nxv4f16.i64(<vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[BSTRIDE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vlse16_v_f16m1(const _Float16 *base, ptrdiff_t bstride, size_t vl) {
-  return __riscv_vlse16_v_f16m1(base, bstride, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vsse16_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    call void @llvm.riscv.vsse.nxv4f16.i64(<vscale x 4 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i64 [[BSTRIDE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret void
-//
-void test_vsse16_v_f16m1(_Float16 *base, ptrdiff_t bstride, vfloat16m1_t value, size_t vl) {
-  return __riscv_vsse16_v_f16m1(base, bstride, value, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vluxei32_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32.i64(<vscale x 4 x half> poison, ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vluxei32_v_f16m1(const _Float16 *base, vuint32m2_t bindex, size_t vl) {
-  return __riscv_vluxei32_v_f16m1(base, bindex, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vsuxei32_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32.i64(<vscale x 4 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret void
-//
-void test_vsuxei32_v_f16m1(_Float16 *base, vuint32m2_t bindex, vfloat16m1_t value, size_t vl) {
-  return __riscv_vsuxei32_v_f16m1(base, bindex, value, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vloxei32_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32.i64(<vscale x 4 x half> poison, ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vloxei32_v_f16m1(const _Float16 *base, vuint32m2_t bindex, size_t vl) {
-  return __riscv_vloxei32_v_f16m1(base, bindex, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vsoxei32_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32.i64(<vscale x 4 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret void
-//
-void test_vsoxei32_v_f16m1(_Float16 *base, vuint32m2_t bindex, vfloat16m1_t value, size_t vl) {
-  return __riscv_vsoxei32_v_f16m1(base, bindex, value, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vle16ff_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, i64 } @llvm.riscv.vleff.nxv4f16.i64(<vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x half>, i64 } [[TMP0]], 0
-// CHECK-ZVFHMIN-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x half>, i64 } [[TMP0]], 1
-// CHECK-ZVFHMIN-NEXT:    store i64 [[TMP2]], ptr [[NEW_VL:%.*]], align 8
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP1]]
-//
-vfloat16m1_t test_vle16ff_v_f16m1(const _Float16 *base, size_t *new_vl, size_t vl) {
-  return __riscv_vle16ff_v_f16m1(base, new_vl, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vlseg2e16_v_f16m1x2(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.riscv.vlseg2.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP0]]
-//
-vfloat16m1x2_t test_vlseg2e16_v_f16m1x2(const _Float16 *base, size_t vl) {
-  return __riscv_vlseg2e16_v_f16m1x2(base, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vlseg2e16ff_v_f16m1x2(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, <vscale x 4 x half>, i64 } @llvm.riscv.vlseg2ff.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x half>, <vscale x 4 x half>, i64 } [[TMP0]], 0
-// CHECK-ZVFHMIN-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 4 x half>, <vscale x 4 x half> } poison, <vscale x 4 x half> [[TMP1]], 0
-// CHECK-ZVFHMIN-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x half>, <vscale x 4 x half>, i64 } [[TMP0]], 1
-// CHECK-ZVFHMIN-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP2]], <vscale x 4 x half> [[TMP3]], 1
-// CHECK-ZVFHMIN-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x half>, <vscale x 4 x half>, i64 } [[TMP0]], 2
-// CHECK-ZVFHMIN-NEXT:    store i64 [[TMP5]], ptr [[NEW_VL:%.*]], align 8
-// CHECK-ZVFHMIN-NEXT:    ret { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP4]]
-//
-vfloat16m1x2_t test_vlseg2e16ff_v_f16m1x2(const _Float16 *base, size_t *new_vl, size_t vl) {
-  return __riscv_vlseg2e16ff_v_f16m1x2(base, new_vl, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vlsseg2e16_v_f16m1x2(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.riscv.vlsseg2.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> poison, ptr [[BASE:%.*]], i64 [[BSTRIDE:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP0]]
-//
-vfloat16m1x2_t test_vlsseg2e16_v_f16m1x2(const _Float16 *base, ptrdiff_t bstride, size_t vl) {
-  return __riscv_vlsseg2e16_v_f16m1x2(base, bstride, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vluxseg2ei32_v_f16m1x2(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.riscv.vluxseg2.nxv4f16.nxv4i32.i64(<vscale x 4 x half> poison, <vscale x 4 x half> poison, ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP0]]
-//
-vfloat16m1x2_t test_vluxseg2ei32_v_f16m1x2(const _Float16 *base, vuint32m2_t bindex, size_t vl) {
-  return __riscv_vluxseg2ei32_v_f16m1x2(base, bindex, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vloxseg2ei32_v_f16m1x2(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.riscv.vloxseg2.nxv4f16.nxv4i32.i64(<vscale x 4 x half> poison, <vscale x 4 x half> poison, ptr [[BASE:%.*]], <vscale x 4 x i32> [[BINDEX:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret { <vscale x 4 x half>, <vscale x 4 x half> } [[TMP0]]
-//
-vfloat16m1x2_t test_vloxseg2ei32_v_f16m1x2(const _Float16 *base, vuint32m2_t bindex, size_t vl) {
-  return __riscv_vloxseg2ei32_v_f16m1x2(base, bindex, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vmerge_vvm_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vmerge.nxv4f16.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[OP1:%.*]], <vscale x 4 x half> [[OP2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vmerge_vvm_f16m1(vfloat16m1_t op1, vfloat16m1_t op2, vbool16_t mask, size_t vl) {
-  return __riscv_vmerge(op1, op2, mask, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vmv_v_v_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[SRC:%.*]], i64 [[VL:%.*]])
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vmv_v_v_f16m1(vfloat16m1_t src, size_t vl) {
-  return __riscv_vmv_v(src, vl);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vreinterpret_v_f16m1_i16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x half> [[SRC:%.*]] to <vscale x 4 x i16>
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x i16> [[TMP0]]
-//
-vint16m1_t test_vreinterpret_v_f16m1_i16m1(vfloat16m1_t src) {
-  return __riscv_vreinterpret_v_f16m1_i16m1(src);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vreinterpret_v_f16m1_u16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x half> [[SRC:%.*]] to <vscale x 4 x i16>
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x i16> [[TMP0]]
-//
-vuint16m1_t test_vreinterpret_v_f16m1_u16m1(vfloat16m1_t src) {
-  return __riscv_vreinterpret_v_f16m1_u16m1(src);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vreinterpret_v_i16m1_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i16> [[SRC:%.*]] to <vscale x 4 x half>
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vreinterpret_v_i16m1_f16m1(vint16m1_t src) {
-  return __riscv_vreinterpret_v_i16m1_f16m1(src);
-}
-
-// CHECK-ZVFHMIN-LABEL: @test_vreinterpret_v_u16m1_f16m1(
-// CHECK-ZVFHMIN-NEXT:  entry:
-// CHECK-ZVFHMIN-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i16> [[SRC:%.*]] to <vscale x 4 x half>
-// CHECK-ZVFHMIN-NEXT:    ret <vscale x 4 x half> [[TMP0]]
-//
-vfloat16m1_t test_vreinterpret_v_u16m1_f16m1(vuint16m1_t src) {
-  return __riscv_vreinterpret_v_u16m1_f16m1(src);
-}
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index 65a2bc9bbb680..3e39425e57f17 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -173,6 +173,29 @@ union union_double pass_union_double(union union_double arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_union_double(ptr dead_on_unwind noalias writable sret(%union.union_double) align 8 %{{.*}}, i64 %{{.*}})
 
 
+// Verify that transparent unions are passed like their first member (but returned like a union)
+
+union tu_char { char a; } __attribute__((transparent_union));
+union tu_char pass_tu_char(union tu_char arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_tu_char(ptr dead_on_unwind noalias writable sret(%union.tu_char) align 1 %{{.*}}, i8 signext %{{.*}})
+
+union tu_short { short a; } __attribute__((transparent_union));
+union tu_short pass_tu_short(union tu_short arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_tu_short(ptr dead_on_unwind noalias writable sret(%union.tu_short) align 2 %{{.*}}, i16 signext %{{.*}})
+
+union tu_int { int a; } __attribute__((transparent_union));
+union tu_int pass_tu_int(union tu_int arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_tu_int(ptr dead_on_unwind noalias writable sret(%union.tu_int) align 4 %{{.*}}, i32 signext %{{.*}})
+
+union tu_long { long a; } __attribute__((transparent_union));
+union tu_long pass_tu_long(union tu_long arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_tu_long(ptr dead_on_unwind noalias writable sret(%union.tu_long) align 8 %{{.*}}, i64 %{{.*}})
+
+union tu_ptr { void *a; } __attribute__((transparent_union));
+union tu_ptr pass_tu_ptr(union tu_ptr arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_tu_ptr(ptr dead_on_unwind noalias writable sret(%union.tu_ptr) align 8 %{{.*}}, ptr %{{.*}})
+
+
 // Accessing variable argument lists
 
 int va_int(__builtin_va_list l) { return __builtin_va_arg(l, int); }
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index d26db19574051..48465df21cca1 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -168,10 +168,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
   /* math */
   __builtin_acos(f);       __builtin_acosf(f);      __builtin_acosl(f); __builtin_acosf128(f);
 
-// NO__ERRNO: declare double @acos(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @acosf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @acosl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @acosf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.acos.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.acos.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.acos.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.acos.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @acos(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @acosf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @acosl(x86_fp80 noundef) [[NOT_READNONE]]
@@ -190,10 +190,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_asin(f);       __builtin_asinf(f);      __builtin_asinl(f); __builtin_asinf128(f);
 
-// NO__ERRNO: declare double @asin(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @asinf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @asinl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @asinf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.asin.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.asin.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.asin.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.asin.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @asin(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @asinf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @asinl(x86_fp80 noundef) [[NOT_READNONE]]
@@ -212,10 +212,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_atan(f);       __builtin_atanf(f);      __builtin_atanl(f); __builtin_atanf128(f);
 
-// NO__ERRNO: declare double @atan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @atanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @atanl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @atanf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.atan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.atan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.atan.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @atan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @atanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @atanl(x86_fp80 noundef) [[NOT_READNONE]]
@@ -267,10 +267,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_cosh(f);       __builtin_coshf(f);      __builtin_coshl(f); __builtin_coshf128(f);
 
-// NO__ERRNO: declare double @cosh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @coshf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @coshl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @coshf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.cosh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.cosh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.cosh.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.cosh.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @cosh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @coshf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @coshl(x86_fp80 noundef) [[NOT_READNONE]]
@@ -656,10 +656,10 @@ __builtin_sin(f);        __builtin_sinf(f);       __builtin_sinl(f); __builtin_s
 
 __builtin_sinh(f);       __builtin_sinhf(f);      __builtin_sinhl(f); __builtin_sinhf128(f);
 
-// NO__ERRNO: declare double @sinh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @sinhf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @sinhl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @sinhf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.sinh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.sinh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.sinh.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.sinh.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @sinh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @sinhf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @sinhl(x86_fp80 noundef) [[NOT_READNONE]]
@@ -689,10 +689,10 @@ __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_t
 
 __builtin_tanh(f);       __builtin_tanhf(f);      __builtin_tanhl(f); __builtin_tanhf128(f);
 
-// NO__ERRNO: declare double @tanh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanhf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanhl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @tanhf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tanh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tanh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tanh.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.tanh.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tanh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanhf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanhl(x86_fp80 noundef) [[NOT_READNONE]]
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 5b5bc301bddc0..495ae7e181159 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -1,193 +1,200 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
 
 
 #include <immintrin.h>
 
 __m64 test_mm_abs_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b
+  // CHECK: call <8 x i8> @llvm.abs.v8i8(
   return _mm_abs_pi8(a);
 }
 
 __m64 test_mm_abs_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w
+  // CHECK: call <4 x i16> @llvm.abs.v4i16(
   return _mm_abs_pi16(a);
 }
 
 __m64 test_mm_abs_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d
+  // CHECK: call <2 x i32> @llvm.abs.v2i32(
   return _mm_abs_pi32(a);
 }
 
 __m64 test_mm_add_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.b
+  // CHECK: add <8 x i8> {{%.*}}, {{%.*}}
   return _mm_add_pi8(a, b);
 }
 
 __m64 test_mm_add_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.w
+  // CHECK: add <4 x i16> {{%.*}}, {{%.*}}
   return _mm_add_pi16(a, b);
 }
 
 __m64 test_mm_add_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.d
+  // CHECK: add <2 x i32> {{%.*}}, {{%.*}}
   return _mm_add_pi32(a, b);
 }
 
 __m64 test_mm_add_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: add i64 {{%.*}}, {{%.*}}
   return _mm_add_si64(a, b);
 }
 
 __m64 test_mm_adds_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.b
+  // CHECK: call <8 x i8> @llvm.sadd.sat.v8i8(
   return _mm_adds_pi8(a, b);
 }
 
 __m64 test_mm_adds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.w
+  // CHECK: call <4 x i16> @llvm.sadd.sat.v4i16(
   return _mm_adds_pi16(a, b);
 }
 
 __m64 test_mm_adds_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b
+  // CHECK: call <8 x i8> @llvm.uadd.sat.v8i8(
   return _mm_adds_pu8(a, b);
 }
 
 __m64 test_mm_adds_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w
+  // CHECK: call <4 x i16> @llvm.uadd.sat.v4i16(
   return _mm_adds_pu16(a, b);
 }
 
 __m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_alignr_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b
+  // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   return _mm_alignr_pi8(a, b, 2);
 }
 
 __m64 test_mm_and_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_and_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pand
+  // CHECK: and <1 x i64> {{%.*}}, {{%.*}}
   return _mm_and_si64(a, b);
 }
 
 __m64 test_mm_andnot_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_andnot_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pandn
+  // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, <i64 -1>
+  // CHECK: and <1 x i64> [[TMP]], {{%.*}}
   return _mm_andnot_si64(a, b);
 }
 
 __m64 test_mm_avg_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(
   return _mm_avg_pu8(a, b);
 }
 
 __m64 test_mm_avg_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(
   return _mm_avg_pu16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b
+  // CHECK:      [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpeq_pi8(a, b);
 }
 
 __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w
+  // CHECK:      [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpeq_pi16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d
+  // CHECK:      [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpeq_pi32(a, b);
 }
 
 __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b
+  // CHECK:      [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpgt_pi8(a, b);
 }
 
 __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w
+  // CHECK:      [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpgt_pi16(a, b);
 }
 
 __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d
+  // CHECK:      [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpgt_pi32(a, b);
 }
 
 __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvt_pi2ps
-  // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvt_pi2ps(a, b);
 }
 
 __m64 test_mm_cvt_ps2pi(__m128 a) {
   // CHECK-LABEL: test_mm_cvt_ps2pi
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvt_ps2pi(a);
 }
 
 __m64 test_mm_cvtpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvtpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(
   return _mm_cvtpd_pi32(a);
 }
 
 __m128 test_mm_cvtpi16_ps(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi16_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float>
   return _mm_cvtpi16_ps(a);
 }
 
 __m128d test_mm_cvtpi32_pd(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi32_pd
-  // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd
+  // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double>
   return _mm_cvtpi32_pd(a);
 }
 
 __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32_ps(a, b);
 }
 
 __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32x2_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32x2_ps(a, b);
 }
 
 __m64 test_mm_cvtps_pi16(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}})
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]],
   return _mm_cvtps_pi16(a);
 }
 
 __m64 test_mm_cvtps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvtps_pi32(a);
 }
 
@@ -205,19 +212,19 @@ int test_mm_cvtsi64_si32(__m64 a) {
 
 __m64 test_mm_cvttpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvttpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(
   return _mm_cvttpd_pi32(a);
 }
 
 __m64 test_mm_cvttps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvttps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(
   return _mm_cvttps_pi32(a);
 }
 
 int test_mm_extract_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_extract_pi16
-  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  // CHECK: extractelement <4 x i16> {{%.*}}, i64 2
   return _mm_extract_pi16(a, 2);
 }
 
@@ -234,151 +241,153 @@ __m64 test_m_from_int64(long long a) {
 
 __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  // CHECK: insertelement <4 x i16>
   return _mm_insert_pi16(a, d, 2);
 }
 
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
   return _mm_madd_pi16(a, b);
 }
 
 __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_maddubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
   return _mm_maddubs_pi16(a, b);
 }
 
 void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
   // CHECK-LABEL: test_mm_maskmove_si64
-  // CHECK: call void @llvm.x86.mmx.maskmovq
+  // CHECK: call void @llvm.x86.sse2.maskmov.dqu(
   _mm_maskmove_si64(d, n, p);
 }
 
 __m64 test_mm_max_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w
+  // CHECK: call <4 x i16> @llvm.smax.v4i16(
   return _mm_max_pi16(a, b);
 }
 
 __m64 test_mm_max_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b
+  // CHECK: call <8 x i8> @llvm.umax.v8i8(
   return _mm_max_pu8(a, b);
 }
 
 __m64 test_mm_min_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w
+  // CHECK: call <4 x i16> @llvm.smin.v4i16(
   return _mm_min_pi16(a, b);
 }
 
 __m64 test_mm_min_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b
+  // CHECK: call <8 x i8> @llvm.umin.v8i8(
   return _mm_min_pu8(a, b);
 }
 
 int test_mm_movemask_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_movemask_pi8
-  // CHECK: call i32 @llvm.x86.mmx.pmovmskb
+  // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(
   return _mm_movemask_pi8(a);
 }
 
 __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mul_su32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_su32(a, b);
 }
 
 __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(
   return _mm_mulhi_pi16(a, b);
 }
 
 __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(
   return _mm_mulhi_pu16(a, b);
 }
 
 __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhrs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(
   return _mm_mulhrs_pi16(a, b);
 }
 
 __m64 test_mm_mullo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mullo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w
+  // CHECK: mul <4 x i16> {{%.*}}, {{%.*}}
   return _mm_mullo_pi16(a, b);
 }
 
 __m64 test_mm_or_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_or_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.por
+  // CHECK: or <1 x i64> {{%.*}}, {{%.*}}
   return _mm_or_si64(a, b);
 }
 
 __m64 test_mm_packs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packsswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
   return _mm_packs_pi16(a, b);
 }
 
 __m64 test_mm_packs_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
   return _mm_packs_pi32(a, b);
 }
 
 __m64 test_mm_packs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packuswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
   return _mm_packs_pu16(a, b);
 }
 
 __m64 test_mm_sad_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sad_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>
   return _mm_sad_pu8(a, b);
 }
 
@@ -471,187 +480,187 @@ __m64 test_mm_set1_pi32(int a) {
 
 __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_shuffle_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(
   return _mm_shuffle_pi8(a, b);
 }
 
 __m64 test_mm_shuffle_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_shuffle_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   return _mm_shuffle_pi16(a, 3);
 }
 
 __m64 test_mm_sign_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(
   return _mm_sign_pi8(a, b);
 }
 
 __m64 test_mm_sign_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(
   return _mm_sign_pi16(a, b);
 }
 
 __m64 test_mm_sign_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(
   return _mm_sign_pi32(a, b);
 }
 
 __m64 test_mm_sll_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(
   return _mm_sll_pi16(a, b);
 }
 
 __m64 test_mm_sll_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(
   return _mm_sll_pi32(a, b);
 }
 
 __m64 test_mm_sll_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(
   return _mm_sll_si64(a, b);
 }
 
 __m64 test_mm_slli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(
   return _mm_slli_pi16(a, 3);
 }
 
 __m64 test_mm_slli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(
   return _mm_slli_pi32(a, 3);
 }
 
 __m64 test_mm_slli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_slli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(
   return _mm_slli_si64(a, 3);
 }
 
 __m64 test_mm_sra_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(
   return _mm_sra_pi16(a, b);
 }
 
 __m64 test_mm_sra_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(
   return _mm_sra_pi32(a, b);
 }
 
 __m64 test_mm_srai_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(
   return _mm_srai_pi16(a, 3);
 }
 
 __m64 test_mm_srai_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(
   return _mm_srai_pi32(a, 3);
 }
 
 __m64 test_mm_srl_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(
   return _mm_srl_pi16(a, b);
 }
 
 __m64 test_mm_srl_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(
   return _mm_srl_pi32(a, b);
 }
 
 __m64 test_mm_srl_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(
   return _mm_srl_si64(a, b);
 }
 
 __m64 test_mm_srli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(
   return _mm_srli_pi16(a, 3);
 }
 
 __m64 test_mm_srli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(
   return _mm_srli_pi32(a, 3);
 }
 
 __m64 test_mm_srli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_srli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(
   return _mm_srli_si64(a, 3);
 }
 
 void test_mm_stream_pi(__m64 *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 void test_mm_stream_pi_void(void *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi_void
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 __m64 test_mm_sub_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.b
+  // CHECK: sub <8 x i8> {{%.*}}, {{%.*}}
   return _mm_sub_pi8(a, b);
 }
 
 __m64 test_mm_sub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.w
+  // CHECK: sub <4 x i16> {{%.*}}, {{%.*}}
   return _mm_sub_pi16(a, b);
 }
 
 __m64 test_mm_sub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.d
+  // CHECK: sub <2 x i32> {{%.*}}, {{%.*}}
   return _mm_sub_pi32(a, b);
 }
 
 __m64 test_mm_sub_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: sub i64 {{%.*}}, {{%.*}}
   return _mm_sub_si64(a, b);
 }
 
 __m64 test_mm_subs_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b
+  // CHECK: call <8 x i8> @llvm.ssub.sat.v8i8(
   return _mm_subs_pi8(a, b);
 }
 
 __m64 test_mm_subs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w
+  // CHECK: call <4 x i16> @llvm.ssub.sat.v4i16(
   return _mm_subs_pi16(a, b);
 }
 
 __m64 test_mm_subs_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b
+  // CHECK: call <8 x i8> @llvm.usub.sat.v8i8(
   return _mm_subs_pu8(a, b);
 }
 
 __m64 test_mm_subs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w
+  // CHECK: call <4 x i16> @llvm.usub.sat.v4i16(
   return _mm_subs_pu16(a, b);
 }
 
@@ -668,42 +677,42 @@ long long test_m_to_int64(__m64 a) {
 
 __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   return _mm_unpackhi_pi8(a, b);
 }
 
 __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_pi16(a, b);
 }
 
 __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_pi32(a, b);
 }
 
 __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   return _mm_unpacklo_pi8(a, b);
 }
 
 __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_pi16(a, b);
 }
 
 __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_pi32(a, b);
 }
 
 __m64 test_mm_xor_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_xor_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pxor
+  // CHECK: xor <1 x i64> {{%.*}}, {{%.*}}
   return _mm_xor_si64(a, b);
 }
diff --git a/clang/test/CodeGen/X86/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c
index 19c24a3a91e14..17fce1a48755e 100644
--- a/clang/test/CodeGen/X86/mmx-inline-asm.c
+++ b/clang/test/CodeGen/X86/mmx-inline-asm.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
-// CHECK: { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+// CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
 
 void foo(long long fill) {
   __m64 vfill = _mm_cvtsi64_m64(fill);
diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
index 83be6b5517c01..741cb9c9c5ecf 100644
--- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
+++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
 void shift(__m64 a, __m64 b, int c) {
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_slli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_slli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_slli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_srli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srai_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srai_pi32(a, c);
 }
diff --git a/clang/test/CodeGen/aarch64-fmv-resolver-emission.c b/clang/test/CodeGen/aarch64-fmv-resolver-emission.c
new file mode 100644
index 0000000000000..eeafb3d41860d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-resolver-emission.c
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: @used_before_default_def = weak_odr ifunc void (), ptr @used_before_default_def.resolver
+// CHECK: @used_after_default_def = weak_odr ifunc void (), ptr @used_after_default_def.resolver
+// CHECK-NOT: @used_before_default_decl = weak_odr ifunc void (), ptr @used_before_default_decl.resolver
+// CHECK-NOT: @used_after_default_decl = weak_odr ifunc void (), ptr @used_after_default_decl.resolver
+// CHECK-NOT: @used_no_default = weak_odr ifunc void (), ptr @used_no_default.resolver
+// CHECK-NOT: @not_used_no_default = weak_odr ifunc void (), ptr @not_used_no_default.resolver
+// CHECK: @not_used_with_default = weak_odr ifunc void (), ptr @not_used_with_default.resolver
+
+
+// Test that an ifunc is generated and used when the default
+// version is defined after the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_before_default_def(void) {}
+// CHECK-LABEL: define dso_local void @used_before_default_def._Maes(
+//
+void call_before_def(void) { used_before_default_def(); }
+// CHECK-LABEL: define dso_local void @call_before_def(
+// CHECK: call void @used_before_default_def()
+//
+__attribute__((target_version("default"))) void used_before_default_def(void) {}
+// CHECK-LABEL: define dso_local void @used_before_default_def.default(
+//
+// CHECK-NOT: declare void @used_before_default_def(
+
+
+// Test that an ifunc is generated and used when the default
+// version is defined before the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_after_default_def(void) {}
+// CHECK-LABEL: define dso_local void @used_after_default_def._Maes(
+//
+__attribute__((target_version("default"))) void used_after_default_def(void) {}
+// CHECK-LABEL: define dso_local void @used_after_default_def.default(
+//
+void call_after_def(void) { used_after_default_def(); }
+// CHECK-LABEL: define dso_local void @call_after_def(
+// CHECK: call void @used_after_default_def()
+//
+// CHECK-NOT: declare void @used_after_default_def(
+
+
+// Test that an unmagled declaration is generated and used when the
+// default version is declared after the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_before_default_decl(void) {}
+// CHECK-LABEL: define dso_local void @used_before_default_decl._Maes(
+//
+void call_before_decl(void) { used_before_default_decl(); }
+// CHECK-LABEL: define dso_local void @call_before_decl(
+// CHECK: call void @used_before_default_decl()
+//
+__attribute__((target_version("default"))) void used_before_default_decl(void);
+// CHECK: declare void @used_before_default_decl()
+
+
+// Test that an unmagled declaration is generated and used when the
+// default version is declared before the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_after_default_decl(void) {}
+// CHECK-LABEL: define dso_local void @used_after_default_decl._Maes(
+//
+__attribute__((target_version("default"))) void used_after_default_decl(void);
+// CHECK: declare void @used_after_default_decl()
+//
+void call_after_decl(void) { used_after_default_decl(); }
+// CHECK-LABEL: define dso_local void @call_after_decl(
+// CHECK: call void @used_after_default_decl()
+
+
+// Test that an unmagled declaration is generated and used when
+// the default version is not present.
+//
+__attribute__((target_version("aes"))) void used_no_default(void) {}
+// CHECK-LABEL: define dso_local void @used_no_default._Maes(
+//
+void call_no_default(void) { used_no_default(); }
+// CHECK-LABEL: define dso_local void @call_no_default(
+// CHECK: call void @used_no_default()
+//
+// CHECK: declare void @used_no_default()
+
+
+// Test that neither an ifunc nor a declaration is generated if the default
+// definition is missing since the versioned function is not used.
+//
+__attribute__((target_version("aes"))) void not_used_no_default(void) {}
+// CHECK-LABEL: define dso_local void @not_used_no_default._Maes(
+//
+// CHECK-NOT: declare void @not_used_no_default(
+
+
+// Test that an ifunc is generated if the default version is defined but not used.
+//
+__attribute__((target_version("aes"))) void not_used_with_default(void) {}
+// CHECK-LABEL: define dso_local void @not_used_with_default._Maes(
+//
+__attribute__((target_version("default"))) void not_used_with_default(void) {}
+// CHECK-LABEL: define dso_local void @not_used_with_default.default(
+//
+// CHECK-NOT: declare void @not_used_with_default(
+
+
+// CHECK: define weak_odr ptr @used_before_default_def.resolver()
+// CHECK: define weak_odr ptr @used_after_default_def.resolver()
+// CHECK-NOT: define weak_odr ptr @used_before_default_decl.resolver(
+// CHECK-NOT: define weak_odr ptr @used_after_default_decl.resolver(
+// CHECK-NOT: define weak_odr ptr @used_no_default.resolver(
+// CHECK-NOT: define weak_odr ptr @not_used_no_default.resolver(
+// CHECK: define weak_odr ptr @not_used_with_default.resolver()
diff --git a/clang/test/CodeGen/aarch64-fmv-streaming.c b/clang/test/CodeGen/aarch64-fmv-streaming.c
new file mode 100644
index 0000000000000..e549ccda59ad8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-streaming.c
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -emit-llvm -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msve
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msimd
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msme2
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+//
+__attribute__((target_version("sme2"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee.default
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void n_callee(void) {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msve
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msimd
+// CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msme2
+// CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee.default
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void s_callee(void) __arm_streaming {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msve
+// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msimd
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msme2
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee.default
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_compatible {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_caller
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12:[0-9]+]]
+// CHECK:    call void @sc_callee() #[[ATTR13:[0-9]+]]
+//
+void n_caller(void) {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_caller
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12]]
+// CHECK:    call void @sc_callee() #[[ATTR13]]
+//
+void s_caller(void) __arm_streaming {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_caller
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12]]
+// CHECK:    call void @sc_callee() #[[ATTR13]]
+//
+void sc_caller(void) __arm_streaming_compatible {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK: attributes #[[ATTR0:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR1:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR2:[0-9]+]] = {{.*}}
+// CHECK: attributes #[[ATTR3]] = {{.*}}
+// CHECK: attributes #[[ATTR4:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR5:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR6:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR7]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR8:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR9:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR10]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR11]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR12]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR13]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c
index 3c047fec6ceed..d779abd395b5f 100644
--- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c
+++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c
@@ -261,9 +261,9 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
 // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" }
 // CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
 // CHECK: attributes #[[ATTR5:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR6:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR7:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.5a" }
-// CHECK: attributes #[[ATTR8:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" }
+// CHECK: attributes #[[ATTR6:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.5a" }
+// CHECK: attributes #[[ATTR7:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" }
+// CHECK: attributes #[[ATTR8:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
 //.
 // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
 //.
diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
index 25aebeced9379..9c3d08a25945a 100644
--- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
+++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s
 
+// REQUIRES: aarch64-registered-target
+
 #define __ai __attribute__((always_inline))
 __ai void inlined_fn(void) {}
 __ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {}
@@ -20,7 +22,7 @@ void caller(void) {
 
 #ifdef TEST_COMPATIBLE
 void caller_compatible(void) __arm_streaming_compatible {
-    inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}}
+    inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes, inlining may change runtime behaviour}}
     inlined_fn_streaming_compatible();
     inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}}
     inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller_compatible' have mismatching streaming attributes}}
@@ -29,7 +31,7 @@ void caller_compatible(void) __arm_streaming_compatible {
 
 #ifdef TEST_STREAMING
 void caller_streaming(void) __arm_streaming {
-    inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}}
+    inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes, inlining may change runtime behaviour}}
     inlined_fn_streaming_compatible();
     inlined_fn_streaming();
     inlined_fn_local();
@@ -39,7 +41,7 @@ void caller_streaming(void) __arm_streaming {
 #ifdef TEST_LOCALLY
 __arm_locally_streaming
 void caller_local(void) {
-    inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}}
+    inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes, inlining may change runtime behaviour}}
     inlined_fn_streaming_compatible();
     inlined_fn_streaming();
     inlined_fn_local();
diff --git a/clang/test/CodeGen/aix-builtin-cpu-is.c b/clang/test/CodeGen/aix-builtin-cpu-is.c
index e17cf7353511a..83e8c99e0a78d 100644
--- a/clang/test/CodeGen/aix-builtin-cpu-is.c
+++ b/clang/test/CodeGen/aix-builtin-cpu-is.c
@@ -1,55 +1,67 @@
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"pwr7\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \
+// RUN:   --check-prefix=CHECKOP
+
+// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=65536 \
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=131072\
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \
+// RUN:   --check-prefix=CHECKOP
+
+// RUN: echo "int main() { return __builtin_cpu_is(\"pwr10\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \
 // RUN:   --check-prefix=CHECKOP
 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power11\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=524288 \
+// RUN:   --check-prefix=CHECKOP
+
 // CHECK:     define i32 @main() #0 {
 // CHECK-NEXT: entry:
 // CHECK-NEXT:   %retval = alloca i32, align 4
@@ -63,7 +75,7 @@
 // CHECKOP-NEXT:   %retval = alloca i32, align 4
 // CHECKOP-NEXT:   store i32 0, ptr %retval, align 4
 // CHECKOP-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }, ptr @_system_configuration, i32 0, i32 1), align 4
-// CHECKOP-NEXT:   %1 = icmp eq i32 %0, [[VALUE]] 
+// CHECKOP-NEXT:   %1 = icmp eq i32 %0, [[VALUE]]
 // CHECKOP-NEXT:  %conv = zext i1 %1 to i32
 // CHECKOP-NEXT:   ret i32 %conv
 // CHECKOP-NEXT: }
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
index 03c870e281549..15619bef5373d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
@@ -48,10 +48,13 @@ uint8x16x4_t test_vld4q_u8(const uint8_t *addr)
 
 // CHECK-LABEL: @test_vst2q_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0)
-// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 0
+// CHECK-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 1
+// CHECK-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 0)
+// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 1)
 // CHECK-NEXT:    ret void
 //
 void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value)
@@ -65,14 +68,19 @@ void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value)
 
 // CHECK-LABEL: @test_vst4q_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 2
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 3
-// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 0)
-// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 1)
-// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 2)
-// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 3)
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT:    [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 2
+// CHECK-NEXT:    [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 3
+// CHECK-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[DOTFCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[DOTFCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 0)
+// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 1)
+// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 2)
+// CHECK-NEXT:    call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 3)
 // CHECK-NEXT:    ret void
 //
 void test_vst4q_s8(int8_t *addr, int8x16x4_t value)
@@ -86,10 +94,13 @@ void test_vst4q_s8(int8_t *addr, int8x16x4_t value)
 
 // CHECK-LABEL: @test_vst2q_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0)
-// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 0
+// CHECK-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 1
+// CHECK-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 0)
+// CHECK-NEXT:    call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 1)
 // CHECK-NEXT:    ret void
 //
 void test_vst2q_f16(float16_t *addr, float16x8x2_t value)
diff --git a/clang/test/CodeGen/arm-vfp16-arguments2.cpp b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
index 6221e85e856b4..b7c6852c47b7f 100644
--- a/clang/test/CodeGen/arm-vfp16-arguments2.cpp
+++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
@@ -44,20 +44,20 @@ struct S1 f1(struct S1 s1) { return s1; }
 
 // CHECK-SOFT: define{{.*}} void @_Z2f22S2(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S2) align 8 %agg.result, [4 x i32] %s2.coerce)
 // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f22S2([2 x <2 x i32>] returned %s2.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 returned %s2.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 %s2.coerce)
 struct S2 f2(struct S2 s2) { return s2; }
 
 // CHECK-SOFT: define{{.*}} void @_Z2f32S3(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S3) align 8 %agg.result, [2 x i64] %s3.coerce)
 // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f32S3([2 x <2 x i32>] returned %s3.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 returned %s3.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 %s3.coerce)
 struct S3 f3(struct S3 s3) { return s3; }
 
 // CHECK-SOFT: define{{.*}} void @_Z2f42S4(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S4) align 8 %agg.result, [2 x i64] %s4.coerce)
 // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f42S4([2 x <2 x i32>] returned %s4.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 returned %s4.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 %s4.coerce)
 struct S4 f4(struct S4 s4) { return s4; }
 
 // CHECK-SOFT: define{{.*}} void @_Z2f52S5(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S5) align 8 %agg.result, [2 x i64] %s5.coerce)
-// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce)
+// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce)
 struct S5 f5(struct S5 s5) { return s5; }
diff --git a/clang/test/CodeGen/arm64ec-hybrid-patchable.c b/clang/test/CodeGen/arm64ec-hybrid-patchable.c
new file mode 100644
index 0000000000000..4d1fa12afd2aa
--- /dev/null
+++ b/clang/test/CodeGen/arm64ec-hybrid-patchable.c
@@ -0,0 +1,34 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple arm64ec-pc-windows -fms-extensions -emit-llvm -o - %s -verify | FileCheck %s
+
+// CHECK: ;    Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func() #0 {
+int __attribute__((hybrid_patchable)) func(void) {  return 1; }
+
+// CHECK: ;    Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func2() #0 {
+int __declspec(hybrid_patchable) func2(void) {  return 2; }
+
+// CHECK: ;    Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func3() #0 {
+int __declspec(hybrid_patchable) func3(void);
+int func3(void) {  return 3; }
+
+// CHECK: ;    Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func4() #0 {
+[[clang::hybrid_patchable]] int func4(void);
+int func4(void) {  return 3; }
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define internal void @static_func() #0 {
+// expected-warning@+1 {{'hybrid_patchable' is ignored on functions without external linkage}}
+static void __declspec(hybrid_patchable) static_func(void) {}
+
+// CHECK: ;    Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define linkonce_odr dso_local i32 @func5() #0 comdat {
+int inline __declspec(hybrid_patchable) func5(void) {  return 4; }
+
+void caller(void) {
+  static_func();
+  func5();
+}
diff --git a/clang/test/CodeGen/asm-inout.c b/clang/test/CodeGen/asm-inout.c
index 1383a421efbc2..6d40451b778d9 100644
--- a/clang/test/CodeGen/asm-inout.c
+++ b/clang/test/CodeGen/asm-inout.c
@@ -38,11 +38,11 @@ int test4(volatile int *addr) {
   return (int)oldval;
 }
 
-// This should have both inputs be of type x86_mmx.
+// This should have both inputs be of type <1 x i64>.
 // CHECK: @test5
 typedef long long __m64 __attribute__((__vector_size__(8)));
 __m64 test5(__m64 __A, __m64 __B) {
-  // CHECK: call x86_mmx asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: call <1 x i64> asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %{{.*}}, <1 x i64> %{{.*}})
   asm ("pmulhuw %1, %0\n\t" : "+y" (__A) : "y" (__B));
   return __A;
 }
@@ -51,7 +51,7 @@ __m64 test5(__m64 __A, __m64 __B) {
 int test6(void) {
   typedef unsigned char __attribute__((vector_size(8))) _m64u8;
   _m64u8 __attribute__((aligned(16))) Mu8_0, __attribute__((aligned(16))) Mu8_1;
-  // CHECK: call x86_mmx asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %1)
+  // CHECK: call <8 x i8> asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(<8 x i8> %0)
   asm ("nop" : "=y"(Mu8_1 ) : "0"(Mu8_0 ));
   return 0;
 }
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index 79922eb4159f1..f5032ded971a8 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -754,11 +754,11 @@ size_t test7_bdos(struct union_of_fams *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR:       cont9:
+// SANITIZE-WITH-ATTR:       cont7:
 // SANITIZE-WITH-ATTR-NEXT:    [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]]
 // SANITIZE-WITH-ATTR-NEXT:    store i8 [[DOT_COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]]
@@ -908,11 +908,11 @@ size_t test9_bdos(struct union_of_fams *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR:       cont9:
+// SANITIZE-WITH-ATTR:       cont7:
 // SANITIZE-WITH-ATTR-NEXT:    [[BYTES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[BYTES]], i64 0, i64 [[IDXPROM]]
 // SANITIZE-WITH-ATTR-NEXT:    [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
@@ -1852,3 +1852,105 @@ struct annotated_struct_array {
 void test29(struct annotated_struct_array *ann, int idx1, int idx2) {
   ann->ann_array[idx1]->array[idx2] = __builtin_dynamic_object_size(ann->ann_array[idx1]->array, 1);
 }
+
+typedef struct {
+  char __padding[0];
+} test30_spinlock_t;
+
+struct test30_struct {
+  struct test30_decl *name_node;
+  int priv_len;
+  test30_spinlock_t pcpu_refcnt;
+  char priv[] __counted_by(priv_len);
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR5]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 4)
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], 12
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = select i1 [[DOTINV]], i8 0, i8 [[TMP2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[PCPU_REFCNT:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 12
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[PCPU_REFCNT]], i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    unreachable, !nosanitize [[META9]]
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[PCPU_REFCNT:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 12
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[PCPU_REFCNT]], i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+void test30(struct test30_struct *ptr, int idx) {
+  ptr->pcpu_refcnt.__padding[idx] = __builtin_dynamic_object_size(ptr, 1);
+}
+
+struct test31_empty {};
+
+struct test31_struct {
+  struct test31_empty y;
+  int s;
+  int x[] __counted_by(s);
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
+// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 4
+// SANITIZE-WITH-ATTR-NEXT:    [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
+// SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
+// SANITIZE-WITH-ATTR-NEXT:    ret i32 [[CONV]]
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 4
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret i32 [[CONV]]
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 -1
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 -1
+//
+int test31(struct test31_struct *ptr, int idx) {
+  return __builtin_dynamic_object_size(ptr, 0);
+}
diff --git a/clang/test/CodeGen/attr-nomerge.cpp b/clang/test/CodeGen/attr-nomerge.cpp
index 7305fb73cf1dc..1cf5bb1619b31 100644
--- a/clang/test/CodeGen/attr-nomerge.cpp
+++ b/clang/test/CodeGen/attr-nomerge.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -fms-extensions -o - | FileCheck %s
 
 class A {
 public:
@@ -42,6 +42,9 @@ void foo(int i, A *ap, B *bp) {
 
   A *newA = new B();
   delete newA;
+  [[clang::nomerge]] __builtin_trap();
+  [[clang::nomerge]] __debugbreak();
+  [[clang::nomerge]] __builtin_verbose_trap("check null", "Argument must not be null.");
 }
 
 int g(int i);
@@ -62,22 +65,22 @@ void something_else_again() {
   g(1);
 }
 
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0:[0-9]+]]
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0]]
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0]]
-// CHECK: call noundef zeroext i1 @_Z3barv(){{$}}
-// CHECK: call noundef zeroext i1 @_Z3barv(){{$}}
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0:[0-9]+]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv(){{$}}
+// CHECK: call noundef{{.*}} i1 @_Z3barv(){{$}}
 // CHECK: call void @_Z1fbb({{.*}}) #[[ATTR0]]
 // CHECK: %[[FPTR:.*]] = load ptr, ptr @fptr
 // CHECK-NEXT: call void %[[FPTR]]() #[[ATTR0]]
 // CHECK: call void @"_ZZ3fooiP1AP1BENK3$_0clEv"{{.*}} #[[ATTR0]]
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0]]
 // CHECK-LABEL: for.cond:
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0]]
 // CHECK-LABEL: for.inc:
-// CHECK: call noundef zeroext i1 @_Z3barv() #[[ATTR0]]
+// CHECK: call noundef{{.*}} i1 @_Z3barv() #[[ATTR0]]
 // CHECK: call void asm sideeffect "nop"{{.*}} #[[ATTR1:[0-9]+]]
-// CHECK: call noundef zeroext i1 @_Z3barv(){{$}}
+// CHECK: call noundef{{.*}} i1 @_Z3barv(){{$}}
 // CHECK: load ptr, ptr
 // CHECK: load ptr, ptr
 // CHECK: %[[AG:.*]] = load ptr, ptr
@@ -97,6 +100,9 @@ void something_else_again() {
 // CHECK: load ptr, ptr
 // CHECK: %[[AG:.*]] = load ptr, ptr
 // CHECK-NEXT: call void %[[AG]](ptr {{.*}}) #[[ATTR1]]
+// CHECK: call void @llvm.trap() #[[ATTR0]]
+// CHECK: call void @llvm.debugtrap() #[[ATTR0]]
+// CHECK: call void @llvm.trap() #[[ATTR0]]
 // CHECK: call void  @_ZN1AD1Ev(ptr {{.*}}) #[[ATTR1]]
 
 // CHECK-DAG: attributes #[[ATTR0]] = {{{.*}}nomerge{{.*}}}
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 60f9c7f1fc24e..274e05de594b8 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -32,8 +32,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver
 // CHECK: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver
 // CHECK: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver
-// CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
 // CHECK: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver
+// CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
 // CHECK: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver
 //.
 // CHECK-MTE-BTI: @__aarch64_cpu_features = external dso_local global { i64 }
@@ -42,8 +42,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver
 // CHECK-MTE-BTI: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver
 // CHECK-MTE-BTI: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver
-// CHECK-MTE-BTI: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
 // CHECK-MTE-BTI: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver
+// CHECK-MTE-BTI: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
 // CHECK-MTE-BTI: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver
 //.
 // CHECK: Function Attrs: noinline nounwind optnone
@@ -157,7 +157,15 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK:       resolver_return:
 // CHECK-NEXT:    ret ptr @ftc_dup2._McrcMdotprod
 // CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 256
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 256
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
 // CHECK-NEXT:    ret ptr @ftc_dup2._Mfp
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @ftc_dup2.default
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
@@ -210,12 +218,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @ftc_inline2._Mfp16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 2
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc_direct(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 4
@@ -236,86 +238,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:    ret i32 [[ADD5]]
 //
 //
-// CHECK-LABEL: @ftc_inline1.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc_inline1._Msve2-aesMwfxt
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc_inline1._MpredresMrcpc
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 513
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513
-// CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
-// CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
-// CHECK:       resolver_return3:
-// CHECK-NEXT:    ret ptr @ftc_inline1._MrngMsimd
-// CHECK:       resolver_else4:
-// CHECK-NEXT:    ret ptr @ftc_inline1.default
-//
-//
-// CHECK-LABEL: @ftc_inline2.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc_inline2._MfcmaMsve2-bitperm
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 65536
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc_inline2._Mfp16
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @ftc_inline2.default
-//
-//
-// CHECK-LABEL: @ftc_inline3.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @ftc_inline3.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @ftc_inline2._MfcmaMsve2-bitperm(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 2
-//
-//
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc.default(
 // CHECK-NEXT:  entry:
@@ -347,11 +269,45 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: @ftc_inline2._Mfp16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 2
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: @ftc_inline2._MfcmaMsve2-bitperm(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 2
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc_inline2.default(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
+// CHECK-LABEL: @ftc_inline2.resolver(
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @ftc_inline2._MfcmaMsve2-bitperm
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 65536
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @ftc_inline2._Mfp16
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @ftc_inline2.default
+//
+//
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc_inline1._MrngMsimd(
 // CHECK-NEXT:  entry:
@@ -376,6 +332,36 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:    ret i32 1
 //
 //
+// CHECK-LABEL: @ftc_inline1.resolver(
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @ftc_inline1._Msve2-aesMwfxt
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @ftc_inline1._MpredresMrcpc
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 513
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513
+// CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
+// CHECK:       resolver_return3:
+// CHECK-NEXT:    ret ptr @ftc_inline1._MrngMsimd
+// CHECK:       resolver_else4:
+// CHECK-NEXT:    ret ptr @ftc_inline1.default
+//
+//
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc_inline3._Mbti(
 // CHECK-NEXT:  entry:
@@ -394,6 +380,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:    ret i32 3
 //
 //
+// CHECK-LABEL: @ftc_inline3.resolver(
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @ftc_inline3.default
+//
+//
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: @ftc(
 // CHECK-NOFMV-NEXT:  entry:
@@ -571,7 +579,15 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI:       resolver_return:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2._McrcMdotprod
 // CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 256
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 256
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2._Mfp
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2.default
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
@@ -624,12 +640,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
-// CHECK-MTE-BTI-LABEL: @ftc_inline2._Mfp16(
-// CHECK-MTE-BTI-NEXT:  entry:
-// CHECK-MTE-BTI-NEXT:    ret i32 2
-//
-//
-// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: @ftc_direct(
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
@@ -650,86 +660,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:    ret i32 [[ADD5]]
 //
 //
-// CHECK-MTE-BTI-LABEL: @ftc_inline1.resolver(
-// CHECK-MTE-BTI-NEXT:  resolver_entry:
-// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456
-// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._Msve2-aesMwfxt
-// CHECK-MTE-BTI:       resolver_else:
-// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632
-// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MpredresMrcpc
-// CHECK-MTE-BTI:       resolver_else2:
-// CHECK-MTE-BTI-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 513
-// CHECK-MTE-BTI-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513
-// CHECK-MTE-BTI-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
-// CHECK-MTE-BTI:       resolver_return3:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MrngMsimd
-// CHECK-MTE-BTI:       resolver_else4:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1.default
-//
-//
-// CHECK-MTE-BTI-LABEL: @ftc_inline2.resolver(
-// CHECK-MTE-BTI-NEXT:  resolver_entry:
-// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040
-// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._MfcmaMsve2-bitperm
-// CHECK-MTE-BTI:       resolver_else:
-// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 65536
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536
-// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._Mfp16
-// CHECK-MTE-BTI:       resolver_else2:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2.default
-//
-//
-// CHECK-MTE-BTI-LABEL: @ftc_inline3.resolver(
-// CHECK-MTE-BTI-NEXT:  resolver_entry:
-// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488
-// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
-// CHECK-MTE-BTI:       resolver_else:
-// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
-// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
-// CHECK-MTE-BTI:       resolver_else2:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
-//
-//
-// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
-// CHECK-MTE-BTI-LABEL: @ftc_inline2._MfcmaMsve2-bitperm(
-// CHECK-MTE-BTI-NEXT:  entry:
-// CHECK-MTE-BTI-NEXT:    ret i32 2
-//
-//
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: @ftc.default(
 // CHECK-MTE-BTI-NEXT:  entry:
@@ -761,11 +691,45 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline2._Mfp16(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline2._MfcmaMsve2-bitperm(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: @ftc_inline2.default(
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
+// CHECK-MTE-BTI-LABEL: @ftc_inline2.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._MfcmaMsve2-bitperm
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 65536
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._Mfp16
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2.default
+//
+//
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: @ftc_inline1._MrngMsimd(
 // CHECK-MTE-BTI-NEXT:  entry:
@@ -790,6 +754,36 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
+// CHECK-MTE-BTI-LABEL: @ftc_inline1.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._Msve2-aesMwfxt
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MpredresMrcpc
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 513
+// CHECK-MTE-BTI-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513
+// CHECK-MTE-BTI-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
+// CHECK-MTE-BTI:       resolver_return3:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MrngMsimd
+// CHECK-MTE-BTI:       resolver_else4:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1.default
+//
+//
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: @ftc_inline3._Mbti(
 // CHECK-MTE-BTI-NEXT:  entry:
@@ -807,6 +801,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
+//
+// CHECK-MTE-BTI-LABEL: @ftc_inline3.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
+//
 //.
 // CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" }
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 4edfc5408fae7..b058f84f78baa 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -v9.5a -target-feature -fp-armv8 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
 
 int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; }
@@ -11,15 +11,15 @@ int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; }
 int __attribute__((target_version("crc+ls64_v"))) fmv(void) { return 7; }
 int __attribute__((target_version("bti"))) fmv(void) { return 8; }
 int __attribute__((target_version("sme2"))) fmv(void) { return 9; }
-int __attribute__((target_version("default"))) fmv(void);
+int __attribute__((target_version("default"))) fmv(void) { return 0; }
 int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; }
 int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; }
-int __attribute__((target_version("default"))) fmv_one(void);
+int __attribute__((target_version("default"))) fmv_one(void) { return 0; }
 int __attribute__((target_version("fp"))) fmv_two(void) { return 1; }
 int __attribute__((target_version("simd"))) fmv_two(void) { return 2; }
 int __attribute__((target_version("dgh"))) fmv_two(void) { return 3; }
 int __attribute__((target_version("fp16+simd"))) fmv_two(void) { return 4; }
-int __attribute__((target_version("default"))) fmv_two(void);
+int __attribute__((target_version("default"))) fmv_two(void) { return 0; }
 int foo() {
   return fmv()+fmv_one()+fmv_two();
 }
@@ -124,11 +124,11 @@ __attribute__((target_version("rdma"))) int default_def_with_version_decls(void)
 
 // The following is guarded because in NOFMV we get errors for calling undeclared functions.
 #ifdef __HAVE_FUNCTION_MULTI_VERSIONING
-// This should generate a default declaration, two target versions and the resolver.
+// This should generate a default declaration, two target versions but no resolver.
 __attribute__((target_version("jscvt"))) int used_def_without_default_decl(void) { return 1; }
 __attribute__((target_version("rdma"))) int used_def_without_default_decl(void) { return 2; }
 
-// This should generate a default declaration and the resolver.
+// This should generate a default declaration but no resolver.
 __attribute__((target_version("jscvt"))) int used_decl_without_default_decl(void);
 __attribute__((target_version("rdma"))) int used_decl_without_default_decl(void);
 
@@ -140,12 +140,10 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver
 // CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver
 // CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver
-// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
 // CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver
 // CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver
 // CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver
-// CHECK: @used_def_without_default_decl = weak_odr ifunc i32 (), ptr @used_def_without_default_decl.resolver
-// CHECK: @used_decl_without_default_decl = weak_odr ifunc i32 (), ptr @used_decl_without_default_decl.resolver
+// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
 // CHECK: @unused_with_default_def = weak_odr ifunc i32 (), ptr @unused_with_default_def.resolver
 // CHECK: @unused_with_implicit_default_def = weak_odr ifunc i32 (), ptr @unused_with_implicit_default_def.resolver
 // CHECK: @unused_with_implicit_forward_default_def = weak_odr ifunc i32 (), ptr @unused_with_implicit_forward_default_def.resolver
@@ -215,6 +213,13 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv.default
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_one._Mls64Msimd
 // CHECK-SAME: () #[[ATTR5]] {
 // CHECK-NEXT:  entry:
@@ -229,6 +234,13 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_one.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp
 // CHECK-SAME: () #[[ATTR5]] {
 // CHECK-NEXT:  entry:
@@ -244,21 +256,28 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mdgh
-// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp16Msimd
-// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 4
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_two.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@foo
-// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_one()
@@ -268,6 +287,183 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret i32 [[ADD3]]
 //
 //
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_e.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 20
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_c.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@goo
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv_inline()
+// CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_e()
+// CHECK-NEXT:    [[CALL2:%.*]] = call i32 @fmv_d()
+// CHECK-NEXT:    call void @fmv_c()
+// CHECK-NEXT:    [[CALL3:%.*]] = call i32 @fmv_default()
+// CHECK-NEXT:    ret i32 [[CALL3]]
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 111
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@recur
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @reca()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@hoo
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FP1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[FP2:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    call void @f(ptr noundef @fmv)
+// CHECK-NEXT:    store ptr @fmv, ptr [[FP1]], align 8
+// CHECK-NEXT:    store ptr @fmv, ptr [[FP2]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 [[TMP0]]()
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8
+// CHECK-NEXT:    [[CALL1:%.*]] = call i32 [[TMP1]]()
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-NEXT:    ret i32 [[ADD]]
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops
+// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod
+// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes
+// CHECK-SAME: () #[[ATTR5]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve
+// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16
+// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse
+// CHECK-SAME: () #[[ATTR16:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm
+// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt
+// CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm
+// CHECK-SAME: () #[[ATTR17]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 2
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@caller
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @used_def_without_default_decl()
+// CHECK-NEXT:    [[CALL1:%.*]] = call i32 @used_decl_without_default_decl()
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-NEXT:    ret i32 [[ADD]]
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    call void @recur()
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @goo()
+// CHECK-NEXT:    ret i32 [[CALL]]
+//
+//
 // CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat {
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
@@ -406,57 +602,185 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv_two.default
 //
 //
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_e.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 20
+// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @fmv_e._Mls64
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @fmv_e.default
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb
-// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR20:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_d.default
-// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
+// CHECK-LABEL: define {{[^@]+}}@fmv_d.resolver() {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @fmv_d._Msb
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @fmv_d.default
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @fmv_c._Mssbs
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @fmv_c.default
+//
+//
 // CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs
-// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1
+// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
+// CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_c.default
-// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme
+// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
+// CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@goo
-// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3
+// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv_inline()
-// CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_e()
-// CHECK-NEXT:    [[CALL2:%.*]] = call i32 @fmv_d()
-// CHECK-NEXT:    call void @fmv_c()
-// CHECK-NEXT:    [[CALL3:%.*]] = call i32 @fmv_default()
-// CHECK-NEXT:    ret i32 [[CALL3]]
+// CHECK-NEXT:    ret i32 12
 //
 //
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16
+// CHECK-SAME: () #[[ATTR24:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 8
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2
+// CHECK-SAME: () #[[ATTR25:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 6
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt
+// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 7
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc
+// CHECK-SAME: () #[[ATTR27:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 3
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16
+// CHECK-SAME: () #[[ATTR28:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 4
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3
+// CHECK-SAME: () #[[ATTR29:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 5
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128
+// CHECK-SAME: () #[[ATTR30:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 9
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4
+// CHECK-SAME: () #[[ATTR31:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 10
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3
+// CHECK-SAME: () #[[ATTR32:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 11
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod
+// CHECK-SAME: () #[[ATTR14]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 13
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 14
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4
+// CHECK-SAME: () #[[ATTR33:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 15
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm
+// CHECK-SAME: () #[[ATTR34:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 16
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline.default
+// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 3
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@fmv_inline.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4398048673856
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398048673856
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
@@ -587,355 +911,6 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv_inline.default
 //
 //
-// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv_e._Mls64
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @fmv_e.default
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@fmv_d.resolver() {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv_d._Msb
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @fmv_d.default
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv_c._Mssbs
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @fmv_c.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 111
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@recur
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @reca()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@hoo
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[FP1:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[FP2:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    call void @f(ptr noundef @fmv)
-// CHECK-NEXT:    store ptr @fmv, ptr [[FP1]], align 8
-// CHECK-NEXT:    store ptr @fmv, ptr [[FP2]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8
-// CHECK-NEXT:    [[CALL:%.*]] = call i32 [[TMP0]]()
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8
-// CHECK-NEXT:    [[CALL1:%.*]] = call i32 [[TMP1]]()
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
-// CHECK-NEXT:    ret i32 [[ADD]]
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops
-// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod
-// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes
-// CHECK-SAME: () #[[ATTR5]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve
-// CHECK-SAME: () #[[ATTR16:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16
-// CHECK-SAME: () #[[ATTR12]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse
-// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm
-// CHECK-SAME: () #[[ATTR18:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 0
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt
-// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm
-// CHECK-SAME: () #[[ATTR18]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 2
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@caller
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = call i32 @used_def_without_default_decl()
-// CHECK-NEXT:    [[CALL1:%.*]] = call i32 @used_decl_without_default_decl()
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
-// CHECK-NEXT:    ret i32 [[ADD]]
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1048576
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @used_def_without_default_decl._Mjscvt
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 64
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 64
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @used_def_without_default_decl._Mrdm
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @used_def_without_default_decl.default
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@used_decl_without_default_decl.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1048576
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @used_decl_without_default_decl._Mjscvt
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 64
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 64
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @used_decl_without_default_decl._Mrdm
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @used_decl_without_default_decl.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@main
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
-// CHECK-NEXT:    call void @recur()
-// CHECK-NEXT:    [[CALL:%.*]] = call i32 @goo()
-// CHECK-NEXT:    ret i32 [[CALL]]
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1
-// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme
-// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 2
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3
-// CHECK-SAME: () #[[ATTR24:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 12
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16
-// CHECK-SAME: () #[[ATTR25:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 8
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2
-// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 6
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt
-// CHECK-SAME: () #[[ATTR27:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 7
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc
-// CHECK-SAME: () #[[ATTR28:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 3
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16
-// CHECK-SAME: () #[[ATTR29:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 4
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3
-// CHECK-SAME: () #[[ATTR30:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 5
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128
-// CHECK-SAME: () #[[ATTR31:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 9
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4
-// CHECK-SAME: () #[[ATTR32:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 10
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3
-// CHECK-SAME: () #[[ATTR33:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 11
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod
-// CHECK-SAME: () #[[ATTR15]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 13
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd
-// CHECK-SAME: () #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 14
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4
-// CHECK-SAME: () #[[ATTR34:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 15
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm
-// CHECK-SAME: () #[[ATTR35:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 16
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_inline.default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 3
-//
-//
 // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.resolver() comdat {
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
@@ -1013,6 +988,27 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
+// CHECK-NOFMV-NEXT:  entry:
+// CHECK-NOFMV-NEXT:    ret i32 0
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
+// CHECK-NOFMV-NEXT:  entry:
+// CHECK-NOFMV-NEXT:    ret i32 0
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
+// CHECK-NOFMV-NEXT:  entry:
+// CHECK-NOFMV-NEXT:    ret i32 0
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_e
 // CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
@@ -1116,42 +1112,42 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NOFMV-NEXT:    ret i32 1
 //
 //.
-// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand,-v9.5a" }
-// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+sha2,-v9.5a" }
-// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+ls64,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
-// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
-// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" }
-// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" }
-// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" }
-// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" }
-// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" }
-// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" }
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+sha2" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+ls64,+neon" }
+// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon" }
+// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" }
+// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" }
+// CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2" }
+// CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp" }
+// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
+// CHECK: attributes #[[ATTR12:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
+// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
+// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" }
+// CHECK: attributes #[[ATTR18:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon" }
+// CHECK: attributes #[[ATTR19]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon" }
+// CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb" }
+// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme" }
+// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve" }
+// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc" }
+// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon" }
+// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc" }
+// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3" }
+// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm" }
+// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4" }
+// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3" }
+// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4" }
+// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm" }
+// CHECK: attributes #[[ATTR35:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" }
 //.
 // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
 // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c
index 01663766d9842..39b26619475af 100644
--- a/clang/test/CodeGen/attr-target-x86-mmx.c
+++ b/clang/test/CodeGen/attr-target-x86-mmx.c
@@ -1,12 +1,11 @@
 // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// Picking a cpu that doesn't have mmx or sse by default so we can enable it later.
+// Picking a cpu that doesn't have sse by default so we can enable it later.
 
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-// Verify that when we turn on sse that we also turn on mmx.
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
+void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) {
   _mm_slli_pi16(a, c);
   _mm_slli_pi32(a, c);
   _mm_slli_si64(a, c);
@@ -19,4 +18,4 @@ void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
   _mm_srai_pi32(a, c);
 }
 
-// CHECK: "target-features"="+cx8,+mmx,+sse,+x87"
+// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87"
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 3c2b511157f99..b1ae6678531b9 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -64,7 +64,7 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {}
 // CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-avx10.1-256,-avx10.1-512,-vaes"
 // CHECK-NOT: tune-cpu
-// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"
+// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-mmx"
 // CHECK: #7 = {{.*}}"target-cpu"="lakemont" "target-features"="+cx8,+mmx"
 // CHECK-NOT: tune-cpu
 // CHECK: #8 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="sandybridge"
diff --git a/clang/test/CodeGen/builtin-cpu-supports.c b/clang/test/CodeGen/builtin-cpu-supports.c
index 88eb7b0fa786e..92c407653e660 100644
--- a/clang/test/CodeGen/builtin-cpu-supports.c
+++ b/clang/test/CodeGen/builtin-cpu-supports.c
@@ -3,8 +3,11 @@
 // RUN:   FileCheck %s --check-prefix=CHECK-X86
 // RUN: %clang_cc1 -triple ppc64le-linux-gnu -emit-llvm -o - %s | FileCheck %s \
 // RUN:   --check-prefix=CHECK-PPC
-
-#ifndef __PPC__
+// RUN: %clang_cc1 -triple riscv32-linux-gnu -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefix=CHECK-RV32
+// RUN: %clang_cc1 -triple riscv64-linux-gnu -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefix=CHECK-RV64
+#ifdef __x86_64__
 
 // Test that we have the structure definition, the gep offsets, the name of the
 // global, the bit grab, and the icmp correct.
@@ -101,8 +104,10 @@ int v3() { return __builtin_cpu_supports("x86-64-v3"); }
 // CHECK-X86-NEXT:    ret i32 [[CONV]]
 //
 int v4() { return __builtin_cpu_supports("x86-64-v4"); }
-#else
-// CHECK-PPC-LABEL: define dso_local signext i32 @test(
+#endif
+
+#ifdef __PPC__
+// CHECK-PPC-LABEL: define dso_local signext i32 @test_ppc(
 // CHECK-PPC-SAME: i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-PPC-NEXT:  entry:
 // CHECK-PPC-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
@@ -129,33 +134,206 @@ int v4() { return __builtin_cpu_supports("x86-64-v4"); }
 // CHECK-PPC:       if.else3:
 // CHECK-PPC-NEXT:    [[CPU_IS:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
 // CHECK-PPC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[CPU_IS]], 39
-// CHECK-PPC-NEXT:    br i1 [[TMP6]], label [[IF_THEN4:%.*]], label [[IF_END:%.*]]
+// CHECK-PPC-NEXT:    br i1 [[TMP6]], label [[IF_THEN4:%.*]], label [[IF_ELSE5:%.*]]
 // CHECK-PPC:       if.then4:
 // CHECK-PPC-NEXT:    [[TMP7:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK-PPC-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK-PPC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP8]]
 // CHECK-PPC-NEXT:    store i32 [[ADD]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else5:
+// CHECK-PPC-NEXT:    [[CPU_IS6:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[CPU_IS6]], 39
+// CHECK-PPC-NEXT:    br i1 [[TMP9]], label [[IF_THEN7:%.*]], label [[IF_ELSE8:%.*]]
+// CHECK-PPC:       if.then7:
+// CHECK-PPC-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 3
+// CHECK-PPC-NEXT:    store i32 [[MUL]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else8:
+// CHECK-PPC-NEXT:    [[CPU_IS9:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[CPU_IS9]], 33
+// CHECK-PPC-NEXT:    br i1 [[TMP11]], label [[IF_THEN10:%.*]], label [[IF_ELSE12:%.*]]
+// CHECK-PPC:       if.then10:
+// CHECK-PPC-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[TMP12]], 4
+// CHECK-PPC-NEXT:    store i32 [[MUL11]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else12:
+// CHECK-PPC-NEXT:    [[CPU_IS13:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[CPU_IS13]], 45
+// CHECK-PPC-NEXT:    br i1 [[TMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE16:%.*]]
+// CHECK-PPC:       if.then14:
+// CHECK-PPC-NEXT:    [[TMP14:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP14]], 3
+// CHECK-PPC-NEXT:    store i32 [[ADD15]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else16:
+// CHECK-PPC-NEXT:    [[CPU_IS17:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[CPU_IS17]], 46
+// CHECK-PPC-NEXT:    br i1 [[TMP15]], label [[IF_THEN18:%.*]], label [[IF_ELSE20:%.*]]
+// CHECK-PPC:       if.then18:
+// CHECK-PPC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[SUB19:%.*]] = sub nsw i32 [[TMP16]], 3
+// CHECK-PPC-NEXT:    store i32 [[SUB19]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else20:
+// CHECK-PPC-NEXT:    [[CPU_IS21:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[CPU_IS21]], 47
+// CHECK-PPC-NEXT:    br i1 [[TMP17]], label [[IF_THEN22:%.*]], label [[IF_ELSE24:%.*]]
+// CHECK-PPC:       if.then22:
+// CHECK-PPC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP18]], 7
+// CHECK-PPC-NEXT:    store i32 [[ADD23]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else24:
+// CHECK-PPC-NEXT:    [[CPU_IS25:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[CPU_IS25]], 48
+// CHECK-PPC-NEXT:    br i1 [[TMP19]], label [[IF_THEN26:%.*]], label [[IF_END:%.*]]
+// CHECK-PPC:       if.then26:
+// CHECK-PPC-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 7
+// CHECK-PPC-NEXT:    store i32 [[SUB27]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
 // CHECK-PPC:       if.end:
-// CHECK-PPC-NEXT:    br label [[IF_END5:%.*]]
-// CHECK-PPC:       if.end5:
-// CHECK-PPC-NEXT:    br label [[IF_END6:%.*]]
-// CHECK-PPC:       if.end6:
-// CHECK-PPC-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP9]], 5
-// CHECK-PPC-NEXT:    store i32 [[ADD7]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[IF_END28:%.*]]
+// CHECK-PPC:       if.end28:
+// CHECK-PPC-NEXT:    br label [[IF_END29:%.*]]
+// CHECK-PPC:       if.end29:
+// CHECK-PPC-NEXT:    br label [[IF_END30:%.*]]
+// CHECK-PPC:       if.end30:
+// CHECK-PPC-NEXT:    br label [[IF_END31:%.*]]
+// CHECK-PPC:       if.end31:
+// CHECK-PPC-NEXT:    br label [[IF_END32:%.*]]
+// CHECK-PPC:       if.end32:
+// CHECK-PPC-NEXT:    br label [[IF_END33:%.*]]
+// CHECK-PPC:       if.end33:
+// CHECK-PPC-NEXT:    br label [[IF_END34:%.*]]
+// CHECK-PPC:       if.end34:
+// CHECK-PPC-NEXT:    br label [[IF_END35:%.*]]
+// CHECK-PPC:       if.end35:
+// CHECK-PPC-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[ADD36:%.*]] = add nsw i32 [[TMP21]], 5
+// CHECK-PPC-NEXT:    store i32 [[ADD36]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
 // CHECK-PPC:       return:
-// CHECK-PPC-NEXT:    [[TMP10:%.*]] = load i32, ptr [[RETVAL]], align 4
-// CHECK-PPC-NEXT:    ret i32 [[TMP10]]
+// CHECK-PPC-NEXT:    [[TMP22:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    ret i32 [[TMP22]]
 //
-int test(int a) {
+int test_ppc(int a) {
   if (__builtin_cpu_supports("arch_3_00")) // HWCAP2
     return a;
   else if (__builtin_cpu_supports("mmu"))  // HWCAP
     return a - 5;
   else if (__builtin_cpu_is("power7"))     // CPUID
     return a + a;
+  else if (__builtin_cpu_is("pwr7"))     // CPUID
+    return a * 3;
+  else if (__builtin_cpu_is("ppc970"))     // CPUID
+    return a * 4;
+  else if (__builtin_cpu_is("power8"))
+    return a + 3;
+  else if (__builtin_cpu_is("power9"))
+    return a - 3;
+  else if (__builtin_cpu_is("power10"))
+    return a + 7;
+  else if (__builtin_cpu_is("power11"))
+    return a - 7;
   return a + 5;
 }
 #endif
+
+#ifdef __riscv
+// CHECK-RV32-LABEL: define dso_local i32 @test_riscv(
+// CHECK-RV32-SAME: i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV32-NEXT:  entry:
+// CHECK-RV32-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-RV32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-RV32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-RV32-NEXT:    call void @__init_riscv_feature_bits()
+// CHECK-RV32-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV32-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1
+// CHECK-RV32-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1
+// CHECK-RV32-NEXT:    br i1 [[TMP2]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK-RV32:       if.then:
+// CHECK-RV32-NEXT:    store i32 3, ptr [[RETVAL]], align 4
+// CHECK-RV32-NEXT:    br label [[RETURN:%.*]]
+// CHECK-RV32:       if.else:
+// CHECK-RV32-NEXT:    [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV32-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 4
+// CHECK-RV32-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 4
+// CHECK-RV32-NEXT:    br i1 [[TMP5]], label [[IF_THEN1:%.*]], label [[IF_ELSE2:%.*]]
+// CHECK-RV32:       if.then1:
+// CHECK-RV32-NEXT:    store i32 7, ptr [[RETVAL]], align 4
+// CHECK-RV32-NEXT:    br label [[RETURN]]
+// CHECK-RV32:       if.else2:
+// CHECK-RV32-NEXT:    [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV32-NEXT:    [[TMP7:%.*]] = and i64 [[TMP6]], 2097152
+// CHECK-RV32-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2097152
+// CHECK-RV32-NEXT:    br i1 [[TMP8]], label [[IF_THEN3:%.*]], label [[IF_END:%.*]]
+// CHECK-RV32:       if.then3:
+// CHECK-RV32-NEXT:    store i32 11, ptr [[RETVAL]], align 4
+// CHECK-RV32-NEXT:    br label [[RETURN]]
+// CHECK-RV32:       if.end:
+// CHECK-RV32-NEXT:    br label [[IF_END4:%.*]]
+// CHECK-RV32:       if.end4:
+// CHECK-RV32-NEXT:    br label [[IF_END5:%.*]]
+// CHECK-RV32:       if.end5:
+// CHECK-RV32-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-RV32-NEXT:    br label [[RETURN]]
+// CHECK-RV32:       return:
+// CHECK-RV32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-RV32-NEXT:    ret i32 [[TMP9]]
+//
+// CHECK-RV64-LABEL: define dso_local signext i32 @test_riscv(
+// CHECK-RV64-SAME: i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-RV64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-RV64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-RV64-NEXT:    call void @__init_riscv_feature_bits()
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV64-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1
+// CHECK-RV64-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1
+// CHECK-RV64-NEXT:    br i1 [[TMP2]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK-RV64:       if.then:
+// CHECK-RV64-NEXT:    store i32 3, ptr [[RETVAL]], align 4
+// CHECK-RV64-NEXT:    br label [[RETURN:%.*]]
+// CHECK-RV64:       if.else:
+// CHECK-RV64-NEXT:    [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV64-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 4
+// CHECK-RV64-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 4
+// CHECK-RV64-NEXT:    br i1 [[TMP5]], label [[IF_THEN1:%.*]], label [[IF_ELSE2:%.*]]
+// CHECK-RV64:       if.then1:
+// CHECK-RV64-NEXT:    store i32 7, ptr [[RETVAL]], align 4
+// CHECK-RV64-NEXT:    br label [[RETURN]]
+// CHECK-RV64:       if.else2:
+// CHECK-RV64-NEXT:    [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
+// CHECK-RV64-NEXT:    [[TMP7:%.*]] = and i64 [[TMP6]], 2097152
+// CHECK-RV64-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2097152
+// CHECK-RV64-NEXT:    br i1 [[TMP8]], label [[IF_THEN3:%.*]], label [[IF_END:%.*]]
+// CHECK-RV64:       if.then3:
+// CHECK-RV64-NEXT:    store i32 11, ptr [[RETVAL]], align 4
+// CHECK-RV64-NEXT:    br label [[RETURN]]
+// CHECK-RV64:       if.end:
+// CHECK-RV64-NEXT:    br label [[IF_END4:%.*]]
+// CHECK-RV64:       if.end4:
+// CHECK-RV64-NEXT:    br label [[IF_END5:%.*]]
+// CHECK-RV64:       if.end5:
+// CHECK-RV64-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-RV64-NEXT:    br label [[RETURN]]
+// CHECK-RV64:       return:
+// CHECK-RV64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-RV64-NEXT:    ret i32 [[TMP9]]
+//
+int test_riscv(int a) {
+  __builtin_cpu_init();
+  if (__builtin_cpu_supports("a"))
+    return 3;
+  else if (__builtin_cpu_supports("c"))
+    return 7;
+  else if (__builtin_cpu_supports("v"))
+    return 11;
+  return 0;
+}
+#endif
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 75861b1b4bd6d..f494aeada0157 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -840,6 +840,12 @@ float extract_lane_f16x8(f16x8 a, int i) {
   return __builtin_wasm_extract_lane_f16x8(a, i);
 }
 
+f16x8 replace_lane_f16x8(f16x8 a, int i, float v) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %a, i32 %i, float %v)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_replace_lane_f16x8(a, i, v);
+}
+
 f16x8 min_f16x8(f16x8 a, f16x8 b) {
   // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> %a, <8 x half> %b)
   // WEBASSEMBLY-NEXT: ret <8 x half> %0
diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c
index de31a4db5b0c1..c42c3216ec53c 100644
--- a/clang/test/CodeGen/builtins-x86.c
+++ b/clang/test/CodeGen/builtins-x86.c
@@ -168,26 +168,6 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
   tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);
 
-  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
@@ -220,45 +200,17 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
   tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
   tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
   tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
   tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
-  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
-  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
-  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
-  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);
 
   __builtin_ia32_incsspd(tmp_Ui);
   __builtin_ia32_incsspq(tmp_ULLi);
@@ -306,8 +258,6 @@ void f0(void) {
   (void) __builtin_ia32_clzero(tmp_vp);
   (void) __builtin_ia32_cldemote(tmp_vp);
 
-  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
-  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
   tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
   tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);
 
@@ -320,17 +270,12 @@ void f0(void) {
   tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
   tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
 #endif
-  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
-  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
-  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 #ifndef OPENCL
   (void) _mm_sfence();
 #endif
 
-  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
   tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
   tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
   tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
@@ -348,11 +293,8 @@ void f0(void) {
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
   tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
-  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
   tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
   tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
@@ -379,26 +321,9 @@ void f0(void) {
   (void) _mm_pause();
 #endif
 
-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, imm_i_0_8);
 
   // Using non-immediate argument supported for gcc compatibility
-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);
-
-  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
+
   tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
   tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
@@ -433,7 +358,6 @@ void f0(void) {
   (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
   tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
   tmp_V16c = __builtin_ia32_palignr128(tmp_V16c, tmp_V16c, imm_i);
-  tmp_V8c = __builtin_ia32_palignr(tmp_V8c, tmp_V8c, imm_i);
 #ifdef USE_SSE4
   tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index 42c9e3c5008a3..aa77620b44535 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -36,6 +36,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call float @llvm.experimental.constrained.ldexp.f32.i32(float %{{.*}}, i32 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(x86_fp80 %{{.*}}, i32 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_acos(f);        __builtin_acosf(f);       __builtin_acosl(f); __builtin_acosf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.acos.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.acos.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.acos.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.acos.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
+__builtin_asin(f);        __builtin_asinf(f);       __builtin_asinl(f); __builtin_asinf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.asin.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.asin.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.asin.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.asin.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
+__builtin_atan(f);        __builtin_atanf(f);       __builtin_atanl(f); __builtin_atanf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.atan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.atan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.atan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.atan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
   __builtin_ceil(f);       __builtin_ceilf(f);      __builtin_ceill(f); __builtin_ceilf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.ceil.f64(double %{{.*}}, metadata !"fpexcept.strict")
@@ -50,6 +71,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.cos.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_cosh(f);        __builtin_coshf(f);       __builtin_coshl(f); __builtin_coshf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.cosh.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.cosh.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.cosh.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.cosh.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
   __builtin_exp(f);        __builtin_expf(f);       __builtin_expl(f); __builtin_expf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.exp.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -177,6 +205,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.sin.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_sinh(f);        __builtin_sinhf(f);       __builtin_sinhl(f); __builtin_sinhf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.sinh.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.sinh.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.sinh.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.sinh.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
   __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_sqrtf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.sqrt.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -191,6 +226,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_tanh(f);        __builtin_tanhf(f);       __builtin_tanhl(f); __builtin_tanhf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.tanh.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.tanh.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.tanh.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.tanh.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
   __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
 
diff --git a/clang/test/CodeGen/finite-math.c b/clang/test/CodeGen/finite-math.c
index d1a2956b69feb..9cddba99ddf63 100644
--- a/clang/test/CodeGen/finite-math.c
+++ b/clang/test/CodeGen/finite-math.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffinite-math-only -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
+// RUN: %clang_cc1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
 // RUN: %clang_cc1 -fno-signed-zeros -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK  -check-prefix=NSZ
 // RUN: %clang_cc1 -freciprocal-math -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK  -check-prefix=RECIP
 // RUN: %clang_cc1 -mreassociate -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK  -check-prefix=REASSOC
diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c
index c5a26849b4b3d..71606c95e620c 100644
--- a/clang/test/CodeGen/fp-accuracy.c
+++ b/clang/test/CodeGen/fp-accuracy.c
@@ -97,15 +97,15 @@ double rsqrt(double);
 // CHECK: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_HIGH]]
 
 // CHECK-F1-LABEL: define dso_local void @f1
-// CHECK-F1: call double @acos(double {{.*}})
+// CHECK-F1: call double @llvm.acos.f64(double {{.*}})
 // CHECK-F1: call double @acosh(double {{.*}})
-// CHECK-F1: call double @asin(double {{.*}})
+// CHECK-F1: call double @llvm.asin.f64(double {{.*}})
 // CHECK-F1: call double @asinh(double {{.*}})
-// CHECK-F1: call double @atan(double {{.*}})
+// CHECK-F1: call double @llvm.atan.f64(double {{.*}})
 // CHECK-F1: call double @atan2(double {{.*}}, double {{.*}})
 // CHECK-F1: call double @atanh(double {{.*}})
 // CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]]
-// CHECK-F1: call double @cosh(double {{.*}})
+// CHECK-F1: call double @llvm.cosh.f64(double {{.*}})
 // CHECK-F1: call double @erf(double {{.*}})
 // CHECK-F1: call double @erfc(double {{.*}})
 // CHECK-F1: call i32 (double, ...) @exp10(double {{.*}})
@@ -126,10 +126,10 @@ double rsqrt(double);
 // CHECK-F1: call i32 (double, ...) @rsqrt(double {{.*}})
 // CHECK-F1: call double @llvm.sin.f64(double {{.*}})
 // CHECK-F1: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F1_MEDIUM]]
-// CHECK-F1: call double @sinh(double {{.*}})
+// CHECK-F1: call double @llvm.sinh.f64(double {{.*}})
 // CHECK-F1: call double @llvm.sqrt.f64(double {{.*}})
 // CHECK-F1: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]]
-// CHECK-F1: call double @tanh(double {{.*}})
+// CHECK-F1: call double @llvm.tanh.f64(double {{.*}})
 //
 // CHECK-F2-LABEL: define dso_local void @f1
 // CHECK-F2: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F2_MEDIUM:[0-9]+]]
@@ -259,15 +259,15 @@ double rsqrt(double);
 // CHECK-F4: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]]
 //
 // CHECK-F5-LABEL: define dso_local void @f1
-// CHECK-F5: call double @acos(double {{.*}})
+// CHECK-F5: call double @llvm.acos.f64(double {{.*}})
 // CHECK-F5: call double @acosh(double {{.*}})
-// CHECK-F5: call double @asin(double {{.*}})
+// CHECK-F5: call double @llvm.asin.f64(double {{.*}})
 // CHECK-F5: call double @asinh(double {{.*}})
-// CHECK-F5: call double @atan(double {{.*}})
+// CHECK-F5: call double @llvm.atan.f64(double {{.*}})
 // CHECK-F5: call double @atan2(double {{.*}}, double {{.*}})
 // CHECK-F5: call double @atanh(double {{.*}})
 // CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM:[0-9]+]]
-// CHECK-F5: call double @cosh(double {{.*}})
+// CHECK-F5: call double @llvm.cosh.f64(double {{.*}})
 // CHECK-F5: call double @erf(double {{.*}})
 // CHECK-F5: call double @erfc(double {{.*}})
 // CHECK-F5: call double @llvm.exp.f64(double {{.*}})
@@ -289,10 +289,10 @@ double rsqrt(double);
 // CHECK-F5: call i32 (double, ...) @rsqrt(double {{.*}})
 // CHECK-F5: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F5_HIGH:[0-9]+]]
 // CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}})
-// CHECK-F5: call double @sinh(double {{.*}})
+// CHECK-F5: call double @llvm.sinh.f64(double {{.*}})
 // CHECK-F5: call double @llvm.sqrt.f64(double {{.*}})
 // CHECK-F5: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F5_HIGH]]
-// CHECK-F5: call double @tanh(double {{.*}})
+// CHECK-F5: call double @llvm.tanh.f64(double {{.*}})
 //
 //
 // CHECK-F6-LABEL: define dso_local void @f1
@@ -560,15 +560,15 @@ void f1(float a, float b) {
 // CHECK-SPIR: attributes #[[ATTR_SYCL8]] = {{.*}}"fpbuiltin-max-error"="2.0"
 
 // CHECK-DEFAULT-LABEL: define dso_local void @f1
-// CHECK-DEFAULT: call double @acos(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.acos.f64(double {{.*}})
 // CHECK-DEFAULT: call double @acosh(double {{.*}})
-// CHECK-DEFAULT: call double @asin(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.asin.f64(double {{.*}})
 // CHECK-DEFAULT: call double @asinh(double {{.*}})
-// CHECK-DEFAULT: call double @atan(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.atan.f64(double {{.*}})
 // CHECK-DEFAULT: call double @atan2(double {{.*}}, double {{.*}})
 // CHECK-DEFAULT: call double @atanh(double {{.*}})
 // CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}})
-// CHECK-DEFAULT: call double @cosh(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.cosh.f64(double {{.*}})
 // CHECK-DEFAULT: call double @erf(double {{.*}})
 // CHECK-DEFAULT: call double @erfc(double {{.*}})
 // CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}})
@@ -590,10 +590,10 @@ void f1(float a, float b) {
 // CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double {{.*}})
 // CHECK-DEFAULT: call double @llvm.sin.f64(double {{.*}})
 // CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}})
-// CHECK-DEFAULT: call double @sinh(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.sinh.f64(double {{.*}})
 // CHECK-DEFAULT: call double @llvm.sqrt.f64(double {{.*}})
 // CHECK-DEFAULT: call double @llvm.tan.f64(double {{.*}})
-// CHECK-DEFAULT: call double @tanh(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.tanh.f64(double {{.*}})
 //
 // CHECK-DEFAULT-LABEL: define dso_local void @f2
 // CHECK-DEFAULT: call float @llvm.cos.f32(float {{.*}})
diff --git a/clang/test/CodeGen/fp-floatcontrol-stack.cpp b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
index 090da25d21207..237c9d4f9a37e 100644
--- a/clang/test/CodeGen/fp-floatcontrol-stack.cpp
+++ b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DDEFAULT=1 -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DDEFAULT %s
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DEBSTRICT=1 -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DEBSTRICT %s
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -DFAST=1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -ffinite-math-only -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
 
 #define FUN(n) \
   (float z) { return n * z + n; }
diff --git a/clang/test/CodeGen/fp-options-to-fast-math-flags.c b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
index abdcf8541f225..6aa62266d9898 100644
--- a/clang/test/CodeGen/fp-options-to-fast-math-flags.c
+++ b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck -check-prefix CHECK-PRECISE %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-NANS %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-INFS %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fno-signed-zeros -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-SIGNED-ZEROS %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -mreassociate -emit-llvm -o - %s | FileCheck -check-prefix CHECK-REASSOC %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -freciprocal-math -emit-llvm -o - %s | FileCheck -check-prefix CHECK-RECIP %s
diff --git a/clang/test/CodeGen/inline-asm-size-zero.c b/clang/test/CodeGen/inline-asm-size-zero.c
new file mode 100644
index 0000000000000..564f5207d1e71
--- /dev/null
+++ b/clang/test/CodeGen/inline-asm-size-zero.c
@@ -0,0 +1,6 @@
+// RUN: not %clang_cc1 -S %s -verify -o -
+
+void foo(void) {
+    extern long bar[];
+    asm ("" : "=r"(bar)); // expected-error{{output size should not be zero}}
+}
diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c
index 645fa016f3f77..b1637121127c5 100644
--- a/clang/test/CodeGen/libcalls.c
+++ b/clang/test/CodeGen/libcalls.c
@@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) {
   double atan_ = atan(d);
   long double atanl_ = atanl(ld);
   float atanf_ = atanf(f);
-// CHECK-NO: declare double @atan(double noundef) [[NUW_RN:#[0-9]+]]
-// CHECK-NO: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW_RN]]
-// CHECK-NO: declare float @atanf(float noundef) [[NUW_RN]]
+// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]]
+// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]]
+// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]]
 // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]]
 // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]]
 // CHECK-YES: declare float @atanf(float noundef) [[NUW]]
@@ -95,7 +95,7 @@ void test_builtins(double d, float f, long double ld) {
   double atan2_ = atan2(d, 2);
   long double atan2l_ = atan2l(ld, ld);
   float atan2f_ = atan2f(f, f);
-// CHECK-NO: declare double @atan2(double noundef, double noundef) [[NUW_RN]]
+// CHECK-NO: declare double @atan2(double noundef, double noundef) [[NUW_RN:#[0-9]+]]
 // CHECK-NO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW_RN]]
 // CHECK-NO: declare float @atan2f(float noundef, float noundef) [[NUW_RN]]
 // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]]
diff --git a/clang/test/CodeGen/math-libcalls-tbaa.c b/clang/test/CodeGen/math-libcalls-tbaa.c
new file mode 100644
index 0000000000000..9c86eea67d14d
--- /dev/null
+++ b/clang/test/CodeGen/math-libcalls-tbaa.c
@@ -0,0 +1,170 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN:  %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NONEWSTRUCTPATHTBAA
+// RUN:  %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NEWSTRUCTPATHTBAA
+
+float expf(float);
+double remainder(double, double);
+double fabs(double);
+double frexp(double, int *exp);
+void sincos(float a, float *s, float *c);
+float _Complex cacoshf(float _Complex);
+float crealf(float _Complex);
+
+// Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis
+
+// CHECK-LABEL: define dso_local float @test_expf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
+// CHECK-NEXT:    ret float [[MUL]]
+//
+float test_expf (float num[]) {
+   const float expm2 = expf(num[10]);  // Emit TBAA metadata on @expf
+   float tmp = expm2 * num[10];
+   return tmp;
+}
+
+// CHECK-LABEL: define dso_local float @test_builtin_expf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
+// CHECK-NEXT:    ret float [[MUL]]
+//
+float test_builtin_expf (float num[]) {
+   const float expm2 = __builtin_expf(num[10]);  // Emit TBAA metadata on @expf
+   float tmp = expm2 * num[10];
+   return tmp;
+}
+
+//
+// Negative test: fabs cannot set errno
+// CHECK-LABEL: define dso_local double @test_fabs(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 80
+// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]])
+// CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret double [[MUL]]
+//
+double test_fabs (double num[]) {
+   const double expm2 = fabs(num[10]);          // Don't emit TBAA metadata
+   double tmp = expm2 * num[10];
+   return tmp;
+}
+
+// CHECK-LABEL: define dso_local double @test_remainder(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]], double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 80
+// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
+// CHECK-NEXT:    ret double [[MUL]]
+//
+double test_remainder (double num[], double a) {
+   const double expm2 = remainder(num[10], a);  // Emit TBAA metadata
+   double tmp = expm2 * num[10];
+   return tmp;
+}
+
+//
+// TODO: frexp is not subject to any errors, but also writes to
+// its int pointer out argument, so it could emit int TBAA metadata.
+// CHECK-LABEL: define dso_local double @test_frexp(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[E:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CALL:%.*]] = call double @frexp(double noundef [[TMP0]], ptr noundef nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT:    ret double [[MUL]]
+//
+double test_frexp (double num[]) {
+   int e;
+   double expm2 = frexp(num[2], &e);  // Don't emit TBAA metadata
+   double tmp = expm2 * num[2];
+   return tmp;
+}
+
+//
+// Negative test: sincos is a library function, but is not a builtin function
+// checked in CodeGenFunction::EmitCallExpr.
+// CHECK-LABEL: define dso_local float @test_sincos(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SIN:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[COS:%.*]] = alloca float, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[SIN]]) #[[ATTR9]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[SIN]]) #[[ATTR9]]
+// CHECK-NEXT:    ret float [[ADD]]
+//
+float test_sincos (float num[]) {
+   float sin, cos;
+   sincos(num[2], &sin, &cos);  // Don't emit TBAA metadata
+   float tmp = sin * cos + num[2];
+   return tmp;
+}
+
+// TODO: The builtin return a complex type
+// CHECK-LABEL: define dso_local float @test_cacoshf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0
+// CHECK-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1
+// CHECK-NEXT:    [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]]
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret float [[ADD]]
+//
+float test_cacoshf (float num[]) {
+   float _Complex z = cacoshf(num[2]);  // Don't emit TBAA metadata
+   float tmp = crealf(z) + num[2];
+   return tmp;
+}
+
+//.
+// NONEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NONEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META7]] = !{!"int", [[META4]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META9]] = !{!"double", [[META4]], i64 0}
+//.
+// NEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
+// NEWSTRUCTPATHTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"}
+// NEWSTRUCTPATHTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
+// NEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0, i64 4}
+// NEWSTRUCTPATHTBAA: [[META7]] = !{[[META4]], i64 4, !"int"}
+// NEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0, i64 8}
+// NEWSTRUCTPATHTBAA: [[META9]] = !{[[META4]], i64 8, !"double"}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// NEWSTRUCTPATHTBAA: {{.*}}
+// NONEWSTRUCTPATHTBAA: {{.*}}
diff --git a/clang/test/CodeGen/math-libcalls-tbaa.cpp b/clang/test/CodeGen/math-libcalls-tbaa.cpp
deleted file mode 100644
index 0b231d474df77..0000000000000
--- a/clang/test/CodeGen/math-libcalls-tbaa.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-
-// RUN:  %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NoNewStructPathTBAA
-// RUN:  %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NewStructPathTBAA
-
-extern "C" float expf(float);
-
-// Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis
-
-// CHECK-LABEL: define dso_local float @foo(
-// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]], float noundef [[R2INV:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
-// CHECK-NEXT:    [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR2:[0-9]+]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CALL]], [[TMP1]]
-// CHECK-NEXT:    ret float [[MUL]]
-//
-extern "C" float foo (float num[], float r2inv, int n) {
-   const float expm2 =  expf(num[10]);  // Emit TBAA metadata on @expf
-   float tmp = expm2 * num[10];
-   return tmp;
-}
-//.
-// NoNewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// NoNewStructPathTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0}
-// NoNewStructPathTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
-// NoNewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"}
-//.
-// NewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
-// NewStructPathTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"}
-// NewStructPathTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
-// NewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"}
-//.
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// NewStructPathTBAA: {{.*}}
-// NoNewStructPathTBAA: {{.*}}
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index a249182692762..5b23a4a3faef3 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -121,15 +121,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
   /* math */
   acos(f);       acosf(f);      acosl(f);
 
-// NO__ERRNO: declare double @acos(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @acosf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @acosl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.acos.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.acos.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.acos.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @acos(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @acosf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @acosl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @acos(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @acosf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @acosl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.acos.f64(
+// HAS_MAYTRAP: declare float  @llvm.experimental.constrained.acos.f32(
+// HAS_MAYTRAP: declare x86_fp80  @llvm.experimental.constrained.acos.f80(
 
 
   acosh(f);      acoshf(f);     acoshl(f);
@@ -146,15 +146,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   asin(f);       asinf(f);      asinl(f);
 
-// NO__ERRNO: declare double @asin(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @asinf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @asinl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.asin.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.asin.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.asin.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @asin(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @asinf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @asinl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @asin(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @asinf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @asinl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.asin.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.asin.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.asin.f80(
 
   asinh(f);      asinhf(f);     asinhl(f);
 
@@ -170,15 +170,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   atan(f);       atanf(f);      atanl(f);
 
-// NO__ERRNO: declare double @atan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @atanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @atanl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.atan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.atan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @atan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @atanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @atanl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @atan(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @atanf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @atanl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.atan.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.atan.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.atan.f80(
 
   atanh(f);      atanhf(f);     atanhl(f);
 
@@ -230,15 +230,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   cosh(f);       coshf(f);      coshl(f);
 
-// NO__ERRNO: declare double @cosh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @coshf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @coshl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.cosh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.cosh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.cosh.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @cosh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @coshf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @coshl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @cosh(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @coshf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @coshl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.cosh.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.cosh.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.cosh.f80(
 
   erf(f);        erff(f);       erfl(f);
 
@@ -638,15 +638,16 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   sinh(f);       sinhf(f);      sinhl(f);
 
-// NO__ERRNO: declare double @sinh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @sinhf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @sinhl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.sinh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.sinh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.sinh.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @sinh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @sinhf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @sinhl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @sinh(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @sinhf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @sinhl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.sinh.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.sinh.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.sinh.f80(
+
 
   sqrt(f);       sqrtf(f);      sqrtl(f);
 
@@ -674,15 +675,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   tanh(f);       tanhf(f);      tanhl(f);
 
-// NO__ERRNO: declare double @tanh(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanhf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanhl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tanh.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tanh.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tanh.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tanh(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanhf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanhl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @tanh(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @tanhf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @tanhl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tanh.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tanh.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.tanh.f80(
 
   tgamma(f);     tgammaf(f);    tgammal(f);
 
diff --git a/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp b/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp
new file mode 100644
index 0000000000000..b17d97bb04cad
--- /dev/null
+++ b/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm -std=c++20 -x c++ < %s | FileCheck -check-prefix=WIN64 %s
+
+struct A {
+  const int* ptr;
+};
+
+template<A> void tfn() {};
+
+// WIN64: ??$tfn@$2UA@@PEBH5CE?ints@@3QBHB06@@@@@YAXXZ
+constexpr int ints[] = { 1, 2, 7, 8, 9, -17, -10 };
+
+// WIN64: ??$tfn@$2UA@@PEBH5E?one_int@@3HB@@@@YAXXZ
+constexpr int one_int = 7;
+
+void template_instance() {
+  tfn<A{ints + sizeof(ints)/sizeof(int)}>();
+  tfn<A{&one_int + 1}>();
+}
+
diff --git a/clang/test/CodeGen/nofpclass.c b/clang/test/CodeGen/nofpclass.c
index fc4c64f9b921b..23470914d0ab4 100644
--- a/clang/test/CodeGen/nofpclass.c
+++ b/clang/test/CodeGen/nofpclass.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --version 2
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
-// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -cl-finite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
+// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
 
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=NONANS %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=NOINFS %s
diff --git a/clang/test/CodeGen/palignr.c b/clang/test/CodeGen/palignr.c
index 5a77597c34031..092937ac115de 100644
--- a/clang/test/CodeGen/palignr.c
+++ b/clang/test/CodeGen/palignr.c
@@ -14,18 +14,3 @@ int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }
 int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
 // CHECK: xor
 int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
-
-#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n)))
-typedef __attribute__((vector_size(8))) int int2;
-
-// CHECK: palignr
-int2 align5(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 8); }
-
-// CHECK: palignr
-int2 align6(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 9); }
-
-// CHECK: palignr
-int2 align7(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 16); }
-
-// CHECK: palignr
-int2 align8(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 7); }
diff --git a/clang/test/CodeGen/pr26099.c b/clang/test/CodeGen/pr26099.c
deleted file mode 100644
index 15b73b832e9d8..0000000000000
--- a/clang/test/CodeGen/pr26099.c
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=i686-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// REQUIRES: asserts
-
-#include <x86intrin.h>
-
-int __attribute__ ((__vector_size__ (8))) b;
-
-void bar(int a)
-{
-  b = __builtin_ia32_vec_init_v2si (0, a);
-}
\ No newline at end of file
diff --git a/clang/test/CodeGen/pr3518.c b/clang/test/CodeGen/pr3518.c
index f888add986258..a3cd866e92201 100644
--- a/clang/test/CodeGen/pr3518.c
+++ b/clang/test/CodeGen/pr3518.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fexperimental-new-constant-interpreter -emit-llvm -o - | FileCheck %s
 // PR 3518
 // Some of the objects were coming out as uninitialized (external) before 3518
 // was fixed.  Internal names are different between llvm-gcc and clang so they
diff --git a/clang/test/CodeGen/ptrauth-function-attributes.c b/clang/test/CodeGen/ptrauth-function-attributes.c
index 7ec30498b9d35..6a09cd37bf485 100644
--- a/clang/test/CodeGen/ptrauth-function-attributes.c
+++ b/clang/test/CodeGen/ptrauth-function-attributes.c
@@ -4,10 +4,16 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios  -fptrauth-calls   -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls  -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+// RUN: %clang_cc1 -triple arm64e-apple-ios  -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+
 // ALL: define {{(dso_local )?}}void @test() #0
 void test() {
 }
 
 // CALLS: attributes #0 = {{{.*}} "ptrauth-calls" {{.*}}}
 
+// GOTOS: attributes #0 = {{{.*}} "ptrauth-indirect-gotos" {{.*}}}
+
 // OFF-NOT: attributes {{.*}} "ptrauth-
diff --git a/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c b/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c
new file mode 100644
index 0000000000000..40bba99478192
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 %s -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- -fptrauth-function-pointer-type-discrimination | FileCheck -check-prefixes CHECK,TYPE %s
+// RUN: %clang_cc1 %s -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- -fptrauth-function-pointer-type-discrimination | FileCheck -check-prefixes CHECK,TYPE %s
+// RUN: %clang_cc1 %s -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- | FileCheck -check-prefixes CHECK,ZERO %s
+// RUN: %clang_cc1 %s -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- | FileCheck -check-prefixes CHECK,ZERO %s
+
+typedef void (*fptr_t)(void);
+
+char *cptr;
+void (*fptr)(void);
+
+// CHECK-LABEL: define{{.*}} void @test1
+void test1() {
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @cptr
+  // TYPE: [[TOINT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE: call i64 @llvm.ptrauth.resign(i64 [[TOINT]], i32 0, i64 0, i32 0, i64 18983)
+  // TYPE: call void {{.*}}() [ "ptrauth"(i32 0, i64 18983) ]
+  // ZERO-NOT: @llvm.ptrauth.resign
+
+  (*(fptr_t)cptr)();
+}
+
+// CHECK-LABEL: define{{.*}} i8 @test2
+char test2() {
+  return *(char *)fptr;
+
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr
+  // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null
+  // TYPE-NEXT: br i1 [[CMP]], label %[[NONNULL:.*]], label %[[CONT:.*]]
+
+  // TYPE: [[NONNULL]]:
+  // TYPE: [[TOINT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE: [[CALL:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[TOINT]], i32 0, i64 18983, i32 0, i64 0)
+  // TYPE: [[TOPTR:%.*]] = inttoptr i64 [[CALL]] to ptr
+
+  // TYPE: [[CONT]]:
+  // TYPE: phi ptr [ null, {{.*}} ], [ [[TOPTR]], %[[NONNULL]] ]
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
+
+// CHECK-LABEL: define{{.*}} void @test4
+void test4() {
+  (*((fptr_t)(&*((char *)(&*(fptr_t)cptr)))))();
+
+  // CHECK: [[LOAD:%.*]] = load ptr, ptr @cptr
+  // TYPE-NEXT: [[CAST4:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE-NEXT: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[CAST4]], i32 0, i64 0, i32 0, i64 18983)
+  // TYPE-NEXT: [[CAST5:%.*]] = inttoptr i64 [[RESIGN]] to ptr
+  // TYPE-NEXT: call void [[CAST5]]() [ "ptrauth"(i32 0, i64 18983) ]
+  // ZERO-NOT: @llvm.ptrauth.resign
+  // ZERO: call void [[LOAD]]() [ "ptrauth"(i32 0, i64 0) ]
+}
+
+void *vptr;
+// CHECK-LABEL: define{{.*}} void @test5
+void test5() {
+  vptr = &*(char *)fptr;
+
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr
+  // TYPE-NEXT: [[CMP]] = icmp ne ptr [[LOAD]], null
+  // TYPE-NEXT: br i1 [[CMP]], label %[[NONNULL:.*]], label %[[CONT:.*]]
+
+  // TYPE: [[NONNULL]]:
+  // TYPE: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 18983, i32 0, i64 0)
+  // TYPE: [[CAST:%.*]] = inttoptr i64 [[RESIGN]] to ptr
+
+  // TYPE: [[CONT]]:
+  // TYPE: [[PHI:%.*]] = phi ptr [ null, {{.*}} ], [ [[CAST]], %[[NONNULL]] ]
+  // TYPE: store ptr [[PHI]], ptr @vptr
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
diff --git a/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c b/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c
new file mode 100644
index 0000000000000..1a1dce6f4a66e
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c
@@ -0,0 +1,108 @@
+// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -o-       | FileCheck %s --check-prefixes=CHECK,TYPE
+
+// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -o-       | FileCheck %s --check-prefixes=CHECK,TYPE
+
+// RUN: %clang_cc1 %s                                                -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -o-       | FileCheck %s --check-prefixes=CHECK,ZERO
+
+// RUN: %clang_cc1 %s                                                -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -o-       | FileCheck %s --check-prefixes=CHECK,ZERO
+
+// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -xc++ -o- | FileCheck %s --check-prefixes=CHECK,CHECKCXX,TYPE,TYPECXX
+
+// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -xc++ -o- | FileCheck %s --check-prefixes=CHECK,CHECKCXX,TYPE,TYPECXX
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void f(void);
+void f2(int);
+void (*fptr)(void);
+void *opaque;
+unsigned long uintptr;
+
+#ifdef __cplusplus
+struct ptr_member {
+  void (*fptr_)(int) = 0;
+};
+ptr_member pm;
+void (*test_member)() = (void (*)())pm.fptr_;
+
+// CHECKCXX-LABEL: define{{.*}} internal void @__cxx_global_var_init
+// TYPECXX: call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 2712, i32 0, i64 18983)
+#endif
+
+
+// CHECK-LABEL: define{{.*}} void @test_cast_to_opaque
+void test_cast_to_opaque() {
+  opaque = (void *)f;
+
+  // TYPE: [[RESIGN_VAL:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @f, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 0, i64 0)
+  // TYPE: [[RESIGN_PTR:%.*]] = inttoptr i64 [[RESIGN_VAL]] to ptr
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
+
+// CHECK-LABEL: define{{.*}} void @test_cast_from_opaque
+void test_cast_from_opaque() {
+  fptr = (void (*)(void))opaque;
+
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @opaque
+  // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null
+  // TYPE: br i1 [[CMP]], label %[[RESIGN_LAB:.*]], label
+
+  // TYPE: [[RESIGN_LAB]]:
+  // TYPE: [[INT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE: [[RESIGN_INT:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[INT]], i32 0, i64 0, i32 0, i64 18983)
+
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
+
+// CHECK-LABEL: define{{.*}} void @test_cast_to_intptr
+void test_cast_to_intptr() {
+  uintptr = (unsigned long)fptr;
+
+  // TYPE: [[ENTRY:.*]]:
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr
+  // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null
+  // TYPE: br i1 [[CMP]], label %[[RESIGN_LAB:.*]], label %[[RESIGN_CONT:.*]]
+
+  // TYPE: [[RESIGN_LAB]]:
+  // TYPE: [[INT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE: [[RESIGN_INT:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[INT]], i32 0, i64 18983, i32 0, i64 0)
+  // TYPE: [[RESIGN:%.*]] = inttoptr i64 [[RESIGN_INT]] to ptr
+  // TYPE: br label %[[RESIGN_CONT]]
+
+  // TYPE: [[RESIGN_CONT]]:
+  // TYPE: phi ptr [ null, %[[ENTRY]] ], [ [[RESIGN]], %[[RESIGN_LAB]] ]
+
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
+
+// CHECK-LABEL: define{{.*}} void @test_function_to_function_cast
+void test_function_to_function_cast() {
+  void (*fptr2)(int) = (void (*)(int))fptr;
+  // TYPE: call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 18983, i32 0, i64 2712)
+  // ZERO-NOT: @llvm.ptrauth.resign
+}
+
+// CHECK-LABEL: define{{.*}} void @test_call_lvalue_cast
+void test_call_lvalue_cast() {
+  (*(void (*)(int))f)(42);
+
+  // TYPE: entry:
+  // TYPE-NEXT: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @f, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 0, i64 2712)
+  // TYPE-NEXT: [[RESIGN_INT:%.*]] = inttoptr i64 [[RESIGN]] to ptr
+  // TYPE-NEXT: call void [[RESIGN_INT]](i32 noundef 42) [ "ptrauth"(i32 0, i64 2712) ]
+  // ZERO-NOT: @llvm.ptrauth.resign
+  // ZERO: call void ptrauth (ptr @f, i32 0)(i32 noundef 42) [ "ptrauth"(i32 0, i64 0) ]
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/clang/test/CodeGen/ptrauth-function-type-discriminator.c b/clang/test/CodeGen/ptrauth-function-type-discriminator.c
index 5dea48fe5915b..0952c1abf6c07 100644
--- a/clang/test/CodeGen/ptrauth-function-type-discriminator.c
+++ b/clang/test/CodeGen/ptrauth-function-type-discriminator.c
@@ -1,7 +1,18 @@
-// RUN: %clang_cc1 %s       -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKC
-// RUN: %clang_cc1 -xc++ %s -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-pch %s -o %t.ast
-// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -x ast -o - %t.ast | FileCheck -check-prefix=CHECK --check-prefix=CHECKC %s
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm %s       -o- | FileCheck --check-prefixes=CHECK,CHECKC %s
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -xc++ %s -o- | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios    -fptrauth-calls -fptrauth-intrinsics -emit-pch %s -o %t.ast
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios    -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -emit-llvm -x ast -o - %t.ast | FileCheck --check-prefixes=CHECK,CHECKC %s
+
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm %s       -o- | FileCheck --check-prefixes=CHECK,CHECKC %s
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -disable-llvm-passes -emit-llvm -xc++ %s -o- | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics -emit-pch %s -o %t.ast
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu  -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -emit-llvm -x ast -o - %t.ast | FileCheck --check-prefixes=CHECK,CHECKC %s
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,6 +30,13 @@ void (*test_constant_null)(int) = 0;
 // CHECK: @test_constant_cast = global ptr ptrauth (ptr @f, i32 0, i64 2712)
 void (*test_constant_cast)(int) = (void (*)(int))f;
 
+#ifndef __cplusplus
+// CHECKC: @enum_func_ptr = global ptr ptrauth (ptr @enum_func, i32 0, i64 2712)
+enum Enum0;
+void enum_func(enum Enum0);
+void (*enum_func_ptr)(enum Enum0) = enum_func;
+#endif
+
 // CHECK: @test_opaque = global ptr ptrauth (ptr @f, i32 0)
 void *test_opaque =
 #ifdef __cplusplus
@@ -47,14 +65,14 @@ void (*fptr3)(void) = __builtin_ptrauth_sign_constant(&external_function, 2, 26)
 // CHECK: @fptr4 = global ptr ptrauth (ptr @external_function, i32 2, i64 26, ptr @fptr4)
 void (*fptr4)(void) = __builtin_ptrauth_sign_constant(&external_function, 2, __builtin_ptrauth_blend_discriminator(&fptr4, 26));
 
-// CHECK-LABEL: define void @test_call()
+// CHECK-LABEL: define{{.*}} void @test_call()
 void test_call() {
   // CHECK:      [[T0:%.*]] = load ptr, ptr @fnptr,
   // CHECK-NEXT: call void [[T0]]() [ "ptrauth"(i32 0, i64 18983) ]
   fnptr();
 }
 
-// CHECK-LABEL: define ptr @test_function_pointer()
+// CHECK-LABEL: define{{.*}} ptr @test_function_pointer()
 // CHECK:  ret ptr ptrauth (ptr @external_function, i32 0, i64 18983)
 void (*test_function_pointer())(void) {
   return external_function;
@@ -62,14 +80,14 @@ void (*test_function_pointer())(void) {
 
 struct InitiallyIncomplete;
 extern struct InitiallyIncomplete returns_initially_incomplete(void);
-// CHECK-LABEL: define void @use_while_incomplete()
+// CHECK-LABEL: define{{.*}} void @use_while_incomplete()
 void use_while_incomplete() {
   // CHECK:      [[VAR:%.*]] = alloca ptr,
   // CHECK-NEXT: store ptr ptrauth (ptr @returns_initially_incomplete, i32 0, i64 25106), ptr [[VAR]]
   struct InitiallyIncomplete (*fnptr)(void) = &returns_initially_incomplete;
 }
 struct InitiallyIncomplete { int x; };
-// CHECK-LABEL: define void @use_while_complete()
+// CHECK-LABEL: define{{.*}} void @use_while_complete()
 void use_while_complete() {
   // CHECK:      [[VAR:%.*]] = alloca ptr,
   // CHECK-NEXT: store ptr ptrauth (ptr @returns_initially_incomplete, i32 0, i64 25106), ptr [[VAR]]
@@ -83,7 +101,7 @@ void knr(param)
   int param;
 {}
 
-// CHECKC-LABEL: define void @test_knr
+// CHECKC-LABEL: define{{.*}} void @test_knr
 void test_knr() {
   void (*p)() = knr;
   p(0);
@@ -94,7 +112,7 @@ void test_knr() {
   // CHECKC: call void [[LOAD]](i32 noundef 0) [ "ptrauth"(i32 0, i64 18983) ]
 }
 
-// CHECKC-LABEL: define void @test_redeclaration
+// CHECKC-LABEL: define{{.*}} void @test_redeclaration
 void test_redeclaration() {
   void redecl();
   void (*ptr)() = redecl;
@@ -113,7 +131,7 @@ void knr2(param)
      int param;
 {}
 
-// CHECKC-LABEL: define void @test_redecl_knr
+// CHECKC-LABEL: define{{.*}} void @test_redecl_knr
 void test_redecl_knr() {
   void (*p)() = knr2;
   p();
diff --git a/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp b/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp
index 6c36004641477..8045e7bdc7460 100644
--- a/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp
+++ b/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -triple arm64e-apple-ios15 -fsanitize=vptr -O0 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple arm64e-apple-ios15 -fsanitize=vptr -O2 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
+// RUN: %clang_cc1 -triple aarch64-linux-gnu  -fsanitize=vptr -O0 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu  -fsanitize=vptr -O2 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+
 struct S {
   S() {}
   ~S() {}
diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
index 388a3df547a73..da0c809148018 100644
--- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c
+++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
@@ -12,7 +12,7 @@
 //.
 // CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
 // CHECK-LABEL: define dso_local void @escape
-// CHECK-SAME: (ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections !2 {
+// CHECK-SAME: (ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META2:![0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret void
 //
@@ -23,13 +23,13 @@ __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) {
 
 // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none)
 // CHECK-LABEL: define dso_local i32 @normal_function
-// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections !4 {
+// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META4:![0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections !10
+// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11:![0-9]+]]
 // CHECK-NEXT:    notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA11:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12:![0-9]+]]
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 int normal_function(int *x, int *y) {
@@ -46,7 +46,7 @@ int normal_function(int *x, int *y) {
 // CHECK-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]]
 // CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4
 // CHECK-NEXT:    notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA11]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]]
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_instrumentation(int *x, int *y) {
@@ -57,13 +57,13 @@ __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_ins
 
 // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none)
 // CHECK-LABEL: define dso_local i32 @test_no_sanitize_thread
-// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections !13 {
+// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]]
-// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections !10
+// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]]
 // CHECK-NEXT:    notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA11]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]]
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int *y) {
@@ -74,13 +74,13 @@ __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int *
 
 // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none)
 // CHECK-LABEL: define dso_local i32 @test_no_sanitize_all
-// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections !13 {
+// CHECK-SAME: (ptr noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]]
-// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections !10
+// CHECK-NEXT:    store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]]
 // CHECK-NEXT:    notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA11]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]]
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
@@ -89,23 +89,9 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
   return *y;
 }
 //.
-// CHECK: attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #1 = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #2 = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #3 = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #4 = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-//.
-// CHECK: !2 = !{!"sanmd_covered2!C", !3}
-// CHECK: !3 = !{i64 0}
-// CHECK: !4 = !{!"sanmd_covered2!C", !5}
-// CHECK: !5 = !{i64 3}
-// CHECK: !6 = !{!7, !7, i64 0}
-// CHECK: !7 = !{!"any pointer", !8, i64 0}
-// CHECK: !8 = !{!"omnipotent char", !9, i64 0}
-// CHECK: !9 = !{!"Simple C/C++ TBAA"}
-// CHECK: !10 = !{!"sanmd_atomics2!C"}
-// CHECK: !11 = !{!12, !12, i64 0}
-// CHECK: !12 = !{!"int", !8, i64 0}
-// CHECK: !13 = !{!"sanmd_covered2!C", !14}
-// CHECK: !14 = !{i64 2}
+// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR2]] = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 //.
diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c
index b9ebe87982001..75d8c3d501750 100644
--- a/clang/test/CodeGen/tbaa-pointers.c
+++ b/clang/test/CodeGen/tbaa-pointers.c
@@ -1,80 +1,108 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck --check-prefixes=COMMON,DEFAULT %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes -pointer-tbaa %s -emit-llvm -o - | FileCheck --check-prefixes=COMMON,ENABLED %s
+// RUN: %clang --target=x86_64-apple-darwin -O1 %s -emit-llvm -S -mllvm -disable-llvm-optzns -o - | FileCheck --check-prefixes=COMMON,DEFAULT %s
+// RUN: %clang --target=x86_64-apple-darwin -O1 -fpointer-tbaa %s -emit-llvm -S -mllvm -disable-llvm-optzns -o - | FileCheck --check-prefixes=COMMON,ENABLED %s
 
 void p2unsigned(unsigned **ptr) {
-  // CHECK-LABEL: define void @p2unsigned(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:  %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:  store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0:!.+]]
-  // CHECK-NEXT:  [[BASE:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:  store ptr null, ptr [[BASE]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:  ret void
+  // COMMON-LABEL: define void @p2unsigned(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:        [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:  store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0:!.+]]
+  // ENABLED-NEXT:  [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]]
+  // ENABLED-NEXT:  store ptr null, ptr [[BASE]], align 8, !tbaa [[P1INT_0:!.+]]
+  // DEFAULT-NEXT:  store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]]
+  // DEFAULT-NEXT:  [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:  store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:   ret void
   //
   *ptr = 0;
 }
 
 void p2unsigned_volatile(unsigned *volatile *ptr) {
-  // CHECK-LABEL: define void @p2unsigned_volatile(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store volatile ptr null, ptr [[BASE]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p2unsigned_volatile(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]]
+  // ENABLED-NEXT:   [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]]
+  // ENABLED-NEXT:   store volatile ptr null, ptr [[BASE]], align 8, !tbaa [[P1INT_0]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store volatile ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:    ret void
   //
   *ptr = 0;
 }
 
 void p3int(int ***ptr) {
-  // CHECK-LABEL: define void @p3int(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_0:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store ptr null, ptr [[BASE_1]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p3int(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P3INT_0:!.+]]
+  // ENABLED-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P3INT_0]]
+  // ENABLED-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P2INT_0]]
+  // ENABLED-NEXT:   store ptr null, ptr [[BASE_1]], align 8, !tbaa [[P1INT_0]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store ptr null, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:    ret void
   //
   **ptr = 0;
 }
 
 void p4char(char ****ptr) {
-  // CHECK-LABEL: define void @p4char(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_0:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p4char(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0:!.+]]
+  // ENABLED-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0:!.+]]
+  // ENABLED-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0:!.+]]
+  // ENABLED-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0:!.+]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:    ret void
   //
   ***ptr = 0;
 }
 
 void p4char_const1(const char ****ptr) {
-  // CHECK-LABEL: define void @p4char_const1(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_0:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p4char_const1(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0]]
+  // ENABLED-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:   ret void
   //
   ***ptr = 0;
 }
 
 void p4char_const2(const char **const **ptr) {
-  // CHECK-LABEL: define void @p4char_const2(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_0:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p4char_const2(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0]]
+  // ENABLED-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0]]
+  // ENABLED-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:    ret void
   //
   ***ptr = 0;
 }
@@ -85,19 +113,35 @@ struct S1 {
 };
 
 void p2struct(struct S1 **ptr) {
-  // CHECK-LABEL: define void @p2struct(ptr noundef %ptr)
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %ptr.addr = alloca ptr, align 8
-  // CHECK-NEXT:   store ptr %ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   [[BASE:%.+]] = load ptr, ptr %ptr.addr, align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   store ptr null, ptr [[BASE]], align 8, !tbaa [[ANY_POINTER_0]]
-  // CHECK-NEXT:   ret void
+  // COMMON-LABEL: define void @p2struct(
+  // COMMON-SAME:    ptr noundef [[PTR:%.+]])
+  // COMMON:         [[PTR_ADDR:%.+]] = alloca ptr, align 8
+  // ENABLED-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0:!.+]]
+  // ENABLED-NEXT:   [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0]]
+  // ENABLED-NEXT:   store ptr null, ptr [[BASE]], align 8, !tbaa [[P1S1_:!.+]]
+  // DEFAULT-NEXT:   store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+  // DEFAULT-NEXT:   store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]]
+  // COMMON-NEXT:    ret void
   //
   *ptr = 0;
 }
 
-// CHECK: [[ANY_POINTER_0]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0}
-// CHECK: [[ANY_POINTER]] = !{!"any pointer", [[CHAR:!.+]], i64 0}
-// CHECK: [[CHAR]] = !{!"omnipotent char", [[TBAA_ROOT:!.+]], i64 0}
-// CHECK: [[TBAA_ROOT]] = !{!"Simple C/C++ TBAA"}
-//
+// ENABLED: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0}
+// ENABLED: [[P2INT]] = !{!"p2 int", [[ANY_POINTER:!.+]], i64 0}
+// DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0}
+// COMMON: [[ANY_POINTER]] = !{!"any pointer", [[CHAR:!.+]], i64 0}
+// COMMON: [[CHAR]] = !{!"omnipotent char", [[TBAA_ROOT:!.+]], i64 0}
+// COMMON: [[TBAA_ROOT]] = !{!"Simple C/C++ TBAA"}
+// ENABLED: [[P1INT_0]] = !{[[P1INT:!.+]], [[P1INT]], i64 0}
+// ENABLED: [[P1INT]] = !{!"p1 int", [[ANY_POINTER]], i64 0}
+// ENABLED: [[P3INT_0]] = !{[[P3INT:!.+]], [[P3INT]], i64 0}
+// ENABLED: [[P3INT]] = !{!"p3 int", [[ANY_POINTER]], i64 0}
+// ENABLED: [[P4CHAR_0]] = !{[[P4CHAR:!.+]], [[P4CHAR]], i64 0}
+// ENABLED: [[P4CHAR]] = !{!"p4 omnipotent char", [[ANY_POINTER]], i64 0}
+// ENABLED: [[P3CHAR_0]] = !{[[P3CHAR:!.+]], [[P3CHAR]], i64 0}
+// ENABLED: [[P3CHAR]] = !{!"p3 omnipotent char", [[ANY_POINTER]], i64 0}
+// ENABLED: [[P2CHAR_0]] = !{[[P2CHAR:!.+]], [[P2CHAR]], i64 0}
+// ENABLED: [[P2CHAR]] = !{!"p2 omnipotent char", [[ANY_POINTER]], i64 0}
+// ENABLED: [[P1CHAR_0]] = !{[[P1CHAR:!.+]], [[P1CHAR]], i64 0}
+// ENABLED: [[P1CHAR]] = !{!"p1 omnipotent char", [[ANY_POINTER]], i64 0}
diff --git a/clang/test/CodeGen/ubsan-function.cpp b/clang/test/CodeGen/ubsan-function.cpp
index bc37a61c98ee3..76d4237383f83 100644
--- a/clang/test/CodeGen/ubsan-function.cpp
+++ b/clang/test/CodeGen/ubsan-function.cpp
@@ -4,6 +4,9 @@
 // RUN: %clang_cc1 -triple aarch64_be-linux-gnu -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all | FileCheck %s --check-prefixes=CHECK,GNU,64
 // RUN: %clang_cc1 -triple arm-none-eabi -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all | FileCheck %s --check-prefixes=CHECK,ARM,GNU,32
 
+// RUN: %clang_cc1 -triple arm64e-apple-ios  -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all -fptrauth-calls | FileCheck %s --check-prefixes=CHECK,GNU,64,AUTH
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all -fptrauth-calls | FileCheck %s --check-prefixes=CHECK,GNU,64,AUTH
+
 // GNU:  define{{.*}} void @_Z3funv() #0 !func_sanitize ![[FUNCSAN:.*]] {
 // MSVC: define{{.*}} void @"?fun@@YAXXZ"() #0 !func_sanitize ![[FUNCSAN:.*]] {
 void fun() {}
@@ -13,6 +16,8 @@ void fun() {}
 // ARM:   ptrtoint ptr {{.*}} to i32, !nosanitize !5
 // ARM:   and i32 {{.*}}, -2, !nosanitize !5
 // ARM:   inttoptr i32 {{.*}} to ptr, !nosanitize !5
+// AUTH:  %[[STRIPPED:.*]] = ptrtoint ptr {{.*}} to i64, !nosanitize
+// AUTH:  call i64 @llvm.ptrauth.auth(i64 %[[STRIPPED]], i32 0, i64 0), !nosanitize
 // CHECK: getelementptr <{ i32, i32 }>, ptr {{.*}}, i32 -1, i32 0, !nosanitize
 // CHECK: load i32, ptr {{.*}}, align {{.*}}, !nosanitize
 // CHECK: icmp eq i32 {{.*}}, -1056584962, !nosanitize
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index eeb23bc7e1c01..8bf8241e343e7 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -1,6 +1,10 @@
 // RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
-// RUN:   -fnative-half-arguments-and-returns | FileCheck %s
+// RUN:   -fnative-half-arguments-and-returns | FileCheck -check-prefixes=CHECK,SAFEIR %s
+
+// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
+// RUN:   -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
+// RUN:   -fnative-half-arguments-and-returns -munsafe-fp-atomics | FileCheck -check-prefixes=CHECK,UNSAFEIR %s
 
 // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device -target-cpu gfx1100 -fnative-half-type \
@@ -18,24 +22,38 @@
 
 __global__ void ffp1(float *p) {
   // CHECK-LABEL: @_Z4ffp1Pf
-  // CHECK: atomicrmw fadd ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE: _Z4ffp1Pf
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
+  // SAFE: global_atomic_cmpswap
+
   // UNSAFE: _Z4ffp1Pf
   // UNSAFE: global_atomic_add_f32
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
+  // UNSAFE: global_atomic_cmpswap
+
   __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
+  __atomic_fetch_sub(p, 1.0f, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0f, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0f, memory_order_relaxed);
   __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
@@ -44,23 +62,36 @@ __global__ void ffp1(float *p) {
 
 __global__ void ffp2(double *p) {
   // CHECK-LABEL: @_Z4ffp2Pd
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE-LABEL: @_Z4ffp2Pd
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
+  // SAFE: global_atomic_cmpswap_b64
+
   // UNSAFE-LABEL: @_Z4ffp2Pd
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
+  // UNSAFE: global_atomic_add_f64
   // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
+  __atomic_fetch_add(p, 1.0, memory_order_relaxed);
   __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0, memory_order_relaxed);
@@ -71,11 +102,20 @@ __global__ void ffp2(double *p) {
 // long double is the same as double for amdgcn.
 __global__ void ffp3(long double *p) {
   // CHECK-LABEL: @_Z4ffp3Pe
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE-LABEL: @_Z4ffp3Pe
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
@@ -84,10 +124,11 @@ __global__ void ffp3(long double *p) {
   // SAFE: global_atomic_cmpswap_b64
   // UNSAFE-LABEL: @_Z4ffp3Pe
   // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
+  __atomic_fetch_add(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0L, memory_order_relaxed);
@@ -98,37 +139,52 @@ __global__ void ffp3(long double *p) {
 __device__ double ffp4(double *p, float f) {
   // CHECK-LABEL: @_Z4ffp4Pdf
   // CHECK: fpext float {{.*}} to double
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   return __atomic_fetch_sub(p, f, memory_order_relaxed);
 }
 
 __device__ double ffp5(double *p, int i) {
   // CHECK-LABEL: @_Z4ffp5Pdi
   // CHECK: sitofp i32 {{.*}} to double
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   return __atomic_fetch_sub(p, i, memory_order_relaxed);
 }
 
 __global__ void ffp6(_Float16 *p) {
   // CHECK-LABEL: @_Z4ffp6PDF16
-  // CHECK: atomicrmw fadd ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE: _Z4ffp6PDF16
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
+  // SAFE: global_atomic_cmpswap
+
   // UNSAFE: _Z4ffp6PDF16
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
+  // UNSAFE: global_atomic_cmpswap
   __atomic_fetch_add(p, 1.0, memory_order_relaxed);
+  __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0, memory_order_relaxed);
   __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index a5135ab01f0f3..70c86cbb8c3d4 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -16,9 +16,8 @@
 
 // HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce)
-// CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
+// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
 // OPT: ret void
@@ -28,9 +27,8 @@ __global__ void kernel1(int *x) {
 
 // HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
-// CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
 // OPT: ret void
@@ -67,7 +65,7 @@ struct S {
 // OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
 // OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
-// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
@@ -126,9 +124,8 @@ struct SS {
 };
 // HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce)
-// CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4{{$}}
 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
 // OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4
 // OPT: ret void
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 2e88afac813f4..4bf23e529c7a5 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -17,17 +17,16 @@
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
 // CHECK-NEXT:    [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DISPATCH_PTR]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT:    store ptr [[TMP2]], ptr [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_dispatch_ptr(int* out) {
@@ -43,17 +42,16 @@ __global__ void use_dispatch_ptr(int* out) {
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
 // CHECK-NEXT:    [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[QUEUE_PTR]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT:    store ptr [[TMP2]], ptr [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_queue_ptr(int* out) {
@@ -69,17 +67,16 @@ __global__ void use_queue_ptr(int* out) {
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
 // CHECK-NEXT:    [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMPLICITARG_PTR]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT:    store ptr [[TMP2]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_implicitarg_ptr(int* out) {
@@ -134,16 +131,15 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
 // CHECK-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT:    store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT:    store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void test_ds_fmin(float src, float *shared) {
@@ -184,17 +180,16 @@ __global__ void endpgm() {
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
 // CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
 // CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    store i64 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i64 [[TMP3]], ptr [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
 // CHECK-NEXT:    ret void
 //
 __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
@@ -210,13 +205,12 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i64 [[TMP0]], ptr [[TMP1]], align 8
 // CHECK-NEXT:    ret void
 //
 __global__ void test_s_memtime(unsigned long long* out)
@@ -237,18 +231,17 @@ __device__ void func(float *x);
 // CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
 // CHECK-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT:    store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT:    store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call void @_Z4funcPf(ptr noundef [[TMP4]]) #[[ATTR7:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
@@ -264,14 +257,13 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
 // CHECK-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
 // CHECK-NEXT:    [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
-// CHECK-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT:    store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP0]])
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8
+// CHECK-NEXT:    store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
 __global__ void test_is_shared(float *x){
@@ -286,14 +278,13 @@ __global__ void test_is_shared(float *x){
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
 // CHECK-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr
-// CHECK-NEXT:    store ptr [[TMP0]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
 // CHECK-NEXT:    [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
-// CHECK-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT:    store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP0]])
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8
+// CHECK-NEXT:    store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
 __global__ void test_is_private(int *x){
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 32851805298f1..1cbe358910b85 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -17,16 +17,15 @@
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_dispatch_ptr(int* out) {
@@ -42,16 +41,15 @@ __global__ void use_dispatch_ptr(int* out) {
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_queue_ptr(int* out) {
@@ -67,16 +65,15 @@ __global__ void use_queue_ptr(int* out) {
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void use_implicitarg_ptr(int* out) {
@@ -131,16 +128,15 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT:    store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
 __global__ void test_ds_fmin(float src, float *shared) {
@@ -175,17 +171,16 @@ __global__ void endpgm() {
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i64 [[TMP3]], ptr addrspace(4) [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
 // CHECK-NEXT:    ret void
 //
 __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
@@ -201,13 +196,12 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8
 // CHECK-NEXT:    ret void
 //
 __global__ void test_s_memtime(unsigned long long* out)
@@ -228,18 +222,17 @@ __device__ void func(float *x);
 // CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT:    store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR6:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
@@ -255,15 +248,14 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT:    [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP2]])
-// CHECK-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT:    store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
 __global__ void test_is_shared(float *x){
@@ -278,15 +270,14 @@ __global__ void test_is_shared(float *x){
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT:    [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP2]])
-// CHECK-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT:    store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
 __global__ void test_is_private(int *x){
diff --git a/clang/test/CodeGenCUDA/convergent.cu b/clang/test/CodeGenCUDA/convergent.cu
index 5d98d4ba69262..334e5ec34d7cd 100644
--- a/clang/test/CodeGenCUDA/convergent.cu
+++ b/clang/test/CodeGenCUDA/convergent.cu
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
@@ -10,36 +11,89 @@
 
 #include "Inputs/cuda.h"
 
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3foov
+// DEVICE-LABEL: define dso_local void @_Z3foov(
+// DEVICE-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    ret void
+//
 __device__ void foo() {}
+// DEVICE-LABEL: define dso_local void @_Z3baxv(
+// DEVICE-SAME: ) #[[ATTR1:[0-9]+]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    ret void
+//
+[[clang::noconvergent]] __device__ void bax() {}
 
-// HOST: Function Attrs:
-// HOST-NOT: convergent
-// HOST-NEXT: define{{.*}} void @_Z3barv
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3barv
 __host__ __device__ void baz();
+
+__host__ __device__ float aliasf0(int) asm("something");
+__host__ __device__ [[clang::noconvergent]] float aliasf1(int) asm("somethingelse");
+
+// DEVICE-LABEL: define dso_local void @_Z3barv(
+// DEVICE-SAME: ) #[[ATTR0]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    [[X:%.*]] = alloca i32, align 4
+// DEVICE-NEXT:    call void @_Z3bazv() #[[ATTR4:[0-9]+]]
+// DEVICE-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", "=l"() #[[ATTR5:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// DEVICE-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
+// DEVICE-NEXT:    call void asm sideeffect "trap", ""() #[[ATTR4]], !srcloc [[META4:![0-9]+]]
+// DEVICE-NEXT:    call void asm sideeffect "nop", ""() #[[ATTR6:[0-9]+]], !srcloc [[META5:![0-9]+]]
+// DEVICE-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]]) #[[ATTR4]]
+// DEVICE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT:    [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]]) #[[ATTR6]]
+// DEVICE-NEXT:    ret void
+//
+// HOST-LABEL: define dso_local void @_Z3barv(
+// HOST-SAME: ) #[[ATTR0:[0-9]+]] {
+// HOST-NEXT:  [[ENTRY:.*:]]
+// HOST-NEXT:    [[X:%.*]] = alloca i32, align 4
+// HOST-NEXT:    call void @_Z3bazv()
+// HOST-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", "=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR2:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// HOST-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
+// HOST-NEXT:    call void asm sideeffect "trap", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// HOST-NEXT:    call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3]], !srcloc [[META4:![0-9]+]]
+// HOST-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]])
+// HOST-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT:    [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]])
+// HOST-NEXT:    ret void
+//
 __host__ __device__ void bar() {
-  // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
   baz();
-  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
   int x;
-  asm ("trap;" : "=l"(x));
-  // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
-  asm volatile ("trap;");
+  asm ("trap" : "=l"(x));
+  asm volatile ("trap");
+  [[clang::noconvergent]] { asm volatile ("nop"); }
+  aliasf0(x);
+  aliasf1(x);
 }
 
-// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// DEVICE: attributes [[BAZ_ATTR]] = {
-// DEVICE-SAME: convergent
-// DEVICE-SAME: }
-// DEVICE-DAG: attributes [[CALL_ATTR]] = { convergent
-// DEVICE-DAG: attributes [[ASM_ATTR]] = { convergent
-
-// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// HOST: attributes [[BAZ_ATTR]] = {
-// HOST-NOT: convergent
-// HOST-SAME: }
+
+//.
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
+// DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
+// DEVICE: attributes #[[ATTR6]] = { nounwind }
+//.
+// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR2]] = { nounwind memory(none) }
+// HOST: attributes #[[ATTR3]] = { nounwind }
+//.
+// DEVICE: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// DEVICE: [[META1:![0-9]+]] = !{i32 7, !"nvvm-reflect-ftz", i32 0}
+// DEVICE: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// DEVICE: [[META3]] = !{i64 3120}
+// DEVICE: [[META4]] = !{i64 3155}
+// DEVICE: [[META5]] = !{i64 3206}
+//.
+// HOST: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// HOST: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// HOST: [[META2]] = !{i64 3120}
+// HOST: [[META3]] = !{i64 3155}
+// HOST: [[META4]] = !{i64 3206}
+//.
diff --git a/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp b/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp
new file mode 100644
index 0000000000000..79c07c0d9db11
--- /dev/null
+++ b/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: @_Z23used_before_default_defv = weak_odr ifunc void (), ptr @_Z23used_before_default_defv.resolver
+// CHECK: @_Z22used_after_default_defv = weak_odr ifunc void (), ptr @_Z22used_after_default_defv.resolver
+// CHECK-NOT: @_Z24used_before_default_declv = weak_odr ifunc void (), ptr @_Z24used_before_default_declv.resolver
+// CHECK-NOT: @_Z23used_after_default_declv = weak_odr ifunc void (), ptr @_Z23used_after_default_declv.resolver
+// CHECK-NOT: @_Z15used_no_defaultv = weak_odr ifunc void (), ptr @_Z15used_no_defaultv.resolver
+// CHECK-NOT: @_Z19not_used_no_defaultv = weak_odr ifunc void (), ptr @_Z19not_used_no_defaultv.resolver
+// CHECK: @_Z21not_used_with_defaultv = weak_odr ifunc void (), ptr @_Z21not_used_with_defaultv.resolver
+
+
+// Test that an ifunc is generated and used when the default
+// version is defined after the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_before_default_def(void) {}
+// CHECK-LABEL: define dso_local void @_Z23used_before_default_defv._Maes(
+//
+void call_before_def(void) { used_before_default_def(); }
+// CHECK-LABEL: define dso_local void @_Z15call_before_defv(
+// CHECK: call void @_Z23used_before_default_defv()
+//
+__attribute__((target_version("default"))) void used_before_default_def(void) {}
+// CHECK-LABEL: define dso_local void @_Z23used_before_default_defv.default(
+//
+// CHECK-NOT: declare void @_Z23used_before_default_defv(
+
+
+// Test that an ifunc is generated and used when the default
+// version is defined before the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_after_default_def(void) {}
+// CHECK-LABEL: define dso_local void @_Z22used_after_default_defv._Maes(
+//
+__attribute__((target_version("default"))) void used_after_default_def(void) {}
+// CHECK-LABEL: define dso_local void @_Z22used_after_default_defv.default(
+//
+void call_after_def(void) { used_after_default_def(); }
+// CHECK-LABEL: define dso_local void @_Z14call_after_defv(
+// CHECK: call void @_Z22used_after_default_defv()
+//
+// CHECK-NOT: declare void @_Z22used_after_default_defv(
+
+
+// Test that an unmagled declaration is generated and used when the
+// default version is declared after the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_before_default_decl(void) {}
+// CHECK-LABEL: define dso_local void @_Z24used_before_default_declv._Maes(
+//
+void call_before_decl(void) { used_before_default_decl(); }
+// CHECK-LABEL: define dso_local void @_Z16call_before_declv(
+// CHECK: call void @_Z24used_before_default_declv()
+//
+__attribute__((target_version("default"))) void used_before_default_decl(void);
+// CHECK: declare void @_Z24used_before_default_declv()
+
+
+// Test that an unmagled declaration is generated and used when the
+// default version is declared before the first use of the function.
+//
+__attribute__((target_version("aes"))) void used_after_default_decl(void) {}
+// CHECK-LABEL: define dso_local void @_Z23used_after_default_declv._Maes(
+//
+__attribute__((target_version("default"))) void used_after_default_decl(void);
+// CHECK: declare void @_Z23used_after_default_declv()
+//
+void call_after_decl(void) { used_after_default_decl(); }
+// CHECK-LABEL: define dso_local void @_Z15call_after_declv(
+// CHECK: call void @_Z23used_after_default_declv()
+
+
+// Test that an unmagled declaration is generated and used when
+// the default version is not present.
+//
+__attribute__((target_version("aes"))) void used_no_default(void) {}
+// CHECK-LABEL: define dso_local void @_Z15used_no_defaultv._Maes(
+//
+void call_no_default(void) { used_no_default(); }
+// CHECK-LABEL: define dso_local void @_Z15call_no_defaultv(
+// CHECK: call void @_Z15used_no_defaultv()
+//
+// CHECK: declare void @_Z15used_no_defaultv()
+
+
+// Test that neither an ifunc nor a declaration is generated if the default
+// definition is missing since the versioned function is not used.
+//
+__attribute__((target_version("aes"))) void not_used_no_default(void) {}
+// CHECK-LABEL: define dso_local void @_Z19not_used_no_defaultv._Maes(
+//
+// CHECK-NOT: declare void @_Z19not_used_no_defaultv(
+
+
+// Test that an ifunc is generated if the default version is defined but not used.
+//
+__attribute__((target_version("aes"))) void not_used_with_default(void) {}
+// CHECK-LABEL: define dso_local void @_Z21not_used_with_defaultv._Maes(
+//
+__attribute__((target_version("default"))) void not_used_with_default(void) {}
+// CHECK-LABEL: define dso_local void @_Z21not_used_with_defaultv.default(
+//
+// CHECK-NOT: declare void @_Z21not_used_with_defaultv(
+
+
+// CHECK: define weak_odr ptr @_Z23used_before_default_defv.resolver()
+// CHECK: define weak_odr ptr @_Z22used_after_default_defv.resolver()
+// CHECK-NOT: define weak_odr ptr @_Z24used_before_default_declv.resolver(
+// CHECK-NOT: define weak_odr ptr @_Z23used_after_default_declv.resolver(
+// CHECK-NOT: define weak_odr ptr @_Z15used_no_defaultv.resolver(
+// CHECK-NOT: define weak_odr ptr @_Z19not_used_no_defaultv.resolver(
+// CHECK: define weak_odr ptr @_Z21not_used_with_defaultv.resolver()
diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
index 55b87da122b04..435feec774ece 100644
--- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
+++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
@@ -3,5 +3,5 @@
 
 template<typename T> struct S {};
 
-// CHECK: cannot mangle this built-in __SVInt8_t type yet
+// CHECK: cannot mangle this built-in type: __SVInt8_t yet
 void f1(S<__SVInt8_t>) {}
diff --git a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
index 7279b6c7f23a0..1ad46042b6efd 100644
--- a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
+++ b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
@@ -46,9 +46,9 @@ int mane() {
     char1 f1{1};
     char1 f2{1};
 
-// CHECK: [[TMP:%.+]] = alloca i16
-// CHECK: [[COERCE:%.+]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %{{.+}}, ptr align 2 [[COERCE]], i64 1, i1 false)
+// CHECK: [[CALL:%.*]] = call i16
+// CHECK: [[TRUNC:%.*]] = trunc i16 [[CALL]] to i8
+// CHECK: store i8 [[TRUNC]]
 
     char1 f3 = f1 + f2;
 }
diff --git a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp
index 412edd6d3edc0..e842de6335046 100644
--- a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp
+++ b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp
@@ -68,23 +68,23 @@ void w_branch_elided(unsigned e){
 // CHECK-NEXT:    [[E_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3:[0-9]+]]
 // CHECK-NEXT:    store i32 0, ptr [[I]], align 4, !tbaa [[TBAA2]]
 // CHECK-NEXT:    br label [[FOR_COND:%.*]]
 // CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true)
 // CHECK-NEXT:    br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 // CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_END:%.*]]
 // CHECK:       for.body:
 // CHECK-NEXT:    br label [[FOR_INC:%.*]]
 // CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[INC:%.*]] = add i32 [[TMP4]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[INC:%.*]] = add i32 [[TMP2]], 1
 // CHECK-NEXT:    store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
 // CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
 // CHECK:       for.end:
@@ -100,23 +100,23 @@ void fl(unsigned e)
 // CHECK-NEXT:    [[E_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]]
 // CHECK-NEXT:    store i32 0, ptr [[I]], align 4, !tbaa [[TBAA2]]
 // CHECK-NEXT:    br label [[FOR_COND:%.*]]
 // CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false)
 // CHECK-NEXT:    br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 // CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_END:%.*]]
 // CHECK:       for.body:
 // CHECK-NEXT:    br label [[FOR_INC:%.*]]
 // CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
 // CHECK-NEXT:    store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]]
 // CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
 // CHECK:       for.end:
@@ -146,42 +146,42 @@ void f_branch_elided()
 // CHECK-NEXT:    [[__END1:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[E:%.*]], ptr [[E_ADDR]], align 8, !tbaa [[TBAA14:![0-9]+]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    store ptr [[TMP1]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP3]], i64 0, i64 0
-// CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP5]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0
+// CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0
 // CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4
-// CHECK-NEXT:    store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]]
 // CHECK-NEXT:    br label [[FOR_COND:%.*]]
 // CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[CMP:%.*]] = icmp ne ptr [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]]
 // CHECK-NEXT:    [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true)
 // CHECK-NEXT:    br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 // CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__END1]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__END1]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_END:%.*]]
 // CHECK:       for.body:
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i32 [[TMP13]], ptr [[I]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_INC:%.*]]
 // CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1
-// CHECK-NEXT:    store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1
+// CHECK-NEXT:    store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
 // CHECK:       for.end:
 // CHECK-NEXT:    ret void
 //
@@ -198,42 +198,42 @@ void frl(int (&&e) [4])
 // CHECK-NEXT:    [[__END1:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[E:%.*]], ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    store ptr [[TMP1]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP3]], i64 0, i64 0
-// CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP5]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0
+// CHECK-NEXT:    store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0
 // CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4
-// CHECK-NEXT:    store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]]
+// CHECK-NEXT:    store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]]
 // CHECK-NEXT:    br label [[FOR_COND:%.*]]
 // CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[CMP:%.*]] = icmp ne ptr [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]]
 // CHECK-NEXT:    [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false)
 // CHECK-NEXT:    br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 // CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__END1]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__END1]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RANGE1]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_END:%.*]]
 // CHECK:       for.body:
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i32 [[TMP13]], ptr [[I]], align 4, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]]
 // CHECK-NEXT:    br label [[FOR_INC:%.*]]
 // CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1
-// CHECK-NEXT:    store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]]
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1
+// CHECK-NEXT:    store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]]
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
 // CHECK:       for.end:
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
index 29ae6b6856500..6405621a9d647 100644
--- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
+++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s
 
 int __attribute__((target_clones("ls64_v+fp16", "default"))) foo_ovl(int) { return 1; }
@@ -45,56 +45,60 @@ void run_foo_tml() {
 // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver
 // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver
 //.
-// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64_v(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16Mls64_v(
+// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_Z7foo_ovli.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4503599627436032
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4503599627436032
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_Z7foo_ovli._Mfp16Mls64_v
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_Z7foo_ovli.default
 //
 //
-// CHECK-LABEL: @_Z7foo_ovlv._Mls64Mls64_accdata(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mls64Mls64_accdata(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_Z7foo_ovlv.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovlv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 11258999068426240
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 11258999068426240
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_Z7foo_ovlv._Mls64Mls64_accdata
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_Z7foo_ovlv.default
 //
 //
-// CHECK-LABEL: @_Z3barv(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3barv(
+// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z7foo_ovli(i32 noundef 1)
 // CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z7foo_ovlv()
 // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
 // CHECK-NEXT:    ret i32 [[ADD]]
 //
 //
-// CHECK-LABEL: @_Z11run_foo_tmlv(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local void @_Z11run_foo_tmlv(
+// CHECK-SAME: ) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MC1:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1
 // CHECK-NEXT:    [[MC2:%.*]] = alloca [[STRUCT_MYCLASS_0:%.*]], align 1
 // CHECK-NEXT:    [[MC3:%.*]] = alloca [[STRUCT_MYCLASS_1:%.*]], align 1
@@ -106,131 +110,141 @@ void run_foo_tml() {
 // CHECK-NEXT:    ret void
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16777216
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv._Mfrintts
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv.default
-//
-//
-// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16777216
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv._Mfrintts
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv.default
-//
-//
-// CHECK-LABEL: @_ZN7MyClassIfsE7foo_tmlEv(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIfsE7foo_tmlEv(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 3
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIdfE7foo_tmlEv(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIdfE7foo_tmlEv(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 4
 //
 //
-// CHECK-LABEL: @_Z7foo_ovli.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli.default(
+// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_Z7foo_ovlv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv.default(
+// CHECK-SAME: ) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv._Mfrintts(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv._Mfrintts(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR3:[0-9]+]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4:[0-9]+]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv._Mfrintts(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16777216
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]]
+// CHECK:       [[RESOLVER_RETURN1]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv._Mfrintts
+// CHECK:       [[RESOLVER_ELSE2]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIssE7foo_tmlEv.default
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv._Mfrintts(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR3]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 2
 //
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIisE7foo_tmlEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16777216
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]]
+// CHECK:       [[RESOLVER_RETURN1]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv._Mfrintts
+// CHECK:       [[RESOLVER_ELSE2]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClassIisE7foo_tmlEv.default
+//
 //.
-// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
-// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" }
-// CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" }
+// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" }
+// CHECK: attributes #[[ATTR4]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp
index fd19f4c5a3030..6661abead20c6 100644
--- a/clang/test/CodeGenCXX/attr-target-version.cpp
+++ b/clang/test/CodeGenCXX/attr-target-version.cpp
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s
 
 int __attribute__((target_version("sme-f64f64+bf16"))) foo(int) { return 1; }
@@ -59,152 +59,169 @@ int bar() {
   return m.goo(1) + foo(1) + foo();
 }
 
-
 //.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver
 // CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver
 // CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver
+// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver
 // CHECK: @_ZN7MyClass23unused_with_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass23unused_with_default_defEv.resolver
 // CHECK: @_ZN7MyClass32unused_with_implicit_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver
 // CHECK: @_ZN7MyClass40unused_with_implicit_forward_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver
 //.
-// CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3fooi._Mbf16Msme-f64f64(
+// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_Z3fooi.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3fooi.default(
+// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_Z3foov._Mebf16Msm4(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3foov._Mebf16Msm4(
+// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i32 3
 //
 //
-// CHECK-LABEL: @_Z3foov.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3foov.default(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i32 4
 //
 //
-// CHECK-LABEL: @_ZN7MyClass3gooEi.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClass3gooEi._Mcrc(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi._Mcrc(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi._Mdotprod(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 3
 //
 //
-// CHECK-LABEL: @_ZN7MyClass32unused_with_forward_default_declEv._Mmops(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_forward_default_declEv._Mmops(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass41unused_with_implicit_forward_default_declEv._Mdotprod(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass41unused_with_implicit_forward_default_declEv._Mdotprod(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass24unused_with_default_declEv._Maes(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass24unused_with_default_declEv._Maes(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv._Msve(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass23unused_with_default_defEv._Msve(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass23unused_with_default_defEv.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR8:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_implicit_default_defEv.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR9:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK-LABEL: @_ZN7MyClass22unused_without_defaultEv._Mrdm(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass22unused_without_defaultEv._Mrdm(
+// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR10:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK-LABEL: @_Z3barv(
-// CHECK-NEXT:  entry:
+// CHECK-LABEL: define dso_local noundef i32 @_Z3barv(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1
 // CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1)
 // CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z3fooi(i32 noundef 1)
@@ -214,109 +231,109 @@ int bar() {
 // CHECK-NEXT:    ret i32 [[ADD3]]
 //
 //
-// CHECK-LABEL: @_ZN7MyClass3gooEi.resolver(
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1024
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi._Mcrc
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi._Mdotprod
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi.default
-//
-//
-// CHECK-LABEL: @_Z3fooi.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_Z3fooi._Mbf16Msme-f64f64
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_Z3fooi.default
 //
 //
-// CHECK-LABEL: @_Z3foov.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 268435488
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_Z3foov._Mebf16Msm4
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_Z3foov.default
 //
 //
-// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1024
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi._Mcrc
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 16
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]]
+// CHECK:       [[RESOLVER_RETURN1]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi._Mdotprod
+// CHECK:       [[RESOLVER_ELSE2]]:
+// CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi.default
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass23unused_with_default_defEv._Msve
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass23unused_with_default_defEv.default
 //
 //
-// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 65536
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass32unused_with_implicit_default_defEv.default
 //
 //
-// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver(
-// CHECK-NEXT:  resolver_entry:
+// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 128
 // CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 128
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse
-// CHECK:       resolver_else:
+// CHECK:       [[RESOLVER_ELSE]]:
 // CHECK-NEXT:    ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default
 //
 //.
-// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
-// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" }
-// CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR5:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
-// CHECK: attributes #[[ATTR6:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR7:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
-// CHECK: attributes #[[ATTR8:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
-// CHECK: attributes #[[ATTR9:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
-// CHECK: attributes #[[ATTR10:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" }
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" }
+// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" }
+// CHECK: attributes #[[ATTR4]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR5]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
+// CHECK: attributes #[[ATTR6]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR7]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR8]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
+// CHECK: attributes #[[ATTR9]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
+// CHECK: attributes #[[ATTR10]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" }
 // CHECK: attributes #[[ATTR11:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
diff --git a/clang/test/CodeGenCXX/cxx2a-consteval.cpp b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
index 075cab58358ab..6c09053a74d24 100644
--- a/clang/test/CodeGenCXX/cxx2a-consteval.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
@@ -1,4 +1,3 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -emit-llvm %s -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll
 // RUN: FileCheck -check-prefix=EVAL -input-file=%t.ll %s
 // RUN: FileCheck -check-prefix=EVAL-STATIC -input-file=%t.ll %s
@@ -275,3 +274,26 @@ void f() {
     // EVAL-FN:     call void @_ZN7GH821542S3C2Ei
 }
 }
+
+namespace GH93040 {
+struct C { char c = 1; };
+struct Empty { consteval Empty() {} };
+struct Empty2 { consteval Empty2() {} };
+struct Test : C, Empty {
+  [[no_unique_address]] Empty2 e;
+};
+static_assert(sizeof(Test) == 1);
+void f() {
+  Test test;
+
+// Make sure we don't overwrite the initialization of c.
+
+// EVAL-FN-LABEL: define {{.*}} void @_ZN7GH930404TestC2Ev
+// EVAL-FN: entry:
+// EVAL-FN-NEXT:  [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// EVAL-FN-NEXT:  store ptr {{.*}}, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT:  [[THIS:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT:  call void @_ZN7GH930401CC2Ev(ptr noundef nonnull align 1 dereferenceable(1) [[THIS]])
+// EVAL-FN-NEXT:  ret void
+}
+}
diff --git a/clang/test/CodeGenCXX/debug-info-explicit-this.cpp b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp
new file mode 100644
index 0000000000000..45ab2a0216ca7
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -std=c++2b %s -o - | FileCheck %s
+
+struct Foo {
+  void Bar(this Foo&& self) {}
+};
+
+void fn() {
+  Foo{}.Bar();
+}
+
+// CHECK: distinct !DISubprogram(name: "Bar", {{.*}}, type: ![[BAR_TYPE:[0-9]+]], {{.*}}, declaration: ![[BAR_DECL:[0-9]+]], {{.*}}
+// CHECK: ![[FOO:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo"
+// CHECK: ![[BAR_DECL]] = !DISubprogram(name: "Bar", {{.*}}, type: ![[BAR_TYPE]], {{.*}},
+// CHECK: ![[BAR_TYPE]] = !DISubroutineType(types: ![[PARAMS:[0-9]+]])
+// CHECK: ![[PARAMS]] = !{null, ![[SELF:[0-9]+]]}
+// CHECK: ![[SELF]] = !DIDerivedType(tag: DW_TAG_rvalue_reference_type, baseType: ![[FOO]]
diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
index abfff1a74f86a..1ac88e68a3a12 100644
--- a/clang/test/CodeGenCXX/fmv-namespace.cpp
+++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
@@ -26,7 +26,6 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 //.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
 // CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver
-// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver
 // CHECK: @_ZN3Foo3barEv = weak_odr ifunc i32 (), ptr @_ZN3Foo3barEv.resolver
 //.
 // CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve(
@@ -42,20 +41,6 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 // CHECK-NEXT:    ret i32 [[CALL]]
 //
 //
-// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
-// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
-// CHECK:       [[RESOLVER_RETURN]]:
-// CHECK-NEXT:    ret ptr @_ZN4Name3fooEv._Msve
-// CHECK:       [[RESOLVER_ELSE]]:
-// CHECK-NEXT:    ret ptr @_ZN4Name3fooEv.default
-//
-//
 // CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -69,20 +54,6 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 // CHECK-NEXT:    ret i32 [[CALL]]
 //
 //
-// CHECK-LABEL: define weak_odr ptr @_ZN9OtherName3fooEv.resolver() comdat {
-// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
-// CHECK:       [[RESOLVER_RETURN]]:
-// CHECK-NEXT:    ret ptr @_ZN9OtherName3fooEv._Msve
-// CHECK:       [[RESOLVER_ELSE]]:
-// CHECK-NEXT:    ret ptr @_ZN9OtherName3fooEv.default
-//
-//
 // CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv.default(
 // CHECK-SAME: ) #[[ATTR1]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -90,7 +61,7 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 //
 //
 // CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv._Mmops(
-// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i32 1
 //
@@ -101,6 +72,20 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 // CHECK-NEXT:    ret i32 0
 //
 //
+// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @_ZN4Name3fooEv._Msve
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    ret ptr @_ZN4Name3fooEv.default
+//
+//
 // CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver() comdat {
 // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
@@ -117,8 +102,8 @@ __attribute((target_version("mops"))) int bar() { return 1; }
 //.
 // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
 // CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
diff --git a/clang/test/CodeGenCXX/mangle-fail.cpp b/clang/test/CodeGenCXX/mangle-fail.cpp
index b588d57749fa3..f3b50cfb54dbd 100644
--- a/clang/test/CodeGenCXX/mangle-fail.cpp
+++ b/clang/test/CodeGenCXX/mangle-fail.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -emit-llvm-only -x c++ -std=c++11 -triple %itanium_abi_triple -verify %s -DN=1
 // RUN: %clang_cc1 -emit-llvm-only -x c++ -std=c++11 -triple %itanium_abi_triple -verify %s -DN=2
+// RUN: %clang_cc1 -emit-llvm-only -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-intrinsics -verify %s -DN=3
 
 struct A { int a; };
 
@@ -13,6 +14,19 @@ template void test<int>(int (&)[sizeof(int)]);
 template<class T> void test(int (&)[sizeof((A){}, T())]) {} // expected-error {{cannot yet mangle}}
 template void test<int>(int (&)[sizeof(A)]);
 
+#elif N == 3
+// __builtin_ptrauth_type_discriminator
+template <class T, unsigned disc>
+struct S1 {};
+
+template<class T>
+void func(S1<T, __builtin_ptrauth_type_discriminator(T)> s1) { // expected-error {{cannot yet mangle __builtin_ptrauth_type_discriminator expression}}
+}
+
+void testfunc1() {
+  func(S1<int(), __builtin_ptrauth_type_discriminator(int())>());
+}
+
 // FIXME: There are several more cases we can't yet mangle.
 
 #else
diff --git a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
index e6497b3f152aa..1b103719fbe46 100644
--- a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
@@ -1,15 +1,31 @@
 // RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
+
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
+
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
+
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
+
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
@@ -111,7 +127,7 @@ int TVDisc_ExplicitNoExtraDiscrimination = ptrauth_string_discriminator("_ZTVN5t
 int TVDisc_ExplicitTypeDiscrimination = ptrauth_string_discriminator("_ZTVN5test126ExplicitTypeDiscriminationE");
 
 
-// CHECK-LABEL: define void @test_default(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_default(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -133,7 +149,7 @@ void test_default(NoExplicitAuth *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_disabled(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_disabled(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 // CHECK-NOT:     call i64 @llvm.ptrauth.auth
@@ -141,7 +157,7 @@ void test_disabled(ExplicitlyDisableAuth *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -166,7 +182,7 @@ void test_addr_disc(ExplicitAddressDiscrimination *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -185,7 +201,7 @@ void test_no_addr_disc(ExplicitNoAddressDiscrimination *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -206,7 +222,7 @@ void test_no_extra_disc(ExplicitNoExtraDiscrimination *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_type_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_type_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -229,7 +245,7 @@ void test_type_disc(ExplicitTypeDiscrimination *a) {
   a->f();
 }
 
-// CHECK-LABEL: define void @test_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -257,7 +273,7 @@ void test_custom_disc(ExplicitCustomDiscrimination *a) {
 // Codegen should be the same as the simple cases above once we have a vtable.
 //
 
-// CHECK-LABEL: define void @test_subclass_default(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_default(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -279,7 +295,7 @@ void test_subclass_default(NoExplicitAuth *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_disabled(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_disabled(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 // CHECK-NOT:     call i64 @llvm.ptrauth.auth
@@ -287,7 +303,7 @@ void test_subclass_disabled(ExplicitlyDisableAuth *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -312,7 +328,7 @@ void test_subclass_addr_disc(ExplicitAddressDiscrimination *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -331,7 +347,7 @@ void test_subclass_no_addr_disc(ExplicitNoAddressDiscrimination *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -352,7 +368,7 @@ void test_subclass_no_extra_disc(ExplicitNoExtraDiscrimination *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_type_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_type_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -375,7 +391,7 @@ void test_subclass_type_disc(ExplicitTypeDiscrimination *a) {
   make_subclass(a)->f();
 }
 
-// CHECK-LABEL: define void @test_subclass_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_subclass_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
 //
@@ -404,7 +420,7 @@ void test_subclass_custom_disc(ExplicitCustomDiscrimination *a) {
 // Codegen should be the same as the simple cases above once we have a vtable.
 //
 
-// CHECK-LABEL: define void @test_multiple_default(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_multiple_default(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary
 // CHECK:         [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
@@ -427,7 +443,7 @@ void test_multiple_default(NoExplicitAuth *a) {
   make_multiple_primary(a)->f();
 }
 
-// CHECK-LABEL: define void @test_multiple_disabled(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_multiple_disabled(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary
 // CHECK:         [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
@@ -436,7 +452,7 @@ void test_multiple_disabled(ExplicitlyDisableAuth *a) {
   make_multiple_primary(a)->f();
 }
 
-// CHECK-LABEL: define void @test_multiple_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_multiple_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary
 // CHECK:         [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8
 // CHECK:         [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8
@@ -466,7 +482,7 @@ void test_multiple_custom_disc(ExplicitCustomDiscrimination *a) {
 // but twice for vtt/vtable.  The names in the vtt version have "VTT" prefixes.
 //
 
-// CHECK-LABEL: define void @test_virtual_default(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_virtual_default(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTTADDR:%.*]] = call noundef ptr @_ZN5test120make_virtual_primary
 // CHECK:         [[VTTABLE:%.*]] = load ptr, ptr [[VTTADDR]], align 8
 //
@@ -509,13 +525,13 @@ void test_virtual_default(NoExplicitAuth *a) {
   make_virtual_primary(a)->f();
 }
 
-// CHECK-LABEL: define void @test_virtual_disabled(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_virtual_disabled(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK-NOT:     call i64 @llvm.ptrauth.auth
 void test_virtual_disabled(ExplicitlyDisableAuth *a) {
   make_virtual_primary(a)->f();
 }
 
-// CHECK-LABEL: define void @test_virtual_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
+// CHECK-LABEL: define{{.*}} void @test_virtual_custom_disc(ptr noundef {{%.*}}) {{#.*}} {
 // CHECK:         [[VTTADDR:%.*]] = call noundef ptr @_ZN5test120make_virtual_primary
 // CHECK:         [[VTTABLE:%.*]] = load ptr, ptr [[VTTADDR]], align 8
 //
diff --git a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
new file mode 100644
index 0000000000000..9ce9def6156ef
--- /dev/null
+++ b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
@@ -0,0 +1,245 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,DARWIN
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,ELF
+
+// CHECK: %struct.Base1 = type { ptr }
+// CHECK: %struct.Base2 = type { ptr }
+// CHECK: %struct.Derived1 = type { %struct.Base1, %struct.Base2 }
+// CHECK: %struct.Derived2 = type { %struct.Base2, %struct.Base1 }
+// CHECK: %struct.Derived3 = type { %struct.Base1, %struct.Base2 }
+
+// CHECK: @_ZTV5Base1 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC:38871]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2))] },{{.*}} align 8
+// CHECK: @g_b1 = global %struct.Base1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC:6511]], ptr @g_b1) },{{.*}} align 8
+// CHECK: @_ZTV5Base2 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC:27651]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2))] },{{.*}} align 8
+// CHECK: @g_b2 = global %struct.Base2 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC:63631]], ptr @g_b2) },{{.*}} align 8
+// CHECK: @_ZTV8Derived1 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived11cEv, i32 0, i64 [[DERIVED1_C_DISC:54092]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived11dEv, i32 0, i64 [[DERIVED1_D_DISC:37391]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d1 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d1), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d1, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @_ZTV8Derived2 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived21cEv, i32 0, i64 [[DERIVED2_C_DISC:15537]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived21eEv, i32 0, i64 209, ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d2 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr @g_d2), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d2, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @_ZTV8Derived3 = linkonce_odr unnamed_addr constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived31iEv, i32 0, i64 [[DERIVED3_I_DISC:19084]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 3))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d3 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d3), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d3, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @g_vb1 = global %struct.VirtualBase1 zeroinitializer,{{.*}} align 8
+// CHECK: @g_vb2 = global %struct.VirtualBase2 zeroinitializer,{{.*}} align 8
+// CHECK: @g_d4 = global %struct.Derived4 zeroinitializer,{{.*}} align 8
+// CHECK: @_ZTV12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC:7987]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 5))] },{{.*}} align 8
+// CHECK: @_ZTT12VirtualBase1 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2)],{{.*}} align 8
+// CHECK: @_ZTV12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC:51224]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3))] },{{.*}} align 8
+// CHECK: @_ZTT12VirtualBase2 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3), i32 2)],{{.*}} align 8
+// CHECK: @_ZTV8Derived4 = linkonce_odr unnamed_addr constant { [7 x ptr], [5 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 5)), ptr ptrauth (ptr @_ZN8Derived41hEv, i32 0, i64 [[DERIVED4_H_DISC:31844]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 6))], [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 4))] },{{.*}} align 8
+// CHECK: @_ZTT8Derived4 = linkonce_odr unnamed_addr constant [7 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3), i32 2)],{{.*}} align 8
+// CHECK: @_ZTC8Derived40_12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 5))] },{{.*}} align 8
+// CHECK: @_ZTC8Derived48_12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3))] },{{.*}} align 8
+
+struct Base1 { virtual void a() {} };
+struct Base2 { virtual void b() {} };
+struct Derived1 : public Base1, public Base2 {
+  virtual void c() {}
+  virtual void d() {}
+};
+struct Derived2 : public Base2, public Base1 {
+  virtual void c() {}
+  virtual void e() {}
+};
+
+struct Derived3 : public Base1, public Base2 {
+  constexpr Derived3(){}
+  virtual void i() {}
+};
+
+Base1 g_b1;
+Base2 g_b2;
+Derived1 g_d1;
+Derived2 g_d2;
+Derived3 g_d3;
+
+extern "C" void test_basic_inheritance() {
+  Base1 g_b1;
+  Base2 g_b2;
+  Derived1 g_d1;
+  Derived2 g_d2;
+  Derived3 g_d3;
+}
+
+struct VirtualBase1 : virtual Base1 {
+  VirtualBase1(){}
+  virtual void f() {}
+};
+struct VirtualBase2 : virtual Base1, Base2 {
+  VirtualBase2(){}
+  virtual void g() {}
+};
+struct Derived4 : VirtualBase1, VirtualBase2 {
+  virtual void h() {}
+};
+struct Derived5 : VirtualBase2, VirtualBase1 {
+  virtual void h() {}
+};
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN12VirtualBase1C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN12VirtualBase1C1Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN12VirtualBase2C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN12VirtualBase2C1Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived4C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived4C1Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived5C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived5C1Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+
+
+VirtualBase1 g_vb1;
+VirtualBase2 g_vb2;
+Derived4 g_d4;
+Derived5 g_d5;
+
+
+extern "C" void cross_check_vtables(Base1 *b1,
+                   Base2 *b2,
+                   Derived1 *d1,
+                   Derived2 *d2,
+                   Derived3 *d3,
+                   VirtualBase1 *vb1,
+                   VirtualBase2 *vb2,
+                   Derived4 *d4,
+                   Derived4 *d5) {
+  asm("; b1->a()" ::: "memory");
+  b1->a();
+  asm("; b2->b()" ::: "memory");
+  b2->b();
+  asm("; d1->a()" ::: "memory");
+  d1->a();
+  asm("; d1->c()" ::: "memory");
+  d1->c();
+  asm("; d2->a()" ::: "memory");
+  d2->a();
+  asm("; d2->c()" ::: "memory");
+  d2->c();
+  asm("; d3->a()" ::: "memory");
+  d3->a();
+  asm("; d3->b()" ::: "memory");
+  d3->b();
+  asm("; d3->i()" ::: "memory");
+  d3->i();
+  asm("; vb1->a()" ::: "memory");
+  vb1->a();
+  asm("; vb1->f()" ::: "memory");
+  vb1->f();
+  asm("; vb2->a()" ::: "memory");
+  vb2->a();
+  asm("; vb2->g()" ::: "memory");
+  vb2->g();
+  asm("; d4->a()" ::: "memory");
+  d4->a();
+  asm("; d4->b()" ::: "memory");
+  d4->b();
+  asm("; d4->f()" ::: "memory");
+  d4->f();
+  asm("; d4->g()" ::: "memory");
+  d4->g();
+  asm("; d4->h()" ::: "memory");
+  d4->h();
+  asm("; d5->a()" ::: "memory");
+  d5->a();
+  asm("; d5->b()" ::: "memory");
+  d5->b();
+  asm("; d5->f()" ::: "memory");
+  d5->f();
+  asm("; d5->g()" ::: "memory");
+  d5->g();
+  asm("; d5->h()" ::: "memory");
+  d5->h();
+}
+
+// CHECK-LABEL: define{{.*}} void @cross_check_vtables(
+// CHECK: "; b1->a()",
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; b2->b()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
+// CHECK: "; d1->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; d1->c()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED1_C_DISC]])
+// CHECK: "; d2->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; d2->c()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED2_C_DISC]])
+// CHECK: "; d3->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; d3->b()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
+// CHECK: "; d3->i()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED3_I_DISC]])
+// CHECK: "; vb1->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; vb1->f()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]])
+// CHECK: "; vb2->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; vb2->g()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]])
+// CHECK: "; d4->a()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
+// CHECK: "; d4->b()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
+// CHECK: "; d4->f()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]])
+// CHECK: "; d4->g()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]])
+// CHECK: "; d4->h()"
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED4_H_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN5Base1C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN5Base1C2Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN5Base2C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN5Base2C2Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived1C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived1C2Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived2C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived2C2Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived3C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived3C2Ev
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
+// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
diff --git a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
new file mode 100644
index 0000000000000..0a9ac3fa510f5
--- /dev/null
+++ b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
@@ -0,0 +1,440 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,DARWIN %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,DARWIN %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,ELF %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,ELF %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+
+
+// CHECK: @gmethod0 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1:35591]]) to i64), i64 0 }, align 8
+// CHECK: @gmethod1 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011nonvirtual5Ev, i32 0, i64 [[TYPEDISC0:22163]]) to i64), i64 0 }, align 8
+// CHECK: @gmethod2 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, align 8
+
+// CHECK: @__const._Z13testArrayInitv.p0 = private unnamed_addr constant [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 35591) to i64), i64 0 }], align 8
+// CHECK: @__const._Z13testArrayInitv.p1 = private unnamed_addr constant [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 35591) to i64), i64 0 }], align 8
+// CHECK: @__const._Z13testArrayInitv.c0 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 35591) to i64), i64 0 } }, align 8
+// CHECK: @__const._Z13testArrayInitv.c1 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 35591) to i64), i64 0 } }, align 8
+
+// CHECK: @_ZTV5Base0 = unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI5Base0,
+// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual1Ev, i32 0, i64 55600, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 2)),
+// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual3Ev, i32 0, i64 53007, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 3)),
+// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base016virtual_variadicEiz, i32 0, i64 7464, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 4))] }, align 8
+
+typedef __SIZE_TYPE__ size_t;
+
+namespace std {
+template <typename _Ep>
+class initializer_list {
+  const _Ep *__begin_;
+  size_t __size_;
+
+  initializer_list(const _Ep *__b, size_t __s);
+};
+} // namespace std
+
+struct Base0 {
+  void nonvirtual0();
+  virtual void virtual1();
+  virtual void virtual3();
+  virtual void virtual_variadic(int, ...);
+};
+
+struct A0 {
+  int d[4];
+};
+
+struct A1 {
+  int d[8];
+};
+
+struct __attribute__((trivial_abi)) TrivialS {
+  TrivialS(const TrivialS &);
+  ~TrivialS();
+  int p[4];
+};
+
+struct Derived0 : Base0 {
+  void virtual1() override;
+  void nonvirtual5();
+  virtual void virtual6();
+  virtual A0 return_agg();
+  virtual A1 sret();
+  virtual void trivial_abi(TrivialS);
+};
+
+struct Base1 {
+  virtual void virtual7();
+};
+
+struct Derived1 : Base0, Base1 {
+  void virtual1() override;
+  void virtual7() override;
+};
+
+typedef void (Base0::*MethodTy0)();
+typedef void (Base0::*VariadicMethodTy0)(int, ...);
+typedef void (Derived0::*MethodTy1)();
+
+struct Class0 {
+  MethodTy1 m0;
+};
+
+// CHECK: define{{.*}} void @_ZN5Base08virtual1Ev(
+
+// CHECK: define{{.*}} void @_Z5test0v()
+// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[VARMETHOD1:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD2:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD3:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD4:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD5:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD6:.*]] = alloca { i64, i64 }, align 8
+// CHECK-NEXT: %[[METHOD7:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual3Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base016virtual_variadicEiz_vfpthunk_, i32 0, i64 34368) to i64), i64 0 }, ptr %[[VARMETHOD1]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual3Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011nonvirtual5Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived08virtual6Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived010return_aggEv_vfpthunk_, i32 0, i64 64418) to i64), i64 0 }, ptr %[[METHOD3]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived04sretEv_vfpthunk_, i32 0, i64 28187) to i64), i64 0 }, ptr %[[METHOD4]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011trivial_abiE8TrivialS_vfpthunk_, i32 0, i64 8992) to i64), i64 0 }, ptr %[[METHOD5]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base18virtual7Ev_vfpthunk_, i32 0, i64 [[TYPEDISC2:61596]]) to i64), i64 0 }, ptr %[[METHOD6]], align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived18virtual7Ev_vfpthunk_, i32 0, i64 25206) to i64), i64 0 }, ptr %[[METHOD7]], align 8
+// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 25206) to i64), i64 0 }, ptr %[[METHOD7]], align 8
+// CHECK: ret void
+
+// CHECK: define linkonce_odr hidden void @_ZN5Base08virtual1Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
+// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
+// CHECK-NEXT: %[[V2:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK-NEXT: %[[V3:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V2]], i32 2, i64 0)
+// CHECK-NEXT: %[[V4:.*]] = inttoptr i64 %[[V3]] to ptr
+// CHECK-NEXT: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V4]], i64 0
+// CHECK-NEXT: %[[V5:.*]] = load ptr, ptr %[[VFN]], align 8
+// CHECK-NEXT: %[[V6:.*]] = ptrtoint ptr %[[VFN]] to i64
+// CHECK-NEXT: %[[V7:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V6]], i64 55600)
+// CHECK-NEXT: musttail call void %[[V5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V0]]) [ "ptrauth"(i32 0, i64 %[[V7]]) ]
+// CHECK-NEXT: ret void
+
+// CHECK: define linkonce_odr hidden void @_ZN5Base08virtual3Ev_vfpthunk_(ptr noundef %{{.*}})
+// CHECK: load ptr, ptr %{{.*}}, align 8
+// CHECK: load ptr, ptr %{{.*}}, align 8
+// CHECK: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}, align 8
+// CHECK: %[[V2:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK: %[[V3:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V2]], i32 2, i64 0)
+// CHECK: %[[V4:.*]] = inttoptr i64 %[[V3]] to ptr
+// CHECK: getelementptr inbounds ptr, ptr %[[V4]], i64 1
+// CHECK: call i64 @llvm.ptrauth.blend(i64 %{{.*}}, i64 53007)
+
+// CHECK: define linkonce_odr hidden void @_ZN5Base016virtual_variadicEiz_vfpthunk_(ptr noundef %[[THIS:.*]], i32 noundef %0, ...)
+// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[_ADDR:.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
+// CHECK: store i32 %0, ptr %[[_ADDR]], align 4
+// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK-NEXT: %[[V1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK-NEXT: %[[V2:.*]] = load i32, ptr %[[_ADDR]], align 4
+// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
+// CHECK-NEXT: %[[V4:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK-NEXT: %[[V5:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V4]], i32 2, i64 0)
+// CHECK-NEXT: %[[V6:.*]] = inttoptr i64 %[[V5]] to ptr
+// CHECK-NEXT: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V6]], i64 2
+// CHECK-NEXT: %[[V7:.*]] = load ptr, ptr %[[VFN]], align 8
+// CHECK-NEXT: %[[V8:.*]] = ptrtoint ptr %[[VFN]] to i64
+// CHECK-NEXT: %[[V9:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V8]], i64 7464)
+// CHECK-NEXT: musttail call void (ptr, i32, ...) %[[V7]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V1]], i32 noundef %[[V2]], ...) [ "ptrauth"(i32 0, i64 %[[V9]]) ]
+// CHECK-NEXT: ret void
+
+// CHECK: define linkonce_odr hidden void @_ZN8Derived08virtual6Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
+// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
+// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
+// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
+// CHECK: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V3]], i64 3 
+// CHECK: %[[V5:.*]] = ptrtoint ptr %[[VFN]] to i64
+// CHECK: call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 55535)
+
+// Check that the return value of the musttail call isn't copied to a temporary.
+
+// CHECK: define linkonce_odr hidden [2 x i64] @_ZN8Derived010return_aggEv_vfpthunk_(ptr noundef %{{.*}})
+// CHECK: %[[CALL:.*]] = musttail call [2 x i64] %{{.*}}(ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) [ "ptrauth"(i32 0, i64 %{{.*}}) ]
+// CHECK-NEXT: ret [2 x i64] %[[CALL]]
+
+// Check that the sret pointer passed to the caller is forwarded to the musttail
+// call.
+
+// CHECK: define linkonce_odr hidden void @_ZN8Derived04sretEv_vfpthunk_(ptr dead_on_unwind noalias writable sret(%struct.A1) align 4 %[[AGG_RESULT:.*]], ptr noundef %{{.*}})
+// CHECK: musttail call void %{{.*}}(ptr dead_on_unwind writable  sret(%struct.A1) align 4 %[[AGG_RESULT]], ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) [ "ptrauth"(i32 0, i64 %{{.*}}) ]
+// CHECK-NEXT: ret void
+
+// Check that the thunk function doesn't destruct the trivial_abi argument.
+
+// CHECK: define linkonce_odr hidden void @_ZN8Derived011trivial_abiE8TrivialS_vfpthunk_(ptr noundef %{{.*}}, [2 x i64] %{{.*}})
+// NODEBUG-NOT: call
+// CHECK: call i64 @llvm.ptrauth.auth(
+// NODEBUG-NOT: call
+// CHECK: call i64 @llvm.ptrauth.blend(
+// NODEBUG-NOT: call
+// CHECK: musttail call void
+// CHECK-NEXT: ret void
+
+// CHECK: define linkonce_odr hidden void @_ZN5Base18virtual7Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
+// CHECK: entry:
+// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
+// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
+// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
+// CHECK: getelementptr inbounds ptr, ptr %[[V3]], i64 0
+
+// CHECK: define linkonce_odr hidden void @_ZN8Derived18virtual7Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
+// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: load ptr, ptr %[[THIS_ADDR]], align 8
+// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
+// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
+// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
+// CHECK: getelementptr inbounds ptr, ptr %[[V3]], i64 3
+
+void Base0::virtual1() {}
+
+void test0() {
+  MethodTy0 method0;
+  method0 = &Base0::nonvirtual0;
+  method0 = &Base0::virtual1;
+  method0 = &Base0::virtual3;
+
+  VariadicMethodTy0 varmethod1;
+  varmethod1 = &Base0::virtual_variadic;
+
+  MethodTy1 method2;
+  method2 = &Derived0::nonvirtual0;
+  method2 = &Derived0::virtual1;
+  method2 = &Derived0::virtual3;
+  method2 = &Derived0::nonvirtual5;
+  method2 = &Derived0::virtual6;
+
+  A0 (Derived0::*method3)();
+  method3 = &Derived0::return_agg;
+
+  A1 (Derived0::*method4)();
+  method4 = &Derived0::sret;
+
+  void (Derived0::*method5)(TrivialS);
+  method5 = &Derived0::trivial_abi;
+
+  void (Base1::*method6)();
+  method6 = &Base1::virtual7;
+
+  void (Derived1::*method7)();
+  method7 = &Derived1::virtual7;
+  method7 = &Derived1::virtual1;
+}
+
+// CHECK: define{{.*}} void @_Z5test1P5Base0MS_FvvE(ptr noundef %[[A0:.*]], [2 x i64] %[[A1_COERCE:.*]])
+// CHECK: %[[A1:.*]] = alloca { i64, i64 }, align 8
+// CHECK: %[[A0_ADDR:.*]] = alloca ptr, align 8
+// CHECK: %[[A1_ADDR:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store [2 x i64] %[[A1_COERCE]], ptr %[[A1]], align 8
+// CHECK: %[[A11:.*]] = load { i64, i64 }, ptr %[[A1]], align 8
+// CHECK: store ptr %[[A0]], ptr %[[A0_ADDR]], align 8
+// CHECK: store { i64, i64 } %[[A11]], ptr %[[A1_ADDR]], align 8
+// CHECK: %[[V1:.*]] = load ptr, ptr %[[A0_ADDR]], align 8
+// CHECK: %[[V2:.*]] = load { i64, i64 }, ptr %[[A1_ADDR]], align 8
+// CHECK: %[[MEMPTR_ADJ:.*]] = extractvalue { i64, i64 } %[[V2]], 1
+// CHECK: %[[MEMPTR_ADJ_SHIFTED:.*]] = ashr i64 %[[MEMPTR_ADJ]], 1
+// CHECK: %[[V4:.*]] = getelementptr inbounds i8, ptr %[[V1]], i64 %[[MEMPTR_ADJ_SHIFTED]]
+// CHECK: %[[MEMPTR_PTR:.*]] = extractvalue { i64, i64 } %[[V2]], 0
+// CHECK: %[[V5:.*]] = and i64 %[[MEMPTR_ADJ]], 1
+// CHECK: %[[MEMPTR_ISVIRTUAL:.*]] = icmp ne i64 %[[V5]], 0
+// CHECK: br i1 %[[MEMPTR_ISVIRTUAL]]
+
+// CHECK:  %[[VTABLE:.*]] = load ptr, ptr %[[V4]], align 8
+// CHECK:  %[[V7:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK:  %[[V8:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V7]], i32 2, i64 0)
+// CHECK:  %[[V9:.*]] = inttoptr i64 %[[V8]] to ptr
+// DARWIN: %[[V10:.*]] = trunc i64 %[[MEMPTR_PTR]] to i32
+// DARWIN: %[[V11:.*]] = zext i32 %[[V10]] to i64
+// DARWIN: %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[V11]]
+// ELF:    %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[MEMPTR_PTR]]
+// CHECK:  %[[MEMPTR_VIRTUALFN:.*]] = load ptr, ptr %[[V12]], align 8
+// CHECK:  br
+
+// CHECK: %[[MEMPTR_NONVIRTUALFN:.*]] = inttoptr i64 %[[MEMPTR_PTR]] to ptr
+// CHECK: br
+
+// CHECK: %[[V14:.*]] = phi ptr [ %[[MEMPTR_VIRTUALFN]], {{.*}} ], [ %[[MEMPTR_NONVIRTUALFN]], {{.*}} ]
+// CHECK: %[[V15:.*]] = phi i64 [ 0, {{.*}} ], [ [[TYPEDISC0]], {{.*}} ]
+// CHECK: call void %[[V14]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V4]]) [ "ptrauth"(i32 0, i64 %[[V15]]) ]
+// CHECK: ret void
+
+void test1(Base0 *a0, MethodTy0 a1) {
+  (a0->*a1)();
+}
+
+// CHECK: define{{.*}} void @_Z15testConversion0M5Base0FvvEM8Derived0FvvE([2 x i64] %[[METHOD0_COERCE:.*]], [2 x i64] %[[METHOD1_COERCE:.*]])
+// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
+// CHECK: %[[METHOD1:.*]] = alloca { i64, i64 }, align 8
+// CHECK: %[[METHOD0_ADDR:.*]] = alloca { i64, i64 }, align 8
+// CHECK: %[[METHOD1_ADDR:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store [2 x i64] %[[METHOD0_COERCE]], ptr %[[METHOD0]], align 8
+// CHECK: %[[METHOD01:.*]] = load { i64, i64 }, ptr %[[METHOD0]], align 8
+// CHECK: store [2 x i64] %[[METHOD1_COERCE]], ptr %[[METHOD1]], align 8
+// CHECK: %[[METHOD12:.*]] = load { i64, i64 }, ptr %[[METHOD1]], align 8
+// CHECK: store { i64, i64 } %[[METHOD01]], ptr %[[METHOD0_ADDR]], align 8
+// CHECK: store { i64, i64 } %[[METHOD12]], ptr %[[METHOD1_ADDR]], align 8
+// CHECK: %[[V2:.*]] = load { i64, i64 }, ptr %[[METHOD0_ADDR]], align 8
+// CHECK: %[[MEMPTR_PTR:.*]] = extractvalue { i64, i64 } %[[V2]], 0
+// CHECK: %[[MEMPTR_ADJ:.*]] = extractvalue { i64, i64 } %[[V2]], 1
+// CHECK: %[[V3:.*]] = and i64 %[[MEMPTR_ADJ]], 1
+// CHECK: %[[IS_VIRTUAL_OFFSET:.*]] = icmp ne i64 %[[V3]], 0
+// CHECK: br i1 %[[IS_VIRTUAL_OFFSET]]
+
+// CHECK: %[[V4:.*]] = inttoptr i64 %[[MEMPTR_PTR]] to ptr
+// CHECK: %[[V5:.*]] = icmp ne ptr %[[V4]], null
+// CHECK: br i1 %[[V5]]
+
+// CHECK: %[[V6:.*]] = ptrtoint ptr %[[V4]] to i64
+// CHECK: %[[V7:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V6]], i32 0, i64 [[TYPEDISC0]], i32 0, i64 [[TYPEDISC1]])
+// CHECK: %[[V8:.*]] = inttoptr i64 %[[V7]] to ptr
+// CHECK: br
+
+// CHECK: %[[V9:.*]] = phi ptr [ null, {{.*}} ], [ %[[V8]], {{.*}} ]
+// CHECK: %[[V1:.*]] = ptrtoint ptr %[[V9]] to i64
+// CHECK: %[[V11:.*]] = insertvalue { i64, i64 } %[[V2]], i64 %[[V1]], 0
+// CHECK: br
+
+// CHECK: %[[V12:.*]] = phi { i64, i64 } [ %[[V2]], {{.*}} ], [ %[[V11]], {{.*}} ]
+// CHECK: store { i64, i64 } %[[V12]], ptr %[[METHOD1_ADDR]], align 8
+// CHECK: ret void
+
+void testConversion0(MethodTy0 method0, MethodTy1 method1) {
+  method1 = method0;
+}
+
+// CHECK: define{{.*}} void @_Z15testConversion1M5Base0FvvE(
+// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC0]], i32 0, i64 [[TYPEDISC1]])
+
+void testConversion1(MethodTy0 method0) {
+  MethodTy1 method1 = reinterpret_cast<MethodTy1>(method0);
+}
+
+// CHECK: define{{.*}} void @_Z15testConversion2M8Derived0FvvE(
+// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
+
+void testConversion2(MethodTy1 method1) {
+  MethodTy0 method0 = static_cast<MethodTy0>(method1);
+}
+
+// CHECK: define{{.*}} void @_Z15testConversion3M8Derived0FvvE(
+// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
+
+void testConversion3(MethodTy1 method1) {
+  MethodTy0 method0 = reinterpret_cast<MethodTy0>(method1);
+}
+
+// No need to call @llvm.ptrauth.resign if the source member function
+// pointer is a constant.
+
+// CHECK: define{{.*}} void @_Z15testConversion4v(
+// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
+// CHECK: ret void
+
+void testConversion4() {
+  MethodTy0 method0 = reinterpret_cast<MethodTy0>(&Derived0::virtual1);
+}
+
+// This code used to crash.
+namespace testNonVirtualThunk {
+  struct R {};
+
+  struct B0 {
+    virtual void bar();
+  };
+
+  struct B1 {
+    virtual R foo();
+  };
+
+  struct D : B0, B1 {
+    virtual R foo();
+  };
+
+  D d;
+}
+
+// CHECK: define internal void @_ZN22TestAnonymousNamespace12_GLOBAL__N_11S3fooEv_vfpthunk_(
+
+namespace TestAnonymousNamespace {
+namespace {
+struct S {
+  virtual void foo(){};
+};
+} // namespace
+
+void test() {
+  auto t = &S::foo;
+}
+} // namespace TestAnonymousNamespace
+
+MethodTy1 gmethod0 = reinterpret_cast<MethodTy1>(&Base0::nonvirtual0);
+MethodTy0 gmethod1 = reinterpret_cast<MethodTy0>(&Derived0::nonvirtual5);
+MethodTy0 gmethod2 = reinterpret_cast<MethodTy0>(&Derived0::virtual1);
+
+// CHECK-LABEL: define{{.*}} void @_Z13testArrayInitv()
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p0, ptr align 8 @__const._Z13testArrayInitv.p0, i64 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p1, ptr align 8 @__const._Z13testArrayInitv.p1, i64 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %c0, ptr align 8 @__const._Z13testArrayInitv.c0, i64 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %c1, ptr align 8 @__const._Z13testArrayInitv.c1, i64 16, i1 false)
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %{{.*}} align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %{{.*}}, align 8
+
+void initList(std::initializer_list<MethodTy1>);
+
+void testArrayInit() {
+  MethodTy1 p0[] = {&Base0::nonvirtual0};
+  MethodTy1 p1[] = {&Base0::virtual1};
+  Class0 c0{&Base0::nonvirtual0};
+  Class0 c1{&Base0::virtual1};
+  initList({&Base0::nonvirtual0});
+  initList({&Base0::virtual1});
+}
+
+
+
+// STACK-PROT: define {{.*}}_vfpthunk{{.*}}[[ATTRS:#[0-9]+]]
+// STACK-PROT: attributes [[ATTRS]] =
+// STACK-PROT-NOT: ssp
+// STACK-PROT-NOT: sspstrong
+// STACK-PROT-NOT: sspreq
+// STACK-PROT-NEXT: attributes
+
+// CHECK: define{{.*}} void @_Z15testConvertNullv(
+// CHECK: %[[T:.*]] = alloca { i64, i64 },
+// store { i64, i64 } zeroinitializer, { i64, i64 }* %[[T]],
+
+void testConvertNull() {
+  VariadicMethodTy0 t = (VariadicMethodTy0)(MethodTy0{});
+}
diff --git a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp
index b4a8784a33d8c..2b633addd677e 100644
--- a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp
@@ -1,10 +1,16 @@
-// RUN: %clang_cc1 %s -I%S -triple=arm64-apple-ios -fptrauth-calls -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -I%S -triple=arm64-apple-ios   -fptrauth-calls -std=c++11 -emit-llvm -o - | FileCheck --check-prefix=DARWIN %s
+// RUN: %clang_cc1 %s -I%S -triple=aarch64-linux-gnu -fptrauth-calls -std=c++11 -emit-llvm -o - | FileCheck --check-prefix=ELF %s
+
 #include <typeinfo>
 
 struct A { int a; };
 
-// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
-// CHECK: @_ZTS1A = linkonce_odr hidden constant [3 x i8] c"1A\00"
-// CHECK: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) }
+// DARWIN: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+// DARWIN: @_ZTS1A = linkonce_odr hidden constant [3 x i8] c"1A\00"
+// DARWIN: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) }
+
+// ELF: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+// ELF: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+// ELF: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }
 
 auto ATI = typeid(A);
diff --git a/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
index cad43dc0746df..634450bf62ea9 100644
--- a/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
@@ -2,8 +2,27 @@
 // RUN:  | FileCheck %s --check-prefix=CXAATEXIT
 
 // RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
-// RUN:    -fno-use-cxa-atexit \
-// RUN:  | FileCheck %s --check-prefix=ATEXIT
+// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,ATEXIT_DARWIN
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:  | FileCheck %s --check-prefix=CXAATEXIT
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,ATEXIT_ELF
+
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s \
+// RUN:  -fptrauth-function-pointer-type-discrimination  -o - | FileCheck %s --check-prefix=CXAATEXIT_DISC
+
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:   -fptrauth-function-pointer-type-discrimination  -fno-use-cxa-atexit \
+// RUN:  | FileCheck %s --check-prefixes=ATEXIT_DISC,ATEXIT_DISC_DARWIN
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s \
+// RUN:  -fptrauth-function-pointer-type-discrimination  -o - | FileCheck %s --check-prefix=CXAATEXIT_DISC
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:   -fptrauth-function-pointer-type-discrimination -fno-use-cxa-atexit \
+// RUN:  | FileCheck %s --check-prefixes=ATEXIT_DISC,ATEXIT_DISC_ELF
 
 class Foo {
  public:
@@ -16,9 +35,22 @@ Foo global;
 // CXAATEXIT: define internal void @__cxx_global_var_init()
 // CXAATEXIT:   call i32 @__cxa_atexit(ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0), ptr @global, ptr @__dso_handle)
 
+// CXAATEXIT_DISC: define internal void @__cxx_global_var_init()
+// CXAATEXIT_DISC:   call i32 @__cxa_atexit(ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0, i64 10942), ptr @global, ptr @__dso_handle)
 
 // ATEXIT: define internal void @__cxx_global_var_init()
 // ATEXIT:   %{{.*}} = call i32 @atexit(ptr ptrauth (ptr @__dtor_global, i32 0))
 
-// ATEXIT: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
-// ATEXIT:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_DARWIN: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
+// ATEXIT_ELF:    define internal void @__dtor_global() {{.*}} section ".text.startup" {
+// ATEXIT_DARWIN:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_ELF:      call void @_ZN3FooD1Ev(ptr @global)
+
+// ATEXIT_DISC: define internal void @__cxx_global_var_init()
+// ATEXIT_DISC:   %{{.*}} = call i32 @atexit(ptr ptrauth (ptr @__dtor_global, i32 0, i64 10942))
+
+
+// ATEXIT_DISC_DARWIN: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
+// ATEXIT_DISC_ELF:    define internal void @__dtor_global() {{.*}} section ".text.startup" {
+// ATEXIT_DISC_DARWIN:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_DISC_ELF:      call void @_ZN3FooD1Ev(ptr @global)
diff --git a/clang/test/CodeGenCXX/ptrauth-throw.cpp b/clang/test/CodeGenCXX/ptrauth-throw.cpp
index cea7226547e5a..0e6091a370223 100644
--- a/clang/test/CodeGenCXX/ptrauth-throw.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-throw.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1                                                -triple arm64-apple-ios -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKDISC
+// RUN: %clang_cc1                                                -triple arm64-apple-ios   -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios   -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKDISC
+
+// RUN: %clang_cc1                                                -triple aarch64-linux-gnu -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKDISC
 
 class Foo {
  public:
@@ -7,10 +10,10 @@ class Foo {
   }
 };
 
-// CHECK-LABEL: define void @_Z1fv()
+// CHECK-LABEL: define{{.*}} void @_Z1fv()
 // CHECK:  call void @__cxa_throw(ptr %{{.*}}, ptr @_ZTI3Foo, ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0))
 
-// CHECKDISC-LABEL: define void @_Z1fv()
+// CHECKDISC-LABEL: define{{.*}} void @_Z1fv()
 // CHECKDISC:  call void @__cxa_throw(ptr %{{.*}}, ptr @_ZTI3Foo, ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0, i64 10942))
 
 void f() {
@@ -18,10 +21,10 @@ void f() {
 }
 
 // __cxa_throw is defined to take its destructor as "void (*)(void *)" in the ABI.
-// CHECK-LABEL: define void @__cxa_throw({{.*}})
+// CHECK-LABEL: define{{.*}} void @__cxa_throw({{.*}})
 // CHECK: call void {{%.*}}(ptr noundef {{%.*}}) [ "ptrauth"(i32 0, i64 0) ]
 
-// CHECKDISC-LABEL: define void @__cxa_throw({{.*}})
+// CHECKDISC-LABEL: define{{.*}} void @__cxa_throw({{.*}})
 // CHECKDISC: call void {{%.*}}(ptr noundef {{%.*}}) [ "ptrauth"(i32 0, i64 10942) ]
 
 extern "C" void __cxa_throw(void *exception, void *, void (*dtor)(void *)) {
diff --git a/clang/test/CodeGenCXX/ptrauth-thunks.cpp b/clang/test/CodeGenCXX/ptrauth-thunks.cpp
index a85c8c4c065c4..f4491a3aab7ab 100644
--- a/clang/test/CodeGenCXX/ptrauth-thunks.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-thunks.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s
 
 namespace Test1 {
   struct B1 {
diff --git a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
new file mode 100644
index 0000000000000..174aeda89d175
--- /dev/null
+++ b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -DENABLE_TID=0 -I%S -std=c++11 -triple=arm64e-apple-darwin \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NODISC
+
+// RUN: %clang_cc1 -DENABLE_TID=0 -I%S -std=c++11 -triple=aarch64-linux-gnu \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NODISC
+
+// RUN: %clang_cc1 -DENABLE_TID=1 -I%S -std=c++11 -triple=arm64e-apple-darwin \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,DISC
+
+// RUN: %clang_cc1 -DENABLE_TID=1 -I%S -std=c++11 -triple=aarch64-linux-gnu \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,DISC
+
+// copied from typeinfo
+namespace std {
+
+#if __has_cpp_attribute(clang::ptrauth_vtable_pointer)
+#  if __has_feature(ptrauth_type_info_vtable_pointer_discrimination)
+#    define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH \
+       [[clang::ptrauth_vtable_pointer(process_independent, address_discrimination, type_discrimination)]]
+#  else
+#    define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH \
+       [[clang::ptrauth_vtable_pointer(process_independent, no_address_discrimination, no_extra_discrimination)]]
+#  endif
+#else
+#  define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH
+#endif
+
+  class _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH type_info
+  {
+    type_info& operator=(const type_info&);
+    type_info(const type_info&);
+
+  protected:
+      explicit type_info(const char* __n);
+
+  public:
+      virtual ~type_info();
+
+      virtual void test_method();
+  };
+} // namespace std
+
+static_assert(__has_feature(ptrauth_type_info_vtable_pointer_discrimination) == ENABLE_TID, "incorrect feature state");
+
+// CHECK: @disc_std_type_info = global i32 [[STDTYPEINFO_DISC:45546]]
+extern "C" int disc_std_type_info = __builtin_ptrauth_string_discriminator("_ZTVSt9type_info");
+
+// CHECK: @_ZTV10TestStruct = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI10TestStruct, ptr ptrauth (ptr @_ZN10TestStructD1Ev, i32 0, i64 52216, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN10TestStructD0Ev, i32 0, i64 39671, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 3))] }, align 8
+// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1
+
+// NODISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS10TestStruct }, align 8
+
+// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]]), ptr @_ZTS10TestStruct }, align 8
+
+struct TestStruct {
+  virtual ~TestStruct();
+  int a;
+};
+
+TestStruct::~TestStruct(){}
+
+extern "C" void test_vtable(std::type_info* t) {
+  t->test_method();
+}
+// NODISC: define{{.*}} void @test_vtable(ptr noundef %t)
+// NODISC: [[T_ADDR:%.*]] = alloca ptr, align 8
+// NODISC: store ptr %t, ptr [[T_ADDR]], align 8
+// NODISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
+// NODISC: [[VPTR:%.*]] = load ptr, ptr [[T]], align 8
+// NODISC: [[CAST_VPTR:%.*]] = ptrtoint ptr [[VPTR]] to i64
+// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VPTR]], i32 2, i64 0)
+
+// DISC: define{{.*}} void @test_vtable(ptr noundef %t)
+// DISC: [[T_ADDR:%.*]] = alloca ptr, align 8
+// DISC: store ptr %t, ptr [[T_ADDR]], align 8
+// DISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
+// DISC: [[VPTR:%.*]] = load ptr, ptr [[T]], align 8
+// DISC: [[ADDR:%.*]] = ptrtoint ptr [[T]] to i64
+// DISC: [[DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[ADDR]], i64 [[STDTYPEINFO_DISC]])
+// DISC: [[VPTRI:%.*]] = ptrtoint ptr [[VPTR]] to i64
+// DISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VPTRI]], i32 2, i64 [[DISCRIMINATOR]])
+
+extern "C" const void *ensure_typeinfo() {
+  return new TestStruct;
+}
diff --git a/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp b/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp
index 563d60780769b..4aa24738d8ce3 100644
--- a/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck --check-prefixes=CHECK,DARWIN %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck --check-prefixes=CHECK,ELF    %s
 
 // Check virtual function pointers in vtables are signed.
 
@@ -182,7 +183,8 @@ V1::~V1() {
 // Check sign/authentication of vtable pointers and authentication of virtual
 // functions.
 
-// CHECK-LABEL: define noundef ptr @_ZN2V1D2Ev(
+// DARWIN-LABEL: define noundef ptr @_ZN2V1D2Ev(
+// ELF-LABEL:    define dso_local void @_ZN2V1D2Ev(
 // CHECK: %[[THIS1:.*]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T1:[0-9]+]] = ptrtoint ptr %[[T0]] to i64
@@ -193,7 +195,7 @@ V1::~V1() {
 // CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T7]] to ptr
 // CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS1]]
 
-// CHECK-LABEL: define void @_Z8testB0m0P2B0(
+// CHECK-LABEL: define{{.*}} void @_Z8testB0m0P2B0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -208,7 +210,7 @@ void testB0m0(B0 *a) {
   a->m0();
 }
 
-// CHECK-LABEL: define void @_Z8testB0m1P2B0(
+// CHECK-LABEL: define{{.*}} void @_Z8testB0m1P2B0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -223,7 +225,7 @@ void testB0m1(B0 *a) {
   a->m1();
 }
 
-// CHECK-LABEL: define void @_Z8testB0m2P2B0(
+// CHECK-LABEL: define{{.*}} void @_Z8testB0m2P2B0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -238,7 +240,7 @@ void testB0m2(B0 *a) {
   a->m2();
 }
 
-// CHECK-LABEL: define void @_Z8testD0m0P2D0(
+// CHECK-LABEL: define{{.*}} void @_Z8testD0m0P2D0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -253,7 +255,7 @@ void testD0m0(D0 *a) {
   a->m0();
 }
 
-// CHECK-LABEL: define void @_Z8testD0m1P2D0(
+// CHECK-LABEL: define{{.*}} void @_Z8testD0m1P2D0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -268,7 +270,7 @@ void testD0m1(D0 *a) {
   a->m1();
 }
 
-// CHECK-LABEL: define void @_Z8testD0m2P2D0(
+// CHECK-LABEL: define{{.*}} void @_Z8testD0m2P2D0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -283,7 +285,7 @@ void testD0m2(D0 *a) {
   a->m2();
 }
 
-// CHECK-LABEL: define void @_Z8testD0m3P2D0(
+// CHECK-LABEL: define{{.*}} void @_Z8testD0m3P2D0(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -299,7 +301,7 @@ void testD0m3(D0 *a) {
 }
 
 
-// CHECK-LABEL: define void @_Z8testD1m0P2D1(
+// CHECK-LABEL: define{{.*}} void @_Z8testD1m0P2D1(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -314,7 +316,7 @@ void testD1m0(D1 *a) {
   a->m0();
 }
 
-// CHECK-LABEL: define void @_Z8testD1m1P2D1(
+// CHECK-LABEL: define{{.*}} void @_Z8testD1m1P2D1(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -329,7 +331,7 @@ void testD1m1(D1 *a) {
   a->m1();
 }
 
-// CHECK-LABEL: define void @_Z8testD1m2P2D1(
+// CHECK-LABEL: define{{.*}} void @_Z8testD1m2P2D1(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -345,7 +347,7 @@ void testD1m2(D1 *a) {
 }
 
 
-// CHECK-LABEL: define void @_Z8testD2m0P2D2(
+// CHECK-LABEL: define{{.*}} void @_Z8testD2m0P2D2(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -360,7 +362,7 @@ void testD2m0(D2 *a) {
   a->m0();
 }
 
-// CHECK-LABEL: define void @_Z8testD2m1P2D2(
+// CHECK-LABEL: define{{.*}} void @_Z8testD2m1P2D2(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -375,21 +377,21 @@ void testD2m1(D2 *a) {
   a->m1();
 }
 
-// CHECK-LABEL: define void @_Z10testD2m2D0P2D2(
+// CHECK-LABEL: define{{.*}} void @_Z10testD2m2D0P2D2(
 // CHECK: call void @_ZN2B02m2Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}){{$}}
 
 void testD2m2D0(D2 *a) {
   a->D0::m2();
 }
 
-// CHECK-LABEL: define void @_Z10testD2m2D1P2D2(
+// CHECK-LABEL: define{{.*}} void @_Z10testD2m2D1P2D2(
 // CHECK: call void @_ZN2B02m2Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}){{$}}
 
 void testD2m2D1(D2 *a) {
   a->D1::m2();
 }
 
-// CHECK-LABEL: define void @_Z8testD2m3P2D2(
+// CHECK-LABEL: define{{.*}} void @_Z8testD2m3P2D2(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -404,7 +406,7 @@ void testD2m3(D2 *a) {
   a->m3();
 }
 
-// CHECK-LABEL: define void @_Z8testD3m0P2D3(
+// CHECK-LABEL: define{{.*}} void @_Z8testD3m0P2D3(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -419,7 +421,7 @@ void testD3m0(D3 *a) {
   a->m0();
 }
 
-// CHECK-LABEL: define void @_Z8testD3m1P2D3(
+// CHECK-LABEL: define{{.*}} void @_Z8testD3m1P2D3(
 // CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
 // CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0)
@@ -434,7 +436,7 @@ void testD3m1(D3 *a) {
   a->m1();
 }
 
-// CHECK: define void @_Z8testD3m2P2D3(ptr noundef %[[A:.*]])
+// CHECK: define{{.*}} void @_Z8testD3m2P2D3(ptr noundef %[[A:.*]])
 // CHECK: %[[A_ADDR:.*]] = alloca ptr, align 8
 // CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8
 // CHECK: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]], align 8
@@ -459,7 +461,7 @@ void testD3m2(D3 *a) {
   a->m2();
 }
 
-// CHECK-LABEL: define void @_Z17testD3Destructor0P2D3(
+// CHECK-LABEL: define{{.*}} void @_Z17testD3Destructor0P2D3(
 // CHECK: load ptr, ptr
 // CHECK: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T2:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64
@@ -475,7 +477,7 @@ void testD3Destructor0(D3 *a) {
   delete a;
 }
 
-// CHECK-LABEL: define void @_Z17testD3Destructor1P2D3(
+// CHECK-LABEL: define{{.*}} void @_Z17testD3Destructor1P2D3(
 // CHECK: %[[T6:.*]] = load ptr, ptr %
 // CHECK: %[[VTABLE0:[a-z0-9]+]] = load ptr, ptr %
 // CHECK: %[[T2:[0-9]+]] = ptrtoint ptr %[[VTABLE0]] to i64
@@ -492,14 +494,15 @@ void testD3Destructor0(D3 *a) {
 // CHECK: %[[T12:[0-9]+]] = load ptr, ptr %[[VFN]]
 // CHECK: %[[T13:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64
 // CHECK: %[[T14:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T13]], i64 57279)
-// CHECK: %call = call noundef ptr %[[T12]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T14]]) ]
+// DARWIN: %call = call noundef ptr %[[T12]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T14]]) ]
+// ELF:    call void %[[T12]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T14]]) ]
 // CHECK: call void @_ZdlPv(ptr noundef %[[T7]])
 
 void testD3Destructor1(D3 *a) {
   ::delete a;
 }
 
-// CHECK-LABEL: define void @_Z17testD3Destructor2P2D3(
+// CHECK-LABEL: define{{.*}} void @_Z17testD3Destructor2P2D3(
 // CHECK: load ptr, ptr
 // CHECK: %[[VTABLE:.*]] = load ptr, ptr %
 // CHECK: %[[T2:.*]] = ptrtoint ptr %[[VTABLE]] to i64
@@ -509,7 +512,8 @@ void testD3Destructor1(D3 *a) {
 // CHECK: %[[T5:.*]] = load ptr, ptr %[[VFN]]
 // CHECK: %[[T6:.*]] = ptrtoint ptr %[[VFN]] to i64
 // CHECK: %[[T7:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 57279)
-// CHECK: %call = call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T7]]) ]
+// DARWIN: %call = call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T7]]) ]
+// ELF:    call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T7]]) ]
 
 void testD3Destructor2(D3 *a) {
   a->~D3();
@@ -526,23 +530,27 @@ void materializeConstructors() {
   V1 V1;
 }
 
-// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2B0C2Ev(
+// DARWIN-LABEL: define linkonce_odr noundef ptr @_ZN2B0C2Ev(
+// ELF-LABEL:    define linkonce_odr void @_ZN2B0C2Ev(
 // CHECK: %[[THIS:.*]] = load ptr, ptr %
 // CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 40) ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 2) to i64), i32 2, i64 0)
 // CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr
 // CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]]
 
-// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2D0C2Ev(
+// DARWIN-LABEL: define linkonce_odr noundef ptr @_ZN2D0C2Ev(
+// ELF-LABEL:    define linkonce_odr void @_ZN2D0C2Ev(
 // CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 56) ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 2) to i64), i32 2, i64 0)
 // CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr
 // CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]]
 
-// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2D1C2Ev(
+// DARWIN-LABEL: define linkonce_odr noundef ptr @_ZN2D1C2Ev(
+// ELF-LABEL:    define linkonce_odr void @_ZN2D1C2Ev(
 // CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 48) ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 2) to i64), i32 2, i64 0)
 // CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr
 // CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]]
 
-// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2D2C2Ev(
+// DARWIN-LABEL: define linkonce_odr noundef ptr @_ZN2D2C2Ev(
+// ELF-LABEL:    define linkonce_odr void @_ZN2D2C2Ev(
 // CHECK: %[[SLOT0:.*]] = load ptr, ptr
 // CHECK: %[[SIGN_VTADDR0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 56) ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 2) to i64), i32 2, i64 0)
 // CHECK: %[[T1:[0-9]+]] = inttoptr i64 %[[SIGN_VTADDR0]] to ptr
@@ -552,7 +560,8 @@ void materializeConstructors() {
 // CHECK: %[[T5:[0-9]+]] = inttoptr i64 %[[SIGN_VTADDR1]] to ptr
 // CHECK: store ptr %[[T5]], ptr %[[T3]]
 
-// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2V0C2Ev(
+// DARWIN-LABEL: define linkonce_odr noundef ptr @_ZN2V0C2Ev(
+// ELF-LABEL:    define linkonce_odr void @_ZN2V0C2Ev(
 // CHECK: %[[THIS1]] = load ptr, ptr %
 // CHECK: %[[VTT:[a-z0-9]+]] = load ptr, ptr %{{.*}}
 // CHECK: %[[T0:[0-9]+]] = load ptr, ptr %[[VTT]]
diff --git a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp
index 00b1cbd06e0f0..031bb48608af7 100644
--- a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-intrinsics -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -emit-llvm -O0 -disable-llvm-passes -o - | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -disable-llvm-passes -fptrauth-intrinsics -fptrauth-calls \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination -emit-llvm -O0 -o - | FileCheck --check-prefixes=CHECK,DARWIN %s
+// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -disable-llvm-passes -fptrauth-intrinsics -fptrauth-calls \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination -emit-llvm -O0 -o - | FileCheck --check-prefixes=CHECK,ELF %s
 
 // The actual vtable construction
 
@@ -103,9 +106,11 @@
 
 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
 
-// CHECK: @_ZTS1B = linkonce_odr hidden constant [3 x i8] c"1B\00", align 1
+// DARWIN: @_ZTS1B = linkonce_odr hidden constant [3 x i8] c"1B\00", align 1
+// ELF:    @_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00", comdat, align 1
 
-// CHECK: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8
+// DARWIN: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8
+// ELF:    @_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8
 
 // CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8
 
@@ -177,7 +182,8 @@
 // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 3)),
 // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 4)),
 // CHECK-SAME: ptr ptrauth (ptr @_ZN1BD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 5)),
-// CHECK-SAME: ptr ptrauth (ptr @_ZN1BD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 6))] }, align 8
+// DARWIN-SAME: ptr ptrauth (ptr @_ZN1BD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 6))] }, align 8
+// ELF-SAME:    ptr ptrauth (ptr @_ZN1BD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 6))] }, comdat, align 8
 
 extern "C" int printf(const char *format, ...);
 
@@ -284,13 +290,15 @@ int main() {
 }
 
 // And check the thunks
-// CHECK: ptr @_ZTv0_n48_N1CD1Ev(ptr noundef %this)
+// DARWIN: ptr @_ZTv0_n48_N1CD1Ev(ptr noundef %this)
+// ELF:    void @_ZTv0_n48_N1CD1Ev(ptr noundef %this)
 // CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
 
 // CHECK: void @_ZTv0_n48_N1CD0Ev(ptr noundef %this)
 // CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
 
-// CHECK: ptr @_ZTv0_n48_N1DD1Ev(ptr noundef %this)
+// DARWIN: ptr @_ZTv0_n48_N1DD1Ev(ptr noundef %this)
+// ELF:    void @_ZTv0_n48_N1DD1Ev(ptr noundef %this)
 // CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
 
 // CHECK: void @_ZTv0_n48_N1DD0Ev(ptr noundef %this)
diff --git a/clang/test/CodeGenCXX/regparm.cpp b/clang/test/CodeGenCXX/regparm.cpp
index 1fd471c2d0727..b9735485db8de 100644
--- a/clang/test/CodeGenCXX/regparm.cpp
+++ b/clang/test/CodeGenCXX/regparm.cpp
@@ -32,7 +32,7 @@ struct S3 {
   } a;
 };
 __attribute((regparm(2))) void foo4(S3 a, int b);
-// CHECK: declare void @_Z4foo42S3i(ptr noundef byval(%struct.S3) align 4, i32 inreg noundef)
+// CHECK: declare void @_Z4foo42S3i(i32 inreg noundef)
 void bar3(S3 a, int b) {
   foo4(a, b);
 }
diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp
index 3012b0f2bc33d..54912a617c287 100644
--- a/clang/test/CodeGenCXX/trivial_abi.cpp
+++ b/clang/test/CodeGenCXX/trivial_abi.cpp
@@ -262,6 +262,26 @@ void testExceptionLarge() {
   calleeExceptionLarge(Large(), Large());
 }
 
+// CHECK: define void @_ZN7GH930401gEPNS_1SE
+// CHECK: [[CALL:%.*]] = call i64 @_ZN7GH930401fEv
+// CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[CALL]] to i56
+// CHECK-NEXT: store i56 [[TRUNC]]
+// CHECK-NEXT: ret void
+void* operator new(unsigned long, void*);
+namespace GH93040 {
+struct [[clang::trivial_abi]] S {
+  char a;
+  int x;
+  __attribute((aligned(2))) char y;
+  S();
+} __attribute((packed));
+S f();
+void g(S* s) { new(s) S(f()); }
+struct S2 { [[no_unique_address]] S s; char c;};
+static_assert(sizeof(S) == 8 && sizeof(S2) == 8, "");
+}
+
+
 // PR42961
 
 // CHECK: define{{.*}} @"_ZN3$_08__invokeEv"()
diff --git a/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp b/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp
index 72c59cb41c34b..c891ff0a4fa42 100644
--- a/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp
+++ b/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux -emit-llvm -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=ITANIUM
 // RUN: %clang_cc1 -std=c++11 -triple x86_64-windows -emit-llvm -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=MSABI  --check-prefix=CHECK-VPTR-MS
 // RUN: %clang_cc1 -std=c++11 -triple arm64e-ios-13 -emit-llvm -fptrauth-intrinsics -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=ITANIUM --check-prefix=CHECK-PTRAUTH
+// RUN: %clang_cc1 -std=c++11 -triple aarch64-unknown-linux -emit-llvm -fptrauth-intrinsics -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=ITANIUM --check-prefix=CHECK-PTRAUTH
+
 struct T {
   virtual ~T() {}
   virtual int v() { return 1; }
@@ -29,7 +31,7 @@ int get_v(T* t) {
   // CHECK-NULL: load ptr, ptr {{.*}}
 
   // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr %vtable to i64
-  // CHECK-PTRAUTH: [[STRIPPED_VTABLE:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[CAST_VTABLE]], i32 0), !nosanitize !2
+  // CHECK-PTRAUTH: [[STRIPPED_VTABLE:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[CAST_VTABLE]], i32 0), !nosanitize
   // CHECK-PTRAUTH: [[STRIPPED_PTR:%.*]] = inttoptr i64 [[STRIPPED_VTABLE]] to ptr
   // CHECK-PTRAUTH: [[STRIPPED_INT:%.*]] = ptrtoint ptr [[STRIPPED_PTR]] to i64
   // Make sure authed vtable pointer feeds into hashing
diff --git a/clang/test/CodeGenCXX/x86_32-vaarg.cpp b/clang/test/CodeGenCXX/x86_32-vaarg.cpp
index a3f2791484362..90665652a0286 100644
--- a/clang/test/CodeGenCXX/x86_32-vaarg.cpp
+++ b/clang/test/CodeGenCXX/x86_32-vaarg.cpp
@@ -18,3 +18,25 @@ empty empty_record_test(int z, ...) {
   __builtin_va_start(list, z);
   return __builtin_va_arg(list, empty);
 }
+
+typedef struct {
+  struct {
+    int a[0];
+  } b;
+} SortOfEmpty;
+
+// CHECK-LABEL: @_Z18test_sort_of_emptyiz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[AGG_RESULT:%.*]], ptr [[RESULT_PTR]], align 4
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[LIST]])
+// CHECK-NEXT:    ret void
+//
+SortOfEmpty test_sort_of_empty(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, SortOfEmpty);
+}
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
new file mode 100644
index 0000000000000..54063cf0704aa
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: x86_64-linux
+// This tests that the coroutine elide optimization could happen succesfully with ThinLTO.
+// This test is adapted from coro-elide.cpp and splits functions into two files.
+//
+// RUN: split-file %s %t
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o %t/coro-elide-callee.bc
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o %t/coro-elide-caller.bc
+// RUN: llvm-lto --thinlto %t/coro-elide-callee.bc %t/coro-elide-caller.bc -o %t/summary
+// RUN: %clang_cc1 -O2 -x ir %t/coro-elide-caller.bc -fthinlto-index=%t/summary.thinlto.bc -emit-llvm -o - | FileCheck %s
+
+//--- coro-elide-task.h
+#pragma once
+#include "Inputs/coroutine.h"
+
+struct Task {
+  struct promise_type {
+    struct FinalAwaiter {
+      bool await_ready() const noexcept { return false; }
+      template <typename PromiseType>
+      std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+        if (!h)
+          return std::noop_coroutine();
+        return h.promise().continuation;
+      }
+      void await_resume() noexcept {}
+    };
+    Task get_return_object() noexcept {
+      return std::coroutine_handle<promise_type>::from_promise(*this);
+    }
+    std::suspend_always initial_suspend() noexcept { return {}; }
+    FinalAwaiter final_suspend() noexcept { return {}; }
+    void unhandled_exception() noexcept {}
+    void return_value(int x) noexcept {
+      _value = x;
+    }
+    std::coroutine_handle<> continuation;
+    int _value;
+  };
+
+  Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
+  ~Task() {
+    if (handle)
+      handle.destroy();
+  }
+
+  struct Awaiter {
+    bool await_ready() const noexcept { return false; }
+    void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
+    int await_resume() noexcept {
+      return 43;
+    }
+  };
+
+  auto operator co_await() {
+    return Awaiter{};
+  }
+
+private:
+  std::coroutine_handle<promise_type> handle;
+};
+
+//--- coro-elide-callee.cpp
+#include "coro-elide-task.h"
+Task task0() {
+  co_return 43;
+}
+
+//--- coro-elide-caller.cpp
+#include "coro-elide-task.h"
+
+Task task0();
+
+Task task1() {
+  co_return co_await task0();
+}
+
+// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
+// CHECK-NOT: {{.*}}_Znwm
diff --git a/clang/test/CodeGenHIP/dpp-const-fold.hip b/clang/test/CodeGenHIP/dpp-const-fold.hip
index f5a97c6b0e77f..c5450ec4b8413 100644
--- a/clang/test/CodeGenHIP/dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/dpp-const-fold.hip
@@ -22,25 +22,25 @@ constexpr static bool BountCtrl()
     return true & false;
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
index 2b785200e8eea..71270bc1c68d8 100644
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -21,25 +21,25 @@ constexpr static bool BountCtrl()
     return true & false;
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
index 7c4d1468e96d2..b457f5c278791 100644
--- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
@@ -1,53 +1,84 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,SPIR_CHECK,NATIVE_HALF,SPIR_NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,SPIR_CHECK,NO_HALF,SPIR_NO_HALF
 
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: %dx.frac = call half @llvm.dx.frac.f16(
-// NATIVE_HALF: ret half %dx.frac
-// NO_HALF: define noundef float @"?test_frac_half@@YA$halff@$halff@@Z"(
-// NO_HALF: %dx.frac = call float @llvm.dx.frac.f32(
-// NO_HALF: ret float %dx.frac
+// DXIL_NATIVE_HALF: define noundef half @
+// SPIR_NATIVE_HALF: define spir_func noundef half @
+// DXIL_NATIVE_HALF: %hlsl.frac = call half @llvm.dx.frac.f16(
+// SPIR_NATIVE_HALF: %hlsl.frac = call half @llvm.spv.frac.f16(
+// NATIVE_HALF: ret half %hlsl.frac
+// DXIL_NO_HALF: define noundef float @
+// SPIR_NO_HALF: define spir_func noundef float @
+// DXIL_NO_HALF: %hlsl.frac = call float @llvm.dx.frac.f32(
+// SPIR_NO_HALF: %hlsl.frac = call float @llvm.spv.frac.f32(
+// NO_HALF: ret float %hlsl.frac
 half test_frac_half(half p0) { return frac(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: %dx.frac = call <2 x half> @llvm.dx.frac.v2f16
-// NATIVE_HALF: ret <2 x half> %dx.frac
-// NO_HALF: define noundef <2 x float> @
-// NO_HALF: %dx.frac = call <2 x float> @llvm.dx.frac.v2f32(
-// NO_HALF: ret <2 x float> %dx.frac
+// DXIL_NATIVE_HALF: define noundef <2 x half> @
+// SPIR_NATIVE_HALF: define spir_func noundef <2 x half> @
+// DXIL_NATIVE_HALF: %hlsl.frac = call <2 x half> @llvm.dx.frac.v2f16
+// SPIR_NATIVE_HALF: %hlsl.frac = call <2 x half> @llvm.spv.frac.v2f16
+// NATIVE_HALF: ret <2 x half> %hlsl.frac
+// DXIL_NO_HALF: define noundef <2 x float> @
+// SPIR_NO_HALF: define spir_func noundef <2 x float> @
+// DXIL_NO_HALF: %hlsl.frac = call <2 x float> @llvm.dx.frac.v2f32(
+// SPIR_NO_HALF: %hlsl.frac = call <2 x float> @llvm.spv.frac.v2f32(
+// NO_HALF: ret <2 x float> %hlsl.frac
 half2 test_frac_half2(half2 p0) { return frac(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: %dx.frac = call <3 x half> @llvm.dx.frac.v3f16
-// NATIVE_HALF: ret <3 x half> %dx.frac
-// NO_HALF: define noundef <3 x float> @
-// NO_HALF: %dx.frac = call <3 x float> @llvm.dx.frac.v3f32(
-// NO_HALF: ret <3 x float> %dx.frac
+// DXIL_NATIVE_HALF: define noundef <3 x half> @
+// SPIR_NATIVE_HALF: define spir_func noundef <3 x half> @
+// DXIL_NATIVE_HALF: %hlsl.frac = call <3 x half> @llvm.dx.frac.v3f16
+// SPIR_NATIVE_HALF: %hlsl.frac = call <3 x half> @llvm.spv.frac.v3f16
+// NATIVE_HALF: ret <3 x half> %hlsl.frac
+// DXIL_NO_HALF: define noundef <3 x float> @
+// SPIR_NO_HALF: define spir_func noundef <3 x float> @
+// DXIL_NO_HALF: %hlsl.frac = call <3 x float> @llvm.dx.frac.v3f32(
+// SPIR_NO_HALF: %hlsl.frac = call <3 x float> @llvm.spv.frac.v3f32(
+// NO_HALF: ret <3 x float> %hlsl.frac
 half3 test_frac_half3(half3 p0) { return frac(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: %dx.frac = call <4 x half> @llvm.dx.frac.v4f16
-// NATIVE_HALF: ret <4 x half> %dx.frac
-// NO_HALF: define noundef <4 x float> @
-// NO_HALF: %dx.frac = call <4 x float> @llvm.dx.frac.v4f32(
-// NO_HALF: ret <4 x float> %dx.frac
+// DXIL_NATIVE_HALF: define noundef <4 x half> @
+// SPIR_NATIVE_HALF: define spir_func noundef <4 x half> @
+// DXIL_NATIVE_HALF: %hlsl.frac = call <4 x half> @llvm.dx.frac.v4f16
+// SPIR_NATIVE_HALF: %hlsl.frac = call <4 x half> @llvm.spv.frac.v4f16
+// NATIVE_HALF: ret <4 x half> %hlsl.frac
+// DXIL_NO_HALF: define noundef <4 x float> @
+// SPIR_NO_HALF: define spir_func noundef <4 x float> @
+// DXIL_NO_HALF: %hlsl.frac = call <4 x float> @llvm.dx.frac.v4f32(
+// SPIR_NO_HALF: %hlsl.frac = call <4 x float> @llvm.spv.frac.v4f32(
+// NO_HALF: ret <4 x float> %hlsl.frac
 half4 test_frac_half4(half4 p0) { return frac(p0); }
 
-// CHECK: define noundef float @
-// CHECK: %dx.frac = call float @llvm.dx.frac.f32(
-// CHECK: ret float %dx.frac
+// DXIL_CHECK: define noundef float @
+// SPIR_CHECK: define spir_func noundef float @
+// DXIL_CHECK: %hlsl.frac = call float @llvm.dx.frac.f32(
+// SPIR_CHECK: %hlsl.frac = call float @llvm.spv.frac.f32(
+// CHECK: ret float %hlsl.frac
 float test_frac_float(float p0) { return frac(p0); }
-// CHECK: define noundef <2 x float> @
-// CHECK: %dx.frac = call <2 x float> @llvm.dx.frac.v2f32
-// CHECK: ret <2 x float> %dx.frac
+// DXIL_CHECK: define noundef <2 x float> @
+// SPIR_CHECK: define spir_func noundef <2 x float> @
+// DXIL_CHECK: %hlsl.frac = call <2 x float> @llvm.dx.frac.v2f32
+// SPIR_CHECK: %hlsl.frac = call <2 x float> @llvm.spv.frac.v2f32
+// CHECK: ret <2 x float> %hlsl.frac
 float2 test_frac_float2(float2 p0) { return frac(p0); }
-// CHECK: define noundef <3 x float> @
-// CHECK: %dx.frac = call <3 x float> @llvm.dx.frac.v3f32
-// CHECK: ret <3 x float> %dx.frac
+// DXIL_CHECK: define noundef <3 x float> @
+// SPIR_CHECK: define spir_func noundef <3 x float> @
+// DXIL_CHECK: %hlsl.frac = call <3 x float> @llvm.dx.frac.v3f32
+// SPIR_CHECK: %hlsl.frac = call <3 x float> @llvm.spv.frac.v3f32
+// CHECK: ret <3 x float> %hlsl.frac
 float3 test_frac_float3(float3 p0) { return frac(p0); }
-// CHECK: define noundef <4 x float> @
-// CHECK: %dx.frac = call <4 x float> @llvm.dx.frac.v4f32
-// CHECK: ret <4 x float> %dx.frac
+// DXIL_CHECK: define noundef <4 x float> @
+// SPIR_CHECK: define spir_func noundef <4 x float> @
+// DXIL_CHECK: %hlsl.frac = call <4 x float> @llvm.dx.frac.v4f32
+// SPIR_CHECK: %hlsl.frac = call <4 x float> @llvm.spv.frac.v4f32
+// CHECK: ret <4 x float> %hlsl.frac
 float4 test_frac_float4(float4 p0) { return frac(p0); }
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 385f8a753cd8e..1651cb379a20f 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -145,7 +145,9 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
 
 // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
 // AMDGCN:  %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN:  store %struct.LargeStructOneMember %u.coerce, ptr addrspace(5) %[[U]], align 8
+// AMDGCN:  %[[U_ELEM:.*]] = getelementptr inbounds %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
+// AMDGCN:  %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
+// AMDGCN:  store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
 // AMDGCN:  call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
 kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
   FuncOneLargeMember(u);
@@ -177,7 +179,12 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
 // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
 // AMDGCN-SAME:  (%struct.LargeStructTwoMember %[[u_coerce:.*]])
 // AMDGCN:  %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN:  store %struct.LargeStructTwoMember %[[u_coerce]], ptr addrspace(5) %[[u]]
+// AMDGCN:  %[[U_PTR0:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
+// AMDGCN:  %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
+// AMDGCN:  store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
+// AMDGCN:  %[[U_PTR1:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
+// AMDGCN:  %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
+// AMDGCN:  store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
 // AMDGCN:  call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
 kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
   FuncLargeTwoMember(u);
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index fa83a38a01b0a..fe0a2f9578db0 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -61,7 +61,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // the return value.
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -74,7 +74,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
-// AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @foo([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
 // AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
 // AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
@@ -98,7 +98,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -168,7 +168,7 @@ void test_indirect_arg_globl(void) {
 #endif
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local
-// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space !9 !kernel_arg_access_qual !9 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !9 {
+// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
@@ -193,7 +193,7 @@ void test_indirect_arg_private(void) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
@@ -208,7 +208,7 @@ kernel void KernelOneMember(struct StructOneMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !14 !kernel_arg_access_qual !11 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
@@ -223,10 +223,12 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    store [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
 // AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
@@ -271,15 +273,20 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    store [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR3]]
+// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 kernel void KernelTwoMember(struct StructTwoMember u) {
@@ -287,10 +294,15 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    store [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
 // AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
new file mode 100644
index 0000000000000..474e95e74e006
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -0,0 +1,141 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -cl-ext=+__opencl_c_generic_address_space \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca(unsigned n) {
+    __private float* alloc_ptr = (__private float*)__builtin_alloca(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_uninitialized(unsigned n) {
+    __private float* alloc_ptr_uninitialized = (__private float*)__builtin_alloca_uninitialized(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_with_align(unsigned n) {
+    __private float* alloc_ptr_align = (__private float*)__builtin_alloca_with_align((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
+    __private float* alloc_ptr_align_uninitialized = (__private float*)__builtin_alloca_with_align_uninitialized((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca(unsigned n) {
+    __private void *alloc_ptr = __builtin_alloca(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_uninitialized(unsigned n) {
+    __private void *alloc_ptr_uninitialized = __builtin_alloca_uninitialized(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_with_align(unsigned n) {
+    __private void *alloc_ptr_align = __builtin_alloca_with_align(n, 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_with_align_uninitialized(unsigned n) {
+    __private void *alloc_ptr_align_uninitialized = __builtin_alloca_with_align_uninitialized(n, 8);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
index 1fcb1d721ad72..8242ae6a98c40 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -5,13 +5,22 @@
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y   __attribute__((ext_vector_type(8)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef  half  v4h   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4y   __attribute__((ext_vector_type(4)));
 
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr,
+                           global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr)
 {
   v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
   v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
 
-  int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
-  v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+  v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4y out_8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
index 7a36881c051b1..6f7a93ef897ac 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -4,9 +4,13 @@
 // REQUIRES: amdgpu-registered-target
 
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef  half  v4h   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4y   __attribute__((ext_vector_type(4)));
 
-void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr)
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr)
 {
   int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
   v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4y out_4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
index 9155ee6e61822..b7323f1b41c2a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -5,9 +5,13 @@
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y   __attribute__((ext_vector_type(8)));
 
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr)
 {
   v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
   v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index ce8b2c2c7c5ba..186fc4eacfaaf 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -4,6 +4,8 @@
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y   __attribute__((ext_vector_type(8)));
 
 // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
 // CHECK-GFX1200-NEXT:  entry:
@@ -24,3 +26,23 @@ v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
 {
   return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
 }
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_global_load_tr_b128_v8bf16(global v8y* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index b0eed07627f41..b6627f1c8114d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef  half  v4h   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4y   __attribute__((ext_vector_type(4)));
 
 // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
 // CHECK-GFX1200-NEXT:  entry:
@@ -23,3 +25,23 @@ v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
 {
   return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
 }
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <4 x half> [[TMP0]]
+//
+v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <4 x bfloat> [[TMP0]]
+//
+v4y test_amdgcn_global_load_tr_b128_v4bf16(global v4y* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl
new file mode 100644
index 0000000000000..3403b69e07e4b
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl
@@ -0,0 +1,172 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned int v2u32 __attribute__((ext_vector_type(2)));
+typedef unsigned int v3u32 __attribute__((ext_vector_type(3)));
+typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
+u8 test_amdgcn_raw_ptr_buffer_load_b8(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+u16 test_amdgcn_raw_ptr_buffer_load_b16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+u32 test_amdgcn_raw_ptr_buffer_load_b32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2u32 test_amdgcn_raw_ptr_buffer_load_b64(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3u32 test_amdgcn_raw_ptr_buffer_load_b96(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_raw_ptr_buffer_load_b128(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
+u8 test_amdgcn_raw_ptr_buffer_load_b8_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b8(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+u16 test_amdgcn_raw_ptr_buffer_load_b16_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b16(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+u32 test_amdgcn_raw_ptr_buffer_load_b32_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b32(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2u32 test_amdgcn_raw_ptr_buffer_load_b64_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b64(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b96(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b128(rsrc, offset, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
+u8 test_amdgcn_raw_ptr_buffer_load_b8_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+u16 test_amdgcn_raw_ptr_buffer_load_b16_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+u32 test_amdgcn_raw_ptr_buffer_load_b32_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2u32 test_amdgcn_raw_ptr_buffer_load_b64_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, soffset, /*aux=*/0);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl
index 37975d59730c5..097c545917270 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl
@@ -2,19 +2,19 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
 
-typedef char i8;
-typedef short i16;
-typedef int i32;
-typedef int i64 __attribute__((ext_vector_type(2)));
-typedef int i96 __attribute__((ext_vector_type(3)));
-typedef int i128 __attribute__((ext_vector_type(4)));
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned int v2u32 __attribute__((ext_vector_type(2)));
+typedef unsigned int v3u32 __attribute__((ext_vector_type(3)));
+typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));
 
 // CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b8(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b8(u8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -23,7 +23,7 @@ void test_amdgcn_raw_ptr_buffer_store_b8(i8 vdata, __amdgpu_buffer_rsrc_t rsrc,
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b16(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b16(u16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -32,7 +32,7 @@ void test_amdgcn_raw_ptr_buffer_store_b16(i16 vdata, __amdgpu_buffer_rsrc_t rsrc
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b32(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b32(u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -41,7 +41,7 @@ void test_amdgcn_raw_ptr_buffer_store_b32(i32 vdata, __amdgpu_buffer_rsrc_t rsrc
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b64(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b64(v2u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -50,7 +50,7 @@ void test_amdgcn_raw_ptr_buffer_store_b64(i64 vdata, __amdgpu_buffer_rsrc_t rsrc
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b96(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b96(v3u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -59,7 +59,7 @@ void test_amdgcn_raw_ptr_buffer_store_b96(i96 vdata, __amdgpu_buffer_rsrc_t rsrc
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b128(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b128(v4u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -68,7 +68,7 @@ void test_amdgcn_raw_ptr_buffer_store_b128(i128 vdata, __amdgpu_buffer_rsrc_t rs
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b8_non_const_offset(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b8_non_const_offset(u8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -77,7 +77,7 @@ void test_amdgcn_raw_ptr_buffer_store_b8_non_const_offset(i8 vdata, __amdgpu_buf
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b16_non_const_offset(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b16_non_const_offset(u16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -86,7 +86,7 @@ void test_amdgcn_raw_ptr_buffer_store_b16_non_const_offset(i16 vdata, __amdgpu_b
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b32_non_const_offset(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b32_non_const_offset(u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -95,7 +95,7 @@ void test_amdgcn_raw_ptr_buffer_store_b32_non_const_offset(i32 vdata, __amdgpu_b
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b64_non_const_offset(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b64_non_const_offset(v2u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -104,7 +104,7 @@ void test_amdgcn_raw_ptr_buffer_store_b64_non_const_offset(i64 vdata, __amdgpu_b
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b96_non_const_offset(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b96_non_const_offset(v3u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -113,7 +113,7 @@ void test_amdgcn_raw_ptr_buffer_store_b96_non_const_offset(i96 vdata, __amdgpu_b
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b128_non_const_offset(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b128_non_const_offset(v4u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0);
 }
 
@@ -122,7 +122,7 @@ void test_amdgcn_raw_ptr_buffer_store_b128_non_const_offset(i128 vdata, __amdgpu
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b8_non_const_soffset(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b8_non_const_soffset(u8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
 
@@ -131,7 +131,7 @@ void test_amdgcn_raw_ptr_buffer_store_b8_non_const_soffset(i8 vdata, __amdgpu_bu
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b16_non_const_soffset(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b16_non_const_soffset(u16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
 
@@ -140,7 +140,7 @@ void test_amdgcn_raw_ptr_buffer_store_b16_non_const_soffset(i16 vdata, __amdgpu_
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b32_non_const_soffset(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b32_non_const_soffset(u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
 
@@ -149,7 +149,7 @@ void test_amdgcn_raw_ptr_buffer_store_b32_non_const_soffset(i32 vdata, __amdgpu_
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b64_non_const_soffset(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b64_non_const_soffset(v2u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
 
@@ -158,7 +158,7 @@ void test_amdgcn_raw_ptr_buffer_store_b64_non_const_soffset(i64 vdata, __amdgpu_
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b96_non_const_soffset(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b96_non_const_soffset(v3u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
 
@@ -167,6 +167,6 @@ void test_amdgcn_raw_ptr_buffer_store_b96_non_const_soffset(i96 vdata, __amdgpu_
 // CHECK-NEXT:    tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_amdgcn_raw_ptr_buffer_store_b128_non_const_soffset(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+void test_amdgcn_raw_ptr_buffer_store_b128_non_const_soffset(v4u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
   __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl
index d7bffdad5c548..8150bc1ac9e2d 100644
--- a/clang/test/CodeGenOpenCL/builtins-f16.cl
+++ b/clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -6,6 +6,15 @@
 void test_half_builtins(half h0, half h1, half h2, int i0) {
   volatile half res;
 
+  // CHECK: call half @llvm.acos.f16(half %h0)
+  res = __builtin_acosf16(h0);
+
+  // CHECK: call half @llvm.asin.f16(half %h0)
+  res = __builtin_asinf16(h0);
+
+  // CHECK: call half @llvm.atan.f16(half %h0)
+  res = __builtin_atanf16(h0);
+
   // CHECK: call half @llvm.copysign.f16(half %h0, half %h1)
   res = __builtin_copysignf16(h0, h1);
 
@@ -18,6 +27,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.cos.f16(half %h0)
   res = __builtin_cosf16(h0);
 
+  // CHECK: call half @llvm.cosh.f16(half %h0)
+  res = __builtin_coshf16(h0);
+
   // CHECK: call half @llvm.exp.f16(half %h0)
   res = __builtin_expf16(h0);
 
@@ -63,12 +75,18 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.sin.f16(half %h0)
   res = __builtin_sinf16(h0);
 
+  // CHECK: call half @llvm.sinh.f16(half %h0)
+  res = __builtin_sinhf16(h0);
+
   // CHECK: call half @llvm.sqrt.f16(half %h0)
   res = __builtin_sqrtf16(h0);
 
   // CHECK: call half @llvm.tan.f16(half %h0)
   res = __builtin_tanf16(h0);
 
+  // CHECK: call half @llvm.tanh.f16(half %h0)
+  res = __builtin_tanhf16(h0);
+
   // CHECK: call half @llvm.trunc.f16(half %h0)
   res = __builtin_truncf16(h0);
 
diff --git a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
index 2751caa973072..a5f0019dbc1e5 100644
--- a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
+++ b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
 // RUN: %clang_cc1 %s -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
 // RUN: %clang_cc1 %s -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
 // RUN: %clang_cc1 %s -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
 // RUN: %clang_cc1 %s -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 %s -DGEN_PCH=1 -finclude-default-header -triple spir-unknown-unknown -emit-pch -o %t.pch
 // RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
 // RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
 // RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
 // RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
 // RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
diff --git a/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-gnu/.keep b/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-gnu/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-pauthtest/.keep b/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-pauthtest/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Ofast.c b/clang/test/Driver/Ofast.c
index 8b7f2217eca2f..91de296a4c3ff 100644
--- a/clang/test/Driver/Ofast.c
+++ b/clang/test/Driver/Ofast.c
@@ -3,19 +3,21 @@
 // RUN: %clang -fno-fast-math -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
 // RUN: %clang -fno-strict-aliasing -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
 // RUN: %clang -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
-// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \
+// RUN: %clang -Ofast -O2 -### -Werror %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \
 // RUN:  %if target={{.*-windows-msvc.*}} %{ --check-prefix=CHECK-OFAST-O2-ALIASING-MSVC %} \
 // RUN:  %else %{ --check-prefix=CHECK-OFAST-O2-ALIASING %} %s
 // RUN: %clang -Ofast -fno-fast-math -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-FAST-MATH %s
 // RUN: %clang -Ofast -fno-strict-aliasing -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-STRICT-ALIASING %s
 // RUN: %clang -Ofast -fno-vectorize -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-VECTORIZE %s
 
+// CHECK-OFAST: use '-O3 -ffast-math' for the same behavior, or '-O3' to enable only conforming optimizations
 // CHECK-OFAST: -cc1
 // CHECK-OFAST-NOT: -relaxed-aliasing
 // CHECK-OFAST: -ffast-math
 // CHECK-OFAST: -Ofast
 // CHECK-OFAST: -vectorize-loops
 
+// Lack of warning about '-Ofast' deprecation is checked via -Werror
 // CHECK-OFAST-O2: -cc1
 // CHECK-OFAST-O2-ALIASING-NOT: -relaxed-aliasing
 // CHECK-OFAST-O2-ALIASING-MSVC: -relaxed-aliasing
@@ -23,18 +25,21 @@
 // CHECK-OFAST-O2-NOT: -Ofast
 // CHECK-OFAST-O2: -vectorize-loops
 
+// CHECK-OFAST-NO-FAST-MATH: use '-O3 -ffast-math' for the same behavior, or '-O3' to enable only conforming optimizations
 // CHECK-OFAST-NO-FAST-MATH: -cc1
 // CHECK-OFAST-NO-FAST-MATH-NOT: -relaxed-aliasing
 // CHECK-OFAST-NO-FAST-MATH-NOT: -ffast-math
 // CHECK-OFAST-NO-FAST-MATH: -Ofast
 // CHECK-OFAST-NO-FAST-MATH: -vectorize-loops
 
+// CHECK-OFAST-NO-STRICT-ALIASING: use '-O3 -ffast-math' for the same behavior, or '-O3' to enable only conforming optimizations
 // CHECK-OFAST-NO-STRICT-ALIASING: -cc1
 // CHECK-OFAST-NO-STRICT-ALIASING: -relaxed-aliasing
 // CHECK-OFAST-NO-STRICT-ALIASING: -ffast-math
 // CHECK-OFAST-NO-STRICT-ALIASING: -Ofast
 // CHECK-OFAST-NO-STRICT-ALIASING: -vectorize-loops
 
+// CHECK-OFAST-NO-VECTORIZE: use '-O3 -ffast-math' for the same behavior, or '-O3' to enable only conforming optimizations
 // CHECK-OFAST-NO-VECTORIZE: -cc1
 // CHECK-OFAST-NO-VECTORIZE-NOT: -relaxed-aliasing
 // CHECK-OFAST-NO-VECTORIZE: -ffast-math
diff --git a/clang/test/Driver/aarch64-multilib-pauthabi.c b/clang/test/Driver/aarch64-multilib-pauthabi.c
new file mode 100644
index 0000000000000..3046cb856e83c
--- /dev/null
+++ b/clang/test/Driver/aarch64-multilib-pauthabi.c
@@ -0,0 +1,4 @@
+// RUN: %clang --target=aarch64-linux-pauthtest       --sysroot=%S/Inputs/multilib_aarch64_linux_tree -### -c %s 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-linux -mabi=pauthtest --sysroot=%S/Inputs/multilib_aarch64_linux_tree -### -c %s 2>&1 | FileCheck %s
+
+// CHECK: "-internal-externc-isystem" "{{.*}}/usr/include/aarch64-linux-pauthtest"
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index fa0125f4b22a9..75190c4380826 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -1,5 +1,7 @@
+// REQUIRES: aarch64-registered-target
+
 // RUN: %clang -### -c --target=aarch64 %s 2>&1 | FileCheck %s --check-prefix NONE
-// NONE: "-cc1"
+// NONE:     "-cc1"
 // NONE-NOT: "-fptrauth-
 
 // RUN: %clang -### -c --target=aarch64 \
@@ -9,17 +11,87 @@
 // RUN:   -fno-ptrauth-auth-traps -fptrauth-auth-traps \
 // RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \
 // RUN:   -fno-ptrauth-init-fini -fptrauth-init-fini \
+// RUN:   -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \
 // RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL
-// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
+// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-init-fini" "-fptrauth-indirect-gotos"
+
+// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
+// PAUTHABI1:      "-cc1"{{.*}} "-triple" "aarch64-unknown-linux-pauthtest"
+// PAUTHABI1-SAME: "-target-abi" "pauthtest"
+// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini"
+
+// RUN: %clang -### -c --target=aarch64 -mabi=pauthtest -fno-ptrauth-intrinsics \
+// RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
+// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
+// RUN: %clang -### -c --target=aarch64-pauthtest -fno-ptrauth-intrinsics \
+// RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
+// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
+// PAUTHABI2:     "-cc1"
+// PAUTHABI2-NOT: "-fptrauth-
 
 // RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=ERR
-// ERR:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
-// ERR-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
+// RUN:   -fptrauth-type-info-vtable-pointer-discrimination -fptrauth-indirect-gotos -fptrauth-init-fini %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR1
+// ERR1:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-type-info-vtable-pointer-discrimination' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-indirect-gotos' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
+
+//// Only support PAuth ABI for Linux as for now.
+// RUN: not %clang -o /dev/null -c --target=aarch64-unknown -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2
+// RUN: not %clang -o /dev/null -c --target=aarch64-unknown-pauthtest       %s 2>&1 | FileCheck %s --check-prefix=ERR2
+// ERR2: error: ABI 'pauthtest' is not supported for 'aarch64-unknown-unknown-pauthtest'
+
+//// PAuth ABI is encoded as environment part of the triple, so don't allow to explicitly set other environments.
+// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
+// ERR3: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu'
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s
+
+//// The only branch protection option compatible with PAuthABI is BTI.
+// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR4
+// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=pac-ret %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR4
+// ERR4: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
+
+// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=gcs %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR5
+// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=gcs %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR5
+// ERR5: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
+
+// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=standard %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR6
+// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=standard %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR6
+// ERR6: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
+
+// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=all %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR7
+// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=all %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR7
+// ERR7: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest'
+
+// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=non-leaf %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR8
+// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=non-leaf %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR8
+// ERR8: error: unsupported option '-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest'
+
+// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=none %s
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=none %s
+// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=bti %s
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=bti %s
+// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=none %s
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=none %s
diff --git a/clang/test/Driver/aix-save-reg-params.c b/clang/test/Driver/aix-save-reg-params.c
new file mode 100644
index 0000000000000..ac58ef136061c
--- /dev/null
+++ b/clang/test/Driver/aix-save-reg-params.c
@@ -0,0 +1,7 @@
+// RUN: %clang -### -target powerpc-ibm-aix-xcoff -msave-reg-params -c %s -o /dev/null 2>&1 | FileCheck %s
+// RUN: %clang -### -target powerpc64-ibm-aix-xcoff -msave-reg-params -c %s -o /dev/null 2>&1 | FileCheck %s
+// RUN: %clang -### -target powerpc-ibm-aix-xcoff -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DISABLE
+// RUN: %clang -### -target powerpc64-ibm-aix-xcoff -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DISABLE
+
+// CHECK: "-msave-reg-params"
+// DISABLE-NOT: "-msave-reg-params"
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index 8ab6a07131474..b60d31bae6270 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -18,13 +18,17 @@
 // AS_LINK_UR: "-cc1as"
 // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
 
-// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
 // RUN:   -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
-// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
 // RUN:   -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
 // LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
-// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx906"
+// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
 
 // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
 // RUN:   -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
 // LD: ld.lld
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN:   -r %s 2>&1 | FileCheck -check-prefixes=RELO %s
+// RELO-NOT: -shared
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index a5e392150d757..d055cda1b7461 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -32,8 +32,8 @@
 // RUN:   | FileCheck -check-prefix=ARGS %s
 
 //      ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
-// ARGS-NEXT: ptxas{{.*}}"-m64" "-O3" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
-// ARGS-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "[[CUBIN]].cubin"
+// ARGS-NEXT: ptxas{{.*}}"-m64" "-O3" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].o" "[[PTX]].s" "-c"
+// ARGS-NEXT: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61"{{.*}}"[[CUBIN]].o"
 
 //
 // Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
@@ -55,7 +55,7 @@
 // RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=LINK %s
 
-// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
+// LINK: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61"{{.*}}[[CUBIN:.+]].o
 
 //
 // Test to ensure that we enable handling global constructors in a freestanding
@@ -72,7 +72,7 @@
 // RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -march=sm_52 -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=LINKER-ARGS %s
 
-// LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b"
+// LINKER-ARGS: clang-nvlink-wrapper{{.*}}"-v"{{.*}}"a" "b"
 
 // Tests for handling a missing architecture.
 //
@@ -84,9 +84,24 @@
 // MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'ptxas'
 // MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
 
+// Do not error when performing LTO.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -flto %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=MISSING-LTO %s
+
+// MISSING-LTO-NOT: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
+
 // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=GENERIC %s
 // RUN: %clang -target nvptx64-nvidia-cuda -march=sm_52 -march=generic -flto -c %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=GENERIC %s
 
 // GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"
+
+//
+// Test forwarding the necessary +ptx feature.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda --cuda-feature=+ptx63 -march=sm_52 -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=FEATURE %s
+
+// FEATURE: clang-nvlink-wrapper{{.*}}"--feature" "+ptx63"
diff --git a/clang/test/Driver/darwin-print-libgcc-file-name.c b/clang/test/Driver/darwin-print-libgcc-file-name.c
new file mode 100644
index 0000000000000..73e89b6718642
--- /dev/null
+++ b/clang/test/Driver/darwin-print-libgcc-file-name.c
@@ -0,0 +1,91 @@
+// Test the output of -print-libgcc-file-name on Darwin.
+
+//
+// All platforms
+//
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=x86_64-apple-macos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-MACOS %s
+// CHECK-CLANGRT-MACOS: libclang_rt.osx.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-ios \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-IOS %s
+// CHECK-CLANGRT-IOS: libclang_rt.ios.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-watchos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-WATCHOS %s
+// CHECK-CLANGRT-WATCHOS: libclang_rt.watchos.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-tvos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-TVOS %s
+// CHECK-CLANGRT-TVOS: libclang_rt.tvos.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-driverkit \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-DRIVERKIT %s
+// CHECK-CLANGRT-DRIVERKIT: libclang_rt.driverkit.a
+
+//
+// Simulators
+//
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-ios-simulator \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-IOS-SIMULATOR %s
+// CHECK-CLANGRT-IOS-SIMULATOR: libclang_rt.iossim.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-watchos-simulator \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-WATCHOS-SIMULATOR %s
+// CHECK-CLANGRT-WATCHOS-SIMULATOR: libclang_rt.watchossim.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     --target=arm64-apple-tvos-simulator \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-TVOS-SIMULATOR %s
+// CHECK-CLANGRT-TVOS-SIMULATOR: libclang_rt.tvossim.a
+
+// Check the sanitizer and profile variants
+// While the driver also links in sanitizer-specific dylibs, the result of
+// -print-libgcc-file-name is the path of the basic compiler-rt library.
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     -fsanitize=address --target=x86_64-apple-macos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-MACOS-SAN %s
+// CHECK-CLANGRT-MACOS-SAN: libclang_rt.osx.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     -fsanitize=address --target=arm64-apple-ios \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-IOS-SAN %s
+// CHECK-CLANGRT-IOS-SAN: libclang_rt.ios.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     -fsanitize=address --target=arm64-apple-watchos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-WATCHOS-SAN %s
+// CHECK-CLANGRT-WATCHOS-SAN: libclang_rt.watchos.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     -fsanitize=address --target=arm64-apple-tvos \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-TVOS-SAN %s
+// CHECK-CLANGRT-TVOS-SAN: libclang_rt.tvos.a
+
+// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
+// RUN:     -fsanitize=address --target=arm64-apple-driverkit \
+// RUN:     -resource-dir=%S/Inputs/resource_dir 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-DRIVERKIT-SAN %s
+// CHECK-CLANGRT-DRIVERKIT-SAN: libclang_rt.driverkit.a
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c
index 0a665f7017d63..21785ba01cb41 100644
--- a/clang/test/Driver/debug-options.c
+++ b/clang/test/Driver/debug-options.c
@@ -118,27 +118,28 @@
 // RUN: %clang_cl -### -c -Z7 -target x86_64-windows-msvc -- %s 2>&1 \
 // RUN:             | FileCheck -check-prefix=G_NOTUNING %s
 
-// On the PS4/PS5, -g defaults to -gno-column-info, and we always generate the
-// arange section.
+// On the PS4/PS5, -g defaults to -gno-column-info. We default to always
+// generating the arange section, but keyed off SCE DebuggerTuning being in
+// play during codegen, instead of -generate-arange-section.
 // RUN: %clang -### -c %s -target x86_64-scei-ps4 2>&1 \
 // RUN:             | FileCheck -check-prefix=NOG_PS %s
 // RUN: %clang -### -c %s -target x86_64-sie-ps5 2>&1 \
 // RUN:             | FileCheck -check-prefix=NOG_PS %s
 /// PS4 will stay on v4 even if the generic default version changes.
 // RUN: %clang -### -c %s -g -target x86_64-scei-ps4 2>&1 \
-// RUN:             | FileCheck -check-prefixes=G_DWARF4,GARANGE,G_SCE,NOCI,FWD_TMPL_PARAMS %s
+// RUN:             | FileCheck -check-prefixes=G_DWARF4,G_SCE,NOCI,FWD_TMPL_PARAMS %s
 // RUN: %clang -### -c %s -g -target x86_64-sie-ps5 2>&1 \
-// RUN:             | FileCheck -check-prefixes=G_DWARF5,GARANGE,G_SCE,NOCI,FWD_TMPL_PARAMS %s
+// RUN:             | FileCheck -check-prefixes=G_DWARF5,G_SCE,NOCI,FWD_TMPL_PARAMS %s
 // RUN: %clang -### -c %s -g -gcolumn-info -target x86_64-scei-ps4 2>&1 \
 // RUN:             | FileCheck -check-prefix=CI %s
 // RUN: %clang -### -c %s -gsce -target x86_64-unknown-linux 2>&1 \
 // RUN:             | FileCheck -check-prefix=NOCI %s
 // RUN: %clang -### %s -g -flto=thin -target x86_64-scei-ps4 2>&1 \
-// RUN:             | FileCheck -check-prefix=SNLDTLTOGARANGE %s
+// RUN:             | FileCheck -check-prefix=LDGARANGE %s
 // RUN: %clang -### %s -g -flto=full -target x86_64-scei-ps4 2>&1 \
-// RUN:             | FileCheck -check-prefix=SNLDFLTOGARANGE %s
+// RUN:             | FileCheck -check-prefix=LDGARANGE %s
 // RUN: %clang -### %s -g -flto -target x86_64-scei-ps5 2>&1 \
-// RUN:             | FileCheck -check-prefix=LLDGARANGE %s
+// RUN:             | FileCheck -check-prefix=LDGARANGE %s
 // RUN: %clang -### %s -g -target x86_64-scei-ps5 2>&1 \
 // RUN:             | FileCheck -check-prefix=LDGARANGE %s
 
@@ -321,8 +322,7 @@
 //
 // NOG_PS: "-cc1"
 // NOG_PS-NOT: "-dwarf-version=
-// NOG_PS: "-generate-arange-section"
-// NOG_PS-NOT: "-dwarf-version=
+// NOG_PS-NOT: "-generate-arange-section"
 //
 // G_ERR: error: unknown argument:
 //
@@ -402,8 +402,7 @@
 //
 
 // LDGARANGE: {{".*ld.*"}} {{.*}}
-// LDGARANGE-NOT: "-plugin-opt=-generate-arange-section"
-// LLDGARANGE: {{".*lld.*"}} {{.*}} "-plugin-opt=-generate-arange-section"
+// LDGARANGE-NOT: -generate-arange-section"
 // SNLDTLTOGARANGE: {{".*orbis-ld.*"}} {{.*}} "-lto-thin-debug-options= -generate-arange-section"
 // SNLDFLTOGARANGE: {{".*orbis-ld.*"}} {{.*}} "-lto-debug-options= -generate-arange-section"
 
diff --git a/clang/test/Driver/fp-contract.c b/clang/test/Driver/fp-contract.c
index e2691dc211cc3..cab63683ee813 100644
--- a/clang/test/Driver/fp-contract.c
+++ b/clang/test/Driver/fp-contract.c
@@ -2,6 +2,14 @@
 // the options -ffast-math, -fno-fast-math, funsafe-math-optimizations,
 // fno-unsafe-math-optimizations.
 
+// These warning checks are above the run lines because the warning is reported
+// before the drive options that are checked below the run lines.
+// WARN_FM_OFF: warning: overriding '-ffast-math' option with '-ffp-contract=off'
+// WARN_FM_ON: warning: overriding '-ffast-math' option with '-ffp-contract=on'
+// WARN_FM_FHP: warning: overriding '-ffast-math' option with '-ffp-contract=fast-honor-pragmas'
+// WARN_UM_OFF: warning: overriding '-funsafe-math-optimizations' option with '-ffp-contract=off'
+// WARN_UM_ON: warning: overriding '-funsafe-math-optimizations' option with '-ffp-contract=on'
+
 // ffast-math, fno-fast-math
 // RUN: %clang -### -ffast-math -c %s 2>&1  \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -10,19 +18,19 @@
 // RUN: %clang -### -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 // CHECK-FPC-ON:       "-ffp-contract=on"
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 // CHECK-FPC-OFF:      "-ffp-contract=off"
 
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST-HONOR %s
+// RUN: %clang -### -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST-HONOR,WARN_FM_FHP %s
 // CHECK-FPC-FAST-HONOR:     "-ffp-contract=fast-honor-pragmas"
 
 // RUN: %clang -### -Werror -ffp-contract=fast -ffast-math -c %s 2>&1 \
@@ -43,23 +51,23 @@
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_OFF %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=fast \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=fast \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_FM_OFF %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -112,24 +120,24 @@
 // RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=on \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=on \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=off \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=off \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 
 // funsafe-math-optimizations, fno-unsafe-math-optimizations
-// RUN: %clang -### -funsafe-math-optimizations -c %s 2>&1  \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -c %s 2>&1  \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
 
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -151,27 +159,27 @@
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
-// RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
+// RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -ffp-contract=fast -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
-// RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
+// RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_OFF %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -ffp-contract=fast \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_UM_OFF %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
 
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
@@ -229,9 +237,21 @@
 // RUN:   -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN:   -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN:   -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
+
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN_UM_OFF %s
+
+// This case should not warn
+// RUN: %clang -### -Werror -funsafe-math-optimizations \
+// RUN: -fno-unsafe-math-optimizations -ffp-contract=off -c %s
+
+// RUN: %clang -### -ffast-math -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN_FM_OFF %s
 
+// This case should not warn
+// RUN: %clang -### -Werror -ffast-math -fno-fast-math -ffp-contract=off -c %s
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index 644523394d6b1..2348d4b41f43a 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -81,6 +81,19 @@
 // RUN:   | FileCheck --check-prefix=WARN13 %s
 // WARN13: warning: overriding '-ffp-model=strict' option with '-fapprox-func' [-Woverriding-option]
 
+// RUN: %clang -### -ffp-model=precise -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN14 %s
+// WARN14: warning: overriding '-ffp-model=precise' option with '-ffp-contract=off' [-Woverriding-option]
+
+// RUN: %clang -### -ffp-model=precise -ffp-contract=fast -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN15 %s
+// WARN15: warning: overriding '-ffp-model=precise' option with '-ffp-contract=fast' [-Woverriding-option]
+
+// RUN: %clang -### -ffp-model=strict -fassociative-math -ffp-contract=on \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefix=WARN16 %s
+// WARN16: warning: overriding '-ffp-model=strict' option with '-fassociative-math' [-Woverriding-option]
+// WARN16: warning: overriding '-ffp-model=strict' option with '-ffp-contract=on' [-Woverriding-option]
+
 // RUN: %clang -### -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NOROUND %s
 // CHECK-NOROUND: "-cc1"
diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c
index ab04fd39ffa1c..5f07ca99a69de 100644
--- a/clang/test/Driver/fpatchable-function-entry.c
+++ b/clang/test/Driver/fpatchable-function-entry.c
@@ -6,6 +6,8 @@
 // RUN: %clang --target=loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // RUN: %clang --target=riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // RUN: %clang --target=riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=powerpc-unknown-linux-gnu %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=powerpc64-unknown-linux-gnu %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // CHECK: "-fpatchable-function-entry=1"
 
 // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
@@ -13,8 +15,11 @@
 // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
 // 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1"
 
-// RUN: not %clang --target=ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
-// TARGET: error: unsupported option '-fpatchable-function-entry=1' for target 'ppc64'
+// RUN: not %clang --target=powerpc64-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX64 %s
+// AIX64: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc64-ibm-aix-xcoff'
+
+// RUN: not %clang --target=powerpc-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX32 %s
+// AIX32: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc-ibm-aix-xcoff'
 
 // RUN: not %clang --target=x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
 // EXCESS: error: invalid argument '1,0,' to -fpatchable-function-entry=
diff --git a/clang/test/Driver/ftime-trace-sections.cpp b/clang/test/Driver/ftime-trace-sections.cpp
index 0c16052bc0c3a..da7109b9d81a6 100644
--- a/clang/test/Driver/ftime-trace-sections.cpp
+++ b/clang/test/Driver/ftime-trace-sections.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir %t && cd %t
-// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -o out %s
+// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
 // RUN: %python %S/ftime-trace-sections.py < out.json
 
 template <typename T>
diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp
index 5fe63de915a71..60c5885704b58 100644
--- a/clang/test/Driver/ftime-trace.cpp
+++ b/clang/test/Driver/ftime-trace.cpp
@@ -1,18 +1,18 @@
 // RUN: rm -rf %t && mkdir -p %t && cd %t
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
 // RUN: cat out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
 // RUN: cat new-name.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
 // RUN: mkdir dir1 dir2
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
 // RUN: cat dir1/out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
 // RUN: cat dir2/out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
@@ -34,32 +34,33 @@
 // RUN: mkdir d e f && cp %s d/a.cpp && touch d/b.c
 
 /// TODO: Support -fno-integrated-as.
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
-// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
+// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
 
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0"
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
 
 /// -o specifies the link output. Create ${output}-${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0"
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
 
 /// -dumpdir is f/g, not ending with a path separator. We create f/g${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
-// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0"
-// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
+// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
 
-// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0"
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0"
+// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
 
-// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -xassembler d/a.cpp 2>&1 | \
+// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -ftime-trace-verbose -xassembler d/a.cpp 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=UNUSED
 // UNUSED:      warning: argument unused during compilation: '-ftime-trace'
 // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace=e'
 // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-granularity=1'
+// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose'
 // UNUSED-NOT:  warning:
 
 template <typename T>
diff --git a/clang/test/Driver/gpu-libc-headers.c b/clang/test/Driver/gpu-libc-headers.c
index 32a5edb175e61..53c016837dde6 100644
--- a/clang/test/Driver/gpu-libc-headers.c
+++ b/clang/test/Driver/gpu-libc-headers.c
@@ -4,15 +4,15 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --sysroot=./ \
 // RUN:     -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_70  \
 // RUN:     -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS
-// CHECK-HEADERS: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"{{.*}}"-isysroot" "./"
-// CHECK-HEADERS: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"{{.*}}"-isysroot" "./"
+// CHECK-HEADERS: "-cc1"{{.*}}"-isysroot" "./"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"
+// CHECK-HEADERS: "-cc1"{{.*}}"-isysroot" "./"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"
 
 // RUN:   %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a --sysroot=./ \
 // RUN:     -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-AMDGPU
 // RUN:   %clang -### --target=nvptx64-nvidia-cuda -march=sm_89 --sysroot=./ \
 // RUN:     -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-NVPTX
-// CHECK-HEADERS-AMDGPU: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}amdgcn-amd-amdhsa"{{.*}}"-isysroot" "./"
-// CHECK-HEADERS-NVPTX: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}nvptx64-nvidia-cuda"{{.*}}"-isysroot" "./"
+// CHECK-HEADERS-AMDGPU: "-cc1"{{.*}}"-isysroot" "./"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}amdgcn-amd-amdhsa"
+// CHECK-HEADERS-NVPTX: "-cc1"{{.*}}"-isysroot" "./"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}nvptx64-nvidia-cuda"
 
 // RUN:   %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \
 // RUN:     -nogpuinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED
diff --git a/clang/test/Driver/hexagon-toolchain-linux.c b/clang/test/Driver/hexagon-toolchain-linux.c
index fe32638417ea4..86cc9a30e932c 100644
--- a/clang/test/Driver/hexagon-toolchain-linux.c
+++ b/clang/test/Driver/hexagon-toolchain-linux.c
@@ -119,3 +119,36 @@
 // CHECK010:   crt1.o
 // CHECK010:   "-L/tmp"
 // CHECK010-NOT:  "-lstandalone"
+
+// -----------------------------------------------------------------------------
+// unwindlib
+// -----------------------------------------------------------------------------
+// RUN: %clangxx --unwindlib=none \
+// RUN:    --target=hexagon-unknown-linux-musl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=CHECK011 %s
+// CHECK011:   InstalledDir: [[INSTALLED_DIR:.+]]
+// CHECK011:   crt1.o
+// CHECK011-NOT:  "-lunwind"
+// CHECK011-NOT:  "-lgcc_eh"
+// CHECK012-NOT:  "-lgcc_s"
+
+
+// RUN: %clangxx --rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:    --target=hexagon-unknown-linux-musl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=CHECK012 %s
+// RUN: %clangxx \
+// RUN:    --target=hexagon-unknown-linux-musl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=CHECK012 %s
+// CHECK012:   InstalledDir: [[INSTALLED_DIR:.+]]
+// CHECK012:   crt1.o
+// CHECK012:  "-lunwind"
+// CHECK012-NOT:  "-lgcc_eh"
+// CHECK012-NOT:  "-lgcc_s"
+
+// RUN: not %clangxx --rtlib=compiler-rt --unwindlib=libgcc \
+// RUN:    --target=hexagon-unknown-linux-musl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=CHECK013 %s
+// CHECK013:  error: unsupported unwind library 'libgcc' for platform 'hexagon-unknown-linux-musl'
+// CHECK013-NOT:  "-lgcc_eh"
+// CHECK013-NOT:  "-lgcc_s"
+// CHECK013-NOT:  "-lunwind"
diff --git a/clang/test/Driver/immediate-options.c b/clang/test/Driver/immediate-options.c
index 77878fe2e9c58..b74f6b41f22a0 100644
--- a/clang/test/Driver/immediate-options.c
+++ b/clang/test/Driver/immediate-options.c
@@ -2,6 +2,8 @@
 // HELP: isystem
 // HELP-NOT: ast-dump
 // HELP-NOT: driver-mode
+// HELP:     -Wa,
+// HELP-NOT: -W{{[a-z][a-z]}}
 
 // Make sure that Flang-only options are not available in Clang
 // HELP-NOT: test-io
diff --git a/clang/test/Driver/linker-wrapper-libs.c b/clang/test/Driver/linker-wrapper-libs.c
index 22cc24f2e258a..44331b90e7d93 100644
--- a/clang/test/Driver/linker-wrapper-libs.c
+++ b/clang/test/Driver/linker-wrapper-libs.c
@@ -48,7 +48,7 @@ int bar() { return weak; }
 // RUN:   --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
 // RUN: | FileCheck %s --check-prefix=LIBRARY-RESOLVES
 
-// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
+// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
 // LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
 
 //
@@ -72,7 +72,7 @@ int bar() { return weak; }
 // RUN:   --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
 // RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL
 
-// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
+// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
 // LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
 
 //
@@ -96,7 +96,7 @@ int bar() { return weak; }
 // RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-NONE
 
 // LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
+// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
 
 //
 // Check that we do not extract an external weak symbol.
@@ -161,7 +161,7 @@ int bar() { return weak; }
 // RUN:   --linker-path=/usr/bin/ld %t.o %t.a %t.a -o a.out 2>&1 \
 // RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-DEFINED
 
-// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
+// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}} {{.*}}.o
 // LIBRARY-GLOBAL-DEFINED-NOT: {{.*}}gfx1030{{.*}}.o
 // LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
 
@@ -185,7 +185,7 @@ int bar() { return weak; }
 // RUN:   --linker-path=/usr/bin/ld %t.o --whole-archive %t.a -o a.out 2>&1 \
 // RUN: | FileCheck %s --check-prefix=LIBRARY-WHOLE-ARCHIVE
 
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
+// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}} {{.*}}.o
 // LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.s
+// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.o
 // LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a {{.*}}.o
diff --git a/clang/test/Driver/linker-wrapper-passes.c b/clang/test/Driver/linker-wrapper-passes.c
index aadcf472e9b63..fb63ef7970ac7 100644
--- a/clang/test/Driver/linker-wrapper-passes.c
+++ b/clang/test/Driver/linker-wrapper-passes.c
@@ -4,6 +4,9 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
 
+// https://github.com/llvm/llvm-project/issues/100212
+// XFAIL: *
+
 // Setup.
 // RUN: mkdir -p %t
 // RUN: %clang -cc1 -emit-llvm-bc -o %t/host-x86_64-unknown-linux-gnu.bc \
@@ -13,7 +16,7 @@
 // RUN: opt %t/openmp-amdgcn-amd-amdhsa.bc -o %t/openmp-amdgcn-amd-amdhsa.bc \
 // RUN:     -passes=forceattrs -force-remove-attribute=f:noinline
 // RUN: clang-offload-packager -o %t/openmp-x86_64-unknown-linux-gnu.out \
-// RUN:     --image=file=%t/openmp-amdgcn-amd-amdhsa.bc,triple=amdgcn-amd-amdhsa
+// RUN:     --image=file=%t/openmp-amdgcn-amd-amdhsa.bc,arch=gfx90a,triple=amdgcn-amd-amdhsa
 // RUN: %clang -cc1 -S -o %t/host-x86_64-unknown-linux-gnu.s \
 // RUN:     -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
 // RUN:     -fembed-offload-object=%t/openmp-x86_64-unknown-linux-gnu.out \
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index b9fa08ace0ff7..a205247c77161 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -48,7 +48,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS
 
-// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.s -save-temps
+// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}} -save-temps
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
@@ -128,14 +128,15 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b \
+// RUN:   --linker-path=/usr/bin/ld --device-linker=foo=bar --device-linker=a \
+// RUN:   --device-linker=nvptx64-nvidia-cuda=b --device-compiler=foo\
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER-ARGS
 
-// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}a
-// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}a b
+// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}-Xlinker foo=bar{{.*}}-Xlinker a{{.*}}foo
+// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}-Xlinker foo=bar{{.*}}-Xlinker a -Xlinker b{{.*}}foo
 
-// RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -ldummy \
-// RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b \
+// RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
+// RUN:   -ldummy --linker-path=/usr/bin/ld \
 // RUN:   -o a.out 2>&1 | FileCheck %s --check-prefix=MISSING-LIBRARY
 
 // MISSING-LIBRARY: error: unable to find library -ldummy
@@ -147,7 +148,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND
 
-// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.bc
+// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
@@ -233,3 +234,13 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: | FileCheck %s --check-prefix=OVERRIDE
 // OVERRIDE-NOT: clang
 // OVERRIDE: /usr/bin/ld
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --offload-opt=-pass-remarks=foo \
+// RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OFFLOAD-OPT
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run -mllvm -pass-remarks=foo \
+// RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OFFLOAD-OPT
+
+// OFFLOAD-OPT: clang{{.*}}-Wl,--plugin-opt=-pass-remarks=foo
diff --git a/clang/test/Driver/loongarch-features.c b/clang/test/Driver/loongarch-features.c
index 3cdf3ba3d23e1..90634bbcf0035 100644
--- a/clang/test/Driver/loongarch-features.c
+++ b/clang/test/Driver/loongarch-features.c
@@ -2,7 +2,7 @@
 // RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
 
 // LA32: "target-features"="+32bit"
-// LA64: "target-features"="+64bit,+d,+f,+ual"
+// LA64: "target-features"="+64bit,+d,+f,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index 9214130cd034f..2d5b315d962a1 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -2,10 +2,22 @@
 // RUN:   FileCheck %s --check-prefix=CC1-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA664
 // RUN: %clang --target=loongarch64 -march=loongarch64 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA664
 
 // CC1-LOONGARCH64: "-target-cpu" "loongarch64"
 // CC1-LOONGARCH64-NOT: "-target-feature"
@@ -19,8 +31,29 @@
 // CC1-LA464-NOT: "-target-feature"
 // CC1-LA464: "-target-abi" "lp64d"
 
+// CC1-LA64V1P0: "-target-cpu" "loongarch64"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-abi" "lp64d"
+
+// CC1-LA64V1P1: "-target-cpu" "loongarch64"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-abi" "lp64d"
+
+// CC1-LA664: "-target-cpu" "la664"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-abi" "lp64d"
+
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
+// IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
+// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lsx,+ual"
+// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+f,+frecipe,+lasx,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mlasx.c b/clang/test/Driver/loongarch-mlasx.c
index 0b934f125c9e4..87634ff5a9a40 100644
--- a/clang/test/Driver/loongarch-mlasx.c
+++ b/clang/test/Driver/loongarch-mlasx.c
@@ -5,7 +5,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NOLASX
+// RUN:   FileCheck %s --check-prefix=CC1-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -fsyntax-only %s -### 2>&1 | \
@@ -18,7 +18,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NOLASX
+// RUN:   FileCheck %s --check-prefix=IR-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -S -emit-llvm %s -o - | \
@@ -26,9 +26,11 @@
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 
+// CC1-LSX: "-target-feature" "+lsx"
 // CC1-LASX: "-target-feature" "+lsx" "-target-feature" "+lasx"
 // CC1-NOLASX: "-target-feature" "-lasx"
 
+// IR-LSX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lsx{{(,.*)?}}"
 // IR-LASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lasx{{(,.*)?}}"
 // IR-NOLASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-lasx{{(,.*)?}}"
 
diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
index cd463300c8747..49d298e1b2e3f 100644
--- a/clang/test/Driver/loongarch-msimd.c
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -75,9 +75,9 @@
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
diff --git a/clang/test/Driver/loongarch-msingle-float.c b/clang/test/Driver/loongarch-msingle-float.c
index bd9b3e8a8c019..4eb0865b53a59 100644
--- a/clang/test/Driver/loongarch-msingle-float.c
+++ b/clang/test/Driver/loongarch-msingle-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64s' as it conflicts with that implied by '-msingle-float' (lp64f)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msingle-float' (32)
 
-// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64f"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-msoft-float.c b/clang/test/Driver/loongarch-msoft-float.c
index 0e5121ac84b4c..ebf27fb00e309 100644
--- a/clang/test/Driver/loongarch-msoft-float.c
+++ b/clang/test/Driver/loongarch-msoft-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64d' as it conflicts with that implied by '-msoft-float' (lp64s)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msoft-float' (0)
 
-// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64s"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mtune.c b/clang/test/Driver/loongarch-mtune.c
index 6f3f39e9bbd86..face12e1a1a82 100644
--- a/clang/test/Driver/loongarch-mtune.c
+++ b/clang/test/Driver/loongarch-mtune.c
@@ -8,6 +8,11 @@
 // RUN: %clang --target=loongarch64 -mtune=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la464
 
+// RUN: %clang --target=loongarch64 -mtune=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=la664
+// RUN: %clang --target=loongarch64 -mtune=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la664
+
 // RUN: %clang --target=loongarch64 -mtune=invalidcpu -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=invalidcpu
 // RUN: not %clang --target=loongarch64 -mtune=invalidcpu -S -emit-llvm %s -o /dev/null 2>&1 | \
diff --git a/clang/test/Driver/lto-jobs.c b/clang/test/Driver/lto-jobs.c
index 43a478b0664d8..2c7ca02ea4779 100644
--- a/clang/test/Driver/lto-jobs.c
+++ b/clang/test/Driver/lto-jobs.c
@@ -6,12 +6,15 @@
 // RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS-ACTION < %t %s
 //
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto-jobs=5 2> %t
+// RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS-ACTION < %t %s
+//
 // CHECK-LINK-THIN-JOBS-ACTION: "-plugin-opt=jobs=5"
 //
 // RUN: %clang --target=x86_64-scei-ps4 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-PS4-LINK-THIN-JOBS-ACTION < %t %s
 //
-// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-thin-debug-options= -generate-arange-section -threads=5"
+// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-debug-options= -threads=5"
 
 // RUN: %clang --target=x86_64-apple-darwin13.3.0 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS2-ACTION < %t %s
diff --git a/clang/test/Driver/modulemap-allow-subdirectory-search.c b/clang/test/Driver/modulemap-allow-subdirectory-search.c
new file mode 100644
index 0000000000000..ee993a75b5272
--- /dev/null
+++ b/clang/test/Driver/modulemap-allow-subdirectory-search.c
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// Check that with a sufficiently new SDK not searching for module maps in subdirectories.
+
+// New SDK.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=NO-SUBDIRECTORIES %t/test.c
+// Old SDK.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX14.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+// Non-Darwin platform.
+// RUN: %clang -target i386-unknown-linux -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+// New SDK overriding the default.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -fmodulemap-allow-subdirectory-search -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+
+//--- test.c
+// NO-SUBDIRECTORIES: "-fno-modulemap-allow-subdirectory-search"
+// SEARCH-SUBDIRECTORIES-NOT: "-fno-modulemap-allow-subdirectory-search"
+
+//--- MacOSX15.0.sdk/SDKSettings.json
+{"Version":"15.0", "MaximumDeploymentTarget": "15.0.99"}
+
+//--- MacOSX14.0.sdk/SDKSettings.json
+{"Version":"14.0", "MaximumDeploymentTarget": "14.0.99"}
diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c
new file mode 100644
index 0000000000000..318315ddaca34
--- /dev/null
+++ b/clang/test/Driver/nvlink-wrapper.c
@@ -0,0 +1,72 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+#if defined(X)
+extern int y;
+int foo() { return y; }
+
+int x = 0;
+#elif defined(Y)
+int y = 42;
+#elif defined(Z)
+int z = 42;
+#elif defined(W)
+int w = 42;
+#elif defined(U)
+extern int x;
+extern int __attribute__((weak)) w;
+
+int bar() {
+  return x + w;
+}
+#else
+extern int y;
+int __attribute__((visibility("hidden"))) x = 999;
+int baz() { return y + x; }
+#endif
+
+// Create various inputs to test basic linking and LTO capabilities. Creating a
+// CUDA binary requires access to the `ptxas` executable, so we just use x64.
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DX -o %t-x.o
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DY -o %t-y.o
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DZ -o %t-z.o
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DW -o %t-w.o
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DU -o %t-u.o
+// RUN: llvm-ar rcs %t-x.a %t-x.o
+// RUN: llvm-ar rcs %t-y.a %t-y.o
+// RUN: llvm-ar rcs %t-z.a %t-z.o
+// RUN: llvm-ar rcs %t-w.a %t-w.o
+
+//
+// Check that we forward any unrecognized argument to 'nvlink'.
+//
+// RUN: clang-nvlink-wrapper --dry-run -arch sm_52 %t-u.o -foo -o a.out 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=ARGS
+// ARGS: nvlink{{.*}} -arch sm_52 -foo -o a.out [[INPUT:.+]].cubin
+
+//
+// Check the symbol resolution for static archives. We expect to only link
+// `libx.a` and `liby.a` because extern weak symbols do not extract and `libz.a`
+// is not used at all.
+//
+// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.o %t-y.a %t-z.a %t-w.a \
+// RUN:   -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LINK
+// LINK: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x-{{.*}}.cubin{{.*}}-y-{{.*}}.cubin
+
+// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.o
+
+//
+// Check that the LTO interface works and properly preserves symbols used in a
+// regular object file.
+//
+// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \
+// RUN:   -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
+// LTO: ptxas{{.*}} -m64 -c [[PTX:.+]].s -O3 -arch sm_52 -o [[CUBIN:.+]].cubin
+// LTO: nvlink{{.*}} -arch sm_52 -o a.out [[CUBIN]].cubin {{.*}}-u-{{.*}}.cubin {{.*}}-y-{{.*}}.cubin
+
+//
+// Check that we don't forward some arguments.
+//
+// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \
+// RUN:   -arch sm_52 --cuda-path/opt/cuda -o a.out 2>&1 | FileCheck %s --check-prefix=PATH
+// PATH-NOT: --cuda-path=/opt/cuda
diff --git a/clang/test/Driver/opencl.cl b/clang/test/Driver/opencl.cl
index aba37fc328fbb..3b0b191827b4c 100644
--- a/clang/test/Driver/opencl.cl
+++ b/clang/test/Driver/opencl.cl
@@ -35,7 +35,7 @@
 // CHECK-OPT-DISABLE: "-cc1" {{.*}} "-cl-opt-disable"
 // CHECK-STRICT-ALIASING: "-cc1" {{.*}} "-cl-strict-aliasing"
 // CHECK-SINGLE-PRECISION-CONST: "-cc1" {{.*}} "-cl-single-precision-constant"
-// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-cl-finite-math-only"
+// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-menable-no-infs" "-menable-no-nans" "-cl-finite-math-only"
 // CHECK-KERNEL-ARG-INFO: "-cc1" {{.*}} "-cl-kernel-arg-info"
 // CHECK-UNSAFE-MATH-OPT: "-cc1" {{.*}} "-cl-unsafe-math-optimizations"
 // CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index 31287b27deb5f..d3f9f04ae85d1 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -372,33 +372,9 @@
 // XARCH-DEVICE: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-O3"
 // XARCH-DEVICE-NOT: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-O3"
 
-//
-// Check that `-gpulibc` includes the LLVM C libraries for the GPU.
-//
-// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
-// RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
-// RUN:      --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \
-// RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN:      --rocm-path=%S/Inputs/rocm \
-// RUN:      --offload-arch=sm_52,gfx803 -gpulibc -nogpuinc %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=LIBC-GPU %s
-// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
-// RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
-// RUN:      --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \
-// RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN:      --rocm-path=%S/Inputs/rocm \
-// RUN:      -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \
-// RUN:      -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 \
-// RUN:      -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -gpulibc -nogpuinc %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=LIBC-GPU %s
-// LIBC-GPU-DAG: "-lcgpu-amdgpu"
-// LIBC-GPU-DAG: "-lmgpu-amdgpu"
-// LIBC-GPU-DAG: "-lcgpu-nvptx"
-// LIBC-GPU-DAG: "-lmgpu-nvptx"
-
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
 // RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
 // RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
 // RUN:      --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=NO-LIBC-GPU %s
-// NO-LIBC-GPU-NOT: -lmgpu{{.*}}-lcgpu
+// RUN:   | FileCheck --check-prefix=LIBC-GPU %s
+// LIBC-GPU: clang-linker-wrapper{{.*}}"--device-compiler=-nolibc"
diff --git a/clang/test/Driver/ppc-unsupported.c b/clang/test/Driver/ppc-unsupported.c
index dde4d8d1c1935..c2746bba16129 100644
--- a/clang/test/Driver/ppc-unsupported.c
+++ b/clang/test/Driver/ppc-unsupported.c
@@ -16,4 +16,8 @@
 // RUN:   -c %s 2>&1 | FileCheck %s
 // RUN: not %clang -target powerpc-unknown-aix -mabi=quadword-atomics \
 // RUN:   -c %s 2>&1 | FileCheck %s
+// RUN: not %clang -target powerpc64le-unknown-linux-gnu -msave-reg-params \
+// RUN:   -c %s 2>&1 | FileCheck %s
+// RUN: not %clang -target powerpc-unknown-unknown -msave-reg-params \
+// RUN:   -c %s 2>&1 | FileCheck %s
 // CHECK: unsupported option
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 1dc4580ec202e..91f12b8416b2a 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -35,7 +35,6 @@
 // CHECK-NEXT:     za64rs               1.0       'Za64rs' (Reservation Set Size of at Most 64 Bytes)
 // CHECK-NEXT:     zaamo                1.0       'Zaamo' (Atomic Memory Operations)
 // CHECK-NEXT:     zabha                1.0       'Zabha' (Byte and Halfword Atomic Memory Operations)
-// CHECK-NEXT:     zacas                1.0       'Zacas' (Atomic Compare-And-Swap Instructions)
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zama16b              1.0       'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)
 // CHECK-NEXT:     zawrs                1.0       'Zawrs' (Wait on Reservation Set)
@@ -171,6 +170,7 @@
 // CHECK-NEXT: Experimental extensions
 // CHECK-NEXT:     zicfilp              1.0       'Zicfilp' (Landing pad)
 // CHECK-NEXT:     zicfiss              1.0       'Zicfiss' (Shadow stack)
+// CHECK-NEXT:     zacas                1.0       'Zacas' (Atomic Compare-And-Swap Instructions)
 // CHECK-NEXT:     zalasr               0.1       'Zalasr' (Load-Acquire and Store-Release Instructions)
 // CHECK-NEXT:     smmpm                1.0       'Smmpm' (Machine-level Pointer Masking for M-mode)
 // CHECK-NEXT:     smnpm                1.0       'Smnpm' (Machine-level Pointer Masking for next lower privilege mode)
diff --git a/clang/test/Driver/ps4-linker.c b/clang/test/Driver/ps4-linker.c
index be0103bffe813..2a095d660bf36 100644
--- a/clang/test/Driver/ps4-linker.c
+++ b/clang/test/Driver/ps4-linker.c
@@ -1,20 +1,23 @@
 // Test the driver's control over the JustMyCode behavior with linker flags.
 
-// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-THIN-LTO,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-FULL-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
 
-// CHECK-NOT: -enable-jmc-instrument
-// CHECK-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -enable-jmc-instrument"
-// CHECK-FULL-LTO: "-lto-debug-options= -generate-arange-section -enable-jmc-instrument"
+// CHECK-LTO: "-lto-debug-options= -enable-jmc-instrument"
 
 // Check the default library name.
 // CHECK-LIB: "--whole-archive" "-lSceDbgJmc" "--no-whole-archive"
 
 // Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags.
 
-// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-THIN-LTO %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-FULL-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
 
-// CHECK-DIAG-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps"
-// CHECK-DIAG-FULL-LTO: "-lto-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps"
+// CHECK-DIAG-LTO: "-lto-debug-options= -crash-diagnostics-dir=mydumps"
+
+// Test that -lto-debug-options is only supplied to the linker when necessary
+
+// RUN: %clang --target=x86_64-scei-ps4 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-LTO %s
+// CHECK-NO-LTO-NOT: -lto-debug-options
diff --git a/clang/test/Driver/ps4-ps5-runtime-flags.c b/clang/test/Driver/ps4-ps5-runtime-flags.c
index e75ba97948d23..4b00aead6fe5c 100644
--- a/clang/test/Driver/ps4-ps5-runtime-flags.c
+++ b/clang/test/Driver/ps4-ps5-runtime-flags.c
@@ -40,5 +40,5 @@
 // RUN: %clang -target x86_64-sie-ps5 -fcs-profile-generate %s -### 2>&1 | FileCheck --check-prefix=CHECK-PS5-PROFILE %s
 // RUN: %clang -target x86_64-sie-ps5 -fcs-profile-generate -fno-profile-generate %s -### 2>&1 | FileCheck --check-prefix=CHECK-PS5-NO-PROFILE %s
 //
-// CHECK-PS5-PROFILE: "--dependent-lib=libclang_rt.profile-x86_64_nosubmission.a"
-// CHECK-PS5-NO-PROFILE-NOT: "--dependent-lib=libclang_rt.profile-x86_64_nosubmission.a"
+// CHECK-PS5-PROFILE: "--dependent-lib=libclang_rt.profile_nosubmission.a"
+// CHECK-PS5-NO-PROFILE-NOT: "--dependent-lib=libclang_rt.profile_nosubmission.a"
diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c
index 9f1e3a273b2db..cf39d5bae97ac 100644
--- a/clang/test/Driver/ps5-linker.c
+++ b/clang/test/Driver/ps5-linker.c
@@ -1,10 +1,9 @@
 // Test the driver's control over the JustMyCode behavior with linker flags.
 
 // RUN: %clang --target=x86_64-scei-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
 
-// CHECK-NOT: -plugin-opt=-enable-jmc-instrument
-// CHECK-LTO: -plugin-opt=-enable-jmc-instrument
+// CHECK: -plugin-opt=-enable-jmc-instrument
 
 // Check the default library name.
 // CHECK-LIB: "--whole-archive" "-lSceJmc_nosubmission" "--no-whole-archive"
@@ -12,7 +11,6 @@
 // Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags.
 
 // RUN: %clang --target=x86_64-scei-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s
-// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s
 
-// CHECK-DIAG-NOT: -plugin-opt=-crash-diagnostics-dir=mydumps
-// CHECK-DIAG-LTO: -plugin-opt=-crash-diagnostics-dir=mydumps
+// CHECK-DIAG: -plugin-opt=-crash-diagnostics-dir=mydumps
diff --git a/clang/test/Driver/stack-size-section.c b/clang/test/Driver/stack-size-section.c
index 71b9f85692b99..7cd41e491a817 100644
--- a/clang/test/Driver/stack-size-section.c
+++ b/clang/test/Driver/stack-size-section.c
@@ -14,6 +14,7 @@
 
 // RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
 // RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section -fno-stack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO-NO
+// RUN: %clang -### --target=x86_64-sie-ps5 -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
 
 // LTO: "-plugin-opt=-stack-size-section"
 // LTO-NO-NOT: "-plugin-opt=-stack-size-section"
diff --git a/clang/test/Driver/sycl-spirv-ext-old-model.c b/clang/test/Driver/sycl-spirv-ext-old-model.c
index 97ea39ff3ea46..643de972d9f70 100644
--- a/clang/test/Driver/sycl-spirv-ext-old-model.c
+++ b/clang/test/Driver/sycl-spirv-ext-old-model.c
@@ -61,8 +61,7 @@
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_tensor_float32_conversion
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_optnone
 // CHECK-DEFAULT-SAME:,+SPV_KHR_non_semantic_info
-// CHECK-DEFAULT-SAME:,+SPV_KHR_cooperative_matrix
-// CHECK-DEFAULT-SAME:,+SPV_INTEL_memory_access_aliasing"
+// CHECK-DEFAULT-SAME:,+SPV_KHR_cooperative_matrix"
 // CHECK-FPGA-HW: llvm-spirv{{.*}}"-spirv-ext=-all
 // CHECK-FPGA-HW-SAME:,+SPV_EXT_shader_atomic_float_add
 // CHECK-FPGA-HW-SAME:,+SPV_EXT_shader_atomic_float_min_max
@@ -128,5 +127,4 @@
 // CHECK-CPU-SAME:,+SPV_INTEL_optnone
 // CHECK-CPU-SAME:,+SPV_KHR_non_semantic_info
 // CHECK-CPU-SAME:,+SPV_KHR_cooperative_matrix
-// CHECK-CPU-SAME:,+SPV_INTEL_memory_access_aliasing
 // CHECK-CPU-SAME:,+SPV_INTEL_fp_max_error"
diff --git a/clang/test/Driver/sycl-spirv-metadata-old-model.cpp b/clang/test/Driver/sycl-spirv-metadata-old-model.cpp
index 2e19dd9ed1dc4..4efbe45a080b2 100644
--- a/clang/test/Driver/sycl-spirv-metadata-old-model.cpp
+++ b/clang/test/Driver/sycl-spirv-metadata-old-model.cpp
@@ -9,7 +9,7 @@
 // RUN:  FileCheck -check-prefix CHECK-WITHOUT %s
 
 // CHECK-WITH: llvm-spirv{{.*}} "--spirv-preserve-auxdata"
-// CHECK-WITH-SAME: "-spirv-ext=-all,{{.*}},+SPV_INTEL_memory_access_aliasing"
+// CHECK-WITH-SAME: "-spirv-ext=-all,{{.*}},+SPV_KHR_cooperative_matrix"
 
 // CHECK-WITHOUT: "{{.*}}llvm-spirv"
 // CHECK-WITHOUT-NOT: --spirv-preserve-auxdata
diff --git a/clang/test/Driver/sycl-spirv-obj-old-model.cpp b/clang/test/Driver/sycl-spirv-obj-old-model.cpp
index 7a5f6215e31ae..809c52516fbb9 100644
--- a/clang/test/Driver/sycl-spirv-obj-old-model.cpp
+++ b/clang/test/Driver/sycl-spirv-obj-old-model.cpp
@@ -11,7 +11,7 @@
 // SPIRV_DEVICE_OBJ-SAME: "-o" "[[DEVICE_BC:.+\.bc]]"
 // SPIRV_DEVICE_OBJ: llvm-spirv{{.*}} "-o" "[[DEVICE_SPV:.+\.spv]]"
 // SPIRV_DEVICE_OBJ-SAME: "--spirv-preserve-auxdata"
-// SPIRV_DEVICE_OBJ-SAME: "-spirv-ext=-all,{{.*}},+SPV_INTEL_memory_access_aliasing"
+// SPIRV_DEVICE_OBJ-SAME: "-spirv-ext=-all,{{.*}},+SPV_KHR_cooperative_matrix"
 // SPIRV_DEVICE_OBJ-SAME: "[[DEVICE_BC]]"
 // SPIRV_DEVICE_OBJ: clang{{.*}} "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // SPIRV_DEVICE_OBJ-SAME: "-fsycl-is-host"
diff --git a/clang/test/Driver/sycl-spirv-obj.cpp b/clang/test/Driver/sycl-spirv-obj.cpp
index 4c86daf4a1264..6c651129f2f3e 100644
--- a/clang/test/Driver/sycl-spirv-obj.cpp
+++ b/clang/test/Driver/sycl-spirv-obj.cpp
@@ -11,7 +11,7 @@
 // SPIRV_DEVICE_OBJ-SAME: "-o" "[[DEVICE_BC:.+\.bc]]"
 // SPIRV_DEVICE_OBJ: llvm-spirv{{.*}} "-o" "[[DEVICE_SPV:.+\.spv]]"
 // SPIRV_DEVICE_OBJ-SAME: "--spirv-preserve-auxdata"
-// SPIRV_DEVICE_OBJ-SAME: "-spirv-ext=-all,{{.*}},+SPV_INTEL_memory_access_aliasing"
+// SPIRV_DEVICE_OBJ-SAME: "-spirv-ext=-all,{{.*}},+SPV_KHR_cooperative_matrix"
 // SPIRV_DEVICE_OBJ-SAME: "[[DEVICE_BC]]"
 // SPIRV_DEVICE_OBJ: clang-offload-packager{{.*}} "--image=file=[[DEVICE_SPV]]{{.*}}"
 // SPIRV_DEVICE_OBJ: clang{{.*}} "-cc1" "-triple" "x86_64-unknown-linux-gnu"
diff --git a/clang/test/Driver/unified-lto.c b/clang/test/Driver/unified-lto.c
index 3a6fe44f5b32d..445ca0bbf14f1 100644
--- a/clang/test/Driver/unified-lto.c
+++ b/clang/test/Driver/unified-lto.c
@@ -7,6 +7,27 @@
 // NOUNIT-NOT: "-flto-unit"
 
 // RUN: %clang --target=x86_64-sie-ps5 -### %s -funified-lto 2>&1 | FileCheck --check-prefix=NOUNILTO %s
-// NOUNILTO: clang: warning: argument unused during compilation: '-funified-lto'
 // NOUNILTO: "-cc1"
 // NOUNILTO-NOT: "-funified-lto
+
+// On PlayStation -funified-lto is the default. `-flto(=...)` influences the
+// `--lto=...` option passed to linker, unless `-fno-unified-lto` is supplied.
+// PS4:
+// RUN: %clang --target=x86_64-sie-ps4 -### %s 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto=full 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto=thin 2>&1 | FileCheck --check-prefixes=LD,LTOTHIN %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -fno-unified-lto -flto=full 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -fno-unified-lto -flto=thin 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// PS5:
+// RUN: %clang --target=x86_64-sie-ps5 -### %s 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=full 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=thin 2>&1 | FileCheck --check-prefixes=LD,LTOTHIN %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=full 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=thin 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+
+// LD: {{.*ld(\.exe)?}}"
+// LTOFULL-SAME: "--lto=full"
+// LTOTHIN-SAME: "--lto=thin"
+// NOLTO-NOT: "--lto
diff --git a/clang/test/Driver/warn-fsyntax-only.c b/clang/test/Driver/warn-fsyntax-only.c
new file mode 100644
index 0000000000000..d13bab41c047a
--- /dev/null
+++ b/clang/test/Driver/warn-fsyntax-only.c
@@ -0,0 +1,7 @@
+// RUN: %clang --target=x86_64 -fsyntax-only -E %s 2>&1 | FileCheck %s --check-prefix=CHECK-PP
+// RUN: %clang --target=x86_64 -fsyntax-only -S %s 2>&1 | FileCheck %s --check-prefix=CHECK-ASM
+// RUN: %clang --target=x86_64 -fsyntax-only -c %s 2>&1 | FileCheck %s --check-prefix=CHECK-OBJ
+
+// CHECK-PP:  warning: argument unused during compilation: '-fsyntax-only' [-Wunused-command-line-argument]
+// CHECK-ASM: warning: argument unused during compilation: '-S' [-Wunused-command-line-argument]
+// CHECK-OBJ: warning: argument unused during compilation: '-c' [-Wunused-command-line-argument]
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index ba61457102a57..7d77ae75f8c47 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -309,8 +309,8 @@
 // HRESET: "-target-feature" "+hreset"
 // NO-HRESET: "-target-feature" "-hreset"
 
-// RUN: %clang --target=i386 -march=i386 -muintr %s -### 2>&1 | FileCheck -check-prefix=UINTR %s
-// RUN: %clang --target=i386 -march=i386 -mno-uintr %s -### 2>&1 | FileCheck -check-prefix=NO-UINTR %s
+// RUN: %clang --target=x86_64 -muintr %s -### 2>&1 | FileCheck -check-prefix=UINTR %s
+// RUN: %clang --target=x86_64 -mno-uintr %s -### 2>&1 | FileCheck -check-prefix=NO-UINTR %s
 // UINTR: "-target-feature" "+uintr"
 // NO-UINTR: "-target-feature" "-uintr"
 
@@ -409,6 +409,15 @@
 // NONX86-NEXT: warning: argument unused during compilation: '-msse4.2' [-Wunused-command-line-argument]
 // NONX86-NEXT: error: unsupported option '-mno-sgx' for target 'aarch64'
 
+// RUN: not %clang -### --target=i386 -muintr %s 2>&1 | FileCheck --check-prefix=NON-UINTR %s
+// RUN: %clang -### --target=i386 -mno-uintr %s 2>&1 > /dev/null
+// RUN: not %clang -### --target=i386 -mapx-features=ndd %s 2>&1 | FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang -### --target=i386 -mapxf %s 2>&1 | FileCheck --check-prefix=NON-APX %s
+// RUN: %clang -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-UINTR:    error: unsupported option '-muintr' for target 'i386'
+// NON-APX:      error: unsupported option '-mapx-features=|-mapxf' for target 'i386'
+// NON-APX-NOT:  error: {{.*}} -mapx-features=
+
 // RUN: %clang --target=i386 -march=i386 -mharden-sls=return %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-RET,NO-SLS %s
 // RUN: %clang --target=i386 -march=i386 -mharden-sls=indirect-jmp %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-IJMP,NO-SLS %s
 // RUN: %clang --target=i386 -march=i386 -mharden-sls=none -mharden-sls=all %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-IJMP,SLS-RET %s
@@ -428,8 +437,8 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s
 //
-// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+cf"
-// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-cf"
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
diff --git a/clang/test/Frontend/skip-function-bodies.cpp b/clang/test/Frontend/skip-function-bodies.cpp
new file mode 100644
index 0000000000000..d0593b474bda2
--- /dev/null
+++ b/clang/test/Frontend/skip-function-bodies.cpp
@@ -0,0 +1,13 @@
+// Trivial check to ensure skip-function-bodies flag is propagated.
+//
+// RUN: %clang_cc1 -verify -skip-function-bodies -pedantic-errors %s
+// expected-no-diagnostics
+
+int f() {
+  // normally this should emit some diags, but we're skipping it!
+  this is garbage;
+}
+
+// Make sure we only accept it as a cc1 arg.
+// RUN: not %clang -skip-function-bodies %s 2>&1 | FileCheck %s
+// CHECK: clang: error: unknown argument '-skip-function-bodies'; did you mean '-Xclang -skip-function-bodies'?
diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip
index cd085fdb5039a..ed1030b820627 100644
--- a/clang/test/Headers/__clang_hip_cmath.hip
+++ b/clang/test/Headers/__clang_hip_cmath.hip
@@ -13,7 +13,8 @@
 // RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
 // RUN:   -internal-isystem %S/Inputs/include \
 // RUN:   -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN:   -menable-no-nans -o - \
 // RUN:   -D__HIPCC_RTC__ | FileCheck -check-prefix=FINITEONLY %s
 
 // DEFAULT-LABEL: @test_fma_f16(
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 26da82843c512..6ee10976f1207 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -14,7 +14,8 @@
 // RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
 // RUN:   -internal-isystem %S/Inputs/include \
 // RUN:   -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN:   -menable-no-nans -o - \
 // RUN:   -D__HIPCC_RTC__ | FileCheck -check-prefixes=CHECK,FINITEONLY %s
 
 // Check that we end up with -fapprox-func set on intrinsic calls
diff --git a/clang/test/Headers/float.c b/clang/test/Headers/float.c
index d524d0e53f3fd..a3dd9c90f785c 100644
--- a/clang/test/Headers/float.c
+++ b/clang/test/Headers/float.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c99 -ffreestanding %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -ffreestanding %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c23 -ffreestanding %s
-// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -ffinite-math-only %s
+// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -menable-no-nans -menable-no-infs %s
 // RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++11 -ffreestanding %s
 // RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++14 -ffreestanding %s
 // RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++17 -ffreestanding %s
@@ -223,8 +223,9 @@
   #ifndef NAN
     #error "Mandatory macro NAN is missing."
   #endif
-  // FIXME: the NAN diagnostic should only be issued once, not twice.
-  _Static_assert(_Generic(INFINITY, float : 1, default : 0), ""); // finite-warning {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// FIXME: the NAN and INF diagnostics should only be issued once, not twice.
+  _Static_assert(_Generic(INFINITY, float : 1, default : 0), ""); // finite-warning {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} \
+								  finite-warning {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
   _Static_assert(_Generic(NAN, float : 1, default : 0), ""); // finite-warning {{use of NaN is undefined behavior due to the currently enabled floating-point options}} \
                                                                 finite-warning {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}}
 
diff --git a/clang/test/Headers/stdarg-cxx-modules.cpp b/clang/test/Headers/stdarg-cxx-modules.cpp
new file mode 100644
index 0000000000000..113ece4fb64b3
--- /dev/null
+++ b/clang/test/Headers/stdarg-cxx-modules.cpp
@@ -0,0 +1,25 @@
+// RUN: rm -fR %t
+// RUN: split-file %s %t
+// RUN: cd %t
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header h1.h
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header h2.h -fmodule-file=h1.pcm
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only main.cpp -fmodule-file=h1.pcm -fmodule-file=h2.pcm
+
+//--- h1.h
+#include <stdarg.h>
+// expected-no-diagnostics
+
+//--- h2.h
+import "h1.h";
+// expected-no-diagnostics
+
+//--- main.cpp
+import "h1.h";
+import "h2.h";
+
+void foo(int x, ...) {
+  va_list v;
+  va_start(v, x);
+  va_end(v);
+}
+// expected-no-diagnostics
diff --git a/clang/test/Headers/stdatomic.c b/clang/test/Headers/stdatomic.c
index 3643fd4245b31..9afd531a9ed9b 100644
--- a/clang/test/Headers/stdatomic.c
+++ b/clang/test/Headers/stdatomic.c
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -std=c11 -E %s | FileCheck %s
 // RUN: %clang_cc1 -std=c11 -fms-compatibility -E %s | FileCheck %s
+// RUN: %clang_cc1 -std=c11 %s -verify
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -verify
+// expected-no-diagnostics
 #include <stdatomic.h>
 
 int bool_lock_free = ATOMIC_BOOL_LOCK_FREE;
@@ -31,3 +34,5 @@ int llong_lock_free = ATOMIC_LLONG_LOCK_FREE;
 
 int pointer_lock_free = ATOMIC_POINTER_LOCK_FREE;
 // CHECK: pointer_lock_free = {{ *[012] *;}}
+
+atomic_flag f = ATOMIC_FLAG_INIT;
diff --git a/clang/test/Headers/stddefneeds.cpp b/clang/test/Headers/stddefneeds.cpp
index 0763bbdee13ae..0282e8afa600d 100644
--- a/clang/test/Headers/stddefneeds.cpp
+++ b/clang/test/Headers/stddefneeds.cpp
@@ -56,14 +56,21 @@ max_align_t m5;
 #undef NULL
 #define NULL 0
 
-// glibc (and other) headers then define __need_NULL and rely on stddef.h
-// to redefine NULL to the correct value again.
-#define __need_NULL
+// Including stddef.h again shouldn't redefine NULL
 #include <stddef.h>
 
 // gtk headers then use __attribute__((sentinel)), which doesn't work if NULL
 // is 0.
-void f(const char* c, ...) __attribute__((sentinel));
+void f(const char* c, ...) __attribute__((sentinel)); // expected-note{{function has been explicitly marked sentinel here}}
 void g() {
+  f("", NULL); // expected-warning{{missing sentinel in function call}}
+}
+
+// glibc (and other) headers then define __need_NULL and rely on stddef.h
+// to redefine NULL to the correct value again.
+#define __need_NULL
+#include <stddef.h>
+
+void h() {
   f("", NULL);  // Shouldn't warn.
 }
diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c
index a75b3380368c0..15e4a431df65b 100644
--- a/clang/test/Headers/xmmintrin.c
+++ b/clang/test/Headers/xmmintrin.c
@@ -14,7 +14,7 @@ _MM_ALIGN16 char c;
 // checking that clang emits PACKSSDW instead of PACKSSWB.
 
 // CHECK: define{{.*}} i64 @test_mm_cvtps_pi16
-// CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128
 
 __m64 test_mm_cvtps_pi16(__m128 a) {
   return _mm_cvtps_pi16(a);
diff --git a/clang/test/Index/pch-with-errors.c b/clang/test/Index/pch-with-errors.c
index e8711c8e26a9b..cfe58c155cd6d 100644
--- a/clang/test/Index/pch-with-errors.c
+++ b/clang/test/Index/pch-with-errors.c
@@ -38,7 +38,7 @@ void foo(void) {
 // CHECK-INDEX: [indexEntityReference]: kind: function | name: erroneous
 
 // RUN: not %clang -fsyntax-only %s -include %t.h 2>&1 | FileCheck -check-prefix=PCH-ERR %s
-// PCH-ERR: error: PCH file contains compiler errors
+// PCH-ERR: error: PCH file '{{.*}}' contains compiler errors
 
 // RUN: not c-index-test -write-pch %t.pch foobar.c 2>&1 | FileCheck -check-prefix=NONEXISTENT %s
 // NONEXISTENT: Unable to load translation unit
diff --git a/clang/test/InstallAPI/diagnostics-dsym.test b/clang/test/InstallAPI/diagnostics-dsym.test
index 42fa67a1f9b1e..b8433653c3f76 100644
--- a/clang/test/InstallAPI/diagnostics-dsym.test
+++ b/clang/test/InstallAPI/diagnostics-dsym.test
@@ -19,8 +19,8 @@
 ; RUN: --verify-mode=Pedantic 2>&1 | FileCheck %s
 
 ; CHECK: violations found for arm64 
-; CHECK: foo.c:5:0: error: no declaration found for exported symbol 'bar' in dynamic library
-; CHECK: foo.c:1:0: error: no declaration found for exported symbol 'foo' in dynamic library
+; CHECK-DAG: foo.c:5:0: error: no declaration found for exported symbol 'bar' in dynamic library
+; CHECK-DAG: foo.c:1:0: error: no declaration found for exported symbol 'foo' in dynamic library
 
 ;--- foo.c
 int foo(void) {
diff --git a/clang/test/InstallAPI/directory-scanning-subdirectories.test b/clang/test/InstallAPI/directory-scanning-subdirectories.test
new file mode 100644
index 0000000000000..3eac90440fa1e
--- /dev/null
+++ b/clang/test/InstallAPI/directory-scanning-subdirectories.test
@@ -0,0 +1,61 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: mkdir -p %t/DstRoot/
+; RUN: cp -r %S/Inputs/LibFoo/* %t/DstRoot/
+
+; RUN: clang-installapi \
+; RUN: -target arm64-apple-macos12 -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -I%t/DstRoot/usr/include -dynamiclib \
+; RUN: -exclude-public-header %t/DstRoot/usr/include/public.h \
+; RUN: %t/DstRoot -o %t/output.tbd 2>&1 | FileCheck %s --allow-empty \
+; RUN: --implicit-check-not=error --implicit-check-not=warning 
+; RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 
+
+
+;--- DstRoot/usr/include/extra/extra.h
+int extra(void);
+
+;--- DstRoot/usr/include/extra/additional/additional.h
+int additional(void);
+
+;--- DstRoot/usr/include/more/more.h
+int more(void);
+
+;--- DstRoot/usr/include/another/another.h
+int another(void);
+
+;--- expected.tbd
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "text": {
+          "global": [
+            "_foo", "_additional", "_more",
+            "_another", "_extra"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libfoo.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "12",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 7809c826755f8..d191067b93d78 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -83,6 +83,7 @@
 // CHECK-NEXT: HIPManaged (SubjectMatchRule_variable)
 // CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_record_not_is_union)
 // CHECK-NEXT: Hot (SubjectMatchRule_function)
+// CHECK-NEXT: HybridPatchable (SubjectMatchRule_function)
 // CHECK-NEXT: IBAction (SubjectMatchRule_objc_method_is_instance)
 // CHECK-NEXT: IFunc (SubjectMatchRule_function)
 // CHECK-NEXT: InitPriority (SubjectMatchRule_variable)
@@ -110,6 +111,7 @@
 // CHECK-NEXT: Naked (SubjectMatchRule_function)
 // CHECK-NEXT: NoBuiltin (SubjectMatchRule_function)
 // CHECK-NEXT: NoCommon (SubjectMatchRule_variable)
+// CHECK-NEXT: NoConvergent (SubjectMatchRule_function)
 // CHECK-NEXT: NoDebug (SubjectMatchRule_type_alias, SubjectMatchRule_hasType_functionType, SubjectMatchRule_objc_method, SubjectMatchRule_variable_not_is_parameter)
 // CHECK-NEXT: NoDestroy (SubjectMatchRule_variable)
 // CHECK-NEXT: NoDuplicate (SubjectMatchRule_function)
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index a5f9ffa21220a..e0757b69242a8 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -57,7 +57,7 @@
 
 // RUN: not %clang_cc1 -triple powerpc--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix PPC
 // PPC: error: unknown target CPU 'not-a-cpu'
-// PPC-NEXT: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, ppc32, powerpc64, ppc64, powerpc64le, ppc64le, future{{$}}
+// PPC-NEXT: note: valid target CPU values are: generic, 440, 440fp, ppc440, 450, 601, 602, 603, 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, 8548, ppc405, ppc464, ppc476, 970, ppc970, g5, a2, ppca2, ppc-cell-be, e500, e500mc, e5500, power3, pwr3, pwr4, power4, pwr5, power5, pwr5+, power5+, pwr5x, power5x, pwr6, power6, pwr6x, power6x, pwr7, power7, pwr8, power8, pwr9, power9, pwr10, power10, pwr11, power11, powerpc, ppc, ppc32, powerpc64, ppc64, powerpc64le, ppc64le, future{{$}}
 
 // RUN: not %clang_cc1 -triple mips--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix MIPS
 // MIPS: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/Modules/inline-builtins.cppm b/clang/test/Modules/inline-builtins.cppm
new file mode 100644
index 0000000000000..8a0fffbfc25bc
--- /dev/null
+++ b/clang/test/Modules/inline-builtins.cppm
@@ -0,0 +1,36 @@
+// REQUIRES: !system-windows
+//
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -std=c++20 -O3 %t/a.cppm -emit-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 -O3 %t/test.cc -fmodule-file=a=%t/a.pcm \
+// RUN:   -emit-llvm -o - | FileCheck %t/test.cc
+
+//--- memmove.h
+typedef long unsigned int size_t;
+extern "C" void *memmove (void *__dest, const void *__src, size_t __n)
+     throw () __attribute__ ((__nonnull__ (1, 2)));
+extern "C" __inline __attribute__ ((__always_inline__)) __attribute__ ((__gnu_inline__)) void *
+ memmove (void *__dest, const void *__src, size_t __len) throw ()
+{
+  return __builtin_memmove(__dest, __src, __len);
+}
+
+//--- a.cppm
+module;
+#include "memmove.h"
+export module a;
+export using ::memmove;
+
+//--- test.cc
+import a;
+
+void test() {
+  int a, b;
+  unsigned c = 0;
+  memmove(&a, &b, c);
+}
+
+// CHECK-NOT: memmove
diff --git a/clang/test/Modules/load-module-with-errors.m b/clang/test/Modules/load-module-with-errors.m
index 1f8e483a19e92..6e10cb3381be8 100644
--- a/clang/test/Modules/load-module-with-errors.m
+++ b/clang/test/Modules/load-module-with-errors.m
@@ -1,7 +1,7 @@
 // Note: the run lines follow their respective tests, since line/column
 // matter in this test.
 
-// pcherror-error@* {{PCH file contains compiler errors}}
+// pcherror-error-re@* {{module file '{{.*}}use_error_a.pcm' contains compiler errors}}
 @import use_error_a; // notallowerror-error {{could not build module 'use_error_a'}}
 @import use_error_b;
 // expected-no-diagnostics
@@ -61,7 +61,7 @@ void test(Error *x) {
 // RUN:   -fmodule-file=%t/prebuilt/use_error_a.pcm \
 // RUN:   -fmodule-file=%t/prebuilt/use_error_b.pcm \
 // RUN:   -fmodules-cache-path=%t 2>&1 | \
-// RUN: grep "PCH file contains compiler errors"
+// RUN: grep "module file .* contains compiler errors"
 
 // Shouldn't build the cached modules (that have errors) when not allowing
 // errors
diff --git a/clang/test/Modules/modulemap-allow-subdirectory-search.m b/clang/test/Modules/modulemap-allow-subdirectory-search.m
new file mode 100644
index 0000000000000..ef6f9b1009bab
--- /dev/null
+++ b/clang/test/Modules/modulemap-allow-subdirectory-search.m
@@ -0,0 +1,18 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m -fmodulemap-allow-subdirectory-search
+// RUN: not %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m -fno-modulemap-allow-subdirectory-search
+
+//--- include/UnrelatedName/Header.h
+// empty
+
+//--- include/UnrelatedName/module.modulemap
+module UsefulCode {
+  header "Header.h"
+  export *
+}
+
+//--- test.m
+@import UsefulCode;
diff --git a/clang/test/Modules/pch-in-module-units.cppm b/clang/test/Modules/pch-in-module-units.cppm
new file mode 100644
index 0000000000000..790a3af09a0ca
--- /dev/null
+++ b/clang/test/Modules/pch-in-module-units.cppm
@@ -0,0 +1,51 @@
+// Test that we will skip ODR checks for declarations from PCH if they
+// were from GMF.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/A.cppm \
+// RUN:   -o %t/A.pcm -fskip-odr-check-in-gmf
+// RUN: %clang_cc1 -std=c++20 -DDIFF -x c++-header %t/foo.h \
+// RUN:   -emit-pch -o %t/foo.pch -fskip-odr-check-in-gmf
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=A=%t/A.pcm -include-pch \
+// RUN:   %t/foo.pch -verify -fsyntax-only -fskip-odr-check-in-gmf
+
+//--- foo.h
+#ifndef FOO_H
+#define FOO_H
+inline int foo() {
+#ifndef DIFF
+  return 43;
+#else
+  return 45;
+#endif
+}
+
+class f {
+public:
+  int mem() {
+#ifndef DIFF
+    return 47;
+#else
+    return 45;
+#endif
+  }
+};
+#endif
+
+//--- A.cppm
+module;
+#include "foo.h"
+export module A;
+export using ::foo;
+export using ::f;
+
+//--- B.cppm
+// expected-no-diagnostics
+module;
+#include "foo.h"
+export module B;
+import A;
+export int b = foo() + f().mem();
diff --git a/clang/test/Modules/stddef.cpp b/clang/test/Modules/stddef.cpp
new file mode 100644
index 0000000000000..c53bfa3485194
--- /dev/null
+++ b/clang/test/Modules/stddef.cpp
@@ -0,0 +1,73 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/no-lsv -I%t %t/stddef.cpp -verify
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-local-submodule-visibility -fmodules-cache-path=%t/lsv -I%t %t/stddef.cpp -verify
+
+//--- stddef.cpp
+#include <b.h>
+
+void *pointer = NULL;
+size_t size = 0;
+
+// When building with modules, a pcm is never re-imported, so re-including
+// stddef.h will not re-import _Builtin_stddef.null to restore the definition of
+// NULL, even though stddef.h will unconditionally include __stddef_null.h when
+// building with modules.
+#undef NULL
+#include <stddef.h>
+
+void *anotherPointer = NULL; // expected-error{{use of undeclared identifier 'NULL'}}
+
+// stddef.h needs to be a `textual` header to support clients doing things like
+// this.
+//
+// #define __need_NULL
+// #include <stddef.h>
+//
+// As a textual header designed to be included multiple times, it can't directly
+// declare anything, or those declarations would go into every module that
+// included it. e.g. if stddef.h contained all of its declarations, and modules
+// A and B included stddef.h, they would both have the declaration for size_t.
+// That breaks Swift, which uses the module name as part of the type name, i.e.
+// A.size_t and B.size_t are treated as completely different types in Swift and
+// cannot be interchanged. To fix that, stddef.h (and stdarg.h) are split out
+// into a separate file per __need macro that can be normal headers in explicit
+// submodules. That runs into yet another wrinkle though. When modules build,
+// declarations from previous submodules leak into subsequent ones when not
+// using local submodule visibility. Consider if stddef.h did the normal thing.
+//
+// #ifndef __STDDEF_H
+// #define __STDDEF_H
+// // include all of the sub-headers
+// #endif
+//
+// When SM builds without local submodule visibility, it will precompile a.h
+// first. When it gets to b.h, the __STDDEF_H declaration from precompiling a.h
+// will leak, and so when b.h includes stddef.h, it won't include any of its
+// sub-headers, and SM.B will thus not import _Builtin_stddef or make any of its
+// submodules visible. Precompiling b.h will be fine since it sees all of the
+// declarations from a.h including stddef.h, but clients that only include b.h
+// will not see any of the stddef.h types. stddef.h thus has to make sure to
+// always include the necessary sub-headers, even if they've been included
+// already. They all have their own header guards to allow this.
+// __stddef_null.h is extra special, so this test makes sure to cover NULL plus
+// one of the normal stddef.h types.
+
+//--- module.modulemap
+module SM {
+  module A {
+    header "a.h"
+    export *
+  }
+
+  module B {
+    header "b.h"
+    export *
+  }
+}
+
+//--- a.h
+#include <stddef.h>
+
+//--- b.h
+#include <stddef.h>
diff --git a/clang/test/Modules/subdirectory-module-maps-working-dir.m b/clang/test/Modules/subdirectory-module-maps-working-dir.m
index 4fb19ff05ef38..43488e8d13601 100644
--- a/clang/test/Modules/subdirectory-module-maps-working-dir.m
+++ b/clang/test/Modules/subdirectory-module-maps-working-dir.m
@@ -1,8 +1,8 @@
 // RUN: rm -rf %t
-// RUN: %clang -fsyntax-only -fmodules -fmodules-cache-path=%t \
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \
 // RUN:    -working-directory %S/Inputs \
 // RUN:    -I subdirectory-module-maps-working-dir \
-// RUN:    %s -Werror=implicit-function-declaration -Xclang -verify
+// RUN:    %s -Werror=implicit-function-declaration -verify
 
 @import ModuleInSubdir;
 
diff --git a/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
new file mode 100644
index 0000000000000..7a34113cec8fa
--- /dev/null
+++ b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=DEFAULT %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -munsafe-fp-atomics -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+
+#pragma omp declare target
+
+float fv, fx;
+double dv, dx;
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEFAULT-NEXT:  [[ENTRY:.*:]]
+// DEFAULT-NEXT:    [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4
+// DEFAULT-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT:    store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT:    ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-FP-ATOMICS-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]], !amdgpu.ignore.denormal.mode [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT:    store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT:    ret void
+//
+void atomic_fadd_f32() {
+#pragma omp atomic capture
+  fv = fx = fx + fv;
+}
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// DEFAULT-SAME: ) #[[ATTR0]] {
+// DEFAULT-NEXT:  [[ENTRY:.*:]]
+// DEFAULT-NEXT:    [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8
+// DEFAULT-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT:    store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT:    ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0]] {
+// UNSAFE-FP-ATOMICS-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT:    store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT:    ret void
+//
+void atomic_fadd_f64() {
+#pragma omp atomic capture
+  dv = dx = dx + dv;
+}
+
+#pragma omp end declare target
+//.
+// UNSAFE-FP-ATOMICS: [[META5]] = !{}
+//.
diff --git a/clang/test/OpenMP/generic_loop_ast_print.cpp b/clang/test/OpenMP/generic_loop_ast_print.cpp
index b61ee79615d04..b361724c12a0d 100644
--- a/clang/test/OpenMP/generic_loop_ast_print.cpp
+++ b/clang/test/OpenMP/generic_loop_ast_print.cpp
@@ -23,7 +23,7 @@
 
 //PRINT: template <typename T, int C> void templ_foo(T t) {
 //PRINT:   T j, z;
-//PRINT:   #pragma omp simd collapse(C) reduction(+: z) lastprivate(j)
+//PRINT:   #pragma omp loop collapse(C) reduction(+: z) lastprivate(j) bind(thread)
 //PRINT:   for (T i = 0; i < t; ++i)
 //PRINT:       for (j = 0; j < t; ++j)
 //PRINT:           z += i + j;
@@ -31,19 +31,20 @@
 //DUMP: FunctionTemplateDecl{{.*}}templ_foo
 //DUMP: TemplateTypeParmDecl{{.*}}T
 //DUMP: NonTypeTemplateParmDecl{{.*}}C
-//DUMP: OMPSimdDirective
+//DUMP: OMPGenericLoopDirective
 //DUMP: OMPCollapseClause
 //DUMP: DeclRefExpr{{.*}}'C' 'int'
 //DUMP: OMPReductionClause
 //DUMP: DeclRefExpr{{.*}}'z' 'T'
 //DUMP: OMPLastprivateClause
 //DUMP: DeclRefExpr{{.*}}'j' 'T'
+//DUMP: OMPBindClause
 //DUMP: ForStmt
 //DUMP: ForStmt
 
 //PRINT: template<> void templ_foo<int, 2>(int t) {
 //PRINT:     int j, z;
-//PRINT:     #pragma omp simd collapse(2) reduction(+: z) lastprivate(j)
+//PRINT:     #pragma omp loop collapse(2) reduction(+: z) lastprivate(j) bind(thread)
 //PRINT:         for (int i = 0; i < t; ++i)
 //PRINT:             for (j = 0; j < t; ++j)
 //PRINT:                 z += i + j;
@@ -52,7 +53,7 @@
 //DUMP: TemplateArgument type 'int'
 //DUMP: TemplateArgument integral '2'
 //DUMP: ParmVarDecl{{.*}}'int'
-//DUMP: OMPSimdDirective
+//DUMP: OMPGenericLoopDirective
 //DUMP: OMPCollapseClause
 //DUMP: ConstantExpr{{.*}}'int'
 //DUMP: value: Int 2
@@ -60,6 +61,7 @@
 //DUMP: DeclRefExpr{{.*}}'z' 'int'
 //DUMP: OMPLastprivateClause
 //DUMP: DeclRefExpr{{.*}}'j' 'int'
+//DUMP: OMPBindClause
 //DUMP: ForStmt
 template <typename T, int C>
 void templ_foo(T t) {
@@ -80,12 +82,12 @@ void test() {
   int aaa[1000];
 
   //PRINT: #pragma omp target teams distribute parallel for map(tofrom: MTX)
-  //PRINT: #pragma omp simd
+  //PRINT: #pragma omp loop
   //DUMP: OMPTargetTeamsDistributeParallelForDirective
   //DUMP: CapturedStmt
   //DUMP: ForStmt
   //DUMP: CompoundStmt
-  //DUMP: OMPSimdDirective
+  //DUMP: OMPGenericLoopDirective
   #pragma omp target teams distribute parallel for map(MTX)
   for (auto i = 0; i < N; ++i) {
     #pragma omp loop
@@ -95,11 +97,11 @@ void test() {
   }
 
   //PRINT: #pragma omp target teams
-  //PRINT: #pragma omp distribute
+  //PRINT: #pragma omp loop
   //DUMP: OMPTargetTeamsDirective
   //DUMP: CapturedStmt
   //DUMP: ForStmt
-  //DUMP: OMPDistributeDirective
+  //DUMP: OMPGenericLoopDirective
   #pragma omp target teams
   for (int i=0; i<1000; ++i) {
     #pragma omp loop
@@ -109,8 +111,8 @@ void test() {
   }
 
   int j, z, z1;
-  //PRINT: #pragma omp for collapse(2) private(z) lastprivate(j) order(concurrent) reduction(+: z1)
-  //DUMP: OMPForDirective
+  //PRINT: #pragma omp loop collapse(2) private(z) lastprivate(j) order(concurrent) reduction(+: z1) bind(parallel)
+  //DUMP: OMPGenericLoopDirective
   //DUMP: OMPCollapseClause
   //DUMP: IntegerLiteral{{.*}}2
   //DUMP: OMPPrivateClause
@@ -120,6 +122,7 @@ void test() {
   //DUMP: OMPOrderClause
   //DUMP: OMPReductionClause
   //DUMP-NEXT: DeclRefExpr{{.*}}'z1'
+  //DUMP: OMPBindClause
   //DUMP: ForStmt
   //DUMP: ForStmt
   #pragma omp loop collapse(2) private(z) lastprivate(j) order(concurrent) \
@@ -133,9 +136,10 @@ void test() {
   }
 
   //PRINT: #pragma omp target teams
-  //PRINT: #pragma omp distribute
+  //PRINT: #pragma omp loop bind(teams)
   //DUMP: OMPTargetTeamsDirective
-  //DUMP: OMPDistributeDirective
+  //DUMP: OMPGenericLoopDirective
+  //DUMP: OMPBindClause
   //DUMP: ForStmt
   #pragma omp target teams
   #pragma omp loop bind(teams)
@@ -143,10 +147,11 @@ void test() {
 
   //PRINT: #pragma omp target
   //PRINT: #pragma omp teams
-  //PRINT: #pragma omp distribute
+  //PRINT: #pragma omp loop bind(teams)
   //DUMP: OMPTargetDirective
   //DUMP: OMPTeamsDirective
-  //DUMP: OMPDistributeDirective
+  //DUMP: OMPGenericLoopDirective
+  //DUMP: OMPBindClause
   //DUMP: ForStmt
   #pragma omp target
   #pragma omp teams
diff --git a/clang/test/OpenMP/generic_loop_codegen.cpp b/clang/test/OpenMP/generic_loop_codegen.cpp
index c3ad43bebccaf..d062695fee281 100644
--- a/clang/test/OpenMP/generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/generic_loop_codegen.cpp
@@ -32,6 +32,8 @@ void foo(int t) {
 // IR-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
 // IR-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
 // IR-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// IR-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// IR-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
 // IR-NEXT:    [[I8:%.*]] = alloca i32, align 4
 // IR-NEXT:    [[J9:%.*]] = alloca i32, align 4
 // IR-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
@@ -54,86 +56,89 @@ void foo(int t) {
 // IR-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
 // IR-NEXT:    [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
 // IR-NEXT:    store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// IR-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// IR-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// IR-NEXT:    store i64 [[TMP4]], ptr [[DOTOMP_UB]], align 8
 // IR-NEXT:    store i32 0, ptr [[I8]], align 4
 // IR-NEXT:    store i32 0, ptr [[J9]], align 4
-// IR-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// IR-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// IR-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
 // IR-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[SIMD_IF_END:%.*]]
 // IR:       land.lhs.true:
-// IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT:    [[CMP10:%.*]] = icmp slt i32 0, [[TMP5]]
+// IR-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT:    [[CMP10:%.*]] = icmp slt i32 0, [[TMP6]]
 // IR-NEXT:    br i1 [[CMP10]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END]]
 // IR:       simd.if.then:
-// IR-NEXT:    store i64 0, ptr [[DOTOMP_IV]], align 8
+// IR-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// IR-NEXT:    store i64 [[TMP7]], ptr [[DOTOMP_IV]], align 8
 // IR-NEXT:    store i32 0, ptr [[Z13]], align 4
 // IR-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // IR:       omp.inner.for.cond:
-// IR-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3:![0-9]+]]
-// IR-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP7]], 1
-// IR-NEXT:    [[CMP14:%.*]] = icmp slt i64 [[TMP6]], [[ADD]]
+// IR-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[CMP14:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
 // IR-NEXT:    br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // IR:       omp.inner.for.body:
-// IR-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP9]], 0
+// IR-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP11]], 0
 // IR-NEXT:    [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
 // IR-NEXT:    [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
 // IR-NEXT:    [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
-// IR-NEXT:    [[DIV19:%.*]] = sdiv i64 [[TMP8]], [[CONV18]]
+// IR-NEXT:    [[DIV19:%.*]] = sdiv i64 [[TMP10]], [[CONV18]]
 // IR-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
 // IR-NEXT:    [[ADD21:%.*]] = add nsw i64 0, [[MUL20]]
 // IR-NEXT:    [[CONV22:%.*]] = trunc i64 [[ADD21]] to i32
 // IR-NEXT:    store i32 [[CONV22]], ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[SUB23:%.*]] = sub nsw i32 [[TMP12]], 0
+// IR-NEXT:    [[TMP12:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[SUB23:%.*]] = sub nsw i32 [[TMP14]], 0
 // IR-NEXT:    [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1
 // IR-NEXT:    [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]]
 // IR-NEXT:    [[CONV26:%.*]] = sext i32 [[MUL25]] to i64
-// IR-NEXT:    [[DIV27:%.*]] = sdiv i64 [[TMP11]], [[CONV26]]
-// IR-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[SUB28:%.*]] = sub nsw i32 [[TMP13]], 0
+// IR-NEXT:    [[DIV27:%.*]] = sdiv i64 [[TMP13]], [[CONV26]]
+// IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[SUB28:%.*]] = sub nsw i32 [[TMP15]], 0
 // IR-NEXT:    [[DIV29:%.*]] = sdiv i32 [[SUB28]], 1
 // IR-NEXT:    [[MUL30:%.*]] = mul nsw i32 1, [[DIV29]]
 // IR-NEXT:    [[CONV31:%.*]] = sext i32 [[MUL30]] to i64
 // IR-NEXT:    [[MUL32:%.*]] = mul nsw i64 [[DIV27]], [[CONV31]]
-// IR-NEXT:    [[SUB33:%.*]] = sub nsw i64 [[TMP10]], [[MUL32]]
+// IR-NEXT:    [[SUB33:%.*]] = sub nsw i64 [[TMP12]], [[MUL32]]
 // IR-NEXT:    [[MUL34:%.*]] = mul nsw i64 [[SUB33]], 1
 // IR-NEXT:    [[ADD35:%.*]] = add nsw i64 0, [[MUL34]]
 // IR-NEXT:    [[CONV36:%.*]] = trunc i64 [[ADD35]] to i32
 // IR-NEXT:    store i32 [[CONV36]], ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[ADD37:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
-// IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[ADD38:%.*]] = add nsw i32 [[TMP16]], [[ADD37]]
+// IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[TMP17:%.*]] = load i32, ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[ADD37:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[ADD38:%.*]] = add nsw i32 [[TMP18]], [[ADD37]]
 // IR-NEXT:    store i32 [[ADD38]], ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
 // IR-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // IR:       omp.body.continue:
 // IR-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // IR:       omp.inner.for.inc:
-// IR-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-NEXT:    [[ADD39:%.*]] = add nsw i64 [[TMP17]], 1
+// IR-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT:    [[ADD39:%.*]] = add nsw i64 [[TMP19]], 1
 // IR-NEXT:    store i64 [[ADD39]], ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
 // IR-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
 // IR:       omp.inner.for.end:
-// IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// IR-NEXT:    [[SUB40:%.*]] = sub nsw i32 [[TMP18]], 0
+// IR-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// IR-NEXT:    [[SUB40:%.*]] = sub nsw i32 [[TMP20]], 0
 // IR-NEXT:    [[DIV41:%.*]] = sdiv i32 [[SUB40]], 1
 // IR-NEXT:    [[MUL42:%.*]] = mul nsw i32 [[DIV41]], 1
 // IR-NEXT:    [[ADD43:%.*]] = add nsw i32 0, [[MUL42]]
 // IR-NEXT:    store i32 [[ADD43]], ptr [[I11]], align 4
-// IR-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT:    [[SUB44:%.*]] = sub nsw i32 [[TMP19]], 0
+// IR-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT:    [[SUB44:%.*]] = sub nsw i32 [[TMP21]], 0
 // IR-NEXT:    [[DIV45:%.*]] = sdiv i32 [[SUB44]], 1
 // IR-NEXT:    [[MUL46:%.*]] = mul nsw i32 [[DIV45]], 1
 // IR-NEXT:    [[ADD47:%.*]] = add nsw i32 0, [[MUL46]]
 // IR-NEXT:    store i32 [[ADD47]], ptr [[J]], align 4
-// IR-NEXT:    [[TMP20:%.*]] = load i32, ptr [[Z]], align 4
-// IR-NEXT:    [[TMP21:%.*]] = load i32, ptr [[Z13]], align 4
-// IR-NEXT:    [[ADD48:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// IR-NEXT:    [[TMP22:%.*]] = load i32, ptr [[Z]], align 4
+// IR-NEXT:    [[TMP23:%.*]] = load i32, ptr [[Z13]], align 4
+// IR-NEXT:    [[ADD48:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
 // IR-NEXT:    store i32 [[ADD48]], ptr [[Z]], align 4
 // IR-NEXT:    br label [[SIMD_IF_END]]
 // IR:       simd.if.end:
@@ -152,6 +157,8 @@ void foo(int t) {
 // IR-PCH-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
 // IR-PCH-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
 // IR-PCH-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
 // IR-PCH-NEXT:    [[I8:%.*]] = alloca i32, align 4
 // IR-PCH-NEXT:    [[J9:%.*]] = alloca i32, align 4
 // IR-PCH-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
@@ -174,86 +181,89 @@ void foo(int t) {
 // IR-PCH-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
 // IR-PCH-NEXT:    [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
 // IR-PCH-NEXT:    store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// IR-PCH-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// IR-PCH-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// IR-PCH-NEXT:    store i64 [[TMP4]], ptr [[DOTOMP_UB]], align 8
 // IR-PCH-NEXT:    store i32 0, ptr [[I8]], align 4
 // IR-PCH-NEXT:    store i32 0, ptr [[J9]], align 4
-// IR-PCH-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// IR-PCH-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// IR-PCH-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
 // IR-PCH-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[SIMD_IF_END:%.*]]
 // IR-PCH:       land.lhs.true:
-// IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// IR-PCH-NEXT:    [[CMP10:%.*]] = icmp slt i32 0, [[TMP5]]
+// IR-PCH-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// IR-PCH-NEXT:    [[CMP10:%.*]] = icmp slt i32 0, [[TMP6]]
 // IR-PCH-NEXT:    br i1 [[CMP10]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END]]
 // IR-PCH:       simd.if.then:
-// IR-PCH-NEXT:    store i64 0, ptr [[DOTOMP_IV]], align 8
+// IR-PCH-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// IR-PCH-NEXT:    store i64 [[TMP7]], ptr [[DOTOMP_IV]], align 8
 // IR-PCH-NEXT:    store i32 0, ptr [[Z13]], align 4
 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // IR-PCH:       omp.inner.for.cond:
-// IR-PCH-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3:![0-9]+]]
-// IR-PCH-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP7]], 1
-// IR-PCH-NEXT:    [[CMP14:%.*]] = icmp slt i64 [[TMP6]], [[ADD]]
+// IR-PCH-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-PCH-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[CMP14:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
 // IR-PCH-NEXT:    br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // IR-PCH:       omp.inner.for.body:
-// IR-PCH-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP9]], 0
+// IR-PCH-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP11]], 0
 // IR-PCH-NEXT:    [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
 // IR-PCH-NEXT:    [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
 // IR-PCH-NEXT:    [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
-// IR-PCH-NEXT:    [[DIV19:%.*]] = sdiv i64 [[TMP8]], [[CONV18]]
+// IR-PCH-NEXT:    [[DIV19:%.*]] = sdiv i64 [[TMP10]], [[CONV18]]
 // IR-PCH-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
 // IR-PCH-NEXT:    [[ADD21:%.*]] = add nsw i64 0, [[MUL20]]
 // IR-PCH-NEXT:    [[CONV22:%.*]] = trunc i64 [[ADD21]] to i32
 // IR-PCH-NEXT:    store i32 [[CONV22]], ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[SUB23:%.*]] = sub nsw i32 [[TMP12]], 0
+// IR-PCH-NEXT:    [[TMP12:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[SUB23:%.*]] = sub nsw i32 [[TMP14]], 0
 // IR-PCH-NEXT:    [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1
 // IR-PCH-NEXT:    [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]]
 // IR-PCH-NEXT:    [[CONV26:%.*]] = sext i32 [[MUL25]] to i64
-// IR-PCH-NEXT:    [[DIV27:%.*]] = sdiv i64 [[TMP11]], [[CONV26]]
-// IR-PCH-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[SUB28:%.*]] = sub nsw i32 [[TMP13]], 0
+// IR-PCH-NEXT:    [[DIV27:%.*]] = sdiv i64 [[TMP13]], [[CONV26]]
+// IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[SUB28:%.*]] = sub nsw i32 [[TMP15]], 0
 // IR-PCH-NEXT:    [[DIV29:%.*]] = sdiv i32 [[SUB28]], 1
 // IR-PCH-NEXT:    [[MUL30:%.*]] = mul nsw i32 1, [[DIV29]]
 // IR-PCH-NEXT:    [[CONV31:%.*]] = sext i32 [[MUL30]] to i64
 // IR-PCH-NEXT:    [[MUL32:%.*]] = mul nsw i64 [[DIV27]], [[CONV31]]
-// IR-PCH-NEXT:    [[SUB33:%.*]] = sub nsw i64 [[TMP10]], [[MUL32]]
+// IR-PCH-NEXT:    [[SUB33:%.*]] = sub nsw i64 [[TMP12]], [[MUL32]]
 // IR-PCH-NEXT:    [[MUL34:%.*]] = mul nsw i64 [[SUB33]], 1
 // IR-PCH-NEXT:    [[ADD35:%.*]] = add nsw i64 0, [[MUL34]]
 // IR-PCH-NEXT:    [[CONV36:%.*]] = trunc i64 [[ADD35]] to i32
 // IR-PCH-NEXT:    store i32 [[CONV36]], ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[ADD37:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
-// IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[ADD38:%.*]] = add nsw i32 [[TMP16]], [[ADD37]]
+// IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I11]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[TMP17:%.*]] = load i32, ptr [[J12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[ADD37:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[ADD38:%.*]] = add nsw i32 [[TMP18]], [[ADD37]]
 // IR-PCH-NEXT:    store i32 [[ADD38]], ptr [[Z13]], align 4, !llvm.access.group [[ACC_GRP3]]
 // IR-PCH-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // IR-PCH:       omp.body.continue:
 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // IR-PCH:       omp.inner.for.inc:
-// IR-PCH-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
-// IR-PCH-NEXT:    [[ADD39:%.*]] = add nsw i64 [[TMP17]], 1
+// IR-PCH-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT:    [[ADD39:%.*]] = add nsw i64 [[TMP19]], 1
 // IR-PCH-NEXT:    store i64 [[ADD39]], ptr [[DOTOMP_IV]], align 8, !llvm.access.group [[ACC_GRP3]]
 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
 // IR-PCH:       omp.inner.for.end:
-// IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// IR-PCH-NEXT:    [[SUB40:%.*]] = sub nsw i32 [[TMP18]], 0
+// IR-PCH-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// IR-PCH-NEXT:    [[SUB40:%.*]] = sub nsw i32 [[TMP20]], 0
 // IR-PCH-NEXT:    [[DIV41:%.*]] = sdiv i32 [[SUB40]], 1
 // IR-PCH-NEXT:    [[MUL42:%.*]] = mul nsw i32 [[DIV41]], 1
 // IR-PCH-NEXT:    [[ADD43:%.*]] = add nsw i32 0, [[MUL42]]
 // IR-PCH-NEXT:    store i32 [[ADD43]], ptr [[I11]], align 4
-// IR-PCH-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// IR-PCH-NEXT:    [[SUB44:%.*]] = sub nsw i32 [[TMP19]], 0
+// IR-PCH-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// IR-PCH-NEXT:    [[SUB44:%.*]] = sub nsw i32 [[TMP21]], 0
 // IR-PCH-NEXT:    [[DIV45:%.*]] = sdiv i32 [[SUB44]], 1
 // IR-PCH-NEXT:    [[MUL46:%.*]] = mul nsw i32 [[DIV45]], 1
 // IR-PCH-NEXT:    [[ADD47:%.*]] = add nsw i32 0, [[MUL46]]
 // IR-PCH-NEXT:    store i32 [[ADD47]], ptr [[J]], align 4
-// IR-PCH-NEXT:    [[TMP20:%.*]] = load i32, ptr [[Z]], align 4
-// IR-PCH-NEXT:    [[TMP21:%.*]] = load i32, ptr [[Z13]], align 4
-// IR-PCH-NEXT:    [[ADD48:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// IR-PCH-NEXT:    [[TMP22:%.*]] = load i32, ptr [[Z]], align 4
+// IR-PCH-NEXT:    [[TMP23:%.*]] = load i32, ptr [[Z13]], align 4
+// IR-PCH-NEXT:    [[ADD48:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
 // IR-PCH-NEXT:    store i32 [[ADD48]], ptr [[Z]], align 4
 // IR-PCH-NEXT:    br label [[SIMD_IF_END]]
 // IR-PCH:       simd.if.end:
diff --git a/clang/test/OpenMP/interchange_ast_print.cpp b/clang/test/OpenMP/interchange_ast_print.cpp
new file mode 100644
index 0000000000000..f8bf075cd300f
--- /dev/null
+++ b/clang/test/OpenMP/interchange_ast_print.cpp
@@ -0,0 +1,135 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -ast-dump  %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -include-pch %t -ast-print    %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code.
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo1
+void foo1() {
+  // PRINT: #pragma omp interchange
+  // DUMP:  OMPInterchangeDirective
+  #pragma omp interchange
+  // PRINT: for (int i = 7; i < 17; i += 3)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 17; i += 3)
+    // PRINT: for (int j = 7; j < 17; j += 3)
+    // DUMP:  ForStmt
+    for (int j = 7; j < 17; j += 3)
+      // PRINT: body(i, j);
+      // DUMP:  CallExpr
+      body(i, j);
+}
+
+
+
+
+// PRINT-LABEL: void foo3(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo3
+void foo3() {
+  // PRINT: #pragma omp for collapse(3)
+  // DUMP:      OMPForDirective
+  // DUMP-NEXT:   OMPCollapseClause
+  // DUMP-NEXT:     ConstantExpr
+  // DUMP-NEXT:       value: Int 3
+  // DUMP-NEXT:     IntegerLiteral {{.*}} 3
+  // DUMP-NEXT:     CapturedStmt
+  // DUMP-NEXT:       CapturedDecl
+  #pragma omp for collapse(3)
+  // PRINT: #pragma omp interchange
+  // DUMP:  OMPInterchangeDirective
+  #pragma omp interchange
+  // PRINT: for (int i = 7; i < 17; i += 1)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 17; i += 1)
+    // PRINT: for (int j = 7; j < 17; j += 1)
+    // DUMP:  ForStmt
+    for (int j = 7; j < 17; j += 1)
+      // PRINT: for (int k = 7; k < 17; k += 1)
+      // DUMP:  ForStmt
+      for (int k = 7; k < 17; k += 1)
+        // PRINT: body(i, j, k);
+        // DUMP:  CallExpr
+        body(i, j, k);
+}
+
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo6
+template<int Tile>
+void foo6() {
+    // PRINT:     #pragma omp interchange
+    // DUMP:      OMPInterchangeDirective
+    #pragma omp interchange
+      // PRINT-NEXT: for (int i = 0; i < 11; i += 2)
+      // DUMP-NEXT:  ForStmt
+      for (int i = 0; i < 11; i += 2)
+        // PRINT-NEXT: #pragma omp tile sizes(Tile)
+        // DUMP:       OMPTileDirective
+        #pragma omp tile sizes(Tile)
+        // PRINT-NEXT: for (int j = 0; j < 13; j += 2)
+        // DUMP:       ForStmt
+        for (int j = 0; j < 13; j += 2)
+          // PRINT-NEXT: body(i, j);
+          // DUMP:       CallExpr
+          body(i, j);
+}
+
+// Also test instantiating the template.
+void tfoo6() {
+  foo6<32>();
+}
+
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL: FunctionDecl {{.*}} foo7
+void foo7() {
+  double arr[128];
+  // PRINT: #pragma omp interchange
+  // DUMP:  OMPInterchangeDirective
+  #pragma omp interchange
+  // PRINT-NEXT: for (double c = 42; auto &&v : arr)
+  // DUMP-NEXT:  CXXForRangeStmt
+  for (double c = 42; auto &&v : arr)
+    // PRINT-NEXT: for (int i = 0; i < 42; i += 2)
+    // DUMP:       ForStmt
+    for (int i = 0; i < 42; i += 2)
+      // PRINT-NEXT: body(c, v, i);
+      // DUMP:       CallExpr
+      body(c, v, i);
+}
+
+
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL: FunctionDecl {{.*}} foo8
+void foo8() {
+  double arr[128];
+  // PRINT: #pragma omp interchange
+  // DUMP:  OMPInterchangeDirective
+  #pragma omp interchange
+  // PRINT-NEXT: for (int i = 0; i < 42; i += 2)
+  // DUMP-NEXT:  ForStmt
+  for (int i = 0; i < 42; i += 2)
+    // PRINT-NEXT: for (double c = 42; auto &&v : arr)
+    // DUMP:       CXXForRangeStmt
+    for (double c = 42; auto &&v : arr)
+      // PRINT-NEXT: body(i, c, v);
+      // DUMP:       CallExpr
+      body(i, c, v);
+}
+
+#endif
+
diff --git a/clang/test/OpenMP/interchange_codegen.cpp b/clang/test/OpenMP/interchange_codegen.cpp
new file mode 100644
index 0000000000000..9c1782183cf98
--- /dev/null
+++ b/clang/test/OpenMP/interchange_codegen.cpp
@@ -0,0 +1,1990 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// FIXME: They should be exactly the same but currently differ in function order
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code.
+extern "C" void body(...) {}
+
+
+
+
+extern "C" void foo2(int start1, int start2, int end1, int end2, int step1, int step2) {
+#pragma omp interchange
+  for (int i = start1; i < end1; i += step1)
+    for (int j = start2; j < end2; j += step2)
+      body(i, j);
+}
+
+
+extern "C" void foo3() {
+#pragma omp for
+#pragma omp interchange
+  for (int i = 7; i < 17; i += 3)
+    for (int j = 7; j < 17; j += 3)
+      body(i, j);
+}
+
+
+extern "C" void foo4() {
+#pragma omp for collapse(2)
+  for (int k = 7; k < 17; k += 3)
+#pragma omp interchange
+    for (int i = 7; i < 17; i += 3)
+      for (int j = 7; j < 17; j += 3)
+        body(i, j);
+}
+
+
+extern "C" void foo6() {
+#pragma omp for collapse(4)
+  for (int i = 7; i < 17; i += 3)
+#pragma omp interchange
+    for (int j = 7; j < 17; j += 3)
+      for (int k = 7; k < 17; k += 3)
+        for (int l = 7; l < 17; l += 3)
+          body(i, j, k, l);
+}
+
+
+extern "C" void foo9() {
+  double arr[128];
+  #pragma omp interchange
+  for (double c = 42; auto && v : arr)
+    for (int i = 0; i < 42; i += 2)
+      body(c, v, i);
+}
+
+
+extern "C" void foo10() {
+  double A[128], B[16];
+  #pragma omp for collapse(4)
+  for (int i = 0; i < 128; ++i)
+    #pragma omp interchange
+    for (double c = 42; auto aa : A)
+      for (double d = 42; auto &bb : B)
+        for (int j = 0; j < 128; ++j)
+          body(i, c, aa, d, bb, j);
+}
+
+#endif /* HEADER */
+
+// CHECK1-LABEL: define {{[^@]+}}@body
+// CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo2
+// CHECK1-SAME: (i32 noundef [[START1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTNEW_STEP7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP8]], ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP11]], ptr [[DOTNEW_STEP7]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK1-NEXT:    [[SUB9:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT:    [[SUB10:%.*]] = sub i32 [[SUB9]], 1
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK1-NEXT:    [[ADD11:%.*]] = add i32 [[SUB10]], [[TMP14]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK1-NEXT:    [[DIV12:%.*]] = udiv i32 [[ADD11]], [[TMP15]]
+// CHECK1-NEXT:    [[SUB13:%.*]] = sub i32 [[DIV12]], 1
+// CHECK1-NEXT:    store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_8]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4
+// CHECK1-NEXT:    [[ADD14:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP16]], [[ADD14]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT:    [[ADD15:%.*]] = add i32 [[TMP18]], [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD15]], ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND16:%.*]]
+// CHECK1:       for.cond16:
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[ADD17:%.*]] = add i32 [[TMP22]], 1
+// CHECK1-NEXT:    [[CMP18:%.*]] = icmp ult i32 [[TMP21]], [[ADD17]]
+// CHECK1-NEXT:    br i1 [[CMP18]], label [[FOR_BODY19:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body19:
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[MUL20:%.*]] = mul i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT:    [[ADD21:%.*]] = add i32 [[TMP23]], [[MUL20]]
+// CHECK1-NEXT:    store i32 [[ADD21]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP26]], i32 noundef [[TMP27]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add i32 [[TMP28]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND16]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    br label [[FOR_INC22:%.*]]
+// CHECK1:       for.inc22:
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[INC23:%.*]] = add i32 [[TMP29]], 1
+// CHECK1-NEXT:    store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1:       for.end24:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo3
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP7]], 3
+// CHECK1-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK1-NEXT:    store i32 [[ADD3]], ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP8]], 4
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 7, [[MUL5]]
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP10]], i32 noundef [[TMP11]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo4
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 15, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 15
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 15, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[K]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV3:%.*]] = sdiv i32 [[TMP8]], 4
+// CHECK1-NEXT:    [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL4]]
+// CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK1-NEXT:    [[MUL7:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i32 7, [[MUL7]]
+// CHECK1-NEXT:    store i32 [[ADD8]], ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[CMP9:%.*]] = icmp slt i32 [[TMP10]], 4
+// CHECK1-NEXT:    br i1 [[CMP9]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[TMP11]], 3
+// CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 7, [[MUL10]]
+// CHECK1-NEXT:    store i32 [[ADD11]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP12]], i32 noundef [[TMP13]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK1-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo6
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV_K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV_J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[L:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK1-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK1-NEXT:    store i32 7, ptr [[K]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 255, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 255
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 255, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 64
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV5:%.*]] = sdiv i32 [[TMP8]], 64
+// CHECK1-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 64
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL6]]
+// CHECK1-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB]], 16
+// CHECK1-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK1-NEXT:    [[ADD9:%.*]] = add nsw i32 0, [[MUL8]]
+// CHECK1-NEXT:    store i32 [[ADD9]], ptr [[DOTPERMUTED_0_IV_K]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV10:%.*]] = sdiv i32 [[TMP10]], 64
+// CHECK1-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 64
+// CHECK1-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP9]], [[MUL11]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV13:%.*]] = sdiv i32 [[TMP12]], 64
+// CHECK1-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 64
+// CHECK1-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP11]], [[MUL14]]
+// CHECK1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[SUB15]], 16
+// CHECK1-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[DIV16]], 16
+// CHECK1-NEXT:    [[SUB18:%.*]] = sub nsw i32 [[SUB12]], [[MUL17]]
+// CHECK1-NEXT:    [[DIV19:%.*]] = sdiv i32 [[SUB18]], 4
+// CHECK1-NEXT:    [[MUL20:%.*]] = mul nsw i32 [[DIV19]], 1
+// CHECK1-NEXT:    [[ADD21:%.*]] = add nsw i32 0, [[MUL20]]
+// CHECK1-NEXT:    store i32 [[ADD21]], ptr [[DOTPERMUTED_1_IV_J]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV22:%.*]] = sdiv i32 [[TMP14]], 64
+// CHECK1-NEXT:    [[MUL23:%.*]] = mul nsw i32 [[DIV22]], 64
+// CHECK1-NEXT:    [[SUB24:%.*]] = sub nsw i32 [[TMP13]], [[MUL23]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV25:%.*]] = sdiv i32 [[TMP16]], 64
+// CHECK1-NEXT:    [[MUL26:%.*]] = mul nsw i32 [[DIV25]], 64
+// CHECK1-NEXT:    [[SUB27:%.*]] = sub nsw i32 [[TMP15]], [[MUL26]]
+// CHECK1-NEXT:    [[DIV28:%.*]] = sdiv i32 [[SUB27]], 16
+// CHECK1-NEXT:    [[MUL29:%.*]] = mul nsw i32 [[DIV28]], 16
+// CHECK1-NEXT:    [[SUB30:%.*]] = sub nsw i32 [[SUB24]], [[MUL29]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV31:%.*]] = sdiv i32 [[TMP18]], 64
+// CHECK1-NEXT:    [[MUL32:%.*]] = mul nsw i32 [[DIV31]], 64
+// CHECK1-NEXT:    [[SUB33:%.*]] = sub nsw i32 [[TMP17]], [[MUL32]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV34:%.*]] = sdiv i32 [[TMP20]], 64
+// CHECK1-NEXT:    [[MUL35:%.*]] = mul nsw i32 [[DIV34]], 64
+// CHECK1-NEXT:    [[SUB36:%.*]] = sub nsw i32 [[TMP19]], [[MUL35]]
+// CHECK1-NEXT:    [[DIV37:%.*]] = sdiv i32 [[SUB36]], 16
+// CHECK1-NEXT:    [[MUL38:%.*]] = mul nsw i32 [[DIV37]], 16
+// CHECK1-NEXT:    [[SUB39:%.*]] = sub nsw i32 [[SUB33]], [[MUL38]]
+// CHECK1-NEXT:    [[DIV40:%.*]] = sdiv i32 [[SUB39]], 4
+// CHECK1-NEXT:    [[MUL41:%.*]] = mul nsw i32 [[DIV40]], 4
+// CHECK1-NEXT:    [[SUB42:%.*]] = sub nsw i32 [[SUB30]], [[MUL41]]
+// CHECK1-NEXT:    [[MUL43:%.*]] = mul nsw i32 [[SUB42]], 3
+// CHECK1-NEXT:    [[ADD44:%.*]] = add nsw i32 7, [[MUL43]]
+// CHECK1-NEXT:    store i32 [[ADD44]], ptr [[L]], align 4
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_K]], align 4
+// CHECK1-NEXT:    [[MUL45:%.*]] = mul nsw i32 [[TMP21]], 3
+// CHECK1-NEXT:    [[ADD46:%.*]] = add nsw i32 7, [[MUL45]]
+// CHECK1-NEXT:    store i32 [[ADD46]], ptr [[K]], align 4
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_J]], align 4
+// CHECK1-NEXT:    [[MUL47:%.*]] = mul nsw i32 [[TMP22]], 3
+// CHECK1-NEXT:    [[ADD48:%.*]] = add nsw i32 7, [[MUL47]]
+// CHECK1-NEXT:    store i32 [[ADD48]], ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[L]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP23]], i32 noundef [[TMP24]], i32 noundef [[TMP25]], i32 noundef [[TMP26]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD49:%.*]] = add nsw i32 [[TMP27]], 1
+// CHECK1-NEXT:    store i32 [[ADD49]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo9
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[ARR:%.*]] = alloca [128 x double], align 16
+// CHECK1-NEXT:    [[C:%.*]] = alloca double, align 8
+// CHECK1-NEXT:    [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store double 4.200000e+01, ptr [[C]], align 8
+// CHECK1-NEXT:    store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK1-NEXT:    store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT:    store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP6]], 21
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END15:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 2
+// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[I]], align 4
+// CHECK1-NEXT:    store i64 0, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    br label [[FOR_COND7:%.*]]
+// CHECK1:       for.cond7:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i64 [[TMP9]], 1
+// CHECK1-NEXT:    [[CMP9:%.*]] = icmp slt i64 [[TMP8]], [[ADD8]]
+// CHECK1-NEXT:    br i1 [[CMP9]], label [[FOR_BODY10:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body10:
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[MUL11:%.*]] = mul nsw i64 [[TMP11]], 1
+// CHECK1-NEXT:    [[ADD_PTR12:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL11]]
+// CHECK1-NEXT:    store ptr [[ADD_PTR12]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP12]], ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = load double, ptr [[C]], align 8
+// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = load double, ptr [[TMP14]], align 8
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(double noundef [[TMP13]], double noundef [[TMP15]], i32 noundef [[TMP16]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP17]], 1
+// CHECK1-NEXT:    store i64 [[INC]], ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    br label [[FOR_INC13:%.*]]
+// CHECK1:       for.inc13:
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK1-NEXT:    [[INC14:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK1-NEXT:    store i32 [[INC14]], ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1:       for.end15:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo10
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[A:%.*]] = alloca [128 x double], align 16
+// CHECK1-NEXT:    [[B:%.*]] = alloca [16 x double], align 16
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[_TMP2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[C:%.*]] = alloca double, align 8
+// CHECK1-NEXT:    [[__RANGE3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__END3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__BEGIN3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_7:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[D:%.*]] = alloca double, align 8
+// CHECK1-NEXT:    [[__RANGE4:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__END4:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__BEGIN4:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_12:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_14:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_15:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_24:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_26:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_28:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV___BEGIN4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I37:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPERMUTED_0_IV___BEGIN438:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTPERMUTED_1_IV___BEGIN339:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[J40:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[BB:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[AA:%.*]] = alloca double, align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK1-NEXT:    store double 4.200000e+01, ptr [[C]], align 8
+// CHECK1-NEXT:    store ptr [[A]], ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK1-NEXT:    store ptr [[ADD_PTR]], ptr [[__END3]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY4]], ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY5:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY5]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT:    store i64 [[SUB8]], ptr [[DOTCAPTURE_EXPR_7]], align 8
+// CHECK1-NEXT:    store double 4.200000e+01, ptr [[D]], align 8
+// CHECK1-NEXT:    store ptr [[B]], ptr [[__RANGE4]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY9:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP7]], i64 0, i64 0
+// CHECK1-NEXT:    [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY9]], i64 16
+// CHECK1-NEXT:    store ptr [[ADD_PTR10]], ptr [[__END4]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY11:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY11]], ptr [[__BEGIN4]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY13:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY13]], ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[__END4]], align 8
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[DOTCAPTURE_EXPR_14]], align 8
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_14]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK1-NEXT:    [[SUB_PTR_LHS_CAST16:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_RHS_CAST17:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_SUB18:%.*]] = sub i64 [[SUB_PTR_LHS_CAST16]], [[SUB_PTR_RHS_CAST17]]
+// CHECK1-NEXT:    [[SUB_PTR_DIV19:%.*]] = sdiv exact i64 [[SUB_PTR_SUB18]], 8
+// CHECK1-NEXT:    [[SUB20:%.*]] = sub nsw i64 [[SUB_PTR_DIV19]], 1
+// CHECK1-NEXT:    [[ADD21:%.*]] = add nsw i64 [[SUB20]], 1
+// CHECK1-NEXT:    [[DIV22:%.*]] = sdiv i64 [[ADD21]], 1
+// CHECK1-NEXT:    [[SUB23:%.*]] = sub nsw i64 [[DIV22]], 1
+// CHECK1-NEXT:    store i64 [[SUB23]], ptr [[DOTCAPTURE_EXPR_15]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_15]], align 8
+// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i64 [[TMP13]], 1
+// CHECK1-NEXT:    store i64 [[ADD25]], ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_7]], align 8
+// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT:    store i64 [[ADD27]], ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB29:%.*]] = sub nsw i64 [[TMP15]], 0
+// CHECK1-NEXT:    [[DIV30:%.*]] = sdiv i64 [[SUB29]], 1
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i64 128, [[DIV30]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB31:%.*]] = sub nsw i64 [[TMP16]], 0
+// CHECK1-NEXT:    [[DIV32:%.*]] = sdiv i64 [[SUB31]], 1
+// CHECK1-NEXT:    [[MUL33:%.*]] = mul nsw i64 [[MUL]], [[DIV32]]
+// CHECK1-NEXT:    [[MUL34:%.*]] = mul nsw i64 [[MUL33]], 128
+// CHECK1-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[MUL34]], 1
+// CHECK1-NEXT:    store i64 [[SUB35]], ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT:    store i64 0, ptr [[DOTPERMUTED_0_IV___BEGIN4]], align 8
+// CHECK1-NEXT:    store i64 0, ptr [[DOTPERMUTED_1_IV___BEGIN3]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i64 0, [[TMP17]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1:       land.lhs.true:
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[CMP36:%.*]] = icmp slt i64 0, [[TMP18]]
+// CHECK1-NEXT:    br i1 [[CMP36]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1:       omp.precond.then:
+// CHECK1-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK1-NEXT:    [[CMP41:%.*]] = icmp sgt i64 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT:    br i1 [[CMP41]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i64 [ [[TMP22]], [[COND_TRUE]] ], [ [[TMP23]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT:    store i64 [[TMP24]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[CMP42:%.*]] = icmp sle i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT:    br i1 [[CMP42]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB43:%.*]] = sub nsw i64 [[TMP28]], 0
+// CHECK1-NEXT:    [[DIV44:%.*]] = sdiv i64 [[SUB43]], 1
+// CHECK1-NEXT:    [[MUL45:%.*]] = mul nsw i64 1, [[DIV44]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB46:%.*]] = sub nsw i64 [[TMP29]], 0
+// CHECK1-NEXT:    [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1
+// CHECK1-NEXT:    [[MUL48:%.*]] = mul nsw i64 [[MUL45]], [[DIV47]]
+// CHECK1-NEXT:    [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 128
+// CHECK1-NEXT:    [[DIV50:%.*]] = sdiv i64 [[TMP27]], [[MUL49]]
+// CHECK1-NEXT:    [[MUL51:%.*]] = mul nsw i64 [[DIV50]], 1
+// CHECK1-NEXT:    [[ADD52:%.*]] = add nsw i64 0, [[MUL51]]
+// CHECK1-NEXT:    [[CONV:%.*]] = trunc i64 [[ADD52]] to i32
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[I37]], align 4
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[TMP32]], 0
+// CHECK1-NEXT:    [[DIV54:%.*]] = sdiv i64 [[SUB53]], 1
+// CHECK1-NEXT:    [[MUL55:%.*]] = mul nsw i64 1, [[DIV54]]
+// CHECK1-NEXT:    [[TMP33:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB56:%.*]] = sub nsw i64 [[TMP33]], 0
+// CHECK1-NEXT:    [[DIV57:%.*]] = sdiv i64 [[SUB56]], 1
+// CHECK1-NEXT:    [[MUL58:%.*]] = mul nsw i64 [[MUL55]], [[DIV57]]
+// CHECK1-NEXT:    [[MUL59:%.*]] = mul nsw i64 [[MUL58]], 128
+// CHECK1-NEXT:    [[DIV60:%.*]] = sdiv i64 [[TMP31]], [[MUL59]]
+// CHECK1-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB61:%.*]] = sub nsw i64 [[TMP34]], 0
+// CHECK1-NEXT:    [[DIV62:%.*]] = sdiv i64 [[SUB61]], 1
+// CHECK1-NEXT:    [[MUL63:%.*]] = mul nsw i64 1, [[DIV62]]
+// CHECK1-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB64:%.*]] = sub nsw i64 [[TMP35]], 0
+// CHECK1-NEXT:    [[DIV65:%.*]] = sdiv i64 [[SUB64]], 1
+// CHECK1-NEXT:    [[MUL66:%.*]] = mul nsw i64 [[MUL63]], [[DIV65]]
+// CHECK1-NEXT:    [[MUL67:%.*]] = mul nsw i64 [[MUL66]], 128
+// CHECK1-NEXT:    [[MUL68:%.*]] = mul nsw i64 [[DIV60]], [[MUL67]]
+// CHECK1-NEXT:    [[SUB69:%.*]] = sub nsw i64 [[TMP30]], [[MUL68]]
+// CHECK1-NEXT:    [[TMP36:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB70:%.*]] = sub nsw i64 [[TMP36]], 0
+// CHECK1-NEXT:    [[DIV71:%.*]] = sdiv i64 [[SUB70]], 1
+// CHECK1-NEXT:    [[MUL72:%.*]] = mul nsw i64 1, [[DIV71]]
+// CHECK1-NEXT:    [[MUL73:%.*]] = mul nsw i64 [[MUL72]], 128
+// CHECK1-NEXT:    [[DIV74:%.*]] = sdiv i64 [[SUB69]], [[MUL73]]
+// CHECK1-NEXT:    [[MUL75:%.*]] = mul nsw i64 [[DIV74]], 1
+// CHECK1-NEXT:    [[ADD76:%.*]] = add nsw i64 0, [[MUL75]]
+// CHECK1-NEXT:    store i64 [[ADD76]], ptr [[DOTPERMUTED_0_IV___BEGIN438]], align 8
+// CHECK1-NEXT:    [[TMP37:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP38:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB77:%.*]] = sub nsw i64 [[TMP39]], 0
+// CHECK1-NEXT:    [[DIV78:%.*]] = sdiv i64 [[SUB77]], 1
+// CHECK1-NEXT:    [[MUL79:%.*]] = mul nsw i64 1, [[DIV78]]
+// CHECK1-NEXT:    [[TMP40:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB80:%.*]] = sub nsw i64 [[TMP40]], 0
+// CHECK1-NEXT:    [[DIV81:%.*]] = sdiv i64 [[SUB80]], 1
+// CHECK1-NEXT:    [[MUL82:%.*]] = mul nsw i64 [[MUL79]], [[DIV81]]
+// CHECK1-NEXT:    [[MUL83:%.*]] = mul nsw i64 [[MUL82]], 128
+// CHECK1-NEXT:    [[DIV84:%.*]] = sdiv i64 [[TMP38]], [[MUL83]]
+// CHECK1-NEXT:    [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB85:%.*]] = sub nsw i64 [[TMP41]], 0
+// CHECK1-NEXT:    [[DIV86:%.*]] = sdiv i64 [[SUB85]], 1
+// CHECK1-NEXT:    [[MUL87:%.*]] = mul nsw i64 1, [[DIV86]]
+// CHECK1-NEXT:    [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB88:%.*]] = sub nsw i64 [[TMP42]], 0
+// CHECK1-NEXT:    [[DIV89:%.*]] = sdiv i64 [[SUB88]], 1
+// CHECK1-NEXT:    [[MUL90:%.*]] = mul nsw i64 [[MUL87]], [[DIV89]]
+// CHECK1-NEXT:    [[MUL91:%.*]] = mul nsw i64 [[MUL90]], 128
+// CHECK1-NEXT:    [[MUL92:%.*]] = mul nsw i64 [[DIV84]], [[MUL91]]
+// CHECK1-NEXT:    [[SUB93:%.*]] = sub nsw i64 [[TMP37]], [[MUL92]]
+// CHECK1-NEXT:    [[TMP43:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP44:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB94:%.*]] = sub nsw i64 [[TMP45]], 0
+// CHECK1-NEXT:    [[DIV95:%.*]] = sdiv i64 [[SUB94]], 1
+// CHECK1-NEXT:    [[MUL96:%.*]] = mul nsw i64 1, [[DIV95]]
+// CHECK1-NEXT:    [[TMP46:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB97:%.*]] = sub nsw i64 [[TMP46]], 0
+// CHECK1-NEXT:    [[DIV98:%.*]] = sdiv i64 [[SUB97]], 1
+// CHECK1-NEXT:    [[MUL99:%.*]] = mul nsw i64 [[MUL96]], [[DIV98]]
+// CHECK1-NEXT:    [[MUL100:%.*]] = mul nsw i64 [[MUL99]], 128
+// CHECK1-NEXT:    [[DIV101:%.*]] = sdiv i64 [[TMP44]], [[MUL100]]
+// CHECK1-NEXT:    [[TMP47:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB102:%.*]] = sub nsw i64 [[TMP47]], 0
+// CHECK1-NEXT:    [[DIV103:%.*]] = sdiv i64 [[SUB102]], 1
+// CHECK1-NEXT:    [[MUL104:%.*]] = mul nsw i64 1, [[DIV103]]
+// CHECK1-NEXT:    [[TMP48:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB105:%.*]] = sub nsw i64 [[TMP48]], 0
+// CHECK1-NEXT:    [[DIV106:%.*]] = sdiv i64 [[SUB105]], 1
+// CHECK1-NEXT:    [[MUL107:%.*]] = mul nsw i64 [[MUL104]], [[DIV106]]
+// CHECK1-NEXT:    [[MUL108:%.*]] = mul nsw i64 [[MUL107]], 128
+// CHECK1-NEXT:    [[MUL109:%.*]] = mul nsw i64 [[DIV101]], [[MUL108]]
+// CHECK1-NEXT:    [[SUB110:%.*]] = sub nsw i64 [[TMP43]], [[MUL109]]
+// CHECK1-NEXT:    [[TMP49:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB111:%.*]] = sub nsw i64 [[TMP49]], 0
+// CHECK1-NEXT:    [[DIV112:%.*]] = sdiv i64 [[SUB111]], 1
+// CHECK1-NEXT:    [[MUL113:%.*]] = mul nsw i64 1, [[DIV112]]
+// CHECK1-NEXT:    [[MUL114:%.*]] = mul nsw i64 [[MUL113]], 128
+// CHECK1-NEXT:    [[DIV115:%.*]] = sdiv i64 [[SUB110]], [[MUL114]]
+// CHECK1-NEXT:    [[TMP50:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB116:%.*]] = sub nsw i64 [[TMP50]], 0
+// CHECK1-NEXT:    [[DIV117:%.*]] = sdiv i64 [[SUB116]], 1
+// CHECK1-NEXT:    [[MUL118:%.*]] = mul nsw i64 1, [[DIV117]]
+// CHECK1-NEXT:    [[MUL119:%.*]] = mul nsw i64 [[MUL118]], 128
+// CHECK1-NEXT:    [[MUL120:%.*]] = mul nsw i64 [[DIV115]], [[MUL119]]
+// CHECK1-NEXT:    [[SUB121:%.*]] = sub nsw i64 [[SUB93]], [[MUL120]]
+// CHECK1-NEXT:    [[DIV122:%.*]] = sdiv i64 [[SUB121]], 128
+// CHECK1-NEXT:    [[MUL123:%.*]] = mul nsw i64 [[DIV122]], 1
+// CHECK1-NEXT:    [[ADD124:%.*]] = add nsw i64 0, [[MUL123]]
+// CHECK1-NEXT:    store i64 [[ADD124]], ptr [[DOTPERMUTED_1_IV___BEGIN339]], align 8
+// CHECK1-NEXT:    [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP52:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP53:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB125:%.*]] = sub nsw i64 [[TMP53]], 0
+// CHECK1-NEXT:    [[DIV126:%.*]] = sdiv i64 [[SUB125]], 1
+// CHECK1-NEXT:    [[MUL127:%.*]] = mul nsw i64 1, [[DIV126]]
+// CHECK1-NEXT:    [[TMP54:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB128:%.*]] = sub nsw i64 [[TMP54]], 0
+// CHECK1-NEXT:    [[DIV129:%.*]] = sdiv i64 [[SUB128]], 1
+// CHECK1-NEXT:    [[MUL130:%.*]] = mul nsw i64 [[MUL127]], [[DIV129]]
+// CHECK1-NEXT:    [[MUL131:%.*]] = mul nsw i64 [[MUL130]], 128
+// CHECK1-NEXT:    [[DIV132:%.*]] = sdiv i64 [[TMP52]], [[MUL131]]
+// CHECK1-NEXT:    [[TMP55:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB133:%.*]] = sub nsw i64 [[TMP55]], 0
+// CHECK1-NEXT:    [[DIV134:%.*]] = sdiv i64 [[SUB133]], 1
+// CHECK1-NEXT:    [[MUL135:%.*]] = mul nsw i64 1, [[DIV134]]
+// CHECK1-NEXT:    [[TMP56:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB136:%.*]] = sub nsw i64 [[TMP56]], 0
+// CHECK1-NEXT:    [[DIV137:%.*]] = sdiv i64 [[SUB136]], 1
+// CHECK1-NEXT:    [[MUL138:%.*]] = mul nsw i64 [[MUL135]], [[DIV137]]
+// CHECK1-NEXT:    [[MUL139:%.*]] = mul nsw i64 [[MUL138]], 128
+// CHECK1-NEXT:    [[MUL140:%.*]] = mul nsw i64 [[DIV132]], [[MUL139]]
+// CHECK1-NEXT:    [[SUB141:%.*]] = sub nsw i64 [[TMP51]], [[MUL140]]
+// CHECK1-NEXT:    [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP58:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP59:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB142:%.*]] = sub nsw i64 [[TMP59]], 0
+// CHECK1-NEXT:    [[DIV143:%.*]] = sdiv i64 [[SUB142]], 1
+// CHECK1-NEXT:    [[MUL144:%.*]] = mul nsw i64 1, [[DIV143]]
+// CHECK1-NEXT:    [[TMP60:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB145:%.*]] = sub nsw i64 [[TMP60]], 0
+// CHECK1-NEXT:    [[DIV146:%.*]] = sdiv i64 [[SUB145]], 1
+// CHECK1-NEXT:    [[MUL147:%.*]] = mul nsw i64 [[MUL144]], [[DIV146]]
+// CHECK1-NEXT:    [[MUL148:%.*]] = mul nsw i64 [[MUL147]], 128
+// CHECK1-NEXT:    [[DIV149:%.*]] = sdiv i64 [[TMP58]], [[MUL148]]
+// CHECK1-NEXT:    [[TMP61:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB150:%.*]] = sub nsw i64 [[TMP61]], 0
+// CHECK1-NEXT:    [[DIV151:%.*]] = sdiv i64 [[SUB150]], 1
+// CHECK1-NEXT:    [[MUL152:%.*]] = mul nsw i64 1, [[DIV151]]
+// CHECK1-NEXT:    [[TMP62:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB153:%.*]] = sub nsw i64 [[TMP62]], 0
+// CHECK1-NEXT:    [[DIV154:%.*]] = sdiv i64 [[SUB153]], 1
+// CHECK1-NEXT:    [[MUL155:%.*]] = mul nsw i64 [[MUL152]], [[DIV154]]
+// CHECK1-NEXT:    [[MUL156:%.*]] = mul nsw i64 [[MUL155]], 128
+// CHECK1-NEXT:    [[MUL157:%.*]] = mul nsw i64 [[DIV149]], [[MUL156]]
+// CHECK1-NEXT:    [[SUB158:%.*]] = sub nsw i64 [[TMP57]], [[MUL157]]
+// CHECK1-NEXT:    [[TMP63:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB159:%.*]] = sub nsw i64 [[TMP63]], 0
+// CHECK1-NEXT:    [[DIV160:%.*]] = sdiv i64 [[SUB159]], 1
+// CHECK1-NEXT:    [[MUL161:%.*]] = mul nsw i64 1, [[DIV160]]
+// CHECK1-NEXT:    [[MUL162:%.*]] = mul nsw i64 [[MUL161]], 128
+// CHECK1-NEXT:    [[DIV163:%.*]] = sdiv i64 [[SUB158]], [[MUL162]]
+// CHECK1-NEXT:    [[TMP64:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB164:%.*]] = sub nsw i64 [[TMP64]], 0
+// CHECK1-NEXT:    [[DIV165:%.*]] = sdiv i64 [[SUB164]], 1
+// CHECK1-NEXT:    [[MUL166:%.*]] = mul nsw i64 1, [[DIV165]]
+// CHECK1-NEXT:    [[MUL167:%.*]] = mul nsw i64 [[MUL166]], 128
+// CHECK1-NEXT:    [[MUL168:%.*]] = mul nsw i64 [[DIV163]], [[MUL167]]
+// CHECK1-NEXT:    [[SUB169:%.*]] = sub nsw i64 [[SUB141]], [[MUL168]]
+// CHECK1-NEXT:    [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP66:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP67:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB170:%.*]] = sub nsw i64 [[TMP67]], 0
+// CHECK1-NEXT:    [[DIV171:%.*]] = sdiv i64 [[SUB170]], 1
+// CHECK1-NEXT:    [[MUL172:%.*]] = mul nsw i64 1, [[DIV171]]
+// CHECK1-NEXT:    [[TMP68:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB173:%.*]] = sub nsw i64 [[TMP68]], 0
+// CHECK1-NEXT:    [[DIV174:%.*]] = sdiv i64 [[SUB173]], 1
+// CHECK1-NEXT:    [[MUL175:%.*]] = mul nsw i64 [[MUL172]], [[DIV174]]
+// CHECK1-NEXT:    [[MUL176:%.*]] = mul nsw i64 [[MUL175]], 128
+// CHECK1-NEXT:    [[DIV177:%.*]] = sdiv i64 [[TMP66]], [[MUL176]]
+// CHECK1-NEXT:    [[TMP69:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB178:%.*]] = sub nsw i64 [[TMP69]], 0
+// CHECK1-NEXT:    [[DIV179:%.*]] = sdiv i64 [[SUB178]], 1
+// CHECK1-NEXT:    [[MUL180:%.*]] = mul nsw i64 1, [[DIV179]]
+// CHECK1-NEXT:    [[TMP70:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB181:%.*]] = sub nsw i64 [[TMP70]], 0
+// CHECK1-NEXT:    [[DIV182:%.*]] = sdiv i64 [[SUB181]], 1
+// CHECK1-NEXT:    [[MUL183:%.*]] = mul nsw i64 [[MUL180]], [[DIV182]]
+// CHECK1-NEXT:    [[MUL184:%.*]] = mul nsw i64 [[MUL183]], 128
+// CHECK1-NEXT:    [[MUL185:%.*]] = mul nsw i64 [[DIV177]], [[MUL184]]
+// CHECK1-NEXT:    [[SUB186:%.*]] = sub nsw i64 [[TMP65]], [[MUL185]]
+// CHECK1-NEXT:    [[TMP71:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP72:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP73:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB187:%.*]] = sub nsw i64 [[TMP73]], 0
+// CHECK1-NEXT:    [[DIV188:%.*]] = sdiv i64 [[SUB187]], 1
+// CHECK1-NEXT:    [[MUL189:%.*]] = mul nsw i64 1, [[DIV188]]
+// CHECK1-NEXT:    [[TMP74:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB190:%.*]] = sub nsw i64 [[TMP74]], 0
+// CHECK1-NEXT:    [[DIV191:%.*]] = sdiv i64 [[SUB190]], 1
+// CHECK1-NEXT:    [[MUL192:%.*]] = mul nsw i64 [[MUL189]], [[DIV191]]
+// CHECK1-NEXT:    [[MUL193:%.*]] = mul nsw i64 [[MUL192]], 128
+// CHECK1-NEXT:    [[DIV194:%.*]] = sdiv i64 [[TMP72]], [[MUL193]]
+// CHECK1-NEXT:    [[TMP75:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK1-NEXT:    [[SUB195:%.*]] = sub nsw i64 [[TMP75]], 0
+// CHECK1-NEXT:    [[DIV196:%.*]] = sdiv i64 [[SUB195]], 1
+// CHECK1-NEXT:    [[MUL197:%.*]] = mul nsw i64 1, [[DIV196]]
+// CHECK1-NEXT:    [[TMP76:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB198:%.*]] = sub nsw i64 [[TMP76]], 0
+// CHECK1-NEXT:    [[DIV199:%.*]] = sdiv i64 [[SUB198]], 1
+// CHECK1-NEXT:    [[MUL200:%.*]] = mul nsw i64 [[MUL197]], [[DIV199]]
+// CHECK1-NEXT:    [[MUL201:%.*]] = mul nsw i64 [[MUL200]], 128
+// CHECK1-NEXT:    [[MUL202:%.*]] = mul nsw i64 [[DIV194]], [[MUL201]]
+// CHECK1-NEXT:    [[SUB203:%.*]] = sub nsw i64 [[TMP71]], [[MUL202]]
+// CHECK1-NEXT:    [[TMP77:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB204:%.*]] = sub nsw i64 [[TMP77]], 0
+// CHECK1-NEXT:    [[DIV205:%.*]] = sdiv i64 [[SUB204]], 1
+// CHECK1-NEXT:    [[MUL206:%.*]] = mul nsw i64 1, [[DIV205]]
+// CHECK1-NEXT:    [[MUL207:%.*]] = mul nsw i64 [[MUL206]], 128
+// CHECK1-NEXT:    [[DIV208:%.*]] = sdiv i64 [[SUB203]], [[MUL207]]
+// CHECK1-NEXT:    [[TMP78:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK1-NEXT:    [[SUB209:%.*]] = sub nsw i64 [[TMP78]], 0
+// CHECK1-NEXT:    [[DIV210:%.*]] = sdiv i64 [[SUB209]], 1
+// CHECK1-NEXT:    [[MUL211:%.*]] = mul nsw i64 1, [[DIV210]]
+// CHECK1-NEXT:    [[MUL212:%.*]] = mul nsw i64 [[MUL211]], 128
+// CHECK1-NEXT:    [[MUL213:%.*]] = mul nsw i64 [[DIV208]], [[MUL212]]
+// CHECK1-NEXT:    [[SUB214:%.*]] = sub nsw i64 [[SUB186]], [[MUL213]]
+// CHECK1-NEXT:    [[DIV215:%.*]] = sdiv i64 [[SUB214]], 128
+// CHECK1-NEXT:    [[MUL216:%.*]] = mul nsw i64 [[DIV215]], 128
+// CHECK1-NEXT:    [[SUB217:%.*]] = sub nsw i64 [[SUB169]], [[MUL216]]
+// CHECK1-NEXT:    [[MUL218:%.*]] = mul nsw i64 [[SUB217]], 1
+// CHECK1-NEXT:    [[ADD219:%.*]] = add nsw i64 0, [[MUL218]]
+// CHECK1-NEXT:    [[CONV220:%.*]] = trunc i64 [[ADD219]] to i32
+// CHECK1-NEXT:    store i32 [[CONV220]], ptr [[J40]], align 4
+// CHECK1-NEXT:    [[TMP79:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK1-NEXT:    [[TMP80:%.*]] = load i64, ptr [[DOTPERMUTED_0_IV___BEGIN438]], align 8
+// CHECK1-NEXT:    [[MUL221:%.*]] = mul nsw i64 [[TMP80]], 1
+// CHECK1-NEXT:    [[ADD_PTR222:%.*]] = getelementptr inbounds double, ptr [[TMP79]], i64 [[MUL221]]
+// CHECK1-NEXT:    store ptr [[ADD_PTR222]], ptr [[__BEGIN4]], align 8
+// CHECK1-NEXT:    [[TMP81:%.*]] = load ptr, ptr [[__BEGIN4]], align 8
+// CHECK1-NEXT:    store ptr [[TMP81]], ptr [[BB]], align 8
+// CHECK1-NEXT:    [[TMP82:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP83:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN339]], align 8
+// CHECK1-NEXT:    [[MUL223:%.*]] = mul nsw i64 [[TMP83]], 1
+// CHECK1-NEXT:    [[ADD_PTR224:%.*]] = getelementptr inbounds double, ptr [[TMP82]], i64 [[MUL223]]
+// CHECK1-NEXT:    store ptr [[ADD_PTR224]], ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP84:%.*]] = load ptr, ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP85:%.*]] = load double, ptr [[TMP84]], align 8
+// CHECK1-NEXT:    store double [[TMP85]], ptr [[AA]], align 8
+// CHECK1-NEXT:    [[TMP86:%.*]] = load i32, ptr [[I37]], align 4
+// CHECK1-NEXT:    [[TMP87:%.*]] = load double, ptr [[C]], align 8
+// CHECK1-NEXT:    [[TMP88:%.*]] = load double, ptr [[AA]], align 8
+// CHECK1-NEXT:    [[TMP89:%.*]] = load double, ptr [[D]], align 8
+// CHECK1-NEXT:    [[TMP90:%.*]] = load ptr, ptr [[BB]], align 8
+// CHECK1-NEXT:    [[TMP91:%.*]] = load double, ptr [[TMP90]], align 8
+// CHECK1-NEXT:    [[TMP92:%.*]] = load i32, ptr [[J40]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP86]], double noundef [[TMP87]], double noundef [[TMP88]], double noundef [[TMP89]], double noundef [[TMP91]], i32 noundef [[TMP92]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP93:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[ADD225:%.*]] = add nsw i64 [[TMP93]], 1
+// CHECK1-NEXT:    store i64 [[ADD225]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1:       omp.precond.end:
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@body
+// CHECK2-SAME: (...) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo10
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[A:%.*]] = alloca [128 x double], align 16
+// CHECK2-NEXT:    [[B:%.*]] = alloca [16 x double], align 16
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP1:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[_TMP2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[C:%.*]] = alloca double, align 8
+// CHECK2-NEXT:    [[__RANGE3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__END3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__BEGIN3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_7:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[D:%.*]] = alloca double, align 8
+// CHECK2-NEXT:    [[__RANGE4:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__END4:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__BEGIN4:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_12:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_14:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_15:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_24:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_26:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_28:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV___BEGIN4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I37:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV___BEGIN438:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV___BEGIN339:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[J40:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[BB:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[AA:%.*]] = alloca double, align 8
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// CHECK2-NEXT:    store double 4.200000e+01, ptr [[C]], align 8
+// CHECK2-NEXT:    store ptr [[A]], ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK2-NEXT:    store ptr [[ADD_PTR]], ptr [[__END3]], align 8
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY4]], ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY5:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY5]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8
+// CHECK2-NEXT:    store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT:    store i64 [[SUB8]], ptr [[DOTCAPTURE_EXPR_7]], align 8
+// CHECK2-NEXT:    store double 4.200000e+01, ptr [[D]], align 8
+// CHECK2-NEXT:    store ptr [[B]], ptr [[__RANGE4]], align 8
+// CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY9:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP7]], i64 0, i64 0
+// CHECK2-NEXT:    [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY9]], i64 16
+// CHECK2-NEXT:    store ptr [[ADD_PTR10]], ptr [[__END4]], align 8
+// CHECK2-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY11:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY11]], ptr [[__BEGIN4]], align 8
+// CHECK2-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__RANGE4]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY13:%.*]] = getelementptr inbounds [16 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY13]], ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK2-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[__END4]], align 8
+// CHECK2-NEXT:    store ptr [[TMP10]], ptr [[DOTCAPTURE_EXPR_14]], align 8
+// CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_14]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK2-NEXT:    [[SUB_PTR_LHS_CAST16:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_RHS_CAST17:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_SUB18:%.*]] = sub i64 [[SUB_PTR_LHS_CAST16]], [[SUB_PTR_RHS_CAST17]]
+// CHECK2-NEXT:    [[SUB_PTR_DIV19:%.*]] = sdiv exact i64 [[SUB_PTR_SUB18]], 8
+// CHECK2-NEXT:    [[SUB20:%.*]] = sub nsw i64 [[SUB_PTR_DIV19]], 1
+// CHECK2-NEXT:    [[ADD21:%.*]] = add nsw i64 [[SUB20]], 1
+// CHECK2-NEXT:    [[DIV22:%.*]] = sdiv i64 [[ADD21]], 1
+// CHECK2-NEXT:    [[SUB23:%.*]] = sub nsw i64 [[DIV22]], 1
+// CHECK2-NEXT:    store i64 [[SUB23]], ptr [[DOTCAPTURE_EXPR_15]], align 8
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_15]], align 8
+// CHECK2-NEXT:    [[ADD25:%.*]] = add nsw i64 [[TMP13]], 1
+// CHECK2-NEXT:    store i64 [[ADD25]], ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_7]], align 8
+// CHECK2-NEXT:    [[ADD27:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT:    store i64 [[ADD27]], ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB29:%.*]] = sub nsw i64 [[TMP15]], 0
+// CHECK2-NEXT:    [[DIV30:%.*]] = sdiv i64 [[SUB29]], 1
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i64 128, [[DIV30]]
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB31:%.*]] = sub nsw i64 [[TMP16]], 0
+// CHECK2-NEXT:    [[DIV32:%.*]] = sdiv i64 [[SUB31]], 1
+// CHECK2-NEXT:    [[MUL33:%.*]] = mul nsw i64 [[MUL]], [[DIV32]]
+// CHECK2-NEXT:    [[MUL34:%.*]] = mul nsw i64 [[MUL33]], 128
+// CHECK2-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[MUL34]], 1
+// CHECK2-NEXT:    store i64 [[SUB35]], ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT:    store i64 0, ptr [[DOTPERMUTED_0_IV___BEGIN4]], align 8
+// CHECK2-NEXT:    store i64 0, ptr [[DOTPERMUTED_1_IV___BEGIN3]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i64 0, [[TMP17]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2:       land.lhs.true:
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[CMP36:%.*]] = icmp slt i64 0, [[TMP18]]
+// CHECK2-NEXT:    br i1 [[CMP36]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK2:       omp.precond.then:
+// CHECK2-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK2-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK2-NEXT:    store i64 [[TMP19]], ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_8(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK2-NEXT:    [[TMP20:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK2-NEXT:    [[CMP41:%.*]] = icmp sgt i64 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT:    br i1 [[CMP41]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_28]], align 8
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP23:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i64 [ [[TMP22]], [[COND_TRUE]] ], [ [[TMP23]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK2-NEXT:    store i64 [[TMP24]], ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[CMP42:%.*]] = icmp sle i64 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT:    br i1 [[CMP42]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP27:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP28:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB43:%.*]] = sub nsw i64 [[TMP28]], 0
+// CHECK2-NEXT:    [[DIV44:%.*]] = sdiv i64 [[SUB43]], 1
+// CHECK2-NEXT:    [[MUL45:%.*]] = mul nsw i64 1, [[DIV44]]
+// CHECK2-NEXT:    [[TMP29:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB46:%.*]] = sub nsw i64 [[TMP29]], 0
+// CHECK2-NEXT:    [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1
+// CHECK2-NEXT:    [[MUL48:%.*]] = mul nsw i64 [[MUL45]], [[DIV47]]
+// CHECK2-NEXT:    [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 128
+// CHECK2-NEXT:    [[DIV50:%.*]] = sdiv i64 [[TMP27]], [[MUL49]]
+// CHECK2-NEXT:    [[MUL51:%.*]] = mul nsw i64 [[DIV50]], 1
+// CHECK2-NEXT:    [[ADD52:%.*]] = add nsw i64 0, [[MUL51]]
+// CHECK2-NEXT:    [[CONV:%.*]] = trunc i64 [[ADD52]] to i32
+// CHECK2-NEXT:    store i32 [[CONV]], ptr [[I37]], align 4
+// CHECK2-NEXT:    [[TMP30:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[TMP32]], 0
+// CHECK2-NEXT:    [[DIV54:%.*]] = sdiv i64 [[SUB53]], 1
+// CHECK2-NEXT:    [[MUL55:%.*]] = mul nsw i64 1, [[DIV54]]
+// CHECK2-NEXT:    [[TMP33:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB56:%.*]] = sub nsw i64 [[TMP33]], 0
+// CHECK2-NEXT:    [[DIV57:%.*]] = sdiv i64 [[SUB56]], 1
+// CHECK2-NEXT:    [[MUL58:%.*]] = mul nsw i64 [[MUL55]], [[DIV57]]
+// CHECK2-NEXT:    [[MUL59:%.*]] = mul nsw i64 [[MUL58]], 128
+// CHECK2-NEXT:    [[DIV60:%.*]] = sdiv i64 [[TMP31]], [[MUL59]]
+// CHECK2-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB61:%.*]] = sub nsw i64 [[TMP34]], 0
+// CHECK2-NEXT:    [[DIV62:%.*]] = sdiv i64 [[SUB61]], 1
+// CHECK2-NEXT:    [[MUL63:%.*]] = mul nsw i64 1, [[DIV62]]
+// CHECK2-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB64:%.*]] = sub nsw i64 [[TMP35]], 0
+// CHECK2-NEXT:    [[DIV65:%.*]] = sdiv i64 [[SUB64]], 1
+// CHECK2-NEXT:    [[MUL66:%.*]] = mul nsw i64 [[MUL63]], [[DIV65]]
+// CHECK2-NEXT:    [[MUL67:%.*]] = mul nsw i64 [[MUL66]], 128
+// CHECK2-NEXT:    [[MUL68:%.*]] = mul nsw i64 [[DIV60]], [[MUL67]]
+// CHECK2-NEXT:    [[SUB69:%.*]] = sub nsw i64 [[TMP30]], [[MUL68]]
+// CHECK2-NEXT:    [[TMP36:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB70:%.*]] = sub nsw i64 [[TMP36]], 0
+// CHECK2-NEXT:    [[DIV71:%.*]] = sdiv i64 [[SUB70]], 1
+// CHECK2-NEXT:    [[MUL72:%.*]] = mul nsw i64 1, [[DIV71]]
+// CHECK2-NEXT:    [[MUL73:%.*]] = mul nsw i64 [[MUL72]], 128
+// CHECK2-NEXT:    [[DIV74:%.*]] = sdiv i64 [[SUB69]], [[MUL73]]
+// CHECK2-NEXT:    [[MUL75:%.*]] = mul nsw i64 [[DIV74]], 1
+// CHECK2-NEXT:    [[ADD76:%.*]] = add nsw i64 0, [[MUL75]]
+// CHECK2-NEXT:    store i64 [[ADD76]], ptr [[DOTPERMUTED_0_IV___BEGIN438]], align 8
+// CHECK2-NEXT:    [[TMP37:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP38:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB77:%.*]] = sub nsw i64 [[TMP39]], 0
+// CHECK2-NEXT:    [[DIV78:%.*]] = sdiv i64 [[SUB77]], 1
+// CHECK2-NEXT:    [[MUL79:%.*]] = mul nsw i64 1, [[DIV78]]
+// CHECK2-NEXT:    [[TMP40:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB80:%.*]] = sub nsw i64 [[TMP40]], 0
+// CHECK2-NEXT:    [[DIV81:%.*]] = sdiv i64 [[SUB80]], 1
+// CHECK2-NEXT:    [[MUL82:%.*]] = mul nsw i64 [[MUL79]], [[DIV81]]
+// CHECK2-NEXT:    [[MUL83:%.*]] = mul nsw i64 [[MUL82]], 128
+// CHECK2-NEXT:    [[DIV84:%.*]] = sdiv i64 [[TMP38]], [[MUL83]]
+// CHECK2-NEXT:    [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB85:%.*]] = sub nsw i64 [[TMP41]], 0
+// CHECK2-NEXT:    [[DIV86:%.*]] = sdiv i64 [[SUB85]], 1
+// CHECK2-NEXT:    [[MUL87:%.*]] = mul nsw i64 1, [[DIV86]]
+// CHECK2-NEXT:    [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB88:%.*]] = sub nsw i64 [[TMP42]], 0
+// CHECK2-NEXT:    [[DIV89:%.*]] = sdiv i64 [[SUB88]], 1
+// CHECK2-NEXT:    [[MUL90:%.*]] = mul nsw i64 [[MUL87]], [[DIV89]]
+// CHECK2-NEXT:    [[MUL91:%.*]] = mul nsw i64 [[MUL90]], 128
+// CHECK2-NEXT:    [[MUL92:%.*]] = mul nsw i64 [[DIV84]], [[MUL91]]
+// CHECK2-NEXT:    [[SUB93:%.*]] = sub nsw i64 [[TMP37]], [[MUL92]]
+// CHECK2-NEXT:    [[TMP43:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP44:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB94:%.*]] = sub nsw i64 [[TMP45]], 0
+// CHECK2-NEXT:    [[DIV95:%.*]] = sdiv i64 [[SUB94]], 1
+// CHECK2-NEXT:    [[MUL96:%.*]] = mul nsw i64 1, [[DIV95]]
+// CHECK2-NEXT:    [[TMP46:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB97:%.*]] = sub nsw i64 [[TMP46]], 0
+// CHECK2-NEXT:    [[DIV98:%.*]] = sdiv i64 [[SUB97]], 1
+// CHECK2-NEXT:    [[MUL99:%.*]] = mul nsw i64 [[MUL96]], [[DIV98]]
+// CHECK2-NEXT:    [[MUL100:%.*]] = mul nsw i64 [[MUL99]], 128
+// CHECK2-NEXT:    [[DIV101:%.*]] = sdiv i64 [[TMP44]], [[MUL100]]
+// CHECK2-NEXT:    [[TMP47:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB102:%.*]] = sub nsw i64 [[TMP47]], 0
+// CHECK2-NEXT:    [[DIV103:%.*]] = sdiv i64 [[SUB102]], 1
+// CHECK2-NEXT:    [[MUL104:%.*]] = mul nsw i64 1, [[DIV103]]
+// CHECK2-NEXT:    [[TMP48:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB105:%.*]] = sub nsw i64 [[TMP48]], 0
+// CHECK2-NEXT:    [[DIV106:%.*]] = sdiv i64 [[SUB105]], 1
+// CHECK2-NEXT:    [[MUL107:%.*]] = mul nsw i64 [[MUL104]], [[DIV106]]
+// CHECK2-NEXT:    [[MUL108:%.*]] = mul nsw i64 [[MUL107]], 128
+// CHECK2-NEXT:    [[MUL109:%.*]] = mul nsw i64 [[DIV101]], [[MUL108]]
+// CHECK2-NEXT:    [[SUB110:%.*]] = sub nsw i64 [[TMP43]], [[MUL109]]
+// CHECK2-NEXT:    [[TMP49:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB111:%.*]] = sub nsw i64 [[TMP49]], 0
+// CHECK2-NEXT:    [[DIV112:%.*]] = sdiv i64 [[SUB111]], 1
+// CHECK2-NEXT:    [[MUL113:%.*]] = mul nsw i64 1, [[DIV112]]
+// CHECK2-NEXT:    [[MUL114:%.*]] = mul nsw i64 [[MUL113]], 128
+// CHECK2-NEXT:    [[DIV115:%.*]] = sdiv i64 [[SUB110]], [[MUL114]]
+// CHECK2-NEXT:    [[TMP50:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB116:%.*]] = sub nsw i64 [[TMP50]], 0
+// CHECK2-NEXT:    [[DIV117:%.*]] = sdiv i64 [[SUB116]], 1
+// CHECK2-NEXT:    [[MUL118:%.*]] = mul nsw i64 1, [[DIV117]]
+// CHECK2-NEXT:    [[MUL119:%.*]] = mul nsw i64 [[MUL118]], 128
+// CHECK2-NEXT:    [[MUL120:%.*]] = mul nsw i64 [[DIV115]], [[MUL119]]
+// CHECK2-NEXT:    [[SUB121:%.*]] = sub nsw i64 [[SUB93]], [[MUL120]]
+// CHECK2-NEXT:    [[DIV122:%.*]] = sdiv i64 [[SUB121]], 128
+// CHECK2-NEXT:    [[MUL123:%.*]] = mul nsw i64 [[DIV122]], 1
+// CHECK2-NEXT:    [[ADD124:%.*]] = add nsw i64 0, [[MUL123]]
+// CHECK2-NEXT:    store i64 [[ADD124]], ptr [[DOTPERMUTED_1_IV___BEGIN339]], align 8
+// CHECK2-NEXT:    [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP52:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP53:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB125:%.*]] = sub nsw i64 [[TMP53]], 0
+// CHECK2-NEXT:    [[DIV126:%.*]] = sdiv i64 [[SUB125]], 1
+// CHECK2-NEXT:    [[MUL127:%.*]] = mul nsw i64 1, [[DIV126]]
+// CHECK2-NEXT:    [[TMP54:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB128:%.*]] = sub nsw i64 [[TMP54]], 0
+// CHECK2-NEXT:    [[DIV129:%.*]] = sdiv i64 [[SUB128]], 1
+// CHECK2-NEXT:    [[MUL130:%.*]] = mul nsw i64 [[MUL127]], [[DIV129]]
+// CHECK2-NEXT:    [[MUL131:%.*]] = mul nsw i64 [[MUL130]], 128
+// CHECK2-NEXT:    [[DIV132:%.*]] = sdiv i64 [[TMP52]], [[MUL131]]
+// CHECK2-NEXT:    [[TMP55:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB133:%.*]] = sub nsw i64 [[TMP55]], 0
+// CHECK2-NEXT:    [[DIV134:%.*]] = sdiv i64 [[SUB133]], 1
+// CHECK2-NEXT:    [[MUL135:%.*]] = mul nsw i64 1, [[DIV134]]
+// CHECK2-NEXT:    [[TMP56:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB136:%.*]] = sub nsw i64 [[TMP56]], 0
+// CHECK2-NEXT:    [[DIV137:%.*]] = sdiv i64 [[SUB136]], 1
+// CHECK2-NEXT:    [[MUL138:%.*]] = mul nsw i64 [[MUL135]], [[DIV137]]
+// CHECK2-NEXT:    [[MUL139:%.*]] = mul nsw i64 [[MUL138]], 128
+// CHECK2-NEXT:    [[MUL140:%.*]] = mul nsw i64 [[DIV132]], [[MUL139]]
+// CHECK2-NEXT:    [[SUB141:%.*]] = sub nsw i64 [[TMP51]], [[MUL140]]
+// CHECK2-NEXT:    [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP58:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP59:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB142:%.*]] = sub nsw i64 [[TMP59]], 0
+// CHECK2-NEXT:    [[DIV143:%.*]] = sdiv i64 [[SUB142]], 1
+// CHECK2-NEXT:    [[MUL144:%.*]] = mul nsw i64 1, [[DIV143]]
+// CHECK2-NEXT:    [[TMP60:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB145:%.*]] = sub nsw i64 [[TMP60]], 0
+// CHECK2-NEXT:    [[DIV146:%.*]] = sdiv i64 [[SUB145]], 1
+// CHECK2-NEXT:    [[MUL147:%.*]] = mul nsw i64 [[MUL144]], [[DIV146]]
+// CHECK2-NEXT:    [[MUL148:%.*]] = mul nsw i64 [[MUL147]], 128
+// CHECK2-NEXT:    [[DIV149:%.*]] = sdiv i64 [[TMP58]], [[MUL148]]
+// CHECK2-NEXT:    [[TMP61:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB150:%.*]] = sub nsw i64 [[TMP61]], 0
+// CHECK2-NEXT:    [[DIV151:%.*]] = sdiv i64 [[SUB150]], 1
+// CHECK2-NEXT:    [[MUL152:%.*]] = mul nsw i64 1, [[DIV151]]
+// CHECK2-NEXT:    [[TMP62:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB153:%.*]] = sub nsw i64 [[TMP62]], 0
+// CHECK2-NEXT:    [[DIV154:%.*]] = sdiv i64 [[SUB153]], 1
+// CHECK2-NEXT:    [[MUL155:%.*]] = mul nsw i64 [[MUL152]], [[DIV154]]
+// CHECK2-NEXT:    [[MUL156:%.*]] = mul nsw i64 [[MUL155]], 128
+// CHECK2-NEXT:    [[MUL157:%.*]] = mul nsw i64 [[DIV149]], [[MUL156]]
+// CHECK2-NEXT:    [[SUB158:%.*]] = sub nsw i64 [[TMP57]], [[MUL157]]
+// CHECK2-NEXT:    [[TMP63:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB159:%.*]] = sub nsw i64 [[TMP63]], 0
+// CHECK2-NEXT:    [[DIV160:%.*]] = sdiv i64 [[SUB159]], 1
+// CHECK2-NEXT:    [[MUL161:%.*]] = mul nsw i64 1, [[DIV160]]
+// CHECK2-NEXT:    [[MUL162:%.*]] = mul nsw i64 [[MUL161]], 128
+// CHECK2-NEXT:    [[DIV163:%.*]] = sdiv i64 [[SUB158]], [[MUL162]]
+// CHECK2-NEXT:    [[TMP64:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB164:%.*]] = sub nsw i64 [[TMP64]], 0
+// CHECK2-NEXT:    [[DIV165:%.*]] = sdiv i64 [[SUB164]], 1
+// CHECK2-NEXT:    [[MUL166:%.*]] = mul nsw i64 1, [[DIV165]]
+// CHECK2-NEXT:    [[MUL167:%.*]] = mul nsw i64 [[MUL166]], 128
+// CHECK2-NEXT:    [[MUL168:%.*]] = mul nsw i64 [[DIV163]], [[MUL167]]
+// CHECK2-NEXT:    [[SUB169:%.*]] = sub nsw i64 [[SUB141]], [[MUL168]]
+// CHECK2-NEXT:    [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP66:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP67:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB170:%.*]] = sub nsw i64 [[TMP67]], 0
+// CHECK2-NEXT:    [[DIV171:%.*]] = sdiv i64 [[SUB170]], 1
+// CHECK2-NEXT:    [[MUL172:%.*]] = mul nsw i64 1, [[DIV171]]
+// CHECK2-NEXT:    [[TMP68:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB173:%.*]] = sub nsw i64 [[TMP68]], 0
+// CHECK2-NEXT:    [[DIV174:%.*]] = sdiv i64 [[SUB173]], 1
+// CHECK2-NEXT:    [[MUL175:%.*]] = mul nsw i64 [[MUL172]], [[DIV174]]
+// CHECK2-NEXT:    [[MUL176:%.*]] = mul nsw i64 [[MUL175]], 128
+// CHECK2-NEXT:    [[DIV177:%.*]] = sdiv i64 [[TMP66]], [[MUL176]]
+// CHECK2-NEXT:    [[TMP69:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB178:%.*]] = sub nsw i64 [[TMP69]], 0
+// CHECK2-NEXT:    [[DIV179:%.*]] = sdiv i64 [[SUB178]], 1
+// CHECK2-NEXT:    [[MUL180:%.*]] = mul nsw i64 1, [[DIV179]]
+// CHECK2-NEXT:    [[TMP70:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB181:%.*]] = sub nsw i64 [[TMP70]], 0
+// CHECK2-NEXT:    [[DIV182:%.*]] = sdiv i64 [[SUB181]], 1
+// CHECK2-NEXT:    [[MUL183:%.*]] = mul nsw i64 [[MUL180]], [[DIV182]]
+// CHECK2-NEXT:    [[MUL184:%.*]] = mul nsw i64 [[MUL183]], 128
+// CHECK2-NEXT:    [[MUL185:%.*]] = mul nsw i64 [[DIV177]], [[MUL184]]
+// CHECK2-NEXT:    [[SUB186:%.*]] = sub nsw i64 [[TMP65]], [[MUL185]]
+// CHECK2-NEXT:    [[TMP71:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP72:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP73:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB187:%.*]] = sub nsw i64 [[TMP73]], 0
+// CHECK2-NEXT:    [[DIV188:%.*]] = sdiv i64 [[SUB187]], 1
+// CHECK2-NEXT:    [[MUL189:%.*]] = mul nsw i64 1, [[DIV188]]
+// CHECK2-NEXT:    [[TMP74:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB190:%.*]] = sub nsw i64 [[TMP74]], 0
+// CHECK2-NEXT:    [[DIV191:%.*]] = sdiv i64 [[SUB190]], 1
+// CHECK2-NEXT:    [[MUL192:%.*]] = mul nsw i64 [[MUL189]], [[DIV191]]
+// CHECK2-NEXT:    [[MUL193:%.*]] = mul nsw i64 [[MUL192]], 128
+// CHECK2-NEXT:    [[DIV194:%.*]] = sdiv i64 [[TMP72]], [[MUL193]]
+// CHECK2-NEXT:    [[TMP75:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_24]], align 8
+// CHECK2-NEXT:    [[SUB195:%.*]] = sub nsw i64 [[TMP75]], 0
+// CHECK2-NEXT:    [[DIV196:%.*]] = sdiv i64 [[SUB195]], 1
+// CHECK2-NEXT:    [[MUL197:%.*]] = mul nsw i64 1, [[DIV196]]
+// CHECK2-NEXT:    [[TMP76:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB198:%.*]] = sub nsw i64 [[TMP76]], 0
+// CHECK2-NEXT:    [[DIV199:%.*]] = sdiv i64 [[SUB198]], 1
+// CHECK2-NEXT:    [[MUL200:%.*]] = mul nsw i64 [[MUL197]], [[DIV199]]
+// CHECK2-NEXT:    [[MUL201:%.*]] = mul nsw i64 [[MUL200]], 128
+// CHECK2-NEXT:    [[MUL202:%.*]] = mul nsw i64 [[DIV194]], [[MUL201]]
+// CHECK2-NEXT:    [[SUB203:%.*]] = sub nsw i64 [[TMP71]], [[MUL202]]
+// CHECK2-NEXT:    [[TMP77:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB204:%.*]] = sub nsw i64 [[TMP77]], 0
+// CHECK2-NEXT:    [[DIV205:%.*]] = sdiv i64 [[SUB204]], 1
+// CHECK2-NEXT:    [[MUL206:%.*]] = mul nsw i64 1, [[DIV205]]
+// CHECK2-NEXT:    [[MUL207:%.*]] = mul nsw i64 [[MUL206]], 128
+// CHECK2-NEXT:    [[DIV208:%.*]] = sdiv i64 [[SUB203]], [[MUL207]]
+// CHECK2-NEXT:    [[TMP78:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_26]], align 8
+// CHECK2-NEXT:    [[SUB209:%.*]] = sub nsw i64 [[TMP78]], 0
+// CHECK2-NEXT:    [[DIV210:%.*]] = sdiv i64 [[SUB209]], 1
+// CHECK2-NEXT:    [[MUL211:%.*]] = mul nsw i64 1, [[DIV210]]
+// CHECK2-NEXT:    [[MUL212:%.*]] = mul nsw i64 [[MUL211]], 128
+// CHECK2-NEXT:    [[MUL213:%.*]] = mul nsw i64 [[DIV208]], [[MUL212]]
+// CHECK2-NEXT:    [[SUB214:%.*]] = sub nsw i64 [[SUB186]], [[MUL213]]
+// CHECK2-NEXT:    [[DIV215:%.*]] = sdiv i64 [[SUB214]], 128
+// CHECK2-NEXT:    [[MUL216:%.*]] = mul nsw i64 [[DIV215]], 128
+// CHECK2-NEXT:    [[SUB217:%.*]] = sub nsw i64 [[SUB169]], [[MUL216]]
+// CHECK2-NEXT:    [[MUL218:%.*]] = mul nsw i64 [[SUB217]], 1
+// CHECK2-NEXT:    [[ADD219:%.*]] = add nsw i64 0, [[MUL218]]
+// CHECK2-NEXT:    [[CONV220:%.*]] = trunc i64 [[ADD219]] to i32
+// CHECK2-NEXT:    store i32 [[CONV220]], ptr [[J40]], align 4
+// CHECK2-NEXT:    [[TMP79:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_12]], align 8
+// CHECK2-NEXT:    [[TMP80:%.*]] = load i64, ptr [[DOTPERMUTED_0_IV___BEGIN438]], align 8
+// CHECK2-NEXT:    [[MUL221:%.*]] = mul nsw i64 [[TMP80]], 1
+// CHECK2-NEXT:    [[ADD_PTR222:%.*]] = getelementptr inbounds double, ptr [[TMP79]], i64 [[MUL221]]
+// CHECK2-NEXT:    store ptr [[ADD_PTR222]], ptr [[__BEGIN4]], align 8
+// CHECK2-NEXT:    [[TMP81:%.*]] = load ptr, ptr [[__BEGIN4]], align 8
+// CHECK2-NEXT:    store ptr [[TMP81]], ptr [[BB]], align 8
+// CHECK2-NEXT:    [[TMP82:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP83:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN339]], align 8
+// CHECK2-NEXT:    [[MUL223:%.*]] = mul nsw i64 [[TMP83]], 1
+// CHECK2-NEXT:    [[ADD_PTR224:%.*]] = getelementptr inbounds double, ptr [[TMP82]], i64 [[MUL223]]
+// CHECK2-NEXT:    store ptr [[ADD_PTR224]], ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP84:%.*]] = load ptr, ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP85:%.*]] = load double, ptr [[TMP84]], align 8
+// CHECK2-NEXT:    store double [[TMP85]], ptr [[AA]], align 8
+// CHECK2-NEXT:    [[TMP86:%.*]] = load i32, ptr [[I37]], align 4
+// CHECK2-NEXT:    [[TMP87:%.*]] = load double, ptr [[C]], align 8
+// CHECK2-NEXT:    [[TMP88:%.*]] = load double, ptr [[AA]], align 8
+// CHECK2-NEXT:    [[TMP89:%.*]] = load double, ptr [[D]], align 8
+// CHECK2-NEXT:    [[TMP90:%.*]] = load ptr, ptr [[BB]], align 8
+// CHECK2-NEXT:    [[TMP91:%.*]] = load double, ptr [[TMP90]], align 8
+// CHECK2-NEXT:    [[TMP92:%.*]] = load i32, ptr [[J40]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP86]], double noundef [[TMP87]], double noundef [[TMP88]], double noundef [[TMP89]], double noundef [[TMP91]], i32 noundef [[TMP92]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP93:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[ADD225:%.*]] = add nsw i64 [[TMP93]], 1
+// CHECK2-NEXT:    store i64 [[ADD225]], ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK2:       omp.precond.end:
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo2
+// CHECK2-SAME: (i32 noundef [[START1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTNEW_STEP7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP8]], ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP11]], ptr [[DOTNEW_STEP7]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK2-NEXT:    [[SUB9:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+// CHECK2-NEXT:    [[SUB10:%.*]] = sub i32 [[SUB9]], 1
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK2-NEXT:    [[ADD11:%.*]] = add i32 [[SUB10]], [[TMP14]]
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK2-NEXT:    [[DIV12:%.*]] = udiv i32 [[ADD11]], [[TMP15]]
+// CHECK2-NEXT:    [[SUB13:%.*]] = sub i32 [[DIV12]], 1
+// CHECK2-NEXT:    store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_8]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4
+// CHECK2-NEXT:    [[ADD14:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP16]], [[ADD14]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4
+// CHECK2-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul i32 [[TMP19]], [[TMP20]]
+// CHECK2-NEXT:    [[ADD15:%.*]] = add i32 [[TMP18]], [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD15]], ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND16:%.*]]
+// CHECK2:       for.cond16:
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[ADD17:%.*]] = add i32 [[TMP22]], 1
+// CHECK2-NEXT:    [[CMP18:%.*]] = icmp ult i32 [[TMP21]], [[ADD17]]
+// CHECK2-NEXT:    br i1 [[CMP18]], label [[FOR_BODY19:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body19:
+// CHECK2-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[MUL20:%.*]] = mul i32 [[TMP24]], [[TMP25]]
+// CHECK2-NEXT:    [[ADD21:%.*]] = add i32 [[TMP23]], [[MUL20]]
+// CHECK2-NEXT:    store i32 [[ADD21]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP27:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP26]], i32 noundef [[TMP27]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add i32 [[TMP28]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND16]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    br label [[FOR_INC22:%.*]]
+// CHECK2:       for.inc22:
+// CHECK2-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[INC23:%.*]] = add i32 [[TMP29]], 1
+// CHECK2-NEXT:    store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2:       for.end24:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo3
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP7]], 3
+// CHECK2-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK2-NEXT:    store i32 [[ADD3]], ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP8]], 4
+// CHECK2-NEXT:    br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i32 7, [[MUL5]]
+// CHECK2-NEXT:    store i32 [[ADD6]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP10]], i32 noundef [[TMP11]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo4
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 15, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 15
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 15, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[K]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV3:%.*]] = sdiv i32 [[TMP8]], 4
+// CHECK2-NEXT:    [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL4]]
+// CHECK2-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK2-NEXT:    store i32 [[ADD6]], ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
+// CHECK2-NEXT:    [[MUL7:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK2-NEXT:    [[ADD8:%.*]] = add nsw i32 7, [[MUL7]]
+// CHECK2-NEXT:    store i32 [[ADD8]], ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[CMP9:%.*]] = icmp slt i32 [[TMP10]], 4
+// CHECK2-NEXT:    br i1 [[CMP9]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[TMP11]], 3
+// CHECK2-NEXT:    [[ADD11:%.*]] = add nsw i32 7, [[MUL10]]
+// CHECK2-NEXT:    store i32 [[ADD11]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP12]], i32 noundef [[TMP13]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK2-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo6
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV_K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV_J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[L:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK2-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK2-NEXT:    store i32 7, ptr [[K]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 255, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 255
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 255, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 64
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV5:%.*]] = sdiv i32 [[TMP8]], 64
+// CHECK2-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 64
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL6]]
+// CHECK2-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB]], 16
+// CHECK2-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK2-NEXT:    [[ADD9:%.*]] = add nsw i32 0, [[MUL8]]
+// CHECK2-NEXT:    store i32 [[ADD9]], ptr [[DOTPERMUTED_0_IV_K]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV10:%.*]] = sdiv i32 [[TMP10]], 64
+// CHECK2-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 64
+// CHECK2-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP9]], [[MUL11]]
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV13:%.*]] = sdiv i32 [[TMP12]], 64
+// CHECK2-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 64
+// CHECK2-NEXT:    [[SUB15:%.*]] = sub nsw i32 [[TMP11]], [[MUL14]]
+// CHECK2-NEXT:    [[DIV16:%.*]] = sdiv i32 [[SUB15]], 16
+// CHECK2-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[DIV16]], 16
+// CHECK2-NEXT:    [[SUB18:%.*]] = sub nsw i32 [[SUB12]], [[MUL17]]
+// CHECK2-NEXT:    [[DIV19:%.*]] = sdiv i32 [[SUB18]], 4
+// CHECK2-NEXT:    [[MUL20:%.*]] = mul nsw i32 [[DIV19]], 1
+// CHECK2-NEXT:    [[ADD21:%.*]] = add nsw i32 0, [[MUL20]]
+// CHECK2-NEXT:    store i32 [[ADD21]], ptr [[DOTPERMUTED_1_IV_J]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV22:%.*]] = sdiv i32 [[TMP14]], 64
+// CHECK2-NEXT:    [[MUL23:%.*]] = mul nsw i32 [[DIV22]], 64
+// CHECK2-NEXT:    [[SUB24:%.*]] = sub nsw i32 [[TMP13]], [[MUL23]]
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV25:%.*]] = sdiv i32 [[TMP16]], 64
+// CHECK2-NEXT:    [[MUL26:%.*]] = mul nsw i32 [[DIV25]], 64
+// CHECK2-NEXT:    [[SUB27:%.*]] = sub nsw i32 [[TMP15]], [[MUL26]]
+// CHECK2-NEXT:    [[DIV28:%.*]] = sdiv i32 [[SUB27]], 16
+// CHECK2-NEXT:    [[MUL29:%.*]] = mul nsw i32 [[DIV28]], 16
+// CHECK2-NEXT:    [[SUB30:%.*]] = sub nsw i32 [[SUB24]], [[MUL29]]
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV31:%.*]] = sdiv i32 [[TMP18]], 64
+// CHECK2-NEXT:    [[MUL32:%.*]] = mul nsw i32 [[DIV31]], 64
+// CHECK2-NEXT:    [[SUB33:%.*]] = sub nsw i32 [[TMP17]], [[MUL32]]
+// CHECK2-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV34:%.*]] = sdiv i32 [[TMP20]], 64
+// CHECK2-NEXT:    [[MUL35:%.*]] = mul nsw i32 [[DIV34]], 64
+// CHECK2-NEXT:    [[SUB36:%.*]] = sub nsw i32 [[TMP19]], [[MUL35]]
+// CHECK2-NEXT:    [[DIV37:%.*]] = sdiv i32 [[SUB36]], 16
+// CHECK2-NEXT:    [[MUL38:%.*]] = mul nsw i32 [[DIV37]], 16
+// CHECK2-NEXT:    [[SUB39:%.*]] = sub nsw i32 [[SUB33]], [[MUL38]]
+// CHECK2-NEXT:    [[DIV40:%.*]] = sdiv i32 [[SUB39]], 4
+// CHECK2-NEXT:    [[MUL41:%.*]] = mul nsw i32 [[DIV40]], 4
+// CHECK2-NEXT:    [[SUB42:%.*]] = sub nsw i32 [[SUB30]], [[MUL41]]
+// CHECK2-NEXT:    [[MUL43:%.*]] = mul nsw i32 [[SUB42]], 3
+// CHECK2-NEXT:    [[ADD44:%.*]] = add nsw i32 7, [[MUL43]]
+// CHECK2-NEXT:    store i32 [[ADD44]], ptr [[L]], align 4
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_K]], align 4
+// CHECK2-NEXT:    [[MUL45:%.*]] = mul nsw i32 [[TMP21]], 3
+// CHECK2-NEXT:    [[ADD46:%.*]] = add nsw i32 7, [[MUL45]]
+// CHECK2-NEXT:    store i32 [[ADD46]], ptr [[K]], align 4
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_J]], align 4
+// CHECK2-NEXT:    [[MUL47:%.*]] = mul nsw i32 [[TMP22]], 3
+// CHECK2-NEXT:    [[ADD48:%.*]] = add nsw i32 7, [[MUL47]]
+// CHECK2-NEXT:    store i32 [[ADD48]], ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP23:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i32, ptr [[L]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP23]], i32 noundef [[TMP24]], i32 noundef [[TMP25]], i32 noundef [[TMP26]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD49:%.*]] = add nsw i32 [[TMP27]], 1
+// CHECK2-NEXT:    store i32 [[ADD49]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo9
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARR:%.*]] = alloca [128 x double], align 16
+// CHECK2-NEXT:    [[C:%.*]] = alloca double, align 8
+// CHECK2-NEXT:    [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTPERMUTED_1_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    store double 4.200000e+01, ptr [[C]], align 8
+// CHECK2-NEXT:    store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK2-NEXT:    store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT:    store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT:    store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP6]], 21
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END15:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 2
+// CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD6]], ptr [[I]], align 4
+// CHECK2-NEXT:    store i64 0, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    br label [[FOR_COND7:%.*]]
+// CHECK2:       for.cond7:
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT:    [[ADD8:%.*]] = add nsw i64 [[TMP9]], 1
+// CHECK2-NEXT:    [[CMP9:%.*]] = icmp slt i64 [[TMP8]], [[ADD8]]
+// CHECK2-NEXT:    br i1 [[CMP9]], label [[FOR_BODY10:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body10:
+// CHECK2-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[MUL11:%.*]] = mul nsw i64 [[TMP11]], 1
+// CHECK2-NEXT:    [[ADD_PTR12:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL11]]
+// CHECK2-NEXT:    store ptr [[ADD_PTR12]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    store ptr [[TMP12]], ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP13:%.*]] = load double, ptr [[C]], align 8
+// CHECK2-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = load double, ptr [[TMP14]], align 8
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(double noundef [[TMP13]], double noundef [[TMP15]], i32 noundef [[TMP16]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP17]], 1
+// CHECK2-NEXT:    store i64 [[INC]], ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    br label [[FOR_INC13:%.*]]
+// CHECK2:       for.inc13:
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK2-NEXT:    [[INC14:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK2-NEXT:    store i32 [[INC14]], ptr [[DOTPERMUTED_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2:       for.end15:
+// CHECK2-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/interchange_messages.cpp b/clang/test/OpenMP/interchange_messages.cpp
new file mode 100644
index 0000000000000..175c2f1efa744
--- /dev/null
+++ b/clang/test/OpenMP/interchange_messages.cpp
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++17 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s
+
+void func() {
+
+  // expected-warning@+1 {{extra tokens at the end of '#pragma omp interchange' are ignored}}
+  #pragma omp interchange foo
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j < 13; ++j)
+      ;
+
+  // expected-error@+1 {{unexpected OpenMP clause 'collapse' in directive '#pragma omp interchange'}}
+  #pragma omp interchange collapse(2)
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j < 13; ++j)
+      ;
+
+  {
+    // expected-error@+2 {{expected statement}}
+    #pragma omp interchange
+  }
+
+  // expected-error@+2 {{statement after '#pragma omp interchange' must be a for loop}}
+  #pragma omp interchange
+  int b = 0;
+
+  // expected-error@+3 {{statement after '#pragma omp interchange' must be a for loop}}
+  #pragma omp interchange
+  for (int i = 0; i < 7; ++i)
+    ;
+
+  // expected-error@+2 {{statement after '#pragma omp interchange' must be a for loop}}
+  #pragma omp interchange
+  for (int i = 0; i < 7; ++i) {
+    int k = 3;
+    for (int j = 0; j < 7; ++j)
+      ;
+  }
+
+  // expected-error@+3 {{expected loop invariant expression}}
+  #pragma omp interchange
+  for (int i = 0; i < 7; ++i)
+    for (int j = i; j < 7; ++j)
+      ;
+
+  // expected-error@+3 {{expected loop invariant expression}}
+  #pragma omp interchange
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j < i; ++j)
+      ;
+
+  // expected-error@+3 {{expected loop invariant expression}}
+  #pragma omp interchange
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j < i; ++j)
+      ;
+
+  // expected-error@+6 {{expected 3 for loops after '#pragma omp for', but found only 2}}
+  // expected-note@+1 {{as specified in 'collapse' clause}}
+  #pragma omp for collapse(3)
+  #pragma omp interchange 
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j < 13; ++j)
+      ;
+
+  // expected-error@+2 {{statement after '#pragma omp interchange' must be a for loop}}
+  #pragma omp interchange
+  #pragma omp for
+  for (int i = 0; i < 7; ++i)
+    ;
+
+  // expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', '>=', or '!=') of loop variable 'j'}}
+  #pragma omp interchange 
+  for (int i = 0; i < 7; ++i)
+    for (int j = 0; j/3<7; ++j)
+      ;
+}
+
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index 7b37480856ca2..4c2d40e21bef0 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -36,14 +36,14 @@ void test() {
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]])
 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK1:       user_code.entry:
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA15:![0-9]+]]
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit()
 // CHECK1-NEXT:    ret void
@@ -66,78 +66,78 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[REF_TMP2:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]]
 // CHECK1-NEXT:    [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8)
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA19:![0-9]+]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP2]], align 4
-// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR12:[0-9]+]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[TBAA19]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
+// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4
+// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA21:![0-9]+]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA21]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA21]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
@@ -161,13 +161,13 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA24:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA24]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
-// CHECK1-NEXT:    call void @_ZNSt7complexIfEC2ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[THIS1]], ptr nonnull align 4 dereferenceable(4) [[TMP0]], ptr nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR12]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIfEC2ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[THIS1]], ptr nonnull align 4 dereferenceable(4) [[TMP0]], ptr nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR11]]
 // CHECK1-NEXT:    ret void
 //
 //
@@ -197,79 +197,79 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP15:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[REF_TMP16:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
 // CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
 // CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
 // CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK1:       omp.precond.then:
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP6]], align 4
-// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR12]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[TBAA19]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR11]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4u(ptr @[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP19]], 1
 // CHECK1-NEXT:    [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
 // CHECK1-NEXT:    br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -278,30 +278,30 @@ void test() {
 // CHECK1:       omp.dispatch.body:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD11:%.*]] = add i32 [[TMP21]], 1
 // CHECK1-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
 // CHECK1-NEXT:    br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP23]], 1
 // CHECK1-NEXT:    [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4
+// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP14]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP24]] to float
-// CHECK1-NEXT:    store float [[CONV]], ptr [[REF_TMP15]], align 4
+// CHECK1-NEXT:    store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float
-// CHECK1-NEXT:    store float [[CONV17]], ptr [[REF_TMP16]], align 4
-// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR12]]
-// CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR12]]
+// CHECK1-NEXT:    store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[TBAA19]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]]
+// CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR11]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP15]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP14]]) #[[ATTR4]]
@@ -309,25 +309,25 @@ void test() {
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -335,7 +335,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1
 // CHECK1-NEXT:    br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
-// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[TMP2]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR12]]
+// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[TMP2]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR11]]
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
@@ -354,30 +354,30 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIfEpLIfEERS0_RKS_IT_E
-// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(8) [[__C:%.*]]) #[[ATTR6:[0-9]+]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(8) [[__C:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
-// CHECK1-NEXT:    [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR12]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR11]]
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA26:![0-9]+]]
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK1-NEXT:    store float [[ADD]], ptr [[__RE_]], align 4
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
-// CHECK1-NEXT:    [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR12]]
+// CHECK1-NEXT:    store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[TBAA26]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR11]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA28:![0-9]+]]
 // CHECK1-NEXT:    [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK1-NEXT:    store float [[ADD3]], ptr [[__IM_]], align 4
+// CHECK1-NEXT:    store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[TBAA28]]
 // CHECK1-NEXT:    ret ptr [[THIS1]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
@@ -442,59 +442,59 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND:%.*]]
 // CHECK1:       precond:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
-// CHECK1-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
-// CHECK1-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 2
+// CHECK1-NEXT:    br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // CHECK1:       body:
-// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]])
 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK1:       then:
-// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
-// CHECK1-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP6]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store volatile i32 [[TMP12]], ptr addrspace(3) [[TMP11]], align 4
 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
 // CHECK1:       else:
 // CHECK1-NEXT:    br label [[IFCONT]]
 // CHECK1:       ifcont:
-// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
-// CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK1-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]]
+// CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]]
 // CHECK1:       then3:
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
-// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
-// CHECK1-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP2]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP6]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load volatile i32, ptr addrspace(3) [[TMP14]], align 4
+// CHECK1-NEXT:    store i32 [[TMP18]], ptr [[TMP17]], align 4
+// CHECK1-NEXT:    br label [[IFCONT5:%.*]]
 // CHECK1:       else4:
-// CHECK1-NEXT:    br label [[IFCONT4]]
+// CHECK1-NEXT:    br label [[IFCONT5]]
 // CHECK1:       ifcont5:
-// CHECK1-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND]]
 // CHECK1:       exit:
 // CHECK1-NEXT:    ret void
@@ -507,17 +507,17 @@ void test() {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA29:![0-9]+]]
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
 // CHECK1-NEXT:    ret void
 //
@@ -528,14 +528,14 @@ void test() {
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]])
 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK1:       user_code.entry:
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit()
 // CHECK1-NEXT:    ret void
@@ -558,78 +558,78 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[REF_TMP2:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]]
 // CHECK1-NEXT:    [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16)
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA31:![0-9]+]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP2]], align 8
-// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR12]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[TBAA31]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
+// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4
+// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA21]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA21]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA21]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
@@ -653,13 +653,13 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA33:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA33]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
-// CHECK1-NEXT:    call void @_ZNSt7complexIdEC2ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[THIS1]], ptr nonnull align 8 dereferenceable(8) [[TMP0]], ptr nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR12]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIdEC2ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[THIS1]], ptr nonnull align 8 dereferenceable(8) [[TMP0]], ptr nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR11]]
 // CHECK1-NEXT:    ret void
 //
 //
@@ -689,79 +689,79 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP15:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[REF_TMP16:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
 // CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
 // CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
 // CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK1:       omp.precond.then:
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA31]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP6]], align 8
-// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR12]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[TBAA31]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR11]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP19]], 1
 // CHECK1-NEXT:    [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
 // CHECK1-NEXT:    br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -770,30 +770,30 @@ void test() {
 // CHECK1:       omp.dispatch.body:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD11:%.*]] = add i32 [[TMP21]], 1
 // CHECK1-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
 // CHECK1-NEXT:    br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP23]], 1
 // CHECK1-NEXT:    [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4
+// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[REF_TMP14]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP24]] to double
-// CHECK1-NEXT:    store double [[CONV]], ptr [[REF_TMP15]], align 8
+// CHECK1-NEXT:    store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[TBAA31]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double
-// CHECK1-NEXT:    store double [[CONV17]], ptr [[REF_TMP16]], align 8
-// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR12]]
-// CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR12]]
+// CHECK1-NEXT:    store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[TBAA31]]
+// CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]]
+// CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR11]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP15]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[REF_TMP14]]) #[[ATTR4]]
@@ -801,25 +801,25 @@ void test() {
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -827,7 +827,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1
 // CHECK1-NEXT:    br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
-// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[TMP2]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR12]]
+// CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[TMP2]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR11]]
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
@@ -846,30 +846,30 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E
-// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR6]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
-// CHECK1-NEXT:    [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR12]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR11]]
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA35:![0-9]+]]
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
-// CHECK1-NEXT:    store double [[ADD]], ptr [[__RE_]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
-// CHECK1-NEXT:    [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR12]]
+// CHECK1-NEXT:    store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[TBAA35]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR11]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA37:![0-9]+]]
 // CHECK1-NEXT:    [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
-// CHECK1-NEXT:    store double [[ADD3]], ptr [[__IM_]], align 8
+// CHECK1-NEXT:    store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[TBAA37]]
 // CHECK1-NEXT:    ret ptr [[THIS1]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
@@ -947,59 +947,59 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND:%.*]]
 // CHECK1:       precond:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
-// CHECK1-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 4
-// CHECK1-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 4
+// CHECK1-NEXT:    br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // CHECK1:       body:
-// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM]])
 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK1:       then:
-// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
-// CHECK1-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP6]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store volatile i32 [[TMP12]], ptr addrspace(3) [[TMP11]], align 4
 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
 // CHECK1:       else:
 // CHECK1-NEXT:    br label [[IFCONT]]
 // CHECK1:       ifcont:
-// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
-// CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK1-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]]
+// CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]]
 // CHECK1:       then3:
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
-// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
-// CHECK1-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP2]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP6]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load volatile i32, ptr addrspace(3) [[TMP14]], align 4
+// CHECK1-NEXT:    store i32 [[TMP18]], ptr [[TMP17]], align 4
+// CHECK1-NEXT:    br label [[IFCONT5:%.*]]
 // CHECK1:       else4:
-// CHECK1-NEXT:    br label [[IFCONT4]]
+// CHECK1-NEXT:    br label [[IFCONT5]]
 // CHECK1:       ifcont5:
-// CHECK1-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND]]
 // CHECK1:       exit:
 // CHECK1-NEXT:    ret void
@@ -1012,17 +1012,17 @@ void test() {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA29]]
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA15]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
 // CHECK1-NEXT:    ret void
 //
@@ -1033,40 +1033,40 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA24]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
-// CHECK1-NEXT:    store float [[TMP1]], ptr [[__RE_]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA19]]
+// CHECK1-NEXT:    store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[TBAA26]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
-// CHECK1-NEXT:    store float [[TMP3]], ptr [[__IM_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[TBAA19]]
+// CHECK1-NEXT:    store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[TBAA28]]
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIfE4realEv
-// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA26]]
 // CHECK1-NEXT:    ret float [[TMP0]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIfE4imagEv
-// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA28]]
 // CHECK1-NEXT:    ret float [[TMP0]]
 //
 //
@@ -1076,39 +1076,39 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA33]]
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA33]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK1-NEXT:    store double [[TMP1]], ptr [[__RE_]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA33]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA31]]
+// CHECK1-NEXT:    store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[TBAA35]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8
-// CHECK1-NEXT:    store double [[TMP3]], ptr [[__IM_]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA33]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[TBAA31]]
+// CHECK1-NEXT:    store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[TBAA37]]
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4realEv
-// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA35]]
 // CHECK1-NEXT:    ret double [[TMP0]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4imagEv
-// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
+// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]]
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA37]]
 // CHECK1-NEXT:    ret double [[TMP0]]
 //
diff --git a/clang/test/OpenMP/nvptx_target_printf_codegen.c b/clang/test/OpenMP/nvptx_target_printf_codegen.c
deleted file mode 100644
index f53daf65205c9..0000000000000
--- a/clang/test/OpenMP/nvptx_target_printf_codegen.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK-32
-// expected-no-diagnostics
-extern int printf(const char *, ...);
-
-
-// Check a simple call to printf end-to-end.
-int CheckSimple(void) {
-#pragma omp target
-  {
-    // printf in master-only basic block.
-    const char* fmt = "%d %lld %f";
-
-    printf(fmt, 1, 2ll, 3.0);
-  }
-
-  return 0;
-}
-
-void CheckNoArgs(void) {
-#pragma omp target
-  {
-    // printf in master-only basic block.
-    printf("hello, world!");
-  }
-}
-
-// Check that printf's alloca happens in the entry block, not inside the if
-// statement.
-int foo;
-void CheckAllocaIsInEntryBlock(void) {
-#pragma omp target
-  {
-    if (foo) {
-      printf("%d", 42);
-    }
-  }
-}
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT:    [[FMT:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT:    [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
-// CHECK-64-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64:       user_code.entry:
-// CHECK-64-NEXT:    store ptr @.str, ptr [[FMT]], align 8
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[FMT]], align 8
-// CHECK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 0
-// CHECK-64-NEXT:    store i32 1, ptr [[TMP2]], align 4
-// CHECK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 1
-// CHECK-64-NEXT:    store i64 2, ptr [[TMP3]], align 8
-// CHECK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 2
-// CHECK-64-NEXT:    store double 3.000000e+00, ptr [[TMP4]], align 8
-// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__llvm_omp_vprintf(ptr [[TMP1]], ptr [[TMP]], i32 24)
-// CHECK-64-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-64-NEXT:    ret void
-// CHECK-64:       worker.exit:
-// CHECK-64-NEXT:    ret void
-//
-//
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
-// CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64:       user_code.entry:
-// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str1, ptr null, i32 0)
-// CHECK-64-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-64-NEXT:    ret void
-// CHECK-64:       worker.exit:
-// CHECK-64-NEXT:    ret void
-//
-//
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[FOO:%.*]]) #[[ATTR0]] {
-// CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT:    [[FOO_ADDR:%.*]] = alloca i64, align 8
-// CHECK-64-NEXT:    [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
-// CHECK-64-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT:    store i64 [[FOO]], ptr [[FOO_ADDR]], align 8
-// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64:       user_code.entry:
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FOO_ADDR]], align 4
-// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
-// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-// CHECK-64:       if.then:
-// CHECK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], ptr [[TMP]], i32 0, i32 0
-// CHECK-64-NEXT:    store i32 42, ptr [[TMP2]], align 4
-// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str2, ptr [[TMP]], i32 4)
-// CHECK-64-NEXT:    br label [[IF_END]]
-// CHECK-64:       worker.exit:
-// CHECK-64-NEXT:    ret void
-// CHECK-64:       if.end:
-// CHECK-64-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-64-NEXT:    ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-32-NEXT:  entry:
-// CHECK-32-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT:    [[FMT:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT:    [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
-// CHECK-32-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32:       user_code.entry:
-// CHECK-32-NEXT:    store ptr @.str, ptr [[FMT]], align 4
-// CHECK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[FMT]], align 4
-// CHECK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 0
-// CHECK-32-NEXT:    store i32 1, ptr [[TMP2]], align 4
-// CHECK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 1
-// CHECK-32-NEXT:    store i64 2, ptr [[TMP3]], align 8
-// CHECK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 2
-// CHECK-32-NEXT:    store double 3.000000e+00, ptr [[TMP4]], align 8
-// CHECK-32-NEXT:    [[TMP5:%.*]] = call i32 @__llvm_omp_vprintf(ptr [[TMP1]], ptr [[TMP]], i32 24)
-// CHECK-32-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-32-NEXT:    ret void
-// CHECK-32:       worker.exit:
-// CHECK-32-NEXT:    ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
-// CHECK-32-NEXT:  entry:
-// CHECK-32-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32:       user_code.entry:
-// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str1, ptr null, i32 0)
-// CHECK-32-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-32-NEXT:    ret void
-// CHECK-32:       worker.exit:
-// CHECK-32-NEXT:    ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[FOO:%.*]]) #[[ATTR0]] {
-// CHECK-32-NEXT:  entry:
-// CHECK-32-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT:    [[FOO_ADDR:%.*]] = alloca i32, align 4
-// CHECK-32-NEXT:    [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
-// CHECK-32-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT:    store i32 [[FOO]], ptr [[FOO_ADDR]], align 4
-// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32:       user_code.entry:
-// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FOO_ADDR]], align 4
-// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
-// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-// CHECK-32:       if.then:
-// CHECK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], ptr [[TMP]], i32 0, i32 0
-// CHECK-32-NEXT:    store i32 42, ptr [[TMP2]], align 4
-// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str2, ptr [[TMP]], i32 4)
-// CHECK-32-NEXT:    br label [[IF_END]]
-// CHECK-32:       worker.exit:
-// CHECK-32-NEXT:    ret void
-// CHECK-32:       if.end:
-// CHECK-32-NEXT:    call void @__kmpc_target_deinit()
-// CHECK-32-NEXT:    ret void
-//
diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp
index 87eb2913537ba..6c163c1875171 100644
--- a/clang/test/OpenMP/ompx_attributes_codegen.cpp
+++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp
@@ -3,15 +3,17 @@
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
 // RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
+// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -dwarf-version=5 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
 // expected-no-diagnostics
 
 
 // Check that the target attributes are set on the generated kernel
 void func() {
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16(ptr {{[^,]+}}) #0
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}})
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #4
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
 
   #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
   {}
@@ -35,6 +37,6 @@ void func() {
 // NVIDIA: "omp_target_thread_limit"="20"
 // NVIDIA: "omp_target_thread_limit"="45"
 // NVIDIA: "omp_target_thread_limit"="17"
-// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
-// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"maxntidx", i32 45}
-// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
+// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
+// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
+// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index 41d43048d6a12..2a0a881b109be 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -323,8 +323,8 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:      #dbg_declare(ptr [[__VLA_EXPR0]], [[META24:![0-9]+]], !DIExpression(), [[META26:![0-9]+]])
 // CHECK2-NEXT:      #dbg_declare(ptr [[VLA]], [[META27:![0-9]+]], !DIExpression(), [[META31:![0-9]+]])
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @main.omp_outlined, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG32:![0-9]+]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.2, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.4, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.3, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8, !dbg [[DBG35:![0-9]+]]
 // CHECK2-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z5tmainIPPcEiT_(ptr noundef [[TMP3]]), !dbg [[DBG36:![0-9]+]]
 // CHECK2-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4, !dbg [[DBG37:![0-9]+]]
@@ -368,47 +368,47 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    unreachable, !dbg [[DBG53]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_
-// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR3:[0-9]+]] comdat !dbg [[DBG58:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META64:![0-9]+]])
-// CHECK2-NEXT:    ret void, !dbg [[DBG65:![0-9]+]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate
-// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] comdat {
-// CHECK2-NEXT:    [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5:[0-9]+]]
-// CHECK2-NEXT:    call void @_ZSt9terminatev() #[[ATTR6]]
-// CHECK2-NEXT:    unreachable
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG66:![0-9]+]] {
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG58:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META67:![0-9]+]], !DIExpression(), [[META68:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META59:![0-9]+]], !DIExpression(), [[META60:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META69:![0-9]+]], !DIExpression(), [[META68]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META61:![0-9]+]], !DIExpression(), [[META60]])
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META70:![0-9]+]], !DIExpression(), [[META68]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META62:![0-9]+]], !DIExpression(), [[META60]])
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META68]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG72:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5]], !dbg [[DBG72]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG72]]
+// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META60]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG64:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5:[0-9]+]], !dbg [[DBG64]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG64]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1
+// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_
+// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR3:[0-9]+]] comdat !dbg [[DBG65:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META70:![0-9]+]], !DIExpression(), [[META71:![0-9]+]])
+// CHECK2-NEXT:    ret void, !dbg [[DBG72:![0-9]+]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate
+// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] comdat {
+// CHECK2-NEXT:    [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]]
+// CHECK2-NEXT:    call void @_ZSt9terminatev() #[[ATTR6]]
+// CHECK2-NEXT:    unreachable
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] !dbg [[DBG75:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -431,14 +431,33 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    store i64 [[TMP0]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG82]]
 // CHECK2-NEXT:      #dbg_declare(ptr [[__VLA_EXPR0]], [[META84:![0-9]+]], !DIExpression(), [[META79]])
 // CHECK2-NEXT:      #dbg_declare(ptr [[VLA1]], [[META85:![0-9]+]], !DIExpression(), [[META79]])
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.1.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.2.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8, !dbg [[DBG86:![0-9]+]]
 // CHECK2-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP2]]), !dbg [[DBG86]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG88:![0-9]+]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined_debug__
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 !dbg [[DBG89:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.1
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] !dbg [[DBG89:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META90:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
+// CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META92:![0-9]+]], !DIExpression(), [[META91]])
+// CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META93:![0-9]+]], !DIExpression(), [[META91]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG94:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG94]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG94]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.2(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR5]], !dbg [[DBG94]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG94]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined_debug__
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 !dbg [[DBG95:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -446,37 +465,37 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META92:![0-9]+]], !DIExpression(), [[META93:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META98:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META94:![0-9]+]], !DIExpression(), [[META93]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META100:![0-9]+]], !DIExpression(), [[META99]])
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META95:![0-9]+]], !DIExpression(), [[META93]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META101:![0-9]+]], !DIExpression(), [[META99]])
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META96:![0-9]+]], !DIExpression(), [[META97:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META102:![0-9]+]], !DIExpression(), [[META103:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[GLOBAL_ADDR]], [[META98:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG100:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG101:![0-9]+]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG101]]
+// CHECK2-NEXT:      #dbg_declare(ptr [[GLOBAL_ADDR]], [[META104:![0-9]+]], !DIExpression(), [[META105:![0-9]+]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG106:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG107:![0-9]+]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG107]]
 // CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]])
-// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG100]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG106]]
 // CHECK2:       invoke.cont:
-// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG103:![0-9]+]]
-// CHECK2-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG102]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG108:![0-9]+]]
+// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG109:![0-9]+]]
+// CHECK2-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG110:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG108]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:            catch ptr null, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG100]]
-// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG100]]
-// CHECK2-NEXT:    unreachable, !dbg [[DBG100]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG106]]
+// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG106]]
+// CHECK2-NEXT:    unreachable, !dbg [[DBG106]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR2]] !dbg [[DBG105:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR2]] !dbg [[DBG111:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -484,46 +503,27 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META106:![0-9]+]], !DIExpression(), [[META107:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META112:![0-9]+]], !DIExpression(), [[META113:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META108:![0-9]+]], !DIExpression(), [[META107]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META114:![0-9]+]], !DIExpression(), [[META113]])
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META109:![0-9]+]], !DIExpression(), [[META107]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META115:![0-9]+]], !DIExpression(), [[META113]])
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META110:![0-9]+]], !DIExpression(), [[META107]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META116:![0-9]+]], !DIExpression(), [[META113]])
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[GLOBAL_ADDR]], [[META111:![0-9]+]], !DIExpression(), [[META107]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG112:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.1.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR5]], !dbg [[DBG112]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG112]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.2
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] !dbg [[DBG113:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META114:![0-9]+]], !DIExpression(), [[META115:![0-9]+]])
-// CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META116:![0-9]+]], !DIExpression(), [[META115]])
-// CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META117:![0-9]+]], !DIExpression(), [[META115]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[GLOBAL_ADDR]], [[META117:![0-9]+]], !DIExpression(), [[META113]])
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG118:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.1(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR5]], !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.2.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR5]], !dbg [[DBG118]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG118]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG119:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -540,12 +540,12 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META124:![0-9]+]], !DIExpression(), [[META125:![0-9]+]])
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG126:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.3.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.4.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG127:![0-9]+]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined_debug__
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 !dbg [[DBG128:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.3
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG128:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -558,51 +558,51 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
 // CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META132:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META134:![0-9]+]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG135:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG135]]
-// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG136:![0-9]+]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG136]]
-// CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG135]]
-// CHECK2:       invoke.cont:
-// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG137:![0-9]+]]
-// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG138:![0-9]+]]
-// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG139:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG137]]
-// CHECK2:       terminate.lpad:
-// CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:            catch ptr null, !dbg [[DBG135]]
-// CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG135]]
-// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR6]], !dbg [[DBG135]]
-// CHECK2-NEXT:    unreachable, !dbg [[DBG135]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG140:![0-9]+]] {
+// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META130]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.4(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5]], !dbg [[DBG134]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG134]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined_debug__
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 !dbg [[DBG135:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META141:![0-9]+]], !DIExpression(), [[META142:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META136:![0-9]+]], !DIExpression(), [[META137:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META143:![0-9]+]], !DIExpression(), [[META142]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META138:![0-9]+]], !DIExpression(), [[META137]])
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META144:![0-9]+]], !DIExpression(), [[META142]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META139:![0-9]+]], !DIExpression(), [[META137]])
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META145:![0-9]+]], !DIExpression(), [[META142]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG146:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.3.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5]], !dbg [[DBG146]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG146]]
+// CHECK2-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META140:![0-9]+]], !DIExpression(), [[META141:![0-9]+]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG142:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG142]]
+// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG143:![0-9]+]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG143]]
+// CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG142]]
+// CHECK2:       invoke.cont:
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG144:![0-9]+]]
+// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG145:![0-9]+]]
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG146:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG144]]
+// CHECK2:       terminate.lpad:
+// CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG142]]
+// CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG142]]
+// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR6]], !dbg [[DBG142]]
+// CHECK2-NEXT:    unreachable, !dbg [[DBG142]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.4
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] !dbg [[DBG147:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -622,7 +622,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG153]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG153]]
 // CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG153]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.3(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5]], !dbg [[DBG153]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.4.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR5]], !dbg [[DBG153]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG153]]
 //
 //
@@ -678,37 +678,37 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    unreachable, !dbg [[DBG178]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_
-// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR3]] comdat !dbg [[DBG189:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META192:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
-// CHECK2-NEXT:    ret void, !dbg [[DBG194:![0-9]+]]
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] !dbg [[DBG195:![0-9]+]] {
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] !dbg [[DBG189:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META196:![0-9]+]], !DIExpression(), [[META197:![0-9]+]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META190:![0-9]+]], !DIExpression(), [[META191:![0-9]+]])
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META198:![0-9]+]], !DIExpression(), [[META197]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META192:![0-9]+]], !DIExpression(), [[META191]])
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META199:![0-9]+]], !DIExpression(), [[META197]])
+// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META193:![0-9]+]], !DIExpression(), [[META191]])
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META200:![0-9]+]], !DIExpression(), [[META197]])
-// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR5]], !dbg [[DBG201]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG201]]
+// CHECK2-NEXT:      #dbg_declare(ptr [[VLA_ADDR]], [[META194:![0-9]+]], !DIExpression(), [[META191]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR5]], !dbg [[DBG195]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG195]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_
+// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR3]] comdat !dbg [[DBG196:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
+// CHECK2-NEXT:      #dbg_declare(ptr [[ARGC_ADDR]], [[META199:![0-9]+]], !DIExpression(), [[META200:![0-9]+]])
+// CHECK2-NEXT:    ret void, !dbg [[DBG201:![0-9]+]]
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@main
diff --git a/clang/test/OpenMP/reverse_ast_print.cpp b/clang/test/OpenMP/reverse_ast_print.cpp
new file mode 100644
index 0000000000000..3ff6d18cfdf8b
--- /dev/null
+++ b/clang/test/OpenMP/reverse_ast_print.cpp
@@ -0,0 +1,159 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-dump  %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-print    %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code.
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo1
+void foo1() {
+  // PRINT:     #pragma omp reverse
+  // DUMP:      OMPReverseDirective
+  #pragma omp reverse
+  // PRINT: for (int i = 7; i < 17; i += 3)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 17; i += 3)
+    // PRINT: body(i);
+    // DUMP:  CallExpr
+      body(i);
+}
+
+
+// PRINT-LABEL: void foo2(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo2
+void foo2(int start, int end, int step) {
+  // PRINT:     #pragma omp reverse
+  // DUMP:      OMPReverseDirective
+  #pragma omp reverse
+  // PRINT: for (int i = start; i < end; i += step)
+  // DUMP-NEXT: ForStmt
+  for (int i = start; i < end; i += step)
+      // PRINT: body(i);
+      // DUMP:  CallExpr
+      body(i);
+}
+
+
+// PRINT-LABEL: void foo3(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo3
+void foo3() {
+  // PRINT: #pragma omp for
+  // DUMP:  OMPForDirective
+  // DUMP-NEXT:    CapturedStmt
+  // DUMP-NEXT:      CapturedDecl
+  #pragma omp for
+  // PRINT:     #pragma omp reverse 
+  // DUMP-NEXT: OMPReverseDirective
+  #pragma omp reverse
+  for (int i = 7; i < 17; i += 3)
+    // PRINT: body(i);
+    // DUMP:  CallExpr
+    body(i);
+}
+
+
+// PRINT-LABEL: void foo4(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo4
+void foo4() {
+  // PRINT: #pragma omp for collapse(2)
+  // DUMP: OMPForDirective
+  // DUMP-NEXT: OMPCollapseClause
+  // DUMP-NEXT:  ConstantExpr
+  // DUMP-NEXT:    value: Int 2
+  // DUMP-NEXT:  IntegerLiteral {{.*}} 2
+  // DUMP-NEXT:    CapturedStmt
+  // DUMP-NEXT:      CapturedDecl
+  #pragma omp for collapse(2)
+  // PRINT:     #pragma omp reverse
+  // DUMP:      OMPReverseDirective
+  #pragma omp reverse
+  // PRINT: for (int i = 7; i < 17; i += 1)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 17; i += 1)
+    // PRINT: for (int j = 7; j < 17; j += 1)
+    // DUMP:  ForStmt
+    for (int j = 7; j < 17; j += 1)
+      // PRINT: body(i, j);
+      // DUMP:  CallExpr
+      body(i, j);
+}
+
+
+// PRINT-LABEL: void foo5(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo5
+void foo5(int start, int end, int step) {
+  // PRINT:     #pragma omp for collapse(2)
+  // DUMP:      OMPForDirective
+  // DUMP-NEXT:   OMPCollapseClause
+  // DUMP-NEXT:    ConstantExpr
+  // DUMP-NEXT:      value: Int 2
+  // DUMP-NEXT:    IntegerLiteral {{.*}} 2
+  // DUMP-NEXT:  CapturedStmt
+  // DUMP-NEXT:    CapturedDecl
+  #pragma omp for collapse(2)
+  // PRINT:     for (int i = 7; i < 17; i += 1)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 17; i += 1)
+    // PRINT: #pragma omp reverse
+    // DUMP:  OMPReverseDirective
+    #pragma omp reverse 
+    // PRINT:     for (int j = 7; j < 17; j += 1)
+    // DUMP-NEXT: ForStmt
+    for (int j = 7; j < 17; j += 1)
+      // PRINT: body(i, j);
+      // DUMP:  CallExpr
+      body(i, j);
+}
+
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL:  FunctionTemplateDecl {{.*}} foo6
+template<typename T, T Step>
+void foo6(T start, T end) {
+  // PRINT: #pragma omp reverse
+  // DUMP:  OMPReverseDirective
+  #pragma omp reverse
+    // PRINT-NEXT: for (T i = start; i < end; i += Step)
+    // DUMP-NEXT:  ForStmt
+    for (T i = start; i < end; i += Step)
+      // PRINT-NEXT: body(i);
+      // DUMP:       CallExpr
+      body(i);
+}
+
+// Also test instantiating the template.
+void tfoo6() {
+  foo6<int,3>(0, 42);
+}
+
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo7
+void foo7() {
+  double arr[128];
+  // PRINT: #pragma omp reverse
+  // DUMP:  OMPReverseDirective
+  #pragma omp reverse
+  // PRINT-NEXT: for (auto &&v : arr)
+  // DUMP-NEXT:  CXXForRangeStmt
+  for (auto &&v : arr)
+    // PRINT-NEXT: body(v);
+    // DUMP:       CallExpr
+    body(v);
+}
+
+#endif
+
diff --git a/clang/test/OpenMP/reverse_codegen.cpp b/clang/test/OpenMP/reverse_codegen.cpp
new file mode 100644
index 0000000000000..9adaa6cc7d18d
--- /dev/null
+++ b/clang/test/OpenMP/reverse_codegen.cpp
@@ -0,0 +1,1554 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code.
+extern "C" void body(...) {}
+
+
+struct S {
+  int i;
+  S() {
+#pragma omp reverse
+    for (i = 7; i < 17; i += 3)
+      body(i);
+  }
+} s;
+
+
+extern "C" void foo1(int start, int end, int step) {
+  int i;
+#pragma omp reverse
+  for (i = start; i < end; i += step)
+    body(i);
+}
+
+
+extern "C" void foo2() {
+#pragma omp for
+#pragma omp reverse
+    for (int i = 7; i < 17; i += 3)
+        body(i);
+}
+
+
+extern "C" void foo3() {
+#pragma omp for collapse(3)
+  for (int k = 7; k < 17; k += 3)
+#pragma omp reverse
+    for (int i = 7; i < 17; i += 3)
+      for (int j = 7; j < 17; j += 3)
+        body(k, i, j);
+}
+
+
+extern "C" void foo4() {
+#pragma omp parallel for
+#pragma omp reverse
+  for (int i = 7; i < 17; i += 3)
+    body(i);
+}
+
+
+template<typename T, T Step>
+void foo5(T start, T end) {
+#pragma omp reverse
+  for (T i = start; i < end; i += Step)
+    body(i);
+}
+
+extern "C" void tfoo5() {
+  foo5<int,3>(0, 42);
+}
+
+
+extern "C" void foo6() {
+  double arr[128];
+#pragma omp reverse
+  for (int c = 42; auto && v : arr)
+    body(v, c);
+}
+
+
+extern "C" void foo7() {
+  double A[128];
+
+#pragma omp for collapse(3)
+  for (int k = 7; k < 17; k += 3)
+#pragma omp reverse
+    for (int c = 42; auto && v : A)
+      for (int j = 7; j < 17; j += 3)
+        body(k, c, v, j);
+}
+
+#endif /* HEADER */
+
+// CHECK1-LABEL: define {{[^@]+}}@body
+// CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK1-SAME: () #[[ATTR1:[0-9]+]] section ".text.startup" {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    call void @_ZN1SC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[I2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT:    store ptr [[I3]], ptr [[I2]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP1]]
+// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 3
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[TMP3]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP5]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo1
+// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[ADD6:%.*]] = add i32 [[TMP10]], 1
+// CHECK1-NEXT:    [[SUB7:%.*]] = sub i32 [[ADD6]], 1
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP11]]
+// CHECK1-NEXT:    store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD9]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP15]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add i32 [[TMP16]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo2
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP7]]
+// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP8]], 3
+// CHECK1-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK1-NEXT:    store i32 [[ADD3]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP9]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo3
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 63, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 16
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[K]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16
+// CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]]
+// CHECK1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[SUB]], 4
+// CHECK1-NEXT:    [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 1
+// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK1-NEXT:    store i32 [[ADD8]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16
+// CHECK1-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16
+// CHECK1-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16
+// CHECK1-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16
+// CHECK1-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]]
+// CHECK1-NEXT:    [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4
+// CHECK1-NEXT:    [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4
+// CHECK1-NEXT:    [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]]
+// CHECK1-NEXT:    [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3
+// CHECK1-NEXT:    [[ADD19:%.*]] = add nsw i32 7, [[MUL18]]
+// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB20:%.*]] = sub nsw i32 3, [[TMP13]]
+// CHECK1-NEXT:    store i32 [[SUB20]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[TMP14]], 3
+// CHECK1-NEXT:    [[ADD22:%.*]] = add nsw i32 7, [[MUL21]]
+// CHECK1-NEXT:    store i32 [[ADD22]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP15]], i32 noundef [[TMP16]], i32 noundef [[TMP17]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo4
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo4.omp_outlined)
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo4.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 3
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP8]]
+// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK1-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK1-NEXT:    store i32 [[ADD3]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP10]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK1-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@tfoo5
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    call void @_Z4foo5IiTnT_Li3EEvS0_S0_(i32 noundef 0, i32 noundef 42)
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z4foo5IiTnT_Li3EEvS0_S0_
+// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 3
+// CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 3
+// CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP6]], 1
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP5]], [[ADD5]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    [[ADD6:%.*]] = add i32 [[TMP7]], 1
+// CHECK1-NEXT:    [[SUB7:%.*]] = sub i32 [[ADD6]], 1
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP8]]
+// CHECK1-NEXT:    store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 3
+// CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD9]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP11]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add i32 [[TMP12]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo6
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[ARR:%.*]] = alloca [128 x double], align 16
+// CHECK1-NEXT:    [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTFORWARD_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTREVERSED_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT:    store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK1-NEXT:    store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT:    store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT:    store i64 0, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i64 [[TMP8]], 1
+// CHECK1-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[ADD7]], 1
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[SUB9:%.*]] = sub nsw i64 [[SUB8]], [[TMP9]]
+// CHECK1-NEXT:    store i64 [[SUB9]], ptr [[DOTREVERSED_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP11]], 1
+// CHECK1-NEXT:    [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL]]
+// CHECK1-NEXT:    store ptr [[ADD_PTR10]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP12]], ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP14:%.*]] = load double, ptr [[TMP13]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT:    call void (...) @body(double noundef [[TMP14]], i32 noundef [[TMP15]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK1-NEXT:    store i64 [[INC]], ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@foo7
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[A:%.*]] = alloca [128 x double], align 16
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[__RANGE3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__END3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__BEGIN3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_5:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_8:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_10:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[K15:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFORWARD_IV___BEGIN316:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[J17:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTREVERSED_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK1-NEXT:    store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT:    store ptr [[A]], ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK1-NEXT:    store ptr [[ADD_PTR]], ptr [[__END3]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY3:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY3]], ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK1-NEXT:    [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr [[ARRAYDECAY4]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64
+// CHECK1-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT:    [[SUB7:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT:    store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK1-NEXT:    [[ADD9:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT:    store i64 [[ADD9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB11:%.*]] = sub nsw i64 [[TMP8]], 0
+// CHECK1-NEXT:    [[DIV12:%.*]] = sdiv i64 [[SUB11]], 1
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i64 4, [[DIV12]]
+// CHECK1-NEXT:    [[MUL13:%.*]] = mul nsw i64 [[MUL]], 4
+// CHECK1-NEXT:    [[SUB14:%.*]] = sub nsw i64 [[MUL13]], 1
+// CHECK1-NEXT:    store i64 [[SUB14]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT:    store i32 7, ptr [[K]], align 4
+// CHECK1-NEXT:    store i64 0, ptr [[DOTFORWARD_IV___BEGIN3]], align 8
+// CHECK1-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i64 0, [[TMP9]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1:       omp.precond.then:
+// CHECK1-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT:    store i64 [[TMP10]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT:    [[CMP18:%.*]] = icmp sgt i64 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT:    br i1 [[CMP18]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i64 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT:    store i64 [[TMP15]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT:    [[CMP19:%.*]] = icmp sle i64 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT:    br i1 [[CMP19]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB20:%.*]] = sub nsw i64 [[TMP19]], 0
+// CHECK1-NEXT:    [[DIV21:%.*]] = sdiv i64 [[SUB20]], 1
+// CHECK1-NEXT:    [[MUL22:%.*]] = mul nsw i64 1, [[DIV21]]
+// CHECK1-NEXT:    [[MUL23:%.*]] = mul nsw i64 [[MUL22]], 4
+// CHECK1-NEXT:    [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[MUL23]]
+// CHECK1-NEXT:    [[MUL25:%.*]] = mul nsw i64 [[DIV24]], 3
+// CHECK1-NEXT:    [[ADD26:%.*]] = add nsw i64 7, [[MUL25]]
+// CHECK1-NEXT:    [[CONV:%.*]] = trunc i64 [[ADD26]] to i32
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[K15]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB27:%.*]] = sub nsw i64 [[TMP22]], 0
+// CHECK1-NEXT:    [[DIV28:%.*]] = sdiv i64 [[SUB27]], 1
+// CHECK1-NEXT:    [[MUL29:%.*]] = mul nsw i64 1, [[DIV28]]
+// CHECK1-NEXT:    [[MUL30:%.*]] = mul nsw i64 [[MUL29]], 4
+// CHECK1-NEXT:    [[DIV31:%.*]] = sdiv i64 [[TMP21]], [[MUL30]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB32:%.*]] = sub nsw i64 [[TMP23]], 0
+// CHECK1-NEXT:    [[DIV33:%.*]] = sdiv i64 [[SUB32]], 1
+// CHECK1-NEXT:    [[MUL34:%.*]] = mul nsw i64 1, [[DIV33]]
+// CHECK1-NEXT:    [[MUL35:%.*]] = mul nsw i64 [[MUL34]], 4
+// CHECK1-NEXT:    [[MUL36:%.*]] = mul nsw i64 [[DIV31]], [[MUL35]]
+// CHECK1-NEXT:    [[SUB37:%.*]] = sub nsw i64 [[TMP20]], [[MUL36]]
+// CHECK1-NEXT:    [[DIV38:%.*]] = sdiv i64 [[SUB37]], 4
+// CHECK1-NEXT:    [[MUL39:%.*]] = mul nsw i64 [[DIV38]], 1
+// CHECK1-NEXT:    [[ADD40:%.*]] = add nsw i64 0, [[MUL39]]
+// CHECK1-NEXT:    store i64 [[ADD40]], ptr [[DOTFORWARD_IV___BEGIN316]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB41:%.*]] = sub nsw i64 [[TMP26]], 0
+// CHECK1-NEXT:    [[DIV42:%.*]] = sdiv i64 [[SUB41]], 1
+// CHECK1-NEXT:    [[MUL43:%.*]] = mul nsw i64 1, [[DIV42]]
+// CHECK1-NEXT:    [[MUL44:%.*]] = mul nsw i64 [[MUL43]], 4
+// CHECK1-NEXT:    [[DIV45:%.*]] = sdiv i64 [[TMP25]], [[MUL44]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB46:%.*]] = sub nsw i64 [[TMP27]], 0
+// CHECK1-NEXT:    [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1
+// CHECK1-NEXT:    [[MUL48:%.*]] = mul nsw i64 1, [[DIV47]]
+// CHECK1-NEXT:    [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 4
+// CHECK1-NEXT:    [[MUL50:%.*]] = mul nsw i64 [[DIV45]], [[MUL49]]
+// CHECK1-NEXT:    [[SUB51:%.*]] = sub nsw i64 [[TMP24]], [[MUL50]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB52:%.*]] = sub nsw i64 [[TMP30]], 0
+// CHECK1-NEXT:    [[DIV53:%.*]] = sdiv i64 [[SUB52]], 1
+// CHECK1-NEXT:    [[MUL54:%.*]] = mul nsw i64 1, [[DIV53]]
+// CHECK1-NEXT:    [[MUL55:%.*]] = mul nsw i64 [[MUL54]], 4
+// CHECK1-NEXT:    [[DIV56:%.*]] = sdiv i64 [[TMP29]], [[MUL55]]
+// CHECK1-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT:    [[SUB57:%.*]] = sub nsw i64 [[TMP31]], 0
+// CHECK1-NEXT:    [[DIV58:%.*]] = sdiv i64 [[SUB57]], 1
+// CHECK1-NEXT:    [[MUL59:%.*]] = mul nsw i64 1, [[DIV58]]
+// CHECK1-NEXT:    [[MUL60:%.*]] = mul nsw i64 [[MUL59]], 4
+// CHECK1-NEXT:    [[MUL61:%.*]] = mul nsw i64 [[DIV56]], [[MUL60]]
+// CHECK1-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[TMP28]], [[MUL61]]
+// CHECK1-NEXT:    [[DIV63:%.*]] = sdiv i64 [[SUB62]], 4
+// CHECK1-NEXT:    [[MUL64:%.*]] = mul nsw i64 [[DIV63]], 4
+// CHECK1-NEXT:    [[SUB65:%.*]] = sub nsw i64 [[SUB51]], [[MUL64]]
+// CHECK1-NEXT:    [[MUL66:%.*]] = mul nsw i64 [[SUB65]], 3
+// CHECK1-NEXT:    [[ADD67:%.*]] = add nsw i64 7, [[MUL66]]
+// CHECK1-NEXT:    [[CONV68:%.*]] = trunc i64 [[ADD67]] to i32
+// CHECK1-NEXT:    store i32 [[CONV68]], ptr [[J17]], align 4
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK1-NEXT:    [[ADD69:%.*]] = add nsw i64 [[TMP32]], 1
+// CHECK1-NEXT:    [[SUB70:%.*]] = sub nsw i64 [[ADD69]], 1
+// CHECK1-NEXT:    [[TMP33:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN316]], align 8
+// CHECK1-NEXT:    [[SUB71:%.*]] = sub nsw i64 [[SUB70]], [[TMP33]]
+// CHECK1-NEXT:    store i64 [[SUB71]], ptr [[DOTREVERSED_IV___BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN3]], align 8
+// CHECK1-NEXT:    [[MUL72:%.*]] = mul nsw i64 [[TMP35]], 1
+// CHECK1-NEXT:    [[ADD_PTR73:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[MUL72]]
+// CHECK1-NEXT:    store ptr [[ADD_PTR73]], ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[__BEGIN3]], align 8
+// CHECK1-NEXT:    store ptr [[TMP36]], ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP37:%.*]] = load i32, ptr [[K15]], align 4
+// CHECK1-NEXT:    [[TMP38:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT:    [[TMP40:%.*]] = load double, ptr [[TMP39]], align 8
+// CHECK1-NEXT:    [[TMP41:%.*]] = load i32, ptr [[J17]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP37]], i32 noundef [[TMP38]], double noundef [[TMP40]], i32 noundef [[TMP41]])
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP42:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    [[ADD74:%.*]] = add nsw i64 [[TMP42]], 1
+// CHECK1-NEXT:    store i64 [[ADD74]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1:       omp.precond.end:
+// CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_reverse_codegen.cpp
+// CHECK1-SAME: () #[[ATTR1]] section ".text.startup" {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    call void @__cxx_global_var_init()
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK2-SAME: () #[[ATTR0:[0-9]+]] section ".text.startup" {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC1Ev
+// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK2-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK2-NEXT:    call void @_ZN1SC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC2Ev
+// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[I2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK2-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK2-NEXT:    [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT:    store ptr [[I3]], ptr [[I2]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP1]]
+// CHECK2-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 3
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[TMP3]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP5]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@body
+// CHECK2-SAME: (...) #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo1
+// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[ADD6:%.*]] = add i32 [[TMP10]], 1
+// CHECK2-NEXT:    [[SUB7:%.*]] = sub i32 [[ADD6]], 1
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP11]]
+// CHECK2-NEXT:    store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD9]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP15]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add i32 [[TMP16]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo2
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP7]]
+// CHECK2-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP8]], 3
+// CHECK2-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK2-NEXT:    store i32 [[ADD3]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP9]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK2-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo3
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 63, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP6]], 16
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 3
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[K]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16
+// CHECK2-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]]
+// CHECK2-NEXT:    [[DIV6:%.*]] = sdiv i32 [[SUB]], 4
+// CHECK2-NEXT:    [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 1
+// CHECK2-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK2-NEXT:    store i32 [[ADD8]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16
+// CHECK2-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16
+// CHECK2-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]]
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16
+// CHECK2-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16
+// CHECK2-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]]
+// CHECK2-NEXT:    [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4
+// CHECK2-NEXT:    [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4
+// CHECK2-NEXT:    [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]]
+// CHECK2-NEXT:    [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3
+// CHECK2-NEXT:    [[ADD19:%.*]] = add nsw i32 7, [[MUL18]]
+// CHECK2-NEXT:    store i32 [[ADD19]], ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB20:%.*]] = sub nsw i32 3, [[TMP13]]
+// CHECK2-NEXT:    store i32 [[SUB20]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[TMP14]], 3
+// CHECK2-NEXT:    [[ADD22:%.*]] = add nsw i32 7, [[MUL21]]
+// CHECK2-NEXT:    store i32 [[ADD22]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP15]], i32 noundef [[TMP16]], i32 noundef [[TMP17]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK2-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo4
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo4.omp_outlined)
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo4.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 3
+// CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK2-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 3, [[TMP8]]
+// CHECK2-NEXT:    store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK2-NEXT:    [[ADD3:%.*]] = add nsw i32 7, [[MUL2]]
+// CHECK2-NEXT:    store i32 [[ADD3]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP10]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK2-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo6
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARR:%.*]] = alloca [128 x double], align 16
+// CHECK2-NEXT:    [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTFORWARD_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTREVERSED_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT:    store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK2-NEXT:    store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT:    store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT:    store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT:    store i64 0, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT:    [[ADD7:%.*]] = add nsw i64 [[TMP8]], 1
+// CHECK2-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[ADD7]], 1
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[SUB9:%.*]] = sub nsw i64 [[SUB8]], [[TMP9]]
+// CHECK2-NEXT:    store i64 [[SUB9]], ptr [[DOTREVERSED_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP11]], 1
+// CHECK2-NEXT:    [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL]]
+// CHECK2-NEXT:    store ptr [[ADD_PTR10]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT:    store ptr [[TMP12]], ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP14:%.*]] = load double, ptr [[TMP13]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT:    call void (...) @body(double noundef [[TMP14]], i32 noundef [[TMP15]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK2-NEXT:    store i64 [[INC]], ptr [[DOTFORWARD_IV___BEGIN2]], align 8
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@foo7
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[A:%.*]] = alloca [128 x double], align 16
+// CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[_TMP1:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[__RANGE3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__END3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[__BEGIN3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_5:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_8:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_10:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[K15:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV___BEGIN316:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[J17:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV___BEGIN3:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK2-NEXT:    store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT:    store ptr [[A]], ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128
+// CHECK2-NEXT:    store ptr [[ADD_PTR]], ptr [[__END3]], align 8
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY3:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY3]], ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8
+// CHECK2-NEXT:    [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0
+// CHECK2-NEXT:    store ptr [[ARRAYDECAY4]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8
+// CHECK2-NEXT:    store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64
+// CHECK2-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT:    [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT:    [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT:    [[SUB7:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT:    store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK2-NEXT:    [[ADD9:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT:    store i64 [[ADD9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB11:%.*]] = sub nsw i64 [[TMP8]], 0
+// CHECK2-NEXT:    [[DIV12:%.*]] = sdiv i64 [[SUB11]], 1
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i64 4, [[DIV12]]
+// CHECK2-NEXT:    [[MUL13:%.*]] = mul nsw i64 [[MUL]], 4
+// CHECK2-NEXT:    [[SUB14:%.*]] = sub nsw i64 [[MUL13]], 1
+// CHECK2-NEXT:    store i64 [[SUB14]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT:    store i32 7, ptr [[K]], align 4
+// CHECK2-NEXT:    store i64 0, ptr [[DOTFORWARD_IV___BEGIN3]], align 8
+// CHECK2-NEXT:    store i32 7, ptr [[J]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i64 0, [[TMP9]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2:       omp.precond.then:
+// CHECK2-NEXT:    store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT:    store i64 [[TMP10]], ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK2-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT:    call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT:    [[CMP18:%.*]] = icmp sgt i64 [[TMP11]], [[TMP12]]
+// CHECK2-NEXT:    br i1 [[CMP18]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i64 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK2-NEXT:    store i64 [[TMP15]], ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2:       omp.inner.for.cond:
+// CHECK2-NEXT:    [[TMP16:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK2-NEXT:    [[CMP19:%.*]] = icmp sle i64 [[TMP16]], [[TMP17]]
+// CHECK2-NEXT:    br i1 [[CMP19]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2:       omp.inner.for.body:
+// CHECK2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB20:%.*]] = sub nsw i64 [[TMP19]], 0
+// CHECK2-NEXT:    [[DIV21:%.*]] = sdiv i64 [[SUB20]], 1
+// CHECK2-NEXT:    [[MUL22:%.*]] = mul nsw i64 1, [[DIV21]]
+// CHECK2-NEXT:    [[MUL23:%.*]] = mul nsw i64 [[MUL22]], 4
+// CHECK2-NEXT:    [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[MUL23]]
+// CHECK2-NEXT:    [[MUL25:%.*]] = mul nsw i64 [[DIV24]], 3
+// CHECK2-NEXT:    [[ADD26:%.*]] = add nsw i64 7, [[MUL25]]
+// CHECK2-NEXT:    [[CONV:%.*]] = trunc i64 [[ADD26]] to i32
+// CHECK2-NEXT:    store i32 [[CONV]], ptr [[K15]], align 4
+// CHECK2-NEXT:    [[TMP20:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB27:%.*]] = sub nsw i64 [[TMP22]], 0
+// CHECK2-NEXT:    [[DIV28:%.*]] = sdiv i64 [[SUB27]], 1
+// CHECK2-NEXT:    [[MUL29:%.*]] = mul nsw i64 1, [[DIV28]]
+// CHECK2-NEXT:    [[MUL30:%.*]] = mul nsw i64 [[MUL29]], 4
+// CHECK2-NEXT:    [[DIV31:%.*]] = sdiv i64 [[TMP21]], [[MUL30]]
+// CHECK2-NEXT:    [[TMP23:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB32:%.*]] = sub nsw i64 [[TMP23]], 0
+// CHECK2-NEXT:    [[DIV33:%.*]] = sdiv i64 [[SUB32]], 1
+// CHECK2-NEXT:    [[MUL34:%.*]] = mul nsw i64 1, [[DIV33]]
+// CHECK2-NEXT:    [[MUL35:%.*]] = mul nsw i64 [[MUL34]], 4
+// CHECK2-NEXT:    [[MUL36:%.*]] = mul nsw i64 [[DIV31]], [[MUL35]]
+// CHECK2-NEXT:    [[SUB37:%.*]] = sub nsw i64 [[TMP20]], [[MUL36]]
+// CHECK2-NEXT:    [[DIV38:%.*]] = sdiv i64 [[SUB37]], 4
+// CHECK2-NEXT:    [[MUL39:%.*]] = mul nsw i64 [[DIV38]], 1
+// CHECK2-NEXT:    [[ADD40:%.*]] = add nsw i64 0, [[MUL39]]
+// CHECK2-NEXT:    store i64 [[ADD40]], ptr [[DOTFORWARD_IV___BEGIN316]], align 8
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB41:%.*]] = sub nsw i64 [[TMP26]], 0
+// CHECK2-NEXT:    [[DIV42:%.*]] = sdiv i64 [[SUB41]], 1
+// CHECK2-NEXT:    [[MUL43:%.*]] = mul nsw i64 1, [[DIV42]]
+// CHECK2-NEXT:    [[MUL44:%.*]] = mul nsw i64 [[MUL43]], 4
+// CHECK2-NEXT:    [[DIV45:%.*]] = sdiv i64 [[TMP25]], [[MUL44]]
+// CHECK2-NEXT:    [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB46:%.*]] = sub nsw i64 [[TMP27]], 0
+// CHECK2-NEXT:    [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1
+// CHECK2-NEXT:    [[MUL48:%.*]] = mul nsw i64 1, [[DIV47]]
+// CHECK2-NEXT:    [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 4
+// CHECK2-NEXT:    [[MUL50:%.*]] = mul nsw i64 [[DIV45]], [[MUL49]]
+// CHECK2-NEXT:    [[SUB51:%.*]] = sub nsw i64 [[TMP24]], [[MUL50]]
+// CHECK2-NEXT:    [[TMP28:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP29:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[TMP30:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB52:%.*]] = sub nsw i64 [[TMP30]], 0
+// CHECK2-NEXT:    [[DIV53:%.*]] = sdiv i64 [[SUB52]], 1
+// CHECK2-NEXT:    [[MUL54:%.*]] = mul nsw i64 1, [[DIV53]]
+// CHECK2-NEXT:    [[MUL55:%.*]] = mul nsw i64 [[MUL54]], 4
+// CHECK2-NEXT:    [[DIV56:%.*]] = sdiv i64 [[TMP29]], [[MUL55]]
+// CHECK2-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT:    [[SUB57:%.*]] = sub nsw i64 [[TMP31]], 0
+// CHECK2-NEXT:    [[DIV58:%.*]] = sdiv i64 [[SUB57]], 1
+// CHECK2-NEXT:    [[MUL59:%.*]] = mul nsw i64 1, [[DIV58]]
+// CHECK2-NEXT:    [[MUL60:%.*]] = mul nsw i64 [[MUL59]], 4
+// CHECK2-NEXT:    [[MUL61:%.*]] = mul nsw i64 [[DIV56]], [[MUL60]]
+// CHECK2-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[TMP28]], [[MUL61]]
+// CHECK2-NEXT:    [[DIV63:%.*]] = sdiv i64 [[SUB62]], 4
+// CHECK2-NEXT:    [[MUL64:%.*]] = mul nsw i64 [[DIV63]], 4
+// CHECK2-NEXT:    [[SUB65:%.*]] = sub nsw i64 [[SUB51]], [[MUL64]]
+// CHECK2-NEXT:    [[MUL66:%.*]] = mul nsw i64 [[SUB65]], 3
+// CHECK2-NEXT:    [[ADD67:%.*]] = add nsw i64 7, [[MUL66]]
+// CHECK2-NEXT:    [[CONV68:%.*]] = trunc i64 [[ADD67]] to i32
+// CHECK2-NEXT:    store i32 [[CONV68]], ptr [[J17]], align 4
+// CHECK2-NEXT:    [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8
+// CHECK2-NEXT:    [[ADD69:%.*]] = add nsw i64 [[TMP32]], 1
+// CHECK2-NEXT:    [[SUB70:%.*]] = sub nsw i64 [[ADD69]], 1
+// CHECK2-NEXT:    [[TMP33:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN316]], align 8
+// CHECK2-NEXT:    [[SUB71:%.*]] = sub nsw i64 [[SUB70]], [[TMP33]]
+// CHECK2-NEXT:    store i64 [[SUB71]], ptr [[DOTREVERSED_IV___BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN3]], align 8
+// CHECK2-NEXT:    [[MUL72:%.*]] = mul nsw i64 [[TMP35]], 1
+// CHECK2-NEXT:    [[ADD_PTR73:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[MUL72]]
+// CHECK2-NEXT:    store ptr [[ADD_PTR73]], ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[__BEGIN3]], align 8
+// CHECK2-NEXT:    store ptr [[TMP36]], ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP37:%.*]] = load i32, ptr [[K15]], align 4
+// CHECK2-NEXT:    [[TMP38:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT:    [[TMP40:%.*]] = load double, ptr [[TMP39]], align 8
+// CHECK2-NEXT:    [[TMP41:%.*]] = load i32, ptr [[J17]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP37]], i32 noundef [[TMP38]], double noundef [[TMP40]], i32 noundef [[TMP41]])
+// CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2:       omp.body.continue:
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2:       omp.inner.for.inc:
+// CHECK2-NEXT:    [[TMP42:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    [[ADD74:%.*]] = add nsw i64 [[TMP42]], 1
+// CHECK2-NEXT:    store i64 [[ADD74]], ptr [[DOTOMP_IV]], align 8
+// CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2:       omp.inner.for.end:
+// CHECK2-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2:       omp.loop.exit:
+// CHECK2-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK2:       omp.precond.end:
+// CHECK2-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@tfoo5
+// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    call void @_Z4foo5IiTnT_Li3EEvS0_S0_(i32 noundef 0, i32 noundef 42)
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_Z4foo5IiTnT_Li3EEvS0_S0_
+// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK2-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 3
+// CHECK2-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 3
+// CHECK2-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[ADD5:%.*]] = add i32 [[TMP6]], 1
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP5]], [[ADD5]]
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT:    [[ADD6:%.*]] = add i32 [[TMP7]], 1
+// CHECK2-NEXT:    [[SUB7:%.*]] = sub i32 [[ADD6]], 1
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP8]]
+// CHECK2-NEXT:    store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 3
+// CHECK2-NEXT:    [[ADD9:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD9]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP11]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add i32 [[TMP12]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_reverse_codegen.cpp
+// CHECK2-SAME: () #[[ATTR0]] section ".text.startup" {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    call void @__cxx_global_var_init()
+// CHECK2-NEXT:    ret void
+
diff --git a/clang/test/OpenMP/reverse_messages.cpp b/clang/test/OpenMP/reverse_messages.cpp
new file mode 100644
index 0000000000000..9636a70bf2753
--- /dev/null
+++ b/clang/test/OpenMP/reverse_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s
+
+void func() {
+
+  // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}}
+  #pragma omp reverse
+    ;
+
+  // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}}
+  #pragma omp reverse
+  int b = 0;
+
+  // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}}
+  #pragma omp reverse
+  #pragma omp for
+  for (int i = 0; i < 7; ++i)
+    ;
+
+  {
+    // expected-error@+2 {{expected statement}}
+    #pragma omp reverse
+  }
+
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', '>=', or '!=') of loop variable 'i'}}
+  #pragma omp reverse
+  for (int i = 0; i/3<7; ++i)
+    ;
+
+  // expected-error@+1 {{unexpected OpenMP clause 'sizes' in directive '#pragma omp reverse'}}
+  #pragma omp reverse sizes(5)
+  for (int i = 0; i < 7; ++i)
+    ;
+
+  // expected-warning@+1 {{extra tokens at the end of '#pragma omp reverse' are ignored}}
+  #pragma omp reverse foo
+  for (int i = 0; i < 7; ++i)
+    ;
+
+}
+
diff --git a/clang/test/OpenMP/target_defaultmap_messages.cpp b/clang/test/OpenMP/target_defaultmap_messages.cpp
index 8052e34d5c933..88ae3b7962d55 100644
--- a/clang/test/OpenMP/target_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -verify=expected,omp51 -Wuninitialized -DOMP51
-// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -30,23 +33,23 @@ template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
   #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} 
   foo();
 #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap (scalar: // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target defaultmap (scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp51-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
+  #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
 
   foo();
 
@@ -93,23 +96,23 @@ T tmain(T argc, S **argv) {
 int main(int argc, char **argv) {
 #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(scalar: // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap(scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp51-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
+#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
   foo();
 
 #ifdef OMP5
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
index 7f5592841fa68..ec26cf2186285 100644
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -122,8 +122,40 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG58]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG64:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META72:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META73:![0-9]+]], !DIExpression(), [[META72]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META74:![0-9]+]], !DIExpression(), [[META72]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META75:![0-9]+]], !DIExpression(), [[META72]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META76:![0-9]+]], !DIExpression(), [[META72]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG77]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR3:[0-9]+]], !dbg [[DBG77]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG77]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG64:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG78:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -140,83 +172,83 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META72:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META85:![0-9]+]], !DIExpression(), [[META86:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META73:![0-9]+]], !DIExpression(), [[META72]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META87:![0-9]+]], !DIExpression(), [[META86]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META74:![0-9]+]], !DIExpression(), [[META75:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META88:![0-9]+]], !DIExpression(), [[META89:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META76:![0-9]+]], !DIExpression(), [[META77:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META90:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META78:![0-9]+]], !DIExpression(), [[META79:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META92:![0-9]+]], !DIExpression(), [[META93:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META80:![0-9]+]], !DIExpression(), [[META81:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG82:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[B3]], [[META83:![0-9]+]], !DIExpression(), [[META72]])
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG82]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META84:![0-9]+]], !DIExpression(), [[META87:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG88:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG88]]
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG88]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[META87]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META89:![0-9]+]], !DIExpression(), [[META90:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META90]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META91:![0-9]+]], !DIExpression(), [[META92:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG93:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG93]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[META92]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META94:![0-9]+]], !DIExpression(), [[META95:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META95]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG97:![0-9]+]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG97]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG97]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG99:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG100:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG100]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG100]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG100]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG103]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG105]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG105]]
-// CHECK1-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG108]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG108]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG108]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG110]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG110]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG110]]
-// CHECK1-NEXT:    [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG110]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG110]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG110]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG111:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META94:![0-9]+]], !DIExpression(), [[META95:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG96:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG96]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG96]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[B3]], [[META97:![0-9]+]], !DIExpression(), [[META86]])
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG96]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META98:![0-9]+]], !DIExpression(), [[META101:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG102]]
+// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG102]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[META101]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META103:![0-9]+]], !DIExpression(), [[META104:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META104]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META105:![0-9]+]], !DIExpression(), [[META106:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG107]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[META106]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META108:![0-9]+]], !DIExpression(), [[META109:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META109]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG111]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG111]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG113:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG114]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG116:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG117]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG118:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG117]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG117]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG117]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG119]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG119]]
+// CHECK1-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG121:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG124:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG124]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG124]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG124]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG124]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG124]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP7]], align 1, !dbg [[DBG124]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG125:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG112:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG126:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -225,61 +257,29 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META119:![0-9]+]], !DIExpression(), [[META120:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META129:![0-9]+]], !DIExpression(), [[META130:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META121:![0-9]+]], !DIExpression(), [[META120]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META122:![0-9]+]], !DIExpression(), [[META120]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META123:![0-9]+]], !DIExpression(), [[META120]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META124:![0-9]+]], !DIExpression(), [[META120]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META125:![0-9]+]], !DIExpression(), [[META120]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG126:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG126]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG126]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3:[0-9]+]], !dbg [[DBG126]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG126]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5:[0-9]+]] !dbg [[DBG127:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META130:![0-9]+]], !DIExpression(), [[META131:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META131:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META132:![0-9]+]], !DIExpression(), [[META131]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META132:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META131]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META134:![0-9]+]], !DIExpression(), [[META131]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META134:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META135:![0-9]+]], !DIExpression(), [[META131]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META135:![0-9]+]], !DIExpression(), [[META130]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR3]], !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG136]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG136]]
 //
 //
@@ -342,8 +342,41 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG152]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG158:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META159:![0-9]+]], !DIExpression(), [[META160:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META161:![0-9]+]], !DIExpression(), [[META160]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META162:![0-9]+]], !DIExpression(), [[META160]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META163:![0-9]+]], !DIExpression(), [[META160]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META164:![0-9]+]], !DIExpression(), [[META160]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG165:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG165]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG165]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG165]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG165]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG158:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG166:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -359,74 +392,74 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META161:![0-9]+]], !DIExpression(), [[META162:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META169:![0-9]+]], !DIExpression(), [[META170:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META163:![0-9]+]], !DIExpression(), [[META162]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META171:![0-9]+]], !DIExpression(), [[META170]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META164:![0-9]+]], !DIExpression(), [[META165:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META172:![0-9]+]], !DIExpression(), [[META173:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META166:![0-9]+]], !DIExpression(), [[META167:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META174:![0-9]+]], !DIExpression(), [[META175:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META168:![0-9]+]], !DIExpression(), [[META169:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META176:![0-9]+]], !DIExpression(), [[META177:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META170:![0-9]+]], !DIExpression(), [[META171:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG172:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG172]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG172]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG172]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG172]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META173:![0-9]+]], !DIExpression(), [[META175:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG176:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG176]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG176]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[META175]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META177:![0-9]+]], !DIExpression(), [[META178:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META178]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META179:![0-9]+]], !DIExpression(), [[META180:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[META180]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META182:![0-9]+]], !DIExpression(), [[META183:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META183]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG184:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG185:![0-9]+]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG186:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG185]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG187:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG188:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG188]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG189:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG188]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG188]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG191:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG191]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG192:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG191]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG191]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG191]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG193]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG193]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG195:![0-9]+]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG196:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG196]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG196]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG197:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG198:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META178:![0-9]+]], !DIExpression(), [[META179:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG180:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG180]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG180]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG180]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META181:![0-9]+]], !DIExpression(), [[META183:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG184:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG184]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[META183]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META185:![0-9]+]], !DIExpression(), [[META186:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META186]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META187:![0-9]+]], !DIExpression(), [[META188:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG189:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG189]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[META188]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META190:![0-9]+]], !DIExpression(), [[META191:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META191]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG192:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG193]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG193]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG195:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG196:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG196]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG197:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG196]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG196]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG198:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG199]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG200:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG199]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG199]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG199]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG201:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG201]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG201]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG203:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG204:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG204]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG204]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG205:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG206:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG199:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG207:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -435,63 +468,30 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META200:![0-9]+]], !DIExpression(), [[META201:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META208:![0-9]+]], !DIExpression(), [[META209:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META202:![0-9]+]], !DIExpression(), [[META201]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META210:![0-9]+]], !DIExpression(), [[META209]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META203:![0-9]+]], !DIExpression(), [[META201]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META211:![0-9]+]], !DIExpression(), [[META209]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META204:![0-9]+]], !DIExpression(), [[META201]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META212:![0-9]+]], !DIExpression(), [[META209]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META205:![0-9]+]], !DIExpression(), [[META201]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META213:![0-9]+]], !DIExpression(), [[META209]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META206:![0-9]+]], !DIExpression(), [[META201]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG207:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG207]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG207]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG207]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG208:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META209:![0-9]+]], !DIExpression(), [[META210:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META211:![0-9]+]], !DIExpression(), [[META210]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META212:![0-9]+]], !DIExpression(), [[META210]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META213:![0-9]+]], !DIExpression(), [[META210]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META214:![0-9]+]], !DIExpression(), [[META210]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META214:![0-9]+]], !DIExpression(), [[META209]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG215]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG215]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG215]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG215]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG215]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG215]]
 //
 //
@@ -554,8 +554,43 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG231]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG237:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META240:![0-9]+]], !DIExpression(), [[META241:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META242:![0-9]+]], !DIExpression(), [[META241]])
+// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META243:![0-9]+]], !DIExpression(), [[META241]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META244:![0-9]+]], !DIExpression(), [[META241]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META245:![0-9]+]], !DIExpression(), [[META241]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG246:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG246]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG246]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG246]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG246]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG237:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG247:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -572,82 +607,82 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META240:![0-9]+]], !DIExpression(), [[META241:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META250:![0-9]+]], !DIExpression(), [[META251:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META242:![0-9]+]], !DIExpression(), [[META241]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META252:![0-9]+]], !DIExpression(), [[META251]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META243:![0-9]+]], !DIExpression(), [[META244:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META253:![0-9]+]], !DIExpression(), [[META254:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META245:![0-9]+]], !DIExpression(), [[META246:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META255:![0-9]+]], !DIExpression(), [[META256:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META247:![0-9]+]], !DIExpression(), [[META248:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META257:![0-9]+]], !DIExpression(), [[META258:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META249:![0-9]+]], !DIExpression(), [[META250:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG251:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG251]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG251]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG251]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG251]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META252:![0-9]+]], !DIExpression(), [[META254:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG255:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG255]]
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG255]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[META254]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META256:![0-9]+]], !DIExpression(), [[META257:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META257]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META258:![0-9]+]], !DIExpression(), [[META259:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG260:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG260]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[META259]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META261:![0-9]+]], !DIExpression(), [[META262:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META262]]
-// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG263:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG264:![0-9]+]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG265:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG264]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG264]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG266:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG267:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG267]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG268:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG267]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG267]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG269:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG270:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG270]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG271:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG270]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG270]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG270]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG272:![0-9]+]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG273:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG272]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG272]]
-// CHECK1-NEXT:    store i32 [[TMP15]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG274:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG275:![0-9]+]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG276:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG277:![0-9]+]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG277]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG278:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META259:![0-9]+]], !DIExpression(), [[META260:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG261:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG261]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG261]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG261]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG261]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG261]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META262:![0-9]+]], !DIExpression(), [[META264:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG265:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG265]]
+// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG265]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[META264]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META266:![0-9]+]], !DIExpression(), [[META267:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META267]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META268:![0-9]+]], !DIExpression(), [[META269:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG270]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[META269]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META271:![0-9]+]], !DIExpression(), [[META272:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META272]]
+// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG273:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG274:![0-9]+]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG275:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG274]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG274]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG276:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG277:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG277]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG278:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG277]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG277]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG279:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG280:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG280]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG281:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG280]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG280]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG280]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG282:![0-9]+]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG283:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG282]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG282]]
+// CHECK1-NEXT:    store i32 [[TMP15]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG284:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG285:![0-9]+]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG286:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG287:![0-9]+]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP11]], align 1, !dbg [[DBG287]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG288:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG279:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG289:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -656,66 +691,31 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META282:![0-9]+]], !DIExpression(), [[META283:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META292:![0-9]+]], !DIExpression(), [[META293:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META284:![0-9]+]], !DIExpression(), [[META283]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META285:![0-9]+]], !DIExpression(), [[META283]])
-// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META286:![0-9]+]], !DIExpression(), [[META283]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META287:![0-9]+]], !DIExpression(), [[META283]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META288:![0-9]+]], !DIExpression(), [[META283]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG289:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG289]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG289]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG289]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG289]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG290:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META293:![0-9]+]], !DIExpression(), [[META294:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META294:![0-9]+]], !DIExpression(), [[META293]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META295:![0-9]+]], !DIExpression(), [[META294]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META295:![0-9]+]], !DIExpression(), [[META293]])
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META296:![0-9]+]], !DIExpression(), [[META294]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META296:![0-9]+]], !DIExpression(), [[META293]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META297:![0-9]+]], !DIExpression(), [[META294]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META297:![0-9]+]], !DIExpression(), [[META293]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META298:![0-9]+]], !DIExpression(), [[META294]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META298:![0-9]+]], !DIExpression(), [[META293]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG299]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG299]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG299]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG299]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG299]]
 //
diff --git a/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
index 4c226e9ca2aef..3c628c4f3e172 100644
--- a/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized -Wno-vla -DOMP51
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized -Wno-vla -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -Wno-vla -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -Wno-vla -DOMP5
@@ -28,19 +31,19 @@ void foo() {
 
 template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
-#pragma omp target parallel defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target parallel defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
@@ -90,19 +93,19 @@ T tmain(T argc, S **argv) {
 }
 
 int main(int argc, char **argv) {
-#pragma omp target parallel defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target parallel defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
diff --git a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
index e27cc0d536269..5addc4928918f 100644
--- a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
@@ -78,8 +78,8 @@ int main() {
 // CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META46:![0-9]+]], !DIExpression(), [[META47:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
 // CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META48:![0-9]+]], !DIExpression(), [[META49:![0-9]+]])
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
 // CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META50:![0-9]+]], !DIExpression(), [[META51:![0-9]+]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]]
@@ -110,8 +110,8 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]]
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG55]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG58:![0-9]+]]
@@ -119,8 +119,45 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG52]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META66:![0-9]+]], !DIExpression(), [[META67:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META68:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META69:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META70:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META72:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG73]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[LOADEDV]]) #[[ATTR3:[0-9]+]], !dbg [[DBG73]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG73]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG74:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -144,149 +181,149 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META66:![0-9]+]], !DIExpression(), [[META67:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META81:![0-9]+]], !DIExpression(), [[META82:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META68:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META83:![0-9]+]], !DIExpression(), [[META82]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META69:![0-9]+]], !DIExpression(), [[META70:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META84:![0-9]+]], !DIExpression(), [[META85:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META72:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META86:![0-9]+]], !DIExpression(), [[META87:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META73:![0-9]+]], !DIExpression(), [[META74:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META88:![0-9]+]], !DIExpression(), [[META89:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META75:![0-9]+]], !DIExpression(), [[META76:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META78:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META79:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META81:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META82:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META83:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[B4]], [[META84:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG77]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META85:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG86:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META90:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META93:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META94:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META96:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META97:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META98:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[B4]], [[META99:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG92]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META100:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG80]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG95]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG95]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG80]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG95]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG95]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG80]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG77]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG95]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG92]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG77]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG92]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG87:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG87]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG87]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META88:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG92:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG92]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG92]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META91]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META93:![0-9]+]], !DIExpression(), [[META94:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META94]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META95:![0-9]+]], !DIExpression(), [[META96:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG97:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG97]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META96]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META98:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META99]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG101]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG101]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG104]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG104]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG104]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG109]]
-// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG111:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG112:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG113:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG112]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG114:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG114]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG114]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG115:![0-9]+]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG102]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG102]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META103:![0-9]+]], !DIExpression(), [[META106:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG107]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META106]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META108:![0-9]+]], !DIExpression(), [[META109:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META109]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META110:![0-9]+]], !DIExpression(), [[META111:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG112:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG112]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META111]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META113:![0-9]+]], !DIExpression(), [[META114:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META114]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG116:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG116]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG116]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG118:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG119]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG119]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG119]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG121:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG124:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG125:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG124]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG124]]
+// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG126:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG127:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG127]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG127]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG127]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG129:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG129]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG129]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP7]], align 1, !dbg [[DBG129]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG130:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG86]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG101]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP116:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP131:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG86]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG101]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP118:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP133:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG117:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG119:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG132:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG134:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG120:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG135:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -295,66 +332,29 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META127:![0-9]+]], !DIExpression(), [[META128:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META138:![0-9]+]], !DIExpression(), [[META139:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META129:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META130:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META131:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META132:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG134]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3:[0-9]+]], !dbg [[DBG134]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG134]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR5:[0-9]+]] !dbg [[DBG135:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META138:![0-9]+]], !DIExpression(), [[META139:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META140:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META140:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META141:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META141:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META142:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META142:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META143:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META143:![0-9]+]], !DIExpression(), [[META139]])
-// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META144:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META144:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG145]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR3]], !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG145]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG145]]
 //
 //
@@ -417,8 +417,41 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG161]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG167:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META170:![0-9]+]], !DIExpression(), [[META171:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META172:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META174:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG176]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG176]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG167:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG177:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -441,140 +474,140 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META170:![0-9]+]], !DIExpression(), [[META171:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META180:![0-9]+]], !DIExpression(), [[META181:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META172:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META182:![0-9]+]], !DIExpression(), [[META181]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META174:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META183:![0-9]+]], !DIExpression(), [[META184:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META176:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META185:![0-9]+]], !DIExpression(), [[META186:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META177:![0-9]+]], !DIExpression(), [[META178:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META187:![0-9]+]], !DIExpression(), [[META188:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META179:![0-9]+]], !DIExpression(), [[META180:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META182:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META183:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META185:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META186:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META187:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META188:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG189:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META189:![0-9]+]], !DIExpression(), [[META190:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG191:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META192:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META193:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META195:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META196:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META197:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META198:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG184]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG194]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG194]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG184]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG194]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG194]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG184]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG181]]
-// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG194]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG191]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG181]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG191]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG190]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG190]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META191:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG194]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG194]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[META193]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META195:![0-9]+]], !DIExpression(), [[META196:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META196]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META197:![0-9]+]], !DIExpression(), [[META198:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG199:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG199]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[META198]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META200:![0-9]+]], !DIExpression(), [[META201:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META201]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG203:![0-9]+]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG204:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG203]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG203]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG205:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG206:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG206]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG206]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG206]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG208:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG209:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG210:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG209]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG211]]
-// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG211]]
-// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG213:![0-9]+]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG214:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG214]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG214]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG215:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG216:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG200:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG200]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG200]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META201:![0-9]+]], !DIExpression(), [[META203:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG204:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG204]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG204]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[META203]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META205:![0-9]+]], !DIExpression(), [[META206:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META206]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META207:![0-9]+]], !DIExpression(), [[META208:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG209:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG209]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[META208]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META210:![0-9]+]], !DIExpression(), [[META211:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META211]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG213:![0-9]+]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG214:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG213]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG213]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG215:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG216:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG216]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG217:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG216]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG216]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG218:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG219:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG220:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG219]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG221:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG222:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG221]]
+// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG221]]
+// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG223:![0-9]+]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG224:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG224]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG224]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG225:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG226:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG189]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG199]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP217:![0-9]+]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP227:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG189]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG199]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP219:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP229:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG218:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG220:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG228:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG230:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG221:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG231:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -583,63 +616,30 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META222:![0-9]+]], !DIExpression(), [[META223:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META232:![0-9]+]], !DIExpression(), [[META233:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META224:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META225:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META226:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META227:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META228:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG229]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG229]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG230:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META233:![0-9]+]], !DIExpression(), [[META234:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META234:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META235:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META235:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META236:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META236:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META237:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META237:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META238:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META238:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG239]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG239]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG239]]
 //
 //
@@ -702,8 +702,43 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG255]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG261:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META264:![0-9]+]], !DIExpression(), [[META265:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META266:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META267:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META268:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META269:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG270]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG270]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG261:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG271:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -727,148 +762,148 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META264:![0-9]+]], !DIExpression(), [[META265:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META274:![0-9]+]], !DIExpression(), [[META275:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META266:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META276:![0-9]+]], !DIExpression(), [[META275]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META267:![0-9]+]], !DIExpression(), [[META268:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META277:![0-9]+]], !DIExpression(), [[META278:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META269:![0-9]+]], !DIExpression(), [[META270:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META279:![0-9]+]], !DIExpression(), [[META280:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META271:![0-9]+]], !DIExpression(), [[META272:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META281:![0-9]+]], !DIExpression(), [[META282:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META273:![0-9]+]], !DIExpression(), [[META274:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META276:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META277:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META279:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META280:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META281:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META282:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META283:![0-9]+]], !DIExpression(), [[META284:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG285:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META286:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META287:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META289:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META290:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META291:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META292:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG293:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG278]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG278]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG288]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG288]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG278]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG288]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG278]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG288]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG278]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG275]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG288]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG285]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG275]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG285]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META285:![0-9]+]], !DIExpression(), [[META287:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META287]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META289:![0-9]+]], !DIExpression(), [[META290:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META290]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META291:![0-9]+]], !DIExpression(), [[META292:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META292]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META294:![0-9]+]], !DIExpression(), [[META295:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META295]]
-// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG298:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG297]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG297]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG299:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG300:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG300]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG301:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG300]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG300]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG302:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG303:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG304:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG303]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG305:![0-9]+]]
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG305]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG305]]
-// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG307:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG308:![0-9]+]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG309:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG308]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG310:![0-9]+]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG310]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG311:![0-9]+]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG294:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG294]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG294]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META295:![0-9]+]], !DIExpression(), [[META297:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG298:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG298]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG298]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META297]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META299:![0-9]+]], !DIExpression(), [[META300:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META300]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META301:![0-9]+]], !DIExpression(), [[META302:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG303:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG303]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META302]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META304:![0-9]+]], !DIExpression(), [[META305:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META305]]
+// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG307:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG308:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG307]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG307]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG309:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG310:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG310]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG311:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG310]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG310]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG312:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG313:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG314:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG313]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG315:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG316:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG315]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG315]]
+// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG317:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG318:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG319:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG318]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG320:![0-9]+]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP11]], align 1, !dbg [[DBG320]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG321:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG283]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG293]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP312:![0-9]+]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP322:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG283]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG293]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP314:![0-9]+]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP324:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG313:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG315:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG323:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG325:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG316:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG326:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -877,66 +912,31 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META319:![0-9]+]], !DIExpression(), [[META320:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META329:![0-9]+]], !DIExpression(), [[META330:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META321:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META322:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META323:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META324:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META325:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG326]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG326]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG327:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META330:![0-9]+]], !DIExpression(), [[META331:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META331:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META332:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META332:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META333:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META333:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META334:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META334:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META335:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META335:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG336]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG336]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG336]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG336]]
 //
diff --git a/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
index 8b58bb9111c81..b142ae1733a3c 100644
--- a/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp51-error {{expected ')'}} omp5-note {{to match this '('}} omp51-note {{to match this '('}}
+#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5x-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
   #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp51-error {{expected ')'}} omp5-note {{to match this '('}} omp51-note {{to match this '('}}
+#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5x-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (scalar: // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (scalar: // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
index 7d72547d24e1e..46f78290cba4b 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
   #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
index ea0ef01f8161c..26a01c75f951f 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
@@ -78,8 +78,8 @@ int main() {
 // CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META46:![0-9]+]], !DIExpression(), [[META47:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
 // CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META48:![0-9]+]], !DIExpression(), [[META49:![0-9]+]])
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
 // CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META50:![0-9]+]], !DIExpression(), [[META51:![0-9]+]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]]
@@ -110,8 +110,8 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]]
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG55]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG58:![0-9]+]]
@@ -119,8 +119,45 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG52]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META66:![0-9]+]], !DIExpression(), [[META67:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META68:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META69:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META70:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META72:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG73]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG73]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[LOADEDV]]) #[[ATTR3:[0-9]+]], !dbg [[DBG73]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG73]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG74:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -144,149 +181,149 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META66:![0-9]+]], !DIExpression(), [[META67:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META81:![0-9]+]], !DIExpression(), [[META82:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META68:![0-9]+]], !DIExpression(), [[META67]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META83:![0-9]+]], !DIExpression(), [[META82]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META69:![0-9]+]], !DIExpression(), [[META70:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META84:![0-9]+]], !DIExpression(), [[META85:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META71:![0-9]+]], !DIExpression(), [[META72:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META86:![0-9]+]], !DIExpression(), [[META87:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META73:![0-9]+]], !DIExpression(), [[META74:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META88:![0-9]+]], !DIExpression(), [[META89:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META75:![0-9]+]], !DIExpression(), [[META76:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG77]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META78:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META79:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META81:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META82:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META83:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[B4]], [[META84:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG77]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META85:![0-9]+]], !DIExpression(), [[META67]])
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG86:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META90:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META93:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META94:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META96:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META97:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META98:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[B4]], [[META99:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG92]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META100:![0-9]+]], !DIExpression(), [[META82]])
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG80]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG95]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG95]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG80]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG95]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG95]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG80]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG77]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG95]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG92]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG77]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG92]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG92]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG87:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG87]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG87]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META88:![0-9]+]], !DIExpression(), [[META91:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG92:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG92]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG92]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META91]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META93:![0-9]+]], !DIExpression(), [[META94:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META94]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META95:![0-9]+]], !DIExpression(), [[META96:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG97:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG97]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META96]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META98:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META99]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG101]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG101]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG104]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG104]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG104]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG109]]
-// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG111:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG112:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG113:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG112]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG114:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG114]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG114]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG115:![0-9]+]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG102]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG102]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META103:![0-9]+]], !DIExpression(), [[META106:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG107]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META106]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META108:![0-9]+]], !DIExpression(), [[META109:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META109]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META110:![0-9]+]], !DIExpression(), [[META111:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG112:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG112]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META111]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META113:![0-9]+]], !DIExpression(), [[META114:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META114]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG116:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG116]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG116]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG118:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG119]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG119]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG119]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG121:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG122]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG122]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG124:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG125:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG124]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG124]]
+// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG126:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG127:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG127]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG127]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG127]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG129:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG129]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG129]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG129]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP7]], align 1, !dbg [[DBG129]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG130:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG86]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG101]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP116:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP131:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG86]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG101]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
-// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG77]]
-// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG77]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP118:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG92]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG92]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP133:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG117:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG119:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG132:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG134:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG120:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG135:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -295,66 +332,29 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META127:![0-9]+]], !DIExpression(), [[META128:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META138:![0-9]+]], !DIExpression(), [[META139:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META129:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META130:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META131:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META132:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META133:![0-9]+]], !DIExpression(), [[META128]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG134]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG134]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3:[0-9]+]], !dbg [[DBG134]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG134]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR5:[0-9]+]] !dbg [[DBG135:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META138:![0-9]+]], !DIExpression(), [[META139:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META140:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META140:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META141:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META141:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META142:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META142:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META143:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META143:![0-9]+]], !DIExpression(), [[META139]])
-// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTCAPTURE_EXPR__ADDR]], [[META144:![0-9]+]], !DIExpression(), [[META139]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META144:![0-9]+]], !DIExpression(), [[META139]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG145]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG145]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR3]], !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG145]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG145]]
 //
 //
@@ -417,8 +417,41 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG161]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG167:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META170:![0-9]+]], !DIExpression(), [[META171:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META172:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META174:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG176]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG176]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG176]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG167:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG177:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -441,140 +474,140 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META170:![0-9]+]], !DIExpression(), [[META171:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META180:![0-9]+]], !DIExpression(), [[META181:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META172:![0-9]+]], !DIExpression(), [[META171]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META182:![0-9]+]], !DIExpression(), [[META181]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META174:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META183:![0-9]+]], !DIExpression(), [[META184:![0-9]+]])
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META176:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META185:![0-9]+]], !DIExpression(), [[META186:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META177:![0-9]+]], !DIExpression(), [[META178:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META187:![0-9]+]], !DIExpression(), [[META188:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META179:![0-9]+]], !DIExpression(), [[META180:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG181]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META182:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META183:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META185:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META186:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META187:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META188:![0-9]+]], !DIExpression(), [[META171]])
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG189:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META189:![0-9]+]], !DIExpression(), [[META190:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG191:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG191]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META192:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META193:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META195:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META196:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META197:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META198:![0-9]+]], !DIExpression(), [[META181]])
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG184]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG194]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG194]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG184]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG194]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG194]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG184]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG181]]
-// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG194]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG191]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG181]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG191]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG191]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG190]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG190]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META191:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG194]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG194]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[META193]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META195:![0-9]+]], !DIExpression(), [[META196:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META196]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META197:![0-9]+]], !DIExpression(), [[META198:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG199:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG199]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[META198]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META200:![0-9]+]], !DIExpression(), [[META201:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META201]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG203:![0-9]+]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG204:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG203]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG203]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG205:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG206:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG206]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG206]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG206]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG208:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG209:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG210:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG209]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG209]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG211]]
-// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG211]]
-// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG213:![0-9]+]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG214:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG214]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG214]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG215:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG216:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG200:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG200]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG200]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META201:![0-9]+]], !DIExpression(), [[META203:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG204:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG204]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG204]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[META203]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META205:![0-9]+]], !DIExpression(), [[META206:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[META206]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META207:![0-9]+]], !DIExpression(), [[META208:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG209:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG209]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[META208]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META210:![0-9]+]], !DIExpression(), [[META211:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META211]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG213:![0-9]+]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG214:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG213]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG213]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG215:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG216:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG216]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG217:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG216]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG216]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG218:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG219:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG220:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG219]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG219]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG221:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG222:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG221]]
+// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG221]]
+// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG223:![0-9]+]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG224:![0-9]+]]
+// CHECK1-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG224]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[LOADEDV]] to i32, !dbg [[DBG224]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG225:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG226:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG189]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG199]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP217:![0-9]+]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP227:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG189]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG199]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG181]]
-// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG181]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP219:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG191]]
+// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP229:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG218:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG220:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG228:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG230:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG221:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG231:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -583,63 +616,30 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META222:![0-9]+]], !DIExpression(), [[META223:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META232:![0-9]+]], !DIExpression(), [[META233:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META224:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META225:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META226:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META227:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META228:![0-9]+]], !DIExpression(), [[META223]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG229]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG229]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG229]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG230:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META233:![0-9]+]], !DIExpression(), [[META234:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META234:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META235:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META235:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META236:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META236:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META237:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META237:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META238:![0-9]+]], !DIExpression(), [[META234]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META238:![0-9]+]], !DIExpression(), [[META233]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG239]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG239]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR3]], !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG239]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG239]]
 //
 //
@@ -702,8 +702,43 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG255]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG261:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META264:![0-9]+]], !DIExpression(), [[META265:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META266:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META267:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META268:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META269:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG270]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG270]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG270]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG261:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG271:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -727,148 +762,148 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META264:![0-9]+]], !DIExpression(), [[META265:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META274:![0-9]+]], !DIExpression(), [[META275:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META266:![0-9]+]], !DIExpression(), [[META265]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META276:![0-9]+]], !DIExpression(), [[META275]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META267:![0-9]+]], !DIExpression(), [[META268:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META277:![0-9]+]], !DIExpression(), [[META278:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META269:![0-9]+]], !DIExpression(), [[META270:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META279:![0-9]+]], !DIExpression(), [[META280:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META271:![0-9]+]], !DIExpression(), [[META272:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META281:![0-9]+]], !DIExpression(), [[META282:![0-9]+]])
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META273:![0-9]+]], !DIExpression(), [[META274:![0-9]+]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META276:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META277:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META279:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META280:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META281:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META282:![0-9]+]], !DIExpression(), [[META265]])
-// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META283:![0-9]+]], !DIExpression(), [[META284:![0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG285:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG285]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IV]], [[META286:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_LB]], [[META287:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288:![0-9]+]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_UB]], [[META289:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_STRIDE]], [[META290:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTOMP_IS_LAST]], [[META291:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[I]], [[META292:![0-9]+]], !DIExpression(), [[META275]])
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG293:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG278]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG278]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG288]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG288]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG278]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG288]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG278]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG288]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG278]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG275]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG288]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG285]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG275]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG275]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG285]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG285]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META285:![0-9]+]], !DIExpression(), [[META287:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META287]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META289:![0-9]+]], !DIExpression(), [[META290:![0-9]+]])
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META290]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META291:![0-9]+]], !DIExpression(), [[META292:![0-9]+]])
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META292]]
-// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META294:![0-9]+]], !DIExpression(), [[META295:![0-9]+]])
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META295]]
-// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG298:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG297]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG297]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG299:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG300:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG300]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG301:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG300]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG300]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG302:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG303:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG304:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG303]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG303]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG305:![0-9]+]]
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG305]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG305]]
-// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG307:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG308:![0-9]+]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG309:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG308]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG308]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG310:![0-9]+]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG310]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG311:![0-9]+]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG294:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG294]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG294]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[F]], [[META295:![0-9]+]], !DIExpression(), [[META297:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG298:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG298]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG298]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[META297]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[G]], [[META299:![0-9]+]], !DIExpression(), [[META300:![0-9]+]])
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[META300]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[H]], [[META301:![0-9]+]], !DIExpression(), [[META302:![0-9]+]])
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG303:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG303]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[META302]]
+// CHECK1-NEXT:      #dbg_declare(ptr [[D]], [[META304:![0-9]+]], !DIExpression(), [[META305:![0-9]+]])
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[META305]]
+// CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG307:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG308:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG307]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG307]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG309:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG310:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG310]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG311:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG310]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG310]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG312:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG313:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG314:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG313]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG313]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG315:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG316:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG315]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG315]]
+// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG317:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG318:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG319:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG318]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG318]]
+// CHECK1-NEXT:    [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG320:![0-9]+]]
+// CHECK1-NEXT:    store i8 [[STOREDV]], ptr [[TMP11]], align 1, !dbg [[DBG320]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG321:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG283]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG293]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP312:![0-9]+]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP322:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG283]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG293]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG275]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG275]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP314:![0-9]+]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG285]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG285]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP324:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG313:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG315:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG323:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG325:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG316:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG326:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -877,66 +912,31 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META319:![0-9]+]], !DIExpression(), [[META320:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTGLOBAL_TID__ADDR]], [[META329:![0-9]+]], !DIExpression(), [[META330:![0-9]+]])
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META321:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META322:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META323:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META324:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META325:![0-9]+]], !DIExpression(), [[META320]])
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG326]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG326]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG326]]
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG327:![0-9]+]] {
-// CHECK1-NEXT:  entry:
-// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[DYN_PTR_ADDR]], [[META330:![0-9]+]], !DIExpression(), [[META331:![0-9]+]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[DOTBOUND_TID__ADDR]], [[META331:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META332:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[C_ADDR]], [[META332:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META333:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[A_ADDR]], [[META333:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META334:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[B_ADDR]], [[META334:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META335:![0-9]+]], !DIExpression(), [[META331]])
+// CHECK1-NEXT:      #dbg_declare(ptr [[BB_ADDR]], [[META335:![0-9]+]], !DIExpression(), [[META330]])
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG336]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR3]], !dbg [[DBG336]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG336]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG336]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG336]]
 //
diff --git a/clang/test/OpenMP/target_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
index 0fa86cc7279ad..2169681e05749 100644
--- a/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,24 +18,24 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
@@ -46,28 +49,28 @@ int main(int argc, char **argv) {
 #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
diff --git a/clang/test/OpenMP/target_teams_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
index 206a1e4f18af2..97ed5315c4c8e 100644
--- a/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -14,21 +17,21 @@ template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
   #pragma omp target teams defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   foo();
 #pragma omp target teams defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
@@ -39,21 +42,21 @@ T tmain(T argc, S **argv) {
 int main(int argc, char **argv) {
   #pragma omp target teams defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   foo();
 #pragma omp target teams defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
index 84d5b198c2878..3e3392d94af98 100644
--- a/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
 #pragma omp target teams distribute defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
index 0bb519be21740..b789c39dfad6b 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
 #pragma omp target teams distribute parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
index 0eeca55a9dcd2..ab3ea1b283d3b 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,22 +18,22 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
@@ -44,27 +47,27 @@ int main(int argc, char **argv) {
 #pragma omp target teams distribute parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
diff --git a/clang/test/PCH/aarch64-sve-types.c b/clang/test/PCH/aarch64-sve-types.c
index 4c4549af0b6d6..249618c3a9708 100644
--- a/clang/test/PCH/aarch64-sve-types.c
+++ b/clang/test/PCH/aarch64-sve-types.c
@@ -1,6 +1,8 @@
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-pch -o %t %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -include-pch %t \
 // RUN:   -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple aarch64-linux-gnu \
+// RUN:   -x hip-cpp-output -emit-pch -o %t %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/PCH/pragma-loop.cpp b/clang/test/PCH/pragma-loop.cpp
index a3c6871041c0e..b02383c1485d5 100644
--- a/clang/test/PCH/pragma-loop.cpp
+++ b/clang/test/PCH/pragma-loop.cpp
@@ -18,9 +18,9 @@
 // CHECK: #pragma nounroll{{$}}
 // CHECK: #pragma clang loop vectorize_width(V)
 // CHECK: #pragma clang loop interleave_count(I)
-// CHECK: #pragma omp simd
-// CHECK: #pragma omp for
-// CHECK: #pragma omp distribute
+// CHECK: #pragma omp loop bind(thread)
+// CHECK: #pragma omp loop bind(parallel)
+// CHECK: #pragma omp loop bind(teams)
 
 #ifndef HEADER
 #define HEADER
diff --git a/clang/test/Parser/cxx-template-decl.cpp b/clang/test/Parser/cxx-template-decl.cpp
index 734438069b9ae..476341686a64b 100644
--- a/clang/test/Parser/cxx-template-decl.cpp
+++ b/clang/test/Parser/cxx-template-decl.cpp
@@ -297,3 +297,7 @@ namespace PR46231 {
   template<> int; // expected-error {{declaration does not declare anything}}
   template<int> int; // expected-error {{declaration does not declare anything}}
 }
+
+namespace PR99933 {
+  void foo() { template <typename> int i; } // expected-error {{templates can only be declared in namespace or class scope}}
+}
diff --git a/clang/test/Parser/namelookup-anonymous-struct.c b/clang/test/Parser/namelookup-anonymous-struct.c
new file mode 100644
index 0000000000000..cb691c22f97ff
--- /dev/null
+++ b/clang/test/Parser/namelookup-anonymous-struct.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c11 -verify %s
+
+struct GH31295 {
+  struct { int x; };
+  int arr[sizeof(x)]; // expected-error{{use of undeclared identifier 'x'}}
+};
diff --git a/clang/test/Preprocessor/Inputs/big_char.txt b/clang/test/Preprocessor/Inputs/big_char.txt
new file mode 100644
index 0000000000000..ce542efaa5124
--- /dev/null
+++ b/clang/test/Preprocessor/Inputs/big_char.txt
@@ -0,0 +1 @@
+�
\ No newline at end of file
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 71cc36acf3f0e..87bd3e142d2c4 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -236,6 +236,15 @@
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s
 // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1
 
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2p1 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p1 %s
+// CHECK-SVE2p1: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
+// CHECK-SVE2p1: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
+// CHECK-SVE2p1: __ARM_FEATURE_SVE2 1
+// CHECK-SVE2p1: __ARM_FEATURE_SVE2p1 1
+// CHECK-SVE2p1: __ARM_NEON 1
+// CHECK-SVE2p1: __ARM_NEON_FP 0xE
+// CHECK-SVE2p1: __ARM_NEON_SVE_BRIDGE 1
+
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.4a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s
 // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1
@@ -694,3 +703,9 @@
 // CHECK-SME2: __ARM_FEATURE_LOCALLY_STREAMING 1
 // CHECK-SME2: __ARM_FEATURE_SME 1
 // CHECK-SME2: __ARM_FEATURE_SME2 1
+
+// RUN: %clang --target=aarch64 -march=armv9-a+sme2p1 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p1 %s
+// CHECK-SME2p1: __ARM_FEATURE_LOCALLY_STREAMING 1
+// CHECK-SME2p1: __ARM_FEATURE_SME 1
+// CHECK-SME2p1: __ARM_FEATURE_SME2 1
+// CHECK-SME2p1: __ARM_FEATURE_SME2p1 1
diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp
index 2cf14d8d6a15d..5baab9b59a9c1 100644
--- a/clang/test/Preprocessor/embed_codegen.cpp
+++ b/clang/test/Preprocessor/embed_codegen.cpp
@@ -14,9 +14,9 @@ int ca[] = {
 };
 
 // CHECK: %arrayinit.element = getelementptr inbounds i32, ptr %notca, i64 1
-// CHECK: store i8 106, ptr %arrayinit.element, align 4
+// CHECK: store i32 106, ptr %arrayinit.element, align 4
 // CHECK: %arrayinit.element1 = getelementptr inbounds i32, ptr %notca, i64 2
-// CHECK: store i8 107, ptr %arrayinit.element1, align 4
+// CHECK: store i32 107, ptr %arrayinit.element1, align 4
 int notca[] = {
 a
 #embed <jk.txt> prefix(,)
@@ -75,9 +75,9 @@ constexpr struct T t[] = {
 // CHECK:  %arrayinit.element7 = getelementptr inbounds %struct.T, ptr %tnonc, i64 1
 // CHECK:  call void @llvm.memset.p0.i64(ptr align 4 %arrayinit.element7, i8 0, i64 20, i1 false)
 // CHECK:  %arr8 = getelementptr inbounds %struct.T, ptr %arrayinit.element7, i32 0, i32 0
-// CHECK:  store i8 106, ptr %arr8, align 4
+// CHECK:  store i32 106, ptr %arr8, align 4
 // CHECK:  %arrayinit.element9 = getelementptr inbounds i32, ptr %arr8, i64 1
-// CHECK:  store i8 107, ptr %arrayinit.element9, align 4
+// CHECK:  store i32 107, ptr %arrayinit.element9, align 4
 struct T tnonc[] = {
   a, 300, 1, 2, 3
 #embed <jk.txt> prefix(,)
diff --git a/clang/test/Preprocessor/embed_search_paths.c b/clang/test/Preprocessor/embed_search_paths.c
new file mode 100644
index 0000000000000..5cc1bbf9f87a9
--- /dev/null
+++ b/clang/test/Preprocessor/embed_search_paths.c
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -std=c23 %s -E -verify --embed-dir=%S --embed-dir=%S/Inputs
+// expected-no-diagnostics
+
+#embed <jk.txt>
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index cc73a88e5a657..9a984e40d4aa2 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -4,6 +4,8 @@
 // RUN: printf "\0" > %t/null_byte.bin
 // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%t -verify=expected,cxx -Wno-c23-extensions
 // RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only --embed-dir=%t -verify=expected,c
+// RUN: %clang_cc1 %s -fsyntax-only -fexperimental-new-constant-interpreter --embed-dir=%t -verify=expected,cxx -Wno-c23-extensions
+// RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only -fexperimental-new-constant-interpreter --embed-dir=%t -verify=expected,c
 #embed <media/empty>
 ;
 
@@ -115,3 +117,22 @@ void f1() {
   };
 }
 #endif
+
+static_assert(_Generic(
+#embed __FILE__ limit(1)
+  , int : 1, default : 0));
+
+static_assert(alignof(typeof(
+#embed __FILE__ limit(1)
+)) == alignof(int));
+
+struct HasChar {
+  signed char ch;
+};
+
+constexpr struct HasChar c = {
+#embed "Inputs/big_char.txt" // cxx-error {{constant expression evaluates to 255 which cannot be narrowed to type 'signed char'}} \
+                                cxx-note {{insert an explicit cast to silence this issue}} \
+                                c-error {{constexpr initializer evaluates to 255 which is not exactly representable in type 'signed char'}}
+
+};
diff --git a/clang/test/Preprocessor/has_builtin_cpuid.c b/clang/test/Preprocessor/has_builtin_cpuid.c
index 35ef65ecdd9b9..e69f912ce688d 100644
--- a/clang/test/Preprocessor/has_builtin_cpuid.c
+++ b/clang/test/Preprocessor/has_builtin_cpuid.c
@@ -2,14 +2,29 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-- -DX86 -verify %s
 // RUN: %clang_cc1 -fsyntax-only -triple powerpc64-unknown-linux-gnu -DPPC \
 // RUN:   -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple riscv32-unknown-linux-gnu -DRISCV \
+// RUN:   -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple riscv64-unknown-linux-gnu -DRISCV \
+// RUN:   -verify %s
 // expected-no-diagnostics
 #if __has_builtin(__builtin_cpu_is)
-# ifdef ARM
-#   error "ARM shouldn't have __builtin_cpu_is"
+# if defined(ARM) || defined(RISCV)
+#   error "ARM/RISCV shouldn't have __builtin_cpu_is"
 # endif
 #endif
+
 #if __has_builtin(__builtin_cpu_init)
 # if defined(ARM) || defined(PPC)
 #   error "ARM/PPC shouldn't have __builtin_cpu_init"
 # endif
+#else
+# ifdef RISCV
+#   error "RISCV should have __builtin_cpu_init"
+# endif
+#endif
+
+#if !__has_builtin(__builtin_cpu_supports)
+# if defined(ARM) || defined(X86) || defined(RISCV)
+#   error "ARM/X86/RISCV should have __builtin_cpu_supports"
+# endif
 #endif
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 182f904b76592..771d56ffb1c1b 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -798,24 +798,51 @@
 // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float
 // LA64-FPU0-LP64S: #define __loongarch_soft_float 1
 
-/// Check __loongarch_arch and __loongarch_tune.
+/// Check __loongarch_arch{_tune/_frecipe}.
 
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la464 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 -mtune=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \
+// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
+// FRECIPE: #define __loongarch_frecipe 1
 // ARCH-TUNE: #define __loongarch_tune "[[TUNE]]"
 
 // RUN: %clang --target=loongarch64 -mlsx -x c -E -dM %s -o - \
@@ -824,6 +851,8 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mlsx -mno-lasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
@@ -832,12 +861,12 @@
 
 // RUN: %clang --target=loongarch64 -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
 // MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
@@ -850,8 +879,6 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index 42e5232824de7..56164beb913d5 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -632,6 +632,27 @@
 // PPCPOWER10:#define __PCREL__ 1
 // PPCPOWER10-NOT:#define __ROP_PROTECT__ 1
 //
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr11 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER11 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power11 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER11 %s
+//
+// PPCPOWER11:#define _ARCH_PPC 1
+// PPCPOWER11:#define _ARCH_PPC64 1
+// PPCPOWER11:#define _ARCH_PPCGR 1
+// PPCPOWER11:#define _ARCH_PPCSQ 1
+// PPCPOWER11:#define _ARCH_PWR10 1
+// PPCPOWER11:#define _ARCH_PWR11 1
+// PPCPOWER11:#define _ARCH_PWR4 1
+// PPCPOWER11:#define _ARCH_PWR5 1
+// PPCPOWER11:#define _ARCH_PWR5X 1
+// PPCPOWER11:#define _ARCH_PWR6 1
+// PPCPOWER11-NOT:#define _ARCH_PWR6X 1
+// PPCPOWER11:#define _ARCH_PWR7 1
+// PPCPOWER11:#define _ARCH_PWR8 1
+// PPCPOWER11:#define _ARCH_PWR9 1
+// PPCPOWER11:#define __MMA__ 1
+// PPCPOWER11:#define __PCREL__ 1
+// PPCPOWER11-NOT:#define __ROP_PROTECT__ 1
+//
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu future -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCFUTURE %s
 //
 // PPCFUTURE:#define _ARCH_PPC 1
@@ -639,6 +660,7 @@
 // PPCFUTURE:#define _ARCH_PPCGR 1
 // PPCFUTURE:#define _ARCH_PPCSQ 1
 // PPCFUTURE:#define _ARCH_PWR10 1
+// PPCFUTURE:#define _ARCH_PWR11 1
 // PPCFUTURE:#define _ARCH_PWR4 1
 // PPCFUTURE:#define _ARCH_PWR5 1
 // PPCFUTURE:#define _ARCH_PWR5X 1
diff --git a/clang/test/Preprocessor/pragma_mc_func.c b/clang/test/Preprocessor/pragma_mc_func.c
new file mode 100644
index 0000000000000..bf12f7107ff5c
--- /dev/null
+++ b/clang/test/Preprocessor/pragma_mc_func.c
@@ -0,0 +1,25 @@
+// RUN: not %clang --target=powerpc64-ibm-aix -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: not %clang --target=powerpc64-ibm-aix -ferr-pragma-mc-func-aix -fsyntax-only \
+// RUN:   %s 2>&1 | FileCheck %s
+// RUN: not %clang --target=powerpc64-ibm-aix -fno-err-pragma-mc-func-aix \
+// RUN:   -ferr-pragma-mc-func-aix -fsyntax-only %s 2>&1 | FileCheck %s
+#pragma mc_func asm_barrier {"60000000"}
+
+// CHECK:  error: #pragma mc_func is not supported
+
+// Cases where no errors occur.
+// RUN: %clang --target=powerpc64-ibm-aix -fno-err-pragma-mc-func-aix -fsyntax-only %s
+// RUN: %clang --target=powerpc64-ibm-aix -ferr-pragma-mc-func-aix -fsyntax-only \
+// RUN:    -fno-err-pragma-mc-func-aix %s
+// RUN: %clang --target=powerpc64-ibm-aix -Werror=unknown-pragmas \
+// RUN:   -fno-err-pragma-mc-func-aix -fsyntax-only %s
+
+// Cases on a non-AIX target.
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu \
+// RUN:   -Werror=unknown-pragmas -fno-err-pragma-mc-func-aix -fsyntax-only %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=UNUSED %s
+// RUN: %clang --target=powerpc64le-unknown-linux-gnu \
+// RUN:   -fno-err-pragma-mc-func-aix -fsyntax-only %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=UNUSED %s
+
+// UNUSED: clang: warning: argument unused during compilation: '-fno-err-pragma-mc-func-aix' [-Wunused-command-line-argument]
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index ebe192ecf2549..317bb7c0dc01c 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -70,7 +70,7 @@
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-MATH-ERRNO
 // CHECK-NO-MATH-ERRNO: #define __NO_MATH_ERRNO__ 1
 //
-// RUN: %clang_cc1 %s -E -dM -ffinite-math-only -o - \
+// RUN: %clang_cc1 %s -E -dM -menable-no-nans -menable-no-infs -o - \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-ONLY
 // CHECK-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 1
 //
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index 1330ad10b4b47..14059f827b94c 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -2,25 +2,31 @@
 //// For example, -fptrauth-init-fini will not affect codegen without -fptrauth-calls, but the preprocessor feature would be set anyway.
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-intrinsics | \
-// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-returns | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-address-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-type-info-vtable-pointer-discrimination | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,TYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-function-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,FUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,FUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-init-fini | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,INITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,INITFINI,NOGOTOS
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-indirect-gotos | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,GOTOS
 
 #if __has_feature(ptrauth_intrinsics)
 // INTRIN: has_ptrauth_intrinsics
@@ -71,6 +77,14 @@ void has_ptrauth_vtable_pointer_type_discrimination() {}
 void no_ptrauth_vtable_pointer_type_discrimination() {}
 #endif
 
+#if __has_feature(ptrauth_type_info_vtable_pointer_discrimination)
+// TYPE_INFO_DISCR: has_ptrauth_type_info_vtable_pointer_discrimination
+void has_ptrauth_type_info_vtable_pointer_discrimination() {}
+#else
+// NOTYPE_INFO_DISCR: no_ptrauth_type_info_vtable_pointer_discrimination
+void no_ptrauth_type_info_vtable_pointer_discrimination() {}
+#endif
+
 #if __has_feature(ptrauth_function_pointer_type_discrimination)
 // FUNC: has_ptrauth_function_pointer_type_discrimination
 void has_ptrauth_function_pointer_type_discrimination() {}
@@ -86,3 +100,11 @@ void has_ptrauth_init_fini() {}
 // NOINITFINI: no_ptrauth_init_fini
 void no_ptrauth_init_fini() {}
 #endif
+
+#if __has_feature(ptrauth_indirect_gotos)
+// GOTOS: has_ptrauth_indirect_gotos
+void has_ptrauth_indirect_gotos() {}
+#else
+// NOGOTOS: no_ptrauth_indirect_gotos
+void no_ptrauth_indirect_gotos() {}
+#endif
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index fd718a126aaa7..72131108cb5f6 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -86,7 +86,6 @@
 // CHECK-NOT: __riscv_za64rs {{.*$}}
 // CHECK-NOT: __riscv_zaamo {{.*$}}
 // CHECK-NOT: __riscv_zabha {{.*$}}
-// CHECK-NOT: __riscv_zacas {{.*$}}
 // CHECK-NOT: __riscv_zalrsc {{.*$}}
 // CHECK-NOT: __riscv_zama16b {{.*$}}
 // CHECK-NOT: __riscv_zawrs {{.*$}}
@@ -182,6 +181,7 @@
 // CHECK-NOT: __riscv_sspm{{.*$}}
 // CHECK-NOT: __riscv_ssqosid{{.*$}}
 // CHECK-NOT: __riscv_supm{{.*$}}
+// CHECK-NOT: __riscv_zacas {{.*$}}
 // CHECK-NOT: __riscv_zalasr {{.*$}}
 // CHECK-NOT: __riscv_zfbfmin {{.*$}}
 // CHECK-NOT: __riscv_zicfilp {{.*$}}
@@ -747,14 +747,6 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZABHA-EXT %s
 // CHECK-ZABHA-EXT: __riscv_zabha 1000000{{$}}
 
-// RUN: %clang --target=riscv32 \
-// RUN:   -march=rv32ia_zacas1p0 -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
-// RUN: %clang --target=riscv64 \
-// RUN:   -march=rv64ia_zacas1p0 -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
-// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}
-
 // RUN: %clang --target=riscv32 \
 // RUN:   -march=rv32i_zalrsc1p0 -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZALRSC-EXT %s
@@ -1626,6 +1618,14 @@
 // CHECK-ZVKT-EXT: __riscv_zvkt 1000000{{$}}
 
 // Experimental extensions
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32ia_zacas1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64ia_zacas1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
+// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}
+
 // RUN: %clang --target=riscv32 -menable-experimental-extensions \
 // RUN:   -march=rv32i_zalasr0p1 -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZALASR-EXT %s
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 5c0b815c8ae6f..5d510cb4667f4 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -512,11 +512,11 @@
 
 // NOHRESET-NOT: #define __HRESET__ 1
 
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -muintr -x c -E -dM -o - %s | FileCheck -check-prefix=UINTR %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -muintr -x c -E -dM -o - %s | FileCheck -check-prefix=UINTR %s
 
 // UINTR: #define __UINTR__ 1
 
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
 
 // NOUINTR-NOT: #define __UINTR__ 1
 
@@ -755,7 +755,7 @@
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=zu -x c -E -dM -o - %s | FileCheck --check-prefix=ZU %s
-// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,CF,APXF %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,CF,ZU,APXF %s
 // APXF: #define __APX_F__ 1
 // CCMP: #define __CCMP__ 1
 // CF: #define __CF__ 1
diff --git a/clang/test/Sema/aarch64-fmv-streaming.c b/clang/test/Sema/aarch64-fmv-streaming.c
new file mode 100644
index 0000000000000..93b7656216c0c
--- /dev/null
+++ b/clang/test/Sema/aarch64-fmv-streaming.c
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming(void) __arm_streaming {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming(void) __arm_streaming {}
+__attribute__((target_version("default"))) void ok_arm_streaming(void) __arm_streaming {}
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__attribute__((target_version("default"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void ok_no_streaming(void) {}
+__attribute__((target_version("sme2"))) void ok_no_streaming(void) {}
+__attribute__((target_version("default"))) void ok_no_streaming(void) {}
+
+__attribute__((target_clones("sve", "simd"))) void bad_mixed_streaming(void) {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("sme2"))) void bad_mixed_streaming(void) __arm_streaming {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("default"))) void bad_mixed_streaming(void) __arm_streaming_compatible {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__arm_locally_streaming __attribute__((target_version("dotprod"))) void bad_mixed_streaming(void) __arm_streaming {}
+
+void n_caller(void) {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
+
+void s_caller(void) __arm_streaming {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
+
+void sc_caller(void) __arm_streaming_compatible {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
diff --git a/clang/test/Sema/aarch64-neon-without-target-feature.cpp b/clang/test/Sema/aarch64-neon-without-target-feature.cpp
new file mode 100644
index 0000000000000..0831eb7c754a7
--- /dev/null
+++ b/clang/test/Sema/aarch64-neon-without-target-feature.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +dotprod  -target-feature +fullfp16 -target-feature +fp16fml -target-feature +i8mm -target-feature +bf16 -verify -emit-llvm -o - %s
+
+// REQUIRES: aarch64-registered-target
+
+// This test is testing the diagnostics that Clang emits when compiling without '+neon'.
+
+#include <arm_neon.h>
+
+void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, float64x2_t v2f64, bfloat16x4_t v4bf16, __bf16 bf16, poly64_t poly64, poly64x2_t poly64x2) {
+  // dotprod
+  vdot_u32(v2i32, v8i8, v8i8); // expected-error {{always_inline function 'vdot_u32' requires target feature 'neon'}}
+  vdot_laneq_u32(v2i32, v8i8, v16i8, 1); // expected-error {{always_inline function 'vdot_u32' requires target feature 'neon'}} expected-error {{'__builtin_neon_splat_laneq_v' needs target feature neon}}
+  // fp16
+  vceqz_f16(v4f16); // expected-error {{always_inline function 'vceqz_f16' requires target feature 'neon'}}
+  vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'neon'}}
+  vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'neon'}}
+  vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'neon'}}
+  // fp16fml depends on fp-armv8
+  vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'neon'}}
+  // i8mm
+  vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'neon'}}
+  vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'neon'}} expected-error {{'__builtin_neon_splat_laneq_v' needs target feature neon}}
+  // bf16
+  vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'neon'}}
+  vcreate_bf16(10);
+  vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16,neon}}
+  vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'neon'}}
+  vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16,neon}}
+  vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'neon'}}
+  vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'neon'}}
+  vmull_p64(poly64, poly64);  // expected-error {{always_inline function 'vmull_p64' requires target feature 'neon'}}
+  vmull_high_p64(poly64x2, poly64x2);  // expected-error {{always_inline function 'vmull_high_p64' requires target feature 'neon'}}
+  vtrn1_s8(v8i8, v8i8); // expected-error {{always_inline function 'vtrn1_s8' requires target feature 'neon'}}
+  vqabsq_s16(v8i16); // expected-error {{always_inline function 'vqabsq_s16' requires target feature 'neon'}}
+  vbslq_s16(v8i16, v8i16, v8i16);// expected-error {{always_inline function 'vbslq_s16' requires target feature 'neon'}}
+}
diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 6db39d6a71e36..0c263eb2610cf 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -455,48 +455,6 @@ void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")
   share_zt0_only();
 }
 
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_version("sme2")))
-void cannot_work_version(void) __arm_streaming {}
-// expected-cpp-error@+5 {{function declared 'void ()' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
-// expected-cpp-note@-2 {{previous declaration is here}}
-// expected-error@+3 {{function declared 'void (void)' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
-// expected-note@-4 {{previous declaration is here}}
-__attribute__((target_version("default")))
-void cannot_work_version(void) {}
-
-
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_clones("sme2")))
-void cannot_work_clones(void) __arm_streaming {}
-
-
-__attribute__((target("sme2")))
-void just_fine_streaming(void) __arm_streaming {}
-__attribute__((target_version("sme2")))
-void just_fine(void) { just_fine_streaming(); }
-__attribute__((target_version("default")))
-void just_fine(void) {}
-
-
-__arm_locally_streaming
-__attribute__((target_version("sme2")))
-void incompatible_locally_streaming(void) {}
-// expected-error@-1 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-// expected-cpp-error@-2 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-__attribute__((target_version("default")))
-void incompatible_locally_streaming(void) {}
-
-
-void fmv_caller() {
-    cannot_work_version();
-    cannot_work_clones();
-    just_fine();
-    incompatible_locally_streaming();
-}
-
 void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { }
 
 __SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; }
diff --git a/clang/test/Sema/alignas.c b/clang/test/Sema/alignas.c
index 020eff6a141c0..391553bc540ec 100644
--- a/clang/test/Sema/alignas.c
+++ b/clang/test/Sema/alignas.c
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -Dalignof=__alignof %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -Dalignof=_Alignof -DUSING_C11_SYNTAX %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 -DUSING_C11_SYNTAX %s
 
 _Alignas(3) int align_illegal; //expected-error {{requested alignment is not a power of 2}}
 _Alignas(int) char align_big;
@@ -18,12 +19,24 @@ void f(_Alignas(1) char c) { // expected-error {{'_Alignas' attribute cannot be
 }
 
 #ifdef USING_C11_SYNTAX
-// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}}
-// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}}
-// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}}
+// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}}
+// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}}
+// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}}
 #endif
 _Static_assert(alignof(align_big) == alignof(int), "k's alignment is wrong");
 _Static_assert(alignof(align_small) == 1, "j's alignment is wrong");
 _Static_assert(alignof(align_multiple) == 8, "l's alignment is wrong");
 _Static_assert(alignof(struct align_member) == 8, "quuux's alignment is wrong");
 _Static_assert(sizeof(struct align_member) == 8, "quuux's size is wrong");
+
+struct GH95032_1 {
+  _Alignas(16) char bytes[16];
+};
+_Static_assert(_Alignof(struct GH95032_1) == 16, "");
+
+#if __STDC_VERSION__ >= 202311L
+struct GH95032_2 {
+  alignas(16) char bytes[16];
+};
+static_assert(alignof(struct GH95032_2) == 16);
+#endif
diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c
index 9b82d82ff8269..2405f804d0da5 100644
--- a/clang/test/Sema/atomic-ops.c
+++ b/clang/test/Sema/atomic-ops.c
@@ -124,6 +124,31 @@ _Static_assert(__atomic_always_lock_free(4, &i64), "");
 _Static_assert(!__atomic_always_lock_free(8, &i32), "");
 _Static_assert(__atomic_always_lock_free(8, &i64), "");
 
+// Validate use with fake pointers constants. This mechanism is used to allow
+// validating atomicity of a given size and alignment.
+_Static_assert(__atomic_is_lock_free(1, (void*)1), "");
+_Static_assert(__atomic_is_lock_free(1, (void*)-1), "");
+_Static_assert(__atomic_is_lock_free(4, (void*)2), ""); // expected-error {{not an integral constant expression}}
+_Static_assert(__atomic_is_lock_free(4, (void*)-2), ""); // expected-error {{not an integral constant expression}}
+_Static_assert(__atomic_is_lock_free(4, (void*)4), "");
+_Static_assert(__atomic_is_lock_free(4, (void*)-4), "");
+
+_Static_assert(__atomic_always_lock_free(1, (void*)1), "");
+_Static_assert(__atomic_always_lock_free(1, (void*)-1), "");
+_Static_assert(!__atomic_always_lock_free(4, (void*)2), "");
+_Static_assert(!__atomic_always_lock_free(4, (void*)-2), "");
+_Static_assert(__atomic_always_lock_free(4, (void*)4), "");
+_Static_assert(__atomic_always_lock_free(4, (void*)-4), "");
+
+// Ensure that "weird" constants don't cause trouble.
+_Static_assert(__atomic_always_lock_free(1, "string"), "");
+_Static_assert(!__atomic_always_lock_free(2, "string"), "");
+_Static_assert(__atomic_always_lock_free(2, (int[2]){}), "");
+void dummyfn();
+_Static_assert(__atomic_always_lock_free(2, dummyfn) || 1, "");
+
+
+
 #define _AS1 __attribute__((address_space(1)))
 #define _AS2 __attribute__((address_space(2)))
 
diff --git a/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c
new file mode 100644
index 0000000000000..7d9c9a90880ff
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fsyntax-only -fexperimental-bounds-safety -verify %s
+//
+// This is a portion of the `attr-counted-by-vla.c` test but is checked
+// under the semantics of `-fexperimental-bounds-safety` which has different
+// behavior.
+
+#define __counted_by(f)  __attribute__((counted_by(f)))
+
+struct has_unannotated_VLA {
+  int count;
+  char buffer[];
+};
+
+struct has_annotated_VLA {
+  int count;
+  char buffer[] __counted_by(count);
+};
+
+struct buffer_of_structs_with_unnannotated_vla {
+  int count;
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member}}
+  struct has_unannotated_VLA Arr[] __counted_by(count);
+};
+
+
+struct buffer_of_structs_with_annotated_vla {
+  int count;
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member}}
+  struct has_annotated_VLA Arr[] __counted_by(count);
+};
+
+struct buffer_of_const_structs_with_annotated_vla {
+  int count;
+  // Make sure the `const` qualifier is printed when printing the element type.
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member}}
+  const struct has_annotated_VLA Arr[] __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-format.c b/clang/test/Sema/attr-format.c
index 1f4c864d4f78b..5a8b1ac9eca5c 100644
--- a/clang/test/Sema/attr-format.c
+++ b/clang/test/Sema/attr-format.c
@@ -99,3 +99,10 @@ void forward_fixed(const char *fmt, _Bool b, char i, short j, int k, float l, do
   a(fmt, b, i, j, k, l, m);
 }
 
+// OpenBSD
+// same as format(printf(...))...
+void a2(const char *a, ...) __attribute__((format(syslog, 1, 2)));    // no-error
+void b2(const char *a, ...) __attribute__((format(syslog, 1, 1)));    // expected-error {{'format' attribute parameter 3 is out of bounds}}
+void c2(const char *a, ...) __attribute__((format(syslog, 0, 2)));    // expected-error {{'format' attribute parameter 2 is out of bounds}}
+void d2(const char *a, int c) __attribute__((format(syslog, 1, 2)));  // expected-warning {{GCC requires a function with the 'format' attribute to be variadic}}
+void e2(char *str, int c, ...) __attribute__((format(syslog, 2, 3))); // expected-error {{format argument not a string type}}
diff --git a/clang/test/Sema/attr-ownership.c b/clang/test/Sema/attr-ownership.c
index 8157ba7145a24..d2e40538a40f0 100644
--- a/clang/test/Sema/attr-ownership.c
+++ b/clang/test/Sema/attr-ownership.c
@@ -18,9 +18,16 @@ void *f12(float i, int k, int f, int *j) __attribute__((ownership_returns(foo, 4
 void f13(int *i, int *j) __attribute__((ownership_holds(foo, 1))) __attribute__((ownership_takes(foo, 2)));
 void f14(int i, int j, int *k) __attribute__((ownership_holds(foo, 3))) __attribute__((ownership_takes(foo, 3)));  // expected-error {{'ownership_takes' and 'ownership_holds' attributes are not compatible}}
 
-void f15(int, int)
+void *f15(int, int)
   __attribute__((ownership_returns(foo, 1)))  // expected-error {{'ownership_returns' attribute index does not match; here it is 1}}
   __attribute__((ownership_returns(foo, 2))); // expected-note {{declared with index 2 here}}
 void f16(int *i, int *j) __attribute__((ownership_holds(foo, 1))) __attribute__((ownership_holds(foo, 1))); // OK, same index
 void f17(void*) __attribute__((ownership_takes(__, 1)));
 void f18() __attribute__((ownership_takes(foo, 1)));  // expected-warning {{'ownership_takes' attribute only applies to non-K&R-style functions}}
+
+int f19(void *)
+  __attribute__((ownership_takes(foo, 1)))    // expected-error {{'ownership_takes' attribute class does not match; here it is 'foo'}}
+  __attribute__((ownership_takes(foo1, 1)));  // expected-note {{declared with class 'foo1' here}}
+
+void f20(void) __attribute__((ownership_returns(foo)));  // expected-error {{'ownership_returns' attribute only applies to functions that return a pointer}}
+int f21(void) __attribute__((ownership_returns(foo)));  // expected-error {{'ownership_returns' attribute only applies to functions that return a pointer}}
diff --git a/clang/test/Sema/attr-ownership.cpp b/clang/test/Sema/attr-ownership.cpp
index 7381285e2da48..0626efa5aaf9a 100644
--- a/clang/test/Sema/attr-ownership.cpp
+++ b/clang/test/Sema/attr-ownership.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 %s -verify -fsyntax-only
 
 class C {
-  void f(int, int)
-      __attribute__((ownership_returns(foo, 2)))  // expected-error {{'ownership_returns' attribute index does not match; here it is 2}}
-      __attribute__((ownership_returns(foo, 3))); // expected-note {{declared with index 3 here}}
+  void *f(int, int)
+       __attribute__((ownership_returns(foo, 2)))  // expected-error {{'ownership_returns' attribute index does not match; here it is 2}}
+       __attribute__((ownership_returns(foo, 3))); // expected-note {{declared with index 3 here}}
 };
diff --git a/clang/test/Sema/builtin-cpu-supports.c b/clang/test/Sema/builtin-cpu-supports.c
index 51ee9661807f8..4dace25b42a97 100644
--- a/clang/test/Sema/builtin-cpu-supports.c
+++ b/clang/test/Sema/builtin-cpu-supports.c
@@ -1,5 +1,10 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux-gnu -verify %s
 // RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple riscv32-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple riscv64-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc64le-unknown-linux -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc64-unknown-aix7.2.0.0 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc-unknown-aix7.2.0.0 -verify %s
 
 extern void a(const char *);
 
@@ -26,7 +31,9 @@ int main(void) {
   (void)__builtin_cpu_supports("x86-64-v3");
   (void)__builtin_cpu_supports("x86-64-v4");
   (void)__builtin_cpu_supports("x86-64-v5"); // expected-warning {{invalid cpu feature string for builtin}}
-#else
+#endif
+
+#ifdef __aarch64__
   if (__builtin_cpu_supports("neon")) // expected-warning {{invalid cpu feature string for builtin}}
     a("vsx");
 
@@ -36,5 +43,21 @@ int main(void) {
   __builtin_cpu_init(); // expected-error {{builtin is not supported on this target}}
 #endif
 
+#ifdef __riscv
+  if (__builtin_cpu_supports("garbage")) // expected-warning {{invalid cpu feature string for builtin}}
+    a("vsx");
+#endif
+
+#ifdef __powerpc__
+  if (__builtin_cpu_is("garbage")) // expected-error {{invalid cpu name for builtin}}
+    a("vsx");
+
+  if (__builtin_cpu_is("power3")) // expected-error {{invalid cpu name for builtin}}
+    a("vsx");
+
+  if (__builtin_cpu_supports("garbage")) // expected-warning {{invalid cpu feature string for builtin}}
+    a("vsx");
+#endif
+
   return 0;
 }
diff --git a/clang/test/Sema/constant-builtins-2.c b/clang/test/Sema/constant-builtins-2.c
index 00767267cd6c2..37b63cf4f6b32 100644
--- a/clang/test/Sema/constant-builtins-2.c
+++ b/clang/test/Sema/constant-builtins-2.c
@@ -265,10 +265,8 @@ char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 :
 char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 char clz54[__builtin_clzg((unsigned __int128)0xf) == BITSIZE(__int128) - 4 ? 1 : -1];
 char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
-char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                             // expected-note@-1 {{shift count 127 >= width of type 'int' (32 bits)}}
-char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                 // expected-note@-1 {{shift count 127 >= width of type 'int' (32 bits)}}
+char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
+char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 #endif
 int clz58 = __builtin_clzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
 char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
@@ -276,10 +274,8 @@ char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) -
 char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
 char clz62[__builtin_clzg((unsigned _BitInt(128))0xf) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
 char clz63[__builtin_clzg((unsigned _BitInt(128))0xf, 42) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
-char clz64[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1))) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                     // expected-note@-1 {{shift count 127 >= width of type 'int' (32 bits)}}
-char clz65[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1)), 42) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                         // expected-note@-1 {{shift count 127 >= width of type 'int' (32 bits)}}
+char clz64[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
+char clz65[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 
 char ctz1[__builtin_ctz(1) == 0 ? 1 : -1];
 char ctz2[__builtin_ctz(8) == 3 ? 1 : -1];
diff --git a/clang/test/Sema/patchable-function-entry-attr.cpp b/clang/test/Sema/patchable-function-entry-attr.cpp
index 9134c851da588..bd4d57a7e3093 100644
--- a/clang/test/Sema/patchable-function-entry-attr.cpp
+++ b/clang/test/Sema/patchable-function-entry-attr.cpp
@@ -6,9 +6,14 @@
 // RUN: %clang_cc1 -triple loongarch64 -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple riscv32 -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple riscv64 -fsyntax-only -verify=silence %s
+// RUN: %clang_cc1 -triple powerpc-unknown-linux-gnu -fsyntax-only -verify=silence %s
+// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple ppc64le -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fsyntax-only -verify=AIX %s
+// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fsyntax-only -verify=AIX %s
 
 // silence-no-diagnostics
 
+// AIX-error@+2 {{'patchable_function_entry' attribute is not yet supported on AIX}}
 // expected-warning@+1 {{unknown attribute 'patchable_function_entry' ignored}}
 [[gnu::patchable_function_entry(0)]] void f();
diff --git a/clang/test/Sema/ptrauth-indirect-goto.c b/clang/test/Sema/ptrauth-indirect-goto.c
new file mode 100644
index 0000000000000..7304f5c30a117
--- /dev/null
+++ b/clang/test/Sema/ptrauth-indirect-goto.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple arm64e-apple-darwin -fsyntax-only -verify %s -fptrauth-indirect-gotos
+// RUN: %clang_cc1 -triple aarch64-linux-gnu   -fsyntax-only -verify %s -fptrauth-indirect-gotos
+
+int f() {
+  static void *addrs[] = { &&l1, &&l2 };
+
+  static int diffs[] = {
+    &&l1 - &&l1, // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
+    &&l1 - &&l2 // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
+  };
+
+  int diff_32 = &&l1 - &&l2; // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
+  goto *(&&l1 + diff_32); // expected-error{{addition of address-of-label expressions is not supported with ptrauth indirect gotos}}
+
+l1:
+  return 0;
+l2:
+  return 1;
+}
diff --git a/clang/test/Sema/ptrauth-intrinsics-macro.c b/clang/test/Sema/ptrauth-intrinsics-macro.c
index f76f677315dd3..adbb71a9d6e50 100644
--- a/clang/test/Sema/ptrauth-intrinsics-macro.c
+++ b/clang/test/Sema/ptrauth-intrinsics-macro.c
@@ -38,6 +38,11 @@ void test_string_discriminator(int *dp) {
   (void)t0;
 }
 
+void test_type_discriminator(int *dp) {
+  ptrauth_extra_data_t t0 = ptrauth_type_discriminator(int (*)(int));
+  (void)t0;
+}
+
 void test_sign_constant(int *dp) {
   dp = ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0);
 }
diff --git a/clang/test/Sema/switch.c b/clang/test/Sema/switch.c
index 69b34f96820d3..6e912d02d6cc7 100644
--- a/clang/test/Sema/switch.c
+++ b/clang/test/Sema/switch.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wswitch-enum -Wcovered-switch-default -triple x86_64-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wswitch-enum -Wcovered-switch-default -triple x86_64-linux-gnu %s -fexperimental-new-constant-interpreter
 void f (int z) { 
   while (z) { 
     default: z--;            // expected-error {{statement not in switch}}
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
index 03a432e05851d..357c9e5b64107 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
@@ -1,10 +1,11 @@
 // RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -std=c++23
 
 // RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
 
 // RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
 // RUN: %s -std=c++23
@@ -87,11 +88,15 @@ class numeric_limits<double>  {
 
 int compareit(float a, float b) {
   volatile int i, j, k, l, m, n, o, p;
-// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   i = a == INFINITY;
 
-// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY == a;
 
@@ -107,11 +112,15 @@ int compareit(float a, float b) {
 // no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = NAN == a;
 
-// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY <= a;
 
-// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY < a;
 
@@ -192,7 +201,9 @@ int compareit(float a, float b) {
 // no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   j = isunorderedf(a, NAN);
 
-// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = isunorderedf(a, INFINITY);
 
@@ -204,9 +215,11 @@ int compareit(float a, float b) {
 // no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   i = std::isunordered(a, NAN);
 
-// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
-// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
-// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+6 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+5 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning@+4 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   i = std::isunordered(a, INFINITY);
 
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
index 51f9d325619ba..ee4eb33a16e44 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
@@ -6,8 +6,9 @@
 // RUN: -menable-no-nans -std=c++23
 
 // RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
 
 // RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
 // RUN: %s -std=c++23
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp
index 60b8f3ddedcd1..d1266027bdd34 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp
@@ -21,3 +21,7 @@ MyIntPointer g() {
   MyIntOwner o;
   return o; // No warning, it is disabled.
 }
+
+void h(MyIntPointer p) {
+  p = MyIntOwner(); //  No warning, it is disabled.
+}
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index b3ca173c1fdbc..09dfb2b5d96a8 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -120,11 +120,13 @@ MyLongPointerFromConversion global2;
 
 void initLocalGslPtrWithTempOwner() {
   MyIntPointer p = MyIntOwner{}; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}}
-  p = MyIntOwner{}; // TODO ?
-  global = MyIntOwner{}; // TODO ?
+  MyIntPointer pp = p = MyIntOwner{}; // expected-warning {{object backing the pointer p will be}}
+  p = MyIntOwner{}; // expected-warning {{object backing the pointer p }}
+  pp = p; // no warning
+  global = MyIntOwner{}; // expected-warning {{object backing the pointer global }}
   MyLongPointerFromConversion p2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}}
-  p2 = MyLongOwnerWithConversion{}; // TODO ?
-  global2 = MyLongOwnerWithConversion{}; // TODO ?
+  p2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer p2 }}
+  global2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer global2 }}
 }
 
 namespace __gnu_cxx {
@@ -170,6 +172,7 @@ struct basic_string_view {
   basic_string_view(const T *);
   const T *begin() const;
 };
+using string_view = basic_string_view<char>;
 
 template<class _Mystr> struct iter {
     iter& operator-=(int);
@@ -188,7 +191,7 @@ struct basic_string {
   operator basic_string_view<T> () const;
   using const_iterator = iter<T>;
 };
-
+using string = basic_string<char>;
 
 template<typename T>
 struct unique_ptr {
@@ -346,6 +349,12 @@ void handleTernaryOperator(bool cond) {
     std::basic_string_view<char> v = cond ? def : ""; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}}
 }
 
+std::string operator+(std::string_view s1, std::string_view s2);
+void danglingStringviewAssignment(std::string_view a1, std::string_view a2) {
+  a1 = std::string(); // expected-warning {{object backing}}
+  a2 = a1 + a1; // expected-warning {{object backing}}
+}
+
 std::reference_wrapper<int> danglingPtrFromNonOwnerLocal() {
   int i = 5;
   return i; // TODO
diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c
index e055cbb70e9e5..33a963c15b00d 100644
--- a/clang/test/Sema/x86-builtin-palignr.c
+++ b/clang/test/Sema/x86-builtin-palignr.c
@@ -4,5 +4,5 @@
 #include <tmmintrin.h>
 
 __m64 test1(__m64 a, __m64 b, int c) {
-   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
+   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}}
 }
diff --git a/clang/test/SemaCUDA/attr-noconvergent.cu b/clang/test/SemaCUDA/attr-noconvergent.cu
new file mode 100644
index 0000000000000..0c051fdde4379
--- /dev/null
+++ b/clang/test/SemaCUDA/attr-noconvergent.cu
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+__device__ float f0(float) __attribute__((noconvergent));
+__device__ __attribute__((noconvergent)) float f1(float);
+[[clang::noconvergent]] __device__ float f2(float);
+
+__device__ [[clang::noconvergent(1)]] float f3(float);
+// expected-error@-1 {{'noconvergent' attribute takes no arguments}}
+
+__device__ [[clang::noconvergent]] float g0;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+__device__ __attribute__((convergent)) __attribute__((noconvergent)) float f4(float);
+// expected-error@-1 {{'noconvergent' and 'convergent' attributes are not compatible}}
+// expected-note@-2 {{conflicting attribute is here}}
+
+__device__ [[clang::noconvergent]] float f5(float);
+__device__ [[clang::convergent]] float f5(float);
+// expected-error@-1 {{'convergent' and 'noconvergent' attributes are not compatible}}
+// expected-note@-3 {{conflicting attribute is here}}
+
+__device__ float f5(float x) {
+  [[clang::noconvergent]] float y;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+  float z;
+
+  [[clang::noconvergent]] z = 1;
+// expected-warning@-1 {{'noconvergent' attribute is ignored because there exists no call expression inside the statement}}
+
+  [[clang::noconvergent]] z = f0(x);
+}
diff --git a/clang/test/SemaCXX/GH82167.cpp b/clang/test/SemaCXX/GH82167.cpp
new file mode 100644
index 0000000000000..033331b82c5d4
--- /dev/null
+++ b/clang/test/SemaCXX/GH82167.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++23 -fsyntax-only -verify -xobjective-c++ %s
+// expected-no-diagnostics
+
+namespace t1 {
+  struct array {
+    char elems[2];
+  };
+
+  template <unsigned> struct Literal {
+    array arr;
+    constexpr Literal() : arr("") {}
+  };
+
+  template struct Literal<0>;
+} // namespace t1
+
+namespace t2 {
+  struct array {
+    char elems[2];
+  };
+
+  template <unsigned> struct Literal {
+    array arr;
+    constexpr Literal() : arr(@encode(int)) {}
+  };
+
+  template struct Literal<0>;
+} // namespace t2
diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp
index 70bc545c07bd9..7db0a4d64d259 100644
--- a/clang/test/SemaCXX/attr-lifetimebound.cpp
+++ b/clang/test/SemaCXX/attr-lifetimebound.cpp
@@ -47,6 +47,31 @@ namespace usage_ok {
     q = A(); // expected-warning {{object backing the pointer q will be destroyed at the end of the full-expression}}
     r = A(1); // expected-warning {{object backing the pointer r will be destroyed at the end of the full-expression}}
   }
+
+  struct FieldCheck {
+    struct Set {
+      int a;
+    };
+    struct Pair {
+      const int& a;
+      int b;
+      Set c;
+      int * d;
+    };
+    Pair p;  
+    FieldCheck(const int& a): p(a){}
+    Pair& getR() [[clang::lifetimebound]] { return p; }
+    Pair* getP() [[clang::lifetimebound]] { return &p; }
+    Pair* getNoLB() { return &p; }
+  };
+  void test_field_access() {
+    int x = 0;
+    const int& a = FieldCheck{x}.getR().a;
+    const int& b = FieldCheck{x}.getP()->b;   // expected-warning {{temporary bound to local reference 'b' will be destroyed at the end of the full-expression}}
+    const int& c = FieldCheck{x}.getP()->c.a; // expected-warning {{temporary bound to local reference 'c' will be destroyed at the end of the full-expression}}
+    const int& d = FieldCheck{x}.getNoLB()->c.a;
+    const int* e = FieldCheck{x}.getR().d;
+  }
 }
 
 # 1 "<std>" 1 3
@@ -239,3 +264,4 @@ namespace move_forward_et_al_examples {
   S X;
   S *AddressOfOk = std::addressof(X);
 } // namespace move_forward_et_al_examples
+
diff --git a/clang/test/SemaCXX/builtin_vectorelements.cpp b/clang/test/SemaCXX/builtin_vectorelements.cpp
index 59ff09ac72e42..b23675ea0ac6a 100644
--- a/clang/test/SemaCXX/builtin_vectorelements.cpp
+++ b/clang/test/SemaCXX/builtin_vectorelements.cpp
@@ -1,7 +1,9 @@
 // RUN: %clang_cc1 -triple x86_64                       -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s
+// RUN: %clang_cc1 -triple x86_64                       -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s
 
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s
 
 template <typename T>
 using VecT __attribute__((vector_size(16))) = T;
diff --git a/clang/test/SemaCXX/class.cpp b/clang/test/SemaCXX/class.cpp
index f874b7be2b70e..2f59544e7f36c 100644
--- a/clang/test/SemaCXX/class.cpp
+++ b/clang/test/SemaCXX/class.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s 
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -Wc++11-compat %s
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s -std=c++98
 class C {
 public:
@@ -55,6 +55,13 @@ class C {
   // expected-error@-2 {{static const volatile data member must be initialized out of line}}
 #endif
   static const E evi = 0;
+  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+                                               // expected-warning@-1 {{overflow in expression}}
+  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
 
   void m() {
     sx = 0;
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index efb391ba0922d..6df8a4740d6cc 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=c++23 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx20_23,cxx23    -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
-// RUN: %clang_cc1 -std=c++20 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx20_23 -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
-// RUN: %clang_cc1 -std=c++11 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx11    -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
+// RUN: %clang_cc1 -std=c++23 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx20_23,cxx23              -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
+// RUN: %clang_cc1 -std=c++20 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx20_23,pre-cxx23 -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
+// RUN: %clang_cc1 -std=c++11 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx11,pre-cxx23    -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
 
 namespace StaticAssertFoldTest {
 
@@ -1011,10 +1011,12 @@ constexpr bool b(int n) { return &n; }
 static_assert(b(0), "");
 
 struct NonLiteral {
-  NonLiteral();
+  NonLiteral(); // cxx23-note {{declared here}}
   int f();
 };
-constexpr int k = NonLiteral().f(); // expected-error {{constant expression}} expected-note {{non-literal type 'NonLiteral'}}
+constexpr int k = NonLiteral().f(); // expected-error {{constant expression}} \
+				    // pre-cxx23-note {{non-literal type 'NonLiteral'}} \
+				    // cxx23-note {{non-constexpr constructor 'NonLiteral' cannot be used in a constant expression}}
 
 }
 
@@ -1270,8 +1272,10 @@ static_assert(makeComplexWrap(1,0) != complex(0, 1), "");
 
 namespace PR11595 {
   struct A { constexpr bool operator==(int x) const { return true; } };
-  struct B { B(); A& x; };
-  static_assert(B().x == 3, "");  // expected-error {{constant expression}} expected-note {{non-literal type 'B' cannot be used in a constant expression}}
+  struct B { B(); A& x; }; // cxx23-note {{declared here}}
+  static_assert(B().x == 3, "");  // expected-error {{constant expression}} \
+				  // pre-cxx23-note {{non-literal type 'B' cannot be used in a constant expression}} \
+				  // cxx23-note {{non-constexpr constructor 'B' cannot be used in a constant expression}}
 
   constexpr bool f(int k) { // cxx11_20-error {{constexpr function never produces a constant expression}}
     return B().x == k; // cxx11_20-note {{non-literal type 'B' cannot be used in a constant expression}}
diff --git a/clang/test/SemaCXX/constant-expression-cxx2b.cpp b/clang/test/SemaCXX/constant-expression-cxx2b.cpp
index 2519839b7ac57..42b3f81cc713d 100644
--- a/clang/test/SemaCXX/constant-expression-cxx2b.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx2b.cpp
@@ -3,7 +3,7 @@
 
 struct NonLiteral { // cxx2a-note {{'NonLiteral' is not literal}} \
                     // cxx23-note 2{{'NonLiteral' is not literal}}
-  NonLiteral() {}
+  NonLiteral() {} // cxx23-note 2{{declared here}}
 };
 
 struct Constexpr{};
@@ -165,9 +165,9 @@ int test_in_lambdas() {
 
   auto non_literal = [](bool b) constexpr {
     if (!b)
-      NonLiteral n; // cxx23-note {{non-literal type 'NonLiteral' cannot be used in a constant expression}} \
-                    // cxx2a-error {{variable of non-literal type 'NonLiteral' cannot be defined in a constexpr function before C++23}} \
-                    // cxx23-warning {{definition of a variable of non-literal type in a constexpr function is incompatible with C++ standards before C++23}}
+      NonLiteral n; // cxx2a-error {{variable of non-literal type 'NonLiteral' cannot be defined in a constexpr function before C++23}} \
+                    // cxx23-warning {{definition of a variable of non-literal type in a constexpr function is incompatible with C++ standards before C++23}} \
+		    // cxx23-note {{non-constexpr constructor 'NonLiteral' cannot be used in a constant expression}}
     return 0;
   };
 
@@ -217,7 +217,7 @@ int test_lambdas_implicitly_constexpr() {
 
   auto non_literal = [](bool b) { // cxx2a-note 2{{declared here}}
     if (b)
-      NonLiteral n; // cxx23-note {{non-literal type 'NonLiteral' cannot be used in a constant expression}}
+      NonLiteral n; // cxx23-note {{non-constexpr constructor 'NonLiteral' cannot be used in a constant expression}}
     return 0;
   };
 
diff --git a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
index cd096a9270937..f0252df1e2ce1 100644
--- a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
+++ b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
@@ -1,11 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -fexperimental-new-constant-interpreter %s
 
 namespace baseclass_uninit {
 struct DelBase {
   constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
 };
 
-struct Foo : DelBase {  // expected-note 2{{constructor of base class 'DelBase' is not called}}
+struct Foo : DelBase {  // expected-note-re 2{{constructor of base class '{{.*}}DelBase' is not called}}
   constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
 };
 constexpr Foo f; // expected-error {{must be initialized by a constant expression}}
@@ -15,13 +16,13 @@ struct Bar : Foo {
 constexpr Bar bar; // expected-error {{must be initialized by a constant expression}}
 
 struct Base {};
-struct A : Base { // expected-note {{constructor of base class 'Base' is not called}}
+struct A : Base { // expected-note-re {{constructor of base class '{{.*}}Base' is not called}}
   constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
 };
 
 constexpr A a; // expected-error {{must be initialized by a constant expression}}
 
-struct B : Base { // expected-note {{constructor of base class 'Base' is not called}}
+struct B : Base { // expected-note-re {{constructor of base class '{{.*}}Base' is not called}}
   constexpr B() : {} // expected-error {{expected class member or base class name}}
 };
 
diff --git a/clang/test/SemaCXX/cxx-deprecated.cpp b/clang/test/SemaCXX/cxx-deprecated.cpp
index 81eb07608300d..d7de609d58cdd 100644
--- a/clang/test/SemaCXX/cxx-deprecated.cpp
+++ b/clang/test/SemaCXX/cxx-deprecated.cpp
@@ -36,4 +36,13 @@ template <C T>
 // expected-warning@-1 {{'C' is deprecated}}
 //   expected-note@#C {{'C' has been explicitly marked deprecated here}}
 void f();
+
+namespace GH98164 {
+template <int>
+auto b() = delete; // #b
+
+decltype(b<0>()) x;
+// expected-error@-1 {{call to deleted function 'b'}}
+//   expected-note@#b {{candidate function [with $0 = 0] has been explicitly deleted}}
+} // namespace GH98164
 } // namespace cxx20_concept
diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index 5e189b758c61e..eafadb07b29e1 100644
--- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -9,12 +9,12 @@
 #endif
 
 class A {
-  template<typename T> CONST T wrong;           // expected-error {{member 'wrong' declared as a template}}
-  template<typename T> CONST T wrong_init = 5;      // expected-error {{member 'wrong_init' declared as a template}}
+  template<typename T> CONST T wrong;           // expected-error {{non-static data member 'wrong' cannot be declared as a template}}
+  template<typename T> CONST T wrong_init = 5;      // expected-error {{non-static data member 'wrong_init' cannot be declared as a template}}
   template<typename T, typename T0> static CONST T right = T(100);
   template<typename T> static CONST T right<T,int> = 5;
-  template<typename T> CONST int right<int,T>;  // expected-error {{member 'right' declared as a template}}
-  template<typename T> CONST float right<float,T> = 5;  // expected-error {{member 'right' declared as a template}}
+  template<typename T> CONST int right<int,T>;  // expected-error {{non-static data member 'right' cannot be declared as a template}}
+  template<typename T> CONST float right<float,T> = 5;  // expected-error {{non-static data member 'right' cannot be declared as a template}}
 #ifdef PRECXX11
                                                         // expected-warning@-2 {{in-class initializer for static data member of type 'const float' is a GNU extension}}
 #else
@@ -161,14 +161,14 @@ namespace non_const_init {
 #ifndef PRECXX11
 namespace constexpred {
   class A {
-    template<typename T> constexpr T wrong;           // expected-error {{member 'wrong' declared as a template}}
+    template<typename T> constexpr T wrong;           // expected-error {{non-static data member 'wrong' cannot be declared as a template}}
                                                       // expected-error@-1 {{declaration of constexpr static data member 'wrong' requires an initializer}}
-    template<typename T> constexpr T wrong_init = 5;  // expected-error {{member 'wrong_init' declared as a template}}
+    template<typename T> constexpr T wrong_init = 5;  // expected-error {{non-static data member 'wrong_init' cannot be declared as a template}}
     template<typename T, typename T0> static constexpr T right = T(100);
     template<typename T> static constexpr T right<T,int> = 5;
-    template<typename T> constexpr int right<int,T>;         // expected-error {{member 'right' declared as a template}}
+    template<typename T> constexpr int right<int,T>;         // expected-error {{non-static data member 'right' cannot be declared as a template}}
                                                              // expected-error@-1 {{declaration of constexpr static data member 'right<int, T>' requires an initializer}}
-    template<typename T> constexpr float right<float,T> = 5; // expected-error {{member 'right' declared as a template}}
+    template<typename T> constexpr float right<float,T> = 5; // expected-error {{non-static data member 'right' cannot be declared as a template}}
     template<> constexpr int right<int,int> = 7;
     template<> constexpr float right<float, int>; // expected-error {{declaration of constexpr static data member 'right<float, int>' requires an initializer}}
     template static constexpr int right<int,int>;     // expected-error {{expected '<' after 'template'}}
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 305a9ac2ebc24..19c730303625e 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -std=c++17 -Wc++20-extensions -verify=expected %s
 // RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -verify=expected %s
+// RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -fexperimental-new-constant-interpreter -verify=expected %s
 
 void use_from_own_init() {
   auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 2193be03ad9a3..5392573fcdb9d 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -110,8 +110,8 @@ struct Foo {
 
 template <typename X, int Y>
 using Bar = Foo<X, sizeof(X)>; // expected-note {{candidate template ignored: couldn't infer template argument 'X'}} \
-                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) Bar(Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) -> Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>'}} \
-                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) Bar(const type-parameter-0-0 (&)[sizeof(type-parameter-0-0)]) -> Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>'}} \
+                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<X, sizeof(X)>) Bar(Foo<X, sizeof(X)>) -> Foo<X, sizeof(X)>'}} \
+                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<X, sizeof(X)>) Bar(const X (&)[sizeof(X)]) -> Foo<X, sizeof(X)>'}} \
                                // expected-note {{candidate template ignored: constraints not satisfied [with X = int]}} \
                                // expected-note {{cannot deduce template arguments for 'Bar' from 'Foo<int, 4UL>'}}
 
@@ -138,13 +138,13 @@ namespace test11 {
 struct A {};
 template<class T> struct Foo { T c; };
 template<class X, class Y=A>
-using AFoo = Foo<Y>; // expected-note {{candidate template ignored: could not match 'Foo<type-parameter-0-0>' against 'int'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo(Foo<type-parameter-0-0>) -> Foo<type-parameter-0-0>'}} \
+using AFoo = Foo<Y>; // expected-note {{candidate template ignored: could not match 'Foo<Y>' against 'int'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo(Foo<Y>) -> Foo<Y>'}} \
                     // expected-note {{candidate template ignored: constraints not satisfied [with Y = int]}} \
                     // expected-note {{cannot deduce template arguments for 'AFoo' from 'Foo<int>'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo(type-parameter-0-0) -> Foo<type-parameter-0-0>'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo(Y) -> Foo<Y>'}} \
                     // expected-note {{candidate function template not viable: requires 0 arguments, but 1 was provided}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo() -> Foo<type-parameter-0-0>'}}
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo() -> Foo<Y>'}}
 
 AFoo s = {1}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'AFoo'}}
 } // namespace test11
@@ -211,9 +211,9 @@ template<typename> concept False = false;
 template<False W>
 using BFoo = AFoo<W>; // expected-note {{candidate template ignored: constraints not satisfied [with V = int]}} \
                       // expected-note {{cannot deduce template arguments for 'BFoo' from 'Foo<int *>'}} \
-                      // expected-note {{implicit deduction guide declared as 'template <class V> requires __is_deducible(AFoo, Foo<type-parameter-0-0 *>) && __is_deducible(test15::BFoo, Foo<type-parameter-0-0 *>) BFoo(type-parameter-0-0 *) -> Foo<type-parameter-0-0 *>}} \
-                      // expected-note {{candidate template ignored: could not match 'Foo<type-parameter-0-0 *>' against 'int *'}} \
-                      // expected-note {{template <class V> requires __is_deducible(AFoo, Foo<type-parameter-0-0 *>) && __is_deducible(test15::BFoo, Foo<type-parameter-0-0 *>) BFoo(Foo<type-parameter-0-0 *>) -> Foo<type-parameter-0-0 *>}}
+                      // expected-note {{implicit deduction guide declared as 'template <class V> requires __is_deducible(AFoo, Foo<V *>) && __is_deducible(test15::BFoo, Foo<V *>) BFoo(V *) -> Foo<V *>}} \
+                      // expected-note {{candidate template ignored: could not match 'Foo<V *>' against 'int *'}} \
+                      // expected-note {{template <class V> requires __is_deducible(AFoo, Foo<V *>) && __is_deducible(test15::BFoo, Foo<V *>) BFoo(Foo<V *>) -> Foo<V *>}}
 int i = 0;
 AFoo a1(&i); // OK, deduce Foo<int *>
 
@@ -263,12 +263,12 @@ template<typename T> requires False<T> // expected-note {{because 'int' does not
 Foo(T) -> Foo<int>;
 
 template <typename U>
-using Bar = Foo<U>; // expected-note {{could not match 'Foo<type-parameter-0-0>' against 'int'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar(Foo<type-parameter-0-0>) -> Foo<type-parameter-0-0>'}} \
+using Bar = Foo<U>; // expected-note {{could not match 'Foo<U>' against 'int'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<U>) Bar(Foo<U>) -> Foo<U>'}} \
                     // expected-note {{candidate template ignored: constraints not satisfied}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<type-parameter-0-0> && __is_deducible(test18::Bar, Foo<int>) Bar(type-parameter-0-0) -> Foo<int>'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<T> && __is_deducible(test18::Bar, Foo<int>) Bar(T) -> Foo<int>'}} \
                     // expected-note {{candidate function template not viable}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar() -> Foo<type-parameter-0-0>'}}
+                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<U>) Bar() -> Foo<U>'}}
 
 Bar s = {1}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments}}
 } // namespace test18
@@ -296,8 +296,8 @@ class Foo {};
 // Verify that template template type parameter TTP is referenced/used in the
 // template arguments of the RHS.
 template <template<typename> typename TTP>
-using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<template-parameter-0-0>>' against 'int'}} \
-                        // expected-note {{implicit deduction guide declared as 'template <template <typename> typename TTP> requires __is_deducible(test20::Bar, Foo<K<template-parameter-0-0>>) Bar(Foo<K<template-parameter-0-0>>) -> Foo<K<template-parameter-0-0>>'}}
+using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<TTP>>' against 'int'}} \
+                        // expected-note {{implicit deduction guide declared as 'template <template <typename> typename TTP> requires __is_deducible(test20::Bar, Foo<K<TTP>>) Bar(Foo<K<TTP>>) -> Foo<K<TTP>>'}}
 
 template <class T>
 class Container {};
diff --git a/clang/test/SemaCXX/cxx23-invalid-constexpr.cpp b/clang/test/SemaCXX/cxx23-invalid-constexpr.cpp
index 4dc16c59d8058..3229a91fbcefb 100644
--- a/clang/test/SemaCXX/cxx23-invalid-constexpr.cpp
+++ b/clang/test/SemaCXX/cxx23-invalid-constexpr.cpp
@@ -25,14 +25,14 @@ constexpr int FT(T N) {
 class NonLiteral { // expected-note {{'NonLiteral' is not literal because it is not an aggregate and has no constexpr constructors}}
 public:
   NonLiteral() {}
-  ~NonLiteral() {}
+  ~NonLiteral() {} // expected-note {{declared here}}
 };
 
 constexpr NonLiteral F1() {
   return NonLiteral{};
 }
 
-constexpr int F2(NonLiteral N) {
+constexpr int F2(NonLiteral N) { // expected-note {{non-constexpr function '~NonLiteral' cannot be used in a constant expression}}
   return 8;
 }
 
@@ -46,7 +46,7 @@ class Derived1 : public NonLiteral {
 
 
 struct X {
-  X();
+  X(); // expected-note 2{{declared here}}
   X(const X&);
   X(X&&);
   X& operator=(X&);
@@ -80,7 +80,7 @@ struct WrapperNonT {
 };
 
 struct NonDefaultMembers {
-  constexpr NonDefaultMembers() {}; // expected-note {{non-literal type 'X' cannot be used in a constant expression}}
+  constexpr NonDefaultMembers() {}; // expected-note 2{{non-constexpr constructor 'X' cannot be used in a constant expression}}
   constexpr NonDefaultMembers(NonDefaultMembers const&) {};
   constexpr NonDefaultMembers(NonDefaultMembers &&) {};
   constexpr NonDefaultMembers& operator=(NonDefaultMembers &other) {this->t = other.t; return *this;}
@@ -109,7 +109,6 @@ void test() {
   F1();
   NonLiteral L;
   constexpr auto D = F2(L); // expected-error {{constexpr variable 'D' must be initialized by a constant expression}}
-                            // expected-note@-1 {{non-literal type 'NonLiteral' cannot be used in a constant expression}}
 
   constexpr auto E = FT(1); // expected-error {{constexpr variable 'E' must be initialized by a constant expression}}
                             // expected-note@-1 {{in call}}
@@ -125,8 +124,8 @@ void test() {
 
   static_assert((NonDefaultMembers(), true),""); // expected-error{{expression is not an integral constant expression}} \
                                                  // expected-note {{in call to}}
-  constexpr bool FFF = (NonDefaultMembers() == NonDefaultMembers()); // expected-error{{must be initialized by a constant expression}} \
-                                                                     // expected-note{{non-literal}}
+  constexpr bool FFF = (NonDefaultMembers() == NonDefaultMembers()); // expected-error {{must be initialized by a constant expression}} \
+								     // expected-note {{in call to 'NonDefaultMembers()'}}
 }
 
 struct A {
diff --git a/clang/test/SemaCXX/cxx2a-destroying-delete.cpp b/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
index 349e6e9538a4c..25b985ef11d15 100644
--- a/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
+++ b/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
@@ -187,3 +187,12 @@ namespace delete_from_new {
 #endif
   }
 }
+
+namespace GH96191 {
+  struct S {};
+  struct T {
+    void operator delete(S) { } // expected-error {{first parameter of 'operator delete' must have type 'void *'}}
+  };
+
+  void foo(T *t) { delete t; }
+}
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 5cbc1f735383b..45fee6514c12b 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -895,6 +895,10 @@ void g() {
 }
 
 namespace P2797 {
+
+int bar(void) { return 55; }
+int (&fref)(void) = bar;
+
 struct C {
   void c(this const C&);    // #first
   void c() &;               // #second
@@ -915,6 +919,8 @@ struct C {
     (&C::c)(C{});
     (&C::c)(*this);     // expected-error {{call to non-static member function without an object argument}}
     (&C::c)();
+
+    (&fref)();
   }
 };
 }
@@ -959,3 +965,10 @@ void f();
 };
 void a::f(this auto) {} // expected-error {{an explicit object parameter cannot appear in a non-member function}}
 }
+
+struct R {
+  void f(this auto &&self, int &&r_value_ref) {} // expected-note {{candidate function template not viable: expects an rvalue for 2nd argument}}
+  void g(int &&r_value_ref) {
+	f(r_value_ref); // expected-error {{no matching member function for call to 'f'}}
+  }
+};
diff --git a/clang/test/SemaCXX/decltype.cpp b/clang/test/SemaCXX/decltype.cpp
index 96abb60836e40..9961c5e2bd918 100644
--- a/clang/test/SemaCXX/decltype.cpp
+++ b/clang/test/SemaCXX/decltype.cpp
@@ -139,6 +139,14 @@ namespace GH58674 {
   }
 }
 
+namespace GH97646 {
+  template<bool B>
+  void f() {
+    decltype(B) x = false;
+    !x;
+  }
+}
+
 template<typename>
 class conditional {
 };
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index 028bc7cc19698..dfcd1b033af5a 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -577,4 +577,13 @@ static_assert(!__is_trivially_constructible(Foo, const Foo &), "");
 static_assert(!__is_trivially_constructible(Foo, Foo &&), "");
 } // namespace GH89544
 
+namespace GH97230 {
+struct X {
+  ~X() = defaul; // expected-error {{initializer on function does not look like a pure-specifier}} \
+                 // expected-error {{use of undeclared identifier 'defaul'}}
+};
+struct Y : X {} y1{ }; // expected-error {{call to implicitly-deleted default constructor of 'struct Y'}} \
+                       // expected-note {{default constructor of 'Y' is implicitly deleted because base class 'X' has no destructor}}
+}
+
 #endif // BE_THE_HEADER
diff --git a/clang/test/SemaCXX/enum.cpp b/clang/test/SemaCXX/enum.cpp
index 739d35ec4a06b..9c398cc8da886 100644
--- a/clang/test/SemaCXX/enum.cpp
+++ b/clang/test/SemaCXX/enum.cpp
@@ -103,14 +103,14 @@ void PR8089() {
 // This is accepted as a GNU extension. In C++98, there was no provision for
 // expressions with UB to be non-constant.
 enum { overflow = 123456 * 234567 };
-#if __cplusplus >= 201103L
-// expected-warning@-2 {{expression is not an integral constant expression; folding it to a constant is a GNU extension}}
-// expected-note@-3 {{value 28958703552 is outside the range of representable values of type 'int'}}
-#else
-// expected-error@-5 {{expression is not an integral constant expression}}
-// expected-note@-6 {{value 28958703552 is outside the range of representable values of type 'int'}}
-// expected-warning@-7 {{overflow in expression; result is -1'106'067'520 with type 'int'}}
+// expected-error@-1 {{expression is not an integral constant expression}}
+// expected-note@-2 {{value 28958703552 is outside the range of representable values of type 'int'}}
+#if __cplusplus < 201103L
+// expected-warning@-4 {{overflow in expression; result is -1'106'067'520 with type 'int'}}
 #endif
+enum { overflow_shift = 1 << 32 };
+// expected-error@-1 {{expression is not an integral constant expression}}
+// expected-note@-2 {{shift count 32 >= width of type 'int' (32 bits)}}
 
 // FIXME: This is not consistent with the above case.
 enum NoFold : int { overflow2 = 123456 * 234567 };
@@ -123,6 +123,16 @@ enum NoFold : int { overflow2 = 123456 * 234567 };
 // expected-error@-7 {{expression is not an integral constant expression}}
 // expected-note@-8 {{value 28958703552 is outside the range of representable values of type 'int'}}
 #endif
+enum : int { overflow2_shift = 1 << 32 };
+#if __cplusplus >= 201103L
+// expected-error@-2 {{enumerator value is not a constant expression}}
+// expected-note@-3 {{shift count 32 >= width of type 'int' (32 bits)}}
+#else
+// expected-error@-5 {{expression is not an integral constant expression}}
+// expected-note@-6 {{shift count 32 >= width of type 'int' (32 bits)}}
+// expected-warning@-7 {{enumeration types with a fixed underlying type are a C++11 extension}}
+#endif
+
 
 // PR28903
 struct PR28903 {
diff --git a/clang/test/SemaCXX/instantiate-template-broken-nontype-default.cpp b/clang/test/SemaCXX/instantiate-template-broken-nontype-default.cpp
new file mode 100644
index 0000000000000..eb798e5887bda
--- /dev/null
+++ b/clang/test/SemaCXX/instantiate-template-broken-nontype-default.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s
+
+constexpr Missing a = 0; // expected-error {{unknown type name 'Missing'}}
+
+template < typename T, Missing b = a> // expected-error {{unknown type name 'Missing'}}
+class Klass { // expected-note {{candidate template ignored: could not match 'Klass<T, b>' against 'int'}} \
+                 expected-note {{implicit deduction guide declared as 'template <typename T, int b = <recovery-expr>()> Klass(Klass<T, b>) -> Klass<T, b>'}}
+  Klass(T);   // expected-note {{candidate template ignored: substitution failure [with T = int, b = <recovery-expr>()]}} \
+                 expected-note {{implicit deduction guide declared as 'template <typename T, int b = <recovery-expr>()> Klass(T) -> Klass<T, b>'}}
+};
+
+Klass foo{5}; // no-crash \
+                 expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'Klass'}}
+
diff --git a/clang/test/SemaCXX/invalid-template-declaration.cpp b/clang/test/SemaCXX/invalid-template-declaration.cpp
new file mode 100644
index 0000000000000..fda147520cca8
--- /dev/null
+++ b/clang/test/SemaCXX/invalid-template-declaration.cpp
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+// PR99933
+
+struct S {
+  template <typename> int i; // expected-error {{non-static data member 'i' cannot be declared as a template}}
+};
diff --git a/clang/test/SemaCXX/new-delete-0x.cpp b/clang/test/SemaCXX/new-delete-0x.cpp
index a4b43308d010f..aa9ce37c8e3e4 100644
--- a/clang/test/SemaCXX/new-delete-0x.cpp
+++ b/clang/test/SemaCXX/new-delete-0x.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -std=c++11
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -std=c++11 -fexperimental-new-constant-interpreter
 
 using size_t = decltype(sizeof(0));
 struct noreturn_t {} constexpr noreturn = {};
diff --git a/clang/test/SemaCXX/pr100095.cpp b/clang/test/SemaCXX/pr100095.cpp
new file mode 100644
index 0000000000000..15913fec9d5ae
--- /dev/null
+++ b/clang/test/SemaCXX/pr100095.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s
+// XFAIL: asserts
+
+template <class> struct Pair;
+template <class...> struct Tuple {
+  template <class _Up> Tuple(_Up);
+};
+template <typename> struct StatusOr;
+template <int> using ElementType = int;
+template <int... fields>
+using Key = Tuple<ElementType<fields>...>;
+template <int... fields>
+StatusOr<Pair<Key<fields...>>> Parser();
+struct Helper { Helper(Tuple<>, Tuple<>, int, int); };
+struct D : Helper {
+  D(Key<> f, int n, int e) : Helper(f, Parser<>, n, e) {}
+};
diff --git a/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp b/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp
index 41bbba0e832fd..9cc88750363c6 100644
--- a/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp
+++ b/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fptrauth-calls %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fptrauth-calls %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fptrauth-calls %s
 
 struct Incomplete0; // expected-note 3 {{forward declaration of 'Incomplete0'}}
 
diff --git a/clang/test/SemaCXX/ptrauth-type-discriminator.cpp b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp
new file mode 100644
index 0000000000000..685ca1f03fddd
--- /dev/null
+++ b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s
+
+// RUN: not %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only %s 2>&1 | FileCheck %s
+// CHECK: this target does not support pointer authentication
+
+struct S {
+  virtual int foo();
+};
+
+template <class T>
+constexpr unsigned dependentOperandDisc() {
+  return __builtin_ptrauth_type_discriminator(T);
+}
+
+void test_builtin_ptrauth_type_discriminator(unsigned s) {
+  typedef int (S::*MemFnTy)();
+  MemFnTy memFnPtr;
+  int (S::*memFnPtr2)();
+  constexpr unsigned d0 = __builtin_ptrauth_type_discriminator(MemFnTy);
+  static_assert(d0 == __builtin_ptrauth_string_discriminator("_ZTSM1SFivE"));
+  static_assert(d0 == 60844);
+  static_assert(__builtin_ptrauth_type_discriminator(int (S::*)()) == d0);
+  static_assert(__builtin_ptrauth_type_discriminator(decltype(memFnPtr)) == d0);
+  static_assert(__builtin_ptrauth_type_discriminator(decltype(memFnPtr2)) == d0);
+  static_assert(__builtin_ptrauth_type_discriminator(decltype(&S::foo)) == d0);
+  static_assert(dependentOperandDisc<decltype(&S::foo)>() == d0);
+
+  constexpr unsigned d1 = __builtin_ptrauth_type_discriminator(void (S::*)(int));
+  static_assert(__builtin_ptrauth_string_discriminator("_ZTSM1SFviE") == d1);
+  static_assert(d1 == 39121);
+
+  constexpr unsigned d2 = __builtin_ptrauth_type_discriminator(void (S::*)(float));
+  static_assert(__builtin_ptrauth_string_discriminator("_ZTSM1SFvfE") == d2);
+  static_assert(d2 == 52453);
+
+  constexpr unsigned d3 = __builtin_ptrauth_type_discriminator(int (*())[s]);
+  static_assert(__builtin_ptrauth_string_discriminator("FPE") == d3);
+  static_assert(d3 == 34128);
+
+  int f4(float);
+  constexpr unsigned d4 = __builtin_ptrauth_type_discriminator(decltype(f4));
+  static_assert(__builtin_ptrauth_type_discriminator(int (*)(float)) == d4);
+  static_assert(__builtin_ptrauth_string_discriminator("FifE") == d4);
+  static_assert(d4 == 48468);
+
+  int f5(int);
+  constexpr unsigned d5 = __builtin_ptrauth_type_discriminator(decltype(f5));
+  static_assert(__builtin_ptrauth_type_discriminator(int (*)(int)) == d5);
+  static_assert(__builtin_ptrauth_type_discriminator(short (*)(short)) == d5);
+  static_assert(__builtin_ptrauth_type_discriminator(char (*)(char)) == d5);
+  static_assert(__builtin_ptrauth_type_discriminator(long (*)(long)) == d5);
+  static_assert(__builtin_ptrauth_type_discriminator(unsigned int (*)(unsigned int)) == d5);
+  static_assert(__builtin_ptrauth_type_discriminator(int (&)(int)) == d5);
+  static_assert(__builtin_ptrauth_string_discriminator("FiiE") == d5);
+  static_assert(d5 == 2981);
+
+  int t;
+  int vmarray[s];
+  (void)__builtin_ptrauth_type_discriminator(t); // expected-error {{unknown type name 't'}}
+  (void)__builtin_ptrauth_type_discriminator(&t); // expected-error {{expected a type}}
+  (void)__builtin_ptrauth_type_discriminator(decltype(vmarray)); // expected-error {{cannot pass undiscriminated type 'decltype(vmarray)' (aka 'int[s]')}}
+  (void)__builtin_ptrauth_type_discriminator(int *); // expected-error {{cannot pass undiscriminated type 'int *' to '__builtin_ptrauth_type_discriminator'}}
+  (void)__builtin_ptrauth_type_discriminator(); // expected-error {{expected a type}}
+  (void)__builtin_ptrauth_type_discriminator(int (*)(int), int (*)(int));
+  // expected-error@-1 {{expected ')'}}
+  // expected-note@-2 {{to match this '('}}
+}
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 23b07cac13eaf..4acb3d6c9eebe 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -1738,6 +1738,11 @@ struct CStructWithFMA2 {
   int f[];
 };
 
+template<int N>
+struct UniqueEmpty {};
+template<typename... Bases>
+struct D : Bases... {};
+
 void is_layout_compatible(int n)
 {
   static_assert(__is_layout_compatible(void, void));
@@ -1841,6 +1846,12 @@ void is_layout_compatible(int n)
   static_assert(!__is_layout_compatible(EnumClassLayout, int));
   static_assert(!__is_layout_compatible(EnumForward, int));
   static_assert(!__is_layout_compatible(EnumClassForward, int));
+  static_assert(__is_layout_compatible(CStruct, D<CStruct>));
+  static_assert(__is_layout_compatible(CStruct, D<UniqueEmpty<0>, CStruct>));
+  static_assert(__is_layout_compatible(CStruct, D<UniqueEmpty<0>, D<UniqueEmpty<1>, CStruct>, D<UniqueEmpty<2>>>));
+  static_assert(__is_layout_compatible(CStruct, D<CStructWithQualifiers>));
+  static_assert(__is_layout_compatible(CStruct, D<UniqueEmpty<0>, CStructWithQualifiers>));
+  static_assert(__is_layout_compatible(CStructWithQualifiers, D<UniqueEmpty<0>, D<UniqueEmpty<1>, CStruct>, D<UniqueEmpty<2>>>));
 }
 
 namespace IPIBO {
@@ -2402,11 +2413,11 @@ template<typename T> struct DerivedB : BaseA<T> { };
 template<typename T> struct CrazyDerived : T { };
 
 
-class class_forward; // expected-note 2 {{forward declaration of 'class_forward'}}
+class class_forward; // expected-note 4 {{forward declaration of 'class_forward'}}
 
 template <class T> class DerivedTemp : Base {};
 template <class T> class NonderivedTemp {};
-template <class T> class UndefinedTemp; // expected-note {{declared here}}
+template <class T> class UndefinedTemp; // expected-note 2 {{declared here}}
 
 void is_base_of() {
   static_assert(__is_base_of(Base, Derived));
@@ -2457,6 +2468,76 @@ void is_base_of() {
   static_assert(!__is_base_of(DerivedB<int>, BaseA<int>));
 }
 
+struct DerivedTransitiveViaNonVirtual : Derived3 {};
+struct DerivedTransitiveViaVirtual : virtual Derived3 {};
+
+template <typename T>
+struct CrazyDerivedVirtual : virtual T {};
+
+struct DerivedPrivate : private virtual Base {};
+struct DerivedProtected : protected virtual Base {};
+struct DerivedPrivatePrivate : private DerivedPrivate {};
+struct DerivedPrivateProtected : private DerivedProtected {};
+struct DerivedProtectedPrivate : protected DerivedProtected {};
+struct DerivedProtectedProtected : protected DerivedProtected {};
+
+void is_virtual_base_of(int n) {
+  static_assert(!__builtin_is_virtual_base_of(Base, Derived));
+  static_assert(!__builtin_is_virtual_base_of(const Base, Derived));
+  static_assert(!__builtin_is_virtual_base_of(Derived, Base));
+  static_assert(!__builtin_is_virtual_base_of(Derived, int));
+  static_assert(!__builtin_is_virtual_base_of(Base, Base));
+  static_assert(!__builtin_is_virtual_base_of(Base, Derived3));
+  static_assert(!__builtin_is_virtual_base_of(Derived, Derived3));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, Derived3));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, Derived3));
+  static_assert(!__builtin_is_virtual_base_of(BaseA<int>, DerivedB<int>));
+  static_assert(!__builtin_is_virtual_base_of(DerivedB<int>, BaseA<int>));
+  static_assert(!__builtin_is_virtual_base_of(Union, Union));
+  static_assert(!__builtin_is_virtual_base_of(Empty, Empty));
+  static_assert(!__builtin_is_virtual_base_of(class_forward, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+  static_assert(!__builtin_is_virtual_base_of(Empty, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+  static_assert(!__builtin_is_virtual_base_of(class_forward, Empty));
+  static_assert(!__builtin_is_virtual_base_of(Base&, Derived&));
+  static_assert(!__builtin_is_virtual_base_of(Base[10], Derived[10]));
+  static_assert(!__builtin_is_virtual_base_of(Base[n], Derived[n])); // expected-error 2 {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+  static_assert(!__builtin_is_virtual_base_of(int, int));
+  static_assert(!__builtin_is_virtual_base_of(int[], int[]));
+  static_assert(!__builtin_is_virtual_base_of(long, int));
+  static_assert(!__builtin_is_virtual_base_of(Base, DerivedTemp<int>));
+  static_assert(!__builtin_is_virtual_base_of(Base, NonderivedTemp<int>));
+  static_assert(!__builtin_is_virtual_base_of(Base, UndefinedTemp<int>)); // expected-error {{implicit instantiation of undefined template 'UndefinedTemp<int>'}}
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtected));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivatePrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivateProtected));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedPrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedProtected));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaNonVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaNonVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaVirtual));
+  static_assert(!__builtin_is_virtual_base_of(Base, CrazyDerived<Base>));
+  static_assert(!__builtin_is_virtual_base_of(CrazyDerived<Base>, Base));
+  static_assert(__builtin_is_virtual_base_of(Base, CrazyDerivedVirtual<Base>));
+  static_assert(!__builtin_is_virtual_base_of(CrazyDerivedVirtual<Base>, Base));
+
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(Union, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Union));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteStruct));
+  static_assert(!__builtin_is_virtual_base_of(Empty, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Empty));
+  static_assert(!__builtin_is_virtual_base_of(int, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, int));
+  static_assert(!__builtin_is_virtual_base_of(Empty, Union));
+  static_assert(!__builtin_is_virtual_base_of(Union, Empty));
+  static_assert(!__builtin_is_virtual_base_of(int, Empty));
+  static_assert(!__builtin_is_virtual_base_of(Union, int));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteStruct[n])); // expected-error {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+}
+
 template<class T, class U>
 class TemplateClass {};
 
diff --git a/clang/test/SemaCXX/typeof_unqual.cpp b/clang/test/SemaCXX/typeof.cpp
similarity index 68%
rename from clang/test/SemaCXX/typeof_unqual.cpp
rename to clang/test/SemaCXX/typeof.cpp
index 335e57995377c..421cfc59f5311 100644
--- a/clang/test/SemaCXX/typeof_unqual.cpp
+++ b/clang/test/SemaCXX/typeof.cpp
@@ -3,3 +3,11 @@
 typeof_unqual(int) u = 12; // expected-error {{expected function body after function declarator}}
 __typeof_unqual(int) _u = 12;
 __typeof_unqual__(int) __u = 12;
+
+namespace GH97646 {
+  template<bool B>
+  void f() {
+    __typeof__(B) x = false;
+    !x;
+  }
+}
diff --git a/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp b/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp
index 3a3386196fbfa..f228c4e6ad0c0 100644
--- a/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp
+++ b/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -triple arm64-apple-ios -verify -fptrauth-calls -std=c++2a %s
+// RUN: %clang_cc1 -fsyntax-only -triple arm64-apple-ios   -verify -fptrauth-calls -std=c++2a %s
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify -fptrauth-calls -std=c++2a %s
 
 namespace basic {
 
diff --git a/clang/test/SemaCXX/warn-dangling-local.cpp b/clang/test/SemaCXX/warn-dangling-local.cpp
index 2808a4c01f88d..5ad5013b6f025 100644
--- a/clang/test/SemaCXX/warn-dangling-local.cpp
+++ b/clang/test/SemaCXX/warn-dangling-local.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -std=c++11 %s
+// RUN: %clang_cc1 -verify -std=c++11 -Wdangling-assignment-gsl %s
 
 using T = int[];
 
@@ -34,6 +34,6 @@ struct basic_string {
 };
 }  // namespace std
 void test(const char* a) {
-  // verify we're emitting the `-Wdangling-assignment` warning.
+  // verify we're emitting the `-Wdangling-assignment-gsl` warning.
   a = std::basic_string().c_str(); // expected-warning {{object backing the pointer a will be destroyed at the end of the full-expression}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
new file mode 100644
index 0000000000000..4afd799f8539e
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
@@ -0,0 +1,12 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+bool test_too_few_arg() {
+  return __builtin_hlsl_all();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+bool test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_all(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
index 7bb5308c5d5ba..e42fd97b40219 100644
--- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
@@ -2,11 +2,11 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 bool test_too_few_arg() {
-  return __builtin_hlsl_elementwise_any();
+  return __builtin_hlsl_any();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
 }
 
 bool test_too_many_arg(float2 p0) {
-  return __builtin_hlsl_elementwise_any(p0, p0);
+  return __builtin_hlsl_any(p0, p0);
   // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
 }
diff --git a/clang/test/SemaHLSL/Loops/unroll.hlsl b/clang/test/SemaHLSL/Loops/unroll.hlsl
index 2e2be319e4666..c94dc5857c2e8 100644
--- a/clang/test/SemaHLSL/Loops/unroll.hlsl
+++ b/clang/test/SemaHLSL/Loops/unroll.hlsl
@@ -1,7 +1,10 @@
 // RUN: %clang_cc1 -O0 -finclude-default-header -fsyntax-only -triple dxil-pc-shadermodel6.6-library %s -verify
 void unroll_no_vars() {
+  // expected-note@+1 {{declared here}}
   int I = 3;
-  [unroll(I)]  // expected-error {{'unroll' attribute requires an integer constant}}
+  // expected-error@+2 {{expression is not an integral constant expression}}
+  // expected-note@+1 {{read of non-const variable 'I' is not allowed in a constant expression}}
+  [unroll(I)]
   while (I--);
 }
 
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl
new file mode 100644
index 0000000000000..5d123c8e81d87
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned int v2u32 __attribute__((ext_vector_type(2)));
+typedef unsigned int v3u32 __attribute__((ext_vector_type(3)));
+typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));
+
+u8 test_amdgcn_raw_ptr_buffer_load_b8(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b8' must be a constant integer}}
+}
+
+u16 test_amdgcn_raw_ptr_buffer_load_b16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b16' must be a constant integer}}
+}
+
+u32 test_amdgcn_raw_ptr_buffer_load_b32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b32' must be a constant integer}}
+}
+
+v2u32 test_amdgcn_raw_ptr_buffer_load_b64(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b64' must be a constant integer}}
+}
+
+v3u32 test_amdgcn_raw_ptr_buffer_load_b96(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b96' must be a constant integer}}
+}
+
+v4u32 test_amdgcn_raw_ptr_buffer_load_b128(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+  return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b128' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl
index 356c031640483..119adcd6b4e63 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl
@@ -3,33 +3,33 @@
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-typedef char i8;
-typedef short i16;
-typedef int i32;
-typedef int i64 __attribute__((ext_vector_type(2)));
-typedef int i96 __attribute__((ext_vector_type(3)));
-typedef int i128 __attribute__((ext_vector_type(4)));
-
-void test_amdgcn_raw_ptr_buffer_store_b8(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned int v2u32 __attribute__((ext_vector_type(2)));
+typedef unsigned int v3u32 __attribute__((ext_vector_type(3)));
+typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));
+
+void test_amdgcn_raw_ptr_buffer_store_b8(u8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b8' must be a constant integer}}
 }
 
-void test_amdgcn_raw_ptr_buffer_store_b16(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+void test_amdgcn_raw_ptr_buffer_store_b16(u16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b16' must be a constant integer}}
 }
 
-void test_amdgcn_raw_ptr_buffer_store_b32(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+void test_amdgcn_raw_ptr_buffer_store_b32(u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b32' must be a constant integer}}
 }
 
-void test_amdgcn_raw_ptr_buffer_store_b64(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+void test_amdgcn_raw_ptr_buffer_store_b64(v2u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b64' must be a constant integer}}
 }
 
-void test_amdgcn_raw_ptr_buffer_store_b96(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+void test_amdgcn_raw_ptr_buffer_store_b96(v3u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b96' must be a constant integer}}
 }
 
-void test_amdgcn_raw_ptr_buffer_store_b128(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
+void test_amdgcn_raw_ptr_buffer_store_b128(v4u32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) {
   __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b128' must be a constant integer}}
 }
diff --git a/clang/test/SemaTemplate/alias-template-deprecated.cpp b/clang/test/SemaTemplate/alias-template-deprecated.cpp
new file mode 100644
index 0000000000000..7418e222bcfc5
--- /dev/null
+++ b/clang/test/SemaTemplate/alias-template-deprecated.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+//
+// This test checks that a deprecated attribute on an alias
+// template triggers a warning diagnostic when it is used.
+
+template <typename T>
+struct NoAttr {
+  void foo() {}
+};
+
+// expected-note@+2 7{{'UsingWithAttr' has been explicitly marked deprecated here}}
+template <typename T>
+using UsingWithAttr __attribute__((deprecated)) = NoAttr<T>;
+
+// expected-note@+1 {{'UsingInstWithAttr' has been explicitly marked deprecated here}}
+using UsingInstWithAttr __attribute__((deprecated)) = NoAttr<int>;
+
+// expected-note@+1 {{'TDWithAttr' has been explicitly marked deprecated here}}
+typedef NoAttr<int> TDWithAttr __attribute__((deprecated));
+
+// expected-warning@+1 {{'UsingWithAttr' is deprecated}}
+typedef UsingWithAttr<int> TDUsingWithAttr;
+
+typedef NoAttr<int> TDNoAttr;
+
+// expected-note@+1 {{'UsingTDWithAttr' has been explicitly marked deprecated here}}
+using UsingTDWithAttr __attribute__((deprecated)) = TDNoAttr;
+
+struct S {
+  NoAttr<float> f1;
+  // expected-warning@+1 {{'UsingWithAttr' is deprecated}}
+  UsingWithAttr<float> f2;
+};
+
+// expected-warning@+1 {{'UsingWithAttr' is deprecated}}
+void foo(NoAttr<short> s1, UsingWithAttr<short> s2) {
+}
+
+// expected-note@+2 {{'UsingWithCPPAttr' has been explicitly marked deprecated here}}
+template <typename T>
+using UsingWithCPPAttr [[deprecated]] = NoAttr<T>;
+
+// expected-note@+1 {{'UsingInstWithCPPAttr' has been explicitly marked deprecated here}}
+using UsingInstWithCPPAttr [[deprecated("Do not use this")]] = NoAttr<int>;
+
+void bar() {
+  NoAttr<int> obj; // Okay
+
+  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
+  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+  UsingWithAttr<int> objUsingWA;
+
+  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
+  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+  NoAttr<UsingWithAttr<int>> s;
+
+  // expected-note@+1 {{'DepInt' has been explicitly marked deprecated here}}
+  using DepInt [[deprecated]] = int;
+  // expected-warning@+3 {{'UsingWithAttr' is deprecated}}
+  // expected-warning@+2 {{'DepInt' is deprecated}}
+  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+  using X = UsingWithAttr<DepInt>;
+
+  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
+  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+  UsingWithAttr<int>().foo();
+
+  // expected-warning@+1 {{'UsingInstWithAttr' is deprecated}}
+  UsingInstWithAttr objUIWA;
+
+  // expected-warning@+1 {{'TDWithAttr' is deprecated}}
+  TDWithAttr objTDWA;
+
+  // expected-warning@+1 {{'UsingTDWithAttr' is deprecated}}
+  UsingTDWithAttr objUTDWA;
+
+  // expected-warning@+2 {{'UsingWithCPPAttr' is deprecated}}
+  // expected-note@+1 {{in instantiation of template type alias 'UsingWithCPPAttr' requested here}}
+  UsingWithCPPAttr<int> objUsingWCPPA;
+
+  // expected-warning@+1 {{'UsingInstWithCPPAttr' is deprecated: Do not use this}}
+  UsingInstWithCPPAttr objUICPPWA;
+}
diff --git a/clang/test/SemaTemplate/class-template-decl.cpp b/clang/test/SemaTemplate/class-template-decl.cpp
index c054a6a8d82f7..0374c4eba982a 100644
--- a/clang/test/SemaTemplate/class-template-decl.cpp
+++ b/clang/test/SemaTemplate/class-template-decl.cpp
@@ -63,7 +63,7 @@ class X {
 };
 
 void f() {
-  template<typename T> class X; // expected-error{{expression}}
+  template<typename T> class X; // expected-error{{templates can only be declared in namespace or class scope}}
 }
 
 template<typename T> class X1 var; // expected-error {{variable has incomplete type 'class X1'}} \
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 7b9ca4d83b020..043a4ccf56e36 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -68,7 +68,7 @@ using BT = B<char, 'x'>;
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T
 // CHECK: |-NonTypeTemplateParmDecl {{.*}} 'T' depth 0 index 1 V
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
-// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 W
+// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'U' depth 0 index 3 W
 // CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (X<W, V>) -> B<T, V>'
 // CHECK: | `-ParmVarDecl {{.*}} 'X<W, V>'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (X<nullptr, 'x'>) -> B<char, 'x'>'
@@ -81,7 +81,7 @@ using BT = B<char, 'x'>;
 // CHECK: |-InjectedClassNameType {{.*}} 'B<T, V>' dependent
 // CHECK: `-TemplateSpecializationType {{.*}} 'X<W, V>' dependent
 // CHECK:   |-TemplateArgument expr
-// CHECK:   | `-DeclRefExpr {{.*}} 'type-parameter-0-2' NonTypeTemplateParm {{.*}} 'W' 'type-parameter-0-2'
+// CHECK:   | `-DeclRefExpr {{.*}} 'U' NonTypeTemplateParm {{.*}} 'W' 'U'
 // CHECK:   `-TemplateArgument expr
 // CHECK:     `-DeclRefExpr {{.*}} 'T' NonTypeTemplateParm {{.*}} 'V' 'T'
 
@@ -99,13 +99,13 @@ using CT = C<int>;
 // CHECK: | |-TemplateTypeParmDecl {{.*}} typename depth 1 index 0 X
 // CHECK: | `-NonTypeTemplateParmDecl {{.*}} 'X' depth 1 index 1
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
-// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 V
+// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'U' depth 0 index 3 V
 // CHECK: | `-TemplateArgument {{.*}} expr
 // CHECK: |   `-IntegerLiteral {{.*}} 'int' 0
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<T>, U) -> C<A>'
 // CHECK: | |-ParmVarDecl {{.*}} 'A'
-// CHECK: | |-ParmVarDecl {{.*}} 'Y<template-parameter-0-1>'
-// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2'
+// CHECK: | |-ParmVarDecl {{.*}} 'Y<T>'
+// CHECK: | `-ParmVarDecl {{.*}} 'U'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y<B>, int) -> C<int>'
 // CHECK:  |-TemplateArgument type 'int'
 // CHECK:  |-TemplateArgument template 'B'
@@ -114,20 +114,20 @@ using CT = C<int>;
 // CHECK:  |-ParmVarDecl {{.*}} 'int'
 // CHECK:  |-ParmVarDecl {{.*}} 'Y<B>'
 // CHECK:  `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<T>, U) -> C<A>' dependent trailing_return cdecl
 // CHECK: |-InjectedClassNameType {{.*}} 'C<A>' dependent
 // CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0
 // CHECK: | `-TemplateTypeParm {{.*}} 'A'
-// CHECK: |-ElaboratedType {{.*}} 'Y<template-parameter-0-1>' sugar dependent
-// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<template-parameter-0-1>' dependent
+// CHECK: |-ElaboratedType {{.*}} 'Y<T>' sugar dependent
+// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<T>' dependent
 // CHECK: |   `-TemplateArgument template
-// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
+// CHECK: `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
 
 template<typename ...T> struct D { // expected-note {{candidate}} \
                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T> D(D<T...>) -> D<T...>'}}
   template<typename... U> using B = int(int (*...p)(T, U));
   template<typename U1, typename U2> D(B<U1, U2>*); // expected-note {{candidate}} \
-                                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T, typename U1, typename U2> D(B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'}}
+                                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T, typename U1, typename U2> D(B<U1, U2> *) -> D<T...>'}}
 };
 int f(int(int, int), int(int, int));
 // FIXME: We can't deduce this because we can't deduce through a
@@ -141,14 +141,14 @@ using DT = D<int, int>;
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 ... T
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U1
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U2
-// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
-// CHECK:   `-ParmVarDecl {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *'
-// CHECK: FunctionProtoType {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>' dependent trailing_return
+// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<U1, U2> *) -> D<T...>'
+// CHECK:   `-ParmVarDecl {{.*}} 'B<U1, U2> *'
+// CHECK: FunctionProtoType {{.*}} 'auto (B<U1, U2> *) -> D<T...>' dependent trailing_return
 // CHECK: |-InjectedClassNameType {{.*}} 'D<T...>' dependent
-// CHECK: `-PointerType {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *' dependent
-// CHECK:   `-TemplateSpecializationType {{.*}} 'B<type-parameter-0-1, type-parameter-0-2>' sugar dependent alias
-// CHECK:     |-TemplateArgument type 'type-parameter-0-1'
-// CHECK:     |-TemplateArgument type 'type-parameter-0-2'
+// CHECK: `-PointerType {{.*}} 'B<U1, U2> *' dependent
+// CHECK:   `-TemplateSpecializationType {{.*}} 'B<U1, U2>' sugar dependent alias
+// CHECK:     |-TemplateArgument type 'U1'
+// CHECK:     |-TemplateArgument type 'U2'
 // CHECK:     `-FunctionProtoType {{.*}} 'int (int (*)(T, U)...)' dependent cdecl
 // CHECK:       |-BuiltinType {{.*}} 'int'
 // CHECK:       `-PackExpansionType {{.*}} 'int (*)(T, U)...' dependent expansions 2
@@ -232,17 +232,17 @@ F s(0);
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U
 // CHECK: |-ParenExpr {{.*}} 'bool'
 // CHECK: | `-CXXBoolLiteralExpr {{.*}} 'bool' false
-// CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (type-parameter-0-1) -> F<>'
-// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-1'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (U) -> F<>'
+// CHECK: | `-ParmVarDecl {{.*}} 'U'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (int) -> F<>'
 // CHECK:   |-TemplateArgument integral ''x''
 // CHECK:   |-TemplateArgument type 'int'
 // CHECK:   | `-BuiltinType {{.*}} 'int'
 // CHECK:   `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (type-parameter-0-1) -> F<>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (U) -> F<>' dependent trailing_return cdecl
 // CHECK: |-InjectedClassNameType {{.*}} 'F<>' dependent
 // CHECK: | `-CXXRecord {{.*}} 'F'
-// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
+// CHECK: `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 1
 
 template<typename T>
 struct G { T t; };
@@ -259,7 +259,7 @@ AG ag = {1};
 // Verify that the aggregate deduction guide for alias templates is built.
 // CHECK-LABEL: Dumping <deduction guide for AG>
 // CHECK: FunctionTemplateDecl
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (type-parameter-0-0) -> G<type-parameter-0-0>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (X) -> G<X>'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int) -> G<int>' implicit_instantiation
 // CHECK:   |-TemplateArgument type 'int'
 // CHECK:   | `-BuiltinType {{.*}} 'int'
@@ -281,7 +281,7 @@ struct Foo {
 template <typename U>
 using AFoo = Foo<G<U>>;
 // Verify that the require-clause from the Foo deduction guide is transformed.
-// The D occurrence should be rewritten to G<type-parameter-0-0>.
+// The D occurrence should be rewritten to G<U>.
 //
 // CHECK-LABEL: Dumping <deduction guide for AFoo>
 // CHECK: FunctionTemplateDecl {{.*}} implicit <deduction guide for AFoo>
@@ -289,16 +289,16 @@ using AFoo = Foo<G<U>>;
 // CHECK-NEXT: |-BinaryOperator {{.*}} '&&'
 // CHECK-NEXT: | |-ParenExpr {{.*}} 'bool'
 // CHECK-NEXT: | | `-BinaryOperator {{.*}} 'bool' '=='
-// CHECK-NEXT: | |   |-UnaryExprOrTypeTraitExpr {{.*}} 'G<type-parameter-0-0>'
+// CHECK-NEXT: | |   |-UnaryExprOrTypeTraitExpr {{.*}} 'G<U>'
 // CHECK-NEXT: | |   `-ImplicitCastExpr {{.*}}
 // CHECK-NEXT: | |     `-IntegerLiteral {{.*}}
 // CHECK-NEXT: | `-TypeTraitExpr {{.*}} 'bool' __is_deducible
 // CHECK-NEXT: |   |-DeducedTemplateSpecializationType {{.*}} 'AFoo' dependent
 // CHECK-NEXT: |   | `-name: 'AFoo'
 // CHECK-NEXT: |   |   `-TypeAliasTemplateDecl {{.+}} AFoo
-// CHECK-NEXT: |   `-TemplateSpecializationType {{.*}} 'Foo<G<type-parameter-0-0>>' dependent
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (G<type-parameter-0-0>) -> Foo<G<type-parameter-0-0>>'
-// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'G<type-parameter-0-0>'
+// CHECK-NEXT: |   `-TemplateSpecializationType {{.*}} 'Foo<G<U>>' dependent
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (G<U>) -> Foo<G<U>>'
+// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'G<U>'
 // CHECK-NEXT: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for AFoo> 'auto (G<int>) -> Foo<G<int>>' implicit_instantiation
 // CHECK-NEXT:   |-TemplateArgument type 'int'
 // CHECK-NEXT:   | `-BuiltinType {{.*}} 'int'
@@ -321,20 +321,20 @@ namespace TTP {
 // CHECK-NEXT:  |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 T{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl {{.+}} depth 0 index 1 TT{{$}}
 // CHECK-NEXT:  | `-TemplateTypeParmDecl {{.+}} class depth 1 index 0{{$}}
-// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>'{{$}}
-// CHECK-NEXT:  | `-ParmVarDecl {{.+}} 'template-parameter-0-1<T>'{{$}}
+// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (TT<T>) -> B<T>'{{$}}
+// CHECK-NEXT:  | `-ParmVarDecl {{.+}} 'TT<T>'{{$}}
 // CHECK-NEXT:  `-CXXDeductionGuideDecl {{.+}} 'auto (A<int>) -> TTP::B<int>'
 // CHECK-NEXT:    |-TemplateArgument type 'int'
 // CHECK-NEXT:    | `-BuiltinType {{.+}} 'int'{{$}}
 // CHECK-NEXT:    |-TemplateArgument template 'TTP::A'{{$}}
 // CHECK-NEXT:    | `-ClassTemplateDecl {{.+}} A{{$}}
 // CHECK-NEXT:    `-ParmVarDecl {{.+}} 'A<int>':'TTP::A<int>'{{$}}
-// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>' dependent trailing_return cdecl{{$}}
+// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (TT<T>) -> B<T>' dependent trailing_return cdecl{{$}}
 // CHECK-NEXT:  |-InjectedClassNameType {{.+}} 'B<T>' dependent{{$}}
 // CHECK-NEXT:  | `-CXXRecord {{.+}} 'B'{{$}}
-// CHECK-NEXT:  `-ElaboratedType {{.+}} 'template-parameter-0-1<T>' sugar dependent{{$}}
-// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} 'template-parameter-0-1<T>' dependent{{$}}
-// CHECK-NEXT:      |-name: 'template-parameter-0-1' qualified
+// CHECK-NEXT:  `-ElaboratedType {{.+}} 'TT<T>' sugar dependent{{$}}
+// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} 'TT<T>' dependent{{$}}
+// CHECK-NEXT:      |-name: 'TT':'template-parameter-0-1' qualified
 // CHECK-NEXT:      | `-TemplateTemplateParmDecl {{.+}} depth 0 index 1
 // CHECK-NEXT:      `-TemplateArgument type 'T':'type-parameter-0-0'{{$}}
 // CHECK-NEXT:        `-TemplateTypeParmType {{.+}} 'T' dependent depth 0 index 0{{$}}
diff --git a/clang/test/SemaTemplate/nested-template.cpp b/clang/test/SemaTemplate/nested-template.cpp
index a6ede8e99037e..a54992a397187 100644
--- a/clang/test/SemaTemplate/nested-template.cpp
+++ b/clang/test/SemaTemplate/nested-template.cpp
@@ -132,7 +132,7 @@ namespace PR10896 {
   private:
 
     template<typename T>
-    T SomeField; // expected-error {{member 'SomeField' declared as a template}}
+    T SomeField; // expected-error {{non-static data member 'SomeField' cannot be declared as a template}}
     template<> int SomeField2; // expected-error {{extraneous 'template<>' in declaration of variable 'SomeField2'}}
   };
 
diff --git a/clang/test/TableGen/attrs-parser-string-switches.td b/clang/test/TableGen/attrs-parser-string-switches.td
new file mode 100644
index 0000000000000..c15ab104e0ccd
--- /dev/null
+++ b/clang/test/TableGen/attrs-parser-string-switches.td
@@ -0,0 +1,232 @@
+// RUN: clang-tblgen -gen-clang-attr-parser-string-switches -I%p/../../include %s -o - 2>&1 | FileCheck %s
+
+// Tests that the tablegen can support attributes with the same spellings but
+// different argument types.
+
+include "clang/Basic/Attr.td"
+
+// Test attributeParsedArgsUnevaluated : different ParseArgumentsAsUnevaluated
+def TestUnEvalOne : InheritableAttr {
+  let Spellings = [Clang<"test_uneval">];
+  let Args = [ExprArgument<"Count">];
+  let Subjects = SubjectList<[Function]>;
+  let ParseArgumentsAsUnevaluated = 1;
+  let Documentation = [Undocumented];
+}
+
+def TestUnEvalTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_uneval">];
+  let Args = [ExprArgument<"Count">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_ARG_CONTEXT_LIST)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("GNU::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("CXX11::clang::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("C23::clang::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: #endif // CLANG_ATTR_ARG_CONTEXT_LIST
+
+// Test attributeHasIdentifierArg: Same spelling, one with and one without
+// an IdentifierArg.
+def TestIdentOne : Attr {
+  let Spellings = [Clang<"test_ident">];
+  let Args = [EnumArgument<"Option", "OptionType", /*is_string=*/false,
+              ["optA", "optB"], ["OPTA", "OPTB"]>];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [Undocumented];
+}
+
+def TestIdentTwo : StmtAttr {
+  let Spellings = [Pragma<"", "test_ident">];
+  let Args = [UnsignedArgument<"val", /*opt*/1>];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("GNU::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("CXX11::clang::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("C23::clang::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: #endif // CLANG_ATTR_IDENTIFIER_ARG_LIST
+
+// Test attributeStringLiteralListArg : Same spelling, some with a
+// StringArgument, some without, some in different locations.
+def TestStringOne : DeclOrTypeAttr {
+  let Spellings = [Clang<"test_string">];
+  let Args = [StringArgument<"strarg">];
+  let Subjects = SubjectList<[Function, TypedefName, ParmVar]>;
+  let Documentation = [AcquireHandleDocs];
+}
+
+def TestStringTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_string">];
+  let Args = [UnsignedArgument<"unsarg">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// In a different position
+def TestStringThree : Attr {
+  let Spellings = [Declspec<"test_string">];
+  let Args = [UnsignedArgument<"uarg">, StringArgument<"strarg">];
+  let Subjects = SubjectList<[Function, TypedefName, ParmVar]>;
+  let Documentation = [AcquireHandleDocs];
+}
+
+// CHECK: #if defined(CLANG_ATTR_STRING_LITERAL_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: .Case("GNU::test_string", 1)
+// CHECK: .Case("CXX11::clang::test_string", 1)
+// CHECK: .Case("C23::clang::test_string", 1)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: .Case("Declspec::test_string", 2)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: #endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST
+
+// Test attributeHasVariadicIdentifierArg : One with VariadicIdentifierArgument
+// and one without.
+def TestVariadicIdentOne : InheritableAttr {
+  let Spellings = [Clang<"test_var_ident">];
+  let Args = [VariadicIdentifierArgument<"iargs">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+def TestVariadicIdentTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_var_ident">];
+  let Args = [UnsignedArgument<"Hint">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::"test_var_ident", true)
+// CHECK: .Case("GNU::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: .Case("CXX11::clang::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: .Case("C23::clang::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: #endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
+
+// Test attributeTreatsKeywordThisAsIdentifier : Same spelling, one with and
+// one without VariadicParamOrParamIdxArgument.
+def TestVarOrIdxOne : InheritableAttr {
+  let Spellings = [Clang<"test_var_idx">];
+  let Args = [VariadicParamOrParamIdxArgument<"arg">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+def TestVarOrIdxTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_var_idx">];
+  let Args = [UnsignedArgument<"Hint">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("GNU::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("CXX11::clang::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("C23::clang::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: #endif // CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
+
+// Test attributeAcceptsExprPack : One with, one without.
+def TestExprPackOne : InheritableAttr {
+  let Spellings = [Clang<"test_expr_pack">];
+  let Args = [StringArgument<"str">, VariadicExprArgument<"args">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let AcceptsExprPack = 1;
+  let Documentation = [Undocumented];
+}
+
+def TestExprPackTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_expr_pack">];
+  let Args = [StringArgument<"str">, VariadicExprArgument<"args">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_ACCEPTS_EXPR_PACK)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("GNU::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("CXX11::clang::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("C23::clang::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: #endif // CLANG_ATTR_ACCEPTS_EXPR_PACK
+
+
+// Test attributeIsTypeArgAttr : Same spelling, one with TypeArgument and one
+// without.
+def TestTypeOne : InheritableAttr {
+  let Spellings = [Clang<"test_type">];
+  let Args = [TypeArgument<"Hint">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+def TestTypeTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "test_type">];
+  let Args = [UnsignedArgument<"Hint">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_TYPE_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("GNU::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("CXX11::clang::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("C23::clang::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: #endif // CLANG_ATTR_TYPE_ARG_LIST
+
+// Test attributeHasStrictIdentifierArgs and
+// attributeHasStrictIdentifierArgAtIndex, one used StrictEnumParameters, the
+// other does not.
+def TestStrictEnumOne : InheritableAttr {
+  let Spellings = [Clang<"strict_enum">];
+  let StrictEnumParameters = 1;
+  let Args = [EnumArgument<"One", "OneType", /*is_string=*/true,
+                ["a", "b", "c", "d"],
+                ["A", "B", "C", "D"]>,
+              IntArgument<"Other", 1>,
+              EnumArgument<"Two", "TwoType", /*is_string=*/true,
+                ["e", "f", "g", "h"],
+                ["E", "F", "G", "H"]>];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+def TestStrictEnumTwo : InheritableAttr {
+  let Spellings = [Pragma<"", "strict_enum">];
+  let Args = [VariadicExprArgument<"Args">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("GNU::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("CXX11::clang::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("C23::clang::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: #endif // CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
diff --git a/clang/test/Tooling/clang-check-extra-arg.cpp b/clang/test/Tooling/clang-check-extra-arg.cpp
index df5fb930eedcd..488497edd4e81 100644
--- a/clang/test/Tooling/clang-check-extra-arg.cpp
+++ b/clang/test/Tooling/clang-check-extra-arg.cpp
@@ -1,4 +1,6 @@
-// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -c 2>&1 | FileCheck %s
+/// Check we do not report "argument unused during compilation: '-c'"
+// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -c 2>&1 | FileCheck %s --implicit-check-not='argument unused'
+// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -S -Xclang -S 2>&1 | FileCheck %s --implicit-check-not='argument unused'
 
 // CHECK: unknown warning option '-Wunimplemented-warning-before'
 // CHECK: unknown warning option '-Wunimplemented-warning'
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index 2e0fbc2c9e1dd..2bd7501136a10 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -95,6 +95,7 @@
     "llvm-ifs",
     "yaml2obj",
     "clang-linker-wrapper",
+    "clang-nvlink-wrapper",
     "llvm-lto",
     "llvm-lto2",
     "llvm-profdata",
diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt
index 1a2694b97fd9e..7f878b860abf8 100644
--- a/clang/tools/CMakeLists.txt
+++ b/clang/tools/CMakeLists.txt
@@ -10,6 +10,7 @@ add_clang_subdirectory(clang-fuzzer)
 add_clang_subdirectory(clang-import-test)
 add_clang_subdirectory(clang-nvlink-wrapper)
 add_clang_subdirectory(clang-linker-wrapper)
+add_clang_subdirectory(clang-nvlink-wrapper)
 add_clang_subdirectory(clang-offload-packager)
 add_clang_subdirectory(clang-offload-bundler)
 add_clang_subdirectory(clang-offload-deps)
diff --git a/clang/tools/c-index-test/CMakeLists.txt b/clang/tools/c-index-test/CMakeLists.txt
index 0ae1b4e55244e..24e7c9692ca56 100644
--- a/clang/tools/c-index-test/CMakeLists.txt
+++ b/clang/tools/c-index-test/CMakeLists.txt
@@ -27,7 +27,6 @@ else()
     libclang
     clangAST
     clangBasic
-    clangCodeGen
     clangFrontend
     clangIndex
     clangSerialization
diff --git a/clang/tools/c-index-test/core_main.cpp b/clang/tools/c-index-test/core_main.cpp
index c552466c9a188..003b1baef3a25 100644
--- a/clang/tools/c-index-test/core_main.cpp
+++ b/clang/tools/c-index-test/core_main.cpp
@@ -8,7 +8,6 @@
 
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/LangOptions.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -19,6 +18,7 @@
 #include "clang/Index/USRGeneration.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Serialization/ASTReader.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/PrettyStackTrace.h"
diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp
index 0524becf4f484..fa6dd06a1ee58 100644
--- a/clang/tools/clang-check/ClangCheck.cpp
+++ b/clang/tools/clang-check/ClangCheck.cpp
@@ -16,7 +16,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/ASTConsumer.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
 #include "clang/Driver/Options.h"
 #include "clang/Frontend/ASTConsumers.h"
 #include "clang/Frontend/CompilerInstance.h"
diff --git a/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt b/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
index ee4aa587ea54d..6b72b98f5e1c4 100644
--- a/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
+++ b/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
@@ -1,3 +1,4 @@
+set(CMAKE_CXX_FLAGS ${CXX_FLAGS_NOFUZZ})
 add_clang_executable(clang-fuzzer-dictionary
   dictionary.c
   )
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index edb97cf431d1e..eb37fa583d63a 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -332,6 +332,14 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
   return *Path;
 }
 
+/// We will defer LTO to the target's linker if we are not doing JIT and it is
+/// supported by the toolchain.
+bool linkerSupportsLTO(const ArgList &Args) {
+  llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
+  return Triple.isNVPTX() || Triple.isAMDGPU() ||
+         Args.getLastArgValue(OPT_linker_path_EQ).ends_with("ld.lld");
+}
+
 /// Returns the hashed value for a constant string.
 std::string getHash(StringRef Str) {
   llvm::MD5 Hasher;
@@ -875,8 +883,7 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args,
             ",+SPV_INTEL_tensor_float32_conversion"
             ",+SPV_INTEL_optnone"
             ",+SPV_KHR_non_semantic_info"
-            ",+SPV_KHR_cooperative_matrix"
-            ",+SPV_INTEL_memory_access_aliasing";
+            ",+SPV_KHR_cooperative_matrix";
   if (IsCPU)
     ExtArg += ",+SPV_INTEL_fp_max_error";
   TranslatorArgs.push_back(Args.MakeArgString(ExtArg));
@@ -1388,6 +1395,13 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
       Args.MakeArgString("-" + OptLevel),
   };
 
+  // Forward all of the `--offload-opt` and similar options to the device.
+  if (linkerSupportsLTO(Args)) {
+    for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm))
+      CmdArgs.push_back(
+          Args.MakeArgString("-Wl,--plugin-opt=" + StringRef(Arg->getValue())));
+  }
+
   if (!Triple.isNVPTX())
     CmdArgs.push_back("-Wl,--no-undefined");
 
@@ -1428,11 +1442,10 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
     llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
   }
 
-  // Pass on -mllvm options to the clang invocation.
-  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
-    CmdArgs.push_back("-mllvm");
-    CmdArgs.push_back(Arg->getValue());
-  }
+  // Pass on -mllvm options to the linker invocation.
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
+    CmdArgs.push_back(
+        Args.MakeArgString("-Wl,-mllvm=" + StringRef(Arg->getValue())));
 
   if (Args.hasArg(OPT_debug))
     CmdArgs.push_back("-g");
@@ -1440,6 +1453,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
   if (SaveTemps)
     CmdArgs.push_back("-save-temps");
 
+  if (SaveTemps && linkerSupportsLTO(Args))
+    CmdArgs.push_back("-Wl,--save-temps");
+
+  if (Args.hasArg(OPT_embed_bitcode))
+    CmdArgs.push_back("-Wl,--lto-emit-llvm");
+
   if (Verbose)
     CmdArgs.push_back("-v");
 
@@ -1452,6 +1471,8 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
         std::back_inserter(CmdArgs));
 
   for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
+    CmdArgs.append({"-Xlinker", Args.MakeArgString(Arg)});
+  for (StringRef Arg : Args.getAllArgValues(OPT_compiler_arg_EQ))
     CmdArgs.push_back(Args.MakeArgString(Arg));
 
   for (StringRef Arg : Args.getAllArgValues(OPT_builtin_bitcode_EQ)) {
@@ -1460,8 +1481,8 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
                       Args.MakeArgString(Arg.split('=').second)});
   }
 
-  // The OpenMPOpt pass can introduce new calls and is expensive, we do not want
-  // this when running CodeGen through clang.
+  // The OpenMPOpt pass can introduce new calls and is expensive, we do
+  // not want this when running CodeGen through clang.
   if (Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ))
     CmdArgs.append({"-mllvm", "-openmp-opt-disable"});
 
@@ -1677,8 +1698,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
   BumpPtrAllocator Alloc;
   StringSaver Saver(Alloc);
 
-  // Search for bitcode files in the input and create an LTO input file. If it
-  // is not a bitcode file, scan its symbol table for symbols we need to save.
+  // Search for bitcode files in the input and create an LTO input file. If
+  // it is not a bitcode file, scan its symbol table for symbols we need to
+  // save.
   for (OffloadFile &File : InputFiles) {
     MemoryBufferRef Buffer = MemoryBufferRef(File.getBinary()->getImage(), "");
 
@@ -1712,7 +1734,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
         if (!Name)
           return Name.takeError();
 
-        // Record if we've seen these symbols in any object or shared libraries.
+        // Record if we've seen these symbols in any object or shared
+        // libraries.
         if ((*ObjFile)->isRelocatableObject())
           UsedInRegularObj.insert(Saver.save(*Name));
         else
@@ -1749,7 +1772,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
     return false;
   };
 
-  // We assume visibility of the whole program if every input file was bitcode.
+  // We assume visibility of the whole program if every input file was
+  // bitcode.
   auto Features = getTargetFeatures(BitcodeInputFiles);
   auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
                             Args.hasArg(OPT_builtin_bitcode_EQ) ||
@@ -1757,9 +1781,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
                         ? createLTO(Args, Features, OutputBitcode)
                         : createLTO(Args, Features);
 
-  // We need to resolve the symbols so the LTO backend knows which symbols need
-  // to be kept or can be internalized. This is a simplified symbol resolution
-  // scheme to approximate the full resolution a linker would do.
+  // We need to resolve the symbols so the LTO backend knows which symbols
+  // need to be kept or can be internalized. This is a simplified symbol
+  // resolution scheme to approximate the full resolution a linker would do.
   uint64_t Idx = 0;
   DenseSet<StringRef> PrevailingSymbols;
   for (auto &BitcodeInput : BitcodeInputFiles) {
@@ -1791,7 +1815,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
       // We need LTO to preseve the following global symbols:
       // 1) Symbols used in regular objects.
       // 2) Sections that will be given a __start/__stop symbol.
-      // 3) Prevailing symbols that are needed visible to external libraries.
+      // 3) Prevailing symbols that are needed visible to external
+      // libraries.
       Res.VisibleToRegularObj =
           UsedInRegularObj.contains(Sym.getName()) ||
           isValidCIdentifier(Sym.getSectionName()) ||
@@ -1806,9 +1831,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
           (UsedInSharedLib.contains(Sym.getName()) ||
            !Sym.canBeOmittedFromSymbolTable());
 
-      // The final definition will reside in this linkage unit if the symbol is
-      // defined and local to the module. This only checks for bitcode files,
-      // full assertion will require complete symbol resolution.
+      // The final definition will reside in this linkage unit if the symbol
+      // is defined and local to the module. This only checks for bitcode
+      // files, full assertion will require complete symbol resolution.
       Res.FinalDefinitionInLinkageUnit =
           Sym.getVisibility() != GlobalValue::DefaultVisibility &&
           (!Sym.isUndefined() && !Sym.isCommon());
@@ -1861,8 +1886,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
     return Error::success();
   }
 
-  // Append the new inputs to the device linker input. If the user requested an
-  // internalizing link we need to pass the bitcode to clang.
+  // Append the new inputs to the device linker input. If the user requested
+  // an internalizing link we need to pass the bitcode to clang.
   for (StringRef File :
        Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ)
            ? BitcodeOutput
@@ -2077,8 +2102,8 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
                    Tbl.getOption(OPT_sycl_backend_link_options_from_image_EQ),
                    Args.MakeArgString(Bin->getString(LINK_OPTS)));
 
-  // If every input file is bitcode we have whole program visibility as we do
-  // only support static linking with bitcode.
+  // If every input file is bitcode we have whole program visibility as we
+  // do only support static linking with bitcode.
   auto ContainsBitcode = [](const OffloadFile &F) {
     return identify_magic(F.getBinary()->getImage()) == file_magic::bitcode;
   };
@@ -2088,7 +2113,12 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
   // Forward '-Xoffload-linker' options to the appropriate backend.
   for (StringRef Arg : Args.getAllArgValues(OPT_device_linker_args_EQ)) {
     auto [Triple, Value] = Arg.split('=');
-    if (Value.empty())
+    llvm::Triple TT(Triple);
+    // If this isn't a recognized triple then it's an `arg=value` option.
+    if (TT.getArch() == Triple::ArchType::UnknownArch)
+      DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_linker_arg_EQ),
+                       Args.MakeArgString(Arg));
+    else if (Value.empty())
       DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_linker_arg_EQ),
                        Args.MakeArgString(Triple));
     else if (Triple == DAL.getLastArgValue(OPT_triple_EQ))
@@ -2096,6 +2126,22 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
                        Args.MakeArgString(Value));
   }
 
+  // Forward '-Xoffload-compiler' options to the appropriate backend.
+  for (StringRef Arg : Args.getAllArgValues(OPT_device_compiler_args_EQ)) {
+    auto [Triple, Value] = Arg.split('=');
+    llvm::Triple TT(Triple);
+    // If this isn't a recognized triple then it's an `arg=value` option.
+    if (TT.getArch() == Triple::ArchType::UnknownArch)
+      DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+                       Args.MakeArgString(Arg));
+    else if (Value.empty())
+      DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+                       Args.MakeArgString(Triple));
+    else if (Triple == DAL.getLastArgValue(OPT_triple_EQ))
+      DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+                       Args.MakeArgString(Value));
+  }
+
   return DAL;
 }
 
@@ -2174,6 +2220,22 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
       else
         HasNonSYCLOffloadKinds = true;
     }
+
+    // First link and remove all the input files containing bitcode if
+    // the target linker does not support it natively.
+    SmallVector<StringRef> InputFiles;
+    if (!linkerSupportsLTO(LinkerArgs))
+      if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
+        return Err;
+
+    // Write any remaining device inputs to an output file for the
+    // linker.
+    for (const OffloadFile &File : Input) {
+      auto FileNameOrErr = writeOffloadFile(File);
+      if (!FileNameOrErr)
+        return FileNameOrErr.takeError();
+      InputFiles.emplace_back(*FileNameOrErr);
+    }
     if (HasSYCLOffloadKind) {
       SmallVector<StringRef> InputFiles;
       // Write device inputs to an output file for the linker.
@@ -2248,6 +2310,7 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
           SplitModules[I].ModuleFilePath = *ClangOutputOrErr;
         }
       }
+
       // TODO(NOM7): Remove this call and use community flow for bundle/wrap
       auto OutputFile = sycl::runWrapperAndCompile(SplitModules, LinkerArgs);
       if (!OutputFile)
@@ -2403,12 +2466,14 @@ Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
       bool NewSymbol = Syms.count(Sym.getName()) == 0;
       auto OldSym = NewSymbol ? Sym_None : Syms[Sym.getName()];
 
-      // We will extract if it defines a currenlty undefined non-weak symbol.
+      // We will extract if it defines a currenlty undefined non-weak
+      // symbol.
       bool ResolvesStrongReference =
           ((OldSym & Sym_Undefined && !(OldSym & Sym_Weak)) &&
            !Sym.isUndefined());
-      // We will extract if it defines a new global symbol visible to the host.
-      // This is only necessary for code targeting an offloading language.
+      // We will extract if it defines a new global symbol visible to the
+      // host. This is only necessary for code targeting an offloading
+      // language.
       bool NewGlobalSymbol =
           ((NewSymbol || (OldSym & Sym_Undefined)) && !Sym.isUndefined() &&
            !Sym.canBeOmittedFromSymbolTable() && Kind != object::OFK_None &&
@@ -2463,8 +2528,9 @@ Expected<bool> getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind,
                                    !(OldSym & Sym_Weak) &&
                                    !(*FlagsOrErr & SymbolRef::SF_Undefined);
 
-    // We will extract if it defines a new global symbol visible to the host.
-    // This is only necessary for code targeting an offloading language.
+    // We will extract if it defines a new global symbol visible to the
+    // host. This is only necessary for code targeting an offloading
+    // language.
     bool NewGlobalSymbol =
         ((NewSymbol || (OldSym & Sym_Undefined)) &&
          !(*FlagsOrErr & SymbolRef::SF_Undefined) && Kind != object::OFK_None &&
@@ -2629,8 +2695,8 @@ getDeviceInput(const ArgList &Args) {
 
         Expected<bool> ExtractOrErr =
             getSymbols(Binary.getBinary()->getImage(),
-                       Binary.getBinary()->getOffloadKind(), /*IsArchive=*/true,
-                       Saver, Syms[ID]);
+                       Binary.getBinary()->getOffloadKind(),
+                       /*IsArchive=*/true, Saver, Syms[ID]);
         if (!ExtractOrErr)
           return ExtractOrErr.takeError();
 
@@ -2709,7 +2775,7 @@ int main(int Argc, char **Argv) {
   for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
     NewArgv.push_back(Arg->getValue());
   for (const opt::Arg *Arg : Args.filtered(OPT_offload_opt_eq_minus))
-    NewArgv.push_back(Args.MakeArgString(StringRef("-") + Arg->getValue()));
+    NewArgv.push_back(Arg->getValue());
   SmallVector<PassPlugin, 1> PluginList;
   PassPlugins.setCallback([&](const std::string &PluginPath) {
     auto Plugin = PassPlugin::Load(PluginPath);
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index c9effdacdda31..2926a08c87591 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -32,6 +32,9 @@ def builtin_bitcode_EQ : Joined<["--"], "builtin-bitcode=">,
 def device_linker_args_EQ : Joined<["--"], "device-linker=">,
   Flags<[WrapperOnlyOption]>, MetaVarName<"<value> or <triple>=<value>">,
   HelpText<"Arguments to pass to the device linker invocation">;
+def device_compiler_args_EQ : Joined<["--"], "device-compiler=">,
+  Flags<[WrapperOnlyOption]>, MetaVarName<"<value> or <triple>=<value>">,
+  HelpText<"Arguments to pass to the device compiler invocation">;
 def clang_backend : Flag<["--"], "clang-backend">,
   Flags<[WrapperOnlyOption]>,
   HelpText<"Run the backend using clang rather than the LTO backend">;
@@ -91,6 +94,9 @@ def whole_program : Flag<["--"], "whole-program">,
 def linker_arg_EQ : Joined<["--"], "linker-arg=">,
   Flags<[DeviceOnlyOption, HelpHidden]>,
   HelpText<"An extra argument to be passed to the linker">;
+def compiler_arg_EQ : Joined<["--"], "compiler-arg=">,
+  Flags<[DeviceOnlyOption, HelpHidden]>,
+  HelpText<"An extra argument to be passed to the compiler">;
 
 // Arguments for the LLVM backend.
 def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
@@ -98,7 +104,7 @@ def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
   HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
            "the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
            "of options.">;
-def offload_opt_eq_minus : Joined<["--", "-"], "offload-opt=-">, Flags<[HelpHidden, WrapperOnlyOption]>,
+def offload_opt_eq_minus : Joined<["--", "-"], "offload-opt=">, Flags<[HelpHidden, WrapperOnlyOption]>,
   HelpText<"Options passed to LLVM, not including the Clang invocation. Use "
            "'--offload-opt=--help' for a list of options.">;
 
diff --git a/clang/tools/clang-nvlink-wrapper/CMakeLists.txt b/clang/tools/clang-nvlink-wrapper/CMakeLists.txt
index 2c979e5097958..846fa952ba58d 100644
--- a/clang/tools/clang-nvlink-wrapper/CMakeLists.txt
+++ b/clang/tools/clang-nvlink-wrapper/CMakeLists.txt
@@ -1,11 +1,32 @@
-set(LLVM_LINK_COMPONENTS BitWriter Core Object Support)
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  BitWriter
+  Core
+  BinaryFormat
+  MC
+  Target
+  TransformUtils
+  Analysis
+  Passes
+  IRReader
+  Object
+  Option
+  Support
+  TargetParser
+  CodeGen
+  LTO
+  )
+
+set(LLVM_TARGET_DEFINITIONS NVLinkOpts.td)
+tablegen(LLVM NVLinkOpts.inc -gen-opt-parser-defs)
+add_public_tablegen_target(NVLinkWrapperOpts)
 
 if(NOT CLANG_BUILT_STANDALONE)
-  set(tablegen_deps intrinsics_gen)
+  set(tablegen_deps intrinsics_gen NVLinkWrapperOpts)
 endif()
 
-add_clang_executable(clang-nvlink-wrapper
-  ClangNvlinkWrapper.cpp
+add_clang_tool(clang-nvlink-wrapper
+  ClangNVLinkWrapper.cpp
 
   DEPENDS
   ${tablegen_deps}
@@ -15,11 +36,7 @@ set(CLANG_NVLINK_WRAPPER_LIB_DEPS
   clangBasic
   )
 
-add_dependencies(clang clang-nvlink-wrapper)
-
 target_link_libraries(clang-nvlink-wrapper
   PRIVATE
   ${CLANG_NVLINK_WRAPPER_LIB_DEPS}
   )
-
-install(TARGETS clang-nvlink-wrapper RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
new file mode 100644
index 0000000000000..7851414ba7f4d
--- /dev/null
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -0,0 +1,793 @@
+//===-- clang-nvlink-wrapper/ClangNVLinkWrapper.cpp - NVIDIA linker util --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This tool wraps around the NVIDIA linker called 'nvlink'. The NVIDIA linker
+// is required to create NVPTX applications, but does not support common
+// features like LTO or archives. This utility wraps around the tool to cover
+// its deficiencies. This tool can be removed once NVIDIA improves their linker
+// or ports it to `ld.lld`.
+//
+//===---------------------------------------------------------------------===//
+
+#include "clang/Basic/Version.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/LTO/LTO.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Remarks/HotnessThresholdParser.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
+
+using namespace llvm;
+using namespace llvm::opt;
+using namespace llvm::object;
+
+// Various tools (e.g., llc and opt) duplicate this series of declarations for
+// options related to passes and remarks.
+static cl::opt<bool> RemarksWithHotness(
+    "pass-remarks-with-hotness",
+    cl::desc("With PGO, include profile count in optimization remarks"),
+    cl::Hidden);
+
+static cl::opt<std::optional<uint64_t>, false, remarks::HotnessThresholdParser>
+    RemarksHotnessThreshold(
+        "pass-remarks-hotness-threshold",
+        cl::desc("Minimum profile count required for "
+                 "an optimization remark to be output. "
+                 "Use 'auto' to apply the threshold from profile summary."),
+        cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden);
+
+static cl::opt<std::string>
+    RemarksFilename("pass-remarks-output",
+                    cl::desc("Output filename for pass remarks"),
+                    cl::value_desc("filename"));
+
+static cl::opt<std::string>
+    RemarksPasses("pass-remarks-filter",
+                  cl::desc("Only record optimization remarks from passes whose "
+                           "names match the given regular expression"),
+                  cl::value_desc("regex"));
+
+static cl::opt<std::string> RemarksFormat(
+    "pass-remarks-format",
+    cl::desc("The format used for serializing remarks (default: YAML)"),
+    cl::value_desc("format"), cl::init("yaml"));
+
+static cl::list<std::string>
+    PassPlugins("load-pass-plugin",
+                cl::desc("Load passes from plugin library"));
+
+static cl::opt<std::string> PassPipeline(
+    "passes",
+    cl::desc(
+        "A textual description of the pass pipeline. To have analysis passes "
+        "available before a certain pass, add 'require<foo-analysis>'. "
+        "'-passes' overrides the pass pipeline (but not all effects) from "
+        "specifying '--opt-level=O?' (O2 is the default) to "
+        "clang-linker-wrapper.  Be sure to include the corresponding "
+        "'default<O?>' in '-passes'."));
+static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline),
+                               cl::desc("Alias for -passes"));
+
+static void printVersion(raw_ostream &OS) {
+  OS << clang::getClangToolFullVersion("clang-nvlink-wrapper") << '\n';
+}
+
+/// The value of `argv[0]` when run.
+static const char *Executable;
+
+/// Temporary files to be cleaned up.
+static SmallVector<SmallString<128>> TempFiles;
+
+/// Codegen flags for LTO backend.
+static codegen::RegisterCodeGenFlags CodeGenFlags;
+
+namespace {
+// Must not overlap with llvm::opt::DriverFlag.
+enum WrapperFlags { WrapperOnlyOption = (1 << 4) };
+
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
+#include "NVLinkOpts.inc"
+  LastOption
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE)                                                    \
+  static constexpr StringLiteral NAME##_init[] = VALUE;                        \
+  static constexpr ArrayRef<StringLiteral> NAME(NAME##_init,                   \
+                                                std::size(NAME##_init) - 1);
+#include "NVLinkOpts.inc"
+#undef PREFIX
+
+static constexpr OptTable::Info InfoTable[] = {
+#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
+#include "NVLinkOpts.inc"
+#undef OPTION
+};
+
+class WrapperOptTable : public opt::GenericOptTable {
+public:
+  WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
+};
+
+const OptTable &getOptTable() {
+  static const WrapperOptTable *Table = []() {
+    auto Result = std::make_unique<WrapperOptTable>();
+    return Result.release();
+  }();
+  return *Table;
+}
+
+[[noreturn]] void reportError(Error E) {
+  outs().flush();
+  logAllUnhandledErrors(std::move(E), WithColor::error(errs(), Executable));
+  exit(EXIT_FAILURE);
+}
+
+void diagnosticHandler(const DiagnosticInfo &DI) {
+  std::string ErrStorage;
+  raw_string_ostream OS(ErrStorage);
+  DiagnosticPrinterRawOStream DP(OS);
+  DI.print(DP);
+
+  switch (DI.getSeverity()) {
+  case DS_Error:
+    WithColor::error(errs(), Executable) << ErrStorage << "\n";
+    break;
+  case DS_Warning:
+    WithColor::warning(errs(), Executable) << ErrStorage << "\n";
+    break;
+  case DS_Note:
+    WithColor::note(errs(), Executable) << ErrStorage << "\n";
+    break;
+  case DS_Remark:
+    WithColor::remark(errs()) << ErrStorage << "\n";
+    break;
+  }
+}
+
+Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix,
+                                   StringRef Extension) {
+  SmallString<128> OutputFile;
+  if (Args.hasArg(OPT_save_temps)) {
+    (Prefix + "." + Extension).toNullTerminatedStringRef(OutputFile);
+  } else {
+    if (std::error_code EC =
+            sys::fs::createTemporaryFile(Prefix, Extension, OutputFile))
+      return createFileError(OutputFile, EC);
+  }
+
+  TempFiles.emplace_back(std::move(OutputFile));
+  return TempFiles.back();
+}
+
+Expected<std::string> findProgram(const ArgList &Args, StringRef Name,
+                                  ArrayRef<StringRef> Paths) {
+  if (Args.hasArg(OPT_dry_run))
+    return Name.str();
+  ErrorOr<std::string> Path = sys::findProgramByName(Name, Paths);
+  if (!Path)
+    Path = sys::findProgramByName(Name);
+  if (!Path)
+    return createStringError(Path.getError(),
+                             "Unable to find '" + Name + "' in path");
+  return *Path;
+}
+
+std::optional<std::string> findFile(StringRef Dir, StringRef Root,
+                                    const Twine &Name) {
+  SmallString<128> Path;
+  if (Dir.starts_with("="))
+    sys::path::append(Path, Root, Dir.substr(1), Name);
+  else
+    sys::path::append(Path, Dir, Name);
+
+  if (sys::fs::exists(Path))
+    return static_cast<std::string>(Path);
+  return std::nullopt;
+}
+
+std::optional<std::string>
+findFromSearchPaths(StringRef Name, StringRef Root,
+                    ArrayRef<StringRef> SearchPaths) {
+  for (StringRef Dir : SearchPaths)
+    if (std::optional<std::string> File = findFile(Dir, Root, Name))
+      return File;
+  return std::nullopt;
+}
+
+std::optional<std::string>
+searchLibraryBaseName(StringRef Name, StringRef Root,
+                      ArrayRef<StringRef> SearchPaths) {
+  for (StringRef Dir : SearchPaths)
+    if (std::optional<std::string> File =
+            findFile(Dir, Root, "lib" + Name + ".a"))
+      return File;
+  return std::nullopt;
+}
+
+/// Search for static libraries in the linker's library path given input like
+/// `-lfoo` or `-l:libfoo.a`.
+std::optional<std::string> searchLibrary(StringRef Input, StringRef Root,
+                                         ArrayRef<StringRef> SearchPaths) {
+  if (Input.starts_with(":"))
+    return findFromSearchPaths(Input.drop_front(), Root, SearchPaths);
+  return searchLibraryBaseName(Input, Root, SearchPaths);
+}
+
+void printCommands(ArrayRef<StringRef> CmdArgs) {
+  if (CmdArgs.empty())
+    return;
+
+  llvm::errs() << " \"" << CmdArgs.front() << "\" ";
+  llvm::errs() << llvm::join(std::next(CmdArgs.begin()), CmdArgs.end(), " ")
+               << "\n";
+}
+
+/// A minimum symbol interface that provides the necessary information to
+/// extract archive members and resolve LTO symbols.
+struct Symbol {
+  enum Flags {
+    None = 0,
+    Undefined = 1 << 0,
+    Weak = 1 << 1,
+  };
+
+  Symbol() : File(), Flags(None), UsedInRegularObj(false) {}
+
+  Symbol(MemoryBufferRef File, const irsymtab::Reader::SymbolRef Sym)
+      : File(File), Flags(0), UsedInRegularObj(false) {
+    if (Sym.isUndefined())
+      Flags |= Undefined;
+    if (Sym.isWeak())
+      Flags |= Weak;
+  }
+
+  Symbol(MemoryBufferRef File, const SymbolRef Sym)
+      : File(File), Flags(0), UsedInRegularObj(false) {
+    auto FlagsOrErr = Sym.getFlags();
+    if (!FlagsOrErr)
+      reportError(FlagsOrErr.takeError());
+    if (*FlagsOrErr & SymbolRef::SF_Undefined)
+      Flags |= Undefined;
+    if (*FlagsOrErr & SymbolRef::SF_Weak)
+      Flags |= Weak;
+
+    auto NameOrErr = Sym.getName();
+    if (!NameOrErr)
+      reportError(NameOrErr.takeError());
+  }
+
+  bool isWeak() const { return Flags & Weak; }
+  bool isUndefined() const { return Flags & Undefined; }
+
+  MemoryBufferRef File;
+  uint32_t Flags;
+  bool UsedInRegularObj;
+};
+
+Expected<StringRef> runPTXAs(StringRef File, const ArgList &Args) {
+  std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str();
+  std::string GivenPath = Args.getLastArgValue(OPT_ptxas_path_EQ).str();
+  Expected<std::string> PTXAsPath =
+      findProgram(Args, "ptxas", {CudaPath + "/bin", GivenPath});
+  if (!PTXAsPath)
+    return PTXAsPath.takeError();
+  if (!Args.hasArg(OPT_arch))
+    return createStringError(
+        "must pass in an explicit nvptx64 gpu architecture to 'ptxas'");
+
+  auto TempFileOrErr = createTempFile(
+      Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "cubin");
+  if (!TempFileOrErr)
+    return TempFileOrErr.takeError();
+
+  SmallVector<StringRef> AssemblerArgs({*PTXAsPath, "-m64", "-c", File});
+  if (Args.hasArg(OPT_verbose))
+    AssemblerArgs.push_back("-v");
+  if (Args.hasArg(OPT_g)) {
+    if (Args.hasArg(OPT_O))
+      WithColor::warning(errs(), Executable)
+          << "Optimized debugging not supported, overriding to '-O0'\n";
+    AssemblerArgs.push_back("-O0");
+  } else
+    AssemblerArgs.push_back(
+        Args.MakeArgString("-O" + Args.getLastArgValue(OPT_O, "3")));
+  AssemblerArgs.append({"-arch", Args.getLastArgValue(OPT_arch)});
+  AssemblerArgs.append({"-o", *TempFileOrErr});
+
+  if (Args.hasArg(OPT_dry_run) || Args.hasArg(OPT_verbose))
+    printCommands(AssemblerArgs);
+  if (Args.hasArg(OPT_dry_run))
+    return Args.MakeArgString(*TempFileOrErr);
+  if (sys::ExecuteAndWait(*PTXAsPath, AssemblerArgs))
+    return createStringError("'" + sys::path::filename(*PTXAsPath) + "'" +
+                             " failed");
+  return Args.MakeArgString(*TempFileOrErr);
+}
+
+Expected<std::unique_ptr<lto::LTO>> createLTO(const ArgList &Args) {
+  const llvm::Triple Triple("nvptx64-nvidia-cuda");
+  lto::Config Conf;
+  lto::ThinBackend Backend;
+  unsigned Jobs = 0;
+  if (auto *Arg = Args.getLastArg(OPT_jobs))
+    if (!llvm::to_integer(Arg->getValue(), Jobs) || Jobs == 0)
+      reportError(createStringError("%s: expected a positive integer, got '%s'",
+                                    Arg->getSpelling().data(),
+                                    Arg->getValue()));
+  Backend = lto::createInProcessThinBackend(
+      llvm::heavyweight_hardware_concurrency(Jobs));
+
+  Conf.CPU = Args.getLastArgValue(OPT_arch);
+  Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
+
+  Conf.RemarksFilename = RemarksFilename;
+  Conf.RemarksPasses = RemarksPasses;
+  Conf.RemarksWithHotness = RemarksWithHotness;
+  Conf.RemarksHotnessThreshold = RemarksHotnessThreshold;
+  Conf.RemarksFormat = RemarksFormat;
+
+  Conf.MAttrs = {Args.getLastArgValue(OPT_feature, "").str()};
+  std::optional<CodeGenOptLevel> CGOptLevelOrNone =
+      CodeGenOpt::parseLevel(Args.getLastArgValue(OPT_O, "2")[0]);
+  assert(CGOptLevelOrNone && "Invalid optimization level");
+  Conf.CGOptLevel = *CGOptLevelOrNone;
+  Conf.OptLevel = Args.getLastArgValue(OPT_O, "2")[0] - '0';
+  Conf.DefaultTriple = Triple.getTriple();
+
+  Conf.OptPipeline = PassPipeline;
+  Conf.PassPlugins = PassPlugins;
+
+  Conf.DiagHandler = diagnosticHandler;
+  Conf.CGFileType = CodeGenFileType::AssemblyFile;
+
+  if (Args.hasArg(OPT_lto_emit_llvm)) {
+    Conf.PreCodeGenModuleHook = [&](size_t, const Module &M) {
+      std::error_code EC;
+      raw_fd_ostream LinkedBitcode(Args.getLastArgValue(OPT_o, "a.out"), EC);
+      if (EC)
+        reportError(errorCodeToError(EC));
+      WriteBitcodeToFile(M, LinkedBitcode);
+      return false;
+    };
+  }
+
+  if (Args.hasArg(OPT_save_temps))
+    if (Error Err = Conf.addSaveTemps(
+            (Args.getLastArgValue(OPT_o, "a.out") + ".").str()))
+      return Err;
+
+  unsigned Partitions = 1;
+  if (auto *Arg = Args.getLastArg(OPT_lto_partitions))
+    if (!llvm::to_integer(Arg->getValue(), Partitions) || Partitions == 0)
+      reportError(createStringError("%s: expected a positive integer, got '%s'",
+                                    Arg->getSpelling().data(),
+                                    Arg->getValue()));
+  lto::LTO::LTOKind Kind = Args.hasArg(OPT_thinlto) ? lto::LTO::LTOK_UnifiedThin
+                                                    : lto::LTO::LTOK_Default;
+  return std::make_unique<lto::LTO>(std::move(Conf), Backend, Partitions, Kind);
+}
+
+Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer,
+                                     StringMap<Symbol> &SymTab, bool IsLazy) {
+  Expected<IRSymtabFile> IRSymtabOrErr = readIRSymtab(Buffer);
+  if (!IRSymtabOrErr)
+    return IRSymtabOrErr.takeError();
+  bool Extracted = !IsLazy;
+  StringMap<Symbol> PendingSymbols;
+  for (unsigned I = 0; I != IRSymtabOrErr->Mods.size(); ++I) {
+    for (const auto &IRSym : IRSymtabOrErr->TheReader.module_symbols(I)) {
+      if (IRSym.isFormatSpecific() || !IRSym.isGlobal())
+        continue;
+
+      Symbol &OldSym = !SymTab.count(IRSym.getName()) && IsLazy
+                           ? PendingSymbols[IRSym.getName()]
+                           : SymTab[IRSym.getName()];
+      Symbol Sym = Symbol(Buffer, IRSym);
+      if (OldSym.File.getBuffer().empty())
+        OldSym = Sym;
+
+      bool ResolvesReference =
+          !Sym.isUndefined() &&
+          (OldSym.isUndefined() || (OldSym.isWeak() && !Sym.isWeak())) &&
+          !(OldSym.isWeak() && OldSym.isUndefined() && IsLazy);
+      Extracted |= ResolvesReference;
+
+      Sym.UsedInRegularObj = OldSym.UsedInRegularObj;
+      if (ResolvesReference)
+        OldSym = Sym;
+    }
+  }
+  if (Extracted)
+    for (const auto &[Name, Symbol] : PendingSymbols)
+      SymTab[Name] = Symbol;
+  return Extracted;
+}
+
+Expected<bool> getSymbolsFromObject(ObjectFile &ObjFile,
+                                    StringMap<Symbol> &SymTab, bool IsLazy) {
+  bool Extracted = !IsLazy;
+  StringMap<Symbol> PendingSymbols;
+  for (SymbolRef ObjSym : ObjFile.symbols()) {
+    auto NameOrErr = ObjSym.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+
+    Symbol &OldSym = !SymTab.count(*NameOrErr) && IsLazy
+                         ? PendingSymbols[*NameOrErr]
+                         : SymTab[*NameOrErr];
+    Symbol Sym = Symbol(ObjFile.getMemoryBufferRef(), ObjSym);
+    if (OldSym.File.getBuffer().empty())
+      OldSym = Sym;
+
+    bool ResolvesReference = OldSym.isUndefined() && !Sym.isUndefined() &&
+                             (!OldSym.isWeak() || !IsLazy);
+    Extracted |= ResolvesReference;
+
+    if (ResolvesReference)
+      OldSym = Sym;
+    OldSym.UsedInRegularObj = true;
+  }
+  if (Extracted)
+    for (const auto &[Name, Symbol] : PendingSymbols)
+      SymTab[Name] = Symbol;
+  return Extracted;
+}
+
+Expected<bool> getSymbols(MemoryBufferRef Buffer, StringMap<Symbol> &SymTab,
+                          bool IsLazy) {
+  switch (identify_magic(Buffer.getBuffer())) {
+  case file_magic::bitcode: {
+    return getSymbolsFromBitcode(Buffer, SymTab, IsLazy);
+  }
+  case file_magic::elf_relocatable: {
+    Expected<std::unique_ptr<ObjectFile>> ObjFile =
+        ObjectFile::createObjectFile(Buffer);
+    if (!ObjFile)
+      return ObjFile.takeError();
+    return getSymbolsFromObject(**ObjFile, SymTab, IsLazy);
+  }
+  default:
+    return createStringError("Unsupported file type");
+  }
+}
+
+Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
+  SmallVector<StringRef> LibraryPaths;
+  for (const opt::Arg *Arg : Args.filtered(OPT_library_path))
+    LibraryPaths.push_back(Arg->getValue());
+
+  bool WholeArchive = false;
+  SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles;
+  for (const opt::Arg *Arg : Args.filtered(
+           OPT_INPUT, OPT_library, OPT_whole_archive, OPT_no_whole_archive)) {
+    if (Arg->getOption().matches(OPT_whole_archive) ||
+        Arg->getOption().matches(OPT_no_whole_archive)) {
+      WholeArchive = Arg->getOption().matches(OPT_whole_archive);
+      continue;
+    }
+
+    std::optional<std::string> Filename =
+        Arg->getOption().matches(OPT_library)
+            ? searchLibrary(Arg->getValue(), /*Root=*/"", LibraryPaths)
+            : std::string(Arg->getValue());
+
+    if (!Filename && Arg->getOption().matches(OPT_library))
+      return createStringError("unable to find library -l%s", Arg->getValue());
+
+    if (!Filename || !sys::fs::exists(*Filename) ||
+        sys::fs::is_directory(*Filename))
+      continue;
+
+    ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+        MemoryBuffer::getFileOrSTDIN(*Filename);
+    if (std::error_code EC = BufferOrErr.getError())
+      return createFileError(*Filename, EC);
+
+    MemoryBufferRef Buffer = **BufferOrErr;
+    switch (identify_magic(Buffer.getBuffer())) {
+    case file_magic::bitcode:
+    case file_magic::elf_relocatable:
+      InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false);
+      break;
+    case file_magic::archive: {
+      Expected<std::unique_ptr<llvm::object::Archive>> LibFile =
+          object::Archive::create(Buffer);
+      if (!LibFile)
+        return LibFile.takeError();
+      Error Err = Error::success();
+      for (auto Child : (*LibFile)->children(Err)) {
+        auto ChildBufferOrErr = Child.getMemoryBufferRef();
+        if (!ChildBufferOrErr)
+          return ChildBufferOrErr.takeError();
+        std::unique_ptr<MemoryBuffer> ChildBuffer =
+            MemoryBuffer::getMemBufferCopy(
+                ChildBufferOrErr->getBuffer(),
+                ChildBufferOrErr->getBufferIdentifier());
+        InputFiles.emplace_back(std::move(ChildBuffer), !WholeArchive);
+      }
+      if (Err)
+        return Err;
+      break;
+    }
+    default:
+      return createStringError("Unsupported file type");
+    }
+  }
+
+  bool Extracted = true;
+  StringMap<Symbol> SymTab;
+  SmallVector<std::unique_ptr<MemoryBuffer>> LinkerInput;
+  while (Extracted) {
+    Extracted = false;
+    for (auto &[Input, IsLazy] : InputFiles) {
+      if (!Input)
+        continue;
+
+      // Archive members only extract if they define needed symbols. We will
+      // re-scan all the inputs if any files were extracted for the link job.
+      Expected<bool> ExtractOrErr = getSymbols(*Input, SymTab, IsLazy);
+      if (!ExtractOrErr)
+        return ExtractOrErr.takeError();
+
+      Extracted |= *ExtractOrErr;
+      if (!*ExtractOrErr)
+        continue;
+
+      LinkerInput.emplace_back(std::move(Input));
+    }
+  }
+  InputFiles.clear();
+
+  // Extract any bitcode files to be passed to the LTO pipeline.
+  SmallVector<std::unique_ptr<MemoryBuffer>> BitcodeFiles;
+  for (auto &Input : LinkerInput)
+    if (identify_magic(Input->getBuffer()) == file_magic::bitcode)
+      BitcodeFiles.emplace_back(std::move(Input));
+  llvm::erase_if(LinkerInput, [](const auto &F) { return !F; });
+
+  // Run the LTO pipeline on the extracted inputs.
+  SmallVector<StringRef> Files;
+  if (!BitcodeFiles.empty()) {
+    auto LTOBackendOrErr = createLTO(Args);
+    if (!LTOBackendOrErr)
+      return LTOBackendOrErr.takeError();
+    lto::LTO &LTOBackend = **LTOBackendOrErr;
+    for (auto &BitcodeFile : BitcodeFiles) {
+      Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
+          llvm::lto::InputFile::create(*BitcodeFile);
+      if (!BitcodeFileOrErr)
+        return BitcodeFileOrErr.takeError();
+
+      const auto Symbols = (*BitcodeFileOrErr)->symbols();
+      SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
+      size_t Idx = 0;
+      for (auto &Sym : Symbols) {
+        lto::SymbolResolution &Res = Resolutions[Idx++];
+        Symbol ObjSym = SymTab[Sym.getName()];
+        // We will use this as the prevailing symbol in LTO if it is not
+        // undefined and it is from the file that contained the canonical
+        // definition.
+        Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile;
+
+        // We need LTO to preseve the following global symbols:
+        // 1) All symbols during a relocatable link.
+        // 2) Symbols used in regular objects.
+        // 3) Prevailing symbols that are needed visible to the gpu runtime.
+        Res.VisibleToRegularObj =
+            Args.hasArg(OPT_relocatable) || ObjSym.UsedInRegularObj ||
+            (Res.Prevailing &&
+             (Sym.getVisibility() != GlobalValue::HiddenVisibility &&
+              !Sym.canBeOmittedFromSymbolTable()));
+
+        // Identify symbols that must be exported dynamically and can be
+        // referenced by other files, (i.e. the runtime).
+        Res.ExportDynamic =
+            Sym.getVisibility() != GlobalValue::HiddenVisibility &&
+            !Sym.canBeOmittedFromSymbolTable();
+
+        // The NVIDIA platform does not support any symbol preemption.
+        Res.FinalDefinitionInLinkageUnit = true;
+
+        // We do not support linker redefined symbols (e.g. --wrap) for device
+        // image linking, so the symbols will not be changed after LTO.
+        Res.LinkerRedefined = false;
+      }
+
+      // Add the bitcode file with its resolved symbols to the LTO job.
+      if (Error Err = LTOBackend.add(std::move(*BitcodeFileOrErr), Resolutions))
+        return Err;
+    }
+
+    // Run the LTO job to compile the bitcode.
+    size_t MaxTasks = LTOBackend.getMaxTasks();
+    SmallVector<StringRef> LTOFiles(MaxTasks);
+    auto AddStream =
+        [&](size_t Task,
+            const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
+      int FD = -1;
+      auto &TempFile = LTOFiles[Task];
+      if (Args.hasArg(OPT_lto_emit_asm))
+        TempFile = Args.getLastArgValue(OPT_o, "a.out");
+      else {
+        auto TempFileOrErr = createTempFile(
+            Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "s");
+        if (!TempFileOrErr)
+          reportError(TempFileOrErr.takeError());
+        TempFile = Args.MakeArgString(*TempFileOrErr);
+      }
+      if (std::error_code EC = sys::fs::openFileForWrite(TempFile, FD))
+        reportError(errorCodeToError(EC));
+      return std::make_unique<CachedFileStream>(
+          std::make_unique<llvm::raw_fd_ostream>(FD, true));
+    };
+
+    if (Error Err = LTOBackend.run(AddStream))
+      return Err;
+
+    if (Args.hasArg(OPT_lto_emit_llvm) || Args.hasArg(OPT_lto_emit_asm))
+      return Files;
+
+    for (StringRef LTOFile : LTOFiles) {
+      auto FileOrErr = runPTXAs(LTOFile, Args);
+      if (!FileOrErr)
+        return FileOrErr.takeError();
+      Files.emplace_back(*FileOrErr);
+    }
+  }
+
+  // Copy all of the input files to a new file ending in `.cubin`. The 'nvlink'
+  // linker requires all NVPTX inputs to have this extension for some reason.
+  for (auto &Input : LinkerInput) {
+    auto TempFileOrErr = createTempFile(
+        Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
+    if (!TempFileOrErr)
+      return TempFileOrErr.takeError();
+    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
+        FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
+    if (!OutputOrErr)
+      return OutputOrErr.takeError();
+    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
+    llvm::copy(Input->getBuffer(), Output->getBufferStart());
+    if (Error E = Output->commit())
+      return E;
+    Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
+  }
+
+  return Files;
+}
+
+Error runNVLink(ArrayRef<StringRef> Files, const ArgList &Args) {
+  if (Args.hasArg(OPT_lto_emit_asm) || Args.hasArg(OPT_lto_emit_llvm))
+    return Error::success();
+
+  std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str();
+  Expected<std::string> NVLinkPath =
+      findProgram(Args, "nvlink", {CudaPath + "/bin"});
+  if (!NVLinkPath)
+    return NVLinkPath.takeError();
+
+  if (!Args.hasArg(OPT_arch))
+    return createStringError(
+        "must pass in an explicit nvptx64 gpu architecture to 'nvlink'");
+
+  ArgStringList NewLinkerArgs;
+  for (const opt::Arg *Arg : Args) {
+    // Do not forward arguments only intended for the linker wrapper.
+    if (Arg->getOption().hasFlag(WrapperOnlyOption))
+      continue;
+
+    // Do not forward any inputs that we have processed.
+    if (Arg->getOption().matches(OPT_INPUT) ||
+        Arg->getOption().matches(OPT_library))
+      continue;
+
+    Arg->render(Args, NewLinkerArgs);
+  }
+
+  llvm::transform(Files, std::back_inserter(NewLinkerArgs),
+                  [&](StringRef Arg) { return Args.MakeArgString(Arg); });
+
+  SmallVector<StringRef> LinkerArgs({*NVLinkPath});
+  if (!Args.hasArg(OPT_o))
+    LinkerArgs.append({"-o", "a.out"});
+  for (StringRef Arg : NewLinkerArgs)
+    LinkerArgs.push_back(Arg);
+
+  if (Args.hasArg(OPT_dry_run) || Args.hasArg(OPT_verbose))
+    printCommands(LinkerArgs);
+  if (Args.hasArg(OPT_dry_run))
+    return Error::success();
+  if (sys::ExecuteAndWait(*NVLinkPath, LinkerArgs))
+    return createStringError("'" + sys::path::filename(*NVLinkPath) + "'" +
+                             " failed");
+  return Error::success();
+}
+
+} // namespace
+
+int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
+  InitializeAllTargetInfos();
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmParsers();
+  InitializeAllAsmPrinters();
+
+  Executable = argv[0];
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+
+  const OptTable &Tbl = getOptTable();
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+  auto Args = Tbl.parseArgs(argc, argv, OPT_INVALID, Saver, [&](StringRef Err) {
+    reportError(createStringError(inconvertibleErrorCode(), Err));
+  });
+
+  if (Args.hasArg(OPT_help) || Args.hasArg(OPT_help_hidden)) {
+    Tbl.printHelp(
+        outs(), "clang-nvlink-wrapper [options] <options to passed to nvlink>",
+        "A utility that wraps around the NVIDIA 'nvlink' linker.\n"
+        "This enables static linking and LTO handling for NVPTX targets.",
+        Args.hasArg(OPT_help_hidden), Args.hasArg(OPT_help_hidden));
+    return EXIT_SUCCESS;
+  }
+
+  if (Args.hasArg(OPT_version))
+    printVersion(outs());
+
+  // This forwards '-mllvm' arguments to LLVM if present.
+  SmallVector<const char *> NewArgv = {argv[0]};
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
+    NewArgv.push_back(Arg->getValue());
+  for (const opt::Arg *Arg : Args.filtered(OPT_plugin_opt))
+    NewArgv.push_back(Arg->getValue());
+  cl::ParseCommandLineOptions(NewArgv.size(), &NewArgv[0]);
+
+  // Get the input files to pass to 'nvlink'.
+  auto FilesOrErr = getInput(Args);
+  if (!FilesOrErr)
+    reportError(FilesOrErr.takeError());
+
+  // Run 'nvlink' on the generated inputs.
+  if (Error Err = runNVLink(*FilesOrErr, Args))
+    reportError(std::move(Err));
+
+  // Remove the temporary files created.
+  if (!Args.hasArg(OPT_save_temps))
+    for (const auto &TempFile : TempFiles)
+      if (std::error_code EC = sys::fs::remove(TempFile))
+        reportError(createFileError(TempFile, EC));
+
+  return EXIT_SUCCESS;
+}
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNvlinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNvlinkWrapper.cpp
deleted file mode 100644
index 7ccc284a48314..0000000000000
--- a/clang/tools/clang-nvlink-wrapper/ClangNvlinkWrapper.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===-- clang-nvlink-wrapper/ClangNvlinkWrapper.cpp - wrapper over nvlink-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This tool works as a wrapper over nvlink program. It transparently passes
-/// every input option and objects to nvlink except archive files. It reads
-/// each input archive file to extract archived cubin files as temporary files.
-/// These temp (*.cubin) files are passed to nvlink, because nvlink does not
-/// support linking of archive files implicitly.
-///
-/// During linking of heterogeneous device archive libraries, the
-/// clang-offload-bundler creates a device specific archive of cubin files.
-/// Such an archive is then passed to this tool to extract cubin files before
-/// passing to nvlink.
-///
-/// Example:
-/// clang-nvlink-wrapper -o a.out-openmp-nvptx64 /tmp/libTest-nvptx-sm_50.a
-///
-/// 1. Extract (libTest-nvptx-sm_50.a) => /tmp/a.cubin /tmp/b.cubin
-/// 2. nvlink -o a.out-openmp-nvptx64 /tmp/a.cubin /tmp/b.cubin
-//===---------------------------------------------------------------------===//
-
-#include "clang/Basic/Version.h"
-#include "llvm/Object/Archive.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/StringSaver.h"
-#include "llvm/Support/WithColor.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden);
-
-// Mark all our options with this category, everything else (except for -help)
-// will be hidden.
-static cl::OptionCategory
-    ClangNvlinkWrapperCategory("clang-nvlink-wrapper options");
-
-static cl::opt<std::string> NvlinkUserPath("nvlink-path",
-                                           cl::desc("Path of nvlink binary"),
-                                           cl::cat(ClangNvlinkWrapperCategory));
-
-// Do not parse nvlink options
-static cl::list<std::string>
-    NVArgs(cl::Sink, cl::desc("<options to be passed to nvlink>..."));
-
-static bool isEmptyFile(StringRef Filename) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
-      MemoryBuffer::getFileOrSTDIN(Filename, false, false);
-  if (std::error_code EC = BufOrErr.getError())
-    return false;
-  return (*BufOrErr)->getBuffer().empty();
-}
-
-static Error runNVLink(std::string NVLinkPath,
-                       SmallVectorImpl<std::string> &Args) {
-  std::vector<StringRef> NVLArgs;
-  NVLArgs.push_back(NVLinkPath);
-  StringRef Output = *(llvm::find(Args, "-o") + 1);
-  for (auto &Arg : Args) {
-    if (!(sys::fs::exists(Arg) && Arg != Output && isEmptyFile(Arg)))
-      NVLArgs.push_back(Arg);
-  }
-
-  if (sys::ExecuteAndWait(NVLinkPath, NVLArgs))
-    return createStringError(inconvertibleErrorCode(), "'nvlink' failed");
-  return Error::success();
-}
-
-static Error extractArchiveFiles(StringRef Filename,
-                                 SmallVectorImpl<std::string> &Args,
-                                 SmallVectorImpl<std::string> &TmpFiles) {
-  std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
-      MemoryBuffer::getFileOrSTDIN(Filename, false, false);
-  if (std::error_code EC = BufOrErr.getError())
-    return createFileError(Filename, EC);
-
-  ArchiveBuffers.push_back(std::move(*BufOrErr));
-  Expected<std::unique_ptr<llvm::object::Archive>> LibOrErr =
-      object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
-  if (!LibOrErr)
-    return LibOrErr.takeError();
-
-  auto Archive = std::move(*LibOrErr);
-
-  Error Err = Error::success();
-  auto ChildEnd = Archive->child_end();
-  for (auto ChildIter = Archive->child_begin(Err); ChildIter != ChildEnd;
-       ++ChildIter) {
-    if (Err)
-      return Err;
-    auto ChildNameOrErr = (*ChildIter).getName();
-    if (!ChildNameOrErr)
-      return ChildNameOrErr.takeError();
-
-    StringRef ChildName = sys::path::filename(ChildNameOrErr.get());
-
-    auto ChildBufferRefOrErr = (*ChildIter).getMemoryBufferRef();
-    if (!ChildBufferRefOrErr)
-      return ChildBufferRefOrErr.takeError();
-
-    auto ChildBuffer =
-        MemoryBuffer::getMemBuffer(ChildBufferRefOrErr.get(), false);
-    auto ChildNameSplit = ChildName.split('.');
-
-    SmallString<16> Path;
-    int FileDesc;
-    if (std::error_code EC = sys::fs::createTemporaryFile(
-            (ChildNameSplit.first), (ChildNameSplit.second), FileDesc, Path))
-      return createFileError(ChildName, EC);
-
-    std::string TmpFileName(Path.str());
-    Args.push_back(TmpFileName);
-    TmpFiles.push_back(TmpFileName);
-    std::error_code EC;
-    raw_fd_ostream OS(Path.c_str(), EC, sys::fs::OF_None);
-    if (EC)
-      return createFileError(TmpFileName, errc::io_error);
-    OS << ChildBuffer->getBuffer();
-    OS.close();
-  }
-  return Err;
-}
-
-static Error cleanupTmpFiles(SmallVectorImpl<std::string> &TmpFiles) {
-  for (auto &TmpFile : TmpFiles) {
-    if (std::error_code EC = sys::fs::remove(TmpFile))
-      return createFileError(TmpFile, errc::no_such_file_or_directory);
-  }
-  return Error::success();
-}
-
-static void PrintVersion(raw_ostream &OS) {
-  OS << clang::getClangToolFullVersion("clang-nvlink-wrapper") << '\n';
-}
-
-int main(int argc, const char **argv) {
-  sys::PrintStackTraceOnErrorSignal(argv[0]);
-  cl::SetVersionPrinter(PrintVersion);
-  cl::HideUnrelatedOptions(ClangNvlinkWrapperCategory);
-  cl::ParseCommandLineOptions(
-      argc, argv,
-      "A wrapper tool over nvlink program. It transparently passes every \n"
-      "input option and objects to nvlink except archive files and path of \n"
-      "nvlink binary. It reads each input archive file to extract archived \n"
-      "cubin files as temporary files.\n");
-
-  if (Help) {
-    cl::PrintHelpMessage();
-    return 0;
-  }
-
-  auto reportError = [argv](Error E) {
-    logAllUnhandledErrors(std::move(E), WithColor::error(errs(), argv[0]));
-    exit(1);
-  };
-
-  std::string NvlinkPath;
-  SmallVector<const char *, 0> Argv(argv, argv + argc);
-  SmallVector<std::string, 0> ArgvSubst;
-  SmallVector<std::string, 0> TmpFiles;
-  BumpPtrAllocator Alloc;
-  StringSaver Saver(Alloc);
-  cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
-
-  for (const std::string &Arg : NVArgs) {
-    if (sys::path::extension(Arg) == ".a") {
-      if (Error Err = extractArchiveFiles(Arg, ArgvSubst, TmpFiles))
-        reportError(std::move(Err));
-    } else {
-      ArgvSubst.push_back(Arg);
-    }
-  }
-
-  NvlinkPath = NvlinkUserPath;
-
-  // If user hasn't specified nvlink binary then search it in PATH
-  if (NvlinkPath.empty()) {
-    ErrorOr<std::string> NvlinkPathErr = sys::findProgramByName("nvlink");
-    if (!NvlinkPathErr) {
-      reportError(createStringError(NvlinkPathErr.getError(),
-                                    "unable to find 'nvlink' in path"));
-    }
-    NvlinkPath = NvlinkPathErr.get();
-  }
-
-  if (Error Err = runNVLink(NvlinkPath, ArgvSubst))
-    reportError(std::move(Err));
-  if (Error Err = cleanupTmpFiles(TmpFiles))
-    reportError(std::move(Err));
-
-  return 0;
-}
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
new file mode 100644
index 0000000000000..01bd0f85b1a33
--- /dev/null
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -0,0 +1,93 @@
+include "llvm/Option/OptParser.td"
+
+def WrapperOnlyOption : OptionFlag;
+
+def help : Flag<["-", "--"], "help">,
+  HelpText<"Display available options (--help-hidden for more)">;
+
+def help_hidden : Flag<["-", "--"], "help-hidden">,
+  HelpText<"Display all available options">;
+
+def verbose : Flag<["-"], "v">, HelpText<"Print verbose information">;
+def version : Flag<["--"], "version">,
+  HelpText<"Display the version number and exit">;
+
+def cuda_path_EQ : Joined<["--"], "cuda-path=">, Flags<[WrapperOnlyOption]>,
+  MetaVarName<"<dir>">, HelpText<"Set the system CUDA path">;
+def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Flags<[WrapperOnlyOption]>,
+  MetaVarName<"<dir>">, HelpText<"Set the 'ptxas' path">;
+
+def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">,
+  HelpText<"Path to file to write output">;
+def output : Separate<["--"], "output-file">, Alias<o>, Flags<[HelpHidden]>,
+  HelpText<"Alias for -o">;
+
+def library_path : JoinedOrSeparate<["-"], "L">, MetaVarName<"<dir>">,
+  HelpText<"Add <dir> to the library search path">;
+def library_path_S : Separate<["--", "-"], "library-path">, Flags<[HelpHidden]>,
+  Alias<library_path>;
+def library_path_EQ : Joined<["--", "-"], "library-path=">, Flags<[HelpHidden]>,
+  Alias<library_path>;
+
+def library : JoinedOrSeparate<["-"], "l">, MetaVarName<"<libname>">,
+  HelpText<"Search for library <libname>">;
+def library_S : Separate<["--", "-"], "library">, Flags<[HelpHidden]>,
+  Alias<library_path>;
+def library_EQ : Joined<["--", "-"], "library=">, Flags<[HelpHidden]>,
+  Alias<library_path>;
+
+def arch : Separate<["--", "-"], "arch">,
+  HelpText<"Specify the 'sm_' name of the target architecture.">;
+def : Joined<["--", "-"], "plugin-opt=mcpu=">,
+  Flags<[HelpHidden, WrapperOnlyOption]>, Alias<arch>;
+
+def feature : Separate<["--", "-"], "feature">, Flags<[WrapperOnlyOption]>,
+  HelpText<"Specify the '+ptx' freature to use for LTO.">;
+
+def g : Flag<["-"], "g">, HelpText<"Specify that this was a debug compile.">;
+def debug : Flag<["--"], "debug">, Alias<g>;
+
+def lto_emit_llvm : Flag<["--"], "lto-emit-llvm">, Flags<[WrapperOnlyOption]>,
+  HelpText<"Emit LLVM-IR bitcode">;
+def lto_emit_asm : Flag<["--"], "lto-emit-asm">, Flags<[WrapperOnlyOption]>,
+  HelpText<"Emit assembly code">;
+
+def O : Joined<["--", "-"], "plugin-opt=O">,
+  Flags<[WrapperOnlyOption]>, MetaVarName<"<O0, O1, O2, or O3>">,
+  HelpText<"Optimization level for LTO">;
+
+def thinlto : Joined<["--", "-"], "plugin-opt=thinlto">,
+  Flags<[WrapperOnlyOption]>, HelpText<"Enable the thin-lto backend">;
+def lto_partitions : Joined<["--", "-"], "plugin-opt=lto-partitions=">,
+  Flags<[WrapperOnlyOption]>, HelpText<"Number of LTO codegen partitions">;
+def jobs : Joined<["--", "-"], "plugin-opt=jobs=">,
+  Flags<[WrapperOnlyOption]>, HelpText<"Number of LTO codegen partitions">;
+def : Joined<["--", "-"], "plugin-opt=emit-llvm">,
+  Flags<[WrapperOnlyOption]>, Alias<lto_emit_llvm>;
+def : Joined<["--", "-"], "plugin-opt=emit-asm">,
+  Flags<[WrapperOnlyOption]>, Alias<lto_emit_asm>;
+def plugin_opt : Joined<["--", "-"], "plugin-opt=">, Flags<[WrapperOnlyOption]>,
+  HelpText<"Options passed to LLVM, not including the Clang invocation. Use "
+           "'--plugin-opt=--help' for a list of options.">;
+
+def save_temps : Flag<["--", "-"], "save-temps">,
+  Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+
+def relocatable : Flag<["--", "-"], "relocatable">,
+  Flags<[WrapperOnlyOption]>, HelpText<"Perform a relocatable link (LTO only)">;
+def r : Flag<["-"], "r">, Flags<[WrapperOnlyOption]>, Alias<relocatable>;
+
+def whole_archive : Flag<["--", "-"], "whole-archive">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def no_whole_archive : Flag<["--", "-"], "no-whole-archive">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+
+def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
+  MetaVarName<"<arg>">,
+  HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
+           "the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
+           "of options.">;
+def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, Alias<mllvm>;
+
+def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
+  HelpText<"Print generated commands without running.">;
diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index 42618e4e31cb0..a35ff13494e11 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -69,7 +69,7 @@ export_executable_symbols_for_plugins(clang-repl)
 # gold. This flag tells the linker to build a PLT for the full address range.
 # Linkers without this flag are assumed to support proper PLTs by default.
 set(flag_long_plt "-Wl,--long-plt")
-llvm_check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT)
+check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT)
 if(HAVE_LINKER_FLAG_LONG_PLT)
   target_link_options(clang-repl PRIVATE ${flag_long_plt})
 endif()
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index b2f54eab8e484..476b8131248fb 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -108,7 +108,7 @@ endif()
 
 if(CLANG_ORDER_FILE AND
     (LLVM_LINKER_IS_APPLE OR LLVM_LINKER_IS_GOLD OR LLVM_LINKER_IS_LLD))
-  include(LLVMCheckLinkerFlag)
+  include(CheckLinkerFlag)
 
   if (LLVM_LINKER_IS_APPLE OR (LLVM_LINKER_IS_LLD AND APPLE))
     set(LINKER_ORDER_FILE_OPTION "-Wl,-order_file,${CLANG_ORDER_FILE}")
@@ -119,7 +119,7 @@ if(CLANG_ORDER_FILE AND
   endif()
 
   # This is a test to ensure the actual order file works with the linker.
-  llvm_check_linker_flag(CXX ${LINKER_ORDER_FILE_OPTION} LINKER_ORDER_FILE_WORKS)
+  check_linker_flag(CXX ${LINKER_ORDER_FILE_OPTION} LINKER_ORDER_FILE_WORKS)
 
   # Passing an empty order file disables some linker layout optimizations.
   # To work around this and enable workflows for re-linking when the order file
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index c2ccb47a15bc8..554dc956c7cfe 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -14,7 +14,7 @@
 
 #include "clang/Basic/Stack.h"
 #include "clang/Basic/TargetOptions.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
+#include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
@@ -25,6 +25,7 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/FrontendTool/Utils.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h"
@@ -241,7 +242,8 @@ int cc1_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
 
   if (!Clang->getFrontendOpts().TimeTracePath.empty()) {
     llvm::timeTraceProfilerInitialize(
-        Clang->getFrontendOpts().TimeTraceGranularity, Argv0);
+        Clang->getFrontendOpts().TimeTraceGranularity, Argv0,
+        Clang->getFrontendOpts().TimeTraceVerbose);
   }
   // --print-supported-cpus takes priority over the actual compilation.
   if (Clang->getFrontendOpts().PrintSupportedCPUs)
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index ec93f092713f5..b661a43c88b08 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -531,6 +531,9 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
   MCOptions.MCNoWarn = Opts.NoWarn;
   MCOptions.MCFatalWarnings = Opts.FatalWarnings;
   MCOptions.MCNoTypeCheck = Opts.NoTypeCheck;
+  MCOptions.ShowMCInst = Opts.ShowInst;
+  MCOptions.AsmVerbose = true;
+  MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
   MCOptions.ABIName = Opts.TargetABI;
 
   // FIXME: There is a bit of code duplication with addPassesToEmitFile.
@@ -545,10 +548,8 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
         TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
 
     auto FOut = std::make_unique<formatted_raw_ostream>(*Out);
-    Str.reset(TheTarget->createAsmStreamer(
-        Ctx, std::move(FOut), /*asmverbose*/ true,
-        /*useDwarfDirectory*/ true, IP, std::move(CE), std::move(MAB),
-        Opts.ShowInst));
+    Str.reset(TheTarget->createAsmStreamer(Ctx, std::move(FOut), IP,
+                                           std::move(CE), std::move(MAB)));
   } else if (Opts.OutputType == AssemblerInvocation::FT_Null) {
     Str.reset(createNullStreamer(Ctx));
   } else {
@@ -571,9 +572,7 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
 
     Triple T(Opts.Triple);
     Str.reset(TheTarget->createMCObjectStreamer(
-        T, Ctx, std::move(MAB), std::move(OW), std::move(CE), *STI,
-        Opts.RelaxAll, Opts.IncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ true));
+        T, Ctx, std::move(MAB), std::move(OW), std::move(CE), *STI));
     Str.get()->initSections(Opts.NoExecStack, *STI);
   }
 
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index eb710813f9c24..aa76a445c599d 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2187,6 +2187,8 @@ class EnqueueVisitor : public ConstStmtVisitor<EnqueueVisitor, void>,
   VisitOMPLoopTransformationDirective(const OMPLoopTransformationDirective *D);
   void VisitOMPTileDirective(const OMPTileDirective *D);
   void VisitOMPUnrollDirective(const OMPUnrollDirective *D);
+  void VisitOMPReverseDirective(const OMPReverseDirective *D);
+  void VisitOMPInterchangeDirective(const OMPInterchangeDirective *D);
   void VisitOMPForDirective(const OMPForDirective *D);
   void VisitOMPForSimdDirective(const OMPForSimdDirective *D);
   void VisitOMPSectionsDirective(const OMPSectionsDirective *D);
@@ -2529,7 +2531,7 @@ void OMPClauseEnqueue::VisitOMPHintClause(const OMPHintClause *C) {
 }
 
 template <typename T> void OMPClauseEnqueue::VisitOMPClauseList(T *Node) {
-  for (const auto *I : Node->varlists()) {
+  for (const auto *I : Node->varlist()) {
     Visitor->AddStmt(I);
   }
 }
@@ -2749,7 +2751,7 @@ void OMPClauseEnqueue::VisitOMPUsesAllocatorsClause(
 }
 void OMPClauseEnqueue::VisitOMPAffinityClause(const OMPAffinityClause *C) {
   Visitor->AddStmt(C->getModifier());
-  for (const Expr *E : C->varlists())
+  for (const Expr *E : C->varlist())
     Visitor->AddStmt(E);
 }
 void OMPClauseEnqueue::VisitOMPBindClause(const OMPBindClause *C) {}
@@ -3233,6 +3235,15 @@ void EnqueueVisitor::VisitOMPUnrollDirective(const OMPUnrollDirective *D) {
   VisitOMPLoopTransformationDirective(D);
 }
 
+void EnqueueVisitor::VisitOMPReverseDirective(const OMPReverseDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPInterchangeDirective(
+    const OMPInterchangeDirective *D) {
+  VisitOMPLoopTransformationDirective(D);
+}
+
 void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) {
   VisitOMPLoopDirective(D);
 }
@@ -6102,6 +6113,10 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("OMPTileDirective");
   case CXCursor_OMPUnrollDirective:
     return cxstring::createRef("OMPUnrollDirective");
+  case CXCursor_OMPReverseDirective:
+    return cxstring::createRef("OMPReverseDirective");
+  case CXCursor_OMPInterchangeDirective:
+    return cxstring::createRef("OMPInterchangeDirective");
   case CXCursor_OMPForDirective:
     return cxstring::createRef("OMPForDirective");
   case CXCursor_OMPForSimdDirective:
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index 7b634003d11f4..968b46acb784c 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -190,8 +190,8 @@ if(ENABLE_SHARED)
       include(CheckLinkerFlag)
       # The Solaris 11.4 linker supports a subset of GNU ld version scripts,
       # but requires a special option to enable it.
-      llvm_check_linker_flag(CXX "-Wl,-z,gnu-version-script-compat"
-                             LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
+      check_linker_flag(CXX "-Wl,-z,gnu-version-script-compat"
+                        LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
       # Older Solaris (and illumos) linker does not support GNU ld version scripts
       # and does not support GNU version script compat.
       if (LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 9a7b53b41617e..88ed5c053b3a5 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -678,6 +678,12 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::OMPUnrollDirectiveClass:
     K = CXCursor_OMPUnrollDirective;
     break;
+  case Stmt::OMPReverseDirectiveClass:
+    K = CXCursor_OMPReverseDirective;
+    break;
+  case Stmt::OMPInterchangeDirectiveClass:
+    K = CXCursor_OMPTileDirective;
+    break;
   case Stmt::OMPForDirectiveClass:
     K = CXCursor_OMPForDirective;
     break;
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 92f9bae6cb064..57242ff49fe3b 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9376,29 +9376,6 @@ TEST_P(ASTImporterOptionSpecificTestBase, VaListCpp) {
       ToVaList->getUnderlyingType(), ToBuiltinVaList->getUnderlyingType()));
 }
 
-TEST_P(ASTImporterOptionSpecificTestBase,
-       ImportDefinitionOfEmptyClassWithNoUniqueAddressField) {
-  Decl *FromTU = getTuDecl(
-      R"(
-      struct B {};
-      struct A { B b; };
-      )",
-      Lang_CXX20);
-
-  CXXRecordDecl *FromD = FirstDeclMatcher<CXXRecordDecl>().match(
-      FromTU, cxxRecordDecl(hasName("A")));
-
-  for (auto *FD : FromD->fields())
-    FD->addAttr(clang::NoUniqueAddressAttr::Create(FromD->getASTContext(),
-                                                   clang::SourceRange()));
-  FromD->markEmpty();
-
-  CXXRecordDecl *ToD = Import(FromD, Lang_CXX20);
-  EXPECT_TRUE(ToD->isEmpty());
-  for (auto *FD : ToD->fields())
-    EXPECT_EQ(true, FD->hasAttr<NoUniqueAddressAttr>());
-}
-
 TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingTypedefToRecord) {
   const char *Code =
       R"(
@@ -9681,37 +9658,265 @@ AST_MATCHER_P(EnumDecl, hasEnumConstName, StringRef, ConstName) {
   return false;
 }
 
-TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnum) {
+TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnums) {
+  const char *Code =
+      R"(
+      struct A {
+        enum { E1, E2 } x;
+        enum { E3, E4 } y;
+      };
+      )";
+  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
+  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E1")));
+  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
+  EXPECT_TRUE(ImportedEnumE1);
+  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E3")));
+  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
+  EXPECT_TRUE(ImportedEnumE3);
+  EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
+}
+
+TEST_P(ASTImporterOptionSpecificTestBase, ImportFreeStandingAnonymousEnums) {
+  const char *Code =
+      R"(
+      struct A {
+        enum { E1, E2 };
+        enum { E3, E4 };
+      };
+      )";
+  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
+  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E1")));
+  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
+  EXPECT_TRUE(ImportedEnumE1);
+  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E3")));
+  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
+  EXPECT_TRUE(ImportedEnumE3);
+  EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
+}
+
+TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingAnonymousEnums) {
   const char *ToCode =
       R"(
       struct A {
-        enum { E1, E2} x;
-        enum { E3, E4} y;
+        enum { E1, E2 } x;
+        enum { E3, E4 } y;
       };
       )";
   Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
-  auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(
+  auto *ToEnumE1 = FirstDeclMatcher<EnumDecl>().match(
       ToTU, enumDecl(hasEnumConstName("E1")));
-  auto *ToE3 = FirstDeclMatcher<EnumDecl>().match(
+  auto *ToEnumE3 = FirstDeclMatcher<EnumDecl>().match(
       ToTU, enumDecl(hasEnumConstName("E3")));
   const char *Code =
       R"(
       struct A {
-        enum { E1, E2} x;
-        enum { E3, E4} y;
+        enum { E1, E2 } x;
+        enum { E3, E4 } y;
       };
       )";
   Decl *FromTU = getTuDecl(Code, Lang_CXX11);
-  auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(
+  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
       FromTU, enumDecl(hasEnumConstName("E1")));
+  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
+  ASSERT_TRUE(ImportedEnumE1);
+  EXPECT_EQ(ImportedEnumE1, ToEnumE1);
+  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E3")));
+  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
+  ASSERT_TRUE(ImportedEnumE3);
+  EXPECT_EQ(ImportedEnumE3, ToEnumE3);
+}
+
+TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
+  const char *ToCode =
+      R"(
+      struct A {
+        enum {};
+      };
+      )";
+  Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
+  auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl());
+  const char *Code =
+      R"(
+      struct A {
+        enum {};
+        enum {};
+      };
+      )";
+  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
+  auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
   auto *ImportedE1 = Import(FromE1, Lang_CXX11);
   ASSERT_TRUE(ImportedE1);
   EXPECT_EQ(ImportedE1, ToE1);
-  auto *FromE3 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E3")));
-  auto *ImportedE3 = Import(FromE3, Lang_CXX11);
-  ASSERT_TRUE(ImportedE3);
-  EXPECT_EQ(ImportedE3, ToE3);
+  auto *FromE2 = LastDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
+  ASSERT_NE(FromE1, FromE2);
+  auto *ImportedE2 = Import(FromE2, Lang_CXX11);
+  ASSERT_TRUE(ImportedE2);
+  // FIXME: These should not be equal, or the import should fail.
+  EXPECT_EQ(ImportedE2, ToE1);
+}
+
+TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) {
+  Decl *ToTU = getToTuDecl("", Lang_CXX03);
+  Decl *FromTU = getTuDecl(
+      R"(
+        struct foo {
+          enum { A };
+          enum { B };
+        };
+      )",
+      Lang_CXX03);
+
+  auto EnumConstA = enumConstantDecl(hasName("A"));
+  auto EnumConstB = enumConstantDecl(hasName("B"));
+
+  auto *FromA = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstA);
+  auto *FromB = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstB);
+
+  auto *ToA = Import(FromA, Lang_CXX03);
+  auto *ToB = Import(FromB, Lang_CXX03);
+
+  ASSERT_TRUE(ToA);
+  ASSERT_TRUE(ToB);
+
+  auto *ToFooA = FirstDeclMatcher<CXXRecordDecl>().match(
+      ToTU, tagDecl(has(enumDecl(has(EnumConstA)))));
+  auto *ToFooB = FirstDeclMatcher<CXXRecordDecl>().match(
+      ToTU, tagDecl(has(enumDecl(has(EnumConstB)))));
+  ASSERT_EQ(ToFooA, ToFooB);
+
+  // different EnumDecl
+  auto *ToEnumDeclA =
+      FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstA)));
+  auto *ToEnumDeclB =
+      FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstB)));
+  ASSERT_NE(ToEnumDeclA, ToEnumDeclB);
+}
+
+struct ImportTemplateParmDeclDefaultValue
+    : public ASTImporterOptionSpecificTestBase {
+protected:
+  void checkTemplateParams(RedeclarableTemplateDecl *D) {
+    auto *CanD = cast<RedeclarableTemplateDecl>(D->getCanonicalDecl());
+    auto *CanNonTypeP = cast<NonTypeTemplateParmDecl>(
+        CanD->getTemplateParameters()->getParam(0));
+    auto *CanTypeP =
+        cast<TemplateTypeParmDecl>(CanD->getTemplateParameters()->getParam(1));
+    auto *CanTemplateP = cast<TemplateTemplateParmDecl>(
+        CanD->getTemplateParameters()->getParam(2));
+    EXPECT_FALSE(CanNonTypeP->getDefaultArgStorage().isInherited());
+    EXPECT_FALSE(CanTypeP->getDefaultArgStorage().isInherited());
+    EXPECT_FALSE(CanTemplateP->getDefaultArgStorage().isInherited());
+    for (Decl *Redecl : D->redecls()) {
+      auto *ReD = cast<RedeclarableTemplateDecl>(Redecl);
+      if (ReD != CanD) {
+        auto *NonTypeP = cast<NonTypeTemplateParmDecl>(
+            ReD->getTemplateParameters()->getParam(0));
+        auto *TypeP = cast<TemplateTypeParmDecl>(
+            ReD->getTemplateParameters()->getParam(1));
+        auto *TemplateP = cast<TemplateTemplateParmDecl>(
+            ReD->getTemplateParameters()->getParam(2));
+        EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited());
+        EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited());
+        EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited());
+        EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(),
+                  CanNonTypeP);
+        EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), CanTypeP);
+        EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(),
+                  CanTemplateP);
+      }
+    }
+  }
+
+  void testImport(RedeclarableTemplateDecl *FromD) {
+    RedeclarableTemplateDecl *ToD = Import(FromD, Lang_CXX14);
+    checkTemplateParams(ToD);
+  }
+
+  const char *CodeFunction =
+      R"(
+      template <class> struct X;
+
+      template <int A = 2, typename B = int, template<class> class C = X>
+      void f();
+      template <int A, typename B, template<class> class C>
+      void f();
+      template <int A, typename B, template<class> class C>
+      void f() {}
+      )";
+
+  const char *CodeClass =
+      R"(
+      template <class> struct X;
+
+      template <int A = 2, typename B = int, template<class> class C = X>
+      struct S;
+      template <int A, typename B, template<class> class C>
+      struct S;
+      template <int A, typename B, template<class> class C>
+      struct S {};
+      )";
+
+  const char *CodeVar =
+      R"(
+      template <class> struct X;
+
+      template <int A = 2, typename B = int, template<class> class C = X>
+      extern int V;
+      template <int A, typename B, template<class> class C>
+      extern int V;
+      template <int A, typename B, template<class> class C>
+      int V = A;
+      )";
+};
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) {
+  Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<FunctionTemplateDecl>().match(
+      FromTU, functionTemplateDecl(hasName("f")));
+  testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingFunctionTemplate) {
+  getToTuDecl(CodeFunction, Lang_CXX14);
+  Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<FunctionTemplateDecl>().match(
+      FromTU, functionTemplateDecl(hasName("f")));
+  testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportClassTemplate) {
+  Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<ClassTemplateDecl>().match(
+      FromTU, classTemplateDecl(hasName("S")));
+  testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingClassTemplate) {
+  getToTuDecl(CodeClass, Lang_CXX14);
+  Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<ClassTemplateDecl>().match(
+      FromTU, classTemplateDecl(hasName("S")));
+  testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportVarTemplate) {
+  Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<VarTemplateDecl>().match(
+      FromTU, varTemplateDecl(hasName("V")));
+  testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingVarTemplate) {
+  getToTuDecl(CodeVar, Lang_CXX14);
+  Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14);
+  auto *FromLastD = LastDeclMatcher<VarTemplateDecl>().match(
+      FromTU, varTemplateDecl(hasName("V")));
+  testImport(FromLastD);
 }
 
 INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
@@ -9797,6 +10002,9 @@ INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportInjectedClassNameType,
 INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportMatrixType,
                          DefaultTestValuesForRunOptions);
 
+INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportTemplateParmDeclDefaultValue,
+                         DefaultTestValuesForRunOptions);
+
 // FIXME: Make ImportOpenCLPipe test work.
 // INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportOpenCLPipe,
 //                          DefaultTestValuesForRunOptions);
diff --git a/clang/unittests/AST/Interp/toAPValue.cpp b/clang/unittests/AST/Interp/toAPValue.cpp
index d6879d6e0bca3..5ec607a824349 100644
--- a/clang/unittests/AST/Interp/toAPValue.cpp
+++ b/clang/unittests/AST/Interp/toAPValue.cpp
@@ -27,6 +27,7 @@ TEST(ToAPValue, Pointers) {
   auto AST = tooling::buildASTFromCodeWithArgs(
       Code, {"-fexperimental-new-constant-interpreter"});
 
+  auto &ASTCtx = AST->getASTContext();
   auto &Ctx = AST->getASTContext().getInterpContext();
   Program &Prog = Ctx.getProgram();
 
@@ -47,7 +48,7 @@ TEST(ToAPValue, Pointers) {
     const Pointer &GP = getGlobalPtr("b");
     const Pointer &P = GP.deref<Pointer>();
     ASSERT_TRUE(P.isLive());
-    APValue A = P.toAPValue();
+    APValue A = P.toAPValue(ASTCtx);
     ASSERT_TRUE(A.isLValue());
     ASSERT_TRUE(A.hasLValuePath());
     const auto &Path = A.getLValuePath();
@@ -62,7 +63,7 @@ TEST(ToAPValue, Pointers) {
     const Pointer &GP = getGlobalPtr("p");
     const Pointer &P = GP.deref<Pointer>();
     ASSERT_TRUE(P.isIntegralPointer());
-    APValue A = P.toAPValue();
+    APValue A = P.toAPValue(ASTCtx);
     ASSERT_TRUE(A.isLValue());
     ASSERT_TRUE(A.getLValueBase().isNull());
     APSInt I;
@@ -77,7 +78,7 @@ TEST(ToAPValue, Pointers) {
     const Pointer &GP = getGlobalPtr("nullp");
     const Pointer &P = GP.deref<Pointer>();
     ASSERT_TRUE(P.isIntegralPointer());
-    APValue A = P.toAPValue();
+    APValue A = P.toAPValue(ASTCtx);
     ASSERT_TRUE(A.isLValue());
     ASSERT_TRUE(A.getLValueBase().isNull());
     ASSERT_TRUE(A.isNullPointer());
@@ -96,6 +97,7 @@ TEST(ToAPValue, FunctionPointers) {
   auto AST = tooling::buildASTFromCodeWithArgs(
       Code, {"-fexperimental-new-constant-interpreter"});
 
+  auto &ASTCtx = AST->getASTContext();
   auto &Ctx = AST->getASTContext().getInterpContext();
   Program &Prog = Ctx.getProgram();
 
@@ -117,7 +119,7 @@ TEST(ToAPValue, FunctionPointers) {
     const Pointer &GP = getGlobalPtr("func");
     const FunctionPointer &FP = GP.deref<FunctionPointer>();
     ASSERT_FALSE(FP.isZero());
-    APValue A = FP.toAPValue();
+    APValue A = FP.toAPValue(ASTCtx);
     ASSERT_TRUE(A.hasValue());
     ASSERT_TRUE(A.isLValue());
     ASSERT_TRUE(A.hasLValuePath());
@@ -132,7 +134,7 @@ TEST(ToAPValue, FunctionPointers) {
     ASSERT_NE(D, nullptr);
     const Pointer &GP = getGlobalPtr("nullp");
     const auto &P = GP.deref<FunctionPointer>();
-    APValue A = P.toAPValue();
+    APValue A = P.toAPValue(ASTCtx);
     ASSERT_TRUE(A.isLValue());
     ASSERT_TRUE(A.getLValueBase().isNull());
     ASSERT_TRUE(A.isNullPointer());
@@ -151,6 +153,7 @@ TEST(ToAPValue, FunctionPointersC) {
   auto AST = tooling::buildASTFromCodeWithArgs(
       Code, {"-x", "c", "-fexperimental-new-constant-interpreter"});
 
+  auto &ASTCtx = AST->getASTContext();
   auto &Ctx = AST->getASTContext().getInterpContext();
   Program &Prog = Ctx.getProgram();
 
@@ -174,7 +177,7 @@ TEST(ToAPValue, FunctionPointersC) {
     ASSERT_TRUE(GP.isLive());
     const FunctionPointer &FP = GP.deref<FunctionPointer>();
     ASSERT_FALSE(FP.isZero());
-    APValue A = FP.toAPValue();
+    APValue A = FP.toAPValue(ASTCtx);
     ASSERT_TRUE(A.hasValue());
     ASSERT_TRUE(A.isLValue());
     const auto &Path = A.getLValuePath();
@@ -197,6 +200,7 @@ TEST(ToAPValue, MemberPointers) {
   auto AST = tooling::buildASTFromCodeWithArgs(
       Code, {"-fexperimental-new-constant-interpreter"});
 
+  auto &ASTCtx = AST->getASTContext();
   auto &Ctx = AST->getASTContext().getInterpContext();
   Program &Prog = Ctx.getProgram();
 
@@ -218,7 +222,7 @@ TEST(ToAPValue, MemberPointers) {
     const Pointer &GP = getGlobalPtr("pm");
     ASSERT_TRUE(GP.isLive());
     const MemberPointer &FP = GP.deref<MemberPointer>();
-    APValue A = FP.toAPValue();
+    APValue A = FP.toAPValue(ASTCtx);
     ASSERT_EQ(A.getMemberPointerDecl(), getDecl("m"));
     ASSERT_EQ(A.getKind(), APValue::MemberPointer);
   }
@@ -228,7 +232,7 @@ TEST(ToAPValue, MemberPointers) {
     ASSERT_TRUE(GP.isLive());
     const MemberPointer &NP = GP.deref<MemberPointer>();
     ASSERT_TRUE(NP.isZero());
-    APValue A = NP.toAPValue();
+    APValue A = NP.toAPValue(ASTCtx);
     ASSERT_EQ(A.getKind(), APValue::MemberPointer);
   }
 }
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 952c83be0cb64..e994086c99d04 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -1109,6 +1109,20 @@ TEST_F(StructuralEquivalenceEnumTest, EnumsWithDifferentBody) {
   EXPECT_FALSE(testStructuralMatch(t));
 }
 
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithSameConsts) {
+  // field x is required to trigger comparison of the anonymous enum
+  auto t = makeNamedDecls("struct foo { enum { A } x; };",
+                          "struct foo { enum { A } x;};", Lang_CXX11);
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithDiffConsts) {
+  // field x is required to trigger comparison of the anonymous enum
+  auto t = makeNamedDecls("struct foo { enum { A } x; };",
+                          "struct foo { enum { B } x;};", Lang_CXX11);
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
 struct StructuralEquivalenceEnumConstantTest : StructuralEquivalenceTest {};
 
 TEST_F(StructuralEquivalenceEnumConstantTest, EnumConstantsWithSameValues) {
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index f26140675fd46..611e1f9ba5327 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -2552,6 +2552,10 @@ TEST_P(ASTMatchersTest, HasName_MatchesNamespaces) {
                          recordDecl(hasName("a+b::C"))));
   EXPECT_TRUE(notMatches("namespace a { namespace b { class AC; } }",
                          recordDecl(hasName("C"))));
+  EXPECT_TRUE(matches("namespace a { inline namespace a { class C; } }",
+                      recordDecl(hasName("::a::C"))));
+  EXPECT_TRUE(matches("namespace a { inline namespace a { class C; } }",
+                      recordDecl(hasName("::a::a::C"))));
 }
 
 TEST_P(ASTMatchersTest, HasName_MatchesOuterClasses) {
diff --git a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
index a4ac597bb06d6..737277e167edd 100644
--- a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
@@ -442,6 +442,55 @@ TEST_F(EnvironmentTest, CXXDefaultInitExprResultObjIsWrappedExprResultObj) {
             &Env.getResultObjectLocation(*DefaultInit->getExpr()));
 }
 
+// This test verifies the behavior of `getResultObjectLocation()` in
+// scenarios involving inherited constructors.
+// Since the specific AST node of interest `CXXConstructorDecl` is implicitly
+// generated, we cannot annotate any statements inside of it as we do in tests
+// within TransferTest. Thus, the only way to get the right `Environment` is by
+// explicitly initializing it as we do in tests within EnvironmentTest.
+// This is why this test is not inside TransferTest, where most of the tests for
+// `getResultObjectLocation()` are located.
+TEST_F(EnvironmentTest, ResultObjectLocationForInheritedCtorInitExpr) {
+  using namespace ast_matchers;
+
+  std::string Code = R"(
+    struct Base {
+      Base(int b) {}
+    };
+    struct Derived : Base {
+      using Base::Base;
+    };
+
+    Derived d = Derived(0);
+  )";
+
+  auto Unit =
+      tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++20"});
+  auto &Context = Unit->getASTContext();
+
+  ASSERT_EQ(Context.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+  auto Results =
+      match(cxxConstructorDecl(
+                hasAnyConstructorInitializer(cxxCtorInitializer(
+                    withInitializer(expr().bind("inherited_ctor_init_expr")))))
+                .bind("ctor"),
+            Context);
+  const auto *Constructor = selectFirst<CXXConstructorDecl>("ctor", Results);
+  const auto *InheritedCtorInit = selectFirst<CXXInheritedCtorInitExpr>(
+      "inherited_ctor_init_expr", Results);
+
+  EXPECT_EQ(InheritedCtorInit->child_begin(), InheritedCtorInit->child_end());
+
+  Environment Env(DAContext, *Constructor);
+  Env.initialize();
+
+  RecordStorageLocation &Loc = Env.getResultObjectLocation(*InheritedCtorInit);
+  EXPECT_NE(&Loc, nullptr);
+
+  EXPECT_EQ(&Loc, Env.getThisPointeeStorageLocation());
+}
+
 TEST_F(EnvironmentTest, Stmt) {
   using namespace ast_matchers;
 
@@ -473,4 +522,32 @@ TEST_F(EnvironmentTest, Stmt) {
   Env.getResultObjectLocation(*Init);
 }
 
+// This is a crash repro.
+TEST_F(EnvironmentTest, LambdaCapturingThisInFieldInitializer) {
+  using namespace ast_matchers;
+  std::string Code = R"cc(
+      struct S {
+        int f{[this]() { return 1; }()};
+      };
+    )cc";
+
+  auto Unit =
+      tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++11"});
+  auto &Context = Unit->getASTContext();
+
+  ASSERT_EQ(Context.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+  auto *LambdaCallOperator = selectFirst<CXXMethodDecl>(
+      "method", match(cxxMethodDecl(hasName("operator()"),
+                                    ofClass(cxxRecordDecl(isLambda())))
+                          .bind("method"),
+                      Context));
+
+  Environment Env(DAContext, *LambdaCallOperator);
+  // Don't crash when initializing.
+  Env.initialize();
+  // And initialize the captured `this` pointee.
+  ASSERT_NE(nullptr, Env.getThisPointeeStorageLocation());
+}
+
 } // namespace
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 1a52b82d65665..8717d9753d161 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -9,6 +9,7 @@
 #include "TestingSupport.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/OperationKinds.h"
 #include "clang/AST/Type.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
@@ -79,7 +80,7 @@ class DataflowAnalysisTest : public Test {
 
   /// Returns the `CFGBlock` containing `S` (and asserts that it exists).
   const CFGBlock *blockForStmt(const Stmt &S) {
-    const CFGBlock *Block = ACFG->getStmtToBlock().lookup(&S);
+    const CFGBlock *Block = ACFG->blockForStmt(S);
     assert(Block != nullptr);
     return Block;
   }
@@ -370,6 +371,42 @@ TEST_F(DiscardExprStateTest, ConditionalOperator) {
   EXPECT_EQ(CallGState.Env.get<PointerValue>(AddrOfI), nullptr);
 }
 
+TEST_F(DiscardExprStateTest, CallWithParenExprTreatedCorrectly) {
+  // This is a regression test.
+  // In the CFG for `target()` below, the expression that evaluates the function
+  // pointer for `expect` and the actual call are separated into different
+  // baseic blocks (because of the control flow introduced by the `||`
+  // operator).
+  // The value for the `expect` function pointer was erroneously discarded
+  // from the environment between these two blocks because the code that
+  // determines whether the expression values for a block need to be preserved
+  // did not ignore the `ParenExpr` around `(i == 1)` (which is not represented
+  // in the CFG).
+  std::string Code = R"(
+    bool expect(bool, bool);
+    void target(int i) {
+      expect(false || (i == 1), false);
+    }
+  )";
+  auto BlockStates = llvm::cantFail(runAnalysis<NoopAnalysis>(
+      Code, [](ASTContext &C) { return NoopAnalysis(C); }));
+
+  const auto &FnToPtrDecay = matchNode<ImplicitCastExpr>(
+      implicitCastExpr(hasCastKind(CK_FunctionToPointerDecay)));
+  const auto &CallExpect =
+      matchNode<CallExpr>(callExpr(callee(functionDecl(hasName("expect")))));
+
+  // In the block that evaluates the implicit cast of `expect` to a pointer,
+  // this expression is associated with a value.
+  const auto &FnToPtrDecayState = blockStateForStmt(BlockStates, FnToPtrDecay);
+  EXPECT_NE(FnToPtrDecayState.Env.getValue(FnToPtrDecay), nullptr);
+
+  // In the block that calls `expect()`, the implicit cast of `expect` to a
+  // pointer is still associated with a value.
+  const auto &CallExpectState = blockStateForStmt(BlockStates, CallExpect);
+  EXPECT_NE(CallExpectState.Env.getValue(FnToPtrDecay), nullptr);
+}
+
 struct NonConvergingLattice {
   int State;
 
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index d01ce137b8fcb..d7f81813835fa 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -8742,6 +8742,7 @@ TEST_F(FormatTest, FunctionAnnotations) {
                "    << abc;");
   verifyFormat("MACRO(abc)::function() // wrap\n"
                "    << abc;");
+  verifyFormat("FOO(bar)();", getLLVMStyleWithColumns(0));
 }
 
 TEST_F(FormatTest, BreaksDesireably) {
@@ -9337,6 +9338,31 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) {
       "    aaaaaaaaaaaaaaaa\n"
       ");",
       Style);
+  verifyFormat("bool aaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
+               "    const bool &aaaaaaaaa, const void *aaaaaaaaaa\n"
+               ") const {\n"
+               "  return true;\n"
+               "}",
+               Style);
+  verifyFormat("bool aaaaaaaaaaaaaaaaaaaaaaaa(\n"
+               "    const bool &aaaaaaaaaa, const void *aaaaaaaaaa\n"
+               ") const;",
+               Style);
+  verifyFormat("void aaaaaaaaa(\n"
+               "    int aaaaaa, int bbbbbb, int cccccc, int dddddddddd\n"
+               ") const noexcept -> std::vector<of_very_long_type>;",
+               Style);
+  verifyFormat(
+      "x = aaaaaaaaaaaaaaa(\n"
+      "    \"a aaaaaaa aaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaa aaaaaaaaaaaaa\"\n"
+      ");",
+      Style);
+  Style.ColumnLimit = 60;
+  verifyFormat("auto lambda =\n"
+               "    [&b](\n"
+               "        auto aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+               "    ) {};",
+               Style);
 }
 
 TEST_F(FormatTest, ParenthesesAndOperandAlignment) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index c5e8aa72cd2cb..f432b95cc1e2b 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -577,12 +577,20 @@ TEST_F(TokenAnnotatorTest, UnderstandsTernaryInTemplate) {
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
 
   // IsExpression = true
+
   Tokens = annotate("return foo<true ? 1 : 2>();");
   ASSERT_EQ(Tokens.size(), 13u) << Tokens;
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[4], tok::question, TT_ConditionalExpr);
   EXPECT_TOKEN(Tokens[6], tok::colon, TT_ConditionalExpr);
   EXPECT_TOKEN(Tokens[8], tok::greater, TT_TemplateCloser);
+
+  Tokens = annotate("return foo<true ? 1 : 2>{};");
+  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[4], tok::question, TT_ConditionalExpr);
+  EXPECT_TOKEN(Tokens[6], tok::colon, TT_ConditionalExpr);
+  EXPECT_TOKEN(Tokens[8], tok::greater, TT_TemplateCloser);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) {
@@ -596,6 +604,21 @@ TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) {
   EXPECT_TOKEN(Tokens[1], tok::less, TT_BinaryOperator);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_BinaryOperator);
 
+  Tokens = annotate("return A < B ? true : A > B;");
+  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator);
+
+  Tokens = annotate("return A < B ? true : A > B ? false : false;");
+  ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator);
+
+  Tokens = annotate("return A < B ^ A > B;");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[6], tok::greater, TT_BinaryOperator);
+
   Tokens = annotate("ratio{-1, 2} < ratio{-1, 3} == -1 / 3 > -1 / 2;");
   ASSERT_EQ(Tokens.size(), 27u) << Tokens;
   EXPECT_TOKEN(Tokens[7], tok::less, TT_BinaryOperator);
@@ -1645,38 +1668,45 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   auto Tokens = annotate("[]() constexpr {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() consteval {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() mutable {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() static {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto {}");
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto & {}");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto * {}");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_LambdaLBrace);
 
@@ -1705,6 +1735,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   Tokens = annotate("foo([&](u32 bar) __attribute__((attr)) -> void {});");
   ASSERT_EQ(Tokens.size(), 22u) << Tokens;
   EXPECT_TOKEN(Tokens[2], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[17], tok::l_brace, TT_LambdaLBrace);
 
@@ -1712,6 +1743,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> {}");
@@ -1724,6 +1756,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename... T> {}");
@@ -1736,6 +1769,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <int... T> {}");
@@ -1748,6 +1782,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <Foo... T> {}");
@@ -1761,6 +1796,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[14]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
@@ -1769,6 +1805,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 19u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[11], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[15]->ClosesRequiresClause);
@@ -1778,6 +1815,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 23u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[19]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
@@ -1786,6 +1824,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 20u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[12], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[16]->ClosesRequiresClause);
@@ -1797,6 +1836,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> requires Bar<T> (T &&t) {}");
@@ -1805,6 +1846,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[13], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[16], tok::l_brace, TT_LambdaLBrace);
 
@@ -1814,6 +1857,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[15]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[16], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> requires true (T&& t) {}");
@@ -1822,6 +1867,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[7]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_LambdaLBrace);
 
@@ -1856,6 +1903,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[19]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
@@ -1865,6 +1914,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <int I = 0> (T t) {}");
@@ -1872,6 +1922,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <bool b = false> (T t) {}");
@@ -1879,6 +1930,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <bool b = true && false> (T&& t) {}");
@@ -1887,6 +1939,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::ampamp, TT_BinaryOperator);
   EXPECT_TOKEN(Tokens[9], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[10], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
 
@@ -1896,6 +1949,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
   EXPECT_TOKEN(Tokens[8], tok::kw_requires, TT_RequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[13], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[17], tok::l_brace, TT_LambdaLBrace);
 }
 
@@ -1910,6 +1965,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionAnnotations) {
                     "A(T) noexcept;");
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_Unknown);
+
+  Tokens = annotate("FOO(bar)();");
+  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) {
@@ -2105,6 +2164,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsTrailingReturnArrow) {
   ASSERT_EQ(Tokens.size(), 21u) << Tokens;
   EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown);
 
+  auto Style = getLLVMStyle();
+  Style.StatementAttributeLikeMacros.push_back("emit");
+  Tokens = annotate("emit foo()->bar;", Style);
+  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_StatementAttributeLikeMacro);
+  EXPECT_TOKEN(Tokens[4], tok::arrow, TT_Unknown);
+
   // Mixed
   Tokens = annotate("auto f() -> int { auto a = b()->c; }");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
@@ -2930,6 +2996,13 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl);
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName);
+
+  auto Style = getLLVMStyle();
+  Style.StatementAttributeLikeMacros.push_back("emit");
+  Tokens = annotate("emit foo = 0;", Style);
+  ASSERT_EQ(Tokens.size(), 6u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_StatementAttributeLikeMacro);
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, BraceKind) {
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 94af9688a96e2..bdb5e23510118 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -650,6 +650,33 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
   EXPECT_STREQ("@import A.B;\n", Out.data());
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIncludesAndImports) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#import\n", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include\n", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#import \n"
+                                                    "#endif\n",
+                                                    Out));
+  // The ifdef block is removed because it's "empty".
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#import \n"
+                                                    "#define B\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifdef A\n"
+               "#define B\n"
+               "#endif\n",
+               Out.data());
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, AtImportFailures) {
   SmallVector<char, 128> Out;
 
@@ -970,6 +997,23 @@ ort \
   EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl);
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest, ObjCMethodArgs) {
+  SmallVector<char, 128> Out;
+
+  StringRef Source = R"(
+    @interface SomeObjcClass
+      - (void)func:(int)otherData
+              module:(int)module
+              import:(int)import;
+    @end
+  )";
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
+  // `module :` and `import :` not followed by an identifier are not treated as
+  // directive lines because they can be method argument decls.
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, TokensBeforeEOF) {
   SmallString<128> Out;
 
diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
index 5f3950ff033f1..f53fe71d630bf 100644
--- a/clang/unittests/Support/TimeProfilerTest.cpp
+++ b/clang/unittests/Support/TimeProfilerTest.cpp
@@ -10,11 +10,15 @@
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/PreprocessorOptions.h"
 
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include <stack>
 
 #include "gtest/gtest.h"
+#include <tuple>
 
 using namespace clang;
 using namespace llvm;
@@ -23,7 +27,8 @@ namespace {
 
 // Should be called before testing.
 void setupProfiler() {
-  timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test");
+  timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test",
+                              /*TimeTraceVerbose=*/true);
 }
 
 // Should be called after `compileFromString()`.
@@ -38,14 +43,24 @@ std::string teardownProfiler() {
 
 // Returns true if code compiles successfully.
 // We only parse AST here. This is enough for constexpr evaluation.
-bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) {
+bool compileFromString(StringRef Code, StringRef Standard, StringRef File,
+                       llvm::StringMap<std::string> Headers = {}) {
   CompilerInstance Compiler;
   Compiler.createDiagnostics();
 
+  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS(
+      new llvm::vfs::InMemoryFileSystem());
+  FS->addFile(File, 0, MemoryBuffer::getMemBuffer(Code));
+  for (const auto &Header : Headers) {
+    FS->addFile(Header.getKey(), 0,
+                MemoryBuffer::getMemBuffer(Header.getValue()));
+  }
+  llvm::IntrusiveRefCntPtr<FileManager> Files(
+      new FileManager(FileSystemOptions(), FS));
+  Compiler.setFileManager(Files.get());
+
   auto Invocation = std::make_shared<CompilerInvocation>();
-  Invocation->getPreprocessorOpts().addRemappedFile(
-      FileName, MemoryBuffer::getMemBuffer(Code).release());
-  const char *Args[] = {Standard.data(), FileName.data()};
+  std::vector<const char *> Args = {Standard.data(), File.data()};
   CompilerInvocation::CreateFromArgs(*Invocation, Args,
                                      Compiler.getDiagnostics());
   Compiler.setInvocation(std::move(Invocation));
@@ -60,13 +75,28 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) {
   return Compiler.ExecuteAction(Action);
 }
 
+std::string GetMetadata(json::Object *Event) {
+  std::string M;
+  llvm::raw_string_ostream OS(M);
+  if (json::Object *Args = Event->getObject("args")) {
+    if (auto Detail = Args->getString("detail"))
+      OS << Detail;
+    // Use only filename to not include os-specific path separators.
+    if (auto File = Args->getString("file"))
+      OS << (M.empty() ? "" : ", ") << llvm::sys::path::filename(*File);
+    if (auto Line = Args->getInteger("line"))
+      OS << ":" << *Line;
+  }
+  return M;
+}
+
 // Returns pretty-printed trace graph.
 std::string buildTraceGraph(StringRef Json) {
   struct EventRecord {
     int64_t TimestampBegin;
     int64_t TimestampEnd;
-    StringRef Name;
-    StringRef Detail;
+    std::string Name;
+    std::string Metadata;
   };
   std::vector<EventRecord> Events;
 
@@ -81,10 +111,13 @@ std::string buildTraceGraph(StringRef Json) {
     int64_t TimestampBegin = TraceEventObj->getInteger("ts").value_or(0);
     int64_t TimestampEnd =
         TimestampBegin + TraceEventObj->getInteger("dur").value_or(0);
-    StringRef Name = TraceEventObj->getString("name").value_or("");
-    StringRef Detail = "";
-    if (json::Object *Args = TraceEventObj->getObject("args"))
-      Detail = Args->getString("detail").value_or("");
+    std::string Name = TraceEventObj->getString("name").value_or("").str();
+    std::string Metadata = GetMetadata(TraceEventObj);
+
+    // Source events are asynchronous events and may not perfectly nest the
+    // synchronous events. Skip testing them.
+    if (Name == "Source")
+      continue;
 
     // This is a "summary" event, like "Total PerformPendingInstantiations",
     // skip it
@@ -92,7 +125,7 @@ std::string buildTraceGraph(StringRef Json) {
       continue;
 
     Events.emplace_back(
-        EventRecord{TimestampBegin, TimestampEnd, Name, Detail});
+        EventRecord{TimestampBegin, TimestampEnd, Name, Metadata});
   }
 
   // There can be nested events that are very fast, for example:
@@ -132,9 +165,9 @@ std::string buildTraceGraph(StringRef Json) {
       Stream << "| ";
     }
     Stream.write(Event.Name.data(), Event.Name.size());
-    if (!Event.Detail.empty()) {
+    if (!Event.Metadata.empty()) {
       Stream << " (";
-      Stream.write(Event.Detail.data(), Event.Detail.size());
+      Stream.write(Event.Metadata.data(), Event.Metadata.size());
       Stream << ")";
     }
     Stream << "\n";
@@ -145,7 +178,7 @@ std::string buildTraceGraph(StringRef Json) {
 } // namespace
 
 TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
-  constexpr StringRef Code = R"(
+  std::string Code = R"(
 void print(double value);
 
 namespace slow_namespace {
@@ -175,9 +208,8 @@ constexpr int slow_init_list[] = {1, 1, 2, 3, 5, 8, 13, 21}; // 25th line
   setupProfiler();
   ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc"));
   std::string Json = teardownProfiler();
-  std::string TraceGraph = buildTraceGraph(Json);
-  ASSERT_TRUE(TraceGraph == R"(
-Frontend
+  ASSERT_EQ(R"(
+Frontend (test.cc)
 | ParseDeclarationOrFunctionDefinition (test.cc:2:1)
 | ParseDeclarationOrFunctionDefinition (test.cc:6:1)
 | | ParseFunctionDefinition (slow_func)
@@ -202,14 +234,54 @@ Frontend
 | ParseDeclarationOrFunctionDefinition (test.cc:25:1)
 | | EvaluateAsInitializer (slow_init_list)
 | PerformPendingInstantiations
-)");
+)",
+            buildTraceGraph(Json));
+}
+
+TEST(TimeProfilerTest, TemplateInstantiations) {
+  std::string B_H = R"(
+    template <typename T>
+    T fooB(T t) {
+      return T();
+    }
+
+    #define MacroTemp(x) template <typename T> void foo##x(T) { T(); }
+  )";
+
+  std::string A_H = R"(
+    #include "b.h"
+
+    MacroTemp(MTA)
+
+    template <typename T>
+    void fooA(T t) { fooB(t); fooMTA(t); }
+  )";
+  std::string Code = R"(
+    #include "a.h"
+    void user() { fooA(0); }
+  )";
 
-  // NOTE: If this test is failing, run this test with
-  // `llvm::errs() << TraceGraph;` and change the assert above.
+  setupProfiler();
+  ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc",
+                                /*Headers=*/{{"a.h", A_H}, {"b.h", B_H}}));
+  std::string Json = teardownProfiler();
+  ASSERT_EQ(R"(
+Frontend (test.cc)
+| ParseFunctionDefinition (fooB)
+| ParseFunctionDefinition (fooMTA)
+| ParseFunctionDefinition (fooA)
+| ParseDeclarationOrFunctionDefinition (test.cc:3:5)
+| | ParseFunctionDefinition (user)
+| PerformPendingInstantiations
+| | InstantiateFunction (fooA<int>, a.h:7)
+| | | InstantiateFunction (fooB<int>, b.h:3)
+| | | InstantiateFunction (fooMTA<int>, a.h:4)
+)",
+            buildTraceGraph(Json));
 }
 
 TEST(TimeProfilerTest, ConstantEvaluationC99) {
-  constexpr StringRef Code = R"(
+  std::string Code = R"(
 struct {
   short quantval[4]; // 3rd line
 } value;
@@ -218,15 +290,12 @@ struct {
   setupProfiler();
   ASSERT_TRUE(compileFromString(Code, "-std=c99", "test.c"));
   std::string Json = teardownProfiler();
-  std::string TraceGraph = buildTraceGraph(Json);
-  ASSERT_TRUE(TraceGraph == R"(
-Frontend
+  ASSERT_EQ(R"(
+Frontend (test.c)
 | ParseDeclarationOrFunctionDefinition (test.c:2:1)
 | | isIntegerConstantExpr (<test.c:3:18>)
 | | EvaluateKnownConstIntCheckOverflow (<test.c:3:18>)
 | PerformPendingInstantiations
-)");
-
-  // NOTE: If this test is failing, run this test with
-  // `llvm::errs() << TraceGraph;` and change the assert above.
+)",
+            buildTraceGraph(Json));
 }
diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt
index 0eb612f8d9498..401978c31863c 100644
--- a/clang/unittests/Tooling/CMakeLists.txt
+++ b/clang/unittests/Tooling/CMakeLists.txt
@@ -40,6 +40,7 @@ add_clang_unittest(ToolingTests
   RecursiveASTVisitorTests/CXXMethodDecl.cpp
   RecursiveASTVisitorTests/CXXOperatorCallExprTraverser.cpp
   RecursiveASTVisitorTests/DeclRefExpr.cpp
+  RecursiveASTVisitorTests/DeductionGuide.cpp
   RecursiveASTVisitorTests/ImplicitCtor.cpp
   RecursiveASTVisitorTests/ImplicitCtorInitializer.cpp
   RecursiveASTVisitorTests/InitListExprPostOrder.cpp
diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
index cd4bf0eb7bd5a..df878bfc113e5 100644
--- a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
+++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
@@ -37,7 +37,7 @@ TEST(RecursiveASTVisitor, DeductionGuideNonImplicitMode) {
   // Verify that the synthezied deduction guide for alias is not visited in
   // RAV's implicit mode.
   Visitor.ExpectMatch("Foo(T) -> Foo<int>", 11, 1);
-  Visitor.DisallowMatch("Bar(type-parameter-0-0) -> Foo<int>", 14, 1);
+  Visitor.DisallowMatch("Bar(T) -> Foo<int>", 14, 1);
   EXPECT_TRUE(Visitor.runOver(
       R"cpp(
 template <typename T>
@@ -61,7 +61,7 @@ Bar s(1);
 TEST(RecursiveASTVisitor, DeductionGuideImplicitMode) {
   DeductionGuideVisitor Visitor(/*ShouldVisitImplicitCode*/ true);
   Visitor.ExpectMatch("Foo(T) -> Foo<int>", 11, 1);
-  Visitor.ExpectMatch("Bar(type-parameter-0-0) -> Foo<int>", 14, 1);
+  Visitor.ExpectMatch("Bar(T) -> Foo<int>", 14, 1);
   EXPECT_TRUE(Visitor.runOver(
       R"cpp(
 template <typename T>
diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp
index 2e3da2cd2a701..f41a44fa0922a 100644
--- a/clang/unittests/Tooling/ToolingTest.cpp
+++ b/clang/unittests/Tooling/ToolingTest.cpp
@@ -586,6 +586,11 @@ TEST(runToolOnCode, TestSkipFunctionBody) {
   EXPECT_FALSE(runToolOnCodeWithArgs(
       std::make_unique<SkipBodyAction>(),
       "template<typename T> int skipMeNot() { an_error_here }", Args2));
+
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      std::make_unique<SkipBodyAction>(),
+      "__inline __attribute__((__gnu_inline__)) void skipMe() {}",
+      {"--cuda-host-only", "-nocudainc", "-xcuda"}));
 }
 
 TEST(runToolOnCodeWithArgs, TestNoDepFile) {
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 29380aabc529c..13a3345e07ee8 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -2383,13 +2383,10 @@ void PragmaClangAttributeSupport::generateParsingHelpers(raw_ostream &OS) {
   OS << "}\n\n";
 }
 
-template <typename Fn>
-static void forEachUniqueSpelling(const Record &Attr, Fn &&F) {
+template <typename Fn> static void forEachSpelling(const Record &Attr, Fn &&F) {
   std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(Attr);
-  SmallDenseSet<StringRef, 8> Seen;
   for (const FlattenedSpelling &S : Spellings) {
-    if (Seen.insert(S.name()).second)
-      F(S);
+    F(S);
   }
 }
 
@@ -2413,8 +2410,11 @@ static void emitClangAttrTypeArgList(RecordKeeper &Records, raw_ostream &OS) {
       continue;
 
     // All these spellings take a single type argument.
-    forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+    forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_TYPE_ARG_LIST\n\n";
@@ -2432,8 +2432,11 @@ static void emitClangAttrArgContextList(RecordKeeper &Records, raw_ostream &OS)
       continue;
 
     // All these spellings take are parsed unevaluated.
-    forEachUniqueSpelling(Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+    forEachSpelling(Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_ARG_CONTEXT_LIST\n\n";
@@ -2494,10 +2497,11 @@ static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records,
       continue;
 
     // All these spellings take an identifier argument.
-    forEachUniqueSpelling(*A, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", "
-         << "true"
-         << ")\n";
+    forEachSpelling(*A, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST\n\n";
@@ -2563,8 +2567,11 @@ static void emitClangAttrUnevaluatedStringLiteralList(RecordKeeper &Records,
       continue;
 
     // All these spellings have at least one string literal has argument.
-    forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", " << MaskStr << ")\n";
+    forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", " << MaskStr << ")\n";
     });
   }
   OS << "#endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST\n\n";
@@ -2582,8 +2589,11 @@ static void emitClangAttrIdentifierArgList(RecordKeeper &Records, raw_ostream &O
       continue;
 
     // All these spellings take an identifier argument.
-    forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+    forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_IDENTIFIER_ARG_LIST\n\n";
@@ -2598,18 +2608,20 @@ static void emitClangAttrStrictIdentifierArgAtIndexList(RecordKeeper &Records,
   for (const auto *Attr : Attrs) {
     if (!Attr->getValueAsBit("StrictEnumParameters"))
       continue;
-    // Determine whether the first argument is an identifier.
+    // Determine whether each argument is an identifier.
     std::vector<Record *> Args = Attr->getValueAsListOfDefs("Args");
     uint64_t enumAtIndex = 0;
-    for (size_t i = 0; i < Args.size(); i++) {
-      enumAtIndex |= ((uint64_t)isIdentifierArgument(Args[0])) << i;
-    }
+    for (size_t I = 0; I < Args.size(); I++)
+      enumAtIndex |= ((uint64_t)isIdentifierArgument(Args[I])) << I;
     if (!enumAtIndex)
       continue;
 
     // All these spellings take an identifier argument.
-    forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", " << enumAtIndex << "ull)\n";
+    forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", " << enumAtIndex << "ull)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST\n\n";
@@ -2634,10 +2646,11 @@ static void emitClangAttrThisIsaIdentifierArgList(RecordKeeper &Records,
       continue;
 
     // All these spellings take an identifier argument.
-    forEachUniqueSpelling(*A, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", "
-         << "true"
-         << ")\n";
+    forEachSpelling(*A, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST\n\n";
@@ -2653,8 +2666,11 @@ static void emitClangAttrAcceptsExprPack(RecordKeeper &Records,
     if (!Attr.getValueAsBit("AcceptsExprPack"))
       continue;
 
-    forEachUniqueSpelling(Attr, [&](const FlattenedSpelling &S) {
-      OS << ".Case(\"" << S.name() << "\", true)\n";
+    forEachSpelling(Attr, [&](const FlattenedSpelling &S) {
+      OS << ".Case(\"" << S.variety();
+      if (S.nameSpace().length())
+        OS << "::" << S.nameSpace();
+      OS << "::" << S.name() << "\", true)\n";
     });
   }
   OS << "#endif // CLANG_ATTR_ACCEPTS_EXPR_PACK\n\n";
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 626031d38cf00..30fbb8c5d65e5 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -952,7 +952,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
   char typeCode = '\0';
   bool printNumber = true;
 
-  if (CK == ClassB && TargetGuard == "")
+  if (CK == ClassB && TargetGuard == "neon")
     return "";
 
   if (T.isBFloat16())
@@ -976,7 +976,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
       break;
     }
   }
-  if (CK == ClassB && TargetGuard == "") {
+  if (CK == ClassB && TargetGuard == "neon") {
     typeCode = '\0';
   }
 
@@ -1078,7 +1078,7 @@ std::string Intrinsic::mangleName(std::string Name, ClassKind LocalCK) const {
     S += "_" + getInstTypeCode(InBaseType, LocalCK);
   }
 
-  if (LocalCK == ClassB && TargetGuard == "")
+  if (LocalCK == ClassB && TargetGuard == "neon")
     S += "_v";
 
   // Insert a 'q' before the first '_' character so that it ends up before
diff --git a/clang/www/builtins.py b/clang/www/builtins.py
index 0c2e181a8cfa5..849e6bd4a816d 100755
--- a/clang/www/builtins.py
+++ b/clang/www/builtins.py
@@ -4,8 +4,9 @@
 
 err = 0
 
-# Giant associative set of builtin->intrinsic mappings where clang doesn't
-# implement the builtin since the vector operation works by default.
+# Giant associative set of builtin->intrinsic mappings where clang
+# doesn't implement the builtin. (Either because the vector operation
+# works without a builtin, or for other reasons.)
 
 repl_map = {
     "__builtin_ia32_addps": "_mm_add_ps",
@@ -134,6 +135,99 @@
     "__builtin_ia32_vec_ext_v2di": "_mm_extract_epi64",
     "__builtin_ia32_vec_ext_v4hi": "_mm_extract_pi16",
     "__builtin_ia32_vec_ext_v4sf": "_mm_extract_ps",
+    # Removed MMX builtins
+    "__builtin_ia32_paddb": "_mm_add_pi8",
+    "__builtin_ia32_paddw": "_mm_add_pi16",
+    "__builtin_ia32_paddd": "_mm_add_pi32",
+    "__builtin_ia32_paddsb": "_mm_adds_pi8",
+    "__builtin_ia32_paddsw": "_mm_adds_pi16",
+    "__builtin_ia32_paddusb": "_mm_adds_pu8",
+    "__builtin_ia32_paddusw": "_mm_adds_pu16",
+    "__builtin_ia32_psubb": "_mm_sub_pi8",
+    "__builtin_ia32_psubw": "_mm_sub_pi16",
+    "__builtin_ia32_psubd": "_mm_sub_pi32",
+    "__builtin_ia32_psubsb": "_mm_subs_pi8",
+    "__builtin_ia32_psubsw": "_mm_subs_pi16",
+    "__builtin_ia32_psubusb": "_mm_subs_pu8",
+    "__builtin_ia32_psubusw": "_mm_subs_pu16",
+    "__builtin_ia32_pmulhw": "_mm_mulhi_pi16",
+    "__builtin_ia32_pmullw": "_mm_mullo_pi16",
+    "__builtin_ia32_pmaddwd": "_mm_madd_pi16",
+    "__builtin_ia32_pand": "_mm_and_si64",
+    "__builtin_ia32_pandn": "_mm_andnot_si64",
+    "__builtin_ia32_por": "_mm_or_si64",
+    "__builtin_ia32_pxor": "_mm_xor_si64",
+    "__builtin_ia32_psllw": "_mm_sll_pi16",
+    "__builtin_ia32_pslld": "_mm_sll_pi32",
+    "__builtin_ia32_psllq": "_mm_sll_si64",
+    "__builtin_ia32_psrlw": "_mm_srl_pi16",
+    "__builtin_ia32_psrld": "_mm_srl_pi32",
+    "__builtin_ia32_psrlq": "_mm_srl_si64",
+    "__builtin_ia32_psraw": "_mm_sra_pi16",
+    "__builtin_ia32_psrad": "_mm_sra_pi32",
+    "__builtin_ia32_psllwi": "_mm_slli_pi16",
+    "__builtin_ia32_pslldi": "_mm_slli_pi32",
+    "__builtin_ia32_psllqi": "_mm_slli_si64",
+    "__builtin_ia32_psrlwi": "_mm_srli_pi16",
+    "__builtin_ia32_psrldi": "_mm_srli_pi32",
+    "__builtin_ia32_psrlqi": "_mm_srli_si64",
+    "__builtin_ia32_psrawi": "_mm_srai_pi16",
+    "__builtin_ia32_psradi": "_mm_srai_pi32",
+    "__builtin_ia32_packsswb": "_mm_packs_pi16",
+    "__builtin_ia32_packssdw": "_mm_packs_pi32",
+    "__builtin_ia32_packuswb": "_mm_packs_pu16",
+    "__builtin_ia32_punpckhbw": "_mm_unpackhi_pi8",
+    "__builtin_ia32_punpckhwd": "_mm_unpackhi_pi16",
+    "__builtin_ia32_punpckhdq": "_mm_unpackhi_pi32",
+    "__builtin_ia32_punpcklbw": "_mm_unpacklo_pi8",
+    "__builtin_ia32_punpcklwd": "_mm_unpacklo_pi16",
+    "__builtin_ia32_punpckldq": "_mm_unpacklo_pi32",
+    "__builtin_ia32_pcmpeqb": "_mm_cmpeq_pi8",
+    "__builtin_ia32_pcmpeqw": "_mm_cmpeq_pi16",
+    "__builtin_ia32_pcmpeqd": "_mm_cmpeq_pi32",
+    "__builtin_ia32_pcmpgtb": "_mm_cmpgt_pi8",
+    "__builtin_ia32_pcmpgtw": "_mm_cmpgt_pi16",
+    "__builtin_ia32_pcmpgtd": "_mm_cmpgt_pi32",
+    "__builtin_ia32_maskmovq": "_mm_maskmove_si64",
+    "__builtin_ia32_movntq": "_mm_stream_pi",
+    "__builtin_ia32_vec_init_v2si": "_mm_setr_pi32",
+    "__builtin_ia32_vec_init_v4hi": "_mm_setr_pi16",
+    "__builtin_ia32_vec_init_v8qi": "_mm_setr_pi8",
+    "__builtin_ia32_cvtpi2ps": "_mm_cvtpi32_ps",
+    "__builtin_ia32_cvtps2pi": "_mm_cvtps_pi32",
+    "__builtin_ia32_cvttps2pi": "_mm_cvttps_pi32",
+    "__builtin_ia32_pavgb": "_mm_avg_pu8",
+    "__builtin_ia32_pavgw": "_mm_avg_pu16",
+    "__builtin_ia32_pmaxsw": "_mm_max_pi16",
+    "__builtin_ia32_pmaxub": "_mm_max_pu8",
+    "__builtin_ia32_pminsw": "_mm_min_pi16",
+    "__builtin_ia32_pminub": "_mm_min_pu8",
+    "__builtin_ia32_pmovmskb": "_mm_movemask_pi8",
+    "__builtin_ia32_pmulhuw": "_mm_mulhi_pu16",
+    "__builtin_ia32_psadbw": "_mm_sad_pu8",
+    "__builtin_ia32_pshufw": "_mm_shuffle_pi16",
+    "__builtin_ia32_cvtpd2pi": "_mm_cvtpd_pi32",
+    "__builtin_ia32_cvtpi2pd": "_mm_cvtpi32_pd",
+    "__builtin_ia32_cvttpd2pi": "_mm_cvttpd_pi32",
+    "__builtin_ia32_paddq": "_mm_add_si64",
+    "__builtin_ia32_pmuludq": "_mm_mul_su32",
+    "__builtin_ia32_psubq": "_mm_sub_si64",
+    "__builtin_ia32_pabsb": "_mm_abs_pi8",
+    "__builtin_ia32_pabsd": "_mm_abs_pi32",
+    "__builtin_ia32_pabsw": "_mm_abs_pi16",
+    "__builtin_ia32_palignr": "_mm_alignr_pi8",
+    "__builtin_ia32_phaddd": "_mm_hadd_pi32",
+    "__builtin_ia32_phaddsw": "_mm_hadds_pi16",
+    "__builtin_ia32_phaddw": "_mm_hadd_pi16",
+    "__builtin_ia32_phsubd": "_mm_hsub_pi32",
+    "__builtin_ia32_phsubsw": "_mm_hsubs_pi16",
+    "__builtin_ia32_phsubw": "_mm_hsub_pi16",
+    "__builtin_ia32_pmaddubsw": "_mm_maddubs_pi16",
+    "__builtin_ia32_pmulhrsw": "_mm_mulhrs_pi16",
+    "__builtin_ia32_pshufb": "_mm_shuffle_pi8",
+    "__builtin_ia32_psignw": "_mm_sign_pi16",
+    "__builtin_ia32_psignb": "_mm_sign_pi8",
+    "__builtin_ia32_psignd": "_mm_sign_pi32",
 }
 
 # Special unhandled cases:
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 937f67981e296..a18d5d7fec63d 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1440,11 +1440,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Is indirection through a null pointer undefined behavior?</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="233">
+  <tr id="233">
     <td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>References vs pointers in UDC overload resolution</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="234">
     <td><a href="https://cplusplus.github.io/CWG/issues/234.html">234</a></td>
@@ -2763,7 +2763,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="453">
     <td><a href="https://cplusplus.github.io/CWG/issues/453.html">453</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>References may only bind to &#8220;valid&#8221; objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -5161,7 +5161,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/882.html">882</a></td>
     <td>CD2</td>
     <td>Defining <TT>main</TT> as deleted</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.5</td>
   </tr>
   <tr id="883">
     <td><a href="https://cplusplus.github.io/CWG/issues/883.html">883</a></td>
@@ -7329,11 +7329,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Overloading member function templates based on dependent return type</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1253">
+  <tr id="1253">
     <td><a href="https://cplusplus.github.io/CWG/issues/1253.html">1253</a></td>
-    <td>open</td>
+    <td>C++17</td>
     <td>Generic non-template members</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1254">
     <td><a href="https://cplusplus.github.io/CWG/issues/1254.html">1254</a></td>
@@ -8805,11 +8805,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Lifetime of temporaries in range-based <TT>for</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1499">
+  <tr id="1499">
     <td><a href="https://cplusplus.github.io/CWG/issues/1499.html">1499</a></td>
-    <td>drafting</td>
+    <td>DRWP</td>
     <td>Missing case for deleted move assignment operator</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1500">
     <td><a href="https://cplusplus.github.io/CWG/issues/1500.html">1500</a></td>
@@ -9567,11 +9567,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Adding spaces between tokens in stringizing</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="1626">
+  <tr id="1626">
     <td><a href="https://cplusplus.github.io/CWG/issues/1626.html">1626</a></td>
-    <td>open</td>
+    <td>dup</td>
     <td><TT>constexpr</TT> member functions in <I>brace-or-equal-initializer</I>s</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1627">
     <td><a href="https://cplusplus.github.io/CWG/issues/1627.html">1627</a></td>
@@ -11537,7 +11537,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="1954">
     <td><a href="https://cplusplus.github.io/CWG/issues/1954.html">1954</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>typeid</TT> null dereference check in subexpressions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -12675,11 +12675,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Value-dependency via injected-class-name</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2144">
+  <tr id="2144">
     <td><a href="https://cplusplus.github.io/CWG/issues/2144.html">2144</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Function/variable declaration ambiguity</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2145">
     <td><a href="https://cplusplus.github.io/CWG/issues/2145.html">2145</a></td>
@@ -12707,7 +12707,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2149">
     <td><a href="https://cplusplus.github.io/CWG/issues/2149.html">2149</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Brace elision and array length deduction</td>
     <td class="full" align="center">Clang 3.1</td>
   </tr>
@@ -12927,11 +12927,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Cv-qualified numeric types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2186">
+  <tr id="2186">
     <td><a href="https://cplusplus.github.io/CWG/issues/2186.html">2186</a></td>
-    <td>drafting</td>
+    <td>C++20</td>
     <td>Unclear point that &#8220;preceding initialization&#8221; must precede</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2187">
     <td><a href="https://cplusplus.github.io/CWG/issues/2187.html">2187</a></td>
@@ -14669,7 +14669,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2476">
     <td><a href="https://cplusplus.github.io/CWG/issues/2476.html">2476</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><I>placeholder-type-specifier</I>s and function declarators</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15011,7 +15011,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2533">
     <td><a href="https://cplusplus.github.io/CWG/issues/2533.html">2533</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Storage duration of implicitly created objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15089,13 +15089,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2546">
     <td><a href="https://cplusplus.github.io/CWG/issues/2546.html">2546</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Defaulted secondary comparison operators defined as deleted</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2547">
     <td><a href="https://cplusplus.github.io/CWG/issues/2547.html">2547</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Defaulted comparison operator function for non-classes</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15173,15 +15173,15 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2560">
     <td><a href="https://cplusplus.github.io/CWG/issues/2560.html">2560</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Parameter type determination in a <I>requirement-parameter-list</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2561">
+  <tr id="2561">
     <td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Conversion to function pointer for lambda with explicit object parameter</td>
-    <td title="Clang does not implement 2024-03-18 resolution" align="center">Not Resolved*</td>
+    <td class="none" align="center">No</td>
   </tr>
   <tr class="open" id="2562">
     <td><a href="https://cplusplus.github.io/CWG/issues/2562.html">2562</a></td>
@@ -15221,7 +15221,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2568">
     <td><a href="https://cplusplus.github.io/CWG/issues/2568.html">2568</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Access checking during synthesis of defaulted comparison operator</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15339,11 +15339,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Visible side effects and initial value of an object</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2588">
+  <tr id="2588">
     <td><a href="https://cplusplus.github.io/CWG/issues/2588.html">2588</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>friend declarations and module linkage</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2589">
     <td><a href="https://cplusplus.github.io/CWG/issues/2589.html">2589</a></td>
@@ -15541,7 +15541,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2621.html">2621</a></td>
     <td>C++23</td>
     <td>Kind of lookup for <TT>using enum</TT> declarations</td>
-    <td title="Clang 19 implements 2024-05-31 resolution" align="center">Superseded by <a href="#2877">2877</a></td>
+    <td class="unreleased-superseded" align="center">Superseded by <a href="#2877">2877</a></td>
   </tr>
   <tr id="2622">
     <td><a href="https://cplusplus.github.io/CWG/issues/2622.html">2622</a></td>
@@ -15617,7 +15617,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2634">
     <td><a href="https://cplusplus.github.io/CWG/issues/2634.html">2634</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Avoid circularity in specification of scope for friend class declarations</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15635,13 +15635,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2637">
     <td><a href="https://cplusplus.github.io/CWG/issues/2637.html">2637</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Injected-class-name as a <I>simple-template-id</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2638">
     <td><a href="https://cplusplus.github.io/CWG/issues/2638.html">2638</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Improve the example for initializing by initializer list</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15755,7 +15755,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2657">
     <td><a href="https://cplusplus.github.io/CWG/issues/2657.html">2657</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Cv-qualification adjustment when binding reference to temporary</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15779,7 +15779,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2661">
     <td><a href="https://cplusplus.github.io/CWG/issues/2661.html">2661</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Missing disambiguation rule for <I>pure-specifier</I> vs. <I>brace-or-equal-initializer</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15821,7 +15821,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2668">
     <td><a href="https://cplusplus.github.io/CWG/issues/2668.html">2668</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>co_await</TT> in a <I>lambda-expression</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15947,7 +15947,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2689">
     <td><a href="https://cplusplus.github.io/CWG/issues/2689.html">2689</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Are cv-qualified <TT>std::nullptr_t</TT> fundamental types?</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16013,7 +16013,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2700">
     <td><a href="https://cplusplus.github.io/CWG/issues/2700.html">2700</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>#error</TT> disallows existing implementation practice</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16055,7 +16055,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2707">
     <td><a href="https://cplusplus.github.io/CWG/issues/2707.html">2707</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Deduction guides cannot have a trailing <I>requires-clause</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16097,7 +16097,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2714">
     <td><a href="https://cplusplus.github.io/CWG/issues/2714.html">2714</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16179,11 +16179,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Importing header units synthesized from source files</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2728">
+  <tr id="2728">
     <td><a href="https://cplusplus.github.io/CWG/issues/2728.html">2728</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Evaluation of conversions in a <I>delete-expression</I></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2729">
     <td><a href="https://cplusplus.github.io/CWG/issues/2729.html">2729</a></td>
@@ -16283,13 +16283,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2745">
     <td><a href="https://cplusplus.github.io/CWG/issues/2745.html">2745</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Dependent odr-use in generic lambdas</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2746">
     <td><a href="https://cplusplus.github.io/CWG/issues/2746.html">2746</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Checking of default template arguments</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16301,7 +16301,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2748">
     <td><a href="https://cplusplus.github.io/CWG/issues/2748.html">2748</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Accessing static data members via null pointer</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16440,7 +16440,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2771">
     <td><a href="https://cplusplus.github.io/CWG/issues/2771.html">2771</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Transformation for <I>unqualified-id</I>s in address operator</td>
     <td class="full" align="center">Clang 18</td>
   </tr>
@@ -16464,7 +16464,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2775">
     <td><a href="https://cplusplus.github.io/CWG/issues/2775.html">2775</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unclear argument type for copy of exception object</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16476,7 +16476,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2777">
     <td><a href="https://cplusplus.github.io/CWG/issues/2777.html">2777</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type of <I>id-expression</I> denoting a template parameter object</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16632,7 +16632,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2803">
     <td><a href="https://cplusplus.github.io/CWG/issues/2803.html">2803</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Overload resolution for reference binding of similar types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16668,19 +16668,19 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2809">
     <td><a href="https://cplusplus.github.io/CWG/issues/2809.html">2809</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>An implicit definition does not redeclare a function</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2810">
     <td><a href="https://cplusplus.github.io/CWG/issues/2810.html">2810</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Requiring the absence of diagnostics for templates</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2811">
     <td><a href="https://cplusplus.github.io/CWG/issues/2811.html">2811</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Clarify "use" of main</td>
     <td class="full" align="center">Clang 3.5</td>
   </tr>
@@ -16692,19 +16692,19 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2813">
     <td><a href="https://cplusplus.github.io/CWG/issues/2813.html">2813</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Class member access with prvalues</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2814">
+  <tr id="2814">
     <td><a href="https://cplusplus.github.io/CWG/issues/2814.html">2814</a></td>
-    <td>tentatively ready</td>
+    <td>NAD</td>
     <td>Alignment requirement of incomplete class type</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2815">
     <td><a href="https://cplusplus.github.io/CWG/issues/2815.html">2815</a></td>
-    <td>open</td>
+    <td>drafting</td>
     <td>Overload resolution for references/pointers to <TT>noexcept</TT> functions</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16720,21 +16720,21 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>sizeof(abstract class) is underspecified</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2818">
+  <tr id="2818">
     <td><a href="https://cplusplus.github.io/CWG/issues/2818.html">2818</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Use of predefined reserved identifiers</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2819">
+  <tr id="2819">
     <td><a href="https://cplusplus.github.io/CWG/issues/2819.html">2819</a></td>
-    <td>tentatively ready</td>
+    <td>accepted</td>
     <td>Cast from null pointer value in a constant expression</td>
-    <td title="Clang 19 implements 2023-12-01 resolution" align="center">Not Resolved*</td>
+    <td class="unreleased" align="center">Clang 19</td>
   </tr>
   <tr id="2820">
     <td><a href="https://cplusplus.github.io/CWG/issues/2820.html">2820</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Value-initialization and default constructors</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16746,7 +16746,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2822">
     <td><a href="https://cplusplus.github.io/CWG/issues/2822.html">2822</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Side-effect-free pointer zap</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16758,13 +16758,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2824">
     <td><a href="https://cplusplus.github.io/CWG/issues/2824.html">2824</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Copy-initialization of arrays</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2825">
     <td><a href="https://cplusplus.github.io/CWG/issues/2825.html">2825</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Range-based for statement using a <I>braced-init-list</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16782,7 +16782,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2828">
     <td><a href="https://cplusplus.github.io/CWG/issues/2828.html">2828</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Ambiguous interpretation of C-style cast</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16794,13 +16794,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2830">
     <td><a href="https://cplusplus.github.io/CWG/issues/2830.html">2830</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Top-level cv-qualification should be ignored for list-initialization</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2831">
     <td><a href="https://cplusplus.github.io/CWG/issues/2831.html">2831</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Non-templated function definitions and <I>requires-clause</I>s</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16828,11 +16828,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Name-independent declarations</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2836">
+  <tr id="2836">
     <td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2837">
     <td><a href="https://cplusplus.github.io/CWG/issues/2837.html">2837</a></td>
@@ -16884,13 +16884,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2845">
     <td><a href="https://cplusplus.github.io/CWG/issues/2845.html">2845</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Make the closure type of a captureless lambda a structural type</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2846">
     <td><a href="https://cplusplus.github.io/CWG/issues/2846.html">2846</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Out-of-class definitions of explicit object member functions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16902,25 +16902,25 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2848">
     <td><a href="https://cplusplus.github.io/CWG/issues/2848.html">2848</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Omitting an empty template argument list for explicit instantiation</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2849">
     <td><a href="https://cplusplus.github.io/CWG/issues/2849.html">2849</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Parameter objects are not temporary objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2850">
     <td><a href="https://cplusplus.github.io/CWG/issues/2850.html">2850</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unclear storage duration for function parameter objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2851">
     <td><a href="https://cplusplus.github.io/CWG/issues/2851.html">2851</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Allow floating-point conversions in converted constant expressions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16932,45 +16932,45 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr id="2853">
     <td><a href="https://cplusplus.github.io/CWG/issues/2853.html">2853</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Pointer arithmetic with pointer to hypothetical element</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2854">
     <td><a href="https://cplusplus.github.io/CWG/issues/2854.html">2854</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Storage duration of exception objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2855">
     <td><a href="https://cplusplus.github.io/CWG/issues/2855.html">2855</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Undefined behavior in postfix increment</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2856">
     <td><a href="https://cplusplus.github.io/CWG/issues/2856.html">2856</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Copy-list-initialization with explicit default constructors</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2857">
     <td><a href="https://cplusplus.github.io/CWG/issues/2857.html">2857</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Argument-dependent lookup with incomplete class types</td>
     <td class="none" align="center">No</td>
   </tr>
-  <tr class="open" id="2858">
+  <tr id="2858">
     <td><a href="https://cplusplus.github.io/CWG/issues/2858.html">2858</a></td>
-    <td>tentatively ready</td>
+    <td>accepted</td>
     <td>Declarative <I>nested-name-specifier</I>s and <I>pack-index-specifier</I>s</td>
-    <td title="Clang 19 implements 2024-04-05 resolution" align="center">Not Resolved*</td>
+    <td class="unreleased" align="center">Clang 19</td>
   </tr>
-  <tr class="open" id="2859">
+  <tr id="2859">
     <td><a href="https://cplusplus.github.io/CWG/issues/2859.html">2859</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Value-initialization with multiple default constructors</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2860">
     <td><a href="https://cplusplus.github.io/CWG/issues/2860.html">2860</a></td>
@@ -16978,15 +16978,15 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Remove and fix the term "vacuous initialization"</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2861">
+  <tr id="2861">
     <td><a href="https://cplusplus.github.io/CWG/issues/2861.html">2861</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td><TT>dynamic_cast</TT> on bad pointer value</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2862">
     <td><a href="https://cplusplus.github.io/CWG/issues/2862.html">2862</a></td>
-    <td>tentatively ready</td>
+    <td>review</td>
     <td>Unclear boundaries of template declarations</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16996,17 +16996,17 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Unclear synchronization requirements for object lifetime rules</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2864">
+  <tr id="2864">
     <td><a href="https://cplusplus.github.io/CWG/issues/2864.html">2864</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Narrowing floating-point conversions</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2865">
+  <tr id="2865">
     <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Regression on result of conditional operator</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2866">
     <td><a href="https://cplusplus.github.io/CWG/issues/2866.html">2866</a></td>
@@ -17014,11 +17014,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Observing the effects of <TT>[[no_unique_address]]</TT></td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2867">
+  <tr id="2867">
     <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Order of initialization for structured bindings</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2868">
     <td><a href="https://cplusplus.github.io/CWG/issues/2868.html">2868</a></td>
@@ -17026,29 +17026,29 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Self-references in trivially copyable objects as function return values</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2869">
+  <tr id="2869">
     <td><a href="https://cplusplus.github.io/CWG/issues/2869.html">2869</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td><TT>this</TT> in local classes</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2870">
+  <tr id="2870">
     <td><a href="https://cplusplus.github.io/CWG/issues/2870.html">2870</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Combining absent <I>encoding-prefix</I>es</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2871">
+  <tr id="2871">
     <td><a href="https://cplusplus.github.io/CWG/issues/2871.html">2871</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>User-declared constructor templates inhibiting default constructors</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2872">
+  <tr id="2872">
     <td><a href="https://cplusplus.github.io/CWG/issues/2872.html">2872</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Linkage and unclear "can be referred to"</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2873">
     <td><a href="https://cplusplus.github.io/CWG/issues/2873.html">2873</a></td>
@@ -17056,29 +17056,29 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Taking the address of a function involving template argument deduction</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2874">
+  <tr id="2874">
     <td><a href="https://cplusplus.github.io/CWG/issues/2874.html">2874</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Qualified declarations of partial specializations</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2875">
     <td><a href="https://cplusplus.github.io/CWG/issues/2875.html">2875</a></td>
-    <td>tentatively ready</td>
+    <td>review</td>
     <td>Missing support for round-tripping null pointer values through indirection/address operators</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2876">
+  <tr id="2876">
     <td><a href="https://cplusplus.github.io/CWG/issues/2876.html">2876</a></td>
-    <td>tentatively ready</td>
+    <td>accepted</td>
     <td>Disambiguation of <TT>T x = delete("text")</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2877">
+  <tr id="2877">
     <td><a href="https://cplusplus.github.io/CWG/issues/2877.html">2877</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Type-only lookup for <I>using-enum-declarator</I></td>
-    <td title="Clang 19 implements 2024-05-31 resolution" align="center">Not Resolved*</td>
+    <td class="unreleased" align="center">Clang 19</td>
   </tr>
   <tr class="open" id="2878">
     <td><a href="https://cplusplus.github.io/CWG/issues/2878.html">2878</a></td>
@@ -17098,23 +17098,23 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Accessibility check for destructor of incomplete class type</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2881">
+  <tr id="2881">
     <td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Type restrictions for the explicit object parameter of a lambda</td>
-    <td title="Clang 19 implements 2024-04-19 resolution" align="center">Not Resolved*</td>
+    <td class="unreleased" align="center">Clang 19</td>
   </tr>
-  <tr class="open" id="2882">
+  <tr id="2882">
     <td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear treatment of conversion to <TT>void</TT></td>
-    <td title="Clang 2.7 implements 2024-05-31 resolution" align="center">Not Resolved*</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
-  <tr class="open" id="2883">
+  <tr id="2883">
     <td><a href="https://cplusplus.github.io/CWG/issues/2883.html">2883</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Definition of "odr-usable" ignores lambda scopes</td>
-    <td title="Clang does not implement 2024-05-31 resolution" align="center">Not Resolved*</td>
+    <td class="none" align="center">No</td>
   </tr>
   <tr id="2884">
     <td><a href="https://cplusplus.github.io/CWG/issues/2884.html">2884</a></td>
@@ -17124,21 +17124,21 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2885">
     <td><a href="https://cplusplus.github.io/CWG/issues/2885.html">2885</a></td>
-    <td>tentatively ready</td>
+    <td>review</td>
     <td>Non-eligible trivial default constructors</td>
     <td title="Clang 16 implements 2024-05-31 resolution" align="center">Not Resolved*</td>
   </tr>
-  <tr class="open" id="2886">
+  <tr id="2886">
     <td><a href="https://cplusplus.github.io/CWG/issues/2886.html">2886</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Temporaries and trivial potentially-throwing special member functions</td>
-    <td title="Clang 9 implements 2024-05-31 resolution" align="center">Not Resolved*</td>
+    <td class="full" align="center">Clang 9</td>
   </tr>
-  <tr class="open" id="2887">
+  <tr id="2887">
     <td><a href="https://cplusplus.github.io/CWG/issues/2887.html">2887</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Missing compatibility entries for xvalues</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2888">
     <td><a href="https://cplusplus.github.io/CWG/issues/2888.html">2888</a></td>
@@ -17158,17 +17158,17 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Defining members of local classes</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2891">
+  <tr id="2891">
     <td><a href="https://cplusplus.github.io/CWG/issues/2891.html">2891</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Normative status of implementation limits</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2892">
+  <tr id="2892">
     <td><a href="https://cplusplus.github.io/CWG/issues/2892.html">2892</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear usual arithmetic conversions</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2893">
     <td><a href="https://cplusplus.github.io/CWG/issues/2893.html">2893</a></td>
@@ -17182,11 +17182,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Functional casts create prvalues of reference type</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2895">
+  <tr id="2895">
     <td><a href="https://cplusplus.github.io/CWG/issues/2895.html">2895</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Initialization should ignore the destination type's cv-qualification</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2896">
     <td><a href="https://cplusplus.github.io/CWG/issues/2896.html">2896</a></td>
@@ -17235,6 +17235,96 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>tentatively ready</td>
     <td>Can we omit the <TT>template</TT> disambiguator in <I>nested-name-specifier</I>s in type-only contexts?</td>
     <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2904">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2904.html">2904</a></td>
+    <td>open</td>
+    <td>Introducing <I>template-name</I>s</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2905">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2905.html">2905</a></td>
+    <td>tentatively ready</td>
+    <td>Value-dependence of <I>noexcept-expression</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2906">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2906.html">2906</a></td>
+    <td>tentatively ready</td>
+    <td>Lvalue-to-rvalue conversion of class types for conditional operator</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2907">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2907.html">2907</a></td>
+    <td>open</td>
+    <td>Constant lvalue-to-rvalue conversion on uninitialized <TT>std::nullptr_t</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2908">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2908.html">2908</a></td>
+    <td>open</td>
+    <td>Counting physical source lines for <TT>__LINE__</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2909">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2909.html">2909</a></td>
+    <td>open</td>
+    <td>Subtle difference between constant-initialized and constexpr</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2910">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2910.html">2910</a></td>
+    <td>open</td>
+    <td>Effect of <I>requirement-parameter-list</I>s on odr-usability</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2911">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2911.html">2911</a></td>
+    <td>open</td>
+    <td>Unclear meaning of expressions "appearing within" subexpressions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2912">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2912.html">2912</a></td>
+    <td>open</td>
+    <td>Too-large value for size in array new</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2913">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2913.html">2913</a></td>
+    <td>open</td>
+    <td>Grammar for <I>deduction-guide</I> has <I>requires-clause</I> in the wrong position</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2914">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2914.html">2914</a></td>
+    <td>open</td>
+    <td>Unclear order of initialization of static and thread-local variables</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2915">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2915.html">2915</a></td>
+    <td>open</td>
+    <td>Explicit object parameters of type <TT>void</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2916">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2916.html">2916</a></td>
+    <td>open</td>
+    <td>Variable template partial specializations should not be declared <TT>static</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2917">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2917.html">2917</a></td>
+    <td>open</td>
+    <td>Disallow multiple <I>friend-type-specifier</I>s for a friend template</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2918">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2918.html">2918</a></td>
+    <td>open</td>
+    <td>Consideration of constraints for address of overloaded function</td>
+    <td align="center">Not resolved</td>
   </tr></table>
 
 </div>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 1d337fc4e5f87..773f14cfe3199 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -142,13 +142,12 @@ availability_error_occurred = False
 
 def availability(issue):
   status = status_map.get(issue, 'unknown')
-
   unresolved_status = ''
   proposed_resolution = ''
-  unresolved_status_match = re.search(r' (open|drafting|review|tentatively ready)', status)
+  unresolved_status_match = re.search(r' (open|drafting|review|tentatively ready|ready)', status)
   if unresolved_status_match:
     unresolved_status = unresolved_status_match.group(1)
-    proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
+    proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
       raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
@@ -246,7 +245,7 @@ for dr in drs:
     avail = 'Extension'
     avail_style = ''
 
-  elif dr.status in ('open', 'drafting', 'review', 'tentatively ready'):
+  elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'):
     row_style = ' class="open"'
     try:
       avail, avail_style, unresolved_status, tooltip = availability(dr.issue)
diff --git a/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake b/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
index f61ec0585f9a4..2524aafff0574 100644
--- a/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
+++ b/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
@@ -1,14 +1,8 @@
 include(CMakePushCheckState)
-
-include(CheckCompilerFlag OPTIONAL)
-
-if(NOT COMMAND check_compiler_flag)
-  include(CheckCCompilerFlag)
-  include(CheckCXXCompilerFlag)
-endif()
+include(CheckCompilerFlag)
 
 function(llvm_check_compiler_linker_flag lang flag out_var)
-  # If testing a flag with check_c_compiler_flag, it gets added to the compile
+  # If testing a flag with check_compiler_flag, it gets added to the compile
   # command only, but not to the linker command in that test. If the flag
   # is vital for linking to succeed, the test would fail even if it would
   # have succeeded if it was included on both commands.
@@ -18,18 +12,6 @@ function(llvm_check_compiler_linker_flag lang flag out_var)
 
   cmake_push_check_state()
   set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}")
-  if(COMMAND check_compiler_flag)
-    check_compiler_flag("${lang}" "" ${out_var})
-  else()
-    # Until the minimum CMAKE version is 3.19
-    # cmake builtin compatible, except we assume lang is C or CXX
-    if("${lang}" STREQUAL "C")
-      check_c_compiler_flag("" ${out_var})
-    elseif("${lang}" STREQUAL "CXX")
-      check_cxx_compiler_flag("" ${out_var})
-    else()
-      message(FATAL_ERROR "\"${lang}\" is not C or CXX")
-    endif()
-  endif()
+  check_compiler_flag("${lang}" "" ${out_var})
   cmake_pop_check_state()
 endfunction()
diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 5e28283fbc1c6..897dd963bd9ab 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -1,7 +1,7 @@
 # The LLVM Version number information
 
 if(NOT DEFINED LLVM_VERSION_MAJOR)
-  set(LLVM_VERSION_MAJOR 19)
+  set(LLVM_VERSION_MAJOR 20)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 0)
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 65063e0057bbc..2207555b03a03 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -604,10 +604,6 @@ if (COMPILER_RT_TEST_STANDALONE_BUILD_LIBS)
   if ("${COMPILER_RT_TEST_COMPILER_ID}" MATCHES "Clang")
     list(APPEND COMPILER_RT_UNITTEST_LINK_FLAGS "-resource-dir=${COMPILER_RT_OUTPUT_DIR}")
   endif()
-  get_compiler_rt_output_dir(${COMPILER_RT_DEFAULT_TARGET_ARCH} rtlib_dir)
-  if (NOT WIN32)
-    list(APPEND COMPILER_RT_UNITTEST_LINK_FLAGS "-Wl,-rpath,${rtlib_dir}")
-  endif()
 endif()
 
 if(COMPILER_RT_USE_LLVM_UNWINDER)
@@ -801,6 +797,10 @@ if(ANDROID)
   append_list_if(COMPILER_RT_HAS_FUSE_LD_LLD_FLAG -fuse-ld=lld SANITIZER_COMMON_LINK_FLAGS)
   append_list_if(COMPILER_RT_HAS_LLD -fuse-ld=lld COMPILER_RT_UNITTEST_LINK_FLAGS)
 endif()
+if(${COMPILER_RT_DEFAULT_TARGET_ARCH} MATCHES sparc)
+  # lld has several bugs/limitations on SPARC, so disable (Issue #100320).
+  set(COMPILER_RT_HAS_LLD FALSE)
+endif()
 pythonize_bool(COMPILER_RT_HAS_LLD)
 pythonize_bool(COMPILER_RT_TEST_USE_LLD)
 
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index c8bec41db36e9..29e5beb6182ba 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -32,9 +32,7 @@ set(ALL_ASAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
     ${LOONGARCH64})
 set(ALL_ASAN_ABI_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM64_32})
 set(ALL_DFSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${LOONGARCH64})
-#set(ALL_RTSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
-#    ${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
-#    ${LOONGARCH64})
+set(ALL_RTSAN_SUPPORTED_ARCH ${X86_64} ${ARM64})
 
 if(ANDROID)
   set(OS_NAME "Android")
@@ -77,7 +75,7 @@ set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
     ${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
     ${LOONGARCH64})
 set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64}
-    ${HEXAGON} ${LOONGARCH64})
+    ${HEXAGON} ${LOONGARCH64} ${SPARC} ${SPARCV9})
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${MIPS64}
     ${HEXAGON} ${LOONGARCH64})
 set(ALL_SCUDO_STANDALONE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake
index 80bbca1cbfceb..5a97992756a9c 100644
--- a/compiler-rt/cmake/base-config-ix.cmake
+++ b/compiler-rt/cmake/base-config-ix.cmake
@@ -43,7 +43,7 @@ if (LLVM_TREE_AVAILABLE)
   get_clang_resource_dir(COMPILER_RT_OUTPUT_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/..)
   set(COMPILER_RT_EXEC_OUTPUT_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
   get_clang_resource_dir(COMPILER_RT_INSTALL_PATH)
-  option(COMPILER_RT_INCLUDE_TESTS "Generate and build compiler-rt unit tests."
+  option(COMPILER_RT_INCLUDE_TESTS "Generate and build compiler-rt tests."
          ${LLVM_INCLUDE_TESTS})
   option(COMPILER_RT_ENABLE_WERROR "Fail and stop if warning is triggered"
          ${LLVM_ENABLE_WERROR})
@@ -70,7 +70,7 @@ else()
     "Path where built compiler-rt executables should be stored.")
   set(COMPILER_RT_INSTALL_PATH "" CACHE PATH
     "Prefix for directories where built compiler-rt artifacts should be installed.")
-  option(COMPILER_RT_INCLUDE_TESTS "Generate and build compiler-rt unit tests." OFF)
+  option(COMPILER_RT_INCLUDE_TESTS "Generate and build compiler-rt tests." OFF)
   option(COMPILER_RT_ENABLE_WERROR "Fail and stop if warning is triggered" OFF)
   # Use a host compiler to compile/link tests.
   set(COMPILER_RT_TEST_COMPILER ${CMAKE_C_COMPILER} CACHE PATH "Compiler to use for testing")
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 74fdbd288a1f0..97204177cde9a 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -1,6 +1,5 @@
 include(CMakePushCheckState)
 include(AddLLVM)
-include(LLVMCheckCompilerLinkerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 include(CheckIncludeFiles)
@@ -15,7 +14,7 @@ include(TestBigEndian)
 # in tree version of runtimes, we'd be linking against the just-built
 # libunwind (and the compiler implicit -lunwind wouldn't succeed as the newly
 # built libunwind isn't installed yet). For those cases, it'd be good to
-# link with --uwnindlib=none. Check if that option works.
+# link with --unwindlib=none. Check if that option works.
 llvm_check_compiler_linker_flag(C "--unwindlib=none" CXX_SUPPORTS_UNWINDLIB_NONE_FLAG)
 
 check_library_exists(c fopen "" COMPILER_RT_HAS_LIBC)
@@ -176,6 +175,7 @@ check_cxx_compiler_flag(-nostdlib++ COMPILER_RT_HAS_NOSTDLIBXX_FLAG)
 check_include_files("sys/auxv.h"    COMPILER_RT_HAS_AUXV)
 
 # Libraries.
+check_library_exists(atomic __atomic_load_8 "" COMPILER_RT_HAS_LIBATOMIC)
 check_library_exists(dl dlopen "" COMPILER_RT_HAS_LIBDL)
 check_library_exists(rt shm_open "" COMPILER_RT_HAS_LIBRT)
 check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
@@ -266,6 +266,19 @@ function(get_target_link_flags_for_arch arch out_var)
   endif()
 endfunction()
 
+# Returns a list of architecture specific dynamic ldflags in @out_var list.
+function(get_dynamic_link_flags_for_arch arch out_var)
+  list(FIND COMPILER_RT_SUPPORTED_ARCH ${arch} ARCH_INDEX)
+  if(ARCH_INDEX EQUAL -1)
+    message(FATAL_ERROR "Unsupported architecture: ${arch}")
+  else()
+    get_compiler_rt_output_dir(${arch} rtlib_dir)
+    if (NOT WIN32)
+      set(${out_var} "-Wl,-rpath,${rtlib_dir}" PARENT_SCOPE)
+    endif()
+  endif()
+endfunction()
+
 # Returns a compiler and CFLAGS that should be used to run tests for the
 # specific architecture.  When cross-compiling, this is controled via
 # COMPILER_RT_TEST_COMPILER and COMPILER_RT_TEST_COMPILER_CFLAGS.
@@ -759,7 +772,7 @@ else()
 endif()
 
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND RTSAN_SUPPORTED_ARCH AND
-    OS_NAME MATCHES "Android|Darwin|Linux")
+    OS_NAME MATCHES "Darwin|Linux")
   set(COMPILER_RT_HAS_RTSAN TRUE)
 else()
   set(COMPILER_RT_HAS_RTSAN FALSE)
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..b9df3266fbcf8 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -738,7 +738,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_VAR __llvm_profile_bitmap_bias
 #define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
+#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp
index d413b1ebc9fc0..e2ff0a51acf77 100644
--- a/compiler-rt/lib/asan/asan_globals.cpp
+++ b/compiler-rt/lib/asan/asan_globals.cpp
@@ -21,6 +21,8 @@
 #include "asan_suppressions.h"
 #include "asan_thread.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_list.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_stackdepot.h"
@@ -30,13 +32,14 @@ namespace __asan {
 
 typedef __asan_global Global;
 
-struct ListOfGlobals {
-  const Global *g;
-  ListOfGlobals *next;
+struct GlobalListNode {
+  const Global *g = nullptr;
+  GlobalListNode *next = nullptr;
 };
+typedef IntrusiveList<GlobalListNode> ListOfGlobals;
 
 static Mutex mu_for_globals;
-static ListOfGlobals *list_of_all_globals;
+static ListOfGlobals list_of_all_globals;
 
 static const int kDynamicInitGlobalsInitialCapacity = 512;
 struct DynInitGlobal {
@@ -55,6 +58,19 @@ struct GlobalRegistrationSite {
 typedef InternalMmapVector<GlobalRegistrationSite> GlobalRegistrationSiteVector;
 static GlobalRegistrationSiteVector *global_registration_site_vector;
 
+static ListOfGlobals &GlobalsByIndicator(uptr odr_indicator) {
+  using MapOfGlobals = DenseMap<uptr, ListOfGlobals>;
+
+  static MapOfGlobals *globals_by_indicator = nullptr;
+  if (!globals_by_indicator) {
+    alignas(
+        alignof(MapOfGlobals)) static char placeholder[sizeof(MapOfGlobals)];
+    globals_by_indicator = new (placeholder) MapOfGlobals();
+  }
+
+  return (*globals_by_indicator)[odr_indicator];
+}
+
 ALWAYS_INLINE void PoisonShadowForGlobal(const Global *g, u8 value) {
   FastPoisonShadow(g->beg, g->size_with_redzone, value);
 }
@@ -73,6 +89,10 @@ ALWAYS_INLINE void PoisonRedZones(const Global &g) {
 
 const uptr kMinimalDistanceFromAnotherGlobal = 64;
 
+static void AddGlobalToList(ListOfGlobals &list, const Global *g) {
+  list.push_front(new (GetGlobalLowLevelAllocator()) GlobalListNode{g});
+}
+
 static bool IsAddressNearGlobal(uptr addr, const __asan_global &g) {
   if (addr <= g.beg - kMinimalDistanceFromAnotherGlobal) return false;
   if (addr >= g.beg + g.size_with_redzone) return false;
@@ -114,8 +134,8 @@ int GetGlobalsForAddress(uptr addr, Global *globals, u32 *reg_sites,
   if (!flags()->report_globals) return 0;
   Lock lock(&mu_for_globals);
   int res = 0;
-  for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
-    const Global &g = *l->g;
+  for (const auto &l : list_of_all_globals) {
+    const Global &g = *l.g;
     if (flags()->report_globals >= 2)
       ReportGlobal(g, "Search");
     if (IsAddressNearGlobal(addr, g)) {
@@ -142,20 +162,24 @@ static void CheckODRViolationViaIndicator(const Global *g) {
   // Instrumentation requests to skip ODR check.
   if (g->odr_indicator == UINTPTR_MAX)
     return;
+
+  ListOfGlobals &relevant_globals = GlobalsByIndicator(g->odr_indicator);
+
   u8 *odr_indicator = reinterpret_cast<u8 *>(g->odr_indicator);
-  if (*odr_indicator == UNREGISTERED) {
+  if (*odr_indicator == REGISTERED) {
+    // If *odr_indicator is REGISTERED, some module have already registered
+    // externally visible symbol with the same name. This is an ODR violation.
+    for (const auto &l : relevant_globals) {
+      if ((flags()->detect_odr_violation >= 2 || g->size != l.g->size) &&
+          !IsODRViolationSuppressed(g->name))
+        ReportODRViolation(g, FindRegistrationSite(g), l.g,
+                           FindRegistrationSite(l.g));
+    }
+  } else {  // UNREGISTERED
     *odr_indicator = REGISTERED;
-    return;
-  }
-  // If *odr_indicator is DEFINED, some module have already registered
-  // externally visible symbol with the same name. This is an ODR violation.
-  for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
-    if (g->odr_indicator == l->g->odr_indicator &&
-        (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
-        !IsODRViolationSuppressed(g->name))
-      ReportODRViolation(g, FindRegistrationSite(g),
-                         l->g, FindRegistrationSite(l->g));
   }
+
+  AddGlobalToList(relevant_globals, g);
 }
 
 // Check ODR violation for given global G by checking if it's already poisoned.
@@ -165,12 +189,13 @@ static void CheckODRViolationViaPoisoning(const Global *g) {
   if (__asan_region_is_poisoned(g->beg, g->size_with_redzone)) {
     // This check may not be enough: if the first global is much larger
     // the entire redzone of the second global may be within the first global.
-    for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
-      if (g->beg == l->g->beg &&
-          (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
-          !IsODRViolationSuppressed(g->name))
-        ReportODRViolation(g, FindRegistrationSite(g),
-                           l->g, FindRegistrationSite(l->g));
+    for (const auto &l : list_of_all_globals) {
+      if (g->beg == l.g->beg &&
+          (flags()->detect_odr_violation >= 2 || g->size != l.g->size) &&
+          !IsODRViolationSuppressed(g->name)) {
+        ReportODRViolation(g, FindRegistrationSite(g), l.g,
+                           FindRegistrationSite(l.g));
+      }
     }
   }
 }
@@ -225,10 +250,9 @@ static void RegisterGlobal(const Global *g) {
   }
   if (CanPoisonMemory())
     PoisonRedZones(*g);
-  ListOfGlobals *l = new (GetGlobalLowLevelAllocator()) ListOfGlobals;
-  l->g = g;
-  l->next = list_of_all_globals;
-  list_of_all_globals = l;
+
+  AddGlobalToList(list_of_all_globals, g);
+
   if (g->has_dynamic_init) {
     if (!dynamic_init_globals) {
       dynamic_init_globals = new (GetGlobalLowLevelAllocator()) VectorOfGlobals;
diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp
index f8f86a766b204..74af2e65e9bfa 100644
--- a/compiler-rt/lib/asan/asan_interceptors.cpp
+++ b/compiler-rt/lib/asan/asan_interceptors.cpp
@@ -747,7 +747,7 @@ INTERCEPTOR(int, atexit, void (*func)()) {
 extern "C" {
 extern int _pthread_atfork(void (*prepare)(), void (*parent)(),
                            void (*child)());
-};
+}
 
 INTERCEPTOR(int, pthread_atfork, void (*prepare)(), void (*parent)(),
             void (*child)()) {
diff --git a/compiler-rt/lib/asan/asan_mapping.h b/compiler-rt/lib/asan/asan_mapping.h
index c5f95c07a2105..91fe60db6329a 100644
--- a/compiler-rt/lib/asan/asan_mapping.h
+++ b/compiler-rt/lib/asan/asan_mapping.h
@@ -72,7 +72,10 @@
 // || `[0x2000000000, 0x23ffffffff]` || LowShadow  ||
 // || `[0x0000000000, 0x1fffffffff]` || LowMem     ||
 //
-// Default Linux/RISCV64 Sv39 mapping:
+// Default Linux/RISCV64 Sv39 mapping with SHADOW_OFFSET == 0xd55550000;
+// (the exact location of SHADOW_OFFSET may vary depending the dynamic probing
+//  by FindDynamicShadowStart).
+//
 // || `[0x1555550000, 0x3fffffffff]` || HighMem    ||
 // || `[0x0fffffa000, 0x1555555fff]` || HighShadow ||
 // || `[0x0effffa000, 0x0fffff9fff]` || ShadowGap  ||
@@ -186,7 +189,7 @@
 #  elif SANITIZER_FREEBSD && defined(__aarch64__)
 #    define ASAN_SHADOW_OFFSET_CONST 0x0000800000000000
 #  elif SANITIZER_RISCV64
-#    define ASAN_SHADOW_OFFSET_CONST 0x0000000d55550000
+#    define ASAN_SHADOW_OFFSET_DYNAMIC
 #  elif defined(__aarch64__)
 #    define ASAN_SHADOW_OFFSET_CONST 0x0000001000000000
 #  elif defined(__powerpc64__)
diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt
index 7abd4c89ac6bc..b489bb99aeff3 100644
--- a/compiler-rt/lib/asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/asan/tests/CMakeLists.txt
@@ -206,13 +206,15 @@ function(add_asan_tests arch test_runtime)
           -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames
         )
     else()
+      set(DYNAMIC_LINK_FLAGS)
+      get_dynamic_link_flags_for_arch(${arch} DYNAMIC_LINK_FLAGS)
 
       # Otherwise, reuse ASAN_INST_TEST_OBJECTS.
       add_compiler_rt_test(AsanDynamicUnitTests "${dynamic_test_name}" "${arch}"
         SUBDIR "${CONFIG_NAME_DYNAMIC}"
         OBJECTS ${ASAN_INST_TEST_OBJECTS}
         DEPS asan ${ASAN_INST_TEST_OBJECTS}
-        LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS} ${TARGET_LINK_FLAGS}
+        LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS} ${TARGET_LINK_FLAGS} ${DYNAMIC_LINK_FLAGS}
         )
     endif()
   endif()
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index f780b917cbdb1..13adbd6c4d57d 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -571,7 +571,7 @@ set(aarch64_SOURCES
 
 if (COMPILER_RT_HAS_AARCH64_SME)
   if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
+    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
     message(STATUS "AArch64 SME ABI routines enabled")
     set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
   else()
@@ -739,6 +739,7 @@ endif()
 set(powerpc64le_SOURCES ${powerpc64_SOURCES})
 
 set(riscv_SOURCES
+  cpu_model/riscv.c
   riscv/fp_mode.c
   riscv/save.S
   riscv/restore.S
diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index 2d213d95f333a..19f26c92a0f94 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -272,6 +272,11 @@ switch32
 switch8
 switchu8
 
+// This function generates a custom trampoline function with the specific
+// realFunc and localsPtr values.
+void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
+                        const void* realFunc, void* localsPtr);
+
 // There is no C interface to the *_vfp_d8_d15_regs functions.  There are
 // called in the prolog and epilog of Thumb1 functions.  When the C++ ABI use
 // SJLJ for exceptions, each function with a catch clause or destructors needs
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
index 062cf80fc6848..4b9ee8c1d382d 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
@@ -10,40 +10,9 @@ struct FEATURES {
 
 extern struct FEATURES __aarch64_cpu_features;
 
-struct SME_STATE {
-  long PSTATE;
-  long TPIDR2_EL0;
-};
-
-extern struct SME_STATE __arm_sme_state(void) __arm_streaming_compatible;
-
-extern bool __aarch64_has_sme_and_tpidr2_el0;
-
-#if __GNUC__ >= 9
-#pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
-#endif
-__attribute__((constructor(90))) static void get_aarch64_cpu_features(void) {
+CONSTRUCTOR_ATTRIBUTE static void get_aarch64_cpu_features(void) {
   if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
     return;
 
   __init_cpu_features();
 }
-
-__attribute__((target("sve"))) long
-__arm_get_current_vg(void) __arm_streaming_compatible {
-  struct SME_STATE State = __arm_sme_state();
-  unsigned long long features =
-      __atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED);
-  bool HasSVE = features & (1ULL << FEAT_SVE);
-
-  if (!HasSVE && !__aarch64_has_sme_and_tpidr2_el0)
-    return 0;
-
-  if (HasSVE || (State.PSTATE & 1)) {
-    long vl;
-    __asm__ __volatile__("cntd %0" : "=r"(vl));
-    return vl;
-  }
-
-  return 0;
-}
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 4c0ff66931db7..cd8153f60670f 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -12,11 +12,15 @@
 #if !defined(__APPLE__)
 #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
 #define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
+#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)
+#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features)
 #else
 // MachO requires @page/@pageoff directives because the global is defined
 // in a different file. Otherwise this file may fail to build.
 #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page
 #define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff
+#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page
+#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
 .arch armv9-a+sme
@@ -180,6 +184,46 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
   ret
 END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable)
 
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
+  .variant_pcs __arm_get_current_vg
+  BTI_C
+
+  stp     x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov     x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+  adrp    x17, CPU_FEATS_SYMBOL
+  ldr     w17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbnz    w17, #30, 0f
+  adrp    x16, TPIDR2_SYMBOL
+  ldrb    w16, [x16, TPIDR2_SYMBOL_OFFSET]
+  cbz     w16, 1f
+0:
+  mov     x18, x1
+  bl      __arm_sme_state
+  mov     x1, x18
+  and     x17, x17, #0x40000000
+  bfxil   x17, x0, #0, #1
+  cbz     x17, 1f
+  cntd    x0
+  .cfi_def_cfa wsp, 16
+  ldp     x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+1:
+  mov     x0, xzr
+  .cfi_def_cfa wsp, 16
+  ldp     x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
new file mode 100644
index 0000000000000..926ad3b1b6331
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -0,0 +1,344 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Routines taken from libc/AOR_v20.02/string/aarch64
+
+#include "../assembly.h"
+
+#ifdef __aarch64__
+
+#define L(l) .L ## l
+
+//
+//  __arm_sc_memcpy / __arm_sc_memmove
+//
+
+#define dstin    x0
+#define src      x1
+#define count    x2
+#define dst      x3
+#define srcend1  x4
+#define dstend1  x5
+#define A_l      x6
+#define A_lw     w6
+#define A_h      x7
+#define B_l      x8
+#define B_lw     w8
+#define B_h      x9
+#define C_l      x10
+#define C_lw     w10
+#define C_h      x11
+#define D_l      x12
+#define D_h      x13
+#define E_l      x14
+#define E_h      x15
+#define F_l      x16
+#define F_h      x17
+#define G_l      count
+#define G_h      dst
+#define H_l      src
+#define H_h      srcend1
+#define tmp1     x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+        add     srcend1, src, count
+        add     dstend1, dstin, count
+        cmp     count, 128
+        b.hi    L(copy_long)
+        cmp     count, 32
+        b.hi    L(copy32_128)
+
+        /* Small copies: 0..32 bytes.  */
+        cmp     count, 16
+        b.lo    L(copy16)
+        ldp     A_l, A_h, [src]
+        ldp     D_l, D_h, [srcend1, -16]
+        stp     A_l, A_h, [dstin]
+        stp     D_l, D_h, [dstend1, -16]
+        ret
+
+        /* Copy 8-15 bytes.  */
+L(copy16):
+        tbz     count, 3, L(copy8)
+        ldr     A_l, [src]
+        ldr     A_h, [srcend1, -8]
+        str     A_l, [dstin]
+        str     A_h, [dstend1, -8]
+        ret
+
+        .p2align 3
+        /* Copy 4-7 bytes.  */
+L(copy8):
+        tbz     count, 2, L(copy4)
+        ldr     A_lw, [src]
+        ldr     B_lw, [srcend1, -4]
+        str     A_lw, [dstin]
+        str     B_lw, [dstend1, -4]
+        ret
+
+        /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+        cbz     count, L(copy0)
+        lsr     tmp1, count, 1
+        ldrb    A_lw, [src]
+        ldrb    C_lw, [srcend1, -1]
+        ldrb    B_lw, [src, tmp1]
+        strb    A_lw, [dstin]
+        strb    B_lw, [dstin, tmp1]
+        strb    C_lw, [dstend1, -1]
+L(copy0):
+        ret
+
+        .p2align 4
+        /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+        ldp     A_l, A_h, [src]
+        ldp     B_l, B_h, [src, 16]
+        ldp     C_l, C_h, [srcend1, -32]
+        ldp     D_l, D_h, [srcend1, -16]
+        cmp     count, 64
+        b.hi    L(copy128)
+        stp     A_l, A_h, [dstin]
+        stp     B_l, B_h, [dstin, 16]
+        stp     C_l, C_h, [dstend1, -32]
+        stp     D_l, D_h, [dstend1, -16]
+        ret
+
+        .p2align 4
+        /* Copy 65..128 bytes.  */
+L(copy128):
+        ldp     E_l, E_h, [src, 32]
+        ldp     F_l, F_h, [src, 48]
+        cmp     count, 96
+        b.ls    L(copy96)
+        ldp     G_l, G_h, [srcend1, -64]
+        ldp     H_l, H_h, [srcend1, -48]
+        stp     G_l, G_h, [dstend1, -64]
+        stp     H_l, H_h, [dstend1, -48]
+L(copy96):
+        stp     A_l, A_h, [dstin]
+        stp     B_l, B_h, [dstin, 16]
+        stp     E_l, E_h, [dstin, 32]
+        stp     F_l, F_h, [dstin, 48]
+        stp     C_l, C_h, [dstend1, -32]
+        stp     D_l, D_h, [dstend1, -16]
+        ret
+
+        .p2align 4
+        /* Copy more than 128 bytes.  */
+L(copy_long):
+        /* Use backwards copy if there is an overlap.  */
+        sub     tmp1, dstin, src
+        cbz     tmp1, L(copy0)
+        cmp     tmp1, count
+        b.lo    L(copy_long_backwards)
+
+        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+        ldp     D_l, D_h, [src]
+        and     tmp1, dstin, 15
+        bic     dst, dstin, 15
+        sub     src, src, tmp1
+        add     count, count, tmp1      /* Count is now 16 too large.  */
+        ldp     A_l, A_h, [src, 16]
+        stp     D_l, D_h, [dstin]
+        ldp     B_l, B_h, [src, 32]
+        ldp     C_l, C_h, [src, 48]
+        ldp     D_l, D_h, [src, 64]!
+        subs    count, count, 128 + 16  /* Test and readjust count.  */
+        b.ls    L(copy64_from_end)
+L(loop64):
+        stp     A_l, A_h, [dst, 16]
+        ldp     A_l, A_h, [src, 16]
+        stp     B_l, B_h, [dst, 32]
+        ldp     B_l, B_h, [src, 32]
+        stp     C_l, C_h, [dst, 48]
+        ldp     C_l, C_h, [src, 48]
+        stp     D_l, D_h, [dst, 64]!
+        ldp     D_l, D_h, [src, 64]!
+        subs    count, count, 64
+        b.hi    L(loop64)
+
+        /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+        ldp     E_l, E_h, [srcend1, -64]
+        stp     A_l, A_h, [dst, 16]
+        ldp     A_l, A_h, [srcend1, -48]
+        stp     B_l, B_h, [dst, 32]
+        ldp     B_l, B_h, [srcend1, -32]
+        stp     C_l, C_h, [dst, 48]
+        ldp     C_l, C_h, [srcend1, -16]
+        stp     D_l, D_h, [dst, 64]
+        stp     E_l, E_h, [dstend1, -64]
+        stp     A_l, A_h, [dstend1, -48]
+        stp     B_l, B_h, [dstend1, -32]
+        stp     C_l, C_h, [dstend1, -16]
+        ret
+
+        .p2align 4
+
+        /* Large backwards copy for overlapping copies.
+           Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+        ldp     D_l, D_h, [srcend1, -16]
+        and     tmp1, dstend1, 15
+        sub     srcend1, srcend1, tmp1
+        sub     count, count, tmp1
+        ldp     A_l, A_h, [srcend1, -16]
+        stp     D_l, D_h, [dstend1, -16]
+        ldp     B_l, B_h, [srcend1, -32]
+        ldp     C_l, C_h, [srcend1, -48]
+        ldp     D_l, D_h, [srcend1, -64]!
+        sub     dstend1, dstend1, tmp1
+        subs    count, count, 128
+        b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+        stp     A_l, A_h, [dstend1, -16]
+        ldp     A_l, A_h, [srcend1, -16]
+        stp     B_l, B_h, [dstend1, -32]
+        ldp     B_l, B_h, [srcend1, -32]
+        stp     C_l, C_h, [dstend1, -48]
+        ldp     C_l, C_h, [srcend1, -48]
+        stp     D_l, D_h, [dstend1, -64]!
+        ldp     D_l, D_h, [srcend1, -64]!
+        subs    count, count, 64
+        b.hi    L(loop64_backwards)
+
+        /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+        ldp     G_l, G_h, [src, 48]
+        stp     A_l, A_h, [dstend1, -16]
+        ldp     A_l, A_h, [src, 32]
+        stp     B_l, B_h, [dstend1, -32]
+        ldp     B_l, B_h, [src, 16]
+        stp     C_l, C_h, [dstend1, -48]
+        ldp     C_l, C_h, [src]
+        stp     D_l, D_h, [dstend1, -64]
+        stp     G_l, G_h, [dstin, 48]
+        stp     A_l, A_h, [dstin, 32]
+        stp     B_l, B_h, [dstin, 16]
+        stp     C_l, C_h, [dstin]
+        ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
+
+
+//
+//  __arm_sc_memset
+//
+
+#define dstin    x0
+#define val      x1
+#define valw     w1
+#define count    x2
+#define dst      x3
+#define dstend2  x4
+#define zva_val  x5
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
+        dup     v0.16B, valw
+        add     dstend2, dstin, count
+
+        cmp     count, 96
+        b.hi    L(set_long)
+        cmp     count, 16
+        b.hs    L(set_medium)
+        mov     val, v0.D[0]
+
+        /* Set 0..15 bytes.  */
+        tbz     count, 3, 1f
+        str     val, [dstin]
+        str     val, [dstend2, -8]
+        ret
+        nop
+1:      tbz     count, 2, 2f
+        str     valw, [dstin]
+        str     valw, [dstend2, -4]
+        ret
+2:      cbz     count, 3f
+        strb    valw, [dstin]
+        tbz     count, 1, 3f
+        strh    valw, [dstend2, -2]
+3:      ret
+
+        /* Set 17..96 bytes.  */
+L(set_medium):
+        str     q0, [dstin]
+        tbnz    count, 6, L(set96)
+        str     q0, [dstend2, -16]
+        tbz     count, 5, 1f
+        str     q0, [dstin, 16]
+        str     q0, [dstend2, -32]
+1:      ret
+
+        .p2align 4
+        /* Set 64..96 bytes.  Write 64 bytes from the start and
+           32 bytes from the end.  */
+L(set96):
+        str     q0, [dstin, 16]
+        stp     q0, q0, [dstin, 32]
+        stp     q0, q0, [dstend2, -32]
+        ret
+
+        .p2align 4
+L(set_long):
+        and     valw, valw, 255
+        bic     dst, dstin, 15
+        str     q0, [dstin]
+        cmp     count, 160
+        ccmp    valw, 0, 0, hs
+        b.ne    L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+        mrs     zva_val, dczid_el0
+        and     zva_val, zva_val, 31
+        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+        b.ne    L(no_zva)
+#endif
+        str     q0, [dst, 16]
+        stp     q0, q0, [dst, 32]
+        bic     dst, dst, 63
+        sub     count, dstend2, dst      /* Count is now 64 too large.  */
+        sub     count, count, 128       /* Adjust count and bias for loop.  */
+
+        .p2align 4
+L(zva_loop):
+        add     dst, dst, 64
+        dc      zva, dst
+        subs    count, count, 64
+        b.hi    L(zva_loop)
+        stp     q0, q0, [dstend2, -64]
+        stp     q0, q0, [dstend2, -32]
+        ret
+
+L(no_zva):
+        sub     count, dstend2, dst      /* Count is 16 too large.  */
+        sub     dst, dst, 16            /* Dst is biased by -32.  */
+        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+L(no_zva_loop):
+        stp     q0, q0, [dst, 32]
+        stp     q0, q0, [dst, 64]!
+        subs    count, count, 64
+        b.hi    L(no_zva_loop)
+        stp     q0, q0, [dstend2, -64]
+        stp     q0, q0, [dstend2, -32]
+        ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+
+#endif // __aarch64__
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 89b52b0d1a880..315490e73ea2b 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,80 +1,5 @@
 #include <stddef.h>
 
-// WARNING: When building the scalar versions of these functions you need to
-// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
-// from recognising a loop idiom and planting calls to memcpy!
-
-static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
-                                 size_t n) __arm_streaming_compatible {
-  unsigned char *destp = (unsigned char *)dest;
-  const unsigned char *srcp = (const unsigned char *)src;
-  for (size_t i = 0; i < n; ++i)
-    destp[i] = srcp[i];
-
-  return dest;
-}
-
-// If dest and src overlap then behaviour is undefined, hence we can add the
-// restrict keywords here. This also matches the definition of the libc memcpy
-// according to the man page.
-void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
-                      size_t n) __arm_streaming_compatible {
-  return __arm_sc_memcpy_fwd(dest, src, n);
-}
-
-void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
-  unsigned char *destp = (unsigned char *)dest;
-  unsigned char c8 = (unsigned char)c;
-  for (size_t i = 0; i < n; ++i)
-    destp[i] = c8;
-
-  return dest;
-}
-
-static void *__arm_sc_memcpy_rev(void *dest, const void *src,
-                                 size_t n) __arm_streaming_compatible {
-  unsigned char *destp = (unsigned char *)dest;
-  const unsigned char *srcp = (const unsigned char *)src;
-  // TODO: Improve performance by copying larger chunks in reverse, or by
-  // using SVE.
-  while (n > 0) {
-    --n;
-    destp[n] = srcp[n];
-  }
-  return dest;
-}
-
-// Semantically a memmove is equivalent to the following:
-//   1. Copy the entire contents of src to a temporary array that does not
-//      overlap with src or dest.
-//   2. Copy the contents of the temporary array into dest.
-void *__arm_sc_memmove(void *dest, const void *src,
-                       size_t n) __arm_streaming_compatible {
-  unsigned char *destp = (unsigned char *)dest;
-  const unsigned char *srcp = (const unsigned char *)src;
-
-  // If src and dest don't overlap then just invoke memcpy
-  if ((srcp > (destp + n)) || (destp > (srcp + n)))
-    return __arm_sc_memcpy_fwd(dest, src, n);
-
-  // Overlap case 1:
-  //     src: Low     |   ->   |     High
-  //    dest: Low  |   ->   |        High
-  // Here src is always ahead of dest at a higher addres. If we first read a
-  // chunk of data from src we can safely write the same chunk to dest without
-  // corrupting future reads of src.
-  if (srcp > destp)
-    return __arm_sc_memcpy_fwd(dest, src, n);
-
-  // Overlap case 2:
-  //     src: Low  |   ->   |        High
-  //    dest: Low     |   ->   |     High
-  // While we're in the overlap region we're always corrupting future reads of
-  // src when writing to dest. An efficient way to do this is to copy the data
-  // in reverse by starting at the highest address.
-  return __arm_sc_memcpy_rev(dest, src, n);
-}
-
 const void *__arm_sc_memchr(const void *src, int c,
                             size_t n) __arm_streaming_compatible {
   const unsigned char *srcp = (const unsigned char *)src;
diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
new file mode 100644
index 0000000000000..92931fae64fbf
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -0,0 +1,311 @@
+//=== cpu_model/riscv.c - Update RISC-V Feature Bits Structure -*- C -*-======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "cpu_model.h"
+
+#define RISCV_FEATURE_BITS_LENGTH 1
+struct {
+  unsigned length;
+  unsigned long long features[RISCV_FEATURE_BITS_LENGTH];
+} __riscv_feature_bits __attribute__((visibility("hidden"), nocommon));
+
+#define RISCV_VENDOR_FEATURE_BITS_LENGTH 1
+struct {
+  unsigned vendorID;
+  unsigned length;
+  unsigned long long features[RISCV_VENDOR_FEATURE_BITS_LENGTH];
+} __riscv_vendor_feature_bits __attribute__((visibility("hidden"), nocommon));
+
+// NOTE: Should sync-up with RISCVFeatures.td
+// TODO: Maybe generate a header from tablegen then include it.
+#define A_GROUPID 0
+#define A_BITMASK (1ULL << 0)
+#define C_GROUPID 0
+#define C_BITMASK (1ULL << 2)
+#define D_GROUPID 0
+#define D_BITMASK (1ULL << 3)
+#define F_GROUPID 0
+#define F_BITMASK (1ULL << 5)
+#define I_GROUPID 0
+#define I_BITMASK (1ULL << 8)
+#define M_GROUPID 0
+#define M_BITMASK (1ULL << 12)
+#define V_GROUPID 0
+#define V_BITMASK (1ULL << 21)
+#define ZACAS_GROUPID 0
+#define ZACAS_BITMASK (1ULL << 26)
+#define ZBA_GROUPID 0
+#define ZBA_BITMASK (1ULL << 27)
+#define ZBB_GROUPID 0
+#define ZBB_BITMASK (1ULL << 28)
+#define ZBC_GROUPID 0
+#define ZBC_BITMASK (1ULL << 29)
+#define ZBKB_GROUPID 0
+#define ZBKB_BITMASK (1ULL << 30)
+#define ZBKC_GROUPID 0
+#define ZBKC_BITMASK (1ULL << 31)
+#define ZBKX_GROUPID 0
+#define ZBKX_BITMASK (1ULL << 32)
+#define ZBS_GROUPID 0
+#define ZBS_BITMASK (1ULL << 33)
+#define ZFA_GROUPID 0
+#define ZFA_BITMASK (1ULL << 34)
+#define ZFH_GROUPID 0
+#define ZFH_BITMASK (1ULL << 35)
+#define ZFHMIN_GROUPID 0
+#define ZFHMIN_BITMASK (1ULL << 36)
+#define ZICBOZ_GROUPID 0
+#define ZICBOZ_BITMASK (1ULL << 37)
+#define ZICOND_GROUPID 0
+#define ZICOND_BITMASK (1ULL << 38)
+#define ZIHINTNTL_GROUPID 0
+#define ZIHINTNTL_BITMASK (1ULL << 39)
+#define ZIHINTPAUSE_GROUPID 0
+#define ZIHINTPAUSE_BITMASK (1ULL << 40)
+#define ZKND_GROUPID 0
+#define ZKND_BITMASK (1ULL << 41)
+#define ZKNE_GROUPID 0
+#define ZKNE_BITMASK (1ULL << 42)
+#define ZKNH_GROUPID 0
+#define ZKNH_BITMASK (1ULL << 43)
+#define ZKSED_GROUPID 0
+#define ZKSED_BITMASK (1ULL << 44)
+#define ZKSH_GROUPID 0
+#define ZKSH_BITMASK (1ULL << 45)
+#define ZKT_GROUPID 0
+#define ZKT_BITMASK (1ULL << 46)
+#define ZTSO_GROUPID 0
+#define ZTSO_BITMASK (1ULL << 47)
+#define ZVBB_GROUPID 0
+#define ZVBB_BITMASK (1ULL << 48)
+#define ZVBC_GROUPID 0
+#define ZVBC_BITMASK (1ULL << 49)
+#define ZVFH_GROUPID 0
+#define ZVFH_BITMASK (1ULL << 50)
+#define ZVFHMIN_GROUPID 0
+#define ZVFHMIN_BITMASK (1ULL << 51)
+#define ZVKB_GROUPID 0
+#define ZVKB_BITMASK (1ULL << 52)
+#define ZVKG_GROUPID 0
+#define ZVKG_BITMASK (1ULL << 53)
+#define ZVKNED_GROUPID 0
+#define ZVKNED_BITMASK (1ULL << 54)
+#define ZVKNHA_GROUPID 0
+#define ZVKNHA_BITMASK (1ULL << 55)
+#define ZVKNHB_GROUPID 0
+#define ZVKNHB_BITMASK (1ULL << 56)
+#define ZVKSED_GROUPID 0
+#define ZVKSED_BITMASK (1ULL << 57)
+#define ZVKSH_GROUPID 0
+#define ZVKSH_BITMASK (1ULL << 58)
+#define ZVKT_GROUPID 0
+#define ZVKT_BITMASK (1ULL << 59)
+
+#if defined(__linux__)
+
+// The RISC-V hwprobe interface is documented here:
+// <https://docs.kernel.org/arch/riscv/hwprobe.html>.
+
+static long syscall_impl_5_args(long number, long arg1, long arg2, long arg3,
+                                long arg4, long arg5) {
+  register long a7 __asm__("a7") = number;
+  register long a0 __asm__("a0") = arg1;
+  register long a1 __asm__("a1") = arg2;
+  register long a2 __asm__("a2") = arg3;
+  register long a3 __asm__("a3") = arg4;
+  register long a4 __asm__("a4") = arg5;
+  __asm__ __volatile__("ecall\n\t"
+                       : "=r"(a0)
+                       : "r"(a7), "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4)
+                       : "memory");
+  return a0;
+}
+
+#define RISCV_HWPROBE_KEY_MVENDORID 0
+#define RISCV_HWPROBE_KEY_MARCHID 1
+#define RISCV_HWPROBE_KEY_MIMPID 2
+#define RISCV_HWPROBE_KEY_BASE_BEHAVIOR 3
+#define RISCV_HWPROBE_BASE_BEHAVIOR_IMA (1ULL << 0)
+#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+#define RISCV_HWPROBE_IMA_FD (1ULL << 0)
+#define RISCV_HWPROBE_IMA_C (1ULL << 1)
+#define RISCV_HWPROBE_IMA_V (1ULL << 2)
+#define RISCV_HWPROBE_EXT_ZBA (1ULL << 3)
+#define RISCV_HWPROBE_EXT_ZBB (1ULL << 4)
+#define RISCV_HWPROBE_EXT_ZBS (1ULL << 5)
+#define RISCV_HWPROBE_EXT_ZICBOZ (1ULL << 6)
+#define RISCV_HWPROBE_EXT_ZBC (1ULL << 7)
+#define RISCV_HWPROBE_EXT_ZBKB (1ULL << 8)
+#define RISCV_HWPROBE_EXT_ZBKC (1ULL << 9)
+#define RISCV_HWPROBE_EXT_ZBKX (1ULL << 10)
+#define RISCV_HWPROBE_EXT_ZKND (1ULL << 11)
+#define RISCV_HWPROBE_EXT_ZKNE (1ULL << 12)
+#define RISCV_HWPROBE_EXT_ZKNH (1ULL << 13)
+#define RISCV_HWPROBE_EXT_ZKSED (1ULL << 14)
+#define RISCV_HWPROBE_EXT_ZKSH (1ULL << 15)
+#define RISCV_HWPROBE_EXT_ZKT (1ULL << 16)
+#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17)
+#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18)
+#define RISCV_HWPROBE_EXT_ZVKB (1ULL << 19)
+#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20)
+#define RISCV_HWPROBE_EXT_ZVKNED (1ULL << 21)
+#define RISCV_HWPROBE_EXT_ZVKNHA (1ULL << 22)
+#define RISCV_HWPROBE_EXT_ZVKNHB (1ULL << 23)
+#define RISCV_HWPROBE_EXT_ZVKSED (1ULL << 24)
+#define RISCV_HWPROBE_EXT_ZVKSH (1ULL << 25)
+#define RISCV_HWPROBE_EXT_ZVKT (1ULL << 26)
+#define RISCV_HWPROBE_EXT_ZFH (1ULL << 27)
+#define RISCV_HWPROBE_EXT_ZFHMIN (1ULL << 28)
+#define RISCV_HWPROBE_EXT_ZIHINTNTL (1ULL << 29)
+#define RISCV_HWPROBE_EXT_ZVFH (1ULL << 30)
+#define RISCV_HWPROBE_EXT_ZVFHMIN (1ULL << 31)
+#define RISCV_HWPROBE_EXT_ZFA (1ULL << 32)
+#define RISCV_HWPROBE_EXT_ZTSO (1ULL << 33)
+#define RISCV_HWPROBE_EXT_ZACAS (1ULL << 34)
+#define RISCV_HWPROBE_EXT_ZICOND (1ULL << 35)
+#define RISCV_HWPROBE_EXT_ZIHINTPAUSE (1ULL << 36)
+#define RISCV_HWPROBE_KEY_CPUPERF_0 5
+#define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0)
+#define RISCV_HWPROBE_MISALIGNED_EMULATED (1ULL << 0)
+#define RISCV_HWPROBE_MISALIGNED_SLOW (2 << 0)
+#define RISCV_HWPROBE_MISALIGNED_FAST (3 << 0)
+#define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0)
+#define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0)
+#define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6
+/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
+
+struct riscv_hwprobe {
+  long long key;
+  unsigned long long value;
+};
+
+#define __NR_riscv_hwprobe 258
+static long initHwProbe(struct riscv_hwprobe *Hwprobes, int len) {
+  return syscall_impl_5_args(__NR_riscv_hwprobe, (long)Hwprobes, len, 0, 0, 0);
+}
+
+#define SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(EXTNAME)                    \
+  SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_EXT_##EXTNAME, EXTNAME)
+
+#define SET_SINGLE_IMAEXT_RISCV_FEATURE(HWPROBE_BITMASK, EXT)                  \
+  SET_SINGLE_RISCV_FEATURE(IMAEXT0Value &HWPROBE_BITMASK, EXT)
+
+#define SET_SINGLE_RISCV_FEATURE(COND, EXT)                                    \
+  if (COND) {                                                                  \
+    SET_RISCV_FEATURE(EXT);                                                    \
+  }
+
+#define SET_RISCV_FEATURE(EXT) features[EXT##_GROUPID] |= EXT##_BITMASK
+
+static void initRISCVFeature(struct riscv_hwprobe Hwprobes[]) {
+
+  // Note: If a hwprobe key is unknown to the kernel, its key field
+  // will be cleared to -1, and its value set to 0.
+  // This unsets all extension bitmask bits.
+
+  // Init vendor extension
+  __riscv_vendor_feature_bits.vendorID = Hwprobes[2].value;
+
+  // Init standard extension
+  // TODO: Maybe Extension implied generate from tablegen?
+
+  unsigned long long features[RISCV_FEATURE_BITS_LENGTH];
+  int i;
+
+  for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++)
+    features[i] = 0;
+
+  // Check RISCV_HWPROBE_KEY_BASE_BEHAVIOR
+  unsigned long long BaseValue = Hwprobes[0].value;
+  if (BaseValue & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) {
+    SET_RISCV_FEATURE(I);
+    SET_RISCV_FEATURE(M);
+    SET_RISCV_FEATURE(A);
+  }
+
+  // Check RISCV_HWPROBE_KEY_IMA_EXT_0
+  unsigned long long IMAEXT0Value = Hwprobes[1].value;
+  if (IMAEXT0Value & RISCV_HWPROBE_IMA_FD) {
+    SET_RISCV_FEATURE(F);
+    SET_RISCV_FEATURE(D);
+  }
+
+  SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_C, C);
+  SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_V, V);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBA);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBB);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBS);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICBOZ);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBC);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKB);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKC);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKX);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKND);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNE);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNH);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSED);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSH);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKT);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBB);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBC);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKB);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKG);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNED);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHA);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHB);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSED);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSH);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKT);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFH);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFHMIN);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTNTL);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTPAUSE);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFH);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFHMIN);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFA);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZTSO);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZACAS);
+  SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICOND);
+
+  for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++)
+    __riscv_feature_bits.features[i] = features[i];
+}
+
+#endif // defined(__linux__)
+
+static int FeaturesBitCached = 0;
+
+void __init_riscv_feature_bits() CONSTRUCTOR_ATTRIBUTE;
+
+// A constructor function that sets __riscv_feature_bits, and
+// __riscv_vendor_feature_bits to the right values.  This needs to run
+// only once.  This constructor is given the highest priority and it should
+// run before constructors without the priority set.  However, it still runs
+// after ifunc initializers and needs to be called explicitly there.
+void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits() {
+
+  if (FeaturesBitCached)
+    return;
+
+  __riscv_feature_bits.length = RISCV_FEATURE_BITS_LENGTH;
+  __riscv_vendor_feature_bits.length = RISCV_VENDOR_FEATURE_BITS_LENGTH;
+
+#if defined(__linux__)
+  struct riscv_hwprobe Hwprobes[] = {
+      {RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0},
+      {RISCV_HWPROBE_KEY_IMA_EXT_0, 0},
+      {RISCV_HWPROBE_KEY_MVENDORID, 0},
+  };
+  if (initHwProbe(Hwprobes, sizeof(Hwprobes) / sizeof(Hwprobes[0])))
+    return;
+
+  initRISCVFeature(Hwprobes);
+#endif // defined(__linux__)
+
+  FeaturesBitCached = 1;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index ab2b685e67ef8..6fe2a84b646ee 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -23,6 +23,10 @@
 
 #include <assert.h>
 
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
+#include <cpuid.h>
+#endif
+
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -141,7 +145,7 @@ enum ProcessorFeatures {
   FEATURE_AVX512VP2INTERSECT,
   // FIXME: Below Features has some missings comparing to gcc, it's because gcc
   // has some not one-to-one mapped in llvm.
-  FEATURE_3DNOW,
+  // FEATURE_3DNOW,
   // FEATURE_3DNOWP,
   FEATURE_ADX = 40,
   // FEATURE_ABM,
@@ -224,38 +228,6 @@ enum ProcessorFeatures {
   CPU_FEATURE_MAX
 };
 
-// The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max).
-// Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID
-// support. Consequently, for i386, the presence of CPUID is checked first
-// via the corresponding eflags bit.
-static bool isCpuIdSupported(void) {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__i386__)
-  int __cpuid_supported;
-  __asm__("  pushfl\n"
-          "  popl   %%eax\n"
-          "  movl   %%eax,%%ecx\n"
-          "  xorl   $0x00200000,%%eax\n"
-          "  pushl  %%eax\n"
-          "  popfl\n"
-          "  pushfl\n"
-          "  popl   %%eax\n"
-          "  movl   $0,%0\n"
-          "  cmpl   %%eax,%%ecx\n"
-          "  je     1f\n"
-          "  movl   $1,%0\n"
-          "1:"
-          : "=r"(__cpuid_supported)
-          :
-          : "eax", "ecx");
-  if (!__cpuid_supported)
-    return false;
-#endif
-  return true;
-#endif
-  return true;
-}
-
 // This code is copied from lib/Support/Host.cpp.
 // Changes to either file should be mirrored in the other.
 
@@ -263,26 +235,8 @@ static bool isCpuIdSupported(void) {
 /// the specified arguments.  If we can't run cpuid on the host, return true.
 static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
                                unsigned *rECX, unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__x86_64__)
-  // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
-  // FIXME: should we save this for Clang?
-  __asm__("movq\t%%rbx, %%rsi\n\t"
-          "cpuid\n\t"
-          "xchgq\t%%rbx, %%rsi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value));
-  return false;
-#elif defined(__i386__)
-  __asm__("movl\t%%ebx, %%esi\n\t"
-          "cpuid\n\t"
-          "xchgl\t%%ebx, %%esi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value));
-  return false;
-#else
-  return true;
-#endif
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
+  return !__get_cpuid(value, rEAX, rEBX, rECX, rEDX);
 #elif defined(_MSC_VER)
   // The MSVC intrinsic is portable across x86 and x64.
   int registers[4];
@@ -303,26 +257,12 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
 static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
                                  unsigned *rEAX, unsigned *rEBX, unsigned *rECX,
                                  unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__x86_64__)
-  // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
-  // FIXME: should we save this for Clang?
-  __asm__("movq\t%%rbx, %%rsi\n\t"
-          "cpuid\n\t"
-          "xchgq\t%%rbx, %%rsi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value), "c"(subleaf));
-  return false;
-#elif defined(__i386__)
-  __asm__("movl\t%%ebx, %%esi\n\t"
-          "cpuid\n\t"
-          "xchgl\t%%ebx, %%esi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value), "c"(subleaf));
-  return false;
-#else
-  return true;
-#endif
+  // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
+  // are such that __cpuidex is defined within cpuid.h for both, we can remove
+  // the __get_cpuid_count function and share the MSVC implementation between
+  // all three.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
+  return !__get_cpuid_count(value, subleaf, rEAX, rEBX, rECX, rEDX);
 #elif defined(_MSC_VER)
   int registers[4];
   __cpuidex(registers, value, subleaf);
@@ -338,6 +278,9 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
 
 // Read control register 0 (XCR0). Used to detect features such as AVX.
 static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) {
+  // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
+  // are such that _xgetbv is supported by both, we can unify the implementation
+  // with MSVC and remove all inline assembly.
 #if defined(__GNUC__) || defined(__clang__)
   // Check xgetbv; this uses a .byte sequence instead of the instruction
   // directly because older assemblers do not include support for xgetbv and
@@ -1022,7 +965,7 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1))
     setFeature(FEATURE_APXF);
 
-  unsigned MaxLevel;
+  unsigned MaxLevel = 0;
   getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX);
   bool HasLeafD = MaxLevel >= 0xd &&
                   !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
@@ -1038,7 +981,7 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1) && HasLeaf24 && ((EBX >> 18) & 1))
     setFeature(FEATURE_AVX10_1_512);
 
-  unsigned MaxExtLevel;
+  unsigned MaxExtLevel = 0;
   getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
 
   bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
@@ -1132,7 +1075,7 @@ unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32];
 // needs to be called explicitly there.
 
 int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
-  unsigned EAX, EBX, ECX, EDX;
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
   unsigned MaxLeaf = 5;
   unsigned Vendor;
   unsigned Model, Family;
@@ -1144,8 +1087,7 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   if (__cpu_model.__cpu_vendor)
     return 0;
 
-  if (!isCpuIdSupported() ||
-      getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) {
+  if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) {
     __cpu_model.__cpu_vendor = VENDOR_OTHER;
     return -1;
   }
diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
index 7637417649e62..22bf2b2514e57 100644
--- a/compiler-rt/lib/builtins/fp_extend.h
+++ b/compiler-rt/lib/builtins/fp_extend.h
@@ -37,16 +37,7 @@ static const int srcSigFracBits = 52;
 // srcBits - srcSigFracBits - 1
 static const int srcExpBits = 11;
 
-static inline int src_rep_t_clz_impl(src_rep_t a) {
-#if defined __LP64__
-  return __builtin_clzl(a);
-#else
-  if (a & REP_C(0xffffffff00000000))
-    return clzsi(a >> 32);
-  else
-    return 32 + clzsi(a & REP_C(0xffffffff));
-#endif
-}
+static inline int src_rep_t_clz_impl(src_rep_t a) { return __builtin_clzll(a); }
 #define src_rep_t_clz src_rep_t_clz_impl
 
 #elif defined SRC_80
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
index 8404d98c93508..b2a89506135be 100644
--- a/compiler-rt/lib/builtins/fp_lib.h
+++ b/compiler-rt/lib/builtins/fp_lib.h
@@ -58,16 +58,7 @@ typedef double fp_t;
 #define REP_C UINT64_C
 #define significandBits 52
 
-static __inline int rep_clz(rep_t a) {
-#if defined __LP64__
-  return __builtin_clzl(a);
-#else
-  if (a & REP_C(0xffffffff00000000))
-    return clzsi(a >> 32);
-  else
-    return 32 + clzsi(a & REP_C(0xffffffff));
-#endif
-}
+static inline int rep_clz(rep_t a) { return __builtin_clzll(a); }
 
 #define loWord(a) (a & 0xffffffffU)
 #define hiWord(a) (a >> 32)
diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c
index 01fae834ab219..b10f23a81a9c6 100644
--- a/compiler-rt/lib/builtins/os_version_check.c
+++ b/compiler-rt/lib/builtins/os_version_check.c
@@ -14,6 +14,7 @@
 #ifdef __APPLE__
 
 #include <TargetConditionals.h>
+#include <assert.h>
 #include <dispatch/dispatch.h>
 #include <dlfcn.h>
 #include <stdint.h>
@@ -270,6 +271,8 @@ static inline uint32_t ConstructVersion(uint32_t Major, uint32_t Minor,
   return ((Major & 0xffff) << 16) | ((Minor & 0xff) << 8) | (Subminor & 0xff);
 }
 
+#define PLATFORM_MACOS 1
+
 int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
                                    uint32_t Minor, uint32_t Subminor) {
   dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck);
@@ -282,6 +285,29 @@ int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
   return AvailabilityVersionCheck(1, Versions);
 }
 
+#if TARGET_OS_OSX
+
+int32_t __isPlatformOrVariantPlatformVersionAtLeast(
+    uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor,
+    uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2) {
+  dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck);
+
+  if (!AvailabilityVersionCheck) {
+    // Handle case of back-deployment for older macOS.
+    if (Platform == PLATFORM_MACOS) {
+      return __isOSVersionAtLeast(Major, Minor, Subminor);
+    }
+    assert(Platform2 == PLATFORM_MACOS && "unexpected platform");
+    return __isOSVersionAtLeast(Major2, Minor2, Subminor2);
+  }
+  dyld_build_version_t Versions[] = {
+      {Platform, ConstructVersion(Major, Minor, Subminor)},
+      {Platform2, ConstructVersion(Major2, Minor2, Subminor2)}};
+  return AvailabilityVersionCheck(2, Versions);
+}
+
+#endif
+
 #elif __ANDROID__
 
 #include <pthread.h>
diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c
index 844eb27944142..830e25e4c0303 100644
--- a/compiler-rt/lib/builtins/trampoline_setup.c
+++ b/compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,3 +41,45 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
+
+// The AArch64 compiler generates calls to __trampoline_setup() when creating
+// trampoline functions on the stack for use with nested functions.
+// This function creates a custom 36-byte trampoline function on the stack
+// which loads x18 with a pointer to the outer function's locals
+// and then jumps to the target nested function.
+// Note: x18 is a reserved platform register on Windows and macOS.
+
+#if defined(__aarch64__) && defined(__ELF__)
+COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
+                                        int trampSizeAllocated,
+                                        const void *realFunc, void *localsPtr) {
+  // This should never happen, but if compiler did not allocate
+  // enough space on stack for the trampoline, abort.
+  if (trampSizeAllocated < 36)
+    compilerrt_abort();
+
+  // create trampoline
+  // Load realFunc into x17. mov/movk 16 bits at a time.
+  trampOnStack[0] =
+      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
+  trampOnStack[1] =
+      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
+  trampOnStack[2] =
+      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
+  trampOnStack[3] =
+      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
+  // Load localsPtr into x18
+  trampOnStack[4] =
+      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
+  trampOnStack[5] =
+      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
+  trampOnStack[6] =
+      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
+  trampOnStack[7] =
+      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
+  trampOnStack[8] = 0xd61f0220; // br x17
+
+  // Clear instruction cache.
+  __clear_cache(trampOnStack, &trampOnStack[9]);
+}
+#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)
diff --git a/compiler-rt/lib/gwp_asan/definitions.h b/compiler-rt/lib/gwp_asan/definitions.h
index bec0290389346..c6785d4aba695 100644
--- a/compiler-rt/lib/gwp_asan/definitions.h
+++ b/compiler-rt/lib/gwp_asan/definitions.h
@@ -12,7 +12,8 @@
 #define GWP_ASAN_TLS_INITIAL_EXEC                                              \
   __thread __attribute__((tls_model("initial-exec")))
 
-#define GWP_ASAN_UNLIKELY(X) __builtin_expect(!!(X), 0)
+#define GWP_ASAN_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define GWP_ASAN_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
 #define GWP_ASAN_ALWAYS_INLINE inline __attribute__((always_inline))
 
 #define GWP_ASAN_WEAK __attribute__((weak))
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
index 3f39402652a99..5d5c729cdc8b7 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
@@ -24,13 +24,13 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
   assert((Size % State.PageSize) == 0);
   zx_handle_t Vmo;
   zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo);
-  check(Status == ZX_OK, "Failed to create Vmo");
+  checkWithErrorCode(Status == ZX_OK, "Failed to create Vmo", Status);
   _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name));
   zx_vaddr_t Addr;
   Status = _zx_vmar_map(_zx_vmar_root_self(),
                         ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS,
                         0, Vmo, 0, Size, &Addr);
-  check(Status == ZX_OK, "Vmo mapping failed");
+  checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status);
   _zx_handle_close(Vmo);
   return reinterpret_cast<void *>(Addr);
 }
@@ -40,7 +40,7 @@ void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const {
   assert((Size % State.PageSize) == 0);
   zx_status_t Status = _zx_vmar_unmap(_zx_vmar_root_self(),
                                       reinterpret_cast<zx_vaddr_t>(Ptr), Size);
-  check(Status == ZX_OK, "Vmo unmapping failed");
+  checkWithErrorCode(Status == ZX_OK, "Vmo unmapping failed", Status);
 }
 
 void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
@@ -50,7 +50,8 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
       _zx_vmar_root_self(),
       ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0,
       Size, &GuardedPagePoolPlatformData.Vmar, &Addr);
-  check(Status == ZX_OK, "Failed to reserve guarded pool allocator memory");
+  checkWithErrorCode(Status == ZX_OK,
+                     "Failed to reserve guarded pool allocator memory", Status);
   _zx_object_set_property(GuardedPagePoolPlatformData.Vmar, ZX_PROP_NAME,
                           kGwpAsanGuardPageName, strlen(kGwpAsanGuardPageName));
   return reinterpret_cast<void *>(Addr);
@@ -59,8 +60,10 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
 void GuardedPoolAllocator::unreserveGuardedPool() {
   const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar;
   assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self());
-  check(_zx_vmar_destroy(Vmar) == ZX_OK, "Failed to destroy a vmar");
-  check(_zx_handle_close(Vmar) == ZX_OK, "Failed to close a vmar");
+  zx_status_t Status = _zx_vmar_destroy(Vmar);
+  checkWithErrorCode(Status == ZX_OK, "Failed to destroy a vmar", Status);
+  Status = _zx_handle_close(Vmar);
+  checkWithErrorCode(Status == ZX_OK, "Failed to close a vmar", Status);
   GuardedPagePoolPlatformData.Vmar = ZX_HANDLE_INVALID;
 }
 
@@ -69,7 +72,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
   assert((Size % State.PageSize) == 0);
   zx_handle_t Vmo;
   zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo);
-  check(Status == ZX_OK, "Failed to create vmo");
+  checkWithErrorCode(Status == ZX_OK, "Failed to create vmo", Status);
   _zx_object_set_property(Vmo, ZX_PROP_NAME, kGwpAsanAliveSlotName,
                           strlen(kGwpAsanAliveSlotName));
   const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar;
@@ -81,7 +84,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
                         ZX_VM_PERM_READ | ZX_VM_PERM_WRITE |
                             ZX_VM_ALLOW_FAULTS | ZX_VM_SPECIFIC,
                         Offset, Vmo, 0, Size, &P);
-  check(Status == ZX_OK, "Vmo mapping failed");
+  checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status);
   _zx_handle_close(Vmo);
 }
 
@@ -93,7 +96,7 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr,
   assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self());
   const zx_status_t Status =
       _zx_vmar_unmap(Vmar, reinterpret_cast<zx_vaddr_t>(Ptr), Size);
-  check(Status == ZX_OK, "Vmar unmapping failed");
+  checkWithErrorCode(Status == ZX_OK, "Vmar unmapping failed", Status);
 }
 
 size_t GuardedPoolAllocator::getPlatformPageSize() {
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
index 549e31acb176d..7b2e19956c796 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
@@ -12,6 +12,7 @@
 #include "gwp_asan/utilities.h"
 
 #include <assert.h>
+#include <errno.h>
 #include <pthread.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -46,7 +47,8 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
   assert((Size % State.PageSize) == 0);
   void *Ptr = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  check(Ptr != MAP_FAILED, "Failed to map guarded pool allocator memory");
+  checkWithErrorCode(Ptr != MAP_FAILED,
+                     "Failed to map guarded pool allocator memory", errno);
   MaybeSetMappingName(Ptr, Size, Name);
   return Ptr;
 }
@@ -54,15 +56,16 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
 void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const {
   assert((reinterpret_cast<uintptr_t>(Ptr) % State.PageSize) == 0);
   assert((Size % State.PageSize) == 0);
-  check(munmap(Ptr, Size) == 0,
-        "Failed to unmap guarded pool allocator memory.");
+  checkWithErrorCode(munmap(Ptr, Size) == 0,
+                     "Failed to unmap guarded pool allocator memory.", errno);
 }
 
 void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
   assert((Size % State.PageSize) == 0);
   void *Ptr =
       mmap(nullptr, Size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  check(Ptr != MAP_FAILED, "Failed to reserve guarded pool allocator memory");
+  checkWithErrorCode(Ptr != MAP_FAILED,
+                     "Failed to reserve guarded pool allocator memory", errno);
   MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName);
   return Ptr;
 }
@@ -75,8 +78,9 @@ void GuardedPoolAllocator::unreserveGuardedPool() {
 void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
   assert((reinterpret_cast<uintptr_t>(Ptr) % State.PageSize) == 0);
   assert((Size % State.PageSize) == 0);
-  check(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0,
-        "Failed to allocate in guarded pool allocator memory");
+  checkWithErrorCode(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0,
+                     "Failed to allocate in guarded pool allocator memory",
+                     errno);
   MaybeSetMappingName(Ptr, Size, kGwpAsanAliveSlotName);
 }
 
@@ -87,9 +91,10 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr,
   // mmap() a PROT_NONE page over the address to release it to the system, if
   // we used mprotect() here the system would count pages in the quarantine
   // against the RSS.
-  check(mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1,
-             0) != MAP_FAILED,
-        "Failed to deallocate in guarded pool allocator memory");
+  checkWithErrorCode(
+      mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1,
+           0) != MAP_FAILED,
+      "Failed to deallocate in guarded pool allocator memory", errno);
   MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName);
 }
 
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
index bc9d3a4462a2f..fecf94b0d1a19 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
@@ -8,12 +8,25 @@
 
 #include "gwp_asan/utilities.h"
 
+#include <alloca.h>
+#include <stdio.h>
 #include <string.h>
 #include <zircon/sanitizer.h>
+#include <zircon/status.h>
 
 namespace gwp_asan {
 void die(const char *Message) {
   __sanitizer_log_write(Message, strlen(Message));
   __builtin_trap();
 }
+
+void dieWithErrorCode(const char *Message, int64_t ErrorCode) {
+  const char *error_str =
+      _zx_status_get_string(static_cast<zx_status_t>(ErrorCode));
+  size_t buffer_size = strlen(Message) + 32 + strlen(error_str);
+  char *buffer = static_cast<char *>(alloca(buffer_size));
+  snprintf(buffer, buffer_size, "%s (Error Code: %s)", Message, error_str);
+  __sanitizer_log_write(buffer, strlen(buffer));
+  __builtin_trap();
+}
 } // namespace gwp_asan
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
index 7357963055095..750198062ce4f 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
@@ -6,7 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <alloca.h>
 #include <features.h> // IWYU pragma: keep (for __BIONIC__ macro)
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
 
 #ifdef __BIONIC__
 #include "gwp_asan/definitions.h"
@@ -27,4 +31,21 @@ void die(const char *Message) {
   __builtin_trap();
 #endif // __BIONIC__
 }
+
+void dieWithErrorCode(const char *Message, int64_t ErrorCode) {
+#ifdef __BIONIC__
+  if (&android_set_abort_message == nullptr)
+    abort();
+
+  size_t buffer_size = strlen(Message) + 48;
+  char *buffer = static_cast<char *>(alloca(buffer_size));
+  snprintf(buffer, buffer_size, "%s (Error Code: %" PRId64 ")", Message,
+           ErrorCode);
+  android_set_abort_message(buffer);
+  abort();
+#else  // __BIONIC__
+  fprintf(stderr, "%s (Error Code: %" PRId64 ")", Message, ErrorCode);
+  __builtin_trap();
+#endif // __BIONIC__
+}
 } // namespace gwp_asan
diff --git a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
index ca43ec2a94ac4..5de1af10eec36 100644
--- a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
@@ -28,7 +28,9 @@ set(GWP_ASAN_UNITTESTS
   late_init.cpp
   options.cpp
   recoverable.cpp
-  never_allocated.cpp)
+  never_allocated.cpp
+  utilities.cpp
+)
 
 set(GWP_ASAN_UNIT_TEST_HEADERS
   ${GWP_ASAN_HEADERS}
diff --git a/compiler-rt/lib/gwp_asan/tests/utilities.cpp b/compiler-rt/lib/gwp_asan/tests/utilities.cpp
new file mode 100644
index 0000000000000..09a54e526f5db
--- /dev/null
+++ b/compiler-rt/lib/gwp_asan/tests/utilities.cpp
@@ -0,0 +1,24 @@
+//===-- utilities.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gwp_asan/utilities.h"
+#include "gwp_asan/tests/harness.h"
+
+using gwp_asan::check;
+using gwp_asan::checkWithErrorCode;
+
+TEST(UtilitiesDeathTest, CheckPrintsAsExpected) {
+  EXPECT_DEATH({ check(false, "Hello world"); }, "Hello world");
+  check(true, "Should not crash");
+  EXPECT_DEATH(
+      { checkWithErrorCode(false, "Hello world", 1337); },
+      "Hello world \\(Error Code: 1337\\)");
+  EXPECT_DEATH(
+      { checkWithErrorCode(false, "Hello world", -1337); },
+      "Hello world \\(Error Code: -1337\\)");
+}
diff --git a/compiler-rt/lib/gwp_asan/utilities.h b/compiler-rt/lib/gwp_asan/utilities.h
index 76e5df2e3eb8f..02f450a9eeec3 100644
--- a/compiler-rt/lib/gwp_asan/utilities.h
+++ b/compiler-rt/lib/gwp_asan/utilities.h
@@ -12,17 +12,28 @@
 #include "gwp_asan/definitions.h"
 
 #include <stddef.h>
+#include <stdint.h>
 
 namespace gwp_asan {
 // Terminates in a platform-specific way with `Message`.
 void die(const char *Message);
+void dieWithErrorCode(const char *Message, int64_t ErrorCode);
 
 // Checks that `Condition` is true, otherwise dies with `Message`.
 GWP_ASAN_ALWAYS_INLINE void check(bool Condition, const char *Message) {
-  if (Condition)
+  if (GWP_ASAN_LIKELY(Condition))
     return;
   die(Message);
 }
+
+// Checks that `Condition` is true, otherwise dies with `Message` (including
+// errno at the end).
+GWP_ASAN_ALWAYS_INLINE void
+checkWithErrorCode(bool Condition, const char *Message, int64_t ErrorCode) {
+  if (GWP_ASAN_LIKELY(Condition))
+    return;
+  dieWithErrorCode(Message, ErrorCode);
+}
 } // namespace gwp_asan
 
 #endif // GWP_ASAN_UTILITIES_H_
diff --git a/compiler-rt/lib/interception/interception_linux.h b/compiler-rt/lib/interception/interception_linux.h
index 433a3d9bd7fa7..2e01ff44578c3 100644
--- a/compiler-rt/lib/interception/interception_linux.h
+++ b/compiler-rt/lib/interception/interception_linux.h
@@ -28,12 +28,14 @@ bool InterceptFunction(const char *name, const char *ver, uptr *ptr_to_real,
                        uptr func, uptr trampoline);
 }  // namespace __interception
 
-#define INTERCEPT_FUNCTION_LINUX_OR_FREEBSD(func) \
-  ::__interception::InterceptFunction(            \
-      #func,                                      \
-      (::__interception::uptr *)&REAL(func),      \
-      (::__interception::uptr)&(func),            \
-      (::__interception::uptr)&TRAMPOLINE(func))
+// Cast func to type of REAL(func) before casting to uptr in case it is an
+// overloaded function, which is the case for some glibc functions when
+// _FORTIFY_SOURCE is used. This disambiguates which overload to use.
+#define INTERCEPT_FUNCTION_LINUX_OR_FREEBSD(func)            \
+  ::__interception::InterceptFunction(                       \
+      #func, (::__interception::uptr *)&REAL(func),          \
+      (::__interception::uptr)(decltype(REAL(func)))&(func), \
+      (::__interception::uptr) &TRAMPOLINE(func))
 
 // dlvsym is a GNU extension supported by some other platforms.
 #if SANITIZER_GLIBC || SANITIZER_FREEBSD || SANITIZER_NETBSD
@@ -41,7 +43,7 @@ bool InterceptFunction(const char *name, const char *ver, uptr *ptr_to_real,
   ::__interception::InterceptFunction(                        \
       #func, symver,                                          \
       (::__interception::uptr *)&REAL(func),                  \
-      (::__interception::uptr)&(func),                        \
+      (::__interception::uptr)(decltype(REAL(func)))&(func),  \
       (::__interception::uptr)&TRAMPOLINE(func))
 #else
 #define INTERCEPT_FUNCTION_VER_LINUX_OR_FREEBSD(func, symver) \
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index 6df4b6865b379..b569c337e9764 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -389,7 +389,7 @@ INTERCEPTOR(int, atexit, void (*f)()) {
 extern "C" {
 extern int _pthread_atfork(void (*prepare)(), void (*parent)(),
                            void (*child)());
-};
+}
 
 INTERCEPTOR(int, pthread_atfork, void (*prepare)(), void (*parent)(),
             void (*child)()) {
diff --git a/compiler-rt/lib/memprof/memprof_mapping.h b/compiler-rt/lib/memprof/memprof_mapping.h
index fef8acfcfc921..6da385ab3d6e2 100644
--- a/compiler-rt/lib/memprof/memprof_mapping.h
+++ b/compiler-rt/lib/memprof/memprof_mapping.h
@@ -55,7 +55,7 @@ extern uptr kHighMemEnd; // Initialized in __memprof_init.
 // computed by summing up all individual 1 byte counters. This can incur an
 // accuracy penalty.
 
-#define HISTOGRAM_GRANULARITY 8U
+#define HISTOGRAM_GRANULARITY 8ULL
 
 #define HISTOGRAM_MAX_COUNTER 255U
 
diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp
index 718242c2ecdf8..568a1b3a65575 100644
--- a/compiler-rt/lib/nsan/nsan.cpp
+++ b/compiler-rt/lib/nsan/nsan.cpp
@@ -54,8 +54,6 @@ using namespace __nsan;
 constexpr int kMaxVectorWidth = 8;
 
 // When copying application memory, we also copy its shadow and shadow type.
-// FIXME: We could provide fixed-size versions that would nicely
-// vectorize for known sizes.
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
 __nsan_copy_values(const u8 *daddr, const u8 *saddr, uptr size) {
   internal_memmove((void *)GetShadowTypeAddrFor(daddr),
@@ -64,13 +62,33 @@ __nsan_copy_values(const u8 *daddr, const u8 *saddr, uptr size) {
                    size * kShadowScale);
 }
 
-// FIXME: We could provide fixed-size versions that would nicely
-// vectorize for known sizes.
+#define NSAN_COPY_VALUES_N(N)                                                  \
+  extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_copy_##N(               \
+      const u8 *daddr, const u8 *saddr) {                                      \
+    __builtin_memmove((void *)GetShadowTypeAddrFor(daddr),                     \
+                      GetShadowTypeAddrFor(saddr), N);                         \
+    __builtin_memmove((void *)GetShadowAddrFor(daddr),                         \
+                      GetShadowAddrFor(saddr), N * kShadowScale);              \
+  }
+
+NSAN_COPY_VALUES_N(4)
+NSAN_COPY_VALUES_N(8)
+NSAN_COPY_VALUES_N(16)
+
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
 __nsan_set_value_unknown(const u8 *addr, uptr size) {
   internal_memset((void *)GetShadowTypeAddrFor(addr), 0, size);
 }
 
+#define NSAN_SET_VALUE_UNKNOWN_N(N)                                            \
+  extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_set_value_unknown_##N(  \
+      const u8 *daddr) {                                                       \
+    __builtin_memset((void *)GetShadowTypeAddrFor(daddr), 0, N);               \
+  }
+
+NSAN_SET_VALUE_UNKNOWN_N(4)
+NSAN_SET_VALUE_UNKNOWN_N(8)
+NSAN_SET_VALUE_UNKNOWN_N(16)
 
 const char *FTInfo<float>::kCppTypeName = "float";
 const char *FTInfo<double>::kCppTypeName = "double";
diff --git a/compiler-rt/lib/nsan/nsan_interceptors.cpp b/compiler-rt/lib/nsan/nsan_interceptors.cpp
index 544b44f53cc42..852524bd37332 100644
--- a/compiler-rt/lib/nsan/nsan_interceptors.cpp
+++ b/compiler-rt/lib/nsan/nsan_interceptors.cpp
@@ -21,10 +21,6 @@
 
 #include <wchar.h>
 
-#if SANITIZER_LINUX
-extern "C" int mallopt(int param, int value);
-#endif
-
 using namespace __sanitizer;
 using __nsan::nsan_init_is_running;
 using __nsan::nsan_initialized;
@@ -209,12 +205,6 @@ void __nsan::InitializeInterceptors() {
   static bool initialized = false;
   CHECK(!initialized);
 
-  // Instruct libc malloc to consume less memory.
-#if SANITIZER_LINUX
-  mallopt(1, 0);          // M_MXFAST
-  mallopt(-3, 32 * 1024); // M_MMAP_THRESHOLD
-#endif
-
   InitializeMallocInterceptors();
 
   INTERCEPT_FUNCTION(memset);
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index d424a22c212c3..6906d52eacaf1 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -49,7 +49,6 @@ typedef struct ValueProfNode {
 #include "profile/InstrProfData.inc"
 } ValueProfNode;
 
-typedef void *IntPtrT;
 typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData {
 #define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name;
 #include "profile/InstrProfData.inc"
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 1c58584d2d4f7..db3918d841031 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -101,6 +101,8 @@ static const int UseBiasVar = 0;
 static const char *FileOpenMode = "a+b";
 static void *BiasAddr = NULL;
 static void *BiasDefaultAddr = NULL;
+static void *BitmapBiasAddr = NULL;
+static void *BitmapBiasDefaultAddr = NULL;
 static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
   /* Get the sizes of various profile data sections. Taken from
    * __llvm_profile_get_size_for_buffer(). */
@@ -199,11 +201,15 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR                            \
   INSTR_PROF_CONCAT(INSTR_PROF_PROFILE_COUNTER_BIAS_VAR, _default)
 COMPILER_RT_VISIBILITY intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR = 0;
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR                             \
+  INSTR_PROF_CONCAT(INSTR_PROF_PROFILE_BITMAP_BIAS_VAR, _default)
+COMPILER_RT_VISIBILITY intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR = 0;
 
 /* This variable is a weak external reference which could be used to detect
  * whether or not the compiler defined this symbol. */
 #if defined(_MSC_VER)
 COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
+COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_VAR;
 #if defined(_M_IX86) || defined(__i386__)
 #define WIN_SYM_PREFIX "_"
 #else
@@ -213,10 +219,17 @@ COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
     linker, "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE(                 \
                 INSTR_PROF_PROFILE_COUNTER_BIAS_VAR) "=" WIN_SYM_PREFIX        \
                 INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR))
+#pragma comment(                                                               \
+    linker, "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE(                 \
+                INSTR_PROF_PROFILE_BITMAP_BIAS_VAR) "=" WIN_SYM_PREFIX         \
+                INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR))
 #else
 COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR
     __attribute__((weak, alias(INSTR_PROF_QUOTE(
                              INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR))));
+COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_VAR
+    __attribute__((weak, alias(INSTR_PROF_QUOTE(
+                             INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR))));
 #endif
 static const int ContinuousModeSupported = 1;
 static const int UseBiasVar = 1;
@@ -227,6 +240,9 @@ static const char *FileOpenMode = "w+b";
  * used and runtime provides a weak alias so we can check if it's defined. */
 static void *BiasAddr = &INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
 static void *BiasDefaultAddr = &INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR;
+static void *BitmapBiasAddr = &INSTR_PROF_PROFILE_BITMAP_BIAS_VAR;
+static void *BitmapBiasDefaultAddr =
+    &INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR;
 static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
   /* Get the sizes of various profile data sections. Taken from
    * __llvm_profile_get_size_for_buffer(). */
@@ -237,12 +253,18 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
   const char *BitmapBegin = __llvm_profile_begin_bitmap();
   const char *BitmapEnd = __llvm_profile_end_bitmap();
   uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd);
+  uint64_t CountersSize =
+      __llvm_profile_get_counters_size(CountersBegin, CountersEnd);
+  uint64_t NumBitmapBytes =
+      __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd);
   /* Get the file size. */
   uint64_t FileSize = 0;
   if (getProfileFileSizeForMerging(File, &FileSize))
     return 1;
 
   int Fileno = fileno(File);
+  uint64_t PaddingBytesAfterCounters =
+      __llvm_profile_get_num_padding_bytes(CountersSize);
   uint64_t FileOffsetToCounters =
       sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize;
 
@@ -260,7 +282,17 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
   /* Return the memory allocated for counters to OS. */
   lprofReleaseMemoryPagesToOS((uintptr_t)CountersBegin, (uintptr_t)CountersEnd);
 
-  /* BIAS MODE not supported yet for Bitmap (MCDC). */
+  /* Also mmap MCDC bitmap bytes. If there aren't any bitmap bytes, mmap()
+   * will fail with EINVAL. */
+  if (NumBitmapBytes == 0)
+    return 0;
+
+  /* Update profbm_bias. */
+  uint64_t FileOffsetToBitmap =
+      FileOffsetToCounters + CountersSize + PaddingBytesAfterCounters;
+  /* Update the profile fields based on the current mapping. */
+  INSTR_PROF_PROFILE_BITMAP_BIAS_VAR =
+      (uintptr_t)Profile - (uintptr_t)BitmapBegin + FileOffsetToBitmap;
 
   /* Return the memory allocated for counters to OS. */
   lprofReleaseMemoryPagesToOS((uintptr_t)BitmapBegin, (uintptr_t)BitmapEnd);
@@ -272,6 +304,8 @@ static const int UseBiasVar = 0;
 static const char *FileOpenMode = "a+b";
 static void *BiasAddr = NULL;
 static void *BiasDefaultAddr = NULL;
+static void *BitmapBiasAddr = NULL;
+static void *BitmapBiasDefaultAddr = NULL;
 static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
   return 0;
 }
@@ -619,8 +653,10 @@ static void initializeProfileForContinuousMode(void) {
     PROF_ERR("%s\n", "continuous mode is unsupported on this platform");
     return;
   }
-  if (UseBiasVar && BiasAddr == BiasDefaultAddr) {
-    PROF_ERR("%s\n", "__llvm_profile_counter_bias is undefined");
+  if (UseBiasVar && BiasAddr == BiasDefaultAddr &&
+      BitmapBiasAddr == BitmapBiasDefaultAddr) {
+    PROF_ERR("%s\n", "Neither __llvm_profile_counter_bias nor "
+                     "__llvm_profile_bitmap_bias is defined");
     return;
   }
 
diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp
index 43a63b4636f1e..cf7fbddd9eb9c 100644
--- a/compiler-rt/lib/rtsan/rtsan.cpp
+++ b/compiler-rt/lib/rtsan/rtsan.cpp
@@ -12,10 +12,23 @@
 #include <rtsan/rtsan_context.h>
 #include <rtsan/rtsan_interceptors.h>
 
+using namespace __rtsan;
+
+bool __rtsan::rtsan_initialized;
+bool __rtsan::rtsan_init_is_running;
+
 extern "C" {
 
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init() {
-  __rtsan::InitializeInterceptors();
+  CHECK(!rtsan_init_is_running);
+  if (rtsan_initialized)
+    return;
+  rtsan_init_is_running = true;
+
+  InitializeInterceptors();
+
+  rtsan_init_is_running = false;
+  rtsan_initialized = true;
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_enter() {
diff --git a/compiler-rt/lib/rtsan/rtsan.h b/compiler-rt/lib/rtsan/rtsan.h
index 8b1219c13cbd3..ccddaf2c893ef 100644
--- a/compiler-rt/lib/rtsan/rtsan.h
+++ b/compiler-rt/lib/rtsan/rtsan.h
@@ -14,6 +14,13 @@
 
 extern "C" {
 
+namespace __rtsan {
+
+extern bool rtsan_initialized;
+extern bool rtsan_init_is_running;
+
+} // namespace __rtsan
+
 // Initialise rtsan interceptors.
 // A call to this method is added to the preinit array on Linux systems.
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init();
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
index 3a65f9d3f779d..b63040446e53c 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
@@ -10,13 +10,29 @@
 
 #include "rtsan/rtsan_interceptors.h"
 
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_allocator_dlsym.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_platform.h"
 #include "sanitizer_common/sanitizer_platform_interceptors.h"
 
 #include "interception/interception.h"
+#include "rtsan/rtsan.h"
 #include "rtsan/rtsan_context.h"
 
 #if SANITIZER_APPLE
+
+#if TARGET_OS_MAC
+// On MacOS OSSpinLockLock is deprecated and no longer present in the headers,
+// but the symbol still exists on the system. Forward declare here so we
+// don't get compilation errors.
+#include <stdint.h>
+extern "C" {
+typedef int32_t OSSpinLock;
+void OSSpinLockLock(volatile OSSpinLock *__lock);
+}
+#endif
+
 #include <libkern/OSAtomic.h>
 #include <os/lock.h>
 #endif
@@ -35,6 +51,15 @@
 
 using namespace __sanitizer;
 
+using __rtsan::rtsan_init_is_running;
+using __rtsan::rtsan_initialized;
+
+namespace {
+struct DlsymAlloc : public DlSymAllocator<DlsymAlloc> {
+  static bool UseImpl() { return !rtsan_initialized; }
+};
+} // namespace
+
 void ExpectNotRealtime(const char *intercepted_function_name) {
   __rtsan::GetContextForThisThread().ExpectNotRealtime(
       intercepted_function_name);
@@ -238,11 +263,17 @@ INTERCEPTOR(int, nanosleep, const struct timespec *rqtp,
 // Memory
 
 INTERCEPTOR(void *, calloc, SIZE_T num, SIZE_T size) {
+  if (DlsymAlloc::Use())
+    return DlsymAlloc::Callocate(num, size);
+
   ExpectNotRealtime("calloc");
   return REAL(calloc)(num, size);
 }
 
 INTERCEPTOR(void, free, void *ptr) {
+  if (DlsymAlloc::PointerIsMine(ptr))
+    return DlsymAlloc::Free(ptr);
+
   if (ptr != NULL) {
     ExpectNotRealtime("free");
   }
@@ -250,11 +281,17 @@ INTERCEPTOR(void, free, void *ptr) {
 }
 
 INTERCEPTOR(void *, malloc, SIZE_T size) {
+  if (DlsymAlloc::Use())
+    return DlsymAlloc::Allocate(size);
+
   ExpectNotRealtime("malloc");
   return REAL(malloc)(size);
 }
 
 INTERCEPTOR(void *, realloc, void *ptr, SIZE_T size) {
+  if (DlsymAlloc::Use() || DlsymAlloc::PointerIsMine(ptr))
+    return DlsymAlloc::Realloc(ptr, size);
+
   ExpectNotRealtime("realloc");
   return REAL(realloc)(ptr, size);
 }
diff --git a/compiler-rt/lib/rtsan/tests/CMakeLists.txt b/compiler-rt/lib/rtsan/tests/CMakeLists.txt
index 9eda116541ae3..3b783c90c2658 100644
--- a/compiler-rt/lib/rtsan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/rtsan/tests/CMakeLists.txt
@@ -36,6 +36,7 @@ set(RTSAN_UNITTEST_LINK_FLAGS
   ${SANITIZER_TEST_CXX_LIBRARIES}
   -no-pie)
 
+append_list_if(COMPILER_RT_HAS_LIBATOMIC -latomic RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBDL -ldl RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBRT -lrt RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBM -lm RTSAN_UNITTEST_LINK_FLAGS)
diff --git a/compiler-rt/lib/safestack/safestack.cpp b/compiler-rt/lib/safestack/safestack.cpp
index 0751f3988b9c1..f01a642987646 100644
--- a/compiler-rt/lib/safestack/safestack.cpp
+++ b/compiler-rt/lib/safestack/safestack.cpp
@@ -13,14 +13,39 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define SANITIZER_COMMON_NO_REDEFINE_BUILTINS
+
 #include "safestack_platform.h"
 #include "safestack_util.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
 
 #include <errno.h>
+#include <string.h>
 #include <sys/resource.h>
 
 #include "interception/interception.h"
 
+// interception.h drags in sanitizer_redefine_builtins.h, which in turn
+// creates references to __sanitizer_internal_memcpy etc.  The interceptors
+// aren't needed here, so just forward to libc.
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memcpy(void *dest,
+                                                                const void *src,
+                                                                size_t n) {
+  return memcpy(dest, src, n);
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memmove(
+    void *dest, const void *src, size_t n) {
+  return memmove(dest, src, n);
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memset(void *s, int c,
+                                                                size_t n) {
+  return memset(s, c, n);
+}
+}  // extern "C"
+
 using namespace safestack;
 
 // TODO: To make accessing the unsafe stack pointer faster, we plan to
@@ -224,6 +249,17 @@ INTERCEPTOR(int, pthread_create, pthread_t *thread,
     pthread_attr_destroy(&tmpattr);
   }
 
+#if SANITIZER_SOLARIS
+  // Solaris pthread_attr_init initializes stacksize to 0 (the default), so
+  // hardcode the actual values as documented in pthread_create(3C).
+  if (size == 0)
+#  if defined(_LP64)
+    size = 2 * 1024 * 1024;
+#  else
+    size = 1024 * 1024;
+#  endif
+#endif
+
   SFS_CHECK(size);
   size = RoundUpTo(size, kStackAlign);
 
diff --git a/compiler-rt/lib/safestack/safestack_platform.h b/compiler-rt/lib/safestack/safestack_platform.h
index d4b2e2ef7391c..41c7c25fdaf4d 100644
--- a/compiler-rt/lib/safestack/safestack_platform.h
+++ b/compiler-rt/lib/safestack/safestack_platform.h
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements platform specific parts of SafeStack runtime.
+// Don't use equivalent functionality from sanitizer_common to avoid dragging
+// a large codebase into security sensitive code.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,6 +19,7 @@
 #include "sanitizer_common/sanitizer_platform.h"
 
 #include <dlfcn.h>
+#include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,6 +47,19 @@ extern "C" void *__mmap(void *, size_t, int, int, int, int, off_t);
 #  include <thread.h>
 #endif
 
+// Keep in sync with sanitizer_linux.cpp.
+//
+// Are we using 32-bit or 64-bit Linux syscalls?
+// x32 (which defines __x86_64__) has SANITIZER_WORDSIZE == 32
+// but it still needs to use 64-bit syscalls.
+#if SANITIZER_LINUX &&                                \
+    (defined(__x86_64__) || defined(__powerpc64__) || \
+     SANITIZER_WORDSIZE == 64 || (defined(__mips__) && _MIPS_SIM == _ABIN32))
+#  define SANITIZER_LINUX_USES_64BIT_SYSCALLS 1
+#else
+#  define SANITIZER_LINUX_USES_64BIT_SYSCALLS 0
+#endif
+
 namespace safestack {
 
 #if SANITIZER_NETBSD
@@ -68,6 +84,24 @@ static void *GetRealLibcAddress(const char *symbol) {
   SFS_CHECK(real_##func);
 #endif
 
+#if SANITIZER_SOLARIS
+#  define _REAL(func) _##func
+#  define DEFINE__REAL(ret_type, func, ...) \
+    extern "C" ret_type _REAL(func)(__VA_ARGS__)
+
+#  if !defined(_LP64) && _FILE_OFFSET_BITS == 64
+#    define _REAL64(func) _##func##64
+#  else
+#    define _REAL64(func) _REAL(func)
+#  endif
+#  define DEFINE__REAL64(ret_type, func, ...) \
+    extern "C" ret_type _REAL64(func)(__VA_ARGS__)
+
+DEFINE__REAL64(void *, mmap, void *a, size_t b, int c, int d, int e, off_t f);
+DEFINE__REAL(int, munmap, void *a, size_t b);
+DEFINE__REAL(int, mprotect, void *a, size_t b, int c);
+#endif
+
 using ThreadId = uint64_t;
 
 inline ThreadId GetTid() {
@@ -91,15 +125,15 @@ inline int TgKill(pid_t pid, ThreadId tid, int sig) {
   (void)pid;
   return _REAL(_lwp_kill, tid, sig);
 #elif SANITIZER_SOLARIS
-#  ifdef SYS_lwp_kill
-  return syscall(SYS_lwp_kill, tid, sig);
-#  else
-  return -1;
-#  endif
+  (void)pid;
+  errno = thr_kill(tid, sig);
+  // TgKill is expected to return -1 on error, not an errno.
+  return errno != 0 ? -1 : 0;
 #elif SANITIZER_FREEBSD
   return syscall(SYS_thr_kill2, pid, tid, sig);
 #else
-  return syscall(SYS_tgkill, pid, tid, sig);
+  // tid is pid_t (int), not ThreadId (uint64_t).
+  return syscall(SYS_tgkill, pid, (pid_t)tid, sig);
 #endif
 }
 
@@ -109,11 +143,17 @@ inline void *Mmap(void *addr, size_t length, int prot, int flags, int fd,
   return __mmap(addr, length, prot, flags, fd, 0, offset);
 #elif SANITIZER_FREEBSD && (defined(__aarch64__) || defined(__x86_64__))
   return (void *)__syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
+#elif SANITIZER_FREEBSD && (defined(__i386__))
+  return (void *)syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
 #elif SANITIZER_SOLARIS
-  return (void *)(uintptr_t)syscall(SYS_mmap, addr, length, prot, flags, fd,
-                                    offset);
-#else
+  return _REAL64(mmap)(addr, length, prot, flags, fd, offset);
+#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS
   return (void *)syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
+#else
+  // mmap2 specifies file offset in 4096-byte units.
+  SFS_CHECK(IsAligned(offset, 4096));
+  return (void *)syscall(SYS_mmap2, addr, length, prot, flags, fd,
+                         offset / 4096);
 #endif
 }
 
@@ -121,6 +161,8 @@ inline int Munmap(void *addr, size_t length) {
 #if SANITIZER_NETBSD
   DEFINE__REAL(int, munmap, void *a, size_t b);
   return _REAL(munmap, addr, length);
+#elif SANITIZER_SOLARIS
+  return _REAL(munmap)(addr, length);
 #else
   return syscall(SYS_munmap, addr, length);
 #endif
@@ -130,6 +172,8 @@ inline int Mprotect(void *addr, size_t length, int prot) {
 #if SANITIZER_NETBSD
   DEFINE__REAL(int, mprotect, void *a, size_t b, int c);
   return _REAL(mprotect, addr, length, prot);
+#elif SANITIZER_SOLARIS
+  return _REAL(mprotect)(addr, length, prot);
 #else
   return syscall(SYS_mprotect, addr, length, prot);
 #endif
diff --git a/compiler-rt/lib/safestack/safestack_util.h b/compiler-rt/lib/safestack/safestack_util.h
index da3f11b54cabc..ca724312c3eec 100644
--- a/compiler-rt/lib/safestack/safestack_util.h
+++ b/compiler-rt/lib/safestack/safestack_util.h
@@ -33,6 +33,10 @@ inline size_t RoundUpTo(size_t size, size_t boundary) {
   return (size + boundary - 1) & ~(boundary - 1);
 }
 
+inline constexpr bool IsAligned(size_t a, size_t alignment) {
+  return (a & (alignment - 1)) == 0;
+}
+
 class MutexLock {
  public:
   explicit MutexLock(pthread_mutex_t &mutex) : mutex_(&mutex) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
index 16cdc4ce53b35..0b0bdb07041e5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
@@ -316,13 +316,13 @@ class SizeClassAllocator64 {
     Printf(
         "%s %02zd (%6zd): mapped: %6zdK allocs: %7zd frees: %7zd inuse: %6zd "
         "num_freed_chunks %7zd avail: %6zd rss: %6zdK releases: %6zd "
-        "last released: %6lldK region: 0x%zx\n",
+        "last released: %6lldK region: %p\n",
         region->exhausted ? "F" : " ", class_id, ClassIdToSize(class_id),
         region->mapped_user >> 10, region->stats.n_allocated,
         region->stats.n_freed, in_use, region->num_freed_chunks, avail_chunks,
         rss >> 10, region->rtoi.num_releases,
         region->rtoi.last_released_bytes >> 10,
-        SpaceBeg() + kRegionSize * class_id);
+        (void *)(SpaceBeg() + kRegionSize * class_id));
   }
 
   void PrintStats() {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 032b04a09ae76..49c9dcbef358f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -6336,7 +6336,7 @@ INTERCEPTOR(void*, dlopen, const char *filename, int flag) {
 
       const char *SelfFName = DladdrSelfFName();
       VPrintf(1, "dlopen interceptor: DladdrSelfFName: %p %s\n",
-              (void *)SelfFName, SelfFName);
+              (const void *)SelfFName, SelfFName);
 
       if (SelfFName && internal_strcmp(SelfFName, filename) == 0) {
         // It's possible they copied the string from dladdr, so
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_libcdep_new.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_libcdep_new.cpp
index ce4326967180d..506659a58c45e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_libcdep_new.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_libcdep_new.cpp
@@ -75,7 +75,8 @@ static void SanitizerDumpCoverage(const uptr* unsorted_pcs, uptr len) {
     if (!pc) continue;
 
     if (!GetModuleAndOffsetForPc(pc, nullptr, 0, &pcs[i])) {
-      Printf("ERROR: unknown pc 0x%zx (may happen if dlclose is used)\n", pc);
+      Printf("ERROR: unknown pc %p (may happen if dlclose is used)\n",
+             (void*)pc);
       continue;
     }
     uptr module_base = pc - pcs[i];
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp
index b7fc9444cc66b..f0e1e3d69def5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp
@@ -105,8 +105,8 @@ void LibIgnore::OnLibraryLoaded(const char *name) {
           continue;
         if (IsPcInstrumented(range.beg) && IsPcInstrumented(range.end - 1))
           continue;
-        VReport(1, "Adding instrumented range 0x%zx-0x%zx from library '%s'\n",
-                range.beg, range.end, mod.full_name());
+        VReport(1, "Adding instrumented range %p-%p from library '%s'\n",
+                (void *)range.beg, (void *)range.end, mod.full_name());
         const uptr idx =
             atomic_load(&instrumented_ranges_count_, memory_order_relaxed);
         CHECK_LT(idx, ARRAY_SIZE(instrumented_code_ranges_));
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 7935c88204a05..fd5ff1b400545 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -33,11 +33,15 @@
 // For mips64, syscall(__NR_stat) fills the buffer in the 'struct kernel_stat'
 // format. Struct kernel_stat is defined as 'struct stat' in asm/stat.h. To
 // access stat from asm/stat.h, without conflicting with definition in
-// sys/stat.h, we use this trick.
-#  if SANITIZER_MIPS64
+// sys/stat.h, we use this trick.  sparc64 is similar, using
+// syscall(__NR_stat64) and struct kernel_stat64.
+#  if SANITIZER_LINUX && (SANITIZER_MIPS64 || SANITIZER_SPARC64)
 #    include <asm/unistd.h>
 #    include <sys/types.h>
 #    define stat kernel_stat
+#    if SANITIZER_SPARC64
+#      define stat64 kernel_stat64
+#    endif
 #    if SANITIZER_GO
 #      undef st_atime
 #      undef st_mtime
@@ -48,6 +52,7 @@
 #    endif
 #    include <asm/stat.h>
 #    undef stat
+#    undef stat64
 #  endif
 
 #  include <dlfcn.h>
@@ -220,7 +225,7 @@ uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
   // mmap2 specifies file offset in 4096-byte units.
   CHECK(IsAligned(offset, 4096));
   return internal_syscall(SYSCALL(mmap2), addr, length, prot, flags, fd,
-                          offset / 4096);
+                          (OFF_T)(offset / 4096));
 #      endif
 }
 #    endif  // !SANITIZER_S390
@@ -285,8 +290,7 @@ uptr internal_ftruncate(fd_t fd, uptr size) {
   return res;
 }
 
-#    if (!SANITIZER_LINUX_USES_64BIT_SYSCALLS || SANITIZER_SPARC) && \
-        SANITIZER_LINUX
+#    if !SANITIZER_LINUX_USES_64BIT_SYSCALLS && SANITIZER_LINUX
 static void stat64_to_stat(struct stat64 *in, struct stat *out) {
   internal_memset(out, 0, sizeof(*out));
   out->st_dev = in->st_dev;
@@ -327,7 +331,12 @@ static void statx_to_stat(struct statx *in, struct stat *out) {
 }
 #    endif
 
-#    if SANITIZER_MIPS64
+#    if SANITIZER_MIPS64 || SANITIZER_SPARC64
+#      if SANITIZER_MIPS64
+typedef struct kernel_stat kstat_t;
+#      else
+typedef struct kernel_stat64 kstat_t;
+#      endif
 // Undefine compatibility macros from <sys/stat.h>
 // so that they would not clash with the kernel_stat
 // st_[a|m|c]time fields
@@ -345,7 +354,7 @@ static void statx_to_stat(struct statx *in, struct stat *out) {
 #        undef st_mtime_nsec
 #        undef st_ctime_nsec
 #      endif
-static void kernel_stat_to_stat(struct kernel_stat *in, struct stat *out) {
+static void kernel_stat_to_stat(kstat_t *in, struct stat *out) {
   internal_memset(out, 0, sizeof(*out));
   out->st_dev = in->st_dev;
   out->st_ino = in->st_ino;
@@ -391,6 +400,12 @@ uptr internal_stat(const char *path, void *buf) {
           !SANITIZER_SPARC
   return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
                           0);
+#      elif SANITIZER_SPARC64
+  kstat_t buf64;
+  int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
+                             (uptr)&buf64, 0);
+  kernel_stat_to_stat(&buf64, (struct stat *)buf);
+  return res;
 #      else
   struct stat64 buf64;
   int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
@@ -423,6 +438,12 @@ uptr internal_lstat(const char *path, void *buf) {
           !SANITIZER_SPARC
   return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
                           AT_SYMLINK_NOFOLLOW);
+#      elif SANITIZER_SPARC64
+  kstat_t buf64;
+  int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
+                             (uptr)&buf64, AT_SYMLINK_NOFOLLOW);
+  kernel_stat_to_stat(&buf64, (struct stat *)buf);
+  return res;
 #      else
   struct stat64 buf64;
   int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
@@ -442,10 +463,16 @@ uptr internal_fstat(fd_t fd, void *buf) {
 #    if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
 #      if SANITIZER_MIPS64
   // For mips64, fstat syscall fills buffer in the format of kernel_stat
-  struct kernel_stat kbuf;
+  kstat_t kbuf;
   int res = internal_syscall(SYSCALL(fstat), fd, &kbuf);
   kernel_stat_to_stat(&kbuf, (struct stat *)buf);
   return res;
+#      elif SANITIZER_LINUX && SANITIZER_SPARC64
+  // For sparc64, fstat64 syscall fills buffer in the format of kernel_stat64
+  kstat_t kbuf;
+  int res = internal_syscall(SYSCALL(fstat64), fd, &kbuf);
+  kernel_stat_to_stat(&kbuf, (struct stat *)buf);
+  return res;
 #      elif SANITIZER_LINUX && defined(__loongarch__)
   struct statx bufx;
   int res = internal_syscall(SYSCALL(statx), fd, "", AT_EMPTY_PATH,
@@ -826,10 +853,16 @@ uptr internal_sigaltstack(const void *ss, void *oss) {
   return internal_syscall(SYSCALL(sigaltstack), (uptr)ss, (uptr)oss);
 }
 
+extern "C" pid_t __fork(void);
+
 int internal_fork() {
 #    if SANITIZER_LINUX
 #      if SANITIZER_S390
   return internal_syscall(SYSCALL(clone), 0, SIGCHLD);
+#      elif SANITIZER_SPARC
+  // The clone syscall interface on SPARC differs massively from the rest,
+  // so fall back to __fork.
+  return __fork();
 #      else
   return internal_syscall(SYSCALL(clone), SIGCHLD, 0);
 #      endif
@@ -1109,7 +1142,8 @@ uptr GetMaxVirtualAddress() {
 #  if SANITIZER_NETBSD && defined(__x86_64__)
   return 0x7f7ffffff000ULL;  // (0x00007f8000000000 - PAGE_SIZE)
 #  elif SANITIZER_WORDSIZE == 64
-#    if defined(__powerpc64__) || defined(__aarch64__) || defined(__loongarch__)
+#    if defined(__powerpc64__) || defined(__aarch64__) || \
+        defined(__loongarch__) || SANITIZER_RISCV64
   // On PowerPC64 we have two different address space layouts: 44- and 46-bit.
   // We somehow need to figure out which one we are using now and choose
   // one of 0x00000fffffffffffUL and 0x00003fffffffffffUL.
@@ -1118,9 +1152,8 @@ uptr GetMaxVirtualAddress() {
   // This should (does) work for both PowerPC64 Endian modes.
   // Similarly, aarch64 has multiple address space layouts: 39, 42 and 47-bit.
   // loongarch64 also has multiple address space layouts: default is 47-bit.
+  // RISC-V 64 also has multiple address space layouts: 39, 48 and 57-bit.
   return (1ULL << (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1)) - 1;
-#    elif SANITIZER_RISCV64
-  return (1ULL << 38) - 1;
 #    elif SANITIZER_MIPS64
   return (1ULL << 40) - 1;  // 0x000000ffffffffffUL;
 #    elif defined(__s390x__)
@@ -2118,8 +2151,359 @@ bool SignalContext::IsTrueFaultingAddress() const {
   return si->si_signo == SIGSEGV && si->si_code != 128;
 }
 
+UNUSED
+static const char *RegNumToRegName(int reg) {
+  switch (reg) {
+#  if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
+#    if defined(__x86_64__)
+#      if SANITIZER_NETBSD
+#        define REG_RAX _REG_RAX
+#        define REG_RBX _REG_RBX
+#        define REG_RCX _REG_RCX
+#        define REG_RDX _REG_RDX
+#        define REG_RDI _REG_RDI
+#        define REG_RSI _REG_RSI
+#        define REG_RBP _REG_RBP
+#        define REG_RSP _REG_RSP
+#        define REG_R8 _REG_R8
+#        define REG_R9 _REG_R9
+#        define REG_R10 _REG_R10
+#        define REG_R11 _REG_R11
+#        define REG_R12 _REG_R12
+#        define REG_R13 _REG_R13
+#        define REG_R14 _REG_R14
+#        define REG_R15 _REG_R15
+#      endif
+    case REG_RAX:
+      return "rax";
+    case REG_RBX:
+      return "rbx";
+    case REG_RCX:
+      return "rcx";
+    case REG_RDX:
+      return "rdx";
+    case REG_RDI:
+      return "rdi";
+    case REG_RSI:
+      return "rsi";
+    case REG_RBP:
+      return "rbp";
+    case REG_RSP:
+      return "rsp";
+    case REG_R8:
+      return "r8";
+    case REG_R9:
+      return "r9";
+    case REG_R10:
+      return "r10";
+    case REG_R11:
+      return "r11";
+    case REG_R12:
+      return "r12";
+    case REG_R13:
+      return "r13";
+    case REG_R14:
+      return "r14";
+    case REG_R15:
+      return "r15";
+#    elif defined(__i386__)
+#      if SANITIZER_NETBSD
+#        define REG_EAX _REG_EAX
+#        define REG_EBX _REG_EBX
+#        define REG_ECX _REG_ECX
+#        define REG_EDX _REG_EDX
+#        define REG_EDI _REG_EDI
+#        define REG_ESI _REG_ESI
+#        define REG_EBP _REG_EBP
+#        define REG_ESP _REG_ESP
+#      endif
+    case REG_EAX:
+      return "eax";
+    case REG_EBX:
+      return "ebx";
+    case REG_ECX:
+      return "ecx";
+    case REG_EDX:
+      return "edx";
+    case REG_EDI:
+      return "edi";
+    case REG_ESI:
+      return "esi";
+    case REG_EBP:
+      return "ebp";
+    case REG_ESP:
+      return "esp";
+#    elif defined(__arm__)
+#      ifdef MAKE_CASE
+#        undef MAKE_CASE
+#      endif
+#      define REG_STR(reg) #reg
+#      define MAKE_CASE(N) \
+        case REG_R##N:     \
+          return REG_STR(r##N)
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    MAKE_CASE(11);
+    MAKE_CASE(12);
+    case REG_R13:
+      return "sp";
+    case REG_R14:
+      return "lr";
+    case REG_R15:
+      return "pc";
+#    elif defined(__aarch64__)
+#      define REG_STR(reg) #reg
+#      define MAKE_CASE(N) \
+        case N:            \
+          return REG_STR(x##N)
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    MAKE_CASE(11);
+    MAKE_CASE(12);
+    MAKE_CASE(13);
+    MAKE_CASE(14);
+    MAKE_CASE(15);
+    MAKE_CASE(16);
+    MAKE_CASE(17);
+    MAKE_CASE(18);
+    MAKE_CASE(19);
+    MAKE_CASE(20);
+    MAKE_CASE(21);
+    MAKE_CASE(22);
+    MAKE_CASE(23);
+    MAKE_CASE(24);
+    MAKE_CASE(25);
+    MAKE_CASE(26);
+    MAKE_CASE(27);
+    MAKE_CASE(28);
+    case 29:
+      return "fp";
+    case 30:
+      return "lr";
+    case 31:
+      return "sp";
+#    endif
+#  endif  // SANITIZER_LINUX && SANITIZER_GLIBC
+    default:
+      return NULL;
+  }
+  return NULL;
+}
+
+#  if SANITIZER_LINUX && SANITIZER_GLIBC && \
+      (defined(__arm__) || defined(__aarch64__))
+static uptr GetArmRegister(ucontext_t *ctx, int RegNum) {
+  switch (RegNum) {
+#    if defined(__arm__)
+#      ifdef MAKE_CASE
+#        undef MAKE_CASE
+#      endif
+#      define MAKE_CASE(N) \
+        case REG_R##N:     \
+          return ctx->uc_mcontext.arm_r##N
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    case REG_R11:
+      return ctx->uc_mcontext.arm_fp;
+    case REG_R12:
+      return ctx->uc_mcontext.arm_ip;
+    case REG_R13:
+      return ctx->uc_mcontext.arm_sp;
+    case REG_R14:
+      return ctx->uc_mcontext.arm_lr;
+    case REG_R15:
+      return ctx->uc_mcontext.arm_pc;
+#    elif defined(__aarch64__)
+    case 0 ... 30:
+      return ctx->uc_mcontext.regs[RegNum];
+    case 31:
+      return ctx->uc_mcontext.sp;
+#    endif
+    default:
+      return 0;
+  }
+  return 0;
+}
+#  endif  // SANITIZER_LINUX && SANITIZER_GLIBC && (defined(__arm__) ||
+          // defined(__aarch64__))
+
+UNUSED
+static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
+  const char *RegName = RegNumToRegName(RegNum);
+#  if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
+#    if defined(__x86_64__)
+  Printf("%s%s = 0x%016llx  ", internal_strlen(RegName) == 2 ? " " : "",
+         RegName,
+#      if SANITIZER_LINUX
+         ctx->uc_mcontext.gregs[RegNum]
+#      elif SANITIZER_NETBSD
+         ctx->uc_mcontext.__gregs[RegNum]
+#      endif
+  );
+#    elif defined(__i386__)
+  Printf("%s = 0x%08x  ", RegName,
+#      if SANITIZER_LINUX
+         ctx->uc_mcontext.gregs[RegNum]
+#      elif SANITIZER_NETBSD
+         ctx->uc_mcontext.__gregs[RegNum]
+#      endif
+  );
+#    elif defined(__arm__)
+  Printf("%s%s = 0x%08zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
+         GetArmRegister(ctx, RegNum));
+#    elif defined(__aarch64__)
+  Printf("%s%s = 0x%016zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
+         GetArmRegister(ctx, RegNum));
+#    else
+  (void)RegName;
+#    endif
+#  else
+  (void)RegName;
+#  endif
+}
+
 void SignalContext::DumpAllRegisters(void *context) {
-  // FIXME: Implement this.
+  ucontext_t *ucontext = (ucontext_t *)context;
+#  if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
+#    if defined(__x86_64__)
+  Report("Register values:\n");
+  DumpSingleReg(ucontext, REG_RAX);
+  DumpSingleReg(ucontext, REG_RBX);
+  DumpSingleReg(ucontext, REG_RCX);
+  DumpSingleReg(ucontext, REG_RDX);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_RDI);
+  DumpSingleReg(ucontext, REG_RSI);
+  DumpSingleReg(ucontext, REG_RBP);
+  DumpSingleReg(ucontext, REG_RSP);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R8);
+  DumpSingleReg(ucontext, REG_R9);
+  DumpSingleReg(ucontext, REG_R10);
+  DumpSingleReg(ucontext, REG_R11);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R12);
+  DumpSingleReg(ucontext, REG_R13);
+  DumpSingleReg(ucontext, REG_R14);
+  DumpSingleReg(ucontext, REG_R15);
+  Printf("\n");
+#    elif defined(__i386__)
+  // Duplication of this report print is caused by partial support
+  // of register values dumping. In case of unsupported yet architecture let's
+  // avoid printing 'Register values:' without actual values in the following
+  // output.
+  Report("Register values:\n");
+  DumpSingleReg(ucontext, REG_EAX);
+  DumpSingleReg(ucontext, REG_EBX);
+  DumpSingleReg(ucontext, REG_ECX);
+  DumpSingleReg(ucontext, REG_EDX);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_EDI);
+  DumpSingleReg(ucontext, REG_ESI);
+  DumpSingleReg(ucontext, REG_EBP);
+  DumpSingleReg(ucontext, REG_ESP);
+  Printf("\n");
+#    elif defined(__arm__) && !SANITIZER_NETBSD
+  Report("Register values:\n");
+  DumpSingleReg(ucontext, REG_R0);
+  DumpSingleReg(ucontext, REG_R1);
+  DumpSingleReg(ucontext, REG_R2);
+  DumpSingleReg(ucontext, REG_R3);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R4);
+  DumpSingleReg(ucontext, REG_R5);
+  DumpSingleReg(ucontext, REG_R6);
+  DumpSingleReg(ucontext, REG_R7);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R8);
+  DumpSingleReg(ucontext, REG_R9);
+  DumpSingleReg(ucontext, REG_R10);
+  DumpSingleReg(ucontext, REG_R11);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R12);
+  DumpSingleReg(ucontext, REG_R13);
+  DumpSingleReg(ucontext, REG_R14);
+  DumpSingleReg(ucontext, REG_R15);
+  Printf("\n");
+#    elif defined(__aarch64__) && !SANITIZER_NETBSD
+  Report("Register values:\n");
+  for (int i = 0; i <= 31; ++i) {
+    DumpSingleReg(ucontext, i);
+    if (i % 4 == 3)
+      Printf("\n");
+  }
+#    else
+  (void)ucontext;
+#    endif
+#  elif SANITIZER_FREEBSD
+#    if defined(__x86_64__)
+  Report("Register values:\n");
+  Printf("rax = 0x%016lx  ", ucontext->uc_mcontext.mc_rax);
+  Printf("rbx = 0x%016lx  ", ucontext->uc_mcontext.mc_rbx);
+  Printf("rcx = 0x%016lx  ", ucontext->uc_mcontext.mc_rcx);
+  Printf("rdx = 0x%016lx  ", ucontext->uc_mcontext.mc_rdx);
+  Printf("\n");
+  Printf("rdi = 0x%016lx  ", ucontext->uc_mcontext.mc_rdi);
+  Printf("rsi = 0x%016lx  ", ucontext->uc_mcontext.mc_rsi);
+  Printf("rbp = 0x%016lx  ", ucontext->uc_mcontext.mc_rbp);
+  Printf("rsp = 0x%016lx  ", ucontext->uc_mcontext.mc_rsp);
+  Printf("\n");
+  Printf(" r8 = 0x%016lx  ", ucontext->uc_mcontext.mc_r8);
+  Printf(" r9 = 0x%016lx  ", ucontext->uc_mcontext.mc_r9);
+  Printf("r10 = 0x%016lx  ", ucontext->uc_mcontext.mc_r10);
+  Printf("r11 = 0x%016lx  ", ucontext->uc_mcontext.mc_r11);
+  Printf("\n");
+  Printf("r12 = 0x%016lx  ", ucontext->uc_mcontext.mc_r12);
+  Printf("r13 = 0x%016lx  ", ucontext->uc_mcontext.mc_r13);
+  Printf("r14 = 0x%016lx  ", ucontext->uc_mcontext.mc_r14);
+  Printf("r15 = 0x%016lx  ", ucontext->uc_mcontext.mc_r15);
+  Printf("\n");
+#    elif defined(__i386__)
+  Report("Register values:\n");
+  Printf("eax = 0x%08x  ", ucontext->uc_mcontext.mc_eax);
+  Printf("ebx = 0x%08x  ", ucontext->uc_mcontext.mc_ebx);
+  Printf("ecx = 0x%08x  ", ucontext->uc_mcontext.mc_ecx);
+  Printf("edx = 0x%08x  ", ucontext->uc_mcontext.mc_edx);
+  Printf("\n");
+  Printf("edi = 0x%08x  ", ucontext->uc_mcontext.mc_edi);
+  Printf("esi = 0x%08x  ", ucontext->uc_mcontext.mc_esi);
+  Printf("ebp = 0x%08x  ", ucontext->uc_mcontext.mc_ebp);
+  Printf("esp = 0x%08x  ", ucontext->uc_mcontext.mc_esp);
+  Printf("\n");
+#    else
+  (void)ucontext;
+#    endif
+#  else
+  (void)ucontext;
+#  endif
+  // FIXME: Implement this for other OSes and architectures.
 }
 
 static void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index cbdf3e95925bf..8ebe37d649415 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -1372,8 +1372,8 @@ void DumpProcessMap() {
   for (uptr i = 0; i < modules.size(); ++i) {
     char uuid_str[128];
     FormatUUID(uuid_str, sizeof(uuid_str), modules[i].uuid());
-    Printf("0x%zx-0x%zx %s (%s) %s\n", modules[i].base_address(),
-           modules[i].max_address(), modules[i].full_name(),
+    Printf("%p-%p %s (%s) %s\n", (void *)modules[i].base_address(),
+           (void *)modules[i].max_address(), modules[i].full_name(),
            ModuleArchToString(modules[i].arch()), uuid_str);
   }
   Printf("End of module map.\n");
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 5965281555059..57966403c92a9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -295,8 +295,8 @@
 // For such platforms build this code with -DSANITIZER_CAN_USE_ALLOCATOR64=0 or
 // change the definition of SANITIZER_CAN_USE_ALLOCATOR64 here.
 #ifndef SANITIZER_CAN_USE_ALLOCATOR64
-#  if (SANITIZER_RISCV64 && !SANITIZER_FUCHSIA) || SANITIZER_IOS || \
-      SANITIZER_DRIVERKIT
+#  if (SANITIZER_RISCV64 && !SANITIZER_FUCHSIA && !SANITIZER_LINUX) || \
+      SANITIZER_IOS || SANITIZER_DRIVERKIT
 #    define SANITIZER_CAN_USE_ALLOCATOR64 0
 #  elif defined(__mips64) || defined(__hexagon__)
 #    define SANITIZER_CAN_USE_ALLOCATOR64 0
@@ -322,7 +322,7 @@
 #  if SANITIZER_FUCHSIA
 #    define SANITIZER_MMAP_RANGE_SIZE (1ULL << 38)
 #  else
-#    define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47)
+#    define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 56)
 #  endif
 #elif defined(__aarch64__)
 #  if SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
index 969327a7a0fbe..7d7d575431994 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
@@ -130,8 +130,8 @@ static void *MmapFixedImpl(uptr fixed_addr, uptr size, bool tolerate_enomem,
     if (tolerate_enomem && reserrno == ENOMEM)
       return nullptr;
     char mem_type[40];
-    internal_snprintf(mem_type, sizeof(mem_type), "memory at address 0x%zx",
-                      fixed_addr);
+    internal_snprintf(mem_type, sizeof(mem_type), "memory at address %p",
+                      (void *)fixed_addr);
     ReportMmapFailureAndDie(size, mem_type, "allocate", reserrno);
   }
   IncreaseTotalMmap(size);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index ece2d7d63dd61..9ffb36f812c45 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -327,9 +327,10 @@ static bool MmapFixed(uptr fixed_addr, uptr size, int additional_flags,
                 MAP_PRIVATE | MAP_FIXED | additional_flags | MAP_ANON, name);
   int reserrno;
   if (internal_iserror(p, &reserrno)) {
-    Report("ERROR: %s failed to "
-           "allocate 0x%zx (%zd) bytes at address %zx (errno: %d)\n",
-           SanitizerToolName, size, size, fixed_addr, reserrno);
+    Report(
+        "ERROR: %s failed to "
+        "allocate 0x%zx (%zd) bytes at address %p (errno: %d)\n",
+        SanitizerToolName, size, size, (void *)fixed_addr, reserrno);
     return false;
   }
   IncreaseTotalMmap(size);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
index 7d7c009f64442..ee3fd5b0a817a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
@@ -47,7 +47,8 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   struct kinfo_proc2 *InfoProc;
   uptr Len = sizeof(*InfoProc);
   uptr Size = Len;
-  const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID, getpid(), Size, 1};
+  const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID,
+                     getpid(), (int)Size,  1};
   InfoProc = (struct kinfo_proc2 *)MmapOrDie(Size, "GetMemoryProfile()");
   CHECK_EQ(
       internal_sysctl(Mib, ARRAY_SIZE(Mib), nullptr, (uptr *)InfoProc, &Len, 0),
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
index 5200354694851..265a9925a15a0 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
@@ -9,31 +9,33 @@
 #ifndef SANITIZER_PTRAUTH_H
 #define SANITIZER_PTRAUTH_H
 
-#if __has_feature(ptrauth_calls)
-#include <ptrauth.h>
+#if __has_feature(ptrauth_intrinsics)
+#  include <ptrauth.h>
 #elif defined(__ARM_FEATURE_PAC_DEFAULT) && !defined(__APPLE__)
-inline unsigned long ptrauth_strip(void* __value, unsigned int __key) {
-  // On the stack the link register is protected with Pointer
-  // Authentication Code when compiled with -mbranch-protection.
-  // Let's stripping the PAC unconditionally because xpaclri is in
-  // the NOP space so will do nothing when it is not enabled or not available.
-  unsigned long ret;
-  asm volatile(
-      "mov x30, %1\n\t"
-      "hint #7\n\t"  // xpaclri
-      "mov %0, x30\n\t"
-      : "=r"(ret)
-      : "r"(__value)
-      : "x30");
-  return ret;
-}
-#define ptrauth_auth_data(__value, __old_key, __old_data) __value
-#define ptrauth_string_discriminator(__string) ((int)0)
+// On the stack the link register is protected with Pointer
+// Authentication Code when compiled with -mbranch-protection.
+// Let's stripping the PAC unconditionally because xpaclri is in
+// the NOP space so will do nothing when it is not enabled or not available.
+#  define ptrauth_strip(__value, __key) \
+    ({                                  \
+      __typeof(__value) ret;            \
+      asm volatile(                     \
+          "mov x30, %1\n\t"             \
+          "hint #7\n\t"                 \
+          "mov %0, x30\n\t"             \
+          "mov x30, xzr\n\t"            \
+          : "=r"(ret)                   \
+          : "r"(__value)                \
+          : "x30");                     \
+      ret;                              \
+    })
+#  define ptrauth_auth_data(__value, __old_key, __old_data) __value
+#  define ptrauth_string_discriminator(__string) ((int)0)
 #else
 // Copied from <ptrauth.h>
-#define ptrauth_strip(__value, __key) __value
-#define ptrauth_auth_data(__value, __old_key, __old_data) __value
-#define ptrauth_string_discriminator(__string) ((int)0)
+#  define ptrauth_strip(__value, __key) __value
+#  define ptrauth_auth_data(__value, __old_key, __old_data) __value
+#  define ptrauth_string_discriminator(__string) ((int)0)
 #endif
 
 #define STRIP_PAC_PC(pc) ((uptr)ptrauth_strip(pc, 0))
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
index b23796fccf6e3..5631d132da74a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
@@ -192,7 +192,7 @@ void FormattedStackTracePrinter::RenderFrame(InternalScopedString *buffer,
       buffer->AppendF("%u", frame_no);
       break;
     case 'p':
-      buffer->AppendF("0x%zx", address);
+      buffer->AppendF("%p", (void *)address);
       break;
     case 'm':
       buffer->AppendF("%s", StripPathPrefix(info->module, strip_path_prefix));
@@ -269,7 +269,7 @@ void FormattedStackTracePrinter::RenderFrame(InternalScopedString *buffer,
       break;
     default:
       Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p,
-             (void *)p);
+             (const void *)p);
       Die();
     }
   }
@@ -323,7 +323,7 @@ void FormattedStackTracePrinter::RenderData(InternalScopedString *buffer,
         break;
       default:
         Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p,
-               (void *)p);
+               (const void *)p);
         Die();
     }
   }
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
index 25c4af7085608..526a71c398260 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
@@ -257,8 +257,8 @@ static void TracerThreadDieCallback() {
 static void TracerThreadSignalHandler(int signum, __sanitizer_siginfo *siginfo,
                                       void *uctx) {
   SignalContext ctx(siginfo, uctx);
-  Printf("Tracer caught signal %d: addr=0x%zx pc=0x%zx sp=0x%zx\n", signum,
-         ctx.addr, ctx.pc, ctx.sp);
+  Printf("Tracer caught signal %d: addr=%p pc=%p sp=%p\n", signum,
+         (void *)ctx.addr, (void *)ctx.pc, (void *)ctx.sp);
   ThreadSuspender *inst = thread_suspender_instance;
   if (inst) {
     if (signum == SIGABRT)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp
index 701db72619a3d..58a0cfdbf9d48 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp
@@ -158,8 +158,8 @@ static void TracerThreadDieCallback() {
 static void TracerThreadSignalHandler(int signum, __sanitizer_siginfo *siginfo,
                                       void *uctx) {
   SignalContext ctx(siginfo, uctx);
-  Printf("Tracer caught signal %d: addr=0x%zx pc=0x%zx sp=0x%zx\n", signum,
-         ctx.addr, ctx.pc, ctx.sp);
+  Printf("Tracer caught signal %d: addr=%p pc=%p sp=%p\n", signum,
+         (void *)ctx.addr, (void *)ctx.pc, (void *)ctx.sp);
   ThreadSuspender *inst = thread_suspender_instance;
   if (inst) {
     if (signum == SIGABRT)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index 252979f1c2baa..ee293bbd68875 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -121,25 +121,26 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
   uptr tls_size = 0;
   uptr tls_beg = reinterpret_cast<uptr>(res) - arg->offset - kDtvOffset;
   VReport(2,
-          "__tls_get_addr: %p {0x%zx,0x%zx} => %p; tls_beg: 0x%zx; sp: %p "
+          "__tls_get_addr: %p {0x%zx,0x%zx} => %p; tls_beg: %p; sp: %p "
           "num_live_dtls %zd\n",
-          (void *)arg, arg->dso_id, arg->offset, res, tls_beg, (void *)&tls_beg,
+          (void *)arg, arg->dso_id, arg->offset, res, (void *)tls_beg,
+          (void *)&tls_beg,
           atomic_load(&number_of_live_dtls, memory_order_relaxed));
   if (dtls.last_memalign_ptr == tls_beg) {
     tls_size = dtls.last_memalign_size;
-    VReport(2, "__tls_get_addr: glibc <=2.24 suspected; tls={0x%zx,0x%zx}\n",
-            tls_beg, tls_size);
+    VReport(2, "__tls_get_addr: glibc <=2.24 suspected; tls={%p,0x%zx}\n",
+            (void *)tls_beg, tls_size);
   } else if (tls_beg >= static_tls_begin && tls_beg < static_tls_end) {
     // This is the static TLS block which was initialized / unpoisoned at thread
     // creation.
-    VReport(2, "__tls_get_addr: static tls: 0x%zx\n", tls_beg);
+    VReport(2, "__tls_get_addr: static tls: %p\n", (void *)tls_beg);
     tls_size = 0;
   } else if (const void *start =
                  __sanitizer_get_allocated_begin((void *)tls_beg)) {
     tls_beg = (uptr)start;
     tls_size = __sanitizer_get_allocated_size(start);
-    VReport(2, "__tls_get_addr: glibc >=2.25 suspected; tls={0x%zx,0x%zx}\n",
-            tls_beg, tls_size);
+    VReport(2, "__tls_get_addr: glibc >=2.25 suspected; tls={%p,0x%zx}\n",
+            (void *)tls_beg, tls_size);
   } else {
     VReport(2, "__tls_get_addr: Can't guess glibc version\n");
     // This may happen inside the DTOR of main thread, so just ignore it.
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
index 0b198890fc798..995f00eddc38a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
@@ -276,8 +276,8 @@ void *MmapFixedOrDie(uptr fixed_addr, uptr size, const char *name) {
       MEM_COMMIT, PAGE_READWRITE);
   if (p == 0) {
     char mem_type[30];
-    internal_snprintf(mem_type, sizeof(mem_type), "memory at address 0x%zx",
-                      fixed_addr);
+    internal_snprintf(mem_type, sizeof(mem_type), "memory at address %p",
+                      (void *)fixed_addr);
     ReportMmapFailureAndDie(size, mem_type, "allocate", GetLastError());
   }
   return p;
@@ -308,8 +308,8 @@ void *MmapFixedOrDieOnFatalError(uptr fixed_addr, uptr size, const char *name) {
       MEM_COMMIT, PAGE_READWRITE);
   if (p == 0) {
     char mem_type[30];
-    internal_snprintf(mem_type, sizeof(mem_type), "memory at address 0x%zx",
-                      fixed_addr);
+    internal_snprintf(mem_type, sizeof(mem_type), "memory at address %p",
+                      (void *)fixed_addr);
     return ReturnNullptrOnOOMOrDie(size, mem_type, "allocate");
   }
   return p;
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
index 1a1ccce82d259..601897a64f051 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
@@ -28,12 +28,13 @@
 
 using namespace __sanitizer;
 
-#if SANITIZER_SOLARIS && defined(__sparcv9)
+#if defined(__sparcv9)
 // FIXME: These tests probably fail because Solaris/sparcv9 uses the full
-// 64-bit address space.  Needs more investigation
-#define SKIP_ON_SOLARIS_SPARCV9(x) DISABLED_##x
+// 64-bit address space.  Same on Linux/sparc64, so probably a general SPARC
+// issue.  Needs more investigation
+#  define SKIP_ON_SPARCV9(x) DISABLED_##x
 #else
-#define SKIP_ON_SOLARIS_SPARCV9(x) x
+#  define SKIP_ON_SPARCV9(x) x
 #endif
 
 // On 64-bit systems with small virtual address spaces (e.g. 39-bit) we can't
@@ -781,7 +782,7 @@ TEST(SanitizerCommon, CombinedAllocator64VeryCompact) {
 }
 #endif
 
-TEST(SanitizerCommon, SKIP_ON_SOLARIS_SPARCV9(CombinedAllocator32Compact)) {
+TEST(SanitizerCommon, SKIP_ON_SPARCV9(CombinedAllocator32Compact)) {
   TestCombinedAllocator<Allocator32Compact>();
 }
 
@@ -1028,7 +1029,7 @@ TEST(SanitizerCommon, SizeClassAllocator64DynamicPremappedIteration) {
 #endif
 #endif
 
-TEST(SanitizerCommon, SKIP_ON_SOLARIS_SPARCV9(SizeClassAllocator32Iteration)) {
+TEST(SanitizerCommon, SKIP_ON_SPARCV9(SizeClassAllocator32Iteration)) {
   TestSizeClassAllocatorIteration<Allocator32Compact>();
 }
 
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
index 8da09f693c2b8..8500d3aa91fec 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
@@ -77,7 +77,8 @@ TEST(SanitizerIoctl, Fixup) {
 // Test decoding KVM ioctl numbers.
 TEST(SanitizerIoctl, KVM_GET_MP_STATE) {
   ioctl_desc desc;
-  unsigned int desc_value = SANITIZER_MIPS ? 0x4004ae98U : 0x8004ae98U;
+  unsigned int desc_value =
+      SANITIZER_MIPS || SANITIZER_SPARC ? 0x4004ae98U : 0x8004ae98U;
   bool res = ioctl_decode(desc_value, &desc);
   EXPECT_TRUE(res);
   EXPECT_EQ(ioctl_desc::WRITE, desc.type);
@@ -86,7 +87,8 @@ TEST(SanitizerIoctl, KVM_GET_MP_STATE) {
 
 TEST(SanitizerIoctl, KVM_GET_LAPIC) {
   ioctl_desc desc;
-  unsigned int desc_value = SANITIZER_MIPS ? 0x4400ae8eU : 0x8400ae8eU;
+  unsigned int desc_value =
+      SANITIZER_MIPS || SANITIZER_SPARC ? 0x4400ae8eU : 0x8400ae8eU;
   bool res = ioctl_decode(desc_value, &desc);
   EXPECT_TRUE(res);
   EXPECT_EQ(ioctl_desc::WRITE, desc.type);
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
index e810122a824f6..02833888747ac 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
@@ -98,13 +98,13 @@ TEST_F(StackDepotTest, Print) {
       return s;
     return std::regex_replace(s, std::regex("\\.\\*"), ".*\\n.*");
   };
-  EXPECT_EXIT(
-      (StackDepotPrintAll(), exit(0)), ::testing::ExitedWithCode(0),
-      fix_regex("Stack for id .*#0 0x1.*#1 0x2.*#2 0x3.*#3 0x4.*#4 0x7.*"));
   EXPECT_EXIT(
       (StackDepotPrintAll(), exit(0)), ::testing::ExitedWithCode(0),
       fix_regex(
-          "Stack for id .*#0 0x1.*#1 0x2.*#2 0x3.*#3 0x4.*#4 0x8.*#5 0x9.*"));
+          "Stack for id .*#0 0x0*1.*#1 0x0*2.*#2 0x0*3.*#3 0x0*4.*#4 0x0*7.*"));
+  EXPECT_EXIT((StackDepotPrintAll(), exit(0)), ::testing::ExitedWithCode(0),
+              fix_regex("Stack for id .*#0 0x0*1.*#1 0x0*2.*#2 0x0*3.*#3 "
+                        "0x0*4.*#4 0x0*8.*#5 0x0*9.*"));
 }
 
 TEST_F(StackDepotTest, PrintNoLock) {
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_printer_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_printer_test.cpp
index 9602ba31621e6..76a434be275e5 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_printer_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_printer_test.cpp
@@ -11,9 +11,12 @@
 //===----------------------------------------------------------------------===//
 #include "sanitizer_common/sanitizer_stacktrace_printer.h"
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "interception/interception.h"
 
+using testing::MatchesRegex;
+
 namespace __sanitizer {
 
 class TestFormattedStackTracePrinter final : public FormattedStackTracePrinter {
@@ -96,10 +99,12 @@ TEST(FormattedStackTracePrinter, RenderFrame) {
                       "Function:%f FunctionOffset:%q Source:%s Line:%l "
                       "Column:%c",
                       frame_no, info.address, &info, false, "/path/to/");
-  EXPECT_STREQ("% Frame:42 PC:0x400000 Module:my/module ModuleOffset:0x200 "
-               "Function:foo FunctionOffset:0x100 Source:my/source Line:10 "
-               "Column:5",
-               str.data());
+  EXPECT_THAT(
+      str.data(),
+      MatchesRegex(
+          "% Frame:42 PC:0x0*400000 Module:my/module ModuleOffset:0x200 "
+          "Function:foo FunctionOffset:0x100 Source:my/source Line:10 "
+          "Column:5"));
 
   str.clear();
   // Check that RenderFrame() strips interceptor prefixes.
@@ -109,10 +114,12 @@ TEST(FormattedStackTracePrinter, RenderFrame) {
                       "Function:%f FunctionOffset:%q Source:%s Line:%l "
                       "Column:%c",
                       frame_no, info.address, &info, false, "/path/to/");
-  EXPECT_STREQ("% Frame:42 PC:0x400000 Module:my/module ModuleOffset:0x200 "
-               "Function:bar FunctionOffset:0x100 Source:my/source Line:10 "
-               "Column:5",
-               str.data());
+  EXPECT_THAT(
+      str.data(),
+      MatchesRegex(
+          "% Frame:42 PC:0x0*400000 Module:my/module ModuleOffset:0x200 "
+          "Function:bar FunctionOffset:0x100 Source:my/source Line:10 "
+          "Column:5"));
   info.Clear();
   str.clear();
 
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
index 769a9b9a9fa0a..11ca1fd7f0517 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
@@ -284,7 +284,7 @@ TEST(GetCurrentPc, Basic) {
           StackTrace::GetCurrentPc(),
       };
       for (uptr i = 0; i < ARRAY_SIZE(pcs); i++)
-        Printf("pc%zu: 0x%zx\n", i, pcs[i]);
+        Printf("pc%zu: %p\n", i, (void *)(pcs[i]));
       for (uptr i = 1; i < ARRAY_SIZE(pcs); i++) {
         EXPECT_GT(pcs[i], pcs[0]);
         EXPECT_LT(pcs[i], pcs[0] + 1000);
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 9a8e53be388b7..d8505742d6054 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -19,6 +19,7 @@
 #include "stats.h"
 #include "string_utils.h"
 #include "thread_annotations.h"
+#include "vector.h"
 
 namespace scudo {
 
@@ -73,12 +74,18 @@ static inline void unmap(LargeBlock::Header *H) {
 }
 
 namespace {
+
 struct CachedBlock {
+  static constexpr u16 CacheIndexMax = UINT16_MAX;
+  static constexpr u16 InvalidEntry = CacheIndexMax;
+
   uptr CommitBase = 0;
   uptr CommitSize = 0;
   uptr BlockBegin = 0;
   MemMapT MemMap = {};
   u64 Time = 0;
+  u16 Next = 0;
+  u16 Prev = 0;
 
   bool isValid() { return CommitBase != 0; }
 
@@ -188,10 +195,11 @@ template <typename Config> class MapAllocatorCache {
     Str->append("Stats: CacheRetrievalStats: SuccessRate: %u/%u "
                 "(%zu.%02zu%%)\n",
                 SuccessfulRetrieves, CallsToRetrieve, Integral, Fractional);
-    for (CachedBlock Entry : Entries) {
-      if (!Entry.isValid())
-        continue;
-      Str->append("StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
+    Str->append("Cache Entry Info (Most Recent -> Least Recent):\n");
+
+    for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) {
+      CachedBlock &Entry = Entries[I];
+      Str->append("  StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
                   "BlockSize: %zu %s\n",
                   Entry.CommitBase, Entry.CommitBase + Entry.CommitSize,
                   Entry.CommitSize, Entry.Time == 0 ? "[R]" : "");
@@ -202,6 +210,10 @@ template <typename Config> class MapAllocatorCache {
   static_assert(Config::getDefaultMaxEntriesCount() <=
                     Config::getEntriesArraySize(),
                 "");
+  // Ensure the cache entry array size fits in the LRU list Next and Prev
+  // index fields
+  static_assert(Config::getEntriesArraySize() <= CachedBlock::CacheIndexMax,
+                "Cache entry array is too large to be indexed.");
 
   void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(EntriesCount, 0U);
@@ -213,23 +225,33 @@ template <typename Config> class MapAllocatorCache {
     if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
       ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
+
+    // The cache is initially empty
+    LRUHead = CachedBlock::InvalidEntry;
+    LRUTail = CachedBlock::InvalidEntry;
+
+    // Available entries will be retrieved starting from the beginning of the
+    // Entries array
+    AvailableHead = 0;
+    for (u32 I = 0; I < Config::getEntriesArraySize() - 1; I++)
+      Entries[I].Next = static_cast<u16>(I + 1);
+
+    Entries[Config::getEntriesArraySize() - 1].Next = CachedBlock::InvalidEntry;
   }
 
   void store(const Options &Options, LargeBlock::Header *H) EXCLUDES(Mutex) {
     if (!canCache(H->CommitSize))
       return unmap(H);
 
-    bool EntryCached = false;
-    bool EmptyCache = false;
     const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs);
-    const u64 Time = getMonotonicTimeFast();
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+    u64 Time;
     CachedBlock Entry;
+
     Entry.CommitBase = H->CommitBase;
     Entry.CommitSize = H->CommitSize;
     Entry.BlockBegin = reinterpret_cast<uptr>(H + 1);
     Entry.MemMap = H->MemMap;
-    Entry.Time = Time;
+    Entry.Time = UINT64_MAX;
     if (useMemoryTagging<Config>(Options)) {
       if (Interval == 0 && !SCUDO_FUCHSIA) {
         // Release the memory and make it inaccessible at the same time by
@@ -243,17 +265,32 @@ template <typename Config> class MapAllocatorCache {
         Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize,
                                          MAP_NOACCESS);
       }
-    } else if (Interval == 0) {
-      Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize);
-      Entry.Time = 0;
     }
+
+    // Usually only one entry will be evicted from the cache.
+    // Only in the rare event that the cache shrinks in real-time
+    // due to a decrease in the configurable value MaxEntriesCount
+    // will more than one cache entry be evicted.
+    // The vector is used to save the MemMaps of evicted entries so
+    // that the unmap call can be performed outside the lock
+    Vector<MemMapT, 1U> EvictionMemMaps;
+
     do {
       ScopedLock L(Mutex);
+
+      // Time must be computed under the lock to ensure
+      // that the LRU cache remains sorted with respect to
+      // time in a multithreaded environment
+      Time = getMonotonicTimeFast();
+      if (Entry.Time != 0)
+        Entry.Time = Time;
+
       if (useMemoryTagging<Config>(Options) && QuarantinePos == -1U) {
         // If we get here then memory tagging was disabled in between when we
         // read Options and when we locked Mutex. We can't insert our entry into
         // the quarantine or the cache because the permissions would be wrong so
         // just unmap it.
+        Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
         break;
       }
       if (Config::getQuarantineSize() && useMemoryTagging<Config>(Options)) {
@@ -269,36 +306,32 @@ template <typename Config> class MapAllocatorCache {
           OldestTime = Entry.Time;
         Entry = PrevEntry;
       }
-      if (EntriesCount >= MaxCount) {
-        if (IsFullEvents++ == 4U)
-          EmptyCache = true;
-      } else {
-        for (u32 I = 0; I < MaxCount; I++) {
-          if (Entries[I].isValid())
-            continue;
-          if (I != 0)
-            Entries[I] = Entries[0];
-          Entries[0] = Entry;
-          EntriesCount++;
-          if (OldestTime == 0)
-            OldestTime = Entry.Time;
-          EntryCached = true;
-          break;
-        }
+
+      // All excess entries are evicted from the cache
+      while (needToEvict()) {
+        // Save MemMaps of evicted entries to perform unmap outside of lock
+        EvictionMemMaps.push_back(Entries[LRUTail].MemMap);
+        remove(LRUTail);
       }
+
+      insert(Entry);
+
+      if (OldestTime == 0)
+        OldestTime = Entry.Time;
     } while (0);
-    if (EmptyCache)
-      empty();
-    else if (Interval >= 0)
+
+    for (MemMapT &EvictMemMap : EvictionMemMaps)
+      EvictMemMap.unmap(EvictMemMap.getBase(), EvictMemMap.getCapacity());
+
+    if (Interval >= 0) {
+      // TODO: Add ReleaseToOS logic to LRU algorithm
       releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000);
-    if (!EntryCached)
-      Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
+    }
   }
 
   bool retrieve(Options Options, uptr Size, uptr Alignment, uptr HeadersSize,
                 LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) {
     const uptr PageSize = getPageSizeCached();
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
     // 10% of the requested size proved to be the optimal choice for
     // retrieving cached blocks after testing several options.
     constexpr u32 FragmentedBytesDivisor = 10;
@@ -312,9 +345,8 @@ template <typename Config> class MapAllocatorCache {
         return false;
       u32 OptimalFitIndex = 0;
       uptr MinDiff = UINTPTR_MAX;
-      for (u32 I = 0; I < MaxCount; I++) {
-        if (!Entries[I].isValid())
-          continue;
+      for (u32 I = LRUHead; I != CachedBlock::InvalidEntry;
+           I = Entries[I].Next) {
         const uptr CommitBase = Entries[I].CommitBase;
         const uptr CommitSize = Entries[I].CommitSize;
         const uptr AllocPos =
@@ -347,8 +379,7 @@ template <typename Config> class MapAllocatorCache {
       }
       if (Found) {
         Entry = Entries[OptimalFitIndex];
-        Entries[OptimalFitIndex].invalidate();
-        EntriesCount--;
+        remove(OptimalFitIndex);
         SuccessfulRetrieves++;
       }
     }
@@ -417,12 +448,9 @@ template <typename Config> class MapAllocatorCache {
         Quarantine[I].invalidate();
       }
     }
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
-    for (u32 I = 0; I < MaxCount; I++) {
-      if (Entries[I].isValid()) {
-        Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
-                                              Entries[I].CommitSize, 0);
-      }
+    for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) {
+      Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
+                                            Entries[I].CommitSize, 0);
     }
     QuarantinePos = -1U;
   }
@@ -434,6 +462,66 @@ template <typename Config> class MapAllocatorCache {
   void unmapTestOnly() { empty(); }
 
 private:
+  bool needToEvict() REQUIRES(Mutex) {
+    return (EntriesCount >= atomic_load_relaxed(&MaxEntriesCount));
+  }
+
+  void insert(const CachedBlock &Entry) REQUIRES(Mutex) {
+    DCHECK_LT(EntriesCount, atomic_load_relaxed(&MaxEntriesCount));
+
+    // Cache should be populated with valid entries when not empty
+    DCHECK_NE(AvailableHead, CachedBlock::InvalidEntry);
+
+    u32 FreeIndex = AvailableHead;
+    AvailableHead = Entries[AvailableHead].Next;
+
+    if (EntriesCount == 0) {
+      LRUTail = static_cast<u16>(FreeIndex);
+    } else {
+      // Check list order
+      if (EntriesCount > 1)
+        DCHECK_GE(Entries[LRUHead].Time, Entries[Entries[LRUHead].Next].Time);
+      Entries[LRUHead].Prev = static_cast<u16>(FreeIndex);
+    }
+
+    Entries[FreeIndex] = Entry;
+    Entries[FreeIndex].Next = LRUHead;
+    Entries[FreeIndex].Prev = CachedBlock::InvalidEntry;
+    LRUHead = static_cast<u16>(FreeIndex);
+    EntriesCount++;
+
+    // Availability stack should not have available entries when all entries
+    // are in use
+    if (EntriesCount == Config::getEntriesArraySize())
+      DCHECK_EQ(AvailableHead, CachedBlock::InvalidEntry);
+  }
+
+  void remove(uptr I) REQUIRES(Mutex) {
+    DCHECK(Entries[I].isValid());
+
+    Entries[I].invalidate();
+
+    if (I == LRUHead)
+      LRUHead = Entries[I].Next;
+    else
+      Entries[Entries[I].Prev].Next = Entries[I].Next;
+
+    if (I == LRUTail)
+      LRUTail = Entries[I].Prev;
+    else
+      Entries[Entries[I].Next].Prev = Entries[I].Prev;
+
+    Entries[I].Next = AvailableHead;
+    AvailableHead = static_cast<u16>(I);
+    EntriesCount--;
+
+    // Cache should not have valid entries when not empty
+    if (EntriesCount == 0) {
+      DCHECK_EQ(LRUHead, CachedBlock::InvalidEntry);
+      DCHECK_EQ(LRUTail, CachedBlock::InvalidEntry);
+    }
+  }
+
   void empty() {
     MemMapT MapInfo[Config::getEntriesArraySize()];
     uptr N = 0;
@@ -443,11 +531,10 @@ template <typename Config> class MapAllocatorCache {
         if (!Entries[I].isValid())
           continue;
         MapInfo[N] = Entries[I].MemMap;
-        Entries[I].invalidate();
+        remove(I);
         N++;
       }
       EntriesCount = 0;
-      IsFullEvents = 0;
     }
     for (uptr I = 0; I < N; I++) {
       MemMapT &MemMap = MapInfo[I];
@@ -484,7 +571,6 @@ template <typename Config> class MapAllocatorCache {
   atomic_u32 MaxEntriesCount = {};
   atomic_uptr MaxEntrySize = {};
   u64 OldestTime GUARDED_BY(Mutex) = 0;
-  u32 IsFullEvents GUARDED_BY(Mutex) = 0;
   atomic_s32 ReleaseToOsIntervalMs = {};
   u32 CallsToRetrieve GUARDED_BY(Mutex) = 0;
   u32 SuccessfulRetrieves GUARDED_BY(Mutex) = 0;
@@ -492,6 +578,13 @@ template <typename Config> class MapAllocatorCache {
   CachedBlock Entries[Config::getEntriesArraySize()] GUARDED_BY(Mutex) = {};
   NonZeroLengthArray<CachedBlock, Config::getQuarantineSize()>
       Quarantine GUARDED_BY(Mutex) = {};
+
+  // The LRUHead of the cache is the most recently used cache entry
+  u16 LRUHead GUARDED_BY(Mutex) = 0;
+  // The LRUTail of the cache is the least recently used cache entry
+  u16 LRUTail GUARDED_BY(Mutex) = 0;
+  // The AvailableHead is the top of the stack of available entries
+  u16 AvailableHead GUARDED_BY(Mutex) = 0;
 };
 
 template <typename Config> class MapAllocator {
diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
index 672cdc0ba655e..a85eb737dba0a 100644
--- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
@@ -46,11 +46,8 @@ set(SCUDO_UNITTEST_LINK_FLAGS
   ${COMPILER_RT_UNWINDER_LINK_LIBS}
   ${SANITIZER_TEST_CXX_LIBRARIES})
 list(APPEND SCUDO_UNITTEST_LINK_FLAGS -pthread -no-pie)
-# Linking against libatomic is required with some compilers
-check_library_exists(atomic __atomic_load_8 "" COMPILER_RT_HAS_LIBATOMIC)
-if (COMPILER_RT_HAS_LIBATOMIC)
-  list(APPEND SCUDO_UNITTEST_LINK_FLAGS -latomic)
-endif()
+
+append_list_if(COMPILER_RT_HAS_LIBATOMIC -latomic SCUDO_UNITTEST_LINK_FLAGS)
 
 set(SCUDO_TEST_HEADERS
   scudo_unit_test.h
diff --git a/compiler-rt/lib/ubsan/ubsan_type_hash_itanium.cpp b/compiler-rt/lib/ubsan/ubsan_type_hash_itanium.cpp
index 468a8fcd603f0..15788574dd995 100644
--- a/compiler-rt/lib/ubsan/ubsan_type_hash_itanium.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_type_hash_itanium.cpp
@@ -207,7 +207,7 @@ struct VtablePrefix {
   std::type_info *TypeInfo;
 };
 VtablePrefix *getVtablePrefix(void *Vtable) {
-  Vtable = ptrauth_auth_data(Vtable, ptrauth_key_cxx_vtable_pointer, 0);
+  Vtable = ptrauth_strip(Vtable, ptrauth_key_cxx_vtable_pointer);
   VtablePrefix *Vptr = reinterpret_cast<VtablePrefix*>(Vtable);
   VtablePrefix *Prefix = Vptr - 1;
   if (!IsAccessibleMemoryRange((uptr)Prefix, sizeof(VtablePrefix)))
diff --git a/compiler-rt/test/CMakeLists.txt b/compiler-rt/test/CMakeLists.txt
index 83e443944c01c..84a98f3674749 100644
--- a/compiler-rt/test/CMakeLists.txt
+++ b/compiler-rt/test/CMakeLists.txt
@@ -48,7 +48,7 @@ umbrella_lit_testsuite_begin(check-compiler-rt)
 
 function(compiler_rt_test_runtime runtime)
   string(TOUPPER ${runtime} runtime_uppercase)
-  if(COMPILER_RT_HAS_${runtime_uppercase})
+  if(COMPILER_RT_HAS_${runtime_uppercase} AND COMPILER_RT_INCLUDE_TESTS)
     if (${runtime} STREQUAL cfi AND NOT COMPILER_RT_HAS_UBSAN)
       # CFI tests require diagnostic mode, which is implemented in UBSan.
     elseif (${runtime} STREQUAL scudo_standalone)
diff --git a/compiler-rt/test/asan/CMakeLists.txt b/compiler-rt/test/asan/CMakeLists.txt
index 2d683e61d6954..fb9e81bbe8082 100644
--- a/compiler-rt/test/asan/CMakeLists.txt
+++ b/compiler-rt/test/asan/CMakeLists.txt
@@ -130,39 +130,37 @@ if(APPLE)
 endif()
 
 # Add unit tests.
-if(COMPILER_RT_INCLUDE_TESTS)
-  foreach(arch ${ASAN_TEST_ARCH})
-    string(TOUPPER ${arch} ARCH_UPPER_CASE)
-    set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
-    set(CONFIG_NAME_DYNAMIC ${ARCH_UPPER_CASE}${OS_NAME}DynamicConfig)
+foreach(arch ${ASAN_TEST_ARCH})
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+  set(CONFIG_NAME_DYNAMIC ${ARCH_UPPER_CASE}${OS_NAME}DynamicConfig)
 
-    if(NOT MINGW)
-      # MinGW environments don't provide a statically linked CRT, so only the
-      # dynamic asan test configuration can be expected to work.
-      set(ASAN_TEST_DYNAMIC False)
-      configure_lit_site_cfg(
-        ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-        ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+  if(NOT MINGW)
+    # MinGW environments don't provide a statically linked CRT, so only the
+    # dynamic asan test configuration can be expected to work.
+    set(ASAN_TEST_DYNAMIC False)
+    configure_lit_site_cfg(
+      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+  endif()
+  if(COMPILER_RT_ASAN_HAS_STATIC_RUNTIME)
+    set(ASAN_TEST_DYNAMIC True)
+    configure_lit_site_cfg(
+      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME_DYNAMIC}/lit.site.cfg.py)
+  endif()
+  # FIXME: support unit test in the android test runner
+  if (NOT ANDROID)
+    if (NOT MINGW)
+      list(APPEND ASAN_TEST_DEPS AsanUnitTests)
+      list(APPEND ASAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
     endif()
     if(COMPILER_RT_ASAN_HAS_STATIC_RUNTIME)
-      set(ASAN_TEST_DYNAMIC True)
-      configure_lit_site_cfg(
-        ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-        ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME_DYNAMIC}/lit.site.cfg.py)
+      list(APPEND ASAN_DYNAMIC_TEST_DEPS AsanDynamicUnitTests)
+      list(APPEND ASAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME_DYNAMIC})
     endif()
-    # FIXME: support unit test in the android test runner
-    if (NOT ANDROID)
-      if (NOT MINGW)
-        list(APPEND ASAN_TEST_DEPS AsanUnitTests)
-        list(APPEND ASAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
-      endif()
-      if(COMPILER_RT_ASAN_HAS_STATIC_RUNTIME)
-        list(APPEND ASAN_DYNAMIC_TEST_DEPS AsanDynamicUnitTests)
-        list(APPEND ASAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME_DYNAMIC})
-      endif()
-    endif()
-  endforeach()
-endif()
+  endif()
+endforeach()
 
 if (SHADOW_MAPPING_UNRELIABLE)
   set(exclude_from_check_all.g "EXCLUDE_FROM_CHECK_ALL")
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp
new file mode 100644
index 0000000000000..091ecfae5e5ca
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp
@@ -0,0 +1,76 @@
+// Test that captures the current behavior of indicator-based ASan ODR checker
+// when globals get unregistered.
+//
+// RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so
+// RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so
+// RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t
+// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s
+
+#include <cstdlib>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <string>
+
+#ifdef SHARED_LIB
+namespace foo {
+char G[SIZE];
+}
+#else // SHARED_LIB
+void *dlopen_or_die(std::string &path) {
+  void *handle = dlopen(path.c_str(), RTLD_NOW);
+  if (handle) {
+    printf("Successfully called dlopen() on %s\n", path.c_str());
+  } else {
+    printf("Error in dlopen(): %s\n", dlerror());
+    std::exit(1);
+  }
+
+  return handle;
+}
+
+void dlclose_or_die(void *handle, std::string &path) {
+  if (!dlclose(handle)) {
+    printf("Successfully called dlclose() on %s\n", path.c_str());
+  } else {
+    printf("Error in dlclose(): %s\n", dlerror());
+    std::exit(1);
+  }
+}
+
+namespace foo {
+char G[1];
+}
+
+// main has its own version of foo::G
+// CHECK: Added Global[[MAIN_G:[^\s]+]] size=1/32 name=foo::G {{.*}}
+int main(int argc, char *argv[]) {
+  std::string base_path = std::string(argv[0]);
+
+  std::string path1 = base_path + "-so-1.so";
+  // dlopen() brings another foo::G but it matches MAIN_G in size so it's not a
+  // violation
+  //
+  //
+  // CHECK: Added Global[[SO1_G:[^\s]+]] size=1/32 name=foo::G {{.*}}
+  // CHECK-NOT: ERROR: AddressSanitizer: odr-violation
+  void *handle1 = dlopen_or_die(path1);
+  // CHECK: Removed Global[[SO1_G]] size=1/32 name=foo::G {{.*}}
+  dlclose_or_die(handle1, path1);
+
+  // At this point the indicator for foo::G is switched to UNREGISTERED for
+  // **both** MAIN_G and SO1_G because the indicator value is shared.
+
+  std::string path2 = base_path + "-so-2.so";
+  // CHECK: Added Global[[SO2_G:[^\s]+]] size=2/32 name=foo::G {{.*}}
+  //
+  // This brings another foo::G but now different in size from MAIN_G. We
+  // should've reported a violation, but we actually don't because of what's
+  // described on line60
+  //
+  // CHECK-NOT: ERROR: AddressSanitizer: odr-violation
+  void *handle2 = dlopen_or_die(path2);
+  // CHECK: Removed Global[[MAIN_G]] size=1/32 name=foo::G {{.*}}
+  // CHECK: Removed Global[[SO2_G]] size=2/32 name=foo::G {{.*}}
+}
+
+#endif // SHARED_LIB
diff --git a/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp b/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
index 3266eef826eda..87be90014d56e 100644
--- a/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
+++ b/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,15 +1,15 @@
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-BASIC
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore && %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-2
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-MISMATCH
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-MISMATCH
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=double_delete && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/compiler-rt/test/asan/TestCases/debug_stacks.cpp b/compiler-rt/test/asan/TestCases/debug_stacks.cpp
index 7c320bfcb2d98..67a9ac849c3aa 100644
--- a/compiler-rt/test/asan/TestCases/debug_stacks.cpp
+++ b/compiler-rt/test/asan/TestCases/debug_stacks.cpp
@@ -42,9 +42,9 @@ int main() {
   fprintf(stderr, "thread id = %d\n", thread_id);
   // CHECK: thread id = 0
   fprintf(stderr, "0x" PTR "\n", trace[0]);
-  // CHECK: [[ALLOC_FRAME_0:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[ALLOC_FRAME_0:[0-9a-f]+]]
   fprintf(stderr, "0x" PTR "\n", trace[1]);
-  // CHECK: [[ALLOC_FRAME_1:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[ALLOC_FRAME_1:[0-9a-f]+]]
 
   num_frames = 100;
   num_frames = __asan_get_free_stack(mem, trace, num_frames, &thread_id);
@@ -55,20 +55,20 @@ int main() {
   fprintf(stderr, "thread id = %d\n", thread_id);
   // CHECK: thread id = 0
   fprintf(stderr, "0x" PTR "\n", trace[0]);
-  // CHECK: [[FREE_FRAME_0:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[FREE_FRAME_0:[0-9a-f]+]]
   fprintf(stderr, "0x" PTR "\n", trace[1]);
-  // CHECK: [[FREE_FRAME_1:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[FREE_FRAME_1:[0-9a-f]+]]
 
   mem[0] = 'A'; // BOOM
 
   // CHECK: ERROR: AddressSanitizer: heap-use-after-free
   // CHECK: WRITE of size 1 at 0x{{.*}}
   // CHECK: freed by thread T0 here:
-  // CHECK: #0 [[FREE_FRAME_0]]
-  // CHECK: #1 [[FREE_FRAME_1]]
+  // CHECK: #0 0x{{0*}}[[FREE_FRAME_0]]
+  // CHECK: #1 0x{{0*}}[[FREE_FRAME_1]]
   // CHECK: previously allocated by thread T0 here:
-  // CHECK: #0 [[ALLOC_FRAME_0]]
-  // CHECK: #1 [[ALLOC_FRAME_1]]
+  // CHECK: #0 0x{{0*}}[[ALLOC_FRAME_0]]
+  // CHECK: #1 0x{{0*}}[[ALLOC_FRAME_1]]
 
   return 0;
 }
diff --git a/compiler-rt/test/asan/Unit/lit.site.cfg.py.in b/compiler-rt/test/asan/Unit/lit.site.cfg.py.in
index 638e1dedfc1d2..ac652b53dcb9d 100644
--- a/compiler-rt/test/asan/Unit/lit.site.cfg.py.in
+++ b/compiler-rt/test/asan/Unit/lit.site.cfg.py.in
@@ -53,7 +53,7 @@ config.test_source_root = config.test_exec_root
 # host triple as the trailing path component. The value is incorrect for i386
 # tests on x86_64 hosts and vice versa. Adjust config.compiler_rt_libdir
 # accordingly.
-if config.enable_per_target_runtime_dir and config.target_arch != config.host_arch:
+if config.enable_per_target_runtime_dir:
     if config.target_arch == 'i386':
         config.compiler_rt_libdir = re.sub(r'/x86_64(?=-[^/]+$)', '/i386', config.compiler_rt_libdir)
     elif config.target_arch == 'x86_64':
diff --git a/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c b/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
index da0e366430a73..ebbeba1cd0014 100644
--- a/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
+++ b/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
@@ -7,11 +7,22 @@ typedef unsigned int uint32_t;
 int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
                                    uint32_t Minor, uint32_t Subminor);
 
+int32_t __isPlatformOrVariantPlatformVersionAtLeast(
+    uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor,
+    uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2);
+
+void exit(int status);
+
 #define PLATFORM_MACOS 1
+#define PLATFORM_IOS 2
 
 int32_t check(uint32_t Major, uint32_t Minor, uint32_t Subminor) {
   int32_t Result =
       __isPlatformVersionAtLeast(PLATFORM_MACOS, Major, Minor, Subminor);
+  int32_t ResultVariant = __isPlatformOrVariantPlatformVersionAtLeast(
+      PLATFORM_MACOS, Major, Minor, Subminor, PLATFORM_IOS, 13, 0, 0);
+  if (Result != ResultVariant)
+    exit(-1);
   return Result;
 }
 
diff --git a/compiler-rt/test/builtins/Unit/ppc/test b/compiler-rt/test/builtins/Unit/ppc/test
deleted file mode 100755
index 96e06320dbe2a..0000000000000
--- a/compiler-rt/test/builtins/Unit/ppc/test
+++ /dev/null
@@ -1,18 +0,0 @@
-for FILE in $(ls *.c); do
-	if gcc -arch ppc -O0 $FILE ../../../Release/ppc/libcompiler_rt.Optimized.a -mlong-double-128
-	then
-	    echo "Testing $FILE"
-		if ./a.out
-		then
-			rm ./a.out
-		else
-			echo "fail"
-# 			exit 1
-		fi
-	else
-		echo "$FILE failed to compile"
-#		exit 1
-	fi
-done
-echo "pass"
-exit
diff --git a/compiler-rt/test/builtins/Unit/test b/compiler-rt/test/builtins/Unit/test
deleted file mode 100755
index e0683790c225c..0000000000000
--- a/compiler-rt/test/builtins/Unit/test
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-ARCHS='<host>'
-REMOTE=0
-if test `uname` = "Darwin"; then
-  if test "$1" = "armv6"; then
-    ARCHS="armv6"
-    LIBS="-lSystem"
-    REMOTE=1
-    mkdir -p remote
-  else
-    ARCHS="i386 x86_64 ppc"
-    LIBS="-lSystem"
-  fi
-else
-  LIBS="-lc -lm"
-fi
-
-for ARCH in $ARCHS; do
-  CFLAGS="-Os -nodefaultlibs -I../../lib"
-  if test "$ARCH" != '<host>'; then
-    CFLAGS="-arch $ARCH $CFLAGS"
-  fi
-  for FILE in $(ls *.c); do
-    # Use -nodefaultlibs to avoid using libgcc.a
-    # Use -lSystem to link with libSystem.dylb.
-    # Note -lSystem is *after* libcompiler_rt.Optimized.a so that linker will 
-    # prefer our implementation over the ones in libSystem.dylib
-    EXTRA=
-    if test $FILE = gcc_personality_test.c
-    then
-      # the gcc_personality_test.c requires a helper C++ program
-      EXTRA="-fexceptions gcc_personality_test_helper.cxx -lstdc++ /usr/lib/libgcc_s.1.dylib"
-      # the libgcc_s.1.dylib use at the end is a hack until libSystem contains _Unwind_Resume
-    fi
-    if test "$REMOTE" = "1"
-    then 
-      if gcc $CFLAGS $FILE ../../darwin_fat/Release/libcompiler_rt.a $LIBS $EXTRA -o ./remote/$FILE.exe
-      then
-        echo "Built $FILE.exe for $ARCH"
-      else
-        echo "$FILE failed to compile"
-      fi
-    else
-      if gcc $CFLAGS $FILE ../../darwin_fat/Release/libcompiler_rt.a $LIBS $EXTRA
-      then
-        echo "Testing $FILE for $ARCH"
-        if ./a.out
-        then
-          rm ./a.out
-        else
-          echo "fail"
-          exit 1
-        fi
-      else
-        echo "$FILE failed to compile"
-        exit 1
-      fi
-    fi
-  done
-done
-echo "pass"
-exit
diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
index da115fe764271..d51d35acaa02f 100644
--- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
+++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@
 
 /*
  * Tests nested functions
- * The ppc compiler generates a call to __trampoline_setup
+ * The ppc and aarch64 compilers generates a call to __trampoline_setup
  * The i386 and x86_64 compilers generate a call to ___enable_execute_stack
  */
 
diff --git a/compiler-rt/test/ctx_profile/CMakeLists.txt b/compiler-rt/test/ctx_profile/CMakeLists.txt
index 371f1a2dcbb05..fc3b3f3ec431f 100644
--- a/compiler-rt/test/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/test/ctx_profile/CMakeLists.txt
@@ -25,17 +25,15 @@ foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
 endforeach()
 
 # Add unit tests.
-if(COMPILER_RT_INCLUDE_TESTS)
-  foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
-    string(TOUPPER ${arch} ARCH_UPPER_CASE)
-    set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
-    configure_lit_site_cfg(
-      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
-    list(APPEND CTX_PROFILE_TEST_DEPS CtxProfileUnitTests)
-    list(APPEND CTX_PROFILE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
-  endforeach()
-endif()
+foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+  list(APPEND CTX_PROFILE_TEST_DEPS CtxProfileUnitTests)
+  list(APPEND CTX_PROFILE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
+endforeach()
 
 add_lit_testsuite(check-ctx_profile "Running the Contextual Profiler tests"
   ${CTX_PROFILE_TESTSUITES}
diff --git a/compiler-rt/test/fuzzer/CMakeLists.txt b/compiler-rt/test/fuzzer/CMakeLists.txt
index 8d29a4bea5f34..b19f52e591fc0 100644
--- a/compiler-rt/test/fuzzer/CMakeLists.txt
+++ b/compiler-rt/test/fuzzer/CMakeLists.txt
@@ -22,20 +22,16 @@ if (APPLE)
   darwin_filter_host_archs(FUZZER_SUPPORTED_ARCH FUZZER_TEST_ARCH)
 endif()
 
-if(COMPILER_RT_INCLUDE_TESTS)
-  list(APPEND LIBFUZZER_TEST_DEPS FuzzerUnitTests)
-  list(APPEND LIBFUZZER_TEST_DEPS FuzzedDataProviderUnitTests)
-endif()
+list(APPEND LIBFUZZER_TEST_DEPS FuzzerUnitTests)
+list(APPEND LIBFUZZER_TEST_DEPS FuzzedDataProviderUnitTests)
 
 set(LIBFUZZER_TESTSUITES)
 
-if(COMPILER_RT_INCLUDE_TESTS)
-  # libFuzzer unit tests.
-  configure_lit_site_cfg(
-    ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg.py)
-  list(APPEND LIBFUZZER_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit)
-endif()
+# libFuzzer unit tests.
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg.py)
+list(APPEND LIBFUZZER_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit)
 
 macro(test_fuzzer stdlib)
   cmake_parse_arguments(TEST "" "" "DEPS" ${ARGN})
diff --git a/compiler-rt/test/gwp_asan/CMakeLists.txt b/compiler-rt/test/gwp_asan/CMakeLists.txt
index f9580e2592b2f..6ffa36e7ebbda 100644
--- a/compiler-rt/test/gwp_asan/CMakeLists.txt
+++ b/compiler-rt/test/gwp_asan/CMakeLists.txt
@@ -14,7 +14,7 @@ set(GWP_ASAN_TEST_DEPS
 # exported libc++ from the Android NDK is x86-64, even though it's part of the
 # ARM[64] toolchain... See similar measures for ASan and sanitizer-common that
 # disable unit tests for Android.
-if (COMPILER_RT_INCLUDE_TESTS AND NOT ANDROID)
+if (NOT ANDROID)
   list(APPEND GWP_ASAN_TEST_DEPS GwpAsanUnitTests)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.py.in
@@ -22,7 +22,7 @@ if (COMPILER_RT_INCLUDE_TESTS AND NOT ANDROID)
   list(APPEND GWP_ASAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit)
 endif()
 
-if (COMPILER_RT_INCLUDE_TESTS AND COMPILER_RT_HAS_SCUDO_STANDALONE)
+if (COMPILER_RT_HAS_SCUDO_STANDALONE)
   foreach(arch ${GWP_ASAN_SUPPORTED_ARCH})
     set(GWP_ASAN_TEST_TARGET_ARCH ${arch})
     string(TOLOWER "-${arch}" GWP_ASAN_TEST_CONFIG_SUFFIX)
diff --git a/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp b/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
index b6f17d89083b4..8c48654f84c0a 100644
--- a/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
+++ b/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-BASIC
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore && %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-2
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=double_delete && not %run %t \
-// RUN:   |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
+// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
 
 #include <sanitizer/hwasan_interface.h>
 #include <stdio.h>
diff --git a/compiler-rt/test/interception/CMakeLists.txt b/compiler-rt/test/interception/CMakeLists.txt
index df69453c4fb4e..8ed1a003c1b3c 100644
--- a/compiler-rt/test/interception/CMakeLists.txt
+++ b/compiler-rt/test/interception/CMakeLists.txt
@@ -3,7 +3,7 @@ set(INTERCEPTION_TESTSUITES)
 
 # Unit tests. There are currently no unit tests capable to running on Apple or
 # Android targets.
-if(COMPILER_RT_INCLUDE_TESTS AND NOT ANDROID AND NOT APPLE)
+if(NOT ANDROID AND NOT APPLE)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
diff --git a/compiler-rt/test/memprof/CMakeLists.txt b/compiler-rt/test/memprof/CMakeLists.txt
index 001245215201e..4c50ae6b83719 100644
--- a/compiler-rt/test/memprof/CMakeLists.txt
+++ b/compiler-rt/test/memprof/CMakeLists.txt
@@ -44,17 +44,15 @@ foreach(arch ${MEMPROF_TEST_ARCH})
 endforeach()
 
 # Add unit tests.
-if(COMPILER_RT_INCLUDE_TESTS)
-  foreach(arch ${MEMPROF_TEST_ARCH})
-    string(TOUPPER ${arch} ARCH_UPPER_CASE)
-    set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
-    configure_lit_site_cfg(
-      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
-    list(APPEND MEMPROF_TEST_DEPS MemProfUnitTests)
-    list(APPEND MEMPROF_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
-  endforeach()
-endif()
+foreach(arch ${MEMPROF_TEST_ARCH})
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+  list(APPEND MEMPROF_TEST_DEPS MemProfUnitTests)
+  list(APPEND MEMPROF_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
+endforeach()
 
 add_lit_testsuite(check-memprof "Running the MemProfiler tests"
   ${MEMPROF_TESTSUITES}
diff --git a/compiler-rt/test/msan/CMakeLists.txt b/compiler-rt/test/msan/CMakeLists.txt
index ff19c11971b2d..df5719761e607 100644
--- a/compiler-rt/test/msan/CMakeLists.txt
+++ b/compiler-rt/test/msan/CMakeLists.txt
@@ -41,9 +41,7 @@ foreach(arch ${MSAN_TEST_ARCH})
   endif()
 endforeach()
 
-if(COMPILER_RT_INCLUDE_TESTS AND
-   COMPILER_RT_LIBCXX_PATH AND
-   COMPILER_RT_LIBCXXABI_PATH)
+if(COMPILER_RT_LIBCXX_PATH AND COMPILER_RT_LIBCXXABI_PATH)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
diff --git a/compiler-rt/test/nsan/CMakeLists.txt b/compiler-rt/test/nsan/CMakeLists.txt
index fb73587574fba..78a1f4baee116 100644
--- a/compiler-rt/test/nsan/CMakeLists.txt
+++ b/compiler-rt/test/nsan/CMakeLists.txt
@@ -3,9 +3,23 @@ set(NSAN_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(NSAN_TESTSUITES)
 set(NSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} nsan)
 
-if(COMPILER_RT_INCLUDE_TESTS AND
-   COMPILER_RT_LIBCXX_PATH AND
-   COMPILER_RT_LIBCXXABI_PATH)
+macro(add_nsan_testsuite arch)
+  set(NSAN_TEST_TARGET_ARCH ${arch})
+  get_test_cc_for_arch(${arch} NSAN_TEST_TARGET_CC NSAN_TEST_TARGET_CFLAGS)
+
+  string(TOUPPER ${arch} CONFIG_NAME)
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
+  list(APPEND NSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endmacro()
+
+set(NSAN_TEST_ARCH ${NSAN_SUPPORTED_ARCH})
+foreach(arch ${NSAN_TEST_ARCH})
+  add_nsan_testsuite(${arch})
+endforeach()
+
+if(COMPILER_RT_LIBCXX_PATH AND COMPILER_RT_LIBCXXABI_PATH)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
diff --git a/compiler-rt/test/nsan/alloca.cpp b/compiler-rt/test/nsan/alloca.cpp
new file mode 100644
index 0000000000000..33f7c1364e664
--- /dev/null
+++ b/compiler-rt/test/nsan/alloca.cpp
@@ -0,0 +1,26 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// RUN: %clangxx_nsan -O3 -g %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+#include <cstddef>
+
+#include "helpers.h"
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+int main() {
+  int size = 3 * sizeof(float);
+  // Make sure we allocate dynamically: https://godbolt.org/z/T3h998.
+  DoNotOptimize(size);
+  float *array = reinterpret_cast<float *>(__builtin_alloca(size));
+  DoNotOptimize(array);
+  array[0] = 1.0;
+  array[1] = 2.0;
+  // The third float is uninitialized.
+  __nsan_dump_shadow_mem((const char *)array, 3 * sizeof(float), 16, 0);
+  // CHECK: {{.*}} f0 f1 f2 f3 f0 f1 f2 f3 __ __ __ __ (1.00000000000000000000) (2.00000000000000000000)
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/helpers.h b/compiler-rt/test/nsan/helpers.h
new file mode 100644
index 0000000000000..487707a0a8fa3
--- /dev/null
+++ b/compiler-rt/test/nsan/helpers.h
@@ -0,0 +1,4 @@
+// Prevents the compiler from optimizing everything away.
+template <class T> void DoNotOptimize(const T &var) {
+  asm volatile("" : "+m"(const_cast<T &>(var)));
+}
diff --git a/compiler-rt/test/nsan/lit.cfg.py b/compiler-rt/test/nsan/lit.cfg.py
index 8b137891791fe..2d67911a7d5d8 100644
--- a/compiler-rt/test/nsan/lit.cfg.py
+++ b/compiler-rt/test/nsan/lit.cfg.py
@@ -1 +1,36 @@
+config.name = "NSan" + config.name_suffix
 
+# Setup source root.
+config.test_source_root = os.path.dirname(__file__)
+
+# Test suffixes.
+config.suffixes = [".c", ".cpp", ".test"]
+
+# C & CXX flags.
+c_flags = [config.target_cflags]
+
+# CXX flags
+cxx_flags = c_flags + config.cxx_mode_flags + ["-std=c++17"]
+
+nsan_flags = [
+    "-fsanitize=numerical",
+    "-g",
+    "-mno-omit-leaf-frame-pointer",
+    "-fno-omit-frame-pointer",
+]
+
+
+def build_invocation(compile_flags):
+    return " " + " ".join([config.clang] + compile_flags) + " "
+
+
+# Add substitutions.
+config.substitutions.append(("%clang ", build_invocation(c_flags)))
+config.substitutions.append(("%clang_nsan ", build_invocation(c_flags + nsan_flags)))
+config.substitutions.append(
+    ("%clangxx_nsan ", build_invocation(cxx_flags + nsan_flags))
+)
+
+# NSan tests are currently supported on Linux only.
+if config.host_os not in ["Linux"]:
+    config.unsupported = True
diff --git a/compiler-rt/test/nsan/lit.site.cfg.py.in b/compiler-rt/test/nsan/lit.site.cfg.py.in
index 69fd057e75748..496a9d5277a06 100644
--- a/compiler-rt/test/nsan/lit.site.cfg.py.in
+++ b/compiler-rt/test/nsan/lit.site.cfg.py.in
@@ -4,8 +4,6 @@
 config.name_suffix = "-@CONFIG_NAME@"
 config.target_cflags = "@NSAN_TEST_TARGET_CFLAGS@"
 config.target_arch = "@NSAN_TEST_TARGET_ARCH@"
-config.use_lld = @NSAN_TEST_USE_LLD@
-config.use_thinlto = @NSAN_TEST_USE_THINLTO@
 
 # Load common config for all compiler-rt lit tests.
 lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
diff --git a/compiler-rt/test/nsan/sum.cpp b/compiler-rt/test/nsan/sum.cpp
new file mode 100644
index 0000000000000..31bd62e73966b
--- /dev/null
+++ b/compiler-rt/test/nsan/sum.cpp
@@ -0,0 +1,70 @@
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
+// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s
+
+// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
+// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s
+
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
+// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t
+
+// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
+// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t
+
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <vector>
+
+// A naive, unstable summation.
+template <typename T>
+__attribute__((noinline))  // To check call stack reporting.
+T NaiveSum(const std::vector<T>& values) {
+  T sum = 0;
+  for (T v : values) {
+    sum += v;
+  }
+  return sum;
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
+  // CHECK: float{{ *}}precision (native):
+  // CHECK: double{{ *}}precision (shadow):
+  // CHECK: {{#0 .*in .* NaiveSum}}
+}
+
+// Kahan's summation is a numerically stable sum.
+// https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+template <typename T>
+__attribute__((noinline)) T KahanSum(const std::vector<T> &values) {
+  T sum = 0;
+  T c = 0;
+  for (T v : values) {
+    T y = v - c;
+    T t = sum + y;
+    c = (t - sum) - y;
+    sum = t;
+  }
+  return sum;
+}
+
+int main() {
+  std::vector<FLT> values;
+  constexpr int kNumValues = 1000000;
+  values.reserve(kNumValues);
+  // Using a seed to avoid flakiness.
+  constexpr uint32_t kSeed = 0x123456;
+  std::mt19937 gen(kSeed);
+  std::uniform_real_distribution<FLT> dis(0.0f, 1000.0f);
+  for (int i = 0; i < kNumValues; ++i) {
+    values.push_back(dis(gen));
+  }
+
+  const auto t1 = std::chrono::high_resolution_clock::now();
+  const auto sum = SUM(values);
+  const auto t2 = std::chrono::high_resolution_clock::now();
+  printf("sum: %.8f\n", sum);
+  std::cout << "runtime: "
+            << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
+                       .count() /
+                   1000.0
+            << "ms\n";
+  return 0;
+}
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c b/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
index 85caca9a56b40..3ed7c1894b6d1 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
@@ -8,8 +8,8 @@
 
 // Align counters and data to the maximum expected page size (16K).
 // RUN: %clang -g -o %t %s \
-// RUN:   -Wl,-sectalign,__DATA,__pcnts,0x4000 \
-// RUN:   -Wl,-sectalign,__DATA,__pdata,0x4000
+// RUN:   -Wl,-sectalign,__DATA,__pcnts,0x1000 \
+// RUN:   -Wl,-sectalign,__DATA,__pdata,0x1000
 
 // Create a 'profile' using mmap() and validate it.
 // RUN: %run %t create %t.tmpfile
@@ -24,7 +24,7 @@
 
 __attribute__((section("__DATA,__pcnts"))) int counters[] = {0xbad};
 extern int cnts_start __asm("section$start$__DATA$__pcnts");
-const size_t cnts_len = 0x4000;
+const size_t cnts_len = 0x1000;
 
 __attribute__((section("__DATA,__pdata"))) int data[] = {1, 2, 3};
 extern int data_start __asm("section$start$__DATA$__pdata");
@@ -44,8 +44,8 @@ int create_tmpfile(char *path) {
     return EXIT_FAILURE;
   }
 
-  // Write the data first (at offset 0x4000, after the counters).
-  if (data_len != pwrite(fd, &data, data_len, 0x4000)) {
+  // Write the data first (at offset 0x1000, after the counters).
+  if (data_len != pwrite(fd, &data, data_len, cnts_len)) {
     perror("write");
     return EXIT_FAILURE;
   }
@@ -55,8 +55,8 @@ int create_tmpfile(char *path) {
   // Requirements (on Darwin):
   // - &cnts_start must be page-aligned.
   // - The length and offset-into-fd must be page-aligned.
-  int *counter_map = (int *)mmap(&cnts_start, 0x4000, PROT_READ | PROT_WRITE,
-      MAP_FIXED | MAP_SHARED, fd, 0);
+  int *counter_map = (int *)mmap(&cnts_start, cnts_len, PROT_READ | PROT_WRITE,
+                                 MAP_FIXED | MAP_SHARED, fd, 0);
   if (counter_map != &cnts_start) {
     perror("mmap");
     return EXIT_FAILURE;
@@ -97,7 +97,7 @@ int validate_tmpfile(char *path) {
   }
 
   // Verify that the rest of the counters (after counter 9) are 0.
-  const int num_cnts = 0x4000 / sizeof(int);
+  const int num_cnts = cnts_len / sizeof(int);
   for (int i = 10; i < num_cnts; ++i) {
     if (buf[i] != 0) {
       fprintf(stderr,
@@ -131,11 +131,12 @@ int main(int argc, char **argv) {
     fprintf(stderr, "__pcnts is not page-aligned: 0x%lx.\n", cnts_start_int);
     return EXIT_FAILURE;
   }
-  if (data_start_int % pagesz != 0) {
-    fprintf(stderr, "__pdata is not page-aligned: 0x%lx.\n", data_start_int);
+  if (data_start_int % 0x1000 != 0) {
+    fprintf(stderr, "__pdata is not correctly aligned: 0x%lx.\n",
+            data_start_int);
     return EXIT_FAILURE;
   }
-  if (cnts_start_int + 0x4000 != data_start_int) {
+  if (cnts_start_int + 0x1000 != data_start_int) {
     fprintf(stderr, "__pdata not ordered after __pcnts.\n");
     return EXIT_FAILURE;
   }
diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 24a96a227393a..214c1504db7f4 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -5,9 +5,9 @@
 // ld.lld: error: /lib/../lib64/Scrt1.o: ABI version 1 is not supported
 // UNSUPPORTED: ppc && host-byteorder-big-endian
 
-// RUN: rm -rf %t && mkdir %t && cd %t
+// RUN: rm -rf %t && mkdir %t && split-file %s %t && cd %t
 
-// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o test
+// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling lib.cpp main.cpp -o test
 // RUN: env LLVM_PROFILE_FILE=test.profraw ./test
 
 // Show vtable profiles from raw profile.
@@ -37,23 +37,23 @@
 // COMMON-NEXT:  Number of instrumented vtables: 2
 // RAW:  Indirect Target Results:
 // RAW-NEXT:       [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// RAW-NEXT:       [  0, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
 // RAW-NEXT:       [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
-// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
+// RAW-NEXT:       [  1, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
 // RAW-NEXT:  VTable Results:
 // RAW-NEXT:       [  0, _ZTV8Derived1,        50 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
+// RAW-NEXT:       [  0, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
 // RAW-NEXT:       [  1, _ZTV8Derived1,        250 ] (25.00%)
-// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+// RAW-NEXT:       [  1, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED:     Indirect Target Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
 // INDEXED-NEXT:         [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
-// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
+// INDEXED-NEXT:         [  1, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
 // INDEXED-NEXT:         [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
 // INDEXED-NEXT:     VTable Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
 // INDEXED-NEXT:         [  0, _ZTV8Derived1,        50 ] (25.00%)
-// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+// INDEXED-NEXT:         [  1, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED-NEXT:         [  1, _ZTV8Derived1,        250 ] (25.00%)
 // COMMON: Instrumentation level: IR  entry_first = 0
 // COMMON-NEXT: Functions shown: 1
@@ -93,22 +93,41 @@
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150
+// ICTEXT: {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150
 // ICTEXT: _ZN8Derived14funcEii:50
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750
+// ICTEXT: {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750
 // ICTEXT: _ZN8Derived1D0Ev:250
 // ICTEXT: # ValueKind = IPVK_VTableTarget:
 // ICTEXT: 2
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150
+// ICTEXT: {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150
 // ICTEXT: _ZTV8Derived1:50
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
+// ICTEXT: {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
 // ICTEXT: _ZTV8Derived1:250
 
+// When vtable value profiles exist, pgo-instr-use pass should annotate them
+// even if `-enable-vtable-value-profiling` is not explicitly on.
+// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
+// RUN:   -mllvm -print-after=pgo-instr-use -mllvm -filter-print-funcs=main \
+// RUN:   -mllvm -print-module-scope lib.cpp main.cpp 2>&1 | FileCheck %s --check-prefix=ANNOTATE
+
+// ANNOTATE-NOT: Inconsistent number of value sites
+// ANNOTATE: !{!"VP", i32 2
+
+// When vtable value profiles exist, pgo-instr-use pass will not annotate them
+// if `-icp-max-num-vtables` is set to zero.
+// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
+// RUN:   -mllvm -icp-max-num-vtables=0 -mllvm -print-after=pgo-instr-use \
+// RUN:   -mllvm -filter-print-funcs=main -mllvm -print-module-scope lib.cpp main.cpp 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=OMIT
+
+// OMIT: Inconsistent number of value sites
+// OMIT-NOT: !{!"VP", i32 2
+
 // Test indirect call promotion transformation using vtable profiles.
 // - Build with `-g` to enable debug information.
 // - In real world settings, ICP pass is disabled in prelink pipeline. In
@@ -122,28 +141,29 @@
 // RUN:    -g -flto=thin -fwhole-program-vtables -fno-split-lto-unit -O2 \
 // RUN:    -mllvm -enable-vtable-value-profiling -Wl,-mllvm,-enable-vtable-value-profiling \
 // RUN:    -mllvm -enable-vtable-profile-use \
+// RUN:    -Wl,-plugin-opt,-import-assume-unique-local \
 // RUN:    -Wl,-mllvm,-enable-vtable-profile-use -Rpass=pgo-icall-prom \
 // RUN:    -Wl,-mllvm,-print-after=pgo-icall-prom \
-// RUN:    -Wl,-mllvm,-filter-print-funcs=main %s 2>&1 \
+// RUN:    -Wl,-mllvm,-filter-print-funcs=main lib.cpp main.cpp 2>&1 \
 // RUN:    | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
 // For the indirect call site `ptr->func`
-// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: main.cpp:10:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii.llvm.{{.*}} with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}}
+// REMARK: main.cpp:10:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 //
 // For the indirect call site `delete ptr`
-// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: main.cpp:12:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev.llvm.{{.*}} with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}}
+// REMARK: main.cpp:12:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 
 // The IR matchers for indirect callsite `ptr->func`.
 // IR-LABEL: @main
 // IR:   [[OBJ:%.*]] = {{.*}}call {{.*}} @_Z10createTypei
 // IR:   [[VTABLE:%.*]] = load ptr, ptr [[OBJ]]
-// IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E, i32 16)
+// IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}, i32 16)
 // IR:   br i1 [[CMP1]], label %[[BB1:.*]], label %[[BB2:[a-zA-Z0-9_.]+]],
 //
 // IR: [[BB1]]:
-// IR:   [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii
+// IR:   [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii.llvm.{{.*}}
 // IR:   br label %[[MERGE0:[a-zA-Z0-9_.]+]]
 //
 // IR: [[BB2]]:
@@ -166,6 +186,7 @@
 // IR: [[MERGE0]]:
 // IR:    [[RES2:%.*]] = phi i32 [ [[RES1]], %[[MERGE1]] ], [ [[RESBB1]], %[[BB1]] ]
 
+//--- lib.h
 #include <stdio.h>
 #include <stdlib.h>
 class Base {
@@ -174,12 +195,19 @@ class Base {
 
   virtual ~Base() {};
 };
+
 class Derived1 : public Base {
 public:
-  int func(int a, int b) override { return a * b; }
+  int func(int a, int b) override;
 
   ~Derived1() {}
 };
+
+__attribute__((noinline)) Base *createType(int a);
+
+//--- lib.cpp
+#include "lib.h"
+
 namespace {
 class Derived2 : public Base {
 public:
@@ -188,7 +216,10 @@ class Derived2 : public Base {
   ~Derived2() {}
 };
 } // namespace
-__attribute__((noinline)) Base *createType(int a) {
+
+int Derived1::func(int a, int b) { return a * b; }
+
+Base *createType(int a) {
   Base *base = nullptr;
   if (a % 4 == 0)
     base = new Derived1();
@@ -197,6 +228,9 @@ __attribute__((noinline)) Base *createType(int a) {
   return base;
 }
 
+//--- main.cpp
+#include "lib.h"
+
 int main(int argc, char **argv) {
   int sum = 0;
   for (int i = 0; i < 1000; i++) {
diff --git a/compiler-rt/test/rtsan/CMakeLists.txt b/compiler-rt/test/rtsan/CMakeLists.txt
index 2a59a57f27372..e1f9eb39408dc 100644
--- a/compiler-rt/test/rtsan/CMakeLists.txt
+++ b/compiler-rt/test/rtsan/CMakeLists.txt
@@ -34,20 +34,18 @@ foreach(arch ${RTSAN_TEST_ARCH})
   list(APPEND RTSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
 endforeach()
 
-if(COMPILER_RT_INCLUDE_TESTS)
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
+if(COMPILER_RT_RTSAN_HAS_STATIC_RUNTIME)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
-  if(COMPILER_RT_RTSAN_HAS_STATIC_RUNTIME)
-    configure_lit_site_cfg(
-      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-      ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic/lit.site.cfg.py)
-  endif()
-  list(APPEND RTSAN_TEST_DEPS RtsanUnitTests)
-  list(APPEND RTSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
-  if(COMPILER_RT_RTSAN_HAS_STATIC_RUNTIME)
-    list(APPEND RTSAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic)
-  endif()
+    ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic/lit.site.cfg.py)
+endif()
+list(APPEND RTSAN_TEST_DEPS RtsanUnitTests)
+list(APPEND RTSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
+if(COMPILER_RT_RTSAN_HAS_STATIC_RUNTIME)
+  list(APPEND RTSAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic)
 endif()
 
 add_lit_testsuite(check-rtsan "Running the Rtsan tests"
diff --git a/compiler-rt/test/safestack/CMakeLists.txt b/compiler-rt/test/safestack/CMakeLists.txt
index 89ba6e74884b1..fca06024c9856 100644
--- a/compiler-rt/test/safestack/CMakeLists.txt
+++ b/compiler-rt/test/safestack/CMakeLists.txt
@@ -1,29 +1,40 @@
 set(SAFESTACK_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(SAFESTACK_LIT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
+set(SAFESTACK_TESTSUITES)
 set(SAFESTACK_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
 list(APPEND SAFESTACK_TEST_DEPS safestack)
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  # Some tests require LTO, so add a dependency on the relevant LTO plugin.
-  if(LLVM_ENABLE_PIC)
-    if(LLVM_BINUTILS_INCDIR)
-      list(APPEND SAFESTACK_TEST_DEPS
-        LLVMgold
-      )
-    endif()
-    if(APPLE)
-      list(APPEND SAFESTACK_TEST_DEPS
-        LTO
-      )
+
+macro(add_safestack_testsuite test_mode sanitizer arch)
+  set(SAFESTACK_LIT_TEST_MODE "${test_mode}")
+  set(CONFIG_NAME ${SAFESTACK_LIT_TEST_MODE})
+
+  if(NOT COMPILER_RT_STANDALONE_BUILD)
+    # Some tests require LTO, so add a dependency on the relevant LTO plugin.
+    if(LLVM_ENABLE_PIC)
+      if(LLVM_BINUTILS_INCDIR)
+        list(APPEND SAFESTACK_TEST_DEPS LLVMgold)
+      endif()
+      if(APPLE)
+        list(APPEND SAFESTACK_TEST_DEPS LTO)
+      endif()
     endif()
   endif()
-endif()
+  set(CONFIG_NAME ${CONFIG_NAME}-${arch})
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
+  list(APPEND SAFESTACK_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endmacro()
+
+set(SAFESTACK_TEST_ARCH ${SAFESTACK_SUPPORTED_ARCH})
 
-configure_lit_site_cfg(
-  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
-  )
+foreach(arch ${SAFESTACK_TEST_ARCH})
+  set(SAFESTACK_TEST_TARGET_ARCH ${arch})
+  get_test_cc_for_arch(${arch} SAFESTACK_TEST_TARGET_CC SAFESTACK_TEST_TARGET_CFLAGS)
+  add_safestack_testsuite("Standalone" safestack ${arch})
+endforeach()
 
 add_lit_testsuite(check-safestack "Running the SafeStack tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
+  ${SAFESTACK_TESTSUITES}
   DEPENDS ${SAFESTACK_TEST_DEPS})
diff --git a/compiler-rt/test/safestack/lit.cfg.py b/compiler-rt/test/safestack/lit.cfg.py
index aadb8bf0d5c77..4ab9c1ce70bac 100644
--- a/compiler-rt/test/safestack/lit.cfg.py
+++ b/compiler-rt/test/safestack/lit.cfg.py
@@ -3,7 +3,7 @@
 import os
 
 # Setup config name.
-config.name = "SafeStack"
+config.name = "SafeStack-" + config.name_suffix
 
 # Setup source root.
 config.test_source_root = os.path.dirname(__file__)
@@ -33,5 +33,5 @@
         )
     )
 
-if config.host_os not in ["Linux", "FreeBSD", "NetBSD"]:
+if config.host_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]:
     config.unsupported = True
diff --git a/compiler-rt/test/safestack/lit.site.cfg.py.in b/compiler-rt/test/safestack/lit.site.cfg.py.in
index 3c8bf41b283b8..ee866bca2bad3 100644
--- a/compiler-rt/test/safestack/lit.site.cfg.py.in
+++ b/compiler-rt/test/safestack/lit.site.cfg.py.in
@@ -1,5 +1,11 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+# Tool-specific config options.
+config.name_suffix = "@CONFIG_NAME@"
+config.safestack_lit_test_mode = "@SAFESTACK_LIT_TEST_MODE@"
+config.target_cflags = "@SAFESTACK_TEST_TARGET_CFLAGS@"
+config.target_arch = "@SAFESTACK_TEST_TARGET_ARCH@"
+
 # Load common config for all compiler-rt lit tests.
 lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
 
diff --git a/compiler-rt/test/sanitizer_common/CMakeLists.txt b/compiler-rt/test/sanitizer_common/CMakeLists.txt
index edecc04d48c75..fa06b82acebd9 100644
--- a/compiler-rt/test/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/test/sanitizer_common/CMakeLists.txt
@@ -99,15 +99,13 @@ foreach(tool ${SUPPORTED_TOOLS})
 endforeach()
 
 # Unit tests.
-if(COMPILER_RT_INCLUDE_TESTS)
-  configure_lit_site_cfg(
-    ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
-  # FIXME: support unit test in the android test runner
-  if (NOT ANDROID)
-    list(APPEND SANITIZER_COMMON_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
-    list(APPEND SANITIZER_COMMON_TEST_DEPS SanitizerUnitTests)
-  endif()
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
+# FIXME: support unit test in the android test runner
+if (NOT ANDROID)
+  list(APPEND SANITIZER_COMMON_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
+  list(APPEND SANITIZER_COMMON_TEST_DEPS SanitizerUnitTests)
 endif()
 
 if(SANITIZER_COMMON_TESTSUITES)
diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp
new file mode 100644
index 0000000000000..74aea4d8b360a
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp
@@ -0,0 +1,17 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: i386-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: eax = {{0x[0-9a-f]+}}  ebx = {{0x[0-9a-f]+}}  ecx = {{0x[0-9a-f]+}}  edx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: edi = {{0x[0-9a-f]+}}  esi = {{0x[0-9a-f]+}}  ebp = {{0x[0-9a-f]+}}  esp = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp
new file mode 100644
index 0000000000000..3d11ef0e098f1
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: x86_64-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: rax = {{0x[0-9a-f]+}}  rbx = {{0x[0-9a-f]+}}  rcx = {{0x[0-9a-f]+}}  rdx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: rdi = {{0x[0-9a-f]+}}  rsi = {{0x[0-9a-f]+}}  rbp = {{0x[0-9a-f]+}}  rsp = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:  r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r12 = {{0x[0-9a-f]+}}  r13 = {{0x[0-9a-f]+}}  r14 = {{0x[0-9a-f]+}}  r15 = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
new file mode 100644
index 0000000000000..d1015a4e36956
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
@@ -0,0 +1,23 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
+// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
+//
+// REQUIRES: aarch64-target-arch && glibc
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: x0 = {{0x[0-9a-f]+}}   x1 = {{0x[0-9a-f]+}}   x2 = {{0x[0-9a-f]+}}   x3 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: x4 = {{0x[0-9a-f]+}}   x5 = {{0x[0-9a-f]+}}   x6 = {{0x[0-9a-f]+}}   x7 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: x8 = {{0x[0-9a-f]+}}   x9 = {{0x[0-9a-f]+}}  x10 = {{0x[0-9a-f]+}}  x11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x12 = {{0x[0-9a-f]+}}  x13 = {{0x[0-9a-f]+}}  x14 = {{0x[0-9a-f]+}}  x15 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x16 = {{0x[0-9a-f]+}}  x17 = {{0x[0-9a-f]+}}  x18 = {{0x[0-9a-f]+}}  x19 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x20 = {{0x[0-9a-f]+}}  x21 = {{0x[0-9a-f]+}}  x22 = {{0x[0-9a-f]+}}  x23 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x24 = {{0x[0-9a-f]+}}  x25 = {{0x[0-9a-f]+}}  x26 = {{0x[0-9a-f]+}}  x27 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x28 = {{0x[0-9a-f]+}}   fp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
new file mode 100644
index 0000000000000..e747f78188d32
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
+// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
+//
+// REQUIRES: arm-target-arch && glibc
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: r0 = {{0x[0-9a-f]+}}   r1 = {{0x[0-9a-f]+}}   r2 = {{0x[0-9a-f]+}}   r3 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r4 = {{0x[0-9a-f]+}}   r5 = {{0x[0-9a-f]+}}   r6 = {{0x[0-9a-f]+}}   r7 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:r12 = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   pc = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp
new file mode 100644
index 0000000000000..5a62ef88ccd89
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp
@@ -0,0 +1,17 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: i386-target-arch && glibc
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: eax = {{0x[0-9a-f]+}}  ebx = {{0x[0-9a-f]+}}  ecx = {{0x[0-9a-f]+}}  edx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: edi = {{0x[0-9a-f]+}}  esi = {{0x[0-9a-f]+}}  ebp = {{0x[0-9a-f]+}}  esp = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp
new file mode 100644
index 0000000000000..aac3c3fbd1052
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: x86_64-target-arch && glibc
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: rax = {{0x[0-9a-f]+}}  rbx = {{0x[0-9a-f]+}}  rcx = {{0x[0-9a-f]+}}  rdx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: rdi = {{0x[0-9a-f]+}}  rsi = {{0x[0-9a-f]+}}  rbp = {{0x[0-9a-f]+}}  rsp = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:  r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r12 = {{0x[0-9a-f]+}}  r13 = {{0x[0-9a-f]+}}  r14 = {{0x[0-9a-f]+}}  r15 = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
index 208ece3e05af4..f1afd859c207a 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
@@ -20,7 +20,8 @@ int main(int argc, char **argv) {
   // CHECK1: SUMMARY: [[SAN]]: SEGV {{.*}}signal_line.cpp:[[@LINE-2]]:[[TAB]] in main
 
   if (n == 2)
-    *((volatile int *)0x1) = __LINE__;
+    // Allow for strict-alignment targets that require natural alignment.
+    *((volatile int *)0x8) = __LINE__;
   // CHECK2: #{{[0-9]+ .*}}main {{.*}}signal_line.cpp:[[@LINE-1]]:[[TAB:[0-9]+]]
   // CHECK2: SUMMARY: [[SAN]]: SEGV {{.*}}signal_line.cpp:[[@LINE-2]]:[[TAB]] in main
 }
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
index 035a5a8df77ae..638be63397dc6 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
@@ -62,14 +62,14 @@ void test_sigwait() {
   int res;
   res = fork_and_signal(s);
   fprintf(stderr, "fork_and_signal with SIGUSR1,2: %d\n", res);
-  // CHECK: died with sig 10
+  // CHECK: died with sig {{10|30}}
   // CHECK: fork_and_signal with SIGUSR1,2: 0
 
   // test sigandset... s should only have SIGUSR2 now
   s = sigset_and(s, mkset(1, SIGUSR2));
   res = fork_and_signal(s);
   fprintf(stderr, "fork_and_signal with SIGUSR2: %d\n", res);
-  // CHECK: died with sig 12
+  // CHECK: died with sig {{12|31}}
   // CHECK: fork_and_signal with SIGUSR2: 0
 }
 
diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp
new file mode 100644
index 0000000000000..74aea4d8b360a
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp
@@ -0,0 +1,17 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: i386-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: eax = {{0x[0-9a-f]+}}  ebx = {{0x[0-9a-f]+}}  ecx = {{0x[0-9a-f]+}}  edx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: edi = {{0x[0-9a-f]+}}  esi = {{0x[0-9a-f]+}}  ebp = {{0x[0-9a-f]+}}  esp = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp
new file mode 100644
index 0000000000000..3d11ef0e098f1
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: x86_64-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: rax = {{0x[0-9a-f]+}}  rbx = {{0x[0-9a-f]+}}  rcx = {{0x[0-9a-f]+}}  rdx = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: rdi = {{0x[0-9a-f]+}}  rsi = {{0x[0-9a-f]+}}  rbp = {{0x[0-9a-f]+}}  rsp = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:  r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r12 = {{0x[0-9a-f]+}}  r13 = {{0x[0-9a-f]+}}  r14 = {{0x[0-9a-f]+}}  r15 = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
index 286eafc315baf..21ffe1381bd46 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
@@ -14,15 +14,12 @@ int main(int argc, char **argv) {
   sprintf(buff, "%s.report_path/report", argv[0]);
   __sanitizer_set_report_path(buff);
   assert(strncmp(buff, __sanitizer_get_report_path(), strlen(buff)) == 0);
-  printf("Path %s\n", __sanitizer_get_report_path());
-  fflush(stdout);
 
   // Try setting again with an invalid/inaccessible directory.
-  sprintf(buff, "%s/report", argv[0]);
-  __sanitizer_set_report_path(buff);
-  printf("Path %s\n", __sanitizer_get_report_path());
+  char buff_bad[1000];
+  sprintf(buff_bad, "%s/report", argv[0]);
+  __sanitizer_set_report_path(buff_bad);
+  assert(strncmp(buff, __sanitizer_get_report_path(), strlen(buff)) == 0);
 }
 
-// CHECK: Path {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp.report_path/report.
 // CHECK: ERROR: Can't create directory: {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp
-// CHECK-NOT: Path {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp
diff --git a/compiler-rt/test/scudo/standalone/CMakeLists.txt b/compiler-rt/test/scudo/standalone/CMakeLists.txt
index 0034cea360f37..3e6c8ab234b7e 100644
--- a/compiler-rt/test/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/test/scudo/standalone/CMakeLists.txt
@@ -1,21 +1,19 @@
-if(COMPILER_RT_INCLUDE_TESTS)
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg.py)
+list(APPEND SCUDO_STANDALONE_TEST_DEPS ScudoUnitTests)
+list(APPEND SCUDO_STANDALONE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit)
+if (COMPILER_RT_HAS_GWP_ASAN)
   configure_lit_site_cfg(
-    ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg.py)
-  list(APPEND SCUDO_STANDALONE_TEST_DEPS ScudoUnitTests)
-  list(APPEND SCUDO_STANDALONE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit)
-  if (COMPILER_RT_HAS_GWP_ASAN)
-    configure_lit_site_cfg(
-      ${CMAKE_CURRENT_SOURCE_DIR}/unit/gwp_asan/lit.site.cfg.py.in
-      ${CMAKE_CURRENT_BINARY_DIR}/unit/gwp_asan/lit.site.cfg.py)
-    list(APPEND SCUDO_STANDALONE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit/gwp_asan)
-  endif()
+    ${CMAKE_CURRENT_SOURCE_DIR}/unit/gwp_asan/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/unit/gwp_asan/lit.site.cfg.py)
+  list(APPEND SCUDO_STANDALONE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/unit/gwp_asan)
+endif()
 
-  add_lit_testsuite(check-scudo_standalone
-    "Running Scudo Standalone tests"
-    ${SCUDO_STANDALONE_TESTSUITES}
-    DEPENDS ${SCUDO_STANDALONE_TEST_DEPS})
+add_lit_testsuite(check-scudo_standalone
+  "Running Scudo Standalone tests"
+  ${SCUDO_STANDALONE_TESTSUITES}
+  DEPENDS ${SCUDO_STANDALONE_TEST_DEPS})
 
-  set_target_properties(check-scudo_standalone
-    PROPERTIES FOLDER "Compiler-RT Tests")
-endif()
+set_target_properties(check-scudo_standalone
+  PROPERTIES FOLDER "Compiler-RT Tests")
diff --git a/compiler-rt/test/tsan/CMakeLists.txt b/compiler-rt/test/tsan/CMakeLists.txt
index 31cda53efd47f..163355d68ebc2 100644
--- a/compiler-rt/test/tsan/CMakeLists.txt
+++ b/compiler-rt/test/tsan/CMakeLists.txt
@@ -111,20 +111,18 @@ if(APPLE)
   endforeach()
 endif()
 
-if(COMPILER_RT_INCLUDE_TESTS)
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
+if(COMPILER_RT_TSAN_HAS_STATIC_RUNTIME)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py)
-  if(COMPILER_RT_TSAN_HAS_STATIC_RUNTIME)
-    configure_lit_site_cfg(
-      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-      ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic/lit.site.cfg.py)
-  endif()
-  list(APPEND TSAN_TEST_DEPS TsanUnitTests)
-  list(APPEND TSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
-  if(COMPILER_RT_TSAN_HAS_STATIC_RUNTIME)
-    list(APPEND TSAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic)
-  endif()
+    ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic/lit.site.cfg.py)
+endif()
+list(APPEND TSAN_TEST_DEPS TsanUnitTests)
+list(APPEND TSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit)
+if(COMPILER_RT_TSAN_HAS_STATIC_RUNTIME)
+  list(APPEND TSAN_DYNAMIC_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/dynamic)
 endif()
 
 add_lit_testsuite(check-tsan "Running ThreadSanitizer tests"
diff --git a/compiler-rt/test/tsan/debug_alloc_stack.cpp b/compiler-rt/test/tsan/debug_alloc_stack.cpp
index 5deed49b18b28..70e9759f0af63 100644
--- a/compiler-rt/test/tsan/debug_alloc_stack.cpp
+++ b/compiler-rt/test/tsan/debug_alloc_stack.cpp
@@ -61,11 +61,11 @@ int main() {
   fprintf(stderr, "thread os id = 0x%llx\n", thread_os_id);
   // CHECK: thread os id = [[THREAD_OS_ID]]
   fprintf(stderr, "%p\n", trace[0]);
-  // CHECK: [[ALLOC_FRAME_0:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[ALLOC_FRAME_0:[0-9a-f]+]]
   fprintf(stderr, "%p\n", trace[1]);
-  // CHECK: [[ALLOC_FRAME_1:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[ALLOC_FRAME_1:[0-9a-f]+]]
   fprintf(stderr, "%p\n", trace[2]);
-  // CHECK: [[ALLOC_FRAME_2:0x[0-9a-f]+]]
+  // CHECK: 0x{{0*}}[[ALLOC_FRAME_2:[0-9a-f]+]]
 
   pthread_create(&t, NULL, RaceThread, NULL);
   barrier_wait(&barrier);
@@ -79,6 +79,6 @@ int main() {
 
 // CHECK: WARNING: ThreadSanitizer: data race
 // CHECK: Location is heap block of size 10 at {{.*}} allocated by thread T1
-// CHECK: #0 [[ALLOC_FRAME_0]]
-// CHECK: #1 [[ALLOC_FRAME_1]] in alloc_func
-// CHECK: #2 [[ALLOC_FRAME_2]] in AllocThread
+// CHECK: #0 0x{{0*}}[[ALLOC_FRAME_0]]
+// CHECK: #1 0x{{0*}}[[ALLOC_FRAME_1]] in alloc_func
+// CHECK: #2 0x{{0*}}[[ALLOC_FRAME_2]] in AllocThread
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 774c4eaf4d976..9935fe6a199da 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -4,9 +4,6 @@
 import subprocess
 import sys
 
-# TODO: LooseVersion is undocumented; use something else.
-from distutils.version import LooseVersion
-
 import lit.formats
 import lit.util
 
@@ -279,7 +276,11 @@ def get_clang_default_dwarf_version_string(triple):
 gdb_version_string = get_gdb_version_string()
 if dwarf_version_string and gdb_version_string:
     if int(dwarf_version_string) >= 5:
-        if LooseVersion(gdb_version_string) < LooseVersion("10.1"):
+        try:
+            from packaging import version
+        except:
+            lit_config.fatal("Running gdb tests requires the packaging package")
+        if version.parse(gdb_version_string) < version.parse("10.1"):
             # Example for llgdb-tests, which use lldb on darwin but gdb elsewhere:
             # XFAIL: !system-darwin && gdb-clang-incompatibility
             config.available_features.add("gdb-clang-incompatibility")
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index cbe8f1186236a..070c39eb6e9ab 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -538,3 +538,4 @@ get_clang_resource_dir(HEADER_INSTALL_DIR SUBDIR include)
 install(
   FILES include/flang/ISO_Fortran_binding.h
   DESTINATION ${HEADER_INSTALL_DIR} )
+
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 82f9a021c14ee..093596c9dc8eb 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -134,6 +134,13 @@ end
   implicitly simply appearing in an asynchronous data transfer statement,
   without the attribute being visible in the procedure's explicit
   interface.
+* When the name of an extended derived type's base type is the
+  result of `USE` association with renaming, the name of the extended
+  derived type's parent component is the new name by which the base
+  is known in the scope of the extended derived type, not the original.
+  This interpretation has usability advantages and is what six other
+  Fortran compilers do, but is not conforming now that J3 approved an
+  "interp" in June 2024 to the contrary.
 
 ## Extensions, deletions, and legacy features supported by default
 
diff --git a/flang/docs/InternalProcedureTrampolines.md b/flang/docs/InternalProcedureTrampolines.md
index ef02f1d737c82..41f6155332a47 100644
--- a/flang/docs/InternalProcedureTrampolines.md
+++ b/flang/docs/InternalProcedureTrampolines.md
@@ -239,7 +239,7 @@ automatically deallocated at the end of `host()` invocation.
 Unfortunately, this requires the program stack to be writeable and executable
 at the same time, which might be a security concern.
 
-> NOTE: LLVM's AArch64 backend supports `nest` attribute, but it does not seem to support trampoline intrinsics.
+> NOTE: LLVM's AArch64 backend supports `nest` attribute, but it requires the compiler-rt runtime selected via the `-rtlib=compiler-rt` flag.
 
 ## Alternative implementation(s)
 
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index 7346d702b073d..938da08e19d6b 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -70,7 +70,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
     IgnoredIntrinsicFunctionType, PreviousScalarUse,
     RedeclaredInaccessibleComponent, ImplicitShared, IndexVarRedefinition,
     IncompatibleImplicitInterfaces, BadTypeForTarget,
-    VectorSubscriptFinalization)
+    VectorSubscriptFinalization, UndefinedFunctionResult)
 
 using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
 using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
@@ -144,6 +144,7 @@ class LanguageFeatureControl {
     warnUsage_.set(UsageWarning::IncompatibleImplicitInterfaces);
     warnUsage_.set(UsageWarning::BadTypeForTarget);
     warnUsage_.set(UsageWarning::VectorSubscriptFinalization);
+    warnUsage_.set(UsageWarning::UndefinedFunctionResult);
   }
   LanguageFeatureControl(const LanguageFeatureControl &) = default;
 
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 8555073a2d0d4..8c6d3b37166a9 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1243,22 +1243,30 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional<ActualArgument> &, const std::string &procName,
     const std::string &argName);
 
-// Get the number of distinct symbols with CUDA attribute in the expression.
+inline bool IsCUDADeviceSymbol(const Symbol &sym) {
+  if (const auto *details =
+          sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+    if (details->cudaDataAttr() &&
+        *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Get the number of distinct symbols with CUDA device
+// attribute in the expression.
 template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
   semantics::UnorderedSymbolSet symbols;
   for (const Symbol &sym : CollectCudaSymbols(expr)) {
-    if (const auto *details =
-            sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
-      if (details->cudaDataAttr() &&
-          *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
-        symbols.insert(sym);
-      }
+    if (IsCUDADeviceSymbol(sym)) {
+      symbols.insert(sym);
     }
   }
   return symbols.size();
 }
 
-// Check if any of the symbols part of the expression has a CUDA data
+// Check if any of the symbols part of the expression has a CUDA device
 // attribute.
 template <typename A> inline bool HasCUDADeviceAttrs(const A &expr) {
   return GetNbOfCUDADeviceSymbols(expr) > 0;
@@ -1270,26 +1278,15 @@ inline bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
   unsigned hostSymbols{0};
   unsigned deviceSymbols{0};
   for (const Symbol &sym : CollectCudaSymbols(expr)) {
-    if (const auto *details =
-            sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
-      if (details->cudaDataAttr() &&
-          *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
-        ++deviceSymbols;
-      } else {
-        if (sym.owner().IsDerivedType()) {
-          if (const auto *details =
-                  sym.owner()
-                      .GetSymbol()
-                      ->GetUltimate()
-                      .detailsIf<semantics::ObjectEntityDetails>()) {
-            if (details->cudaDataAttr() &&
-                *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
-              ++deviceSymbols;
-            }
-          }
+    if (IsCUDADeviceSymbol(sym)) {
+      ++deviceSymbols;
+    } else {
+      if (sym.owner().IsDerivedType()) {
+        if (IsCUDADeviceSymbol(sym.owner().GetSymbol()->GetUltimate())) {
+          ++deviceSymbols;
         }
-        ++hostSymbols;
       }
+      ++hostSymbols;
     }
   }
   return hostSymbols > 0 && deviceSymbols > 0;
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 757d7f2b10cba..85401f50b6849 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -30,7 +30,7 @@
 #endif
 
 /* 18.5.4 */
-#define CFI_VERSION 20180515
+#define CFI_VERSION 20240719
 
 #define CFI_MAX_RANK 15
 typedef unsigned char CFI_rank_t;
@@ -146,7 +146,9 @@ extern "C++" template <typename T> struct FlexibleArray : T {
   CFI_rank_t rank; /* [0 .. CFI_MAX_RANK] */ \
   CFI_type_t type; \
   CFI_attribute_t attribute; \
-  unsigned char f18Addendum;
+  /* This encodes both the presence of the f18Addendum and the index of the \
+   * allocator used to managed memory of the data hold by the descriptor. */ \
+  unsigned char extra;
 
 typedef struct CFI_cdesc_t {
   _CFI_CDESC_T_HEADER_MEMBERS
diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index 515f4695951b4..de394a39e112e 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -62,6 +62,14 @@ using AggregateStoreMap = llvm::DenseMap<AggregateStoreKey, mlir::Value>;
 void instantiateVariable(AbstractConverter &, const pft::Variable &var,
                          SymMap &symMap, AggregateStoreMap &storeMap);
 
+/// Does this variable have a default initialization?
+bool hasDefaultInitialization(const Fortran::semantics::Symbol &sym);
+
+/// Call default initialization runtime routine to initialize \p var.
+void defaultInitializeAtRuntime(Fortran::lower::AbstractConverter &converter,
+                                const Fortran::semantics::Symbol &sym,
+                                Fortran::lower::SymMap &symMap);
+
 /// Create a fir::GlobalOp given a module variable definition. This is intended
 /// to be used when lowering a module definition, not when lowering variables
 /// used from a module. For used variables instantiateVariable must directly be
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 80f077ad133f3..78bb82b17d405 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -330,6 +330,8 @@ struct IntrinsicLibrary {
   mlir::Value genModulo(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genMoveAlloc(llvm::ArrayRef<fir::ExtendedValue>);
   void genMvbits(llvm::ArrayRef<fir::ExtendedValue>);
+  enum class NearestProc { Nearest, NextAfter, NextDown, NextUp };
+  template <NearestProc>
   mlir::Value genNearest(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genNint(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genNorm2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
@@ -422,9 +424,12 @@ struct IntrinsicLibrary {
                                   mlir::Type resultType,
                                   llvm::ArrayRef<fir::ExtendedValue> args);
 
-  /// Generate code to raise \p except if \p cond is absent,
+  /// Generate code to raise \p excepts if \p cond is absent,
   /// or present and true.
-  void genRaiseExcept(int except, mlir::Value cond = {});
+  void genRaiseExcept(int excepts, mlir::Value cond = {});
+
+  /// Generate a quiet NaN of a given floating point type.
+  mlir::Value genQNan(mlir::Type resultType);
 
   /// Define the different FIR generators that can be mapped to intrinsic to
   /// generate the related code.
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
index 29745b8c231db..aa6e33c7440ad 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
@@ -21,10 +21,10 @@ class FirOpBuilder;
 
 namespace fir::runtime {
 
-/// Generate a runtime call to map an ieee_flag_type exception value to a
-/// libm fenv.h value.
-mlir::Value genMapException(fir::FirOpBuilder &builder, mlir::Location loc,
-                            mlir::Value except);
+/// Generate a runtime call to map a set of ieee_flag_type exceptions to a
+/// libm fenv.h excepts value.
+mlir::Value genMapExcept(fir::FirOpBuilder &builder, mlir::Location loc,
+                         mlir::Value excepts);
 
 } // namespace fir::runtime
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_EXCEPTIONS_H
diff --git a/flang/include/flang/Optimizer/CodeGen/CGOps.td b/flang/include/flang/Optimizer/CodeGen/CGOps.td
index c375edee1fa77..34c5dc07284f0 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGOps.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGOps.td
@@ -48,6 +48,7 @@ def fircg_XEmboxOp : fircg_Op<"ext_embox", [AttrSizedOperandSegments]> {
        - substring: A substring operator (offset, length) for CHARACTER.
        - LEN type parameters: A vector of runtime LEN type parameters that
          describe an correspond to the elemental derived type.
+       - allocator_idx: specify special allocator to use.
 
     The memref and shape arguments are mandatory. The rest are optional.
   }];
@@ -60,7 +61,8 @@ def fircg_XEmboxOp : fircg_Op<"ext_embox", [AttrSizedOperandSegments]> {
     Variadic<AnyCoordinateType>:$subcomponent,
     Variadic<AnyIntegerType>:$substr,
     Variadic<AnyIntegerType>:$lenParams,
-    Optional<fir_ClassType>:$sourceBox
+    Optional<fir_ClassType>:$sourceBox,
+    OptionalAttr<I32Attr>:$allocator_idx
   );
   let results = (outs BoxOrClassType);
 
@@ -78,16 +80,24 @@ def fircg_XEmboxOp : fircg_Op<"ext_embox", [AttrSizedOperandSegments]> {
     unsigned getOutRank();
 
     // The shape operands are mandatory and always start at 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned substrOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
     }
-    unsigned lenParamOffset() { return substrOffset() + getSubstr().size(); }
-    unsigned getSourceBoxOffset() {
-      return lenParamOffset() + getLenParams().size();
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getSubstrOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
+    }
+    unsigned getLenParamOperandIndex() {
+      return getSubstrOperandIndex() + getSubstr().size();
+    }
+    unsigned getSourceBoxOperandIndex() {
+      return getLenParamOperandIndex() + getLenParams().size();
     }
   }];
 }
@@ -135,12 +145,18 @@ def fircg_XReboxOp : fircg_Op<"ext_rebox", [AttrSizedOperandSegments]> {
     // The rank of the result box
     unsigned getOutRank();
 
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned substrOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getSubstrOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
     }
   }];
 }
@@ -193,14 +209,22 @@ def fircg_XArrayCoorOp : fircg_Op<"ext_array_coor", [AttrSizedOperandSegments]>
     unsigned getRank();
 
     // Shape is optional, but if it exists, it will be at offset 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned indicesOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
-    }
-    unsigned lenParamsOffset() { return indicesOffset() + getIndices().size(); }
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getIndicesOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
+    }
+    unsigned getLenParamsOperandIndex() {
+    return getIndicesOperandIndex() + getIndices().size();
+    }
   }];
 }
 
@@ -231,8 +255,10 @@ def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> {
 
   let extraClassDeclaration = [{
     // Shape is optional, but if it exists, it will be at offset 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
   }];
 }
 
diff --git a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
index 5420e48146bbf..8032fb8fe3d56 100644
--- a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
+++ b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
@@ -55,7 +55,7 @@ namespace fir {
 //                  <@type_desc_3, 20>, // rank
 //                  <@type_desc_3, 21>, // type
 //                  <@type_desc_3, 22>, // attribute
-//                  <@type_desc_3, 23>} // f18Addendum
+//                  <@type_desc_3, 23>} // extra
 //   }
 //   llvm.tbaa_type_desc @type_desc_5 {
 //       id = "CFI_cdesc_t_dim1",
@@ -65,7 +65,7 @@ namespace fir {
 //                  <@type_desc_3, 20>, // rank
 //                  <@type_desc_3, 21>, // type
 //                  <@type_desc_3, 22>, // attribute
-//                  <@type_desc_3, 23>, // f18Addendum
+//                  <@type_desc_3, 23>, // extra
 //                  <@type_desc_3, 24>, // dim[0].lower_bound
 //                  <@type_desc_3, 32>, // dim[0].extent
 //                  <@type_desc_3, 40>} // dim[0].sm
@@ -78,7 +78,7 @@ namespace fir {
 //                  <@type_desc_3, 20>, // rank
 //                  <@type_desc_3, 21>, // type
 //                  <@type_desc_3, 22>, // attribute
-//                  <@type_desc_3, 23>, // f18Addendum
+//                  <@type_desc_3, 23>, // extra
 //                  <@type_desc_3, 24>, // dim[0].lower_bound
 //                  <@type_desc_3, 32>, // dim[0].extent
 //                  <@type_desc_3, 40>, // dim[0].sm
diff --git a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
index ece3e4e073f68..7215abf3a8f81 100644
--- a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
+++ b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h
@@ -29,7 +29,7 @@ static constexpr unsigned kVersionPosInBox = 2;
 static constexpr unsigned kRankPosInBox = 3;
 static constexpr unsigned kTypePosInBox = 4;
 static constexpr unsigned kAttributePosInBox = 5;
-static constexpr unsigned kF18AddendumPosInBox = 6;
+static constexpr unsigned kExtraPosInBox = 6;
 static constexpr unsigned kDimsPosInBox = 7;
 static constexpr unsigned kOptTypePtrPosInBox = 8;
 static constexpr unsigned kOptRowTypePosInBox = 9;
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index aedb6769186e9..60281dfa63713 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -111,4 +111,17 @@ def fir_LowerBoundModifierAttribute : I32EnumAttr<
   let cppNamespace = "::fir";
 }
 
+def fir_LocationKind : I32EnumAttr<"LocationKind", "Flang location kind",
+  [
+    I32EnumAttrCase<"Base", 0, "base">,
+    I32EnumAttrCase<"Inclusion", 1, "inclusion">,
+  ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::fir";
+}
+def fir_LocationKindAttr : EnumAttr<FIROpsDialect, fir_LocationKind, "loc_kind">;
+
+def LocationKindArrayAttr : ArrayOfAttr<FIROpsDialect, "LocationKindArray",
+    "loc_kind_array", "LocationKindAttr">;
+
 #endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 89c13fa7cebe6..7856fa7d90184 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -783,6 +783,7 @@ def fir_EmboxOp : fir_Op<"embox", [NoMemoryEffect, AttrSizedOperandSegments]> {
         - slice: an array section can be described with a slice triple,
         - typeparams: for emboxing a derived type with LEN type parameters,
         - accessMap: unused/experimental.
+        - allocator_idx: specify special allocator to use.
   }];
 
   let arguments = (ins
@@ -791,7 +792,8 @@ def fir_EmboxOp : fir_Op<"embox", [NoMemoryEffect, AttrSizedOperandSegments]> {
     Optional<fir_SliceType>:$slice,
     Variadic<AnyIntegerType>:$typeparams,
     Optional<fir_ClassType>:$sourceBox,
-    OptionalAttr<AffineMapAttr>:$accessMap
+    OptionalAttr<AffineMapAttr>:$accessMap,
+    OptionalAttr<I32Attr>:$allocator_idx
   );
 
   let results = (outs BoxOrClassType);
@@ -801,9 +803,11 @@ def fir_EmboxOp : fir_Op<"embox", [NoMemoryEffect, AttrSizedOperandSegments]> {
       "mlir::Value":$memref, CArg<"mlir::Value", "{}">:$shape,
       CArg<"mlir::Value", "{}">:$slice,
       CArg<"mlir::ValueRange", "{}">:$typeparams,
-      CArg<"mlir::Value", "{}">:$sourceBox),
+      CArg<"mlir::Value", "{}">:$sourceBox,
+      CArg<"mlir::IntegerAttr", "{}">:$allocator_idx),
     [{ return build($_builder, $_state, resultTypes, memref, shape, slice,
-                    typeparams, sourceBox, mlir::AffineMapAttr{}); }]>
+                    typeparams, sourceBox, mlir::AffineMapAttr{},
+                    allocator_idx); }]>
   ];
 
   let assemblyFormat = [{
@@ -817,7 +821,7 @@ def fir_EmboxOp : fir_Op<"embox", [NoMemoryEffect, AttrSizedOperandSegments]> {
   let extraClassDeclaration = [{
     bool hasLenParams() { return !getTypeparams().empty(); }
     unsigned numLenParams() { return getTypeparams().size(); }
-    unsigned getSourceBoxOffset() {
+    unsigned getSourceBoxOperandIndex() {
       return 1 + (getShape() ? 1 : 0) + (getSlice() ? 1 : 0)
           + numLenParams();
     }
diff --git a/flang/include/flang/Parser/provenance.h b/flang/include/flang/Parser/provenance.h
index 3189e012cd495..a9224b727fd05 100644
--- a/flang/include/flang/Parser/provenance.h
+++ b/flang/include/flang/Parser/provenance.h
@@ -172,6 +172,8 @@ class AllSources {
   }
   void setShowColors(bool showColors) { showColors_ = showColors; }
   bool getShowColors() const { return showColors_; }
+  std::optional<ProvenanceRange> GetInclusionInfo(
+      const std::optional<ProvenanceRange> &) const;
   void EmitMessage(llvm::raw_ostream &, const std::optional<ProvenanceRange> &,
       const std::string &message, const std::string &prefix,
       llvm::raw_ostream::Colors color, bool echoSourceLine = false) const;
diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h
new file mode 100644
index 0000000000000..c481bec8e8e51
--- /dev/null
+++ b/flang/include/flang/Runtime/allocator-registry.h
@@ -0,0 +1,59 @@
+//===-- runtime/allocator-registry.h ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_ALLOCATOR_H_
+#define FORTRAN_RUNTIME_ALLOCATOR_H_
+
+#include "flang/Common/api-attrs.h"
+#include <cstdlib>
+#include <vector>
+
+static constexpr unsigned kDefaultAllocator = 0;
+
+#define MAX_ALLOCATOR 5
+
+namespace Fortran::runtime {
+
+using AllocFct = void *(*)(std::size_t);
+using FreeFct = void (*)(void *);
+
+typedef struct Allocator_t {
+  AllocFct alloc{nullptr};
+  FreeFct free{nullptr};
+} Allocator_t;
+
+#ifdef RT_DEVICE_COMPILATION
+static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+  return std::malloc(size);
+}
+static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
+#endif
+
+struct AllocatorRegistry {
+#ifdef RT_DEVICE_COMPILATION
+  RT_API_ATTRS constexpr AllocatorRegistry()
+      : allocators{{&MallocWrapper, &FreeWrapper}} {}
+#else
+  constexpr AllocatorRegistry() {
+    allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+  };
+#endif
+  RT_API_ATTRS void Register(int, Allocator_t);
+  RT_API_ATTRS AllocFct GetAllocator(int pos);
+  RT_API_ATTRS FreeFct GetDeallocator(int pos);
+
+  Allocator_t allocators[MAX_ALLOCATOR];
+};
+
+RT_OFFLOAD_VAR_GROUP_BEGIN
+extern RT_VAR_ATTRS AllocatorRegistry allocatorRegistry;
+RT_OFFLOAD_VAR_GROUP_END
+
+} // namespace Fortran::runtime
+
+#endif // FORTRAN_RUNTIME_ALLOCATOR_H_
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 1b0b7e23ce6cc..043f6931afad9 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -92,8 +92,8 @@ class Dimension {
 // The storage for this object follows the last used dim[] entry in a
 // Descriptor (CFI_cdesc_t) generic descriptor.  Space matters here, since
 // descriptors serve as POINTER and ALLOCATABLE components of derived type
-// instances.  The presence of this structure is implied by the flag
-// CFI_cdesc_t.f18Addendum, and the number of elements in the len_[]
+// instances.  The presence of this structure is encoded in the
+// CFI_cdesc_t.extra field, and the number of elements in the len_[]
 // array is determined by derivedType_->LenParameters().
 class DescriptorAddendum {
 public:
@@ -339,14 +339,14 @@ class Descriptor {
       const SubscriptValue *, const int *permutation = nullptr) const;
 
   RT_API_ATTRS DescriptorAddendum *Addendum() {
-    if (raw_.f18Addendum != 0) {
+    if (HasAddendum()) {
       return reinterpret_cast<DescriptorAddendum *>(&GetDimension(rank()));
     } else {
       return nullptr;
     }
   }
   RT_API_ATTRS const DescriptorAddendum *Addendum() const {
-    if (raw_.f18Addendum != 0) {
+    if (HasAddendum()) {
       return reinterpret_cast<const DescriptorAddendum *>(
           &GetDimension(rank()));
     } else {
@@ -420,6 +420,27 @@ class Descriptor {
 
   void Dump(FILE * = stdout) const;
 
+// Value of the addendum presence flag.
+#define _CFI_ADDENDUM_FLAG 1
+// Number of bits needed to be shifted when manipulating the allocator index.
+#define _CFI_ALLOCATOR_IDX_SHIFT 1
+// Allocator index mask.
+#define _CFI_ALLOCATOR_IDX_MASK 0b00001110
+
+  RT_API_ATTRS inline bool HasAddendum() const {
+    return raw_.extra & _CFI_ADDENDUM_FLAG;
+  }
+  RT_API_ATTRS inline void SetHasAddendum() {
+    raw_.extra |= _CFI_ADDENDUM_FLAG;
+  }
+  RT_API_ATTRS inline int GetAllocIdx() const {
+    return (raw_.extra & _CFI_ALLOCATOR_IDX_MASK) >> _CFI_ALLOCATOR_IDX_SHIFT;
+  }
+  RT_API_ATTRS inline void SetAllocIdx(int pos) {
+    raw_.extra &= ~_CFI_ALLOCATOR_IDX_MASK; // Clear the allocator index bits.
+    raw_.extra |= (pos << _CFI_ALLOCATOR_IDX_SHIFT);
+  }
+
 private:
   ISO::CFI_cdesc_t raw_;
 };
diff --git a/flang/include/flang/Runtime/exceptions.h b/flang/include/flang/Runtime/exceptions.h
index 8f806ab9ad98a..1ab22da103a50 100644
--- a/flang/include/flang/Runtime/exceptions.h
+++ b/flang/include/flang/Runtime/exceptions.h
@@ -12,7 +12,6 @@
 #define FORTRAN_RUNTIME_EXCEPTIONS_H_
 
 #include "flang/Runtime/entry-names.h"
-#include "flang/Runtime/magic-numbers.h"
 #include <cinttypes>
 
 namespace Fortran::runtime {
@@ -21,11 +20,9 @@ class Descriptor;
 
 extern "C" {
 
-// Map a (single) IEEE_FLAG_TYPE exception value to a libm fenv.h value.
-// This could be extended to handle sets of exceptions, but there is no
-// current use case for that. This mapping is done at runtime to support
-// cross compilation.
-std::int32_t RTNAME(MapException)(std::int32_t except);
+// Map a set of IEEE_FLAG_TYPE exception values to a libm fenv.h excepts value.
+// This mapping is done at runtime to support cross compilation.
+std::uint32_t RTNAME(MapException)(std::uint32_t excepts);
 
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h
index 1cded1fd63238..bab0e9ae05299 100644
--- a/flang/include/flang/Runtime/magic-numbers.h
+++ b/flang/include/flang/Runtime/magic-numbers.h
@@ -100,6 +100,10 @@ The denorm value is a nonstandard extension.
 #define _FORTRAN_RUNTIME_IEEE_OVERFLOW 8
 #define _FORTRAN_RUNTIME_IEEE_UNDERFLOW 16
 #define _FORTRAN_RUNTIME_IEEE_INEXACT 32
+#define _FORTRAN_RUNTIME_IEEE_ALL \
+  _FORTRAN_RUNTIME_IEEE_INVALID | _FORTRAN_RUNTIME_IEEE_DENORM | \
+      _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO | _FORTRAN_RUNTIME_IEEE_OVERFLOW | \
+      _FORTRAN_RUNTIME_IEEE_UNDERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT
 
 #if 0
 ieee_round_type values
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
index 3ee71fe485524..ec8d12b0f9865 100644
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -254,6 +254,9 @@ class SemanticsContext {
   // behavior.
   CommonBlockList GetCommonBlocks() const;
 
+  void NoteDefinedSymbol(const Symbol &);
+  bool IsSymbolDefined(const Symbol &) const;
+
 private:
   struct ScopeIndexComparator {
     bool operator()(parser::CharBlock, parser::CharBlock) const;
@@ -303,6 +306,7 @@ class SemanticsContext {
   std::unique_ptr<CommonBlockMap> commonBlockMap_;
   ModuleDependences moduleDependences_;
   std::map<const Symbol *, SourceName> moduleFileOutputRenamings_;
+  UnorderedSymbolSet isDefined_;
 };
 
 class Semantics {
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index cdbe3e39386bb..cf0350735b5b9 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -460,15 +460,19 @@ class ProcEntityDetails : public EntityDetails, public WithPassArg {
 // and specialized for each distinct set of type parameter values.
 class DerivedTypeDetails {
 public:
-  const std::list<SourceName> &paramNames() const { return paramNames_; }
-  const SymbolVector &paramDecls() const { return paramDecls_; }
+  const SymbolVector &paramNameOrder() const { return paramNameOrder_; }
+  const SymbolVector &paramDeclOrder() const { return paramDeclOrder_; }
   bool sequence() const { return sequence_; }
   bool isDECStructure() const { return isDECStructure_; }
   std::map<SourceName, SymbolRef> &finals() { return finals_; }
   const std::map<SourceName, SymbolRef> &finals() const { return finals_; }
   bool isForwardReferenced() const { return isForwardReferenced_; }
-  void add_paramName(const SourceName &name) { paramNames_.push_back(name); }
-  void add_paramDecl(const Symbol &symbol) { paramDecls_.push_back(symbol); }
+  void add_paramNameOrder(const Symbol &symbol) {
+    paramNameOrder_.push_back(symbol);
+  }
+  void add_paramDeclOrder(const Symbol &symbol) {
+    paramDeclOrder_.push_back(symbol);
+  }
   void add_component(const Symbol &);
   void set_sequence(bool x = true) { sequence_ = x; }
   void set_isDECStructure(bool x = true) { isDECStructure_ = x; }
@@ -491,12 +495,12 @@ class DerivedTypeDetails {
   const Symbol *GetFinalForRank(int) const;
 
 private:
-  // These are (1) the names of the derived type parameters in the order
+  // These are (1) the symbols of the derived type parameters in the order
   // in which they appear on the type definition statement(s), and (2) the
   // symbols that correspond to those names in the order in which their
   // declarations appear in the derived type definition(s).
-  std::list<SourceName> paramNames_;
-  SymbolVector paramDecls_;
+  SymbolVector paramNameOrder_;
+  SymbolVector paramDeclOrder_;
   // These are the names of the derived type's components in component
   // order.  A parent component, if any, appears first in this list.
   std::list<SourceName> componentNames_;
@@ -565,18 +569,19 @@ class MiscDetails {
 
 class TypeParamDetails {
 public:
-  explicit TypeParamDetails(common::TypeParamAttr attr) : attr_{attr} {}
+  TypeParamDetails() = default;
   TypeParamDetails(const TypeParamDetails &) = default;
-  common::TypeParamAttr attr() const { return attr_; }
+  std::optional<common::TypeParamAttr> attr() const { return attr_; }
+  TypeParamDetails &set_attr(common::TypeParamAttr);
   MaybeIntExpr &init() { return init_; }
   const MaybeIntExpr &init() const { return init_; }
   void set_init(MaybeIntExpr &&expr) { init_ = std::move(expr); }
   const DeclTypeSpec *type() const { return type_; }
-  void set_type(const DeclTypeSpec &);
+  TypeParamDetails &set_type(const DeclTypeSpec &);
   void ReplaceType(const DeclTypeSpec &);
 
 private:
-  common::TypeParamAttr attr_;
+  std::optional<common::TypeParamAttr> attr_;
   MaybeIntExpr init_;
   const DeclTypeSpec *type_{nullptr};
 };
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 0fcba3131fad1..ec275f349e81b 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -52,7 +52,6 @@ const Symbol *FindPointerComponent(const DeclTypeSpec &);
 const Symbol *FindPointerComponent(const Symbol &);
 const Symbol *FindInterface(const Symbol &);
 const Symbol *FindSubprogram(const Symbol &);
-const Symbol *FindFunctionResult(const Symbol &);
 const Symbol *FindOverriddenBinding(
     const Symbol &, bool &isInaccessibleDeferred);
 const Symbol *FindGlobal(const Symbol &);
@@ -87,6 +86,7 @@ bool IsIntrinsicConcat(
 bool IsGenericDefinedOp(const Symbol &);
 bool IsDefinedOperator(SourceName);
 std::string MakeOpName(SourceName);
+bool IsCommonBlockContaining(const Symbol &, const Symbol &);
 
 // Returns true if maybeAncestor exists and is a proper ancestor of a
 // descendent scope (or symbol owner).  Will be false, unlike Scope::Contains(),
@@ -261,7 +261,7 @@ std::optional<parser::MessageFixedText> GetImageControlStmtCoarrayMsg(
 SymbolVector OrderParameterDeclarations(const Symbol &);
 // Returns the complete list of derived type parameter names in the
 // order defined by 7.5.3.2.
-std::list<SourceName> OrderParameterNames(const Symbol &);
+SymbolVector OrderParameterNames(const Symbol &);
 
 // Return an existing or new derived type instance
 const DeclTypeSpec &FindOrInstantiateDerivedType(Scope &, DerivedTypeSpec &&,
diff --git a/flang/lib/Evaluate/fold-character.cpp b/flang/lib/Evaluate/fold-character.cpp
index 877bc2eac1fc2..5bdfa539eb0e0 100644
--- a/flang/lib/Evaluate/fold-character.cpp
+++ b/flang/lib/Evaluate/fold-character.cpp
@@ -97,7 +97,7 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
     return Expr<T>{Constant<T>{CharacterUtils<KIND>::NEW_LINE()}};
   } else if (name == "repeat") { // not elemental
     if (auto scalars{GetScalarConstantArguments<T, SubscriptInteger>(
-            context, funcRef.arguments())}) {
+            context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
       auto str{std::get<Scalar<T>>(*scalars)};
       auto n{std::get<Scalar<SubscriptInteger>>(*scalars).ToInt64()};
       if (n < 0) {
@@ -117,8 +117,8 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
       }
     }
   } else if (name == "trim") { // not elemental
-    if (auto scalar{
-            GetScalarConstantArguments<T>(context, funcRef.arguments())}) {
+    if (auto scalar{GetScalarConstantArguments<T>(
+            context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
       return Expr<T>{Constant<T>{
           CharacterUtils<KIND>::TRIM(std::get<Scalar<T>>(*scalar))}};
     }
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index d5c393140c574..9ce0edbdcb779 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -54,7 +54,8 @@ static constexpr bool useKahanSummation{false};
 // Utilities
 template <typename T> class Folder {
 public:
-  explicit Folder(FoldingContext &c) : context_{c} {}
+  explicit Folder(FoldingContext &c, bool forOptionalArgument = false)
+      : context_{c}, forOptionalArgument_{forOptionalArgument} {}
   std::optional<Constant<T>> GetNamedConstant(const Symbol &);
   std::optional<Constant<T>> ApplySubscripts(const Constant<T> &array,
       const std::vector<Constant<SubscriptInteger>> &subscripts);
@@ -81,6 +82,7 @@ template <typename T> class Folder {
 
 private:
   FoldingContext &context_;
+  bool forOptionalArgument_{false};
 };
 
 std::optional<Constant<SubscriptInteger>> GetConstantSubscript(
@@ -407,7 +409,14 @@ Constant<T> *Folder<T>::Folding(std::optional<ActualArgument> &arg) {
   if (auto *expr{UnwrapExpr<Expr<SomeType>>(arg)}) {
     if constexpr (T::category != TypeCategory::Derived) {
       if (!UnwrapExpr<Expr<T>>(*expr)) {
-        if (auto converted{ConvertToType(T::GetType(), std::move(*expr))}) {
+        if (const Symbol *
+                var{forOptionalArgument_
+                        ? UnwrapWholeSymbolOrComponentDataRef(*expr)
+                        : nullptr};
+            var && (IsOptional(*var) || IsAllocatableOrObjectPointer(var))) {
+          // can't safely convert item that may not be present
+        } else if (auto converted{
+                       ConvertToType(T::GetType(), std::move(*expr))}) {
           *expr = Fold(context_, std::move(*converted));
         }
       }
@@ -420,10 +429,10 @@ Constant<T> *Folder<T>::Folding(std::optional<ActualArgument> &arg) {
 template <typename... A, std::size_t... I>
 std::optional<std::tuple<const Constant<A> *...>> GetConstantArgumentsHelper(
     FoldingContext &context, ActualArguments &arguments,
-    std::index_sequence<I...>) {
+    bool hasOptionalArgument, std::index_sequence<I...>) {
   static_assert(sizeof...(A) > 0);
   std::tuple<const Constant<A> *...> args{
-      Folder<A>{context}.Folding(arguments.at(I))...};
+      Folder<A>{context, hasOptionalArgument}.Folding(arguments.at(I))...};
   if ((... && (std::get<I>(args)))) {
     return args;
   } else {
@@ -433,15 +442,17 @@ std::optional<std::tuple<const Constant<A> *...>> GetConstantArgumentsHelper(
 
 template <typename... A>
 std::optional<std::tuple<const Constant<A> *...>> GetConstantArguments(
-    FoldingContext &context, ActualArguments &args) {
+    FoldingContext &context, ActualArguments &args, bool hasOptionalArgument) {
   return GetConstantArgumentsHelper<A...>(
-      context, args, std::index_sequence_for<A...>{});
+      context, args, hasOptionalArgument, std::index_sequence_for<A...>{});
 }
 
 template <typename... A, std::size_t... I>
 std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArgumentsHelper(
-    FoldingContext &context, ActualArguments &args, std::index_sequence<I...>) {
-  if (auto constArgs{GetConstantArguments<A...>(context, args)}) {
+    FoldingContext &context, ActualArguments &args, bool hasOptionalArgument,
+    std::index_sequence<I...>) {
+  if (auto constArgs{
+          GetConstantArguments<A...>(context, args, hasOptionalArgument)}) {
     return std::tuple<Scalar<A>...>{
         std::get<I>(*constArgs)->GetScalarValue().value()...};
   } else {
@@ -451,9 +462,9 @@ std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArgumentsHelper(
 
 template <typename... A>
 std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArguments(
-    FoldingContext &context, ActualArguments &args) {
+    FoldingContext &context, ActualArguments &args, bool hasOptionalArgument) {
   return GetScalarConstantArgumentsHelper<A...>(
-      context, args, std::index_sequence_for<A...>{});
+      context, args, hasOptionalArgument, std::index_sequence_for<A...>{});
 }
 
 // helpers to fold intrinsic function references
@@ -470,9 +481,10 @@ template <template <typename, typename...> typename WrapperType, typename TR,
     typename... TA, std::size_t... I>
 Expr<TR> FoldElementalIntrinsicHelper(FoldingContext &context,
     FunctionRef<TR> &&funcRef, WrapperType<TR, TA...> func,
-    std::index_sequence<I...>) {
+    bool hasOptionalArgument, std::index_sequence<I...>) {
   if (std::optional<std::tuple<const Constant<TA> *...>> args{
-          GetConstantArguments<TA...>(context, funcRef.arguments())}) {
+          GetConstantArguments<TA...>(
+              context, funcRef.arguments(), hasOptionalArgument)}) {
     // Compute the shape of the result based on shapes of arguments
     ConstantSubscripts shape;
     int rank{0};
@@ -542,15 +554,19 @@ Expr<TR> FoldElementalIntrinsicHelper(FoldingContext &context,
 
 template <typename TR, typename... TA>
 Expr<TR> FoldElementalIntrinsic(FoldingContext &context,
-    FunctionRef<TR> &&funcRef, ScalarFunc<TR, TA...> func) {
-  return FoldElementalIntrinsicHelper<ScalarFunc, TR, TA...>(
-      context, std::move(funcRef), func, std::index_sequence_for<TA...>{});
+    FunctionRef<TR> &&funcRef, ScalarFunc<TR, TA...> func,
+    bool hasOptionalArgument = false) {
+  return FoldElementalIntrinsicHelper<ScalarFunc, TR, TA...>(context,
+      std::move(funcRef), func, hasOptionalArgument,
+      std::index_sequence_for<TA...>{});
 }
 template <typename TR, typename... TA>
 Expr<TR> FoldElementalIntrinsic(FoldingContext &context,
-    FunctionRef<TR> &&funcRef, ScalarFuncWithContext<TR, TA...> func) {
-  return FoldElementalIntrinsicHelper<ScalarFuncWithContext, TR, TA...>(
-      context, std::move(funcRef), func, std::index_sequence_for<TA...>{});
+    FunctionRef<TR> &&funcRef, ScalarFuncWithContext<TR, TA...> func,
+    bool hasOptionalArgument = false) {
+  return FoldElementalIntrinsicHelper<ScalarFuncWithContext, TR, TA...>(context,
+      std::move(funcRef), func, hasOptionalArgument,
+      std::index_sequence_for<TA...>{});
 }
 
 std::optional<std::int64_t> GetInt64ArgOr(
diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp
index 981cdff7f350b..821fa4e5dadfd 100644
--- a/flang/lib/Evaluate/fold-integer.cpp
+++ b/flang/lib/Evaluate/fold-integer.cpp
@@ -347,7 +347,8 @@ template <WhichLocation WHICH> class LocationHelper {
     bool back{false};
     if (arg_[backArg]) {
       const auto *backConst{
-          Folder<LogicalResult>{context_}.Folding(arg_[backArg])};
+          Folder<LogicalResult>{context_, /*forOptionalArgument=*/true}.Folding(
+              arg_[backArg])};
       if (backConst) {
         back = backConst->GetScalarValue().value().IsTrue();
       } else {
@@ -910,8 +911,10 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     const auto *argCon{Folder<T>(context).Folding(args[0])};
     const auto *shiftCon{Folder<Int4>(context).Folding(args[1])};
     const auto *shiftVals{shiftCon ? &shiftCon->values() : nullptr};
-    const auto *sizeCon{
-        args.size() == 3 ? Folder<Int4>(context).Folding(args[2]) : nullptr};
+    const auto *sizeCon{args.size() == 3
+            ? Folder<Int4>{context, /*forOptionalArgument=*/true}.Folding(
+                  args[2])
+            : nullptr};
     const auto *sizeVals{sizeCon ? &sizeCon->values() : nullptr};
     if ((argCon && argCon->empty()) || !shiftVals || shiftVals->empty() ||
         (sizeVals && sizeVals->empty())) {
@@ -985,7 +988,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
                 auto shiftVal{static_cast<int>(shift.ToInt64())};
                 auto sizeVal{static_cast<int>(size.ToInt64())};
                 return i.ISHFTC(shiftVal, sizeVal);
-              }));
+              }),
+          /*hasOptionalArgument=*/true);
     }
   } else if (name == "izext" || name == "jzext") {
     if (args.size() == 1) {
@@ -1208,20 +1212,23 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
           cx->u)};
     }
   } else if (name == "rank") {
-    if (const auto *array{UnwrapExpr<Expr<SomeType>>(args[0])}) {
-      if (auto named{ExtractNamedEntity(*array)}) {
-        const Symbol &symbol{named->GetLastSymbol()};
-        if (IsAssumedRank(symbol)) {
-          // DescriptorInquiry can only be placed in expression of kind
-          // DescriptorInquiry::Result::kind.
-          return ConvertToType<T>(Expr<
-              Type<TypeCategory::Integer, DescriptorInquiry::Result::kind>>{
-              DescriptorInquiry{*named, DescriptorInquiry::Field::Rank}});
-        }
+    if (args[0]) {
+      const Symbol *symbol{nullptr};
+      if (auto dataRef{ExtractDataRef(args[0])}) {
+        symbol = &dataRef->GetLastSymbol();
+      } else {
+        symbol = args[0]->GetAssumedTypeDummy();
+      }
+      if (symbol && IsAssumedRank(*symbol)) {
+        // DescriptorInquiry can only be placed in expression of kind
+        // DescriptorInquiry::Result::kind.
+        return ConvertToType<T>(
+            Expr<Type<TypeCategory::Integer, DescriptorInquiry::Result::kind>>{
+                DescriptorInquiry{
+                    NamedEntity{*symbol}, DescriptorInquiry::Field::Rank}});
       }
-      return Expr<T>{args[0].value().Rank()};
+      return Expr<T>{args[0]->Rank()};
     }
-    return Expr<T>{args[0].value().Rank()};
   } else if (name == "selected_char_kind") {
     if (const auto *chCon{UnwrapExpr<Constant<TypeOf<std::string>>>(args[0])}) {
       if (std::optional<std::string> value{chCon->GetScalarValue()}) {
diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp
index 69c7a924cc1c3..2dd08a7d1a6e1 100644
--- a/flang/lib/Evaluate/fold-real.cpp
+++ b/flang/lib/Evaluate/fold-real.cpp
@@ -20,8 +20,8 @@ static Expr<T> FoldTransformationalBessel(
   /// arguments to Int4, any overflow error will be reported during the
   /// conversion folding.
   using Int4 = Type<TypeCategory::Integer, 4>;
-  if (auto args{
-          GetConstantArguments<Int4, Int4, T>(context, funcRef.arguments())}) {
+  if (auto args{GetConstantArguments<Int4, Int4, T>(
+          context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
     const std::string &name{std::get<SpecificIntrinsic>(funcRef.proc().u).name};
     if (auto elementalBessel{GetHostRuntimeWrapper<T, Int4, T>(name)}) {
       std::vector<Scalar<T>> results;
diff --git a/flang/lib/Evaluate/real.cpp b/flang/lib/Evaluate/real.cpp
index 223f67fee41df..a5f8070c684fe 100644
--- a/flang/lib/Evaluate/real.cpp
+++ b/flang/lib/Evaluate/real.cpp
@@ -330,12 +330,12 @@ ValueWithRealFlags<Real<W, P>> Real<W, P>::SQRT(Rounding rounding) const {
 template <typename W, int P>
 ValueWithRealFlags<Real<W, P>> Real<W, P>::NEAREST(bool upward) const {
   ValueWithRealFlags<Real> result;
+  bool isNegative{IsNegative()};
   if (IsFinite()) {
     Fraction fraction{GetFraction()};
     int expo{Exponent()};
     Fraction one{1};
     Fraction nearest;
-    bool isNegative{IsNegative()};
     if (upward != isNegative) { // upward in magnitude
       auto next{fraction.AddUnsigned(one)};
       if (next.carry) {
@@ -359,6 +359,8 @@ ValueWithRealFlags<Real<W, P>> Real<W, P>::NEAREST(bool upward) const {
       }
     }
     result.flags = result.value.Normalize(isNegative, expo, nearest);
+  } else if (IsInfinite() && upward == isNegative) {
+    result.value = isNegative ? HUGE().Negate() : HUGE(); // largest mag finite
   } else {
     result.flags.set(RealFlag::InvalidArgument);
     result.value = *this;
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index 247386a365de9..b074ae6d811ac 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -250,7 +250,8 @@ DescriptorInquiry::DescriptorInquiry(NamedEntity &&base, Field field, int dim)
   const Symbol &last{base_.GetLastSymbol()};
   CHECK(IsDescriptor(last));
   CHECK((field == Field::Len && dim == 0) ||
-      (field != Field::Len && dim >= 0 && dim < last.Rank()));
+      (field != Field::Len && dim >= 0 &&
+          (dim < last.Rank() || IsAssumedRank(last))));
 }
 
 // LEN()
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 8c892d9d032e1..7c7f003cf731c 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1072,7 +1072,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(fpContractMode);
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_menable_no_infinities)) {
+  if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) {
     opts.NoHonorInfs = true;
   }
 
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 77e02898ac9fb..f40c15db93db8 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -1104,14 +1104,14 @@ void Fortran::lower::associateMutableBox(
 bool Fortran::lower::isWholeAllocatable(const Fortran::lower::SomeExpr &expr) {
   if (const Fortran::semantics::Symbol *sym =
           Fortran::evaluate::UnwrapWholeSymbolOrComponentDataRef(expr))
-    return Fortran::semantics::IsAllocatable(*sym);
+    return Fortran::semantics::IsAllocatable(sym->GetUltimate());
   return false;
 }
 
 bool Fortran::lower::isWholePointer(const Fortran::lower::SomeExpr &expr) {
   if (const Fortran::semantics::Symbol *sym =
           Fortran::evaluate::UnwrapWholeSymbolOrComponentDataRef(expr))
-    return Fortran::semantics::IsPointer(*sym);
+    return Fortran::semantics::IsPointer(sym->GetUltimate());
   return false;
 }
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index a4043744c6386..3d5b2e2a2fe02 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -939,24 +939,59 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     return mlir::UnknownLoc::get(&getMLIRContext());
   }
 
+  static mlir::Location genLocation(Fortran::parser::SourcePosition pos,
+                                    mlir::MLIRContext &ctx) {
+    llvm::SmallString<256> path(*pos.path);
+    llvm::sys::fs::make_absolute(path);
+    llvm::sys::path::remove_dots(path);
+    return mlir::FileLineColLoc::get(&ctx, path.str(), pos.line, pos.column);
+  }
+
   /// Generate a `Location` from the `CharBlock`.
   mlir::Location
   genLocation(const Fortran::parser::CharBlock &block) override final {
+    mlir::Location mainLocation = genUnknownLocation();
     if (const Fortran::parser::AllCookedSources *cooked =
             bridge.getCookedSource()) {
       if (std::optional<Fortran::parser::ProvenanceRange> provenance =
               cooked->GetProvenanceRange(block)) {
         if (std::optional<Fortran::parser::SourcePosition> filePos =
-                cooked->allSources().GetSourcePosition(provenance->start())) {
-          llvm::SmallString<256> filePath(*filePos->path);
-          llvm::sys::fs::make_absolute(filePath);
-          llvm::sys::path::remove_dots(filePath);
-          return mlir::FileLineColLoc::get(&getMLIRContext(), filePath.str(),
-                                           filePos->line, filePos->column);
+                cooked->allSources().GetSourcePosition(provenance->start()))
+          mainLocation = genLocation(*filePos, getMLIRContext());
+
+        llvm::SmallVector<mlir::Location> locs;
+        locs.push_back(mainLocation);
+
+        llvm::SmallVector<fir::LocationKindAttr> locAttrs;
+        locAttrs.push_back(fir::LocationKindAttr::get(&getMLIRContext(),
+                                                      fir::LocationKind::Base));
+
+        // Gather include location information if any.
+        Fortran::parser::ProvenanceRange *prov = &*provenance;
+        while (prov) {
+          if (std::optional<Fortran::parser::ProvenanceRange> include =
+                  cooked->allSources().GetInclusionInfo(*prov)) {
+            if (std::optional<Fortran::parser::SourcePosition> incPos =
+                    cooked->allSources().GetSourcePosition(include->start())) {
+              locs.push_back(genLocation(*incPos, getMLIRContext()));
+              locAttrs.push_back(fir::LocationKindAttr::get(
+                  &getMLIRContext(), fir::LocationKind::Inclusion));
+            }
+            prov = &*include;
+          } else {
+            prov = nullptr;
+          }
+        }
+        if (locs.size() > 1) {
+          assert(locs.size() == locAttrs.size() &&
+                 "expect as many attributes as locations");
+          return mlir::FusedLocWith<fir::LocationKindArrayAttr>::get(
+              &getMLIRContext(), locs,
+              fir::LocationKindArrayAttr::get(&getMLIRContext(), locAttrs));
         }
       }
     }
-    return genUnknownLocation();
+    return mainLocation;
   }
 
   const Fortran::semantics::Scope &getCurrentScope() override final {
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index ba65b644e5a93..fd873f55dd844 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -1227,26 +1227,32 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
     return hlfir::Entity{copyIn.getCopiedIn()};
   };
 
+  auto genSetDynamicTypeToDummyType = [&](hlfir::Entity var) -> hlfir::Entity {
+    fir::BaseBoxType boxType = fir::BoxType::get(
+        hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
+    if (actualIsAssumedRank)
+      return hlfir::Entity{builder.create<fir::ReboxAssumedRankOp>(
+          loc, boxType, var, fir::LowerBoundModifierAttribute::SetToOnes)};
+    // Use actual shape when creating descriptor with dummy type, the dummy
+    // shape may be unknown in case of sequence association.
+    mlir::Type actualTy =
+        hlfir::getFortranElementOrSequenceType(actual.getType());
+    boxType = boxType.getBoxTypeWithNewShape(actualTy);
+    return hlfir::Entity{builder.create<fir::ReboxOp>(loc, boxType, var,
+                                                      /*shape=*/mlir::Value{},
+                                                      /*slice=*/mlir::Value{})};
+  };
+
   // Step 2: prepare the storage for the dummy arguments, ensuring that it
   // matches the dummy requirements (e.g., must be contiguous or must be
   // a temporary).
   hlfir::Entity entity =
       hlfir::derefPointersAndAllocatables(loc, builder, actual);
   if (entity.isVariable()) {
-    if (mustSetDynamicTypeToDummyType) {
-      // Note: this is important to do this before any copy-in or copy so
-      // that the dummy is contiguous according to the dummy type.
-      mlir::Type boxType = fir::BoxType::get(
-          hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
-      if (actualIsAssumedRank) {
-        entity = hlfir::Entity{builder.create<fir::ReboxAssumedRankOp>(
-            loc, boxType, entity, fir::LowerBoundModifierAttribute::SetToOnes)};
-      } else {
-        entity = hlfir::Entity{builder.create<fir::ReboxOp>(
-            loc, boxType, entity, /*shape=*/mlir::Value{},
-            /*slice=*/mlir::Value{})};
-      }
-    }
+    // Set dynamic type if needed before any copy-in or copy so that the dummy
+    // is contiguous according to the dummy type.
+    if (mustSetDynamicTypeToDummyType)
+      entity = genSetDynamicTypeToDummyType(entity);
     if (arg.hasValueAttribute() ||
         // Constant expressions might be lowered as variables with
         // 'parameter' attribute. Even though the constant expressions
@@ -1285,20 +1291,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
         loc, builder, entity, storageType, "", byRefAttr);
     entity = hlfir::Entity{associate.getBase()};
     preparedDummy.pushExprAssociateCleanUp(associate);
+    // Rebox the actual argument to the dummy argument's type, and make sure
+    // that we pass a contiguous entity (i.e. make copy-in, if needed).
+    //
+    // TODO: this can probably be optimized by associating the expression with
+    // properly typed temporary, but this needs either a new operation or
+    // making the hlfir.associate more complex.
     if (mustSetDynamicTypeToDummyType) {
-      // Rebox the actual argument to the dummy argument's type, and make
-      // sure that we pass a contiguous entity (i.e. make copy-in,
-      // if needed).
-      //
-      // TODO: this can probably be optimized by associating the expression
-      // with properly typed temporary, but this needs either a new operation
-      // or making the hlfir.associate more complex.
-      assert(!actualIsAssumedRank && "only variables are assumed-rank");
-      mlir::Type boxType = fir::BoxType::get(
-          hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
-      entity = hlfir::Entity{builder.create<fir::ReboxOp>(
-          loc, boxType, entity, /*shape=*/mlir::Value{},
-          /*slice=*/mlir::Value{})};
+      entity = genSetDynamicTypeToDummyType(entity);
       entity = genCopyIn(entity, /*doCopyOut=*/false);
     }
   }
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 47ad48fb322cc..4fcfa0b126e04 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -72,7 +72,8 @@ static mlir::Value genScalarValue(Fortran::lower::AbstractConverter &converter,
 }
 
 /// Does this variable have a default initialization?
-static bool hasDefaultInitialization(const Fortran::semantics::Symbol &sym) {
+bool Fortran::lower::hasDefaultInitialization(
+    const Fortran::semantics::Symbol &sym) {
   if (sym.has<Fortran::semantics::ObjectEntityDetails>() && sym.size())
     if (!Fortran::semantics::IsAllocatableOrPointer(sym))
       if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
@@ -353,7 +354,7 @@ static mlir::Value genComponentDefaultInit(
       // global constructor since this has no runtime cost.
       componentValue = fir::factory::createUnallocatedBox(
           builder, loc, componentTy, std::nullopt);
-    } else if (hasDefaultInitialization(component)) {
+    } else if (Fortran::lower::hasDefaultInitialization(component)) {
       // Component type has default initialization.
       componentValue = genDefaultInitializerValue(converter, loc, component,
                                                   componentTy, stmtCtx);
@@ -556,7 +557,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
                 builder.createConvert(loc, symTy, fir::getBase(initVal));
             builder.create<fir::HasValueOp>(loc, castTo);
           });
-    } else if (hasDefaultInitialization(sym)) {
+    } else if (Fortran::lower::hasDefaultInitialization(sym)) {
       Fortran::lower::createGlobalInitialization(
           builder, global, [&](fir::FirOpBuilder &builder) {
             Fortran::lower::StatementContext stmtCtx(
@@ -752,17 +753,15 @@ mustBeDefaultInitializedAtRuntime(const Fortran::lower::pft::Variable &var) {
     return true;
   // Local variables (including function results), and intent(out) dummies must
   // be default initialized at runtime if their type has default initialization.
-  return hasDefaultInitialization(sym);
+  return Fortran::lower::hasDefaultInitialization(sym);
 }
 
 /// Call default initialization runtime routine to initialize \p var.
-static void
-defaultInitializeAtRuntime(Fortran::lower::AbstractConverter &converter,
-                           const Fortran::lower::pft::Variable &var,
-                           Fortran::lower::SymMap &symMap) {
+void Fortran::lower::defaultInitializeAtRuntime(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::semantics::Symbol &sym, Fortran::lower::SymMap &symMap) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Location loc = converter.getCurrentLocation();
-  const Fortran::semantics::Symbol &sym = var.getSymbol();
   fir::ExtendedValue exv = converter.getSymbolExtendedValue(sym, &symMap);
   if (Fortran::semantics::IsOptional(sym)) {
     // 15.5.2.12 point 3, absent optional dummies are not initialized.
@@ -927,7 +926,8 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
   if (needDummyIntentoutFinalization(var))
     finalizeAtRuntime(converter, var, symMap);
   if (mustBeDefaultInitializedAtRuntime(var))
-    defaultInitializeAtRuntime(converter, var, symMap);
+    Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
+                                               symMap);
   if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
     auto *builder = &converter.getFirOpBuilder();
     mlir::Location loc = converter.getCurrentLocation();
@@ -1168,7 +1168,8 @@ static void instantiateAlias(Fortran::lower::AbstractConverter &converter,
   // do not try optimizing this to single default initializations of
   // the equivalenced storages. Keep lowering simple.
   if (mustBeDefaultInitializedAtRuntime(var))
-    defaultInitializeAtRuntime(converter, var, symMap);
+    Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
+                                               symMap);
 }
 
 //===--------------------------------------------------------------===//
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index e8d6bb591e361..24cb7c2fcf7db 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -33,6 +33,7 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/openmp-directive-sets.h"
@@ -135,6 +136,23 @@ static inline void genOmpAtomicHintAndMemoryOrderClauses(
   }
 }
 
+template <typename AtomicListT>
+static void processOmpAtomicTODO(mlir::Type elementType,
+                                 [[maybe_unused]] mlir::Location loc) {
+  if (!elementType)
+    return;
+  if constexpr (std::is_same<AtomicListT,
+                             Fortran::parser::OmpAtomicClauseList>()) {
+    // Based on assertion for supported element types in OMPIRBuilder.cpp
+    // createAtomicRead
+    mlir::Type unwrappedEleTy = fir::unwrapRefType(elementType);
+    bool supportedAtomicType =
+        (!fir::isa_complex(unwrappedEleTy) && fir::isa_trivial(unwrappedEleTy));
+    if (!supportedAtomicType)
+      TODO(loc, "Unsupported atomic type");
+  }
+}
+
 /// Used to generate atomic.read operation which is created in existing
 /// location set by builder.
 template <typename AtomicListT>
@@ -147,6 +165,8 @@ static inline void genOmpAccAtomicCaptureStatement(
   // Generate `atomic.read` operation for atomic assigment statements
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
+  processOmpAtomicTODO<AtomicListT>(elementType, loc);
+
   if constexpr (std::is_same<AtomicListT,
                              Fortran::parser::OmpAtomicClauseList>()) {
     // If no hint clause is specified, the effect is as if
@@ -183,6 +203,8 @@ static inline void genOmpAccAtomicWriteStatement(
   mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
   rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr);
 
+  processOmpAtomicTODO<AtomicListT>(varType, loc);
+
   if constexpr (std::is_same<AtomicListT,
                              Fortran::parser::OmpAtomicClauseList>()) {
     // If no hint clause is specified, the effect is as if
@@ -323,6 +345,8 @@ static inline void genOmpAccAtomicUpdateStatement(
         currentLocation, lhsAddr);
   }
 
+  processOmpAtomicTODO<AtomicListT>(varType, loc);
+
   llvm::SmallVector<mlir::Type> varTys = {varType};
   llvm::SmallVector<mlir::Location> locs = {currentLocation};
   firOpBuilder.createBlock(&atomicUpdateOp->getRegion(0), {}, varTys, locs);
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index b26c1679086b9..310c0b0b5fb63 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -181,19 +181,19 @@ static void addUseDeviceClause(
 
 static void convertLoopBounds(lower::AbstractConverter &converter,
                               mlir::Location loc,
-                              mlir::omp::CollapseClauseOps &result,
+                              mlir::omp::LoopRelatedOps &result,
                               std::size_t loopVarTypeSize) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   // The types of lower bound, upper bound, and step are converted into the
   // type of the loop variable if necessary.
   mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
-  for (unsigned it = 0; it < (unsigned)result.loopLBVar.size(); it++) {
-    result.loopLBVar[it] =
-        firOpBuilder.createConvert(loc, loopVarType, result.loopLBVar[it]);
-    result.loopUBVar[it] =
-        firOpBuilder.createConvert(loc, loopVarType, result.loopUBVar[it]);
-    result.loopStepVar[it] =
-        firOpBuilder.createConvert(loc, loopVarType, result.loopStepVar[it]);
+  for (unsigned it = 0; it < (unsigned)result.loopLowerBounds.size(); it++) {
+    result.loopLowerBounds[it] = firOpBuilder.createConvert(
+        loc, loopVarType, result.loopLowerBounds[it]);
+    result.loopUpperBounds[it] = firOpBuilder.createConvert(
+        loc, loopVarType, result.loopUpperBounds[it]);
+    result.loopSteps[it] =
+        firOpBuilder.createConvert(loc, loopVarType, result.loopSteps[it]);
   }
 }
 
@@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter,
 
 bool ClauseProcessor::processCollapse(
     mlir::Location currentLocation, lower::pft::Evaluation &eval,
-    mlir::omp::CollapseClauseOps &result,
+    mlir::omp::LoopRelatedOps &result,
     llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const {
   bool found = false;
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -232,15 +232,15 @@ bool ClauseProcessor::processCollapse(
         std::get_if<parser::LoopControl::Bounds>(&loopControl->u);
     assert(bounds && "Expected bounds for worksharing do loop");
     lower::StatementContext stmtCtx;
-    result.loopLBVar.push_back(fir::getBase(
+    result.loopLowerBounds.push_back(fir::getBase(
         converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx)));
-    result.loopUBVar.push_back(fir::getBase(
+    result.loopUpperBounds.push_back(fir::getBase(
         converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx)));
     if (bounds->step) {
-      result.loopStepVar.push_back(fir::getBase(
+      result.loopSteps.push_back(fir::getBase(
           converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx)));
     } else { // If `step` is not present, assume it as `1`.
-      result.loopStepVar.push_back(firOpBuilder.createIntegerConstant(
+      result.loopSteps.push_back(firOpBuilder.createIntegerConstant(
           currentLocation, firOpBuilder.getIntegerType(32), 1));
     }
     iv.push_back(bounds->name.thing.symbol);
@@ -291,8 +291,7 @@ bool ClauseProcessor::processDevice(lower::StatementContext &stmtCtx,
       }
     }
     const auto &deviceExpr = std::get<omp::SomeExpr>(clause->t);
-    result.deviceVar =
-        fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
+    result.device = fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
     return true;
   }
   return false;
@@ -322,10 +321,10 @@ bool ClauseProcessor::processDistSchedule(
     lower::StatementContext &stmtCtx,
     mlir::omp::DistScheduleClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::DistSchedule>()) {
-    result.distScheduleStaticAttr = converter.getFirOpBuilder().getUnitAttr();
+    result.distScheduleStatic = converter.getFirOpBuilder().getUnitAttr();
     const auto &chunkSize = std::get<std::optional<ExprTy>>(clause->t);
     if (chunkSize)
-      result.distScheduleChunkSizeVar =
+      result.distScheduleChunkSize =
           fir::getBase(converter.genExprValue(*chunkSize, stmtCtx));
     return true;
   }
@@ -335,7 +334,7 @@ bool ClauseProcessor::processDistSchedule(
 bool ClauseProcessor::processFilter(lower::StatementContext &stmtCtx,
                                     mlir::omp::FilterClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::Filter>()) {
-    result.filteredThreadIdVar =
+    result.filteredThreadId =
         fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
@@ -351,7 +350,7 @@ bool ClauseProcessor::processFinal(lower::StatementContext &stmtCtx,
 
     mlir::Value finalVal =
         fir::getBase(converter.genExprValue(clause->v, stmtCtx));
-    result.finalVar = firOpBuilder.createConvert(
+    result.final = firOpBuilder.createConvert(
         clauseLocation, firOpBuilder.getI1Type(), finalVal);
     return true;
   }
@@ -362,7 +361,7 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::Hint>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     int64_t hintValue = *evaluate::ToInt64(clause->v);
-    result.hintAttr = firOpBuilder.getI64IntegerAttr(hintValue);
+    result.hint = firOpBuilder.getI64IntegerAttr(hintValue);
     return true;
   }
   return false;
@@ -370,11 +369,11 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const {
 
 bool ClauseProcessor::processMergeable(
     mlir::omp::MergeableClauseOps &result) const {
-  return markClauseOccurrence<omp::clause::Mergeable>(result.mergeableAttr);
+  return markClauseOccurrence<omp::clause::Mergeable>(result.mergeable);
 }
 
 bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const {
-  return markClauseOccurrence<omp::clause::Nowait>(result.nowaitAttr);
+  return markClauseOccurrence<omp::clause::Nowait>(result.nowait);
 }
 
 bool ClauseProcessor::processNumTeams(
@@ -385,7 +384,7 @@ bool ClauseProcessor::processNumTeams(
   if (auto *clause = findUniqueClause<omp::clause::NumTeams>()) {
     // auto lowerBound = std::get<std::optional<ExprTy>>(clause->t);
     auto &upperBound = std::get<ExprTy>(clause->t);
-    result.numTeamsUpperVar =
+    result.numTeamsUpper =
         fir::getBase(converter.genExprValue(upperBound, stmtCtx));
     return true;
   }
@@ -397,7 +396,7 @@ bool ClauseProcessor::processNumThreads(
     mlir::omp::NumThreadsClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::NumThreads>()) {
     // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`.
-    result.numThreadsVar =
+    result.numThreads =
         fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
@@ -408,17 +407,17 @@ bool ClauseProcessor::processOrder(mlir::omp::OrderClauseOps &result) const {
   using Order = omp::clause::Order;
   if (auto *clause = findUniqueClause<Order>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    result.orderAttr = mlir::omp::ClauseOrderKindAttr::get(
+    result.order = mlir::omp::ClauseOrderKindAttr::get(
         firOpBuilder.getContext(), mlir::omp::ClauseOrderKind::Concurrent);
     const auto &modifier =
         std::get<std::optional<Order::OrderModifier>>(clause->t);
     if (modifier && *modifier == Order::OrderModifier::Unconstrained) {
-      result.orderModAttr = mlir::omp::OrderModifierAttr::get(
+      result.orderMod = mlir::omp::OrderModifierAttr::get(
           firOpBuilder.getContext(), mlir::omp::OrderModifier::unconstrained);
     } else {
       // "If order-modifier is not unconstrained, the behavior is as if the
       // reproducible modifier is present."
-      result.orderModAttr = mlir::omp::OrderModifierAttr::get(
+      result.orderMod = mlir::omp::OrderModifierAttr::get(
           firOpBuilder.getContext(), mlir::omp::OrderModifier::reproducible);
     }
     return true;
@@ -433,7 +432,7 @@ bool ClauseProcessor::processOrdered(
     int64_t orderedClauseValue = 0l;
     if (clause->v.has_value())
       orderedClauseValue = *evaluate::ToInt64(*clause->v);
-    result.orderedAttr = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
+    result.ordered = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
     return true;
   }
   return false;
@@ -443,8 +442,7 @@ bool ClauseProcessor::processPriority(
     lower::StatementContext &stmtCtx,
     mlir::omp::PriorityClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::Priority>()) {
-    result.priorityVar =
-        fir::getBase(converter.genExprValue(clause->v, stmtCtx));
+    result.priority = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
@@ -454,7 +452,7 @@ bool ClauseProcessor::processProcBind(
     mlir::omp::ProcBindClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::ProcBind>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    result.procBindKindAttr = genProcBindKindAttr(firOpBuilder, *clause);
+    result.procBindKind = genProcBindKindAttr(firOpBuilder, *clause);
     return true;
   }
   return false;
@@ -465,7 +463,7 @@ bool ClauseProcessor::processSafelen(
   if (auto *clause = findUniqueClause<omp::clause::Safelen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     const std::optional<std::int64_t> safelenVal = evaluate::ToInt64(clause->v);
-    result.safelenAttr = firOpBuilder.getI64IntegerAttr(*safelenVal);
+    result.safelen = firOpBuilder.getI64IntegerAttr(*safelenVal);
     return true;
   }
   return false;
@@ -498,19 +496,19 @@ bool ClauseProcessor::processSchedule(
       break;
     }
 
-    result.scheduleValAttr =
+    result.scheduleKind =
         mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind);
 
-    mlir::omp::ScheduleModifier scheduleModifier = getScheduleModifier(*clause);
-    if (scheduleModifier != mlir::omp::ScheduleModifier::none)
-      result.scheduleModAttr =
-          mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier);
+    mlir::omp::ScheduleModifier scheduleMod = getScheduleModifier(*clause);
+    if (scheduleMod != mlir::omp::ScheduleModifier::none)
+      result.scheduleMod =
+          mlir::omp::ScheduleModifierAttr::get(context, scheduleMod);
 
     if (getSimdModifier(*clause) != mlir::omp::ScheduleModifier::none)
-      result.scheduleSimdAttr = firOpBuilder.getUnitAttr();
+      result.scheduleSimd = firOpBuilder.getUnitAttr();
 
     if (const auto &chunkExpr = std::get<omp::MaybeExpr>(clause->t))
-      result.scheduleChunkVar =
+      result.scheduleChunk =
           fir::getBase(converter.genExprValue(*chunkExpr, stmtCtx));
 
     return true;
@@ -523,7 +521,7 @@ bool ClauseProcessor::processSimdlen(
   if (auto *clause = findUniqueClause<omp::clause::Simdlen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     const std::optional<std::int64_t> simdlenVal = evaluate::ToInt64(clause->v);
-    result.simdlenAttr = firOpBuilder.getI64IntegerAttr(*simdlenVal);
+    result.simdlen = firOpBuilder.getI64IntegerAttr(*simdlenVal);
     return true;
   }
   return false;
@@ -533,7 +531,7 @@ bool ClauseProcessor::processThreadLimit(
     lower::StatementContext &stmtCtx,
     mlir::omp::ThreadLimitClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::ThreadLimit>()) {
-    result.threadLimitVar =
+    result.threadLimit =
         fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
@@ -541,7 +539,7 @@ bool ClauseProcessor::processThreadLimit(
 }
 
 bool ClauseProcessor::processUntied(mlir::omp::UntiedClauseOps &result) const {
-  return markClauseOccurrence<omp::clause::Untied>(result.untiedAttr);
+  return markClauseOccurrence<omp::clause::Untied>(result.untied);
 }
 
 //===----------------------------------------------------------------------===//
@@ -565,7 +563,7 @@ static void
 addAlignedClause(lower::AbstractConverter &converter,
                  const omp::clause::Aligned &clause,
                  llvm::SmallVectorImpl<mlir::Value> &alignedVars,
-                 llvm::SmallVectorImpl<mlir::Attribute> &alignmentAttrs) {
+                 llvm::SmallVectorImpl<mlir::Attribute> &alignments) {
   using Aligned = omp::clause::Aligned;
   lower::StatementContext stmtCtx;
   mlir::IntegerAttr alignmentValueAttr;
@@ -594,7 +592,7 @@ addAlignedClause(lower::AbstractConverter &converter,
     alignmentValueAttr = builder.getI64IntegerAttr(alignment);
     // All the list items in a aligned clause will have same alignment
     for (std::size_t i = 0; i < objects.size(); i++)
-      alignmentAttrs.push_back(alignmentValueAttr);
+      alignments.push_back(alignmentValueAttr);
   }
 }
 
@@ -603,7 +601,7 @@ bool ClauseProcessor::processAligned(
   return findRepeatableClause<omp::clause::Aligned>(
       [&](const omp::clause::Aligned &clause, const parser::CharBlock &) {
         addAlignedClause(converter, clause, result.alignedVars,
-                         result.alignmentAttrs);
+                         result.alignments);
       });
 }
 
@@ -798,7 +796,7 @@ bool ClauseProcessor::processCopyprivate(
     result.copyprivateVars.push_back(cpVar);
     mlir::func::FuncOp funcOp =
         createCopyFunc(currentLocation, converter, cpVar.getType(), attrs);
-    result.copyprivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp));
+    result.copyprivateSyms.push_back(mlir::SymbolRefAttr::get(funcOp));
   };
 
   bool hasCopyPrivate = findRepeatableClause<clause::Copyprivate>(
@@ -832,7 +830,7 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const {
 
         mlir::omp::ClauseTaskDependAttr dependTypeOperand =
             genDependKindAttr(firOpBuilder, kind);
-        result.dependTypeAttrs.append(objects.size(), dependTypeOperand);
+        result.dependKinds.append(objects.size(), dependTypeOperand);
 
         for (const omp::Object &object : objects) {
           assert(object.ref() && "Expecting designator");
@@ -1037,10 +1035,9 @@ bool ClauseProcessor::processReduction(
 
         // Copy local lists into the output.
         llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
-        llvm::copy(reduceVarByRef,
-                   std::back_inserter(result.reductionVarsByRef));
+        llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref));
         llvm::copy(reductionDeclSymbols,
-                   std::back_inserter(result.reductionDeclSymbols));
+                   std::back_inserter(result.reductionSyms));
 
         if (outReductionTypes) {
           outReductionTypes->reserve(outReductionTypes->size() +
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 4340c3278cebc..2c4b3857fda9f 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -55,7 +55,7 @@ class ClauseProcessor {
   // 'Unique' clauses: They can appear at most once in the clause list.
   bool
   processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval,
-                  mlir::omp::CollapseClauseOps &result,
+                  mlir::omp::LoopRelatedOps &result,
                   llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const;
   bool processDefault() const;
   bool processDevice(lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 7df3905c29990..e1a193edc004a 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -13,10 +13,12 @@
 #include "DataSharingProcessor.h"
 
 #include "Utils.h"
+#include "flang/Lower/ConvertVariable.h"
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Semantics/tools.h"
 
 namespace Fortran {
@@ -116,6 +118,11 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) {
   bool success = converter.createHostAssociateVarClone(*sym);
   (void)success;
   assert(success && "Privatization failed due to existing binding");
+
+  bool isFirstPrivate = sym->test(semantics::Symbol::Flag::OmpFirstPrivate);
+  if (!isFirstPrivate &&
+      Fortran::lower::hasDefaultInitialization(sym->GetUltimate()))
+    Fortran::lower::defaultInitializeAtRuntime(converter, *sym, *symTable);
 }
 
 void DataSharingProcessor::copyFirstPrivateSymbol(
@@ -127,8 +134,52 @@ void DataSharingProcessor::copyFirstPrivateSymbol(
 void DataSharingProcessor::copyLastPrivateSymbol(
     const semantics::Symbol *sym,
     [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) {
-  if (sym->test(semantics::Symbol::Flag::OmpLastPrivate))
-    converter.copyHostAssociateVar(*sym, lastPrivIP);
+  if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) {
+    bool allocatable = semantics::IsAllocatable(sym->GetUltimate());
+    if (!allocatable) {
+      converter.copyHostAssociateVar(*sym, lastPrivIP);
+      return;
+    }
+
+    // copyHostAssociateVar doesn't work properly if the privatised copy was
+    // reallocated (e.g. by assignment): it will only copy if the ultimate
+    // symbol was already allocated, and it only copies data so any reallocated
+    // lengths etc are lost
+
+    // 1) Fetch the original copy of the variable.
+    assert(sym->has<Fortran::semantics::HostAssocDetails>() &&
+           "No host-association found");
+    const Fortran::semantics::Symbol &hsym = sym->GetUltimate();
+    Fortran::lower::SymbolBox hsb = symTable->lookupOneLevelUpSymbol(hsym);
+    assert(hsb && "Host symbol box not found");
+
+    // 2) Fetch the copied one that will mask the original.
+    Fortran::lower::SymbolBox sb = symTable->shallowLookupSymbol(sym);
+    assert(sb && "Host-associated symbol box not found");
+    assert(hsb.getAddr() != sb.getAddr() &&
+           "Host and associated symbol boxes are the same");
+
+    // 3) Perform the assignment.
+    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+    mlir::Location loc = converter.genLocation(sym->name());
+    mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
+    if (lastPrivIP && lastPrivIP->isSet())
+      builder.restoreInsertionPoint(*lastPrivIP);
+    else
+      builder.setInsertionPointAfter(sb.getAddr().getDefiningOp());
+
+    hlfir::Entity dst{hsb.getAddr()};
+    hlfir::Entity src{sb.getAddr()};
+    builder.create<hlfir::AssignOp>(
+        loc, src, dst, /*isWholeAllocatableAssignment=*/allocatable,
+        /*keepLhsLengthInAllocatableAssignment=*/false,
+        /*temporary_lhs=*/false);
+
+    if (lastPrivIP && lastPrivIP->isSet() &&
+        sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) {
+      builder.restoreInsertionPoint(insPt);
+    }
+  }
 }
 
 void DataSharingProcessor::collectOmpObjectListSymbol(
@@ -139,7 +190,6 @@ void DataSharingProcessor::collectOmpObjectListSymbol(
 }
 
 void DataSharingProcessor::collectSymbolsForPrivatization() {
-  bool hasCollapse = false;
   for (const omp::Clause &clause : clauses) {
     if (const auto &privateClause =
             std::get_if<omp::clause::Private>(&clause.u)) {
@@ -153,16 +203,11 @@ void DataSharingProcessor::collectSymbolsForPrivatization() {
       const ObjectList &objects = std::get<ObjectList>(lastPrivateClause->t);
       collectOmpObjectListSymbol(objects, explicitlyPrivatizedSymbols);
       hasLastPrivateOp = true;
-    } else if (std::get_if<omp::clause::Collapse>(&clause.u)) {
-      hasCollapse = true;
     }
   }
 
   for (auto *sym : explicitlyPrivatizedSymbols)
     allPrivatizedSymbols.insert(sym);
-
-  if (hasCollapse && hasLastPrivateOp)
-    TODO(converter.getCurrentLocation(), "Collapse clause with lastprivate");
 }
 
 bool DataSharingProcessor::needBarrier() {
@@ -225,28 +270,40 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
       mlir::Operation *lastOper = loopOp.getRegion().back().getTerminator();
       firOpBuilder.setInsertionPoint(lastOper);
 
-      mlir::Value iv = loopOp.getIVs()[0];
-      mlir::Value ub = loopOp.getUpperBound()[0];
-      mlir::Value step = loopOp.getStep()[0];
-
-      // v = iv + step
-      // cmp = step < 0 ? v < ub : v > ub
-      mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
-      mlir::Value zero =
-          firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
-      mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::slt, step, zero);
-      mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::slt, v, ub);
-      mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::sgt, v, ub);
-      mlir::Value cmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
-          loc, negativeStep, vLT, vGT);
+      mlir::Value cmpOp;
+      llvm::SmallVector<mlir::Value> vs;
+      vs.reserve(loopOp.getIVs().size());
+      for (auto [iv, ub, step] :
+           llvm::zip_equal(loopOp.getIVs(), loopOp.getLoopUpperBounds(),
+                           loopOp.getLoopSteps())) {
+        // v = iv + step
+        // cmp = step < 0 ? v < ub : v > ub
+        mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
+        vs.push_back(v);
+        mlir::Value zero =
+            firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
+        mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
+            loc, mlir::arith::CmpIPredicate::slt, step, zero);
+        mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
+            loc, mlir::arith::CmpIPredicate::slt, v, ub);
+        mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
+            loc, mlir::arith::CmpIPredicate::sgt, v, ub);
+        mlir::Value icmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
+            loc, negativeStep, vLT, vGT);
+
+        if (cmpOp) {
+          cmpOp = firOpBuilder.create<mlir::arith::AndIOp>(loc, cmpOp, icmpOp);
+        } else {
+          cmpOp = icmpOp;
+        }
+      }
 
       auto ifOp = firOpBuilder.create<fir::IfOp>(loc, cmpOp, /*else*/ false);
       firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-      assert(loopIV && "loopIV was not set");
-      firOpBuilder.createStoreWithConvert(loc, v, loopIV);
+      for (auto [v, loopIV] : llvm::zip_equal(vs, loopIVs)) {
+        assert(loopIV && "loopIV was not set");
+        firOpBuilder.createStoreWithConvert(loc, v, loopIV);
+      }
       lastPrivIP = firOpBuilder.saveInsertionPoint();
     } else if (mlir::isa<mlir::omp::SectionsOp>(op)) {
       // Already handled by genOMP()
@@ -492,9 +549,20 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym,
       symTable->addSymbol(*sym, localExV);
       symTable->pushScope();
       cloneSymbol(sym);
-      firOpBuilder.create<mlir::omp::YieldOp>(
-          hsb.getAddr().getLoc(),
-          symTable->shallowLookupSymbol(*sym).getAddr());
+      mlir::Value cloneAddr = symTable->shallowLookupSymbol(*sym).getAddr();
+      mlir::Type cloneType = cloneAddr.getType();
+
+      // A `convert` op is required for variables that are storage associated
+      // via `equivalence`. The problem is that these variables are declared as
+      // `fir.ptr`s while their privatized storage is declared as `fir.ref`,
+      // therefore we convert to proper symbol type.
+      mlir::Value yieldedValue =
+          (symType == cloneType) ? cloneAddr
+                                 : firOpBuilder.createConvert(
+                                       cloneAddr.getLoc(), symType, cloneAddr);
+
+      firOpBuilder.create<mlir::omp::YieldOp>(hsb.getAddr().getLoc(),
+                                              yieldedValue);
       symTable->popScope();
     }
 
@@ -537,7 +605,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym,
   }();
 
   if (clauseOps) {
-    clauseOps->privatizers.push_back(mlir::SymbolRefAttr::get(privatizerOp));
+    clauseOps->privateSyms.push_back(mlir::SymbolRefAttr::get(privatizerOp));
     clauseOps->privateVars.push_back(hsb.getAddr());
   }
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 7c0c831ed43a8..1e9d07ec13857 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -71,7 +71,7 @@ class DataSharingProcessor {
   bool hasLastPrivateOp;
   mlir::OpBuilder::InsertPoint lastPrivIP;
   mlir::OpBuilder::InsertPoint insPt;
-  mlir::Value loopIV;
+  llvm::SmallVector<mlir::Value> loopIVs;
   // Symbols in private, firstprivate, and/or lastprivate clauses.
   llvm::SetVector<const semantics::Symbol *> explicitlyPrivatizedSymbols;
   llvm::SetVector<const semantics::Symbol *> defaultSymbols;
@@ -147,10 +147,7 @@ class DataSharingProcessor {
   void processStep1(mlir::omp::PrivateClauseOps *clauseOps = nullptr);
   void processStep2(mlir::Operation *op, bool isLoop);
 
-  void setLoopIV(mlir::Value iv) {
-    assert(!loopIV && "Loop iteration variable already set");
-    loopIV = iv;
-  }
+  void pushLoopIV(mlir::Value iv) { loopIVs.push_back(iv); }
 
   const llvm::SetVector<const semantics::Symbol *> &
   getAllSymbolsToPrivatize() const {
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 47ce2d8c8db87..2b1839b5270d4 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -284,7 +284,7 @@ static void getDeclareTargetInfo(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::pft::Evaluation &eval,
     const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct,
-    mlir::omp::DeclareTargetClauseOps &clauseOps,
+    mlir::omp::DeclareTargetOperands &clauseOps,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
   const auto &spec =
       std::get<parser::OmpDeclareTargetSpecifier>(declareTargetConstruct.t);
@@ -322,7 +322,7 @@ static void collectDeferredDeclareTargets(
     const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct,
     llvm::SmallVectorImpl<lower::OMPDeferredDeclareTargetInfo>
         &deferredDeclareTarget) {
-  mlir::omp::DeclareTargetClauseOps clauseOps;
+  mlir::omp::DeclareTargetOperands clauseOps;
   llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
   getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
                        clauseOps, symbolAndClause);
@@ -347,7 +347,7 @@ getDeclareTargetFunctionDevice(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::pft::Evaluation &eval,
     const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) {
-  mlir::omp::DeclareTargetClauseOps clauseOps;
+  mlir::omp::DeclareTargetOperands clauseOps;
   llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
   getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
                        clauseOps, symbolAndClause);
@@ -677,8 +677,11 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info,
         assert(tempDsp.has_value());
         tempDsp->processStep2(privatizationTopLevelOp, isLoop);
       } else {
-        if (isLoop && regionArgs.size() > 0)
-          info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0]));
+        if (isLoop && regionArgs.size() > 0) {
+          for (const auto &regionArg : regionArgs) {
+            info.dsp->pushLoopIV(info.converter.getSymbolAddress(*regionArg));
+          }
+        }
         info.dsp->processStep2(privatizationTopLevelOp, isLoop);
       }
     }
@@ -926,7 +929,7 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                 std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
                 llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
             mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType());
-        targetOp.getMapOperandsMutable().append(mapOp);
+        targetOp.getMapVarsMutable().append(mapOp);
         mlir::Value clonedValArg =
             region.addArgument(copyVal.getType(), copyVal.getLoc());
         firOpBuilder.setInsertionPointToStart(regionBlock);
@@ -1019,15 +1022,13 @@ static OpTy genWrapperOp(lower::AbstractConverter &converter,
 // Code generation functions for clauses
 //===----------------------------------------------------------------------===//
 
-static void genCriticalDeclareClauses(lower::AbstractConverter &converter,
-                                      semantics::SemanticsContext &semaCtx,
-                                      const List<Clause> &clauses,
-                                      mlir::Location loc,
-                                      mlir::omp::CriticalClauseOps &clauseOps,
-                                      llvm::StringRef name) {
+static void genCriticalDeclareClauses(
+    lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
+    const List<Clause> &clauses, mlir::Location loc,
+    mlir::omp::CriticalDeclareOperands &clauseOps, llvm::StringRef name) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processHint(clauseOps);
-  clauseOps.criticalNameAttr =
+  clauseOps.symName =
       mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name);
 }
 
@@ -1036,7 +1037,7 @@ static void genDistributeClauses(lower::AbstractConverter &converter,
                                  lower::StatementContext &stmtCtx,
                                  const List<Clause> &clauses,
                                  mlir::Location loc,
-                                 mlir::omp::DistributeClauseOps &clauseOps) {
+                                 mlir::omp::DistributeOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processDistSchedule(stmtCtx, clauseOps);
@@ -1060,18 +1061,18 @@ static void
 genLoopNestClauses(lower::AbstractConverter &converter,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval, const List<Clause> &clauses,
-                   mlir::Location loc, mlir::omp::LoopNestClauseOps &clauseOps,
+                   mlir::Location loc, mlir::omp::LoopNestOperands &clauseOps,
                    llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processCollapse(loc, eval, clauseOps, iv);
-  clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr();
+  clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr();
 }
 
 static void genMaskedClauses(lower::AbstractConverter &converter,
                              semantics::SemanticsContext &semaCtx,
                              lower::StatementContext &stmtCtx,
                              const List<Clause> &clauses, mlir::Location loc,
-                             mlir::omp::MaskedClauseOps &clauseOps) {
+                             mlir::omp::MaskedOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processFilter(stmtCtx, clauseOps);
 }
@@ -1080,7 +1081,7 @@ static void
 genOrderedRegionClauses(lower::AbstractConverter &converter,
                         semantics::SemanticsContext &semaCtx,
                         const List<Clause> &clauses, mlir::Location loc,
-                        mlir::omp::OrderedRegionClauseOps &clauseOps) {
+                        mlir::omp::OrderedRegionOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processTODO<clause::Simd>(loc, llvm::omp::Directive::OMPD_ordered);
 }
@@ -1088,7 +1089,7 @@ genOrderedRegionClauses(lower::AbstractConverter &converter,
 static void genParallelClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
-    mlir::Location loc, mlir::omp::ParallelClauseOps &clauseOps,
+    mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
     llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1103,7 +1104,7 @@ static void genParallelClauses(
 static void genSectionsClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     const List<Clause> &clauses, mlir::Location loc,
-    mlir::omp::SectionsClauseOps &clauseOps,
+    mlir::omp::SectionsOperands &clauseOps,
     llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1116,7 +1117,7 @@ static void genSectionsClauses(
 static void genSimdClauses(lower::AbstractConverter &converter,
                            semantics::SemanticsContext &semaCtx,
                            const List<Clause> &clauses, mlir::Location loc,
-                           mlir::omp::SimdClauseOps &clauseOps) {
+                           mlir::omp::SimdOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAligned(clauseOps);
   cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
@@ -1133,7 +1134,7 @@ static void genSimdClauses(lower::AbstractConverter &converter,
 static void genSingleClauses(lower::AbstractConverter &converter,
                              semantics::SemanticsContext &semaCtx,
                              const List<Clause> &clauses, mlir::Location loc,
-                             mlir::omp::SingleClauseOps &clauseOps) {
+                             mlir::omp::SingleOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processCopyprivate(loc, clauseOps);
@@ -1145,7 +1146,7 @@ static void genTargetClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
     mlir::Location loc, bool processHostOnlyClauses,
-    mlir::omp::TargetClauseOps &clauseOps,
+    mlir::omp::TargetOperands &clauseOps,
     llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms,
     llvm::SmallVectorImpl<mlir::Location> &mapLocs,
     llvm::SmallVectorImpl<mlir::Type> &mapTypes,
@@ -1182,7 +1183,7 @@ static void genTargetClauses(
 static void genTargetDataClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
-    mlir::Location loc, mlir::omp::TargetDataClauseOps &clauseOps,
+    mlir::Location loc, mlir::omp::TargetDataOperands &clauseOps,
     llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
     llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
     llvm::SmallVectorImpl<const semantics::Symbol *> &useDeviceSyms) {
@@ -1215,7 +1216,7 @@ static void genTargetEnterExitUpdateDataClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
     mlir::Location loc, llvm::omp::Directive directive,
-    mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) {
+    mlir::omp::TargetEnterExitUpdateDataOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processDepend(clauseOps);
   cp.processDevice(stmtCtx, clauseOps);
@@ -1234,7 +1235,7 @@ static void genTaskClauses(lower::AbstractConverter &converter,
                            semantics::SemanticsContext &semaCtx,
                            lower::StatementContext &stmtCtx,
                            const List<Clause> &clauses, mlir::Location loc,
-                           mlir::omp::TaskClauseOps &clauseOps) {
+                           mlir::omp::TaskOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processDefault();
@@ -1253,7 +1254,7 @@ static void genTaskClauses(lower::AbstractConverter &converter,
 static void genTaskgroupClauses(lower::AbstractConverter &converter,
                                 semantics::SemanticsContext &semaCtx,
                                 const List<Clause> &clauses, mlir::Location loc,
-                                mlir::omp::TaskgroupClauseOps &clauseOps) {
+                                mlir::omp::TaskgroupOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processTODO<clause::TaskReduction>(loc,
@@ -1263,7 +1264,7 @@ static void genTaskgroupClauses(lower::AbstractConverter &converter,
 static void genTaskwaitClauses(lower::AbstractConverter &converter,
                                semantics::SemanticsContext &semaCtx,
                                const List<Clause> &clauses, mlir::Location loc,
-                               mlir::omp::TaskwaitClauseOps &clauseOps) {
+                               mlir::omp::TaskwaitOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processTODO<clause::Depend, clause::Nowait>(
       loc, llvm::omp::Directive::OMPD_taskwait);
@@ -1273,7 +1274,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter,
                             semantics::SemanticsContext &semaCtx,
                             lower::StatementContext &stmtCtx,
                             const List<Clause> &clauses, mlir::Location loc,
-                            mlir::omp::TeamsClauseOps &clauseOps) {
+                            mlir::omp::TeamsOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processDefault();
@@ -1288,7 +1289,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter,
 static void genWsloopClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
-    mlir::Location loc, mlir::omp::WsloopClauseOps &clauseOps,
+    mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
     llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1329,7 +1330,7 @@ genCriticalOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
     mlir::ModuleOp mod = firOpBuilder.getModule();
     auto global = mod.lookupSymbol<mlir::omp::CriticalDeclareOp>(nameStr);
     if (!global) {
-      mlir::omp::CriticalClauseOps clauseOps;
+      mlir::omp::CriticalDeclareOperands clauseOps;
       genCriticalDeclareClauses(converter, semaCtx, item->clauses, loc,
                                 clauseOps, nameStr);
 
@@ -1364,7 +1365,7 @@ genLoopNestOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
               lower::pft::Evaluation &eval, mlir::Location loc,
               const ConstructQueue &queue, ConstructQueue::iterator item,
-              mlir::omp::LoopNestClauseOps &clauseOps,
+              mlir::omp::LoopNestOperands &clauseOps,
               llvm::ArrayRef<const semantics::Symbol *> iv,
               llvm::ArrayRef<const semantics::Symbol *> wrapperSyms,
               llvm::ArrayRef<mlir::BlockArgument> wrapperArgs,
@@ -1392,7 +1393,7 @@ genMaskedOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
             mlir::Location loc, const ConstructQueue &queue,
             ConstructQueue::iterator item) {
   lower::StatementContext stmtCtx;
-  mlir::omp::MaskedClauseOps clauseOps;
+  mlir::omp::MaskedOperands clauseOps;
   genMaskedClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::MaskedOp>(
@@ -1426,7 +1427,7 @@ genOrderedRegionOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval, mlir::Location loc,
                    const ConstructQueue &queue, ConstructQueue::iterator item) {
-  mlir::omp::OrderedRegionClauseOps clauseOps;
+  mlir::omp::OrderedRegionOperands clauseOps;
   genOrderedRegionClauses(converter, semaCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::OrderedRegionOp>(
@@ -1440,7 +1441,7 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
               lower::pft::Evaluation &eval, mlir::Location loc,
               const ConstructQueue &queue, ConstructQueue::iterator item,
-              mlir::omp::ParallelClauseOps &clauseOps,
+              mlir::omp::ParallelOperands &clauseOps,
               llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
               llvm::ArrayRef<mlir::Type> reductionTypes) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -1491,13 +1492,26 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
     allSymbols.append(dsp.getAllSymbolsToPrivatize().begin(),
                       dsp.getAllSymbolsToPrivatize().end());
 
-    for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
-      converter.bindSymbol(*arg, hlfir::translateToExtendedValue(
-                                     loc, firOpBuilder, hlfir::Entity{prv},
-                                     /*contiguousHint=*/
-                                     evaluate::IsSimplyContiguous(
-                                         *arg, converter.getFoldingContext()))
-                                     .first);
+    unsigned argIdx = 0;
+    for (const semantics::Symbol *arg : allSymbols) {
+      auto bind = [&](const semantics::Symbol *sym) {
+        mlir::BlockArgument blockArg = region.getArgument(argIdx);
+        ++argIdx;
+        converter.bindSymbol(*sym,
+                             hlfir::translateToExtendedValue(
+                                 loc, firOpBuilder, hlfir::Entity{blockArg},
+                                 /*contiguousHint=*/
+                                 evaluate::IsSimplyContiguous(
+                                     *sym, converter.getFoldingContext()))
+                                 .first);
+      };
+
+      if (const auto *commonDet =
+              arg->detailsIf<semantics::CommonBlockDetails>()) {
+        for (const auto &mem : commonDet->objects())
+          bind(&*mem);
+      } else
+        bind(arg);
     }
 
     return allSymbols;
@@ -1518,7 +1532,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               const parser::OmpSectionBlocks &sectionBlocks) {
   llvm::SmallVector<mlir::Type> reductionTypes;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
-  mlir::omp::SectionsClauseOps clauseOps;
+  mlir::omp::SectionsOperands clauseOps;
   genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps,
                      reductionTypes, reductionSyms);
 
@@ -1619,7 +1633,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   // Emit implicit barrier to synchronize threads and avoid data
   // races on post-update of lastprivate variables when `nowait`
   // clause is present.
-  if (clauseOps.nowaitAttr && !lastprivates.empty())
+  if (clauseOps.nowait && !lastprivates.empty())
     builder.create<mlir::omp::BarrierOp>(loc);
 
   symTable.popScope();
@@ -1631,7 +1645,7 @@ genSingleOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
             semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
             mlir::Location loc, const ConstructQueue &queue,
             ConstructQueue::iterator item) {
-  mlir::omp::SingleClauseOps clauseOps;
+  mlir::omp::SingleOperands clauseOps;
   genSingleClauses(converter, semaCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::SingleOp>(
@@ -1653,7 +1667,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
       !llvm::cast<mlir::omp::OffloadModuleInterface>(*converter.getModuleOp())
            .getIsTargetDevice();
 
-  mlir::omp::TargetClauseOps clauseOps;
+  mlir::omp::TargetOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> mapSyms, devicePtrSyms,
       deviceAddrSyms;
   llvm::SmallVector<mlir::Location> mapLocs, devicePtrLocs, deviceAddrLocs;
@@ -1781,7 +1795,7 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                 lower::pft::Evaluation &eval, mlir::Location loc,
                 const ConstructQueue &queue, ConstructQueue::iterator item) {
   lower::StatementContext stmtCtx;
-  mlir::omp::TargetDataClauseOps clauseOps;
+  mlir::omp::TargetDataOperands clauseOps;
   llvm::SmallVector<mlir::Type> useDeviceTypes;
   llvm::SmallVector<mlir::Location> useDeviceLocs;
   llvm::SmallVector<const semantics::Symbol *> useDeviceSyms;
@@ -1819,7 +1833,7 @@ static OpTy genTargetEnterExitUpdateDataOp(lower::AbstractConverter &converter,
     llvm_unreachable("Unexpected TARGET DATA construct");
   }
 
-  mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps;
+  mlir::omp::TargetEnterExitUpdateDataOperands clauseOps;
   genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx,
                                       item->clauses, loc, directive, clauseOps);
 
@@ -1832,7 +1846,7 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
           mlir::Location loc, const ConstructQueue &queue,
           ConstructQueue::iterator item) {
   lower::StatementContext stmtCtx;
-  mlir::omp::TaskClauseOps clauseOps;
+  mlir::omp::TaskOperands clauseOps;
   genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TaskOp>(
@@ -1847,7 +1861,7 @@ genTaskgroupOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                semantics::SemanticsContext &semaCtx,
                lower::pft::Evaluation &eval, mlir::Location loc,
                const ConstructQueue &queue, ConstructQueue::iterator item) {
-  mlir::omp::TaskgroupClauseOps clauseOps;
+  mlir::omp::TaskgroupOperands clauseOps;
   genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TaskgroupOp>(
@@ -1862,7 +1876,7 @@ genTaskwaitOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
               lower::pft::Evaluation &eval, mlir::Location loc,
               const ConstructQueue &queue, ConstructQueue::iterator item) {
-  mlir::omp::TaskwaitClauseOps clauseOps;
+  mlir::omp::TaskwaitOperands clauseOps;
   genTaskwaitClauses(converter, semaCtx, item->clauses, loc, clauseOps);
   return converter.getFirOpBuilder().create<mlir::omp::TaskwaitOp>(loc,
                                                                    clauseOps);
@@ -1882,7 +1896,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
            mlir::Location loc, const ConstructQueue &queue,
            ConstructQueue::iterator item) {
   lower::StatementContext stmtCtx;
-  mlir::omp::TeamsClauseOps clauseOps;
+  mlir::omp::TeamsOperands clauseOps;
   genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TeamsOp>(
@@ -1904,11 +1918,11 @@ static void genStandaloneDistribute(
     ConstructQueue::iterator item, DataSharingProcessor &dsp) {
   lower::StatementContext stmtCtx;
 
-  mlir::omp::DistributeClauseOps distributeClauseOps;
+  mlir::omp::DistributeOperands distributeClauseOps;
   genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                        distributeClauseOps);
 
-  mlir::omp::LoopNestClauseOps loopNestClauseOps;
+  mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
                      loopNestClauseOps, iv);
@@ -1932,13 +1946,13 @@ static void genStandaloneDo(lower::AbstractConverter &converter,
                             DataSharingProcessor &dsp) {
   lower::StatementContext stmtCtx;
 
-  mlir::omp::WsloopClauseOps wsloopClauseOps;
+  mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   llvm::SmallVector<mlir::Type> reductionTypes;
   genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                    wsloopClauseOps, reductionTypes, reductionSyms);
 
-  mlir::omp::LoopNestClauseOps loopNestClauseOps;
+  mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
                      loopNestClauseOps, iv);
@@ -1962,7 +1976,7 @@ static void genStandaloneParallel(lower::AbstractConverter &converter,
                                   ConstructQueue::iterator item) {
   lower::StatementContext stmtCtx;
 
-  mlir::omp::ParallelClauseOps clauseOps;
+  mlir::omp::ParallelOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   llvm::SmallVector<mlir::Type> reductionTypes;
   genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps,
@@ -1979,10 +1993,10 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                               const ConstructQueue &queue,
                               ConstructQueue::iterator item,
                               DataSharingProcessor &dsp) {
-  mlir::omp::SimdClauseOps simdClauseOps;
+  mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
 
-  mlir::omp::LoopNestClauseOps loopNestClauseOps;
+  mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
                      loopNestClauseOps, iv);
@@ -2033,14 +2047,14 @@ static void genCompositeDistributeSimd(
   lower::StatementContext stmtCtx;
 
   // Clause processing.
-  mlir::omp::DistributeClauseOps distributeClauseOps;
+  mlir::omp::DistributeOperands distributeClauseOps;
   genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                        distributeClauseOps);
 
-  mlir::omp::SimdClauseOps simdClauseOps;
+  mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
 
-  mlir::omp::LoopNestClauseOps loopNestClauseOps;
+  mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
                      loopNestClauseOps, iv);
@@ -2079,16 +2093,16 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   lower::StatementContext stmtCtx;
 
   // Clause processing.
-  mlir::omp::WsloopClauseOps wsloopClauseOps;
+  mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
   llvm::SmallVector<mlir::Type> wsloopReductionTypes;
   genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                    wsloopClauseOps, wsloopReductionTypes, wsloopReductionSyms);
 
-  mlir::omp::SimdClauseOps simdClauseOps;
+  mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
 
-  mlir::omp::LoopNestClauseOps loopNestClauseOps;
+  mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
                      loopNestClauseOps, iv);
@@ -2299,7 +2313,7 @@ static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
        semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
        const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) {
-  mlir::omp::DeclareTargetClauseOps clauseOps;
+  mlir::omp::DeclareTargetOperands clauseOps;
   llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
   mlir::ModuleOp mod = converter.getFirOpBuilder().getModule();
   getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 2961df96b3cab..d54715d3fa3f5 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -248,6 +248,11 @@ mlir::Value fir::FirOpBuilder::allocateLocal(
 
 /// Get the block for adding Allocas.
 mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
+  if (auto accComputeRegionIface =
+          getRegion().getParentOfType<mlir::acc::ComputeRegionOpInterface>()) {
+    return accComputeRegionIface.getAllocaBlock();
+  }
+
   if (auto ompOutlineableIface =
           getRegion()
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
@@ -1541,21 +1546,44 @@ mlir::Value fir::factory::genMaxWithZero(fir::FirOpBuilder &builder,
                                                zero);
 }
 
+static std::pair<mlir::Value, mlir::Type>
+genCPtrOrCFunptrFieldIndex(fir::FirOpBuilder &builder, mlir::Location loc,
+                           mlir::Type cptrTy) {
+  auto recTy = mlir::cast<fir::RecordType>(cptrTy);
+  assert(recTy.getTypeList().size() == 1);
+  auto addrFieldName = recTy.getTypeList()[0].first;
+  mlir::Type addrFieldTy = recTy.getTypeList()[0].second;
+  auto fieldIndexType = fir::FieldType::get(cptrTy.getContext());
+  mlir::Value addrFieldIndex = builder.create<fir::FieldIndexOp>(
+      loc, fieldIndexType, addrFieldName, recTy,
+      /*typeParams=*/mlir::ValueRange{});
+  return {addrFieldIndex, addrFieldTy};
+}
+
 mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder,
                                                mlir::Location loc,
                                                mlir::Value cPtr,
                                                mlir::Type ty) {
-  assert(mlir::isa<fir::RecordType>(ty));
-  auto recTy = mlir::dyn_cast<fir::RecordType>(ty);
-  assert(recTy.getTypeList().size() == 1);
-  auto fieldName = recTy.getTypeList()[0].first;
-  mlir::Type fieldTy = recTy.getTypeList()[0].second;
-  auto fieldIndexType = fir::FieldType::get(ty.getContext());
-  mlir::Value field =
-      builder.create<fir::FieldIndexOp>(loc, fieldIndexType, fieldName, recTy,
-                                        /*typeParams=*/mlir::ValueRange{});
-  return builder.create<fir::CoordinateOp>(loc, builder.getRefType(fieldTy),
-                                           cPtr, field);
+  auto [addrFieldIndex, addrFieldTy] =
+      genCPtrOrCFunptrFieldIndex(builder, loc, ty);
+  return builder.create<fir::CoordinateOp>(loc, builder.getRefType(addrFieldTy),
+                                           cPtr, addrFieldIndex);
+}
+
+mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
+                                                mlir::Location loc,
+                                                mlir::Value cPtr) {
+  mlir::Type cPtrTy = fir::unwrapRefType(cPtr.getType());
+  if (fir::isa_ref_type(cPtr.getType())) {
+    mlir::Value cPtrAddr =
+        fir::factory::genCPtrOrCFunptrAddr(builder, loc, cPtr, cPtrTy);
+    return builder.create<fir::LoadOp>(loc, cPtrAddr);
+  }
+  auto [addrFieldIndex, addrFieldTy] =
+      genCPtrOrCFunptrFieldIndex(builder, loc, cPtrTy);
+  auto arrayAttr =
+      builder.getArrayAttr({builder.getIntegerAttr(builder.getIndexType(), 0)});
+  return builder.create<fir::ExtractValueOp>(loc, addrFieldTy, cPtr, arrayAttr);
 }
 
 fir::BoxValue fir::factory::createBoxValue(fir::FirOpBuilder &builder,
@@ -1596,15 +1624,6 @@ fir::BoxValue fir::factory::createBoxValue(fir::FirOpBuilder &builder,
   return fir::BoxValue(box, lbounds, explicitTypeParams);
 }
 
-mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
-                                                mlir::Location loc,
-                                                mlir::Value cPtr) {
-  mlir::Type cPtrTy = fir::unwrapRefType(cPtr.getType());
-  mlir::Value cPtrAddr =
-      fir::factory::genCPtrOrCFunptrAddr(builder, loc, cPtr, cPtrTy);
-  return builder.create<fir::LoadOp>(loc, cPtrAddr);
-}
-
 mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder,
                                             mlir::Location loc,
                                             mlir::Type boxType) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 0e5e30a7024d8..22439010e7797 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -98,9 +98,6 @@ static bool isStaticallyPresent(const fir::ExtendedValue &exv) {
 /// IEEE module procedure names not yet implemented for genModuleProcTODO.
 static constexpr char ieee_int[] = "ieee_int";
 static constexpr char ieee_get_underflow_mode[] = "ieee_get_underflow_mode";
-static constexpr char ieee_next_after[] = "ieee_next_after";
-static constexpr char ieee_next_down[] = "ieee_next_down";
-static constexpr char ieee_next_up[] = "ieee_next_up";
 static constexpr char ieee_real[] = "ieee_real";
 static constexpr char ieee_rem[] = "ieee_rem";
 static constexpr char ieee_rint[] = "ieee_rint";
@@ -355,9 +352,9 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genIeeeMaxMin</*isMax=*/false, /*isNum=*/true, /*isMag=*/false>},
     {"ieee_min_num_mag",
      &I::genIeeeMaxMin</*isMax=*/false, /*isNum=*/true, /*isMag=*/true>},
-    {"ieee_next_after", &I::genModuleProcTODO<ieee_next_after>},
-    {"ieee_next_down", &I::genModuleProcTODO<ieee_next_down>},
-    {"ieee_next_up", &I::genModuleProcTODO<ieee_next_up>},
+    {"ieee_next_after", &I::genNearest<I::NearestProc::NextAfter>},
+    {"ieee_next_down", &I::genNearest<I::NearestProc::NextDown>},
+    {"ieee_next_up", &I::genNearest<I::NearestProc::NextUp>},
     {"ieee_quiet_eq", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OEQ>},
     {"ieee_quiet_ge", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OGE>},
     {"ieee_quiet_gt", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OGT>},
@@ -497,7 +494,7 @@ static constexpr IntrinsicHandler handlers[]{
        {"len", asValue},
        {"to", asAddr},
        {"topos", asValue}}}},
-    {"nearest", &I::genNearest},
+    {"nearest", &I::genNearest<I::NearestProc::Nearest>},
     {"nint", &I::genNint},
     {"norm2",
      &I::genNorm2,
@@ -3972,11 +3969,14 @@ IntrinsicLibrary::genIchar(mlir::Type resultType,
 //   8   Positive normal
 //   9   Positive infinity
 static constexpr int finiteTest = 0b0111111000;
+static constexpr int infiniteTest = 0b1000000100;
 static constexpr int nanTest = 0b0000000011;
 static constexpr int negativeTest = 0b0000111100;
 static constexpr int normalTest = 0b0101101000;
 static constexpr int positiveTest = 0b1111000000;
 static constexpr int snanTest = 0b0000000001;
+static constexpr int subnormalTest = 0b0010010000;
+static constexpr int zeroTest = 0b0001100000;
 
 mlir::Value IntrinsicLibrary::genIsFPClass(mlir::Type resultType,
                                            llvm::ArrayRef<mlir::Value> args,
@@ -3988,8 +3988,15 @@ mlir::Value IntrinsicLibrary::genIsFPClass(mlir::Type resultType,
   return builder.createConvert(loc, resultType, isfpclass);
 }
 
-/// Generate code to raise \p except if \p cond is absent, or present and true.
-void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
+// Generate a quiet NaN of a given floating point type.
+mlir::Value IntrinsicLibrary::genQNan(mlir::Type resultType) {
+  return genIeeeValue(resultType, builder.createIntegerConstant(
+                                      loc, builder.getIntegerType(8),
+                                      _FORTRAN_RUNTIME_IEEE_QUIET_NAN));
+}
+
+// Generate code to raise \p excepts if \p cond is absent, or present and true.
+void IntrinsicLibrary::genRaiseExcept(int excepts, mlir::Value cond) {
   fir::IfOp ifOp;
   if (cond) {
     ifOp = builder.create<fir::IfOp>(loc, cond, /*withElseRegion=*/false);
@@ -3998,8 +4005,8 @@ void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
   mlir::Type i32Ty = builder.getIntegerType(32);
   genRuntimeCall(
       "feraiseexcept", i32Ty,
-      fir::runtime::genMapException(
-          builder, loc, builder.createIntegerConstant(loc, i32Ty, except)));
+      fir::runtime::genMapExcept(
+          builder, loc, builder.createIntegerConstant(loc, i32Ty, excepts)));
   if (cond)
     builder.setInsertionPointAfter(ifOp);
 }
@@ -4363,14 +4370,14 @@ void IntrinsicLibrary::genIeeeGetFlag(llvm::ArrayRef<fir::ExtendedValue> args) {
   mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
   auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
   mlir::Value field = builder.create<fir::LoadOp>(loc, fieldRef);
-  mlir::Value exceptSet = IntrinsicLibrary::genRuntimeCall(
+  mlir::Value excepts = IntrinsicLibrary::genRuntimeCall(
       "fetestexcept", i32Ty,
-      fir::runtime::genMapException(
+      fir::runtime::genMapExcept(
           builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field)));
   mlir::Value logicalResult = builder.create<fir::ConvertOp>(
       loc, resultTy,
       builder.create<mlir::arith::CmpIOp>(loc, mlir::arith::CmpIPredicate::ne,
-                                          exceptSet, zero));
+                                          excepts, zero));
   builder.create<fir::StoreOp>(loc, logicalResult, flagValue);
 }
 
@@ -4391,7 +4398,7 @@ void IntrinsicLibrary::genIeeeGetHaltingMode(
       IntrinsicLibrary::genRuntimeCall("fegetexcept", i32Ty, {});
   mlir::Value intResult = builder.create<mlir::arith::AndIOp>(
       loc, haltSet,
-      fir::runtime::genMapException(
+      fir::runtime::genMapExcept(
           builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field)));
   mlir::Value logicalResult = builder.create<fir::ConvertOp>(
       loc, resultTy,
@@ -4657,7 +4664,6 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType,
     y1 = y;
   }
   mlir::Type i1Ty = builder.getI1Type();
-  mlir::Type i8Ty = builder.getIntegerType(8);
   mlir::arith::CmpFPredicate pred;
   mlir::Value cmp, result, resultIsX, resultIsY;
 
@@ -4698,12 +4704,10 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType,
   } else {
     resultIsX = resultIsY = builder.createBool(loc, false);
   }
-  mlir::Value qNaN =
-      genIeeeValue(resultType, builder.createIntegerConstant(
-                                   loc, i8Ty, _FORTRAN_RUNTIME_IEEE_QUIET_NAN));
   result = builder.create<mlir::arith::SelectOp>(
       loc, resultIsX, x,
-      builder.create<mlir::arith::SelectOp>(loc, resultIsY, y, qNaN));
+      builder.create<mlir::arith::SelectOp>(loc, resultIsY, y,
+                                            genQNan(resultType)));
   mlir::Value hasSNaNOp = builder.create<mlir::arith::OrIOp>(
       loc, genIsFPClass(builder.getI1Type(), args[0], snanTest),
       genIsFPClass(builder.getI1Type(), args[1], snanTest));
@@ -4747,7 +4751,7 @@ void IntrinsicLibrary::genIeeeSetFlagOrHaltingMode(
   mlir::Type i32Ty = builder.getIntegerType(32);
   auto [fieldRef, ignore] = getFieldRef(builder, loc, getBase(args[0]));
   mlir::Value field = builder.create<fir::LoadOp>(loc, fieldRef);
-  mlir::Value except = fir::runtime::genMapException(
+  mlir::Value except = fir::runtime::genMapExcept(
       builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field));
   auto ifOp = builder.create<fir::IfOp>(
       loc, builder.create<fir::ConvertOp>(loc, i1Ty, getBase(args[1])),
@@ -5610,16 +5614,186 @@ void IntrinsicLibrary::genMvbits(llvm::ArrayRef<fir::ExtendedValue> args) {
   builder.create<fir::StoreOp>(loc, res, toAddr);
 }
 
-// NEAREST
+// NEAREST, IEEE_NEXT_AFTER, IEEE_NEXT_DOWN, IEEE_NEXT_UP
+template <I::NearestProc proc>
 mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType,
                                          llvm::ArrayRef<mlir::Value> args) {
-  assert(args.size() == 2);
+  // NEAREST
+  //   Return the number adjacent to arg X in the direction of the infinity
+  //   with the sign of arg S. Terminate with an error if arg S is zero.
+  //   Generate exceptions as for IEEE_NEXT_AFTER.
+  // IEEE_NEXT_AFTER
+  //   Return isNan(Y) ? NaN : X==Y ? X : num adjacent to X in the dir of Y.
+  //   Signal IEEE_OVERFLOW, IEEE_INEXACT for finite X and infinite result.
+  //   Signal IEEE_UNDERFLOW, IEEE_INEXACT for subnormal result.
+  // IEEE_NEXT_DOWN
+  //   Return the number adjacent to X and less than X.
+  //   Signal IEEE_INVALID when X is a signaling NaN.
+  // IEEE_NEXT_UP
+  //   Return the number adjacent to X and greater than X.
+  //   Signal IEEE_INVALID when X is a signaling NaN.
+  //
+  // valueUp     -- true if a finite result must be larger than X.
+  // magnitudeUp -- true if a finite abs(result) must be larger than abs(X).
+  //
+  // if (isNextAfter && isNan(Y)) X = NaN // result = NaN
+  // if (isNan(X) || (isNextAfter && X == Y) || (isInfinite(X) && magnitudeUp))
+  //   result = X
+  // else if (isZero(X))
+  //   result = valueUp ? minPositiveSubnormal : minNegativeSubnormal
+  // else
+  //   result = magUp ? (X + minPositiveSubnormal) : (X - minPositiveSubnormal)
 
-  mlir::Value realX = fir::getBase(args[0]);
-  mlir::Value realS = fir::getBase(args[1]);
+  assert(args.size() == 1 || args.size() == 2);
+  mlir::Value x = args[0];
+  mlir::FloatType xType = mlir::dyn_cast<mlir::FloatType>(x.getType());
+  const unsigned xBitWidth = xType.getWidth();
+  mlir::Type i1Ty = builder.getI1Type();
+  if constexpr (proc == NearestProc::NextAfter)
+    // If isNan(Y), set X to a qNaN that will propagate to the resultIsX result.
+    x = builder.create<mlir::arith::SelectOp>(
+        loc, genIsFPClass(i1Ty, args[1], nanTest), genQNan(xType), x);
+  mlir::Value resultIsX = genIsFPClass(i1Ty, x, nanTest);
+  mlir::Type intType = builder.getIntegerType(xBitWidth);
+  mlir::Value one = builder.createIntegerConstant(loc, intType, 1);
 
-  return builder.createConvert(
-      loc, resultType, fir::runtime::genNearest(builder, loc, realX, realS));
+  // Set valueUp to true if a finite result must be larger than arg X.
+  mlir::Value valueUp;
+  if constexpr (proc == NearestProc::Nearest) {
+    // Arg S must not be zero.
+    fir::IfOp ifOp =
+        builder.create<fir::IfOp>(loc, genIsFPClass(i1Ty, args[1], zeroTest),
+                                  /*withElseRegion=*/false);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    fir::runtime::genReportFatalUserError(
+        builder, loc, "intrinsic nearest S argument is zero");
+    builder.setInsertionPointAfter(ifOp);
+    mlir::Value sSign = IntrinsicLibrary::genIeeeSignbit(intType, {args[1]});
+    valueUp = builder.create<mlir::arith::CmpIOp>(
+        loc, mlir::arith::CmpIPredicate::ne, sSign, one);
+  } else if constexpr (proc == NearestProc::NextAfter) {
+    // Convert X and Y to a common type to allow comparison. Direct conversions
+    // between kinds 2, 3, 10, and 16 are not all supported. These conversions
+    // are implemented by converting kind=2,3 values to kind=4, possibly
+    // followed with a conversion of that value to a larger type.
+    mlir::Value x1 = x;
+    mlir::Value y = args[1];
+    mlir::FloatType yType = mlir::dyn_cast<mlir::FloatType>(args[1].getType());
+    const unsigned yBitWidth = yType.getWidth();
+    if (xType != yType) {
+      mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext());
+      if (xBitWidth < 32)
+        x1 = builder.createConvert(loc, f32Ty, x1);
+      if (yBitWidth > 32 && yBitWidth > xBitWidth)
+        x1 = builder.createConvert(loc, yType, x1);
+      if (yBitWidth < 32)
+        y = builder.createConvert(loc, f32Ty, y);
+      if (xBitWidth > 32 && xBitWidth > yBitWidth)
+        y = builder.createConvert(loc, xType, y);
+    }
+    resultIsX = builder.create<mlir::arith::OrIOp>(
+        loc, resultIsX,
+        builder.create<mlir::arith::CmpFOp>(
+            loc, mlir::arith::CmpFPredicate::OEQ, x1, y));
+    valueUp = builder.create<mlir::arith::CmpFOp>(
+        loc, mlir::arith::CmpFPredicate::OLT, x1, y);
+  } else if constexpr (proc == NearestProc::NextDown) {
+    valueUp = builder.createBool(loc, false);
+  } else if constexpr (proc == NearestProc::NextUp) {
+    valueUp = builder.createBool(loc, true);
+  }
+  mlir::Value magnitudeUp = builder.create<mlir::arith::CmpIOp>(
+      loc, mlir::arith::CmpIPredicate::ne, valueUp,
+      IntrinsicLibrary::genIeeeSignbit(i1Ty, {args[0]}));
+  resultIsX = builder.create<mlir::arith::OrIOp>(
+      loc, resultIsX,
+      builder.create<mlir::arith::AndIOp>(
+          loc, genIsFPClass(i1Ty, x, infiniteTest), magnitudeUp));
+
+  // Result is X. (For ieee_next_after with isNan(Y), X has been set to a NaN.)
+  fir::IfOp outerIfOp = builder.create<fir::IfOp>(loc, resultType, resultIsX,
+                                                  /*withElseRegion=*/true);
+  builder.setInsertionPointToStart(&outerIfOp.getThenRegion().front());
+  if constexpr (proc == NearestProc::NextDown || proc == NearestProc::NextUp)
+    genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID,
+                   genIsFPClass(i1Ty, x, snanTest));
+  builder.create<fir::ResultOp>(loc, x);
+
+  // Result is minPositiveSubnormal or minNegativeSubnormal. (X is zero.)
+  builder.setInsertionPointToStart(&outerIfOp.getElseRegion().front());
+  mlir::Value resultIsMinSubnormal = builder.create<mlir::arith::CmpFOp>(
+      loc, mlir::arith::CmpFPredicate::OEQ, x,
+      builder.createRealZeroConstant(loc, xType));
+  fir::IfOp innerIfOp =
+      builder.create<fir::IfOp>(loc, resultType, resultIsMinSubnormal,
+                                /*withElseRegion=*/true);
+  builder.setInsertionPointToStart(&innerIfOp.getThenRegion().front());
+  mlir::Value minPositiveSubnormal =
+      builder.create<mlir::arith::BitcastOp>(loc, resultType, one);
+  mlir::Value minNegativeSubnormal = builder.create<mlir::arith::BitcastOp>(
+      loc, resultType,
+      builder.create<mlir::arith::ConstantOp>(
+          loc, intType,
+          builder.getIntegerAttr(
+              intType, llvm::APInt::getBitsSetWithWrap(
+                           xBitWidth, /*lo=*/xBitWidth - 1, /*hi=*/1))));
+  mlir::Value result = builder.create<mlir::arith::SelectOp>(
+      loc, valueUp, minPositiveSubnormal, minNegativeSubnormal);
+  if constexpr (proc == NearestProc::Nearest || proc == NearestProc::NextAfter)
+    genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW |
+                   _FORTRAN_RUNTIME_IEEE_INEXACT);
+  builder.create<fir::ResultOp>(loc, result);
+
+  // Result is (X + minPositiveSubnormal) or (X - minPositiveSubnormal).
+  builder.setInsertionPointToStart(&innerIfOp.getElseRegion().front());
+  if (xBitWidth == 80) {
+    // Kind 10. Call std::nextafter, which generates exceptions as required
+    // for ieee_next_after and nearest. Override this exception processing
+    // for ieee_next_down and ieee_next_up.
+    constexpr bool overrideExceptionGeneration =
+        proc == NearestProc::NextDown || proc == NearestProc::NextUp;
+    [[maybe_unused]] mlir::Type i32Ty;
+    [[maybe_unused]] mlir::Value allExcepts, excepts, mask;
+    if constexpr (overrideExceptionGeneration) {
+      i32Ty = builder.getIntegerType(32);
+      allExcepts = fir::runtime::genMapExcept(
+          builder, loc,
+          builder.createIntegerConstant(loc, i32Ty, _FORTRAN_RUNTIME_IEEE_ALL));
+      excepts = genRuntimeCall("fetestexcept", i32Ty, allExcepts);
+      mask = genRuntimeCall("fedisableexcept", i32Ty, allExcepts);
+    }
+    result = fir::runtime::genNearest(builder, loc, x, valueUp);
+    if constexpr (overrideExceptionGeneration) {
+      genRuntimeCall("feclearexcept", i32Ty, allExcepts);
+      genRuntimeCall("feraiseexcept", i32Ty, excepts);
+      genRuntimeCall("feenableexcept", i32Ty, mask);
+    }
+    builder.create<fir::ResultOp>(loc, result);
+  } else {
+    // Kind 2, 3, 4, 8, 16. Increment or decrement X cast to integer.
+    mlir::Value intX = builder.create<mlir::arith::BitcastOp>(loc, intType, x);
+    result = builder.create<mlir::arith::BitcastOp>(
+        loc, resultType,
+        builder.create<mlir::arith::SelectOp>(
+            loc, magnitudeUp,
+            builder.create<mlir::arith::AddIOp>(loc, intX, one),
+            builder.create<mlir::arith::SubIOp>(loc, intX, one)));
+    if constexpr (proc == NearestProc::Nearest ||
+                  proc == NearestProc::NextAfter) {
+      genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW |
+                         _FORTRAN_RUNTIME_IEEE_INEXACT,
+                     genIsFPClass(i1Ty, result, infiniteTest));
+      genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW |
+                         _FORTRAN_RUNTIME_IEEE_INEXACT,
+                     genIsFPClass(i1Ty, result, subnormalTest));
+    }
+    builder.create<fir::ResultOp>(loc, result);
+  }
+
+  builder.setInsertionPointAfter(innerIfOp);
+  builder.create<fir::ResultOp>(loc, innerIfOp.getResult(0));
+  builder.setInsertionPointAfter(outerIfOp);
+  return outerIfOp.getResult(0);
 }
 
 // NINT
diff --git a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
index 294ccbaf82a06..8775b50437af2 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
@@ -13,10 +13,10 @@
 
 using namespace Fortran::runtime;
 
-mlir::Value fir::runtime::genMapException(fir::FirOpBuilder &builder,
-                                          mlir::Location loc,
-                                          mlir::Value except) {
+mlir::Value fir::runtime::genMapExcept(fir::FirOpBuilder &builder,
+                                       mlir::Location loc,
+                                       mlir::Value excepts) {
   mlir::func::FuncOp func{
       fir::runtime::getRuntimeFunc<mkRTKey(MapException)>(loc, builder)};
-  return builder.create<fir::CallOp>(loc, func, except).getResult(0);
+  return builder.create<fir::CallOp>(loc, func, excepts).getResult(0);
 }
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index 1d13248db5984..d98288419d68f 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -406,10 +406,10 @@ mlir::Value fir::runtime::genModulo(fir::FirOpBuilder &builder,
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
 
-/// Generate call to Nearest intrinsic runtime routine.
+/// Generate call to Nearest intrinsic or a "Next" intrinsic module procedure.
 mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
                                      mlir::Location loc, mlir::Value x,
-                                     mlir::Value s) {
+                                     mlir::Value valueUp) {
   mlir::func::FuncOp func;
   mlir::Type fltTy = x.getType();
 
@@ -425,19 +425,7 @@ mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
     fir::intrinsicTypeTODO(builder, fltTy, loc, "NEAREST");
 
   auto funcTy = func.getFunctionType();
-
-  mlir::Type sTy = s.getType();
-  mlir::Value zero = builder.createRealZeroConstant(loc, sTy);
-  auto cmp = builder.create<mlir::arith::CmpFOp>(
-      loc, mlir::arith::CmpFPredicate::OGT, s, zero);
-
-  mlir::Type boolTy = mlir::IntegerType::get(builder.getContext(), 1);
-  mlir::Value False = builder.createIntegerConstant(loc, boolTy, 0);
-  mlir::Value True = builder.createIntegerConstant(loc, boolTy, 1);
-
-  mlir::Value positive =
-      builder.create<mlir::arith::SelectOp>(loc, cmp, True, False);
-  auto args = fir::runtime::createArguments(builder, loc, funcTy, x, positive);
+  auto args = fir::runtime::createArguments(builder, loc, funcTy, x, valueUp);
 
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f9ea92a843b23..7934f1fdad2a0 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -23,6 +23,8 @@
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "flang/Optimizer/Support/TypeCode.h"
 #include "flang/Optimizer/Support/Utils.h"
+#include "flang/Runtime/allocator-registry.h"
+#include "flang/Runtime/descriptor.h"
 #include "flang/Semantics/runtime-type-info.h"
 #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
@@ -1224,8 +1226,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
                                  fir::BaseBoxType boxTy, mlir::Type inputType,
                                  mlir::ConversionPatternRewriter &rewriter,
                                  unsigned rank, mlir::Value eleSize,
-                                 mlir::Value cfiTy,
-                                 mlir::Value typeDesc) const {
+                                 mlir::Value cfiTy, mlir::Value typeDesc,
+                                 int allocatorIdx = kDefaultAllocator) const {
     auto llvmBoxTy = this->lowerTy().convertBoxTypeAsStruct(boxTy, rank);
     bool isUnlimitedPolymorphic = fir::isUnlimitedPolymorphicType(boxTy);
     bool useInputType = fir::isPolymorphicType(boxTy) || isUnlimitedPolymorphic;
@@ -1241,10 +1243,19 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
     descriptor =
         insertField(rewriter, loc, descriptor, {kAttributePosInBox},
                     this->genI32Constant(loc, rewriter, getCFIAttr(boxTy)));
+
     const bool hasAddendum = fir::boxHasAddendum(boxTy);
+
+    // Descriptor used to set the correct value of the extra field.
+    Fortran::runtime::StaticDescriptor<0> staticDescriptor;
+    Fortran::runtime::Descriptor &desc{staticDescriptor.descriptor()};
+    desc.raw().extra = 0;
+    desc.SetAllocIdx(allocatorIdx);
+    if (hasAddendum)
+      desc.SetHasAddendum();
     descriptor =
-        insertField(rewriter, loc, descriptor, {kF18AddendumPosInBox},
-                    this->genI32Constant(loc, rewriter, hasAddendum ? 1 : 0));
+        insertField(rewriter, loc, descriptor, {kExtraPosInBox},
+                    this->genI32Constant(loc, rewriter, desc.raw().extra));
 
     if (hasAddendum) {
       unsigned typeDescFieldId = getTypeDescFieldId(boxTy);
@@ -1299,6 +1310,13 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
         typeparams.push_back(substrParams[1]);
     }
 
+    int allocatorIdx = 0;
+    if constexpr (std::is_same_v<BOX, fir::EmboxOp> ||
+                  std::is_same_v<BOX, fir::cg::XEmboxOp>) {
+      if (box.getAllocatorIdx())
+        allocatorIdx = *box.getAllocatorIdx();
+    }
+
     // Write each of the fields with the appropriate values.
     // When emboxing an element to a polymorphic descriptor, use the
     // input type since the destination descriptor type has not the exact
@@ -1320,8 +1338,9 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
                                     cfiTy.getType(), rewriter, kTypePosInBox);
     }
     auto mod = box->template getParentOfType<mlir::ModuleOp>();
-    mlir::Value descriptor = populateDescriptor(
-        loc, mod, boxTy, inputType, rewriter, rank, eleSize, cfiTy, typeDesc);
+    mlir::Value descriptor =
+        populateDescriptor(loc, mod, boxTy, inputType, rewriter, rank, eleSize,
+                           cfiTy, typeDesc, allocatorIdx);
 
     return {boxTy, descriptor, eleSize};
   }
@@ -1431,8 +1450,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
             fir::unwrapPassByRefType(memref.getType()))))
       TODO(xbox.getLoc(),
            "fir.embox codegen dynamic size component in derived type");
-    indices.append(operands.begin() + xbox.subcomponentOffset(),
-                   operands.begin() + xbox.subcomponentOffset() +
+    indices.append(operands.begin() + xbox.getSubcomponentOperandIndex(),
+                   operands.begin() + xbox.getSubcomponentOperandIndex() +
                        xbox.getSubcomponent().size());
   }
 
@@ -1487,7 +1506,7 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
     mlir::Value sourceBox;
     mlir::Type sourceBoxType;
     if (embox.getSourceBox()) {
-      sourceBox = operands[embox.getSourceBoxOffset()];
+      sourceBox = operands[embox.getSourceBoxOperandIndex()];
       sourceBoxType = embox.getSourceBox().getType();
     }
     assert(!embox.getShape() && "There should be no dims on this embox op");
@@ -1519,7 +1538,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     mlir::Value sourceBox;
     mlir::Type sourceBoxType;
     if (xbox.getSourceBox()) {
-      sourceBox = operands[xbox.getSourceBoxOffset()];
+      sourceBox = operands[xbox.getSourceBoxOperandIndex()];
       sourceBoxType = xbox.getSourceBox().getType();
     }
     auto [boxTy, dest, resultEleSize] = consDescriptorPrefix(
@@ -1529,11 +1548,11 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     // Generate the triples in the dims field of the descriptor
     auto i64Ty = mlir::IntegerType::get(xbox.getContext(), 64);
     assert(!xbox.getShape().empty() && "must have a shape");
-    unsigned shapeOffset = xbox.shapeOffset();
+    unsigned shapeOffset = xbox.getShapeOperandIndex();
     bool hasShift = !xbox.getShift().empty();
-    unsigned shiftOffset = xbox.shiftOffset();
+    unsigned shiftOffset = xbox.getShiftOperandIndex();
     bool hasSlice = !xbox.getSlice().empty();
-    unsigned sliceOffset = xbox.sliceOffset();
+    unsigned sliceOffset = xbox.getSliceOperandIndex();
     mlir::Location loc = xbox.getLoc();
     mlir::Value zero = genConstantIndex(loc, i64Ty, rewriter, 0);
     mlir::Value one = genConstantIndex(loc, i64Ty, rewriter, 1);
@@ -1682,7 +1701,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
       if (hasSubcomp)
         getSubcomponentIndices(xbox, xbox.getMemref(), operands, fieldIndices);
       if (hasSubstr)
-        substringOffset = operands[xbox.substrOffset()];
+        substringOffset = operands[xbox.getSubstrOperandIndex()];
       mlir::Type llvmBaseType =
           convertType(fir::unwrapRefType(xbox.getMemref().getType()));
       base = genBoxOffsetGep(rewriter, loc, base, llvmBaseType, ptrOffset,
@@ -1843,7 +1862,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
       if (!rebox.getSubcomponent().empty())
         getSubcomponentIndices(rebox, rebox.getBox(), operands, fieldIndices);
       if (!rebox.getSubstr().empty())
-        substringOffset = operands[rebox.substrOffset()];
+        substringOffset = operands[rebox.getSubstrOperandIndex()];
       base = genBoxOffsetGep(rewriter, loc, base, llvmBaseObjectType, zero,
                              /*cstInteriorIndices=*/std::nullopt, fieldIndices,
                              substringOffset);
@@ -1862,8 +1881,8 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
     llvm::SmallVector<mlir::Value> slicedStrides;
     mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1);
     const bool sliceHasOrigins = !rebox.getShift().empty();
-    unsigned sliceOps = rebox.sliceOffset();
-    unsigned shiftOps = rebox.shiftOffset();
+    unsigned sliceOps = rebox.getSliceOperandIndex();
+    unsigned shiftOps = rebox.getShiftOperandIndex();
     auto strideOps = inputStrides.begin();
     const unsigned inputRank = inputStrides.size();
     for (unsigned i = 0; i < inputRank;
@@ -1912,9 +1931,10 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
              mlir::Value base, mlir::ValueRange inputExtents,
              mlir::ValueRange inputStrides, mlir::ValueRange operands,
              mlir::ConversionPatternRewriter &rewriter) const {
-    mlir::ValueRange reboxShifts{operands.begin() + rebox.shiftOffset(),
-                                 operands.begin() + rebox.shiftOffset() +
-                                     rebox.getShift().size()};
+    mlir::ValueRange reboxShifts{
+        operands.begin() + rebox.getShiftOperandIndex(),
+        operands.begin() + rebox.getShiftOperandIndex() +
+            rebox.getShift().size()};
     if (rebox.getShape().empty()) {
       // Only setting new lower bounds.
       return finalizeRebox(rebox, destBoxTy, dest, base, reboxShifts,
@@ -1934,7 +1954,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
                              ? genConstantIndex(loc, idxTy, rewriter, 1)
                              : inputStrides[0];
     for (unsigned i = 0; i < rebox.getShape().size(); ++i) {
-      mlir::Value rawExtent = operands[rebox.shapeOffset() + i];
+      mlir::Value rawExtent = operands[rebox.getShapeOperandIndex() + i];
       mlir::Value extent = integerCast(loc, rewriter, idxTy, rawExtent);
       newExtents.emplace_back(extent);
       newStrides.emplace_back(stride);
@@ -2137,10 +2157,10 @@ struct XArrayCoorOpConversion
     assert(coor.getShift().empty() || coor.getShift().size() == rank);
     assert(coor.getSlice().empty() || coor.getSlice().size() == 3 * rank);
     mlir::Type idxTy = lowerTy().indexType();
-    unsigned indexOffset = coor.indicesOffset();
-    unsigned shapeOffset = coor.shapeOffset();
-    unsigned shiftOffset = coor.shiftOffset();
-    unsigned sliceOffset = coor.sliceOffset();
+    unsigned indexOffset = coor.getIndicesOperandIndex();
+    unsigned shapeOffset = coor.getShapeOperandIndex();
+    unsigned shiftOffset = coor.getShiftOperandIndex();
+    unsigned sliceOffset = coor.getSliceOperandIndex();
     auto sliceOps = coor.getSlice().begin();
     mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1);
     mlir::Value prevExt = one;
@@ -2238,7 +2258,7 @@ struct XArrayCoorOpConversion
       }
       llvm::SmallVector<mlir::Value> indices = convertSubcomponentIndices(
           loc, elementType,
-          operands.slice(coor.subcomponentOffset(),
+          operands.slice(coor.getSubcomponentOperandIndex(),
                          coor.getSubcomponent().size()));
       args.append(indices.begin(), indices.end());
       rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(coor, llvmPtrTy,
@@ -2262,7 +2282,7 @@ struct XArrayCoorOpConversion
         if (fir::characterWithDynamicLen(eleTy)) {
           assert(coor.getLenParams().size() == 1);
           auto length = integerCast(loc, rewriter, idxTy,
-                                    operands[coor.lenParamsOffset()]);
+                                    operands[coor.getLenParamsOperandIndex()]);
           offset = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, offset,
                                                       length, nsw);
         } else {
@@ -2275,7 +2295,7 @@ struct XArrayCoorOpConversion
       args.push_back(offset);
       llvm::SmallVector<mlir::Value> indices = convertSubcomponentIndices(
           loc, gepObjectType,
-          operands.slice(coor.subcomponentOffset(),
+          operands.slice(coor.getSubcomponentOperandIndex(),
                          coor.getSubcomponent().size()));
       args.append(indices.begin(), indices.end());
     }
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index c57f5140c29eb..86b81d8d652b2 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -18,8 +18,8 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "mlir/IR/Iterators.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 
@@ -109,7 +109,7 @@ class EmboxConversion : public mlir::OpRewritePattern<fir::EmboxOp> {
     auto xbox = rewriter.create<fir::cg::XEmboxOp>(
         loc, embox.getType(), embox.getMemref(), shapeOpers, std::nullopt,
         std::nullopt, std::nullopt, std::nullopt, embox.getTypeparams(),
-        embox.getSourceBox());
+        embox.getSourceBox(), embox.getAllocatorIdxAttr());
     LLVM_DEBUG(llvm::dbgs() << "rewriting " << embox << " to " << xbox << '\n');
     rewriter.replaceOp(embox, xbox.getOperation()->getResults());
     return mlir::success();
@@ -145,7 +145,7 @@ class EmboxConversion : public mlir::OpRewritePattern<fir::EmboxOp> {
     auto xbox = rewriter.create<fir::cg::XEmboxOp>(
         loc, embox.getType(), embox.getMemref(), shapeOpers, shiftOpers,
         sliceOpers, subcompOpers, substrOpers, embox.getTypeparams(),
-        embox.getSourceBox());
+        embox.getSourceBox(), embox.getAllocatorIdxAttr());
     LLVM_DEBUG(llvm::dbgs() << "rewriting " << embox << " to " << xbox << '\n');
     rewriter.replaceOp(embox, xbox.getOperation()->getResults());
     return mlir::success();
@@ -325,6 +325,25 @@ class DummyScopeOpConversion
   }
 };
 
+/// Simple DCE to erase fir.shape/shift/slice/unused shape operands after this
+/// pass (fir.shape and like have no codegen).
+/// mlir::RegionDCE is expensive and requires running
+/// mlir::eraseUnreachableBlocks. It does things that are not needed here, like
+/// removing unused block arguments. fir.shape/shift/slice cannot be block
+/// arguments.
+/// This helper does a naive backward walk of the IR. It is not even guaranteed
+/// to walk blocks according to backward dominance, but that is good enough for
+/// what is done here, fir.shape/shift/slice have no usages anymore. The
+/// backward walk allows getting rid of most of the unused operands, it is not a
+/// problem to leave some in the weird cases.
+static void simpleDCE(mlir::RewriterBase &rewriter, mlir::Operation *op) {
+  op->walk<mlir::WalkOrder::PostOrder, mlir::ReverseIterator>(
+      [&](mlir::Operation *subOp) {
+        if (mlir::isOpTriviallyDead(subOp))
+          rewriter.eraseOp(subOp);
+      });
+}
+
 class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 public:
   using CodeGenRewriteBase<CodeGenRewrite>::CodeGenRewriteBase;
@@ -356,7 +375,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
     }
     // Erase any residual (fir.shape, fir.slice...).
     mlir::IRRewriter rewriter(&context);
-    (void)mlir::runRegionDCE(rewriter, mod->getRegions());
+    simpleDCE(rewriter, mod.getOperation());
   }
 };
 
diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
index 59f67f4fcad44..1043494b6fb48 100644
--- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
+++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
@@ -177,7 +177,7 @@ bool LLVMTypeConverter::requiresExtendedDesc(mlir::Type boxElementType) const {
 // the addendum defined in descriptor.h.
 mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
                                                      int rank) const {
-  // (base_addr*, elem_len, version, rank, type, attribute, f18Addendum, [dim]
+  // (base_addr*, elem_len, version, rank, type, attribute, extra, [dim]
   llvm::SmallVector<mlir::Type> dataDescFields;
   mlir::Type ele = box.getEleTy();
   // remove fir.heap/fir.ref/fir.ptr
@@ -206,9 +206,9 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
   // attribute
   dataDescFields.push_back(
       getDescFieldTypeModel<kAttributePosInBox>()(&getContext()));
-  // f18Addendum
+  // extra
   dataDescFields.push_back(
-      getDescFieldTypeModel<kF18AddendumPosInBox>()(&getContext()));
+      getDescFieldTypeModel<kExtraPosInBox>()(&getContext()));
   // [dims]
   if (rank == unknownRank()) {
     if (auto seqTy = mlir::dyn_cast<SequenceType>(ele))
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index a0202a0159228..443e94ae6606f 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -298,5 +298,6 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr,
 void FIROpsDialect::registerAttributes() {
   addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
                 LowerBoundAttr, PointIntervalAttr, RealAttr, ReduceAttr,
-                SubclassAttr, UpperBoundAttr>();
+                SubclassAttr, UpperBoundAttr, LocationKindAttr,
+                LocationKindArrayAttr>();
 }
diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
index 3906aa553cb34..ff37310224e85 100644
--- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp
+++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
@@ -59,14 +59,16 @@ static mlir::FunctionType getNewFunctionType(mlir::FunctionType funcTy,
                                  /*resultTypes=*/{});
 }
 
+static mlir::Type getVoidPtrType(mlir::MLIRContext *context) {
+  return fir::ReferenceType::get(mlir::NoneType::get(context));
+}
+
 /// This is for function result types that are of type C_PTR from ISO_C_BINDING.
 /// Follow the ABI for interoperability with C.
 static mlir::FunctionType getCPtrFunctionType(mlir::FunctionType funcTy) {
-  auto resultType = funcTy.getResult(0);
-  assert(fir::isa_builtin_cptr_type(resultType));
-  llvm::SmallVector<mlir::Type> outputTypes;
-  auto recTy = mlir::dyn_cast<fir::RecordType>(resultType);
-  outputTypes.emplace_back(recTy.getTypeList()[0].second);
+  assert(fir::isa_builtin_cptr_type(funcTy.getResult(0)));
+  llvm::SmallVector<mlir::Type> outputTypes{
+      getVoidPtrType(funcTy.getContext())};
   return mlir::FunctionType::get(funcTy.getContext(), funcTy.getInputs(),
                                  outputTypes);
 }
@@ -109,15 +111,11 @@ class CallConversion : public mlir::OpRewritePattern<Op> {
           saveResult.getTypeparams());
 
     llvm::SmallVector<mlir::Type> newResultTypes;
-    // TODO: This should be generalized for derived types, and it is
-    // architecture and OS dependent.
     bool isResultBuiltinCPtr = fir::isa_builtin_cptr_type(result.getType());
-    Op newOp;
-    if (isResultBuiltinCPtr) {
-      auto recTy = mlir::dyn_cast<fir::RecordType>(result.getType());
-      newResultTypes.emplace_back(recTy.getTypeList()[0].second);
-    }
+    if (isResultBuiltinCPtr)
+      newResultTypes.emplace_back(getVoidPtrType(result.getContext()));
 
+    Op newOp;
     // fir::CallOp specific handling.
     if constexpr (std::is_same_v<Op, fir::CallOp>) {
       if (op.getCallee()) {
@@ -175,7 +173,7 @@ class CallConversion : public mlir::OpRewritePattern<Op> {
       FirOpBuilder builder(rewriter, module);
       mlir::Value saveAddr = fir::factory::genCPtrOrCFunptrAddr(
           builder, loc, save, result.getType());
-      rewriter.create<fir::StoreOp>(loc, newOp->getResult(0), saveAddr);
+      builder.createStoreWithConvert(loc, newOp->getResult(0), saveAddr);
     }
     op->dropAllReferences();
     rewriter.eraseOp(op);
@@ -210,42 +208,52 @@ class ReturnOpConversion : public mlir::OpRewritePattern<mlir::func::ReturnOp> {
                   mlir::PatternRewriter &rewriter) const override {
     auto loc = ret.getLoc();
     rewriter.setInsertionPoint(ret);
-    auto returnedValue = ret.getOperand(0);
-    bool replacedStorage = false;
-    if (auto *op = returnedValue.getDefiningOp())
-      if (auto load = mlir::dyn_cast<fir::LoadOp>(op)) {
-        auto resultStorage = load.getMemref();
-        // The result alloca may be behind a fir.declare, if any.
-        if (auto declare = mlir::dyn_cast_or_null<fir::DeclareOp>(
-                resultStorage.getDefiningOp()))
-          resultStorage = declare.getMemref();
-        // TODO: This should be generalized for derived types, and it is
-        // architecture and OS dependent.
-        if (fir::isa_builtin_cptr_type(returnedValue.getType())) {
-          rewriter.eraseOp(load);
-          auto module = ret->getParentOfType<mlir::ModuleOp>();
-          FirOpBuilder builder(rewriter, module);
-          mlir::Value retAddr = fir::factory::genCPtrOrCFunptrAddr(
-              builder, loc, resultStorage, returnedValue.getType());
-          mlir::Value retValue = rewriter.create<fir::LoadOp>(
-              loc, fir::unwrapRefType(retAddr.getType()), retAddr);
-          rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(
-              ret, mlir::ValueRange{retValue});
-          return mlir::success();
-        }
-        resultStorage.replaceAllUsesWith(newArg);
-        replacedStorage = true;
-        if (auto *alloc = resultStorage.getDefiningOp())
-          if (alloc->use_empty())
-            rewriter.eraseOp(alloc);
+    mlir::Value resultValue = ret.getOperand(0);
+    fir::LoadOp resultLoad;
+    mlir::Value resultStorage;
+    // Identify result local storage.
+    if (auto load = resultValue.getDefiningOp<fir::LoadOp>()) {
+      resultLoad = load;
+      resultStorage = load.getMemref();
+      // The result alloca may be behind a fir.declare, if any.
+      if (auto declare = resultStorage.getDefiningOp<fir::DeclareOp>())
+        resultStorage = declare.getMemref();
+    }
+    // Replace old local storage with new storage argument, unless
+    // the derived type is C_PTR/C_FUN_PTR, in which case the return
+    // type is updated to return void* (no new argument is passed).
+    if (fir::isa_builtin_cptr_type(resultValue.getType())) {
+      auto module = ret->getParentOfType<mlir::ModuleOp>();
+      FirOpBuilder builder(rewriter, module);
+      mlir::Value cptr = resultValue;
+      if (resultLoad) {
+        // Replace whole derived type load by component load.
+        cptr = resultLoad.getMemref();
+        rewriter.setInsertionPoint(resultLoad);
       }
-    // The result storage may have been optimized out by a memory to
-    // register pass, this is possible for fir.box results, or fir.record
-    // with no length parameters. Simply store the result in the result storage.
-    // at the return point.
-    if (!replacedStorage)
-      rewriter.create<fir::StoreOp>(loc, returnedValue, newArg);
-    rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(ret);
+      mlir::Value newResultValue =
+          fir::factory::genCPtrOrCFunptrValue(builder, loc, cptr);
+      newResultValue = builder.createConvert(
+          loc, getVoidPtrType(ret.getContext()), newResultValue);
+      rewriter.setInsertionPoint(ret);
+      rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(
+          ret, mlir::ValueRange{newResultValue});
+    } else if (resultStorage) {
+      resultStorage.replaceAllUsesWith(newArg);
+      rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(ret);
+    } else {
+      // The result storage may have been optimized out by a memory to
+      // register pass, this is possible for fir.box results, or fir.record
+      // with no length parameters. Simply store the result in the result
+      // storage. at the return point.
+      rewriter.create<fir::StoreOp>(loc, resultValue, newArg);
+      rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(ret);
+    }
+    // Delete result old local storage if unused.
+    if (resultStorage)
+      if (auto alloc = resultStorage.getDefiningOp<fir::AllocaOp>())
+        if (alloc->use_empty())
+          rewriter.eraseOp(alloc);
     return mlir::success();
   }
 
@@ -263,8 +271,6 @@ class AddrOfOpConversion : public mlir::OpRewritePattern<fir::AddrOfOp> {
                   mlir::PatternRewriter &rewriter) const override {
     auto oldFuncTy = mlir::cast<mlir::FunctionType>(addrOf.getType());
     mlir::FunctionType newFuncTy;
-    // TODO: This should be generalized for derived types, and it is
-    // architecture and OS dependent.
     if (oldFuncTy.getNumResults() != 0 &&
         fir::isa_builtin_cptr_type(oldFuncTy.getResult(0)))
       newFuncTy = getCPtrFunctionType(oldFuncTy);
@@ -298,8 +304,6 @@ class AbstractResultOpt
     // Convert function type itself if it has an abstract result.
     auto funcTy = mlir::cast<mlir::FunctionType>(func.getFunctionType());
     if (hasAbstractResult(funcTy)) {
-      // TODO: This should be generalized for derived types, and it is
-      // architecture and OS dependent.
       if (fir::isa_builtin_cptr_type(funcTy.getResult(0))) {
         func.setType(getCPtrFunctionType(funcTy));
         patterns.insert<ReturnOpConversion>(context, mlir::Value{});
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 8bb24fb6c8078..3c067bf946cfc 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -66,6 +66,9 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
   void handleGlobalOp(fir::GlobalOp glocalOp, mlir::LLVM::DIFileAttr fileAttr,
                       mlir::LLVM::DIScopeAttr scope,
                       mlir::SymbolTable *symbolTable);
+  void handleFuncOp(mlir::func::FuncOp funcOp, mlir::LLVM::DIFileAttr fileAttr,
+                    mlir::LLVM::DICompileUnitAttr cuAttr,
+                    mlir::SymbolTable *symbolTable);
 };
 
 static uint32_t getLineFromLoc(mlir::Location loc) {
@@ -76,8 +79,11 @@ static uint32_t getLineFromLoc(mlir::Location loc) {
 }
 
 bool debugInfoIsAlreadySet(mlir::Location loc) {
-  if (mlir::isa<mlir::FusedLoc>(loc))
+  if (mlir::isa<mlir::FusedLoc>(loc)) {
+    if (loc->findInstanceOf<mlir::FusedLocWith<fir::LocationKindAttr>>())
+      return false;
     return true;
+  }
   return false;
 }
 
@@ -126,7 +132,7 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
   auto localVarAttr = mlir::LLVM::DILocalVariableAttr::get(
       context, scopeAttr, mlir::StringAttr::get(context, result.second.name),
       fileAttr, getLineFromLoc(declOp.getLoc()), argNo, /* alignInBits*/ 0,
-      tyAttr);
+      tyAttr, mlir::LLVM::DIFlags::Zero);
   declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr));
 }
 
@@ -204,11 +210,112 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
   globalOp->setLoc(builder.getFusedLoc({globalOp->getLoc()}, gvAttr));
 }
 
+void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
+                                    mlir::LLVM::DIFileAttr fileAttr,
+                                    mlir::LLVM::DICompileUnitAttr cuAttr,
+                                    mlir::SymbolTable *symbolTable) {
+  mlir::Location l = funcOp->getLoc();
+  // If fused location has already been created then nothing to do
+  // Otherwise, create a fused location.
+  if (debugInfoIsAlreadySet(l))
+    return;
+
+  mlir::ModuleOp module = getOperation();
+  mlir::MLIRContext *context = &getContext();
+  mlir::OpBuilder builder(context);
+  llvm::StringRef fileName(fileAttr.getName());
+  llvm::StringRef filePath(fileAttr.getDirectory());
+  unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
+                        ? llvm::dwarf::getCallingConvention("DW_CC_program")
+                        : llvm::dwarf::getCallingConvention("DW_CC_normal");
+
+  if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
+    fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
+    filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
+  }
+
+  mlir::StringAttr fullName = mlir::StringAttr::get(context, funcOp.getName());
+  mlir::Attribute attr = funcOp->getAttr(fir::getInternalFuncNameAttrName());
+  mlir::StringAttr funcName =
+      (attr) ? mlir::cast<mlir::StringAttr>(attr)
+             : mlir::StringAttr::get(context, funcOp.getName());
+
+  auto result = fir::NameUniquer::deconstruct(funcName);
+  funcName = mlir::StringAttr::get(context, result.second.name);
+
+  llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
+  fir::DebugTypeGenerator typeGen(module);
+  for (auto resTy : funcOp.getResultTypes()) {
+    auto tyAttr = typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc());
+    types.push_back(tyAttr);
+  }
+  for (auto inTy : funcOp.getArgumentTypes()) {
+    auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr,
+                                      cuAttr, funcOp.getLoc());
+    types.push_back(tyAttr);
+  }
+
+  mlir::LLVM::DISubroutineTypeAttr subTypeAttr =
+      mlir::LLVM::DISubroutineTypeAttr::get(context, CC, types);
+  mlir::LLVM::DIFileAttr funcFileAttr =
+      mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
+
+  // Only definitions need a distinct identifier and a compilation unit.
+  mlir::DistinctAttr id;
+  mlir::LLVM::DIScopeAttr Scope = fileAttr;
+  mlir::LLVM::DICompileUnitAttr compilationUnit;
+  mlir::LLVM::DISubprogramFlags subprogramFlags =
+      mlir::LLVM::DISubprogramFlags{};
+  if (isOptimized)
+    subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
+  if (!funcOp.isExternal()) {
+    id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+    compilationUnit = cuAttr;
+    subprogramFlags =
+        subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
+  }
+  unsigned line = getLineFromLoc(l);
+  if (fir::isInternalProcedure(funcOp)) {
+    // For contained functions, the scope is the parent subroutine.
+    mlir::SymbolRefAttr sym = mlir::cast<mlir::SymbolRefAttr>(
+        funcOp->getAttr(fir::getHostSymbolAttrName()));
+    if (sym) {
+      if (auto func =
+              symbolTable->lookup<mlir::func::FuncOp>(sym.getLeafReference())) {
+        // Make sure that parent is processed.
+        handleFuncOp(func, fileAttr, cuAttr, symbolTable);
+        if (auto fusedLoc =
+                mlir::dyn_cast_if_present<mlir::FusedLoc>(func.getLoc())) {
+          if (auto spAttr =
+                  mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(
+                      fusedLoc.getMetadata()))
+            Scope = spAttr;
+        }
+      }
+    }
+  } else if (!result.second.modules.empty()) {
+    Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
+                                  line - 1, false);
+  }
+
+  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+      context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+      line, line, subprogramFlags, subTypeAttr);
+  funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
+
+  // Don't process variables if user asked for line tables only.
+  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
+    return;
+
+  funcOp.walk([&](fir::cg::XDeclareOp declOp) {
+    handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable);
+  });
+}
+
 void AddDebugInfoPass::runOnOperation() {
   mlir::ModuleOp module = getOperation();
   mlir::MLIRContext *context = &getContext();
   mlir::SymbolTable symbolTable(module);
-  mlir::OpBuilder builder(context);
   llvm::StringRef fileName;
   std::string filePath;
   // We need 2 type of file paths here.
@@ -245,76 +352,7 @@ void AddDebugInfoPass::runOnOperation() {
       isOptimized, debugLevel);
 
   module.walk([&](mlir::func::FuncOp funcOp) {
-    mlir::Location l = funcOp->getLoc();
-    // If fused location has already been created then nothing to do
-    // Otherwise, create a fused location.
-    if (debugInfoIsAlreadySet(l))
-      return;
-
-    unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
-                          ? llvm::dwarf::getCallingConvention("DW_CC_program")
-                          : llvm::dwarf::getCallingConvention("DW_CC_normal");
-
-    if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
-      fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
-      filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
-    }
-
-    mlir::StringAttr fullName =
-        mlir::StringAttr::get(context, funcOp.getName());
-    auto result = fir::NameUniquer::deconstruct(funcOp.getName());
-    mlir::StringAttr funcName =
-        mlir::StringAttr::get(context, result.second.name);
-
-    llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
-    fir::DebugTypeGenerator typeGen(module);
-    for (auto resTy : funcOp.getResultTypes()) {
-      auto tyAttr =
-          typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc());
-      types.push_back(tyAttr);
-    }
-    for (auto inTy : funcOp.getArgumentTypes()) {
-      auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr,
-                                        cuAttr, funcOp.getLoc());
-      types.push_back(tyAttr);
-    }
-
-    mlir::LLVM::DISubroutineTypeAttr subTypeAttr =
-        mlir::LLVM::DISubroutineTypeAttr::get(context, CC, types);
-    mlir::LLVM::DIFileAttr funcFileAttr =
-        mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
-
-    // Only definitions need a distinct identifier and a compilation unit.
-    mlir::DistinctAttr id;
-    mlir::LLVM::DIScopeAttr Scope = fileAttr;
-    mlir::LLVM::DICompileUnitAttr compilationUnit;
-    mlir::LLVM::DISubprogramFlags subprogramFlags =
-        mlir::LLVM::DISubprogramFlags{};
-    if (isOptimized)
-      subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
-    if (!funcOp.isExternal()) {
-      id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
-      compilationUnit = cuAttr;
-      subprogramFlags =
-          subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
-    }
-    unsigned line = getLineFromLoc(l);
-    if (!result.second.modules.empty())
-      Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
-                                    line - 1, false);
-
-    auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-        context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
-        line, line, subprogramFlags, subTypeAttr);
-    funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-
-    // Don't process variables if user asked for line tables only.
-    if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
-      return;
-
-    funcOp.walk([&](fir::cg::XDeclareOp declOp) {
-      handleDeclareOp(declOp, fileAttr, spAttr, typeGen, &symbolTable);
-    });
+    handleFuncOp(funcOp, fileAttr, cuAttr, &symbolTable);
   });
   // Process any global which was not processed through DeclareOp.
   if (debugLevel == mlir::LLVM::DIEmissionKind::Full) {
diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
index 35203fe89f5bc..ddaa3c5f404f0 100644
--- a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
@@ -109,10 +109,10 @@ class OMPMapInfoFinalizationPass
     if (auto mapClauseOwner =
             llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target)) {
       llvm::SmallVector<mlir::Value> newMapOps;
-      mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+      mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars();
 
-      for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
-        if (mapOperandsArr[i] == op) {
+      for (size_t i = 0; i < mapVarsArr.size(); ++i) {
+        if (mapVarsArr[i] == op) {
           // Push new implicit maps generated for the descriptor.
           newMapOps.push_back(baseAddr);
 
@@ -120,13 +120,13 @@ class OMPMapInfoFinalizationPass
           // new additional map operand with an appropriate BlockArgument,
           // as the printing and later processing currently requires a 1:1
           // mapping of BlockArgs to MapInfoOp's at the same placement in
-          // each array (BlockArgs and MapOperands).
+          // each array (BlockArgs and MapVars).
           if (auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target))
             targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc);
         }
-        newMapOps.push_back(mapOperandsArr[i]);
+        newMapOps.push_back(mapVarsArr[i]);
       }
-      mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+      mapClauseOwner.getMapVarsMutable().assign(newMapOps);
     }
 
     mlir::Value newDescParentMapOp = builder.create<mlir::omp::MapInfoOp>(
@@ -196,27 +196,27 @@ class OMPMapInfoFinalizationPass
       return;
 
     llvm::SmallVector<mlir::Value> newMapOps;
-    mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+    mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars();
     auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target);
 
-    for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
-      if (mapOperandsArr[i] == op) {
+    for (size_t i = 0; i < mapVarsArr.size(); ++i) {
+      if (mapVarsArr[i] == op) {
         for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) {
           newMapOps.push_back(mapMember);
           // for TargetOp's which have IsolatedFromAbove we must align the
           // new additional map operand with an appropriate BlockArgument,
           // as the printing and later processing currently requires a 1:1
           // mapping of BlockArgs to MapInfoOp's at the same placement in
-          // each array (BlockArgs and MapOperands).
+          // each array (BlockArgs and MapVars).
           if (targetOp) {
             targetOp.getRegion().insertArgument(i + j, mapMember.getType(),
                                                 targetOp->getLoc());
           }
         }
       }
-      newMapOps.push_back(mapOperandsArr[i]);
+      newMapOps.push_back(mapVarsArr[i]);
     }
-    mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+    mapClauseOwner.getMapVarsMutable().assign(newMapOps);
   }
 
   // This pass executes on omp::MapInfoOp's containing descriptor based types
diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp
index 746d04ad649d1..0bdc4c4e033c7 100644
--- a/flang/lib/Parser/Fortran-parsers.cpp
+++ b/flang/lib/Parser/Fortran-parsers.cpp
@@ -1304,10 +1304,16 @@ TYPE_PARSER(extension<LanguageFeature::CUDA>(construct<CUDAAttributesStmt>(
     defaulted(
         maybe("::"_tok) >> nonemptyList("expected names"_err_en_US, name)))))
 
-// Subtle: the name includes the surrounding slashes, which avoids
+// Subtle: A structure's name includes the surrounding slashes, which avoids
 // clashes with other uses of the name in the same scope.
-TYPE_PARSER(construct<StructureStmt>(
-    "STRUCTURE" >> maybe(sourced("/" >> name / "/")), optionalList(entityDecl)))
+constexpr auto structureName{maybe(sourced("/" >> name / "/"))};
+
+// Note that Parser<StructureStmt>{} has a mandatory list of entity-decls
+// and is used only by NestedStructureStmt{}.Parse() in user-state.cpp.
+TYPE_PARSER(construct<StructureStmt>("STRUCTURE" >> structureName,
+    localRecovery(
+        "entity declarations are required on a nested structure"_err_en_US,
+        nonemptyList(entityDecl), ok)))
 
 constexpr auto nestedStructureDef{
     CONTEXT_PARSER("nested STRUCTURE definition"_en_US,
@@ -1323,7 +1329,9 @@ TYPE_PARSER(construct<StructureField>(statement(StructureComponents{})) ||
 TYPE_CONTEXT_PARSER("STRUCTURE definition"_en_US,
     extension<LanguageFeature::DECStructures>(
         "nonstandard usage: STRUCTURE"_port_en_US,
-        construct<StructureDef>(statement(Parser<StructureStmt>{}),
+        construct<StructureDef>(
+            statement(construct<StructureStmt>(
+                "STRUCTURE" >> structureName, optionalList(entityDecl))),
             many(Parser<StructureField>{}),
             statement(construct<StructureDef::EndStructureStmt>(
                 "END STRUCTURE"_tok)))))
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index f703e09612d54..5057e89164c9f 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -67,20 +67,19 @@ constexpr auto obsoleteExecutionPartConstruct{recovery(ignoredStatementPrefix >>
                 parenthesized(nonemptyList(Parser<AllocateShapeSpec>{}))))))};
 
 TYPE_PARSER(recovery(
-    withMessage("expected execution part construct"_err_en_US,
-        CONTEXT_PARSER("execution part construct"_en_US,
-            first(construct<ExecutionPartConstruct>(executableConstruct),
+    CONTEXT_PARSER("execution part construct"_en_US,
+        first(construct<ExecutionPartConstruct>(executableConstruct),
+            construct<ExecutionPartConstruct>(statement(indirect(formatStmt))),
+            construct<ExecutionPartConstruct>(statement(indirect(entryStmt))),
+            construct<ExecutionPartConstruct>(statement(indirect(dataStmt))),
+            extension<LanguageFeature::ExecutionPartNamelist>(
+                "nonstandard usage: NAMELIST in execution part"_port_en_US,
                 construct<ExecutionPartConstruct>(
-                    statement(indirect(formatStmt))),
-                construct<ExecutionPartConstruct>(
-                    statement(indirect(entryStmt))),
-                construct<ExecutionPartConstruct>(
-                    statement(indirect(dataStmt))),
-                extension<LanguageFeature::ExecutionPartNamelist>(
-                    "nonstandard usage: NAMELIST in execution part"_port_en_US,
-                    construct<ExecutionPartConstruct>(
-                        statement(indirect(Parser<NamelistStmt>{})))),
-                obsoleteExecutionPartConstruct))),
+                    statement(indirect(Parser<NamelistStmt>{})))),
+            obsoleteExecutionPartConstruct,
+            lookAhead(declarationConstruct) >> SkipTo<'\n'>{} >>
+                fail<ExecutionPartConstruct>(
+                    "misplaced declaration in the execution part"_err_en_US))),
     construct<ExecutionPartConstruct>(executionPartErrorRecovery)))
 
 // R509 execution-part -> executable-construct [execution-part-construct]...
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 0ea48ce29ca2f..52789d6e5f0f6 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -31,10 +31,11 @@ TYPE_PARSER(construct<OmpDefaultClause>(
     "SHARED" >> pure(OmpDefaultClause::Type::Shared) ||
     "NONE" >> pure(OmpDefaultClause::Type::None)))
 
-// 2.5 PROC_BIND (MASTER | CLOSE | SPREAD)
+// 2.5 PROC_BIND (MASTER | CLOSE | PRIMARY | SPREAD )
 TYPE_PARSER(construct<OmpProcBindClause>(
     "CLOSE" >> pure(OmpProcBindClause::Type::Close) ||
     "MASTER" >> pure(OmpProcBindClause::Type::Master) ||
+    "PRIMARY" >> pure(OmpProcBindClause::Type::Primary) ||
     "SPREAD" >> pure(OmpProcBindClause::Type::Spread)))
 
 // 2.15.5.1 MAP ([ [ALWAYS[,]] map-type : ] variable-name-list)
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index ca6a823999244..c01d512b4653d 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -111,9 +111,13 @@ void Prescanner::Statement() {
     skipLeadingAmpersand_ |= !inFixedForm_;
     return;
   case LineClassification::Kind::PreprocessorDirective:
+    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
+    afterPreprocessingDirective_ = true;
+    // Don't set skipLeadingAmpersand_
+    return;
   case LineClassification::Kind::DefinitionDirective:
     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
-    // Don't set afterPreprocessingDirective_
+    // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
     return;
   case LineClassification::Kind::CompilerDirective: {
     directiveSentinel_ = line.sentinel;
@@ -189,11 +193,19 @@ void Prescanner::Statement() {
       // a comment marker or directive sentinel.  If so, disable line
       // continuation, so that NextToken() won't consume anything from
       // following lines.
-      if (IsLegalIdentifierStart(*at_) && NextToken(tokens) &&
-          tokens.SizeInTokens() > 0) {
-        if (CharBlock id{tokens.TokenAt(0)}; preprocessor_.IsNameDefined(id) &&
+      if (IsLegalIdentifierStart(*at_)) {
+        // TODO: Only bother with these cases when any keyword macro has
+        // been defined with replacement text that could begin a comment
+        // or directive sentinel.
+        const char *p{at_};
+        while (IsLegalInIdentifier(*++p)) {
+        }
+        CharBlock id{at_, static_cast<std::size_t>(p - at_)};
+        if (preprocessor_.IsNameDefined(id) &&
             !preprocessor_.IsFunctionLikeDefinition(id)) {
-          if (auto replaced{preprocessor_.MacroReplacement(tokens, *this)}) {
+          TokenSequence toks;
+          toks.Put(id, GetProvenance(at_));
+          if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
             auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
             disableSourceContinuation_ =
                 newLineClass.kind != LineClassification::Kind::Source;
diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp
index b51b60157f39c..c43696c52c160 100644
--- a/flang/lib/Parser/program-parsers.cpp
+++ b/flang/lib/Parser/program-parsers.cpp
@@ -19,6 +19,31 @@
 
 namespace Fortran::parser {
 
+// R1530 function-stmt ->
+//         [prefix] FUNCTION function-name ( [dummy-arg-name-list] ) [suffix]
+// R1526 prefix -> prefix-spec [prefix-spec]...
+// R1531 dummy-arg-name -> name
+
+static constexpr auto validFunctionStmt{
+    construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
+        parenthesized(optionalList(name)), maybe(suffix)) /
+        atEndOfStmt ||
+    construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name / atEndOfStmt,
+        // PGI & Intel accept "FUNCTION F"
+        extension<LanguageFeature::OmitFunctionDummies>(
+            "nonstandard usage: FUNCTION statement without dummy argument list"_port_en_US,
+            pure<std::list<Name>>()),
+        pure<std::optional<Suffix>>())};
+
+// function-stmt with error recovery -- used in interfaces and internal
+// subprograms, but not at the top level, where REALFUNCTIONF and
+// INTEGERPUREELEMENTALFUNCTIONG(10) might appear as the first statement
+// of a main program.
+TYPE_PARSER(validFunctionStmt ||
+    construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
+        defaulted(parenthesized(optionalList(name))), maybe(suffix)) /
+        checkEndOfKnownStmt)
+
 // R502 program-unit ->
 //        main-program | external-subprogram | module | submodule | block-data
 // R503 external-subprogram -> function-subprogram | subroutine-subprogram
@@ -36,10 +61,11 @@ namespace Fortran::parser {
 // Enforcing C1547 is done in semantics.
 static constexpr auto programUnit{
     construct<ProgramUnit>(indirect(Parser<Module>{})) ||
-    construct<ProgramUnit>(indirect(functionSubprogram)) ||
     construct<ProgramUnit>(indirect(subroutineSubprogram)) ||
     construct<ProgramUnit>(indirect(Parser<Submodule>{})) ||
     construct<ProgramUnit>(indirect(Parser<BlockData>{})) ||
+    lookAhead(validFunctionStmt) >>
+        construct<ProgramUnit>(indirect(functionSubprogram)) ||
     construct<ProgramUnit>(indirect(Parser<MainProgram>{}))};
 static constexpr auto normalProgramUnit{StartNewSubprogram{} >> programUnit /
         skipMany(";"_tok) / space / recovery(endOfLine, SkipPast<'\n'>{})};
@@ -66,16 +92,6 @@ TYPE_PARSER(
             normalProgramUnit) /
             skipStuffBeforeStatement))
 
-// R504 specification-part ->
-//         [use-stmt]... [import-stmt]... [implicit-part]
-//         [declaration-construct]...
-TYPE_CONTEXT_PARSER("specification part"_en_US,
-    construct<SpecificationPart>(many(openaccDeclarativeConstruct),
-        many(openmpDeclarativeConstruct), many(indirect(compilerDirective)),
-        many(statement(indirect(Parser<UseStmt>{}))),
-        many(unambiguousStatement(indirect(Parser<ImportStmt>{}))),
-        implicitPart, many(declarationConstruct)))
-
 // R507 declaration-construct ->
 //        specification-construct | data-stmt | format-stmt |
 //        entry-stmt | stmt-function-stmt
@@ -106,18 +122,29 @@ constexpr auto misplacedSpecificationStmt{Parser<UseStmt>{} >>
         fail<DeclarationConstruct>(
             "IMPLICIT statements must follow USE and IMPORT and precede all other declarations"_err_en_US)};
 
-TYPE_PARSER(recovery(
-    withMessage("expected declaration construct"_err_en_US,
-        CONTEXT_PARSER("declaration construct"_en_US,
-            first(construct<DeclarationConstruct>(specificationConstruct),
-                construct<DeclarationConstruct>(statement(indirect(dataStmt))),
-                construct<DeclarationConstruct>(
-                    statement(indirect(formatStmt))),
-                construct<DeclarationConstruct>(statement(indirect(entryStmt))),
-                construct<DeclarationConstruct>(
-                    statement(indirect(Parser<StmtFunctionStmt>{}))),
-                misplacedSpecificationStmt))),
-    construct<DeclarationConstruct>(declErrorRecovery)))
+TYPE_CONTEXT_PARSER("declaration construct"_en_US,
+    first(construct<DeclarationConstruct>(specificationConstruct),
+        construct<DeclarationConstruct>(statement(indirect(dataStmt))),
+        construct<DeclarationConstruct>(statement(indirect(formatStmt))),
+        construct<DeclarationConstruct>(statement(indirect(entryStmt))),
+        construct<DeclarationConstruct>(
+            statement(indirect(Parser<StmtFunctionStmt>{}))),
+        misplacedSpecificationStmt))
+
+constexpr auto recoveredDeclarationConstruct{
+    recovery(withMessage("expected declaration construct"_err_en_US,
+                 declarationConstruct),
+        construct<DeclarationConstruct>(declErrorRecovery))};
+
+// R504 specification-part ->
+//         [use-stmt]... [import-stmt]... [implicit-part]
+//         [declaration-construct]...
+TYPE_CONTEXT_PARSER("specification part"_en_US,
+    construct<SpecificationPart>(many(openaccDeclarativeConstruct),
+        many(openmpDeclarativeConstruct), many(indirect(compilerDirective)),
+        many(statement(indirect(Parser<UseStmt>{}))),
+        many(unambiguousStatement(indirect(Parser<ImportStmt>{}))),
+        implicitPart, many(recoveredDeclarationConstruct)))
 
 // R507 variant of declaration-construct for use in limitedSpecificationPart.
 constexpr auto invalidDeclarationStmt{formatStmt >>
@@ -528,20 +555,6 @@ TYPE_CONTEXT_PARSER("FUNCTION subprogram"_en_US,
         executionPart, maybe(internalSubprogramPart),
         unterminatedStatement(endFunctionStmt)))
 
-// R1530 function-stmt ->
-//         [prefix] FUNCTION function-name ( [dummy-arg-name-list] ) [suffix]
-// R1526 prefix -> prefix-spec [prefix-spec]...
-// R1531 dummy-arg-name -> name
-TYPE_CONTEXT_PARSER("FUNCTION statement"_en_US,
-    construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
-        parenthesized(optionalList(name)), maybe(suffix)) ||
-        extension<LanguageFeature::OmitFunctionDummies>(
-            "nonstandard usage: FUNCTION statement without dummy argument list"_port_en_US,
-            construct<FunctionStmt>( // PGI & Intel accept "FUNCTION F"
-                many(prefixSpec), "FUNCTION" >> name,
-                construct<std::list<Name>>(),
-                construct<std::optional<Suffix>>())))
-
 // R1532 suffix ->
 //         proc-language-binding-spec [RESULT ( result-name )] |
 //         RESULT ( result-name ) [proc-language-binding-spec]
@@ -566,11 +579,13 @@ TYPE_CONTEXT_PARSER("SUBROUTINE subprogram"_en_US,
 //         [prefix] SUBROUTINE subroutine-name [( [dummy-arg-list] )
 //         [proc-language-binding-spec]]
 TYPE_PARSER(
-    construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
-        parenthesized(optionalList(dummyArg)), maybe(languageBindingSpec)) ||
-    construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
-        pure<std::list<DummyArg>>(),
-        pure<std::optional<LanguageBindingSpec>>()))
+    (construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
+         !"("_tok >> pure<std::list<DummyArg>>(),
+         pure<std::optional<LanguageBindingSpec>>()) ||
+        construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
+            defaulted(parenthesized(optionalList(dummyArg))),
+            maybe(languageBindingSpec))) /
+    checkEndOfKnownStmt)
 
 // R1536 dummy-arg -> dummy-arg-name | *
 TYPE_PARSER(construct<DummyArg>(name) || construct<DummyArg>(star))
diff --git a/flang/lib/Parser/provenance.cpp b/flang/lib/Parser/provenance.cpp
index c67ddd22f0968..e31038b09e407 100644
--- a/flang/lib/Parser/provenance.cpp
+++ b/flang/lib/Parser/provenance.cpp
@@ -246,6 +246,27 @@ static void EmitPrefix(llvm::raw_ostream &o, llvm::raw_ostream::Colors color,
   }
 }
 
+std::optional<ProvenanceRange> AllSources::GetInclusionInfo(
+    const std::optional<ProvenanceRange> &range) const {
+  if (!range)
+    return std::nullopt;
+  const Origin &origin{MapToOrigin(range->start())};
+
+  return common::visit(
+      common::visitors{
+          [&](const Inclusion &inc) -> std::optional<ProvenanceRange> {
+            if (IsValid(origin.replaces) &&
+                range_.Contains(origin.replaces.start()))
+              return origin.replaces;
+            return std::nullopt;
+          },
+          [&](const auto &) -> std::optional<ProvenanceRange> {
+            return std::nullopt;
+          },
+      },
+      origin.u);
+}
+
 void AllSources::EmitMessage(llvm::raw_ostream &o,
     const std::optional<ProvenanceRange> &range, const std::string &message,
     const std::string &prefix, llvm::raw_ostream::Colors color,
diff --git a/flang/lib/Parser/stmt-parser.h b/flang/lib/Parser/stmt-parser.h
index ba647fd60d4ae..00bae2bf950c8 100644
--- a/flang/lib/Parser/stmt-parser.h
+++ b/flang/lib/Parser/stmt-parser.h
@@ -30,6 +30,10 @@ inline constexpr auto unterminatedStatement(const PA &p) {
           maybe(label), space >> p));
 }
 
+constexpr auto atEndOfStmt{space >>
+    withMessage("expected end of statement"_err_en_US, lookAhead(";\n"_ch))};
+constexpr auto checkEndOfKnownStmt{recovery(atEndOfStmt, SkipTo<'\n'>{})};
+
 constexpr auto endOfLine{
     "\n"_ch >> ok || fail("expected end of line"_err_en_US)};
 
@@ -86,8 +90,6 @@ constexpr auto executionPartErrorRecovery{stmtErrorRecoveryStart >>
 // END statement error recovery
 constexpr auto missingOptionalName{pure<std::optional<Name>>()};
 constexpr auto noNameEnd{"END" >> missingOptionalName};
-constexpr auto atEndOfStmt{space >>
-    withMessage("expected end of statement"_err_en_US, lookAhead(";\n"_ch))};
 constexpr auto bareEnd{noNameEnd / recovery(atEndOfStmt, SkipTo<'\n'>{})};
 
 // For unrecognizable construct END statements.  Be sure to not consume
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index e344390372c12..8f7a200d23239 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -600,10 +600,13 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
   const Scope &subpScope{
       GetProgramUnitContaining(context.FindScope(name_.source))};
   if (allocateObject_.typedExpr && allocateObject_.typedExpr->v) {
-    if (auto whyNot{WhyNotDefinable(name_.source, subpScope,
-            {DefinabilityFlag::PointerDefinition,
-                DefinabilityFlag::AcceptAllocatable},
-            *allocateObject_.typedExpr->v)}) {
+    DefinabilityFlags flags{DefinabilityFlag::PointerDefinition,
+        DefinabilityFlag::AcceptAllocatable};
+    if (allocateInfo_.gotSource) {
+      flags.set(DefinabilityFlag::SourcedAllocation);
+    }
+    if (auto whyNot{WhyNotDefinable(
+            name_.source, subpScope, flags, *allocateObject_.typedExpr->v)}) {
       context
           .Say(name_.source,
               "Name in ALLOCATE statement is not definable"_err_en_US)
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index ef51b9a0d0ce3..9fad1aa3dd0bf 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -27,7 +27,7 @@ namespace characteristics = Fortran::evaluate::characteristics;
 namespace Fortran::semantics {
 
 static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg,
-    parser::ContextualMessages &messages, evaluate::FoldingContext &context) {
+    parser::ContextualMessages &messages, SemanticsContext &context) {
   auto restorer{
       messages.SetLocation(arg.sourceLocation().value_or(messages.at()))};
   if (auto kw{arg.keyword()}) {
@@ -79,8 +79,12 @@ static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg,
         messages.Say(
             "VOLATILE argument requires an explicit interface"_err_en_US);
       }
+      if (const Symbol & base{named->GetFirstSymbol()};
+          IsFunctionResult(base)) {
+        context.NoteDefinedSymbol(base);
+      }
     } else if (auto argChars{characteristics::DummyArgument::FromActual(
-                   "actual argument", *expr, context,
+                   "actual argument", *expr, context.foldingContext(),
                    /*forImplicitInterface=*/true)}) {
       const auto *argProcDesignator{
           std::get_if<evaluate::ProcedureDesignator>(&expr->u)};
@@ -647,8 +651,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
         actualLastSymbol->name(), dummyName);
   }
 
-  // Definability
-  bool actualIsVariable{evaluate::IsVariable(actual)};
+  // Definability checking
+  // Problems with polymorphism are caught in the callee's definition.
   if (scope) {
     std::optional<parser::MessageFixedText> undefinableMessage;
     if (dummy.intent == common::Intent::Out) {
@@ -670,7 +674,6 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
       }
     }
     if (undefinableMessage) {
-      // Problems with polymorphism are caught in the callee's definition.
       DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
       if (isElemental) { // 15.5.2.4(21)
         flags.set(DefinabilityFlag::VectorSubscriptIsOk);
@@ -689,6 +692,14 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
           messages.Say(std::move(*whyNot));
         }
       }
+    } else if (dummy.intent != common::Intent::In ||
+        (dummyIsPointer && !actualIsPointer)) {
+      if (auto named{evaluate::ExtractNamedEntity(actual)}) {
+        if (const Symbol & base{named->GetFirstSymbol()};
+            IsFunctionResult(base)) {
+          context.NoteDefinedSymbol(base);
+        }
+      }
     }
   }
 
@@ -893,6 +904,7 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
   // argument
   if (dummy.attrs.test(characteristics::DummyDataObject::Attr::Target) &&
       context.ShouldWarn(common::UsageWarning::NonTargetPassedToTarget)) {
+    bool actualIsVariable{evaluate::IsVariable(actual)};
     bool actualIsTemp{!actualIsVariable || HasVectorSubscript(actual) ||
         evaluate::ExtractCoarrayRef(actual)};
     if (actualIsTemp) {
@@ -1416,7 +1428,8 @@ static void CheckAssociated(evaluate::ActualArguments &arguments,
             if (auto whyNot{WhyNotDefinable(
                     pointerArg->sourceLocation().value_or(messages.at()),
                     *scope,
-                    DefinabilityFlags{DefinabilityFlag::PointerDefinition},
+                    DefinabilityFlags{DefinabilityFlag::PointerDefinition,
+                        DefinabilityFlag::DoNotNoteDefinition},
                     *pointerExpr)}) {
               if (whyNot->IsFatal()) {
                 if (auto *msg{messages.Say(pointerArg->sourceLocation(),
@@ -2021,7 +2034,7 @@ bool CheckArguments(const characteristics::Procedure &proc,
       auto restorer{messages.SetMessages(buffer)};
       for (auto &actual : actuals) {
         if (actual) {
-          CheckImplicitInterfaceArg(*actual, messages, foldingContext);
+          CheckImplicitInterfaceArg(*actual, messages, context);
         }
       }
     }
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 5b3ea214d63e9..60b8b3213fdb8 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -307,6 +307,25 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
       WarnOnIoStmt(source);
     }
   }
+  template <typename A>
+  void ErrorIfHostSymbol(const A &expr, const parser::CharBlock &source) {
+    for (const Symbol &sym : CollectCudaSymbols(expr)) {
+      if (const auto *details =
+              sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+        if (details->IsArray() &&
+            (!details->cudaDataAttr() ||
+                (details->cudaDataAttr() &&
+                    *details->cudaDataAttr() != common::CUDADataAttr::Device &&
+                    *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
+                    *details->cudaDataAttr() !=
+                        common::CUDADataAttr::Unified))) {
+          context_.Say(source,
+              "Host array '%s' cannot be present in CUF kernel"_err_en_US,
+              sym.name());
+        }
+      }
+    }
+  }
   void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
     common::visit(
         common::visitors{
@@ -349,6 +368,19 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
             [&](const common::Indirection<parser::IfStmt> &x) {
               Check(x.value());
             },
+            [&](const common::Indirection<parser::AssignmentStmt> &x) {
+              if (IsCUFKernelDo) {
+                const evaluate::Assignment *assign{
+                    semantics::GetAssignment(x.value())};
+                if (assign) {
+                  ErrorIfHostSymbol(assign->lhs, source);
+                  ErrorIfHostSymbol(assign->rhs, source);
+                }
+              }
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+                context_.Say(source, std::move(*msg));
+              }
+            },
             [&](const auto &x) {
               if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
                 context_.Say(source, std::move(*msg));
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index eb898bad53a4f..a52f013a70b9d 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2919,7 +2919,7 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
     if (derived->sequence()) { // C1801
       msgs.Say(symbol.name(),
           "An interoperable derived type cannot have the SEQUENCE attribute"_err_en_US);
-    } else if (!derived->paramDecls().empty()) { // C1802
+    } else if (!derived->paramNameOrder().empty()) { // C1802
       msgs.Say(symbol.name(),
           "An interoperable derived type cannot have a type parameter"_err_en_US);
     } else if (const auto *parent{
diff --git a/flang/lib/Semantics/check-purity.cpp b/flang/lib/Semantics/check-purity.cpp
index 55a9a2f107388..1046f363e9485 100644
--- a/flang/lib/Semantics/check-purity.cpp
+++ b/flang/lib/Semantics/check-purity.cpp
@@ -31,7 +31,7 @@ void PurityChecker::Enter(const parser::FunctionSubprogram &func) {
       stmt.source, std::get<std::list<parser::PrefixSpec>>(stmt.statement.t));
 }
 
-void PurityChecker::Leave(const parser::FunctionSubprogram &) { Left(); }
+void PurityChecker::Leave(const parser::FunctionSubprogram &func) { Left(); }
 
 bool PurityChecker::InPureSubprogram() const {
   return pureDepth_ >= 0 && depth_ >= pureDepth_;
diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp
index 96af46abd6180..ae76f668f6ce7 100644
--- a/flang/lib/Semantics/definable.cpp
+++ b/flang/lib/Semantics/definable.cpp
@@ -127,6 +127,12 @@ static std::optional<parser::Message> WhyNotDefinableBase(parser::CharBlock at,
       (!IsPointer(ultimate) || (isWholeSymbol && isPointerDefinition))) {
     return BlameSymbol(
         at, "'%s' is an INTENT(IN) dummy argument"_en_US, original);
+  } else if (acceptAllocatable &&
+      !flags.test(DefinabilityFlag::SourcedAllocation)) {
+    // allocating a function result doesn't count as a def'n
+    // unless there's SOURCE=
+  } else if (!flags.test(DefinabilityFlag::DoNotNoteDefinition)) {
+    scope.context().NoteDefinedSymbol(ultimate);
   }
   if (const Scope * pure{FindPureProcedureContaining(scope)}) {
     // Additional checking for pure subprograms.
@@ -178,7 +184,10 @@ static std::optional<parser::Message> WhyNotDefinableBase(parser::CharBlock at,
 static std::optional<parser::Message> WhyNotDefinableLast(parser::CharBlock at,
     const Scope &scope, DefinabilityFlags flags, const Symbol &original) {
   const Symbol &ultimate{original.GetUltimate()};
-  if (const auto *association{ultimate.detailsIf<AssocEntityDetails>()}) {
+  if (const auto *association{ultimate.detailsIf<AssocEntityDetails>()};
+      association &&
+      (association->rank().has_value() ||
+          !flags.test(DefinabilityFlag::PointerDefinition))) {
     if (auto dataRef{
             evaluate::ExtractDataRef(*association->expr(), true, true)}) {
       return WhyNotDefinableLast(at, scope, flags, dataRef->GetLastSymbol());
diff --git a/flang/lib/Semantics/definable.h b/flang/lib/Semantics/definable.h
index b14c644349674..709bbba494d10 100644
--- a/flang/lib/Semantics/definable.h
+++ b/flang/lib/Semantics/definable.h
@@ -30,7 +30,9 @@ ENUM_CLASS(DefinabilityFlag,
     DuplicatesAreOk, // vector subscript may have duplicates
     PointerDefinition, // a pointer is being defined, not its target
     AcceptAllocatable, // treat allocatable as if it were a pointer
-    PolymorphicOkInPure) // don't check for polymorphic type in pure subprogram
+    SourcedAllocation, // ALLOCATE(a,SOURCE=)
+    PolymorphicOkInPure, // don't check for polymorphic type in pure subprogram
+    DoNotNoteDefinition) // context does not imply definition
 
 using DefinabilityFlags =
     common::EnumSet<DefinabilityFlag, DefinabilityFlag_enumSize>;
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 2d91962a24a4c..4413d77fcf255 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -4533,6 +4533,8 @@ std::optional<ProcedureRef> ArgumentAnalyzer::TryDefinedAssignment() {
         !OkLogicalIntegerAssignment(lhsType->category(), rhsType->category())) {
       SayNoMatch("ASSIGNMENT(=)", true);
     }
+  } else if (!fatalErrors_) {
+    CheckAssignmentConformance();
   }
   return std::nullopt;
 }
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index a1c4c031ca509..c7e7716a7b481 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -410,15 +410,14 @@ bool ModFileWriter::PutComponents(const Symbol &typeSymbol) {
   llvm::raw_string_ostream typeBindings{buf};
   UnorderedSymbolSet emitted;
   SymbolVector symbols{scope.GetSymbols()};
-  // Emit type parameters first
-  for (const Symbol &symbol : symbols) {
-    if (symbol.has<TypeParamDetails>()) {
-      PutSymbol(typeBindings, symbol);
-      emitted.emplace(symbol);
-    }
-  }
-  // Emit components in component order.
+  // Emit type parameter declarations first, in order
   const auto &details{typeSymbol.get<DerivedTypeDetails>()};
+  for (const Symbol &symbol : details.paramDeclOrder()) {
+    CHECK(symbol.has<TypeParamDetails>());
+    PutSymbol(typeBindings, symbol);
+    emitted.emplace(symbol);
+  }
+  // Emit actual components in component order.
   for (SourceName name : details.componentNames()) {
     auto iter{scope.find(name)};
     if (iter != scope.end()) {
@@ -549,10 +548,10 @@ void ModFileWriter::PutDerivedType(
     decls_ << ",extends(" << extends->name() << ')';
   }
   decls_ << "::" << typeSymbol.name();
-  if (!details.paramNames().empty()) {
+  if (!details.paramNameOrder().empty()) {
     char sep{'('};
-    for (const auto &name : details.paramNames()) {
-      decls_ << sep << name;
+    for (const SymbolRef &ref : details.paramNameOrder()) {
+      decls_ << sep << ref->name();
       sep = ',';
     }
     decls_ << ')';
@@ -1046,7 +1045,7 @@ void ModFileWriter::PutTypeParam(llvm::raw_ostream &os, const Symbol &symbol) {
       os, symbol,
       [&]() {
         PutType(os, DEREF(symbol.GetType()));
-        PutLower(os << ',', common::EnumToString(details.attr()));
+        PutLower(os << ',', common::EnumToString(details.attr().value()));
       },
       symbol.attrs());
   PutInit(os, details.init());
diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp
index dae3b1a2eb1e1..4948fce77a250 100644
--- a/flang/lib/Semantics/pointer-assignment.cpp
+++ b/flang/lib/Semantics/pointer-assignment.cpp
@@ -358,8 +358,10 @@ bool PointerAssignmentChecker::Check(const evaluate::Designator<T> &d) {
       Say(std::get<MessageFormattedText>(*msg));
     }
     return false;
+  } else {
+    context_.NoteDefinedSymbol(*base);
+    return true;
   }
-  return true;
 }
 
 // Common handling for procedure pointer right-hand sides
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 16658d50c151c..d635a7b8b7874 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -19,6 +19,7 @@
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
 #include "flang/Semantics/expression.h"
+#include "flang/Semantics/tools.h"
 #include <list>
 #include <map>
 #include <sstream>
@@ -729,11 +730,12 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   void CheckNameInAllocateStmt(const parser::CharBlock &source,
       const parser::Name &ompObject, const parser::AllocateStmt &allocate);
 
-  bool HasSymbolInEnclosingScope(const Symbol &, Scope &);
   std::int64_t ordCollapseLevel{0};
 
   void AddOmpRequiresToScope(Scope &, WithOmpDeclarative::RequiresFlags,
       std::optional<common::OmpAtomicDefaultMemOrderType>);
+  void IssueNonConformanceWarning(
+      llvm::omp::Directive D, parser::CharBlock source);
 };
 
 template <typename T>
@@ -1524,6 +1526,8 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) {
     // TODO others
     break;
   }
+  if (beginDir.v == llvm::omp::Directive::OMPD_master)
+    IssueNonConformanceWarning(beginDir.v, beginDir.source);
   ClearDataSharingAttributeObjects();
   ClearPrivateDataSharingAttributeObjects();
   ClearAllocateNames();
@@ -1634,11 +1638,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
     break;
   }
   if (beginDir.v == llvm::omp::Directive::OMPD_target_loop)
-    if (context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
-      context_.Say(beginDir.source,
-          "Usage of directive %s is non-confirming to OpenMP standard"_warn_en_US,
-          llvm::omp::getOpenMPDirectiveName(beginDir.v).str());
-    }
+    IssueNonConformanceWarning(beginDir.v, beginDir.source);
   ClearDataSharingAttributeObjects();
   SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList));
 
@@ -2035,6 +2035,14 @@ void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) {
 // and adjust the symbol for each Name if necessary
 void OmpAttributeVisitor::Post(const parser::Name &name) {
   auto *symbol{name.symbol};
+  auto IsPrivatizable = [](const Symbol *sym) {
+    return !IsProcedure(*sym) && !IsNamedConstant(*sym) &&
+        !sym->owner().IsDerivedType() &&
+        sym->owner().kind() != Scope::Kind::ImpliedDos &&
+        !sym->detailsIf<semantics::AssocEntityDetails>() &&
+        !sym->detailsIf<semantics::NamelistDetails>();
+  };
+
   if (symbol && !dirContext_.empty() && GetContext().withinConstruct) {
     // Exclude construct-names
     if (auto *details{symbol->detailsIf<semantics::MiscDetails>()}) {
@@ -2042,8 +2050,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
         return;
       }
     }
-    if (!symbol->owner().IsDerivedType() && !IsProcedure(*symbol) &&
-        !IsObjectWithDSA(*symbol) && !IsNamedConstant(*symbol)) {
+    if (IsPrivatizable(symbol) && !IsObjectWithDSA(*symbol)) {
       // TODO: create a separate function to go through the rules for
       //       predetermined, explicitly determined, and implicitly
       //       determined data-sharing attributes (2.15.1.1).
@@ -2068,6 +2075,9 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
       if (found->test(semantics::Symbol::Flag::OmpThreadprivate))
         return;
     }
+    if (!IsPrivatizable(symbol)) {
+      return;
+    }
 
     // Implicitly determined DSAs
     // OMP 5.2 5.1.1 - Variables Referenced in a Construct
@@ -2085,16 +2095,22 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
         }
       }
 
-      // When handling each implicit rule, either a new private symbol is
-      // declared or the last declared symbol is used.
-      // In the latter case, it's necessary to insert a new symbol in the scope
-      // being processed, associated with the last declared symbol.
-      // This captures the fact that, although we are using the last declared
-      // symbol, its DSA could be different in this scope.
-      // Also, because of how symbols are collected in lowering, not inserting
-      // a new symbol in this scope could lead to the conclusion that the
-      // symbol was declared in this construct, which would result in wrong
-      // privatization code being generated.
+      // When handling each implicit rule for a given symbol, one of the
+      // following 3 actions may be taken:
+      // 1. Declare a new private symbol.
+      // 2. Create a new association symbol with no flags, that will represent
+      //    a shared symbol in the current scope. Note that symbols without
+      //    any private flags are considered as shared.
+      // 3. Use the last declared private symbol, by inserting a new symbol
+      //    in the scope being processed, associated with it.
+      //    If no private symbol was declared previously, then no association
+      //    is needed and the symbol from the enclosing scope will be
+      //    inherited by the current one.
+      //
+      // Because of how symbols are collected in lowering, not inserting a new
+      // symbol in the last case could lead to the conclusion that a symbol
+      // from an enclosing construct was declared in the current construct,
+      // which would result in wrong privatization code being generated.
       // Consider the following example:
       //
       // !$omp parallel default(private)              ! p1
@@ -2107,48 +2123,56 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
       // (p2), it would use the x symbol definition from the enclosing scope.
       // Then, when p2's default symbols were collected in lowering, the x
       // symbol from the outer parallel construct (p1) would be collected, as
-      // it would have the private flag set (note that symbols that don't have
-      // any private flag are considered as shared).
+      // it would have the private flag set.
       // This would make x appear to be defined in p2, causing it to be
       // privatized in p2 and its privatization in p1 to be skipped.
-      auto declNewSymbol = [&](Symbol::Flag flag) {
+      auto makePrivateSymbol = [&](Symbol::Flag flag) {
         Symbol *hostSymbol =
             lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
         lastDeclSymbol = DeclarePrivateAccessEntity(
             *hostSymbol, flag, context_.FindScope(dirContext.directiveSource));
         return lastDeclSymbol;
       };
+      auto makeSharedSymbol = [&]() {
+        Symbol *hostSymbol =
+            lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
+        MakeAssocSymbol(symbol->name(), *hostSymbol,
+            context_.FindScope(dirContext.directiveSource));
+      };
       auto useLastDeclSymbol = [&]() {
         if (lastDeclSymbol)
           MakeAssocSymbol(symbol->name(), *lastDeclSymbol,
               context_.FindScope(dirContext.directiveSource));
       };
 
+      bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
+      bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
+      bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
+      bool teamsDir = llvm::omp::allTeamsSet.test(dirContext.directive);
+
       if (dsa.has_value()) {
-        useLastDeclSymbol();
+        if (dsa.value() == Symbol::Flag::OmpShared &&
+            (parallelDir || taskGenDir || teamsDir))
+          makeSharedSymbol();
+        // Private symbols will have been declared already.
         prevDSA = dsa;
         continue;
       }
 
-      bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
-      bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
-      bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
-
       if (dirContext.defaultDSA == Symbol::Flag::OmpPrivate ||
           dirContext.defaultDSA == Symbol::Flag::OmpFirstPrivate ||
           dirContext.defaultDSA == Symbol::Flag::OmpShared) {
         // 1) default
         // Allowed only with parallel, teams and task generating constructs.
-        assert(parallelDir || taskGenDir ||
-            llvm::omp::allTeamsSet.test(dirContext.directive));
+        assert(parallelDir || taskGenDir || teamsDir);
         if (dirContext.defaultDSA != Symbol::Flag::OmpShared)
-          declNewSymbol(dirContext.defaultDSA);
+          makePrivateSymbol(dirContext.defaultDSA);
         else
-          useLastDeclSymbol();
+          makeSharedSymbol();
         dsa = dirContext.defaultDSA;
       } else if (parallelDir) {
         // 2) parallel -> shared
-        useLastDeclSymbol();
+        makeSharedSymbol();
         dsa = Symbol::Flag::OmpShared;
       } else if (!taskGenDir && !targetDir) {
         // 3) enclosing context
@@ -2161,12 +2185,12 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
         // TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
         if (prevDSA == Symbol::Flag::OmpShared) {
           // 6) shared in enclosing context -> shared
-          useLastDeclSymbol();
+          makeSharedSymbol();
           dsa = Symbol::Flag::OmpShared;
         } else {
           // 7) firstprivate
           dsa = Symbol::Flag::OmpFirstPrivate;
-          declNewSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
+          makePrivateSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
         }
       }
       prevDSA = dsa;
@@ -2570,20 +2594,59 @@ void ResolveOmpTopLevelParts(
   });
 }
 
-void OmpAttributeVisitor::CheckDataCopyingClause(
-    const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
-  const auto *checkSymbol{&symbol};
+static bool IsSymbolInCommonBlock(const Symbol &symbol) {
+  // TODO Improve the performance of this predicate function.
+  //      Going through all symbols sequentially, in all common blocks, can be
+  //      slow when there are many symbols. A possible optimization is to add
+  //      an OmpInCommonBlock flag to Symbol, to make it possible to quickly
+  //      test if a given symbol is in a common block.
+  for (const auto &cb : symbol.owner().commonBlocks()) {
+    if (IsCommonBlockContaining(cb.second.get(), symbol)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool IsSymbolThreadprivate(const Symbol &symbol) {
   if (const auto *details{symbol.detailsIf<HostAssocDetails>()}) {
-    checkSymbol = &details->symbol();
+    return details->symbol().test(Symbol::Flag::OmpThreadprivate);
+  }
+  return symbol.test(Symbol::Flag::OmpThreadprivate);
+}
+
+static bool IsSymbolPrivate(const Symbol &symbol) {
+  if (symbol.test(Symbol::Flag::OmpPrivate) ||
+      symbol.test(Symbol::Flag::OmpFirstPrivate)) {
+    return true;
   }
+  // A symbol that has not gone through constructs that may privatize the
+  // original symbol may be predetermined as private.
+  // (OMP 5.2 5.1.1 - Variables Referenced in a Construct)
+  if (symbol == symbol.GetUltimate()) {
+    switch (symbol.owner().kind()) {
+    case Scope::Kind::MainProgram:
+    case Scope::Kind::Subprogram:
+    case Scope::Kind::BlockConstruct:
+      return !symbol.attrs().test(Attr::SAVE) &&
+          !symbol.attrs().test(Attr::PARAMETER) && !IsAssumedShape(symbol) &&
+          !IsSymbolInCommonBlock(symbol);
+    default:
+      return false;
+    }
+  }
+  return false;
+}
 
+void OmpAttributeVisitor::CheckDataCopyingClause(
+    const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
   if (ompFlag == Symbol::Flag::OmpCopyIn) {
     // List of items/objects that can appear in a 'copyin' clause must be
     // 'threadprivate'
-    if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate)) {
+    if (!IsSymbolThreadprivate(symbol)) {
       context_.Say(name.source,
           "Non-THREADPRIVATE object '%s' in COPYIN clause"_err_en_US,
-          checkSymbol->name());
+          symbol.name());
     }
   } else if (ompFlag == Symbol::Flag::OmpCopyPrivate &&
       GetContext().directive == llvm::omp::Directive::OMPD_single) {
@@ -2596,18 +2659,13 @@ void OmpAttributeVisitor::CheckDataCopyingClause(
           "COPYPRIVATE variable '%s' may not appear on a PRIVATE or "
           "FIRSTPRIVATE clause on a SINGLE construct"_err_en_US,
           symbol.name());
-    } else {
+    } else if (!IsSymbolThreadprivate(symbol) && !IsSymbolPrivate(symbol)) {
       // List of items/objects that can appear in a 'copyprivate' clause must be
       // either 'private' or 'threadprivate' in enclosing context.
-      if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate) &&
-          !(HasSymbolInEnclosingScope(symbol, currScope()) &&
-              (symbol.test(Symbol::Flag::OmpPrivate) ||
-                  symbol.test(Symbol::Flag::OmpFirstPrivate)))) {
-        context_.Say(name.source,
-            "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
-            "outer context"_err_en_US,
-            symbol.name());
-      }
+      context_.Say(name.source,
+          "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
+          "outer context"_err_en_US,
+          symbol.name());
     }
   }
 }
@@ -2677,12 +2735,6 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
   }
 }
 
-bool OmpAttributeVisitor::HasSymbolInEnclosingScope(
-    const Symbol &symbol, Scope &scope) {
-  const auto symbols{scope.parent().GetSymbols()};
-  return llvm::is_contained(symbols, symbol);
-}
-
 // Goes through the names in an OmpObjectList and checks if each name appears
 // in the given allocate statement
 void OmpAttributeVisitor::CheckAllNamesInAllocateStmt(
@@ -2758,4 +2810,21 @@ void OmpAttributeVisitor::AddOmpRequiresToScope(Scope &scope,
   } while (!scopeIter->IsGlobal());
 }
 
+void OmpAttributeVisitor::IssueNonConformanceWarning(
+    llvm::omp::Directive D, parser::CharBlock source) {
+  std::string warnStr = "";
+  std::string dirName = llvm::omp::getOpenMPDirectiveName(D).str();
+  switch (D) {
+  case llvm::omp::OMPD_master:
+    warnStr = "OpenMP directive '" + dirName +
+        "' has been deprecated, please use 'masked' instead.";
+    break;
+  case llvm::omp::OMPD_target_loop:
+  default:
+    warnStr = "OpenMP directive '" + dirName + "' has been deprecated.";
+  }
+  if (context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
+    context_.Say(source, "%s"_warn_en_US, warnStr);
+  }
+}
 } // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index f761355d0da32..b7725c5b00228 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -5456,34 +5456,14 @@ bool DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) {
   CHECK(scope.symbol());
   CHECK(scope.symbol()->scope() == &scope);
   auto &details{scope.symbol()->get<DerivedTypeDetails>()};
-  std::set<SourceName> paramNames;
   for (auto &paramName : std::get<std::list<parser::Name>>(stmt.statement.t)) {
-    details.add_paramName(paramName.source);
-    auto *symbol{FindInScope(scope, paramName)};
-    if (!symbol) {
-      Say(paramName,
-          "No definition found for type parameter '%s'"_err_en_US); // C742
-      // No symbol for a type param.  Create one and mark it as containing an
-      // error to improve subsequent semantic processing
-      BeginAttrs();
-      Symbol *typeParam{MakeTypeSymbol(
-          paramName, TypeParamDetails{common::TypeParamAttr::Len})};
-      context().SetError(*typeParam);
-      EndAttrs();
-    } else if (!symbol->has<TypeParamDetails>()) {
-      Say2(paramName, "'%s' is not defined as a type parameter"_err_en_US,
-          *symbol, "Definition of '%s'"_en_US); // C741
-    }
-    if (!paramNames.insert(paramName.source).second) {
-      Say(paramName,
-          "Duplicate type parameter name: '%s'"_err_en_US); // C731
-    }
-  }
-  for (const auto &[name, symbol] : currScope()) {
-    if (symbol->has<TypeParamDetails>() && !paramNames.count(name)) {
-      SayDerivedType(name,
-          "'%s' is not a type parameter of this derived type"_err_en_US,
-          currScope()); // C741
+    if (auto *symbol{FindInScope(scope, paramName)}) {
+      if (auto *details{symbol->detailsIf<TypeParamDetails>()}) {
+        if (!details->attr()) {
+          Say(paramName,
+              "No definition found for type parameter '%s'"_err_en_US); // C742
+        }
+      }
     }
   }
   Walk(std::get<std::list<parser::Statement<parser::PrivateOrSequence>>>(x.t));
@@ -5499,7 +5479,7 @@ bool DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) {
             "A sequence type should have at least one component"_warn_en_US);
       }
     }
-    if (!details.paramNames().empty()) { // C740
+    if (!details.paramDeclOrder().empty()) { // C740
       Say(stmt.source,
           "A sequence type may not have type parameters"_err_en_US);
     }
@@ -5559,24 +5539,50 @@ void DeclarationVisitor::Post(const parser::DerivedTypeStmt &x) {
       details.add_component(comp);
     }
   }
+  // Create symbols now for type parameters so that they shadow names
+  // from the enclosing specification part.
+  if (auto *details{symbol.detailsIf<DerivedTypeDetails>()}) {
+    for (const auto &name : std::get<std::list<parser::Name>>(x.t)) {
+      if (Symbol * symbol{MakeTypeSymbol(name, TypeParamDetails{})}) {
+        details->add_paramNameOrder(*symbol);
+      }
+    }
+  }
   EndAttrs();
 }
 
 void DeclarationVisitor::Post(const parser::TypeParamDefStmt &x) {
   auto *type{GetDeclTypeSpec()};
+  DerivedTypeDetails *derivedDetails{nullptr};
+  if (Symbol * dtSym{currScope().symbol()}) {
+    derivedDetails = dtSym->detailsIf<DerivedTypeDetails>();
+  }
   auto attr{std::get<common::TypeParamAttr>(x.t)};
   for (auto &decl : std::get<std::list<parser::TypeParamDecl>>(x.t)) {
     auto &name{std::get<parser::Name>(decl.t)};
-    if (Symbol * symbol{MakeTypeSymbol(name, TypeParamDetails{attr})}) {
-      SetType(name, *type);
-      if (auto &init{
-              std::get<std::optional<parser::ScalarIntConstantExpr>>(decl.t)}) {
-        if (auto maybeExpr{AnalyzeExpr(context(), *init)}) {
-          if (auto *intExpr{std::get_if<SomeIntExpr>(&maybeExpr->u)}) {
-            symbol->get<TypeParamDetails>().set_init(std::move(*intExpr));
+    if (Symbol * symbol{FindInScope(currScope(), name)}) {
+      if (auto *paramDetails{symbol->detailsIf<TypeParamDetails>()}) {
+        if (!paramDetails->attr()) {
+          paramDetails->set_attr(attr);
+          SetType(name, *type);
+          if (auto &init{std::get<std::optional<parser::ScalarIntConstantExpr>>(
+                  decl.t)}) {
+            if (auto maybeExpr{AnalyzeExpr(context(), *init)}) {
+              if (auto *intExpr{std::get_if<SomeIntExpr>(&maybeExpr->u)}) {
+                paramDetails->set_init(std::move(*intExpr));
+              }
+            }
           }
+          if (derivedDetails) {
+            derivedDetails->add_paramDeclOrder(*symbol);
+          }
+        } else {
+          Say(name,
+              "Type parameter '%s' was already declared in this derived type"_err_en_US);
         }
       }
+    } else {
+      Say(name, "'%s' is not a parameter of this derived type"_err_en_US);
     }
   }
   EndDecl();
@@ -6779,9 +6785,6 @@ Symbol *DeclarationVisitor::MakeTypeSymbol(
     }
     Symbol &result{MakeSymbol(name, attrs, std::move(details))};
     SetCUDADataAttr(name, result, cudaDataAttr());
-    if (result.has<TypeParamDetails>()) {
-      derivedType.symbol()->get<DerivedTypeDetails>().add_paramDecl(result);
-    }
     return &result;
   }
 }
@@ -7118,9 +7121,13 @@ void ConstructVisitor::Post(const parser::AssociateStmt &x) {
   for (auto nthLastAssoc{assocCount}; nthLastAssoc > 0; --nthLastAssoc) {
     SetCurrentAssociation(nthLastAssoc);
     if (auto *symbol{MakeAssocEntity()}) {
-      if (ExtractCoarrayRef(GetCurrentAssociation().selector.expr)) { // C1103
+      const MaybeExpr &expr{GetCurrentAssociation().selector.expr};
+      if (ExtractCoarrayRef(expr)) { // C1103
         Say("Selector must not be a coindexed object"_err_en_US);
       }
+      if (evaluate::IsAssumedRank(expr)) {
+        Say("Selector must not be assumed-rank"_err_en_US);
+      }
       SetTypeFromAssociation(*symbol);
       SetAttrsFromAssociation(*symbol);
     }
@@ -7830,6 +7837,12 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) {
       CheckEntryDummyUse(name.source, symbol);
       ConvertToObjectEntity(*symbol);
       ApplyImplicitRules(*symbol);
+    } else if (const auto *tpd{symbol->detailsIf<TypeParamDetails>()};
+               tpd && !tpd->attr()) {
+      Say(name,
+          "Type parameter '%s' was referenced before being declared"_err_en_US,
+          name.source);
+      context().SetError(*symbol);
     }
     if (checkIndexUseInOwnBounds_ &&
         *checkIndexUseInOwnBounds_ == name.source && !InModuleFile()) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 8939dc4499ec4..6690924196673 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -42,7 +42,8 @@ static int FindLenParameterIndex(
     if (&*ref == &symbol) {
       return lenIndex;
     }
-    if (ref->get<TypeParamDetails>().attr() == common::TypeParamAttr::Len) {
+    if (auto attr{ref->get<TypeParamDetails>().attr()};
+        attr && *attr == common::TypeParamAttr::Len) {
       ++lenIndex;
     }
   }
@@ -371,7 +372,7 @@ static std::optional<std::string> GetSuffixIfTypeKindParameters(
     std::optional<std::string> suffix;
     for (SymbolRef ref : *parameters) {
       const auto &tpd{ref->get<TypeParamDetails>()};
-      if (tpd.attr() == common::TypeParamAttr::Kind) {
+      if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Kind) {
         if (const auto *pv{derivedTypeSpec.FindParameter(ref->name())}) {
           if (pv->GetExplicit()) {
             if (auto instantiatedValue{evaluate::ToInt64(*pv->GetExplicit())}) {
@@ -497,7 +498,7 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) {
     for (SymbolRef ref : *parameters) {
       if (const auto *inst{dtScope.FindComponent(ref->name())}) {
         const auto &tpd{inst->get<TypeParamDetails>()};
-        if (tpd.attr() == common::TypeParamAttr::Kind) {
+        if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Kind) {
           auto value{evaluate::ToInt64(tpd.init()).value_or(0)};
           if (derivedTypeSpec) {
             if (const auto *pv{derivedTypeSpec->FindParameter(inst->name())}) {
@@ -799,7 +800,7 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent(
           specParams{GetTypeParameters(spec.typeSymbol())}) {
         for (SymbolRef ref : *specParams) {
           const auto &tpd{ref->get<TypeParamDetails>()};
-          if (tpd.attr() == common::TypeParamAttr::Len) {
+          if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Len) {
             if (const ParamValue *
                 paramValue{spec.FindParameter(ref->name())}) {
               lenParams.emplace_back(GetValue(*paramValue, parameters));
diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp
index c09734e5676f3..3cb24f6c6af43 100644
--- a/flang/lib/Semantics/semantics.cpp
+++ b/flang/lib/Semantics/semantics.cpp
@@ -160,6 +160,41 @@ class MiscChecker : public virtual BaseChecker {
   SemanticsContext &context_;
 };
 
+static void WarnUndefinedFunctionResult(
+    SemanticsContext &context, const Scope &scope) {
+  auto WasDefined{[&context](const Symbol &symbol) {
+    return context.IsSymbolDefined(symbol) ||
+        IsInitialized(symbol, /*ignoreDataStatements=*/true,
+            /*ignoreAllocatable=*/true, /*ignorePointer=*/true);
+  }};
+  if (const Symbol * symbol{scope.symbol()}) {
+    if (const auto *subp{symbol->detailsIf<SubprogramDetails>()}) {
+      if (subp->isFunction() && !subp->isInterface() && !subp->stmtFunction()) {
+        bool wasDefined{WasDefined(subp->result())};
+        if (!wasDefined) {
+          // Definitions of ENTRY result variables also count.
+          for (const auto &pair : scope) {
+            const Symbol &local{*pair.second};
+            if (IsFunctionResult(local) && WasDefined(local)) {
+              wasDefined = true;
+              break;
+            }
+          }
+          if (!wasDefined) {
+            context.Say(
+                symbol->name(), "Function result is never defined"_warn_en_US);
+          }
+        }
+      }
+    }
+  }
+  if (!scope.IsModuleFile()) {
+    for (const Scope &child : scope.children()) {
+      WarnUndefinedFunctionResult(context, child);
+    }
+  }
+}
+
 using StatementSemanticsPass1 = ExprChecker;
 using StatementSemanticsPass2 = SemanticsVisitor<AllocateChecker,
     ArithmeticIfStmtChecker, AssignmentChecker, CaseChecker, CoarrayChecker,
@@ -187,6 +222,9 @@ static bool PerformStatementSemantics(
     SemanticsVisitor<CUDAChecker>{context}.Walk(program);
   }
   if (!context.AnyFatalError()) {
+    if (context.ShouldWarn(common::UsageWarning::UndefinedFunctionResult)) {
+      WarnUndefinedFunctionResult(context, context.globalScope());
+    }
     pass2.CompileDataInitializationsIntoInitializers();
   }
   return !context.AnyFatalError();
@@ -712,4 +750,12 @@ CommonBlockList SemanticsContext::GetCommonBlocks() const {
   return {};
 }
 
+void SemanticsContext::NoteDefinedSymbol(const Symbol &symbol) {
+  isDefined_.insert(symbol);
+}
+
+bool SemanticsContext::IsSymbolDefined(const Symbol &symbol) const {
+  return isDefined_.find(symbol) != isDefined_.end();
+}
+
 } // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index 023ab7b64e4fc..31e91ee7355e3 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -588,7 +588,11 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) {
           },
           [&](const TypeParamDetails &x) {
             DumpOptional(os, "type", x.type());
-            os << ' ' << common::EnumToString(x.attr());
+            if (auto attr{x.attr()}) {
+              os << ' ' << common::EnumToString(*attr);
+            } else {
+              os << " (no attr)";
+            }
             DumpExpr(os, "init", x.init());
           },
           [&](const MiscDetails &x) {
@@ -739,9 +743,16 @@ const Symbol *DerivedTypeDetails::GetFinalForRank(int rank) const {
   return nullptr;
 }
 
-void TypeParamDetails::set_type(const DeclTypeSpec &type) {
+TypeParamDetails &TypeParamDetails::set_attr(common::TypeParamAttr attr) {
+  CHECK(!attr_);
+  attr_ = attr;
+  return *this;
+}
+
+TypeParamDetails &TypeParamDetails::set_type(const DeclTypeSpec &type) {
   CHECK(!type_);
   type_ = &type;
+  return *this;
 }
 
 bool GenericKind::IsIntrinsicOperator() const {
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 99381918fc638..fdaf052c2d34e 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -685,7 +685,7 @@ bool IsInitialized(const Symbol &symbol, bool ignoreDataStatements,
     return true;
   } else if (IsPointer(symbol)) {
     return !ignorePointer;
-  } else if (IsNamedConstant(symbol) || IsFunctionResult(symbol)) {
+  } else if (IsNamedConstant(symbol)) {
     return false;
   } else if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
     if (!object->isDummy() && object->type()) {
@@ -1135,12 +1135,12 @@ std::optional<parser::MessageFormattedText> CheckAccessibleSymbol(
   return std::nullopt;
 }
 
-std::list<SourceName> OrderParameterNames(const Symbol &typeSymbol) {
-  std::list<SourceName> result;
+SymbolVector OrderParameterNames(const Symbol &typeSymbol) {
+  SymbolVector result;
   if (const DerivedTypeSpec * spec{typeSymbol.GetParentTypeSpec()}) {
     result = OrderParameterNames(spec->typeSymbol());
   }
-  const auto &paramNames{typeSymbol.get<DerivedTypeDetails>().paramNames()};
+  const auto &paramNames{typeSymbol.get<DerivedTypeDetails>().paramNameOrder()};
   result.insert(result.end(), paramNames.begin(), paramNames.end());
   return result;
 }
@@ -1150,7 +1150,7 @@ SymbolVector OrderParameterDeclarations(const Symbol &typeSymbol) {
   if (const DerivedTypeSpec * spec{typeSymbol.GetParentTypeSpec()}) {
     result = OrderParameterDeclarations(spec->typeSymbol());
   }
-  const auto &paramDecls{typeSymbol.get<DerivedTypeDetails>().paramDecls()};
+  const auto &paramDecls{typeSymbol.get<DerivedTypeDetails>().paramDeclOrder()};
   result.insert(result.end(), paramDecls.begin(), paramDecls.end());
   return result;
 }
diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp
index ed2474377173f..cfaee0b8ba6dc 100644
--- a/flang/lib/Semantics/type.cpp
+++ b/flang/lib/Semantics/type.cpp
@@ -63,7 +63,6 @@ void DerivedTypeSpec::CookParameters(evaluate::FoldingContext &foldingContext) {
   // Parameters of the most deeply nested "base class" come first when the
   // derived type is an extension.
   auto parameterNames{OrderParameterNames(typeSymbol_)};
-  auto parameterDecls{OrderParameterDeclarations(typeSymbol_)};
   auto nextNameIter{parameterNames.begin()};
   RawParameters raw{std::move(rawParameters_)};
   for (auto &[maybeKeyword, value] : raw) {
@@ -71,25 +70,25 @@ void DerivedTypeSpec::CookParameters(evaluate::FoldingContext &foldingContext) {
     common::TypeParamAttr attr{common::TypeParamAttr::Kind};
     if (maybeKeyword) {
       name = maybeKeyword->v.source;
-      auto it{std::find_if(parameterDecls.begin(), parameterDecls.end(),
+      auto it{std::find_if(parameterNames.begin(), parameterNames.end(),
           [&](const Symbol &symbol) { return symbol.name() == name; })};
-      if (it == parameterDecls.end()) {
+      if (it == parameterNames.end()) {
         messages.Say(name,
             "'%s' is not the name of a parameter for derived type '%s'"_err_en_US,
             name, typeSymbol_.name());
       } else {
         // Resolve the keyword's symbol
         maybeKeyword->v.symbol = const_cast<Symbol *>(&it->get());
-        attr = it->get().get<TypeParamDetails>().attr();
+        if (const auto *tpd{it->get().detailsIf<TypeParamDetails>()}) {
+          attr = tpd->attr().value_or(attr);
+        }
       }
     } else if (nextNameIter != parameterNames.end()) {
-      name = *nextNameIter++;
-      auto it{std::find_if(parameterDecls.begin(), parameterDecls.end(),
-          [&](const Symbol &symbol) { return symbol.name() == name; })};
-      if (it == parameterDecls.end()) {
-        break;
+      name = nextNameIter->get().name();
+      if (const auto *tpd{nextNameIter->get().detailsIf<TypeParamDetails>()}) {
+        attr = tpd->attr().value_or(attr);
       }
-      attr = it->get().get<TypeParamDetails>().attr();
+      ++nextNameIter;
     } else {
       messages.Say(name_,
           "Too many type parameters given for derived type '%s'"_err_en_US,
@@ -160,7 +159,7 @@ void DerivedTypeSpec::EvaluateParameters(SemanticsContext &context) {
       // Default type parameter value expressions are folded within
       // the scope of the derived type being instantiated.
       const TypeParamDetails &details{symbol.get<TypeParamDetails>()};
-      if (details.init()) {
+      if (details.init() && details.attr()) {
         evaluate::DynamicType dyType{TypeCategory::Integer, parameterKind};
         if (auto converted{
                 evaluate::ConvertToType(dyType, SomeExpr{*details.init()})}) {
@@ -169,8 +168,8 @@ void DerivedTypeSpec::EvaluateParameters(SemanticsContext &context) {
               evaluate::Fold(foldingContext, std::move(*converted))};
           ok = ok || evaluate::IsActuallyConstant(folded);
           AddParamValue(name,
-              ParamValue{
-                  std::move(std::get<SomeIntExpr>(folded.u)), details.attr()});
+              ParamValue{std::move(std::get<SomeIntExpr>(folded.u)),
+                  details.attr().value()});
         } else {
           if (!context.HasError(symbol)) {
             evaluate::SayWithDeclaration(messages, symbol,
@@ -233,7 +232,7 @@ ParamValue *DerivedTypeSpec::FindParameter(SourceName target) {
 
 static bool MatchKindParams(const Symbol &typeSymbol,
     const DerivedTypeSpec &thisSpec, const DerivedTypeSpec &thatSpec) {
-  for (auto ref : typeSymbol.get<DerivedTypeDetails>().paramDecls()) {
+  for (auto ref : typeSymbol.get<DerivedTypeDetails>().paramNameOrder()) {
     if (ref->get<TypeParamDetails>().attr() == common::TypeParamAttr::Kind) {
       const auto *thisValue{thisSpec.FindParameter(ref->name())};
       const auto *thatValue{thatSpec.FindParameter(ref->name())};
@@ -369,12 +368,15 @@ void DerivedTypeSpec::Instantiate(Scope &containingScope) {
       // uninitialized type parameter to forestall use of any default.
       if (ParamValue * paramValue{FindParameter(name)}) {
         const TypeParamDetails &details{symbol.get<TypeParamDetails>()};
-        paramValue->set_attr(details.attr());
+        TypeParamDetails instanceDetails{};
+        if (details.attr()) {
+          paramValue->set_attr(*details.attr());
+          instanceDetails.set_attr(*details.attr());
+        }
         desc += sep;
         desc += name.ToString();
         desc += '=';
         sep = ',';
-        TypeParamDetails instanceDetails{details.attr()};
         if (MaybeIntExpr expr{paramValue->GetExplicit()}) {
           desc += expr->AsFortran();
           instanceDetails.set_init(
diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90
index 6ca98e518aeac..cc1f58e7feccb 100644
--- a/flang/module/iso_fortran_env.f90
+++ b/flang/module/iso_fortran_env.f90
@@ -22,6 +22,23 @@ module iso_fortran_env
     compiler_options => __builtin_compiler_options, &
     compiler_version => __builtin_compiler_version
 
+  use iso_fortran_env_impl, only: &
+    selectedInt8, selectedInt16, selectedInt32, selectedInt64, selectedInt128, &
+    safeInt8, safeInt16, safeInt32, safeInt64, safeInt128, &
+    int8, int16, int32, int64, int128, &
+    logical8, logical16, logical32, logical64, &
+    selectedReal16, selectedBfloat16, selectedReal32, &
+    selectedReal64, selectedReal80, selectedReal64x2, &
+    selectedReal128, &
+    safeReal16, safeBfloat16, safeReal32, &
+    safeReal64, safeReal80, safeReal64x2, &
+    safeReal128, &
+    real16, bfloat16, real32, real64, &
+    real80, real64x2, real128, &
+    integer_kinds => __builtin_integer_kinds, &
+    real_kinds => __builtin_real_kinds, &
+    logical_kinds => __builtin_logical_kinds
+
   implicit none
   private
 
@@ -38,95 +55,22 @@ module iso_fortran_env
     pack([selectedUCS_2], selectedUCS_2 >= 0), &
     pack([selectedUnicode], selectedUnicode >= 0)]
 
-  integer, parameter :: &
-    selectedInt8 = selected_int_kind(2), &
-    selectedInt16 = selected_int_kind(4), &
-    selectedInt32 = selected_int_kind(9), &
-    selectedInt64 = selected_int_kind(18),&
-    selectedInt128 = selected_int_kind(38), &
-    safeInt8 = merge(selectedInt8, selected_int_kind(0), &
-                     selectedInt8 >= 0), &
-    safeInt16 = merge(selectedInt16, selected_int_kind(0), &
-                      selectedInt16 >= 0), &
-    safeInt32 = merge(selectedInt32, selected_int_kind(0), &
-                      selectedInt32 >= 0), &
-    safeInt64 = merge(selectedInt64, selected_int_kind(0), &
-                      selectedInt64 >= 0), &
-    safeInt128 = merge(selectedInt128, selected_int_kind(0), &
-                       selectedInt128 >= 0)
-  integer, parameter, public :: &
-    int8 = merge(selectedInt8, merge(-2, -1, selectedInt8 >= 0), &
-                 digits(int(0,kind=safeInt8)) == 7), &
-    int16 = merge(selectedInt16, merge(-2, -1, selectedInt16 >= 0), &
-                  digits(int(0,kind=safeInt16)) == 15), &
-    int32 = merge(selectedInt32, merge(-2, -1, selectedInt32 >= 0), &
-                  digits(int(0,kind=safeInt32)) == 31), &
-    int64 = merge(selectedInt64, merge(-2, -1, selectedInt64 >= 0), &
-                  digits(int(0,kind=safeInt64)) == 63), &
-    int128 = merge(selectedInt128, merge(-2, -1, selectedInt128 >= 0), &
-                   digits(int(0,kind=safeInt128)) == 127)
-
-  integer, parameter, public :: integer_kinds(*) = [ &
-    selected_int_kind(0), &
-    [(pack([selected_int_kind(k)], &
-           selected_int_kind(k) >= 0 .and. &
-             selected_int_kind(k) /= selected_int_kind(k-1)), &
-      integer :: k=1, 39)]]
+  public :: selectedInt8, selectedInt16, selectedInt32, selectedInt64, selectedInt128, &
+    safeInt8, safeInt16, safeInt32, safeInt64, safeInt128, &
+    int8, int16, int32, int64, int128
 
-  integer, parameter, public :: &
-    logical8 = int8, logical16 = int16, logical32 = int32, logical64 = int64
-  integer, parameter, public :: logical_kinds(*) = [ &
-    pack([logical8],  logical8 >= 0), &
-    pack([logical16], logical16 >= 0), &
-    pack([logical32], logical32 >= 0), &
-    pack([logical64], logical64 >= 0)]
+  public :: logical8, logical16, logical32, logical64
 
-  integer, parameter :: &
-    selectedReal16 = selected_real_kind(3, 4), &      ! IEEE half
-    selectedBfloat16 = selected_real_kind(2, 37), &   ! truncated IEEE single
-    selectedReal32 = selected_real_kind(6, 37), &     ! IEEE single
-    selectedReal64 = selected_real_kind(15, 307), &   ! IEEE double
-    selectedReal80 = selected_real_kind(18, 4931), &  ! 80x87 extended
-    selectedReal64x2 = selected_real_kind(31, 307), & ! "double-double"
-    selectedReal128 = selected_real_kind(33, 4931), & ! IEEE quad
-    safeReal16 = merge(selectedReal16, selected_real_kind(0,0), &
-                       selectedReal16 >= 0), &
-    safeBfloat16 = merge(selectedBfloat16, selected_real_kind(0,0), &
-                         selectedBfloat16 >= 0), &
-    safeReal32 = merge(selectedReal32, selected_real_kind(0,0), &
-                       selectedReal32 >= 0), &
-    safeReal64 = merge(selectedReal64, selected_real_kind(0,0), &
-                       selectedReal64 >= 0), &
-    safeReal80 = merge(selectedReal80, selected_real_kind(0,0), &
-                       selectedReal80 >= 0), &
-    safeReal64x2 = merge(selectedReal64x2, selected_real_kind(0,0), &
-                         selectedReal64x2 >= 0), &
-    safeReal128 = merge(selectedReal128, selected_real_kind(0,0), &
-                        selectedReal128 >= 0)
-  integer, parameter, public :: &
-    real16 = merge(selectedReal16, merge(-2, -1, selectedReal16 >= 0), &
-                   digits(real(0,kind=safeReal16)) == 11), &
-    bfloat16 = merge(selectedBfloat16, merge(-2, -1, selectedBfloat16 >= 0), &
-                     digits(real(0,kind=safeBfloat16)) == 8), &
-    real32 = merge(selectedReal32, merge(-2, -1, selectedReal32 >= 0), &
-                   digits(real(0,kind=safeReal32)) == 24), &
-    real64 = merge(selectedReal64, merge(-2, -1, selectedReal64 >= 0), &
-                   digits(real(0,kind=safeReal64)) == 53), &
-    real80 = merge(selectedReal80, merge(-2, -1, selectedReal80 >= 0), &
-                   digits(real(0,kind=safeReal80)) == 64), &
-    real64x2 = merge(selectedReal64x2, merge(-2, -1, selectedReal64x2 >= 0), &
-                     digits(real(0,kind=safeReal64x2)) == 106), &
-    real128 = merge(selectedReal128, merge(-2, -1, selectedReal128 >= 0), &
-                    digits(real(0,kind=safeReal128)) == 113)
-
-  integer, parameter, public :: real_kinds(*) = [ &
-    pack([real16], real16 >= 0), &
-    pack([bfloat16], bfloat16 >= 0), &
-    pack([real32], real32 >= 0), &
-    pack([real64], real64 >= 0), &
-    pack([real80], real80 >= 0), &
-    pack([real64x2], real64x2 >= 0), &
-    pack([real128], real128 >= 0)]
+  public :: selectedReal16, selectedBfloat16, selectedReal32, &
+    selectedReal64, selectedReal80, selectedReal64x2, &
+    selectedReal128, &
+    safeReal16, safeBfloat16, safeReal32, &
+    safeReal64, safeReal80, safeReal64x2, &
+    safeReal128, &
+    real16, bfloat16, real32, real64, &
+    real80, real64x2, real128
+
+  public :: integer_kinds, real_kinds, logical_kinds
 
   integer, parameter, public :: current_team = -1, &
     initial_team = -2, &
diff --git a/flang/module/iso_fortran_env_impl.f90 b/flang/module/iso_fortran_env_impl.f90
new file mode 100644
index 0000000000000..4de54dda7bab1
--- /dev/null
+++ b/flang/module/iso_fortran_env_impl.f90
@@ -0,0 +1,114 @@
+!===-- module/iso_fortran_env_impl.f90 --=--------------------------------===!
+!
+! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+! See https://llvm.org/LICENSE.txt for license information.
+! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!
+!===------------------------------------------------------------------------===!
+
+! This MODULE implements part of the ISO_FORTRAN_ENV module file, which
+! partially requires linkable symbols for some entities defined
+! (e.g., real_kinds).
+
+module iso_fortran_env_impl
+  implicit none
+  private
+
+  ! INTEGER types
+  integer, parameter, public :: &
+    selectedInt8 = selected_int_kind(2), &
+    selectedInt16 = selected_int_kind(4), &
+    selectedInt32 = selected_int_kind(9), &
+    selectedInt64 = selected_int_kind(18),&
+    selectedInt128 = selected_int_kind(38), &
+    safeInt8 = merge(selectedInt8, selected_int_kind(0), &
+                     selectedInt8 >= 0), &
+    safeInt16 = merge(selectedInt16, selected_int_kind(0), &
+                      selectedInt16 >= 0), &
+    safeInt32 = merge(selectedInt32, selected_int_kind(0), &
+                      selectedInt32 >= 0), &
+    safeInt64 = merge(selectedInt64, selected_int_kind(0), &
+                      selectedInt64 >= 0), &
+    safeInt128 = merge(selectedInt128, selected_int_kind(0), &
+                       selectedInt128 >= 0)
+
+  integer, parameter, public :: &
+    int8 = merge(selectedInt8, merge(-2, -1, selectedInt8 >= 0), &
+                 digits(int(0,kind=safeInt8)) == 7), &
+    int16 = merge(selectedInt16, merge(-2, -1, selectedInt16 >= 0), &
+                  digits(int(0,kind=safeInt16)) == 15), &
+    int32 = merge(selectedInt32, merge(-2, -1, selectedInt32 >= 0), &
+                  digits(int(0,kind=safeInt32)) == 31), &
+    int64 = merge(selectedInt64, merge(-2, -1, selectedInt64 >= 0), &
+                  digits(int(0,kind=safeInt64)) == 63), &
+    int128 = merge(selectedInt128, merge(-2, -1, selectedInt128 >= 0), &
+                   digits(int(0,kind=safeInt128)) == 127)
+
+  integer, parameter, dimension(*), public :: __builtin_integer_kinds = [ &
+      selected_int_kind(0), &
+      [(pack([selected_int_kind(k)], &
+             selected_int_kind(k) >= 0 .and. &
+               selected_int_kind(k) /= selected_int_kind(k-1)), &
+        integer :: k=1, 39)]]
+
+  ! LOGICAL TYPES
+  integer, parameter, public :: &
+    logical8 = int8, logical16 = int16, logical32 = int32, logical64 = int64
+
+  integer, parameter, dimension(*), public :: __builtin_logical_kinds = [ &
+      pack([logical8],  logical8 >= 0), &
+      pack([logical16], logical16 >= 0), &
+      pack([logical32], logical32 >= 0), &
+      pack([logical64], logical64 >= 0) &
+    ]
+
+  ! REAL types
+  integer, parameter, public :: &
+    selectedReal16 = selected_real_kind(3, 4), &      ! IEEE half
+    selectedBfloat16 = selected_real_kind(2, 37), &   ! truncated IEEE single
+    selectedReal32 = selected_real_kind(6, 37), &     ! IEEE single
+    selectedReal64 = selected_real_kind(15, 307), &   ! IEEE double
+    selectedReal80 = selected_real_kind(18, 4931), &  ! 80x87 extended
+    selectedReal64x2 = selected_real_kind(31, 307), & ! "double-double"
+    selectedReal128 = selected_real_kind(33, 4931), & ! IEEE quad
+    safeReal16 = merge(selectedReal16, selected_real_kind(0,0), &
+                       selectedReal16 >= 0), &
+    safeBfloat16 = merge(selectedBfloat16, selected_real_kind(0,0), &
+                         selectedBfloat16 >= 0), &
+    safeReal32 = merge(selectedReal32, selected_real_kind(0,0), &
+                       selectedReal32 >= 0), &
+    safeReal64 = merge(selectedReal64, selected_real_kind(0,0), &
+                       selectedReal64 >= 0), &
+    safeReal80 = merge(selectedReal80, selected_real_kind(0,0), &
+                       selectedReal80 >= 0), &
+    safeReal64x2 = merge(selectedReal64x2, selected_real_kind(0,0), &
+                         selectedReal64x2 >= 0), &
+    safeReal128 = merge(selectedReal128, selected_real_kind(0,0), &
+                        selectedReal128 >= 0)
+
+  integer, parameter, public :: &
+    real16 = merge(selectedReal16, merge(-2, -1, selectedReal16 >= 0), &
+                   digits(real(0,kind=safeReal16)) == 11), &
+    bfloat16 = merge(selectedBfloat16, merge(-2, -1, selectedBfloat16 >= 0), &
+                     digits(real(0,kind=safeBfloat16)) == 8), &
+    real32 = merge(selectedReal32, merge(-2, -1, selectedReal32 >= 0), &
+                   digits(real(0,kind=safeReal32)) == 24), &
+    real64 = merge(selectedReal64, merge(-2, -1, selectedReal64 >= 0), &
+                   digits(real(0,kind=safeReal64)) == 53), &
+    real80 = merge(selectedReal80, merge(-2, -1, selectedReal80 >= 0), &
+                   digits(real(0,kind=safeReal80)) == 64), &
+    real64x2 = merge(selectedReal64x2, merge(-2, -1, selectedReal64x2 >= 0), &
+                     digits(real(0,kind=safeReal64x2)) == 106), &
+    real128 = merge(selectedReal128, merge(-2, -1, selectedReal128 >= 0), &
+                    digits(real(0,kind=safeReal128)) == 113)
+
+  integer, parameter, dimension(*), public :: __builtin_real_kinds = [ &
+      pack([real16], real16 >= 0), &
+      pack([bfloat16], bfloat16 >= 0), &
+      pack([real32], real32 >= 0), &
+      pack([real64], real64 >= 0), &
+      pack([real80], real80 >= 0), &
+      pack([real64x2], real64x2 >= 0), &
+      pack([real128], real128 >= 0) &
+    ]
+end module iso_fortran_env_impl
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index a826980e19411..1f3ae23dcbf12 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -107,6 +107,7 @@ add_subdirectory(Float128Math)
 
 set(sources
   ISO_Fortran_binding.cpp
+  allocator-registry.cpp
   allocatable.cpp
   array-constructor.cpp
   assign.cpp
@@ -169,6 +170,7 @@ set(sources
   unit-map.cpp
   unit.cpp
   utf.cpp
+  ${FORTRAN_MODULE_OBJECTS}
 )
 
 include(AddFlangOffloadRuntime)
@@ -177,6 +179,7 @@ include(AddFlangOffloadRuntime)
 set(supported_files
   ISO_Fortran_binding.cpp
   allocatable.cpp
+  allocator-registry.cpp
   array-constructor.cpp
   assign.cpp
   buffer.cpp
@@ -294,3 +297,15 @@ else()
     FortranRuntime.static_dbg FortranRuntime.dynamic_dbg)
 endif()
 set_target_properties(FortranRuntime PROPERTIES FOLDER "Flang/Runtime Libraries")
+
+# If FortranRuntime is part of a Flang build (and not a separate build) then
+# add dependency to make sure that Fortran runtime library is being built after
+# we have the Flang compiler available.  This also includes the MODULE files
+# that compile when the 'flang-new' target is built.
+#
+# TODO: This is a workaround and should be updated when runtime build procedure
+# is changed to a regular runtime build.  See discussion in PR #95388.
+if (TARGET flang-new AND TARGET module_files)
+  add_dependencies(FortranRuntime flang-new module_files)
+endif()
+
diff --git a/flang/runtime/ISO_Fortran_util.h b/flang/runtime/ISO_Fortran_util.h
index 469067600bd90..dd0eeef80bb89 100644
--- a/flang/runtime/ISO_Fortran_util.h
+++ b/flang/runtime/ISO_Fortran_util.h
@@ -86,7 +86,7 @@ static inline RT_API_ATTRS void EstablishDescriptor(CFI_cdesc_t *descriptor,
   descriptor->rank = rank;
   descriptor->type = type;
   descriptor->attribute = attribute;
-  descriptor->f18Addendum = 0;
+  descriptor->extra = 0;
   std::size_t byteSize{elem_len};
   constexpr std::size_t lower_bound{0};
   if (base_addr) {
diff --git a/flang/runtime/allocator-registry.cpp b/flang/runtime/allocator-registry.cpp
new file mode 100644
index 0000000000000..f5670331d6dbe
--- /dev/null
+++ b/flang/runtime/allocator-registry.cpp
@@ -0,0 +1,42 @@
+//===-- runtime/allocator-registry.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/allocator-registry.h"
+#include "terminator.h"
+
+namespace Fortran::runtime {
+
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
+RT_OFFLOAD_VAR_GROUP_BEGIN
+RT_VAR_ATTRS AllocatorRegistry allocatorRegistry;
+RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
+
+RT_OFFLOAD_API_GROUP_BEGIN
+RT_API_ATTRS void AllocatorRegistry::Register(int pos, Allocator_t allocator) {
+  // pos 0 is reserved for the default allocator and is registered in the
+  // struct ctor.
+  INTERNAL_CHECK(pos > 0 && pos < MAX_ALLOCATOR);
+  allocators[pos] = allocator;
+}
+
+RT_API_ATTRS AllocFct AllocatorRegistry::GetAllocator(int pos) {
+  INTERNAL_CHECK(pos >= 0 && pos < MAX_ALLOCATOR);
+  AllocFct f{allocators[pos].alloc};
+  INTERNAL_CHECK(f != nullptr);
+  return f;
+}
+
+RT_API_ATTRS FreeFct AllocatorRegistry::GetDeallocator(int pos) {
+  INTERNAL_CHECK(pos >= 0 && pos < MAX_ALLOCATOR);
+  FreeFct f{allocators[pos].free};
+  INTERNAL_CHECK(f != nullptr);
+  return f;
+}
+RT_OFFLOAD_API_GROUP_END
+} // namespace Fortran::runtime
diff --git a/flang/runtime/copy.cpp b/flang/runtime/copy.cpp
index 7cf9483654141..c2dbbc4a11c06 100644
--- a/flang/runtime/copy.cpp
+++ b/flang/runtime/copy.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "copy.h"
+#include "stack.h"
 #include "terminator.h"
 #include "type-info.h"
 #include "flang/Runtime/allocatable.h"
@@ -14,76 +15,170 @@
 #include <cstring>
 
 namespace Fortran::runtime {
+namespace {
+using StaticDescTy = StaticDescriptor<maxRank, true, 0>;
+
+// A structure describing the data copy that needs to be done
+// from one descriptor to another. It is a helper structure
+// for CopyElement.
+struct CopyDescriptor {
+  // A constructor specifying all members explicitly.
+  RT_API_ATTRS CopyDescriptor(const Descriptor &to, const SubscriptValue toAt[],
+      const Descriptor &from, const SubscriptValue fromAt[],
+      std::size_t elements, bool usesStaticDescriptors = false)
+      : to_(to), from_(from), elements_(elements),
+        usesStaticDescriptors_(usesStaticDescriptors) {
+    for (int dim{0}; dim < to.rank(); ++dim) {
+      toAt_[dim] = toAt[dim];
+    }
+    for (int dim{0}; dim < from.rank(); ++dim) {
+      fromAt_[dim] = fromAt[dim];
+    }
+  }
+  // The number of elements to copy is initialized from the to descriptor.
+  // The current element subscripts are initialized from the lower bounds
+  // of the to and from descriptors.
+  RT_API_ATTRS CopyDescriptor(const Descriptor &to, const Descriptor &from,
+      bool usesStaticDescriptors = false)
+      : to_(to), from_(from), elements_(to.Elements()),
+        usesStaticDescriptors_(usesStaticDescriptors) {
+    to.GetLowerBounds(toAt_);
+    from.GetLowerBounds(fromAt_);
+  }
+
+  // Descriptor of the destination.
+  const Descriptor &to_;
+  // A subscript specifying the current element position to copy to.
+  SubscriptValue toAt_[maxRank];
+  // Descriptor of the source.
+  const Descriptor &from_;
+  // A subscript specifying the current element position to copy from.
+  SubscriptValue fromAt_[maxRank];
+  // Number of elements left to copy.
+  std::size_t elements_;
+  // Must be true, if the to and from descriptors are allocated
+  // by the CopyElement runtime. The allocated memory belongs
+  // to a separate stack that needs to be popped in correspondence
+  // with popping such a CopyDescriptor node.
+  bool usesStaticDescriptors_;
+};
+
+// A pair of StaticDescTy elements.
+struct StaticDescriptorsPair {
+  StaticDescTy to;
+  StaticDescTy from;
+};
+} // namespace
+
 RT_OFFLOAD_API_GROUP_BEGIN
 
 RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
     const Descriptor &from, const SubscriptValue fromAt[],
     Terminator &terminator) {
-  char *toPtr{to.Element<char>(toAt)};
-  char *fromPtr{from.Element<char>(fromAt)};
-  RUNTIME_CHECK(terminator, to.ElementBytes() == from.ElementBytes());
-  std::memcpy(toPtr, fromPtr, to.ElementBytes());
-  // Deep copy allocatable and automatic components if any.
-  if (const auto *addendum{to.Addendum()}) {
-    if (const auto *derived{addendum->derivedType()};
-        derived && !derived->noDestructionNeeded()) {
-      RUNTIME_CHECK(terminator,
-          from.Addendum() && derived == from.Addendum()->derivedType());
-      const Descriptor &componentDesc{derived->component()};
-      const typeInfo::Component *component{
-          componentDesc.OffsetElement<typeInfo::Component>()};
-      std::size_t nComponents{componentDesc.Elements()};
-      for (std::size_t j{0}; j < nComponents; ++j, ++component) {
-        if (component->genre() == typeInfo::Component::Genre::Allocatable ||
-            component->genre() == typeInfo::Component::Genre::Automatic) {
-          Descriptor &toDesc{
-              *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
-          if (toDesc.raw().base_addr != nullptr) {
-            toDesc.set_base_addr(nullptr);
-            RUNTIME_CHECK(terminator, toDesc.Allocate() == CFI_SUCCESS);
-            const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
-                fromPtr + component->offset())};
-            CopyArray(toDesc, fromDesc, terminator);
-          }
-        } else if (component->genre() == typeInfo::Component::Genre::Data &&
-            component->derivedType() &&
-            !component->derivedType()->noDestructionNeeded()) {
-          SubscriptValue extents[maxRank];
-          const typeInfo::Value *bounds{component->bounds()};
-          for (int dim{0}; dim < component->rank(); ++dim) {
-            SubscriptValue lb{bounds[2 * dim].GetValue(&to).value_or(0)};
-            SubscriptValue ub{bounds[2 * dim + 1].GetValue(&to).value_or(0)};
-            extents[dim] = ub >= lb ? ub - lb + 1 : 0;
+#if !defined(RT_DEVICE_COMPILATION)
+  constexpr unsigned copyStackReserve{16};
+  constexpr unsigned descriptorStackReserve{6};
+#else
+  // Always use dynamic allocation on the device to avoid
+  // big stack sizes. This may be tuned as needed.
+  constexpr unsigned copyStackReserve{0};
+  constexpr unsigned descriptorStackReserve{0};
+#endif
+  // Keep a stack of CopyDescriptor's to avoid recursive calls.
+  Stack<CopyDescriptor, copyStackReserve> copyStack{terminator};
+  // Keep a separate stack of StaticDescTy pairs. These descriptors
+  // may be used for representing copies of Component::Genre::Data
+  // components (since they do not have their descriptors allocated
+  // in memory).
+  Stack<StaticDescriptorsPair, descriptorStackReserve> descriptorsStack{
+      terminator};
+  copyStack.emplace(to, toAt, from, fromAt, /*elements=*/std::size_t{1});
+
+  while (!copyStack.empty()) {
+    CopyDescriptor &currentCopy{copyStack.top()};
+    std::size_t &elements{currentCopy.elements_};
+    if (elements == 0) {
+      // This copy has been exhausted.
+      if (currentCopy.usesStaticDescriptors_) {
+        // Pop the static descriptors, if they were used
+        // for the current copy.
+        descriptorsStack.pop();
+      }
+      copyStack.pop();
+      continue;
+    }
+    const Descriptor &curTo{currentCopy.to_};
+    SubscriptValue *curToAt{currentCopy.toAt_};
+    const Descriptor &curFrom{currentCopy.from_};
+    SubscriptValue *curFromAt{currentCopy.fromAt_};
+    char *toPtr{curTo.Element<char>(curToAt)};
+    char *fromPtr{curFrom.Element<char>(curFromAt)};
+    RUNTIME_CHECK(terminator, curTo.ElementBytes() == curFrom.ElementBytes());
+    // TODO: the memcpy can be optimized when both to and from are contiguous.
+    // Moreover, if we came here from an Component::Genre::Data component,
+    // all the per-element copies are redundant, because the parent
+    // has already been copied as a whole.
+    std::memcpy(toPtr, fromPtr, curTo.ElementBytes());
+    --elements;
+    if (elements != 0) {
+      curTo.IncrementSubscripts(curToAt);
+      curFrom.IncrementSubscripts(curFromAt);
+    }
+
+    // Deep copy allocatable and automatic components if any.
+    if (const auto *addendum{curTo.Addendum()}) {
+      if (const auto *derived{addendum->derivedType()};
+          derived && !derived->noDestructionNeeded()) {
+        RUNTIME_CHECK(terminator,
+            curFrom.Addendum() && derived == curFrom.Addendum()->derivedType());
+        const Descriptor &componentDesc{derived->component()};
+        const typeInfo::Component *component{
+            componentDesc.OffsetElement<typeInfo::Component>()};
+        std::size_t nComponents{componentDesc.Elements()};
+        for (std::size_t j{0}; j < nComponents; ++j, ++component) {
+          if (component->genre() == typeInfo::Component::Genre::Allocatable ||
+              component->genre() == typeInfo::Component::Genre::Automatic) {
+            Descriptor &toDesc{
+                *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
+            if (toDesc.raw().base_addr != nullptr) {
+              toDesc.set_base_addr(nullptr);
+              RUNTIME_CHECK(terminator, toDesc.Allocate() == CFI_SUCCESS);
+              const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
+                  fromPtr + component->offset())};
+              copyStack.emplace(toDesc, fromDesc);
+            }
+          } else if (component->genre() == typeInfo::Component::Genre::Data &&
+              component->derivedType() &&
+              !component->derivedType()->noDestructionNeeded()) {
+            SubscriptValue extents[maxRank];
+            const typeInfo::Value *bounds{component->bounds()};
+            std::size_t elements{1};
+            for (int dim{0}; dim < component->rank(); ++dim) {
+              SubscriptValue lb{bounds[2 * dim].GetValue(&curTo).value_or(0)};
+              SubscriptValue ub{
+                  bounds[2 * dim + 1].GetValue(&curTo).value_or(0)};
+              extents[dim] = ub >= lb ? ub - lb + 1 : 0;
+              elements *= extents[dim];
+            }
+            if (elements != 0) {
+              const typeInfo::DerivedType &compType{*component->derivedType()};
+              // Place a pair of static descriptors onto the descriptors stack.
+              descriptorsStack.emplace();
+              StaticDescriptorsPair &descs{descriptorsStack.top()};
+              Descriptor &toCompDesc{descs.to.descriptor()};
+              toCompDesc.Establish(compType, toPtr + component->offset(),
+                  component->rank(), extents);
+              Descriptor &fromCompDesc{descs.from.descriptor()};
+              fromCompDesc.Establish(compType, fromPtr + component->offset(),
+                  component->rank(), extents);
+              copyStack.emplace(toCompDesc, fromCompDesc,
+                  /*usesStaticDescriptors=*/true);
+            }
           }
-          const typeInfo::DerivedType &compType{*component->derivedType()};
-          StaticDescriptor<maxRank, true, 0> toStaticDescriptor;
-          Descriptor &toCompDesc{toStaticDescriptor.descriptor()};
-          toCompDesc.Establish(compType, toPtr + component->offset(),
-              component->rank(), extents);
-          StaticDescriptor<maxRank, true, 0> fromStaticDescriptor;
-          Descriptor &fromCompDesc{fromStaticDescriptor.descriptor()};
-          fromCompDesc.Establish(compType, fromPtr + component->offset(),
-              component->rank(), extents);
-          CopyArray(toCompDesc, fromCompDesc, terminator);
         }
       }
     }
   }
 }
-
-RT_API_ATTRS void CopyArray(
-    const Descriptor &to, const Descriptor &from, Terminator &terminator) {
-  std::size_t elements{to.Elements()};
-  RUNTIME_CHECK(terminator, elements == from.Elements());
-  SubscriptValue toAt[maxRank], fromAt[maxRank];
-  to.GetLowerBounds(toAt);
-  from.GetLowerBounds(fromAt);
-  while (elements-- > 0) {
-    CopyElement(to, toAt, from, fromAt, terminator);
-    to.IncrementSubscripts(toAt);
-    from.IncrementSubscripts(fromAt);
-  }
-}
-
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/copy.h b/flang/runtime/copy.h
index 5d725de725735..542660530bfb6 100644
--- a/flang/runtime/copy.h
+++ b/flang/runtime/copy.h
@@ -21,9 +21,5 @@ namespace Fortran::runtime {
 RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
     const Descriptor &from, const SubscriptValue fromAt[], Terminator &);
 
-// Copies data from one allocated descriptor's array to another.
-RT_API_ATTRS void CopyArray(
-    const Descriptor &to, const Descriptor &from, Terminator &);
-
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_COPY_H_
diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp
index 0d9e033df4e27..659f54fa344bb 100644
--- a/flang/runtime/derived.cpp
+++ b/flang/runtime/derived.cpp
@@ -23,10 +23,9 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
     const typeInfo::Component &comp, const Descriptor &derivedInstance) {
   const typeInfo::Value *bounds{comp.bounds()};
   for (int dim{0}; dim < comp.rank(); ++dim) {
-    SubscriptValue lb{bounds[2 * dim].GetValue(&derivedInstance).value_or(0)};
-    SubscriptValue ub{
-        bounds[2 * dim + 1].GetValue(&derivedInstance).value_or(0)};
-    extents[dim] = ub >= lb ? ub - lb + 1 : 0;
+    auto lb{bounds[2 * dim].GetValue(&derivedInstance).value_or(0)};
+    auto ub{bounds[2 * dim + 1].GetValue(&derivedInstance).value_or(0)};
+    extents[dim] = ub >= lb ? static_cast<SubscriptValue>(ub - lb + 1) : 0;
   }
 }
 
diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp
index 9b04cb4f8d0d0..34f7a02ea8c7b 100644
--- a/flang/runtime/descriptor.cpp
+++ b/flang/runtime/descriptor.cpp
@@ -14,6 +14,7 @@
 #include "terminator.h"
 #include "tools.h"
 #include "type-info.h"
+#include "flang/Runtime/allocator-registry.h"
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -50,7 +51,9 @@ RT_API_ATTRS void Descriptor::Establish(TypeCode t, std::size_t elementBytes,
       GetDimension(j).SetByteStride(0);
     }
   }
-  raw_.f18Addendum = addendum;
+  if (addendum) {
+    SetHasAddendum();
+  }
   DescriptorAddendum *a{Addendum()};
   RUNTIME_CHECK(terminator, addendum == (a != nullptr));
   if (a) {
@@ -162,7 +165,9 @@ RT_API_ATTRS int Descriptor::Allocate() {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{byteSize ? std::malloc(byteSize) : std::malloc(1)};
+
+  AllocFct alloc{allocatorRegistry.GetAllocator(GetAllocIdx())};
+  void *p{alloc(byteSize ? byteSize : 1)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
@@ -204,7 +209,8 @@ RT_API_ATTRS int Descriptor::Deallocate() {
   if (!descriptor.base_addr) {
     return CFI_ERROR_BASE_ADDR_NULL;
   } else {
-    std::free(descriptor.base_addr);
+    FreeFct free{allocatorRegistry.GetDeallocator(GetAllocIdx())};
+    free(descriptor.base_addr);
     descriptor.base_addr = nullptr;
     return CFI_SUCCESS;
   }
@@ -290,7 +296,9 @@ void Descriptor::Dump(FILE *f) const {
   std::fprintf(f, "  rank      %d\n", static_cast<int>(raw_.rank));
   std::fprintf(f, "  type      %d\n", static_cast<int>(raw_.type));
   std::fprintf(f, "  attribute %d\n", static_cast<int>(raw_.attribute));
-  std::fprintf(f, "  addendum  %d\n", static_cast<int>(raw_.f18Addendum));
+  std::fprintf(f, "  extra     %d\n", static_cast<int>(raw_.extra));
+  std::fprintf(f, "    addendum  %d\n", static_cast<int>(HasAddendum()));
+  std::fprintf(f, "    alloc_idx %d\n", static_cast<int>(GetAllocIdx()));
   for (int j{0}; j < raw_.rank; ++j) {
     std::fprintf(f, "  dim[%d] lower_bound %jd\n", j,
         static_cast<std::intmax_t>(raw_.dim[j].lower_bound));
diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index dfd3b812e22a1..2032ce7b12242 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// Map Fortran ieee_arithmetic module exceptions to fenv.h exceptions.
+// Runtime exception support.
 
 #include "flang/Runtime/exceptions.h"
 #include "terminator.h"
-#include "flang/Runtime/magic-numbers.h"
 #include <cfenv>
 
 #ifndef __FE_DENORM
@@ -21,58 +20,32 @@ namespace Fortran::runtime {
 
 extern "C" {
 
-std::int32_t RTNAME(MapException)(int32_t except) {
+// Map a set of Fortran ieee_arithmetic module exceptions to a libm fenv.h
+// excepts value.
+uint32_t RTNAME(MapException)(uint32_t excepts) {
   Terminator terminator{__FILE__, __LINE__};
 
-  static constexpr int32_t mask{_FORTRAN_RUNTIME_IEEE_INVALID |
-      _FORTRAN_RUNTIME_IEEE_DENORM | _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO |
-      _FORTRAN_RUNTIME_IEEE_OVERFLOW | _FORTRAN_RUNTIME_IEEE_UNDERFLOW |
-      _FORTRAN_RUNTIME_IEEE_INEXACT};
-  if (except == 0 || except != (except & mask)) {
-    terminator.Crash("Invalid exception value: %d", except);
+  static constexpr uint32_t v{FE_INVALID};
+  static constexpr uint32_t s{__FE_DENORM}; // subnormal
+  static constexpr uint32_t z{FE_DIVBYZERO};
+  static constexpr uint32_t o{FE_OVERFLOW};
+  static constexpr uint32_t u{FE_UNDERFLOW};
+  static constexpr uint32_t x{FE_INEXACT};
+
+#define vm(p) p, p | v
+#define sm(p) vm(p), vm(p | s)
+#define zm(p) sm(p), sm(p | z)
+#define om(p) zm(p), zm(p | o)
+#define um(p) om(p), om(p | u)
+#define xm um(0), um(x)
+
+  static constexpr uint32_t map[]{xm};
+  static constexpr uint32_t mapSize{sizeof(map) / sizeof(uint32_t)};
+  static_assert(mapSize == 64);
+  if (excepts == 0 || excepts >= mapSize) {
+    terminator.Crash("Invalid excepts value: %d", excepts);
   }
-
-  // Fortran and fenv.h values are identical; return the value.
-  if constexpr (_FORTRAN_RUNTIME_IEEE_INVALID == FE_INVALID &&
-      _FORTRAN_RUNTIME_IEEE_DENORM == __FE_DENORM &&
-      _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO == FE_DIVBYZERO &&
-      _FORTRAN_RUNTIME_IEEE_OVERFLOW == FE_OVERFLOW &&
-      _FORTRAN_RUNTIME_IEEE_UNDERFLOW == FE_UNDERFLOW &&
-      _FORTRAN_RUNTIME_IEEE_INEXACT == FE_INEXACT) {
-    return except;
-  }
-
-  // fenv.h calls that take exception arguments are able to process multiple
-  // exceptions in one call, such as FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID.
-  // And intrinsic module procedures that manage exceptions are elemental
-  // procedures that may specify multiple exceptions, such as ieee_all.
-  // However, general elemental call processing places single scalar arguments
-  // in a loop. As a consequence, argument 'except' here will be a power of
-  // two, corresponding to a single exception. If code generation were
-  // modified to bypass normal elemental call processing for calls with
-  // ieee_usual, ieee_all, or user-specified array arguments, this switch
-  // could be extended to support that.
-
-  // Fortran and fenv.h values differ.
-  switch (except) {
-  case _FORTRAN_RUNTIME_IEEE_INVALID:
-    return FE_INVALID;
-  case _FORTRAN_RUNTIME_IEEE_DENORM:
-    if (__FE_DENORM) {
-      return __FE_DENORM;
-    }
-    break;
-  case _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO:
-    return FE_DIVBYZERO;
-  case _FORTRAN_RUNTIME_IEEE_OVERFLOW:
-    return FE_OVERFLOW;
-  case _FORTRAN_RUNTIME_IEEE_UNDERFLOW:
-    return FE_UNDERFLOW;
-  case _FORTRAN_RUNTIME_IEEE_INEXACT:
-    return FE_INEXACT;
-  }
-
-  terminator.Crash("Invalid exception set: %d", except);
+  return map[excepts];
 }
 
 // Verify that the size of ieee_modes_type and ieee_status_type objects from
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp
index 2f7fcd2e2341f..f7d893829fc0d 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang/runtime/misc-intrinsic.cpp
@@ -56,10 +56,10 @@ static RT_API_ATTRS void TransferImpl(Descriptor &result,
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2,
+void RTDEF(Rename)(const Descriptor &path1, const Descriptor &path2,
     const Descriptor *status, const char *sourceFile, int line) {
   Terminator terminator{sourceFile, line};
-
+#if !defined(RT_DEVICE_COMPILATION)
   char *pathSrc{EnsureNullTerminated(
       path1.OffsetElement(), path1.ElementBytes(), terminator)};
   char *pathDst{EnsureNullTerminated(
@@ -84,6 +84,9 @@ void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2,
   if (pathDst != path2.OffsetElement()) {
     FreeMemory(pathDst);
   }
+#else // !defined(RT_DEVICE_COMPILATION)
+  terminator.Crash("RENAME intrinsic is only supported on host devices");
+#endif // !defined(RT_DEVICE_COMPILATION)
 }
 
 void RTDEF(Transfer)(Descriptor &result, const Descriptor &source,
diff --git a/flang/runtime/stack.h b/flang/runtime/stack.h
new file mode 100644
index 0000000000000..b6e6edb595e9a
--- /dev/null
+++ b/flang/runtime/stack.h
@@ -0,0 +1,136 @@
+//===-- runtime/stack.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Trivial implementation of stack that can be used on all targets.
+// It is a list based stack with dynamic allocation/deallocation
+// of the list nodes.
+
+#ifndef FORTRAN_RUNTIME_STACK_H
+#define FORTRAN_RUNTIME_STACK_H
+
+#include "terminator.h"
+#include "flang/Runtime/memory.h"
+
+namespace Fortran::runtime {
+// Storage for the Stack elements of type T.
+template <typename T, unsigned N> struct StackStorage {
+  RT_API_ATTRS void *getElement(unsigned i) {
+    if (i < N) {
+      return storage[i];
+    } else {
+      return nullptr;
+    }
+  }
+  RT_API_ATTRS const void *getElement(unsigned i) const {
+    if (i < N) {
+      return storage[i];
+    } else {
+      return nullptr;
+    }
+  }
+
+private:
+  // Storage to hold N elements of type T.
+  // It is declared as an array of bytes to avoid
+  // default construction (if any is implied by type T).
+  alignas(T) char storage[N][sizeof(T)];
+};
+
+// 0-size specialization that provides no storage.
+template <typename T> struct alignas(T) StackStorage<T, 0> {
+  RT_API_ATTRS void *getElement(unsigned) { return nullptr; }
+  RT_API_ATTRS const void *getElement(unsigned) const { return nullptr; }
+};
+
+template <typename T, unsigned N = 0> class Stack : public StackStorage<T, N> {
+public:
+  Stack() = delete;
+  Stack(const Stack &) = delete;
+  Stack(Stack &&) = delete;
+  RT_API_ATTRS Stack(Terminator &terminator) : terminator_{terminator} {}
+  RT_API_ATTRS ~Stack() {
+    while (!empty()) {
+      pop();
+    }
+  }
+  RT_API_ATTRS void push(const T &object) {
+    if (void *ptr{this->getElement(size_)}) {
+      new (ptr) T{object};
+    } else {
+      top_ = New<List>{terminator_}(top_, object).release();
+    }
+    ++size_;
+  }
+  RT_API_ATTRS void push(T &&object) {
+    if (void *ptr{this->getElement(size_)}) {
+      new (ptr) T{std::move(object)};
+    } else {
+      top_ = New<List>{terminator_}(top_, std::move(object)).release();
+    }
+    ++size_;
+  }
+  template <typename... Args> RT_API_ATTRS void emplace(Args &&...args) {
+    if (void *ptr{this->getElement(size_)}) {
+      new (ptr) T{std::forward<Args>(args)...};
+    } else {
+      top_ =
+          New<List>{terminator_}(top_, std::forward<Args>(args)...).release();
+    }
+    ++size_;
+  }
+  RT_API_ATTRS T &top() {
+    RUNTIME_CHECK(terminator_, size_ > 0);
+    if (void *ptr{this->getElement(size_ - 1)}) {
+      return *reinterpret_cast<T *>(ptr);
+    } else {
+      RUNTIME_CHECK(terminator_, top_);
+      return top_->object_;
+    }
+  }
+  RT_API_ATTRS const T &top() const {
+    RUNTIME_CHECK(terminator_, size_ > 0);
+    if (void *ptr{this->getElement(size_ - 1)}) {
+      return *reinterpret_cast<const T *>(ptr);
+    } else {
+      RUNTIME_CHECK(terminator_, top_);
+      return top_->object_;
+    }
+  }
+  RT_API_ATTRS void pop() {
+    RUNTIME_CHECK(terminator_, size_ > 0);
+    if (void *ptr{this->getElement(size_ - 1)}) {
+      reinterpret_cast<T *>(ptr)->~T();
+    } else {
+      RUNTIME_CHECK(terminator_, top_);
+      List *next{top_->next_};
+      top_->~List();
+      FreeMemory(top_);
+      top_ = next;
+    }
+    --size_;
+  }
+  RT_API_ATTRS bool empty() const { return size_ == 0; }
+
+private:
+  struct List {
+    template <typename... Args>
+    RT_API_ATTRS List(List *next, Args &&...args)
+        : next_(next), object_(std::forward<Args>(args)...) {}
+    RT_API_ATTRS List(List *next, const T &object)
+        : next_(next), object_(object) {}
+    RT_API_ATTRS List(List *next, T &&object)
+        : next_(next), object_(std::move(object)) {}
+    List *next_{nullptr};
+    T object_;
+  };
+  List *top_{nullptr};
+  std::size_t size_{0};
+  Terminator &terminator_;
+};
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_STACK_H
diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp
index cf1e61c0844d8..b6b204be4418c 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang/runtime/transformational.cpp
@@ -508,7 +508,8 @@ void RTDEF(CshiftVector)(Descriptor &result, const Descriptor &source,
   SubscriptValue lb{sourceDim.LowerBound()};
   for (SubscriptValue j{0}; j < extent; ++j) {
     SubscriptValue resultAt{1 + j};
-    SubscriptValue sourceAt{lb + (j + shift) % extent};
+    SubscriptValue sourceAt{
+        lb + static_cast<SubscriptValue>(j + shift) % extent};
     if (sourceAt < lb) {
       sourceAt += extent;
     }
@@ -619,7 +620,7 @@ void RTDEF(EoshiftVector)(Descriptor &result, const Descriptor &source,
   }
   SubscriptValue lb{source.GetDimension(0).LowerBound()};
   for (SubscriptValue j{1}; j <= extent; ++j) {
-    SubscriptValue sourceAt{lb + j - 1 + shift};
+    SubscriptValue sourceAt{lb + j - 1 + static_cast<SubscriptValue>(shift)};
     if (sourceAt >= lb && sourceAt < lb + extent) {
       CopyElement(result, &j, source, &sourceAt, terminator);
     } else if (boundary) {
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index a11f444d8d754..5c5bca835f3d8 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -268,6 +268,7 @@ void ExternalFileUnit::FinishReadingRecord(IoErrorHandler &handler) {
     recordOffsetInFrame_ = 0;
   }
   BeginRecord();
+  leftTabLimit.reset();
 }
 
 bool ExternalFileUnit::AdvanceRecord(IoErrorHandler &handler) {
diff --git a/flang/test/Driver/Inputs/config-1.cfg b/flang/test/Driver/Inputs/config-1.cfg
new file mode 100644
index 0000000000000..824e128a42b63
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-1.cfg
@@ -0,0 +1 @@
+-flto
diff --git a/flang/test/Driver/Inputs/config-2.cfg b/flang/test/Driver/Inputs/config-2.cfg
new file mode 100644
index 0000000000000..4e8d01b668e83
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-2.cfg
@@ -0,0 +1 @@
+-fno-signed-zeros
diff --git a/flang/test/Driver/Inputs/config-2a.cfg b/flang/test/Driver/Inputs/config-2a.cfg
new file mode 100644
index 0000000000000..cd2916c98afe2
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-2a.cfg
@@ -0,0 +1 @@
+-fopenmp
diff --git a/flang/test/Driver/Inputs/config-6.cfg b/flang/test/Driver/Inputs/config-6.cfg
new file mode 100644
index 0000000000000..81e9830f63be4
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-6.cfg
@@ -0,0 +1 @@
+-fstack-arrays
diff --git a/flang/test/Driver/Inputs/config/config-4.cfg b/flang/test/Driver/Inputs/config/config-4.cfg
new file mode 100644
index 0000000000000..d15a7108d4e21
--- /dev/null
+++ b/flang/test/Driver/Inputs/config/config-4.cfg
@@ -0,0 +1 @@
+-O3
diff --git a/flang/test/Driver/Inputs/config2/config-4.cfg b/flang/test/Driver/Inputs/config2/config-4.cfg
new file mode 100644
index 0000000000000..9d1c3e38c8680
--- /dev/null
+++ b/flang/test/Driver/Inputs/config2/config-4.cfg
@@ -0,0 +1 @@
+-ffp-contract=fast
diff --git a/flang/test/Driver/config-file.f90 b/flang/test/Driver/config-file.f90
new file mode 100644
index 0000000000000..70316dd971f36
--- /dev/null
+++ b/flang/test/Driver/config-file.f90
@@ -0,0 +1,63 @@
+!--- Config file (full path) in output of -###
+!
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-DIRS
+! CHECK-DIRS: System configuration file directory: {{.*}}/Inputs/config
+! CHECK-DIRS: User configuration file directory: {{.*}}/Inputs/config2
+!
+!--- Config file (full path) in output of -###
+!
+! RUN: %flang --config %S/Inputs/config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH
+! RUN: %flang --config=%S/Inputs/config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH
+! CHECK-HHH: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-HHH: -flto
+!
+!
+!--- Config file (full path) in output of -v
+!
+! RUN: %flang --config %S/Inputs/config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V
+! CHECK-V: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-V: -flto
+!
+!--- Config file in output of -###
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH2
+! CHECK-HHH2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-HHH2: -flto
+!
+!--- Config file in output of -v
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V2
+! CHECK-V2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-V2: -flto
+!
+!--- Nested config files
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-2.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED
+! CHECK-NESTED: Configuration file: {{.*}}Inputs{{.}}config-2.cfg
+! CHECK-NESTED: -fno-signed-zeros
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir=%S/Inputs/config --config config-6.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2
+! CHECK-NESTED2: Configuration file: {{.*}}Inputs{{.}}config-6.cfg
+! CHECK-NESTED2: -fstack-arrays
+!
+!
+! RUN: %flang --config %S/Inputs/config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTEDa
+! CHECK-NESTEDa: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+! CHECK-NESTEDa: -fopenmp
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2a
+! CHECK-NESTED2a: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+! CHECK-NESTED2a: -fopenmp
+!
+!--- User directory is searched first.
+!
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 --config config-4.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-PRECEDENCE
+! CHECK-PRECEDENCE: Configuration file: {{.*}}Inputs{{.}}config2{{.}}config-4.cfg
+! CHECK-PRECEDENCE: -ffp-contract=fast
+!
+!--- Multiple configuration files can be specified.
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir= --config config-4.cfg --config %S/Inputs/config2/config-4.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-TWO-CONFIGS
+! CHECK-TWO-CONFIGS: Configuration file: {{.*}}Inputs{{.}}config{{.}}config-4.cfg
+! CHECK-TWO-CONFIGS-NEXT: Configuration file: {{.*}}Inputs{{.}}config2{{.}}config-4.cfg
+! CHECK-TWO-CONFIGS: -ffp-contract=fast
+! CHECK-TWO-CONFIGS: -O3
diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90
index 6fb4f4eeeeca1..b0b94ab1386a7 100644
--- a/flang/test/Driver/omp-driver-offload.f90
+++ b/flang/test/Driver/omp-driver-offload.f90
@@ -14,12 +14,12 @@
 ! Test regular -fopenmp with offload, and invocation filtering options
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE
 
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-device \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE
 
 ! OFFLOAD-HOST-AND-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -29,7 +29,7 @@
 
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-only \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=OFFLOAD-HOST
 
 ! OFFLOAD-HOST: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -39,7 +39,7 @@
 
 ! RUN: %flang -S -### %s 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-device-only \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=OFFLOAD-DEVICE
 
 ! OFFLOAD-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -48,13 +48,13 @@
 ! OFFLOAD-DEVICE-NOT: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
 
 ! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp 
-! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s
+! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s
 ! CHECK-OPENMP-IS-TARGET-DEVICE: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" {{.*}}.f90"
 
 ! Testing appropriate flags are gnerated and appropriately assigned by the driver when offloading
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=OPENMP-OFFLOAD-ARGS
 ! OPENMP-OFFLOAD-ARGS: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" {{.*}} "-fopenmp" {{.*}}.f90"
 ! OPENMP-OFFLOAD-ARGS-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
@@ -70,19 +70,19 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-threads-oversubscription \
+! RUN: -fopenmp-assume-threads-oversubscription -nogpulib \
 ! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
 ! RUN: -fopenmp-targets=nvptx64-nvidia-cuda \
-! RUN: -fopenmp-assume-threads-oversubscription \
+! RUN: -fopenmp-assume-threads-oversubscription  \
 ! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS
 ! CHECK-THREADS-OVS: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" "-fopenmp-assume-threads-oversubscription" {{.*}}.f90"
 
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-teams-oversubscription  \
+! RUN: -fopenmp-assume-teams-oversubscription  -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-TEAMS-OVS
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -94,7 +94,7 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-no-nested-parallelism  \
+! RUN: -fopenmp-assume-no-nested-parallelism  -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-NEST-PAR
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -106,7 +106,7 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-no-thread-state \
+! RUN: -fopenmp-assume-no-thread-state -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-THREAD-STATE
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -118,7 +118,7 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-target-debug \
+! RUN: -fopenmp-target-debug -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -130,7 +130,7 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-target-debug \
+! RUN: -fopenmp-target-debug -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -144,7 +144,7 @@
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
 ! RUN: -fopenmp-target-debug -fopenmp-assume-threads-oversubscription \
 ! RUN: -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism \
-! RUN: -fopenmp-assume-no-thread-state \
+! RUN: -fopenmp-assume-no-thread-state -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-RTL-ALL
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -160,7 +160,7 @@
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=gfx90a \
 ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-version=45 \
+! RUN: -fopenmp-version=45 -nogpulib\
 ! RUN: | FileCheck %s --check-prefixes=CHECK-OPENMP-VERSION
 ! RUN: %flang -### %s -o %t 2>&1 \
 ! RUN: -fopenmp --offload-arch=sm_70 \
@@ -175,33 +175,6 @@
 ! RUN: | FileCheck %s --check-prefix=HOST-IR-MISSING
 ! HOST-IR-MISSING: error: provided host compiler IR file 'non-existant-file.bc' is required to generate code for OpenMP target regions but cannot be found
 
-! Check that `-gpulibc` includes the LLVM C libraries for the GPU.
-! RUN:   %flang -### --target=x86_64-unknown-linux-gnu -fopenmp  \
-! RUN:      --offload-arch=sm_52 \
-! RUN:      -gpulibc %s 2>&1 \
-! RUN:   | FileCheck --check-prefix=LIBC-GPU-NVPTX %s
-! LIBC-GPU-NVPTX-DAG: "-lcgpu-nvptx"
-! LIBC-GPU-NVPTX-DAG: "-lmgpu-nvptx"
-
-! RUN:   %flang -### --target=x86_64-unknown-linux-gnu -fopenmp  \
-! RUN:      --offload-arch=sm_52 \
-! RUN:      -nogpulibc %s 2>&1 \
-! RUN:   | FileCheck --check-prefix=NO-LIBC-GPU-NVPTX %s
-! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx"
-
-! RUN:   %flang -### --target=x86_64-unknown-linux-gnu -fopenmp  \
-! RUN:      --offload-arch=gfx90a \
-! RUN:      -gpulibc %s 2>&1 \
-! RUN:   | FileCheck --check-prefix=LIBC-GPU-AMDGPU %s
-! LIBC-GPU-AMDGPU-DAG: "-lcgpu-amdgpu"
-! LIBC-GPU-AMDGPU-DAG: "-lmgpu-amdgpu"
-
-! RUN:   %flang -### --target=x86_64-unknown-linux-gnu -fopenmp  \
-! RUN:      --offload-arch=gfx90a \
-! RUN:      -nogpulibc %s 2>&1 \
-! RUN:   | FileCheck --check-prefix=NO-LIBC-GPU-AMDGPU %s
-! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu"
-
 ! RUN:   %flang -### -v --target=x86_64-unknown-linux-gnu -fopenmp  \
 ! RUN:      --offload-arch=gfx900 \
 ! RUN:      --rocm-path=%S/Inputs/rocm %s 2>&1 \
@@ -220,10 +193,35 @@
 ! Test -fopenmp-force-usm option with offload
 ! RUN: %flang -S -### %s -o %t 2>&1 \
 ! RUN: -fopenmp -fopenmp-force-usm --offload-arch=gfx90a \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
 ! RUN:   | FileCheck %s --check-prefix=FORCE-USM-OFFLOAD
 
 ! FORCE-USM-OFFLOAD: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
 ! FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm"
 ! FORCE-USM-OFFLOAD-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
 ! FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm"
+
+! RUN:   %flang -### -v --target=x86_64-unknown-linux-gnu -fopenmp  \
+! RUN:      --offload-arch=gfx900 \
+! RUN:      --rocm-path=%S/Inputs/rocm %s 2>&1 \
+! RUN:   | FileCheck --check-prefix=MLINK-BUILTIN-BITCODE  %s
+! MLINK-BUILTIN-BITCODE:      "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
+! MLINK-BUILTIN-BITCODE-SAME: "-mlink-builtin-bitcode" {{.*Inputs.*rocm.*amdgcn.*bitcode.*}}oclc_isa_version_900.bc
+
+! Test that the -fopenmp-targets option is added to host compilation invocations
+! when --offload-arch or -fopenmp-targets are set.
+! RUN: %flang -S -### %s -o %t 2>&1 \
+! RUN: -fopenmp --offload-arch=gfx90a \
+! RUN: --target=x86_64-unknown-linux-gnu -nogpulib \
+! RUN: | FileCheck %s --check-prefix=OFFLOAD-TARGETS
+! RUN: %flang -S -### %s -o %t 2>&1 \
+! RUN: -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa --offload-arch=gfx90a \
+! RUN: --target=x86_64-unknown-linux-gnu -nogpulib \
+! RUN: | FileCheck %s --check-prefix=OFFLOAD-TARGETS
+
+! OFFLOAD-TARGETS: "{{[^"]*}}flang-new" "-fc1" "-triple" "x86_64-unknown-linux-gnu"
+! OFFLOAD-TARGETS-SAME: "-fopenmp-targets=amdgcn-amd-amdhsa"
+! OFFLOAD-TARGETS-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
+! OFFLOAD-TARGETS-NOT: -fopenmp-targets
+! OFFLOAD-TARGETS: "{{[^"]*}}flang-new" "-fc1" "-triple" "x86_64-unknown-linux-gnu"
+! OFFLOAD-TARGETS-SAME: "-fopenmp-targets=amdgcn-amd-amdhsa"
diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90
index 0f19e4ebff2a0..1c77d4ace5fbc 100644
--- a/flang/test/Driver/target-cpu-features.f90
+++ b/flang/test/Driver/target-cpu-features.f90
@@ -29,10 +29,10 @@
 ! RUN: %flang --target=riscv64-linux-gnu -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-RV64
 
-! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -c %s -### 2>&1 \
+! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -nogpulib -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU
 
-! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -c %s -### 2>&1 \
+! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -nogpulib -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU-R600
 
 ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu"
diff --git a/flang/test/Driver/target-gpu-features.f90 b/flang/test/Driver/target-gpu-features.f90
index 9cc9ce4baaf4d..b783574370a0f 100644
--- a/flang/test/Driver/target-gpu-features.f90
+++ b/flang/test/Driver/target-gpu-features.f90
@@ -3,7 +3,7 @@
 ! Test that -mcpu are used and that the -target-cpu and -target-features
 ! are also added to the fc1 command.
 
-! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902 -c %s -### 2>&1 \
+! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902  -nogpulib -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGCN
 
 ! CHECK-AMDGCN: "-fc1" "-triple" "amdgcn-amd-amdhsa"
diff --git a/flang/test/Evaluate/fold-assumed-type-rank.f90 b/flang/test/Evaluate/fold-assumed-type-rank.f90
new file mode 100644
index 0000000000000..ce296c8e27abf
--- /dev/null
+++ b/flang/test/Evaluate/fold-assumed-type-rank.f90
@@ -0,0 +1,6 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine sub3(ar_at)
+  type(*) :: ar_at(..)
+!CHECK:  PRINT *, int(int(rank(ar_at),kind=8),kind=4)
+  print *, rank(ar_at)
+end
diff --git a/flang/test/Evaluate/fold-nearest.f90 b/flang/test/Evaluate/fold-nearest.f90
index a7366e6d75407..41853a699d8e9 100644
--- a/flang/test/Evaluate/fold-nearest.f90
+++ b/flang/test/Evaluate/fold-nearest.f90
@@ -39,6 +39,7 @@ subroutine subr(a)
 module m2
   use ieee_arithmetic, only: ieee_next_after
   real, parameter :: minSubnormal = 1.e-45
+  real, parameter :: h = huge(0.0)
   logical, parameter :: test_0 = ieee_next_after(0., 0.) == 0.
   logical, parameter :: test_1 = ieee_next_after(0., 1.) == minSubnormal
   logical, parameter :: test_2 = ieee_next_after(minSubnormal, -1.) == 0
@@ -47,9 +48,9 @@ module m2
   !WARN: warning: division by zero
   real, parameter :: inf = 1. / 0.
   logical, parameter :: test_5 = ieee_next_after(inf, inf) == inf
-  logical, parameter :: test_6 = ieee_next_after(inf, -inf) == inf
-  logical, parameter :: test_7 = ieee_next_after(-inf, inf) == -inf
-  logical, parameter :: test_8 = ieee_next_after(-inf, -1.) == -inf
+  logical, parameter :: test_6 = ieee_next_after(inf, -inf) == h
+  logical, parameter :: test_7 = ieee_next_after(-inf, inf) == -h
+  logical, parameter :: test_8 = ieee_next_after(-inf, -1.) == -h
   logical, parameter :: test_9 = ieee_next_after(1.9999999, 3.) == 2.
   logical, parameter :: test_10 = ieee_next_after(2., 1.) == 1.9999999
 #if __x86_64__
@@ -69,6 +70,7 @@ module m2
 module m3
   use ieee_arithmetic, only: ieee_next_up, ieee_next_down
   real(kind(0.d0)), parameter :: minSubnormal = 5.d-324
+  real(kind(0.d0)), parameter :: h = huge(0.d0)
   logical, parameter :: test_1 = ieee_next_up(0.d0) == minSubnormal
   logical, parameter :: test_2 = ieee_next_down(0.d0) == -minSubnormal
   logical, parameter :: test_3 = ieee_next_up(1.d0) == 1.0000000000000002d0
@@ -81,10 +83,8 @@ module m3
   logical, parameter :: test_6 = ieee_next_down(-huge(0.d0)) == -inf
   !WARN: warning: IEEE_NEXT_UP intrinsic folding: bad argument
   logical, parameter :: test_7 = ieee_next_up(inf) == inf
-  !WARN: warning: IEEE_NEXT_DOWN intrinsic folding: bad argument
-  logical, parameter :: test_8 = ieee_next_down(inf) == inf
-  !WARN: warning: IEEE_NEXT_UP intrinsic folding: bad argument
-  logical, parameter :: test_9 = ieee_next_up(-inf) == -inf
+  logical, parameter :: test_8 = ieee_next_down(inf) == h
+  logical, parameter :: test_9 = ieee_next_up(-inf) == -h
   !WARN: warning: IEEE_NEXT_DOWN intrinsic folding: bad argument
   logical, parameter :: test_10 = ieee_next_down(-inf) == -inf
   logical, parameter :: test_11 = ieee_next_up(1.9999999999999997d0) == 2.d0
diff --git a/flang/test/Evaluate/folding08.f90 b/flang/test/Evaluate/folding08.f90
index 1b2e5605e85d4..53603474fe1c8 100644
--- a/flang/test/Evaluate/folding08.f90
+++ b/flang/test/Evaluate/folding08.f90
@@ -11,6 +11,11 @@ module m
   end type
   type(t) :: ta(0:2)
   character(len=2) :: ca(-1:1)
+  interface
+    function foo()
+      real :: foo(2:3,4:6)
+    end function
+  end interface
   integer, parameter :: lbtadim = lbound(ta,1)
   logical, parameter :: test_lbtadim = lbtadim == 0
   integer, parameter :: ubtadim = ubound(ta,1)
@@ -47,9 +52,6 @@ module m
   logical, parameter :: test_lb_empty_dim = lbound(empty, 1) == 1
   logical, parameter :: test_ub_empty_dim = ubound(empty, 1) == 0
  contains
-  function foo()
-    real :: foo(2:3,4:6)
-  end function
   subroutine test(n1,a1,a2)
     integer, intent(in) :: n1
     real, intent(in) :: a1(1:n1), a2(0:*)
diff --git a/flang/test/Evaluate/rewrite08.f90 b/flang/test/Evaluate/rewrite08.f90
new file mode 100644
index 0000000000000..c59605581d63c
--- /dev/null
+++ b/flang/test/Evaluate/rewrite08.f90
@@ -0,0 +1,21 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine s(oi,ol)
+  integer(1), optional, intent(in) :: oi
+  logical(1), optional, intent(in) :: ol
+  integer(1), allocatable :: ai
+  logical(1), allocatable :: al
+  integer(1), pointer :: pi
+  logical(1), pointer :: pl
+!CHECK: PRINT *, ishftc(-1_4,1_4,oi)
+!CHECK: PRINT *, ishftc(-1_4,1_4,ai)
+!CHECK: PRINT *, ishftc(-1_4,1_4,pi)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=ol)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=al)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=pl)
+  print *, ishftc(-1,1,oi)
+  print *, ishftc(-1,1,ai)
+  print *, ishftc(-1,1,pi)
+  print *, findloc([1,2,1],1,back=ol)
+  print *, findloc([1,2,1],1,back=al)
+  print *, findloc([1,2,1],1,back=pl)
+end
diff --git a/flang/test/Fir/abstract-results.fir b/flang/test/Fir/abstract-results.fir
index 82f1cd33073fd..93e63dc657f0c 100644
--- a/flang/test/Fir/abstract-results.fir
+++ b/flang/test/Fir/abstract-results.fir
@@ -87,8 +87,8 @@ func.func @boxfunc_callee() -> !fir.box<!fir.heap<f64>> {
   // FUNC-BOX: return
 }
 
-// FUNC-REF-LABEL: func @retcptr() -> i64
-// FUNC-BOX-LABEL: func @retcptr() -> i64
+// FUNC-REF-LABEL: func @retcptr() -> !fir.ref<none>
+// FUNC-BOX-LABEL: func @retcptr() -> !fir.ref<none>
 func.func @retcptr() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {
   %0 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "rec", uniq_name = "_QFrecErec"}
   %1 = fir.load %0 : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>
@@ -98,12 +98,14 @@ func.func @retcptr() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__addres
   // FUNC-REF: %[[FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
   // FUNC-REF: %[[ADDR:.*]] = fir.coordinate_of %[[ALLOC]], %[[FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
   // FUNC-REF: %[[VAL:.*]] = fir.load %[[ADDR]] : !fir.ref<i64>
-  // FUNC-REF: return %[[VAL]] : i64
+  // FUNC-REF: %[[CAST:.*]] = fir.convert %[[VAL]] : (i64) -> !fir.ref<none>
+  // FUNC-REF: return %[[CAST]] : !fir.ref<none>
   // FUNC-BOX: %[[ALLOC:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "rec", uniq_name = "_QFrecErec"}
   // FUNC-BOX: %[[FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
   // FUNC-BOX: %[[ADDR:.*]] = fir.coordinate_of %[[ALLOC]], %[[FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
   // FUNC-BOX: %[[VAL:.*]] = fir.load %[[ADDR]] : !fir.ref<i64>
-  // FUNC-BOX: return %[[VAL]] : i64
+  // FUNC-BOX: %[[CAST:.*]] = fir.convert %[[VAL]] : (i64) -> !fir.ref<none>
+  // FUNC-BOX: return %[[CAST]] : !fir.ref<none>
 }
 
 // FUNC-REF-LABEL:  func private @arrayfunc_callee_declare(
@@ -311,8 +313,8 @@ func.func @test_address_of() {
 
 }
 
-// FUNC-REF-LABEL: func.func private @returns_null() -> i64
-// FUNC-BOX-LABEL: func.func private @returns_null() -> i64
+// FUNC-REF-LABEL: func.func private @returns_null() -> !fir.ref<none>
+// FUNC-BOX-LABEL: func.func private @returns_null() -> !fir.ref<none>
 func.func private @returns_null() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 
 // FUNC-REF-LABEL: func @test_address_of_cptr
@@ -323,12 +325,12 @@ func.func @test_address_of_cptr() {
   fir.call @_QMtest_c_func_modPsubr(%1) : (() -> ()) -> ()
   return
 
-  // FUNC-REF: %[[VAL_0:.*]] = fir.address_of(@returns_null) : () -> i64
-  // FUNC-REF: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> i64) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
+  // FUNC-REF: %[[VAL_0:.*]] = fir.address_of(@returns_null) : () -> !fir.ref<none>
+  // FUNC-REF: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> !fir.ref<none>) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
   // FUNC-REF: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> ())
   // FUNC-REF: fir.call @_QMtest_c_func_modPsubr(%[[VAL_2]]) : (() -> ()) -> ()
-  // FUNC-BOX: %[[VAL_0:.*]] = fir.address_of(@returns_null) : () -> i64
-  // FUNC-BOX: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> i64) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
+  // FUNC-BOX: %[[VAL_0:.*]] = fir.address_of(@returns_null) : () -> !fir.ref<none>
+  // FUNC-BOX: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> !fir.ref<none>) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
   // FUNC-BOX: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> ())
   // FUNC-BOX: fir.call @_QMtest_c_func_modPsubr(%[[VAL_2]]) : (() -> ()) -> ()
 }
@@ -380,18 +382,20 @@ func.func @test_indirect_calls_return_cptr(%arg0: () -> ()) {
 
   // FUNC-REF: %[[VAL_0:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = ".result"}
   // FUNC-REF: %[[VAL_1:.*]] = fir.convert %[[ARG0]] : (() -> ()) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
-  // FUNC-REF: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> i64)
-  // FUNC-REF: %[[VAL_3:.*]] = fir.call %[[VAL_2]]() : () -> i64
+  // FUNC-REF: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> !fir.ref<none>)
+  // FUNC-REF: %[[VAL_3:.*]] = fir.call %[[VAL_2]]() : () -> !fir.ref<none>
   // FUNC-REF: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
   // FUNC-REF: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
-  // FUNC-REF: fir.store %[[VAL_3]] to %[[VAL_5]] : !fir.ref<i64>
+  // FUNC-REF: %[[CAST:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<none>) -> i64
+  // FUNC-REF: fir.store %[[CAST]] to %[[VAL_5]] : !fir.ref<i64>
   // FUNC-BOX: %[[VAL_0:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = ".result"}
   // FUNC-BOX: %[[VAL_1:.*]] = fir.convert %[[ARG0]] : (() -> ()) -> (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>)
-  // FUNC-BOX: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> i64)
-  // FUNC-BOX: %[[VAL_3:.*]] = fir.call %[[VAL_2]]() : () -> i64
+  // FUNC-BOX: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (() -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> (() -> !fir.ref<none>)
+  // FUNC-BOX: %[[VAL_3:.*]] = fir.call %[[VAL_2]]() : () -> !fir.ref<none>
   // FUNC-BOX: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
   // FUNC-BOX: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
-  // FUNC-BOX: fir.store %[[VAL_3]] to %[[VAL_5]] : !fir.ref<i64>
+  // FUNC-BOX: %[[CAST:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<none>) -> i64
+  // FUNC-BOX: fir.store %[[CAST]] to %[[VAL_5]] : !fir.ref<i64>
 }
 
 // ----------------------- Test GlobalOp rewrite ------------------------
diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir
index 1a9c9e0496814..81a4d8bc13bf0 100644
--- a/flang/test/Fir/box.fir
+++ b/flang/test/Fir/box.fir
@@ -1,7 +1,7 @@
 // RUN: tco -o - %s | FileCheck %s
 
 // Global box initialization (test must come first because llvm globals are emitted first).
-// CHECK-LABEL: @globalx = internal global { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }
+// CHECK-LABEL: @globalx = internal global { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 9, i8 2, i8 0 }
 fir.global internal @globalx : !fir.box<!fir.heap<i32>> {
   %c0 = arith.constant 0 : index
   %0 = fir.convert %c0 : (index) -> !fir.heap<i32>
@@ -9,7 +9,7 @@ fir.global internal @globalx : !fir.box<!fir.heap<i32>> {
   fir.has_value %1 : !fir.box<!fir.heap<i32>>
 }
 
-// CHECK-LABEL: @globaly = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr null, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 27, i8 2, i8 0,{{.*}}[3 x i64] [i64 1, i64 0, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)]
+// CHECK-LABEL: @globaly = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr null, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 2, i8 0,{{.*}}[3 x i64] [i64 1, i64 0, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)]
 fir.global internal @globaly : !fir.box<!fir.heap<!fir.array<?xf32>>> {
   %c0 = arith.constant 0 : index
   %0 = fir.convert %c0 : (index) -> !fir.heap<!fir.array<?xf32>>
@@ -27,7 +27,7 @@ func.func private @ga(%b : !fir.box<!fir.array<?xf32>>)
 // CHECK: (ptr %[[ARG:.*]])
 func.func @f(%a : !fir.ref<f32>) {
   // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
-  // CHECK: %[[INS0:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 27, i8 0, i8 0 }, ptr %[[ARG]], 0
+  // CHECK: %[[INS0:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 27, i8 0, i8 0 }, ptr %[[ARG]], 0
   // CHECK: store {{.*}} %[[INS0]], {{.*}} %[[DESC]]
   %b = fir.embox %a : (!fir.ref<f32>) -> !fir.box<f32>
 
@@ -44,7 +44,7 @@ func.func @fa(%a : !fir.ref<!fir.array<100xf32>>) {
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %d = fir.shape %c100 : (index) -> !fir.shape<1>
-  // CHECK: %[[INS70:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 27, i8 0, i8 0, {{.*}} }, ptr %{{.*}}, 0
+  // CHECK: %[[INS70:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 0, {{.*}} }, ptr %{{.*}}, 0
   %b = fir.embox %c(%d) : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
   // CHECK: call void @ga(
   fir.call @ga(%b) : (!fir.box<!fir.array<?xf32>>) -> ()
@@ -58,7 +58,7 @@ func.func @fa(%a : !fir.ref<!fir.array<100xf32>>) {
 func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.char<1,?>> {
   // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1
-  // CHECK: insertvalue {{.*}} i32 20180515, 2
+  // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %x = fir.embox %arg0 typeparams %arg1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
   // CHECK: store {{.*}}, ptr %[[res]]
@@ -71,7 +71,7 @@ func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.
 // CHECK-SAME: ptr %[[arg0:.*]], i64 %[[arg1:.*]])
 func.func @b2(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,5>>>, %arg1 : index) -> !fir.box<!fir.array<?x!fir.char<1,5>>> {
   %1 = fir.shape %arg1 : (index) -> !fir.shape<1>
-  // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), i32 20180515, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1
+  // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1
   // CHECK: insertvalue {{.*}} %{{.*}}, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<?x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<?x!fir.char<1,5>>>
@@ -86,7 +86,7 @@ func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %ar
   %1 = fir.shape %arg2 : (index) -> !fir.shape<1>
   // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
-  // CHECK: insertvalue {{.*}} i32 20180515, 2
+  // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 %[[arg2]], 7, 0, 1
   // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
@@ -103,7 +103,7 @@ func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) ->
   %1 = fir.shape %c_7 : (index) -> !fir.shape<1>
   // CHECK:   %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
-  // CHECK: insertvalue {{.*}} i32 20180515, 2
+  // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 7, 7, 0, 1
   // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
@@ -147,7 +147,7 @@ func.func @box6(%0 : !fir.ref<!fir.array<?x?x?x?xf32>>, %1 : index, %2 : index)
   // CHECK: %[[sdp2:.*]] = sdiv i64 %[[dp2]], 2
   // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[sdp2]], 0
   // CHECK: %[[extent:.*]] = select i1 %[[cmp]], i64 %[[sdp2]], i64 0
-  // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20180515, i8 2, i8 27, i8 0, i8 0, [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 undef, i64 undef], [3 x i64] undef] }, i64 %[[extent]], 7, 0, 1
+  // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 2, i8 27, i8 0, i8 0, [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 undef, i64 undef], [3 x i64] undef] }, i64 %[[extent]], 7, 0, 1
   // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 mul (i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i64 200), 7, 0, 2
   // CHECK: %[[op25:.*]] = add i64 25000, %[[i100p40]]
   // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 1, 7, 1, 0
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index eca762d52a724..4b9afd5675ea3 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -472,7 +472,7 @@ func.func @_QPomp_target() {
 // CHECK:           %[[UPPER:.*]] = llvm.mlir.constant(511 : index) : i64
 // CHECK:           %[[BOUNDS:.*]] = omp.map.bounds   lower_bound(%[[LOWER]] : i64) upper_bound(%[[UPPER]] : i64) extent(%[[EXTENT]] : i64) stride(%[[STRIDE]] : i64) start_idx(%[[STRIDE]] : i64)
 // CHECK:           %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, !llvm.array<512 x i32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !llvm.ptr {name = "a"}
-// CHECK:           omp.target   thread_limit(%[[VAL_2]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !llvm.ptr) {
+// CHECK:           omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !llvm.ptr) thread_limit(%[[VAL_2]] : i32) {
 // CHECK:           ^bb0(%[[ARG_0]]: !llvm.ptr):
 // CHECK:             %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32
 // CHECK:             %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index c7f3160328f74..af9b928cb4b18 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -1653,7 +1653,7 @@ func.func @embox0(%arg0: !fir.ref<!fir.array<100xi32>>) {
 // CHECK:         %[[I64_ELEM_SIZE:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
 // CHECK:         %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
 // CHECK:         %[[DESC0:.*]] = llvm.insertvalue %[[I64_ELEM_SIZE]], %[[DESC]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
-// CHECK:         %[[CFI_VERSION:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:         %[[CFI_VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:         %[[DESC1:.*]] = llvm.insertvalue %[[CFI_VERSION]], %[[DESC0]][2] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
 // CHECK:         %[[RANK:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK:         %[[RANK_I8:.*]] = llvm.trunc %[[RANK]] : i32 to i8
@@ -1785,11 +1785,10 @@ func.func @embox1(%arg0: !fir.ref<!fir.type<_QMtest_dinitTtseq{i:i32}>>) {
 // CHECK-NO-COMDAT: llvm.mlir.global linkonce constant @_QMtest_dinitE.dt.tseq() {addr_space = 0 : i32} : i8
 // CHECK-LABEL: llvm.func @embox1
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(42 : i32) : i32
+// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
+// CHECK:         %{{.*}} = llvm.insertvalue %[[VERSION]], %{{.*}}[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> 
 // CHECK:         %[[TYPE_CODE_I8:.*]] = llvm.trunc %[[TYPE_CODE]] : i32 to i8
 // CHECK:         %{{.*}} = llvm.insertvalue %[[TYPE_CODE_I8]], %{{.*}}[4] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
-// CHECK:         %[[F18ADDENDUM:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK:         %[[F18ADDENDUM_I8:.*]] = llvm.trunc %[[F18ADDENDUM]] : i32 to i8
-// CHECK:         %{{.*}} = llvm.insertvalue %[[F18ADDENDUM_I8]], %{{.*}}[6] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
 // CHECK:         %[[TDESC:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr
 // CHECK:         %{{.*}} = llvm.insertvalue %[[TDESC]], %{{.*}}[7] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
 
@@ -1879,7 +1878,7 @@ func.func @xembox0(%arg0: !fir.ref<!fir.array<?xi32>>) {
 // CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
-// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:         %[[BOX2:.*]] = llvm.insertvalue %[[VERSION]], %[[BOX1]][2] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[RANK:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:         %[[RANK_I8:.*]] = llvm.trunc %[[RANK]] : i32 to i8
@@ -1981,7 +1980,7 @@ func.func private @_QPxb(!fir.box<!fir.array<?x?xf64>>)
 // CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)>
-// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:         %[[BOX2:.*]] = llvm.insertvalue %[[VERSION]], %[[BOX1]][2] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)>
 // CHECK:         %[[RANK:.*]] = llvm.mlir.constant(2 : i32) : i32
 // CHECK:         %[[RANK_I8:.*]] = llvm.trunc %[[RANK]] : i32 to i8
@@ -2065,7 +2064,7 @@ func.func private @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>)
 // CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
-// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:         %[[BOX2:.*]] = llvm.insertvalue %[[VERSION]], %[[BOX1]][2] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[RANK:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:         %[[RANK_I8:.*]] = llvm.trunc %[[RANK]] : i32 to i8
@@ -2368,7 +2367,7 @@ func.func @test_rebox_1(%arg0: !fir.box<!fir.array<?x?xf32>>) {
 //CHECK:    %[[ELEM_SIZE_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
 //CHECK:    %[[RBOX:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
 //CHECK:    %[[RBOX_TMP1:.*]] = llvm.insertvalue %[[ELEM_SIZE_I64]], %[[RBOX]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-//CHECK:    %[[CFI_VERSION:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+//CHECK:    %[[CFI_VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 //CHECK:    %[[RBOX_TMP2:.*]] = llvm.insertvalue %[[CFI_VERSION]], %[[RBOX_TMP1]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
 //CHECK:    %[[RANK:.*]] = llvm.mlir.constant(1 : i32) : i32
 //CHECK:    %[[RANK_I8:.*]] = llvm.trunc %[[RANK]] : i32 to i8
diff --git a/flang/test/Fir/declare-codegen.fir b/flang/test/Fir/declare-codegen.fir
index c5879facb1572..fe8d84ef2d19f 100644
--- a/flang/test/Fir/declare-codegen.fir
+++ b/flang/test/Fir/declare-codegen.fir
@@ -38,3 +38,17 @@ func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref<!fir.arra
 
 // DECL-LABEL: func.func @useless_shape_with_duplicate_extent_operand(
 // DECL: fircg.ext_declare
+
+// Test DCE does not crash because of unreachable code.
+func.func @unreachable_code(%arg0: !fir.ref<!fir.char<1,10>>) {
+  %c10 = arith.constant 10 : index
+  %2 = fir.declare %arg0 typeparams %c10 {uniq_name = "live_code"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>)
+  return
+^bb2:  // no predecessors
+  %3 = fir.declare %arg0 typeparams %c10 {uniq_name = "dead_code"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>)
+  fir.unreachable
+}
+// NODECL-LABEL:   func.func @unreachable_code(
+// NODECL-NOT:       uniq_name = "live_code"
+// DECL-LABEL:    func.func @unreachable_code(
+// DECL:             uniq_name = "live_code"
diff --git a/flang/test/Fir/embox-char.fir b/flang/test/Fir/embox-char.fir
index 30015a5f7ae38..bf8344dbb60fc 100644
--- a/flang/test/Fir/embox-char.fir
+++ b/flang/test/Fir/embox-char.fir
@@ -46,7 +46,7 @@
 // CHECK:           %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31_LEN]]  : i64
 // CHECK:           %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_39:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:           %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:           %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32
 // CHECK:           %[[VAL_42:.*]] = llvm.trunc %[[VAL_41]] : i32 to i8
@@ -142,7 +142,7 @@ func.func @test_char4(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.cha
 // CHECK:           %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_16_BYTESIZE]]  : i64
 // CHECK:           %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_39:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:           %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:           %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32
 // CHECK:           %[[VAL_42:.*]] = llvm.trunc %[[VAL_41]] : i32 to i8
diff --git a/flang/test/Fir/embox.fir b/flang/test/Fir/embox.fir
index 65a536d10c472..cd13c5cfaaa9a 100644
--- a/flang/test/Fir/embox.fir
+++ b/flang/test/Fir/embox.fir
@@ -13,7 +13,7 @@ func.func @_QPtest_slice() {
 // CHECK:  %[[a2:.*]] = alloca [20 x i32], i64 1, align 4
 // CHECK:  %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i64 0, i64 0
 // CHECK:  %[[a4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-// CHECK:  { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
+// CHECK:  { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK: [i64 1, i64 5, i64 mul (i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i64 2)]] }, ptr %[[a3]], 0
 // CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[a4]], ptr %[[a1]], align 8
 // CHECK:  call void @_QPtest_callee(ptr %[[a1]])
@@ -40,7 +40,7 @@ func.func @_QPtest_dt_slice() {
 // CHECK:  %[[a3:.*]] = alloca [20 x %_QFtest_dt_sliceTt], i64 1, align 8
 // CHECK:  %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i64 0, i64 0, i32 0
 // CHECK: %[[a5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
+// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK-SAME: [i64 1, i64 5, i64 mul
 // CHECK-SAME: (i64 ptrtoint (ptr getelementptr (%_QFtest_dt_sliceTt, ptr null, i32 1) to i64), i64 2)]] }
 // CHECK-SAME: , ptr %[[a4]], 0
@@ -75,7 +75,7 @@ func.func @emboxSubstring(%arg0: !fir.ref<!fir.array<2x3x!fir.char<1,4>>>) {
   %1 = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 substr %c1_i64, %c2_i64 : (index, index, index, index, index, index, i64, i64) -> !fir.slice<2>
   %2 = fir.embox %arg0(%0) [%1] : (!fir.ref<!fir.array<2x3x!fir.char<1,4>>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
   // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i64 0, i64 0, i64 0, i64 1
-  // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 mul (i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), i64 2), i32 20180515, i8 2, i8 40, i8 0, i8 0
+  // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 mul (i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), i64 2), i32 20240719, i8 2, i8 40, i8 0, i8 0
   // CHECK-SAME: [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 2, i64 4], [3 x i64] [i64 1, i64 3, i64 8]] }
   // CHECK-SAME: ptr %[[addr]], 0
 
@@ -98,8 +98,32 @@ func.func @fir_dev_issue_1416(%arg0: !fir.ref<!fir.array<40x?xf32>>, %low: index
 // CHECK: %[[offset:.*]] = add i64 %[[mul]], 0
 // CHECK: %[[addr:.*]] = getelementptr [40 x float], ptr %0, i64 %[[offset]], i64 0
 // CHECK: %[[box:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 40, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)]] }, ptr %[[addr]], 0
+// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 40, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)]] }, ptr %[[addr]], 0
     %3 = fir.embox %arg0(%1) [%2] : (!fir.ref<!fir.array<40x?xf32>>, !fir.shapeshift<2>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
     fir.call @do_something(%3) : (!fir.box<!fir.array<?xf32>>) -> ()
     return
 }
+
+// CHECK-LABEL: define void @_QPtest_allocator1()
+func.func @_QPtest_allocator1() {
+  %c20 = arith.constant 20 : index
+  %0 = fir.alloca !fir.array<20xi32> {bindc_name = "x", uniq_name = "_QFtest_sliceEx"}
+  %1 = fir.shape %c20 : (index) -> !fir.shape<1>
+  %3 = fir.embox %0(%1) {allocator_idx = 1 : i32} : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.call @_QPtest_callee(%3) : (!fir.box<!fir.array<?xi32>>) -> ()
+  return
+}
+
+// %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 2, [1 x [3 x i64]] [[3 x i64] [i64 1, i64 20, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }
+
+// CHECK-LABEL: define void @_QPtest_allocator2()
+func.func @_QPtest_allocator2() {
+  %c20 = arith.constant 20 : index
+  %0 = fir.alloca !fir.array<20xi32> {bindc_name = "x", uniq_name = "_QFtest_sliceEx"}
+  %1 = fir.shape %c20 : (index) -> !fir.shape<1>
+  %3 = fir.embox %0(%1) {allocator_idx = 3 : i32} : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.call @_QPtest_callee(%3) : (!fir.box<!fir.array<?xi32>>) -> ()
+  return
+}
+
+// CHECK: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 6
diff --git a/flang/test/Fir/ignore-missing-type-descriptor.fir b/flang/test/Fir/ignore-missing-type-descriptor.fir
index e83c8d684f891..228d9a1c7af66 100644
--- a/flang/test/Fir/ignore-missing-type-descriptor.fir
+++ b/flang/test/Fir/ignore-missing-type-descriptor.fir
@@ -18,5 +18,5 @@ func.func @test_embox(%addr: !fir.ref<!some_freestyle_type>) {
 // CHECK-SAME: ptr %[[ADDR:.*]])
 // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
 // CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (%some_not_mangled_type, ptr null, i32 1) to i64),
-// CHECK-SAME: i32 20180515, i8 0, i8 42, i8 0, i8 1, ptr null, [1 x i64] zeroinitializer },
+// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 0, i8 1, ptr null, [1 x i64] zeroinitializer },
 // CHECK-SAME: ptr %[[ADDR]], 0
diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir
index fc891c32acc95..7493c7012e920 100644
--- a/flang/test/Fir/polymorphic.fir
+++ b/flang/test/Fir/polymorphic.fir
@@ -13,7 +13,7 @@ func.func @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived()
 // CHECK-LABEL: define void @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived(){{.*}}{
 // CHECK:   %[[MEM:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
 // CHECK:   %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
-// CHECK:   store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 0, i32 20180515, i8 0, i8 -1, i8 1, i8 1, ptr null, [1 x i64] zeroinitializer }, ptr %[[MEM]]
+// CHECK:   store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 0, i32 20240719, i8 0, i8 -1, i8 1, i8 1, ptr null, [1 x i64] zeroinitializer }, ptr %[[MEM]]
 // CHECK:   %[[LOADED:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[MEM]], align 8
 // CHECK:   store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED]], ptr %[[DESC]]
 // CHECK:   ret void
@@ -64,7 +64,7 @@ func.func @_QMpolymorphic_testPtest_embox() {
 
 // CHECK-LABEL: @_QMpolymorphic_testPtest_embox()
 // CHECK: %[[ALLOCA_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }
-// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]]
+// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]]
 // CHECK: %[[LOADED_DESC:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_DESC]], align 8
 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[LOADED_DESC]], ptr @_QFEx, align 8
 
@@ -155,7 +155,7 @@ func.func @_QQmain() {
 // CHECK-LABEL: define void @_QQmain(){{.*}}{
 // CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
 // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
-// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1E.dt.t.2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8
+// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1E.dt.t.2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8
 // CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS_NONE]]
 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[DESC]]
 // CHECK: call void @_QMmod1Psub1(ptr %[[DESC]])
@@ -195,4 +195,4 @@ func.func @_QQembox_input_type(%arg0 : !fir.ref<!fir.type<_QMmod1Tp2{v:!fir.arra
 }
 
 // CHECK-LABEL: define void @_QQembox_input_type
-// CHECK: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr undef, i64 ptrtoint (ptr getelementptr (%_QMmod1Tp2, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 42, i8 0, i8 1, ptr @_QMmod1E.dt.p2, [1 x i64] zeroinitializer }, ptr %{{.*}}, 0
+// CHECK: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr undef, i64 ptrtoint (ptr getelementptr (%_QMmod1Tp2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 0, i8 1, ptr @_QMmod1E.dt.p2, [1 x i64] zeroinitializer }, ptr %{{.*}}, 0
diff --git a/flang/test/Fir/rebox-global.fir b/flang/test/Fir/rebox-global.fir
index cf39e71f30da3..509487ab61d53 100644
--- a/flang/test/Fir/rebox-global.fir
+++ b/flang/test/Fir/rebox-global.fir
@@ -20,4 +20,4 @@ fir.global @pointer_char4_init : !fir.box<!fir.ptr<!fir.char<4,10>>> {
   fir.has_value %2 : !fir.box<!fir.ptr<!fir.char<4,10>>>
 }
 // CHECK-LABEL: @pointer_char4_init
-// CHECK-SAME: { ptr @char4, i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64), i32 20180515, i8 0, i8 44, i8 1, i8 0 }
+// CHECK-SAME: { ptr @char4, i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64), i32 20240719, i8 0, i8 44, i8 1, i8 0 }
diff --git a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir
index d2d886cf2e218..d0393eadef58b 100644
--- a/flang/test/Fir/rebox.fir
+++ b/flang/test/Fir/rebox.fir
@@ -121,7 +121,7 @@ func.func @test_rebox_4(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>>) {
   // CHECK: %[[STRIDE:.*]] = load i64, ptr %[[STRIDE_GEP]]
   // CHECK: %[[BASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INPUT]], i32 0, i32 0
   // CHECK: %[[BASE:.*]] = load ptr, ptr %[[BASE_GEP]]
-  // CHECK: %[[NEWBOX1:.*]] = insertvalue {{{.*}}} { ptr undef, i64 ptrtoint (ptr getelementptr ([10 x i8], ptr null, i32 1) to i64), i32 20180515, i8 1, i8 40, i8 1, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 undef, i64 undef]] }, i64 %[[EXTENT]], 7, 0, 1
+  // CHECK: %[[NEWBOX1:.*]] = insertvalue {{{.*}}} { ptr undef, i64 ptrtoint (ptr getelementptr ([10 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 1, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 undef, i64 undef]] }, i64 %[[EXTENT]], 7, 0, 1
   // CHECK: %[[NEWBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX1]], i64 %[[STRIDE]], 7, 0, 2
   // CHECK: %[[NEWBOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX2]], ptr %[[BASE]], 0
   // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX3]], ptr %[[NEWBOX_STORAGE]]
diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir
index 89679afc386c6..32e264519388b 100644
--- a/flang/test/Fir/tbaa.fir
+++ b/flang/test/Fir/tbaa.fir
@@ -54,7 +54,7 @@ module {
 // CHECK:           %[[VAL_23:.*]] = llvm.load %[[VAL_22]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32
 // CHECK:           %[[VAL_24:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_25:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_24]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)>
-// CHECK:           %[[VAL_26:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:           %[[VAL_26:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:           %[[VAL_27:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_25]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_28:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK:           %[[VAL_29:.*]] = llvm.trunc %[[VAL_28]] : i32 to i8
@@ -154,7 +154,7 @@ module {
 // CHECK:           %[[VAL_27:.*]] = llvm.load %[[VAL_26]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> !llvm.ptr
 // CHECK:           %[[VAL_28:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_29:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_28]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
-// CHECK:           %[[VAL_30:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:           %[[VAL_30:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:           %[[VAL_31:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_29]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_32:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:           %[[VAL_33:.*]] = llvm.trunc %[[VAL_32]] : i32 to i8
@@ -206,7 +206,7 @@ module {
 // CHECK:           %[[VAL_3:.*]] = llvm.mlir.constant(-1 : i32) : i32
 // CHECK:           %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_4]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
-// CHECK:           %[[VAL_6:.*]] = llvm.mlir.constant(20180515 : i32) : i32
+// CHECK:           %[[VAL_6:.*]] = llvm.mlir.constant(20240719 : i32) : i32
 // CHECK:           %[[VAL_7:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_5]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
 // CHECK:           %[[VAL_8:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:           %[[VAL_9:.*]] = llvm.trunc %[[VAL_8]] : i32 to i8
diff --git a/flang/test/Fir/type-descriptor.fir b/flang/test/Fir/type-descriptor.fir
index 6551efe1a5feb..f0ebd8ddeee1e 100644
--- a/flang/test/Fir/type-descriptor.fir
+++ b/flang/test/Fir/type-descriptor.fir
@@ -14,7 +14,7 @@ fir.global internal @_QFfooEx : !fir.box<!fir.heap<!sometype>> {
 }
 // CHECK: @_QFfooEx = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
 // CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsometype, ptr null, i32 1) to i64),
-// CHECK-SAME: i32 20180515, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.sometype, [1 x i64] zeroinitializer }
+// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.sometype, [1 x i64] zeroinitializer }
 
 !some_pdt_type = !fir.type<_QFfooTsome_pdt_typeK42K43{num:i32,values:!fir.box<!fir.ptr<!fir.array<?x?xf32>>>}>
 fir.global internal @_QFfooE.dt.some_pdt_type.42.43 constant : i8
@@ -26,4 +26,4 @@ fir.global internal @_QFfooEx2 : !fir.box<!fir.heap<!some_pdt_type>> {
 }
 // CHECK: @_QFfooEx2 = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
 // CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsome_pdt_typeK42K43, ptr null, i32 1) to i64),
-// CHECK-SAME: i32 20180515, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.some_pdt_type.42.43, [1 x i64] zeroinitializer }
+// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.some_pdt_type.42.43, [1 x i64] zeroinitializer }
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index d657f819dfbf1..7eb74a4234f5a 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -133,7 +133,7 @@ subroutine sub4()
   integer, parameter :: n = 10
   real, device :: adev(n)
   real :: ahost(n)
-  real :: b
+  real, managed :: b
   integer :: i
 
   adev = ahost
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 99cb6eb289e0b..ba5d390df4785 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -7,8 +7,8 @@ subroutine sub1()
   integer :: i, j
   integer, parameter :: n = 100
   integer(8) :: istream
-  real :: a(n), b(n)
-  real :: c(n,n), d(n,n)
+  real, device :: a(n), b(n)
+  real, device :: c(n,n), d(n,n)
 
 ! CHECK-LABEL: func.func @_QPsub1()
 ! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90 b/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90
new file mode 100644
index 0000000000000..3c60a84692bdb
--- /dev/null
+++ b/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90
@@ -0,0 +1,26 @@
+! Test passing polymorphic variable for non-polymorphic dummy arguments:
+! RUN: bbc -emit-hlfir -o - -I nowhere %s | FileCheck %s
+
+subroutine test_sequence_association(x)
+  type t
+    integer :: i
+  end type
+  interface
+    subroutine sequence_assoc(x, n)
+      import :: t
+      type(t) :: x(n)
+    end subroutine
+  end interface
+  class(t) :: x(:, :)
+  call sequence_assoc(x, 100)
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_sequence_association(
+! CHECK-SAME:                                            %[[VAL_0:.*]]: !fir.class<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]]
+! CHECK:           %[[REBOX:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.class<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.copy_in %[[REBOX]] to %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>>) -> (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>, i1)
+! CHECK:           %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]]#0 : (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.ref<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.ref<!fir.array<?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
+! CHECK:           fir.call @_QPsequence_assoc(%[[VAL_7]], %{{.*}})
+! CHECK:           hlfir.copy_out %[[VAL_1]], %[[VAL_5]]#1 to %[[REBOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>>, i1, !fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> ()
diff --git a/flang/test/Lower/Intrinsics/ieee_next.f90 b/flang/test/Lower/Intrinsics/ieee_next.f90
new file mode 100644
index 0000000000000..fa9692b83bc87
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/ieee_next.f90
@@ -0,0 +1,284 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+! CHECK-LABEL: c.func @_QQmain
+program p
+  use ieee_arithmetic, only: ieee_value, ieee_negative_inf, ieee_positive_inf
+  use ieee_arithmetic, only: ieee_next_after, ieee_next_down, ieee_next_up
+  implicit none
+  ! CHECK-DAG: %[[V_4:[0-9]+]] = fir.alloca f80 {bindc_name = "r10", uniq_name = "_QFEr10"}
+  ! CHECK-DAG: %[[V_5:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = "_QFEr10"} : (!fir.ref<f80>) -> !fir.ref<f80>
+  ! CHECK-DAG: %[[V_6:[0-9]+]] = fir.alloca f128 {bindc_name = "r16", uniq_name = "_QFEr16"}
+  ! CHECK-DAG: %[[V_7:[0-9]+]] = fir.declare %[[V_6]] {uniq_name = "_QFEr16"} : (!fir.ref<f128>) -> !fir.ref<f128>
+  ! CHECK-DAG: %[[V_8:[0-9]+]] = fir.alloca f16 {bindc_name = "r2", uniq_name = "_QFEr2"}
+  ! CHECK-DAG: %[[V_9:[0-9]+]] = fir.declare %[[V_8]] {uniq_name = "_QFEr2"} : (!fir.ref<f16>) -> !fir.ref<f16>
+  ! CHECK-DAG: %[[V_10:[0-9]+]] = fir.alloca bf16 {bindc_name = "r3", uniq_name = "_QFEr3"}
+  ! CHECK-DAG: %[[V_11:[0-9]+]] = fir.declare %[[V_10]] {uniq_name = "_QFEr3"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+  ! CHECK-DAG: %[[V_12:[0-9]+]] = fir.alloca f32 {bindc_name = "r4", uniq_name = "_QFEr4"}
+  ! CHECK-DAG: %[[V_13:[0-9]+]] = fir.declare %[[V_12]] {uniq_name = "_QFEr4"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  ! CHECK-DAG: %[[V_14:[0-9]+]] = fir.alloca f64 {bindc_name = "r8", uniq_name = "_QFEr8"}
+  ! CHECK-DAG: %[[V_15:[0-9]+]] = fir.declare %[[V_14]] {uniq_name = "_QFEr8"} : (!fir.ref<f64>) -> !fir.ref<f64>
+  ! CHECK-DAG: %[[V_16:[0-9]+]] = fir.address_of(@_QFEx10) : !fir.ref<f80>
+  ! CHECK-DAG: %[[V_17:[0-9]+]] = fir.declare %[[V_16]] {uniq_name = "_QFEx10"} : (!fir.ref<f80>) -> !fir.ref<f80>
+  ! CHECK-DAG: %[[V_18:[0-9]+]] = fir.alloca f128 {bindc_name = "x16", uniq_name = "_QFEx16"}
+  ! CHECK-DAG: %[[V_19:[0-9]+]] = fir.declare %[[V_18]] {uniq_name = "_QFEx16"} : (!fir.ref<f128>) -> !fir.ref<f128>
+  ! CHECK-DAG: %[[V_20:[0-9]+]] = fir.alloca f16 {bindc_name = "x2", uniq_name = "_QFEx2"}
+  ! CHECK-DAG: %[[V_21:[0-9]+]] = fir.declare %[[V_20]] {uniq_name = "_QFEx2"} : (!fir.ref<f16>) -> !fir.ref<f16>
+  ! CHECK-DAG: %[[V_22:[0-9]+]] = fir.address_of(@_QFEx3) : !fir.ref<bf16>
+  ! CHECK-DAG: %[[V_23:[0-9]+]] = fir.declare %[[V_22]] {uniq_name = "_QFEx3"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+  ! CHECK-DAG: %[[V_24:[0-9]+]] = fir.address_of(@_QFEx4) : !fir.ref<f32>
+  ! CHECK-DAG: %[[V_25:[0-9]+]] = fir.declare %[[V_24]] {uniq_name = "_QFEx4"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  ! CHECK-DAG: %[[V_26:[0-9]+]] = fir.address_of(@_QFEx8) : !fir.ref<f64>
+  ! CHECK-DAG: %[[V_27:[0-9]+]] = fir.declare %[[V_26]] {uniq_name = "_QFEx8"} : (!fir.ref<f64>) -> !fir.ref<f64>
+  real(2)  ::  r2,  x2
+  real(3)  ::  r3,  x3 = -huge(x3)
+  real(4)  ::  r4,  x4 = -0.
+  real(8)  ::  r8,  x8 =  0.
+  real(10) :: r10, x10 =  huge(x10)
+  real(16) :: r16, x16
+
+  x2  = ieee_value(x2, ieee_negative_inf)
+  x16 = ieee_value(x2, ieee_positive_inf)
+
+  ! CHECK:     %[[V_45:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16>
+  ! CHECK:     %[[V_46:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80>
+  ! CHECK-DAG: %[[V_47:[0-9]+]] = fir.coordinate_of %{{.*}}, %c2{{.*}} : (!fir.ref<!fir.array<12xi16>>, i8) -> !fir.ref<i16>
+  ! CHECK-DAG: %[[V_48:[0-9]+]] = fir.load %[[V_47]] : !fir.ref<i16>
+  ! CHECK-DAG: %[[V_49:[0-9]+]] = arith.bitcast %[[V_48]] : i16 to f16
+  ! CHECK-DAG: %[[V_50:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_46]]) <{bit = 3 : i32}> : (f80) -> i1
+  ! CHECK:     %[[V_51:[0-9]+]] = arith.select %[[V_50]], %[[V_49]], %[[V_45]] : f16
+  ! CHECK:     %[[V_52:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 3 : i32}> : (f16) -> i1
+  ! CHECK:     %[[V_53:[0-9]+]] = fir.convert %[[V_51]] : (f16) -> f80
+  ! CHECK:     %[[V_54:[0-9]+]] = arith.cmpf oeq, %[[V_53]], %[[V_46]] fastmath<contract> : f80
+  ! CHECK:     %[[V_55:[0-9]+]] = arith.ori %[[V_52]], %[[V_54]] : i1
+  ! CHECK:     %[[V_56:[0-9]+]] = arith.cmpf olt, %[[V_53]], %[[V_46]] fastmath<contract> : f80
+  ! CHECK:     %[[V_57:[0-9]+]] = arith.bitcast %[[V_45]] : f16 to i16
+  ! CHECK:     %[[V_58:[0-9]+]] = arith.shrui %[[V_57]], %c15{{.*}} : i16
+  ! CHECK:     %[[V_59:[0-9]+]] = fir.convert %[[V_58]] : (i16) -> i1
+  ! CHECK:     %[[V_60:[0-9]+]] = arith.cmpi ne, %[[V_56]], %[[V_59]] : i1
+  ! CHECK:     %[[V_61:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 516 : i32}> : (f16) -> i1
+  ! CHECK:     %[[V_62:[0-9]+]] = arith.andi %[[V_61]], %[[V_60]] : i1
+  ! CHECK:     %[[V_63:[0-9]+]] = arith.ori %[[V_55]], %[[V_62]] : i1
+  ! CHECK:     %[[V_64:[0-9]+]] = fir.if %[[V_63]] -> (f16) {
+  ! CHECK:       fir.result %[[V_51]] : f16
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_51]], %cst{{[_0-9]*}} fastmath<contract> : f16
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f16) {
+  ! CHECK:         %[[V_204:[0-9]+]] = arith.select %[[V_56]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f16
+  ! CHECK:         %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_204]] : f16
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_204:[0-9]+]] = arith.bitcast %[[V_51]] : f16 to i16
+  ! CHECK-DAG:     %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16
+  ! CHECK-DAG:     %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16
+  ! CHECK:         %[[V_207:[0-9]+]] = arith.select %[[V_60]], %[[V_206]], %[[V_205]] : i16
+  ! CHECK:         %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to f16
+  ! CHECK:         %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f16) -> i1
+  ! CHECK:         fir.if %[[V_209]] {
+  ! CHECK:           %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                               fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f16) -> i1
+  ! CHECK:         fir.if %[[V_210]] {
+  ! CHECK:           %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                               fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_208]] : f16
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : f16
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_64]] to %[[V_9]] : !fir.ref<f16>
+  r2 = ieee_next_after(x2, x10)
+  print "('after:  ', z4.4, ' -> ', z4.4, ' = ', g0)", x2, r2, r2
+
+  ! CHECK:     %[[V_81:[0-9]+]] = fir.load %[[V_23]] : !fir.ref<bf16>
+  ! CHECK:     %[[V_82:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 3 : i32}> : (bf16) -> i1
+  ! CHECK:     %[[V_83:[0-9]+]] = fir.convert %[[V_81]] : (bf16) -> f32
+  ! CHECK:     %[[V_84:[0-9]+]] = arith.bitcast %[[V_83]] : f32 to i32
+  ! CHECK:     %[[V_85:[0-9]+]] = arith.shrui %[[V_84]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_86:[0-9]+]] = fir.convert %[[V_85]] : (i32) -> i1
+  ! CHECK:     %[[V_87:[0-9]+]] = arith.cmpi ne, %[[V_86]], %true{{[_0-9]*}} : i1
+  ! CHECK:     %[[V_88:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 516 : i32}> : (bf16) -> i1
+  ! CHECK:     %[[V_89:[0-9]+]] = arith.andi %[[V_88]], %[[V_87]] : i1
+  ! CHECK:     %[[V_90:[0-9]+]] = arith.ori %[[V_82]], %[[V_89]] : i1
+  ! CHECK:     %[[V_91:[0-9]+]] = fir.if %[[V_90]] -> (bf16) {
+  ! CHECK:       %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 1 : i32}> : (bf16) -> i1
+  ! CHECK:       fir.if %[[V_202]] {
+  ! CHECK:         %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_81]] : bf16
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_81]], %cst{{[_0-9]*}} fastmath<contract> : bf16
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (bf16) {
+  ! CHECK:         fir.result %cst{{[_0-9]*}} : bf16
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_204:[0-9]+]] = arith.bitcast %[[V_81]] : bf16 to i16
+  ! CHECK-DAG:     %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16
+  ! CHECK-DAG:     %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16
+  ! CHECK:         %[[V_207:[0-9]+]] = arith.select %[[V_87]], %[[V_206]], %[[V_205]] : i16
+  ! CHECK:         %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to bf16
+  ! CHECK:         fir.result %[[V_208]] : bf16
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : bf16
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_91]] to %[[V_11]] : !fir.ref<bf16>
+  r3 = ieee_next_up(x3)
+  print "('up:     ', z4.4, ' -> ', z4.4, ' = ', g0)", x3, r3, r3
+
+  ! CHECK:     %[[V_104:[0-9]+]] = fir.load %[[V_25]] : !fir.ref<f32>
+  ! CHECK:     %[[V_105:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 3 : i32}> : (f32) -> i1
+  ! CHECK:     %[[V_106:[0-9]+]] = arith.bitcast %[[V_104]] : f32 to i32
+  ! CHECK:     %[[V_107:[0-9]+]] = arith.shrui %[[V_106]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_108:[0-9]+]] = fir.convert %[[V_107]] : (i32) -> i1
+  ! CHECK:     %[[V_109:[0-9]+]] = arith.cmpi ne, %[[V_108]], %false{{[_0-9]*}} : i1
+  ! CHECK:     %[[V_110:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 516 : i32}> : (f32) -> i1
+  ! CHECK:     %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_109]] : i1
+  ! CHECK:     %[[V_112:[0-9]+]] = arith.ori %[[V_105]], %[[V_111]] : i1
+  ! CHECK:     %[[V_113:[0-9]+]] = fir.if %[[V_112]] -> (f32) {
+  ! CHECK:       %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 1 : i32}> : (f32) -> i1
+  ! CHECK:       fir.if %[[V_202]] {
+  ! CHECK:         %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_104]] : f32
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_104]], %cst{{[_0-9]*}} fastmath<contract> : f32
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f32) {
+  ! CHECK:         fir.result %cst{{[_0-9]*}} : f32
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_204:[0-9]+]] = arith.subi %[[V_106]], %c1{{.*}} : i32
+  ! CHECK-DAG:     %[[V_205:[0-9]+]] = arith.addi %[[V_106]], %c1{{.*}} : i32
+  ! CHECK:         %[[V_206:[0-9]+]] = arith.select %[[V_109]], %[[V_205]], %[[V_204]] : i32
+  ! CHECK:         %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i32 to f32
+  ! CHECK:         fir.result %[[V_207]] : f32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : f32
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_113]] to %[[V_13]] : !fir.ref<f32>
+  r4 = ieee_next_down(x4)
+  print "('down:   ', z8.8, ' -> ', z8.8, ' = ', g0)", x4, r4, r4
+
+  ! CHECK:     %[[V_125:[0-9]+]] = fir.load %[[V_27]] : !fir.ref<f64>
+  ! CHECK:     %[[V_126:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16>
+  ! CHECK-DAG: %[[V_127:[0-9]+]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
+  ! CHECK-DAG: %[[V_128:[0-9]+]] = fir.coordinate_of %[[V_127]], %c2{{.*}} : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
+  ! CHECK-DAG: %[[V_129:[0-9]+]] = fir.load %[[V_128]] : !fir.ref<i64>
+  ! CHECK-DAG: %[[V_130:[0-9]+]] = arith.bitcast %[[V_129]] : i64 to f64
+  ! CHECK-DAG: %[[V_131:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_126]]) <{bit = 3 : i32}> : (f16) -> i1
+  ! CHECK:     %[[V_132:[0-9]+]] = arith.select %[[V_131]], %[[V_130]], %[[V_125]] : f64
+  ! CHECK:     %[[V_133:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 3 : i32}> : (f64) -> i1
+  ! CHECK:     %[[V_134:[0-9]+]] = fir.convert %[[V_126]] : (f16) -> f64
+  ! CHECK:     %[[V_135:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %[[V_134]] fastmath<contract> : f64
+  ! CHECK:     %[[V_136:[0-9]+]] = arith.ori %[[V_133]], %[[V_135]] : i1
+  ! CHECK:     %[[V_137:[0-9]+]] = arith.cmpf olt, %[[V_132]], %[[V_134]] fastmath<contract> : f64
+  ! CHECK:     %[[V_138:[0-9]+]] = arith.bitcast %[[V_125]] : f64 to i64
+  ! CHECK:     %[[V_139:[0-9]+]] = arith.shrui %[[V_138]], %c63{{.*}} : i64
+  ! CHECK:     %[[V_140:[0-9]+]] = fir.convert %[[V_139]] : (i64) -> i1
+  ! CHECK:     %[[V_141:[0-9]+]] = arith.cmpi ne, %[[V_137]], %[[V_140]] : i1
+  ! CHECK:     %[[V_142:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 516 : i32}> : (f64) -> i1
+  ! CHECK:     %[[V_143:[0-9]+]] = arith.andi %[[V_142]], %[[V_141]] : i1
+  ! CHECK:     %[[V_144:[0-9]+]] = arith.ori %[[V_136]], %[[V_143]] : i1
+  ! CHECK:     %[[V_145:[0-9]+]] = fir.if %[[V_144]] -> (f64) {
+  ! CHECK:       fir.result %[[V_132]] : f64
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %cst{{[_0-9]*}} fastmath<contract> : f64
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f64) {
+  ! CHECK:         %[[V_204:[0-9]+]] = arith.select %[[V_137]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f64
+  ! CHECK:         %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_204]] : f64
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_204:[0-9]+]] = arith.bitcast %[[V_132]] : f64 to i64
+  ! CHECK-DAG:     %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i64
+  ! CHECK-DAG:     %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i64
+  ! CHECK:         %[[V_207:[0-9]+]] = arith.select %[[V_141]], %[[V_206]], %[[V_205]] : i64
+  ! CHECK:         %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i64 to f64
+  ! CHECK:         %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f64) -> i1
+  ! CHECK:         fir.if %[[V_209]] {
+  ! CHECK:           %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                               fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f64) -> i1
+  ! CHECK:         fir.if %[[V_210]] {
+  ! CHECK:           %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                               fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_208]] : f64
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : f64
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_145]] to %[[V_15]] : !fir.ref<f64>
+  r8 = ieee_next_after(x8, x2)
+  print "('after:  ', z16.16, ' -> ', z16.16, ' = ', g0)", x8, r8, r8
+
+  ! CHECK:     %[[V_158:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80>
+  ! CHECK:     %[[V_159:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 3 : i32}> : (f80) -> i1
+  ! CHECK:     %[[V_160:[0-9]+]] = arith.bitcast %[[V_158]] : f80 to i80
+  ! CHECK:     %[[V_161:[0-9]+]] = arith.shrui %[[V_160]], %c79{{.*}} : i80
+  ! CHECK:     %[[V_162:[0-9]+]] = fir.convert %[[V_161]] : (i80) -> i1
+  ! CHECK:     %[[V_163:[0-9]+]] = arith.cmpi ne, %[[V_162]], %true{{[_0-9]*}} : i1
+  ! CHECK:     %[[V_164:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 516 : i32}> : (f80) -> i1
+  ! CHECK:     %[[V_165:[0-9]+]] = arith.andi %[[V_164]], %[[V_163]] : i1
+  ! CHECK:     %[[V_166:[0-9]+]] = arith.ori %[[V_159]], %[[V_165]] : i1
+  ! CHECK:     %[[V_167:[0-9]+]] = fir.if %[[V_166]] -> (f80) {
+  ! CHECK:       %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 1 : i32}> : (f80) -> i1
+  ! CHECK:       fir.if %[[V_202]] {
+  ! CHECK:         %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_158]] : f80
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_158]], %cst{{[_0-9]*}} fastmath<contract> : f80
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f80) {
+  ! CHECK:         fir.result %cst{{[_0-9]*}} : f80
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_204:[0-9]+]] = fir.call @_FortranAMapException(%c63{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:         %[[V_205:[0-9]+]] = fir.call @fetestexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         %[[V_206:[0-9]+]] = fir.call @fedisableexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         %[[V_207:[0-9]+]] = fir.call @_FortranANearest10(%[[V_158]], %true{{[_0-9]*}}) fastmath<contract> : (f80, i1) -> f80
+  ! CHECK:         %[[V_208:[0-9]+]] = fir.call @feclearexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         %[[V_209:[0-9]+]] = fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         %[[V_210:[0-9]+]] = fir.call @feenableexcept(%[[V_206]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_207]] : f80
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : f80
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_167]] to %[[V_5]] : !fir.ref<f80>
+  r10 = ieee_next_up(x10)
+  print "('up:     ', z20.20, ' -> ', z20.20, ' = ', g0)", x10, r10, r10
+
+  ! CHECK:     %[[V_180:[0-9]+]] = fir.load %[[V_19]] : !fir.ref<f128>
+  ! CHECK:     %[[V_181:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 3 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_182:[0-9]+]] = arith.bitcast %[[V_180]] : f128 to i128
+  ! CHECK:     %[[V_183:[0-9]+]] = arith.shrui %[[V_182]], %c127{{.*}} : i128
+  ! CHECK:     %[[V_184:[0-9]+]] = fir.convert %[[V_183]] : (i128) -> i1
+  ! CHECK:     %[[V_185:[0-9]+]] = arith.cmpi ne, %[[V_184]], %false{{[_0-9]*}} : i1
+  ! CHECK:     %[[V_186:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 516 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_185]] : i1
+  ! CHECK:     %[[V_188:[0-9]+]] = arith.ori %[[V_181]], %[[V_187]] : i1
+  ! CHECK:     %[[V_189:[0-9]+]] = fir.if %[[V_188]] -> (f128) {
+  ! CHECK:       %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 1 : i32}> : (f128) -> i1
+  ! CHECK:       fir.if %[[V_202]] {
+  ! CHECK:         %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                             fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_180]] : f128
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_180]], %cst{{[_0-9]*}} fastmath<contract> : f128
+  ! CHECK:       %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f128) {
+  ! CHECK:         fir.result %cst{{[_0-9]*}} : f128
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_204:[0-9]+]] = arith.subi %[[V_182]], %c1{{.*}} : i128
+  ! CHECK-DAG:     %[[V_205:[0-9]+]] = arith.addi %[[V_182]], %c1{{.*}} : i128
+  ! CHECK:         %[[V_206:[0-9]+]] = arith.select %[[V_185]], %[[V_205]], %[[V_204]] : i128
+  ! CHECK:         %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i128 to f128
+  ! CHECK:         fir.result %[[V_207]] : f128
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_203]] : f128
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_189]] to %[[V_7]] : !fir.ref<f128>
+
+  r16 = ieee_next_down(x16)
+  print "('down:   ', z32.32, ' -> ', z32.32, ' = ', g0)", x16, r16, r16
+end
diff --git a/flang/test/Lower/Intrinsics/nearest.f90 b/flang/test/Lower/Intrinsics/nearest.f90
index a023fa8cd804e..5920d299d5fdf 100644
--- a/flang/test/Lower/Intrinsics/nearest.f90
+++ b/flang/test/Lower/Intrinsics/nearest.f90
@@ -1,72 +1,407 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
 
-! CHECK-LABEL: nearest_test1
+! CHECK-LABEL: c.func @_QPnearest_test1
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f16 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test1Eres"} : (!fir.ref<f16>) -> !fir.ref<f16>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f16>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f16>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f16) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f16) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f16 to i16
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c15{{.*}} : i16
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i16
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f16 to i16
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c15{{.*}} : i16
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i16) -> i1
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+  ! CHECK:     %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f16) -> i1
+  ! CHECK:     %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f16) {
+  ! CHECK:       fir.result %[[V_5]] : f16
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f16
+  ! CHECK:       %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f16) {
+  ! CHECK:         %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f16
+  ! CHECK:         %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_22]] : f16
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i16
+  ! CHECK-DAG:     %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i16
+  ! CHECK:         %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i16
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i16 to f16
+  ! CHECK:         %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f16) -> i1
+  ! CHECK:         fir.if %[[V_26]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f16) -> i1
+  ! CHECK:         fir.if %[[V_27]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_25]] : f16
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_21]] : f16
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f16>
+  ! CHECK:     return
+  ! CHECK:   }
 subroutine nearest_test1(x, s)
-    real :: x, s, res
-  ! CHECK: %[[res:.*]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
-  ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f32>
-  ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f32>
-  ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
-  ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f32
-  ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
-    res = nearest(x, s)
-  ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest4(%[[x]], %[[pos]]) {{.*}}: (f32, i1) -> f32
-  ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f32>
-  end subroutine nearest_test1
-  
-  ! CHECK-LABEL: nearest_test2
-  subroutine nearest_test2(x, s)
-    real(kind=8) :: x, s, res
-  ! CHECK: %[[res:.*]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
-  ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f64>
-  ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f64>
-  ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f64
-  ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f64
-  ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
-    res = nearest(x, s)
-  ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest8(%[[x]], %[[pos]]) {{.*}}: (f64, i1) -> f64
-  ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f64>
-  end subroutine nearest_test2
-  
-  ! CHECK-LABEL: nearest_test3
-  subroutine nearest_test3(x, s)
-    real(kind=10) :: x, s, res
-  ! CHECK: %[[res:.*]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
-  ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f80>
-  ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f80>
-  ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f80
-  ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f80
-  ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
-    res = nearest(x, s)
-  ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest10(%[[x]], %[[pos]]) {{.*}}: (f80, i1) -> f80
-  ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f80>
-  end subroutine nearest_test3
-  
-  ! CHECK-LABEL: nearest_test4
-  subroutine nearest_test4(x, s)
-    real(kind=16) :: x, s, res
-  ! CHECK: %[[res:.*]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
-  ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f128>
-  ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f128>
-  ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f128
-  ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f128
-  ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
-    res = nearest(x, s)
-  ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest16(%[[x]], %[[pos]]) {{.*}}: (f128, i1) -> f128
-  ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f128>
-  end subroutine nearest_test4
-  
-  ! CHECK-LABEL: nearest_test5
-  subroutine nearest_test5(x, s)
-    real(kind=16) :: x, res
-  ! CHECK: %[[res:.*]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
-  ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f128>
-    real :: s
-  ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f32>
-  ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
-  ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f32
-  ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
-    res = nearest(x, s)
-  ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest16(%[[x]], %[[pos]]) {{.*}}: (f128, i1) -> f128
-  ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f128>
-  end subroutine nearest_test5
+  real(kind=2) :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test2
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca bf16 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test2Eres"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<bf16>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<bf16>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (bf16) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (bf16) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = fir.convert %[[V_6]] : (bf16) -> f32
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.bitcast %[[V_9]] : f32 to i32
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.shrui %[[V_10]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_12:[0-9]+]] = fir.convert %[[V_11]] : (i32) -> i16
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.cmpi ne, %[[V_12]], %c1{{.*}} : i16
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_5]] : (bf16) -> f32
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.bitcast %[[V_14]] : f32 to i32
+  ! CHECK:     %[[V_16:[0-9]+]] = arith.shrui %[[V_15]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (i32) -> i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.cmpi ne, %[[V_13]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (bf16) -> i1
+  ! CHECK:     %[[V_20:[0-9]+]] = arith.andi %[[V_19]], %[[V_18]] : i1
+  ! CHECK:     %[[V_21:[0-9]+]] = arith.ori %[[V_7]], %[[V_20]] : i1
+  ! CHECK:     %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (bf16) {
+  ! CHECK:       fir.result %[[V_5]] : bf16
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_23:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : bf16
+  ! CHECK:       %[[V_24:[0-9]+]] = fir.if %[[V_23]] -> (bf16) {
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.select %[[V_13]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : bf16
+  ! CHECK:         %[[V_26:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_26]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_25]] : bf16
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.bitcast %[[V_5]] : bf16 to i16
+  ! CHECK-DAG:     %[[V_26:[0-9]+]] = arith.subi %[[V_25]], %c1{{.*}} : i16
+  ! CHECK-DAG:     %[[V_27:[0-9]+]] = arith.addi %[[V_25]], %c1{{.*}} : i16
+  ! CHECK:         %[[V_28:[0-9]+]] = arith.select %[[V_18]], %[[V_27]], %[[V_26]] : i16
+  ! CHECK:         %[[V_29:[0-9]+]] = arith.bitcast %[[V_28]] : i16 to bf16
+  ! CHECK:         %[[V_30:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_29]]) <{bit = 516 : i32}> : (bf16) -> i1
+  ! CHECK:         fir.if %[[V_30]] {
+  ! CHECK:           %[[V_32:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_32]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_31:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_29]]) <{bit = 144 : i32}> : (bf16) -> i1
+  ! CHECK:         fir.if %[[V_31]] {
+  ! CHECK:           %[[V_32:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_32]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_29]] : bf16
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_24]] : bf16
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_22]] to %[[V_2]] : !fir.ref<bf16>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test2(x, s)
+  real(kind=3) :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test3
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test3Eres"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f32>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f32) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i32
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f32 to i32
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i32) -> i1
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+  ! CHECK:     %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f32) -> i1
+  ! CHECK:     %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f32) {
+  ! CHECK:       fir.result %[[V_5]] : f32
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f32
+  ! CHECK:       %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f32) {
+  ! CHECK:         %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f32
+  ! CHECK:         %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_22]] : f32
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i32
+  ! CHECK-DAG:     %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i32
+  ! CHECK:         %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i32
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i32 to f32
+  ! CHECK:         %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f32) -> i1
+  ! CHECK:         fir.if %[[V_26]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f32) -> i1
+  ! CHECK:         fir.if %[[V_27]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_25]] : f32
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_21]] : f32
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f32>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test3(x, s)
+  real :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test4
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test4Eres"} : (!fir.ref<f64>) -> !fir.ref<f64>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f64>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f64>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f64) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f64) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f64 to i64
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c63{{.*}} : i64
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i64
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f64 to i64
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c63{{.*}} : i64
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i64) -> i1
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+  ! CHECK:     %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f64) -> i1
+  ! CHECK:     %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f64) {
+  ! CHECK:       fir.result %[[V_5]] : f64
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f64
+  ! CHECK:       %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f64) {
+  ! CHECK:         %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f64
+  ! CHECK:         %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_22]] : f64
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i64
+  ! CHECK-DAG:     %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i64
+  ! CHECK:         %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i64
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i64 to f64
+  ! CHECK:         %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f64) -> i1
+  ! CHECK:         fir.if %[[V_26]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f64) -> i1
+  ! CHECK:         fir.if %[[V_27]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_25]] : f64
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_21]] : f64
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f64>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test4(x, s)
+  real(kind=8) :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test5
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f80) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f80 to i80
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c79{{.*}} : i80
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i80
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f80 to i80
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c79{{.*}} : i80
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i80) -> i1
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+  ! CHECK:     %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f80) -> i1
+  ! CHECK:     %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f80) {
+  ! CHECK:       fir.result %[[V_5]] : f80
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f80
+  ! CHECK:       %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f80) {
+  ! CHECK:         %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f80
+  ! CHECK:         %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_22]] : f80
+  ! CHECK:       } else {
+  ! CHECK:         %[[V_22:[0-9]+]] = fir.call @_FortranANearest10(%[[V_5]], %[[V_11]]) fastmath<contract> : (f80, i1) -> f80
+  ! CHECK:         fir.result %[[V_22]] : f80
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_21]] : f80
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f80>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test5(x, s)
+  real(kind=10) :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test6
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f128) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f128 to i128
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c127{{.*}} : i128
+  ! CHECK:     %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i128
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c127{{.*}} : i128
+  ! CHECK:     %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i128) -> i1
+  ! CHECK:     %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+  ! CHECK:     %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f128) {
+  ! CHECK:       fir.result %[[V_5]] : f128
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+  ! CHECK:       %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f128) {
+  ! CHECK:         %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+  ! CHECK:         %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_22]] : f128
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i128
+  ! CHECK-DAG:     %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i128
+  ! CHECK:         %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i128
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i128 to f128
+  ! CHECK:         %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f128) -> i1
+  ! CHECK:         fir.if %[[V_26]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f128) -> i1
+  ! CHECK:         fir.if %[[V_27]] {
+  ! CHECK:           %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_25]] : f128
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_21]] : f128
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f128>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test6(x, s)
+  real(kind=16) :: x, s, res
+  res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test7
+  ! CHECK:     %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+  ! CHECK:     %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
+  ! CHECK:     %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+  ! CHECK:     %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  ! CHECK:     %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+  ! CHECK:     %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+  ! CHECK:     %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
+  ! CHECK:     %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
+  ! CHECK:     fir.if %[[V_8]] {
+  ! CHECK:       fir.call @_FortranAReportFatalUserError
+  ! CHECK:     }
+  ! CHECK:     %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
+  ! CHECK:     %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
+  ! CHECK:     %[[V_11:[0-9]+]] = fir.convert %[[V_10]] : (i32) -> i128
+  ! CHECK:     %[[V_12:[0-9]+]] = arith.cmpi ne, %[[V_11]], %c1{{.*}} : i128
+  ! CHECK:     %[[V_13:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+  ! CHECK:     %[[V_14:[0-9]+]] = arith.shrui %[[V_13]], %c127{{.*}} : i128
+  ! CHECK:     %[[V_15:[0-9]+]] = fir.convert %[[V_14]] : (i128) -> i1
+  ! CHECK:     %[[V_16:[0-9]+]] = arith.cmpi ne, %[[V_12]], %[[V_15]] : i1
+  ! CHECK:     %[[V_17:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+  ! CHECK:     %[[V_18:[0-9]+]] = arith.andi %[[V_17]], %[[V_16]] : i1
+  ! CHECK:     %[[V_19:[0-9]+]] = arith.ori %[[V_7]], %[[V_18]] : i1
+  ! CHECK:     %[[V_20:[0-9]+]] = fir.if %[[V_19]] -> (f128) {
+  ! CHECK:       fir.result %[[V_5]] : f128
+  ! CHECK:     } else {
+  ! CHECK:       %[[V_21:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+  ! CHECK:       %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (f128) {
+  ! CHECK:         %[[V_23:[0-9]+]] = arith.select %[[V_12]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+  ! CHECK:         %[[V_24:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                            fir.call @feraiseexcept(%[[V_24]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         fir.result %[[V_23]] : f128
+  ! CHECK:       } else {
+  ! CHECK-DAG:     %[[V_23:[0-9]+]] = arith.subi %[[V_13]], %c1{{.*}} : i128
+  ! CHECK-DAG:     %[[V_24:[0-9]+]] = arith.addi %[[V_13]], %c1{{.*}} : i128
+  ! CHECK:         %[[V_25:[0-9]+]] = arith.select %[[V_16]], %[[V_24]], %[[V_23]] : i128
+  ! CHECK:         %[[V_26:[0-9]+]] = arith.bitcast %[[V_25]] : i128 to f128
+  ! CHECK:         %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 516 : i32}> : (f128) -> i1
+  ! CHECK:         fir.if %[[V_27]] {
+  ! CHECK:           %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         %[[V_28:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 144 : i32}> : (f128) -> i1
+  ! CHECK:         fir.if %[[V_28]] {
+  ! CHECK:           %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+  ! CHECK:                              fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+  ! CHECK:         }
+  ! CHECK:         fir.result %[[V_26]] : f128
+  ! CHECK:       }
+  ! CHECK:       fir.result %[[V_22]] : f128
+  ! CHECK:     }
+  ! CHECK:     fir.store %[[V_20]] to %[[V_2]] : !fir.ref<f128>
+  ! CHECK:     return
+  ! CHECK:   }
+subroutine nearest_test7(x, s)
+  real(kind=16) :: x, res
+  real :: s
+  res = nearest(x, s)
+end
diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90
index fa910e79bd766..0d2594c86a6eb 100644
--- a/flang/test/Lower/OpenACC/acc-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-loop.f90
@@ -317,10 +317,10 @@ subroutine sub1(i, j, k)
 end subroutine
 
 ! CHECK: func.func @_QPsub1
+! CHECK: acc.parallel
 ! CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"}
 ! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"}
 ! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"}
-! CHECK: acc.parallel
 ! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
 ! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
 ! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90
new file mode 100644
index 0000000000000..1cbbcdcd0e4fd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90
@@ -0,0 +1,37 @@
+! Test delayed privatization for variables that are storage associated via `EQUIVALENCE`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 \
+! RUN:   | FileCheck %s
+
+subroutine private_common
+  real x, y
+  equivalence (x,y)
+  !$omp parallel firstprivate(x)
+    x = 3.14
+  !$omp end parallel
+end subroutine
+
+! CHECK:  omp.private {type = firstprivate} @[[X_PRIVATIZER:.*]] : ![[X_TYPE:fir.ptr<f32>]] alloc {
+! CHECK:  ^bb0(%{{.*}}: ![[X_TYPE]]):
+! CHECK:    %[[PRIV_ALLOC:.*]] = fir.alloca f32 {bindc_name = "x", {{.*}}}
+! CHECK:    %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {{{.*}}} : (![[PRIV_TYPE:fir.ref<f32>]]) -> ({{.*}})
+! CHECK:    %[[PRIV_CONV:.*]] = fir.convert %[[PRIV_DECL]]#0 : (![[PRIV_TYPE]]) -> ![[X_TYPE]]
+! CHECK:    omp.yield(%[[PRIV_CONV]] : ![[X_TYPE]])
+! CHECK:  } copy {
+! CHECK:  ^bb0(%[[ORIG_PTR:.*]]: ![[X_TYPE]], %[[PRIV_REF:.*]]: ![[X_TYPE]]):
+! CHECK:    %[[ORIG_VAL:.*]] = fir.load %[[ORIG_PTR]] : !fir.ptr<f32>
+! CHECK:    hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] temporary_lhs : f32, ![[X_TYPE]]
+! CHECK:    omp.yield(%[[PRIV_REF]] : ![[X_TYPE]])
+! CHECK:  }
+
+! CHECK:  func.func @_QPprivate_common() {
+! CHECK:    omp.parallel private(@[[X_PRIVATIZER]] %{{.*}}#0 -> %[[PRIV_ARG:.*]] : ![[X_TYPE]]) {
+! CHECK:      %[[REG_DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]] {{{.*}}} : (![[X_TYPE]]) -> ({{.*}})
+! CHECK:      %[[CST:.*]] = arith.constant {{.*}}
+! CHECK:      hlfir.assign %[[CST]] to %[[REG_DECL]]#0 : {{.*}}
+! CHECK:      omp.terminator
+! CHECK:    }
+! CHECK:    return
+! CHECK:  }
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-character.f90 b/flang/test/Lower/OpenMP/Todo/atomic-character.f90
new file mode 100644
index 0000000000000..88effa4a2a515
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/atomic-character.f90
@@ -0,0 +1,8 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Unsupported atomic type
+subroutine character_atomic
+  character :: l, r
+  !$omp atomic read
+    l = r
+end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-complex.f90 b/flang/test/Lower/OpenMP/Todo/atomic-complex.f90
new file mode 100644
index 0000000000000..6d6e4399ee192
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/atomic-complex.f90
@@ -0,0 +1,8 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Unsupported atomic type
+subroutine complex_atomic
+  complex :: l, r
+  !$omp atomic read
+    l = r
+end subroutine
diff --git a/flang/test/Lower/OpenMP/associate.f90 b/flang/test/Lower/OpenMP/associate.f90
new file mode 100644
index 0000000000000..c6890f0954a7f
--- /dev/null
+++ b/flang/test/Lower/OpenMP/associate.f90
@@ -0,0 +1,38 @@
+! Check that constructs with associate and variables that have implicitly
+! determined DSAs are lowered properly.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: func @_QPtest_parallel_assoc
+!CHECK:         omp.parallel {
+!CHECK-NOT:       hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEa"}
+!CHECK-NOT:       hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"}
+!CHECK:           omp.wsloop {
+!CHECK:           }
+!CHECK:         }
+!CHECK:         omp.parallel {
+!CHECK-NOT:       hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"}
+!CHECK:           omp.wsloop {
+!CHECK:           }
+!CHECK:         }
+subroutine test_parallel_assoc()
+  integer, parameter :: l = 3
+  integer :: a(l)
+  integer :: i
+  a = 1
+
+  !$omp parallel do
+  do i = 1,l
+    associate (b=>a)
+      b(i) = b(i) * 2
+    end associate
+  enddo
+  !$omp end parallel do
+
+  !$omp parallel do default(private)
+  do i = 1,l
+    associate (b=>a)
+      b(i) = b(i) * 2
+    end associate
+  enddo
+  !$omp end parallel do
+end subroutine
diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90
index 8c3f37c94975e..9559df171111d 100644
--- a/flang/test/Lower/OpenMP/atomic-read.f90
+++ b/flang/test/Lower/OpenMP/atomic-read.f90
@@ -5,22 +5,18 @@
 ! This test checks the lowering of atomic read
 
 !CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} {
-!CHECK:    %[[A_C1:.*]] = arith.constant 1 : index
-!CHECK:    %[[A_REF:.*]] = fir.alloca !fir.char<1> {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] typeparams %[[A_C1]] {uniq_name = "_QFEa"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
-!CHECK:    %[[B_C1:.*]] = arith.constant 1 : index
-!CHECK:    %[[B_REF:.*]] = fir.alloca !fir.char<1> {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_REF]] typeparams %[[B_C1]] {uniq_name = "_QFEb"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+!CHECK:    %[[A_REF:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
+!CHECK:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] {uniq_name = "_QFEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[B_REF:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"}
+!CHECK:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_REF]] {uniq_name = "_QFEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    %[[C_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "c", uniq_name = "_QFEc"}
 !CHECK:    %[[C_DECL:.*]]:2 = hlfir.declare %[[C_REF]] {uniq_name = "_QFEc"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 !CHECK:    %[[D_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "d", uniq_name = "_QFEd"}
 !CHECK:    %[[D_DECL:.*]]:2 = hlfir.declare %[[D_REF]] {uniq_name = "_QFEd"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-!CHECK:    %[[E_C8:.*]] = arith.constant 8 : index
-!CHECK:    %[[E_REF:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "e", uniq_name = "_QFEe"}
-!CHECK:    %[[E_DECL:.*]]:2 = hlfir.declare %[[E_REF]] typeparams %[[E_C8]] {uniq_name = "_QFEe"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
-!CHECK:    %[[F_C8:.*]] = arith.constant 8 : index
-!CHECK:    %[[F_REF:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "f", uniq_name = "_QFEf"}
-!CHECK:    %[[F_DECL:.*]]:2 = hlfir.declare %[[F_REF]] typeparams %[[F_C8]] {uniq_name = "_QFEf"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
+!CHECK:    %[[E_REF:.*]] = fir.alloca i32 {bindc_name = "e", uniq_name = "_QFEe"}
+!CHECK:    %[[E_DECL:.*]]:2 = hlfir.declare %[[E_REF]] {uniq_name = "_QFEe"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[F_REF:.*]] = fir.alloca i32 {bindc_name = "f", uniq_name = "_QFEf"}
+!CHECK:    %[[F_DECL:.*]]:2 = hlfir.declare %[[F_REF]] {uniq_name = "_QFEf"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    %[[G_REF:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"}
 !CHECK:    %[[G_DECL:.*]]:2 = hlfir.declare %[[G_REF]] {uniq_name = "_QFEg"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK:    %[[H_REF:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"}
@@ -30,9 +26,9 @@
 !CHECK:    %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
 !CHECK:    %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_REF]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    omp.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1   memory_order(acquire) hint(uncontended) : !fir.ref<i32>, i32
-!CHECK:    omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1   memory_order(relaxed) : !fir.ref<!fir.char<1>>, !fir.char<1>
+!CHECK:    omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1   memory_order(relaxed) : !fir.ref<i32>, i32
 !CHECK:    omp.atomic.read %[[C_DECL]]#1 = %[[D_DECL]]#1   memory_order(seq_cst) hint(contended) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
-!CHECK:    omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1   hint(speculative) : !fir.ref<!fir.char<1,8>>, !fir.char<1,8>
+!CHECK:    omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1   hint(speculative) : !fir.ref<i32>, i32
 !CHECK:    omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1   hint(nonspeculative) : !fir.ref<f32>, f32
 !CHECK:    omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1   : !fir.ref<f32>, f32
 
@@ -40,9 +36,9 @@ program OmpAtomic
 
     use omp_lib
     integer :: x, y
-    character :: a, b
+    integer :: a, b
     logical :: c, d
-    character(8) :: e, f
+    integer :: e, f
     real g, h
     !$omp atomic acquire read hint(omp_sync_hint_uncontended)
        x = y
diff --git a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90 b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
index 25579272a6e0b..21992aa629baf 100644
--- a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
+++ b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
@@ -1,6 +1,6 @@
 !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
 
-!CHECK: @_QPsb
+!CHECK-LABEL: func @_QPsb
 subroutine sb(a)
   integer :: a(:)
 !CHECK: omp.parallel
@@ -9,3 +9,16 @@ subroutine sb(a)
     if (any(a/=(/(100,i=1,5)/))) print *, "OK"
   !$omp end parallel
 end subroutine
+
+!CHECK-LABEL: func @_QPsb2
+subroutine sb2()
+  integer, parameter :: SIZE=20
+  integer :: i, a(SIZE)
+
+! Just check that the construct below doesn't hit a TODO in lowering.
+!CHECK: omp.parallel
+  !$omp parallel
+    a = [ (i, i=1, SIZE) ]
+    print *, i
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/firstprivate-allocatable.f90 b/flang/test/Lower/OpenMP/firstprivate-allocatable.f90
new file mode 100644
index 0000000000000..9a16555543cff
--- /dev/null
+++ b/flang/test/Lower/OpenMP/firstprivate-allocatable.f90
@@ -0,0 +1,25 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+program firstprivateallocatable
+  Integer, Allocatable :: a,u
+  a = 137
+
+  !$omp parallel firstprivate(a,u)
+  u = a**2
+  !$omp end parallel
+end program
+
+
+! CHECK-LABEL:   func.func @_QQmain()
+! [...]
+! CHECK:           omp.parallel {
+! [...]
+! CHECK:             %[[VAL_50:.*]] = arith.constant 2 : i32
+! CHECK:             %[[VAL_51:.*]] = math.ipowi %{{.*}}, %[[VAL_50]] : i32
+! this is what we are really checking: the hlfir.assign must have realloc so that
+! u is allocated when the assignment occurs
+! CHECK:             hlfir.assign %[[VAL_51]] to %{{.*}}#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
+! [...]
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/firstprivate-commonblock.f90 b/flang/test/Lower/OpenMP/firstprivate-commonblock.f90
index d0fcdac76ad79..0fa0d2bc32495 100644
--- a/flang/test/Lower/OpenMP/firstprivate-commonblock.f90
+++ b/flang/test/Lower/OpenMP/firstprivate-commonblock.f90
@@ -1,4 +1,6 @@
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp \
+! RUN:   -mmlir --openmp-enable-delayed-privatization=true -o - %s 2>&1 \
+! RUN: | FileCheck %s
 
 !CHECK: func.func @_QPfirstprivate_common() {
 !CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
@@ -12,15 +14,9 @@
 !CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
 !CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<f32>
 !CHECK: %[[VAL_6_DECL:.*]]:2 = hlfir.declare %[[val_6]] {uniq_name = "_QFfirstprivate_commonEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK: omp.parallel {
-!CHECK: %[[val_7:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFfirstprivate_commonEx"}
+!CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[val_7:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[val_9:.*]] : {{.*}}) {
 !CHECK: %[[VAL_7_DECL:.*]]:2 = hlfir.declare %[[val_7]] {uniq_name = "_QFfirstprivate_commonEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK: %[[val_8:.*]] = fir.load %[[VAL_3_DECL]]#0 : !fir.ref<f32>
-!CHECK: hlfir.assign %[[val_8]] to %[[VAL_7_DECL]]#0 temporary_lhs : f32, !fir.ref<f32>
-!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFfirstprivate_commonEy"}
 !CHECK: %[[VAL_9_DECL:.*]]:2 = hlfir.declare %[[val_9]] {uniq_name = "_QFfirstprivate_commonEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK: %[[val_10:.*]] = fir.load %[[VAL_6_DECL]]#0 : !fir.ref<f32>
-!CHECK: hlfir.assign %[[val_10]] to %[[VAL_9_DECL]]#0 temporary_lhs : f32, !fir.ref<f32>
 !CHECK: omp.terminator
 !CHECK: }
 !CHECK: return
diff --git a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90
new file mode 100644
index 0000000000000..41bbb182aade2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90
@@ -0,0 +1,38 @@
+! RUN: %flang_fc1 -emit-hlfir -o - -fopenmp %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -fopenmp %s | FileCheck %s
+
+program lastprivate_allocatable
+  integer, allocatable :: a
+  integer :: i
+  ! a is unallocated here
+  !$omp parallel do lastprivate(a)
+  do i=1,1
+    a = 42
+  enddo
+  !$omp end parallel do
+  ! a should be allocated here
+end program
+
+! CHECK-LABEL:   func.func @_QQmain()
+! CHECK:           %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "a", uniq_name = "_QFEa"}
+! CHECK:           %[[VAL_1:.*]] = fir.zero_bits !fir.heap<i32>
+! CHECK:           %[[VAL_2:.*]] = fir.embox %[[VAL_1]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK:           omp.parallel {
+!                    create original copy of private variable
+! CHECK:             %[[VAL_16:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = {{.*}}<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK:             %[[VAL_17:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"}
+! CHECK:             %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             omp.wsloop {
+! CHECK:               omp.loop_nest
+!                        [...]
+!                        if this is the last iteration
+! CHECK:                 fir.if %{{.*}} {
+!                          store loop IV
+! CHECK:                   fir.store %{{.*}} to %[[VAL_18]]#1 : !fir.ref<i32>
+!                          assign private variable to original copy: realloc
+! CHECK:                   hlfir.assign %[[VAL_16]]#0 to %[[VAL_3]]#0 realloc : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:            }
+! CHECK-NEXT:            omp.yield
+! CHECK-NEXT:          }
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
new file mode 100644
index 0000000000000..9b90ac28f693c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
@@ -0,0 +1,239 @@
+! This test checks lowering of OpenMP parallel DO, with the loop bound being
+! a lastprivate variable
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK: func @_QPomp_do_lastprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
+subroutine omp_do_lastprivate(a)
+  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  integer::n
+  n = a+1
+  !$omp parallel do lastprivate(a)
+  ! CHECK:  omp.parallel {
+
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivateEa"}
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.wsloop {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK:      %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP]] : i32
+  ! CHECK:      %[[ZERO:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP]], %[[ZERO]] : i32
+  ! CHECK:      %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB]] : i32
+  ! CHECK:      %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB]] : i32
+  ! CHECK:      %[[SEL:.*]] = arith.select %[[STEP_DIR]], %[[LT_UB]], %[[GT_UB]] : i1
+  ! CHECK:      fir.if %[[SEL]] {
+  ! CHECK:        fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK:        hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+  ! CHECK:      }
+
+  ! CHECK-NEXT: omp.yield
+  ! CHECK-NEXT: }
+  ! CHECK-NEXT: omp.terminator
+  ! CHECK-NEXT: }
+    do i=1, a
+      call foo(i, a)
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG0_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(a)
+end subroutine omp_do_lastprivate
+
+! CHECK: func @_QPomp_do_lastprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
+subroutine omp_do_lastprivate2(a, n)
+  ! CHECK:  %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  integer::n
+  n = a+1
+  !$omp parallel do lastprivate(a, n)
+  ! CHECK:  omp.parallel {
+
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, {{.*}}}
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_lastprivate2En"}
+  ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[LB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK: %[[UB:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  ! CHECK: fir.store %[[ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP]] : i32
+  ! CHECK: %[[ZERO:.*]] = arith.constant 0 : i32
+  ! CHECK: %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP]], %[[ZERO]] : i32
+  ! CHECK: %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB]] : i32
+  ! CHECK: %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB]] : i32
+  ! CHECK: %[[SEL:.*]] = arith.select %[[STEP_DIR]], %[[LT_UB]], %[[GT_UB]] : i1
+  ! CHECK: fir.if %[[SEL]] {
+  ! CHECK:   fir.store %[[NEXT_ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:   %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK:   hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+  ! CHECK:   %[[N_PVT_LOAD:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK:   hlfir.assign %[[N_PVT_LOAD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+  ! CHECK: }
+
+  ! CHECK: omp.yield
+  ! CHECK: omp.terminator
+    do i= a, n
+      call foo(i, a)
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG1_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(n)
+end subroutine omp_do_lastprivate2
+
+! CHECK: func @_QPomp_do_lastprivate_collapse2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
+subroutine omp_do_lastprivate_collapse2(a)
+  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  !$omp parallel do lastprivate(a) collapse(2)
+  ! CHECK:  omp.parallel {
+
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivate_collapse2Ea"}
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  !
+  ! CHECK: %[[J_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "j", pinned, {{.*}}}
+  ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP1:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP2:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.wsloop {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[LB1]], %[[LB2]]) to (%[[UB1]], %[[UB2]]) inclusive step (%[[STEP1]], %[[STEP2]]) {
+  ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.store %[[ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK:      %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1]] : i32
+  ! CHECK:      %[[ZERO1:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1]], %[[ZERO1]] : i32
+  ! CHECK:      %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1]] : i32
+  ! CHECK:      %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1]] : i32
+  ! CHECK:      %[[SEL1:.*]] = arith.select %[[STEP1_END]], %[[LT_UB1]], %[[GT_UB1]] : i1
+  ! CHECK:      %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2]] : i32
+  ! CHECK:      %[[ZERO2:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2]], %[[ZERO2]] : i32
+  ! CHECK:      %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2]] : i32
+  ! CHECK:      %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2]] : i32
+  ! CHECK:      %[[SEL2:.*]] = arith.select %[[STEP2_END]], %[[LT_UB2]], %[[GT_UB2]] : i1
+  ! CHECK:      %[[AND:.*]] = arith.andi %[[SEL1]], %[[SEL2]] : i1
+  ! CHECK:      fir.if %[[AND]] {
+  ! CHECK:        fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK:        hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+  ! CHECK:      }
+
+  ! CHECK-NEXT: omp.yield
+  ! CHECK-NEXT: }
+  ! CHECK-NEXT: omp.terminator
+  ! CHECK-NEXT: }
+    do i=1, a
+      do j=1, a
+        call foo(i, a)
+      end do
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG0_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(a)
+end subroutine omp_do_lastprivate_collapse2
+
+! CHECK: func @_QPomp_do_lastprivate_collapse3(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
+subroutine omp_do_lastprivate_collapse3(a)
+  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  !$omp parallel do lastprivate(a) collapse(3)
+  ! CHECK:  omp.parallel {
+
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivate_collapse3Ea"}
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[J_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "j", pinned, {{.*}}}
+  ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[K_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "k", pinned, {{.*}}}
+  ! CHECK: %[[K_PVT_DECL:.*]]:2 = hlfir.declare %[[K_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  ! CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP1:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP2:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[LB3:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB3:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP3:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.wsloop {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]], %[[ARG3:.*]]) : i32 = (%[[LB1]], %[[LB2]], %[[LB3]]) to (%[[UB1]], %[[UB2]], %[[UB3]]) inclusive step (%[[STEP1]], %[[STEP2]], %[[STEP3]]) {
+  ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.store %[[ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.store %[[ARG3]] to %[[K_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK:      %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1]] : i32
+  ! CHECK:      %[[ZERO1:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1]], %[[ZERO1]] : i32
+  ! CHECK:      %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1]] : i32
+  ! CHECK:      %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1]] : i32
+  ! CHECK:      %[[SEL1:.*]] = arith.select %[[STEP1_END]], %[[LT_UB1]], %[[GT_UB1]] : i1
+  ! CHECK:      %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2]] : i32
+  ! CHECK:      %[[ZERO2:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2]], %[[ZERO2]] : i32
+  ! CHECK:      %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2]] : i32
+  ! CHECK:      %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2]] : i32
+  ! CHECK:      %[[SEL2:.*]] = arith.select %[[STEP2_END]], %[[LT_UB2]], %[[GT_UB2]] : i1
+  ! CHECK:      %[[AND1:.*]] = arith.andi %[[SEL1]], %[[SEL2]] : i1
+  ! CHECK:      %[[NEXT_ARG3:.*]] = arith.addi %[[ARG3]], %[[STEP3]] : i32
+  ! CHECK:      %[[ZERO3:.*]] = arith.constant 0 : i32
+  ! CHECK:      %[[STEP3_END:.*]] = arith.cmpi slt, %[[STEP3]], %[[ZERO3]] : i32
+  ! CHECK:      %[[LT_UB3:.*]] = arith.cmpi slt, %[[NEXT_ARG3]], %[[UB3]] : i32
+  ! CHECK:      %[[GT_UB3:.*]] = arith.cmpi sgt, %[[NEXT_ARG3]], %[[UB3]] : i32
+  ! CHECK:      %[[SEL3:.*]] = arith.select %[[STEP3_END]], %[[LT_UB3]], %[[GT_UB3]] : i1
+  ! CHECK:      %[[AND2:.*]] = arith.andi %[[AND1]], %[[SEL3]] : i1
+  ! CHECK:      fir.if %[[AND2]] {
+  ! CHECK:        fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        fir.store %[[NEXT_ARG3]] to %[[K_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK:        %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK:        hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+  ! CHECK:      }
+
+  ! CHECK-NEXT: omp.yield
+  ! CHECK-NEXT: }
+  ! CHECK-NEXT: omp.terminator
+  ! CHECK-NEXT: }
+    do i=1, a
+      do j=1, a
+        do k=1, a
+          call foo(i, a)
+        end do
+      end do
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG0_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(a)
+end subroutine omp_do_lastprivate_collapse3
diff --git a/flang/test/Lower/OpenMP/private-commonblock.f90 b/flang/test/Lower/OpenMP/private-commonblock.f90
index ee580594f7c3f..20f6cf57c2ae5 100644
--- a/flang/test/Lower/OpenMP/private-commonblock.f90
+++ b/flang/test/Lower/OpenMP/private-commonblock.f90
@@ -1,10 +1,10 @@
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp \
+! RUN:   -mmlir --openmp-enable-delayed-privatization=true -o - %s 2>&1 \
+! RUN: | FileCheck %s
 
 !CHECK: func.func @_QPprivate_common() {
-!CHECK: omp.parallel {
-!CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_commonEx"}
+!CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[X:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[Y:.*]] : {{.*}}) {
 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFprivate_commonEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK: %[[Y:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_commonEy"}
 !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFprivate_commonEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: omp.terminator
 !CHECK: }
@@ -48,17 +48,13 @@ subroutine private_common
 !CHECK:    %[[D_REF:.*]] = fir.convert %[[D_DECL]]#1 : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,5>>
 !CHECK:    %[[D_BOX:.*]] = fir.emboxchar %[[D_REF]], %[[TP5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 !CHECK:    fir.call @_QPsub1(%[[A_DECL]]#1, %[[B_DECL]]#1, %[[C_BOX]], %[[D_BOX]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
-!CHECK:        omp.parallel {
-!CHECK:      %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblockEa"}
+!CHECK:    omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[B_PVT_REF:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[C_PVT_REF:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[D_PVT_REF:.*]] : {{.*}}) {
 !CHECK:      %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblockEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:      %[[B_PVT_REF:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "b", pinned, uniq_name = "_QFprivate_clause_commonblockEb"}
-!CHECK:      %[[SH10:.*]] = fir.shape %c10 : (index) -> !fir.shape<1>
+!CHECK:      %[[SH10:.*]] = fir.shape %c10{{.*}} : (index) -> !fir.shape<1>
 !CHECK:      %[[B_PVT_DECL:.*]]:2 = hlfir.declare %[[B_PVT_REF]](%[[SH10]]) {uniq_name = "_QFprivate_clause_commonblockEb"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
-!CHECK:      %[[C_PVT_REF:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblockEc"}
 !CHECK:      %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_REF]] typeparams %{{.*}} {uniq_name = "_QFprivate_clause_commonblockEc"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
-!CHECK:      %[[D_PVT_REF:.*]] = fir.alloca !fir.array<5x!fir.char<1,5>> {bindc_name = "d", pinned, uniq_name = "_QFprivate_clause_commonblockEd"}
-!CHECK:      %[[SH5:.*]] = fir.shape %c5_1 : (index) -> !fir.shape<1>
-!CHECK:      %[[D_PVT_DECL:.*]]:2 = hlfir.declare %[[D_PVT_REF]](%[[SH5]]) typeparams %[[TP5]] {uniq_name = "_QFprivate_clause_commonblockEd"} : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.ref<!fir.array<5x!fir.char<1,5>>>)
+!CHECK:      %[[SH5:.*]] = fir.shape %c5{{.*}} : (index) -> !fir.shape<1>
+!CHECK:      %[[D_PVT_DECL:.*]]:2 = hlfir.declare %[[D_PVT_REF]](%[[SH5]]) typeparams %c5{{.*}} {uniq_name = "_QFprivate_clause_commonblockEd"} : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.ref<!fir.array<5x!fir.char<1,5>>>)
 !CHECK:      %[[C_PVT_BOX:.*]] = fir.emboxchar %[[C_PVT_DECL]]#1, %{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 !CHECK:      %[[D_PVT_REF:.*]] = fir.convert %[[D_PVT_DECL]]#1 : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,5>>
 !CHECK:      %[[D_PVT_BOX:.*]] = fir.emboxchar %[[D_PVT_REF]], %{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
@@ -98,10 +94,8 @@ subroutine private_clause_commonblock()
 !CHECK:    %[[C_ADDR:.*]] = fir.box_addr %[[C_BOX]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
 !CHECK:    %[[C_REF:.*]] = fir.convert %[[C_ADDR]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
 !CHECK:    fir.call @_QPsub4(%[[C_REF]], %[[A_DECL]]#1) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK:    omp.parallel {
-!CHECK:      %[[C_PVT_REF:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.complex<4>>> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEc"}
+!CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[C_PVT_REF:.*]] : {{.*}}, @{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]] : {{.*}}) {
 !CHECK:      %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_REF]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFprivate_clause_commonblock_pointerEc"} : (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>)
-!CHECK:      %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEa"}
 !CHECK:      %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblock_pointerEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:      %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
 !CHECK:      %[[C_PVT_ADDR:.*]] = fir.box_addr %[[C_PVT_BOX]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
diff --git a/flang/test/Lower/OpenMP/private-derived-type.f90 b/flang/test/Lower/OpenMP/private-derived-type.f90
new file mode 100644
index 0000000000000..230484f20c11d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/private-derived-type.f90
@@ -0,0 +1,47 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+subroutine s4
+  type y3
+    integer,allocatable::x
+  end type y3
+  type(y3)::v
+  !$omp parallel
+  !$omp do private(v)
+  do i=1,10
+    v%x=1
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine s4
+
+
+! CHECK-LABEL:   func.func @_QPs4() {
+!                  Example of how the lowering for regular derived type variables:
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}> {bindc_name = "v", uniq_name = "_QFs4Ev"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFs4Ev"} : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>, !fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>)
+! CHECK:           %[[VAL_10:.*]] = fir.embox %[[VAL_9]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:           %[[VAL_11:.*]] = fir.address_of
+! CHECK:           %[[VAL_12:.*]] = arith.constant 4 : i32
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+! CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+! CHECK:           %[[VAL_15:.*]] = fir.call @_FortranAInitialize(%[[VAL_13]], %[[VAL_14]], %[[VAL_12]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_23:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}> {bindc_name = "v", pinned, uniq_name = "_QFs4Ev"}
+! CHECK:             %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFs4Ev"} : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>, !fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>)
+! CHECK:             %[[VAL_25:.*]] = fir.embox %[[VAL_24]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:             %[[VAL_26:.*]] = fir.address_of
+! CHECK:             %[[VAL_27:.*]] = arith.constant 4 : i32
+! CHECK:             %[[VAL_28:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+! CHECK:             %[[VAL_29:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+!                    Check we do call FortranAInitialize on the derived type
+! CHECK:             %[[VAL_30:.*]] = fir.call @_FortranAInitialize(%[[VAL_28]], %[[VAL_29]], %[[VAL_27]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
+! CHECK:             omp.wsloop {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           %[[VAL_39:.*]] = fir.embox %[[VAL_9]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:           %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+!                  Check the derived type is destroyed
+! CHECK:           %[[VAL_41:.*]] = fir.call @_FortranADestroy(%[[VAL_40]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK:           return
+! CHECK:         }
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 0500b21a84cb0..9b92293cbf92f 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -66,7 +66,7 @@ subroutine omp_target_enter_nowait
    integer :: a(1024)
    !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
    !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_enter_data  nowait map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
+   !CHECK: omp.target_enter_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) nowait
    !$omp target enter data map(to: a) nowait
 end subroutine omp_target_enter_nowait
 
@@ -278,7 +278,7 @@ subroutine omp_target_update_nowait
    !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
    !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
 
-   !CHECK: omp.target_update nowait map_entries
+   !CHECK: omp.target_update map_entries({{.*}}) nowait
    !$omp target update from(a) nowait
 end subroutine omp_target_update_nowait
 
@@ -493,7 +493,7 @@ subroutine omp_target_thread_limit
    integer :: a
    !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a"}
    !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32
-   !CHECK: omp.target   thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref<i32>) {
+   !CHECK: omp.target map_entries(%[[MAP]] -> %{{.*}} : !fir.ref<i32>) thread_limit(%[[VAL_1]] : i32) {
    !CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
    !$omp target map(tofrom: a) thread_limit(64)
       a = 10
@@ -512,7 +512,7 @@ subroutine omp_target_device_ptr
    type(c_ptr) :: a
    integer, target :: b
    !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
-   !CHECK: omp.target_data use_device_ptr({{.*}}) map_entries(%[[MAP]]{{.*}}
+   !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}})
    !$omp target data map(tofrom: a) use_device_ptr(a)
    !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
    !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
@@ -533,7 +533,7 @@ subroutine omp_target_device_addr
    !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
    !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
    !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
-   !CHECK: omp.target_data use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>) map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) {
+   !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
    !$omp target data map(tofrom: a) use_device_addr(a)
    !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
    !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90
index 71ff57cd8fa54..afbe2cbfa746e 100644
--- a/flang/test/Lower/OpenMP/task.f90
+++ b/flang/test/Lower/OpenMP/task.f90
@@ -227,7 +227,7 @@ subroutine task_multiple_clauses()
   integer :: x, y, z
   logical :: buzz
 
-  !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
+  !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) final(%{{.+}}) if(%{{.+}}) priority(%{{.+}}) {
   !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y)
 
 !CHECK: %[[X_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFtask_multiple_clausesEx"}
diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
index 0ad06f73a8ce9..acb5f533b619e 100644
--- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
+++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
@@ -8,7 +8,7 @@
 ! functionality 
 
 !CHECK: func.func @{{.*}}only_use_device_ptr()
-!CHECK: omp.target_data use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
 !CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
 subroutine only_use_device_ptr 
     use iso_c_binding
@@ -21,7 +21,7 @@ subroutine only_use_device_ptr
 end subroutine
 
 !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr()
-!CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) use_device_ptr({{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
 !CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
 subroutine mix_use_device_ptr_and_addr 
     use iso_c_binding
@@ -47,7 +47,7 @@ subroutine only_use_device_addr
 end subroutine
 
 !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map()
-!CHECK: omp.target_data use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) map_entries(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) {
+!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
 !CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
 subroutine mix_use_device_ptr_and_addr_and_map
     use iso_c_binding
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
index bc22c8b05b967..cda7593b217ab 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
@@ -244,8 +244,7 @@ program reduce15
 ! CHECK:                   %[[VAL_85:.*]] = arith.select %[[VAL_84]], %[[VAL_76]], %[[VAL_83]] : i32
 ! CHECK:                   hlfir.yield_element %[[VAL_85]] : i32
 ! CHECK:                 }
-! CHECK:                 %[[VAL_86:.*]] = fir.load %[[VAL_62]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK:                 hlfir.assign %[[VAL_68]] to %[[VAL_86]] : !hlfir.expr<?xi32>, !fir.box<!fir.heap<!fir.array<?xi32>>>
+! CHECK:                 hlfir.assign %[[VAL_68]] to %[[VAL_62]]#0 realloc : !hlfir.expr<?xi32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 ! CHECK:                 hlfir.destroy %[[VAL_68]] : !hlfir.expr<?xi32>
 ! CHECK:                 omp.yield
 ! CHECK:               }
@@ -288,8 +287,7 @@ program reduce15
 ! CHECK:                   %[[VAL_117:.*]] = arith.select %[[VAL_116]], %[[VAL_108]], %[[VAL_115]] : i32
 ! CHECK:                   hlfir.yield_element %[[VAL_117]] : i32
 ! CHECK:                 }
-! CHECK:                 %[[VAL_118:.*]] = fir.load %[[VAL_94]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK:                 hlfir.assign %[[VAL_100]] to %[[VAL_118]] : !hlfir.expr<?xi32>, !fir.box<!fir.heap<!fir.array<?xi32>>>
+! CHECK:                 hlfir.assign %[[VAL_100]] to %[[VAL_94]]#0 realloc : !hlfir.expr<?xi32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 ! CHECK:                 hlfir.destroy %[[VAL_100]] : !hlfir.expr<?xi32>
 ! CHECK:                 omp.yield
 ! CHECK:               }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
index 13858ff2f34e6..3c5388b7e5d90 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
@@ -85,9 +85,7 @@ program reduce
 ! CHECK:                 %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]] {fortran_attrs = {{.*}}<allocatable>, uniq_name = "_QFEr"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
 ! CHECK:                 fir.store %[[VAL_15]] to %[[VAL_10]]#1 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
-! CHECK:                 %[[VAL_18:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
-! CHECK:                 %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
-! CHECK:                 hlfir.assign %[[VAL_17]] to %[[VAL_19]] : i32, !fir.heap<i32>
+! CHECK:                 hlfir.assign %[[VAL_17]] to %[[VAL_16]]#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK:                 omp.yield
 ! CHECK:               }
 ! CHECK:               omp.terminator
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index e96945ef89e56..7632b22f98b7e 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -643,7 +643,7 @@ program test_alloc
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4
 ! LLVM: %[[TYPE_CODE:.*]] = load i32, ptr %[[TYPE_CODE_GEP]]
 ! LLVM: %[[BOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } undef, i64 %[[ELE_SIZE]], 1
-! LLVM: %[[BOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX0]], i32 20180515, 2
+! LLVM: %[[BOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX0]], i32 20240719, 2
 ! LLVM: %[[BOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX1]], i8 0, 3
 ! LLVM: %[[TYPE_CODE_TRUNC:.*]] = trunc i32 %[[TYPE_CODE]] to i8
 ! LLVM: %[[BOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX2]], i8 %[[TYPE_CODE_TRUNC]], 4
@@ -664,7 +664,7 @@ program test_alloc
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4
 ! LLVM: %[[TYPE_CODE:.*]] = load i32, ptr %[[TYPE_CODE_GEP]]
 ! LLVM: %[[BOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } undef, i64 %[[ELE_SIZE]], 1
-! LLVM: %[[BOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX0]], i32 20180515, 2
+! LLVM: %[[BOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX0]], i32 20240719, 2
 ! LLVM: %[[BOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX1]], i8 0, 3
 ! LLVM: %[[TYPE_CODE_TRUNC:.*]] = trunc i32 %[[TYPE_CODE]] to i8
 ! LLVM: %[[BOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX2]], i8 %[[TYPE_CODE_TRUNC]], 4
@@ -681,7 +681,7 @@ program test_alloc
 ! allocatable.
 
 ! LLVM-LABEL: define void @_QMpolyPtest_deallocate()
-! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyE.dt.p1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]]
+! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyE.dt.p1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]]
 ! LLVM: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ALLOCA1]]
 ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA2:[0-9]*]]
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyE.dt.p1, i32 0, i32 0)
diff --git a/flang/test/Lower/location.f90 b/flang/test/Lower/location.f90
new file mode 100644
index 0000000000000..a6ece31bbebed
--- /dev/null
+++ b/flang/test/Lower/location.f90
@@ -0,0 +1,13 @@
+! RUN: bbc -emit-hlfir --mlir-print-debuginfo %s -o - | FileCheck %s
+
+program test
+include 'location0.inc'
+
+end 
+
+! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} {
+! CHECK: fir.call @_FortranAioOutputAscii(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1 loc(fused<#fir<loc_kind_array[ base,  inclusion,  inclusion]>>["{{.*}}location1.inc":1:10, "{{.*}}location0.inc":1:1, "{{.*}}location.f90":4:1])
+! CHECK: return loc("{{.*}}location.f90":6:1)
+! CHECK: } loc("{{.*}}location.f90":3:1)
+
+
diff --git a/flang/test/Lower/location0.inc b/flang/test/Lower/location0.inc
new file mode 100644
index 0000000000000..d46282c73c51e
--- /dev/null
+++ b/flang/test/Lower/location0.inc
@@ -0,0 +1 @@
+include 'location1.inc'
diff --git a/flang/test/Lower/location1.inc b/flang/test/Lower/location1.inc
new file mode 100644
index 0000000000000..6a2671c61828e
--- /dev/null
+++ b/flang/test/Lower/location1.inc
@@ -0,0 +1 @@
+print *, 'from location.inc'
diff --git a/flang/test/Parser/OpenMP/proc-bind.f90 b/flang/test/Parser/OpenMP/proc-bind.f90
new file mode 100644
index 0000000000000..08bcf69e5e765
--- /dev/null
+++ b/flang/test/Parser/OpenMP/proc-bind.f90
@@ -0,0 +1,14 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+! CHECK: !$OMP PARALLEL  PROC_BIND(PRIMARY)
+
+! PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+! PARSE-TREE:  OmpBeginBlockDirective
+! PARSE-TREE:   OmpBlockDirective -> llvm::omp::Directive = parallel
+! PARSE-TREE:   OmpClauseList -> OmpClause -> ProcBind -> OmpProcBindClause -> Type = Primary
+subroutine sb1
+  !$omp parallel proc_bind(primary)
+  print *, "Hello"
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index 9d73204e3f5f6..9341f054d79d4 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,7 +23,8 @@ module m
   end subroutine
   subroutine test
     logical isPinned
-    real a(10), x, y, z
+    real, device :: a(10)
+    real :: x, y, z
     !$cuf kernel do(1) <<<*, *, stream = 1>>>
     do j = 1, 10
     end do
diff --git a/flang/test/Parser/recovery03.f90 b/flang/test/Parser/recovery03.f90
new file mode 100644
index 0000000000000..f3340f0d99e0e
--- /dev/null
+++ b/flang/test/Parser/recovery03.f90
@@ -0,0 +1,9 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! CHECK: error: misplaced declaration in the execution part
+! CHECK:  real, pointer :: p2(:,:)
+! CHECK: in the context: execution part construct
+real, allocatable, target :: a2(:,:)
+allocate(a2(2:11,0:9))
+real, pointer :: p2(:,:)
+p2 => a2(2:3,1:2)
+end
diff --git a/flang/test/Parser/recovery04.f90 b/flang/test/Parser/recovery04.f90
new file mode 100644
index 0000000000000..144ebd24f71b5
--- /dev/null
+++ b/flang/test/Parser/recovery04.f90
@@ -0,0 +1,24 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+module m
+ contains
+  !CHECK: expected end of statement
+  !CHECK: subroutine s1(var i, j)
+  subroutine s1(var i, j)
+  end subroutine
+  !CHECK: expected end of statement
+  !CHECK: subroutine s2[b]
+  subroutine s2[b]
+  end subroutine
+  !CHECK: expected end of statement
+  !CHECK: function f1(var i, j)
+  function f1(var i, j)
+  end function
+  !CHECK: expected end of statement
+  !CHECK: function f2[b]
+  function f2[b]
+  end function
+  !CHECK: expected end of statement
+  !CHECK: function f3(a,*)
+  function f3(a,*)
+  end function
+end
diff --git a/flang/test/Preprocessing/directive-contin-with-pp.F90 b/flang/test/Preprocessing/directive-contin-with-pp.F90
index 64f1dc43f72b4..544c6619f6b53 100644
--- a/flang/test/Preprocessing/directive-contin-with-pp.F90
+++ b/flang/test/Preprocessing/directive-contin-with-pp.F90
@@ -11,7 +11,7 @@
 
 module m
  contains
-  subroutine s(x1, x2, x3, x4, x5, x6, x7)
+  subroutine s1(x1, x2, x3, x4, x5, x6, x7)
 
 !dir$ ignore_tkr x1
 
@@ -51,11 +51,18 @@ subroutine s(x1, x2, x3, x4, x5, x6, x7)
     do j3 = 1, n
     end do
   end
-end
+
+COMMENT &
+  subroutine s2
+  end subroutine
+COMMENT&
+  subroutine s3
+  end subroutine
+end module
 
 !CHECK: MODULE m
 !CHECK: CONTAINS
-!CHECK:  SUBROUTINE s (x1, x2, x3, x4, x5, x6, x7)
+!CHECK:  SUBROUTINE s1 (x1, x2, x3, x4, x5, x6, x7)
 !CHECK:   !DIR$ IGNORE_TKR x1
 !CHECK:   !DIR$ IGNORE_TKR x2
 !CHECK:   !DIR$ IGNORE_TKR x3
@@ -73,4 +80,8 @@ subroutine s(x1, x2, x3, x4, x5, x6, x7)
 !CHECK:   DO j3=1_4,n
 !CHECK:   END DO
 !CHECK:  END SUBROUTINE
+!CHECK:  SUBROUTINE s2
+!CHECK:  END SUBROUTINE
+!CHECK:  SUBROUTINE s3
+!CHECK:  END SUBROUTINE
 !CHECK: END MODULE
diff --git a/flang/test/Preprocessing/line-in-contin.F90 b/flang/test/Preprocessing/line-in-contin.F90
new file mode 100644
index 0000000000000..138e579bffaa2
--- /dev/null
+++ b/flang/test/Preprocessing/line-in-contin.F90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s
+! CHECK: call foo( 0.)
+! CHECK: call foo( 1.)
+! CHECK: call foo( 2.)
+! CHECK: call foo( 3.)
+call foo( &
+# 100 "bar.h"
+         & 0.)
+call foo( &
+# 101 "bar.h"
+         1.)
+call foo( &
+# 102 "bar.h"
+         & 2. &
+    & )
+call foo( &
+# 103 "bar.h"
+         & 3. &
+    )
+end
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index 745895248ddf4..020d63f735596 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -470,12 +470,14 @@
 ! 2.13.1 master
 
   !$omp parallel
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !$omp master
   a=3.14
   !$omp end master
   !$omp end parallel
 
   !$omp parallel
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !ERROR: NUM_THREADS clause is not allowed on the MASTER directive
   !$omp master num_threads(4)
   a=3.14
diff --git a/flang/test/Semantics/OpenMP/copyprivate04.f90 b/flang/test/Semantics/OpenMP/copyprivate04.f90
new file mode 100644
index 0000000000000..291cf1103fb27
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/copyprivate04.f90
@@ -0,0 +1,112 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
+! OpenMP Version 5.2
+! 5.1.1 - Variables Referenced in a Construct
+! Copyprivate must accept variables that are predetermined as private.
+
+module m1
+  integer :: m
+end module
+
+program omp_copyprivate
+  use m1
+  implicit none
+  integer :: i
+  integer, save :: j
+  integer :: k
+  common /c/ k
+  real, parameter :: pi = 3.14
+  integer :: a1(10)
+
+  ! Local variables are private.
+  !$omp single
+    i = 123
+  !$omp end single copyprivate(i)
+  !$omp single
+  !$omp end single copyprivate(a1)
+
+  ! Variables with the SAVE attribute are not private.
+  !$omp single
+  !ERROR: COPYPRIVATE variable 'j' is not PRIVATE or THREADPRIVATE in outer context
+  !$omp end single copyprivate(j)
+
+  ! Common block variables are not private.
+  !$omp single
+  !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
+  !$omp end single copyprivate(/c/)
+  !$omp single
+  !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
+  !$omp end single copyprivate(k)
+
+  ! Module variables are not private.
+  !$omp single
+  !ERROR: COPYPRIVATE variable 'm' is not PRIVATE or THREADPRIVATE in outer context
+  !$omp end single copyprivate(m)
+
+  ! Parallel can make a variable shared.
+  !$omp parallel
+    !$omp single
+      i = 456
+    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+    !$omp end single copyprivate(i)
+    call sub(j, a1)
+  !$omp end parallel
+
+  !$omp parallel shared(i)
+    !$omp single
+      i = 456
+    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+    !$omp end single copyprivate(i)
+  !$omp end parallel
+
+  !FIXME: an error should be emitted in this case.
+  !       copyprivate(i) should be considered as a reference to i and a new
+  !       symbol should be created in `parallel` scope, for this case to be
+  !       handled properly.
+  !$omp parallel
+    !$omp single
+    !$omp end single copyprivate(i)
+  !$omp end parallel
+
+  ! Named constants are shared.
+  !$omp single
+  !ERROR: COPYPRIVATE variable 'pi' is not PRIVATE or THREADPRIVATE in outer context
+  !$omp end single copyprivate(pi)
+
+  !$omp parallel do
+  do i = 1, 10
+    !$omp parallel
+    !$omp single
+      j = i
+    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+    !$omp end single copyprivate(i)
+    !$omp end parallel
+  end do
+  !$omp end parallel do
+
+contains
+  subroutine sub(s1, a)
+    integer :: s1
+    integer :: a(:)
+
+    ! Dummy argument.
+    !$omp single
+    !$omp end single copyprivate(s1)
+
+    ! Assumed shape arrays are shared.
+    !$omp single
+    !ERROR: COPYPRIVATE variable 'a' is not PRIVATE or THREADPRIVATE in outer context
+    !$omp end single copyprivate(a)
+  end subroutine
+
+  integer function fun(f1)
+    integer :: f1
+
+    ! Dummy argument.
+    !$omp single
+    !$omp end single copyprivate(f1)
+
+    ! Function result is private.
+    !$omp single
+    !$omp end single copyprivate(fun)
+  end function
+end program
diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90
index 4e02235f58a1a..3b512a5b4f25e 100644
--- a/flang/test/Semantics/OpenMP/do05-positivecase.f90
+++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90
@@ -20,12 +20,12 @@ program omp_do
   !$omp parallel  default(shared)
   !$omp do
   !DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
-  !REF: /omp_do/n
+  !DEF: /omp_do/OtherConstruct2/n HostAssoc INTEGER(4)
   do i=1,n
     !$omp parallel
     !$omp single
     !DEF: /work EXTERNAL (Subroutine) ProcEntity
-    !REF: /omp_do/OtherConstruct2/OtherConstruct1/i
+    !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
     call work(i, 1)
     !$omp end single
     !$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/do20.f90 b/flang/test/Semantics/OpenMP/do20.f90
index 915d01e69edd7..0cafae76b86b0 100644
--- a/flang/test/Semantics/OpenMP/do20.f90
+++ b/flang/test/Semantics/OpenMP/do20.f90
@@ -10,7 +10,7 @@ subroutine shared_iv
 
   !$omp parallel shared(i)
     !$omp single
-      !REF: /shared_iv/i
+      !DEF: /shared_iv/OtherConstruct1/i HostAssoc INTEGER(4)
       do i = 0, 1
       end do
     !$omp end single
diff --git a/flang/test/Semantics/OpenMP/flush02.f90 b/flang/test/Semantics/OpenMP/flush02.f90
index 18a0d0356bbd7..d12c76bad0334 100644
--- a/flang/test/Semantics/OpenMP/flush02.f90
+++ b/flang/test/Semantics/OpenMP/flush02.f90
@@ -80,6 +80,7 @@
 
   !$omp parallel num_threads(4)
     array = (/1, 2, 3, 4, 5, 6, 7, 8, 9, 10/)
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !$omp master
       !$omp flush (array)
     !$omp end master
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 92d2421d06f97..2abe3a0e16d62 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -15,14 +15,14 @@ subroutine implicit_dsa_test1
   !$omp task private(y) shared(z)
     !DEF: /implicit_dsa_test1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
     !DEF: /implicit_dsa_test1/OtherConstruct1/y (OmpPrivate) HostAssoc INTEGER(4)
-    !REF: /implicit_dsa_test1/z
+    !DEF: /implicit_dsa_test1/OtherConstruct1/z HostAssoc INTEGER(4)
     x = y + z
   !$omp end task
 
   !$omp task default(shared)
-    !REF: /implicit_dsa_test1/x
-    !REF: /implicit_dsa_test1/y
-    !REF: /implicit_dsa_test1/z
+    !DEF: /implicit_dsa_test1/OtherConstruct2/x HostAssoc INTEGER(4)
+    !DEF: /implicit_dsa_test1/OtherConstruct2/y HostAssoc INTEGER(4)
+    !DEF: /implicit_dsa_test1/OtherConstruct2/z HostAssoc INTEGER(4)
     x = y + z
   !$omp end task
 
@@ -61,16 +61,16 @@ subroutine implicit_dsa_test3
 
   !$omp parallel
     !$omp task
-      !REF: /implicit_dsa_test3/x
+      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
       x = 1
-      !REF: /implicit_dsa_test3/y
+      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4)
       y = 1
     !$omp end task
 
     !$omp task firstprivate(x)
       !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate) HostAssoc INTEGER(4)
       x = 1
-      !REF: /implicit_dsa_test3/z
+      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/z HostAssoc INTEGER(4)
       z = 1
     !$omp end task
   !$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/nested-barrier.f90 b/flang/test/Semantics/OpenMP/nested-barrier.f90
index cad31d7985607..aae283229e330 100644
--- a/flang/test/Semantics/OpenMP/nested-barrier.f90
+++ b/flang/test/Semantics/OpenMP/nested-barrier.f90
@@ -75,6 +75,7 @@ program omp_nest_barrier
   end do
   !$omp end critical
 
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !$omp master
   do i = 1, 10
     k = k + 1
@@ -107,6 +108,7 @@ program omp_nest_barrier
   end do
   !$omp end ordered
 
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !$omp master
   do i = 1, 10
     !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
diff --git a/flang/test/Semantics/OpenMP/nested-master.f90 b/flang/test/Semantics/OpenMP/nested-master.f90
index ef7d2cef6f88a..069de67cafae2 100644
--- a/flang/test/Semantics/OpenMP/nested-master.f90
+++ b/flang/test/Semantics/OpenMP/nested-master.f90
@@ -9,6 +9,7 @@ program omp_nest_master
   !$omp do
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -16,6 +17,7 @@ program omp_nest_master
   end do
 
   !$omp sections 
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     do i = 1, 10
@@ -25,6 +27,7 @@ program omp_nest_master
   !$omp end sections
 
   !$omp single 
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     do i = 1, 10
@@ -38,6 +41,7 @@ program omp_nest_master
   !$omp task
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -48,6 +52,7 @@ program omp_nest_master
   !$omp taskloop
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -58,6 +63,7 @@ program omp_nest_master
   !$omp target parallel do simd
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: The only OpenMP constructs that can be encountered during execution of a 'SIMD' region are the `ATOMIC` construct, the `LOOP` construct, the `SIMD` construct and the `ORDERED` construct with the `SIMD` clause.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
@@ -69,6 +75,7 @@ program omp_nest_master
   !$omp critical
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !$omp master
     j = j -1
     !$omp end master
@@ -78,6 +85,7 @@ program omp_nest_master
   !$omp ordered
   do i = 1, 10
     k = k + 1
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !$omp master
     j = j -1
     !$omp end master
@@ -91,6 +99,7 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
+      !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
       !$omp master
       j = j -1
       !$omp end master
@@ -107,6 +116,7 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
+      !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
       !$omp master
       j = j -1
       !$omp end master
@@ -123,6 +133,7 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
+      !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
       !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
       !$omp master
       j = j -1
@@ -140,6 +151,7 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
+      !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
       !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
       !$omp master
       j = j -1
diff --git a/flang/test/Semantics/OpenMP/nested-teams.f90 b/flang/test/Semantics/OpenMP/nested-teams.f90
index 80c59e07fbaa6..f3b96b0ab4390 100644
--- a/flang/test/Semantics/OpenMP/nested-teams.f90
+++ b/flang/test/Semantics/OpenMP/nested-teams.f90
@@ -42,6 +42,7 @@ program main
   !$omp end teams
   end do
 
+  !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
   !$omp master
   !ERROR: TEAMS region can only be strictly nested within the implicit parallel region or TARGET region
   !$omp teams
diff --git a/flang/test/Semantics/OpenMP/ordered-simd.f90 b/flang/test/Semantics/OpenMP/ordered-simd.f90
index c33ec745f2dda..ed52b75949100 100644
--- a/flang/test/Semantics/OpenMP/ordered-simd.f90
+++ b/flang/test/Semantics/OpenMP/ordered-simd.f90
@@ -95,6 +95,7 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP CRITICAL  
     C =  C - A * B
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !$OMP MASTER
     DO I = 1,N
       !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region.
@@ -107,6 +108,7 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP ORDERED  
     C =  C - A * B
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !$OMP MASTER
     DO I = 1,N
       !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region.
@@ -119,6 +121,7 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP TASK  
     C =  C - A * B
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$OMP MASTER
     DO I = 1,N
@@ -133,6 +136,7 @@ SUBROUTINE ORDERED_BAD(N)
   !$OMP TASKLOOP
   DO J= 1,N  
     C =  C - A * B
+    !WARNING: OpenMP directive 'master' has been deprecated, please use 'masked' instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$OMP MASTER
     DO I = 1,N
diff --git a/flang/test/Semantics/OpenMP/parallel-shared05.f90 b/flang/test/Semantics/OpenMP/parallel-shared05.f90
new file mode 100644
index 0000000000000..bcc1a9437c11e
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-shared05.f90
@@ -0,0 +1,17 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! OpenMP Version 4.5
+! 2.15.3.2 parallel shared Clause
+program omp_parallel_shared
+  type derived
+    integer :: field(2, 3)
+  end type
+  integer :: field(2)
+  type(derived) :: y
+
+  ! Check that derived type fields and variables with the same name
+  ! don't cause errors.
+  !$omp parallel
+    y%field(2, 3) = 1
+    field(1) = 1
+  !$omp end parallel
+end program omp_parallel_shared
diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90
index 99163327cdafa..9442fbd4d5978 100644
--- a/flang/test/Semantics/OpenMP/reduction08.f90
+++ b/flang/test/Semantics/OpenMP/reduction08.f90
@@ -15,7 +15,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct1/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !REF: /omp_reduction/m
+    !DEF: /omp_reduction/OtherConstruct1/m HostAssoc INTEGER(4)
     k = max(k, m)
   end do
   !$omp end parallel do
@@ -25,7 +25,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct2/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !REF: /omp_reduction/m
+    !DEF: /omp_reduction/OtherConstruct2/m HostAssoc INTEGER(4)
     k = min(k, m)
   end do
   !$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct3/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !REF: /omp_reduction/m
+    !DEF: /omp_reduction/OtherConstruct3/m HostAssoc INTEGER(4)
     k = iand(k, m)
   end do
   !$omp end parallel do
@@ -45,7 +45,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct4/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !REF: /omp_reduction/m
+    !DEF: /omp_reduction/OtherConstruct4/m HostAssoc INTEGER(4)
     k = ior(k, m)
   end do
   !$omp end parallel do
@@ -55,7 +55,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct5/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !REF: /omp_reduction/m
+    !DEF: /omp_reduction/OtherConstruct5/m HostAssoc INTEGER(4)
     k = ieor(k,m)
   end do
   !$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90
index dbc8d1b060e65..1af2fc4fd9691 100644
--- a/flang/test/Semantics/OpenMP/reduction09.f90
+++ b/flang/test/Semantics/OpenMP/reduction09.f90
@@ -26,7 +26,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:a(10))
   !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !REF: /omp_reduction/k
+    !DEF: /omp_reduction/OtherConstruct2/k HostAssoc INTEGER(4)
     k = k+1
   end do
   !$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:a(1:10:1))
   !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !REF: /omp_reduction/k
+    !DEF: /omp_reduction/OtherConstruct3/k HostAssoc INTEGER(4)
     k = k+1
   end do
   !$omp end parallel do
@@ -43,7 +43,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:b(1:10:1,1:5,2))
   !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !REF: /omp_reduction/k
+    !DEF: /omp_reduction/OtherConstruct4/k HostAssoc INTEGER(4)
     k = k+1
   end do
   !$omp end parallel do
@@ -51,7 +51,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:b(1:10:1,1:5,2:5:1))
   !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !REF: /omp_reduction/k
+    !DEF: /omp_reduction/OtherConstruct5/k HostAssoc INTEGER(4)
     k = k+1
   end do
   !$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index 0b435a9ab9850..ecfb8622f8179 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -48,7 +48,7 @@ program mm
  !DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
  do i=1,10
   !DEF: /mm/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
-  !REF: /mm/b
+  !DEF: /mm/OtherConstruct1/b HostAssoc INTEGER(4)
   !REF: /mm/OtherConstruct1/i
   a = a+b(i)
   !DEF: /mm/OtherConstruct1/t (OmpPrivate) HostAssoc TYPE(myty)
@@ -62,7 +62,7 @@ program mm
   !REF: /mm/OtherConstruct1/i
   !REF: /mm/OtherConstruct1/y
   x = a+i+y
-  !REF: /mm/c
+  !DEF: /mm/OtherConstruct1/c HostAssoc REAL(4)
   c = 3.0
  end do
 end program
diff --git a/flang/test/Semantics/OpenMP/symbol02.f90 b/flang/test/Semantics/OpenMP/symbol02.f90
index f6ffc5500d0a4..c199c526e1fa8 100644
--- a/flang/test/Semantics/OpenMP/symbol02.f90
+++ b/flang/test/Semantics/OpenMP/symbol02.f90
@@ -15,9 +15,9 @@
   a = 3.
   !DEF: /MainProgram1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
   b = 4
-  !REF: /MainProgram1/c
+  !DEF: /MainProgram1/OtherConstruct1/c HostAssoc REAL(4)
   c = 5
-  !DEF: /MainProgram1/d (Implicit) ObjectEntity REAL(4)
+  !DEF: /MainProgram1/OtherConstruct1/d HostAssoc REAL(4)
   d = 6
   !$omp end parallel
   !DEF: /MainProgram1/a (Implicit) ObjectEntity REAL(4)
diff --git a/flang/test/Semantics/OpenMP/symbol03.f90 b/flang/test/Semantics/OpenMP/symbol03.f90
index 93e9b7a3eae6b..ba941b9c9e7c4 100644
--- a/flang/test/Semantics/OpenMP/symbol03.f90
+++ b/flang/test/Semantics/OpenMP/symbol03.f90
@@ -9,10 +9,10 @@
   !$omp parallel  private(a) shared(b)
   !DEF: /MainProgram1/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   a = 3.
-  !REF: /MainProgram1/b
+  !DEF: /MainProgram1/OtherConstruct1/b HostAssoc REAL(4)
   b = 4
   !$omp parallel  private(b) shared(a)
-  !REF: /MainProgram1/OtherConstruct1/a
+  !DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/a HostAssoc REAL(4)
   a = 5.
   !DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
   b = 6
diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90
index fa0a8f65a4294..1ad0c10a40135 100644
--- a/flang/test/Semantics/OpenMP/symbol05.f90
+++ b/flang/test/Semantics/OpenMP/symbol05.f90
@@ -15,10 +15,10 @@ subroutine foo
     !DEF: /mm/foo/a ObjectEntity INTEGER(4)
     integer :: a = 3
     !$omp parallel
-    !REF: /mm/foo/a
+    !DEF: /mm/foo/OtherConstruct1/a HostAssoc INTEGER(4)
     a = 1
     !DEF: /mm/i PUBLIC (Implicit, OmpThreadprivate) ObjectEntity INTEGER(4)
-    !REF: /mm/foo/a
+    !REF: /mm/foo/OtherConstruct1/a
     i = a
     !$omp end parallel
     !REF: /mm/foo/a
diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90
index e2250f5c7908a..8b4716999820b 100644
--- a/flang/test/Semantics/OpenMP/symbol07.f90
+++ b/flang/test/Semantics/OpenMP/symbol07.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
   !$omp parallel  default(none) private(a) shared(b)
   !DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   !REF: /function_call_in_region/foo
-  !REF: /function_call_in_region/b
+  !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
   a = foo(b)
   !$omp end parallel
   !REF: /function_call_in_region/a
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 3af85af74ee97..69ccd17391b54 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -28,18 +28,18 @@ subroutine test_do
  !DEF: /test_do/k ObjectEntity INTEGER(4)
  integer i, j, k
 !$omp parallel
- !REF: /test_do/i
+ !DEF: /test_do/OtherConstruct1/i HostAssoc INTEGER(4)
  i = 99
 !$omp do  collapse(2)
  !DEF: /test_do/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
  do i=1,5
   !DEF: /test_do/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do j=6,10
-   !REF: /test_do/a
+   !DEF: /test_do/OtherConstruct1/a HostAssoc REAL(4)
    a(1,1,1) = 0.
    !DEF: /test_do/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_do/a
+    !REF: /test_do/OtherConstruct1/a
     !REF: /test_do/OtherConstruct1/k
     !REF: /test_do/OtherConstruct1/OtherConstruct1/j
     !REF: /test_do/OtherConstruct1/OtherConstruct1/i
@@ -65,11 +65,11 @@ subroutine test_pardo
  do i=1,5
    !DEF: /test_pardo/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
     do j=6,10
-   !REF: /test_pardo/a
+   !DEF: /test_pardo/OtherConstruct1/a HostAssoc REAL(4)
    a(1,1,1) = 0.
    !DEF: /test_pardo/OtherConstruct1/k (OmpPrivate) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_pardo/a
+    !REF: /test_pardo/OtherConstruct1/a
     !REF: /test_pardo/OtherConstruct1/k
     !REF: /test_pardo/OtherConstruct1/j
     !REF: /test_pardo/OtherConstruct1/i
@@ -138,15 +138,15 @@ subroutine dotprod (b, c, n, block_size, num_teams, block_threads)
  do i0=1,n,block_size
 !$omp parallel do  reduction(+:sum)
   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
-  !REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0
+  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0 HostAssoc INTEGER(4)
   !DEF: /dotprod/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-  !REF: /dotprod/block_size
-  !REF: /dotprod/n
+  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/block_size HostAssoc INTEGER(4)
+  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/n HostAssoc INTEGER(4)
   do i=i0,min(i0+block_size, n)
    !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/sum (OmpReduction) HostAssoc REAL(4)
-   !REF: /dotprod/b
+   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/b HostAssoc REAL(4)
    !REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i
-   !REF: /dotprod/c
+   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/c HostAssoc REAL(4)
    sum = sum+b(i)*c(i)
   end do
  end do
@@ -174,7 +174,7 @@ subroutine test_simd
   do j=6,10
    !DEF: /test_simd/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_simd/a
+    !DEF: /test_simd/OtherConstruct1/a HostAssoc REAL(4)
     !REF: /test_simd/OtherConstruct1/k
     !REF: /test_simd/OtherConstruct1/j
     !REF: /test_simd/OtherConstruct1/i
@@ -201,7 +201,7 @@ subroutine test_simd_multi
   do j=6,10
    !DEF: /test_simd_multi/OtherConstruct1/k (OmpLastPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_simd_multi/a
+    !DEF: /test_simd_multi/OtherConstruct1/a HostAssoc REAL(4)
     !REF: /test_simd_multi/OtherConstruct1/k
     !REF: /test_simd_multi/OtherConstruct1/j
     !REF: /test_simd_multi/OtherConstruct1/i
@@ -223,11 +223,11 @@ subroutine test_seq_loop
   !REF: /test_seq_loop/j
   j = -1
   !$omp parallel
-  !REF: /test_seq_loop/i
-  !REF: /test_seq_loop/j
+  !DEF: /test_seq_loop/OtherConstruct1/i HostAssoc INTEGER(4)
+  !DEF: /test_seq_loop/OtherConstruct1/j HostAssoc INTEGER(4)
   print *, i, j
   !$omp parallel
-  !REF: /test_seq_loop/i
+  !DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
   !DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   print *, i, j
   !$omp do
@@ -237,12 +237,12 @@ subroutine test_seq_loop
    do j=1,10
    end do
   end do
-  !REF: /test_seq_loop/i
+  !REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i
   !REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j
   print *, i, j
   !$omp end parallel
-  !REF: /test_seq_loop/i
-  !REF: /test_seq_loop/j
+  !REF: /test_seq_loop/OtherConstruct1/i
+  !REF: /test_seq_loop/OtherConstruct1/j
   print *, i, j
   !$omp end parallel
   !REF: /test_seq_loop/i
diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90
index e2250f5c7908a..8b4716999820b 100644
--- a/flang/test/Semantics/OpenMP/symbol09.f90
+++ b/flang/test/Semantics/OpenMP/symbol09.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
   !$omp parallel  default(none) private(a) shared(b)
   !DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   !REF: /function_call_in_region/foo
-  !REF: /function_call_in_region/b
+  !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
   a = foo(b)
   !$omp end parallel
   !REF: /function_call_in_region/a
diff --git a/flang/test/Semantics/array-constr-values.f90 b/flang/test/Semantics/array-constr-values.f90
index b93f774e0b0d0..6742cd35fe52d 100644
--- a/flang/test/Semantics/array-constr-values.f90
+++ b/flang/test/Semantics/array-constr-values.f90
@@ -35,8 +35,10 @@ subroutine arrayconstructorvalues()
   intarray = [integer:: EMPLOYEE (19, "Jack"), 2, 3, 4, 5]
 
   ! C7112
+  !ERROR: Dimension 1 of left-hand side has extent 3, but right-hand side has extent 2
   !ERROR: Value in array constructor of type 'INTEGER(4)' could not be converted to the type of the array 'employee'
   emparray = (/ EMPLOYEE:: EMPLOYEE(19, "Ganesh"), EMPLOYEE(22, "Omkar"), 19 /)
+  !ERROR: Dimension 1 of left-hand side has extent 3, but right-hand side has extent 2
   !ERROR: Value in array constructor of type 'employeer' could not be converted to the type of the array 'employee'
   emparray = (/ EMPLOYEE:: EMPLOYEE(19, "Ganesh"), EMPLOYEE(22, "Ram"),EMPLOYEER("ShriniwasPvtLtd") /)
 
diff --git a/flang/test/Semantics/assign10.f90 b/flang/test/Semantics/assign10.f90
index 6b98097bfe62c..f0ea4b251a6d6 100644
--- a/flang/test/Semantics/assign10.f90
+++ b/flang/test/Semantics/assign10.f90
@@ -1,7 +1,11 @@
 ! RUN: %python %S/test_errors.py %s %flang_fc1
 ! Shape conformance checks on assignments
 program test
+  type t
+    integer n
+  end type
   real :: a0, a1a(2), a1b(3), a2a(2,3), a2b(3,2)
+  type(t) c(10)
   a0 = 0. ! ok
   !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
   a0 = [0.]
@@ -20,4 +24,6 @@ program test
   a2a(1,:) = a1a
   !ERROR: Dimension 1 of left-hand side has extent 2, but right-hand side has extent 3
   a2a = a2b
+  !ERROR: Dimension 1 of left-hand side has extent 10, but right-hand side has extent 0
+  c(1:10) = c(10:1)
 end
diff --git a/flang/test/Semantics/associate03.f90 b/flang/test/Semantics/associate03.f90
new file mode 100644
index 0000000000000..f57dc17839aab
--- /dev/null
+++ b/flang/test/Semantics/associate03.f90
@@ -0,0 +1,79 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
+! A construct entity does not have the POINTER or ALLOCATABLE attribute,
+! except in SELECT RANK.
+
+subroutine test(up,ua,rp,ra)
+  class(*), pointer :: up
+  class(*), allocatable :: ua
+  real, pointer :: rp(..)
+  real, allocatable :: ra(..)
+  real, target :: x
+  real, pointer :: p
+  real, allocatable :: a
+  associate (s => p)
+    !ERROR: The left-hand side of a pointer assignment is not definable
+    !BECAUSE: 's' is not a pointer
+    s => x
+    !ERROR: Entity in ALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    allocate(s)
+    !ERROR: Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    deallocate(s)
+    !ERROR: 's' may not appear in NULLIFY
+    !BECAUSE: 's' is not a pointer
+    nullify(s)
+  end associate
+  select type(s => up)
+  type is (real)
+    !ERROR: The left-hand side of a pointer assignment is not definable
+    !BECAUSE: 's' is not a pointer
+    s => x
+    !ERROR: Entity in ALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    allocate(s)
+    !ERROR: Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    deallocate(s)
+    !ERROR: 's' may not appear in NULLIFY
+    !BECAUSE: 's' is not a pointer
+    nullify(s)
+  end select
+  select rank(s => rp)
+  rank(0)
+    s => x ! ok
+    allocate(s) ! ok
+    deallocate(s) ! ok
+    nullify(s) ! ok
+  !ERROR: RANK (*) cannot be used when selector is POINTER or ALLOCATABLE
+  rank(*)
+  rank default
+    !ERROR: The left-hand side of a pointer assignment must not be an assumed-rank dummy argument
+    !ERROR: pointer 's' associated with object 'x' with incompatible type or shape
+    s => x
+    !ERROR: An assumed-rank dummy argument may not appear in an ALLOCATE statement
+    allocate(s)
+    deallocate(s) ! ok
+    nullify(s) ! ok
+  end select
+  associate (s => a)
+    !ERROR: Entity in ALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    allocate(s)
+    !ERROR: Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    deallocate(s)
+  end associate
+  select type(s => ua)
+  type is (real)
+    !ERROR: Entity in ALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    allocate(s)
+    !ERROR: Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute
+    deallocate(s)
+  end select
+  select rank(s => ra)
+  rank(0)
+    allocate(s) ! ok
+    deallocate(s) ! ok
+  !ERROR: RANK (*) cannot be used when selector is POINTER or ALLOCATABLE
+  rank(*)
+  rank default
+    !ERROR: An assumed-rank dummy argument may not appear in an ALLOCATE statement
+    allocate(s)
+    deallocate(s) ! ok
+  end select
+end
diff --git a/flang/test/Semantics/associate04.f90 b/flang/test/Semantics/associate04.f90
new file mode 100644
index 0000000000000..5a73ba419c0a3
--- /dev/null
+++ b/flang/test/Semantics/associate04.f90
@@ -0,0 +1,7 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+subroutine bad(a)
+  real :: a(..)
+  !ERROR: Selector must not be assumed-rank
+  associate(x => a)
+  end associate
+end subroutine
diff --git a/flang/test/Semantics/call02.f90 b/flang/test/Semantics/call02.f90
index bc3dd6075969c..0ec5530f98089 100644
--- a/flang/test/Semantics/call02.f90
+++ b/flang/test/Semantics/call02.f90
@@ -72,6 +72,7 @@ subroutine callme(f)
  contains
   elemental real function elem03(x)
     real, value :: x
+    elem03 = 0.
   end function
   subroutine test
     intrinsic :: cos
@@ -87,6 +88,7 @@ subroutine test
    contains
     elemental real function elem04(x)
       real, value :: x
+      elem04 = 0.
     end function
   end subroutine
 end module
diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90
index 8a4386e28e2bc..a06fe4f196c8c 100644
--- a/flang/test/Semantics/call05.f90
+++ b/flang/test/Semantics/call05.f90
@@ -155,11 +155,13 @@ subroutine smb(b)
 
   function return_deferred_length_ptr()
     character(len=:), pointer :: return_deferred_length_ptr
+    return_deferred_length_ptr => p2
   end function
 
   function return_explicit_length_ptr(n)
     integer :: n
     character(len=n), pointer :: return_explicit_length_ptr
+    return_explicit_length_ptr => p2(1:n)
   end function
 
   subroutine test()
diff --git a/flang/test/Semantics/contiguous01.f90 b/flang/test/Semantics/contiguous01.f90
index 0f086624a20ae..89382775261b8 100644
--- a/flang/test/Semantics/contiguous01.f90
+++ b/flang/test/Semantics/contiguous01.f90
@@ -30,8 +30,10 @@ function func(ashape,arank) result(r)
     contiguous r2
     !PORTABILITY: CONTIGUOUS entity 'e' should be an array pointer, assumed-shape, or assumed-rank
     entry e() result(r2)
+    r2 = 0
   end
   function fp()
     real, pointer, contiguous :: fp(:) ! ok
+    nullify(fp)
   end
 end
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index d2d4d239815e4..195ddac11d575 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -18,6 +18,8 @@ module m
 end
 
 program main
+  integer, device :: a_d(10 ,10)
+  integer :: b(10, 10)
   !$cuf kernel do <<< *, * >>> ! ok
   do j = 1, 0
   end do
@@ -90,4 +92,12 @@ program main
     else if (ifunc() /= 1) then
     end if
   end do
+
+  !$cuf kernel do (2) <<<*, *>>>
+  do j = 1, 10
+     do i = 1, 10
+        !ERROR: Host array 'b' cannot be present in CUF kernel
+        a_d(i,j) = b(i,j)
+     enddo
+  enddo
 end
diff --git a/flang/test/Semantics/parent-comp-name.f90 b/flang/test/Semantics/parent-comp-name.f90
new file mode 100644
index 0000000000000..d7878430a5849
--- /dev/null
+++ b/flang/test/Semantics/parent-comp-name.f90
@@ -0,0 +1,70 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! Every other Fortran compiler (but one) interprets the names of parent
+! components like this when the names of their types are the product of
+! USE association with renaming.
+
+module m1
+  type originalName
+    integer m
+  end type
+end
+
+module m2
+  use m1, newName => originalName
+  type, extends(newName) :: extended
+    integer n
+  end type
+  type, extends(newName) :: extended2
+    integer originalName ! ok
+  end type
+ contains
+  subroutine s1
+    type(extended) x
+    type(extended2) x2
+    print *, x%newName%m ! ok
+    !ERROR: Component 'originalname' not found in derived type 'extended'
+    print *, x%originalName
+    print *, extended(newName=newName(m=1),n=2) ! ok
+    !ERROR: Structure constructor lacks a value for component 'm'
+    !ERROR: Keyword 'originalname=' does not name a component of derived type 'extended'
+    !ERROR: Keyword 'm=' may not appear in a reference to a procedure with an implicit interface
+    print *, extended(originalName=originalName(m=1),n=2)
+    !ERROR: Value in structure constructor of type 'REAL(4)' is incompatible with component 'newname' of type 'newname'
+    !ERROR: Keyword 'm=' may not appear in a reference to a procedure with an implicit interface
+    print *, extended(newName=originalName(m=1),n=2)
+    !ERROR: Structure constructor lacks a value for component 'm'
+    !ERROR: Keyword 'originalname=' does not name a component of derived type 'extended'
+    print *, extended(originalName=newName(m=1),n=2)
+    print *, x2%newName%m ! ok
+    print *, x2%originalName ! ok
+    print *, extended2(newName=newName(m=1),originalName=2) ! ok
+  end
+end
+
+module m3
+  use m2
+ contains
+  ! Same as above, but not in the same module as the derived
+  ! types' definitions.
+  subroutine s2
+    type(extended) x
+    type(extended2) x2
+    print *, x%newName%m ! ok
+    !ERROR: Component 'originalname' not found in derived type 'extended'
+    print *, x%originalName
+    print *, extended(newName=newName(m=1),n=2) ! ok
+    !ERROR: Structure constructor lacks a value for component 'm'
+    !ERROR: Keyword 'originalname=' does not name a component of derived type 'extended'
+    !ERROR: Keyword 'm=' may not appear in a reference to a procedure with an implicit interface
+    print *, extended(originalName=originalName(m=1),n=2)
+    !ERROR: Value in structure constructor of type 'REAL(4)' is incompatible with component 'newname' of type 'newname'
+    !ERROR: Keyword 'm=' may not appear in a reference to a procedure with an implicit interface
+    print *, extended(newName=originalName(m=1),n=2)
+    !ERROR: Structure constructor lacks a value for component 'm'
+    !ERROR: Keyword 'originalname=' does not name a component of derived type 'extended'
+    print *, extended(originalName=newName(m=1),n=2)
+    print *, x2%newName%m ! ok
+    print *, x2%originalName ! ok
+    print *, extended2(newName=newName(m=1),originalName=2) ! ok
+  end
+end
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
index 95ff2e87c09b4..92d12ab149010 100644
--- a/flang/test/Semantics/reduce.cuf
+++ b/flang/test/Semantics/reduce.cuf
@@ -1,9 +1,9 @@
 ! RUN: %python %S/test_errors.py %s %flang_fc1
 subroutine s(n,m,a,l)
   integer, intent(in) :: n
-  integer, intent(in) :: m(n)
-  real, intent(in) :: a(n)
-  logical, intent(in) :: l(n)
+  integer, device, intent(in) :: m(n)
+  real, device, intent(in) :: a(n)
+  logical, device, intent(in) :: l(n)
   integer j, mr
   real ar
   logical lr
diff --git a/flang/test/Semantics/resolve33.f90 b/flang/test/Semantics/resolve33.f90
index 4b27a8d231f52..88ebd1be4637b 100644
--- a/flang/test/Semantics/resolve33.f90
+++ b/flang/test/Semantics/resolve33.f90
@@ -10,7 +10,7 @@
 ! in that derived-type-def.
 
 module m
-  !ERROR: Duplicate type parameter name: 'a'
+  !ERROR: Type parameter, component, or procedure binding 'a' already defined in this type
   type t1(a, b, a)
     integer, kind :: a
     integer(8), len :: b
@@ -23,23 +23,31 @@ module m
   !ERROR: No definition found for type parameter 'b'
   type t3(a, b)
     integer, kind :: a
+    !ERROR: Component 'b' is already declared in this derived type
     integer :: b
   end type
   type t4(a)
     integer, kind :: a
-    !ERROR: 'd' is not a type parameter of this derived type
+    !ERROR: 'd' is not a parameter of this derived type
     integer(8), len :: d
   end type
   type t5(a, b)
     integer, len :: a
     integer, len :: b
-    !ERROR: Type parameter, component, or procedure binding 'a' already defined in this type
+    !ERROR: Type parameter 'a' was already declared in this derived type
     integer, len :: a
   end type
   !ERROR: No definition found for type parameter 'k'
   !ERROR: No definition found for type parameter 'l'
   type :: t6(k, l)
+    !ERROR: Type parameter 'k' was referenced before being declared
+    !ERROR: Type parameter 'l' was referenced before being declared
     character(kind=k, len=l) :: d3
   end type
   type(t6(2, 10)) :: x3
+  type :: t7(k1,k2)
+    !ERROR: Type parameter 'k2' was referenced before being declared
+    integer(kind(k2)), kind :: k1
+    integer(kind(k1)), kind :: k2
+  end type
 end module
diff --git a/flang/test/Semantics/resolve53.f90 b/flang/test/Semantics/resolve53.f90
index 1b0f3f8f8e385..0ab4b7c4d303e 100644
--- a/flang/test/Semantics/resolve53.f90
+++ b/flang/test/Semantics/resolve53.f90
@@ -227,14 +227,17 @@ module m14
   real function f1(x, y)
     real, intent(in) :: x
     logical, intent(in) :: y
+    f1 = 0.
   end
   integer function f2(x, y)
     integer, intent(in) :: x
     logical, intent(in) :: y
+    f2 = 0.
   end
   real function f3(x, y)
     real, value :: x
     logical, value :: y
+    f3 = 0.
   end
 end module
 
@@ -447,12 +450,15 @@ module m19
 contains
   integer function f1(i)
     integer, intent(in) :: i
+    f1 = 0
   end
   integer function f2(i, j)
     integer, value :: i, j
+    f2 = 0
   end
   integer function f3(i, j)
     integer, intent(in) :: i, j
+    f3 = 0
   end
 end
 
diff --git a/flang/test/Semantics/struct03.f90 b/flang/test/Semantics/struct03.f90
new file mode 100644
index 0000000000000..a334cca8945fc
--- /dev/null
+++ b/flang/test/Semantics/struct03.f90
@@ -0,0 +1,7 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+  structure /s/
+    !ERROR: entity declarations are required on a nested structure
+    structure /nested/
+    end structure
+  end structure
+end
diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90
new file mode 100644
index 0000000000000..a372fcd544fee
--- /dev/null
+++ b/flang/test/Semantics/undef-result01.f90
@@ -0,0 +1,144 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror
+
+!WARNING: Function result is never defined
+function basic()
+end
+
+function defdByIntentOut()
+  call intentout(defdByIntentOut)
+ contains
+  subroutine intentout(x)
+    real, intent(out) :: x
+  end
+end
+
+function defdByIntentInOut()
+  call intentinout(defdByIntentInOut)
+ contains
+  subroutine intentInout(x)
+    real, intent(out) :: x
+  end
+end
+
+function defdByIntentInPtr()
+  real, target :: defdByIntentInPtr
+  call intentInPtr(defdByIntentInPtr)
+ contains
+  subroutine intentInPtr(p)
+    real, intent(in), pointer :: p
+  end
+end
+
+!WARNING: Function result is never defined
+function notDefdByCall()
+  call intentin(notDefdByCall)
+ contains
+  subroutine intentin(n)
+    integer, intent(in) :: n
+  end
+end
+
+!WARNING: Function result is never defined
+function basicAlloc()
+  real, allocatable :: basicAlloc
+  allocate(basicAlloc)
+end
+
+function sourcedAlloc()
+  real, allocatable :: sourcedAlloc
+  allocate(sourcedAlloc, source=0.)
+end
+
+function defdByEntry()
+  entry entry1
+  entry1 = 0.
+end
+
+function defdByEntry2()
+  entry entry2() result(entryResult)
+  entryResult = 0.
+end
+
+function usedAsTarget()
+  real, target :: usedAsTarget
+  real, pointer :: p
+  p => usedAsTarget
+end
+
+function entryUsedAsTarget()
+  real, target :: entryResult
+  real, pointer :: p
+  entry entry5() result(entryResult)
+  p => entryResult
+end
+
+function defdByCall()
+  call implicitInterface(defdByCall)
+end
+
+function defdInInternal()
+ contains
+  subroutine internal
+    defdInInternal = 0.
+  end
+end
+
+function defdByEntryInInternal()
+  entry entry3() result(entryResult)
+ contains
+  subroutine internal
+    entryResult = 0.
+  end
+end
+
+type(defaultInitialized) function defdByDefault()
+  type defaultInitialized
+    integer :: n = 123
+  end type
+end
+
+integer function defdByDo()
+  do defdByDo = 1, 10
+  end do
+end
+
+function defdByRead()
+  read(*,*) defdByRead
+end function
+
+function defdByNamelist()
+  namelist /nml/ defdByNamelist
+  read(*,nml=nml)
+end
+
+character(4) function defdByWrite()
+  write(defdByWrite) 'abcd'
+end
+
+integer function defdBySize()
+  real arr(10)
+  read(*,size=defdBySize) arr
+end
+
+character(40) function defdByIomsg()
+  write(123,*,iomsg=defdByIomsg)
+end
+
+character(20) function defdByInquire()
+  inquire(6,status=defdByInquire)
+end
+
+!WARNING: Function result is never defined
+character(20) function notDefdByInquire()
+  inquire(file=notDefdByInquire)
+end
+
+integer function defdByNewunit()
+  open(newunit=defdByNewunit, file="foo.txt")
+end
+
+function defdByAssociate()
+  associate(s => defdByAssociate)
+    s = 1.
+  end associate
+end
diff --git a/flang/test/Transforms/debug-92391.fir b/flang/test/Transforms/debug-92391.fir
new file mode 100644
index 0000000000000..2d83be995a0f6
--- /dev/null
+++ b/flang/test/Transforms/debug-92391.fir
@@ -0,0 +1,18 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} {
+  func.func @my_square_(%arg0: !fir.ref<i32> {fir.bindc_name = "x"} ) -> i32 attributes {fir.internal_name = "_QPmy_square"} {
+    %0 = fir.undefined !fir.dscope
+    %1 = fir.alloca i32 {bindc_name = "my_square", uniq_name = "_QFmy_squareEmy_square"}
+    %2 = fircg.ext_declare %1 {uniq_name = "_QFmy_squareEmy_square"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %3 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFmy_squareEx"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+    %4 = fir.load %3 : !fir.ref<i32>
+    %5 = arith.muli %4, %4 : i32
+    fir.store %5 to %2 : !fir.ref<i32>
+    %6 = fir.load %2 : !fir.ref<i32>
+    return %6 : i32
+  } loc(#loc1)
+}
+#loc1 = loc("92391.f90":15:1)
+
+// CHECK: #di_subprogram = #llvm.di_subprogram<{{.*}}name = "my_square", linkageName = "my_square_"{{.*}}>
diff --git a/flang/test/Transforms/debug-96314.fir b/flang/test/Transforms/debug-96314.fir
new file mode 100644
index 0000000000000..e2d0f24a1105c
--- /dev/null
+++ b/flang/test/Transforms/debug-96314.fir
@@ -0,0 +1,26 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s -o - | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
+  func.func @_QMhelperPmod_sub(%arg0: !fir.ref<i32> {fir.bindc_name = "a"} ) {
+    return
+  } loc(#loc1)
+  func.func private @_QMhelperFmod_subPchild1(%arg0: !fir.ref<i32> {fir.bindc_name = "b"} ) attributes {fir.host_symbol = @_QMhelperPmod_sub, llvm.linkage = #llvm.linkage<internal>} {
+    return
+  } loc(#loc2)
+  func.func @global_sub_(%arg0: !fir.ref<i32> {fir.bindc_name = "n"} ) attributes {fir.internal_name = "_QPglobal_sub"} {
+    return
+  } loc(#loc3)
+  func.func private @_QFglobal_subPchild2(%arg0: !fir.ref<i32> {fir.bindc_name = "c"}) attributes {fir.host_symbol = @global_sub_, llvm.linkage = #llvm.linkage<internal>} {
+    return
+  } loc(#loc4)
+}
+
+#loc1 = loc("test.f90":5:1)
+#loc2 = loc("test.f90":15:1)
+#loc3 = loc("test.f90":25:1)
+#loc4 = loc("test.f90":35:1)
+
+// CHECK-DAG: #[[SP1:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}scope = #[[SP1]], name = "child1"{{.*}}>
+// CHECK-DAG: #[[SP2:.*]] = #llvm.di_subprogram<{{.*}}linkageName = "global_sub_"{{.*}}>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}scope = #[[SP2]], name = "child2"{{.*}}>
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index 4771199651602..cec4e2d810720 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -4,7 +4,16 @@ set(LLVM_LINK_COMPONENTS
   Support
   )
 
-set(MODULES
+# Define the list of Fortran module files that need to be compiled
+# to produce an object file for inclusion into the FortranRuntime
+# library.
+set(MODULES_WITH_IMPLEMENTATION
+  "iso_fortran_env_impl"
+)
+
+# Define the list of Fortran module files for which it is
+# sufficient to generate the module file via -fsyntax-only.
+set(MODULES_WITHOUT_IMPLEMENTATION
   "__fortran_builtins"
   "__fortran_ieee_exceptions"
   "__fortran_type_info"
@@ -20,6 +29,12 @@ set(MODULES
   "iso_fortran_env"
 )
 
+set(MODULES ${MODULES_WITH_IMPLEMENTATION} ${MODULES_WITHOUT_IMPLEMENTATION})
+
+# Init variable to hold extra object files coming from the Fortran modules;
+# these module files will be contributed from the CMakeLists in flang/tools/f18.
+set(module_objects "")
+
 # Create module files directly from the top-level module source directory.
 # If CMAKE_CROSSCOMPILING, then the newly built flang-new executable was
 # cross compiled, and thus can't be executed on the build system and thus
@@ -41,6 +56,9 @@ if (NOT CMAKE_CROSSCOMPILING)
       if(NOT ${filename} STREQUAL "__fortran_type_info")
         set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod)
       endif()
+      if(${filename} STREQUAL "iso_fortran_env")
+        set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/iso_fortran_env_impl.mod)
+      endif()
       if(${filename} STREQUAL "ieee_arithmetic" OR
          ${filename} STREQUAL "ieee_exceptions")
         set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod)
@@ -58,18 +76,41 @@ if (NOT CMAKE_CROSSCOMPILING)
       endif()
     endif()
 
+
+    # Some modules have an implementation part that needs to be added to the
+    # FortranRuntime library.
+    set(compile_with "-fsyntax-only")
+    set(object_output "")
+    set(include_in_link FALSE)
+    if(${filename} IN_LIST MODULES_WITH_IMPLEMENTATION)
+      set(object_output "${CMAKE_CURRENT_BINARY_DIR}/${filename}${CMAKE_CXX_OUTPUT_EXTENSION}")
+      set(compile_with -c -o ${object_output})
+      set(include_in_link TRUE)
+    endif()
+
     set(base ${FLANG_INTRINSIC_MODULES_DIR}/${filename})
     # TODO: We may need to flag this with conditional, in case Flang is built w/o OpenMP support
-    add_custom_command(OUTPUT ${base}.mod
+    add_custom_command(OUTPUT ${base}.mod ${object_output}
       COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR}
-      COMMAND flang-new ${opts} -cpp -fsyntax-only -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
+      COMMAND flang-new ${opts} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
         ${FLANG_SOURCE_DIR}/module/${filename}.f90
       DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends}
     )
     list(APPEND MODULE_FILES ${base}.mod)
     install(FILES ${base}.mod DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang")
+
+    # If a module has been compiled into an object file, add the file to
+    # the link line for the FortranRuntime library.
+    if(include_in_link)
+      list(APPEND module_objects ${object_output})
+    endif()
   endforeach()
 
+  # Set a CACHE variable that is visible to the CMakeLists.txt in runtime/, so that
+  # the compiled Fortran modules can be added to the link line of the FortranRuntime
+  # library.
+  set(FORTRAN_MODULE_OBJECTS ${module_objects} CACHE INTERNAL "" FORCE)
+
   # Special case for omp_lib.mod, because its source comes from openmp/runtime/src/include.
   # It also produces two module files: omp_lib.mod and omp_lib_kinds.mod.  Compile these
   # files only if OpenMP support has been configured.
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index ce30ecff028dc..9f33cdfe3fa90 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -14,13 +14,6 @@ set( LLVM_LINK_COMPONENTS
 add_flang_tool(flang-new
   driver.cpp
   fc1_main.cpp
-
-  DEPENDS
-  # These libraries are used in the linker invocation generated by the driver
-  # (i.e. when constructing the linker job). Without them the driver would be
-  # unable to generate executables.
-  FortranRuntime
-  FortranDecimal
 )
 
 target_link_libraries(flang-new
diff --git a/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp b/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
index becaa3c69f6c4..47342da07f060 100644
--- a/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
+++ b/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
@@ -56,14 +56,6 @@ void testGenNearest(fir::FirOpBuilder &builder, mlir::Type xType,
   mlir::Value s = builder.create<fir::UndefOp>(loc, sType);
   mlir::Value nearest = fir::runtime::genNearest(builder, loc, x, s);
   checkCallOp(nearest.getDefiningOp(), fctName, 2, /*addLocArg=*/false);
-  auto callOp = mlir::dyn_cast<fir::CallOp>(nearest.getDefiningOp());
-  mlir::Value select = callOp.getOperands()[1];
-  EXPECT_TRUE(mlir::isa<mlir::arith::SelectOp>(select.getDefiningOp()));
-  auto selectOp = mlir::dyn_cast<mlir::arith::SelectOp>(select.getDefiningOp());
-  mlir::Value cmp = selectOp.getCondition();
-  EXPECT_TRUE(mlir::isa<mlir::arith::CmpFOp>(cmp.getDefiningOp()));
-  auto cmpOp = mlir::dyn_cast<mlir::arith::CmpFOp>(cmp.getDefiningOp());
-  EXPECT_EQ(s, cmpOp.getLhs());
 }
 
 TEST_F(RuntimeCallTest, genNearestTest) {
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 6ba54475d0fd1..dd45d6cc8cb6a 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -73,6 +73,8 @@ if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD))
   add_subdirectory(utils/gpu)
 endif()
 
+option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" ON)
+
 set(NEED_LIBC_HDRGEN FALSE)
 if(NOT LLVM_RUNTIMES_BUILD)
   if("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
@@ -226,7 +228,7 @@ foreach(config_path IN LISTS LIBC_CONFIG_JSON_FILE_LIST)
   load_libc_config(${config_path}/config.json ${cmd_line_conf})
 endforeach()
 
-if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
+if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND (LIBC_ENABLE_USE_BY_CLANG OR LIBC_TARGET_OS_IS_GPU))
   set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
@@ -242,9 +244,7 @@ else()
     set(LIBC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX})
   endif()
   if(LIBC_TARGET_OS_IS_GPU)
-    if(LLVM_RUNTIMES_TARGET)
-      set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_RUNTIMES_TARGET})
-    elseif(LIBC_TARGET_TRIPLE)
+    if(LIBC_TARGET_TRIPLE)
       set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LIBC_TARGET_TRIPLE})
     else()
       set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 14ba9f3f64b48..5fa3e44e8d48c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -7,9 +7,10 @@ function(add_benchmark benchmark_name)
     "BENCHMARK"
     "" # Optional arguments
     "" # Single value arguments
-    "LINK_LIBRARIES" # Multi-value arguments
+    "LINK_LIBRARIES;DEPENDS" # Multi-value arguments
     ${ARGN}
   )
+  
   if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
     message(FATAL_ERROR "target does not support clock")
   endif()
@@ -19,7 +20,14 @@ function(add_benchmark benchmark_name)
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
+    DEPENDS
+      libc.src.stdio.printf
+      libc.src.stdlib.srand
+      libc.src.stdlib.rand
+      ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
+    COMPILE_OPTIONS
+      -flto
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
   set(fq_build_target_name ${fq_target_name}.__build__)
@@ -46,14 +54,19 @@ add_unittest_framework_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.array
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
+    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
     libc.benchmarks.gpu.timing.timing
+    libc.src.stdio.printf
 )
 
 add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 23fff3e8180f7..a5dbc62a2087b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -7,6 +7,8 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
+#include "src/stdio/printf.h"
+#include "src/stdlib/srand.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -73,11 +75,10 @@ struct AtomicBenchmarkSums {
 };
 
 AtomicBenchmarkSums all_results;
+constexpr auto GREEN = "\033[32m";
+constexpr auto RESET = "\033[0m";
 
 void print_results(Benchmark *b) {
-  constexpr auto GREEN = "\033[32m";
-  constexpr auto RESET = "\033[0m";
-
   BenchmarkResult result;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
@@ -92,21 +93,55 @@ void print_results(Benchmark *b) {
       all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
   result.total_iterations =
       all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  result.total_time =
+  const uint64_t duration_ns =
       all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+  const uint64_t duration_us = duration_ns / 1000;
+  const uint64_t duration_ms = duration_ns / (1000 * 1000);
+  uint64_t converted_duration = duration_ns;
+  const char *time_unit;
+  if (duration_ms != 0) {
+    converted_duration = duration_ms;
+    time_unit = "ms";
+  } else if (duration_us != 0) {
+    converted_duration = duration_us;
+    time_unit = "us";
+  } else {
+    converted_duration = duration_ns;
+    time_unit = "ns";
+  }
+  result.total_time = converted_duration;
+  // result.total_time =
+  //     all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
-  log << GREEN << "[ RUN      ] " << RESET << b->get_name() << '\n';
-  log << GREEN << "[       OK ] " << RESET << b->get_name() << ": "
-      << result.cycles << " cycles, " << result.min << " min, " << result.max
-      << " max, " << result.total_iterations << " iterations, "
-      << result.total_time << " ns, "
-      << static_cast<uint64_t>(result.standard_deviation)
-      << " stddev (num threads: " << num_threads << ")\n";
+  LIBC_NAMESPACE::printf(
+      "%-20s |%8ld |%8ld |%8ld |%11d |%9ld %2s |%9ld |%9d |\n",
+      b->get_test_name().data(), result.cycles, result.min, result.max,
+      result.total_iterations, result.total_time, time_unit,
+      static_cast<uint64_t>(result.standard_deviation), num_threads);
+}
+
+void print_header() {
+  LIBC_NAMESPACE::printf("%s", GREEN);
+  LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
+                         benchmarks[0]->get_suite_name().data());
+  LIBC_NAMESPACE::printf("%s", RESET);
+  LIBC_NAMESPACE::printf("Benchmark            |  Cycles |     Min |     Max | "
+                         "Iterations |        "
+                         "Time |   Stddev |  Threads |\n");
+  LIBC_NAMESPACE::printf(
+      "---------------------------------------------------------------------"
+      "--------------------------------\n");
 }
 
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
+
+  if (id == 0) {
+    print_header();
+    LIBC_NAMESPACE::srand(gpu::processor_clock());
+  }
+
   gpu::sync_threads();
 
   for (Benchmark *b : benchmarks) {
@@ -114,8 +149,10 @@ void Benchmark::run_benchmarks() {
       all_results.reset();
 
     gpu::sync_threads();
-    auto current_result = b->run();
-    all_results.update(current_result);
+    if (b->num_threads == static_cast<uint32_t>(-1) || id < b->num_threads) {
+      auto current_result = b->run();
+      all_results.update(current_result);
+    }
     gpu::sync_threads();
 
     if (id == 0)
@@ -171,6 +208,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     if (samples >= options.max_samples || iterations >= options.max_iterations)
       break;
     if (total_time >= options.min_duration && samples >= options.min_samples &&
+        total_iterations >= options.min_iterations &&
         change_ratio < options.epsilon)
       break;
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 1f813f8655de6..2b85b146ed745 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,14 @@
 
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -17,12 +21,13 @@ namespace benchmarks {
 
 struct BenchmarkOptions {
   uint32_t initial_iterations = 1;
+  uint32_t min_iterations = 50;
   uint32_t max_iterations = 10000000;
   uint32_t min_samples = 4;
   uint32_t max_samples = 1000;
-  int64_t min_duration = 0;                  // in nanoseconds (ns)
+  int64_t min_duration = 500 * 1000;         // 500 * 1000 nanoseconds = 500 us
   int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
-  double epsilon = 0.01;
+  double epsilon = 0.0001;
   double scaling_factor = 1.4;
 };
 
@@ -79,16 +84,21 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
 
 class Benchmark {
   const cpp::function<uint64_t(void)> func;
-  const cpp::string_view name;
+  const cpp::string_view suite_name;
+  const cpp::string_view test_name;
+  const uint32_t num_threads;
 
 public:
-  Benchmark(cpp::function<uint64_t(void)> func, char const *name)
-      : func(func), name(name) {
+  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
+            char const *test_name, uint32_t num_threads)
+      : func(func), suite_name(suite_name), test_name(test_name),
+        num_threads(num_threads) {
     add_benchmark(this);
   }
 
   static void run_benchmarks();
-  const cpp::string_view get_name() const { return name; }
+  const cpp::string_view get_suite_name() const { return suite_name; }
+  const cpp::string_view get_test_name() const { return test_name; }
 
 protected:
   static void add_benchmark(Benchmark *benchmark);
@@ -99,11 +109,72 @@ class Benchmark {
     return benchmark(options, func);
   }
 };
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+template <typename T> static T get_rand_input() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+
+  // Required to correctly instantiate FPBits for floats and doubles.
+  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
+                                               uint64_t, uint32_t>;
+  RandType bits;
+  if constexpr (cpp::is_same_v<T, uint64_t>)
+    bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
+           static_cast<uint64_t>(LIBC_NAMESPACE::rand());
+  else
+    bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+template <typename T> class MathPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+  static constexpr StorageType UIntMax =
+      cpp::numeric_limits<StorageType>::max();
+
+public:
+  typedef T Func(T);
+
+  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+                                    StorageType ending_bit, StorageType step) {
+    uint64_t total_time = 0;
+    if (step <= 0)
+      step = 1;
+    volatile T result;
+    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+      T x = FPBits(bits).get_val();
+      total_time += LIBC_NAMESPACE::latency(f, x);
+    }
+    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+    return total_time / num_runs;
+  }
+};
+
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
 
+// Passing -1 indicates the benchmark should be run with as many threads as
+// allocated by the user in the benchmark's CMake.
 #define BENCHMARK(SuiteName, TestName, Func)                                   \
   LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
-      Func, #SuiteName "." #TestName)
+      Func, #SuiteName, #TestName, -1)
+
+#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
+  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
+      Func, #SuiteName, #TestName, NumThreads)
+
+#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
+  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
+
+#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
+  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
+                      LIBC_NAMESPACE::gpu::get_lane_size())
 
 #endif
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 79f01425770da..f277624dbb901 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -8,6 +8,8 @@ add_benchmark(
     isalnum_benchmark.cpp
   DEPENDS
     libc.src.ctype.isalnum
+  LOADER_ARGS
+    --threads 64
 )
 
 add_benchmark(
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 6f8d247902f76..ffa5a99860bfc 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -7,6 +7,10 @@ uint64_t BM_IsAlnum() {
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
 BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum);
+SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
+                          BM_IsAlnum);
+SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
+                      BM_IsAlnum);
 
 uint64_t BM_IsAlnumCapital() {
   char x = 'A';
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..77250edb6bb52
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,44 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+set(math_benchmark_flags "")
+if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  if(CUDAToolkit_FOUND)
+    set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+    if (EXISTS ${libdevice_path})
+      list(APPEND math_benchmark_flags
+        "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+      # Compile definition needed so the benchmark knows to register
+      # NVPTX benchmarks.
+      list(APPEND math_benchmark_flags "-DNVPTX_MATH_FOUND=1")
+    endif()
+  endif()
+endif()
+
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+  if(AMDDeviceLibs_FOUND)
+    get_target_property(ocml_path ocml IMPORTED_LOCATION)
+    list(APPEND math_benchmark_flags
+        "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}")
+    list(APPEND math_benchmark_flags "-DAMDGPU_MATH_FOUND=1")
+  endif()
+endif()
+
+add_benchmark(
+  sin_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    sin_benchmark.cpp
+  DEPENDS
+    libc.src.math.sin
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.array
+  COMPILE_OPTIONS
+    ${math_benchmark_flags}
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..5849ea3e99bb0
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,65 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+#include "src/math/amdgpu/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function that with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark using macro that
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func)                                                  \
+  []() {                                                                       \
+    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
+    return LIBC_NAMESPACE::latency(Func, x);                                   \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func)                                                        \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func)                                                     \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
+          BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
+          BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+#endif
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
index 8bbc7e33f122a..b6d84607aa607 100644
--- a/libc/benchmarks/gpu/timing/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -1,4 +1,4 @@
-foreach(target nvptx)
+foreach(target nvptx amdgpu)
   add_subdirectory(${target})
   list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
 endforeach()
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 9b40f9282b16b..e308d619e9569 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -34,7 +34,7 @@ namespace LIBC_NAMESPACE_DECL {
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
   uint32_t result = 0.0;
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
   asm("" ::"s"(start));
   uint64_t stop = gpu::processor_clock();
   return stop - start;
@@ -67,7 +67,8 @@ template <typename F, typename T>
 
   // This inline assembly performs a no-op which forces the result to both
   // be used and prevents us from exiting this region before it's complete.
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+      static_cast<uint32_t>(result)));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -98,7 +99,8 @@ template <typename F, typename T1, typename T2>
 
   auto result = f(arg1, arg2);
 
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+      static_cast<uint32_t>(result)));
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index d141b08d4c922..b426dfd0ea153 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
   uint64_t start = gpu::processor_clock();
   asm("" ::"r"(y), "llr"(start));
   uint32_t result = y;
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
   uint64_t stop = gpu::processor_clock();
   volatile auto storage = result;
   return stop - start;
@@ -57,7 +57,7 @@ template <typename F, typename T>
 
   // This inline assembly performs a no-op which forces the result to both be
   // used and prevents us from exiting this region before it's complete.
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -85,7 +85,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   auto result = f(arg, arg2);
 
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
 
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();
diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index a6d793d495c45..63145fe709dda 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -5,6 +5,8 @@
 set(
   ALL_COMPILER_FEATURES
     "builtin_ceil_floor_rint_trunc"
+    "builtin_fmax_fmin"
+    "builtin_fmaxf16_fminf16"
     "builtin_round"
     "builtin_roundeven"
     "float16"
@@ -15,6 +17,12 @@ set(
 # Making sure ALL_COMPILER_FEATURES is sorted.
 list(SORT ALL_COMPILER_FEATURES)
 
+# Compiler features that are unavailable on GPU targets with the in-tree Clang.
+set(
+  CPU_ONLY_COMPILER_FEATURES
+    "float128"
+)
+
 # Function to check whether the compiler supports the provided set of features.
 # Usage:
 # compiler_supports(
@@ -65,13 +73,26 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
     set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE)
   endif()
 
-  try_compile(
-    has_feature
-    ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
-    SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
-    COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
-    LINK_OPTIONS ${link_options}
-  )
+  if(LIBC_TARGET_OS_IS_GPU)
+    # CUDA shouldn't be required to build the libc, only to test it, so we can't
+    # try to build CUDA binaries here. Since GPU builds are always compiled with
+    # the in-tree Clang, we just hardcode which compiler features are available
+    # when targeting GPUs.
+    if(feature IN_LIST CPU_ONLY_COMPILER_FEATURES)
+      set(has_feature FALSE)
+    else()
+      set(has_feature TRUE)
+    endif()
+  else()
+    try_compile(
+      has_feature
+      ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
+      SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
+      COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
+      LINK_OPTIONS ${link_options}
+    )
+  endif()
+
   if(has_feature)
     list(APPEND AVAILABLE_COMPILER_FEATURES ${feature})
     if(${feature} STREQUAL "float16")
@@ -82,6 +103,10 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
       set(LIBC_COMPILER_HAS_FIXED_POINT TRUE)
     elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc")
       set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE)
+    elseif(${feature} STREQUAL "builtin_fmax_fmin")
+      set(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN TRUE)
+    elseif(${feature} STREQUAL "builtin_fmaxf16_fminf16")
+      set(LIBC_COMPILER_HAS_BUILTIN_FMAXF16_FMINF16 TRUE)
     elseif(${feature} STREQUAL "builtin_round")
       set(LIBC_COMPILER_HAS_BUILTIN_ROUND TRUE)
     elseif(${feature} STREQUAL "builtin_roundeven")
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
index 73b249374a066..be569f6f9cabf 100644
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -9,6 +9,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX AVX2 AVX512F AVX512BW FMA)
   set(LIBC_COMPILE_OPTIONS_NATIVE -march=native)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
+  set(ALL_CPU_FEATURES "FullFP16")
   set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native)
 endif()
 
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 62e16d52cb3ea..9fc10375a1d37 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -6,6 +6,7 @@ function(_get_compile_options_from_flags output_var)
   endif()
   check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
   check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
+  check_flag(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG ${MISC_MATH_BASIC_OPS_OPT_FLAG} ${ARGN})
 
   if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
     if(ADD_FMA_FLAG)
@@ -37,6 +38,20 @@ function(_get_compile_options_from_flags output_var)
     if(ADD_EXPLICIT_SIMD_OPT_FLAG)
       list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
     endif()
+    if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
+      list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
+      if(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_FMAX_FMIN")
+      endif()
+      if(LIBC_COMPILER_HAS_BUILTIN_FMAXF16_FMINF16)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_FMAXF16_FMINF16")
+      endif()
+      if("FullFP16" IN_LIST LIBC_CPU_FEATURES AND
+         CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        list(APPEND compile_options
+             "SHELL:-Xclang -target-feature -Xclang +fullfp16")
+      endif()
+    endif()
   elseif(MSVC)
     if(ADD_FMA_FLAG)
       list(APPEND compile_options "/arch:AVX2")
@@ -88,7 +103,11 @@ function(_get_common_compile_options output_var flags)
          (LIBC_CC_SUPPORTS_NOSTDLIBINC OR COMPILER_RESOURCE_DIR))
         # We use -idirafter to avoid preempting libc's own headers in case the
         # directory (e.g. /usr/include) contains other headers.
-        list(APPEND compile_options "-idirafter${LIBC_KERNEL_HEADERS}")
+        if(CMAKE_CROSSCOMPILING)
+          list(APPEND compile_options "-idirafter=${LIBC_KERNEL_HEADERS}")
+        else()
+          list(APPEND compile_options "-idirafter${LIBC_KERNEL_HEADERS}")
+        endif()
       endif()
     endif()
 
@@ -96,16 +115,7 @@ function(_get_common_compile_options output_var flags)
       list(APPEND compile_options "-ffixed-point")
     endif()
 
-    # Builtin recognition causes issues when trying to implement the builtin
-    # functions themselves. The GPU backends do not use libcalls so we disable
-    # the known problematic ones. This allows inlining during LTO linking.
-    if(LIBC_TARGET_OS_IS_GPU)
-      set(libc_builtins bcmp strlen memmem bzero memcmp memcpy memmem memmove
-                        memset strcmp strstr)
-      foreach(builtin ${libc_builtins})
-        list(APPEND compile_options "-fno-builtin-${builtin}")
-      endforeach()
-    else()
+    if(NOT LIBC_TARGET_OS_IS_GPU)
       list(APPEND compile_options "-fno-builtin")
     endif()
 
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index eca7ba8d183e6..3629a7f111a7c 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -263,6 +263,9 @@ set(FMA_OPT_FLAG "FMA_OPT")
 set(ROUND_OPT_FLAG "ROUND_OPT")
 # This flag controls whether we use explicit SIMD instructions or not.
 set(EXPLICIT_SIMD_OPT_FLAG "EXPLICIT_SIMD_OPT")
+# This flag controls whether we use compiler builtin functions to implement
+# various basic math operations or not.
+set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT")
 
 # Skip FMA_OPT flag for targets that don't support fma.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR
@@ -276,8 +279,10 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2")))
   set(SKIP_FLAG_EXPANSION_EXPLICIT_SIMD_OPT TRUE)
 endif()
 
-# Skip ROUND_OPT flag for targets that don't support SSE 4.2.
+# Skip ROUND_OPT flag for targets that don't support rounding instructions. On
+# x86, these are SSE4.1 instructions, but we already had code to check for
+# SSE4.2 support.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR
-       LIBC_TARGET_ARCHITECTURE_IS_AARCH64))
+       LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU))
   set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
 endif()
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 7fc6860f23eb2..3049f4db7301f 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -66,7 +66,107 @@ function(add_header target_name)
   )
 endfunction(add_header)
 
-# A rule for generated header file targets.
+function(add_gen_header2 target_name)
+  cmake_parse_arguments(
+    "ADD_GEN_HDR2"
+    "PUBLIC" # No optional arguments
+    "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments
+    "DEPENDS"     # Multi value arguments
+    ${ARGN}
+  )
+  get_fq_target_name(${target_name} fq_target_name)
+  if(NOT LLVM_LIBC_FULL_BUILD)
+    add_library(${fq_target_name} INTERFACE)
+    return()
+  endif()
+  if(NOT ADD_GEN_HDR2_DEF_FILE)
+    message(FATAL_ERROR "`add_gen_hdr2` rule requires DEF_FILE to be specified.")
+  endif()
+  if(NOT ADD_GEN_HDR2_GEN_HDR)
+    message(FATAL_ERROR "`add_gen_hdr2` rule requires GEN_HDR to be specified.")
+  endif()
+  if(NOT ADD_GEN_HDR2_YAML_FILE)
+    message(FATAL_ERROR "`add_gen_hdr2` rule requires YAML_FILE to be specified.")
+  endif()
+
+  set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_GEN_HDR})
+  file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path})
+  set(out_file ${LIBC_INCLUDE_DIR}/${relative_path})
+  set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR2_YAML_FILE})
+  set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_DEF_FILE})
+
+  set(fq_data_files "")
+  if(ADD_GEN_HDR2_DATA_FILES)
+    foreach(data_file IN LISTS ADD_GEN_HDR2_DATA_FILES)
+      list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}")
+    endforeach(data_file)
+  endif()
+
+  set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
+  list(TRANSFORM entry_points PREPEND "--e=")
+
+  add_custom_command(
+    OUTPUT ${out_file}
+    COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py
+            ${yaml_file}
+            --h_def_file ${def_file}
+            ${entry_points}
+            --output_dir ${out_file}
+    DEPENDS ${yaml_file} ${def_file} ${fq_data_files}
+    COMMENT "Generating header ${ADD_GEN_HDR2_GE2N_HDR} from ${yaml_file} and ${def_file}"
+  )
+  if(LIBC_TARGET_OS_IS_GPU)
+    file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls)
+    file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls/gpu)
+    set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
+    add_custom_command(
+      OUTPUT ${decl_out_file}
+      COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py
+              ${yaml_file}
+              --export-decls
+              ${entry_points}
+              --output_dir ${decl_out_file}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+      DEPENDS ${yaml_file} ${fq_data_files}
+    )
+  endif()
+  
+  if(ADD_GEN_HDR2_DEPENDS)
+    get_fq_deps_list(fq_deps_list ${ADD_GEN_HDR2_DEPENDS})
+    # Dependencies of a add_header target can only be another add_gen_header target
+    # or an add_header target.
+    foreach(dep IN LISTS fq_deps_list)
+      get_target_property(header_file ${dep} HEADER_FILE_PATH)
+      if(NOT header_file)
+        message(FATAL_ERROR "Invalid dependency '${dep}' for '${fq_target_name}'.")
+      endif()
+    endforeach()
+  endif()
+  set(generated_hdr_target ${fq_target_name}.__generated_hdr__)
+  add_custom_target(
+    ${generated_hdr_target}
+    DEPENDS ${out_file} ${fq_deps_list} ${decl_out_file}
+  )
+
+  add_header_library(
+    ${target_name}
+    HDRS
+      ${out_file}
+  )
+
+  add_dependencies(${fq_target_name} ${generated_hdr_target})
+
+  set_target_properties(
+    ${fq_target_name}
+    PROPERTIES
+      HEADER_FILE_PATH ${out_file}
+      DECLS_FILE_PATH "${decl_out_file}"
+      DEPS "${fq_deps_list}"
+  )
+
+
+endfunction(add_gen_header2)
+
 # Usage:
 #     add_gen_header(
 #       <target name>
diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
index 75bc81e2aee8e..0b1878d7b5222 100644
--- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake
+++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -83,97 +83,6 @@ function(get_all_object_file_deps result fq_deps_list)
   set(${result} ${all_deps} PARENT_SCOPE)
 endfunction()
 
-# A rule to build a library from a collection of entrypoint objects and bundle
-# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'.
-# Usage:
-#     add_gpu_entrypoint_library(
-#       DEPENDS <list of add_entrypoint_object targets>
-#     )
-function(add_gpu_entrypoint_library target_name base_target_name)
-  cmake_parse_arguments(
-    "ENTRYPOINT_LIBRARY"
-    "" # No optional arguments
-    "" # No single value arguments
-    "DEPENDS" # Multi-value arguments
-    ${ARGN}
-  )
-  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
-    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
-                        "of 'add_entrypoint_object' targets.")
-  endif()
-
-  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
-  get_all_object_file_deps(all_deps "${fq_deps_list}")
-
-  # The GPU 'libc' needs to be exported in a format that can be linked with
-  # offloading langauges like OpenMP or CUDA. This wraps every GPU object into a
-  # fat binary and adds them to a static library.
-  set(objects "")
-  foreach(dep IN LISTS all_deps)
-    set(object $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
-    string(FIND ${dep} "." last_dot_loc REVERSE)
-    math(EXPR name_loc "${last_dot_loc} + 1")
-    string(SUBSTRING ${dep} ${name_loc} -1 name)
-    if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-      set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
-    elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-      set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
-    endif()
-
-    # Use the 'clang-offload-packager' to merge these files into a binary blob.
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
-      COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
-              "${prefix},file=$<JOIN:${object},,file=>" -o
-              ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
-      DEPENDS ${dep} ${base_target_name}
-      COMMENT "Packaging LLVM offloading binary for '${object}'"
-    )
-    add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
-                      "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
-    if(TARGET clang-offload-packager)
-      add_dependencies(${dep}.__gpubin__ clang-offload-packager)
-    endif()
-
-    # CMake does not permit setting the name on object files. In order to have
-    # human readable names we create an empty stub file with the entrypoint
-    # name. This empty file will then have the created binary blob embedded.
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
-      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
-      DEPENDS ${dep} ${dep}.__gpubin__ ${base_target_name}
-    )
-    add_custom_target(${dep}.__stub__
-                      DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
-
-    add_library(${dep}.__fatbin__
-      EXCLUDE_FROM_ALL OBJECT
-      "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
-    )
-
-    # This is always compiled for the LLVM host triple instead of the native GPU
-    # triple that is used by default in the build.
-    target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
-    target_compile_options(${dep}.__fatbin__ PRIVATE
-      --target=${LLVM_HOST_TRIPLE}
-      "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
-    add_dependencies(${dep}.__fatbin__
-                     ${dep} ${dep}.__stub__ ${dep}.__gpubin__ ${base_target_name})
-
-    # Set the list of newly create fat binaries containing embedded device code.
-    list(APPEND objects $<TARGET_OBJECTS:${dep}.__fatbin__>)
-  endforeach()
-
-  add_library(
-    ${target_name}
-    STATIC
-      ${objects}
-  )
-  set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
-endfunction(add_gpu_entrypoint_library)
-
 # A rule to build a library from a collection of entrypoint objects and bundle
 # it in a single LLVM-IR bitcode file.
 # Usage:
@@ -202,19 +111,9 @@ function(add_bitcode_entrypoint_library target_name base_target_name)
     list(APPEND objects ${object})
   endforeach()
 
-  set(output ${CMAKE_CURRENT_BINARY_DIR}/${target_name}.bc)
-  add_custom_command(
-    OUTPUT ${output}
-    COMMAND ${LIBC_LLVM_LINK} ${objects} -o ${output}
-    DEPENDS ${all_deps} ${base_target_name}
-    COMMENT "Linking LLVM-IR bitcode for ${base_target_name}"
-    COMMAND_EXPAND_LISTS
-  )
-  add_custom_target(${target_name} DEPENDS ${output} ${all_deps})
-  set_target_properties(${target_name} PROPERTIES TARGET_OBJECT ${output})
-  if(TARGET llvm-link)
-    add_dependencies(${target_name} llvm-link)
-  endif()
+  add_executable(${target_name} ${objects})
+  target_link_options(${target_name} PRIVATE
+                      "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm")
 endfunction(add_bitcode_entrypoint_library)
 
 # A rule to build a library from a collection of entrypoint objects.
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 0b092e94ca8a1..68b5ed1ed51c0 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -65,25 +65,6 @@ function(create_object_library fq_target_name)
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
   target_compile_options(${fq_target_name} PRIVATE ${compile_options})
 
-  # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
-  # cannot handle it natively. Make a separate internal target for testing.
-  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED)
-    add_library(
-      ${internal_target_name}
-      EXCLUDE_FROM_ALL
-      OBJECT
-      ${ADD_OBJECT_SRCS}
-      ${ADD_OBJECT_HDRS}
-    )
-    target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-    target_compile_options(${internal_target_name} PRIVATE ${compile_options}
-                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
-    set_target_properties(${internal_target_name}
-                          PROPERTIES
-                          CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD})
-  endif()
-
   if(SHOW_INTERMEDIATE_OBJECTS)
     message(STATUS "Adding object library ${fq_target_name}")
     if(${SHOW_INTERMEDIATE_OBJECTS} STREQUAL "DEPS")
@@ -283,12 +264,6 @@ function(create_entrypoint_object fq_target_name)
   add_dependencies(${internal_target_name} ${full_deps_list})
   target_link_libraries(${internal_target_name} ${full_deps_list})
 
-  # The NVPTX target cannot use LTO for the internal targets used for testing.
-  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-    target_compile_options(${internal_target_name} PRIVATE
-                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
-  endif()
-
   add_library(
     ${fq_target_name}
     # We want an object library as the objects will eventually get packaged into
@@ -304,6 +279,17 @@ function(create_entrypoint_object fq_target_name)
   add_dependencies(${fq_target_name} ${full_deps_list})
   target_link_libraries(${fq_target_name} ${full_deps_list})
 
+  # Builtin recognition causes issues when trying to implement the builtin
+  # functions themselves. The GPU backends do not use libcalls so we disable the
+  # known problematic ones on the entrypoints that implement them.
+  if(LIBC_TARGET_OS_IS_GPU)
+    set(libc_builtins bcmp strlen memmem bzero memcmp memcpy memmem memmove
+                      memset strcmp strstr)
+    if(${ADD_ENTRYPOINT_OBJ_NAME} IN_LIST libc_builtins)
+      target_compile_options(${fq_target_name} PRIVATE -fno-builtin-${ADD_ENTRYPOINT_OBJ_NAME})
+    endif()
+  endif()
+
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 4d349cb1799da..a8b0c61d2236a 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -453,8 +453,6 @@ function(add_integration_test test_name)
   add_executable(
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
-    # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${INTEGRATION_TEST_SRCS}
     ${INTEGRATION_TEST_HDRS}
   )
@@ -473,11 +471,11 @@ function(add_integration_test test_name)
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-    # We need to use the internal object versions for NVPTX.
-    set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
+      "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
+      "-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
@@ -490,10 +488,9 @@ function(add_integration_test test_name)
   endif()
   target_link_libraries(
     ${fq_build_target_name}
-    # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
-    libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
-    libc.test.IntegrationTest.test${internal_suffix}
+    ${fq_target_name}.__libc__
+    libc.startup.${LIBC_TARGET_OS}.crt1
+    libc.test.IntegrationTest.test
   )
   add_dependencies(${fq_build_target_name}
                    libc.test.IntegrationTest.test
@@ -628,8 +625,6 @@ function(add_libc_hermetic test_name)
   add_executable(
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
-    # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${HERMETIC_TEST_SRCS}
     ${HERMETIC_TEST_HDRS}
   )
@@ -656,16 +651,16 @@ function(add_libc_hermetic test_name)
 
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT}
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-    # We need to use the internal object versions for NVPTX.
-    set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
+      "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
+      "-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
@@ -679,11 +674,10 @@ function(add_libc_hermetic test_name)
   target_link_libraries(
     ${fq_build_target_name}
     PRIVATE
-      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+      libc.startup.${LIBC_TARGET_OS}.crt1
       ${link_libraries}
       LibcHermeticTestSupport.hermetic
-      # The NVIDIA 'nvlink' linker does not currently support static libraries.
-      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+      ${fq_target_name}.__libc__)
   add_dependencies(${fq_build_target_name}
                    LibcTest.hermetic
                    libc.test.UnitTest.ErrnoSetterMatcher
diff --git a/libc/cmake/modules/LibcConfig.cmake b/libc/cmake/modules/LibcConfig.cmake
index 7a3e6066b3cc0..da166dd6cc3fc 100644
--- a/libc/cmake/modules/LibcConfig.cmake
+++ b/libc/cmake/modules/LibcConfig.cmake
@@ -113,7 +113,7 @@ function(load_libc_config config_file)
       message(FATAL_ERROR ${json_error})
     endif()
     if(NOT DEFINED ${opt_name})
-      message(FATAL_ERROR: " Option ${opt_name} defined in ${config_file} is invalid.")
+      message(FATAL_ERROR " Option ${opt_name} defined in ${config_file} is invalid.")
     endif()
     if(ARGN)
       list(FIND ARGN ${opt_name} optname_exists)
diff --git a/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp b/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp
new file mode 100644
index 0000000000000..594d839f174bc
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp
@@ -0,0 +1,7 @@
+float try_builtin_fmaxf(float x, float y) { return __builtin_fmaxf(x, y); }
+float try_builtin_fminf(float x, float y) { return __builtin_fminf(x, y); }
+
+double try_builtin_fmaxf(double x, double y) { return __builtin_fmax(x, y); }
+double try_builtin_fminf(double x, double y) { return __builtin_fmin(x, y); }
+
+extern "C" void _start() {}
diff --git a/libc/cmake/modules/compiler_features/check_builtin_fmaxf16_fminf16.cpp b/libc/cmake/modules/compiler_features/check_builtin_fmaxf16_fminf16.cpp
new file mode 100644
index 0000000000000..4ce377782800c
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_fmaxf16_fminf16.cpp
@@ -0,0 +1,9 @@
+_Float16 try_builtin_fmaxf16(_Float16 x, _Float16 y) {
+  return __builtin_fmaxf16(x, y);
+}
+
+_Float16 try_builtin_fminf16(_Float16 x, _Float16 y) {
+  return __builtin_fminf16(x, y);
+}
+
+extern "C" void _start() {}
diff --git a/libc/cmake/modules/cpu_features/check_FullFP16.cpp b/libc/cmake/modules/cpu_features/check_FullFP16.cpp
new file mode 100644
index 0000000000000..b757fccc86e4c
--- /dev/null
+++ b/libc/cmake/modules/cpu_features/check_FullFP16.cpp
@@ -0,0 +1,5 @@
+#include "src/__support/macros/properties/cpu_features.h"
+
+#ifndef LIBC_TARGET_CPU_HAS_FULLFP16
+#error unsupported
+#endif
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index e2a0908023a02..f3537f2634e5b 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -21,36 +21,10 @@ if(LIBC_TARGET_TRIPLE)
   set(CMAKE_REQUIRED_FLAGS "--target=${LIBC_TARGET_TRIPLE}")
 endif()
 if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib -nostdlib")
 elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   set(CMAKE_REQUIRED_FLAGS
-      "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument")
-endif()
-
-# Identify the program used to package multiple images into a single binary.
-get_filename_component(compiler_path ${CMAKE_CXX_COMPILER} DIRECTORY)
-if(TARGET clang-offload-packager)
-  get_target_property(LIBC_CLANG_OFFLOAD_PACKAGER clang-offload-packager LOCATION)
-else()
-  find_program(LIBC_CLANG_OFFLOAD_PACKAGER
-               NAMES clang-offload-packager NO_DEFAULT_PATH
-               PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
-  if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
-    message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU "
-                        "build")
-  endif()
-endif()
-
-# Identify llvm-link program so we can merge the output IR into a single blob.
-if(TARGET llvm-link)
-  get_target_property(LIBC_LLVM_LINK llvm-link LOCATION)
-else()
-  find_program(LIBC_LLVM_LINK
-               NAMES llvm-link NO_DEFAULT_PATH
-               PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
-  if(NOT LIBC_LLVM_LINK)
-    message(FATAL_ERROR "Cannot find 'llvm-link' for the GPU build")
-  endif()
+      "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument -nostdlib")
 endif()
 
 # Optionally set up a job pool to limit the number of GPU tests run in parallel.
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 4f2462abc7452..ef145e994a3d1 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -90,9 +90,11 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.remove
     libc.src.stdio.snprintf
     libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
     libc.src.stdio.vprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
 
     # stdbit.h entrypoints
     libc.src.stdbit.stdc_bit_ceil_uc
@@ -236,6 +238,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.canonicalize
     libc.src.math.canonicalizef
     libc.src.math.canonicalizel
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index 12f4c2aa3a805..f0ff268b56275 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -18,11 +18,6 @@
       "value": false
     }
   },
-  "malloc": {
-    "LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE": {
-      "value": 102400
-    }
-  },
   "qsort": {
     "LIBC_CONF_QSORT_IMPL": {
       "value": "LIBC_QSORT_HEAP_SORT"
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index c2ab9d7f19921..be41f9a13aac2 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -86,9 +86,11 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.remove
     libc.src.stdio.snprintf
     libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
     libc.src.stdio.vprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
 
     # stdbit.h entrypoints
     libc.src.stdbit.stdc_bit_ceil_uc
@@ -232,6 +234,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.canonicalize
     libc.src.math.canonicalizef
     libc.src.math.canonicalizel
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/config.json b/libc/config/config.json
index 94bfed894c173..538fea53cc704 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -1,8 +1,8 @@
 {
   "errno": {
     "LIBC_CONF_ERRNO_MODE": {
-      "value": "",
-      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
+      "value": "LIBC_ERRNO_MODE_DEFAULT",
+      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
     }
   },
   "printf": {
@@ -71,12 +71,6 @@
       "doc": "Default number of spins before blocking if a rwlock is in contention (default to 100)."
     }
   },
-  "malloc": {
-    "LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE": {
-      "value": 1073741824,
-      "doc": "Default size for the constinit freelist buffer used for the freelist malloc implementation (default 1o 1GB)."
-    }
-  },
   "math": {
     "LIBC_CONF_MATH_OPTIMIZATIONS": {
       "value": 0,
@@ -88,5 +82,11 @@
       "value": "LIBC_QSORT_QUICK_SORT",
       "doc": "Configures sorting algorithm for qsort and qsort_r. Values accepted are LIBC_QSORT_QUICK_SORT, LIBC_QSORT_HEAP_SORT."
     }
+  },
+  "setjmp": {
+    "LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER": {
+      "value": true,
+      "doc": "Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved."
+    }
   }
 }
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 32a08f20b328f..3e4cb3cebce9b 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -94,6 +94,16 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.calloc
     libc.src.stdlib.realloc
     libc.src.stdlib.free
+
+    # stdio.h external entrypoints
+    libc.src.stdio.snprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
+    libc.src.stdio.asprintf
+    libc.src.stdio.vprintf
+    libc.src.stdio.vsnprintf
+    libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
@@ -120,6 +130,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
@@ -135,6 +146,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cos
     libc.src.math.cosf
     libc.src.math.cospif
+    libc.src.math.dfmal
+    libc.src.math.dsqrtl
+    libc.src.math.daddl
+    libc.src.math.dsubl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.expf
@@ -240,6 +255,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index 02912decadcf7..5aa8354a75279 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -119,6 +119,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     #libc.src.math.ceill
     #libc.src.math.coshf
     #libc.src.math.cosf
+    #libc.src.math.daddl
+    #libc.src.math.dfmal
+    #libc.src.math.dsqrtl
+    #libc.src.math.dsubl
     #libc.src.math.expf
     #libc.src.math.exp2f
     #libc.src.math.expm1f
@@ -206,6 +210,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     #libc.src.math.sqrtl
     #libc.src.math.tanf
     #libc.src.math.tanhf
+    #libc.src.math.totalordermag
+    #libc.src.math.totalordermagf
+    #libc.src.math.totalordermagl
     #libc.src.math.trunc
     #libc.src.math.truncf
     #libc.src.math.truncl
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 3c6a92d279e50..8d29e7e2e253b 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -48,7 +48,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.string.strcspn
     libc.src.string.strdup
     libc.src.string.strerror
-    libc.src.string.strerror_r
     libc.src.string.strlcat
     libc.src.string.strlcpy
     libc.src.string.strlen
@@ -167,8 +166,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.strtoul
     libc.src.stdlib.strtoull
 
-    # Only implemented in the test suite
+    # TODO: Implement these correctly
     libc.src.stdlib.aligned_alloc
+    libc.src.stdlib.calloc
     libc.src.stdlib.free
     libc.src.stdlib.malloc
     libc.src.stdlib.realloc
@@ -183,10 +183,14 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.vprintf
     libc.src.stdio.fprintf
     libc.src.stdio.vfprintf
-    libc.src.stdio.sprintf
     libc.src.stdio.snprintf
-    libc.src.stdio.vsprintf
+    libc.src.stdio.sprintf
     libc.src.stdio.vsnprintf
+    libc.src.stdio.vsprintf
+    libc.src.stdio.asprintf
+    libc.src.stdio.vasprintf
+    libc.src.stdio.sscanf
+    libc.src.stdio.vsscanf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
@@ -226,7 +230,6 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # gpu/rpc.h entrypoints
     libc.src.gpu.rpc_host_call
-    libc.src.gpu.rpc_fprintf
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
@@ -267,6 +270,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.expm1f
     libc.src.math.fabs
     libc.src.math.fabsf
+    libc.src.math.fadd
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.floor
@@ -281,6 +285,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmodf
     libc.src.math.frexp
     libc.src.math.frexpf
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -347,6 +353,74 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.truncf
 )
 
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C23 _Float16 entrypoints
+    libc.src.math.canonicalizef16
+    libc.src.math.ceilf16
+    libc.src.math.copysignf16
+    libc.src.math.f16add
+    libc.src.math.f16addf
+    libc.src.math.f16div
+    libc.src.math.f16divf
+    libc.src.math.f16fma
+    libc.src.math.f16fmaf
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
+    libc.src.math.f16sqrt
+    libc.src.math.f16sqrtf
+    libc.src.math.f16sub
+    libc.src.math.f16subf
+    libc.src.math.fabsf16
+    libc.src.math.fdimf16
+    libc.src.math.floorf16
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximum_mag_numf16
+    libc.src.math.fmaximum_magf16
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fmaximumf16
+    libc.src.math.fminf16
+    libc.src.math.fminimum_mag_numf16
+    libc.src.math.fminimum_magf16
+    libc.src.math.fminimum_numf16
+    libc.src.math.fminimumf16
+    libc.src.math.fmodf16
+    libc.src.math.frexpf16
+    libc.src.math.fromfpf16
+    libc.src.math.fromfpxf16
+    libc.src.math.getpayloadf16
+    libc.src.math.ilogbf16
+    libc.src.math.ldexpf16
+    libc.src.math.llogbf16
+    libc.src.math.llrintf16
+    libc.src.math.llroundf16
+    libc.src.math.logbf16
+    libc.src.math.lrintf16
+    libc.src.math.lroundf16
+    libc.src.math.modff16
+    libc.src.math.nanf16
+    libc.src.math.nearbyintf16
+    libc.src.math.nextafterf16
+    libc.src.math.nextdownf16
+    libc.src.math.nexttowardf16
+    libc.src.math.nextupf16
+    libc.src.math.remainderf16
+    libc.src.math.remquof16
+    libc.src.math.rintf16
+    libc.src.math.roundevenf16
+    libc.src.math.roundf16
+    libc.src.math.scalblnf16
+    libc.src.math.scalbnf16
+    libc.src.math.setpayloadf16
+    libc.src.math.setpayloadsigf16
+    libc.src.math.totalorderf16
+    libc.src.math.totalordermagf16
+    libc.src.math.truncf16
+    libc.src.math.ufromfpf16
+    libc.src.math.ufromfpxf16
+  )
+endif()
+
 set(TARGET_LLVMLIBC_ENTRYPOINTS
   ${TARGET_LIBC_ENTRYPOINTS}
   ${TARGET_LIBM_ENTRYPOINTS}
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 9b718c3f81151..2c449c49d912a 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -207,10 +207,12 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.rename
     libc.src.stdio.snprintf
     libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
     #libc.src.stdio.scanf
     #libc.src.stdio.sscanf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
 
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
@@ -342,9 +344,13 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.canonicalize
+    libc.src.math.canonicalizef
+    libc.src.math.canonicalizel
     libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
@@ -357,18 +363,25 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cosf
     libc.src.math.coshf
     libc.src.math.cospif
+    libc.src.math.daddl
+    libc.src.math.dfmal
+    libc.src.math.dmull
+    libc.src.math.dsqrtl
+    libc.src.math.dsubl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
     libc.src.math.exp10f
     libc.src.math.exp2
     libc.src.math.exp2f
+    libc.src.math.exp2m1f
     libc.src.math.expf
     libc.src.math.expm1
     libc.src.math.expm1f
     libc.src.math.fabs
     libc.src.math.fabsf
     libc.src.math.fabsl
+    libc.src.math.fadd 
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.fdiml
@@ -411,6 +424,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmodf
     libc.src.math.fmodl
     libc.src.math.fmul
+    libc.src.math.fmull
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -420,6 +434,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fromfpx
     libc.src.math.fromfpxf
     libc.src.math.fromfpxl
+    libc.src.math.fsqrt
+    libc.src.math.fsqrtl
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -489,14 +507,16 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.rintf
     libc.src.math.rintl
     libc.src.math.round
-    libc.src.math.roundf
-    libc.src.math.roundl
     libc.src.math.roundeven
     libc.src.math.roundevenf
     libc.src.math.roundevenl
+    libc.src.math.roundf
+    libc.src.math.roundl
     libc.src.math.scalbn
     libc.src.math.scalbnf
     libc.src.math.scalbnl
+    libc.src.math.setpayload
+    libc.src.math.setpayloadf
     libc.src.math.sin
     libc.src.math.sincos
     libc.src.math.sincosf
@@ -509,6 +529,11 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalorder
+    libc.src.math.totalorderf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -531,6 +556,8 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.f16div
     libc.src.math.f16divf
     libc.src.math.f16fmaf
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
     libc.src.math.f16sqrt
     libc.src.math.f16sqrtf
     libc.src.math.f16sub
@@ -592,6 +619,11 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C23 _Float128 entrypoints
     libc.src.math.ceilf128
     libc.src.math.copysignf128
+    libc.src.math.daddf128
+    libc.src.math.ddivf128
+    libc.src.math.dfmaf128
+    libc.src.math.dsqrtf128
+    libc.src.math.dsubf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
@@ -609,6 +641,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.frexpf128
     libc.src.math.fromfpf128
     libc.src.math.fromfpxf128
+    libc.src.math.getpayloadf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
     libc.src.math.llogbf128
@@ -628,7 +661,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.roundf128
     libc.src.math.roundevenf128
     libc.src.math.scalbnf128
+    libc.src.math.setpayloadf128
     libc.src.math.sqrtf128
+    libc.src.math.totalorderf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
@@ -680,6 +716,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_mutexattr_setrobust
     libc.src.pthread.pthread_mutexattr_settype
     libc.src.pthread.pthread_once
+    libc.src.pthread.pthread_rwlock_clockrdlock
+    libc.src.pthread.pthread_rwlock_clockwrlock
     libc.src.pthread.pthread_rwlock_destroy
     libc.src.pthread.pthread_rwlock_init
     libc.src.pthread.pthread_rwlock_rdlock
@@ -702,6 +740,10 @@ if(LLVM_LIBC_FULL_BUILD)
     # sched.h entrypoints
     libc.src.sched.__sched_getcpucount
 
+    # setjmp.h entrypoints
+    libc.src.setjmp.longjmp
+    libc.src.setjmp.setjmp
+
     # stdio.h entrypoints
     libc.src.stdio.clearerr
     libc.src.stdio.clearerr_unlocked
diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index a10dec06e2452..320f3e92183bf 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -267,5 +267,5 @@ def SearchAPI : PublicAPI<"search.h"> {
 }
 
 def SysStatvfsAPI : PublicAPI<"sys/statvfs.h"> {
-  let Types = ["fsblkcnt_t", "fsfilcnt_t", "struct statvfs"];
+  let Types = ["struct statvfs"];
 }
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index a72f8668808a5..90aae962080cd 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -213,6 +213,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
@@ -227,6 +228,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cos
     libc.src.math.cosf
     libc.src.math.coshf
+    libc.src.math.dfmal
+    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
@@ -239,6 +242,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fabs
     libc.src.math.fabsf
     libc.src.math.fabsl
+    libc.src.math.fadd 
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.fdiml
@@ -289,6 +293,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fromfpx
     libc.src.math.fromfpxf
     libc.src.math.fromfpxl
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -360,6 +366,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbn
     libc.src.math.scalbnf
     libc.src.math.scalbnl
+    libc.src.math.setpayload
+    libc.src.math.setpayloadf
     libc.src.math.sin
     libc.src.math.sincos
     libc.src.math.sincosf
@@ -371,6 +379,11 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalorder
+    libc.src.math.totalorderf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 266c94d54a9df..771f169332fcb 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -17,6 +17,12 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.ctype.tolower
     libc.src.ctype.toupper
 
+    # dlfcn.h entrypoints
+    libc.src.dlfcn.dlclose
+    libc.src.dlfcn.dlerror
+    libc.src.dlfcn.dlopen
+    libc.src.dlfcn.dlsym
+
     # errno.h entrypoints
     libc.src.errno.errno
 
@@ -52,6 +58,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.string.mempcpy
     libc.src.string.memrchr
     libc.src.string.memset
+    libc.src.string.memset_explicit
     libc.src.string.rindex
     libc.src.string.stpcpy
     libc.src.string.stpncpy
@@ -180,6 +187,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.qsort_r
     libc.src.stdlib.rand
     libc.src.stdlib.srand
+    libc.src.stdlib.strfromd
+    libc.src.stdlib.strfromf
+    libc.src.stdlib.strfroml
     libc.src.stdlib.strtod
     libc.src.stdlib.strtof
     libc.src.stdlib.strtol
@@ -197,6 +207,7 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # stdio.h entrypoints
     libc.src.stdio.fdopen
+    libc.src.stdio.fileno
     libc.src.stdio.fprintf
     libc.src.stdio.fscanf
     libc.src.stdio.printf
@@ -205,11 +216,24 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.scanf
     libc.src.stdio.snprintf
     libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
     libc.src.stdio.sscanf
+    libc.src.stdio.vsscanf
     libc.src.stdio.vfprintf
     libc.src.stdio.vprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
+
+    # sys/epoll.h entrypoints
+    libc.src.sys.epoll.epoll_create
+    libc.src.sys.epoll.epoll_create1
+    libc.src.sys.epoll.epoll_ctl
+    libc.src.sys.epoll.epoll_pwait
+    libc.src.sys.epoll.epoll_wait
+    # TODO: Need to check if pwait2 is available before providing.
+    # https://github.com/llvm/llvm-project/issues/80060
+    # libc.src.sys.epoll.epoll_pwait2
 
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
@@ -247,6 +271,10 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.stat.mkdirat
     libc.src.sys.stat.stat
 
+    # sys/statvfs.h
+    libc.src.sys.statvfs.fstatvfs
+    libc.src.sys.statvfs.statvfs
+
     # sys/utsname.h entrypoints
     libc.src.sys.utsname.uname
 
@@ -261,12 +289,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     # sys/auxv.h entrypoints
     libc.src.sys.auxv.getauxval
 
-    # sys/epoll.h entrypoints
-    # Disabled due to epoll_wait syscalls not being available on this platform.
-    # libc.src.sys.epoll.epoll_wait
-    # libc.src.sys.epoll.epoll_pwait
-    # libc.src.sys.epoll.epoll_pwait2
-
     # termios.h entrypoints
     libc.src.termios.cfgetispeed
     libc.src.termios.cfgetospeed
@@ -302,6 +324,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.unistd.linkat
     libc.src.unistd.lseek
     libc.src.unistd.pathconf
+    libc.src.unistd.pipe
     libc.src.unistd.pread
     libc.src.unistd.pwrite
     libc.src.unistd.read
@@ -344,9 +367,13 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.canonicalize
+    libc.src.math.canonicalizef
+    libc.src.math.canonicalizel
     libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
@@ -359,18 +386,25 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cosf
     libc.src.math.coshf
     libc.src.math.cospif
+    libc.src.math.daddl
+    libc.src.math.dfmal
+    libc.src.math.dmull
+    libc.src.math.dsqrtl
+    libc.src.math.dsubl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
     libc.src.math.exp10f
     libc.src.math.exp2
     libc.src.math.exp2f
+    libc.src.math.exp2m1f
     libc.src.math.expf
     libc.src.math.expm1
     libc.src.math.expm1f
     libc.src.math.fabs
     libc.src.math.fabsf
     libc.src.math.fabsl
+    libc.src.math.fadd 
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.fdiml
@@ -413,6 +447,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmodf
     libc.src.math.fmodl
     libc.src.math.fmul
+    libc.src.math.fmull
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -422,6 +457,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fromfpx
     libc.src.math.fromfpxf
     libc.src.math.fromfpxl
+    libc.src.math.fsqrt
+    libc.src.math.fsqrtl
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -491,11 +530,16 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.rintf
     libc.src.math.rintl
     libc.src.math.round
+    libc.src.math.roundeven
+    libc.src.math.roundevenf
+    libc.src.math.roundevenl
     libc.src.math.roundf
     libc.src.math.roundl
     libc.src.math.scalbn
     libc.src.math.scalbnf
     libc.src.math.scalbnl
+    libc.src.math.setpayload
+    libc.src.math.setpayloadf
     libc.src.math.sin
     libc.src.math.sincos
     libc.src.math.sincosf
@@ -508,6 +552,11 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalorder
+    libc.src.math.totalorderf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -522,8 +571,11 @@ set(TARGET_LIBM_ENTRYPOINTS
 if(LIBC_TYPES_HAS_FLOAT128)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # math.h C23 _Float128 entrypoints
+    libc.src.math.canonicalizef128
     libc.src.math.ceilf128
     libc.src.math.copysignf128
+    libc.src.math.dmulf128
+    libc.src.math.dsqrtf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
@@ -538,9 +590,12 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.fminimum_numf128
     libc.src.math.fminimumf128
     libc.src.math.fmodf128
+    libc.src.math.fmulf128
     libc.src.math.frexpf128
     libc.src.math.fromfpf128
     libc.src.math.fromfpxf128
+    libc.src.math.fsqrtf128
+    libc.src.math.getpayloadf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
     libc.src.math.llogbf128
@@ -555,24 +610,61 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.nextafterf128
     libc.src.math.nextdownf128
     libc.src.math.nextupf128
+    libc.src.math.remquof128
     libc.src.math.rintf128
+    libc.src.math.roundevenf128
     libc.src.math.roundf128
     libc.src.math.scalbnf128
     libc.src.math.sqrtf128
+    libc.src.math.totalorderf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
   )
 endif()
 
+if(LIBC_COMPILER_HAS_FIXED_POINT)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # stdfix.h _Fract and _Accum entrypoints
+    libc.src.stdfix.abshk
+    libc.src.stdfix.abshr
+    libc.src.stdfix.absk
+    libc.src.stdfix.abslk
+    libc.src.stdfix.abslr
+    libc.src.stdfix.absr
+    libc.src.stdfix.exphk
+    libc.src.stdfix.expk
+    libc.src.stdfix.roundhk
+    libc.src.stdfix.roundhr
+    libc.src.stdfix.roundk
+    libc.src.stdfix.roundlk
+    libc.src.stdfix.roundlr
+    libc.src.stdfix.roundr
+    libc.src.stdfix.rounduhk
+    libc.src.stdfix.rounduhr
+    libc.src.stdfix.rounduk
+    libc.src.stdfix.roundulk
+    libc.src.stdfix.roundulr
+    libc.src.stdfix.roundur
+    libc.src.stdfix.sqrtuhk
+    libc.src.stdfix.sqrtuhr
+    libc.src.stdfix.sqrtuk
+    libc.src.stdfix.sqrtur
+    libc.src.stdfix.sqrtulr
+    libc.src.stdfix.uhksqrtus
+    libc.src.stdfix.uksqrtui
+  )
+endif()
+
 if(LLVM_LIBC_FULL_BUILD)
   list(APPEND TARGET_LIBC_ENTRYPOINTS
-    # compiler entrypoints (no corresponding header)
-    libc.src.compiler.__stack_chk_fail
-
     # assert.h entrypoints
     libc.src.assert.__assert_fail
 
+    # compiler entrypoints (no corresponding header)
+    libc.src.compiler.__stack_chk_fail
+
     # dirent.h entrypoints
     libc.src.dirent.closedir
     libc.src.dirent.dirfd
@@ -597,6 +689,12 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_attr_setguardsize
     libc.src.pthread.pthread_attr_setstack
     libc.src.pthread.pthread_attr_setstacksize
+    libc.src.pthread.pthread_condattr_destroy
+    libc.src.pthread.pthread_condattr_getclock
+    libc.src.pthread.pthread_condattr_getpshared
+    libc.src.pthread.pthread_condattr_init
+    libc.src.pthread.pthread_condattr_setclock
+    libc.src.pthread.pthread_condattr_setpshared
     libc.src.pthread.pthread_create
     libc.src.pthread.pthread_detach
     libc.src.pthread.pthread_equal
@@ -619,6 +717,23 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_mutexattr_setrobust
     libc.src.pthread.pthread_mutexattr_settype
     libc.src.pthread.pthread_once
+    libc.src.pthread.pthread_rwlock_clockrdlock
+    libc.src.pthread.pthread_rwlock_clockwrlock
+    libc.src.pthread.pthread_rwlock_destroy
+    libc.src.pthread.pthread_rwlock_init
+    libc.src.pthread.pthread_rwlock_rdlock
+    libc.src.pthread.pthread_rwlock_timedrdlock
+    libc.src.pthread.pthread_rwlock_timedwrlock
+    libc.src.pthread.pthread_rwlock_tryrdlock
+    libc.src.pthread.pthread_rwlock_trywrlock
+    libc.src.pthread.pthread_rwlock_unlock
+    libc.src.pthread.pthread_rwlock_wrlock
+    libc.src.pthread.pthread_rwlockattr_destroy
+    libc.src.pthread.pthread_rwlockattr_getkind_np
+    libc.src.pthread.pthread_rwlockattr_getpshared
+    libc.src.pthread.pthread_rwlockattr_init
+    libc.src.pthread.pthread_rwlockattr_setkind_np
+    libc.src.pthread.pthread_rwlockattr_setpshared
     libc.src.pthread.pthread_self
     libc.src.pthread.pthread_setname_np
     libc.src.pthread.pthread_setspecific
@@ -642,7 +757,6 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.fgetc
     libc.src.stdio.fgetc_unlocked
     libc.src.stdio.fgets
-    libc.src.stdio.fileno
     libc.src.stdio.flockfile
     libc.src.stdio.fopen
     libc.src.stdio.fopencookie
@@ -651,7 +765,9 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.fread
     libc.src.stdio.fread_unlocked
     libc.src.stdio.fseek
+    libc.src.stdio.fseeko
     libc.src.stdio.ftell
+    libc.src.stdio.ftello
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.fwrite_unlocked
@@ -672,9 +788,11 @@ if(LLVM_LIBC_FULL_BUILD)
     # stdlib.h entrypoints
     libc.src.stdlib._Exit
     libc.src.stdlib.abort
+    libc.src.stdlib.at_quick_exit
     libc.src.stdlib.atexit
     libc.src.stdlib.exit
     libc.src.stdlib.getenv
+    libc.src.stdlib.quick_exit
 
     # signal.h entrypoints
     libc.src.signal.kill
@@ -756,6 +874,10 @@ if(LLVM_LIBC_FULL_BUILD)
 
     # sys/select.h entrypoints
     libc.src.sys.select.select
+
+    # sys/socket.h entrypoints
+    libc.src.sys.socket.bind
+    libc.src.sys.socket.socket
   )
 endif()
 
diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt
index da203e9850603..0294f62bc2f7a 100644
--- a/libc/config/linux/riscv/headers.txt
+++ b/libc/config/linux/riscv/headers.txt
@@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.assert
     libc.include.ctype
     libc.include.dirent
+    libc.include.dlfcn
     libc.include.errno
     libc.include.fcntl
     libc.include.features
@@ -18,6 +19,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.setjmp
     libc.include.stdckdint
     libc.include.stdbit
+    libc.include.stdfix
     libc.include.stdio
     libc.include.stdlib
     libc.include.string
@@ -33,17 +35,17 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.arpa_inet
 
     libc.include.sys_auxv
-    # Disabled due to epoll_wait syscalls not being available on this platform.
-    # libc.include.sys_epoll
+    libc.include.sys_epoll
     libc.include.sys_ioctl
     libc.include.sys_mman
     libc.include.sys_prctl
-    libc.include.sys_random
     libc.include.sys_queue
+    libc.include.sys_random
     libc.include.sys_resource
     libc.include.sys_select
     libc.include.sys_socket
     libc.include.sys_stat
+    libc.include.sys_statvfs
     libc.include.sys_syscall
     libc.include.sys_time
     libc.include.sys_types
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 4d19a28f4a2b3..044852b49c75f 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -216,11 +216,14 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.scanf
     libc.src.stdio.snprintf
     libc.src.stdio.sprintf
+    libc.src.stdio.asprintf
     libc.src.stdio.sscanf
+    libc.src.stdio.vsscanf
     libc.src.stdio.vfprintf
     libc.src.stdio.vprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.vasprintf
 
     # sys/epoll.h entrypoints
     libc.src.sys.epoll.epoll_create
@@ -364,6 +367,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
@@ -382,6 +386,11 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cosf
     libc.src.math.coshf
     libc.src.math.cospif
+    libc.src.math.dfmal
+    libc.src.math.dmull
+    libc.src.math.dsqrtl
+    libc.src.math.daddl
+    libc.src.math.dsubl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
@@ -395,6 +404,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fabs
     libc.src.math.fabsf
     libc.src.math.fabsl
+    libc.src.math.fadd 
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.fdiml
@@ -437,6 +447,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmodf
     libc.src.math.fmodl
     libc.src.math.fmul
+    libc.src.math.fmull
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -446,6 +457,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fromfpx
     libc.src.math.fromfpxf
     libc.src.math.fromfpxl
+    libc.src.math.fsqrt
+    libc.src.math.fsqrtl
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -523,6 +538,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbn
     libc.src.math.scalbnf
     libc.src.math.scalbnl
+    libc.src.math.setpayload
+    libc.src.math.setpayloadf
     libc.src.math.sin
     libc.src.math.sincos
     libc.src.math.sincosf
@@ -535,6 +552,11 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalorder
+    libc.src.math.totalorderf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -552,6 +574,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.canonicalizef16
     libc.src.math.ceilf16
     libc.src.math.copysignf16
+    libc.src.math.expf16
     libc.src.math.f16add
     libc.src.math.f16addf
     libc.src.math.f16addl
@@ -561,6 +584,9 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.f16fma
     libc.src.math.f16fmaf
     libc.src.math.f16fmal
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
+    libc.src.math.f16mull
     libc.src.math.f16sqrt
     libc.src.math.f16sqrtf
     libc.src.math.f16sqrtl
@@ -622,6 +648,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
       libc.src.math.f16addf128
       libc.src.math.f16divf128
       libc.src.math.f16fmaf128
+      libc.src.math.f16mulf128
       libc.src.math.f16sqrtf128
       libc.src.math.f16subf128
     )
@@ -634,6 +661,12 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.canonicalizef128
     libc.src.math.ceilf128
     libc.src.math.copysignf128
+    libc.src.math.daddf128
+    libc.src.math.ddivf128
+    libc.src.math.dfmaf128
+    libc.src.math.dmulf128
+    libc.src.math.dsqrtf128
+    libc.src.math.dsubf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
@@ -648,9 +681,12 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.fminimum_numf128
     libc.src.math.fminimumf128
     libc.src.math.fmodf128
+    libc.src.math.fmulf128
     libc.src.math.frexpf128
     libc.src.math.fromfpf128
     libc.src.math.fromfpxf128
+    libc.src.math.fsqrtf128
+    libc.src.math.getpayloadf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
     libc.src.math.llogbf128
@@ -670,7 +706,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.roundevenf128
     libc.src.math.roundf128
     libc.src.math.scalbnf128
+    libc.src.math.setpayloadf128
     libc.src.math.sqrtf128
+    libc.src.math.totalorderf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
@@ -771,6 +810,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_mutexattr_setrobust
     libc.src.pthread.pthread_mutexattr_settype
     libc.src.pthread.pthread_once
+    libc.src.pthread.pthread_rwlock_clockrdlock
+    libc.src.pthread.pthread_rwlock_clockwrlock
     libc.src.pthread.pthread_rwlock_destroy
     libc.src.pthread.pthread_rwlock_init
     libc.src.pthread.pthread_rwlock_rdlock
diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt
index 8a52d80e1fbfb..0294f62bc2f7a 100644
--- a/libc/config/linux/x86_64/headers.txt
+++ b/libc/config/linux/x86_64/headers.txt
@@ -45,8 +45,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.sys_select
     libc.include.sys_socket
     libc.include.sys_stat
-    # statvfs is broken, will uncomment once it's fixed.
-    # libc.include.sys_statvfs
+    libc.include.sys_statvfs
     libc.include.sys_syscall
     libc.include.sys_time
     libc.include.sys_types
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index afc9ca87ff094..b7aac225ee055 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -118,6 +118,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.acoshf
     libc.src.math.asinf
     libc.src.math.asinhf
+    libc.src.math.atan2
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
@@ -132,6 +133,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cos
     libc.src.math.cosf
     libc.src.math.coshf
+    libc.src.math.daddl
+    libc.src.math.dfmal
+    libc.src.math.dsubl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.expf
@@ -144,6 +148,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fabs
     libc.src.math.fabsf
     libc.src.math.fabsl
+    libc.src.math.fadd
     libc.src.math.fdim
     libc.src.math.fdimf
     libc.src.math.fdiml
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index dfb35f6a6611a..950de0eee4c05 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,9 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
-* **"malloc" options**
-    - ``LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE``: Default size for the constinit freelist buffer used for the freelist malloc implementation (default 1o 1GB).
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
 * **"math" options**
     - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
 * **"printf" options**
@@ -49,6 +47,8 @@ to learn about the defaults for your platform and target.
 * **"scanf" options**
     - ``LIBC_CONF_SCANF_DISABLE_FLOAT``: Disable parsing floating point values in scanf and friends.
     - ``LIBC_CONF_SCANF_DISABLE_INDEX_MODE``: Disable index mode in the scanf format string.
+* **"setjmp" options**
+    - ``LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER``: Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved.
 * **"string" options**
     - ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled.
     - ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen.
diff --git a/libc/docs/date_and_time.rst b/libc/docs/date_and_time.rst
index 303dd3fa12df2..db77d2c5ba61a 100644
--- a/libc/docs/date_and_time.rst
+++ b/libc/docs/date_and_time.rst
@@ -22,7 +22,7 @@ Implementation Status
 
   - `linux-aarch32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/arm/entrypoints.txt>`_
 
-  - `linux-riscv64 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv64/entrypoints.txt>`_
+  - `linux-riscv and linux-riscv32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv/entrypoints.txt>`_
 
 * To check date and time functions enabled for Windows:
 
@@ -46,64 +46,64 @@ Implementation Status
 
   - barebone-riscv32 - to be added
 
-+---------------------+---------------------------------------+-------------------+-------------------+-------------------+-------------------+
-| <Func>              |  Linux                                | Windows           | MacOS             | Embedded          | GPU               |
-|                     +---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-|                     | x86_64  | aarch64 | aarch32 | riscv64 | x86_64  | aarch64 | x86_64  | aarch64 | aarch32 | riscv32 | AMD     | nVidia  |
-+=====================+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+
-| asctime             | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| asctime_r           | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock               | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_getcpuclockid |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_getres        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_gettime       | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_nanosleep     |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_settime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| ctime               |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| ctime_r             |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| difftime            | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| getdate             |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gettimeofday        | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gmtime              | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gmtime_r            | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| localtime           |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| localtime_r         |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| mktime              | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| nanosleep           | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| strftime            |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| strptime            |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| time                | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_create        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_delete        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_gettime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_getoverrun    |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_settime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| tzset               |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
++---------------------+-----------------------------------------------+-------------------+-------------------+-------------------+-------------------+
+| <Func>              |  Linux                                        | Windows           | MacOS             | Embedded          | GPU               |
+|                     +---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+|                     | x86_64  | aarch64 | aarch32 | riscv64/riscv32 | x86_64  | aarch64 | x86_64  | aarch64 | aarch32 | riscv32 | AMD     | nVidia  |
++=====================+=========+=========+=========+=================+=========+=========+=========+=========+=========+=========+=========+=========+
+| asctime             | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| asctime_r           | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock               | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_getcpuclockid |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_getres        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_gettime       | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_nanosleep     |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_settime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| ctime               |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| ctime_r             |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| difftime            | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| getdate             |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gettimeofday        | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gmtime              | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gmtime_r            | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| localtime           |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| localtime_r         |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| mktime              | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nanosleep           | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| strftime            |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| strptime            |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| time                | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_create        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_delete        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_gettime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_getoverrun    |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_settime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| tzset               |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index ed6127e403ec7..598c8b8c8a6e8 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -1,120 +1,200 @@
 Generating Public and Internal headers
 ======================================
 
-.. warning::
-  This page is severely out of date. Much of the information it contains may be
-  incorrect. Please only remove this warning once the page has been updated.
+This is a new implementation of the previous libc header generator. The old
+header generator (libc-hdrgen aka "Headergen") was based on TableGen, which
+created an awkward dependency on the rest of LLVM for our build system. By
+creating a new standalone Headergen we can eliminate these dependencies for
+easier cross compatibility.
 
-Other libc implementations make use of preprocessor macro tricks to make header
-files platform agnostic. When macros aren't suitable, they rely on build
-system tricks to pick the right set of files to compile and export. While these
-approaches have served them well, parts of their systems have become extremely
-complicated making it hard to modify, extend or maintain. To avoid these
-problems in llvm-libc, we use a header generation mechanism. The mechanism is
-driven by a *header configuration language*.
+There are 3 main components of the new Headergen. The first component are the
+YAML files that contain all the function header information and are separated by
+header specification and standard. The second component are the classes that are
+created for each component of the function header: macros, enumerations, types,
+function, arguments, and objects. The third component is the Python script that
+uses the class representation to deserialize YAML files into its specific
+components and then reserializes the components into the function header. The
+Python script also combines the generated header content with header definitions
+and extra macro and type inclusions from the .h.def file.
 
-Header Configuration Language
------------------------------
 
-Header configuration language consists of few special *commands*. The header
-generation mechanism takes an input file, which has an extension of
-``.h.def``, and produces a header file with ``.h`` extension. The header
-configuration language commands are listed in the input ``.h.def`` file. While
-reading a ``.h.def`` file, the header generation tool does two things:
+Instructions
+------------
 
-1. Copy the lines not containing commands as is into the output ``.h`` file.
-2. Replace the line on which a command occurs with some other text as directed
-   by the command. The replacement text can span multiple lines.
+Required Versions:
+  - Python Version: 3.8
+  - PyYAML Version: 5.1
 
-Command syntax
-~~~~~~~~~~~~~~
+1. Keep full-build mode on when building, otherwise headers will not be
+   generated.
+2. Once the build is complete, enter in the command line within the build
+   directory ``ninja check-newhdrgen`` to ensure that the integration tests are
+   passing.
+3. Then enter in the command line ``ninja libc`` to generate headers. Headers
+   will be in ``build/projects/libc/include`` or ``build/libc/include`` in a
+   runtime build. Sys spec headers will be located in
+   ``build/projects/libc/include/sys``.
 
-A command should be listed on a line by itself, and should not span more than
-one line. The first token to appear on the line is the command name prefixed
-with ``%%``. For example, a line with the ``include_file`` command should start
-with ``%%include_file``. There can be indentation spaces before the ``%%``
-prefix.
 
-Most commands typically take arguments. They are listed as a comma separated
-list of named identifiers within parenthesis, similar to the C function call
-syntax. Before performing the action corresponding to the command, the header
-generator replaces the arguments with concrete values.
+New Headergen is turned on by default, but if you want to use old Headergen,
+you can include this statement when building: ``-DLIBC_USE_NEW_HEADER_GEN=OFF``
 
-Argument Syntax
-~~~~~~~~~~~~~~~
+To add a function to the YAML files, you can either manually enter it in the
+YAML file corresponding to the header it belongs to or add it through the
+command line.
 
-Arguments are named indentifiers but prefixed with ``$`` and enclosed in ``{``
-and ``}``. For example, ``${path_to_constants}``.
+To add through the command line:
 
-Comments
-~~~~~~~~
+1. Make sure you are in the llvm-project directory.
 
-There can be cases wherein one wants to add comments in the .h.def file but
-does not want them to be copied into the generated header file. Such comments
-can be added by beginning the comment lines with the ``<!>`` prefix. Currently,
-comments have to be on lines of their own. That is, they cannot be suffixes like
-this:
+2. Enter in the command line:
 
-```
-%%include_file(a/b/c) <!> Path to c in b of a.  !!! WRONG SYNTAX
-```
+   .. code-block:: none
 
-Available Commands
-------------------
+     python3 libc/newhdrgen/yaml_to_classes.py
+     libc/newhdrgen/yaml/[yaml_file.yaml] --add_function "<return_type>" <function_name> "<function_arg1, function_arg2>" <standard> <guard> <attribute>
 
-Sub-sections below describe the commands currently available. Under each command
-is the description of the arguments to the command, and the action taken by the
-header generation tool when processing a command.
+   Example:
 
-``include_file``
-~~~~~~~~~~~~~~~~
+   .. code-block:: none
 
-This is a replacement command which should be listed in an input ``.h.def``
-file.
+      python3 libc/newhdrgen/yaml_to_classes.py
+      libc/newhdrgen/yaml/ctype.yaml --add_function "char" example_function
+      "int, void, const void" stdc example_float example_attribute
 
-Arguments
+   Keep in mind only the return_type and arguments have quotes around them. If
+   you do not have any guards or attributes you may enter "null" for both.
 
-  * **path argument** - An argument representing a path to a file. The file
-    should have an extension of ``.h.inc``.
+3. Check the YAML file that the added function is present. You will also get a
+   generated header file with the new addition in the newhdrgen directory to
+   examine.
 
-Action
 
-  This command instructs that the line on which the command appears should be
-  replaced by the contents of the file whose path is passed as argument to the
-  command.
+Testing
+-------
 
-``begin``
-~~~~~~~~~
+New Headergen has an integration test that you may run once you have configured
+your CMake within the build directory. In the command line, enter the following:
+``ninja check-newhdrgen``. The integration test is one test that ensures the
+process of YAML to classes to generate headers works properly. If there are any
+new additions on formatting headers, make sure the test is updated with the
+specific addition.
 
-This is not a replacement command. It is an error to list it in the input
-``.h.def`` file. It is normally listed in the files included by the
-``include_file`` command (the ``.h.inc`` files). A common use of this command it
-mark the beginning of what is to be included. This prevents copying items like
-license headers into the generated header file.
+Integration Test can be found in: ``libc/newhdrgen/tests/test_integration.py``
 
-Arguments
+File to modify if adding something to formatting:
+``libc/newhdrgen/tests/expected_output/test_header.h``
 
-  None.
 
-Action
+Common Errors
+-------------
+1. Missing function specific component
 
-  The header generator will only include content starting from the line after the
-  line on which this command is listed.
+   Example:
 
-``public_api``
-~~~~~~~~~~~~~~
+   .. code-block:: none
 
-This is a replacement command which should be listed in an input ``.h.def``
-file. The header file generator will replace this command with the public API of
-the target platform. See the build system document for more information on the
-relevant build rules. Also, see "Mechanics of public_api" to learn the mechanics
-of how the header generator replaces this command with the public API.
+      "/llvm-project/libc/newhdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
 
-Arguments
+   If you receive this error or any error pertaining to
+   ``function_data[function_specific_component]`` while building the headers
+   that means the function specific component is missing within the YAML files.
+   Through the call stack, you will be able to find the header file which has
+   the issue. Ensure there is no missing function specific component for that
+   YAML header file.
 
-  None.
+2. CMake Error: require argument to be specified
 
-Action
+   Example:
 
-  The header generator will replace this command with the public API to be exposed
-  from the generated header file.
+   .. code-block:: none
+
+     CMake Error at:
+     /llvm-project/libc/cmake/modules/LLVMLibCHeaderRules.cmake:86 (message):
+     'add_gen_hdr2' rule requires GEN_HDR to be specified.
+     Call Stack (most recent call first):
+     /llvm-project/libc/include/CMakeLists.txt:22 (add_gen_header2)
+     /llvm-project/libc/include/CMakeLists.txt:62 (add_header_macro)
+
+   If you receive this error, there is a missing YAML file, h_def file, or
+   header name within the ``libc/include/CMakeLists.txt``. The last line in the
+   error call stack will point to the header where there is a specific component
+   missing. Ensure the correct style and required files are present:
+
+   | ``[header_name]``
+   | ``[../libc/newhdrgen/yaml/[yaml_file.yaml]``
+   | ``[header_name.h.def]``
+   | ``[header_name.h]``
+   | ``DEPENDS``
+   |   ``{Necessary Depend Files}``
+
+3. Command line: expected arguments
+
+   Example:
+
+   .. code-block:: none
+
+     usage: yaml_to_classes.py [-h] [--output_dir OUTPUT_DIR] [--h_def_file H_DEF_FILE]
+     [--add_function RETURN_TYPE NAME ARGUMENTS STANDARDS GUARD ATTRIBUTES]
+     [--e ENTRY_POINTS] [--export-decls]
+     yaml_file
+     yaml_to_classes.py:
+     error: argument --add_function: expected 6 arguments
+
+   In the process of adding a function, you may run into an issue where the
+   command line is requiring more arguments than what you currently have. Ensure
+   that all components of the new function are filled. Even if you do not have a
+   guard or attribute, make sure to put null in those two areas.
+
+4. Object has no attribute
+
+   Example:
+
+   .. code-block:: none
+
+     File "/llvm-project/libc/newhdrgen/header.py", line 60, in __str__ for
+     function in self.functions: AttributeError: 'HeaderFile' object has no
+     attribute 'functions'
+
+   When running ``ninja libc`` in the build directory to generate headers you
+   may receive the error above. Essentially this means that in
+   ``libc/newhdrgen/header.py`` there is a missing attribute named functions.
+   Make sure all function components are defined within this file and there are
+   no missing functions to add these components.
+
+5. Unknown type name
+
+   Example:
+
+   .. code-block:: none
+
+     /llvm-project/build/projects/libc/include/sched.h:20:25: error: unknown type
+     name 'size_t'; did you mean 'time_t'?
+     20 | int_sched_getcpucount(size_t, const cpu_set_t*) __NOEXCEPT
+      |           ^
+     /llvm-project/build/projects/libc/include/llvm-libc-types/time_t.h:15:24:
+     note: 'time_t' declared here
+     15 | typedef __INT64_TYPE__ time_t;
+     |                    ^
+
+   During the header generation process errors like the one above may occur
+   because there are missing types for a specific header file. Check the YAML
+   file corresponding to the header file and make sure all the necessary types
+   that are being used are input into the types as well. Delete the specific
+   header file from the build folder and re-run ``ninja libc`` to ensure the
+   types are being recognized.
+
+6. Test Integration Errors
+
+   Sometimes the integration test will fail but that
+   still means the process is working unless the comparison between the output
+   and expected_output is not showing. If that is the case make sure in
+   ``libc/newhdrgen/tests/test_integration.py`` there are no missing arguments
+   that run through the script.
+
+   If the integration tests are failing due to mismatching of lines or small
+   errors in spacing that is nothing to worry about. If this is happening while
+   you are making a new change to the formatting of the headers, then
+   ensure the expected output file
+   ``libc/newhdrgen/tests/expected_output/test_header.h`` has the changes you
+   are applying.
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index 3faae3134ce2a..9f50545d745d6 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -93,3 +93,8 @@ direction in this case.
 Non-const Constant Return Values
 --------------------------------
 Some libc functions, like ``dlerror()``, return ``char *`` instead of ``const char *`` and then tell the caller they promise not to to modify this value. Any modification of this value is undefined behavior.
+
+Unrecognized ``clockid_t`` values for ``pthread_rwlock_clock*`` APIs
+----------------------------------------------------------------------
+POSIX.1-2024 only demands support for ``CLOCK_REALTIME`` and ``CLOCK_MONOTONIC``. Currently,
+as in LLVM libc, if other clock ids are used, they will be treated as monotonic clocks.
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index d3e64c6d42431..60498e348395a 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -151,25 +151,6 @@ Build overview
 Once installed, the GPU build will create several files used for different
 targets. This section will briefly describe their purpose.
 
-**lib/<host-triple>/libcgpu-amdgpu.a or lib/libcgpu-amdgpu.a**
-  A static library containing fat binaries supporting AMD GPUs. These are built
-  using the support described in the `clang documentation
-  <https://clang.llvm.org/docs/OffloadingDesign.html>`_. These are intended to
-  be static libraries included natively for offloading languages like CUDA, HIP,
-  or OpenMP. This implements the standard C library.
-
-**lib/<host-triple>/libmgpu-amdgpu.a or lib/libmgpu-amdgpu.a**
-  A static library containing fat binaries that implements the standard math
-  library for AMD GPUs.
-
-**lib/<host-triple>/libcgpu-nvptx.a or lib/libcgpu-nvptx.a**
-  A static library containing fat binaries that implement the standard C library
-  for NVIDIA GPUs.
-
-**lib/<host-triple>/libmgpu-nvptx.a or lib/libmgpu-nvptx.a**
-  A static library containing fat binaries that implement the standard math
-  library for NVIDIA GPUs.
-
 **include/<target-triple>**
   The include directory where all of the generated headers for the target will
   go. These definitions are strictly for the GPU when being targeted directly.
diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst
index 0d026e8c04215..5ef298a2ba58f 100644
--- a/libc/docs/gpu/support.rst
+++ b/libc/docs/gpu/support.rst
@@ -70,6 +70,7 @@ strcoll        |check|
 strcpy         |check|
 strcspn        |check|
 strdup         |check|
+strerror       |check|
 strlcat        |check|
 strlcpy        |check|
 strlen         |check|
@@ -233,10 +234,11 @@ printf         |check|    |check|
 vprintf        |check|    |check|
 fprintf        |check|    |check|
 vfprintf       |check|    |check|
-sprintf        |check|    |check|
-snprintf       |check|    |check|
-vsprintf       |check|    |check|
-vsnprintf      |check|    |check|
+sprintf        |check|
+snprintf       |check|
+vsprintf       |check|
+vsnprintf      |check|
+sscanf         |check|
 putchar        |check|    |check|
 fclose         |check|    |check|
 fopen          |check|    |check|
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index d5ad4c7a0368d..4034c04867c99 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -34,16 +34,17 @@ described in the `clang documentation
 by the OpenMP toolchain, but is currently opt-in for the CUDA and HIP toolchains
 through the ``--offload-new-driver``` and ``-fgpu-rdc`` flags.
 
-The installation should contain a static library called ``libcgpu-amdgpu.a`` or
-``libcgpu-nvptx.a`` depending on which GPU architectures your build targeted.
-These contain fat binaries compatible with the offloading toolchain such that
-they can be used directly.
+In order or link the GPU runtime, we simply pass this library to the embedded 
+device linker job. This can be done using the ``-Xoffload-linker`` option, which 
+forwards an argument to a ``clang`` job used to create the final GPU executable. 
+The toolchain should pick up the C libraries automatically in most cases, so 
+this shouldn't be necessary.
 
 .. code-block:: sh
 
-  $> clang openmp.c -fopenmp --offload-arch=gfx90a -lcgpu-amdgpu
-  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -lcgpu-nvptx
-  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -lcgpu-amdgpu
+  $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc
+  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
+  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
 
 This will automatically link in the needed function definitions if they were
 required by the user's application. Normally using the ``-fgpu-rdc`` option
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 205d14946535e..defd075d10997 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -77,7 +77,7 @@ Implementation Status
 
   - `linux-aarch32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/arm/entrypoints.txt>`_
 
-  - `linux-riscv64 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv64/entrypoints.txt>`_
+  - `linux-riscv and linux-riscv32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv/entrypoints.txt>`_
 
 * To check math functions enabled for Windows:
 
@@ -114,15 +114,15 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | copysign         | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.11.1              | F.10.8.1                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dadd             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.1              | F.10.11                    |
+| dadd             | N/A              | N/A             | |check|                | N/A                  | |check|\*              | 7.12.14.1              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| ddiv             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.4              | F.10.11                    |
+| ddiv             | N/A              | N/A             |                        | N/A                  | |check|\*              | 7.12.14.4              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dfma             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.5              | F.10.11                    |
+| dfma             | N/A              | N/A             | |check|                | N/A                  | |check|\*             | 7.12.14.5              | F.10.11                     |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dmul             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.3              | F.10.11                    |
+| dmul             | N/A              | N/A             | |check|                | N/A                  | |check|\*              | 7.12.14.3              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dsub             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.2              | F.10.11                    |
+| dsub             | N/A              | N/A             | |check|                | N/A                  | |check|\*              | 7.12.14.2              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | f16add           | |check|\*        | |check|\*       | |check|\*              | N/A                  | |check|                | 7.12.14.1              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -130,11 +130,13 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | f16fma           | |check|\*        | |check|\*       | |check|\*              | N/A                  | |check|                | 7.12.14.5              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
+| f16mul           | |check|\*        | |check|\*       | |check|\*              | N/A                  | |check|                | 7.12.14.5              | F.10.11                    |
++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | f16sub           | |check|\*        | |check|\*       | |check|\*              | N/A                  | |check|                | 7.12.14.2              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fabs             | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.7.3               | F.10.4.3                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fadd             | N/A              |                 |                        | N/A                  |                        | 7.12.14.1              | F.10.11                    |
+| fadd             | N/A              | |check|         | |check|                | N/A                  | |check|                | 7.12.14.1              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fdim             | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.12.1              | F.10.9.1                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -166,7 +168,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fmod             | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.10.1              | F.10.7.1                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fmul             | N/A              | |check|         |                        | N/A                  |                        | 7.12.14.3              | F.10.11                    |
+| fmul             | N/A              | |check|         | |check|                | N/A                  | |check|\*              | 7.12.14.3              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | frexp            | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.6.7               | F.10.3.7                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -176,7 +178,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fsub             | N/A              |                 |                        | N/A                  |                        | 7.12.14.2              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| getpayload       |                  |                 |                        | |check|              |                        | F.10.13.1              | N/A                        |
+| getpayload       | |check|          | |check|         |                        | |check|              | |check|                | F.10.13.1              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | ilogb            | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.6.8               | F.10.3.8                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -222,13 +224,13 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | scalbn           | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.6.19              | F.10.3.19                  |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| setpayload       |                  |                 |                        | |check|              |                        | F.10.13.2              | N/A                        |
+| setpayload       | |check|          | |check|         |                        | |check|              | |check|                | F.10.13.2              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | setpayloadsig    |                  |                 |                        | |check|              |                        | F.10.13.3              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| totalorder       |                  |                 |                        | |check|              |                        | F.10.12.1              | N/A                        |
+| totalorder       | |check|          | |check|         |                        | |check|              | |check|                | F.10.12.1              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| totalordermag    |                  |                 |                        | |check|              |                        | F.10.12.2              | N/A                        |
+| totalordermag    | |check|          | |check|         | |check|                | |check|              | |check|                | F.10.12.2              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | trunc            | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.9.9               | F.10.6.9                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -258,7 +260,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | atan      | |check|          |                 |                        |                      |                        | 7.12.4.3               | F.10.1.3                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| atan2     | |check|          |                 |                        |                      |                        | 7.12.4.4               | F.10.1.4                   |
+| atan2     | |check|          | 1 ULP           |                        |                      |                        | 7.12.4.4               | F.10.1.4                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | atan2pi   |                  |                 |                        |                      |                        | 7.12.4.11              | F.10.1.11                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -276,13 +278,13 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | cospi     | |check|          |                 |                        |                      |                        | 7.12.4.12              | F.10.1.12                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dsqrt     | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.6              | F.10.11                    |
+| dsqrt     | N/A              | N/A             |   |check|              | N/A                  |       |check|\*        | 7.12.14.6              | F.10.11                    |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | erf       | |check|          |                 |                        |                      |                        | 7.12.8.1               | F.10.5.1                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | erfc      |                  |                 |                        |                      |                        | 7.12.8.2               | F.10.5.2                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp       | |check|          | |check|         |                        |                      |                        | 7.12.6.1               | F.10.3.1                   |
+| exp       | |check|          | |check|         |                        | |check|              |                        | 7.12.6.1               | F.10.3.1                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp10     | |check|          | |check|         |                        |                      |                        | 7.12.6.2               | F.10.3.2                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -298,7 +300,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | f16sqrt   | |check|\*        | |check|\*       | |check|\*              | N/A                  | |check|                | 7.12.14.6              | F.10.11                    |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fsqrt     | N/A              |                 |                        | N/A                  |                        | 7.12.14.6              | F.10.11                    |
+| fsqrt     | N/A              | |check|         |  |check|               | N/A                  | |check|\*              | 7.12.14.6              | F.10.11                    |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | hypot     | |check|          | |check|         |                        |                      |                        | 7.12.7.4               | F.10.4.4                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt
index 816691b4bd440..e2dcecca7f7df 100644
--- a/libc/fuzzing/CMakeLists.txt
+++ b/libc/fuzzing/CMakeLists.txt
@@ -3,7 +3,7 @@ add_custom_target(libc-fuzzer)
 
 add_subdirectory(__support)
 # TODO(#85680): Re-enable math fuzzing after headers are sorted out
-# add_subdirectory(math)
+add_subdirectory(math)
 add_subdirectory(stdlib)
 add_subdirectory(stdio)
 add_subdirectory(string)
diff --git a/libc/fuzzing/math/CMakeLists.txt b/libc/fuzzing/math/CMakeLists.txt
index 6990a04922a5c..31336fa3e49f3 100644
--- a/libc/fuzzing/math/CMakeLists.txt
+++ b/libc/fuzzing/math/CMakeLists.txt
@@ -61,3 +61,30 @@ add_libc_fuzzer(
     libc.src.math.nextafterf
     libc.src.math.nextafterl
 )
+
+add_libc_fuzzer(
+  sin_fuzz
+  NEED_MPFR
+  SRCS
+    sin_fuzz.cpp
+  DEPENDS
+    libc.src.math.sin
+)
+
+add_libc_fuzzer(
+  cos_fuzz
+  NEED_MPFR
+  SRCS
+    cos_fuzz.cpp
+  DEPENDS
+    libc.src.math.cos
+)
+
+add_libc_fuzzer(
+  tan_fuzz
+  NEED_MPFR
+  SRCS
+    tan_fuzz.cpp
+  DEPENDS
+    libc.src.math.tan
+)
diff --git a/libc/fuzzing/math/RemQuoDiff.h b/libc/fuzzing/math/RemQuoDiff.h
index 84a6a24ce5271..6fdc02e6e6a2f 100644
--- a/libc/fuzzing/math/RemQuoDiff.h
+++ b/libc/fuzzing/math/RemQuoDiff.h
@@ -31,21 +31,22 @@ void RemQuoDiff(RemQuoFunc<T> func1, RemQuoFunc<T> func2, const uint8_t *data,
   T remainder1 = func1(x, y, &q1);
   T remainder2 = func2(x, y, &q2);
 
-  if (isnan(remainder1)) {
-    if (!isnan(remainder2))
+  LIBC_NAMESPACE::fputil::FPBits<T> bits1(remainder1);
+  LIBC_NAMESPACE::fputil::FPBits<T> bits2(remainder2);
+
+  if (bits1.is_nan()) {
+    if (!bits2.is_nan())
       __builtin_trap();
     return;
   }
 
-  if (isinf(remainder2) != isinf(remainder1))
+  if (bits1.is_inf() != bits2.is_inf())
     __builtin_trap();
 
   // Compare only the 3 LS bits of the quotient.
   if ((q1 & 0x7) != (q2 & 0x7))
     __builtin_trap();
 
-  LIBC_NAMESPACE::fputil::FPBits<T> bits1(remainder1);
-  LIBC_NAMESPACE::fputil::FPBits<T> bits2(remainder2);
   if (bits1.uintval() != bits2.uintval())
     __builtin_trap();
 }
diff --git a/libc/fuzzing/math/cos_fuzz.cpp b/libc/fuzzing/math/cos_fuzz.cpp
new file mode 100644
index 0000000000000..3e4ad659fd3a5
--- /dev/null
+++ b/libc/fuzzing/math/cos_fuzz.cpp
@@ -0,0 +1,40 @@
+//===-- cos_fuzz.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc cos implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cos.h"
+#include <math.h>
+#include <mpfr.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const double x) {
+  // remove NaN and inf as preconditions
+  if (isnan(x))
+    return 0;
+  if (isinf(x))
+    return 0;
+  // signed zeros already tested in unit tests
+  if (signbit(x) && x == 0.0)
+    return 0;
+  mpfr_t input;
+  mpfr_init2(input, 53);
+  mpfr_set_d(input, x, MPFR_RNDN);
+  int output = mpfr_cos(input, input, MPFR_RNDN);
+  mpfr_subnormalize(input, output, MPFR_RNDN);
+  double to_compare = mpfr_get_d(input, MPFR_RNDN);
+
+  double result = LIBC_NAMESPACE::cos(x);
+
+  if (result != to_compare)
+    __builtin_trap();
+
+  mpfr_clear(input);
+  return 0;
+}
diff --git a/libc/fuzzing/math/nextafter_differential_fuzz.cpp b/libc/fuzzing/math/nextafter_differential_fuzz.cpp
index 2e6d58995a31f..e48ef51d54807 100644
--- a/libc/fuzzing/math/nextafter_differential_fuzz.cpp
+++ b/libc/fuzzing/math/nextafter_differential_fuzz.cpp
@@ -17,6 +17,8 @@
 #include "src/math/nextafterf.h"
 #include "src/math/nextafterl.h"
 
+#include <math.h>
+
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   TwoInputSingleOutputDiff<float, float>(&LIBC_NAMESPACE::nextafterf,
                                          &::nextafterf, data, size);
diff --git a/libc/fuzzing/math/sin_fuzz.cpp b/libc/fuzzing/math/sin_fuzz.cpp
new file mode 100644
index 0000000000000..009b9f2080298
--- /dev/null
+++ b/libc/fuzzing/math/sin_fuzz.cpp
@@ -0,0 +1,40 @@
+//===-- sin_fuzz.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc sin implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sin.h"
+#include <math.h>
+#include <mpfr.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const double x) {
+  // remove NaN and inf as preconditions
+  if (isnan(x))
+    return 0;
+  if (isinf(x))
+    return 0;
+  // signed zeros already tested in unit tests
+  if (signbit(x) && x == 0.0)
+    return 0;
+  mpfr_t input;
+  mpfr_init2(input, 53);
+  mpfr_set_d(input, x, MPFR_RNDN);
+  int output = mpfr_sin(input, input, MPFR_RNDN);
+  mpfr_subnormalize(input, output, MPFR_RNDN);
+  double to_compare = mpfr_get_d(input, MPFR_RNDN);
+
+  double result = LIBC_NAMESPACE::sin(x);
+
+  if (result != to_compare)
+    __builtin_trap();
+
+  mpfr_clear(input);
+  return 0;
+}
diff --git a/libc/fuzzing/math/tan_fuzz.cpp b/libc/fuzzing/math/tan_fuzz.cpp
new file mode 100644
index 0000000000000..700ebb80974c8
--- /dev/null
+++ b/libc/fuzzing/math/tan_fuzz.cpp
@@ -0,0 +1,40 @@
+//===-- tan_fuzz.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc tan implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tan.h"
+#include <math.h>
+#include <mpfr.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const double x) {
+  // remove NaN and inf as preconditions
+  if (isnan(x))
+    return 0;
+  if (isinf(x))
+    return 0;
+  // signed zeros already tested in unit tests
+  if (signbit(x) && x == 0.0)
+    return 0;
+  mpfr_t input;
+  mpfr_init2(input, 53);
+  mpfr_set_d(input, x, MPFR_RNDN);
+  int output = mpfr_tan(input, input, MPFR_RNDN);
+  mpfr_subnormalize(input, output, MPFR_RNDN);
+  double to_compare = mpfr_get_d(input, MPFR_RNDN);
+
+  double result = LIBC_NAMESPACE::tan(x);
+
+  if (result != to_compare)
+    __builtin_trap();
+
+  mpfr_clear(input);
+  return 0;
+}
diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt
index 204bc619318da..9b3298cfc55a7 100644
--- a/libc/fuzzing/stdlib/CMakeLists.txt
+++ b/libc/fuzzing/stdlib/CMakeLists.txt
@@ -6,6 +6,14 @@ add_libc_fuzzer(
     libc.src.stdlib.qsort
 )
 
+add_libc_fuzzer(
+  heap_sort_fuzz
+  SRCS
+    heap_sort_fuzz.cpp
+  DEPENDS
+    libc.src.stdlib.qsort_util
+)
+
 add_libc_fuzzer(
   atof_differential_fuzz
   SRCS
diff --git a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
new file mode 100644
index 0000000000000..876c5f9975d4d
--- /dev/null
+++ b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
@@ -0,0 +1,48 @@
+//===-- heap_sort_fuzz.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc heap_sort implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/heap_sort.h"
+#include <stdint.h>
+
+static int int_compare(const void *l, const void *r) {
+  int li = *reinterpret_cast<const int *>(l);
+  int ri = *reinterpret_cast<const int *>(r);
+  if (li == ri)
+    return 0;
+  if (li > ri)
+    return 1;
+  return -1;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+
+  const size_t array_size = size / sizeof(int);
+  if (array_size == 0)
+    return 0;
+
+  int *array = new int[array_size];
+  const int *data_as_int = reinterpret_cast<const int *>(data);
+  for (size_t i = 0; i < array_size; ++i)
+    array[i] = data_as_int[i];
+
+  auto arr = LIBC_NAMESPACE::internal::Array(
+      reinterpret_cast<uint8_t *>(array), array_size, sizeof(int), int_compare);
+
+  LIBC_NAMESPACE::internal::heap_sort(arr);
+
+  for (size_t i = 0; i < array_size - 1; ++i)
+    if (array[i] > array[i + 1])
+      __builtin_trap();
+
+  delete[] array;
+  return 0;
+}
diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index c158162ba6238..503b55978e2cb 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -118,9 +118,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
       __builtin_trap();
     // If any result is NaN, all of them should be NaN. We can't use the usual
     // comparisons because NaN != NaN.
-    if (isnan(float_result) ^ isnan(strtof_result))
+    if (FPBits<float>(float_result).is_nan() !=
+        FPBits<float>(strtof_result).is_nan())
       __builtin_trap();
-    if (!isnan(float_result) && float_result != strtof_result)
+    if (!FPBits<float>(float_result).is_nan() && float_result != strtof_result)
       __builtin_trap();
     mpfr_clear(mpfr_float);
   }
@@ -136,10 +137,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
     ptrdiff_t strtod_strlen = out_ptr - str_ptr;
     if (result_strlen != strtod_strlen)
       __builtin_trap();
-    if (isnan(double_result) ^ isnan(strtod_result) ||
-        isnan(double_result) ^ isnan(atof_result))
+    if (FPBits<double>(double_result).is_nan() !=
+            FPBits<double>(strtod_result).is_nan() ||
+        FPBits<double>(double_result).is_nan() !=
+            FPBits<double>(atof_result).is_nan())
       __builtin_trap();
-    if (!isnan(double_result) &&
+    if (!FPBits<double>(double_result).is_nan() &&
         (double_result != strtod_result || double_result != atof_result))
       __builtin_trap();
     mpfr_clear(mpfr_double);
@@ -156,9 +159,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
     ptrdiff_t strtold_strlen = out_ptr - str_ptr;
     if (result_strlen != strtold_strlen)
       __builtin_trap();
-    if (isnan(long_double_result) ^ isnan(strtold_result))
+    if (FPBits<long double>(long_double_result).is_nan() ^
+        FPBits<long double>(strtold_result).is_nan())
       __builtin_trap();
-    if (!isnan(long_double_result) && long_double_result != strtold_result)
+    if (!FPBits<long double>(long_double_result).is_nan() &&
+        long_double_result != strtold_result)
       __builtin_trap();
     mpfr_clear(mpfr_long_double);
   }
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index da640e8c0f70c..a2fad9b473ed7 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -32,6 +32,15 @@ add_proxy_header_library(
     libc.include.math
 )
 
+add_proxy_header_library(
+  math_function_macros
+  HDRS
+    math_function_macros.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-macros.math_function_macros
+    libc.include.math
+)
+
 add_proxy_header_library(
   errno_macros
   HDRS
diff --git a/libc/hdr/math_function_macros.h b/libc/hdr/math_function_macros.h
new file mode 100644
index 0000000000000..48dec8260ef89
--- /dev/null
+++ b/libc/hdr/math_function_macros.h
@@ -0,0 +1,27 @@
+//===-- Definition of macros from math.h ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_MATH_FUNCTION_MACROS_H
+#define LLVM_LIBC_HDR_MATH_FUNCTION_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/math-function-macros.h"
+
+#else // Overlay mode
+
+// GCC will include CXX headers when __cplusplus is defined. This behavior
+// can be suppressed by defining _GLIBCXX_INCLUDE_NEXT_C_HEADERS.
+#if defined(__GNUC__) && !defined(__clang__)
+#define _GLIBCXX_INCLUDE_NEXT_C_HEADERS
+#endif
+#include <math.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_MATH_MACROS_H
diff --git a/libc/hdr/math_macros.h b/libc/hdr/math_macros.h
index d5a823723747c..863451123f3f8 100644
--- a/libc/hdr/math_macros.h
+++ b/libc/hdr/math_macros.h
@@ -11,7 +11,6 @@
 
 #ifdef LIBC_FULL_BUILD
 
-#include "include/llvm-libc-macros/math-function-macros.h"
 #include "include/llvm-libc-macros/math-macros.h"
 
 #else // Overlay mode
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 2cf7206f3a625..37cae19123318 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -17,18 +17,41 @@ add_header(
     __llvm-libc-common.h
 )
 
-add_gen_header(
+macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS)
+  if (LIBC_USE_NEW_HEADER_GEN)
+    add_gen_header2(
+      ${TARGET_NAME}
+      YAML_FILE ${YAML_FILE}
+      DEF_FILE ${DEF_FILE}
+      GEN_HDR ${GEN_HDR}
+      ${DEPENDS}
+      ${ARGN}
+    )
+  else()
+    add_gen_header(
+      ${TARGET_NAME}
+      DEF_FILE ${DEF_FILE}
+      GEN_HDR ${GEN_HDR}
+      ${DEPENDS}
+      ${ARGN}
+    )
+  endif()
+endmacro()
+
+add_header_macro(
   ctype
-  DEF_FILE ctype.h.def
-  GEN_HDR ctype.h
+  ../libc/newhdrgen/yaml/ctype.yaml
+  ctype.h.def
+  ctype.h
   DEPENDS
     .llvm_libc_common_h
 )
 
-add_gen_header(
+add_header_macro(
   dirent
-  DEF_FILE dirent.h.def
-  GEN_HDR dirent.h
+  ../libc/newhdrgen/yaml/dirent.yaml
+  dirent.h.def
+  dirent.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.ino_t
@@ -36,10 +59,11 @@ add_gen_header(
     .llvm-libc-types.struct_dirent
 )
 
-add_gen_header(
+add_header_macro(
   fcntl
-  DEF_FILE fcntl.h.def
-  GEN_HDR fcntl.h
+  ../libc/newhdrgen/yaml/fcntl.yaml
+  fcntl.h.def
+  fcntl.h
   DEPENDS
     .llvm-libc-macros.fcntl_macros
     .llvm-libc-types.mode_t
@@ -51,28 +75,31 @@ add_gen_header(
     .llvm_libc_common_h
 )
 
-add_gen_header(
+add_header_macro(
   dlfcn
-  DEF_FILE dlfcn.h.def
-  GEN_HDR dlfcn.h
+  ../libc/newhdrgen/yaml/dlfcn.yaml
+  dlfcn.h.def
+  dlfcn.h
   DEPENDS
     .llvm-libc-macros.dlfcn_macros
     .llvm_libc_common_h
 )
 
-add_gen_header(
+add_header_macro(
   features
-  DEF_FILE features.h.def
-  GEN_HDR features.h
+  ../libc/newhdrgen/yaml/features.yaml
+  features.h.def
+  features.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.features_macros
 )
 
-add_gen_header(
+add_header_macro(
   fenv
-  DEF_FILE fenv.h.def
-  GEN_HDR fenv.h
+  ../libc/newhdrgen/yaml/fenv.yaml
+  fenv.h.def
+  fenv.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.fenv_macros
@@ -80,44 +107,49 @@ add_gen_header(
     .llvm-libc-types.fexcept_t
 )
 
-add_gen_header(
+add_header_macro(
   inttypes
-  DEF_FILE inttypes.h.def
-  GEN_HDR inttypes.h
+  ../libc/newhdrgen/yaml/inttypes.yaml
+  inttypes.h.def
+  inttypes.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.imaxdiv_t
     .llvm-libc-macros.inttypes_macros
 )
 
-add_gen_header(
+add_header_macro(
   float
-  DEF_FILE float.h.def
-  GEN_HDR float.h
+  ../libc/newhdrgen/yaml/float.yaml
+  float.h.def
+  float.h
   DEPENDS
     .llvm-libc-macros.float_macros
 )
 
-add_gen_header(
+add_header_macro(
   stdint
-  DEF_FILE stdint.h.def
-  GEN_HDR stdint.h
+  ../libc/newhdrgen/yaml/stdint.yaml
+  stdint.h.def
+  stdint.h
   DEPENDS
     .llvm-libc-macros.stdint_macros
 )
 
-add_gen_header(
+add_header_macro(
   limits
-  DEF_FILE limits.h.def
-  GEN_HDR limits.h
+  ../libc/newhdrgen/yaml/limits.yaml
+  limits.h.def
+  limits.h
   DEPENDS
     .llvm-libc-macros.limits_macros
 )
 
-add_gen_header(
+add_header_macro(
   math
-  DEF_FILE math.h.def
-  GEN_HDR math.h
+  ../libc/newhdrgen/yaml/math.yaml
+  math.h.def
+  math.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.float16_macros
@@ -128,10 +160,11 @@ add_gen_header(
     .llvm-libc-types.float128
 )
 
-add_gen_header(
+add_header_macro(
   stdfix
-  DEF_FILE stdfix.h.def
-  GEN_HDR stdfix.h
+  ../libc/newhdrgen/yaml/stdfix.yaml
+  stdfix.h.def
+  stdfix.h
   DEPENDS
     .llvm-libc-macros.stdfix_macros
 )
@@ -139,55 +172,61 @@ add_gen_header(
 # TODO: This should be conditional on POSIX networking being included.
 file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 
-add_gen_header(
+add_header_macro(
   arpa_inet
-  DEF_FILE arpa/inet.h.def
-  GEN_HDR arpa/inet.h
+  ../libc/newhdrgen/yaml/arpa/inet.yaml
+  arpa/inet.h.def
+  arpa/inet.h
   DEPENDS
     .llvm_libc_common_h
 )
 
-add_gen_header(
+add_header_macro(
   assert
-  DEF_FILE assert.h.def
-  GEN_HDR assert.h
+  ../libc/newhdrgen/yaml/assert.yaml
+  assert.h.def
+  assert.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.assert_macros
 )
 
-add_gen_header(
+add_header_macro(
   setjmp
-  DEF_FILE setjmp.h.def
-  GEN_HDR setjmp.h
+  ../libc/newhdrgen/yaml/setjmp.yaml
+  setjmp.h.def
+  setjmp.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.jmp_buf
 )
 
-add_gen_header(
+add_header_macro(
   string
-  DEF_FILE string.h.def
-  GEN_HDR string.h
+  ../libc/newhdrgen/yaml/string.yaml
+  string.h.def
+  string.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.null_macro
     .llvm-libc-types.size_t
 )
 
-add_gen_header(
+add_header_macro(
   strings
-  DEF_FILE strings.h.def
-  GEN_HDR strings.h
+  ../libc/newhdrgen/yaml/strings.yaml
+  strings.h.def
+  strings.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.size_t
 )
 
-add_gen_header(
+add_header_macro(
   search
-  DEF_FILE search.h.def
-  GEN_HDR search.h
+  ../libc/newhdrgen/yaml/search.yaml
+  search.h.def
+  search.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.ACTION
@@ -196,10 +235,11 @@ add_gen_header(
     .llvm-libc-types.size_t
 )
 
-add_gen_header(
+add_header_macro(
   time
-  DEF_FILE time.h.def
-  GEN_HDR time.h
+  ../libc/newhdrgen/yaml/time.yaml
+  time.h.def
+  time.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.time_macros
@@ -211,10 +251,11 @@ add_gen_header(
     .llvm-libc-types.clockid_t
 )
 
-add_gen_header(
+add_header_macro(
   threads
-  DEF_FILE threads.h.def
-  GEN_HDR threads.h
+  ../libc/newhdrgen/yaml/threads.yaml
+  threads.h.def
+  threads.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.__call_once_func_t
@@ -227,19 +268,21 @@ add_gen_header(
     .llvm-libc-types.tss_dtor_t
 )
 
-add_gen_header(
+add_header_macro(
   errno
-  DEF_FILE errno.h.def
-  GEN_HDR errno.h
+  ../libc/newhdrgen/yaml/errno.yaml
+  errno.h.def
+  errno.h
   DEPENDS
     .llvm-libc-macros.generic_error_number_macros
     .llvm-libc-macros.error_number_macros
 )
 
-add_gen_header(
+add_header_macro(
   signal
-  DEF_FILE signal.h.def
-  GEN_HDR signal.h
+  ../libc/newhdrgen/yaml/signal.yaml
+  signal.h.def
+  signal.h
   DEPENDS
     .llvm-libc-macros.signal_macros
     .llvm-libc-types.sig_atomic_t
@@ -251,28 +294,31 @@ add_gen_header(
     .llvm-libc-types.pid_t
 )
 
-add_gen_header(
+add_header_macro(
   stdbit
-  DEF_FILE stdbit.h.def
-  GEN_HDR stdbit.h
+  ../libc/newhdrgen/yaml/stdbit.yaml
+  stdbit.h.def
+  stdbit.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.stdbit_macros
 )
 
-add_gen_header(
+add_header_macro(
   stdckdint
-  DEF_FILE stdckdint.h.def
-  GEN_HDR stdckdint.h
+  ../libc/newhdrgen/yaml/stdckdint.yaml
+  stdckdint.h.def
+  stdckdint.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.stdckdint_macros
 )
 
-add_gen_header(
+add_header_macro(
   stdio
-  DEF_FILE stdio.h.def
-  GEN_HDR stdio.h
+  ../libc/newhdrgen/yaml/stdio.yaml
+  stdio.h.def
+  stdio.h
   DEPENDS
     .llvm-libc-macros.file_seek_macros
     .llvm-libc-macros.stdio_macros
@@ -284,10 +330,11 @@ add_gen_header(
     .llvm_libc_common_h
 )
 
-add_gen_header(
+add_header_macro(
   stdlib
-  DEF_FILE stdlib.h.def
-  GEN_HDR stdlib.h
+  ../libc/newhdrgen/yaml/stdlib.yaml
+  stdlib.h.def
+  stdlib.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.stdlib_macros
@@ -301,10 +348,11 @@ add_gen_header(
     .llvm-libc-types.__atexithandler_t
 )
 
-add_gen_header(
+add_header_macro(
   unistd
-  DEF_FILE unistd.h.def
-  GEN_HDR unistd.h
+  ../libc/newhdrgen/yaml/unistd.yaml
+  unistd.h.def
+  unistd.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.file_seek_macros
@@ -319,10 +367,11 @@ add_gen_header(
     .llvm-libc-types.__getoptargv_t
 )
 
-add_gen_header(
+add_header_macro(
   pthread
-  DEF_FILE pthread.h.def
-  GEN_HDR pthread.h
+  ../libc/newhdrgen/yaml/pthread.yaml
+  pthread.h.def
+  pthread.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.__atfork_callback_t
@@ -340,10 +389,11 @@ add_gen_header(
     .llvm-libc-types.pthread_t
 )
 
-add_gen_header(
+add_header_macro(
   sched
-  DEF_FILE sched.h.def
-  GEN_HDR sched.h
+  ../libc/newhdrgen/yaml/sched.yaml
+  sched.h.def
+  sched.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sched_macros
@@ -356,10 +406,11 @@ add_gen_header(
     .llvm-libc-types.struct_timespec
 )
 
-add_gen_header(
+add_header_macro(
   spawn
-  DEF_FILE spawn.h.def
-  GEN_HDR spawn.h
+  ../libc/newhdrgen/yaml/spawn.yaml
+  spawn.h.def
+  spawn.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.mode_t
@@ -373,19 +424,21 @@ add_gen_header(
 # them.
 file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 
-add_gen_header(
+add_header_macro(
   sys_auxv
-  DEF_FILE sys/auxv.h.def
-  GEN_HDR sys/auxv.h
+  ../libc/newhdrgen/yaml/sys/auxv.yaml
+  sys/auxv.h.def
+  sys/auxv.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_auxv_macros
 )
 
-add_gen_header(
+add_header_macro(
   sys_epoll
-  DEF_FILE sys/epoll.h.def
-  GEN_HDR sys/epoll.h
+  ../libc/newhdrgen/yaml/sys/epoll.yaml
+  sys/epoll.h.def
+  sys/epoll.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.struct_epoll_event
@@ -394,19 +447,21 @@ add_gen_header(
     .llvm-libc-macros.sys_epoll_macros
 )
 
-add_gen_header(
+add_header_macro(
   sys_ioctl
-  DEF_FILE sys/ioctl.h.def
-  GEN_HDR sys/ioctl.h
+  ../libc/newhdrgen/yaml/sys/ioctl.yaml
+  sys/ioctl.h.def
+  sys/ioctl.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_ioctl_macros
 )
 
-add_gen_header(
+add_header_macro(
   sys_mman
-  DEF_FILE sys/mman.h.def
-  GEN_HDR sys/mman.h
+  ../libc/newhdrgen/yaml/sys/mman.yaml
+  sys/mman.h.def
+  sys/mman.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_mman_macros
@@ -415,10 +470,11 @@ add_gen_header(
     .llvm-libc-types.ssize_t
 )
 
-add_gen_header(
+add_header_macro(
   sys_prctl
-  DEF_FILE sys/prctl.h.def
-  GEN_HDR sys/prctl.h
+  ../libc/newhdrgen/yaml/sys/prctl.yaml
+  sys/prctl.h.def
+  sys/prctl.h
   DEPENDS
     .llvm_libc_common_h
 )
@@ -431,10 +487,11 @@ add_header(
     .llvm-libc-macros.sys_queue_macros
 )
 
-add_gen_header(
+add_header_macro(
   sys_random
-  DEF_FILE sys/random.h.def
-  GEN_HDR sys/random.h
+  ../libc/newhdrgen/yaml/sys/random.yaml
+  sys/random.h.def
+  sys/random.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_random_macros
@@ -442,10 +499,11 @@ add_gen_header(
     .llvm-libc-types.ssize_t
 )
 
-add_gen_header(
+add_header_macro(
   sys_resource
-  DEF_FILE sys/resource.h.def
-  GEN_HDR sys/resource.h
+  ../libc/newhdrgen/yaml/sys/resource.yaml
+  sys/resource.h.def
+  sys/resource.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_resource_macros
@@ -453,10 +511,11 @@ add_gen_header(
     .llvm-libc-types.struct_rlimit
 )
 
-add_gen_header(
+add_header_macro(
   sys_stat
-  DEF_FILE sys/stat.h.def
-  GEN_HDR sys/stat.h
+  ../libc/newhdrgen/yaml/sys/stat.yaml
+  sys/stat.h.def
+  sys/stat.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_stat_macros
@@ -474,10 +533,11 @@ add_gen_header(
     .llvm-libc-types.struct_stat
 )
 
-add_gen_header(
+add_header_macro(
   sys_select
-  DEF_FILE sys/select.h.def
-  GEN_HDR sys/select.h
+  ../libc/newhdrgen/yaml/sys/select.yaml
+  sys/select.h.def
+  sys/select.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_select_macros
@@ -489,10 +549,11 @@ add_gen_header(
     .llvm-libc-types.struct_timeval
 )
 
-add_gen_header(
+add_header_macro(
   sys_sendfile
-  DEF_FILE sys/sendfile.h.def
-  GEN_HDR sys/sendfile.h
+  ../libc/newhdrgen/yaml/sys/sendfile.yaml
+  sys/sendfile.h.def
+  sys/sendfile.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.off_t
@@ -500,10 +561,11 @@ add_gen_header(
     .llvm-libc-types.ssize_t
 )
 
-add_gen_header(
+add_header_macro(
   sys_socket
-  DEF_FILE sys/socket.h.def
-  GEN_HDR sys/socket.h
+  ../libc/newhdrgen/yaml/sys/socket.yaml
+  sys/socket.h.def
+  sys/socket.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_socket_macros
@@ -513,35 +575,40 @@ add_gen_header(
     .llvm-libc-types.struct_sockaddr_un
 )
 
-add_gen_header(
+add_header_macro(
   sys_statvfs
-  DEF_FILE sys/statvfs.h.def
-  GEN_HDR sys/statvfs.h
+  ../libc/newhdrgen/yaml/sys/statvfs.yaml
+  sys/statvfs.h.def
+  sys/statvfs.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.struct_statvfs
 )
 
-add_gen_header(
+add_header_macro(
   sys_syscall
-  DEF_FILE sys/syscall.h.def
-  GEN_HDR sys/syscall.h
+  ../libc/newhdrgen/yaml/sys/syscall.yaml
+  sys/syscall.h.def
+  sys/syscall.h
+  DEPENDS
 )
 
-add_gen_header(
+add_header_macro(
   sys_time
-  DEF_FILE sys/time.h.def
-  GEN_HDR sys/time.h
+  ../libc/newhdrgen/yaml/sys/time.yaml
+  sys/time.h.def
+  sys/time.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.struct_timeval
     .llvm-libc-macros.sys_time_macros
 )
 
-add_gen_header(
+add_header_macro(
   sys_types
-  DEF_FILE sys/types.h.def
-  GEN_HDR sys/types.h
+  ../libc/newhdrgen/yaml/sys/types.yaml
+  sys/types.h.def
+  sys/types.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.blkcnt_t
@@ -567,19 +634,21 @@ add_gen_header(
     .llvm-libc-types.uid_t
 )
 
-add_gen_header(
+add_header_macro(
   sys_utsname
-  DEF_FILE sys/utsname.h.def
-  GEN_HDR sys/utsname.h
+  ../libc/newhdrgen/yaml/sys/utsname.yaml
+  sys/utsname.h.def
+  sys/utsname.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.struct_utsname
 )
 
-add_gen_header(
+add_header_macro(
   sys_wait
-  DEF_FILE sys/wait.h.def
-  GEN_HDR sys/wait.h
+  ../libc/newhdrgen/yaml/sys/wait.yaml
+  sys/wait.h.def
+  sys/wait.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.sys_wait_macros
@@ -588,10 +657,11 @@ add_gen_header(
     .llvm-libc-types.siginfo_t
 )
 
-add_gen_header(
+add_header_macro(
   termios
-  DEF_FILE termios.h.def
-  GEN_HDR termios.h
+  ../libc/newhdrgen/yaml/termios.yaml
+  termios.h.def
+  termios.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.termios_macros
@@ -602,10 +672,11 @@ add_gen_header(
     .llvm-libc-types.tcflag_t
 )
 
-add_gen_header(
+add_header_macro(
   uchar
-  DEF_FILE uchar.h.def
-  GEN_HDR uchar.h
+  ../libc/newhdrgen/yaml/uchar.yaml
+  uchar.h.def
+  uchar.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.mbstate_t
@@ -614,10 +685,11 @@ add_gen_header(
     .llvm-libc-types.char32_t
 )
 
-add_gen_header(
+add_header_macro(
   wchar
-  DEF_FILE wchar.h.def
-  GEN_HDR wchar.h
+  ../libc/newhdrgen/yaml/wchar.yaml
+  wchar.h.def
+  wchar.h
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.wchar_macros
@@ -630,10 +702,11 @@ add_gen_header(
 if(LIBC_TARGET_OS_IS_GPU)
   file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu)
 
-  add_gen_header(
+  add_header_macro(
     gpu_rpc
-    DEF_FILE gpu/rpc.h.def
-    GEN_HDR gpu/rpc.h
+    ../libc/newhdrgen/yaml/gpu/rpc.yaml
+    gpu/rpc.h.def
+    gpu/rpc.h
     DEPENDS
       .llvm_libc_common_h
       .llvm-libc-types.rpc_opcodes_t
diff --git a/libc/include/assert.h.def b/libc/include/assert.h.def
index 9c924c7f58545..d5ae14a1cd810 100644
--- a/libc/include/assert.h.def
+++ b/libc/include/assert.h.def
@@ -12,22 +12,19 @@
 // This file may be usefully included multiple times to change assert()'s
 // definition based on NDEBUG.
 
-
-#undef assert
-#ifdef NDEBUG
-#define assert(e) (void)0
-#else
-
 #ifndef __cplusplus
 #undef static_assert
 #define static_assert _Static_assert
 #endif
 
+#undef assert
+#ifdef NDEBUG
+#define assert(e) (void)0
+#else
 #ifdef __cplusplus
 extern "C"
 #endif
 _Noreturn void __assert_fail(const char *, const char *, unsigned, const char *) __NOEXCEPT;
-
 #define assert(e)  \
   ((e) ? (void)0 : __assert_fail(#e, __FILE__, __LINE__, __PRETTY_FUNCTION__))
 #endif
diff --git a/libc/include/llvm-libc-types/fsblkcnt_t.h b/libc/include/llvm-libc-types/fsblkcnt_t.h
index 88a53d38cb1b3..8c7d3307278cd 100644
--- a/libc/include/llvm-libc-types/fsblkcnt_t.h
+++ b/libc/include/llvm-libc-types/fsblkcnt_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_FSBLKCNT_T_H
 #define LLVM_LIBC_TYPES_FSBLKCNT_T_H
 
-typedef __SIZE_TYPE__ fsblkcnt_t;
+typedef __UINT64_TYPE__ fsblkcnt_t;
 
 #endif // LLVM_LIBC_TYPES_FSBLKCNT_T_H
diff --git a/libc/include/llvm-libc-types/fsfilcnt_t.h b/libc/include/llvm-libc-types/fsfilcnt_t.h
index c5693591907ab..12697830e92eb 100644
--- a/libc/include/llvm-libc-types/fsfilcnt_t.h
+++ b/libc/include/llvm-libc-types/fsfilcnt_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_FSFILCNT_T_H
 #define LLVM_LIBC_TYPES_FSFILCNT_T_H
 
-typedef __SIZE_TYPE__ fsfilcnt_t;
+typedef __UINT64_TYPE__ fsfilcnt_t;
 
 #endif // LLVM_LIBC_TYPES_FSFILCNT_T_H
diff --git a/libc/include/llvm-libc-types/jmp_buf.h b/libc/include/llvm-libc-types/jmp_buf.h
index 8949be9fa0ab7..60e033c6c65a9 100644
--- a/libc/include/llvm-libc-types/jmp_buf.h
+++ b/libc/include/llvm-libc-types/jmp_buf.h
@@ -35,6 +35,11 @@ typedef struct {
 #elif defined(__arm__)
   // r4, r5, r6, r7, r8, r9, r10, r11, r12, lr
   long opaque[10];
+#elif defined(__aarch64__)
+  long opaque[14]; // x19-x29, lr, sp, optional x18
+#if __ARM_FP
+  long fopaque[8]; // d8-d15
+#endif
 #else
 #error "__jmp_buf not available for your target architecture."
 #endif
diff --git a/libc/include/llvm-libc-types/struct_statvfs.h b/libc/include/llvm-libc-types/struct_statvfs.h
index f467cfd936bdb..9c649af151ff6 100644
--- a/libc/include/llvm-libc-types/struct_statvfs.h
+++ b/libc/include/llvm-libc-types/struct_statvfs.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_TYPES_STRUCT_STATVFS_H
 #define LLVM_LIBC_TYPES_STRUCT_STATVFS_H
 
-#include <llvm-libc-types/fsblkcnt_t.h>
-#include <llvm-libc-types/fsfilcnt_t.h>
+#include "fsblkcnt_t.h"
+#include "fsfilcnt_t.h"
 
 struct statvfs {
   unsigned long f_bsize;   /* Filesystem block size */
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 37acf3950b460..ce0b07fb6cb49 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -40,20 +40,6 @@ foreach(archive IN ZIP_LISTS
   # Add the offloading version of the library for offloading languages. These
   # are installed in the standard search path separate from the other libraries.
   if(LIBC_TARGET_OS_IS_GPU)
-    add_gpu_entrypoint_library(
-      ${archive_1}gpu
-      ${archive_1}
-      DEPENDS
-        ${${archive_2}}
-    )
-    set_target_properties(
-      ${archive_1}gpu
-      PROPERTIES
-        ARCHIVE_OUTPUT_NAME ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}
-        ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}
-    )
-    list(APPEND added_gpu_archive_targets ${archive_1}gpu)
-
     add_bitcode_entrypoint_library(
       ${archive_1}bitcode
       ${archive_1}
@@ -65,8 +51,7 @@ foreach(archive IN ZIP_LISTS
       PROPERTIES
         OUTPUT_NAME ${archive_1}.bc
     )
-    add_dependencies(${archive_1}gpu ${archive_1}bitcode)
-    list(APPEND added_gpu_bitcode_targets ${archive_1}bitcode)
+    list(APPEND added_bitcode_targets ${archive_1}bitcode)
   endif()
 endforeach()
 
@@ -76,24 +61,13 @@ install(
   COMPONENT libc
 )
 
-if(LIBC_TARGET_OS_IS_GPU)
-  set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX})
-  if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
-    set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE})
-  endif()
-  install(
-    TARGETS ${added_gpu_archive_targets}
-    ARCHIVE DESTINATION ${gpu_install_dir}
-    COMPONENT libc
+foreach(file ${added_bitcode_targets})
+  install(FILES $<TARGET_FILE:${file}>
+          DESTINATION ${LIBC_INSTALL_LIBRARY_DIR}
+          RENAME $<TARGET_PROPERTY:${file},OUTPUT_NAME>
+          COMPONENT libc
   )
-  foreach(file ${added_gpu_bitcode_targets})
-    install(FILES $<TARGET_PROPERTY:${file},TARGET_OBJECT>
-            DESTINATION ${LIBC_INSTALL_LIBRARY_DIR}
-            RENAME $<TARGET_PROPERTY:${file},OUTPUT_NAME>
-            COMPONENT libc
-    )
-  endforeach()
-endif()
+endforeach()
 
 if(NOT LIBC_TARGET_OS_IS_BAREMETAL)
   # For now we will disable libc-startup installation for baremetal. The
@@ -108,6 +82,7 @@ endif()
 
 add_custom_target(install-libc
                   DEPENDS ${added_archive_targets}
+                          ${added_bitcode_targets}
                           ${startup_target}
                           ${header_install_target}
                   COMMAND "${CMAKE_COMMAND}"
@@ -115,6 +90,7 @@ add_custom_target(install-libc
                           -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
 add_custom_target(install-libc-stripped
                   DEPENDS ${added_archive_targets}
+                          ${added_bitcode_targets}
                           ${startup_target}
                           ${header_install_target}
                   COMMAND "${CMAKE_COMMAND}"
diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py
index ccfd93547c1d8..845ef1aebf54b 100644
--- a/libc/newhdrgen/class_implementation/classes/function.py
+++ b/libc/newhdrgen/class_implementation/classes/function.py
@@ -26,7 +26,7 @@ def __str__(self):
         attributes_str = " ".join(self.attributes)
         arguments_str = ", ".join(self.arguments)
         if attributes_str == "":
-            result = f"{self.return_type} {self.name}({arguments_str});"
+            result = f"{self.return_type} {self.name}({arguments_str})"
         else:
             result = f"{attributes_str} {self.return_type} {self.name}({arguments_str})"
         return result
diff --git a/libc/newhdrgen/class_implementation/classes/include.py b/libc/newhdrgen/class_implementation/classes/include.py
deleted file mode 100644
index 2657f2e7c931c..0000000000000
--- a/libc/newhdrgen/class_implementation/classes/include.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-#
-# ====-- Include class for libc function headers --------------*- python -*--==#
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# ==-------------------------------------------------------------------------==#
-
-
-class Include:
-    def __init__(self, name):
-        self.name = name
-
-    def __str__(self):
-        return f'#include "{self.name}"'
diff --git a/libc/newhdrgen/gpu_headers.py b/libc/newhdrgen/gpu_headers.py
index cc13096cd47c1..d123e4af2de65 100644
--- a/libc/newhdrgen/gpu_headers.py
+++ b/libc/newhdrgen/gpu_headers.py
@@ -17,7 +17,6 @@ def __init__(self, name):
         self.enumerations = []
         self.objects = []
         self.functions = []
-        self.includes = []
 
     def add_macro(self, macro):
         self.macros.append(macro)
diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py
index 69de81eebb719..7ec2dee596f8c 100644
--- a/libc/newhdrgen/header.py
+++ b/libc/newhdrgen/header.py
@@ -17,7 +17,6 @@ def __init__(self, name):
         self.enumerations = []
         self.objects = []
         self.functions = []
-        self.includes = []
 
     def add_macro(self, macro):
         self.macros.append(macro)
@@ -34,15 +33,9 @@ def add_object(self, object):
     def add_function(self, function):
         self.functions.append(function)
 
-    def add_include(self, include):
-        self.includes.append(include)
-
     def __str__(self):
         content = [""]
 
-        for include in self.includes:
-            content.append(str(include))
-
         for macro in self.macros:
             content.append(f"{macro}\n")
 
@@ -60,16 +53,16 @@ def __str__(self):
         current_guard = None
         for function in self.functions:
             if function.guard == None:
-                content.append(str(function) + "__NOEXCEPT")
+                content.append(str(function) + " __NOEXCEPT;")
                 content.append("")
             else:
                 if current_guard == None:
                     current_guard = function.guard
                     content.append(f"#ifdef {current_guard}")
-                    content.append(str(function) + "__NOEXCEPT")
+                    content.append(str(function) + " __NOEXCEPT;")
                     content.append("")
                 elif current_guard == function.guard:
-                    content.append(str(function) + "__NOEXCEPT")
+                    content.append(str(function) + " __NOEXCEPT;")
                     content.append("")
                 else:
                     content.pop()
@@ -77,7 +70,7 @@ def __str__(self):
                     content.append("")
                     current_guard = function.guard
                     content.append(f"#ifdef {current_guard}")
-                    content.append(str(function) + "__NOEXCEPT")
+                    content.append(str(function) + " __NOEXCEPT;")
                     content.append("")
         if current_guard != None:
             content.pop()
diff --git a/libc/newhdrgen/tests/input/test_small.yaml b/libc/newhdrgen/tests/input/test_small.yaml
index f1f9f020ce6a4..a2e96939b4be0 100644
--- a/libc/newhdrgen/tests/input/test_small.yaml
+++ b/libc/newhdrgen/tests/input/test_small.yaml
@@ -54,4 +54,13 @@ functions:
     standards: 
       - stdc
     guard: LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
+  - name: func_f
+    return_type: _Float16
+    arguments:
+      - type: int
+      - type: double
+      - type: float
+    standards: 
+      - stdc
+
 
diff --git a/libc/newhdrgen/tests/output/test_small.h b/libc/newhdrgen/tests/output/test_small.h
deleted file mode 100644
index a777976134b04..0000000000000
--- a/libc/newhdrgen/tests/output/test_small.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- C standard library header test_small-------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_TEST_SMALL_H
-#define LLVM_LIBC_TEST_SMALL_H
-
-#include "__llvm-libc-common.h"
-#include "llvm-libc-macros/float16-macros.h"
-#include "llvm-libc-macros/test_small-macros.h"
-#include "llvm-libc-types/float128.h"
-
-#define MACRO_A 1
-
-#define MACRO_B 2
-
-#include <llvm-libc-types/type_a.h>
-#include <llvm-libc-types/type_b.h>
-
-enum {
-  enum_a = value_1,
-  enum_b = value_2,
-};
-
-__BEGIN_C_DECLS
-
-CONST_FUNC_A void func_a() __NOEXCEPT;
-
-#ifdef LIBC_TYPES_HAS_FLOAT128
-float128 func_b() __NOEXCEPT;
-#endif // LIBC_TYPES_HAS_FLOAT128
-
-#ifdef LIBC_TYPES_HAS_FLOAT16
-_Float16 func_c(int, float) __NOEXCEPT;
-
-_Float16 func_d(int, float) __NOEXCEPT;
-#endif // LIBC_TYPES_HAS_FLOAT16
-
-#ifdef LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
-_Float16 func_e(float128) __NOEXCEPT;
-#endif // LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
-
-extern obj object_1;
-extern obj object_2;
-
-__END_C_DECLS
-
-#endif // LLVM_LIBC_TEST_SMALL_H
diff --git a/libc/newhdrgen/tests/test_integration.py b/libc/newhdrgen/tests/test_integration.py
index f12d18bc04a49..33353785af233 100644
--- a/libc/newhdrgen/tests/test_integration.py
+++ b/libc/newhdrgen/tests/test_integration.py
@@ -8,7 +8,6 @@
 
 class TestHeaderGenIntegration(unittest.TestCase):
     def setUp(self):
-
         self.output_dir = Path(
             args.output_dir if args.output_dir else "libc/newhdrgen/tests/output"
         )
@@ -17,18 +16,24 @@ def setUp(self):
 
         self.source_dir = Path(__file__).resolve().parent.parent.parent.parent
 
-    def run_script(self, yaml_file, h_def_file, output_dir):
+    def run_script(self, yaml_file, h_def_file, output_dir, entry_points):
         yaml_file = self.source_dir / yaml_file
         h_def_file = self.source_dir / h_def_file
+        command = [
+            "python3",
+            str(self.source_dir / "libc/newhdrgen/yaml_to_classes.py"),
+            str(yaml_file),
+            "--h_def_file",
+            str(h_def_file),
+            "--output_dir",
+            str(output_dir),
+        ]
+
+        for entry_point in entry_points:
+            command.extend(["--e", entry_point])
+
         result = subprocess.run(
-            [
-                "python3",
-                str(self.source_dir / "libc/newhdrgen/yaml_to_classes.py"),
-                str(yaml_file),
-                str(h_def_file),
-                "--output_dir",
-                str(output_dir),
-            ],
+            command,
             capture_output=True,
             text=True,
         )
@@ -52,11 +57,12 @@ def test_generate_header(self):
             self.source_dir / "libc/newhdrgen/tests/expected_output/test_header.h"
         )
         output_file = self.output_dir / "test_small.h"
+        entry_points = {"func_b", "func_a", "func_c", "func_d", "func_e"}
 
         if not self.output_dir.exists():
             self.output_dir.mkdir(parents=True)
 
-        self.run_script(yaml_file, h_def_file, self.output_dir)
+        self.run_script(yaml_file, h_def_file, self.output_dir, entry_points)
 
         self.compare_files(output_file, expected_output_file)
 
diff --git a/libc/newhdrgen/yaml/arpa/arpa_inet.yaml b/libc/newhdrgen/yaml/arpa/inet.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/arpa/arpa_inet.yaml
rename to libc/newhdrgen/yaml/arpa/inet.yaml
diff --git a/libc/newhdrgen/yaml/gpu/gpu_rpc.yaml b/libc/newhdrgen/yaml/gpu/rpc.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/gpu/gpu_rpc.yaml
rename to libc/newhdrgen/yaml/gpu/rpc.yaml
diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml
index 8588389bca4d2..ce562c653a6d2 100644
--- a/libc/newhdrgen/yaml/math.yaml
+++ b/libc/newhdrgen/yaml/math.yaml
@@ -64,8 +64,6 @@ functions:
     return_type: double
     arguments:
       - type: double
-    attributes:
-      - __LIBC_CONST_ATTR
   - name: fabsf
     standards: 
       - stdc
@@ -364,6 +362,20 @@ functions:
     arguments:
       - type: double
       - type: double
+  - name: fmull
+    standards:
+      - stdc
+    return_type: float
+    arguments:
+      - type: long double
+      - type: long double
+  - name: dmull
+    standards:
+      - stdc
+    return_type: double
+    arguments:
+      - type: long double
+      - type: long double
   - name: frexp
     standards: 
       - stdc
@@ -1323,6 +1335,30 @@ functions:
       - type: long double
       - type: long double
     guard: LIBC_TYPES_HAS_FLOAT16
+  - name: f16mul
+    standards:
+      - llvm_libc_ext
+    return_type: _Float16
+    arguments:
+      - type: double
+      - type: double
+    guard: LIBC_TYPES_HAS_FLOAT16
+  - name: f16mulf
+    standards:
+      - llvm_libc_ext
+    return_type: _Float16
+    arguments:
+      - type: float
+      - type: float
+    guard: LIBC_TYPES_HAS_FLOAT16
+  - name: f16mull
+    standards:
+      - llvm_libc_ext
+    return_type: _Float16
+    arguments:
+      - type: long double
+      - type: long double
+    guard: LIBC_TYPES_HAS_FLOAT16
   - name: f16sqrt
     standards: 
       - llvm_libc_ext
@@ -1756,6 +1792,14 @@ functions:
       - type: float128
       - type: float128
     guard: LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
+  - name: f16mulf128
+    standards:
+      - stdc
+    return_type: _Float16
+    arguments:
+      - type: float128
+      - type: float128
+    guard: LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
   - name: f16sqrtf128
     standards: 
       - llvm_libc_ext
@@ -1896,6 +1940,22 @@ functions:
       - type: float128
       - type: float128
     guard: LIBC_TYPES_HAS_FLOAT128
+  - name: fmulf128
+    standards:
+      - llvm_libc_ext
+    return_type: float
+    arguments:
+      - type: float128
+      - type: float128
+    guard: LIBC_TYPES_HAS_FLOAT128
+  - name: dmulf128
+    standards:
+      - llvm_libc_ext
+    return_type: double
+    arguments:
+      - type: float128
+      - type: float128
+    guard: LIBC_TYPES_HAS_FLOAT128
   - name: frexpf128
     standards: 
       - stdc
diff --git a/libc/newhdrgen/yaml/pthread.yaml b/libc/newhdrgen/yaml/pthread.yaml
index 292d91751e406..d492dfcbd51eb 100644
--- a/libc/newhdrgen/yaml/pthread.yaml
+++ b/libc/newhdrgen/yaml/pthread.yaml
@@ -370,6 +370,20 @@ functions:
     arguments:
       - type: pthread_rwlock_t *__restrict
       - type: const struct timespec *__restrict
+  - name: pthread_rwlock_clockrdlock
+    standards: POSIX
+    return_type: int
+    arguments:
+      - type: pthread_rwlock_t *__restrict
+      - type: clockid_t
+      - type: const struct timespec *__restrict
+  - name: pthread_rwlock_clockwrlock
+    standards: POSIX
+    return_type: int
+    arguments:
+      - type: pthread_rwlock_t *__restrict
+      - type: clockid_t
+      - type: const struct timespec *__restrict
   - name: pthread_rwlock_rdlock
     standards: POSIX
     return_type: int
diff --git a/libc/newhdrgen/yaml/stdio.yaml b/libc/newhdrgen/yaml/stdio.yaml
index 687a6d696cad6..f0acf5338b1d6 100644
--- a/libc/newhdrgen/yaml/stdio.yaml
+++ b/libc/newhdrgen/yaml/stdio.yaml
@@ -97,6 +97,22 @@ functions:
     arguments:
       - type: const char *__restrict
       - type: va_list
+  - name: asprintf  
+    standards: 
+      - GNUExtensions  
+    return_type: int  
+    arguments:  
+      - type: char **__restrict  
+      - type: const char *__restrict  
+      - type: ...  
+  - name: vasprintf  
+    standards: 
+      - GNUExtensions  
+    return_type: int  
+    arguments:  
+      - type: char **__restrict  
+      - type: const char *__restrict  
+      - type: va_list 
   - name: sscanf
     standards: 
       - stdc
@@ -105,6 +121,14 @@ functions:
       - type: const char *__restrict
       - type: const char *__restrict
       - type: ...
+  - name: vsscanf
+    standards: 
+      - stdc
+    return_type: int
+    arguments:
+      - type: const char *__restrict
+      - type: const char *__restrict
+      - type: va_list
   - name: scanf
     standards: 
       - stdc
diff --git a/libc/newhdrgen/yaml/sys/sys_auxv.yaml b/libc/newhdrgen/yaml/sys/auxv.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_auxv.yaml
rename to libc/newhdrgen/yaml/sys/auxv.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_epoll.yaml b/libc/newhdrgen/yaml/sys/epoll.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_epoll.yaml
rename to libc/newhdrgen/yaml/sys/epoll.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_ioctl.yaml b/libc/newhdrgen/yaml/sys/ioctl.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_ioctl.yaml
rename to libc/newhdrgen/yaml/sys/ioctl.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_mman.yaml b/libc/newhdrgen/yaml/sys/mman.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_mman.yaml
rename to libc/newhdrgen/yaml/sys/mman.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_prctl.yaml b/libc/newhdrgen/yaml/sys/prctl.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_prctl.yaml
rename to libc/newhdrgen/yaml/sys/prctl.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_random.yaml b/libc/newhdrgen/yaml/sys/random.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_random.yaml
rename to libc/newhdrgen/yaml/sys/random.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_resource.yaml b/libc/newhdrgen/yaml/sys/resource.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_resource.yaml
rename to libc/newhdrgen/yaml/sys/resource.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_select.yaml b/libc/newhdrgen/yaml/sys/select.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_select.yaml
rename to libc/newhdrgen/yaml/sys/select.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_sendfile.yaml b/libc/newhdrgen/yaml/sys/sendfile.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_sendfile.yaml
rename to libc/newhdrgen/yaml/sys/sendfile.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_socket.yaml b/libc/newhdrgen/yaml/sys/socket.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_socket.yaml
rename to libc/newhdrgen/yaml/sys/socket.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_stat.yaml b/libc/newhdrgen/yaml/sys/stat.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_stat.yaml
rename to libc/newhdrgen/yaml/sys/stat.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_statvfs.yaml b/libc/newhdrgen/yaml/sys/statvfs.yaml
similarity index 88%
rename from libc/newhdrgen/yaml/sys/sys_statvfs.yaml
rename to libc/newhdrgen/yaml/sys/statvfs.yaml
index 3651901e9f2c7..53666770a5de6 100644
--- a/libc/newhdrgen/yaml/sys/sys_statvfs.yaml
+++ b/libc/newhdrgen/yaml/sys/statvfs.yaml
@@ -2,6 +2,8 @@ header: sys-statvfs.h
 macros: []
 types:
   - type_name: struct_statvfs
+  - type_name: fsblkcnt_t
+  - type_name: fsfilcnt_t
 enums: []
 objects: []
 functions:
diff --git a/libc/newhdrgen/yaml/sys/sys_syscall.yaml b/libc/newhdrgen/yaml/sys/syscall.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_syscall.yaml
rename to libc/newhdrgen/yaml/sys/syscall.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_time.yaml b/libc/newhdrgen/yaml/sys/time.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_time.yaml
rename to libc/newhdrgen/yaml/sys/time.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_types.yaml b/libc/newhdrgen/yaml/sys/types.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_types.yaml
rename to libc/newhdrgen/yaml/sys/types.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_utsname.yaml b/libc/newhdrgen/yaml/sys/utsname.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_utsname.yaml
rename to libc/newhdrgen/yaml/sys/utsname.yaml
diff --git a/libc/newhdrgen/yaml/sys/sys_wait.yaml b/libc/newhdrgen/yaml/sys/wait.yaml
similarity index 100%
rename from libc/newhdrgen/yaml/sys/sys_wait.yaml
rename to libc/newhdrgen/yaml/sys/wait.yaml
diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py
index 205bb35fe691a..a9d0e6424eb2c 100644
--- a/libc/newhdrgen/yaml_to_classes.py
+++ b/libc/newhdrgen/yaml_to_classes.py
@@ -16,7 +16,6 @@
 from class_implementation.classes.macro import Macro
 from class_implementation.classes.type import Type
 from class_implementation.classes.function import Function
-from class_implementation.classes.include import Include
 from class_implementation.classes.enumeration import Enumeration
 from class_implementation.classes.object import Object
 
@@ -103,9 +102,6 @@ def yaml_to_classes(yaml_data, header_class, entry_points=None):
             Object(object_data["object_name"], object_data["object_type"])
         )
 
-    for include_data in yaml_data.get("includes", []):
-        header.add_include(Include(include_data))
-
     return header
 
 
@@ -122,7 +118,7 @@ def load_yaml_file(yaml_file, header_class, entry_points):
         HeaderFile: An instance of HeaderFile populated with the data.
     """
     with open(yaml_file, "r") as f:
-        yaml_data = yaml.safe_load(f)
+        yaml_data = yaml.load(f, Loader=yaml.FullLoader)
     return yaml_to_classes(yaml_data, header_class, entry_points)
 
 
@@ -177,8 +173,7 @@ def add_function_to_yaml(yaml_file, function_details):
     new_function = parse_function_details(function_details)
 
     with open(yaml_file, "r") as f:
-        yaml_data = yaml.safe_load(f)
-
+        yaml_data = yaml.load(f, Loader=yaml.FullLoader)
     if "functions" not in yaml_data:
         yaml_data["functions"] = []
 
@@ -257,8 +252,6 @@ def main(
         with open(output_file_path, "w") as f:
             f.write(header_str)
 
-    print(f"Generated header file: {output_file_path}")
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate header files from YAML")
diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td
index 5400e0afa7564..dce81ff778620 100644
--- a/libc/spec/gpu_ext.td
+++ b/libc/spec/gpu_ext.td
@@ -10,14 +10,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
             RetValSpec<VoidType>,
             [ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
         >,
-        FunctionSpec<
-            "rpc_fprintf",
-            RetValSpec<IntType>,
-            [ArgSpec<FILERestrictedPtr>,
-             ArgSpec<ConstCharRestrictedPtr>,
-             ArgSpec<VoidPtr>,
-             ArgSpec<SizeTType>]
-        >,
     ]
   >;
   let Headers = [
diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td
index 86215029831ca..f86a8c1c6c106 100644
--- a/libc/spec/llvm_libc_ext.td
+++ b/libc/spec/llvm_libc_ext.td
@@ -57,6 +57,13 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> {
       [], // Types
       [], // Enumerations
       [
+          GuardedFunctionSpec<"daddf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          GuardedFunctionSpec<"ddivf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          GuardedFunctionSpec<"dfmaf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          GuardedFunctionSpec<"dsqrtf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          GuardedFunctionSpec<"dsubf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
+	  
           GuardedFunctionSpec<"f16add", RetValSpec<Float16Type>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16addf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16addl", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
@@ -65,6 +72,14 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> {
           GuardedFunctionSpec<"f16subf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16subl", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
 
+          GuardedFunctionSpec<"fmulf128", RetValSpec<FloatType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
+          GuardedFunctionSpec<"dmulf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
+          GuardedFunctionSpec<"f16mul", RetValSpec<Float16Type>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
+          GuardedFunctionSpec<"f16mulf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
+          GuardedFunctionSpec<"f16mull", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
+
           GuardedFunctionSpec<"f16div", RetValSpec<Float16Type>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16divf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16divl", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
@@ -77,6 +92,8 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> {
           GuardedFunctionSpec<"f16sqrtf", RetValSpec<Float16Type>, [ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16sqrtl", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
 
+          GuardedFunctionSpec<"fsqrtf128", RetValSpec<FloatType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
           FunctionSpec<"powi", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntType>]>,
           FunctionSpec<"powif", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntType>]>,
       ]
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 1878b1ee2ae41..1b7e18e999600 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -1325,6 +1325,16 @@ def POSIX : StandardSpec<"POSIX"> {
         RetValSpec<IntType>,
         [ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
       >,
+      FunctionSpec<
+        "pthread_rwlock_clockrdlock",
+        RetValSpec<IntType>,
+        [ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ClockIdT>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
+      >,
+      FunctionSpec<
+        "pthread_rwlock_clockwrlock",
+        RetValSpec<IntType>,
+        [ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ClockIdT>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
+      >,
       FunctionSpec<
         "pthread_rwlock_rdlock",
         RetValSpec<IntType>,
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index a4c6b40b98388..3f68eeb7853ad 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -396,12 +396,18 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"ceill", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
           GuardedFunctionSpec<"ceilf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"ceilf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          
+          FunctionSpec<"daddl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
 
+          FunctionSpec<"dfmal", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          FunctionSpec<"dsubl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          
           FunctionSpec<"fabs", RetValSpec<DoubleType>, [ArgSpec<DoubleType>], [ConstAttr]>,
           FunctionSpec<"fabsf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
           FunctionSpec<"fabsl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
           GuardedFunctionSpec<"fabsf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"fabsf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+          FunctionSpec<"fadd", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
 
           FunctionSpec<"fdim", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fdimf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
@@ -475,8 +481,6 @@ def StdC : StandardSpec<"stdc"> {
           GuardedFunctionSpec<"fminimum_mag_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"fminimum_mag_numf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
-          FunctionSpec<"fmul", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
-
           FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
@@ -574,6 +578,7 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"exp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"expf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          GuardedFunctionSpec<"expf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
 
           FunctionSpec<"exp2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"exp2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
@@ -693,6 +698,7 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"atanf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
 
+          FunctionSpec<"atan2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"atan2f", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
           FunctionSpec<"acoshf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
@@ -718,14 +724,29 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"canonicalizel", RetValSpec<IntType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
           GuardedFunctionSpec<"canonicalizef16", RetValSpec<IntType>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"canonicalizef128", RetValSpec<IntType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+	  
+          FunctionSpec<"dsqrtl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>]>,
 
-          GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+          FunctionSpec<"totalorder", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoublePtr>]>,
+	  FunctionSpec<"totalorderf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatPtr>]>,
+	  GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+	  GuardedFunctionSpec<"totalorderf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
 
-          GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+          FunctionSpec<"totalordermag", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoublePtr>]>,
+	  FunctionSpec<"totalordermagf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatPtr>]>,
+	  FunctionSpec<"totalordermagl", RetValSpec<IntType>, [ArgSpec<LongDoublePtr>, ArgSpec<LongDoublePtr>]>,
+	  GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+	  GuardedFunctionSpec<"totalordermagf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
 
+          FunctionSpec<"getpayload", RetValSpec<DoubleType>, [ArgSpec<DoublePtr>]>,
+          FunctionSpec<"getpayloadf", RetValSpec<FloatType>, [ArgSpec<FloatPtr>]>,
           GuardedFunctionSpec<"getpayloadf16", RetValSpec<Float16Type>, [ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
-
+          GuardedFunctionSpec<"getpayloadf128", RetValSpec<Float128Type>, [ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
+	  
+          FunctionSpec<"setpayload", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoubleType>]>,
+          FunctionSpec<"setpayloadf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatType>]>,
           GuardedFunctionSpec<"setpayloadf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+	  GuardedFunctionSpec<"setpayloadf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
           GuardedFunctionSpec<"setpayloadsigf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
 
@@ -733,6 +754,16 @@ def StdC : StandardSpec<"stdc"> {
 
           GuardedFunctionSpec<"f16subf128", RetValSpec<Float16Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">,
 
+          FunctionSpec<"fmul", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
+          FunctionSpec<"fmull", RetValSpec<FloatType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+
+          FunctionSpec<"dmull", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+
+          GuardedFunctionSpec<"f16mulf128", RetValSpec<Float16Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">,
+
+          FunctionSpec<"fsqrt", RetValSpec<FloatType>, [ArgSpec<DoubleType>]>,
+          FunctionSpec<"fsqrtl", RetValSpec<FloatType>, [ArgSpec<LongDoubleType>]>,
+	  
           GuardedFunctionSpec<"f16divf128", RetValSpec<Float16Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">,
 
           GuardedFunctionSpec<"f16sqrtf128", RetValSpec<Float16Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">,
@@ -894,6 +925,13 @@ def StdC : StandardSpec<"stdc"> {
                ArgSpec<ConstCharRestrictedPtr>,
                ArgSpec<VarArgType>]
           >,
+          FunctionSpec<
+              "vsscanf",
+              RetValSpec<IntType>,
+              [ArgSpec<ConstCharRestrictedPtr>,
+               ArgSpec<ConstCharRestrictedPtr>,
+               ArgSpec<VaListType>]
+          >,
           FunctionSpec<
               "scanf",
               RetValSpec<IntType>,
@@ -935,6 +973,13 @@ def StdC : StandardSpec<"stdc"> {
                ArgSpec<ConstCharRestrictedPtr>,
                ArgSpec<VarArgType>]
           >,
+          FunctionSpec<
+              "asprintf",
+              RetValSpec<IntType>,
+              [ArgSpec<CharRestrictedPtrPtr>,
+               ArgSpec<ConstCharRestrictedPtr>,
+               ArgSpec<VarArgType>]
+          >,
           FunctionSpec<
               "vsprintf",
               RetValSpec<IntType>,
@@ -968,6 +1013,13 @@ def StdC : StandardSpec<"stdc"> {
               RetValSpec<IntType>,
               [ArgSpec<IntType>, ArgSpec<FILEPtr>]
           >,
+          FunctionSpec<
+              "vasprintf",
+              RetValSpec<IntType>,
+              [ArgSpec<CharRestrictedPtrPtr>,
+               ArgSpec<ConstCharRestrictedPtr>,
+               ArgSpec<VaListType>]
+          >,
       ],
       [
           ObjectSpec<
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
index a963a92bfb074..d68f3ae6a26bb 100644
--- a/libc/src/__support/FPUtil/BasicOperations.h
+++ b/libc/src/__support/FPUtil/BasicOperations.h
@@ -11,12 +11,15 @@
 
 #include "FEnvImpl.h"
 #include "FPBits.h"
+#include "dyadic_float.h"
 
-#include "FEnvImpl.h"
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/big_int.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/types.h"
 #include "src/__support/uint128.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -27,6 +30,90 @@ LIBC_INLINE T abs(T x) {
   return FPBits<T>(x).abs().get_val();
 }
 
+namespace internal {
+
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> max(T x, T y) {
+  FPBits<T> x_bits(x);
+  FPBits<T> y_bits(y);
+
+  // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and y
+  // have different signs and both are not NaNs, we return the number with
+  // positive sign.
+  if (x_bits.sign() != y_bits.sign())
+    return x_bits.is_pos() ? x : y;
+  return x > y ? x : y;
+}
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#if defined(__LIBC_USE_BUILTIN_FMAXF16_FMINF16)
+template <> LIBC_INLINE float16 max(float16 x, float16 y) {
+  return __builtin_fmaxf16(x, y);
+}
+#elif !defined(LIBC_TARGET_ARCH_IS_AARCH64)
+template <> LIBC_INLINE float16 max(float16 x, float16 y) {
+  FPBits<float16> x_bits(x);
+  FPBits<float16> y_bits(y);
+
+  int16_t xi = static_cast<int16_t>(x_bits.uintval());
+  int16_t yi = static_cast<int16_t>(y_bits.uintval());
+  return ((xi > yi) != (xi < 0 && yi < 0)) ? x : y;
+}
+#endif
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#if defined(__LIBC_USE_BUILTIN_FMAX_FMIN) && !defined(LIBC_TARGET_ARCH_IS_X86)
+template <> LIBC_INLINE float max(float x, float y) {
+  return __builtin_fmaxf(x, y);
+}
+
+template <> LIBC_INLINE double max(double x, double y) {
+  return __builtin_fmax(x, y);
+}
+#endif
+
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> min(T x, T y) {
+  FPBits<T> x_bits(x);
+  FPBits<T> y_bits(y);
+
+  // To make sure that fmin(+0, -0) == -0 == fmin(-0, +0), whenever x and y have
+  // different signs and both are not NaNs, we return the number with negative
+  // sign.
+  if (x_bits.sign() != y_bits.sign())
+    return x_bits.is_neg() ? x : y;
+  return x < y ? x : y;
+}
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#if defined(__LIBC_USE_BUILTIN_FMAXF16_FMINF16)
+template <> LIBC_INLINE float16 min(float16 x, float16 y) {
+  return __builtin_fminf16(x, y);
+}
+#elif !defined(LIBC_TARGET_ARCH_IS_AARCH64)
+template <> LIBC_INLINE float16 min(float16 x, float16 y) {
+  FPBits<float16> x_bits(x);
+  FPBits<float16> y_bits(y);
+
+  int16_t xi = static_cast<int16_t>(x_bits.uintval());
+  int16_t yi = static_cast<int16_t>(y_bits.uintval());
+  return ((xi < yi) != (xi < 0 && yi < 0)) ? x : y;
+}
+#endif
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#if defined(__LIBC_USE_BUILTIN_FMAX_FMIN) && !defined(LIBC_TARGET_ARCH_IS_X86)
+template <> LIBC_INLINE float min(float x, float y) {
+  return __builtin_fminf(x, y);
+}
+
+template <> LIBC_INLINE double min(double x, double y) {
+  return __builtin_fmin(x, y);
+}
+#endif
+
+} // namespace internal
+
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
 LIBC_INLINE T fmin(T x, T y) {
   const FPBits<T> bitx(x), bity(y);
@@ -35,12 +122,7 @@ LIBC_INLINE T fmin(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    // To make sure that fmin(+0, -0) == -0 == fmin(-0, +0), whenever x and
-    // y has different signs and both are not NaNs, we return the number
-    // with negative sign.
-    return bitx.is_neg() ? x : y;
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -51,12 +133,7 @@ LIBC_INLINE T fmax(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and
-    // y has different signs and both are not NaNs, we return the number
-    // with positive sign.
-    return bitx.is_neg() ? y : x;
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -67,9 +144,7 @@ LIBC_INLINE T fmaximum(T x, T y) {
     return x;
   if (bity.is_nan())
     return y;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? y : x);
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -80,9 +155,7 @@ LIBC_INLINE T fminimum(T x, T y) {
     return x;
   if (bity.is_nan())
     return y;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg()) ? x : y;
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -97,9 +170,7 @@ LIBC_INLINE T fmaximum_num(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? y : x);
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -114,9 +185,7 @@ LIBC_INLINE T fminimum_num(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? x : y);
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -252,7 +321,7 @@ totalorder(T x, T y) {
   StorageType x_u = x_bits.uintval();
   StorageType y_u = y_bits.uintval();
 
-  using signed_t = cpp::make_signed_t<StorageType>;
+  using signed_t = make_integral_or_big_int_signed_t<StorageType>;
   signed_t x_signed = static_cast<signed_t>(x_u);
   signed_t y_signed = static_cast<signed_t>(y_u);
 
@@ -269,12 +338,21 @@ totalordermag(T x, T y) {
 template <typename T>
 LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> getpayload(T x) {
   using FPBits = FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
   FPBits x_bits(x);
 
   if (!x_bits.is_nan())
     return T(-1.0);
 
-  return T(x_bits.uintval() & (FPBits::FRACTION_MASK >> 1));
+  StorageType payload = x_bits.uintval() & (FPBits::FRACTION_MASK >> 1);
+
+  if constexpr (is_big_int_v<StorageType>) {
+    DyadicFloat<FPBits::STORAGE_LEN> payload_dfloat(Sign::POS, 0, payload);
+
+    return static_cast<T>(payload_dfloat);
+  } else {
+    return static_cast<T>(payload);
+  }
 }
 
 template <bool IsSignaling, typename T>
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index 793d3a121c742..ea1e0e8b39d10 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -75,19 +75,6 @@ add_header_library(
     libc.src.__support.common
 )
 
-add_header_library(
-  basic_operations
-  HDRS
-    BasicOperations.h
-  DEPENDS
-    .fp_bits
-    .fenv_impl
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.uint128
-    libc.src.__support.common
-    libc.src.__support.macros.optimization
-)
-
 add_header_library(
   division_and_remainder_operations
   HDRS
@@ -113,21 +100,6 @@ add_header_library(
 )
 
 
-add_header_library(
-  hypot
-  HDRS
-    Hypot.h
-  DEPENDS
-    .basic_operations
-    .fenv_impl
-    .fp_bits
-    .rounding_mode
-    libc.src.__support.common
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.uint128
-)
-
 add_header_library(
   sqrt
   HDRS
@@ -208,6 +180,38 @@ add_header_library(
     libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  basic_operations
+  HDRS
+    BasicOperations.h
+  DEPENDS
+    .dyadic_float
+    .fp_bits
+    .fenv_impl
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.big_int
+    libc.src.__support.uint128
+    libc.src.__support.common
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.types
+)
+
+add_header_library(
+  hypot
+  HDRS
+    Hypot.h
+  DEPENDS
+    .basic_operations
+    .fenv_impl
+    .fp_bits
+    .rounding_mode
+    libc.src.__support.common
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.uint128
+)
+
 add_header_library(
   manipulation_functions
   HDRS
diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h
index a5a991e75ed01..6aa808446d6d9 100644
--- a/libc/src/__support/FPUtil/Hypot.h
+++ b/libc/src/__support/FPUtil/Hypot.h
@@ -109,45 +109,39 @@ LIBC_INLINE T hypot(T x, T y) {
   using StorageType = typename FPBits<T>::StorageType;
   using DStorageType = typename DoubleLength<StorageType>::Type;
 
-  FPBits_t x_bits(x), y_bits(y);
+  FPBits_t x_abs = FPBits_t(x).abs();
+  FPBits_t y_abs = FPBits_t(y).abs();
 
-  if (x_bits.is_inf() || y_bits.is_inf()) {
-    return FPBits_t::inf().get_val();
-  }
-  if (x_bits.is_nan()) {
-    return x;
-  }
-  if (y_bits.is_nan()) {
+  bool x_abs_larger = x_abs.uintval() >= y_abs.uintval();
+
+  FPBits_t a_bits = x_abs_larger ? x_abs : y_abs;
+  FPBits_t b_bits = x_abs_larger ? y_abs : x_abs;
+
+  if (LIBC_UNLIKELY(a_bits.is_inf_or_nan())) {
+    if (x_abs.is_signaling_nan() || y_abs.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits_t::quiet_nan().get_val();
+    }
+    if (x_abs.is_inf() || y_abs.is_inf())
+      return FPBits_t::inf().get_val();
+    if (x_abs.is_nan())
+      return x;
+    // y is nan
     return y;
   }
 
-  uint16_t x_exp = x_bits.get_biased_exponent();
-  uint16_t y_exp = y_bits.get_biased_exponent();
-  uint16_t exp_diff = (x_exp > y_exp) ? (x_exp - y_exp) : (y_exp - x_exp);
+  uint16_t a_exp = a_bits.get_biased_exponent();
+  uint16_t b_exp = b_bits.get_biased_exponent();
 
-  if ((exp_diff >= FPBits_t::FRACTION_LEN + 2) || (x == 0) || (y == 0)) {
-    return abs(x) + abs(y);
-  }
+  if ((a_exp - b_exp >= FPBits_t::FRACTION_LEN + 2) || (x == 0) || (y == 0))
+    return x_abs.get_val() + y_abs.get_val();
 
-  uint16_t a_exp, b_exp, out_exp;
-  StorageType a_mant, b_mant;
+  uint64_t out_exp = a_exp;
+  StorageType a_mant = a_bits.get_mantissa();
+  StorageType b_mant = b_bits.get_mantissa();
   DStorageType a_mant_sq, b_mant_sq;
   bool sticky_bits;
 
-  if (abs(x) >= abs(y)) {
-    a_exp = x_exp;
-    a_mant = x_bits.get_mantissa();
-    b_exp = y_exp;
-    b_mant = y_bits.get_mantissa();
-  } else {
-    a_exp = y_exp;
-    a_mant = y_bits.get_mantissa();
-    b_exp = x_exp;
-    b_mant = x_bits.get_mantissa();
-  }
-
-  out_exp = a_exp;
-
   // Add an extra bit to simplify the final rounding bit computation.
   constexpr StorageType ONE = StorageType(1) << (FPBits_t::FRACTION_LEN + 1);
 
@@ -165,11 +159,10 @@ LIBC_INLINE T hypot(T x, T y) {
     a_exp = 1;
   }
 
-  if (b_exp != 0) {
+  if (b_exp != 0)
     b_mant |= ONE;
-  } else {
+  else
     b_exp = 1;
-  }
 
   a_mant_sq = static_cast<DStorageType>(a_mant) * a_mant;
   b_mant_sq = static_cast<DStorageType>(b_mant) * b_mant;
@@ -260,6 +253,10 @@ LIBC_INLINE T hypot(T x, T y) {
   }
 
   y_new |= static_cast<StorageType>(out_exp) << FPBits_t::FRACTION_LEN;
+
+  if (!(round_bit || sticky_bits || (r != 0)))
+    fputil::clear_except_if_required(FE_INEXACT);
+
   return cpp::bit_cast<T>(y_new);
 }
 
diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index c73f68723e232..43096aa529fc3 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -84,3 +84,20 @@ add_header_library(
     libc.src.__support.macros.attributes
     libc.src.__support.macros.optimization
 )
+
+add_header_library(
+  mul
+  HDRS
+    mul.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.dyadic_float
+    libc.src.__support.macros.attributes
+    libc.src.__support.macros.optimization
+)
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index 337301d86d919..e5683c8ff61ea 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -129,27 +129,27 @@ fma(InType x, InType y, InType z) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
-        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(x_bits.sign(),
-                                      static_cast<OutStorageType>(x_payload))
-              .get_val();
+        InStorageType x_payload = x_bits.get_mantissa();
+        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(x_bits.sign(),
+                                    static_cast<OutStorageType>(x_payload))
+            .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
-        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(y_bits.sign(),
-                                      static_cast<OutStorageType>(y_payload))
-              .get_val();
+        InStorageType y_payload = y_bits.get_mantissa();
+        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(y_bits.sign(),
+                                    static_cast<OutStorageType>(y_payload))
+            .get_val();
       }
 
       if (z_bits.is_quiet_nan()) {
-        InStorageType z_payload = static_cast<InStorageType>(getpayload(z));
-        if ((z_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(z_bits.sign(),
-                                      static_cast<OutStorageType>(z_payload))
-              .get_val();
+        InStorageType z_payload = z_bits.get_mantissa();
+        z_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(z_bits.sign(),
+                                    static_cast<OutStorageType>(z_payload))
+            .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
@@ -163,18 +163,27 @@ fma(InType x, InType y, InType z) {
   int y_exp = 0;
   int z_exp = 0;
 
+  // Denormal scaling = 2^(fraction length).
+  constexpr InStorageType IMPLICIT_MASK =
+      InFPBits::SIG_MASK - InFPBits::FRACTION_MASK;
+
+  constexpr InType DENORMAL_SCALING =
+      InFPBits::create_value(
+          Sign::POS, InFPBits::FRACTION_LEN + InFPBits::EXP_BIAS, IMPLICIT_MASK)
+          .get_val();
+
   // Normalize denormal inputs.
   if (LIBC_UNLIKELY(InFPBits(x).is_subnormal())) {
     x_exp -= InFPBits::FRACTION_LEN;
-    x *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
+    x *= DENORMAL_SCALING;
   }
   if (LIBC_UNLIKELY(InFPBits(y).is_subnormal())) {
     y_exp -= InFPBits::FRACTION_LEN;
-    y *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
+    y *= DENORMAL_SCALING;
   }
   if (LIBC_UNLIKELY(InFPBits(z).is_subnormal())) {
     z_exp -= InFPBits::FRACTION_LEN;
-    z *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
+    z *= DENORMAL_SCALING;
   }
 
   x_bits = InFPBits(x);
diff --git a/libc/src/__support/FPUtil/generic/add_sub.h b/libc/src/__support/FPUtil/generic/add_sub.h
index ec20a8723b704..850db3f83209e 100644
--- a/libc/src/__support/FPUtil/generic/add_sub.h
+++ b/libc/src/__support/FPUtil/generic/add_sub.h
@@ -56,19 +56,19 @@ add_or_sub(InType x, InType y) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
-        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(x_bits.sign(),
-                                      static_cast<OutStorageType>(x_payload))
-              .get_val();
+        InStorageType x_payload = x_bits.get_mantissa();
+        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(x_bits.sign(),
+                                    static_cast<OutStorageType>(x_payload))
+            .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
-        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(y_bits.sign(),
-                                      static_cast<OutStorageType>(y_payload))
-              .get_val();
+        InStorageType y_payload = y_bits.get_mantissa();
+        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(y_bits.sign(),
+                                    static_cast<OutStorageType>(y_payload))
+            .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
@@ -174,10 +174,12 @@ add_or_sub(InType x, InType y) {
     else
       aligned_min_mant_sticky = true;
 
+    InStorageType min_mant_sticky(static_cast<int>(aligned_min_mant_sticky));
+
     if (is_effectively_add)
-      result_mant = max_mant + (aligned_min_mant | aligned_min_mant_sticky);
+      result_mant = max_mant + (aligned_min_mant | min_mant_sticky);
     else
-      result_mant = max_mant - (aligned_min_mant | aligned_min_mant_sticky);
+      result_mant = max_mant - (aligned_min_mant | min_mant_sticky);
   }
 
   int result_exp = max_bits.get_exponent() - RESULT_FRACTION_LEN;
diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h
index 27545786aea17..dad1772fce750 100644
--- a/libc/src/__support/FPUtil/generic/div.h
+++ b/libc/src/__support/FPUtil/generic/div.h
@@ -49,19 +49,19 @@ div(InType x, InType y) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
-        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(x_bits.sign(),
-                                      static_cast<OutStorageType>(x_payload))
-              .get_val();
+        InStorageType x_payload = x_bits.get_mantissa();
+        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(x_bits.sign(),
+                                    static_cast<OutStorageType>(x_payload))
+            .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
-        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
-          return OutFPBits::quiet_nan(y_bits.sign(),
-                                      static_cast<OutStorageType>(y_payload))
-              .get_val();
+        InStorageType y_payload = y_bits.get_mantissa();
+        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(y_bits.sign(),
+                                    static_cast<OutStorageType>(y_payload))
+            .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
diff --git a/libc/src/__support/FPUtil/generic/mul.h b/libc/src/__support/FPUtil/generic/mul.h
new file mode 100644
index 0000000000000..20d9a77792762
--- /dev/null
+++ b/libc/src/__support/FPUtil/generic/mul.h
@@ -0,0 +1,105 @@
+//===-- Multiplication of IEEE 754 floating-point numbers -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace fputil::generic {
+
+template <typename OutType, typename InType>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
+                                 cpp::is_floating_point_v<InType> &&
+                                 sizeof(OutType) <= sizeof(InType),
+                             OutType>
+mul(InType x, InType y) {
+  using OutFPBits = FPBits<OutType>;
+  using OutStorageType = typename OutFPBits::StorageType;
+  using InFPBits = FPBits<InType>;
+  using InStorageType = typename InFPBits::StorageType;
+  // The product of two p-digit numbers is a 2p-digit number.
+  using DyadicFloat =
+      DyadicFloat<cpp::bit_ceil(2 * static_cast<size_t>(InFPBits::SIG_LEN))>;
+
+  InFPBits x_bits(x);
+  InFPBits y_bits(y);
+
+  Sign result_sign = x_bits.sign() == y_bits.sign() ? Sign::POS : Sign::NEG;
+
+  if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() ||
+                    x_bits.is_zero() || y_bits.is_zero())) {
+    if (x_bits.is_nan() || y_bits.is_nan()) {
+      if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
+        raise_except_if_required(FE_INVALID);
+
+      if (x_bits.is_quiet_nan()) {
+        InStorageType x_payload = x_bits.get_mantissa();
+        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(x_bits.sign(),
+                                    static_cast<OutStorageType>(x_payload))
+            .get_val();
+      }
+
+      if (y_bits.is_quiet_nan()) {
+        InStorageType y_payload = y_bits.get_mantissa();
+        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
+        return OutFPBits::quiet_nan(y_bits.sign(),
+                                    static_cast<OutStorageType>(y_payload))
+            .get_val();
+      }
+
+      return OutFPBits::quiet_nan().get_val();
+    }
+
+    if (x_bits.is_inf()) {
+      if (y_bits.is_zero()) {
+        set_errno_if_required(EDOM);
+        raise_except_if_required(FE_INVALID);
+        return OutFPBits::quiet_nan().get_val();
+      }
+
+      return OutFPBits::inf(result_sign).get_val();
+    }
+
+    if (y_bits.is_inf()) {
+      if (x_bits.is_zero()) {
+        set_errno_if_required(EDOM);
+        raise_except_if_required(FE_INVALID);
+        return OutFPBits::quiet_nan().get_val();
+      }
+
+      return OutFPBits::inf(result_sign).get_val();
+    }
+
+    // Now either x or y is zero, and the other one is finite.
+    return OutFPBits::zero(result_sign).get_val();
+  }
+
+  DyadicFloat xd(x);
+  DyadicFloat yd(y);
+
+  DyadicFloat result = quick_mul(xd, yd);
+  return result.template as<OutType, /*ShouldSignalExceptions=*/true>();
+}
+
+} // namespace fputil::generic
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 1b545c5096936..51811a27c1acd 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -282,7 +282,7 @@ int File::ungetc_unlocked(int c) {
   return c;
 }
 
-ErrorOr<int> File::seek(long offset, int whence) {
+ErrorOr<int> File::seek(off_t offset, int whence) {
   FileLock lock(this);
   if (prev_op == FileOp::WRITE && pos > 0) {
 
diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h
index 0cedf866519d6..42e1d11b4ab1a 100644
--- a/libc/src/__support/File/file.h
+++ b/libc/src/__support/File/file.h
@@ -183,7 +183,7 @@ class File {
     return read_unlocked(data, len);
   }
 
-  ErrorOr<int> seek(long offset, int whence);
+  ErrorOr<int> seek(off_t offset, int whence);
 
   ErrorOr<off_t> tell();
 
diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h
index 06d3e84a710e7..244dd41be3eec 100644
--- a/libc/src/__support/HashTable/randomness.h
+++ b/libc/src/__support/HashTable/randomness.h
@@ -36,7 +36,7 @@ LIBC_INLINE uint64_t next_random_seed() {
     entropy[1] = reinterpret_cast<uint64_t>(&state);
 #if defined(LIBC_HASHTABLE_USE_GETRANDOM)
     int errno_backup = libc_errno;
-    ssize_t count = sizeof(entropy);
+    size_t count = sizeof(entropy);
     uint8_t *buffer = reinterpret_cast<uint8_t *>(entropy);
     while (count > 0) {
       ssize_t len = getrandom(buffer, count, 0);
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index e11013121b04b..4742b2a00220b 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -33,7 +33,8 @@ int fcntl(int fd, int cmd, void *arg) {
 #error "fcntl and fcntl64 syscalls not available."
 #endif
 
-  switch (cmd) {
+  int new_cmd = cmd;
+  switch (new_cmd) {
   case F_OFD_SETLKW: {
     struct flock *flk = reinterpret_cast<struct flock *>(arg);
     // convert the struct to a flock64
@@ -44,7 +45,8 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
+    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
+                                             &flk64);
   }
   case F_OFD_GETLK:
   case F_OFD_SETLK: {
@@ -57,8 +59,8 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    int retVal =
-        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
+    int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
+                                                   new_cmd, &flk64);
     // On failure, return
     if (retVal == -1)
       return -1;
@@ -86,17 +88,31 @@ int fcntl(int fd, int cmd, void *arg) {
     libc_errno = -ret;
     return -1;
   }
-  // The general case
-  default: {
-    int retVal = LIBC_NAMESPACE::syscall_impl<int>(
-        FCNTL_SYSCALL_ID, fd, cmd, reinterpret_cast<void *>(arg));
-    if (retVal >= 0) {
-      return retVal;
-    }
-    libc_errno = -retVal;
-    return -1;
+#ifdef SYS_fcntl64
+  case F_GETLK: {
+    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
+      new_cmd = F_GETLK64;
+    break;
+  }
+  case F_SETLK: {
+    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
+      new_cmd = F_SETLK64;
+    break;
+  }
+  case F_SETLKW: {
+    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
+      new_cmd = F_SETLKW64;
+    break;
   }
+#endif
+  }
+  int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
+                                                 reinterpret_cast<void *>(arg));
+  if (retVal >= 0) {
+    return retVal;
   }
+  libc_errno = -retVal;
+  return -1;
 }
 
 } // namespace internal
diff --git a/libc/src/__support/block.h b/libc/src/__support/block.h
index 242602ad2f856..6e7186f3224b9 100644
--- a/libc/src/__support/block.h
+++ b/libc/src/__support/block.h
@@ -65,62 +65,61 @@ using cpp::optional;
 
 /// Memory region with links to adjacent blocks.
 ///
-/// The blocks do not encode their size directly. Instead, they encode offsets
-/// to the next and previous blocks using the type given by the `OffsetType`
-/// template parameter. The encoded offsets are simply the offsets divded by the
-/// minimum block alignment, `ALIGNMENT`.
+/// The blocks store their offsets to the previous and next blocks. The latter
+/// is also the block's size.
 ///
 /// The `ALIGNMENT` constant provided by the derived block is typically the
-/// minimum value of `alignof(OffsetType)`. Since the addressable range of a
-/// block is given by `std::numeric_limits<OffsetType>::max() *
-/// ALIGNMENT`, it may be advantageous to set a higher alignment if it allows
-/// using a smaller offset type, even if this wastes some bytes in order to
-/// align block headers.
-///
-/// Blocks will always be aligned to a `ALIGNMENT` boundary. Block sizes will
-/// always be rounded up to a multiple of `ALIGNMENT`.
+/// minimum value of `alignof(OffsetType)`. Blocks will always be aligned to a
+/// `ALIGNMENT` boundary. Block sizes will always be rounded up to a multiple of
+/// `ALIGNMENT`.
 ///
 /// As an example, the diagram below represents two contiguous
 /// `Block<uint32_t, 8>`s. The indices indicate byte offsets:
 ///
 /// @code{.unparsed}
 /// Block 1:
-/// +---------------------+------+--------------+
-/// | Header              | Info | Usable space |
-/// +----------+----------+------+--------------+
-/// | prev     | next     |      |              |
-/// | 0......3 | 4......7 | 8..9 | 10.......280 |
-/// | 00000000 | 00000046 | 8008 |  <app data>  |
-/// +----------+----------+------+--------------+
+/// +---------------------+--------------+
+/// | Header              | Usable space |
+/// +----------+----------+--------------+
+/// | prev     | next     |              |
+/// | 0......3 | 4......7 | 8........227 |
+/// | 00000000 | 00000230 |  <app data>  |
+/// +----------+----------+--------------+
 /// Block 2:
-/// +---------------------+------+--------------+
-/// | Header              | Info | Usable space |
-/// +----------+----------+------+--------------+
-/// | prev     | next     |      |              |
-/// | 0......3 | 4......7 | 8..9 | 10......1056 |
-/// | 00000046 | 00000106 | 2008 | f7f7....f7f7 |
-/// +----------+----------+------+--------------+
+/// +---------------------+--------------+
+/// | Header              | Usable space |
+/// +----------+----------+--------------+
+/// | prev     | next     |              |
+/// | 0......3 | 4......7 | 8........827 |
+/// | 00000230 | 00000830 | f7f7....f7f7 |
+/// +----------+----------+--------------+
 /// @endcode
 ///
-/// The overall size of the block (e.g. 280 bytes) is given by its next offset
-/// multiplied by the alignment (e.g. 0x106 * 4). Also, the next offset of a
-/// block matches the previous offset of its next block. The first block in a
-/// list is denoted by having a previous offset of `0`.
+/// The next offset of a block matches the previous offset of its next block.
+/// The first block in a list is denoted by having a previous offset of `0`.
 ///
 /// @tparam   OffsetType  Unsigned integral type used to encode offsets. Larger
 ///                       types can address more memory, but consume greater
 ///                       overhead.
 /// @tparam   kAlign      Sets the overall alignment for blocks. Minimum is
-///                       `alignof(OffsetType)` (the default). Larger values can
-///                       address more memory, but consume greater overhead.
-template <typename OffsetType = uintptr_t, size_t kAlign = alignof(OffsetType)>
+///                       `alignof(OffsetType)`, but the default is max_align_t,
+///                       since the usable space will then already be
+///                       aligned to max_align_t if the size of OffsetType is no
+///                       less than half of max_align_t. Larger values cause
+///                       greater overhead.
+template <typename OffsetType = uintptr_t, size_t kAlign = alignof(max_align_t)>
 class Block {
+  // Masks for the contents of the next_ field.
+  static constexpr size_t USED_MASK = 1 << 0;
+  static constexpr size_t LAST_MASK = 1 << 1;
+  static constexpr size_t SIZE_MASK = ~(USED_MASK | LAST_MASK);
+
 public:
   using offset_type = OffsetType;
   static_assert(cpp::is_unsigned_v<offset_type>,
                 "offset type must be unsigned");
-
-  static constexpr size_t ALIGNMENT = cpp::max(kAlign, alignof(offset_type));
+  static constexpr size_t ALIGNMENT =
+      cpp::max(cpp::max(kAlign, alignof(offset_type)), size_t{4});
   static constexpr size_t BLOCK_OVERHEAD = align_up(sizeof(Block), ALIGNMENT);
 
   // No copy or move.
@@ -147,14 +146,11 @@ class Block {
   }
 
   /// @returns The total size of the block in bytes, including the header.
-  size_t outer_size() const { return next_ * ALIGNMENT; }
+  size_t outer_size() const { return next_ & SIZE_MASK; }
 
   /// @returns The number of usable bytes inside the block.
   size_t inner_size() const { return outer_size() - BLOCK_OVERHEAD; }
 
-  /// @returns The number of bytes requested using AllocFirst or AllocLast.
-  size_t requested_size() const { return inner_size() - padding_; }
-
   /// @returns A pointer to the usable space inside this block.
   cpp::byte *usable_space() {
     return reinterpret_cast<cpp::byte *>(this) + BLOCK_OVERHEAD;
@@ -163,13 +159,10 @@ class Block {
     return reinterpret_cast<const cpp::byte *>(this) + BLOCK_OVERHEAD;
   }
 
-  /// Marks the block as free and merges it with any free neighbors.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer. If neither member is free, the returned pointer will point to the
-  /// original block. Otherwise, it will point to the new, larger block created
-  /// by merging adjacent free blocks together.
-  static void free(Block *&block);
+  // @returns The region of memory the block manages, including the header.
+  ByteSpan region() {
+    return {reinterpret_cast<cpp::byte *>(this), outer_size()};
+  }
 
   /// Attempts to split this block.
   ///
@@ -180,77 +173,33 @@ class Block {
   /// This method may fail if the remaining space is too small to hold a new
   /// block. If this method fails for any reason, the original block is
   /// unmodified.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer with a pointer to the new, smaller block.
-  static optional<Block *> split(Block *&block, size_t new_inner_size);
+  optional<Block *> split(size_t new_inner_size);
 
   /// Merges this block with the one that comes after it.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer with a pointer to the new, larger block.
-  static bool merge_next(Block *&block);
+  bool merge_next();
 
-  /// Fetches the block immediately after this one.
-  ///
-  /// For performance, this always returns a block pointer, even if the returned
-  /// pointer is invalid. The pointer is valid if and only if `last()` is false.
-  ///
-  /// Typically, after calling `Init` callers may save a pointer past the end of
-  /// the list using `next()`. This makes it easy to subsequently iterate over
-  /// the list:
-  /// @code{.cpp}
-  ///   auto result = Block<>::init(byte_span);
-  ///   Block<>* begin = *result;
-  ///   Block<>* end = begin->next();
-  ///   ...
-  ///   for (auto* block = begin; block != end; block = block->next()) {
-  ///     // Do something which each block.
-  ///   }
-  /// @endcode
+  /// @returns The block immediately after this one, or a null pointer if this
+  /// is the last block.
   Block *next() const;
 
-  /// @copydoc `next`.
-  static Block *next_block(const Block *block) {
-    return block == nullptr ? nullptr : block->next();
-  }
-
   /// @returns The block immediately before this one, or a null pointer if this
   /// is the first block.
   Block *prev() const;
 
-  /// @copydoc `prev`.
-  static Block *prev_block(const Block *block) {
-    return block == nullptr ? nullptr : block->prev();
-  }
-
-  /// Returns the current alignment of a block.
-  size_t alignment() const { return used() ? info_.alignment : 1; }
-
   /// Indicates whether the block is in use.
   ///
   /// @returns `true` if the block is in use or `false` if not.
-  bool used() const { return info_.used; }
-
-  /// Indicates whether this block is the last block or not (i.e. whether
-  /// `next()` points to a valid block or not). This is needed because
-  /// `next()` points to the end of this block, whether there is a valid
-  /// block there or not.
-  ///
-  /// @returns `true` is this is the last block or `false` if not.
-  bool last() const { return info_.last; }
+  bool used() const { return next_ & USED_MASK; }
 
   /// Marks this block as in use.
-  void mark_used() { info_.used = 1; }
+  void mark_used() { next_ |= USED_MASK; }
 
   /// Marks this block as free.
-  void mark_free() { info_.used = 0; }
-
-  /// Marks this block as the last one in the chain.
-  constexpr void mark_last() { info_.last = 1; }
+  void mark_free() { next_ &= ~USED_MASK; }
 
-  /// Clears the last bit from this block.
-  void clear_last() { info_.last = 1; }
+  /// Marks this block as the last one in the chain. Makes next() return
+  /// nullptr.
+  constexpr void mark_last() { next_ |= LAST_MASK; }
 
   /// @brief Checks if a block is valid.
   ///
@@ -321,10 +270,8 @@ class Block {
   static BlockInfo allocate(Block *block, size_t alignment, size_t size);
 
 private:
-  /// Consumes the block and returns as a span of bytes.
-  static ByteSpan as_bytes(Block *&&block);
-
-  /// Consumes the span of bytes and uses it to construct and return a block.
+  /// Construct a block to represent a span of bytes. Overwrites only enough
+  /// memory for the block header; the rest of the span is left alone.
   static Block *as_block(size_t prev_outer_size, ByteSpan bytes);
 
   /// Returns a `BlockStatus` that is either VALID or indicates the reason why
@@ -336,34 +283,28 @@ class Block {
 
   /// Like `split`, but assumes the caller has already checked to parameters to
   /// ensure the split will succeed.
-  static Block *split_impl(Block *&block, size_t new_inner_size);
+  Block *split_impl(size_t new_inner_size);
 
-  /// Offset (in increments of the minimum alignment) from this block to the
-  /// previous block. 0 if this is the first block.
+  /// Offset from this block to the previous block. 0 if this is the first
+  /// block.
   offset_type prev_ = 0;
 
-  /// Offset (in increments of the minimum alignment) from this block to the
-  /// next block. Valid even if this is the last block, since it equals the
-  /// size of the block.
+  /// Offset from this block to the next block. Valid even if this is the last
+  /// block, since it equals the size of the block.
   offset_type next_ = 0;
 
-  /// Information about the current state of the block:
+  /// Information about the current state of the block is stored in the two low
+  /// order bits of the next_ value. These are guaranteed free by a minimum
+  /// alignment (and thus, alignment of the size) of 4. The lowest bit is the
+  /// `used` flag, and the other bit is the `last` flag.
+  ///
   /// * If the `used` flag is set, the block's usable memory has been allocated
   ///   and is being used.
   /// * If the `last` flag is set, the block does not have a next block.
   /// * If the `used` flag is set, the alignment represents the requested value
   ///   when the memory was allocated, which may be less strict than the actual
   ///   alignment.
-  struct {
-    uint16_t used : 1;
-    uint16_t last : 1;
-    uint16_t alignment : 14;
-  } info_;
-
-  /// Number of bytes allocated beyond what was requested. This will be at most
-  /// the minimum alignment, i.e. `alignof(offset_type).`
-  uint16_t padding_ = 0;
-} __attribute__((packed, aligned(kAlign)));
+} __attribute__((packed, aligned(cpp::max(kAlign, size_t{4}))));
 
 // Public template method implementations.
 
@@ -394,7 +335,7 @@ Block<OffsetType, kAlign>::init(ByteSpan region) {
   if (region.size() < BLOCK_OVERHEAD)
     return {};
 
-  if (cpp::numeric_limits<OffsetType>::max() < region.size() / ALIGNMENT)
+  if (cpp::numeric_limits<OffsetType>::max() < region.size())
     return {};
 
   Block *block = as_block(0, region);
@@ -402,20 +343,6 @@ Block<OffsetType, kAlign>::init(ByteSpan region) {
   return block;
 }
 
-template <typename OffsetType, size_t kAlign>
-void Block<OffsetType, kAlign>::free(Block *&block) {
-  if (block == nullptr)
-    return;
-
-  block->mark_free();
-  Block *prev = block->prev();
-
-  if (merge_next(prev))
-    block = prev;
-
-  merge_next(block);
-}
-
 template <typename OffsetType, size_t kAlign>
 bool Block<OffsetType, kAlign>::can_allocate(size_t alignment,
                                              size_t size) const {
@@ -449,7 +376,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
 
     Block *original = info.block;
     optional<Block *> maybe_aligned_block =
-        Block::split(original, adjustment - BLOCK_OVERHEAD);
+        original->split(adjustment - BLOCK_OVERHEAD);
     LIBC_ASSERT(maybe_aligned_block.has_value() &&
                 "This split should always result in a new block. The check in "
                 "`can_allocate` ensures that we have enough space here to make "
@@ -458,7 +385,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
     if (Block *prev = original->prev()) {
       // If there is a block before this, we can merge the current one with the
       // newly created one.
-      merge_next(prev);
+      prev->merge_next();
     } else {
       // Otherwise, this was the very first block in the chain. Now we can make
       // it the new first block.
@@ -472,7 +399,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
   }
 
   // Now get a block for the requested size.
-  if (optional<Block *> next = Block::split(info.block, size))
+  if (optional<Block *> next = info.block->split(size))
     info.next = *next;
 
   return info;
@@ -480,14 +407,11 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
 
 template <typename OffsetType, size_t kAlign>
 optional<Block<OffsetType, kAlign> *>
-Block<OffsetType, kAlign>::split(Block *&block, size_t new_inner_size) {
-  if (block == nullptr)
+Block<OffsetType, kAlign>::split(size_t new_inner_size) {
+  if (used())
     return {};
 
-  if (block->used())
-    return {};
-
-  size_t old_inner_size = block->inner_size();
+  size_t old_inner_size = inner_size();
   new_inner_size = align_up(new_inner_size, ALIGNMENT);
   if (old_inner_size < new_inner_size)
     return {};
@@ -495,68 +419,61 @@ Block<OffsetType, kAlign>::split(Block *&block, size_t new_inner_size) {
   if (old_inner_size - new_inner_size < BLOCK_OVERHEAD)
     return {};
 
-  return split_impl(block, new_inner_size);
+  return split_impl(new_inner_size);
 }
 
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *
-Block<OffsetType, kAlign>::split_impl(Block *&block, size_t new_inner_size) {
-  size_t prev_outer_size = block->prev_ * ALIGNMENT;
+Block<OffsetType, kAlign>::split_impl(size_t new_inner_size) {
   size_t outer_size1 = new_inner_size + BLOCK_OVERHEAD;
-  bool is_last = block->last();
-  ByteSpan bytes = as_bytes(cpp::move(block));
-  Block *block1 = as_block(prev_outer_size, bytes.subspan(0, outer_size1));
-  Block *block2 = as_block(outer_size1, bytes.subspan(outer_size1));
-
-  if (is_last)
-    block2->mark_last();
-  else
-    block2->next()->prev_ = block2->next_;
-
-  block = cpp::move(block1);
-  return block2;
+  bool has_next = next();
+  ByteSpan new_region = region().subspan(outer_size1);
+  LIBC_ASSERT(!used() && "used blocks cannot be split");
+  // The low order bits of outer_size1 should both be zero, and is the correct
+  // value for the flags is false.
+  next_ = outer_size1;
+  LIBC_ASSERT(!used() && next() && "incorrect first split flags");
+  Block *new_block = as_block(outer_size1, new_region);
+
+  if (has_next) {
+    // The two flags are both false, so next_ is a plain size.
+    LIBC_ASSERT(!new_block->used() && next() && "flags disrupt use of size");
+    new_block->next()->prev_ = new_block->next_;
+  } else {
+    new_block->mark_last();
+  }
+  return new_block;
 }
 
 template <typename OffsetType, size_t kAlign>
-bool Block<OffsetType, kAlign>::merge_next(Block *&block) {
-  if (block == nullptr)
+bool Block<OffsetType, kAlign>::merge_next() {
+  if (used() || !next() || next()->used())
     return false;
 
-  if (block->last())
-    return false;
-
-  Block *next = block->next();
-  if (block->used() || next->used())
-    return false;
+  // Extend the size and copy the last() flag from the next block to this one.
+  next_ &= SIZE_MASK;
+  next_ += next()->next_;
 
-  size_t prev_outer_size = block->prev_ * ALIGNMENT;
-  bool is_last = next->last();
-  ByteSpan prev_bytes = as_bytes(cpp::move(block));
-  ByteSpan next_bytes = as_bytes(cpp::move(next));
-  size_t outer_size = prev_bytes.size() + next_bytes.size();
-  cpp::byte *merged = ::new (prev_bytes.data()) cpp::byte[outer_size];
-  block = as_block(prev_outer_size, ByteSpan(merged, outer_size));
-
-  if (is_last)
-    block->mark_last();
-  else
-    block->next()->prev_ = block->next_;
+  if (next()) {
+    // The two flags are both false, so next_ is a plain size.
+    LIBC_ASSERT(!used() && next() && "flags disrupt use of size");
+    next()->prev_ = next_;
+  }
 
   return true;
 }
 
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *Block<OffsetType, kAlign>::next() const {
-  uintptr_t addr =
-      last() ? 0 : reinterpret_cast<uintptr_t>(this) + outer_size();
-  return reinterpret_cast<Block *>(addr);
+  if (next_ & LAST_MASK)
+    return nullptr;
+  return reinterpret_cast<Block *>(reinterpret_cast<uintptr_t>(this) +
+                                   outer_size());
 }
 
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *Block<OffsetType, kAlign>::prev() const {
-  uintptr_t addr =
-      (prev_ == 0) ? 0
-                   : reinterpret_cast<uintptr_t>(this) - (prev_ * ALIGNMENT);
+  uintptr_t addr = (prev_ == 0) ? 0 : reinterpret_cast<uintptr_t>(this) - prev_;
   return reinterpret_cast<Block *>(addr);
 }
 
@@ -564,20 +481,10 @@ Block<OffsetType, kAlign> *Block<OffsetType, kAlign>::prev() const {
 
 template <typename OffsetType, size_t kAlign>
 constexpr Block<OffsetType, kAlign>::Block(size_t prev_outer_size,
-                                           size_t outer_size)
-    : info_{} {
-  prev_ = prev_outer_size / ALIGNMENT;
-  next_ = outer_size / ALIGNMENT;
-  info_.used = 0;
-  info_.last = 0;
-  info_.alignment = ALIGNMENT;
-}
-
-template <typename OffsetType, size_t kAlign>
-ByteSpan Block<OffsetType, kAlign>::as_bytes(Block *&&block) {
-  size_t block_size = block->outer_size();
-  cpp::byte *bytes = new (cpp::move(block)) cpp::byte[block_size];
-  return {bytes, block_size};
+                                           size_t outer_size) {
+  prev_ = prev_outer_size;
+  LIBC_ASSERT(outer_size % ALIGNMENT == 0 && "block sizes must be aligned");
+  next_ = outer_size;
 }
 
 template <typename OffsetType, size_t kAlign>
@@ -591,7 +498,7 @@ internal::BlockStatus Block<OffsetType, kAlign>::check_status() const {
   if (reinterpret_cast<uintptr_t>(this) % ALIGNMENT != 0)
     return internal::BlockStatus::MISALIGNED;
 
-  if (!last() && (this >= next() || this != next()->prev()))
+  if (next() && (this >= next() || this != next()->prev()))
     return internal::BlockStatus::NEXT_MISMATCHED;
 
   if (prev() && (this <= prev() || this != prev()->next()))
diff --git a/libc/src/__support/freelist_heap.h b/libc/src/__support/freelist_heap.h
index ce4f14b431585..fed00d06716cf 100644
--- a/libc/src/__support/freelist_heap.h
+++ b/libc/src/__support/freelist_heap.h
@@ -22,6 +22,9 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+extern "C" cpp::byte _end;
+extern "C" cpp::byte __llvm_libc_heap_limit;
+
 using cpp::optional;
 using cpp::span;
 
@@ -47,18 +50,10 @@ template <size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()> class FreeListHeap {
     size_t total_free_calls;
   };
 
-  FreeListHeap(span<cpp::byte> region)
-      : FreeListHeap(&*region.begin(), &*region.end(), region.size()) {
-    auto result = BlockType::init(region);
-    BlockType *block = *result;
-    freelist_.add_chunk(block_to_span(block));
-  }
+  constexpr FreeListHeap() : begin_(&_end), end_(&__llvm_libc_heap_limit) {}
 
-  constexpr FreeListHeap(void *start, cpp::byte *end, size_t total_bytes)
-      : block_region_start_(start), block_region_end_(end),
-        freelist_(DEFAULT_BUCKETS), heap_stats_{} {
-    heap_stats_.total_bytes = total_bytes;
-  }
+  constexpr FreeListHeap(span<cpp::byte> region)
+      : begin_(region.begin()), end_(region.end()) {}
 
   void *allocate(size_t size);
   void *aligned_allocate(size_t alignment, size_t size);
@@ -69,61 +64,57 @@ template <size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()> class FreeListHeap {
   void *calloc(size_t num, size_t size);
 
   const HeapStats &heap_stats() const { return heap_stats_; }
-  void reset_heap_stats() { heap_stats_ = {}; }
 
-  void *region_start() const { return block_region_start_; }
-  size_t region_size() const {
-    return reinterpret_cast<uintptr_t>(block_region_end_) -
-           reinterpret_cast<uintptr_t>(block_region_start_);
-  }
+  cpp::span<cpp::byte> region() const { return {begin_, end_}; }
 
-protected:
-  constexpr void set_freelist_node(typename FreeListType::FreeListNode &node,
-                                   cpp::span<cpp::byte> chunk) {
-    freelist_.set_freelist_node(node, chunk);
-  }
+private:
+  void init();
 
   void *allocate_impl(size_t alignment, size_t size);
 
-private:
   span<cpp::byte> block_to_span(BlockType *block) {
     return span<cpp::byte>(block->usable_space(), block->inner_size());
   }
 
-  bool is_valid_ptr(void *ptr) {
-    return ptr >= block_region_start_ && ptr < block_region_end_;
-  }
+  bool is_valid_ptr(void *ptr) { return ptr >= begin_ && ptr < end_; }
 
-  void *block_region_start_;
-  void *block_region_end_;
-  FreeListType freelist_;
-  HeapStats heap_stats_;
+  bool is_initialized_ = false;
+  cpp::byte *begin_;
+  cpp::byte *end_;
+  FreeListType freelist_{DEFAULT_BUCKETS};
+  HeapStats heap_stats_{};
 };
 
 template <size_t BUFF_SIZE, size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()>
-struct FreeListHeapBuffer : public FreeListHeap<NUM_BUCKETS> {
+class FreeListHeapBuffer : public FreeListHeap<NUM_BUCKETS> {
   using parent = FreeListHeap<NUM_BUCKETS>;
   using FreeListNode = typename parent::FreeListType::FreeListNode;
 
+public:
   constexpr FreeListHeapBuffer()
-      : FreeListHeap<NUM_BUCKETS>(&block, buffer + sizeof(buffer), BUFF_SIZE),
-        block(0, BUFF_SIZE), node{}, buffer{} {
-    block.mark_last();
-
-    cpp::span<cpp::byte> chunk(buffer, sizeof(buffer));
-    parent::set_freelist_node(node, chunk);
-  }
+      : FreeListHeap<NUM_BUCKETS>{buffer}, buffer{} {}
 
-  typename parent::BlockType block;
-  FreeListNode node;
-  cpp::byte buffer[BUFF_SIZE - sizeof(block) - sizeof(node)];
+private:
+  cpp::byte buffer[BUFF_SIZE];
 };
 
+template <size_t NUM_BUCKETS> void FreeListHeap<NUM_BUCKETS>::init() {
+  LIBC_ASSERT(!is_initialized_ && "duplicate initialization");
+  heap_stats_.total_bytes = region().size();
+  auto result = BlockType::init(region());
+  BlockType *block = *result;
+  freelist_.add_chunk(block_to_span(block));
+  is_initialized_ = true;
+}
+
 template <size_t NUM_BUCKETS>
 void *FreeListHeap<NUM_BUCKETS>::allocate_impl(size_t alignment, size_t size) {
   if (size == 0)
     return nullptr;
 
+  if (!is_initialized_)
+    init();
+
   // Find a chunk in the freelist. Split it if needed, then return.
   auto chunk =
       freelist_.find_chunk_if([alignment, size](span<cpp::byte> chunk) {
@@ -189,19 +180,19 @@ template <size_t NUM_BUCKETS> void FreeListHeap<NUM_BUCKETS>::free(void *ptr) {
   BlockType *prev = chunk_block->prev();
   BlockType *next = nullptr;
 
-  if (!chunk_block->last())
+  if (chunk_block->next())
     next = chunk_block->next();
 
   if (prev != nullptr && !prev->used()) {
     // Remove from freelist and merge
     freelist_.remove_chunk(block_to_span(prev));
     chunk_block = chunk_block->prev();
-    BlockType::merge_next(chunk_block);
+    chunk_block->merge_next();
   }
 
   if (next != nullptr && !next->used()) {
     freelist_.remove_chunk(block_to_span(next));
-    BlockType::merge_next(chunk_block);
+    chunk_block->merge_next();
   }
   // Add back to the freelist
   freelist_.add_chunk(block_to_span(chunk_block));
diff --git a/libc/src/__support/libc_assert.h b/libc/src/__support/libc_assert.h
index e3235199780c2..e21a58a0c8aad 100644
--- a/libc/src/__support/libc_assert.h
+++ b/libc/src/__support/libc_assert.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_LIBC_ASSERT_H
 
 #include "src/__support/macros/config.h"
-#ifdef LIBC_COPT_USE_C_ASSERT
+#if defined(LIBC_COPT_USE_C_ASSERT) || !defined(LIBC_FULL_BUILD)
 
 // The build is configured to just use the public <assert.h> API
 // for libc's internal assertions.
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 80d48be702070..d2cea367516db 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -14,6 +14,10 @@
 
 #include "architectures.h"
 
+#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+#define LIBC_TARGET_CPU_HAS_FULLFP16
+#endif
+
 #if defined(__SSE2__)
 #define LIBC_TARGET_CPU_HAS_SSE2
 #endif
@@ -49,4 +53,8 @@
 #define LIBC_TARGET_CPU_HAS_NEAREST_INT
 #endif
 
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#define LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
diff --git a/libc/src/__support/threads/linux/raw_mutex.h b/libc/src/__support/threads/linux/raw_mutex.h
index dbf8b53b42f7e..47f0aa70f1c46 100644
--- a/libc/src/__support/threads/linux/raw_mutex.h
+++ b/libc/src/__support/threads/linux/raw_mutex.h
@@ -13,6 +13,7 @@
 #include "src/__support/libc_assert.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
 #include "src/__support/threads/linux/futex_utils.h"
 #include "src/__support/threads/linux/futex_word.h"
 #include "src/__support/threads/sleep.h"
diff --git a/libc/src/__support/threads/thread.h b/libc/src/__support/threads/thread.h
index ce23a880e048a..68640cef32964 100644
--- a/libc/src/__support/threads/thread.h
+++ b/libc/src/__support/threads/thread.h
@@ -17,6 +17,7 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
+// TODO: fix this unguarded linux dep
 #include <linux/param.h> // for exec_pagesize.
 
 #include <stddef.h> // For size_t
@@ -97,11 +98,11 @@ struct alignas(STACK_ALIGNMENT) ThreadAttributes {
   //          exits. It will clean up the thread resources once the thread
   //          exits.
   cpp::Atomic<uint32_t> detach_state;
-  void *stack;                  // Pointer to the thread stack
-  unsigned long long stacksize; // Size of the stack
-  unsigned long long guardsize; // Guard size on stack
-  uintptr_t tls;                // Address to the thread TLS memory
-  uintptr_t tls_size;           // The size of area pointed to by |tls|.
+  void *stack;               // Pointer to the thread stack
+  size_t stacksize;          // Size of the stack
+  size_t guardsize;          // Guard size on stack
+  uintptr_t tls;             // Address to the thread TLS memory
+  uintptr_t tls_size;        // The size of area pointed to by |tls|.
   unsigned char owned_stack; // Indicates if the thread owns this stack memory
   int tid;
   ThreadStyle style;
diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index 639204d29590a..c76ec1407ad35 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -13,7 +13,7 @@
 extern "C" {
 
 void __stack_chk_fail(void) {
-  LIBC_NAMESPACE::write_to_stderr("stack smashing detected");
+  LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n");
   LIBC_NAMESPACE::abort();
 }
 
diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp
index 7a17a5a8217c7..d1600d1b050e3 100644
--- a/libc/src/errno/libc_errno.cpp
+++ b/libc/src/errno/libc_errno.cpp
@@ -9,6 +9,8 @@
 #include "libc_errno.h"
 #include "src/__support/macros/config.h"
 
+// libc uses a fallback default value, either system or thread local.
+#define LIBC_ERRNO_MODE_DEFAULT 0
 // libc never stores a value; `errno` macro uses get link-time failure.
 #define LIBC_ERRNO_MODE_UNDEFINED 1
 // libc maintains per-thread state (requires C++ `thread_local` support).
@@ -23,7 +25,8 @@
 // fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
 #define LIBC_ERRNO_MODE_SYSTEM 5
 
-#ifndef LIBC_ERRNO_MODE
+#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
+#undef LIBC_ERRNO_MODE
 #if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
 #define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
 #else
@@ -31,12 +34,14 @@
 #endif
 #endif // LIBC_ERRNO_MODE
 
-#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM
 #error LIBC_ERRNO_MODE must be one of the following values: \
+LIBC_ERRNO_MODE_DEFAULT, \
 LIBC_ERRNO_MODE_UNDEFINED, \
 LIBC_ERRNO_MODE_THREAD_LOCAL, \
 LIBC_ERRNO_MODE_SHARED, \
diff --git a/libc/src/gpu/CMakeLists.txt b/libc/src/gpu/CMakeLists.txt
index 4508abea7a888..e20228516b511 100644
--- a/libc/src/gpu/CMakeLists.txt
+++ b/libc/src/gpu/CMakeLists.txt
@@ -8,15 +8,3 @@ add_entrypoint_object(
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
 )
-
-add_entrypoint_object(
-  rpc_fprintf
-  SRCS
-    rpc_fprintf.cpp
-  HDRS
-    rpc_fprintf.h
-  DEPENDS
-    libc.src.stdio.gpu.gpu_file
-    libc.src.__support.RPC.rpc_client
-    libc.src.__support.GPU.utils
-)
diff --git a/libc/src/gpu/rpc_fprintf.cpp b/libc/src/gpu/rpc_fprintf.cpp
deleted file mode 100644
index 70056daa25e2e..0000000000000
--- a/libc/src/gpu/rpc_fprintf.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- GPU implementation of fprintf -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "rpc_fprintf.h"
-
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/GPU/utils.h"
-#include "src/__support/RPC/rpc_client.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/stdio/gpu/file.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-template <uint16_t opcode>
-int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
-                 size_t format_size, void *args, size_t args_size) {
-  uint64_t mask = gpu::get_lane_mask();
-  rpc::Client::Port port = rpc::client.open<opcode>();
-
-  if constexpr (opcode == RPC_PRINTF_TO_STREAM) {
-    port.send([&](rpc::Buffer *buffer) {
-      buffer->data[0] = reinterpret_cast<uintptr_t>(file);
-    });
-  }
-
-  port.send_n(format, format_size);
-  port.recv([&](rpc::Buffer *buffer) {
-    args_size = static_cast<size_t>(buffer->data[0]);
-  });
-  port.send_n(args, args_size);
-
-  uint32_t ret = 0;
-  for (;;) {
-    const char *str = nullptr;
-    port.recv([&](rpc::Buffer *buffer) {
-      ret = static_cast<uint32_t>(buffer->data[0]);
-      str = reinterpret_cast<const char *>(buffer->data[1]);
-    });
-    // If any lanes have a string argument it needs to be copied back.
-    if (!gpu::ballot(mask, str))
-      break;
-
-    uint64_t size = str ? internal::string_length(str) + 1 : 0;
-    port.send_n(str, size);
-  }
-
-  port.close();
-  return ret;
-}
-
-// TODO: Delete this and port OpenMP to use `printf`.
-// place of varargs. Once varargs support is added we will use that to
-// implement the real version.
-LLVM_LIBC_FUNCTION(int, rpc_fprintf,
-                   (::FILE *__restrict stream, const char *__restrict format,
-                    void *args, size_t size)) {
-  cpp::string_view str(format);
-  if (stream == stdout)
-    return fprintf_impl<RPC_PRINTF_TO_STDOUT>(stream, format, str.size() + 1,
-                                              args, size);
-  else if (stream == stderr)
-    return fprintf_impl<RPC_PRINTF_TO_STDERR>(stream, format, str.size() + 1,
-                                              args, size);
-  else
-    return fprintf_impl<RPC_PRINTF_TO_STREAM>(stream, format, str.size() + 1,
-                                              args, size);
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index dc2339896f2bb..e513d72260fb1 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -86,11 +86,27 @@ add_math_entrypoint_object(cosh)
 add_math_entrypoint_object(coshf)
 add_math_entrypoint_object(cospif)
 
+add_math_entrypoint_object(daddl)
+add_math_entrypoint_object(daddf128)
+add_math_entrypoint_object(ddivf128)
+add_math_entrypoint_object(dmull)
+add_math_entrypoint_object(dmulf128)
+
+add_math_entrypoint_object(dfmal)
+add_math_entrypoint_object(dfmaf128)
+
+add_math_entrypoint_object(dsqrtl)
+add_math_entrypoint_object(dsqrtf128)
+
+add_math_entrypoint_object(dsubl)
+add_math_entrypoint_object(dsubf128)
+
 add_math_entrypoint_object(erf)
 add_math_entrypoint_object(erff)
 
 add_math_entrypoint_object(exp)
 add_math_entrypoint_object(expf)
+add_math_entrypoint_object(expf16)
 
 add_math_entrypoint_object(exp2)
 add_math_entrypoint_object(exp2f)
@@ -118,11 +134,20 @@ add_math_entrypoint_object(f16fmaf)
 add_math_entrypoint_object(f16fmal)
 add_math_entrypoint_object(f16fmaf128)
 
+add_math_entrypoint_object(f16mul)
+add_math_entrypoint_object(f16mulf)
+add_math_entrypoint_object(f16mull)
+add_math_entrypoint_object(f16mulf128)
+
 add_math_entrypoint_object(f16sqrt)
 add_math_entrypoint_object(f16sqrtf)
 add_math_entrypoint_object(f16sqrtl)
 add_math_entrypoint_object(f16sqrtf128)
 
+add_math_entrypoint_object(fsqrt)
+add_math_entrypoint_object(fsqrtl)
+add_math_entrypoint_object(fsqrtf128)
+
 add_math_entrypoint_object(f16sub)
 add_math_entrypoint_object(f16subf)
 add_math_entrypoint_object(f16subl)
@@ -133,6 +158,7 @@ add_math_entrypoint_object(fabsf)
 add_math_entrypoint_object(fabsl)
 add_math_entrypoint_object(fabsf16)
 add_math_entrypoint_object(fabsf128)
+add_math_entrypoint_object(fadd)
 
 add_math_entrypoint_object(fdim)
 add_math_entrypoint_object(fdimf)
@@ -210,6 +236,8 @@ add_math_entrypoint_object(fminimum_mag_numf16)
 add_math_entrypoint_object(fminimum_mag_numf128)
 
 add_math_entrypoint_object(fmul)
+add_math_entrypoint_object(fmull)
+add_math_entrypoint_object(fmulf128)
 
 add_math_entrypoint_object(fmod)
 add_math_entrypoint_object(fmodf)
@@ -235,7 +263,10 @@ add_math_entrypoint_object(fromfpxl)
 add_math_entrypoint_object(fromfpxf16)
 add_math_entrypoint_object(fromfpxf128)
 
+add_math_entrypoint_object(getpayload)
+add_math_entrypoint_object(getpayloadf)
 add_math_entrypoint_object(getpayloadf16)
+add_math_entrypoint_object(getpayloadf128)
 
 add_math_entrypoint_object(hypot)
 add_math_entrypoint_object(hypotf)
@@ -387,7 +418,10 @@ add_math_entrypoint_object(scalbnl)
 add_math_entrypoint_object(scalbnf16)
 add_math_entrypoint_object(scalbnf128)
 
+add_math_entrypoint_object(setpayload)
+add_math_entrypoint_object(setpayloadf)
 add_math_entrypoint_object(setpayloadf16)
+add_math_entrypoint_object(setpayloadf128)
 
 add_math_entrypoint_object(setpayloadsigf16)
 
@@ -415,9 +449,16 @@ add_math_entrypoint_object(tanhf)
 add_math_entrypoint_object(tgamma)
 add_math_entrypoint_object(tgammaf)
 
+add_math_entrypoint_object(totalorder)
+add_math_entrypoint_object(totalorderf)
 add_math_entrypoint_object(totalorderf16)
+add_math_entrypoint_object(totalorderf128)
 
+add_math_entrypoint_object(totalordermag)
+add_math_entrypoint_object(totalordermagf)
+add_math_entrypoint_object(totalordermagl)
 add_math_entrypoint_object(totalordermagf16)
+add_math_entrypoint_object(totalordermagf128)
 
 add_math_entrypoint_object(trunc)
 add_math_entrypoint_object(truncf)
diff --git a/libc/src/gpu/rpc_fprintf.h b/libc/src/math/daddf128.h
similarity index 54%
rename from libc/src/gpu/rpc_fprintf.h
rename to libc/src/math/daddf128.h
index 7658b214c07c2..cf6c1999d9d03 100644
--- a/libc/src/gpu/rpc_fprintf.h
+++ b/libc/src/math/daddf128.h
@@ -1,4 +1,4 @@
-//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
+//===-- Implementation header for daddf128 ----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,18 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
-#define LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
+#ifndef LLVM_LIBC_SRC_MATH_DADDF128_H
+#define LLVM_LIBC_SRC_MATH_DADDF128_H
 
-#include "hdr/types/FILE.h"
 #include "src/__support/macros/config.h"
-#include <stddef.h>
+#include "src/__support/macros/properties/types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-int rpc_fprintf(::FILE *__restrict stream, const char *__restrict format,
-                void *argc, size_t size);
+double daddf128(float128 x, float128 y);
 
 } // namespace LIBC_NAMESPACE_DECL
 
-#endif // LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
+#endif // LLVM_LIBC_SRC_MATH_DADDF128_H
diff --git a/libc/src/math/daddl.h b/libc/src/math/daddl.h
new file mode 100644
index 0000000000000..8dc5d4d8d4b69
--- /dev/null
+++ b/libc/src/math/daddl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for daddl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DADDL_H
+#define LLVM_LIBC_SRC_MATH_DADDL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double daddl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DADDL_H
diff --git a/libc/src/math/ddivf128.h b/libc/src/math/ddivf128.h
new file mode 100644
index 0000000000000..f7a5ef07cb5fe
--- /dev/null
+++ b/libc/src/math/ddivf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for ddivf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DDIVF128_H
+#define LLVM_LIBC_SRC_MATH_DDIVF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double ddivf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DDIVF128_H
diff --git a/libc/src/math/dfmaf128.h b/libc/src/math/dfmaf128.h
new file mode 100644
index 0000000000000..1b2e72851c806
--- /dev/null
+++ b/libc/src/math/dfmaf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for dfmaf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DFMAF128_H
+#define LLVM_LIBC_SRC_MATH_DFMAF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dfmaf128(float128 x, float128 y, float128 z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DFMAF128_H
diff --git a/libc/src/math/dfmal.h b/libc/src/math/dfmal.h
new file mode 100644
index 0000000000000..e0867305b1286
--- /dev/null
+++ b/libc/src/math/dfmal.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for dfmal -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DFMAL_H
+#define LLVM_LIBC_SRC_MATH_DFMAL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dfmal(long double x, long double y, long double z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DFMAL_H
diff --git a/libc/src/math/dmulf128.h b/libc/src/math/dmulf128.h
new file mode 100644
index 0000000000000..623f22c910a41
--- /dev/null
+++ b/libc/src/math/dmulf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for dmulf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DMULF128_H
+#define LLVM_LIBC_SRC_MATH_DMULF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dmulf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DMULF128_H
diff --git a/libc/src/math/dmull.h b/libc/src/math/dmull.h
new file mode 100644
index 0000000000000..656776a603009
--- /dev/null
+++ b/libc/src/math/dmull.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for dmull -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DMULL_H
+#define LLVM_LIBC_SRC_MATH_DMULL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dmull(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DMULL_H
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index 9c23b8ca789bc..e9a6aadc6c44f 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
 ```
 - Add function specs to the file:
 ```
-  libc/spec/stdc.td
+  libc/newhdrgen/yaml/math.yaml
 ```
 
 ## Implementation
diff --git a/libc/src/math/dsqrtf128.h b/libc/src/math/dsqrtf128.h
new file mode 100644
index 0000000000000..97103392e3fef
--- /dev/null
+++ b/libc/src/math/dsqrtf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for dsqrtf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DSQRTF128_H
+#define LLVM_LIBC_SRC_MATH_DSQRTF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dsqrtf128(float128 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSQRTF128_H
diff --git a/libc/src/math/dsqrtl.h b/libc/src/math/dsqrtl.h
new file mode 100644
index 0000000000000..7bf0255697da4
--- /dev/null
+++ b/libc/src/math/dsqrtl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for dsqrtl ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DSQRTL_H
+#define LLVM_LIBC_SRC_MATH_DSQRTL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dsqrtl(long double x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSQRTL_H
diff --git a/libc/src/math/dsubf128.h b/libc/src/math/dsubf128.h
new file mode 100644
index 0000000000000..8ac58a916f0de
--- /dev/null
+++ b/libc/src/math/dsubf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for dsubf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DSUBF128_H
+#define LLVM_LIBC_SRC_MATH_DSUBF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dsubf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSUBF128_H
diff --git a/libc/src/math/dsubl.h b/libc/src/math/dsubl.h
new file mode 100644
index 0000000000000..a512bbe06d4e8
--- /dev/null
+++ b/libc/src/math/dsubl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for dsubl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DSUBL_H
+#define LLVM_LIBC_SRC_MATH_DSUBL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dsubl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSUBL_H
diff --git a/libc/src/math/expf16.h b/libc/src/math/expf16.h
new file mode 100644
index 0000000000000..8547f65f942d7
--- /dev/null
+++ b/libc/src/math/expf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for expf16 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXPF16_H
+#define LLVM_LIBC_SRC_MATH_EXPF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 expf16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXPF16_H
diff --git a/libc/src/math/f16mul.h b/libc/src/math/f16mul.h
new file mode 100644
index 0000000000000..89403cf219271
--- /dev/null
+++ b/libc/src/math/f16mul.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for f16mul ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_F16MUL_H
+#define LLVM_LIBC_SRC_MATH_F16MUL_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 f16mul(double x, double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_F16MUL_H
diff --git a/libc/src/math/f16mulf.h b/libc/src/math/f16mulf.h
new file mode 100644
index 0000000000000..755886d6f14d0
--- /dev/null
+++ b/libc/src/math/f16mulf.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for f16mulf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_F16MULF_H
+#define LLVM_LIBC_SRC_MATH_F16MULF_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 f16mulf(float x, float y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_F16MULF_H
diff --git a/libc/src/math/f16mulf128.h b/libc/src/math/f16mulf128.h
new file mode 100644
index 0000000000000..14371c57ca88c
--- /dev/null
+++ b/libc/src/math/f16mulf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for f16mulf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_F16MULF128_H
+#define LLVM_LIBC_SRC_MATH_F16MULF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 f16mulf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_F16MULF128_H
diff --git a/libc/src/math/f16mull.h b/libc/src/math/f16mull.h
new file mode 100644
index 0000000000000..a3177cadc1306
--- /dev/null
+++ b/libc/src/math/f16mull.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for f16mull -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_F16MULL_H
+#define LLVM_LIBC_SRC_MATH_F16MULL_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 f16mull(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_F16MULL_H
diff --git a/libc/src/math/fadd.h b/libc/src/math/fadd.h
new file mode 100644
index 0000000000000..ec3ce18bb676a
--- /dev/null
+++ b/libc/src/math/fadd.h
@@ -0,0 +1,20 @@
+//===-- Implementation of fadd function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/macros/config.h"
+
+#ifndef LLVM_LIBC_SRC_MATH_FADD_H
+#define LLVM_LIBC_SRC_MATH_FADD_H
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fadd(double x, double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FADD_H
diff --git a/libc/src/math/fmulf128.h b/libc/src/math/fmulf128.h
new file mode 100644
index 0000000000000..94137ae87eb1e
--- /dev/null
+++ b/libc/src/math/fmulf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for fmulf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMULF128_H
+#define LLVM_LIBC_SRC_MATH_FMULF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fmulf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FMULF128_H
diff --git a/libc/src/math/fmull.h b/libc/src/math/fmull.h
new file mode 100644
index 0000000000000..46e6c77cc66a2
--- /dev/null
+++ b/libc/src/math/fmull.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fmull -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMULL_H
+#define LLVM_LIBC_SRC_MATH_FMULL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fmull(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FMULL_H
diff --git a/libc/src/math/fsqrt.h b/libc/src/math/fsqrt.h
new file mode 100644
index 0000000000000..e6a8c3179f811
--- /dev/null
+++ b/libc/src/math/fsqrt.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fsqrt -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FSQRT_H
+#define LLVM_LIBC_SRC_MATH_FSQRT_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fsqrt(double x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FSQRT_H
diff --git a/libc/src/math/fsqrtf128.h b/libc/src/math/fsqrtf128.h
new file mode 100644
index 0000000000000..5286cbc33116d
--- /dev/null
+++ b/libc/src/math/fsqrtf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for fsqrtf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FSQRTF128_H
+#define LLVM_LIBC_SRC_MATH_FSQRTF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fsqrtf128(float128 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FSQRTF128_H
diff --git a/libc/src/math/fsqrtl.h b/libc/src/math/fsqrtl.h
new file mode 100644
index 0000000000000..a50eea3335458
--- /dev/null
+++ b/libc/src/math/fsqrtl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fsqrtl ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FSQRTL_H
+#define LLVM_LIBC_SRC_MATH_FSQRTL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fsqrtl(long double x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FSQRTL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 415ca3fbce796..58daffaf4e932 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -111,7 +111,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -129,6 +129,120 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
+add_entrypoint_object(
+  daddl
+  SRCS
+    daddl.cpp
+  HDRS
+    ../daddl.h
+  COMPILE_OPTIONS
+    -O3
+  DEPENDS
+    libc.src.__support.FPUtil.generic.add_sub
+)
+
+add_entrypoint_object(
+  daddf128
+  SRCS
+    daddf128.cpp
+  HDRS
+    ../daddf128.h
+  COMPILE_OPTIONS
+    -O3
+ DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.add_sub
+)
+
+add_entrypoint_object(
+  ddivf128
+  SRCS
+    ddivf128.cpp
+  HDRS
+    ../ddivf128.h
+  COMPILE_OPTIONS
+    -O3
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.div
+)
+
+add_entrypoint_object(
+  dfmaf128
+  SRCS
+    dfmaf128.cpp
+  HDRS
+    ../dfmaf128.h
+  DEPENDS
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.macros.properties.types
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  dfmal
+  SRCS
+    dfmal.cpp
+  HDRS
+    ../dfmal.h
+  DEPENDS
+    libc.src.__support.FPUtil.fma
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  dsqrtl
+  SRCS
+    dsqrtl.cpp
+  HDRS
+    ../dsqrtl.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.sqrt
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  dsqrtf128
+  SRCS
+    dsqrtf128.cpp
+  HDRS
+    ../dsqrtf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.sqrt
+  COMPILE_OPTIONS
+    -O3
+)
+
+
+add_entrypoint_object(
+  dsubf128
+  SRCS
+    dsubf128.cpp
+  HDRS
+    ../dsubf128.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.properties.types
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  dsubl
+  SRCS
+    dsubl.cpp
+  HDRS
+    ../dsubl.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.add_sub
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_header_library(
   range_reduction
   HDRS
@@ -399,7 +513,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -411,7 +527,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -423,7 +541,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -435,8 +553,12 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.compiler
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -452,6 +574,18 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  fadd
+  SRCS
+    fadd.cpp
+  HDRS
+    ../fadd.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   trunc
   SRCS
@@ -503,7 +637,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -572,7 +706,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -641,7 +775,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -710,7 +844,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -903,7 +1037,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -1181,6 +1315,28 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  expf16
+  SRCS
+    expf16.cpp
+  HDRS
+    ../expf16.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.CPP.array
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   exp2
   SRCS
@@ -1406,6 +1562,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1418,6 +1576,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1443,6 +1603,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2014,7 +2176,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2026,7 +2190,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2038,7 +2204,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2065,6 +2231,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 
@@ -2077,7 +2245,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2089,7 +2259,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2101,7 +2273,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2128,6 +2300,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2139,7 +2313,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2151,7 +2327,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2163,7 +2341,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2177,6 +2355,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2201,7 +2381,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2213,7 +2395,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2225,7 +2409,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2239,6 +2423,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2387,7 +2573,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2399,7 +2587,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2411,7 +2601,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2425,6 +2615,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2449,7 +2641,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2461,7 +2655,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2473,7 +2669,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -2487,6 +2683,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2633,11 +2831,32 @@ add_entrypoint_object(
   HDRS
     ../fmul.h
   DEPENDS
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.uint128
-    libc.src.__support.CPP.bit
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmull
+  SRCS
+    fmull.cpp
+  HDRS
+    ../fmull.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmulf128
+  SRCS
+    fmulf128.cpp
+  HDRS
+    ../fmulf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
   COMPILE_OPTIONS
     -O3
 )
@@ -2809,9 +3028,12 @@ add_entrypoint_object(
   HDRS
     ../hypotf.h
   DEPENDS
-    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
     libc.src.__support.FPUtil.sqrt
+    libc.src.__support.macros.optimization
   COMPILE_OPTIONS
     -O3
 )
@@ -3737,6 +3959,26 @@ add_entrypoint_object(
     libc.src.__support.macros.optimization
 )
 
+add_entrypoint_object(
+  atan2
+  SRCS
+    atan2.cpp
+  HDRS
+    ../atan2.h
+  COMPILE_OPTIONS
+    -O3
+  DEPENDS
+    .inv_trigf_utils
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.dyadic_float
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+)
+
 add_entrypoint_object(
   scalblnf16
   SRCS
@@ -3842,6 +4084,30 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  totalorder
+  SRCS
+    totalorder.cpp
+  HDRS
+    ../totalorder.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalorderf
+  SRCS
+    totalorderf.cpp
+  HDRS
+    ../totalorderf.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   totalorderf16
   SRCS
@@ -3854,6 +4120,54 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  totalorderf128
+  SRCS
+    totalorderf128.cpp
+  HDRS
+    ../totalorderf128.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.macros.properties.types
+  COMPILE_OPTIONS
+    -O3
+)
+add_entrypoint_object(
+  totalordermag
+  SRCS
+    totalordermag.cpp
+  HDRS
+    ../totalordermag.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalordermagf
+  SRCS
+    totalordermagf.cpp
+  HDRS
+    ../totalordermagf.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalordermagl
+  SRCS
+    totalordermagl.cpp
+  HDRS
+    ../totalordermagl.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   totalordermagf16
   SRCS
@@ -3866,6 +4180,43 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  totalordermagf128
+  SRCS
+    totalordermagf128.cpp
+  HDRS
+    ../totalordermagf128.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.macros.properties.types
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  getpayload
+  SRCS
+    getpayload.cpp
+  HDRS
+    ../getpayload.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  getpayloadf
+  SRCS
+    getpayloadf.cpp
+  HDRS
+    ../getpayloadf.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   getpayloadf16
   SRCS
@@ -3879,6 +4230,43 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  getpayloadf128
+  SRCS
+    getpayloadf128.cpp
+  HDRS
+    ../getpayloadf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  setpayload
+  SRCS
+    setpayload.cpp
+  HDRS
+    ../setpayload.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  setpayloadf
+  SRCS
+    setpayloadf.cpp
+  HDRS
+    ../setpayloadf.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   setpayloadf16
   SRCS
@@ -3892,6 +4280,19 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  setpayloadf128
+  SRCS
+    setpayloadf128.cpp
+  HDRS
+    ../setpayloadf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   setpayloadsigf16
   SRCS
@@ -4071,7 +4472,7 @@ add_entrypoint_object(
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O0 -ggdb3
+    -O3
 )
 
 add_entrypoint_object(
@@ -4165,6 +4566,43 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  fsqrt
+  SRCS
+    fsqrt.cpp
+  HDRS
+    ../fsqrt.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.sqrt
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fsqrtl
+  SRCS
+    fsqrtl.cpp
+  HDRS
+    ../fsqrtl.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.sqrt
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fsqrtf128
+  SRCS
+    fsqrtf128.cpp
+  HDRS
+    ../fsqrtf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.sqrt
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   cbrtf
   SRCS
@@ -4200,3 +4638,80 @@ add_entrypoint_object(
     libc.src.__support.macros.optimization
     libc.src.__support.integer_literals
 )
+
+add_entrypoint_object(
+  dmull
+  SRCS
+    dmull.cpp
+  HDRS
+    ../dmull.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  dmulf128
+  SRCS
+    dmulf128.cpp
+  HDRS
+    ../dmulf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  f16mul
+  SRCS
+    f16mul.cpp
+  HDRS
+    ../f16mul.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  f16mulf
+  SRCS
+    f16mulf.cpp
+  HDRS
+    ../f16mulf.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  f16mull
+  SRCS
+    f16mull.cpp
+  HDRS
+    ../f16mull.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  f16mulf128
+  SRCS
+    f16mulf128.cpp
+  HDRS
+    ../f16mulf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.mul
+  COMPILE_OPTIONS
+    -O3
+)
diff --git a/libc/src/math/generic/atan2.cpp b/libc/src/math/generic/atan2.cpp
new file mode 100644
index 0000000000000..c39deebca4d40
--- /dev/null
+++ b/libc/src/math/generic/atan2.cpp
@@ -0,0 +1,313 @@
+//===-- Double-precision atan2 function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "inv_trigf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using DoubleDouble = fputil::DoubleDouble;
+
+// atan(i/64) with i = 0..64, generated by Sollya with:
+// > for i from 0 to 64 do {
+//     a = round(atan(i/64), D, RN);
+//     b = round(atan(i/64) - a, D, RN);
+//     print("{", b, ",", a, "},");
+//   };
+constexpr fputil::DoubleDouble ATAN_I[65] = {
+    {0.0, 0.0},
+    {-0x1.220c39d4dff5p-61, 0x1.fff555bbb729bp-7},
+    {-0x1.5ec431444912cp-60, 0x1.ffd55bba97625p-6},
+    {-0x1.86ef8f794f105p-63, 0x1.7fb818430da2ap-5},
+    {-0x1.c934d86d23f1dp-60, 0x1.ff55bb72cfdeap-5},
+    {0x1.ac4ce285df847p-58, 0x1.3f59f0e7c559dp-4},
+    {-0x1.cfb654c0c3d98p-58, 0x1.7ee182602f10fp-4},
+    {0x1.f7b8f29a05987p-58, 0x1.be39ebe6f07c3p-4},
+    {-0x1.cd37686760c17p-59, 0x1.fd5ba9aac2f6ep-4},
+    {-0x1.b485914dacf8cp-59, 0x1.1e1fafb043727p-3},
+    {0x1.61a3b0ce9281bp-57, 0x1.3d6eee8c6626cp-3},
+    {-0x1.054ab2c010f3dp-58, 0x1.5c9811e3ec26ap-3},
+    {0x1.347b0b4f881cap-58, 0x1.7b97b4bce5b02p-3},
+    {0x1.cf601e7b4348ep-59, 0x1.9a6a8e96c8626p-3},
+    {0x1.17b10d2e0e5abp-61, 0x1.b90d7529260a2p-3},
+    {0x1.c648d1534597ep-57, 0x1.d77d5df205736p-3},
+    {0x1.8ab6e3cf7afbdp-57, 0x1.f5b75f92c80ddp-3},
+    {0x1.62e47390cb865p-56, 0x1.09dc597d86362p-2},
+    {0x1.30ca4748b1bf9p-57, 0x1.18bf5a30bf178p-2},
+    {-0x1.077cdd36dfc81p-56, 0x1.278372057ef46p-2},
+    {-0x1.963a544b672d8p-57, 0x1.362773707ebccp-2},
+    {-0x1.5d5e43c55b3bap-56, 0x1.44aa436c2af0ap-2},
+    {-0x1.2566480884082p-57, 0x1.530ad9951cd4ap-2},
+    {-0x1.a725715711fp-56, 0x1.614840309cfe2p-2},
+    {-0x1.c63aae6f6e918p-56, 0x1.6f61941e4def1p-2},
+    {0x1.69c885c2b249ap-56, 0x1.7d5604b63b3f7p-2},
+    {0x1.b6d0ba3748fa8p-56, 0x1.8b24d394a1b25p-2},
+    {0x1.9e6c988fd0a77p-56, 0x1.98cd5454d6b18p-2},
+    {-0x1.24dec1b50b7ffp-56, 0x1.a64eec3cc23fdp-2},
+    {0x1.ae187b1ca504p-56, 0x1.b3a911da65c6cp-2},
+    {-0x1.cc1ce70934c34p-56, 0x1.c0db4c94ec9fp-2},
+    {-0x1.a2cfa4418f1adp-56, 0x1.cde53432c1351p-2},
+    {0x1.a2b7f222f65e2p-56, 0x1.dac670561bb4fp-2},
+    {0x1.0e53dc1bf3435p-56, 0x1.e77eb7f175a34p-2},
+    {-0x1.a3992dc382a23p-57, 0x1.f40dd0b541418p-2},
+    {-0x1.b32c949c9d593p-55, 0x1.0039c73c1a40cp-1},
+    {-0x1.d5b495f6349e6p-56, 0x1.0657e94db30dp-1},
+    {0x1.974fa13b5404fp-58, 0x1.0c6145b5b43dap-1},
+    {-0x1.2bdaee1c0ee35p-58, 0x1.1255d9bfbd2a9p-1},
+    {0x1.c621cec00c301p-55, 0x1.1835a88be7c13p-1},
+    {-0x1.928df287a668fp-58, 0x1.1e00babdefeb4p-1},
+    {0x1.c421c9f38224ep-57, 0x1.23b71e2cc9e6ap-1},
+    {-0x1.09e73b0c6c087p-56, 0x1.2958e59308e31p-1},
+    {0x1.c5d5e9ff0cf8dp-55, 0x1.2ee628406cbcap-1},
+    {0x1.1021137c71102p-55, 0x1.345f01cce37bbp-1},
+    {-0x1.2304331d8bf46p-55, 0x1.39c391cd4171ap-1},
+    {0x1.ecf8b492644fp-56, 0x1.3f13fb89e96f4p-1},
+    {-0x1.f76d0163f79c8p-56, 0x1.445065b795b56p-1},
+    {0x1.2419a87f2a458p-56, 0x1.4978fa3269ee1p-1},
+    {0x1.4a33dbeb3796cp-55, 0x1.4e8de5bb6ec04p-1},
+    {-0x1.1bb74abda520cp-55, 0x1.538f57b89061fp-1},
+    {-0x1.5e5c9d8c5a95p-56, 0x1.587d81f732fbbp-1},
+    {0x1.0028e4bc5e7cap-57, 0x1.5d58987169b18p-1},
+    {-0x1.2b785350ee8c1p-57, 0x1.6220d115d7b8ep-1},
+    {-0x1.6ea6febe8bbbap-56, 0x1.66d663923e087p-1},
+    {-0x1.a80386188c50ep-55, 0x1.6b798920b3d99p-1},
+    {-0x1.8c34d25aadef6p-56, 0x1.700a7c5784634p-1},
+    {0x1.7b2a6165884a1p-59, 0x1.748978fba8e0fp-1},
+    {0x1.406a08980374p-55, 0x1.78f6bbd5d315ep-1},
+    {0x1.560821e2f3aa9p-55, 0x1.7d528289fa093p-1},
+    {-0x1.bf76229d3b917p-56, 0x1.819d0b7158a4dp-1},
+    {0x1.6b66e7fc8b8c3p-57, 0x1.85d69576cc2c5p-1},
+    {-0x1.55b9a5e177a1bp-55, 0x1.89ff5ff57f1f8p-1},
+    {-0x1.ec182ab042f61p-56, 0x1.8e17aa99cc05ep-1},
+    {0x1.1a62633145c07p-55, 0x1.921fb54442d18p-1},
+};
+
+// Approximate atan(x) for |x| <= 2^-7.
+// Using degree-9 Taylor polynomial:
+//  P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9;
+// Then the absolute error is bounded by:
+//   |atan(x) - P(x)| < |x|^11/11 < 2^(-7*11) / 11 < 2^-80.
+// And the relative error is bounded by:
+//   |(atan(x) - P(x))/atan(x)| < |x|^10 / 10 < 2^-73.
+// For x = x_hi + x_lo, fully expand the polynomial and drop any terms less than
+//   ulp(x_hi^3 / 3) gives us:
+// P(x) ~ x_hi - x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+//        + x_lo * (1 - x_hi^2 + x_hi^4)
+DoubleDouble atan_eval(const DoubleDouble &x) {
+  DoubleDouble p;
+  p.hi = x.hi;
+  double x_hi_sq = x.hi * x.hi;
+  // c0 ~ x_hi^2 * 1/5 - 1/3
+  double c0 = fputil::multiply_add(x_hi_sq, 0x1.999999999999ap-3,
+                                   -0x1.5555555555555p-2);
+  // c1 ~ x_hi^2 * 1/9 - 1/7
+  double c1 = fputil::multiply_add(x_hi_sq, 0x1.c71c71c71c71cp-4,
+                                   -0x1.2492492492492p-3);
+  // x_hi^3
+  double x_hi_3 = x_hi_sq * x.hi;
+  // x_hi^4
+  double x_hi_4 = x_hi_sq * x_hi_sq;
+  // d0 ~ 1/3 - x_hi^2 / 5 + x_hi^4 / 7 - x_hi^6 / 9
+  double d0 = fputil::multiply_add(x_hi_4, c1, c0);
+  // x_lo - x_lo * x_hi^2 + x_lo * x_hi^4
+  double d1 = fputil::multiply_add(x_hi_4 - x_hi_sq, x.lo, x.lo);
+  // p.lo ~ -x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+  //        + x_lo * (1 - x_hi^2 + x_hi^4)
+  p.lo = fputil::multiply_add(x_hi_3, d0, d1);
+  return p;
+}
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x) as
+// follow:
+
+// * Range reduction 1: signness
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// forming by the 0x axis and the vector (x, y) on the 0xy-plane.
+// In particular, we have that:
+//   atan2(y, x) = atan( y/x )         if x >= 0 and y >= 0 (I-quadrant)
+//               = pi + atan( y/x )    if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( y/x )   if x < 0 and y < 0   (III-quadrant)
+//               = atan( y/x )         if x >= 0 and y < 0  (IV-quadrant)
+// Since atan function is odd, we can use the formula:
+//   atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+//   atan2(y, x) = atan( |y|/|x| )         if x >= 0 and y >= 0 (I-quadrant)
+//               = pi - atan( |y|/|x| )    if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( |y|/|x| )   if x < 0 and y < 0   (III-quadrant)
+//               = -atan( |y|/|x| )        if x >= 0 and y < 0  (IV-quadrant)
+// Which can be simplified to:
+//   atan2(y, x) = sign(y) * atan( |y|/|x| )             if x >= 0
+//               = sign(y) * (pi - atan( |y|/|x| ))      if x < 0
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+//   atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follow:
+//   atan2(y, x) = sign(y) * atan( |y|/|x|)            if 0 <= |y| <= x
+//               = sign(y) * (pi/2 - atan( |x|/|y| )   if 0 <= x < |y|
+//               = sign(y) * (pi - atan( |y|/|x| ))    if 0 <= |y| <= -x
+//               = sign(y) * (pi/2 + atan( |x|/|y| ))  if 0 <= -x < |y|
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+//   atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree.  To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+//   | n/d - idx/64 | <= 1/128.
+// In particular,
+//   idx := round(2^6 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+//   atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
+// For the accurate pass, we use the addition formula:
+//   atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
+//                                = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
+// And for the fast pass, we use degree-9 Taylor polynomial to compute the RHS:
+//   atan(u) ~ P(u) = u - u^3/3 + u^5/5 - u^7/7 + u^9/9
+// with absolute errors bounded by:
+//   |atan(u) - P(u)| < |u|^11 / 11 < 2^-80
+// and relative errors bounded by:
+//   |(atan(u) - P(u)) / P(u)| < u^10 / 11 < 2^-73.
+
+LLVM_LIBC_FUNCTION(double, atan2, (double y, double x)) {
+  using FPBits = fputil::FPBits<double>;
+
+  constexpr double IS_NEG[2] = {1.0, -1.0};
+  constexpr DoubleDouble ZERO = {0.0, 0.0};
+  constexpr DoubleDouble MZERO = {-0.0, -0.0};
+  constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p+1};
+  constexpr DoubleDouble MPI = {-0x1.1a62633145c07p-53, -0x1.921fb54442d18p+1};
+  constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
+                                      0x1.921fb54442d18p0};
+  constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
+                                       -0x1.921fb54442d18p0};
+  constexpr DoubleDouble PI_OVER_4 = {0x1.1a62633145c07p-55,
+                                      0x1.921fb54442d18p-1};
+  constexpr DoubleDouble THREE_PI_OVER_4 = {0x1.a79394c9e8a0ap-54,
+                                            0x1.2d97c7f3321d2p+1};
+  // Adjustment for constant term:
+  //   CONST_ADJ[x_sign][y_sign][recip]
+  constexpr DoubleDouble CONST_ADJ[2][2][2] = {
+      {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+      {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
+
+  FPBits x_bits(x), y_bits(y);
+  bool x_sign = x_bits.sign().is_neg();
+  bool y_sign = y_bits.sign().is_neg();
+  x_bits = x_bits.abs();
+  y_bits = y_bits.abs();
+  uint64_t x_abs = x_bits.uintval();
+  uint64_t y_abs = y_bits.uintval();
+  bool recip = x_abs < y_abs;
+  uint64_t min_abs = recip ? x_abs : y_abs;
+  uint64_t max_abs = !recip ? x_abs : y_abs;
+  unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+  unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+  double num = FPBits(min_abs).get_val();
+  double den = FPBits(max_abs).get_val();
+
+  // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
+  // overflow, or close to underflow.
+  if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) {
+    if (x_bits.is_nan() || y_bits.is_nan())
+      return FPBits::quiet_nan().get_val();
+    unsigned x_except = x_abs == 0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+    unsigned y_except = y_abs == 0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+    // Exceptional cases:
+    //   EXCEPT[y_except][x_except][x_is_neg]
+    // with x_except & y_except:
+    //   0: zero
+    //   1: finite, non-zero
+    //   2: infinity
+    constexpr DoubleDouble EXCEPTS[3][3][2] = {
+        {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2},
+         {PI_OVER_2, PI_OVER_2},
+         {PI_OVER_4, THREE_PI_OVER_4}},
+    };
+
+    if ((x_except != 1) || (y_except != 1)) {
+      DoubleDouble r = EXCEPTS[y_except][x_except][x_sign];
+      return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+    }
+    bool scale_up = min_exp < 128U;
+    bool scale_down = max_exp > 0x7ffU - 128U;
+    // At least one input is denormal, multiply both numerator and denominator
+    // by some large enough power of 2 to normalize denormal inputs.
+    if (scale_up) {
+      num *= 0x1.0p64;
+      if (!scale_down)
+        den *= 0x1.0p64;
+    } else if (scale_down) {
+      den *= 0x1.0p-64;
+      if (!scale_up)
+        num *= 0x1.0p-64;
+    }
+
+    min_abs = FPBits(num).uintval();
+    max_abs = FPBits(den).uintval();
+    min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+    max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+  }
+
+  double final_sign = IS_NEG[(x_sign != y_sign) != recip];
+  DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip];
+  unsigned exp_diff = max_exp - min_exp;
+  // We have the following bound for normalized n and d:
+  //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
+  if (LIBC_UNLIKELY(exp_diff > 54)) {
+    return fputil::multiply_add(final_sign, const_term.hi,
+                                final_sign * (const_term.lo + num / den));
+  }
+
+  double k = fputil::nearest_integer(64.0 * num / den);
+  unsigned idx = static_cast<unsigned>(k);
+  // k = idx / 64
+  k *= 0x1.0p-6;
+
+  // Range reduction:
+  // atan(n/d) - atan(k/64) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
+  //                        = atan((n - d * k/64)) / (d + n * k/64))
+  DoubleDouble num_k = fputil::exact_mult(num, k);
+  DoubleDouble den_k = fputil::exact_mult(den, k);
+
+  // num_dd = n - d * k
+  DoubleDouble num_dd = fputil::exact_add(num - den_k.hi, -den_k.lo);
+  // den_dd = d + n * k
+  DoubleDouble den_dd = fputil::exact_add(den, num_k.hi);
+  den_dd.lo += num_k.lo;
+
+  // q = (n - d * k) / (d + n * k)
+  DoubleDouble q = fputil::div(num_dd, den_dd);
+  // p ~ atan(q)
+  DoubleDouble p = atan_eval(q);
+
+  DoubleDouble r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+  r.hi *= final_sign;
+  r.lo *= final_sign;
+
+  return r.hi + r.lo;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp
index 708bc4cfd4860..8af31c6623a02 100644
--- a/libc/src/math/generic/ceilf16.cpp
+++ b/libc/src/math/generic/ceilf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_ceilf(x));
 #else
   return fputil::ceil(x);
diff --git a/libc/src/math/generic/copysign.cpp b/libc/src/math/generic/copysign.cpp
index 149d725af08e2..186bb2c5983f4 100644
--- a/libc/src/math/generic/copysign.cpp
+++ b/libc/src/math/generic/copysign.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysign(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf.cpp b/libc/src/math/generic/copysignf.cpp
index 17cd70d37c308..c79e50b61ebda 100644
--- a/libc/src/math/generic/copysignf.cpp
+++ b/libc/src/math/generic/copysignf.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysignf(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf16.cpp b/libc/src/math/generic/copysignf16.cpp
index 42695b3b4a6de..546622f049ebe 100644
--- a/libc/src/math/generic/copysignf16.cpp
+++ b/libc/src/math/generic/copysignf16.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, copysignf16, (float16 x, float16 y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysignf16(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/daddf128.cpp b/libc/src/math/generic/daddf128.cpp
new file mode 100644
index 0000000000000..6edba3b8f5e43
--- /dev/null
+++ b/libc/src/math/generic/daddf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of daddf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/daddf128.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, daddf128, (float128 x, float128 y)) {
+  return fputil::generic::add<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/daddl.cpp b/libc/src/math/generic/daddl.cpp
new file mode 100644
index 0000000000000..708de3833869c
--- /dev/null
+++ b/libc/src/math/generic/daddl.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of daddl function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/daddl.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, daddl, (long double x, long double y)) {
+  return fputil::generic::add<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/ddivf128.cpp b/libc/src/math/generic/ddivf128.cpp
new file mode 100644
index 0000000000000..1ce4fd66b42cb
--- /dev/null
+++ b/libc/src/math/generic/ddivf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of ddivf128 function--------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ddivf128.h"
+#include "src/__support/FPUtil/generic/div.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, ddivf128, (float128 x, float128 y)) {
+  return fputil::generic::div<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dfmaf128.cpp b/libc/src/math/generic/dfmaf128.cpp
new file mode 100644
index 0000000000000..b6e1bdb085cf7
--- /dev/null
+++ b/libc/src/math/generic/dfmaf128.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of dfmaf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DFMAf128_H
+#define LLVM_LIBC_SRC_MATH_DFMAf128_H
+
+#include "src/math/dfmaf128.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dfmaf128, (float128 x, float128 y, float128 z)) {
+  return fputil::fma<double>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DFMAf128_H
diff --git a/libc/src/math/generic/dfmal.cpp b/libc/src/math/generic/dfmal.cpp
new file mode 100644
index 0000000000000..02e0ce84ace83
--- /dev/null
+++ b/libc/src/math/generic/dfmal.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of dfmal function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dfmal.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dfmal,
+                   (long double x, long double y, long double z)) {
+  return fputil::fma<double>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dmulf128.cpp b/libc/src/math/generic/dmulf128.cpp
new file mode 100644
index 0000000000000..7e6ef95362c09
--- /dev/null
+++ b/libc/src/math/generic/dmulf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dmulf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dmulf128.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dmulf128, (float128 x, float128 y)) {
+  return fputil::generic::mul<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dmull.cpp b/libc/src/math/generic/dmull.cpp
new file mode 100644
index 0000000000000..428caa84a9977
--- /dev/null
+++ b/libc/src/math/generic/dmull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dmull function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dmull.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dmull, (long double x, long double y)) {
+  return fputil::generic::mul<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsqrtf128.cpp b/libc/src/math/generic/dsqrtf128.cpp
new file mode 100644
index 0000000000000..ad8339309b0f3
--- /dev/null
+++ b/libc/src/math/generic/dsqrtf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsqrtf128 function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsqrtf128.h"
+#include "src/__support/FPUtil/generic/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsqrtf128, (float128 x)) {
+  return fputil::sqrt<double>(x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsqrtl.cpp b/libc/src/math/generic/dsqrtl.cpp
new file mode 100644
index 0000000000000..bf1dae9161460
--- /dev/null
+++ b/libc/src/math/generic/dsqrtl.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsqrtl function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsqrtl.h"
+#include "src/__support/FPUtil/generic/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsqrtl, (long double x)) {
+  return fputil::sqrt<double>(x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsubf128.cpp b/libc/src/math/generic/dsubf128.cpp
new file mode 100644
index 0000000000000..1b2f1214b3a6f
--- /dev/null
+++ b/libc/src/math/generic/dsubf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsubf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsubf128.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsubf128, (float128 x, float128 y)) {
+  return fputil::generic::sub<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsubl.cpp b/libc/src/math/generic/dsubl.cpp
new file mode 100644
index 0000000000000..8b567d0869d2a
--- /dev/null
+++ b/libc/src/math/generic/dsubl.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsubl function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsubl.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsubl, (long double x, long double y)) {
+  return fputil::generic::sub<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expf16.cpp b/libc/src/math/generic/expf16.cpp
new file mode 100644
index 0000000000000..b198c559dfedb
--- /dev/null
+++ b/libc/src/math/generic/expf16.cpp
@@ -0,0 +1,172 @@
+//===-- Half-precision e^x function ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static constexpr fputil::ExceptValues<float16, 2> EXPF16_EXCEPTS_LO = {{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    // x = 0x1.de4p-8, expf16(x) = 0x1.01cp+0 (RZ)
+    {0x1f79U, 0x3c07U, 1U, 0U, 0U},
+    // x = 0x1.73cp-6, expf16(x) = 0x1.05cp+0 (RZ)
+    {0x25cfU, 0x3c17U, 1U, 0U, 0U},
+}};
+
+static constexpr fputil::ExceptValues<float16, 3> EXPF16_EXCEPTS_HI = {{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    // x = 0x1.c34p+0, expf16(x) = 0x1.74cp+2 (RZ)
+    {0x3f0dU, 0x45d3U, 1U, 0U, 1U},
+    // x = -0x1.488p-5, expf16(x) = 0x1.ebcp-1 (RZ)
+    {0xa922U, 0x3bafU, 1U, 0U, 0U},
+    // x = -0x1.55p-5, expf16(x) = 0x1.ebp-1 (RZ)
+    {0xa954U, 0x3bacU, 1U, 0U, 0U},
+}};
+
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > for i from -18 to 12 do print(round(exp(i), SG, RN));
+static constexpr cpp::array<float, 31> EXP_HI = {
+    0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f,
+    0x1.be6c7p-21f,  0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f,
+    0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f,  0x1.de16bap-11f,
+    0x1.44e52p-9f,   0x1.b993fep-8f,  0x1.2c155cp-6f,  0x1.97db0cp-5f,
+    0x1.152aaap-3f,  0x1.78b564p-2f,  0x1p+0f,         0x1.5bf0a8p+1f,
+    0x1.d8e64cp+2f,  0x1.415e5cp+4f,  0x1.b4c902p+5f,  0x1.28d38ap+7f,
+    0x1.936dc6p+8f,  0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f,
+    0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f,
+};
+
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN));
+static constexpr cpp::array<float, 8> EXP_MID = {
+    0x1p+0f,        0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f,
+    0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f,
+};
+
+LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits x_bits(x);
+
+  uint16_t x_u = x_bits.uintval();
+  uint16_t x_abs = x_u & 0x7fffU;
+
+  // When 0 < |x| <= 2^(-5), or |x| >= 12, or x is NaN.
+  if (LIBC_UNLIKELY(x_abs <= 0x2800U || x_abs >= 0x4a00U)) {
+    // exp(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // When x >= 12.
+    if (x_bits.is_pos() && x_abs >= 0x4a00U) {
+      // exp(+inf) = +inf
+      if (x_bits.is_inf())
+        return FPBits::inf().get_val();
+
+      switch (fputil::quick_get_round()) {
+      case FE_TONEAREST:
+      case FE_UPWARD:
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW);
+        return FPBits::inf().get_val();
+      default:
+        return FPBits::max_normal().get_val();
+      }
+    }
+
+    // When x <= -18.
+    if (x_u >= 0xcc80U) {
+      // exp(-inf) = +0
+      if (x_bits.is_inf())
+        return FPBits::zero().get_val();
+
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
+
+      switch (fputil::quick_get_round()) {
+      case FE_UPWARD:
+        return FPBits::min_subnormal().get_val();
+      default:
+        return FPBits::zero().get_val();
+      }
+    }
+
+    // When 0 < |x| <= 2^(-5).
+    if (x_abs <= 0x2800U && !x_bits.is_zero()) {
+      if (auto r = EXPF16_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+        return r.value();
+
+      float xf = x;
+      // Degree-3 minimax polynomial generated by Sollya with the following
+      // commands:
+      //   > display = hexadecimal;
+      //   > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]);
+      //   > 1 + x * P;
+      return static_cast<float16>(
+          fputil::polyeval(xf, 0x1p+0f, 0x1p+0f, 0x1.0004p-1f, 0x1.555778p-3f));
+    }
+  }
+
+  if (auto r = EXPF16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // For -18 < x < 12, to compute exp(x), we perform the following range
+  // reduction: find hi, mid, lo, such that:
+  //   x = hi + mid + lo, in which
+  //     hi is an integer,
+  //     mid * 2^3 is an integer,
+  //     -2^(-4) <= lo < 2^(-4).
+  // In particular,
+  //   hi + mid = round(x * 2^3) * 2^(-3).
+  // Then,
+  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
+  // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID
+  // respectively.  exp(lo) is computed using a degree-3 minimax polynomial
+  // generated by Sollya.
+
+  float xf = x;
+  float kf = fputil::nearest_integer(xf * 0x1.0p+3f);
+  int x_hi_mid = static_cast<int>(kf);
+  int x_hi = x_hi_mid >> 3;
+  int x_mid = x_hi_mid & 0x7;
+  // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x
+  float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf);
+
+  float exp_hi = EXP_HI[x_hi + 18];
+  float exp_mid = EXP_MID[x_mid];
+  // Degree-3 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]);
+  //   > 1 + x * P;
+  float exp_lo =
+      fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f);
+  return static_cast<float16>(exp_hi * exp_mid * exp_lo);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index d9485864df899..a4dbf38ab27d0 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -30,13 +30,6 @@
 #define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
 #endif
 
-// #define DEBUGDEBUG
-
-#ifdef DEBUGDEBUG
-#include <iomanip>
-#include <iostream>
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
 
 using fputil::DoubleDouble;
diff --git a/libc/src/math/generic/f16mul.cpp b/libc/src/math/generic/f16mul.cpp
new file mode 100644
index 0000000000000..f7a5225b60f11
--- /dev/null
+++ b/libc/src/math/generic/f16mul.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of f16mul function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16mul.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float16, f16mul, (double x, double y)) {
+  return fputil::generic::mul<float16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16mulf.cpp b/libc/src/math/generic/f16mulf.cpp
new file mode 100644
index 0000000000000..2c04664f804ea
--- /dev/null
+++ b/libc/src/math/generic/f16mulf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of f16mulf function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16mulf.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float16, f16mulf, (float x, float y)) {
+  return fputil::generic::mul<float16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16mulf128.cpp b/libc/src/math/generic/f16mulf128.cpp
new file mode 100644
index 0000000000000..7e2d6a0d194ae
--- /dev/null
+++ b/libc/src/math/generic/f16mulf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of f16mulf128 function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16mulf128.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float16, f16mulf128, (float128 x, float128 y)) {
+  return fputil::generic::mul<float16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16mull.cpp b/libc/src/math/generic/f16mull.cpp
new file mode 100644
index 0000000000000..fc66fba4d9f23
--- /dev/null
+++ b/libc/src/math/generic/f16mull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of f16mull function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16mull.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float16, f16mull, (long double x, long double y)) {
+  return fputil::generic::mul<float16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabs.cpp b/libc/src/math/generic/fabs.cpp
index 472297aecb2f7..55fa958cd7c00 100644
--- a/libc/src/math/generic/fabs.cpp
+++ b/libc/src/math/generic/fabs.cpp
@@ -13,6 +13,12 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(double, fabs, (double x)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_fabs(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf.cpp b/libc/src/math/generic/fabsf.cpp
index ad4fcb30c795d..2ba18d09bbd5b 100644
--- a/libc/src/math/generic/fabsf.cpp
+++ b/libc/src/math/generic/fabsf.cpp
@@ -13,6 +13,12 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(float, fabsf, (float x)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_fabsf(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf16.cpp b/libc/src/math/generic/fabsf16.cpp
index 57671fb6067e2..02e11330db718 100644
--- a/libc/src/math/generic/fabsf16.cpp
+++ b/libc/src/math/generic/fabsf16.cpp
@@ -10,9 +10,20 @@
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/compiler.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
+  // For x86, GCC generates better code from the generic implementation.
+  // https://godbolt.org/z/K9orM4hTa
+#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT) &&                                 \
+    !(defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_GCC))
+  return __builtin_fabsf16(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fadd.cpp b/libc/src/math/generic/fadd.cpp
new file mode 100644
index 0000000000000..66e5188cbcfd4
--- /dev/null
+++ b/libc/src/math/generic/fadd.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fadd function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fadd.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fadd, (double x, double y)) {
+  return fputil::generic::add<float>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp
index 84e4b0730ac68..3092048f5ab06 100644
--- a/libc/src/math/generic/floorf16.cpp
+++ b/libc/src/math/generic/floorf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_floorf(x));
 #else
   return fputil::floor(x);
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
index 16fa11e93df09..64c27d6e2f956 100644
--- a/libc/src/math/generic/fmul.cpp
+++ b/libc/src/math/generic/fmul.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fmul function------------------------------------===//
+//===-- Implementation of fmul function -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,123 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/fmul.h"
-#include "src/__support/CPP/bit.h"
-#include "src/__support/FPUtil/BasicOperations.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/generic/mul.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/uint128.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
-  auto x_bits = fputil::FPBits<double>(x);
-
-  auto y_bits = fputil::FPBits<double>(y);
-
-  auto output_sign = (x_bits.sign() != y_bits.sign()) ? Sign::NEG : Sign::POS;
-
-  if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() ||
-                    x_bits.is_zero() || y_bits.is_zero())) {
-    if (x_bits.is_nan())
-      return static_cast<float>(x);
-    if (y_bits.is_nan())
-      return static_cast<float>(y);
-    if (x_bits.is_inf())
-      return y_bits.is_zero()
-                 ? fputil::FPBits<float>::quiet_nan().get_val()
-                 : fputil::FPBits<float>::inf(output_sign).get_val();
-    if (y_bits.is_inf())
-      return x_bits.is_zero()
-                 ? fputil::FPBits<float>::quiet_nan().get_val()
-                 : fputil::FPBits<float>::inf(output_sign).get_val();
-    // Now either x or y is zero, and the other one is finite.
-    return fputil::FPBits<float>::zero(output_sign).get_val();
-  }
-
-  uint64_t mx, my;
-
-  // Get mantissa and append the hidden bit if needed.
-  mx = x_bits.get_explicit_mantissa();
-  my = y_bits.get_explicit_mantissa();
-
-  // Get the corresponding biased exponent.
-  int ex = x_bits.get_explicit_exponent();
-  int ey = y_bits.get_explicit_exponent();
-
-  // Count the number of leading zeros of the explicit mantissas.
-  int nx = cpp::countl_zero(mx);
-  int ny = cpp::countl_zero(my);
-  // Shift the leading 1 bit to the most significant bit.
-  mx <<= nx;
-  my <<= ny;
-
-  // Adjust exponent accordingly: If x or y are normal, we will only need to
-  // shift by (exponent length + sign bit = 11 bits. If x or y are denormal, we
-  // will need to shift more than 11 bits.
-  ex -= (nx - 11);
-  ey -= (ny - 11);
-
-  UInt128 product = static_cast<UInt128>(mx) * static_cast<UInt128>(my);
-  int32_t dm1;
-  uint64_t highs, lows;
-  uint64_t g, hight, lowt;
-  uint32_t m;
-  uint32_t b;
-  int c;
-
-  highs = static_cast<uint64_t>(product >> 64);
-  c = static_cast<int>(highs >= 0x8000000000000000);
-  lows = static_cast<uint64_t>(product);
-
-  lowt = (lows != 0);
-
-  dm1 = ex + ey + c + fputil::FPBits<float>::EXP_BIAS;
-
-  int round_mode = fputil::quick_get_round();
-  if (dm1 >= 255) {
-    if ((round_mode == FE_TOWARDZERO) ||
-        (round_mode == FE_UPWARD && output_sign.is_neg()) ||
-        (round_mode == FE_DOWNWARD && output_sign.is_pos())) {
-      return fputil::FPBits<float>::max_normal(output_sign).get_val();
-    }
-    return fputil::FPBits<float>::inf().get_val();
-  } else if (dm1 <= 0) {
-
-    int m_shift = 40 + c - dm1;
-    int g_shift = m_shift - 1;
-    int h_shift = 64 - g_shift;
-    m = (m_shift >= 64) ? 0 : static_cast<uint32_t>(highs >> m_shift);
-
-    g = g_shift >= 64 ? 0 : (highs >> g_shift) & 1;
-    hight = h_shift >= 64 ? highs : (highs << h_shift) != 0;
-
-    dm1 = 0;
-  } else {
-    m = static_cast<uint32_t>(highs >> (39 + c));
-    g = (highs >> (38 + c)) & 1;
-    hight = (highs << (26 - c)) != 0;
-  }
-
-  if (round_mode == FE_TONEAREST) {
-    b = g && ((hight && lowt) || ((m & 1) != 0));
-  } else if ((output_sign.is_neg() && round_mode == FE_DOWNWARD) ||
-             (output_sign.is_pos() && round_mode == FE_UPWARD)) {
-    b = (g == 0 && (hight && lowt) == 0) ? 0 : 1;
-  } else {
-    b = 0;
-  }
-
-  uint32_t exp16 = (dm1 << 23);
-
-  uint32_t m2 = m & fputil::FPBits<float>::FRACTION_MASK;
-
-  uint32_t result = (exp16 + m2) + b;
-
-  auto result_bits = fputil::FPBits<float>(result);
-  result_bits.set_sign(output_sign);
-  return result_bits.get_val();
+  return fputil::generic::mul<float>(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fmulf128.cpp b/libc/src/math/generic/fmulf128.cpp
new file mode 100644
index 0000000000000..c0c55ace641b8
--- /dev/null
+++ b/libc/src/math/generic/fmulf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fmulf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmulf128.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fmulf128, (float128 x, float128 y)) {
+  return fputil::generic::mul<float>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fmull.cpp b/libc/src/math/generic/fmull.cpp
new file mode 100644
index 0000000000000..41ab165e7d09d
--- /dev/null
+++ b/libc/src/math/generic/fmull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fmull function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmull.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fmull, (long double x, long double y)) {
+  return fputil::generic::mul<float>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fsqrt.cpp b/libc/src/math/generic/fsqrt.cpp
new file mode 100644
index 0000000000000..d54471fd067bf
--- /dev/null
+++ b/libc/src/math/generic/fsqrt.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of fsqrt function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fsqrt.h"
+#include "src/__support/FPUtil/generic/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fsqrt, (double x)) { return fputil::sqrt<float>(x); }
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fsqrtf128.cpp b/libc/src/math/generic/fsqrtf128.cpp
new file mode 100644
index 0000000000000..f2c049529d6cd
--- /dev/null
+++ b/libc/src/math/generic/fsqrtf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fsqrt128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fsqrtf128.h"
+#include "src/__support/FPUtil/generic/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fsqrtf128, (float128 x)) {
+  return fputil::sqrt<float>(x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fsqrtl.cpp b/libc/src/math/generic/fsqrtl.cpp
new file mode 100644
index 0000000000000..b896a84aeded8
--- /dev/null
+++ b/libc/src/math/generic/fsqrtl.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fsqrtl function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fsqrtl.h"
+#include "src/__support/FPUtil/generic/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fsqrtl, (long double x)) {
+  return fputil::sqrt<float>(x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/getpayload.cpp b/libc/src/math/generic/getpayload.cpp
new file mode 100644
index 0000000000000..14d95516c42c3
--- /dev/null
+++ b/libc/src/math/generic/getpayload.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayload function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayload.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, getpayload, (const double *x)) {
+  return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/getpayloadf.cpp b/libc/src/math/generic/getpayloadf.cpp
new file mode 100644
index 0000000000000..22db186a8c354
--- /dev/null
+++ b/libc/src/math/generic/getpayloadf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayloadf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayloadf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, getpayloadf, (const float *x)) {
+  return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/getpayloadf128.cpp b/libc/src/math/generic/getpayloadf128.cpp
new file mode 100644
index 0000000000000..b57469eb7de91
--- /dev/null
+++ b/libc/src/math/generic/getpayloadf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayloadf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayloadf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float128, getpayloadf128, (const float128 *x)) {
+  return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp
index 75c55ed846726..959c0420ae149 100644
--- a/libc/src/math/generic/hypotf.cpp
+++ b/libc/src/math/generic/hypotf.cpp
@@ -6,11 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 #include "src/math/hypotf.h"
-#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -18,54 +21,75 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
   using DoubleBits = fputil::FPBits<double>;
   using FPBits = fputil::FPBits<float>;
 
-  FPBits x_bits(x), y_bits(y);
+  FPBits x_abs = FPBits(x).abs();
+  FPBits y_abs = FPBits(y).abs();
 
-  uint16_t x_exp = x_bits.get_biased_exponent();
-  uint16_t y_exp = y_bits.get_biased_exponent();
-  uint16_t exp_diff = (x_exp > y_exp) ? (x_exp - y_exp) : (y_exp - x_exp);
+  bool x_abs_larger = x_abs.uintval() >= y_abs.uintval();
 
-  if (exp_diff >= FPBits::FRACTION_LEN + 2) {
-    return fputil::abs(x) + fputil::abs(y);
-  }
+  FPBits a_bits = x_abs_larger ? x_abs : y_abs;
+  FPBits b_bits = x_abs_larger ? y_abs : x_abs;
 
-  double xd = static_cast<double>(x);
-  double yd = static_cast<double>(y);
+  uint32_t a_u = a_bits.uintval();
+  uint32_t b_u = b_bits.uintval();
 
-  // These squares are exact.
-  double x_sq = xd * xd;
-  double y_sq = yd * yd;
+  // Note: replacing `a_u >= FPBits::EXP_MASK` with `a_bits.is_inf_or_nan()`
+  // generates extra exponent bit masking instructions on x86-64.
+  if (LIBC_UNLIKELY(a_u >= FPBits::EXP_MASK)) {
+    // x or y is inf or nan
+    if (a_bits.is_signaling_nan() || b_bits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    if (a_bits.is_inf() || b_bits.is_inf())
+      return FPBits::inf().get_val();
+    return a_bits.get_val();
+  }
 
-  // Compute the sum of squares.
-  double sum_sq = x_sq + y_sq;
+  if (LIBC_UNLIKELY(a_u - b_u >=
+                    static_cast<uint32_t>((FPBits::FRACTION_LEN + 2)
+                                          << FPBits::FRACTION_LEN)))
+    return x_abs.get_val() + y_abs.get_val();
 
-  // Compute the rounding error with Fast2Sum algorithm:
-  // x_sq + y_sq = sum_sq - err
-  double err = (x_sq >= y_sq) ? (sum_sq - x_sq) - y_sq : (sum_sq - y_sq) - x_sq;
+  double ad = static_cast<double>(a_bits.get_val());
+  double bd = static_cast<double>(b_bits.get_val());
+
+  // These squares are exact.
+  double a_sq = ad * ad;
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+  double sum_sq = fputil::multiply_add(bd, bd, a_sq);
+#else
+  double b_sq = bd * bd;
+  double sum_sq = a_sq + b_sq;
+#endif
 
   // Take sqrt in double precision.
   DoubleBits result(fputil::sqrt<double>(sum_sq));
+  uint64_t r_u = result.uintval();
 
-  if (!DoubleBits(sum_sq).is_inf_or_nan()) {
-    // Correct rounding.
-    double r_sq = result.get_val() * result.get_val();
-    double diff = sum_sq - r_sq;
-    constexpr uint64_t MASK = 0x0000'0000'3FFF'FFFFULL;
-    uint64_t lrs = result.uintval() & MASK;
-
-    if (lrs == 0x0000'0000'1000'0000ULL && err < diff) {
-      result.set_uintval(result.uintval() | 1ULL);
-    } else if (lrs == 0x0000'0000'3000'0000ULL && err > diff) {
-      result.set_uintval(result.uintval() - 1ULL);
-    }
-  } else {
-    FPBits bits_x(x), bits_y(y);
-    if (bits_x.is_inf_or_nan() || bits_y.is_inf_or_nan()) {
-      if (bits_x.is_inf() || bits_y.is_inf())
-        return FPBits::inf().get_val();
-      if (bits_x.is_nan())
-        return x;
-      return y;
+  // If any of the sticky bits of the result are non-zero, except the LSB, then
+  // the rounded result is correct.
+  if (LIBC_UNLIKELY(((r_u + 1) & 0x0000'0000'0FFF'FFFE) == 0)) {
+    double r_d = result.get_val();
+
+    // Perform rounding correction.
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+    double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
+    double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
+#else
+    fputil::DoubleDouble r_sq = fputil::exact_mult(r_d, r_d);
+    double sum_sq_lo = b_sq - (sum_sq - a_sq);
+    double err = (sum_sq - r_sq.hi) + (sum_sq_lo - r_sq.lo);
+#endif
+
+    if (err > 0) {
+      r_u |= 1;
+    } else if ((err < 0) && (r_u & 1) == 0) {
+      r_u -= 1;
+    } else if ((r_u & 0x0000'0000'1FFF'FFFF) == 0) {
+      // The rounded result is exact.
+      fputil::clear_except_if_required(FE_INEXACT);
     }
+    return static_cast<float>(DoubleBits(r_u).get_val());
   }
 
   return static_cast<float>(result.get_val());
diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp
index 0e8c091efcf9b..3a53dd28e3d10 100644
--- a/libc/src/math/generic/rintf16.cpp
+++ b/libc/src/math/generic/rintf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_rintf(x));
 #else
   return fputil::round_using_current_rounding_mode(x);
diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp
index b45670bd24ff1..c3dbd779b9739 100644
--- a/libc/src/math/generic/roundevenf16.cpp
+++ b/libc/src/math/generic/roundevenf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) &&                                   \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundevenf(x));
 #else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp
index cb668c0e76388..a5e2b44fbd54b 100644
--- a/libc/src/math/generic/roundf16.cpp
+++ b/libc/src/math/generic/roundf16.cpp
@@ -10,12 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) {
-#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(__LIBC_USE_BUILTIN_ROUND) &&                                       \
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundf(x));
 #else
   return fputil::round(x);
diff --git a/libc/src/math/generic/setpayload.cpp b/libc/src/math/generic/setpayload.cpp
new file mode 100644
index 0000000000000..7e7078c95bdce
--- /dev/null
+++ b/libc/src/math/generic/setpayload.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayload function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayload.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayload, (double *res, double pl)) {
+  return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/setpayloadf.cpp b/libc/src/math/generic/setpayloadf.cpp
new file mode 100644
index 0000000000000..50d2ffd22fb77
--- /dev/null
+++ b/libc/src/math/generic/setpayloadf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayloadf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayloadf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayloadf, (float *res, float pl)) {
+  return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/setpayloadf128.cpp b/libc/src/math/generic/setpayloadf128.cpp
new file mode 100644
index 0000000000000..a50e5efd82ceb
--- /dev/null
+++ b/libc/src/math/generic/setpayloadf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayloadf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayloadf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayloadf128, (float128 * res, float128 pl)) {
+  return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalorder.cpp b/libc/src/math/generic/totalorder.cpp
new file mode 100644
index 0000000000000..f052c81c98fa9
--- /dev/null
+++ b/libc/src/math/generic/totalorder.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalorder function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorder.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorder, (const double *x, const double *y)) {
+  return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalorderf.cpp b/libc/src/math/generic/totalorderf.cpp
new file mode 100644
index 0000000000000..17c1304072175
--- /dev/null
+++ b/libc/src/math/generic/totalorderf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalorderf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorderf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorderf, (const float *x, const float *y)) {
+  return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalorderf128.cpp b/libc/src/math/generic/totalorderf128.cpp
new file mode 100644
index 0000000000000..83d77686d212a
--- /dev/null
+++ b/libc/src/math/generic/totalorderf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalorderf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorderf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorderf128,
+                   (const float128 *x, const float128 *y)) {
+  return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermag.cpp b/libc/src/math/generic/totalordermag.cpp
new file mode 100644
index 0000000000000..fb8215cad68fe
--- /dev/null
+++ b/libc/src/math/generic/totalordermag.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalordermag function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermag.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermag, (const double *x, const double *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagf.cpp b/libc/src/math/generic/totalordermagf.cpp
new file mode 100644
index 0000000000000..6b93bac51d0fb
--- /dev/null
+++ b/libc/src/math/generic/totalordermagf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalordermagf function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagf, (const float *x, const float *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagf128.cpp b/libc/src/math/generic/totalordermagf128.cpp
new file mode 100644
index 0000000000000..b32abfa8d09b1
--- /dev/null
+++ b/libc/src/math/generic/totalordermagf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalordermagf128 function ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagf128,
+                   (const float128 *x, const float128 *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagl.cpp b/libc/src/math/generic/totalordermagl.cpp
new file mode 100644
index 0000000000000..49eed62bf99eb
--- /dev/null
+++ b/libc/src/math/generic/totalordermagl.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalordermagl function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagl.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagl,
+                   (const long double *x, const long double *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp
index b931053e53438..31b1214a9a0e4 100644
--- a/libc/src/math/generic/truncf16.cpp
+++ b/libc/src/math/generic/truncf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_truncf(x));
 #else
   return fputil::trunc(x);
diff --git a/libc/src/math/getpayload.h b/libc/src/math/getpayload.h
new file mode 100644
index 0000000000000..b00d313c4da50
--- /dev/null
+++ b/libc/src/math/getpayload.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for getpayload --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double getpayload(const double *x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
diff --git a/libc/src/math/getpayloadf.h b/libc/src/math/getpayloadf.h
new file mode 100644
index 0000000000000..20901cd8bd98a
--- /dev/null
+++ b/libc/src/math/getpayloadf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for getpayloadf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float getpayloadf(const float *x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
diff --git a/libc/src/math/getpayloadf128.h b/libc/src/math/getpayloadf128.h
new file mode 100644
index 0000000000000..7ebb4290d4f8a
--- /dev/null
+++ b/libc/src/math/getpayloadf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for getpayloadf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float128 getpayloadf128(const float128 *x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
diff --git a/libc/src/math/setpayload.h b/libc/src/math/setpayload.h
new file mode 100644
index 0000000000000..3f30673e6c564
--- /dev/null
+++ b/libc/src/math/setpayload.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for setpayload --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayload(double *res, double pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
diff --git a/libc/src/math/setpayloadf.h b/libc/src/math/setpayloadf.h
new file mode 100644
index 0000000000000..95544c8df83b8
--- /dev/null
+++ b/libc/src/math/setpayloadf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for setpayloadf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayloadf(float *res, float pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
diff --git a/libc/src/math/setpayloadf128.h b/libc/src/math/setpayloadf128.h
new file mode 100644
index 0000000000000..e46aef38256db
--- /dev/null
+++ b/libc/src/math/setpayloadf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for setpayloadf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayloadf128(float128 *res, float128 pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
diff --git a/libc/src/math/totalorder.h b/libc/src/math/totalorder.h
new file mode 100644
index 0000000000000..d8d0297c52797
--- /dev/null
+++ b/libc/src/math/totalorder.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalorder --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDER_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDER_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorder(const double *x, const double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDER_H
diff --git a/libc/src/math/totalorderf.h b/libc/src/math/totalorderf.h
new file mode 100644
index 0000000000000..bade04ce6e68a
--- /dev/null
+++ b/libc/src/math/totalorderf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalorderf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERF_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorderf(const float *x, const float *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERF_H
diff --git a/libc/src/math/totalorderf128.h b/libc/src/math/totalorderf128.h
new file mode 100644
index 0000000000000..958714849da67
--- /dev/null
+++ b/libc/src/math/totalorderf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for totalorderf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorderf128(const float128 *x, const float128 *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
diff --git a/libc/src/math/totalordermag.h b/libc/src/math/totalordermag.h
new file mode 100644
index 0000000000000..cfdd868add674
--- /dev/null
+++ b/libc/src/math/totalordermag.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermag -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermag(const double *x, const double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
diff --git a/libc/src/math/totalordermagf.h b/libc/src/math/totalordermagf.h
new file mode 100644
index 0000000000000..47ddd831808f5
--- /dev/null
+++ b/libc/src/math/totalordermagf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermagf ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagf(const float *x, const float *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
diff --git a/libc/src/math/totalordermagf128.h b/libc/src/math/totalordermagf128.h
new file mode 100644
index 0000000000000..389d0596b9d4b
--- /dev/null
+++ b/libc/src/math/totalordermagf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for totalordermagf128 -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagf128(const float128 *x, const float128 *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
diff --git a/libc/src/math/totalordermagl.h b/libc/src/math/totalordermagl.h
new file mode 100644
index 0000000000000..64d27f1566877
--- /dev/null
+++ b/libc/src/math/totalordermagl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermag -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGL_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagl(const long double *x, const long double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGl_H
diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
index dc748b22e0378..70d10e6c4e3f8 100644
--- a/libc/src/pthread/CMakeLists.txt
+++ b/libc/src/pthread/CMakeLists.txt
@@ -556,6 +556,28 @@ add_entrypoint_object(
     libc.src.__support.threads.linux.rwlock
 )
 
+add_entrypoint_object(
+  pthread_rwlock_clockrdlock
+  SRCS
+    pthread_rwlock_clockrdlock.cpp
+  HDRS
+    pthread_rwlock_clockrdlock.h
+  DEPENDS
+    libc.include.pthread
+    libc.src.__support.threads.linux.rwlock
+)
+
+add_entrypoint_object(
+  pthread_rwlock_clockwrlock
+  SRCS
+    pthread_rwlock_clockwrlock.cpp
+  HDRS
+    pthread_rwlock_clockwrlock.h
+  DEPENDS
+    libc.include.pthread
+    libc.src.__support.threads.linux.rwlock
+)
+
 add_entrypoint_object(
   pthread_rwlock_timedrdlock
   SRCS
diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.cpp b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp
new file mode 100644
index 0000000000000..1e44e6d7694f6
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp
@@ -0,0 +1,50 @@
+//===-- Implementation of the Rwlock's clockrdlock function ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/pthread/pthread_rwlock_clockrdlock.h"
+
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/rwlock.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+static_assert(
+    sizeof(RwLock) == sizeof(pthread_rwlock_t) &&
+        alignof(RwLock) == alignof(pthread_rwlock_t),
+    "The public pthread_rwlock_t type must be of the same size and alignment "
+    "as the internal rwlock type.");
+
+LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockrdlock,
+                   (pthread_rwlock_t * rwlock, clockid_t clockid,
+                    const timespec *abstime)) {
+  if (!rwlock)
+    return EINVAL;
+  if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME)
+    return EINVAL;
+  bool is_realtime = (clockid == CLOCK_REALTIME);
+  RwLock *rw = reinterpret_cast<RwLock *>(rwlock);
+  LIBC_ASSERT(abstime && "clockrdlock called with a null timeout");
+  auto timeout = internal::AbsTimeout::from_timespec(
+      *abstime, /*is_realtime=*/is_realtime);
+  if (LIBC_LIKELY(timeout.has_value()))
+    return static_cast<int>(rw->read_lock(timeout.value()));
+
+  switch (timeout.error()) {
+  case internal::AbsTimeout::Error::Invalid:
+    return EINVAL;
+  case internal::AbsTimeout::Error::BeforeEpoch:
+    return ETIMEDOUT;
+  }
+  __builtin_unreachable();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.h b/libc/src/pthread/pthread_rwlock_clockrdlock.h
new file mode 100644
index 0000000000000..8fbd3b089a111
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockrdlock.h
@@ -0,0 +1,23 @@
+//===-- Implementation header for Rwlock's clockrdlock function --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
+
+#include "src/__support/macros/config.h"
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_rwlock_clockrdlock(pthread_rwlock_t *__restrict rwlock,
+                               clockid_t clockid,
+                               const timespec *__restrict abstime);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.cpp b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp
new file mode 100644
index 0000000000000..8f58c7f24bb10
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp
@@ -0,0 +1,51 @@
+//===-- Implementation of the Rwlock's clockwrlock function----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/pthread/pthread_rwlock_clockwrlock.h"
+
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/rwlock.h"
+#include "src/__support/time/linux/abs_timeout.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+static_assert(
+    sizeof(RwLock) == sizeof(pthread_rwlock_t) &&
+        alignof(RwLock) == alignof(pthread_rwlock_t),
+    "The public pthread_rwlock_t type must be of the same size and alignment "
+    "as the internal rwlock type.");
+
+LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockwrlock,
+                   (pthread_rwlock_t * rwlock, clockid_t clockid,
+                    const timespec *abstime)) {
+  if (!rwlock)
+    return EINVAL;
+  if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME)
+    return EINVAL;
+  bool is_realtime = (clockid == CLOCK_REALTIME);
+  RwLock *rw = reinterpret_cast<RwLock *>(rwlock);
+  LIBC_ASSERT(abstime && "clockwrlock called with a null timeout");
+  auto timeout = internal::AbsTimeout::from_timespec(
+      *abstime, /*is_realtime=*/is_realtime);
+  if (LIBC_LIKELY(timeout.has_value()))
+    return static_cast<int>(rw->write_lock(timeout.value()));
+
+  switch (timeout.error()) {
+  case internal::AbsTimeout::Error::Invalid:
+    return EINVAL;
+  case internal::AbsTimeout::Error::BeforeEpoch:
+    return ETIMEDOUT;
+  }
+  __builtin_unreachable();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.h b/libc/src/pthread/pthread_rwlock_clockwrlock.h
new file mode 100644
index 0000000000000..cb3fa3909fd2b
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockwrlock.h
@@ -0,0 +1,23 @@
+//===-- Implementation header for Rwlock's clockwrlock function --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
+
+#include "src/__support/macros/config.h"
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_rwlock_clockwrlock(pthread_rwlock_t *__restrict rwlock,
+                               clockid_t clockid,
+                               const timespec *__restrict abstime);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
diff --git a/libc/src/setjmp/aarch64/CMakeLists.txt b/libc/src/setjmp/aarch64/CMakeLists.txt
new file mode 100644
index 0000000000000..47eeb1a5c0ea6
--- /dev/null
+++ b/libc/src/setjmp/aarch64/CMakeLists.txt
@@ -0,0 +1,28 @@
+if(LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER)
+  list(APPEND setjmp_config_options "-DLIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER")
+endif()
+if(setjmp_config_options)
+  list(PREPEND setjmp_config_options "COMPILE_OPTIONS")
+endif()
+
+add_entrypoint_object(
+  setjmp
+  SRCS
+    setjmp.cpp
+  HDRS
+    ../setjmp_impl.h
+  DEPENDS
+    libc.include.setjmp
+  ${setjmp_config_options}
+)
+
+add_entrypoint_object(
+  longjmp
+  SRCS
+    longjmp.cpp
+  HDRS
+    ../longjmp.h
+  DEPENDS
+    libc.include.setjmp
+  ${setjmp_config_options}
+)
diff --git a/libc/src/setjmp/aarch64/longjmp.cpp b/libc/src/setjmp/aarch64/longjmp.cpp
new file mode 100644
index 0000000000000..fbb86524e3951
--- /dev/null
+++ b/libc/src/setjmp/aarch64/longjmp.cpp
@@ -0,0 +1,92 @@
+//===-- Implementation of longjmp for AArch64 -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/setjmp/longjmp.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// TODO: if MTE stack tagging is in use (-fsanitize=memtag-stack), we need to
+// iterate over the region between the old and new values of sp, using STG or
+// ST2G instructions to clear the memory tags on the invalidated region of the
+// stack. But this requires a means of finding out that we're in that mode, and
+// as far as I can see there isn't currently a predefined macro for that.
+//
+// (__ARM_FEATURE_MEMORY_TAGGING only indicates whether the target architecture
+// supports the MTE instructions, not whether the compiler is configured to use
+// them.)
+
+[[gnu::naked]] LLVM_LIBC_FUNCTION(void, longjmp,
+                                  ([[maybe_unused]] __jmp_buf * buf,
+                                   [[maybe_unused]] int val)) {
+  // If BTI branch protection is in use, the compiler will automatically insert
+  // a BTI here, so we don't need to make any extra effort to do so.
+
+  // If PAC branch protection is in use, there's no need to sign the return
+  // address at the start of longjmp, because we're not going to use it anyway!
+
+  asm(
+      // Reload the callee-saved GPRs, including fp and lr.
+      R"(
+        ldp x19, x20, [x0,  #0*16]
+        ldp x21, x22, [x0,  #1*16]
+        ldp x23, x24, [x0,  #2*16]
+        ldp x25, x26, [x0,  #3*16]
+        ldp x27, x28, [x0,  #4*16]
+        ldp x29, x30, [x0,  #5*16]
+      )"
+
+#if LIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER
+      // Reload the stack pointer, and the platform register x18.
+      R"(
+        ldp x2,  x18, [x0,  #6*16]
+        mov sp, x2
+      )"
+#else
+      // Reload just the stack pointer.
+      R"(
+        ldr x2,       [x0,  #6*16]
+        mov sp, x2
+      )"
+#endif
+
+#if __ARM_FP
+      // Reload the callee-saved FP registers.
+      R"(
+        ldp d8,  d9,  [x0,  #7*16]
+        ldp d10, d11, [x0,  #8*16]
+        ldp d12, d13, [x0,  #9*16]
+        ldp d14, d15, [x0, #10*16]
+      )"
+#endif
+
+      // Calculate the return value.
+      R"(
+        cmp w1, #0
+        cinc w0, w1, eq
+      )"
+
+#if __ARM_FEATURE_PAC_DEFAULT & 1
+      // Authenticate the return address using the PAC A key.
+      R"(
+        autiasp
+      )"
+#elif __ARM_FEATURE_PAC_DEFAULT & 2
+      // Authenticate the return address using the PAC B key.
+      R"(
+        autibsp
+      )"
+#endif
+
+      R"(
+        ret
+      )");
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/setjmp/aarch64/setjmp.cpp b/libc/src/setjmp/aarch64/setjmp.cpp
new file mode 100644
index 0000000000000..90e49be49a8fc
--- /dev/null
+++ b/libc/src/setjmp/aarch64/setjmp.cpp
@@ -0,0 +1,94 @@
+//===-- Implementation of setjmp for AArch64 ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/setjmp/setjmp_impl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+[[gnu::naked]] LLVM_LIBC_FUNCTION(int, setjmp,
+                                  ([[maybe_unused]] __jmp_buf * buf)) {
+  // If BTI branch protection is in use, the compiler will automatically insert
+  // a BTI here, so we don't need to make any extra effort to do so.
+
+  asm(
+#if __ARM_FEATURE_PAC_DEFAULT & 1
+      // Sign the return address using the PAC A key.
+      R"(
+        paciasp
+      )"
+#elif __ARM_FEATURE_PAC_DEFAULT & 2
+      // Sign the return address using the PAC B key.
+      R"(
+        pacibsp
+      )"
+#endif
+
+      // Store all the callee-saved GPRs, including fp (x29) and also lr (x30).
+      // Of course lr isn't normally callee-saved (the call instruction itself
+      // can't help clobbering it), but we certainly need to save it for this
+      // purpose.
+      R"(
+        stp x19, x20, [x0,  #0*16]
+        stp x21, x22, [x0,  #1*16]
+        stp x23, x24, [x0,  #2*16]
+        stp x25, x26, [x0,  #3*16]
+        stp x27, x28, [x0,  #4*16]
+        stp x29, x30, [x0,  #5*16]
+      )"
+
+#if LIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER
+      // Store the stack pointer, and the platform register x18.
+      R"(
+        add x1, sp, #0
+        stp x1, x18,  [x0,  #6*16]
+      )"
+#else
+      // Store just the stack pointer.
+      R"(
+        add x1, sp, #0
+        str x1,       [x0,  #6*16]
+      )"
+#endif
+
+#if __ARM_FP
+      // Store the callee-saved FP registers. AAPCS64 only requires the low 64
+      // bits of v8-v15 to be preserved, i.e. each of d8,...,d15.
+      R"(
+        stp d8,  d9,  [x0,  #7*16]
+        stp d10, d11, [x0,  #8*16]
+        stp d12, d13, [x0,  #9*16]
+        stp d14, d15, [x0, #10*16]
+      )"
+#endif
+
+      // Set up return value of zero.
+      R"(
+        mov x0, #0
+      )"
+
+#if (__ARM_FEATURE_PAC_DEFAULT & 7) == 5
+      // Authenticate the return address using the PAC A key, since the
+      // compilation options ask for PAC protection even on leaf functions.
+      R"(
+        autiasp
+      )"
+#elif (__ARM_FEATURE_PAC_DEFAULT & 7) == 6
+      // Same, but using the PAC B key.
+      R"(
+        autibsp
+      )"
+#endif
+
+      R"(
+        ret
+      )");
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 2d528a903cc2f..bc5ef5fe0e9b4 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -121,6 +121,18 @@ add_entrypoint_object(
     libc.src.stdio.scanf_core.scanf_main
 )
 
+add_entrypoint_object(
+  vsscanf
+  SRCS
+    vsscanf.cpp
+  HDRS
+    vsscanf.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.scanf_core.reader
+    libc.src.stdio.scanf_core.scanf_main
+)
+
 add_entrypoint_object(
   fscanf
   SRCS
@@ -163,6 +175,16 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.writer
 )
 
+add_entrypoint_object(
+  asprintf
+  SRCS
+    asprintf.cpp
+  HDRS
+    asprintf.h
+  DEPENDS
+    libc.src.stdio.printf_core.vasprintf_internal
+)
+
 add_entrypoint_object(
   vsprintf
   SRCS
@@ -185,6 +207,16 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.writer
 )
 
+add_entrypoint_object(
+  vasprintf
+  SRCS
+    vasprintf.cpp
+  HDRS
+    vasprintf.h
+  DEPENDS
+    libc.src.stdio.printf_core.vasprintf_internal
+)
+
 add_subdirectory(printf_core)
 add_subdirectory(scanf_core)
 
diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp
new file mode 100644
index 0000000000000..88b458a9e103b
--- /dev/null
+++ b/libc/src/stdio/asprintf.cpp
@@ -0,0 +1,28 @@
+//===-- Implementation of asprintf -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/asprintf.h"
+#include "src/__support/arg_list.h"
+#include "src/__support/macros/config.h"
+#include "src/stdio/printf_core/vasprintf_internal.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, asprintf,
+                   (char **__restrict buffer, const char *format, ...)) {
+  va_list vlist;
+  va_start(vlist, format);
+  internal::ArgList args(vlist); // This holder class allows for easier copying
+                                 // and pointer semantics, as well as handling
+                                 // destruction automatically.
+  va_end(vlist);
+  int ret = printf_core::vasprintf_internal(buffer, format, args);
+  return ret;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/asprintf.h b/libc/src/stdio/asprintf.h
new file mode 100644
index 0000000000000..fd2b908db171d
--- /dev/null
+++ b/libc/src/stdio/asprintf.h
@@ -0,0 +1,22 @@
+//===-- Implementation header of asprintf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_ASPRINTF_H
+#define LLVM_LIBC_SRC_STDIO_ASPRINTF_H
+
+#include "src/__support/macros/config.h"
+#include <stdarg.h>
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE {
+
+int asprintf(char **__restrict s, const char *format, ...);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_ASPRINTF_H
diff --git a/libc/src/stdio/fopencookie.cpp b/libc/src/stdio/fopencookie.cpp
index 07be9a5635a92..9f5694e8e0581 100644
--- a/libc/src/stdio/fopencookie.cpp
+++ b/libc/src/stdio/fopencookie.cpp
@@ -43,16 +43,16 @@ FileIOResult CookieFile::cookie_write(File *f, const void *data, size_t size) {
   auto cookie_file = reinterpret_cast<CookieFile *>(f);
   if (cookie_file->ops.write == nullptr)
     return 0;
-  return cookie_file->ops.write(cookie_file->cookie,
-                                reinterpret_cast<const char *>(data), size);
+  return static_cast<size_t>(cookie_file->ops.write(
+      cookie_file->cookie, reinterpret_cast<const char *>(data), size));
 }
 
 FileIOResult CookieFile::cookie_read(File *f, void *data, size_t size) {
   auto cookie_file = reinterpret_cast<CookieFile *>(f);
   if (cookie_file->ops.read == nullptr)
     return 0;
-  return cookie_file->ops.read(cookie_file->cookie,
-                               reinterpret_cast<char *>(data), size);
+  return static_cast<size_t>(cookie_file->ops.read(
+      cookie_file->cookie, reinterpret_cast<char *>(data), size));
 }
 
 ErrorOr<off_t> CookieFile::cookie_seek(File *f, off_t offset, int whence) {
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index 21ff0d43ab728..2d84cb79ea378 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -99,7 +99,6 @@ add_object_library(
     libc.src.__support.uint128
 )
 
-
 add_object_library(
   printf_main
   SRCS
@@ -114,6 +113,18 @@ add_object_library(
     libc.src.__support.arg_list
 )
 
+add_header_library(
+  vasprintf_internal
+  HDRS
+    vasprintf_internal.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.printf_core.printf_main
+    libc.src.stdio.printf_core.writer
+    libc.src.stdlib.malloc
+    libc.src.stdlib.realloc
+)
+
 if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD)
   # Not all platforms have a file implementation. If file is unvailable, and a
   # full build is requested, then we must skip all file based printf sections.
diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h
index 76d006b813dd1..4c3b81ff018ab 100644
--- a/libc/src/stdio/printf_core/core_structs.h
+++ b/libc/src/stdio/printf_core/core_structs.h
@@ -134,7 +134,7 @@ constexpr int FILE_STATUS_ERROR = -2;
 constexpr int NULLPTR_WRITE_ERROR = -3;
 constexpr int INT_CONVERSION_ERROR = -4;
 constexpr int FIXED_POINT_CONVERSION_ERROR = -5;
-
+constexpr int ALLOCATION_ERROR = -6;
 } // namespace printf_core
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h
new file mode 100644
index 0000000000000..24ebc02a0b33f
--- /dev/null
+++ b/libc/src/stdio/printf_core/vasprintf_internal.h
@@ -0,0 +1,67 @@
+//===-- Internal Implementation of asprintf ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/arg_list.h"
+#include "src/stdio/printf.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/printf_main.h"
+#include "src/stdio/printf_core/writer.h"
+#include <stdlib.h> // malloc, realloc, free
+
+namespace LIBC_NAMESPACE {
+namespace printf_core {
+
+LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {
+  printf_core::WriteBuffer *wb =
+      reinterpret_cast<printf_core::WriteBuffer *>(target);
+  size_t new_size = new_str.size() + wb->buff_cur;
+  const bool isBuffOnStack = (wb->buff == wb->init_buff);
+  char *new_buff = static_cast<char *>(
+      isBuffOnStack ? malloc(new_size + 1)
+                    : realloc(wb->buff, new_size + 1)); // +1 for null
+  if (new_buff == nullptr) {
+    if (wb->buff != wb->init_buff)
+      free(wb->buff);
+    return printf_core::ALLOCATION_ERROR;
+  }
+  if (isBuffOnStack)
+    inline_memcpy(new_buff, wb->buff, wb->buff_cur);
+  wb->buff = new_buff;
+  inline_memcpy(wb->buff + wb->buff_cur, new_str.data(), new_str.size());
+  wb->buff_cur = new_size;
+  wb->buff_len = new_size;
+  return printf_core::WRITE_OK;
+}
+
+constexpr size_t DEFAULT_BUFFER_SIZE = 200;
+
+LIBC_INLINE int vasprintf_internal(char **ret, const char *format,
+                                   internal::ArgList args) {
+  char init_buff_on_stack[DEFAULT_BUFFER_SIZE];
+  printf_core::WriteBuffer wb(init_buff_on_stack, DEFAULT_BUFFER_SIZE,
+                              resize_overflow_hook);
+  printf_core::Writer writer(&wb);
+
+  auto ret_val = printf_core::printf_main(&writer, format, args);
+  if (ret_val < 0) {
+    *ret = nullptr;
+    return -1;
+  }
+  if (wb.buff == init_buff_on_stack) {
+    *ret = static_cast<char *>(malloc(ret_val + 1));
+    if (ret == nullptr)
+      return printf_core::ALLOCATION_ERROR;
+    inline_memcpy(*ret, wb.buff, ret_val);
+  } else {
+    *ret = wb.buff;
+  }
+  (*ret)[ret_val] = '\0';
+  return ret_val;
+}
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h
index 89421561f3b2e..5526a478ea620 100644
--- a/libc/src/stdio/printf_core/writer.h
+++ b/libc/src/stdio/printf_core/writer.h
@@ -22,61 +22,85 @@ namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {
 
 struct WriteBuffer {
+  enum class WriteMode {
+    FILL_BUFF_AND_DROP_OVERFLOW,
+    FLUSH_TO_STREAM,
+    RESIZE_AND_FILL_BUFF,
+  };
   using StreamWriter = int (*)(cpp::string_view, void *);
   char *buff;
-  const size_t buff_len;
+  const char *init_buff; // for checking when resize.
+  size_t buff_len;
   size_t buff_cur = 0;
 
   // The stream writer will be called when the buffer is full. It will be passed
   // string_views to write to the stream.
   StreamWriter stream_writer;
   void *output_target;
+  WriteMode write_mode;
 
   LIBC_INLINE WriteBuffer(char *Buff, size_t Buff_len, StreamWriter hook,
                           void *target)
-      : buff(Buff), buff_len(Buff_len), stream_writer(hook),
-        output_target(target) {}
+      : buff(Buff), init_buff(Buff), buff_len(Buff_len), stream_writer(hook),
+        output_target(target), write_mode(WriteMode::FLUSH_TO_STREAM) {}
 
   LIBC_INLINE WriteBuffer(char *Buff, size_t Buff_len)
-      : buff(Buff), buff_len(Buff_len), stream_writer(nullptr),
-        output_target(nullptr) {}
+      : buff(Buff), init_buff(Buff), buff_len(Buff_len), stream_writer(nullptr),
+        output_target(nullptr),
+        write_mode(WriteMode::FILL_BUFF_AND_DROP_OVERFLOW) {}
+
+  LIBC_INLINE WriteBuffer(char *Buff, size_t Buff_len, StreamWriter hook)
+      : buff(Buff), init_buff(Buff), buff_len(Buff_len), stream_writer(hook),
+        output_target(this), write_mode(WriteMode::RESIZE_AND_FILL_BUFF) {}
+
+  LIBC_INLINE int flush_to_stream(cpp::string_view new_str) {
+    if (buff_cur > 0) {
+      int retval = stream_writer({buff, buff_cur}, output_target);
+      if (retval < 0)
+        return retval;
+    }
+    if (new_str.size() > 0) {
+      int retval = stream_writer(new_str, output_target);
+      if (retval < 0)
+        return retval;
+    }
+    buff_cur = 0;
+    return WRITE_OK;
+  }
+
+  LIBC_INLINE int fill_remaining_to_buff(cpp::string_view new_str) {
+    if (buff_cur < buff_len) {
+      size_t bytes_to_write = buff_len - buff_cur;
+      if (bytes_to_write > new_str.size()) {
+        bytes_to_write = new_str.size();
+      }
+      inline_memcpy(buff + buff_cur, new_str.data(), bytes_to_write);
+      buff_cur += bytes_to_write;
+    }
+    return WRITE_OK;
+  }
+
+  LIBC_INLINE int resize_and_write(cpp::string_view new_str) {
+    return stream_writer(new_str, output_target);
+  }
 
   // The overflow_write method is intended to be called to write the contents of
-  // the buffer and new_str to the stream_writer if it exists, else it will
-  // write as much of new_str to the buffer as it can. The current position in
-  // the buffer will be reset iff stream_writer is called. Calling this with an
-  // empty string will flush the buffer if relevant.
+  // the buffer and new_str to the stream_writer if it exists. If a resizing
+  // hook is provided, it will resize the buffer and write the contents. If
+  // neither a stream_writer nor a resizing hook is provided, it will fill the
+  // remaining space in the buffer with new_str and drop the overflow. Calling
+  // this with an empty string will flush the buffer if relevant.
+
   LIBC_INLINE int overflow_write(cpp::string_view new_str) {
-    // If there is a stream_writer, write the contents of the buffer, then
-    // new_str, then clear the buffer.
-    if (stream_writer != nullptr) {
-      if (buff_cur > 0) {
-        int retval = stream_writer({buff, buff_cur}, output_target);
-        if (retval < 0) {
-          return retval;
-        }
-      }
-      if (new_str.size() > 0) {
-        int retval = stream_writer(new_str, output_target);
-        if (retval < 0) {
-          return retval;
-        }
-      }
-      buff_cur = 0;
-      return WRITE_OK;
-    } else {
-      // We can't flush to the stream, so fill the rest of the buffer, then drop
-      // the overflow.
-      if (buff_cur < buff_len) {
-        size_t bytes_to_write = buff_len - buff_cur;
-        if (bytes_to_write > new_str.size()) {
-          bytes_to_write = new_str.size();
-        }
-        inline_memcpy(buff + buff_cur, new_str.data(), bytes_to_write);
-        buff_cur += bytes_to_write;
-      }
-      return WRITE_OK;
+    switch (write_mode) {
+    case WriteMode::FILL_BUFF_AND_DROP_OVERFLOW:
+      return fill_remaining_to_buff(new_str);
+    case WriteMode::FLUSH_TO_STREAM:
+      return flush_to_stream(new_str);
+    case WriteMode::RESIZE_AND_FILL_BUFF:
+      return resize_and_write(new_str);
     }
+    __builtin_unreachable();
   }
 };
 
diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp
new file mode 100644
index 0000000000000..7fa4cc6f127dd
--- /dev/null
+++ b/libc/src/stdio/vasprintf.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of vasprintf -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vasprintf.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/printf_core/vasprintf_internal.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, vasprintf,
+                   (char **__restrict ret, const char *format, va_list vlist)) {
+  internal::ArgList args(vlist); // This holder class allows for easier copying
+                                 // and pointer semantics, as well as handling
+                                 // destruction automatically.
+  return printf_core::vasprintf_internal(ret, format, args);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h
new file mode 100644
index 0000000000000..792e948cf1850
--- /dev/null
+++ b/libc/src/stdio/vasprintf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header of vasprintf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_VASPRINTF_H
+#define LLVM_LIBC_SRC_STDIO_VASPRINTF_H
+
+#include <stdarg.h>
+
+namespace LIBC_NAMESPACE {
+
+int vasprintf(char **__restrict s, const char *format, va_list vlist);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_VASPRINTF_H
diff --git a/libc/src/stdio/vsscanf.cpp b/libc/src/stdio/vsscanf.cpp
new file mode 100644
index 0000000000000..fcf0b88885f17
--- /dev/null
+++ b/libc/src/stdio/vsscanf.cpp
@@ -0,0 +1,33 @@
+//===-- Implementation of vsscanf -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vsscanf.h"
+
+#include "src/__support/CPP/limits.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/scanf_core/reader.h"
+#include "src/stdio/scanf_core/scanf_main.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, vsscanf,
+                   (const char *buffer, const char *format, va_list vlist)) {
+  internal::ArgList args(vlist);
+  scanf_core::ReadBuffer rb{const_cast<char *>(buffer),
+                            cpp::numeric_limits<size_t>::max()};
+  scanf_core::Reader reader(&rb);
+  int ret_val = scanf_core::scanf_main(&reader, format, args);
+  // This is done to avoid including stdio.h in the internals. On most systems
+  // EOF is -1, so this will be transformed into just "return ret_val".
+  return (ret_val == -1) ? EOF : ret_val;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/vsscanf.h b/libc/src/stdio/vsscanf.h
new file mode 100644
index 0000000000000..992c44d3d95b9
--- /dev/null
+++ b/libc/src/stdio/vsscanf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header of vsscanf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_VSSCANF_H
+#define LLVM_LIBC_SRC_STDIO_VSSCANF_H
+
+#include <stdarg.h>
+
+namespace LIBC_NAMESPACE {
+
+int vsscanf(const char *s, const char *format, va_list vlist);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_VSSCANF_H
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index 513f6ad723d56..0f363eecc6251 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -51,6 +51,8 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.OSUtil.osutil
     .exit_handler
+    .at_quick_exit
+    .atexit
 )
 
 add_entrypoint_object(
@@ -329,7 +331,12 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
     include(${LIBC_SOURCE_DIR}/../compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake)
 
     # scudo distinguishes riscv32 and riscv64, so we need to translate the architecture
-    set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO ${LIBC_TARGET_ARCHITECTURE})
+    # set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO ${LIBC_TARGET_ARCHITECTURE})
+    if (LIBC_TARGET_OS_IS_DARWIN AND (LIBC_TARGET_ARCHITECTURE STREQUAL "arm"))
+      set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO arm64)
+    else()
+      set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO ${LIBC_TARGET_ARCHITECTURE})
+    endif()
     if(LIBC_TARGET_ARCHITECTURE_IS_RISCV64)
       set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO riscv64)
     elseif(LIBC_TARGET_ARCHITECTURE_IS_RISCV32)
@@ -385,8 +392,6 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
         malloc.h
       DEPENDS
         libc.src.__support.freelist_heap
-      COMPILE_OPTIONS
-        -DLIBC_FREELIST_MALLOC_SIZE=${LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE}
     )
     get_target_property(freelist_malloc_is_skipped libc.src.stdlib.freelist_malloc "SKIPPED")
     if(LIBC_TARGET_OS_IS_BAREMETAL AND NOT freelist_malloc_is_skipped)
@@ -438,14 +443,23 @@ if(LIBC_TARGET_OS_IS_GPU)
     DEPENDS
       .${LIBC_TARGET_OS}.free
   )
-  add_entrypoint_external(
-    calloc
-  )
-  add_entrypoint_external(
+  add_entrypoint_object(
     realloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.realloc
+  )
+  add_entrypoint_object(
+    calloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.calloc
   )
-  add_entrypoint_external(
+  add_entrypoint_object(
     aligned_alloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.aligned_alloc
   )
 endif()
 
@@ -462,15 +476,11 @@ add_entrypoint_object(
 
 # TODO: Move all exit functions to linux specific 
 
-if (TARGET libc.src.__support.threads.mutex)
-add_object_library(
+if(TARGET libc.src.__support.threads.mutex)
+add_header_library(
   exit_handler
-  SRCS
-    exit_handler.cpp
   HDRS
     exit_handler.h
-  CXX_STANDARD
-    20 # For constinit 
   DEPENDS
     libc.src.__support.CPP.mutex
     libc.src.__support.CPP.new
@@ -487,6 +497,8 @@ add_entrypoint_object(
     atexit.cpp
   HDRS
     atexit.h
+  CXX_STANDARD
+    20 # For constinit 
   DEPENDS
     .exit_handler
 )
@@ -497,8 +509,11 @@ add_entrypoint_object(
     at_quick_exit.cpp
   HDRS
     at_quick_exit.h
+  CXX_STANDARD
+    20 # For constinit 
   DEPENDS
     .exit_handler
+    .atexit
 )
 
 list(APPEND exit_deps
diff --git a/libc/src/stdlib/at_quick_exit.cpp b/libc/src/stdlib/at_quick_exit.cpp
index 7acae8c52def3..d2b4c0cec986f 100644
--- a/libc/src/stdlib/at_quick_exit.cpp
+++ b/libc/src/stdlib/at_quick_exit.cpp
@@ -14,6 +14,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+constinit ExitCallbackList at_quick_exit_callbacks;
+
 LLVM_LIBC_FUNCTION(int, at_quick_exit, (__atexithandler_t callback)) {
   return add_atexit_unit(
       at_quick_exit_callbacks,
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index 6844fb7aacaf6..c8a15dd3cfef2 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -14,6 +14,9 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+constinit ExitCallbackList atexit_callbacks;
+Mutex handler_list_mtx(false, false, false, false);
+
 extern "C" {
 
 int __cxa_atexit(AtExitCallback *callback, void *payload, void *) {
diff --git a/libc/src/stdlib/exit_handler.cpp b/libc/src/stdlib/exit_handler.cpp
deleted file mode 100644
index ed0751a4c889e..0000000000000
--- a/libc/src/stdlib/exit_handler.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===--- Implementation of exit_handler------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/stdlib/exit_handler.h"
-#include "src/__support/CPP/mutex.h" // lock_guard
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-constinit ExitCallbackList at_quick_exit_callbacks;
-constinit ExitCallbackList atexit_callbacks;
-
-Mutex handler_list_mtx(false, false, false, false);
-
-void stdc_at_exit_func(void *payload) {
-  reinterpret_cast<StdCAtExitCallback *>(payload)();
-}
-
-void call_exit_callbacks(ExitCallbackList &callbacks) {
-  handler_list_mtx.lock();
-  while (!callbacks.empty()) {
-    AtExitUnit &unit = callbacks.back();
-    callbacks.pop_back();
-    handler_list_mtx.unlock();
-    unit.callback(unit.payload);
-    handler_list_mtx.lock();
-  }
-  ExitCallbackList::destroy(&callbacks);
-}
-
-int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit) {
-  cpp::lock_guard lock(handler_list_mtx);
-  if (callbacks.push_back(unit))
-    return 0;
-  return -1;
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h
index 41733b7a0e7e5..9720c5473940e 100644
--- a/libc/src/stdlib/exit_handler.h
+++ b/libc/src/stdlib/exit_handler.h
@@ -38,16 +38,32 @@ using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>;
 using ExitCallbackList = FixedVector<AtExitUnit, CALLBACK_LIST_SIZE_FOR_TESTS>;
 #endif
 
-extern ExitCallbackList atexit_callbacks;
-extern ExitCallbackList at_quick_exit_callbacks;
-
+// This is handled by the 'atexit' implementation and shared by 'at_quick_exit'.
 extern Mutex handler_list_mtx;
 
-void stdc_at_exit_func(void *payload);
+LIBC_INLINE void stdc_at_exit_func(void *payload) {
+  reinterpret_cast<StdCAtExitCallback *>(payload)();
+}
 
-void call_exit_callbacks(ExitCallbackList &callbacks);
+LIBC_INLINE void call_exit_callbacks(ExitCallbackList &callbacks) {
+  handler_list_mtx.lock();
+  while (!callbacks.empty()) {
+    AtExitUnit &unit = callbacks.back();
+    callbacks.pop_back();
+    handler_list_mtx.unlock();
+    unit.callback(unit.payload);
+    handler_list_mtx.lock();
+  }
+  ExitCallbackList::destroy(&callbacks);
+}
 
-int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit);
+LIBC_INLINE int add_atexit_unit(ExitCallbackList &callbacks,
+                                const AtExitUnit &unit) {
+  cpp::lock_guard lock(handler_list_mtx);
+  if (callbacks.push_back(unit))
+    return 0;
+  return -1;
+}
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdlib/freelist_malloc.cpp b/libc/src/stdlib/freelist_malloc.cpp
index cfffa0425ff66..47240bc53aa37 100644
--- a/libc/src/stdlib/freelist_malloc.cpp
+++ b/libc/src/stdlib/freelist_malloc.cpp
@@ -18,17 +18,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-namespace {
-#ifdef LIBC_FREELIST_MALLOC_SIZE
-// This is set via the LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE configuration.
-constexpr size_t SIZE = LIBC_FREELIST_MALLOC_SIZE;
-#else
-#error "LIBC_FREELIST_MALLOC_SIZE was not defined for this build."
-#endif
-LIBC_CONSTINIT FreeListHeapBuffer<SIZE> freelist_heap_buffer;
-} // namespace
-
-FreeListHeap<> *freelist_heap = &freelist_heap_buffer;
+static LIBC_CONSTINIT FreeListHeap<> freelist_heap_symbols;
+FreeListHeap<> *freelist_heap = &freelist_heap_symbols;
 
 LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
   return freelist_heap->allocate(size);
diff --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
index f8a11ec3ffb00..073f81515870b 100644
--- a/libc/src/stdlib/gpu/CMakeLists.txt
+++ b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -20,6 +20,39 @@ add_entrypoint_object(
     libc.src.__support.RPC.rpc_client
 )
 
+add_entrypoint_object(
+  realloc
+  SRCS
+    realloc.cpp
+  HDRS
+    ../realloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.GPU.allocator
+)
+
+add_entrypoint_object(
+  calloc
+  SRCS
+    calloc.cpp
+  HDRS
+    ../calloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.GPU.allocator
+)
+
+add_entrypoint_object(
+  aligned_alloc
+  SRCS
+    aligned_alloc.cpp
+  HDRS
+    ../aligned_alloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.GPU.allocator
+)
+
 add_entrypoint_object(
   abort
   SRCS
diff --git a/libc/src/stdlib/gpu/aligned_alloc.cpp b/libc/src/stdlib/gpu/aligned_alloc.cpp
new file mode 100644
index 0000000000000..cd2c7e55128fe
--- /dev/null
+++ b/libc/src/stdlib/gpu/aligned_alloc.cpp
@@ -0,0 +1,29 @@
+//===-- GPU Implementation of aligned_alloc -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/aligned_alloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
+  if ((alignment & -alignment) != alignment)
+    return nullptr;
+
+  void *ptr = gpu::allocate(size);
+  if ((reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) != 0) {
+    gpu::deallocate(ptr);
+    return nullptr;
+  }
+  return ptr;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/gpu/calloc.cpp b/libc/src/stdlib/gpu/calloc.cpp
new file mode 100644
index 0000000000000..9150affc7c200
--- /dev/null
+++ b/libc/src/stdlib/gpu/calloc.cpp
@@ -0,0 +1,31 @@
+//===-- GPU Implementation of calloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/calloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memset.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, calloc, (size_t num, size_t size)) {
+  size_t bytes = num * size;
+  if (bytes == 0)
+    return nullptr;
+
+  void *ptr = gpu::allocate(bytes);
+  if (!ptr)
+    return nullptr;
+
+  inline_memset(ptr, 0, bytes);
+  return ptr;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/gpu/realloc.cpp b/libc/src/stdlib/gpu/realloc.cpp
new file mode 100644
index 0000000000000..4fd4d6b278179
--- /dev/null
+++ b/libc/src/stdlib/gpu/realloc.cpp
@@ -0,0 +1,32 @@
+//===-- GPU Implementation of realloc -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/realloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, realloc, (void *ptr, size_t size)) {
+  if (ptr == nullptr)
+    return gpu::allocate(size);
+
+  void *newmem = gpu::allocate(size);
+  if (newmem == nullptr)
+    return nullptr;
+
+  // This will copy garbage if it goes beyond the old allocation size.
+  inline_memcpy(newmem, ptr, size);
+  gpu::deallocate(ptr);
+  return newmem;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index 2ca894d091532..bfa6b23b5ef91 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -155,7 +155,7 @@ static void initialize_auxv_once(void) {
 
 static AuxEntry read_entry(int fd) {
   AuxEntry buf;
-  ssize_t size = sizeof(AuxEntry);
+  size_t size = sizeof(AuxEntry);
   char *ptr = reinterpret_cast<char *>(&buf);
   while (size > 0) {
     ssize_t ret = read(fd, ptr, size);
diff --git a/libc/src/sys/stat/linux/kernel_statx.h b/libc/src/sys/stat/linux/kernel_statx.h
index f26f0b826ac1e..d0e223aec3e1e 100644
--- a/libc/src/sys/stat/linux/kernel_statx.h
+++ b/libc/src/sys/stat/linux/kernel_statx.h
@@ -80,7 +80,7 @@ LIBC_INLINE int statx(int dirfd, const char *__restrict path, int flags,
     return -ret;
 
   statbuf->st_dev = MKDEV(xbuf.stx_dev_major, xbuf.stx_dev_minor);
-  statbuf->st_ino = xbuf.stx_ino;
+  statbuf->st_ino = static_cast<decltype(statbuf->st_ino)>(xbuf.stx_ino);
   statbuf->st_mode = xbuf.stx_mode;
   statbuf->st_nlink = xbuf.stx_nlink;
   statbuf->st_uid = xbuf.stx_uid;
@@ -94,7 +94,8 @@ LIBC_INLINE int statx(int dirfd, const char *__restrict path, int flags,
   statbuf->st_ctim.tv_sec = xbuf.stx_ctime.tv_sec;
   statbuf->st_ctim.tv_nsec = xbuf.stx_ctime.tv_nsec;
   statbuf->st_blksize = xbuf.stx_blksize;
-  statbuf->st_blocks = xbuf.stx_blocks;
+  statbuf->st_blocks =
+      static_cast<decltype(statbuf->st_blocks)>(xbuf.stx_blocks);
 
   return 0;
 }
diff --git a/libc/src/sys/statvfs/fstatvfs.h b/libc/src/sys/statvfs/fstatvfs.h
index 60c271c253d32..dfff757d29fee 100644
--- a/libc/src/sys/statvfs/fstatvfs.h
+++ b/libc/src/sys/statvfs/fstatvfs.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_SYS_STATVFS_FSTATVFS_H
 #define LLVM_LIBC_SRC_SYS_STATVFS_FSTATVFS_H
 
-#include "llvm-libc-types/struct_statvfs.h"
+#include "include/llvm-libc-types/struct_statvfs.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/statvfs/linux/CMakeLists.txt b/libc/src/sys/statvfs/linux/CMakeLists.txt
index a6660c02badf7..2953717e70392 100644
--- a/libc/src/sys/statvfs/linux/CMakeLists.txt
+++ b/libc/src/sys/statvfs/linux/CMakeLists.txt
@@ -8,7 +8,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.CPP.optional
     libc.include.sys_syscall
-    libc.include.llvm-libc-types.struct_statvfs
+    libc.include.sys_statvfs
 )
 
 add_entrypoint_object(
@@ -19,7 +19,7 @@ add_entrypoint_object(
     ../statvfs.h
   DEPENDS
     libc.src.__support.libc_assert
-    libc.include.llvm-libc-types.struct_statvfs
+    libc.include.sys_statvfs
     .statfs_utils
 )
 
@@ -31,7 +31,7 @@ add_entrypoint_object(
     ../fstatvfs.h
   DEPENDS
     libc.src.__support.libc_assert
-    libc.include.llvm-libc-types.struct_statvfs
+    libc.include.sys_statvfs
     .statfs_utils
 )
 
diff --git a/libc/src/sys/statvfs/linux/statfs_utils.h b/libc/src/sys/statvfs/linux/statfs_utils.h
index ca981c1364748..1e5be51531012 100644
--- a/libc/src/sys/statvfs/linux/statfs_utils.h
+++ b/libc/src/sys/statvfs/linux/statfs_utils.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_SYS_STATVFS_LINUX_STATFS_TO_STATVFS_H
 #define LLVM_LIBC_SRC_SYS_STATVFS_LINUX_STATFS_TO_STATVFS_H
 
-#include "llvm-libc-types/struct_statvfs.h"
+#include "include/llvm-libc-types/struct_statvfs.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/macros/attributes.h"
diff --git a/libc/src/sys/statvfs/statvfs.h b/libc/src/sys/statvfs/statvfs.h
index 674a079b593c4..1350880c4ad9f 100644
--- a/libc/src/sys/statvfs/statvfs.h
+++ b/libc/src/sys/statvfs/statvfs.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_SYS_STATVFS_STATVFS_H
 #define LLVM_LIBC_SRC_SYS_STATVFS_STATVFS_H
 
-#include "llvm-libc-types/struct_statvfs.h"
+#include "include/llvm-libc-types/struct_statvfs.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp
index b267c3238b01f..7a856376ffb20 100644
--- a/libc/src/time/linux/nanosleep.cpp
+++ b/libc/src/time/linux/nanosleep.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/nanosleep.h"
-
+#include "hdr/time_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt
index 6f67fa9ff44f7..5e5745063fc8c 100644
--- a/libc/startup/gpu/CMakeLists.txt
+++ b/libc/startup/gpu/CMakeLists.txt
@@ -26,6 +26,16 @@ function(add_startup_object name)
     PROPERTIES
       OUTPUT_NAME ${name}.o
   )
+
+  # Make an executable target of relocatable bitcode for clang if needed.
+  if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
+    add_executable(${fq_target_name}.exe $<TARGET_OBJECTS:${fq_target_name}>)
+    set_target_properties(${fq_target_name}.exe PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}
+      RUNTIME_OUTPUT_NAME ${name}.o)
+    target_link_options(${fq_target_name}.exe PRIVATE
+                        "-nostdlib" "-flto" "-Wl,--lto-emit-llvm")
+  endif()
 endfunction()
 
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 4adc2f5c725f7..6427b86158077 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -11,17 +11,10 @@ function(add_unittest_framework_library name)
                         "header only libraries, use 'add_header_library'")
   endif()
 
-  # The Nvidia 'nvlink' linker does not support static libraries.
-  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-    set(library_type OBJECT)
-  else()
-    set(library_type STATIC)
-  endif()
-
   foreach(lib IN ITEMS ${name}.unit ${name}.hermetic)
     add_library(
       ${lib}
-      ${library_type}
+      STATIC
       EXCLUDE_FROM_ALL
       ${TEST_LIB_SRCS}
       ${TEST_LIB_HDRS}
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 6b50f3d23075a..2749908ef1849 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -74,7 +74,8 @@ template <typename T> struct FPTest : public Test {
   static constexpr T inf = FPBits::inf(Sign::POS).get_val();
   static constexpr T neg_inf = FPBits::inf(Sign::NEG).get_val();
   static constexpr T min_normal = FPBits::min_normal().get_val();
-  static constexpr T max_normal = FPBits::max_normal().get_val();
+  static constexpr T max_normal = FPBits::max_normal(Sign::POS).get_val();
+  static constexpr T neg_max_normal = FPBits::max_normal(Sign::NEG).get_val();
   static constexpr T min_denormal = FPBits::min_subnormal().get_val();
   static constexpr T max_denormal = FPBits::max_subnormal().get_val();
 
diff --git a/libc/test/UnitTest/LibcDeathTestExecutors.cpp b/libc/test/UnitTest/LibcDeathTestExecutors.cpp
index e43dbb19ad152..77b0559c7acea 100644
--- a/libc/test/UnitTest/LibcDeathTestExecutors.cpp
+++ b/libc/test/UnitTest/LibcDeathTestExecutors.cpp
@@ -14,13 +14,18 @@
 
 #include <cassert>
 
+namespace {
+constexpr unsigned TIMEOUT_MS = 10000;
+} // Anonymous namespace
+
 namespace LIBC_NAMESPACE_DECL {
 namespace testing {
 
 bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
                              const char *LHSStr, const char *RHSStr,
                              internal::Location Loc) {
-  testutils::ProcessStatus Result = testutils::invoke_in_subprocess(Func, 1000);
+  testutils::ProcessStatus Result =
+      testutils::invoke_in_subprocess(Func, TIMEOUT_MS);
 
   if (const char *error = Result.get_error()) {
     Ctx->markFail();
@@ -32,7 +37,7 @@ bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
   if (Result.timed_out()) {
     Ctx->markFail();
     tlog << Loc;
-    tlog << "Process timed out after " << 1000 << " milliseconds.\n";
+    tlog << "Process timed out after " << TIMEOUT_MS << " milliseconds.\n";
     return false;
   }
 
@@ -63,7 +68,8 @@ bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
 bool Test::testProcessExits(testutils::FunctionCaller *Func, int ExitCode,
                             const char *LHSStr, const char *RHSStr,
                             internal::Location Loc) {
-  testutils::ProcessStatus Result = testutils::invoke_in_subprocess(Func, 1000);
+  testutils::ProcessStatus Result =
+      testutils::invoke_in_subprocess(Func, TIMEOUT_MS);
 
   if (const char *error = Result.get_error()) {
     Ctx->markFail();
@@ -75,7 +81,7 @@ bool Test::testProcessExits(testutils::FunctionCaller *Func, int ExitCode,
   if (Result.timed_out()) {
     Ctx->markFail();
     tlog << Loc;
-    tlog << "Process timed out after " << 1000 << " milliseconds.\n";
+    tlog << "Process timed out after " << TIMEOUT_MS << " milliseconds.\n";
     return false;
   }
 
diff --git a/libc/test/integration/src/pthread/CMakeLists.txt b/libc/test/integration/src/pthread/CMakeLists.txt
index fa5fd3ad55d5f..eb26822597c2f 100644
--- a/libc/test/integration/src/pthread/CMakeLists.txt
+++ b/libc/test/integration/src/pthread/CMakeLists.txt
@@ -32,9 +32,11 @@ add_integration_test(
     libc.src.pthread.pthread_rwlock_rdlock
     libc.src.pthread.pthread_rwlock_tryrdlock
     libc.src.pthread.pthread_rwlock_timedrdlock
+    libc.src.pthread.pthread_rwlock_clockrdlock
     libc.src.pthread.pthread_rwlock_wrlock
     libc.src.pthread.pthread_rwlock_trywrlock
     libc.src.pthread.pthread_rwlock_timedwrlock
+    libc.src.pthread.pthread_rwlock_clockwrlock
     libc.src.pthread.pthread_rwlock_unlock
     libc.src.pthread.pthread_create
     libc.src.pthread.pthread_join
diff --git a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
index 455003b6af811..9f5fba187713e 100644
--- a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
@@ -15,6 +15,8 @@
 #include "src/__support/threads/sleep.h"
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_join.h"
+#include "src/pthread/pthread_rwlock_clockrdlock.h"
+#include "src/pthread/pthread_rwlock_clockwrlock.h"
 #include "src/pthread/pthread_rwlock_destroy.h"
 #include "src/pthread/pthread_rwlock_init.h"
 #include "src/pthread/pthread_rwlock_rdlock.h"
@@ -112,6 +114,12 @@ static void nullptr_test() {
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_wrlock(nullptr), EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(nullptr, &ts), EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(nullptr, &ts), EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(nullptr, CLOCK_MONOTONIC, &ts),
+      EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(nullptr, CLOCK_MONOTONIC, &ts),
+      EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_tryrdlock(nullptr), EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_trywrlock(nullptr), EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(nullptr), EINVAL);
@@ -159,16 +167,40 @@ static void unusual_timespec_test() {
   timespec ts = {0, -1};
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
   ts.tv_nsec = 1'000'000'000;
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
   ts.tv_nsec += 1;
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      EINVAL);
   ts.tv_nsec = 0;
   ts.tv_sec = -1;
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts),
             ETIMEDOUT);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts),
             ETIMEDOUT);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      ETIMEDOUT);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+      ETIMEDOUT);
 }
 
 static void timedlock_with_deadlock_test() {
@@ -184,6 +216,13 @@ static void timedlock_with_deadlock_test() {
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts),
             ETIMEDOUT);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), 0);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_REALTIME, &ts),
+      ETIMEDOUT);
+  ASSERT_EQ(
+      LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_REALTIME, &ts),
+      0);
+  ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
   ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
   // notice that ts is already expired, but the following should still succeed.
@@ -270,9 +309,11 @@ enum class Operation : int {
   WRITE = 1,
   TIMED_READ = 2,
   TIMED_WRITE = 3,
-  TRY_READ = 4,
-  TRY_WRITE = 5,
-  COUNT = 6
+  CLOCK_READ = 4,
+  CLOCK_WRITE = 5,
+  TRY_READ = 6,
+  TRY_WRITE = 7,
+  COUNT = 8
 };
 
 LIBC_NAMESPACE::RawMutex *io_mutex;
@@ -358,6 +399,24 @@ static void randomized_thread_operation(SharedData *data, ThreadGuard &guard) {
     }
     break;
   }
+  case Operation::CLOCK_READ: {
+    timespec ts = get_ts();
+    if (LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&data->lock, CLOCK_MONOTONIC,
+                                                   &ts) == 0) {
+      read_ops();
+      LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock);
+    }
+    break;
+  }
+  case Operation::CLOCK_WRITE: {
+    timespec ts = get_ts();
+    if (LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&data->lock, CLOCK_MONOTONIC,
+                                                   &ts) == 0) {
+      write_ops();
+      LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock);
+    }
+    break;
+  }
   case Operation::TRY_READ: {
     if (LIBC_NAMESPACE::pthread_rwlock_tryrdlock(&data->lock) == 0) {
       read_ops();
diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt
index b9a50a47af75d..60ea7e6a90d71 100644
--- a/libc/test/src/CMakeLists.txt
+++ b/libc/test/src/CMakeLists.txt
@@ -24,7 +24,11 @@ function(add_fp_unittest name)
       message(FATAL_ERROR "Hermetic math test cannot require MPFR.")
     endif()
     set(test_type UNIT_TEST_ONLY)
-    list(APPEND MATH_UNITTEST_LINK_LIBRARIES libcMPFRWrapper -lmpfr -lgmp -latomic)
+    list(APPEND MATH_UNITTEST_LINK_LIBRARIES libcMPFRWrapper -lmpfr -lgmp)
+    if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+      # macOS does not have libatomic.
+      list(APPEND MATH_UNITTEST_LINK_LIBRARIES -latomic)
+    endif()
   endif()
   list(APPEND MATH_UNITTEST_LINK_LIBRARIES LibcFPTestHelpers)
 
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index f994f65e08cbb..90de520405981 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -34,6 +34,7 @@ if(LLVM_LIBC_FULL_BUILD)
     SUITE
       libc-support-tests
     SRCS
+      fake_heap.s
       freelist_heap_test.cpp
       freelist_malloc_test.cpp
     DEPENDS
diff --git a/libc/test/src/__support/File/platform_file_test.cpp b/libc/test/src/__support/File/platform_file_test.cpp
index 469d7500032b9..8aa07219a6527 100644
--- a/libc/test/src/__support/File/platform_file_test.cpp
+++ b/libc/test/src/__support/File/platform_file_test.cpp
@@ -103,7 +103,8 @@ TEST(LlvmLibcPlatformFileTest, CreateAppendSeekAndReadBack) {
   constexpr size_t APPEND_TEXT_SIZE = sizeof(APPEND_TEXT) - 1;
   ASSERT_EQ(file->write(APPEND_TEXT, APPEND_TEXT_SIZE).value, APPEND_TEXT_SIZE);
 
-  ASSERT_EQ(file->seek(-APPEND_TEXT_SIZE, SEEK_END).value(), 0);
+  ASSERT_EQ(file->seek(-static_cast<off_t>(APPEND_TEXT_SIZE), SEEK_END).value(),
+            0);
   char data[APPEND_TEXT_SIZE + 1];
   ASSERT_EQ(file->read(data, APPEND_TEXT_SIZE).value, APPEND_TEXT_SIZE);
   data[APPEND_TEXT_SIZE] = '\0';
diff --git a/libc/test/src/__support/block_test.cpp b/libc/test/src/__support/block_test.cpp
index 04704482b5147..ecce00b7926f9 100644
--- a/libc/test/src/__support/block_test.cpp
+++ b/libc/test/src/__support/block_test.cpp
@@ -54,7 +54,6 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCreateSingleAlignedBlock) {
   EXPECT_EQ(block->prev(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(block->next(), static_cast<BlockType *>(nullptr));
   EXPECT_FALSE(block->used());
-  EXPECT_TRUE(block->last());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CanCreateUnalignedSingleBlock) {
@@ -96,18 +95,16 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitBlock) {
   ASSERT_TRUE(result.has_value());
   auto *block1 = *result;
 
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
 
   auto *block2 = *result;
 
   EXPECT_EQ(block1->inner_size(), kSplitN);
   EXPECT_EQ(block1->outer_size(), kSplitN + BlockType::BLOCK_OVERHEAD);
-  EXPECT_FALSE(block1->last());
 
   EXPECT_EQ(block2->outer_size(), kN - kSplitN - BlockType::BLOCK_OVERHEAD);
   EXPECT_FALSE(block2->used());
-  EXPECT_TRUE(block2->last());
 
   EXPECT_EQ(block1->next(), block2);
   EXPECT_EQ(block2->prev(), block1);
@@ -128,7 +125,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitBlockUnaligned) {
   split_addr += alignof(BlockType) - (split_addr % alignof(BlockType));
   uintptr_t split_len = split_addr - (uintptr_t)&bytes;
 
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
@@ -161,11 +158,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitMidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block1, kSplit2);
+  result = block1->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
@@ -184,7 +181,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitTooSmallBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, kSplitN);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -197,13 +194,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitBlockWithoutHeaderSpace) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, kSplitN);
-  ASSERT_FALSE(result.has_value());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CannotSplitNull) {
-  BlockType *block = nullptr;
-  auto result = BlockType::split(block, 1);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -216,7 +207,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMakeBlockLargerInSplit) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, block->inner_size() + 1);
+  result = block->split(block->inner_size() + 1);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -229,8 +220,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMakeSecondBlockLargerInSplit) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, block->inner_size() -
-                                       BlockType::BLOCK_OVERHEAD + 1);
+  result = block->split(block->inner_size() - BlockType::BLOCK_OVERHEAD + 1);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -243,7 +233,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMakeZeroSizeFirstBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, 0);
+  result = block->split(0);
   ASSERT_TRUE(result.has_value());
   EXPECT_EQ(block->inner_size(), static_cast<size_t>(0));
 }
@@ -257,8 +247,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMakeZeroSizeSecondBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1,
-                            block1->inner_size() - BlockType::BLOCK_OVERHEAD);
+  result = block1->split(block1->inner_size() - BlockType::BLOCK_OVERHEAD);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
@@ -293,7 +282,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitUsedBlock) {
   BlockType *block = *result;
 
   block->mark_used();
-  result = BlockType::split(block, kSplitN);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -309,14 +298,14 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMergeWithNextBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
 
-  result = BlockType::split(block1, kSplit2);
+  result = block1->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
-  EXPECT_TRUE(BlockType::merge_next(block3));
+  EXPECT_TRUE(block3->merge_next());
 
   EXPECT_EQ(block1->next(), block3);
   EXPECT_EQ(block3->prev(), block1);
@@ -334,16 +323,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMergeWithFirstOrLastBlock) {
   BlockType *block1 = *result;
 
   // Do a split, just to check that the checks on next/prev are different...
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  EXPECT_FALSE(BlockType::merge_next(block2));
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CannotMergeNull) {
-  BlockType *block = nullptr;
-  EXPECT_FALSE(BlockType::merge_next(block));
+  EXPECT_FALSE(block2->merge_next());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CannotMergeUsedBlock) {
@@ -356,131 +340,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMergeUsedBlock) {
   BlockType *block = *result;
 
   // Do a split, just to check that the checks on next/prev are different...
-  result = BlockType::split(block, kSplitN);
-  ASSERT_TRUE(result.has_value());
-
-  block->mark_used();
-  EXPECT_FALSE(BlockType::merge_next(block));
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeSingleBlock) {
-  constexpr size_t kN = 1024;
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-
-  auto result = BlockType::init(bytes);
+  result = block->split(kSplitN);
   ASSERT_TRUE(result.has_value());
-  BlockType *block = *result;
 
   block->mark_used();
-  BlockType::free(block);
-  EXPECT_FALSE(block->used());
-  EXPECT_EQ(block->outer_size(), kN);
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockWithoutMerging) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block3 = *result;
-
-  block1->mark_used();
-  block2->mark_used();
-  block3->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_NE(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_FALSE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockAndMergeWithPrev) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block3 = *result;
-
-  block2->mark_used();
-  block3->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_EQ(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_FALSE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockAndMergeWithNext) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-
-  block1->mark_used();
-  block2->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_NE(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeUsedBlockAndMergeWithBoth) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-
-  block2->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_EQ(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(block2->last());
+  EXPECT_FALSE(block->merge_next());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CanCheckValidBlock) {
@@ -493,11 +357,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCheckValidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block2, kSplit2);
+  result = block2->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
@@ -517,15 +381,15 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCheckInvalidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block2, kSplit2);
+  result = block2->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
-  result = BlockType::split(block3, kSplit3);
+  result = block3->split(kSplit3);
   ASSERT_TRUE(result.has_value());
 
   // Corrupt a Block header.
@@ -630,7 +494,6 @@ TEST_FOR_EACH_BLOCK_TYPE(AllocateAlreadyAligned) {
   // Check the next block.
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
-  EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(reinterpret_cast<byte *>(next) + next->outer_size(),
             bytes.data() + bytes.size());
 }
@@ -674,7 +537,6 @@ TEST_FOR_EACH_BLOCK_TYPE(AllocateNeedsAlignment) {
   // Check the next block.
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
-  EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(reinterpret_cast<byte *>(next) + next->outer_size(), &*bytes.end());
 }
 
@@ -687,7 +549,7 @@ TEST_FOR_EACH_BLOCK_TYPE(PreviousBlockMergedIfNotFirst) {
   BlockType *block = *result;
 
   // Split the block roughly halfway and work on the second half.
-  auto result2 = BlockType::split(block, kN / 2);
+  auto result2 = block->split(kN / 2);
   ASSERT_TRUE(result2.has_value());
   BlockType *newblock = *result2;
   ASSERT_EQ(newblock->prev(), block);
@@ -744,20 +606,17 @@ TEST_FOR_EACH_BLOCK_TYPE(CanRemergeBlockAllocations) {
 
   // Check we have the appropriate blocks.
   ASSERT_NE(prev, static_cast<BlockType *>(nullptr));
-  ASSERT_FALSE(prev->last());
   ASSERT_EQ(aligned_block->prev(), prev);
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
   EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
-  ASSERT_TRUE(next->last());
 
   // Now check for successful merges.
-  EXPECT_TRUE(BlockType::merge_next(prev));
+  EXPECT_TRUE(prev->merge_next());
   EXPECT_EQ(prev->next(), next);
-  EXPECT_TRUE(BlockType::merge_next(prev));
+  EXPECT_TRUE(prev->merge_next());
   EXPECT_EQ(prev->next(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(prev->last());
 
   // We should have the original buffer.
   EXPECT_EQ(reinterpret_cast<byte *>(prev), &*bytes.begin());
diff --git a/libc/test/src/__support/blockstore_test.cpp b/libc/test/src/__support/blockstore_test.cpp
index dd74ea18f2c02..de7bd72b96c80 100644
--- a/libc/test/src/__support/blockstore_test.cpp
+++ b/libc/test/src/__support/blockstore_test.cpp
@@ -183,11 +183,10 @@ TEST_F(LlvmLibcBlockStoreTest, PopulateAndIterateReverse10) {
   populate_and_iterate<4, 10, true>();
 }
 
-TEST_F(LlvmLibcBlockStoreTest, Back) { back_test<false>(); }
-
-// FIXME: Combing this test with the above test makes the AMDGPU backend
-// generate code which hangs. This should be fixed in the clang compiler.
-TEST_F(LlvmLibcBlockStoreTest, BackReverse) { back_test<true>(); }
+TEST_F(LlvmLibcBlockStoreTest, Back) {
+  back_test<false>();
+  back_test<true>();
+}
 
 TEST_F(LlvmLibcBlockStoreTest, Empty) {
   empty_test<false>();
diff --git a/libc/test/src/__support/fake_heap.s b/libc/test/src/__support/fake_heap.s
new file mode 100644
index 0000000000000..69522f53c8b1f
--- /dev/null
+++ b/libc/test/src/__support/fake_heap.s
@@ -0,0 +1,15 @@
+//===-- Test fake definition for heap symbols -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+.globl _end, __llvm_libc_heap_limit
+
+.bss
+_end:
+.fill 1024
+__llvm_libc_heap_limit:
+
diff --git a/libc/test/src/__support/freelist_heap_test.cpp b/libc/test/src/__support/freelist_heap_test.cpp
index 5815d5dfc01ab..fc4348aed6b56 100644
--- a/libc/test/src/__support/freelist_heap_test.cpp
+++ b/libc/test/src/__support/freelist_heap_test.cpp
@@ -30,6 +30,11 @@ using LIBC_NAMESPACE::freelist_heap;
 #define TEST_FOR_EACH_ALLOCATOR(TestCase, BufferSize)                          \
   class LlvmLibcFreeListHeapTest##TestCase : public testing::Test {            \
   public:                                                                      \
+    FreeListHeapBuffer<BufferSize> fake_global_buffer;                         \
+    void SetUp() override {                                                    \
+      freelist_heap =                                                          \
+          new (&fake_global_buffer) FreeListHeapBuffer<BufferSize>;            \
+    }                                                                          \
     void RunTest(FreeListHeap<> &allocator, [[maybe_unused]] size_t N);        \
   };                                                                           \
   TEST_F(LlvmLibcFreeListHeapTest##TestCase, TestCase) {                       \
@@ -37,7 +42,7 @@ using LIBC_NAMESPACE::freelist_heap;
         cpp::byte buf[BufferSize] = {cpp::byte(0)};                            \
     FreeListHeap<> allocator(buf);                                             \
     RunTest(allocator, BufferSize);                                            \
-    RunTest(*freelist_heap, freelist_heap->region_size());                     \
+    RunTest(*freelist_heap, freelist_heap->region().size());                   \
   }                                                                            \
   void LlvmLibcFreeListHeapTest##TestCase::RunTest(FreeListHeap<> &allocator,  \
                                                    size_t N)
diff --git a/libc/test/src/__support/freelist_malloc_test.cpp b/libc/test/src/__support/freelist_malloc_test.cpp
index e9d7c63a4d438..66a923f778218 100644
--- a/libc/test/src/__support/freelist_malloc_test.cpp
+++ b/libc/test/src/__support/freelist_malloc_test.cpp
@@ -14,15 +14,13 @@
 #include "test/UnitTest/Test.h"
 
 using LIBC_NAMESPACE::freelist_heap;
+using LIBC_NAMESPACE::FreeListHeapBuffer;
 
 TEST(LlvmLibcFreeListMalloc, MallocStats) {
   constexpr size_t kAllocSize = 256;
   constexpr size_t kCallocNum = 4;
   constexpr size_t kCallocSize = 64;
 
-  freelist_heap->reset_heap_stats(); // Do this because other tests might've
-                                     // called the same global allocator.
-
   void *ptr1 = LIBC_NAMESPACE::malloc(kAllocSize);
 
   const auto &freelist_heap_stats = freelist_heap->heap_stats();
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 64b4d2c58fb6a..2fbfa741c037e 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -44,6 +44,20 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+
+add_fp_unittest(
+  daddl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    daddl_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.daddl
+)
+
 add_fp_unittest(
   sinf_test
   NEED_MPFR
@@ -174,6 +188,20 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  fadd_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    fadd_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.fadd
+    libc.src.__support.FPUtil.basic_operations
+)
+
 add_fp_unittest(
   trunc_test
   NEED_MPFR
@@ -874,6 +902,19 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  exp_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    exp_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.exp
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_fp_unittest(
   expf_test
   NEED_MPFR
@@ -888,16 +929,14 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
- exp_test
- NEED_MPFR
- SUITE
-   libc-math-unittests
- SRCS
-   exp_test.cpp
- DEPENDS
-   libc.src.errno.errno
-   libc.src.math.exp
-   libc.src.__support.FPUtil.fp_bits
+  expf16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    expf16_test.cpp
+  DEPENDS
+    libc.src.math.expf16
 )
 
 add_fp_unittest(
@@ -1847,10 +1886,28 @@ add_fp_unittest(
   SRCS
     fmul_test.cpp
   HDRS
-    FMulTest.h
+    MulTest.h
   DEPENDS
     libc.src.math.fmul
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
 )
+
+add_fp_unittest(
+  fmull_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    fmull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.src.math.fmull
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
 add_fp_unittest(
   asinhf_test
   NEED_MPFR
@@ -2012,6 +2069,18 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  atan2_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    atan2_test.cpp
+  DEPENDS
+    libc.src.math.atan2
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_fp_unittest(
   f16add_test
   NEED_MPFR
@@ -2213,6 +2282,45 @@ add_fp_unittest(
     libc.src.math.f16sqrtl
 )
 
+add_fp_unittest(
+  fsqrt_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    fsqrt_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.fsqrt
+)
+
+add_fp_unittest(
+  fsqrtl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    fsqrtl_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.fsqrtl
+)
+
+add_fp_unittest(
+  dsqrtl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    dsqrtl_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.dsqrtl
+)
+
 add_fp_unittest(
   cbrtf_test
   NEED_MPFR
@@ -2237,6 +2345,95 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  dmull_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    dmull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.src.math.dmull
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+  f16mul_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    f16mul_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.src.math.f16mul
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+  f16mulf_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    f16mulf_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.src.math.f16mulf
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+  f16mull_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    f16mull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.src.math.f16mull
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+  dfmal_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    dfmal_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.dfmal
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+  dsubl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    dsubl_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.math.dsubl
+)
+
+
 add_subdirectory(generic)
 add_subdirectory(smoke)
 
diff --git a/libc/test/src/math/CopySignTest.h b/libc/test/src/math/CopySignTest.h
index c66f91477480b..8db4f6941e603 100644
--- a/libc/test/src/math/CopySignTest.h
+++ b/libc/test/src/math/CopySignTest.h
@@ -39,7 +39,7 @@ class CopySignTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
 
       double res1 = func(x, -x);
diff --git a/libc/test/src/math/FAbsTest.h b/libc/test/src/math/FAbsTest.h
index 92b589beeb675..7b4ea93e4db99 100644
--- a/libc/test/src/math/FAbsTest.h
+++ b/libc/test/src/math/FAbsTest.h
@@ -41,7 +41,7 @@ class FAbsTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
       ASSERT_MPFR_MATCH(mpfr::Operation::Abs, x, func(x), 0.0);
     }
diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h
index fefcefe5052a9..20c63e7d1ab59 100644
--- a/libc/test/src/math/FDimTest.h
+++ b/libc/test/src/math/FDimTest.h
@@ -67,9 +67,9 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
          ++i, v += STEP, w -= STEP) {
       T x = FPBits(v).get_val(), y = FPBits(w).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      if (isnan(y) || isinf(y))
+      if (FPBits(w).is_nan() || FPBits(w).is_inf())
         continue;
 
       if (x > y) {
diff --git a/libc/test/src/math/FMaxTest.h b/libc/test/src/math/FMaxTest.h
index 405642c6b9684..43904a489f76d 100644
--- a/libc/test/src/math/FMaxTest.h
+++ b/libc/test/src/math/FMaxTest.h
@@ -65,9 +65,9 @@ class FMaxTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
          ++i, v += STEP, w -= STEP) {
       T x = FPBits(v).get_val(), y = FPBits(w).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      if (isnan(y) || isinf(y))
+      if (FPBits(w).is_nan() || FPBits(w).is_inf())
         continue;
       if ((x == 0) && (y == 0))
         continue;
diff --git a/libc/test/src/math/FMinTest.h b/libc/test/src/math/FMinTest.h
index eae0008ddfe39..51c21ae56a774 100644
--- a/libc/test/src/math/FMinTest.h
+++ b/libc/test/src/math/FMinTest.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_FMINTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_FMINTEST_H
 
+#include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -65,9 +66,9 @@ class FMinTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
          ++i, v += STEP, w -= STEP) {
       T x = FPBits(v).get_val(), y = FPBits(w).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      if (isnan(y) || isinf(y))
+      if (FPBits(w).is_nan() || FPBits(w).is_inf())
         continue;
       if ((x == 0) && (y == 0))
         continue;
diff --git a/libc/test/src/math/FMulTest.h b/libc/test/src/math/FMulTest.h
deleted file mode 100644
index 8ca33ea71b712..0000000000000
--- a/libc/test/src/math/FMulTest.h
+++ /dev/null
@@ -1,121 +0,0 @@
-//===-- Utility class to test fmul[f|l] -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H
-#define LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H
-
-#include "src/__support/FPUtil/FPBits.h"
-#include "test/UnitTest/FEnvSafeTest.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include "utils/MPFRWrapper/MPFRUtils.h"
-
-namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
-
-template <typename OutType, typename InType>
-class FmulMPFRTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
-
-  DECLARE_SPECIAL_CONSTANTS(InType)
-
-public:
-  typedef OutType (*FMulFunc)(InType, InType);
-
-  void testFMulMPFR(FMulFunc func) {
-    constexpr int N = 10;
-    mpfr::BinaryInput<InType> INPUTS[N] = {
-        {3.0, 5.0},
-        {0x1.0p1, 0x1.0p-131},
-        {0x1.0p2, 0x1.0p-129},
-        {1.0, 1.0},
-        {-0.0, -0.0},
-        {-0.0, 0.0},
-        {0.0, -0.0},
-        {0x1.0p100, 0x1.0p100},
-        {1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150},
-        {1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150}};
-
-    for (int i = 0; i < N; ++i) {
-      InType x = INPUTS[i].x;
-      InType y = INPUTS[i].y;
-      ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i],
-                                     func(x, y), 0.5);
-    }
-  }
-
-  void testSpecialInputsMPFR(FMulFunc func) {
-    constexpr int N = 27;
-    mpfr::BinaryInput<InType> INPUTS[N] = {{inf, 0x1.0p-129},
-                                           {0x1.0p-129, inf},
-                                           {inf, 2.0},
-                                           {3.0, inf},
-                                           {0.0, 0.0},
-                                           {neg_inf, aNaN},
-                                           {aNaN, neg_inf},
-                                           {neg_inf, neg_inf},
-                                           {0.0, neg_inf},
-                                           {neg_inf, 0.0},
-                                           {neg_inf, 1.0},
-                                           {1.0, neg_inf},
-                                           {neg_inf, 0x1.0p-129},
-                                           {0x1.0p-129, neg_inf},
-                                           {0.0, 0x1.0p-129},
-                                           {inf, 0.0},
-                                           {0.0, inf},
-                                           {0.0, aNaN},
-                                           {2.0, aNaN},
-                                           {0x1.0p-129, aNaN},
-                                           {inf, aNaN},
-                                           {aNaN, aNaN},
-                                           {0.0, sNaN},
-                                           {2.0, sNaN},
-                                           {0x1.0p-129, sNaN},
-                                           {inf, sNaN},
-                                           {sNaN, sNaN}};
-
-    for (int i = 0; i < N; ++i) {
-      InType x = INPUTS[i].x;
-      InType y = INPUTS[i].y;
-      ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i],
-                                     func(x, y), 0.5);
-    }
-  }
-
-  void testNormalRange(FMulFunc func) {
-    using FPBits = LIBC_NAMESPACE::fputil::FPBits<InType>;
-    using StorageType = typename FPBits::StorageType;
-    static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
-    static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-
-    constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (int signs = 0; signs < 4; ++signs) {
-      for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
-           v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
-        InType x = FPBits(v).get_val(), y = FPBits(w).get_val();
-        if (signs % 2 == 1) {
-          x = -x;
-        }
-        if (signs >= 2) {
-          y = -y;
-        }
-
-        mpfr::BinaryInput<InType> input{x, y};
-        ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, input, func(x, y),
-                                       0.5);
-      }
-    }
-  }
-};
-
-#define LIST_FMUL_MPFR_TESTS(OutType, InType, func)                            \
-  using LlvmLibcFmulTest = FmulMPFRTest<OutType, InType>;                      \
-  TEST_F(LlvmLibcFmulTest, MulMpfr) { testFMulMPFR(&func); }                   \
-  TEST_F(LlvmLibcFmulTest, NanInfMpfr) { testSpecialInputsMPFR(&func); }       \
-  TEST_F(LlvmLibcFmulTest, NormalRange) { testNormalRange(&func); }
-
-#endif // LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H
diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h
index 3ba64afa3c620..74a2d607b0f9f 100644
--- a/libc/test/src/math/FrexpTest.h
+++ b/libc/test/src/math/FrexpTest.h
@@ -99,7 +99,7 @@ class FrexpTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x == 0.0l)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0l)
         continue;
 
       mpfr::BinaryOutput<T> result;
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index c2d5a1326e0ed..ccb92eb552301 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -82,7 +82,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
     for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x == 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0)
         continue;
 
       int exponent;
@@ -101,7 +101,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
     for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x == 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0)
         continue;
 
       int exponent;
diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h
index d6042e3c200c7..5ef3b2675a49e 100644
--- a/libc/test/src/math/LogbTest.h
+++ b/libc/test/src/math/LogbTest.h
@@ -78,7 +78,7 @@ class LogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x == 0.0l)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0l)
         continue;
 
       int exponent;
diff --git a/libc/test/src/math/ModfTest.h b/libc/test/src/math/ModfTest.h
index d6c6f27a5edf6..3377290169610 100644
--- a/libc/test/src/math/ModfTest.h
+++ b/libc/test/src/math/ModfTest.h
@@ -90,7 +90,7 @@ class ModfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
       T x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x == T(0.0))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == T(0.0))
         continue;
 
       T integral;
diff --git a/libc/test/src/math/MulTest.h b/libc/test/src/math/MulTest.h
new file mode 100644
index 0000000000000..cb81a795be36b
--- /dev/null
+++ b/libc/test/src/math/MulTest.h
@@ -0,0 +1,95 @@
+//===-- Utility class to test different flavors of float mul ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_MULTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_MULTEST_H
+
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+#include "test/UnitTest/FEnvSafeTest.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+template <typename OutType, typename InType>
+class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
+
+  struct InConstants {
+    DECLARE_SPECIAL_CONSTANTS(InType)
+  };
+
+  using InFPBits = typename InConstants::FPBits;
+  using InStorageType = typename InConstants::StorageType;
+
+  static constexpr InStorageType IN_MAX_NORMAL_U =
+      InFPBits::max_normal().uintval();
+  static constexpr InStorageType IN_MIN_NORMAL_U =
+      InFPBits::min_normal().uintval();
+  static constexpr InStorageType IN_MAX_SUBNORMAL_U =
+      InFPBits::max_subnormal().uintval();
+  static constexpr InStorageType IN_MIN_SUBNORMAL_U =
+      InFPBits::min_subnormal().uintval();
+
+  InStorageType get_random_bit_pattern() {
+    InStorageType bits{0};
+    for (InStorageType i = 0; i < sizeof(InStorageType) / 2; ++i)
+      bits = (bits << 2) + static_cast<uint16_t>(LIBC_NAMESPACE::rand());
+    return bits;
+  }
+
+public:
+  using MulFunc = OutType (*)(InType, InType);
+
+  void test_subnormal_range(MulFunc func) {
+    constexpr InStorageType COUNT = 10'001;
+    constexpr InStorageType STEP =
+        (IN_MAX_SUBNORMAL_U - IN_MIN_SUBNORMAL_U) / COUNT;
+    LIBC_NAMESPACE::srand(1);
+    for (int signs = 0; signs < 4; signs++) {
+      for (InStorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+        InType x = InFPBits(get_random_bit_pattern()).get_val();
+        InType y = InFPBits(v).get_val();
+        if ((signs & 1) != 0)
+          x = -x;
+        if ((signs & 2) != 0)
+          y = -y;
+        mpfr::BinaryInput<InType> input{x, y};
+        EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input, func(x, y),
+                                       0.5);
+      }
+    }
+  }
+
+  void test_normal_range(MulFunc func) {
+    constexpr InStorageType COUNT = 10'001;
+    constexpr InStorageType STEP = (IN_MAX_NORMAL_U - IN_MIN_NORMAL_U) / COUNT;
+    LIBC_NAMESPACE::srand(1);
+    for (int signs = 0; signs < 4; signs++) {
+      for (InStorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+        InType x = InFPBits(get_random_bit_pattern()).get_val();
+        InType y = InFPBits(v).get_val();
+        if ((signs & 1) != 0)
+          x = -x;
+        if ((signs & 2) != 0)
+          y = -y;
+        mpfr::BinaryInput<InType> input{x, y};
+        EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input, func(x, y),
+                                       0.5);
+      }
+    }
+  }
+};
+
+#define LIST_MUL_TESTS(OutType, InType, func)                                  \
+  using LlvmLibcMulTest = MulTest<OutType, InType>;                            \
+  TEST_F(LlvmLibcMulTest, SubnormalRange) { test_subnormal_range(&func); }     \
+  TEST_F(LlvmLibcMulTest, NormalRange) { test_normal_range(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_MULTEST_H
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index c39f2394555ea..a35a6eee8c2e0 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -127,7 +127,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
 
       // In normal range on x86 platforms, the long double implicit 1 bit can be
       // zero making the numbers NaN. Hence we test for them separately.
-      if (isnan(x) || isnan(y)) {
+      if (FPBits(v).is_nan() || FPBits(w).is_nan()) {
         ASSERT_FP_EQ(result.f, nan);
         continue;
       }
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index 0d25a808e0bf3..488058237d191 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcAcosfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x,
                                    LIBC_NAMESPACE::acosf(x), 0.5);
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index 32761e25b5ce5..5d7f59792b52b 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcAcoshfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acosh, x,
                                    LIBC_NAMESPACE::acoshf(x), 0.5);
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index 91e61085e91b8..09dc3c960ed9a 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -46,7 +46,7 @@ TEST_F(LlvmLibcAsinfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x,
                                    LIBC_NAMESPACE::asinf(x), 0.5);
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index b19e26efd07bf..3e55a563879d0 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcAsinhfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x,
                                    LIBC_NAMESPACE::asinhf(x), 0.5);
diff --git a/libc/test/src/math/atan2_test.cpp b/libc/test/src/math/atan2_test.cpp
new file mode 100644
index 0000000000000..637ac8928a2bc
--- /dev/null
+++ b/libc/test/src/math/atan2_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+using LIBC_NAMESPACE::testing::tlog;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+TEST_F(LlvmLibcAtan2Test, TrickyInputs) {
+  mpfr::BinaryInput<double> inputs[] = {
+      {0x1.0853408534085p-2, 0x1.e7b54166c6126p-2},
+      {FPBits::inf().get_val(), 0x0.0000000000001p-1022},
+  };
+
+  for (mpfr::BinaryInput<double> &input : inputs) {
+    double x = input.x;
+    double y = input.y;
+    mpfr::RoundingMode rm = mpfr::RoundingMode::Downward;
+    mpfr::ForceRoundingMode rr(rm);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Atan2, input,
+                      LIBC_NAMESPACE::atan2(x, y), 0.5, rm);
+    input.x = -input.x;
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+                                   LIBC_NAMESPACE::atan2(-x, y), 0.5);
+    input.y = -input.y;
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+                                   LIBC_NAMESPACE::atan2(-x, -y), 0.5);
+    input.x = -input.x;
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+                                   LIBC_NAMESPACE::atan2(x, -y), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcAtan2Test, InDoubleRange) {
+  constexpr uint64_t X_COUNT = 123;
+  constexpr uint64_t X_START = FPBits(0.25).uintval();
+  constexpr uint64_t X_STOP = FPBits(4.0).uintval();
+  constexpr uint64_t X_STEP = (X_STOP - X_START) / X_COUNT;
+
+  constexpr uint64_t Y_COUNT = 137;
+  constexpr uint64_t Y_START = FPBits(0.25).uintval();
+  constexpr uint64_t Y_STOP = FPBits(4.0).uintval();
+  constexpr uint64_t Y_STEP = (Y_STOP - Y_START) / Y_COUNT;
+
+  auto test = [&](mpfr::RoundingMode rounding_mode) {
+    mpfr::ForceRoundingMode __r(rounding_mode);
+    if (!__r.success)
+      return;
+
+    uint64_t fails = 0;
+    uint64_t finite_count = 0;
+    uint64_t total_count = 0;
+    double failed_x = 0.0, failed_y = 0.0, failed_r = 0.0;
+    double tol = 0.5;
+
+    for (uint64_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
+      double x = FPBits(v).get_val();
+      if (FPBits(x).is_inf_or_nan() || x < 0.0)
+        continue;
+
+      for (uint64_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
+        double y = FPBits(w).get_val();
+        if (FPBits(y).is_inf_or_nan())
+          continue;
+
+        double result = LIBC_NAMESPACE::atan2(x, y);
+        ++total_count;
+        if (FPBits(result).is_inf_or_nan())
+          continue;
+
+        ++finite_count;
+        mpfr::BinaryInput<double> inputs{x, y};
+
+        if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Atan2, inputs,
+                                               result, 0.5, rounding_mode)) {
+          ++fails;
+          while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(
+              mpfr::Operation::Atan2, inputs, result, tol, rounding_mode)) {
+            failed_x = x;
+            failed_y = y;
+            failed_r = result;
+
+            if (tol > 1000.0)
+              break;
+
+            tol *= 2.0;
+          }
+        }
+      }
+    }
+    if (fails || (finite_count < total_count)) {
+      tlog << " Atan2 failed: " << fails << "/" << finite_count << "/"
+           << total_count << " tests.\n"
+           << "   Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
+    }
+    if (fails) {
+      mpfr::BinaryInput<double> inputs{failed_x, failed_y};
+      EXPECT_MPFR_MATCH(mpfr::Operation::Atan2, inputs, failed_r, 0.5,
+                        rounding_mode);
+    }
+  };
+
+  tlog << " Test Rounding To Nearest...\n";
+  test(mpfr::RoundingMode::Nearest);
+
+  tlog << " Test Rounding Downward...\n";
+  test(mpfr::RoundingMode::Downward);
+
+  tlog << " Test Rounding Upward...\n";
+  test(mpfr::RoundingMode::Upward);
+
+  tlog << " Test Rounding Toward Zero...\n";
+  test(mpfr::RoundingMode::TowardZero);
+}
diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp
index 1242b7e66528f..331f4281af839 100644
--- a/libc/test/src/math/atan2f_test.cpp
+++ b/libc/test/src/math/atan2f_test.cpp
@@ -73,18 +73,18 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) {
 
     for (uint32_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
       float x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
 
       for (uint32_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
         float y = FPBits(w).get_val();
-        if (isnan(y) || isinf(y))
+        if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
         LIBC_NAMESPACE::libc_errno = 0;
         float result = LIBC_NAMESPACE::atan2f(x, y);
         ++total_count;
-        if (isnan(result) || isinf(result))
+        if (FPBits(result).is_nan() || FPBits(result).is_inf())
           continue;
 
         ++finite_count;
diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp
index 123351496118b..2ef2140966f52 100644
--- a/libc/test/src/math/cbrt_test.cpp
+++ b/libc/test/src/math/cbrt_test.cpp
@@ -21,8 +21,8 @@ using LIBC_NAMESPACE::testing::tlog;
 
 TEST_F(LlvmLibcCbrtTest, InDoubleRange) {
   constexpr uint64_t COUNT = 123'451;
-  uint64_t START = LIBC_NAMESPACE::fputil::FPBits<double>(1.0).uintval();
-  uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits<double>(8.0).uintval();
+  uint64_t START = FPBits(1.0).uintval();
+  uint64_t STOP = FPBits(8.0).uintval();
   uint64_t STEP = (STOP - START) / COUNT;
 
   auto test = [&](mpfr::RoundingMode rounding_mode) {
@@ -38,12 +38,12 @@ TEST_F(LlvmLibcCbrtTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(x).is_inf_or_nan())
         continue;
 
       double result = LIBC_NAMESPACE::cbrt(x);
       ++total;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_inf_or_nan())
         continue;
 
       ++tested;
diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp
index e12e9a8274692..484d47fd3e96c 100644
--- a/libc/test/src/math/cos_test.cpp
+++ b/libc/test/src/math/cos_test.cpp
@@ -81,12 +81,12 @@ TEST_F(LlvmLibcCosTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
 
       double result = LIBC_NAMESPACE::cos(x);
       ++total;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++tested;
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index dab35fa1a9fe7..82790e3a9d480 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -47,7 +47,7 @@ TEST_F(LlvmLibcCosfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x,
                                    LIBC_NAMESPACE::cosf(x), 0.5);
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 7c5d6630e1093..00bbf4b0bd946 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -61,7 +61,7 @@ TEST_F(LlvmLibcCoshfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH(mpfr::Operation::Cosh, x, LIBC_NAMESPACE::coshf(x), 0.5);
   }
diff --git a/libc/test/src/math/daddl_test.cpp b/libc/test/src/math/daddl_test.cpp
new file mode 100644
index 0000000000000..7a34d962d5207
--- /dev/null
+++ b/libc/test/src/math/daddl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for daddl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/daddl.h"
+
+LIST_ADD_TESTS(double, long double, LIBC_NAMESPACE::daddl)
diff --git a/libc/test/src/math/dfmal_test.cpp b/libc/test/src/math/dfmal_test.cpp
new file mode 100644
index 0000000000000..3c38f5eb7db9d
--- /dev/null
+++ b/libc/test/src/math/dfmal_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dfmal -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/dfmal.h"
+
+LIST_NARROWING_FMA_TESTS(double, long double, LIBC_NAMESPACE::dfmal)
diff --git a/libc/test/src/math/dmull_test.cpp b/libc/test/src/math/dmull_test.cpp
new file mode 100644
index 0000000000000..1b9c9c2c24ed3
--- /dev/null
+++ b/libc/test/src/math/dmull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dmull -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/dmull.h"
+
+LIST_MUL_TESTS(double, long double, LIBC_NAMESPACE::dmull)
diff --git a/libc/test/src/math/dsqrtl_test.cpp b/libc/test/src/math/dsqrtl_test.cpp
new file mode 100644
index 0000000000000..c178e389a57cd
--- /dev/null
+++ b/libc/test/src/math/dsqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsqrtl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/dsqrtl.h"
+
+LIST_NARROWING_SQRT_TESTS(double, long double, LIBC_NAMESPACE::dsqrtl)
diff --git a/libc/test/src/math/dsubl_test.cpp b/libc/test/src/math/dsubl_test.cpp
new file mode 100644
index 0000000000000..98846e0b6e3b3
--- /dev/null
+++ b/libc/test/src/math/dsubl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubl.h"
+
+LIST_SUB_TESTS(double, long double, LIBC_NAMESPACE::dsubl)
diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp
index 5c848d7d5bf7d..851eda4b1542f 100644
--- a/libc/test/src/math/erff_test.cpp
+++ b/libc/test/src/math/erff_test.cpp
@@ -64,12 +64,12 @@ TEST_F(LlvmLibcErffTest, InFloatRange) {
 
     for (uint32_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       float x = FPBits(v).get_val();
-      if (isnan(x))
+      if (FPBits(v).is_nan())
         continue;
 
       float result = LIBC_NAMESPACE::erff(x);
       ++cc;
-      if (isnan(result))
+      if (FPBits(result).is_nan())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index 4cbdd169d8032..61ae33ecae525 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -104,12 +104,12 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::exp10(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index e9b2786681042..001b37809d930 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -111,7 +111,7 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10f(x);
@@ -120,7 +120,8 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
                                    LIBC_NAMESPACE::exp10f(x), 0.5);
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index 73232ed36077b..f218eea0b358f 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -79,12 +79,12 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::exp2(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index 8ff0ce6a6e724..7caf1489d0221 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -107,7 +107,7 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2f(x);
@@ -116,7 +116,8 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
                                    LIBC_NAMESPACE::exp2f(x), 0.5);
diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
index cb948289b6179..793cf0cc2cbb4 100644
--- a/libc/test/src/math/exp2m1f_test.cpp
+++ b/libc/test/src/math/exp2m1f_test.cpp
@@ -49,7 +49,7 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2m1f(x);
@@ -58,7 +58,8 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
                                    LIBC_NAMESPACE::exp2m1f(x), 0.5);
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 64d8198e64f2d..ee674c5fe6cb5 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -77,12 +77,12 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::exp(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/expf16_test.cpp b/libc/test/src/math/expf16_test.cpp
new file mode 100644
index 0000000000000..ee89a9c716e12
--- /dev/null
+++ b/libc/test/src/math/expf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for expf16 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExpf16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
+                                   LIBC_NAMESPACE::expf16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcExpf16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
+                                   LIBC_NAMESPACE::expf16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index 1dce381918eb6..26a0bca4ce253 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -108,7 +108,7 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::expf(x);
@@ -117,7 +117,8 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
                                    LIBC_NAMESPACE::expf(x), 0.5);
diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp
index bcca87f590d75..01197b835433f 100644
--- a/libc/test/src/math/explogxf_test.cpp
+++ b/libc/test/src/math/explogxf_test.cpp
@@ -17,6 +17,7 @@
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
 using LlvmLibcExplogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+using FPBits = LIBC_NAMESPACE::fputil::FPBits<float>;
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
@@ -24,7 +25,8 @@ constexpr int def_count = 100003;
 constexpr float def_prec = 0.500001f;
 
 auto f_normal = [](float x) -> bool {
-  return !(isnan(x) || isinf(x) || LIBC_NAMESPACE::fabs(x) < 2E-38);
+  return !(FPBits(x).is_nan() || FPBits(x).is_inf() ||
+           LIBC_NAMESPACE::fabs(x) < 2E-38);
 };
 
 TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
@@ -34,7 +36,7 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
     return static_cast<float>(result.mh * r);
   };
   auto f_check = [](float x) -> bool {
-    return !((isnan(x) || isinf(x) || x < -70 || x > 70 ||
+    return !((FPBits(x).is_nan() || FPBits(x).is_inf() || x < -70 || x > 70 ||
               LIBC_NAMESPACE::fabsf(x) < 0x1.0p-10));
   };
   CHECK_DATA(0.0f, neg_inf, mpfr::Operation::Exp, fx, f_check, def_count,
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index df5c08864bb8a..9720773d9f960 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -62,12 +62,12 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::expm1(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 515f988b62649..274fe3bb7afb0 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -117,7 +117,7 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::expm1f(x);
@@ -126,7 +126,8 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                    LIBC_NAMESPACE::expm1f(x), 0.5);
diff --git a/libc/test/src/math/f16mul_test.cpp b/libc/test/src/math/f16mul_test.cpp
new file mode 100644
index 0000000000000..49b443870c483
--- /dev/null
+++ b/libc/test/src/math/f16mul_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mul ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mul.h"
+
+LIST_MUL_TESTS(float16, double, LIBC_NAMESPACE::f16mul)
diff --git a/libc/test/src/math/f16mulf_test.cpp b/libc/test/src/math/f16mulf_test.cpp
new file mode 100644
index 0000000000000..bf2530863621d
--- /dev/null
+++ b/libc/test/src/math/f16mulf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mulf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mulf.h"
+
+LIST_MUL_TESTS(float16, float, LIBC_NAMESPACE::f16mulf)
diff --git a/libc/test/src/math/f16mull_test.cpp b/libc/test/src/math/f16mull_test.cpp
new file mode 100644
index 0000000000000..5292ddb87b7f4
--- /dev/null
+++ b/libc/test/src/math/f16mull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mull ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mull.h"
+
+LIST_MUL_TESTS(float16, long double, LIBC_NAMESPACE::f16mull)
diff --git a/libc/test/src/math/fadd_test.cpp b/libc/test/src/math/fadd_test.cpp
new file mode 100644
index 0000000000000..fe9ac8b252ed3
--- /dev/null
+++ b/libc/test/src/math/fadd_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fadd ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/fadd.h"
+
+LIST_ADD_TESTS(float, double, LIBC_NAMESPACE::fadd)
diff --git a/libc/test/src/math/fmul_test.cpp b/libc/test/src/math/fmul_test.cpp
index 16eaa1a818daf..3f6df66456bac 100644
--- a/libc/test/src/math/fmul_test.cpp
+++ b/libc/test/src/math/fmul_test.cpp
@@ -1,13 +1,13 @@
-//===-- Unittests for fmul-------------------------------------------------===//
+//===-- Unittests for fmul ------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 
-#include "FMulTest.h"
+#include "MulTest.h"
 
 #include "src/math/fmul.h"
 
-LIST_FMUL_MPFR_TESTS(float, double, LIBC_NAMESPACE::fmul)
+LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
diff --git a/libc/test/src/math/fmull_test.cpp b/libc/test/src/math/fmull_test.cpp
new file mode 100644
index 0000000000000..ef694063f20dc
--- /dev/null
+++ b/libc/test/src/math/fmull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmull -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/fmull.h"
+
+LIST_MUL_TESTS(float, long double, LIBC_NAMESPACE::fmull)
diff --git a/libc/test/src/math/fsqrt_test.cpp b/libc/test/src/math/fsqrt_test.cpp
new file mode 100644
index 0000000000000..8471e3c9365e9
--- /dev/null
+++ b/libc/test/src/math/fsqrt_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fsqrt -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/fsqrt.h"
+
+LIST_NARROWING_SQRT_TESTS(float, double, LIBC_NAMESPACE::fsqrt)
diff --git a/libc/test/src/math/fsqrtl_test.cpp b/libc/test/src/math/fsqrtl_test.cpp
new file mode 100644
index 0000000000000..1082a33466857
--- /dev/null
+++ b/libc/test/src/math/fsqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fsqrtl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/fsqrtl.h"
+
+LIST_NARROWING_SQRT_TESTS(float, long double, LIBC_NAMESPACE::fsqrtl)
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index fd9a615ca87f7..32b84686badea 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -100,12 +100,12 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::log10(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index 47dfa406ec257..98486deb12c74 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -101,12 +101,12 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = start; i <= COUNT; ++i, v += step) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::log1p(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index db0772d3c8b87..b42cf3b8681b2 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -74,7 +74,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x,
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index 9992c1340e99d..f9bd93d44e50f 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -99,12 +99,12 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::log2(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index 24b51adac94d1..83691fb75300e 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::log2f(x);
@@ -58,7 +58,8 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+        LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
                                    LIBC_NAMESPACE::log2f(x), 0.5);
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index de1e59579419e..c0f9edfc540a1 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -98,12 +98,12 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::log(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
index 28a171d540665..79d8275856b6f 100644
--- a/libc/test/src/math/logf_test.cpp
+++ b/libc/test/src/math/logf_test.cpp
@@ -82,7 +82,7 @@ TEST_F(LlvmLibcLogfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x,
                                    LIBC_NAMESPACE::logf(x), 0.5);
diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 995e41ba84b03..63d9768e21899 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
@@ -28,11 +29,11 @@ template <typename T> class BinaryOpSingleOutputPerf {
   static void run_perf_in_range(Func myFunc, Func otherFunc,
                                 StorageType startingBit, StorageType endingBit,
                                 size_t N, size_t rounds, std::ofstream &log) {
-    if (endingBit - startingBit < N)
-      N = endingBit - startingBit;
+    if (sizeof(StorageType) <= sizeof(size_t))
+      N = cpp::min(N, static_cast<size_t>(endingBit - startingBit));
 
     auto runner = [=](Func func) {
-      volatile T result;
+      [[maybe_unused]] volatile T result;
       if (endingBit < startingBit) {
         return;
       }
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index a75becba04d07..b43d21a242f35 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_library(
-  libc_diff_test_utils
+  libc_diff_test_utils STATIC
   Timer.cpp
   Timer.h
 )
@@ -95,6 +95,9 @@ add_header_library(
   single_input_single_output_diff
   HDRS
     SingleInputSingleOutputPerf.h
+  DEPENDS
+    libc.src.__support.CPP.algorithm
+    libc.src.__support.FPUtil.fp_bits
 )
 
 add_header_library(
@@ -102,6 +105,7 @@ add_header_library(
   HDRS
     BinaryOpSingleOutputPerf.h
   DEPENDS
+    libc.src.__support.CPP.algorithm
     libc.src.__support.FPUtil.fp_bits
 )
 
@@ -171,6 +175,17 @@ add_perf_binary(
     -fno-builtin
 )
 
+add_perf_binary(
+  expf16_perf
+  SRCS
+    expf16_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.expf16
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
 add_perf_binary(
   fabsf_perf
   SRCS
@@ -402,3 +417,40 @@ add_perf_binary(
   LINK_LIBRARIES
     LibcFPTestHelpers
 )
+
+add_perf_binary(
+  misc_basic_ops_perf
+  SRCS
+    misc_basic_ops_perf.cpp
+  DEPENDS
+    .binary_op_single_output_diff
+    .single_input_single_output_diff
+    libc.src.math.copysignf
+    libc.src.math.copysignf16
+    libc.src.math.fabsf
+    libc.src.math.fabsf16
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
+add_perf_binary(
+  max_min_funcs_perf
+  SRCS
+    max_min_funcs_perf.cpp
+  DEPENDS
+    .binary_op_single_output_diff
+    libc.src.math.fmaxf
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximumf
+    libc.src.math.fmaximumf16
+    libc.src.math.fmaximum_numf
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fminf
+    libc.src.math.fminf16
+    libc.src.math.fminimumf
+    libc.src.math.fminimumf16
+    libc.src.math.fminimum_numf
+    libc.src.math.fminimum_numf16
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
index 48ae43d6315e3..efad1259d6bf1 100644
--- a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
@@ -26,16 +27,21 @@ template <typename T> class SingleInputSingleOutputPerf {
 
   static void runPerfInRange(Func myFunc, Func otherFunc,
                              StorageType startingBit, StorageType endingBit,
-                             std::ofstream &log) {
+                             size_t rounds, std::ofstream &log) {
+    size_t n = 10'010'001;
+    if (sizeof(StorageType) <= sizeof(size_t))
+      n = cpp::min(n, static_cast<size_t>(endingBit - startingBit));
+
     auto runner = [=](Func func) {
-      constexpr StorageType N = 10'010'001;
-      StorageType step = (endingBit - startingBit) / N;
+      StorageType step = (endingBit - startingBit) / n;
       if (step == 0)
         step = 1;
-      volatile T result;
-      for (StorageType bits = startingBit; bits < endingBit; bits += step) {
-        T x = FPBits(bits).get_val();
-        result = func(x);
+      [[maybe_unused]] volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = startingBit; bits < endingBit; bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
       }
     };
 
@@ -44,8 +50,7 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    StorageType numberOfRuns = endingBit - startingBit + 1;
-    double myAverage = static_cast<double>(timer.nanoseconds()) / numberOfRuns;
+    double myAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << myAverage << " ns/op \n";
@@ -56,8 +61,7 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double otherAverage =
-        static_cast<double>(timer.nanoseconds()) / numberOfRuns;
+    double otherAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << otherAverage << " ns/op \n";
@@ -68,15 +72,18 @@ template <typename T> class SingleInputSingleOutputPerf {
     log << "     Mine / Other's  : " << myAverage / otherAverage << " \n";
   }
 
-  static void runPerf(Func myFunc, Func otherFunc, const char *logFile) {
+  static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
+                      const char *logFile) {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                   /* endingBit= */ FPBits::max_subnormal().uintval(), log);
+                   /* endingBit= */ FPBits::max_subnormal().uintval(), rounds,
+                   log);
     log << "\n Performance tests with inputs in normal range:\n";
     runPerfInRange(myFunc, otherFunc,
                    /* startingBit= */ FPBits::min_normal().uintval(),
-                   /* endingBit= */ FPBits::max_normal().uintval(), log);
+                   /* endingBit= */ FPBits::max_normal().uintval(), rounds,
+                   log);
   }
 };
 
@@ -86,6 +93,13 @@ template <typename T> class SingleInputSingleOutputPerf {
 #define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)        \
   int main() {                                                                 \
     LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
-        &myFunc, &otherFunc, filename);                                        \
+        &myFunc, &otherFunc, 1, filename);                                     \
     return 0;                                                                  \
   }
+
+#define SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds,       \
+                                           filename)                           \
+  {                                                                            \
+    LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
+        &myFunc, &otherFunc, rounds, filename);                                \
+  }
diff --git a/libc/test/src/math/performance_testing/expf16_perf.cpp b/libc/test/src/math/performance_testing/expf16_perf.cpp
new file mode 100644
index 0000000000000..bc9d9f05559a3
--- /dev/null
+++ b/libc/test/src/math/performance_testing/expf16_perf.cpp
@@ -0,0 +1,22 @@
+//===-- Performance test for expf16 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SingleInputSingleOutputPerf.h"
+
+#include "src/math/expf16.h"
+
+// LLVM libc might be the only libc implementation with support for float16 math
+// functions currently. We can't compare our float16 functions against the
+// system libc, so we compare them against this placeholder function.
+static float16 placeholderf16(float16 x) { return x; }
+
+int main() {
+  SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::expf16,
+                                     ::placeholderf16, 20'000,
+                                     "expf16_perf.log")
+}
diff --git a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp
new file mode 100644
index 0000000000000..9540112e69ea6
--- /dev/null
+++ b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp
@@ -0,0 +1,75 @@
+//===-- Performance test for maximum and minimum functions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputPerf.h"
+#include "src/math/fmaxf.h"
+#include "src/math/fmaxf16.h"
+#include "src/math/fmaximum_numf.h"
+#include "src/math/fmaximum_numf16.h"
+#include "src/math/fmaximumf.h"
+#include "src/math/fmaximumf16.h"
+#include "src/math/fminf.h"
+#include "src/math/fminf16.h"
+#include "src/math/fminimum_numf.h"
+#include "src/math/fminimum_numf16.h"
+#include "src/math/fminimumf.h"
+#include "src/math/fminimumf16.h"
+
+#include <math.h>
+
+static constexpr size_t FLOAT16_ROUNDS = 20'000;
+static constexpr size_t FLOAT_ROUNDS = 40;
+
+// LLVM libc might be the only libc implementation with support for float16 math
+// functions currently. We can't compare our float16 functions against the
+// system libc, so we compare them against this placeholder function.
+float16 placeholder_binaryf16(float16 x, float16 y) { return x; }
+
+// The system libc might not provide the fmaximum* and fminimum* C23 math
+// functions either.
+float placeholder_binaryf(float x, float y) { return x; }
+
+int main() {
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaxf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fmaxf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fminf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximumf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fmaximumf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimumf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fminimumf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximum_numf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fmaximum_numf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimum_numf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "fminimum_numf16_perf.log")
+
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaxf, ::fmaxf,
+                                  FLOAT_ROUNDS, "fmaxf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminf, ::fminf,
+                                  FLOAT_ROUNDS, "fminf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximumf,
+                                  placeholder_binaryf, FLOAT_ROUNDS,
+                                  "fmaximumf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimumf,
+                                  placeholder_binaryf, FLOAT_ROUNDS,
+                                  "fminimumf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximum_numf,
+                                  placeholder_binaryf, FLOAT_ROUNDS,
+                                  "fmaximum_numf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimum_numf,
+                                  placeholder_binaryf, FLOAT_ROUNDS,
+                                  "fminimum_numf_perf.log")
+
+  return 0;
+}
diff --git a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
new file mode 100644
index 0000000000000..ace1d21c62c32
--- /dev/null
+++ b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
@@ -0,0 +1,41 @@
+//===-- Performance test for miscellaneous basic operations ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputPerf.h"
+#include "SingleInputSingleOutputPerf.h"
+#include "src/math/copysignf.h"
+#include "src/math/copysignf16.h"
+#include "src/math/fabsf.h"
+#include "src/math/fabsf16.h"
+
+#include <math.h>
+
+static constexpr size_t FLOAT16_ROUNDS = 20'000;
+static constexpr size_t FLOAT_ROUNDS = 40;
+
+// LLVM libc might be the only libc implementation with support for float16 math
+// functions currently. We can't compare our float16 functions against the
+// system libc, so we compare them against this placeholder function.
+float16 placeholder_unaryf16(float16 x) { return x; }
+float16 placeholder_binaryf16(float16 x, float16 y) { return x; }
+
+int main() {
+  SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16,
+                                     placeholder_unaryf16, FLOAT16_ROUNDS,
+                                     "fabsf16_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16,
+                                  placeholder_binaryf16, FLOAT16_ROUNDS,
+                                  "copysignf16_perf.log")
+
+  SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf,
+                                     FLOAT_ROUNDS, "fabsf_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf,
+                                  FLOAT_ROUNDS, "copysignf_perf.log")
+
+  return 0;
+}
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 797913e5b7eef..c13231f9d83b8 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -71,18 +71,18 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) {
 
     for (uint32_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
       float x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x) || x < 0.0)
+      if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
 
       for (uint32_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
         float y = FPBits(w).get_val();
-        if (isnan(y) || isinf(y))
+        if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
         LIBC_NAMESPACE::libc_errno = 0;
         float result = LIBC_NAMESPACE::powf(x, y);
         ++cc;
-        if (isnan(result) || isinf(result))
+        if (FPBits(result).is_nan() || FPBits(result).is_inf())
           continue;
 
         ++count;
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index 89534ae82b56e..60f6ef5c84463 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -67,12 +67,12 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
       LIBC_NAMESPACE::libc_errno = 0;
       double result = LIBC_NAMESPACE::sin(x);
       ++cc;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++count;
diff --git a/libc/test/src/math/sincos_test.cpp b/libc/test/src/math/sincos_test.cpp
index 7e06456f2e05c..09c87153890d0 100644
--- a/libc/test/src/math/sincos_test.cpp
+++ b/libc/test/src/math/sincos_test.cpp
@@ -110,7 +110,7 @@ TEST_F(LlvmLibcSincosTest, InDoubleRange) {
 
   for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
     double x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
 
     ASSERT_SINCOS_MATCH_ALL_ROUNDING(x);
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 7c359b345f4c3..7254c3be5b75f 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -101,7 +101,7 @@ TEST_F(LlvmLibcSinCosfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
 
     EXPECT_SINCOS_MATCH_ALL_ROUNDING(x);
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 6a8f8f4ee4288..370362639490b 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcSinfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x,
                                    LIBC_NAMESPACE::sinf(x), 0.5);
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index cc0552f728947..400df2f18dabc 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -46,7 +46,7 @@ TEST_F(LlvmLibcSinhfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH(mpfr::Operation::Sinh, x, LIBC_NAMESPACE::sinhf(x), 0.5);
   }
diff --git a/libc/test/src/math/smoke/AddTest.h b/libc/test/src/math/smoke/AddTest.h
index c713c5a88763c..0b7e395a22d4c 100644
--- a/libc/test/src/math/smoke/AddTest.h
+++ b/libc/test/src/math/smoke/AddTest.h
@@ -38,23 +38,8 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
-
-    if constexpr (sizeof(OutType) < sizeof(InType)) {
-      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
-      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
-    }
+    EXPECT_FP_IS_NAN(func(qnan_42, zero));
+    EXPECT_FP_IS_NAN(func(zero, qnan_42));
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 76d5919ad9156..2aba5abb5a4d8 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -141,6 +141,20 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  fadd_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fadd_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.fadd
+    libc.src.__support.FPUtil.basic_operations
+
+)
+
 add_fp_unittest(
   trunc_test
   SUITE
@@ -344,6 +358,58 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  dfmal_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dfmal_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.dfmal
+)
+
+add_fp_unittest(
+  dfmaf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dfmaf128_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.dfmaf128
+)
+
+add_fp_unittest(
+  dsubl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dsubl_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.errno.errno
+    libc.hdr.fenv_macros
+    libc.src.math.dsubl
+)
+
+add_fp_unittest(
+  dsubf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dsubf128_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.dsubf128
+)
+
 add_fp_unittest(
   floor_test
   SUITE
@@ -926,6 +992,18 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  exp_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    exp_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.exp
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_fp_unittest(
   expf_test
   SUITE
@@ -939,15 +1017,16 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
- exp_test
- SUITE
-   libc-math-smoke-tests
- SRCS
-   exp_test.cpp
- DEPENDS
-   libc.src.errno.errno
-   libc.src.math.exp
-   libc.src.__support.FPUtil.fp_bits
+  expf16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    expf16_test.cpp
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.expf16
 )
 
 add_fp_unittest(
@@ -2531,10 +2610,27 @@ add_fp_unittest(
   SRCS
     fmul_test.cpp
   HDRS
-    FMulTest.h
+    MulTest.h
   DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
     libc.src.math.fmul
-    libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
+  fmull_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.fmull
 )
 
 add_fp_unittest(
@@ -2716,9 +2812,12 @@ add_fp_unittest(
     libc-math-smoke-tests
   SRCS
     hypotf_test.cpp
+  HDRS
+    HypotTest.h
   DEPENDS
     libc.src.math.hypotf
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.macros.properties.architectures    
 )
 
 add_fp_unittest(
@@ -2727,9 +2826,12 @@ add_fp_unittest(
     libc-math-smoke-tests
   SRCS
     hypot_test.cpp
+  HDRS
+    HypotTest.h
   DEPENDS
     libc.src.math.hypot
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.macros.properties.architectures
 )
 
 add_fp_unittest(
@@ -3494,6 +3596,16 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  atan2_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    atan2_test.cpp
+  DEPENDS
+    libc.src.math.atan2
+)
+
 add_fp_unittest(
   scalblnf16_test
   SUITE
@@ -3594,6 +3706,30 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  totalorder_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalorder_test.cpp
+  HDRS
+    TotalOrderTest.h
+  DEPENDS
+    libc.src.math.totalorder
+)
+
+add_fp_unittest(
+  totalorderf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalorderf_test.cpp
+  HDRS
+    TotalOrderTest.h
+  DEPENDS
+    libc.src.math.totalorderf
+)
+
 add_fp_unittest(
   totalorderf16_test
   SUITE
@@ -3606,6 +3742,54 @@ add_fp_unittest(
     libc.src.math.totalorderf16
 )
 
+add_fp_unittest(
+  totalorderf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalorderf128_test.cpp
+  HDRS
+    TotalOrderTest.h
+  DEPENDS
+    libc.src.math.totalorderf128
+)
+
+add_fp_unittest(
+  totalordermag_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermag_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermag
+)
+
+add_fp_unittest(
+  totalordermagf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagf_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagf
+)
+
+add_fp_unittest(
+  totalordermagl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagl_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagl
+)
+
 add_fp_unittest(
   totalordermagf16_test
   SUITE
@@ -3618,6 +3802,42 @@ add_fp_unittest(
     libc.src.math.totalordermagf16
 )
 
+add_fp_unittest(
+  totalordermagf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagf128_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagf128
+)
+
+add_fp_unittest(
+  getpayload_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    getpayload_test.cpp
+  HDRS
+    GetPayloadTest.h
+  DEPENDS
+    libc.src.math.getpayload
+)
+
+add_fp_unittest(
+  getpayloadf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    getpayloadf_test.cpp
+  HDRS
+    GetPayloadTest.h
+  DEPENDS
+    libc.src.math.getpayloadf
+)
+
 add_fp_unittest(
   getpayloadf16_test
   SUITE
@@ -3630,6 +3850,42 @@ add_fp_unittest(
     libc.src.math.getpayloadf16
 )
 
+add_fp_unittest(
+  getpayloadf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    getpayloadf128_test.cpp
+  HDRS
+    GetPayloadTest.h
+  DEPENDS
+    libc.src.math.getpayloadf128
+)
+
+add_fp_unittest(
+  setpayload_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    setpayload_test.cpp
+  HDRS
+    SetPayloadTest.h
+  DEPENDS
+    libc.src.math.setpayload
+)
+
+add_fp_unittest(
+  setpayloadf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    setpayloadf_test.cpp
+  HDRS
+    SetPayloadTest.h
+  DEPENDS
+    libc.src.math.setpayloadf
+)
+
 add_fp_unittest(
   setpayloadf16_test
   SUITE
@@ -3642,6 +3898,18 @@ add_fp_unittest(
     libc.src.math.setpayloadf16
 )
 
+add_fp_unittest(
+  setpayloadf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    setpayloadf128_test.cpp
+  HDRS
+    SetPayloadTest.h
+  DEPENDS
+    libc.src.math.setpayloadf128
+)
+
 add_fp_unittest(
   setpayloadsigf16_test
   SUITE
@@ -3723,8 +3991,8 @@ add_fp_unittest(
   HDRS
     SubTest.h
   DEPENDS
+    libc.hdr.errno_macros
     libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.basic_operations
     libc.src.math.f16sub
 )
 
@@ -3737,8 +4005,8 @@ add_fp_unittest(
   HDRS
     SubTest.h
   DEPENDS
+    libc.hdr.errno_macros
     libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.basic_operations
     libc.src.math.f16subf
 )
 
@@ -3751,8 +4019,8 @@ add_fp_unittest(
   HDRS
     SubTest.h
   DEPENDS
+    libc.hdr.errno_macros
     libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.basic_operations
     libc.src.math.f16subl
 )
 
@@ -3765,8 +4033,8 @@ add_fp_unittest(
   HDRS
     SubTest.h
   DEPENDS
+    libc.hdr.errno_macros
     libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.basic_operations
     libc.src.math.f16subf128
 )
 
@@ -3922,6 +4190,67 @@ add_fp_unittest(
     libc.src.math.f16sqrtf128
 )
 
+add_fp_unittest(
+  fsqrt_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fsqrt_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.fsqrt
+)
+
+
+add_fp_unittest(
+  fsqrtl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fsqrtl_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.fsqrtl
+)
+
+add_fp_unittest(
+  fsqrtf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fsqrtf128_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.fsqrtf128
+)
+
+add_fp_unittest(
+  dsqrtl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dsqrtl_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.dsqrtl
+)
+
+add_fp_unittest(
+  dsqrtf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dsqrtf128_test.cpp
+  HDRS
+    SqrtTest.h
+  DEPENDS
+    libc.src.math.dsqrtf128
+)
+
 add_fp_unittest(
   sin_test
   SUITE
@@ -3981,3 +4310,114 @@ add_fp_unittest(
   DEPENDS
     libc.src.math.cbrt
 )
+
+add_fp_unittest(
+  dmull_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    dmull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.dmull
+)
+
+add_fp_unittest(
+  f16mul_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    f16mul_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.f16mul
+)
+
+add_fp_unittest(
+  f16mulf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    f16mulf_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.f16mulf
+)
+
+add_fp_unittest(
+  f16mull_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    f16mull_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.f16mull
+)
+
+add_fp_unittest(
+  f16mulf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    f16mulf128_test.cpp
+  HDRS
+    MulTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.math.f16mulf128
+)
+
+add_fp_unittest(
+  daddl
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    daddl_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.daddl
+)
+
+add_fp_unittest(
+  daddf128
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    daddf128_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.daddf128
+)
+
+add_fp_unittest(
+  ddivf128
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    ddivf128_test.cpp
+  HDRS
+    DivTest.h
+  DEPENDS
+    libc.src.math.ddivf128
+)
diff --git a/libc/test/src/math/smoke/DivTest.h b/libc/test/src/math/smoke/DivTest.h
index 71cfb326b97cc..b30fc17aac1d5 100644
--- a/libc/test/src/math/smoke/DivTest.h
+++ b/libc/test/src/math/smoke/DivTest.h
@@ -36,23 +36,8 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
-
-    if constexpr (sizeof(OutType) < sizeof(InType)) {
-      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
-      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
-    }
+    EXPECT_FP_IS_NAN(func(qnan_42, zero));
+    EXPECT_FP_IS_NAN(func(zero, qnan_42));
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
diff --git a/libc/test/src/math/smoke/FMulTest.h b/libc/test/src/math/smoke/FMulTest.h
deleted file mode 100644
index 33fb82c8d2da1..0000000000000
--- a/libc/test/src/math/smoke/FMulTest.h
+++ /dev/null
@@ -1,104 +0,0 @@
-//===-- Utility class to test fmul[f|l] ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
-#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
-
-#include "test/UnitTest/FEnvSafeTest.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-
-template <typename T, typename R>
-class FmulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
-
-  DECLARE_SPECIAL_CONSTANTS(T)
-
-public:
-  typedef T (*FMulFunc)(R, R);
-
-  void testMul(FMulFunc func) {
-
-    EXPECT_FP_EQ_ALL_ROUNDING(T(15.0), func(3.0, 5.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-130), func(0x1.0p1, 0x1.0p-131));
-    EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-127), func(0x1.0p2, 0x1.0p-129));
-    EXPECT_FP_EQ_ALL_ROUNDING(T(1.0), func(1.0, 1.0));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(T(0.0), func(-0.0, -0.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(0.0, -0.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(-0.0, 0.0));
-
-    EXPECT_FP_EQ_ROUNDING_NEAREST(inf, func(0x1.0p100, 0x1.0p100));
-    EXPECT_FP_EQ_ROUNDING_UPWARD(inf, func(0x1.0p100, 0x1.0p100));
-    EXPECT_FP_EQ_ROUNDING_DOWNWARD(max_normal, func(0x1.0p100, 0x1.0p100));
-    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(max_normal, func(0x1.0p100, 0x1.0p100));
-
-    EXPECT_FP_EQ_ROUNDING_NEAREST(
-        0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_DOWNWARD(
-        0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
-        0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_UPWARD(
-        0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-
-    EXPECT_FP_EQ_ROUNDING_NEAREST(
-        0x1.0p-128f + 0x1.0p-148f,
-        func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_UPWARD(
-        0x1.0p-128f + 0x1.0p-148f,
-        func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_DOWNWARD(
-        0x1.0p-128f + 0x1.0p-149f,
-        func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
-        0x1.0p-128f + 0x1.0p-149f,
-        func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
-  }
-
-  void testSpecialInputs(FMulFunc func) {
-    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 0x1.0p-129));
-    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(0x1.0p-129, inf));
-    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 2.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(3.0, inf));
-    EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0.0));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, neg_inf));
-    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(neg_inf, neg_inf));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, neg_inf));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, 0.0));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 1.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(1.0, neg_inf));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 0x1.0p-129));
-    EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(0x1.0p-129, neg_inf));
-
-    EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0x1.0p-129));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, 0.0));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, inf));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, aNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, sNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, sNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, sNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, sNaN));
-    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(sNaN, sNaN));
-  }
-};
-
-#define LIST_FMUL_TESTS(T, R, func)                                            \
-  using LlvmLibcFmulTest = FmulTest<T, R>;                                     \
-  TEST_F(LlvmLibcFmulTest, Mul) { testMul(&func); }                            \
-  TEST_F(LlvmLibcFmulTest, NaNInf) { testSpecialInputs(&func); }
-
-#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 80e9bb7366dfe..d7c62dcbeb0ed 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -9,30 +9,16 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
 
-#include "src/__support/FPUtil/FPBits.h"
-#include "test/UnitTest/FEnvSafeTest.h"
+#include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "hdr/math_macros.h"
-
 template <typename T>
-class HypotTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
+class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
 private:
   using Func = T (*)(T, T);
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
-
-  const T nan = FPBits::quiet_nan().get_val();
-  const T inf = FPBits::inf(Sign::POS).get_val();
-  const T neg_inf = FPBits::inf(Sign::NEG).get_val();
-  const T zero = FPBits::zero(Sign::POS).get_val();
-  const T neg_zero = FPBits::zero(Sign::NEG).get_val();
 
-  const T max_normal = FPBits::max_normal().get_val();
-  const T min_normal = FPBits::min_normal().get_val();
-  const T max_subnormal = FPBits::max_subnormal().get_val();
-  const T min_subnormal = FPBits::min_subnormal().get_val();
+  DECLARE_SPECIAL_CONSTANTS(T)
 
 public:
   void test_special_numbers(Func func) {
@@ -40,11 +26,18 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     // Pythagorean triples.
     constexpr T PYT[N][3] = {{3, 4, 5}, {5, 12, 13}, {8, 15, 17}, {7, 24, 25}};
 
-    EXPECT_FP_EQ(func(inf, nan), inf);
-    EXPECT_FP_EQ(func(nan, neg_inf), inf);
-    EXPECT_FP_EQ(func(nan, nan), nan);
-    EXPECT_FP_EQ(func(nan, zero), nan);
-    EXPECT_FP_EQ(func(neg_zero, nan), nan);
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+    // TODO: Investigate why sNaN tests are failing on nVidia.
+    // https://github.com/llvm/llvm-project/issues/99706.
+    EXPECT_FP_EQ(func(inf, sNaN), aNaN);
+    EXPECT_FP_EQ(func(sNaN, neg_inf), aNaN);
+#endif // !LIBC_TARGET_ARCH_IS_NVPTX
+
+    EXPECT_FP_EQ(func(inf, aNaN), inf);
+    EXPECT_FP_EQ(func(aNaN, neg_inf), inf);
+    EXPECT_FP_EQ(func(aNaN, aNaN), aNaN);
+    EXPECT_FP_EQ(func(aNaN, zero), aNaN);
+    EXPECT_FP_EQ(func(neg_zero, aNaN), aNaN);
 
     for (int i = 0; i < N; ++i) {
       EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(PYT[i][0], PYT[i][1]));
diff --git a/libc/test/src/math/smoke/MulTest.h b/libc/test/src/math/smoke/MulTest.h
new file mode 100644
index 0000000000000..0c847e39687b7
--- /dev/null
+++ b/libc/test/src/math/smoke/MulTest.h
@@ -0,0 +1,156 @@
+//===-- Utility class to test different flavors of float mul ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "test/UnitTest/FEnvSafeTest.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename OutType, typename InType>
+class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
+
+  DECLARE_SPECIAL_CONSTANTS(OutType)
+
+  struct InConstants {
+    DECLARE_SPECIAL_CONSTANTS(InType)
+  };
+
+  using InFPBits = typename InConstants::FPBits;
+  using InStorageType = typename InConstants::StorageType;
+
+  InConstants in;
+
+public:
+  using MulFunc = OutType (*)(InType, InType);
+
+  void test_special_numbers(MulFunc func) {
+    EXPECT_FP_IS_NAN(func(aNaN, aNaN));
+    EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
+
+    InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
+    EXPECT_FP_IS_NAN(func(qnan_42, zero));
+    EXPECT_FP_IS_NAN(func(zero, qnan_42));
+
+    EXPECT_FP_EQ(inf, func(inf, InType(1.0)));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, InType(1.0)));
+    EXPECT_FP_EQ(neg_inf, func(inf, InType(-1.0)));
+    EXPECT_FP_EQ(inf, func(neg_inf, InType(-1.0)));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero, zero));
+    EXPECT_FP_EQ_ALL_ROUNDING(zero, func(neg_zero, neg_zero));
+    EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(zero, neg_zero));
+    EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(neg_zero, zero));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(OutType(1.0), func(1.0, 1.0));
+    EXPECT_FP_EQ_ALL_ROUNDING(OutType(15.0), func(3.0, 5.0));
+    EXPECT_FP_EQ_ALL_ROUNDING(OutType(0x1.0p-13), func(0x1.0p+1, 0x1.0p-14));
+    EXPECT_FP_EQ_ALL_ROUNDING(OutType(0x1.0p-10), func(0x1.0p+2, 0x1.0p-12));
+  }
+
+  void test_invalid_operations(MulFunc func) {
+    EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, zero), FE_INVALID);
+    EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_zero), FE_INVALID);
+    EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, zero), FE_INVALID);
+    EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_zero), FE_INVALID);
+  }
+
+  void test_range_errors(MulFunc func) {
+    using namespace LIBC_NAMESPACE::fputil::testing;
+
+    if (ForceRoundingMode r(RoundingMode::Nearest); r.success) {
+      EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+
+      EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero,
+                                  func(in.neg_min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+    }
+
+    if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) {
+      EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal,
+                                  func(neg_max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+
+      EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero,
+                                  func(in.neg_min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+    }
+
+    if (ForceRoundingMode r(RoundingMode::Downward); r.success) {
+      EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+
+      EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_min_denormal,
+                                  func(in.neg_min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+    }
+
+    if (ForceRoundingMode r(RoundingMode::Upward); r.success) {
+      EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal,
+                                  func(neg_max_normal, max_normal),
+                                  FE_OVERFLOW | FE_INEXACT);
+
+      EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal,
+                                  func(in.min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+      EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero,
+                                  func(in.neg_min_denormal, in.min_denormal),
+                                  FE_UNDERFLOW | FE_INEXACT);
+      EXPECT_MATH_ERRNO(ERANGE);
+    }
+  }
+
+  void test_inexact_results(MulFunc func) {
+    InFPBits x_bits = InFPBits::one();
+    x_bits.set_mantissa(InFPBits::SIG_MASK);
+    InType x = x_bits.get_val();
+    func(x, x);
+    EXPECT_FP_EXCEPTION(FE_INEXACT);
+  }
+};
+
+#define LIST_MUL_TESTS(OutType, InType, func)                                  \
+  using LlvmLibcMulTest = MulTest<OutType, InType>;                            \
+  TEST_F(LlvmLibcMulTest, SpecialNumbers) { test_special_numbers(&func); }     \
+  TEST_F(LlvmLibcMulTest, InvalidOperations) {                                 \
+    test_invalid_operations(&func);                                            \
+  }                                                                            \
+  TEST_F(LlvmLibcMulTest, RangeErrors) { test_range_errors(&func); }           \
+  TEST_F(LlvmLibcMulTest, InexactResults) { test_inexact_results(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H
diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h
index a4b3822aca43c..9ee4220b38208 100644
--- a/libc/test/src/math/smoke/SubTest.h
+++ b/libc/test/src/math/smoke/SubTest.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_SUBTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_SUBTEST_H
 
+#include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/BasicOperations.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -37,23 +37,8 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
-    EXPECT_FP_EQ(InType(0x42.0p+0),
-                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
-
-    if constexpr (sizeof(OutType) < sizeof(InType)) {
-      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
-      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
-      EXPECT_FP_EQ(zero,
-                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
-      EXPECT_FP_EQ(InType(0x42.0p+0),
-                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
-    }
+    EXPECT_FP_IS_NAN(func(qnan_42, zero));
+    EXPECT_FP_IS_NAN(func(zero, qnan_42));
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
diff --git a/libc/test/src/math/smoke/TotalOrderMagTest.h b/libc/test/src/math/smoke/TotalOrderMagTest.h
index 5fe2983a0e678..3d7f24fc74d10 100644
--- a/libc/test/src/math/smoke/TotalOrderMagTest.h
+++ b/libc/test/src/math/smoke/TotalOrderMagTest.h
@@ -104,24 +104,24 @@ class TotalOrderMagTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
   }
 
   void testNaNPayloads(TotalOrderMagFunc func) {
-    T qnan_123 = FPBits::quiet_nan(Sign::POS, 0x123).get_val();
-    T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val();
-    T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val();
-    T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val();
+    T qnan_0x42 = FPBits::quiet_nan(Sign::POS, 0x42).get_val();
+    T neg_qnan_0x42 = FPBits::quiet_nan(Sign::NEG, 0x42).get_val();
+    T snan_0x42 = FPBits::signaling_nan(Sign::POS, 0x42).get_val();
+    T neg_snan_0x42 = FPBits::signaling_nan(Sign::NEG, 0x42).get_val();
 
     EXPECT_TRUE(funcWrapper(func, aNaN, aNaN));
     EXPECT_TRUE(funcWrapper(func, sNaN, sNaN));
-    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123));
-    EXPECT_TRUE(funcWrapper(func, sNaN, snan_123));
-    EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN));
-    EXPECT_FALSE(funcWrapper(func, snan_123, sNaN));
+    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, sNaN, snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, qnan_0x42, aNaN));
+    EXPECT_TRUE(funcWrapper(func, snan_0x42, sNaN));
 
     EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN));
     EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN));
-    EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_qnan_123));
-    EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_snan_123));
-    EXPECT_FALSE(funcWrapper(func, neg_qnan_123, neg_aNaN));
-    EXPECT_FALSE(funcWrapper(func, neg_snan_123, neg_sNaN));
+    EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, neg_qnan_0x42, neg_aNaN));
+    EXPECT_TRUE(funcWrapper(func, neg_snan_0x42, neg_sNaN));
   }
 };
 
diff --git a/libc/test/src/math/smoke/TotalOrderTest.h b/libc/test/src/math/smoke/TotalOrderTest.h
index 281b2a59f930d..4d4257d089daf 100644
--- a/libc/test/src/math/smoke/TotalOrderTest.h
+++ b/libc/test/src/math/smoke/TotalOrderTest.h
@@ -102,24 +102,24 @@ class TotalOrderTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
   }
 
   void testNaNPayloads(TotalOrderFunc func) {
-    T qnan_123 = FPBits::quiet_nan(Sign::POS, 0x123).get_val();
-    T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val();
-    T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val();
-    T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val();
+    T qnan_0x42 = FPBits::quiet_nan(Sign::POS, 0x42).get_val();
+    T neg_qnan_0x42 = FPBits::quiet_nan(Sign::NEG, 0x42).get_val();
+    T snan_0x42 = FPBits::signaling_nan(Sign::POS, 0x42).get_val();
+    T neg_snan_0x42 = FPBits::signaling_nan(Sign::NEG, 0x42).get_val();
 
     EXPECT_TRUE(funcWrapper(func, aNaN, aNaN));
     EXPECT_TRUE(funcWrapper(func, sNaN, sNaN));
-    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123));
-    EXPECT_TRUE(funcWrapper(func, sNaN, snan_123));
-    EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN));
-    EXPECT_FALSE(funcWrapper(func, snan_123, sNaN));
+    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, sNaN, snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, qnan_0x42, aNaN));
+    EXPECT_TRUE(funcWrapper(func, snan_0x42, sNaN));
 
     EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN));
     EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN));
-    EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_qnan_123));
-    EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_snan_123));
-    EXPECT_TRUE(funcWrapper(func, neg_qnan_123, neg_aNaN));
-    EXPECT_TRUE(funcWrapper(func, neg_snan_123, neg_sNaN));
+    EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_qnan_0x42));
+    EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_snan_0x42));
+    EXPECT_TRUE(funcWrapper(func, neg_qnan_0x42, neg_aNaN));
+    EXPECT_FALSE(funcWrapper(func, neg_snan_0x42, neg_sNaN));
   }
 };
 
diff --git a/libc/test/src/math/smoke/atan2_test.cpp b/libc/test/src/math/smoke/atan2_test.cpp
new file mode 100644
index 0000000000000..61dd6cab1049f
--- /dev/null
+++ b/libc/test/src/math/smoke/atan2_test.cpp
@@ -0,0 +1,22 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+
+TEST_F(LlvmLibcAtan2Test, SpecialNumbers) {
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(aNaN, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(1.0, aNaN));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(zero, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-0.0, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(1.0, inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-1.0, inf));
+}
diff --git a/libc/test/src/math/smoke/daddf128_test.cpp b/libc/test/src/math/smoke/daddf128_test.cpp
new file mode 100644
index 0000000000000..ca0bbffe73451
--- /dev/null
+++ b/libc/test/src/math/smoke/daddf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for daddf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/daddf128.h"
+
+LIST_ADD_TESTS(double, float128, LIBC_NAMESPACE::daddf128)
diff --git a/libc/test/src/math/smoke/daddl_test.cpp b/libc/test/src/math/smoke/daddl_test.cpp
new file mode 100644
index 0000000000000..7a34d962d5207
--- /dev/null
+++ b/libc/test/src/math/smoke/daddl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for daddl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/daddl.h"
+
+LIST_ADD_TESTS(double, long double, LIBC_NAMESPACE::daddl)
diff --git a/libc/test/src/math/smoke/ddivf128_test.cpp b/libc/test/src/math/smoke/ddivf128_test.cpp
new file mode 100644
index 0000000000000..b5f39154b0e84
--- /dev/null
+++ b/libc/test/src/math/smoke/ddivf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ddivf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DivTest.h"
+
+#include "src/math/ddivf128.h"
+
+LIST_DIV_TESTS(double, float128, LIBC_NAMESPACE::ddivf128)
diff --git a/libc/test/src/math/smoke/dfmaf128_test.cpp b/libc/test/src/math/smoke/dfmaf128_test.cpp
new file mode 100644
index 0000000000000..56c11747047b1
--- /dev/null
+++ b/libc/test/src/math/smoke/dfmaf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dfmaf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/dfmaf128.h"
+
+LIST_NARROWING_FMA_TESTS(double, float128, LIBC_NAMESPACE::dfmaf128)
diff --git a/libc/test/src/math/smoke/dfmal_test.cpp b/libc/test/src/math/smoke/dfmal_test.cpp
new file mode 100644
index 0000000000000..3c38f5eb7db9d
--- /dev/null
+++ b/libc/test/src/math/smoke/dfmal_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dfmal -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/dfmal.h"
+
+LIST_NARROWING_FMA_TESTS(double, long double, LIBC_NAMESPACE::dfmal)
diff --git a/libc/test/src/math/smoke/dmulf128_test.cpp b/libc/test/src/math/smoke/dmulf128_test.cpp
new file mode 100644
index 0000000000000..2ee2d10b9a19b
--- /dev/null
+++ b/libc/test/src/math/smoke/dmulf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dmulf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/dmulf128.h"
+
+LIST_MUL_TESTS(double, float128, LIBC_NAMESPACE::dmulf128)
diff --git a/libc/test/src/math/smoke/dmull_test.cpp b/libc/test/src/math/smoke/dmull_test.cpp
new file mode 100644
index 0000000000000..1b9c9c2c24ed3
--- /dev/null
+++ b/libc/test/src/math/smoke/dmull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dmull -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/dmull.h"
+
+LIST_MUL_TESTS(double, long double, LIBC_NAMESPACE::dmull)
diff --git a/libc/test/src/math/smoke/dsqrtf128_test.cpp b/libc/test/src/math/smoke/dsqrtf128_test.cpp
new file mode 100644
index 0000000000000..6f98d1cfc6042
--- /dev/null
+++ b/libc/test/src/math/smoke/dsqrtf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsqrtf128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/dsqrtf128.h"
+
+LIST_NARROWING_SQRT_TESTS(double, float128, LIBC_NAMESPACE::dsqrtf128)
diff --git a/libc/test/src/math/smoke/dsqrtl_test.cpp b/libc/test/src/math/smoke/dsqrtl_test.cpp
new file mode 100644
index 0000000000000..c178e389a57cd
--- /dev/null
+++ b/libc/test/src/math/smoke/dsqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsqrtl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/dsqrtl.h"
+
+LIST_NARROWING_SQRT_TESTS(double, long double, LIBC_NAMESPACE::dsqrtl)
diff --git a/libc/test/src/math/smoke/dsubf128_test.cpp b/libc/test/src/math/smoke/dsubf128_test.cpp
new file mode 100644
index 0000000000000..e496cdd245070
--- /dev/null
+++ b/libc/test/src/math/smoke/dsubf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubf128.h"
+
+LIST_SUB_TESTS(double, float128, LIBC_NAMESPACE::dsubf128)
diff --git a/libc/test/src/math/smoke/dsubl_test.cpp b/libc/test/src/math/smoke/dsubl_test.cpp
new file mode 100644
index 0000000000000..98846e0b6e3b3
--- /dev/null
+++ b/libc/test/src/math/smoke/dsubl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubl.h"
+
+LIST_SUB_TESTS(double, long double, LIBC_NAMESPACE::dsubl)
diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp
new file mode 100644
index 0000000000000..969870fe247bc
--- /dev/null
+++ b/libc/test/src/math/smoke/expf16_test.cpp
@@ -0,0 +1,66 @@
+//===-- Unittests for expf16 ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/expf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf16(sNaN), FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::expf16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(zero),
+                            LIBC_NAMESPACE::expf16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f),
+                            LIBC_NAMESPACE::expf16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f),
+                            LIBC_NAMESPACE::expf16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExpf16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal),
+                              FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, LIBC_NAMESPACE::expf16(static_cast<float16>(12.0)), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
+
+TEST_F(LlvmLibcExpf16Test, Underflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal),
+                              FE_UNDERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      zero, LIBC_NAMESPACE::expf16(static_cast<float16>(-18.0)),
+      FE_UNDERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/f16mul_test.cpp b/libc/test/src/math/smoke/f16mul_test.cpp
new file mode 100644
index 0000000000000..49b443870c483
--- /dev/null
+++ b/libc/test/src/math/smoke/f16mul_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mul ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mul.h"
+
+LIST_MUL_TESTS(float16, double, LIBC_NAMESPACE::f16mul)
diff --git a/libc/test/src/math/smoke/f16mulf128_test.cpp b/libc/test/src/math/smoke/f16mulf128_test.cpp
new file mode 100644
index 0000000000000..46e52cf068a7b
--- /dev/null
+++ b/libc/test/src/math/smoke/f16mulf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mulf128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mulf128.h"
+
+LIST_MUL_TESTS(float16, float128, LIBC_NAMESPACE::f16mulf128)
diff --git a/libc/test/src/math/smoke/f16mulf_test.cpp b/libc/test/src/math/smoke/f16mulf_test.cpp
new file mode 100644
index 0000000000000..bf2530863621d
--- /dev/null
+++ b/libc/test/src/math/smoke/f16mulf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mulf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mulf.h"
+
+LIST_MUL_TESTS(float16, float, LIBC_NAMESPACE::f16mulf)
diff --git a/libc/test/src/math/smoke/f16mull_test.cpp b/libc/test/src/math/smoke/f16mull_test.cpp
new file mode 100644
index 0000000000000..5292ddb87b7f4
--- /dev/null
+++ b/libc/test/src/math/smoke/f16mull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16mull ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/f16mull.h"
+
+LIST_MUL_TESTS(float16, long double, LIBC_NAMESPACE::f16mull)
diff --git a/libc/test/src/math/smoke/fadd_test.cpp b/libc/test/src/math/smoke/fadd_test.cpp
new file mode 100644
index 0000000000000..fe9ac8b252ed3
--- /dev/null
+++ b/libc/test/src/math/smoke/fadd_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fadd ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/fadd.h"
+
+LIST_ADD_TESTS(float, double, LIBC_NAMESPACE::fadd)
diff --git a/libc/test/src/math/smoke/fmul_test.cpp b/libc/test/src/math/smoke/fmul_test.cpp
index 0eb664f7411ee..3f6df66456bac 100644
--- a/libc/test/src/math/smoke/fmul_test.cpp
+++ b/libc/test/src/math/smoke/fmul_test.cpp
@@ -1,13 +1,13 @@
-//===-- Unittests for fmul-------------------------------------------------===//
+//===-- Unittests for fmul ------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 
-#include "FMulTest.h"
+#include "MulTest.h"
 
 #include "src/math/fmul.h"
 
-LIST_FMUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
+LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
diff --git a/libc/test/src/math/smoke/fmulf128_test.cpp b/libc/test/src/math/smoke/fmulf128_test.cpp
new file mode 100644
index 0000000000000..37c8d1cf9908d
--- /dev/null
+++ b/libc/test/src/math/smoke/fmulf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmulf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/fmulf128.h"
+
+LIST_MUL_TESTS(float, float128, LIBC_NAMESPACE::fmulf128)
diff --git a/libc/test/src/math/smoke/fmull_test.cpp b/libc/test/src/math/smoke/fmull_test.cpp
new file mode 100644
index 0000000000000..ef694063f20dc
--- /dev/null
+++ b/libc/test/src/math/smoke/fmull_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmull -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MulTest.h"
+
+#include "src/math/fmull.h"
+
+LIST_MUL_TESTS(float, long double, LIBC_NAMESPACE::fmull)
diff --git a/libc/test/src/math/smoke/fsqrt_test.cpp b/libc/test/src/math/smoke/fsqrt_test.cpp
new file mode 100644
index 0000000000000..8471e3c9365e9
--- /dev/null
+++ b/libc/test/src/math/smoke/fsqrt_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fsqrt -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/fsqrt.h"
+
+LIST_NARROWING_SQRT_TESTS(float, double, LIBC_NAMESPACE::fsqrt)
diff --git a/libc/test/src/math/smoke/fsqrtf128_test.cpp b/libc/test/src/math/smoke/fsqrtf128_test.cpp
new file mode 100644
index 0000000000000..b125a35b09b9e
--- /dev/null
+++ b/libc/test/src/math/smoke/fsqrtf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fsqrtf128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/fsqrtf128.h"
+
+LIST_NARROWING_SQRT_TESTS(float, float128, LIBC_NAMESPACE::fsqrtf128)
diff --git a/libc/test/src/math/smoke/fsqrtl_test.cpp b/libc/test/src/math/smoke/fsqrtl_test.cpp
new file mode 100644
index 0000000000000..1082a33466857
--- /dev/null
+++ b/libc/test/src/math/smoke/fsqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fsqrtl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/fsqrtl.h"
+
+LIST_NARROWING_SQRT_TESTS(float, long double, LIBC_NAMESPACE::fsqrtl)
diff --git a/libc/test/src/math/smoke/getpayload_test.cpp b/libc/test/src/math/smoke/getpayload_test.cpp
new file mode 100644
index 0000000000000..f157d4572d09a
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayload_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayload ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayload.h"
+
+LIST_GETPAYLOAD_TESTS(double, LIBC_NAMESPACE::getpayload)
diff --git a/libc/test/src/math/smoke/getpayloadf128_test.cpp b/libc/test/src/math/smoke/getpayloadf128_test.cpp
new file mode 100644
index 0000000000000..37bb506a3ed16
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayloadf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayloadf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayloadf128.h"
+
+LIST_GETPAYLOAD_TESTS(float128, LIBC_NAMESPACE::getpayloadf128)
diff --git a/libc/test/src/math/smoke/getpayloadf_test.cpp b/libc/test/src/math/smoke/getpayloadf_test.cpp
new file mode 100644
index 0000000000000..89ed02487df04
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayloadf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayloadf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayloadf.h"
+
+LIST_GETPAYLOAD_TESTS(float, LIBC_NAMESPACE::getpayloadf)
diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp
index 2ddef58325671..68c844181a194 100644
--- a/libc/test/src/math/smoke/nan_test.cpp
+++ b/libc/test/src/math/smoke/nan_test.cpp
@@ -43,7 +43,7 @@ TEST_F(LlvmLibcNanTest, RandomString) {
   run_test("123 ", 0x7ff8000000000000);
 }
 
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
 TEST_F(LlvmLibcNanTest, InvalidInput) {
   EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV));
 }
diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp
index 8c15c532ebcf4..015cc31e4be23 100644
--- a/libc/test/src/math/smoke/nanf128_test.cpp
+++ b/libc/test/src/math/smoke/nanf128_test.cpp
@@ -53,7 +53,7 @@ TEST_F(LlvmLibcNanf128Test, RandomString) {
            QUIET_NAN);
 }
 
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
 #include <signal.h>
 TEST_F(LlvmLibcNanf128Test, InvalidInput) {
   EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV));
diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp
index ec17a73d881aa..81b844bf6bb59 100644
--- a/libc/test/src/math/smoke/nanf16_test.cpp
+++ b/libc/test/src/math/smoke/nanf16_test.cpp
@@ -44,7 +44,7 @@ TEST_F(LlvmLibcNanf16Test, RandomString) {
   run_test("123 ", 0x7e00);
 }
 
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
 TEST_F(LlvmLibcNanf16Test, InvalidInput) {
   EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV));
 }
diff --git a/libc/test/src/math/smoke/nanf_test.cpp b/libc/test/src/math/smoke/nanf_test.cpp
index 71f888c610aaf..ff5823685225c 100644
--- a/libc/test/src/math/smoke/nanf_test.cpp
+++ b/libc/test/src/math/smoke/nanf_test.cpp
@@ -42,7 +42,7 @@ TEST_F(LlvmLibcNanfTest, RandomString) {
   run_test("123 ", 0x7fc00000);
 }
 
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
 TEST_F(LlvmLibcNanfTest, InvalidInput) {
   EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV));
 }
diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp
index 7fff20b1e7be3..de9af05100c10 100644
--- a/libc/test/src/math/smoke/nanl_test.cpp
+++ b/libc/test/src/math/smoke/nanl_test.cpp
@@ -70,7 +70,7 @@ TEST_F(LlvmLibcNanlTest, RandomString) {
   run_test("123 ", expected);
 }
 
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
 TEST_F(LlvmLibcNanlTest, InvalidInput) {
   EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV));
 }
diff --git a/libc/test/src/math/smoke/setpayload_test.cpp b/libc/test/src/math/smoke/setpayload_test.cpp
new file mode 100644
index 0000000000000..e41b3f86c5f17
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayload_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayload ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayload.h"
+
+LIST_SETPAYLOAD_TESTS(double, LIBC_NAMESPACE::setpayload)
diff --git a/libc/test/src/math/smoke/setpayloadf128_test.cpp b/libc/test/src/math/smoke/setpayloadf128_test.cpp
new file mode 100644
index 0000000000000..4b17bfee093c0
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayloadf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayloadf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayloadf128.h"
+
+LIST_SETPAYLOAD_TESTS(float128, LIBC_NAMESPACE::setpayloadf128)
diff --git a/libc/test/src/math/smoke/setpayloadf_test.cpp b/libc/test/src/math/smoke/setpayloadf_test.cpp
new file mode 100644
index 0000000000000..51e285f171843
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayloadf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayloadf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayloadf.h"
+
+LIST_SETPAYLOAD_TESTS(float, LIBC_NAMESPACE::setpayloadf)
diff --git a/libc/test/src/math/smoke/totalorder_test.cpp b/libc/test/src/math/smoke/totalorder_test.cpp
new file mode 100644
index 0000000000000..21f49c31f50fe
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorder_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorder ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorder.h"
+
+LIST_TOTALORDER_TESTS(double, LIBC_NAMESPACE::totalorder)
diff --git a/libc/test/src/math/smoke/totalorderf128_test.cpp b/libc/test/src/math/smoke/totalorderf128_test.cpp
new file mode 100644
index 0000000000000..6f7fd6a94fcf9
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorderf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorderf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorderf128.h"
+
+LIST_TOTALORDER_TESTS(float128, LIBC_NAMESPACE::totalorderf128)
diff --git a/libc/test/src/math/smoke/totalorderf_test.cpp b/libc/test/src/math/smoke/totalorderf_test.cpp
new file mode 100644
index 0000000000000..71db87c0c2415
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorderf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorderf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorderf.h"
+
+LIST_TOTALORDER_TESTS(float, LIBC_NAMESPACE::totalorderf)
diff --git a/libc/test/src/math/smoke/totalordermag_test.cpp b/libc/test/src/math/smoke/totalordermag_test.cpp
new file mode 100644
index 0000000000000..6c3b6ddbd7422
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermag_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermag ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermag.h"
+
+LIST_TOTALORDERMAG_TESTS(double, LIBC_NAMESPACE::totalordermag)
diff --git a/libc/test/src/math/smoke/totalordermagf128_test.cpp b/libc/test/src/math/smoke/totalordermagf128_test.cpp
new file mode 100644
index 0000000000000..6cbe24ff8778f
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagf128 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagf128.h"
+
+LIST_TOTALORDERMAG_TESTS(float128, LIBC_NAMESPACE::totalordermagf128)
diff --git a/libc/test/src/math/smoke/totalordermagf_test.cpp b/libc/test/src/math/smoke/totalordermagf_test.cpp
new file mode 100644
index 0000000000000..c54cea21a52c7
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagf --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagf.h"
+
+LIST_TOTALORDERMAG_TESTS(float, LIBC_NAMESPACE::totalordermagf)
diff --git a/libc/test/src/math/smoke/totalordermagl_test.cpp b/libc/test/src/math/smoke/totalordermagl_test.cpp
new file mode 100644
index 0000000000000..bdb9f616aded7
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagl --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagl.h"
+
+LIST_TOTALORDERMAG_TESTS(long double, LIBC_NAMESPACE::totalordermagl)
diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp
index 80d57939a4f61..1ca67afdaddf2 100644
--- a/libc/test/src/math/tan_test.cpp
+++ b/libc/test/src/math/tan_test.cpp
@@ -75,12 +75,12 @@ TEST_F(LlvmLibcTanTest, InDoubleRange) {
 
     for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
       double x = FPBits(v).get_val();
-      if (isnan(x) || isinf(x))
+      if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
 
       double result = LIBC_NAMESPACE::tan(x);
       ++total;
-      if (isnan(result) || isinf(result))
+      if (FPBits(result).is_nan() || FPBits(result).is_inf())
         continue;
 
       ++tested;
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index e624d30f1e00f..9b9e1278cf563 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcTanfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
                                    LIBC_NAMESPACE::tanf(x), 0.5);
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index c34efe8d733be..2e74984ad9549 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcTanhfTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
                                    LIBC_NAMESPACE::tanhf(x), 0.5);
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 5eb8c9577893b..8b05b928a0269 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -191,6 +191,19 @@ add_libc_test(
     libc.src.stdio.printf
 )
 
+add_libc_test(
+   asprintf_test
+   SUITE
+     libc_stdio_unittests
+   SRCS
+     asprintf_test.cpp
+   DEPENDS
+     libc.src.stdio.asprintf
+     libc.src.string.memset
+     libc.include.stdlib
+     libc.src.stdio.sprintf
+ )
+
 add_fp_unittest(
   vsprintf_test
   UNIT_TEST_ONLY
@@ -236,6 +249,18 @@ add_libc_test(
     libc.src.stdio.vprintf
 )
 
+add_libc_test(
+   vasprintf_test
+   SUITE
+     libc_stdio_unittests
+   SRCS
+     vasprintf_test.cpp
+   DEPENDS
+     libc.src.stdio.vasprintf
+     libc.src.string.memset
+     libc.include.stdlib
+     libc.src.stdio.sprintf
+ )
 
 if(LLVM_LIBC_FULL_BUILD)
   # In fullbuild mode, fscanf's tests use the internal FILE for other functions.
@@ -261,6 +286,13 @@ add_libc_test(
     ${use_system_file}
 )
 
+if(LIBC_CONF_SCANF_DISABLE_FLOAT)
+  list(APPEND sscanf_test_copts "-DLIBC_COPT_SCANF_DISABLE_FLOAT")
+endif()
+if(LIBC_CONF_SCANF_DISABLE_INDEX_MODE)
+  list(APPEND sscanf_test_copts "-DLIBC_COPT_SCANF_DISABLE_INDEX_MODE")
+endif()
+
 add_libc_test(
   sscanf_test
   SUITE
@@ -271,6 +303,22 @@ add_libc_test(
     libc.src.stdio.sscanf
   LINK_LIBRARIES
     LibcFPTestHelpers
+  COMPILE_OPTIONS
+    ${sscanf_test_copts}
+)
+
+add_libc_test(
+  vsscanf_test
+  SUITE
+    libc_stdio_unittests
+  SRCS
+    vsscanf_test.cpp
+  DEPENDS
+    libc.src.stdio.vsscanf
+  LINK_LIBRARIES
+    LibcFPTestHelpers
+  COMPILE_OPTIONS
+    ${sscanf_test_copts}
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/asprintf_test.cpp b/libc/test/src/stdio/asprintf_test.cpp
new file mode 100644
index 0000000000000..9292cebb80e24
--- /dev/null
+++ b/libc/test/src/stdio/asprintf_test.cpp
@@ -0,0 +1,87 @@
+//===-- Unittests for asprintf--------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/asprintf.h"
+#include "src/stdio/sprintf.h"
+#include "src/string/memset.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcASPrintfTest, SimpleNoConv) {
+  char *buff = nullptr;
+  int written;
+  written =
+      LIBC_NAMESPACE::asprintf(&buff, "A simple string with no conversions.");
+  EXPECT_EQ(written, 36);
+  ASSERT_STREQ(buff, "A simple string with no conversions.");
+  free(buff);
+}
+
+TEST(LlvmLibcASPrintfTest, PercentConv) {
+  char *buff = nullptr;
+  int written;
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "%%");
+  EXPECT_EQ(written, 1);
+  ASSERT_STREQ(buff, "%");
+  free(buff);
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "abc %% def");
+  EXPECT_EQ(written, 9);
+  ASSERT_STREQ(buff, "abc % def");
+  free(buff);
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "%%%%%%");
+  EXPECT_EQ(written, 3);
+  ASSERT_STREQ(buff, "%%%");
+  free(buff);
+}
+
+TEST(LlvmLibcASPrintfTest, CharConv) {
+  char *buff = nullptr;
+  int written;
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "%c", 'a');
+  EXPECT_EQ(written, 1);
+  ASSERT_STREQ(buff, "a");
+  free(buff);
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "%3c %-3c", '1', '2');
+  EXPECT_EQ(written, 7);
+  ASSERT_STREQ(buff, "  1 2  ");
+  free(buff);
+
+  written = LIBC_NAMESPACE::asprintf(&buff, "%*c", 2, '3');
+  EXPECT_EQ(written, 2);
+  ASSERT_STREQ(buff, " 3");
+  free(buff);
+}
+
+TEST(LlvmLibcASPrintfTest, LargeStringNoConv) {
+  char *buff = nullptr;
+  char long_str[1001];
+  LIBC_NAMESPACE::memset(long_str, 'a', 1000);
+  long_str[1000] = '\0';
+  int written;
+  written = LIBC_NAMESPACE::asprintf(&buff, long_str);
+  EXPECT_EQ(written, 1000);
+  ASSERT_STREQ(buff, long_str);
+  free(buff);
+}
+
+TEST(LlvmLibcASPrintfTest, ManyReAlloc) {
+  char *buff = nullptr;
+  char long_str[1001];
+  auto expected_num_chars =
+      LIBC_NAMESPACE::sprintf(long_str, "%200s%200s%200s", "a", "b", "c");
+  long_str[expected_num_chars] = '\0';
+  int written;
+  written = LIBC_NAMESPACE::asprintf(&buff, long_str);
+  EXPECT_EQ(written, expected_num_chars);
+  ASSERT_STREQ(buff, long_str);
+  free(buff);
+}
diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp
index 741815bb15171..59be4e6de6ed6 100644
--- a/libc/test/src/stdio/sscanf_test.cpp
+++ b/libc/test/src/stdio/sscanf_test.cpp
@@ -226,6 +226,7 @@ TEST(LlvmLibcSScanfTest, IntConvNoWriteTests) {
   EXPECT_EQ(result, 0);
 }
 
+#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
 TEST(LlvmLibcSScanfTest, FloatConvSimple) {
   int ret_val;
   float result = 0;
@@ -580,7 +581,9 @@ TEST(LlvmLibcSScanfTest, FloatConvNoWrite) {
   ret_val = LIBC_NAMESPACE::sscanf("Not a float", "%*f", &result);
   EXPECT_EQ(ret_val, 0);
 }
+#endif
 
+#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
 TEST(LlvmLibcSScanfTest, CurPosCombined) {
   int ret_val;
   int result = -1;
@@ -628,6 +631,7 @@ TEST(LlvmLibcSScanfTest, CurPosCombined) {
   EXPECT_EQ(ret_val, 1);
   EXPECT_EQ(result, 320);
 }
+#endif
 
 TEST(LlvmLibcSScanfTest, PointerConvCombined) {
   int ret_val;
diff --git a/libc/test/src/stdio/vasprintf_test.cpp b/libc/test/src/stdio/vasprintf_test.cpp
new file mode 100644
index 0000000000000..2eb1be3d7a9bf
--- /dev/null
+++ b/libc/test/src/stdio/vasprintf_test.cpp
@@ -0,0 +1,99 @@
+//===-- Unittests for vasprintf--------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/sprintf.h"
+#include "src/stdio/vasprintf.h"
+#include "src/string/memset.h"
+#include "test/UnitTest/Test.h"
+
+int call_vasprintf(char **__restrict buffer, const char *__restrict format,
+                   ...) {
+  va_list vlist;
+  va_start(vlist, format);
+  int ret = LIBC_NAMESPACE::vasprintf(buffer, format, vlist);
+  va_end(vlist);
+  return ret;
+}
+
+TEST(LlvmLibcVASPrintfTest, SimpleNoConv) {
+  char *buff = nullptr;
+  int written;
+  written = call_vasprintf(&buff, "A simple string with no conversions.");
+  EXPECT_EQ(written, 36);
+  ASSERT_STREQ(buff, "A simple string with no conversions.");
+  free(buff);
+}
+
+TEST(LlvmLibcVASPrintfTest, PercentConv) {
+  char *buff = nullptr;
+  int written;
+
+  written = call_vasprintf(&buff, "%%");
+  EXPECT_EQ(written, 1);
+  ASSERT_STREQ(buff, "%");
+  free(buff);
+
+  written = call_vasprintf(&buff, "abc %% def");
+  EXPECT_EQ(written, 9);
+  ASSERT_STREQ(buff, "abc % def");
+  free(buff);
+
+  written = call_vasprintf(&buff, "%%%%%%");
+  EXPECT_EQ(written, 3);
+  ASSERT_STREQ(buff, "%%%");
+  free(buff);
+}
+
+TEST(LlvmLibcVASPrintfTest, CharConv) {
+  char *buff = nullptr;
+  int written;
+
+  written = call_vasprintf(&buff, "%c", 'a');
+  EXPECT_EQ(written, 1);
+  ASSERT_STREQ(buff, "a");
+  free(buff);
+
+  written = call_vasprintf(&buff, "%3c %-3c", '1', '2');
+  EXPECT_EQ(written, 7);
+  ASSERT_STREQ(buff, "  1 2  ");
+  free(buff);
+
+  written = call_vasprintf(&buff, "%*c", 2, '3');
+  EXPECT_EQ(written, 2);
+  ASSERT_STREQ(buff, " 3");
+  free(buff);
+}
+
+TEST(LlvmLibcVASPrintfTest, LargeStringNoConv) {
+  char *buff = nullptr;
+  char long_str[1001];
+  LIBC_NAMESPACE::memset(long_str, 'a', 1000);
+  long_str[1000] = '\0';
+  int written;
+  written = call_vasprintf(&buff, long_str);
+  EXPECT_EQ(written, 1000);
+  ASSERT_STREQ(buff, long_str);
+  free(buff);
+}
+
+TEST(LlvmLibcVASPrintfTest, ManyReAlloc) {
+  char *buff = nullptr;
+  const int expected_num_chars = 600;
+  int written = call_vasprintf(&buff, "%200s%200s%200s", "", "", "");
+  EXPECT_EQ(written, expected_num_chars);
+
+  bool isPadding = true;
+  for (int i = 0; i < expected_num_chars; i++) {
+    if (buff[i] != ' ') {
+      isPadding = false;
+      break;
+    }
+  }
+  EXPECT_TRUE(isPadding);
+  free(buff);
+}
diff --git a/libc/test/src/stdio/vsscanf_test.cpp b/libc/test/src/stdio/vsscanf_test.cpp
new file mode 100644
index 0000000000000..4194e10f0602c
--- /dev/null
+++ b/libc/test/src/stdio/vsscanf_test.cpp
@@ -0,0 +1,159 @@
+//===-- Unittests for sscanf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vsscanf.h"
+
+#include "test/UnitTest/Test.h"
+
+int call_vsscanf(const char *__restrict buffer, const char *__restrict format,
+                 ...) {
+  va_list vlist;
+  va_start(vlist, format);
+  int ret = LIBC_NAMESPACE::vsscanf(buffer, format, vlist);
+  va_end(vlist);
+  return ret;
+}
+
+TEST(LlvmLibcVSScanfTest, SimpleStringConv) {
+  int ret_val;
+  char buffer[10];
+  char buffer2[10];
+  ret_val = call_vsscanf("abc123", "abc %s", buffer);
+  ASSERT_EQ(ret_val, 1);
+  ASSERT_STREQ(buffer, "123");
+
+  ret_val = call_vsscanf("abc123", "%3s %3s", buffer, buffer2);
+  ASSERT_EQ(ret_val, 2);
+  ASSERT_STREQ(buffer, "abc");
+  ASSERT_STREQ(buffer2, "123");
+
+  ret_val = call_vsscanf("abc 123", "%3s%3s", buffer, buffer2);
+  ASSERT_EQ(ret_val, 2);
+  ASSERT_STREQ(buffer, "abc");
+  ASSERT_STREQ(buffer2, "123");
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvSimple) {
+  int ret_val;
+  int result = 0;
+  ret_val = call_vsscanf("123", "%d", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 123);
+
+  ret_val = call_vsscanf("456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 456);
+
+  ret_val = call_vsscanf("789", "%x", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0x789);
+
+  ret_val = call_vsscanf("012", "%o", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 012);
+
+  ret_val = call_vsscanf("345", "%u", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 345);
+
+  // 288 characters
+  ret_val = call_vsscanf("10000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000"
+                         "00000000000000000000000000000000",
+                         "%d", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, int(LIBC_NAMESPACE::cpp::numeric_limits<intmax_t>::max()));
+
+  ret_val = call_vsscanf("Not an integer", "%d", &result);
+  EXPECT_EQ(ret_val, 0);
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvLengthModifier) {
+  int ret_val;
+  uintmax_t max_result = 0;
+  int int_result = 0;
+  char char_result = 0;
+
+  ret_val = call_vsscanf("123", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, uintmax_t(123));
+
+  // Check overflow handling
+  ret_val =
+      call_vsscanf("999999999999999999999999999999999999", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+  // Because this is unsigned, any out of range value should return the maximum,
+  // even with a negative sign.
+  ret_val =
+      call_vsscanf("-999999999999999999999999999999999999", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+  ret_val = call_vsscanf("-18446744073709551616", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+  // But any number below the maximum should have the - sign applied.
+  ret_val = call_vsscanf("-1", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, uintmax_t(-1));
+
+  ret_val = call_vsscanf("-1", "%u", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, -1);
+
+  max_result = 0xff00ff00ff00ff00;
+  char_result = 0x6f;
+
+  // Overflows for sizes larger than the maximum are handled by casting.
+  ret_val = call_vsscanf("8589967360", "%d", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, int(8589967360)); // 2^33 + 2^15
+
+  // Check that the adjacent values weren't touched by the overflow.
+  ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+  ASSERT_EQ(char_result, char(0x6f));
+
+  ret_val = call_vsscanf("-8589967360", "%d", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, int(-8589967360));
+  ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+  ASSERT_EQ(char_result, char(0x6f));
+
+  ret_val = call_vsscanf("25", "%hhd", &char_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(char_result, char(25));
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvBaseSelection) {
+  int ret_val;
+  int result = 0;
+  ret_val = call_vsscanf("0xabc123", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0xabc123);
+
+  ret_val = call_vsscanf("0456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0456);
+
+  ret_val = call_vsscanf("0999", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = call_vsscanf("123abc456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 123);
+}
diff --git a/libc/test/src/sys/epoll/linux/epoll_create_test.cpp b/libc/test/src/sys/epoll/linux/epoll_create_test.cpp
index fdcdcf8eb4271..9c4bad10c8384 100644
--- a/libc/test/src/sys/epoll/linux/epoll_create_test.cpp
+++ b/libc/test/src/sys/epoll/linux/epoll_create_test.cpp
@@ -10,6 +10,7 @@
 #include "src/unistd/close.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
+#include <sys/syscall.h> // For syscall numbers.
 
 using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
@@ -21,6 +22,8 @@ TEST(LlvmLibcEpollCreateTest, Basic) {
   ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds());
 }
 
+#ifdef SYS_epoll_create
 TEST(LlvmLibcEpollCreateTest, Fails) {
   ASSERT_THAT(LIBC_NAMESPACE::epoll_create(0), Fails(EINVAL));
 }
+#endif
diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
index 0895c33167151..8cb5f867453e4 100644
--- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
@@ -9,10 +9,16 @@
 #include <linux/magic.h>
 using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
+#ifdef SYS_statfs64
+using StatFs = statfs64;
+#else
+using StatFs = statfs;
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
-static int fstatfs(int fd, struct statfs *buf) {
+static int fstatfs(int fd, StatFs *buf) {
   using namespace statfs_utils;
-  if (cpp::optional<LinuxStatFs> result = linux_fstatfs(fd)) {
+  if (cpp::optional<StatFs> result = linux_fstatfs(fd)) {
     *buf = *result;
     return 0;
   }
@@ -29,7 +35,7 @@ struct PathFD {
 };
 
 TEST(LlvmLibcSysStatvfsTest, FstatfsBasic) {
-  struct statfs buf;
+  StatFs buf;
   ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/"), &buf), Succeeds());
   ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/proc"), &buf), Succeeds());
   ASSERT_EQ(buf.f_type, static_cast<decltype(buf.f_type)>(PROC_SUPER_MAGIC));
diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
index 6719c1ab26865..5329adb54d64d 100644
--- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
@@ -6,8 +6,14 @@
 #include <linux/magic.h>
 using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
+#ifdef SYS_statfs64
+using StatFs = statfs64;
+#else
+using StatFs = statfs;
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
-static int statfs(const char *path, struct statfs *buf) {
+static int statfs(const char *path, StatFs *buf) {
   using namespace statfs_utils;
   if (cpp::optional<LinuxStatFs> result = linux_statfs(path)) {
     *buf = *result;
@@ -18,7 +24,7 @@ static int statfs(const char *path, struct statfs *buf) {
 } // namespace LIBC_NAMESPACE_DECL
 
 TEST(LlvmLibcSysStatfsTest, StatfsBasic) {
-  struct statfs buf;
+  StatFs buf;
   ASSERT_THAT(LIBC_NAMESPACE::statfs("/", &buf), Succeeds());
   ASSERT_THAT(LIBC_NAMESPACE::statfs("/proc", &buf), Succeeds());
   ASSERT_EQ(buf.f_type, static_cast<decltype(buf.f_type)>(PROC_SUPER_MAGIC));
diff --git a/libc/test/utils/FPUtil/x86_long_double_test.cpp b/libc/test/utils/FPUtil/x86_long_double_test.cpp
index 87796b5c9f5ba..8d16869b2e816 100644
--- a/libc/test/utils/FPUtil/x86_long_double_test.cpp
+++ b/libc/test/utils/FPUtil/x86_long_double_test.cpp
@@ -27,8 +27,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
     // If exponent has the max value and the implicit bit is 0,
     // then the number is a NaN for all values of mantissa.
     bits.set_mantissa(i);
-    long double nan = bits.get_val();
-    ASSERT_NE(static_cast<int>(isnan(nan)), 0);
     ASSERT_TRUE(bits.is_nan());
   }
 
@@ -38,8 +36,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
     // then the number is a NaN for all non-zero values of mantissa.
     // Note the initial value of |i| of 1 to avoid a zero mantissa.
     bits.set_mantissa(i);
-    long double nan = bits.get_val();
-    ASSERT_NE(static_cast<int>(isnan(nan)), 0);
     ASSERT_TRUE(bits.is_nan());
   }
 
@@ -49,8 +45,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
     // If exponent is non-zero and also not max, and the implicit bit is 0,
     // then the number is a NaN for all values of mantissa.
     bits.set_mantissa(i);
-    long double nan = bits.get_val();
-    ASSERT_NE(static_cast<int>(isnan(nan)), 0);
     ASSERT_TRUE(bits.is_nan());
   }
 
@@ -60,8 +54,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
     // If exponent is non-zero and also not max, and the implicit bit is 1,
     // then the number is normal value for all values of mantissa.
     bits.set_mantissa(i);
-    long double valid = bits.get_val();
-    ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
     ASSERT_FALSE(bits.is_nan());
   }
 
@@ -70,8 +62,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
   for (unsigned int i = 0; i < COUNT; ++i) {
     // If exponent is zero, then the number is a valid but denormal value.
     bits.set_mantissa(i);
-    long double valid = bits.get_val();
-    ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
     ASSERT_FALSE(bits.is_nan());
   }
 
@@ -80,8 +70,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
   for (unsigned int i = 0; i < COUNT; ++i) {
     // If exponent is zero, then the number is a valid but denormal value.
     bits.set_mantissa(i);
-    long double valid = bits.get_val();
-    ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
     ASSERT_FALSE(bits.is_nan());
   }
 }
diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
index e74b02204ed6f..be0415d0956fa 100644
--- a/libc/utils/MPFRWrapper/CMakeLists.txt
+++ b/libc/utils/MPFRWrapper/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(LIBC_TESTS_CAN_USE_MPFR)
-  add_library(libcMPFRWrapper
+  add_library(libcMPFRWrapper STATIC
     MPFRUtils.cpp
     MPFRUtils.h
     mpfr_inc.h
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index b67a9da40bd7b..4263c9dccb6a5 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -539,7 +539,7 @@ class MPFRNumber {
     return result;
   }
 
-  MPFRNumber fmul(const MPFRNumber &b) {
+  MPFRNumber mul(const MPFRNumber &b) {
     MPFRNumber result(*this);
     mpfr_mul(result.value, value, b.value, mpfr_rounding);
     return result;
@@ -800,12 +800,12 @@ binary_operation_one_output(Operation op, InputType x, InputType y,
     return inputX.fmod(inputY);
   case Operation::Hypot:
     return inputX.hypot(inputY);
+  case Operation::Mul:
+    return inputX.mul(inputY);
   case Operation::Pow:
     return inputX.pow(inputY);
   case Operation::Sub:
     return inputX.sub(inputY);
-  case Operation::Fmul:
-    return inputX.fmul(inputY);
   default:
     __builtin_unreachable();
   }
@@ -887,6 +887,16 @@ template void explain_unary_operation_single_output_error(Operation op,
                                                           long double,
                                                           long double, double,
                                                           RoundingMode);
+template void explain_unary_operation_single_output_error(Operation op, double,
+                                                          float, double,
+                                                          RoundingMode);
+template void explain_unary_operation_single_output_error(Operation op,
+                                                          long double, float,
+                                                          double, RoundingMode);
+template void explain_unary_operation_single_output_error(Operation op,
+                                                          long double, double,
+                                                          double, RoundingMode);
+
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template void explain_unary_operation_single_output_error(Operation op, float16,
                                                           float16, double,
@@ -1013,15 +1023,18 @@ void explain_binary_operation_one_output_error(
 template void
 explain_binary_operation_one_output_error(Operation, const BinaryInput<float> &,
                                           float, double, RoundingMode);
+template void explain_binary_operation_one_output_error(
+    Operation, const BinaryInput<double> &, float, double, RoundingMode);
 template void explain_binary_operation_one_output_error(
     Operation, const BinaryInput<double> &, double, double, RoundingMode);
+template void explain_binary_operation_one_output_error(
+    Operation, const BinaryInput<long double> &, float, double, RoundingMode);
+template void explain_binary_operation_one_output_error(
+    Operation, const BinaryInput<long double> &, double, double, RoundingMode);
 template void
 explain_binary_operation_one_output_error(Operation,
                                           const BinaryInput<long double> &,
                                           long double, double, RoundingMode);
-
-template void explain_binary_operation_one_output_error(
-    Operation, const BinaryInput<double> &, float, double, RoundingMode);
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template void explain_binary_operation_one_output_error(
     Operation, const BinaryInput<float16> &, float16, double, RoundingMode);
@@ -1073,6 +1086,9 @@ template void
 explain_ternary_operation_one_output_error(Operation,
                                            const TernaryInput<long double> &,
                                            long double, double, RoundingMode);
+
+template void explain_ternary_operation_one_output_error(
+    Operation, const TernaryInput<long double> &, double, double, RoundingMode);
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template void explain_ternary_operation_one_output_error(
     Operation, const TernaryInput<float> &, float16, double, RoundingMode);
@@ -1103,6 +1119,14 @@ template bool compare_unary_operation_single_output(Operation, double, double,
 template bool compare_unary_operation_single_output(Operation, long double,
                                                     long double, double,
                                                     RoundingMode);
+template bool compare_unary_operation_single_output(Operation, double, float,
+                                                    double, RoundingMode);
+template bool compare_unary_operation_single_output(Operation, long double,
+                                                    float, double,
+                                                    RoundingMode);
+template bool compare_unary_operation_single_output(Operation, long double,
+                                                    double, double,
+                                                    RoundingMode);
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template bool compare_unary_operation_single_output(Operation, float16, float16,
                                                     double, RoundingMode);
@@ -1195,6 +1219,12 @@ template bool compare_binary_operation_one_output(Operation,
                                                   const BinaryInput<double> &,
                                                   double, double, RoundingMode);
 template bool
+compare_binary_operation_one_output(Operation, const BinaryInput<long double> &,
+                                    float, double, RoundingMode);
+template bool
+compare_binary_operation_one_output(Operation, const BinaryInput<long double> &,
+                                    double, double, RoundingMode);
+template bool
 compare_binary_operation_one_output(Operation, const BinaryInput<long double> &,
                                     long double, double, RoundingMode);
 
@@ -1244,6 +1274,9 @@ template bool
 compare_ternary_operation_one_output(Operation,
                                      const TernaryInput<long double> &,
                                      long double, double, RoundingMode);
+
+template bool compare_ternary_operation_one_output(
+    Operation, const TernaryInput<long double> &, double, double, RoundingMode);
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template bool compare_ternary_operation_one_output(Operation,
                                                    const TernaryInput<float> &,
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 28390af9ee6d8..8d51fa4e47726 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -79,9 +79,9 @@ enum class Operation : int {
   Div,
   Fmod,
   Hypot,
+  Mul,
   Pow,
   Sub,
-  Fmul,
   EndBinaryOperationsSingleOutput,
 
   // Operations which take two floating point numbers of the same type as
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index b562cdc521c07..60597a67ce57a 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -4,7 +4,12 @@ target_include_directories(gpu_loader PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${LIBC_SOURCE_DIR}/include
   ${LIBC_SOURCE_DIR}
+  ${LLVM_MAIN_INCLUDE_DIR}
+  ${LLVM_BINARY_DIR}/include
 )
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(gpu_loader PUBLIC -fno-rtti)
+endif()
 
 find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
 if(hsa-runtime64_FOUND)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index e029816764427..2151013bae479 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -53,8 +53,9 @@ struct end_args_t {
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
-int load(int argc, char **argv, char **evnp, void *image, size_t size,
-         const LaunchParameters &params, bool print_resource_usage);
+int load(int argc, const char **argv, const char **evnp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage);
 
 /// Return \p V aligned "upwards" according to \p Align.
 template <typename V, typename A> inline V align_up(V val, A align) {
@@ -63,7 +64,7 @@ template <typename V, typename A> inline V align_up(V val, A align) {
 
 /// Copy the system's argument vector to GPU memory allocated using \p alloc.
 template <typename Allocator>
-void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
+void *copy_argument_vector(int argc, const char **argv, Allocator alloc) {
   size_t argv_size = sizeof(char *) * (argc + 1);
   size_t str_size = 0;
   for (int i = 0; i < argc; ++i)
@@ -90,9 +91,9 @@ void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
 
 /// Copy the system's environment to GPU memory allocated using \p alloc.
 template <typename Allocator>
-void *copy_environment(char **envp, Allocator alloc) {
+void *copy_environment(const char **envp, Allocator alloc) {
   int envc = 0;
-  for (char **env = envp; *env != 0; ++env)
+  for (const char **env = envp; *env != 0; ++env)
     ++envc;
 
   return copy_argument_vector(envc, envp, alloc);
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index a9c0b868725d0..44ed8bf58ab87 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -13,88 +13,97 @@
 
 #include "Loader.h"
 
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
+
 #include <cstdio>
 #include <cstdlib>
 #include <string>
-#include <vector>
 
-int main(int argc, char **argv, char **envp) {
-  if (argc < 2) {
-    printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
-           "--print-resource-usage] <device_image> "
-           "<args>, ...\n");
-    return EXIT_SUCCESS;
-  }
+using namespace llvm;
 
-  int offset = 0;
-  FILE *file = nullptr;
-  char *ptr;
-  LaunchParameters params = {1, 1, 1, 1, 1, 1};
-  bool print_resource_usage = false;
-  while (!file && ++offset < argc) {
-    if (argv[offset] == std::string("--threads") ||
-        argv[offset] == std::string("--threads-x")) {
-      params.num_threads_x =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--threads-y")) {
-      params.num_threads_y =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--threads-z")) {
-      params.num_threads_z =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--blocks") ||
-               argv[offset] == std::string("--blocks-x")) {
-      params.num_blocks_x =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--blocks-y")) {
-      params.num_blocks_y =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--blocks-z")) {
-      params.num_blocks_z =
-          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
-      offset++;
-      continue;
-    } else if (argv[offset] == std::string("--print-resource-usage")) {
-      print_resource_usage = true;
-      continue;
-    } else {
-      file = fopen(argv[offset], "r");
-      if (!file) {
-        fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
-        return EXIT_FAILURE;
-      }
-      break;
-    }
-  }
+static cl::OptionCategory loader_category("loader options");
+
+static cl::opt<bool> help("h", cl::desc("Alias for -help"), cl::Hidden,
+                          cl::cat(loader_category));
+
+static cl::opt<unsigned>
+    threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::alias threads("threads", cl::aliasopt(threads_x),
+                         cl::desc("Alias for --threads-x"),
+                         cl::cat(loader_category));
+
+static cl::opt<unsigned>
+    blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::alias blocks("blocks", cl::aliasopt(blocks_x),
+                        cl::desc("Alias for --blocks-x"),
+                        cl::cat(loader_category));
 
-  if (!file) {
-    fprintf(stderr, "No image file provided\n");
-    return EXIT_FAILURE;
+static cl::opt<bool>
+    print_resource_usage("print-resource-usage",
+                         cl::desc("Output resource usage of launched kernels"),
+                         cl::init(false), cl::cat(loader_category));
+
+static cl::opt<std::string> file(cl::Positional, cl::Required,
+                                 cl::desc("<gpu executable>"),
+                                 cl::cat(loader_category));
+static cl::list<std::string> args(cl::ConsumeAfter,
+                                  cl::desc("<program arguments>..."),
+                                  cl::cat(loader_category));
+
+[[noreturn]] void report_error(Error E) {
+  outs().flush();
+  logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
+  exit(EXIT_FAILURE);
+}
+
+int main(int argc, const char **argv, const char **envp) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  cl::HideUnrelatedOptions(loader_category);
+  cl::ParseCommandLineOptions(
+      argc, argv,
+      "A utility used to launch unit tests built for a GPU target. This is\n"
+      "intended to provide an intrface simular to cross-compiling emulators\n");
+
+  if (help) {
+    cl::PrintHelpMessage();
+    return EXIT_SUCCESS;
   }
 
-  // TODO: We should perform some validation on the file.
-  fseek(file, 0, SEEK_END);
-  const auto size = ftell(file);
-  fseek(file, 0, SEEK_SET);
+  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
+      MemoryBuffer::getFileOrSTDIN(file);
+  if (std::error_code ec = image_or_err.getError())
+    report_error(errorCodeToError(ec));
+  MemoryBufferRef image = **image_or_err;
 
-  void *image = malloc(size * sizeof(char));
-  fread(image, sizeof(char), size, file);
-  fclose(file);
+  SmallVector<const char *> new_argv = {file.c_str()};
+  llvm::transform(args, std::back_inserter(new_argv),
+                  [](const std::string &arg) { return arg.c_str(); });
 
   // Drop the loader from the program arguments.
-  int ret = load(argc - offset, &argv[offset], envp, image, size, params,
-                 print_resource_usage);
+  LaunchParameters params{threads_x, threads_y, threads_z,
+                          blocks_x,  blocks_y,  blocks_z};
+  int ret = load(new_argv.size(), new_argv.data(), envp,
+                 const_cast<char *>(image.getBufferStart()),
+                 image.getBufferSize(), params, print_resource_usage);
 
-  free(image);
   return ret;
 }
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 97a2de9f8379a..46c5631046ce2 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,4 +1,11 @@
-add_executable(amdhsa-loader Loader.cpp)
+set(LLVM_LINK_COMPONENTS
+  BinaryFormat
+  Object
+  Option
+  Support
+  )
+
+add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
 
 target_link_libraries(amdhsa-loader
   PRIVATE
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
similarity index 99%
rename from libc/utils/gpu/loader/amdgpu/Loader.cpp
rename to libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index 8cf6ea5dc9aec..c1dcce84a1c24 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -334,8 +334,9 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
   return HSA_STATUS_SUCCESS;
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size,
-         const LaunchParameters &params, bool print_resource_usage) {
+int load(int argc, const char **argv, const char **envp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage) {
   // Initialize the HSA runtime used to communicate with the device.
   if (hsa_status_t err = hsa_init())
     handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
index 948493959badf..21453b9ca0348 100644
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,15 +1,15 @@
-add_executable(nvptx-loader Loader.cpp)
+set(LLVM_LINK_COMPONENTS
+  BinaryFormat
+  Object
+  Option
+  Support
+  )
+
+add_llvm_executable(nvptx-loader nvptx-loader.cpp)
 
-if(NOT LLVM_ENABLE_RTTI)
-  target_compile_options(nvptx-loader PRIVATE -fno-rtti)
-endif()
-target_include_directories(nvptx-loader PRIVATE
-                           ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
 target_link_libraries(nvptx-loader
   PRIVATE
   gpu_loader
   llvmlibc_rpc_server
   CUDA::cuda_driver
-  LLVMObject
-  LLVMSupport
 )
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
similarity index 98%
rename from libc/utils/gpu/loader/nvptx/Loader.cpp
rename to libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
index 9c3cf3ae19b41..9fd3de2beb723 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -245,8 +245,9 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   return CUDA_SUCCESS;
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size,
-         const LaunchParameters &params, bool print_resource_usage) {
+int load(int argc, const char **argv, const char **envp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage) {
   if (CUresult err = cuInit(0))
     handle_error(err);
   // Obtain the first device found on the system.
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 119539e3cad44..ed23d22f0bc36 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -108,6 +108,10 @@ void handle_printf(rpc::Server::Port &port) {
       if (cur_section.has_conv && cur_section.conv_name == 's' &&
           cur_section.conv_val_ptr) {
         strs_to_copy[lane].emplace_back(cur_section.conv_val_ptr);
+        // Get the minimum size of the string in the case of padding.
+        char c = '\0';
+        cur_section.conv_val_ptr = &c;
+        convert(&writer, cur_section);
       } else if (cur_section.has_conv) {
         // Ignore conversion errors for the first pass.
         convert(&writer, cur_section);
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index f05492d777977..51f6ee10babb8 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.20.0)
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   project(libclc VERSION 0.2.0 LANGUAGES CXX C)
 endif()
+set(LLVM_SUBPROJECT_TITLE "libclc")
 
 set(CMAKE_CXX_STANDARD 17)
 
@@ -250,6 +251,7 @@ add_custom_command(
   COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl
   DEPENDS ${script_loc} )
 add_custom_target( "generate_convert.cl" DEPENDS convert.cl )
+set_target_properties( "generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" )
 
 add_custom_command(
   OUTPUT convert-core.cl
@@ -274,6 +276,7 @@ add_custom_command(
   COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl
   DEPENDS ${script_loc} )
 add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl )
+set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" )
 
 enable_testing()
 
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index 2a843dd67fb8c..c6fe0600c4a3e 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -118,7 +118,10 @@ function(link_bc)
   )
 
   add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc )
-  set_target_properties( ${ARG_TARGET} PROPERTIES TARGET_FILE ${ARG_TARGET}.bc )
+  set_target_properties( ${ARG_TARGET} PROPERTIES
+    TARGET_FILE ${ARG_TARGET}.bc
+    FOLDER "libclc/Device IR/Linking"
+  )
 endfunction()
 
 # Decomposes and returns variables based on a libclc triple and architecture
diff --git a/libclc/generic/lib/common/sign.cl b/libclc/generic/lib/common/sign.cl
index 1c745bc14df9c..30c08eb4fe77f 100644
--- a/libclc/generic/lib/common/sign.cl
+++ b/libclc/generic/lib/common/sign.cl
@@ -19,3 +19,12 @@ SIGN(double, )
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sign, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+SIGN(half,)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, sign, half)
+
+#endif
diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl
new file mode 100644
index 0000000000000..ae6117b7b2922
--- /dev/null
+++ b/libclc/generic/lib/math/clc_ldexp.cl
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "config.h"
+#include "../clcmacro.h"
+#include "math.h"
+
+_CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) {
+
+	if (!__clc_fp32_subnormals_supported()) {
+
+		// This treats subnormals as zeros
+		int i = as_int(x);
+		int e = (i >> 23) & 0xff;
+		int m = i & 0x007fffff;
+		int s = i & 0x80000000;
+		int v = add_sat(e, n);
+		v = clamp(v, 0, 0xff);
+		int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+		int c = e == 0xff;
+		mr = c ? m : mr;
+		int er = c ? e : v;
+		er = e ? er : e;
+		return as_float( s | (er << 23) | mr );
+	}
+
+	/* supports denormal values */
+	const int multiplier = 24;
+	float val_f;
+	uint val_ui;
+	uint sign;
+	int exponent;
+	val_ui = as_uint(x);
+	sign = val_ui & 0x80000000;
+	val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+	int val_x = val_ui;
+
+	exponent = val_ui >> 23; /* get the exponent */
+	int dexp = exponent;
+
+	/* denormal support */
+	int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+	int dexponent = 25 - fbh;
+	uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+	int ex = dexponent + n - multiplier;
+	dexponent = ex;
+	uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+	int ex1 = dexponent + multiplier;
+	ex1 = -ex1 +25;
+	dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+	dval_ui = dexponent > 0 ? val :dval_ui;
+	dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui;  /*overflow*/
+	dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /*underflow*/
+	dval_ui = dval_ui | sign;
+	val_f = as_float(dval_ui);
+
+	exponent += n;
+
+	val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+	ex1 = exponent + multiplier;
+	ex1 = -ex1 +25;
+	val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+	val_ui = exponent > 0 ? val :val_ui;
+	val_ui = exponent > 254 ? 0x7f800000 :val_ui;  /*overflow*/
+	val_ui = exponent < -multiplier ? 0 : val_ui;  /*underflow*/
+	val_ui = val_ui | sign;
+
+	val_ui = dexp == 0? dval_ui : val_ui;
+	val_f = as_float(val_ui);
+
+	val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+	return val_f;
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) {
+	long l = as_ulong(x);
+	int e = (l >> 52) & 0x7ff;
+	long s = l & 0x8000000000000000;
+
+	ulong ux = as_ulong(x * 0x1.0p+53);
+	int de = ((int)(ux >> 52) & 0x7ff) - 53;
+	int c = e == 0;
+	e = c ? de: e;
+
+	ux = c ? ux : l;
+
+	int v = e + n;
+	v = clamp(v, -0x7ff, 0x7ff);
+
+	ux &= ~EXPBITS_DP64;
+
+	double mr = as_double(ux | ((ulong)(v+53) << 52));
+	mr = mr * 0x1.0p-53;
+
+	mr = v > 0  ? as_double(ux | ((ulong)v << 52)) : mr;
+
+	mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64)  : mr;
+	mr = v < -53 ? as_double(s) : mr;
+
+	mr  = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr;
+	return mr;
+}
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_ldexp(half x, int n) {
+    return (half)__clc_ldexp((float)x, n);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_ldexp, half, int);
+
+#endif
diff --git a/libclc/generic/lib/math/clc_remquo.cl b/libclc/generic/lib/math/clc_remquo.cl
new file mode 100644
index 0000000000000..3b9159ac967ef
--- /dev/null
+++ b/libclc/generic/lib/math/clc_remquo.cl
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include <math/clc_remainder.h>
+#include "../clcmacro.h"
+#include "config.h"
+#include "math.h"
+
+_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, __private int *quo)
+{
+    x = __clc_flush_denormal_if_not_supported(x);
+    y = __clc_flush_denormal_if_not_supported(y);
+    int ux = as_int(x);
+    int ax = ux & EXSIGNBIT_SP32;
+    float xa = as_float(ax);
+    int sx = ux ^ ax;
+    int ex = ax >> EXPSHIFTBITS_SP32;
+
+    int uy = as_int(y);
+    int ay = uy & EXSIGNBIT_SP32;
+    float ya = as_float(ay);
+    int sy = uy ^ ay;
+    int ey = ay >> EXPSHIFTBITS_SP32;
+
+    float xr = as_float(0x3f800000 | (ax & 0x007fffff));
+    float yr = as_float(0x3f800000 | (ay & 0x007fffff));
+    int c;
+    int k = ex - ey;
+
+    uint q = 0;
+
+    while (k > 0) {
+        c = xr >= yr;
+        q = (q << 1) | c;
+        xr -= c ? yr : 0.0f;
+        xr += xr;
+	--k;
+    }
+
+    c = xr > yr;
+    q = (q << 1) | c;
+    xr -= c ? yr : 0.0f;
+
+    int lt = ex < ey;
+
+    q = lt ? 0 : q;
+    xr = lt ? xa : xr;
+    yr = lt ? ya : yr;
+
+    c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
+    xr -= c ? yr : 0.0f;
+    q += c;
+
+    float s = as_float(ey << EXPSHIFTBITS_SP32);
+    xr *= lt ? 1.0f : s;
+
+    int qsgn = sx == sy ? 1 : -1;
+    int quot = (q & 0x7f) * qsgn;
+
+    c = ax == ay;
+    quot = c ? qsgn : quot;
+    xr = c ? 0.0f : xr;
+
+    xr = as_float(sx ^ as_int(xr));
+
+    c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0;
+    quot = c ? 0 : quot;
+    xr = c ? as_float(QNANBITPATT_SP32) : xr;
+
+    *quo = quot;
+
+    return xr;
+}
+// remquo singature is special, we don't have macro for this
+#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \
+_CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, __private int##VEC_SIZE *quo) \
+{ \
+	int##HALF_VEC_SIZE lo, hi; \
+	TYPE##VEC_SIZE ret; \
+	ret.lo = __clc_remquo(x.lo, y.lo, &lo); \
+	ret.hi = __clc_remquo(x.hi, y.hi, &hi); \
+	(*quo).lo = lo; \
+	(*quo).hi = hi; \
+	return ret; \
+}
+__VEC_REMQUO(float, 2,)
+__VEC_REMQUO(float, 3, 2)
+__VEC_REMQUO(float, 4, 2)
+__VEC_REMQUO(float, 8, 4)
+__VEC_REMQUO(float, 16, 8)
+
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, __private int *pquo)
+{
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    ulong xsgn = ux ^ ax;
+    double dx = as_double(ax);
+    int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
+    int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64);
+    xexp1 = xexp < 1 ? xexp1 : xexp;
+
+    ulong uy = as_ulong(y);
+    ulong ay = uy & ~SIGNBIT_DP64;
+    double dy = as_double(ay);
+    int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
+    int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64);
+    yexp1 = yexp < 1 ? yexp1 : yexp;
+
+    int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
+
+    // First assume |x| > |y|
+
+    // Set ntimes to the number of times we need to do a
+    // partial remainder. If the exponent of x is an exact multiple
+    // of 53 larger than the exponent of y, and the mantissa of x is
+    // less than the mantissa of y, ntimes will be one too large
+    // but it doesn't matter - it just means that we'll go round
+    // the loop below one extra time.
+    int ntimes = max(0, (xexp1 - yexp1) / 53);
+    double w =  ldexp(dy, ntimes * 53);
+    w = ntimes == 0 ? dy : w;
+    double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+    // Each time round the loop we compute a partial remainder.
+    // This is done by subtracting a large multiple of w
+    // from x each time, where w is a scaled up version of y.
+    // The subtraction must be performed exactly in quad
+    // precision, though the result at each stage can
+    // fit exactly in a double precision number.
+    int i;
+    double t, v, p, pp;
+
+    for (i = 0; i < ntimes; i++) {
+        // Compute integral multiplier
+        t = trunc(dx / w);
+
+        // Compute w * t in quad precision
+        p = w * t;
+        pp = fma(w, t, -p);
+
+        // Subtract w * t from dx
+        v = dx - p;
+        dx = v + (((dx - v) - p) - pp);
+
+        // If t was one too large, dx will be negative. Add back one w.
+        dx += dx < 0.0 ? w : 0.0;
+
+        // Scale w down by 2^(-53) for the next iteration
+        w *= scale;
+    }
+
+    // One more time
+    // Variable todd says whether the integer t is odd or not
+    t = floor(dx / w);
+    long lt = (long)t;
+    int todd = lt & 1;
+
+    p = w * t;
+    pp = fma(w, t, -p);
+    v = dx - p;
+    dx = v + (((dx - v) - p) - pp);
+    i = dx < 0.0;
+    todd ^= i;
+    dx += i ? w : 0.0;
+
+    lt -= i;
+
+    // At this point, dx lies in the range [0,dy)
+
+    // For the remainder function, we need to adjust dx
+    // so that it lies in the range (-y/2, y/2] by carefully
+    // subtracting w (== dy == y) if necessary. The rigmarole
+    // with todd is to get the correct sign of the result
+    // when x/y lies exactly half way between two integers,
+    // when we need to choose the even integer.
+
+    int al = (2.0*dx > w) | (todd & (2.0*dx == w));
+    double dxl = dx - (al ? w : 0.0);
+
+    int ag = (dx > 0.5*w) | (todd & (dx == 0.5*w));
+    double dxg = dx - (ag ? w : 0.0);
+
+    dx = dy < 0x1.0p+1022 ? dxl : dxg;
+    lt += dy < 0x1.0p+1022 ? al : ag;
+    int quo = ((int)lt & 0x7f) * qsgn;
+
+    double ret = as_double(xsgn ^ as_ulong(dx));
+    dx = as_double(ax);
+
+    // Now handle |x| == |y|
+    int c = dx == dy;
+    t = as_double(xsgn);
+    quo = c ? qsgn : quo;
+    ret = c ? t : ret;
+
+    // Next, handle |x| < |y|
+    c = dx < dy;
+    quo = c ? 0 : quo;
+    ret = c ? x : ret;
+
+    c &= (yexp < 1023 & 2.0*dx > dy) | (dx > 0.5*dy);
+    quo = c ? qsgn : quo;
+    // we could use a conversion here instead since qsgn = +-1
+    p = qsgn == 1 ? -1.0 : 1.0;
+    t = fma(y, p, x);
+    ret = c ? t : ret;
+
+    // We don't need anything special for |x| == 0
+
+    // |y| is 0
+    c = dy == 0.0;
+    quo = c ? 0 : quo;
+    ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+    // y is +-Inf, NaN
+    c = yexp > BIASEDEMAX_DP64;
+    quo = c ? 0 : quo;
+    t = y == y ? x : y;
+    ret = c ? t : ret;
+
+    // x is +=Inf, NaN
+    c = xexp > BIASEDEMAX_DP64;
+    quo = c ? 0 : quo;
+    ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+    *pquo = quo;
+    return ret;
+}
+__VEC_REMQUO(double, 2,)
+__VEC_REMQUO(double, 3, 2)
+__VEC_REMQUO(double, 4, 2)
+__VEC_REMQUO(double, 8, 4)
+__VEC_REMQUO(double, 16, 8)
+#endif
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
new file mode 100644
index 0000000000000..0a2c98d3787cf
--- /dev/null
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "config.h"
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+// compute pow using log and exp
+// x^y = exp(y * log(x))
+//
+// we take care not to lose precision in the intermediate steps
+//
+// When computing log, calculate it in splits,
+//
+// r = f * (p_invead + p_inv_tail)
+// r = rh + rt
+//
+// calculate log polynomial using r, in end addition, do
+// poly = poly + ((rh-r) + rt)
+//
+// lth = -r
+// ltt = ((xexp * log2_t) - poly) + logT
+// lt = lth + ltt
+//
+// lh = (xexp * log2_h) + logH
+// l = lh + lt
+//
+// Calculate final log answer as gh and gt,
+// gh = l & higher-half bits
+// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
+//
+// yh = y & higher-half bits
+// yt = y - yh
+//
+// Before entering computation of exp,
+// vs = ((yt*gt + yt*gh) + yh*gt)
+// v = vs + yh*gh
+// vt = ((yh*gh - v) + vs)
+//
+// In calculation of exp, add vt to r that is used for poly
+// At the end of exp, do
+// ((((expT * poly) + expT) + expH*poly) + expH)
+
+_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny)
+{
+    float y = MATH_RECIP((float)ny);
+
+    int ix = as_int(x);
+    int ax = ix & EXSIGNBIT_SP32;
+    int xpos = ix == ax;
+
+    int iy = as_int(y);
+    int ay = iy & EXSIGNBIT_SP32;
+    int ypos = iy == ay;
+
+    // Extra precise log calculation
+    // First handle case that x is close to 1
+    float r = 1.0f - as_float(ax);
+    int near1 = fabs(r) < 0x1.0p-4f;
+    float r2 = r*r;
+
+    // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+    float poly = mad(r,
+                     mad(r,
+                         mad(r,
+                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                             0x1.99999ap-3f),
+                         0x1.000000p-2f),
+                     0x1.555556p-2f);
+
+    poly *= r2*r;
+
+    float lth_near1 = -r2 * 0.5f;
+    float ltt_near1 = -poly;
+    float lt_near1 = lth_near1 + ltt_near1;
+    float lh_near1 = -r;
+    float l_near1 = lh_near1 + lt_near1;
+
+    // Computations for x not near 1
+    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    float mf = (float)m;
+    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+    int c = m == -127;
+    int ixn = c ? ixs : ax;
+    float mfn = c ? mfs : mf;
+
+    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+    // F - Y
+    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+    indx = indx >> 16;
+    float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+    float rh = f * tv.s0;
+    float rt = f * tv.s1;
+    r = rh + rt;
+
+    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
+    poly += (rh - r) + rt;
+
+    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+    tv = USE_TABLE(loge_tbl, indx);
+    float lth = -r;
+    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+    float lt = lth + ltt;
+    float lh = mad(mfn, LOG2_HEAD, tv.s0);
+    float l = lh + lt;
+
+    // Select near 1 or not
+    lth = near1 ? lth_near1 : lth;
+    ltt = near1 ? ltt_near1 : ltt;
+    lt = near1 ? lt_near1 : lt;
+    lh = near1 ? lh_near1 : lh;
+    l = near1 ? l_near1 : l;
+
+    float gh = as_float(as_int(l) & 0xfffff000);
+    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+    float yh = as_float(iy & 0xfffff000);
+
+    float fny = (float)ny;
+    float fnyh = as_float(as_int(fny) & 0xfffff000);
+    float fnyt = (float)(ny - (int)fnyh);
+    float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny);
+
+    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
+    float ylogx = mad(yh, gh, ylogx_s);
+    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
+
+    // Extra precise exp of ylogx
+    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+    int n = convert_int(ylogx * R_64_BY_LOG2);
+    float nf = (float) n;
+
+    int j = n & 0x3f;
+    m = n >> 6;
+    int m2 = m << EXPSHIFTBITS_SP32;
+
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
+    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
+
+    // Truncated Taylor series for e^r
+    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
+
+    tv = USE_TABLE(exp_tbl_ep, j);
+
+    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
+    float sexpylogx = __clc_fp32_subnormals_supported() ? expylogx * as_float(0x1 << (m + 149)) : 0.0f;
+
+    float texpylogx = as_float(as_int(expylogx) + m2);
+    expylogx = m < -125 ? sexpylogx : texpylogx;
+
+    // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+    expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
+
+    // Result is 0 if ylogx < -149*log2
+    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+    // Classify y:
+    //   inty = 0 means not an integer.
+    //   inty = 1 means odd integer.
+    //   inty = 2 means even integer.
+
+    int inty = 2 - (ny & 1);
+
+    float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+    expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+    int ret = as_int(expylogx);
+
+    // Corner case handling
+    ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret;
+    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+    ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+    ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
+    ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
+    int xzero = xpos ? 0 : 0x80000000;
+    ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+    ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+    ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+    ret = ax > PINFBITPATT_SP32 ? ix : ret;
+    ret = ny == 0 ? QNANBITPATT_SP32 : ret;
+
+    return as_float(ret);
+}
+_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int)
+
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny)
+{
+    const double real_log2_tail = 5.76999904754328540596e-08;
+    const double real_log2_lead = 6.93147122859954833984e-01;
+
+    double dny = (double)ny;
+    double y = 1.0 / dny;
+
+    long ux = as_long(x);
+    long ax = ux & (~SIGNBIT_DP64);
+    int xpos = ax == ux;
+
+    long uy = as_long(y);
+    long ay = uy & (~SIGNBIT_DP64);
+    int ypos = ay == uy;
+
+    // Extended precision log
+    double v, vt;
+    {
+        int exp = (int)(ax >> 52) - 1023;
+        int mask_exp_1023 = exp == -1023;
+        double xexp = (double) exp;
+        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+        double xexp1 = (double) exp;
+        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+        xexp = mask_exp_1023 ? xexp1 : xexp;
+        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
+        int index = rax >> 44;
+
+        double F = as_double(rax | 0x3FE0000000000000L);
+        double Y = as_double(mantissa | 0x3FE0000000000000L);
+        double f = F - Y;
+        double2 tv = USE_TABLE(log_f_inv_tbl, index);
+        double log_h = tv.s0;
+        double log_t = tv.s1;
+        double f_inv = (log_h + log_t) * f;
+        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+        double r2 = fma(-F, r1, f) * (log_h + log_t);
+        double r = r1 + r2;
+
+        double poly = fma(r,
+                          fma(r,
+                              fma(r,
+                                  fma(r, 1.0/7.0, 1.0/6.0),
+                                  1.0/5.0),
+                              1.0/4.0),
+                          1.0/3.0);
+        poly = poly * r * r * r;
+
+        double hr1r1 = 0.5*r1*r1;
+        double poly0h = r1 + hr1r1;
+        double poly0t = r1 - poly0h + hr1r1;
+        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
+
+        tv = USE_TABLE(powlog_tbl, index);
+        log_h = tv.s0;
+        log_t = tv.s1;
+
+        double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
+        double resT = resT_t - poly0h;
+        double resH = fma(xexp, real_log2_lead, log_h);
+        double resT_h = poly0h;
+
+        double H = resT + resH;
+        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+        H = H_h;
+
+        double y_head = as_double(uy & 0xfffffffff8000000L);
+        double y_tail = y - y_head;
+
+        double fnyh = as_double(as_long(dny) & 0xfffffffffff00000);
+        double fnyt = (double)(ny - (int)fnyh);
+        y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny;
+
+        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
+        v = fma(y_head, H, temp);
+        vt = fma(y_head, H, -v) + temp;
+    }
+
+    // Now calculate exp of (v,vt)
+
+    double expv;
+    {
+        const double max_exp_arg = 709.782712893384;
+        const double min_exp_arg = -745.1332191019411;
+        const double sixtyfour_by_lnof2 = 92.33248261689366;
+        const double lnof2_by_64_head = 0.010830424260348081;
+        const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+        double temp = v * sixtyfour_by_lnof2;
+        int n = (int)temp;
+        double dn = (double)n;
+        int j = n & 0x0000003f;
+        int m = n >> 6;
+
+        double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+        double f1 = tv.s0;
+        double f2 = tv.s1;
+        double f = f1 + f2;
+
+        double r1 = fma(dn, -lnof2_by_64_head, v);
+        double r2 = dn * lnof2_by_64_tail;
+        double r = (r1 + r2) + vt;
+
+        double q = fma(r,
+                       fma(r,
+                           fma(r,
+                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                               4.16666666662260795726e-02),
+                           1.66666666665260878863e-01),
+                       5.00000000000000008883e-01);
+        q = fma(r*r, q, r);
+
+        expv = fma(f, q, f2) + f1;
+	      expv = ldexp(expv, m);
+
+        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+        expv = v < min_exp_arg ? 0.0 : expv;
+    }
+
+    // See whether y is an integer.
+    // inty = 0 means not an integer.
+    // inty = 1 means odd integer.
+    // inty = 2 means even integer.
+
+    int inty = 2 - (ny & 1);
+
+    expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+    long ret = as_long(expv);
+
+    // Now all the edge cases
+    ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret;
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+    ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
+    ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
+    long xzero = xpos ? 0L : 0x8000000000000000L;
+    ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
+    ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+    ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret;
+    ret = ny == 0 ? QNANBITPATT_DP64 : ret;
+    return as_double(ret);
+}
+_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int)
+#endif
diff --git a/libcxx/.clang-format b/libcxx/.clang-format
index 871920f15b5bc..b2ca452931fec 100644
--- a/libcxx/.clang-format
+++ b/libcxx/.clang-format
@@ -24,7 +24,6 @@ AttributeMacros: [
                   '_LIBCPP_CONSTEXPR_SINCE_CXX23',
                   '_LIBCPP_CONSTEXPR',
                   '_LIBCPP_CONSTINIT',
-                  '_LIBCPP_DEPRECATED_ATOMIC_SYNC',
                   '_LIBCPP_DEPRECATED_IN_CXX11',
                   '_LIBCPP_DEPRECATED_IN_CXX14',
                   '_LIBCPP_DEPRECATED_IN_CXX17',
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 155f81a74a974..6168c76bff6d9 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -71,12 +71,16 @@ if (NOT "${LIBCXX_HARDENING_MODE}" IN_LIST LIBCXX_SUPPORTED_HARDENING_MODES)
     "Unsupported hardening mode: '${LIBCXX_HARDENING_MODE}'. Supported values are ${LIBCXX_SUPPORTED_HARDENING_MODES}.")
 endif()
 set(LIBCXX_ASSERTION_HANDLER_FILE
-  "${CMAKE_CURRENT_SOURCE_DIR}/vendor/llvm/default_assertion_handler.in"
+  "vendor/llvm/default_assertion_handler.in"
   CACHE STRING
   "Specify the path to a header that contains a custom implementation of the
    assertion handler that gets invoked when a hardening assertion fails. If
    provided, this header will be included by the library, replacing the
-   default assertion handler.")
+   default assertion handler. If this is specified as a relative path, it
+   is assumed to be relative to '<monorepo>/libcxx'.")
+if (NOT IS_ABSOLUTE "${LIBCXX_ASSERTION_HANDLER_FILE}")
+  set(LIBCXX_ASSERTION_HANDLER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${LIBCXX_ASSERTION_HANDLER_FILE}")
+endif()
 option(LIBCXX_ENABLE_RANDOM_DEVICE
   "Whether to include support for std::random_device in the library. Disabling
    this can be useful when building the library for platforms that don't have
@@ -212,6 +216,14 @@ set(LIBCXX_ABI_DEFINES "" CACHE STRING "A semicolon separated list of ABI macros
 set(LIBCXX_EXTRA_SITE_DEFINES "" CACHE STRING "Extra defines to add into __config_site")
 option(LIBCXX_USE_COMPILER_RT "Use compiler-rt instead of libgcc" OFF)
 
+# C Library options -----------------------------------------------------------
+
+set(LIBCXX_SUPPORTED_C_LIBRARIES system llvm-libc)
+set(LIBCXX_LIBC "system" CACHE STRING "Specify C library to use. Supported values are ${LIBCXX_SUPPORTED_C_LIBRARIES}.")
+if (NOT "${LIBCXX_LIBC}" IN_LIST LIBCXX_SUPPORTED_C_LIBRARIES)
+  message(FATAL_ERROR "Unsupported C library: '${LIBCXX_CXX_ABI}'. Supported values are ${LIBCXX_SUPPORTED_C_LIBRARIES}.")
+endif()
+
 # ABI Library options ---------------------------------------------------------
 if (MSVC)
   set(LIBCXX_DEFAULT_ABI_LIBRARY "vcruntime")
@@ -298,17 +310,6 @@ endif()
 option(LIBCXX_ENABLE_PEDANTIC "Compile with pedantic enabled." OFF)
 option(LIBCXX_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
 
-option(LIBCXX_GENERATE_COVERAGE "Enable generating code coverage." OFF)
-set(LIBCXX_COVERAGE_LIBRARY "" CACHE STRING
-    "The Profile-rt library used to build with code coverage")
-
-set(LIBCXX_CONFIGURE_IDE_DEFAULT OFF)
-if (XCODE OR MSVC_IDE)
-  set(LIBCXX_CONFIGURE_IDE_DEFAULT ON)
-endif()
-option(LIBCXX_CONFIGURE_IDE "Configure libcxx for use within an IDE"
-      ${LIBCXX_CONFIGURE_IDE_DEFAULT})
-
 set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT OFF)
 if (WIN32)
   set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT ON)
@@ -371,12 +372,6 @@ if (NOT LIBCXX_ENABLE_RTTI AND LIBCXX_ENABLE_EXCEPTIONS)
                       " for details.")
 endif()
 
-# Ensure LLVM_USE_SANITIZER is not specified when LIBCXX_GENERATE_COVERAGE
-# is ON.
-if (LLVM_USE_SANITIZER AND LIBCXX_GENERATE_COVERAGE)
-  message(FATAL_ERROR "LLVM_USE_SANITIZER cannot be used with LIBCXX_GENERATE_COVERAGE")
-endif()
-
 if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT)
     if (APPLE)
       message(FATAL_ERROR "LIBCXX_ENABLE_ABI_LINKER_SCRIPT cannot be used on APPLE targets")
@@ -485,16 +480,11 @@ endif()
 # Configure compiler.
 include(config-ix)
 
-# Configure coverage options.
-if (LIBCXX_GENERATE_COVERAGE)
-  include(CodeCoverage)
-  set(CMAKE_BUILD_TYPE "COVERAGE" CACHE STRING "" FORCE)
-endif()
-
 #===============================================================================
 # Setup Compiler Flags
 #===============================================================================
 
+include(HandleLibC) # Setup the C library flags
 include(HandleLibCXXABI) # Setup the ABI library flags
 
 # FIXME(EricWF): See the FIXME on LIBCXX_ENABLE_PEDANTIC.
@@ -855,10 +845,6 @@ add_subdirectory(src)
 add_subdirectory(utils)
 add_subdirectory(modules)
 
-if (LIBCXX_INCLUDE_BENCHMARKS)
-  add_subdirectory(benchmarks)
-endif()
-
 if (LIBCXX_INCLUDE_TESTS)
   add_subdirectory(test)
   add_subdirectory(lib/abi)
diff --git a/libcxx/cmake/Modules/CodeCoverage.cmake b/libcxx/cmake/Modules/CodeCoverage.cmake
deleted file mode 100644
index 1bd3a786812a5..0000000000000
--- a/libcxx/cmake/Modules/CodeCoverage.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-find_program(CODE_COVERAGE_LCOV lcov)
-if (NOT CODE_COVERAGE_LCOV)
-  message(FATAL_ERROR "Cannot find lcov...")
-endif()
-
-find_program(CODE_COVERAGE_LLVM_COV llvm-cov)
-if (NOT CODE_COVERAGE_LLVM_COV)
-  message(FATAL_ERROR "Cannot find llvm-cov...")
-endif()
-
-find_program(CODE_COVERAGE_GENHTML genhtml)
-if (NOT CODE_COVERAGE_GENHTML)
-  message(FATAL_ERROR "Cannot find genhtml...")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE "-g -O0 --coverage")
-
-function(setup_lcov_test_target_coverage target_name output_dir capture_dirs source_dirs)
-  if (NOT DEFINED LIBCXX_BINARY_DIR)
-    message(FATAL_ERROR "Variable must be set")
-  endif()
-
-  set(GCOV_TOOL "${LIBCXX_BINARY_DIR}/llvm-cov-wrapper")
-  file(GENERATE OUTPUT ${GCOV_TOOL}
-    CONTENT "#!/usr/bin/env bash\n${CODE_COVERAGE_LLVM_COV} gcov \"$@\"\n")
-
-  file(MAKE_DIRECTORY ${output_dir})
-
-  set(CAPTURE_DIRS "")
-  foreach(cdir ${capture_dirs})
-    list(APPEND CAPTURE_DIRS "-d;${cdir}")
-  endforeach()
-
-  set(EXTRACT_DIRS "")
-  foreach(sdir ${source_dirs})
-    list(APPEND EXTRACT_DIRS "'${sdir}/*'")
-  endforeach()
-
-  message(STATUS "Capture Directories: ${CAPTURE_DIRS}")
-  message(STATUS "Extract Directories: ${EXTRACT_DIRS}")
-
-  add_custom_target(generate-lib${target_name}-coverage
-        COMMAND chmod +x ${GCOV_TOOL}
-        COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --capture ${CAPTURE_DIRS} -o test_coverage.info
-        COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --extract test_coverage.info ${EXTRACT_DIRS} -o test_coverage.info
-        COMMAND ${CODE_COVERAGE_GENHTML} --demangle-cpp test_coverage.info -o test_coverage
-        COMMAND ${CMAKE_COMMAND} -E remove test_coverage.info
-        WORKING_DIRECTORY ${output_dir}
-        COMMENT "Generating coverage results")
-endfunction()
diff --git a/libcxx/cmake/Modules/HandleLibC.cmake b/libcxx/cmake/Modules/HandleLibC.cmake
new file mode 100644
index 0000000000000..1b0564ae6fcc6
--- /dev/null
+++ b/libcxx/cmake/Modules/HandleLibC.cmake
@@ -0,0 +1,39 @@
+#===============================================================================
+# Define targets for linking against the selected C library
+#
+# After including this file, the following targets are defined:
+# - libcxx-libc-headers: An interface target that allows getting access to the
+#                        headers of the selected C library.
+# - libcxx-libc-shared: A target representing the selected shared C library.
+# - libcxx-libc-static: A target representing the selected static C library.
+#===============================================================================
+
+# Link against a system-provided libc
+if (LIBCXX_LIBC STREQUAL "system")
+  add_library(libcxx-libc-headers INTERFACE)
+
+  add_library(libcxx-libc-static INTERFACE)
+  add_library(libcxx-libc-shared INTERFACE)
+
+# Link against the in-tree LLVM libc
+elseif (LIBCXX_LIBC STREQUAL "llvm-libc")
+  add_library(libcxx-libc-headers INTERFACE)
+  target_link_libraries(libcxx-libc-headers INTERFACE libc-headers)
+  if(CXX_SUPPORTS_NOSTDLIBINC_FLAG)
+    target_compile_options(libcxx-libc-headers INTERFACE "-nostdlibinc")
+  endif()
+
+  add_library(libcxx-libc-static INTERFACE)
+  if (TARGET libc)
+    target_link_libraries(libcxx-libc-static INTERFACE libc)
+  endif()
+  if (TARGET libm)
+    target_link_libraries(libcxx-libc-static INTERFACE libm)
+  endif()
+  if (CXX_SUPPORTS_NOLIBC_FLAG)
+    target_link_options(libcxx-libc-static INTERFACE "-nolibc")
+  endif()
+
+  # TODO: There's no support for building LLVM libc as a shared library yet.
+  add_library(libcxx-libc-shared INTERFACE)
+endif()
diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake
index 1038f1aed54e3..8768653e620ad 100644
--- a/libcxx/cmake/caches/Apple.cmake
+++ b/libcxx/cmake/caches/Apple.cmake
@@ -15,3 +15,6 @@ set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
 set(LIBCXXABI_ENABLE_ASSERTIONS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_FORGIVING_DYNAMIC_CAST ON CACHE BOOL "")
 set(LIBCXXABI_USE_LLVM_UNWINDER OFF CACHE BOOL "") # libunwind is built separately
+
+set(LIBCXX_TEST_CONFIG "apple-libc++-shared.cfg.in" CACHE STRING "")
+set(LIBCXXABI_TEST_CONFIG "apple-libc++abi-shared.cfg.in" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
index 4860b590dcde9..4a9389fdcb41c 100644
--- a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
+++ b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
@@ -1,2 +1,4 @@
 set(LIBCXX_HARDENING_MODE "fast" CACHE STRING "")
 set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS" CACHE STRING "")
+set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING" CACHE STRING "")
+set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index f0dffef60dba0..c5b2ffd0a36c8 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,2 +1,6 @@
 set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index f33ed01418990..f68b265d6be2d 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,2 +1,6 @@
 set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
 set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 4000f3a3e8ef2..57b8d9f4d541a 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index 79d6b44c7139a..d3150ecc38eb6 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index e9b4cc60cc80e..8cd1027edfb45 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-rtti.cmake b/libcxx/cmake/caches/Generic-no-rtti.cmake
index c62ddcec90149..d080360fe76d7 100644
--- a/libcxx/cmake/caches/Generic-no-rtti.cmake
+++ b/libcxx/cmake/caches/Generic-no-rtti.cmake
@@ -2,3 +2,7 @@ set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "")
 set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_RTTI OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 616baef1be7be..81c92fca78104 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,3 +1,7 @@
 set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
 set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake
index 27c826edfecff..afe1c8ab7b80f 100644
--- a/libcxx/cmake/caches/Generic-no-tzdb.cmake
+++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index 01160bf218981..27fbc330ce43d 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index 728d41086a386..72c304508f650 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1 +1,5 @@
 set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/config-ix.cmake b/libcxx/cmake/config-ix.cmake
index 7406fba482e69..270d80575adcf 100644
--- a/libcxx/cmake/config-ix.cmake
+++ b/libcxx/cmake/config-ix.cmake
@@ -12,7 +12,7 @@ include(CheckCSourceCompiles)
 # LIBCXXABI_USE_LLVM_UNWINDER set, we'd be linking against the just-built
 # libunwind (and the compiler implicit -lunwind wouldn't succeed as the newly
 # built libunwind isn't installed yet). For those cases, it'd be good to
-# link with --uwnindlib=none. Check if that option works.
+# link with --unwindlib=none. Check if that option works.
 llvm_check_compiler_linker_flag(C "--unwindlib=none" CXX_SUPPORTS_UNWINDLIB_EQ_NONE_FLAG)
 
 if (NOT LIBCXX_USE_COMPILER_RT)
@@ -27,6 +27,9 @@ if (NOT LIBCXX_USE_COMPILER_RT)
   endif()
 endif()
 
+check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG)
+check_cxx_compiler_flag(-nolibc CXX_SUPPORTS_NOLIBC_FLAG)
+
 # libc++ is using -nostdlib++ at the link step when available,
 # otherwise -nodefaultlibs is used. We want all our checks to also
 # use one of these options, otherwise we may end up with an inconsistency between
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 66bb19bb5b2cd..5c224689e0f9f 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -406,7 +406,8 @@ libc++ Feature Options
   Specify the path to a header that contains a custom implementation of the
   assertion handler that gets invoked when a hardening assertion fails. If
   provided, this header will be included by the library, replacing the
-  default assertion handler.
+  default assertion handler. If this is specified as a relative path, it
+  is assumed to be relative to ``<monorepo>/libcxx``.
 
 
 libc++ ABI Feature Options
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 1e347d043ef69..a1506e115fe70 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -290,7 +290,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_syncbuf``                                      ``201803L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_three_way_comparison``                         ``201711L``
+    ``__cpp_lib_three_way_comparison``                         ``201907L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_address``                                   ``201711L``
     ---------------------------------------------------------- -----------------
@@ -346,7 +346,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_optional``                                     ``202110L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_out_ptr``                                      *unimplemented*
+    ``__cpp_lib_out_ptr``                                      ``202106L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_print``                                        ``202207L``
     ---------------------------------------------------------- -----------------
@@ -360,6 +360,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_ranges_contains``                              ``202207L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_ranges_find_last``                             ``202207L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_ranges_iota``                                  *unimplemented*
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_ranges_join_with``                             *unimplemented*
@@ -450,7 +452,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_optional_range_support``                       *unimplemented*
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_out_ptr``                                      *unimplemented*
+    ``__cpp_lib_out_ptr``                                      ``202311L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_philox_engine``                                *unimplemented*
     ---------------------------------------------------------- -----------------
@@ -474,6 +476,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_sstream_from_string_view``                     ``202306L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_string_view``                                  ``202403L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_submdspan``                                    *unimplemented*
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_text_encoding``                                *unimplemented*
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 9aac059d27ecf..67791a5e55ac7 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -325,6 +325,22 @@ Vendors can use the following ABI options to enable additional hardening checks:
   - ``span``;
   - ``string_view``.
 
+- ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING`` -- changes the iterator type of
+  ``basic_string`` to a bounded iterator that keeps track of whether it's within
+  the bounds of the original container and asserts it on every dereference and
+  when performing iterator arithmetics.
+
+  ABI impact: changes the iterator type of ``basic_string`` and its
+  specializations, such as ``string`` and ``wstring``.
+
+- ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR`` -- changes the iterator type of
+  ``vector`` to a bounded iterator that keeps track of whether it's within the
+  bounds of the original container and asserts it on every dereference and when
+  performing iterator arithmetics. Note: this doesn't yet affect
+  ``vector<bool>``.
+
+  ABI impact: changes the iterator type of ``vector`` (except ``vector<bool>``).
+
 ABI tags
 --------
 
@@ -367,10 +383,10 @@ Hardened containers status
       - ❌
     * - ``vector``
       - ✅
-      - ❌
+      - ✅ (see note)
     * - ``string``
       - ✅
-      - ❌
+      - ✅ (see note)
     * - ``list``
       - ✅
       - ❌
@@ -429,6 +445,12 @@ Hardened containers status
       - ❌
       - N/A
 
+Note: for ``vector`` and ``string``, the iterator does not check for
+invalidation (accesses made via an invalidated iterator still lead to undefined
+behavior)
+
+Note: ``vector<bool>`` iterator is not currently hardened.
+
 Testing
 =======
 
diff --git a/libcxx/docs/ImplementationDefinedBehavior.rst b/libcxx/docs/ImplementationDefinedBehavior.rst
index 3000bb7cfa468..1f95de77db0eb 100644
--- a/libcxx/docs/ImplementationDefinedBehavior.rst
+++ b/libcxx/docs/ImplementationDefinedBehavior.rst
@@ -51,6 +51,25 @@ Libc++ determines that a stream is Unicode-capable terminal by:
   <http://eel.is/c++draft/print.fun#7>`_. This function is used for other
   ``std::print`` overloads that don't take an ``ostream&`` argument.
 
+`[sf.cmath] <https://wg21.link/sf.cmath>`_ Mathematical Special Functions: Large indices
+----------------------------------------------------------------------------------------
+
+Most functions within the Mathematical Special Functions section contain integral indices.
+The Standard specifies the result for larger indices as implementation-defined.
+Libc++ pursuits reasonable results by choosing the same formulas as for indices below that threshold.
+E.g.
+
+- ``std::hermite(unsigned n, T x)`` for ``n >= 128``
+
+
+`[stringbuf.cons] <http://eel.is/c++draft/stringbuf.cons>`_ Whether sequence pointers are initialized to null pointers
+----------------------------------------------------------------------------------------------------------------------
+
+Libc++ does not initialize the pointers to null pointers. It resizes the buffer
+to its capacity and uses that size. This means the SSO buffer of
+``std::string`` is used as initial output buffer.
+
+
 Listed in the index of implementation-defined behavior
 ======================================================
 
diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst
index 5d98ade8d9918..9a77a5c23f30b 100644
--- a/libcxx/docs/ReleaseNotes.rst
+++ b/libcxx/docs/ReleaseNotes.rst
@@ -1,11 +1,11 @@
-.. include:: ReleaseNotes/19.rst
+.. include:: ReleaseNotes/20.rst
 
 .. Make sure to reference the non-live release notes in a toctree to avoid Sphinx errors.
 .. toctree::
     :hidden:
 
     ReleaseNotes/18
-    ReleaseNotes/20
+    ReleaseNotes/19
 
 .. The release notes are in versioned files, but we make sure to keep publishing
 .. them in an unversioned ReleaseNotes.html page for external sites to reference.
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 80b9e18cec901..3d79def336a87 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -35,9 +35,25 @@ see the `releases page <https://llvm.org/releases/>`_.
 What's New in Libc++ 19.0.0?
 ==============================
 
+The main focus of the libc++ team has been to implement new C++20, C++23,
+and C++26 features.
+
+Experimental support for the time zone database has progressed.
+
+Work on the ranges support has progressed. See
+:ref:`ranges-status` for the current status.
+
+Work on the experimental C++17 Parallel STL has progressed. See
+:ref:`pstl-status` for the current status.
+
+Work on the C++17 mathematical special functions has started. See
+:ref:`special-math-status` for the current status.
+
 Implemented Papers
 ------------------
 
+- P1132R8 - ``out_ptr`` - a scalable output pointer abstraction
+- P1614R2 - The Mothership has Landed
 - P2637R3 - Member ``visit``
 - P2652R2 - Disallow User Specialization of ``allocator_traits``
 - P2819R2 - Add ``tuple`` protocol to ``complex``
@@ -46,7 +62,9 @@ Implemented Papers
 - P2872R3 - Remove ``wstring_convert`` From C++26
 - P3142R0 - Printing Blank Lines with ``println`` (as DR against C++23)
 - P2944R3 - Comparisons for ``reference_wrapper`` (comparison operators for ``reference_wrapper`` only)
+- P2591R5 - Concatenation of strings and string views
 - P2968R2 - Make ``std::ignore`` a first-class object
+- P2997R1 - Removing the common reference requirement from the indirectly invocable concepts (as DR against C++20)
 - P2302R4 - ``std::ranges::contains``
 - P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with``
 - P3029R1 - Better ``mdspan``'s CTAD
@@ -55,20 +73,32 @@ Implemented Papers
 - P2231R1 - Missing ``constexpr`` in ``std::optional`` and ``std::variant``
 - P0019R8 - ``std::atomic_ref``
 - P2389R2 - Alias template ``dims`` for the ``extents`` of ``mdspan``
+- P1223R5 - ``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``
+- P2602R2 - Poison Pills are Too Toxic
+- P1981R0 - Rename ``leap`` to ``leap_second``
+- P1982R0 - Rename ``link`` to ``time_zone_link``
+
 
 Improvements and New Features
 -----------------------------
 
 - The performance of growing ``std::vector`` has been improved for trivially relocatable types.
-- A lot of types are considered trivially relocatable now, including ``vector`` and ``string``.
-- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
+
+- A lot of types are considered trivially relocatable now, including ``std::vector`` and ``std::string``.
+
+- The performance of ``std::ranges::fill`` and ``std::ranges::fill_n`` has been improved for ``std::vector<bool>::iterator``\s,
   resulting in a performance increase of up to 1400x.
+
 - The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance
   improvements.
 
 - The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
   up to 100x.
 
+- The ``std::set_intersection`` and ``std::ranges::set_intersection`` algorithms have been optimized to fast-forward over
+  contiguous ranges of non-matching values, reducing the number of comparisons from linear to
+  logarithmic growth with the number of elements in best-case scenarios.
+
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
 
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT`` macro has been added to make the declarations in ``<locale>``
@@ -79,15 +109,32 @@ Improvements and New Features
 - ``std::ignore``\s ``const __ignore_t& operator=(_Tp&&) const`` was changed to
   ``const __ignore_type& operator=(const _Tp&) const noexcept`` for all language versions.
 
+- Vendors can now configure the ABI so that ``string`` and ``vector`` will use bounded iterators when hardening is
+  enabled. Note that checks for iterator invalidation are currently not supported -- any accesses made through an
+  invalidated bounded iterator will still result in undefined behavior (bounded iterators follow the normal invalidation
+  rules of the associated container). ``string`` bounded iterators use the logical size of the container (``index
+  < str.size()``) whereas ``vector`` bounded iterators use the "physical" size of the container (``index
+  < vec.capacity()``) which is a less strict check; refer to the implementation for further details.
+
+  Bounded iterators can be enabled via the ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING`` ABI macro for ``string`` and via
+  the ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR`` ABI macro for ``vector``; note that checks will only be performed if
+  the hardening mode is set to ``fast`` or above (i.e., no checking is performed in the unchecked mode, even if bounded
+  iterators are enabled in the ABI configuration).
+
+  Note: bounded iterators currently are not supported for ``vector<bool>``.
+
+- In C++23 and C++26 the number of transitive includes in several headers has been reduced, improving the compilation speed.
+
+
 Deprecations and Removals
 -------------------------
 
-- The C++20 synchronization library (``<barrier>``, ``<latch>``, ``atomic::wait``, etc.) has been deprecated
+- The C++20 synchronization library (``<barrier>``, ``<latch>``, ``std::atomic::wait``, etc.) has been deprecated
   in language modes prior to C++20. If you are using these features prior to C++20, please update to ``-std=c++20``.
   In LLVM 20, the C++20 synchronization library will be removed entirely in language modes prior to C++20.
 
 - ``_LIBCPP_DISABLE_NODISCARD_EXT`` has been removed. ``[[nodiscard]]`` applications are now unconditional.
-  This decision is based on LEWGs discussion on `P3122 <https://wg21.link/P3122>` and `P3162 <https://wg21.link/P3162>`
+  This decision is based on LEWGs discussion on `P3122 <https://wg21.link/P3122>`_ and `P3162 <https://wg21.link/P3162>`_
   to not use ``[[nodiscard]]`` in the standard.
 
 - The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode has been deprecated and setting
@@ -129,10 +176,11 @@ Deprecations and Removals
 - libc++ no longer supports ``std::allocator<const T>`` and containers of ``const``-qualified element type, such
   as ``std::vector<const T>`` and ``std::list<const T>``. This used to be supported as an undocumented extension.
   If you were using ``std::vector<const T>``, replace it with ``std::vector<T>`` instead. The
-  ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro can be defined to temporarily re-enable this extension as
-  folks transition their code. This macro will be honored for one released and ignored starting in LLVM 20.
+  ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro can be defined to temporarily re-enable this extension.
+  to temporarily re-enable this extension to make it easier to update user code
+  This macro will be honored for one released and ignored starting in LLVM 20.
   To assist with the clean-up process, consider running your code through Clang Tidy, with
-  `std-allocator-const <https://clang.llvm.org/extra/clang-tidy/checks/portability/std-allocator-const.html>`
+  `std-allocator-const <https://clang.llvm.org/extra/clang-tidy/checks/portability/std-allocator-const.html>`_
   enabled.
 
 - When configuring libc++ with localization or threads disabled, the library no longer emits an error when
@@ -141,6 +189,12 @@ Deprecations and Removals
   of randomness, and others. Users that were checking whether including a header would fail (e.g. via a script
   or CMake's ``try_compile`` will experience a change in behavior).
 
+- libc++ no longer supports relational comparison for ``std::chrono::weekday``. The relational comparison operators were
+  provided as an undocumented extension. If you were using relational comparison on ``std::chrono::weekday``, compare
+  the results of ``c_encoding()`` or ``iso_encoding()`` instead. The
+  ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro can be defined to temporarily re-enable this extension.
+  This macro will be honored for one release and ignored starting in LLVM 20.
+
 - The operators in the ``rel_ops`` namespace have been deprecated. The deprecation is part of the paper
   P0768R1 "Library Support for the Spaceship (Comparison) Operator".
 
@@ -155,9 +209,22 @@ LLVM 20
 
 - The C++20 synchronization library will be removed entirely in language modes prior to C++20 in LLVM 20.
 
+- The relational operators for ``std::chrono::weekday`` will be removed entirely, and the
+  ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be
+  ignored in LLVM 20.
+
+- The ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro will no longer have an effect.
+
+
 LLVM 21
 ~~~~~~~
-TODO
+
+- The status of the C++03 implementation will be frozen after the LLVM 21 release. This means that starting in LLVM 22, non-critical bug fixes may not be back-ported
+  to C++03, including LWG issues. C++03 is a legacy platform, where most projects are no longer actively maintained. To
+  reduce the amount of fixes required to keep such legacy projects compiling with up-to-date toolchains, libc++ will aim to freeze the status of the headers in C++03 mode to avoid unintended breaking changes.
+  See https://discourse.llvm.org/t/rfc-freezing-c-03-headers-in-libc for more details.
+
+  If you are using C++03 in your project, you should consider moving to a newer version of the Standard to get the most out of libc++.
 
 
 ABI Affecting Changes
@@ -174,7 +241,7 @@ Build System Changes
 - The ``LIBCXX_EXECUTOR`` and ``LIBCXXABI_EXECUTOR`` CMake variables have been removed. Please
   set ``LIBCXX_TEST_PARAMS`` to ``executor=<...>`` instead.
 
-- The Cmake variable ``LIBCXX_ENABLE_CLANG_TIDY`` has been removed. The build system has been changed
+- The CMake variable ``LIBCXX_ENABLE_CLANG_TIDY`` has been removed. The build system has been changed
   to automatically detect the presence of ``clang-tidy`` and the required ``Clang`` libraries.
 
 - The CMake options ``LIBCXX_INSTALL_MODULES`` now defaults to ``ON``.
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index 79b9788f92eda..960fdd7ce0562 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -53,18 +53,36 @@ Deprecations and Removals
 - TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable and the ``_LIBCPP_ENABLE_ASSERTIONS`` macro that were used to enable
   the safe mode will be removed in LLVM 20.
 
-- TODO: The C++20 synchronization library will be removed entirely in language modes prior to C++20 in LLVM 20.
+- Support for the C++20 synchronization library (``<barrier>``, ``<latch>``, ``atomic::wait``, etc.) has been
+  removed in language modes prior to C++20. If you are using these features prior to C++20, you will need to
+  update to ``-std=c++20``.
 
+- TODO: The relational operators for ``std::chrono::weekday`` will be removed entirely, and the
+  ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be
+  ignored in LLVM 20.
+
+- TODO: The ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro will no longer have an effect.
 
 Upcoming Deprecations and Removals
 ----------------------------------
 
-LLVM 21
+LLVM 20
 ~~~~~~~
 
 - TODO
 
 
+LLVM 21
+~~~~~~~
+
+- The status of the C++03 implementation will be frozen after the LLVM 21 release. This means that starting in LLVM 22, non-critical bug fixes may not be back-ported
+  to C++03, including LWG issues. C++03 is a legacy platform, where most projects are no longer actively maintained. To
+  reduce the amount of fixes required to keep such legacy projects compiling with up-to-date toolchains, libc++ will aim to freeze the status of the headers in C++03 mode to avoid unintended breaking changes.
+  See https://discourse.llvm.org/t/rfc-freezing-c-03-headers-in-libc for more details.
+
+  If you are using C++03 in your project, you should consider moving to a newer version of the Standard to get the most out of libc++.
+
+
 ABI Affecting Changes
 ---------------------
 
diff --git a/libcxx/docs/Status/Cxx17.rst b/libcxx/docs/Status/Cxx17.rst
index d4426afa81638..ad4f8576f03db 100644
--- a/libcxx/docs/Status/Cxx17.rst
+++ b/libcxx/docs/Status/Cxx17.rst
@@ -41,6 +41,7 @@ Paper Status
 .. note::
 
    .. [#note-P0067] P0067: ``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``.
+   .. [#note-P0226] P0226: Progress is tracked `here <https://https://libcxx.llvm.org/Status/SpecialMath.html>`_.
    .. [#note-P0607] P0607: The parts of P0607 that are not done are the ``<regex>`` bits.
    .. [#note-P0154] P0154: The required macros are only implemented as of clang 19.
    .. [#note-P0452] P0452: The changes to ``std::transform_inclusive_scan`` and ``std::transform_exclusive_scan`` have not yet been implemented.
diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv
index 2e560cfe0d576..6c657d51f5c7e 100644
--- a/libcxx/docs/Status/Cxx17Papers.csv
+++ b/libcxx/docs/Status/Cxx17Papers.csv
@@ -26,7 +26,7 @@
 "`P0013R1 <https://wg21.link/p0013r1>`__","LWG","Logical type traits rev 2","Kona","|Complete|","3.8"
 "","","","","",""
 "`P0024R2 <https://wg21.link/P0024R2>`__","LWG","The Parallelism TS Should be Standardized","Jacksonville","|Partial|",""
-"`P0226R1 <https://wg21.link/P0226R1>`__","LWG","Mathematical Special Functions for C++17","Jacksonville","",""
+"`P0226R1 <https://wg21.link/P0226R1>`__","LWG","Mathematical Special Functions for C++17","Jacksonville","|In Progress| [#note-P0226]_",""
 "`P0220R1 <https://wg21.link/P0220R1>`__","LWG","Adopt Library Fundamentals V1 TS Components for C++17","Jacksonville","|Complete|","16.0"
 "`P0218R1 <https://wg21.link/P0218R1>`__","LWG","Adopt the File System TS for C++17","Jacksonville","|Complete|","7.0"
 "`P0033R1 <https://wg21.link/P0033R1>`__","LWG","Re-enabling shared_from_this","Jacksonville","|Complete|","3.9"
diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst
index c00d6fb237286..b76e30fbb3712 100644
--- a/libcxx/docs/Status/Cxx20.rst
+++ b/libcxx/docs/Status/Cxx20.rst
@@ -48,6 +48,7 @@ Paper Status
    .. [#note-P0883.1] P0883: shared_ptr and floating-point changes weren't applied as they themselves aren't implemented yet.
    .. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0.
    .. [#note-P0660] P0660: The paper is implemented but the features are experimental and can be enabled via ``-fexperimental-library``.
+   .. [#note-P1614] P1614: ``std::strong_order(long double, long double)`` is partly implemented.
    .. [#note-P0355] P0355: The implementation status is:
 
       * ``Calendars`` mostly done in Clang 7
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 1a40a4472a405..1b76b0c5acfff 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -78,7 +78,7 @@
 "`2139 <https://wg21.link/LWG2139>`__","What is a user-defined type?","Rapperswil","",""
 "`2970 <https://wg21.link/LWG2970>`__","Return type of std::visit misspecified","Rapperswil","|Complete|","11.0"
 "`3058 <https://wg21.link/LWG3058>`__","Parallel adjacent_difference shouldn't require creating temporaries","Rapperswil","",""
-"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","",""
+"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","|Complete|","17.0"
 "`3067 <https://wg21.link/LWG3067>`__","recursive_directory_iterator::pop must invalidate","Rapperswil","|Nothing To Do|",""
 "`3071 <https://wg21.link/LWG3071>`__","[networking.ts] read_until still refers to ""input sequence""","Rapperswil","|Nothing To Do|",""
 "`3074 <https://wg21.link/LWG3074>`__","Non-member functions for valarray should only deduce from the valarray","Rapperswil","",""
@@ -101,7 +101,7 @@
 "`2936 <https://wg21.link/LWG2936>`__","Path comparison is defined in terms of the generic format","San Diego","|Complete|",""
 "`2943 <https://wg21.link/LWG2943>`__","Problematic specification of the wide version of ``basic_filebuf::open``\ ","San Diego","|Nothing To Do|",""
 "`2960 <https://wg21.link/LWG2960>`__","[fund.ts.v3] ``nonesuch``\  is insufficiently useless","San Diego","|Complete|",""
-"`2995 <https://wg21.link/LWG2995>`__","``basic_stringbuf``\  default constructor forbids it from using SSO capacity","San Diego","",""
+"`2995 <https://wg21.link/LWG2995>`__","``basic_stringbuf``\  default constructor forbids it from using SSO capacity","San Diego","|Complete|","20.0"
 "`2996 <https://wg21.link/LWG2996>`__","Missing rvalue overloads for ``shared_ptr``\  operations","San Diego","|Complete|","17.0"
 "`3008 <https://wg21.link/LWG3008>`__","``make_shared``\  (sub)object destruction semantics are not specified","San Diego","|Complete|","16.0"
 "`3022 <https://wg21.link/LWG3022>`__","``is_convertible<derived*, base*>``\  may lead to ODR","San Diego","Resolved by `P1285R0 <https://wg21.link/P1285R0>`__",""
@@ -124,7 +124,7 @@
 "`3137 <https://wg21.link/LWG3137>`__","Header for ``__cpp_lib_to_chars``\ ","San Diego","|Complete|",""
 "`3140 <https://wg21.link/LWG3140>`__","``COMMON_REF``\  is unimplementable as specified","San Diego","|Nothing To Do|",""
 "`3145 <https://wg21.link/LWG3145>`__","``file_clock``\  breaks ABI for C++17 implementations","San Diego","|Complete|",""
-"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","",""
+"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","|Nothing To Do|",""
 "`3148 <https://wg21.link/LWG3148>`__","``<concepts>``\  should be freestanding","San Diego","",""
 "`3153 <https://wg21.link/LWG3153>`__","``Common``\  and ``common_type``\  have too little in common","San Diego","|Complete|","13.0"
 "`3154 <https://wg21.link/LWG3154>`__","``Common``\  and ``CommonReference``\  have a common defect","San Diego","|Nothing To Do|",""
@@ -155,7 +155,7 @@
 "`3191 <https://wg21.link/LWG3191>`__","``std::ranges::shuffle``\  synopsis does not match algorithm definition","Cologne","|Complete|","15.0","|ranges|"
 "`3196 <https://wg21.link/LWG3196>`__","``std::optional<T>``\  is ill-formed is ``T``\  is an array","Cologne","|Complete|",""
 "`3198 <https://wg21.link/LWG3198>`__","Bad constraint on ``std::span::span()``\ ","Cologne","|Complete|",""
-"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\  fails","Cologne","",""
+"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\  fails","Cologne","|Complete|","10.0"
 "`3202 <https://wg21.link/LWG3202>`__","P0318R1 was supposed to be revised","Cologne","|Complete|",""
 "`3206 <https://wg21.link/LWG3206>`__","``year_month_day``\  conversion to ``sys_days``\  uses not-existing member function","Cologne","|Complete|",""
 "`3208 <https://wg21.link/LWG3208>`__","``Boolean``\ 's expression requirements are ordered inconsistently","Cologne","|Nothing To Do|",""
@@ -233,7 +233,7 @@
 "`3302 <https://wg21.link/LWG3302>`__","Range adaptor objects ``keys``\  and ``values``\  are unspecified","Prague","|Complete|","16.0","|ranges|"
 "`3303 <https://wg21.link/LWG3303>`__","Bad ""``constexpr``\ "" marker for ``destroy/destroy_n``\ ","Prague","",""
 "`3304 <https://wg21.link/LWG3304>`__","Allocate functions of ``std::polymorphic_allocator``\  should require ``[[nodiscard]]``\ ","Prague","|Complete|","16.0"
-"`3307 <https://wg21.link/LWG3307>`__","``std::allocator<void>().allocate(n)``\ ","Prague","",""
+"`3307 <https://wg21.link/LWG3307>`__","``std::allocator<void>().allocate(n)``\ ","Prague","|Complete|","20.0"
 "`3310 <https://wg21.link/LWG3310>`__","Replace ``SIZE_MAX``\  with ``numeric_limits<size_t>::max()``\ ","Prague","|Complete|","16.0"
 "`3313 <https://wg21.link/LWG3313>`__","``join_view::iterator::operator--``\  is incorrectly constrained","Prague","|Complete|","14.0","|ranges|"
 "`3314 <https://wg21.link/LWG3314>`__","Is stream insertion behavior locale dependent when ``Period::type``\  is ``micro``\ ?","Prague","|Complete|","16.0","|chrono|"
@@ -249,7 +249,7 @@
 "`3325 <https://wg21.link/LWG3325>`__","Constrain return type of transformation function for ``transform_view``\ ","Prague","|Complete|","15.0","|ranges|"
 "`3326 <https://wg21.link/LWG3326>`__","``enable_view``\  has false positives","Prague","|Complete|","15.0","|ranges|"
 "`3327 <https://wg21.link/LWG3327>`__","Format alignment specifiers vs. text direction","Prague","|Nothing To Do|","","|format|"
-"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\  is not good for UTF-8","Prague","",""
+"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\  is not good for UTF-8","Prague","|Nothing To Do|",""
 "`3329 <https://wg21.link/LWG3329>`__","``totally_ordered_with``\  both directly and indirectly requires ``common_reference_with``\ ","Prague","|Complete|","13.0"
 "`3330 <https://wg21.link/LWG3330>`__","Include ``<compare>``\  from most library headers","Prague","|Complete|","13.0","|spaceship|"
 "`3331 <https://wg21.link/LWG3331>`__","Define ``totally_ordered/_with``\  in terms of ``partially-ordered-with``\ ","Prague","|Complete|","13.0"
@@ -264,14 +264,14 @@
 "`3349 <https://wg21.link/LWG3349>`__","Missing ``__cpp_lib_constexpr_complex``\  for P0415R1","Prague","|Complete|","16.0"
 "`3350 <https://wg21.link/LWG3350>`__","Simplify return type of ``lexicographical_compare_three_way``\ ","Prague","|Complete|","17.0","|spaceship|"
 "`3351 <https://wg21.link/LWG3351>`__","``ranges::enable_safe_range``\  should not be constrained","Prague","|Complete|","15.0","|ranges|"
-"`3352 <https://wg21.link/LWG3352>`__","``strong_equality``\  isn't a thing","Prague","|Nothing To Do|","","|spaceship|"
+"`3352 <https://wg21.link/LWG3352>`__","``strong_equality``\  isn't a thing","Prague","|Complete|","19.0","|spaceship|"
 "`3354 <https://wg21.link/LWG3354>`__","``has_strong_structural_equality``\  has a meaningless definition","Prague","|Nothing To Do|","","|spaceship|"
 "`3355 <https://wg21.link/LWG3355>`__","The memory algorithms should support move-only input iterators introduced by P1207","Prague","|Complete|","15.0","|ranges|"
 "`3356 <https://wg21.link/LWG3356>`__","``__cpp_lib_nothrow_convertible``\  should be ``__cpp_lib_is_nothrow_convertible``\ ","Prague","|Complete|","12.0"
 "`3358 <https://wg21.link/LWG3358>`__","|sect|\ [span.cons] is mistaken that ``to_address``\  can throw","Prague","|Complete|","17.0"
 "`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\  leap second support should allow for negative leap seconds","Prague","|In Progress|","","|chrono|"
 "`3360 <https://wg21.link/LWG3360>`__","``three_way_comparable_with``\  is inconsistent with similar concepts","Prague","|Nothing To Do|","","|spaceship|"
-"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","",""
+"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","|Complete|","17.0"
 "`3363 <https://wg21.link/LWG3363>`__","``drop_while_view``\  should opt-out of ``sized_range``\ ","Prague","|Nothing To Do|","","|ranges|"
 "`3364 <https://wg21.link/LWG3364>`__","Initialize data members of ranges and their iterators","Prague","|Complete|","16.0","|ranges|"
 "`3367 <https://wg21.link/LWG3367>`__","Integer-class conversions should not throw","Prague","|Nothing To Do|",""
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 34fc5586f74d9..858c7897e9351 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -123,7 +123,7 @@
 "`P1522R1 <https://wg21.link/P1522R1>`__","LWG","Iterator Difference Type and Integer Overflow","Cologne","|Complete|","15.0","|ranges|"
 "`P1523R1 <https://wg21.link/P1523R1>`__","LWG","Views and Size Types","Cologne","|Complete|","15.0","|ranges|"
 "`P1612R1 <https://wg21.link/P1612R1>`__","LWG","Relocate Endian's Specification","Cologne","|Complete|","10.0"
-"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|In Progress|",""
+"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|Complete| [#note-P1614]_","19.0"
 "`P1638R1 <https://wg21.link/P1638R1>`__","LWG","basic_istream_view::iterator should not be copyable","Cologne","|Complete|","16.0","|ranges|"
 "`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","|Complete|","19.0"
 "`P1644R0 <https://wg21.link/P1644R0>`__","LWG","Add wait/notify to atomic<shared_ptr>","Cologne","",""
@@ -171,7 +171,6 @@
 "`P1739R4 <https://wg21.link/P1739R4>`__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","|Complete|","15.0","|ranges|"
 "`P1831R1 <https://wg21.link/P1831R1>`__","LWG","Deprecating volatile: library","Prague","* *",""
 "`P1868R2 <https://wg21.link/P1868R2>`__","LWG","width: clarifying units of width and precision in std::format","Prague","|Complete|","14.0"
-"`P1937R2 <https://wg21.link/P1937R2>`__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *",""
 "`P1956R1 <https://wg21.link/P1956R1>`__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0"
 "`P1957R2 <https://wg21.link/P1957R2>`__","CWG","Converting from ``T*``\  to bool should be considered narrowing (re: US 212)","Prague","|Complete|","18.0"
 "`P1963R0 <https://wg21.link/P1963R0>`__","LWG","Fixing US 313","Prague","* *","",""
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index cc601b3cd3c96..11b2c528da208 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -60,82 +60,82 @@
 "`3502 <https://wg21.link/LWG3502>`__","``elements_view`` should not be allowed to return dangling reference","February 2021","|Complete|","16.0","|ranges|"
 "`3505 <https://wg21.link/LWG3505>`__","``split_view::outer-iterator::operator++`` misspecified","February 2021","","","|ranges|"
 "","","","","",""
-`2774 <https://wg21.link/LWG2774>`__,"``std::function`` construction vs assignment","June 2021","",""
-`2818 <https://wg21.link/LWG2818>`__,"``::std::`` everywhere rule needs tweaking","June 2021","|Nothing To Do|",""
-`2997 <https://wg21.link/LWG2997>`__,"LWG 491 and the specification of ``{forward_,}list::unique``","June 2021","",""
-`3410 <https://wg21.link/LWG3410>`__,"``lexicographical_compare_three_way`` is overspecified","June 2021","|Complete|","17.0","|spaceship|"
-`3430 <https://wg21.link/LWG3430>`__,"``std::fstream`` & co. should be constructible from string_view","June 2021","|Complete|","19.0",""
-`3462 <https://wg21.link/LWG3462>`__,"§[formatter.requirements]: Formatter requirements forbid use of ``fc.arg()``","June 2021","|Nothing To Do|","","|format|"
-`3481 <https://wg21.link/LWG3481>`__,"``viewable_range`` mishandles lvalue move-only views","June 2021","Superseded by `P2415R2 <https://wg21.link/P2415R2>`__","","|ranges|"
-`3506 <https://wg21.link/LWG3506>`__,"Missing allocator-extended constructors for ``priority_queue``","June 2021","|Complete|","14.0"
-`3517 <https://wg21.link/LWG3517>`__,"``join_view::iterator``'s ``iter_swap`` is underconstrained","June 2021","|Complete|","14.0","|ranges|"
-`3518 <https://wg21.link/LWG3518>`__,"Exception requirements on char trait operations unclear","June 2021","|Nothing To Do|",""
-`3519 <https://wg21.link/LWG3519>`__,"Incomplete synopses for ``<random>`` classes","June 2021","",""
-`3520 <https://wg21.link/LWG3520>`__,"``iter_move`` and ``iter_swap`` are inconsistent for ``transform_view::iterator``","June 2021","|Complete|","14.0","|ranges|"
-`3521 <https://wg21.link/LWG3521>`__,"Overly strict requirements on ``qsort`` and ``bsearch``","June 2021","|Nothing To Do|",""
-`3522 <https://wg21.link/LWG3522>`__,"Missing requirement on ``InputIterator`` template parameter for ``priority_queue`` constructors","June 2021","|Complete|","14.0"
-`3523 <https://wg21.link/LWG3523>`__,"``iota_view::sentinel`` is not always ``iota_view``'s sentinel","June 2021","","","|ranges|"
-`3526 <https://wg21.link/LWG3526>`__,"Return types of ``uses_allocator_construction_args`` unspecified","June 2021","",""
-`3527 <https://wg21.link/LWG3527>`__,"``uses_allocator_construction_args`` handles rvalue pairs of rvalue references incorrectly","June 2021","",""
-`3528 <https://wg21.link/LWG3528>`__,"``make_from_tuple`` can perform (the equivalent of) a C-style cast","June 2021","|Complete|","19.0"
-`3529 <https://wg21.link/LWG3529>`__,"``priority_queue(first, last)`` should construct ``c`` with ``(first, last)``","June 2021","|Complete|","14.0"
-`3530 <https://wg21.link/LWG3530>`__,"``BUILTIN-PTR-MEOW`` should not opt the type out of syntactic checks","June 2021","",""
-`3532 <https://wg21.link/LWG3532>`__,"``split_view<V, P>::inner-iterator<true>::operator++(int)`` should depend on ``Base``","June 2021","","","|ranges|"
-`3533 <https://wg21.link/LWG3533>`__,"Make ``base() const &`` consistent across iterator wrappers that supports ``input_iterators``","June 2021","|Complete|","14.0","|ranges|"
-`3536 <https://wg21.link/LWG3536>`__,"Should ``chrono::from_stream()`` assign zero to duration for failure?","June 2021","","","|chrono|"
-`3539 <https://wg21.link/LWG3539>`__,"``format_to`` must not copy models of ``output_iterator<const charT&>``","June 2021","|Complete|","14.0","|format|"
-`3540 <https://wg21.link/LWG3540>`__,"§[format.arg] There should be no const in ``basic_format_arg(const T* p)``","June 2021","|Complete|","14.0","|format|"
-`3541 <https://wg21.link/LWG3541>`__,"``indirectly_readable_traits`` should be SFINAE-friendly for all types","June 2021","|Complete|","14.0","|ranges|"
-`3542 <https://wg21.link/LWG3542>`__,"``basic_format_arg`` mishandles ``basic_string_view`` with custom traits","June 2021","|Complete|","14.0","|format|"
-`3543 <https://wg21.link/LWG3543>`__,"Definition of when ``counted_iterators`` refer to the same sequence isn't quite right","June 2021","|Nothing To Do|","","|ranges|"
-`3544 <https://wg21.link/LWG3544>`__,"``format-arg-store::args`` is unintentionally not exposition-only","June 2021","|Complete|","14.0","|format|"
-`3546 <https://wg21.link/LWG3546>`__,"``common_iterator``'s postfix-proxy is not quite right","June 2021","","","|ranges|"
-`3548 <https://wg21.link/LWG3548>`__,"``shared_ptr`` construction from ``unique_ptr`` should move (not copy) the deleter","June 2021","|Complete|","15.0"
-`3549 <https://wg21.link/LWG3549>`__,"``view_interface`` is overspecified to derive from ``view_base``","June 2021","|Complete|","14.0","|ranges|"
-`3551 <https://wg21.link/LWG3551>`__,"``borrowed_{iterator,subrange}_t`` are overspecified","June 2021","|Nothing To Do|","","|ranges|"
-`3552 <https://wg21.link/LWG3552>`__,"Parallel specialized memory algorithms should require forward iterators","June 2021","",""
-`3553 <https://wg21.link/LWG3553>`__,"Useless constraint in ``split_view::outer-iterator::value_type::begin()``","June 2021","","","|ranges|"
-`3555 <https://wg21.link/LWG3555>`__,"``{transform,elements}_view::iterator::iterator_concept`` should consider const-qualification of the underlying range","June 2021","","","|ranges|"
+"`2774 <https://wg21.link/LWG2774>`__","``std::function`` construction vs assignment","June 2021","",""
+"`2818 <https://wg21.link/LWG2818>`__","``::std::`` everywhere rule needs tweaking","June 2021","|Nothing To Do|",""
+"`2997 <https://wg21.link/LWG2997>`__","LWG 491 and the specification of ``{forward_,}list::unique``","June 2021","",""
+"`3410 <https://wg21.link/LWG3410>`__","``lexicographical_compare_three_way`` is overspecified","June 2021","|Complete|","17.0","|spaceship|"
+"`3430 <https://wg21.link/LWG3430>`__","``std::fstream`` & co. should be constructible from string_view","June 2021","|Complete|","19.0",""
+"`3462 <https://wg21.link/LWG3462>`__","§[formatter.requirements]: Formatter requirements forbid use of ``fc.arg()``","June 2021","|Nothing To Do|","","|format|"
+"`3481 <https://wg21.link/LWG3481>`__","``viewable_range`` mishandles lvalue move-only views","June 2021","Superseded by `P2415R2 <https://wg21.link/P2415R2>`__","","|ranges|"
+"`3506 <https://wg21.link/LWG3506>`__","Missing allocator-extended constructors for ``priority_queue``","June 2021","|Complete|","14.0"
+"`3517 <https://wg21.link/LWG3517>`__","``join_view::iterator``'s ``iter_swap`` is underconstrained","June 2021","|Complete|","14.0","|ranges|"
+"`3518 <https://wg21.link/LWG3518>`__","Exception requirements on char trait operations unclear","June 2021","|Nothing To Do|",""
+"`3519 <https://wg21.link/LWG3519>`__","Incomplete synopses for ``<random>`` classes","June 2021","",""
+"`3520 <https://wg21.link/LWG3520>`__","``iter_move`` and ``iter_swap`` are inconsistent for ``transform_view::iterator``","June 2021","|Complete|","14.0","|ranges|"
+"`3521 <https://wg21.link/LWG3521>`__","Overly strict requirements on ``qsort`` and ``bsearch``","June 2021","|Nothing To Do|",""
+"`3522 <https://wg21.link/LWG3522>`__","Missing requirement on ``InputIterator`` template parameter for ``priority_queue`` constructors","June 2021","|Complete|","14.0"
+"`3523 <https://wg21.link/LWG3523>`__","``iota_view::sentinel`` is not always ``iota_view``'s sentinel","June 2021","","","|ranges|"
+"`3526 <https://wg21.link/LWG3526>`__","Return types of ``uses_allocator_construction_args`` unspecified","June 2021","",""
+"`3527 <https://wg21.link/LWG3527>`__","``uses_allocator_construction_args`` handles rvalue pairs of rvalue references incorrectly","June 2021","",""
+"`3528 <https://wg21.link/LWG3528>`__","``make_from_tuple`` can perform (the equivalent of) a C-style cast","June 2021","|Complete|","19.0"
+"`3529 <https://wg21.link/LWG3529>`__","``priority_queue(first, last)`` should construct ``c`` with ``(first, last)``","June 2021","|Complete|","14.0"
+"`3530 <https://wg21.link/LWG3530>`__","``BUILTIN-PTR-MEOW`` should not opt the type out of syntactic checks","June 2021","",""
+"`3532 <https://wg21.link/LWG3532>`__","``split_view<V, P>::inner-iterator<true>::operator++(int)`` should depend on ``Base``","June 2021","","","|ranges|"
+"`3533 <https://wg21.link/LWG3533>`__","Make ``base() const &`` consistent across iterator wrappers that supports ``input_iterators``","June 2021","|Complete|","14.0","|ranges|"
+"`3536 <https://wg21.link/LWG3536>`__","Should ``chrono::from_stream()`` assign zero to duration for failure?","June 2021","","","|chrono|"
+"`3539 <https://wg21.link/LWG3539>`__","``format_to`` must not copy models of ``output_iterator<const charT&>``","June 2021","|Complete|","14.0","|format|"
+"`3540 <https://wg21.link/LWG3540>`__","§[format.arg] There should be no const in ``basic_format_arg(const T* p)``","June 2021","|Complete|","14.0","|format|"
+"`3541 <https://wg21.link/LWG3541>`__","``indirectly_readable_traits`` should be SFINAE-friendly for all types","June 2021","|Complete|","14.0","|ranges|"
+"`3542 <https://wg21.link/LWG3542>`__","``basic_format_arg`` mishandles ``basic_string_view`` with custom traits","June 2021","|Complete|","14.0","|format|"
+"`3543 <https://wg21.link/LWG3543>`__","Definition of when ``counted_iterators`` refer to the same sequence isn't quite right","June 2021","|Nothing To Do|","","|ranges|"
+"`3544 <https://wg21.link/LWG3544>`__","``format-arg-store::args`` is unintentionally not exposition-only","June 2021","|Complete|","14.0","|format|"
+"`3546 <https://wg21.link/LWG3546>`__","``common_iterator``'s postfix-proxy is not quite right","June 2021","","","|ranges|"
+"`3548 <https://wg21.link/LWG3548>`__","``shared_ptr`` construction from ``unique_ptr`` should move (not copy) the deleter","June 2021","|Complete|","15.0"
+"`3549 <https://wg21.link/LWG3549>`__","``view_interface`` is overspecified to derive from ``view_base``","June 2021","|Complete|","14.0","|ranges|"
+"`3551 <https://wg21.link/LWG3551>`__","``borrowed_{iterator,subrange}_t`` are overspecified","June 2021","|Nothing To Do|","","|ranges|"
+"`3552 <https://wg21.link/LWG3552>`__","Parallel specialized memory algorithms should require forward iterators","June 2021","",""
+"`3553 <https://wg21.link/LWG3553>`__","Useless constraint in ``split_view::outer-iterator::value_type::begin()``","June 2021","","","|ranges|"
+"`3555 <https://wg21.link/LWG3555>`__","``{transform,elements}_view::iterator::iterator_concept`` should consider const-qualification of the underlying range","June 2021","","","|ranges|"
 "","","","","",""
-`2191 <https://wg21.link/LWG2191>`__,"Incorrect specification of ``match_results(match_results&&)``","October 2021","|Nothing To Do|",""
-`2381 <https://wg21.link/LWG2381>`__,"Inconsistency in parsing floating point numbers","October 2021","|Complete|","19.0"
-`2762 <https://wg21.link/LWG2762>`__,"``unique_ptr operator*()`` should be ``noexcept``","October 2021","",""
-`3121 <https://wg21.link/LWG3121>`__,"``tuple`` constructor constraints for ``UTypes&&...`` overloads","October 2021","",""
-`3123 <https://wg21.link/LWG3123>`__,"``duration`` constructor from representation shouldn't be effectively non-throwing","October 2021","","","|chrono|"
-`3146 <https://wg21.link/LWG3146>`__,"Excessive unwrapping in ``std::ref/cref``","October 2021","|Complete|","14.0"
-`3152 <https://wg21.link/LWG3152>`__,"``common_type`` and ``common_reference`` have flaws in common","October 2021","",""
-`3293 <https://wg21.link/LWG3293>`__,"``move_iterator operator+()`` has incorrect constraints","October 2021","|Complete|","15.0"
-`3361 <https://wg21.link/LWG3361>`__,"``safe_range<SomeRange&>`` case","October 2021","|Nothing To Do|","","|ranges|"
-`3392 <https://wg21.link/LWG3392>`__,"``ranges::distance()`` cannot be used on a move-only iterator with a sized sentinel","October 2021","|Complete|","14.0","|ranges|"
-`3407 <https://wg21.link/LWG3407>`__,"Some problems with the wording changes of P1739R4","October 2021","|Complete|","15.0","|ranges|"
-`3422 <https://wg21.link/LWG3422>`__,"Issues of ``seed_seq``'s constructors","October 2021","|Complete|","14.0"
-`3470 <https://wg21.link/LWG3470>`__,"``convertible-to-non-slicing`` seems to reject valid case","October 2021","|Complete|","14.0","|ranges|"
-`3480 <https://wg21.link/LWG3480>`__,"``directory_iterator`` and ``recursive_directory_iterator`` are not C++20 ranges","October 2021","|Complete|","14.0","|ranges|"
-`3498 <https://wg21.link/LWG3498>`__,"Inconsistent ``noexcept``-specifiers for ``basic_syncbuf``","October 2021","",""
-`3535 <https://wg21.link/LWG3535>`__,"``join_view::iterator::iterator_category`` and ``::iterator_concept`` lie","October 2021","|Complete|","15.0","|ranges|"
-`3554 <https://wg21.link/LWG3554>`__,"``chrono::parse`` needs ``const charT*`` and ``basic_string_view<charT>`` overloads","October 2021","","","|chrono|"
-`3557 <https://wg21.link/LWG3557>`__,"The ``static_cast`` expression in ``convertible_to`` has the wrong operand","October 2021","|Complete|","14.0"
-`3559 <https://wg21.link/LWG3559>`__,"Semantic requirements of ``sized_range`` is circular","October 2021","|Nothing To Do|","","|ranges|"
-`3560 <https://wg21.link/LWG3560>`__,"``ranges::equal`` and ``ranges::is_permutation`` should short-circuit for ``sized_ranges``","October 2021","","","|ranges|"
-`3561 <https://wg21.link/LWG3561>`__,"Issue with internal counter in ``discard_block_engine``","October 2021","",""
-`3563 <https://wg21.link/LWG3563>`__,"``keys_view`` example is broken","October 2021","","","|ranges|"
-`3566 <https://wg21.link/LWG3566>`__,"Constraint recursion for ``operator<=>(optional<T>, U)``","October 2021","|Complete|","17.0","|spaceship|"
-`3567 <https://wg21.link/LWG3567>`__,"Formatting move-only iterators take two","October 2021","|Complete|","16.0","|format| |ranges|"
-`3568 <https://wg21.link/LWG3568>`__,"``basic_istream_view`` needs to initialize ``value_``","October 2021","|Complete|","16.0","|ranges|"
-`3570 <https://wg21.link/LWG3570>`__,"``basic_osyncstream::emit`` should be an unformatted output function","October 2021","|Complete|","18.0"
-`3571 <https://wg21.link/LWG3571>`__,"``flush_emit`` should set ``badbit`` if the ``emit`` call fails","October 2021","",""
-`3572 <https://wg21.link/LWG3572>`__,"``copyable-box`` should be fully ``constexpr``","October 2021","|Complete|","14.0","|ranges|"
-`3573 <https://wg21.link/LWG3573>`__,"Missing Throws element for ``basic_string_view(It begin, End end)``","October 2021","|Complete|","14.0"
-`3574 <https://wg21.link/LWG3574>`__,"``common_iterator`` should be completely ``constexpr``-able","October 2021","|Complete|","14.0","|ranges|"
-`3580 <https://wg21.link/LWG3580>`__,"``iota_view``'s ``iterator``'s binary ``operator+`` should be improved","October 2021","|Complete|","14.0","|ranges|"
-`3581 <https://wg21.link/LWG3581>`__,"The range constructor makes ``basic_string_view`` not trivially move constructible","October 2021","|Complete|","14.0","|ranges|"
-`3585 <https://wg21.link/LWG3585>`__,"``variant`` converting assignment with immovable alternative","October 2021","",""
-`3589 <https://wg21.link/LWG3589>`__,"The ``const`` lvalue reference overload of ``get`` for ``subrange`` does not constrain ``I`` to be ``copyable`` when ``N == 0``","October 2021","|Complete|","14.0","|ranges|"
-`3590 <https://wg21.link/LWG3590>`__,"``split_view::base() const &`` is overconstrained","October 2021","|Complete|","16.0","|ranges|"
-`3591 <https://wg21.link/LWG3591>`__,"``lazy_split_view<input_view>::inner-iterator::base() &&`` invalidates outer iterators","October 2021","","","|ranges|"
-`3592 <https://wg21.link/LWG3592>`__,"``lazy_split_view`` needs to check the simpleness of Pattern","October 2021","","","|ranges|"
-`3593 <https://wg21.link/LWG3593>`__,"Several iterators' ``base() const &`` and ``lazy_split_view::outer-iterator::value_type::end()`` missing ``noexcept``","October 2021","","","|ranges|"
-`3595 <https://wg21.link/LWG3595>`__,"Exposition-only classes proxy and postfix-proxy for ``common_iterator`` should be fully ``constexpr``","October 2021","|Complete|","14.0","|ranges|"
+"`2191 <https://wg21.link/LWG2191>`__","Incorrect specification of ``match_results(match_results&&)``","October 2021","|Nothing To Do|",""
+"`2381 <https://wg21.link/LWG2381>`__","Inconsistency in parsing floating point numbers","October 2021","|Complete|","19.0"
+"`2762 <https://wg21.link/LWG2762>`__","``unique_ptr operator*()`` should be ``noexcept``","October 2021","|Complete|","19.0"
+"`3121 <https://wg21.link/LWG3121>`__","``tuple`` constructor constraints for ``UTypes&&...`` overloads","October 2021","",""
+"`3123 <https://wg21.link/LWG3123>`__","``duration`` constructor from representation shouldn't be effectively non-throwing","October 2021","","","|chrono|"
+"`3146 <https://wg21.link/LWG3146>`__","Excessive unwrapping in ``std::ref/cref``","October 2021","|Complete|","14.0"
+"`3152 <https://wg21.link/LWG3152>`__","``common_type`` and ``common_reference`` have flaws in common","October 2021","",""
+"`3293 <https://wg21.link/LWG3293>`__","``move_iterator operator+()`` has incorrect constraints","October 2021","|Complete|","15.0"
+"`3361 <https://wg21.link/LWG3361>`__","``safe_range<SomeRange&>`` case","October 2021","|Nothing To Do|","","|ranges|"
+"`3392 <https://wg21.link/LWG3392>`__","``ranges::distance()`` cannot be used on a move-only iterator with a sized sentinel","October 2021","|Complete|","14.0","|ranges|"
+"`3407 <https://wg21.link/LWG3407>`__","Some problems with the wording changes of P1739R4","October 2021","|Complete|","15.0","|ranges|"
+"`3422 <https://wg21.link/LWG3422>`__","Issues of ``seed_seq``'s constructors","October 2021","|Complete|","14.0"
+"`3470 <https://wg21.link/LWG3470>`__","``convertible-to-non-slicing`` seems to reject valid case","October 2021","|Complete|","14.0","|ranges|"
+"`3480 <https://wg21.link/LWG3480>`__","``directory_iterator`` and ``recursive_directory_iterator`` are not C++20 ranges","October 2021","|Complete|","14.0","|ranges|"
+"`3498 <https://wg21.link/LWG3498>`__","Inconsistent ``noexcept``-specifiers for ``basic_syncbuf``","October 2021","",""
+"`3535 <https://wg21.link/LWG3535>`__","``join_view::iterator::iterator_category`` and ``::iterator_concept`` lie","October 2021","|Complete|","15.0","|ranges|"
+"`3554 <https://wg21.link/LWG3554>`__","``chrono::parse`` needs ``const charT*`` and ``basic_string_view<charT>`` overloads","October 2021","","","|chrono|"
+"`3557 <https://wg21.link/LWG3557>`__","The ``static_cast`` expression in ``convertible_to`` has the wrong operand","October 2021","|Complete|","14.0"
+"`3559 <https://wg21.link/LWG3559>`__","Semantic requirements of ``sized_range`` is circular","October 2021","|Nothing To Do|","","|ranges|"
+"`3560 <https://wg21.link/LWG3560>`__","``ranges::equal`` and ``ranges::is_permutation`` should short-circuit for ``sized_ranges``","October 2021","","","|ranges|"
+"`3561 <https://wg21.link/LWG3561>`__","Issue with internal counter in ``discard_block_engine``","October 2021","",""
+"`3563 <https://wg21.link/LWG3563>`__","``keys_view`` example is broken","October 2021","","","|ranges|"
+"`3566 <https://wg21.link/LWG3566>`__","Constraint recursion for ``operator<=>(optional<T>, U)``","October 2021","|Complete|","17.0","|spaceship|"
+"`3567 <https://wg21.link/LWG3567>`__","Formatting move-only iterators take two","October 2021","|Complete|","16.0","|format| |ranges|"
+"`3568 <https://wg21.link/LWG3568>`__","``basic_istream_view`` needs to initialize ``value_``","October 2021","|Complete|","16.0","|ranges|"
+"`3570 <https://wg21.link/LWG3570>`__","``basic_osyncstream::emit`` should be an unformatted output function","October 2021","|Complete|","18.0"
+"`3571 <https://wg21.link/LWG3571>`__","``flush_emit`` should set ``badbit`` if the ``emit`` call fails","October 2021","",""
+"`3572 <https://wg21.link/LWG3572>`__","``copyable-box`` should be fully ``constexpr``","October 2021","|Complete|","14.0","|ranges|"
+"`3573 <https://wg21.link/LWG3573>`__","Missing Throws element for ``basic_string_view(It begin, End end)``","October 2021","|Complete|","14.0"
+"`3574 <https://wg21.link/LWG3574>`__","``common_iterator`` should be completely ``constexpr``-able","October 2021","|Complete|","14.0","|ranges|"
+"`3580 <https://wg21.link/LWG3580>`__","``iota_view``'s ``iterator``'s binary ``operator+`` should be improved","October 2021","|Complete|","14.0","|ranges|"
+"`3581 <https://wg21.link/LWG3581>`__","The range constructor makes ``basic_string_view`` not trivially move constructible","October 2021","|Complete|","14.0","|ranges|"
+"`3585 <https://wg21.link/LWG3585>`__","``variant`` converting assignment with immovable alternative","October 2021","",""
+"`3589 <https://wg21.link/LWG3589>`__","The ``const`` lvalue reference overload of ``get`` for ``subrange`` does not constrain ``I`` to be ``copyable`` when ``N == 0``","October 2021","|Complete|","14.0","|ranges|"
+"`3590 <https://wg21.link/LWG3590>`__","``split_view::base() const &`` is overconstrained","October 2021","|Complete|","16.0","|ranges|"
+"`3591 <https://wg21.link/LWG3591>`__","``lazy_split_view<input_view>::inner-iterator::base() &&`` invalidates outer iterators","October 2021","","","|ranges|"
+"`3592 <https://wg21.link/LWG3592>`__","``lazy_split_view`` needs to check the simpleness of Pattern","October 2021","","","|ranges|"
+"`3593 <https://wg21.link/LWG3593>`__","Several iterators' ``base() const &`` and ``lazy_split_view::outer-iterator::value_type::end()`` missing ``noexcept``","October 2021","","","|ranges|"
+"`3595 <https://wg21.link/LWG3595>`__","Exposition-only classes proxy and postfix-proxy for ``common_iterator`` should be fully ``constexpr``","October 2021","|Complete|","14.0","|ranges|"
 "","","","","",""
 "`3088 <https://wg21.link/LWG3088>`__","``forward_list::merge`` behaviour unclear when passed ``*this``","February 2022","",""
 "`3471 <https://wg21.link/LWG3471>`__","``polymorphic_allocator::allocate`` does not satisfy ``Cpp17Allocator`` requirements","February 2022","",""
@@ -145,8 +145,8 @@
 "`3607 <https://wg21.link/LWG3607>`__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","February 2022","|Nothing to do|","","|ranges|"
 "`3610 <https://wg21.link/LWG3610>`__","``iota_view::size`` sometimes rejects integer-class types","February 2022","","","|ranges|"
 "`3612 <https://wg21.link/LWG3612>`__","Inconsistent pointer alignment in ``std::format`` ","February 2022","|Complete|","14.0","|format|"
-"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","",""
-"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","","","|ranges|"
+"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","|Complete|","18.0"
+"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","|Complete|","19.0","|ranges|"
 "`3619 <https://wg21.link/LWG3619>`__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","February 2022","|Nothing to do|","","|format|"
 "`3621 <https://wg21.link/LWG3621>`__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","February 2022","|Complete|","15.0"
 "`3632 <https://wg21.link/LWG3632>`__","``unique_ptr`` ""Mandates: This constructor is not selected by class template argument deduction""","February 2022","|Nothing to do|",""
@@ -164,9 +164,9 @@
 "`3656 <https://wg21.link/LWG3656>`__","Inconsistent bit operations returning a count","July 2022","|Complete|","15.0",""
 "`3659 <https://wg21.link/LWG3659>`__","Consider ``ATOMIC_FLAG_INIT`` undeprecation","July 2022","|Complete|","15.0"
 "`3670 <https://wg21.link/LWG3670>`__","``Cpp17InputIterators`` don't have integer-class difference types","July 2022","","","|ranges|"
-"`3671 <https://wg21.link/LWG3671>`__","``atomic_fetch_xor`` missing from ``stdatomic.h``","July 2022","",""
+"`3671 <https://wg21.link/LWG3671>`__","``atomic_fetch_xor`` missing from ``stdatomic.h``","July 2022","|Complete|","20.0"
 "`3672 <https://wg21.link/LWG3672>`__","``common_iterator::operator->()`` should return by value","July 2022","|Complete|","19.0","|ranges|"
-"`3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","July 2022","",""
+"`3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","July 2022","|Complete|","20.0"
 "`3687 <https://wg21.link/LWG3687>`__","``expected<cv void, E>`` move constructor should move","July 2022","|Complete|","16.0"
 "`3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","July 2022","","","|ranges| |spaceship|"
 "`3701 <https://wg21.link/LWG3701>`__","Make ``formatter<remove_cvref_t<const charT[N]>, charT>`` requirement explicit","July 2022","|Complete|","15.0","|format|"
@@ -180,70 +180,70 @@
 "`3710 <https://wg21.link/LWG3710>`__","The ``end`` of ``chunk_view`` for input ranges can be ``const``","July 2022","","","|ranges|"
 "`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
 "`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
-"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
+"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","|Nothing To Do|",""
 "`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
 "`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
 "`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
 "`3724 <https://wg21.link/LWG3724>`__","``decay-copy`` should be constrained","July 2022","|Complete|","14.0"
 "","","","","",""
-"`3028 <https://wg21.link/LWG3028>`__","Container requirements tables should distinguish ``const`` and non-``const`` variables", "November 2022","","",""
-"`3118 <https://wg21.link/LWG3118>`__","``fpos`` equality comparison unspecified", "November 2022","","",""
-"`3177 <https://wg21.link/LWG3177>`__","Limit permission to specialize variable templates to program-defined types", "November 2022","|Nothing to do|","",""
-"`3515 <https://wg21.link/LWG3515>`__","§[stacktrace.basic.nonmem]: ``operator<<`` should be less templatized", "November 2022","","",""
-"`3545 <https://wg21.link/LWG3545>`__","``std::pointer_traits`` should be SFINAE-friendly", "November 2022","|Complete|","18.0",""
-"`3569 <https://wg21.link/LWG3569>`__","``join_view`` fails to support ranges of ranges with non-default_initializable iterators", "November 2022","","","|ranges|"
-"`3594 <https://wg21.link/LWG3594>`__","``inout_ptr`` — inconsistent ``release()`` in destructor", "November 2022","","",""
-"`3597 <https://wg21.link/LWG3597>`__","Unsigned integer types don't model advanceable", "November 2022","","","|ranges|"
-"`3600 <https://wg21.link/LWG3600>`__","Making ``istream_iterator`` copy constructor trivial is an ABI break", "November 2022","","",""
+"`3028 <https://wg21.link/LWG3028>`__","Container requirements tables should distinguish ``const`` and non-``const`` variables","November 2022","","",""
+"`3118 <https://wg21.link/LWG3118>`__","``fpos`` equality comparison unspecified","November 2022","","",""
+"`3177 <https://wg21.link/LWG3177>`__","Limit permission to specialize variable templates to program-defined types","November 2022","|Nothing to do|","",""
+"`3515 <https://wg21.link/LWG3515>`__","§[stacktrace.basic.nonmem]: ``operator<<`` should be less templatized","November 2022","","",""
+"`3545 <https://wg21.link/LWG3545>`__","``std::pointer_traits`` should be SFINAE-friendly","November 2022","|Complete|","18.0",""
+"`3569 <https://wg21.link/LWG3569>`__","``join_view`` fails to support ranges of ranges with non-default_initializable iterators","November 2022","","","|ranges|"
+"`3594 <https://wg21.link/LWG3594>`__","``inout_ptr`` — inconsistent ``release()`` in destructor","November 2022","|Complete|","19.0",""
+"`3597 <https://wg21.link/LWG3597>`__","Unsigned integer types don't model advanceable","November 2022","","","|ranges|"
+"`3600 <https://wg21.link/LWG3600>`__","Making ``istream_iterator`` copy constructor trivial is an ABI break","November 2022","","",""
 "`3629 <https://wg21.link/LWG3629>`__","``make_error_code`` and ``make_error_condition`` are customization points","November 2022","|Complete|","16.0",""
 "`3636 <https://wg21.link/LWG3636>`__","``formatter<T>::format`` should be ``const``-qualified","November 2022","|Complete|","16.0","|format|"
 "`3646 <https://wg21.link/LWG3646>`__","``std::ranges::view_interface::size`` returns a signed type","November 2022","|Complete|","16.0","|ranges|"
-"`3677 <https://wg21.link/LWG3677>`__","Is a cv-qualified ``pair`` specially handled in uses-allocator construction?", "November 2022","|Complete|","18.0",""
-"`3717 <https://wg21.link/LWG3717>`__","``common_view::end`` should improve ``random_access_range`` case", "November 2022","","","|ranges|"
-"`3732 <https://wg21.link/LWG3732>`__","``prepend_range`` and ``append_range`` can't be amortized constant time", "November 2022","|Nothing to do|","","|ranges|"
-"`3736 <https://wg21.link/LWG3736>`__","``move_iterator`` missing ``disable_sized_sentinel_for`` specialization", "November 2022","|Complete|","19.0","|ranges|"
-"`3737 <https://wg21.link/LWG3737>`__","``take_view::sentinel`` should provide ``operator-``", "November 2022","","","|ranges|"
-"`3738 <https://wg21.link/LWG3738>`__","Missing preconditions for ``take_view`` constructor", "November 2022","|Complete|","16.0","|ranges|"
-"`3743 <https://wg21.link/LWG3743>`__","``ranges::to``'s reserve may be ill-formed", "November 2022","","","|ranges|"
-"`3745 <https://wg21.link/LWG3745>`__","``std::atomic_wait`` and its friends lack ``noexcept``", "November 2022","|Complete|","16.0",""
-"`3746 <https://wg21.link/LWG3746>`__","``optional``'s spaceship with ``U`` with a type derived from optional causes infinite constraint meta-recursion", "November 2022","|Complete|","17.0","|spaceship|"
-"`3747 <https://wg21.link/LWG3747>`__","``ranges::uninitialized_copy_n``, ``ranges::uninitialized_move_n``, and ``ranges::destroy_n`` should use ``std::move``", "November 2022","","","|ranges|"
-"`3750 <https://wg21.link/LWG3750>`__","Too many papers bump ``__cpp_lib_format``", "November 2022","|Partial| [#note-LWG3750]_","","|format|"
-"`3751 <https://wg21.link/LWG3751>`__","Missing feature macro for ``flat_set``", "November 2022","","","|flat_containers|"
-"`3753 <https://wg21.link/LWG3753>`__","Clarify entity vs. freestanding entity", "November 2022","","",""
-"`3754 <https://wg21.link/LWG3754>`__","Class template expected synopsis contains declarations that do not match the detailed description", "November 2022","|Nothing to do|","",""
-"`3755 <https://wg21.link/LWG3755>`__","``tuple-for-each`` can call ``user-defined`` ``operator,``", "November 2022","|Complete|","17.0",""
-"`3757 <https://wg21.link/LWG3757>`__","What's the effect of ``std::forward_like<void>(x)``?", "November 2022","","",""
-"`3759 <https://wg21.link/LWG3759>`__","``ranges::rotate_copy`` should use ``std::move``", "November 2022","|Complete|","15.0","|ranges|"
-"`3760 <https://wg21.link/LWG3760>`__","``cartesian_product_view::iterator``'s ``parent_`` is never valid", "November 2022","","","|ranges|"
-"`3761 <https://wg21.link/LWG3761>`__","``cartesian_product_view::iterator::operator-`` should pass by reference", "November 2022","","","|ranges|"
-"`3762 <https://wg21.link/LWG3762>`__","``generator::iterator::operator==`` should pass by reference", "November 2022","","",""
-"`3764 <https://wg21.link/LWG3764>`__","``reference_wrapper::operator()`` should propagate noexcept", "November 2022","|Complete|","17.0",""
-"`3765 <https://wg21.link/LWG3765>`__","``const_sentinel`` should be constrained", "November 2022","","","|ranges|"
-"`3766 <https://wg21.link/LWG3766>`__","``view_interface::cbegin`` is underconstrained", "November 2022","","","|ranges|"
-"`3770 <https://wg21.link/LWG3770>`__","``const_sentinel_t`` is missing", "November 2022","","","|ranges|"
-"`3773 <https://wg21.link/LWG3773>`__","``views::zip_transform`` still requires ``F`` to be ``copy_constructible`` when empty pack", "November 2022","","","|ranges|"
-"`3774 <https://wg21.link/LWG3774>`__","``<flat_set>`` should include ``<compare>``", "November 2022","","","|flat_containers|"
-"`3775 <https://wg21.link/LWG3775>`__","Broken dependencies in the ``Cpp17Allocator`` requirements", "November 2022","","",""
-"`3778 <https://wg21.link/LWG3778>`__","``vector<bool>`` missing exception specifications", "November 2022","","",""
-"`3781 <https://wg21.link/LWG3781>`__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed", "November 2022","|Nothing to do|","",""
-"`3782 <https://wg21.link/LWG3782>`__","Should ``<math.h>`` declare ``::lerp``?", "November 2022","|Complete|","17.0",""
-"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","","",""
-"`3785 <https://wg21.link/LWG3785>`__","``ranges::to`` is over-constrained on the destination type being a range", "November 2022","","","|ranges|"
-"`3788 <https://wg21.link/LWG3788>`__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment", "November 2022","","",""
-"`3792 <https://wg21.link/LWG3792>`__","``__cpp_lib_constexpr_algorithms`` should also be defined in ``<utility>``", "November 2022","|Complete|","16.0",""
-"`3795 <https://wg21.link/LWG3795>`__","Self-move-assignment of ``std::future`` and ``std::shared_future`` have unimplementable postconditions", "November 2022","","",""
-"`3796 <https://wg21.link/LWG3796>`__","``movable-box`` as member should use ``default-initialization`` instead of ``copy-initialization``", "November 2022","","","|ranges|"
-"`3798 <https://wg21.link/LWG3798>`__","Rvalue reference and ``iterator_category``", "November 2022","|Partial| [#note-LWG3798]_","","|ranges|"
-"`3801 <https://wg21.link/LWG3801>`__","``cartesian_product_view::iterator::distance-from`` ignores the size of last underlying range", "November 2022","","","|ranges|"
-"`3814 <https://wg21.link/LWG3814>`__","Add freestanding items requested by NB comments", "November 2022","","",""
-"`3816 <https://wg21.link/LWG3816>`__","``flat_map`` and ``flat_multimap`` should impose sequence container requirements", "November 2022","","","|flat_containers|"
-"`3817 <https://wg21.link/LWG3817>`__","Missing preconditions on ``forward_list`` modifiers", "November 2022","","",""
-"`3818 <https://wg21.link/LWG3818>`__","Exposition-only concepts are not described in library intro", "November 2022","|Nothing to do|","",""
-"`3822 <https://wg21.link/LWG3822>`__","Avoiding normalization in ``filesystem::weakly_canonical``", "November 2022","","",""
-"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","","",""
-"`3824 <https://wg21.link/LWG3824>`__","Number of ``bind`` placeholders is underspecified", "November 2022","|Nothing to do|","",""
-"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","","",""
+"`3677 <https://wg21.link/LWG3677>`__","Is a cv-qualified ``pair`` specially handled in uses-allocator construction?","November 2022","|Complete|","18.0",""
+"`3717 <https://wg21.link/LWG3717>`__","``common_view::end`` should improve ``random_access_range`` case","November 2022","","","|ranges|"
+"`3732 <https://wg21.link/LWG3732>`__","``prepend_range`` and ``append_range`` can't be amortized constant time","November 2022","|Nothing to do|","","|ranges|"
+"`3736 <https://wg21.link/LWG3736>`__","``move_iterator`` missing ``disable_sized_sentinel_for`` specialization","November 2022","|Complete|","19.0","|ranges|"
+"`3737 <https://wg21.link/LWG3737>`__","``take_view::sentinel`` should provide ``operator-``","November 2022","","","|ranges|"
+"`3738 <https://wg21.link/LWG3738>`__","Missing preconditions for ``take_view`` constructor","November 2022","|Complete|","16.0","|ranges|"
+"`3743 <https://wg21.link/LWG3743>`__","``ranges::to``'s reserve may be ill-formed","November 2022","","","|ranges|"
+"`3745 <https://wg21.link/LWG3745>`__","``std::atomic_wait`` and its friends lack ``noexcept``","November 2022","|Complete|","16.0",""
+"`3746 <https://wg21.link/LWG3746>`__","``optional``'s spaceship with ``U`` with a type derived from optional causes infinite constraint meta-recursion","November 2022","|Complete|","17.0","|spaceship|"
+"`3747 <https://wg21.link/LWG3747>`__","``ranges::uninitialized_copy_n``, ``ranges::uninitialized_move_n``, and ``ranges::destroy_n`` should use ``std::move``","November 2022","","","|ranges|"
+"`3750 <https://wg21.link/LWG3750>`__","Too many papers bump ``__cpp_lib_format``","November 2022","|Partial| [#note-LWG3750]_","","|format|"
+"`3751 <https://wg21.link/LWG3751>`__","Missing feature macro for ``flat_set``","November 2022","","","|flat_containers|"
+"`3753 <https://wg21.link/LWG3753>`__","Clarify entity vs. freestanding entity","November 2022","","",""
+"`3754 <https://wg21.link/LWG3754>`__","Class template expected synopsis contains declarations that do not match the detailed description","November 2022","|Nothing to do|","",""
+"`3755 <https://wg21.link/LWG3755>`__","``tuple-for-each`` can call ``user-defined`` ``operator,``","November 2022","|Complete|","17.0",""
+"`3757 <https://wg21.link/LWG3757>`__","What's the effect of ``std::forward_like<void>(x)``?","November 2022","","",""
+"`3759 <https://wg21.link/LWG3759>`__","``ranges::rotate_copy`` should use ``std::move``","November 2022","|Complete|","15.0","|ranges|"
+"`3760 <https://wg21.link/LWG3760>`__","``cartesian_product_view::iterator``'s ``parent_`` is never valid","November 2022","","","|ranges|"
+"`3761 <https://wg21.link/LWG3761>`__","``cartesian_product_view::iterator::operator-`` should pass by reference","November 2022","","","|ranges|"
+"`3762 <https://wg21.link/LWG3762>`__","``generator::iterator::operator==`` should pass by reference","November 2022","","",""
+"`3764 <https://wg21.link/LWG3764>`__","``reference_wrapper::operator()`` should propagate noexcept","November 2022","|Complete|","17.0",""
+"`3765 <https://wg21.link/LWG3765>`__","``const_sentinel`` should be constrained","November 2022","","","|ranges|"
+"`3766 <https://wg21.link/LWG3766>`__","``view_interface::cbegin`` is underconstrained","November 2022","","","|ranges|"
+"`3770 <https://wg21.link/LWG3770>`__","``const_sentinel_t`` is missing","November 2022","","","|ranges|"
+"`3773 <https://wg21.link/LWG3773>`__","``views::zip_transform`` still requires ``F`` to be ``copy_constructible`` when empty pack","November 2022","","","|ranges|"
+"`3774 <https://wg21.link/LWG3774>`__","``<flat_set>`` should include ``<compare>``","November 2022","","","|flat_containers|"
+"`3775 <https://wg21.link/LWG3775>`__","Broken dependencies in the ``Cpp17Allocator`` requirements","November 2022","","",""
+"`3778 <https://wg21.link/LWG3778>`__","``vector<bool>`` missing exception specifications","November 2022","|Complete|","3.7",""
+"`3781 <https://wg21.link/LWG3781>`__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed","November 2022","|Nothing to do|","",""
+"`3782 <https://wg21.link/LWG3782>`__","Should ``<math.h>`` declare ``::lerp``?","November 2022","|Complete|","17.0",""
+"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends","November 2022","|Complete|","19.0",""
+"`3785 <https://wg21.link/LWG3785>`__","``ranges::to`` is over-constrained on the destination type being a range","November 2022","","","|ranges|"
+"`3788 <https://wg21.link/LWG3788>`__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment","November 2022","","",""
+"`3792 <https://wg21.link/LWG3792>`__","``__cpp_lib_constexpr_algorithms`` should also be defined in ``<utility>``","November 2022","|Complete|","16.0",""
+"`3795 <https://wg21.link/LWG3795>`__","Self-move-assignment of ``std::future`` and ``std::shared_future`` have unimplementable postconditions","November 2022","","",""
+"`3796 <https://wg21.link/LWG3796>`__","``movable-box`` as member should use ``default-initialization`` instead of ``copy-initialization``","November 2022","","","|ranges|"
+"`3798 <https://wg21.link/LWG3798>`__","Rvalue reference and ``iterator_category``","November 2022","|Partial| [#note-LWG3798]_","","|ranges|"
+"`3801 <https://wg21.link/LWG3801>`__","``cartesian_product_view::iterator::distance-from`` ignores the size of last underlying range","November 2022","","","|ranges|"
+"`3814 <https://wg21.link/LWG3814>`__","Add freestanding items requested by NB comments","November 2022","","",""
+"`3816 <https://wg21.link/LWG3816>`__","``flat_map`` and ``flat_multimap`` should impose sequence container requirements","November 2022","","","|flat_containers|"
+"`3817 <https://wg21.link/LWG3817>`__","Missing preconditions on ``forward_list`` modifiers","November 2022","","",""
+"`3818 <https://wg21.link/LWG3818>`__","Exposition-only concepts are not described in library intro","November 2022","|Nothing to do|","",""
+"`3822 <https://wg21.link/LWG3822>`__","Avoiding normalization in ``filesystem::weakly_canonical``","November 2022","","",""
+"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``","November 2022","|Nothing To Do|","",""
+"`3824 <https://wg21.link/LWG3824>`__","Number of ``bind`` placeholders is underspecified","November 2022","|Nothing to do|","",""
+"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]","November 2022","|Nothing To Do|","",""
 "","","","","",""
 "`2195 <https://wg21.link/LWG2195>`__","Missing constructors for ``match_results``","February 2023","","",""
 "`2295 <https://wg21.link/LWG2295>`__","Locale name when the provided ``Facet`` is a ``nullptr``","February 2023","","",""
@@ -282,13 +282,13 @@
 "`3645 <https://wg21.link/LWG3645>`__","``resize_and_overwrite`` is overspecified to call its callback with lvalues","February 2023","|Complete|","14.0",""
 "`3655 <https://wg21.link/LWG3655>`__","The ``INVOKE`` operation and union types","February 2023","|Complete|","18.0",""
 "`3723 <https://wg21.link/LWG3723>`__","``priority_queue::push_range`` needs to ``append_range``","February 2023","","","|ranges|"
-"`3734 <https://wg21.link/LWG3734>`__","Inconsistency in ``inout_ptr`` and ``out_ptr`` for empty case","February 2023","","",""
+"`3734 <https://wg21.link/LWG3734>`__","Inconsistency in ``inout_ptr`` and ``out_ptr`` for empty case","February 2023","|Complete|","19.0",""
 "`3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","February 2023","|Complete|","17.0","|ranges|"
 "`3786 <https://wg21.link/LWG3786>`__","Flat maps' deduction guide needs to default ``Allocator`` to be useful","February 2023","","",""
 "`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
 "`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
 "`3827 <https://wg21.link/LWG3827>`__","Deprecate ``<stdalign.h>`` and ``<stdbool.h>`` macros","February 2023","","",""
-"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","","",""
+"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","|Nothing To Do|","",""
 "`3833 <https://wg21.link/LWG3833>`__","Remove specialization ``template<size_t N> struct formatter<const charT[N], charT>``","February 2023","|Complete|","17.0","|format|"
 "`3836 <https://wg21.link/LWG3836>`__","``std::expected<bool, E1>`` conversion constructor ``expected(const expected<U, G>&)`` should take precedence over ``expected(U&&)`` with operator ``bool``","February 2023","|Complete|","18.0",""
 "`3843 <https://wg21.link/LWG3843>`__","``std::expected<T,E>::value() &`` assumes ``E`` is copy constructible","February 2023","|Complete|","17.0",""
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index 4f589cd938d7c..370aa8656836a 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -6,14 +6,14 @@
 "","","","","","",""
 "`P1682R3 <https://wg21.link/P1682R3>`__","LWG","std::to_underlying for enumerations","February 2021","|Complete|","13.0"
 "`P2017R1 <https://wg21.link/P2017R1>`__","LWG","Conditionally borrowed ranges","February 2021","|Complete|","16.0","|ranges|"
-"`P2160R1 <https://wg21.link/P2160R1>`__","LWG","Locks lock lockables","February 2021","",""
+"`P2160R1 <https://wg21.link/P2160R1>`__","LWG","Locks lock lockables","February 2021","Nothing to do",""
 "`P2162R2 <https://wg21.link/P2162R2>`__","LWG","Inheriting from std::variant","February 2021","|Complete|","13.0"
-"`P2212R2 <https://wg21.link/P2212R2>`__","LWG","Relax Requirements for time_point::clock","February 2021","",""
+"`P2212R2 <https://wg21.link/P2212R2>`__","LWG","Relax Requirements for time_point::clock","February 2021","Nothing to do",""
 "`P2259R1 <https://wg21.link/P2259R1>`__","LWG","Repairing input range adaptors and counted_iterator","February 2021","","","|ranges|"
 "","","","","","",""
 "`P0401R6 <https://wg21.link/P0401R6>`__","LWG","Providing size feedback in the Allocator interface","June 2021","|Complete|","15.0"
 "`P0448R4 <https://wg21.link/P0448R4>`__","LWG","A strstream replacement using span<charT> as buffer","June 2021","",""
-"`P1132R8 <https://wg21.link/P1132R8>`__","LWG","out_ptr - a scalable output pointer abstraction","June 2021","",""
+"`P1132R8 <https://wg21.link/P1132R8>`__","LWG","out_ptr - a scalable output pointer abstraction","June 2021","|Complete|","19.0"
 "`P1328R1 <https://wg21.link/P1328R1>`__","LWG","Making std::type_info::operator== constexpr","June 2021","|Complete|","17.0"
 "`P1425R4 <https://wg21.link/P1425R4>`__","LWG","Iterators pair constructors for stack and queue","June 2021","|Complete|","14.0","|ranges|"
 "`P1518R2 <https://wg21.link/P1518R2>`__","LWG","Stop overconstraining allocators in container deduction guides","June 2021","|Complete|","13.0"
@@ -29,12 +29,12 @@
 "`P1072R10 <https://wg21.link/P1072R10>`__","LWG","``basic_string::resize_and_overwrite``","October 2021","|Complete|","14.0"
 "`P1147R1 <https://wg21.link/P1147R1>`__","LWG","Printing ``volatile`` Pointers","October 2021","|Complete|","14.0"
 "`P1272R4 <https://wg21.link/P1272R4>`__","LWG","Byteswapping for fun&&nuf","October 2021","|Complete|","14.0"
-"`P1675R2 <https://wg21.link/P1675R2>`__","LWG","``rethrow_exception`` must be allowed to copy","October 2021","",""
+"`P1675R2 <https://wg21.link/P1675R2>`__","LWG","``rethrow_exception`` must be allowed to copy","October 2021","Nothing to do",""
 "`P2077R3 <https://wg21.link/P2077R3>`__","LWG","Heterogeneous erasure overloads for associative containers","October 2021","",""
 "`P2251R1 <https://wg21.link/P2251R1>`__","LWG","Require ``span`` & ``basic_string_view`` to be Trivially Copyable","October 2021","|Complete|","14.0"
 "`P2301R1 <https://wg21.link/P2301R1>`__","LWG","Add a ``pmr`` alias for ``std::stacktrace``","October 2021","",""
 "`P2321R2 <https://wg21.link/P2321R2>`__","LWG","``zip``","October 2021","|In Progress|","","|ranges|"
-"`P2340R1 <https://wg21.link/P2340R1>`__","LWG","Clarifying the status of the 'C headers'","October 2021","",""
+"`P2340R1 <https://wg21.link/P2340R1>`__","LWG","Clarifying the status of the 'C headers'","October 2021","Nothing to do",""
 "`P2393R1 <https://wg21.link/P2393R1>`__","LWG","Cleaning up ``integer``-class types","October 2021","",""
 "`P2401R0 <https://wg21.link/P2401R0>`__","LWG","Add a conditional ``noexcept`` specification to ``std::exchange``","October 2021","|Complete|","14.0"
 "","","","","","",""
@@ -55,7 +55,7 @@
 "`P0429R9 <https://wg21.link/P0429R9>`__","LWG","A Standard ``flat_map``","July 2022","",""
 "`P1169R4 <https://wg21.link/P1169R4>`__","LWG","``static operator()``","July 2022","|Complete|","16.0"
 "`P1222R4 <https://wg21.link/P1222R4>`__","LWG","A Standard ``flat_set``","July 2022","",""
-"`P1223R5 <https://wg21.link/P1223R5>`__","LWG","``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``","July 2022","","","|ranges|"
+"`P1223R5 <https://wg21.link/P1223R5>`__","LWG","``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``","July 2022","|Complete|","19.0","|ranges|"
 "`P1467R9 <https://wg21.link/P1467R9>`__","LWG","Extended ``floating-point`` types and standard names","July 2022","",""
 "`P1642R11 <https://wg21.link/P1642R11>`__","LWG","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","July 2022","",""
 "`P1899R3 <https://wg21.link/P1899R3>`__","LWG","``stride_view``","July 2022","","","|ranges|"
@@ -74,7 +74,7 @@
 "`P2438R2 <https://wg21.link/P2438R2>`__","LWG","``std::string::substr() &&``","July 2022","|Complete|","16.0"
 "`P2445R1 <https://wg21.link/P2445R1>`__","LWG","``forward_like``","July 2022","|Complete|","16.0"
 "`P2446R2 <https://wg21.link/P2446R2>`__","LWG","``views::as_rvalue``","July 2022","|Complete|","16.0","|ranges|"
-"`P2460R2 <https://wg21.link/P2460R2>`__","LWG","Relax requirements on ``wchar_t`` to match existing practices","July 2022","",""
+"`P2460R2 <https://wg21.link/P2460R2>`__","LWG","Relax requirements on ``wchar_t`` to match existing practices","July 2022","Nothing to do",""
 "`P2465R3 <https://wg21.link/P2465R3>`__","LWG","Standard Library Modules ``std`` and ``std.compat``","July 2022","|Complete|","19.0",""
 "`P2467R1 <https://wg21.link/P2467R1>`__","LWG","Support exclusive mode for ``fstreams``","July 2022","|Complete|","18.0",""
 "`P2474R2 <https://wg21.link/P2474R2>`__","LWG","``views::repeat``","July 2022","|Complete|","17.0","|ranges|"
@@ -93,31 +93,31 @@
 "`P2604R0 <https://wg21.link/P2604R0>`__","LWG","mdspan: rename pointer and contiguous","July 2022","|Complete|","18.0"
 "`P2613R1 <https://wg21.link/P2613R1>`__","LWG","Add the missing ``empty`` to ``mdspan``","July 2022","|Complete|","18.0"
 "","","","","","",""
-"`P1202R5 <https://wg21.link/P1202R5>`__","LWG", "Asymmetric Fences", "November 2022","","","|concurrency TS|"
-"`P1264R2 <https://wg21.link/P1264R2>`__","LWG", "Revising the wording of ``stream`` input operations", "November 2022","|Complete|","9.0",""
-"`P1478R8 <https://wg21.link/P1478R8>`__","LWG", "``Byte-wise`` ``atomic`` ``memcpy``", "November 2022","","","|concurrency TS|"
-"`P2167R3 <https://wg21.link/P2167R3>`__","LWG", "Improved Proposed Wording for LWG 2114", "November 2022","","",""
-"`P2396R1 <https://wg21.link/P2396R1>`__","LWG", "Concurrency TS 2 fixes ", "November 2022","","","|concurrency TS|"
-"`P2505R5 <https://wg21.link/P2505R5>`__","LWG", "Monadic Functions for ``std::expected``", "November 2022","|Complete|","17.0",""
-"`P2539R4 <https://wg21.link/P2539R4>`__","LWG", "Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?", "November 2022","|Complete|","18.0","|format|"
-"`P2602R2 <https://wg21.link/P2602R2>`__","LWG", "Poison Pills are Too Toxic", "November 2022","|Complete|","19.0","|ranges|"
-"`P2708R1 <https://wg21.link/P2708R1>`__","LWG", "No Further Fundamentals TSes", "November 2022","|Nothing to do|","",""
+"`P1202R5 <https://wg21.link/P1202R5>`__","LWG","Asymmetric Fences","November 2022","","","|concurrency TS|"
+"`P1264R2 <https://wg21.link/P1264R2>`__","LWG","Revising the wording of ``stream`` input operations","November 2022","|Complete|","9.0",""
+"`P1478R8 <https://wg21.link/P1478R8>`__","LWG","``Byte-wise`` ``atomic`` ``memcpy``","November 2022","","","|concurrency TS|"
+"`P2167R3 <https://wg21.link/P2167R3>`__","LWG","Improved Proposed Wording for LWG 2114","November 2022","","",""
+"`P2396R1 <https://wg21.link/P2396R1>`__","LWG","Concurrency TS 2 fixes ","November 2022","","","|concurrency TS|"
+"`P2505R5 <https://wg21.link/P2505R5>`__","LWG","Monadic Functions for ``std::expected``","November 2022","|Complete|","17.0",""
+"`P2539R4 <https://wg21.link/P2539R4>`__","LWG","Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?","November 2022","|Complete|","18.0","|format|"
+"`P2602R2 <https://wg21.link/P2602R2>`__","LWG","Poison Pills are Too Toxic","November 2022","|Complete|","19.0","|ranges|"
+"`P2708R1 <https://wg21.link/P2708R1>`__","LWG","No Further Fundamentals TSes","November 2022","|Nothing to do|","",""
 "","","","","","",""
-"`P0290R4 <https://wg21.link/P0290R4>`__","LWG", "``apply()`` for ``synchronized_value<T>``","February 2023","","","|concurrency TS|"
-"`P2770R0 <https://wg21.link/P2770R0>`__","LWG", "Stashing stashing ``iterators`` for proper flattening","February 2023","|Partial| [#note-P2770R0]_","","|ranges|"
-"`P2164R9 <https://wg21.link/P2164R9>`__","LWG", "``views::enumerate``","February 2023","","","|ranges|"
-"`P2711R1 <https://wg21.link/P2711R1>`__","LWG", "Making multi-param constructors of ``views`` ``explicit``","February 2023","|In Progress| [#note-P2711R1]_","","|ranges|"
-"`P2609R3 <https://wg21.link/P2609R3>`__","LWG", "Relaxing Ranges Just A Smidge","February 2023","","","|ranges|"
-"`P2713R1 <https://wg21.link/P2713R1>`__","LWG", "Escaping improvements in ``std::format``","February 2023","|Complete|","19.0","|format|"
-"`P2675R1 <https://wg21.link/P2675R1>`__","LWG", "``format``'s width estimation is too approximate and not forward compatible","February 2023","|Complete|","17.0","|format|"
-"`P2572R1 <https://wg21.link/P2572R1>`__","LWG", "``std::format`` fill character allowances","February 2023","|Complete|","17.0","|format|"
-"`P2693R1 <https://wg21.link/P2693R1>`__","LWG", "Formatting ``thread::id`` and ``stacktrace``","February 2023","|Partial| [#note-P2693R1]_","","|format|"
-"`P2679R2 <https://wg21.link/P2679R2>`__","LWG", "Fixing ``std::start_lifetime_as`` for arrays","February 2023","","",""
-"`P2674R1 <https://wg21.link/P2674R1>`__","LWG", "A trait for implicit lifetime types","February 2023","","",""
-"`P2655R3 <https://wg21.link/P2655R3>`__","LWG", "``common_reference_t`` of ``reference_wrapper`` Should Be a Reference Type","February 2023","","",""
-"`P2652R2 <https://wg21.link/P2652R2>`__","LWG", "Disallow User Specialization of ``allocator_traits``","February 2023","|Complete|","19.0",""
-"`P2787R1 <https://wg21.link/P2787R1>`__","LWG", "``pmr::generator`` - Promise Types are not Values","February 2023","","",""
-"`P2614R2 <https://wg21.link/P2614R2>`__","LWG", "Deprecate ``numeric_limits::has_denorm``","February 2023","|Complete|","18.0",""
-"`P2588R3 <https://wg21.link/P2588R3>`__","LWG", "``barrier``’s phase completion guarantees","February 2023","","",""
-"`P2763R1 <https://wg21.link/P2763R1>`__","LWG", "``layout_stride`` static extents default constructor fix","February 2023","","",""
-"`P2736R2 <https://wg21.link/P2736R2>`__","CWG","Referencing The Unicode Standard","February 2023","","","|format|"
+"`P0290R4 <https://wg21.link/P0290R4>`__","LWG","``apply()`` for ``synchronized_value<T>``","February 2023","","","|concurrency TS|"
+"`P2770R0 <https://wg21.link/P2770R0>`__","LWG","Stashing stashing ``iterators`` for proper flattening","February 2023","|Partial| [#note-P2770R0]_","","|ranges|"
+"`P2164R9 <https://wg21.link/P2164R9>`__","LWG","``views::enumerate``","February 2023","","","|ranges|"
+"`P2711R1 <https://wg21.link/P2711R1>`__","LWG","Making multi-param constructors of ``views`` ``explicit``","February 2023","|In Progress| [#note-P2711R1]_","","|ranges|"
+"`P2609R3 <https://wg21.link/P2609R3>`__","LWG","Relaxing Ranges Just A Smidge","February 2023","","","|ranges|"
+"`P2713R1 <https://wg21.link/P2713R1>`__","LWG","Escaping improvements in ``std::format``","February 2023","|Complete|","19.0","|format|"
+"`P2675R1 <https://wg21.link/P2675R1>`__","LWG","``format``'s width estimation is too approximate and not forward compatible","February 2023","|Complete|","17.0","|format|"
+"`P2572R1 <https://wg21.link/P2572R1>`__","LWG","``std::format`` fill character allowances","February 2023","|Complete|","17.0","|format|"
+"`P2693R1 <https://wg21.link/P2693R1>`__","LWG","Formatting ``thread::id`` and ``stacktrace``","February 2023","|Partial| [#note-P2693R1]_","","|format|"
+"`P2679R2 <https://wg21.link/P2679R2>`__","LWG","Fixing ``std::start_lifetime_as`` for arrays","February 2023","","",""
+"`P2674R1 <https://wg21.link/P2674R1>`__","LWG","A trait for implicit lifetime types","February 2023","","",""
+"`P2655R3 <https://wg21.link/P2655R3>`__","LWG","``common_reference_t`` of ``reference_wrapper`` Should Be a Reference Type","February 2023","","",""
+"`P2652R2 <https://wg21.link/P2652R2>`__","LWG","Disallow User Specialization of ``allocator_traits``","February 2023","|Complete|","19.0",""
+"`P2787R1 <https://wg21.link/P2787R1>`__","LWG","``pmr::generator`` - Promise Types are not Values","February 2023","","",""
+"`P2614R2 <https://wg21.link/P2614R2>`__","LWG","Deprecate ``numeric_limits::has_denorm``","February 2023","|Complete|","18.0",""
+"`P2588R3 <https://wg21.link/P2588R3>`__","LWG","``barrier``’s phase completion guarantees","February 2023","","",""
+"`P2763R1 <https://wg21.link/P2763R1>`__","LWG","``layout_stride`` static extents default constructor fix","February 2023","","",""
+"`P2736R2 <https://wg21.link/P2736R2>`__","CWG","Referencing The Unicode Standard","February 2023","Complete","19.0","|format|"
diff --git a/libcxx/docs/Status/Cxx2c.rst b/libcxx/docs/Status/Cxx2c.rst
index 03a6eeaa40c79..1b8d96588c3cb 100644
--- a/libcxx/docs/Status/Cxx2c.rst
+++ b/libcxx/docs/Status/Cxx2c.rst
@@ -43,6 +43,7 @@ Paper Status
    .. [#note-P3142R0] This paper is applied as DR against C++23. (MSVC STL and libstdc++ will do the same.)
    .. [#note-P2944R3] Implemented comparisons for ``reference_wrapper`` only.
    .. [#note-P2422R1] Libc++ keeps the ``nodiscard`` attributes as a conforming extension.
+   .. [#note-P2997R1] This paper is applied as DR against C++20. (MSVC STL and libstdc++ will do the same.)
 
 .. _issues-status-cxx2c:
 
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index b5732ee981ffb..29eb1638ff1e6 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -18,13 +18,13 @@
 "`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","|Complete|","18.0",""
 "","","","","",""
 "`2392 <https://wg21.link/LWG2392>`__","""character type"" is used but not defined","Kona November 2023","","",""
-"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","","",""
+"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","|Nothing To Do|","",""
 "`3305 <https://wg21.link/LWG3305>`__","``any_cast<void>``","Kona November 2023","|Complete|","18.0",""
 "`3431 <https://wg21.link/LWG3431>`__","``<=>`` for containers should require ``three_way_comparable<T>`` instead of ``<=>``","Kona November 2023","","",""
 "`3749 <https://wg21.link/LWG3749>`__","``common_iterator`` should handle integer-class difference types","Kona November 2023","","",""
 "`3809 <https://wg21.link/LWG3809>`__","Is ``std::subtract_with_carry_engine<uint16_t>`` supposed to work","Kona November 2023","","",""
 "`3892 <https://wg21.link/LWG3892>`__","Incorrect formatting of nested ranges and tuples","Kona November 2023","|Complete|","17.0","|format|"
-"`3897 <https://wg21.link/LWG3897>`__","``inout_ptr`` will not update raw pointer to 0","Kona November 2023","","",""
+"`3897 <https://wg21.link/LWG3897>`__","``inout_ptr`` will not update raw pointer to 0","Kona November 2023","|Complete|","19.0",""
 "`3946 <https://wg21.link/LWG3946>`__","The definition of ``const_iterator_t`` should be reworked","Kona November 2023","","",""
 "`3947 <https://wg21.link/LWG3947>`__","Unexpected constraints on ``adjacent_transform_view::base()``","Kona November 2023","","","|ranges|"
 "`3948 <https://wg21.link/LWG3948>`__","``possibly-const-range and as-const-pointer`` should be ``noexcept``","Kona November 2023","","","|ranges|"
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 2c498f336b125..130e4ef894f29 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -55,7 +55,7 @@
 "`P2845R8 <https://wg21.link/P2845R8>`__","LWG","Formatting of ``std::filesystem::path``","Tokyo March 2024","","","|format|"
 "`P0493R5 <https://wg21.link/P0493R5>`__","LWG","Atomic minimum/maximum","Tokyo March 2024","","",""
 "`P2542R8 <https://wg21.link/P2542R8>`__","LWG","``views::concat``","Tokyo March 2024","","","|ranges|"
-"`P2591R5 <https://wg21.link/P2591R5>`__","LWG","Concatenation of strings and string views","Tokyo March 2024","","",""
+"`P2591R5 <https://wg21.link/P2591R5>`__","LWG","Concatenation of strings and string views","Tokyo March 2024","|Complete|","19.0",""
 "`P2248R8 <https://wg21.link/P2248R8>`__","LWG","Enabling list-initialization for algorithms","Tokyo March 2024","","",""
 "`P2810R4 <https://wg21.link/P2810R4>`__","LWG","``is_debugger_present`` ``is_replaceable``","Tokyo March 2024","","",""
 "`P1068R11 <https://wg21.link/P1068R11>`__","LWG","Vector API for random number generation","Tokyo March 2024","","",""
@@ -64,7 +64,7 @@
 "`P3029R1 <https://wg21.link/P3029R1>`__","LWG","Better ``mdspan``'s CTAD","Tokyo March 2024","|Complete|","19.0",""
 "","","","","","",""
 "`P2747R2 <https://wg21.link/P2747R2>`__","CWG","``constexpr`` placement new","St. Louis June 2024","","",""
-"`P2997R1 <https://wg21.link/P2997R1>`__","LWG","Removing the common reference requirement from the indirectly invocable concepts","St. Louis June 2024","","",""
+"`P2997R1 <https://wg21.link/P2997R1>`__","LWG","Removing the common reference requirement from the indirectly invocable concepts","St. Louis June 2024","|Complete| [#note-P2997R1]_","19.0",""
 "`P2389R2 <https://wg21.link/P2389R2>`__","LWG","``dextents`` Index Type Parameter","St. Louis June 2024","|Complete|","19.0",""
 "`P3168R2 <https://wg21.link/P3168R2>`__","LWG","Give ``std::optional`` Range Support","St. Louis June 2024","","","|ranges|"
 "`P3217R0 <https://wg21.link/P3217R0>`__","LWG","Adjoints to 'Enabling list-initialization for algorithms': find_last","St. Louis June 2024","","",""
diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv
index f29f1f7ca7487..fb96b1fff30ad 100644
--- a/libcxx/docs/Status/FormatPaper.csv
+++ b/libcxx/docs/Status/FormatPaper.csv
@@ -7,7 +7,7 @@ Section,Description,Dependencies,Assignee,Status,First released version
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::gps_time<Duration>``",A ``<chrono>`` implementation,Mark de Wever,,,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::file_time<Duration>``",,Mark de Wever,|Complete|,17.0
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::local_time<Duration>``",,Mark de Wever,|Complete|,17.0
-`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::local-time-format-t<Duration>``",A ``<chrono>`` implementation,Mark de Wever,,,
+`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::local-time-format-t<Duration>``",,,|Nothing To Do|,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::day``",,Mark de Wever,|Complete|,16.0
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::month``",,Mark de Wever,|Complete|,16.0
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::year``",,Mark de Wever,|Complete|,16.0
@@ -26,7 +26,7 @@ Section,Description,Dependencies,Assignee,Status,First released version
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::hh_mm_ss<duration<Rep, Period>>``",,Mark de Wever,|Complete|,17.0
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::sys_info``",,Mark de Wever,|Complete|,19.0
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::local_info``",,Mark de Wever,|Complete|,19.0
-`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::zoned_time<Duration, TimeZonePtr>``",A ``<chrono>`` implementation,Mark de Wever,,
+`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::zoned_time<Duration, TimeZonePtr>``",,Mark de Wever,|Complete|,19.0
 
 "`P2693R1 <https://wg21.link/P2693R1>`__","Formatting ``thread::id`` and ``stacktrace``"
 `[thread.thread.id] <https://wg21.link/thread.thread.id>`_,"Formatting ``thread::id``",,Mark de Wever,|Complete|,17.0
diff --git a/libcxx/docs/Status/RangesAlgorithms.csv b/libcxx/docs/Status/RangesAlgorithms.csv
index f7a51f732c4b1..469ea21a76aab 100644
--- a/libcxx/docs/Status/RangesAlgorithms.csv
+++ b/libcxx/docs/Status/RangesAlgorithms.csv
@@ -1,8 +1,8 @@
 Standard,Algorithm,Assignee,CL,Status
 C++20,all C++20 algorithms,N/A,N/A,✅
-C++23,`find_last <https://wg21.link/P1223R5>`_,Unassigned,No patch yet,Not started
-C++23,`find_last_if <https://wg21.link/P1223R5>`_,Unassigned,No patch yet,Not started
-C++23,`find_last_if_not <https://wg21.link/P1223R5>`_,Unassigned,No patch yet,Not started
+C++23,`find_last <https://wg21.link/P1223R5>`_,Nicole Mazzuca,`#99312 <https://github.com/llvm/llvm-project/pull/99312>`_,Complete
+C++23,`find_last_if <https://wg21.link/P1223R5>`_,Nicole Mazzuca,`#99312 <https://github.com/llvm/llvm-project/pull/99312>`_,Complete
+C++23,`find_last_if_not <https://wg21.link/P1223R5>`_,Nicole Mazzuca,`#99312 <https://github.com/llvm/llvm-project/pull/99312>`_,Complete
 C++23,`starts_with <https://wg21.link/P1659R3>`_,Zijun Zhao,`D150735 <https://llvm.org/D150735>`_,Complete
 C++23,`ends_with <https://wg21.link/P1659R3>`_,Zijun Zhao, `D150831 <https://llvm.org/D150831>`_,Complete
 C++23,`shift_left <https://wg21.link/p2440r1>`_,Unassigned,No patch yet,Not started
diff --git a/libcxx/docs/Status/SpaceshipPapers.csv b/libcxx/docs/Status/SpaceshipPapers.csv
index 39e1f968c1754..1ab64a9caf86a 100644
--- a/libcxx/docs/Status/SpaceshipPapers.csv
+++ b/libcxx/docs/Status/SpaceshipPapers.csv
@@ -1,5 +1,5 @@
 "Number","Name","Status","First released version"
-`P1614R2 <https://wg21.link/P1614R2>`_,The Mothership has Landed,|In Progress|,
+`P1614R2 <https://wg21.link/P1614R2>`_,The Mothership has Landed,|Complete|,19.0
 `P2404R3 <https://wg21.link/P2404R3>`_,"Relaxing ``equality_comparable_with``'s, ``totally_ordered_with``'s, and ``three_way_comparable_with``'s common reference requirements to support move-only types",,
 `LWG3330 <https://wg21.link/LWG3330>`_,Include ``<compare>`` from most library headers,"|Complete|","13.0"
 `LWG3347 <https://wg21.link/LWG3347>`_,"``std::pair<T, U>`` now requires ``T`` and ``U`` to be *less-than-comparable*",|Nothing To Do|,
diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv
index e1cf2044cfd78..4dc43cdbbd08f 100644
--- a/libcxx/docs/Status/SpaceshipProjects.csv
+++ b/libcxx/docs/Status/SpaceshipProjects.csv
@@ -83,7 +83,7 @@ Section,Description,Dependencies,Assignee,Complete
 "| `[string.view.synop] <https://wg21.link/string.view.synop>`_
 | `[string.view.comparison] <https://wg21.link/string.view.comparison>`_",| `basic_string_view <https://reviews.llvm.org/D130295>`_,None,Mark de Wever,|Complete|
 - `5.7 Clause 22: Containers library <https://wg21.link/p1614r2#clause-22-containers-library>`_,,,,
-| `[container.requirements.general] <https://wg21.link/container.requirements.general>`_,|,None,Unassigned,|Not Started|
+| `[container.requirements.general] <https://wg21.link/container.requirements.general>`_,|,None,Mark de Wever,|Complete|
 | `[array.syn] <https://wg21.link/array.syn>`_ (`general <https://wg21.link/container.opt.reqmts>`_),| `array <https://reviews.llvm.org/D132265>`_,[expos.only.func],"| Adrian Vogelsgesang
 | Hristo Hristov",|Complete|
 | `[deque.syn] <https://wg21.link/deque.syn>`_ (`general <https://wg21.link/container.opt.reqmts>`_),| `deque <https://reviews.llvm.org/D144821>`_,[expos.only.func],Hristo Hristov,|Complete|
diff --git a/libcxx/docs/Status/SpecialMath.rst b/libcxx/docs/Status/SpecialMath.rst
new file mode 100644
index 0000000000000..46e5c97cdaab2
--- /dev/null
+++ b/libcxx/docs/Status/SpecialMath.rst
@@ -0,0 +1,35 @@
+.. _special-math-status:
+
+======================================================
+libc++ Mathematical Special Functions Status (P0226R1)
+======================================================
+
+.. include:: ../Helpers/Styles.rst
+
+.. contents::
+  :local:
+
+Overview
+========
+
+This document contains the status of the C++17 mathematical special functions implementation in libc++.
+It is used to track both the status of the sub-projects of the effort and who is assigned to these sub-projects.
+This avoids duplicating effort.
+
+If you are interested in contributing to this effort, please send a message
+to the #libcxx channel in the LLVM discord. Please *do not* start working
+on any items below that has already been assigned to someone else.
+
+Sub-projects in the Implementation Effort
+=========================================
+
+.. csv-table::
+  :file: SpecialMathProjects.csv
+  :header-rows: 1
+  :widths: auto
+
+Paper and Issue Status
+======================
+
+The underlying paper is `Mathematical Special Functions for C++17 (P0226) <https://wg21.link/P0226>`_ and is included in C++17.
+Implementation is *In Progress*.
diff --git a/libcxx/docs/Status/SpecialMathProjects.csv b/libcxx/docs/Status/SpecialMathProjects.csv
new file mode 100644
index 0000000000000..f964e79de91d3
--- /dev/null
+++ b/libcxx/docs/Status/SpecialMathProjects.csv
@@ -0,0 +1,22 @@
+Section,Description,Assignee,Complete
+| `[sf.cmath.assoc.laguerre] <https://wg21.link/sf.cmath.assoc.laguerre>`_, std::assoc_laguerre, None, |Not Started|
+| `[sf.cmath.assoc.legendre] <https://wg21.link/sf.cmath.assoc.legendre>`_, std::assoc_legendre, None, |Not Started|
+| `[sf.cmath.beta] <https://wg21.link/sf.cmath.beta>`_, std::beta, None, |Not Started|
+| `[sf.cmath.comp.ellint.1] <https://wg21.link/sf.cmath.comp.ellint.1>`_, std::comp_ellint_1, None, |Not Started|
+| `[sf.cmath.comp.ellint.2] <https://wg21.link/sf.cmath.comp.ellint.2>`_, std::comp_ellint_2, None, |Not Started|
+| `[sf.cmath.comp.ellint.3] <https://wg21.link/sf.cmath.comp.ellint.3>`_, std::comp_ellint_3, None, |Not Started|
+| `[sf.cmath.cyl.bessel.i] <https://wg21.link/sf.cmath.cyl.bessel.i>`_, std::cyl_bessel_i, None, |Not Started|
+| `[sf.cmath.cyl.bessel.j] <https://wg21.link/sf.cmath.cyl.bessel.j>`_, std::cyl_bessel_j, None, |Not Started|
+| `[sf.cmath.cyl.bessel.k] <https://wg21.link/sf.cmath.cyl.bessel.k>`_, std::cyl_bessel_k, None, |Not Started|
+| `[sf.cmath.cyl.neumann] <https://wg21.link/sf.cmath.cyl.neumann>`_, std::cyl_neumann, None, |Not Started|
+| `[sf.cmath.ellint.1] <https://wg21.link/sf.cmath.ellint.1>`_, std::ellint_1, None, |Not Started|
+| `[sf.cmath.ellint.2] <https://wg21.link/sf.cmath.ellint.2>`_, std::ellint_2, None, |Not Started|
+| `[sf.cmath.ellint.3] <https://wg21.link/sf.cmath.ellint.3>`_, std::ellint_3, None, |Not Started|
+| `[sf.cmath.expint] <https://wg21.link/sf.cmath.expint>`_, std::expint, None, |Not Started|
+| `[sf.cmath.hermite] <https://wg21.link/sf.cmath.hermite>`_, std::hermite, Paul Xi Cao, |Complete|
+| `[sf.cmath.laguerre] <https://wg21.link/sf.cmath.laguerre>`_, std::laguerre, None, |Not Started|
+| `[sf.cmath.legendre] <https://wg21.link/sf.cmath.legendre>`_, std::legendre, None, |Not Started|
+| `[sf.cmath.riemann.zeta] <https://wg21.link/sf.cmath.riemann.zeta>`_, std::riemann_zeta, None, |Not Started|
+| `[sf.cmath.sph.bessel] <https://wg21.link/sf.cmath.sph.bessel>`_, std::sph_bessel, None, |Not Started|
+| `[sf.cmath.sph.legendre] <https://wg21.link/sf.cmath.sph.legendre>`_, std::sph_legendre, None, |Not Started|
+| `[sf.cmath.sph.neumann] <https://wg21.link/sf.cmath.sph.neumann>`_, std::sph_neumann, None, |Not Started|
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index 6d3417cabfd61..65a7e1ec30c96 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -84,6 +84,12 @@ flags to use, and how to run an executable. This system is meant to be easily
 extended for custom needs, in particular when porting the libc++ test suite to
 new platforms.
 
+.. note::
+  If you run the test suite on Apple platforms, we recommend adding the terminal application
+  used to run the test suite to the list of "Developer Tools". This prevents the system from
+  trying to scan each individual test binary for malware and dramatically speeds up the test
+  suite.
+
 Using a Custom Site Configuration
 ---------------------------------
 
@@ -449,9 +455,9 @@ An example build would look like:
 
   $ ninja -C build cxx-benchmarks
 
-This will build all of the benchmarks under ``<libcxx-src>/benchmarks`` to be
+This will build all of the benchmarks under ``<libcxx>/test/benchmarks`` to be
 built against the just-built libc++. The compiled tests are output into
-``build/projects/libcxx/benchmarks``.
+``build/libcxx/test/benchmarks``.
 
 Also See:
 
@@ -468,9 +474,9 @@ For example:
 
 .. code-block:: bash
 
-  $ cd build/projects/libcxx/benchmarks
-  $ ./algorithms.bench.out # Runs all the benchmarks
-  $ ./algorithms.bench.out --benchmark_filter=BM_Sort.* # Only runs the sort benchmarks
+  $ cd build/libcxx/test/benchmarks
+  $ ./find.bench.out # Runs all the benchmarks
+  $ ./find.bench.out --benchmark_filter="bm_ranges_find<std::vector<char>>" # Only runs that specific benchmark
 
 For more information about running benchmarks see `Google Benchmark`_.
 
diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst
index 69a9e575cfe7c..4bca3ccc8fa06 100644
--- a/libcxx/docs/index.rst
+++ b/libcxx/docs/index.rst
@@ -53,6 +53,7 @@ Getting Started with libc++
    Status/PSTL
    Status/Ranges
    Status/Spaceship
+   Status/SpecialMath
    Status/Zip
 
 
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index cd64fe91449c2..32579272858a8 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -97,6 +97,7 @@ set(files
   __algorithm/ranges_find_first_of.h
   __algorithm/ranges_find_if.h
   __algorithm/ranges_find_if_not.h
+  __algorithm/ranges_find_last.h
   __algorithm/ranges_for_each.h
   __algorithm/ranges_for_each_n.h
   __algorithm/ranges_generate.h
@@ -508,6 +509,7 @@ set(files
   __math/remainder.h
   __math/roots.h
   __math/rounding_functions.h
+  __math/special_functions.h
   __math/traits.h
   __math/trigonometric_functions.h
   __mbstate_t.h
@@ -533,6 +535,8 @@ set(files
   __memory/concepts.h
   __memory/construct_at.h
   __memory/destruct_n.h
+  __memory/inout_ptr.h
+  __memory/out_ptr.h
   __memory/pointer_traits.h
   __memory/ranges_construct_at.h
   __memory/ranges_uninitialized_algorithms.h
@@ -793,7 +797,6 @@ set(files
   __type_traits/is_referenceable.h
   __type_traits/is_same.h
   __type_traits/is_scalar.h
-  __type_traits/is_scoped_enum.h
   __type_traits/is_signed.h
   __type_traits/is_signed_integer.h
   __type_traits/is_specialization.h
@@ -1042,7 +1045,7 @@ list(APPEND _all_includes "${LIBCXX_GENERATED_INCLUDE_DIR}/libcxx.imp")
 add_custom_target(generate-cxx-headers ALL DEPENDS ${_all_includes})
 
 add_library(cxx-headers INTERFACE)
-target_link_libraries(cxx-headers INTERFACE libcxx-abi-headers)
+target_link_libraries(cxx-headers INTERFACE libcxx-libc-headers libcxx-abi-headers)
 add_dependencies(cxx-headers generate-cxx-headers)
 # It's important that the arch directory be included first so that its header files
 # which interpose on the default include dir be included instead of the default ones.
diff --git a/libcxx/include/__algorithm/find_end.h b/libcxx/include/__algorithm/find_end.h
index 7e08e7953534e..841e0fd509d56 100644
--- a/libcxx/include/__algorithm/find_end.h
+++ b/libcxx/include/__algorithm/find_end.h
@@ -80,109 +80,6 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1>
   }
 }
 
-template < class _IterOps,
-           class _Pred,
-           class _Iter1,
-           class _Sent1,
-           class _Iter2,
-           class _Sent2,
-           class _Proj1,
-           class _Proj2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter1 __find_end(
-    _Iter1 __first1,
-    _Sent1 __sent1,
-    _Iter2 __first2,
-    _Sent2 __sent2,
-    _Pred& __pred,
-    _Proj1& __proj1,
-    _Proj2& __proj2,
-    bidirectional_iterator_tag,
-    bidirectional_iterator_tag) {
-  auto __last1 = _IterOps::next(__first1, __sent1);
-  auto __last2 = _IterOps::next(__first2, __sent2);
-  // modeled after search algorithm (in reverse)
-  if (__first2 == __last2)
-    return __last1; // Everything matches an empty sequence
-  _Iter1 __l1 = __last1;
-  _Iter2 __l2 = __last2;
-  --__l2;
-  while (true) {
-    // Find last element in sequence 1 that matchs *(__last2-1), with a mininum of loop checks
-    while (true) {
-      if (__first1 == __l1) // return __last1 if no element matches *__first2
-        return __last1;
-      if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2)))
-        break;
-    }
-    // *__l1 matches *__l2, now match elements before here
-    _Iter1 __m1 = __l1;
-    _Iter2 __m2 = __l2;
-    while (true) {
-      if (__m2 == __first2) // If pattern exhausted, __m1 is the answer (works for 1 element pattern)
-        return __m1;
-      if (__m1 == __first1) // Otherwise if source exhaused, pattern not found
-        return __last1;
-
-      // if there is a mismatch, restart with a new __l1
-      if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) {
-        break;
-      } // else there is a match, check next elements
-    }
-  }
-}
-
-template < class _AlgPolicy,
-           class _Pred,
-           class _Iter1,
-           class _Sent1,
-           class _Iter2,
-           class _Sent2,
-           class _Proj1,
-           class _Proj2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter1 __find_end(
-    _Iter1 __first1,
-    _Sent1 __sent1,
-    _Iter2 __first2,
-    _Sent2 __sent2,
-    _Pred& __pred,
-    _Proj1& __proj1,
-    _Proj2& __proj2,
-    random_access_iterator_tag,
-    random_access_iterator_tag) {
-  typedef typename iterator_traits<_Iter1>::difference_type _D1;
-  auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1);
-  auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2);
-  // Take advantage of knowing source and pattern lengths.  Stop short when source is smaller than pattern
-  auto __len2 = __last2 - __first2;
-  if (__len2 == 0)
-    return __last1;
-  auto __len1 = __last1 - __first1;
-  if (__len1 < __len2)
-    return __last1;
-  const _Iter1 __s = __first1 + _D1(__len2 - 1); // End of pattern match can't go before here
-  _Iter1 __l1      = __last1;
-  _Iter2 __l2      = __last2;
-  --__l2;
-  while (true) {
-    while (true) {
-      if (__s == __l1)
-        return __last1;
-      if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2)))
-        break;
-    }
-    _Iter1 __m1 = __l1;
-    _Iter2 __m2 = __l2;
-    while (true) {
-      if (__m2 == __first2)
-        return __m1;
-      // no need to check range on __m1 because __s guarantees we have enough source
-      if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(*--__m2))) {
-        break;
-      }
-    }
-  }
-}
-
 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_end_classic(
     _ForwardIterator1 __first1,
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 5cf13f0a3f292..8ced989233bc4 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/iter_swap.h>
 #include <__algorithm/ranges_iterator_concept.h>
+#include <__assert>
 #include <__config>
 #include <__iterator/advance.h>
 #include <__iterator/distance.h>
@@ -160,6 +161,59 @@ struct _IterOps<_ClassicAlgPolicy> {
   _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 void __advance_to(_Iter& __first, _Iter __last) {
     __first = __last;
   }
+
+  // advance with sentinel, a la std::ranges::advance
+  template <class _Iter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter>
+  __advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) {
+    return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
+  }
+
+private:
+  // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
+  template <class _InputIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to(
+      _InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
+    __difference_type<_InputIter> __dist = 0;
+    for (; __dist < __count && __iter != __sentinel; ++__dist)
+      ++__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
+  template <class _BiDirIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter>
+  __advance_to(_BiDirIter& __iter,
+               __difference_type<_BiDirIter> __count,
+               const _BiDirIter& __sentinel,
+               bidirectional_iterator_tag) {
+    __difference_type<_BiDirIter> __dist = 0;
+    if (__count >= 0)
+      for (; __dist < __count && __iter != __sentinel; ++__dist)
+        ++__iter;
+    else
+      for (__count = -__count; __dist < __count && __iter != __sentinel; ++__dist)
+        --__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
+  template <class _RandIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter>
+  __advance_to(_RandIter& __iter,
+               __difference_type<_RandIter> __count,
+               const _RandIter& __sentinel,
+               random_access_iterator_tag) {
+    auto __dist = _IterOps::distance(__iter, __sentinel);
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(
+        __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");
+    if (__count < 0)
+      __dist = __dist > __count ? __dist : __count;
+    else
+      __dist = __dist < __count ? __dist : __count;
+    __iter += __dist;
+    return __count - __dist;
+  }
 };
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 8fd355a7cfc4a..c417d84835497 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -27,11 +27,13 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
-  auto __len = _IterOps<_AlgPolicy>::distance(__first, __last);
-
+template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
+    _Iter __first,
+    const _Type& __value,
+    typename iterator_traits<_Iter>::difference_type __len,
+    _Comp& __comp,
+    _Proj& __proj) {
   while (__len != 0) {
     auto __l2 = std::__half_positive(__len);
     _Iter __m = __first;
@@ -46,6 +48,48 @@ __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp,
   return __first;
 }
 
+// One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
+// advantage of being \Omega(1) rather than the classic algorithm's \Omega(log(n)), with the downside of executing at
+// most 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the
+// container's size upfront, which adds \Omega(n) iterator increments to the complexity. The second one is when you're
+// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
+// would yield \Omega(n*log(n)) comparisons and, for non-random-access iterators, \Omega(n^2) iterator increments,
+// whereas the one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
+// comparisons.
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  // step = 0, ensuring we can always short-circuit when distance is 1 later on
+  if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
+    return __first;
+
+  using _Distance = typename iterator_traits<_ForwardIterator>::difference_type;
+  for (_Distance __step = 1; __first != __last; __step <<= 1) {
+    auto __it   = __first;
+    auto __dist = __step - _IterOps<_AlgPolicy>::__advance_to(__it, __step, __last);
+    // once we reach the last range where needle can be we must start
+    // looking inwards, bisecting that range
+    if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {
+      // we've already checked the previous value and it was less, we can save
+      // one comparison by skipping bisection
+      if (__dist == 1)
+        return __it;
+      return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+    }
+    // range not found, move forward!
+    __first = __it;
+  }
+  return __first;
+}
+
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
+  return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+}
+
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
 lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
diff --git a/libcxx/include/__algorithm/ranges_adjacent_find.h b/libcxx/include/__algorithm/ranges_adjacent_find.h
index 3c54f723310a6..0f2cb66699374 100644
--- a/libcxx/include/__algorithm/ranges_adjacent_find.h
+++ b/libcxx/include/__algorithm/ranges_adjacent_find.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __adjacent_find {
-struct __fn {
+struct __adjacent_find {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __adjacent_find_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
@@ -67,10 +66,9 @@ struct __fn {
     return __adjacent_find_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __adjacent_find
 
 inline namespace __cpo {
-inline constexpr auto adjacent_find = __adjacent_find::__fn{};
+inline constexpr auto adjacent_find = __adjacent_find{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_all_of.h b/libcxx/include/__algorithm/ranges_all_of.h
index 2f603b32f32d0..fea089493811a 100644
--- a/libcxx/include/__algorithm/ranges_all_of.h
+++ b/libcxx/include/__algorithm/ranges_all_of.h
@@ -30,8 +30,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __all_of {
-struct __fn {
+struct __all_of {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool __all_of_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
     for (; __first != __last; ++__first) {
@@ -58,10 +57,9 @@ struct __fn {
     return __all_of_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __all_of
 
 inline namespace __cpo {
-inline constexpr auto all_of = __all_of::__fn{};
+inline constexpr auto all_of = __all_of{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_any_of.h b/libcxx/include/__algorithm/ranges_any_of.h
index 205fcecc086e7..34d23b4b7455a 100644
--- a/libcxx/include/__algorithm/ranges_any_of.h
+++ b/libcxx/include/__algorithm/ranges_any_of.h
@@ -30,8 +30,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __any_of {
-struct __fn {
+struct __any_of {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool __any_of_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
     for (; __first != __last; ++__first) {
@@ -58,10 +57,9 @@ struct __fn {
     return __any_of_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __any_of
 
 inline namespace __cpo {
-inline constexpr auto any_of = __any_of::__fn{};
+inline constexpr auto any_of = __any_of{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_binary_search.h b/libcxx/include/__algorithm/ranges_binary_search.h
index 1ef2bd62b5995..47bd0997334e8 100644
--- a/libcxx/include/__algorithm/ranges_binary_search.h
+++ b/libcxx/include/__algorithm/ranges_binary_search.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __binary_search {
-struct __fn {
+struct __binary_search {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Type,
@@ -57,10 +56,9 @@ struct __fn {
     return __ret != __last && !std::invoke(__comp, __value, std::invoke(__proj, *__ret));
   }
 };
-} // namespace __binary_search
 
 inline namespace __cpo {
-inline constexpr auto binary_search = __binary_search::__fn{};
+inline constexpr auto binary_search = __binary_search{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_clamp.h b/libcxx/include/__algorithm/ranges_clamp.h
index e6181ef9435e0..4bb3e46e73bd6 100644
--- a/libcxx/include/__algorithm/ranges_clamp.h
+++ b/libcxx/include/__algorithm/ranges_clamp.h
@@ -30,8 +30,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __clamp {
-struct __fn {
+struct __clamp {
   template <class _Type,
             class _Proj                                                      = identity,
             indirect_strict_weak_order<projected<const _Type*, _Proj>> _Comp = ranges::less>
@@ -50,10 +49,9 @@ struct __fn {
       return __value;
   }
 };
-} // namespace __clamp
 
 inline namespace __cpo {
-inline constexpr auto clamp = __clamp::__fn{};
+inline constexpr auto clamp = __clamp{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_contains.h b/libcxx/include/__algorithm/ranges_contains.h
index 4836c3baed173..88de215297e5b 100644
--- a/libcxx/include/__algorithm/ranges_contains.h
+++ b/libcxx/include/__algorithm/ranges_contains.h
@@ -33,8 +33,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __contains {
-struct __fn {
+struct __contains {
   template <input_iterator _Iter, sentinel_for<_Iter> _Sent, class _Type, class _Proj = identity>
     requires indirect_binary_predicate<ranges::equal_to, projected<_Iter, _Proj>, const _Type*>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool static
@@ -50,10 +49,9 @@ struct __fn {
            ranges::end(__range);
   }
 };
-} // namespace __contains
 
 inline namespace __cpo {
-inline constexpr auto contains = __contains::__fn{};
+inline constexpr auto contains = __contains{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_contains_subrange.h b/libcxx/include/__algorithm/ranges_contains_subrange.h
index 4398c457fd054..e8740d69dbef2 100644
--- a/libcxx/include/__algorithm/ranges_contains_subrange.h
+++ b/libcxx/include/__algorithm/ranges_contains_subrange.h
@@ -35,8 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __contains_subrange {
-struct __fn {
+struct __contains_subrange {
   template <forward_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             forward_iterator _Iter2,
@@ -81,10 +80,9 @@ struct __fn {
     return __ret.empty() == false;
   }
 };
-} // namespace __contains_subrange
 
 inline namespace __cpo {
-inline constexpr auto contains_subrange = __contains_subrange::__fn{};
+inline constexpr auto contains_subrange = __contains_subrange{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_copy.h b/libcxx/include/__algorithm/ranges_copy.h
index e1d6d32f05f7e..05b61c0d5d4ab 100644
--- a/libcxx/include/__algorithm/ranges_copy.h
+++ b/libcxx/include/__algorithm/ranges_copy.h
@@ -37,8 +37,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __copy {
-struct __fn {
+struct __copy {
   template <input_iterator _InIter, sentinel_for<_InIter> _Sent, weakly_incrementable _OutIter>
     requires indirectly_copyable<_InIter, _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr copy_result<_InIter, _OutIter>
@@ -55,10 +54,9 @@ struct __fn {
     return {std::move(__ret.first), std::move(__ret.second)};
   }
 };
-} // namespace __copy
 
 inline namespace __cpo {
-inline constexpr auto copy = __copy::__fn{};
+inline constexpr auto copy = __copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_copy_backward.h b/libcxx/include/__algorithm/ranges_copy_backward.h
index 93e326042503f..81d14e465f7f2 100644
--- a/libcxx/include/__algorithm/ranges_copy_backward.h
+++ b/libcxx/include/__algorithm/ranges_copy_backward.h
@@ -35,8 +35,7 @@ namespace ranges {
 template <class _Ip, class _Op>
 using copy_backward_result = in_out_result<_Ip, _Op>;
 
-namespace __copy_backward {
-struct __fn {
+struct __copy_backward {
   template <bidirectional_iterator _InIter1, sentinel_for<_InIter1> _Sent1, bidirectional_iterator _InIter2>
     requires indirectly_copyable<_InIter1, _InIter2>
   _LIBCPP_HIDE_FROM_ABI constexpr copy_backward_result<_InIter1, _InIter2>
@@ -53,10 +52,9 @@ struct __fn {
     return {std::move(__ret.first), std::move(__ret.second)};
   }
 };
-} // namespace __copy_backward
 
 inline namespace __cpo {
-inline constexpr auto copy_backward = __copy_backward::__fn{};
+inline constexpr auto copy_backward = __copy_backward{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_copy_if.h b/libcxx/include/__algorithm/ranges_copy_if.h
index 4b41d2154e7f8..1a08985fe35ca 100644
--- a/libcxx/include/__algorithm/ranges_copy_if.h
+++ b/libcxx/include/__algorithm/ranges_copy_if.h
@@ -36,8 +36,7 @@ namespace ranges {
 template <class _Ip, class _Op>
 using copy_if_result = in_out_result<_Ip, _Op>;
 
-namespace __copy_if {
-struct __fn {
+struct __copy_if {
   template <class _InIter, class _Sent, class _OutIter, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI static constexpr copy_if_result<_InIter, _OutIter>
   __copy_if_impl(_InIter __first, _Sent __last, _OutIter __result, _Pred& __pred, _Proj& __proj) {
@@ -71,10 +70,9 @@ struct __fn {
     return __copy_if_impl(ranges::begin(__r), ranges::end(__r), std::move(__result), __pred, __proj);
   }
 };
-} // namespace __copy_if
 
 inline namespace __cpo {
-inline constexpr auto copy_if = __copy_if::__fn{};
+inline constexpr auto copy_if = __copy_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_copy_n.h b/libcxx/include/__algorithm/ranges_copy_n.h
index 4353fa99278c8..4407e07f5ca62 100644
--- a/libcxx/include/__algorithm/ranges_copy_n.h
+++ b/libcxx/include/__algorithm/ranges_copy_n.h
@@ -37,8 +37,7 @@ namespace ranges {
 template <class _Ip, class _Op>
 using copy_n_result = in_out_result<_Ip, _Op>;
 
-namespace __copy_n {
-struct __fn {
+struct __copy_n {
   template <class _InIter, class _DiffType, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr static copy_n_result<_InIter, _OutIter>
   __go(_InIter __first, _DiffType __n, _OutIter __result) {
@@ -65,10 +64,9 @@ struct __fn {
     return __go(std::move(__first), __n, std::move(__result));
   }
 };
-} // namespace __copy_n
 
 inline namespace __cpo {
-inline constexpr auto copy_n = __copy_n::__fn{};
+inline constexpr auto copy_n = __copy_n{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_count.h b/libcxx/include/__algorithm/ranges_count.h
index 4f35117438705..2b3969e763079 100644
--- a/libcxx/include/__algorithm/ranges_count.h
+++ b/libcxx/include/__algorithm/ranges_count.h
@@ -34,8 +34,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __count {
-struct __fn {
+struct __count {
   template <input_iterator _Iter, sentinel_for<_Iter> _Sent, class _Type, class _Proj = identity>
     requires indirect_binary_predicate<ranges::equal_to, projected<_Iter, _Proj>, const _Type*>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iter_difference_t<_Iter>
@@ -50,10 +49,9 @@ struct __fn {
     return std::__count<_RangeAlgPolicy>(ranges::begin(__r), ranges::end(__r), __value, __proj);
   }
 };
-} // namespace __count
 
 inline namespace __cpo {
-inline constexpr auto count = __count::__fn{};
+inline constexpr auto count = __count{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_count_if.h b/libcxx/include/__algorithm/ranges_count_if.h
index 5f2396ff7d531..2663180242c1c 100644
--- a/libcxx/include/__algorithm/ranges_count_if.h
+++ b/libcxx/include/__algorithm/ranges_count_if.h
@@ -44,8 +44,7 @@ __count_if_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
   return __counter;
 }
 
-namespace __count_if {
-struct __fn {
+struct __count_if {
   template <input_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Proj = identity,
@@ -63,10 +62,9 @@ struct __fn {
     return ranges::__count_if_impl(ranges::begin(__r), ranges::end(__r), __pred, __proj);
   }
 };
-} // namespace __count_if
 
 inline namespace __cpo {
-inline constexpr auto count_if = __count_if::__fn{};
+inline constexpr auto count_if = __count_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_ends_with.h b/libcxx/include/__algorithm/ranges_ends_with.h
index 06efdef36b7cf..df7fad0bfc70b 100644
--- a/libcxx/include/__algorithm/ranges_ends_with.h
+++ b/libcxx/include/__algorithm/ranges_ends_with.h
@@ -36,8 +36,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __ends_with {
-struct __fn {
+struct __ends_with {
   template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
   _LIBCPP_HIDE_FROM_ABI static constexpr bool __ends_with_fn_impl_bidirectional(
       _Iter1 __first1,
@@ -185,10 +184,9 @@ struct __fn {
     }
   }
 };
-} // namespace __ends_with
 
 inline namespace __cpo {
-inline constexpr auto ends_with = __ends_with::__fn{};
+inline constexpr auto ends_with = __ends_with{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_equal.h b/libcxx/include/__algorithm/ranges_equal.h
index edbd0e3641c1b..c26d13f002201 100644
--- a/libcxx/include/__algorithm/ranges_equal.h
+++ b/libcxx/include/__algorithm/ranges_equal.h
@@ -34,8 +34,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __equal {
-struct __fn {
+struct __equal {
   template <input_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             input_iterator _Iter2,
@@ -93,10 +92,9 @@ struct __fn {
     return false;
   }
 };
-} // namespace __equal
 
 inline namespace __cpo {
-inline constexpr auto equal = __equal::__fn{};
+inline constexpr auto equal = __equal{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_equal_range.h b/libcxx/include/__algorithm/ranges_equal_range.h
index 4a308e016b546..cc765f196648e 100644
--- a/libcxx/include/__algorithm/ranges_equal_range.h
+++ b/libcxx/include/__algorithm/ranges_equal_range.h
@@ -38,9 +38,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __equal_range {
-
-struct __fn {
+struct __equal_range {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Tp,
@@ -64,10 +62,8 @@ struct __fn {
   }
 };
 
-} // namespace __equal_range
-
 inline namespace __cpo {
-inline constexpr auto equal_range = __equal_range::__fn{};
+inline constexpr auto equal_range = __equal_range{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_fill.h b/libcxx/include/__algorithm/ranges_fill.h
index 7a177d85e9f07..c248009f98fe3 100644
--- a/libcxx/include/__algorithm/ranges_fill.h
+++ b/libcxx/include/__algorithm/ranges_fill.h
@@ -28,8 +28,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __fill {
-struct __fn {
+struct __fill {
   template <class _Type, output_iterator<const _Type&> _Iter, sentinel_for<_Iter> _Sent>
   _LIBCPP_HIDE_FROM_ABI constexpr _Iter operator()(_Iter __first, _Sent __last, const _Type& __value) const {
     if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
@@ -46,10 +45,9 @@ struct __fn {
     return (*this)(ranges::begin(__range), ranges::end(__range), __value);
   }
 };
-} // namespace __fill
 
 inline namespace __cpo {
-inline constexpr auto fill = __fill::__fn{};
+inline constexpr auto fill = __fill{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_fill_n.h b/libcxx/include/__algorithm/ranges_fill_n.h
index a6e988c0089ce..7a33268e1dd32 100644
--- a/libcxx/include/__algorithm/ranges_fill_n.h
+++ b/libcxx/include/__algorithm/ranges_fill_n.h
@@ -25,8 +25,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __fill_n {
-struct __fn {
+struct __fill_n {
   template <class _Type, output_iterator<const _Type&> _Iter>
   _LIBCPP_HIDE_FROM_ABI constexpr _Iter
   operator()(_Iter __first, iter_difference_t<_Iter> __n, const _Type& __value) const {
@@ -37,10 +36,9 @@ struct __fn {
     return __first;
   }
 };
-} // namespace __fill_n
 
 inline namespace __cpo {
-inline constexpr auto fill_n = __fill_n::__fn{};
+inline constexpr auto fill_n = __fill_n{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find.h b/libcxx/include/__algorithm/ranges_find.h
index 6b0d5efe37ab8..1eac4cfa02a4a 100644
--- a/libcxx/include/__algorithm/ranges_find.h
+++ b/libcxx/include/__algorithm/ranges_find.h
@@ -36,8 +36,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __find {
-struct __fn {
+struct __find {
   template <class _Iter, class _Sent, class _Tp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI static constexpr _Iter
   __find_unwrap(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
@@ -64,10 +63,9 @@ struct __fn {
     return __find_unwrap(ranges::begin(__r), ranges::end(__r), __value, __proj);
   }
 };
-} // namespace __find
 
 inline namespace __cpo {
-inline constexpr auto find = __find::__fn{};
+inline constexpr auto find = __find{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find_end.h b/libcxx/include/__algorithm/ranges_find_end.h
index e49e66dd4ac04..682724a48cd5a 100644
--- a/libcxx/include/__algorithm/ranges_find_end.h
+++ b/libcxx/include/__algorithm/ranges_find_end.h
@@ -35,8 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __find_end {
-struct __fn {
+struct __find_end {
   template <forward_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             forward_iterator _Iter2,
@@ -87,10 +86,9 @@ struct __fn {
     return {__ret.first, __ret.second};
   }
 };
-} // namespace __find_end
 
 inline namespace __cpo {
-inline constexpr auto find_end = __find_end::__fn{};
+inline constexpr auto find_end = __find_end{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find_first_of.h b/libcxx/include/__algorithm/ranges_find_first_of.h
index d92d9686bc442..102e16dd7a55b 100644
--- a/libcxx/include/__algorithm/ranges_find_first_of.h
+++ b/libcxx/include/__algorithm/ranges_find_first_of.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __find_first_of {
-struct __fn {
+struct __find_first_of {
   template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter1 __find_first_of_impl(
       _Iter1 __first1,
@@ -90,10 +89,9 @@ struct __fn {
         __proj2);
   }
 };
-} // namespace __find_first_of
 
 inline namespace __cpo {
-inline constexpr auto find_first_of = __find_first_of::__fn{};
+inline constexpr auto find_first_of = __find_first_of{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find_if.h b/libcxx/include/__algorithm/ranges_find_if.h
index 888f9ec3cb2d5..ed6406e6186a3 100644
--- a/libcxx/include/__algorithm/ranges_find_if.h
+++ b/libcxx/include/__algorithm/ranges_find_if.h
@@ -42,8 +42,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Ip __find_if_impl(_Ip __first, _Sp __last, _Pre
   return __first;
 }
 
-namespace __find_if {
-struct __fn {
+struct __find_if {
   template <input_iterator _Ip,
             sentinel_for<_Ip> _Sp,
             class _Proj = identity,
@@ -59,10 +58,9 @@ struct __fn {
     return ranges::__find_if_impl(ranges::begin(__r), ranges::end(__r), __pred, __proj);
   }
 };
-} // namespace __find_if
 
 inline namespace __cpo {
-inline constexpr auto find_if = __find_if::__fn{};
+inline constexpr auto find_if = __find_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find_if_not.h b/libcxx/include/__algorithm/ranges_find_if_not.h
index ec19545b5a1b7..9a359b2afdab1 100644
--- a/libcxx/include/__algorithm/ranges_find_if_not.h
+++ b/libcxx/include/__algorithm/ranges_find_if_not.h
@@ -34,8 +34,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __find_if_not {
-struct __fn {
+struct __find_if_not {
   template <input_iterator _Ip,
             sentinel_for<_Ip> _Sp,
             class _Proj = identity,
@@ -53,10 +52,9 @@ struct __fn {
     return ranges::__find_if_impl(ranges::begin(__r), ranges::end(__r), __pred2, __proj);
   }
 };
-} // namespace __find_if_not
 
 inline namespace __cpo {
-inline constexpr auto find_if_not = __find_if_not::__fn{};
+inline constexpr auto find_if_not = __find_if_not{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_find_last.h b/libcxx/include/__algorithm/ranges_find_last.h
new file mode 100644
index 0000000000000..838dd44e5d107
--- /dev/null
+++ b/libcxx/include/__algorithm/ranges_find_last.h
@@ -0,0 +1,169 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
+#define _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
+
+#include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
+#include <__functional/ranges_operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/indirectly_comparable.h>
+#include <__iterator/next.h>
+#include <__iterator/prev.h>
+#include <__iterator/projected.h>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__ranges/subrange.h>
+#include <__utility/move.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace ranges {
+
+template <class _Iter, class _Sent, class _Pred, class _Proj>
+_LIBCPP_HIDE_FROM_ABI constexpr subrange<_Iter>
+__find_last_impl(_Iter __first, _Sent __last, _Pred __pred, _Proj& __proj) {
+  if (__first == __last) {
+    return subrange<_Iter>(__first, __first);
+  }
+
+  if constexpr (bidirectional_iterator<_Iter>) {
+    auto __last_it = ranges::next(__first, __last);
+    for (auto __it = ranges::prev(__last_it); __it != __first; --__it) {
+      if (__pred(std::invoke(__proj, *__it))) {
+        return subrange<_Iter>(std::move(__it), std::move(__last_it));
+      }
+    }
+    if (__pred(std::invoke(__proj, *__first))) {
+      return subrange<_Iter>(std::move(__first), std::move(__last_it));
+    }
+    return subrange<_Iter>(__last_it, __last_it);
+  } else {
+    bool __found = false;
+    _Iter __found_it;
+    for (; __first != __last; ++__first) {
+      if (__pred(std::invoke(__proj, *__first))) {
+        __found    = true;
+        __found_it = __first;
+      }
+    }
+
+    if (__found) {
+      return subrange<_Iter>(std::move(__found_it), std::move(__first));
+    } else {
+      return subrange<_Iter>(__first, __first);
+    }
+  }
+}
+
+struct __find_last {
+  template <class _Type>
+  struct __op {
+    const _Type& __value;
+    template <class _Elem>
+    _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator()(_Elem&& __elem) const {
+      return std::forward<_Elem>(__elem) == __value;
+    }
+  };
+
+  template <forward_iterator _Iter, sentinel_for<_Iter> _Sent, class _Type, class _Proj = identity>
+    requires indirect_binary_predicate<ranges::equal_to, projected<_Iter, _Proj>, const _Type*>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static subrange<_Iter>
+  operator()(_Iter __first, _Sent __last, const _Type& __value, _Proj __proj = {}) {
+    return ranges::__find_last_impl(std::move(__first), std::move(__last), __op<_Type>{__value}, __proj);
+  }
+
+  template <forward_range _Range, class _Type, class _Proj = identity>
+    requires indirect_binary_predicate<ranges::equal_to, projected<iterator_t<_Range>, _Proj>, const _Type*>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static borrowed_subrange_t<_Range>
+  operator()(_Range&& __range, const _Type& __value, _Proj __proj = {}) {
+    return ranges::__find_last_impl(ranges::begin(__range), ranges::end(__range), __op<_Type>{__value}, __proj);
+  }
+};
+
+struct __find_last_if {
+  template <class _Pred>
+  struct __op {
+    _Pred& __pred;
+    template <class _Elem>
+    _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator()(_Elem&& __elem) const {
+      return std::invoke(__pred, std::forward<_Elem>(__elem));
+    }
+  };
+
+  template <forward_iterator _Iter,
+            sentinel_for<_Iter> _Sent,
+            class _Proj = identity,
+            indirect_unary_predicate<projected<_Iter, _Proj>> _Pred>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static subrange<_Iter>
+  operator()(_Iter __first, _Sent __last, _Pred __pred, _Proj __proj = {}) {
+    return ranges::__find_last_impl(std::move(__first), std::move(__last), __op<_Pred>{__pred}, __proj);
+  }
+
+  template <forward_range _Range,
+            class _Proj = identity,
+            indirect_unary_predicate<projected<iterator_t<_Range>, _Proj>> _Pred>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static borrowed_subrange_t<_Range>
+  operator()(_Range&& __range, _Pred __pred, _Proj __proj = {}) {
+    return ranges::__find_last_impl(ranges::begin(__range), ranges::end(__range), __op<_Pred>{__pred}, __proj);
+  }
+};
+
+struct __find_last_if_not {
+  template <class _Pred>
+  struct __op {
+    _Pred& __pred;
+    template <class _Elem>
+    _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator()(_Elem&& __elem) const {
+      return !std::invoke(__pred, std::forward<_Elem>(__elem));
+    }
+  };
+
+  template <forward_iterator _Iter,
+            sentinel_for<_Iter> _Sent,
+            class _Proj = identity,
+            indirect_unary_predicate<projected<_Iter, _Proj>> _Pred>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static subrange<_Iter>
+  operator()(_Iter __first, _Sent __last, _Pred __pred, _Proj __proj = {}) {
+    return ranges::__find_last_impl(std::move(__first), std::move(__last), __op<_Pred>{__pred}, __proj);
+  }
+
+  template <forward_range _Range,
+            class _Proj = identity,
+            indirect_unary_predicate<projected<iterator_t<_Range>, _Proj>> _Pred>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr static borrowed_subrange_t<_Range>
+  operator()(_Range&& __range, _Pred __pred, _Proj __proj = {}) {
+    return ranges::__find_last_impl(ranges::begin(__range), ranges::end(__range), __op<_Pred>{__pred}, __proj);
+  }
+};
+
+inline namespace __cpo {
+inline constexpr auto find_last        = __find_last{};
+inline constexpr auto find_last_if     = __find_last_if{};
+inline constexpr auto find_last_if_not = __find_last_if_not{};
+} // namespace __cpo
+} // namespace ranges
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 225dc774c8764..de39bc5522753 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -36,8 +36,7 @@ namespace ranges {
 template <class _Iter, class _Func>
 using for_each_result = in_fun_result<_Iter, _Func>;
 
-namespace __for_each {
-struct __fn {
+struct __for_each {
 private:
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
@@ -65,10 +64,9 @@ struct __fn {
     return __for_each_impl(ranges::begin(__range), ranges::end(__range), __func, __proj);
   }
 };
-} // namespace __for_each
 
 inline namespace __cpo {
-inline constexpr auto for_each = __for_each::__fn{};
+inline constexpr auto for_each = __for_each{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index d1fdca34cc5a1..603cb723233c8 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -36,8 +36,7 @@ namespace ranges {
 template <class _Iter, class _Func>
 using for_each_n_result = in_fun_result<_Iter, _Func>;
 
-namespace __for_each_n {
-struct __fn {
+struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
@@ -48,10 +47,9 @@ struct __fn {
     return {std::move(__first), std::move(__func)};
   }
 };
-} // namespace __for_each_n
 
 inline namespace __cpo {
-inline constexpr auto for_each_n = __for_each_n::__fn{};
+inline constexpr auto for_each_n = __for_each_n{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_generate.h b/libcxx/include/__algorithm/ranges_generate.h
index e6467198e6ba2..b94611d18263a 100644
--- a/libcxx/include/__algorithm/ranges_generate.h
+++ b/libcxx/include/__algorithm/ranges_generate.h
@@ -32,9 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __generate {
-
-struct __fn {
+struct __generate {
   template <class _OutIter, class _Sent, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static _OutIter __generate_fn_impl(_OutIter __first, _Sent __last, _Func& __gen) {
     for (; __first != __last; ++__first) {
@@ -57,10 +55,8 @@ struct __fn {
   }
 };
 
-} // namespace __generate
-
 inline namespace __cpo {
-inline constexpr auto generate = __generate::__fn{};
+inline constexpr auto generate = __generate{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h
index cd5fd7483ab2c..702e9a949cb45 100644
--- a/libcxx/include/__algorithm/ranges_generate_n.h
+++ b/libcxx/include/__algorithm/ranges_generate_n.h
@@ -33,9 +33,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __generate_n {
-
-struct __fn {
+struct __generate_n {
   template <input_or_output_iterator _OutIter, copy_constructible _Func>
     requires invocable<_Func&> && indirectly_writable<_OutIter, invoke_result_t<_Func&>>
   _LIBCPP_HIDE_FROM_ABI constexpr _OutIter
@@ -49,10 +47,8 @@ struct __fn {
   }
 };
 
-} // namespace __generate_n
-
 inline namespace __cpo {
-inline constexpr auto generate_n = __generate_n::__fn{};
+inline constexpr auto generate_n = __generate_n{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_includes.h b/libcxx/include/__algorithm/ranges_includes.h
index c4c3b8ed088d3..9145f3b5564ff 100644
--- a/libcxx/include/__algorithm/ranges_includes.h
+++ b/libcxx/include/__algorithm/ranges_includes.h
@@ -35,9 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __includes {
-
-struct __fn {
+struct __includes {
   template <input_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             input_iterator _Iter2,
@@ -82,10 +80,8 @@ struct __fn {
   }
 };
 
-} // namespace __includes
-
 inline namespace __cpo {
-inline constexpr auto includes = __includes::__fn{};
+inline constexpr auto includes = __includes{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_inplace_merge.h b/libcxx/include/__algorithm/ranges_inplace_merge.h
index d94c0ad465677..5879d0e7ef0fb 100644
--- a/libcxx/include/__algorithm/ranges_inplace_merge.h
+++ b/libcxx/include/__algorithm/ranges_inplace_merge.h
@@ -39,9 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __inplace_merge {
-
-struct __fn {
+struct __inplace_merge {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI static constexpr auto
   __inplace_merge_impl(_Iter __first, _Iter __middle, _Sent __last, _Comp&& __comp, _Proj&& __proj) {
@@ -68,10 +66,8 @@ struct __fn {
   }
 };
 
-} // namespace __inplace_merge
-
 inline namespace __cpo {
-inline constexpr auto inplace_merge = __inplace_merge::__fn{};
+inline constexpr auto inplace_merge = __inplace_merge{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_heap.h b/libcxx/include/__algorithm/ranges_is_heap.h
index 3d9e18ce1d906..b4724abfb62a5 100644
--- a/libcxx/include/__algorithm/ranges_is_heap.h
+++ b/libcxx/include/__algorithm/ranges_is_heap.h
@@ -34,9 +34,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __is_heap {
-
-struct __fn {
+struct __is_heap {
   template <class _Iter, class _Sent, class _Proj, class _Comp>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool
   __is_heap_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -65,10 +63,8 @@ struct __fn {
   }
 };
 
-} // namespace __is_heap
-
 inline namespace __cpo {
-inline constexpr auto is_heap = __is_heap::__fn{};
+inline constexpr auto is_heap = __is_heap{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_heap_until.h b/libcxx/include/__algorithm/ranges_is_heap_until.h
index 7a2e1fc7705b6..25f3b484faa66 100644
--- a/libcxx/include/__algorithm/ranges_is_heap_until.h
+++ b/libcxx/include/__algorithm/ranges_is_heap_until.h
@@ -35,9 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __is_heap_until {
-
-struct __fn {
+struct __is_heap_until {
   template <class _Iter, class _Sent, class _Proj, class _Comp>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __is_heap_until_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -65,10 +63,8 @@ struct __fn {
   }
 };
 
-} // namespace __is_heap_until
-
 inline namespace __cpo {
-inline constexpr auto is_heap_until = __is_heap_until::__fn{};
+inline constexpr auto is_heap_until = __is_heap_until{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_partitioned.h b/libcxx/include/__algorithm/ranges_is_partitioned.h
index 5be6fba46fd9e..8092abfcd1de3 100644
--- a/libcxx/include/__algorithm/ranges_is_partitioned.h
+++ b/libcxx/include/__algorithm/ranges_is_partitioned.h
@@ -31,8 +31,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __is_partitioned {
-struct __fn {
+struct __is_partitioned {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool
   __is_partitioned_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
@@ -70,10 +69,9 @@ struct __fn {
     return __is_partitioned_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __is_partitioned
 
 inline namespace __cpo {
-inline constexpr auto is_partitioned = __is_partitioned::__fn{};
+inline constexpr auto is_partitioned = __is_partitioned{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_permutation.h b/libcxx/include/__algorithm/ranges_is_permutation.h
index 1f8d67007a573..53a431d2ba425 100644
--- a/libcxx/include/__algorithm/ranges_is_permutation.h
+++ b/libcxx/include/__algorithm/ranges_is_permutation.h
@@ -33,8 +33,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __is_permutation {
-struct __fn {
+struct __is_permutation {
   template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Proj1, class _Proj2, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool __is_permutation_func_impl(
       _Iter1 __first1,
@@ -91,10 +90,9 @@ struct __fn {
         __proj2);
   }
 };
-} // namespace __is_permutation
 
 inline namespace __cpo {
-inline constexpr auto is_permutation = __is_permutation::__fn{};
+inline constexpr auto is_permutation = __is_permutation{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_sorted.h b/libcxx/include/__algorithm/ranges_is_sorted.h
index 5b88d422b4b09..ab0670688a0e9 100644
--- a/libcxx/include/__algorithm/ranges_is_sorted.h
+++ b/libcxx/include/__algorithm/ranges_is_sorted.h
@@ -31,8 +31,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __is_sorted {
-struct __fn {
+struct __is_sorted {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Proj                                               = identity,
@@ -51,10 +50,9 @@ struct __fn {
     return ranges::__is_sorted_until_impl(ranges::begin(__range), __last, __comp, __proj) == __last;
   }
 };
-} // namespace __is_sorted
 
 inline namespace __cpo {
-inline constexpr auto is_sorted = __is_sorted::__fn{};
+inline constexpr auto is_sorted = __is_sorted{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_is_sorted_until.h b/libcxx/include/__algorithm/ranges_is_sorted_until.h
index 54de530c8b2fd..f2e51c264e4a7 100644
--- a/libcxx/include/__algorithm/ranges_is_sorted_until.h
+++ b/libcxx/include/__algorithm/ranges_is_sorted_until.h
@@ -47,8 +47,7 @@ __is_sorted_until_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj
   return __i;
 }
 
-namespace __is_sorted_until {
-struct __fn {
+struct __is_sorted_until {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Proj                                               = identity,
@@ -66,10 +65,9 @@ struct __fn {
     return ranges::__is_sorted_until_impl(ranges::begin(__range), ranges::end(__range), __comp, __proj);
   }
 };
-} // namespace __is_sorted_until
 
 inline namespace __cpo {
-inline constexpr auto is_sorted_until = __is_sorted_until::__fn{};
+inline constexpr auto is_sorted_until = __is_sorted_until{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_lexicographical_compare.h b/libcxx/include/__algorithm/ranges_lexicographical_compare.h
index 6d82017e302a7..024cc6b707cab 100644
--- a/libcxx/include/__algorithm/ranges_lexicographical_compare.h
+++ b/libcxx/include/__algorithm/ranges_lexicographical_compare.h
@@ -31,8 +31,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __lexicographical_compare {
-struct __fn {
+struct __lexicographical_compare {
   template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Proj1, class _Proj2, class _Comp>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool __lexicographical_compare_impl(
       _Iter1 __first1,
@@ -90,10 +89,9 @@ struct __fn {
         __proj2);
   }
 };
-} // namespace __lexicographical_compare
 
 inline namespace __cpo {
-inline constexpr auto lexicographical_compare = __lexicographical_compare::__fn{};
+inline constexpr auto lexicographical_compare = __lexicographical_compare{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_lower_bound.h b/libcxx/include/__algorithm/ranges_lower_bound.h
index 0651147e04249..d1b332849b8b6 100644
--- a/libcxx/include/__algorithm/ranges_lower_bound.h
+++ b/libcxx/include/__algorithm/ranges_lower_bound.h
@@ -36,8 +36,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
 
-namespace __lower_bound {
-struct __fn {
+struct __lower_bound {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Type,
@@ -57,10 +56,9 @@ struct __fn {
     return std::__lower_bound<_RangeAlgPolicy>(ranges::begin(__r), ranges::end(__r), __value, __comp, __proj);
   }
 };
-} // namespace __lower_bound
 
 inline namespace __cpo {
-inline constexpr auto lower_bound = __lower_bound::__fn{};
+inline constexpr auto lower_bound = __lower_bound{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_make_heap.h b/libcxx/include/__algorithm/ranges_make_heap.h
index fe9c024fbf8a8..97148f77b4181 100644
--- a/libcxx/include/__algorithm/ranges_make_heap.h
+++ b/libcxx/include/__algorithm/ranges_make_heap.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __make_heap {
-
-struct __fn {
+struct __make_heap {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __make_heap_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -69,10 +67,8 @@ struct __fn {
   }
 };
 
-} // namespace __make_heap
-
 inline namespace __cpo {
-inline constexpr auto make_heap = __make_heap::__fn{};
+inline constexpr auto make_heap = __make_heap{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_max.h b/libcxx/include/__algorithm/ranges_max.h
index d0ee6f314b0c3..f631344422ed5 100644
--- a/libcxx/include/__algorithm/ranges_max.h
+++ b/libcxx/include/__algorithm/ranges_max.h
@@ -36,8 +36,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __max {
-struct __fn {
+struct __max {
   template <class _Tp,
             class _Proj                                                    = identity,
             indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less>
@@ -87,10 +86,9 @@ struct __fn {
     }
   }
 };
-} // namespace __max
 
 inline namespace __cpo {
-inline constexpr auto max = __max::__fn{};
+inline constexpr auto max = __max{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_max_element.h b/libcxx/include/__algorithm/ranges_max_element.h
index c577309271165..869f71ecc8d20 100644
--- a/libcxx/include/__algorithm/ranges_max_element.h
+++ b/libcxx/include/__algorithm/ranges_max_element.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __max_element {
-struct __fn {
+struct __max_element {
   template <forward_iterator _Ip,
             sentinel_for<_Ip> _Sp,
             class _Proj                                             = identity,
@@ -53,10 +52,9 @@ struct __fn {
     return ranges::__min_element_impl(ranges::begin(__r), ranges::end(__r), __comp_lhs_rhs_swapped, __proj);
   }
 };
-} // namespace __max_element
 
 inline namespace __cpo {
-inline constexpr auto max_element = __max_element::__fn{};
+inline constexpr auto max_element = __max_element{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_merge.h b/libcxx/include/__algorithm/ranges_merge.h
index bdf9a62d90bd2..f3e0486fe488e 100644
--- a/libcxx/include/__algorithm/ranges_merge.h
+++ b/libcxx/include/__algorithm/ranges_merge.h
@@ -39,42 +39,7 @@ namespace ranges {
 template <class _InIter1, class _InIter2, class _OutIter>
 using merge_result = in_in_out_result<_InIter1, _InIter2, _OutIter>;
 
-namespace __merge {
-
-template < class _InIter1,
-           class _Sent1,
-           class _InIter2,
-           class _Sent2,
-           class _OutIter,
-           class _Comp,
-           class _Proj1,
-           class _Proj2>
-_LIBCPP_HIDE_FROM_ABI constexpr merge_result<__remove_cvref_t<_InIter1>,
-                                             __remove_cvref_t<_InIter2>,
-                                             __remove_cvref_t<_OutIter>>
-__merge_impl(_InIter1&& __first1,
-             _Sent1&& __last1,
-             _InIter2&& __first2,
-             _Sent2&& __last2,
-             _OutIter&& __result,
-             _Comp&& __comp,
-             _Proj1&& __proj1,
-             _Proj2&& __proj2) {
-  for (; __first1 != __last1 && __first2 != __last2; ++__result) {
-    if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) {
-      *__result = *__first2;
-      ++__first2;
-    } else {
-      *__result = *__first1;
-      ++__first1;
-    }
-  }
-  auto __ret1 = ranges::copy(std::move(__first1), std::move(__last1), std::move(__result));
-  auto __ret2 = ranges::copy(std::move(__first2), std::move(__last2), std::move(__ret1.out));
-  return {std::move(__ret1.in), std::move(__ret2.in), std::move(__ret2.out)};
-}
-
-struct __fn {
+struct __merge {
   template <input_iterator _InIter1,
             sentinel_for<_InIter1> _Sent1,
             input_iterator _InIter2,
@@ -120,12 +85,43 @@ struct __fn {
         __proj1,
         __proj2);
   }
-};
 
-} // namespace __merge
+  template < class _InIter1,
+             class _Sent1,
+             class _InIter2,
+             class _Sent2,
+             class _OutIter,
+             class _Comp,
+             class _Proj1,
+             class _Proj2>
+  _LIBCPP_HIDE_FROM_ABI static constexpr merge_result<__remove_cvref_t<_InIter1>,
+                                                      __remove_cvref_t<_InIter2>,
+                                                      __remove_cvref_t<_OutIter>>
+  __merge_impl(_InIter1&& __first1,
+               _Sent1&& __last1,
+               _InIter2&& __first2,
+               _Sent2&& __last2,
+               _OutIter&& __result,
+               _Comp&& __comp,
+               _Proj1&& __proj1,
+               _Proj2&& __proj2) {
+    for (; __first1 != __last1 && __first2 != __last2; ++__result) {
+      if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) {
+        *__result = *__first2;
+        ++__first2;
+      } else {
+        *__result = *__first1;
+        ++__first1;
+      }
+    }
+    auto __ret1 = ranges::copy(std::move(__first1), std::move(__last1), std::move(__result));
+    auto __ret2 = ranges::copy(std::move(__first2), std::move(__last2), std::move(__ret1.out));
+    return {std::move(__ret1.in), std::move(__ret2.in), std::move(__ret2.out)};
+  }
+};
 
 inline namespace __cpo {
-inline constexpr auto merge = __merge::__fn{};
+inline constexpr auto merge = __merge{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_min.h b/libcxx/include/__algorithm/ranges_min.h
index cc569d2a060c2..302b5d7975b00 100644
--- a/libcxx/include/__algorithm/ranges_min.h
+++ b/libcxx/include/__algorithm/ranges_min.h
@@ -35,8 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __min {
-struct __fn {
+struct __min {
   template <class _Tp,
             class _Proj                                                    = identity,
             indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less>
@@ -79,10 +78,9 @@ struct __fn {
     }
   }
 };
-} // namespace __min
 
 inline namespace __cpo {
-inline constexpr auto min = __min::__fn{};
+inline constexpr auto min = __min{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_min_element.h b/libcxx/include/__algorithm/ranges_min_element.h
index 588ef258e26f5..fb92ae56bcd62 100644
--- a/libcxx/include/__algorithm/ranges_min_element.h
+++ b/libcxx/include/__algorithm/ranges_min_element.h
@@ -46,8 +46,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Ip __min_element_impl(_Ip __first, _Sp __last,
   return __first;
 }
 
-namespace __min_element {
-struct __fn {
+struct __min_element {
   template <forward_iterator _Ip,
             sentinel_for<_Ip> _Sp,
             class _Proj                                             = identity,
@@ -65,10 +64,9 @@ struct __fn {
     return ranges::__min_element_impl(ranges::begin(__r), ranges::end(__r), __comp, __proj);
   }
 };
-} // namespace __min_element
 
 inline namespace __cpo {
-inline constexpr auto min_element = __min_element::__fn{};
+inline constexpr auto min_element = __min_element{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h
index 09cbefd91a8c7..1b43b1e19cdec 100644
--- a/libcxx/include/__algorithm/ranges_minmax.h
+++ b/libcxx/include/__algorithm/ranges_minmax.h
@@ -47,8 +47,7 @@ namespace ranges {
 template <class _T1>
 using minmax_result = min_max_result<_T1>;
 
-namespace __minmax {
-struct __fn {
+struct __minmax {
   template <class _Type,
             class _Proj                                                      = identity,
             indirect_strict_weak_order<projected<const _Type*, _Proj>> _Comp = ranges::less>
@@ -159,10 +158,9 @@ struct __fn {
     }
   }
 };
-} // namespace __minmax
 
 inline namespace __cpo {
-inline constexpr auto minmax = __minmax::__fn{};
+inline constexpr auto minmax = __minmax{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_minmax_element.h b/libcxx/include/__algorithm/ranges_minmax_element.h
index 4bf6d2404e463..e1a22dde0955f 100644
--- a/libcxx/include/__algorithm/ranges_minmax_element.h
+++ b/libcxx/include/__algorithm/ranges_minmax_element.h
@@ -40,8 +40,7 @@ namespace ranges {
 template <class _T1>
 using minmax_element_result = min_max_result<_T1>;
 
-namespace __minmax_element {
-struct __fn {
+struct __minmax_element {
   template <forward_iterator _Ip,
             sentinel_for<_Ip> _Sp,
             class _Proj                                             = identity,
@@ -61,10 +60,9 @@ struct __fn {
     return {__ret.first, __ret.second};
   }
 };
-} // namespace __minmax_element
 
 inline namespace __cpo {
-inline constexpr auto minmax_element = __minmax_element::__fn{};
+inline constexpr auto minmax_element = __minmax_element{};
 } // namespace __cpo
 
 } // namespace ranges
diff --git a/libcxx/include/__algorithm/ranges_mismatch.h b/libcxx/include/__algorithm/ranges_mismatch.h
index c4bf0022a9bcc..b35747dfa43a2 100644
--- a/libcxx/include/__algorithm/ranges_mismatch.h
+++ b/libcxx/include/__algorithm/ranges_mismatch.h
@@ -39,8 +39,7 @@ namespace ranges {
 template <class _I1, class _I2>
 using mismatch_result = in_in_result<_I1, _I2>;
 
-namespace __mismatch {
-struct __fn {
+struct __mismatch {
   template <class _I1, class _S1, class _I2, class _S2, class _Pred, class _Proj1, class _Proj2>
   static _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result<_I1, _I2>
   __go(_I1 __first1, _S1 __last1, _I2 __first2, _S2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
@@ -84,10 +83,9 @@ struct __fn {
         ranges::begin(__r1), ranges::end(__r1), ranges::begin(__r2), ranges::end(__r2), __pred, __proj1, __proj2);
   }
 };
-} // namespace __mismatch
 
 inline namespace __cpo {
-constexpr inline auto mismatch = __mismatch::__fn{};
+constexpr inline auto mismatch = __mismatch{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_move.h b/libcxx/include/__algorithm/ranges_move.h
index be869f36c9730..02bf7fd006190 100644
--- a/libcxx/include/__algorithm/ranges_move.h
+++ b/libcxx/include/__algorithm/ranges_move.h
@@ -35,8 +35,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using move_result = in_out_result<_InIter, _OutIter>;
 
-namespace __move {
-struct __fn {
+struct __move {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr static move_result<_InIter, _OutIter>
   __move_impl(_InIter __first, _Sent __last, _OutIter __result) {
@@ -58,10 +57,9 @@ struct __fn {
     return __move_impl(ranges::begin(__range), ranges::end(__range), std::move(__result));
   }
 };
-} // namespace __move
 
 inline namespace __cpo {
-inline constexpr auto move = __move::__fn{};
+inline constexpr auto move = __move{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_move_backward.h b/libcxx/include/__algorithm/ranges_move_backward.h
index 6d4071a33b812..4737e6c9756de 100644
--- a/libcxx/include/__algorithm/ranges_move_backward.h
+++ b/libcxx/include/__algorithm/ranges_move_backward.h
@@ -37,8 +37,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using move_backward_result = in_out_result<_InIter, _OutIter>;
 
-namespace __move_backward {
-struct __fn {
+struct __move_backward {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr static move_backward_result<_InIter, _OutIter>
   __move_backward_impl(_InIter __first, _Sent __last, _OutIter __result) {
@@ -60,10 +59,9 @@ struct __fn {
     return __move_backward_impl(ranges::begin(__range), ranges::end(__range), std::move(__result));
   }
 };
-} // namespace __move_backward
 
 inline namespace __cpo {
-inline constexpr auto move_backward = __move_backward::__fn{};
+inline constexpr auto move_backward = __move_backward{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_next_permutation.h b/libcxx/include/__algorithm/ranges_next_permutation.h
index 18535e0a6254a..1b485423e892f 100644
--- a/libcxx/include/__algorithm/ranges_next_permutation.h
+++ b/libcxx/include/__algorithm/ranges_next_permutation.h
@@ -40,9 +40,7 @@ namespace ranges {
 template <class _InIter>
 using next_permutation_result = in_found_result<_InIter>;
 
-namespace __next_permutation {
-
-struct __fn {
+struct __next_permutation {
   template <bidirectional_iterator _Iter, sentinel_for<_Iter> _Sent, class _Comp = ranges::less, class _Proj = identity>
     requires sortable<_Iter, _Comp, _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr next_permutation_result<_Iter>
@@ -62,10 +60,8 @@ struct __fn {
   }
 };
 
-} // namespace __next_permutation
-
 inline namespace __cpo {
-constexpr inline auto next_permutation = __next_permutation::__fn{};
+constexpr inline auto next_permutation = __next_permutation{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_none_of.h b/libcxx/include/__algorithm/ranges_none_of.h
index 7df3c1829fcfc..a1612826220d9 100644
--- a/libcxx/include/__algorithm/ranges_none_of.h
+++ b/libcxx/include/__algorithm/ranges_none_of.h
@@ -30,8 +30,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __none_of {
-struct __fn {
+struct __none_of {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static bool
   __none_of_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
@@ -59,10 +58,9 @@ struct __fn {
     return __none_of_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __none_of
 
 inline namespace __cpo {
-inline constexpr auto none_of = __none_of::__fn{};
+inline constexpr auto none_of = __none_of{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_nth_element.h b/libcxx/include/__algorithm/ranges_nth_element.h
index 90ade9efe10da..e92c51e713cb4 100644
--- a/libcxx/include/__algorithm/ranges_nth_element.h
+++ b/libcxx/include/__algorithm/ranges_nth_element.h
@@ -39,9 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __nth_element {
-
-struct __fn {
+struct __nth_element {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __nth_element_fn_impl(_Iter __first, _Iter __nth, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -68,10 +66,8 @@ struct __fn {
   }
 };
 
-} // namespace __nth_element
-
 inline namespace __cpo {
-inline constexpr auto nth_element = __nth_element::__fn{};
+inline constexpr auto nth_element = __nth_element{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_partial_sort.h b/libcxx/include/__algorithm/ranges_partial_sort.h
index c67247d2e0a77..fc8a1f7d93065 100644
--- a/libcxx/include/__algorithm/ranges_partial_sort.h
+++ b/libcxx/include/__algorithm/ranges_partial_sort.h
@@ -41,9 +41,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __partial_sort {
-
-struct __fn {
+struct __partial_sort {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __partial_sort_fn_impl(_Iter __first, _Iter __middle, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -66,10 +64,8 @@ struct __fn {
   }
 };
 
-} // namespace __partial_sort
-
 inline namespace __cpo {
-inline constexpr auto partial_sort = __partial_sort::__fn{};
+inline constexpr auto partial_sort = __partial_sort{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_partial_sort_copy.h b/libcxx/include/__algorithm/ranges_partial_sort_copy.h
index b3bdeb78fb6f6..f221504a8cae0 100644
--- a/libcxx/include/__algorithm/ranges_partial_sort_copy.h
+++ b/libcxx/include/__algorithm/ranges_partial_sort_copy.h
@@ -42,9 +42,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using partial_sort_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __partial_sort_copy {
-
-struct __fn {
+struct __partial_sort_copy {
   template <input_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             random_access_iterator _Iter2,
@@ -98,10 +96,8 @@ struct __fn {
   }
 };
 
-} // namespace __partial_sort_copy
-
 inline namespace __cpo {
-inline constexpr auto partial_sort_copy = __partial_sort_copy::__fn{};
+inline constexpr auto partial_sort_copy = __partial_sort_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_partition.h b/libcxx/include/__algorithm/ranges_partition.h
index a67ac4c967570..7058cb0c39696 100644
--- a/libcxx/include/__algorithm/ranges_partition.h
+++ b/libcxx/include/__algorithm/ranges_partition.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __partition {
-
-struct __fn {
+struct __partition {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI static constexpr subrange<__remove_cvref_t<_Iter>>
   __partition_fn_impl(_Iter&& __first, _Sent&& __last, _Pred&& __pred, _Proj&& __proj) {
@@ -72,10 +70,8 @@ struct __fn {
   }
 };
 
-} // namespace __partition
-
 inline namespace __cpo {
-inline constexpr auto partition = __partition::__fn{};
+inline constexpr auto partition = __partition{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_partition_copy.h b/libcxx/include/__algorithm/ranges_partition_copy.h
index d60c865dd2a8a..47878a4017233 100644
--- a/libcxx/include/__algorithm/ranges_partition_copy.h
+++ b/libcxx/include/__algorithm/ranges_partition_copy.h
@@ -38,9 +38,7 @@ namespace ranges {
 template <class _InIter, class _OutIter1, class _OutIter2>
 using partition_copy_result = in_out_out_result<_InIter, _OutIter1, _OutIter2>;
 
-namespace __partition_copy {
-
-struct __fn {
+struct __partition_copy {
   // TODO(ranges): delegate to the classic algorithm.
   template <class _InIter, class _Sent, class _OutIter1, class _OutIter2, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static partition_copy_result<__remove_cvref_t<_InIter>,
@@ -94,10 +92,8 @@ struct __fn {
   }
 };
 
-} // namespace __partition_copy
-
 inline namespace __cpo {
-inline constexpr auto partition_copy = __partition_copy::__fn{};
+inline constexpr auto partition_copy = __partition_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_partition_point.h b/libcxx/include/__algorithm/ranges_partition_point.h
index c5b11b5fed192..324efbb86d64c 100644
--- a/libcxx/include/__algorithm/ranges_partition_point.h
+++ b/libcxx/include/__algorithm/ranges_partition_point.h
@@ -35,9 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __partition_point {
-
-struct __fn {
+struct __partition_point {
   // TODO(ranges): delegate to the classic algorithm.
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
@@ -77,10 +75,8 @@ struct __fn {
   }
 };
 
-} // namespace __partition_point
-
 inline namespace __cpo {
-inline constexpr auto partition_point = __partition_point::__fn{};
+inline constexpr auto partition_point = __partition_point{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_pop_heap.h b/libcxx/include/__algorithm/ranges_pop_heap.h
index 01f92c0f22888..eccf54c094e3d 100644
--- a/libcxx/include/__algorithm/ranges_pop_heap.h
+++ b/libcxx/include/__algorithm/ranges_pop_heap.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __pop_heap {
-
-struct __fn {
+struct __pop_heap {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __pop_heap_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -70,10 +68,8 @@ struct __fn {
   }
 };
 
-} // namespace __pop_heap
-
 inline namespace __cpo {
-inline constexpr auto pop_heap = __pop_heap::__fn{};
+inline constexpr auto pop_heap = __pop_heap{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_prev_permutation.h b/libcxx/include/__algorithm/ranges_prev_permutation.h
index 225cee9b75ec6..f2294b1cb00ba 100644
--- a/libcxx/include/__algorithm/ranges_prev_permutation.h
+++ b/libcxx/include/__algorithm/ranges_prev_permutation.h
@@ -40,9 +40,7 @@ namespace ranges {
 template <class _InIter>
 using prev_permutation_result = in_found_result<_InIter>;
 
-namespace __prev_permutation {
-
-struct __fn {
+struct __prev_permutation {
   template <bidirectional_iterator _Iter, sentinel_for<_Iter> _Sent, class _Comp = ranges::less, class _Proj = identity>
     requires sortable<_Iter, _Comp, _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr prev_permutation_result<_Iter>
@@ -62,10 +60,8 @@ struct __fn {
   }
 };
 
-} // namespace __prev_permutation
-
 inline namespace __cpo {
-constexpr inline auto prev_permutation = __prev_permutation::__fn{};
+constexpr inline auto prev_permutation = __prev_permutation{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_push_heap.h b/libcxx/include/__algorithm/ranges_push_heap.h
index 9d187af38c531..c5e0465bdcfe1 100644
--- a/libcxx/include/__algorithm/ranges_push_heap.h
+++ b/libcxx/include/__algorithm/ranges_push_heap.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __push_heap {
-
-struct __fn {
+struct __push_heap {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __push_heap_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -69,10 +67,8 @@ struct __fn {
   }
 };
 
-} // namespace __push_heap
-
 inline namespace __cpo {
-inline constexpr auto push_heap = __push_heap::__fn{};
+inline constexpr auto push_heap = __push_heap{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_remove.h b/libcxx/include/__algorithm/ranges_remove.h
index 17c3a2c5cd06b..6fbc49eba8a72 100644
--- a/libcxx/include/__algorithm/ranges_remove.h
+++ b/libcxx/include/__algorithm/ranges_remove.h
@@ -33,8 +33,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __remove {
-struct __fn {
+struct __remove {
   template <permutable _Iter, sentinel_for<_Iter> _Sent, class _Type, class _Proj = identity>
     requires indirect_binary_predicate<ranges::equal_to, projected<_Iter, _Proj>, const _Type*>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr subrange<_Iter>
@@ -52,10 +51,9 @@ struct __fn {
     return ranges::__remove_if_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __remove
 
 inline namespace __cpo {
-inline constexpr auto remove = __remove::__fn{};
+inline constexpr auto remove = __remove{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_remove_copy.h b/libcxx/include/__algorithm/ranges_remove_copy.h
index 84529eceac68c..764c52ee16b27 100644
--- a/libcxx/include/__algorithm/ranges_remove_copy.h
+++ b/libcxx/include/__algorithm/ranges_remove_copy.h
@@ -38,9 +38,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using remove_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __remove_copy {
-
-struct __fn {
+struct __remove_copy {
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
             weakly_incrementable _OutIter,
@@ -65,10 +63,8 @@ struct __fn {
   }
 };
 
-} // namespace __remove_copy
-
 inline namespace __cpo {
-inline constexpr auto remove_copy = __remove_copy::__fn{};
+inline constexpr auto remove_copy = __remove_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_remove_copy_if.h b/libcxx/include/__algorithm/ranges_remove_copy_if.h
index 56fe017533120..87136ae8258d6 100644
--- a/libcxx/include/__algorithm/ranges_remove_copy_if.h
+++ b/libcxx/include/__algorithm/ranges_remove_copy_if.h
@@ -53,9 +53,7 @@ __remove_copy_if_impl(_InIter __first, _Sent __last, _OutIter __result, _Pred& _
   return {std::move(__first), std::move(__result)};
 }
 
-namespace __remove_copy_if {
-
-struct __fn {
+struct __remove_copy_if {
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
             weakly_incrementable _OutIter,
@@ -79,10 +77,8 @@ struct __fn {
   }
 };
 
-} // namespace __remove_copy_if
-
 inline namespace __cpo {
-inline constexpr auto remove_copy_if = __remove_copy_if::__fn{};
+inline constexpr auto remove_copy_if = __remove_copy_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_remove_if.h b/libcxx/include/__algorithm/ranges_remove_if.h
index 0ea5d9a01b881..384b3d41d0801 100644
--- a/libcxx/include/__algorithm/ranges_remove_if.h
+++ b/libcxx/include/__algorithm/ranges_remove_if.h
@@ -53,8 +53,7 @@ __remove_if_impl(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) {
   return {__new_end, __i};
 }
 
-namespace __remove_if {
-struct __fn {
+struct __remove_if {
   template <permutable _Iter,
             sentinel_for<_Iter> _Sent,
             class _Proj = identity,
@@ -73,10 +72,9 @@ struct __fn {
     return ranges::__remove_if_impl(ranges::begin(__range), ranges::end(__range), __pred, __proj);
   }
 };
-} // namespace __remove_if
 
 inline namespace __cpo {
-inline constexpr auto remove_if = __remove_if::__fn{};
+inline constexpr auto remove_if = __remove_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_replace.h b/libcxx/include/__algorithm/ranges_replace.h
index 2b88dc032972f..15b1f38554a8c 100644
--- a/libcxx/include/__algorithm/ranges_replace.h
+++ b/libcxx/include/__algorithm/ranges_replace.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __replace {
-struct __fn {
+struct __replace {
   template <input_iterator _Iter, sentinel_for<_Iter> _Sent, class _Type1, class _Type2, class _Proj = identity>
     requires indirectly_writable<_Iter, const _Type2&> &&
              indirect_binary_predicate<ranges::equal_to, projected<_Iter, _Proj>, const _Type1*>
@@ -52,10 +51,9 @@ struct __fn {
     return ranges::__replace_if_impl(ranges::begin(__range), ranges::end(__range), __pred, __new_value, __proj);
   }
 };
-} // namespace __replace
 
 inline namespace __cpo {
-inline constexpr auto replace = __replace::__fn{};
+inline constexpr auto replace = __replace{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_replace_copy.h b/libcxx/include/__algorithm/ranges_replace_copy.h
index 633f993e5c948..7ab1c71543e2a 100644
--- a/libcxx/include/__algorithm/ranges_replace_copy.h
+++ b/libcxx/include/__algorithm/ranges_replace_copy.h
@@ -38,9 +38,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using replace_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __replace_copy {
-
-struct __fn {
+struct __replace_copy {
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
             class _OldType,
@@ -77,10 +75,8 @@ struct __fn {
   }
 };
 
-} // namespace __replace_copy
-
 inline namespace __cpo {
-inline constexpr auto replace_copy = __replace_copy::__fn{};
+inline constexpr auto replace_copy = __replace_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_replace_copy_if.h b/libcxx/include/__algorithm/ranges_replace_copy_if.h
index e065c3ac0acc9..852ec45edaefe 100644
--- a/libcxx/include/__algorithm/ranges_replace_copy_if.h
+++ b/libcxx/include/__algorithm/ranges_replace_copy_if.h
@@ -52,9 +52,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr replace_copy_if_result<_InIter, _OutIter> __repl
   return {std::move(__first), std::move(__result)};
 }
 
-namespace __replace_copy_if {
-
-struct __fn {
+struct __replace_copy_if {
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
             class _Type,
@@ -82,10 +80,8 @@ struct __fn {
   }
 };
 
-} // namespace __replace_copy_if
-
 inline namespace __cpo {
-inline constexpr auto replace_copy_if = __replace_copy_if::__fn{};
+inline constexpr auto replace_copy_if = __replace_copy_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_replace_if.h b/libcxx/include/__algorithm/ranges_replace_if.h
index 6445f42aea190..baa566810b5d0 100644
--- a/libcxx/include/__algorithm/ranges_replace_if.h
+++ b/libcxx/include/__algorithm/ranges_replace_if.h
@@ -42,8 +42,7 @@ __replace_if_impl(_Iter __first, _Sent __last, _Pred& __pred, const _Type& __new
   return __first;
 }
 
-namespace __replace_if {
-struct __fn {
+struct __replace_if {
   template <input_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Type,
@@ -65,10 +64,9 @@ struct __fn {
     return ranges::__replace_if_impl(ranges::begin(__range), ranges::end(__range), __pred, __new_value, __proj);
   }
 };
-} // namespace __replace_if
 
 inline namespace __cpo {
-inline constexpr auto replace_if = __replace_if::__fn{};
+inline constexpr auto replace_if = __replace_if{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_reverse.h b/libcxx/include/__algorithm/ranges_reverse.h
index 9ec865995b4a5..4e82118719772 100644
--- a/libcxx/include/__algorithm/ranges_reverse.h
+++ b/libcxx/include/__algorithm/ranges_reverse.h
@@ -27,8 +27,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __reverse {
-struct __fn {
+struct __reverse {
   template <bidirectional_iterator _Iter, sentinel_for<_Iter> _Sent>
     requires permutable<_Iter>
   _LIBCPP_HIDE_FROM_ABI constexpr _Iter operator()(_Iter __first, _Sent __last) const {
@@ -65,10 +64,9 @@ struct __fn {
     return (*this)(ranges::begin(__range), ranges::end(__range));
   }
 };
-} // namespace __reverse
 
 inline namespace __cpo {
-inline constexpr auto reverse = __reverse::__fn{};
+inline constexpr auto reverse = __reverse{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_reverse_copy.h b/libcxx/include/__algorithm/ranges_reverse_copy.h
index 60043787a7170..e5ca5cf652dc4 100644
--- a/libcxx/include/__algorithm/ranges_reverse_copy.h
+++ b/libcxx/include/__algorithm/ranges_reverse_copy.h
@@ -37,8 +37,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using reverse_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __reverse_copy {
-struct __fn {
+struct __reverse_copy {
   template <bidirectional_iterator _InIter, sentinel_for<_InIter> _Sent, weakly_incrementable _OutIter>
     requires indirectly_copyable<_InIter, _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr reverse_copy_result<_InIter, _OutIter>
@@ -54,10 +53,9 @@ struct __fn {
     return {ranges::next(ranges::begin(__range), ranges::end(__range)), std::move(__ret.out)};
   }
 };
-} // namespace __reverse_copy
 
 inline namespace __cpo {
-inline constexpr auto reverse_copy = __reverse_copy::__fn{};
+inline constexpr auto reverse_copy = __reverse_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_rotate.h b/libcxx/include/__algorithm/ranges_rotate.h
index 8d33a6f0799bf..c1affc684ae4f 100644
--- a/libcxx/include/__algorithm/ranges_rotate.h
+++ b/libcxx/include/__algorithm/ranges_rotate.h
@@ -33,9 +33,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __rotate {
-
-struct __fn {
+struct __rotate {
   template <class _Iter, class _Sent>
   _LIBCPP_HIDE_FROM_ABI constexpr static subrange<_Iter> __rotate_fn_impl(_Iter __first, _Iter __middle, _Sent __last) {
     auto __ret = std::__rotate<_RangeAlgPolicy>(std::move(__first), std::move(__middle), std::move(__last));
@@ -55,10 +53,8 @@ struct __fn {
   }
 };
 
-} // namespace __rotate
-
 inline namespace __cpo {
-inline constexpr auto rotate = __rotate::__fn{};
+inline constexpr auto rotate = __rotate{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_rotate_copy.h b/libcxx/include/__algorithm/ranges_rotate_copy.h
index 26fe110b53896..c0b4264a1b253 100644
--- a/libcxx/include/__algorithm/ranges_rotate_copy.h
+++ b/libcxx/include/__algorithm/ranges_rotate_copy.h
@@ -34,8 +34,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using rotate_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __rotate_copy {
-struct __fn {
+struct __rotate_copy {
   template <forward_iterator _InIter, sentinel_for<_InIter> _Sent, weakly_incrementable _OutIter>
     requires indirectly_copyable<_InIter, _OutIter>
   _LIBCPP_HIDE_FROM_ABI constexpr rotate_copy_result<_InIter, _OutIter>
@@ -52,10 +51,9 @@ struct __fn {
     return (*this)(ranges::begin(__range), std::move(__middle), ranges::end(__range), std::move(__result));
   }
 };
-} // namespace __rotate_copy
 
 inline namespace __cpo {
-inline constexpr auto rotate_copy = __rotate_copy::__fn{};
+inline constexpr auto rotate_copy = __rotate_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_sample.h b/libcxx/include/__algorithm/ranges_sample.h
index e4f60a7b66be2..a3b29608150d2 100644
--- a/libcxx/include/__algorithm/ranges_sample.h
+++ b/libcxx/include/__algorithm/ranges_sample.h
@@ -35,9 +35,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __sample {
-
-struct __fn {
+struct __sample {
   template <input_iterator _Iter, sentinel_for<_Iter> _Sent, weakly_incrementable _OutIter, class _Gen>
     requires(forward_iterator<_Iter> || random_access_iterator<_OutIter>) && indirectly_copyable<_Iter, _OutIter> &&
             uniform_random_bit_generator<remove_reference_t<_Gen>>
@@ -58,10 +56,8 @@ struct __fn {
   }
 };
 
-} // namespace __sample
-
 inline namespace __cpo {
-inline constexpr auto sample = __sample::__fn{};
+inline constexpr auto sample = __sample{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_search.h b/libcxx/include/__algorithm/ranges_search.h
index 55294c60631b1..b711512039635 100644
--- a/libcxx/include/__algorithm/ranges_search.h
+++ b/libcxx/include/__algorithm/ranges_search.h
@@ -33,8 +33,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __search {
-struct __fn {
+struct __search {
   template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
   _LIBCPP_HIDE_FROM_ABI static constexpr subrange<_Iter1> __ranges_search_impl(
       _Iter1 __first1,
@@ -120,10 +119,9 @@ struct __fn {
         __proj2);
   }
 };
-} // namespace __search
 
 inline namespace __cpo {
-inline constexpr auto search = __search::__fn{};
+inline constexpr auto search = __search{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_search_n.h b/libcxx/include/__algorithm/ranges_search_n.h
index 56e12755b9bf6..81b568c0965fd 100644
--- a/libcxx/include/__algorithm/ranges_search_n.h
+++ b/libcxx/include/__algorithm/ranges_search_n.h
@@ -39,8 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __search_n {
-struct __fn {
+struct __search_n {
   template <class _Iter1, class _Sent1, class _SizeT, class _Type, class _Pred, class _Proj>
   _LIBCPP_HIDE_FROM_ABI static constexpr subrange<_Iter1> __ranges_search_n_impl(
       _Iter1 __first, _Sent1 __last, _SizeT __count, const _Type& __value, _Pred& __pred, _Proj& __proj) {
@@ -100,10 +99,9 @@ struct __fn {
     return __ranges_search_n_impl(ranges::begin(__range), ranges::end(__range), __count, __value, __pred, __proj);
   }
 };
-} // namespace __search_n
 
 inline namespace __cpo {
-inline constexpr auto search_n = __search_n::__fn{};
+inline constexpr auto search_n = __search_n{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_set_difference.h b/libcxx/include/__algorithm/ranges_set_difference.h
index 0841fb4ffd0c0..aa36e358ef6f9 100644
--- a/libcxx/include/__algorithm/ranges_set_difference.h
+++ b/libcxx/include/__algorithm/ranges_set_difference.h
@@ -42,9 +42,7 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using set_difference_result = in_out_result<_InIter, _OutIter>;
 
-namespace __set_difference {
-
-struct __fn {
+struct __set_difference {
   template <input_iterator _InIter1,
             sentinel_for<_InIter1> _Sent1,
             input_iterator _InIter2,
@@ -93,10 +91,8 @@ struct __fn {
   }
 };
 
-} // namespace __set_difference
-
 inline namespace __cpo {
-inline constexpr auto set_difference = __set_difference::__fn{};
+inline constexpr auto set_difference = __set_difference{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_set_intersection.h b/libcxx/include/__algorithm/ranges_set_intersection.h
index 9427379745b60..068794cf1b14f 100644
--- a/libcxx/include/__algorithm/ranges_set_intersection.h
+++ b/libcxx/include/__algorithm/ranges_set_intersection.h
@@ -40,9 +40,7 @@ namespace ranges {
 template <class _InIter1, class _InIter2, class _OutIter>
 using set_intersection_result = in_in_out_result<_InIter1, _InIter2, _OutIter>;
 
-namespace __set_intersection {
-
-struct __fn {
+struct __set_intersection {
   template <input_iterator _InIter1,
             sentinel_for<_InIter1> _Sent1,
             input_iterator _InIter2,
@@ -98,10 +96,8 @@ struct __fn {
   }
 };
 
-} // namespace __set_intersection
-
 inline namespace __cpo {
-inline constexpr auto set_intersection = __set_intersection::__fn{};
+inline constexpr auto set_intersection = __set_intersection{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h
index 995eb0999d940..e3b73c3eb1cf2 100644
--- a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h
+++ b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h
@@ -40,9 +40,7 @@ namespace ranges {
 template <class _InIter1, class _InIter2, class _OutIter>
 using set_symmetric_difference_result = in_in_out_result<_InIter1, _InIter2, _OutIter>;
 
-namespace __set_symmetric_difference {
-
-struct __fn {
+struct __set_symmetric_difference {
   template <input_iterator _InIter1,
             sentinel_for<_InIter1> _Sent1,
             input_iterator _InIter2,
@@ -98,10 +96,8 @@ struct __fn {
   }
 };
 
-} // namespace __set_symmetric_difference
-
 inline namespace __cpo {
-inline constexpr auto set_symmetric_difference = __set_symmetric_difference::__fn{};
+inline constexpr auto set_symmetric_difference = __set_symmetric_difference{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_set_union.h b/libcxx/include/__algorithm/ranges_set_union.h
index e870e390cc665..f2e3ddb89d7b3 100644
--- a/libcxx/include/__algorithm/ranges_set_union.h
+++ b/libcxx/include/__algorithm/ranges_set_union.h
@@ -43,9 +43,7 @@ namespace ranges {
 template <class _InIter1, class _InIter2, class _OutIter>
 using set_union_result = in_in_out_result<_InIter1, _InIter2, _OutIter>;
 
-namespace __set_union {
-
-struct __fn {
+struct __set_union {
   template <input_iterator _InIter1,
             sentinel_for<_InIter1> _Sent1,
             input_iterator _InIter2,
@@ -99,10 +97,8 @@ struct __fn {
   }
 };
 
-} // namespace __set_union
-
 inline namespace __cpo {
-inline constexpr auto set_union = __set_union::__fn{};
+inline constexpr auto set_union = __set_union{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_shuffle.h b/libcxx/include/__algorithm/ranges_shuffle.h
index ab98ea22caabe..87cb3685bb95f 100644
--- a/libcxx/include/__algorithm/ranges_shuffle.h
+++ b/libcxx/include/__algorithm/ranges_shuffle.h
@@ -39,9 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __shuffle {
-
-struct __fn {
+struct __shuffle {
   template <random_access_iterator _Iter, sentinel_for<_Iter> _Sent, class _Gen>
     requires permutable<_Iter> && uniform_random_bit_generator<remove_reference_t<_Gen>>
   _LIBCPP_HIDE_FROM_ABI _Iter operator()(_Iter __first, _Sent __last, _Gen&& __gen) const {
@@ -56,10 +54,8 @@ struct __fn {
   }
 };
 
-} // namespace __shuffle
-
 inline namespace __cpo {
-inline constexpr auto shuffle = __shuffle::__fn{};
+inline constexpr auto shuffle = __shuffle{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_sort.h b/libcxx/include/__algorithm/ranges_sort.h
index 0296c146b3ede..2afad4c41301e 100644
--- a/libcxx/include/__algorithm/ranges_sort.h
+++ b/libcxx/include/__algorithm/ranges_sort.h
@@ -39,9 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __sort {
-
-struct __fn {
+struct __sort {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __sort_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -68,10 +66,8 @@ struct __fn {
   }
 };
 
-} // namespace __sort
-
 inline namespace __cpo {
-inline constexpr auto sort = __sort::__fn{};
+inline constexpr auto sort = __sort{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_sort_heap.h b/libcxx/include/__algorithm/ranges_sort_heap.h
index bab30df1708c7..d3e20874fac50 100644
--- a/libcxx/include/__algorithm/ranges_sort_heap.h
+++ b/libcxx/include/__algorithm/ranges_sort_heap.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __sort_heap {
-
-struct __fn {
+struct __sort_heap {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI constexpr static _Iter
   __sort_heap_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
@@ -69,10 +67,8 @@ struct __fn {
   }
 };
 
-} // namespace __sort_heap
-
 inline namespace __cpo {
-inline constexpr auto sort_heap = __sort_heap::__fn{};
+inline constexpr auto sort_heap = __sort_heap{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_stable_partition.h b/libcxx/include/__algorithm/ranges_stable_partition.h
index f34027ff772c7..cfc02e1e97b3f 100644
--- a/libcxx/include/__algorithm/ranges_stable_partition.h
+++ b/libcxx/include/__algorithm/ranges_stable_partition.h
@@ -42,9 +42,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __stable_partition {
-
-struct __fn {
+struct __stable_partition {
   template <class _Iter, class _Sent, class _Proj, class _Pred>
   _LIBCPP_HIDE_FROM_ABI static subrange<__remove_cvref_t<_Iter>>
   __stable_partition_fn_impl(_Iter&& __first, _Sent&& __last, _Pred&& __pred, _Proj&& __proj) {
@@ -76,10 +74,8 @@ struct __fn {
   }
 };
 
-} // namespace __stable_partition
-
 inline namespace __cpo {
-inline constexpr auto stable_partition = __stable_partition::__fn{};
+inline constexpr auto stable_partition = __stable_partition{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_stable_sort.h b/libcxx/include/__algorithm/ranges_stable_sort.h
index 93909e253cc0f..9c7df80ae9872 100644
--- a/libcxx/include/__algorithm/ranges_stable_sort.h
+++ b/libcxx/include/__algorithm/ranges_stable_sort.h
@@ -39,9 +39,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __stable_sort {
-
-struct __fn {
+struct __stable_sort {
   template <class _Iter, class _Sent, class _Comp, class _Proj>
   _LIBCPP_HIDE_FROM_ABI static _Iter __stable_sort_fn_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) {
     auto __last_iter = ranges::next(__first, __last);
@@ -66,10 +64,8 @@ struct __fn {
   }
 };
 
-} // namespace __stable_sort
-
 inline namespace __cpo {
-inline constexpr auto stable_sort = __stable_sort::__fn{};
+inline constexpr auto stable_sort = __stable_sort{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_starts_with.h b/libcxx/include/__algorithm/ranges_starts_with.h
index 17084e4f24336..ae145d59010ae 100644
--- a/libcxx/include/__algorithm/ranges_starts_with.h
+++ b/libcxx/include/__algorithm/ranges_starts_with.h
@@ -32,8 +32,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __starts_with {
-struct __fn {
+struct __starts_with {
   template <input_iterator _Iter1,
             sentinel_for<_Iter1> _Sent1,
             input_iterator _Iter2,
@@ -50,7 +49,7 @@ struct __fn {
       _Pred __pred   = {},
       _Proj1 __proj1 = {},
       _Proj2 __proj2 = {}) {
-    return __mismatch::__fn::__go(
+    return __mismatch::__go(
                std::move(__first1),
                std::move(__last1),
                std::move(__first2),
@@ -69,7 +68,7 @@ struct __fn {
     requires indirectly_comparable<iterator_t<_Range1>, iterator_t<_Range2>, _Pred, _Proj1, _Proj2>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr bool
   operator()(_Range1&& __range1, _Range2&& __range2, _Pred __pred = {}, _Proj1 __proj1 = {}, _Proj2 __proj2 = {}) {
-    return __mismatch::__fn::__go(
+    return __mismatch::__go(
                ranges::begin(__range1),
                ranges::end(__range1),
                ranges::begin(__range2),
@@ -80,9 +79,8 @@ struct __fn {
                .in2 == ranges::end(__range2);
   }
 };
-} // namespace __starts_with
 inline namespace __cpo {
-inline constexpr auto starts_with = __starts_with::__fn{};
+inline constexpr auto starts_with = __starts_with{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_swap_ranges.h b/libcxx/include/__algorithm/ranges_swap_ranges.h
index b6d9f618395a5..ab6db50d8a13e 100644
--- a/libcxx/include/__algorithm/ranges_swap_ranges.h
+++ b/libcxx/include/__algorithm/ranges_swap_ranges.h
@@ -36,8 +36,7 @@ namespace ranges {
 template <class _I1, class _I2>
 using swap_ranges_result = in_in_result<_I1, _I2>;
 
-namespace __swap_ranges {
-struct __fn {
+struct __swap_ranges {
   template <input_iterator _I1, sentinel_for<_I1> _S1, input_iterator _I2, sentinel_for<_I2> _S2>
     requires indirectly_swappable<_I1, _I2>
   _LIBCPP_HIDE_FROM_ABI constexpr swap_ranges_result<_I1, _I2>
@@ -54,10 +53,9 @@ struct __fn {
     return operator()(ranges::begin(__r1), ranges::end(__r1), ranges::begin(__r2), ranges::end(__r2));
   }
 };
-} // namespace __swap_ranges
 
 inline namespace __cpo {
-inline constexpr auto swap_ranges = __swap_ranges::__fn{};
+inline constexpr auto swap_ranges = __swap_ranges{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_transform.h b/libcxx/include/__algorithm/ranges_transform.h
index 7850ec4f84656..091311821968c 100644
--- a/libcxx/include/__algorithm/ranges_transform.h
+++ b/libcxx/include/__algorithm/ranges_transform.h
@@ -41,8 +41,7 @@ using unary_transform_result = in_out_result<_Ip, _Op>;
 template <class _I1, class _I2, class _O1>
 using binary_transform_result = in_in_out_result<_I1, _I2, _O1>;
 
-namespace __transform {
-struct __fn {
+struct __transform {
 private:
   template <class _InIter, class _Sent, class _OutIter, class _Func, class _Proj>
   _LIBCPP_HIDE_FROM_ABI static constexpr unary_transform_result<_InIter, _OutIter>
@@ -161,10 +160,9 @@ struct __fn {
         __projection2);
   }
 };
-} // namespace __transform
 
 inline namespace __cpo {
-inline constexpr auto transform = __transform::__fn{};
+inline constexpr auto transform = __transform{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_unique.h b/libcxx/include/__algorithm/ranges_unique.h
index 7a9b784321873..a817359abd889 100644
--- a/libcxx/include/__algorithm/ranges_unique.h
+++ b/libcxx/include/__algorithm/ranges_unique.h
@@ -40,9 +40,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __unique {
-
-struct __fn {
+struct __unique {
   template <permutable _Iter,
             sentinel_for<_Iter> _Sent,
             class _Proj                                                  = identity,
@@ -66,10 +64,8 @@ struct __fn {
   }
 };
 
-} // namespace __unique
-
 inline namespace __cpo {
-inline constexpr auto unique = __unique::__fn{};
+inline constexpr auto unique = __unique{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_unique_copy.h b/libcxx/include/__algorithm/ranges_unique_copy.h
index 61133885ae809..3b4a64e94ca1f 100644
--- a/libcxx/include/__algorithm/ranges_unique_copy.h
+++ b/libcxx/include/__algorithm/ranges_unique_copy.h
@@ -44,12 +44,10 @@ namespace ranges {
 template <class _InIter, class _OutIter>
 using unique_copy_result = in_out_result<_InIter, _OutIter>;
 
-namespace __unique_copy {
-
 template <class _InIter, class _OutIter>
 concept __can_reread_from_output = (input_iterator<_OutIter> && same_as<iter_value_t<_InIter>, iter_value_t<_OutIter>>);
 
-struct __fn {
+struct __unique_copy {
   template <class _InIter, class _OutIter>
   static consteval auto __get_algo_tag() {
     if constexpr (forward_iterator<_InIter>) {
@@ -104,10 +102,8 @@ struct __fn {
   }
 };
 
-} // namespace __unique_copy
-
 inline namespace __cpo {
-inline constexpr auto unique_copy = __unique_copy::__fn{};
+inline constexpr auto unique_copy = __unique_copy{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/ranges_upper_bound.h b/libcxx/include/__algorithm/ranges_upper_bound.h
index fa6fa7f70ed5a..4b2835d4d58de 100644
--- a/libcxx/include/__algorithm/ranges_upper_bound.h
+++ b/libcxx/include/__algorithm/ranges_upper_bound.h
@@ -30,8 +30,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace ranges {
-namespace __upper_bound {
-struct __fn {
+struct __upper_bound {
   template <forward_iterator _Iter,
             sentinel_for<_Iter> _Sent,
             class _Type,
@@ -60,10 +59,9 @@ struct __fn {
         ranges::begin(__r), ranges::end(__r), __value, __comp_lhs_rhs_swapped, __proj);
   }
 };
-} // namespace __upper_bound
 
 inline namespace __cpo {
-inline constexpr auto upper_bound = __upper_bound::__fn{};
+inline constexpr auto upper_bound = __upper_bound{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 73d888d1b0384..bb0d86cd0f58d 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -12,10 +12,15 @@
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
 #include <__algorithm/iterator_operations.h>
+#include <__algorithm/lower_bound.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
+#include <__type_traits/is_same.h>
+#include <__utility/exchange.h>
 #include <__utility/move.h>
+#include <__utility/swap.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -38,10 +43,103 @@ struct __set_intersection_result {
       : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {}
 };
 
-template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+// Helper for __set_intersection() with one-sided binary search: populate result and advance input iterators if they
+// are found to potentially contain the same value in two consecutive calls. This function is very intimately related to
+// the way it is used and doesn't attempt to abstract that, it's not appropriate for general usage outside of its
+// context.
+template <class _InForwardIter1, class _InForwardIter2, class _OutIter>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_intersection_add_output_if_equal(
+    bool __may_be_equal,
+    _InForwardIter1& __first1,
+    _InForwardIter2& __first2,
+    _OutIter& __result,
+    bool& __prev_may_be_equal) {
+  if (__may_be_equal && __prev_may_be_equal) {
+    *__result = *__first1;
+    ++__result;
+    ++__first1;
+    ++__first2;
+    __prev_may_be_equal = false;
+  } else {
+    __prev_may_be_equal = __may_be_equal;
+  }
+}
+
+// With forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to
+// reduce best-case complexity to log(N). Understanding how we can use binary search and still respect complexity
+// guarantees is _not_ straightforward: the guarantee is "at most 2*(N+M)-1 comparisons", and one-sided binary search
+// will necessarily overshoot depending on the position of the needle in the haystack -- for instance, if we're
+// searching for 3 in (1, 2, 3, 4), we'll check if 3<1, then 3<2, then 3<4, and, finally, 3<3, for a total of 4
+// comparisons, when linear search would have yielded 3. However, because we won't need to perform the intervening
+// reciprocal comparisons (ie 1<3, 2<3, 4<3), that extra comparison doesn't run afoul of the guarantee. Additionally,
+// this type of scenario can only happen for match distances of up to 5 elements, because 2*log2(8) is 6, and we'll
+// still be worse-off at position 5 of an 8-element set. From then onwards these scenarios can't happen. TL;DR: we'll be
+// 1 comparison worse-off compared to the classic linear-searching algorithm if matching position 3 of a set with 4
+// elements, or position 5 if the set has 7 or 8 elements, but we'll never exceed the complexity guarantees from the
+// standard.
+template <class _AlgPolicy,
+          class _Compare,
+          class _InForwardIter1,
+          class _Sent1,
+          class _InForwardIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
 __set_intersection(
-    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+    _InForwardIter1 __first1,
+    _Sent1 __last1,
+    _InForwardIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::forward_iterator_tag,
+    std::forward_iterator_tag) {
+  _LIBCPP_CONSTEXPR std::__identity __proj;
+  bool __prev_may_be_equal = false;
+
+  while (__first2 != __last2) {
+    _InForwardIter1 __first1_next =
+        std::__lower_bound_onesided<_AlgPolicy>(__first1, __last1, *__first2, __comp, __proj);
+    std::swap(__first1_next, __first1);
+    // keeping in mind that a==b iff !(a<b) && !(b<a):
+    // if we can't advance __first1, that means !(*__first1 < *_first2), therefore __may_be_equal==true
+    std::__set_intersection_add_output_if_equal(
+        __first1 == __first1_next, __first1, __first2, __result, __prev_may_be_equal);
+    if (__first1 == __last1)
+      break;
+
+    _InForwardIter2 __first2_next =
+        std::__lower_bound_onesided<_AlgPolicy>(__first2, __last2, *__first1, __comp, __proj);
+    std::swap(__first2_next, __first2);
+    std::__set_intersection_add_output_if_equal(
+        __first2 == __first2_next, __first1, __first2, __result, __prev_may_be_equal);
+  }
+  return __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>(
+      _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
+      _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
+      std::move(__result));
+}
+
+// input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version
+template <class _AlgPolicy,
+          class _Compare,
+          class _InInputIter1,
+          class _Sent1,
+          class _InInputIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+__set_intersection(
+    _InInputIter1 __first1,
+    _Sent1 __last1,
+    _InInputIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::input_iterator_tag,
+    std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -55,12 +153,28 @@ __set_intersection(
     }
   }
 
-  return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
+  return __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>(
       _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
       _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
       std::move(__result));
 }
 
+template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+__set_intersection(
+    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+  return std::__set_intersection<_AlgPolicy>(
+      std::move(__first1),
+      std::move(__last1),
+      std::move(__first2),
+      std::move(__last2),
+      std::move(__result),
+      std::forward<_Compare>(__comp),
+      typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter1>(),
+      typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter2>());
+}
+
 template <class _InputIterator1, class _InputIterator2, class _OutputIterator, class _Compare>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection(
     _InputIterator1 __first1,
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index bd3f659c22df0..bcea21f5ce2e1 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -429,6 +429,8 @@ _LIBCPP_HIDE_FROM_ABI bool atomic_compare_exchange_strong_explicit(
   return __o->compare_exchange_strong(*__e, __d, __s, __f);
 }
 
+#if _LIBCPP_STD_VER >= 20
+
 // atomic_wait
 
 template <class _Tp>
@@ -462,29 +464,27 @@ atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __
 // atomic_notify_one
 
 template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
   __o->notify_one();
 }
 template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
   __o->notify_one();
 }
 
 // atomic_notify_all
 
 template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
   __o->notify_all();
 }
 template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
   __o->notify_all();
 }
 
+#endif // _LIBCPP_STD_VER >= 20
+
 // atomic_fetch_add
 
 template <class _Tp>
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index 7e26434c9c3a0..93f5c4cff0d1b 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -101,6 +101,7 @@ struct __atomic_base // false
     return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
   }
 
+#if _LIBCPP_STD_VER >= 20
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const
       volatile _NOEXCEPT {
     std::__atomic_wait(*this, __v, __m);
@@ -117,6 +118,7 @@ struct __atomic_base // false
     std::__atomic_notify_all(*this);
   }
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+#endif //  _LIBCPP_STD_VER >= 20
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI constexpr __atomic_base() noexcept(is_nothrow_default_constructible_v<_Tp>) : __a_(_Tp()) {}
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 00b157cdff78b..abebfc112cb8e 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -48,26 +48,24 @@ struct atomic_flag {
     __cxx_atomic_store(&__a_, _LIBCPP_ATOMIC_FLAG_TYPE(false), __m);
   }
 
-  _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-  wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT {
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(bool __v, memory_order __m = memory_order_seq_cst) const
+      volatile _NOEXCEPT {
     std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
   }
-  _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
   wait(bool __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT {
     std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
   }
-  _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
-    std::__atomic_notify_one(*this);
-  }
-  _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT {
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
     std::__atomic_notify_one(*this);
   }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); }
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT {
     std::__atomic_notify_all(*this);
   }
-  _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT {
-    std::__atomic_notify_all(*this);
-  }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+#endif
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI constexpr atomic_flag() _NOEXCEPT : __a_(false) {}
@@ -144,45 +142,45 @@ inline _LIBCPP_HIDE_FROM_ABI void atomic_flag_clear_explicit(atomic_flag* __o, m
   __o->clear(__m);
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+#if _LIBCPP_STD_VER >= 20
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_wait(const volatile atomic_flag* __o, bool __v) _NOEXCEPT {
   __o->wait(__v);
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_wait(const atomic_flag* __o, bool __v) _NOEXCEPT {
   __o->wait(__v);
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT {
   __o->wait(__v, __m);
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT {
   __o->wait(__v, __m);
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_notify_one(volatile atomic_flag* __o) _NOEXCEPT {
   __o->notify_one();
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
-atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT {
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT {
   __o->notify_one();
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
 atomic_flag_notify_all(volatile atomic_flag* __o) _NOEXCEPT {
   __o->notify_all();
 }
 
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
-atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT {
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT {
   __o->notify_all();
 }
+#endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
index 156f1961151c1..b0180a37ab500 100644
--- a/libcxx/include/__atomic/atomic_ref.h
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -42,13 +42,21 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
-template <class _Tp>
-struct __atomic_ref_base {
-protected:
-  _Tp* __ptr_;
+// These types are required to make __atomic_is_always_lock_free work across GCC and Clang.
+// The purpose of this trick is to make sure that we provide an object with the correct alignment
+// to __atomic_is_always_lock_free, since that answer depends on the alignment.
+template <size_t _Alignment>
+struct __alignment_checker_type {
+  alignas(_Alignment) char __data;
+};
 
-  _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
+template <size_t _Alignment>
+struct __get_aligner_instance {
+  static constexpr __alignment_checker_type<_Alignment> __instance{};
+};
 
+template <class _Tp>
+struct __atomic_ref_base {
 private:
   _LIBCPP_HIDE_FROM_ABI static _Tp* __clear_padding(_Tp& __val) noexcept {
     _Tp* __ptr = std::addressof(__val);
@@ -95,17 +103,21 @@ struct __atomic_ref_base {
 
   friend struct __atomic_waitable_traits<__atomic_ref_base<_Tp>>;
 
+  // require types that are 1, 2, 4, 8, or 16 bytes in length to be aligned to at least their size to be potentially
+  // used lock-free
+  static constexpr size_t __min_alignment = (sizeof(_Tp) & (sizeof(_Tp) - 1)) || (sizeof(_Tp) > 16) ? 0 : sizeof(_Tp);
+
 public:
   using value_type = _Tp;
 
-  static constexpr size_t required_alignment = alignof(_Tp);
+  static constexpr size_t required_alignment = alignof(_Tp) > __min_alignment ? alignof(_Tp) : __min_alignment;
 
   // The __atomic_always_lock_free builtin takes into account the alignment of the pointer if provided,
   // so we create a fake pointer with a suitable alignment when querying it. Note that we are guaranteed
   // that the pointer is going to be aligned properly at runtime because that is a (checked) precondition
   // of atomic_ref's constructor.
   static constexpr bool is_always_lock_free =
-      __atomic_always_lock_free(sizeof(_Tp), reinterpret_cast<void*>(-required_alignment));
+      __atomic_always_lock_free(sizeof(_Tp), &__get_aligner_instance<required_alignment>::__instance);
 
   _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const noexcept { return __atomic_is_lock_free(sizeof(_Tp), __ptr_); }
 
@@ -205,6 +217,12 @@ struct __atomic_ref_base {
   }
   _LIBCPP_HIDE_FROM_ABI void notify_one() const noexcept { std::__atomic_notify_one(*this); }
   _LIBCPP_HIDE_FROM_ABI void notify_all() const noexcept { std::__atomic_notify_all(*this); }
+
+protected:
+  typedef _Tp _Aligned_Tp __attribute__((aligned(required_alignment)));
+  _Aligned_Tp* __ptr_;
+
+  _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
 };
 
 template <class _Tp>
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 606069d98be72..22637d4397412 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -16,6 +16,7 @@
 #include <__bit/countr.h>
 #include <__bit/invert_if.h>
 #include <__bit/popcount.h>
+#include <__compare/ordering.h>
 #include <__config>
 #include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
@@ -913,6 +914,7 @@ public:
     return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_;
   }
 
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool
   operator!=(const __bit_iterator& __x, const __bit_iterator& __y) {
     return !(__x == __y);
@@ -937,6 +939,18 @@ public:
   operator>=(const __bit_iterator& __x, const __bit_iterator& __y) {
     return !(__x < __y);
   }
+#else  // _LIBCPP_STD_VER <= 17
+  _LIBCPP_HIDE_FROM_ABI constexpr friend strong_ordering
+  operator<=>(const __bit_iterator& __x, const __bit_iterator& __y) {
+    if (__x.__seg_ < __y.__seg_)
+      return strong_ordering::less;
+
+    if (__x.__seg_ == __y.__seg_)
+      return __x.__ctz_ <=> __y.__ctz_;
+
+    return strong_ordering::greater;
+  }
+#endif // _LIBCPP_STD_VER <= 17
 
 private:
   _LIBCPP_HIDE_FROM_ABI
diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h
index 881a4970822d8..3a51019b80784 100644
--- a/libcxx/include/__chrono/convert_to_tm.h
+++ b/libcxx/include/__chrono/convert_to_tm.h
@@ -29,11 +29,13 @@
 #include <__chrono/year_month.h>
 #include <__chrono/year_month_day.h>
 #include <__chrono/year_month_weekday.h>
+#include <__chrono/zoned_time.h>
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__format/format_error.h>
 #include <__memory/addressof.h>
 #include <__type_traits/is_convertible.h>
+#include <__type_traits/is_specialization.h>
 #include <cstdint>
 #include <ctime>
 #include <limits>
@@ -178,7 +180,13 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) {
     // Has no time information.
   } else if constexpr (same_as<_ChronoT, chrono::local_info>) {
     // Has no time information.
-#  endif
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  } else if constexpr (__is_specialization_v<_ChronoT, chrono::zoned_time>) {
+    return std::__convert_to_tm<_Tm>(
+        chrono::sys_time<typename _ChronoT::duration>{__value.get_local_time().time_since_epoch()});
+#    endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   } else
     static_assert(sizeof(_ChronoT) == 0, "Add the missing type specialization");
 
diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h
index 9a77316385abd..449c415e95760 100644
--- a/libcxx/include/__chrono/formatter.h
+++ b/libcxx/include/__chrono/formatter.h
@@ -33,6 +33,7 @@
 #include <__chrono/year_month.h>
 #include <__chrono/year_month_day.h>
 #include <__chrono/year_month_weekday.h>
+#include <__chrono/zoned_time.h>
 #include <__concepts/arithmetic.h>
 #include <__concepts/same_as.h>
 #include <__config>
@@ -44,6 +45,7 @@
 #include <__format/parser_std_format_spec.h>
 #include <__format/write_escaped.h>
 #include <__memory/addressof.h>
+#include <__type_traits/is_specialization.h>
 #include <cmath>
 #include <ctime>
 #include <limits>
@@ -137,10 +139,24 @@ __format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::hh_mm_ss<
                    __value.fractional_width);
 }
 
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) && !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) &&                     \
+      !defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+template <class _CharT, class _Duration, class _TimeZonePtr>
+_LIBCPP_HIDE_FROM_ABI void
+__format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::zoned_time<_Duration, _TimeZonePtr>& __value) {
+  __formatter::__format_sub_seconds(__sstr, __value.get_local_time().time_since_epoch());
+}
+#  endif
+
 template <class _Tp>
 consteval bool __use_fraction() {
   if constexpr (__is_time_point<_Tp>)
     return chrono::hh_mm_ss<typename _Tp::duration>::fractional_width;
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) && !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) &&                     \
+      !defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return chrono::hh_mm_ss<typename _Tp::duration>::fractional_width;
+#  endif
   else if constexpr (chrono::__is_duration<_Tp>::value)
     return chrono::hh_mm_ss<_Tp>::fractional_width;
   else if constexpr (__is_hh_mm_ss<_Tp>)
@@ -212,8 +228,13 @@ _LIBCPP_HIDE_FROM_ABI __time_zone __convert_to_time_zone([[maybe_unused]] const
 #  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   if constexpr (same_as<_Tp, chrono::sys_info>)
     return {__value.abbrev, __value.offset};
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return __formatter::__convert_to_time_zone(__value.get_info());
+#    endif
   else
-#  endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
     return {"UTC", chrono::seconds{0}};
 }
 
@@ -426,7 +447,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_ok(const _Tp& __value) {
     return true;
   else if constexpr (same_as<_Tp, chrono::local_info>)
     return true;
-#  endif
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return true;
+#    endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   else
     static_assert(sizeof(_Tp) == 0, "Add the missing type specialization");
 }
@@ -472,7 +498,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_name_ok(const _Tp& __value) {
     return true;
   else if constexpr (same_as<_Tp, chrono::local_info>)
     return true;
-#  endif
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return true;
+#    endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   else
     static_assert(sizeof(_Tp) == 0, "Add the missing type specialization");
 }
@@ -518,7 +549,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __date_ok(const _Tp& __value) {
     return true;
   else if constexpr (same_as<_Tp, chrono::local_info>)
     return true;
-#  endif
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return true;
+#    endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   else
     static_assert(sizeof(_Tp) == 0, "Add the missing type specialization");
 }
@@ -564,7 +600,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __month_name_ok(const _Tp& __value) {
     return true;
   else if constexpr (same_as<_Tp, chrono::local_info>)
     return true;
-#  endif
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+  else if constexpr (__is_specialization_v<_Tp, chrono::zoned_time>)
+    return true;
+#    endif
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
   else
     static_assert(sizeof(_Tp) == 0, "Add the missing type specialization");
 }
@@ -924,7 +965,23 @@ struct formatter<chrono::local_info, _CharT> : public __formatter_chrono<_CharT>
     return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags{});
   }
 };
-#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+// Note due to how libc++'s formatters are implemented there is no need to add
+// the exposition only local-time-format-t abstraction.
+template <class _Duration, class _TimeZonePtr, __fmt_char_type _CharT>
+struct formatter<chrono::zoned_time<_Duration, _TimeZonePtr>, _CharT> : public __formatter_chrono<_CharT> {
+public:
+  using _Base = __formatter_chrono<_CharT>;
+
+  template <class _ParseContext>
+  _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
+    return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__clock);
+  }
+};
+#    endif // !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
+           // !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  endif   // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
 
 #endif // if _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h
index bb0341bc3ec63..e6c43254eea15 100644
--- a/libcxx/include/__chrono/ostream.h
+++ b/libcxx/include/__chrono/ostream.h
@@ -27,6 +27,7 @@
 #include <__chrono/year_month.h>
 #include <__chrono/year_month_day.h>
 #include <__chrono/year_month_weekday.h>
+#include <__chrono/zoned_time.h>
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__format/format_functions.h>
@@ -302,6 +303,14 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const local_info& __info) {
              _LIBCPP_STATICALLY_WIDEN(_CharT, "{}: {{{}, {}}}"), __result(), __info.first, __info.second);
 }
 
+#    if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                          \
+        !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+template <class _CharT, class _Traits, class _Duration, class _TimeZonePtr>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const zoned_time<_Duration, _TimeZonePtr>& __tp) {
+  return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T %Z}"), __tp);
+}
+#    endif
 #  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
 
 } // namespace chrono
diff --git a/libcxx/include/__chrono/weekday.h b/libcxx/include/__chrono/weekday.h
index 5a7dedc6e3a16..86c780cc71825 100644
--- a/libcxx/include/__chrono/weekday.h
+++ b/libcxx/include/__chrono/weekday.h
@@ -79,6 +79,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator==(const weekday& __lhs, con
   return __lhs.c_encoding() == __rhs.c_encoding();
 }
 
+// TODO(LLVM 20): Remove the escape hatch
+#  ifdef _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS
 _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<(const weekday& __lhs, const weekday& __rhs) noexcept {
   return __lhs.c_encoding() < __rhs.c_encoding();
 }
@@ -94,6 +96,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<=(const weekday& __lhs, con
 _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(const weekday& __lhs, const weekday& __rhs) noexcept {
   return !(__lhs < __rhs);
 }
+#  endif // _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS
 
 _LIBCPP_HIDE_FROM_ABI inline constexpr weekday operator+(const weekday& __lhs, const days& __rhs) noexcept {
   auto const __mu = static_cast<long long>(__lhs.c_encoding()) + __rhs.count();
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 108f700823cbf..392053a64a8dc 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -27,7 +27,7 @@
 // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
 // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is
 // defined to XXYYZZ.
-#  define _LIBCPP_VERSION 190000
+#  define _LIBCPP_VERSION 200000
 
 #  define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
 #  define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
@@ -699,14 +699,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_DEPRECATED_(m)
 #  endif
 
-#  if _LIBCPP_STD_VER < 20
-#    define _LIBCPP_DEPRECATED_ATOMIC_SYNC                                                                             \
-      _LIBCPP_DEPRECATED_("The C++20 synchronization library has been deprecated prior to C++20. Please update to "    \
-                          "using -std=c++20 if you need to use these facilities.")
-#  else
-#    define _LIBCPP_DEPRECATED_ATOMIC_SYNC /* nothing */
-#  endif
-
 #  if !defined(_LIBCPP_CXX03_LANG)
 #    define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED
 #  else
diff --git a/libcxx/include/__configuration/abi.h b/libcxx/include/__configuration/abi.h
index cbde7887becf1..8efbb42d1d847 100644
--- a/libcxx/include/__configuration/abi.h
+++ b/libcxx/include/__configuration/abi.h
@@ -137,6 +137,19 @@
 // - `string_view`.
 // #define _LIBCPP_ABI_BOUNDED_ITERATORS
 
+// Changes the iterator type of `basic_string` to a bounded iterator that keeps track of whether it's within the bounds
+// of the original container and asserts it on every dereference and when performing iterator arithmetics.
+//
+// ABI impact: changes the iterator type of `basic_string` and its specializations, such as `string` and `wstring`.
+// #define _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
+
+// Changes the iterator type of `vector` to a bounded iterator that keeps track of whether it's within the bounds of the
+// original container and asserts it on every dereference and when performing iterator arithmetics. Note: this doesn't
+// yet affect `vector<bool>`.
+//
+// ABI impact: changes the iterator type of `vector` (except `vector<bool>`).
+// #define _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
+
 #if defined(_LIBCPP_COMPILER_CLANG_BASED)
 #  if defined(__APPLE__)
 #    if defined(__i386__) || defined(__x86_64__)
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index f618b20603e60..e47ec2f28844b 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -918,9 +918,9 @@ class expected : private __expected_base<_Tp, _Err> {
     requires is_constructible_v<_Err, _Err&>
   _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) & {
     using _Up = remove_cvref_t<invoke_result_t<_Func, _Tp&>>;
-    static_assert(__is_std_expected<_Up>::value, "The result of f(**this) must be a specialization of std::expected");
+    static_assert(__is_std_expected<_Up>::value, "The result of f(value()) must be a specialization of std::expected");
     static_assert(is_same_v<typename _Up::error_type, _Err>,
-                  "The result of f(**this) must have the same error_type as this expected");
+                  "The result of f(value()) must have the same error_type as this expected");
     if (has_value()) {
       return std::invoke(std::forward<_Func>(__f), this->__val());
     }
@@ -931,9 +931,9 @@ class expected : private __expected_base<_Tp, _Err> {
     requires is_constructible_v<_Err, const _Err&>
   _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const& {
     using _Up = remove_cvref_t<invoke_result_t<_Func, const _Tp&>>;
-    static_assert(__is_std_expected<_Up>::value, "The result of f(**this) must be a specialization of std::expected");
+    static_assert(__is_std_expected<_Up>::value, "The result of f(value()) must be a specialization of std::expected");
     static_assert(is_same_v<typename _Up::error_type, _Err>,
-                  "The result of f(**this) must have the same error_type as this expected");
+                  "The result of f(value()) must have the same error_type as this expected");
     if (has_value()) {
       return std::invoke(std::forward<_Func>(__f), this->__val());
     }
@@ -945,9 +945,9 @@ class expected : private __expected_base<_Tp, _Err> {
   _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) && {
     using _Up = remove_cvref_t<invoke_result_t<_Func, _Tp&&>>;
     static_assert(
-        __is_std_expected<_Up>::value, "The result of f(std::move(**this)) must be a specialization of std::expected");
+        __is_std_expected<_Up>::value, "The result of f(std::move(value())) must be a specialization of std::expected");
     static_assert(is_same_v<typename _Up::error_type, _Err>,
-                  "The result of f(std::move(**this)) must have the same error_type as this expected");
+                  "The result of f(std::move(value())) must have the same error_type as this expected");
     if (has_value()) {
       return std::invoke(std::forward<_Func>(__f), std::move(this->__val()));
     }
@@ -959,9 +959,9 @@ class expected : private __expected_base<_Tp, _Err> {
   _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const&& {
     using _Up = remove_cvref_t<invoke_result_t<_Func, const _Tp&&>>;
     static_assert(
-        __is_std_expected<_Up>::value, "The result of f(std::move(**this)) must be a specialization of std::expected");
+        __is_std_expected<_Up>::value, "The result of f(std::move(value())) must be a specialization of std::expected");
     static_assert(is_same_v<typename _Up::error_type, _Err>,
-                  "The result of f(std::move(**this)) must have the same error_type as this expected");
+                  "The result of f(std::move(value())) must have the same error_type as this expected");
     if (has_value()) {
       return std::invoke(std::forward<_Func>(__f), std::move(this->__val()));
     }
diff --git a/libcxx/include/__format/formatter.h b/libcxx/include/__format/formatter.h
index e2f418f936ee1..39c2670dd8431 100644
--- a/libcxx/include/__format/formatter.h
+++ b/libcxx/include/__format/formatter.h
@@ -39,6 +39,9 @@ struct _LIBCPP_TEMPLATE_VIS formatter {
 
 #  if _LIBCPP_STD_VER >= 23
 
+template <class _Tp>
+constexpr bool enable_nonlocking_formatter_optimization = false;
+
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr void __set_debug_format(_Tp& __formatter) {
   if constexpr (requires { __formatter.set_debug_format(); })
diff --git a/libcxx/include/__format/formatter_bool.h b/libcxx/include/__format/formatter_bool.h
index 17dc69541e8fe..63aa815efbe9b 100644
--- a/libcxx/include/__format/formatter_bool.h
+++ b/libcxx/include/__format/formatter_bool.h
@@ -69,7 +69,11 @@ struct _LIBCPP_TEMPLATE_VIS formatter<bool, _CharT> {
   __format_spec::__parser<_CharT> __parser_;
 };
 
-#endif //_LIBCPP_STD_VER >= 20
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<bool> = true;
+#  endif //_LIBCPP_STD_VER >= 23
+#endif   //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__format/formatter_char.h b/libcxx/include/__format/formatter_char.h
index d33e84368a765..abfd65a428298 100644
--- a/libcxx/include/__format/formatter_char.h
+++ b/libcxx/include/__format/formatter_char.h
@@ -83,9 +83,17 @@ struct _LIBCPP_TEMPLATE_VIS formatter<char, wchar_t> : public __formatter_char<w
 
 template <>
 struct _LIBCPP_TEMPLATE_VIS formatter<wchar_t, wchar_t> : public __formatter_char<wchar_t> {};
-
 #  endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<char> = true;
+#    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t> = true;
+#    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+#  endif   //_LIBCPP_STD_VER >= 23
+
 #endif //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index fa42ba203b0b5..334755f4e8143 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -774,6 +774,14 @@ struct _LIBCPP_TEMPLATE_VIS formatter<double, _CharT> : public __formatter_float
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<long double, _CharT> : public __formatter_floating_point<_CharT> {};
 
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<float> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<double> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long double> = true;
+#  endif //_LIBCPP_STD_VER >= 23
 #endif //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h
index 41400f00478eb..2c2e799505367 100644
--- a/libcxx/include/__format/formatter_integer.h
+++ b/libcxx/include/__format/formatter_integer.h
@@ -88,7 +88,38 @@ template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<__uint128_t, _CharT> : public __formatter_integer<_CharT> {};
 #  endif
 
-#endif //_LIBCPP_STD_VER >= 20
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<signed char> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<short> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<int> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long long> = true;
+#    ifndef _LIBCPP_HAS_NO_INT128
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__int128_t> = true;
+#    endif
+
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned char> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned short> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long long> = true;
+#    ifndef _LIBCPP_HAS_NO_INT128
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true;
+#    endif
+#  endif //_LIBCPP_STD_VER >= 23
+#endif   //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h
index 6941343efd91f..e1c062cec6ed2 100644
--- a/libcxx/include/__format/formatter_pointer.h
+++ b/libcxx/include/__format/formatter_pointer.h
@@ -65,6 +65,14 @@ struct _LIBCPP_TEMPLATE_VIS formatter<void*, _CharT> : public __formatter_pointe
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<const void*, _CharT> : public __formatter_pointer<_CharT> {};
 
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<nullptr_t> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<void*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const void*> = true;
+#  endif //_LIBCPP_STD_VER >= 23
 #endif //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h
index 347439fc8dff1..dee2b3ad073a5 100644
--- a/libcxx/include/__format/formatter_string.h
+++ b/libcxx/include/__format/formatter_string.h
@@ -143,7 +143,32 @@ struct _LIBCPP_TEMPLATE_VIS formatter<basic_string_view<_CharT, _Traits>, _CharT
   }
 };
 
-#endif //_LIBCPP_STD_VER >= 20
+#  if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<char*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const char*> = true;
+template <size_t _Size>
+inline constexpr bool enable_nonlocking_formatter_optimization<char[_Size]> = true;
+template <class _Traits, class _Allocator>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string<char, _Traits, _Allocator>> = true;
+template <class _Traits>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string_view<char, _Traits>> = true;
+
+#    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const wchar_t*> = true;
+template <size_t _Size>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t[_Size]> = true;
+template <class _Traits, class _Allocator>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string<wchar_t, _Traits, _Allocator>> = true;
+template <class _Traits>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string_view<wchar_t, _Traits>> = true;
+#    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+#  endif   //_LIBCPP_STD_VER >= 23
+#endif     //_LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__iterator/advance.h b/libcxx/include/__iterator/advance.h
index 296db1aaab652..57b1b845f1afa 100644
--- a/libcxx/include/__iterator/advance.h
+++ b/libcxx/include/__iterator/advance.h
@@ -76,9 +76,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void advance(_InputIter& __i
 // [range.iter.op.advance]
 
 namespace ranges {
-namespace __advance {
-
-struct __fn {
+struct __advance {
 private:
   template <class _Ip>
   _LIBCPP_HIDE_FROM_ABI static constexpr void __advance_forward(_Ip& __i, iter_difference_t<_Ip> __n) {
@@ -189,10 +187,8 @@ struct __fn {
   }
 };
 
-} // namespace __advance
-
 inline namespace __cpo {
-inline constexpr auto advance = __advance::__fn{};
+inline constexpr auto advance = __advance{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index a8f66f4a0126f..5a86bd98e7194 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -11,6 +11,8 @@
 #define _LIBCPP___ITERATOR_BOUNDED_ITER_H
 
 #include <__assert>
+#include <__compare/ordering.h>
+#include <__compare/three_way_comparable.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__memory/pointer_traits.h>
@@ -201,10 +203,13 @@ struct __bounded_iter {
   operator==(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ == __y.__current_;
   }
+
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool
   operator!=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ != __y.__current_;
   }
+
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool
   operator<(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ < __y.__current_;
@@ -222,9 +227,28 @@ struct __bounded_iter {
     return __x.__current_ >= __y.__current_;
   }
 
+#else
+  _LIBCPP_HIDE_FROM_ABI constexpr friend strong_ordering
+  operator<=>(__bounded_iter const& __x, __bounded_iter const& __y) noexcept {
+    if constexpr (three_way_comparable<_Iterator, strong_ordering>) {
+      return __x.__current_ <=> __y.__current_;
+    } else {
+      if (__x.__current_ < __y.__current_)
+        return strong_ordering::less;
+
+      if (__x.__current_ == __y.__current_)
+        return strong_ordering::equal;
+
+      return strong_ordering::greater;
+    }
+  }
+#endif // _LIBCPP_STD_VER >= 20
+
 private:
   template <class>
   friend struct pointer_traits;
+  template <class, class>
+  friend struct __bounded_iter;
   _Iterator __current_;       // current iterator
   _Iterator __begin_, __end_; // valid range represented as [begin, end]
 };
diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h
index afb7b821a99ce..0a4878308d55f 100644
--- a/libcxx/include/__iterator/concepts.h
+++ b/libcxx/include/__iterator/concepts.h
@@ -177,19 +177,19 @@ concept __has_arrow = input_iterator<_Ip> && (is_pointer_v<_Ip> || requires(_Ip
 template <class _Fp, class _It>
 concept indirectly_unary_invocable =
     indirectly_readable<_It> && copy_constructible<_Fp> && invocable<_Fp&, iter_value_t<_It>&> &&
-    invocable<_Fp&, iter_reference_t<_It>> && invocable<_Fp&, iter_common_reference_t<_It>> &&
+    invocable<_Fp&, iter_reference_t<_It>> &&
     common_reference_with< invoke_result_t<_Fp&, iter_value_t<_It>&>, invoke_result_t<_Fp&, iter_reference_t<_It>>>;
 
 template <class _Fp, class _It>
 concept indirectly_regular_unary_invocable =
     indirectly_readable<_It> && copy_constructible<_Fp> && regular_invocable<_Fp&, iter_value_t<_It>&> &&
-    regular_invocable<_Fp&, iter_reference_t<_It>> && regular_invocable<_Fp&, iter_common_reference_t<_It>> &&
+    regular_invocable<_Fp&, iter_reference_t<_It>> &&
     common_reference_with< invoke_result_t<_Fp&, iter_value_t<_It>&>, invoke_result_t<_Fp&, iter_reference_t<_It>>>;
 
 template <class _Fp, class _It>
 concept indirect_unary_predicate =
     indirectly_readable<_It> && copy_constructible<_Fp> && predicate<_Fp&, iter_value_t<_It>&> &&
-    predicate<_Fp&, iter_reference_t<_It>> && predicate<_Fp&, iter_common_reference_t<_It>>;
+    predicate<_Fp&, iter_reference_t<_It>>;
 
 template <class _Fp, class _It1, class _It2>
 concept indirect_binary_predicate =
@@ -197,8 +197,7 @@ concept indirect_binary_predicate =
     predicate<_Fp&, iter_value_t<_It1>&, iter_value_t<_It2>&> &&
     predicate<_Fp&, iter_value_t<_It1>&, iter_reference_t<_It2>> &&
     predicate<_Fp&, iter_reference_t<_It1>, iter_value_t<_It2>&> &&
-    predicate<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>> &&
-    predicate<_Fp&, iter_common_reference_t<_It1>, iter_common_reference_t<_It2>>;
+    predicate<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>>;
 
 template <class _Fp, class _It1, class _It2 = _It1>
 concept indirect_equivalence_relation =
@@ -206,8 +205,7 @@ concept indirect_equivalence_relation =
     equivalence_relation<_Fp&, iter_value_t<_It1>&, iter_value_t<_It2>&> &&
     equivalence_relation<_Fp&, iter_value_t<_It1>&, iter_reference_t<_It2>> &&
     equivalence_relation<_Fp&, iter_reference_t<_It1>, iter_value_t<_It2>&> &&
-    equivalence_relation<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>> &&
-    equivalence_relation<_Fp&, iter_common_reference_t<_It1>, iter_common_reference_t<_It2>>;
+    equivalence_relation<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>>;
 
 template <class _Fp, class _It1, class _It2 = _It1>
 concept indirect_strict_weak_order =
@@ -215,8 +213,7 @@ concept indirect_strict_weak_order =
     strict_weak_order<_Fp&, iter_value_t<_It1>&, iter_value_t<_It2>&> &&
     strict_weak_order<_Fp&, iter_value_t<_It1>&, iter_reference_t<_It2>> &&
     strict_weak_order<_Fp&, iter_reference_t<_It1>, iter_value_t<_It2>&> &&
-    strict_weak_order<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>> &&
-    strict_weak_order<_Fp&, iter_common_reference_t<_It1>, iter_common_reference_t<_It2>>;
+    strict_weak_order<_Fp&, iter_reference_t<_It1>, iter_reference_t<_It2>>;
 
 template <class _Fp, class... _Its>
   requires(indirectly_readable<_Its> && ...) && invocable<_Fp, iter_reference_t<_Its>...>
diff --git a/libcxx/include/__iterator/distance.h b/libcxx/include/__iterator/distance.h
index 75bd49c9ae732..1732aa527f64a 100644
--- a/libcxx/include/__iterator/distance.h
+++ b/libcxx/include/__iterator/distance.h
@@ -52,9 +52,7 @@ distance(_InputIter __first, _InputIter __last) {
 // [range.iter.op.distance]
 
 namespace ranges {
-namespace __distance {
-
-struct __fn {
+struct __distance {
   template <class _Ip, sentinel_for<_Ip> _Sp>
     requires(!sized_sentinel_for<_Sp, _Ip>)
   _LIBCPP_HIDE_FROM_ABI constexpr iter_difference_t<_Ip> operator()(_Ip __first, _Sp __last) const {
@@ -85,10 +83,8 @@ struct __fn {
   }
 };
 
-} // namespace __distance
-
 inline namespace __cpo {
-inline constexpr auto distance = __distance::__fn{};
+inline constexpr auto distance = __distance{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__iterator/next.h b/libcxx/include/__iterator/next.h
index 21d3688ad9eb6..fb6c8ea6d7550 100644
--- a/libcxx/include/__iterator/next.h
+++ b/libcxx/include/__iterator/next.h
@@ -41,9 +41,7 @@ next(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n =
 // [range.iter.op.next]
 
 namespace ranges {
-namespace __next {
-
-struct __fn {
+struct __next {
   template <input_or_output_iterator _Ip>
   _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const {
     ++__x;
@@ -69,10 +67,8 @@ struct __fn {
   }
 };
 
-} // namespace __next
-
 inline namespace __cpo {
-inline constexpr auto next = __next::__fn{};
+inline constexpr auto next = __next{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__iterator/prev.h b/libcxx/include/__iterator/prev.h
index 2f0e6a088edb3..e950d8dc41471 100644
--- a/libcxx/include/__iterator/prev.h
+++ b/libcxx/include/__iterator/prev.h
@@ -40,9 +40,7 @@ prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n =
 // [range.iter.op.prev]
 
 namespace ranges {
-namespace __prev {
-
-struct __fn {
+struct __prev {
   template <bidirectional_iterator _Ip>
   _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const {
     --__x;
@@ -62,10 +60,8 @@ struct __fn {
   }
 };
 
-} // namespace __prev
-
 inline namespace __cpo {
-inline constexpr auto prev = __prev::__fn{};
+inline constexpr auto prev = __prev{};
 } // namespace __cpo
 } // namespace ranges
 
diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h
index 252d13b26c9e2..34f8d5f1663b2 100644
--- a/libcxx/include/__iterator/wrap_iter.h
+++ b/libcxx/include/__iterator/wrap_iter.h
@@ -10,6 +10,8 @@
 #ifndef _LIBCPP___ITERATOR_WRAP_ITER_H
 #define _LIBCPP___ITERATOR_WRAP_ITER_H
 
+#include <__compare/ordering.h>
+#include <__compare/three_way_comparable.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__memory/addressof.h>
@@ -131,6 +133,7 @@ operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXC
   return __x.base() < __y.base();
 }
 
+#if _LIBCPP_STD_VER <= 17
 template <class _Iter1>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT {
@@ -142,7 +145,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT {
   return !(__x == __y);
 }
-
 template <class _Iter1>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT {
@@ -179,6 +181,24 @@ operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEX
   return !(__y < __x);
 }
 
+#else
+template <class _Iter1, class _Iter2>
+_LIBCPP_HIDE_FROM_ABI constexpr strong_ordering
+operator<=>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) noexcept {
+  if constexpr (three_way_comparable_with<_Iter1, _Iter2, strong_ordering>) {
+    return __x.base() <=> __y.base();
+  } else {
+    if (__x.base() < __y.base())
+      return strong_ordering::less;
+
+    if (__x.base() == __y.base())
+      return strong_ordering::equal;
+
+    return strong_ordering::greater;
+  }
+}
+#endif // _LIBCPP_STD_VER >= 20
+
 template <class _Iter1, class _Iter2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
 #ifndef _LIBCPP_CXX03_LANG
diff --git a/libcxx/include/__math/special_functions.h b/libcxx/include/__math/special_functions.h
new file mode 100644
index 0000000000000..0b1c753a659ad
--- /dev/null
+++ b/libcxx/include/__math/special_functions.h
@@ -0,0 +1,84 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___MATH_SPECIAL_FUNCTIONS_H
+#define _LIBCPP___MATH_SPECIAL_FUNCTIONS_H
+
+#include <__config>
+#include <__math/copysign.h>
+#include <__math/traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_integral.h>
+#include <limits>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 17
+
+template <class _Real>
+_LIBCPP_HIDE_FROM_ABI _Real __hermite(unsigned __n, _Real __x) {
+  // The Hermite polynomial H_n(x).
+  // The implementation is based on the recurrence formula: H_{n+1}(x) = 2x H_n(x) - 2n H_{n-1}.
+  // Press, William H., et al. Numerical recipes 3rd edition: The art of scientific computing.
+  // Cambridge university press, 2007, p. 183.
+
+  // NOLINTBEGIN(readability-identifier-naming)
+  if (__math::isnan(__x))
+    return __x;
+
+  _Real __H_0{1};
+  if (__n == 0)
+    return __H_0;
+
+  _Real __H_n_prev = __H_0;
+  _Real __H_n      = 2 * __x;
+  for (unsigned __i = 1; __i < __n; ++__i) {
+    _Real __H_n_next = 2 * (__x * __H_n - __i * __H_n_prev);
+    __H_n_prev       = __H_n;
+    __H_n            = __H_n_next;
+  }
+
+  if (!__math::isfinite(__H_n)) {
+    // Overflow occured. Two possible cases:
+    //    n is odd:  return infinity of the same sign as x.
+    //    n is even: return +Inf
+    _Real __inf = std::numeric_limits<_Real>::infinity();
+    return (__n & 1) ? __math::copysign(__inf, __x) : __inf;
+  }
+  return __H_n;
+  // NOLINTEND(readability-identifier-naming)
+}
+
+inline _LIBCPP_HIDE_FROM_ABI double hermite(unsigned __n, double __x) { return std::__hermite(__n, __x); }
+
+inline _LIBCPP_HIDE_FROM_ABI float hermite(unsigned __n, float __x) {
+  // use double internally -- float is too prone to overflow!
+  return static_cast<float>(std::hermite(__n, static_cast<double>(__x)));
+}
+
+inline _LIBCPP_HIDE_FROM_ABI long double hermite(unsigned __n, long double __x) { return std::__hermite(__n, __x); }
+
+inline _LIBCPP_HIDE_FROM_ABI float hermitef(unsigned __n, float __x) { return std::hermite(__n, __x); }
+
+inline _LIBCPP_HIDE_FROM_ABI long double hermitel(unsigned __n, long double __x) { return std::hermite(__n, __x); }
+
+template <class _Integer, std::enable_if_t<std::is_integral_v<_Integer>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI double hermite(unsigned __n, _Integer __x) {
+  return std::hermite(__n, static_cast<double>(__x));
+}
+
+#endif // _LIBCPP_STD_VER >= 17
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___MATH_SPECIAL_FUNCTIONS_H
diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h
index a448266797557..27ec52ecef022 100644
--- a/libcxx/include/__math/traits.h
+++ b/libcxx/include/__math/traits.h
@@ -55,6 +55,18 @@ _LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfin
   return true;
 }
 
+_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(float __x) _NOEXCEPT {
+  return __builtin_isfinite(__x);
+}
+
+_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(double __x) _NOEXCEPT {
+  return __builtin_isfinite(__x);
+}
+
+_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(long double __x) _NOEXCEPT {
+  return __builtin_isfinite(__x);
+}
+
 // isinf
 
 template <class _A1, __enable_if_t<is_arithmetic<_A1>::value && numeric_limits<_A1>::has_infinity, int> = 0>
diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h
index 2d8624e771bce..ae1f549626ee4 100644
--- a/libcxx/include/__memory/allocator.h
+++ b/libcxx/include/__memory/allocator.h
@@ -110,6 +110,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if<!is_void<_Tp>::v
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator(const allocator<_Up>&) _NOEXCEPT {}
 
   _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* allocate(size_t __n) {
+    static_assert(sizeof(_Tp) >= 0, "cannot allocate memory for an incomplete type");
     if (__n > allocator_traits<allocator>::max_size(*this))
       __throw_bad_array_new_length();
     if (__libcpp_is_constant_evaluated()) {
@@ -121,6 +122,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if<!is_void<_Tp>::v
 
 #if _LIBCPP_STD_VER >= 23
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr allocation_result<_Tp*> allocate_at_least(size_t __n) {
+    static_assert(sizeof(_Tp) >= 0, "cannot allocate memory for an incomplete type");
     return {allocate(__n), __n};
   }
 #endif
diff --git a/libcxx/include/__memory/allocator_traits.h b/libcxx/include/__memory/allocator_traits.h
index ac564f0e6fa0c..c5fcc89327b8f 100644
--- a/libcxx/include/__memory/allocator_traits.h
+++ b/libcxx/include/__memory/allocator_traits.h
@@ -41,7 +41,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
   struct NAME<_Tp, __void_t<typename _Tp::PROPERTY > > : true_type {}
 
 // __pointer
-_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_pointer, pointer);
 template <class _Tp,
           class _Alloc,
           class _RawAlloc = __libcpp_remove_reference_t<_Alloc>,
diff --git a/libcxx/include/__memory/inout_ptr.h b/libcxx/include/__memory/inout_ptr.h
new file mode 100644
index 0000000000000..72e1a21ad6867
--- /dev/null
+++ b/libcxx/include/__memory/inout_ptr.h
@@ -0,0 +1,109 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___INOUT_PTR_H
+#define _LIBCPP___INOUT_PTR_H
+
+#include <__config>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__memory/shared_ptr.h>
+#include <__memory/unique_ptr.h>
+#include <__type_traits/is_same.h>
+#include <__type_traits/is_specialization.h>
+#include <__type_traits/is_void.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <tuple>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+template <class _Smart, class _Pointer, class... _Args>
+class _LIBCPP_TEMPLATE_VIS inout_ptr_t {
+  static_assert(!__is_specialization_v<_Smart, shared_ptr>, "std::shared_ptr<> is not supported with std::inout_ptr.");
+
+public:
+  _LIBCPP_HIDE_FROM_ABI explicit inout_ptr_t(_Smart& __smart, _Args... __args)
+      : __s_(__smart), __a_(std::forward<_Args>(__args)...), __p_([&__smart] {
+          if constexpr (is_pointer_v<_Smart>) {
+            return __smart;
+          } else {
+            return __smart.get();
+          }
+        }()) {
+    if constexpr (requires { __s_.release(); }) {
+      __s_.release();
+    } else {
+      __s_ = _Smart();
+    }
+  }
+
+  _LIBCPP_HIDE_FROM_ABI inout_ptr_t(const inout_ptr_t&) = delete;
+
+  _LIBCPP_HIDE_FROM_ABI ~inout_ptr_t() {
+    // LWG-3897 inout_ptr will not update raw pointer to null
+    if constexpr (!is_pointer_v<_Smart>) {
+      if (!__p_) {
+        return;
+      }
+    }
+
+    using _SP = __pointer_of_or_t<_Smart, _Pointer>;
+    if constexpr (is_pointer_v<_Smart>) {
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+                 std::move(__a_));
+    } else if constexpr (__resettable_smart_pointer_with_args<_Smart, _Pointer, _Args...>) {
+      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+                 std::move(__a_));
+    } else {
+      static_assert(is_constructible_v<_Smart, _SP, _Args...>,
+                    "The smart pointer must be constructible from arguments of types _Smart, _Pointer, _Args...");
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+                 std::move(__a_));
+    }
+  }
+
+  _LIBCPP_HIDE_FROM_ABI operator _Pointer*() const noexcept { return std::addressof(const_cast<_Pointer&>(__p_)); }
+
+  _LIBCPP_HIDE_FROM_ABI operator void**() const noexcept
+    requires(!is_same_v<_Pointer, void*>)
+  {
+    static_assert(is_pointer_v<_Pointer>, "The conversion to void** requires _Pointer to be a raw pointer.");
+
+    return reinterpret_cast<void**>(static_cast<_Pointer*>(*this));
+  }
+
+private:
+  _Smart& __s_;
+  tuple<_Args...> __a_;
+  _Pointer __p_;
+};
+
+template <class _Pointer = void, class _Smart, class... _Args>
+_LIBCPP_HIDE_FROM_ABI auto inout_ptr(_Smart& __s, _Args&&... __args) {
+  using _Ptr = conditional_t<is_void_v<_Pointer>, __pointer_of_t<_Smart>, _Pointer>;
+  return std::inout_ptr_t<_Smart, _Ptr, _Args&&...>(__s, std::forward<_Args>(__args)...);
+}
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___INOUT_PTR_H
diff --git a/libcxx/include/__memory/out_ptr.h b/libcxx/include/__memory/out_ptr.h
new file mode 100644
index 0000000000000..95aa2029c9231
--- /dev/null
+++ b/libcxx/include/__memory/out_ptr.h
@@ -0,0 +1,101 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___OUT_PTR_H
+#define _LIBCPP___OUT_PTR_H
+
+#include <__config>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__memory/shared_ptr.h>
+#include <__memory/unique_ptr.h>
+#include <__type_traits/is_specialization.h>
+#include <__type_traits/is_void.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <tuple>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+template <class _Smart, class _Pointer, class... _Args>
+class _LIBCPP_TEMPLATE_VIS out_ptr_t {
+  static_assert(!__is_specialization_v<_Smart, shared_ptr> || sizeof...(_Args) > 0,
+                "Using std::shared_ptr<> without a deleter in std::out_ptr is not supported.");
+
+public:
+  _LIBCPP_HIDE_FROM_ABI explicit out_ptr_t(_Smart& __smart, _Args... __args)
+      : __s_(__smart), __a_(std::forward<_Args>(__args)...), __p_() {
+    using _Ptr = decltype(__smart);
+    if constexpr (__resettable_smart_pointer<_Ptr>) {
+      __s_.reset();
+    } else if constexpr (is_constructible_v<_Smart>) {
+      __s_ = _Smart();
+    } else {
+      static_assert(__resettable_smart_pointer<_Ptr> || is_constructible_v<_Smart>,
+                    "The adapted pointer type must have a reset() member function or be default constructible.");
+    }
+  }
+
+  _LIBCPP_HIDE_FROM_ABI out_ptr_t(const out_ptr_t&) = delete;
+
+  _LIBCPP_HIDE_FROM_ABI ~out_ptr_t() {
+    if (!__p_) {
+      return;
+    }
+
+    using _SP = __pointer_of_or_t<_Smart, _Pointer>;
+    if constexpr (__resettable_smart_pointer_with_args<_Smart, _Pointer, _Args...>) {
+      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+                 std::move(__a_));
+    } else {
+      static_assert(is_constructible_v<_Smart, _SP, _Args...>,
+                    "The smart pointer must be constructible from arguments of types _Smart, _Pointer, _Args...");
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+                 std::move(__a_));
+    }
+  }
+
+  _LIBCPP_HIDE_FROM_ABI operator _Pointer*() const noexcept { return std::addressof(const_cast<_Pointer&>(__p_)); }
+
+  _LIBCPP_HIDE_FROM_ABI operator void**() const noexcept
+    requires(!is_same_v<_Pointer, void*>)
+  {
+    static_assert(is_pointer_v<_Pointer>, "The conversion to void** requires _Pointer to be a raw pointer.");
+
+    return reinterpret_cast<void**>(static_cast<_Pointer*>(*this));
+  }
+
+private:
+  _Smart& __s_;
+  tuple<_Args...> __a_;
+  _Pointer __p_ = _Pointer();
+};
+
+template <class _Pointer = void, class _Smart, class... _Args>
+_LIBCPP_HIDE_FROM_ABI auto out_ptr(_Smart& __s, _Args&&... __args) {
+  using _Ptr = conditional_t<is_void_v<_Pointer>, __pointer_of_t<_Smart>, _Pointer>;
+  return std::out_ptr_t<_Smart, _Ptr, _Args&&...>(__s, std::forward<_Args>(__args)...);
+}
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___OUT_PTR_H
diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index fcd2c3edd9f6a..0914aceb318b7 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -20,19 +20,28 @@
 #include <__type_traits/is_void.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
+#include <__utility/forward.h>
 #include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Tp, class = void>
-struct __has_element_type : false_type {};
+// clang-format off
+#define _LIBCPP_CLASS_TRAITS_HAS_XXX(NAME, PROPERTY)                                                                   \
+  template <class _Tp, class = void>                                                                                   \
+  struct NAME : false_type {};                                                                                         \
+  template <class _Tp>                                                                                                 \
+  struct NAME<_Tp, __void_t<typename _Tp::PROPERTY> > : true_type {}
+// clang-format on
 
-template <class _Tp>
-struct __has_element_type<_Tp, __void_t<typename _Tp::element_type> > : true_type {};
+_LIBCPP_CLASS_TRAITS_HAS_XXX(__has_pointer, pointer);
+_LIBCPP_CLASS_TRAITS_HAS_XXX(__has_element_type, element_type);
 
 template <class _Ptr, bool = __has_element_type<_Ptr>::value>
 struct __pointer_traits_element_type {};
@@ -240,6 +249,59 @@ to_address(const _Pointer& __p) noexcept -> decltype(std::__to_address(__p)) {
 }
 #endif
 
+#if _LIBCPP_STD_VER >= 23
+
+template <class _Tp>
+struct __pointer_of {};
+
+template <class _Tp>
+  requires(__has_pointer<_Tp>::value)
+struct __pointer_of<_Tp> {
+  using type = typename _Tp::pointer;
+};
+
+template <class _Tp>
+  requires(!__has_pointer<_Tp>::value && __has_element_type<_Tp>::value)
+struct __pointer_of<_Tp> {
+  using type = typename _Tp::element_type*;
+};
+
+template <class _Tp>
+  requires(!__has_pointer<_Tp>::value && !__has_element_type<_Tp>::value &&
+           __has_element_type<pointer_traits<_Tp>>::value)
+struct __pointer_of<_Tp> {
+  using type = typename pointer_traits<_Tp>::element_type*;
+};
+
+template <typename _Tp>
+using __pointer_of_t = typename __pointer_of<_Tp>::type;
+
+template <class _Tp, class _Up>
+struct __pointer_of_or {
+  using type _LIBCPP_NODEBUG = _Up;
+};
+
+template <class _Tp, class _Up>
+  requires requires { typename __pointer_of_t<_Tp>; }
+struct __pointer_of_or<_Tp, _Up> {
+  using type _LIBCPP_NODEBUG = __pointer_of_t<_Tp>;
+};
+
+template <typename _Tp, typename _Up>
+using __pointer_of_or_t = typename __pointer_of_or<_Tp, _Up>::type;
+
+template <class _Smart>
+concept __resettable_smart_pointer = requires(_Smart __s) { __s.reset(); };
+
+template <class _Smart, class _Pointer, class... _Args>
+concept __resettable_smart_pointer_with_args = requires(_Smart __s, _Pointer __p, _Args... __args) {
+  __s.reset(static_cast<__pointer_of_or_t<_Smart, _Pointer>>(__p), std::forward<_Args>(__args)...);
+};
+
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___MEMORY_POINTER_TRAITS_H
diff --git a/libcxx/include/__memory/ranges_construct_at.h b/libcxx/include/__memory/ranges_construct_at.h
index f731e75e7bdc0..b7523d4ba4b81 100644
--- a/libcxx/include/__memory/ranges_construct_at.h
+++ b/libcxx/include/__memory/ranges_construct_at.h
@@ -38,43 +38,33 @@ namespace ranges {
 
 // construct_at
 
-namespace __construct_at {
-
-struct __fn {
+struct __construct_at {
   template <class _Tp, class... _Args, class = decltype(::new(std::declval<void*>()) _Tp(std::declval<_Args>()...))>
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator()(_Tp* __location, _Args&&... __args) const {
     return std::construct_at(__location, std::forward<_Args>(__args)...);
   }
 };
 
-} // namespace __construct_at
-
 inline namespace __cpo {
-inline constexpr auto construct_at = __construct_at::__fn{};
+inline constexpr auto construct_at = __construct_at{};
 } // namespace __cpo
 
 // destroy_at
 
-namespace __destroy_at {
-
-struct __fn {
+struct __destroy_at {
   template <destructible _Tp>
   _LIBCPP_HIDE_FROM_ABI constexpr void operator()(_Tp* __location) const noexcept {
     std::destroy_at(__location);
   }
 };
 
-} // namespace __destroy_at
-
 inline namespace __cpo {
-inline constexpr auto destroy_at = __destroy_at::__fn{};
+inline constexpr auto destroy_at = __destroy_at{};
 } // namespace __cpo
 
 // destroy
 
-namespace __destroy {
-
-struct __fn {
+struct __destroy {
   template <__nothrow_input_iterator _InputIterator, __nothrow_sentinel_for<_InputIterator> _Sentinel>
     requires destructible<iter_value_t<_InputIterator>>
   _LIBCPP_HIDE_FROM_ABI constexpr _InputIterator operator()(_InputIterator __first, _Sentinel __last) const noexcept {
@@ -88,17 +78,13 @@ struct __fn {
   }
 };
 
-} // namespace __destroy
-
 inline namespace __cpo {
-inline constexpr auto destroy = __destroy::__fn{};
+inline constexpr auto destroy = __destroy{};
 } // namespace __cpo
 
 // destroy_n
 
-namespace __destroy_n {
-
-struct __fn {
+struct __destroy_n {
   template <__nothrow_input_iterator _InputIterator>
     requires destructible<iter_value_t<_InputIterator>>
   _LIBCPP_HIDE_FROM_ABI constexpr _InputIterator
@@ -107,10 +93,8 @@ struct __fn {
   }
 };
 
-} // namespace __destroy_n
-
 inline namespace __cpo {
-inline constexpr auto destroy_n = __destroy_n::__fn{};
+inline constexpr auto destroy_n = __destroy_n{};
 } // namespace __cpo
 
 } // namespace ranges
diff --git a/libcxx/include/__memory/ranges_uninitialized_algorithms.h b/libcxx/include/__memory/ranges_uninitialized_algorithms.h
index 90090055bbbbf..4815ac6e5de99 100644
--- a/libcxx/include/__memory/ranges_uninitialized_algorithms.h
+++ b/libcxx/include/__memory/ranges_uninitialized_algorithms.h
@@ -42,9 +42,7 @@ namespace ranges {
 
 // uninitialized_default_construct
 
-namespace __uninitialized_default_construct {
-
-struct __fn {
+struct __uninitialized_default_construct {
   template <__nothrow_forward_iterator _ForwardIterator, __nothrow_sentinel_for<_ForwardIterator> _Sentinel>
     requires default_initializable<iter_value_t<_ForwardIterator>>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator operator()(_ForwardIterator __first, _Sentinel __last) const {
@@ -59,17 +57,13 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_default_construct
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_default_construct = __uninitialized_default_construct::__fn{};
+inline constexpr auto uninitialized_default_construct = __uninitialized_default_construct{};
 } // namespace __cpo
 
 // uninitialized_default_construct_n
 
-namespace __uninitialized_default_construct_n {
-
-struct __fn {
+struct __uninitialized_default_construct_n {
   template <__nothrow_forward_iterator _ForwardIterator>
     requires default_initializable<iter_value_t<_ForwardIterator>>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator
@@ -79,17 +73,13 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_default_construct_n
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_default_construct_n = __uninitialized_default_construct_n::__fn{};
+inline constexpr auto uninitialized_default_construct_n = __uninitialized_default_construct_n{};
 } // namespace __cpo
 
 // uninitialized_value_construct
 
-namespace __uninitialized_value_construct {
-
-struct __fn {
+struct __uninitialized_value_construct {
   template <__nothrow_forward_iterator _ForwardIterator, __nothrow_sentinel_for<_ForwardIterator> _Sentinel>
     requires default_initializable<iter_value_t<_ForwardIterator>>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator operator()(_ForwardIterator __first, _Sentinel __last) const {
@@ -104,17 +94,13 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_value_construct
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_value_construct = __uninitialized_value_construct::__fn{};
+inline constexpr auto uninitialized_value_construct = __uninitialized_value_construct{};
 } // namespace __cpo
 
 // uninitialized_value_construct_n
 
-namespace __uninitialized_value_construct_n {
-
-struct __fn {
+struct __uninitialized_value_construct_n {
   template <__nothrow_forward_iterator _ForwardIterator>
     requires default_initializable<iter_value_t<_ForwardIterator>>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator
@@ -124,17 +110,13 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_value_construct_n
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_value_construct_n = __uninitialized_value_construct_n::__fn{};
+inline constexpr auto uninitialized_value_construct_n = __uninitialized_value_construct_n{};
 } // namespace __cpo
 
 // uninitialized_fill
 
-namespace __uninitialized_fill {
-
-struct __fn {
+struct __uninitialized_fill {
   template <__nothrow_forward_iterator _ForwardIterator, __nothrow_sentinel_for<_ForwardIterator> _Sentinel, class _Tp>
     requires constructible_from<iter_value_t<_ForwardIterator>, const _Tp&>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator operator()(_ForwardIterator __first, _Sentinel __last, const _Tp& __x) const {
@@ -149,17 +131,13 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_fill
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_fill = __uninitialized_fill::__fn{};
+inline constexpr auto uninitialized_fill = __uninitialized_fill{};
 } // namespace __cpo
 
 // uninitialized_fill_n
 
-namespace __uninitialized_fill_n {
-
-struct __fn {
+struct __uninitialized_fill_n {
   template <__nothrow_forward_iterator _ForwardIterator, class _Tp>
     requires constructible_from<iter_value_t<_ForwardIterator>, const _Tp&>
   _LIBCPP_HIDE_FROM_ABI _ForwardIterator
@@ -169,10 +147,8 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_fill_n
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_fill_n = __uninitialized_fill_n::__fn{};
+inline constexpr auto uninitialized_fill_n = __uninitialized_fill_n{};
 } // namespace __cpo
 
 // uninitialized_copy
@@ -180,9 +156,7 @@ inline constexpr auto uninitialized_fill_n = __uninitialized_fill_n::__fn{};
 template <class _InputIterator, class _OutputIterator>
 using uninitialized_copy_result = in_out_result<_InputIterator, _OutputIterator>;
 
-namespace __uninitialized_copy {
-
-struct __fn {
+struct __uninitialized_copy {
   template <input_iterator _InputIterator,
             sentinel_for<_InputIterator> _Sentinel1,
             __nothrow_forward_iterator _OutputIterator,
@@ -207,10 +181,8 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_copy
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_copy = __uninitialized_copy::__fn{};
+inline constexpr auto uninitialized_copy = __uninitialized_copy{};
 } // namespace __cpo
 
 // uninitialized_copy_n
@@ -218,9 +190,7 @@ inline constexpr auto uninitialized_copy = __uninitialized_copy::__fn{};
 template <class _InputIterator, class _OutputIterator>
 using uninitialized_copy_n_result = in_out_result<_InputIterator, _OutputIterator>;
 
-namespace __uninitialized_copy_n {
-
-struct __fn {
+struct __uninitialized_copy_n {
   template <input_iterator _InputIterator,
             __nothrow_forward_iterator _OutputIterator,
             __nothrow_sentinel_for<_OutputIterator> _Sentinel>
@@ -238,10 +208,8 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_copy_n
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_copy_n = __uninitialized_copy_n::__fn{};
+inline constexpr auto uninitialized_copy_n = __uninitialized_copy_n{};
 } // namespace __cpo
 
 // uninitialized_move
@@ -249,9 +217,7 @@ inline constexpr auto uninitialized_copy_n = __uninitialized_copy_n::__fn{};
 template <class _InputIterator, class _OutputIterator>
 using uninitialized_move_result = in_out_result<_InputIterator, _OutputIterator>;
 
-namespace __uninitialized_move {
-
-struct __fn {
+struct __uninitialized_move {
   template <input_iterator _InputIterator,
             sentinel_for<_InputIterator> _Sentinel1,
             __nothrow_forward_iterator _OutputIterator,
@@ -276,10 +242,8 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_move
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_move = __uninitialized_move::__fn{};
+inline constexpr auto uninitialized_move = __uninitialized_move{};
 } // namespace __cpo
 
 // uninitialized_move_n
@@ -287,9 +251,7 @@ inline constexpr auto uninitialized_move = __uninitialized_move::__fn{};
 template <class _InputIterator, class _OutputIterator>
 using uninitialized_move_n_result = in_out_result<_InputIterator, _OutputIterator>;
 
-namespace __uninitialized_move_n {
-
-struct __fn {
+struct __uninitialized_move_n {
   template <input_iterator _InputIterator,
             __nothrow_forward_iterator _OutputIterator,
             __nothrow_sentinel_for<_OutputIterator> _Sentinel>
@@ -308,10 +270,8 @@ struct __fn {
   }
 };
 
-} // namespace __uninitialized_move_n
-
 inline namespace __cpo {
-inline constexpr auto uninitialized_move_n = __uninitialized_move_n::__fn{};
+inline constexpr auto uninitialized_move_n = __uninitialized_move_n{};
 } // namespace __cpo
 
 } // namespace ranges
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index 9519e4283868b..f75259473efb1 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -36,7 +36,9 @@
 #include <__type_traits/is_trivially_relocatable.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/remove_extent.h>
+#include <__type_traits/remove_pointer.h>
 #include <__type_traits/type_identity.h>
+#include <__utility/declval.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
 #include <cstddef>
@@ -50,6 +52,17 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#ifndef _LIBCPP_CXX03_LANG
+
+template <class _Ptr>
+struct __is_noexcept_deref_or_void {
+  static constexpr bool value = noexcept(*std::declval<_Ptr>());
+};
+
+template <>
+struct __is_noexcept_deref_or_void<void*> : true_type {};
+#endif
+
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS default_delete {
   static_assert(!is_function<_Tp>::value, "default_delete cannot be instantiated for function types");
@@ -252,7 +265,8 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr {
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __add_lvalue_reference_t<_Tp> operator*() const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __add_lvalue_reference_t<_Tp> operator*() const
+      _NOEXCEPT_(__is_noexcept_deref_or_void<pointer>::value) {
     return *__ptr_.first();
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer operator->() const _NOEXCEPT { return __ptr_.first(); }
diff --git a/libcxx/include/__memory/uses_allocator_construction.h b/libcxx/include/__memory/uses_allocator_construction.h
index 5e5819d4c281e..955879ffc5845 100644
--- a/libcxx/include/__memory/uses_allocator_construction.h
+++ b/libcxx/include/__memory/uses_allocator_construction.h
@@ -40,104 +40,8 @@ inline constexpr bool __is_std_pair<pair<_Type1, _Type2>> = true;
 template <class _Tp>
 inline constexpr bool __is_cv_std_pair = __is_std_pair<remove_cv_t<_Tp>>;
 
-template <class _Type, class _Alloc, class... _Args, __enable_if_t<!__is_cv_std_pair<_Type>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, _Args&&... __args) noexcept {
-  if constexpr (!uses_allocator_v<remove_cv_t<_Type>, _Alloc> && is_constructible_v<_Type, _Args...>) {
-    return std::forward_as_tuple(std::forward<_Args>(__args)...);
-  } else if constexpr (uses_allocator_v<remove_cv_t<_Type>, _Alloc> &&
-                       is_constructible_v<_Type, allocator_arg_t, const _Alloc&, _Args...>) {
-    return tuple<allocator_arg_t, const _Alloc&, _Args&&...>(allocator_arg, __alloc, std::forward<_Args>(__args)...);
-  } else if constexpr (uses_allocator_v<remove_cv_t<_Type>, _Alloc> &&
-                       is_constructible_v<_Type, _Args..., const _Alloc&>) {
-    return std::forward_as_tuple(std::forward<_Args>(__args)..., __alloc);
-  } else {
-    static_assert(
-        sizeof(_Type) + 1 == 0, "If uses_allocator_v<Type> is true, the type has to be allocator-constructible");
-  }
-}
-
-template <class _Pair, class _Alloc, class _Tuple1, class _Tuple2, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto __uses_allocator_construction_args(
-    const _Alloc& __alloc, piecewise_construct_t, _Tuple1&& __x, _Tuple2&& __y) noexcept {
-  return std::make_tuple(
-      piecewise_construct,
-      std::apply(
-          [&__alloc](auto&&... __args1) {
-            return std::__uses_allocator_construction_args<typename _Pair::first_type>(
-                __alloc, std::forward<decltype(__args1)>(__args1)...);
-          },
-          std::forward<_Tuple1>(__x)),
-      std::apply(
-          [&__alloc](auto&&... __args2) {
-            return std::__uses_allocator_construction_args<typename _Pair::second_type>(
-                __alloc, std::forward<decltype(__args2)>(__args2)...);
-          },
-          std::forward<_Tuple2>(__y)));
-}
-
-template <class _Pair, class _Alloc, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto __uses_allocator_construction_args(const _Alloc& __alloc) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(__alloc, piecewise_construct, tuple<>{}, tuple<>{});
-}
-
-template <class _Pair, class _Alloc, class _Up, class _Vp, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, _Up&& __u, _Vp&& __v) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc,
-      piecewise_construct,
-      std::forward_as_tuple(std::forward<_Up>(__u)),
-      std::forward_as_tuple(std::forward<_Vp>(__v)));
-}
-
-#  if _LIBCPP_STD_VER >= 23
-template <class _Pair, class _Alloc, class _Up, class _Vp, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, pair<_Up, _Vp>& __pair) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc, piecewise_construct, std::forward_as_tuple(__pair.first), std::forward_as_tuple(__pair.second));
-}
-#  endif
-
-template <class _Pair, class _Alloc, class _Up, class _Vp, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, const pair<_Up, _Vp>& __pair) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc, piecewise_construct, std::forward_as_tuple(__pair.first), std::forward_as_tuple(__pair.second));
-}
-
-template <class _Pair, class _Alloc, class _Up, class _Vp, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, pair<_Up, _Vp>&& __pair) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc,
-      piecewise_construct,
-      std::forward_as_tuple(std::get<0>(std::move(__pair))),
-      std::forward_as_tuple(std::get<1>(std::move(__pair))));
-}
-
-#  if _LIBCPP_STD_VER >= 23
-template <class _Pair, class _Alloc, class _Up, class _Vp, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, const pair<_Up, _Vp>&& __pair) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc,
-      piecewise_construct,
-      std::forward_as_tuple(std::get<0>(std::move(__pair))),
-      std::forward_as_tuple(std::get<1>(std::move(__pair))));
-}
-
-template <class _Pair, class _Alloc, __pair_like_no_subrange _PairLike, __enable_if_t<__is_cv_std_pair<_Pair>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, _PairLike&& __p) noexcept {
-  return std::__uses_allocator_construction_args<_Pair>(
-      __alloc,
-      piecewise_construct,
-      std::forward_as_tuple(std::get<0>(std::forward<_PairLike>(__p))),
-      std::forward_as_tuple(std::get<1>(std::forward<_PairLike>(__p))));
-}
-#  endif
+template <class _Tp, class = void>
+struct __uses_allocator_construction_args;
 
 namespace __uses_allocator_detail {
 
@@ -165,46 +69,135 @@ inline constexpr bool __uses_allocator_constraints = __is_cv_std_pair<_Tp> && !_
 
 } // namespace __uses_allocator_detail
 
-template < class _Pair,
-           class _Alloc,
-           class _Type,
-           __enable_if_t<__uses_allocator_detail::__uses_allocator_constraints<_Pair, _Type>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, _Type&& __value) noexcept;
-
 template <class _Type, class _Alloc, class... _Args>
 _LIBCPP_HIDE_FROM_ABI constexpr _Type __make_obj_using_allocator(const _Alloc& __alloc, _Args&&... __args);
 
-template < class _Pair,
-           class _Alloc,
-           class _Type,
-           __enable_if_t< __uses_allocator_detail::__uses_allocator_constraints<_Pair, _Type>, int>>
-_LIBCPP_HIDE_FROM_ABI constexpr auto
-__uses_allocator_construction_args(const _Alloc& __alloc, _Type&& __value) noexcept {
-  struct __pair_constructor {
-    using _PairMutable = remove_cv_t<_Pair>;
+template <class _Pair>
+struct __uses_allocator_construction_args<_Pair, __enable_if_t<__is_cv_std_pair<_Pair>>> {
+  template <class _Alloc, class _Tuple1, class _Tuple2>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto
+  __apply(const _Alloc& __alloc, piecewise_construct_t, _Tuple1&& __x, _Tuple2&& __y) noexcept {
+    return std::make_tuple(
+        piecewise_construct,
+        std::apply(
+            [&__alloc](auto&&... __args1) {
+              return __uses_allocator_construction_args<typename _Pair::first_type>::__apply(
+                  __alloc, std::forward<decltype(__args1)>(__args1)...);
+            },
+            std::forward<_Tuple1>(__x)),
+        std::apply(
+            [&__alloc](auto&&... __args2) {
+              return __uses_allocator_construction_args<typename _Pair::second_type>::__apply(
+                  __alloc, std::forward<decltype(__args2)>(__args2)...);
+            },
+            std::forward<_Tuple2>(__y)));
+  }
 
-    _LIBCPP_HIDDEN constexpr auto __do_construct(const _PairMutable& __pair) const {
-      return std::__make_obj_using_allocator<_PairMutable>(__alloc_, __pair);
-    }
+  template <class _Alloc>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(__alloc, piecewise_construct, tuple<>{}, tuple<>{});
+  }
 
-    _LIBCPP_HIDDEN constexpr auto __do_construct(_PairMutable&& __pair) const {
-      return std::__make_obj_using_allocator<_PairMutable>(__alloc_, std::move(__pair));
-    }
+  template <class _Alloc, class _Up, class _Vp>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, _Up&& __u, _Vp&& __v) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc,
+        piecewise_construct,
+        std::forward_as_tuple(std::forward<_Up>(__u)),
+        std::forward_as_tuple(std::forward<_Vp>(__v)));
+  }
 
-    const _Alloc& __alloc_;
-    _Type& __value_;
+#  if _LIBCPP_STD_VER >= 23
+  template <class _Alloc, class _Up, class _Vp>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, pair<_Up, _Vp>& __pair) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc, piecewise_construct, std::forward_as_tuple(__pair.first), std::forward_as_tuple(__pair.second));
+  }
+#  endif
 
-    _LIBCPP_HIDDEN constexpr operator _PairMutable() const { return __do_construct(std::forward<_Type>(__value_)); }
-  };
+  template <class _Alloc, class _Up, class _Vp>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, const pair<_Up, _Vp>& __pair) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc, piecewise_construct, std::forward_as_tuple(__pair.first), std::forward_as_tuple(__pair.second));
+  }
 
-  return std::make_tuple(__pair_constructor{__alloc, __value});
-}
+  template <class _Alloc, class _Up, class _Vp>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, pair<_Up, _Vp>&& __pair) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc,
+        piecewise_construct,
+        std::forward_as_tuple(std::get<0>(std::move(__pair))),
+        std::forward_as_tuple(std::get<1>(std::move(__pair))));
+  }
+
+#  if _LIBCPP_STD_VER >= 23
+  template <class _Alloc, class _Up, class _Vp>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, const pair<_Up, _Vp>&& __pair) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc,
+        piecewise_construct,
+        std::forward_as_tuple(std::get<0>(std::move(__pair))),
+        std::forward_as_tuple(std::get<1>(std::move(__pair))));
+  }
+
+  template < class _Alloc, __pair_like_no_subrange _PairLike>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, _PairLike&& __p) noexcept {
+    return __uses_allocator_construction_args<_Pair>::__apply(
+        __alloc,
+        piecewise_construct,
+        std::forward_as_tuple(std::get<0>(std::forward<_PairLike>(__p))),
+        std::forward_as_tuple(std::get<1>(std::forward<_PairLike>(__p))));
+  }
+#  endif
+
+  template <class _Alloc,
+            class _Type,
+            __enable_if_t<__uses_allocator_detail::__uses_allocator_constraints<_Pair, _Type>, int> = 0>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, _Type&& __value) noexcept {
+    struct __pair_constructor {
+      using _PairMutable = remove_cv_t<_Pair>;
+
+      _LIBCPP_HIDDEN constexpr auto __do_construct(const _PairMutable& __pair) const {
+        return std::__make_obj_using_allocator<_PairMutable>(__alloc_, __pair);
+      }
+
+      _LIBCPP_HIDDEN constexpr auto __do_construct(_PairMutable&& __pair) const {
+        return std::__make_obj_using_allocator<_PairMutable>(__alloc_, std::move(__pair));
+      }
+
+      const _Alloc& __alloc_;
+      _Type& __value_;
+
+      _LIBCPP_HIDDEN constexpr operator _PairMutable() const { return __do_construct(std::forward<_Type>(__value_)); }
+    };
+
+    return std::make_tuple(__pair_constructor{__alloc, __value});
+  }
+};
+
+template <class _Type>
+struct __uses_allocator_construction_args<_Type, __enable_if_t<!__is_cv_std_pair<_Type>>> {
+  template <class _Alloc, class... _Args>
+  static _LIBCPP_HIDE_FROM_ABI constexpr auto __apply(const _Alloc& __alloc, _Args&&... __args) noexcept {
+    if constexpr (!uses_allocator_v<remove_cv_t<_Type>, _Alloc> && is_constructible_v<_Type, _Args...>) {
+      return std::forward_as_tuple(std::forward<_Args>(__args)...);
+    } else if constexpr (uses_allocator_v<remove_cv_t<_Type>, _Alloc> &&
+                         is_constructible_v<_Type, allocator_arg_t, const _Alloc&, _Args...>) {
+      return tuple<allocator_arg_t, const _Alloc&, _Args&&...>(allocator_arg, __alloc, std::forward<_Args>(__args)...);
+    } else if constexpr (uses_allocator_v<remove_cv_t<_Type>, _Alloc> &&
+                         is_constructible_v<_Type, _Args..., const _Alloc&>) {
+      return std::forward_as_tuple(std::forward<_Args>(__args)..., __alloc);
+    } else {
+      static_assert(
+          sizeof(_Type) + 1 == 0, "If uses_allocator_v<Type> is true, the type has to be allocator-constructible");
+    }
+  }
+};
 
 template <class _Type, class _Alloc, class... _Args>
 _LIBCPP_HIDE_FROM_ABI constexpr _Type __make_obj_using_allocator(const _Alloc& __alloc, _Args&&... __args) {
   return std::make_from_tuple<_Type>(
-      std::__uses_allocator_construction_args<_Type>(__alloc, std::forward<_Args>(__args)...));
+      __uses_allocator_construction_args<_Type>::__apply(__alloc, std::forward<_Args>(__args)...));
 }
 
 template <class _Type, class _Alloc, class... _Args>
@@ -212,7 +205,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Type*
 __uninitialized_construct_using_allocator(_Type* __ptr, const _Alloc& __alloc, _Args&&... __args) {
   return std::apply(
       [&__ptr](auto&&... __xs) { return std::__construct_at(__ptr, std::forward<decltype(__xs)>(__xs)...); },
-      std::__uses_allocator_construction_args<_Type>(__alloc, std::forward<_Args>(__args)...));
+      __uses_allocator_construction_args<_Type>::__apply(__alloc, std::forward<_Args>(__args)...));
 }
 
 #endif // _LIBCPP_STD_VER >= 17
@@ -221,8 +214,8 @@ __uninitialized_construct_using_allocator(_Type* __ptr, const _Alloc& __alloc, _
 
 template <class _Type, class _Alloc, class... _Args>
 _LIBCPP_HIDE_FROM_ABI constexpr auto uses_allocator_construction_args(const _Alloc& __alloc, _Args&&... __args) noexcept
-    -> decltype(std::__uses_allocator_construction_args<_Type>(__alloc, std::forward<_Args>(__args)...)) {
-  return /*--*/ std::__uses_allocator_construction_args<_Type>(__alloc, std::forward<_Args>(__args)...);
+    -> decltype(__uses_allocator_construction_args<_Type>::__apply(__alloc, std::forward<_Args>(__args)...)) {
+  return /*--*/ __uses_allocator_construction_args<_Type>::__apply(__alloc, std::forward<_Args>(__args)...);
 }
 
 template <class _Type, class _Alloc, class... _Args>
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index a71096d3e4784..3444e9575e1af 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -174,6 +174,17 @@ class _LIBCPP_AVAILABILITY_PMR _LIBCPP_TEMPLATE_VIS polymorphic_allocator {
 
   _LIBCPP_HIDE_FROM_ABI memory_resource* resource() const noexcept { return __res_; }
 
+  friend bool operator==(const polymorphic_allocator& __lhs, const polymorphic_allocator& __rhs) noexcept {
+    return *__lhs.resource() == *__rhs.resource();
+  }
+
+#  if _LIBCPP_STD_VER <= 17
+  // This overload is not specified, it was added due to LWG3683.
+  friend bool operator!=(const polymorphic_allocator& __lhs, const polymorphic_allocator& __rhs) noexcept {
+    return *__lhs.resource() != *__rhs.resource();
+  }
+#  endif
+
 private:
   template <class... _Args, size_t... _Is>
   _LIBCPP_HIDE_FROM_ABI tuple<_Args&&...>
diff --git a/libcxx/include/__mutex/unique_lock.h b/libcxx/include/__mutex/unique_lock.h
index 4a616ba51ee1c..5df791de4c742 100644
--- a/libcxx/include/__mutex/unique_lock.h
+++ b/libcxx/include/__mutex/unique_lock.h
@@ -22,8 +22,6 @@
 #  pragma GCC system_header
 #endif
 
-#ifndef _LIBCPP_HAS_NO_THREADS
-
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Mutex>
@@ -172,6 +170,4 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(unique_lock<_Mutex>& __x, unique_lock<_Mu
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP_HAS_NO_THREADS
-
 #endif // _LIBCPP___MUTEX_UNIQUE_LOCK_H
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index dc3aaa59ed8c3..bcce389c0e680 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -326,13 +326,6 @@ class transform_view<_View, _Fn>::__iterator : public __transform_view_iterator_
   {
     return __x.__current_ - __y.__current_;
   }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr decltype(auto) iter_move(const __iterator& __i) noexcept(noexcept(*__i)) {
-    if constexpr (is_lvalue_reference_v<decltype(*__i)>)
-      return std::move(*__i);
-    else
-      return *__i;
-  }
 };
 
 #  if _LIBCPP_STD_VER >= 23
diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h
index d2254a695f5e8..458c1cdb2da33 100644
--- a/libcxx/include/__thread/thread.h
+++ b/libcxx/include/__thread/thread.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___THREAD_THREAD_H
 #define _LIBCPP___THREAD_THREAD_H
 
+#include <__assert>
 #include <__condition_variable/condition_variable.h>
 #include <__config>
 #include <__exception/terminate.h>
diff --git a/libcxx/include/__type_traits/is_enum.h b/libcxx/include/__type_traits/is_enum.h
index 77ca3ea108742..2fab6db2c8d50 100644
--- a/libcxx/include/__type_traits/is_enum.h
+++ b/libcxx/include/__type_traits/is_enum.h
@@ -26,6 +26,16 @@ template <class _Tp>
 inline constexpr bool is_enum_v = __is_enum(_Tp);
 #endif
 
+#if _LIBCPP_STD_VER >= 23
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {};
+
+template <class _Tp>
+inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp);
+
+#endif // _LIBCPP_STD_VER >= 23
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_ENUM_H
diff --git a/libcxx/include/__type_traits/is_scoped_enum.h b/libcxx/include/__type_traits/is_scoped_enum.h
deleted file mode 100644
index cb3e25cf57331..0000000000000
--- a/libcxx/include/__type_traits/is_scoped_enum.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H
-#define _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-#if _LIBCPP_STD_VER >= 23
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {};
-
-template <class _Tp>
-inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp);
-
-#endif // _LIBCPP_STD_VER >= 23
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index c1fc009d9ba2e..d09d6ed4a1e7c 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___TYPE_TRAITS_MAKE_SIGNED_H
 
 #include <__config>
+#include <__type_traits/copy_cv.h>
 #include <__type_traits/is_enum.h>
 #include <__type_traits/is_integral.h>
 #include <__type_traits/nat.h>
diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h
index 2c4e9e419a1be..50e9f3e8aa78d 100644
--- a/libcxx/include/__type_traits/remove_cv.h
+++ b/libcxx/include/__type_traits/remove_cv.h
@@ -28,7 +28,7 @@ using __remove_cv_t = typename remove_cv<_Tp>::type;
 #else
 template <class _Tp>
 using __remove_cv_t = __remove_cv(_Tp);
-#endif // __has_builtin(__remove_cv)
+#endif
 
 #if _LIBCPP_STD_VER >= 14
 template <class _Tp>
diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm
index a522a60f1b551..698e6f5cb7ad1 100644
--- a/libcxx/include/algorithm
+++ b/libcxx/include/algorithm
@@ -102,6 +102,31 @@ namespace ranges {
     constexpr borrowed_iterator_t<R>
       find_if_not(R&& r, Pred pred, Proj proj = {});                                                      // since C++20
 
+  template<forward_iterator I, sentinel_for<I> S, class T, class Proj = identity>
+    requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>
+    constexpr subrange<I> find_last(I first, S last, const T& value, Proj proj = {});                     // since C++23
+
+  template<forward_range R, class T, class Proj = identity>
+    requires
+      indirect_binary_predicate<ranges::equal_to, projected<iterator_t<R>, Proj>, const T*>
+    constexpr borrowed_subrange_t<R> find_last(R&& r, const T& value, Proj proj = {});                   // since C++23
+
+  template<forward_iterator I, sentinel_for<I> S, class Proj = identity,
+           indirect_unary_predicate<projected<I, Proj>> Pred>
+    constexpr subrange<I> find_last_if(I first, S last, Pred pred, Proj proj = {});                      // since C++23
+
+  template<forward_range R, class Proj = identity,
+           indirect_unary_predicate<projected<iterator_t<R>, Proj>> Pred>
+    constexpr borrowed_subrange_t<R> find_last_if(R&& r, Pred pred, Proj proj = {});                     // since C++23
+
+  template<forward_iterator I, sentinel_for<I> S, class Proj = identity,
+           indirect_unary_predicate<projected<I, Proj>> Pred>
+    constexpr subrange<I> find_last_if_not(I first, S last, Pred pred, Proj proj = {});                  // since C++23
+
+  template<forward_range R, class Proj = identity,
+           indirect_unary_predicate<projected<iterator_t<R>, Proj>> Pred>
+    constexpr borrowed_subrange_t<R> find_last_if_not(R&& r, Pred pred, Proj proj = {});                 // since C++23
+
   template<class T, class Proj = identity,
            indirect_strict_weak_order<projected<const T*, Proj>> Comp = ranges::less>
     constexpr const T& min(const T& a, const T& b, Comp comp = {}, Proj proj = {});                       // since C++20
@@ -1989,6 +2014,7 @@ template <class BidirectionalIterator, class Compare>
 #  include <__algorithm/fold.h>
 #  include <__algorithm/ranges_contains_subrange.h>
 #  include <__algorithm/ranges_ends_with.h>
+#  include <__algorithm/ranges_find_last.h>
 #  include <__algorithm/ranges_starts_with.h>
 #endif // _LIBCPP_STD_VER >= 23
 
diff --git a/libcxx/include/array b/libcxx/include/array
index 6ffde852f4802..4db0cb7bd7e3b 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -19,17 +19,17 @@ template <class T, size_t N >
 struct array
 {
     // types:
-    typedef T & reference;
-    typedef const T & const_reference;
-    typedef implementation defined iterator;
-    typedef implementation defined const_iterator;
-    typedef size_t size_type;
-    typedef ptrdiff_t difference_type;
-    typedef T value_type;
-    typedef T* pointer;
-    typedef const T* const_pointer;
-    typedef std::reverse_iterator<iterator> reverse_iterator;
-    typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+    using value_type             = T;
+    using pointer                = T*;
+    using const_pointer          = const T*;
+    using reference              = T&;
+    using const_reference        = const T&;
+    using size_type              = size_t;
+    using difference_type        = ptrdiff_t;
+    using iterator               = implementation-defined;
+    using const_iterator         = implementation-defined;
+    using reverse_iterator       = std::reverse_iterator<iterator>;
+    using const_reverse_iterator = std::reverse_iterator<const_iterator>;
 
     // No explicit construct/copy/destroy for aggregate type
     void fill(const T& u);                                      // constexpr in C++20
@@ -270,20 +270,25 @@ struct _LIBCPP_TEMPLATE_VIS array {
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
   // types:
-  typedef array __self;
-  typedef _Tp value_type;
-  typedef value_type& reference;
-  typedef const value_type& const_reference;
-  typedef value_type* iterator;
-  typedef const value_type* const_iterator;
-  typedef value_type* pointer;
-  typedef const value_type* const_pointer;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-  typedef std::reverse_iterator<iterator> reverse_iterator;
-  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-
-  typedef __conditional_t<is_const<_Tp>::value, const __empty, __empty> _EmptyType;
+  using __self          = array;
+  using value_type      = _Tp;
+  using reference       = value_type&;
+  using const_reference = const value_type&;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
+#if defined(_LIBCPP_ABI_USE_WRAP_ITER_IN_STD_ARRAY)
+  using iterator       = __wrap_iter<pointer>;
+  using const_iterator = __wrap_iter<const_pointer>;
+#else
+  using iterator       = pointer;
+  using const_iterator = const_pointer;
+#endif
+  using size_type              = size_t;
+  using difference_type        = ptrdiff_t;
+  using reverse_iterator       = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  using _EmptyType = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
 
   struct _ArrayInStructT {
     _Tp __data_[1];
@@ -440,7 +445,7 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size<array<_Tp, _Size> > : public integral_con
 template <size_t _Ip, class _Tp, size_t _Size>
 struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, array<_Tp, _Size> > {
   static_assert(_Ip < _Size, "Index out of bounds in std::tuple_element<> (std::array)");
-  typedef _Tp type;
+  using type = _Tp;
 };
 
 template <size_t _Ip, class _Tp, size_t _Size>
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index 80a0f9ee373e9..772ac998615a9 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -101,12 +101,12 @@ struct atomic
     bool compare_exchange_strong(T& expc, T desr,
                                  memory_order m = memory_order_seq_cst) noexcept;
 
-    void wait(T, memory_order = memory_order::seq_cst) const volatile noexcept;
-    void wait(T, memory_order = memory_order::seq_cst) const noexcept;
-    void notify_one() volatile noexcept;
-    void notify_one() noexcept;
-    void notify_all() volatile noexcept;
-    void notify_all() noexcept;
+    void wait(T, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+    void wait(T, memory_order = memory_order::seq_cst) const noexcept;          // since C++20
+    void notify_one() volatile noexcept;                                        // since C++20
+    void notify_one() noexcept;                                                 // since C++20
+    void notify_all() volatile noexcept;                                        // since C++20
+    void notify_all() noexcept;                                                 // since C++20
 };
 
 template <>
@@ -184,12 +184,12 @@ struct atomic<integral>
     integral operator^=(integral op) volatile noexcept;
     integral operator^=(integral op) noexcept;
 
-    void wait(integral, memory_order = memory_order::seq_cst) const volatile noexcept;
-    void wait(integral, memory_order = memory_order::seq_cst) const noexcept;
-    void notify_one() volatile noexcept;
-    void notify_one() noexcept;
-    void notify_all() volatile noexcept;
-    void notify_all() noexcept;
+    void wait(integral, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+    void wait(integral, memory_order = memory_order::seq_cst) const noexcept;          // since C++20
+    void notify_one() volatile noexcept;                                               // since C++20
+    void notify_one() noexcept;                                                        // since C++20
+    void notify_all() volatile noexcept;                                               // since C++20
+    void notify_all() noexcept;                                                        // since C++20
 };
 
 template <class T>
@@ -254,12 +254,12 @@ struct atomic<T*>
     T* operator-=(ptrdiff_t op) volatile noexcept;
     T* operator-=(ptrdiff_t op) noexcept;
 
-    void wait(T*, memory_order = memory_order::seq_cst) const volatile noexcept;
-    void wait(T*, memory_order = memory_order::seq_cst) const noexcept;
-    void notify_one() volatile noexcept;
-    void notify_one() noexcept;
-    void notify_all() volatile noexcept;
-    void notify_all() noexcept;
+    void wait(T*, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+    void wait(T*, memory_order = memory_order::seq_cst) const noexcept;          // since C++20
+    void notify_one() volatile noexcept;                                         // since C++20
+    void notify_one() noexcept;                                                  // since C++20
+    void notify_all() volatile noexcept;                                         // since C++20
+    void notify_all() noexcept;                                                  // since C++20
 };
 
 template<>
@@ -321,12 +321,12 @@ struct atomic<floating-point-type> {  // since C++20
   floating-point-type operator-=(floating-point-type) volatile noexcept;
   floating-point-type operator-=(floating-point-type) noexcept;
 
-  void wait(floating-point-type, memory_order = memory_order::seq_cst) const volatile noexcept;
-  void wait(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
-  void notify_one() volatile noexcept;
-  void notify_one() noexcept;
-  void notify_all() volatile noexcept;
-  void notify_all() noexcept;
+  void wait(floating-point-type, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+  void wait(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;          // since C++20
+  void notify_one() volatile noexcept;                                                          // since C++20
+  void notify_one() noexcept;                                                                   // since C++20
+  void notify_all() volatile noexcept;                                                          // since C++20
+  void notify_all() noexcept;                                                                   // since C++20
 };
 
 // [atomics.nonmembers], non-member functions
@@ -443,23 +443,23 @@ template<class T>
                               memory_order) noexcept;
 
 template<class T>
-  void atomic_wait(const volatile atomic<T>*, atomic<T>::value_type) noexcept;
+  void atomic_wait(const volatile atomic<T>*, atomic<T>::value_type) noexcept; // since C++20
 template<class T>
-  void atomic_wait(const atomic<T>*, atomic<T>::value_type) noexcept;
+  void atomic_wait(const atomic<T>*, atomic<T>::value_type) noexcept;          // since C++20
 template<class T>
-  void atomic_wait_explicit(const volatile atomic<T>*, atomic<T>::value_type,
+  void atomic_wait_explicit(const volatile atomic<T>*, atomic<T>::value_type,  // since C++20
                             memory_order) noexcept;
 template<class T>
-  void atomic_wait_explicit(const atomic<T>*, atomic<T>::value_type,
+  void atomic_wait_explicit(const atomic<T>*, atomic<T>::value_type,           // since C++20
                             memory_order) noexcept;
 template<class T>
-  void atomic_notify_one(volatile atomic<T>*) noexcept;
+  void atomic_notify_one(volatile atomic<T>*) noexcept;                        // since C++20
 template<class T>
-  void atomic_notify_one(atomic<T>*) noexcept;
+  void atomic_notify_one(atomic<T>*) noexcept;                                 // since C++20
 template<class T>
-  void atomic_notify_all(volatile atomic<T>*) noexcept;
+  void atomic_notify_all(volatile atomic<T>*) noexcept;                        // since C++20
 template<class T>
-  void atomic_notify_all(atomic<T>*) noexcept;
+  void atomic_notify_all(atomic<T>*) noexcept;                                 // since C++20
 
 // Atomics for standard typedef types
 
@@ -534,12 +534,12 @@ typedef struct atomic_flag
     void clear(memory_order m = memory_order_seq_cst) volatile noexcept;
     void clear(memory_order m = memory_order_seq_cst) noexcept;
 
-    void wait(bool, memory_order = memory_order::seq_cst) const volatile noexcept;
-    void wait(bool, memory_order = memory_order::seq_cst) const noexcept;
-    void notify_one() volatile noexcept;
-    void notify_one() noexcept;
-    void notify_all() volatile noexcept;
-    void notify_all() noexcept;
+    void wait(bool, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+    void wait(bool, memory_order = memory_order::seq_cst) const noexcept;          // since C++20
+    void notify_one() volatile noexcept;                                           // since C++20
+    void notify_one() noexcept;                                                    // since C++20
+    void notify_all() volatile noexcept;                                           // since C++20
+    void notify_all() noexcept;                                                    // since C++20
 } atomic_flag;
 
 bool atomic_flag_test(volatile atomic_flag* obj) noexcept;
@@ -557,14 +557,14 @@ void atomic_flag_clear(atomic_flag* obj) noexcept;
 void atomic_flag_clear_explicit(volatile atomic_flag* obj, memory_order m) noexcept;
 void atomic_flag_clear_explicit(atomic_flag* obj, memory_order m) noexcept;
 
-void atomic_wait(const volatile atomic_flag* obj, T old) noexcept;
-void atomic_wait(const atomic_flag* obj, T old) noexcept;
-void atomic_wait_explicit(const volatile atomic_flag* obj, T old, memory_order m) noexcept;
-void atomic_wait_explicit(const atomic_flag* obj, T old, memory_order m) noexcept;
-void atomic_one(volatile atomic_flag* obj) noexcept;
-void atomic_one(atomic_flag* obj) noexcept;
-void atomic_all(volatile atomic_flag* obj) noexcept;
-void atomic_all(atomic_flag* obj) noexcept;
+void atomic_wait(const volatile atomic_flag* obj, T old) noexcept;                          // since C++20
+void atomic_wait(const atomic_flag* obj, T old) noexcept;                                   // since C++20
+void atomic_wait_explicit(const volatile atomic_flag* obj, T old, memory_order m) noexcept; // since C++20
+void atomic_wait_explicit(const atomic_flag* obj, T old, memory_order m) noexcept;          // since C++20
+void atomic_one(volatile atomic_flag* obj) noexcept;                                        // since C++20
+void atomic_one(atomic_flag* obj) noexcept;                                                 // since C++20
+void atomic_all(volatile atomic_flag* obj) noexcept;                                        // since C++20
+void atomic_all(atomic_flag* obj) noexcept;                                                 // since C++20
 
 // fences
 
@@ -599,7 +599,6 @@ template <class T>
 #include <__atomic/atomic_flag.h>
 #include <__atomic/atomic_init.h>
 #include <__atomic/atomic_lock_free.h>
-#include <__atomic/atomic_ref.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/check_memory_order.h>
 #include <__atomic/contention_t.h>
@@ -610,6 +609,10 @@ template <class T>
 #include <__atomic/memory_order.h>
 #include <version>
 
+#if _LIBCPP_STD_VER >= 20
+#  include <__atomic/atomic_ref.h>
+#endif
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index edee181273e24..ba29ebc3212ee 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -17,7 +17,7 @@ namespace std
 {
 
   template<class CompletionFunction = see below>
-  class barrier
+  class barrier                                   // since C++20
   {
   public:
     using arrival_token = see below;
@@ -68,7 +68,7 @@ namespace std
 _LIBCPP_PUSH_MACROS
 #  include <__undef_macros>
 
-#  if _LIBCPP_STD_VER >= 14
+#  if _LIBCPP_STD_VER >= 20
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
@@ -254,7 +254,7 @@ public:
 #    endif // !_LIBCPP_HAS_NO_TREE_BARRIER
 
 template <class _CompletionF = __empty_completion>
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC barrier {
+class barrier {
   __barrier_base<_CompletionF> __b_;
 
 public:
@@ -290,7 +290,7 @@ public:
 
 _LIBCPP_END_NAMESPACE_STD
 
-#  endif // _LIBCPP_STD_VER >= 14
+#  endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_POP_MACROS
 
@@ -305,4 +305,4 @@ _LIBCPP_POP_MACROS
 #  include <variant>
 #endif
 
-#endif //_LIBCPP_BARRIER
+#endif // _LIBCPP_BARRIER
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index 7f25c76fda542..990c415ec2e97 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -799,6 +799,11 @@ template<class Duration1, class Duration2, class TimeZonePtr>
   bool operator==(const zoned_time<Duration1, TimeZonePtr>& x,
                   const zoned_time<Duration2, TimeZonePtr>& y);
 
+template<class charT, class traits, class Duration, class TimeZonePtr>           // C++20
+  basic_ostream<charT, traits>&
+    operator<<(basic_ostream<charT, traits>& os,
+               const zoned_time<Duration, TimeZonePtr>& t);
+
 // [time.zone.leap], leap second support
 class leap_second {                                                              // C++20
 public:
@@ -881,6 +886,8 @@ namespace std {
     struct formatter<chrono::hh_mm_ss<duration<Rep, Period>>, charT>;             // C++20
   template<class charT> struct formatter<chrono::sys_info, charT>;                // C++20
   template<class charT> struct formatter<chrono::local_info, charT>;              // C++20
+  template<class Duration, class TimeZonePtr, class charT>                        // C++20
+    struct formatter<chrono::zoned_time<Duration, TimeZonePtr>, charT>;
 } // namespace std
 
 namespace chrono {
diff --git a/libcxx/include/cmath b/libcxx/include/cmath
index 7a87e35c84603..3c22604a683c3 100644
--- a/libcxx/include/cmath
+++ b/libcxx/include/cmath
@@ -204,6 +204,14 @@ floating_point fmin (arithmetic x, arithmetic y);
 float          fminf(float x, float y);
 long double    fminl(long double x, long double y);
 
+double         hermite(unsigned n, double x);                    // C++17
+float          hermite(unsigned n, float x);                     // C++17
+long double    hermite(unsigned n, long double x);               // C++17
+float          hermitef(unsigned n, float x);                    // C++17
+long double    hermitel(unsigned n, long double x);              // C++17
+template <class Integer>
+double         hermite(unsigned n, Integer x);                   // C++17
+
 floating_point hypot (arithmetic x, arithmetic y);
 float          hypotf(float x, float y);
 long double    hypotl(long double x, long double y);
@@ -315,6 +323,7 @@ constexpr long double lerp(long double a, long double b, long double t) noexcept
 #include <limits>
 #include <version>
 
+#include <__math/special_functions.h>
 #include <math.h>
 
 #ifndef _LIBCPP_MATH_H
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 4fc994a6e229b..759de5d3a030a 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -376,10 +376,10 @@ public:
     return __x.__ptr_ == __y.__ptr_;
   }
 
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __deque_iterator& __x, const __deque_iterator& __y) {
     return !(__x == __y);
   }
-
   _LIBCPP_HIDE_FROM_ABI friend bool operator<(const __deque_iterator& __x, const __deque_iterator& __y) {
     return __x.__m_iter_ < __y.__m_iter_ || (__x.__m_iter_ == __y.__m_iter_ && __x.__ptr_ < __y.__ptr_);
   }
@@ -396,6 +396,30 @@ public:
     return !(__x < __y);
   }
 
+#else
+
+  _LIBCPP_HIDE_FROM_ABI friend strong_ordering operator<=>(const __deque_iterator& __x, const __deque_iterator& __y) {
+    if (__x.__m_iter_ < __y.__m_iter_)
+      return strong_ordering::less;
+
+    if (__x.__m_iter_ == __y.__m_iter_) {
+      if constexpr (three_way_comparable<pointer, strong_ordering>) {
+        return __x.__ptr_ <=> __y.__ptr_;
+      } else {
+        if (__x.__ptr_ < __y.__ptr_)
+          return strong_ordering::less;
+
+        if (__x.__ptr_ == __y.__ptr_)
+          return strong_ordering::equal;
+
+        return strong_ordering::greater;
+      }
+    }
+
+    return strong_ordering::greater;
+  }
+#endif // _LIBCPP_STD_VER >= 20
+
 private:
   _LIBCPP_HIDE_FROM_ABI explicit __deque_iterator(__map_iterator __m, pointer __p) _NOEXCEPT
       : __m_iter_(__m),
@@ -2530,8 +2554,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const deque<_Tp, _Allocator>& __x,
 template <class _Tp, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
 operator<=>(const deque<_Tp, _Allocator>& __x, const deque<_Tp, _Allocator>& __y) {
-  return std::lexicographical_compare_three_way(
-      __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
+  return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
 
 #endif // _LIBCPP_STD_VER <= 17
diff --git a/libcxx/include/expected b/libcxx/include/expected
index f455ab7d5d61c..6a2f12f2bf3b5 100644
--- a/libcxx/include/expected
+++ b/libcxx/include/expected
@@ -39,14 +39,24 @@ namespace std {
 */
 
 #include <__config>
-#include <__expected/bad_expected_access.h>
-#include <__expected/expected.h>
-#include <__expected/unexpect.h>
-#include <__expected/unexpected.h>
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__expected/bad_expected_access.h>
+#  include <__expected/expected.h>
+#  include <__expected/unexpect.h>
+#  include <__expected/unexpected.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <cstddef>
+#  include <initializer_list>
+#  include <new>
+#endif
+
 #endif // _LIBCPP_EXPECTED
diff --git a/libcxx/include/filesystem b/libcxx/include/filesystem
index eff7dff4a4551..6ea04df0a089b 100644
--- a/libcxx/include/filesystem
+++ b/libcxx/include/filesystem
@@ -534,22 +534,26 @@ inline constexpr bool std::ranges::enable_view<std::filesystem::recursive_direct
 */
 
 #include <__config>
-#include <__filesystem/copy_options.h>
-#include <__filesystem/directory_entry.h>
-#include <__filesystem/directory_iterator.h>
-#include <__filesystem/directory_options.h>
-#include <__filesystem/file_status.h>
-#include <__filesystem/file_time_type.h>
-#include <__filesystem/file_type.h>
-#include <__filesystem/filesystem_error.h>
-#include <__filesystem/operations.h>
-#include <__filesystem/path.h>
-#include <__filesystem/path_iterator.h>
-#include <__filesystem/perm_options.h>
-#include <__filesystem/perms.h>
-#include <__filesystem/recursive_directory_iterator.h>
-#include <__filesystem/space_info.h>
-#include <__filesystem/u8path.h>
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__filesystem/copy_options.h>
+#  include <__filesystem/directory_entry.h>
+#  include <__filesystem/directory_iterator.h>
+#  include <__filesystem/directory_options.h>
+#  include <__filesystem/file_status.h>
+#  include <__filesystem/file_time_type.h>
+#  include <__filesystem/file_type.h>
+#  include <__filesystem/filesystem_error.h>
+#  include <__filesystem/operations.h>
+#  include <__filesystem/path.h>
+#  include <__filesystem/path_iterator.h>
+#  include <__filesystem/perm_options.h>
+#  include <__filesystem/perms.h>
+#  include <__filesystem/recursive_directory_iterator.h>
+#  include <__filesystem/space_info.h>
+#  include <__filesystem/u8path.h>
+#endif
+
 #include <version>
 
 // standard-mandated includes
diff --git a/libcxx/include/format b/libcxx/include/format
index 07c2ba083199e..449e6f0bf3fb6 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -126,6 +126,9 @@ namespace std {
   // [format.formatter], formatter
   template<class T, class charT = char> struct formatter;
 
+  template<class T>
+  constexpr bool enable_nonlocking_formatter_optimization = false;   // since C++23
+
   // [format.parse.ctx], class template basic_format_parse_context
   template<class charT> class basic_format_parse_context;
   using format_parse_context = basic_format_parse_context<char>;
@@ -133,7 +136,7 @@ namespace std {
 
   // [format.range], formatting of ranges
   // [format.range.fmtkind], variable template format_kind
-  enum class range_format {                                     // since C++23
+  enum class range_format {                                          // since C++23
     disabled,
     map,
     set,
@@ -143,20 +146,20 @@ namespace std {
   };
 
   template<class R>
-    constexpr unspecified format_kind = unspecified;            // since C++23
+    constexpr unspecified format_kind = unspecified;                 // since C++23
 
   template<ranges::input_range R>
       requires same_as<R, remove_cvref_t<R>>
-    constexpr range_format format_kind<R> = see below;          // since C++23
+    constexpr range_format format_kind<R> = see below;               // since C++23
 
   // [format.range.formatter], class template range_formatter
   template<class T, class charT = char>
     requires same_as<remove_cvref_t<T>, T> && formattable<T, charT>
-  class range_formatter;                                        // since C++23
+  class range_formatter;                                             // since C++23
 
   // [format.range.fmtdef], class template range-default-formatter
   template<range_format K, ranges::input_range R, class charT>
-    struct range-default-formatter;                             // exposition only, since C++23
+    struct range-default-formatter;                                  // exposition only, since C++23
 
   // [format.range.fmtmap], [format.range.fmtset], [format.range.fmtstr],
   // specializations for maps, sets, and strings
@@ -173,7 +176,7 @@ namespace std {
     see below visit_format_arg(Visitor&& vis, basic_format_arg<Context> arg); // Deprecated in C++26
 
   // [format.arg.store], class template format-arg-store
-  template<class Context, class... Args> struct format-arg-store;      // exposition only
+  template<class Context, class... Args> struct format-arg-store;    // exposition only
 
   template<class Context = format_context, class... Args>
     format-arg-store<Context, Args...>
@@ -189,34 +192,38 @@ namespace std {
 */
 
 #include <__config>
-#include <__format/buffer.h>
-#include <__format/concepts.h>
-#include <__format/container_adaptor.h>
-#include <__format/enable_insertable.h>
-#include <__format/escaped_output_table.h>
-#include <__format/extended_grapheme_cluster_table.h>
-#include <__format/format_arg.h>
-#include <__format/format_arg_store.h>
-#include <__format/format_args.h>
-#include <__format/format_context.h>
-#include <__format/format_error.h>
-#include <__format/format_functions.h>
-#include <__format/format_parse_context.h>
-#include <__format/format_string.h>
-#include <__format/format_to_n_result.h>
-#include <__format/formatter.h>
-#include <__format/formatter_bool.h>
-#include <__format/formatter_char.h>
-#include <__format/formatter_floating_point.h>
-#include <__format/formatter_integer.h>
-#include <__format/formatter_pointer.h>
-#include <__format/formatter_string.h>
-#include <__format/formatter_tuple.h>
-#include <__format/parser_std_format_spec.h>
-#include <__format/range_default_formatter.h>
-#include <__format/range_formatter.h>
-#include <__format/unicode.h>
-#include <__fwd/format.h>
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__format/buffer.h>
+#  include <__format/concepts.h>
+#  include <__format/container_adaptor.h>
+#  include <__format/enable_insertable.h>
+#  include <__format/escaped_output_table.h>
+#  include <__format/extended_grapheme_cluster_table.h>
+#  include <__format/format_arg.h>
+#  include <__format/format_arg_store.h>
+#  include <__format/format_args.h>
+#  include <__format/format_context.h>
+#  include <__format/format_error.h>
+#  include <__format/format_functions.h>
+#  include <__format/format_parse_context.h>
+#  include <__format/format_string.h>
+#  include <__format/format_to_n_result.h>
+#  include <__format/formatter.h>
+#  include <__format/formatter_bool.h>
+#  include <__format/formatter_char.h>
+#  include <__format/formatter_floating_point.h>
+#  include <__format/formatter_integer.h>
+#  include <__format/formatter_pointer.h>
+#  include <__format/formatter_string.h>
+#  include <__format/formatter_tuple.h>
+#  include <__format/parser_std_format_spec.h>
+#  include <__format/range_default_formatter.h>
+#  include <__format/range_formatter.h>
+#  include <__format/unicode.h>
+#  include <__fwd/format.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -224,9 +231,30 @@ namespace std {
 #endif
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <array>
+#  include <cctype>
+#  include <cerrno>
+#  include <clocale>
+#  include <cmath>
+#  include <cstddef>
+#  include <cstdint>
+#  include <cstdlib>
+#  include <cstring>
+#  include <initializer_list>
+#  include <limits>
 #  include <locale>
+#  include <new>
+#  include <optional>
 #  include <queue>
 #  include <stack>
+#  include <stdexcept>
+#  include <string>
+#  include <string_view>
+#  include <tuple>
+
+#  if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
+#    include <cwchar>
+#  endif
 #endif
 
 #endif // _LIBCPP_FORMAT
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 1ae19d23f88cc..b14d2cb6c7803 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -224,6 +224,7 @@ template <class T, class Allocator, class Predicate>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
+#include <__type_traits/is_swappable.h>
 #include <__type_traits/type_identity.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
diff --git a/libcxx/include/functional b/libcxx/include/functional
index 27cf21e1a4c8b..3d39f654ddb08 100644
--- a/libcxx/include/functional
+++ b/libcxx/include/functional
@@ -527,41 +527,60 @@ POLICY:  For non-variadic implementations, the number of arguments is limited
 
 */
 
-#include <__algorithm/search.h>
-#include <__compare/compare_three_way.h>
 #include <__config>
+
 #include <__functional/binary_function.h>
 #include <__functional/binary_negate.h>
 #include <__functional/bind.h>
-#include <__functional/bind_back.h>
-#include <__functional/bind_front.h>
 #include <__functional/binder1st.h>
 #include <__functional/binder2nd.h>
-#include <__functional/boyer_moore_searcher.h>
-#include <__functional/compose.h>
-#include <__functional/default_searcher.h>
-#include <__functional/function.h>
 #include <__functional/hash.h>
-#include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__functional/mem_fn.h> // TODO: deprecate
 #include <__functional/mem_fun_ref.h>
-#include <__functional/not_fn.h>
 #include <__functional/operations.h>
 #include <__functional/pointer_to_binary_function.h>
 #include <__functional/pointer_to_unary_function.h>
-#include <__functional/ranges_operations.h>
 #include <__functional/reference_wrapper.h>
 #include <__functional/unary_function.h>
 #include <__functional/unary_negate.h>
-#include <__type_traits/unwrap_ref.h>
-#include <__utility/forward.h>
+
+#ifndef _LIBCPP_CXX03_LANG
+#  include <__functional/function.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__functional/boyer_moore_searcher.h>
+#  include <__functional/default_searcher.h>
+#  include <__functional/invoke.h>
+#  include <__functional/not_fn.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__functional/bind_back.h>
+#  include <__functional/bind_front.h>
+#  include <__functional/identity.h>
+#  include <__functional/ranges_operations.h>
+#  include <__type_traits/unwrap_ref.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && defined(_LIBCPP_CXX03_LANG)
+#  include <limits>
+#  include <new>
+#endif
+
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14
+#  include <array>
+#  include <initializer_list>
+#  include <unordered_map>
+#  include <vector>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <atomic>
 #  include <concepts>
diff --git a/libcxx/include/iterator b/libcxx/include/iterator
index 1b9e7eaf0c1e8..fca75f0a19ed1 100644
--- a/libcxx/include/iterator
+++ b/libcxx/include/iterator
@@ -683,43 +683,49 @@ template <class E> constexpr const E* data(initializer_list<E> il) noexcept;
 #include <__iterator/access.h>
 #include <__iterator/advance.h>
 #include <__iterator/back_insert_iterator.h>
-#include <__iterator/bounded_iter.h>
-#include <__iterator/common_iterator.h>
-#include <__iterator/concepts.h>
-#include <__iterator/counted_iterator.h>
-#include <__iterator/data.h>
-#include <__iterator/default_sentinel.h>
 #include <__iterator/distance.h>
-#include <__iterator/empty.h>
-#include <__iterator/erase_if_container.h>
 #include <__iterator/front_insert_iterator.h>
-#include <__iterator/incrementable_traits.h>
-#include <__iterator/indirectly_comparable.h>
 #include <__iterator/insert_iterator.h>
 #include <__iterator/istream_iterator.h>
 #include <__iterator/istreambuf_iterator.h>
-#include <__iterator/iter_move.h>
-#include <__iterator/iter_swap.h>
 #include <__iterator/iterator.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/mergeable.h>
 #include <__iterator/move_iterator.h>
-#include <__iterator/move_sentinel.h>
 #include <__iterator/next.h>
 #include <__iterator/ostream_iterator.h>
 #include <__iterator/ostreambuf_iterator.h>
-#include <__iterator/permutable.h>
 #include <__iterator/prev.h>
-#include <__iterator/projected.h>
-#include <__iterator/readable_traits.h>
-#include <__iterator/reverse_access.h>
 #include <__iterator/reverse_iterator.h>
-#include <__iterator/size.h>
-#include <__iterator/sortable.h>
-#include <__iterator/unreachable_sentinel.h>
 #include <__iterator/wrap_iter.h>
-#include <__memory/addressof.h>
-#include <__memory/pointer_traits.h>
+
+#if _LIBCPP_STD_VER >= 14
+#  include <__iterator/reverse_access.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__iterator/data.h>
+#  include <__iterator/empty.h>
+#  include <__iterator/size.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__iterator/common_iterator.h>
+#  include <__iterator/concepts.h>
+#  include <__iterator/counted_iterator.h>
+#  include <__iterator/default_sentinel.h>
+#  include <__iterator/incrementable_traits.h>
+#  include <__iterator/indirectly_comparable.h>
+#  include <__iterator/iter_move.h>
+#  include <__iterator/iter_swap.h>
+#  include <__iterator/mergeable.h>
+#  include <__iterator/move_sentinel.h>
+#  include <__iterator/permutable.h>
+#  include <__iterator/projected.h>
+#  include <__iterator/readable_traits.h>
+#  include <__iterator/sortable.h>
+#  include <__iterator/unreachable_sentinel.h>
+#endif
+
 #include <version>
 
 // standard-mandated includes
@@ -732,6 +738,10 @@ template <class E> constexpr const E* data(initializer_list<E> il) noexcept;
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
+#  include <variant>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cstdlib>
 #  include <exception>
diff --git a/libcxx/include/latch b/libcxx/include/latch
index 81d6028a9c2ce..b56e49bc768bf 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -16,7 +16,7 @@
 namespace std
 {
 
-  class latch
+  class latch                                     // since C++20
   {
   public:
     static constexpr ptrdiff_t max() noexcept;
@@ -59,11 +59,11 @@ namespace std
 _LIBCPP_PUSH_MACROS
 #  include <__undef_macros>
 
-#  if _LIBCPP_STD_VER >= 14
+#  if _LIBCPP_STD_VER >= 20
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC latch {
+class latch {
   __atomic_base<ptrdiff_t> __a_;
 
 public:
@@ -116,7 +116,7 @@ private:
 
 _LIBCPP_END_NAMESPACE_STD
 
-#  endif // _LIBCPP_STD_VER >= 14
+#  endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_POP_MACROS
 
@@ -126,4 +126,4 @@ _LIBCPP_POP_MACROS
 #  include <atomic>
 #endif
 
-#endif //_LIBCPP_LATCH
+#endif // _LIBCPP_LATCH
diff --git a/libcxx/include/locale b/libcxx/include/locale
index dbec23a2c936d..573910a85bef5 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -232,6 +232,10 @@ template <class charT> class messages_byname;
 #    include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h>
 #  endif
 
+#  if defined(__APPLE__) || defined(__FreeBSD__)
+#    include <xlocale.h>
+#  endif
+
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #    pragma GCC system_header
 #  endif
diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan
index aa7ba278b1aa0..29190e4a9953e 100644
--- a/libcxx/include/mdspan
+++ b/libcxx/include/mdspan
@@ -409,17 +409,30 @@ namespace std {
 #define _LIBCPP_MDSPAN
 
 #include <__config>
-#include <__fwd/mdspan.h>
-#include <__mdspan/default_accessor.h>
-#include <__mdspan/extents.h>
-#include <__mdspan/layout_left.h>
-#include <__mdspan/layout_right.h>
-#include <__mdspan/layout_stride.h>
-#include <__mdspan/mdspan.h>
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__fwd/mdspan.h>
+#  include <__mdspan/default_accessor.h>
+#  include <__mdspan/extents.h>
+#  include <__mdspan/layout_left.h>
+#  include <__mdspan/layout_right.h>
+#  include <__mdspan/layout_stride.h>
+#  include <__mdspan/mdspan.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <array>
+#  include <cinttypes>
+#  include <concepts>
+#  include <cstddef>
+#  include <limits>
+#  include <span>
+#endif
+
 #endif // _LIBCPP_MDSPAN
diff --git a/libcxx/include/memory b/libcxx/include/memory
index a8c0264eb9eb7..b940a32c3ebe6 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -451,7 +451,8 @@ public:
     constexpr unique_ptr& operator=(nullptr_t) noexcept;                              // constexpr since C++23
 
     // observers
-    typename constexpr add_lvalue_reference<T>::type operator*() const;               // constexpr since C++23
+    constexpr
+    add_lvalue_reference<T>::type operator*() const noexcept(see below);              // constexpr since C++23
     constexpr pointer operator->() const noexcept;                                    // constexpr since C++23
     constexpr pointer get() const noexcept;                                           // constexpr since C++23
     constexpr deleter_type& get_deleter() noexcept;                                   // constexpr since C++23
@@ -911,6 +912,22 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 template<size_t N, class T>
 [[nodiscard]] constexpr T* assume_aligned(T* ptr); // since C++20
 
+// [out.ptr.t], class template out_ptr_t
+template<class Smart, class Pointer, class... Args>
+  class out_ptr_t;                                          // since c++23
+
+// [out.ptr], function template out_ptr
+template<class Pointer = void, class Smart, class... Args>
+  auto out_ptr(Smart& s, Args&&... args);                   // since c++23
+
+// [inout.ptr.t], class template inout_ptr_t
+template<class Smart, class Pointer, class... Args>
+  class inout_ptr_t;                                        // since c++23
+
+// [inout.ptr], function template inout_ptr
+template<class Pointer = void, class Smart, class... Args>
+  auto inout_ptr(Smart& s, Args&&... args);                 // since c++23
+
 }  // std
 
 */
@@ -920,30 +937,40 @@ template<size_t N, class T>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__memory/align.h>
-#include <__memory/allocate_at_least.h>
-#include <__memory/allocation_guard.h>
 #include <__memory/allocator.h>
 #include <__memory/allocator_arg_t.h>
 #include <__memory/allocator_traits.h>
-#include <__memory/assume_aligned.h>
 #include <__memory/auto_ptr.h>
-#include <__memory/compressed_pair.h>
-#include <__memory/concepts.h>
-#include <__memory/construct_at.h>
+#include <__memory/inout_ptr.h>
+#include <__memory/out_ptr.h>
 #include <__memory/pointer_traits.h>
-#include <__memory/ranges_construct_at.h>
-#include <__memory/ranges_uninitialized_algorithms.h>
 #include <__memory/raw_storage_iterator.h>
 #include <__memory/shared_ptr.h>
 #include <__memory/temporary_buffer.h>
 #include <__memory/uninitialized_algorithms.h>
 #include <__memory/unique_ptr.h>
 #include <__memory/uses_allocator.h>
-#include <__memory/uses_allocator_construction.h>
-#include <version>
 
 // standard-mandated includes
 
+#if _LIBCPP_STD_VER >= 17
+#  include <__memory/construct_at.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__memory/assume_aligned.h>
+#  include <__memory/concepts.h>
+#  include <__memory/ranges_construct_at.h>
+#  include <__memory/ranges_uninitialized_algorithms.h>
+#  include <__memory/uses_allocator_construction.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__memory/allocate_at_least.h>
+#endif
+
+#include <version>
+
 // [memory.syn]
 #include <compare>
 
diff --git a/libcxx/include/memory_resource b/libcxx/include/memory_resource
index e9c87777e8f75..67411054820a1 100644
--- a/libcxx/include/memory_resource
+++ b/libcxx/include/memory_resource
@@ -50,18 +50,32 @@ namespace std::pmr {
  */
 
 #include <__config>
-#include <__memory_resource/memory_resource.h>
-#include <__memory_resource/monotonic_buffer_resource.h>
-#include <__memory_resource/polymorphic_allocator.h>
-#include <__memory_resource/pool_options.h>
-#include <__memory_resource/synchronized_pool_resource.h>
-#include <__memory_resource/unsynchronized_pool_resource.h>
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__memory_resource/memory_resource.h>
+#  include <__memory_resource/monotonic_buffer_resource.h>
+#  include <__memory_resource/polymorphic_allocator.h>
+#  include <__memory_resource/pool_options.h>
+#  include <__memory_resource/synchronized_pool_resource.h>
+#  include <__memory_resource/unsynchronized_pool_resource.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14
+#  include <cstddef>
+#  include <cstdint>
+#  include <limits>
+#  include <mutex>
+#  include <new>
+#  include <stdexcept>
+#  include <tuple>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <stdexcept>
 #endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index f4aaa14c1c2ee..13d0dce34d97e 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -764,6 +764,7 @@ module std_private_algorithm_ranges_find_end                             [system
 module std_private_algorithm_ranges_find_first_of                        [system] { header "__algorithm/ranges_find_first_of.h" }
 module std_private_algorithm_ranges_find_if                              [system] { header "__algorithm/ranges_find_if.h" }
 module std_private_algorithm_ranges_find_if_not                          [system] { header "__algorithm/ranges_find_if_not.h" }
+module std_private_algorithm_ranges_find_last                            [system] { header "__algorithm/ranges_find_last.h" }
 module std_private_algorithm_ranges_for_each                             [system] {
   header "__algorithm/ranges_for_each.h"
   export std_private_algorithm_in_fun_result
@@ -1081,7 +1082,10 @@ module std_private_charconv_tables                  [system] { header "__charcon
 module std_private_charconv_to_chars                [system] { header "__charconv/to_chars.h" }
 module std_private_charconv_to_chars_base_10        [system] { header "__charconv/to_chars_base_10.h" }
 module std_private_charconv_to_chars_floating_point [system] { header "__charconv/to_chars_floating_point.h" }
-module std_private_charconv_to_chars_integral       [system] { header "__charconv/to_chars_integral.h" }
+module std_private_charconv_to_chars_integral       [system] {
+  header "__charconv/to_chars_integral.h"
+  export std_private_charconv_traits
+}
 module std_private_charconv_to_chars_result         [system] {
   header "__charconv/to_chars_result.h"
   export *
@@ -1130,6 +1134,7 @@ module std_private_chrono_steady_clock           [system] {
 }
 module std_private_chrono_time_zone              [system] {
   header "__chrono/time_zone.h"
+  export std_private_memory_unique_ptr
 }
 module std_private_chrono_time_zone_link         [system] {
   header "__chrono/time_zone_link.h"
@@ -1480,6 +1485,7 @@ module std_private_math_modulo                          [system] { header "__mat
 module std_private_math_remainder                       [system] { header "__math/remainder.h" }
 module std_private_math_roots                           [system] { header "__math/roots.h" }
 module std_private_math_rounding_functions              [system] { header "__math/rounding_functions.h" }
+module std_private_math_special_functions               [system] { header "__math/special_functions.h" }
 module std_private_math_traits                          [system] { header "__math/traits.h" }
 module std_private_math_trigonometric_functions         [system] { header "__math/trigonometric_functions.h" }
 
@@ -1517,6 +1523,8 @@ module std_private_memory_concepts                        [system] {
 module std_private_memory_construct_at                    [system] { header "__memory/construct_at.h" }
 module std_private_memory_destruct_n                      [system] { header "__memory/destruct_n.h" }
 module std_private_memory_fwd                             [system] { header "__fwd/memory.h" }
+module std_private_memory_inout_ptr                       [system] { header "__memory/inout_ptr.h" }
+module std_private_memory_out_ptr                         [system] { header "__memory/out_ptr.h" }
 module std_private_memory_pointer_traits                  [system] { header "__memory/pointer_traits.h" }
 module std_private_memory_ranges_construct_at             [system] { header "__memory/ranges_construct_at.h" }
 module std_private_memory_ranges_uninitialized_algorithms [system] {
@@ -1924,7 +1932,10 @@ module std_private_type_traits_is_array                                  [system
 module std_private_type_traits_is_assignable                             [system] { header "__type_traits/is_assignable.h" }
 module std_private_type_traits_is_base_of                                [system] { header "__type_traits/is_base_of.h" }
 module std_private_type_traits_is_bounded_array                          [system] { header "__type_traits/is_bounded_array.h" }
-module std_private_type_traits_is_callable                               [system] { header "__type_traits/is_callable.h" }
+module std_private_type_traits_is_callable                               [system] {
+  header "__type_traits/is_callable.h"
+  export std_private_type_traits_integral_constant
+}
 module std_private_type_traits_is_char_like_type                         [system] { header "__type_traits/is_char_like_type.h" }
 module std_private_type_traits_is_class                                  [system] { header "__type_traits/is_class.h" }
 module std_private_type_traits_is_compound                               [system] { header "__type_traits/is_compound.h" }
@@ -1959,7 +1970,10 @@ module std_private_type_traits_is_final                                  [system
 module std_private_type_traits_is_floating_point                         [system] { header "__type_traits/is_floating_point.h" }
 module std_private_type_traits_is_function                               [system] { header "__type_traits/is_function.h" }
 module std_private_type_traits_is_fundamental                            [system] { header "__type_traits/is_fundamental.h" }
-module std_private_type_traits_is_implicitly_default_constructible       [system] { header "__type_traits/is_implicitly_default_constructible.h" }
+module std_private_type_traits_is_implicitly_default_constructible       [system] {
+  header "__type_traits/is_implicitly_default_constructible.h"
+  export std_private_type_traits_integral_constant
+}
 module std_private_type_traits_is_integral                               [system] { header "__type_traits/is_integral.h" }
 module std_private_type_traits_is_literal_type                           [system] { header "__type_traits/is_literal_type.h" }
 module std_private_type_traits_is_member_pointer                         [system] { header "__type_traits/is_member_pointer.h" }
@@ -1999,7 +2013,6 @@ module std_private_type_traits_is_scalar                                 [system
   header "__type_traits/is_scalar.h"
   export std_private_type_traits_is_null_pointer
 }
-module std_private_type_traits_is_scoped_enum                            [system] { header "__type_traits/is_scoped_enum.h" }
 module std_private_type_traits_is_signed                                 [system] { header "__type_traits/is_signed.h" }
 module std_private_type_traits_is_signed_integer                         [system] { header "__type_traits/is_signed_integer.h" }
 module std_private_type_traits_is_specialization                         [system] { header "__type_traits/is_specialization.h" }
diff --git a/libcxx/include/numeric b/libcxx/include/numeric
index 9fb5e9fb1da70..6b92ce3a07123 100644
--- a/libcxx/include/numeric
+++ b/libcxx/include/numeric
@@ -157,31 +157,40 @@ constexpr T saturate_cast(U x) noexcept;                    // freestanding, Sin
 */
 
 #include <__config>
-#include <version>
 
 #include <__numeric/accumulate.h>
 #include <__numeric/adjacent_difference.h>
-#include <__numeric/exclusive_scan.h>
-#include <__numeric/gcd_lcm.h>
-#include <__numeric/inclusive_scan.h>
 #include <__numeric/inner_product.h>
 #include <__numeric/iota.h>
-#include <__numeric/midpoint.h>
 #include <__numeric/partial_sum.h>
-#include <__numeric/reduce.h>
-#include <__numeric/saturation_arithmetic.h>
-#include <__numeric/transform_exclusive_scan.h>
-#include <__numeric/transform_inclusive_scan.h>
-#include <__numeric/transform_reduce.h>
 
 #if _LIBCPP_STD_VER >= 17
+#  include <__numeric/exclusive_scan.h>
+#  include <__numeric/gcd_lcm.h>
+#  include <__numeric/inclusive_scan.h>
 #  include <__numeric/pstl.h>
+#  include <__numeric/reduce.h>
+#  include <__numeric/transform_exclusive_scan.h>
+#  include <__numeric/transform_inclusive_scan.h>
+#  include <__numeric/transform_reduce.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__numeric/midpoint.h>
+#  include <__numeric/saturation_arithmetic.h>
 #endif
 
+#include <version>
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14
+#  include <initializer_list>
+#  include <limits>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <climits>
 #  include <cmath>
diff --git a/libcxx/include/optional b/libcxx/include/optional
index e550745c17c00..f9cbcbfa595d1 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -136,12 +136,12 @@ namespace std {
     void swap(optional &) noexcept(see below ); // constexpr in C++20
 
     // [optional.observe], observers
-    constexpr T const *operator->() const;
-    constexpr T *operator->();
-    constexpr T const &operator*() const &;
-    constexpr T &operator*() &;
-    constexpr T &&operator*() &&;
-    constexpr const T &&operator*() const &&;
+    constexpr T const *operator->() const noexcept;
+    constexpr T *operator->() noexcept;
+    constexpr T const &operator*() const & noexcept;
+    constexpr T &operator*() & noexcept;
+    constexpr T &&operator*() && noexcept;
+    constexpr const T &&operator*() const && noexcept;
     constexpr explicit operator bool() const noexcept;
     constexpr bool has_value() const noexcept;
     constexpr T const &value() const &;
@@ -783,12 +783,12 @@ public:
     }
   }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type const> operator->() const {
+  _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type const> operator->() const noexcept {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value");
     return std::addressof(this->__get());
   }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type> operator->() {
+  _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type> operator->() noexcept {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value");
     return std::addressof(this->__get());
   }
diff --git a/libcxx/include/ostream b/libcxx/include/ostream
index f75110e7d73f7..359d3c0e19c4c 100644
--- a/libcxx/include/ostream
+++ b/libcxx/include/ostream
@@ -173,8 +173,13 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args);
 */
 
 #include <__config>
+
 #include <__ostream/basic_ostream.h>
-#include <__ostream/print.h>
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__ostream/print.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -186,8 +191,10 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args);
 #  include <concepts>
 #  include <cstdio>
 #  include <cstdlib>
+#  include <format>
 #  include <iosfwd>
 #  include <iterator>
+#  include <print>
 #  include <stdexcept>
 #  include <type_traits>
 #endif
diff --git a/libcxx/include/random b/libcxx/include/random
index 9edd6c4608ec2..6cc3760c20e16 100644
--- a/libcxx/include/random
+++ b/libcxx/include/random
@@ -1682,7 +1682,6 @@ class piecewise_linear_distribution
 #include <__random/binomial_distribution.h>
 #include <__random/cauchy_distribution.h>
 #include <__random/chi_squared_distribution.h>
-#include <__random/clamp_to_integral.h>
 #include <__random/default_random_engine.h>
 #include <__random/discard_block_engine.h>
 #include <__random/discrete_distribution.h>
@@ -1694,10 +1693,8 @@ class piecewise_linear_distribution
 #include <__random/geometric_distribution.h>
 #include <__random/independent_bits_engine.h>
 #include <__random/is_seed_sequence.h>
-#include <__random/is_valid.h>
 #include <__random/knuth_b.h>
 #include <__random/linear_congruential_engine.h>
-#include <__random/log2.h>
 #include <__random/lognormal_distribution.h>
 #include <__random/mersenne_twister_engine.h>
 #include <__random/negative_binomial_distribution.h>
diff --git a/libcxx/include/ranges b/libcxx/include/ranges
index 07a525ed8641f..fa35874265de6 100644
--- a/libcxx/include/ranges
+++ b/libcxx/include/ranges
@@ -381,49 +381,56 @@ namespace std {
 */
 
 #include <__config>
-#include <__ranges/access.h>
-#include <__ranges/all.h>
-#include <__ranges/as_rvalue_view.h>
-#include <__ranges/chunk_by_view.h>
-#include <__ranges/common_view.h>
-#include <__ranges/concepts.h>
-#include <__ranges/counted.h>
-#include <__ranges/dangling.h>
-#include <__ranges/data.h>
-#include <__ranges/drop_view.h>
-#include <__ranges/drop_while_view.h>
-#include <__ranges/elements_view.h>
-#include <__ranges/empty.h>
-#include <__ranges/empty_view.h>
-#include <__ranges/enable_borrowed_range.h>
-#include <__ranges/enable_view.h>
-#include <__ranges/filter_view.h>
-#include <__ranges/from_range.h>
-#include <__ranges/iota_view.h>
-#include <__ranges/join_view.h>
-#include <__ranges/lazy_split_view.h>
-#include <__ranges/rbegin.h>
-#include <__ranges/ref_view.h>
-#include <__ranges/rend.h>
-#include <__ranges/repeat_view.h>
-#include <__ranges/reverse_view.h>
-#include <__ranges/single_view.h>
-#include <__ranges/size.h>
-#include <__ranges/split_view.h>
-#include <__ranges/subrange.h>
-#include <__ranges/take_view.h>
-#include <__ranges/take_while_view.h>
-#include <__ranges/to.h>
-#include <__ranges/transform_view.h>
-#include <__ranges/view_interface.h>
-#include <__ranges/views.h>
-#include <__ranges/zip_view.h>
-#include <version>
 
-#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
-#  include <__ranges/istream_view.h>
+#if _LIBCPP_STD_VER >= 20
+#  include <__ranges/access.h>
+#  include <__ranges/all.h>
+#  include <__ranges/common_view.h>
+#  include <__ranges/concepts.h>
+#  include <__ranges/counted.h>
+#  include <__ranges/dangling.h>
+#  include <__ranges/data.h>
+#  include <__ranges/drop_view.h>
+#  include <__ranges/drop_while_view.h>
+#  include <__ranges/elements_view.h>
+#  include <__ranges/empty.h>
+#  include <__ranges/empty_view.h>
+#  include <__ranges/enable_borrowed_range.h>
+#  include <__ranges/enable_view.h>
+#  include <__ranges/filter_view.h>
+#  include <__ranges/iota_view.h>
+#  include <__ranges/join_view.h>
+#  include <__ranges/lazy_split_view.h>
+#  include <__ranges/rbegin.h>
+#  include <__ranges/ref_view.h>
+#  include <__ranges/rend.h>
+#  include <__ranges/reverse_view.h>
+#  include <__ranges/single_view.h>
+#  include <__ranges/size.h>
+#  include <__ranges/split_view.h>
+#  include <__ranges/subrange.h>
+#  include <__ranges/take_view.h>
+#  include <__ranges/take_while_view.h>
+#  include <__ranges/transform_view.h>
+#  include <__ranges/view_interface.h>
+#  include <__ranges/views.h>
+
+#  if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#    include <__ranges/istream_view.h>
+#  endif
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__ranges/as_rvalue_view.h>
+#  include <__ranges/chunk_by_view.h>
+#  include <__ranges/from_range.h>
+#  include <__ranges/repeat_view.h>
+#  include <__ranges/to.h>
+#  include <__ranges/zip_view.h>
 #endif
 
+#include <version>
+
 // standard-mandated includes
 
 // [ranges.syn]
@@ -439,6 +446,14 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
+#  include <cstddef>
+#  include <limits>
+#  include <optional>
+#  include <span>
+#  include <tuple>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cstdlib>
 #  include <iosfwd>
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index 95a4375f21c17..bf6317c587e2f 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -16,7 +16,7 @@
 namespace std {
 
 template<ptrdiff_t least_max_value = implementation-defined>
-class counting_semaphore
+class counting_semaphore                          // since C++20
 {
 public:
 static constexpr ptrdiff_t max() noexcept;
@@ -39,7 +39,7 @@ private:
 ptrdiff_t counter; // exposition only
 };
 
-using binary_semaphore = counting_semaphore<1>;
+using binary_semaphore = counting_semaphore<1>; // since C++20
 
 }
 
@@ -68,7 +68,7 @@ using binary_semaphore = counting_semaphore<1>;
 _LIBCPP_PUSH_MACROS
 #  include <__undef_macros>
 
-#  if _LIBCPP_STD_VER >= 14
+#  if _LIBCPP_STD_VER >= 20
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
@@ -124,7 +124,7 @@ private:
 };
 
 template <ptrdiff_t __least_max_value = _LIBCPP_SEMAPHORE_MAX>
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC counting_semaphore {
+class counting_semaphore {
   __atomic_semaphore_base __semaphore_;
 
 public:
@@ -169,13 +169,11 @@ public:
   }
 };
 
-_LIBCPP_SUPPRESS_DEPRECATED_PUSH
-using binary_semaphore _LIBCPP_DEPRECATED_ATOMIC_SYNC = counting_semaphore<1>;
-_LIBCPP_SUPPRESS_DEPRECATED_POP
+using binary_semaphore = counting_semaphore<1>;
 
 _LIBCPP_END_NAMESPACE_STD
 
-#  endif // _LIBCPP_STD_VER >= 14
+#  endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_POP_MACROS
 
@@ -185,4 +183,4 @@ _LIBCPP_POP_MACROS
 #  include <atomic>
 #endif
 
-#endif //_LIBCPP_SEMAPHORE
+#endif // _LIBCPP_SEMAPHORE
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 9ba43ffeb850f..272d8861d59c1 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -354,9 +354,15 @@ private:
 
 public:
   // [stringbuf.cons] constructors:
-  _LIBCPP_HIDE_FROM_ABI basic_stringbuf() : __hm_(nullptr), __mode_(ios_base::in | ios_base::out) {}
+  _LIBCPP_HIDE_FROM_ABI basic_stringbuf() : __hm_(nullptr), __mode_(ios_base::in | ios_base::out) {
+    // it is implementation-defined whether we initialize eback() & friends to nullptr, and libc++ doesn't
+    __init_buf_ptrs();
+  }
 
-  _LIBCPP_HIDE_FROM_ABI explicit basic_stringbuf(ios_base::openmode __wch) : __hm_(nullptr), __mode_(__wch) {}
+  _LIBCPP_HIDE_FROM_ABI explicit basic_stringbuf(ios_base::openmode __wch) : __hm_(nullptr), __mode_(__wch) {
+    // it is implementation-defined whether we initialize eback() & friends to nullptr, and libc++ doesn't
+    __init_buf_ptrs();
+  }
 
   _LIBCPP_HIDE_FROM_ABI explicit basic_stringbuf(const string_type& __s,
                                                  ios_base::openmode __wch = ios_base::in | ios_base::out)
@@ -369,7 +375,9 @@ public:
       : basic_stringbuf(ios_base::in | ios_base::out, __a) {}
 
   _LIBCPP_HIDE_FROM_ABI basic_stringbuf(ios_base::openmode __wch, const allocator_type& __a)
-      : __str_(__a), __hm_(nullptr), __mode_(__wch) {}
+      : __str_(__a), __hm_(nullptr), __mode_(__wch) {
+    __init_buf_ptrs();
+  }
 
   _LIBCPP_HIDE_FROM_ABI explicit basic_stringbuf(string_type&& __s,
                                                  ios_base::openmode __wch = ios_base::in | ios_base::out)
diff --git a/libcxx/include/stdatomic.h b/libcxx/include/stdatomic.h
index 79772eb7fce1f..0ff14660e144a 100644
--- a/libcxx/include/stdatomic.h
+++ b/libcxx/include/stdatomic.h
@@ -103,6 +103,8 @@ using std::atomic_fetch_sub                            // see below
 using std::atomic_fetch_sub_explicit                   // see below
 using std::atomic_fetch_or                             // see below
 using std::atomic_fetch_or_explicit                    // see below
+using std::atomic_fetch_xor                            // see below
+using std::atomic_fetch_xor_explicit                   // see below
 using std::atomic_fetch_and                            // see below
 using std::atomic_fetch_and_explicit                   // see below
 using std::atomic_flag_test_and_set                    // see below
@@ -204,6 +206,8 @@ using std::atomic_fetch_add_explicit _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_and _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_and_explicit _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_or _LIBCPP_USING_IF_EXISTS;
+using std::atomic_fetch_xor_explicit _LIBCPP_USING_IF_EXISTS;
+using std::atomic_fetch_xor _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_or_explicit _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_sub _LIBCPP_USING_IF_EXISTS;
 using std::atomic_fetch_sub_explicit _LIBCPP_USING_IF_EXISTS;
diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token
index c9c54dfb5a755..d4e651d9541f4 100644
--- a/libcxx/include/stop_token
+++ b/libcxx/include/stop_token
@@ -35,9 +35,12 @@ namespace std {
 
 #if !defined(_LIBCPP_HAS_NO_THREADS)
 
-#  include <__stop_token/stop_callback.h>
-#  include <__stop_token/stop_source.h>
-#  include <__stop_token/stop_token.h>
+#  if _LIBCPP_STD_VER >= 20
+#    include <__stop_token/stop_callback.h>
+#    include <__stop_token/stop_source.h>
+#    include <__stop_token/stop_token.h>
+#  endif
+
 #  include <version>
 
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/string b/libcxx/include/string
index 9a52ab6aef41e..7b0cd828704ba 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -407,6 +407,24 @@ template<class charT, class traits, class Allocator>
 basic_string<charT, traits, Allocator>
 operator+(const basic_string<charT, traits, Allocator>& lhs, charT rhs);                        // constexpr since C++20
 
+template<class charT, class traits, class Allocator>
+  constexpr basic_string<charT, traits, Allocator>
+    operator+(const basic_string<charT, traits, Allocator>& lhs,
+              type_identity_t<basic_string_view<charT, traits>> rhs);                           // Since C++26
+template<class charT, class traits, class Allocator>
+  constexpr basic_string<charT, traits, Allocator>
+    operator+(basic_string<charT, traits, Allocator>&& lhs,
+              type_identity_t<basic_string_view<charT, traits>> rhs);                           // Since C++26
+template<class charT, class traits, class Allocator>
+  constexpr basic_string<charT, traits, Allocator>
+    operator+(type_identity_t<basic_string_view<charT, traits>> lhs,
+              const basic_string<charT, traits, Allocator>& rhs);                               // Since C++26
+template<class charT, class traits, class Allocator>
+  constexpr basic_string<charT, traits, Allocator>
+    operator+(type_identity_t<basic_string_view<charT, traits>> lhs,
+              basic_string<charT, traits, Allocator>&& rhs);                                    // Since C++26
+
+
 template<class charT, class traits, class Allocator>
 bool operator==(const basic_string<charT, traits, Allocator>& lhs,
                 const basic_string<charT, traits, Allocator>& rhs) noexcept;                    // constexpr since C++20
@@ -580,6 +598,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 #include <__functional/unary_function.h>
 #include <__fwd/string.h>
 #include <__ios/fpos.h>
+#include <__iterator/bounded_iter.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/reverse_iterator.h>
@@ -687,6 +706,28 @@ template <class _CharT, class _Traits, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>
 operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y);
 
+#if _LIBCPP_STD_VER >= 26
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs,
+          type_identity_t<basic_string_view<_CharT, _Traits>> __rhs);
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t<basic_string_view<_CharT, _Traits>> __rhs);
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(type_identity_t<basic_string_view<_CharT, _Traits>> __lhs,
+          const basic_string<_CharT, _Traits, _Allocator>& __rhs);
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(type_identity_t<basic_string_view<_CharT, _Traits>> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs);
+
+#endif
+
 extern template _LIBCPP_EXPORTED_FROM_ABI string operator+
     <char, char_traits<char>, allocator<char> >(char const*, string const&);
 
@@ -782,9 +823,16 @@ public:
                 "Allocator::value_type must be same type as value_type");
   static_assert(__check_valid_allocator<allocator_type>::value, "");
 
-  // TODO: Implement iterator bounds checking without requiring the global database.
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
+  // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's
+  // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is
+  // considered contiguous.
+  typedef __bounded_iter<__wrap_iter<pointer>> iterator;
+  typedef __bounded_iter<__wrap_iter<const_pointer>> const_iterator;
+#else
   typedef __wrap_iter<pointer> iterator;
   typedef __wrap_iter<const_pointer> const_iterator;
+#endif
   typedef std::reverse_iterator<iterator> reverse_iterator;
   typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
@@ -900,10 +948,33 @@ private:
     __init_with_sentinel(std::move(__first), std::move(__last));
   }
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iterator(pointer __p) { return iterator(__p); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iterator(pointer __p) {
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
+    // Bound the iterator according to the size (and not the capacity, unlike vector).
+    //
+    // By the Standard, string iterators are generally not guaranteed to stay valid when the container is modified,
+    // regardless of whether reallocation occurs. This allows us to check for out-of-bounds accesses using logical size,
+    // a stricter check, since correct code can never rely on being able to access newly-added elements via an existing
+    // iterator.
+    return std::__make_bounded_iter(
+        std::__wrap_iter<pointer>(__p),
+        std::__wrap_iter<pointer>(__get_pointer()),
+        std::__wrap_iter<pointer>(__get_pointer() + size()));
+#else
+    return iterator(__p);
+#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
+  }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator __make_const_iterator(const_pointer __p) const {
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
+    // Bound the iterator according to the size (and not the capacity, unlike vector).
+    return std::__make_bounded_iter(
+        std::__wrap_iter<const_pointer>(__p),
+        std::__wrap_iter<const_pointer>(__get_pointer()),
+        std::__wrap_iter<const_pointer>(__get_pointer() + size()));
+#else
     return const_iterator(__p);
+#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
   }
 
 public:
@@ -1830,6 +1901,23 @@ private:
   template <class _Iterator, class _Sentinel>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last);
 
+  // Copy [__first, __last) into [__dest, __dest + (__last - __first)). Assumes that the ranges don't overlap.
+  template <class _ForwardIter, class _Sent>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static value_type*
+  __copy_non_overlapping_range(_ForwardIter __first, _Sent __last, value_type* __dest) {
+#ifndef _LIBCPP_CXX03_LANG
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIter>::value &&
+                  is_same<value_type, __iter_value_type<_ForwardIter>>::value && is_same<_ForwardIter, _Sent>::value) {
+      traits_type::copy(__dest, std::__to_address(__first), __last - __first);
+      return __dest + (__last - __first);
+    }
+#endif
+
+    for (; __first != __last; ++__first)
+      traits_type::assign(*__dest++, *__first);
+    return __dest;
+  }
+
   template <class _ForwardIterator, class _Sentinel>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 iterator
   __insert_from_safe_copy(size_type __n, size_type __ip, _ForwardIterator __first, _Sentinel __last) {
@@ -1849,8 +1937,7 @@ private:
     __sz += __n;
     __set_size(__sz);
     traits_type::assign(__p[__sz], value_type());
-    for (__p += __ip; __first != __last; ++__p, ++__first)
-      traits_type::assign(*__p, *__first);
+    __copy_non_overlapping_range(__first, __last, __p + __ip);
 
     return begin() + __ip;
   }
@@ -1927,6 +2014,11 @@ private:
     (void)__old_mid;
     (void)__new_mid;
 #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
+  #if defined(__APPLE__)
+    // TODO: remove after addressing issue #96099 (https://github.com/llvm/llvm-project/issues/96099)
+    if(!__is_long())
+      return;
+  #endif
     std::__annotate_contiguous_container<_Allocator>(data(), data() + capacity() + 1, __old_mid, __new_mid);
 #endif
   }
@@ -2150,6 +2242,14 @@ private:
   friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&);
   friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*);
   friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type);
+#if _LIBCPP_STD_VER >= 26
+  friend constexpr basic_string operator+ <>(const basic_string&, type_identity_t<__self_view>);
+  friend constexpr basic_string operator+ <>(type_identity_t<__self_view>, const basic_string&);
+#endif
+
+  template <class _CharT2, class _Traits2, class _Allocator2>
+  friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
+  operator==(const basic_string<_CharT2, _Traits2, _Allocator2>&, const _CharT2*) _NOEXCEPT;
 };
 
 // These declarations must appear before any functions are implicitly used
@@ -2361,9 +2461,8 @@ basic_string<_CharT, _Traits, _Allocator>::__init_with_size(_InputIterator __fir
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    for (; __first != __last; ++__first, (void)++__p)
-      traits_type::assign(*__p, *__first);
-    traits_type::assign(*__p, value_type());
+    auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__p));
+    traits_type::assign(*__end, value_type());
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
     if (__is_long())
@@ -2829,10 +2928,8 @@ basic_string<_CharT, _Traits, _Allocator>::append(_ForwardIterator __first, _For
       if (__cap - __sz < __n)
         __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0);
       __annotate_increase(__n);
-      pointer __p = __get_pointer() + __sz;
-      for (; __first != __last; ++__p, (void)++__first)
-        traits_type::assign(*__p, *__first);
-      traits_type::assign(*__p, value_type());
+      auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__get_pointer() + __sz));
+      traits_type::assign(*__end, value_type());
       __set_size(__sz + __n);
     } else {
       const basic_string __temp(__first, __last, __alloc());
@@ -3265,23 +3362,34 @@ basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target
     __p        = __get_long_pointer();
   } else {
     if (__target_capacity > __cap) {
+      // Extend
+      // - called from reserve should propagate the exception thrown.
       auto __allocation = std::__allocate_at_least(__alloc(), __target_capacity + 1);
       __new_data        = __allocation.ptr;
       __target_capacity = __allocation.count - 1;
     } else {
+      // Shrink
+      // - called from shrink_to_fit should not throw.
+      // - called from reserve may throw but is not required to.
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
       try {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
         auto __allocation = std::__allocate_at_least(__alloc(), __target_capacity + 1);
+
+        // The Standard mandates shrink_to_fit() does not increase the capacity.
+        // With equal capacity keep the existing buffer. This avoids extra work
+        // due to swapping the elements.
+        if (__allocation.count - 1 > __target_capacity) {
+          __alloc_traits::deallocate(__alloc(), __allocation.ptr, __allocation.count);
+          __annotate_new(__sz); // Undoes the __annotate_delete()
+          return;
+        }
         __new_data        = __allocation.ptr;
         __target_capacity = __allocation.count - 1;
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
       } catch (...) {
         return;
       }
-#else  // _LIBCPP_HAS_NO_EXCEPTIONS
-      if (__new_data == nullptr)
-        return;
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
     }
     __begin_lifetime(__new_data, __target_capacity + 1);
@@ -3751,16 +3859,18 @@ operator==(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>&
 template <class _CharT, class _Traits, class _Allocator>
 inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
 operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) _NOEXCEPT {
-#if _LIBCPP_STD_VER >= 20
-  return basic_string_view<_CharT, _Traits>(__lhs) == basic_string_view<_CharT, _Traits>(__rhs);
-#else
-  typedef basic_string<_CharT, _Traits, _Allocator> _String;
   _LIBCPP_ASSERT_NON_NULL(__rhs != nullptr, "operator==(basic_string, char*): received nullptr");
+
+  using _String = basic_string<_CharT, _Traits, _Allocator>;
+
   size_t __rhs_len = _Traits::length(__rhs);
+  if (__builtin_constant_p(__rhs_len) && !_String::__fits_in_sso(__rhs_len)) {
+    if (!__lhs.__is_long())
+      return false;
+  }
   if (__rhs_len != __lhs.size())
     return false;
   return __lhs.compare(0, _String::npos, __rhs, __rhs_len) == 0;
-#endif
 }
 
 #if _LIBCPP_STD_VER >= 20
@@ -4007,6 +4117,60 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs) {
 
 #endif // _LIBCPP_CXX03_LANG
 
+#if _LIBCPP_STD_VER >= 26
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs,
+          type_identity_t<basic_string_view<_CharT, _Traits>> __rhs) {
+  using _String                        = basic_string<_CharT, _Traits, _Allocator>;
+  typename _String::size_type __lhs_sz = __lhs.size();
+  typename _String::size_type __rhs_sz = __rhs.size();
+  _String __r(__uninitialized_size_tag(),
+              __lhs_sz + __rhs_sz,
+              _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator()));
+  auto __ptr = std::__to_address(__r.__get_pointer());
+  _Traits::copy(__ptr, __lhs.data(), __lhs_sz);
+  _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz);
+  _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT());
+  return __r;
+}
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs,
+          type_identity_t<basic_string_view<_CharT, _Traits>> __rhs) {
+  __lhs.append(__rhs);
+  return std::move(__lhs);
+}
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(type_identity_t<basic_string_view<_CharT, _Traits>> __lhs,
+          const basic_string<_CharT, _Traits, _Allocator>& __rhs) {
+  using _String                        = basic_string<_CharT, _Traits, _Allocator>;
+  typename _String::size_type __lhs_sz = __lhs.size();
+  typename _String::size_type __rhs_sz = __rhs.size();
+  _String __r(__uninitialized_size_tag(),
+              __lhs_sz + __rhs_sz,
+              _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator()));
+  auto __ptr = std::__to_address(__r.__get_pointer());
+  _Traits::copy(__ptr, __lhs.data(), __lhs_sz);
+  _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz);
+  _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT());
+  return __r;
+}
+
+template <class _CharT, class _Traits, class _Allocator>
+_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator>
+operator+(type_identity_t<basic_string_view<_CharT, _Traits>> __lhs,
+          basic_string<_CharT, _Traits, _Allocator>&& __rhs) {
+  __rhs.insert(0, __lhs);
+  return std::move(__rhs);
+}
+
+#endif // _LIBCPP_STD_VER >= 26
+
 // swap
 
 template <class _CharT, class _Traits, class _Allocator>
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index b2a4db4e7519a..2a03ee99e9ab5 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -210,6 +210,7 @@ namespace std {
 #include <__config>
 #include <__functional/hash.h>
 #include <__functional/unary_function.h>
+#include <__fwd/ostream.h>
 #include <__fwd/string_view.h>
 #include <__iterator/bounded_iter.h>
 #include <__iterator/concepts.h>
@@ -447,8 +448,11 @@ public:
   }
 
   _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view substr(size_type __pos = 0, size_type __n = npos) const {
+    // Use the `__assume_valid` form of the constructor to avoid an unnecessary check. Any substring of a view is a
+    // valid view. In particular, `size()` is known to be smaller than `numeric_limits<difference_type>::max()`, so the
+    // new size is also smaller. See also https://github.com/llvm/llvm-project/issues/91634.
     return __pos > size() ? (__throw_out_of_range("string_view::substr"), basic_string_view())
-                          : basic_string_view(data() + __pos, std::min(__n, size() - __pos));
+                          : basic_string_view(__assume_valid(), data() + __pos, std::min(__n, size() - __pos));
   }
 
   _LIBCPP_CONSTEXPR_SINCE_CXX14 int compare(basic_string_view __sv) const _NOEXCEPT {
@@ -673,6 +677,16 @@ public:
 #endif
 
 private:
+  struct __assume_valid {};
+
+  // This is the same as the pointer and length constructor, but without the additional hardening checks. It is intended
+  // for use within the class, when the class invariants already guarantee the resulting object is valid. The compiler
+  // usually cannot eliminate the redundant checks because it does not know class invariants.
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI
+  basic_string_view(__assume_valid, const _CharT* __s, size_type __len) _NOEXCEPT
+      : __data_(__s),
+        __size_(__len) {}
+
   const value_type* __data_;
   size_type __size_;
 };
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index a77ddadafb681..7f231cd09df51 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -428,97 +428,92 @@ namespace std
 #include <__type_traits/aligned_storage.h>
 #include <__type_traits/aligned_union.h>
 #include <__type_traits/alignment_of.h>
-#include <__type_traits/can_extract_key.h>
-#include <__type_traits/common_reference.h>
 #include <__type_traits/common_type.h>
 #include <__type_traits/conditional.h>
-#include <__type_traits/conjunction.h>
 #include <__type_traits/decay.h>
-#include <__type_traits/dependent_type.h>
-#include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/extent.h>
-#include <__type_traits/has_unique_object_representation.h>
 #include <__type_traits/has_virtual_destructor.h>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/invoke.h>
 #include <__type_traits/is_abstract.h>
-#include <__type_traits/is_aggregate.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_base_of.h>
-#include <__type_traits/is_bounded_array.h>
-#include <__type_traits/is_callable.h>
-#include <__type_traits/is_char_like_type.h>
 #include <__type_traits/is_class.h>
 #include <__type_traits/is_compound.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
 #include <__type_traits/is_destructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_enum.h>
-#include <__type_traits/is_final.h>
 #include <__type_traits/is_floating_point.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_fundamental.h>
-#include <__type_traits/is_implicitly_default_constructible.h>
 #include <__type_traits/is_integral.h>
 #include <__type_traits/is_literal_type.h>
 #include <__type_traits/is_member_pointer.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_convertible.h>
 #include <__type_traits/is_nothrow_destructible.h>
-#include <__type_traits/is_null_pointer.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/is_pod.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_polymorphic.h>
 #include <__type_traits/is_reference.h>
-#include <__type_traits/is_reference_wrapper.h>
-#include <__type_traits/is_referenceable.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_scalar.h>
-#include <__type_traits/is_scoped_enum.h>
 #include <__type_traits/is_signed.h>
-#include <__type_traits/is_specialization.h>
 #include <__type_traits/is_standard_layout.h>
-#include <__type_traits/is_swappable.h>
 #include <__type_traits/is_trivial.h>
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/is_union.h>
 #include <__type_traits/is_unsigned.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/is_volatile.h>
-#include <__type_traits/make_const_lvalue_ref.h>
 #include <__type_traits/make_signed.h>
 #include <__type_traits/make_unsigned.h>
-#include <__type_traits/maybe_const.h>
-#include <__type_traits/negation.h>
 #include <__type_traits/rank.h>
 #include <__type_traits/remove_all_extents.h>
 #include <__type_traits/remove_const.h>
-#include <__type_traits/remove_const_ref.h>
 #include <__type_traits/remove_cv.h>
 #include <__type_traits/remove_extent.h>
 #include <__type_traits/remove_pointer.h>
 #include <__type_traits/remove_reference.h>
 #include <__type_traits/remove_volatile.h>
 #include <__type_traits/result_of.h>
-#include <__type_traits/type_identity.h>
 #include <__type_traits/underlying_type.h>
-#include <__type_traits/unwrap_ref.h>
-#include <__type_traits/void_t.h>
-#include <__utility/declval.h>
-#include <cstddef>
-#include <cstdint>
+
+#if _LIBCPP_STD_VER >= 14
+#  include <__type_traits/is_final.h>
+#  include <__type_traits/is_null_pointer.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__type_traits/conjunction.h>
+#  include <__type_traits/disjunction.h>
+#  include <__type_traits/has_unique_object_representation.h>
+#  include <__type_traits/invoke.h>
+#  include <__type_traits/is_aggregate.h>
+#  include <__type_traits/is_swappable.h>
+#  include <__type_traits/negation.h>
+#  include <__type_traits/void_t.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__type_traits/common_reference.h>
+#  include <__type_traits/is_bounded_array.h>
+#  include <__type_traits/is_constant_evaluated.h>
+#  include <__type_traits/is_nothrow_convertible.h>
+#  include <__type_traits/is_unbounded_array.h>
+#  include <__type_traits/type_identity.h>
+#  include <__type_traits/unwrap_ref.h>
+#endif
+
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo
index d1c0de3c1bfdd..2727cad02fa99 100644
--- a/libcxx/include/typeinfo
+++ b/libcxx/include/typeinfo
@@ -275,13 +275,14 @@ struct __type_info_implementations {
           __impl;
 };
 
-#    if defined(__arm64__) && __has_cpp_attribute(clang::ptrauth_vtable_pointer)
-#      if __has_feature(ptrauth_type_info_discriminated_vtable_pointer)
+#    if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__)
+#      if __has_feature(ptrauth_type_info_vtable_pointer_discrimination)
 #        define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH                                                                  \
-          [[clang::ptrauth_vtable_pointer(process_independent, address_discrimination, type_discrimination)]]
+          [[_Clang::__ptrauth_vtable_pointer__(process_independent, address_discrimination, type_discrimination)]]
 #      else
 #        define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH                                                                  \
-          [[clang::ptrauth_vtable_pointer(process_independent, no_address_discrimination, no_extra_discrimination)]]
+          [[_Clang::__ptrauth_vtable_pointer__(                                                                        \
+              process_independent, no_address_discrimination, no_extra_discrimination)]]
 #      endif
 #    else
 #      define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH
diff --git a/libcxx/include/utility b/libcxx/include/utility
index f2f0052df2755..f97907fbf72e9 100644
--- a/libcxx/include/utility
+++ b/libcxx/include/utility
@@ -247,25 +247,35 @@ template <class T>
 */
 
 #include <__config>
-#include <__utility/as_const.h>
-#include <__utility/as_lvalue.h>
-#include <__utility/auto_cast.h>
-#include <__utility/cmp.h>
+
 #include <__utility/declval.h>
-#include <__utility/exception_guard.h>
-#include <__utility/exchange.h>
 #include <__utility/forward.h>
-#include <__utility/forward_like.h>
-#include <__utility/in_place.h>
-#include <__utility/integer_sequence.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/piecewise_construct.h>
-#include <__utility/priority_tag.h>
 #include <__utility/rel_ops.h>
 #include <__utility/swap.h>
-#include <__utility/to_underlying.h>
-#include <__utility/unreachable.h>
+
+#if _LIBCPP_STD_VER >= 14
+#  include <__utility/exchange.h>
+#  include <__utility/integer_sequence.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+#  include <__utility/as_const.h>
+#  include <__utility/in_place.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+#  include <__utility/cmp.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+#  include <__utility/forward_like.h>
+#  include <__utility/to_underlying.h>
+#  include <__utility/unreachable.h>
+#endif
+
 #include <version>
 
 // standard-mandated includes
@@ -286,6 +296,10 @@ template <class T>
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <limits>
+#endif
+
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cstdlib>
 #  include <iosfwd>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index aaf51d18fe30f..3aa23d8fc1e24 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -170,7 +170,7 @@ public:
 
     vector()
         noexcept(is_nothrow_default_constructible<allocator_type>::value);
-    explicit vector(const allocator_type&);
+    explicit vector(const allocator_type&) noexcept;
     explicit vector(size_type n, const allocator_type& a = allocator_type()); // C++14
     vector(size_type n, const value_type& value, const allocator_type& = allocator_type());
     template <class InputIterator>
@@ -178,8 +178,7 @@ public:
     template<container-compatible-range<bool> R>
       constexpr vector(from_range_t, R&& rg, const Allocator& = Allocator());
     vector(const vector& x);
-    vector(vector&& x)
-        noexcept(is_nothrow_move_constructible<allocator_type>::value);
+    vector(vector&& x) noexcept;
     vector(initializer_list<value_type> il);
     vector(initializer_list<value_type> il, const allocator_type& a);
     ~vector();
@@ -327,6 +326,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__functional/unary_function.h>
 #include <__fwd/vector.h>
 #include <__iterator/advance.h>
+#include <__iterator/bounded_iter.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/reverse_iterator.h>
@@ -400,9 +400,16 @@ public:
   typedef typename __alloc_traits::difference_type difference_type;
   typedef typename __alloc_traits::pointer pointer;
   typedef typename __alloc_traits::const_pointer const_pointer;
-  // TODO: Implement iterator bounds checking without requiring the global database.
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
+  // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's
+  // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is
+  // considered contiguous.
+  typedef __bounded_iter<__wrap_iter<pointer>> iterator;
+  typedef __bounded_iter<__wrap_iter<const_pointer>> const_iterator;
+#else
   typedef __wrap_iter<pointer> iterator;
   typedef __wrap_iter<const_pointer> const_iterator;
+#endif
   typedef std::reverse_iterator<iterator> reverse_iterator;
   typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
@@ -425,7 +432,7 @@ public:
 #if _LIBCPP_STD_VER <= 14
       _NOEXCEPT_(is_nothrow_copy_constructible<allocator_type>::value)
 #else
-      _NOEXCEPT
+      noexcept
 #endif
       : __end_cap_(nullptr, __a) {
   }
@@ -827,12 +834,39 @@ private:
 
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n);
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x);
+
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT {
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
+    // Bound the iterator according to the capacity, rather than the size.
+    //
+    // Vector guarantees that iterators stay valid as long as no reallocation occurs even if new elements are inserted
+    // into the container; for these cases, we need to make sure that the newly-inserted elements can be accessed
+    // through the bounded iterator without failing checks. The downside is that the bounded iterator won't catch
+    // access that is logically out-of-bounds, i.e., goes beyond the size, but is still within the capacity. With the
+    // current implementation, there is no connection between a bounded iterator and its associated container, so we
+    // don't have a way to update existing valid iterators when the container is resized and thus have to go with
+    // a laxer approach.
+    return std::__make_bounded_iter(
+        std::__wrap_iter<pointer>(__p),
+        std::__wrap_iter<pointer>(this->__begin_),
+        std::__wrap_iter<pointer>(this->__end_cap()));
+#else
     return iterator(__p);
+#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
   }
+
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(const_pointer __p) const _NOEXCEPT {
+#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
+    // Bound the iterator according to the capacity, rather than the size.
+    return std::__make_bounded_iter(
+        std::__wrap_iter<const_pointer>(__p),
+        std::__wrap_iter<const_pointer>(this->__begin_),
+        std::__wrap_iter<const_pointer>(this->__end_cap()));
+#else
     return const_iterator(__p);
+#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
   }
+
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
   __swap_out_circular_buffer(__split_buffer<value_type, allocator_type&>& __v);
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer
@@ -1443,7 +1477,11 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOE
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
       allocator_type& __a = this->__alloc();
       __split_buffer<value_type, allocator_type&> __v(size(), size(), __a);
-      __swap_out_circular_buffer(__v);
+      // The Standard mandates shrink_to_fit() does not increase the capacity.
+      // With equal capacity keep the existing buffer. This avoids extra work
+      // due to swapping the elements.
+      if (__v.capacity() < capacity())
+        __swap_out_circular_buffer(__v);
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
     } catch (...) {
     }
diff --git a/libcxx/include/version b/libcxx/include/version
index c971336bcb85c..fe64343eafbc9 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -190,6 +190,7 @@ __cpp_lib_ranges_chunk                                  202202L <ranges>
 __cpp_lib_ranges_chunk_by                               202202L <ranges>
 __cpp_lib_ranges_concat                                 202403L <ranges>
 __cpp_lib_ranges_contains                               202207L <algorithm>
+__cpp_lib_ranges_find_last                              202207L <algorithm>
 __cpp_lib_ranges_iota                                   202202L <numeric>
 __cpp_lib_ranges_join_with                              202202L <ranges>
 __cpp_lib_ranges_repeat                                 202207L <ranges>
@@ -231,12 +232,13 @@ __cpp_lib_stdatomic_h                                   202011L <stdatomic.h>
 __cpp_lib_string_contains                               202011L <string> <string_view>
 __cpp_lib_string_resize_and_overwrite                   202110L <string>
 __cpp_lib_string_udls                                   201304L <string>
-__cpp_lib_string_view                                   201803L <string> <string_view>
+__cpp_lib_string_view                                   202403L <string> <string_view>
+                                                        201803L // C++20
                                                         201606L // C++17
 __cpp_lib_submdspan                                     202306L <mdspan>
 __cpp_lib_syncbuf                                       201803L <syncstream>
 __cpp_lib_text_encoding                                 202306L <text_encoding>
-__cpp_lib_three_way_comparison                          201711L <compare>
+__cpp_lib_three_way_comparison                          201907L <compare>
 __cpp_lib_to_address                                    201711L <memory>
 __cpp_lib_to_array                                      201907L <array>
 __cpp_lib_to_chars                                      202306L <charconv>
@@ -444,7 +446,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_SYNCSTREAM)
 #   define __cpp_lib_syncbuf                            201803L
 # endif
-# define __cpp_lib_three_way_comparison                 201711L
+# define __cpp_lib_three_way_comparison                 201907L
 # define __cpp_lib_to_address                           201711L
 # define __cpp_lib_to_array                             201907L
 # define __cpp_lib_type_identity                        201806L
@@ -476,13 +478,14 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_move_only_function                   202110L
 # undef  __cpp_lib_optional
 # define __cpp_lib_optional                             202110L
-// # define __cpp_lib_out_ptr                              202106L
+# define __cpp_lib_out_ptr                              202106L
 # define __cpp_lib_print                                202207L
 // # define __cpp_lib_ranges_as_const                      202207L
 # define __cpp_lib_ranges_as_rvalue                     202207L
 // # define __cpp_lib_ranges_chunk                         202202L
 # define __cpp_lib_ranges_chunk_by                      202202L
 # define __cpp_lib_ranges_contains                      202207L
+# define __cpp_lib_ranges_find_last                     202207L
 // # define __cpp_lib_ranges_iota                          202202L
 // # define __cpp_lib_ranges_join_with                     202202L
 # define __cpp_lib_ranges_repeat                        202207L
@@ -535,7 +538,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_mdspan                               202406L
 // # define __cpp_lib_optional_range_support               202406L
 # undef  __cpp_lib_out_ptr
-// # define __cpp_lib_out_ptr                              202311L
+# define __cpp_lib_out_ptr                              202311L
 // # define __cpp_lib_philox_engine                        202406L
 // # define __cpp_lib_ranges_concat                        202403L
 # define __cpp_lib_ratio                                202306L
@@ -547,6 +550,8 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_span_at                              202311L
 # define __cpp_lib_span_initializer_list                202311L
 # define __cpp_lib_sstream_from_string_view             202306L
+# undef  __cpp_lib_string_view
+# define __cpp_lib_string_view                          202403L
 // # define __cpp_lib_submdspan                            202306L
 // # define __cpp_lib_text_encoding                        202306L
 # undef  __cpp_lib_to_chars
diff --git a/libcxx/modules/std/algorithm.inc b/libcxx/modules/std/algorithm.inc
index e7796bfa26af8..3c2139cd64ee4 100644
--- a/libcxx/modules/std/algorithm.inc
+++ b/libcxx/modules/std/algorithm.inc
@@ -77,13 +77,14 @@ export namespace std {
     using std::ranges::find_if_not;
   } // namespace ranges
 
+#if _LIBCPP_STD_VER >= 23
+  // [alg.find.last], find last
   namespace ranges {
-#if 0
     using std::ranges::find_last;
     using std::ranges::find_last_if;
     using std::ranges::find_last_if_not;
-#endif
   } // namespace ranges
+#endif
 
   // [alg.find.end], find end
   using std::find_end;
diff --git a/libcxx/modules/std/cmath.inc b/libcxx/modules/std/cmath.inc
index a463c1e3ccf86..fe8ac773c9d1c 100644
--- a/libcxx/modules/std/cmath.inc
+++ b/libcxx/modules/std/cmath.inc
@@ -334,12 +334,14 @@ export namespace std {
   using std::expint;
   using std::expintf;
   using std::expintl;
+#endif
 
   // [sf.cmath.hermite], Hermite polynomials
   using std::hermite;
   using std::hermitef;
   using std::hermitel;
 
+#if 0
   // [sf.cmath.laguerre], Laguerre polynomials
   using std::laguerre;
   using std::laguerref;
diff --git a/libcxx/modules/std/format.inc b/libcxx/modules/std/format.inc
index 743a43811005a..09aa03ad73e38 100644
--- a/libcxx/modules/std/format.inc
+++ b/libcxx/modules/std/format.inc
@@ -46,6 +46,8 @@ export namespace std {
   using std::formatter;
 
 #if _LIBCPP_STD_VER >= 23
+  using std::enable_nonlocking_formatter_optimization;
+
   // [format.formattable], concept formattable
   using std::formattable;
 #endif
diff --git a/libcxx/modules/std/memory.inc b/libcxx/modules/std/memory.inc
index 56c621c0cf17f..c15f37a21239c 100644
--- a/libcxx/modules/std/memory.inc
+++ b/libcxx/modules/std/memory.inc
@@ -177,17 +177,19 @@ export namespace std {
   // [util.smartptr.atomic], atomic smart pointers
   // using std::atomic;
 
+#if _LIBCPP_STD_VER >= 23
   // [out.ptr.t], class template out_ptr_t
-  //  using std::out_ptr_t;
+  using std::out_ptr_t;
 
   // [out.ptr], function template out_ptr
-  //  using std::out_ptr;
+  using std::out_ptr;
 
   // [inout.ptr.t], class template inout_ptr_t
-  //  using std::inout_ptr_t;
+  using std::inout_ptr_t;
 
   // [inout.ptr], function template inout_ptr
-  //  using std::inout_ptr;
+  using std::inout_ptr;
+#endif // _LIBCPP_STD_VER >= 23
 
 #ifndef _LIBCPP_HAS_NO_THREADS
   // [depr.util.smartptr.shared.atomic]
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 0ae58a10c879c..fe9d2666fa4ca 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -143,29 +143,10 @@ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS)
     )
 endif()
 
-# Add all the headers to the project for IDEs.
-if (LIBCXX_CONFIGURE_IDE)
-  file(GLOB_RECURSE LIBCXX_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/*)
-  if(WIN32)
-    file( GLOB LIBCXX_WIN32_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/__support/win32/*.h)
-    list(APPEND LIBCXX_HEADERS ${LIBCXX_WIN32_HEADERS})
-  endif()
-  # Force them all into the headers dir on MSVC, otherwise they end up at
-  # project scope because they don't have extensions.
-  if (MSVC_IDE)
-    source_group("Header Files" FILES ${LIBCXX_HEADERS})
-  endif()
-endif()
-
 if(NOT LIBCXX_INSTALL_LIBRARY)
   set(exclude_from_all EXCLUDE_FROM_ALL)
 endif()
 
-if (LIBCXX_GENERATE_COVERAGE AND NOT LIBCXX_COVERAGE_LIBRARY)
-  find_compiler_rt_library(profile LIBCXX_COVERAGE_LIBRARY)
-endif()
-add_library_flags_if(LIBCXX_COVERAGE_LIBRARY "${LIBCXX_COVERAGE_LIBRARY}")
-
 if (APPLE AND LLVM_USE_SANITIZER)
   if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR
       ("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR
@@ -199,7 +180,7 @@ split_list(LIBCXX_LINK_FLAGS)
 if (LIBCXX_ENABLE_SHARED)
   add_library(cxx_shared SHARED ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS})
   target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-  target_link_libraries(cxx_shared PUBLIC cxx-headers
+  target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared
                                    PRIVATE ${LIBCXX_LIBRARIES})
   set_target_properties(cxx_shared
     PROPERTIES
@@ -292,7 +273,7 @@ set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
 if (LIBCXX_ENABLE_STATIC)
   add_library(cxx_static STATIC ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS})
   target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-  target_link_libraries(cxx_static PUBLIC cxx-headers
+  target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static
                                    PRIVATE ${LIBCXX_LIBRARIES}
                                    PRIVATE libcxx-abi-static)
   set_target_properties(cxx_static
diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h
index e71e4f104b290..c7639f56eee26 100644
--- a/libcxx/src/include/overridable_function.h
+++ b/libcxx/src/include/overridable_function.h
@@ -13,7 +13,7 @@
 #include <__config>
 #include <cstdint>
 
-#if defined(__arm64e__) && __has_feature(ptrauth_calls)
+#if __has_feature(ptrauth_calls)
 #  include <ptrauth.h>
 #endif
 
@@ -83,13 +83,13 @@ _LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) no
   uintptr_t __end   = reinterpret_cast<uintptr_t>(&__lcxx_override_end);
   uintptr_t __ptr   = reinterpret_cast<uintptr_t>(__fptr);
 
-#if defined(__arm64e__) && __has_feature(ptrauth_calls)
+#  if __has_feature(ptrauth_calls)
   // We must pass a void* to ptrauth_strip since it only accepts a pointer type. Also, in particular,
   // we must NOT pass a function pointer, otherwise we will strip the function pointer, and then attempt
   // to authenticate and re-sign it when casting it to a uintptr_t again, which will fail because we just
   // stripped the function pointer. See rdar://122927845.
   __ptr = reinterpret_cast<uintptr_t>(ptrauth_strip(reinterpret_cast<void*>(__ptr), ptrauth_key_function_pointer));
-#endif
+#  endif
 
   // Finally, the function was overridden if it falls outside of the section's bounds.
   return __ptr < __start || __ptr > __end;
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index cdd1c2d90fbcf..b25712b0e363b 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,10 +1,8 @@
 include(HandleLitArguments)
 add_subdirectory(tools)
 
-# By default, libcxx and libcxxabi share a library directory.
-if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
-  set(LIBCXX_CXX_ABI_LIBRARY_PATH "${LIBCXX_LIBRARY_DIR}" CACHE PATH
-      "The path to libc++abi library.")
+if (LIBCXX_INCLUDE_BENCHMARKS)
+  add_subdirectory(benchmarks)
 endif()
 
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
@@ -34,56 +32,18 @@ endif()
 
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
-if (LIBCXX_INCLUDE_TESTS)
-  include(AddLLVM) # for configure_lit_site_cfg and add_lit_testsuite
+include(AddLLVM) # for configure_lit_site_cfg and add_lit_testsuite
 
-  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
-                 "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
-                 @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
+                "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
+                @ONLY)
 
-  configure_lit_site_cfg(
-    "${LIBCXX_TEST_CONFIG}"
-    ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
-    MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
+configure_lit_site_cfg(
+  "${LIBCXX_TEST_CONFIG}"
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
+  MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
 
-  add_lit_testsuite(check-cxx
-    "Running libcxx tests"
-    ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS cxx-test-depends)
-
-endif()
-
-if (LIBCXX_GENERATE_COVERAGE)
-  include(CodeCoverage)
-  set(output_dir "${CMAKE_CURRENT_BINARY_DIR}/coverage")
-  set(capture_dirs
-      "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_objects.dir/"
-      "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx.dir/"
-      "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_experimental.dir/"
-      "${CMAKE_CURRENT_BINARY_DIR}")
-  set(extract_dirs "${LIBCXX_SOURCE_DIR}/include;${LIBCXX_SOURCE_DIR}/src")
-  setup_lcov_test_target_coverage("cxx" "${output_dir}" "${capture_dirs}" "${extract_dirs}")
-endif()
-
-if (LIBCXX_CONFIGURE_IDE)
-  # Create dummy targets for each of the tests in the test suite, this allows
-  # IDE's such as CLion to correctly highlight the tests because it knows
-  # roughly what include paths/compile flags/macro definitions are needed.
-  include_directories(support)
-  file(GLOB_RECURSE LIBCXX_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/*.pass.cpp)
-  file(GLOB LIBCXX_TEST_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/support/*)
-  file(GLOB_RECURSE LIBCXX_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/*)
-  add_executable(libcxx_test_objects EXCLUDE_FROM_ALL
-          ${LIBCXX_TESTS} ${LIBCXX_TEST_HEADERS} ${LIBCXX_HEADERS})
-  add_dependencies(libcxx_test_objects cxx)
-
-  split_list(LIBCXX_COMPILE_FLAGS)
-  split_list(LIBCXX_LINK_FLAGS)
-
-  set_target_properties(libcxx_test_objects
-          PROPERTIES
-            COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}"
-            LINK_FLAGS "${LIBCXX_LINK_FLAGS}"
-            EXCLUDE_FROM_ALL ON
-  )
-endif()
+add_lit_testsuite(check-cxx
+  "Running libcxx tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS cxx-test-depends)
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/test/benchmarks/CMakeLists.txt
similarity index 97%
rename from libcxx/benchmarks/CMakeLists.txt
rename to libcxx/test/benchmarks/CMakeLists.txt
index 110672600213a..d61367a367738 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/test/benchmarks/CMakeLists.txt
@@ -5,8 +5,6 @@ include(CheckCXXCompilerFlag)
 # Build Google Benchmark
 #==============================================================================
 
-set(CMAKE_FOLDER "${CMAKE_FOLDER}/Benchmarks")
-
 set(BENCHMARK_COMPILE_FLAGS
     -Wno-unused-command-line-argument
     -nostdinc++
@@ -135,6 +133,7 @@ set(BENCHMARK_TESTS
     algorithms/ranges_sort.bench.cpp
     algorithms/ranges_sort_heap.bench.cpp
     algorithms/ranges_stable_sort.bench.cpp
+    algorithms/set_intersection.bench.cpp
     algorithms/sort.bench.cpp
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
@@ -190,6 +189,10 @@ endforeach()
 if (LIBCXX_INCLUDE_TESTS)
   include(AddLLVM)
 
+  configure_lit_site_cfg(
+          ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py.in
+          ${CMAKE_CURRENT_BINARY_DIR}/lit.cfg.py)
+
   configure_lit_site_cfg(
           ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
           ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py)
diff --git a/libcxx/benchmarks/CartesianBenchmarks.h b/libcxx/test/benchmarks/CartesianBenchmarks.h
similarity index 100%
rename from libcxx/benchmarks/CartesianBenchmarks.h
rename to libcxx/test/benchmarks/CartesianBenchmarks.h
diff --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h
similarity index 100%
rename from libcxx/benchmarks/ContainerBenchmarks.h
rename to libcxx/test/benchmarks/ContainerBenchmarks.h
diff --git a/libcxx/benchmarks/GenerateInput.h b/libcxx/test/benchmarks/GenerateInput.h
similarity index 91%
rename from libcxx/benchmarks/GenerateInput.h
rename to libcxx/test/benchmarks/GenerateInput.h
index 5710b4e1418ba..cc1694311473e 100644
--- a/libcxx/benchmarks/GenerateInput.h
+++ b/libcxx/test/benchmarks/GenerateInput.h
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #ifndef BENCHMARK_GENERATE_INPUT_H
 #define BENCHMARK_GENERATE_INPUT_H
 
diff --git a/libcxx/benchmarks/Utilities.h b/libcxx/test/benchmarks/Utilities.h
similarity index 100%
rename from libcxx/benchmarks/Utilities.h
rename to libcxx/test/benchmarks/Utilities.h
diff --git a/libcxx/benchmarks/VariantBenchmarks.h b/libcxx/test/benchmarks/VariantBenchmarks.h
similarity index 100%
rename from libcxx/benchmarks/VariantBenchmarks.h
rename to libcxx/test/benchmarks/VariantBenchmarks.h
diff --git a/libcxx/benchmarks/algorithms.partition_point.bench.cpp b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
similarity index 89%
rename from libcxx/benchmarks/algorithms.partition_point.bench.cpp
rename to libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
index 77115175fa540..ed2e337fa6ea3 100644
--- a/libcxx/benchmarks/algorithms.partition_point.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <algorithm>
 #include <array>
 #include <cassert>
diff --git a/libcxx/benchmarks/algorithms/common.h b/libcxx/test/benchmarks/algorithms/common.h
similarity index 100%
rename from libcxx/benchmarks/algorithms/common.h
rename to libcxx/test/benchmarks/algorithms/common.h
diff --git a/libcxx/benchmarks/algorithms/count.bench.cpp b/libcxx/test/benchmarks/algorithms/count.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/count.bench.cpp
rename to libcxx/test/benchmarks/algorithms/count.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/equal.bench.cpp b/libcxx/test/benchmarks/algorithms/equal.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/equal.bench.cpp
rename to libcxx/test/benchmarks/algorithms/equal.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/fill.bench.cpp
rename to libcxx/test/benchmarks/algorithms/fill.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/find.bench.cpp b/libcxx/test/benchmarks/algorithms/find.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/find.bench.cpp
rename to libcxx/test/benchmarks/algorithms/find.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/for_each.bench.cpp
rename to libcxx/test/benchmarks/algorithms/for_each.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/lower_bound.bench.cpp b/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/lower_bound.bench.cpp
rename to libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/make_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/min.bench.cpp b/libcxx/test/benchmarks/algorithms/min.bench.cpp
similarity index 80%
rename from libcxx/benchmarks/algorithms/min.bench.cpp
rename to libcxx/test/benchmarks/algorithms/min.bench.cpp
index 1e1dd4e71c2ef..a09bd530d0120 100644
--- a/libcxx/benchmarks/algorithms/min.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <algorithm>
 #include <cassert>
 
diff --git a/libcxx/benchmarks/algorithms/min_max_element.bench.cpp b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/min_max_element.bench.cpp
rename to libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/minmax.bench.cpp b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
similarity index 80%
rename from libcxx/benchmarks/algorithms/minmax.bench.cpp
rename to libcxx/test/benchmarks/algorithms/minmax.bench.cpp
index b0ff7f91c1993..ca1cdb441deef 100644
--- a/libcxx/benchmarks/algorithms/minmax.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <algorithm>
 #include <cassert>
 
diff --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/mismatch.bench.cpp
rename to libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/pop_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp
rename to libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/push_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_contains.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_contains.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_ends_with.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_ends_with.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_make_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_pop_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_push_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_sort.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_sort_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/ranges_stable_sort.bench.cpp
rename to libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
diff --git a/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
new file mode 100644
index 0000000000000..b3fb15fc77b31
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <set>
+#include <vector>
+
+#include "common.h"
+#include "test_iterators.h"
+
+namespace {
+
+// types of containers we'll want to test, covering interesting iterator types
+struct VectorContainer {
+  template <typename... Args>
+  using type = std::vector<Args...>;
+
+  static constexpr const char* Name = "Vector";
+};
+
+struct SetContainer {
+  template <typename... Args>
+  using type = std::set<Args...>;
+
+  static constexpr const char* Name = "Set";
+};
+
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer>;
+
+// set_intersection performance may depend on where matching values lie
+enum class OverlapPosition {
+  None,
+  Front,
+  // performance-wise, matches at the back are identical to ones at the front
+  Interlaced,
+};
+
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 3> {
+  static constexpr const char* Names[] = {"None", "Front", "Interlaced"};
+};
+
+// forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
+template <typename Wrapped>
+struct StridedFwdIt {
+  Wrapped base_;
+  unsigned stride_;
+
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type   = typename Wrapped::difference_type;
+  using value_type        = typename Wrapped::value_type;
+  using pointer           = typename Wrapped::pointer;
+  using reference         = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); }
+
+  StridedFwdIt operator++() {
+    for (unsigned i = 0; i < stride_; ++i)
+      ++base_;
+    return *this;
+  }
+  StridedFwdIt operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  value_type& operator*() { return *base_; }
+  const value_type& operator*() const { return *base_; }
+  value_type& operator->() { return *base_; }
+  const value_type& operator->() const { return *base_; }
+  bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; }
+  bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
+};
+template <typename Wrapped>
+StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
+
+template <typename T>
+std::vector<T> getVectorOfRandom(size_t N) {
+  std::vector<T> v;
+  fillValues(v, N, Order::Random);
+  sortValues(v, Order::Random);
+  return std::vector<T>(v);
+}
+
+// Realistically, data won't all be nicely contiguous in a container,
+// we'll go through some effort to ensure that it's shuffled through memory
+// this is especially important for containers with non-contiguous element
+// storage, but it will affect even a std::vector, because when you copy a
+// std::vector<std::string> the underlying data storage position for the char
+// arrays of the copy are likely to have high locality
+template <class Container>
+std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
+  using ValueType = typename Container::value_type;
+  auto move_into  = [](auto first, auto last) {
+    Container out;
+    std::move(first, last, std::inserter(out, out.begin()));
+    return out;
+  };
+  const auto src_size        = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
+  std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
+
+  if (pos == OverlapPosition::None) {
+    std::sort(src.begin(), src.end());
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(src.begin() + size1, src.end()));
+  }
+
+  // All other overlap types will have to copy some part of the data, but if
+  // we copy after sorting it will likely have high locality, so we sort
+  // each copy separately
+  auto copy = src;
+  std::sort(src.begin(), src.end());
+  std::sort(copy.begin(), copy.end());
+
+  switch (pos) {
+  case OverlapPosition::None:
+    // we like -Wswitch :)
+    break;
+
+  case OverlapPosition::Front:
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
+
+  case OverlapPosition::Interlaced:
+    const auto stride1 = size1 < size2 ? size2 / size1 : 1;
+    const auto stride2 = size2 < size1 ? size1 / size2 : 1;
+    return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
+                          move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
+  }
+  std::abort(); // would be std::unreachable() if it could
+  return std::pair<Container, Container>();
+}
+
+template <class ValueType, class Container, class Overlap>
+struct SetIntersection {
+  using ContainerType = typename Container::template type<Value<ValueType>>;
+  size_t size1_;
+  size_t size2_;
+
+  SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
+
+  bool skip() const noexcept {
+    // let's save some time and skip simmetrical runs
+    return size1_ < size2_;
+  }
+
+  void run(benchmark::State& state) const {
+    auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
+    std::vector<Value<ValueType>> out(std::min(size1_, size2_));
+
+    const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
+    for (const auto& _ : state) {
+      while (state.KeepRunningBatch(BATCH_SIZE)) {
+        for (unsigned i = 0; i < BATCH_SIZE; ++i) {
+          const auto& [c1, c2] = input;
+          auto res             = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+          benchmark::DoNotOptimize(res);
+        }
+      }
+    }
+  }
+
+  std::string name() const {
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' +
+           std::to_string(size1_) + '_' + std::to_string(size2_);
+  }
+};
+
+} // namespace
+
+int main(int argc, char** argv) { /**/
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
+      Quantities, Quantities);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
diff --git a/libcxx/benchmarks/algorithms/sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sort.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/sort.bench.cpp
rename to libcxx/test/benchmarks/algorithms/sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/sort_heap.bench.cpp
rename to libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/algorithms/stable_sort.bench.cpp
rename to libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/allocation.bench.cpp b/libcxx/test/benchmarks/allocation.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/allocation.bench.cpp
rename to libcxx/test/benchmarks/allocation.bench.cpp
diff --git a/libcxx/benchmarks/atomic_wait.bench.cpp b/libcxx/test/benchmarks/atomic_wait.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/atomic_wait.bench.cpp
rename to libcxx/test/benchmarks/atomic_wait.bench.cpp
index 4a06a45739377..dd541b445448a 100644
--- a/libcxx/benchmarks/atomic_wait.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
rename to libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
index c60fcd579488c..1a52e5dddb8e9 100644
--- a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/deque.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/deque.bench.cpp
rename to libcxx/test/benchmarks/deque.bench.cpp
diff --git a/libcxx/benchmarks/deque_iterator.bench.cpp b/libcxx/test/benchmarks/deque_iterator.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/deque_iterator.bench.cpp
rename to libcxx/test/benchmarks/deque_iterator.bench.cpp
diff --git a/libcxx/benchmarks/exception_ptr.bench.cpp b/libcxx/test/benchmarks/exception_ptr.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/exception_ptr.bench.cpp
rename to libcxx/test/benchmarks/exception_ptr.bench.cpp
diff --git a/libcxx/benchmarks/filesystem.bench.cpp b/libcxx/test/benchmarks/filesystem.bench.cpp
similarity index 93%
rename from libcxx/benchmarks/filesystem.bench.cpp
rename to libcxx/test/benchmarks/filesystem.bench.cpp
index d1a1763a7e463..19f9586d08528 100644
--- a/libcxx/benchmarks/filesystem.bench.cpp
+++ b/libcxx/test/benchmarks/filesystem.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <filesystem>
 
 #include "GenerateInput.h"
diff --git a/libcxx/benchmarks/format.bench.cpp b/libcxx/test/benchmarks/format.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/format.bench.cpp
rename to libcxx/test/benchmarks/format.bench.cpp
index 89f113253ea62..d6cb0467b81dd 100644
--- a/libcxx/benchmarks/format.bench.cpp
+++ b/libcxx/test/benchmarks/format.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/format_to.bench.cpp b/libcxx/test/benchmarks/format_to.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/format_to.bench.cpp
rename to libcxx/test/benchmarks/format_to.bench.cpp
index 4e6041d448f50..48cb3da498235 100644
--- a/libcxx/benchmarks/format_to.bench.cpp
+++ b/libcxx/test/benchmarks/format_to.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/format_to_n.bench.cpp b/libcxx/test/benchmarks/format_to_n.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/format_to_n.bench.cpp
rename to libcxx/test/benchmarks/format_to_n.bench.cpp
index f5816d49f6dd5..99723c7c6f695 100644
--- a/libcxx/benchmarks/format_to_n.bench.cpp
+++ b/libcxx/test/benchmarks/format_to_n.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatted_size.bench.cpp b/libcxx/test/benchmarks/formatted_size.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/formatted_size.bench.cpp
rename to libcxx/test/benchmarks/formatted_size.bench.cpp
index de67dae719a7e..2945df24a1172 100644
--- a/libcxx/benchmarks/formatted_size.bench.cpp
+++ b/libcxx/test/benchmarks/formatted_size.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatter_float.bench.cpp b/libcxx/test/benchmarks/formatter_float.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/formatter_float.bench.cpp
rename to libcxx/test/benchmarks/formatter_float.bench.cpp
index e7a673bb5d655..d1da58598ea46 100644
--- a/libcxx/benchmarks/formatter_float.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_float.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatter_int.bench.cpp b/libcxx/test/benchmarks/formatter_int.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/formatter_int.bench.cpp
rename to libcxx/test/benchmarks/formatter_int.bench.cpp
index 7cd794a4d4d66..6c624e9e0379c 100644
--- a/libcxx/benchmarks/formatter_int.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_int.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/function.bench.cpp b/libcxx/test/benchmarks/function.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/function.bench.cpp
rename to libcxx/test/benchmarks/function.bench.cpp
diff --git a/libcxx/benchmarks/join_view.bench.cpp b/libcxx/test/benchmarks/join_view.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/join_view.bench.cpp
rename to libcxx/test/benchmarks/join_view.bench.cpp
diff --git a/libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp
rename to libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
index e58134adaf212..03f20d05ed699 100644
--- a/libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp
+++ b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/libcxxabi/dynamic_cast.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/libcxxabi/dynamic_cast.bench.cpp
rename to libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
diff --git a/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
rename to libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
diff --git a/libcxx/benchmarks/lit.cfg.py b/libcxx/test/benchmarks/lit.cfg.py.in
similarity index 73%
rename from libcxx/benchmarks/lit.cfg.py
rename to libcxx/test/benchmarks/lit.cfg.py.in
index 0d08966c26cc1..a3ce2477b9a85 100644
--- a/libcxx/benchmarks/lit.cfg.py
+++ b/libcxx/test/benchmarks/lit.cfg.py.in
@@ -3,7 +3,7 @@
 import os
 import site
 
-site.addsitedir(os.path.join(os.path.dirname(os.path.dirname(__file__)), "utils"))
+site.addsitedir(os.path.join("@LIBCXX_SOURCE_DIR@", "utils"))
 from libcxx.test.googlebenchmark import GoogleBenchmark
 
 # Tell pylint that we know config and lit_config exist somewhere.
@@ -15,8 +15,8 @@
 config.name = "libc++ benchmarks"
 config.suffixes = []
 
-config.test_exec_root = os.path.join(config.libcxx_obj_root, "benchmarks")
-config.test_source_root = config.test_exec_root
+config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.test_source_root = "@CMAKE_CURRENT_BINARY_DIR@"
 
 config.test_format = GoogleBenchmark(
     test_sub_dirs=".", test_suffix=".bench.out", benchmark_args=config.benchmark_args
diff --git a/libcxx/benchmarks/lit.site.cfg.py.in b/libcxx/test/benchmarks/lit.site.cfg.py.in
similarity index 76%
rename from libcxx/benchmarks/lit.site.cfg.py.in
rename to libcxx/test/benchmarks/lit.site.cfg.py.in
index e3ce8b22263e1..6d4b0ca3dae0e 100644
--- a/libcxx/benchmarks/lit.site.cfg.py.in
+++ b/libcxx/test/benchmarks/lit.site.cfg.py.in
@@ -7,4 +7,4 @@ config.libcxx_obj_root = "@LIBCXX_BINARY_DIR@"
 config.benchmark_args = "@LIBCXX_BENCHMARK_TEST_ARGS@".split(';')
 
 # Let the main config do the real work.
-lit_config.load_config(config, "@LIBCXX_SOURCE_DIR@/benchmarks/lit.cfg.py")
\ No newline at end of file
+lit_config.load_config(config, "@CMAKE_CURRENT_BINARY_DIR@/lit.cfg.py")
\ No newline at end of file
diff --git a/libcxx/benchmarks/map.bench.cpp b/libcxx/test/benchmarks/map.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/map.bench.cpp
rename to libcxx/test/benchmarks/map.bench.cpp
diff --git a/libcxx/benchmarks/monotonic_buffer.bench.cpp b/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/monotonic_buffer.bench.cpp
rename to libcxx/test/benchmarks/monotonic_buffer.bench.cpp
diff --git a/libcxx/benchmarks/numeric/gcd.bench.cpp b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/numeric/gcd.bench.cpp
rename to libcxx/test/benchmarks/numeric/gcd.bench.cpp
diff --git a/libcxx/benchmarks/ordered_set.bench.cpp b/libcxx/test/benchmarks/ordered_set.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/ordered_set.bench.cpp
rename to libcxx/test/benchmarks/ordered_set.bench.cpp
diff --git a/libcxx/benchmarks/random.bench.cpp b/libcxx/test/benchmarks/random.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/random.bench.cpp
rename to libcxx/test/benchmarks/random.bench.cpp
index fe2eb6672ae79..0645a4e876606 100644
--- a/libcxx/benchmarks/random.bench.cpp
+++ b/libcxx/test/benchmarks/random.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp
rename to libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
index 19d13b799cc13..548293507ac03 100644
--- a/libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp
+++ b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
rename to libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
diff --git a/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
rename to libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
diff --git a/libcxx/benchmarks/stop_token.bench.cpp b/libcxx/test/benchmarks/stop_token.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/stop_token.bench.cpp
rename to libcxx/test/benchmarks/stop_token.bench.cpp
index e059a1166af16..6be4736c3aec7 100644
--- a/libcxx/benchmarks/stop_token.bench.cpp
+++ b/libcxx/test/benchmarks/stop_token.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/string.bench.cpp b/libcxx/test/benchmarks/string.bench.cpp
similarity index 98%
rename from libcxx/benchmarks/string.bench.cpp
rename to libcxx/test/benchmarks/string.bench.cpp
index 92018b0e48f53..49d37229fa35b 100644
--- a/libcxx/benchmarks/string.bench.cpp
+++ b/libcxx/test/benchmarks/string.bench.cpp
@@ -1,3 +1,10 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 
 #include <cstdint>
 #include <new>
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/test/benchmarks/stringstream.bench.cpp
similarity index 86%
rename from libcxx/benchmarks/stringstream.bench.cpp
rename to libcxx/test/benchmarks/stringstream.bench.cpp
index 3cbe5ac5997a3..a333900d6d2fa 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/test/benchmarks/stringstream.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "benchmark/benchmark.h"
 #include "test_macros.h"
 
diff --git a/libcxx/benchmarks/system_error.bench.cpp b/libcxx/test/benchmarks/system_error.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/system_error.bench.cpp
rename to libcxx/test/benchmarks/system_error.bench.cpp
diff --git a/libcxx/benchmarks/to_chars.bench.cpp b/libcxx/test/benchmarks/to_chars.bench.cpp
similarity index 99%
rename from libcxx/benchmarks/to_chars.bench.cpp
rename to libcxx/test/benchmarks/to_chars.bench.cpp
index 1a3dc64497ba7..2e3c59f471826 100644
--- a/libcxx/benchmarks/to_chars.bench.cpp
+++ b/libcxx/test/benchmarks/to_chars.bench.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/unordered_set_operations.bench.cpp b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
similarity index 97%
rename from libcxx/benchmarks/unordered_set_operations.bench.cpp
rename to libcxx/test/benchmarks/unordered_set_operations.bench.cpp
index d49de576ec977..bcf6adda2e021 100644
--- a/libcxx/benchmarks/unordered_set_operations.bench.cpp
+++ b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
diff --git a/libcxx/benchmarks/util_smartptr.bench.cpp b/libcxx/test/benchmarks/util_smartptr.bench.cpp
similarity index 100%
rename from libcxx/benchmarks/util_smartptr.bench.cpp
rename to libcxx/test/benchmarks/util_smartptr.bench.cpp
diff --git a/libcxx/benchmarks/variant_visit_1.bench.cpp b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
similarity index 68%
rename from libcxx/benchmarks/variant_visit_1.bench.cpp
rename to libcxx/test/benchmarks/variant_visit_1.bench.cpp
index 7d736f8abb16b..fa9b246a0a411 100644
--- a/libcxx/benchmarks/variant_visit_1.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "benchmark/benchmark.h"
 
 #include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/variant_visit_2.bench.cpp b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
similarity index 63%
rename from libcxx/benchmarks/variant_visit_2.bench.cpp
rename to libcxx/test/benchmarks/variant_visit_2.bench.cpp
index ed26cd44c3a92..84e26f9c61e40 100644
--- a/libcxx/benchmarks/variant_visit_2.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "benchmark/benchmark.h"
 
 #include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/variant_visit_3.bench.cpp b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
similarity index 60%
rename from libcxx/benchmarks/variant_visit_3.bench.cpp
rename to libcxx/test/benchmarks/variant_visit_3.bench.cpp
index b20d50391f26a..1b4e903ad8ff8 100644
--- a/libcxx/benchmarks/variant_visit_3.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "benchmark/benchmark.h"
 
 #include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/vector_operations.bench.cpp
similarity index 84%
rename from libcxx/benchmarks/vector_operations.bench.cpp
rename to libcxx/test/benchmarks/vector_operations.bench.cpp
index da21d1806c012..8698e45c00671 100644
--- a/libcxx/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/test/benchmarks/vector_operations.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
diff --git a/libcxx/test/configs/apple-libc++-shared.cfg.in b/libcxx/test/configs/apple-libc++-shared.cfg.in
new file mode 100644
index 0000000000000..5504bfd4e7f21
--- /dev/null
+++ b/libcxx/test/configs/apple-libc++-shared.cfg.in
@@ -0,0 +1,51 @@
+# Testing configuration for Apple's system libc++.
+#
+# This configuration differs from a normal LLVM shared library configuration in
+# that we must use DYLD_LIBRARY_PATH to run the tests against the just-built library,
+# since Apple's libc++ has an absolute install_name.
+#
+# We also don't use a per-target include directory layout, so we have only one
+# include directory for the libc++ headers.
+#
+# Finally, we also link against an artificial shims library that provides
+# the functionality necessary for the upstream libc++ to be usable in place
+# of a system-provided libc++. Without that, attempting to replace the system
+# libc++ with DYLD_LIBRARY_PATH would result in missing symbols and other similar
+# issues since the upstream libc++ does not contain all the symbols provided by
+# the system library.
+
+lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
+
+import os, site
+site.addsitedir(os.path.join('@LIBCXX_SOURCE_DIR@', 'utils'))
+import libcxx.test.params, libcxx.test.config, libcxx.test.dsl
+ADDITIONAL_PARAMETERS = [
+    libcxx.test.dsl.Parameter(name='apple_system_shims', type=str,
+        actions=lambda path: [libcxx.test.dsl.AddSubstitution('%{apple-system-shims}', path)],
+        help="""
+        Path to a pre-built object file or static archive that contains shims necessary to
+        allow replacing the system libc++ by the just-built one.
+        """),
+]
+
+config.substitutions.append(('%{flags}',
+    '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else ''
+))
+config.substitutions.append(('%{compile_flags}',
+    '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support'
+))
+config.substitutions.append(('%{link_flags}',
+    '-nostdlib++ -L %{lib-dir} -lc++ %{apple-system-shims}'
+))
+config.substitutions.append(('%{exec}',
+    '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib-dir} -- '
+))
+
+config.stdlib = 'apple-libc++'
+
+libcxx.test.config.configure(
+    libcxx.test.params.DEFAULT_PARAMETERS + ADDITIONAL_PARAMETERS,
+    libcxx.test.features.DEFAULT_FEATURES,
+    config,
+    lit_config
+)
diff --git a/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_comparators.pass.cpp b/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_comparators.pass.cpp
index 1a6c0d11460c5..6bac7deadfa6e 100644
--- a/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_comparators.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_comparators.pass.cpp
@@ -119,6 +119,16 @@ constexpr bool all_the_algorithms()
     (void)std::ranges::find_if(a, UnaryTrue(&copies)); assert(copies == 0);
     (void)std::ranges::find_if_not(first, last, UnaryTrue(&copies)); assert(copies == 0);
     (void)std::ranges::find_if_not(a, UnaryTrue(&copies)); assert(copies == 0);
+#if TEST_STD_VER >= 23
+    (void)std::ranges::find_last_if(first, last, UnaryTrue(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if(a, UnaryTrue(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if_not(first, last, UnaryTrue(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if_not(a, UnaryTrue(&copies));
+    assert(copies == 0);
+#endif
     (void)std::ranges::for_each(first, last, UnaryVoid(&copies)); assert(copies == 1); copies = 0;
     (void)std::ranges::for_each(a, UnaryVoid(&copies)); assert(copies == 1); copies = 0;
     (void)std::ranges::for_each_n(first, count, UnaryVoid(&copies)); assert(copies == 1); copies = 0;
diff --git a/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_projections.pass.cpp b/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_projections.pass.cpp
index 71823d9afc1a4..bc71150c9e2d8 100644
--- a/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_projections.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/ranges_robust_against_copying_projections.pass.cpp
@@ -115,6 +115,20 @@ constexpr bool all_the_algorithms()
     (void)std::ranges::find_if(a, UnaryTrue(), Proj(&copies)); assert(copies == 0);
     (void)std::ranges::find_if_not(first, last, UnaryTrue(), Proj(&copies)); assert(copies == 0);
     (void)std::ranges::find_if_not(a, UnaryTrue(), Proj(&copies)); assert(copies == 0);
+#if TEST_STD_VER >= 23
+    (void)std::ranges::find_last(first, last, value, Proj(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last(a, value, Proj(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if(first, last, UnaryTrue(), Proj(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if(a, UnaryTrue(), Proj(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if_not(first, last, UnaryTrue(), Proj(&copies));
+    assert(copies == 0);
+    (void)std::ranges::find_last_if_not(a, UnaryTrue(), Proj(&copies));
+    assert(copies == 0);
+#endif
     (void)std::ranges::for_each(first, last, UnaryVoid(), Proj(&copies)); assert(copies == 0);
     (void)std::ranges::for_each(a, UnaryVoid(), Proj(&copies)); assert(copies == 0);
     (void)std::ranges::for_each_n(first, count, UnaryVoid(), Proj(&copies)); assert(copies == 0);
diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py
index 7ba4bf0326240..f084c38a8fbd3 100644
--- a/libcxx/test/libcxx/clang_modules_include.gen.py
+++ b/libcxx/test/libcxx/clang_modules_include.gen.py
@@ -37,9 +37,6 @@
 // TODO: Investigate this failure
 // UNSUPPORTED: LIBCXX-FREEBSD-FIXME
 
-// TODO: Investigate this failure
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
-
 {lit_header_restrictions.get(header, '')}
 
 #include <{header}>
diff --git a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
index 9910befb0349e..7d2dd218f967b 100644
--- a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <cstdint>
 #include <deque>
 
 #include "min_allocator.h"
diff --git a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
index 8ace45d661df6..a16ae1d527921 100644
--- a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <cstdint>
 #include <list>
 
 #include "min_allocator.h"
diff --git a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
index b03f48434a0e0..5dcbdad41968a 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
@@ -14,6 +14,14 @@
 
 template <class T>
 class small_pointer {
+public:
+  using value_type        = T;
+  using difference_type   = std::int16_t;
+  using pointer           = small_pointer;
+  using reference         = T&;
+  using iterator_category = std::random_access_iterator_tag;
+
+private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp
similarity index 64%
rename from libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp
index 42021824ce6ae..a066ad30ebd71 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp
@@ -10,13 +10,14 @@
 
 // Add to iterator out of bounds.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <vector>
 #include <cassert>
 
 #include "check_assertion.h"
+#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -24,22 +25,24 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += 1;
+    i += c.size();
     assert(i == c.end());
     i = c.begin();
-    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "Attempted to add/subtract an iterator outside its valid range");
+    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += 1;
+    i += c.size();
     assert(i == c.end());
     i = c.begin();
-    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "Attempted to add/subtract an iterator outside its valid range");
+    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp
similarity index 70%
rename from libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp
index d134527a967e5..59b9c16a6aa0e 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp
@@ -10,8 +10,8 @@
 
 // Decrement iterator prior to begin.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <vector>
 #include <cassert>
@@ -27,7 +27,7 @@ int main(int, char**) {
     C::iterator i = c.end();
     --i;
     assert(i == c.begin());
-    TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
   }
 
   {
@@ -37,7 +37,7 @@ int main(int, char**) {
     C::iterator i = c.end();
     --i;
     assert(i == c.begin());
-    TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp
similarity index 64%
rename from libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp
index 918cdd74b7916..877d3655fbe2e 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp
@@ -10,12 +10,13 @@
 
 // Dereference non-dereferenceable iterator.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <vector>
 
 #include "check_assertion.h"
+#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -23,16 +24,18 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.end();
-    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.end();
-    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp
similarity index 63%
rename from libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp
index d3e4b4ec3143f..e540f40f8c476 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp
@@ -10,13 +10,14 @@
 
 // Increment iterator past end.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <vector>
 #include <cassert>
 
 #include "check_assertion.h"
+#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -24,20 +25,22 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
-    ++i;
+    i += c.size();
     assert(i == c.end());
-    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
-    ++i;
+    i += c.size();
     assert(i == c.end());
-    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
similarity index 52%
rename from libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
index 8e8f6a5dae69d..63354af5af022 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
@@ -10,13 +10,14 @@
 
 // Index iterator out of bounds.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <vector>
 #include <cassert>
 
 #include "check_assertion.h"
+#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -24,18 +25,26 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
     assert(i[0] == 0);
-    TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
+    TEST_LIBCPP_ASSERT_FAILURE(
+        i[c.size()], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(
+        i[c.size() + 1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
+    fill_to_capacity(c);
     C::iterator i = c.begin();
     assert(i[0] == 0);
-    TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
+    TEST_LIBCPP_ASSERT_FAILURE(
+        i[c.size()], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(
+        i[c.size() + 1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h b/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h
new file mode 100644
index 0000000000000..abf88c477fece
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
+#define LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
+
+#include <vector>
+
+template <typename T, typename A>
+void fill_to_capacity(std::vector<T, A>& vec) {
+  // Fill the given vector up to its capacity. Our bounded iterators are currently unable to catch an out-of-bounds
+  // access that goes beyond the container's logical storage (above the size) but is still within its physical storage
+  // (below the capacity) due to iterator stability guarantees. Filling a vector makes this distinction go away.
+  while (vec.size() < vec.capacity()) {
+    vec.push_back(T());
+  }
+}
+
+#endif // LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
diff --git a/libcxx/test/libcxx/diagnostics/algorithm.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/algorithm.nodiscard.verify.cpp
index b36a426082ccb..14febc12a8a2d 100644
--- a/libcxx/test/libcxx/diagnostics/algorithm.nodiscard.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/algorithm.nodiscard.verify.cpp
@@ -372,6 +372,18 @@ void test() {
   // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
   std::ranges::contains_subrange(iter, iter, iter, iter);
   // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last_if_not(iter, iter, pred);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last_if_not(range, pred);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last_if(iter, iter, pred);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last_if(range, pred);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last(iter, iter, 1);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::ranges::find_last(range, 1);
+  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
   std::ranges::fold_left(range, 0, std::plus());
   // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}}
   std::ranges::fold_left(iter, iter, 0, std::plus());
diff --git a/libcxx/test/libcxx/fuzzing/random.pass.cpp b/libcxx/test/libcxx/fuzzing/random.pass.cpp
index 663977696f2df..af80fb8ba18d9 100644
--- a/libcxx/test/libcxx/fuzzing/random.pass.cpp
+++ b/libcxx/test/libcxx/fuzzing/random.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // UNSUPPORTED: c++03, c++11
 
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
index 9c801f5e044c1..52d8500f7fa3a 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
index 0d685f944e6bc..7a5bfb9926482 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
index ab524899e28a2..6cf5c425d573b 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
index fd570c2c76619..b89d02ba99425 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
index 7d17662da28fa..bcd1d05a3aeeb 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/string.streams/stringbuf/const_sso_buffer.pass.cpp b/libcxx/test/libcxx/input.output/string.streams/stringbuf/const_sso_buffer.pass.cpp
new file mode 100644
index 0000000000000..d6caa3389b8fa
--- /dev/null
+++ b/libcxx/test/libcxx/input.output/string.streams/stringbuf/const_sso_buffer.pass.cpp
@@ -0,0 +1,169 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <sstream>
+
+// How the constructors of basic_stringbuf initialize the buffer pointers is
+// not specified. For some constructors it's implementation defined whether the
+// pointers are set to nullptr. Libc++'s implementation directly uses the SSO
+// buffer of a std::string as the initial size. This test validates that
+// behaviour.
+//
+// This behaviour is allowed by LWG2995.
+
+#include <sstream>
+#include <cassert>
+
+#include "test_macros.h"
+#include "min_allocator.h"
+
+template <class CharT>
+struct test_buf : public std::basic_stringbuf<CharT> {
+  typedef std::basic_streambuf<CharT> base;
+  typedef typename base::char_type char_type;
+  typedef typename base::int_type int_type;
+  typedef typename base::traits_type traits_type;
+
+  char_type* pbase() const { return base::pbase(); }
+  char_type* pptr() const { return base::pptr(); }
+  char_type* epptr() const { return base::epptr(); }
+  void gbump(int n) { base::gbump(n); }
+
+  virtual int_type overflow(int_type c = traits_type::eof()) { return base::overflow(c); }
+
+  test_buf() = default;
+  explicit test_buf(std::ios_base::openmode which) : std::basic_stringbuf<CharT>(which) {}
+
+  explicit test_buf(const std::basic_string<CharT>& s) : std::basic_stringbuf<CharT>(s) {}
+#if TEST_STD_VER >= 20
+  explicit test_buf(const std::allocator<CharT>& a) : std::basic_stringbuf<CharT>(a) {}
+  test_buf(std::ios_base::openmode which, const std::allocator<CharT>& a) : std::basic_stringbuf<CharT>(which, a) {}
+  explicit test_buf(std::basic_string<CharT>&& s)
+      : std::basic_stringbuf<CharT>(std::forward<std::basic_string<CharT>>(s)) {}
+
+  test_buf(const std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>>& s,
+           const std::allocator<CharT>& a)
+      : std::basic_stringbuf<CharT>(s, a) {}
+  test_buf(const std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>>& s,
+           std::ios_base::openmode which,
+           const std::allocator<CharT>& a)
+      : std::basic_stringbuf<CharT>(s, which, a) {}
+  test_buf(const std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>>& s)
+      : std::basic_stringbuf<CharT>(s) {}
+#endif //  TEST_STD_VER >= 20
+
+#if TEST_STD_VER >= 26
+  test_buf(std::basic_string_view<CharT> s) : std::basic_stringbuf<CharT>(s) {}
+  test_buf(std::basic_string_view<CharT> s, const std::allocator<CharT>& a) : std::basic_stringbuf<CharT>(s, a) {}
+  test_buf(std::basic_string_view<CharT> s, std::ios_base::openmode which, const std::allocator<CharT>& a)
+      : std::basic_stringbuf<CharT>(s, which, a) {}
+#endif //  TEST_STD_VER >= 26
+};
+
+template <class CharT>
+static void test() {
+  std::size_t size = std::basic_string<CharT>().capacity(); // SSO buffer size.
+  {
+    test_buf<CharT> b;
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    test_buf<CharT> b(std::ios_base::out);
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    std::basic_string<CharT> s;
+    s.reserve(1024);
+    test_buf<CharT> b(s);
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size); // copy so uses size
+  }
+#if TEST_STD_VER >= 20
+  {
+    test_buf<CharT> b = test_buf<CharT>(std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    test_buf<CharT> b = test_buf<CharT>(std::ios_base::out, std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    std::basic_string<CharT> s;
+    s.reserve(1024);
+    std::size_t capacity = s.capacity();
+    test_buf<CharT> b    = test_buf<CharT>(std::move(s));
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() >= b.pbase() + capacity); // move so uses s.capacity()
+  }
+  {
+    std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>> s;
+    s.reserve(1024);
+    test_buf<CharT> b = test_buf<CharT>(s, std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size); // copy so uses size
+  }
+  {
+    std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>> s;
+    s.reserve(1024);
+    test_buf<CharT> b = test_buf<CharT>(s, std::ios_base::out, std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size); // copy so uses size
+  }
+  {
+    std::basic_string<CharT, std::char_traits<CharT>, min_allocator<CharT>> s;
+    s.reserve(1024);
+    test_buf<CharT> b = test_buf<CharT>(s);
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size); // copy so uses size
+  }
+#endif // TEST_STD_VER >= 20
+#if TEST_STD_VER >= 26
+  {
+    std::basic_string_view<CharT> s;
+    test_buf<CharT> b = test_buf<CharT>(s);
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    std::basic_string_view<CharT> s;
+    test_buf<CharT> b = test_buf<CharT>(s, std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+  {
+    std::basic_string_view<CharT> s;
+    test_buf<CharT> b = test_buf<CharT>(s, std::ios_base::out, std::allocator<CharT>());
+    assert(b.pbase() != nullptr);
+    assert(b.pptr() == b.pbase());
+    assert(b.epptr() == b.pbase() + size);
+  }
+#endif // TEST_STD_VER >= 26
+}
+
+int main(int, char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+  return 0;
+}
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp
index feaef53ae09d5..45d0cc3b95f90 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp
@@ -11,8 +11,8 @@
 //
 // Arithmetic operators
 
+#include <__iterator/bounded_iter.h>
 #include <cstddef>
-#include <iterator>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
index f4b0da9511eaf..cef2157469c8f 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
@@ -11,7 +11,8 @@
 //
 // Comparison operators
 
-#include <iterator>
+#include <concepts>
+#include <__iterator/bounded_iter.h>
 
 #include "test_iterators.h"
 #include "test_macros.h"
@@ -59,6 +60,12 @@ TEST_CONSTEXPR_CXX14 bool tests() {
     assert(iter1 >= iter1);
   }
 
+#if TEST_STD_VER >= 20
+  // P1614
+  std::same_as<std::strong_ordering> decltype(auto) r1 = iter1 <=> iter2;
+  assert(r1 == std::strong_ordering::less);
+#endif
+
   return true;
 }
 
@@ -69,8 +76,11 @@ int main(int, char**) {
 #endif
 
 #if TEST_STD_VER > 17
-  tests<contiguous_iterator<int*> >();
-  static_assert(tests<contiguous_iterator<int*> >(), "");
+  tests<contiguous_iterator<int*>>();
+  static_assert(tests<contiguous_iterator<int*>>());
+
+  tests<three_way_contiguous_iterator<int*>>();
+  static_assert(tests<three_way_contiguous_iterator<int*>>());
 #endif
 
   return 0;
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
index bf723f14e80a9..7e3a59a49ffd4 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
@@ -16,7 +16,7 @@
 // UNSUPPORTED: libcpp-hardening-mode=none
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
-#include <iterator>
+#include <__iterator/bounded_iter.h>
 
 #include "check_assertion.h"
 #include "test_iterators.h"
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp
index 6ae0928b7e528..bfd779d644f51 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp
@@ -11,9 +11,10 @@
 //
 // std::pointer_traits specialization
 
+#include <__iterator/bounded_iter.h>
 #include <cassert>
 #include <cstddef>
-#include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_iterators.h"
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp
index db95513055f81..56ded9ae5ed21 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp
@@ -11,6 +11,7 @@
 //
 // Nested types
 
+#include <__iterator/bounded_iter.h>
 #include <cstddef>
 #include <iterator>
 #include <type_traits>
diff --git a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
index 4b2f7abe9159b..493ebf044187c 100644
--- a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
+++ b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
@@ -15,8 +15,8 @@
 // template<class _Alloc>
 // struct __allocation_guard;
 
+#include <__memory/allocation_guard.h>
 #include <cassert>
-#include <memory>
 #include <type_traits>
 #include <utility>
 
diff --git a/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp b/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp
index 8bc890a208d0c..4258089813e0d 100644
--- a/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp
+++ b/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <__memory/compressed_pair.h>
 #include <assert.h>
-#include <memory>
 #include <new>
 
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp b/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp
index a826555d48dda..aed78f9cddf84 100644
--- a/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp
+++ b/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp
@@ -12,10 +12,10 @@
 // closest representable value for the specified integer type, or
 // numeric_limits<IntT>::max()/min() if the value isn't representable.
 
+#include <__random/clamp_to_integral.h>
 #include <cassert>
 #include <cmath>
 #include <limits>
-#include <random> // for __clamp_to_integral
 
 template <class IntT>
 void test() {
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp
index 7046936b1b7a7..b60f172363350 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp
@@ -11,7 +11,7 @@
 // template<class T>
 // constexpr T& as-lvalue(T&& t) { // exposition only
 
-#include <utility>
+#include <__utility/as_lvalue.h>
 
 void test() {
   // Check prvalue
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp
index 721279fcd586b..8e47a507f2f8a 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp
@@ -11,6 +11,7 @@
 // template<class T>
 // constexpr T& as-lvalue(T&& t) { // exposition only
 
+#include <__utility/as_lvalue.h>
 #include <type_traits>
 #include <utility>
 
diff --git a/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp b/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
index 7b4d54ed410b0..00943ef8762f8 100644
--- a/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
@@ -10,6 +10,7 @@
 
 // UNSUPPORTED: c++03
 
+#include <iterator>
 #include <string>
 
 #include "test_macros.h"
@@ -18,6 +19,14 @@
 
 template <class T>
 class small_pointer {
+public:
+  using value_type        = T;
+  using difference_type   = std::int16_t;
+  using pointer           = small_pointer;
+  using reference         = T&;
+  using iterator_category = std::random_access_iterator_tag;
+
+private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp b/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
index 6e00e43618b2e..b85895ffcd837 100644
--- a/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
@@ -8,6 +8,7 @@
 
 // Ensure that we never change the size or alignment of `basic_string`
 
+#include <iterator>
 #include <string>
 
 #include "test_macros.h"
@@ -16,6 +17,14 @@
 
 template <class T>
 class small_pointer {
+public:
+  using value_type        = T;
+  using difference_type   = std::int16_t;
+  using pointer           = small_pointer;
+  using reference         = T&;
+  using iterator_category = std::random_access_iterator_tag;
+
+private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp
similarity index 77%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp
index 8459284637dc5..56c9d63d0dbaf 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp
@@ -10,8 +10,8 @@
 
 // Add to iterator out of bounds.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <string>
 #include <cassert>
@@ -26,7 +26,7 @@ void test() {
   i += 1;
   assert(i == c.end());
   i = c.begin();
-  TEST_LIBCPP_ASSERT_FAILURE(i += 2, "Attempted to add/subtract an iterator outside its valid range");
+  TEST_LIBCPP_ASSERT_FAILURE(i += 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp
similarity index 76%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp
index f1fa08d006a1e..43a9739bf936f 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp
@@ -10,8 +10,8 @@
 
 // Decrement iterator prior to begin.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <string>
 #include <cassert>
@@ -25,7 +25,7 @@ void test() {
   typename C::iterator i = c.end();
   --i;
   assert(i == c.begin());
-  TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp
similarity index 75%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp
index 0bf295c6c4f4f..e2326be021033 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp
@@ -10,8 +10,8 @@
 
 // Dereference non-dereferenceable iterator.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <string>
 
@@ -22,7 +22,7 @@ template <class C>
 void test() {
   C c(1, '\0');
   typename C::iterator i = c.end();
-  TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp
similarity index 76%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp
index 9cc9ab40bcdd6..a7453f3115197 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp
@@ -10,8 +10,8 @@
 
 // Increment iterator past end.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <string>
 #include <cassert>
@@ -25,7 +25,7 @@ void test() {
   typename C::iterator i = c.begin();
   ++i;
   assert(i == c.end());
-  TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp
similarity index 66%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp
index 34060065d2046..e7d384413b589 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp
@@ -10,8 +10,8 @@
 
 // Index iterator out of bounds.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
+// UNSUPPORTED: libcpp-hardening-mode=none, c++03
 
 #include <string>
 #include <cassert>
@@ -23,9 +23,10 @@ template <class C>
 void test() {
   using T = decltype(std::uint8_t() - std::uint8_t());
   C c(1, '\0');
-  C::iterator i = c.begin();
+  typename C::iterator i = c.begin();
   assert(i[0] == 0);
-  TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
+  TEST_LIBCPP_ASSERT_FAILURE(i[1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(i[-1], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/time/convert_to_tm.pass.cpp b/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
index 908a38dec83d2..6e3fa62066bcc 100644
--- a/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
+++ b/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 65c805cd86b76..51e659f52000b 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -960,6 +960,7 @@ unordered_set type_traits
 unordered_set version
 utility compare
 utility cstddef
+utility cstdint
 utility cstdlib
 utility initializer_list
 utility iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index bf353b2dd4ce4..17e85e982729c 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -967,6 +967,7 @@ unordered_set type_traits
 unordered_set version
 utility compare
 utility cstddef
+utility cstdint
 utility cstdlib
 utility initializer_list
 utility iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index fa6e44873fc12..8aed93da9e6cc 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -970,6 +970,7 @@ unordered_set type_traits
 unordered_set version
 utility compare
 utility cstddef
+utility cstdint
 utility cstdlib
 utility initializer_list
 utility iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index e03f74f50b914..2c028462144ee 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -971,6 +971,7 @@ unordered_set type_traits
 unordered_set version
 utility compare
 utility cstddef
+utility cstdint
 utility cstdlib
 utility initializer_list
 utility iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 37cd4e58e7fca..982c2013e3417 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -682,6 +682,7 @@ ranges optional
 ranges span
 ranges tuple
 ranges type_traits
+ranges variant
 ranges version
 ratio climits
 ratio cstdint
@@ -977,6 +978,7 @@ unordered_set type_traits
 unordered_set version
 utility compare
 utility cstddef
+utility cstdint
 utility cstdlib
 utility initializer_list
 utility iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 098752e8699f7..8ffb71d8b566b 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -456,7 +456,6 @@ ranges cwchar
 ranges initializer_list
 ranges iterator
 ranges limits
-ranges new
 ranges optional
 ranges span
 ranges tuple
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 098752e8699f7..8ffb71d8b566b 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -456,7 +456,6 @@ ranges cwchar
 ranges initializer_list
 ranges iterator
 ranges limits
-ranges new
 ranges optional
 ranges span
 ranges tuple
diff --git a/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp
index f7f76bbe9bef0..d7bd701aa706a 100644
--- a/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <type_traits>
+#include <__type_traits/is_callable.h>
 
 struct Functor {
   void operator()();
diff --git a/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp b/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp
index 55398f8ad64ba..5a779c0e96e21 100644
--- a/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp
@@ -14,7 +14,7 @@
 // returns false when there's no constant evaluation support from the compiler.
 //  as well as when called not in a constexpr context
 
-#include <type_traits>
+#include <__type_traits/is_constant_evaluated.h>
 #include <cassert>
 
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp b/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp
index e1951cbc8446f..ff0ab6f68df67 100644
--- a/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp
@@ -12,8 +12,7 @@
 
 // __is_implicitly_default_constructible<Tp>
 
-#include <type_traits>
-
+#include <__type_traits/is_implicitly_default_constructible.h>
 
 struct ExplicitlyDefaultConstructible1 {
     explicit ExplicitlyDefaultConstructible1() = default;
diff --git a/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp
index 134e5f5d186a6..73dfc773aa774 100644
--- a/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp
@@ -14,10 +14,7 @@
 // Note instantiation for certain type combinations are ill-formed. These are
 // tested in is_specialization.verify.cpp.
 
-#include <type_traits>
-
-#include <array>
-#include <concepts>
+#include <__type_traits/is_specialization.h>
 #include <string_view>
 #include <tuple>
 #include <utility>
diff --git a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp
index 2fd1176417538..a798647d56ee1 100644
--- a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp
+++ b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp
@@ -13,8 +13,7 @@
 //
 // Tests the ill-formed instantiations.
 
-#include <type_traits>
-
+#include <__type_traits/is_specialization.h>
 #include <array>
 #include <utility>
 
diff --git a/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp b/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp
index 51ff161357605..669bcdb58d7ba 100644
--- a/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp
@@ -14,6 +14,9 @@
 
 // Test the libc++ lazy meta-programming helpers in <type_traits>
 
+#include <__type_traits/conjunction.h>
+#include <__type_traits/disjunction.h>
+#include <__type_traits/negation.h>
 #include <type_traits>
 
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/utilities/any/allocator.pass.cpp b/libcxx/test/libcxx/utilities/any/allocator.pass.cpp
index daaf74a2783fd..eab3ca8826493 100644
--- a/libcxx/test/libcxx/utilities/any/allocator.pass.cpp
+++ b/libcxx/test/libcxx/utilities/any/allocator.pass.cpp
@@ -39,62 +39,62 @@ bool Large_was_deallocated = false;
 bool Small_was_constructed = false;
 bool Small_was_destroyed = false;
 
-namespace std {
-  template <>
-  struct allocator<Large> {
-    using value_type = Large;
-    using size_type = std::size_t;
-    using difference_type = std::ptrdiff_t;
-    using propagate_on_container_move_assignment = std::true_type;
-    using is_always_equal = std::true_type;
-
-    Large* allocate(std::size_t n) {
-      Large_was_allocated = true;
-      return static_cast<Large*>(::operator new(n * sizeof(Large)));
-    }
+template <>
+struct std::allocator<Large> {
+  using value_type                             = Large;
+  using size_type                              = std::size_t;
+  using difference_type                        = std::ptrdiff_t;
+  using propagate_on_container_move_assignment = std::true_type;
+  using is_always_equal                        = std::true_type;
+
+  Large* allocate(std::size_t n) {
+    Large_was_allocated = true;
+    return static_cast<Large*>(::operator new(n * sizeof(Large)));
+  }
 
-    template <typename ...Args>
-    void construct(Large* p, Args&& ...args) {
-      new (p) Large(std::forward<Args>(args)...);
-      Large_was_constructed = true;
-    }
+  template <typename... Args>
+  void construct(Large* p, Args&&... args) {
+    new (p) Large(std::forward<Args>(args)...);
+    Large_was_constructed = true;
+  }
 
-    void destroy(Large* p) {
-      p->~Large();
-      Large_was_destroyed = true;
-    }
+  void destroy(Large* p) {
+    p->~Large();
+    Large_was_destroyed = true;
+  }
 
-    void deallocate(Large* p, std::size_t) {
-      Large_was_deallocated = true;
-      return ::operator delete(p);
-    }
-  };
-
-  template <>
-  struct allocator<Small> {
-    using value_type = Small;
-    using size_type = std::size_t;
-    using difference_type = std::ptrdiff_t;
-    using propagate_on_container_move_assignment = std::true_type;
-    using is_always_equal = std::true_type;
-
-    Small* allocate(std::size_t) { assert(false); return nullptr; }
-
-    template <typename ...Args>
-    void construct(Small* p, Args&& ...args) {
-      new (p) Small(std::forward<Args>(args)...);
-      Small_was_constructed = true;
-    }
+  void deallocate(Large* p, std::size_t) {
+    Large_was_deallocated = true;
+    return ::operator delete(p);
+  }
+};
+
+template <>
+struct std::allocator<Small> {
+  using value_type                             = Small;
+  using size_type                              = std::size_t;
+  using difference_type                        = std::ptrdiff_t;
+  using propagate_on_container_move_assignment = std::true_type;
+  using is_always_equal                        = std::true_type;
+
+  Small* allocate(std::size_t) {
+    assert(false);
+    return nullptr;
+  }
 
-    void destroy(Small* p) {
-      p->~Small();
-      Small_was_destroyed = true;
-    }
+  template <typename... Args>
+  void construct(Small* p, Args&&... args) {
+    new (p) Small(std::forward<Args>(args)...);
+    Small_was_constructed = true;
+  }
 
-    void deallocate(Small*, std::size_t) { assert(false); }
-  };
-} // end namespace std
+  void destroy(Small* p) {
+    p->~Small();
+    Small_was_destroyed = true;
+  }
 
+  void deallocate(Small*, std::size_t) { assert(false); }
+};
 
 int main(int, char**) {
   // Test large types
diff --git a/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp b/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp
index 7f2bcbca11ca0..60cf4f7814597 100644
--- a/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp
+++ b/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp
@@ -10,6 +10,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -fno-exceptions
 
+#include <__utility/exception_guard.h>
 #include <utility>
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/utilities/exception_guard.pass.cpp b/libcxx/test/libcxx/utilities/exception_guard.pass.cpp
index 71e60fc94542d..0728959c7277b 100644
--- a/libcxx/test/libcxx/utilities/exception_guard.pass.cpp
+++ b/libcxx/test/libcxx/utilities/exception_guard.pass.cpp
@@ -10,6 +10,7 @@
 
 // UNSUPPORTED: no-exceptions
 
+#include <__utility/exception_guard.h>
 #include <cassert>
 #include <type_traits>
 #include <utility>
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/and_then.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/and_then.mandates.verify.cpp
index 8b3138fd73db3..c46ab633295c1 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/and_then.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/and_then.mandates.verify.cpp
@@ -11,22 +11,22 @@
 // Test the mandates
 // template<class F> constexpr auto and_then(F&& f) &;
 // Mandates:
-//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(**this)>>
+//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(value())>>
 //  U is a specialization of std::expected and std::is_same_v<U:error_type, E> is true
 
 // template<class F> constexpr auto and_then(F&& f) const &;
 // Mandates:
-//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(**this)>>
+//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(value())>>
 //  U is a specialization of std::expected and std::is_same_v<U:error_type, E> is true
 
 // template<class F> constexpr auto and_then(F&& f) &&;
 // Mandates:
-//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(**this)>>
+//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(value())>>
 //  U is a specialization of std::expected and std::is_same_v<U:error_type, E> is true
 
 // template<class F> constexpr auto and_then(F&& f) const &&;
 // Mandates:
-//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(**this)>>
+//  Let U be std::remove_cvref_t<std::invoke_result<F, decltype(value())>>
 //  U is a specialization of std::expected and std::is_same_v<U:error_type, E> is true
 
 #include <expected>
@@ -52,7 +52,7 @@ void test() {
     {
       std::expected<int, int> f1(1);
       f1.and_then(lval_return_not_std_expected); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<int (&)(int &)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(**this) must be a specialization of std::expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(value()) must be a specialization of std::expected}}
       // expected-error-re@*:* {{{{.*}}cannot be used prior to '::' because it has no members}}
       // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     }
@@ -61,7 +61,7 @@ void test() {
     {
       std::expected<int, int> f1(1);
       f1.and_then(lval_error_type_not_same_as_int);  // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<std::expected<int, NotSameAsInt> (&)(int &)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(**this) must have the same error_type as this expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(value()) must have the same error_type as this expected}}
     }
   }
 
@@ -71,7 +71,7 @@ void test() {
     {
       const std::expected<int, int> f1(1);
       f1.and_then(clval_return_not_std_expected); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<int (&)(const int &)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(**this) must be a specialization of std::expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(value()) must be a specialization of std::expected}}
       // expected-error-re@*:* {{{{.*}}cannot be used prior to '::' because it has no members}}
       // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     }
@@ -80,7 +80,7 @@ void test() {
     {
       const std::expected<int, int> f1(1);
       f1.and_then(clval_error_type_not_same_as_int);  // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<std::expected<int, NotSameAsInt> (&)(const int &)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(**this) must have the same error_type as this expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(value()) must have the same error_type as this expected}}
 
     }
   }
@@ -91,7 +91,7 @@ void test() {
     {
       std::expected<int, int> f1(1);
       std::move(f1).and_then(rval_return_not_std_expected); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<int (&)(int &&)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(**this)) must be a specialization of std::expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(value())) must be a specialization of std::expected}}
       // expected-error-re@*:* {{{{.*}}cannot be used prior to '::' because it has no members}}
       // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     }
@@ -100,7 +100,7 @@ void test() {
     {
       std::expected<int, int> f1(1);
       std::move(f1).and_then(rval_error_type_not_same_as_int); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<std::expected<int, NotSameAsInt> (&)(int &&)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(**this)) must have the same error_type as this expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(value())) must have the same error_type as this expected}}
     }
   }
 
@@ -110,7 +110,7 @@ void test() {
     {
       const std::expected<int, int> f1(1);
       std::move(f1).and_then(crval_return_not_std_expected); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<int (&)(const int &&)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(**this)) must be a specialization of std::expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(value())) must be a specialization of std::expected}}
       // expected-error-re@*:* {{{{.*}}cannot be used prior to '::' because it has no members}}
       // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     }
@@ -119,7 +119,7 @@ void test() {
     {
       const std::expected<int, int> f1(1);
       std::move(f1).and_then(crval_error_type_not_same_as_int); // expected-note{{in instantiation of function template specialization 'std::expected<int, int>::and_then<std::expected<int, NotSameAsInt> (&)(const int &&)>' requested here}}
-      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(**this)) must have the same error_type as this expected}}
+      // expected-error-re@*:* {{static assertion failed {{.*}}The result of f(std::move(value())) must have the same error_type as this expected}}
     }
   }
 }
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
index 580c0f4ae10c7..8c2b71f3c5f23 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
index 643bf4d1f2a8f..4f8c4ef821d5f 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
index 867b90e8bc728..162ecd5e36fc9 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
index 27da03c54ac44..c4aae3356a9a2 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
index 3ecbc0abec059..2d5cee942dfd4 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
index 9c1c1d2e648d6..253ef1d5483b8 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
index 5c100322fbcf3..064d7cf720d85 100644
--- a/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -61,26 +62,24 @@ struct valid {
   void insert(iterator, CharT*, CharT*);
 };
 
-namespace std::__format {
 template <>
-inline constexpr bool __enable_insertable<no_value_type<char>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_value_type<char>> = true;
 template <>
-inline constexpr bool __enable_insertable<no_end<char>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_end<char>> = true;
 template <>
-inline constexpr bool __enable_insertable<no_insert<char>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_insert<char>> = true;
 template <>
-inline constexpr bool __enable_insertable<valid<char>> = true;
+inline constexpr bool std::__format::__enable_insertable<valid<char>> = true;
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
 template <>
-inline constexpr bool __enable_insertable<no_value_type<wchar_t>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_value_type<wchar_t>> = true;
 template <>
-inline constexpr bool __enable_insertable<no_end<wchar_t>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_end<wchar_t>> = true;
 template <>
-inline constexpr bool __enable_insertable<no_insert<wchar_t>> = true;
+inline constexpr bool std::__format::__enable_insertable<no_insert<wchar_t>> = true;
 template <>
-inline constexpr bool __enable_insertable<valid<wchar_t>> = true;
+inline constexpr bool std::__format::__enable_insertable<valid<wchar_t>> = true;
 #endif
-} // namespace std::__format
 
 static_assert(!std::__format::__insertable<no_value_type<char>>);
 static_assert(!std::__format::__insertable<no_end<char>>);
@@ -95,12 +94,10 @@ static_assert(!std::__format::__insertable<no_specialization<wchar_t>>);
 static_assert(std::__format::__insertable<valid<wchar_t>>);
 #endif
 
-namespace std::__format {
 template <>
-inline constexpr bool __enable_insertable<valid<signed char>> = true;
+inline constexpr bool std::__format::__enable_insertable<valid<signed char>> = true;
 template <>
-inline constexpr bool __enable_insertable<valid<unsigned char>> = true;
-} // namespace std::__format
+inline constexpr bool std::__format::__enable_insertable<valid<unsigned char>> = true;
 
 static_assert(!std::__format::__insertable<valid<signed char>>);
 static_assert(!std::__format::__insertable<valid<unsigned char>>);
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
index eb562e12b750e..88039f7f68f04 100644
--- a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
index c0fe5a8adc38e..cd06c509ffda2 100644
--- a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
index dd6b36eb13152..818fc72fabecb 100644
--- a/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
index d43d4de04b842..7d1cce5db8337 100644
--- a/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
index ead90e6bfa0d7..e119e42b00287 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
index 86c9f0638804f..80751ead14b49 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 5b1191642c9a6..298bbe6b0cb3b 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
index f69a995dc1784..dd1f4b607ddfc 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
index 80e45c5024b2c..3b879421c9f2a 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp b/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp
index a39352e539135..7e597081cfaad 100644
--- a/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp
+++ b/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp
@@ -11,7 +11,7 @@
 // template <class F1, class F2>
 // constexpr unspecified __compose(F1&&, F2&&);
 
-#include <functional>
+#include <__functional/compose.h>
 #include <cassert>
 #include <tuple>
 #include <utility>
diff --git a/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp b/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp
index 4696e3f667830..5d0c5586d9b3d 100644
--- a/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp
+++ b/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp
@@ -10,9 +10,12 @@
 #include "test_macros.h"
 
 TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header")
+#include <__type_traits/conjunction.h>
+#include <__type_traits/disjunction.h>
 #include <__type_traits/is_valid_expansion.h>
-#include <type_traits>
+#include <__type_traits/negation.h>
 #include <cassert>
+#include <type_traits>
 
 struct Bomb;
 template <int N, class T = Bomb >
diff --git a/libcxx/test/libcxx/vendor/ibm/bad_function_call.pass.cpp b/libcxx/test/libcxx/vendor/ibm/bad_function_call.pass.cpp
index 2b684465650fa..3714e4037a2dc 100644
--- a/libcxx/test/libcxx/vendor/ibm/bad_function_call.pass.cpp
+++ b/libcxx/test/libcxx/vendor/ibm/bad_function_call.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: target={{powerpc.*-ibm-aix.*}}
+// REQUIRES: target={{.+}}-aix{{.*}}
 // ADDITIONAL_COMPILE_FLAGS: -fvisibility-inlines-hidden
 
 // When there is a weak hidden symbol in user code and a strong definition
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp
new file mode 100644
index 0000000000000..9da8c26db0f56
--- /dev/null
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp
@@ -0,0 +1,206 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// MSVC warning C4242: 'argument': conversion from 'const _Ty' to 'ElementT', possible loss of data
+// MSVC warning C4244: 'argument': conversion from 'const _Ty' to 'ElementT', possible loss of data
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4242 /wd4244
+
+// template<forward_iterator I, sentinel_for<I> S, class T, class Proj = identity>
+//   requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>
+//   constexpr subrange<I> ranges::find_last(I first, S last, const T& value, Proj proj = {});
+// template<forward_range R, class T, class Proj = identity>
+//   requires indirect_binary_predicate<ranges::equal_to, projected<iterator_t<R>, Proj>, const T*>
+//   constexpr borrowed_subrange_t<R> ranges::find_last(R&& r, const T& value, Proj proj = {});
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <memory>
+#include <ranges>
+#include <vector>
+
+#include "almost_satisfies_types.h"
+#include "test_iterators.h"
+
+struct NotEqualityComparable {};
+
+template <class It, class Sent = It>
+concept HasFindLastIt = requires(It it, Sent sent) { std::ranges::find_last(it, sent, *it); };
+static_assert(HasFindLastIt<int*>);
+static_assert(HasFindLastIt<forward_iterator<int*>>);
+static_assert(!HasFindLastIt<cpp20_input_iterator<int*>>);
+static_assert(!HasFindLastIt<NotEqualityComparable*>);
+static_assert(!HasFindLastIt<ForwardIteratorNotDerivedFrom>);
+static_assert(!HasFindLastIt<ForwardIteratorNotIncrementable>);
+static_assert(!HasFindLastIt<forward_iterator<int*>, SentinelForNotSemiregular>);
+static_assert(!HasFindLastIt<forward_iterator<int*>, InputRangeNotSentinelEqualityComparableWith>);
+
+static_assert(!HasFindLastIt<int*, int>);
+static_assert(!HasFindLastIt<int, int*>);
+
+template <class Range, class ValT>
+concept HasFindLastR = requires(Range r) { std::ranges::find_last(r, ValT{}); };
+static_assert(HasFindLastR<std::array<int, 1>, int>);
+static_assert(!HasFindLastR<int, int>);
+static_assert(!HasFindLastR<std::array<NotEqualityComparable, 1>, NotEqualityComparable>);
+static_assert(!HasFindLastR<ForwardRangeNotDerivedFrom, int>);
+static_assert(!HasFindLastR<ForwardRangeNotIncrementable, int>);
+static_assert(!HasFindLastR<ForwardRangeNotSentinelSemiregular, int>);
+static_assert(!HasFindLastR<ForwardRangeNotSentinelEqualityComparableWith, int>);
+
+template <class It, class Sent = It>
+constexpr void test_iterators() {
+  using ValueT    = std::iter_value_t<It>;
+  auto make_range = [](auto& a) {
+    return std::ranges::subrange(
+        It(std::to_address(std::ranges::begin(a))), Sent(It(std::to_address(std::ranges::end(a)))));
+  };
+  { // simple test
+    {
+      ValueT a[] = {1, 2, 3, 4};
+
+      std::same_as<std::ranges::subrange<It>> auto ret = std::ranges::find_last(It(a), Sent(It(a + 4)), 2);
+      assert(base(ret.begin()) == a + 1);
+      assert(*ret.begin() == 2);
+    }
+    {
+      ValueT a[] = {1, 2, 3, 4};
+
+      std::same_as<std::ranges::subrange<It>> auto ret = std::ranges::find_last(make_range(a), 2);
+      assert(base(ret.begin()) == a + 1);
+      assert(*ret.begin() == 2);
+    }
+  }
+
+  { // check that an empty range works
+    {
+      std::array<ValueT, 0> a = {};
+
+      auto ret = std::ranges::find_last(It(a.data()), Sent(It(a.data())), 1).begin();
+      assert(ret == It(a.data()));
+    }
+    {
+      std::array<ValueT, 0> a = {};
+
+      auto ret = std::ranges::find_last(make_range(a), 1).begin();
+      assert(ret == It(a.data()));
+    }
+  }
+
+  { // check that last is returned with no match
+    {
+      ValueT a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last(It(a), Sent(It(a + 3)), 0).begin();
+      assert(ret == It(a + 3));
+    }
+    {
+      ValueT a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last(make_range(a), 0).begin();
+      assert(ret == It(a + 3));
+    }
+  }
+}
+
+template <template <class> class IteratorT>
+constexpr void test_iterator_classes() {
+  { // check that the last element is returned
+    struct S {
+      int comp;
+      int other;
+    };
+    using it = IteratorT<S*>;
+    S a[]    = {{0, 0}, {0, 2}, {0, 1}};
+
+    auto ret = std::ranges::find_last(it(std::begin(a)), it(std::end(a)), 0, &S::comp).begin();
+    assert(ret == it(a + 2));
+    assert((*ret).comp == 0);
+    assert((*ret).other == 1);
+  }
+
+  {
+    // count invocations of the projection
+    using it = IteratorT<int*>;
+
+    int a[]              = {1, 2, 3, 4};
+    int projection_count = 0;
+
+    auto ret = std::ranges::find_last(it(std::begin(a)), it(std::end(a)), 2, [&](int i) {
+                 ++projection_count;
+                 return i;
+               }).begin();
+    assert(ret == it(a + 1));
+    assert(*ret == 2);
+    if (std::bidirectional_iterator<it>) {
+      assert(projection_count == 3);
+    } else {
+      assert(projection_count == 4); // must go through entire list
+    }
+  }
+}
+
+template <class ElementT>
+class TriviallyComparable {
+  ElementT el_;
+
+public:
+  constexpr TriviallyComparable(ElementT el) : el_(el) {}
+  bool operator==(const TriviallyComparable&) const = default;
+};
+
+constexpr bool test() {
+  types::for_each(types::type_list<char, int, TriviallyComparable<char>>{}, []<class T> {
+    types::for_each(types::forward_iterator_list<T*>{}, []<class Iter> {
+      test_iterators<Iter>();
+      test_iterators<Iter, sentinel_wrapper<Iter>>();
+      test_iterators<Iter, sized_sentinel<Iter>>();
+    });
+  });
+
+  test_iterator_classes<forward_iterator>();
+  test_iterator_classes<bidirectional_iterator>();
+  test_iterator_classes<random_access_iterator>();
+  test_iterator_classes<std::type_identity_t>();
+
+  {
+    std::vector<std::vector<int>> vec = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+
+    auto view = vec | std::views::join;
+    assert(std::ranges::find_last(view.begin(), view.end(), 4).begin() == std::next(view.begin(), 3));
+    assert(std::ranges::find_last(view, 4).begin() == std::next(view.begin(), 3));
+  }
+
+  {
+    // check that an iterator is returned with a borrowing range
+    int a[] = {1, 2, 3, 4};
+
+    std::same_as<std::ranges::subrange<int*>> auto ret = std::ranges::find_last(std::views::all(a), 1);
+    assert(ret.begin() == a);
+    assert(*ret.begin() == 1);
+  }
+
+  {
+    // check that dangling ranges are dangling
+    std::same_as<std::ranges::dangling> auto ret = std::ranges::find_last(std::vector<int>(), 0);
+    (void)ret;
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp
new file mode 100644
index 0000000000000..107fcf900d3e4
--- /dev/null
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp
@@ -0,0 +1,261 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// template<forward_iterator I, sentinel_for<I> S, class Proj = identity,
+//          indirect_unary_predicate<projected<I, Proj>> Pred>
+//   constexpr subrange<I> ranges::find_last_if(I first, S last, Pred pred, Proj proj = {});
+// template<forward_range R, class Proj = identity,
+//          indirect_unary_predicate<projected<iterator_t<R>, Proj>> Pred>
+//   constexpr borrowed_subrange_t<R> ranges::find_last_if(R&& r, Pred pred, Proj proj = {});
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <ranges>
+
+#include "almost_satisfies_types.h"
+#include "test_iterators.h"
+
+struct Predicate {
+  bool operator()(int);
+};
+
+template <class It, class Sent = It>
+concept HasFindLastIfIt = requires(It it, Sent sent) { std::ranges::find_last_if(it, sent, Predicate{}); };
+static_assert(HasFindLastIfIt<int*>);
+static_assert(HasFindLastIfIt<forward_iterator<int*>>);
+static_assert(!HasFindLastIfIt<cpp20_input_iterator<int*>>);
+static_assert(!HasFindLastIfIt<ForwardIteratorNotDerivedFrom>);
+static_assert(!HasFindLastIfIt<ForwardIteratorNotIncrementable>);
+static_assert(!HasFindLastIfIt<forward_iterator<int*>, SentinelForNotSemiregular>);
+static_assert(!HasFindLastIfIt<forward_iterator<int*>, InputRangeNotSentinelEqualityComparableWith>);
+
+static_assert(!HasFindLastIfIt<int*, int>);
+static_assert(!HasFindLastIfIt<int, int*>);
+
+template <class Pred>
+concept HasFindLastIfPred = requires(int* it, Pred pred) { std::ranges::find_last_if(it, it, pred); };
+
+static_assert(!HasFindLastIfPred<IndirectUnaryPredicateNotCopyConstructible>);
+static_assert(!HasFindLastIfPred<IndirectUnaryPredicateNotPredicate>);
+
+template <class R>
+concept HasFindLastIfR = requires(R r) { std::ranges::find_last_if(r, Predicate{}); };
+static_assert(HasFindLastIfR<std::array<int, 0>>);
+static_assert(!HasFindLastIfR<int>);
+static_assert(!HasFindLastIfR<ForwardRangeNotDerivedFrom>);
+static_assert(!HasFindLastIfR<ForwardRangeNotIncrementable>);
+static_assert(!HasFindLastIfR<ForwardRangeNotSentinelSemiregular>);
+static_assert(!HasFindLastIfR<ForwardRangeNotSentinelEqualityComparableWith>);
+
+template <class It, class Sent>
+constexpr auto make_range(auto& a) {
+  return std::ranges::subrange(It(std::ranges::begin(a)), Sent(It(std::ranges::end(a))));
+}
+
+template <template <class> class IteratorT, template <class> class SentinelT>
+constexpr void test_iterator_classes() {
+  {
+    using it   = IteratorT<int*>;
+    using sent = SentinelT<it>;
+
+    {
+      int a[] = {1, 2, 3, 4};
+      std::same_as<std::ranges::subrange<it>> auto ret =
+          std::ranges::find_last_if(it(a), sent(it(a + 4)), [](int x) { return x == 4; });
+      assert(base(ret.begin()) == a + 3);
+      assert(*ret.begin() == 4);
+    }
+    {
+      int a[] = {1, 2, 3, 4};
+
+      std::same_as<std::ranges::subrange<it>> auto ret =
+          std::ranges::find_last_if(make_range<it, sent>(a), [](int x) { return x == 4; });
+      assert(ret.begin() == it(a + 3));
+      assert(*ret.begin() == 4);
+    }
+  }
+
+  { // check that an empty range works
+    using it   = IteratorT<std::ranges::iterator_t<std::array<int, 0>&>>;
+    using sent = SentinelT<it>;
+
+    {
+      std::array<int, 0> a = {};
+
+      auto ret = std::ranges::find_last_if(it(a.begin()), sent(it(a.end())), [](auto&&) { return true; }).begin();
+      assert(ret == it(a.end()));
+    }
+    {
+      std::array<int, 0> a = {};
+
+      auto ret = std::ranges::find_last_if(make_range<it, sent>(a), [](auto&&) { return true; }).begin();
+      assert(ret == it(a.end()));
+    }
+  }
+
+  { // check that last is returned with no match
+    using it   = IteratorT<int*>;
+    using sent = SentinelT<it>;
+
+    {
+      int a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last_if(it(a), sent(it(a + 3)), [](auto&&) { return false; }).begin();
+      assert(ret == it(a + 3));
+    }
+    {
+      int a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last_if(make_range<it, sent>(a), [](auto&&) { return false; }).begin();
+      assert(ret == it(a + 3));
+    }
+  }
+
+  { // check that the last element is returned
+    struct S {
+      int comp;
+      int other;
+    };
+    using it   = IteratorT<S*>;
+    using sent = SentinelT<it>;
+
+    S a[] = {{0, 0}, {0, 2}, {0, 1}};
+
+    auto ret = std::ranges::find_last_if(
+                   it(std::begin(a)), sent(it(std::end(a))), [](int c) { return c == 0; }, &S::comp)
+                   .begin();
+    assert(ret == it(a + 2));
+    assert((*ret).comp == 0);
+    assert((*ret).other == 1);
+  }
+
+  {
+    // count projection and predicate invocation count
+    {
+      int a[]              = {1, 2, 3, 4};
+      int predicate_count  = 0;
+      int projection_count = 0;
+
+      using it   = IteratorT<int*>;
+      using sent = SentinelT<it>;
+
+      auto ret =
+          std::ranges::find_last_if(
+              it(a),
+              sent(it(a + 4)),
+              [&](int i) {
+                ++predicate_count;
+                return i == 2;
+              },
+              [&](int i) {
+                ++projection_count;
+                return i;
+              })
+              .begin();
+      assert(ret == it(a + 1));
+      assert(*ret == 2);
+
+      if constexpr (std::bidirectional_iterator<it>) {
+        assert(predicate_count == 3);
+        assert(projection_count == 3);
+      } else {
+        assert(predicate_count == 4);
+        assert(projection_count == 4);
+      }
+    }
+  }
+}
+
+struct NonConstComparable {
+  friend constexpr bool operator==(const NonConstComparable&, const NonConstComparable&) { return false; }
+  friend constexpr bool operator==(NonConstComparable&, NonConstComparable&) { return false; }
+  friend constexpr bool operator==(const NonConstComparable&, NonConstComparable&) { return false; }
+  friend constexpr bool operator==(NonConstComparable&, const NonConstComparable&) { return true; }
+};
+
+// TODO: this should really use `std::const_iterator`
+template <class T>
+struct add_const_to_ptr {
+  using type = T;
+};
+template <class T>
+struct add_const_to_ptr<T*> {
+  using type = const T*;
+};
+template <class T>
+using add_const_to_ptr_t = typename add_const_to_ptr<T>::type;
+
+constexpr bool test() {
+  test_iterator_classes<std::type_identity_t, std::type_identity_t>();
+  test_iterator_classes<add_const_to_ptr_t, std::type_identity_t>();
+  test_iterator_classes<contiguous_iterator, std::type_identity_t>();
+  test_iterator_classes<random_access_iterator, std::type_identity_t>();
+  test_iterator_classes<bidirectional_iterator, std::type_identity_t>();
+  test_iterator_classes<forward_iterator, std::type_identity_t>();
+  test_iterator_classes<forward_iterator, sentinel_wrapper>();
+
+  {
+    // check that projections are used properly and that they are called with the iterator directly
+    {
+      int a[] = {1, 2, 3, 4};
+      auto ret =
+          std::ranges::find_last_if(a, a + 4, [&](int* i) { return i == a + 3; }, [](int& i) { return &i; }).begin();
+      assert(ret == a + 3);
+    }
+    {
+      int a[]  = {1, 2, 3, 4};
+      auto ret = std::ranges::find_last_if(a, [&](int* i) { return i == a + 3; }, [](int& i) { return &i; }).begin();
+      assert(ret == a + 3);
+    }
+  }
+
+  {
+    // check that ranges::dangling is returned
+    [[maybe_unused]] std::same_as<std::ranges::dangling> auto ret =
+        std::ranges::find_last_if(std::array{1, 2}, [](int) { return false; });
+  }
+
+  {
+    // check that a subrange is returned with a borrowing range
+    int a[] = {1, 2, 3, 4};
+    std::same_as<std::ranges::subrange<int*>> auto ret =
+        std::ranges::find_last_if(std::views::all(a), [](int) { return true; });
+    assert(ret.begin() == a + 3);
+    assert(*ret.begin() == 4);
+  }
+
+  {
+    // check that the return type of `iter::operator*` doesn't change
+    {
+      NonConstComparable a[] = {NonConstComparable{}};
+
+      auto ret = std::ranges::find_last_if(a, a + 1, [](auto&& e) { return e == NonConstComparable{}; }).begin();
+      assert(ret == a);
+    }
+    {
+      NonConstComparable a[] = {NonConstComparable{}};
+
+      auto ret = std::ranges::find_last_if(a, [](auto&& e) { return e == NonConstComparable{}; }).begin();
+      assert(ret == a);
+    }
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp
new file mode 100644
index 0000000000000..6602ac569c679
--- /dev/null
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp
@@ -0,0 +1,263 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// template<forward_iterator I, sentinel_for<I> S, class Proj = identity,
+//          indirect_unary_predicate<projected<I, Proj>> Pred>
+//   constexpr subrange<I> ranges::find_last_if_not(I first, S last, Pred pred, Proj proj = {});
+// template<forward_range R, class Proj = identity,
+//          indirect_unary_predicate<projected<iterator_t<R>, Proj>> Pred>
+//   constexpr borrowed_subrange_t<R> ranges::find_last_if_not(R&& r, Pred pred, Proj proj = {});
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <ranges>
+
+#include "almost_satisfies_types.h"
+#include "test_iterators.h"
+
+struct Predicate {
+  bool operator()(int);
+};
+
+template <class It, class Sent = It>
+concept HasFindLastIfIt = requires(It it, Sent sent) { std::ranges::find_last_if_not(it, sent, Predicate{}); };
+static_assert(HasFindLastIfIt<int*>);
+static_assert(HasFindLastIfIt<forward_iterator<int*>>);
+static_assert(!HasFindLastIfIt<cpp20_input_iterator<int*>>);
+static_assert(!HasFindLastIfIt<ForwardIteratorNotDerivedFrom>);
+static_assert(!HasFindLastIfIt<ForwardIteratorNotIncrementable>);
+static_assert(!HasFindLastIfIt<forward_iterator<int*>, SentinelForNotSemiregular>);
+static_assert(!HasFindLastIfIt<forward_iterator<int*>, InputRangeNotSentinelEqualityComparableWith>);
+
+static_assert(!HasFindLastIfIt<int*, int>);
+static_assert(!HasFindLastIfIt<int, int*>);
+
+template <class Pred>
+concept HasFindLastIfPred = requires(int* it, Pred pred) { std::ranges::find_last_if_not(it, it, pred); };
+
+static_assert(!HasFindLastIfPred<IndirectUnaryPredicateNotCopyConstructible>);
+static_assert(!HasFindLastIfPred<IndirectUnaryPredicateNotPredicate>);
+
+template <class R>
+concept HasFindLastIfR = requires(R r) { std::ranges::find_last_if_not(r, Predicate{}); };
+static_assert(HasFindLastIfR<std::array<int, 0>>);
+static_assert(!HasFindLastIfR<int>);
+static_assert(!HasFindLastIfR<ForwardRangeNotDerivedFrom>);
+static_assert(!HasFindLastIfR<ForwardRangeNotIncrementable>);
+static_assert(!HasFindLastIfR<ForwardRangeNotSentinelSemiregular>);
+static_assert(!HasFindLastIfR<ForwardRangeNotSentinelEqualityComparableWith>);
+
+template <class It, class Sent>
+constexpr auto make_range(auto& a) {
+  return std::ranges::subrange(It(std::ranges::begin(a)), Sent(It(std::ranges::end(a))));
+}
+
+template <template <class> class IteratorT, template <class> class SentinelT>
+constexpr void test_iterator_classes() {
+  {
+    using it   = IteratorT<int*>;
+    using sent = SentinelT<it>;
+
+    {
+      int a[] = {1, 2, 3, 4};
+      std::same_as<std::ranges::subrange<it>> auto ret =
+          std::ranges::find_last_if_not(it(a), sent(it(a + 4)), [](int x) { return x != 4; });
+      assert(base(ret.begin()) == a + 3);
+      assert(*ret.begin() == 4);
+    }
+    {
+      int a[] = {1, 2, 3, 4};
+
+      std::same_as<std::ranges::subrange<it>> auto ret =
+          std::ranges::find_last_if_not(make_range<it, sent>(a), [](int x) { return x != 4; });
+      assert(ret.begin() == it(a + 3));
+      assert(*ret.begin() == 4);
+    }
+  }
+
+  { // check that an empty range works
+    using it   = IteratorT<std::ranges::iterator_t<std::array<int, 0>&>>;
+    using sent = SentinelT<it>;
+
+    {
+      std::array<int, 0> a = {};
+
+      auto ret = std::ranges::find_last_if_not(it(a.begin()), sent(it(a.end())), [](auto&&) { return false; }).begin();
+      assert(ret == it(a.end()));
+    }
+    {
+      std::array<int, 0> a = {};
+
+      auto ret = std::ranges::find_last_if_not(make_range<it, sent>(a), [](auto&&) { return false; }).begin();
+      assert(ret == it(a.end()));
+    }
+  }
+
+  { // check that last is returned with no match
+    using it   = IteratorT<int*>;
+    using sent = SentinelT<it>;
+
+    {
+      int a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last_if_not(it(a), sent(it(a + 3)), [](auto&&) { return true; }).begin();
+      assert(ret == it(a + 3));
+    }
+    {
+      int a[] = {1, 1, 1};
+
+      auto ret = std::ranges::find_last_if_not(make_range<it, sent>(a), [](auto&&) { return true; }).begin();
+      assert(ret == it(a + 3));
+    }
+  }
+
+  { // check that the last element is returned
+    struct S {
+      int comp;
+      int other;
+    };
+    using it   = IteratorT<S*>;
+    using sent = SentinelT<it>;
+
+    S a[] = {{0, 0}, {0, 2}, {0, 1}};
+
+    auto ret = std::ranges::find_last_if_not(
+                   it(std::begin(a)), sent(it(std::end(a))), [](int c) { return c != 0; }, &S::comp)
+                   .begin();
+    assert(ret == it(a + 2));
+    assert((*ret).comp == 0);
+    assert((*ret).other == 1);
+  }
+
+  {
+    // count projection and predicate invocation count
+    {
+      int a[]              = {1, 2, 3, 4};
+      int predicate_count  = 0;
+      int projection_count = 0;
+
+      using it   = IteratorT<int*>;
+      using sent = SentinelT<it>;
+
+      auto ret =
+          std::ranges::find_last_if_not(
+              it(a),
+              sent(it(a + 4)),
+              [&](int i) {
+                ++predicate_count;
+                return i != 2;
+              },
+              [&](int i) {
+                ++projection_count;
+                return i;
+              })
+              .begin();
+      assert(ret == it(a + 1));
+      assert(*ret == 2);
+
+      if constexpr (std::bidirectional_iterator<it>) {
+        assert(predicate_count == 3);
+        assert(projection_count == 3);
+      } else {
+        assert(predicate_count == 4);
+        assert(projection_count == 4);
+      }
+    }
+  }
+}
+
+struct NonConstComparable {
+  friend constexpr bool operator!=(const NonConstComparable&, const NonConstComparable&) { return true; }
+  friend constexpr bool operator!=(NonConstComparable&, NonConstComparable&) { return true; }
+  friend constexpr bool operator!=(const NonConstComparable&, NonConstComparable&) { return true; }
+  friend constexpr bool operator!=(NonConstComparable&, const NonConstComparable&) { return false; }
+};
+
+// TODO: this should really use `std::const_iterator`
+template <class T>
+struct add_const_to_ptr {
+  using type = T;
+};
+template <class T>
+struct add_const_to_ptr<T*> {
+  using type = const T*;
+};
+template <class T>
+using add_const_to_ptr_t = typename add_const_to_ptr<T>::type;
+
+constexpr bool test() {
+  test_iterator_classes<std::type_identity_t, std::type_identity_t>();
+  test_iterator_classes<add_const_to_ptr_t, std::type_identity_t>();
+  test_iterator_classes<contiguous_iterator, std::type_identity_t>();
+  test_iterator_classes<random_access_iterator, std::type_identity_t>();
+  test_iterator_classes<bidirectional_iterator, std::type_identity_t>();
+  test_iterator_classes<forward_iterator, std::type_identity_t>();
+  test_iterator_classes<forward_iterator, sentinel_wrapper>();
+
+  {
+    // check that projections are used properly and that they are called with the iterator directly
+    {
+      int a[]  = {1, 2, 3, 4};
+      auto ret = std::ranges::find_last_if_not(
+                     a, a + 4, [&](int* i) { return i != a + 3; }, [](int& i) { return &i; })
+                     .begin();
+      assert(ret == a + 3);
+    }
+    {
+      int a[] = {1, 2, 3, 4};
+      auto ret =
+          std::ranges::find_last_if_not(a, [&](int* i) { return i != a + 3; }, [](int& i) { return &i; }).begin();
+      assert(ret == a + 3);
+    }
+  }
+
+  {
+    // check that ranges::dangling is returned
+    [[maybe_unused]] std::same_as<std::ranges::dangling> auto ret =
+        std::ranges::find_last_if_not(std::array{1, 2}, [](int) { return false; });
+  }
+
+  {
+    // check that a subrange is returned with a borrowing range
+    int a[] = {1, 2, 3, 4};
+    std::same_as<std::ranges::subrange<int*>> auto ret =
+        std::ranges::find_last_if_not(std::views::all(a), [](int) { return false; });
+    assert(ret.begin() == a + 3);
+    assert(*ret.begin() == 4);
+  }
+
+  {
+    // check that the return type of `iter::operator*` doesn't change
+    {
+      NonConstComparable a[] = {NonConstComparable{}};
+
+      auto ret = std::ranges::find_last_if_not(a, a + 1, [](auto&& e) { return e != NonConstComparable{}; }).begin();
+      assert(ret == a);
+    }
+    {
+      NonConstComparable a[] = {NonConstComparable{}};
+
+      auto ret = std::ranges::find_last_if_not(a, [](auto&& e) { return e != NonConstComparable{}; }).begin();
+      assert(ret == a);
+    }
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp
index 8d9df94eee5a6..f69351209e4f1 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp
@@ -25,6 +25,7 @@
 #include <array>
 #include <concepts>
 #include <functional>
+#include <memory>
 #include <ranges>
 
 #include "almost_satisfies_types.h"
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp
index 2f13bdc76f793..9efe2513271ed 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp
@@ -25,6 +25,7 @@
 #include <array>
 #include <concepts>
 #include <functional>
+#include <memory>
 #include <ranges>
 
 #include "almost_satisfies_types.h"
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp
index 6f8a1a20c965c..571da879ed3f5 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp
@@ -25,6 +25,7 @@
 #include <array>
 #include <concepts>
 #include <functional>
+#include <memory>
 #include <ranges>
 
 #include "almost_satisfies_types.h"
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index 5323bb1bc1193..f7870485cfefc 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -28,6 +28,8 @@
 #include <algorithm>
 #include <array>
 #include <concepts>
+#include <cstddef>
+#include <ranges>
 
 #include "almost_satisfies_types.h"
 #include "MoveOnly.h"
@@ -463,75 +465,6 @@ constexpr bool test() {
     }
   }
 
-  // Complexity: At most 2 * ((last1 - first1) + (last2 - first2)) - 1 comparisons and applications of each projection.
-  {
-    std::array<Data, 5> r1{{{1}, {3}, {5}, {7}, {9}}};
-    std::array<Data, 5> r2{{{2}, {4}, {6}, {8}, {10}}};
-    std::array<int, 0> expected{};
-
-    const std::size_t maxOperation = 2 * (r1.size() + r2.size()) - 1;
-
-    // iterator overload
-    {
-      std::array<Data, 0> out{};
-      std::size_t numberOfComp  = 0;
-      std::size_t numberOfProj1 = 0;
-      std::size_t numberOfProj2 = 0;
-
-      const auto comp = [&numberOfComp](int x, int y) {
-        ++numberOfComp;
-        return x < y;
-      };
-
-      const auto proj1 = [&numberOfProj1](const Data& d) {
-        ++numberOfProj1;
-        return d.data;
-      };
-
-      const auto proj2 = [&numberOfProj2](const Data& d) {
-        ++numberOfProj2;
-        return d.data;
-      };
-
-      std::ranges::set_intersection(r1.begin(), r1.end(), r2.begin(), r2.end(), out.data(), comp, proj1, proj2);
-
-      assert(std::ranges::equal(out, expected, {}, &Data::data));
-      assert(numberOfComp < maxOperation);
-      assert(numberOfProj1 < maxOperation);
-      assert(numberOfProj2 < maxOperation);
-    }
-
-    // range overload
-    {
-      std::array<Data, 0> out{};
-      std::size_t numberOfComp  = 0;
-      std::size_t numberOfProj1 = 0;
-      std::size_t numberOfProj2 = 0;
-
-      const auto comp = [&numberOfComp](int x, int y) {
-        ++numberOfComp;
-        return x < y;
-      };
-
-      const auto proj1 = [&numberOfProj1](const Data& d) {
-        ++numberOfProj1;
-        return d.data;
-      };
-
-      const auto proj2 = [&numberOfProj2](const Data& d) {
-        ++numberOfProj2;
-        return d.data;
-      };
-
-      std::ranges::set_intersection(r1, r2, out.data(), comp, proj1, proj2);
-
-      assert(std::ranges::equal(out, expected, {}, &Data::data));
-      assert(numberOfComp < maxOperation);
-      assert(numberOfProj1 < maxOperation);
-      assert(numberOfProj2 < maxOperation);
-    }
-  }
-
   // Comparator convertible to bool
   {
     struct ConvertibleToBool {
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp
new file mode 100644
index 0000000000000..ddf4087ddd6cd
--- /dev/null
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp
@@ -0,0 +1,404 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Algorithmic complexity tests for both std::set_intersection and std::ranges::set_intersection
+
+// template<InputIterator InIter1, InputIterator InIter2, typename OutIter>
+//   requires OutputIterator<OutIter, InIter1::reference>
+//         && OutputIterator<OutIter, InIter2::reference>
+//         && HasLess<InIter2::value_type, InIter1::value_type>
+//         && HasLess<InIter1::value_type, InIter2::value_type>
+//   constexpr OutIter       // constexpr after C++17
+//   set_intersection(InIter1 first1, InIter1 last1, InIter2 first2, InIter2 last2,
+//                    OutIter result);
+//
+// template<input_iterator I1, sentinel_for<I1> S1, input_iterator I2, sentinel_for<I2> S2,
+//          weakly_incrementable O, class Comp = ranges::less,
+//          class Proj1 = identity, class Proj2 = identity>
+//   requires mergeable<I1, I2, O, Comp, Proj1, Proj2>
+//   constexpr set_intersection_result<I1, I2, O>
+//     set_intersection(I1 first1, S1 last1, I2 first2, S2 last2, O result,
+//                      Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {});                         // since C++20
+//
+// template<input_range R1, input_range R2, weakly_incrementable O,
+//          class Comp = ranges::less, class Proj1 = identity, class Proj2 = identity>
+//   requires mergeable<iterator_t<R1>, iterator_t<R2>, O, Comp, Proj1, Proj2>
+//   constexpr set_intersection_result<borrowed_iterator_t<R1>, borrowed_iterator_t<R2>, O>
+//     set_intersection(R1&& r1, R2&& r2, O result,
+//                      Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {});                         // since C++20
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <ranges>
+
+#include "test_iterators.h"
+
+namespace {
+
+// __debug_less will perform an additional comparison in an assertion
+static constexpr unsigned std_less_comparison_count_multiplier() noexcept {
+#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+  return 2;
+#else
+  return 1;
+#endif
+}
+
+struct [[nodiscard]] OperationCounts {
+  std::size_t comparisons{};
+  struct PerInput {
+    std::size_t proj{};
+    IteratorOpCounts iterops;
+
+    [[nodiscard]] constexpr bool isNotBetterThan(const PerInput& other) {
+      return proj >= other.proj && iterops.increments + iterops.decrements + iterops.zero_moves >=
+                                       other.iterops.increments + other.iterops.decrements + other.iterops.zero_moves;
+    }
+  };
+  std::array<PerInput, 2> in;
+
+  [[nodiscard]] constexpr bool isNotBetterThan(const OperationCounts& expect) {
+    return std_less_comparison_count_multiplier() * comparisons >= expect.comparisons &&
+           in[0].isNotBetterThan(expect.in[0]) && in[1].isNotBetterThan(expect.in[1]);
+  }
+};
+
+template <std::size_t ResultSize>
+struct counted_set_intersection_result {
+  std::array<int, ResultSize> result;
+  OperationCounts opcounts;
+
+  constexpr counted_set_intersection_result() = default;
+
+  constexpr explicit counted_set_intersection_result(std::array<int, ResultSize>&& contents) : result{contents} {}
+
+  constexpr void assertNotBetterThan(const counted_set_intersection_result& other) {
+    assert(result == other.result);
+    assert(opcounts.isNotBetterThan(other.opcounts));
+  }
+};
+
+template <std::size_t ResultSize>
+counted_set_intersection_result(std::array<int, ResultSize>) -> counted_set_intersection_result<ResultSize>;
+
+template <template <class...> class InIterType1,
+          template <class...>
+          class InIterType2,
+          class OutIterType,
+          std::size_t ResultSize,
+          std::ranges::input_range R1,
+          std::ranges::input_range R2>
+constexpr counted_set_intersection_result<ResultSize> counted_set_intersection(const R1& in1, const R2& in2) {
+  counted_set_intersection_result<ResultSize> out;
+
+  const auto comp = [&out](int x, int y) {
+    ++out.opcounts.comparisons;
+    return x < y;
+  };
+
+  operation_counting_iterator b1(InIterType1<decltype(in1.begin())>(in1.begin()), &out.opcounts.in[0].iterops);
+  operation_counting_iterator e1(InIterType1<decltype(in1.end()) >(in1.end()), &out.opcounts.in[0].iterops);
+
+  operation_counting_iterator b2(InIterType2<decltype(in2.begin())>(in2.begin()), &out.opcounts.in[1].iterops);
+  operation_counting_iterator e2(InIterType2<decltype(in2.end()) >(in2.end()), &out.opcounts.in[1].iterops);
+
+  std::set_intersection(b1, e1, b2, e2, OutIterType(out.result.data()), comp);
+
+  return out;
+}
+
+template <template <class...> class InIterType1,
+          template <class...>
+          class InIterType2,
+          class OutIterType,
+          std::size_t ResultSize,
+          std::ranges::input_range R1,
+          std::ranges::input_range R2>
+constexpr counted_set_intersection_result<ResultSize> counted_ranges_set_intersection(const R1& in1, const R2& in2) {
+  counted_set_intersection_result<ResultSize> out;
+
+  const auto comp = [&out](int x, int y) {
+    ++out.opcounts.comparisons;
+    return x < y;
+  };
+
+  const auto proj1 = [&out](const int& i) {
+    ++out.opcounts.in[0].proj;
+    return i;
+  };
+
+  const auto proj2 = [&out](const int& i) {
+    ++out.opcounts.in[1].proj;
+    return i;
+  };
+
+  operation_counting_iterator b1(InIterType1<decltype(in1.begin())>(in1.begin()), &out.opcounts.in[0].iterops);
+  operation_counting_iterator e1(InIterType1<decltype(in1.end()) >(in1.end()), &out.opcounts.in[0].iterops);
+
+  operation_counting_iterator b2(InIterType2<decltype(in2.begin())>(in2.begin()), &out.opcounts.in[1].iterops);
+  operation_counting_iterator e2(InIterType2<decltype(in2.end()) >(in2.end()), &out.opcounts.in[1].iterops);
+
+  std::ranges::subrange r1{b1, sentinel_wrapper<decltype(e1)>{e1}};
+  std::ranges::subrange r2{b2, sentinel_wrapper<decltype(e2)>{e2}};
+  std::same_as<std::ranges::set_intersection_result<decltype(e1), decltype(e2), OutIterType>> decltype(auto) result =
+      std::ranges::set_intersection(r1, r2, OutIterType{out.result.data()}, comp, proj1, proj2);
+  assert(base(result.in1) == base(e1));
+  assert(base(result.in2) == base(e2));
+  assert(base(result.out) == out.result.data() + out.result.size());
+
+  return out;
+}
+
+template <template <typename...> class In1, template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIter() {
+  // Worst-case complexity:
+  // Let N=(last1 - first1) and M=(last2 - first2)
+  // At most 2*(N+M) - 1 comparisons and applications of each projection.
+  // At most 2*(N+M) iterator mutations.
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
+
+    counted_set_intersection_result<0> expected;
+    expected.opcounts.comparisons              = 37;
+    expected.opcounts.in[0].proj               = 37;
+    expected.opcounts.in[0].iterops.increments = 30;
+    expected.opcounts.in[0].iterops.decrements = 0;
+    expected.opcounts.in[1]                    = expected.opcounts.in[0];
+
+    expected.assertNotBetterThan(counted_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+    expected.assertNotBetterThan(counted_ranges_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+  }
+
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+    counted_set_intersection_result expected(std::array{1, 3, 5, 7, 9, 11, 13, 15, 17, 19});
+    expected.opcounts.comparisons              = 38;
+    expected.opcounts.in[0].proj               = 38;
+    expected.opcounts.in[0].iterops.increments = 30;
+    expected.opcounts.in[0].iterops.decrements = 0;
+    expected.opcounts.in[1]                    = expected.opcounts.in[0];
+
+    expected.assertNotBetterThan(counted_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+    expected.assertNotBetterThan(counted_ranges_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+  }
+
+  // Lower complexity when there is low overlap between ranges: we can make 2*log(X) comparisons when one range
+  // has X elements that can be skipped over (and then 1 more to confirm that the value we found is equal).
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{15};
+
+    counted_set_intersection_result expected(std::array{15});
+    expected.opcounts.comparisons              = 9;
+    expected.opcounts.in[0].proj               = 9;
+    expected.opcounts.in[0].iterops.increments = 23;
+    expected.opcounts.in[0].iterops.decrements = 0;
+    expected.opcounts.in[1].proj               = 9;
+    expected.opcounts.in[1].iterops.increments = 1;
+    expected.opcounts.in[1].iterops.decrements = 0;
+
+    expected.assertNotBetterThan(counted_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+    expected.assertNotBetterThan(counted_ranges_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+  }
+
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{0, 16};
+    counted_set_intersection_result<0> expected;
+
+    expected.opcounts.comparisons              = 10;
+    expected.opcounts.in[0].proj               = 10;
+    expected.opcounts.in[0].iterops.increments = 24;
+    expected.opcounts.in[0].iterops.decrements = 0;
+    expected.opcounts.in[1].proj               = 10;
+    expected.opcounts.in[1].iterops.increments = 4;
+    expected.opcounts.in[1].iterops.decrements = 0;
+
+    expected.assertNotBetterThan(counted_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+    expected.assertNotBetterThan(counted_ranges_set_intersection<In1, In2, Out, expected.result.size()>(r1, r2));
+  }
+}
+
+template <template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1() {
+  //common_input_iterator
+  testComplexityParameterizedIter<forward_iterator, In2, Out>();
+  testComplexityParameterizedIter<bidirectional_iterator, In2, Out>();
+  testComplexityParameterizedIter<random_access_iterator, In2, Out>();
+}
+
+template <class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1In2() {
+  testComplexityParameterizedIterPermutateIn1<forward_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<bidirectional_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<random_access_iterator, Out>();
+}
+
+constexpr bool testComplexity() {
+  testComplexityParameterizedIterPermutateIn1In2<forward_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<random_access_iterator<int*>>();
+  return true;
+}
+
+template <template <typename...> class In1, template <typename...> class In2, class Out>
+constexpr void testComplexityGuaranteesParameterizedIter() {
+  // now a more generic validation of the complexity guarantees when searching for a single value
+  for (unsigned range_size = 1; range_size < 20; ++range_size) {
+    std::ranges::iota_view<int, int> r1(0, range_size);
+    for (int i : r1) {
+      // At most 2 * ((last1 - first1) + (last2 - first2)) - 1 comparisons
+      counted_set_intersection_result<1> expected(std::array{i});
+      expected.opcounts.comparisons              = 2 * (r1.size() + 1) - 1;
+      expected.opcounts.in[0].proj               = expected.opcounts.comparisons;
+      expected.opcounts.in[1].proj               = expected.opcounts.comparisons;
+      expected.opcounts.in[0].iterops.increments = 2 * r1.size();
+      expected.opcounts.in[1].iterops.increments = 2;
+      expected.opcounts.in[0].iterops.decrements = expected.opcounts.in[0].iterops.increments;
+      expected.opcounts.in[1].iterops.decrements = expected.opcounts.in[1].iterops.increments;
+
+      expected.assertNotBetterThan(
+          counted_set_intersection<In1, In2, Out, expected.result.size()>(r1, expected.result));
+      expected.assertNotBetterThan(
+          counted_ranges_set_intersection<In1, In2, Out, expected.result.size()>(r1, expected.result));
+    }
+  }
+}
+
+template <template <typename...> class In2, class Out>
+constexpr void testComplexityGuaranteesParameterizedIterPermutateIn1() {
+  //common_input_iterator
+  testComplexityGuaranteesParameterizedIter<forward_iterator, In2, Out>();
+  testComplexityGuaranteesParameterizedIter<bidirectional_iterator, In2, Out>();
+  testComplexityGuaranteesParameterizedIter<random_access_iterator, In2, Out>();
+}
+
+template <class Out>
+constexpr void testComplexityGuaranteesParameterizedIterPermutateIn1In2() {
+  testComplexityGuaranteesParameterizedIterPermutateIn1<forward_iterator, Out>();
+  testComplexityGuaranteesParameterizedIterPermutateIn1<bidirectional_iterator, Out>();
+  testComplexityGuaranteesParameterizedIterPermutateIn1<random_access_iterator, Out>();
+}
+
+constexpr bool testComplexityGuarantees() {
+  testComplexityGuaranteesParameterizedIterPermutateIn1In2<forward_iterator<int*>>();
+  testComplexityGuaranteesParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>();
+  testComplexityGuaranteesParameterizedIterPermutateIn1In2<random_access_iterator<int*>>();
+  return true;
+}
+
+constexpr bool testComplexityBasic() {
+  // Complexity: At most 2 * ((last1 - first1) + (last2 - first2)) - 1 comparisons and applications of each projection.
+  std::array<int, 5> r1{1, 3, 5, 7, 9};
+  std::array<int, 5> r2{2, 4, 6, 8, 10};
+  std::array<int, 0> expected{};
+
+  const std::size_t maxOperation = std_less_comparison_count_multiplier() * (2 * (r1.size() + r2.size()) - 1);
+
+  // std::set_intersection
+  {
+    std::array<int, 0> out{};
+    std::size_t numberOfComp = 0;
+
+    const auto comp = [&numberOfComp](int x, int y) {
+      ++numberOfComp;
+      return x < y;
+    };
+
+    std::set_intersection(r1.begin(), r1.end(), r2.begin(), r2.end(), out.data(), comp);
+
+    assert(std::ranges::equal(out, expected));
+    assert(numberOfComp <= maxOperation);
+  }
+
+  // ranges::set_intersection iterator overload
+  {
+    std::array<int, 0> out{};
+    std::size_t numberOfComp  = 0;
+    std::size_t numberOfProj1 = 0;
+    std::size_t numberOfProj2 = 0;
+
+    const auto comp = [&numberOfComp](int x, int y) {
+      ++numberOfComp;
+      return x < y;
+    };
+
+    const auto proj1 = [&numberOfProj1](int d) {
+      ++numberOfProj1;
+      return d;
+    };
+
+    const auto proj2 = [&numberOfProj2](int d) {
+      ++numberOfProj2;
+      return d;
+    };
+
+    std::ranges::set_intersection(r1.begin(), r1.end(), r2.begin(), r2.end(), out.data(), comp, proj1, proj2);
+
+    assert(std::ranges::equal(out, expected));
+    assert(numberOfComp <= maxOperation);
+    assert(numberOfProj1 <= maxOperation);
+    assert(numberOfProj2 <= maxOperation);
+  }
+
+  // ranges::set_intersection range overload
+  {
+    std::array<int, 0> out{};
+    std::size_t numberOfComp  = 0;
+    std::size_t numberOfProj1 = 0;
+    std::size_t numberOfProj2 = 0;
+
+    const auto comp = [&numberOfComp](int x, int y) {
+      ++numberOfComp;
+      return x < y;
+    };
+
+    const auto proj1 = [&numberOfProj1](int d) {
+      ++numberOfProj1;
+      return d;
+    };
+
+    const auto proj2 = [&numberOfProj2](int d) {
+      ++numberOfProj2;
+      return d;
+    };
+
+    std::ranges::set_intersection(r1, r2, out.data(), comp, proj1, proj2);
+
+    assert(std::ranges::equal(out, expected));
+    assert(numberOfComp < maxOperation);
+    assert(numberOfProj1 < maxOperation);
+    assert(numberOfProj2 < maxOperation);
+  }
+  return true;
+}
+
+} // unnamed namespace
+
+int main(int, char**) {
+  testComplexityBasic();
+  testComplexity();
+  testComplexityGuarantees();
+
+  static_assert(testComplexityBasic());
+  static_assert(testComplexity());
+
+  // we hit maximum constexpr evaluation step limit even if we split this into
+  // the 3 types of the first type layer, so let's skip the constexpr validation
+  // static_assert(testComplexityGuarantees());
+
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
index e2581fbf2fa6c..9bd28140349b6 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// This test appears to hang with picolibc & qemu.
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
+// This test did pass but is very slow when run using qemu. ~7 minutes on a
+// Neoverse N1 (AArch64) server core.
+// REQUIRES: long_tests
 
 // <algorithm>
 
diff --git a/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp
index e01a86d9f746f..0624a6c2d49c7 100644
--- a/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp
+++ b/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp
@@ -116,6 +116,11 @@ constexpr bool test_all() {
   dangling_1st(std::ranges::find, in, x);
   dangling_1st(std::ranges::find_if, in, unary_pred);
   dangling_1st(std::ranges::find_if_not, in, unary_pred);
+#if TEST_STD_VER >= 23
+  dangling_1st(std::ranges::find_last, in, x);
+  dangling_1st(std::ranges::find_last_if, in, unary_pred);
+  dangling_1st(std::ranges::find_last_if_not, in, unary_pred);
+#endif
   dangling_1st(std::ranges::find_first_of, in, in2);
   dangling_1st(std::ranges::adjacent_find, in);
   dangling_1st<mismatch_result<dangling, InIter>>(std::ranges::mismatch, in, in2);
diff --git a/libcxx/test/std/algorithms/ranges_robust_against_nonbool.compile.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_nonbool.compile.pass.cpp
index a5668f20ccfd2..5203ee18d7bb0 100644
--- a/libcxx/test/std/algorithms/ranges_robust_against_nonbool.compile.pass.cpp
+++ b/libcxx/test/std/algorithms/ranges_robust_against_nonbool.compile.pass.cpp
@@ -53,6 +53,10 @@ void f(Iterator it, Range in, Iterator out, std::size_t n, Value const& val, std
   in_pred(std::ranges::none_of, pred1);
   in_pred(std::ranges::find_if, pred1);
   in_pred(std::ranges::find_if_not, pred1);
+#if TEST_STD_VER >= 23
+  in_pred(std::ranges::find_last_if, pred1);
+  in_pred(std::ranges::find_last_if_not, pred1);
+#endif
   in_in_pred(std::ranges::find_first_of, pred2);
   in_pred(std::ranges::adjacent_find, pred2);
   in_in_pred(std::ranges::mismatch, pred2);
diff --git a/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp
index 139f1999bc9dc..ca1433b778751 100644
--- a/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp
+++ b/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp
@@ -82,6 +82,11 @@ constexpr void run_tests() {
   test(std::ranges::find, in, x);
   test(std::ranges::find_if, in, unary_pred);
   test(std::ranges::find_if_not, in, unary_pred);
+#if TEST_STD_VER >= 23
+  test(std::ranges::find_last, in, x);
+  test(std::ranges::find_last_if, in, unary_pred);
+  test(std::ranges::find_last_if_not, in, unary_pred);
+#endif
   test(std::ranges::find_first_of, in, in2);
   test(std::ranges::adjacent_find, in);
   test(std::ranges::mismatch, in, in2);
diff --git a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp
new file mode 100644
index 0000000000000..2dc7f5c765419
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <atomic>
+//
+// template <class T>
+// class atomic;
+//
+// static constexpr bool is_always_lock_free;
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+
+#include "test_macros.h"
+#include "atomic_helpers.h"
+
+template <typename T>
+void check_always_lock_free(std::atomic<T> const& a) {
+  using InfoT = LockFreeStatusInfo<T>;
+
+  constexpr std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic<T>::is_always_lock_free;
+
+  // If we know the status of T for sure, validate the exact result of the function.
+  if constexpr (InfoT::status_known) {
+    constexpr LockFreeStatus known_status = InfoT::value;
+    if constexpr (known_status == LockFreeStatus::always) {
+      static_assert(is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+      assert(a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+    } else if constexpr (known_status == LockFreeStatus::never) {
+      static_assert(!is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+      assert(!a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+    } else {
+      assert(a.is_lock_free() || !a.is_lock_free()); // This is kinda dumb, but we might as well call the function once.
+    }
+  }
+
+  // In all cases, also sanity-check it based on the implication always-lock-free => lock-free.
+  if (is_always_lock_free) {
+    std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
+    assert(is_lock_free);
+  }
+  ASSERT_NOEXCEPT(a.is_lock_free());
+}
+
+#define CHECK_ALWAYS_LOCK_FREE(T)                                                                                      \
+  do {                                                                                                                 \
+    typedef T type;                                                                                                    \
+    type obj{};                                                                                                        \
+    std::atomic<type> a(obj);                                                                                          \
+    check_always_lock_free(a);                                                                                         \
+  } while (0)
+
+void test() {
+  char c = 'x';
+  check_always_lock_free(std::atomic<char>(c));
+
+  int i = 0;
+  check_always_lock_free(std::atomic<int>(i));
+
+  float f = 0.f;
+  check_always_lock_free(std::atomic<float>(f));
+
+  int* p = &i;
+  check_always_lock_free(std::atomic<int*>(p));
+
+  CHECK_ALWAYS_LOCK_FREE(bool);
+  CHECK_ALWAYS_LOCK_FREE(char);
+  CHECK_ALWAYS_LOCK_FREE(signed char);
+  CHECK_ALWAYS_LOCK_FREE(unsigned char);
+#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
+  CHECK_ALWAYS_LOCK_FREE(char8_t);
+#endif
+  CHECK_ALWAYS_LOCK_FREE(char16_t);
+  CHECK_ALWAYS_LOCK_FREE(char32_t);
+  CHECK_ALWAYS_LOCK_FREE(wchar_t);
+  CHECK_ALWAYS_LOCK_FREE(short);
+  CHECK_ALWAYS_LOCK_FREE(unsigned short);
+  CHECK_ALWAYS_LOCK_FREE(int);
+  CHECK_ALWAYS_LOCK_FREE(unsigned int);
+  CHECK_ALWAYS_LOCK_FREE(long);
+  CHECK_ALWAYS_LOCK_FREE(unsigned long);
+  CHECK_ALWAYS_LOCK_FREE(long long);
+  CHECK_ALWAYS_LOCK_FREE(unsigned long long);
+  CHECK_ALWAYS_LOCK_FREE(std::nullptr_t);
+  CHECK_ALWAYS_LOCK_FREE(void*);
+  CHECK_ALWAYS_LOCK_FREE(float);
+  CHECK_ALWAYS_LOCK_FREE(double);
+  CHECK_ALWAYS_LOCK_FREE(long double);
+#if __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
+  CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(1 * sizeof(int)))));
+  CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(2 * sizeof(int)))));
+  CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(4 * sizeof(int)))));
+  CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(16 * sizeof(int)))));
+  CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(32 * sizeof(int)))));
+  CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(1 * sizeof(float)))));
+  CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(2 * sizeof(float)))));
+  CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(4 * sizeof(float)))));
+  CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(16 * sizeof(float)))));
+  CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(32 * sizeof(float)))));
+  CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(1 * sizeof(double)))));
+  CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(2 * sizeof(double)))));
+  CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(4 * sizeof(double)))));
+  CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(16 * sizeof(double)))));
+  CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(32 * sizeof(double)))));
+#endif // __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
+  CHECK_ALWAYS_LOCK_FREE(struct Empty{});
+  CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
+  CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
+  CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
+  CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
+  CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
+  CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
+  CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
+  CHECK_ALWAYS_LOCK_FREE(struct Padding {
+    char c; /* padding */
+    long long int i;
+  });
+  CHECK_ALWAYS_LOCK_FREE(union IntFloat {
+    int i;
+    float f;
+  });
+  CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char{foo});
+
+  // C macro and static constexpr must be consistent.
+  enum class CharEnumClass : char { foo };
+  static_assert(std::atomic<bool>::is_always_lock_free == (2 == ATOMIC_BOOL_LOCK_FREE), "");
+  static_assert(std::atomic<char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+  static_assert(std::atomic<CharEnumClass>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+  static_assert(std::atomic<signed char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+  static_assert(std::atomic<unsigned char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
+  static_assert(std::atomic<char8_t>::is_always_lock_free == (2 == ATOMIC_CHAR8_T_LOCK_FREE), "");
+#endif
+  static_assert(std::atomic<char16_t>::is_always_lock_free == (2 == ATOMIC_CHAR16_T_LOCK_FREE), "");
+  static_assert(std::atomic<char32_t>::is_always_lock_free == (2 == ATOMIC_CHAR32_T_LOCK_FREE), "");
+  static_assert(std::atomic<wchar_t>::is_always_lock_free == (2 == ATOMIC_WCHAR_T_LOCK_FREE), "");
+  static_assert(std::atomic<short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
+  static_assert(std::atomic<unsigned short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
+  static_assert(std::atomic<int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
+  static_assert(std::atomic<unsigned int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
+  static_assert(std::atomic<long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
+  static_assert(std::atomic<unsigned long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
+  static_assert(std::atomic<long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
+  static_assert(std::atomic<unsigned long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
+  static_assert(std::atomic<void*>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
+  static_assert(std::atomic<std::nullptr_t>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
+
+#if TEST_STD_VER >= 20
+  static_assert(std::atomic_signed_lock_free::is_always_lock_free, "");
+  static_assert(std::atomic_unsigned_lock_free::is_always_lock_free, "");
+#endif
+}
+
+int main(int, char**) {
+  test();
+  return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp b/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp
deleted file mode 100644
index 6d6e6477bc251..0000000000000
--- a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: c++03, c++11, c++14
-
-// <atomic>
-
-// static constexpr bool is_always_lock_free;
-
-#include <atomic>
-#include <cassert>
-#include <cstddef>
-
-#include "test_macros.h"
-
-template <typename T>
-void checkAlwaysLockFree() {
-  if (std::atomic<T>::is_always_lock_free) {
-    assert(std::atomic<T>().is_lock_free());
-  }
-}
-
-void run()
-{
-// structs and unions can't be defined in the template invocation.
-// Work around this with a typedef.
-#define CHECK_ALWAYS_LOCK_FREE(T)                                              \
-  do {                                                                         \
-    typedef T type;                                                            \
-    checkAlwaysLockFree<type>();                                               \
-  } while (0)
-
-    CHECK_ALWAYS_LOCK_FREE(bool);
-    CHECK_ALWAYS_LOCK_FREE(char);
-    CHECK_ALWAYS_LOCK_FREE(signed char);
-    CHECK_ALWAYS_LOCK_FREE(unsigned char);
-#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
-    CHECK_ALWAYS_LOCK_FREE(char8_t);
-#endif
-    CHECK_ALWAYS_LOCK_FREE(char16_t);
-    CHECK_ALWAYS_LOCK_FREE(char32_t);
-    CHECK_ALWAYS_LOCK_FREE(wchar_t);
-    CHECK_ALWAYS_LOCK_FREE(short);
-    CHECK_ALWAYS_LOCK_FREE(unsigned short);
-    CHECK_ALWAYS_LOCK_FREE(int);
-    CHECK_ALWAYS_LOCK_FREE(unsigned int);
-    CHECK_ALWAYS_LOCK_FREE(long);
-    CHECK_ALWAYS_LOCK_FREE(unsigned long);
-    CHECK_ALWAYS_LOCK_FREE(long long);
-    CHECK_ALWAYS_LOCK_FREE(unsigned long long);
-    CHECK_ALWAYS_LOCK_FREE(std::nullptr_t);
-    CHECK_ALWAYS_LOCK_FREE(void*);
-    CHECK_ALWAYS_LOCK_FREE(float);
-    CHECK_ALWAYS_LOCK_FREE(double);
-    CHECK_ALWAYS_LOCK_FREE(long double);
-#if __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
-    CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(1 * sizeof(int)))));
-    CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(2 * sizeof(int)))));
-    CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(4 * sizeof(int)))));
-    CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(16 * sizeof(int)))));
-    CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(32 * sizeof(int)))));
-    CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(1 * sizeof(float)))));
-    CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(2 * sizeof(float)))));
-    CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(4 * sizeof(float)))));
-    CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(16 * sizeof(float)))));
-    CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(32 * sizeof(float)))));
-    CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(1 * sizeof(double)))));
-    CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(2 * sizeof(double)))));
-    CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(4 * sizeof(double)))));
-    CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(16 * sizeof(double)))));
-    CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(32 * sizeof(double)))));
-#endif // __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
-    CHECK_ALWAYS_LOCK_FREE(struct Empty {});
-    CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
-    CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
-    CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
-    CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
-    CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
-    CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
-    CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
-    CHECK_ALWAYS_LOCK_FREE(struct Padding { char c; /* padding */ long long int i; });
-    CHECK_ALWAYS_LOCK_FREE(union IntFloat { int i; float f; });
-    CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char { foo });
-
-    // C macro and static constexpr must be consistent.
-    enum class CharEnumClass : char { foo };
-    static_assert(std::atomic<bool>::is_always_lock_free == (2 == ATOMIC_BOOL_LOCK_FREE), "");
-    static_assert(std::atomic<char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
-    static_assert(std::atomic<CharEnumClass>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
-    static_assert(std::atomic<signed char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
-    static_assert(std::atomic<unsigned char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
-#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
-    static_assert(std::atomic<char8_t>::is_always_lock_free == (2 == ATOMIC_CHAR8_T_LOCK_FREE), "");
-#endif
-    static_assert(std::atomic<char16_t>::is_always_lock_free == (2 == ATOMIC_CHAR16_T_LOCK_FREE), "");
-    static_assert(std::atomic<char32_t>::is_always_lock_free == (2 == ATOMIC_CHAR32_T_LOCK_FREE), "");
-    static_assert(std::atomic<wchar_t>::is_always_lock_free == (2 == ATOMIC_WCHAR_T_LOCK_FREE), "");
-    static_assert(std::atomic<short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
-    static_assert(std::atomic<unsigned short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
-    static_assert(std::atomic<int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
-    static_assert(std::atomic<unsigned int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
-    static_assert(std::atomic<long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
-    static_assert(std::atomic<unsigned long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
-    static_assert(std::atomic<long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
-    static_assert(std::atomic<unsigned long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
-    static_assert(std::atomic<void*>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
-    static_assert(std::atomic<std::nullptr_t>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
-
-#if TEST_STD_VER >= 20
-    static_assert(std::atomic_signed_lock_free::is_always_lock_free, "");
-    static_assert(std::atomic_unsigned_lock_free::is_always_lock_free, "");
-#endif
-}
-
-int main(int, char**) { run(); return 0; }
diff --git a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
index 94f65e3b4b669..78e46c0397951 100644
--- a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
@@ -9,7 +9,10 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <atomic>
-
+//
+// template <class T>
+// class atomic_ref;
+//
 // static constexpr bool is_always_lock_free;
 // bool is_lock_free() const noexcept;
 
@@ -18,10 +21,29 @@
 #include <concepts>
 
 #include "test_macros.h"
+#include "atomic_helpers.h"
 
 template <typename T>
-void check_always_lock_free(std::atomic_ref<T> const a) {
-  std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+void check_always_lock_free(std::atomic_ref<T> const& a) {
+  using InfoT = LockFreeStatusInfo<T>;
+
+  constexpr std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+
+  // If we know the status of T for sure, validate the exact result of the function.
+  if constexpr (InfoT::status_known) {
+    constexpr LockFreeStatus known_status = InfoT::value;
+    if constexpr (known_status == LockFreeStatus::always) {
+      static_assert(is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+      assert(a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+    } else if constexpr (known_status == LockFreeStatus::never) {
+      static_assert(!is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+      assert(!a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+    } else {
+      assert(a.is_lock_free() || !a.is_lock_free()); // This is kinda dumb, but we might as well call the function once.
+    }
+  }
+
+  // In all cases, also sanity-check it based on the implication always-lock-free => lock-free.
   if (is_always_lock_free) {
     std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
     assert(is_lock_free);
@@ -32,11 +54,15 @@ void check_always_lock_free(std::atomic_ref<T> const a) {
 #define CHECK_ALWAYS_LOCK_FREE(T)                                                                                      \
   do {                                                                                                                 \
     typedef T type;                                                                                                    \
-    type obj{};                                                                                                        \
-    check_always_lock_free(std::atomic_ref<type>(obj));                                                                \
+    alignas(std::atomic_ref<type>::required_alignment) type obj{};                                                     \
+    std::atomic_ref<type> a(obj);                                                                                      \
+    check_always_lock_free(a);                                                                                         \
   } while (0)
 
 void test() {
+  char c = 'x';
+  check_always_lock_free(std::atomic_ref<char>(c));
+
   int i = 0;
   check_always_lock_free(std::atomic_ref<int>(i));
 
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
index 86e0cba4dbf02..4fc453fb0ff11 100644
--- a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -12,6 +12,7 @@
 #include <atomic>
 #include <cassert>
 #include <concepts>
+#include <cstddef>
 
 template <typename T>
 constexpr void check_required_alignment() {
diff --git a/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
index fead6e2e5f6c2..817a70d2ce364 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
@@ -94,9 +94,11 @@ void test() {
   TEST_IGNORE_NODISCARD a.compare_exchange_weak(v, v);
   TEST_IGNORE_NODISCARD a.compare_exchange_strong(v, v, m);
 
+#if TEST_STD_VER >= 20
   a.wait(v);
   a.notify_one();
   a.notify_all();
+#endif
 }
 
 void test() {
diff --git a/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
index 961aed3b4fb1a..c62127f3883b0 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
@@ -128,9 +128,11 @@ void test() {
   a += 0;
   a -= 0;
 
+#if TEST_STD_VER >= 20
   a.wait(v);
   a.notify_one();
   a.notify_all();
+#endif
 }
 
 void test() {
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
index 0ec530c922e70..fc159b15e78e1 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
@@ -7,12 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // XFAIL: !has-1024-bit-atomics
 
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
 // XFAIL: availability-synchronization_library-missing
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
index c21b67d479ae2..330d8a44bfc2f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
@@ -7,12 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // XFAIL: !has-1024-bit-atomics
 
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
 // XFAIL: availability-synchronization_library-missing
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
index af99113f13499..7c5169b64cbe3 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
@@ -7,12 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // XFAIL: !has-1024-bit-atomics
 
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
 // XFAIL: availability-synchronization_library-missing
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
index bb8c64593b54b..c84eecff3eac4 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
@@ -7,12 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // XFAIL: !has-1024-bit-atomics
 
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
 // XFAIL: availability-synchronization_library-missing
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
index 2bac7cbc93ad4..29881891ffd11 100644
--- a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
+++ b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
@@ -101,6 +101,8 @@
 // using std::atomic_fetch_sub_explicit                   // see below
 // using std::atomic_fetch_or                             // see below
 // using std::atomic_fetch_or_explicit                    // see below
+// using std::atomic_fetch_xor                            // see below
+// using std::atomic_fetch_xor_explicit                   // see below
 // using std::atomic_fetch_and                            // see below
 // using std::atomic_fetch_and_explicit                   // see below
 // using std::atomic_flag_test_and_set                    // see below
@@ -222,6 +224,8 @@ void f() {
   using ::atomic_fetch_or_explicit;
   using ::atomic_fetch_sub;
   using ::atomic_fetch_sub_explicit;
+  using ::atomic_fetch_xor;
+  using ::atomic_fetch_xor_explicit;
   using ::atomic_flag_clear;
   using ::atomic_flag_clear_explicit;
   using ::atomic_flag_test_and_set;
diff --git a/libcxx/test/std/concepts/concepts.lang/concept.common/common_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.lang/concept.common/common_with.compile.pass.cpp
index 5d80c1cba2655..d72df6dbb3ca9 100644
--- a/libcxx/test/std/concepts/concepts.lang/concept.common/common_with.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.lang/concept.common/common_with.compile.pass.cpp
@@ -262,17 +262,16 @@ struct BadBasicCommonType {
   // should be placed so the test doesn't get deleted.
 };
 
-namespace std {
 template <>
-struct common_type<BadBasicCommonType, int> {
+struct std::common_type<BadBasicCommonType, int> {
   using type = BadBasicCommonType;
 };
 
 template <>
-struct common_type<int, BadBasicCommonType> {
+struct std::common_type<int, BadBasicCommonType> {
   using type = int;
 };
-} // namespace std
+
 static_assert(requires {
   typename std::common_type_t<BadBasicCommonType, int>;
 });
@@ -289,17 +288,16 @@ static_assert(!std::convertible_to<DullCommonType, int>);
 struct T1 {};
 static_assert(!std::convertible_to<DullCommonType, T1>);
 
-namespace std {
 template <>
-struct common_type<T1, int> {
+struct std::common_type<T1, int> {
   using type = DullCommonType;
 };
 
 template <>
-struct common_type<int, T1> {
+struct std::common_type<int, T1> {
   using type = DullCommonType;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T1, int>());
 static_assert(!CheckCommonWith<T1, int>());
 
@@ -314,17 +312,16 @@ struct T2 {};
 static_assert(
     !std::convertible_to<CommonTypeImplicitlyConstructibleFromInt, T2>);
 
-namespace std {
 template <>
-struct common_type<T2, int> {
+struct std::common_type<T2, int> {
   using type = CommonTypeImplicitlyConstructibleFromInt;
 };
 
 template <>
-struct common_type<int, T2> {
+struct std::common_type<int, T2> {
   using type = CommonTypeImplicitlyConstructibleFromInt;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T2, int>());
 static_assert(!CheckCommonWith<T2, int>());
 
@@ -339,17 +336,16 @@ struct T3 {};
 static_assert(
     !std::convertible_to<CommonTypeExplicitlyConstructibleFromInt, T2>);
 
-namespace std {
 template <>
-struct common_type<T3, int> {
+struct std::common_type<T3, int> {
   using type = CommonTypeExplicitlyConstructibleFromInt;
 };
 
 template <>
-struct common_type<int, T3> {
+struct std::common_type<int, T3> {
   using type = CommonTypeExplicitlyConstructibleFromInt;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T3, int>());
 static_assert(!CheckCommonWith<T3, int>());
 
@@ -361,17 +357,16 @@ static_assert(requires(T4 t4) {
   static_cast<CommonTypeImplicitlyConstructibleFromT4>(t4);
 });
 
-namespace std {
 template <>
-struct common_type<T4, int> {
+struct std::common_type<T4, int> {
   using type = CommonTypeImplicitlyConstructibleFromT4;
 };
 
 template <>
-struct common_type<int, T4> {
+struct std::common_type<int, T4> {
   using type = CommonTypeImplicitlyConstructibleFromT4;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T4, int>());
 static_assert(!CheckCommonWith<T4, int>());
 
@@ -383,17 +378,16 @@ static_assert(requires(T5 t5) {
   static_cast<CommonTypeExplicitlyConstructibleFromT5>(t5);
 });
 
-namespace std {
 template <>
-struct common_type<T5, int> {
+struct std::common_type<T5, int> {
   using type = CommonTypeExplicitlyConstructibleFromT5;
 };
 
 template <>
-struct common_type<int, T5> {
+struct std::common_type<int, T5> {
   using type = CommonTypeExplicitlyConstructibleFromT5;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T5, int>());
 static_assert(!CheckCommonWith<T5, int>());
 
@@ -403,113 +397,111 @@ struct CommonTypeNoCommonReference {
   CommonTypeNoCommonReference(int);
 };
 
-namespace std {
 template <>
-struct common_type<T6, int> {
+struct std::common_type<T6, int> {
   using type = CommonTypeNoCommonReference;
 };
 
 template <>
-struct common_type<int, T6> {
+struct std::common_type<int, T6> {
   using type = CommonTypeNoCommonReference;
 };
 
 template <>
-struct common_type<T6&, int&> {};
+struct std::common_type<T6&, int&> {};
 
 template <>
-struct common_type<int&, T6&> {};
+struct std::common_type<int&, T6&> {};
 
 template <>
-struct common_type<T6&, const int&> {};
+struct std::common_type<T6&, const int&> {};
 
 template <>
-struct common_type<int&, const T6&> {};
+struct std::common_type<int&, const T6&> {};
 
 template <>
-struct common_type<T6&, volatile int&> {};
+struct std::common_type<T6&, volatile int&> {};
 
 template <>
-struct common_type<int&, volatile T6&> {};
+struct std::common_type<int&, volatile T6&> {};
 
 template <>
-struct common_type<T6&, const volatile int&> {};
+struct std::common_type<T6&, const volatile int&> {};
 
 template <>
-struct common_type<int&, const volatile T6&> {};
+struct std::common_type<int&, const volatile T6&> {};
 
 template <>
-struct common_type<const T6&, int&> {};
+struct std::common_type<const T6&, int&> {};
 
 template <>
-struct common_type<const int&, T6&> {};
+struct std::common_type<const int&, T6&> {};
 
 template <>
-struct common_type<const T6&, const int&> {};
+struct std::common_type<const T6&, const int&> {};
 
 template <>
-struct common_type<const int&, const T6&> {};
+struct std::common_type<const int&, const T6&> {};
 
 template <>
-struct common_type<const T6&, volatile int&> {};
+struct std::common_type<const T6&, volatile int&> {};
 
 template <>
-struct common_type<const int&, volatile T6&> {};
+struct std::common_type<const int&, volatile T6&> {};
 
 template <>
-struct common_type<const T6&, const volatile int&> {};
+struct std::common_type<const T6&, const volatile int&> {};
 
 template <>
-struct common_type<const int&, const volatile T6&> {};
+struct std::common_type<const int&, const volatile T6&> {};
 
 template <>
-struct common_type<volatile T6&, int&> {};
+struct std::common_type<volatile T6&, int&> {};
 
 template <>
-struct common_type<volatile int&, T6&> {};
+struct std::common_type<volatile int&, T6&> {};
 
 template <>
-struct common_type<volatile T6&, const int&> {};
+struct std::common_type<volatile T6&, const int&> {};
 
 template <>
-struct common_type<volatile int&, const T6&> {};
+struct std::common_type<volatile int&, const T6&> {};
 
 template <>
-struct common_type<volatile T6&, volatile int&> {};
+struct std::common_type<volatile T6&, volatile int&> {};
 
 template <>
-struct common_type<volatile int&, volatile T6&> {};
+struct std::common_type<volatile int&, volatile T6&> {};
 
 template <>
-struct common_type<volatile T6&, const volatile int&> {};
+struct std::common_type<volatile T6&, const volatile int&> {};
 
 template <>
-struct common_type<volatile int&, const volatile T6&> {};
+struct std::common_type<volatile int&, const volatile T6&> {};
 
 template <>
-struct common_type<const volatile T6&, int&> {};
+struct std::common_type<const volatile T6&, int&> {};
 
 template <>
-struct common_type<const volatile int&, T6&> {};
+struct std::common_type<const volatile int&, T6&> {};
 
 template <>
-struct common_type<const volatile T6&, const int&> {};
+struct std::common_type<const volatile T6&, const int&> {};
 
 template <>
-struct common_type<const volatile int&, const T6&> {};
+struct std::common_type<const volatile int&, const T6&> {};
 
 template <>
-struct common_type<const volatile T6&, volatile int&> {};
+struct std::common_type<const volatile T6&, volatile int&> {};
 
 template <>
-struct common_type<const volatile int&, volatile T6&> {};
+struct std::common_type<const volatile int&, volatile T6&> {};
 
 template <>
-struct common_type<const volatile T6&, const volatile int&> {};
+struct std::common_type<const volatile T6&, const volatile int&> {};
 
 template <>
-struct common_type<const volatile int&, const volatile T6&> {};
-} // namespace std
+struct std::common_type<const volatile int&, const volatile T6&> {};
 
 template <typename T, typename U>
 constexpr bool HasCommonReference() noexcept {
@@ -526,177 +518,176 @@ struct CommonTypeNoMetaCommonReference {
   CommonTypeNoMetaCommonReference(int);
 };
 
-namespace std {
 template <>
-struct common_type<T7, int> {
+struct std::common_type<T7, int> {
   using type = CommonTypeNoMetaCommonReference;
 };
 
 template <>
-struct common_type<int, T7> {
+struct std::common_type<int, T7> {
   using type = CommonTypeNoMetaCommonReference;
 };
 
 template <>
-struct common_type<T7&, int&> {
+struct std::common_type<T7&, int&> {
   using type = void;
 };
 
 template <>
-struct common_type<int&, T7&> {
+struct std::common_type<int&, T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<T7&, const int&> {
+struct std::common_type<T7&, const int&> {
   using type = void;
 };
 
 template <>
-struct common_type<int&, const T7&> {
+struct std::common_type<int&, const T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<T7&, volatile int&> {
+struct std::common_type<T7&, volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<int&, volatile T7&> {
+struct std::common_type<int&, volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<T7&, const volatile int&> {
+struct std::common_type<T7&, const volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<int&, const volatile T7&> {
+struct std::common_type<int&, const volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const T7&, int&> {
+struct std::common_type<const T7&, int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const int&, T7&> {
+struct std::common_type<const int&, T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const T7&, const int&> {
+struct std::common_type<const T7&, const int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const int&, const T7&> {
+struct std::common_type<const int&, const T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const T7&, volatile int&> {
+struct std::common_type<const T7&, volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const int&, volatile T7&> {
+struct std::common_type<const int&, volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const T7&, const volatile int&> {
+struct std::common_type<const T7&, const volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const int&, const volatile T7&> {
+struct std::common_type<const int&, const volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile T7&, int&> {
+struct std::common_type<volatile T7&, int&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile int&, T7&> {
+struct std::common_type<volatile int&, T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile T7&, const int&> {
+struct std::common_type<volatile T7&, const int&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile int&, const T7&> {
+struct std::common_type<volatile int&, const T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile T7&, volatile int&> {
+struct std::common_type<volatile T7&, volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile int&, volatile T7&> {
+struct std::common_type<volatile int&, volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile T7&, const volatile int&> {
+struct std::common_type<volatile T7&, const volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<volatile int&, const volatile T7&> {
+struct std::common_type<volatile int&, const volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile T7&, int&> {
+struct std::common_type<const volatile T7&, int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile int&, T7&> {
+struct std::common_type<const volatile int&, T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile T7&, const int&> {
+struct std::common_type<const volatile T7&, const int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile int&, const T7&> {
+struct std::common_type<const volatile int&, const T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile T7&, volatile int&> {
+struct std::common_type<const volatile T7&, volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile int&, volatile T7&> {
+struct std::common_type<const volatile int&, volatile T7&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile T7&, const volatile int&> {
+struct std::common_type<const volatile T7&, const volatile int&> {
   using type = void;
 };
 
 template <>
-struct common_type<const volatile int&, const volatile T7&> {
+struct std::common_type<const volatile int&, const volatile T7&> {
   using type = void;
 };
-} // namespace std
+
 static_assert(HasValidCommonType<T7, int>());
 static_assert(HasValidCommonType<const T7&, const int&>());
 static_assert(HasCommonReference<const T7&, const int&>());
@@ -709,284 +700,242 @@ struct CommonWithInt {
   operator int() const volatile;
 };
 
-namespace std {
 template <>
-struct common_type<CommonWithInt, int> {
+struct std::common_type<CommonWithInt, int> {
   using type = int;
 };
 
 template <>
-struct common_type<int, CommonWithInt> : common_type<CommonWithInt, int> {};
+struct std::common_type<int, CommonWithInt> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<CommonWithInt&, int&> : common_type<CommonWithInt, int> {};
+struct std::common_type<CommonWithInt&, int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<int&, CommonWithInt&> : common_type<CommonWithInt, int> {};
+struct std::common_type<int&, CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<CommonWithInt&, const int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<CommonWithInt&, const int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<int&, const CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<int&, const CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<CommonWithInt&, volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<CommonWithInt&, volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<int&, volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<int&, volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<CommonWithInt&, const volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<CommonWithInt&, const volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<int&, const volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<int&, const volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const CommonWithInt&, int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const CommonWithInt&, int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const int&, CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const int&, CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const CommonWithInt&, const int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const CommonWithInt&, const int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const int&, const CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const int&, const CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const CommonWithInt&, volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const CommonWithInt&, volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const int&, volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const int&, volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const CommonWithInt&, const volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const CommonWithInt&, const volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const int&, const volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const int&, const volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile CommonWithInt&, int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile CommonWithInt&, int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile int&, CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile int&, CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile CommonWithInt&, const int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile CommonWithInt&, const int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile int&, const CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile int&, const CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile CommonWithInt&, volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile CommonWithInt&, volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile int&, volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile int&, volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile CommonWithInt&, const volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile CommonWithInt&, const volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<volatile int&, const volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<volatile int&, const volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile CommonWithInt&, int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile CommonWithInt&, int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile int&, CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile int&, CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile CommonWithInt&, const int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile CommonWithInt&, const int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile int&, const CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile int&, const CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile CommonWithInt&, volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile CommonWithInt&, volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile int&, volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile int&, volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile CommonWithInt&, const volatile int&>
-    : common_type<CommonWithInt, int> {};
+struct std::common_type<const volatile CommonWithInt&, const volatile int&> : std::common_type<CommonWithInt, int> {};
 
 template <>
-struct common_type<const volatile int&, const volatile CommonWithInt&>
-    : common_type<CommonWithInt, int> {};
-} // namespace std
+struct std::common_type<const volatile int&, const volatile CommonWithInt&> : std::common_type<CommonWithInt, int> {};
+
 static_assert(CheckCommonWith<CommonWithInt, int>());
 
 struct CommonWithIntButRefLong {
   operator int() const volatile;
 };
 
-namespace std {
 template <>
-struct common_type<CommonWithIntButRefLong, int> {
+struct std::common_type<CommonWithIntButRefLong, int> {
   using type = int;
 };
 
 template <>
-struct common_type<int, CommonWithIntButRefLong>
-    : common_type<CommonWithIntButRefLong, int> {};
+struct std::common_type<int, CommonWithIntButRefLong> : std::common_type<CommonWithIntButRefLong, int> {};
 
 template <>
-struct common_type<CommonWithIntButRefLong&, int&> {
+struct std::common_type<CommonWithIntButRefLong&, int&> {
   using type = long;
 };
 
 template <>
-struct common_type<int&, CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<int&, CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<CommonWithIntButRefLong&, const int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<CommonWithIntButRefLong&, const int&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<int&, const CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<int&, const CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<CommonWithIntButRefLong&, volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<CommonWithIntButRefLong&, volatile int&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<int&, volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<int&, volatile CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<CommonWithIntButRefLong&, const volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<CommonWithIntButRefLong&, const volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<int&, const volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<int&, const volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const CommonWithIntButRefLong&, int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const CommonWithIntButRefLong&, int&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const int&, CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const int&, CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const CommonWithIntButRefLong&, const int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const CommonWithIntButRefLong&, const int&> : std::common_type<CommonWithIntButRefLong&, int&> {
+};
 
 template <>
-struct common_type<const int&, const CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const int&, const CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {
+};
 
 template <>
-struct common_type<const CommonWithIntButRefLong&, volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const CommonWithIntButRefLong&, volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const int&, volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const int&, volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const CommonWithIntButRefLong&, const volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const CommonWithIntButRefLong&, const volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const int&, const volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const int&, const volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile CommonWithIntButRefLong&, int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile CommonWithIntButRefLong&, int&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile int&, CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile int&, CommonWithIntButRefLong&> : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile CommonWithIntButRefLong&, const int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile CommonWithIntButRefLong&, const int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile int&, const CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile int&, const CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile CommonWithIntButRefLong&, volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile CommonWithIntButRefLong&, volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile int&, volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile int&, volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile CommonWithIntButRefLong&, const volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile CommonWithIntButRefLong&, const volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<volatile int&, const volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<volatile int&, const volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile CommonWithIntButRefLong&, int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile CommonWithIntButRefLong&, int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile int&, CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile int&, CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile CommonWithIntButRefLong&, const int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile CommonWithIntButRefLong&, const int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile int&, const CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile int&, const CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile CommonWithIntButRefLong&, volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile CommonWithIntButRefLong&, volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile int&, volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile int&, volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile CommonWithIntButRefLong&, const volatile int&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
+struct std::common_type<const volatile CommonWithIntButRefLong&, const volatile int&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
 
 template <>
-struct common_type<const volatile int&, const volatile CommonWithIntButRefLong&>
-    : common_type<CommonWithIntButRefLong&, int&> {};
-} // namespace std
+struct std::common_type<const volatile int&, const volatile CommonWithIntButRefLong&>
+    : std::common_type<CommonWithIntButRefLong&, int&> {};
+
 static_assert(CheckCommonWith<CommonWithIntButRefLong, int>());
diff --git a/libcxx/test/std/concepts/concepts.lang/concept.commonref/common_reference.compile.pass.cpp b/libcxx/test/std/concepts/concepts.lang/concept.commonref/common_reference.compile.pass.cpp
index 7c37dafc2d691..ab79130f812f0 100644
--- a/libcxx/test/std/concepts/concepts.lang/concept.commonref/common_reference.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.lang/concept.commonref/common_reference.compile.pass.cpp
@@ -273,17 +273,16 @@ struct BadBasicCommonReference {
 static_assert(std::convertible_to<BadBasicCommonReference, int>);
 static_assert(std::convertible_to<BadBasicCommonReference, int&>);
 
-namespace std {
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<BadBasicCommonReference, int, X, Y> {
+struct std::basic_common_reference<BadBasicCommonReference, int, X, Y> {
   using type = BadBasicCommonReference&;
 };
 
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<int, BadBasicCommonReference, X, Y> {
+struct std::basic_common_reference<int, BadBasicCommonReference, X, Y> {
   using type = int&;
 };
-} // namespace std
+
 static_assert(!std::common_reference_with<BadBasicCommonReference, int>);
 
 struct StructNotConvertibleToCommonReference {
@@ -291,19 +290,16 @@ struct StructNotConvertibleToCommonReference {
 };
 static_assert(std::convertible_to<int, StructNotConvertibleToCommonReference>);
 
-namespace std {
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<StructNotConvertibleToCommonReference, int, X,
-                              Y> {
+struct std::basic_common_reference<StructNotConvertibleToCommonReference, int, X, Y> {
   using type = int&;
 };
 
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<int, StructNotConvertibleToCommonReference, X,
-                              Y> {
+struct std::basic_common_reference<int, StructNotConvertibleToCommonReference, X, Y> {
   using type = int&;
 };
-} // namespace std
+
 static_assert(
     !std::common_reference_with<StructNotConvertibleToCommonReference, int>);
 
@@ -311,17 +307,16 @@ struct IntNotConvertibleToCommonReference {
   operator int&() const;
 };
 
-namespace std {
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<IntNotConvertibleToCommonReference, int, X, Y> {
+struct std::basic_common_reference<IntNotConvertibleToCommonReference, int, X, Y> {
   using type = int&;
 };
 
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<int, IntNotConvertibleToCommonReference, X, Y> {
+struct std::basic_common_reference<int, IntNotConvertibleToCommonReference, X, Y> {
   using type = int&;
 };
-} // namespace std
+
 static_assert(
     !std::common_reference_with<StructNotConvertibleToCommonReference, int>);
 
@@ -330,16 +325,15 @@ struct HasCommonReference {
   operator int&() const;
 };
 
-namespace std {
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<HasCommonReference, int, X, Y> {
+struct std::basic_common_reference<HasCommonReference, int, X, Y> {
   using type = int&;
 };
 
 template <template <class> class X, template <class> class Y>
-struct basic_common_reference<int, HasCommonReference, X, Y> {
+struct std::basic_common_reference<int, HasCommonReference, X, Y> {
   using type = int&;
 };
-} // namespace std
+
 static_assert(!std::common_reference_with<HasCommonReference, int>);
 static_assert(std::common_reference_with<HasCommonReference, int&>);
diff --git a/libcxx/test/std/containers/Emplaceable.h b/libcxx/test/std/containers/Emplaceable.h
index ca780d0ae1fe2..1a4e14505fb21 100644
--- a/libcxx/test/std/containers/Emplaceable.h
+++ b/libcxx/test/std/containers/Emplaceable.h
@@ -40,18 +40,13 @@ class Emplaceable
     int get() const {return int_;}
 };
 
-namespace std {
-
 template <>
-struct hash<Emplaceable>
-{
-    typedef Emplaceable argument_type;
-    typedef std::size_t result_type;
+struct std::hash<Emplaceable> {
+  typedef Emplaceable argument_type;
+  typedef std::size_t result_type;
 
-    std::size_t operator()(const Emplaceable& x) const {return x.get();}
+  std::size_t operator()(const Emplaceable& x) const { return x.get(); }
 };
 
-}
-
 #endif // TEST_STD_VER >= 11
 #endif // EMPLACEABLE_H
diff --git a/libcxx/test/std/containers/NotConstructible.h b/libcxx/test/std/containers/NotConstructible.h
index f8d0d4d84ae90..213aa1442d5c2 100644
--- a/libcxx/test/std/containers/NotConstructible.h
+++ b/libcxx/test/std/containers/NotConstructible.h
@@ -23,18 +23,12 @@ bool
 operator==(const NotConstructible&, const NotConstructible&)
 {return true;}
 
-namespace std
-{
-
 template <>
-struct hash<NotConstructible>
-{
-    typedef NotConstructible argument_type;
-    typedef std::size_t result_type;
+struct std::hash<NotConstructible> {
+  typedef NotConstructible argument_type;
+  typedef std::size_t result_type;
 
-    std::size_t operator()(const NotConstructible&) const {return 0;}
+  std::size_t operator()(const NotConstructible&) const { return 0; }
 };
 
-}
-
 #endif // NOTCONSTRUCTIBLE_H
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
index df315feb42471..6d2a5ecbb5ecf 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
index f4a1307d30dae..54ab27887f463 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
index f683da6ff8d98..c6f1815de9c57 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
index 03a116cacebc7..001223c6bc287 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
index c47fb188c865c..0e6f6207267b8 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
index 07239d33ef669..f5329b5b6cf3e 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.node/node_handle.pass.cpp b/libcxx/test/std/containers/container.node/node_handle.pass.cpp
index 1eab9042b9e13..de9564261a04f 100644
--- a/libcxx/test/std/containers/container.node/node_handle.pass.cpp
+++ b/libcxx/test/std/containers/container.node/node_handle.pass.cpp
@@ -59,16 +59,13 @@ struct Static
     Static& operator=(Static&&) = delete;
 };
 
-namespace std
-{
-template <> struct hash<Static>
-{
-    using argument_type = Static;
-    using result_type = std::size_t;
-    hash() = default;
-    std::size_t operator()(const Static&) const;
+template <>
+struct std::hash<Static> {
+  using argument_type = Static;
+  using result_type   = std::size_t;
+  hash()              = default;
+  std::size_t operator()(const Static&) const;
 };
-}
 
 static_assert(node_compatibility_table<
                   int, int, std::less<int>, std::less<int>, std::hash<int>,
diff --git a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
index 106bc45c70998..710994c68295e 100644
--- a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
@@ -148,6 +148,15 @@ TEST_CONSTEXPR_CXX17 bool tests()
             assert(std::rbegin(c)  != std::rend(c));
             assert(std::cbegin(c)  != std::cend(c));
             assert(std::crbegin(c) != std::crend(c));
+
+#  if TEST_STD_VER >= 20
+            // P1614 + LWG3352
+            std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+            assert(r1 == std::strong_ordering::equal);
+
+            std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+            assert(r2 == std::strong_ordering::equal);
+#  endif
         }
         {
             typedef std::array<int, 0> C;
@@ -189,6 +198,15 @@ TEST_CONSTEXPR_CXX17 bool tests()
             assert(std::rbegin(c)  == std::rend(c));
             assert(std::cbegin(c)  == std::cend(c));
             assert(std::crbegin(c) == std::crend(c));
+
+#  if TEST_STD_VER >= 20
+            // P1614 + LWG3352
+            std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+            assert(r1 == std::strong_ordering::equal);
+
+            std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+            assert(r2 == std::strong_ordering::equal);
+#  endif
         }
     }
 #endif
diff --git a/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp b/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
index 1f06ffde41ac2..484a2961fdb0c 100644
--- a/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
@@ -41,7 +41,27 @@ int main(int, char**)
     i = c.begin();
     C::const_iterator j;
     j = c.cbegin();
+
     assert(i == j);
+    assert(!(i != j));
+
+    assert(!(i < j));
+    assert((i <= j));
+
+    assert(!(i > j));
+    assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+    // P1614 + LWG3352
+    // When the allocator does not have operator<=> then the iterator uses a
+    // fallback to provide operator<=>.
+    // Make sure to test with an allocator that does not have operator<=>.
+    static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+    static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+    std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+    assert(r1 == std::strong_ordering::equal);
+#  endif
     }
 #endif
 #if TEST_STD_VER > 11
@@ -74,6 +94,15 @@ int main(int, char**)
 //         assert ( cii != c.begin());
 //         assert ( cii != c.cend());
 //         assert ( ii1 != c.end());
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER > 20
     }
 #endif
 
diff --git a/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
index 9aaaac7a5557f..1e4877e8d2443 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
@@ -77,7 +77,21 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::iterator i = c.begin();
         C::iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r = i <=> j;
+        assert(r == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef bool T;
@@ -86,7 +100,21 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::const_iterator i = c.begin();
         C::const_iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r = i <=> j;
+        assert(r == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef bool T;
@@ -131,6 +159,15 @@ TEST_CONSTEXPR_CXX20 bool tests()
         assert ( (cii >= ii1 ));
         assert (cii - ii1 == 0);
         assert (ii1 - cii == 0);
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER > 20
     }
 #endif
 
diff --git a/libcxx/test/std/containers/sequences/vector.bool/shrink_to_fit.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/shrink_to_fit.pass.cpp
index b39245cab7bf4..f8bcee31964bb 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/shrink_to_fit.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/shrink_to_fit.pass.cpp
@@ -39,11 +39,54 @@ TEST_CONSTEXPR_CXX20 bool tests()
     return true;
 }
 
+#if TEST_STD_VER >= 23
+template <typename T>
+struct increasing_allocator {
+  using value_type         = T;
+  std::size_t min_elements = 1000;
+  increasing_allocator()   = default;
+
+  template <typename U>
+  constexpr increasing_allocator(const increasing_allocator<U>& other) noexcept : min_elements(other.min_elements) {}
+
+  constexpr std::allocation_result<T*> allocate_at_least(std::size_t n) {
+    if (n < min_elements)
+      n = min_elements;
+    min_elements += 1000;
+    return std::allocator<T>{}.allocate_at_least(n);
+  }
+  constexpr T* allocate(std::size_t n) { return allocate_at_least(n).ptr; }
+  constexpr void deallocate(T* p, std::size_t n) noexcept { std::allocator<T>{}.deallocate(p, n); }
+};
+
+template <typename T, typename U>
+bool operator==(increasing_allocator<T>, increasing_allocator<U>) {
+  return true;
+}
+
+// https://github.com/llvm/llvm-project/issues/95161
+constexpr bool test_increasing_allocator() {
+  std::vector<bool, increasing_allocator<bool>> v;
+  v.push_back(1);
+  std::size_t capacity = v.capacity();
+  v.shrink_to_fit();
+  assert(v.capacity() <= capacity);
+  assert(v.size() == 1);
+
+  return true;
+}
+#endif // TEST_STD_VER >= 23
+
 int main(int, char**)
 {
-    tests();
+  tests();
 #if TEST_STD_VER > 17
     static_assert(tests());
 #endif
+#if TEST_STD_VER >= 23
+    test_increasing_allocator();
+    static_assert(test_increasing_allocator());
+#endif // TEST_STD_VER >= 23
+
     return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
index 6f1aea11ed3c8..850d616534765 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
index 8be3d9ab274a6..49d007a299b7e 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
index be46078adbaaa..efbb700e3b9e1 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
index 08233741acf25..cc88bf6d03fee 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
index abae40d78b23a..a1b7f98aa52d1 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
index d1cbcbfd72334..71794f8488a45 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp b/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
index 70e0e35767e09..0aa7ad0d42ed7 100644
--- a/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
@@ -87,7 +87,27 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::iterator i = c.begin();
         C::iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        // When the allocator does not have operator<=> then the iterator uses a
+        // fallback to provide operator<=>.
+        // Make sure to test with an allocator that does not have operator<=>.
+        static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+        static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+        std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+        assert(r1 == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef int T;
@@ -96,7 +116,26 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::const_iterator i = c.begin();
         C::const_iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // When the allocator does not have operator<=> then the iterator uses a
+        // fallback to provide operator<=>.
+        // Make sure to test with an allocator that does not have operator<=>.
+        static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+        static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+        std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+        assert(r1 == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef int T;
@@ -164,8 +203,16 @@ TEST_CONSTEXPR_CXX20 bool tests()
         assert ( (cii >= ii1 ));
         assert (cii - ii1 == 0);
         assert (ii1 - cii == 0);
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER > 20
     }
-#endif
+#endif // TEST_STD_VER > 11
 
     return true;
 }
diff --git a/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit.pass.cpp
index 8851e2a9ed0c7..e39afb2d48f0a 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.capacity/shrink_to_fit.pass.cpp
@@ -71,11 +71,56 @@ TEST_CONSTEXPR_CXX20 bool tests() {
     return true;
 }
 
+#if TEST_STD_VER >= 23
+template <typename T>
+struct increasing_allocator {
+  using value_type         = T;
+  std::size_t min_elements = 1000;
+  increasing_allocator()   = default;
+
+  template <typename U>
+  constexpr increasing_allocator(const increasing_allocator<U>& other) noexcept : min_elements(other.min_elements) {}
+
+  constexpr std::allocation_result<T*> allocate_at_least(std::size_t n) {
+    if (n < min_elements)
+      n = min_elements;
+    min_elements += 1000;
+    return std::allocator<T>{}.allocate_at_least(n);
+  }
+  constexpr T* allocate(std::size_t n) { return allocate_at_least(n).ptr; }
+  constexpr void deallocate(T* p, std::size_t n) noexcept { std::allocator<T>{}.deallocate(p, n); }
+};
+
+template <typename T, typename U>
+bool operator==(increasing_allocator<T>, increasing_allocator<U>) {
+  return true;
+}
+
+// https://github.com/llvm/llvm-project/issues/95161
+constexpr bool test_increasing_allocator() {
+  std::vector<int, increasing_allocator<int>> v;
+  v.push_back(1);
+  assert(is_contiguous_container_asan_correct(v));
+  std::size_t capacity = v.capacity();
+  v.shrink_to_fit();
+  assert(v.capacity() <= capacity);
+  assert(v.size() == 1);
+  assert(is_contiguous_container_asan_correct(v));
+
+  return true;
+}
+#endif // TEST_STD_VER >= 23
+
 int main(int, char**)
 {
-    tests();
+  tests();
 #if TEST_STD_VER > 17
     static_assert(tests());
 #endif
+#if TEST_STD_VER >= 23
+    test_increasing_allocator();
+    static_assert(test_increasing_allocator());
+#endif
+
     return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp
new file mode 100644
index 0000000000000..193c00891da7f
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// void push_back(const value_type& x);
+//
+// If no reallocation happens, then references, pointers, and iterators before
+// the insertion point remain valid but those at or after the insertion point,
+// including the past-the-end iterator, are invalidated.
+
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
+// UNSUPPORTED: c++03
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <vector>
+#include <cassert>
+#include <cstddef>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  std::vector<int> vec;
+  vec.reserve(4);
+  std::size_t old_capacity = vec.capacity();
+  assert(old_capacity >= 4);
+
+  vec.push_back(0);
+  vec.push_back(1);
+  vec.push_back(2);
+  auto it = vec.begin();
+  vec.push_back(3);
+  assert(vec.capacity() == old_capacity);
+
+  // The capacity did not change, so the iterator remains valid and can reach the new element.
+  assert(*it == 0);
+  assert(*(it + 1) == 1);
+  assert(*(it + 2) == 2);
+  assert(*(it + 3) == 3);
+
+  while (vec.capacity() == old_capacity) {
+    vec.push_back(42);
+  }
+  TEST_LIBCPP_ASSERT_FAILURE(
+      *(it + old_capacity), "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+  // Unfortunately, the bounded iterator does not detect that it's been invalidated and will still allow attempts to
+  // dereference elements 0 to 3 (even though they refer to memory that's been reallocated).
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/unord/unord.map/compare.pass.cpp b/libcxx/test/std/containers/unord/unord.map/compare.pass.cpp
index 200107cddf12a..a8474f8f0bdc8 100644
--- a/libcxx/test/std/containers/unord/unord.map/compare.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/compare.pass.cpp
@@ -25,14 +25,10 @@ struct Key {
   bool operator== (const Key&) const { return true; }
 };
 
-namespace std
-{
-    template <>
-    struct hash<Key>
-    {
-        std::size_t operator()(Key const &) const {return 0;}
-    };
-}
+template <>
+struct std::hash<Key> {
+  std::size_t operator()(Key const&) const { return 0; }
+};
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/insert_or_assign.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/insert_or_assign.pass.cpp
index df2138395adf2..6ba28c00675bc 100644
--- a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/insert_or_assign.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/insert_or_assign.pass.cpp
@@ -58,11 +58,10 @@ class Moveable
     bool moved() const {return int_ == -1;}
 };
 
-namespace std {
-    template <> struct hash<Moveable> {
-        std::size_t operator () (const Moveable &m) const { return m.hash(); }
-    };
-}
+template <>
+struct std::hash<Moveable> {
+  std::size_t operator()(const Moveable& m) const { return m.hash(); }
+};
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/try.emplace.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/try.emplace.pass.cpp
index b9c82390ab3a5..15cf8bc59eba9 100644
--- a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/try.emplace.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/try.emplace.pass.cpp
@@ -57,11 +57,10 @@ class Moveable
     bool moved() const {return int_ == -1;}
 };
 
-namespace std {
-    template <> struct hash<Moveable> {
-        std::size_t operator () (const Moveable &m) const { return m.hash(); }
-    };
-}
+template <>
+struct std::hash<Moveable> {
+  std::size_t operator()(const Moveable& m) const { return m.hash(); }
+};
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp b/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp
new file mode 100644
index 0000000000000..13a7628e6043d
--- /dev/null
+++ b/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp
@@ -0,0 +1,92 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <span>
+
+// class iterator
+
+#include <cassert>
+#include <concepts>
+#include <iterator>
+#include <span>
+#include <string>
+#include <version> // __cpp_lib_ranges_as_const is not defined in span.
+
+#include "test_macros.h"
+
+template <class T>
+constexpr void test_type() {
+  using C = std::span<T>;
+  typename C::iterator ii1{}, ii2{};
+  typename C::iterator ii4 = ii1;
+  // TODO Test against C++23 after implementing
+  //  P2278R4 cbegin should always return a constant iterator
+  // The means adjusting the #ifdef to guard against C++23.
+#ifdef __cpp_lib_ranges_as_const
+  typename C::const_iterator cii{};
+#endif
+  assert(ii1 == ii2);
+  assert(ii1 == ii4);
+#ifdef __cpp_lib_ranges_as_const
+  assert(ii1 == cii);
+#endif
+
+  assert(!(ii1 != ii2));
+#ifdef __cpp_lib_ranges_as_const
+  assert(!(ii1 != cii));
+#endif
+
+  T v;
+  C c{&v, 1};
+  assert(c.begin() == std::begin(c));
+  assert(c.rbegin() == std::rbegin(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(c.cbegin() == std::cbegin(c));
+  assert(c.crbegin() == std::crbegin(c));
+#endif
+
+  assert(c.end() == std::end(c));
+  assert(c.rend() == std::rend(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(c.cend() == std::cend(c));
+  assert(c.crend() == std::crend(c));
+#endif
+
+  assert(std::begin(c) != std::end(c));
+  assert(std::rbegin(c) != std::rend(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(std::cbegin(c) != std::cend(c));
+  assert(std::crbegin(c) != std::crend(c));
+#endif
+
+  // P1614 + LWG3352
+  std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+  assert(r1 == std::strong_ordering::equal);
+
+#ifdef __cpp_lib_ranges_as_const
+  std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+  assert(r2 == std::strong_ordering::equal);
+#endif
+}
+
+constexpr bool test() {
+  test_type<char>();
+  test_type<int>();
+  test_type<std::string>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test(), "");
+
+  return 0;
+}
diff --git a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
index bbfb0c5548fab..0f47a513ea0e8 100644
--- a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
+++ b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // <math.h>
 
diff --git a/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp b/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp
index 80735ba377f4f..3f614efee2036 100644
--- a/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp
@@ -33,12 +33,8 @@ class A {
 };
 
 // Specialize the template for my class
-namespace std
-{
-  template <>
-  struct is_error_code_enum<A> : public std::true_type {};
-}
-
+template <>
+struct std::is_error_code_enum<A> : public std::true_type {};
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp b/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp
index b4a6566aaa031..e9916f2427a55 100644
--- a/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp
@@ -34,12 +34,8 @@ class A {
 };
 
 // Specialize the template for my class
-namespace std
-{
-  template <>
-  struct is_error_condition_enum<A> : public std::true_type {};
-}
-
+template <>
+struct std::is_error_condition_enum<A> : public std::true_type {};
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/ErrorCodeEnum.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/ErrorCodeEnum.pass.cpp
index 055977895dba7..2667223f5a7a7 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/ErrorCodeEnum.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/ErrorCodeEnum.pass.cpp
@@ -23,12 +23,8 @@ enum testing
     zero, one, two
 };
 
-namespace std
-{
-
-template <> struct is_error_code_enum<testing> : public std::true_type {};
-
-}
+template <>
+struct std::is_error_code_enum<testing> : public std::true_type {};
 
 std::error_code
 make_error_code(testing x)
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/lwg3629.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/lwg3629.pass.cpp
index 9e9e21f9fd9e7..076530b7007a6 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/lwg3629.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.constructors/lwg3629.pass.cpp
@@ -26,10 +26,8 @@ namespace User {
   std::error_code make_error_code(Err) { return std::error_code(42, std::generic_category()); }
 }
 
-namespace std {
-  template <>
-  struct is_error_code_enum<User::Err> : true_type {};
-}
+template <>
+struct std::is_error_code_enum<User::Err> : true_type {};
 
 int main(int, char**) {
   std::error_code e((User::Err()));
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/ErrorCodeEnum.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/ErrorCodeEnum.pass.cpp
index 2c98c89119792..b86f9d21e204c 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/ErrorCodeEnum.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/ErrorCodeEnum.pass.cpp
@@ -23,12 +23,8 @@ enum testing
     zero, one, two
 };
 
-namespace std
-{
-
-template <> struct is_error_code_enum<testing> : public std::true_type {};
-
-}
+template <>
+struct std::is_error_code_enum<testing> : public std::true_type {};
 
 std::error_code
 make_error_code(testing x)
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/lwg3629.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/lwg3629.pass.cpp
index 4e5bcc394607c..4bea503ec0423 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/lwg3629.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcode/syserr.errcode.modifiers/lwg3629.pass.cpp
@@ -26,10 +26,8 @@ namespace User {
   std::error_code make_error_code(Err) { return std::error_code(42, std::generic_category()); }
 }
 
-namespace std {
-  template <>
-  struct is_error_code_enum<User::Err> : true_type {};
-}
+template <>
+struct std::is_error_code_enum<User::Err> : true_type {};
 
 int main(int, char**) {
   std::error_code e;
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.constructors/lwg3629.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.constructors/lwg3629.pass.cpp
index 2d2d43a129d70..f1e2d9a2ba349 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.constructors/lwg3629.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.constructors/lwg3629.pass.cpp
@@ -26,10 +26,8 @@ namespace User {
   std::error_condition make_error_condition(Err) { return std::error_condition(42, std::generic_category()); }
 }
 
-namespace std {
-  template <>
-  struct is_error_condition_enum<User::Err> : true_type {};
-}
+template <>
+struct std::is_error_condition_enum<User::Err> : true_type {};
 
 int main(int, char**) {
   std::error_condition e((User::Err()));
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.modifiers/lwg3629.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.modifiers/lwg3629.pass.cpp
index 72cc77b4949d6..014f14a9fd82a 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.modifiers/lwg3629.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcondition/syserr.errcondition.modifiers/lwg3629.pass.cpp
@@ -28,10 +28,8 @@ namespace User {
   std::error_condition make_error_condition(Err) { return std::error_condition(42, std::generic_category()); }
 }
 
-namespace std {
-  template <>
-  struct is_error_condition_enum<User::Err> : true_type {};
-}
+template <>
+struct std::is_error_condition_enum<User::Err> : true_type {};
 
 int main(int, char**) {
   std::error_condition e;
diff --git a/libcxx/test/std/experimental/utilities/propagate_const/propagate_const.nonmembers/hash.pass.cpp b/libcxx/test/std/experimental/utilities/propagate_const/propagate_const.nonmembers/hash.pass.cpp
index 65208ffd8b1fb..64f939c7cbc1b 100644
--- a/libcxx/test/std/experimental/utilities/propagate_const/propagate_const.nonmembers/hash.pass.cpp
+++ b/libcxx/test/std/experimental/utilities/propagate_const/propagate_const.nonmembers/hash.pass.cpp
@@ -19,18 +19,12 @@
 
 using std::experimental::propagate_const;
 
-namespace std {
-template <> struct hash<X>
-{
+template <>
+struct std::hash<X> {
   typedef X first_argument_type;
 
-  std::size_t operator()(const first_argument_type&) const
-  {
-    return 99;
-  }
-
+  std::size_t operator()(const first_argument_type&) const { return 99; }
 };
-} // namespace std
 
 int main(int, char**) {
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
index db15ee193ad2a..53bf828fc120b 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
@@ -72,7 +72,7 @@ static void large_file() {
   // the data at the end of the source file.
   std::string out_data(additional_size, 'z');
   {
-    std::FILE* dest_file = std::fopen(dest.string().c_str(), "rb");
+    std::FILE* dest_file = std::fopen(dest.string().c_str(), "r");
     assert(dest_file != nullptr);
     assert(std::fseek(dest_file, sendfile_size_limit, SEEK_SET) == 0);
     assert(std::fread(&out_data[0], sizeof(out_data[0]), additional_size, dest_file) == additional_size);
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
index 57023b73c4744..98fa56b7f4b6d 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
index 6abfb30265e1e..711152ba6f32a 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
index f5a6a639cbd30..ca540bdd131e3 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
index 19a02638a9da1..c79a1a3f0b55d 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
index 054ea36a04c7e..73ad19132f17f 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
index b4d6b3529cec4..21f6654d5f9cf 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
index 34eff101d730a..9b9b0e404e6b7 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
index 70e0c11c5e6c5..d3e4463fe0bc8 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
index 3edc0e2245d64..0bd5ca5eb2b9b 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
index d348a3b530be3..ea2ec65810020 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h b/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
index d28256cabc798..74a2ceafa392a 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
index a262c287108a4..a797abb82a7b2 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
index 2f088e7a7db5f..fc40680d6fba5 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
index b811b4f2a2493..b884738cf82c8 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
index edc8bb3f543c4..66fe733ee1035 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
index c1a690f559b11..ee4478f361804 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
index bd9b99166922f..fe49876d0b049 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
index e32a794b96dd9..992e7f4600053 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/string.streams/stringbuf/stringbuf.cons/default.pass.cpp b/libcxx/test/std/input.output/string.streams/stringbuf/stringbuf.cons/default.pass.cpp
index 6451d44e91b5c..d131f5c9a6fa7 100644
--- a/libcxx/test/std/input.output/string.streams/stringbuf/stringbuf.cons/default.pass.cpp
+++ b/libcxx/test/std/input.output/string.streams/stringbuf/stringbuf.cons/default.pass.cpp
@@ -29,12 +29,18 @@ struct testbuf
 {
     void check()
     {
-        assert(this->eback() == NULL);
-        assert(this->gptr() == NULL);
-        assert(this->egptr() == NULL);
-        assert(this->pbase() == NULL);
-        assert(this->pptr() == NULL);
-        assert(this->epptr() == NULL);
+      // LWG2995
+      //   It is implementation-defined whether the sequence pointers (eback(),
+      //   gptr(), egptr(), pbase(), pptr(), epptr()) are initialized to null
+      //   pointers.
+      // This tests the libc++ specific implementation.
+      LIBCPP_ASSERT(this->eback() != nullptr);
+      LIBCPP_ASSERT(this->gptr() != nullptr);
+      LIBCPP_ASSERT(this->egptr() != nullptr);
+      LIBCPP_ASSERT(this->pbase() != nullptr);
+      LIBCPP_ASSERT(this->pptr() != nullptr);
+      LIBCPP_ASSERT(this->epptr() != nullptr);
+      assert(this->str().empty());
     }
 };
 
diff --git a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count.pass.cpp
index cda49acad985b..18f8d15335ec6 100644
--- a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count.pass.cpp
@@ -12,6 +12,7 @@
 
 #include <iterator>
 
+#include <algorithm>
 #include <cassert>
 
 #include "test_iterators.h"
@@ -32,12 +33,16 @@ constexpr void check(int* first, std::iter_difference_t<It> n, int* expected) {
 
   // Count operations
   if constexpr (Count) {
-    auto it = stride_counting_iterator(It(first));
+    IteratorOpCounts ops;
+    auto it = operation_counting_iterator(It(first), &ops);
     std::ranges::advance(it, n);
     if constexpr (std::random_access_iterator<It>) {
-      assert(it.stride_count() <= 1);
+      assert(ops.increments + ops.decrements <= 1);
     } else {
-      assert(it.stride_count() == abs(M));
+      const auto big   = std::max(ops.increments, ops.decrements);
+      const auto small = std::min(ops.increments, ops.decrements);
+      assert(big == std::size_t(abs(M)));
+      assert(small == 0);
     }
   }
 }
diff --git a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count_sentinel.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count_sentinel.pass.cpp
index 76439ef93a607..d613105805dd5 100644
--- a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count_sentinel.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_count_sentinel.pass.cpp
@@ -38,14 +38,16 @@ check_forward(int* first, int* last, std::iter_difference_t<It> n, int* expected
 
   // Count operations
   if constexpr (Count) {
-    auto it = stride_counting_iterator(It(first));
-    auto sent = sentinel_wrapper(stride_counting_iterator(It(last)));
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(first), &ops);
+    auto sent = sentinel_wrapper(operation_counting_iterator(It(last), &ops));
     (void)std::ranges::advance(it, n, sent);
     // We don't have a sized sentinel, so we have to increment one-by-one
     // regardless of the iterator category.
-    assert(it.stride_count() == M);
-    assert(it.stride_displacement() == M);
-    assert(it.equals_count() == expected_equals_count);
+    assert(static_cast<Difference>(ops.increments) == M);
+    assert(static_cast<Difference>(ops.decrements) == 0);
+    assert(ops.zero_moves == 0);
+    assert(ops.equal_cmps == static_cast<std::size_t>(expected_equals_count));
   }
 }
 
@@ -65,28 +67,24 @@ constexpr void check_forward_sized_sentinel(int* first, int* last, std::iter_dif
 
   // Count operations
   {
-    auto it = stride_counting_iterator(It(first));
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(first), &ops);
     auto sent = distance_apriori_sentinel(size);
     (void)std::ranges::advance(it, n, sent);
     if constexpr (std::random_access_iterator<It>) {
-      assert(it.stride_count() <= 1);
-      assert(it.stride_displacement() <= 1);
+      assert(ops.increments + ops.zero_moves == 1);
+      assert(ops.decrements == 0);
     } else {
-      assert(it.stride_count() == M);
-      assert(it.stride_displacement() == M);
+      assert(static_cast<Difference>(ops.increments) == M);
+      assert(ops.decrements == 0);
+      assert(ops.zero_moves == 0);
     }
   }
 }
 
-struct Expected {
-  int stride_count;
-  int stride_displacement;
-  int equals_count;
-};
-
 template <bool Count, typename It>
 constexpr void
-check_backward(int* first, int* last, std::iter_difference_t<It> n, int* expected, Expected expected_counts) {
+check_backward(int* first, int* last, std::iter_difference_t<It> n, int* expected, IteratorOpCounts expected_counts) {
   // Check preconditions for `advance` when called with negative `n`
   // (see [range.iter.op.advance]). In addition, allow `n == 0`.
   assert(n <= 0);
@@ -105,16 +103,18 @@ check_backward(int* first, int* last, std::iter_difference_t<It> n, int* expecte
 
   // Count operations
   {
-    auto it = stride_counting_iterator(It(last));
-    auto sent = stride_counting_iterator(It(first));
-    static_assert(std::bidirectional_iterator<stride_counting_iterator<It>>);
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(last), &ops);
+    auto sent = operation_counting_iterator(It(first), &ops);
+    static_assert(std::bidirectional_iterator<operation_counting_iterator<It>>);
     static_assert(Count == !std::sized_sentinel_for<It, It>);
 
     (void)std::ranges::advance(it, n, sent);
 
-    assert(it.stride_count() == expected_counts.stride_count);
-    assert(it.stride_displacement() == expected_counts.stride_displacement);
-    assert(it.equals_count() == expected_counts.equals_count);
+    assert(ops.increments == expected_counts.increments);
+    assert(ops.decrements == expected_counts.decrements);
+    assert(ops.zero_moves == expected_counts.zero_moves);
+    assert(ops.equal_cmps == expected_counts.equal_cmps);
   }
 }
 
@@ -217,21 +217,22 @@ constexpr bool test() {
       {
         int* expected = n > size ? range : range + size - n;
         {
-          Expected expected_counts = {
-              .stride_count        = static_cast<int>(range + size - expected),
-              .stride_displacement = -expected_counts.stride_count,
-              .equals_count        = n > size ? size + 1 : n,
+          IteratorOpCounts expected_counts = {
+              .increments = 0,
+              .decrements = static_cast<std::size_t>(range + size - expected),
+              .equal_cmps = static_cast<std::size_t>(n > size ? size + 1 : n),
           };
 
           check_backward<true, bidirectional_iterator<int*>>(range, range + size, -n, expected, expected_counts);
         }
         {
-          Expected expected_counts = {
+          IteratorOpCounts expected_counts = {
               // If `n >= size`, the algorithm can just do `it = std::move(sent);`
               // instead of doing iterator arithmetic.
-              .stride_count        = (n >= size) ? 0 : 1,
-              .stride_displacement = (n >= size) ? 0 : 1,
-              .equals_count        = 0,
+              .increments = 0,
+              .decrements = static_cast<std::size_t>((n == 0 || n >= size) ? 0 : 1),
+              .zero_moves = static_cast<std::size_t>(n == 0 && size != 0 ? 1 : 0),
+              .equal_cmps = 0,
           };
 
           check_backward<false, random_access_iterator<int*>>(range, range + size, -n, expected, expected_counts);
diff --git a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_sentinel.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_sentinel.pass.cpp
index 2e9a28e4ad395..147c26e429063 100644
--- a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_sentinel.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.advance/iterator_sentinel.pass.cpp
@@ -12,6 +12,7 @@
 
 #include <iterator>
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 
@@ -31,11 +32,12 @@ constexpr void check_assignable(int* first, int* last, int* expected) {
 
   // Count operations
   if constexpr (Count) {
-    auto it = stride_counting_iterator(It(first));
-    auto sent = assignable_sentinel(stride_counting_iterator(It(last)));
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(first), &ops);
+    auto sent = assignable_sentinel(operation_counting_iterator(It(last), &ops));
     std::ranges::advance(it, sent);
     assert(base(base(it)) == expected);
-    assert(it.stride_count() == 0); // because we got here by assigning from last, not by incrementing
+    assert(ops.increments + ops.decrements == 0); // because we got here by assigning from last, not by incrementing
   }
 }
 
@@ -53,13 +55,17 @@ constexpr void check_sized_sentinel(int* first, int* last, int* expected) {
 
   // Count operations
   if constexpr (Count) {
-    auto it = stride_counting_iterator(It(first));
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(first), &ops);
     auto sent = distance_apriori_sentinel(size);
     std::ranges::advance(it, sent);
     if constexpr (std::random_access_iterator<It>) {
-      assert(it.stride_count() == 1);
+      assert(ops.increments + ops.decrements + ops.zero_moves == 1);
     } else {
-      assert(it.stride_count() == size);
+      const auto big   = std::max(ops.increments, ops.decrements);
+      const auto small = std::min(ops.increments, ops.decrements);
+      assert(big == static_cast<size_t>(size > 0 ? size : -size));
+      assert(small == 0);
     }
   }
 }
@@ -78,10 +84,14 @@ constexpr void check_sentinel(int* first, int* last, int* expected) {
 
   // Count operations
   if constexpr (Count) {
-    auto it = stride_counting_iterator(It(first));
-    auto sent = sentinel_wrapper(stride_counting_iterator(It(last)));
+    IteratorOpCounts ops;
+    auto it   = operation_counting_iterator(It(first), &ops);
+    auto sent = sentinel_wrapper(operation_counting_iterator(It(last), &ops));
     std::ranges::advance(it, sent);
-    assert(it.stride_count() == size);
+    const auto big   = std::max(ops.increments, ops.decrements);
+    const auto small = std::min(ops.increments, ops.decrements);
+    assert(big == static_cast<size_t>(size > 0 ? size : -size));
+    assert(small == 0);
   }
 }
 
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
index e04c508852820..38dcd70d850e6 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_binary_predicate.compile.pass.cpp
@@ -76,12 +76,13 @@ struct BadPredicate5 {
 };
 static_assert(!std::indirect_binary_predicate<BadPredicate5, It1, It2>);
 
-// Should fail when the predicate can't be called with (iter_common_reference_t, iter_common_reference_t)
-struct BadPredicate6 {
-    template <class T, class U> bool operator()(T const&, U const&) const;
-    bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodPredicate6 {
+  template <class T, class U>
+  bool operator()(T const&, U const&) const;
+  bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
 };
-static_assert(!std::indirect_binary_predicate<BadPredicate6, It1, It2>);
+static_assert(std::indirect_binary_predicate<GoodPredicate6, It1, It2>);
 
 // Test ADL-proofing (P2538R1)
 #if TEST_STD_VER >= 26 || defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
index 56f52f28ff6fe..8ea7e8a6d9e1e 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_equivalence_relation.compile.pass.cpp
@@ -91,12 +91,13 @@ struct BadRelation5 {
 };
 static_assert(!std::indirect_equivalence_relation<BadRelation5, It1, It2>);
 
-// Should fail when the function can't be called with (iter_common_reference_t, iter_common_reference_t)
-struct BadRelation6 {
-    template <class T, class U> bool operator()(T const&, U const&) const;
-    bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodRelation6 {
+  template <class T, class U>
+  bool operator()(T const&, U const&) const;
+  bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
 };
-static_assert(!std::indirect_equivalence_relation<BadRelation6, It1, It2>);
+static_assert(std::indirect_equivalence_relation<GoodRelation6, It1, It2>);
 
 // Test ADL-proofing (P2538R1)
 #if TEST_STD_VER >= 26 || defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
index f36431c89f288..074725f97bf32 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_strict_weak_order.compile.pass.cpp
@@ -91,12 +91,13 @@ struct BadOrder5 {
 };
 static_assert(!std::indirect_strict_weak_order<BadOrder5, It1, It2>);
 
-// Should fail when the function can't be called with (iter_common_reference_t, iter_common_reference_t)
-struct BadOrder6 {
-    template <class T, class U> bool operator()(T const&, U const&) const;
-    bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodOrder6 {
+  template <class T, class U>
+  bool operator()(T const&, U const&) const;
+  bool operator()(std::iter_common_reference_t<It1>, std::iter_common_reference_t<It2>) const = delete;
 };
-static_assert(!std::indirect_strict_weak_order<BadOrder6, It1, It2>);
+static_assert(std::indirect_strict_weak_order<GoodOrder6, It1, It2>);
 
 // Test ADL-proofing (P2538R1)
 #if TEST_STD_VER >= 26 || defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
index 4551113af82e8..f52ea8f9151c3 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirect_unary_predicate.compile.pass.cpp
@@ -57,12 +57,13 @@ struct BadPredicate3 {
 };
 static_assert(!std::indirect_unary_predicate<BadPredicate3, It>);
 
-// Should fail when the predicate can't be called with std::iter_common_reference_t<It>
-struct BadPredicate4 {
-    template <class T> bool operator()(T const&) const;
-    bool operator()(std::iter_common_reference_t<It>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodPredicate4 {
+  template <class T>
+  bool operator()(T const&) const;
+  bool operator()(std::iter_common_reference_t<It>) const = delete;
 };
-static_assert(!std::indirect_unary_predicate<BadPredicate4, It>);
+static_assert(std::indirect_unary_predicate<GoodPredicate4, It>);
 
 // Test ADL-proofing (P2538R1)
 #if TEST_STD_VER >= 26 || defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_regular_unary_invocable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_regular_unary_invocable.compile.pass.cpp
index 9e0c3c99dee98..fceae03a4b324 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_regular_unary_invocable.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_regular_unary_invocable.compile.pass.cpp
@@ -56,12 +56,13 @@ struct BadInvocable3 {
 };
 static_assert(!std::indirectly_regular_unary_invocable<BadInvocable3, It>);
 
-// Should fail when the invocable can't be called with (iter_common_reference_t)
-struct BadInvocable4 {
-    template <class T> R1 operator()(T const&) const;
-    R1 operator()(std::iter_common_reference_t<It>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodInvocable4 {
+  template <class T>
+  R1 operator()(T const&) const;
+  R1 operator()(std::iter_common_reference_t<It>) const = delete;
 };
-static_assert(!std::indirectly_regular_unary_invocable<BadInvocable4, It>);
+static_assert(std::indirectly_regular_unary_invocable<GoodInvocable4, It>);
 
 // Should fail when the invocable doesn't have a common reference between its return types
 struct BadInvocable5 {
diff --git a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_unary_invocable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_unary_invocable.compile.pass.cpp
index fc955eb88585a..27c078baeb194 100644
--- a/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_unary_invocable.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/indirectcallable/indirectinvocable/indirectly_unary_invocable.compile.pass.cpp
@@ -56,12 +56,13 @@ struct BadInvocable3 {
 };
 static_assert(!std::indirectly_unary_invocable<BadInvocable3, It>);
 
-// Should fail when the invocable can't be called with (iter_common_reference_t)
-struct BadInvocable4 {
-    template <class T> R1 operator()(T const&) const;
-    R1 operator()(std::iter_common_reference_t<It>) const = delete;
+// This case was made valid by P2997R1.
+struct GoodInvocable4 {
+  template <class T>
+  R1 operator()(T const&) const;
+  R1 operator()(std::iter_common_reference_t<It>) const = delete;
 };
-static_assert(!std::indirectly_unary_invocable<BadInvocable4, It>);
+static_assert(std::indirectly_unary_invocable<GoodInvocable4, It>);
 
 // Should fail when the invocable doesn't have a common reference between its return types
 struct BadInvocable5 {
diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.readable/indirectly_readable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.readable/indirectly_readable.compile.pass.cpp
index e06234477e290..5fc6ffd37caf1 100644
--- a/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.readable/indirectly_readable.compile.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.readable/indirectly_readable.compile.pass.cpp
@@ -81,13 +81,12 @@ static_assert(!check_indirectly_readable<missing_iter_value_t>());
 struct unrelated_lvalue_ref_and_rvalue_ref {};
 
 struct iter_ref1 {};
-namespace std {
 template <>
-struct common_reference<iter_ref1&, iter_ref1&&> {};
+struct std::common_reference<iter_ref1&, iter_ref1&&> {};
 
 template <>
-struct common_reference<iter_ref1&&, iter_ref1&> {};
-} // namespace std
+struct std::common_reference<iter_ref1&&, iter_ref1&> {};
+
 static_assert(!std::common_reference_with<iter_ref1&, iter_ref1&&>);
 
 struct bad_iter_reference_t {
@@ -109,16 +108,16 @@ static_assert(!check_indirectly_readable<unrelated_iter_ref_rvalue_and_iter_rval
 struct iter_ref3 {
   operator iter_rvalue_ref() const;
 };
-namespace std {
+
 template <template <class> class XQual, template <class> class YQual>
-struct basic_common_reference<iter_ref3, iter_rvalue_ref, XQual, YQual> {
+struct std::basic_common_reference<iter_ref3, iter_rvalue_ref, XQual, YQual> {
   using type = iter_rvalue_ref;
 };
 template <template <class> class XQual, template <class> class YQual>
-struct basic_common_reference<iter_rvalue_ref, iter_ref3, XQual, YQual> {
+struct std::basic_common_reference<iter_rvalue_ref, iter_ref3, XQual, YQual> {
   using type = iter_rvalue_ref;
 };
-} // namespace std
+
 static_assert(std::common_reference_with<iter_ref3&&, iter_rvalue_ref&&>);
 
 struct different_reference_types_with_common_reference {
@@ -131,21 +130,22 @@ static_assert(check_indirectly_readable<different_reference_types_with_common_re
 struct iter_ref4 {
   operator iter_rvalue_ref() const;
 };
-namespace std {
+
 template <template <class> class XQual, template <class> class YQual>
-struct basic_common_reference<iter_ref4, iter_rvalue_ref, XQual, YQual> {
+struct std::basic_common_reference<iter_ref4, iter_rvalue_ref, XQual, YQual> {
   using type = iter_rvalue_ref;
 };
 template <template <class> class XQual, template <class> class YQual>
-struct basic_common_reference<iter_rvalue_ref, iter_ref4, XQual, YQual> {
+struct std::basic_common_reference<iter_rvalue_ref, iter_ref4, XQual, YQual> {
   using type = iter_rvalue_ref;
 };
 
+// FIXME: This is UB according to [meta.rqmts], and there is no exception for common_reference.
 template <>
-struct common_reference<iter_ref4 const&, iter_rvalue_ref&&> {};
+struct std::common_reference<iter_ref4 const&, iter_rvalue_ref&&> {};
 template <>
-struct common_reference<iter_rvalue_ref&&, iter_ref4 const&> {};
-} // namespace std
+struct std::common_reference<iter_rvalue_ref&&, iter_ref4 const&> {};
+
 static_assert(std::common_reference_with<iter_ref4&&, iter_rvalue_ref&&>);
 static_assert(!std::common_reference_with<iter_ref4 const&, iter_rvalue_ref&&>);
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
index 01467ca9911e1..85b641322d99e 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
@@ -15,6 +15,9 @@
 // XFAIL: apple-clang
 // XFAIL: using-built-library-before-llvm-11
 
+// AIX, z/OS, and MinGW default to -fno-sized-deallocation.
+// XFAIL: target={{.+}}-aix{{.*}}, target={{.+}}-zos{{.*}}, target={{.+}}-windows-gnu
+
 #include <new>
 #include <cstddef>
 #include <cstdlib>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
index fd7f2dbabdacc..ae614a1432f7d 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
@@ -15,6 +15,9 @@
 // XFAIL: apple-clang
 // XFAIL: using-built-library-before-llvm-11
 
+// AIX, z/OS, and MinGW default to -fno-sized-deallocation.
+// XFAIL: target={{.+}}-aix{{.*}}, target={{.+}}-zos{{.*}}, target={{.+}}-windows-gnu
+
 #include <new>
 #include <cstddef>
 #include <cstdlib>
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp
index 8ccd252115ac8..ded8006063241 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp
@@ -23,6 +23,7 @@
     __cpp_lib_parallel_algorithm                            201603L [C++17]
     __cpp_lib_ranges                                        202207L [C++20]
     __cpp_lib_ranges_contains                               202207L [C++23]
+    __cpp_lib_ranges_find_last                              202207L [C++23]
     __cpp_lib_ranges_starts_ends_with                       202106L [C++23]
     __cpp_lib_robust_nonmodifying_seq_ops                   201304L [C++14]
     __cpp_lib_sample                                        201603L [C++17]
@@ -62,6 +63,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23"
 # endif
@@ -108,6 +113,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23"
 # endif
@@ -169,6 +178,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23"
 # endif
@@ -239,6 +252,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23"
 # endif
@@ -315,6 +332,13 @@
 #   error "__cpp_lib_ranges_contains should have the value 202207L in c++23"
 # endif
 
+# ifndef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should be defined in c++23"
+# endif
+# if __cpp_lib_ranges_find_last != 202207L
+#   error "__cpp_lib_ranges_find_last should have the value 202207L in c++23"
+# endif
+
 # ifndef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should be defined in c++23"
 # endif
@@ -412,6 +436,13 @@
 #   error "__cpp_lib_ranges_contains should have the value 202207L in c++26"
 # endif
 
+# ifndef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should be defined in c++26"
+# endif
+# if __cpp_lib_ranges_find_last != 202207L
+#   error "__cpp_lib_ranges_find_last should have the value 202207L in c++26"
+# endif
+
 # ifndef __cpp_lib_ranges_starts_ends_with
 #   error "__cpp_lib_ranges_starts_ends_with should be defined in c++26"
 # endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
index aac00f20c7b45..1d61f43f9ee51 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
@@ -16,7 +16,7 @@
 // Test the feature test macros defined by <compare>
 
 /*  Constant                          Value
-    __cpp_lib_three_way_comparison    201711L [C++20]
+    __cpp_lib_three_way_comparison    201907L [C++20]
 */
 
 #include <compare>
@@ -45,8 +45,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++20"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++20"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++20"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -54,8 +54,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++23"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++23"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++23"
 # endif
 
 #elif TEST_STD_VER > 23
@@ -63,8 +63,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++26"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++26"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
index 45d9271faa578..aa1170658b103 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
@@ -485,17 +485,11 @@
 #   error "__cpp_lib_make_unique should have the value 201304L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should be defined in c++23"
-#   endif
-#   if __cpp_lib_out_ptr != 202106L
-#     error "__cpp_lib_out_ptr should have the value 202106L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_out_ptr
+#   error "__cpp_lib_out_ptr should be defined in c++23"
+# endif
+# if __cpp_lib_out_ptr != 202106L
+#   error "__cpp_lib_out_ptr should have the value 202106L in c++23"
 # endif
 
 # ifndef __cpp_lib_ranges
@@ -622,17 +616,11 @@
 #   error "__cpp_lib_make_unique should have the value 201304L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should be defined in c++26"
-#   endif
-#   if __cpp_lib_out_ptr != 202311L
-#     error "__cpp_lib_out_ptr should have the value 202311L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_out_ptr
+#   error "__cpp_lib_out_ptr should be defined in c++26"
+# endif
+# if __cpp_lib_out_ptr != 202311L
+#   error "__cpp_lib_out_ptr should have the value 202311L in c++26"
 # endif
 
 # ifndef __cpp_lib_ranges
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
index af6386a40a458..69a938edd1cb9 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
@@ -29,6 +29,7 @@
     __cpp_lib_string_udls                                   201304L [C++14]
     __cpp_lib_string_view                                   201606L [C++17]
                                                             201803L [C++20]
+                                                            202403L [C++26]
     __cpp_lib_to_string                                     202306L [C++26]
 */
 
@@ -483,8 +484,8 @@
 # ifndef __cpp_lib_string_view
 #   error "__cpp_lib_string_view should be defined in c++26"
 # endif
-# if __cpp_lib_string_view != 201803L
-#   error "__cpp_lib_string_view should have the value 201803L in c++26"
+# if __cpp_lib_string_view != 202403L
+#   error "__cpp_lib_string_view should have the value 202403L in c++26"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp
index a86ab2adff6a9..f3c70cf977973 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp
@@ -23,6 +23,7 @@
     __cpp_lib_string_contains             202011L [C++23]
     __cpp_lib_string_view                 201606L [C++17]
                                           201803L [C++20]
+                                          202403L [C++26]
 */
 
 #include <string_view>
@@ -252,8 +253,8 @@
 # ifndef __cpp_lib_string_view
 #   error "__cpp_lib_string_view should be defined in c++26"
 # endif
-# if __cpp_lib_string_view != 201803L
-#   error "__cpp_lib_string_view should have the value 201803L in c++26"
+# if __cpp_lib_string_view != 202403L
+#   error "__cpp_lib_string_view should have the value 202403L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index a01ee702a5172..b8bad696f1bae 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -173,6 +173,7 @@
     __cpp_lib_ranges_chunk_by                               202202L [C++23]
     __cpp_lib_ranges_concat                                 202403L [C++26]
     __cpp_lib_ranges_contains                               202207L [C++23]
+    __cpp_lib_ranges_find_last                              202207L [C++23]
     __cpp_lib_ranges_iota                                   202202L [C++23]
     __cpp_lib_ranges_join_with                              202202L [C++23]
     __cpp_lib_ranges_repeat                                 202207L [C++23]
@@ -216,10 +217,11 @@
     __cpp_lib_string_udls                                   201304L [C++14]
     __cpp_lib_string_view                                   201606L [C++17]
                                                             201803L [C++20]
+                                                            202403L [C++26]
     __cpp_lib_submdspan                                     202306L [C++26]
     __cpp_lib_syncbuf                                       201803L [C++20]
     __cpp_lib_text_encoding                                 202306L [C++26]
-    __cpp_lib_three_way_comparison                          201711L [C++20]
+    __cpp_lib_three_way_comparison                          201907L [C++20]
     __cpp_lib_to_address                                    201711L [C++20]
     __cpp_lib_to_array                                      201907L [C++20]
     __cpp_lib_to_chars                                      201611L [C++17]
@@ -848,6 +850,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_iota
 #   error "__cpp_lib_ranges_iota should not be defined before c++23"
 # endif
@@ -1744,6 +1750,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_iota
 #   error "__cpp_lib_ranges_iota should not be defined before c++23"
 # endif
@@ -2811,6 +2821,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_iota
 #   error "__cpp_lib_ranges_iota should not be defined before c++23"
 # endif
@@ -4145,6 +4159,10 @@
 #   error "__cpp_lib_ranges_contains should not be defined before c++23"
 # endif
 
+# ifdef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_ranges_iota
 #   error "__cpp_lib_ranges_iota should not be defined before c++23"
 # endif
@@ -4420,8 +4438,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++20"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++20"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++20"
 # endif
 
 # ifndef __cpp_lib_to_address
@@ -5541,17 +5559,11 @@
 #   error "__cpp_lib_optional_range_support should not be defined before c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should be defined in c++23"
-#   endif
-#   if __cpp_lib_out_ptr != 202106L
-#     error "__cpp_lib_out_ptr should have the value 202106L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_out_ptr
+#   error "__cpp_lib_out_ptr should be defined in c++23"
+# endif
+# if __cpp_lib_out_ptr != 202106L
+#   error "__cpp_lib_out_ptr should have the value 202106L in c++23"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
@@ -5662,6 +5674,13 @@
 #   error "__cpp_lib_ranges_contains should have the value 202207L in c++23"
 # endif
 
+# ifndef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should be defined in c++23"
+# endif
+# if __cpp_lib_ranges_find_last != 202207L
+#   error "__cpp_lib_ranges_find_last should have the value 202207L in c++23"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_ranges_iota
 #     error "__cpp_lib_ranges_iota should be defined in c++23"
@@ -6018,8 +6037,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++23"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++23"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++23"
 # endif
 
 # ifndef __cpp_lib_to_address
@@ -7382,17 +7401,11 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should be defined in c++26"
-#   endif
-#   if __cpp_lib_out_ptr != 202311L
-#     error "__cpp_lib_out_ptr should have the value 202311L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_out_ptr
-#     error "__cpp_lib_out_ptr should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_out_ptr
+#   error "__cpp_lib_out_ptr should be defined in c++26"
+# endif
+# if __cpp_lib_out_ptr != 202311L
+#   error "__cpp_lib_out_ptr should have the value 202311L in c++26"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
@@ -7521,6 +7534,13 @@
 #   error "__cpp_lib_ranges_contains should have the value 202207L in c++26"
 # endif
 
+# ifndef __cpp_lib_ranges_find_last
+#   error "__cpp_lib_ranges_find_last should be defined in c++26"
+# endif
+# if __cpp_lib_ranges_find_last != 202207L
+#   error "__cpp_lib_ranges_find_last should have the value 202207L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_ranges_iota
 #     error "__cpp_lib_ranges_iota should be defined in c++26"
@@ -7894,8 +7914,8 @@
 # ifndef __cpp_lib_string_view
 #   error "__cpp_lib_string_view should be defined in c++26"
 # endif
-# if __cpp_lib_string_view != 201803L
-#   error "__cpp_lib_string_view should have the value 201803L in c++26"
+# if __cpp_lib_string_view != 202403L
+#   error "__cpp_lib_string_view should have the value 202403L in c++26"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
@@ -7940,8 +7960,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++26"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++26"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++26"
 # endif
 
 # ifndef __cpp_lib_to_address
diff --git a/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp b/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
index b7fd892003e2d..194262bb42e3b 100644
--- a/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
@@ -11,9 +11,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
-// picolibc doesn't define TIME_UTC.
-// XFAIL: LIBCXX-PICOLIBC-FIXME
-
 // ::timespec_get is provided by the C library, but it's marked as
 // unavailable until macOS 10.15
 // XFAIL: target={{.+}}-apple-macosx10.{{13|14}}
diff --git a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
index 9506ca1c768bd..b26b397056343 100644
--- a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
+++ b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
@@ -88,6 +88,9 @@ static_assert(test(std::ranges::find_first_of, a, a));
 static_assert(test(std::ranges::find_if, a, odd));
 static_assert(test(std::ranges::find_if_not, a, odd));
 #if TEST_STD_VER >= 23
+static_assert(test(std::ranges::find_last, a, 42));
+static_assert(test(std::ranges::find_last_if, a, odd));
+static_assert(test(std::ranges::find_last_if_not, a, odd));
 static_assert(test(std::ranges::fold_left, a, 0, std::plus()));
 static_assert(test(std::ranges::fold_left_with_iter, a, 0, std::plus()));
 #endif
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/facet.ctype.special/facet.ctype.char.statics/classic_table.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/facet.ctype.special/facet.ctype.char.statics/classic_table.pass.cpp
index 50c60180ce0dd..1bd9fc88c72db 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/facet.ctype.special/facet.ctype.char.statics/classic_table.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/facet.ctype.special/facet.ctype.char.statics/classic_table.pass.cpp
@@ -6,6 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Picolibc/newlib checks for the blank flag in isprint, which needs to be false
+// for tab. So they removed the _B flag from the ctype data, which causes this
+// test to fail.
+// See https://github.com/picolibc/picolibc/issues/778.
 // XFAIL: LIBCXX-PICOLIBC-FIXME
 
 // <locale>
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
index 21efa978abdcc..946c26398fcb8 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
@@ -12,10 +12,7 @@
 
 // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const;
 
-// With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0"
-// while other C runtimes produce just "0x0p+0".
-// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
-// XFAIL: msvc
+// XFAIL: win32-broken-printf-a-precision
 
 // XFAIL: LIBCXX-AIX-FIXME
 
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
index c97c9a0c40369..a195c34e5f8e8 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
@@ -12,10 +12,7 @@
 
 // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const;
 
-// With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0"
-// while other C runtimes produce just "0x0p+0".
-// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
-// XFAIL: msvc
+// XFAIL: win32-broken-printf-a-precision
 
 // XFAIL: LIBCXX-AIX-FIXME
 
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/user_defined_char_type.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/user_defined_char_type.pass.cpp
index 9a4a2f0d5657e..17faf83892a7c 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/user_defined_char_type.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/user_defined_char_type.pass.cpp
@@ -26,9 +26,8 @@ struct Char {
   char underlying_;
 };
 
-namespace std {
 template <>
-struct char_traits<Char> {
+struct std::char_traits<Char> {
   using char_type  = Char;
   using int_type   = int;
   using off_type   = streamoff;
@@ -73,7 +72,7 @@ struct char_traits<Char> {
 
 // This ctype specialization treats all characters as spaces
 template <>
-class ctype<Char> : public locale::facet, public ctype_base {
+class std::ctype<Char> : public locale::facet, public ctype_base {
 public:
   using char_type = Char;
   static locale::id id;
@@ -121,10 +120,10 @@ class ctype<Char> : public locale::facet, public ctype_base {
   }
 };
 
-locale::id ctype<Char>::id;
+std::locale::id std::ctype<Char>::id;
 
 template <>
-class numpunct<Char> : public locale::facet {
+class std::numpunct<Char> : public locale::facet {
 public:
   typedef basic_string<Char> string_type;
   static locale::id id;
@@ -141,9 +140,7 @@ class numpunct<Char> : public locale::facet {
   }
 };
 
-locale::id numpunct<Char>::id;
-
-} // namespace std
+std::locale::id std::numpunct<Char>::id;
 
 int main(int, char**) {
   std::locale l(std::locale::classic(), new std::num_get<Char>);
diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
index f368b1069c063..850352b3bc1ec 100644
--- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
@@ -64,7 +64,7 @@ int main(int, char**)
         std::locale l(LOCALE_fr_FR_UTF_8);
         {
 #if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX)
-          const char sep = ' ';
+            const char sep = ' ';
 #else
             const char sep = ',';
 #endif
@@ -77,11 +77,11 @@ int main(int, char**)
 #if defined(_CS_GNU_LIBC_VERSION)
             const wchar_t wsep = glibc_version_less_than("2.27") ? L' ' : L'\u202f';
 #  elif defined(_AIX)
-          const wchar_t wsep = L'\u202F';
+            const wchar_t wsep = L'\u202F';
 #  elif defined(_WIN32)
-          const wchar_t wsep = L'\u00A0';
+            const wchar_t wsep = L'\u00A0';
 #  else
-          const wchar_t wsep = L',';
+            const wchar_t wsep = L',';
 #  endif
             typedef wchar_t C;
             const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
diff --git a/libcxx/test/std/numerics/c.math/cmath.pass.cpp b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
index 9379084499792..f53d8f259f47e 100644
--- a/libcxx/test/std/numerics/c.math/cmath.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // <cmath>
 
diff --git a/libcxx/test/std/numerics/c.math/hermite.pass.cpp b/libcxx/test/std/numerics/c.math/hermite.pass.cpp
new file mode 100644
index 0000000000000..08fbd5c3283c1
--- /dev/null
+++ b/libcxx/test/std/numerics/c.math/hermite.pass.cpp
@@ -0,0 +1,341 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <cmath>
+
+// double         hermite(unsigned n, double x);
+// float          hermite(unsigned n, float x);
+// long double    hermite(unsigned n, long double x);
+// float          hermitef(unsigned n, float x);
+// long double    hermitel(unsigned n, long double x);
+// template <class Integer>
+// double         hermite(unsigned n, Integer x);
+
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "type_algorithms.h"
+
+inline constexpr unsigned g_max_n = 128;
+
+template <class T>
+std::array<T, 11> sample_points() {
+  return {-12.34, -7.42, -1.0, -0.5, -0.1, 0.0, 0.1, 0.5, 1.0, 5.67, 15.67};
+}
+
+template <class Real>
+class CompareFloatingValues {
+private:
+  Real abs_tol;
+  Real rel_tol;
+
+public:
+  CompareFloatingValues() {
+    abs_tol = []() -> Real {
+      if (std::is_same_v<Real, float>)
+        return 1e-5f;
+      else if (std::is_same_v<Real, double>)
+        return 1e-11;
+      else
+        return 1e-12l;
+    }();
+
+    rel_tol = abs_tol;
+  }
+
+  bool operator()(Real result, Real expected) const {
+    if (std::isinf(expected) && std::isinf(result))
+      return result == expected;
+
+    if (std::isnan(expected) || std::isnan(result))
+      return false;
+
+    Real tol = abs_tol + std::abs(expected) * rel_tol;
+    return std::abs(result - expected) < tol;
+  }
+};
+
+// Roots are taken from
+// Salzer, Herbert E., Ruth Zucker, and Ruth Capuano.
+// Table of the zeros and weight factors of the first twenty Hermite
+// polynomials. US Government Printing Office, 1952.
+template <class T>
+std::vector<T> get_roots(unsigned n) {
+  switch (n) {
+  case 0:
+    return {};
+  case 1:
+    return {T(0)};
+  case 2:
+    return {T(0.707106781186548)};
+  case 3:
+    return {T(0), T(1.224744871391589)};
+  case 4:
+    return {T(0.524647623275290), T(1.650680123885785)};
+  case 5:
+    return {T(0), T(0.958572464613819), T(2.020182870456086)};
+  case 6:
+    return {T(0.436077411927617), T(1.335849074013697), T(2.350604973674492)};
+  case 7:
+    return {T(0), T(0.816287882858965), T(1.673551628767471), T(2.651961356835233)};
+  case 8:
+    return {T(0.381186990207322), T(1.157193712446780), T(1.981656756695843), T(2.930637420257244)};
+  case 9:
+    return {T(0), T(0.723551018752838), T(1.468553289216668), T(2.266580584531843), T(3.190993201781528)};
+  case 10:
+    return {
+        T(0.342901327223705), T(1.036610829789514), T(1.756683649299882), T(2.532731674232790), T(3.436159118837738)};
+  case 11:
+    return {T(0),
+            T(0.65680956682100),
+            T(1.326557084494933),
+            T(2.025948015825755),
+            T(2.783290099781652),
+            T(3.668470846559583)};
+
+  case 12:
+    return {T(0.314240376254359),
+            T(0.947788391240164),
+            T(1.597682635152605),
+            T(2.279507080501060),
+            T(3.020637025120890),
+            T(3.889724897869782)};
+
+  case 13:
+    return {T(0),
+            T(0.605763879171060),
+            T(1.220055036590748),
+            T(1.853107651601512),
+            T(2.519735685678238),
+            T(3.246608978372410),
+            T(4.101337596178640)};
+
+  case 14:
+    return {T(0.29174551067256),
+            T(0.87871378732940),
+            T(1.47668273114114),
+            T(2.09518325850772),
+            T(2.74847072498540),
+            T(3.46265693360227),
+            T(4.30444857047363)};
+
+  case 15:
+    return {T(0.00000000000000),
+            T(0.56506958325558),
+            T(1.13611558521092),
+            T(1.71999257518649),
+            T(2.32573248617386),
+            T(2.96716692790560),
+            T(3.66995037340445),
+            T(4.49999070730939)};
+
+  case 16:
+    return {T(0.27348104613815),
+            T(0.82295144914466),
+            T(1.38025853919888),
+            T(1.95178799091625),
+            T(2.54620215784748),
+            T(3.17699916197996),
+            T(3.86944790486012),
+            T(4.68873893930582)};
+
+  case 17:
+    return {T(0),
+            T(0.5316330013427),
+            T(1.0676487257435),
+            T(1.6129243142212),
+            T(2.1735028266666),
+            T(2.7577629157039),
+            T(3.3789320911415),
+            T(4.0619466758755),
+            T(4.8713451936744)};
+
+  case 18:
+    return {T(0.2582677505191),
+            T(0.7766829192674),
+            T(1.3009208583896),
+            T(1.8355316042616),
+            T(2.3862990891667),
+            T(2.9613775055316),
+            T(3.5737690684863),
+            T(4.2481178735681),
+            T(5.0483640088745)};
+
+  case 19:
+    return {T(0),
+            T(0.5035201634239),
+            T(1.0103683871343),
+            T(1.5241706193935),
+            T(2.0492317098506),
+            T(2.5911337897945),
+            T(3.1578488183476),
+            T(3.7621873519640),
+            T(4.4285328066038),
+            T(5.2202716905375)};
+
+  case 20:
+    return {T(0.2453407083009),
+            T(0.7374737285454),
+            T(1.2340762153953),
+            T(1.7385377121166),
+            T(2.2549740020893),
+            T(2.7888060584281),
+            T(3.347854567332),
+            T(3.9447640401156),
+            T(4.6036824495507),
+            T(5.3874808900112)};
+
+  default: // polynom degree n>20 is unsupported
+    assert(false);
+    return {T(-42)};
+  }
+}
+
+template <class Real>
+void test() {
+  { // checks if NaNs are reported correctly (i.e. output == input for input == NaN)
+    using nl = std::numeric_limits<Real>;
+    for (Real NaN : {nl::quiet_NaN(), nl::signaling_NaN()})
+      for (unsigned n = 0; n < g_max_n; ++n)
+        assert(std::isnan(std::hermite(n, NaN)));
+  }
+
+  { // simple sample points for n=0..127 should not produce NaNs.
+    for (Real x : sample_points<Real>())
+      for (unsigned n = 0; n < g_max_n; ++n)
+        assert(!std::isnan(std::hermite(n, x)));
+  }
+
+  { // checks std::hermite(n, x) for n=0..5 against analytic polynoms
+    const auto h0 = [](Real) -> Real { return 1; };
+    const auto h1 = [](Real y) -> Real { return 2 * y; };
+    const auto h2 = [](Real y) -> Real { return 4 * y * y - 2; };
+    const auto h3 = [](Real y) -> Real { return y * (8 * y * y - 12); };
+    const auto h4 = [](Real y) -> Real { return (16 * std::pow(y, 4) - 48 * y * y + 12); };
+    const auto h5 = [](Real y) -> Real { return y * (32 * std::pow(y, 4) - 160 * y * y + 120); };
+
+    for (Real x : sample_points<Real>()) {
+      const CompareFloatingValues<Real> compare;
+      assert(compare(std::hermite(0, x), h0(x)));
+      assert(compare(std::hermite(1, x), h1(x)));
+      assert(compare(std::hermite(2, x), h2(x)));
+      assert(compare(std::hermite(3, x), h3(x)));
+      assert(compare(std::hermite(4, x), h4(x)));
+      assert(compare(std::hermite(5, x), h5(x)));
+    }
+  }
+
+  { // checks std::hermitef for bitwise equality with std::hermite(unsigned, float)
+    if constexpr (std::is_same_v<Real, float>)
+      for (unsigned n = 0; n < g_max_n; ++n)
+        for (float x : sample_points<float>())
+          assert(std::hermite(n, x) == std::hermitef(n, x));
+  }
+
+  { // checks std::hermitel for bitwise equality with std::hermite(unsigned, long double)
+    if constexpr (std::is_same_v<Real, long double>)
+      for (unsigned n = 0; n < g_max_n; ++n)
+        for (long double x : sample_points<long double>())
+          assert(std::hermite(n, x) == std::hermitel(n, x));
+  }
+
+  { // Checks if the characteristic recurrence relation holds:    H_{n+1}(x) = 2x H_n(x) - 2n H_{n-1}(x)
+    for (Real x : sample_points<Real>()) {
+      for (unsigned n = 1; n < g_max_n - 1; ++n) {
+        Real H_next            = std::hermite(n + 1, x);
+        Real H_next_recurrence = 2 * (x * std::hermite(n, x) - n * std::hermite(n - 1, x));
+
+        if (std::isinf(H_next))
+          break;
+        const CompareFloatingValues<Real> compare;
+        assert(compare(H_next, H_next_recurrence));
+      }
+    }
+  }
+
+  { // sanity checks: hermite polynoms need to change signs at (simple) roots. checked upto order n<=20.
+
+    // root tolerance: must be smaller than the smallest difference between adjacent roots
+    Real tol = []() -> Real {
+      if (std::is_same_v<Real, float>)
+        return 1e-5f;
+      else if (std::is_same_v<Real, double>)
+        return 1e-9;
+      else
+        return 1e-10l;
+    }();
+
+    const auto is_sign_change = [tol](unsigned n, Real x) -> bool {
+      return std::hermite(n, x - tol) * std::hermite(n, x + tol) < 0;
+    };
+
+    for (unsigned n = 0; n <= 20u; ++n) {
+      for (Real x : get_roots<Real>(n)) {
+        // the roots are symmetric: if x is a root, so is -x
+        if (x > 0)
+          assert(is_sign_change(n, -x));
+        assert(is_sign_change(n, x));
+      }
+    }
+  }
+
+  { // check input infinity is handled correctly
+    Real inf = std::numeric_limits<Real>::infinity();
+    for (unsigned n = 1; n < g_max_n; ++n) {
+      assert(std::hermite(n, +inf) == inf);
+      assert(std::hermite(n, -inf) == ((n & 1) ? -inf : inf));
+    }
+  }
+
+  { // check: if overflow occurs that it is mapped to the correct infinity
+    if constexpr (std::is_same_v<Real, double>) {
+      // Q: Why only double?
+      // A: The numeric values (e.g. overflow threshold `n`) below are different for other types.
+      static_assert(sizeof(double) == 8);
+      for (unsigned n = 0; n < g_max_n; ++n) {
+        // Q: Why n=111 and x=300?
+        // A: Both are chosen s.t. the first overlow occurs for some `n<g_max_n`.
+        if (n < 111) {
+          assert(std::isfinite(std::hermite(n, +300.0)));
+          assert(std::isfinite(std::hermite(n, -300.0)));
+        } else {
+          double inf = std::numeric_limits<double>::infinity();
+          assert(std::hermite(n, +300.0) == inf);
+          assert(std::hermite(n, -300.0) == ((n & 1) ? -inf : inf));
+        }
+      }
+    }
+  }
+}
+
+struct TestFloat {
+  template <class Real>
+  void operator()() {
+    test<Real>();
+  }
+};
+
+struct TestInt {
+  template <class Integer>
+  void operator()() {
+    // checks that std::hermite(unsigned, Integer) actually wraps std::hermite(unsigned, double)
+    for (unsigned n = 0; n < g_max_n; ++n)
+      for (Integer x : {-42, -7, -5, -1, 0, 1, 5, 7, 42})
+        assert(std::hermite(n, x) == std::hermite(n, static_cast<double>(x)));
+  }
+};
+
+int main() {
+  types::for_each(types::floating_point_types(), TestFloat());
+  types::for_each(types::type_list<short, int, long, long long>(), TestInt());
+}
diff --git a/libcxx/test/std/numerics/c.math/isfinite.pass.cpp b/libcxx/test/std/numerics/c.math/isfinite.pass.cpp
index 6bbc3aaac6d13..3d5be61634334 100644
--- a/libcxx/test/std/numerics/c.math/isfinite.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/isfinite.pass.cpp
@@ -62,9 +62,21 @@ struct TestInt {
   }
 };
 
+template <typename T>
+struct ConvertibleTo {
+  operator T() const { return T(); }
+};
+
 int main(int, char**) {
   types::for_each(types::floating_point_types(), TestFloat());
   types::for_each(types::integral_types(), TestInt());
 
+  // Make sure we can call `std::isfinite` with convertible types
+  {
+    assert(std::isfinite(ConvertibleTo<float>()));
+    assert(std::isfinite(ConvertibleTo<double>()));
+    assert(std::isfinite(ConvertibleTo<long double>()));
+  }
+
   return 0;
 }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
index 549f3cee4d4a4..a523bb9b194b7 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
@@ -18,8 +18,8 @@
 // Serializing/deserializing the state of the RNG requires iostreams
 // UNSUPPORTED: no-localization
 
-// This test appears to hang with picolibc & qemu.
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
+// Very slow when run in qemu.
+// REQUIRES: long_tests
 
 #include <random>
 #include <numeric>
diff --git a/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp
index 155f88010309c..2dddd84d8796e 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp
@@ -19,6 +19,7 @@
 #include <array>
 #include <cassert>
 #include <concepts>
+#include <memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
index 28ac53c2445d4..7e01b5884728f 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
@@ -73,12 +73,12 @@ constexpr bool test() {
   std::ranges::drop_view dropView7(MoveOnlyView(), 10);
   assert(dropView7.begin() == globalBuff + 8);
 
-  CountedView view8;
+  IteratorOpCounts opcounts;
+  CountedView view8(&opcounts);
+  ;
   std::ranges::drop_view dropView8(view8, 5);
   assert(base(base(dropView8.begin())) == globalBuff + 5);
-  assert(dropView8.begin().stride_count() == 5);
-  assert(base(base(dropView8.begin())) == globalBuff + 5);
-  assert(dropView8.begin().stride_count() == 5);
+  assert(opcounts.increments == 5);
 
   static_assert(!BeginInvocable<const ForwardView>);
 
diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
index ae861bce40f1e..73d1e5045ad22 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
+++ b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
@@ -120,10 +120,14 @@ struct Range {
   int *end() const;
 };
 
-using CountedIter = stride_counting_iterator<forward_iterator<int*>>;
+using CountedIter = operation_counting_iterator<forward_iterator<int*>>;
 struct CountedView : std::ranges::view_base {
-  constexpr CountedIter begin() const { return CountedIter(ForwardIter(globalBuff)); }
-  constexpr CountedIter end() const { return CountedIter(ForwardIter(globalBuff + 8)); }
+  explicit constexpr CountedView(IteratorOpCounts* opcounts) noexcept : opcounts_(opcounts) {}
+  constexpr CountedIter begin() const { return CountedIter(ForwardIter(globalBuff), opcounts_); }
+  constexpr CountedIter end() const { return CountedIter(ForwardIter(globalBuff + 8), opcounts_); }
+
+private:
+  IteratorOpCounts* opcounts_;
 };
 
 struct View : std::ranges::view_base {
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
index 06eb91dba22ff..5d8bf51c69696 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
@@ -37,7 +37,7 @@ int main(int, char**) {
     using View = std::ranges::transform_view<MoveOnlyView, PlusOneNoexcept>;
     View transformView(MoveOnlyView{buff}, PlusOneNoexcept{});
     assert(*transformView.begin() == 1);
-    LIBCPP_ASSERT_NOEXCEPT(*std::declval<std::ranges::iterator_t<View>>());
+    ASSERT_NOEXCEPT(*std::declval<std::ranges::iterator_t<View>>());
     ASSERT_SAME_TYPE(int, decltype(*std::declval<View>().begin()));
   }
   {
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp
deleted file mode 100644
index ea1e4b61412c4..0000000000000
--- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// friend constexpr decltype(auto) iter_move(const iterator& i)
-//    noexcept(noexcept(invoke(i.parent_->fun_, *i.current_)))
-
-#include <ranges>
-
-#include "test_macros.h"
-#include "../types.h"
-
-constexpr bool test() {
-  int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-
-  {
-    std::ranges::transform_view transformView(MoveOnlyView{buff}, PlusOneMutable{});
-    auto iter = transformView.begin();
-    ASSERT_NOT_NOEXCEPT(std::ranges::iter_move(iter));
-
-    assert(std::ranges::iter_move(iter) == 1);
-    assert(std::ranges::iter_move(iter + 2) == 3);
-
-    ASSERT_SAME_TYPE(int, decltype(std::ranges::iter_move(iter)));
-    ASSERT_SAME_TYPE(int, decltype(std::ranges::iter_move(std::move(iter))));
-  }
-
-  {
-    LIBCPP_ASSERT_NOEXCEPT(std::ranges::iter_move(
-      std::declval<std::ranges::iterator_t<std::ranges::transform_view<MoveOnlyView, PlusOneNoexcept>>&>()));
-    ASSERT_NOT_NOEXCEPT(std::ranges::iter_move(
-      std::declval<std::ranges::iterator_t<std::ranges::transform_view<MoveOnlyView, PlusOneMutable>>&>()));
-  }
-
-  return true;
-}
-
-int main(int, char**) {
-  test();
-  static_assert(test());
-
-  return 0;
-}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/types.h b/libcxx/test/std/ranges/range.adaptors/range.transform/types.h
index 14f85722a8c19..cc5679f229de2 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.transform/types.h
+++ b/libcxx/test/std/ranges/range.adaptors/range.transform/types.h
@@ -119,12 +119,6 @@ struct Range {
   int *end() const;
 };
 
-using CountedIter = stride_counting_iterator<forward_iterator<int*>>;
-struct CountedView : std::ranges::view_base {
-  constexpr CountedIter begin() const { return CountedIter(ForwardIter(globalBuff)); }
-  constexpr CountedIter end() const { return CountedIter(ForwardIter(globalBuff + 8)); }
-};
-
 struct TimesTwo {
   constexpr int operator()(int x) const { return x * 2; }
 };
diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp
index a4696d73839b6..549ebffe5c97e 100644
--- a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp
@@ -10,9 +10,10 @@
 
 // constexpr iterator& operator+=(difference_type n);
 
-#include <ranges>
 #include <cassert>
 #include <concepts>
+#include <memory>
+#include <ranges>
 
 constexpr bool test() {
   std::ranges::repeat_view<int> v(10);
diff --git a/libcxx/test/std/ranges/range.utility/range.subrange/ctor.range_size.pass.cpp b/libcxx/test/std/ranges/range.utility/range.subrange/ctor.range_size.pass.cpp
index 4b29546465e2c..fa2d34db15f23 100644
--- a/libcxx/test/std/ranges/range.utility/range.subrange/ctor.range_size.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/range.subrange/ctor.range_size.pass.cpp
@@ -27,10 +27,8 @@ struct BorrowedRange {
   int* end_;
 };
 
-namespace std::ranges {
-  template <>
-  inline constexpr bool enable_borrowed_range<::BorrowedRange> = true;
-}
+template <>
+inline constexpr bool std::ranges::enable_borrowed_range<BorrowedRange> = true;
 
 constexpr bool test() {
   int buff[] = {1, 2, 3, 4, 5, 6, 7, 8};
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
index edc8b67808b85..abd284852a189 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
@@ -15,6 +15,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <memory>
 #include <string>
 
 #include "make_string.h"
@@ -29,7 +30,7 @@ constexpr void test_appending(std::size_t k, size_t N, size_t new_capacity) {
   s.resize_and_overwrite(new_capacity, [&](auto* p, auto n) {
     assert(n == new_capacity);
     LIBCPP_ASSERT(s.size() == new_capacity);
-    LIBCPP_ASSERT(s.begin().base() == p);
+    LIBCPP_ASSERT(std::to_address(s.begin()) == p);
     assert(std::all_of(p, p + k, [](const auto ch) { return ch == 'a'; }));
     std::fill(p + k, p + n, 'b');
     p[n] = 'c'; // will be overwritten
@@ -48,7 +49,7 @@ constexpr void test_truncating(std::size_t o, size_t N) {
   s.resize_and_overwrite(N, [&](auto* p, auto n) {
     assert(n == N);
     LIBCPP_ASSERT(s.size() == n);
-    LIBCPP_ASSERT(s.begin().base() == p);
+    LIBCPP_ASSERT(std::to_address(s.begin()) == p);
     assert(std::all_of(p, p + n, [](auto ch) { return ch == 'a'; }));
     p[n - 1] = 'b';
     p[n]     = 'c'; // will be overwritten
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp
index 057050cdcf7fa..6f5e43d1341f5 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp
@@ -63,8 +63,49 @@ TEST_CONSTEXPR_CXX20 bool test() {
   return true;
 }
 
+#if TEST_STD_VER >= 23
+std::size_t min_bytes = 1000;
+
+template <typename T>
+struct increasing_allocator {
+  using value_type       = T;
+  increasing_allocator() = default;
+  template <typename U>
+  increasing_allocator(const increasing_allocator<U>&) noexcept {}
+  std::allocation_result<T*> allocate_at_least(std::size_t n) {
+    std::size_t allocation_amount = n * sizeof(T);
+    if (allocation_amount < min_bytes)
+      allocation_amount = min_bytes;
+    min_bytes += 1000;
+    return {static_cast<T*>(::operator new(allocation_amount)), allocation_amount / sizeof(T)};
+  }
+  T* allocate(std::size_t n) { return allocate_at_least(n).ptr; }
+  void deallocate(T* p, std::size_t) noexcept { ::operator delete(static_cast<void*>(p)); }
+};
+
+template <typename T, typename U>
+bool operator==(increasing_allocator<T>, increasing_allocator<U>) {
+  return true;
+}
+
+// https://github.com/llvm/llvm-project/issues/95161
+void test_increasing_allocator() {
+  std::basic_string<char, std::char_traits<char>, increasing_allocator<char>> s{
+      "String does not fit in the internal buffer"};
+  std::size_t capacity = s.capacity();
+  std::size_t size     = s.size();
+  s.shrink_to_fit();
+  assert(s.capacity() <= capacity);
+  assert(s.size() == size);
+  LIBCPP_ASSERT(is_string_asan_correct(s));
+}
+#endif // TEST_STD_VER >= 23
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 23
+  test_increasing_allocator();
+#endif
 #if TEST_STD_VER > 17
   static_assert(test());
 #endif
diff --git a/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp b/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
index a352b4cf9b92e..8c2324c9d1759 100644
--- a/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
@@ -23,9 +23,8 @@ struct VeryLarge {
   char b;
 };
 
-namespace std {
 template <>
-struct char_traits<VeryLarge> {
+struct std::char_traits<VeryLarge> {
   using char_type  = VeryLarge;
   using int_type   = int;
   using off_type   = streamoff;
@@ -55,7 +54,6 @@ struct char_traits<VeryLarge> {
   static bool eq_int_type(int_type c1, int_type c2);
   static int_type eof();
 };
-} // end namespace std
 
 template <class S>
 TEST_CONSTEXPR_CXX20 void test(S s, typename S::value_type c, S expected) {
diff --git a/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp
new file mode 100644
index 0000000000000..3d981e8b3a3cd
--- /dev/null
+++ b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp
@@ -0,0 +1,216 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <string>
+
+// [string.op.plus]
+//
+// template<class charT, class traits, class Allocator>
+//   constexpr basic_string<charT, traits, Allocator>
+//     operator+(const basic_string<charT, traits, Allocator>& lhs,
+//               type_identity_t<basic_string_view<charT, traits>> rhs);                           // Since C++26
+// template<class charT, class traits, class Allocator>
+//   constexpr basic_string<charT, traits, Allocator>
+//     operator+(basic_string<charT, traits, Allocator>&& lhs,
+//               type_identity_t<basic_string_view<charT, traits>> rhs);                           // Since C++26
+// template<class charT, class traits, class Allocator>
+//   constexpr basic_string<charT, traits, Allocator>
+//     operator+(type_identity_t<basic_string_view<charT, traits>> lhs,
+//               const basic_string<charT, traits, Allocator>& rhs);                               // Since C++26
+// template<class charT, class traits, class Allocator>
+//   constexpr basic_string<charT, traits, Allocator>
+//     operator+(type_identity_t<basic_string_view<charT, traits>> lhs,
+//               basic_string<charT, traits, Allocator>&& rhs);                                    // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <string>
+#include <utility>
+
+#include "asan_testing.h"
+#include "constexpr_char_traits.h"
+#include "make_string.h"
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <typename CharT, class TraitsT = std::char_traits<CharT>>
+class ConvertibleToStringView {
+public:
+  constexpr explicit ConvertibleToStringView(const CharT* cs) : cs_{cs} {}
+
+  constexpr operator std::basic_string_view<CharT, TraitsT>() { return std::basic_string_view<CharT, TraitsT>(cs_); }
+  constexpr operator std::basic_string_view<CharT, TraitsT>() const {
+    return std::basic_string_view<CharT, TraitsT>(cs_);
+  }
+
+private:
+  const CharT* cs_;
+};
+
+static_assert(std::constructible_from<std::basic_string_view<char>, const ConvertibleToStringView<char>>);
+static_assert(std::convertible_to<const ConvertibleToStringView<char>, std::basic_string_view<char>>);
+
+static_assert(std::constructible_from<std::basic_string_view<char>, ConvertibleToStringView<char>>);
+static_assert(std::convertible_to<ConvertibleToStringView<char>, std::basic_string_view<char>>);
+
+#define CS(S) MAKE_CSTRING(CharT, S)
+
+template <template <typename, typename> typename StringViewT, typename CharT, typename TraitsT, typename AllocT>
+constexpr void test(const CharT* x, const CharT* y, const CharT* expected) {
+  AllocT allocator;
+
+  // string& + string_view
+  {
+    std::basic_string<CharT, TraitsT, AllocT> st{x, allocator};
+    StringViewT<CharT, TraitsT> sv{y};
+
+    std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = st + sv;
+    assert(result == expected);
+    assert(result.get_allocator() == allocator);
+    LIBCPP_ASSERT(is_string_asan_correct(st + sv));
+  }
+  // const string& + string_view
+  {
+    const std::basic_string<CharT, TraitsT, AllocT> st{x, allocator};
+    StringViewT<CharT, TraitsT> sv{y};
+
+    std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = st + sv;
+    assert(result == expected);
+    assert(result.get_allocator() == allocator);
+    LIBCPP_ASSERT(is_string_asan_correct(st + sv));
+  }
+  // string&& + string_view
+  {
+    std::basic_string<CharT, TraitsT, AllocT> st{x, allocator};
+    StringViewT<CharT, TraitsT> sv{y};
+
+    std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = std::move(st) + sv;
+    assert(result == expected);
+    assert(result.get_allocator() == allocator);
+    LIBCPP_ASSERT(is_string_asan_correct(std::move(st) + sv));
+  }
+  // string_view + string&
+  {
+    StringViewT<CharT, TraitsT> sv{x};
+    std::basic_string<CharT, TraitsT, AllocT> st{y, allocator};
+
+    std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = sv + st;
+    assert(result == expected);
+    assert(result.get_allocator() == allocator);
+    LIBCPP_ASSERT(is_string_asan_correct(sv + st));
+  }
+  // string_view + const string&
+  {
+    StringViewT<CharT, TraitsT> sv{x};
+    const std::basic_string<CharT, TraitsT, AllocT> st{y, allocator};
+
+    std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = sv + st;
+    assert(result == expected);
+    assert(result.get_allocator() == allocator);
+    LIBCPP_ASSERT(is_string_asan_correct(sv + st));
+  }
+  // string_view + string&&
+  {
+    // TODO: Remove workaround once https://github.com/llvm/llvm-project/issues/92382 is fixed.
+    // Create a `basic_string` to workaround clang bug:
+    // https://github.com/llvm/llvm-project/issues/92382
+    // Comparison between pointers to a string literal and some other object results in constant evaluation failure.
+    if constexpr (std::same_as<StringViewT<CharT, TraitsT>, std::basic_string_view<CharT, TraitsT>>) {
+      std::basic_string<CharT, TraitsT, AllocT> st_{x, allocator};
+      StringViewT<CharT, TraitsT> sv{st_};
+      std::basic_string<CharT, TraitsT, AllocT> st{y, allocator};
+
+      std::same_as<std::basic_string<CharT, TraitsT, AllocT>> decltype(auto) result = sv + std::move(st);
+      assert(result == expected);
+      assert(result.get_allocator() == allocator);
+      LIBCPP_ASSERT(is_string_asan_correct(sv + std::move(st)));
+    }
+  }
+}
+
+template <template <typename, typename> typename StringViewT,
+          typename CharT,
+          typename TraitsT,
+          typename AllocT = std::allocator<CharT>>
+constexpr void test() {
+  // Concatenate with an empty `string`/`string_view`
+  test<StringViewT, CharT, TraitsT, AllocT>(CS(""), CS(""), CS(""));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS(""), CS("short"), CS("short"));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS(""), CS("not so short"), CS("not so short"));
+  test<StringViewT, CharT, TraitsT, AllocT>(
+      CS(""), CS("this is a much longer string"), CS("this is a much longer string"));
+
+  test<StringViewT, CharT, TraitsT, AllocT>(CS(""), CS(""), CS(""));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS("short"), CS(""), CS("short"));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS("not so short"), CS(""), CS("not so short"));
+  test<StringViewT, CharT, TraitsT, AllocT>(
+      CS("this is a much longer string"), CS(""), CS("this is a much longer string"));
+
+  // Non empty
+  test<StringViewT, CharT, TraitsT, AllocT>(CS("B"), CS("D"), CS("BD"));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS("zmt94"), CS("+hkt82"), CS("zmt94+hkt82"));
+  test<StringViewT, CharT, TraitsT, AllocT>(CS("not so short"), CS("+is not bad"), CS("not so short+is not bad"));
+  test<StringViewT, CharT, TraitsT, AllocT>(
+      CS("this is a much longer string"),
+      CS("+which is so much better"),
+      CS("this is a much longer string+which is so much better"));
+}
+
+template <template <typename, typename> typename StringViewT, typename CharT>
+constexpr bool test() {
+  test<StringViewT, CharT, std::char_traits<CharT>>();
+  test<StringViewT, CharT, std::char_traits<CharT>, min_allocator<CharT>>();
+  test<StringViewT, CharT, std::char_traits<CharT>, safe_allocator<CharT>>();
+  test<StringViewT, CharT, std::char_traits<CharT>, test_allocator<CharT>>();
+
+  test<StringViewT, CharT, constexpr_char_traits<CharT>>();
+  test<StringViewT, CharT, constexpr_char_traits<CharT>, min_allocator<CharT>>();
+  test<StringViewT, CharT, constexpr_char_traits<CharT>, safe_allocator<CharT>>();
+  test<StringViewT, CharT, constexpr_char_traits<CharT>, test_allocator<CharT>>();
+
+  return true;
+}
+
+int main(int, char**) {
+  // std::basic_string_view
+  test<std::basic_string_view, char>();
+  static_assert(test<std::basic_string_view, char>());
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<std::basic_string_view, wchar_t>();
+  static_assert(test<std::basic_string_view, wchar_t>());
+#endif
+#ifndef TEST_HAS_NO_CHAR8_T
+  test<std::basic_string_view, char8_t>();
+  static_assert(test<std::basic_string_view, char8_t>());
+#endif
+  test<std::basic_string_view, char16_t>();
+  static_assert(test<std::basic_string_view, char16_t>());
+  test<std::basic_string_view, char32_t>();
+  static_assert(test<std::basic_string_view, char32_t>());
+
+  // ConvertibleToStringView
+  test<ConvertibleToStringView, char>();
+  static_assert(test<ConvertibleToStringView, char>());
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<ConvertibleToStringView, wchar_t>();
+  static_assert(test<ConvertibleToStringView, wchar_t>());
+#endif
+#ifndef TEST_HAS_NO_CHAR8_T
+  test<ConvertibleToStringView, char8_t>();
+  static_assert(test<ConvertibleToStringView, char8_t>());
+#endif
+  test<ConvertibleToStringView, char16_t>();
+  static_assert(test<ConvertibleToStringView, char16_t>());
+  test<ConvertibleToStringView, char32_t>();
+  static_assert(test<ConvertibleToStringView, char32_t>());
+
+  return 0;
+}
diff --git a/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp b/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp
new file mode 100644
index 0000000000000..75d492bf7b3c6
--- /dev/null
+++ b/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: !stdlib=libc++ && (c++03 || c++11 || c++14)
+
+// <string_view>
+
+// class iterator
+
+#include <cassert>
+#include <concepts>
+#include <iterator>
+#include <string_view>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+template <class CharT>
+TEST_CONSTEXPR_CXX14 void test_type() {
+  using C                  = std::basic_string_view<CharT>;
+  typename C::iterator ii1 = typename C::iterator(), ii2 = typename C::iterator();
+  typename C::iterator ii4       = ii1;
+  typename C::const_iterator cii = typename C::const_iterator();
+  assert(ii1 == ii2);
+  assert(ii1 == ii4);
+  assert(ii1 == cii);
+
+  assert(!(ii1 != ii2));
+  assert(!(ii1 != cii));
+
+#if TEST_STD_VER >= 17
+  C c = MAKE_STRING_VIEW(CharT, "abc");
+  assert(c.begin() == std::begin(c));
+  assert(c.rbegin() == std::rbegin(c));
+  assert(c.cbegin() == std::cbegin(c));
+  assert(c.crbegin() == std::crbegin(c));
+
+  assert(c.end() == std::end(c));
+  assert(c.rend() == std::rend(c));
+  assert(c.cend() == std::cend(c));
+  assert(c.crend() == std::crend(c));
+
+  assert(std::begin(c) != std::end(c));
+  assert(std::rbegin(c) != std::rend(c));
+  assert(std::cbegin(c) != std::cend(c));
+  assert(std::crbegin(c) != std::crend(c));
+#endif
+
+#if TEST_STD_VER >= 20
+  // P1614 + LWG3352
+  std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+  assert(r1 == std::strong_ordering::equal);
+
+  std::same_as<std::strong_ordering> decltype(auto) r2 = ii1 <=> ii2;
+  assert(r2 == std::strong_ordering::equal);
+#endif
+}
+
+TEST_CONSTEXPR_CXX14 bool test() {
+  test_type<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test_type<wchar_t>();
+#endif
+#ifndef TEST_HAS_NO_CHAR8_T
+  test_type<char8_t>();
+#endif
+  test_type<char16_t>();
+  test_type<char32_t>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 14
+  static_assert(test(), "");
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
index d9d9c1dba6bbb..b1ad6447a9e21 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
index aff7b26e16f70..b0d94a8a3f4fe 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
index 8c45ba9278f28..2d747e3c9b9da 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
index 633a0c8bf2366..892e29b9dfa9e 100644
--- a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
index fe7068d2a574c..d67cf36c860ea 100644
--- a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <barrier>
 
diff --git a/libcxx/test/std/thread/thread.barrier/max.pass.cpp b/libcxx/test/std/thread/thread.barrier/max.pass.cpp
index b09a02e1bdef4..a3ec904897a18 100644
--- a/libcxx/test/std/thread/thread.barrier/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/max.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <barrier>
 
diff --git a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
index 8ca4f37b73b95..23cb2706beb5b 100644
--- a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
index eb524abd24b98..f33f7b21908d4 100644
--- a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
index bca4561bd2f74..df258b01be33c 100644
--- a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <latch>
 
diff --git a/libcxx/test/std/thread/thread.latch/max.pass.cpp b/libcxx/test/std/thread/thread.latch/max.pass.cpp
index bcf353ed9712e..4490f94a2dac7 100644
--- a/libcxx/test/std/thread/thread.latch/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/max.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <latch>
 
diff --git a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
index 8f354463a8697..fa09e5632fbfa 100644
--- a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
index 337ad4c45a94d..8c7ca4279eead 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <mutex>
@@ -18,12 +17,13 @@
 #include <mutex>
 
 #include "test_macros.h"
+#include "types.h"
 
 int main(int, char**) {
-  std::mutex mutex;
+  MyMutex mutex;
   {
     std::unique_lock lock(mutex);
-    ASSERT_SAME_TYPE(decltype(lock), std::unique_lock<std::mutex>);
+    ASSERT_SAME_TYPE(decltype(lock), std::unique_lock<MyMutex>);
   }
 
   return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp
similarity index 59%
rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp
index 799cb61f9b21e..9ab8369637cdc 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp
@@ -13,22 +13,7 @@
 // unique_lock& operator=(unique_lock const&) = delete;
 
 #include <mutex>
-#include <cassert>
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
-    M m0;
-    M m1;
-    std::unique_lock<M> lk0(m0);
-    std::unique_lock<M> lk1(m1);
-    lk1 = lk0;
-    assert(lk1.mutex() == &m0);
-    assert(lk1.owns_lock() == true);
-    assert(lk0.mutex() == nullptr);
-    assert(lk0.owns_lock() == false);
-    }
+#include "../types.h"
 
-  return 0;
-}
+static_assert(!std::is_copy_assignable<std::lock_guard<MyMutex> >::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp
similarity index 61%
rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp
index e258198e60bbf..e846061f5fbd0 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp
@@ -13,20 +13,7 @@
 // unique_lock(unique_lock const&) = delete;
 
 #include <mutex>
-#include <cassert>
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
-    M m;
-    std::unique_lock<M> lk0(m);
-    std::unique_lock<M> lk = lk0;
-    assert(lk.mutex() == &m);
-    assert(lk.owns_lock() == true);
-    assert(lk0.mutex() == nullptr);
-    assert(lk0.owns_lock() == false);
-    }
+#include "../types.h"
 
-  return 0;
-}
+static_assert(!std::is_copy_constructible<std::lock_guard<MyMutex> >::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
index 2034a26b1d913..6fc4f7f23ced3 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,16 +12,16 @@
 
 // unique_lock();
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
+#include "../types.h"
 
-int main(int, char**)
-{
-    std::unique_lock<std::mutex> ul;
-    assert(!ul.owns_lock());
-    assert(ul.mutex() == nullptr);
+int main(int, char**) {
+  std::unique_lock<MyMutex> ul;
+  assert(!ul.owns_lock());
+  assert(ul.mutex() == nullptr);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
index 0af918c1e20ee..9563fdebd3e06 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads, c++03
 
 // <mutex>
 
@@ -14,16 +12,16 @@
 
 // unique_lock& operator=(unique_lock&& u);
 
-#include <mutex>
 #include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
 
+#include "nasty_containers.h"
+#include "../types.h"
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
+int main(int, char**) {
+  {
+    typedef MyMutex M;
     M m0;
     M m1;
     std::unique_lock<M> lk0(m0);
@@ -33,8 +31,8 @@ int main(int, char**)
     assert(lk1.owns_lock() == true);
     assert(lk0.mutex() == nullptr);
     assert(lk0.owns_lock() == false);
-    }
-    {
+  }
+  {
     typedef nasty_mutex M;
     M m0;
     M m1;
@@ -45,7 +43,7 @@ int main(int, char**)
     assert(lk1.owns_lock() == true);
     assert(lk0.mutex() == nullptr);
     assert(lk0.owns_lock() == false);
-    }
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
index cce0eb5fb9057..08f6fc8410e25 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads, c++03
+// UNSUPPORTED: c++03
 
 // <mutex>
 
@@ -14,16 +14,16 @@
 
 // unique_lock(unique_lock&& u);
 
-#include <mutex>
 #include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
 
+#include "nasty_containers.h"
+#include "../types.h"
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
+int main(int, char**) {
+  {
+    typedef MyMutex M;
     M m;
     std::unique_lock<M> lk0(m);
     std::unique_lock<M> lk = std::move(lk0);
@@ -31,8 +31,8 @@ int main(int, char**)
     assert(lk.owns_lock() == true);
     assert(lk0.mutex() == nullptr);
     assert(lk0.owns_lock() == false);
-    }
-    {
+  }
+  {
     typedef nasty_mutex M;
     M m;
     std::unique_lock<M> lk0(m);
@@ -41,7 +41,7 @@ int main(int, char**)
     assert(lk.owns_lock() == true);
     assert(lk0.mutex() == nullptr);
     assert(lk0.owns_lock() == false);
-    }
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
index 4adbe26777d0b..28cc43853180e 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03
 
 // <mutex>
@@ -15,30 +14,30 @@
 
 // unique_lock(mutex_type& m, adopt_lock_t);
 
-#include <mutex>
 #include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
 
+#include "nasty_containers.h"
+#include "../types.h"
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
+int main(int, char**) {
+  {
+    typedef MyMutex M;
     M m;
     m.lock();
     std::unique_lock<M> lk(m, std::adopt_lock);
     assert(lk.mutex() == std::addressof(m));
     assert(lk.owns_lock() == true);
-    }
-    {
+  }
+  {
     typedef nasty_mutex M;
     M m;
     m.lock();
     std::unique_lock<M> lk(m, std::adopt_lock);
     assert(lk.mutex() == std::addressof(m));
     assert(lk.owns_lock() == true);
-    }
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
index 06ef2043edcef..96a9afbc9438c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03
 
 // <mutex>
@@ -15,28 +14,28 @@
 
 // unique_lock(mutex_type& m, defer_lock_t);
 
-#include <mutex>
 #include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
 
+#include "nasty_containers.h"
+#include "../types.h"
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-    typedef std::mutex M;
+int main(int, char**) {
+  {
+    typedef MyMutex M;
     M m;
     std::unique_lock<M> lk(m, std::defer_lock);
     assert(lk.mutex() == std::addressof(m));
     assert(lk.owns_lock() == false);
-    }
-    {
+  }
+  {
     typedef nasty_mutex M;
     M m;
     std::unique_lock<M> lk(m, std::defer_lock);
     assert(lk.mutex() == std::addressof(m));
     assert(lk.owns_lock() == false);
-    }
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
index 920baa71317cf..6767e11a1f8b4 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
@@ -51,7 +51,7 @@ void f()
     }
     catch (std::system_error& e)
     {
-        assert(e.code().value() == EDEADLK);
+      assert(e.code() == std::errc::resource_deadlock_would_occur);
     }
 #endif
     lk.unlock();
@@ -64,7 +64,7 @@ void f()
     }
     catch (std::system_error& e)
     {
-        assert(e.code().value() == EPERM);
+      assert(e.code() == std::errc::operation_not_permitted);
     }
 #endif
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
index 4cf5ec2ab5ccf..2ee5d3766eb18 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03
 // ALLOW_RETRIES: 2
 
@@ -21,53 +20,35 @@
 #include <system_error>
 
 #include "test_macros.h"
+#include "../types.h"
 
-bool try_lock_called = false;
+MyTimedMutex m;
 
-struct mutex
-{
-    bool try_lock()
-    {
-        try_lock_called = !try_lock_called;
-        return try_lock_called;
-    }
-    void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk(m, std::defer_lock);
-    assert(lk.try_lock() == true);
-    assert(try_lock_called == true);
-    assert(lk.owns_lock() == true);
+int main(int, char**) {
+  std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+  assert(lk.try_lock() == true);
+  assert(m.try_lock_called == true);
+  assert(lk.owns_lock() == true);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock();
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EDEADLK);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock();
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::resource_deadlock_would_occur);
+  }
 #endif
-    lk.unlock();
-    assert(lk.try_lock() == false);
-    assert(try_lock_called == false);
-    assert(lk.owns_lock() == false);
-    lk.release();
+  lk.unlock();
+  assert(lk.try_lock() == false);
+  assert(m.try_lock_called == false);
+  assert(lk.owns_lock() == false);
+  lk.release();
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock();
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EPERM);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock();
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::operation_not_permitted);
+  }
 #endif
 
   return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
index 8e7004e5eec85..603cc7b185620 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03
 
 // <mutex>
@@ -21,57 +20,36 @@
 #include <system_error>
 
 #include "test_macros.h"
+#include "../types.h"
 
-bool try_lock_for_called = false;
+MyTimedMutex m;
 
-typedef std::chrono::milliseconds ms;
-
-struct mutex
-{
-    template <class Rep, class Period>
-        bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time)
-    {
-        assert(rel_time == ms(5));
-        try_lock_for_called = !try_lock_for_called;
-        return try_lock_for_called;
-    }
-    void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk(m, std::defer_lock);
-    assert(lk.try_lock_for(ms(5)) == true);
-    assert(try_lock_for_called == true);
-    assert(lk.owns_lock() == true);
+int main(int, char**) {
+  using ms = std::chrono::milliseconds;
+  std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+  assert(lk.try_lock_for(ms(5)) == true);
+  assert(m.try_lock_for_called == true);
+  assert(lk.owns_lock() == true);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EDEADLK);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::resource_deadlock_would_occur);
+  }
 #endif
-    lk.unlock();
-    assert(lk.try_lock_for(ms(5)) == false);
-    assert(try_lock_for_called == false);
-    assert(lk.owns_lock() == false);
-    lk.release();
+  lk.unlock();
+  assert(lk.try_lock_for(ms(5)) == false);
+  assert(m.try_lock_for_called == false);
+  assert(lk.owns_lock() == false);
+  lk.release();
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EPERM);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::operation_not_permitted);
+  }
 #endif
 
   return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
index 077bc517399ab..46ab95197311c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03
 
 // <mutex>
@@ -17,61 +16,41 @@
 //   bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
 
 #include <cassert>
+#include <chrono>
 #include <mutex>
 #include <system_error>
 
 #include "test_macros.h"
+#include "../types.h"
 
-bool try_lock_until_called = false;
+MyTimedMutex m;
 
-struct mutex
-{
-    template <class Clock, class Duration>
-        bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time)
-    {
-        typedef std::chrono::milliseconds ms;
-        assert(Clock::now() - abs_time < ms(5));
-        try_lock_until_called = !try_lock_until_called;
-        return try_lock_until_called;
-    }
-    void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
-    typedef std::chrono::steady_clock Clock;
-    std::unique_lock<mutex> lk(m, std::defer_lock);
-    assert(lk.try_lock_until(Clock::now()) == true);
-    assert(try_lock_until_called == true);
-    assert(lk.owns_lock() == true);
+int main(int, char**) {
+  typedef std::chrono::system_clock Clock;
+  std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+  assert(lk.try_lock_until(Clock::now()) == true);
+  assert(m.try_lock_until_called == true);
+  assert(lk.owns_lock() == true);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EDEADLK);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::resource_deadlock_would_occur);
+  }
 #endif
-    lk.unlock();
-    assert(lk.try_lock_until(Clock::now()) == false);
-    assert(try_lock_until_called == false);
-    assert(lk.owns_lock() == false);
-    lk.release();
+  lk.unlock();
+  assert(lk.try_lock_until(Clock::now()) == false);
+  assert(m.try_lock_until_called == false);
+  assert(lk.owns_lock() == false);
+  lk.release();
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EPERM);
-    }
+  try {
+    TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::operation_not_permitted);
+  }
 #endif
 
   return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
index 30c795150dace..97808f60f2e55 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -19,45 +17,30 @@
 #include <system_error>
 
 #include "test_macros.h"
+#include "../types.h"
 
-bool unlock_called = false;
-
-struct mutex
-{
-    void lock() {}
-    void unlock() {unlock_called = true;}
-};
-
-mutex m;
+MyMutex m;
 
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk(m);
-    lk.unlock();
-    assert(unlock_called == true);
-    assert(lk.owns_lock() == false);
+int main(int, char**) {
+  std::unique_lock<MyMutex> lk(m);
+  lk.unlock();
+  assert(lk.owns_lock() == false);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        lk.unlock();
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EPERM);
-    }
+  try {
+    lk.unlock();
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::operation_not_permitted);
+  }
 #endif
-    lk.release();
+  lk.release();
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try
-    {
-        lk.unlock();
-        assert(false);
-    }
-    catch (std::system_error& e)
-    {
-        assert(e.code().value() == EPERM);
-    }
+  try {
+    lk.unlock();
+    assert(false);
+  } catch (std::system_error& e) {
+    assert(e.code() == std::errc::operation_not_permitted);
+  }
 #endif
 
   return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
index fc12d3baea202..361c85e015059 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,28 +12,22 @@
 
 // void swap(unique_lock& u);
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
-
-struct mutex
-{
-    void lock() {}
-    void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk1(m);
-    std::unique_lock<mutex> lk2;
-    lk1.swap(lk2);
-    assert(lk1.mutex() == nullptr);
-    assert(lk1.owns_lock() == false);
-    assert(lk2.mutex() == &m);
-    assert(lk2.owns_lock() == true);
+#include "../types.h"
+
+MyMutex m;
+
+int main(int, char**) {
+  std::unique_lock<MyMutex> lk1(m);
+  std::unique_lock<MyMutex> lk2;
+  lk1.swap(lk2);
+  assert(lk1.mutex() == nullptr);
+  assert(lk1.owns_lock() == false);
+  assert(lk2.mutex() == &m);
+  assert(lk2.owns_lock() == true);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
index 03d268c4b9306..5133032f6ae39 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -15,28 +13,22 @@
 // template <class Mutex>
 //   void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y);
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
-
-struct mutex
-{
-    void lock() {}
-    void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk1(m);
-    std::unique_lock<mutex> lk2;
-    swap(lk1, lk2);
-    assert(lk1.mutex() == nullptr);
-    assert(lk1.owns_lock() == false);
-    assert(lk2.mutex() == &m);
-    assert(lk2.owns_lock() == true);
+#include "../types.h"
+
+MyMutex m;
+
+int main(int, char**) {
+  std::unique_lock<MyMutex> lk1(m);
+  std::unique_lock<MyMutex> lk2;
+  swap(lk1, lk2);
+  assert(lk1.mutex() == nullptr);
+  assert(lk1.owns_lock() == false);
+  assert(lk2.mutex() == &m);
+  assert(lk2.owns_lock() == true);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
index 4f2d59c3d333d..a726c8ccc060a 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,36 +12,28 @@
 
 // mutex_type* release() noexcept;
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
-
-struct mutex
-{
-    static int lock_count;
-    static int unlock_count;
-    void lock() {++lock_count;}
-    void unlock() {++unlock_count;}
-};
-
-int mutex::lock_count = 0;
-int mutex::unlock_count = 0;
-
-mutex m;
-
-int main(int, char**)
-{
-    std::unique_lock<mutex> lk(m);
-    assert(lk.mutex() == &m);
-    assert(lk.owns_lock() == true);
-    assert(mutex::lock_count == 1);
-    assert(mutex::unlock_count == 0);
-    assert(lk.release() == &m);
-    assert(lk.mutex() == nullptr);
-    assert(lk.owns_lock() == false);
-    assert(mutex::lock_count == 1);
-    assert(mutex::unlock_count == 0);
+#include "../types.h"
+
+int MyCountingMutex::lock_count   = 0;
+int MyCountingMutex::unlock_count = 0;
+
+MyCountingMutex m;
+
+int main(int, char**) {
+  std::unique_lock<MyCountingMutex> lk(m);
+  assert(lk.mutex() == &m);
+  assert(lk.owns_lock() == true);
+  assert(MyCountingMutex::lock_count == 1);
+  assert(MyCountingMutex::unlock_count == 0);
+  assert(lk.release() == &m);
+  assert(lk.mutex() == nullptr);
+  assert(lk.owns_lock() == false);
+  assert(MyCountingMutex::lock_count == 1);
+  assert(MyCountingMutex::unlock_count == 0);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
index dd2f5239645f9..72346e8c67e25 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,21 +12,21 @@
 
 // mutex_type *mutex() const;
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
+#include "../types.h"
 
-std::mutex m;
+MyMutex m;
 
-int main(int, char**)
-{
-    std::unique_lock<std::mutex> lk0;
-    assert(lk0.mutex() == nullptr);
-    std::unique_lock<std::mutex> lk1(m);
-    assert(lk1.mutex() == &m);
-    lk1.unlock();
-    assert(lk1.mutex() == &m);
+int main(int, char**) {
+  std::unique_lock<MyMutex> lk0;
+  assert(lk0.mutex() == nullptr);
+  std::unique_lock<MyMutex> lk1(m);
+  assert(lk1.mutex() == &m);
+  lk1.unlock();
+  assert(lk1.mutex() == &m);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
index ea05eb7d023a9..3759302a483eb 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,26 +12,26 @@
 
 // explicit operator bool() const noexcept;
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 #include <type_traits>
 
 #include "test_macros.h"
+#include "../types.h"
 
-std::mutex m;
+MyMutex m;
 
-int main(int, char**)
-{
-    static_assert(std::is_constructible<bool, std::unique_lock<std::mutex> >::value, "");
-    static_assert(!std::is_convertible<std::unique_lock<std::mutex>, bool>::value, "");
+int main(int, char**) {
+  static_assert(std::is_constructible<bool, std::unique_lock<MyMutex> >::value, "");
+  static_assert(!std::is_convertible<std::unique_lock<MyMutex>, bool>::value, "");
 
-    std::unique_lock<std::mutex> lk0;
-    assert(static_cast<bool>(lk0) == false);
-    std::unique_lock<std::mutex> lk1(m);
-    assert(static_cast<bool>(lk1) == true);
-    lk1.unlock();
-    assert(static_cast<bool>(lk1) == false);
-    ASSERT_NOEXCEPT(static_cast<bool>(lk0));
+  std::unique_lock<MyMutex> lk0;
+  assert(static_cast<bool>(lk0) == false);
+  std::unique_lock<MyMutex> lk1(m);
+  assert(static_cast<bool>(lk1) == true);
+  lk1.unlock();
+  assert(static_cast<bool>(lk1) == false);
+  ASSERT_NOEXCEPT(static_cast<bool>(lk0));
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
index 9b192fbcd621e..163942786323a 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -14,21 +12,21 @@
 
 // bool owns_lock() const;
 
-#include <mutex>
 #include <cassert>
+#include <mutex>
 
 #include "test_macros.h"
+#include "../types.h"
 
-std::mutex m;
+MyMutex m;
 
-int main(int, char**)
-{
-    std::unique_lock<std::mutex> lk0;
-    assert(lk0.owns_lock() == false);
-    std::unique_lock<std::mutex> lk1(m);
-    assert(lk1.owns_lock() == true);
-    lk1.unlock();
-    assert(lk1.owns_lock() == false);
+int main(int, char**) {
+  std::unique_lock<MyMutex> lk0;
+  assert(lk0.owns_lock() == false);
+  std::unique_lock<MyMutex> lk1(m);
+  assert(lk1.owns_lock() == true);
+  lk1.unlock();
+  assert(lk1.owns_lock() == false);
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp
similarity index 74%
rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp
index d8497888f59bb..312863ae8e743 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
 
 // <mutex>
 
@@ -22,11 +20,6 @@
 #include <type_traits>
 
 #include "test_macros.h"
+#include "types.h"
 
-int main(int, char**)
-{
-    static_assert((std::is_same<std::unique_lock<std::mutex>::mutex_type,
-                   std::mutex>::value), "");
-
-  return 0;
-}
+static_assert((std::is_same<std::unique_lock<MyMutex>::mutex_type, MyMutex>::value), "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h
new file mode 100644
index 0000000000000..15a1a531487f5
--- /dev/null
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h
@@ -0,0 +1,88 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
+#define TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
+
+#include <cassert>
+#include <chrono>
+
+struct MyMutex {
+  bool locked = false;
+
+  MyMutex() = default;
+  ~MyMutex() { assert(!locked); }
+
+  void lock() {
+    assert(!locked);
+    locked = true;
+  }
+
+  void unlock() {
+    assert(locked);
+    locked = false;
+  }
+
+  bool try_lock() {
+    if (locked)
+      return false;
+    lock();
+    return true;
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
+    using ms = std::chrono::milliseconds;
+    assert(rel_time == ms(5));
+    if (locked)
+      return false;
+    lock();
+    return true;
+  }
+
+  MyMutex(MyMutex const&)            = delete;
+  MyMutex& operator=(MyMutex const&) = delete;
+};
+
+struct MyTimedMutex {
+  using ms = std::chrono::milliseconds;
+
+  bool try_lock_called       = false;
+  bool try_lock_for_called   = false;
+  bool try_lock_until_called = false;
+
+  bool try_lock() {
+    try_lock_called = !try_lock_called;
+    return try_lock_called;
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
+    assert(rel_time == ms(5));
+    try_lock_for_called = !try_lock_for_called;
+    return try_lock_for_called;
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
+    assert(Clock::now() - abs_time < ms(5));
+    try_lock_until_called = !try_lock_until_called;
+    return try_lock_until_called;
+  }
+
+  void unlock() {}
+};
+
+struct MyCountingMutex {
+  static int lock_count;
+  static int unlock_count;
+  void lock() { ++lock_count; }
+  void unlock() { ++unlock_count; }
+};
+
+#endif // TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp
similarity index 75%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp
index 34164aaa0413c..0d90bff928369 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
@@ -16,12 +16,6 @@
 // shared_mutex& operator=(const shared_mutex&) = delete;
 
 #include <shared_mutex>
+#include <type_traits>
 
-int main(int, char**)
-{
-    std::shared_mutex m0;
-    std::shared_mutex m1;
-    m1 = m0; // expected-error {{overload resolution selected deleted operator '='}}
-
-  return 0;
-}
+static_assert(!std::is_copy_assignable<std::shared_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp
similarity index 76%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp
index 9b43198d0a37b..f9e1935f15b9e 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
@@ -16,11 +16,6 @@
 // shared_mutex(const shared_mutex&) = delete;
 
 #include <shared_mutex>
+#include <type_traits>
 
-int main(int, char**)
-{
-    std::shared_mutex m0;
-    std::shared_mutex m1(m0); // expected-error {{call to deleted constructor of 'std::shared_mutex'}}
-
-  return 0;
-}
+static_assert(!std::is_copy_constructible<std::shared_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp
similarity index 87%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp
index 5504645bb31f9..c941f3a5ea277 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
@@ -19,10 +19,9 @@
 
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    std::shared_mutex m;
-    (void)m;
+int main(int, char**) {
+  std::shared_mutex m;
+  (void)m;
 
-    return 0;
+  return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
index 122f2b0cddb79..724fb0728a979 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
@@ -5,63 +5,105 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_mutex;
 
 // void lock();
 
+#include <shared_mutex>
+#include <atomic>
 #include <cassert>
 #include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
 #include <thread>
+#include <vector>
 
 #include "make_test_thread.h"
 #include "test_macros.h"
 
-std::shared_mutex m;
+int main(int, char**) {
+  // Exclusive-lock a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_mutex m;
+    m.lock();
+    m.unlock();
+  }
 
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+  // Exclusive-lock a mutex that is already locked exclusively. This should block until it is unlocked.
+  {
+    std::atomic<bool> ready(false);
+    std::shared_mutex m;
+    m.lock();
+    std::atomic<bool> is_locked_from_main(true);
 
-ms WaitTime = ms(250);
+    std::thread t = support::make_test_thread([&] {
+      ready = true;
+      m.lock();
+      assert(!is_locked_from_main);
+      m.unlock();
+    });
 
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+    while (!ready)
+      /* spin */;
 
-void f()
-{
-    time_point t0 = Clock::now();
-    m.lock();
-    time_point t1 = Clock::now();
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for the thread
+    // to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
     m.unlock();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
 
-int main(int, char**)
-{
-    m.lock();
-    std::thread t = support::make_test_thread(f);
-    std::this_thread::sleep_for(WaitTime);
-    m.unlock();
     t.join();
+  }
+
+  // Exclusive-lock a mutex that is already share-locked. This should block until it is unlocked.
+  {
+    std::atomic<bool> ready(false);
+    std::shared_mutex m;
+    m.lock_shared();
+    std::atomic<bool> is_locked_from_main(true);
+
+    std::thread t = support::make_test_thread([&] {
+      ready = true;
+      m.lock();
+      assert(!is_locked_from_main);
+      m.unlock();
+    });
+
+    while (!ready)
+      /* spin */;
+
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for
+    // the thread to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
+    m.unlock_shared();
+
+    t.join();
+  }
+
+  // Make sure that at most one thread can acquire the mutex concurrently.
+  {
+    std::atomic<int> counter = 0;
+    std::shared_mutex mutex;
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 10; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        mutex.lock();
+        counter++;
+        assert(counter == 1);
+        counter--;
+        mutex.unlock();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
index 9df0d57c9621c..e6640f70631ab 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
@@ -5,87 +5,139 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_mutex;
 
 // void lock_shared();
 
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
 #include <shared_mutex>
+#include <algorithm>
+#include <atomic>
+#include <cassert>
 #include <thread>
 #include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
-
-void f()
-{
-    time_point t0 = Clock::now();
-    m.lock_shared();
-    time_point t1 = Clock::now();
-    m.unlock_shared();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
 
-void g()
-{
-    time_point t0 = Clock::now();
-    m.lock_shared();
-    time_point t1 = Clock::now();
-    m.unlock_shared();
-    ns d = t1 - t0;
-    assert(d < Tolerance);  // within tolerance
-}
+int main(int, char**) {
+  // Lock-shared a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        m.lock_shared();
+        m.unlock_shared();
+      }));
+    }
 
+    for (auto& t : threads)
+      t.join();
+  }
 
-int main(int, char**)
-{
+  // Lock-shared a mutex that is already exclusively locked. This should block until it is unlocked.
+  {
+    std::atomic<int> ready(0);
+    std::shared_mutex m;
     m.lock();
-    std::vector<std::thread> v;
-    for (int i = 0; i < 5; ++i)
-        v.push_back(support::make_test_thread(f));
-    std::this_thread::sleep_for(WaitTime);
+    std::atomic<bool> is_locked_from_main(true);
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* wait until all threads have been created */;
+
+        m.lock_shared();
+        assert(!is_locked_from_main);
+        m.unlock_shared();
+      }));
+    }
+
+    while (ready < 5)
+      /* wait until all threads have been created */;
+
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for
+    // the thread to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
     m.unlock();
-    for (auto& t : v)
-        t.join();
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Lock-shared a mutex that is already lock-shared. This should succeed.
+  {
+    std::atomic<int> ready(0);
+    std::shared_mutex m;
     m.lock_shared();
-    for (auto& t : v)
-        t = support::make_test_thread(g);
-    std::thread q = support::make_test_thread(f);
-    std::this_thread::sleep_for(WaitTime);
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* wait until all threads have been created */;
+
+        m.lock_shared();
+        m.unlock_shared();
+      }));
+    }
+
+    while (ready < 5)
+      /* wait until all threads have been created */;
+
     m.unlock_shared();
-    for (auto& t : v)
-        t.join();
-    q.join();
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Create several threads that all acquire-shared the same mutex and make sure that each
+  // thread successfully acquires-shared the mutex.
+  //
+  // We record how many other threads were holding the mutex when it was acquired, which allows
+  // us to know whether the test was somewhat effective at causing multiple threads to lock at
+  // the same time.
+  {
+    std::shared_mutex mutex;
+    std::vector<std::thread> threads;
+    constexpr int n_threads           = 5;
+    std::atomic<int> holders          = 0;
+    int concurrent_holders[n_threads] = {};
+    std::atomic<bool> ready           = false;
+
+    for (int i = 0; i != n_threads; ++i) {
+      threads.push_back(support::make_test_thread([&, i] {
+        while (!ready) {
+          // spin
+        }
+
+        mutex.lock_shared();
+        ++holders;
+        concurrent_holders[i] = holders;
+
+        mutex.unlock_shared();
+        --holders;
+      }));
+    }
+
+    ready = true; // let the threads actually start shared-acquiring the mutex
+    for (auto& t : threads)
+      t.join();
+
+    // We can't guarantee that we'll ever have more than 1 concurrent holder so that's what
+    // we assert, however in principle we should often trigger more than 1 concurrent holder.
+    int max_concurrent_holders = *std::max_element(std::begin(concurrent_holders), std::end(concurrent_holders));
+    assert(max_concurrent_holders >= 1);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
index f39b1ee29f7db..11d396df3fe7d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
@@ -5,56 +5,60 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_mutex;
 
 // bool try_lock();
 
+#include <shared_mutex>
+#include <atomic>
 #include <cassert>
 #include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
 #include <thread>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-void f()
-{
-    time_point t0 = Clock::now();
-    assert(!m.try_lock());
-    assert(!m.try_lock());
-    assert(!m.try_lock());
-    while(!m.try_lock())
-        ;
-    time_point t1 = Clock::now();
+
+int main(int, char**) {
+  // Try to exclusive-lock a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_mutex m;
+    bool succeeded = m.try_lock();
+    assert(succeeded);
     m.unlock();
-    ns d = t1 - t0 - ms(250);
-    assert(d < ms(200));  // within 200ms
-}
+  }
 
-int main(int, char**)
-{
+  // Try to exclusive-lock a mutex that is already locked exclusively. This should fail.
+  {
+    std::shared_mutex m;
     m.lock();
-    std::thread t = support::make_test_thread(f);
-    std::this_thread::sleep_for(ms(250));
+
+    std::thread t = support::make_test_thread([&] {
+      bool succeeded = m.try_lock();
+      assert(!succeeded);
+    });
+    t.join();
+
     m.unlock();
+  }
+
+  // Try to exclusive-lock a mutex that is already share-locked. This should fail.
+  {
+    std::shared_mutex m;
+    m.lock_shared();
+
+    std::thread t = support::make_test_thread([&] {
+      bool succeeded = m.try_lock();
+      assert(!succeeded);
+    });
     t.join();
 
+    m.unlock_shared();
+  }
+
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
index c091b06f47492..61069bebb72bd 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
@@ -5,71 +5,76 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_mutex;
 
 // bool try_lock_shared();
 
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
 #include <shared_mutex>
+#include <cassert>
 #include <thread>
 #include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
 
-std::shared_mutex m;
+int main(int, char**) {
+  // Try to lock-shared a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
 
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+    for (auto& t : threads)
+      t.join();
+  }
 
+  // Try to lock-shared a mutex that is already exclusively locked. This should fail.
+  {
+    std::shared_mutex m;
+    m.lock();
 
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(200);
-#else
-ms Tolerance = ms(200 * 5);
-#endif
-
-void f()
-{
-    time_point t0 = Clock::now();
-    assert(!m.try_lock_shared());
-    assert(!m.try_lock_shared());
-    assert(!m.try_lock_shared());
-    while(!m.try_lock_shared())
-        std::this_thread::yield();
-    time_point t1 = Clock::now();
-    m.unlock_shared();
-    ns d = t1 - t0 - ms(250);
-    assert(d < Tolerance);  // within tolerance
-}
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(!succeeded);
+      }));
+    }
 
+    for (auto& t : threads)
+      t.join();
 
-int main(int, char**)
-{
-    m.lock();
-    std::vector<std::thread> v;
-    for (int i = 0; i < 5; ++i)
-        v.push_back(support::make_test_thread(f));
-    std::this_thread::sleep_for(ms(250));
     m.unlock();
-    for (auto& t : v)
-        t.join();
+  }
+
+  // Try to lock-shared a mutex that is already lock-shared. This should succeed.
+  {
+    std::shared_mutex m;
+    m.lock_shared();
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
+    m.unlock_shared();
+
+    for (auto& t : threads)
+      t.join();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp
similarity index 81%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp
index c2cd8931b16cd..9cbb0b1456d2d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
 // <shared_mutex>
@@ -15,12 +16,6 @@
 // shared_timed_mutex& operator=(const shared_timed_mutex&) = delete;
 
 #include <shared_mutex>
+#include <type_traits>
 
-int main(int, char**)
-{
-    std::shared_timed_mutex m0;
-    std::shared_timed_mutex m1;
-    m1 = m0;
-
-  return 0;
-}
+static_assert(!std::is_copy_assignable<std::shared_timed_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp
similarity index 80%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp
index 9b0a66160e070..12b01a589b043 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
 // <shared_mutex>
@@ -15,11 +16,6 @@
 // shared_timed_mutex(const shared_timed_mutex&) = delete;
 
 #include <shared_mutex>
+#include <type_traits>
 
-int main(int, char**)
-{
-    std::shared_timed_mutex m0;
-    std::shared_timed_mutex m1(m0);
-
-  return 0;
-}
+static_assert(!std::is_copy_constructible<std::shared_timed_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp
similarity index 99%
rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp
index 7a8d096994fff..eadc59e13bebf 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
index acabbabb3896b..f78b13dc4bfa2 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
@@ -5,62 +5,104 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 3
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
 
 // void lock();
 
+#include <shared_mutex>
 #include <atomic>
 #include <cassert>
 #include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
 #include <thread>
+#include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-std::atomic<bool> ready(false);
-time_point start;
-
-ms WaitTime = ms(250);
-
-void f()
-{
-  ready.store(true);
-  m.lock();
-  time_point t0 = start;
-  time_point t1 = Clock::now();
-  m.unlock();
-  assert(t0.time_since_epoch() > ms(0));
-  assert(t1 - t0 >= WaitTime);
-}
 
-int main(int, char**)
-{
-  m.lock();
-  std::thread t = support::make_test_thread(f);
-  while (!ready)
-    std::this_thread::yield();
-  start = Clock::now();
-  std::this_thread::sleep_for(WaitTime);
-  m.unlock();
-  t.join();
+int main(int, char**) {
+  // Exclusive-lock a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_timed_mutex m;
+    m.lock();
+    m.unlock();
+  }
+
+  // Exclusive-lock a mutex that is already locked exclusively. This should block until it is unlocked.
+  {
+    std::atomic<bool> ready(false);
+    std::shared_timed_mutex m;
+    m.lock();
+    std::atomic<bool> is_locked_from_main(true);
+
+    std::thread t = support::make_test_thread([&] {
+      ready = true;
+      m.lock();
+      assert(!is_locked_from_main);
+      m.unlock();
+    });
+
+    while (!ready)
+      /* spin */;
+
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for the thread
+    // to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
+    m.unlock();
+
+    t.join();
+  }
+
+  // Exclusive-lock a mutex that is already share-locked. This should block until it is unlocked.
+  {
+    std::atomic<bool> ready(false);
+    std::shared_timed_mutex m;
+    m.lock_shared();
+    std::atomic<bool> is_locked_from_main(true);
+
+    std::thread t = support::make_test_thread([&] {
+      ready = true;
+      m.lock();
+      assert(!is_locked_from_main);
+      m.unlock();
+    });
+
+    while (!ready)
+      /* spin */;
+
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for
+    // the thread to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
+    m.unlock_shared();
+
+    t.join();
+  }
+
+  // Make sure that at most one thread can acquire the mutex concurrently.
+  {
+    std::atomic<int> counter(0);
+    std::shared_timed_mutex mutex;
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 10; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        mutex.lock();
+        counter++;
+        assert(counter == 1);
+        counter--;
+        mutex.unlock();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
index 36f5dbaa2040f..d9a3db150a25c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
@@ -5,100 +5,138 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 3
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
 
 // void lock_shared();
 
-
+#include <shared_mutex>
+#include <algorithm>
 #include <atomic>
 #include <cassert>
-#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
 #include <thread>
 #include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-std::atomic<unsigned> countDown;
-time_point readerStart; // Protected by the above mutex 'm'
-time_point writerStart; // Protected by the above mutex 'm'
-
-ms WaitTime = ms(250);
-
-void readerMustWait() {
-  --countDown;
-  m.lock_shared();
-  time_point t1 = Clock::now();
-  time_point t0 = readerStart;
-  m.unlock_shared();
-  assert(t0.time_since_epoch() > ms(0));
-  assert(t1 - t0 >= WaitTime);
-}
-
-void reader() {
-  --countDown;
-  m.lock_shared();
-  m.unlock_shared();
-}
-
-void writerMustWait() {
-  --countDown;
-  m.lock();
-  time_point t1 = Clock::now();
-  time_point t0 = writerStart;
-  m.unlock();
-  assert(t0.time_since_epoch() > ms(0));
-  assert(t1 - t0 >= WaitTime);
-}
 
-int main(int, char**)
-{
-  int threads = 5;
-
-  countDown.store(threads);
-  m.lock();
-  std::vector<std::thread> v;
-  for (int i = 0; i < threads; ++i)
-    v.push_back(support::make_test_thread(readerMustWait));
-  while (countDown > 0)
-    std::this_thread::yield();
-  readerStart = Clock::now();
-  std::this_thread::sleep_for(WaitTime);
-  m.unlock();
-  for (auto& t : v)
-    t.join();
-
-  countDown.store(threads + 1);
-  m.lock_shared();
-  for (auto& t : v)
-    t = support::make_test_thread(reader);
-  std::thread q = support::make_test_thread(writerMustWait);
-  while (countDown > 0)
-    std::this_thread::yield();
-  writerStart = Clock::now();
-  std::this_thread::sleep_for(WaitTime);
-  m.unlock_shared();
-  for (auto& t : v)
-    t.join();
-  q.join();
+int main(int, char**) {
+  // Lock-shared a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_timed_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        m.lock_shared();
+        m.unlock_shared();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Lock-shared a mutex that is already exclusively locked. This should block until it is unlocked.
+  {
+    std::atomic<int> ready(0);
+    std::shared_timed_mutex m;
+    m.lock();
+    std::atomic<bool> is_locked_from_main(true);
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* wait until all threads have been created */;
+
+        m.lock_shared();
+        assert(!is_locked_from_main);
+        m.unlock_shared();
+      }));
+    }
+
+    while (ready < 5)
+      /* wait until all threads have been created */;
+
+    // We would rather signal this after we unlock, but that would create a race condition.
+    // We instead signal it before we unlock, which means that it's technically possible for
+    // the thread to take the lock while we're still holding it and for the test to still pass.
+    is_locked_from_main = false;
+    m.unlock();
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Lock-shared a mutex that is already lock-shared. This should succeed.
+  {
+    std::atomic<int> ready(0);
+    std::shared_timed_mutex m;
+    m.lock_shared();
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* wait until all threads have been created */;
+
+        m.lock_shared();
+        m.unlock_shared();
+      }));
+    }
+
+    while (ready < 5)
+      /* wait until all threads have been created */;
+
+    m.unlock_shared();
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Create several threads that all acquire-shared the same mutex and make sure that each
+  // thread successfully acquires-shared the mutex.
+  //
+  // We record how many other threads were holding the mutex when it was acquired, which allows
+  // us to know whether the test was somewhat effective at causing multiple threads to lock at
+  // the same time.
+  {
+    std::shared_timed_mutex mutex;
+    std::vector<std::thread> threads;
+    constexpr int n_threads = 5;
+    std::atomic<int> holders(0);
+    int concurrent_holders[n_threads] = {};
+    std::atomic<bool> ready(false);
+
+    for (int i = 0; i != n_threads; ++i) {
+      threads.push_back(support::make_test_thread([&, i] {
+        while (!ready)
+          /* spin */;
+
+        mutex.lock_shared();
+        ++holders;
+        concurrent_holders[i] = holders;
+
+        mutex.unlock_shared();
+        --holders;
+      }));
+    }
+
+    ready = true; // let the threads actually start shared-acquiring the mutex
+    for (auto& t : threads)
+      t.join();
+
+    // We can't guarantee that we'll ever have more than 1 concurrent holder so that's what
+    // we assert, however in principle we should often trigger more than 1 concurrent holder.
+    int max_concurrent_holders = *std::max_element(std::begin(concurrent_holders), std::end(concurrent_holders));
+    assert(max_concurrent_holders >= 1);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
index cc7091fed415d..9ed8b5bb35041 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
@@ -5,56 +5,60 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
 
 // bool try_lock();
 
+#include <shared_mutex>
+#include <atomic>
 #include <cassert>
 #include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
 #include <thread>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-void f()
-{
-    time_point t0 = Clock::now();
-    assert(!m.try_lock());
-    assert(!m.try_lock());
-    assert(!m.try_lock());
-    while(!m.try_lock())
-        ;
-    time_point t1 = Clock::now();
+
+int main(int, char**) {
+  // Try to exclusive-lock a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_timed_mutex m;
+    bool succeeded = m.try_lock();
+    assert(succeeded);
     m.unlock();
-    ns d = t1 - t0 - ms(250);
-    assert(d < ms(200));  // within 200ms
-}
+  }
 
-int main(int, char**)
-{
+  // Try to exclusive-lock a mutex that is already locked exclusively. This should fail.
+  {
+    std::shared_timed_mutex m;
     m.lock();
-    std::thread t = support::make_test_thread(f);
-    std::this_thread::sleep_for(ms(250));
+
+    std::thread t = support::make_test_thread([&] {
+      bool succeeded = m.try_lock();
+      assert(!succeeded);
+    });
+    t.join();
+
     m.unlock();
+  }
+
+  // Try to exclusive-lock a mutex that is already share-locked. This should fail.
+  {
+    std::shared_timed_mutex m;
+    m.lock_shared();
+
+    std::thread t = support::make_test_thread([&] {
+      bool succeeded = m.try_lock();
+      assert(!succeeded);
+    });
     t.join();
 
+    m.unlock_shared();
+  }
+
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
index 30fc3c510a237..0ae9a488dd224 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
@@ -5,10 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
 
 // <shared_mutex>
 
@@ -18,69 +17,89 @@
 //     bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
 
 #include <shared_mutex>
-#include <thread>
-#include <cstdlib>
+#include <atomic>
 #include <cassert>
 #include <chrono>
+#include <thread>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
 
-std::shared_timed_mutex m;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+  std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+  f();
+  std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+  return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+}
 
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+int main(int, char**) {
+  // Try to lock a mutex that is not locked yet. This should succeed immediately.
+  {
+    std::shared_timed_mutex m;
+    bool succeeded = m.try_lock_for(std::chrono::milliseconds(1));
+    assert(succeeded);
+    m.unlock();
+  }
 
+  // Try to lock an already-locked mutex for a long enough amount of time and succeed.
+  // This is technically flaky, but we use such long durations that it should pass even
+  // in slow or contended environments.
+  {
+    std::chrono::milliseconds const wait_time(500);
+    std::chrono::milliseconds const tolerance = wait_time * 3;
+    std::atomic<bool> ready(false);
 
-ms WaitTime = ms(250);
+    std::shared_timed_mutex m;
+    m.lock();
 
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+    std::thread t = support::make_test_thread([&] {
+      auto elapsed = measure([&] {
+        ready          = true;
+        bool succeeded = m.try_lock_for(wait_time);
+        assert(succeeded);
+        m.unlock();
+      });
 
-void f1()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_for(WaitTime + Tolerance) == true);
-    time_point t1 = Clock::now();
-    m.unlock();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
+      // Ensure we didn't wait significantly longer than our timeout. This is technically
+      // flaky and non-conforming because an implementation is free to block for arbitrarily
+      // long, but any decent quality implementation should pass this test.
+      assert(elapsed - wait_time < tolerance);
+    });
 
-void f2()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_for(WaitTime) == false);
-    time_point t1 = Clock::now();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
+    // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise
+    // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+    // There is still technically a race condition here.
+    while (!ready)
+      /* spin */;
+    std::this_thread::sleep_for(wait_time / 5);
 
-int main(int, char**)
-{
-    {
-        m.lock();
-        std::thread t = support::make_test_thread(f1);
-        std::this_thread::sleep_for(WaitTime);
-        m.unlock();
-        t.join();
-    }
-    {
-        m.lock();
-        std::thread t = support::make_test_thread(f2);
-        std::this_thread::sleep_for(WaitTime + Tolerance);
-        m.unlock();
-        t.join();
-    }
+    m.unlock(); // this should allow the thread to lock 'm'
+    t.join();
+  }
+
+  // Try to lock an already-locked mutex for a short amount of time and fail.
+  // Again, this is technically flaky but we use such long durations that it should work.
+  {
+    std::chrono::milliseconds const wait_time(10);
+    std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+    std::shared_timed_mutex m;
+    m.lock();
+
+    std::thread t = support::make_test_thread([&] {
+      auto elapsed = measure([&] {
+        bool succeeded = m.try_lock_for(wait_time);
+        assert(!succeeded);
+      });
+
+      // Ensure we failed within some bounded time.
+      assert(elapsed - wait_time < tolerance);
+    });
+
+    t.join();
+
+    m.unlock();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
index 8523df08f8129..6430a32334227 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
@@ -5,70 +5,76 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
 
 // bool try_lock_shared();
 
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
 #include <shared_mutex>
+#include <cassert>
 #include <thread>
 #include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
 
-std::shared_timed_mutex m;
+int main(int, char**) {
+  // Try to lock-shared a mutex that is not locked yet. This should succeed.
+  {
+    std::shared_timed_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
 
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+  // Try to lock-shared a mutex that is already exclusively locked. This should fail.
+  {
+    std::shared_timed_mutex m;
+    m.lock();
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(!succeeded);
+      }));
+    }
 
+    for (auto& t : threads)
+      t.join();
 
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(200);
-#else
-ms Tolerance = ms(200 * 5);
-#endif
+    m.unlock();
+  }
 
-void f()
-{
-    time_point t0 = Clock::now();
-    assert(!m.try_lock_shared());
-    assert(!m.try_lock_shared());
-    assert(!m.try_lock_shared());
-    while(!m.try_lock_shared())
-        std::this_thread::yield();
-    time_point t1 = Clock::now();
+  // Try to lock-shared a mutex that is already lock-shared. This should succeed.
+  {
+    std::shared_timed_mutex m;
+    m.lock_shared();
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared();
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
     m.unlock_shared();
-    ns d = t1 - t0 - ms(250);
-    assert(d < Tolerance);  // within tolerance
-}
 
-int main(int, char**)
-{
-    m.lock();
-    std::vector<std::thread> v;
-    for (int i = 0; i < 5; ++i)
-        v.push_back(support::make_test_thread(f));
-    std::this_thread::sleep_for(ms(250));
-    m.unlock();
-    for (auto& t : v)
-        t.join();
+    for (auto& t : threads)
+      t.join();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
index c7d02a3a72001..23a88ba475bd3 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
@@ -5,12 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 3
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
@@ -19,75 +17,110 @@
 //     bool try_lock_shared_for(const chrono::duration<Rep, Period>& rel_time);
 
 #include <shared_mutex>
-#include <thread>
-#include <vector>
-#include <cstdlib>
+#include <atomic>
 #include <cassert>
 #include <chrono>
+#include <thread>
+#include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
-
-void f1()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_shared_for(WaitTime + Tolerance) == true);
-    time_point t1 = Clock::now();
-    m.unlock_shared();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within 50ms
-}
 
-void f2()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_shared_for(WaitTime) == false);
-    time_point t1 = Clock::now();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within 50ms
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+  std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+  f();
+  std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+  return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 }
 
-int main(int, char**)
-{
-    {
-        m.lock();
-        std::vector<std::thread> v;
-        for (int i = 0; i < 5; ++i)
-            v.push_back(support::make_test_thread(f1));
-        std::this_thread::sleep_for(WaitTime);
-        m.unlock();
-        for (auto& t : v)
-            t.join();
+int main(int, char**) {
+  // Try to lock-shared a mutex that is not locked yet. This should succeed immediately.
+  {
+    std::shared_timed_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared_for(std::chrono::milliseconds(1));
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Try to lock-shared an already-locked mutex for a long enough amount of time and succeed.
+  // This is technically flaky, but we use such long durations that it should pass even
+  // in slow or contended environments.
+  {
+    std::chrono::milliseconds const wait_time(500);
+    std::chrono::milliseconds const tolerance = wait_time * 3;
+    std::atomic<int> ready(0);
+
+    std::shared_timed_mutex m;
+    m.lock();
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* spin until all threads are created */;
+
+        auto elapsed = measure([&] {
+          bool succeeded = m.try_lock_shared_for(wait_time);
+          assert(succeeded);
+          m.unlock_shared();
+        });
+
+        // Ensure we didn't wait significantly longer than our timeout. This is technically
+        // flaky and non-conforming because an implementation is free to block for arbitrarily
+        // long, but any decent quality implementation should pass this test.
+        assert(elapsed - wait_time < tolerance);
+      }));
     }
-    {
-        m.lock();
-        std::vector<std::thread> v;
-        for (int i = 0; i < 5; ++i)
-            v.push_back(support::make_test_thread(f2));
-        std::this_thread::sleep_for(WaitTime + Tolerance);
-        m.unlock();
-        for (auto& t : v)
-            t.join();
+
+    // Wait for all the threads to be ready to take the lock before we unlock it from here, otherwise
+    // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+    // There is still technically a race condition here.
+    while (ready < 5)
+      /* spin */;
+    std::this_thread::sleep_for(wait_time / 5);
+
+    m.unlock(); // this should allow the threads to lock-shared 'm'
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Try to lock-shared an already-locked mutex for a short amount of time and fail.
+  // Again, this is technically flaky but we use such long durations that it should work.
+  {
+    std::chrono::milliseconds const wait_time(10);
+    std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+    std::shared_timed_mutex m;
+    m.lock();
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        auto elapsed = measure([&] {
+          bool succeeded = m.try_lock_shared_for(wait_time);
+          assert(!succeeded);
+        });
+
+        // Ensure we failed within some bounded time.
+        assert(elapsed - wait_time < tolerance);
+      }));
     }
 
+    for (auto& t : threads)
+      t.join();
+
+    m.unlock();
+  }
+
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
index a95ffab0aa60d..af88baeebf5d3 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
@@ -5,12 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
@@ -18,73 +16,109 @@
 // template <class Clock, class Duration>
 //     bool try_lock_shared_until(const chrono::time_point<Clock, Duration>& abs_time);
 
-#include <thread>
-
+#include <shared_mutex>
 #include <atomic>
 #include <cassert>
 #include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
+#include <thread>
 #include <vector>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms SuccessWaitTime = ms(5000); // Some machines are busy or slow or both
-ms FailureWaitTime = ms(50);
-
-// On busy or slow machines, there can be a significant delay between thread
-// creation and thread start, so we use an atomic variable to signal that the
-// thread is actually executing.
-static std::atomic<unsigned> countDown;
-
-void f1()
-{
-  --countDown;
-  time_point t0 = Clock::now();
-  assert(m.try_lock_shared_until(Clock::now() + SuccessWaitTime) == true);
-  time_point t1 = Clock::now();
-  m.unlock_shared();
-  assert(t1 - t0 <= SuccessWaitTime);
-}
 
-void f2()
-{
-  time_point t0 = Clock::now();
-  assert(m.try_lock_shared_until(Clock::now() + FailureWaitTime) == false);
-  assert(Clock::now() - t0 >= FailureWaitTime);
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+  std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+  f();
+  std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+  return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 }
 
-int main(int, char**)
-{
-  int threads = 5;
+int main(int, char**) {
+  // Try to lock-shared a mutex that is not locked yet. This should succeed immediately.
+  {
+    std::shared_timed_mutex m;
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1));
+        assert(succeeded);
+        m.unlock_shared();
+      }));
+    }
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  // Try to lock-shared an already-locked mutex for a long enough amount of time and succeed.
+  // This is technically flaky, but we use such long durations that it should pass even
+  // in slow or contended environments.
   {
-    countDown.store(threads);
+    std::chrono::milliseconds const wait_time(500);
+    std::chrono::milliseconds const tolerance = wait_time * 3;
+    std::atomic<int> ready(0);
+
+    std::shared_timed_mutex m;
     m.lock();
-    std::vector<std::thread> v;
-    for (int i = 0; i < threads; ++i)
-      v.push_back(support::make_test_thread(f1));
-    while (countDown > 0)
-      std::this_thread::yield();
-    m.unlock();
-    for (auto& t : v)
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        ++ready;
+        while (ready < 5)
+          /* spin until all threads are created */;
+
+        auto elapsed = measure([&] {
+          bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + wait_time);
+          assert(succeeded);
+          m.unlock_shared();
+        });
+
+        // Ensure we didn't wait significantly longer than our timeout. This is technically
+        // flaky and non-conforming because an implementation is free to block for arbitrarily
+        // long, but any decent quality implementation should pass this test.
+        assert(elapsed - wait_time < tolerance);
+      }));
+    }
+
+    // Wait for all the threads to be ready to take the lock before we unlock it from here, otherwise
+    // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+    // There is still technically a race condition here.
+    while (ready < 5)
+      /* spin */;
+    std::this_thread::sleep_for(wait_time / 5);
+
+    m.unlock(); // this should allow the threads to lock-shared 'm'
+
+    for (auto& t : threads)
       t.join();
   }
+
+  // Try to lock-shared an already-locked mutex for a short amount of time and fail.
+  // Again, this is technically flaky but we use such long durations that it should work.
   {
+    std::chrono::milliseconds const wait_time(10);
+    std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+    std::shared_timed_mutex m;
     m.lock();
-    std::vector<std::thread> v;
-    for (int i = 0; i < threads; ++i)
-      v.push_back(support::make_test_thread(f2));
-    for (auto& t : v)
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i != 5; ++i) {
+      threads.push_back(support::make_test_thread([&] {
+        auto elapsed = measure([&] {
+          bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + wait_time);
+          assert(!succeeded);
+        });
+
+        // Ensure we failed within some bounded time.
+        assert(elapsed - wait_time < tolerance);
+      }));
+    }
+
+    for (auto& t : threads)
       t.join();
+
     m.unlock();
   }
 
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
index fb521efdb93f5..948364d67236c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
@@ -5,12 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11
 
-// ALLOW_RETRIES: 2
-
 // <shared_mutex>
 
 // class shared_timed_mutex;
@@ -19,69 +17,89 @@
 //     bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
 
 #include <shared_mutex>
-#include <thread>
-#include <cstdlib>
+#include <atomic>
 #include <cassert>
 #include <chrono>
+#include <thread>
 
 #include "make_test_thread.h"
-#include "test_macros.h"
 
-std::shared_timed_mutex m;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+  std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+  f();
+  std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+  return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+}
 
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+int main(int, char**) {
+  // Try to lock a mutex that is not locked yet. This should succeed immediately.
+  {
+    std::shared_timed_mutex m;
+    bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1));
+    assert(succeeded);
+    m.unlock();
+  }
 
+  // Try to lock an already-locked mutex for a long enough amount of time and succeed.
+  // This is technically flaky, but we use such long durations that it should pass even
+  // in slow or contended environments.
+  {
+    std::chrono::milliseconds const wait_time(500);
+    std::chrono::milliseconds const tolerance = wait_time * 3;
+    std::atomic<bool> ready(false);
 
-ms WaitTime = ms(250);
+    std::shared_timed_mutex m;
+    m.lock();
 
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+    std::thread t = support::make_test_thread([&] {
+      auto elapsed = measure([&] {
+        ready          = true;
+        bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time);
+        assert(succeeded);
+        m.unlock();
+      });
 
-void f1()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_until(Clock::now() + WaitTime + Tolerance) == true);
-    time_point t1 = Clock::now();
-    m.unlock();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
+      // Ensure we didn't wait significantly longer than our timeout. This is technically
+      // flaky and non-conforming because an implementation is free to block for arbitrarily
+      // long, but any decent quality implementation should pass this test.
+      assert(elapsed - wait_time < tolerance);
+    });
 
-void f2()
-{
-    time_point t0 = Clock::now();
-    assert(m.try_lock_until(Clock::now() + WaitTime) == false);
-    time_point t1 = Clock::now();
-    ns d = t1 - t0 - WaitTime;
-    assert(d < Tolerance);  // within tolerance
-}
+    // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise
+    // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+    // There is still technically a race condition here.
+    while (!ready)
+      /* spin */;
+    std::this_thread::sleep_for(wait_time / 5);
 
-int main(int, char**)
-{
-    {
-        m.lock();
-        std::thread t = support::make_test_thread(f1);
-        std::this_thread::sleep_for(WaitTime);
-        m.unlock();
-        t.join();
-    }
-    {
-        m.lock();
-        std::thread t = support::make_test_thread(f2);
-        std::this_thread::sleep_for(WaitTime + Tolerance);
-        m.unlock();
-        t.join();
-    }
+    m.unlock(); // this should allow the thread to lock 'm'
+    t.join();
+  }
+
+  // Try to lock an already-locked mutex for a short amount of time and fail.
+  // Again, this is technically flaky but we use such long durations that it should work.
+  {
+    std::chrono::milliseconds const wait_time(10);
+    std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+    std::shared_timed_mutex m;
+    m.lock();
+
+    std::thread t = support::make_test_thread([&] {
+      auto elapsed = measure([&] {
+        bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time);
+        assert(!succeeded);
+      });
+
+      // Ensure we failed within some bounded time.
+      assert(elapsed - wait_time < tolerance);
+    });
+
+    t.join();
+
+    m.unlock();
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
index 22eed736c6b75..5a4a0a94b0191 100644
--- a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
index c01c78506587c..b244a9d9eda2a 100644
--- a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
index dcc298ce11ce8..b7c8d5340b982 100644
--- a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <semaphore>
 
diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
index 6f3ed5e345e0b..bf6b0f05e64f0 100644
--- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // <semaphore>
 
diff --git a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
index 3c4d179e50433..d068872ea7230 100644
--- a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
index 77f15ece221d4..ad3c0fb103790 100644
--- a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
index ec159daf87a3f..fb6fff3baf4c4 100644
--- a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
@@ -7,10 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
index 3287b950743fb..5618836d89e80 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
index cc38a5324b5ca..f55f0e2af8cb2 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
index 10b73175e9902..8555ebd742616 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
index 575e5dda93f53..39a257592809d 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
index b478893434e8e..f0db4e18a0235 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
index 5a2f0147b3116..b507e77fd964b 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/rep.h b/libcxx/test/std/time/rep.h
index ddb5c0b249317..6df3e7ad27c4d 100644
--- a/libcxx/test/std/time/rep.h
+++ b/libcxx/test/std/time/rep.h
@@ -40,16 +40,16 @@ struct RepConstConvertibleLWG3050 {
   operator long() = delete;
   operator long() const { return 2; }
 };
-namespace std {
+
 template <>
-struct common_type<RepConstConvertibleLWG3050, int> {
+struct std::common_type<RepConstConvertibleLWG3050, int> {
   using type = long;
 };
 template <>
-struct common_type<int, RepConstConvertibleLWG3050> {
+struct std::common_type<int, RepConstConvertibleLWG3050> {
   using type = long;
 };
-} // namespace std
+
 #endif // TEST_STD_VER >= 11
 
 // std::chrono:::duration has only '*', '/' and '%' taking a "Rep" parameter
diff --git a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp
index 33d8cd9f776d2..0101d205b5555 100644
--- a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp
@@ -17,10 +17,13 @@
 #include <chrono>
 #include <type_traits>
 #include <cassert>
+#include <concepts>
 
 #include "test_macros.h"
 #include "test_comparisons.h"
 
+static_assert(!std::totally_ordered<std::chrono::weekday>);
+
 int main(int, char**)
 {
     using weekday = std::chrono::weekday;
diff --git a/libcxx/test/std/time/time.clock/time.clock.file/now.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.file/now.pass.cpp
index 4ba17a48988b3..0852483a8b39f 100644
--- a/libcxx/test/std/time/time.clock/time.clock.file/now.pass.cpp
+++ b/libcxx/test/std/time/time.clock/time.clock.file/now.pass.cpp
@@ -10,9 +10,6 @@
 
 // UNSUPPORTED: availability-filesystem-missing
 
-// qemu: Unsupported SemiHosting SWI 0x30
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
-
 // <chrono>
 
 // file_clock
diff --git a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
index 35f3af2d6dbc3..02dc215d35c53 100644
--- a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
index c1d7c21d864d0..1a471ac227987 100644
--- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
index f57841cca8629..64fb1f1586623 100644
--- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
index b4127af511b58..50e3963a21753 100644
--- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
index 019a1fc473275..db69e0820c424 100644
--- a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
index 45c3a12ea35cb..f062e7ac68192 100644
--- a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
index 6e73638408a2c..09c0fadbe0ea4 100644
--- a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
index 746554d968311..63ead5c152d5d 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
index f29ccea414305..6f9bfd834ff64 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
index f2a73c5c25f91..3d3430f653ff0 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
index d57977413b696..0e9018dae6d73 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
index 3a7d6f9a6b01f..96e77daa0e19e 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
index 28b742acba60b..4f232ee481a25 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
index 03d3e77e0d712..3ebc2671be3f9 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
index ea73382cb1909..c8f68aa925aa8 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
index cf1b99a62d963..e1a7d788f9db1 100644
--- a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
index f80f474304b48..96b654d828b48 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
index 1f2af1cb0530d..95e1795b573b6 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
index db1cdfcf0e88a..e31ed5271cd26 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
index fcf0acc926d12..38a9538bbd76a 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
index a9c2b34de9320..3dfda7261c795 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp
new file mode 100644
index 0000000000000..900cf42cf4fdc
--- /dev/null
+++ b/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp
@@ -0,0 +1,975 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// TODO FMT This test should not require std::to_chars(floating-point)
+// XFAIL: availability-fp_to_chars-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// REQUIRES: locale.fr_FR.UTF-8
+// REQUIRES: locale.ja_JP.UTF-8
+
+// <chrono>
+//
+// template<class Duration, class TimeZonePtr, class charT>
+// struct formatter<chrono::zoned_time<Duration, TimeZonePtr>, charT>
+
+#include <chrono>
+#include <format>
+
+#include <cassert>
+#include <concepts>
+#include <locale>
+#include <iostream>
+#include <type_traits>
+
+#include "formatter_tests.h"
+#include "make_string.h"
+#include "platform_support.h" // locale name macros
+#include "test_macros.h"
+
+template <class CharT>
+static void test_no_chrono_specs() {
+  using namespace std::literals::chrono_literals;
+
+  check(SV("1970-01-01 01:00:00.000000042 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::nanoseconds>{42ns}));
+  check(SV("1970-01-01 01:00:00.000042 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::microseconds>{42us}));
+  check(SV("1970-01-01 01:00:00.042 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::milliseconds>{42ms}));
+  check(SV("1970-01-01 01:00:42 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::seconds>{42s}));
+  check(SV("1970-02-12 01:00:00 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::days>{std::chrono::days{42}}));
+  check(SV("1970-10-22 01:00:00 +01"),
+        SV("{}"),
+        std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::weeks>{std::chrono::weeks{42}}));
+}
+
+template <class CharT>
+static void test_valid_values_year() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt =
+      SV("{:%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}");
+  constexpr std::basic_string_view<CharT> lfmt =
+      SV("{:L%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use the global locale (fr_FR)
+  check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+
+  check(loc,
+        SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#else  // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)|| defined(__FreeBSD__)
+
+  check(loc,
+        SV("%C='19'\t%EC='昭和'\t%y='70'\t%Oy='七十'\t%Ey='45'\t%Y='1970'\t%EY='昭和45年'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%C='20'\t%EC='平成'\t%y='09'\t%Oy='九'\t%Ey='21'\t%Y='2009'\t%EY='平成21年'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)|| defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_month() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%b='Jan'\t%h='Jan'\t%B='January'\t%m='01'\t%Om='01'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%b='May'\t%h='May'\t%B='May'\t%m='05'\t%Om='05'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use the global locale (fr_FR)
+#if defined(__APPLE__)
+  check(SV("%b='jan'\t%h='jan'\t%B='janvier'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+#else
+  check(SV("%b='janv.'\t%h='janv.'\t%B='janvier'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+#endif
+
+  check(SV("%b='mai'\t%h='mai'\t%B='mai'\t%m='05'\t%Om='05'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#ifdef _WIN32
+  check(loc,
+        SV("%b='1'\t%h='1'\t%B='1月'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%b='5'\t%h='5'\t%B='5月'\t%m='05'\t%Om='05'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#elif defined(_AIX)                                                         // _WIN32
+  check(loc,
+        SV("%b='1月'\t%h='1月'\t%B='1月'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%b='5月'\t%h='5月'\t%B='5月'\t%m='05'\t%Om='05'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#elif defined(__APPLE__)                                                    // _WIN32
+  check(loc,
+        SV("%b=' 1'\t%h=' 1'\t%B='1月'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%b=' 5'\t%h=' 5'\t%B='5月'\t%m='05'\t%Om='05'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#elif defined(__FreeBSD__)                                                  // _WIN32
+  check(loc,
+        SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='05'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#else                                                                       // _WIN32
+  check(loc,
+        SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='一'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='五'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#endif                                                                      // _WIN32
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_day() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use the global locale (fr_FR)
+  check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%d='01'\t%Od='一'\t%e=' 1'\t%Oe='一'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%d='13'\t%Od='十三'\t%e='13'\t%Oe='十三'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_weekday() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt =
+      SV("{:%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}");
+  constexpr std::basic_string_view<CharT> lfmt =
+      SV("{:L%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%a='Thu'\t%A='Thursday'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%a='Sun'\t%A='Sunday'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{4'294'967'295s})); // 06:28:15 UTC on Sunday, 7 February 2106
+
+  // Use the global locale (fr_FR)
+#if defined(__APPLE__)
+  check(SV("%a='Jeu'\t%A='Jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%a='Dim'\t%A='Dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{4'294'967'295s})); // 06:28:15 UTC on Sunday, 7 February 2106
+#else
+  check(SV("%a='jeu.'\t%A='jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%a='dim.'\t%A='dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{4'294'967'295s})); // 06:28:15 UTC on Sunday, 7 February 2106
+#endif
+
+  // Use supplied locale (ja_JP).
+  // This locale has a different alternate, but not on all platforms
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{4'294'967'295s})); // 06:28:15 UTC on Sunday, 7 February 2106
+#else  // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='四'\t%w='4'\t%Ow='四'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='七'\t%w='0'\t%Ow='〇'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{4'294'967'295s})); // 06:28:15 UTC on Sunday, 7 February 2106
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_day_of_year() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%j='%j'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%j='%j'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%j='001'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+  check(SV("%j='138'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use the global locale (fr_FR)
+  check(SV("%j='001'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+  check(SV("%j='138'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+  check(loc,
+        SV("%j='001'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%j='138'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_week() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use the global locale (fr_FR)
+  check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#else  // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%U='00'\t%OU='〇'\t%W='00'\t%OW='〇'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%U='20'\t%OU='二十'\t%W='20'\t%OW='二十'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{2'000'000'000s})); // 03:33:20 UTC on Wednesday, 18 May 2033
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_iso_8601_week() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use the global locale (fr_FR)
+  check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#else  // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%g='70'\t%G='1970'\t%V='01'\t%OV='一'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%g='09'\t%G='2009'\t%V='07'\t%OV='七'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_date() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/70'\t%Ex='01/01/70'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='02/13/09'\t%Ex='02/13/09'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use the global locale (fr_FR)
+#if defined(__APPLE__) || defined(__FreeBSD__)
+  check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01.01.1970'\t%Ex='01.01.1970'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13.02.2009'\t%Ex='13.02.2009'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#else
+  check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/1970'\t%Ex='01/01/1970'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13/02/2009'\t%Ex='13/02/2009'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#endif
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970/01/01'\t%Ex='1970/01/01'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009/02/13'\t%Ex='2009/02/13'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#else  // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+  check(loc,
+        SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970年01月01日'\t%Ex='昭和45年01月01日'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009年02月13日'\t%Ex='平成21年02月13日'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_time() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt = SV(
+      "{:"
+      "%%H='%H'%t"
+      "%%OH='%OH'%t"
+      "%%I='%I'%t"
+      "%%OI='%OI'%t"
+      "%%M='%M'%t"
+      "%%OM='%OM'%t"
+      "%%S='%S'%t"
+      "%%OS='%OS'%t"
+      "%%p='%p'%t"
+      "%%R='%R'%t"
+      "%%T='%T'%t"
+      "%%r='%r'%t"
+      "%%X='%X'%t"
+      "%%EX='%EX'%t"
+      "%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV(
+      "{:L"
+      "%%H='%H'%t"
+      "%%OH='%OH'%t"
+      "%%I='%I'%t"
+      "%%OI='%OI'%t"
+      "%%M='%M'%t"
+      "%%OM='%OM'%t"
+      "%%S='%S'%t"
+      "%%OS='%OS'%t"
+      "%%p='%p'%t"
+      "%%R='%R'%t"
+      "%%T='%T'%t"
+      "%%r='%r'%t"
+      "%%X='%X'%t"
+      "%%EX='%EX'%t"
+      "%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%H='00'\t"
+           "%OH='00'\t"
+           "%I='12'\t"
+           "%OI='12'\t"
+           "%M='00'\t"
+           "%OM='00'\t"
+           "%S='00'\t"
+           "%OS='00'\t"
+           "%p='AM'\t"
+           "%R='00:00'\t"
+           "%T='00:00:00'\t"
+           "%r='12:00:00 AM'\t"
+           "%X='00:00:00'\t"
+           "%EX='00:00:00'\t"
+           "\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%H='23'\t"
+           "%OH='23'\t"
+           "%I='11'\t"
+           "%OI='11'\t"
+           "%M='31'\t"
+           "%OM='31'\t"
+           "%S='30.123'\t"
+           "%OS='30.123'\t"
+           "%p='PM'\t"
+           "%R='23:31'\t"
+           "%T='23:31:30.123'\t"
+           "%r='11:31:30 PM'\t"
+           "%X='23:31:30'\t"
+           "%EX='23:31:30'\t"
+           "\n"),
+        fmt,
+        std::chrono::sys_time<std::chrono::milliseconds>(
+            1'234'567'890'123ms)); // 23:31:30 UTC on Friday, 13 February 2009
+  // Use the global locale (fr_FR)
+  check(SV("%H='00'\t"
+           "%OH='00'\t"
+           "%I='12'\t"
+           "%OI='12'\t"
+           "%M='00'\t"
+           "%OM='00'\t"
+           "%S='00'\t"
+           "%OS='00'\t"
+#if defined(_AIX)
+           "%p='AM'\t"
+#else
+           "%p=''\t"
+#endif
+           "%R='00:00'\t"
+           "%T='00:00:00'\t"
+#ifdef _WIN32
+           "%r='00:00:00'\t"
+#elif defined(_AIX)
+           "%r='12:00:00 AM'\t"
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+           "%r=''\t"
+#else
+           "%r='12:00:00 '\t"
+#endif
+           "%X='00:00:00'\t"
+           "%EX='00:00:00'\t"
+           "\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%H='23'\t"
+           "%OH='23'\t"
+           "%I='11'\t"
+           "%OI='11'\t"
+           "%M='31'\t"
+           "%OM='31'\t"
+           "%S='30,123'\t"
+           "%OS='30,123'\t"
+#if defined(_AIX)
+           "%p='PM'\t"
+#else
+           "%p=''\t"
+#endif
+           "%R='23:31'\t"
+           "%T='23:31:30,123'\t"
+#ifdef _WIN32
+           "%r='23:31:30'\t"
+#elif defined(_AIX)
+           "%r='11:31:30 PM'\t"
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+           "%r=''\t"
+#elif defined(_WIN32)
+           "%r='23:31:30 '\t"
+#else
+           "%r='11:31:30 '\t"
+#endif
+           "%X='23:31:30'\t"
+           "%EX='23:31:30'\t"
+           "\n"),
+        lfmt,
+        std::chrono::sys_time<std::chrono::milliseconds>(
+            1'234'567'890'123ms)); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.a
+#if defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__)
+  check(loc,
+        SV("%H='00'\t"
+           "%OH='00'\t"
+           "%I='12'\t"
+           "%OI='12'\t"
+           "%M='00'\t"
+           "%OM='00'\t"
+           "%S='00'\t"
+           "%OS='00'\t"
+#  if defined(__APPLE__)
+           "%p='AM'\t"
+#  else
+           "%p='午前'\t"
+#  endif
+           "%R='00:00'\t"
+           "%T='00:00:00'\t"
+#  if defined(__APPLE__) || defined(__FreeBSD__)
+#    if defined(__APPLE__)
+           "%r='12:00:00 AM'\t"
+#    else
+           "%r='12:00:00 午前'\t"
+#    endif
+           "%X='00時00分00秒'\t"
+           "%EX='00時00分00秒'\t"
+#  elif defined(_WIN32)
+           "%r='0:00:00'\t"
+           "%X='0:00:00'\t"
+           "%EX='0:00:00'\t"
+#  else
+           "%r='午前12:00:00'\t"
+           "%X='00:00:00'\t"
+           "%EX='00:00:00'\t"
+#  endif
+           "\n"),
+        lfmt,
+        std::chrono::hh_mm_ss(0s));
+
+  check(loc,
+        SV("%H='23'\t"
+           "%OH='23'\t"
+           "%I='11'\t"
+           "%OI='11'\t"
+           "%M='31'\t"
+           "%OM='31'\t"
+           "%S='30.123'\t"
+           "%OS='30.123'\t"
+#  if defined(__APPLE__)
+           "%p='PM'\t"
+#  else
+           "%p='午後'\t"
+#  endif
+           "%R='23:31'\t"
+           "%T='23:31:30.123'\t"
+#  if defined(__APPLE__) || defined(__FreeBSD__)
+#    if defined(__APPLE__)
+           "%r='11:31:30 PM'\t"
+#    else
+           "%r='11:31:30 午後'\t"
+#    endif
+           "%X='23時31分30秒'\t"
+           "%EX='23時31分30秒'\t"
+#  elif defined(_WIN32)
+           "%r='23:31:30'\t"
+           "%X='23:31:30'\t"
+           "%EX='23:31:30'\t"
+#  else
+           "%r='午後11:31:30'\t"
+           "%X='23:31:30'\t"
+           "%EX='23:31:30'\t"
+#  endif
+           "\n"),
+        lfmt,
+        std::chrono::hh_mm_ss(23h + 31min + 30s + 123ms));
+#else  // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__)
+  check(loc,
+        SV("%H='00'\t"
+           "%OH='〇'\t"
+           "%I='12'\t"
+           "%OI='十二'\t"
+           "%M='00'\t"
+           "%OM='〇'\t"
+           "%S='00'\t"
+           "%OS='〇'\t"
+           "%p='午前'\t"
+           "%R='00:00'\t"
+           "%T='00:00:00'\t"
+           "%r='午前12時00分00秒'\t"
+           "%X='00時00分00秒'\t"
+           "%EX='00時00分00秒'\t"
+           "\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%H='23'\t"
+           "%OH='二十三'\t"
+           "%I='11'\t"
+           "%OI='十一'\t"
+           "%M='31'\t"
+           "%OM='三十一'\t"
+           "%S='30.123'\t"
+           "%OS='三十.123'\t"
+           "%p='午後'\t"
+           "%R='23:31'\t"
+           "%T='23:31:30.123'\t"
+           "%r='午後11時31分30秒'\t"
+           "%X='23時31分30秒'\t"
+           "%EX='23時31分30秒'\t"
+           "\n"),
+        lfmt,
+        std::chrono::sys_time<std::chrono::milliseconds>(
+            1'234'567'890'123ms)); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_date_time() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%c='%c'%t%%Ec='%Ec'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%c='%c'%t%%Ec='%Ec'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%c='Thu Jan  1 00:00:00 1970'\t%Ec='Thu Jan  1 00:00:00 1970'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(SV("%c='Fri Feb 13 23:31:30 2009'\t%Ec='Fri Feb 13 23:31:30 2009'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use the global locale (fr_FR)
+  check(
+// https://sourceware.org/bugzilla/show_bug.cgi?id=24054
+#if defined(__powerpc__) && defined(__linux__)
+      SV("%c='jeu. 01 janv. 1970 00:00:00 UTC'\t%Ec='jeu. 01 janv. 1970 00:00:00 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+      SV("%c='jeu. 01 janv. 1970 00:00:00 GMT'\t%Ec='jeu. 01 janv. 1970 00:00:00 GMT'\n"),
+#elif defined(_AIX)
+      SV("%c=' 1 janvier 1970 à 00:00:00 UTC'\t%Ec=' 1 janvier 1970 à 00:00:00 UTC'\n"),
+#elif defined(__APPLE__)
+      SV("%c='Jeu  1 jan 00:00:00 1970'\t%Ec='Jeu  1 jan 00:00:00 1970'\n"),
+#elif defined(_WIN32)
+      SV("%c='01/01/1970 00:00:00'\t%Ec='01/01/1970 00:00:00'\n"),
+#elif defined(__FreeBSD__)
+      SV("%c='jeu.  1 janv. 00:00:00 1970'\t%Ec='jeu.  1 janv. 00:00:00 1970'\n"),
+#else
+      SV("%c='jeu. 01 janv. 1970 00:00:00'\t%Ec='jeu. 01 janv. 1970 00:00:00'\n"),
+#endif
+      lfmt,
+      std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(
+// https://sourceware.org/bugzilla/show_bug.cgi?id=24054
+#if defined(__powerpc__) && defined(__linux__)
+      SV("%c='ven. 13 févr. 2009 23:31:30 UTC'\t%Ec='ven. 13 févr. 2009 23:31:30 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+      SV("%c='ven. 13 févr. 2009 23:31:30 GMT'\t%Ec='ven. 13 févr. 2009 23:31:30 GMT'\n"),
+#elif defined(_AIX)
+      SV("%c='13 février 2009 à 23:31:30 UTC'\t%Ec='13 février 2009 à 23:31:30 UTC'\n"),
+#elif defined(__APPLE__)
+      SV("%c='Ven 13 fév 23:31:30 2009'\t%Ec='Ven 13 fév 23:31:30 2009'\n"),
+#elif defined(_WIN32)
+      SV("%c='13/02/2009 23:31:30'\t%Ec='13/02/2009 23:31:30'\n"),
+#elif defined(__FreeBSD__)
+      SV("%c='ven. 13 févr. 23:31:30 2009'\t%Ec='ven. 13 févr. 23:31:30 2009'\n"),
+#else
+      SV("%c='ven. 13 févr. 2009 23:31:30'\t%Ec='ven. 13 févr. 2009 23:31:30'\n"),
+#endif
+      lfmt,
+      std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+
+  // Use supplied locale (ja_JP). This locale has a different alternate.a
+#if defined(__APPLE__) || defined(__FreeBSD__)
+  check(loc,
+        SV("%c='木  1/ 1 00:00:00 1970'\t%Ec='木  1/ 1 00:00:00 1970'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+  check(loc,
+        SV("%c='金  2/13 23:31:30 2009'\t%Ec='金  2/13 23:31:30 2009'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_AIX)                                                         // defined(__APPLE__)|| defined(__FreeBSD__)
+  check(loc,
+        SV("%c='1970年01月 1日 00:00:00 UTC'\t%Ec='1970年01月 1日 00:00:00 UTC'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+  check(loc,
+        SV("%c='2009年02月13日 23:31:30 UTC'\t%Ec='2009年02月13日 23:31:30 UTC'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_WIN32)                                                       // defined(__APPLE__)|| defined(__FreeBSD__)
+  check(loc,
+        SV("%c='1970/01/01 0:00:00'\t%Ec='1970/01/01 0:00:00'\n"),
+        lfmt,
+        std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+  check(loc,
+        SV("%c='2009/02/13 23:31:30'\t%Ec='2009/02/13 23:31:30'\n"),
+        lfmt,
+        std::chrono::sys_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#else                                                                       // defined(__APPLE__)|| defined(__FreeBSD__)
+  check(loc,
+        SV("%c='1970年01月01日 00時00分00秒'\t%Ec='昭和45年01月01日 00時00分00秒'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  check(loc,
+        SV("%c='2009年02月13日 23時31分30秒'\t%Ec='平成21年02月13日 23時31分30秒'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{1'234'567'890s})); // 23:31:30 UTC on Friday, 13 February 2009
+#endif                                                                      // defined(__APPLE__)|| defined(__FreeBSD__)
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values_time_zone() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt  = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+        fmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  // Use the global locale (fr_FR)
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  // Use supplied locale (ja_JP).
+  check(loc,
+        SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+        lfmt,
+        std::chrono::zoned_time(std::chrono::sys_seconds{0s})); // 00:00:00 UTC Thursday, 1 January 1970
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test_valid_values() {
+  test_valid_values_year<CharT>();
+  test_valid_values_month<CharT>();
+  test_valid_values_day<CharT>();
+  test_valid_values_weekday<CharT>();
+  test_valid_values_day_of_year<CharT>();
+  test_valid_values_week<CharT>();
+  test_valid_values_iso_8601_week<CharT>();
+  test_valid_values_date<CharT>();
+  test_valid_values_time<CharT>();
+  test_valid_values_date_time<CharT>();
+  test_valid_values_time_zone<CharT>();
+}
+
+template <class CharT>
+static void test() {
+  test_no_chrono_specs<CharT>();
+  test_valid_values<CharT>();
+
+  check_invalid_types<CharT>(
+      {SV("a"),  SV("A"),  SV("b"),  SV("B"),  SV("c"),  SV("C"),  SV("d"),  SV("D"),  SV("e"),  SV("F"),  SV("g"),
+       SV("G"),  SV("h"),  SV("H"),  SV("I"),  SV("j"),  SV("m"),  SV("M"),  SV("p"),  SV("r"),  SV("R"),  SV("S"),
+       SV("T"),  SV("u"),  SV("U"),  SV("V"),  SV("w"),  SV("W"),  SV("x"),  SV("X"),  SV("y"),  SV("Y"),  SV("z"),
+       SV("Z"),  SV("Ec"), SV("EC"), SV("Ex"), SV("EX"), SV("Ey"), SV("EY"), SV("Ez"), SV("Od"), SV("Oe"), SV("OH"),
+       SV("OI"), SV("Om"), SV("OM"), SV("OS"), SV("Ou"), SV("OU"), SV("OV"), SV("Ow"), SV("OW"), SV("Oy"), SV("Oz")},
+      std::chrono::zoned_time{});
+}
+
+int main(int, char**) {
+  test<char>();
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.syn/formatter_tests.h b/libcxx/test/std/time/time.syn/formatter_tests.h
index 1b343b5c8711b..798b953a377bd 100644
--- a/libcxx/test/std/time/time.syn/formatter_tests.h
+++ b/libcxx/test/std/time/time.syn/formatter_tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
index 874c3d52e460b..e32b6d523d0ed 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
@@ -89,7 +89,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000000000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -103,7 +103,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -117,7 +117,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -131,7 +131,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -173,7 +173,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000000000 is ambiguous.  It could be
 1986-09-28 02:30:00.000000000 CEST == 1986-09-28 00:30:00.000000000 UTC or
 1986-09-28 02:30:00.000000000 CET == 1986-09-28 01:30:00.000000000 UTC)";
@@ -186,7 +186,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000000 is ambiguous.  It could be
 1986-09-28 02:30:00.000000 CEST == 1986-09-28 00:30:00.000000 UTC or
 1986-09-28 02:30:00.000000 CET == 1986-09-28 01:30:00.000000 UTC)";
@@ -199,7 +199,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000 is ambiguous.  It could be
 1986-09-28 02:30:00.000 CEST == 1986-09-28 00:30:00.000 UTC or
 1986-09-28 02:30:00.000 CET == 1986-09-28 01:30:00.000 UTC)";
@@ -212,7 +212,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00 is ambiguous.  It could be
 1986-09-28 02:30:00 CEST == 1986-09-28 00:30:00 UTC or
 1986-09-28 02:30:00 CET == 1986-09-28 01:30:00 UTC)";
diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h
index c137049bde8aa..e9262c5d95db1 100644
--- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h
+++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h
@@ -13,6 +13,7 @@
 #include <cassert>
 #include <charconv>
 #include <chrono>
+#include <format>
 #include <string_view>
 #include <type_traits>
 
@@ -42,6 +43,8 @@ class offset_time_zone {
 
   offset_time_zone* operator->() { return this; }
 
+  const offset_time_zone* operator->() const { return this; }
+
   template <class Duration>
   std::chrono::sys_time<std::common_type_t<Duration, std::chrono::seconds>>
   to_sys(const std::chrono::local_time<Duration>& local) const {
@@ -49,6 +52,22 @@ class offset_time_zone {
         local.time_since_epoch() + offset_};
   }
 
+  template <class Duration>
+  std::chrono::local_time<std::common_type_t<Duration, std::chrono::seconds>>
+  to_local(const std::chrono::sys_time<Duration>& sys) const {
+    return std::chrono::local_time<std::common_type_t<Duration, std::chrono::seconds>>{
+        sys.time_since_epoch() - offset_};
+  }
+
+  template <class Duration>
+  std::chrono::sys_info get_info(const std::chrono::sys_time<Duration>&) const {
+    return {std::chrono::sys_seconds::min(),
+            std::chrono::sys_seconds::max(),
+            offset_,
+            std::chrono::minutes{0},
+            std::format("{:+03d}s", offset_.count())};
+  }
+
 private:
   std::chrono::seconds offset_;
 };
diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.nonmembers/ostream.pass.cpp
new file mode 100644
index 0000000000000..06131d66c0f5c
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.nonmembers/ostream.pass.cpp
@@ -0,0 +1,351 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// TODO FMT This test should not require std::to_chars(floating-point)
+// XFAIL: availability-fp_to_chars-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// REQUIRES: locale.fr_FR.UTF-8
+// REQUIRES: locale.ja_JP.UTF-8
+
+// <chrono>
+
+// template<class charT, class traits, class Duration, class TimeZonePtr>
+// basic_ostream<charT, traits>&
+// operator<<(basic_ostream<charT, traits>& os,
+//            const zoned_time<Duration, TimeZonePtr>& t);
+
+#include <chrono>
+#include <cassert>
+#include <sstream>
+
+#include "assert_macros.h"
+#include "concat_macros.h"
+#include "make_string.h"
+#include "platform_support.h" // locale name macros
+#include "test_macros.h"
+#include "../test_offset_time_zone.h"
+
+#define SV(S) MAKE_STRING_VIEW(CharT, S)
+
+#define TEST_EQUAL(OUT, EXPECTED)                                                                                      \
+  TEST_REQUIRE(OUT == EXPECTED,                                                                                        \
+               TEST_WRITE_CONCATENATED(                                                                                \
+                   "\nExpression      ", #OUT, "\nExpected output ", EXPECTED, "\nActual output   ", OUT, '\n'));
+
+template <class CharT, class Duration, class TimeZonePtr>
+static std::basic_string<CharT> stream_c_locale(std::chrono::zoned_time<Duration, TimeZonePtr> time_point) {
+  std::basic_stringstream<CharT> sstr;
+  sstr << time_point;
+  return sstr.str();
+}
+
+template <class CharT, class Duration, class TimeZonePtr>
+static std::basic_string<CharT> stream_fr_FR_locale(std::chrono::zoned_time<Duration, TimeZonePtr> time_point) {
+  std::basic_stringstream<CharT> sstr;
+  const std::locale locale(LOCALE_fr_FR_UTF_8);
+  sstr.imbue(locale);
+  sstr << time_point;
+  return sstr.str();
+}
+
+template <class CharT, class Duration, class TimeZonePtr>
+static std::basic_string<CharT> stream_ja_JP_locale(std::chrono::zoned_time<Duration, TimeZonePtr> time_point) {
+  std::basic_stringstream<CharT> sstr;
+  const std::locale locale(LOCALE_ja_JP_UTF_8);
+  sstr.imbue(locale);
+  sstr << time_point;
+  return sstr.str();
+}
+
+template <class CharT>
+static void test_c() {
+  using namespace std::literals::chrono_literals;
+
+  { //  Different durations
+    TEST_EQUAL(stream_c_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::nanoseconds>{42ns})),
+               SV("1970-01-01 01:00:00.000000042 +01"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::microseconds>{42us})),
+               SV("1970-01-01 01:00:00.000042 +01"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::milliseconds>{42ms})),
+               SV("1970-01-01 01:00:00.042 +01"));
+
+    TEST_EQUAL(
+        stream_c_locale<CharT>(std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::seconds>{42s})),
+        SV("1970-01-01 01:00:42 +01"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::days>{std::chrono::days{42}})),
+               SV("1970-02-12 01:00:00 +01"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::weeks>{std::chrono::weeks{42}})),
+               SV("1970-10-22 01:00:00 +01"));
+  }
+
+  { // Daylight saving time switches
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+    // ...
+    // 1 DE CE%sT 1980
+    // 1 E CE%sT
+    //
+    // ...
+    // R E 1979 1995 - S lastSu 1u 0 -
+    // R E 1981 ma - Mar lastSu 1u 1 S
+
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Start of daylight saving time
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 0h + 59min + 59s)),
+               SV("1986-03-30 01:59:59 CET"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 1h)),
+               SV("1986-03-30 03:00:00 CEST"));
+
+    // End of daylight saving time
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 0h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CEST"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h)),
+               SV("1986-09-28 02:00:00 CET"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CET"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h)),
+               SV("1986-09-28 03:00:00 CET"));
+  }
+
+  { // offset pointer
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:00 +00s"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"42"}, std::chrono::sys_seconds{})),
+               SV("1969-12-31 23:59:18 +42s"));
+
+    TEST_EQUAL(stream_c_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"-42"}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:42 -42s"));
+  }
+}
+
+template <class CharT>
+static void test_fr_FR() {
+  using namespace std::literals::chrono_literals;
+
+  { //  Different durations
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::nanoseconds>{42ns})),
+               SV("1970-01-01 01:00:00,000000042 +01"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::microseconds>{42us})),
+               SV("1970-01-01 01:00:00,000042 +01"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::milliseconds>{42ms})),
+               SV("1970-01-01 01:00:00,042 +01"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::seconds>{42s})),
+               SV("1970-01-01 01:00:42 +01"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::days>{std::chrono::days{42}})),
+               SV("1970-02-12 01:00:00 +01"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::weeks>{std::chrono::weeks{42}})),
+               SV("1970-10-22 01:00:00 +01"));
+  }
+
+  { // Daylight saving time switches
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+    // ...
+    // 1 DE CE%sT 1980
+    // 1 E CE%sT
+    //
+    // ...
+    // R E 1979 1995 - S lastSu 1u 0 -
+    // R E 1981 ma - Mar lastSu 1u 1 S
+
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Start of daylight saving time
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 0h + 59min + 59s)),
+               SV("1986-03-30 01:59:59 CET"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 1h)),
+               SV("1986-03-30 03:00:00 CEST"));
+
+    // End of daylight saving time
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 0h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CEST"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h)),
+               SV("1986-09-28 02:00:00 CET"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CET"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h)),
+               SV("1986-09-28 03:00:00 CET"));
+  }
+
+  { // offset pointer
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:00 +00s"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"42"}, std::chrono::sys_seconds{})),
+               SV("1969-12-31 23:59:18 +42s"));
+
+    TEST_EQUAL(stream_fr_FR_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"-42"}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:42 -42s"));
+  }
+}
+
+template <class CharT>
+static void test_ja_JP() {
+  using namespace std::literals::chrono_literals;
+
+  { //  Different durations
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::nanoseconds>{42ns})),
+               SV("1970-01-01 01:00:00.000000042 +01"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::microseconds>{42us})),
+               SV("1970-01-01 01:00:00.000042 +01"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::milliseconds>{42ms})),
+               SV("1970-01-01 01:00:00.042 +01"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(
+                   std::chrono::zoned_time("Etc/GMT-1", std::chrono::sys_time<std::chrono::seconds>{42s})),
+               SV("1970-01-01 01:00:42 +01"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::days>{std::chrono::days{42}})),
+               SV("1970-02-12 01:00:00 +01"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Etc/GMT-1", std::chrono::sys_time<std::chrono::weeks>{std::chrono::weeks{42}})),
+               SV("1970-10-22 01:00:00 +01"));
+  }
+
+  { // Daylight saving time switches
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+    // ...
+    // 1 DE CE%sT 1980
+    // 1 E CE%sT
+    //
+    // ...
+    // R E 1979 1995 - S lastSu 1u 0 -
+    // R E 1981 ma - Mar lastSu 1u 1 S
+
+    // Pick an historic date where it's well known what the time zone rules were.
+    // This makes it unlikely updates to the database change these rules.
+
+    // Start of daylight saving time
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 0h + 59min + 59s)),
+               SV("1986-03-30 01:59:59 CET"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::March / 30 / 1986} + 1h)),
+               SV("1986-03-30 03:00:00 CEST"));
+
+    // End of daylight saving time
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 0h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CEST"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h)),
+               SV("1986-09-28 02:00:00 CET"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 1h + 59min + 59s)),
+               SV("1986-09-28 02:59:59 CET"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   "Europe/Berlin", std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h)),
+               SV("1986-09-28 03:00:00 CET"));
+  }
+
+  { // offset pointer
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:00 +00s"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"42"}, std::chrono::sys_seconds{})),
+               SV("1969-12-31 23:59:18 +42s"));
+
+    TEST_EQUAL(stream_ja_JP_locale<CharT>(std::chrono::zoned_time(
+                   offset_time_zone<offset_time_zone_flags::none>{"-42"}, std::chrono::sys_seconds{})),
+               SV("1970-01-01 00:00:42 -42s"));
+  }
+}
+
+template <class CharT>
+static void test() {
+  test_c<CharT>();
+  test_fr_FR<CharT>();
+  test_ja_JP<CharT>();
+}
+
+int main(int, char**) {
+  test<char>();
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
index 545215a3b1613..09d2ade8e351e 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
index 67a91b99f26c1..9c7ba5a9a4bc8 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
index 4e1e2ea864de2..01f666274cbee 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
index 092e1153103c8..92e2fef9d9f02 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
index bc5e356161a74..3ea5d8bee1f86 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
index 2d3b03689b1d9..807a8af5bb5f6 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
index 2f5291332355c..2f6af70ee1d74 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
index 065827ae652c9..1fe7c41fc0901 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
index 92a0c42e88d29..e7909d6074813 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
index 98501973d2a6f..4de9b98d96200 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
index 2f36349743f38..cc365c93313d3 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
index 7e37f8bdf4abc..d901177b7080b 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
index 16de28d970396..ac369a5bac18a 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
index 0e30ea2c7fe0b..dea0e53f84f1c 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
index ba9831750c842..9e78596929fb6 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
index dcd046bdd9d89..9f9ff7ee7717d 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
index 88ec41939439a..31de6e4e0243c 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
index a97086fcc9195..ed2afbbeb910b 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
index cd89e2445860a..b8ab466c85b99 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
index 1cf3d9cc2ef49..fa9f5a115ccf5 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
index 27ce97737d288..f71d17c8df400 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
index 4f5d3d1492d37..6da98e11c4a13 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
index bbfd3048533c7..9cc8d11a13f93 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
index 800d47bda6958..cb87db3801aa1 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
index 7596d2e6ec642..a4250e606d6b3 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
index 29945d24ef7a0..bc8b9de97e4d2 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
index 62f0d6fe57ea9..9325c6c61ad2d 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
index 963f39b1cbc2c..a8c469d01be28 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
index 72fb16609bf96..6cbadb849a384 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
index 8daecf3950ec0..e4aab72b7df04 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
index 80bc243fcaefd..628538e588846 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
index 491e5e6b96b9a..3969466acc9fc 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
index 8e39986893e54..6111d7a351134 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
index 44d1b2fc2f7c3..7a2d3cba68d17 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
index fa05a136e2f86..82bcd7be4c432 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
index 05b49e3d8a48b..607113a856646 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
index f19599d43b32d..d2eaf5c26c2e0 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
index 74f46f0ad9bc4..088ed21c64c04 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
index 31573be894d37..010f45f6214d8 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
index eb23a2f981476..61b09d945c1d6 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
index 9a6600232d96b..57a26a6099fb6 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
index 86e903aaf6320..bdeb74e3b45fc 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
index 5cad5c77cef20..3e842444d9cb7 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
index ee9a228cdcfd7..b8927f5947ca7 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
index 3028dfa25756c..53d4c6b27c399 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
index 4d249df64592f..3c29cf9758046 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
index 8cfe555f70131..c6b9f3cfb80e9 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
index ffd8ed8e209a0..13a722940e55c 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
index 4f22eb4773e6e..33c85ea8cbf33 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
index 415435b921185..69b25252c2bca 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
index ac0fb82353470..711a779957d3d 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
index d77870e648ece..4f36514a0d3a9 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
index a51916f874eea..6ed1a98f7dbaf 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
index 60ae0346f7af1..d96a70ccff67f 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
index 699597d243c19..409352e8a8cb0 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
index 641eb4927a724..31cecc137e9ee 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
index a01895abf660a..5df73cfba4950 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
index 05f556e25eac1..cca3ec588f3ba 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
index a48888be53ee0..cb2d46fc74f2f 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
index 7c04a5fa9d044..430f0ca9989c4 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
index e05cf027719b7..d6a71fab6d002 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
index 848e61dd60779..c00d0df528857 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
index bfb5028c9264d..1a79a4fd8bedc 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
index 85bc98d7f462d..4e7657cb878f7 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
index 4128668a6b07b..7ece9dba04032 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
index ba738a3e339de..be8dccea11062 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
index 33a5e7293df21..fea7825a1a238 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
index aa8f225110bb5..798149f45ae09 100644
--- a/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
index eb05a84643211..8b24875586852 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
index ce95d162b55b1..4500971131b65 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
index 42e9f22969663..b4e6b743e95d3 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
index 536f8980c27a4..b08ffce93e180 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
index f7be7607aede3..b9d6667be0457 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
index fe92bb401643d..549d2f08c07cd 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
index a24d67c57e19b..2b109f97e31a4 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
index 5fe3db5c07fa9..b76a561d6618e 100644
--- a/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
index 25601af39a316..368a85b4f1a1e 100644
--- a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
index 62fd0f25ae3a9..de551be3189d2 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
index 2d5ee8349b749..ff515060dbf45 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
index 73c4395a4a630..2aa580748315f 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
index 0f61c4bd701ab..e92b559ea8c23 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
index 407ab9af9839c..472d5edb6e3e8 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
index 284b03c32cd09..829b74121b9c6 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
index 4c60cb0e9ae1d..874d609432f22 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
index 6a3896c8965c7..e3e3e9a19e122 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
index 5ab8c649cc58c..d99675a71f321 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
index b87b5c774ef7e..e1c3493618292 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
index bb542a8e63ecf..4686ed89e97f3 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
index 80439483414cc..c7dd82d726b3a 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
index ba5ef4e2b8884..d44877e686535 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
index 94a1ea576bda0..52cfa2c81c21a 100644
--- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
index a20972c16ffcb..bddc36dad2b73 100644
--- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
index c81074e25e172..824c5d636a324 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
index 824813d33a519..673d5ddc5f85c 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
index 83ece7da73ad3..a32e75a1469b0 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
index 14bdc1426098e..197f9bd6a2d90 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
index 3d3a5d1560b79..b887183103191 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp
new file mode 100644
index 0000000000000..934de7ca4dcf2
--- /dev/null
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp
@@ -0,0 +1,236 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <format>
+
+// template<class T>
+// constexpr bool enable_nonlocking_formatter_optimization = false;
+
+// Remarks: Pursuant to [namespace.std], users may specialize
+// enable_nonlocking_formatter_optimization for cv-unqualified program-defined
+// types. Such specializations shall be usable in constant expressions
+// ([expr.const]) and have type const bool.
+
+// [format.formatter.spec]
+// In addition, for each type T for which a formatter specialization is provided
+// above, each of the headers provides the following specialization:
+//
+// template<>
+// inline constexpr bool enable_nonlocking_formatter_optimization<T> = true;
+
+#include <array>
+#include <bitset>
+#include <bitset>
+#include <chrono>
+#include <complex>
+#include <concepts>
+#include <deque>
+#include <filesystem>
+#include <format>
+#include <forward_list>
+#include <list>
+#include <map>
+#include <memory>
+#include <optional>
+#include <queue>
+#include <set>
+#include <span>
+#include <stack>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <valarray>
+#include <variant>
+#include <vector>
+
+#include "test_macros.h"
+#include "min_allocator.h"
+
+#ifndef TEST_HAS_NO_LOCALIZATION
+#  include <regex>
+#endif
+#ifndef TEST_HAS_NO_THREADS
+#  include <thread>
+#endif
+
+// Tests for P0645 Text Formatting
+template <class CharT>
+void test_P0645() {
+  static_assert(std::enable_nonlocking_formatter_optimization<CharT>);
+
+  static_assert(std::enable_nonlocking_formatter_optimization<CharT*>);
+  static_assert(std::enable_nonlocking_formatter_optimization<const CharT*>);
+  static_assert(std::enable_nonlocking_formatter_optimization<CharT[42]>);
+
+  static_assert(std::enable_nonlocking_formatter_optimization<std::basic_string<CharT>>);
+  static_assert(std::enable_nonlocking_formatter_optimization<std::basic_string_view<CharT>>);
+
+  static_assert(std::enable_nonlocking_formatter_optimization<bool>);
+
+  static_assert(std::enable_nonlocking_formatter_optimization<signed char>);
+  static_assert(std::enable_nonlocking_formatter_optimization<signed short>);
+  static_assert(std::enable_nonlocking_formatter_optimization<signed int>);
+  static_assert(std::enable_nonlocking_formatter_optimization<signed long>);
+  static_assert(std::enable_nonlocking_formatter_optimization<signed long long>);
+#ifndef TEST_HAS_NO_INT128
+  static_assert(std::enable_nonlocking_formatter_optimization<__int128_t>);
+#endif
+
+  static_assert(std::enable_nonlocking_formatter_optimization<unsigned char>);
+  static_assert(std::enable_nonlocking_formatter_optimization<unsigned short>);
+  static_assert(std::enable_nonlocking_formatter_optimization<unsigned int>);
+  static_assert(std::enable_nonlocking_formatter_optimization<unsigned long>);
+  static_assert(std::enable_nonlocking_formatter_optimization<unsigned long long>);
+#ifndef TEST_HAS_NO_INT128
+  static_assert(std::enable_nonlocking_formatter_optimization<__uint128_t>);
+#endif
+
+  static_assert(std::enable_nonlocking_formatter_optimization<float>);
+  static_assert(std::enable_nonlocking_formatter_optimization<double>);
+  static_assert(std::enable_nonlocking_formatter_optimization<long double>);
+
+  static_assert(std::enable_nonlocking_formatter_optimization<std::nullptr_t>);
+  static_assert(std::enable_nonlocking_formatter_optimization<void*>);
+  static_assert(std::enable_nonlocking_formatter_optimization<const void*>);
+}
+
+// Tests for P1361 Integration of chrono with text formatting
+//
+// Some tests are commented out since these types haven't been implemented in
+// chrono yet. After P1361 has been implemented these formatters should be all
+// enabled.
+void test_P1361() {
+// The chrono formatters require localization support.
+// [time.format]/7
+//   If the chrono-specs is omitted, the chrono object is formatted as if by
+//   streaming it to std::ostringstream os with the formatting
+//   locale imbued and copying os.str() through the output iterator of the
+//   context with additional padding and adjustments as specified by the format
+//   specifiers.
+// In libc++ std:::ostringstream requires localization support.
+#ifndef TEST_HAS_NO_LOCALIZATION
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::microseconds>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::sys_time<std::chrono::microseconds>>);
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::utc_time<std::chrono::microseconds>>);
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::tai_time<std::chrono::microseconds>>);
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::gps_time<std::chrono::microseconds>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::file_time<std::chrono::microseconds>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::local_time<std::chrono::microseconds>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::day>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday_indexed>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday_last>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_day>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_day_last>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_weekday>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_weekday_last>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_day>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_day_last>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_weekday>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_weekday_last>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::hh_mm_ss<std::chrono::microseconds>>);
+
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::sys_info>);
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::local_info>);
+
+  //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::zoned_time>);
+
+#endif // TEST_HAS_NO_LOCALIZATION
+}
+
+// Tests for P1636 Formatters for library types
+//
+// The paper hasn't been voted in so currently all formatters are disabled.
+// Note the paper has been abandoned, the types are kept since other papers may
+// introduce these formatters.
+void test_P1636() {
+#ifndef TEST_HAS_NO_THREADS
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::thread::id>);
+#endif
+}
+
+template <class Vector>
+void test_P2286_vector_bool() {
+  static_assert(!std::enable_nonlocking_formatter_optimization<Vector>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<typename Vector::reference>);
+
+  // The const_reference shall be a bool.
+  // However libc++ uses a __bit_const_reference<vector> when
+  // _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL is defined.
+  static_assert(!std::enable_nonlocking_formatter_optimization<const Vector&>);
+  static_assert(std::enable_nonlocking_formatter_optimization<typename Vector::const_reference> ==
+                std::same_as<typename Vector::const_reference, bool>);
+}
+
+// Tests for P2286 Formatting ranges
+void test_P2286() {
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::array<int, 42>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::vector<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::deque<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::forward_list<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::list<int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::set<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::map<int, int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::multiset<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::multimap<int, int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_set<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_map<int, int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_multiset<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_multimap<int, int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::stack<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::queue<int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::priority_queue<int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::span<int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::valarray<int>>);
+
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::pair<int, int>>);
+  static_assert(!std::enable_nonlocking_formatter_optimization<std::tuple<int>>);
+
+  test_P2286_vector_bool<std::vector<bool>>();
+  test_P2286_vector_bool<std::vector<bool, std::allocator<bool>>>();
+  test_P2286_vector_bool<std::vector<bool, min_allocator<bool>>>();
+}
+
+// The trait does not care about whether the type is formattable, obviously the
+// trait for non formattable types are not used.
+struct not_formattable_nonlocking_disabled {};
+static_assert(!std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_disabled>);
+
+struct not_formattable_nonlocking_enabled {};
+template <>
+inline constexpr bool std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_enabled> = true;
+static_assert(std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_enabled>);
+
+void test() {
+  test_P0645<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test_P0645<wchar_t>();
+#endif
+  test_P1361();
+  test_P1636();
+  test_P2286();
+}
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
index 116f78e63be09..80b69463c9511 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
index 3125dd8b60bbd..4f9efca28803b 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
index f3d587e11f255..10436767f3c3e 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
index a7577c1bd9341..1fde08a117bc7 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
index 0723547c2df27..dd240eecb6a74 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index 23dcc0b7bad9b..1b3ff52d22d58 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
index 263dc1d8d8518..3ad84577b90d2 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
index 5921cc6efcecb..5c855805e45dd 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
index 408168e033bb6..19bb8bf0b47a5 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
index cdd56d1b882a0..d589a315dc924 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
index 49f54dae26478..9728a2f08fa44 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
index a9537465faf9d..7446ab26bf975 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
index f2fab8011c1a7..40a40a7dc7660 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
index db301d380262f..113c7a4d06d95 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
index 96a6b26d3111b..3760b1afa1e31 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
index ad36b6626bf83..db6683ed1cf1d 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
index ae50394738425..c7dfb601b95c7 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
index 3cd95b65ec102..9fb4fbbf70976 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
index 5049c29327566..5a82bcdcf943d 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
index 5f43b4ea032f8..a1404e0c8c944 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
index 079164b59f5e5..0962ff5265aa5 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
index bfd804140fe4c..0c2480f449ad8 100644
--- a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
index 5f248f88582d0..c2508fdee092e 100644
--- a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index 96c1e2664f7a6..fa3076224f2d1 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
index d0a687b01478a..cd555e1ab9ce8 100644
--- a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
index a0fe98e3554c2..1078c314bc704 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
index 0ddb597c2d092..a262d64f4579b 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
index 8b06141a4f339..63e6f8f65588a 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
index 5fe210e582d10..5ec244c4f8b31 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
index 089dd615451b3..4bec9094382de 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
index 53b9b29fa72c6..9e1f86b474b86 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h
index aa33c200caa98..b2ed6775fe8a1 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_tests.h
+++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
index 5de7c7be43a76..ec21133463a39 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
index f3eb3f1d1dc3d..488110ab2aa2b 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
index 5c0740960d4e9..31509e376d94a 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
index 8441e9f5a9705..671359a346997 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
index 948f2b37238ee..3c266ad756da2 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
index dd25616320587..ebe6ff081ac11 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
index 12ec4594f94cd..149e3413967ba 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
index ef90fe3a72e73..ccea8d4eea7cf 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
index 16f0524f45706..e5a85826a11fe 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
index 8ed848ec1fa0f..d8374435bf1ac 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
index ac59af6dc1811..472ac447b4ce9 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
index 91b786e2213e4..8420a03c16d62 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
index b25fd25f4e194..51be78e1a02f8 100644
--- a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
index b5b0442fb376c..11989f17aebf9 100644
--- a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
index 228ccbabcb328..2cde40d726664 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
index e16d50f18284f..99a1bd1f8f205 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
index bfbe9d0d81337..34d7c84f087be 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
index c247a9bc8cef2..0047db74cca1d 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
index 75d227b4335db..81fdab47df7ea 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
index 0eb984cc2c01a..156f0ee964db0 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
index 6f361de252215..97132ab99100e 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
index 87cf275ddf084..e767339057600 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
index 8756cb9156dca..d3c89e89731b8 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
index 397b1f5e4f3f3..05de4f909d898 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
index 869984f439bfc..a4b2854ba0b71 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
index 5ac84b4ba625e..d64e9a6dc2a8a 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
index 3ebaa054f0cab..11088796e5386 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
index 567ebc1d6480e..93745cfbbdaaf 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
index a7dc5e930d5af..c0e1d847a5770 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
index 99d6aa7452a02..3fb0a95f4c1d2 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
index 4b971cb1e13ba..98097cfbda9e1 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
index 5a0c89ecccfff..258a6a1b8465d 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
index 60dde24114361..6afae316b2003 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
index 5b94aa0ef4481..2a32e4f4eab55 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
index 182beff4bd168..ecba1746e5c72 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
index dd2815fa7a11f..5fc9f94f03ac0 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
index 261431c712445..632f664d6a995 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
index b836f2eeb2542..2328a72d2f91e 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
index 675a5e896ff57..7d6502e826cd4 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
index 3354de3472199..2317507c95275 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
index a5348104e26d2..a0b8aed3a33d2 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
index 78b067e8cffa9..06bac9b673709 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
index d46499b9834ed..9b43a225492e8 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
index 858952263b8ed..7b19ed0039e00 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
index 2d0cef11feb8c..f095d4dffb21e 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
index a26a10cc4b8aa..ae7eb89649104 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
index 9d402db4fe53c..fd80c3f452d7d 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
index 2770f4c221719..45511ae6d7ab4 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
index 5f1757a5beb3f..bbeb0df56dee5 100644
--- a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
index 9b13e253fcedb..b5a893fc1172f 100644
--- a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
index 8cdcc9d081d4b..ba4002c958511 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
index a353339d51ca7..3d7cdaa475db0 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h b/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
index e26d2dfaf6806..d8ed0157615e3 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
index e3f20e4b47ecb..f44e81b52b76a 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
index d83139bf2f952..c4fd45f528133 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
index 8653c282bfe10..0c5a11bfb5c04 100644
--- a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
index b04800867ba82..7aeaaf6a06b46 100644
--- a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
index af285e63f0e87..246de50783075 100644
--- a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/types.compile.pass.cpp b/libcxx/test/std/utilities/format/types.compile.pass.cpp
index 93723ddba6ce4..62ed37dc500a2 100644
--- a/libcxx/test/std/utilities/format/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/types.compile.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/difference_type.pass.cpp b/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/difference_type.pass.cpp
index 35721f1cbba9d..2cb2c670441b0 100644
--- a/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/difference_type.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/difference_type.pass.cpp
@@ -53,19 +53,13 @@ struct D
     typedef void difference_type;
 };
 
-namespace std
-{
-
 template <>
-struct pointer_traits<C<char>::pointer>
-{
-    typedef C<char>::pointer pointer;
-    typedef char             element_type;
-    typedef signed char      difference_type;
+struct std::pointer_traits<C<char>::pointer> {
+  typedef C<char>::pointer pointer;
+  typedef char element_type;
+  typedef signed char difference_type;
 };
 
-}
-
 int main(int, char**)
 {
     static_assert((std::is_same<std::allocator_traits<A<char> >::difference_type, short>::value), "");
diff --git a/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/size_type.pass.cpp b/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/size_type.pass.cpp
index cd74671390e7e..d8f15af6acf18 100644
--- a/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/size_type.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.traits/allocator.traits.types/size_type.pass.cpp
@@ -51,17 +51,11 @@ struct D {
     typedef void size_type;
 };
 
-namespace std
-{
-
 template <>
-struct pointer_traits<C<char>::pointer>
-{
-    typedef signed char difference_type;
+struct std::pointer_traits<C<char>::pointer> {
+  typedef signed char difference_type;
 };
 
-}
-
 int main(int, char**)
 {
     static_assert((std::is_same<std::allocator_traits<A<char> >::size_type, unsigned short>::value), "");
diff --git a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
index ab3c039497dd9..235a93987fe48 100644
--- a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
@@ -121,6 +121,63 @@ constexpr void testOne() {
     assert(std::get<2>(std::get<1>(ret)) == 3);
     assert(std::get<0>(std::get<2>(ret)) == 4);
   }
+  {
+    // Tests for ensuring forward declarations of uses_allocator_construction_args
+    // See https://github.com/llvm/llvm-project/issues/66714.
+    {
+      using NestedPairType = std::pair<int, std::pair<int, UsesAllocArgT>>;
+      std::same_as<std::tuple<
+          std::piecewise_construct_t,
+          std::tuple<>,
+          std::tuple<std::piecewise_construct_t, std::tuple<>, std::tuple<std::allocator_arg_t, const Alloc&>>>> auto
+          ret = test_uses_allocator_construction_args<NestedPairType>(a);
+      (void)ret;
+    }
+    {
+      using NestedPairType = std::pair<int, std::pair<UsesAllocArgT, int>>;
+      std::same_as<std::tuple<
+          std::piecewise_construct_t,
+          std::tuple<>,
+          std::tuple<std::piecewise_construct_t, std::tuple<std::allocator_arg_t, const Alloc&>, std::tuple<>>>> auto
+          ret = test_uses_allocator_construction_args<NestedPairType>(a);
+      (void)ret;
+    }
+    {
+      using PairType = std::pair<int, int>;
+      int up         = 1;
+      int vp         = 2;
+
+      std::same_as<std::tuple<std::piecewise_construct_t, std::tuple<int&&>, std::tuple<int&&>>> auto ret =
+          test_uses_allocator_construction_args<PairType>(a, std::move(up), std::move(vp));
+      (void)ret;
+    }
+    {
+      using PairType = const std::pair<int, int>;
+      PairType p(1, 2);
+
+      std::same_as<std::tuple<std::piecewise_construct_t, std::tuple<const int&>, std::tuple<const int&>>> auto ret =
+          test_uses_allocator_construction_args<PairType>(a, p);
+      (void)ret;
+    }
+    {
+      using PairType = std::pair<int, int>;
+      PairType p(1, 2);
+
+      std::same_as<std::tuple<std::piecewise_construct_t, std::tuple<int&&>, std::tuple<int&&>>> auto ret =
+          test_uses_allocator_construction_args<PairType>(a, std::move(p));
+      (void)ret;
+    }
+#if TEST_STD_VER >= 23
+    {
+      using PairType = const std::pair<int, int>;
+      PairType p(1, 2);
+
+      std::same_as<std::tuple<std::piecewise_construct_t, std::tuple<const int&&>, std::tuple<const int&&>>> auto ret =
+          test_uses_allocator_construction_args<PairType>(a, std::move(p));
+      (void)ret;
+    }
+#endif
+  }
 #if TEST_STD_VER >= 23
   {
     std::pair p{3, 4};
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate.verify.cpp
index b2b6f51e98245..886ce7ac8d4a0 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate.verify.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate.verify.cpp
@@ -15,7 +15,21 @@
 
 #include <memory>
 
+struct incomplete;
+
 void f() {
+  {
     std::allocator<int> a;
     a.allocate(3); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  }
+  {
+    std::allocator<void> a;
+    [[maybe_unused]] auto b =
+        a.allocate(3); // expected-error@*:* {{invalid application of 'sizeof' to an incomplete type 'void'}}
+  }
+  {
+    std::allocator<incomplete> a;
+    [[maybe_unused]] auto b =
+        a.allocate(3); // expected-error@*:* {{invalid application of 'sizeof' to an incomplete type 'incomplete'}}
+  }
 }
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate_at_least.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate_at_least.verify.cpp
new file mode 100644
index 0000000000000..cfee54198e099
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate_at_least.verify.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// allocator:
+// T* allocate_at_least(size_t n);
+
+#include <memory>
+
+struct incomplete;
+
+void f() {
+  {
+    std::allocator<int> a;
+    a.allocate_at_least(3); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  }
+  {
+    std::allocator<void> a;
+    [[maybe_unused]] auto b =
+        a.allocate_at_least(3); // expected-error@*:* {{invalid application of 'sizeof' to an incomplete type 'void'}}
+  }
+  {
+    std::allocator<incomplete> a;
+    [[maybe_unused]] auto b = a.allocate_at_least(
+        3); // expected-error@*:* {{invalid application of 'sizeof' to an incomplete type 'incomplete'}}
+  }
+}
diff --git a/libcxx/test/std/utilities/memory/unique.ptr/noexcept_operator_star.compile.pass.cpp b/libcxx/test/std/utilities/memory/unique.ptr/noexcept_operator_star.compile.pass.cpp
new file mode 100644
index 0000000000000..a2d788005f0cd
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/unique.ptr/noexcept_operator_star.compile.pass.cpp
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// unique_ptr
+
+// add_lvalue_reference_t<T> operator*() const noexcept(noexcept(*declval<pointer>()));
+
+// Dereferencing pointer directly in noexcept fails for a void pointer.  This
+// is not SFINAE-ed away leading to a hard error. The issue was originally
+// triggered by
+// test/std/utilities/memory/unique.ptr/iterator_concept_conformance.compile.pass.cpp
+//
+// This test validates whether the code compiles.
+
+#include <memory>
+
+extern const std::unique_ptr<void> p;
+void f() { [[maybe_unused]] bool b = noexcept(p.operator*()); }
diff --git a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
index 04a1451863c90..e802776f52bfc 100644
--- a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
@@ -33,26 +33,22 @@ struct Tuple_helper<std::void_t<std::common_reference_t<Ts, Us>...>, UserTuple<T
   using type = UserTuple<std::common_reference_t<Ts, Us>...>;
 };
 
-namespace std {
 template <class... Ts, class... Us, template <class> class TQual, template <class> class UQual>
-struct basic_common_reference< ::UserTuple<Ts...>, ::UserTuple<Us...>, TQual, UQual>
+struct std::basic_common_reference< ::UserTuple<Ts...>, ::UserTuple<Us...>, TQual, UQual>
     : ::Tuple_helper<void, UserTuple<TQual<Ts>...>, UserTuple<UQual<Us>...> > {};
-} // namespace std
 
 struct X2 {};
 struct Y2 {};
 struct Z2 {};
 
-namespace std {
 template <>
-struct common_type<X2, Y2> {
+struct std::common_type<X2, Y2> {
   using type = Z2;
 };
 template <>
-struct common_type<Y2, X2> {
+struct std::common_type<Y2, X2> {
   using type = Z2;
 };
-} // namespace std
 
 // (6.1)
 //  -- If sizeof...(T) is zero, there shall be no member type.
diff --git a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_type.pass.cpp b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_type.pass.cpp
index 66a29a37a3688..82929a3f3eee3 100644
--- a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_type.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_type.pass.cpp
@@ -33,29 +33,28 @@ struct bad_reference_wrapper {
     operator T&() const;
 };
 
-namespace std
-{
-    template <typename T>
-    struct common_type<T, ::S<T> >
-    {
-        typedef S<T> type;
-    };
+template <typename T>
+struct std::common_type<T, ::S<T> > {
+  typedef S<T> type;
+};
 
-    template <class T>
-    struct common_type< ::S<T>, T> {
-      typedef S<T> type;
-    };
+template <class T>
+struct std::common_type< ::S<T>, T> {
+  typedef S<T> type;
+};
 
 //  P0548
-    template <class T>
-    struct common_type< ::S<T>, ::S<T> > {
-      typedef S<T> type;
-    };
-
-    template <> struct common_type< ::S<long>, long> {};
-    template <> struct common_type<long, ::S<long> > {};
-    template <> struct common_type< ::X<double>, ::X<double> > {};
-}
+template <class T>
+struct std::common_type< ::S<T>, ::S<T> > {
+  typedef S<T> type;
+};
+
+template <>
+struct std::common_type< ::S<long>, long> {};
+template <>
+struct std::common_type<long, ::S<long> > {};
+template <>
+struct std::common_type< ::X<double>, ::X<double> > {};
 
 template <class> struct VoidT { typedef void type; };
 
diff --git a/libcxx/test/std/utilities/optional/optional.hash/hash.pass.cpp b/libcxx/test/std/utilities/optional/optional.hash/hash.pass.cpp
index cb23e03e7ca70..ae14b571f7388 100644
--- a/libcxx/test/std/utilities/optional/optional.hash/hash.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.hash/hash.pass.cpp
@@ -23,15 +23,11 @@
 struct A {};
 struct B {};
 
-namespace std {
-
 template <>
-struct hash<B> {
+struct std::hash<B> {
   std::size_t operator()(B const&) noexcept(false) { return 0; }
 };
 
-}
-
 int main(int, char**)
 {
     using std::optional;
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
index 5b04e5a35aafa..49b4d21a28066 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
@@ -44,15 +44,7 @@ int main(int, char**)
     {
         optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(*opt), X&);
-        LIBCPP_STATIC_ASSERT(noexcept(*opt));
-        // ASSERT_NOT_NOEXCEPT(*opt);
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator*() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(*opt);
     }
     {
         optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
index f323cd1a5e405..ff86d9534faf6 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
@@ -37,15 +37,7 @@ int main(int, char**)
     {
         const optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(*opt), X const&);
-        LIBCPP_STATIC_ASSERT(noexcept(*opt));
-        // ASSERT_NOT_NOEXCEPT(*opt);
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator*() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(*opt);
     }
     {
         constexpr optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const_rvalue.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const_rvalue.pass.cpp
index 68591c5e2dbcb..646857fdc0465 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const_rvalue.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const_rvalue.pass.cpp
@@ -37,15 +37,7 @@ int main(int, char**)
     {
         const optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(*std::move(opt)), X const &&);
-        LIBCPP_STATIC_ASSERT(noexcept(*opt));
-        // ASSERT_NOT_NOEXCEPT(*std::move(opt));
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator*() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(*std::move(opt));
     }
     {
         constexpr optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_rvalue.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_rvalue.pass.cpp
index 67edbb903353e..16bf2e4336c69 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_rvalue.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_rvalue.pass.cpp
@@ -44,15 +44,7 @@ int main(int, char**)
     {
         optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(*std::move(opt)), X&&);
-        LIBCPP_STATIC_ASSERT(noexcept(*opt));
-        // ASSERT_NOT_NOEXCEPT(*std::move(opt));
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator*() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(*std::move(opt));
     }
     {
         optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp
index 7cf043f9375c1..2b5fba546ef42 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp
@@ -41,14 +41,7 @@ int main(int, char**)
     {
         std::optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(opt.operator->()), X*);
-        // ASSERT_NOT_NOEXCEPT(opt.operator->());
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator->() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(opt.operator->());
     }
     {
         optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp
index b9c1fe7794b66..d8ce932bd7810 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp
@@ -40,14 +40,7 @@ int main(int, char**)
     {
         const std::optional<X> opt; ((void)opt);
         ASSERT_SAME_TYPE(decltype(opt.operator->()), X const*);
-        // ASSERT_NOT_NOEXCEPT(opt.operator->());
-        // FIXME: This assertion fails with GCC because it can see that
-        // (A) operator->() is constexpr, and
-        // (B) there is no path through the function that throws.
-        // It's arguable if this is the correct behavior for the noexcept
-        // operator.
-        // Regardless this function should still be noexcept(false) because
-        // it has a narrow contract.
+        ASSERT_NOEXCEPT(opt.operator->());
     }
     {
         constexpr optional<X> opt(X{});
diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.general.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.general.pass.cpp
new file mode 100644
index 0000000000000..d55a0c2abd464
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.general.pass.cpp
@@ -0,0 +1,206 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [inout.ptr], function template inout_ptr
+// template<class Pointer = void, class Smart, class... Args>
+//   auto inout_ptr(Smart& s, Args&&... args);                 // since c++23
+
+#include <cassert>
+#include <memory>
+#include <utility>
+
+#include "../types.h"
+
+// Test updating the ownership of an `inout_ptr_t`-managed pointer for an API with a non-void pointer type.
+// The API returns a new valid object.
+void test_replace_int_p() {
+  auto replace_int_p = [](int** pp) {
+    assert(**pp == 90);
+
+    delete *pp;
+    *pp = new int{84};
+  };
+
+  // raw pointer
+  {
+    auto rPtr = new int{90};
+
+    replace_int_p(std::inout_ptr<int*>(rPtr));
+    assert(*rPtr == 84);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    auto uPtr = std::make_unique<int>(90);
+
+    replace_int_p(std::inout_ptr(uPtr));
+    assert(*uPtr == 84);
+  }
+
+  {
+    MoveOnlyDeleter<int> del;
+    std::unique_ptr<int, MoveOnlyDeleter<int>> uPtr{new int{90}};
+
+    replace_int_p(std::inout_ptr(uPtr, std::move(del)));
+    assert(*uPtr == 84);
+    assert(uPtr.get_deleter().wasMoveInitilized == true);
+  }
+
+  // pointer-like ConstructiblePtr
+  {
+    ConstructiblePtr<int> cPtr(new int{90});
+
+    replace_int_p(std::inout_ptr(cPtr));
+    assert(cPtr == 84);
+  }
+
+  // pointer-like ResettablePtr
+  {
+    ResettablePtr<int> rPtr(new int{90});
+
+    replace_int_p(std::inout_ptr(rPtr));
+    assert(rPtr == 84);
+  }
+
+  // pointer-like NonConstructiblePtr
+  {
+    NonConstructiblePtr<int> nPtr;
+    nPtr.reset(new int{90});
+
+    replace_int_p(std::inout_ptr(nPtr));
+    assert(nPtr == 84);
+  }
+}
+
+// Test updating the ownership of an `inout_ptr_t`-managed pointer for an API with a non-void pointer type.
+// The API returns `nullptr`.
+void test_replace_int_p_with_nullptr() {
+  auto replace_int_p_with_nullptr = [](int** pp) -> void {
+    assert(**pp == 90);
+
+    delete *pp;
+    *pp = nullptr;
+  };
+
+  // raw pointer
+  {
+    // LWG-3897 inout_ptr will not update raw pointer to null
+    auto rPtr = new int{90};
+
+    replace_int_p_with_nullptr(std::inout_ptr<int*>(rPtr));
+    assert(rPtr == nullptr);
+  }
+
+  // std::unique_ptr
+  {
+    auto uPtr = std::make_unique<int>(90);
+
+    replace_int_p_with_nullptr(std::inout_ptr(uPtr));
+    assert(uPtr == nullptr);
+  }
+}
+
+// Test updating the ownership of an `inout_ptr_t`-managed pointer for an API with a void pointer type.
+// The API returns a new valid object.
+void test_replace_int_void_p() {
+  auto replace_int_void_p = [](void** pp) {
+    assert(*(static_cast<int*>(*pp)) == 90);
+
+    delete static_cast<int*>(*pp);
+    *pp = new int{84};
+  };
+
+  // raw pointer
+  {
+    auto rPtr = new int{90};
+
+    replace_int_void_p(std::inout_ptr<int*>(rPtr));
+    assert(*rPtr == 84);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    auto uPtr = std::make_unique<int>(90);
+
+    replace_int_void_p(std::inout_ptr(uPtr));
+    assert(*uPtr == 84);
+  }
+}
+
+// Test updating the ownership of an `inout_ptr_t`-managed pointer for an API with a non-void pointer type.
+// The API returns `nullptr`.
+void test_replace_int_void_p_with_nullptr() {
+  auto replace_int_void_p_with_nullptr = [](void** pp) {
+    assert(*(static_cast<int*>(*pp)) == 90);
+
+    delete static_cast<int*>(*pp);
+    *pp = nullptr;
+  };
+
+  // raw pointer
+  {
+    auto rPtr = new int{90};
+
+    replace_int_void_p_with_nullptr(std::inout_ptr<int*>(rPtr));
+    assert(rPtr == nullptr);
+  }
+
+  // std::unique_ptr
+  {
+    auto uPtr = std::make_unique<int>(90);
+
+    replace_int_void_p_with_nullptr(std::inout_ptr(uPtr));
+    assert(uPtr == nullptr);
+  }
+}
+
+// Test updating the ownership of an `inout_ptr_t`-managed pointer for an API with a void pointer type.
+// The API returns a new valid object.
+void test_replace_nullptr_with_int_p() {
+  auto replace_nullptr_with_int_p = [](int** pp) {
+    assert(*pp == nullptr);
+
+    *pp = new int{84};
+  };
+
+  // raw pointer
+  {
+    int* rPtr = nullptr;
+
+    replace_nullptr_with_int_p(std::inout_ptr<int*>(rPtr));
+    assert(*rPtr == 84);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    std::unique_ptr<int> uPtr;
+
+    replace_nullptr_with_int_p(std::inout_ptr(uPtr));
+    assert(*uPtr == 84);
+  }
+}
+
+int main(int, char**) {
+  test_replace_int_p();
+  test_replace_int_p_with_nullptr();
+  test_replace_int_void_p();
+  test_replace_int_void_p_with_nullptr();
+  test_replace_nullptr_with_int_p();
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp
new file mode 100644
index 0000000000000..d6d70782c5eb5
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [inout.ptr], function template inout_ptr
+// template<class Pointer = void, class Smart, class... Args>
+//   auto inout_ptr(Smart& s, Args&&... args);                 // since c++23
+
+#include <memory>
+#include <tuple>
+
+#include "../types.h"
+
+int main(int, char**) {
+  // `std::inout_ptr<>` does not support `std::shared_ptr<>`.
+  {
+    std::shared_ptr<int> sPtr;
+
+    // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}std::shared_ptr<> is not supported}}
+    std::ignore = std::inout_ptr(sPtr);
+    // expected-error@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr<int>' to 'std::inout_ptr_t<shared_ptr<int>, _Ptr>' (aka 'inout_ptr_t<std::shared_ptr<int>, int *>'}}
+    std::ignore = std::inout_ptr<int*>(sPtr);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.convert.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.convert.pass.cpp
new file mode 100644
index 0000000000000..230402af07cfe
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.convert.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [inout.ptr.t], class template inout_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class inout_ptr_t;                                        // since c++23
+
+// operator Pointer*() const noexcept;
+// operator void**() const noexcept;
+
+#include <cassert>
+#include <concepts>
+#include <memory>
+
+int main(int, char**) {
+  // operator Pointer*()
+  {
+    std::unique_ptr<int> uPtr = std::make_unique<int>(84);
+
+    const std::inout_ptr_t<std::unique_ptr<int>, int*> ioPtr{uPtr};
+
+    static_assert(noexcept(ioPtr.operator int**()));
+    std::same_as<int**> decltype(auto) pPtr = ioPtr.operator int**();
+
+    assert(**pPtr == 84);
+  }
+
+  {
+    std::unique_ptr<int, std::default_delete<int>> uPtr = std::make_unique<int>(84);
+
+    const std::inout_ptr_t<decltype(uPtr), int*, std::default_delete<int>> ioPtr{uPtr, std::default_delete<int>{}};
+
+    static_assert(noexcept(ioPtr.operator int**()));
+    std::same_as<int**> decltype(auto) pPtr = ioPtr.operator int**();
+
+    assert(**pPtr == 84);
+  }
+
+  // operator void**()
+  {
+    std::unique_ptr<int> uPtr = std::make_unique<int>(84);
+
+    const std::inout_ptr_t<std::unique_ptr<int>, void*> ioPtr{uPtr};
+
+    static_assert(noexcept(ioPtr.operator void**()));
+    std::same_as<void**> decltype(auto) pPtr = ioPtr.operator void**();
+
+    assert(**reinterpret_cast<int**>(pPtr) == 84);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.ctor.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.ctor.pass.cpp
new file mode 100644
index 0000000000000..c7e173a76e728
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.ctor.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [inout.ptr.t], class template inout_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class inout_ptr_t;                                        // since c++23
+
+// explicit inout_ptr_t(Smart&, Args...);
+
+#include <cassert>
+#include <memory>
+
+#include "test_convertible.h"
+#include "../types.h"
+
+int main(int, char**) {
+  {
+    std::unique_ptr<int> uPtr;
+
+    std::inout_ptr_t<std::unique_ptr<int>, int*>{uPtr};
+
+    static_assert(
+        !test_convertible<std::inout_ptr_t<std::unique_ptr<int>, int*>>(), "This constructor must be explicit");
+
+    // Test the state of the pointer after construction. Complete tests are available in inout_ptr.general.pass.cpp.
+    assert(uPtr == nullptr);
+  }
+
+  {
+    auto deleter = [](auto* p) { delete p; };
+    std::unique_ptr<int, decltype(deleter)> uPtr;
+
+    std::inout_ptr_t<std::unique_ptr<int, decltype(deleter)>, int*>{uPtr};
+
+    static_assert(!test_convertible<std::inout_ptr_t<std::unique_ptr<int, decltype(deleter)>, int*>>(),
+                  "This constructor must be explicit");
+
+    // Test the state of the pointer after construction. Complete tests are available in inout_ptr.general.pass.cpp.
+    assert(uPtr == nullptr);
+  }
+
+  {
+    std::unique_ptr<int, MoveOnlyDeleter<int>> uPtr;
+
+    std::inout_ptr_t<decltype(uPtr), int*, MoveOnlyDeleter<int>>{uPtr, MoveOnlyDeleter<int>{}};
+
+    // Test the state of the pointer after construction. Complete tests are available in inout_ptr.general.pass.cpp.
+    assert(uPtr == nullptr);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.verify.cpp
new file mode 100644
index 0000000000000..2270c4c3fc785
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr_t.verify.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [inout.ptr.t], class template inout_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class inout_ptr_t;                                        // since c++23
+
+#include <memory>
+
+int main(int, char**) {
+  // `std::inout_ptr<>` does not support `std::shared_ptr<>`.
+  {
+    std::shared_ptr<int> sPtr;
+
+    // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}std::shared_ptr<> is not supported with std::inout_ptr.}}
+    std::inout_ptr_t<std::shared_ptr<int>, int*>{sPtr};
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.general.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.general.pass.cpp
new file mode 100644
index 0000000000000..a78e22f69abc0
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.general.pass.cpp
@@ -0,0 +1,230 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [out.ptr], function template out_ptr
+// template<class Pointer = void, class Smart, class... Args>
+//   auto out_ptr(Smart& s, Args&&... args);                   // since c++23
+
+#include <cassert>
+#include <memory>
+#include <utility>
+
+#include "../types.h"
+
+// Test updating an `out_ptr_t`-managed pointer for an API with a non-void pointer type.
+// The API returns a new valid object.
+void test_get_int_p() {
+  auto get_int_p = [](int** pp) { *pp = new int{84}; };
+
+  // raw pointer
+  {
+    int* rPtr;
+
+    get_int_p(std::out_ptr<int*>(rPtr));
+    assert(*rPtr == 84);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    std::unique_ptr<int> uPtr;
+
+    get_int_p(std::out_ptr(uPtr));
+    assert(*uPtr == 84);
+  }
+
+  {
+    MoveOnlyDeleter<int> del;
+    std::unique_ptr<int, MoveOnlyDeleter<int>> uPtr;
+
+    get_int_p(std::out_ptr(uPtr, std::move(del)));
+    assert(*uPtr == 84);
+    assert(uPtr.get_deleter().wasMoveInitilized == true);
+  }
+
+  // std::shared_ptr
+  {
+    std::shared_ptr<int> sPtr;
+
+    get_int_p(std::out_ptr(sPtr, [](auto* p) {
+      assert(*p == 84);
+
+      delete p;
+    }));
+    assert(*sPtr == 84);
+  }
+
+  // pointer-like ConstructiblePtr
+  {
+    ConstructiblePtr<int> cPtr;
+
+    get_int_p(std::out_ptr(cPtr));
+    assert(cPtr == 84);
+  }
+
+  // pointer-like ResettablePtr
+  {
+    ResettablePtr<int> rPtr{nullptr};
+
+    get_int_p(std::out_ptr(rPtr));
+    assert(rPtr == 84);
+  }
+
+  // NonConstructiblePtr
+  {
+    NonConstructiblePtr<int> nPtr;
+
+    get_int_p(std::out_ptr(nPtr));
+    assert(nPtr == 84);
+  }
+}
+
+// Test updating an `out_ptr_t`-managed pointer for an API with a non-void pointer type.
+// The API returns `nullptr`.
+void test_get_int_p_nullptr() {
+  auto get_int_p_nullptr = [](int** pp) { *pp = nullptr; };
+  // raw pointer
+  {
+    int* rPtr;
+
+    get_int_p_nullptr(std::out_ptr<int*>(rPtr));
+    assert(rPtr == nullptr);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    std::unique_ptr<int> uPtr;
+
+    get_int_p_nullptr(std::out_ptr(uPtr));
+    assert(uPtr == nullptr);
+  }
+
+  // std::shared_ptr
+  {
+    std::shared_ptr<int> sPtr;
+
+    get_int_p_nullptr(std::out_ptr(sPtr, [](auto* p) {
+      assert(p == nullptr);
+
+      delete p;
+    }));
+    assert(sPtr == nullptr);
+  }
+}
+
+// Test updating an `out_ptr_t`-managed pointer for an API with a void pointer type.
+// The API returns a new valid object.
+void test_get_int_void_p() {
+  auto get_int_void_p = [](void** pp) { *(reinterpret_cast<int**>(pp)) = new int{84}; };
+
+  // raw pointer
+  {
+    int* rPtr;
+
+    get_int_void_p(std::out_ptr(rPtr));
+    assert(*rPtr == 84);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    std::unique_ptr<int> uPtr;
+
+    get_int_void_p(std::out_ptr(uPtr));
+    assert(*uPtr == 84);
+  }
+
+  // std::shared_ptr
+  {
+    std::shared_ptr<int> sPtr;
+
+    get_int_void_p(std::out_ptr(sPtr, [](auto* p) {
+      assert(*p == 84);
+
+      delete p;
+    }));
+    assert(*sPtr == 84);
+  }
+
+  // pointer-like ConstructiblePtr
+  {
+    ConstructiblePtr<int> cPtr;
+
+    get_int_void_p(std::out_ptr(cPtr));
+    assert(cPtr == 84);
+  }
+
+  // pointer-like ResettablePtr
+  {
+    ResettablePtr<int> rPtr{nullptr};
+
+    get_int_void_p(std::out_ptr(rPtr));
+    assert(rPtr == 84);
+  }
+
+  // NonConstructiblePtr
+  {
+    NonConstructiblePtr<int> nPtr;
+
+    get_int_void_p(std::out_ptr(nPtr));
+    assert(nPtr == 84);
+  }
+}
+
+// Test updating an `out_ptr_t`-managed pointer for an API with a void pointer type.
+// The API returns `nullptr`.
+void test_get_int_void_p_nullptr() {
+  auto get_int_void_p_nullptr = [](void** pp) { *pp = nullptr; };
+
+  // raw pointer
+  {
+    int* rPtr;
+
+    get_int_void_p_nullptr(std::out_ptr<int*>(rPtr));
+    assert(rPtr == nullptr);
+
+    delete rPtr;
+  }
+
+  // std::unique_ptr
+  {
+    std::unique_ptr<int> uPtr;
+
+    get_int_void_p_nullptr(std::out_ptr(uPtr));
+    assert(uPtr == nullptr);
+  }
+
+  // std::shared_ptr
+  {
+    std::shared_ptr<int> sPtr;
+
+    get_int_void_p_nullptr(std::out_ptr(sPtr, [](auto* p) {
+      assert(p == nullptr);
+
+      delete p;
+    }));
+    assert(sPtr == nullptr);
+  }
+}
+
+int main(int, char**) {
+  test_get_int_p();
+  test_get_int_p_nullptr();
+  test_get_int_void_p();
+  test_get_int_void_p_nullptr();
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp
new file mode 100644
index 0000000000000..1fe78ecb22789
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [out.ptr], function template out_ptr
+// template<class Pointer = void, class Smart, class... Args>
+//   auto out_ptr(Smart& s, Args&&... args);                   // since c++23
+
+#include <memory>
+#include <tuple>
+
+#include "../types.h"
+
+int main(int, char**) {
+  // `std::out_ptr<>` requires `std::shared_ptr<>` with a deleter.
+  {
+    std::shared_ptr<int> sPtr;
+
+    // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}Using std::shared_ptr<> without a deleter in std::out_ptr is not supported.}}
+    std::ignore = std::out_ptr(sPtr);
+    // expected-error@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr<int>' to 'std::out_ptr_t<shared_ptr<int>, _Ptr>' (aka 'out_ptr_t<std::shared_ptr<int>, int *>')}}
+    std::ignore = std::out_ptr<int*>(sPtr);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.convert.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.convert.pass.cpp
new file mode 100644
index 0000000000000..828a5d0ba31ea
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.convert.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [out.ptr.t], class template out_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class out_ptr_t;                                          // since c++23
+
+// operator Pointer*() const noexcept;
+// operator void**() const noexcept;
+
+#include <cassert>
+#include <concepts>
+#include <memory>
+
+int main(int, char**) {
+  // operator Pointer*()
+  {
+    std::unique_ptr<int> uPtr;
+
+    const std::out_ptr_t<std::unique_ptr<int>, int*> oPtr{uPtr};
+
+    static_assert(noexcept(oPtr.operator int**()));
+    std::same_as<int**> decltype(auto) pPtr = oPtr.operator int**();
+
+    assert(*pPtr == nullptr);
+  }
+
+  {
+    std::unique_ptr<int, std::default_delete<int>> uPtr;
+
+    const std::out_ptr_t<decltype(uPtr), int*, std::default_delete<int>> oPtr{uPtr, std::default_delete<int>{}};
+
+    static_assert(noexcept(oPtr.operator int**()));
+    std::same_as<int**> decltype(auto) pPtr = oPtr.operator int**();
+
+    assert(*pPtr == nullptr);
+  }
+
+  // operator void**()
+  {
+    std::unique_ptr<int> uPtr;
+
+    const std::out_ptr_t<std::unique_ptr<int>, void*> oPtr{uPtr};
+
+    static_assert(noexcept(oPtr.operator void**()));
+    std::same_as<void**> decltype(auto) pPtr = oPtr.operator void**();
+
+    assert(*pPtr == nullptr);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.ctor.pass.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.ctor.pass.cpp
new file mode 100644
index 0000000000000..29e1258845bc9
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.ctor.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [out.ptr.t], class template out_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class out_ptr_t;                                          // since c++23
+
+// explicit out_ptr_t(Smart&, Args...);
+
+#include <cassert>
+#include <memory>
+
+#include "test_convertible.h"
+#include "../types.h"
+
+int main(int, char**) {
+  {
+    std::unique_ptr<int> uPtr;
+
+    std::out_ptr_t<std::unique_ptr<int>, int*>{uPtr};
+
+    static_assert(!test_convertible<std::out_ptr_t<std::unique_ptr<int>, int*>>(), "This constructor must be explicit");
+
+    // Test the state of the pointer after construction. Complete tests are available in out_ptr.general.pass.cpp
+    assert(uPtr == nullptr);
+  }
+
+  {
+    std::unique_ptr<int, std::default_delete<int>> uPtr;
+
+    std::out_ptr_t<decltype(uPtr), int*, std::default_delete<int>>{uPtr, std::default_delete<int>{}};
+
+    static_assert(!test_convertible<std::out_ptr_t<decltype(uPtr), int*, std::default_delete<int>>>(),
+                  "This constructor must be explicit");
+
+    // Test the state of the pointer after construction. Complete tests are available in out_ptr.general.pass.cpp
+    assert(uPtr == nullptr);
+  }
+
+  {
+    std::unique_ptr<int, MoveOnlyDeleter<int>> uPtr;
+
+    std::out_ptr_t<decltype(uPtr), int*, MoveOnlyDeleter<int>>{uPtr, MoveOnlyDeleter<int>{}};
+
+    // Test the state of the pointer after construction. Complete tests are available in out_ptr.general.pass.cpp
+    assert(uPtr == nullptr);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.verify.cpp
new file mode 100644
index 0000000000000..cbe96e752ac92
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr_t.verify.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <memory>
+
+// [out.ptr.t], class template out_ptr_t
+// template<class Smart, class Pointer, class... Args>
+//   class out_ptr_t;                                          // since c++23
+
+#include <memory>
+
+int main(int, char**) {
+  // `std::out_ptr_t<>` requires `std::shared_ptr<>` with a deleter.
+  {
+    std::shared_ptr<int> sPtr;
+
+    // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}Using std::shared_ptr<> without a deleter in std::out_ptr is not supported.}}
+    std::out_ptr_t<std::shared_ptr<int>, int*>{sPtr};
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/smartptr/adapt/types.h b/libcxx/test/std/utilities/smartptr/adapt/types.h
new file mode 100644
index 0000000000000..0da6007ba394f
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/adapt/types.h
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_LIBCXX_UTILITIES_SMARTPTR_ADAPT_TYPES_H
+#define TEST_LIBCXX_UTILITIES_SMARTPTR_ADAPT_TYPES_H
+
+#include <type_traits>
+#include <memory>
+
+#include "test_macros.h"
+
+// Custom deleters.
+
+template <typename T>
+struct MoveOnlyDeleter {
+  MoveOnlyDeleter()                                  = default;
+  MoveOnlyDeleter(const MoveOnlyDeleter&)            = delete;
+  MoveOnlyDeleter& operator=(const MoveOnlyDeleter&) = delete;
+  MoveOnlyDeleter(MoveOnlyDeleter&&) : wasMoveInitilized{true} {}
+  MoveOnlyDeleter& operator=(MoveOnlyDeleter&&) = default;
+
+  void operator()(T* p) const { delete p; }
+
+  bool wasMoveInitilized = false;
+};
+
+// Custom pointer types.
+
+template <typename T>
+struct ConstructiblePtr {
+  using pointer = T*;
+  std::unique_ptr<T> ptr;
+
+  ConstructiblePtr() = default;
+  explicit ConstructiblePtr(T* p) : ptr{p} {}
+
+  auto operator==(T val) { return *ptr == val; }
+
+  auto* get() const { return ptr.get(); }
+
+  void release() { ptr.release(); }
+};
+
+LIBCPP_STATIC_ASSERT(std::is_same_v<std::__pointer_of_t< ConstructiblePtr<int>>, int* >);
+static_assert(std::is_constructible_v< ConstructiblePtr<int>, int* >);
+
+struct ResetArg {};
+
+template <typename T>
+struct ResettablePtr {
+  using element_type = T;
+  std::unique_ptr<T> ptr;
+
+  explicit ResettablePtr(T* p) : ptr{p} {}
+
+  auto operator*() const { return *ptr; }
+
+  auto operator==(T val) { return *ptr == val; }
+
+  void reset() { ptr.reset(); }
+  void reset(T* p, ResetArg) { ptr.reset(p); }
+
+  auto* get() const { return ptr.get(); }
+
+  void release() { ptr.release(); }
+};
+
+LIBCPP_STATIC_ASSERT(std::is_same_v<std::__pointer_of_t< ResettablePtr<int>>, int* >);
+static_assert(std::is_constructible_v< ResettablePtr<int>, int* >);
+
+template <typename T>
+struct NonConstructiblePtr : public ResettablePtr<T> {
+  NonConstructiblePtr() : NonConstructiblePtr::ResettablePtr(nullptr) {};
+
+  void reset(T* p) { ResettablePtr<T>::ptr.reset(p); }
+};
+
+LIBCPP_STATIC_ASSERT(std::is_same_v<std::__pointer_of_t< NonConstructiblePtr<int>>, int* >);
+static_assert(!std::is_constructible_v< NonConstructiblePtr<int>, int* >);
+
+#endif // TEST_LIBCXX_UTILITIES_SMARTPTR_ADAPT_TYPES_H
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp
index 4a1e9704afc9c..17902a4ce27a4 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp
@@ -14,12 +14,50 @@
 
 #include <memory>
 #include <cassert>
+#include <vector>
 
 #include "test_macros.h"
 
+#if TEST_STD_VER >= 11
+struct ThrowDereference {
+  TEST_CONSTEXPR_CXX23 ThrowDereference& operator*() noexcept(false);
+  TEST_CONSTEXPR_CXX23 operator bool() const { return false; }
+};
+
+struct Deleter {
+  using pointer = ThrowDereference;
+  TEST_CONSTEXPR_CXX23 void operator()(ThrowDereference&) const {}
+};
+#endif
+
 TEST_CONSTEXPR_CXX23 bool test() {
-  std::unique_ptr<int> p(new int(3));
-  assert(*p == 3);
+  ASSERT_NOEXCEPT(*(std::unique_ptr<void>{}));
+#if TEST_STD_VER >= 11
+  static_assert(noexcept(*std::declval<std::unique_ptr<void>>()), "");
+#endif
+  {
+    std::unique_ptr<int> p(new int(3));
+    assert(*p == 3);
+    ASSERT_NOEXCEPT(*p);
+  }
+#if TEST_STD_VER >= 11
+  {
+    std::unique_ptr<std::vector<int>> p(new std::vector<int>{3, 4, 5});
+    assert((*p)[0] == 3);
+    assert((*p)[1] == 4);
+    assert((*p)[2] == 5);
+    ASSERT_NOEXCEPT(*p);
+  }
+  {
+    std::unique_ptr<ThrowDereference> p;
+    ASSERT_NOEXCEPT(*p);
+  }
+  {
+    // The noexcept status of *unique_ptr<>::pointer should be propagated.
+    std::unique_ptr<ThrowDereference, Deleter> p;
+    ASSERT_NOT_NOEXCEPT(*p);
+  }
+#endif
 
   return true;
 }
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_incomplete.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_incomplete.pass.cpp
index 7585911e2e8d2..a2cb2e989a571 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_incomplete.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_incomplete.pass.cpp
@@ -31,9 +31,8 @@ template <class T> constexpr bool is_complete() { return is_complete<T>(0); }
 struct Dummy1 {};
 struct Dummy2 {};
 
-namespace std {
-template <> struct tuple_size<Dummy1> : public integral_constant<std::size_t, 0> {};
-}
+template <>
+struct std::tuple_size<Dummy1> : public integral_constant<std::size_t, 0> {};
 
 template <class T>
 void test_complete() {
diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp
new file mode 100644
index 0000000000000..792f05d2f82f4
--- /dev/null
+++ b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: availability-pmr-missing
+
+// <memory_resource>
+
+// template <class T> class polymorphic_allocator
+
+// friend bool operator==(const polymorphic_allocator& a,
+//                        const polymorphic_allocator& b) noexcept
+
+#include <memory_resource>
+#include <cassert>
+#include <vector>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::pmr::unsynchronized_pool_resource a;
+  std::pmr::vector<int> vec(&a);
+
+  assert(vec.get_allocator() == &a);
+  static_assert(noexcept(vec.get_allocator() == &a));
+
+  // LWG3683 added operator== after C++20. In C++20 operator!= is generated by
+  // the compiler. Libc++ adds operator!= in C++17 as an extension. MSVC STL
+  // and libstdc++ have done the same so test this extension unconditionally.
+  std::pmr::unsynchronized_pool_resource b;
+
+  assert(vec.get_allocator() != &b);
+  static_assert(noexcept(vec.get_allocator() != &b));
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_like.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_like.pass.cpp
index 3810c4450e57f..3ff416ed5737e 100644
--- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_like.pass.cpp
+++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_like.pass.cpp
@@ -37,18 +37,14 @@ friend int get(MyPairLike const&)
 
 } // namespace my_ns
 
-namespace std {
-
 template <>
-struct tuple_size<my_ns::MyPairLike> : std::integral_constant<std::size_t, 2> {};
+struct std::tuple_size<my_ns::MyPairLike> : std::integral_constant<std::size_t, 2> {};
 
 template <std::size_t N>
-struct tuple_element<N, my_ns::MyPairLike> {
+struct std::tuple_element<N, my_ns::MyPairLike> {
   using type = int;
 };
 
-} // namespace std
-
 // https://github.com/llvm/llvm-project/issues/65620
 // This used to be a hard error
 static_assert(!std::is_constructible_v<std::pair<int,int>, my_ns::MyPairLike const&>);
diff --git a/libcxx/test/std/utilities/variant/variant.hash/hash.pass.cpp b/libcxx/test/std/utilities/variant/variant.hash/hash.pass.cpp
index f472144403d55..ffd5f8266ec2b 100644
--- a/libcxx/test/std/utilities/variant/variant.hash/hash.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.hash/hash.pass.cpp
@@ -22,14 +22,13 @@
 #include "poisoned_hash_helper.h"
 
 #ifndef TEST_HAS_NO_EXCEPTIONS
-namespace std {
-template <> struct hash<::MakeEmptyT> {
+template <>
+struct std::hash<::MakeEmptyT> {
   std::size_t operator()(const ::MakeEmptyT &) const {
     assert(false);
     return 0;
   }
 };
-}
 #endif
 
 void test_hash_variant() {
@@ -123,17 +122,13 @@ void test_hash_variant_duplicate_elements() {
 struct A {};
 struct B {};
 
-namespace std {
-
 template <>
-struct hash<B> {
+struct std::hash<B> {
   std::size_t operator()(B const&) const {
     return 0;
   }
 };
 
-}
-
 void test_hash_variant_enabled() {
   {
     test_hash_enabled_for_type<std::variant<int> >();
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
index 857e85d00857a..0da23fd58ccaa 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
@@ -163,6 +163,7 @@ void test_caller_accepts_nonconst() {
 
 struct MyVariant : std::variant<short, long, float> {};
 
+// FIXME: This is UB according to [namespace.std]
 namespace std {
 template <std::size_t Index>
 void get(const MyVariant&) {
diff --git a/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
index 0caecbe875d55..a72dcf13174c6 100644
--- a/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
@@ -314,6 +314,7 @@ void test_caller_accepts_nonconst() {
 
 struct MyVariant : std::variant<short, long, float> {};
 
+// FIXME: This is UB according to [namespace.std]
 namespace std {
 template <std::size_t Index>
 void get(const MyVariant&) {
diff --git a/libcxx/test/support/Counter.h b/libcxx/test/support/Counter.h
index 6a51cc991eee7..ed03e8a04a069 100644
--- a/libcxx/test/support/Counter.h
+++ b/libcxx/test/support/Counter.h
@@ -40,16 +40,12 @@ class Counter : public Counter_base
 
 int Counter_base::gConstructed = 0;
 
-namespace std {
-
 template <class T>
-struct hash<Counter<T> >
-{
-    typedef Counter<T> argument_type;
-    typedef std::size_t result_type;
+struct std::hash<Counter<T> > {
+  typedef Counter<T> argument_type;
+  typedef std::size_t result_type;
 
-    std::size_t operator()(const Counter<T>& x) const {return std::hash<T>()(x.get());}
+  std::size_t operator()(const Counter<T>& x) const { return std::hash<T>()(x.get()); }
 };
-}
 
 #endif
diff --git a/libcxx/test/support/atomic_helpers.h b/libcxx/test/support/atomic_helpers.h
index 0266a0961067b..d2f2b751cb47d 100644
--- a/libcxx/test/support/atomic_helpers.h
+++ b/libcxx/test/support/atomic_helpers.h
@@ -11,9 +11,112 @@
 
 #include <cassert>
 #include <cstdint>
+#include <cstddef>
+#include <type_traits>
 
 #include "test_macros.h"
 
+#if defined(TEST_COMPILER_CLANG)
+#  define TEST_ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
+#  define TEST_ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE
+#  define TEST_ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE
+#  define TEST_ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE
+#  define TEST_ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE
+#  define TEST_ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE
+#elif defined(TEST_COMPILER_GCC)
+#  define TEST_ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
+#  define TEST_ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
+#  define TEST_ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+#  define TEST_ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
+#  define TEST_ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
+#  define TEST_ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
+#elif TEST_COMPILER_MSVC
+// This is lifted from STL/stl/inc/atomic on github for the purposes of
+// keeping the tests compiling for MSVC's STL. It's not a perfect solution
+// but at least the tests will keep running.
+//
+// Note MSVC's STL never produces a type that is sometimes lock free, but not always lock free.
+template <class T, size_t Size = sizeof(T)>
+constexpr bool msvc_is_lock_free_macro_value() {
+  return (Size <= 8 && (Size & Size - 1) == 0) ? 2 : 0;
+}
+#  define TEST_ATOMIC_CHAR_LOCK_FREE ::msvc_is_lock_free_macro_value<char>()
+#  define TEST_ATOMIC_SHORT_LOCK_FREE ::msvc_is_lock_free_macro_value<short>()
+#  define TEST_ATOMIC_INT_LOCK_FREE ::msvc_is_lock_free_macro_value<int>()
+#  define TEST_ATOMIC_LONG_LOCK_FREE ::msvc_is_lock_free_macro_value<long>()
+#  define TEST_ATOMIC_LLONG_LOCK_FREE ::msvc_is_lock_free_macro_value<long long>()
+#  define TEST_ATOMIC_POINTER_LOCK_FREE ::msvc_is_lock_free_macro_value<void*>()
+#else
+#  error "Unknown compiler"
+#endif
+
+#ifdef TEST_COMPILER_CLANG
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wc++11-extensions"
+#endif
+
+enum class LockFreeStatus : int { unknown = -1, never = 0, sometimes = 1, always = 2 };
+
+// We should really be checking whether the alignment of T is greater-than-or-equal-to the alignment required
+// for T to be atomic, but this is basically impossible to implement portably. Instead, we assume that any type
+// aligned to at least its size is going to be atomic if there exists atomic operations for that size at all,
+// which is true on most platforms. This technically reduces our test coverage in the sense that if a type has
+// an alignment requirement less than its size but could still be made lockfree, LockFreeStatusInfo will report
+// that we don't know whether it is lockfree or not.
+#define COMPARE_TYPES(T, FundamentalT) (sizeof(T) == sizeof(FundamentalT) && TEST_ALIGNOF(T) >= sizeof(T))
+
+template <class T>
+struct LockFreeStatusInfo {
+  static const LockFreeStatus value = LockFreeStatus(
+      COMPARE_TYPES(T, char)
+          ? TEST_ATOMIC_CHAR_LOCK_FREE
+          : (COMPARE_TYPES(T, short)
+                 ? TEST_ATOMIC_SHORT_LOCK_FREE
+                 : (COMPARE_TYPES(T, int)
+                        ? TEST_ATOMIC_INT_LOCK_FREE
+                        : (COMPARE_TYPES(T, long)
+                               ? TEST_ATOMIC_LONG_LOCK_FREE
+                               : (COMPARE_TYPES(T, long long)
+                                      ? TEST_ATOMIC_LLONG_LOCK_FREE
+                                      : (COMPARE_TYPES(T, void*) ? TEST_ATOMIC_POINTER_LOCK_FREE : -1))))));
+
+  static const bool status_known = LockFreeStatusInfo::value != LockFreeStatus::unknown;
+};
+
+#undef COMPARE_TYPES
+
+// This doesn't work in C++03 due to issues with scoped enumerations. Just disable the test.
+#if TEST_STD_VER >= 11
+static_assert(LockFreeStatusInfo<char>::status_known, "");
+static_assert(LockFreeStatusInfo<short>::status_known, "");
+static_assert(LockFreeStatusInfo<int>::status_known, "");
+static_assert(LockFreeStatusInfo<long>::status_known, "");
+static_assert(LockFreeStatusInfo<void*>::status_known, "");
+
+// long long is a bit funky: on some platforms, its alignment is 4 bytes but its size is
+// 8 bytes. In that case, atomics may or may not be lockfree based on their address.
+static_assert(alignof(long long) == sizeof(long long) ? LockFreeStatusInfo<long long>::status_known : true, "");
+
+// Those should always be lock free: hardcode some expected values to make sure our tests are actually
+// testing something meaningful.
+static_assert(LockFreeStatusInfo<char>::value == LockFreeStatus::always, "");
+static_assert(LockFreeStatusInfo<short>::value == LockFreeStatus::always, "");
+static_assert(LockFreeStatusInfo<int>::value == LockFreeStatus::always, "");
+#endif
+
+// These macros are somewhat suprising to use, since they take the values 0, 1, or 2.
+// To make the tests clearer, get rid of them in preference of LockFreeStatusInfo.
+#undef TEST_ATOMIC_CHAR_LOCK_FREE
+#undef TEST_ATOMIC_SHORT_LOCK_FREE
+#undef TEST_ATOMIC_INT_LOCK_FREE
+#undef TEST_ATOMIC_LONG_LOCK_FREE
+#undef TEST_ATOMIC_LLONG_LOCK_FREE
+#undef TEST_ATOMIC_POINTER_LOCK_FREE
+
+#ifdef TEST_COMPILER_CLANG
+#  pragma clang diagnostic pop
+#endif
+
 struct UserAtomicType {
   int i;
 
diff --git a/libcxx/test/support/format.functions.common.h b/libcxx/test/support/format.functions.common.h
index 976df5b741fb9..473b4efe690b0 100644
--- a/libcxx/test/support/format.functions.common.h
+++ b/libcxx/test/support/format.functions.common.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test.support/make_string_header.pass.cpp b/libcxx/test/support/test.support/make_string_header.pass.cpp
index 4cee361552406..8fcc05ec49d7c 100644
--- a/libcxx/test/support/test.support/make_string_header.pass.cpp
+++ b/libcxx/test/support/test.support/make_string_header.pass.cpp
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h
index 1ec719a48b6c8..f51f6e97cbed0 100644
--- a/libcxx/test/support/test_basic_format_arg.h
+++ b/libcxx/test/support/test_basic_format_arg.h
@@ -1,4 +1,5 @@
 //===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index aa819ecd4733b..44bd4a597539d 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -11,6 +11,7 @@
 
 #include <cassert>
 #include <concepts>
+#include <cstdint>
 #include <iterator>
 #include <ranges>
 #include <stdexcept>
@@ -338,60 +339,92 @@ cpp20_random_access_iterator(It) -> cpp20_random_access_iterator<It>;
 
 static_assert(std::random_access_iterator<cpp20_random_access_iterator<int*>>);
 
-template <class It>
-class contiguous_iterator
-{
-    static_assert(std::is_pointer_v<It>, "Things probably break in this case");
+template <std::contiguous_iterator It>
+class contiguous_iterator {
+  It it_;
 
-    It it_;
+  template <std::contiguous_iterator U>
+  friend class contiguous_iterator;
 
-    template <class U> friend class contiguous_iterator;
 public:
-    typedef          std::contiguous_iterator_tag              iterator_category;
-    typedef typename std::iterator_traits<It>::value_type      value_type;
-    typedef typename std::iterator_traits<It>::difference_type difference_type;
-    typedef It                                                 pointer;
-    typedef typename std::iterator_traits<It>::reference       reference;
-    typedef typename std::remove_pointer<It>::type             element_type;
+  using iterator_category = std::contiguous_iterator_tag;
+  using value_type        = typename std::iterator_traits<It>::value_type;
+  using difference_type   = typename std::iterator_traits<It>::difference_type;
+  using pointer           = typename std::iterator_traits<It>::pointer;
+  using reference         = typename std::iterator_traits<It>::reference;
+  using element_type      = value_type;
 
-    TEST_CONSTEXPR_CXX14 It base() const {return it_;}
+  constexpr It base() const { return it_; }
 
-    TEST_CONSTEXPR_CXX14 contiguous_iterator() : it_() {}
-    TEST_CONSTEXPR_CXX14 explicit contiguous_iterator(It it) : it_(it) {}
+  constexpr contiguous_iterator() : it_() {}
+  constexpr explicit contiguous_iterator(It it) : it_(it) {}
 
-    template <class U>
-    TEST_CONSTEXPR_CXX14 contiguous_iterator(const contiguous_iterator<U>& u) : it_(u.it_) {}
+  template <class U>
+  constexpr contiguous_iterator(const contiguous_iterator<U>& u) : it_(u.it_) {}
 
-    template <class U, class = typename std::enable_if<std::is_default_constructible<U>::value>::type>
-    constexpr contiguous_iterator(contiguous_iterator<U>&& u) : it_(u.it_) { u.it_ = U(); }
+  template <class U, class = typename std::enable_if<std::is_default_constructible<U>::value>::type>
+  constexpr contiguous_iterator(contiguous_iterator<U>&& u) : it_(u.it_) {
+    u.it_ = U();
+  }
 
-    TEST_CONSTEXPR reference operator*() const {return *it_;}
-    TEST_CONSTEXPR pointer operator->() const {return it_;}
-    TEST_CONSTEXPR reference operator[](difference_type n) const {return it_[n];}
-
-    TEST_CONSTEXPR_CXX14 contiguous_iterator& operator++() {++it_; return *this;}
-    TEST_CONSTEXPR_CXX14 contiguous_iterator& operator--() {--it_; return *this;}
-    TEST_CONSTEXPR_CXX14 contiguous_iterator operator++(int) {return contiguous_iterator(it_++);}
-    TEST_CONSTEXPR_CXX14 contiguous_iterator operator--(int) {return contiguous_iterator(it_--);}
-
-    TEST_CONSTEXPR_CXX14 contiguous_iterator& operator+=(difference_type n) {it_ += n; return *this;}
-    TEST_CONSTEXPR_CXX14 contiguous_iterator& operator-=(difference_type n) {it_ -= n; return *this;}
-    friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator+(contiguous_iterator x, difference_type n) {x += n; return x;}
-    friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator+(difference_type n, contiguous_iterator x) {x += n; return x;}
-    friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator-(contiguous_iterator x, difference_type n) {x -= n; return x;}
-    friend TEST_CONSTEXPR difference_type operator-(contiguous_iterator x, contiguous_iterator y) {return x.it_ - y.it_;}
-
-    friend TEST_CONSTEXPR bool operator==(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ == y.it_;}
-    friend TEST_CONSTEXPR bool operator!=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ != y.it_;}
-    friend TEST_CONSTEXPR bool operator< (const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ <  y.it_;}
-    friend TEST_CONSTEXPR bool operator<=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ <= y.it_;}
-    friend TEST_CONSTEXPR bool operator> (const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ >  y.it_;}
-    friend TEST_CONSTEXPR bool operator>=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ >= y.it_;}
-
-    friend TEST_CONSTEXPR It base(const contiguous_iterator& i) { return i.it_; }
+  constexpr reference operator*() const { return *it_; }
+  constexpr pointer operator->() const { return it_; }
+  constexpr reference operator[](difference_type n) const { return it_[n]; }
 
-    template <class T>
-    void operator,(T const &) = delete;
+  constexpr contiguous_iterator& operator++() {
+    ++it_;
+    return *this;
+  }
+  constexpr contiguous_iterator& operator--() {
+    --it_;
+    return *this;
+  }
+  constexpr contiguous_iterator operator++(int) { return contiguous_iterator(it_++); }
+  constexpr contiguous_iterator operator--(int) { return contiguous_iterator(it_--); }
+
+  constexpr contiguous_iterator& operator+=(difference_type n) {
+    it_ += n;
+    return *this;
+  }
+  constexpr contiguous_iterator& operator-=(difference_type n) {
+    it_ -= n;
+    return *this;
+  }
+  friend constexpr contiguous_iterator operator+(contiguous_iterator x, difference_type n) {
+    x += n;
+    return x;
+  }
+  friend constexpr contiguous_iterator operator+(difference_type n, contiguous_iterator x) {
+    x += n;
+    return x;
+  }
+  friend constexpr contiguous_iterator operator-(contiguous_iterator x, difference_type n) {
+    x -= n;
+    return x;
+  }
+  friend constexpr difference_type operator-(contiguous_iterator x, contiguous_iterator y) { return x.it_ - y.it_; }
+
+  friend constexpr bool operator==(const contiguous_iterator& x, const contiguous_iterator& y) {
+    return x.it_ == y.it_;
+  }
+  friend constexpr bool operator!=(const contiguous_iterator& x, const contiguous_iterator& y) {
+    return x.it_ != y.it_;
+  }
+  friend constexpr bool operator<(const contiguous_iterator& x, const contiguous_iterator& y) { return x.it_ < y.it_; }
+  friend constexpr bool operator<=(const contiguous_iterator& x, const contiguous_iterator& y) {
+    return x.it_ <= y.it_;
+  }
+  friend constexpr bool operator>(const contiguous_iterator& x, const contiguous_iterator& y) { return x.it_ > y.it_; }
+  friend constexpr bool operator>=(const contiguous_iterator& x, const contiguous_iterator& y) {
+    return x.it_ >= y.it_;
+  }
+
+    // Note no operator<=>, use three_way_contiguous_iterator for testing operator<=>
+
+  friend constexpr It base(const contiguous_iterator& i) { return i.it_; }
+
+  template <class T>
+  void operator,(T const&) = delete;
 };
 template <class It>
 contiguous_iterator(It) -> contiguous_iterator<It>;
@@ -724,17 +757,18 @@ struct common_input_iterator {
 
 #  endif // TEST_STD_VER >= 20
 
-// Iterator adaptor that counts the number of times the iterator has had a successor/predecessor
-// operation or an equality comparison operation called. Has three recorders:
-// * `stride_count`, which records the total number of calls to an op++, op--, op+=, or op-=.
-// * `stride_displacement`, which records the displacement of the calls. This means that both
-//   op++/op+= will increase the displacement counter by 1, and op--/op-= will decrease the
-//   displacement counter by 1.
-// * `equals_count`, which records the total number of calls to an op== or op!=. If compared
-//   against a sentinel object, that sentinel object must call the `record_equality_comparison`
-//   function so that the comparison is counted correctly.
+struct IteratorOpCounts {
+  std::size_t increments = 0; ///< Number of times the iterator moved forward (++it, it++, it+=positive, it-=negative).
+  std::size_t decrements = 0; ///< Number of times the iterator moved backward (--it, it--, it-=positive, it+=negative).
+  std::size_t zero_moves = 0; ///< Number of times a call was made to move the iterator by 0 positions (it+=0, it-=0).
+  std::size_t equal_cmps = 0; ///< Total number of calls to op== or op!=. If compared against a sentinel object, that
+                              ///  sentinel object must call the `record_equality_comparison` function so that the
+                              ///  comparison is counted correctly.
+};
+
+// Iterator adaptor that records its operation counts in a IteratorOpCounts
 template <class It>
-class stride_counting_iterator {
+class operation_counting_iterator {
 public:
     using value_type = typename iter_value_or_void<It>::type;
     using difference_type = std::iter_difference_t<It>;
@@ -746,147 +780,160 @@ class stride_counting_iterator {
         std::conditional_t<std::input_iterator<It>,         std::input_iterator_tag,
         /* else */                                          std::output_iterator_tag
     >>>>>;
+    using iterator_category = iterator_concept;
 
-    stride_counting_iterator() requires std::default_initializable<It> = default;
+    operation_counting_iterator()
+      requires std::default_initializable<It>
+    = default;
 
-    constexpr explicit stride_counting_iterator(It const& it) : base_(base(it)) { }
+    constexpr explicit operation_counting_iterator(It const& it, IteratorOpCounts* counts = nullptr)
+        : base_(base(it)), counts_(counts) {}
 
-    friend constexpr It base(stride_counting_iterator const& it) { return It(it.base_); }
+    constexpr operation_counting_iterator(const operation_counting_iterator& o) { *this = o; }
+    constexpr operation_counting_iterator(operation_counting_iterator&& o) { *this = o; }
 
-    constexpr difference_type stride_count() const { return stride_count_; }
+    constexpr operation_counting_iterator& operator=(const operation_counting_iterator& o) = default;
+    constexpr operation_counting_iterator& operator=(operation_counting_iterator&& o) { return *this = o; }
 
-    constexpr difference_type stride_displacement() const { return stride_displacement_; }
-
-    constexpr difference_type equals_count() const { return equals_count_; }
+    friend constexpr It base(operation_counting_iterator const& it) { return It(it.base_); }
 
     constexpr decltype(auto) operator*() const { return *It(base_); }
 
     constexpr decltype(auto) operator[](difference_type n) const { return It(base_)[n]; }
 
-    constexpr stride_counting_iterator& operator++() {
-        It tmp(base_);
-        base_ = base(++tmp);
-        ++stride_count_;
-        ++stride_displacement_;
-        return *this;
+    constexpr operation_counting_iterator& operator++() {
+      It tmp(base_);
+      base_ = base(++tmp);
+      moved_by(1);
+      return *this;
     }
 
     constexpr void operator++(int) { ++*this; }
 
-    constexpr stride_counting_iterator operator++(int)
-        requires std::forward_iterator<It>
+    constexpr operation_counting_iterator operator++(int)
+      requires std::forward_iterator<It>
     {
-        auto temp = *this;
-        ++*this;
-        return temp;
+      auto temp = *this;
+      ++*this;
+      return temp;
     }
 
-    constexpr stride_counting_iterator& operator--()
-        requires std::bidirectional_iterator<It>
+    constexpr operation_counting_iterator& operator--()
+      requires std::bidirectional_iterator<It>
     {
-        It tmp(base_);
-        base_ = base(--tmp);
-        ++stride_count_;
-        --stride_displacement_;
-        return *this;
+      It tmp(base_);
+      base_ = base(--tmp);
+      moved_by(-1);
+      return *this;
     }
 
-    constexpr stride_counting_iterator operator--(int)
-        requires std::bidirectional_iterator<It>
+    constexpr operation_counting_iterator operator--(int)
+      requires std::bidirectional_iterator<It>
     {
-        auto temp = *this;
-        --*this;
-        return temp;
+      auto temp = *this;
+      --*this;
+      return temp;
     }
 
-    constexpr stride_counting_iterator& operator+=(difference_type const n)
-        requires std::random_access_iterator<It>
+    constexpr operation_counting_iterator& operator+=(difference_type const n)
+      requires std::random_access_iterator<It>
     {
-        It tmp(base_);
-        base_ = base(tmp += n);
-        ++stride_count_;
-        ++stride_displacement_;
-        return *this;
+      It tmp(base_);
+      base_ = base(tmp += n);
+      moved_by(n);
+      return *this;
     }
 
-    constexpr stride_counting_iterator& operator-=(difference_type const n)
-        requires std::random_access_iterator<It>
+    constexpr operation_counting_iterator& operator-=(difference_type const n)
+      requires std::random_access_iterator<It>
     {
-        It tmp(base_);
-        base_ = base(tmp -= n);
-        ++stride_count_;
-        --stride_displacement_;
-        return *this;
+      It tmp(base_);
+      base_ = base(tmp -= n);
+      moved_by(-n);
+      return *this;
     }
 
-    friend constexpr stride_counting_iterator operator+(stride_counting_iterator it, difference_type n)
-        requires std::random_access_iterator<It>
+    friend constexpr operation_counting_iterator operator+(operation_counting_iterator it, difference_type n)
+      requires std::random_access_iterator<It>
     {
-        return it += n;
+      return it += n;
     }
 
-    friend constexpr stride_counting_iterator operator+(difference_type n, stride_counting_iterator it)
-        requires std::random_access_iterator<It>
+    friend constexpr operation_counting_iterator operator+(difference_type n, operation_counting_iterator it)
+      requires std::random_access_iterator<It>
     {
-        return it += n;
+      return it += n;
     }
 
-    friend constexpr stride_counting_iterator operator-(stride_counting_iterator it, difference_type n)
-        requires std::random_access_iterator<It>
+    friend constexpr operation_counting_iterator operator-(operation_counting_iterator it, difference_type n)
+      requires std::random_access_iterator<It>
     {
-        return it -= n;
+      return it -= n;
     }
 
-    friend constexpr difference_type operator-(stride_counting_iterator const& x, stride_counting_iterator const& y)
-        requires std::sized_sentinel_for<It, It>
+    friend constexpr difference_type
+    operator-(operation_counting_iterator const& x, operation_counting_iterator const& y)
+      requires std::sized_sentinel_for<It, It>
     {
-        return base(x) - base(y);
+      return base(x) - base(y);
     }
 
-    constexpr void record_equality_comparison() const { ++equals_count_; }
+    constexpr void record_equality_comparison() const {
+      if (counts_ != nullptr)
+        ++counts_->equal_cmps;
+    }
 
-    constexpr bool operator==(stride_counting_iterator const& other) const
-        requires std::sentinel_for<It, It>
+    constexpr bool operator==(operation_counting_iterator const& other) const
+      requires std::sentinel_for<It, It>
     {
       record_equality_comparison();
       return It(base_) == It(other.base_);
     }
 
-    friend constexpr bool operator<(stride_counting_iterator const& x, stride_counting_iterator const& y)
-        requires std::random_access_iterator<It>
+    friend constexpr bool operator<(operation_counting_iterator const& x, operation_counting_iterator const& y)
+      requires std::random_access_iterator<It>
     {
-        return It(x.base_) < It(y.base_);
+      return It(x.base_) < It(y.base_);
     }
 
-    friend constexpr bool operator>(stride_counting_iterator const& x, stride_counting_iterator const& y)
-        requires std::random_access_iterator<It>
+    friend constexpr bool operator>(operation_counting_iterator const& x, operation_counting_iterator const& y)
+      requires std::random_access_iterator<It>
     {
-        return It(x.base_) > It(y.base_);
+      return It(x.base_) > It(y.base_);
     }
 
-    friend constexpr bool operator<=(stride_counting_iterator const& x, stride_counting_iterator const& y)
-        requires std::random_access_iterator<It>
+    friend constexpr bool operator<=(operation_counting_iterator const& x, operation_counting_iterator const& y)
+      requires std::random_access_iterator<It>
     {
-        return It(x.base_) <= It(y.base_);
+      return It(x.base_) <= It(y.base_);
     }
 
-    friend constexpr bool operator>=(stride_counting_iterator const& x, stride_counting_iterator const& y)
-        requires std::random_access_iterator<It>
+    friend constexpr bool operator>=(operation_counting_iterator const& x, operation_counting_iterator const& y)
+      requires std::random_access_iterator<It>
     {
-        return It(x.base_) >= It(y.base_);
+      return It(x.base_) >= It(y.base_);
     }
 
     template <class T>
     void operator,(T const &) = delete;
 
 private:
+  constexpr void moved_by(difference_type n) {
+    if (counts_ == nullptr)
+      return;
+    if (n > 0)
+      ++counts_->increments;
+    else if (n < 0)
+      ++counts_->decrements;
+    else
+      ++counts_->zero_moves;
+  }
+
     decltype(base(std::declval<It>())) base_;
-    difference_type stride_count_ = 0;
-    difference_type stride_displacement_ = 0;
-    mutable difference_type equals_count_ = 0;
+    IteratorOpCounts* counts_ = nullptr;
 };
 template <class It>
-stride_counting_iterator(It) -> stride_counting_iterator<It>;
+operation_counting_iterator(It) -> operation_counting_iterator<It>;
 
 #endif // TEST_STD_VER > 17
 
diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile
index 9e1865ee61fdf..490bee4942e03 100644
--- a/libcxx/utils/ci/Dockerfile
+++ b/libcxx/utils/ci/Dockerfile
@@ -106,6 +106,7 @@ RUN sudo apt-get update \
 #RUN apt-get update && apt-get install -y ninja-build python3 python3-distutils python3-psutil git gdb ccache
 # TODO add ninja-build once 1.11 is available in Ubuntu, also remove the manual installation.
 RUN <<EOF
+  set -e
   wget -qO /tmp/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
   gunzip /tmp/ninja.gz
   chmod a+x /tmp/ninja
@@ -115,6 +116,7 @@ EOF
 
 # These two locales are not enabled by default so generate them
 RUN <<EOF
+  set -e
   printf "fr_CA ISO-8859-1\ncs_CZ ISO-8859-2" | sudo tee -a /etc/locale.gen
   sudo mkdir /usr/local/share/i1en/
   printf "fr_CA ISO-8859-1\ncs_CZ ISO-8859-2" | sudo tee -a /usr/local/share/i1en/SUPPORTED
@@ -129,6 +131,7 @@ EOF
 # 14 release branch CI uses it. The tip-of-trunk CI will never use Clang 12,
 # though.
 RUN <<EOF
+  set -e
   sudo apt-get update
   wget https://apt.llvm.org/llvm.sh -O /tmp/llvm.sh
   chmod +x /tmp/llvm.sh
@@ -142,6 +145,7 @@ EOF
 
 # Install the most recent GCC, like clang install the previous version as a transition.
 RUN <<EOF
+  set -e
   sudo git clone https://github.com/compiler-explorer/infra.git /tmp/ce-infra
   (cd /tmp/ce-infra && sudo make ce)
   sudo /tmp/ce-infra/bin/ce_install install compilers/c++/x86/gcc $GCC_LATEST_VERSION.1.0
@@ -155,13 +159,14 @@ EOF
 
 RUN <<EOF
     # Install a recent CMake
+    set -e
     wget https://github.com/Kitware/CMake/releases/download/v3.21.1/cmake-3.21.1-linux-x86_64.sh -O /tmp/install-cmake.sh
     sudo bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license
     rm /tmp/install-cmake.sh
 EOF
 
 # ===----------------------------------------------------------------------===##
-#                       Android Buildkite Image
+#                       Android Builder Base Image
 # ===----------------------------------------------------------------------===##
 
 FROM ubuntu:jammy AS android-builder-base
@@ -170,10 +175,11 @@ ARG ANDROID_CLANG_VERSION
 ARG ANDROID_CLANG_PREBUILTS_COMMIT
 ARG ANDROID_SYSROOT_BID
 
-RUN  apt-get update && apt-get install -y curl unzip git
+RUN apt-get update && apt-get install -y curl bzip2 git unzip
 
 # Install the Android platform tools (e.g. adb) into /opt/android/sdk.
 RUN <<EOF
+  set -e
   mkdir -p /opt/android/sdk
   cd /opt/android/sdk
   curl -LO https://dl.google.com/android/repository/platform-tools-latest-linux.zip
@@ -187,6 +193,7 @@ EOF
 ENV ANDROID_CLANG_VERSION=$ANDROID_CLANG_VERSION
 ENV ANDROID_CLANG_PREBUILTS_COMMIT=$ANDROID_CLANG_PREBUILTS_COMMIT
 RUN <<EOF
+    set -e
     git clone --filter=blob:none --sparse \
         https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86 \
         /opt/android/clang
@@ -206,6 +213,7 @@ EOF
 
 ENV ANDROID_SYSROOT_BID=$ANDROID_SYSROOT_BID
 RUN <<EOF
+  set -e
   cd /opt/android
   curl -L -o ndk_platform.tar.bz2 \
       https://androidbuildinternal.googleapis.com/android/internal/build/v3/builds/${ANDROID_SYSROOT_BID}/ndk/attempts/latest/artifacts/ndk_platform.tar.bz2/url
@@ -213,19 +221,6 @@ RUN <<EOF
   rm ndk_platform.tar.bz2
 EOF
 
-# Install Docker
-RUN <<EOF
-  curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
-  sh /tmp/get-docker.sh
-  rm /tmp/get-docker.sh
-
-  # Install Docker. Mark the binary setuid so it can be run without prefixing it
-  # with sudo. Adding the container user to the docker group doesn't work because
-  # /var/run/docker.sock is owned by the host's docker GID, not the container's
-  # docker GID.
-  chmod u+s /usr/bin/docker
-EOF
-
 # ===----------------------------------------------------------------------===##
 #                    Buildkite Builder Image
 # ===----------------------------------------------------------------------===##
@@ -243,6 +238,7 @@ WORKDIR /home/libcxx-builder
 # Install the Buildkite agent and dependencies. This must be done as non-root
 # for the Buildkite agent to be installed in a path where we can find it.
 RUN <<EOF
+  set -e
   cd /home/libcxx-builder
   curl -sL https://raw.githubusercontent.com/buildkite/agent/main/install.sh -o /tmp/install-agent.sh
   bash /tmp/install-agent.sh
@@ -271,6 +267,22 @@ COPY ./vendor/android/container-setup.sh /opt/android/container-setup.sh
 
 ENV PATH="/opt/android/sdk/platform-tools:${PATH}"
 
+USER root
+
+# Install Docker
+RUN <<EOF
+  set -e
+  curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
+  sh /tmp/get-docker.sh
+  rm /tmp/get-docker.sh
+
+  # Install Docker. Mark the binary setuid so it can be run without prefixing it
+  # with sudo. Adding the container user to the docker group doesn't work because
+  # /var/run/docker.sock is owned by the host's docker GID, not the container's
+  # docker GID.
+  chmod u+s /usr/bin/docker
+EOF
+
 USER libcxx-builder
 WORKDIR /home/libcxx-builder
 
diff --git a/libcxx/utils/ci/apple-install-libcxx.sh b/libcxx/utils/ci/apple-install-libcxx.sh
index ddefabe77264f..4872253a266a8 100755
--- a/libcxx/utils/ci/apple-install-libcxx.sh
+++ b/libcxx/utils/ci/apple-install-libcxx.sh
@@ -114,8 +114,14 @@ for arch in ${architectures}; do
     #       Then LLVM would guess the LLVM_DEFAULT_TARGET_TRIPLE properly and we wouldn't have to specify it.
     target=$(xcrun clang -arch ${arch} -xc - -### 2>&1 | grep --only-matching -E '"-triple" ".+?"' | grep --only-matching -E '"[^ ]+-apple-[^ ]+?"' | tr -d '"')
 
-    step "Building libc++.dylib and libc++abi.dylib for architecture ${arch}"
     mkdir -p "${build_dir}/${arch}"
+
+    step "Building shims to make libc++ compatible with the system libc++ on Apple platforms when running the tests"
+    shims_library="${build_dir}/${arch}/apple-system-shims.a"
+    # Note that this doesn't need to match the Standard version used to build the rest of the library.
+    xcrun clang++ -c -std=c++2b -target ${target} "${llvm_root}/libcxxabi/src/vendor/apple/shims.cpp" -static -o "${shims_library}"
+
+    step "Building libc++.dylib and libc++abi.dylib for architecture ${arch}"
     xcrun cmake -S "${llvm_root}/runtimes" \
                 -B "${build_dir}/${arch}" \
                 -GNinja \
@@ -127,9 +133,9 @@ for arch in ${architectures}; do
                 -DCMAKE_OSX_ARCHITECTURES="${arch}" \
                 -DLIBCXXABI_LIBRARY_VERSION="${version}" \
                 -DLIBCXX_LIBRARY_VERSION="${version}" \
-                -DLIBCXX_TEST_PARAMS="target_triple=${target};stdlib=apple-libc++" \
-                -DLIBCXXABI_TEST_PARAMS="target_triple=${target};stdlib=apple-libc++" \
-                -DLIBUNWIND_TEST_PARAMS="target_triple=${target};stdlib=apple-libc++"
+                -DLIBCXX_TEST_PARAMS="target_triple=${target};apple_system_shims=${shims_library}" \
+                -DLIBCXXABI_TEST_PARAMS="target_triple=${target};apple_system_shims=${shims_library}" \
+                -DLIBUNWIND_TEST_PARAMS="target_triple=${target};apple_system_shims=${shims_library}"
 
     if [ "$headers_only" = true ]; then
         xcrun cmake --build "${build_dir}/${arch}" --target install-cxx-headers install-cxxabi-headers -- -v
diff --git a/libcxx/utils/ci/build-picolibc.sh b/libcxx/utils/ci/build-picolibc.sh
index 713e277354768..400e4dcab99a3 100755
--- a/libcxx/utils/ci/build-picolibc.sh
+++ b/libcxx/utils/ci/build-picolibc.sh
@@ -69,13 +69,8 @@ picolibc_source_dir="${build_dir}/picolibc-source"
 picolibc_build_dir="${build_dir}/picolibc-build"
 mkdir -p "${picolibc_source_dir}"
 mkdir -p "${picolibc_build_dir}"
-# Download the version of picolibc that was the latest at the time this script was written.
-# Following changes are required and were introduced after version 1.8.5:
-# - updated semihost arguments handling,
-# - added missing macros in stdio.h
-# - external linkage for isblank
-# Version following 1.8.5, was not released by the time of writing.
-picolibc_commit="04a90c56d7aac61880f205ec29b3dce6a9de0342"
+# Download a known good version of picolibc.
+picolibc_commit="48fbc2009c6473293d03d5ec6f190565c6223a5c"
 curl -L "https://github.com/picolibc/picolibc/archive/${picolibc_commit}.zip" --output "${picolibc_source_dir}/picolibc.zip"
 unzip -q "${picolibc_source_dir}/picolibc.zip" -d "${picolibc_source_dir}"
 mv "${picolibc_source_dir}/picolibc-${picolibc_commit}"/* "${picolibc_source_dir}"
diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml
index 71d211bfc287d..0a81268cbe7a1 100644
--- a/libcxx/utils/ci/buildkite-pipeline.yml
+++ b/libcxx/utils/ci/buildkite-pipeline.yml
@@ -17,11 +17,6 @@
 # goal being to reduce the load on testers when a commit is known to fail.
 #
 
-# The Linux CI runners use the nightly ToT build provided by the Docker image.
-# (Note the image isn't updated daily.) The LLVM_HEAD_VERSION contains that
-# version number. The Linux CI runners for GCC use the latest stable version.
-# Theses numbers are available in all runners, making it easier to update the
-# version number.
 env:
     # LLVM POST-BRANCH bump version
     # LLVM POST-BRANCH add compiler test for ToT - 1, e.g. "Clang 17"
@@ -41,20 +36,6 @@ definitions:
       - "**/*.abilist"
       - "**/crash_diagnostics/*"
 
-environment_definitions:
-  _common_env: &common_env
-      LLVM_SYMBOLIZER_PATH: "/usr/bin/llvm-symbolizer-${LLVM_HEAD_VERSION}"
-      CLANG_CRASH_DIAGNOSTICS_DIR: "crash_diagnostics"
-      CC: clang-${LLVM_HEAD_VERSION}
-      CXX: clang++-${LLVM_HEAD_VERSION}
-
-  _absolute_path_clang: &absolute_path_clang
-    # Note modules require and absolute path for clang-scan-deps
-    # https://github.com/llvm/llvm-project/issues/61006
-    CC: /usr/lib/llvm-${LLVM_HEAD_VERSION}/bin/clang
-    CXX: /usr/lib/llvm-${LLVM_HEAD_VERSION}/bin/clang++
-
-
 steps:
 - group: ':mac: Apple'
   steps:
diff --git a/libcxx/utils/ci/vendor/android/Dockerfile.emulator b/libcxx/utils/ci/vendor/android/Dockerfile.emulator
index 54953f40014e7..2f52b27f6edcd 100644
--- a/libcxx/utils/ci/vendor/android/Dockerfile.emulator
+++ b/libcxx/utils/ci/vendor/android/Dockerfile.emulator
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-ENV ANDROID_HOME /opt/android/sdk
+ENV ANDROID_HOME=/opt/android/sdk
 
 RUN curl -sL https://dl.google.com/android/repository/commandlinetools-linux-9477386_latest.zip -o cmdline-tools.zip && \
     mkdir -p ${ANDROID_HOME} && \
diff --git a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
index e4538697266a4..99d4995b2ee1d 100755
--- a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
+++ b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
@@ -45,5 +45,5 @@ fi
 
 # Use exec so that the emulator is PID 1, so that `docker stop` kills the
 # emulator.
-exec emulator @emulator -no-audio -no-window \
+exec emulator @emulator -no-audio -no-window -no-metrics \
     -partition-size "${EMU_PARTITION_SIZE}"
diff --git a/libcxx/utils/ci/vendor/android/run-buildbot-container b/libcxx/utils/ci/vendor/android/run-buildbot-container
index 4ab83194c05d5..7b5d9a4cc3fe7 100755
--- a/libcxx/utils/ci/vendor/android/run-buildbot-container
+++ b/libcxx/utils/ci/vendor/android/run-buildbot-container
@@ -27,5 +27,5 @@ if [ -S /var/run/docker.sock ]; then
     DOCKER_OPTIONS+=(--volume /var/run/docker.sock:/var/run/docker.sock)
 fi
 
-docker run "${DOCKER_OPTIONS[@]}" libcxx-builder-android \
+docker run "${DOCKER_OPTIONS[@]}" ghcr.io/libcxx/android-buildkite-builder \
     bash -c 'git config --global --add safe.directory /llvm; (/opt/android/container-setup.sh && exec bash)'
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 773b1523cde4e..6c42748002aee 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -949,7 +949,6 @@ def add_version_header(tc):
                 "c++26": 202311,  # P2833R2 Freestanding Library: inout expected span
             },
             "headers": ["memory"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_parallel_algorithm",
@@ -993,7 +992,8 @@ def add_version_header(tc):
             "name": "__cpp_lib_ranges",
             "values": {
                 "c++20": 202207,
-                # "c++26": 202406, # P2997R1 Removing the common reference requirement from the indirectly invocable concepts
+                # "c++23": 202302,  # Relaxing Ranges Just A Smidge
+                # "c++26": 202406,  # P2997R1 Removing the common reference requirement from the indirectly invocable concepts (already implemented as a DR)
             },
             "headers": ["algorithm", "functional", "iterator", "memory", "ranges"],
         },
@@ -1033,6 +1033,11 @@ def add_version_header(tc):
             "values": {"c++23": 202207},
             "headers": ["algorithm"],
         },
+        {
+            "name": "__cpp_lib_ranges_find_last",
+            "values": {"c++23": 202207},
+            "headers": ["algorithm"],
+        },
         {
             "name": "__cpp_lib_ranges_iota",
             "values": {"c++23": 202202},
@@ -1267,7 +1272,7 @@ def add_version_header(tc):
             "values": {
                 "c++17": 201606,
                 "c++20": 201803,
-                # "c++26": 202403, # P2591R5: Concatenation of strings and string views
+                "c++26": 202403,  # P2591R5: Concatenation of strings and string views
             },
             "headers": ["string", "string_view"],
         },
@@ -1297,8 +1302,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_three_way_comparison",
-            "values": {"c++20": 201711},
-            # {"c++20": 201907} # P1614R2 The Mothership has Landed (see P1902R1 Missing feature-test macros 2017-2019)
+            "values": {"c++20": 201907},
             "headers": ["compare"],
         },
         {
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 5e708da4f8fbe..97cdb0349885d 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -263,6 +263,26 @@ def _mingwSupportsModules(cfg):
           """,
         ),
     ),
+    # Check for a Windows UCRT bug (not fixed upstream yet).
+    # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0",
+    # while other C runtimes produce just "0x0p+0".
+    # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
+    Feature(
+        name="win32-broken-printf-a-precision",
+        when=lambda cfg: "_WIN32" in compilerMacros(cfg)
+        and not programSucceeds(
+            cfg,
+            """
+            #include <stdio.h>
+            #include <string.h>
+            int main(int, char**) {
+              char buf[100];
+              snprintf(buf, sizeof(buf), "%a", 0.0);
+              return strcmp(buf, "0x0p+0");
+            }
+          """,
+        ),
+    ),
     # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had
     # mon_decimal_point == ".", which our tests don't handle.
     Feature(
@@ -352,6 +372,8 @@ def _mingwSupportsModules(cfg):
     "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime",
     "_LIBCPP_ABI_VERSION": "libcpp-abi-version",
     "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators",
+    "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string",
+    "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector",
     "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor",
     "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem",
     "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device",
diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py
index aab7651c7bb03..b7758dc9a41ee 100644
--- a/libcxx/utils/libcxx/test/modules.py
+++ b/libcxx/utils/libcxx/test/modules.py
@@ -76,6 +76,13 @@
 # This declaration is in the ostream header.
 ExtraDeclarations["system_error"] = ["std::operator<<"]
 
+# TODO MODULES avoid this work-around
+# This is a work-around for the special math functions. They are declared in
+# __math/special_functions.h. Adding this as an ExtraHeader works for the std
+# module. However these functions are special; they are not available in the
+# global namespace.
+ExtraDeclarations["cmath"] = ["std::hermite", "std::hermitef", "std::hermitel"]
+
 ### ExtraHeader
 
 # Adds extra headers file to scan
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 3aadc54cf92dc..13c7297fd7304 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -88,6 +88,17 @@ def getStdFlag(cfg, std):
     return None
 
 
+def getDefaultStdValue(cfg):
+    viable = [s for s in reversed(_allStandards) if getStdFlag(cfg, s)]
+
+    if not viable:
+        raise RuntimeError(
+            "Unable to successfully detect the presence of any -std=c++NN flag. This likely indicates an issue with your compiler."
+        )
+
+    return viable[0]
+
+
 def getSpeedOptimizationFlag(cfg):
     if _isClang(cfg) or _isAppleClang(cfg) or _isGCC(cfg):
         return "-O3"
@@ -170,9 +181,7 @@ def getSuitableClangTidy(cfg):
         choices=_allStandards,
         type=str,
         help="The version of the standard to compile the test suite with.",
-        default=lambda cfg: next(
-            s for s in reversed(_allStandards) if getStdFlag(cfg, s)
-        ),
+        default=lambda cfg: getDefaultStdValue(cfg),
         actions=lambda std: [
             AddFeature(std),
             AddSubstitution("%{cxx_std}", re.sub(r"\+", "x", std)),
diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in
index 9bd831c227798..8f37b9089ece8 100644
--- a/libcxx/vendor/llvm/default_assertion_handler.in
+++ b/libcxx/vendor/llvm/default_assertion_handler.in
@@ -24,7 +24,14 @@
 #else
 
 #  if __has_builtin(__builtin_verbose_trap)
-#    define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message)
+// AppleClang shipped a slightly different version of __builtin_verbose_trap from the upstream
+// version before upstream Clang actually got the builtin.
+// TODO: Remove once AppleClang supports the two-arguments version of the builtin.
+#    if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 17000
+#      define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap(message)
+#    else
+#      define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message)
+#    endif
 #  else
 #    define _LIBCPP_ASSERTION_HANDLER(message) ((void)message, __builtin_trap())
 #  endif
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 36bf454636366..2637d2d456673 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -200,8 +200,7 @@ class Node {
 
   Prec Precedence : 6;
 
-  // FIXME: Make these protected.
-public:
+protected:
   /// Tracks if this node has a component on its right side, in which case we
   /// need to call printRight.
   Cache RHSComponentCache : 2;
@@ -255,6 +254,9 @@ class Node {
   Kind getKind() const { return K; }
 
   Prec getPrecedence() const { return Precedence; }
+  Cache getRHSComponentCache() const { return RHSComponentCache; }
+  Cache getArrayCache() const { return ArrayCache; }
+  Cache getFunctionCache() const { return FunctionCache; }
 
   virtual bool hasRHSComponentSlow(OutputBuffer &) const { return false; }
   virtual bool hasArraySlow(OutputBuffer &) const { return false; }
@@ -424,8 +426,8 @@ class QualType final : public Node {
 
 public:
   QualType(const Node *Child_, Qualifiers Quals_)
-      : Node(KQualType, Child_->RHSComponentCache,
-             Child_->ArrayCache, Child_->FunctionCache),
+      : Node(KQualType, Child_->getRHSComponentCache(), Child_->getArrayCache(),
+             Child_->getFunctionCache()),
         Quals(Quals_), Child(Child_) {}
 
   Qualifiers getQuals() const { return Quals; }
@@ -554,8 +556,8 @@ struct AbiTagAttr : Node {
   std::string_view Tag;
 
   AbiTagAttr(Node *Base_, std::string_view Tag_)
-      : Node(KAbiTagAttr, Base_->RHSComponentCache, Base_->ArrayCache,
-             Base_->FunctionCache),
+      : Node(KAbiTagAttr, Base_->getRHSComponentCache(), Base_->getArrayCache(),
+             Base_->getFunctionCache()),
         Base(Base_), Tag(Tag_) {}
 
   template<typename Fn> void match(Fn F) const { F(Base, Tag); }
@@ -615,7 +617,7 @@ class PointerType final : public Node {
 
 public:
   PointerType(const Node *Pointee_)
-      : Node(KPointerType, Pointee_->RHSComponentCache),
+      : Node(KPointerType, Pointee_->getRHSComponentCache()),
         Pointee(Pointee_) {}
 
   const Node *getPointee() const { return Pointee; }
@@ -699,7 +701,7 @@ class ReferenceType : public Node {
 
 public:
   ReferenceType(const Node *Pointee_, ReferenceKind RK_)
-      : Node(KReferenceType, Pointee_->RHSComponentCache),
+      : Node(KReferenceType, Pointee_->getRHSComponentCache()),
         Pointee(Pointee_), RK(RK_) {}
 
   template<typename Fn> void match(Fn F) const { F(Pointee, RK); }
@@ -742,7 +744,7 @@ class PointerToMemberType final : public Node {
 
 public:
   PointerToMemberType(const Node *ClassType_, const Node *MemberType_)
-      : Node(KPointerToMemberType, MemberType_->RHSComponentCache),
+      : Node(KPointerToMemberType, MemberType_->getRHSComponentCache()),
         ClassType(ClassType_), MemberType(MemberType_) {}
 
   template<typename Fn> void match(Fn F) const { F(ClassType, MemberType); }
@@ -1383,16 +1385,14 @@ class ParameterPack final : public Node {
 public:
   ParameterPack(NodeArray Data_) : Node(KParameterPack), Data(Data_) {
     ArrayCache = FunctionCache = RHSComponentCache = Cache::Unknown;
-    if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
-          return P->ArrayCache == Cache::No;
-        }))
+    if (std::all_of(Data.begin(), Data.end(),
+                    [](Node *P) { return P->getArrayCache() == Cache::No; }))
       ArrayCache = Cache::No;
-    if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
-          return P->FunctionCache == Cache::No;
-        }))
+    if (std::all_of(Data.begin(), Data.end(),
+                    [](Node *P) { return P->getFunctionCache() == Cache::No; }))
       FunctionCache = Cache::No;
-    if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
-          return P->RHSComponentCache == Cache::No;
+    if (std::all_of(Data.begin(), Data.end(), [](Node *P) {
+          return P->getRHSComponentCache() == Cache::No;
         }))
       RHSComponentCache = Cache::No;
   }
diff --git a/libcxxabi/src/private_typeinfo.cpp b/libcxxabi/src/private_typeinfo.cpp
index 9e58501a55934..9dba91e1985e3 100644
--- a/libcxxabi/src/private_typeinfo.cpp
+++ b/libcxxabi/src/private_typeinfo.cpp
@@ -55,15 +55,12 @@
 #include <ptrauth.h>
 #endif
 
-
-template<typename T>
-static inline
-T *
-get_vtable(T *vtable) {
+template <typename T>
+static inline T* strip_vtable(T* vtable) {
 #if __has_feature(ptrauth_calls)
-    vtable = ptrauth_strip(vtable, ptrauth_key_cxx_vtable_pointer);
+  vtable = ptrauth_strip(vtable, ptrauth_key_cxx_vtable_pointer);
 #endif
-    return vtable;
+  return vtable;
 }
 
 static inline
@@ -117,11 +114,10 @@ void dyn_cast_get_derived_info(derived_object_info* info, const void* static_ptr
         reinterpret_cast<const uint8_t*>(vtable) + offset_to_ti_proxy;
     info->dynamic_type = *(reinterpret_cast<const __class_type_info* const*>(ptr_to_ti_proxy));
 #else
-    void **vtable = *static_cast<void ** const *>(static_ptr);
-    vtable = get_vtable(vtable);
-    info->offset_to_derived = reinterpret_cast<ptrdiff_t>(vtable[-2]);
-    info->dynamic_ptr = static_cast<const char*>(static_ptr) + info->offset_to_derived;
-    info->dynamic_type = static_cast<const __class_type_info*>(vtable[-1]);
+  void** vtable = strip_vtable(*static_cast<void** const*>(static_ptr));
+  info->offset_to_derived = reinterpret_cast<ptrdiff_t>(vtable[-2]);
+  info->dynamic_ptr = static_cast<const char*>(static_ptr) + info->offset_to_derived;
+  info->dynamic_type = static_cast<const __class_type_info*>(vtable[-1]);
 #endif
 }
 
@@ -576,8 +572,7 @@ __base_class_type_info::has_unambiguous_public_base(__dynamic_cast_info* info,
        find the layout.  */
     offset_to_base = __offset_flags >> __offset_shift;
     if (is_virtual) {
-      const char* vtable = *static_cast<const char* const*>(adjustedPtr);
-      vtable = get_vtable(vtable);
+      const char* vtable = strip_vtable(*static_cast<const char* const*>(adjustedPtr));
       offset_to_base = update_offset_to_base(vtable, offset_to_base);
     }
   } else if (!is_virtual) {
@@ -1517,9 +1512,8 @@ __base_class_type_info::search_above_dst(__dynamic_cast_info* info,
     ptrdiff_t offset_to_base = __offset_flags >> __offset_shift;
     if (__offset_flags & __virtual_mask)
     {
-        const char* vtable = *static_cast<const char*const*>(current_ptr);
-        vtable = get_vtable(vtable);
-        offset_to_base = update_offset_to_base(vtable, offset_to_base);
+      const char* vtable = strip_vtable(*static_cast<const char* const*>(current_ptr));
+      offset_to_base = update_offset_to_base(vtable, offset_to_base);
     }
     __base_type->search_above_dst(info, dst_ptr,
                                   static_cast<const char*>(current_ptr) + offset_to_base,
@@ -1538,9 +1532,8 @@ __base_class_type_info::search_below_dst(__dynamic_cast_info* info,
     ptrdiff_t offset_to_base = __offset_flags >> __offset_shift;
     if (__offset_flags & __virtual_mask)
     {
-        const char* vtable = *static_cast<const char*const*>(current_ptr);
-        vtable = get_vtable(vtable);
-        offset_to_base = update_offset_to_base(vtable, offset_to_base);
+      const char* vtable = strip_vtable(*static_cast<const char* const*>(current_ptr));
+      offset_to_base = update_offset_to_base(vtable, offset_to_base);
     }
     __base_type->search_below_dst(info,
                                   static_cast<const char*>(current_ptr) + offset_to_base,
diff --git a/libcxxabi/src/vendor/apple/shims.cpp b/libcxxabi/src/vendor/apple/shims.cpp
new file mode 100644
index 0000000000000..65152f7e0e8c9
--- /dev/null
+++ b/libcxxabi/src/vendor/apple/shims.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//
+// These shims implement symbols that are present in the system libc++ on Apple platforms
+// but are not implemented in upstream libc++. This allows testing libc++ under a system
+// library configuration, which requires the just-built libc++ to be ABI compatible with
+// the system library it is replacing.
+//
+
+#include <cstddef>
+#include <new>
+
+namespace std { // purposefully not versioned, like align_val_t
+enum class __type_descriptor_t : unsigned long long;
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void* operator new(std::size_t __sz, std::__type_descriptor_t) {
+  return ::operator new(__sz);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void* operator new(std::size_t __sz, const std::nothrow_t& __nt,
+                                                std::__type_descriptor_t) noexcept {
+  return ::operator new(__sz, __nt);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void* operator new[](std::size_t __sz, std::__type_descriptor_t) {
+  return ::operator new[](__sz);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void* operator new[](std::size_t __sz, const std::nothrow_t& __nt,
+                                                  std::__type_descriptor_t) noexcept {
+  return ::operator new(__sz, __nt);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete(void* __p, std::__type_descriptor_t) noexcept {
+  return ::operator delete(__p);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete(void* __p, const std::nothrow_t& __nt,
+                                                  std::__type_descriptor_t) noexcept {
+  return ::operator delete(__p, __nt);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete[](void* __p, std::__type_descriptor_t) noexcept {
+  return ::operator delete[](__p);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete[](void* __p, const std::nothrow_t& __nt,
+                                                    std::__type_descriptor_t) noexcept {
+  return ::operator delete[](__p, __nt);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete(void* __p, std::size_t __sz, std::__type_descriptor_t) noexcept {
+  return ::operator delete(__p, __sz);
+}
+
+_LIBCPP_OVERRIDABLE_FUNC_VIS void operator delete[](void* __p, std::size_t __sz, std::__type_descriptor_t) noexcept {
+  return ::operator delete[](__p, __sz);
+}
diff --git a/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in b/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in
new file mode 100644
index 0000000000000..537fe8272a038
--- /dev/null
+++ b/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in
@@ -0,0 +1,49 @@
+# Testing configuration for Apple's system libc++abi.
+#
+# This configuration differs from a normal LLVM shared library configuration in
+# that we must use DYLD_LIBRARY_PATH to run the tests against the just-built library,
+# since Apple's libc++abi has an absolute install_name.
+#
+# Finally, we also link against an artificial shims library that provides
+# the functionality necessary for the upstream libc++abi to be usable in place
+# of a system-provided libc++abi. Without that, attempting to replace the system
+# libc++abi with DYLD_LIBRARY_PATH would result in missing symbols and other similar
+# issues since the upstream libc++abi does not contain all the symbols provided by
+# the system library.
+
+lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
+
+import os, site
+site.addsitedir(os.path.join('@LIBCXXABI_LIBCXX_PATH@', 'utils'))
+import libcxx.test.params, libcxx.test.config, libcxx.test.dsl
+ADDITIONAL_PARAMETERS = [
+    libcxx.test.dsl.Parameter(name='apple_system_shims', type=str,
+        actions=lambda path: [libcxx.test.dsl.AddSubstitution('%{apple-system-shims}', path)],
+        help="""
+        Path to a pre-built object file or static archive that contains shims necessary to
+        allow replacing the system libc++abi by the just-built one.
+        """),
+]
+
+config.substitutions.append(('%{flags}',
+    '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else ''
+))
+config.substitutions.append(('%{compile_flags}',
+    '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS ' +
+    '-I %{libcxx}/test/support -I %{libcxx}/src'
+))
+config.substitutions.append(('%{link_flags}',
+    '-nostdlib++ -L %{lib} -lc++ %{apple-system-shims}'
+))
+config.substitutions.append(('%{exec}',
+    '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib} -- '
+))
+
+config.stdlib = 'apple-libc++'
+
+libcxx.test.config.configure(
+    libcxx.test.params.DEFAULT_PARAMETERS + ADDITIONAL_PARAMETERS,
+    libcxx.test.features.DEFAULT_FEATURES,
+    config,
+    lit_config
+)
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index fe5598991b831..ab783cf9d96f2 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -17,6 +17,8 @@
 // 80-bit format, and this demangling test is failing on it.
 // XFAIL: LIBCXX-ANDROID-FIXME && target={{i686|x86_64}}-{{.+}}-android{{.*}}
 
+// XFAIL: win32-broken-printf-a-precision
+
 #include "support/timer.h"
 #include <algorithm>
 #include <cassert>
diff --git a/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_32.pass.sh.s b/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_32.pass.sh.s
index ce90045586082..b35c999e6e50d 100644
--- a/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_32.pass.sh.s
+++ b/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_32.pass.sh.s
@@ -9,7 +9,7 @@
 # Test that a nested exception is thrown by a destructor inside a try-block
 # when the code is generated by the legacy AIX xlclang compiler.
 
-# REQUIRES: target=powerpc-ibm-aix
+# REQUIRES: target=powerpc-ibm-aix{{.*}}
 # UNSUPPORTED: no-exceptions
 
 # RUN: %{cxx} %{flags} %s %{link_flags} \
diff --git a/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_64.pass.sh.s b/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_64.pass.sh.s
index 7b0afb9ebae38..16754db2837ca 100644
--- a/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_64.pass.sh.s
+++ b/libcxxabi/test/vendor/ibm/aix_xlclang_nested_excp_64.pass.sh.s
@@ -8,7 +8,7 @@
 # Test that a nested exception is thrown by a destructor inside a try-block
 # when the code is generated by the legacy AIX xlclang compiler.
 
-# REQUIRES: target=powerpc64-ibm-aix
+# REQUIRES: target=powerpc64-ibm-aix{{.*}}
 # UNSUPPORTED: no-exceptions
 
 # RUN: %{cxx} %{flags} %s %{link_flags} \
diff --git a/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_32.pass.sh.S b/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_32.pass.sh.S
index 71c3ab9409a81..8b92e4febf562 100644
--- a/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_32.pass.sh.S
+++ b/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_32.pass.sh.S
@@ -14,7 +14,7 @@
 // xlclang++ compiler included in this file. This file tests for the 32-bit
 // mode.
 
-# REQUIRES: target=powerpc-ibm-aix
+# REQUIRES: target=powerpc-ibm-aix{{.*}}
 # UNSUPPORTED: no-exceptions
 
 // RUN: %{cxx} -c %s -o %t1_32.o -DT1_CPP_CODE %{flags} %{compile_flags}
diff --git a/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_64.pass.sh.S b/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_64.pass.sh.S
index da413577bd38f..64d7c80e9e6dd 100644
--- a/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_64.pass.sh.S
+++ b/libcxxabi/test/vendor/ibm/aix_xlclang_passing_excp_obj_64.pass.sh.S
@@ -14,7 +14,7 @@
 // xlclang++ compiler included in this file. This file tests for the 64-bit
 // mode.
 
-# REQUIRES: target=powerpc64-ibm-aix
+# REQUIRES: target=powerpc64-ibm-aix{{.*}}
 # UNSUPPORTED: no-exceptions
 
 // RUN: %{cxx} -c %s -o %t1_64.o -DT1_CPP_CODE %{flags} %{compile_flags}
diff --git a/libcxxabi/test/vendor/ibm/cond_reg_restore.pass.cpp b/libcxxabi/test/vendor/ibm/cond_reg_restore.pass.cpp
index 63817e1b13a25..a5eb3c20534a3 100644
--- a/libcxxabi/test/vendor/ibm/cond_reg_restore.pass.cpp
+++ b/libcxxabi/test/vendor/ibm/cond_reg_restore.pass.cpp
@@ -10,7 +10,7 @@
 // on AIX. Option -O3 is required so that the compiler will re-use the value
 // in the condition register instead of re-evaluating the condition expression.
 
-// REQUIRES: target=powerpc{{(64)?}}-ibm-aix
+// REQUIRES: target={{.+}}-aix{{.*}}
 // ADDITIONAL_COMPILE_FLAGS: -O3
 // UNSUPPORTED: no-exceptions
 
diff --git a/libcxxabi/test/vendor/ibm/vec_reg_restore.pass.cpp b/libcxxabi/test/vendor/ibm/vec_reg_restore.pass.cpp
index 703c311dae392..7c31970546993 100644
--- a/libcxxabi/test/vendor/ibm/vec_reg_restore.pass.cpp
+++ b/libcxxabi/test/vendor/ibm/vec_reg_restore.pass.cpp
@@ -9,7 +9,7 @@
 // Check that the PowerPC vector registers are restored properly during
 // unwinding. Option -mabi=vec-extabi is required to compile the test case.
 
-// REQUIRES: target=powerpc{{(64)?}}-ibm-aix
+// REQUIRES: target={{.+}}-aix{{.*}}
 // ADDITIONAL_COMPILE_FLAGS: -mabi=vec-extabi
 // UNSUPPORTED: no-exceptions
 
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 2ec60e4c123d5..758557337899e 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -2589,7 +2589,8 @@ void UnwindCursor<A, R>::setInfoBasedOnIPRegister(bool isReturnAddress) {
     --pc;
 #endif
 
-#if !(defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32))
+#if !(defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32)) &&            \
+    !defined(_LIBUNWIND_SUPPORT_TBTAB_UNWIND)
   // In case of this is frame of signal handler, the IP saved in the signal
   // handler points to first non-executed instruction, while FDE/CIE expects IP
   // to be after the first non-executed instruction.
diff --git a/libunwind/test/aix_signal_unwind.pass.sh.S b/libunwind/test/aix_signal_unwind.pass.sh.S
index 9ca18e9481f4f..2c0cf140fe267 100644
--- a/libunwind/test/aix_signal_unwind.pass.sh.S
+++ b/libunwind/test/aix_signal_unwind.pass.sh.S
@@ -10,7 +10,7 @@
 // a correct traceback when the function raising the signal does not save
 // the link register or does not store the stack back chain.
 
-// REQUIRES: target=powerpc{{(64)?}}-ibm-aix
+// REQUIRES: target={{.+}}-aix{{.*}}
 
 // Test when the function raising the signal does not save the link register
 // RUN: %{cxx} -x c++ %s -o %t.exe -DCXX_CODE %{flags} %{compile_flags}
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index 1e78a560bca86..4bc4d7c4b5a47 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -295,6 +295,8 @@ def wrap : P_priv<"wrap">;
 
 def vfsoverlay : P<"vfsoverlay", "Path to a vfsoverlay yaml file to optionally look for /defaultlib's in">;
 
+def show_timing : F<"time">,
+    HelpText<"Print the time spent in each phase of linking">;
 def time_trace_eq: Joined<["--"], "time-trace=">, MetaVarName<"<file>">,
   HelpText<"Record time trace to <file>">;
 def : Flag<["--"], "time-trace">, Alias<time_trace_eq>,
@@ -315,7 +317,6 @@ def map : F<"map">;
 def map_file : P_priv<"map">;
 def map_info : P<"mapinfo", "Include the specified information in a map file">;
 def print_search_paths : F<"print-search-paths">;
-def show_timing : F<"time">;
 def summary : F<"summary">;
 
 //==============================================================================
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index 5ef46f5af6a6c..56b137d56873a 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -98,10 +98,10 @@ class Symbol {
   friend SymbolTable;
   explicit Symbol(Kind k, StringRef n = "")
       : symbolKind(k), isExternal(true), isCOMDAT(false),
-        writtenToSymtab(false), pendingArchiveLoad(false), isGCRoot(false),
-        isRuntimePseudoReloc(false), deferUndefined(false), canInline(true),
-        isWeak(false), nameSize(n.size()),
-        nameData(n.empty() ? nullptr : n.data()) {
+        writtenToSymtab(false), isUsedInRegularObj(false),
+        pendingArchiveLoad(false), isGCRoot(false), isRuntimePseudoReloc(false),
+        deferUndefined(false), canInline(true), isWeak(false),
+        nameSize(n.size()), nameData(n.empty() ? nullptr : n.data()) {
     assert((!n.empty() || k <= LastDefinedCOFFKind) &&
            "If the name is empty, the Symbol must be a DefinedCOFF.");
   }
@@ -499,8 +499,10 @@ void replaceSymbol(Symbol *s, ArgT &&... arg) {
   assert(static_cast<Symbol *>(static_cast<T *>(nullptr)) == nullptr &&
          "Not a Symbol");
   bool canInline = s->canInline;
+  bool isUsedInRegularObj = s->isUsedInRegularObj;
   new (s) T(std::forward<ArgT>(arg)...);
   s->canInline = canInline;
+  s->isUsedInRegularObj = isUsedInRegularObj;
 }
 } // namespace coff
 
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index cf5c2380690f1..0106349b2d277 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -239,30 +239,81 @@ int64_t AArch64::getImplicitAddend(const uint8_t *buf, RelType type) const {
   case R_AARCH64_IRELATIVE:
   case R_AARCH64_TLS_TPREL64:
     return read64(buf);
+
+    // The following relocation types all point at instructions, and
+    // relocate an immediate field in the instruction.
+    //
+    // The general rule, from AAELF64 §5.7.2 "Addends and PC-bias",
+    // says: "If the relocation relocates an instruction the immediate
+    // field of the instruction is extracted, scaled as required by
+    // the instruction field encoding, and sign-extended to 64 bits".
+
+    // The R_AARCH64_MOVW family operates on wide MOV/MOVK/MOVZ
+    // instructions, which have a 16-bit immediate field with its low
+    // bit in bit 5 of the instruction encoding. When the immediate
+    // field is used as an implicit addend for REL-type relocations,
+    // it is treated as added to the low bits of the output value, not
+    // shifted depending on the relocation type.
+    //
+    // This allows REL relocations to express the requirement 'please
+    // add 12345 to this symbol value and give me the four 16-bit
+    // chunks of the result', by putting the same addend 12345 in all
+    // four instructions. Carries between the 16-bit chunks are
+    // handled correctly, because the whole 64-bit addition is done
+    // once per relocation.
   case R_AARCH64_MOVW_UABS_G0:
   case R_AARCH64_MOVW_UABS_G0_NC:
-    return getBits(SignExtend64<16>(read16(buf)), 0, 15);
   case R_AARCH64_MOVW_UABS_G1:
   case R_AARCH64_MOVW_UABS_G1_NC:
-    return getBits(SignExtend64<32>(read32(buf)), 16, 31);
   case R_AARCH64_MOVW_UABS_G2:
   case R_AARCH64_MOVW_UABS_G2_NC:
-    return getBits(read64(buf), 32, 47);
   case R_AARCH64_MOVW_UABS_G3:
-    return getBits(read64(buf), 48, 63);
+    return SignExtend64<16>(getBits(read32(buf), 5, 20));
+
+    // R_AARCH64_TSTBR14 points at a TBZ or TBNZ instruction, which
+    // has a 14-bit offset measured in instructions, i.e. shifted left
+    // by 2.
   case R_AARCH64_TSTBR14:
-    return getBits(SignExtend64<32>(read32(buf)), 2, 15);
+    return SignExtend64<16>(getBits(read32(buf), 5, 18) << 2);
+
+    // R_AARCH64_CONDBR19 operates on the ordinary B.cond instruction,
+    // which has a 19-bit offset measured in instructions.
+    //
+    // R_AARCH64_LD_PREL_LO19 operates on the LDR (literal)
+    // instruction, which also has a 19-bit offset, measured in 4-byte
+    // chunks. So the calculation is the same as for
+    // R_AARCH64_CONDBR19.
   case R_AARCH64_CONDBR19:
   case R_AARCH64_LD_PREL_LO19:
-    return getBits(SignExtend64<32>(read32(buf)), 2, 20);
+    return SignExtend64<21>(getBits(read32(buf), 5, 23) << 2);
+
+    // R_AARCH64_ADD_ABS_LO12_NC operates on ADD (immediate). The
+    // immediate can optionally be shifted left by 12 bits, but this
+    // relocation is intended for the case where it is not.
   case R_AARCH64_ADD_ABS_LO12_NC:
-    return getBits(SignExtend64<16>(read16(buf)), 0, 11);
+    return SignExtend64<12>(getBits(read32(buf), 10, 21));
+
+    // R_AARCH64_ADR_PREL_LO21 operates on an ADR instruction, whose
+    // 21-bit immediate is split between two bits high up in the word
+    // (in fact the two _lowest_ order bits of the value) and 19 bits
+    // lower down.
+    //
+    // R_AARCH64_ADR_PREL_PG_HI21[_NC] operate on an ADRP instruction,
+    // which encodes the immediate in the same way, but will shift it
+    // left by 12 bits when the instruction executes. For the same
+    // reason as the MOVW family, we don't apply that left shift here.
+  case R_AARCH64_ADR_PREL_LO21:
   case R_AARCH64_ADR_PREL_PG_HI21:
   case R_AARCH64_ADR_PREL_PG_HI21_NC:
-    return getBits(SignExtend64<32>(read32(buf)), 12, 32);
+    return SignExtend64<21>((getBits(read32(buf), 5, 23) << 2) |
+                            getBits(read32(buf), 29, 30));
+
+    // R_AARCH64_{JUMP,CALL}26 operate on B and BL, which have a
+    // 26-bit offset measured in instructions.
   case R_AARCH64_JUMP26:
   case R_AARCH64_CALL26:
-    return getBits(SignExtend64<32>(read32(buf)), 2, 27);
+    return SignExtend64<28>(getBits(read32(buf), 0, 25) << 2);
+
   default:
     internalLinkerError(getErrorLocation(buf),
                         "cannot read addend for relocation " + toString(type));
@@ -366,11 +417,13 @@ static void write32AArch64Addr(uint8_t *l, uint64_t imm) {
   write32le(l, (read32le(l) & ~mask) | immLo | immHi);
 }
 
-static void or32le(uint8_t *p, int32_t v) { write32le(p, read32le(p) | v); }
+static void writeMaskedBits32le(uint8_t *p, int32_t v, uint32_t mask) {
+  write32le(p, (read32le(p) & ~mask) | v);
+}
 
 // Update the immediate field in a AARCH64 ldr, str, and add instruction.
-static void or32AArch64Imm(uint8_t *l, uint64_t imm) {
-  or32le(l, (imm & 0xFFF) << 10);
+static void write32Imm12(uint8_t *l, uint64_t imm) {
+  writeMaskedBits32le(l, (imm & 0xFFF) << 10, 0xFFF << 10);
 }
 
 // Update the immediate field in an AArch64 movk, movn or movz instruction
@@ -443,7 +496,7 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
       write32(loc, val);
     break;
   case R_AARCH64_ADD_ABS_LO12_NC:
-    or32AArch64Imm(loc, val);
+    write32Imm12(loc, val);
     break;
   case R_AARCH64_ADR_GOT_PAGE:
   case R_AARCH64_ADR_PREL_PG_HI21:
@@ -470,28 +523,28 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
     [[fallthrough]];
   case R_AARCH64_CALL26:
     checkInt(loc, val, 28, rel);
-    or32le(loc, (val & 0x0FFFFFFC) >> 2);
+    writeMaskedBits32le(loc, (val & 0x0FFFFFFC) >> 2, 0x0FFFFFFC >> 2);
     break;
   case R_AARCH64_CONDBR19:
   case R_AARCH64_LD_PREL_LO19:
   case R_AARCH64_GOT_LD_PREL19:
     checkAlignment(loc, val, 4, rel);
     checkInt(loc, val, 21, rel);
-    or32le(loc, (val & 0x1FFFFC) << 3);
+    writeMaskedBits32le(loc, (val & 0x1FFFFC) << 3, 0x1FFFFC << 3);
     break;
   case R_AARCH64_LDST8_ABS_LO12_NC:
   case R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC:
-    or32AArch64Imm(loc, getBits(val, 0, 11));
+    write32Imm12(loc, getBits(val, 0, 11));
     break;
   case R_AARCH64_LDST16_ABS_LO12_NC:
   case R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC:
     checkAlignment(loc, val, 2, rel);
-    or32AArch64Imm(loc, getBits(val, 1, 11));
+    write32Imm12(loc, getBits(val, 1, 11));
     break;
   case R_AARCH64_LDST32_ABS_LO12_NC:
   case R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC:
     checkAlignment(loc, val, 4, rel);
-    or32AArch64Imm(loc, getBits(val, 2, 11));
+    write32Imm12(loc, getBits(val, 2, 11));
     break;
   case R_AARCH64_LDST64_ABS_LO12_NC:
   case R_AARCH64_LD64_GOT_LO12_NC:
@@ -499,37 +552,39 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
   case R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC:
   case R_AARCH64_TLSDESC_LD64_LO12:
     checkAlignment(loc, val, 8, rel);
-    or32AArch64Imm(loc, getBits(val, 3, 11));
+    write32Imm12(loc, getBits(val, 3, 11));
     break;
   case R_AARCH64_LDST128_ABS_LO12_NC:
   case R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC:
     checkAlignment(loc, val, 16, rel);
-    or32AArch64Imm(loc, getBits(val, 4, 11));
+    write32Imm12(loc, getBits(val, 4, 11));
     break;
   case R_AARCH64_LD64_GOTPAGE_LO15:
     checkAlignment(loc, val, 8, rel);
-    or32AArch64Imm(loc, getBits(val, 3, 14));
+    write32Imm12(loc, getBits(val, 3, 14));
     break;
   case R_AARCH64_MOVW_UABS_G0:
     checkUInt(loc, val, 16, rel);
     [[fallthrough]];
   case R_AARCH64_MOVW_UABS_G0_NC:
-    or32le(loc, (val & 0xFFFF) << 5);
+    writeMaskedBits32le(loc, (val & 0xFFFF) << 5, 0xFFFF << 5);
     break;
   case R_AARCH64_MOVW_UABS_G1:
     checkUInt(loc, val, 32, rel);
     [[fallthrough]];
   case R_AARCH64_MOVW_UABS_G1_NC:
-    or32le(loc, (val & 0xFFFF0000) >> 11);
+    writeMaskedBits32le(loc, (val & 0xFFFF0000) >> 11, 0xFFFF0000 >> 11);
     break;
   case R_AARCH64_MOVW_UABS_G2:
     checkUInt(loc, val, 48, rel);
     [[fallthrough]];
   case R_AARCH64_MOVW_UABS_G2_NC:
-    or32le(loc, (val & 0xFFFF00000000) >> 27);
+    writeMaskedBits32le(loc, (val & 0xFFFF00000000) >> 27,
+                        0xFFFF00000000 >> 27);
     break;
   case R_AARCH64_MOVW_UABS_G3:
-    or32le(loc, (val & 0xFFFF000000000000) >> 43);
+    writeMaskedBits32le(loc, (val & 0xFFFF000000000000) >> 43,
+                        0xFFFF000000000000 >> 43);
     break;
   case R_AARCH64_MOVW_PREL_G0:
   case R_AARCH64_MOVW_SABS_G0:
@@ -562,15 +617,15 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
     break;
   case R_AARCH64_TSTBR14:
     checkInt(loc, val, 16, rel);
-    or32le(loc, (val & 0xFFFC) << 3);
+    writeMaskedBits32le(loc, (val & 0xFFFC) << 3, 0xFFFC << 3);
     break;
   case R_AARCH64_TLSLE_ADD_TPREL_HI12:
     checkUInt(loc, val, 24, rel);
-    or32AArch64Imm(loc, val >> 12);
+    write32Imm12(loc, val >> 12);
     break;
   case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
   case R_AARCH64_TLSDESC_ADD_LO12:
-    or32AArch64Imm(loc, val);
+    write32Imm12(loc, val);
     break;
   case R_AARCH64_TLSDESC:
     // For R_AARCH64_TLSDESC the addend is stored in the second 64-bit word.
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index c6ee73f23d471..db0bc6c760096 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/LEB128.h"
 
 using namespace llvm;
@@ -407,7 +408,9 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
   case R_LARCH_TLS_TPREL32:
   case R_LARCH_TLS_TPREL64:
   case R_LARCH_TLS_LE_HI20:
+  case R_LARCH_TLS_LE_HI20_R:
   case R_LARCH_TLS_LE_LO12:
+  case R_LARCH_TLS_LE_LO12_R:
   case R_LARCH_TLS_LE64_LO20:
   case R_LARCH_TLS_LE64_HI12:
     return R_TPREL;
@@ -490,6 +493,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
     return R_TLSLD_GOT;
   case R_LARCH_TLS_GD_HI20:
     return R_TLSGD_GOT;
+  case R_LARCH_TLS_LE_ADD_R:
   case R_LARCH_RELAX:
     return config->relax ? R_RELAX_HINT : R_NONE;
   case R_LARCH_ALIGN:
@@ -507,6 +511,12 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
     return R_TLSDESC;
   case R_LARCH_TLS_DESC_CALL:
     return R_TLSDESC_CALL;
+  case R_LARCH_TLS_LD_PCREL20_S2:
+    return R_TLSLD_PC;
+  case R_LARCH_TLS_GD_PCREL20_S2:
+    return R_TLSGD_PC;
+  case R_LARCH_TLS_DESC_PCREL20_S2:
+    return R_TLSDESC_PC;
 
   // Other known relocs that are explicitly unimplemented:
   //
@@ -553,7 +563,11 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     write64le(loc, val);
     return;
 
+  // Relocs intended for `pcaddi`.
   case R_LARCH_PCREL20_S2:
+  case R_LARCH_TLS_LD_PCREL20_S2:
+  case R_LARCH_TLS_GD_PCREL20_S2:
+  case R_LARCH_TLS_DESC_PCREL20_S2:
     checkInt(loc, val, 22, rel);
     checkAlignment(loc, val, 4, rel);
     write32le(loc, setJ20(read32le(loc), val >> 2));
@@ -617,6 +631,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
   case R_LARCH_TLS_LE_LO12:
   case R_LARCH_TLS_IE_PC_LO12:
   case R_LARCH_TLS_IE_LO12:
+  case R_LARCH_TLS_LE_LO12_R:
   case R_LARCH_TLS_DESC_PC_LO12:
   case R_LARCH_TLS_DESC_LO12:
     write32le(loc, setK12(read32le(loc), extractBits(val, 11, 0)));
@@ -638,6 +653,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
   case R_LARCH_TLS_DESC_HI20:
     write32le(loc, setJ20(read32le(loc), extractBits(val, 31, 12)));
     return;
+  case R_LARCH_TLS_LE_HI20_R:
+    write32le(loc, setJ20(read32le(loc), extractBits(val + 0x800, 31, 12)));
+    return;
 
   // Relocs intended for `lu32i.d`.
   case R_LARCH_ABS64_LO20:
@@ -707,6 +725,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     // no-op
     return;
 
+  case R_LARCH_TLS_LE_ADD_R:
   case R_LARCH_RELAX:
     return; // Ignored (for now)
 
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 6af89ce3517b7..56759c28dcf41 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1131,6 +1131,7 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A6C:
       return;
     };
+    break;
 
   case RISCVAtomicAbiTag::A6S:
     switch (newTag) {
@@ -1144,6 +1145,7 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A6S:
       return;
     };
+    break;
 
   case RISCVAtomicAbiTag::A7:
     switch (newTag) {
@@ -1157,6 +1159,7 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A7:
       return;
     };
+    break;
   };
 
   // If we get here, then we have an invalid tag, so report it.
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 0173be396163e..183dc88a93e2f 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -27,6 +27,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/GlobPattern.h"
 #include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/TarWriter.h"
 #include <atomic>
 #include <memory>
 #include <optional>
@@ -332,6 +333,7 @@ struct Config {
   bool zPacPlt;
   bool zRelro;
   bool zRodynamic;
+  bool zSectionHeader;
   bool zShstk;
   bool zStartStopGC;
   uint8_t zStartStopVisibility;
@@ -458,6 +460,15 @@ struct ConfigWrapper {
 
 LLVM_LIBRARY_VISIBILITY extern ConfigWrapper config;
 
+// Some index properties of a symbol are stored separately in this auxiliary
+// struct to decrease sizeof(SymbolUnion) in the majority of cases.
+struct SymbolAux {
+  uint32_t gotIdx = -1;
+  uint32_t pltIdx = -1;
+  uint32_t tlsDescIdx = -1;
+  uint32_t tlsGdIdx = -1;
+};
+
 struct DuplicateSymbol {
   const Symbol *sym;
   const InputFile *file;
@@ -475,6 +486,8 @@ struct Ctx {
   SmallVector<BitcodeFile *, 0> lazyBitcodeFiles;
   SmallVector<InputSectionBase *, 0> inputSections;
   SmallVector<EhInputSection *, 0> ehInputSections;
+
+  SmallVector<SymbolAux, 0> symAux;
   // Duplicate symbol candidates.
   SmallVector<DuplicateSymbol, 0> duplicates;
   // Symbols in a non-prevailing COMDAT group which should be changed to an
@@ -489,6 +502,9 @@ struct Ctx {
                  std::pair<const InputFile *, const InputFile *>>
       backwardReferences;
   llvm::SmallSet<llvm::StringRef, 0> auxiliaryFiles;
+  // If --reproduce is specified, all input files are written to this tar
+  // archive.
+  std::unique_ptr<llvm::TarWriter> tar;
   // InputFile for linker created symbols with no source location.
   InputFile *internalFile;
   // True if SHT_LLVM_SYMPART is used.
diff --git a/lld/ELF/DWARF.cpp b/lld/ELF/DWARF.cpp
index 5d58e0c60a952..517d26810a378 100644
--- a/lld/ELF/DWARF.cpp
+++ b/lld/ELF/DWARF.cpp
@@ -136,7 +136,8 @@ template <class ELFT>
 std::optional<RelocAddrEntry>
 LLDDwarfObj<ELFT>::find(const llvm::DWARFSection &s, uint64_t pos) const {
   auto &sec = static_cast<const LLDDWARFSection &>(s);
-  const RelsOrRelas<ELFT> rels = sec.sec->template relsOrRelas<ELFT>();
+  const RelsOrRelas<ELFT> rels =
+      sec.sec->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
   if (rels.areRelocsRel())
     return findAux(*sec.sec, pos, rels.rels);
   return findAux(*sec.sec, pos, rels.relas);
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 40e095a133d95..a8c52e8c2b8e1 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -101,11 +101,14 @@ void Ctx::reset() {
   lazyBitcodeFiles.clear();
   inputSections.clear();
   ehInputSections.clear();
+
+  symAux.clear();
   duplicates.clear();
   nonPrevailingSyms.clear();
   whyExtractRecords.clear();
   backwardReferences.clear();
   auxiliaryFiles.clear();
+  tar.reset();
   internalFile = nullptr;
   hasSympart.store(false, std::memory_order_relaxed);
   hasTlsIe.store(false, std::memory_order_relaxed);
@@ -136,9 +139,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
     symtab = SymbolTable();
 
     outputSections.clear();
-    symAux.clear();
 
-    tar = nullptr;
     in.reset();
 
     partitions.clear();
@@ -153,7 +154,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
   config = ConfigWrapper();
   script = ScriptWrapper();
 
-  symAux.emplace_back();
+  elf::ctx.symAux.emplace_back();
 
   partitions.clear();
   partitions.emplace_back();
@@ -224,14 +225,15 @@ std::vector<std::pair<MemoryBufferRef, uint64_t>> static getArchiveMembers(
 
   std::vector<std::pair<MemoryBufferRef, uint64_t>> v;
   Error err = Error::success();
-  bool addToTar = file->isThin() && tar;
+  bool addToTar = file->isThin() && ctx.tar;
   for (const Archive::Child &c : file->children(err)) {
     MemoryBufferRef mbref =
         CHECK(c.getMemoryBufferRef(),
               mb.getBufferIdentifier() +
                   ": could not get the buffer for a child of the archive");
     if (addToTar)
-      tar->append(relativeToRoot(check(c.getFullName())), mbref.getBuffer());
+      ctx.tar->append(relativeToRoot(check(c.getFullName())),
+                      mbref.getBuffer());
     v.push_back(std::make_pair(mbref, c.getChildOffset()));
   }
   if (err)
@@ -445,6 +447,8 @@ static void checkOptions() {
       error("-r and --export-dynamic may not be used together");
     if (config->debugNames)
       error("-r and --debug-names may not be used together");
+    if (!config->zSectionHeader)
+      error("-r and -z nosectionheader may not be used together");
   }
 
   if (config->executeOnly) {
@@ -640,9 +644,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     Expected<std::unique_ptr<TarWriter>> errOrWriter =
         TarWriter::create(path, path::stem(path));
     if (errOrWriter) {
-      tar = std::move(*errOrWriter);
-      tar->append("response.txt", createResponseFile(args));
-      tar->append("version.txt", getLLDVersion() + "\n");
+      ctx.tar = std::move(*errOrWriter);
+      ctx.tar->append("response.txt", createResponseFile(args));
+      ctx.tar->append("version.txt", getLLDVersion() + "\n");
       StringRef ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile);
       if (!ltoSampleProfile.empty())
         readFile(ltoSampleProfile);
@@ -834,6 +838,8 @@ static ICFLevel getICF(opt::InputArgList &args) {
 static StripPolicy getStrip(opt::InputArgList &args) {
   if (args.hasArg(OPT_relocatable))
     return StripPolicy::None;
+  if (!config->zSectionHeader)
+    return StripPolicy::All;
 
   auto *arg = args.getLastArg(OPT_strip_all, OPT_strip_debug);
   if (!arg)
@@ -1409,7 +1415,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->soName = args.getLastArgValue(OPT_soname);
   config->sortSection = getSortSection(args);
   config->splitStackAdjustSize = args::getInteger(args, OPT_split_stack_adjust_size, 16384);
-  config->strip = getStrip(args);
+  config->zSectionHeader =
+      getZFlag(args, "sectionheader", "nosectionheader", true);
+  config->strip = getStrip(args); // needs zSectionHeader
   config->sysroot = args.getLastArgValue(OPT_sysroot);
   config->target1Rel = args.hasFlag(OPT_target1_rel, OPT_target1_abs, false);
   config->target2 = getTarget2(args);
@@ -1911,13 +1919,7 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
       hasInput = true;
       break;
     case OPT_defsym: {
-      StringRef from;
-      StringRef to;
-      std::tie(from, to) = StringRef(arg->getValue()).split('=');
-      if (from.empty() || to.empty())
-        error("--defsym: syntax error: " + StringRef(arg->getValue()));
-      else
-        readDefsym(from, MemoryBufferRef(to, "--defsym"));
+      readDefsym(MemoryBufferRef(arg->getValue(), "--defsym"));
       break;
     }
     case OPT_script:
diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp
index bfc605c793a92..44e8a71cc6286 100644
--- a/lld/ELF/ICF.cpp
+++ b/lld/ELF/ICF.cpp
@@ -103,12 +103,12 @@ template <class ELFT> class ICF {
   void segregate(size_t begin, size_t end, uint32_t eqClassBase, bool constant);
 
   template <class RelTy>
-  bool constantEq(const InputSection *a, ArrayRef<RelTy> relsA,
-                  const InputSection *b, ArrayRef<RelTy> relsB);
+  bool constantEq(const InputSection *a, Relocs<RelTy> relsA,
+                  const InputSection *b, Relocs<RelTy> relsB);
 
   template <class RelTy>
-  bool variableEq(const InputSection *a, ArrayRef<RelTy> relsA,
-                  const InputSection *b, ArrayRef<RelTy> relsB);
+  bool variableEq(const InputSection *a, Relocs<RelTy> relsA,
+                  const InputSection *b, Relocs<RelTy> relsB);
 
   bool equalsConstant(const InputSection *a, const InputSection *b);
   bool equalsVariable(const InputSection *a, const InputSection *b);
@@ -235,8 +235,8 @@ void ICF<ELFT>::segregate(size_t begin, size_t end, uint32_t eqClassBase,
 // Compare two lists of relocations.
 template <class ELFT>
 template <class RelTy>
-bool ICF<ELFT>::constantEq(const InputSection *secA, ArrayRef<RelTy> ra,
-                           const InputSection *secB, ArrayRef<RelTy> rb) {
+bool ICF<ELFT>::constantEq(const InputSection *secA, Relocs<RelTy> ra,
+                           const InputSection *secB, Relocs<RelTy> rb) {
   if (ra.size() != rb.size())
     return false;
   auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
@@ -324,6 +324,8 @@ bool ICF<ELFT>::equalsConstant(const InputSection *a, const InputSection *b) {
 
   const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>();
   const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>();
+  if (ra.areRelocsCrel())
+    return constantEq(a, ra.crels, b, rb.crels);
   return ra.areRelocsRel() || rb.areRelocsRel()
              ? constantEq(a, ra.rels, b, rb.rels)
              : constantEq(a, ra.relas, b, rb.relas);
@@ -333,8 +335,8 @@ bool ICF<ELFT>::equalsConstant(const InputSection *a, const InputSection *b) {
 // relocations point to the same section in terms of ICF.
 template <class ELFT>
 template <class RelTy>
-bool ICF<ELFT>::variableEq(const InputSection *secA, ArrayRef<RelTy> ra,
-                           const InputSection *secB, ArrayRef<RelTy> rb) {
+bool ICF<ELFT>::variableEq(const InputSection *secA, Relocs<RelTy> ra,
+                           const InputSection *secB, Relocs<RelTy> rb) {
   assert(ra.size() == rb.size());
 
   auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
@@ -374,6 +376,8 @@ template <class ELFT>
 bool ICF<ELFT>::equalsVariable(const InputSection *a, const InputSection *b) {
   const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>();
   const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>();
+  if (ra.areRelocsCrel())
+    return variableEq(a, ra.crels, b, rb.crels);
   return ra.areRelocsRel() || rb.areRelocsRel()
              ? variableEq(a, ra.rels, b, rb.rels)
              : variableEq(a, ra.relas, b, rb.relas);
@@ -441,7 +445,7 @@ void ICF<ELFT>::forEachClass(llvm::function_ref<void(size_t, size_t)> fn) {
 // hash.
 template <class RelTy>
 static void combineRelocHashes(unsigned cnt, InputSection *isec,
-                               ArrayRef<RelTy> rels) {
+                               Relocs<RelTy> rels) {
   uint32_t hash = isec->eqClass[cnt % 2];
   for (RelTy rel : rels) {
     Symbol &s = isec->file->getRelocTargetSym(rel);
@@ -505,7 +509,9 @@ template <class ELFT> void ICF<ELFT>::run() {
   for (unsigned cnt = 0; cnt != 2; ++cnt) {
     parallelForEach(sections, [&](InputSection *s) {
       const RelsOrRelas<ELFT> rels = s->template relsOrRelas<ELFT>();
-      if (rels.areRelocsRel())
+      if (rels.areRelocsCrel())
+        combineRelocHashes(cnt, s, rels.crels);
+      else if (rels.areRelocsRel())
         combineRelocHashes(cnt, s, rels.rels);
       else
         combineRelocHashes(cnt, s, rels.relas);
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 03ff4eadfe670..0e4ba06e9b780 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -28,7 +28,6 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/RISCVAttributeParser.h"
-#include "llvm/Support/TarWriter.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -52,8 +51,6 @@ extern template void ObjFile<ELF64BE>::importCmseSymbols();
 bool InputFile::isInGroup;
 uint32_t InputFile::nextGroupId;
 
-std::unique_ptr<TarWriter> elf::tar;
-
 // Returns "<internal>", "foo.a(bar.o)" or "baz.o".
 std::string lld::toString(const InputFile *f) {
   static std::mutex mu;
@@ -261,8 +258,8 @@ std::optional<MemoryBufferRef> elf::readFile(StringRef path) {
   MemoryBufferRef mbref = (*mbOrErr)->getMemBufferRef();
   ctx.memoryBuffers.push_back(std::move(*mbOrErr)); // take MB ownership
 
-  if (tar)
-    tar->append(relativeToRoot(path), mbref.getBuffer());
+  if (ctx.tar)
+    ctx.tar->append(relativeToRoot(path), mbref.getBuffer());
   return mbref;
 }
 
@@ -834,6 +831,7 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
     case SHT_STRTAB:
     case SHT_REL:
     case SHT_RELA:
+    case SHT_CREL:
     case SHT_NULL:
       break;
     case SHT_PROGBITS:
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 0617f41e1e13a..b0a74877d0aae 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -39,9 +39,6 @@ namespace elf {
 class InputSection;
 class Symbol;
 
-// If --reproduce is specified, all input files are written to this tar archive.
-extern std::unique_ptr<llvm::TarWriter> tar;
-
 // Opens a given file.
 std::optional<MemoryBufferRef> readFile(StringRef path);
 
@@ -84,6 +81,7 @@ class InputFile {
     assert(fileKind == ObjKind || fileKind == BinaryKind);
     return sections;
   }
+  void cacheDecodedCrel(size_t i, InputSectionBase *s) { sections[i] = s; }
 
   // Returns object file symbols. It is a runtime error to call this
   // function on files of other types.
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 12ab1f1eac808..570e485455bad 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -133,21 +133,56 @@ void InputSectionBase::decompress() const {
   compressed = false;
 }
 
-template <class ELFT> RelsOrRelas<ELFT> InputSectionBase::relsOrRelas() const {
+template <class ELFT>
+RelsOrRelas<ELFT> InputSectionBase::relsOrRelas(bool supportsCrel) const {
   if (relSecIdx == 0)
     return {};
   RelsOrRelas<ELFT> ret;
-  typename ELFT::Shdr shdr =
-      cast<ELFFileBase>(file)->getELFShdrs<ELFT>()[relSecIdx];
+  auto *f = cast<ObjFile<ELFT>>(file);
+  typename ELFT::Shdr shdr = f->template getELFShdrs<ELFT>()[relSecIdx];
+  if (shdr.sh_type == SHT_CREL) {
+    // Return an iterator if supported by caller.
+    if (supportsCrel) {
+      ret.crels = Relocs<typename ELFT::Crel>(
+          (const uint8_t *)f->mb.getBufferStart() + shdr.sh_offset);
+      return ret;
+    }
+    InputSectionBase *const &relSec = f->getSections()[relSecIdx];
+    // Otherwise, allocate a buffer to hold the decoded RELA relocations. When
+    // called for the first time, relSec is null (without --emit-relocs) or an
+    // InputSection with zero eqClass[0].
+    if (!relSec || !cast<InputSection>(relSec)->eqClass[0]) {
+      auto *sec = makeThreadLocal<InputSection>(*f, shdr, name);
+      f->cacheDecodedCrel(relSecIdx, sec);
+      sec->type = SHT_RELA;
+      sec->eqClass[0] = SHT_RELA;
+
+      RelocsCrel<ELFT::Is64Bits> entries(sec->content_);
+      sec->size = entries.size() * sizeof(typename ELFT::Rela);
+      auto *relas = makeThreadLocalN<typename ELFT::Rela>(entries.size());
+      sec->content_ = reinterpret_cast<uint8_t *>(relas);
+      for (auto [i, r] : llvm::enumerate(entries)) {
+        relas[i].r_offset = r.r_offset;
+        relas[i].setSymbolAndType(r.r_symidx, r.r_type, false);
+        relas[i].r_addend = r.r_addend;
+      }
+    }
+    ret.relas = {ArrayRef(
+        reinterpret_cast<const typename ELFT::Rela *>(relSec->content_),
+        relSec->size / sizeof(typename ELFT::Rela))};
+    return ret;
+  }
+
+  const void *content = f->mb.getBufferStart() + shdr.sh_offset;
+  size_t size = shdr.sh_size;
   if (shdr.sh_type == SHT_REL) {
-    ret.rels = ArrayRef(reinterpret_cast<const typename ELFT::Rel *>(
-                            file->mb.getBufferStart() + shdr.sh_offset),
-                        shdr.sh_size / sizeof(typename ELFT::Rel));
+    ret.rels = {ArrayRef(reinterpret_cast<const typename ELFT::Rel *>(content),
+                         size / sizeof(typename ELFT::Rel))};
   } else {
     assert(shdr.sh_type == SHT_RELA);
-    ret.relas = ArrayRef(reinterpret_cast<const typename ELFT::Rela *>(
-                             file->mb.getBufferStart() + shdr.sh_offset),
-                         shdr.sh_size / sizeof(typename ELFT::Rela));
+    ret.relas = {
+        ArrayRef(reinterpret_cast<const typename ELFT::Rela *>(content),
+                 size / sizeof(typename ELFT::Rela))};
   }
   return ret;
 }
@@ -911,7 +946,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
 // So, we handle relocations for non-alloc sections directly in this
 // function as a performance optimization.
 template <class ELFT, class RelTy>
-void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
+void InputSection::relocateNonAlloc(uint8_t *buf, Relocs<RelTy> rels) {
   const unsigned bits = sizeof(typename ELFT::uint) * 8;
   const TargetInfo &target = *elf::target;
   const auto emachine = config->emachine;
@@ -1073,11 +1108,7 @@ void InputSectionBase::relocate(uint8_t *buf, uint8_t *bufEnd) {
   auto *sec = cast<InputSection>(this);
   // For a relocatable link, also call relocateNonAlloc() to rewrite applicable
   // locations with tombstone values.
-  const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
-  if (rels.areRelocsRel())
-    sec->relocateNonAlloc<ELFT>(buf, rels.rels);
-  else
-    sec->relocateNonAlloc<ELFT>(buf, rels.relas);
+  invokeOnRelocs(*sec, sec->relocateNonAlloc<ELFT>, buf);
 }
 
 // For each function-defining prologue, find any calls to __morestack,
@@ -1252,7 +1283,7 @@ SyntheticSection *EhInputSection::getParent() const {
 // .eh_frame is a sequence of CIE or FDE records.
 // This function splits an input section into records and returns them.
 template <class ELFT> void EhInputSection::split() {
-  const RelsOrRelas<ELFT> rels = relsOrRelas<ELFT>();
+  const RelsOrRelas<ELFT> rels = relsOrRelas<ELFT>(/*supportsCrel=*/false);
   // getReloc expects the relocations to be sorted by r_offset. See the comment
   // in scanRelocs.
   if (rels.areRelocsRel()) {
@@ -1418,10 +1449,14 @@ template void InputSection::writeTo<ELF32BE>(uint8_t *);
 template void InputSection::writeTo<ELF64LE>(uint8_t *);
 template void InputSection::writeTo<ELF64BE>(uint8_t *);
 
-template RelsOrRelas<ELF32LE> InputSectionBase::relsOrRelas<ELF32LE>() const;
-template RelsOrRelas<ELF32BE> InputSectionBase::relsOrRelas<ELF32BE>() const;
-template RelsOrRelas<ELF64LE> InputSectionBase::relsOrRelas<ELF64LE>() const;
-template RelsOrRelas<ELF64BE> InputSectionBase::relsOrRelas<ELF64BE>() const;
+template RelsOrRelas<ELF32LE>
+InputSectionBase::relsOrRelas<ELF32LE>(bool) const;
+template RelsOrRelas<ELF32BE>
+InputSectionBase::relsOrRelas<ELF32BE>(bool) const;
+template RelsOrRelas<ELF64LE>
+InputSectionBase::relsOrRelas<ELF64LE>(bool) const;
+template RelsOrRelas<ELF64BE>
+InputSectionBase::relsOrRelas<ELF64BE>(bool) const;
 
 template MergeInputSection::MergeInputSection(ObjFile<ELF32LE> &,
                                               const ELF32LE::Shdr &, StringRef);
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index ec12235f842a9..6659530a9c9c2 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -35,13 +35,26 @@ class OutputSection;
 
 LLVM_LIBRARY_VISIBILITY extern std::vector<Partition> partitions;
 
-// Returned by InputSectionBase::relsOrRelas. At least one member is empty.
+// Returned by InputSectionBase::relsOrRelas. At most one member is empty.
 template <class ELFT> struct RelsOrRelas {
-  ArrayRef<typename ELFT::Rel> rels;
-  ArrayRef<typename ELFT::Rela> relas;
+  Relocs<typename ELFT::Rel> rels;
+  Relocs<typename ELFT::Rela> relas;
+  Relocs<typename ELFT::Crel> crels;
   bool areRelocsRel() const { return rels.size(); }
+  bool areRelocsCrel() const { return crels.size(); }
 };
 
+#define invokeOnRelocs(sec, f, ...)                                            \
+  {                                                                            \
+    const RelsOrRelas<ELFT> rs = (sec).template relsOrRelas<ELFT>();           \
+    if (rs.areRelocsCrel())                                                    \
+      f(__VA_ARGS__, rs.crels);                                                \
+    else if (rs.areRelocsRel())                                                \
+      f(__VA_ARGS__, rs.rels);                                                 \
+    else                                                                       \
+      f(__VA_ARGS__, rs.relas);                                                \
+  }
+
 // This is the base class of all sections that lld handles. Some are sections in
 // input files, some are sections in the produced output file and some exist
 // just as a convenience for implementing special ways of combining some
@@ -200,7 +213,8 @@ class InputSectionBase : public SectionBase {
   // used by --gc-sections.
   InputSectionBase *nextInSectionGroup = nullptr;
 
-  template <class ELFT> RelsOrRelas<ELFT> relsOrRelas() const;
+  template <class ELFT>
+  RelsOrRelas<ELFT> relsOrRelas(bool supportsCrel = true) const;
 
   // InputSections that are dependent on us (reverse dependency for GC)
   llvm::TinyPtrVector<InputSection *> dependentSections;
@@ -407,7 +421,7 @@ class InputSection : public InputSectionBase {
   InputSectionBase *getRelocatedSection() const;
 
   template <class ELFT, class RelTy>
-  void relocateNonAlloc(uint8_t *buf, llvm::ArrayRef<RelTy> rels);
+  void relocateNonAlloc(uint8_t *buf, Relocs<RelTy> rels);
 
   // Points to the canonical section. If ICF folds two sections, repl pointer of
   // one section points to the other.
@@ -474,7 +488,8 @@ class SyntheticSection : public InputSection {
 };
 
 inline bool isStaticRelSecType(uint32_t type) {
-  return type == llvm::ELF::SHT_RELA || type == llvm::ELF::SHT_REL;
+  return type == llvm::ELF::SHT_RELA || type == llvm::ELF::SHT_CREL ||
+         type == llvm::ELF::SHT_REL;
 }
 
 inline bool isDebugSection(const InputSectionBase &sec) {
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index e2208da18dce0..055fa21d44ca6 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -61,6 +61,8 @@ static StringRef getOutputSectionName(const InputSectionBase *s) {
         assert(config->relocatable && (rel->flags & SHF_LINK_ORDER));
         return s->name;
       }
+      if (s->type == SHT_CREL)
+        return saver().save(".crel" + out->name);
       if (s->type == SHT_RELA)
         return saver().save(".rela" + out->name);
       return saver().save(".rel" + out->name);
diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
index 45431e44a6c8c..16e5883c2002c 100644
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -85,6 +85,13 @@ static uint64_t getAddend(InputSectionBase &sec,
   return rel.r_addend;
 }
 
+// Currently, we assume all input CREL relocations have an explicit addend.
+template <class ELFT>
+static uint64_t getAddend(InputSectionBase &sec,
+                          const typename ELFT::Crel &rel) {
+  return rel.r_addend;
+}
+
 template <class ELFT>
 template <class RelTy>
 void MarkLive<ELFT>::resolveReloc(InputSectionBase &sec, RelTy &rel,
@@ -239,7 +246,8 @@ template <class ELFT> void MarkLive<ELFT>::run() {
   // all of them. We also want to preserve personality routines and LSDA
   // referenced by .eh_frame sections, so we scan them for that here.
   for (EhInputSection *eh : ctx.ehInputSections) {
-    const RelsOrRelas<ELFT> rels = eh->template relsOrRelas<ELFT>();
+    const RelsOrRelas<ELFT> rels =
+        eh->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
     if (rels.areRelocsRel())
       scanEhFrameSection(*eh, rels.rels);
     else if (rels.relas.size())
@@ -310,6 +318,8 @@ template <class ELFT> void MarkLive<ELFT>::mark() {
       resolveReloc(sec, rel, false);
     for (const typename ELFT::Rela &rel : rels.relas)
       resolveReloc(sec, rel, false);
+    for (const typename ELFT::Crel &rel : rels.crels)
+      resolveReloc(sec, rel, false);
 
     for (InputSectionBase *isec : sec.dependentSections)
       enqueue(isec, 0);
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 60de10061c53d..29f18f89274f3 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -18,6 +18,7 @@
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Config/llvm-config.h" // LLVM_ENABLE_ZLIB
 #include "llvm/Support/Compression.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TimeProfiler.h"
@@ -115,7 +116,19 @@ void OutputSection::recordSection(InputSectionBase *isec) {
 // other InputSections.
 void OutputSection::commitSection(InputSection *isec) {
   if (LLVM_UNLIKELY(type != isec->type)) {
-    if (hasInputSections || typeIsSet) {
+    if (!hasInputSections && !typeIsSet) {
+      type = isec->type;
+    } else if (isStaticRelSecType(type) && isStaticRelSecType(isec->type) &&
+               (type == SHT_CREL) != (isec->type == SHT_CREL)) {
+      // Combine mixed SHT_REL[A] and SHT_CREL to SHT_CREL.
+      type = SHT_CREL;
+      if (type == SHT_REL) {
+        if (name.consume_front(".rel"))
+          name = saver().save(".crel" + name);
+      } else if (name.consume_front(".rela")) {
+        name = saver().save(".crel" + name);
+      }
+    } else {
       if (typeIsSet || !canMergeToProgbits(type) ||
           !canMergeToProgbits(isec->type)) {
         // The (NOLOAD) changes the section type to SHT_NOBITS, the intention is
@@ -133,8 +146,6 @@ void OutputSection::commitSection(InputSection *isec) {
       }
       if (!typeIsSet)
         type = SHT_PROGBITS;
-    } else {
-      type = isec->type;
     }
   }
   if (!hasInputSections) {
@@ -470,6 +481,11 @@ void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;
+  if (type == SHT_CREL && !(flags & SHF_ALLOC)) {
+    buf += encodeULEB128(crelHeader, buf);
+    memcpy(buf, crelBody.data(), crelBody.size());
+    return;
+  }
 
   // If the section is compressed due to
   // --compress-debug-section/--compress-sections, the content is already known.
@@ -505,6 +521,12 @@ void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
 
+  if (type == SHT_CREL && !(flags & SHF_ALLOC)) {
+    buf += encodeULEB128(crelHeader, buf);
+    memcpy(buf, crelBody.data(), crelBody.size());
+    return;
+  }
+
   auto fn = [=](size_t begin, size_t end) {
     size_t numSections = sections.size();
     for (size_t i = begin; i != end; ++i) {
@@ -592,6 +614,103 @@ static void finalizeShtGroup(OutputSection *os, InputSection *section) {
   os->size = (1 + seen.size()) * sizeof(uint32_t);
 }
 
+template <class uint>
+LLVM_ATTRIBUTE_ALWAYS_INLINE static void
+encodeOneCrel(raw_svector_ostream &os, Elf_Crel<sizeof(uint) == 8> &out,
+              uint offset, const Symbol &sym, uint32_t type, uint addend) {
+  const auto deltaOffset = static_cast<uint64_t>(offset - out.r_offset);
+  out.r_offset = offset;
+  int64_t symidx = in.symTab->getSymbolIndex(sym);
+  if (sym.type == STT_SECTION) {
+    auto *d = dyn_cast<Defined>(&sym);
+    if (d) {
+      SectionBase *section = d->section;
+      assert(section->isLive());
+      addend = sym.getVA(addend) - section->getOutputSection()->addr;
+    } else {
+      // Encode R_*_NONE(symidx=0).
+      symidx = type = addend = 0;
+    }
+  }
+
+  // Similar to llvm::ELF::encodeCrel.
+  uint8_t b = deltaOffset * 8 + (out.r_symidx != symidx) +
+              (out.r_type != type ? 2 : 0) +
+              (uint(out.r_addend) != addend ? 4 : 0);
+  if (deltaOffset < 0x10) {
+    os << char(b);
+  } else {
+    os << char(b | 0x80);
+    encodeULEB128(deltaOffset >> 4, os);
+  }
+  if (b & 1) {
+    encodeSLEB128(static_cast<int32_t>(symidx - out.r_symidx), os);
+    out.r_symidx = symidx;
+  }
+  if (b & 2) {
+    encodeSLEB128(static_cast<int32_t>(type - out.r_type), os);
+    out.r_type = type;
+  }
+  if (b & 4) {
+    encodeSLEB128(std::make_signed_t<uint>(addend - out.r_addend), os);
+    out.r_addend = addend;
+  }
+}
+
+template <class ELFT>
+static size_t relToCrel(raw_svector_ostream &os, Elf_Crel<ELFT::Is64Bits> &out,
+                        InputSection *relSec, InputSectionBase *sec) {
+  const auto &file = *cast<ELFFileBase>(relSec->file);
+  if (relSec->type == SHT_REL) {
+    // REL conversion is complex and unsupported yet.
+    errorOrWarn(toString(relSec) + ": REL cannot be converted to CREL");
+    return 0;
+  }
+  auto rels = relSec->getDataAs<typename ELFT::Rela>();
+  for (auto rel : rels) {
+    encodeOneCrel<typename ELFT::uint>(
+        os, out, sec->getVA(rel.r_offset), file.getRelocTargetSym(rel),
+        rel.getType(config->isMips64EL), getAddend<ELFT>(rel));
+  }
+  return rels.size();
+}
+
+// Compute the content of a non-alloc CREL section due to -r or --emit-relocs.
+// Input CREL sections are decoded while REL[A] need to be converted.
+template <bool is64> void OutputSection::finalizeNonAllocCrel() {
+  using uint = typename Elf_Crel_Impl<is64>::uint;
+  raw_svector_ostream os(crelBody);
+  uint64_t totalCount = 0;
+  Elf_Crel<is64> out{};
+  assert(commands.size() == 1);
+  auto *isd = cast<InputSectionDescription>(commands[0]);
+  for (InputSection *relSec : isd->sections) {
+    const auto &file = *cast<ELFFileBase>(relSec->file);
+    InputSectionBase *sec = relSec->getRelocatedSection();
+    if (relSec->type == SHT_CREL) {
+      RelocsCrel<is64> entries(relSec->content_);
+      totalCount += entries.size();
+      for (Elf_Crel_Impl<is64> r : entries) {
+        encodeOneCrel<uint>(os, out, uint(sec->getVA(r.r_offset)),
+                            file.getSymbol(r.r_symidx), r.r_type, r.r_addend);
+      }
+      continue;
+    }
+
+    // Convert REL[A] to CREL.
+    if constexpr (is64) {
+      totalCount += config->isLE ? relToCrel<ELF64LE>(os, out, relSec, sec)
+                                 : relToCrel<ELF64BE>(os, out, relSec, sec);
+    } else {
+      totalCount += config->isLE ? relToCrel<ELF32LE>(os, out, relSec, sec)
+                                 : relToCrel<ELF32BE>(os, out, relSec, sec);
+    }
+  }
+
+  crelHeader = totalCount * 8 + 4;
+  size = getULEB128Size(crelHeader) + crelBody.size();
+}
+
 void OutputSection::finalize() {
   InputSection *first = getFirstInputSection(this);
 
@@ -628,6 +747,13 @@ void OutputSection::finalize() {
   InputSectionBase *s = first->getRelocatedSection();
   info = s->getOutputSection()->sectionIndex;
   flags |= SHF_INFO_LINK;
+  // Finalize the content of non-alloc CREL.
+  if (type == SHT_CREL) {
+    if (config->is64)
+      finalizeNonAllocCrel<true>();
+    else
+      finalizeNonAllocCrel<false>();
+  }
 }
 
 // Returns true if S is in one of the many forms the compiler driver may pass
diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
index 78fede48a23f2..8c0c52f34ac9f 100644
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -84,6 +84,11 @@ class OutputSection final : public SectionBase {
   Expr alignExpr;
   Expr lmaExpr;
   Expr subalignExpr;
+
+  // Used by non-alloc SHT_CREL to hold the header and content byte stream.
+  uint64_t crelHeader = 0;
+  SmallVector<char, 0> crelBody;
+
   SmallVector<SectionCommand *, 0> commands;
   SmallVector<StringRef, 0> phdrs;
   std::optional<std::array<uint8_t, 4>> filler;
@@ -106,6 +111,7 @@ class OutputSection final : public SectionBase {
   // DATA_RELRO_END.
   bool relro = false;
 
+  template <bool is64> void finalizeNonAllocCrel();
   void finalize();
   template <class ELFT>
   void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 36857d72c647e..1c0dd97ed3910 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -475,8 +475,9 @@ class RelocationScanner {
                                 uint64_t relOff) const;
   void processAux(RelExpr expr, RelType type, uint64_t offset, Symbol &sym,
                   int64_t addend) const;
-  template <class ELFT, class RelTy> void scanOne(RelTy *&i);
-  template <class ELFT, class RelTy> void scan(ArrayRef<RelTy> rels);
+  template <class ELFT, class RelTy>
+  void scanOne(typename Relocs<RelTy>::const_iterator &i);
+  template <class ELFT, class RelTy> void scan(Relocs<RelTy> rels);
 };
 } // namespace
 
@@ -1308,7 +1309,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
   // LoongArch does not yet implement transition from TLSDESC to LE/IE, so
   // generate TLSDESC dynamic relocation for the dynamic linker to handle.
   if (config->emachine == EM_LOONGARCH &&
-      oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_CALL>(expr)) {
+      oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
+            R_TLSDESC_CALL>(expr)) {
     if (expr != R_TLSDESC_CALL) {
       sym.setFlags(NEEDS_TLSDESC);
       c.addReloc({expr, type, offset, addend, &sym});
@@ -1433,15 +1435,17 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
   return 0;
 }
 
-template <class ELFT, class RelTy> void RelocationScanner::scanOne(RelTy *&i) {
+template <class ELFT, class RelTy>
+void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) {
   const RelTy &rel = *i;
   uint32_t symIndex = rel.getSymbol(config->isMips64EL);
   Symbol &sym = sec->getFile<ELFT>()->getSymbol(symIndex);
   RelType type;
-  if constexpr (ELFT::Is64Bits) {
+  if constexpr (ELFT::Is64Bits || RelTy::IsCrel) {
     type = rel.getType(config->isMips64EL);
     ++i;
   } else {
+    // CREL is unsupported for MIPS N32.
     if (config->mipsN32Abi) {
       type = getMipsN32RelType(i);
     } else {
@@ -1494,15 +1498,18 @@ template <class ELFT, class RelTy> void RelocationScanner::scanOne(RelTy *&i) {
 
     if ((type == R_PPC64_TLSGD && expr == R_TLSDESC_CALL) ||
         (type == R_PPC64_TLSLD && expr == R_TLSLD_HINT)) {
-      if (i == end) {
-        errorOrWarn("R_PPC64_TLSGD/R_PPC64_TLSLD may not be the last "
-                    "relocation" +
-                    getLocation(*sec, sym, offset));
-        return;
+      // Skip the error check for CREL, which does not set `end`.
+      if constexpr (!RelTy::IsCrel) {
+        if (i == end) {
+          errorOrWarn("R_PPC64_TLSGD/R_PPC64_TLSLD may not be the last "
+                      "relocation" +
+                      getLocation(*sec, sym, offset));
+          return;
+        }
       }
 
-      // Offset the 4-byte aligned R_PPC64_TLSGD by one byte in the NOTOC case,
-      // so we can discern it later from the toc-case.
+      // Offset the 4-byte aligned R_PPC64_TLSGD by one byte in the NOTOC
+      // case, so we can discern it later from the toc-case.
       if (i->getType(/*isMips64EL=*/false) == R_PPC64_REL24_NOTOC)
         ++offset;
     }
@@ -1542,7 +1549,7 @@ template <class ELFT, class RelTy> void RelocationScanner::scanOne(RelTy *&i) {
 // instructions are generated by very old IBM XL compilers. Work around the
 // issue by disabling GD/LD to IE/LE relaxation.
 template <class RelTy>
-static void checkPPC64TLSRelax(InputSectionBase &sec, ArrayRef<RelTy> rels) {
+static void checkPPC64TLSRelax(InputSectionBase &sec, Relocs<RelTy> rels) {
   // Skip if sec is synthetic (sec.file is null) or if sec has been marked.
   if (!sec.file || sec.file->ppc64DisableTLSRelax)
     return;
@@ -1574,7 +1581,7 @@ static void checkPPC64TLSRelax(InputSectionBase &sec, ArrayRef<RelTy> rels) {
 }
 
 template <class ELFT, class RelTy>
-void RelocationScanner::scan(ArrayRef<RelTy> rels) {
+void RelocationScanner::scan(Relocs<RelTy> rels) {
   // Not all relocations end up in Sec->Relocations, but a lot do.
   sec->relocations.reserve(rels.size());
 
@@ -1590,9 +1597,15 @@ void RelocationScanner::scan(ArrayRef<RelTy> rels) {
   if (isa<EhInputSection>(sec) || config->emachine == EM_S390)
     rels = sortRels(rels, storage);
 
-  end = static_cast<const void *>(rels.end());
-  for (auto i = rels.begin(); i != end;)
-    scanOne<ELFT>(i);
+  if constexpr (RelTy::IsCrel) {
+    for (auto i = rels.begin(); i != rels.end();)
+      scanOne<ELFT, RelTy>(i);
+  } else {
+    // The non-CREL code path has additional check for PPC64 TLS.
+    end = static_cast<const void *>(rels.end());
+    for (auto i = rels.begin(); i != end;)
+      scanOne<ELFT, RelTy>(i);
+  }
 
   // Sort relocations by offset for more efficient searching for
   // R_RISCV_PCREL_HI20 and R_PPC64_ADDR64.
@@ -1608,7 +1621,9 @@ template <class ELFT> void RelocationScanner::scanSection(InputSectionBase &s) {
   sec = &s;
   getter = OffsetGetter(s);
   const RelsOrRelas<ELFT> rels = s.template relsOrRelas<ELFT>();
-  if (rels.areRelocsRel())
+  if (rels.areRelocsCrel())
+    scan<ELFT>(rels.crels);
+  else if (rels.areRelocsRel())
     scan<ELFT>(rels.rels);
   else
     scan<ELFT>(rels.relas);
@@ -1713,7 +1728,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) {
   auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn;
   addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym);
   sym.allocateAux();
-  symAux.back().pltIdx = symAux[directSym->auxIdx].pltIdx;
+  ctx.symAux.back().pltIdx = ctx.symAux[directSym->auxIdx].pltIdx;
 
   if (flags & HAS_DIRECT_RELOC) {
     // Change the value to the IPLT and redirect all references to it.
@@ -1831,7 +1846,7 @@ void elf::postScanRelocations() {
           {R_ADDEND, target->symbolicRel, got->getTlsIndexOff(), 1, &dummy});
   }
 
-  assert(symAux.size() == 1);
+  assert(ctx.symAux.size() == 1);
   for (Symbol *sym : symtab.getSymbols())
     fn(*sym);
 
@@ -2409,11 +2424,7 @@ template <class ELFT> void elf::checkNoCrossRefs() {
         if (!isd)
           continue;
         parallelForEach(isd->sections, [&](InputSection *sec) {
-          const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
-          if (rels.areRelocsRel())
-            scanCrossRefs<ELFT>(noxref, osec, sec, rels.rels);
-          else
-            scanCrossRefs<ELFT>(noxref, osec, sec, rels.relas);
+          invokeOnRelocs(*sec, scanCrossRefs<ELFT>, noxref, osec, sec);
         });
       }
     }
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 1bee0dedf8587..aaa4581490a28 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -12,6 +12,7 @@
 #include "lld/Common/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Object/ELFTypes.h"
 #include <vector>
 
 namespace lld::elf {
@@ -205,6 +206,91 @@ class ThunkCreator {
   uint32_t pass = 0;
 };
 
+// Decode LEB128 without error checking. Only used by performance critical code
+// like RelocsCrel.
+inline uint64_t readLEB128(const uint8_t *&p, uint64_t leb) {
+  uint64_t acc = 0, shift = 0, byte;
+  do {
+    byte = *p++;
+    acc |= (byte - 128 * (byte >= leb)) << shift;
+    shift += 7;
+  } while (byte >= 128);
+  return acc;
+}
+inline uint64_t readULEB128(const uint8_t *&p) { return readLEB128(p, 128); }
+inline int64_t readSLEB128(const uint8_t *&p) { return readLEB128(p, 64); }
+
+// This class implements a CREL iterator that does not allocate extra memory.
+template <bool is64> struct RelocsCrel {
+  using uint = std::conditional_t<is64, uint64_t, uint32_t>;
+  struct const_iterator {
+    using iterator_category = std::forward_iterator_tag;
+    using value_type = llvm::object::Elf_Crel_Impl<is64>;
+    using difference_type = ptrdiff_t;
+    using pointer = value_type *;
+    using reference = const value_type &;
+    uint32_t count;
+    uint8_t flagBits, shift;
+    const uint8_t *p;
+    llvm::object::Elf_Crel_Impl<is64> crel{};
+    const_iterator(size_t hdr, const uint8_t *p)
+        : count(hdr / 8), flagBits(hdr & 4 ? 3 : 2), shift(hdr % 4), p(p) {
+      if (count)
+        step();
+    }
+    void step() {
+      // See object::decodeCrel.
+      const uint8_t b = *p++;
+      crel.r_offset += b >> flagBits << shift;
+      if (b >= 0x80)
+        crel.r_offset +=
+            ((readULEB128(p) << (7 - flagBits)) - (0x80 >> flagBits)) << shift;
+      if (b & 1)
+        crel.r_symidx += readSLEB128(p);
+      if (b & 2)
+        crel.r_type += readSLEB128(p);
+      if (b & 4 && flagBits == 3)
+        crel.r_addend += static_cast<uint>(readSLEB128(p));
+    }
+    llvm::object::Elf_Crel_Impl<is64> operator*() const { return crel; };
+    const llvm::object::Elf_Crel_Impl<is64> *operator->() const {
+      return &crel;
+    }
+    // For llvm::enumerate.
+    bool operator==(const const_iterator &r) const { return count == r.count; }
+    bool operator!=(const const_iterator &r) const { return count != r.count; }
+    const_iterator &operator++() {
+      if (--count)
+        step();
+      return *this;
+    }
+    // For RelocationScanner::scanOne.
+    void operator+=(size_t n) {
+      for (; n; --n)
+        operator++();
+    }
+  };
+
+  size_t hdr = 0;
+  const uint8_t *p = nullptr;
+
+  constexpr RelocsCrel() = default;
+  RelocsCrel(const uint8_t *p) : hdr(readULEB128(p)) { this->p = p; }
+  size_t size() const { return hdr / 8; }
+  const_iterator begin() const { return {hdr, p}; }
+  const_iterator end() const { return {0, nullptr}; }
+};
+
+template <class RelTy> struct Relocs : ArrayRef<RelTy> {
+  Relocs() = default;
+  Relocs(ArrayRef<RelTy> a) : ArrayRef<RelTy>(a) {}
+};
+
+template <bool is64>
+struct Relocs<llvm::object::Elf_Crel_Impl<is64>> : RelocsCrel<is64> {
+  using RelocsCrel<is64>::RelocsCrel;
+};
+
 // Return a int64_t to make sure we get the sign extension out of the way as
 // early as possible.
 template <class ELFT>
@@ -215,20 +301,32 @@ template <class ELFT>
 static inline int64_t getAddend(const typename ELFT::Rela &rel) {
   return rel.r_addend;
 }
+template <class ELFT>
+static inline int64_t getAddend(const typename ELFT::Crel &rel) {
+  return rel.r_addend;
+}
 
 template <typename RelTy>
-ArrayRef<RelTy> sortRels(ArrayRef<RelTy> rels, SmallVector<RelTy, 0> &storage) {
+inline Relocs<RelTy> sortRels(Relocs<RelTy> rels,
+                              SmallVector<RelTy, 0> &storage) {
   auto cmp = [](const RelTy &a, const RelTy &b) {
     return a.r_offset < b.r_offset;
   };
   if (!llvm::is_sorted(rels, cmp)) {
     storage.assign(rels.begin(), rels.end());
     llvm::stable_sort(storage, cmp);
-    rels = storage;
+    rels = Relocs<RelTy>(storage);
   }
   return rels;
 }
 
+template <bool is64>
+inline Relocs<llvm::object::Elf_Crel_Impl<is64>>
+sortRels(Relocs<llvm::object::Elf_Crel_Impl<is64>> rels,
+         SmallVector<llvm::object::Elf_Crel_Impl<is64>, 0> &storage) {
+  return {};
+}
+
 // Returns true if Expr refers a GOT entry. Note that this function returns
 // false for TLS variables even though they need GOT, because TLS variables uses
 // GOT differently than the regular variables.
diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp
index 14f39ed10e17c..40528f0cd7f61 100644
--- a/lld/ELF/ScriptLexer.cpp
+++ b/lld/ELF/ScriptLexer.cpp
@@ -20,11 +20,6 @@
 // in various corner cases. We do not care much about efficiency because
 // the time spent in parsing linker scripts is usually negligible.
 //
-// Our grammar of the linker script is LL(2), meaning that it needs at
-// most two-token lookahead to parse. The only place we need two-token
-// lookahead is labels in version scripts, where we need to parse "local :"
-// as if "local:".
-//
 // Overall, this lexer works fine for most linker scripts. There might
 // be room for improving compatibility, but that's probably not at the
 // top of our todo list.
@@ -32,89 +27,84 @@
 //===----------------------------------------------------------------------===//
 
 #include "ScriptLexer.h"
+#include "Config.h"
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include <algorithm>
 
 using namespace llvm;
 using namespace lld;
 using namespace lld::elf;
 
+ScriptLexer::Buffer::Buffer(MemoryBufferRef mb)
+    : s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
+      begin(mb.getBufferStart()) {
+  if (config->sysroot == "")
+    return;
+  StringRef path = filename;
+  for (; !path.empty(); path = sys::path::parent_path(path)) {
+    if (!sys::fs::equivalent(config->sysroot, path))
+      continue;
+    isUnderSysroot = true;
+    return;
+  }
+}
+
+ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {
+  activeFilenames.insert(mb.getBufferIdentifier());
+}
+
 // Returns a whole line containing the current token.
 StringRef ScriptLexer::getLine() {
   StringRef s = getCurrentMB().getBuffer();
-  StringRef tok = tokens[pos - 1];
 
-  size_t pos = s.rfind('\n', tok.data() - s.data());
+  size_t pos = s.rfind('\n', prevTok.data() - s.data());
   if (pos != StringRef::npos)
     s = s.substr(pos + 1);
   return s.substr(0, s.find_first_of("\r\n"));
 }
 
-// Returns 1-based line number of the current token.
-size_t ScriptLexer::getLineNumber() {
-  if (pos == 0)
-    return 1;
-  StringRef s = getCurrentMB().getBuffer();
-  StringRef tok = tokens[pos - 1];
-  const size_t tokOffset = tok.data() - s.data();
-
-  // For the first token, or when going backwards, start from the beginning of
-  // the buffer. If this token is after the previous token, start from the
-  // previous token.
-  size_t line = 1;
-  size_t start = 0;
-  if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
-    start = lastLineNumberOffset;
-    line = lastLineNumber;
-  }
-
-  line += s.substr(start, tokOffset - start).count('\n');
-
-  // Store the line number of this token for reuse.
-  lastLineNumberOffset = tokOffset;
-  lastLineNumber = line;
-
-  return line;
-}
-
 // Returns 0-based column number of the current token.
 size_t ScriptLexer::getColumnNumber() {
-  StringRef tok = tokens[pos - 1];
-  return tok.data() - getLine().data();
+  return prevTok.data() - getLine().data();
 }
 
 std::string ScriptLexer::getCurrentLocation() {
   std::string filename = std::string(getCurrentMB().getBufferIdentifier());
-  return (filename + ":" + Twine(getLineNumber())).str();
+  return (filename + ":" + Twine(prevTokLine)).str();
 }
 
-ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
-
 // We don't want to record cascading errors. Keep only the first one.
 void ScriptLexer::setError(const Twine &msg) {
   if (errorCount())
     return;
 
   std::string s = (getCurrentLocation() + ": " + msg).str();
-  if (pos)
+  if (prevTok.size())
     s += "\n>>> " + getLine().str() + "\n>>> " +
          std::string(getColumnNumber(), ' ') + "^";
   error(s);
 }
 
-// Split S into linker script tokens.
-void ScriptLexer::tokenize(MemoryBufferRef mb) {
-  std::vector<StringRef> vec;
-  mbs.push_back(mb);
-  StringRef s = mb.getBuffer();
-  StringRef begin = s;
-
+void ScriptLexer::lex() {
   for (;;) {
+    StringRef &s = curBuf.s;
     s = skipSpace(s);
-    if (s.empty())
-      break;
+    if (s.empty()) {
+      // If this buffer is from an INCLUDE command, switch to the "return
+      // value"; otherwise, mark EOF.
+      if (buffers.empty()) {
+        eof = true;
+        return;
+      }
+      activeFilenames.erase(curBuf.filename);
+      curBuf = buffers.pop_back_val();
+      continue;
+    }
+    curTokState = inExpr;
 
     // Quoted token. Note that double-quote characters are parts of a token
     // because, in a glob match context, only unquoted tokens are interpreted
@@ -123,45 +113,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
     if (s.starts_with("\"")) {
       size_t e = s.find("\"", 1);
       if (e == StringRef::npos) {
-        StringRef filename = mb.getBufferIdentifier();
-        size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
-        error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
+        size_t lineno =
+            StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
+        error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
         return;
       }
 
-      vec.push_back(s.take_front(e + 1));
+      curTok = s.take_front(e + 1);
       s = s.substr(e + 1);
-      continue;
+      return;
     }
 
     // Some operators form separate tokens.
     if (s.starts_with("<<=") || s.starts_with(">>=")) {
-      vec.push_back(s.substr(0, 3));
+      curTok = s.substr(0, 3);
       s = s.substr(3);
-      continue;
+      return;
     }
-    if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
-                         (s[0] == s[1] && strchr("<>&|", s[0])))) {
-      vec.push_back(s.substr(0, 2));
+    if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
+      curTok = s.substr(0, 2);
       s = s.substr(2);
-      continue;
+      return;
     }
 
-    // Unquoted token. This is more relaxed than tokens in C-like language,
-    // so that you can write "file-name.cpp" as one bare token, for example.
-    size_t pos = s.find_first_not_of(
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-        "0123456789_.$/\\~=+[]*?-!^:");
+    // Unquoted token. The non-expression token is more relaxed than tokens in
+    // C-like languages, so that you can write "file-name.cpp" as one bare
+    // token.
+    size_t pos;
+    if (inExpr) {
+      pos = s.find_first_not_of(
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+          "0123456789_.$");
+      if (pos == 0 && s.size() >= 2 &&
+          ((s[0] == s[1] && strchr("<>&|", s[0])) ||
+           is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
+        pos = 2;
+    } else {
+      pos = s.find_first_not_of(
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+          "0123456789_.$/\\~=+[]*?-!^:");
+    }
 
-    // A character that cannot start a word (which is usually a
-    // punctuation) forms a single character token.
     if (pos == 0)
       pos = 1;
-    vec.push_back(s.substr(0, pos));
+    curTok = s.substr(0, pos);
     s = s.substr(pos);
+    break;
   }
-
-  tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
 }
 
 // Skip leading whitespace characters or comments.
@@ -173,6 +171,7 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
         setError("unclosed comment in a linker script");
         return "";
       }
+      curBuf.lineNumber += s.substr(0, e).count('\n');
       s = s.substr(e + 2);
       continue;
     }
@@ -180,125 +179,48 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
       size_t e = s.find('\n', 1);
       if (e == StringRef::npos)
         e = s.size() - 1;
+      else
+        ++curBuf.lineNumber;
       s = s.substr(e + 1);
       continue;
     }
-    size_t size = s.size();
+    StringRef saved = s;
     s = s.ltrim();
-    if (s.size() == size)
+    auto len = saved.size() - s.size();
+    if (len == 0)
       return s;
+    curBuf.lineNumber += saved.substr(0, len).count('\n');
   }
 }
 
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
-  StringRef ops = "!~*/+-<>?^:="; // List of operators
-
-  // Quoted strings are literal strings, so we don't want to split it.
-  if (s.starts_with("\""))
-    return {s};
-
-  // Split S with operators as separators.
-  std::vector<StringRef> ret;
-  while (!s.empty()) {
-    size_t e = s.find_first_of(ops);
-
-    // No need to split if there is no operator.
-    if (e == StringRef::npos) {
-      ret.push_back(s);
-      break;
-    }
-
-    // Get a token before the operator.
-    if (e != 0)
-      ret.push_back(s.substr(0, e));
-
-    // Get the operator as a token.
-    // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
-    if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
-        s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
-        s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
-      ret.push_back(s.substr(e, 2));
-      s = s.substr(e + 2);
-    } else {
-      ret.push_back(s.substr(e, 1));
-      s = s.substr(e + 1);
-    }
-  }
-  return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
-  if (!inExpr || errorCount() || atEOF())
-    return;
-
-  std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
-  if (v.size() == 1)
-    return;
-  tokens.erase(tokens.begin() + pos);
-  tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
 
 StringRef ScriptLexer::next() {
-  maybeSplitExpr();
-
-  if (errorCount())
-    return "";
-  if (atEOF()) {
-    setError("unexpected EOF");
-    return "";
-  }
-  return tokens[pos++];
+  prevTok = peek();
+  // `prevTokLine` is not updated for EOF so that the line number in `setError`
+  // will be more useful.
+  if (prevTok.size())
+    prevTokLine = curBuf.lineNumber;
+  return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
 }
 
 StringRef ScriptLexer::peek() {
-  StringRef tok = next();
-  if (errorCount())
-    return "";
-  pos = pos - 1;
-  return tok;
-}
-
-StringRef ScriptLexer::peek2() {
-  skip();
-  StringRef tok = next();
-  if (errorCount())
-    return "";
-  pos = pos - 2;
-  return tok;
-}
-
-bool ScriptLexer::consume(StringRef tok) {
-  if (peek() == tok) {
-    skip();
-    return true;
+  // curTok is invalid if curTokState and inExpr mismatch.
+  if (curTok.size() && curTokState != inExpr) {
+    curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
+    curTok = {};
   }
-  return false;
+  if (curTok.empty())
+    lex();
+  return curTok;
 }
 
-// Consumes Tok followed by ":". Space is allowed between Tok and ":".
-bool ScriptLexer::consumeLabel(StringRef tok) {
-  if (consume((tok + ":").str()))
-    return true;
-  if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
-      tokens[pos + 1] == ":") {
-    pos += 2;
-    return true;
-  }
-  return false;
+bool ScriptLexer::consume(StringRef tok) {
+  if (peek() != tok)
+    return false;
+  next();
+  return true;
 }
 
 void ScriptLexer::skip() { (void)next(); }
@@ -307,8 +229,23 @@ void ScriptLexer::expect(StringRef expect) {
   if (errorCount())
     return;
   StringRef tok = next();
-  if (tok != expect)
-    setError(expect + " expected, but got " + tok);
+  if (tok != expect) {
+    if (atEOF())
+      setError("unexpected EOF");
+    else
+      setError(expect + " expected, but got " + tok);
+  }
+}
+
+ScriptLexer::Token ScriptLexer::till(StringRef tok) {
+  StringRef str = next();
+  if (str == tok)
+    return {};
+  if (!atEOF())
+    return {str};
+  prevTok = {};
+  setError("unexpected EOF");
+  return {};
 }
 
 // Returns true if S encloses T.
@@ -319,10 +256,8 @@ static bool encloses(StringRef s, StringRef t) {
 MemoryBufferRef ScriptLexer::getCurrentMB() {
   // Find input buffer containing the current token.
   assert(!mbs.empty());
-  if (pos == 0)
-    return mbs.back();
   for (MemoryBufferRef mb : mbs)
-    if (encloses(mb.getBuffer(), tokens[pos - 1]))
+    if (encloses(mb.getBuffer(), curBuf.s))
       return mb;
   llvm_unreachable("getCurrentMB: failed to find a token");
 }
diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h
index 7919e493fa28b..ffd84411f22ac 100644
--- a/lld/ELF/ScriptLexer.h
+++ b/lld/ELF/ScriptLexer.h
@@ -10,6 +10,8 @@
 #define LLD_ELF_SCRIPT_LEXER_H
 
 #include "lld/Common/LLVM.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include <vector>
@@ -17,35 +19,64 @@
 namespace lld::elf {
 
 class ScriptLexer {
+protected:
+  struct Buffer {
+    // The remaining content to parse and the filename.
+    StringRef s, filename;
+    const char *begin = nullptr;
+    size_t lineNumber = 1;
+    // True if the script is opened as an absolute path under the --sysroot
+    // directory.
+    bool isUnderSysroot = false;
+
+    Buffer() = default;
+    Buffer(MemoryBufferRef mb);
+  };
+  // The current buffer and parent buffers due to INCLUDE.
+  Buffer curBuf;
+  SmallVector<Buffer, 0> buffers;
+
+  // Used to detect INCLUDE() cycles.
+  llvm::DenseSet<StringRef> activeFilenames;
+
+  struct Token {
+    StringRef str;
+    explicit operator bool() const { return !str.empty(); }
+    operator StringRef() const { return str; }
+  };
+
+  // The token before the last next().
+  StringRef prevTok;
+  // Rules for what is a token are different when we are in an expression.
+  // curTok holds the cached return value of peek() and is invalid when the
+  // expression state changes.
+  StringRef curTok;
+  size_t prevTokLine = 1;
+  // The inExpr state when curTok is cached.
+  bool curTokState = false;
+  bool eof = false;
+
 public:
   explicit ScriptLexer(MemoryBufferRef mb);
 
   void setError(const Twine &msg);
-  void tokenize(MemoryBufferRef mb);
+  void lex();
   StringRef skipSpace(StringRef s);
   bool atEOF();
   StringRef next();
   StringRef peek();
-  StringRef peek2();
   void skip();
   bool consume(StringRef tok);
   void expect(StringRef expect);
-  bool consumeLabel(StringRef tok);
+  Token till(StringRef tok);
   std::string getCurrentLocation();
   MemoryBufferRef getCurrentMB();
 
   std::vector<MemoryBufferRef> mbs;
-  std::vector<StringRef> tokens;
   bool inExpr = false;
-  size_t pos = 0;
-
-  size_t lastLineNumber = 0;
-  size_t lastLineNumberOffset = 0;
 
 private:
-  void maybeSplitExpr();
   StringRef getLine();
-  size_t getLineNumber();
   size_t getColumnNumber();
 };
 
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index 47a94c29ea496..107b4c6e7af06 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -24,7 +24,6 @@
 #include "lld/Common/CommonLinkerContext.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Casting.h"
@@ -49,22 +48,12 @@ namespace {
 class ScriptParser final : ScriptLexer {
 public:
   ScriptParser(MemoryBufferRef mb) : ScriptLexer(mb) {
-    // Initialize IsUnderSysroot
-    if (config->sysroot == "")
-      return;
-    StringRef path = mb.getBufferIdentifier();
-    for (; !path.empty(); path = sys::path::parent_path(path)) {
-      if (!sys::fs::equivalent(config->sysroot, path))
-        continue;
-      isUnderSysroot = true;
-      return;
-    }
   }
 
   void readLinkerScript();
   void readVersionScript();
   void readDynamicList();
-  void readDefsym(StringRef name);
+  void readDefsym();
 
 private:
   void addFile(StringRef path);
@@ -89,10 +78,11 @@ class ScriptParser final : ScriptLexer {
   void readVersionScriptCommand();
   void readNoCrossRefs(bool to);
 
+  StringRef readName();
   SymbolAssignment *readSymbolAssignment(StringRef name);
   ByteCommand *readByteCommand(StringRef tok);
   std::array<uint8_t, 4> readFill();
-  bool readSectionDirective(OutputSection *cmd, StringRef tok1, StringRef tok2);
+  bool readSectionDirective(OutputSection *cmd, StringRef tok);
   void readSectionAddressType(OutputSection *cmd);
   OutputDesc *readOverlaySectionDescription();
   OutputDesc *readOutputSectionDescription(StringRef outSec);
@@ -122,7 +112,7 @@ class ScriptParser final : ScriptLexer {
   Expr combine(StringRef op, Expr l, Expr r);
   Expr readExpr();
   Expr readExpr1(Expr lhs, int minPrec);
-  StringRef readParenLiteral();
+  StringRef readParenName();
   Expr readPrimary();
   Expr readTernary(Expr cond);
   Expr readParenExpr();
@@ -135,12 +125,6 @@ class ScriptParser final : ScriptLexer {
   std::pair<SmallVector<SymbolVersion, 0>, SmallVector<SymbolVersion, 0>>
   readSymbols();
 
-  // True if a script being read is in the --sysroot directory.
-  bool isUnderSysroot = false;
-
-  // A set to detect an INCLUDE() cycle.
-  StringSet<> seen;
-
   // If we are currently parsing a PROVIDE|PROVIDE_HIDDEN command,
   // then this member is set to the PROVIDE symbol name.
   std::optional<llvm::StringRef> activeProvideSym;
@@ -200,8 +184,9 @@ void ScriptParser::readDynamicList() {
   std::tie(locals, globals) = readSymbols();
   expect(";");
 
-  if (!atEOF()) {
-    setError("EOF expected, but got " + next());
+  StringRef tok = peek();
+  if (tok.size()) {
+    setError("EOF expected, but got " + tok);
     return;
   }
   if (!locals.empty()) {
@@ -215,8 +200,9 @@ void ScriptParser::readDynamicList() {
 
 void ScriptParser::readVersionScript() {
   readVersionScriptCommand();
-  if (!atEOF())
-    setError("EOF expected, but got " + next());
+  StringRef tok = peek();
+  if (tok.size())
+    setError("EOF expected, but got " + tok);
 }
 
 void ScriptParser::readVersionScriptCommand() {
@@ -225,7 +211,9 @@ void ScriptParser::readVersionScriptCommand() {
     return;
   }
 
-  while (!atEOF() && !errorCount() && peek() != "}") {
+  if (atEOF())
+    setError("unexpected EOF");
+  while (peek() != "}" && !atEOF()) {
     StringRef verStr = next();
     if (verStr == "{") {
       setError("anonymous version definition is used in "
@@ -246,6 +234,8 @@ void ScriptParser::readVersion() {
 void ScriptParser::readLinkerScript() {
   while (!atEOF()) {
     StringRef tok = next();
+    if (atEOF())
+      break;
     if (tok == ";")
       continue;
 
@@ -293,9 +283,12 @@ void ScriptParser::readLinkerScript() {
   }
 }
 
-void ScriptParser::readDefsym(StringRef name) {
+void ScriptParser::readDefsym() {
   if (errorCount())
     return;
+  inExpr = true;
+  StringRef name = readName();
+  expect("=");
   Expr e = readExpr();
   if (!atEOF())
     setError("EOF expected, but got " + next());
@@ -307,8 +300,8 @@ void ScriptParser::readDefsym(StringRef name) {
 void ScriptParser::readNoCrossRefs(bool to) {
   expect("(");
   NoCrossRefCommand cmd{{}, to};
-  while (!errorCount() && !consume(")"))
-    cmd.outputSections.push_back(unquote(next()));
+  while (auto tok = till(")"))
+    cmd.outputSections.push_back(unquote(tok));
   if (cmd.outputSections.size() < 2)
     warn(getCurrentLocation() + ": ignored with fewer than 2 output sections");
   else
@@ -316,7 +309,7 @@ void ScriptParser::readNoCrossRefs(bool to) {
 }
 
 void ScriptParser::addFile(StringRef s) {
-  if (isUnderSysroot && s.starts_with("/")) {
+  if (curBuf.isUnderSysroot && s.starts_with("/")) {
     SmallString<128> pathData;
     StringRef path = (config->sysroot + s).toStringRef(pathData);
     if (sys::fs::exists(path))
@@ -368,24 +361,24 @@ void ScriptParser::readAsNeeded() {
   expect("(");
   bool orig = config->asNeeded;
   config->asNeeded = true;
-  while (!errorCount() && !consume(")"))
-    addFile(unquote(next()));
+  while (auto tok = till(")"))
+    addFile(unquote(tok));
   config->asNeeded = orig;
 }
 
 void ScriptParser::readEntry() {
   // -e <symbol> takes predecence over ENTRY(<symbol>).
   expect("(");
-  StringRef tok = next();
+  StringRef name = readName();
   if (config->entry.empty())
-    config->entry = unquote(tok);
+    config->entry = name;
   expect(")");
 }
 
 void ScriptParser::readExtern() {
   expect("(");
-  while (!errorCount() && !consume(")"))
-    config->undefined.push_back(unquote(next()));
+  while (auto tok = till(")"))
+    config->undefined.push_back(unquote(tok));
 }
 
 void ScriptParser::readGroup() {
@@ -398,45 +391,47 @@ void ScriptParser::readGroup() {
 }
 
 void ScriptParser::readInclude() {
-  StringRef tok = unquote(next());
-
-  if (!seen.insert(tok).second) {
+  StringRef name = readName();
+  if (!activeFilenames.insert(name).second) {
     setError("there is a cycle in linker script INCLUDEs");
     return;
   }
 
-  if (std::optional<std::string> path = searchScript(tok)) {
-    if (std::optional<MemoryBufferRef> mb = readFile(*path))
-      tokenize(*mb);
+  if (std::optional<std::string> path = searchScript(name)) {
+    if (std::optional<MemoryBufferRef> mb = readFile(*path)) {
+      buffers.push_back(curBuf);
+      curBuf = Buffer(*mb);
+      mbs.push_back(*mb);
+    }
     return;
   }
-  setError("cannot find linker script " + tok);
+  setError("cannot find linker script " + name);
 }
 
 void ScriptParser::readInput() {
   expect("(");
-  while (!errorCount() && !consume(")")) {
-    if (consume("AS_NEEDED"))
+  while (auto tok = till(")")) {
+    if (tok == "AS_NEEDED")
       readAsNeeded();
     else
-      addFile(unquote(next()));
+      addFile(unquote(tok));
   }
 }
 
 void ScriptParser::readOutput() {
   // -o <file> takes predecence over OUTPUT(<file>).
   expect("(");
-  StringRef tok = next();
+  StringRef name = readName();
   if (config->outputFile.empty())
-    config->outputFile = unquote(tok);
+    config->outputFile = name;
   expect(")");
 }
 
 void ScriptParser::readOutputArch() {
   // OUTPUT_ARCH is ignored for now.
   expect("(");
-  while (!errorCount() && !consume(")"))
-    skip();
+  while (till(")"))
+    ;
 }
 
 static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
@@ -478,14 +473,14 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
 void ScriptParser::readOutputFormat() {
   expect("(");
 
-  StringRef s = unquote(next());
+  StringRef s = readName();
   if (!consume(")")) {
     expect(",");
-    StringRef tmp = unquote(next());
+    StringRef tmp = readName();
     if (config->optEB)
       s = tmp;
     expect(",");
-    tmp = unquote(next());
+    tmp = readName();
     if (config->optEL)
       s = tmp;
     consume(")");
@@ -514,10 +509,9 @@ void ScriptParser::readOutputFormat() {
 
 void ScriptParser::readPhdrs() {
   expect("{");
-
-  while (!errorCount() && !consume("}")) {
+  while (auto tok = till("}")) {
     PhdrsCommand cmd;
-    cmd.name = next();
+    cmd.name = tok;
     cmd.type = readPhdrType();
 
     while (!errorCount() && !consume(";")) {
@@ -539,9 +533,9 @@ void ScriptParser::readPhdrs() {
 
 void ScriptParser::readRegionAlias() {
   expect("(");
-  StringRef alias = unquote(next());
+  StringRef alias = readName();
   expect(",");
-  StringRef name = next();
+  StringRef name = readName();
   expect(")");
 
   if (script->memoryRegions.count(alias))
@@ -553,9 +547,9 @@ void ScriptParser::readRegionAlias() {
 
 void ScriptParser::readSearchDir() {
   expect("(");
-  StringRef tok = next();
+  StringRef name = readName();
   if (!config->nostdlib)
-    config->searchPaths.push_back(unquote(tok));
+    config->searchPaths.push_back(name);
   expect(")");
 }
 
@@ -613,15 +607,14 @@ SmallVector<SectionCommand *, 0> ScriptParser::readOverlay() {
 
 void ScriptParser::readOverwriteSections() {
   expect("{");
-  while (!errorCount() && !consume("}"))
-    script->overwriteSections.push_back(readOutputSectionDescription(next()));
+  while (auto tok = till("}"))
+    script->overwriteSections.push_back(readOutputSectionDescription(tok));
 }
 
 void ScriptParser::readSections() {
   expect("{");
   SmallVector<SectionCommand *, 0> v;
-  while (!errorCount() && !consume("}")) {
-    StringRef tok = next();
+  while (auto tok = till("}")) {
     if (tok == "OVERLAY") {
       for (SectionCommand *cmd : readOverlay())
         v.push_back(cmd);
@@ -657,7 +650,7 @@ void ScriptParser::readSections() {
     isAfter = true;
   else if (!consume("BEFORE"))
     setError("expected AFTER/BEFORE, but got '" + next() + "'");
-  StringRef where = next();
+  StringRef where = readName();
   SmallVector<StringRef, 0> names;
   for (SectionCommand *cmd : v)
     if (auto *os = dyn_cast<OutputDesc>(cmd))
@@ -672,7 +665,7 @@ void ScriptParser::readTarget() {
   // for --format. We recognize only /^elf/ and "binary" in the linker
   // script as well.
   expect("(");
-  StringRef tok = unquote(next());
+  StringRef tok = readName();
   expect(")");
 
   if (tok.starts_with("elf"))
@@ -701,9 +694,8 @@ static int precedence(StringRef op) {
 
 StringMatcher ScriptParser::readFilePatterns() {
   StringMatcher Matcher;
-
-  while (!errorCount() && !consume(")"))
-    Matcher.addPattern(SingleStringMatcher(next()));
+  while (auto tok = till(")"))
+    Matcher.addPattern(SingleStringMatcher(tok));
   return Matcher;
 }
 
@@ -759,7 +751,7 @@ SmallVector<SectionPattern, 0> ScriptParser::readInputSectionsList() {
         setError("section pattern is expected");
         break;
       }
-      SectionMatcher.addPattern(unquote(next()));
+      SectionMatcher.addPattern(readName());
     }
 
     if (!SectionMatcher.empty())
@@ -790,7 +782,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
       make<InputSectionDescription>(filePattern, withFlags, withoutFlags);
   expect("(");
 
-  while (!errorCount() && !consume(")")) {
+  while (peek() != ")" && !atEOF()) {
     SortSectionPolicy outer = readSortKind();
     SortSectionPolicy inner = SortSectionPolicy::Default;
     SmallVector<SectionPattern, 0> v;
@@ -816,6 +808,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
 
     std::move(v.begin(), v.end(), std::back_inserter(cmd->sectionPatterns));
   }
+  expect(")");
   return cmd;
 }
 
@@ -852,7 +845,7 @@ Expr ScriptParser::readAssert() {
   expect("(");
   Expr e = readExpr();
   expect(",");
-  StringRef msg = unquote(next());
+  StringRef msg = readName();
   expect(")");
 
   return [=] {
@@ -873,16 +866,11 @@ constexpr std::pair<const char *, unsigned> typeMap[] = {
 // Tries to read the special directive for an output section definition which
 // can be one of following: "(NOLOAD)", "(COPY)", "(INFO)", "(OVERLAY)", and
 // "(TYPE=<value>)".
-// Tok1 and Tok2 are next 2 tokens peeked. See comment for
-// readSectionAddressType below.
-bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok1, StringRef tok2) {
-  if (tok1 != "(")
-    return false;
-  if (tok2 != "NOLOAD" && tok2 != "COPY" && tok2 != "INFO" &&
-      tok2 != "OVERLAY" && tok2 != "TYPE")
+bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok) {
+  if (tok != "NOLOAD" && tok != "COPY" && tok != "INFO" && tok != "OVERLAY" &&
+      tok != "TYPE")
     return false;
 
-  expect("(");
   if (consume("NOLOAD")) {
     cmd->type = SHT_NOBITS;
     cmd->typeIsSet = true;
@@ -921,16 +909,23 @@ bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok1, Stri
 // https://sourceware.org/binutils/docs/ld/Output-Section-Address.html
 // https://sourceware.org/binutils/docs/ld/Output-Section-Type.html
 void ScriptParser::readSectionAddressType(OutputSection *cmd) {
-  // Temporarily set inExpr to support TYPE=<value> without spaces.
-  bool saved = std::exchange(inExpr, true);
-  bool isDirective = readSectionDirective(cmd, peek(), peek2());
-  inExpr = saved;
-  if (isDirective)
-    return;
+  if (consume("(")) {
+    // Temporarily set inExpr to support TYPE=<value> without spaces.
+    SaveAndRestore saved(inExpr, true);
+    if (readSectionDirective(cmd, peek()))
+      return;
+    cmd->addrExpr = readExpr();
+    expect(")");
+  } else {
+    cmd->addrExpr = readExpr();
+  }
 
-  cmd->addrExpr = readExpr();
-  if (peek() == "(" && !readSectionDirective(cmd, "(", peek2()))
-    setError("unknown section directive: " + peek2());
+  if (consume("(")) {
+    SaveAndRestore saved(inExpr, true);
+    StringRef tok = peek();
+    if (!readSectionDirective(cmd, tok))
+      setError("unknown section directive: " + tok);
+  }
 }
 
 static Expr checkAlignment(Expr e, std::string &loc) {
@@ -945,16 +940,19 @@ static Expr checkAlignment(Expr e, std::string &loc) {
 }
 
 OutputDesc *ScriptParser::readOverlaySectionDescription() {
-  OutputDesc *osd = script->createOutputSection(next(), getCurrentLocation());
+  OutputDesc *osd =
+      script->createOutputSection(readName(), getCurrentLocation());
   osd->osec.inOverlay = true;
   expect("{");
-  while (!errorCount() && !consume("}")) {
+  while (auto tok = till("}")) {
     uint64_t withFlags = 0;
     uint64_t withoutFlags = 0;
-    if (consume("INPUT_SECTION_FLAGS"))
+    if (tok == "INPUT_SECTION_FLAGS") {
       std::tie(withFlags, withoutFlags) = readInputSectionFlags();
+      tok = till("");
+    }
     osd->osec.commands.push_back(
-        readInputSectionRules(next(), withFlags, withoutFlags));
+        readInputSectionRules(tok, withFlags, withoutFlags));
   }
   osd->osec.phdrs = readOutputSectionPhdrs();
   return osd;
@@ -988,8 +986,7 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) {
     osec->constraint = ConstraintKind::ReadWrite;
   expect("{");
 
-  while (!errorCount() && !consume("}")) {
-    StringRef tok = next();
+  while (auto tok = till("}")) {
     if (tok == ";") {
       // Empty commands are allowed. Do nothing here.
     } else if (SymbolAssignment *assign = readAssignment(tok)) {
@@ -1029,11 +1026,11 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) {
   }
 
   if (consume(">"))
-    osec->memoryRegionName = std::string(next());
+    osec->memoryRegionName = std::string(readName());
 
   if (consume("AT")) {
     expect(">");
-    osec->lmaRegionName = std::string(next());
+    osec->lmaRegionName = std::string(readName());
   }
 
   if (osec->lmaExpr && !osec->lmaRegionName.empty())
@@ -1079,10 +1076,10 @@ std::array<uint8_t, 4> ScriptParser::readFill() {
 
 SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
   expect("(");
-  StringRef name = next(), eq = peek();
+  StringRef name = readName(), eq = peek();
   if (eq != "=") {
     setError("= expected, but got " + next());
-    while (!atEOF() && next() != ")")
+    while (till(")"))
       ;
     return nullptr;
   }
@@ -1096,45 +1093,55 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
   return cmd;
 }
 
+// Replace whitespace sequence (including \n) with one single space. The output
+// is used by -Map.
+static void squeezeSpaces(std::string &str) {
+  char prev = '\0';
+  auto it = str.begin();
+  for (char c : str)
+    if (!isSpace(c) || (c = ' ') != prev)
+      *it++ = prev = c;
+  str.erase(it, str.end());
+}
+
 SymbolAssignment *ScriptParser::readAssignment(StringRef tok) {
   // Assert expression returns Dot, so this is equal to ".=."
   if (tok == "ASSERT")
     return make<SymbolAssignment>(".", readAssert(), 0, getCurrentLocation());
 
-  size_t oldPos = pos;
+  const char *oldS = prevTok.data();
   SymbolAssignment *cmd = nullptr;
   bool savedSeenRelroEnd = script->seenRelroEnd;
   const StringRef op = peek();
-  if (op.starts_with("=")) {
-    // Support = followed by an expression without whitespace.
-    SaveAndRestore saved(inExpr, true);
-    cmd = readSymbolAssignment(tok);
-  } else if ((op.size() == 2 && op[1] == '=' && strchr("*/+-&^|", op[0])) ||
-             op == "<<=" || op == ">>=") {
-    cmd = readSymbolAssignment(tok);
-  } else if (tok == "PROVIDE") {
+  {
     SaveAndRestore saved(inExpr, true);
-    cmd = readProvideHidden(true, false);
-  } else if (tok == "HIDDEN") {
-    SaveAndRestore saved(inExpr, true);
-    cmd = readProvideHidden(false, true);
-  } else if (tok == "PROVIDE_HIDDEN") {
-    SaveAndRestore saved(inExpr, true);
-    cmd = readProvideHidden(true, true);
+    if (op.starts_with("=")) {
+      // Support = followed by an expression without whitespace.
+      cmd = readSymbolAssignment(unquote(tok));
+    } else if ((op.size() == 2 && op[1] == '=' && strchr("+-*/&^|", op[0])) ||
+               op == "<<=" || op == ">>=") {
+      cmd = readSymbolAssignment(unquote(tok));
+    } else if (tok == "PROVIDE") {
+      cmd = readProvideHidden(true, false);
+    } else if (tok == "HIDDEN") {
+      cmd = readProvideHidden(false, true);
+    } else if (tok == "PROVIDE_HIDDEN") {
+      cmd = readProvideHidden(true, true);
+    }
   }
 
   if (cmd) {
     cmd->dataSegmentRelroEnd = !savedSeenRelroEnd && script->seenRelroEnd;
-    cmd->commandString =
-        tok.str() + " " +
-        llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+    cmd->commandString = StringRef(oldS, curTok.data() - oldS).str();
+    squeezeSpaces(cmd->commandString);
     expect(";");
   }
   return cmd;
 }
 
+StringRef ScriptParser::readName() { return unquote(next()); }
+
 SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) {
-  name = unquote(name);
   StringRef op = next();
   assert(op == "=" || op == "*=" || op == "/=" || op == "+=" || op == "-=" ||
          op == "&=" || op == "^=" || op == "|=" || op == "<<=" || op == ">>=");
@@ -1180,10 +1187,8 @@ SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) {
 Expr ScriptParser::readExpr() {
   // Our lexer is context-aware. Set the in-expression bit so that
   // they apply different tokenization rules.
-  bool orig = inExpr;
-  inExpr = true;
+  SaveAndRestore saved(inExpr, true);
   Expr e = readExpr1(readPrimary(), 0);
-  inExpr = orig;
   return e;
 }
 
@@ -1249,9 +1254,9 @@ Expr ScriptParser::readExpr1(Expr lhs, int minPrec) {
     StringRef op1 = peek();
     if (precedence(op1) < minPrec)
       break;
-    if (consume("?"))
-      return readTernary(lhs);
     skip();
+    if (op1 == "?")
+      return readTernary(lhs);
     Expr rhs = readPrimary();
 
     // Evaluate the remaining part of the expression first if the
@@ -1281,7 +1286,7 @@ Expr ScriptParser::getPageSize() {
 }
 
 Expr ScriptParser::readConstant() {
-  StringRef s = readParenLiteral();
+  StringRef s = readParenName();
   if (s == "COMMONPAGESIZE")
     return getPageSize();
   if (s == "MAXPAGESIZE")
@@ -1333,12 +1338,11 @@ ByteCommand *ScriptParser::readByteCommand(StringRef tok) {
   if (size == -1)
     return nullptr;
 
-  size_t oldPos = pos;
+  const char *oldS = prevTok.data();
   Expr e = readParenExpr();
-  std::string commandString =
-      tok.str() + " " +
-      llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
-  return make<ByteCommand>(e, size, commandString);
+  std::string commandString = StringRef(oldS, curBuf.s.data() - oldS).str();
+  squeezeSpaces(commandString);
+  return make<ByteCommand>(e, size, std::move(commandString));
 }
 
 static std::optional<uint64_t> parseFlag(StringRef tok) {
@@ -1374,11 +1378,11 @@ static std::optional<uint64_t> parseFlag(StringRef tok) {
 // Example: SHF_EXECINSTR & !SHF_WRITE means with flag SHF_EXECINSTR and
 // without flag SHF_WRITE.
 std::pair<uint64_t, uint64_t> ScriptParser::readInputSectionFlags() {
-   uint64_t withFlags = 0;
-   uint64_t withoutFlags = 0;
-   expect("(");
-   while (!errorCount()) {
-    StringRef tok = unquote(next());
+  uint64_t withFlags = 0;
+  uint64_t withoutFlags = 0;
+  expect("(");
+  while (!errorCount()) {
+    StringRef tok = readName();
     bool without = tok.consume_front("!");
     if (std::optional<uint64_t> flag = parseFlag(tok)) {
       if (without)
@@ -1398,11 +1402,11 @@ std::pair<uint64_t, uint64_t> ScriptParser::readInputSectionFlags() {
   return std::make_pair(withFlags, withoutFlags);
 }
 
-StringRef ScriptParser::readParenLiteral() {
+StringRef ScriptParser::readParenName() {
   expect("(");
   bool orig = inExpr;
   inExpr = false;
-  StringRef tok = next();
+  StringRef tok = readName();
   inExpr = orig;
   expect(")");
   return tok;
@@ -1451,7 +1455,7 @@ Expr ScriptParser::readPrimary() {
     };
   }
   if (tok == "ADDR") {
-    StringRef name = unquote(readParenLiteral());
+    StringRef name = readParenName();
     OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
     osec->usedInExpression = true;
     return [=]() -> ExprValue {
@@ -1476,7 +1480,7 @@ Expr ScriptParser::readPrimary() {
     };
   }
   if (tok == "ALIGNOF") {
-    StringRef name = unquote(readParenLiteral());
+    StringRef name = readParenName();
     OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
     return [=] {
       checkIfExists(*osec, location);
@@ -1518,7 +1522,7 @@ Expr ScriptParser::readPrimary() {
     return [=] { return alignToPowerOf2(script->getDot(), config->maxPageSize); };
   }
   if (tok == "DEFINED") {
-    StringRef name = unquote(readParenLiteral());
+    StringRef name = readParenName();
     // Return 1 if s is defined. If the definition is only found in a linker
     // script, it must happen before this DEFINED.
     auto order = ctx.scriptSymOrderCounter++;
@@ -1529,7 +1533,7 @@ Expr ScriptParser::readPrimary() {
     };
   }
   if (tok == "LENGTH") {
-    StringRef name = readParenLiteral();
+    StringRef name = readParenName();
     if (script->memoryRegions.count(name) == 0) {
       setError("memory region not defined: " + name);
       return [] { return 0; };
@@ -1537,7 +1541,7 @@ Expr ScriptParser::readPrimary() {
     return script->memoryRegions[name]->length;
   }
   if (tok == "LOADADDR") {
-    StringRef name = unquote(readParenLiteral());
+    StringRef name = readParenName();
     OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
     osec->usedInExpression = true;
     return [=] {
@@ -1565,7 +1569,7 @@ Expr ScriptParser::readPrimary() {
     return [=] { return std::max(a().getValue(), b().getValue()); };
   }
   if (tok == "ORIGIN") {
-    StringRef name = readParenLiteral();
+    StringRef name = readParenName();
     if (script->memoryRegions.count(name) == 0) {
       setError("memory region not defined: " + name);
       return [] { return 0; };
@@ -1581,7 +1585,7 @@ Expr ScriptParser::readPrimary() {
     return [=] { return e(); };
   }
   if (tok == "SIZEOF") {
-    StringRef name = unquote(readParenLiteral());
+    StringRef name = readParenName();
     OutputSection *cmd = &script->getOrCreateOutputSection(name)->osec;
     // Linker script does not create an output section if its content is empty.
     // We want to allow SIZEOF(.foo) where .foo is a section which happened to
@@ -1629,7 +1633,7 @@ SmallVector<StringRef, 0> ScriptParser::readOutputSectionPhdrs() {
   SmallVector<StringRef, 0> phdrs;
   while (!errorCount() && peek().starts_with(":")) {
     StringRef tok = next();
-    phdrs.push_back((tok.size() == 1) ? next() : tok.substr(1));
+    phdrs.push_back((tok.size() == 1) ? readName() : tok.substr(1));
   }
   return phdrs;
 }
@@ -1716,23 +1720,19 @@ ScriptParser::readSymbols() {
   SmallVector<SymbolVersion, 0> globals;
   SmallVector<SymbolVersion, 0> *v = &globals;
 
-  while (!errorCount()) {
-    if (consume("}"))
-      break;
-    if (consumeLabel("local")) {
-      v = &locals;
-      continue;
-    }
-    if (consumeLabel("global")) {
-      v = &globals;
-      continue;
-    }
-
-    if (consume("extern")) {
+  while (auto tok = till("}")) {
+    if (tok == "extern") {
       SmallVector<SymbolVersion, 0> ext = readVersionExtern();
       v->insert(v->end(), ext.begin(), ext.end());
     } else {
-      StringRef tok = next();
+      if (tok == "local:" || (tok == "local" && consume(":"))) {
+        v = &locals;
+        continue;
+      }
+      if (tok == "global:" || (tok == "global" && consume(":"))) {
+        v = &globals;
+        continue;
+      }
       v->push_back({unquote(tok), false, hasWildcard(tok)});
     }
     expect(";");
@@ -1753,16 +1753,13 @@ SmallVector<SymbolVersion, 0> ScriptParser::readVersionExtern() {
   expect("{");
 
   SmallVector<SymbolVersion, 0> ret;
-  while (!errorCount() && peek() != "}") {
-    StringRef tok = next();
+  while (auto tok = till("}")) {
     ret.push_back(
-        {unquote(tok), isCXX, !tok.starts_with("\"") && hasWildcard(tok)});
+        {unquote(tok), isCXX, !tok.str.starts_with("\"") && hasWildcard(tok)});
     if (consume("}"))
       return ret;
     expect(";");
   }
-
-  expect("}");
   return ret;
 }
 
@@ -1782,8 +1779,7 @@ Expr ScriptParser::readMemoryAssignment(StringRef s1, StringRef s2,
 // MEMORY { name [(attr)] : ORIGIN = origin, LENGTH = len ... }
 void ScriptParser::readMemory() {
   expect("{");
-  while (!errorCount() && !consume("}")) {
-    StringRef tok = next();
+  while (auto tok = till("}")) {
     if (tok == "INCLUDE") {
       readInclude();
       continue;
@@ -1861,7 +1857,4 @@ void elf::readDynamicList(MemoryBufferRef mb) {
   ScriptParser(mb).readDynamicList();
 }
 
-void elf::readDefsym(StringRef name, MemoryBufferRef mb) {
-  llvm::TimeTraceScope timeScope("Read defsym input", name);
-  ScriptParser(mb).readDefsym(name);
-}
+void elf::readDefsym(MemoryBufferRef mb) { ScriptParser(mb).readDefsym(); }
diff --git a/lld/ELF/ScriptParser.h b/lld/ELF/ScriptParser.h
index 34b27d2b4787c..d6f71c5ee6329 100644
--- a/lld/ELF/ScriptParser.h
+++ b/lld/ELF/ScriptParser.h
@@ -24,7 +24,7 @@ void readVersionScript(MemoryBufferRef mb);
 void readDynamicList(MemoryBufferRef mb);
 
 // Parses the defsym expression.
-void readDefsym(StringRef name, MemoryBufferRef mb);
+void readDefsym(MemoryBufferRef mb);
 
 bool hasWildcard(StringRef s);
 
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 93653def328f8..263d4f35c5b9a 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -73,7 +73,6 @@ Defined *ElfSym::riscvGlobalPointer;
 Defined *ElfSym::relaIpltStart;
 Defined *ElfSym::relaIpltEnd;
 Defined *ElfSym::tlsModuleBase;
-SmallVector<SymbolAux, 0> elf::symAux;
 
 static uint64_t getSymVA(const Symbol &sym, int64_t addend) {
   switch (sym.kind()) {
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index e764fe8d73633..ff825615658eb 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -56,17 +56,6 @@ enum {
   NEEDS_TLSIE = 1 << 8,
 };
 
-// Some index properties of a symbol are stored separately in this auxiliary
-// struct to decrease sizeof(SymbolUnion) in the majority of cases.
-struct SymbolAux {
-  uint32_t gotIdx = -1;
-  uint32_t pltIdx = -1;
-  uint32_t tlsDescIdx = -1;
-  uint32_t tlsGdIdx = -1;
-};
-
-LLVM_LIBRARY_VISIBILITY extern SmallVector<SymbolAux, 0> symAux;
-
 // The base class for real symbol classes.
 class Symbol {
 public:
@@ -211,10 +200,10 @@ class Symbol {
   // truncated by Symbol::parseSymbolVersion().
   const char *getVersionSuffix() const { return nameData + nameSize; }
 
-  uint32_t getGotIdx() const { return symAux[auxIdx].gotIdx; }
-  uint32_t getPltIdx() const { return symAux[auxIdx].pltIdx; }
-  uint32_t getTlsDescIdx() const { return symAux[auxIdx].tlsDescIdx; }
-  uint32_t getTlsGdIdx() const { return symAux[auxIdx].tlsGdIdx; }
+  uint32_t getGotIdx() const { return ctx.symAux[auxIdx].gotIdx; }
+  uint32_t getPltIdx() const { return ctx.symAux[auxIdx].pltIdx; }
+  uint32_t getTlsDescIdx() const { return ctx.symAux[auxIdx].tlsDescIdx; }
+  uint32_t getTlsGdIdx() const { return ctx.symAux[auxIdx].tlsGdIdx; }
 
   bool isInGot() const { return getGotIdx() != uint32_t(-1); }
   bool isInPlt() const { return getPltIdx() != uint32_t(-1); }
@@ -325,8 +314,8 @@ class Symbol {
   // entries during postScanRelocations();
   std::atomic<uint16_t> flags;
 
-  // A symAux index used to access GOT/PLT entry indexes. This is allocated in
-  // postScanRelocations().
+  // A ctx.symAux index used to access GOT/PLT entry indexes. This is allocated
+  // in postScanRelocations().
   uint32_t auxIdx;
   uint32_t dynsymIndex;
 
@@ -357,8 +346,8 @@ class Symbol {
   }
   void allocateAux() {
     assert(auxIdx == 0);
-    auxIdx = symAux.size();
-    symAux.emplace_back();
+    auxIdx = ctx.symAux.size();
+    ctx.symAux.emplace_back();
   }
 
   bool isSection() const { return type == llvm::ELF::STT_SECTION; }
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 5d3f3df216b85..8ffa917635120 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -455,7 +455,8 @@ template <class ELFT>
 void EhFrameSection::addSectionAux(EhInputSection *sec) {
   if (!sec->isLive())
     return;
-  const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
+  const RelsOrRelas<ELFT> rels =
+      sec->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
   if (rels.areRelocsRel())
     addRecords<ELFT>(sec, rels.rels);
   else
@@ -489,7 +490,8 @@ void EhFrameSection::iterateFDEWithLSDA(
   DenseSet<size_t> ciesWithLSDA;
   for (EhInputSection *sec : sections) {
     ciesWithLSDA.clear();
-    const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
+    const RelsOrRelas<ELFT> rels =
+        sec->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
     if (rels.areRelocsRel())
       iterateFDEWithLSDAAux<ELFT>(*sec, rels.rels, ciesWithLSDA, fn);
     else
@@ -653,20 +655,20 @@ GotSection::GotSection()
 
 void GotSection::addConstant(const Relocation &r) { relocations.push_back(r); }
 void GotSection::addEntry(const Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1);
-  symAux.back().gotIdx = numEntries++;
+  assert(sym.auxIdx == ctx.symAux.size() - 1);
+  ctx.symAux.back().gotIdx = numEntries++;
 }
 
 bool GotSection::addTlsDescEntry(const Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1);
-  symAux.back().tlsDescIdx = numEntries;
+  assert(sym.auxIdx == ctx.symAux.size() - 1);
+  ctx.symAux.back().tlsDescIdx = numEntries;
   numEntries += 2;
   return true;
 }
 
 bool GotSection::addDynTlsEntry(const Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1);
-  symAux.back().tlsGdIdx = numEntries;
+  assert(sym.auxIdx == ctx.symAux.size() - 1);
+  ctx.symAux.back().tlsGdIdx = numEntries;
   // Global Dynamic TLS entries take two GOT slots.
   numEntries += 2;
   return true;
@@ -997,12 +999,12 @@ void MipsGotSection::build() {
   for (auto &p : primGot->global) {
     if (p.first->auxIdx == 0)
       p.first->allocateAux();
-    symAux.back().gotIdx = p.second;
+    ctx.symAux.back().gotIdx = p.second;
   }
   for (auto &p : primGot->relocs) {
     if (p.first->auxIdx == 0)
       p.first->allocateAux();
-    symAux.back().gotIdx = p.second;
+    ctx.symAux.back().gotIdx = p.second;
   }
 
   // Create dynamic relocations.
@@ -1171,8 +1173,8 @@ GotPltSection::GotPltSection()
 }
 
 void GotPltSection::addEntry(Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1 &&
-         symAux.back().pltIdx == entries.size());
+  assert(sym.auxIdx == ctx.symAux.size() - 1 &&
+         ctx.symAux.back().pltIdx == entries.size());
   entries.push_back(&sym);
 }
 
@@ -1217,7 +1219,7 @@ IgotPltSection::IgotPltSection()
                        target->gotEntrySize, getIgotPltName()) {}
 
 void IgotPltSection::addEntry(Symbol &sym) {
-  assert(symAux.back().pltIdx == entries.size());
+  assert(ctx.symAux.back().pltIdx == entries.size());
   entries.push_back(&sym);
 }
 
@@ -2566,8 +2568,8 @@ void PltSection::writeTo(uint8_t *buf) {
 }
 
 void PltSection::addEntry(Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1);
-  symAux.back().pltIdx = entries.size();
+  assert(sym.auxIdx == ctx.symAux.size() - 1);
+  ctx.symAux.back().pltIdx = entries.size();
   entries.push_back(&sym);
 }
 
@@ -2613,8 +2615,8 @@ size_t IpltSection::getSize() const {
 }
 
 void IpltSection::addEntry(Symbol &sym) {
-  assert(sym.auxIdx == symAux.size() - 1);
-  symAux.back().pltIdx = entries.size();
+  assert(sym.auxIdx == ctx.symAux.size() - 1);
+  ctx.symAux.back().pltIdx = entries.size();
   entries.push_back(&sym);
 }
 
@@ -3203,10 +3205,10 @@ template <class ELFT> DebugNamesSection<ELFT>::DebugNamesSection() {
 template <class ELFT>
 template <class RelTy>
 void DebugNamesSection<ELFT>::getNameRelocs(
-    InputSection *sec, ArrayRef<RelTy> rels,
-    DenseMap<uint32_t, uint32_t> &relocs) {
+    const InputFile &file, DenseMap<uint32_t, uint32_t> &relocs,
+    Relocs<RelTy> rels) {
   for (const RelTy &rel : rels) {
-    Symbol &sym = sec->file->getRelocTargetSym(rel);
+    Symbol &sym = file.getRelocTargetSym(rel);
     relocs[rel.r_offset] = sym.getVA(getAddend<ELFT>(rel));
   }
 }
@@ -3216,11 +3218,7 @@ template <class ELFT> void DebugNamesSection<ELFT>::finalizeContents() {
   auto relocs = std::make_unique<DenseMap<uint32_t, uint32_t>[]>(numChunks);
   parallelFor(0, numChunks, [&](size_t i) {
     InputSection *sec = inputSections[i];
-    auto rels = sec->template relsOrRelas<ELFT>();
-    if (rels.areRelocsRel())
-      getNameRelocs(sec, rels.rels, relocs.get()[i]);
-    else
-      getNameRelocs(sec, rels.relas, relocs.get()[i]);
+    invokeOnRelocs(*sec, getNameRelocs, *sec->file, relocs.get()[i]);
 
     // Relocate CU offsets with .debug_info + X relocations.
     OutputChunk &chunk = chunks.get()[i];
@@ -4669,7 +4667,8 @@ template <class ELFT> void elf::createSyntheticSections() {
 
   auto add = [](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); };
 
-  in.shStrTab = std::make_unique<StringTableSection>(".shstrtab", false);
+  if (config->zSectionHeader)
+    in.shStrTab = std::make_unique<StringTableSection>(".shstrtab", false);
 
   Out::programHeaders = make<OutputSection>("", 0, SHF_ALLOC);
   Out::programHeaders->addralign = config->wordsize;
@@ -4921,7 +4920,8 @@ template <class ELFT> void elf::createSyntheticSections() {
     add(*in.symTab);
   if (in.symTabShndx)
     add(*in.symTabShndx);
-  add(*in.shStrTab);
+  if (in.shStrTab)
+    add(*in.shStrTab);
   if (in.strTab)
     add(*in.strTab);
 }
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index eaa09ea7194fb..d4169e1e1acaf 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -916,8 +916,9 @@ class DebugNamesSection final : public DebugNamesBaseSection {
   void writeTo(uint8_t *buf) override;
 
   template <class RelTy>
-  void getNameRelocs(InputSection *sec, ArrayRef<RelTy> rels,
-                     llvm::DenseMap<uint32_t, uint32_t> &relocs);
+  void getNameRelocs(const InputFile &file,
+                     llvm::DenseMap<uint32_t, uint32_t> &relocs,
+                     Relocs<RelTy> rels);
 
 private:
   static void readOffsets(InputChunk &inputChunk, OutputChunk &chunk,
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 5cffdb771a738..2ceae215ae278 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -401,10 +401,19 @@ template <class ELFT> static void markUsedLocalSymbols() {
       InputSection *isec = dyn_cast_or_null<InputSection>(s);
       if (!isec)
         continue;
-      if (isec->type == SHT_REL)
+      if (isec->type == SHT_REL) {
         markUsedLocalSymbolsImpl(f, isec->getDataAs<typename ELFT::Rel>());
-      else if (isec->type == SHT_RELA)
+      } else if (isec->type == SHT_RELA) {
         markUsedLocalSymbolsImpl(f, isec->getDataAs<typename ELFT::Rela>());
+      } else if (isec->type == SHT_CREL) {
+        // The is64=true variant also works with ELF32 since only the r_symidx
+        // member is used.
+        for (Elf_Crel_Impl<true> r : RelocsCrel<true>(isec->content_)) {
+          Symbol &sym = file->getSymbol(r.r_symidx);
+          if (sym.isLocal())
+            sym.used = true;
+        }
+      }
     }
   }
 }
@@ -1875,13 +1884,16 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
   sortSections();
 
   // Create a list of OutputSections, assign sectionIndex, and populate
-  // in.shStrTab.
+  // in.shStrTab. If -z nosectionheader is specified, drop non-ALLOC sections.
   for (SectionCommand *cmd : script->sectionCommands)
     if (auto *osd = dyn_cast<OutputDesc>(cmd)) {
       OutputSection *osec = &osd->osec;
+      if (!in.shStrTab && !(osec->flags & SHF_ALLOC))
+        continue;
       outputSections.push_back(osec);
       osec->sectionIndex = outputSections.size();
-      osec->shName = in.shStrTab->addString(osec->name);
+      if (in.shStrTab)
+        osec->shName = in.shStrTab->addString(osec->name);
     }
 
   // Prefer command line supplied address over other constraints.
@@ -2703,6 +2715,10 @@ template <class ELFT> void Writer<ELFT>::writeHeader() {
   auto *eHdr = reinterpret_cast<Elf_Ehdr *>(Out::bufferStart);
   eHdr->e_type = getELFType();
   eHdr->e_entry = getEntryAddr();
+
+  // If -z nosectionheader is specified, omit the section header table.
+  if (!in.shStrTab)
+    return;
   eHdr->e_shoff = sectionHeaderOff;
 
   // Write the section header table.
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
new file mode 100644
index 0000000000000..568843d72bbb5
--- /dev/null
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -0,0 +1,413 @@
+//===- BPSectionOrderer.cpp--------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPSectionOrderer.h"
+#include "InputSection.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/xxhash.h"
+
+#define DEBUG_TYPE "bp-section-orderer"
+using namespace llvm;
+using namespace lld::macho;
+
+/// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and
+/// "yyyy" are numbers that could change between builds. We need to use the root
+/// symbol name before this suffix so these symbols can be matched with profiles
+/// which may have different suffixes.
+static StringRef getRootSymbol(StringRef Name) {
+  auto [P0, S0] = Name.rsplit(".llvm.");
+  auto [P1, S1] = P0.rsplit(".__uniq.");
+  return P1;
+}
+
+static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
+                             uint64_t offset, uint64_t addend) {
+  return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
+                   Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
+                      .str());
+}
+
+static uint64_t
+getRelocHash(const Reloc &reloc,
+             const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
+  auto *isec = reloc.getReferentInputSection();
+  std::optional<uint64_t> sectionIdx;
+  auto sectionIdxIt = sectionToIdx.find(isec);
+  if (sectionIdxIt != sectionToIdx.end())
+    sectionIdx = sectionIdxIt->getSecond();
+  std::string kind;
+  if (isec)
+    kind = ("Section " + Twine(static_cast<uint8_t>(isec->kind()))).str();
+  if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
+    kind += (" Symbol " + Twine(static_cast<uint8_t>(sym->kind()))).str();
+    if (auto *d = dyn_cast<Defined>(sym)) {
+      if (isa_and_nonnull<CStringInputSection>(isec))
+        return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);
+      return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
+    }
+  }
+  return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
+}
+
+static void constructNodesForCompression(
+    const SmallVector<const InputSection *> &sections,
+    const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
+    const SmallVector<unsigned> &sectionIdxs,
+    std::vector<BPFunctionNode> &nodes,
+    DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
+    BPFunctionNode::UtilityNodeT &maxUN) {
+  TimeTraceScope timeScope("Build nodes for compression");
+
+  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
+  sectionHashes.reserve(sectionIdxs.size());
+  SmallVector<uint64_t> hashes;
+  for (unsigned sectionIdx : sectionIdxs) {
+    const auto *isec = sections[sectionIdx];
+    constexpr unsigned windowSize = 4;
+
+    for (size_t i = 0; i < isec->data.size(); i++) {
+      auto window = isec->data.drop_front(i).take_front(windowSize);
+      hashes.push_back(xxHash64(window));
+    }
+    for (const auto &r : isec->relocs) {
+      if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
+        continue;
+      uint64_t relocHash = getRelocHash(r, sectionToIdx);
+      uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
+      for (uint32_t i = start; i < r.offset + r.length; i++) {
+        auto window = isec->data.drop_front(i).take_front(windowSize);
+        hashes.push_back(xxHash64(window) + relocHash);
+      }
+    }
+
+    llvm::sort(hashes);
+    hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
+
+    sectionHashes.emplace_back(sectionIdx, hashes);
+    hashes.clear();
+  }
+
+  DenseMap<uint64_t, unsigned> hashFrequency;
+  for (auto &[sectionIdx, hashes] : sectionHashes)
+    for (auto hash : hashes)
+      ++hashFrequency[hash];
+
+  // Merge section that are nearly identical
+  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+  DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+  for (auto &[sectionIdx, hashes] : sectionHashes) {
+    uint64_t wholeHash = 0;
+    for (auto hash : hashes)
+      if (hashFrequency[hash] > 5)
+        wholeHash ^= hash;
+    auto [it, wasInserted] =
+        wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+    if (wasInserted) {
+      newSectionHashes.emplace_back(sectionIdx, hashes);
+    } else {
+      duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
+    }
+  }
+  sectionHashes = newSectionHashes;
+
+  // Recompute hash frequencies
+  hashFrequency.clear();
+  for (auto &[sectionIdx, hashes] : sectionHashes)
+    for (auto hash : hashes)
+      ++hashFrequency[hash];
+
+  // Filter rare and common hashes and assign each a unique utility node that
+  // doesn't conflict with the trace utility nodes
+  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
+  for (auto &[hash, frequency] : hashFrequency) {
+    if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
+      continue;
+    hashToUN[hash] = ++maxUN;
+  }
+
+  std::vector<BPFunctionNode::UtilityNodeT> uns;
+  for (auto &[sectionIdx, hashes] : sectionHashes) {
+    for (auto &hash : hashes) {
+      auto it = hashToUN.find(hash);
+      if (it != hashToUN.end())
+        uns.push_back(it->second);
+    }
+    nodes.emplace_back(sectionIdx, uns);
+    uns.clear();
+  }
+}
+
+DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
+    size_t &highestAvailablePriority, StringRef profilePath,
+    bool forFunctionCompression, bool forDataCompression, bool verbose) {
+
+  SmallVector<const InputSection *> sections;
+  DenseMap<const InputSection *, uint64_t> sectionToIdx;
+  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
+  for (const auto *file : inputFiles) {
+    for (auto *sec : file->sections) {
+      for (auto &subsec : sec->subsections) {
+        auto *isec = subsec.isec;
+        if (!isec || isec->data.empty() || !isec->data.data())
+          continue;
+        unsigned sectionIdx = sections.size();
+        sectionToIdx.try_emplace(isec, sectionIdx);
+        sections.push_back(isec);
+        for (Symbol *sym : isec->symbols)
+          if (auto *d = dyn_cast_or_null<Defined>(sym))
+            symbolToSectionIdxs[d->getName()].insert(sectionIdx);
+      }
+    }
+  }
+
+  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
+  for (auto &entry : symbolToSectionIdxs) {
+    StringRef name = entry.getKey();
+    auto &sectionIdxs = entry.getValue();
+    name = getRootSymbol(name);
+    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+                                         sectionIdxs.end());
+    // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
+    // Mangler::getNameWithPrefix() for details.
+    if (name.consume_front("_") || name.consume_front("l_"))
+      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+                                           sectionIdxs.end());
+  }
+
+  std::vector<BPFunctionNode> nodesForStartup;
+  BPFunctionNode::UtilityNodeT maxUN = 0;
+  DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
+      startupSectionIdxUNs;
+  std::unique_ptr<InstrProfReader> reader;
+  if (!profilePath.empty()) {
+    auto fs = vfs::getRealFileSystem();
+    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
+    lld::checkError(readerOrErr.takeError());
+
+    reader = std::move(readerOrErr.get());
+    for (auto &entry : *reader) {
+      // Read all entries
+      (void)entry;
+    }
+    auto &traces = reader->getTemporalProfTraces();
+
+    // Used to define the initial order for startup functions.
+    DenseMap<unsigned, size_t> sectionIdxToTimestamp;
+    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
+    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
+      uint64_t currentSize = 0, cutoffSize = 1;
+      size_t cutoffTimestamp = 1;
+      auto &trace = traces[traceIdx].FunctionNameRefs;
+      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
+        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
+            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
+        ParsedFuncName = getRootSymbol(ParsedFuncName);
+
+        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
+        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
+          continue;
+        auto &sectionIdxs = sectionIdxsIt->getValue();
+        // If the same symbol is found in multiple sections, they might be
+        // identical, so we arbitrarily use the size from the first section.
+        currentSize += sections[*sectionIdxs.begin()]->getSize();
+
+        // Since BalancedPartitioning is sensitive to the initial order, we need
+        // to explicitly define it to be ordered by earliest timestamp.
+        for (unsigned sectionIdx : sectionIdxs) {
+          auto [it, wasInserted] =
+              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
+          if (!wasInserted)
+            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
+        }
+
+        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
+          ++maxUN;
+          cutoffSize = 2 * currentSize;
+          cutoffTimestamp = 2 * cutoffTimestamp;
+        }
+        for (unsigned sectionIdx : sectionIdxs)
+          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
+      }
+      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
+        for (auto un = firstUN; un <= maxUN; ++un)
+          startupSectionIdxUNs[sectionIdx].push_back(un);
+      ++maxUN;
+      sectionIdxToFirstUN.clear();
+    }
+
+    // These uns should already be sorted without duplicates.
+    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+      nodesForStartup.emplace_back(sectionIdx, uns);
+
+    llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
+      return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+             std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+    });
+  }
+
+  SmallVector<unsigned> sectionIdxsForFunctionCompression,
+      sectionIdxsForDataCompression;
+  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
+    if (startupSectionIdxUNs.count(sectionIdx))
+      continue;
+    const auto *isec = sections[sectionIdx];
+    if (isCodeSection(isec)) {
+      if (forFunctionCompression)
+        sectionIdxsForFunctionCompression.push_back(sectionIdx);
+    } else {
+      if (forDataCompression)
+        sectionIdxsForDataCompression.push_back(sectionIdx);
+    }
+  }
+
+  std::vector<BPFunctionNode> nodesForFunctionCompression,
+      nodesForDataCompression;
+  // Map a section index (to be ordered for compression) to a list of duplicate
+  // section indices (not ordered for compression).
+  DenseMap<unsigned, SmallVector<unsigned>> duplicateFunctionSectionIdxs,
+      duplicateDataSectionIdxs;
+  constructNodesForCompression(
+      sections, sectionToIdx, sectionIdxsForFunctionCompression,
+      nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN);
+  constructNodesForCompression(
+      sections, sectionToIdx, sectionIdxsForDataCompression,
+      nodesForDataCompression, duplicateDataSectionIdxs, maxUN);
+
+  // Sort nodes by their Id (which is the section index) because the input
+  // linker order tends to be not bad
+  llvm::sort(nodesForFunctionCompression,
+             [](auto &L, auto &R) { return L.Id < R.Id; });
+  llvm::sort(nodesForDataCompression,
+             [](auto &L, auto &R) { return L.Id < R.Id; });
+
+  {
+    TimeTraceScope timeScope("Balanced Partitioning");
+    BalancedPartitioningConfig config;
+    BalancedPartitioning bp(config);
+    bp.run(nodesForStartup);
+    bp.run(nodesForFunctionCompression);
+    bp.run(nodesForDataCompression);
+  }
+
+  unsigned numStartupSections = 0;
+  unsigned numCodeCompressionSections = 0;
+  unsigned numDuplicateCodeSections = 0;
+  unsigned numDataCompressionSections = 0;
+  unsigned numDuplicateDataSections = 0;
+  SetVector<const InputSection *> orderedSections;
+  // Order startup functions,
+  for (auto &node : nodesForStartup) {
+    const auto *isec = sections[node.Id];
+    if (orderedSections.insert(isec))
+      ++numStartupSections;
+  }
+  // then functions for compression,
+  for (auto &node : nodesForFunctionCompression) {
+    const auto *isec = sections[node.Id];
+    if (orderedSections.insert(isec))
+      ++numCodeCompressionSections;
+
+    auto It = duplicateFunctionSectionIdxs.find(node.Id);
+    if (It == duplicateFunctionSectionIdxs.end())
+      continue;
+    for (auto dupSecIdx : It->getSecond()) {
+      const auto *dupIsec = sections[dupSecIdx];
+      if (orderedSections.insert(dupIsec))
+        ++numDuplicateCodeSections;
+    }
+  }
+  // then data for compression.
+  for (auto &node : nodesForDataCompression) {
+    const auto *isec = sections[node.Id];
+    if (orderedSections.insert(isec))
+      ++numDataCompressionSections;
+    auto It = duplicateDataSectionIdxs.find(node.Id);
+    if (It == duplicateDataSectionIdxs.end())
+      continue;
+    for (auto dupSecIdx : It->getSecond()) {
+      const auto *dupIsec = sections[dupSecIdx];
+      if (orderedSections.insert(dupIsec))
+        ++numDuplicateDataSections;
+    }
+  }
+
+  if (verbose) {
+    unsigned numTotalOrderedSections =
+        numStartupSections + numCodeCompressionSections +
+        numDuplicateCodeSections + numDataCompressionSections +
+        numDuplicateDataSections;
+    dbgs()
+        << "Ordered " << numTotalOrderedSections
+        << " sections using balanced partitioning:\n  Functions for startup: "
+        << numStartupSections
+        << "\n  Functions for compression: " << numCodeCompressionSections
+        << "\n  Duplicate functions: " << numDuplicateCodeSections
+        << "\n  Data for compression: " << numDataCompressionSections
+        << "\n  Duplicate data: " << numDuplicateDataSections << "\n";
+
+    if (!profilePath.empty()) {
+      // Evaluate this function order for startup
+      StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
+      const uint64_t pageSize = (1 << 14);
+      uint64_t currentAddress = 0;
+      for (const auto *isec : orderedSections) {
+        for (Symbol *sym : isec->symbols) {
+          if (auto *d = dyn_cast_or_null<Defined>(sym)) {
+            uint64_t startAddress = currentAddress + d->value;
+            uint64_t endAddress = startAddress + d->size;
+            uint64_t firstPage = startAddress / pageSize;
+            // I think the kernel might pull in a few pages when one it touched,
+            // so it might be more accurate to force lastPage to be aligned by
+            // 4?
+            uint64_t lastPage = endAddress / pageSize;
+            StringRef rootSymbol = d->getName();
+            rootSymbol = getRootSymbol(rootSymbol);
+            symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+            if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
+              symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+          }
+        }
+
+        currentAddress += isec->getSize();
+      }
+
+      // The area under the curve F where F(t) is the total number of page
+      // faults at step t.
+      unsigned area = 0;
+      for (auto &trace : reader->getTemporalProfTraces()) {
+        SmallSet<uint64_t, 0> touchedPages;
+        for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
+          auto traceId = trace.FunctionNameRefs[step];
+          auto [Filename, ParsedFuncName] =
+              getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
+          ParsedFuncName = getRootSymbol(ParsedFuncName);
+          auto it = symbolToPageNumbers.find(ParsedFuncName);
+          if (it != symbolToPageNumbers.end()) {
+            auto &[firstPage, lastPage] = it->getValue();
+            for (uint64_t i = firstPage; i <= lastPage; i++)
+              touchedPages.insert(i);
+          }
+          area += touchedPages.size();
+        }
+      }
+      dbgs() << "Total area under the page fault curve: " << (float)area
+             << "\n";
+    }
+  }
+
+  DenseMap<const InputSection *, size_t> sectionPriorities;
+  for (const auto *isec : orderedSections)
+    sectionPriorities[isec] = --highestAvailablePriority;
+  return sectionPriorities;
+}
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
new file mode 100644
index 0000000000000..6f9eefd5d82be
--- /dev/null
+++ b/lld/MachO/BPSectionOrderer.h
@@ -0,0 +1,37 @@
+//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file uses Balanced Partitioning to order sections to improve startup
+/// time and compressed size.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_MACHO_BPSECTION_ORDERER_H
+#define LLD_MACHO_BPSECTION_ORDERER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace lld::macho {
+
+class InputSection;
+
+/// Run Balanced Partitioning to find the optimal function and data order to
+/// improve startup time and compressed size.
+///
+/// It is important that .subsections_via_symbols is used to ensure functions
+/// and data are in their own sections and thus can be reordered.
+llvm::DenseMap<const lld::macho::InputSection *, size_t>
+runBalancedPartitioning(size_t &highestAvailablePriority,
+                        llvm::StringRef profilePath,
+                        bool forFunctionCompression, bool forDataCompression,
+                        bool verbose);
+
+} // namespace lld::macho
+
+#endif
diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
index 0b92488b00bea..385a5f5d022eb 100644
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -25,7 +25,9 @@ add_lld_library(lldMachO
   OutputSection.cpp
   OutputSegment.cpp
   Relocations.cpp
+  BPSectionOrderer.cpp
   SectionPriorities.cpp
+  Sections.cpp
   SymbolTable.cpp
   Symbols.cpp
   SyntheticSections.cpp
@@ -47,6 +49,7 @@ add_lld_library(lldMachO
   Object
   Option
   Passes
+  ProfileData
   Support
   TargetParser
   TextAPI
diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp
index 279423720be9d..e89cafe0d1e6d 100644
--- a/lld/MachO/ConcatOutputSection.cpp
+++ b/lld/MachO/ConcatOutputSection.cpp
@@ -127,10 +127,20 @@ bool TextOutputSection::needsThunks() const {
   uint64_t isecAddr = addr;
   for (ConcatInputSection *isec : inputs)
     isecAddr = alignToPowerOf2(isecAddr, isec->align) + isec->getSize();
-  if (isecAddr - addr + in.stubs->getSize() <=
-      std::min(target->backwardBranchRange, target->forwardBranchRange))
+  // Other sections besides __text might be small enough to pass this
+  // test but nevertheless need thunks for calling into other sections.
+  // An imperfect heuristic to use in this case is that if a section
+  // we've already processed in this segment needs thunks, so do the
+  // rest.
+  bool needsThunks = parent && parent->needsThunks;
+  if (!needsThunks &&
+      isecAddr - addr + in.stubs->getSize() <=
+          std::min(target->backwardBranchRange, target->forwardBranchRange))
     return false;
   // Yes, this program is large enough to need thunks.
+  if (parent) {
+    parent->needsThunks = true;
+  }
   for (ConcatInputSection *isec : inputs) {
     for (Reloc &r : isec->relocs) {
       if (!target->hasAttr(r.type, RelocAttrBits::BRANCH))
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 4d3f3d05c2338..5beb0662ba727 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -199,6 +199,7 @@ struct Configuration {
   std::vector<llvm::StringRef> systemLibraryRoots;
   std::vector<llvm::StringRef> librarySearchPaths;
   std::vector<llvm::StringRef> frameworkSearchPaths;
+  bool warnDuplicateRpath = true;
   llvm::SmallVector<llvm::StringRef, 0> runtimePaths;
   std::vector<std::string> astPaths;
   std::vector<Symbol *> explicitUndefineds;
@@ -211,10 +212,16 @@ struct Configuration {
   bool csProfileGenerate = false;
   llvm::StringRef csProfilePath;
   bool pgoWarnMismatch;
+  bool warnThinArchiveMissingMembers;
 
   bool callGraphProfileSort = false;
   llvm::StringRef printSymbolOrder;
 
+  llvm::StringRef irpgoProfileSortProfilePath;
+  bool functionOrderForCompression = false;
+  bool dataOrderForCompression = false;
+  bool verboseBpSectionOrderer = false;
+
   SectionRenameMap sectionRenameMap;
   SegmentRenameMap segmentRenameMap;
 
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index a370d5734124a..f3d2a93914f71 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -49,6 +49,7 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TextAPI/Architecture.h"
 #include "llvm/TextAPI/PackedVersion.h"
 
 #include <algorithm>
@@ -270,6 +271,20 @@ struct ArchiveFileInfo {
 
 static DenseMap<StringRef, ArchiveFileInfo> loadedArchives;
 
+static void saveThinArchiveToRepro(ArchiveFile const *file) {
+  assert(tar && file->getArchive().isThin());
+
+  Error e = Error::success();
+  for (const object::Archive::Child &c : file->getArchive().children(e)) {
+    MemoryBufferRef mb = CHECK(c.getMemoryBufferRef(),
+                               toString(file) + ": failed to get buffer");
+    tar->append(relativeToRoot(CHECK(c.getFullName(), file)), mb.getBuffer());
+  }
+  if (e)
+    error(toString(file) +
+          ": Archive::children failed: " + toString(std::move(e)));
+}
+
 static InputFile *addFile(StringRef path, LoadType loadType,
                           bool isLazy = false, bool isExplicit = true,
                           bool isBundleLoader = false,
@@ -301,6 +316,9 @@ static InputFile *addFile(StringRef path, LoadType loadType,
       if (!archive->isEmpty() && !archive->hasSymbolTable())
         error(path + ": archive has no index; run ranlib to add one");
       file = make<ArchiveFile>(std::move(archive), isForceHidden);
+
+      if (tar && file->getArchive().isThin())
+        saveThinArchiveToRepro(file);
     } else {
       file = entry->second.file;
       // Command-line loads take precedence. If file is previously loaded via
@@ -330,9 +348,13 @@ static InputFile *addFile(StringRef path, LoadType loadType,
               reason = "-all_load";
               break;
           }
-          if (Error e = file->fetch(c, reason))
-            error(toString(file) + ": " + reason +
-                  " failed to load archive member: " + toString(std::move(e)));
+          if (Error e = file->fetch(c, reason)) {
+            if (config->warnThinArchiveMissingMembers)
+              warn(toString(file) + ": " + reason +
+                   " failed to load archive member: " + toString(std::move(e)));
+            else
+              llvm::consumeError(std::move(e));
+          }
         }
         if (e)
           error(toString(file) +
@@ -349,7 +371,18 @@ static InputFile *addFile(StringRef path, LoadType loadType,
         Error e = Error::success();
         for (const object::Archive::Child &c : file->getArchive().children(e)) {
           Expected<MemoryBufferRef> mb = c.getMemoryBufferRef();
-          if (!mb || !hasObjCSection(*mb))
+          if (!mb) {
+            // We used to create broken repro tarballs that only included those
+            // object files from thin archives that ended up being used.
+            if (config->warnThinArchiveMissingMembers)
+              warn(toString(file) + ": -ObjC failed to open archive member: " +
+                   toString(mb.takeError()));
+            else
+              llvm::consumeError(mb.takeError());
+            continue;
+          }
+
+          if (!hasObjCSection(*mb))
             continue;
           if (Error e = file->fetch(c, "-ObjC"))
             error(toString(file) + ": -ObjC failed to load archive member: " +
@@ -1048,42 +1081,55 @@ static bool shouldEmitChainedFixups(const InputArgList &args) {
   if (arg && arg->getOption().matches(OPT_no_fixup_chains))
     return false;
 
-  bool isRequested = arg != nullptr;
-
-  // Version numbers taken from the Xcode 13.3 release notes.
-  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
-      {{{PLATFORM_MACOS, VersionTuple(11, 0)},
-        {PLATFORM_IOS, VersionTuple(13, 4)},
-        {PLATFORM_TVOS, VersionTuple(14, 0)},
-        {PLATFORM_WATCHOS, VersionTuple(7, 0)},
-        {PLATFORM_XROS, VersionTuple(1, 0)}}};
-  PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
-  auto it = llvm::find_if(minVersion,
-                          [&](const auto &p) { return p.first == platform; });
-  if (it != minVersion.end() &&
-      it->second > config->platformInfo.target.MinDeployment) {
-    if (!isRequested)
-      return false;
+  bool requested = arg && arg->getOption().matches(OPT_fixup_chains);
+  if (!config->isPic) {
+    if (requested)
+      error("-fixup_chains is incompatible with -no_pie");
 
-    warn("-fixup_chains requires " + getPlatformName(config->platform()) + " " +
-         it->second.getAsString() + ", which is newer than target minimum of " +
-         config->platformInfo.target.MinDeployment.getAsString());
+    return false;
   }
 
   if (!is_contained({AK_x86_64, AK_x86_64h, AK_arm64}, config->arch())) {
-    if (isRequested)
+    if (requested)
       error("-fixup_chains is only supported on x86_64 and arm64 targets");
+
     return false;
   }
 
-  if (!config->isPic) {
-    if (isRequested)
-      error("-fixup_chains is incompatible with -no_pie");
+  if (args.hasArg(OPT_preload)) {
+    if (requested)
+      error("-fixup_chains is incompatible with -preload");
+
     return false;
   }
 
-  // TODO: Enable by default once stable.
-  return isRequested;
+  if (requested)
+    return true;
+
+  static const std::array<std::pair<PlatformType, VersionTuple>, 9> minVersion =
+      {{
+          {PLATFORM_IOS, VersionTuple(13, 4)},
+          {PLATFORM_IOSSIMULATOR, VersionTuple(16, 0)},
+          {PLATFORM_MACOS, VersionTuple(13, 0)},
+          {PLATFORM_TVOS, VersionTuple(14, 0)},
+          {PLATFORM_TVOSSIMULATOR, VersionTuple(15, 0)},
+          {PLATFORM_WATCHOS, VersionTuple(7, 0)},
+          {PLATFORM_WATCHOSSIMULATOR, VersionTuple(8, 0)},
+          {PLATFORM_XROS, VersionTuple(1, 0)},
+          {PLATFORM_XROS_SIMULATOR, VersionTuple(1, 0)},
+      }};
+  PlatformType platform = config->platformInfo.target.Platform;
+  auto it = llvm::find_if(minVersion,
+                          [&](const auto &p) { return p.first == platform; });
+
+  // We don't know the versions for other platforms, so default to disabled.
+  if (it == minVersion.end())
+    return false;
+
+  if (it->second > config->platformInfo.target.MinDeployment)
+    return false;
+
+  return true;
 }
 
 static bool shouldEmitRelativeMethodLists(const InputArgList &args) {
@@ -1402,6 +1448,19 @@ static void eraseInitializerSymbols() {
       sym->used = false;
 }
 
+static SmallVector<StringRef, 0> getRuntimePaths(opt::InputArgList &args) {
+  SmallVector<StringRef, 0> vals;
+  DenseSet<StringRef> seen;
+  for (const Arg *arg : args.filtered(OPT_rpath)) {
+    StringRef val = arg->getValue();
+    if (seen.insert(val).second)
+      vals.push_back(val);
+    else if (config->warnDuplicateRpath)
+      warn("duplicate -rpath '" + val + "' ignored [--warn-duplicate-rpath]");
+  }
+  return vals;
+}
+
 namespace lld {
 namespace macho {
 bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
@@ -1642,7 +1701,9 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with "
           "--thinlto-index-only=");
   }
-  config->runtimePaths = args::getStrings(args, OPT_rpath);
+  config->warnDuplicateRpath =
+      args.hasFlag(OPT_warn_duplicate_rpath, OPT_no_warn_duplicate_rpath, true);
+  config->runtimePaths = getRuntimePaths(args);
   config->allLoad = args.hasFlag(OPT_all_load, OPT_noall_load, false);
   config->archMultiple = args.hasArg(OPT_arch_multiple);
   config->applicationExtension = args.hasFlag(
@@ -1684,8 +1745,39 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   config->csProfilePath = args.getLastArgValue(OPT_cs_profile_path);
   config->pgoWarnMismatch =
       args.hasFlag(OPT_pgo_warn_mismatch, OPT_no_pgo_warn_mismatch, true);
+  config->warnThinArchiveMissingMembers =
+      args.hasFlag(OPT_warn_thin_archive_missing_members,
+                   OPT_no_warn_thin_archive_missing_members, true);
   config->generateUuid = !args.hasArg(OPT_no_uuid);
 
+  auto IncompatWithCGSort = [&](StringRef firstArgStr) {
+    // Throw an error only if --call-graph-profile-sort is explicitly specified
+    if (config->callGraphProfileSort)
+      if (const Arg *arg = args.getLastArgNoClaim(OPT_call_graph_profile_sort))
+        error(firstArgStr + " is incompatible with " + arg->getSpelling());
+  };
+  if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
+    config->irpgoProfileSortProfilePath = arg->getValue();
+    IncompatWithCGSort(arg->getSpelling());
+  }
+  if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
+    StringRef compressionSortStr = arg->getValue();
+    if (compressionSortStr == "function") {
+      config->functionOrderForCompression = true;
+    } else if (compressionSortStr == "data") {
+      config->dataOrderForCompression = true;
+    } else if (compressionSortStr == "both") {
+      config->functionOrderForCompression = true;
+      config->dataOrderForCompression = true;
+    } else if (compressionSortStr != "none") {
+      error("unknown value `" + compressionSortStr + "` for " +
+            arg->getSpelling());
+    }
+    if (compressionSortStr != "none")
+      IncompatWithCGSort(arg->getSpelling());
+  }
+  config->verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);
+
   for (const Arg *arg : args.filtered(OPT_alias)) {
     config->aliasedSymbols.push_back(
         std::make_pair(arg->getValue(0), arg->getValue(1)));
diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index b40a812f30bd3..3086c9cc4729d 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -2200,10 +2200,6 @@ Error ArchiveFile::fetch(const object::Archive::Child &c, StringRef reason) {
   if (!mb)
     return mb.takeError();
 
-  // Thin archives refer to .o files, so --reproduce needs the .o files too.
-  if (tar && c.getParent()->isThin())
-    tar->append(relativeToRoot(CHECK(c.getFullName(), this)), mb->getBuffer());
-
   Expected<TimePoint<std::chrono::seconds>> modTime = c.getLastModified();
   if (!modTime)
     return modTime.takeError();
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index 904701731684b..a9b93e07a6013 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -11,6 +11,7 @@
 #include "Config.h"
 #include "InputFiles.h"
 #include "OutputSegment.h"
+#include "Sections.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
@@ -366,20 +367,8 @@ uint64_t WordLiteralInputSection::getOffset(uint64_t off) const {
 }
 
 bool macho::isCodeSection(const InputSection *isec) {
-  uint32_t type = sectionType(isec->getFlags());
-  if (type != S_REGULAR && type != S_COALESCED)
-    return false;
-
-  uint32_t attr = isec->getFlags() & SECTION_ATTRIBUTES_USR;
-  if (attr == S_ATTR_PURE_INSTRUCTIONS)
-    return true;
-
-  if (isec->getSegName() == segment_names::text)
-    return StringSwitch<bool>(isec->getName())
-        .Cases(section_names::textCoalNt, section_names::staticInit, true)
-        .Default(false);
-
-  return false;
+  return sections::isCodeSection(isec->getName(), isec->getSegName(),
+                                 isec->getFlags());
 }
 
 bool macho::isCfStringSection(const InputSection *isec) {
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 4a6f99654ba13..9c056f40aa943 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -1303,12 +1303,16 @@ void ObjcCategoryMerger::eraseMergedCategories() {
         continue;
 
       eraseISec(catInfo.catBodyIsec);
-      // We can't erase 'catLayout.nameOffset' for Swift categories because the
-      // name will be referenced for generating relative offsets
-      // See usages of 'l_.str.11.SimpleClass' in objc-category-merging-swift.s
+
+      // We can't erase 'catLayout.nameOffset' for either Swift or ObjC
+      //   categories because the name will sometimes also be used for other
+      //   purposes.
+      // For Swift, see usages of 'l_.str.11.SimpleClass' in
+      //   objc-category-merging-swift.s
+      // For ObjC, see usages of 'l_OBJC_CLASS_NAME_.1' in
+      //   objc-category-merging-erase-objc-name-test.s
       // TODO: handle the above in a smarter way
-      if (catInfo.sourceLanguage != SourceLanguage::Swift)
-        tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+
       tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
                                   catLayout.instanceMethodsOffset);
       tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index aecced9279da4..9c9570cdbeb05 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -111,6 +111,12 @@ def no_warn_dylib_install_name: Flag<["--"], "no-warn-dylib-install-name">,
 def warn_dylib_install_name: Flag<["--"], "warn-dylib-install-name">,
     HelpText<"Warn on -install_name if -dylib is not passed">,
     Group<grp_lld>;
+def warn_duplicate_rpath: Flag<["--"], "warn-duplicate-rpath">,
+    HelpText<"Warn if the same -rpath is specified multiple times (default)">,
+    Group<grp_lld>;
+def no_warn_duplicate_rpath: Flag<["--"], "no-warn-duplicate-rpath">,
+    HelpText<"Do not warn if the same -rpath is specified multiple times">,
+    Group<grp_lld>;
 def call_graph_profile_sort: Flag<["--"], "call-graph-profile-sort">,
     HelpText<"Reorder sections with call graph profile (default)">,
     Group<grp_lld>;
@@ -120,6 +126,17 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
 def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
     HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
     Group<grp_lld>;
+def irpgo_profile_sort: Separate<["--"], "irpgo-profile-sort">, Group<grp_lld>;
+def irpgo_profile_sort_eq: Joined<["--"], "irpgo-profile-sort=">,
+    Alias<!cast<Separate>(irpgo_profile_sort)>, MetaVarName<"<profile>">,
+    HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
+    Group<grp_lld>;
+def compression_sort: Joined<["--"], "compression-sort=">,
+    MetaVarName<"[none,function,data,both]">,
+    HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
+def verbose_bp_section_orderer: Flag<["--"], "verbose-bp-section-orderer">,
+    HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">,
+    Group<grp_lld>;
 def ignore_auto_link_option : Separate<["--"], "ignore-auto-link-option">,
     Group<grp_lld>;
 def ignore_auto_link_option_eq : Joined<["--"], "ignore-auto-link-option=">,
@@ -147,6 +164,9 @@ def cs_profile_path: Joined<["--"], "cs-profile-path=">,
 defm pgo_warn_mismatch: BB<"pgo-warn-mismatch",
   "turn on warnings about profile cfg mismatch (default)",
   "turn off warnings about profile cfg mismatch">, Group<grp_lld>;
+defm warn_thin_archive_missing_members : BB<"warn-thin-archive-missing-members",
+  "Warn on missing object files referenced by thin archives (default)",
+  "Do not warn on missing object files referenced by thin archives">, Group<grp_lld>;
 
 // This is a complete Options.td compiled from Apple's ld(1) manpage
 // dated 2018-03-07 and cross checked with ld64 source code in repo
diff --git a/lld/MachO/OutputSegment.cpp b/lld/MachO/OutputSegment.cpp
index a887bc4d515de..3d8a8eb61a9bb 100644
--- a/lld/MachO/OutputSegment.cpp
+++ b/lld/MachO/OutputSegment.cpp
@@ -9,6 +9,7 @@
 #include "OutputSegment.h"
 #include "ConcatOutputSection.h"
 #include "InputSection.h"
+#include "Sections.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 
@@ -89,9 +90,20 @@ static int sectionOrder(OutputSection *osec) {
   StringRef segname = osec->parent->name;
   // Sections are uniquely identified by their segment + section name.
   if (segname == segment_names::text) {
+    if (osec->name == section_names::header)
+      return -7;
+    // `__text` needs to precede the other code sections since its
+    // expected to be the largest. This means in effect that it will
+    // be the section that determines whether we need thunks or not.
+    if (osec->name == section_names::text)
+      return -6;
+    // Ensure all code sections are contiguous with `__text` for thunk
+    // calculations.
+    if (sections::isCodeSection(osec->name, segment_names::text, osec->flags) &&
+        osec->name != section_names::stubHelper) {
+      return -5;
+    }
     return StringSwitch<int>(osec->name)
-        .Case(section_names::header, -6)
-        .Case(section_names::text, -5)
         .Case(section_names::stubs, -4)
         .Case(section_names::stubHelper, -3)
         .Case(section_names::objcStubs, -2)
diff --git a/lld/MachO/OutputSegment.h b/lld/MachO/OutputSegment.h
index 7a0c4a2065a15..5f39734ffdbd7 100644
--- a/lld/MachO/OutputSegment.h
+++ b/lld/MachO/OutputSegment.h
@@ -57,6 +57,7 @@ class OutputSegment {
   uint32_t initProt = 0;
   uint32_t flags = 0;
   uint8_t index;
+  bool needsThunks = false;
 
   llvm::TinyPtrVector<Defined *> segmentStartSymbols;
   llvm::TinyPtrVector<Defined *> segmentEndSymbols;
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index 907aee29d2386..69c301d8ff8a7 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SectionPriorities.h"
+#include "BPSectionOrderer.h"
 #include "Config.h"
 #include "InputFiles.h"
 #include "Symbols.h"
@@ -352,7 +353,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
 DenseMap<const InputSection *, size_t>
 macho::PriorityBuilder::buildInputSectionPriorities() {
   DenseMap<const InputSection *, size_t> sectionPriorities;
-  if (config->callGraphProfileSort) {
+  if (!config->irpgoProfileSortProfilePath.empty() ||
+      config->functionOrderForCompression || config->dataOrderForCompression) {
+    TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
+    sectionPriorities = runBalancedPartitioning(
+        highestAvailablePriority, config->irpgoProfileSortProfilePath,
+        config->functionOrderForCompression, config->dataOrderForCompression,
+        config->verboseBpSectionOrderer);
+  } else if (config->callGraphProfileSort) {
     // Sort sections by the profile data provided by __LLVM,__cg_profile
     // sections.
     //
diff --git a/lld/MachO/Sections.cpp b/lld/MachO/Sections.cpp
new file mode 100644
index 0000000000000..a27d902c0a227
--- /dev/null
+++ b/lld/MachO/Sections.cpp
@@ -0,0 +1,36 @@
+//===- Sections.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sections.h"
+#include "InputSection.h"
+#include "OutputSegment.h"
+
+#include "llvm/ADT/StringSwitch.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace lld::macho::sections {
+bool isCodeSection(StringRef name, StringRef segName, uint32_t flags) {
+  uint32_t type = sectionType(flags);
+  if (type != S_REGULAR && type != S_COALESCED)
+    return false;
+
+  uint32_t attr = flags & SECTION_ATTRIBUTES_USR;
+  if (attr == S_ATTR_PURE_INSTRUCTIONS)
+    return true;
+
+  if (segName == segment_names::text)
+    return StringSwitch<bool>(name)
+        .Cases(section_names::textCoalNt, section_names::staticInit, true)
+        .Default(false);
+
+  return false;
+}
+
+} // namespace lld::macho::sections
diff --git a/lld/MachO/Sections.h b/lld/MachO/Sections.h
new file mode 100644
index 0000000000000..2246333d85729
--- /dev/null
+++ b/lld/MachO/Sections.h
@@ -0,0 +1,19 @@
+//===- Sections.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_MACHO_SECTIONS_H
+#define LLD_MACHO_SECTIONS_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace lld::macho::sections {
+bool isCodeSection(llvm::StringRef name, llvm::StringRef segName,
+                   uint32_t flags);
+} // namespace lld::macho::sections
+
+#endif // #ifndef LLD_MACHO_SECTIONS_H
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index e6b80c1d42d9e..0eb809282af28 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -28,8 +28,8 @@
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/thread.h"
 #include "llvm/Support/xxhash.h"
 
 #include <algorithm>
@@ -66,7 +66,6 @@ class Writer {
 
   template <class LP> void run();
 
-  DefaultThreadPool threadPool;
   std::unique_ptr<FileOutputBuffer> &buffer;
   uint64_t addr = 0;
   uint64_t fileOff = 0;
@@ -1121,14 +1120,12 @@ void Writer::finalizeLinkEditSegment() {
       symtabSection,     indirectSymtabSection,
       dataInCodeSection, functionStartsSection,
   };
-  SmallVector<std::shared_future<void>> threadFutures;
-  threadFutures.reserve(linkEditSections.size());
-  for (LinkEditSection *osec : linkEditSections)
-    if (osec)
-      threadFutures.emplace_back(threadPool.async(
-          [](LinkEditSection *osec) { osec->finalizeContents(); }, osec));
-  for (std::shared_future<void> &future : threadFutures)
-    future.wait();
+
+  parallelForEach(linkEditSections.begin(), linkEditSections.end(),
+                  [](LinkEditSection *osec) {
+                    if (osec)
+                      osec->finalizeContents();
+                  });
 
   // Now that __LINKEDIT is filled out, do a proper calculation of its
   // addresses and offsets.
@@ -1170,6 +1167,8 @@ void Writer::openFile() {
 }
 
 void Writer::writeSections() {
+  TimeTraceScope timeScope("Write output sections");
+
   uint8_t *buf = buffer->getBufferStart();
   std::vector<const OutputSection *> osecs;
   for (const OutputSegment *seg : outputSegments)
@@ -1200,18 +1199,15 @@ void Writer::writeUuid() {
 
   ArrayRef<uint8_t> data{buffer->getBufferStart(), buffer->getBufferEnd()};
   std::vector<ArrayRef<uint8_t>> chunks = split(data, 1024 * 1024);
+
   // Leave one slot for filename
   std::vector<uint64_t> hashes(chunks.size() + 1);
-  SmallVector<std::shared_future<void>> threadFutures;
-  threadFutures.reserve(chunks.size());
-  for (size_t i = 0; i < chunks.size(); ++i)
-    threadFutures.emplace_back(threadPool.async(
-        [&](size_t j) { hashes[j] = xxh3_64bits(chunks[j]); }, i));
-  for (std::shared_future<void> &future : threadFutures)
-    future.wait();
+  parallelFor(0, chunks.size(),
+              [&](size_t i) { hashes[i] = xxh3_64bits(chunks[i]); });
   // Append the output filename so that identical binaries with different names
   // don't get the same UUID.
   hashes[chunks.size()] = xxh3_64bits(sys::path::filename(config->finalOutput));
+
   uint64_t digest = xxh3_64bits({reinterpret_cast<uint8_t *>(hashes.data()),
                                  hashes.size() * sizeof(uint64_t)});
   uuidCommand->writeUuid(digest);
@@ -1330,15 +1326,18 @@ template <class LP> void Writer::run() {
   sortSegmentsAndSections();
   createLoadCommands<LP>();
   finalizeAddresses();
-  threadPool.async([&] {
+
+  llvm::thread mapFileWriter([&] {
     if (LLVM_ENABLE_THREADS && config->timeTraceEnabled)
       timeTraceProfilerInitialize(config->timeTraceGranularity, "writeMapFile");
     writeMapFile();
     if (LLVM_ENABLE_THREADS && config->timeTraceEnabled)
       timeTraceProfilerFinishThread();
   });
+
   finalizeLinkEditSegment();
   writeOutputFile();
+  mapFileWriter.join();
 }
 
 template <class LP> void macho::writeResult() { Writer().run<LP>(); }
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index ba8543732bb8e..e9d3c12b76545 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -26,69 +26,9 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
 
-* ``EI_OSABI`` in the output is now inferred from input object files.
-  (`#97144 <https://github.com/llvm/llvm-project/pull/97144>`_)
-* ``--compress-sections <section-glib>={none,zlib,zstd}[:level]`` is added to compress
-  matched output sections without the ``SHF_ALLOC`` flag.
-  (`#84855 <https://github.com/llvm/llvm-project/pull/84855>`_)
-  (`#90567 <https://github.com/llvm/llvm-project/pull/90567>`_)
-* The default compression level for zlib is now independent of linker
-  optimization level (``Z_BEST_SPEED``).
-* zstd compression parallelism no longer requires ``ZSTD_MULITHREAD`` build.
-* ``GNU_PROPERTY_AARCH64_FEATURE_PAUTH`` notes, ``R_AARCH64_AUTH_ABS64`` and
-  ``R_AARCH64_AUTH_RELATIVE`` relocations are now supported.
-  (`#72714 <https://github.com/llvm/llvm-project/pull/72714>`_)
-* ``--no-allow-shlib-undefined`` now rejects non-exported definitions in the
-  ``def-hidden.so ref.so`` case.
-  (`#86777 <https://github.com/llvm/llvm-project/issues/86777>`_)
-* ``--debug-names`` is added to create a merged ``.debug_names`` index
-  from input ``.debug_names`` sections. Type units are not handled yet.
-  (`#86508 <https://github.com/llvm/llvm-project/pull/86508>`_)
-* ``--enable-non-contiguous-regions`` option allows automatically packing input
-  sections into memory regions by automatically spilling to later matches if a
-  region would overflow. This reduces the toil of manually packing regions
-  (typical for embedded). It also makes full LTO feasible in such cases, since
-  IR merging currently prevents the linker script from referring to input
-  files. (`#90007 <https://github.com/llvm/llvm-project/pull/90007>`_)
-* ``--default-script`/``-dT`` is implemented to specify a default script that is processed
-  if ``--script``/``-T`` is not specified.
-  (`#89327 <https://github.com/llvm/llvm-project/pull/89327>`_)
-* ``--force-group-allocation`` is implemented to discard ``SHT_GROUP`` sections
-  and combine relocation sections if their relocated section group members are
-  placed to the same output section.
-  (`#94704 <https://github.com/llvm/llvm-project/pull/94704>`_)
-* ``--build-id`` now defaults to generating a 20-byte digest ("sha1") instead
-  of 8-byte ("fast"). This improves compatibility with RPM packaging tools.
-  (`#93943 <https://github.com/llvm/llvm-project/pull/93943>`_)
-* ``-z lrodata-after-bss`` is implemented to place ``.lrodata`` after ``.bss``.
-  (`#81224 <https://github.com/llvm/llvm-project/pull/81224>`_)
-* ``--export-dynamic`` no longer creates dynamic sections for ``-no-pie`` static linking.
-* ``--lto-emit-asm`` is now added as the canonical spelling of ``--plugin-opt=emit-llvm``.
-* ``--lto-emit-llvm`` now uses the pre-codegen module.
-  (`#97480 <https://github.com/llvm/llvm-project/pull/97480>`_)
-* When AArch64 PAuth is enabled, ``-z pack-relative-relocs`` now encodes ``R_AARCH64_AUTH_RELATIVE`` relocations in ``.rela.auth.dyn``.
-  (`#96496 <https://github.com/llvm/llvm-project/pull/96496>`_)
-* ``-z gcs`` and ``-z gcs-report`` are now supported for AArch64 Guarded Control Stack extension.
-* ``-r`` now forces ``-Bstatic``.
-* Thumb2 PLT is now supported for Cortex-M processors.
-  (`#93644 <https://github.com/llvm/llvm-project/pull/93644>`_)
-* ``DW_EH_sdata4`` of addresses larger than 0x80000000 is now supported for MIPS32.
-  (`#92438 <https://github.com/llvm/llvm-project/pull/92438>`_)
-* Certain unknown section types are rejected.
-  (`#85173 <https://github.com/llvm/llvm-project/pull/85173>`_)
-* ``PROVIDE(lhs = rhs) PROVIDE(rhs = ...)``, ``lhs`` is now defined only if ``rhs`` is needed.
-  (`#74771 <https://github.com/llvm/llvm-project/issues/74771>`_)
-  (`#87530 <https://github.com/llvm/llvm-project/pull/87530>`_)
-* ``OUTPUT_FORMAT(binary)`` is now supported.
-  (`#98837 <https://github.com/llvm/llvm-project/pull/98837>`_)
-* ``NOCROSSREFS`` and ``NOCRFOSSREFS_TO`` commands now supported to prohibit
-  cross references between certain output sections.
-  (`#98773 <https://github.com/llvm/llvm-project/pull/98773>`_)
-* Orphan placement is refined to prefer the last similar section when its rank <= orphan's rank.
-  (`#94099 <https://github.com/llvm/llvm-project/pull/94099>`_)
-  Non-alloc orphan sections are now placed at the end.
-  (`#94519 <https://github.com/llvm/llvm-project/pull/94519>`_)
-* R_X86_64_REX_GOTPCRELX of the addq form is no longer incorrectly optimized when the address is larger than 0x80000000.
+* ``-z nosectionheader`` has been implemented to omit the section header table.
+  The operation is similar to ``llvm-objcopy --strip-sections``.
+  (`#101286 <https://github.com/llvm/llvm-project/pull/101286>`_)
 
 Breaking changes
 ----------------
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index f9a00b7875038..b22cb36283771 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -857,6 +857,9 @@ The object will omit the
 .Dv PT_GNU_RELRO
 segment.
 .Pp
+.It Cm nosectionheader
+Don't generate the section header table.
+.Pp
 .It Cm notext
 Allow relocations against read-only segments.
 Sets the
diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt
index 25d8f0a424926..5d4a2757c529b 100644
--- a/lld/test/CMakeLists.txt
+++ b/lld/test/CMakeLists.txt
@@ -64,6 +64,7 @@ if (NOT LLD_BUILT_STANDALONE)
     llvm-profdata
     llvm-readelf
     llvm-readobj
+    llvm-strings
     llvm-strip
     llvm-symbolizer
     not
diff --git a/lld/test/COFF/lto-cpu-string.ll b/lld/test/COFF/lto-cpu-string.ll
index c2a8df5f75bad..0f233b32bd43a 100644
--- a/lld/test/COFF/lto-cpu-string.ll
+++ b/lld/test/COFF/lto-cpu-string.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-pc-windows-msvc19.14.26433"
 
 define dllexport void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s b/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
index 608b1aceb0a00..a9b95b1a63ef6 100644
--- a/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
+++ b/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
@@ -12,7 +12,7 @@
         .type _start, %function
 _start: ret
 
-// CHECK:     Name: $x.0
+// CHECK:     Name: $x
 // CHECK-NEXT:     Value: 0x0
 // CHECK-NEXT:     Size: 0
 // CHECK-NEXT:     Binding: Local (0x0)
diff --git a/lld/test/ELF/aarch64-gnu-ifunc.s b/lld/test/ELF/aarch64-gnu-ifunc.s
index d76b54eabf8a4..f39ae0cd84934 100644
--- a/lld/test/ELF/aarch64-gnu-ifunc.s
+++ b/lld/test/ELF/aarch64-gnu-ifunc.s
@@ -37,7 +37,7 @@
 // CHECK-NEXT:    Section: Undefined
 // CHECK-NEXT:  }
 // CHECK-NEXT:  Symbol {
-// CHECK-NEXT:    Name: $x.0
+// CHECK-NEXT:    Name: $x
 // CHECK-NEXT:    Value: 0x210188
 // CHECK-NEXT:    Size: 0
 // CHECK-NEXT:    Binding: Local
diff --git a/lld/test/ELF/aarch64-reloc-implicit-addend.test b/lld/test/ELF/aarch64-reloc-implicit-addend.test
index 804ed97a27371..23fc9209d2201 100644
--- a/lld/test/ELF/aarch64-reloc-implicit-addend.test
+++ b/lld/test/ELF/aarch64-reloc-implicit-addend.test
@@ -1,97 +1,316 @@
-## Test certain REL relocation types generated by legacy armasm.
-# RUN: yaml2obj %s -o %t.o
-# RUN: ld.lld %t.o -o %t
-# RUN: llvm-objdump -s %t | FileCheck %s
-
-# CHECK:      Contents of section .abs:
-# CHECK-NEXT:  [[#%x,]] 29002800 00002700 00000000 0000fcff  ).(...'.........
-# CHECK-NEXT:  [[#%x,]] ffffffff ffff                        ......
-# CHECK-NEXT: Contents of section .uabs:
-# CHECK-NEXT:  [[#%x,]] 40ffffff 40ffffff 20ffffff 20ffffff  @...@... ... ...
-# CHECK-NEXT:  [[#%x,]] 00ffffff 00ffffff                    ........
-# CHECK-NEXT: Contents of section .prel:
-# CHECK-NEXT:  [[#%x,]] 00ffffff fcfeffff f8feffff a0ffffff  ................
-# CHECK-NEXT:  [[#%x,]] 0010009f 0010009f                    ........
-# CHECK-NEXT: Contents of section .branch:
-# CHECK-NEXT:  [[#%x,]] f0ffffff f0ffffff fdffffff fcffff14  ................
-
----
-!ELF
-FileHeader:
-  Class: ELFCLASS64
-  Data: ELFDATA2LSB
-  Type: ET_REL
-  Machine: EM_AARCH64
-Sections:
-  - Name:    .abs
-    Type:    SHT_PROGBITS
-    Flags:   [ SHF_ALLOC ]
-    Content: fffffefffffffdfffffffffffffffcffffffffffffff
-  - Name:    .rel.abs
-    Type:    SHT_REL
-    Link:    .symtab
-    Info:    .abs
-    Relocations:
-      - {Offset: 0, Symbol: abs, Type: R_AARCH64_ABS16}
-      - {Offset: 2, Symbol: abs, Type: R_AARCH64_ABS32}
-      - {Offset: 6, Symbol: abs, Type: R_AARCH64_ABS64}
-      - {Offset: 14, Symbol: abs, Type: R_AARCH64_ADD_ABS_LO12_NC}
-
-  - Name:    .uabs
-    Type:    SHT_PROGBITS
-    Flags:   [ SHF_ALLOC ]
-    AddressAlign: 4
-    Content: 00ffffff00ffffff00ffffff00ffffff00ffffff00ffffff
-  - Name:    .rel.uabs
-    Type:    SHT_REL
-    Link:    .symtab
-    Info:    .uabs
-    Relocations:
-      - {Offset:  0, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G0}
-      - {Offset:  4, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G0_NC}
-      - {Offset:  8, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G1}
-      - {Offset: 12, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G1_NC}
-      - {Offset: 16, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G2}
-      - {Offset: 20, Symbol: abs, Type: R_AARCH64_MOVW_UABS_G2_NC}
-
-  - Name:    .prel
-    Type:    SHT_PROGBITS
-    Flags:   [ SHF_ALLOC ]
-    AddressAlign: 4
-    Content: 00ffffff00ffffff00ffffff00ffffff00ffffff00ffffff
-  - Name:    .rel.prel
-    Type:    SHT_REL
-    Link:    .symtab
-    Info:    .prel
-    Relocations:
-      - {Offset:  0, Symbol: .prel, Type: R_AARCH64_PREL64}
-      - {Offset:  4, Symbol: .prel, Type: R_AARCH64_PREL32}
-      - {Offset:  8, Symbol: .prel, Type: R_AARCH64_PREL16}
-      - {Offset: 12, Symbol: .prel, Type: R_AARCH64_LD_PREL_LO19}
-      - {Offset: 16, Symbol: .prel, Type: R_AARCH64_ADR_PREL_PG_HI21}
-      - {Offset: 20, Symbol: .prel, Type: R_AARCH64_ADR_PREL_PG_HI21_NC}
-
-  - Name:    .branch
-    Type:    SHT_PROGBITS
-    Flags:   [ SHF_ALLOC ]
-    AddressAlign: 4
-    Content: f0fffffff0fffffff0fffffff0ffffff
-  - Name:    .rel.branch
-    Type:    SHT_REL
-    Link:    .symtab
-    Info:    .branch
-    Relocations:
-      - {Offset:  0, Symbol: .branch, Type: R_AARCH64_TSTBR14}
-      - {Offset:  4, Symbol: .branch, Type: R_AARCH64_CONDBR19}
-      - {Offset:  8, Symbol: .branch, Type: R_AARCH64_CALL26}
-      - {Offset: 12, Symbol: .branch, Type: R_AARCH64_JUMP26}
-
-Symbols:
-  - Name:    .branch
-    Section: .branch
-  - Name:    .prel
-    Section: .prel
-  - Name:    abs
-    Index:   SHN_ABS
-    Value:   42
-    Binding: STB_GLOBAL
+REQUIRES: aarch64
+
+## Test handling of addends taken from the relocated word or instruction
+## in AArch64 relocation sections of type SHT_REL. These can be generated
+## by assemblers other than LLVM, in particular the legacy 'armasm'.
+##
+## llvm-mc will only generate SHT_RELA when targeting AArch64. So to make
+## an input file with SHT_REL, we assemble our test source file, then
+## round-trip via YAML and do some seddery to change the type of the
+## relocation section. Since all the relocations were made manually with
+## .reloc directives containing no addend, this succeeds.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64 relocs.s -o rela.o
+# RUN: obj2yaml rela.o -o rela.yaml
+# RUN: sed "s/\.rela/\.rel/;s/SHT_RELA/SHT_REL/" rela.yaml > rel.yaml
+# RUN: yaml2obj rel.yaml -o rel.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64 symbols.s -o symbols.o
+# RUN: ld.lld rel.o symbols.o -o a.out --section-start=.data=0x100000 --section-start=.text=0x200000
+# RUN: llvm-objdump -s a.out | FileCheck %s --check-prefix=DATA
+# RUN: llvm-objdump -d a.out | FileCheck %s --check-prefix=CODE
+
+#--- symbols.s
+
+// Source file containing the values of target symbols for the relocations. If
+// we don't keep these in their own file, then llvm-mc is clever enough to
+// resolve some of the relocations during assembly, even though they're written
+// as explicit .reloc directives. But we want the relocations to be present in
+// the object file, so that yaml2obj can change their type and we can test
+// lld's handling of the result. So we ensure that llvm-mc can't see both the
+// .reloc and the target symbol value at the same time.
+
+.globl abs16
+.globl abs32
+.globl abs64
+.globl big64
+.globl pcrel
+.globl data
+.globl branchtarget
+.globl calltarget
+
+.equ abs16, 0x9999
+.equ data, 0x100000
+.equ branchtarget, 0x200100
+.equ calltarget, 0x02000100
+.equ pcrel, 0x245678
+.equ abs32, 0x88888888
+.equ abs64, 0x7777777777777777
+.equ big64, 0x77ffffffffffff77
+
+#--- relocs.s
+
+// Source file containing the test instructions and their relocations, with the
+// FileCheck comments interleaved.
+
+// DATA: Contents of section .data:
+.data
+
+// First test absolute data relocations. For each one I show the expected
+// value in a comment, and then expect a line in llvm-objdump -s containing
+// all the values together.
+
+        // 0x7777777777777777 + 0x1234567887654321 = 0x89abcdeffedcba98
+        .reloc ., R_AARCH64_ABS64, abs64
+        .xword 0x1234567887654321
+
+        // 0x88888888 + 0x12344321 = 0x9abccba9
+        .reloc ., R_AARCH64_ABS32, abs32
+        .word 0x12344321
+
+        // 0x9999 + 0x1234 = 0xabcd
+        .reloc ., R_AARCH64_ABS16, abs16
+        .hword 0x1234
+
+        // DATA-NEXT:  100000 98badcfe efcdab89 a9cbbc9a cdab
+
+        .balign 16
+
+// Test relative data relocs, each subtracting the address of the relocated
+// word.
+
+        // 0x100000 + 0x1234567887654321 - 0x100010 = 0x1234567887654311
+        .reloc ., R_AARCH64_PREL64, data
+        .xword 0x1234567887654321
+
+        // 0x100000 + 0x12344321 - 0x100018 = 0x12344309
+        .reloc ., R_AARCH64_PREL32, data
+        .word 0x12344321
+
+        // 0x100000 + 0x1234 - 0x10001c = 0x1218
+        .reloc ., R_AARCH64_PREL16, data
+        .hword 0x1234
+
+        // DATA-NEXT:  100010 11436587 78563412 09433412 1812
+
+// CODE: 0000000000200000 <_start>:
+.text
+.globl _start
+_start:
+
+// Full set of 4 instructions loading the constant 'abs64' and adding 0x1234 to
+// it.
+
+// Expected constant is 0x7777777777777777 + 0x1234 = 0x77777777777789ab
+
+        .reloc ., R_AARCH64_MOVW_UABS_G0_NC, abs64
+        movz x0, #0x1234
+        // CODE-NEXT:  200000: d2913560      mov     x0, #0x89ab
+        .reloc ., R_AARCH64_MOVW_UABS_G1_NC, abs64
+        movk x0, #0x1234, lsl #16
+        // CODE-NEXT:  200004: f2aeeee0      movk    x0, #0x7777, lsl #16
+        .reloc ., R_AARCH64_MOVW_UABS_G2_NC, abs64
+        movk x0, #0x1234, lsl #32
+        // CODE-NEXT:  200008: f2ceeee0      movk    x0, #0x7777, lsl #32
+        .reloc ., R_AARCH64_MOVW_UABS_G3,    abs64
+        movk x0, #0x1234, lsl #48
+        // CODE-NEXT:  20000c: f2eeeee0      movk    x0, #0x7777, lsl #48
+
+// The same, but this constant has ffff in the middle 32 bits, forcing carries
+// to be propagated.
+
+// Expected constant: 0x77ffffffffffff77 + 0x1234 = 0x78000000000011ab
+
+        .reloc ., R_AARCH64_MOVW_UABS_G0_NC, big64
+        movz x0, #0x1234
+        // CODE-NEXT:  200010: d2823560      mov     x0, #0x11ab            
+        .reloc ., R_AARCH64_MOVW_UABS_G1_NC, big64
+        movk x0, #0x1234, lsl #16
+        // CODE-NEXT:  200014: f2a00000      movk    x0, #0x0, lsl #16
+        .reloc ., R_AARCH64_MOVW_UABS_G2_NC, big64
+        movk x0, #0x1234, lsl #32
+        // CODE-NEXT:  200018: f2c00000      movk    x0, #0x0, lsl #32
+        .reloc ., R_AARCH64_MOVW_UABS_G3,    big64
+        movk x0, #0x1234, lsl #48
+        // CODE-NEXT:  20001c: f2ef0000      movk    x0, #0x7800, lsl #48
+
+// Demonstrate that offsets are treated as signed: this one is taken to be
+// -0x1234. (If it were +0xedcc then you'd be able to tell the difference by
+// the carry into the second halfword.)
+
+// Expected value: 0x7777777777777777 - 0x1234 = 0x7777777777776543
+
+        .reloc ., R_AARCH64_MOVW_UABS_G0_NC, abs64
+        movz x0, #0xedcc
+        // CODE-NEXT:  200020: d28ca860      mov     x0, #0x6543
+        .reloc ., R_AARCH64_MOVW_UABS_G1_NC, abs64
+        movk x0, #0xedcc, lsl #16
+        // CODE-NEXT:  200024: f2aeeee0      movk    x0, #0x7777, lsl #16
+        .reloc ., R_AARCH64_MOVW_UABS_G2_NC, abs64
+        movk x0, #0xedcc, lsl #32
+        // CODE-NEXT:  200028: f2ceeee0      movk    x0, #0x7777, lsl #32
+        .reloc ., R_AARCH64_MOVW_UABS_G3,    abs64
+        movk x0, #0xedcc, lsl #48
+        // CODE-NEXT:  20002c: f2eeeee0      movk    x0, #0x7777, lsl #48
+
+// Check various bits of the ADR immediate, including in particular the low 2
+// bits, which are not contiguous with the rest in the encoding.
+//
+// These values are all 0x245678 + 2^n, except the last one, where the set bit
+// of the addend is the top bit, counting as negative, i.e. we expect the value
+// 0x254678 - 0x100000 = 0x145678.
+
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .+1
+        // CODE-NEXT:  200030: 3022b240      adr     x0, 0x245679
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .+2
+        // CODE-NEXT:  200034: 5022b220      adr     x0, 0x24567a
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .+4
+        // CODE-NEXT:  200038: 1022b220      adr     x0, 0x24567c
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .+8
+        // CODE-NEXT:  20003c: 1022b220      adr     x0, 0x245680
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .+1<<19
+        // CODE-NEXT:  200040: 1062b1c0      adr     x0, 0x2c5678
+        .reloc ., R_AARCH64_ADR_PREL_LO21, pcrel
+        adr x0, .-1<<20
+        // CODE-NEXT:  200044: 10a2b1a0      adr     x0, 0x145678
+
+// Now load the same set of values with ADRP+ADD. But because the real ADRP
+// instruction shifts its immediate, we must account for that.
+
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, 1<<12
+        // CODE-NEXT:  200048: b0000220      adrp    x0, 0x245000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #1
+        // CODE-NEXT:  20004c: 9119e400      add     x0, x0, #0x679
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, 2<<12
+        // CODE-NEXT:  200050: b0000220      adrp    x0, 0x245000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #2
+        // CODE-NEXT:  200054: 9119e800      add     x0, x0, #0x67a
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, 4<<12
+        // CODE-NEXT:  200058: b0000220      adrp    x0, 0x245000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #4
+        // CODE-NEXT:  20005c: 9119f000      add     x0, x0, #0x67c
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, 8<<12
+        // CODE-NEXT:  200060: b0000220      adrp    x0, 0x245000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #8
+        // CODE-NEXT:  200064: 911a0000      add     x0, x0, #0x680
+
+        // Starting here, the high bits won't fit in the ADD immediate, so that
+        // becomes 0, and only the ADRP immediate shows evidence of the addend.
+
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, 1<<(19+12)
+        // CODE-NEXT:  200068: b0000620      adrp    x0, 0x2c5000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #0
+        // CODE-NEXT:  20006c: 9119e000      add     x0, x0, #0x678
+
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, -1<<(20+12)
+        // CODE-NEXT:  200070: b0fffa20      adrp    x0, 0x145000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #0
+        // CODE-NEXT:  200074: 9119e000      add     x0, x0, #0x678
+
+        // Finally, an example with a full 21-bit addend.
+        // Expected value = 0x245678 + 0xfedcb - 0x100000 = 0x244443
+        .reloc ., R_AARCH64_ADR_PREL_PG_HI21, pcrel
+        adrp x0, (0xfedcb-0x100000)<<12
+        // CODE-NEXT:  200078: 90000220      adrp    x0, 0x244000
+        .reloc ., R_AARCH64_ADD_ABS_LO12_NC,  pcrel
+        add x0, x0, #0xdcb
+        // CODE-NEXT:  20007c: 91110c00      add     x0, x0, #0x443
+
+// PC-relative loads, in which the 19-bit offset is shifted. The offsets are
+// the same as the ADRs above, except for the first two, which can't be
+// expressed by pc-relative LDR with an offset shifted left 2.
+//
+// (The input syntax is confusing here. I'd normally expect to write this as
+// `ldr x0, [pc, #offset]`, but LLVM writes just `#offset`.)
+
+        .reloc ., R_AARCH64_LD_PREL_LO19,     pcrel
+        ldr w0, #4
+        // CODE-NEXT:  200080: 1822afe0      ldr     w0, 0x24567c
+        .reloc ., R_AARCH64_LD_PREL_LO19,     pcrel
+        ldr w0, #8
+        // CODE-NEXT:  200084: 1822afe0      ldr     w0, 0x245680
+        .reloc ., R_AARCH64_LD_PREL_LO19,     pcrel
+        ldr w0, #1<<19
+        // CODE-NEXT:  200088: 1862af80      ldr     w0, 0x2c5678
+        .reloc ., R_AARCH64_LD_PREL_LO19,     pcrel
+        ldr w0, #-1<<20
+        // CODE-NEXT:  20008c: 18a2af60      ldr     w0, 0x145678
+
+
+// For these, the branch target is 0x200100 plus powers of 2, except the offset
+// 2^15, which is negative, because the addend is treated as signed.
+
+        .reloc ., R_AARCH64_TSTBR14, branchtarget
+        tbnz x1, #63, #4
+        // CODE-NEXT:  200090: b7f803a1      tbnz    x1, #0x3f, 0x200104
+        .reloc ., R_AARCH64_TSTBR14, branchtarget
+        tbnz x1, #62, #8
+        // CODE-NEXT:  200094: b7f003a1      tbnz    x1, #0x3e, 0x200108
+        .reloc ., R_AARCH64_TSTBR14, branchtarget
+        tbnz x1, #61, #1<<14
+        // CODE-NEXT:  200098: b7ea0341      tbnz    x1, #0x3d, 0x204100
+        .reloc ., R_AARCH64_TSTBR14, branchtarget
+        tbnz x1, #60, #-1<<15
+        // CODE-NEXT:  20009c: b7e40321      tbnz    x1, #0x3c, 0x1f8100
+
+// CONDBR19 is used for both cbz/cbnz and B.cond, so test both at once. Base
+// offset is the same again (from 0x200100), but this time, offsets can go up
+// to 2^20.
+
+        .reloc ., R_AARCH64_CONDBR19, branchtarget
+        cbnz x2, #4
+        // CODE-NEXT:  2000a0: b5000322      cbnz    x2, 0x200104
+        .reloc ., R_AARCH64_CONDBR19, branchtarget
+        b.eq #8
+        // CODE-NEXT:  2000a4: 54000320      b.eq    0x200108
+        .reloc ., R_AARCH64_CONDBR19, branchtarget
+        cbz x2, #1<<19
+        // CODE-NEXT:  2000a8: b44002c2      cbz     x2, 0x280100
+        .reloc ., R_AARCH64_CONDBR19, branchtarget
+        b.vs #-1<<20
+        // CODE-NEXT:  2000ac: 548002a6      b.vs    0x100100
+
+// And for BL and B, the offsets go up to 2^25.
+
+        .reloc ., R_AARCH64_CALL26, calltarget
+        bl #4
+        // CODE-NEXT:  2000b0: 94780015      bl      0x2000104
+        .reloc ., R_AARCH64_CALL26, calltarget
+        bl #8
+        // CODE-NEXT:  2000b4: 94780015      bl      0x2000108
+        .reloc ., R_AARCH64_CALL26, calltarget
+        bl #1<<24
+        // CODE-NEXT:  2000b8: 94b80012      bl      0x3000100
+        .reloc ., R_AARCH64_CALL26, calltarget
+        bl #-1<<25
+        // CODE-NEXT:  2000bc: 97f80011      bl      0x100
+
+        .reloc ., R_AARCH64_JUMP26, calltarget
+        b #4
+        // CODE-NEXT:  2000c0: 14780011      b       0x2000104
+        .reloc ., R_AARCH64_JUMP26, calltarget
+        b #8
+        // CODE-NEXT:  2000c4: 14780011      b       0x2000108
+        .reloc ., R_AARCH64_JUMP26, calltarget
+        b #1<<24
+        // CODE-NEXT:  2000c8: 14b8000e      b       0x3000100
+        .reloc ., R_AARCH64_JUMP26, calltarget
+        b #-1<<25
+        // CODE-NEXT:  2000cc: 17f8000d      b       0x100
diff --git a/lld/test/ELF/aarch64-reloc-pauth.s b/lld/test/ELF/aarch64-reloc-pauth.s
index 19493fcfa1365..6b4d23d37babc 100644
--- a/lld/test/ELF/aarch64-reloc-pauth.s
+++ b/lld/test/ELF/aarch64-reloc-pauth.s
@@ -68,11 +68,11 @@
 # RELR-EMPTY:
 # RELR-NEXT: Relocation section '.relr.auth.dyn' at offset 0x[[ADDR2]] contains 5 entries:
 # RELR-NEXT: Index: Entry Address Symbolic Address
-# RELR-NEXT: 0000: 0000000000030440 0000000000030440 $d.0
-# RELR-NEXT: 0001: 000000000000000f 0000000000030448 $d.0 + 0x8
-# RELR-NEXT:  0000000000030450 $d.0 + 0x10
-# RELR-NEXT:  0000000000030458 $d.0 + 0x18
-# RELR-NEXT: 0002: 0000000000030492 0000000000030492 $d.0 + 0x52
+# RELR-NEXT: 0000: 0000000000030440 0000000000030440 $d
+# RELR-NEXT: 0001: 000000000000000f 0000000000030448 $d + 0x8
+# RELR-NEXT:  0000000000030450 $d + 0x10
+# RELR-NEXT:  0000000000030458 $d + 0x18
+# RELR-NEXT: 0002: 0000000000030492 0000000000030492 $d + 0x52
 
 # HEX:      Hex dump of section '.test':
 # HEX-NEXT: 0x00030440 01000000 2a000020 42040300 2b000000
@@ -201,7 +201,7 @@
 # EMPTY-RELA-EMPTY:
 # EMPTY-RELA-NEXT: Relocation section '.relr.auth.dyn' at offset {{.+}} contains 1 entries:
 # EMPTY-RELA-NEXT: Index: Entry Address Symbolic Address
-# EMPTY-RELA-NEXT: 0000: 0000000000030310 0000000000030310 $d.0
+# EMPTY-RELA-NEXT: 0000: 0000000000030310 0000000000030310 $d
 
 # EMPTY-RELA-RO-NOT: .rela.dyn
 
diff --git a/lld/test/ELF/aarch64-thunk-script.s b/lld/test/ELF/aarch64-thunk-script.s
index 08ff4e9871180..a6c524f548288 100644
--- a/lld/test/ELF/aarch64-thunk-script.s
+++ b/lld/test/ELF/aarch64-thunk-script.s
@@ -43,8 +43,8 @@ high_target:
 // CHECK-NEXT:                 ret
 
 /// Local symbols copied from %t.o
-// NM:      t $x.0
-// NM-NEXT: t $x.1
+// NM:      t $x
+// NM-NEXT: t $x
 /// Local thunk symbols.
 // NM-NEXT: t __AArch64AbsLongThunk_high_target
 // NM-NEXT: t $x
diff --git a/lld/test/ELF/arm-cmse-implib.s b/lld/test/ELF/arm-cmse-implib.s
index 581bff9dd8536..60a68b0226c3d 100644
--- a/lld/test/ELF/arm-cmse-implib.s
+++ b/lld/test/ELF/arm-cmse-implib.s
@@ -53,8 +53,8 @@ secure_entry:
 // CHECK1-NEXT:    Num:    Value  Size Type    Bind   Vis       Ndx Name
 // CHECK1-NEXT:      0: 00000000     0 NOTYPE  LOCAL  DEFAULT   UND
 // CHECK1-NEXT:      1: 00020000     0 NOTYPE  LOCAL  DEFAULT     2 $t
-// CHECK1-NEXT:      2: 00008000     0 NOTYPE  LOCAL  DEFAULT     1 $t.0
-// CHECK1-NEXT:      3: 00008004     0 NOTYPE  LOCAL  DEFAULT     1 $t.0
+// CHECK1-NEXT:      2: 00008000     0 NOTYPE  LOCAL  DEFAULT     1 $t
+// CHECK1-NEXT:      3: 00008004     0 NOTYPE  LOCAL  DEFAULT     1 $t
 // CHECK1-NEXT:      4: 00008001     2 FUNC    GLOBAL DEFAULT     1 secure_entry
 // CHECK1-NEXT:      5: 00020001     8 FUNC    GLOBAL DEFAULT     2 foo
 // CHECK1-NEXT:      6: 00008005     2 FUNC    GLOBAL DEFAULT     1 __acle_se_foo
@@ -82,8 +82,8 @@ secure_entry:
 // CHECK2-NEXT:    Num:    Value  Size Type    Bind   Vis       Ndx Name
 // CHECK2-NEXT:      0: 00000000     0 NOTYPE  LOCAL  DEFAULT   UND
 // CHECK2-NEXT:      1: 00020000     0 NOTYPE  LOCAL  DEFAULT     2 $t
-// CHECK2-NEXT:      2: 00008000     0 NOTYPE  LOCAL  DEFAULT     1 $t.0
-// CHECK2-NEXT:      3: 00008004     0 NOTYPE  LOCAL  DEFAULT     1 $t.0
+// CHECK2-NEXT:      2: 00008000     0 NOTYPE  LOCAL  DEFAULT     1 $t
+// CHECK2-NEXT:      3: 00008004     0 NOTYPE  LOCAL  DEFAULT     1 $t
 // CHECK2-NEXT:      4: 00008001     2 FUNC    GLOBAL DEFAULT     1 secure_entry
 // CHECK2-NEXT:      5: 00020011     8 FUNC    WEAK   DEFAULT     2 baz
 // CHECK2-NEXT:      6: 00008005     2 FUNC    GLOBAL DEFAULT     1 __acle_se_baz
diff --git a/lld/test/ELF/basic-aarch64.s b/lld/test/ELF/basic-aarch64.s
index 6b109e8da2c3e..1f59d33c4230b 100644
--- a/lld/test/ELF/basic-aarch64.s
+++ b/lld/test/ELF/basic-aarch64.s
@@ -119,7 +119,7 @@ _start:
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
 # CHECK-NEXT:     Offset: 0x1AA
-# CHECK-NEXT:     Size: 13
+# CHECK-NEXT:     Size: 11
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
 # CHECK-NEXT:     AddressAlignment: 1
@@ -189,7 +189,7 @@ _start:
 # CHECK-NEXT:     Section: Undefined (0x0)
 # CHECK-NEXT:   }
 # CHECK-NEXT:   Symbol {
-# CHECK-NEXT:     Name: $x.0
+# CHECK-NEXT:     Name: $x
 # CHECK-NEXT:     Value: 0x210120
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Local (0x0)
diff --git a/lld/test/ELF/crel-rel-mixed.s b/lld/test/ELF/crel-rel-mixed.s
new file mode 100644
index 0000000000000..a69fa1c09b436
--- /dev/null
+++ b/lld/test/ELF/crel-rel-mixed.s
@@ -0,0 +1,22 @@
+# REQUIRES: arm
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=armv7a -crel a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=armv7a b.s -o b.o
+# RUN: not ld.lld -r a.o b.o 2>&1 | FileCheck %s --check-prefix=ERR
+
+# ERR: error: b.o:(.rel.text): REL cannot be converted to CREL
+
+#--- a.s
+.global _start, foo
+_start:
+  bl foo
+  bl .text.foo
+
+.section .text.foo,"ax"
+foo:
+  nop
+
+#--- b.s
+.globl fb
+fb:
+  bl fb
diff --git a/lld/test/ELF/crel.s b/lld/test/ELF/crel.s
new file mode 100644
index 0000000000000..d7c87be9a5402
--- /dev/null
+++ b/lld/test/ELF/crel.s
@@ -0,0 +1,90 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -crel a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -crel b.s -o b.o
+# RUN: ld.lld -pie a.o b.o -o out
+# RUN: llvm-objdump -d out | FileCheck %s
+# RUN: llvm-readelf -Srs out | FileCheck %s --check-prefix=RELOC
+
+# CHECK:       <_start>:
+# CHECK-NEXT:    callq {{.*}} <foo>
+# CHECK-NEXT:    callq {{.*}} <foo>
+# CHECK-EMPTY:
+# CHECK-NEXT:  <foo>:
+# CHECK-NEXT:    leaq {{.*}}  # 0x27c
+# CHECK-NEXT:    leaq {{.*}}  # 0x278
+
+# RELOC:  .data             PROGBITS        {{0*}}[[#%x,DATA:]]
+
+# RELOC:  {{0*}}[[#DATA+8]]  0000000000000008 R_X86_64_RELATIVE [[#%x,DATA+0x8000000000000000]]
+
+# RUN: ld.lld -pie --emit-relocs a.o b.o -o out1
+# RUN: llvm-objdump -dr out1 | FileCheck %s --check-prefix=CHECKE
+# RUN: llvm-readelf -Sr out1 | FileCheck %s --check-prefix=RELOCE
+
+# CHECKE:       <_start>:
+# CHECKE-NEXT:    callq {{.*}} <foo>
+# CHECKE-NEXT:      R_X86_64_PLT32 foo-0x4
+# CHECKE-NEXT:    callq {{.*}} <foo>
+# CHECKE-NEXT:      R_X86_64_PLT32 .text+0x6
+# CHECKE-EMPTY:
+# CHECKE-NEXT:  <foo>:
+# CHECKE-NEXT:    leaq {{.*}}
+# CHECKE-NEXT:      R_X86_64_PC32 .L.str-0x4
+# CHECKE-NEXT:    leaq {{.*}}
+# CHECKE-NEXT:      R_X86_64_PC32 .L.str1-0x4
+
+# RELOCE:      .rodata             PROGBITS        {{0*}}[[#%x,RO:]]
+# RELOCE:      .eh_frame           PROGBITS        {{0*}}[[#%x,EHFRAME:]]
+# RELOCE:      .data               PROGBITS        {{0*}}[[#%x,DATA:]]
+
+# RELOCE:      Relocation section '.crel.data' at offset {{.*}} contains 2 entries:
+# RELOCE-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+# RELOCE-NEXT: {{0*}}[[#DATA+8]] {{.*}}           R_X86_64_64            {{.*}}           .data - 8000000000000000
+# RELOCE-NEXT: {{0*}}[[#DATA+24]]{{.*}}           R_X86_64_64            {{.*}}           .data - 1
+# RELOCE:      Relocation section '.crel.eh_frame' at offset {{.*}} contains 2 entries:
+# RELOCE-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+# RELOCE-NEXT: {{0*}}[[#EHFRAME+32]] {{.*}}       R_X86_64_PC32          {{.*}}           .text + 0
+# RELOCE-NEXT: {{0*}}[[#EHFRAME+52]] {{.*}}       R_X86_64_PC32          {{.*}}           .text + a
+# RELOCE:      Relocation section '.crel.rodata' at offset {{.*}} contains 4 entries:
+# RELOCE-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+# RELOCE-NEXT: {{0*}}[[#RO+8]]   {{.*}}           R_X86_64_PC32          {{.*}}           foo + 0
+# RELOCE-NEXT: {{0*}}[[#RO+23]]  {{.*}}           R_X86_64_PC32          {{.*}}           foo + 3f
+# RELOCE-NEXT: {{0*}}[[#RO+39]]  {{.*}}           R_X86_64_PC64          {{.*}}           foo + 7f
+# RELOCE-NEXT: {{0*}}[[#RO+47]]  {{.*}}           R_X86_64_PC32          {{.*}}           _start - 1f81
+
+#--- a.s
+.global _start, foo
+_start:
+  .cfi_startproc # Test .eh_frame
+  call foo
+  call .text.foo
+  .cfi_endproc
+
+.section .text.foo,"ax"
+foo:
+  .cfi_startproc
+  leaq .L.str(%rip), %rsi
+  leaq .L.str1(%rip), %rsi
+  .cfi_endproc
+
+.section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+  .asciz  "abc"
+.L.str1:
+  .asciz  "def"
+
+.data
+.quad 0
+.quad .data - 0x8000000000000000
+.quad 0
+.quad .data - 1
+
+#--- b.s
+.section .rodata,"a"
+.long foo - .
+.space 15-4
+.long foo - . + 63  # offset+=15
+.space 16-4
+.quad foo - . + 127  # offset+=16
+.long _start - . - 8065
diff --git a/lld/test/ELF/debug-names.s b/lld/test/ELF/debug-names.s
index 888dd9007ed12..1bbb07b065e33 100644
--- a/lld/test/ELF/debug-names.s
+++ b/lld/test/ELF/debug-names.s
@@ -10,7 +10,7 @@
 
 # REQUIRES: x86
 # RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 --crel a.s -o a.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o
 
 # RUN: ld.lld --debug-names --no-debug-names a.o b.o -o out0
diff --git a/lld/test/ELF/defsym.s b/lld/test/ELF/defsym.s
index 0168ce854cc15..eb409cccfc033 100644
--- a/lld/test/ELF/defsym.s
+++ b/lld/test/ELF/defsym.s
@@ -5,15 +5,16 @@
 # RUN: llvm-objdump -d --print-imm-hex %t | FileCheck %s --check-prefix=USE
 
 ## Check that we accept --defsym foo2=foo1 form.
-# RUN: ld.lld -o %t2 %t.o --defsym foo2=foo1
+# RUN: ld.lld -o %t2 %t.o --defsym '"foo2"=foo1'
 # RUN: llvm-readelf -s %t2 | FileCheck %s
 # RUN: llvm-objdump -d --print-imm-hex %t2 | FileCheck %s --check-prefix=USE
 
 ## Check we are reporting the error correctly and don't crash
 ## when handling the second --defsym.
-# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ \
-#        --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR
-# ERR: error: --defsym: syntax error: ERR+
+# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR --strict-whitespace
+#      ERR:error: --defsym:1: = expected, but got +
+# ERR-NEXT:>>> ERR+
+# ERR-NEXT:>>>    ^
 
 # CHECK-DAG: 0000000000000123     0 NOTYPE  GLOBAL DEFAULT   ABS foo1
 # CHECK-DAG: 0000000000000123     0 NOTYPE  GLOBAL DEFAULT   ABS foo2
@@ -27,7 +28,7 @@
 # RUN: ld.lld -o %t %t.o --defsym=foo2=1
 # RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=ABS
 
-# ABS: 0000000000000123     0 NOTYPE  GLOBAL DEFAULT   ABS foo2
+# ABS: 0000000000000001     0 NOTYPE  GLOBAL DEFAULT   ABS foo2
 
 # RUN: ld.lld -o %t %t.o --defsym=foo2=foo1+5
 # RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=EXPR
@@ -42,10 +43,9 @@
 # ERR2: error: --defsym:1: EOF expected, but got ,
 
 # RUN: not ld.lld -o /dev/null %t.o --defsym=foo 2>&1 | FileCheck %s -check-prefix=ERR3
-# ERR3: error: --defsym: syntax error: foo
+# ERR3: error: --defsym:1: unexpected EOF
 
-# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR4
-# ERR4: error: --defsym: syntax error:
+# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR3
 
 .globl foo1
  foo1 = 0x123
diff --git a/lld/test/ELF/gc-sections-with-provide.s b/lld/test/ELF/gc-sections-with-provide.s
index 3e5b1b1efe6ca..268faa4354687 100644
--- a/lld/test/ELF/gc-sections-with-provide.s
+++ b/lld/test/ELF/gc-sections-with-provide.s
@@ -53,8 +53,8 @@ baz:
 
 
 #--- script.t
-PROVIDE(unused = bar + used);
-PROVIDE(used = another_used);
+PROVIDE(unused = bar + "used");
+PROVIDE("used" = another_used);
 PROVIDE(baz_ref = baz);
 PROVIDE(another_used = baz_ref);
 PROVIDE(another_unused = unused + bar + 0x1);
diff --git a/lld/test/ELF/gc-sections.s b/lld/test/ELF/gc-sections.s
index 94adc8210b4bc..31e00d495146a 100644
--- a/lld/test/ELF/gc-sections.s
+++ b/lld/test/ELF/gc-sections.s
@@ -8,6 +8,10 @@
 # RUN: ld.lld --export-dynamic --gc-sections %t -o %t2
 # RUN: llvm-readobj --sections --symbols %t2 | FileCheck -check-prefix=GC2 %s
 
+# RUN: llvm-mc -filetype=obj -triple=x86_64 --crel %s -o %t.o
+# RUN: ld.lld --gc-sections --print-gc-sections %t.o -o %t2 | FileCheck --check-prefix=GC1-DISCARD %s
+# RUN: llvm-readobj --sections --symbols %t2 | FileCheck -check-prefix=GC1 %s
+
 # NOGC: Name: .eh_frame
 # NOGC: Name: .text
 # NOGC: Name: .init
diff --git a/lld/test/ELF/icf1.s b/lld/test/ELF/icf1.s
index 5c6e667d53c78..9682b06f4606f 100644
--- a/lld/test/ELF/icf1.s
+++ b/lld/test/ELF/icf1.s
@@ -3,6 +3,9 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
 # RUN: ld.lld %t -o /dev/null --icf=all --print-icf-sections | FileCheck %s
 
+# RUN: llvm-mc -filetype=obj -triple=x86_64 --crel %s -o %t
+# RUN: ld.lld %t -o /dev/null --icf=all --print-icf-sections | FileCheck %s
+
 # CHECK: selected section {{.*}}:(.text.f1)
 # CHECK:   removing identical section {{.*}}:(.text.f2)
 
diff --git a/lld/test/ELF/icf4.s b/lld/test/ELF/icf4.s
index ff13a7ebff3da..310577a55c0d8 100644
--- a/lld/test/ELF/icf4.s
+++ b/lld/test/ELF/icf4.s
@@ -1,6 +1,6 @@
 # REQUIRES: x86
 
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
+# RUN: llvm-mc --crel -filetype=obj -triple=x86_64-unknown-linux %s -o %t
 # RUN: ld.lld %t -o /dev/null --icf=all --print-icf-sections | count 0
 
 .globl _start, f1, f2
diff --git a/lld/test/ELF/linkerscript/at2.test b/lld/test/ELF/linkerscript/at2.test
index d744fce911b48..24c49350dcfb3 100644
--- a/lld/test/ELF/linkerscript/at2.test
+++ b/lld/test/ELF/linkerscript/at2.test
@@ -19,7 +19,7 @@ SECTIONS {
  .foo2 : { *(.foo2) } > AX
 
  .bar1 : { *(.bar1) } > AW
- .bar2 : { *(.bar2) } > AW AT > RAM
+ .bar2 : { *(.bar2) } > "AW" AT > "RAM"
  .bar3 . : { *(.bar3) } > AW
  .bar4 : { *(.bar4) } > AW AT >RAM
 }
diff --git a/lld/test/ELF/linkerscript/at3.test b/lld/test/ELF/linkerscript/at3.test
index 1e7f970c63ef3..4d6d753f6ab7d 100644
--- a/lld/test/ELF/linkerscript/at3.test
+++ b/lld/test/ELF/linkerscript/at3.test
@@ -13,7 +13,7 @@ MEMORY {
 SECTIONS {
  .foo1 : { *(.foo1) }            > FOO AT>FLASH
  .foo2 : { *(.foo2) BYTE(0x42) } > BAR AT>FLASH
- .foo3 : { *(.foo3) }            > ZED AT>FLASH
+ .foo3 : { *(.foo3) }            >"ZED" AT>"FLASH"
 }
 
 # CHECK: .foo1             PROGBITS        0000000000001000 001000
diff --git a/lld/test/ELF/linkerscript/custom-section-type.s b/lld/test/ELF/linkerscript/custom-section-type.s
index 8ca0a4db325bd..2add3a52f8117 100644
--- a/lld/test/ELF/linkerscript/custom-section-type.s
+++ b/lld/test/ELF/linkerscript/custom-section-type.s
@@ -67,7 +67,7 @@ SECTIONS {
   nobits ( TYPE=SHT_NOBITS) : { BYTE(8) }
   init_array (TYPE=SHT_INIT_ARRAY ) : { QUAD(myinit) }
   fini_array (TYPE=SHT_FINI_ARRAY) : { QUAD(15) }
-  preinit_array (TYPE=SHT_PREINIT_ARRAY) : { QUAD(16) }
+  preinit_array . (TYPE=SHT_PREINIT_ARRAY) : { QUAD(16) }
   group (TYPE=17) : { LONG(17) }
   expr (TYPE=0x41+1) : { BYTE(0x42) *(expr) }
 }
diff --git a/lld/test/ELF/linkerscript/diag.test b/lld/test/ELF/linkerscript/diag.test
new file mode 100644
index 0000000000000..fbc24659a5311
--- /dev/null
+++ b/lld/test/ELF/linkerscript/diag.test
@@ -0,0 +1,49 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o 0.o
+
+#--- 1.lds
+SECTIONS {
+  .text + { *(.text) }
+  .keep : { *(.keep) } /*
+  comment line 1
+  comment line 2 */
+  .temp : { *(.temp) }
+}
+
+# RUN: not ld.lld -shared 0.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+#      CHECK1:{{.*}}:2: malformed number: +
+# CHECK1-NEXT:>>>   .text + { *(.text) }
+# CHECK1-NEXT:>>>         ^
+
+#--- 2.lds
+
+UNKNOWN_TAG {
+  .text : { *(.text) }
+  .keep : { *(.keep) }
+  .temp : { *(.temp) }
+}
+
+# RUN: not ld.lld -shared 0.o -T 2.lds 2>&1 | FileCheck %s --check-prefix=CHECK2 --match-full-lines --strict-whitespace
+#      CHECK2:{{.*}}:2: unknown directive: UNKNOWN_TAG
+# CHECK2-NEXT:>>> UNKNOWN_TAG {
+# CHECK2-NEXT:>>> ^
+
+#--- 3.lds
+SECTIONS {
+  .text : { *(.text) }
+  .keep : { *(.keep) }
+  boom ^temp : { *(.temp) }
+}
+#--- 3a.lds
+INCLUDE "3.lds"
+#--- 3b.lds
+foo = 3;
+INCLUDE "3a.lds"
+
+# RUN: not ld.lld -shared 0.o -T 3.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 3a.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 3b.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+#      CHECK3:{{.*}}3.lds:4: malformed number: ^
+# CHECK3-NEXT:>>>   boom ^temp : { *(.temp) }
+# CHECK3-NEXT:>>>        ^
diff --git a/lld/test/ELF/linkerscript/diag1.test b/lld/test/ELF/linkerscript/diag1.test
deleted file mode 100644
index 829bc5a1bffaf..0000000000000
--- a/lld/test/ELF/linkerscript/diag1.test
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text + { *(.text) }
-  .keep : { *(.keep) } /*
-  comment line 1
-  comment line 2 */
-  .temp : { *(.temp) }
-}
-
-CHECK:      6: malformed number: +
-CHECK-NEXT: >>>   .text + { *(.text) }
-CHECK-NEXT: >>>         ^
diff --git a/lld/test/ELF/linkerscript/diag2.test b/lld/test/ELF/linkerscript/diag2.test
deleted file mode 100644
index aeb623dbb7f4b..0000000000000
--- a/lld/test/ELF/linkerscript/diag2.test
+++ /dev/null
@@ -1,13 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-UNKNOWN_TAG {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  .temp : { *(.temp) }
-}
-
-CHECK:      5: unknown directive: UNKNOWN_TAG
-CHECK-NEXT: >>> UNKNOWN_TAG {
-CHECK-NEXT: >>> ^
diff --git a/lld/test/ELF/linkerscript/diag3.test b/lld/test/ELF/linkerscript/diag3.test
deleted file mode 100644
index 1df8d601db016..0000000000000
--- a/lld/test/ELF/linkerscript/diag3.test
+++ /dev/null
@@ -1,13 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      8: malformed number: ^
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag4.test b/lld/test/ELF/linkerscript/diag4.test
deleted file mode 100644
index d93a69a95c61d..0000000000000
--- a/lld/test/ELF/linkerscript/diag4.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: echo "INCLUDE \"%s\"" > %t.script
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %t.script 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      9: malformed number: ^{{$}}
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag5.test b/lld/test/ELF/linkerscript/diag5.test
deleted file mode 100644
index 9a2304baa4413..0000000000000
--- a/lld/test/ELF/linkerscript/diag5.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: echo "INCLUDE \"%s\"" > %t.script
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %t.script 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      9: malformed number: ^
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag6.test b/lld/test/ELF/linkerscript/diag6.test
deleted file mode 100644
index 0ec0400040b54..0000000000000
--- a/lld/test/ELF/linkerscript/diag6.test
+++ /dev/null
@@ -1,7 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck %s
-
-SECTIONS /*
-
-CHECK: error: {{.*}}diag6.test:1: unclosed comment in a linker script
diff --git a/lld/test/ELF/linkerscript/group.s b/lld/test/ELF/linkerscript/group.s
index 89b09e529f466..8cfe1631a8c84 100644
--- a/lld/test/ELF/linkerscript/group.s
+++ b/lld/test/ELF/linkerscript/group.s
@@ -3,62 +3,61 @@
 
 # RUN: rm -rf %t.dir && mkdir %t.dir
 # RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o b.o
-# RUN: llvm-ar rc %t.dir/libxyz.a b.o
-
-# RUN: echo 'GROUP("a.o")' > %t.t
-# RUN: ld.lld -o %t2 %t.t
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'INPUT("a.o")' > %t.t
-# RUN: ld.lld -o %t2 %t.t
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" =libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t  2>/dev/null
-# RUN: ld.lld -o %t2 %t.t --sysroot=%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" -lxyz )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t  2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t  2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" /libxyz.a )' > %t.t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o xyz.o
+# RUN: llvm-ar rc %t.dir/libb.a b.o
+# RUN: llvm-ar rc %t.dir/libxyz.a xyz.o
+
+# RUN: echo 'GROUP("a.o" libxyz.a -lxyz b.o )' > 1.t
+# RUN: not ld.lld 1.t 2>&1 | FileCheck %s --check-prefix=NOLIB
+# RUN: ld.lld 1.t -L%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP( "a.o" b.o =libxyz.a )' > 2.t
+# RUN: not ld.lld 2.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=libxyz.a
+# RUN: ld.lld 2.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP("%t.dir/3a.t")' > 3.t
+# RUN: echo 'INCLUDE "%t.dir/3a.t"' > 3i.t
+# RUN: echo 'GROUP(AS_NEEDED("a.o"))INPUT(/libb.a)' > %t.dir/3a.t
+# RUN: ld.lld 3.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+# RUN: ld.lld 3i.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP("%t.dir/4a.t")INPUT(/libb.a)' > 4.t
+# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > %t.dir/4a.t
+# RUN: not ld.lld 4.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libb.a
+
+# RUN: echo 'INCLUDE "%t.dir/5a.t" INPUT(/libb.a)' > 5.t
+# RUN: echo 'GROUP(a.o)' > %t.dir/5a.t
+# RUN: not ld.lld 5.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libb.a
+
+# CHECK: T _start
+
+# NOLIB: error: {{.*}}unable to find
+
+# RUN: echo 'GROUP("a.o" /libxyz.a )' > a.t
 # RUN: echo 'GROUP("%t/a.o" /libxyz.a )' > %t.dir/xyz.t
-# RUN: not ld.lld -o /dev/null %t.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
-# RUN: not ld.lld -o /dev/null %t.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
+# RUN: not ld.lld a.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
+# RUN: not ld.lld a.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
 
 ## Since %t.dir/%t does not exist, report an error, instead of falling back to %t
 ## without the syroot prefix.
-# RUN: not ld.lld -o /dev/null %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o
+# RUN: not ld.lld %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o
 
 # CANNOT_FIND_SYSROOT:      error: {{.*}}xyz.t:1: cannot find [[TMP]] inside {{.*}}.dir
 # CANNOT_FIND_SYSROOT-NEXT: >>> GROUP({{.*}}
 
-# RUN: echo 'GROUP("2.t")' > 1.t
-# RUN: echo 'GROUP("a.o")' > 2.t
-# RUN: ld.lld 1.t
-# RUN: llvm-readobj a.out > /dev/null
-
-# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > 1.t
-# RUN: ld.lld 1.t
-# RUN: llvm-readobj a.out > /dev/null
-
 # CANNOT_OPEN: error: cannot open [[FILE]]: {{.*}}
 
 #--- a.s
 .globl _start
 _start:
-  ret
+  call b
+
+#--- b.s
+.globl b
+b:
diff --git a/lld/test/ELF/linkerscript/header-phdr.test b/lld/test/ELF/linkerscript/header-phdr.test
deleted file mode 100644
index 866e2d487b31a..0000000000000
--- a/lld/test/ELF/linkerscript/header-phdr.test
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: echo '.section .zed, "a"; .zero 4' \
-# RUN:   | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t.o
-# RUN: ld.lld --script %s %t.o -o %t
-# RUN: llvm-readelf -S -l -W %t | FileCheck %s
-
-# CHECK: [ 1] .abc              PROGBITS        0000000000001000 001000 000004 00   A  0   0  1
-# CHECK: LOAD           0x000000 0x0000000000000000 0x0000000000000000 0x001004 0x001004 R E 0x1000
-
-PHDRS { foobar PT_LOAD FILEHDR PHDRS; }
-
-SECTIONS {
-  . = 0x1000;
-  .abc : { *(.zed) } : foobar
-}
diff --git a/lld/test/ELF/linkerscript/include-cycle.s b/lld/test/ELF/linkerscript/include-cycle.s
index e93ed904f05a3..a87b079a36925 100644
--- a/lld/test/ELF/linkerscript/include-cycle.s
+++ b/lld/test/ELF/linkerscript/include-cycle.s
@@ -1,15 +1,32 @@
 # REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
 
-# RUN: echo "INCLUDE \"%t1.script\"" > %t1.script
-# RUN: not ld.lld %t.o %t1.script 2>&1 | FileCheck %s
+# RUN: not ld.lld a.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=ERR1
+# ERR1: error: 1.lds:1: there is a cycle in linker script INCLUDEs
 
-# RUN: echo "INCLUDE \"%t2.script\"" > %t1.script
-# RUN: echo "INCLUDE \"%t1.script\"" > %t2.script
-# RUN: not ld.lld %t.o %t1.script 2>&1 | FileCheck %s
+# RUN: not ld.lld a.o -T 2a.lds 2>&1 | FileCheck %s --check-prefix=ERR2
+# ERR2: error: 2b.lds:1: there is a cycle in linker script INCLUDEs
 
-# CHECK: there is a cycle in linker script INCLUDEs
+# RUN: ld.lld a.o -T 3.lds -o 3
+# RUN: llvm-objdump -s 3 | FileCheck %s --check-prefix=CHECK3
+# CHECK3:      Contents of section foo:
+# CHECK3-NEXT: 0000 2a2a                    **
 
+#--- 0.lds
+BYTE(42)
+#--- 1.lds
+INCLUDE "1.lds"
+#--- 2a.lds
+INCLUDE "2b.lds"
+#--- 2b.lds
+INCLUDE "2a.lds"
+#--- 3.lds
+SECTIONS {
+  foo : { INCLUDE "0.lds" INCLUDE "0.lds" }
+}
+
+#--- a.s
 .globl _start
 _start:
   ret
diff --git a/lld/test/ELF/linkerscript/insert-after.test b/lld/test/ELF/linkerscript/insert-after.test
index 4b25ff36806cf..38eb0481a10a9 100644
--- a/lld/test/ELF/linkerscript/insert-after.test
+++ b/lld/test/ELF/linkerscript/insert-after.test
@@ -41,7 +41,7 @@
 
 SECTIONS { .byte : { BYTE(0) } } INSERT AFTER .data;
 
-SECTIONS { .foo.data : { *(.foo.data) } } INSERT AFTER .data;
+SECTIONS { .foo.data : { *(.foo.data) } } INSERT AFTER ".data";
 
 ## The input section .foo.text is an orphan. It will be placed in .foo.text
 SECTIONS { .foo.text : {} } INSERT AFTER .text;
diff --git a/lld/test/ELF/linkerscript/insert-before.test b/lld/test/ELF/linkerscript/insert-before.test
index a72834988007c..f06c73327f088 100644
--- a/lld/test/ELF/linkerscript/insert-before.test
+++ b/lld/test/ELF/linkerscript/insert-before.test
@@ -50,4 +50,4 @@ SECTIONS { .byte : { BYTE(0) } } INSERT BEFORE .data;
 SECTIONS { .foo.data : { *(.foo.data) } } INSERT BEFORE .data;
 
 ## The input section .foo.text is an orphan. It will be placed in .foo.text
-SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE .text;
+SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE ".text";
diff --git a/lld/test/ELF/invalid-linkerscript.test b/lld/test/ELF/linkerscript/invalid.test
similarity index 97%
rename from lld/test/ELF/invalid-linkerscript.test
rename to lld/test/ELF/linkerscript/invalid.test
index 4cbedf639cb1a..73b761ce4d571 100644
--- a/lld/test/ELF/invalid-linkerscript.test
+++ b/lld/test/ELF/linkerscript/invalid.test
@@ -15,7 +15,7 @@
 
 # RUN: echo foobar > %t1
 # RUN: not ld.lld %t1 no-such-file 2>&1 | FileCheck -check-prefix=ERR1 %s
-# ERR1: unexpected EOF
+# ERR1: error: {{.*}}1:1: unknown directive: foobar
 # ERR1: cannot open no-such-file:
 
 # RUN: echo "foo \"bar" > %t2
diff --git a/lld/test/ELF/linkerscript/map-file.test b/lld/test/ELF/linkerscript/map-file.test
index 6ec8bafc42b16..6347c3a5d900a 100644
--- a/lld/test/ELF/linkerscript/map-file.test
+++ b/lld/test/ELF/linkerscript/map-file.test
@@ -7,17 +7,17 @@
 # RUN: FileCheck -strict-whitespace %s < %t.map
 
 SECTIONS {
-  . = 0x1000;
+  .		 = 	0x1000;  # tabs
   .foo : {
-    BYTE(0x11)
-    SHORT(0x1122)
+    BYTE ( 0x11 )
+    SHORT (0x1122)
     LONG(0x11223344)
     QUAD(0x1122334455667788)
     PROVIDE_HIDDEN(sym4 = .);
     . += 0x1000;
     *(.foo.1)
     PROVIDE(unused1 = 0xff);
-    HIDDEN(sym6 = .);
+    HIDDEN(  sym6  =  .  );
     . += 0x123 *
          (1 + 1);
     foo = .;
@@ -34,20 +34,20 @@ SECTIONS {
 # CHECK-NEXT:      0                0     1000     1 . = 0x1000
 # CHECK-NEXT:   1000             1000     125d     1 .foo
 # CHECK-NEXT:   1000             1000        1     1         BYTE ( 0x11 )
-# CHECK-NEXT:   1001             1001        2     1         SHORT ( 0x1122 )
-# CHECK-NEXT:   1003             1003        4     1         LONG ( 0x11223344 )
-# CHECK-NEXT:   1007             1007        8     1         QUAD ( 0x1122334455667788 )
-# CHECK-NEXT:   100f             100f        0     1         PROVIDE_HIDDEN ( sym4 = . )
+# CHECK-NEXT:   1001             1001        2     1         SHORT (0x1122)
+# CHECK-NEXT:   1003             1003        4     1         LONG(0x11223344)
+# CHECK-NEXT:   1007             1007        8     1         QUAD(0x1122334455667788)
+# CHECK-NEXT:   100f             100f        0     1         PROVIDE_HIDDEN(sym4 = .)
 # CHECK-NEXT:   100f             100f     1000     1         . += 0x1000
 # CHECK-NEXT:   200f             200f        8     1         {{.*}}{{/|\\}}map-file.test.tmp.o:(.foo.1)
-# CHECK-NEXT:   2017             2017        0     1         HIDDEN ( sym6 = . )
-# CHECK-NEXT:   2017             2017      246     1         . += 0x123 * ( 1 + 1 )
+# CHECK-NEXT:   2017             2017        0     1         HIDDEN( sym6 = . )
+# CHECK-NEXT:   2017             2017      246     1         . += 0x123 * (1 + 1)
 # CHECK-NEXT:   225d             225d        0     1         foo = .
 # CHECK-NEXT:   225d             225d        0     1         bar = 0x42 - 0x26
 # CHECK-NEXT:   225d             225d        0     1 sym1 = .
 # CHECK-NEXT:   225d             225d      500     1 . += 0x500
 # CHECK-NEXT:   275d             275d        0     1 sym2 = .
-# CHECK-NEXT:   275d             275d        0     1 PROVIDE ( sym3 = 42 )
+# CHECK-NEXT:   275d             275d        0     1 PROVIDE(sym3 = 42)
 # CHECK-NEXT:   2760             2760       10     4 .text
 # CHECK-NEXT:   2760             2760       10     4         {{.*}}{{/|\\}}map-file.test.tmp.o:(.text)
 # CHECK-NEXT:      0                0        8     1 .comment
diff --git a/lld/test/ELF/linkerscript/map-file2.test b/lld/test/ELF/linkerscript/map-file2.test
index 8efb5d6cd3d34..a34595f14856e 100644
--- a/lld/test/ELF/linkerscript/map-file2.test
+++ b/lld/test/ELF/linkerscript/map-file2.test
@@ -27,7 +27,7 @@ SECTIONS {
 # CHECK-NEXT:       1010             3000        8     1         {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ccc)
 # CHECK-NEXT:       1018             3008      100     1 . += 0x100
 # CHECK-NEXT:       1118             3108      109     1 .ddd
-# CHECK-NEXT:       1118             3108        1     1         BYTE ( 0x11 )
+# CHECK-NEXT:       1118             3108        1     1         BYTE(0x11)
 # CHECK-NEXT:       1119             3109      100     1         . += 0x100
 # CHECK-NEXT:       1219             3209        8     1         {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ddd)
 # CHECK-NEXT:       1228             3218       34     8 .eh_frame
diff --git a/lld/test/ELF/linkerscript/memory-err.s b/lld/test/ELF/linkerscript/memory-err.s
index 5ec190a415b29..5500b840ac830 100644
--- a/lld/test/ELF/linkerscript/memory-err.s
+++ b/lld/test/ELF/linkerscript/memory-err.s
@@ -71,6 +71,10 @@
 # RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=NOT_CONVERGE %s
 # NOT_CONVERGE: error: address (0x14) of section '.text' does not converge
 
+# RUN: echo 'MEMORY { ram : ORIGIN = symbol, LENGTH = 4094 ' > %t.script
+# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+# UNCLOSED: error: {{.*}}:1: unexpected EOF
+
 nop
 
 .data
diff --git a/lld/test/ELF/linkerscript/nocrossrefs.test b/lld/test/ELF/linkerscript/nocrossrefs.test
index f13d50a03be87..5eb56190fe63b 100644
--- a/lld/test/ELF/linkerscript/nocrossrefs.test
+++ b/lld/test/ELF/linkerscript/nocrossrefs.test
@@ -2,6 +2,7 @@
 # RUN: rm -rf %t && split-file %s %t && cd %t
 
 # RUN: llvm-mc --triple=x86_64 -filetype=obj a.s -o a.o
+# RUN: llvm-mc --triple=x86_64 -filetype=obj -crel a.s -o ac.o
 # RUN: llvm-mc --triple=x86_64 -filetype=obj data.s -o data.o
 # RUN: ld.lld a.o data.o -T 0.t 2>&1 | FileCheck %s --check-prefix=CHECK0 --implicit-check-not=warning:
 
@@ -9,7 +10,8 @@
 # CHECK0-NEXT: warning: 0.t:4: ignored with fewer than 2 output sections
 
 # RUN: not ld.lld a.o data.o -T 1.t 2>&1 | FileCheck %s --check-prefix=CHECK1 --implicit-check-not=error:
-# CHECK1:      error: a.o:(.text.start+0x11): prohibited cross reference from '.text' to 'data' in '.data'
+# RUN: not ld.lld ac.o data.o -T 1.t 2>&1 | FileCheck %s --check-prefix=CHECK1 --implicit-check-not=error:
+# CHECK1:      error: a{{.?}}.o:(.text.start+0x11): prohibited cross reference from '.text' to 'data' in '.data'
 
 ## .text and .text1 are in two NOCROSSREFS commands. Violations are reported twice.
 # RUN: not ld.lld --threads=1 a.o data.o -T 2.t 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error:
diff --git a/lld/test/ELF/linkerscript/outputarch.test b/lld/test/ELF/linkerscript/outputarch.test
index 4819a983cfce8..982e55ac3b2ff 100644
--- a/lld/test/ELF/linkerscript/outputarch.test
+++ b/lld/test/ELF/linkerscript/outputarch.test
@@ -1,5 +1,13 @@
 # REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-freebsd /dev/null -o %t1
-# RUN: ld.lld -shared -o %t2 %t1 %s
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o a.o
+# RUN: ld.lld -shared -T 1.lds a.o
 
+#--- 1.lds
 OUTPUT_ARCH(All data written here is ignored)
+
+#--- unclosed.lds
+OUTPUT_ARCH(All
+
+# RUN: not ld.lld -shared -T unclosed.lds a.o 2>&1 | FileCheck %s --check-prefix=UNCLOSED
+# UNCLOSED: error: unclosed.lds:1: unexpected EOF
diff --git a/lld/test/ELF/linkerscript/overlay.test b/lld/test/ELF/linkerscript/overlay.test
index b939ee4c4095b..7c64303b45659 100644
--- a/lld/test/ELF/linkerscript/overlay.test
+++ b/lld/test/ELF/linkerscript/overlay.test
@@ -59,13 +59,13 @@ _start:
 SECTIONS {
 ## LMA defaults to VMA
   OVERLAY 0x1000 : {
-    .big1 { *(.big1) }
+    ".big1" { *(".big1") }
     .small1 { *(.small1) }
   }
 ## .big2 starts at ADDR(.small2)
   OVERLAY : AT (0x2008) {
     .small2 { *(.small2) }
-    .big2 { *(.big2) }
+    ".big2" { *(.big2) }
   }
 ## .empty3 is not discarded. .small3 and .big3 share its address.
   OVERLAY . : AT (0x2014) {
@@ -92,3 +92,11 @@ SECTIONS {
     .out.aaa { *(.aaa) } > AX AT>FLASH
   }
 }
+
+#--- unclosed.lds
+SECTIONS {
+  OVERLAY 0x1000 : AT ( 0x2000 ) {
+
+# RUN: not ld.lld a.o -T unclosed.lds 2>&1 | FileCheck %s --check-prefix=UNCLOSED
+# UNCLOSED:     error: unclosed.lds:2: unexpected EOF
+# UNCLOSED-NOT: {{.}}
diff --git a/lld/test/ELF/linkerscript/phdr-check.s b/lld/test/ELF/linkerscript/phdr-check.s
deleted file mode 100644
index 63fdec7b6a8bc..0000000000000
--- a/lld/test/ELF/linkerscript/phdr-check.s
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
-
-# RUN: echo "SECTIONS { . = 0x10000000 + SIZEOF_HEADERS; .text : {*(.text.*)} }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck %s
-# CHECK:      ProgramHeaders [
-# CHECK-NEXT:  ProgramHeader {
-# CHECK-NEXT:    Type: PT_PHDR (0x6)
-# CHECK-NEXT:    Offset: 0x40
-# CHECK-NEXT:    VirtualAddress: 0x10000040
-
-.global _start
-_start:
- nop
diff --git a/lld/test/ELF/linkerscript/phdrs.s b/lld/test/ELF/linkerscript/phdrs.s
index 3e645f7919b7c..997f7e39972d2 100644
--- a/lld/test/ELF/linkerscript/phdrs.s
+++ b/lld/test/ELF/linkerscript/phdrs.s
@@ -1,143 +1,116 @@
 # REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS ;} \
-# RUN:       SECTIONS { \
-# RUN:           . = 0x10000200; \
-# RUN:           .text : {*(.text*)} :all \
-# RUN:           .foo : {*(.foo.*)} :all \
-# RUN:           .data : {*(.data.*)} :all}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck %s
-
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+
+#--- 1.lds
+PHDRS {all PT_LOAD FILEHDR PHDRS ;}
+SECTIONS {
+  . = 0x10000200;
+  .text : {*(.text*)} :all
+  .foo : {*(.foo.*)} :"all"
+  .data : {*(.data.*)} : "all"}
+
+# RUN: ld.lld -o 1 -T 1.lds a.o
+# RUN: llvm-readelf -Sl 1 | FileCheck %s
+# CHECK:      [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+# CHECK:      [ 1] .text             PROGBITS        0000000010000200 000200 000001 00  AX  0   0  4
+# CHECK-NEXT: [ 2] .foo              PROGBITS        0000000010000201 000201 000008 00  WA  0   0  1
+
+# CHECK:      Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# CHECK-NEXT: LOAD           0x000000 0x0000000010000000 0x0000000010000000 0x000209 0x000209 RWE 0x1000
+
+#--- 2.lds
 ## Check that program headers are not written, unless we explicitly tell
 ## lld to do this.
-# RUN: echo "PHDRS {all PT_LOAD;} \
-# RUN:       SECTIONS { \
-# RUN:           . = 0x10000200; \
-# RUN:           /DISCARD/ : {*(.text*)}  \
-# RUN:           .foo : {*(.foo.*)} :all \
-# RUN:       }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=NOPHDR %s
-
+PHDRS {all PT_LOAD;}
+SECTIONS {
+    . = 0x10000200;
+    /DISCARD/ : {*(.text*)}
+    .foo : {*(.foo.*)} :all
+}
+
+# RUN: ld.lld -o 2 -T 2.lds a.o
+# RUN: llvm-readelf -l 2 | FileCheck --check-prefix=NOPHDR %s
+# NOPHDR:      Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# NOPHDR-NEXT: LOAD           0x000200 0x0000000010000200 0x0000000010000200 0x000008 0x000008 RW  0x1000
+
+#--- 3.lds
+PHDRS {all PT_LOAD FILEHDR PHDRS ;}
+SECTIONS {
+    . = 0x10000200;
+    .text : {*(.text*)} :all
+    .foo : {*(.foo.*)}
+    .data : {*(.data.*)} }
+
+# RUN: ld.lld -o 3 -T 3.lds a.o
+# RUN: llvm-readelf -l 3 | FileCheck --check-prefix=DEFHDR %s
+# DEFHDR:      Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# DEFHDR-NEXT: LOAD           0x000000 0x0000000010000000 0x0000000010000000 0x000209 0x000209 RWE 0x1000
+
+#--- at.lds
 ## Check the AT(expr)
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS AT(0x500 + 0x500) ;} \
-# RUN:       SECTIONS { \
-# RUN:           . = 0x10000200; \
-# RUN:           .text : {*(.text*)} :all \
-# RUN:           .foo : {*(.foo.*)} :all \
-# RUN:           .data : {*(.data.*)} :all}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=AT %s
-
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS ;} \
-# RUN:       SECTIONS { \
-# RUN:           . = 0x10000200; \
-# RUN:           .text : {*(.text*)} :all \
-# RUN:           .foo : {*(.foo.*)}  \
-# RUN:           .data : {*(.data.*)} }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=DEFHDR %s
+PHDRS {all PT_LOAD FILEHDR PHDRS AT(0x500 + 0x500) ;}
+SECTIONS {
+    . = 0x10000200;
+    .text : {*(.text*)} :all
+    .foo : {*(.foo.*)} :all
+    .data : {*(.data.*)} :all}
+
+# RUN: ld.lld -o at -T at.lds a.o
+# RUN: llvm-readelf -l at | FileCheck --check-prefix=AT %s
+# AT:      Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# AT-NEXT: LOAD           0x000000 0x0000000010000000 0x0000000000000a00 0x000209 0x000209 RWE 0x1000
+
+#--- int.lds
+## Check the numetic values for PHDRS.
+PHDRS {text PT_LOAD FILEHDR PHDRS; foo 0x11223344; }
+SECTIONS { . = SIZEOF_HEADERS; .foo : { *(.foo* .text*) } : text : foo}
+
+# RUN: ld.lld -o int -T int.lds a.o
+# RUN: llvm-readelf -l int | FileCheck --check-prefix=INT-PHDRS %s
+# INT-PHDRS:      Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# INT-PHDRS-NEXT: LOAD           0x000000 0x0000000000000000 0x0000000000000000 0x0000b9 0x0000b9 RWE 0x1000
+# INT-PHDRS-NEXT: <unknown>: 0x11223344 0x0000b0 0x00000000000000b0 0x00000000000000b0 0x000009 0x000009 RWE 0x4
 
+#--- unspecified.lds
 ## Check that error is reported when trying to use phdr which is not listed
 ## inside PHDRS {} block
 ## TODO: If script doesn't contain PHDRS {} block then default phdr is always
 ## created and error is not reported.
-# RUN: echo "PHDRS { all PT_LOAD; } \
-# RUN:       SECTIONS { .baz : {*(.foo.*)} :bar }" > %t.script
-# RUN: not ld.lld -o /dev/null --script %t.script %t 2>&1 | FileCheck --check-prefix=BADHDR %s
-
-# CHECK:     ProgramHeaders [
-# CHECK-NEXT:  ProgramHeader {
-# CHECK-NEXT:    Type: PT_LOAD (0x1)
-# CHECK-NEXT:    Offset: 0x0
-# CHECK-NEXT:    VirtualAddress: 0x10000000
-# CHECK-NEXT:    PhysicalAddress: 0x10000000
-# CHECK-NEXT:    FileSize: 521
-# CHECK-NEXT:    MemSize: 521
-# CHECK-NEXT:    Flags [ (0x7)
-# CHECK-NEXT:      PF_R (0x4)
-# CHECK-NEXT:      PF_W (0x2)
-# CHECK-NEXT:      PF_X (0x1)
-# CHECK-NEXT:    ]
-
-# NOPHDR:     ProgramHeaders [
-# NOPHDR-NEXT:  ProgramHeader {
-# NOPHDR-NEXT:    Type: PT_LOAD (0x1)
-# NOPHDR-NEXT:    Offset: 0x200
-# NOPHDR-NEXT:    VirtualAddress: 0x10000200
-# NOPHDR-NEXT:    PhysicalAddress: 0x10000200
-# NOPHDR-NEXT:    FileSize: 8
-# NOPHDR-NEXT:    MemSize: 8
-# NOPHDR-NEXT:    Flags [ (0x6)
-# NOPHDR-NEXT:      PF_R (0x4)
-# NOPHDR-NEXT:      PF_W (0x2)
-# NOPHDR-NEXT:    ]
-# NOPHDR-NEXT:    Alignment: 4096
-# NOPHDR-NEXT:  }
-# NOPHDR-NEXT: ]
-
-# AT:       ProgramHeaders [
-# AT-NEXT:    ProgramHeader {
-# AT-NEXT:      Type: PT_LOAD (0x1)
-# AT-NEXT:      Offset: 0x0
-# AT-NEXT:      VirtualAddress: 0x10000000
-# AT-NEXT:      PhysicalAddress: 0xA00
-# AT-NEXT:      FileSize: 521
-# AT-NEXT:      MemSize: 521
-# AT-NEXT:      Flags [ (0x7)
-# AT-NEXT:        PF_R (0x4)
-# AT-NEXT:        PF_W (0x2)
-# AT-NEXT:        PF_X (0x1)
-# AT-NEXT:      ]
+PHDRS { all PT_LOAD; }
+SECTIONS { .baz : {*(.foo.*)} :bar }
 
-## Check the numetic values for PHDRS.
-# RUN: echo "PHDRS {text PT_LOAD FILEHDR PHDRS; foo 0x11223344; } \
-# RUN:       SECTIONS { . = SIZEOF_HEADERS; .foo : { *(.foo* .text*) } : text : foo}" > %t1.script
-# RUN: ld.lld -o %t2 --script %t1.script %t
-# RUN: llvm-readobj -l %t2 | FileCheck --check-prefix=INT-PHDRS %s
-
-# INT-PHDRS:      ProgramHeaders [
-# INT-PHDRS:        ProgramHeader {
-# INT-PHDRS:           Type: Unknown (0x11223344)
-# INT-PHDRS-NEXT:      Offset: 0xB0
-# INT-PHDRS-NEXT:      VirtualAddress: 0xB0
-# INT-PHDRS-NEXT:      PhysicalAddress: 0xB0
-# INT-PHDRS-NEXT:      FileSize:
-# INT-PHDRS-NEXT:      MemSize:
-# INT-PHDRS-NEXT:      Flags [
-# INT-PHDRS-NEXT:        PF_R
-# INT-PHDRS-NEXT:        PF_W
-# INT-PHDRS-NEXT:        PF_X
-# INT-PHDRS-NEXT:      ]
-# INT-PHDRS-NEXT:      Alignment:
-# INT-PHDRS-NEXT:    }
-# INT-PHDRS-NEXT:  ]
-
-# DEFHDR:     ProgramHeaders [
-# DEFHDR-NEXT:  ProgramHeader {
-# DEFHDR-NEXT:    Type: PT_LOAD (0x1)
-# DEFHDR-NEXT:    Offset: 0x0
-# DEFHDR-NEXT:    VirtualAddress: 0x10000000
-# DEFHDR-NEXT:    PhysicalAddress: 0x10000000
-# DEFHDR-NEXT:    FileSize: 521
-# DEFHDR-NEXT:    MemSize: 521
-# DEFHDR-NEXT:    Flags [ (0x7)
-# DEFHDR-NEXT:      PF_R (0x4)
-# DEFHDR-NEXT:      PF_W (0x2)
-# DEFHDR-NEXT:      PF_X (0x1)
-# DEFHDR-NEXT:    ]
-
-# BADHDR:       {{.*}}.script:1: program header 'bar' is not listed in PHDRS
-
-# RUN: echo "PHDRS { text PT_LOAD FOOHDR; }" > %t1.script
-# RUN: not ld.lld -o /dev/null --script %t1.script %t 2>&1 | FileCheck --check-prefix=FOOHDR %s
-# FOOHDR: error: {{.*}}.script:1: unexpected header attribute: FOOHDR
-
-# RUN: echo "PHDRS { text PT_FOO FOOHDR; }" > %t1.script
-# RUN: not ld.lld -o /dev/null --script %t1.script %t 2>&1 | FileCheck --check-prefix=PTFOO %s
-# PTFOO: invalid program header type: PT_FOO
+# RUN: not ld.lld -T unspecified.lds a.o 2>&1 | FileCheck --check-prefix=UNSPECIFIED %s
+# UNSPECIFIED: unspecified.lds:6: program header 'bar' is not listed in PHDRS
+
+#--- foohdr.lds
+PHDRS { text PT_LOAD FOOHDR; }
+
+# RUN: not ld.lld -T foohdr.lds a.o 2>&1 | FileCheck --check-prefix=FOOHDR %s
+# FOOHDR: error: foohdr.lds:1: unexpected header attribute: FOOHDR
+
+#--- pt_foo.lds
+PHDRS { text PT_FOO FOOHDR; }
+
+# RUN: not ld.lld -T pt_foo.lds a.o 2>&1 | FileCheck --check-prefix=PTFOO %s --strict-whitespace
+#      PTFOO:{{.*}}error: pt_foo.lds:1: invalid program header type: PT_FOO
+# PTFOO-NEXT:>>> PHDRS { text PT_FOO FOOHDR; }
+# PTFOO-NEXT:>>>              ^
+
+#--- unclosed.lds
+PHDRS { text PT_LOAD ;
+
+# RUN: not ld.lld -T unclosed.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+#     UNCLOSED:error: unclosed.lds:1: unexpected EOF
+# UNCLOSED-NOT:{{.}}
+
+#--- unclosed2.lds
+PHDRS { text PT_LOAD
+
+# RUN: not ld.lld -T unclosed2.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED2 %s
+# UNCLOSED2: error: unclosed2.lds:1: unexpected header attribute:
 
+#--- a.s
 .global _start
 _start:
  nop
diff --git a/lld/test/ELF/linkerscript/region-alias.s b/lld/test/ELF/linkerscript/region-alias.s
index db716e180cc52..f6a6e1f1ddbd4 100644
--- a/lld/test/ELF/linkerscript/region-alias.s
+++ b/lld/test/ELF/linkerscript/region-alias.s
@@ -11,7 +11,7 @@
 # RUN: }" > %t.script
 
 ## .text to ROM, .data to RAM.
-# RUN: echo "REGION_ALIAS (\"ALIAS_TEXT\", ROM);" > %t.script.inc
+# RUN: echo 'REGION_ALIAS ("ALIAS_TEXT", "ROM");' > %t.script.inc
 # RUN: echo "REGION_ALIAS (\"ALIAS_DATA\", RAM);" >> %t.script.inc
 # RUN: ld.lld %t --script %t.script -o %t2
 # RUN: llvm-objdump --section-headers %t2 | FileCheck %s
diff --git a/lld/test/ELF/linkerscript/sections.s b/lld/test/ELF/linkerscript/sections.s
index fc03af8402dfe..2a27d9e736db5 100644
--- a/lld/test/ELF/linkerscript/sections.s
+++ b/lld/test/ELF/linkerscript/sections.s
@@ -1,18 +1,22 @@
 # REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
 
-# Empty SECTIONS command.
-# RUN: echo "SECTIONS {}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-objdump --section-headers %t1 | \
+#--- empty.lds
+SECTIONS {}
+
+# RUN: ld.lld -o empty -T empty.lds a.o
+# RUN: llvm-objdump --section-headers empty | \
 # RUN:   FileCheck -check-prefix=SEC-DEFAULT %s
 
+#--- 1.lds
 # SECTIONS command with the same order as default.
-# RUN: echo "SECTIONS { \
-# RUN:          .text : { *(.text) } \
-# RUN:          .data : { *(.data) } }" > %t.script
-# RUN: ld.lld -o %t2 --script %t.script %t
-# RUN: llvm-objdump --section-headers %t2 | \
+SECTIONS {
+   .text : { *(.text) }
+   .data : { *(.data) } }
+
+# RUN: ld.lld -o 1 -T 1.lds a.o
+# RUN: llvm-objdump --section-headers 1 | \
 # RUN:   FileCheck -check-prefix=SEC-DEFAULT %s
 
 #             Idx Name          Size
@@ -28,8 +32,8 @@
 # .text and .data have swapped names but proper sizes and types.
 # RUN: echo "SECTIONS { \
 # RUN:          .data : { *(.text) } \
-# RUN:          .text : { *(.data) } }" > %t.script
-# RUN: ld.lld -o %t4 --script %t.script %t
+# RUN:          .text : { *(.data) } }" > t.lds
+# RUN: ld.lld -o %t4 --script t.lds a.o
 # RUN: llvm-objdump --section-headers %t4 | \
 # RUN:   FileCheck -check-prefix=SEC-SWAP-NAMES %s
 
@@ -50,8 +54,8 @@
 # RUN:          .text : { *(.text) } \
 # RUN:          .data : { *(.data) } } \
 # RUN:       SECTIONS { \
-# RUN:          .data : { *(other) } }" > %t.script
-# RUN: ld.lld -o %t6 --script %t.script %t
+# RUN:          .data : { *(other) } }" > t.lds
+# RUN: ld.lld -o %t6 --script t.lds a.o
 # RUN: llvm-objdump --section-headers %t6 | \
 # RUN:   FileCheck -check-prefix=SEC-MULTI %s
 
@@ -72,7 +76,7 @@
 # RUN:          .data : { *(.data) } \
 # RUN:          .comment : { *(.comment) } \
 # RUN:          other : { *(other) } }' > %t5.lds
-# RUN: ld.lld -o %t5 -T %t5.lds %t
+# RUN: ld.lld -o %t5 -T %t5.lds a.o
 # RUN: llvm-readelf -S -l %t5 | FileCheck --check-prefix=SEP-BY-NONALLOC %s
 
 # SEP-BY-NONALLOC:      [Nr] Name      Type     Address          Off    Size   ES Flg
@@ -87,11 +91,26 @@
 # SEP-BY-NONALLOC-NEXT: LOAD      0x00100e 0x000000000000000e 0x000000000000000e 0x000023 0x000025 RW  0x1000
 # SEP-BY-NONALLOC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW  0
 
+#--- semi.lds
 # Input section pattern contains additional semicolon.
 # Case found in linux kernel script. Check we are able to parse it.
-# RUN: echo "SECTIONS { .text : { ;;*(.text);;S = 0;; } }" > %t.script
-# RUN: ld.lld -o /dev/null --script %t.script %t
+SECTIONS { .text : { ;;*(.text);;S = 0;; } }
+
+# RUN: ld.lld -T semi.lds a.o
+
+#--- unclosed.lds
+SECTIONS {
+   .text : { *(.text) }
+
+# RUN: not ld.lld -T unclosed.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+#     UNCLOSED:error: unclosed.lds:2: unexpected EOF
+# UNCLOSED-NOT:{{.}}
+
+#--- unclosed-out.lds
+SECTIONS {
+   .text : { *(.text)
 
+#--- a.s
 .globl _start
 _start:
     mov $60, %rax
diff --git a/lld/test/ELF/linkerscript/symbolreferenced.s b/lld/test/ELF/linkerscript/symbolreferenced.s
index 6848082690837..ad356d45fac4c 100644
--- a/lld/test/ELF/linkerscript/symbolreferenced.s
+++ b/lld/test/ELF/linkerscript/symbolreferenced.s
@@ -94,7 +94,7 @@ PROVIDE(unused = g1);
 PROVIDE_HIDDEN(another_unused = g1);
 
 #--- chain_with_cycle.t
-PROVIDE(f1 = f2 + f3);
+PROVIDE("f1" = f2 + f3);
 PROVIDE(f2 = f3 + f4);
 PROVIDE(f3 = f4);
 PROVIDE(f4 = f1);
diff --git a/lld/test/ELF/linkerscript/unquoted.test b/lld/test/ELF/linkerscript/unquoted.test
new file mode 100644
index 0000000000000..9a30ae8a37ff7
--- /dev/null
+++ b/lld/test/ELF/linkerscript/unquoted.test
@@ -0,0 +1,25 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o 0.o
+
+#--- empty.lds
+#--- 1.lds
+
+SECTIONS /*
+#--- 1a.lds
+foo = 3;
+INCLUDE "empty.lds"
+INCLUDE "1.lds"
+
+# RUN: not ld.lld -shared 0.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 1a.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+#      CHECK1:{{.*}}error: 1.lds:2: unclosed comment in a linker script
+# CHECK1-NEXT:>>> SECTIONS /*
+# CHECK1-NEXT:>>> ^
+
+#--- 2.lds
+INCLUDE "empty.lds"
+"
+# RUN: not ld.lld -shared 0.o -T 2.lds 2>&1 | FileCheck %s --check-prefix=CHECK2 --match-full-lines --strict-whitespace
+#      CHECK2:{{.*}}error: 2.lds:2: unclosed quote
+# CHECK2-NOT:{{.}}
diff --git a/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
new file mode 100644
index 0000000000000..d4d12b9d4a520
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
@@ -0,0 +1,129 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/bc.s -o %t/bc.32.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.32.o -o %t/bc.32.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/bc.s -o %t/bc.64.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.64.o -o %t/bc.64.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 GD
+# RUN: ld.lld -shared %t/a.32.o %t/bc.32.o -o %t/gd.32.so
+# RUN: llvm-readobj -r %t/gd.32.so | FileCheck --check-prefix=GD32-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.32.so | FileCheck --check-prefix=GD32 %s
+
+## LA32 GD -> LE
+# RUN: ld.lld %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: ld.lld -pie %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le-pie.32
+# RUN: llvm-readelf -r %t/le-pie.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.32 | FileCheck --check-prefix=LE32-GOT %s
+
+## LA32 GD -> IE
+# RUN: ld.lld %t/a.32.o %t/bc.32.so %t/tga.32.o -o %t/ie.32
+# RUN: llvm-readobj -r %t/ie.32 | FileCheck --check-prefix=IE32-REL %s
+# RUN: llvm-readelf -x .got %t/ie.32 | FileCheck --check-prefix=IE32-GOT %s
+
+## LA64 GD
+# RUN: ld.lld -shared %t/a.64.o %t/bc.64.o -o %t/gd.64.so
+# RUN: llvm-readobj -r %t/gd.64.so | FileCheck --check-prefix=GD64-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.64.so | FileCheck --check-prefix=GD64 %s
+
+## LA64 GD -> LE
+# RUN: ld.lld %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: ld.lld -pie %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le-pie.64
+# RUN: llvm-readelf -r %t/le-pie.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.64 | FileCheck --check-prefix=LE64-GOT %s
+
+## LA64 GD -> IE
+# RUN: ld.lld %t/a.64.o %t/bc.64.so %t/tga.64.o -o %t/ie.64
+# RUN: llvm-readobj -r %t/ie.64 | FileCheck --check-prefix=IE64-REL %s
+# RUN: llvm-readelf -x .got %t/ie.64 | FileCheck --check-prefix=IE64-GOT %s
+
+# GD32-REL:      .rela.dyn {
+# GD32-REL-NEXT:   0x20300 R_LARCH_TLS_DTPMOD32 a 0x0
+# GD32-REL-NEXT:   0x20304 R_LARCH_TLS_DTPREL32 a 0x0
+# GD32-REL-NEXT:   0x20308 R_LARCH_TLS_DTPMOD32 b 0x0
+# GD32-REL-NEXT:   0x2030C R_LARCH_TLS_DTPREL32 b 0x0
+# GD32-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x20300 - 0x10250 = 16428<<2
+# GD32:      10250: pcaddi $a0, 16428
+# GD32-NEXT:        bl 44
+
+## &DTPMOD(b) - . = 0x20308 - 0x10258 = 16428<<2
+# GD32:      10258: pcaddi $a0, 16428
+# GD32-NEXT:        bl 36
+
+# GD64-REL:      .rela.dyn {
+# GD64-REL-NEXT:   0x204C0 R_LARCH_TLS_DTPMOD64 a 0x0
+# GD64-REL-NEXT:   0x204C8 R_LARCH_TLS_DTPREL64 a 0x0
+# GD64-REL-NEXT:   0x204D0 R_LARCH_TLS_DTPMOD64 b 0x0
+# GD64-REL-NEXT:   0x204D8 R_LARCH_TLS_DTPREL64 b 0x0
+# GD64-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x204c0 - 0x10398 = 16458<<2
+# GD64:      10398: pcaddi $a0, 16458
+# GD64-NEXT:        bl 52
+
+## &DTPMOD(b) - . = 0x204d0 - 0x103a4 = 16460<<2
+# GD64:      103a0: pcaddi $a0, 16460
+# GD64-NEXT:        bl 44
+
+# NOREL: no relocations
+
+## .got contains pre-populated values: [a@dtpmod, a@dtprel, b@dtpmod, b@dtprel]
+## a@dtprel = st_value(a) = 0x8
+## b@dtprel = st_value(b) = 0xc
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x[[#%x,A:]] 01000000 08000000 01000000 0c000000
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 08000000 00000000
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 0c000000 00000000
+
+## a is local - relaxed to LE - its DTPMOD/DTPREL slots are link-time constants.
+## b is external - DTPMOD/DTPREL dynamic relocations are required.
+# IE32-REL:      .rela.dyn {
+# IE32-REL-NEXT:   0x30220 R_LARCH_TLS_DTPMOD32 b 0x0
+# IE32-REL-NEXT:   0x30224 R_LARCH_TLS_DTPREL32 b 0x0
+# IE32-REL-NEXT: }
+# IE32-GOT:      section '.got':
+# IE32-GOT-NEXT: 0x00030218 01000000 08000000 00000000 00000000
+
+# IE64-REL:      .rela.dyn {
+# IE64-REL-NEXT:   0x30380 R_LARCH_TLS_DTPMOD64 b 0x0
+# IE64-REL-NEXT:   0x30388 R_LARCH_TLS_DTPREL64 b 0x0
+# IE64-REL-NEXT: }
+# IE64-GOT:      section '.got':
+# IE64-GOT-NEXT: 0x00030370 01000000 00000000 08000000 00000000
+# IE64-GOT-NEXT: 0x00030380 00000000 00000000 00000000 00000000
+
+#--- a.s
+pcaddi $a0, %gd_pcrel_20(a)
+bl %plt(__tls_get_addr)
+
+pcaddi $a0, %gd_pcrel_20(b)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 4
+
+#--- bc.s
+.section .tbss,"awT",@nobits
+.globl b, c
+b:
+.zero 4
+c:
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
new file mode 100644
index 0000000000000..70186f5538dfc
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
@@ -0,0 +1,82 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --position-independent %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --position-independent %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 LD
+# RUN: ld.lld -shared %t/a.32.o -o %t/ld.32.so
+# RUN: llvm-readobj -r %t/ld.32.so | FileCheck --check-prefix=LD32-REL %s
+# RUN: llvm-readelf -x .got %t/ld.32.so | FileCheck --check-prefix=LD32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.32.so | FileCheck --check-prefixes=LD32 %s
+
+## LA32 LD -> LE
+# RUN: ld.lld %t/a.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.32 | FileCheck --check-prefixes=LE32 %s
+
+## LA64 LD
+# RUN: ld.lld -shared %t/a.64.o -o %t/ld.64.so
+# RUN: llvm-readobj -r %t/ld.64.so | FileCheck --check-prefix=LD64-REL %s
+# RUN: llvm-readelf -x .got %t/ld.64.so | FileCheck --check-prefix=LD64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.64.so | FileCheck --check-prefixes=LD64 %s
+
+## LA64 LD -> LE
+# RUN: ld.lld %t/a.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.64 | FileCheck --check-prefixes=LE64 %s
+
+## a@dtprel = st_value(a) = 0 is a link-time constant.
+# LD32-REL:      .rela.dyn {
+# LD32-REL-NEXT:   0x20280 R_LARCH_TLS_DTPMOD32 - 0x0
+# LD32-REL-NEXT: }
+# LD32-GOT:      section '.got':
+# LD32-GOT-NEXT: 0x00020280 00000000 00000000
+
+# LD64-REL:      .rela.dyn {
+# LD64-REL-NEXT:   0x20400 R_LARCH_TLS_DTPMOD64 - 0x0
+# LD64-REL-NEXT: }
+# LD64-GOT:      section '.got':
+# LD64-GOT-NEXT: 0x00020400 00000000 00000000 00000000 00000000
+
+## LA32: &DTPMOD(a) - . = 0x20280 - 0x101cc = 16429<<2
+# LD32:      101cc: pcaddi $a0, 16429
+# LD32-NEXT:        bl 48
+
+## LA64: &DTPMOD(a) - . = 0x20400 - 0x102e0 = 16456<<2
+# LD64:      102e0: pcaddi $a0, 16456
+# LD64-NEXT:        bl 44
+
+# NOREL: no relocations
+
+## a is local - its DTPMOD/DTPREL slots are link-time constants.
+## a@dtpmod = 1 (main module)
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x0003011c 01000000 00000000
+
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x000301d0 01000000 00000000 00000000 00000000
+
+## LA32: DTPMOD(.LANCHOR0) - . = 0x3011c - 0x20114 = 16386<<2
+# LE32:      20114: pcaddi $a0, 16386
+# LE32-NEXT:        bl 4
+
+## LA64: DTPMOD(.LANCHOR0) - . = 0x301d0 - 0x201c8 = 16386<<2
+# LE64:      201c8: pcaddi $a0, 16386
+# LE64-NEXT:        bl 4
+
+#--- a.s
+pcaddi $a0, %ld_pcrel_20(.LANCHOR0)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.set .LANCHOR0, . + 0
+.zero 8
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tls-le.s b/lld/test/ELF/loongarch-tls-le.s
index a20d7d83bae3f..394c60f67bce8 100644
--- a/lld/test/ELF/loongarch-tls-le.s
+++ b/lld/test/ELF/loongarch-tls-le.s
@@ -1,14 +1,14 @@
 # REQUIRES: loongarch
 
-# RUN: llvm-mc --filetype=obj --triple=loongarch32 %s -o %t.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --defsym ELF32=1 %s -o %t.32.o
 # RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t.64.o
 
 # RUN: ld.lld %t.32.o -o %t.32
 # RUN: llvm-nm -p %t.32 | FileCheck --check-prefixes=NM %s
-# RUN: llvm-objdump -d --no-show-raw-insn %t.32 | FileCheck --check-prefixes=LE %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.32 | FileCheck --check-prefixes=LE,LE32 %s
 
 # RUN: ld.lld %t.64.o -o %t.64
-# RUN: llvm-objdump -d --no-show-raw-insn %t.64 | FileCheck --check-prefixes=LE %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.64 | FileCheck --check-prefixes=LE,LE64 %s
 
 # RUN: not ld.lld -shared %t.32.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error:
 
@@ -16,6 +16,10 @@
 # ERR: error: relocation R_LARCH_TLS_LE_LO12 against .LANCHOR0 cannot be used with -shared
 # ERR: error: relocation R_LARCH_TLS_LE_HI20 against a cannot be used with -shared
 # ERR: error: relocation R_LARCH_TLS_LE_LO12 against a cannot be used with -shared
+# ERR: error: relocation R_LARCH_TLS_LE_HI20_R against .LANCHOR0 cannot be used with -shared
+# ERR: error: relocation R_LARCH_TLS_LE_LO12_R against .LANCHOR0 cannot be used with -shared
+# ERR: error: relocation R_LARCH_TLS_LE_HI20_R against a cannot be used with -shared
+# ERR: error: relocation R_LARCH_TLS_LE_LO12_R against a cannot be used with -shared
 
 # NM: {{0*}}00000008 b .LANCHOR0
 # NM: {{0*}}00000800 B a
@@ -26,13 +30,50 @@
 # LE-NEXT: ori $a0, $a0, 8
 # LE-NEXT: lu12i.w $a1, 0
 # LE-NEXT: ori $a1, $a1, 2048
+
+# LE32:      add.w   $a0, $a0, $tp
+# LE32-NEXT: addi.w  $a0, $a0, 8
+# LE32-NEXT: lu12i.w $a0, 1
+# LE32-NEXT: add.w   $a0, $a0, $tp
+# LE32-NEXT: addi.w  $a0, $a0, -2048
+
+# LE64:      add.d   $a0, $a0, $tp
+# LE64-NEXT: addi.d  $a0, $a0, 8
+# LE64-NEXT: lu12i.w $a0, 1
+# LE64-NEXT: add.d   $a0, $a0, $tp
+# LE64-NEXT: addi.d  $a0, $a0, -2048
+
 # LE-EMPTY:
 
+.macro add dst, src1, src2, src3
+.ifdef ELF32
+add.w \dst, \src1, \src2, \src3
+.else
+add.d \dst, \src1, \src2, \src3
+.endif
+.endm
+.macro addi dst, src1, src2
+.ifdef ELF32
+addi.w \dst, \src1, \src2
+.else
+addi.d \dst, \src1, \src2
+.endif
+.endm
+
 .text
+
 _start:
 la.tls.le $a0, .LANCHOR0
 la.tls.le $a1, a
 
+lu12i.w $a0, %le_hi20_r(.LANCHOR0)
+add $a0, $a0, $tp, %le_add_r(.LANCHOR0)
+addi $a0, $a0, %le_lo12_r(.LANCHOR0)
+
+lu12i.w $a0, %le_hi20_r(a)
+add $a0, $a0, $tp, %le_add_r(a)
+addi $a0, $a0, %le_lo12_r(a)
+
 .section .tbss,"awT",@nobits
 .space 8
 .LANCHOR0:
diff --git a/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
new file mode 100644
index 0000000000000..99e21d9935197
--- /dev/null
+++ b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
@@ -0,0 +1,142 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 a.s -o a.64.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 c.s -o c.64.o
+# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 a.s -o a.32.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 c.s -o c.32.o
+# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so
+# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
+# RUN: llvm-objdump --no-show-raw-insn -h -d a.64.so | FileCheck %s --check-prefix=GD64
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel
+# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s
+
+## FIXME: The transition frome TLSDESC to IE/LE has not yet been implemented.
+## Keep the dynamic relocations and hand them over to dynamic linker.
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le
+# RUN: llvm-readobj -r -x .got a.64.le | FileCheck --check-prefix=LE64-RELA %s
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie
+# RUN: llvm-readobj -r -x .got a.64.ie | FileCheck --check-prefix=IE64-RELA %s
+
+## 32-bit code is mostly the same. We only test a few variants.
+
+# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel
+# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s
+
+# GD64-RELA:      .rela.dyn {
+# GD64-RELA-NEXT:   0x203F0 R_LARCH_TLS_DESC64 - 0x7FF
+# GD64-RELA-NEXT:   0x203D0 R_LARCH_TLS_DESC64 a 0x0
+# GD64-RELA-NEXT:   0x203E0 R_LARCH_TLS_DESC64 c 0x0
+# GD64-RELA-NEXT: }
+# GD64-RELA:      Hex dump of section '.got':
+# GD64-RELA-NEXT: 0x000203d0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203e0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203f0 00000000 00000000 00000000 00000000 .
+
+# GD64-REL:      .rel.dyn {
+# GD64-REL-NEXT:   0x203D8 R_LARCH_TLS_DESC64 -
+# GD64-REL-NEXT:   0x203B8 R_LARCH_TLS_DESC64 a
+# GD64-REL-NEXT:   0x203C8 R_LARCH_TLS_DESC64 c
+# GD64-REL-NEXT: }
+# GD64-REL:      Hex dump of section '.got':
+# GD64-REL-NEXT: 0x000203b8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203c8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203d8 00000000 00000000 ff070000 00000000 .
+
+# GD64:      .got     00000030 00000000000203d0
+
+## &.got[a]-. = 0x203d0 - 0x102e0 = 16444<<2
+# GD64:        102e0: pcaddi $a0, 16444
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a1, $a0, $tp
+
+## &.got[b]-. = 0x203d0+32 - 0x102f0 = 16448<<2
+# GD64:        102f0: pcaddi $a0, 16448
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a2, $a0, $tp
+
+## &.got[c]-. = 0x203d0+16 - 0x10300 = 16440<<2
+# GD64:        10300: pcaddi $a0, 16440
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a3, $a0, $tp
+
+# LE64-RELA:      .rela.dyn {
+# LE64-RELA-NEXT:   0x30240 R_LARCH_TLS_DESC64 - 0x8
+# LE64-RELA-NEXT:   0x30250 R_LARCH_TLS_DESC64 - 0x800
+# LE64-RELA-NEXT:   0x30260 R_LARCH_TLS_DESC64 - 0x7FF
+# LE64-RELA-NEXT: }
+# LE64-RELA:      Hex dump of section '.got':
+# LE64-RELA-NEXT: 0x00030240 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030250 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030260 00000000 00000000 00000000 00000000 .
+
+# IE64-RELA:      .rela.dyn {
+# IE64-RELA-NEXT:   0x303C8 R_LARCH_TLS_DESC64 - 0x8
+# IE64-RELA-NEXT:   0x303E8 R_LARCH_TLS_DESC64 - 0x7FF
+# IE64-RELA-NEXT:   0x303D8 R_LARCH_TLS_DESC64 c 0x0
+# IE64-RELA-NEXT: }
+# IE64-RELA:      Hex dump of section '.got':
+# IE64-RELA-NEXT: 0x000303c8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303d8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303e8 00000000 00000000 00000000 00000000 .
+
+# GD32-REL:      .rel.dyn {
+# GD32-REL-NEXT:    0x20264 R_LARCH_TLS_DESC32 -
+# GD32-REL-NEXT:    0x20254 R_LARCH_TLS_DESC32 a
+# GD32-REL-NEXT:    0x2025C R_LARCH_TLS_DESC32 c
+# GD32-REL-NEXT: }
+# GD32-REL:      Hex dump of section '.got':
+# GD32-REL-NEXT: 0x00020254 00000000 00000000 00000000 00000000 .
+# GD32-REL-NEXT: 0x00020264 00000000 ff070000                   .
+
+#--- a.s
+.macro add dst, src1, src2
+.ifdef ELF32
+add.w \dst, \src1, \src2
+.else
+add.d \dst, \src1, \src2
+.endif
+.endm
+.macro load dst, src1, src2
+.ifdef ELF32
+ld.w \dst, \src1, \src2
+.else
+ld.d \dst, \src1, \src2
+.endif
+.endm
+
+pcaddi $a0, %desc_pcrel_20(a)
+load $ra, $a0, %desc_ld(a)
+jirl $ra, $ra, %desc_call(a)
+add $a1, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(b)
+load $ra, $a0, %desc_ld(b)
+jirl $ra, $ra, %desc_call(b)
+add $a2, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(c)
+load $ra, $a0, %desc_ld(c)
+jirl $ra, $ra, %desc_call(c)
+add $a3, $a0, $tp
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 2039  ## Place b at 0x7ff
+b:
+.zero 1
+
+#--- c.s
+.section .tbss,"awT",@nobits
+.globl c
+c: .zero 4
diff --git a/lld/test/ELF/lto/cpu-string.ll b/lld/test/ELF/lto/cpu-string.ll
index 3697cbd6d9472..5444339007b0d 100644
--- a/lld/test/ELF/lto/cpu-string.ll
+++ b/lld/test/ELF/lto/cpu-string.ll
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/ELF/lto/mllvm.ll b/lld/test/ELF/lto/mllvm.ll
index 883a9c8d8dc75..d09dcd0bf7b95 100644
--- a/lld/test/ELF/lto/mllvm.ll
+++ b/lld/test/ELF/lto/mllvm.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 define void @_start() #0 {
 entry:
-  call void asm sideeffect ".p2align 4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align 4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s
index 3cbba68716972..dd18c92c7352e 100644
--- a/lld/test/ELF/pack-dyn-relocs.s
+++ b/lld/test/ELF/pack-dyn-relocs.s
@@ -264,30 +264,30 @@
 // RELR64-EMPTY: 
 // RELR64-NEXT:  Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries:
 // RELR64-NEXT:  Symbolic Address
-// RELR64-NEXT:  $d.0{{$}}
-// RELR64-NEXT:  $d.0 + 0x8
-// RELR64-NEXT:  $d.0 + 0x10
-// RELR64-NEXT:  $d.0 + 0x18
-// RELR64-NEXT:  $d.0 + 0x20
-// RELR64-NEXT:  $d.0 + 0x28
-// RELR64-NEXT:  $d.0 + 0x30
-// RELR64-NEXT:  $d.0 + 0x38
-// RELR64-NEXT:  $d.0 + 0x48
-// RELR64-NEXT:  $d.0 + 0x50
-// RELR64-NEXT:  $d.0 + 0x58
-// RELR64-NEXT:  $d.0 + 0x60
-// RELR64-NEXT:  $d.0 + 0x68
-// RELR64-NEXT:  $d.0 + 0x70
-// RELR64-NEXT:  $d.0 + 0x78
-// RELR64-NEXT:  $d.0 + 0x90
-// RELR64-NEXT:  $d.0 + 0x98
-// RELR64-NEXT:  $d.0 + 0xa0
-// RELR64-NEXT:  $d.0 + 0xa8
-// RELR64-NEXT:  $d.0 + 0xb0
-// RELR64-NEXT:  $d.0 + 0xb8
-// RELR64-NEXT:  $d.0 + 0xc0
-// RELR64-NEXT:  $d.0 + 0xc8
-// RELR64-NEXT:  $d.0 + 0xd0
+// RELR64-NEXT:  $d{{$}}
+// RELR64-NEXT:  $d + 0x8
+// RELR64-NEXT:  $d + 0x10
+// RELR64-NEXT:  $d + 0x18
+// RELR64-NEXT:  $d + 0x20
+// RELR64-NEXT:  $d + 0x28
+// RELR64-NEXT:  $d + 0x30
+// RELR64-NEXT:  $d + 0x38
+// RELR64-NEXT:  $d + 0x48
+// RELR64-NEXT:  $d + 0x50
+// RELR64-NEXT:  $d + 0x58
+// RELR64-NEXT:  $d + 0x60
+// RELR64-NEXT:  $d + 0x68
+// RELR64-NEXT:  $d + 0x70
+// RELR64-NEXT:  $d + 0x78
+// RELR64-NEXT:  $d + 0x90
+// RELR64-NEXT:  $d + 0x98
+// RELR64-NEXT:  $d + 0xa0
+// RELR64-NEXT:  $d + 0xa8
+// RELR64-NEXT:  $d + 0xb0
+// RELR64-NEXT:  $d + 0xb8
+// RELR64-NEXT:  $d + 0xc0
+// RELR64-NEXT:  $d + 0xc8
+// RELR64-NEXT:  $d + 0xd0
 // RELR64-EMPTY:
 // RELR64-NEXT: Hex dump of section '.data':
 // RELR64-NEXT: 0x00030490 90040300 00000000 91040300 00000000 .
diff --git a/lld/test/ELF/relocatable-crel-32.s b/lld/test/ELF/relocatable-crel-32.s
new file mode 100644
index 0000000000000..8fbf236d77452
--- /dev/null
+++ b/lld/test/ELF/relocatable-crel-32.s
@@ -0,0 +1,71 @@
+# REQUIRES: ppc
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=powerpc -crel a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=powerpc -crel b.s -o b.o
+# RUN: ld.lld -r b.o a.o -o out
+# RUN: llvm-readobj -r out | FileCheck %s --check-prefixes=CHECK,CRELFOO
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc a.s -o a1.o
+# RUN: ld.lld -r b.o a1.o -o out1
+# RUN: llvm-readobj -r out1 | FileCheck %s --check-prefixes=CHECK,RELAFOO
+# RUN: ld.lld -r a1.o b.o -o out2
+# RUN: llvm-readobj -r out2 | FileCheck %s --check-prefixes=CHECK2
+
+# CHECK:      Relocations [
+# CHECK-NEXT:   Section (2) .crel.text {
+# CHECK-NEXT:     0x0 R_PPC_REL24 fb 0x0
+# CHECK-NEXT:     0x4 R_PPC_REL24 foo 0x0
+# CHECK-NEXT:     0x8 R_PPC_REL24 .text.foo 0x0
+# CHECK-NEXT:     0xE R_PPC_ADDR16_HA .rodata.str1.1 0x4
+# CHECK-NEXT:     0x12 R_PPC_ADDR16_LO .rodata.str1.1 0x4
+# CHECK-NEXT:     0x16 R_PPC_ADDR16_HA .rodata.str1.1 0x0
+# CHECK-NEXT:     0x1A R_PPC_ADDR16_LO .rodata.str1.1 0x0
+# CHECK-NEXT:   }
+# CRELFOO-NEXT: Section (4) .crel.text.foo {
+# RELAFOO-NEXT: Section (4) .rela.text.foo {
+# CHECK-NEXT:     0x0 R_PPC_REL24 g 0x0
+# CHECK-NEXT:     0x4 R_PPC_REL24 g 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+# CHECK2:      Relocations [
+# CHECK2-NEXT:   Section (2) .crel.text {
+# CHECK2-NEXT:     0x0 R_PPC_REL24 foo 0x0
+# CHECK2-NEXT:     0x4 R_PPC_REL24 .text.foo 0x0
+# CHECK2-NEXT:     0xA R_PPC_ADDR16_HA .rodata.str1.1 0x4
+# CHECK2-NEXT:     0xE R_PPC_ADDR16_LO .rodata.str1.1 0x4
+# CHECK2-NEXT:     0x12 R_PPC_ADDR16_HA .rodata.str1.1 0x0
+# CHECK2-NEXT:     0x16 R_PPC_ADDR16_LO .rodata.str1.1 0x0
+# CHECK2-NEXT:     0x18 R_PPC_REL24 fb 0x0
+# CHECK2-NEXT:   }
+# CHECK2-NEXT:   Section (4) .rela.text.foo {
+# CHECK2-NEXT:     0x0 R_PPC_REL24 g 0x0
+# CHECK2-NEXT:     0x4 R_PPC_REL24 g 0x0
+# CHECK2-NEXT:   }
+# CHECK2-NEXT: ]
+
+#--- a.s
+.global _start, foo
+_start:
+  bl foo
+  bl .text.foo
+  lis 3, .L.str@ha
+  la 3, .L.str@l(3)
+  lis 3, .L.str1@ha
+  la 3, .L.str1@l(3)
+
+.section .text.foo,"ax"
+foo:
+  bl g
+  bl g
+
+.section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+  .asciz  "abc"
+.L.str1:
+  .asciz  "def"
+
+#--- b.s
+.globl fb
+fb:
+  bl fb
diff --git a/lld/test/ELF/relocatable-crel.s b/lld/test/ELF/relocatable-crel.s
new file mode 100644
index 0000000000000..6e97c3e24d66c
--- /dev/null
+++ b/lld/test/ELF/relocatable-crel.s
@@ -0,0 +1,107 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -crel a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -crel b.s -o b.o
+# RUN: ld.lld -r b.o a.o -o out
+# RUN: llvm-readobj -r out | FileCheck %s --check-prefixes=CHECK,CRELFOO
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a1.o
+# RUN: ld.lld -r b.o a1.o -o out1
+# RUN: llvm-readobj -r out1 | FileCheck %s --check-prefixes=CHECK,RELAFOO
+# RUN: ld.lld -r a1.o b.o -o out2
+# RUN: llvm-readobj -r out2 | FileCheck %s --check-prefixes=CHECK2
+
+# CHECK:      Relocations [
+# CHECK-NEXT:   .crel.text {
+# CHECK-NEXT:     0x1 R_X86_64_PLT32 fb 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:     0x9 R_X86_64_PLT32 foo 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:     0xE R_X86_64_PLT32 .text.foo 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:   }
+# CHECK-NEXT:   .crel.rodata {
+# CHECK-NEXT:     0x0 R_X86_64_PC32 foo 0x0
+# CHECK-NEXT:     0xF R_X86_64_PC32 foo 0x3F
+# CHECK-NEXT:     0x1F R_X86_64_PC64 foo 0x7F
+# CHECK-NEXT:     0x27 R_X86_64_PC32 _start 0xFFFFFFFFFFFFE07F
+# CHECK-COUNT-12:      R_X86_64_32 _start 0x0
+# CHECK-NEXT:   }
+# CRELFOO-NEXT: .crel.text.foo {
+# RELAFOO-NEXT: .rela.text.foo {
+# CHECK-NEXT:     0x3 R_X86_64_PC32 .L.str 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:     0xA R_X86_64_PC32 .L.str1 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:     0xF R_X86_64_PLT32 g 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:     0x14 R_X86_64_PLT32 g 0xFFFFFFFFFFFFFFFC
+# CHECK-NEXT:   }
+# CRELFOO-NEXT: .crel.data {
+# RELAFOO-NEXT: .rela.data {
+# CHECK-NEXT:     0x8 R_X86_64_64 _start 0x8000000000000000
+# CHECK-NEXT:     0x18 R_X86_64_64 _start 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+# CHECK2:      Relocations [
+# CHECK2-NEXT:   .crel.text {
+# CHECK2-NEXT:     0x1 R_X86_64_PLT32 foo 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:     0x6 R_X86_64_PLT32 .text.foo 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:     0xD R_X86_64_PLT32 fb 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:   }
+# CHECK2-NEXT:   .rela.text.foo {
+# CHECK2-NEXT:     0x3 R_X86_64_PC32 .L.str 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:     0xA R_X86_64_PC32 .L.str1 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:     0xF R_X86_64_PLT32 g 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:     0x14 R_X86_64_PLT32 g 0xFFFFFFFFFFFFFFFC
+# CHECK2-NEXT:   }
+# CHECK2-NEXT:   .rela.data {
+# CHECK2-NEXT:     0x8 R_X86_64_64 _start 0x8000000000000000
+# CHECK2-NEXT:     0x18 R_X86_64_64 _start 0xFFFFFFFFFFFFFFFF
+# CHECK2-NEXT:   }
+# CHECK2-NEXT:   .crel.rodata {
+# CHECK2-NEXT:     0x0 R_X86_64_PC32 foo 0x0
+# CHECK2-NEXT:     0xF R_X86_64_PC32 foo 0x3F
+# CHECK2-NEXT:     0x1F R_X86_64_PC64 foo 0x7F
+# CHECK2-NEXT:     0x27 R_X86_64_PC32 _start 0xFFFFFFFFFFFFE07F
+# CHECK2-COUNT-12:      R_X86_64_32 _start 0x0
+# CHECK2-NEXT:   }
+# CHECK2-NEXT: ]
+
+#--- a.s
+.global _start, foo
+_start:
+  call foo
+  call .text.foo
+
+.section .text.foo,"ax"
+foo:
+  leaq .L.str(%rip), %rsi
+  leaq .L.str1(%rip), %rsi
+  call g
+  call g
+
+.section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+  .asciz  "abc"
+.L.str1:
+  .asciz  "def"
+
+.data
+.quad 0
+.quad _start - 0x8000000000000000
+.quad 0
+.quad _start - 1
+
+#--- b.s
+.globl fb
+fb:
+  call fb
+
+.section .rodata,"a"
+.long foo - .
+.space 15-4
+.long foo - . + 63  # offset+=15
+.space 16-4
+.quad foo - . + 127  # offset+=16
+.long _start - . - 8065
+
+## Ensure .crel.rodata contains 16 relocations so that getULEB128Size(crelHeader) > 1.
+.rept 12
+.long _start
+.endr
diff --git a/lld/test/ELF/zsectionheader.s b/lld/test/ELF/zsectionheader.s
new file mode 100644
index 0000000000000..c1e654ac1082d
--- /dev/null
+++ b/lld/test/ELF/zsectionheader.s
@@ -0,0 +1,36 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld -shared -z nosectionheader -z sectionheader %t.o -o %t.so 2>&1 | count 0
+# RUN: llvm-readelf -hS %t.so | FileCheck %s --check-prefixes=CHECK,SHDR
+
+# RUN: ld.lld -shared -z nosectionheader %t.o -o %t0.so
+# RUN: llvm-readelf -h --dyn-syms %t0.so | FileCheck %s --check-prefixes=CHECK,NOSHDR
+# RUN: llvm-strings %t0.so | FileCheck %s --check-prefixes=NOSHDR-STR
+
+# CHECK:       Size of this header:               64 (bytes)
+# CHECK-NEXT:  Size of program headers:           56 (bytes)
+# CHECK-NEXT:  Number of program headers:         6
+# CHECK-NEXT:  Size of section headers:           64 (bytes)
+# SHDR-NEXT:   Number of section headers:         13
+# SHDR-NEXT:   Section header string table index: 11
+# NOSHDR-NEXT: Number of section headers:         0
+# NOSHDR-NEXT: Section header string table index: 0
+
+# SHDR:        Section Headers:
+# NOSHDR:      Symbol table for image contains 2 entries:
+# NOSHDR:        _start
+
+## _start occurs as a dynamic string table entry. There is no static string table
+## entry. `nonalloc` is not in the output.
+# NOSHDR-STR:      _start
+# NOSHDR-STR-NOT:  _start
+
+# RUN: not ld.lld -r -z nosectionheader %t.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
+
+# ERR: error: -r and -z nosectionheader may not be used together
+
+.globl _start
+_start:
+
+.section nonalloc,""
+.asciz "_start"
diff --git a/lld/test/MachO/arm64-32-stubs.s b/lld/test/MachO/arm64-32-stubs.s
index 743d5c419bf1a..a5d48ff5baef3 100644
--- a/lld/test/MachO/arm64-32-stubs.s
+++ b/lld/test/MachO/arm64-32-stubs.s
@@ -10,7 +10,7 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-watchos %t/test.s -o %t/test.o
 # RUN: %lld-watchos -dylib -install_name @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: %lld-watchos -dylib -install_name @executable_path/libbar.dylib %t/bar.o -o %t/libbar.dylib
-# RUN: %lld-watchos -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test
+# RUN: %lld-watchos -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test -no_fixup_chains
 
 # RUN: llvm-objdump --no-print-imm-hex --macho -d --no-show-raw-insn --section="__TEXT,__stubs" --section="__TEXT,__stub_helper" %t/test | FileCheck %s
 
diff --git a/lld/test/MachO/arm64-stubs.s b/lld/test/MachO/arm64-stubs.s
index f714e20084895..6fd94661d32e2 100644
--- a/lld/test/MachO/arm64-stubs.s
+++ b/lld/test/MachO/arm64-stubs.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o
 # RUN: %lld -arch arm64 -dylib -install_name @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: %lld -arch arm64 -dylib -install_name @executable_path/libbar.dylib %t/bar.o -o %t/libbar.dylib
-# RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test
+# RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test -no_fixup_chains
 
 # RUN: llvm-objdump --no-print-imm-hex --macho -d --no-show-raw-insn --section="__TEXT,__stubs" --section="__TEXT,__stub_helper" %t/test | FileCheck %s
 
diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s
index 0e4157ece72de..d887359bbc23e 100644
--- a/lld/test/MachO/arm64-thunks.s
+++ b/lld/test/MachO/arm64-thunks.s
@@ -7,12 +7,13 @@
 ## (3) a second thunk is created when the first one goes out of range
 ## (4) early calls to a dylib stub use a thunk, and later calls the stub
 ##     directly
+## (5) Thunks are created for all sections in the text segment with branches.
 ## Notes:
 ## 0x4000000 = 64 Mi = half the magnitude of the forward-branch range
 
 # RUN: rm -rf %t; mkdir %t
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t/input.o
-# RUN: %lld -arch arm64 -dead_strip -lSystem -o %t/thunk %t/input.o
+# RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -o %t/thunk %t/input.o
 # RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/thunk | FileCheck %s
 
 # CHECK: Disassembly of section __TEXT,__text:
@@ -164,6 +165,10 @@
 # CHECK:  adrp x16, 0x[[#%x, F_PAGE]]
 # CHECK:  add  x16, x16, #[[#F_OFFSET]]
 
+# CHECK: Disassembly of section __TEXT,__lcxx_override:
+# CHECK: <_z>:
+# CHECK:  bl 0x[[#%x, A_THUNK_0]] <_a.thunk.0>
+
 # CHECK: Disassembly of section __TEXT,__stubs:
 
 # CHECK: [[#%x, NAN_PAGE + NAN_OFFSET]] <__stubs>:
@@ -300,3 +305,17 @@ _main:
   bl _h
   bl ___nan
   ret
+
+.section __TEXT,__cstring
+  .space 0x4000000
+
+.section __TEXT,__lcxx_override,regular,pure_instructions
+
+.globl _z
+.no_dead_strip _z
+.p2align 2
+_z:
+  bl _a
+  ## Ensure calling into stubs works
+  bl _extern_sym
+  ret
diff --git a/lld/test/MachO/bp-section-orderer-errs.s b/lld/test/MachO/bp-section-orderer-errs.s
new file mode 100644
index 0000000000000..8392b88508481
--- /dev/null
+++ b/lld/test/MachO/bp-section-orderer-errs.s
@@ -0,0 +1,9 @@
+# RUN: not %lld -o /dev/null --irpgo-profile-sort %s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
+# RUN: not %lld -o /dev/null --irpgo-profile-sort=%s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
+# IRPGO-ERR: --irpgo-profile-sort is incompatible with --call-graph-profile-sort
+
+# RUN: not %lld -o /dev/null --compression-sort=function --call-graph-profile-sort %s 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
+# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-profile-sort
+
+# RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
+# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
diff --git a/lld/test/MachO/bp-section-orderer-stress.s b/lld/test/MachO/bp-section-orderer-stress.s
new file mode 100644
index 0000000000000..fdc6a20e2655b
--- /dev/null
+++ b/lld/test/MachO/bp-section-orderer-stress.s
@@ -0,0 +1,105 @@
+# REQUIRES: aarch64
+
+# Generate a large test case and check that the output is deterministic.
+
+# RUN: %python %s %t.s %t.proftext
+
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
+# RUN: llvm-profdata merge %t.proftext -o %t.profdata
+
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
+# RUN: diff %t.order1.txt %t.order2.txt
+
+import random
+import sys
+
+assembly_filepath = sys.argv[1]
+proftext_filepath = sys.argv[2]
+
+random.seed(1234)
+num_functions = 1000
+num_data = 100
+num_traces = 10
+
+function_names = [f"f{n}" for n in range(num_functions)]
+data_names = [f"d{n}" for n in range(num_data)]
+profiled_functions = function_names[: int(num_functions / 2)]
+
+function_contents = [
+    f"""
+{name}:
+  add w0, w0, #{i % 4096}
+  add w1, w1, #{i % 10}
+  add w2, w0, #{i % 20}
+  adrp x3, {name}@PAGE
+  ret
+"""
+    for i, name in enumerate(function_names)
+]
+
+data_contents = [
+      f"""
+{name}:
+  .ascii "s{i % 2}-{i % 3}-{i % 5}"
+  .xword {name}
+"""
+    for i, name in enumerate(data_names)
+]
+
+trace_contents = [
+    f"""
+# Weight
+1
+{", ".join(random.sample(profiled_functions, len(profiled_functions)))}
+"""
+    for i in range(num_traces)
+]
+
+profile_contents = [
+    f"""
+{name}
+# Func Hash:
+{i}
+# Num Counters:
+1
+# Counter Values:
+1
+"""
+    for i, name in enumerate(profiled_functions)
+]
+
+with open(assembly_filepath, "w") as f:
+    f.write(
+        f"""
+.text
+.globl _main
+
+_main:
+  ret
+
+{"".join(function_contents)}
+
+.data
+{"".join(data_contents)}
+
+.subsections_via_symbols
+"""
+    )
+
+with open(proftext_filepath, "w") as f:
+    f.write(
+        f"""
+:ir
+:temporal_prof_traces
+
+# Num Traces
+{num_traces}
+# Trace Stream Size:
+{num_traces}
+
+{"".join(trace_contents)}
+
+{"".join(profile_contents)}
+"""
+    )
diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s
new file mode 100644
index 0000000000000..407787025150d
--- /dev/null
+++ b/lld/test/MachO/bp-section-orderer.s
@@ -0,0 +1,123 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
+# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata
+
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
+
+# STARTUP: Ordered 3 sections using balanced partitioning
+
+# RUN: %lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE
+
+# ORDERFILE: A
+# ORDERFILE: F
+# ORDERFILE: E
+# ORDERFILE: D
+# ORDERFILE-DAG: _main
+# ORDERFILE-DAG: _B
+# ORDERFILE-DAG: l_C
+# ORDERFILE-DAG: s1
+# ORDERFILE-DAG: s2
+# ORDERFILE-DAG: r1
+# ORDERFILE-DAG: r2
+
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
+
+# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
+# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
+# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning
+
+#--- a.s
+.text
+.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+
+_main:
+  ret
+A:
+  ret
+_B:
+  add w0, w0, #1
+  bl  A
+  ret
+l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
+  add w0, w0, #2
+  bl  A
+  ret
+D:
+  add w0, w0, #2
+  bl _B
+  ret
+E:
+  add w0, w0, #2
+  bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+  ret
+F:
+  add w0, w0, #3
+  bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+  ret
+
+.data
+s1:
+  .ascii "hello world"
+s2:
+  .ascii "i am a string"
+r1:
+  .quad s1
+r2:
+  .quad r1
+
+.subsections_via_symbols
+
+#--- a.proftext
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
+
+A
+# Func Hash:
+1111
+# Num Counters:
+1
+# Counter Values:
+1
+
+B
+# Func Hash:
+2222
+# Num Counters:
+1
+# Counter Values:
+1
+
+C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
+# Func Hash:
+3333
+# Num Counters:
+1
+# Counter Values:
+1
+
+D
+# Func Hash:
+4444
+# Num Counters:
+1
+# Counter Values:
+1
+
+#--- a.orderfile
+A
+F
+E
+D
diff --git a/lld/test/MachO/dyld-stub-binder.s b/lld/test/MachO/dyld-stub-binder.s
index 1d2e556607c79..170fe8abd89bd 100644
--- a/lld/test/MachO/dyld-stub-binder.s
+++ b/lld/test/MachO/dyld-stub-binder.s
@@ -14,27 +14,27 @@
 # RUN: %lld -arch arm64 -lSystem -dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: llvm-nm -m %t/libfoo.dylib | FileCheck --check-prefix=STUB %s
 
-
 ## Dylibs that do lazy dynamic calls do need dyld_stub_binder.
 # RUN: not %no-lsystem-lld -arch arm64 -dylib %t/bar.o %t/libfoo.dylib \
-# RUN:     -o %t/libbar.dylib 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
+# RUN:     -o %t/libbar.dylib -no_fixup_chains 2>&1 | \
+# RUN:     FileCheck --check-prefix=MISSINGSTUB %s
 # RUN: %lld -arch arm64 -lSystem -dylib %t/bar.o  %t/libfoo.dylib \
-# RUN:     -o %t/libbar.dylib
+# RUN:     -o %t/libbar.dylib -no_fixup_chains
 # RUN: llvm-nm -m %t/libbar.dylib | FileCheck --check-prefix=STUB %s
 
 ## As do executables.
 # RUN: not %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
+# RUN:     -o %t/test -no_fixup_chains 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
 # RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test
+# RUN:     -o %t/test -no_fixup_chains
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=STUB %s
 
 ## Test dynamic lookup of dyld_stub_binder.
 # RUN: %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -undefined dynamic_lookup
+# RUN:     -o %t/test -undefined dynamic_lookup -no_fixup_chains
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=DYNSTUB %s
 # RUN: %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -U dyld_stub_binder
+# RUN:     -o %t/test -U dyld_stub_binder -no_fixup_chains
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=DYNSTUB %s
 
 # MISSINGSTUB:      error: undefined symbol: dyld_stub_binder
diff --git a/lld/test/MachO/filelist.s b/lld/test/MachO/filelist.s
index 3d4846c4505a9..6a9e54fe874e5 100644
--- a/lld/test/MachO/filelist.s
+++ b/lld/test/MachO/filelist.s
@@ -3,7 +3,7 @@
 ## This test verifies that the paths in -filelist get processed in command-line
 ## order.
 
-# RUN: rm -rf %t; split-file %s %t
+# RUN: rm -rf %t; split-file %s %t && cd %t
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/first.s -o %t/first.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/second.s -o %t/second.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
diff --git a/lld/test/MachO/invalid/chained-fixups-incompatible.s b/lld/test/MachO/invalid/chained-fixups-incompatible.s
index 83411472beb9f..0589204968fe3 100644
--- a/lld/test/MachO/invalid/chained-fixups-incompatible.s
+++ b/lld/test/MachO/invalid/chained-fixups-incompatible.s
@@ -2,9 +2,6 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o
 # RUN: not %lld -lSystem -no_pie -fixup_chains %t.o -o /dev/null 2>&1 | \
 # RUN:   FileCheck %s --check-prefix=NO-PIE
-# RUN: %no-fatal-warnings-lld -fixup_chains %t.o -o /dev/null \
-# RUN:   -lSystem -platform_version macos 10.15 10.15 2>&1 | \
-# RUN:   FileCheck %s --check-prefix=VERSION
 # RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-darwin %s -o %t-arm64_32.o
 # RUN: not %lld-watchos -lSystem -fixup_chains %t-arm64_32.o -o /dev/null 2>&1 | \
 # RUN:   FileCheck %s --check-prefix=ARCH
@@ -12,7 +9,6 @@
 ## Check that we emit diagnostics when -fixup_chains is explicitly specified,
 ## but we don't support creating chained fixups for said configuration.
 # NO-PIE:  error: -fixup_chains is incompatible with -no_pie
-# VERSION: warning: -fixup_chains requires macOS 11.0, which is newer than target minimum of 10.15
 # ARCH:    error: -fixup_chains is only supported on x86_64 and arm64 targets
 
 .globl _main
diff --git a/lld/test/MachO/link-search-at-rpath.s b/lld/test/MachO/link-search-at-rpath.s
index dbc0d4b3cbf63..71d27fc5033bb 100644
--- a/lld/test/MachO/link-search-at-rpath.s
+++ b/lld/test/MachO/link-search-at-rpath.s
@@ -14,6 +14,7 @@
 # RUN:     -rpath @loader_path/../foo \
 # RUN:     -rpath @loader_path/../subdir \
 # RUN:     -rpath @loader_path/../foo \
+# RUN:     --no-warn-duplicate-rpath \
 # RUN:     %t/bar.o -o %t/subdir2/libbar.dylib
 
 # RUN: %lld -lSystem %t/main.o %t/subdir2/libbar.dylib -o %t/test
diff --git a/lld/test/MachO/lto-cpu-string.ll b/lld/test/MachO/lto-cpu-string.ll
index dcd51bef6f446..3736a2e1f0fa8 100644
--- a/lld/test/MachO/lto-cpu-string.ll
+++ b/lld/test/MachO/lto-cpu-string.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 define void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/MachO/objc-category-merging-erase-objc-name-test.s b/lld/test/MachO/objc-category-merging-erase-objc-name-test.s
new file mode 100644
index 0000000000000..aeb2395b3a858
--- /dev/null
+++ b/lld/test/MachO/objc-category-merging-erase-objc-name-test.s
@@ -0,0 +1,306 @@
+; REQUIRES: aarch64
+
+; Here we test that if we defined a protocol MyTestProtocol and also a category MyTestProtocol
+; then when merging the category into the base class (and deleting the category), we don't
+; delete the 'MyTestProtocol' name
+
+; RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o %T/erase-objc-name.o %s
+; RUN: %lld -arch arm64 -dylib -o %T/erase-objc-name.dylib %T/erase-objc-name.o -objc_category_merging
+; RUN: llvm-objdump --objc-meta-data --macho %T/erase-objc-name.dylib | FileCheck %s --check-prefixes=MERGE_CATS
+
+; === Check merge categories enabled ===
+; Check that the original categories are not there
+; MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+; MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+; Check that we get the expected output - most importantly that the protocol is named `MyTestProtocol`
+; MERGE_CATS:        Contents of (__DATA_CONST,__objc_classlist) section
+; MERGE_CATS-NEXT:   _OBJC_CLASS_$_MyBaseClass
+; MERGE_CATS-NEXT:            isa {{.*}} _OBJC_METACLASS_$_MyBaseClass
+; MERGE_CATS-NEXT:     superclass {{.*}}
+; MERGE_CATS-NEXT:          cache {{.*}}
+; MERGE_CATS-NEXT:         vtable {{.*}}
+; MERGE_CATS-NEXT:           data {{.*}} (struct class_ro_t *)
+; MERGE_CATS-NEXT:                     flags {{.*}} RO_ROOT
+; MERGE_CATS-NEXT:             instanceStart 0
+; MERGE_CATS-NEXT:              instanceSize 0
+; MERGE_CATS-NEXT:                  reserved {{.*}}
+; MERGE_CATS-NEXT:                ivarLayout {{.*}}
+; MERGE_CATS-NEXT:                      name {{.*}} MyBaseClass
+; MERGE_CATS-NEXT:               baseMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:             entsize 24
+; MERGE_CATS-NEXT:               count 2
+; MERGE_CATS-NEXT:                name {{.*}} getValue
+; MERGE_CATS-NEXT:               types {{.*}} i16@0:8
+; MERGE_CATS-NEXT:                 imp -[MyBaseClass(MyTestProtocol) getValue]
+; MERGE_CATS-NEXT:                name {{.*}} baseInstanceMethod
+; MERGE_CATS-NEXT:               types {{.*}} v16@0:8
+; MERGE_CATS-NEXT:                 imp -[MyBaseClass baseInstanceMethod]
+; MERGE_CATS-NEXT:             baseProtocols {{.*}}
+; MERGE_CATS-NEXT:                       count 1
+; MERGE_CATS-NEXT:                list[0] {{.*}} (struct protocol_t *)
+; MERGE_CATS-NEXT:                       isa {{.*}}
+; MERGE_CATS-NEXT:                      name {{.*}} MyTestProtocol
+; MERGE_CATS-NEXT:                 protocols {{.*}}
+; MERGE_CATS-NEXT:            instanceMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:                    entsize 24
+; MERGE_CATS-NEXT:                      count 1
+; MERGE_CATS-NEXT:                       name {{.*}} getValue
+; MERGE_CATS-NEXT:                      types {{.*}} i16@0:8
+; MERGE_CATS-NEXT:                        imp {{.*}}
+; MERGE_CATS-NEXT:               classMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:     optionalInstanceMethods {{.*}}
+; MERGE_CATS-NEXT:        optionalClassMethods {{.*}}
+; MERGE_CATS-NEXT:          instanceProperties {{.*}}
+; MERGE_CATS-NEXT:                     ivars {{.*}}
+; MERGE_CATS-NEXT:            weakIvarLayout {{.*}}
+; MERGE_CATS-NEXT:            baseProperties {{.*}}
+; MERGE_CATS-NEXT: Meta Class
+; MERGE_CATS-NEXT:            isa {{.*}} _OBJC_METACLASS_$_MyBaseClass
+; MERGE_CATS-NEXT:     superclass {{.*}} _OBJC_CLASS_$_MyBaseClass
+; MERGE_CATS-NEXT:          cache {{.*}}
+; MERGE_CATS-NEXT:         vtable {{.*}}
+; MERGE_CATS-NEXT:           data {{.*}} (struct class_ro_t *)
+; MERGE_CATS-NEXT:                     flags {{.*}} RO_META RO_ROOT
+; MERGE_CATS-NEXT:             instanceStart 40
+; MERGE_CATS-NEXT:              instanceSize 40
+; MERGE_CATS-NEXT:                  reserved {{.*}}
+; MERGE_CATS-NEXT:                ivarLayout {{.*}}
+; MERGE_CATS-NEXT:                      name {{.*}} MyBaseClass
+; MERGE_CATS-NEXT:               baseMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:             baseProtocols {{.*}}
+; MERGE_CATS-NEXT:                       count 1
+; MERGE_CATS-NEXT:                list[0] {{.*}} (struct protocol_t *)
+; MERGE_CATS-NEXT:                       isa {{.*}}
+; MERGE_CATS-NEXT:                      name {{.*}} MyTestProtocol
+; MERGE_CATS-NEXT:                 protocols {{.*}}
+; MERGE_CATS-NEXT:            instanceMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:                    entsize 24
+; MERGE_CATS-NEXT:                      count 1
+; MERGE_CATS-NEXT:                       name {{.*}} getValue
+; MERGE_CATS-NEXT:                      types {{.*}} i16@0:8
+; MERGE_CATS-NEXT:                        imp {{.*}}
+; MERGE_CATS-NEXT:               classMethods {{.*}} (struct method_list_t *)
+; MERGE_CATS-NEXT:     optionalInstanceMethods {{.*}}
+; MERGE_CATS-NEXT:        optionalClassMethods {{.*}}
+; MERGE_CATS-NEXT:          instanceProperties {{.*}}
+; MERGE_CATS-NEXT:                     ivars {{.*}}
+; MERGE_CATS-NEXT:            weakIvarLayout {{.*}}
+; MERGE_CATS-NEXT:            baseProperties {{.*}}
+; MERGE_CATS-NEXT: Contents of (__DATA_CONST,__objc_protolist) section
+; MERGE_CATS-NEXT: {{.*}} {{.*}} __OBJC_PROTOCOL_$_MyTestProtocol
+; MERGE_CATS-NEXT: Contents of (__DATA_CONST,__objc_imageinfo) section
+; MERGE_CATS-NEXT:   version 0
+; MERGE_CATS-NEXT:     flags {{.*}} OBJC_IMAGE_HAS_CATEGORY_CLASS_PROPERTIES
+
+
+; ================== repro.sh ====================
+; # Write the Objective-C code to a file
+; cat << EOF > MyClass.m
+; @protocol MyTestProtocol
+; - (int)getValue;
+; @end
+;
+; __attribute__((objc_root_class))
+; @interface MyBaseClass
+; - (void)baseInstanceMethod;
+; @end
+;
+; @implementation MyBaseClass
+; - (void)baseInstanceMethod {}
+; @end
+;
+; @interface MyBaseClass (MyTestProtocol) <MyTestProtocol>
+; @end
+;
+; @implementation MyBaseClass (MyTestProtocol)
+;
+; - (int)getValue {
+;     return 0x30;
+; }
+;
+; @end
+; EOF
+;
+; # Compile the Objective-C file to assembly
+; xcrun clang -S -arch arm64 MyClass.m -o MyClass.s
+; ==============================================
+
+
+       .section      __TEXT,__text,regular,pure_instructions
+       .p2align      2                               ; -- Begin function -[MyBaseClass baseInstanceMethod]
+"-[MyBaseClass baseInstanceMethod]":    ; @"\01-[MyBaseClass baseInstanceMethod]"
+       .cfi_startproc
+; %bb.0:
+       sub    sp, sp, #16
+       .cfi_def_cfa_offset 16
+       str    x0, [sp, #8]
+       str    x1, [sp]
+       add    sp, sp, #16
+       ret
+       .cfi_endproc
+                                        ; -- End function
+       .p2align      2                               ; -- Begin function -[MyBaseClass(MyTestProtocol) getValue]
+"-[MyBaseClass(MyTestProtocol) getValue]": ; @"\01-[MyBaseClass(MyTestProtocol) getValue]"
+       .cfi_startproc
+; %bb.0:
+       sub    sp, sp, #16
+       .cfi_def_cfa_offset 16
+       str    x0, [sp, #8]
+       str    x1, [sp]
+       mov    w0, #48                         ; =0x30
+       add    sp, sp, #16
+       ret
+       .cfi_endproc
+                                        ; -- End function
+       .section      __DATA,__objc_data
+       .globl _OBJC_CLASS_$_MyBaseClass       ; @"OBJC_CLASS_$_MyBaseClass"
+       .p2align      3, 0x0
+_OBJC_CLASS_$_MyBaseClass:
+       .quad  _OBJC_METACLASS_$_MyBaseClass
+       .quad  0
+       .quad  __objc_empty_cache
+       .quad  0
+       .quad  __OBJC_CLASS_RO_$_MyBaseClass
+       .globl _OBJC_METACLASS_$_MyBaseClass   ; @"OBJC_METACLASS_$_MyBaseClass"
+       .p2align      3, 0x0
+_OBJC_METACLASS_$_MyBaseClass:
+       .quad  _OBJC_METACLASS_$_MyBaseClass
+       .quad  _OBJC_CLASS_$_MyBaseClass
+       .quad  __objc_empty_cache
+       .quad  0
+       .quad  __OBJC_METACLASS_RO_$_MyBaseClass
+       .section      __TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+       .asciz "MyBaseClass"
+       .section      __DATA,__objc_const
+       .p2align      3, 0x0                          ; @"_OBJC_METACLASS_RO_$_MyBaseClass"
+__OBJC_METACLASS_RO_$_MyBaseClass:
+       .long  131                             ; 0x83
+       .long  40                              ; 0x28
+       .long  40                              ; 0x28
+       .space 4
+       .quad  0
+       .quad  l_OBJC_CLASS_NAME_
+       .quad  0
+       .quad  0
+       .quad  0
+       .quad  0
+       .quad  0
+       .section      __TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+       .asciz "baseInstanceMethod"
+       .section      __TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+       .asciz "v16@0:8"
+       .section      __DATA,__objc_const
+       .p2align      3, 0x0                          ; @"_OBJC_$_INSTANCE_METHODS_MyBaseClass"
+__OBJC_$_INSTANCE_METHODS_MyBaseClass:
+       .long  24                              ; 0x18
+       .long  1                               ; 0x1
+       .quad  l_OBJC_METH_VAR_NAME_
+       .quad  l_OBJC_METH_VAR_TYPE_
+       .quad  "-[MyBaseClass baseInstanceMethod]"
+       .p2align      3, 0x0                          ; @"_OBJC_CLASS_RO_$_MyBaseClass"
+__OBJC_CLASS_RO_$_MyBaseClass:
+       .long  130                             ; 0x82
+       .long  0                               ; 0x0
+       .long  0                               ; 0x0
+       .space 4
+       .quad  0
+       .quad  l_OBJC_CLASS_NAME_
+       .quad  __OBJC_$_INSTANCE_METHODS_MyBaseClass
+       .quad  0
+       .quad  0
+       .quad  0
+       .quad  0
+       .section      __TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
+       .asciz "MyTestProtocol"
+       .section      __TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+       .asciz "getValue"
+       .section      __TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.3:                ; @OBJC_METH_VAR_TYPE_.3
+       .asciz "i16@0:8"
+       .section      __DATA,__objc_const
+       .p2align      3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_MyTestProtocol"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_MyTestProtocol:
+       .long  24                              ; 0x18
+       .long  1                               ; 0x1
+       .quad  l_OBJC_METH_VAR_NAME_.2
+       .quad  l_OBJC_METH_VAR_TYPE_.3
+       .quad  "-[MyBaseClass(MyTestProtocol) getValue]"
+       .p2align      3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyTestProtocol"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyTestProtocol:
+       .long  24                              ; 0x18
+       .long  1                               ; 0x1
+       .quad  l_OBJC_METH_VAR_NAME_.2
+       .quad  l_OBJC_METH_VAR_TYPE_.3
+       .quad  0
+       .p2align      3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyTestProtocol"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyTestProtocol:
+       .quad  l_OBJC_METH_VAR_TYPE_.3
+       .private_extern      __OBJC_PROTOCOL_$_MyTestProtocol ; @"_OBJC_PROTOCOL_$_MyTestProtocol"
+       .section      __DATA,__data
+       .globl __OBJC_PROTOCOL_$_MyTestProtocol
+       .weak_definition     __OBJC_PROTOCOL_$_MyTestProtocol
+       .p2align      3, 0x0
+__OBJC_PROTOCOL_$_MyTestProtocol:
+       .quad  0
+       .quad  l_OBJC_CLASS_NAME_.1
+       .quad  0
+       .quad  __OBJC_$_PROTOCOL_INSTANCE_METHODS_MyTestProtocol
+       .quad  0
+       .quad  0
+       .quad  0
+       .quad  0
+       .long  96                              ; 0x60
+       .long  0                               ; 0x0
+       .quad  __OBJC_$_PROTOCOL_METHOD_TYPES_MyTestProtocol
+       .quad  0
+       .quad  0
+       .private_extern      __OBJC_LABEL_PROTOCOL_$_MyTestProtocol ; @"_OBJC_LABEL_PROTOCOL_$_MyTestProtocol"
+       .section      __DATA,__objc_protolist,coalesced,no_dead_strip
+       .globl __OBJC_LABEL_PROTOCOL_$_MyTestProtocol
+       .weak_definition     __OBJC_LABEL_PROTOCOL_$_MyTestProtocol
+       .p2align      3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyTestProtocol:
+       .quad  __OBJC_PROTOCOL_$_MyTestProtocol
+       .section      __DATA,__objc_const
+       .p2align      3, 0x0                          ; @"_OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_MyTestProtocol"
+__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_MyTestProtocol:
+       .quad  1                               ; 0x1
+       .quad  __OBJC_PROTOCOL_$_MyTestProtocol
+       .quad  0
+       .p2align      3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_MyTestProtocol"
+__OBJC_$_CATEGORY_MyBaseClass_$_MyTestProtocol:
+       .quad  l_OBJC_CLASS_NAME_.1
+       .quad  _OBJC_CLASS_$_MyBaseClass
+       .quad  __OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_MyTestProtocol
+       .quad  0
+       .quad  __OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_MyTestProtocol
+       .quad  0
+       .quad  0
+       .long  64                              ; 0x40
+       .space 4
+       .section      __DATA,__objc_classlist,regular,no_dead_strip
+       .p2align      3, 0x0                          ; @"OBJC_LABEL_CLASS_$"
+l_OBJC_LABEL_CLASS_$:
+       .quad  _OBJC_CLASS_$_MyBaseClass
+       .section      __DATA,__objc_catlist,regular,no_dead_strip
+       .p2align      3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
+l_OBJC_LABEL_CATEGORY_$:
+       .quad  __OBJC_$_CATEGORY_MyBaseClass_$_MyTestProtocol
+       .no_dead_strip       __OBJC_PROTOCOL_$_MyTestProtocol
+       .no_dead_strip       __OBJC_LABEL_PROTOCOL_$_MyTestProtocol
+       .section      __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+       .long  0
+       .long  64
+
+__objc_empty_cache:
+_$sBOWV:
+  .quad 0
+
+.subsections_via_symbols
diff --git a/lld/test/MachO/objc-selrefs.s b/lld/test/MachO/objc-selrefs.s
index 6d144f4938b45..eebe7c6476cd5 100644
--- a/lld/test/MachO/objc-selrefs.s
+++ b/lld/test/MachO/objc-selrefs.s
@@ -6,14 +6,14 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/implicit-selrefs.s -o %t/implicit-selrefs.o
 
 # RUN: %lld -dylib -arch arm64 -lSystem -o %t/explicit-only-no-icf \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o -no_fixup_chains
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-only-no-icf | \
 # RUN:   FileCheck %s --check-prefix=EXPLICIT-NO-ICF
 
 ## NOTE: ld64 always dedups the selrefs unconditionally, but we only do it when
 ## ICF is enabled.
 # RUN: %lld -dylib -arch arm64 -lSystem -o %t/explicit-only-with-icf \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o -no_fixup_chains
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-only-with-icf \
 # RUN:   | FileCheck %s --check-prefix=EXPLICIT-WITH-ICF
 
@@ -25,7 +25,8 @@
 # SELREFS-EMPTY:
 
 # RUN: %lld -dylib -arch arm64 -lSystem --icf=all -o %t/explicit-and-implicit \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o %t/implicit-selrefs.o
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o %t/implicit-selrefs.o \
+# RUN:   -no_fixup_chains
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-and-implicit \
 # RUN:   | FileCheck %s --check-prefix=EXPLICIT-AND-IMPLICIT
 
diff --git a/lld/test/MachO/reproduce-thin-archive-objc.s b/lld/test/MachO/reproduce-thin-archive-objc.s
new file mode 100644
index 0000000000000..8159f03f0f740
--- /dev/null
+++ b/lld/test/MachO/reproduce-thin-archive-objc.s
@@ -0,0 +1,24 @@
+# REQUIRES: x86
+
+## For a long time, LLD only included those members from thin archives that were actually used
+## during linking. However, we need to iterate over all members for -ObjC, check that we don't
+## crash when we encounter a missing member.
+
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: sed s/SYM/_main/   %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o main.o
+# RUN: sed s/SYM/_unused/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o unused.o
+
+# RUN: llvm-ar rcsT unused.a unused.o; rm unused.o
+## FIXME: Absolute paths don't end up relativized in the repro file.
+
+# RUN: %no-fatal-warnings-lld %t/main.o %t/unused.a -ObjC -o /dev/null 2>&1 \
+# RUN:                      | FileCheck %s --check-prefix=WARN
+
+# RUN: %lld main.o unused.a -ObjC --no-warn-thin-archive-missing-members 2>&1 | count 0
+
+# WARN: warning: {{.*}}unused.a: -ObjC failed to open archive member: 'unused.o'
+
+.text
+.globl SYM
+SYM:
+    ret
diff --git a/lld/test/MachO/reproduce-thin-archives.s b/lld/test/MachO/reproduce-thin-archives.s
index 9dee3f400e06a..33eeaede7aa41 100644
--- a/lld/test/MachO/reproduce-thin-archives.s
+++ b/lld/test/MachO/reproduce-thin-archives.s
@@ -1,10 +1,11 @@
 # REQUIRES: x86
 
-# RUN: rm -rf %t.dir
-# RUN: mkdir -p %t.dir
-# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %s -o %t.dir/foo.o
+# RUN: rm -rf %t.dir; split-file %s %t.dir
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %t.dir/foo.s -o %t.dir/foo.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %t.dir/unused.s -o %t.dir/unused.o
 # RUN: cd %t.dir
-# RUN: llvm-ar rcsT foo.a foo.o
+# RUN: llvm-ar rcsT foo.a foo.o unused.o
 
 # RUN: %lld foo.a -o /dev/null --reproduce repro.tar
 # RUN: tar tf repro.tar | FileCheck -DPATH='repro/%:t.dir' %s
@@ -12,9 +13,19 @@
 # RUN: %lld -all_load foo.a -o /dev/null --reproduce repro2.tar
 # RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' %s
 
+# RUN: %lld -ObjC foo.a -o /dev/null --reproduce repro3.tar
+# RUN: tar tf repro3.tar | FileCheck -DPATH='repro3/%:t.dir' %s
+
 # CHECK-DAG: [[PATH]]/foo.a
 # CHECK-DAG: [[PATH]]/foo.o
+# CHECK-DAG: [[PATH]]/unused.o
 
+#--- foo.s
 .globl _main
 _main:
   nop
+
+#--- unused.s
+.globl _unused
+_unused:
+  nop
diff --git a/lld/test/MachO/rpath.s b/lld/test/MachO/rpath.s
index 5b404a36b26b0..09ae108b34a21 100644
--- a/lld/test/MachO/rpath.s
+++ b/lld/test/MachO/rpath.s
@@ -12,6 +12,21 @@
 # CHECK-NEXT: cmdsize 32
 # CHECK-NEXT: path /another/rpath
 
+## Check that -rpath entries are deduplicated.
+# RUN: not %lld %t.o -o /dev/null -rpath /some/rpath -rpath /other/rpath -rpath /some/rpath 2>&1 | \
+# RUN:     FileCheck --check-prefix=FATAL %s
+# FATAL: error: duplicate -rpath '/some/rpath' ignored [--warn-duplicate-rpath]
+
+# RUN: %lld -o %t-dup %t.o -rpath /some/rpath -rpath /other/rpath -rpath /some/rpath --no-warn-duplicate-rpath
+# RUN: llvm-objdump --macho --all-headers %t-dup | FileCheck %s --check-prefix=DEDUP
+# DEDUP:      LC_RPATH
+# DEDUP-NEXT: cmdsize 24
+# DEDUP-NEXT: path /some/rpath
+# DEDUP:      LC_RPATH
+# DEDUP-NEXT: cmdsize 32
+# DEDUP-NEXT: path /other/rpath
+# DEDUP-NOT:  LC_RPATH
+
 .text
 .global _main
 _main:
diff --git a/lld/test/MachO/section-order.s b/lld/test/MachO/section-order.s
index f575eebf96dba..7a0b6f799630e 100644
--- a/lld/test/MachO/section-order.s
+++ b/lld/test/MachO/section-order.s
@@ -23,18 +23,20 @@
 # CHECK-12-NEXT: __cstring
 
 # CHECK-21:      __text
+## `foo` always sorts next to `__text` since it's a code section
+## and needs to be adjacent for arm64 thunk calculations
+# CHECK-21-NEXT: foo
 # CHECK-21-NEXT: __cstring
 # CHECK-21-NEXT: bar
-# CHECK-21-NEXT: foo
 
 # CHECK-SYNTHETIC-ORDER:      __text
+# CHECK-SYNTHETIC-ORDER-NEXT: foo
 # CHECK-SYNTHETIC-ORDER-NEXT: __stubs
 # CHECK-SYNTHETIC-ORDER-NEXT: __stub_helper
 # CHECK-SYNTHETIC-ORDER-NEXT: __objc_stubs
 # CHECK-SYNTHETIC-ORDER-NEXT: __init_offsets
 # CHECK-SYNTHETIC-ORDER-NEXT: __cstring
 # CHECK-SYNTHETIC-ORDER-NEXT: bar
-# CHECK-SYNTHETIC-ORDER-NEXT: foo
 # CHECK-SYNTHETIC-ORDER-NEXT: __unwind_info
 # CHECK-SYNTHETIC-ORDER-NEXT: __eh_frame
 # CHECK-SYNTHETIC-ORDER-NEXT: __objc_selrefs
@@ -52,5 +54,5 @@
   .asciz ""
 .section __TEXT,bar
   .space 1
-.section __TEXT,foo
+.section __TEXT,foo,regular,pure_instructions
   .space 1
diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig
index c91504604b6ac..c0dde905f986b 100644
--- a/lldb/bindings/headers.swig
+++ b/lldb/bindings/headers.swig
@@ -21,6 +21,7 @@
 #include "lldb/API/SBCommandReturnObject.h"
 #include "lldb/API/SBCommunication.h"
 #include "lldb/API/SBCompileUnit.h"
+#include "lldb/API/SBSaveCoreOptions.h"
 #include "lldb/API/SBData.h"
 #include "lldb/API/SBDebugger.h"
 #include "lldb/API/SBDeclaration.h"
diff --git a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig
index 0953f4c72a910..8a6fed95f0b72 100644
--- a/lldb/bindings/interfaces.swig
+++ b/lldb/bindings/interfaces.swig
@@ -25,6 +25,7 @@
 %include "./interface/SBCommandReturnObjectDocstrings.i"
 %include "./interface/SBCommunicationDocstrings.i"
 %include "./interface/SBCompileUnitDocstrings.i"
+%include "./interface/SBSaveCoreOptionsDocstrings.i"
 %include "./interface/SBDataDocstrings.i"
 %include "./interface/SBDebuggerDocstrings.i"
 %include "./interface/SBDeclarationDocstrings.i"
@@ -101,6 +102,7 @@
 %include "lldb/API/SBCommandReturnObject.h"
 %include "lldb/API/SBCommunication.h"
 %include "lldb/API/SBCompileUnit.h"
+%include "lldb/API/SBSaveCoreOptions.h"
 %include "lldb/API/SBData.h"
 %include "lldb/API/SBDebugger.h"
 %include "lldb/API/SBDeclaration.h"
diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 538029037dd46..0a81ec5092185 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -258,6 +258,7 @@ function(add_lldb_tool name)
   endif()
 
   add_lldb_executable(${name} GENERATE_INSTALL ${ARG_UNPARSED_ARGUMENTS})
+  set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
 endfunction()
 
 # The test suite relies on finding LLDB.framework binary resources in the
diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md
index 7076a75032dae..5cac3736337a8 100644
--- a/lldb/docs/resources/lldbgdbremote.md
+++ b/lldb/docs/resources/lldbgdbremote.md
@@ -1403,6 +1403,12 @@ For instance, with a macOS process which has nothing mapped in the first
 The lack of `permissions:` indicates that none of read/write/execute are valid
 for this region.
 
+The stub must include `permissions:` key-value on all memory ranges
+that are valid to access in the inferior process -- the lack of
+`permissions:` means that this is an inaccessible (no page table
+entries exist, in a system using VM) memory range.  If a stub cannot
+determine actual permissions, return `rwx`.
+
 **Priority To Implement:** Medium
 
 This is nice to have, but it isn't necessary. It helps LLDB
@@ -2434,4 +2440,4 @@ The `0x` prefixes are optional - like most of the gdb-remote packets,
 omitting them will work fine; these numbers are always base 16.
 
 The length of the payload is not provided.  A reliable, 8-bit clean,
-transport layer is assumed.
\ No newline at end of file
+transport layer is assumed.
diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h
index d8cc9f5067fe9..40368e036e0e4 100644
--- a/lldb/include/lldb/API/LLDB.h
+++ b/lldb/include/lldb/API/LLDB.h
@@ -57,6 +57,7 @@
 #include "lldb/API/SBQueue.h"
 #include "lldb/API/SBQueueItem.h"
 #include "lldb/API/SBReproducer.h"
+#include "lldb/API/SBSaveCoreOptions.h"
 #include "lldb/API/SBSection.h"
 #include "lldb/API/SBSourceManager.h"
 #include "lldb/API/SBStatisticsOptions.h"
diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h
index 87c0a1c3661ca..f42e2be558d2e 100644
--- a/lldb/include/lldb/API/SBDefines.h
+++ b/lldb/include/lldb/API/SBDefines.h
@@ -61,6 +61,7 @@ class LLDB_API SBCommandPluginInterface;
 class LLDB_API SBCommandReturnObject;
 class LLDB_API SBCommunication;
 class LLDB_API SBCompileUnit;
+class LLDB_API SBSaveCoreOptions;
 class LLDB_API SBData;
 class LLDB_API SBDebugger;
 class LLDB_API SBDeclaration;
diff --git a/lldb/include/lldb/API/SBError.h b/lldb/include/lldb/API/SBError.h
index 1a720a479d9a6..17f2c6c3027af 100644
--- a/lldb/include/lldb/API/SBError.h
+++ b/lldb/include/lldb/API/SBError.h
@@ -77,6 +77,7 @@ class LLDB_API SBError {
   friend class SBBreakpointName;
   friend class SBCommandReturnObject;
   friend class SBCommunication;
+  friend class SBSaveCoreOptions;
   friend class SBData;
   friend class SBDebugger;
   friend class SBFile;
diff --git a/lldb/include/lldb/API/SBFileSpec.h b/lldb/include/lldb/API/SBFileSpec.h
index beefa19ad7f36..36641843aabeb 100644
--- a/lldb/include/lldb/API/SBFileSpec.h
+++ b/lldb/include/lldb/API/SBFileSpec.h
@@ -78,6 +78,7 @@ class LLDB_API SBFileSpec {
   friend class SBTarget;
   friend class SBThread;
   friend class SBTrace;
+  friend class SBSaveCoreOptions;
 
   SBFileSpec(const lldb_private::FileSpec &fspec);
 
diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h
index a6ab7ae759918..778be79583990 100644
--- a/lldb/include/lldb/API/SBProcess.h
+++ b/lldb/include/lldb/API/SBProcess.h
@@ -378,6 +378,12 @@ class LLDB_API SBProcess {
   /// \param[in] file_name - The name of the file to save the core file to.
   lldb::SBError SaveCore(const char *file_name);
 
+  /// Save the state of the process with the desired settings
+  /// as defined in the options object.
+  ///
+  /// \param[in] options - The options to use when saving the core file.
+  lldb::SBError SaveCore(SBSaveCoreOptions &options);
+
   /// Query the address load_addr and store the details of the memory
   /// region that contains it in the supplied SBMemoryRegionInfo object.
   /// To iterate over all memory regions use GetMemoryRegionList.
diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h
new file mode 100644
index 0000000000000..e77496bd3a4a0
--- /dev/null
+++ b/lldb/include/lldb/API/SBSaveCoreOptions.h
@@ -0,0 +1,68 @@
+//===-- SBSaveCoreOptions.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_API_SBSAVECOREOPTIONS_H
+#define LLDB_API_SBSAVECOREOPTIONS_H
+
+#include "lldb/API/SBDefines.h"
+
+namespace lldb {
+
+class LLDB_API SBSaveCoreOptions {
+public:
+  SBSaveCoreOptions();
+  SBSaveCoreOptions(const lldb::SBSaveCoreOptions &rhs);
+  ~SBSaveCoreOptions() = default;
+
+  const SBSaveCoreOptions &operator=(const lldb::SBSaveCoreOptions &rhs);
+
+  /// Set the plugin name. Supplying null or empty string will reset
+  /// the option.
+  ///
+  /// \param plugin Name of the object file plugin.
+  SBError SetPluginName(const char *plugin);
+
+  /// Get the Core dump plugin name, if set.
+  ///
+  /// \return The name of the plugin, or null if not set.
+  const char *GetPluginName() const;
+
+  /// Set the Core dump style.
+  ///
+  /// \param style The style of the core dump.
+  void SetStyle(lldb::SaveCoreStyle style);
+
+  /// Get the Core dump style, if set.
+  ///
+  /// \return The core dump style, or undefined if not set.
+  lldb::SaveCoreStyle GetStyle() const;
+
+  /// Set the output file path
+  ///
+  /// \param output_file a
+  /// \class SBFileSpec object that describes the output file.
+  void SetOutputFile(SBFileSpec output_file);
+
+  /// Get the output file spec
+  ///
+  /// \return The output file spec.
+  SBFileSpec GetOutputFile() const;
+
+  /// Reset all options.
+  void Clear();
+
+protected:
+  friend class SBProcess;
+  lldb_private::SaveCoreOptions &ref() const;
+
+private:
+  std::unique_ptr<lldb_private::SaveCoreOptions> m_opaque_up;
+}; // SBSaveCoreOptions
+} // namespace lldb
+
+#endif // LLDB_API_SBSAVECOREOPTIONS_H
diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h
index 0188057247a68..5589c1c9a350d 100644
--- a/lldb/include/lldb/Core/Module.h
+++ b/lldb/include/lldb/Core/Module.h
@@ -1021,9 +1021,9 @@ class Module : public std::enable_shared_from_this<Module>,
   lldb::ObjectFileSP m_objfile_sp; ///< A shared pointer to the object file
                                    /// parser for this module as it may or may
                                    /// not be shared with the SymbolFile
-  std::optional<UnwindTable> m_unwind_table; ///< Table of FuncUnwinders
-                                             /// objects created for this
-                                             /// Module's functions
+  UnwindTable m_unwind_table;      ///< Table of FuncUnwinders
+                                   /// objects created for this
+                                   /// Module's functions
   lldb::SymbolVendorUP
       m_symfile_up; ///< A pointer to the symbol vendor for this module.
   std::vector<lldb::SymbolVendorUP>
diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index f2296e2920238..a23f834f471fb 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -10,6 +10,7 @@
 #define LLDB_CORE_PLUGINMANAGER_H
 
 #include "lldb/Core/Architecture.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Symbol/TypeSystem.h"
 #include "lldb/Utility/CompletionRequest.h"
 #include "lldb/Utility/FileSpec.h"
@@ -178,6 +179,8 @@ class PluginManager {
 
   static bool UnregisterPlugin(ObjectFileCreateInstance create_callback);
 
+  static bool IsRegisteredObjectFilePluginName(llvm::StringRef name);
+
   static ObjectFileCreateInstance
   GetObjectFileCreateCallbackAtIndex(uint32_t idx);
 
@@ -191,9 +194,7 @@ class PluginManager {
   GetObjectFileCreateMemoryCallbackForPluginName(llvm::StringRef name);
 
   static Status SaveCore(const lldb::ProcessSP &process_sp,
-                         const FileSpec &outfile,
-                         lldb::SaveCoreStyle &core_style,
-                         llvm::StringRef plugin_name);
+                         const lldb_private::SaveCoreOptions &core_options);
 
   // ObjectContainer
   static bool RegisterPlugin(
@@ -487,6 +488,25 @@ class PluginManager {
 
   static LanguageSet GetAllTypeSystemSupportedLanguagesForExpressions();
 
+  // Scripted Interface
+  static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
+                             ScriptedInterfaceCreateInstance create_callback,
+                             lldb::ScriptLanguage language,
+                             ScriptedInterfaceUsages usages);
+
+  static bool UnregisterPlugin(ScriptedInterfaceCreateInstance create_callback);
+
+  static uint32_t GetNumScriptedInterfaces();
+
+  static llvm::StringRef GetScriptedInterfaceNameAtIndex(uint32_t idx);
+
+  static llvm::StringRef GetScriptedInterfaceDescriptionAtIndex(uint32_t idx);
+
+  static lldb::ScriptLanguage GetScriptedInterfaceLanguageAtIndex(uint32_t idx);
+
+  static ScriptedInterfaceUsages
+  GetScriptedInterfaceUsagesAtIndex(uint32_t idx);
+
   // REPL
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              REPLCreateInstance create_callback,
diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h
index d48dbcdd5a5da..20c4769af9033 100644
--- a/lldb/include/lldb/Interpreter/CommandObject.h
+++ b/lldb/include/lldb/Interpreter/CommandObject.h
@@ -369,13 +369,14 @@ class CommandObject : public std::enable_shared_from_this<CommandObject> {
            "currently stopped.";
   }
 
-  // This is for use in the command interpreter, when you either want the
-  // selected target, or if no target is present you want to prime the dummy
-  // target with entities that will be copied over to new targets.
-  Target &GetSelectedOrDummyTarget(bool prefer_dummy = false);
-  Target &GetSelectedTarget();
   Target &GetDummyTarget();
 
+  // This is for use in the command interpreter, and returns the most relevant
+  // target. In order of priority, that's the target from the command object's
+  // execution context, the target from the interpreter's execution context, the
+  // selected target or the dummy target.
+  Target &GetTarget();
+
   // If a command needs to use the "current" thread, use this call. Command
   // objects will have an ExecutionContext to use, and that may or may not have
   // a thread in it.  If it does, you should use that by default, if not, then
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
index 69504dbcda5dc..3ce47d0584a8a 100644
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
@@ -9,6 +9,8 @@
 #ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 #define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 
+#include "ScriptedInterfaceUsages.h"
+
 #include "lldb/Core/StructuredDataImpl.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
@@ -68,6 +70,11 @@ class ScriptedInterface {
     return true;
   }
 
+  static bool CreateInstance(lldb::ScriptLanguage language,
+                             ScriptedInterfaceUsages usages) {
+    return false;
+  }
+
 protected:
   StructuredData::GenericSP m_object_instance_sp;
 };
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
new file mode 100644
index 0000000000000..36c0cfdca546e
--- /dev/null
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
@@ -0,0 +1,43 @@
+//===-- ScriptedInterfaceUsages.h ---------------------------- -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+#define LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+
+#include "lldb/lldb-types.h"
+
+#include "lldb/Utility/Stream.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace lldb_private {
+class ScriptedInterfaceUsages {
+public:
+  ScriptedInterfaceUsages() = default;
+  ScriptedInterfaceUsages(const std::vector<llvm::StringRef> ci_usages,
+                          const std::vector<llvm::StringRef> sbapi_usages)
+      : m_command_interpreter_usages(ci_usages), m_sbapi_usages(sbapi_usages) {}
+
+  const std::vector<llvm::StringRef> &GetCommandInterpreterUsages() const {
+    return m_command_interpreter_usages;
+  }
+
+  const std::vector<llvm::StringRef> &GetSBAPIUsages() const {
+    return m_sbapi_usages;
+  }
+
+  enum class UsageKind { CommandInterpreter, API };
+
+  void Dump(Stream &s, UsageKind kind) const;
+
+private:
+  std::vector<llvm::StringRef> m_command_interpreter_usages;
+  std::vector<llvm::StringRef> m_sbapi_usages;
+};
+} // namespace lldb_private
+
+#endif // LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h
new file mode 100644
index 0000000000000..583bc1f483d04
--- /dev/null
+++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h
@@ -0,0 +1,44 @@
+//===-- SaveCoreOptions.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H
+#define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H
+
+#include "lldb/Utility/FileSpec.h"
+#include "lldb/lldb-forward.h"
+#include "lldb/lldb-types.h"
+
+#include <optional>
+#include <string>
+
+namespace lldb_private {
+
+class SaveCoreOptions {
+public:
+  SaveCoreOptions(){};
+  ~SaveCoreOptions() = default;
+
+  lldb_private::Status SetPluginName(const char *name);
+  std::optional<std::string> GetPluginName() const;
+
+  void SetStyle(lldb::SaveCoreStyle style);
+  lldb::SaveCoreStyle GetStyle() const;
+
+  void SetOutputFile(lldb_private::FileSpec file);
+  const std::optional<lldb_private::FileSpec> GetOutputFile() const;
+
+  void Clear();
+
+private:
+  std::optional<std::string> m_plugin_name;
+  std::optional<lldb_private::FileSpec> m_file;
+  std::optional<lldb::SaveCoreStyle> m_style;
+};
+} // namespace lldb_private
+
+#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H
diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h
index d20766788192f..8419495da73a2 100644
--- a/lldb/include/lldb/Symbol/SymbolFile.h
+++ b/lldb/include/lldb/Symbol/SymbolFile.h
@@ -211,7 +211,15 @@ class SymbolFile : public PluginInterface {
   /// The characteristics of an array type.
   struct ArrayInfo {
     int64_t first_index = 0;
-    llvm::SmallVector<uint64_t, 1> element_orders;
+
+    ///< Each entry belongs to a distinct DW_TAG_subrange_type.
+    ///< For multi-dimensional DW_TAG_array_types we would have
+    ///< an entry for each dimension. An entry represents the
+    ///< optional element count of the subrange.
+    ///
+    ///< The order of entries follows the order of the DW_TAG_subrange_type
+    ///< children of this DW_TAG_array_type.
+    llvm::SmallVector<std::optional<uint64_t>, 1> element_orders;
     uint32_t byte_stride = 0;
     uint32_t bit_stride = 0;
   };
diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h
index ebb0ec421da72..a9e8406608ff3 100644
--- a/lldb/include/lldb/Symbol/UnwindPlan.h
+++ b/lldb/include/lldb/Symbol/UnwindPlan.h
@@ -68,7 +68,8 @@ class UnwindPlan {
         isAFAPlusOffset,   // reg = AFA + offset
         inOtherRegister,   // reg = other reg
         atDWARFExpression, // reg = deref(eval(dwarf_expr))
-        isDWARFExpression  // reg = eval(dwarf_expr)
+        isDWARFExpression, // reg = eval(dwarf_expr)
+        isConstant         // reg = constant
       };
 
       RegisterLocation() : m_location() {}
@@ -105,6 +106,15 @@ class UnwindPlan {
 
       bool IsDWARFExpression() const { return m_type == isDWARFExpression; }
 
+      bool IsConstant() const { return m_type == isConstant; }
+
+      void SetIsConstant(uint64_t value) {
+        m_type = isConstant;
+        m_location.constant_value = value;
+      }
+
+      uint64_t GetConstant() const { return m_location.constant_value; }
+
       void SetAtCFAPlusOffset(int32_t offset) {
         m_type = atCFAPlusOffset;
         m_location.offset = offset;
@@ -192,6 +202,8 @@ class UnwindPlan {
           const uint8_t *opcodes;
           uint16_t length;
         } expr;
+        // For m_type == isConstant
+        uint64_t constant_value;
       } m_location;
     };
 
@@ -358,6 +370,9 @@ class UnwindPlan {
 
     bool SetRegisterLocationToSame(uint32_t reg_num, bool must_replace);
 
+    bool SetRegisterLocationToIsConstant(uint32_t reg_num, uint64_t constant,
+                                         bool can_replace);
+
     // When this UnspecifiedRegistersAreUndefined mode is
     // set, any register that is not specified by this Row will
     // be described as Undefined.
diff --git a/lldb/include/lldb/Symbol/UnwindTable.h b/lldb/include/lldb/Symbol/UnwindTable.h
index 26826e5d1b497..29b7fa61b4849 100644
--- a/lldb/include/lldb/Symbol/UnwindTable.h
+++ b/lldb/include/lldb/Symbol/UnwindTable.h
@@ -57,9 +57,9 @@ class UnwindTable {
 
   ArchSpec GetArchitecture();
 
-  /// Called after a SymbolFile has been added to a Module to add any new
-  /// unwind sections that may now be available.
-  void Update();
+  /// Called after an ObjectFile/SymbolFile has been added to a Module to add
+  /// any new unwind sections that may now be available.
+  void ModuleWasUpdated();
 
 private:
   void Dump(Stream &s);
@@ -75,7 +75,11 @@ class UnwindTable {
   Module &m_module;
   collection m_unwinds;
 
-  bool m_initialized; // delay some initialization until ObjectFile is set up
+  bool m_scanned_all_unwind_sources; // true when we have looked at the
+                                     // ObjectFile and SymbolFile for all
+                                     // sources of unwind information; false if
+                                     // we haven't done that yet, or one of the
+                                     // files has been updated in the Module.
   std::mutex m_mutex;
 
   std::unique_ptr<CallFrameInfo> m_object_file_unwind_up;
diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h
index e508d192d2759..0629e2faae7e9 100644
--- a/lldb/include/lldb/Target/DynamicLoader.h
+++ b/lldb/include/lldb/Target/DynamicLoader.h
@@ -9,6 +9,7 @@
 #ifndef LLDB_TARGET_DYNAMICLOADER_H
 #define LLDB_TARGET_DYNAMICLOADER_H
 
+#include "lldb/Core/Address.h"
 #include "lldb/Core/PluginInterface.h"
 #include "lldb/Utility/FileSpec.h"
 #include "lldb/Utility/Status.h"
@@ -329,6 +330,13 @@ class DynamicLoader : public PluginInterface {
   /// safe to call certain APIs or SPIs.
   virtual bool IsFullyInitialized() { return true; }
 
+  /// Return the `start` \b address in the dynamic loader module.
+  /// This is the address the process will begin executing with
+  /// `process launch --stop-at-entry`.
+  virtual std::optional<lldb_private::Address> GetStartAddress() {
+    return std::nullopt;
+  }
+
 protected:
   // Utility methods for derived classes
 
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 5d5ae1bfcd3bd..119dff4d49819 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -141,6 +141,8 @@ class TargetProperties : public Properties {
 
   PathMappingList &GetSourcePathMap() const;
 
+  PathMappingList &GetObjectPathMap() const;
+
   bool GetAutoSourceMapRelative() const;
 
   FileSpecList GetExecutableSearchPaths();
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index 9c946eaa19f0b..1024501e05bca 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -182,6 +182,7 @@ class RegisterTypeBuilder;
 class RegisterValue;
 class RegularExpression;
 class RichManglingContext;
+class SaveCoreOptions;
 class Scalar;
 class ScriptInterpreter;
 class ScriptInterpreterLocker;
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index cdd9b51d9329c..87c5ff8d22fb6 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -9,6 +9,7 @@
 #ifndef LLDB_LLDB_PRIVATE_INTERFACES_H
 #define LLDB_LLDB_PRIVATE_INTERFACES_H
 
+#include "lldb/Symbol/SaveCoreOptions.h"
 #include "lldb/lldb-enumerations.h"
 #include "lldb/lldb-forward.h"
 #include "lldb/lldb-private-enumerations.h"
@@ -24,6 +25,7 @@ class Value;
 } // namespace llvm
 
 namespace lldb_private {
+class ScriptedInterfaceUsages;
 typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp,
                                          const ArchSpec &arch);
 typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)(
@@ -55,8 +57,7 @@ typedef ObjectFile *(*ObjectFileCreateMemoryInstance)(
     const lldb::ModuleSP &module_sp, lldb::WritableDataBufferSP data_sp,
     const lldb::ProcessSP &process_sp, lldb::addr_t offset);
 typedef bool (*ObjectFileSaveCore)(const lldb::ProcessSP &process_sp,
-                                   const FileSpec &outfile,
-                                   lldb::SaveCoreStyle &core_style,
+                                   const lldb_private::SaveCoreOptions &options,
                                    Status &error);
 typedef EmulateInstruction *(*EmulateInstructionCreateInstance)(
     const ArchSpec &arch, InstructionType inst_type);
@@ -124,6 +125,8 @@ typedef lldb::REPLSP (*REPLCreateInstance)(Status &error,
                                            lldb::LanguageType language,
                                            Debugger *debugger, Target *target,
                                            const char *repl_options);
+typedef bool (*ScriptedInterfaceCreateInstance)(lldb::ScriptLanguage language,
+                                                ScriptedInterfaceUsages usages);
 typedef int (*ComparisonFunction)(const void *, const void *);
 typedef void (*DebuggerInitializeCallback)(Debugger &debugger);
 /// Trace
diff --git a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
index 818fdf0e6b5c5..602e15d207e94 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
@@ -9,6 +9,7 @@
 import sys
 import os
 from packaging import version
+from urllib.parse import urlparse
 
 # LLDB modules
 import lldb
@@ -34,6 +35,8 @@ def check_first_register_readable(test_case):
         test_case.expect("register read r0", substrs=["r0 = 0x"])
     elif arch in ["powerpc64le"]:
         test_case.expect("register read r0", substrs=["r0 = 0x"])
+    elif re.match("^rv(32|64)", arch):
+        test_case.expect("register read zero", substrs=["zero = 0x"])
     else:
         # TODO: Add check for other architectures
         test_case.fail(
@@ -263,17 +266,13 @@ def getCompiler():
     return module.getCompiler()
 
 
-def getCompilerBinary():
-    """Returns the compiler binary the test suite is running with."""
-    return getCompiler().split()[0]
-
-
 def getCompilerVersion():
     """Returns a string that represents the compiler version.
     Supports: llvm, clang.
     """
-    compiler = getCompilerBinary()
-    version_output = subprocess.check_output([compiler, "--version"], errors="replace")
+    version_output = subprocess.check_output(
+        [getCompiler(), "--version"], errors="replace"
+    )
     m = re.search("version ([0-9.]+)", version_output)
     if m:
         return m.group(1)
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 5e50b0c145885..b57c3bdd87c83 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1379,10 +1379,6 @@ def getCompiler(self):
         """Returns the compiler in effect the test suite is running with."""
         return lldbplatformutil.getCompiler()
 
-    def getCompilerBinary(self):
-        """Returns the compiler binary the test suite is running with."""
-        return lldbplatformutil.getCompilerBinary()
-
     def getCompilerVersion(self):
         """Returns a string that represents the compiler version.
         Supports: llvm, clang.
@@ -1518,19 +1514,23 @@ def getstdFlag(self):
             stdflag = "-std=c++11"
         return stdflag
 
-    def buildDriver(self, sources, exe_name):
+    def buildDriver(self, sources, exe_name, defines=None):
         """Platform-specific way to build a program that links with LLDB (via the liblldb.so
         or LLDB.framework).
         """
+        if defines is None:
+            defines = []
+
         stdflag = self.getstdFlag()
         stdlibflag = self.getstdlibFlag()
+        defines = " ".join(["-D{}={}".format(name, value) for name, value in defines])
 
         lib_dir = configuration.lldb_libs_dir
         if self.hasDarwinFramework():
             d = {
                 "CXX_SOURCES": sources,
                 "EXE": exe_name,
-                "CFLAGS_EXTRAS": "%s %s" % (stdflag, stdlibflag),
+                "CFLAGS_EXTRAS": "%s %s %s" % (stdflag, stdlibflag, defines),
                 "FRAMEWORK_INCLUDES": "-F%s" % self.framework_dir,
                 "LD_EXTRAS": "%s -Wl,-rpath,%s" % (self.lib_lldb, self.framework_dir),
             }
@@ -1538,12 +1538,13 @@ def buildDriver(self, sources, exe_name):
             d = {
                 "CXX_SOURCES": sources,
                 "EXE": exe_name,
-                "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
+                "CFLAGS_EXTRAS": "%s %s -I%s -I%s %s"
                 % (
                     stdflag,
                     stdlibflag,
                     os.path.join(os.environ["LLDB_SRC"], "include"),
                     os.path.join(configuration.lldb_obj_root, "include"),
+                    defines,
                 ),
                 "LD_EXTRAS": "-L%s -lliblldb" % lib_dir,
             }
@@ -1551,12 +1552,13 @@ def buildDriver(self, sources, exe_name):
             d = {
                 "CXX_SOURCES": sources,
                 "EXE": exe_name,
-                "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
+                "CFLAGS_EXTRAS": "%s %s -I%s -I%s %s"
                 % (
                     stdflag,
                     stdlibflag,
                     os.path.join(os.environ["LLDB_SRC"], "include"),
                     os.path.join(configuration.lldb_obj_root, "include"),
+                    defines,
                 ),
                 "LD_EXTRAS": "-L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
             }
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 3d562285ce9cc..be3ad684dd736 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -46,31 +46,6 @@ LLDB_BASE_DIR := $(THIS_FILE_DIR)/../../../../../
 # according to variable values).
 .DEFAULT_GOAL := all
 
-#----------------------------------------------------------------------
-# If OS is not defined, use 'uname -s' to determine the OS name.
-#
-# GNUWin32 uname gives "windows32" or "server version windows32" while
-# some versions of MSYS uname return "MSYS_NT*", but most environments
-# standardize on "Windows_NT", so we'll make it consistent here. 
-# When running tests from Visual Studio, the environment variable isn't
-# inherited all the way down to the process spawned for make.
-#----------------------------------------------------------------------
-ifeq "$(HOST_OS)" ""
-  HOST_OS := $(shell uname -s)
-endif
-
-ifneq (,$(findstring windows32,$(HOST_OS)))
-	HOST_OS := Windows_NT
-endif
-
-ifneq (,$(findstring MSYS_NT,$(HOST_OS)))
-	HOST_OS := Windows_NT
-endif
-
-ifeq "$(OS)" ""
-	OS := $(HOST_OS)
-endif
-
 #----------------------------------------------------------------------
 # If OS is Windows, force SHELL to be cmd
 #
@@ -80,8 +55,10 @@ endif
 # Also reset BUILDDIR value because "pwd" returns cygwin or msys path
 # which needs to be converted to windows path.
 #----------------------------------------------------------------------
-ifeq "$(OS)" "Windows_NT"
-	SHELL = $(WINDIR)\system32\cmd.exe
+ifeq "$(HOST_OS)" "Windows_NT"
+	# MinGW make gets $(windir) variable if launched from cmd.exe
+	# and $(WINDIR) if launched from MSYS2.
+	SHELL := $(or $(windir),$(WINDIR),C:\WINDOWS)\system32\cmd.exe
 	BUILDDIR := $(shell echo %cd%)
 endif
 
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index 6397101609315..a32bc58507d8e 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -56,6 +56,7 @@ add_lldb_library(liblldb SHARED ${option_framework}
   SBCommandReturnObject.cpp
   SBCommunication.cpp
   SBCompileUnit.cpp
+  SBSaveCoreOptions.cpp
   SBData.cpp
   SBDebugger.cpp
   SBDeclaration.cpp
diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp
index efb3c8def2796..b88f897ff5280 100644
--- a/lldb/source/API/SBProcess.cpp
+++ b/lldb/source/API/SBProcess.cpp
@@ -40,6 +40,7 @@
 #include "lldb/API/SBFileSpec.h"
 #include "lldb/API/SBMemoryRegionInfo.h"
 #include "lldb/API/SBMemoryRegionInfoList.h"
+#include "lldb/API/SBSaveCoreOptions.h"
 #include "lldb/API/SBScriptObject.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBStringList.h"
@@ -1216,13 +1217,28 @@ bool SBProcess::IsInstrumentationRuntimePresent(
 
 lldb::SBError SBProcess::SaveCore(const char *file_name) {
   LLDB_INSTRUMENT_VA(this, file_name);
-  return SaveCore(file_name, "", SaveCoreStyle::eSaveCoreFull);
+  SBSaveCoreOptions options;
+  options.SetOutputFile(SBFileSpec(file_name));
+  options.SetStyle(SaveCoreStyle::eSaveCoreFull);
+  return SaveCore(options);
 }
 
 lldb::SBError SBProcess::SaveCore(const char *file_name,
                                   const char *flavor,
                                   SaveCoreStyle core_style) {
   LLDB_INSTRUMENT_VA(this, file_name, flavor, core_style);
+  SBSaveCoreOptions options;
+  options.SetOutputFile(SBFileSpec(file_name));
+  options.SetStyle(core_style);
+  SBError error = options.SetPluginName(flavor);
+  if (error.Fail())
+    return error;
+  return SaveCore(options);
+}
+
+lldb::SBError SBProcess::SaveCore(SBSaveCoreOptions &options) {
+
+  LLDB_INSTRUMENT_VA(this, options);
 
   lldb::SBError error;
   ProcessSP process_sp(GetSP());
@@ -1239,10 +1255,7 @@ lldb::SBError SBProcess::SaveCore(const char *file_name,
     return error;
   }
 
-  FileSpec core_file(file_name);
-  FileSystem::Instance().Resolve(core_file);
-  error.ref() = PluginManager::SaveCore(process_sp, core_file, core_style,
-                                        flavor);
+  error.ref() = PluginManager::SaveCore(process_sp, options.ref());
 
   return error;
 }
diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp
new file mode 100644
index 0000000000000..6c3f74596203d
--- /dev/null
+++ b/lldb/source/API/SBSaveCoreOptions.cpp
@@ -0,0 +1,85 @@
+//===-- SBSaveCoreOptions.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/API/SBSaveCoreOptions.h"
+#include "lldb/API/SBError.h"
+#include "lldb/API/SBFileSpec.h"
+#include "lldb/Host/FileSystem.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
+#include "lldb/Utility/Instrumentation.h"
+
+#include "Utils.h"
+
+using namespace lldb;
+
+SBSaveCoreOptions::SBSaveCoreOptions() {
+  LLDB_INSTRUMENT_VA(this)
+
+  m_opaque_up = std::make_unique<lldb_private::SaveCoreOptions>();
+}
+
+SBSaveCoreOptions::SBSaveCoreOptions(const SBSaveCoreOptions &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  m_opaque_up = clone(rhs.m_opaque_up);
+}
+
+const SBSaveCoreOptions &
+SBSaveCoreOptions::operator=(const SBSaveCoreOptions &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  if (this != &rhs)
+    m_opaque_up = clone(rhs.m_opaque_up);
+  return *this;
+}
+
+SBError SBSaveCoreOptions::SetPluginName(const char *name) {
+  LLDB_INSTRUMENT_VA(this, name);
+  lldb_private::Status error = m_opaque_up->SetPluginName(name);
+  return SBError(error);
+}
+
+void SBSaveCoreOptions::SetStyle(lldb::SaveCoreStyle style) {
+  LLDB_INSTRUMENT_VA(this, style);
+  m_opaque_up->SetStyle(style);
+}
+
+void SBSaveCoreOptions::SetOutputFile(lldb::SBFileSpec file_spec) {
+  LLDB_INSTRUMENT_VA(this, file_spec);
+  m_opaque_up->SetOutputFile(file_spec.ref());
+}
+
+const char *SBSaveCoreOptions::GetPluginName() const {
+  LLDB_INSTRUMENT_VA(this);
+  const auto name = m_opaque_up->GetPluginName();
+  if (!name)
+    return nullptr;
+  return lldb_private::ConstString(name.value()).GetCString();
+}
+
+SBFileSpec SBSaveCoreOptions::GetOutputFile() const {
+  LLDB_INSTRUMENT_VA(this);
+  const auto file_spec = m_opaque_up->GetOutputFile();
+  if (file_spec)
+    return SBFileSpec(file_spec.value());
+  return SBFileSpec();
+}
+
+lldb::SaveCoreStyle SBSaveCoreOptions::GetStyle() const {
+  LLDB_INSTRUMENT_VA(this);
+  return m_opaque_up->GetStyle();
+}
+
+void SBSaveCoreOptions::Clear() {
+  LLDB_INSTRUMENT_VA(this);
+  m_opaque_up->Clear();
+}
+
+lldb_private::SaveCoreOptions &SBSaveCoreOptions::ref() const {
+  return *m_opaque_up.get();
+}
diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp
index 773f8ed2fa8af..aad03af11331c 100644
--- a/lldb/source/Commands/CommandObjectBreakpoint.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp
@@ -539,7 +539,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_dummy_options.m_use_dummy);
+    Target &target =
+        m_dummy_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     // The following are the various types of breakpoints that could be set:
     //   1).  -f -l -p  [-s -g]   (setting breakpoint by source location)
@@ -839,7 +840,7 @@ class CommandObjectBreakpointModify : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_dummy_opts.m_use_dummy);
+    Target &target = m_dummy_opts.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -903,7 +904,7 @@ class CommandObjectBreakpointEnable : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -1010,7 +1011,7 @@ the second re-enables the first location.");
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
 
@@ -1148,7 +1149,7 @@ class CommandObjectBreakpointList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+    Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     const BreakpointList &breakpoints =
         target.GetBreakpointList(m_options.m_internal);
@@ -1267,7 +1268,7 @@ class CommandObjectBreakpointClear : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     // The following are the various types of breakpoints that could be
     // cleared:
@@ -1416,7 +1417,7 @@ class CommandObjectBreakpointDelete : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+    Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
     result.Clear();
     
     std::unique_lock<std::recursive_mutex> lock;
@@ -1676,7 +1677,7 @@ class CommandObjectBreakpointNameConfigure : public CommandObjectParsed {
       return;
     }
 
-    Target &target = GetSelectedOrDummyTarget(false);
+    Target &target = GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -1764,7 +1765,7 @@ class CommandObjectBreakpointNameAdd : public CommandObjectParsed {
     }
 
     Target &target =
-        GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+        m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -1838,7 +1839,7 @@ class CommandObjectBreakpointNameDelete : public CommandObjectParsed {
     }
 
     Target &target =
-        GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+        m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -1897,7 +1898,7 @@ class CommandObjectBreakpointNameList : public CommandObjectParsed {
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     Target &target =
-        GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+        m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     std::vector<std::string> name_list;
     if (command.empty()) {
@@ -2209,7 +2210,7 @@ class CommandObjectBreakpointRead : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
@@ -2319,7 +2320,7 @@ class CommandObjectBreakpointWrite : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     std::unique_lock<std::recursive_mutex> lock;
     target.GetBreakpointList().GetListMutex(lock);
diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
index 6ebe6e8a35570..8c1fb513e016e 100644
--- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
@@ -323,7 +323,7 @@ are no syntax errors may indicate that a function was declared but never called.
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+    Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     const BreakpointList &breakpoints = target.GetBreakpointList();
     size_t num_breakpoints = breakpoints.GetSize();
@@ -481,7 +481,7 @@ class CommandObjectBreakpointCommandDelete : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+    Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
 
     const BreakpointList &breakpoints = target.GetBreakpointList();
     size_t num_breakpoints = breakpoints.GetSize();
@@ -548,9 +548,9 @@ class CommandObjectBreakpointCommandList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    const BreakpointList &breakpoints = target->GetBreakpointList();
+    const BreakpointList &breakpoints = target.GetBreakpointList();
     size_t num_breakpoints = breakpoints.GetSize();
 
     if (num_breakpoints == 0) {
@@ -566,7 +566,7 @@ class CommandObjectBreakpointCommandList : public CommandObjectParsed {
 
     BreakpointIDList valid_bp_ids;
     CommandObjectMultiwordBreakpoint::VerifyBreakpointOrLocationIDs(
-        command, target, result, &valid_bp_ids,
+        command, &target, result, &valid_bp_ids,
         BreakpointName::Permissions::PermissionKinds::listPerm);
 
     if (result.Succeeded()) {
@@ -575,7 +575,7 @@ class CommandObjectBreakpointCommandList : public CommandObjectParsed {
         BreakpointID cur_bp_id = valid_bp_ids.GetBreakpointIDAtIndex(i);
         if (cur_bp_id.GetBreakpointID() != LLDB_INVALID_BREAK_ID) {
           Breakpoint *bp =
-              target->GetBreakpointByID(cur_bp_id.GetBreakpointID()).get();
+              target.GetBreakpointByID(cur_bp_id.GetBreakpointID()).get();
 
           if (bp) {
             BreakpointLocationSP bp_loc_sp;
diff --git a/lldb/source/Commands/CommandObjectDisassemble.cpp b/lldb/source/Commands/CommandObjectDisassemble.cpp
index d975e39801317..652a300706b11 100644
--- a/lldb/source/Commands/CommandObjectDisassemble.cpp
+++ b/lldb/source/Commands/CommandObjectDisassemble.cpp
@@ -227,7 +227,7 @@ llvm::Error CommandObjectDisassemble::CheckRangeSize(const AddressRange &range,
     return llvm::Error::success();
   StreamString msg;
   msg << "Not disassembling " << what << " because it is very large ";
-  range.Dump(&msg, &GetSelectedTarget(), Address::DumpStyleLoadAddress,
+  range.Dump(&msg, &GetTarget(), Address::DumpStyleLoadAddress,
              Address::DumpStyleFileAddress);
   msg << ". To disassemble specify an instruction count limit, start/stop "
          "addresses or use the --force option.";
@@ -252,7 +252,7 @@ CommandObjectDisassemble::GetContainingAddressRanges() {
     }
   };
 
-  Target &target = GetSelectedTarget();
+  Target &target = GetTarget();
   if (!target.GetSectionLoadList().IsEmpty()) {
     Address symbol_containing_address;
     if (target.GetSectionLoadList().ResolveLoadAddress(
@@ -351,8 +351,8 @@ CommandObjectDisassemble::GetNameRanges(CommandReturnObject &result) {
 
   // Find functions matching the given name.
   SymbolContextList sc_list;
-  GetSelectedTarget().GetImages().FindFunctions(name, eFunctionNameTypeAuto,
-                                                function_options, sc_list);
+  GetTarget().GetImages().FindFunctions(name, eFunctionNameTypeAuto,
+                                        function_options, sc_list);
 
   std::vector<AddressRange> ranges;
   llvm::Error range_errs = llvm::Error::success();
@@ -439,10 +439,10 @@ CommandObjectDisassemble::GetRangesForSelectedMode(
 
 void CommandObjectDisassemble::DoExecute(Args &command,
                                          CommandReturnObject &result) {
-  Target *target = &GetSelectedTarget();
+  Target &target = GetTarget();
 
   if (!m_options.arch.IsValid())
-    m_options.arch = target->GetArchitecture();
+    m_options.arch = target.GetArchitecture();
 
   if (!m_options.arch.IsValid()) {
     result.AppendError(
@@ -535,7 +535,7 @@ void CommandObjectDisassemble::DoExecute(Args &command,
       } else {
         result.AppendErrorWithFormat(
             "Failed to disassemble memory at 0x%8.8" PRIx64 ".\n",
-            cur_range.GetBaseAddress().GetLoadAddress(target));
+            cur_range.GetBaseAddress().GetLoadAddress(&target));
       }
     }
     if (print_sc_header)
diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp
index eb76753d98efc..769f01dab007a 100644
--- a/lldb/source/Commands/CommandObjectExpression.cpp
+++ b/lldb/source/Commands/CommandObjectExpression.cpp
@@ -605,7 +605,7 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command,
       return;
 
     if (m_repl_option.GetOptionValue().GetCurrentValue()) {
-      Target &target = GetSelectedOrDummyTarget();
+      Target &target = GetTarget();
       // Drop into REPL
       m_expr_lines.clear();
       m_expr_line_count = 0;
@@ -665,7 +665,7 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command,
     }
   }
 
-  Target &target = GetSelectedOrDummyTarget();
+  Target &target = GetTarget();
   if (EvaluateExpression(expr, result.GetOutputStream(),
                          result.GetErrorStream(), result)) {
 
diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp
index 3f4178c1a9595..29e460fe3885f 100644
--- a/lldb/source/Commands/CommandObjectFrame.cpp
+++ b/lldb/source/Commands/CommandObjectFrame.cpp
@@ -687,7 +687,7 @@ may even involve JITing and running code in the target program.)");
                                            m_cmd_name);
 
     // Increment statistics.
-    TargetStats &target_stats = GetSelectedOrDummyTarget().GetStatistics();
+    TargetStats &target_stats = GetTarget().GetStatistics();
     if (result.Succeeded())
       target_stats.GetFrameVariableStats().NotifySuccess();
     else
@@ -874,13 +874,13 @@ void CommandObjectFrameRecognizerAdd::DoExecute(Args &command,
         RegularExpressionSP(new RegularExpression(m_options.m_module));
     auto func =
         RegularExpressionSP(new RegularExpression(m_options.m_symbols.front()));
-    GetSelectedOrDummyTarget().GetFrameRecognizerManager().AddRecognizer(
+    GetTarget().GetFrameRecognizerManager().AddRecognizer(
         recognizer_sp, module, func, m_options.m_first_instruction_only);
   } else {
     auto module = ConstString(m_options.m_module);
     std::vector<ConstString> symbols(m_options.m_symbols.begin(),
                                      m_options.m_symbols.end());
-    GetSelectedOrDummyTarget().GetFrameRecognizerManager().AddRecognizer(
+    GetTarget().GetFrameRecognizerManager().AddRecognizer(
         recognizer_sp, module, symbols, m_options.m_first_instruction_only);
   }
 #endif
@@ -898,9 +898,7 @@ class CommandObjectFrameRecognizerClear : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    GetSelectedOrDummyTarget()
-        .GetFrameRecognizerManager()
-        .RemoveAllRecognizers();
+    GetTarget().GetFrameRecognizerManager().RemoveAllRecognizers();
     result.SetStatus(eReturnStatusSuccessFinishResult);
   }
 };
@@ -922,7 +920,7 @@ class CommandObjectFrameRecognizerDelete : public CommandObjectParsed {
     if (request.GetCursorIndex() != 0)
       return;
 
-    GetSelectedOrDummyTarget().GetFrameRecognizerManager().ForEach(
+    GetTarget().GetFrameRecognizerManager().ForEach(
         [&request](uint32_t rid, std::string rname, std::string module,
                    llvm::ArrayRef<lldb_private::ConstString> symbols,
                    bool regexp) {
@@ -953,9 +951,7 @@ class CommandObjectFrameRecognizerDelete : public CommandObjectParsed {
         return;
       }
 
-      GetSelectedOrDummyTarget()
-          .GetFrameRecognizerManager()
-          .RemoveAllRecognizers();
+      GetTarget().GetFrameRecognizerManager().RemoveAllRecognizers();
       result.SetStatus(eReturnStatusSuccessFinishResult);
       return;
     }
@@ -973,9 +969,8 @@ class CommandObjectFrameRecognizerDelete : public CommandObjectParsed {
       return;
     }
 
-    if (!GetSelectedOrDummyTarget()
-             .GetFrameRecognizerManager()
-             .RemoveRecognizerWithID(recognizer_id)) {
+    if (!GetTarget().GetFrameRecognizerManager().RemoveRecognizerWithID(
+            recognizer_id)) {
       result.AppendErrorWithFormat("'%s' is not a valid recognizer id.\n",
                                    command.GetArgumentAtIndex(0));
       return;
@@ -996,7 +991,7 @@ class CommandObjectFrameRecognizerList : public CommandObjectParsed {
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     bool any_printed = false;
-    GetSelectedOrDummyTarget().GetFrameRecognizerManager().ForEach(
+    GetTarget().GetFrameRecognizerManager().ForEach(
         [&result, &any_printed](
             uint32_t recognizer_id, std::string name, std::string module,
             llvm::ArrayRef<ConstString> symbols, bool regexp) {
@@ -1073,9 +1068,8 @@ class CommandObjectFrameRecognizerInfo : public CommandObjectParsed {
       return;
     }
 
-    auto recognizer = GetSelectedOrDummyTarget()
-                          .GetFrameRecognizerManager()
-                          .GetRecognizerForFrame(frame_sp);
+    auto recognizer =
+        GetTarget().GetFrameRecognizerManager().GetRecognizerForFrame(frame_sp);
 
     Stream &output_stream = result.GetOutputStream();
     output_stream.Printf("frame %d ", frame_index);
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 8685d5761557b..e8174ca6ddace 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -369,25 +369,23 @@ class CommandObjectProcessAttach : public CommandObjectProcessLaunchOrAttach {
 
     // Okay, we're done.  Last step is to warn if the executable module has
     // changed:
-    char new_path[PATH_MAX];
     ModuleSP new_exec_module_sp(target->GetExecutableModule());
     if (!old_exec_module_sp) {
       // We might not have a module if we attached to a raw pid...
       if (new_exec_module_sp) {
-        new_exec_module_sp->GetFileSpec().GetPath(new_path, PATH_MAX);
-        result.AppendMessageWithFormat("Executable module set to \"%s\".\n",
-                                       new_path);
+        result.AppendMessageWithFormat(
+            "Executable binary set to \"%s\".\n",
+            new_exec_module_sp->GetFileSpec().GetPath().c_str());
       }
+    } else if (!new_exec_module_sp) {
+      result.AppendWarningWithFormat("No executable binary.");
     } else if (old_exec_module_sp->GetFileSpec() !=
                new_exec_module_sp->GetFileSpec()) {
-      char old_path[PATH_MAX];
-
-      old_exec_module_sp->GetFileSpec().GetPath(old_path, PATH_MAX);
-      new_exec_module_sp->GetFileSpec().GetPath(new_path, PATH_MAX);
 
       result.AppendWarningWithFormat(
-          "Executable module changed from \"%s\" to \"%s\".\n", old_path,
-          new_path);
+          "Executable binary changed from \"%s\" to \"%s\".\n",
+          old_exec_module_sp->GetFileSpec().GetPath().c_str(),
+          new_exec_module_sp->GetFileSpec().GetPath().c_str());
     }
 
     if (!old_arch_spec.IsValid()) {
@@ -1273,13 +1271,13 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed {
 
       switch (short_option) {
       case 'p':
-        m_requested_plugin_name = option_arg.str();
+        error = m_core_dump_options.SetPluginName(option_arg.data());
         break;
       case 's':
-        m_requested_save_core_style =
+        m_core_dump_options.SetStyle(
             (lldb::SaveCoreStyle)OptionArgParser::ToOptionEnum(
                 option_arg, GetDefinitions()[option_idx].enum_values,
-                eSaveCoreUnspecified, error);
+                eSaveCoreUnspecified, error));
         break;
       default:
         llvm_unreachable("Unimplemented option");
@@ -1289,13 +1287,11 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed {
     }
 
     void OptionParsingStarting(ExecutionContext *execution_context) override {
-      m_requested_save_core_style = eSaveCoreUnspecified;
-      m_requested_plugin_name.clear();
+      m_core_dump_options.Clear();
     }
 
     // Instance variables to hold the values for command options.
-    SaveCoreStyle m_requested_save_core_style = eSaveCoreUnspecified;
-    std::string m_requested_plugin_name;
+    SaveCoreOptions m_core_dump_options;
   };
 
 protected:
@@ -1305,13 +1301,14 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed {
       if (command.GetArgumentCount() == 1) {
         FileSpec output_file(command.GetArgumentAtIndex(0));
         FileSystem::Instance().Resolve(output_file);
-        SaveCoreStyle corefile_style = m_options.m_requested_save_core_style;
-        Status error =
-            PluginManager::SaveCore(process_sp, output_file, corefile_style,
-                                    m_options.m_requested_plugin_name);
+        auto &core_dump_options = m_options.m_core_dump_options;
+        core_dump_options.SetOutputFile(output_file);
+        Status error = PluginManager::SaveCore(process_sp, core_dump_options);
         if (error.Success()) {
-          if (corefile_style == SaveCoreStyle::eSaveCoreDirtyOnly ||
-              corefile_style == SaveCoreStyle::eSaveCoreStackOnly) {
+          if (core_dump_options.GetStyle() ==
+                  SaveCoreStyle::eSaveCoreDirtyOnly ||
+              core_dump_options.GetStyle() ==
+                  SaveCoreStyle::eSaveCoreStackOnly) {
             result.AppendMessageWithFormat(
                 "\nModified-memory or stack-memory only corefile "
                 "created.  This corefile may \n"
@@ -1587,7 +1584,7 @@ class CommandObjectProcessHandle : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &signal_args, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     // Any signals that are being set should be added to the Target's
     // DummySignals so they will get applied on rerun, etc.
diff --git a/lldb/source/Commands/CommandObjectScripting.cpp b/lldb/source/Commands/CommandObjectScripting.cpp
index fee0565a7c48a..730a190a6e891 100644
--- a/lldb/source/Commands/CommandObjectScripting.cpp
+++ b/lldb/source/Commands/CommandObjectScripting.cpp
@@ -8,12 +8,14 @@
 
 #include "CommandObjectScripting.h"
 #include "lldb/Core/Debugger.h"
+#include "lldb/Core/PluginManager.h"
 #include "lldb/DataFormatters/DataVisualization.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Host/OptionParser.h"
 #include "lldb/Interpreter/CommandInterpreter.h"
 #include "lldb/Interpreter/CommandOptionArgumentTable.h"
 #include "lldb/Interpreter/CommandReturnObject.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Interpreter/OptionArgParser.h"
 #include "lldb/Interpreter/ScriptInterpreter.h"
 #include "lldb/Utility/Args.h"
@@ -127,9 +129,126 @@ class CommandObjectScriptingRun : public CommandObjectRaw {
   CommandOptions m_options;
 };
 
-#pragma mark CommandObjectMultiwordScripting
+#define LLDB_OPTIONS_scripting_template_list
+#include "CommandOptions.inc"
+
+class CommandObjectScriptingTemplateList : public CommandObjectParsed {
+public:
+  CommandObjectScriptingTemplateList(CommandInterpreter &interpreter)
+      : CommandObjectParsed(
+            interpreter, "scripting template list",
+            "List all the available scripting extension templates. ",
+            "scripting template list [--language <scripting-language> --]") {}
+
+  ~CommandObjectScriptingTemplateList() override = default;
+
+  Options *GetOptions() override { return &m_options; }
+
+  class CommandOptions : public Options {
+  public:
+    CommandOptions() = default;
+    ~CommandOptions() override = default;
+    Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
+                          ExecutionContext *execution_context) override {
+      Status error;
+      const int short_option = m_getopt_table[option_idx].val;
 
-// CommandObjectMultiwordScripting
+      switch (short_option) {
+      case 'l':
+        m_language = (lldb::ScriptLanguage)OptionArgParser::ToOptionEnum(
+            option_arg, GetDefinitions()[option_idx].enum_values,
+            eScriptLanguageNone, error);
+        if (!error.Success())
+          error.SetErrorStringWithFormatv(
+              "unrecognized value for language '{0}'", option_arg);
+        break;
+      default:
+        llvm_unreachable("Unimplemented option");
+      }
+
+      return error;
+    }
+
+    void OptionParsingStarting(ExecutionContext *execution_context) override {
+      m_language = lldb::eScriptLanguageDefault;
+    }
+
+    llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
+      return llvm::ArrayRef(g_scripting_template_list_options);
+    }
+
+    lldb::ScriptLanguage m_language = lldb::eScriptLanguageDefault;
+  };
+
+protected:
+  void DoExecute(Args &command, CommandReturnObject &result) override {
+    Stream &s = result.GetOutputStream();
+    s.Printf("Available scripted extension templates:");
+
+    auto print_field = [&s](llvm::StringRef key, llvm::StringRef value) {
+      if (!value.empty()) {
+        s.IndentMore();
+        s.Indent();
+        s << key << ": " << value << '\n';
+        s.IndentLess();
+      }
+    };
+
+    size_t num_listed_interface = 0;
+    size_t num_templates = PluginManager::GetNumScriptedInterfaces();
+    for (size_t i = 0; i < num_templates; i++) {
+      llvm::StringRef plugin_name =
+          PluginManager::GetScriptedInterfaceNameAtIndex(i);
+      if (plugin_name.empty())
+        break;
+
+      lldb::ScriptLanguage lang =
+          PluginManager::GetScriptedInterfaceLanguageAtIndex(i);
+      if (lang != m_options.m_language)
+        continue;
+
+      if (!num_listed_interface)
+        s.EOL();
+
+      num_listed_interface++;
+
+      llvm::StringRef desc =
+          PluginManager::GetScriptedInterfaceDescriptionAtIndex(i);
+      ScriptedInterfaceUsages usages =
+          PluginManager::GetScriptedInterfaceUsagesAtIndex(i);
+
+      print_field("Name", plugin_name);
+      print_field("Language", ScriptInterpreter::LanguageToString(lang));
+      print_field("Description", desc);
+      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::API);
+      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::CommandInterpreter);
+
+      if (i != num_templates - 1)
+        s.EOL();
+    }
+
+    if (!num_listed_interface)
+      s << " None\n";
+  }
+
+private:
+  CommandOptions m_options;
+};
+
+class CommandObjectMultiwordScriptingTemplate : public CommandObjectMultiword {
+public:
+  CommandObjectMultiwordScriptingTemplate(CommandInterpreter &interpreter)
+      : CommandObjectMultiword(
+            interpreter, "scripting template",
+            "Commands for operating on the scripting templates.",
+            "scripting template [<subcommand-options>]") {
+    LoadSubCommand(
+        "list",
+        CommandObjectSP(new CommandObjectScriptingTemplateList(interpreter)));
+  }
+
+  ~CommandObjectMultiwordScriptingTemplate() override = default;
+};
 
 CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
     CommandInterpreter &interpreter)
@@ -139,6 +258,9 @@ CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
           "scripting <subcommand> [<subcommand-options>]") {
   LoadSubCommand("run",
                  CommandObjectSP(new CommandObjectScriptingRun(interpreter)));
+  LoadSubCommand("template",
+                 CommandObjectSP(
+                     new CommandObjectMultiwordScriptingTemplate(interpreter)));
 }
 
 CommandObjectMultiwordScripting::~CommandObjectMultiwordScripting() = default;
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index d594330934ad7..b77bd8b0bc71a 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -59,10 +59,10 @@
 #include "lldb/lldb-enumerations.h"
 #include "lldb/lldb-private-enumerations.h"
 
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/FileSystem.h"
@@ -97,7 +97,7 @@ static void DumpTargetInfo(uint32_t target_idx, Target *target,
 
   uint32_t properties = 0;
   if (target_arch.IsValid()) {
-    strm.Printf("%sarch=", properties++ > 0 ? ", " : " ( ");
+    strm.Printf(" ( arch=");
     target_arch.DumpTriple(strm.AsRawOstream());
     properties++;
   }
@@ -1027,7 +1027,7 @@ class CommandObjectTargetModulesSearchPathsAdd : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     const size_t argc = command.GetArgumentCount();
     if (argc & 1) {
       result.AppendError("add requires an even number of arguments\n");
@@ -1045,7 +1045,7 @@ class CommandObjectTargetModulesSearchPathsAdd : public CommandObjectParsed {
                       from, to);
           }
           bool last_pair = ((argc - i) == 2);
-          target->GetImageSearchPathList().Append(
+          target.GetImageSearchPathList().Append(
               from, to, last_pair); // Notify if this is the last pair
           result.SetStatus(eReturnStatusSuccessFinishNoResult);
         } else {
@@ -1074,9 +1074,9 @@ class CommandObjectTargetModulesSearchPathsClear : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     bool notify = true;
-    target->GetImageSearchPathList().Clear(notify);
+    target.GetImageSearchPathList().Clear(notify);
     result.SetStatus(eReturnStatusSuccessFinishNoResult);
   }
 };
@@ -1148,7 +1148,7 @@ class CommandObjectTargetModulesSearchPathsInsert : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     size_t argc = command.GetArgumentCount();
     // check for at least 3 arguments and an odd number of parameters
     if (argc >= 3 && argc & 1) {
@@ -1171,8 +1171,8 @@ class CommandObjectTargetModulesSearchPathsInsert : public CommandObjectParsed {
 
         if (from[0] && to[0]) {
           bool last_pair = ((argc - i) == 2);
-          target->GetImageSearchPathList().Insert(from, to, insert_idx,
-                                                  last_pair);
+          target.GetImageSearchPathList().Insert(from, to, insert_idx,
+                                                 last_pair);
           result.SetStatus(eReturnStatusSuccessFinishNoResult);
         } else {
           if (from[0])
@@ -1203,9 +1203,8 @@ class CommandObjectTargetModulesSearchPathsList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
-
-    target->GetImageSearchPathList().Dump(&result.GetOutputStream());
+    Target &target = GetTarget();
+    target.GetImageSearchPathList().Dump(&result.GetOutputStream());
     result.SetStatus(eReturnStatusSuccessFinishResult);
   }
 };
@@ -1226,7 +1225,7 @@ class CommandObjectTargetModulesSearchPathsQuery : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (command.GetArgumentCount() != 1) {
       result.AppendError("query requires one argument\n");
       return;
@@ -1234,7 +1233,7 @@ class CommandObjectTargetModulesSearchPathsQuery : public CommandObjectParsed {
 
     ConstString orig(command.GetArgumentAtIndex(0));
     ConstString transformed;
-    if (target->GetImageSearchPathList().RemapPath(orig, transformed))
+    if (target.GetImageSearchPathList().RemapPath(orig, transformed))
       result.GetOutputStream().Printf("%s\n", transformed.GetCString());
     else
       result.GetOutputStream().Printf("%s\n", orig.GetCString());
@@ -1898,9 +1897,9 @@ class CommandObjectTargetModulesDumpObjfile
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
     result.GetOutputStream().SetAddressByteSize(addr_byte_size);
     result.GetErrorStream().SetAddressByteSize(addr_byte_size);
 
@@ -1908,7 +1907,7 @@ class CommandObjectTargetModulesDumpObjfile
     if (command.GetArgumentCount() == 0) {
       // Dump all headers for all modules images
       num_dumped = DumpModuleObjfileHeaders(result.GetOutputStream(),
-                                            target->GetImages());
+                                            target.GetImages());
       if (num_dumped == 0) {
         result.AppendError("the target has no associated executable images");
       }
@@ -1920,7 +1919,7 @@ class CommandObjectTargetModulesDumpObjfile
            (arg_cstr = command.GetArgumentAtIndex(arg_idx)) != nullptr;
            ++arg_idx) {
         size_t num_matched =
-            FindModulesByName(target, arg_cstr, module_list, true);
+            FindModulesByName(&target, arg_cstr, module_list, true);
         if (num_matched == 0) {
           result.AppendWarningWithFormat(
               "Unable to find an image that matches '%s'.\n", arg_cstr);
@@ -1999,19 +1998,19 @@ class CommandObjectTargetModulesDumpSymtab
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     uint32_t num_dumped = 0;
     Mangled::NamePreference name_preference =
         (m_options.m_prefer_mangled ? Mangled::ePreferMangled
                                     : Mangled::ePreferDemangled);
 
-    uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
     result.GetOutputStream().SetAddressByteSize(addr_byte_size);
     result.GetErrorStream().SetAddressByteSize(addr_byte_size);
 
     if (command.GetArgumentCount() == 0) {
       // Dump all sections for all modules images
-      const ModuleList &module_list = target->GetImages();
+      const ModuleList &module_list = target.GetImages();
       std::lock_guard<std::recursive_mutex> guard(module_list.GetMutex());
       const size_t num_modules = module_list.GetSize();
       if (num_modules > 0) {
@@ -2044,7 +2043,7 @@ class CommandObjectTargetModulesDumpSymtab
            ++arg_idx) {
         ModuleList module_list;
         const size_t num_matches =
-            FindModulesByName(target, arg_cstr, module_list, true);
+            FindModulesByName(&target, arg_cstr, module_list, true);
         if (num_matches > 0) {
           for (ModuleSP module_sp : module_list.Modules()) {
             if (module_sp) {
@@ -2097,16 +2096,16 @@ class CommandObjectTargetModulesDumpSections
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     uint32_t num_dumped = 0;
 
-    uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
     result.GetOutputStream().SetAddressByteSize(addr_byte_size);
     result.GetErrorStream().SetAddressByteSize(addr_byte_size);
 
     if (command.GetArgumentCount() == 0) {
       // Dump all sections for all modules images
-      const size_t num_modules = target->GetImages().GetSize();
+      const size_t num_modules = target.GetImages().GetSize();
       if (num_modules == 0) {
         result.AppendError("the target has no associated executable images");
         return;
@@ -2123,7 +2122,7 @@ class CommandObjectTargetModulesDumpSections
         num_dumped++;
         DumpModuleSections(
             m_interpreter, result.GetOutputStream(),
-            target->GetImages().GetModulePointerAtIndex(image_idx));
+            target.GetImages().GetModulePointerAtIndex(image_idx));
       }
     } else {
       // Dump specified images (by basename or fullpath)
@@ -2133,7 +2132,7 @@ class CommandObjectTargetModulesDumpSections
            ++arg_idx) {
         ModuleList module_list;
         const size_t num_matches =
-            FindModulesByName(target, arg_cstr, module_list, true);
+            FindModulesByName(&target, arg_cstr, module_list, true);
         if (num_matches > 0) {
           for (size_t i = 0; i < num_matches; ++i) {
             if (INTERRUPT_REQUESTED(GetDebugger(),
@@ -2238,9 +2237,9 @@ class CommandObjectTargetModulesDumpClangAST
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    const ModuleList &module_list = target->GetImages();
+    const ModuleList &module_list = target.GetImages();
     const size_t num_modules = module_list.GetSize();
     if (num_modules == 0) {
       result.AppendError("the target has no associated executable images");
@@ -2265,7 +2264,7 @@ class CommandObjectTargetModulesDumpClangAST
     for (const Args::ArgEntry &arg : command.entries()) {
       ModuleList module_list;
       const size_t num_matches =
-          FindModulesByName(target, arg.c_str(), module_list, true);
+          FindModulesByName(&target, arg.c_str(), module_list, true);
       if (num_matches == 0) {
         // Check the global list
         std::lock_guard<std::recursive_mutex> guard(
@@ -2309,16 +2308,16 @@ class CommandObjectTargetModulesDumpSymfile
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     uint32_t num_dumped = 0;
 
-    uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
     result.GetOutputStream().SetAddressByteSize(addr_byte_size);
     result.GetErrorStream().SetAddressByteSize(addr_byte_size);
 
     if (command.GetArgumentCount() == 0) {
       // Dump all sections for all modules images
-      const ModuleList &target_modules = target->GetImages();
+      const ModuleList &target_modules = target.GetImages();
       std::lock_guard<std::recursive_mutex> guard(target_modules.GetMutex());
       const size_t num_modules = target_modules.GetSize();
       if (num_modules == 0) {
@@ -2344,7 +2343,7 @@ class CommandObjectTargetModulesDumpSymfile
            ++arg_idx) {
         ModuleList module_list;
         const size_t num_matches =
-            FindModulesByName(target, arg_cstr, module_list, true);
+            FindModulesByName(&target, arg_cstr, module_list, true);
         if (num_matches > 0) {
           for (size_t i = 0; i < num_matches; ++i) {
             if (INTERRUPT_REQUESTED(GetDebugger(), "Interrupted dumping {0} "
@@ -2533,7 +2532,7 @@ class CommandObjectTargetModulesDumpSeparateDebugInfoFiles
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedTarget();
+    Target &target = GetTarget();
     uint32_t num_dumped = 0;
 
     uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
@@ -2726,7 +2725,7 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed {
   OptionGroupFile m_symbol_file;
 
   void DoExecute(Args &args, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     bool flush = false;
 
     const size_t argc = args.GetArgumentCount();
@@ -2742,7 +2741,7 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed {
         Status error;
         if (PluginManager::DownloadObjectAndSymbolFile(module_spec, error)) {
           ModuleSP module_sp(
-              target->GetOrCreateModule(module_spec, true /* notify */));
+              target.GetOrCreateModule(module_spec, true /* notify */));
           if (module_sp) {
             result.SetStatus(eReturnStatusSuccessFinishResult);
             return;
@@ -2799,10 +2798,10 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed {
             module_spec.GetSymbolFileSpec() =
                 m_symbol_file.GetOptionValue().GetCurrentValue();
           if (!module_spec.GetArchitecture().IsValid())
-            module_spec.GetArchitecture() = target->GetArchitecture();
+            module_spec.GetArchitecture() = target.GetArchitecture();
           Status error;
-          ModuleSP module_sp(target->GetOrCreateModule(
-              module_spec, true /* notify */, &error));
+          ModuleSP module_sp(
+              target.GetOrCreateModule(module_spec, true /* notify */, &error));
           if (!module_sp) {
             const char *error_cstr = error.AsCString();
             if (error_cstr)
@@ -2831,7 +2830,7 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed {
     }
 
     if (flush) {
-      ProcessSP process = target->GetProcessSP();
+      ProcessSP process = target.GetProcessSP();
       if (process)
         process->Flush();
     }
@@ -2876,7 +2875,7 @@ class CommandObjectTargetModulesLoad
 
 protected:
   void DoExecute(Args &args, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     const bool load = m_load_option.GetOptionValue().GetCurrentValue();
     const bool set_pc = m_pc_option.GetOptionValue().GetCurrentValue();
 
@@ -2888,7 +2887,7 @@ class CommandObjectTargetModulesLoad
     if (load) {
       if (!m_file_option.GetOptionValue().OptionWasSet() &&
           !m_uuid_option_group.GetOptionValue().OptionWasSet()) {
-        ModuleList &module_list = target->GetImages();
+        ModuleList &module_list = target.GetImages();
         if (module_list.GetSize() == 1) {
           search_using_module_spec = true;
           module_spec.GetFileSpec() =
@@ -2903,7 +2902,7 @@ class CommandObjectTargetModulesLoad
       const bool use_global_module_list = true;
       ModuleList module_list;
       const size_t num_matches = FindModulesByName(
-          target, arg_cstr, module_list, use_global_module_list);
+          &target, arg_cstr, module_list, use_global_module_list);
       if (num_matches == 1) {
         module_spec.GetFileSpec() =
             module_list.GetModuleAtIndex(0)->GetFileSpec();
@@ -2926,7 +2925,7 @@ class CommandObjectTargetModulesLoad
 
     if (search_using_module_spec) {
       ModuleList matching_modules;
-      target->GetImages().FindModules(module_spec, matching_modules);
+      target.GetImages().FindModules(module_spec, matching_modules);
       const size_t num_matches = matching_modules.GetSize();
 
       char path[PATH_MAX];
@@ -2943,7 +2942,7 @@ class CommandObjectTargetModulesLoad
                   const addr_t slide =
                       m_slide_option.GetOptionValue().GetCurrentValue();
                   const bool slide_is_offset = true;
-                  module->SetLoadAddress(*target, slide, slide_is_offset,
+                  module->SetLoadAddress(target, slide, slide_is_offset,
                                          changed);
                 } else {
                   result.AppendError("one or more section name + load "
@@ -2975,8 +2974,8 @@ class CommandObjectTargetModulesLoad
                               sect_name);
                           break;
                         } else {
-                          if (target->GetSectionLoadList()
-                                  .SetSectionLoadAddress(section_sp, load_addr))
+                          if (target.GetSectionLoadList().SetSectionLoadAddress(
+                                  section_sp, load_addr))
                             changed = true;
                           result.AppendMessageWithFormat(
                               "section '%s' loaded at 0x%" PRIx64 "\n",
@@ -3007,13 +3006,13 @@ class CommandObjectTargetModulesLoad
               }
 
               if (changed) {
-                target->ModulesDidLoad(matching_modules);
+                target.ModulesDidLoad(matching_modules);
                 Process *process = m_exe_ctx.GetProcessPtr();
                 if (process)
                   process->Flush();
               }
               if (load) {
-                ProcessSP process = target->CalculateProcess();
+                ProcessSP process = target.CalculateProcess();
                 Address file_entry = objfile->GetEntryPointAddress();
                 if (!process) {
                   result.AppendError("No process");
@@ -3024,7 +3023,7 @@ class CommandObjectTargetModulesLoad
                   return;
                 }
                 std::vector<ObjectFile::LoadableData> loadables(
-                    objfile->GetLoadableData(*target));
+                    objfile->GetLoadableData(target));
                 if (loadables.size() == 0) {
                   result.AppendError("No loadable sections");
                   return;
@@ -3038,7 +3037,7 @@ class CommandObjectTargetModulesLoad
                   ThreadList &thread_list = process->GetThreadList();
                   RegisterContextSP reg_context(
                       thread_list.GetSelectedThread()->GetRegisterContext());
-                  addr_t file_entry_addr = file_entry.GetLoadAddress(target);
+                  addr_t file_entry_addr = file_entry.GetLoadAddress(&target);
                   if (!reg_context->SetPC(file_entry_addr)) {
                     result.AppendErrorWithFormat("failed to set PC value to "
                                                  "0x%" PRIx64 "\n",
@@ -3166,50 +3165,37 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = GetDebugger().GetSelectedTarget().get();
+    Target &target = GetTarget();
     const bool use_global_module_list = m_options.m_use_global_module_list;
     // Define a local module list here to ensure it lives longer than any
     // "locker" object which might lock its contents below (through the
     // "module_list_ptr" variable).
     ModuleList module_list;
-    if (target == nullptr && !use_global_module_list) {
-      result.AppendError("invalid target, create a debug target using the "
-                         "'target create' command");
-      return;
-    } else {
-      if (target) {
-        uint32_t addr_byte_size =
-            target->GetArchitecture().GetAddressByteSize();
-        result.GetOutputStream().SetAddressByteSize(addr_byte_size);
-        result.GetErrorStream().SetAddressByteSize(addr_byte_size);
-      }
-      // Dump all sections for all modules images
-      Stream &strm = result.GetOutputStream();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
+    result.GetOutputStream().SetAddressByteSize(addr_byte_size);
+    result.GetErrorStream().SetAddressByteSize(addr_byte_size);
+    // Dump all sections for all modules images
+    Stream &strm = result.GetOutputStream();
 
-      if (m_options.m_module_addr != LLDB_INVALID_ADDRESS) {
-        if (target) {
-          Address module_address;
-          if (module_address.SetLoadAddress(m_options.m_module_addr, target)) {
-            ModuleSP module_sp(module_address.GetModule());
-            if (module_sp) {
-              PrintModule(target, module_sp.get(), 0, strm);
-              result.SetStatus(eReturnStatusSuccessFinishResult);
-            } else {
-              result.AppendErrorWithFormat(
-                  "Couldn't find module matching address: 0x%" PRIx64 ".",
-                  m_options.m_module_addr);
-            }
-          } else {
-            result.AppendErrorWithFormat(
-                "Couldn't find module containing address: 0x%" PRIx64 ".",
-                m_options.m_module_addr);
-          }
+    if (m_options.m_module_addr != LLDB_INVALID_ADDRESS) {
+      Address module_address;
+      if (module_address.SetLoadAddress(m_options.m_module_addr, &target)) {
+        ModuleSP module_sp(module_address.GetModule());
+        if (module_sp) {
+          PrintModule(target, module_sp.get(), 0, strm);
+          result.SetStatus(eReturnStatusSuccessFinishResult);
         } else {
-          result.AppendError(
-              "Can only look up modules by address with a valid target.");
+          result.AppendErrorWithFormat(
+              "Couldn't find module matching address: 0x%" PRIx64 ".",
+              m_options.m_module_addr);
         }
-        return;
+      } else {
+        result.AppendErrorWithFormat(
+            "Couldn't find module containing address: 0x%" PRIx64 ".",
+            m_options.m_module_addr);
       }
+      return;
+    }
 
       size_t num_modules = 0;
 
@@ -3227,13 +3213,13 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
           guard.lock();
           num_modules = Module::GetNumberAllocatedModules();
         } else {
-          module_list_ptr = &target->GetImages();
+          module_list_ptr = &target.GetImages();
         }
       } else {
         for (const Args::ArgEntry &arg : command) {
           // Dump specified images (by basename or fullpath)
           const size_t num_matches = FindModulesByName(
-              target, arg.c_str(), module_list, use_global_module_list);
+              &target, arg.c_str(), module_list, use_global_module_list);
           if (num_matches == 0) {
             if (argc == 1) {
               result.AppendErrorWithFormat("no modules found that match '%s'",
@@ -3286,10 +3272,9 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
         }
         return;
       }
-    }
   }
 
-  void PrintModule(Target *target, Module *module, int indent, Stream &strm) {
+  void PrintModule(Target &target, Module *module, int indent, Stream &strm) {
     if (module == nullptr) {
       strm.PutCString("Null module");
       return;
@@ -3338,17 +3323,16 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
         // Image header address
         {
           uint32_t addr_nibble_width =
-              target ? (target->GetArchitecture().GetAddressByteSize() * 2)
-                     : 16;
+              target.GetArchitecture().GetAddressByteSize() * 2;
 
           ObjectFile *objfile = module->GetObjectFile();
           if (objfile) {
             Address base_addr(objfile->GetBaseAddress());
             if (base_addr.IsValid()) {
-              if (target && !target->GetSectionLoadList().IsEmpty()) {
-                lldb::addr_t load_addr = base_addr.GetLoadAddress(target);
+              if (!target.GetSectionLoadList().IsEmpty()) {
+                lldb::addr_t load_addr = base_addr.GetLoadAddress(&target);
                 if (load_addr == LLDB_INVALID_ADDRESS) {
-                  base_addr.Dump(&strm, target,
+                  base_addr.Dump(&strm, &target,
                                  Address::DumpStyleModuleWithFileAddress,
                                  Address::DumpStyleFileAddress);
                 } else {
@@ -3367,7 +3351,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
               }
               // The address was valid, but the image isn't loaded, output the
               // address in an appropriate format
-              base_addr.Dump(&strm, target, Address::DumpStyleFileAddress);
+              base_addr.Dump(&strm, &target, Address::DumpStyleFileAddress);
               break;
             }
           }
@@ -3969,7 +3953,7 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed {
       return false;
     case eLookupTypeType:
       if (!m_options.m_str.empty()) {
-        if (LookupTypeHere(&GetSelectedTarget(), m_interpreter,
+        if (LookupTypeHere(&GetTarget(), m_interpreter,
                            result.GetOutputStream(), *sym_ctx.module_sp,
                            m_options.m_str.c_str(), m_options.m_use_regex)) {
           result.SetStatus(eReturnStatusSuccessFinishResult);
@@ -4048,8 +4032,8 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed {
     case eLookupTypeType:
       if (!m_options.m_str.empty()) {
         if (LookupTypeInModule(
-                &GetSelectedTarget(), m_interpreter, result.GetOutputStream(),
-                module, m_options.m_str.c_str(), m_options.m_use_regex)) {
+                &GetTarget(), m_interpreter, result.GetOutputStream(), module,
+                m_options.m_str.c_str(), m_options.m_use_regex)) {
           result.SetStatus(eReturnStatusSuccessFinishResult);
           return true;
         }
@@ -4070,11 +4054,11 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     bool syntax_error = false;
     uint32_t i;
     uint32_t num_successful_lookups = 0;
-    uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+    uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
     result.GetOutputStream().SetAddressByteSize(addr_byte_size);
     result.GetErrorStream().SetAddressByteSize(addr_byte_size);
     // Dump all sections for all modules images
@@ -4096,7 +4080,7 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed {
 
       // Dump all sections for all other modules
 
-      const ModuleList &target_modules = target->GetImages();
+      const ModuleList &target_modules = target.GetImages();
       std::lock_guard<std::recursive_mutex> guard(target_modules.GetMutex());
       if (target_modules.GetSize() == 0) {
         result.AppendError("the target has no associated executable images");
@@ -4119,7 +4103,7 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed {
            ++i) {
         ModuleList module_list;
         const size_t num_matches =
-            FindModulesByName(target, arg_cstr, module_list, false);
+            FindModulesByName(&target, arg_cstr, module_list, false);
         if (num_matches > 0) {
           for (size_t j = 0; j < num_matches; ++j) {
             Module *module = module_list.GetModulePointerAtIndex(j);
@@ -4937,10 +4921,7 @@ Filter Options:
                            m_stop_hook_sp->GetID());
           error_sp->Flush();
         }
-        Target *target = GetDebugger().GetSelectedTarget().get();
-        if (target) {
-          target->UndoCreateStopHook(m_stop_hook_sp->GetID());
-        }
+        GetTarget().UndoCreateStopHook(m_stop_hook_sp->GetID());
       } else {
         // The IOHandler editor is only for command lines stop hooks:
         Target::StopHookCommandLine *hook_ptr =
@@ -4962,7 +4943,7 @@ Filter Options:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     m_stop_hook_sp.reset();
 
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
     Target::StopHookSP new_hook_sp =
         target.CreateStopHook(m_python_class_options.GetName().empty() ?
                                Target::StopHook::StopHookKind::CommandBased
@@ -5099,7 +5080,7 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
     // FIXME: see if we can use the breakpoint id style parser?
     size_t num_args = command.GetArgumentCount();
     if (num_args == 0) {
@@ -5153,7 +5134,7 @@ class CommandObjectTargetStopHookEnableDisable : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
     // FIXME: see if we can use the breakpoint id style parser?
     size_t num_args = command.GetArgumentCount();
     bool success;
@@ -5197,7 +5178,7 @@ class CommandObjectTargetStopHookList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     size_t num_hooks = target.GetNumStopHooks();
     if (num_hooks == 0) {
@@ -5263,7 +5244,7 @@ class CommandObjectTargetDumpTypesystem : public CommandObjectParsed {
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     // Go over every scratch TypeSystem and dump to the command output.
-    for (lldb::TypeSystemSP ts : GetSelectedTarget().GetScratchTypeSystems())
+    for (lldb::TypeSystemSP ts : GetTarget().GetScratchTypeSystems())
       if (ts)
         ts->Dump(result.GetOutputStream().AsRawOstream());
 
@@ -5287,7 +5268,7 @@ class CommandObjectTargetDumpSectionLoadList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target &target = GetSelectedTarget();
+    Target &target = GetTarget();
     target.GetSectionLoadList().Dump(result.GetOutputStream(), &target);
     result.SetStatus(eReturnStatusSuccessFinishResult);
   }
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index 4398cf3c3b89e..366b6dd965b38 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -882,7 +882,7 @@ class CommandObjectThreadUntil : public CommandObjectParsed {
   void DoExecute(Args &command, CommandReturnObject &result) override {
     bool synchronous_execution = m_interpreter.GetSynchronous();
 
-    Target *target = &GetSelectedTarget();
+    Target *target = &GetTarget();
 
     Process *process = m_exe_ctx.GetProcessPtr();
     if (process == nullptr) {
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp
index f123211e72377..126982dfddf88 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp
@@ -39,10 +39,10 @@ static void AddWatchpointDescription(Stream &s, Watchpoint &wp,
   s.EOL();
 }
 
-static bool CheckTargetForWatchpointOperations(Target *target,
+static bool CheckTargetForWatchpointOperations(Target &target,
                                                CommandReturnObject &result) {
   bool process_is_valid =
-      target->GetProcessSP() && target->GetProcessSP()->IsAlive();
+      target.GetProcessSP() && target.GetProcessSP()->IsAlive();
   if (!process_is_valid) {
     result.AppendError("There's no process or it is not alive.");
     return false;
@@ -67,12 +67,10 @@ static int32_t WithRSAIndex(llvm::StringRef Arg) {
 // Return true if wp_ids is successfully populated with the watch ids. False
 // otherwise.
 bool CommandObjectMultiwordWatchpoint::VerifyWatchpointIDs(
-    Target *target, Args &args, std::vector<uint32_t> &wp_ids) {
+    Target &target, Args &args, std::vector<uint32_t> &wp_ids) {
   // Pre-condition: args.GetArgumentCount() > 0.
   if (args.GetArgumentCount() == 0) {
-    if (target == nullptr)
-      return false;
-    WatchpointSP watch_sp = target->GetLastCreatedWatchpoint();
+    WatchpointSP watch_sp = target.GetLastCreatedWatchpoint();
     if (watch_sp) {
       wp_ids.push_back(watch_sp->GetID());
       return true;
@@ -203,22 +201,24 @@ class CommandObjectWatchpointList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    if (target->GetProcessSP() && target->GetProcessSP()->IsAlive()) {
-      std::optional<uint32_t> num_supported_hardware_watchpoints =
-          target->GetProcessSP()->GetWatchpointSlotCount();
+    if (ProcessSP process_sp = target.GetProcessSP()) {
+      if (process_sp->IsAlive()) {
+        std::optional<uint32_t> num_supported_hardware_watchpoints =
+            process_sp->GetWatchpointSlotCount();
 
-      if (num_supported_hardware_watchpoints)
-        result.AppendMessageWithFormat(
-            "Number of supported hardware watchpoints: %u\n",
-            *num_supported_hardware_watchpoints);
+        if (num_supported_hardware_watchpoints)
+          result.AppendMessageWithFormat(
+              "Number of supported hardware watchpoints: %u\n",
+              *num_supported_hardware_watchpoints);
+      }
     }
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
     size_t num_watchpoints = watchpoints.GetSize();
 
@@ -286,14 +286,14 @@ class CommandObjectWatchpointEnable : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (!CheckTargetForWatchpointOperations(target, result))
       return;
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
 
     size_t num_watchpoints = watchpoints.GetSize();
 
@@ -304,7 +304,7 @@ class CommandObjectWatchpointEnable : public CommandObjectParsed {
 
     if (command.GetArgumentCount() == 0) {
       // No watchpoint selected; enable all currently set watchpoints.
-      target->EnableAllWatchpoints();
+      target.EnableAllWatchpoints();
       result.AppendMessageWithFormat("All watchpoints enabled. (%" PRIu64
                                      " watchpoints)\n",
                                      (uint64_t)num_watchpoints);
@@ -321,7 +321,7 @@ class CommandObjectWatchpointEnable : public CommandObjectParsed {
       int count = 0;
       const size_t size = wp_ids.size();
       for (size_t i = 0; i < size; ++i)
-        if (target->EnableWatchpointByID(wp_ids[i]))
+        if (target.EnableWatchpointByID(wp_ids[i]))
           ++count;
       result.AppendMessageWithFormat("%d watchpoints enabled.\n", count);
       result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -355,14 +355,14 @@ class CommandObjectWatchpointDisable : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (!CheckTargetForWatchpointOperations(target, result))
       return;
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
     size_t num_watchpoints = watchpoints.GetSize();
 
     if (num_watchpoints == 0) {
@@ -372,7 +372,7 @@ class CommandObjectWatchpointDisable : public CommandObjectParsed {
 
     if (command.GetArgumentCount() == 0) {
       // No watchpoint selected; disable all currently set watchpoints.
-      if (target->DisableAllWatchpoints()) {
+      if (target.DisableAllWatchpoints()) {
         result.AppendMessageWithFormat("All watchpoints disabled. (%" PRIu64
                                        " watchpoints)\n",
                                        (uint64_t)num_watchpoints);
@@ -392,7 +392,7 @@ class CommandObjectWatchpointDisable : public CommandObjectParsed {
       int count = 0;
       const size_t size = wp_ids.size();
       for (size_t i = 0; i < size; ++i)
-        if (target->DisableWatchpointByID(wp_ids[i]))
+        if (target.DisableWatchpointByID(wp_ids[i]))
           ++count;
       result.AppendMessageWithFormat("%d watchpoints disabled.\n", count);
       result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -464,14 +464,14 @@ class CommandObjectWatchpointDelete : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (!CheckTargetForWatchpointOperations(target, result))
       return;
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
 
     size_t num_watchpoints = watchpoints.GetSize();
 
@@ -487,7 +487,7 @@ class CommandObjectWatchpointDelete : public CommandObjectParsed {
               true)) {
         result.AppendMessage("Operation cancelled...");
       } else {
-        target->RemoveAllWatchpoints();
+        target.RemoveAllWatchpoints();
         result.AppendMessageWithFormat("All watchpoints removed. (%" PRIu64
                                        " watchpoints)\n",
                                        (uint64_t)num_watchpoints);
@@ -507,7 +507,7 @@ class CommandObjectWatchpointDelete : public CommandObjectParsed {
     int count = 0;
     const size_t size = wp_ids.size();
     for (size_t i = 0; i < size; ++i)
-      if (target->RemoveWatchpointByID(wp_ids[i]))
+      if (target.RemoveWatchpointByID(wp_ids[i]))
         ++count;
     result.AppendMessageWithFormat("%d watchpoints deleted.\n", count);
     result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -584,14 +584,14 @@ class CommandObjectWatchpointIgnore : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (!CheckTargetForWatchpointOperations(target, result))
       return;
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
 
     size_t num_watchpoints = watchpoints.GetSize();
 
@@ -601,7 +601,7 @@ class CommandObjectWatchpointIgnore : public CommandObjectParsed {
     }
 
     if (command.GetArgumentCount() == 0) {
-      target->IgnoreAllWatchpoints(m_options.m_ignore_count);
+      target.IgnoreAllWatchpoints(m_options.m_ignore_count);
       result.AppendMessageWithFormat("All watchpoints ignored. (%" PRIu64
                                      " watchpoints)\n",
                                      (uint64_t)num_watchpoints);
@@ -618,7 +618,7 @@ class CommandObjectWatchpointIgnore : public CommandObjectParsed {
       int count = 0;
       const size_t size = wp_ids.size();
       for (size_t i = 0; i < size; ++i)
-        if (target->IgnoreWatchpointByID(wp_ids[i], m_options.m_ignore_count))
+        if (target.IgnoreWatchpointByID(wp_ids[i], m_options.m_ignore_count))
           ++count;
       result.AppendMessageWithFormat("%d watchpoints ignored.\n", count);
       result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -703,14 +703,14 @@ class CommandObjectWatchpointModify : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
     if (!CheckTargetForWatchpointOperations(target, result))
       return;
 
     std::unique_lock<std::recursive_mutex> lock;
-    target->GetWatchpointList().GetListMutex(lock);
+    target.GetWatchpointList().GetListMutex(lock);
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
 
     size_t num_watchpoints = watchpoints.GetSize();
 
@@ -720,7 +720,7 @@ class CommandObjectWatchpointModify : public CommandObjectParsed {
     }
 
     if (command.GetArgumentCount() == 0) {
-      WatchpointSP watch_sp = target->GetLastCreatedWatchpoint();
+      WatchpointSP watch_sp = target.GetLastCreatedWatchpoint();
       watch_sp->SetCondition(m_options.m_condition.c_str());
       result.SetStatus(eReturnStatusSuccessFinishNoResult);
     } else {
@@ -804,7 +804,7 @@ corresponding to the byte size of the data type.");
   }
 
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = GetDebugger().GetSelectedTarget().get();
+    Target &target = GetTarget();
     StackFrame *frame = m_exe_ctx.GetFramePtr();
 
     // If no argument is present, issue an error message.  There's no way to
@@ -852,8 +852,8 @@ corresponding to the byte size of the data type.");
 
       Status error(Variable::GetValuesForVariableExpressionPath(
           command.GetArgumentAtIndex(0),
-          m_exe_ctx.GetBestExecutionContextScope(), GetVariableCallback, target,
-          variable_list, valobj_list));
+          m_exe_ctx.GetBestExecutionContextScope(), GetVariableCallback,
+          &target, variable_list, valobj_list));
 
       if (valobj_list.GetSize())
         valobj_sp = valobj_list.GetValueObjectAtIndex(0);
@@ -904,7 +904,7 @@ corresponding to the byte size of the data type.");
 
     error.Clear();
     WatchpointSP watch_sp =
-        target->CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
+        target.CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
     if (!watch_sp) {
       result.AppendErrorWithFormat(
           "Watchpoint creation failed (addr=0x%" PRIx64 ", size=%" PRIu64
@@ -991,7 +991,7 @@ class CommandObjectWatchpointSetExpression : public CommandObjectRaw {
     m_option_group.NotifyOptionParsingStarting(
         &exe_ctx); // This is a raw command, so notify the option group
 
-    Target *target = GetDebugger().GetSelectedTarget().get();
+    Target &target = GetTarget();
     StackFrame *frame = m_exe_ctx.GetFramePtr();
 
     OptionsWithRaw args(raw_command);
@@ -1034,7 +1034,7 @@ class CommandObjectWatchpointSetExpression : public CommandObjectRaw {
       options.SetLanguage(m_option_watchpoint.language_type);
 
     ExpressionResults expr_result =
-        target->EvaluateExpression(expr, frame, valobj_sp, options);
+        target.EvaluateExpression(expr, frame, valobj_sp, options);
     if (expr_result != eExpressionCompleted) {
       result.AppendError("expression evaluation of address to watch failed");
       result.AppendErrorWithFormat("expression evaluated: \n%s", expr.data());
@@ -1054,7 +1054,7 @@ class CommandObjectWatchpointSetExpression : public CommandObjectRaw {
     if (m_option_watchpoint.watch_size.GetCurrentValue() != 0)
       size = m_option_watchpoint.watch_size.GetCurrentValue();
     else
-      size = target->GetArchitecture().GetAddressByteSize();
+      size = target.GetArchitecture().GetAddressByteSize();
 
     // Now it's time to create the watchpoint.
     uint32_t watch_type;
@@ -1095,7 +1095,7 @@ class CommandObjectWatchpointSetExpression : public CommandObjectRaw {
 
     Status error;
     WatchpointSP watch_sp =
-        target->CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
+        target.CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
     if (watch_sp) {
       watch_sp->SetWatchSpec(std::string(expr));
       Stream &output_stream = result.GetOutputStream();
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.h b/lldb/source/Commands/CommandObjectWatchpoint.h
index 87f9f4383bd27..a68491103ef51 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.h
+++ b/lldb/source/Commands/CommandObjectWatchpoint.h
@@ -22,7 +22,7 @@ class CommandObjectMultiwordWatchpoint : public CommandObjectMultiword {
 
   ~CommandObjectMultiwordWatchpoint() override;
 
-  static bool VerifyWatchpointIDs(Target *target, Args &args,
+  static bool VerifyWatchpointIDs(Target &target, Args &args,
                                   std::vector<uint32_t> &wp_ids);
 };
 
diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
index aaf14540cb282..cc4cb76764866 100644
--- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
@@ -355,9 +355,9 @@ are no syntax errors may indicate that a function was declared but never called.
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
     size_t num_watchpoints = watchpoints.GetSize();
 
     if (num_watchpoints == 0) {
@@ -384,7 +384,7 @@ are no syntax errors may indicate that a function was declared but never called.
     for (size_t i = 0; i < count; ++i) {
       uint32_t cur_wp_id = valid_wp_ids.at(i);
       if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
-        Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+        Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
         // Sanity check wp first.
         if (wp == nullptr)
           continue;
@@ -450,9 +450,9 @@ class CommandObjectWatchpointCommandDelete : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
     size_t num_watchpoints = watchpoints.GetSize();
 
     if (num_watchpoints == 0) {
@@ -478,7 +478,7 @@ class CommandObjectWatchpointCommandDelete : public CommandObjectParsed {
     for (size_t i = 0; i < count; ++i) {
       uint32_t cur_wp_id = valid_wp_ids.at(i);
       if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
-        Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+        Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
         if (wp)
           wp->ClearCallback();
       } else {
@@ -505,9 +505,9 @@ class CommandObjectWatchpointCommandList : public CommandObjectParsed {
 
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
-    Target *target = &GetSelectedTarget();
+    Target &target = GetTarget();
 
-    const WatchpointList &watchpoints = target->GetWatchpointList();
+    const WatchpointList &watchpoints = target.GetWatchpointList();
     size_t num_watchpoints = watchpoints.GetSize();
 
     if (num_watchpoints == 0) {
@@ -533,7 +533,7 @@ class CommandObjectWatchpointCommandList : public CommandObjectParsed {
     for (size_t i = 0; i < count; ++i) {
       uint32_t cur_wp_id = valid_wp_ids.at(i);
       if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
-        Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+        Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
 
         if (wp) {
           const WatchpointOptions *wp_options = wp->GetOptions();
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 24e97f3bb97d3..6e5ed21b22ad8 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -841,6 +841,12 @@ let Command = "scripting run" in {
     " language. If none is specific the default scripting language is used.">;
 }
 
+let Command = "scripting template list" in {
+  def scripting_template_list_language : Option<"language", "l">,
+    EnumArg<"ScriptLang">, Desc<"Specify the scripting "
+    " language. If none is specified the default scripting language is used.">;
+}
+
 let Command = "source info" in {
   def source_info_count : Option<"count", "c">, Arg<"Count">,
     Desc<"The number of line entries to display.">;
diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index 9c105b3f0e57a..f9d7832254f46 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -131,7 +131,8 @@ Module *Module::GetAllocatedModuleAtIndex(size_t idx) {
 }
 
 Module::Module(const ModuleSpec &module_spec)
-    : m_file_has_changed(false), m_first_file_changed_log(false) {
+    : m_unwind_table(*this), m_file_has_changed(false),
+      m_first_file_changed_log(false) {
   // Scope for locker below...
   {
     std::lock_guard<std::recursive_mutex> guard(
@@ -238,7 +239,8 @@ Module::Module(const FileSpec &file_spec, const ArchSpec &arch,
     : m_mod_time(FileSystem::Instance().GetModificationTime(file_spec)),
       m_arch(arch), m_file(file_spec), m_object_name(object_name),
       m_object_offset(object_offset), m_object_mod_time(object_mod_time),
-      m_file_has_changed(false), m_first_file_changed_log(false) {
+      m_unwind_table(*this), m_file_has_changed(false),
+      m_first_file_changed_log(false) {
   // Scope for locker below...
   {
     std::lock_guard<std::recursive_mutex> guard(
@@ -254,7 +256,9 @@ Module::Module(const FileSpec &file_spec, const ArchSpec &arch,
               m_object_name.AsCString(""), m_object_name.IsEmpty() ? "" : ")");
 }
 
-Module::Module() : m_file_has_changed(false), m_first_file_changed_log(false) {
+Module::Module()
+    : m_unwind_table(*this), m_file_has_changed(false),
+      m_first_file_changed_log(false) {
   std::lock_guard<std::recursive_mutex> guard(
       GetAllocationModuleCollectionMutex());
   GetModuleCollection().push_back(this);
@@ -323,6 +327,8 @@ ObjectFile *Module::GetMemoryObjectFile(const lldb::ProcessSP &process_sp,
           // Augment the arch with the target's information in case
           // we are unable to extract the os/environment from memory.
           m_arch.MergeFrom(process_sp->GetTarget().GetArchitecture());
+
+          m_unwind_table.ModuleWasUpdated();
         } else {
           error.SetErrorString("unable to find suitable object file plug-in");
         }
@@ -1009,8 +1015,7 @@ SymbolFile *Module::GetSymbolFile(bool can_create, Stream *feedback_strm) {
         m_symfile_up.reset(
             SymbolVendor::FindPlugin(shared_from_this(), feedback_strm));
         m_did_load_symfile = true;
-        if (m_unwind_table)
-          m_unwind_table->Update();
+        m_unwind_table.ModuleWasUpdated();
       }
     }
   }
@@ -1210,6 +1215,8 @@ ObjectFile *Module::GetObjectFile() {
           // more specific than the generic COFF architecture, only merge in
           // those values that overwrite unspecified unknown values.
           m_arch.MergeFrom(m_objfile_sp->GetArchitecture());
+
+          m_unwind_table.ModuleWasUpdated();
         } else {
           ReportError("failed to load objfile for {0}\nDebugging will be "
                       "degraded for this module.",
@@ -1240,12 +1247,9 @@ void Module::SectionFileAddressesChanged() {
 }
 
 UnwindTable &Module::GetUnwindTable() {
-  if (!m_unwind_table) {
-    if (!m_symfile_spec)
-      SymbolLocator::DownloadSymbolFileAsync(GetUUID());
-    m_unwind_table.emplace(*this);
-  }
-  return *m_unwind_table;
+  if (!m_symfile_spec)
+    SymbolLocator::DownloadSymbolFileAsync(GetUUID());
+  return m_unwind_table;
 }
 
 SectionList *Module::GetUnifiedSectionList() {
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index b428370d7f333..01bee8680b7ba 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -12,6 +12,7 @@
 #include "lldb/Host/FileSystem.h"
 #include "lldb/Host/HostInfo.h"
 #include "lldb/Interpreter/OptionValueProperties.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Utility/FileSpec.h"
 #include "lldb/Utility/Status.h"
@@ -639,6 +640,18 @@ static ObjectFileInstances &GetObjectFileInstances() {
   return g_instances;
 }
 
+bool PluginManager::IsRegisteredObjectFilePluginName(llvm::StringRef name) {
+  if (name.empty())
+    return false;
+
+  const auto &instances = GetObjectFileInstances().GetInstances();
+  for (auto &instance : instances) {
+    if (instance.name == name)
+      return true;
+  }
+  return false;
+}
+
 bool PluginManager::RegisterPlugin(
     llvm::StringRef name, llvm::StringRef description,
     ObjectFileCreateInstance create_callback,
@@ -689,12 +702,22 @@ PluginManager::GetObjectFileCreateMemoryCallbackForPluginName(
 }
 
 Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp,
-                               const FileSpec &outfile,
-                               lldb::SaveCoreStyle &core_style,
-                               llvm::StringRef plugin_name) {
-  if (plugin_name.empty()) {
+                               const lldb_private::SaveCoreOptions &options) {
+  Status error;
+  if (!options.GetOutputFile()) {
+    error.SetErrorString("No output file specified");
+    return error;
+  }
+
+  if (!process_sp) {
+    error.SetErrorString("Invalid process");
+    return error;
+  }
+
+  if (!options.GetPluginName().has_value()) {
     // Try saving core directly from the process plugin first.
-    llvm::Expected<bool> ret = process_sp->SaveCore(outfile.GetPath());
+    llvm::Expected<bool> ret =
+        process_sp->SaveCore(options.GetOutputFile()->GetPath());
     if (!ret)
       return Status(ret.takeError());
     if (ret.get())
@@ -702,17 +725,20 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp,
   }
 
   // Fall back to object plugins.
-  Status error;
+  const auto &plugin_name = options.GetPluginName().value_or("");
   auto &instances = GetObjectFileInstances().GetInstances();
   for (auto &instance : instances) {
     if (plugin_name.empty() || instance.name == plugin_name) {
-      if (instance.save_core &&
-          instance.save_core(process_sp, outfile, core_style, error))
+      if (instance.save_core && instance.save_core(process_sp, options, error))
         return error;
     }
   }
-  error.SetErrorString(
-      "no ObjectFile plugins were able to save a core for this process");
+
+  // Check to see if any of the object file plugins tried and failed to save.
+  // If none ran, set the error message.
+  if (error.Success())
+    error.SetErrorString(
+        "no ObjectFile plugins were able to save a core for this process");
   return error;
 }
 
@@ -1479,6 +1505,70 @@ LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForExpressions() {
   return all;
 }
 
+#pragma mark ScriptedInterfaces
+
+struct ScriptedInterfaceInstance
+    : public PluginInstance<ScriptedInterfaceCreateInstance> {
+  ScriptedInterfaceInstance(llvm::StringRef name, llvm::StringRef description,
+                            ScriptedInterfaceCreateInstance create_callback,
+                            lldb::ScriptLanguage language,
+                            ScriptedInterfaceUsages usages)
+      : PluginInstance<ScriptedInterfaceCreateInstance>(name, description,
+                                                        create_callback),
+        language(language), usages(usages) {}
+
+  lldb::ScriptLanguage language;
+  ScriptedInterfaceUsages usages;
+};
+
+typedef PluginInstances<ScriptedInterfaceInstance> ScriptedInterfaceInstances;
+
+static ScriptedInterfaceInstances &GetScriptedInterfaceInstances() {
+  static ScriptedInterfaceInstances g_instances;
+  return g_instances;
+}
+
+bool PluginManager::RegisterPlugin(
+    llvm::StringRef name, llvm::StringRef description,
+    ScriptedInterfaceCreateInstance create_callback,
+    lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) {
+  return GetScriptedInterfaceInstances().RegisterPlugin(
+      name, description, create_callback, language, usages);
+}
+
+bool PluginManager::UnregisterPlugin(
+    ScriptedInterfaceCreateInstance create_callback) {
+  return GetScriptedInterfaceInstances().UnregisterPlugin(create_callback);
+}
+
+uint32_t PluginManager::GetNumScriptedInterfaces() {
+  return GetScriptedInterfaceInstances().GetInstances().size();
+}
+
+llvm::StringRef PluginManager::GetScriptedInterfaceNameAtIndex(uint32_t index) {
+  return GetScriptedInterfaceInstances().GetNameAtIndex(index);
+}
+
+llvm::StringRef
+PluginManager::GetScriptedInterfaceDescriptionAtIndex(uint32_t index) {
+  return GetScriptedInterfaceInstances().GetDescriptionAtIndex(index);
+}
+
+lldb::ScriptLanguage
+PluginManager::GetScriptedInterfaceLanguageAtIndex(uint32_t idx) {
+  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+  return idx < instances.size() ? instances[idx].language
+                                : ScriptLanguage::eScriptLanguageNone;
+}
+
+ScriptedInterfaceUsages
+PluginManager::GetScriptedInterfaceUsagesAtIndex(uint32_t idx) {
+  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+  if (idx >= instances.size())
+    return {};
+  return instances[idx].usages;
+}
+
 #pragma mark REPL
 
 struct REPLInstance : public PluginInstance<REPLCreateInstance> {
@@ -1539,6 +1629,7 @@ void PluginManager::DebuggerInitialize(Debugger &debugger) {
   GetOperatingSystemInstances().PerformDebuggerCallback(debugger);
   GetStructuredDataPluginInstances().PerformDebuggerCallback(debugger);
   GetTracePluginInstances().PerformDebuggerCallback(debugger);
+  GetScriptedInterfaceInstances().PerformDebuggerCallback(debugger);
 }
 
 // This is the preferred new way to register plugin specific settings.  e.g.
diff --git a/lldb/source/Expression/IRMemoryMap.cpp b/lldb/source/Expression/IRMemoryMap.cpp
index de631370bb048..0c1d9016616cb 100644
--- a/lldb/source/Expression/IRMemoryMap.cpp
+++ b/lldb/source/Expression/IRMemoryMap.cpp
@@ -84,7 +84,7 @@ lldb::addr_t IRMemoryMap::FindSpace(size_t size) {
   // any allocations.  Otherwise start at the beginning of memory.
 
   if (m_allocations.empty()) {
-    ret = 0x0;
+    ret = 0;
   } else {
     auto back = m_allocations.rbegin();
     lldb::addr_t addr = back->first;
@@ -116,10 +116,18 @@ lldb::addr_t IRMemoryMap::FindSpace(size_t size) {
     Status err = process_sp->GetMemoryRegionInfo(ret, region_info);
     if (err.Success()) {
       while (true) {
-        if (region_info.GetReadable() != MemoryRegionInfo::OptionalBool::eNo ||
-            region_info.GetWritable() != MemoryRegionInfo::OptionalBool::eNo ||
-            region_info.GetExecutable() !=
-                MemoryRegionInfo::OptionalBool::eNo) {
+        if (region_info.GetRange().GetRangeBase() == 0 &&
+            region_info.GetRange().GetRangeEnd() < end_of_memory) {
+          // Don't use a region that starts at address 0,
+          // it can make it harder to debug null dereference crashes
+          // in the inferior.
+          ret = region_info.GetRange().GetRangeEnd();
+        } else if (region_info.GetReadable() !=
+                       MemoryRegionInfo::OptionalBool::eNo ||
+                   region_info.GetWritable() !=
+                       MemoryRegionInfo::OptionalBool::eNo ||
+                   region_info.GetExecutable() !=
+                       MemoryRegionInfo::OptionalBool::eNo) {
           if (region_info.GetRange().GetRangeEnd() - 1 >= end_of_memory) {
             ret = LLDB_INVALID_ADDRESS;
             break;
diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp
index 06ccc0e2b3424..e03d36e9cad4a 100644
--- a/lldb/source/Host/common/Host.cpp
+++ b/lldb/source/Host/common/Host.cpp
@@ -93,9 +93,8 @@ using namespace lldb_private;
 #include <syslog.h>
 void Host::SystemLog(Severity severity, llvm::StringRef message) {
   static llvm::once_flag g_openlog_once;
-  llvm::call_once(g_openlog_once, [] {
-    openlog("lldb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_USER);
-  });
+  llvm::call_once(g_openlog_once,
+                  [] { openlog("lldb", LOG_PID | LOG_NDELAY, LOG_USER); });
   int level = LOG_DEBUG;
   switch (severity) {
   case lldb::eSeverityInfo:
diff --git a/lldb/source/Host/linux/Host.cpp b/lldb/source/Host/linux/Host.cpp
index 5545f9ef4d70e..0bc736d90ea76 100644
--- a/lldb/source/Host/linux/Host.cpp
+++ b/lldb/source/Host/linux/Host.cpp
@@ -51,11 +51,9 @@ enum class ProcessState {
   Zombie,
 };
 
-constexpr int task_comm_len = 16;
-
 struct StatFields {
   ::pid_t pid = LLDB_INVALID_PROCESS_ID;
-  char comm[task_comm_len];
+  // comm
   char state;
   ::pid_t ppid = LLDB_INVALID_PROCESS_ID;
   ::pid_t pgrp = LLDB_INVALID_PROCESS_ID;
@@ -100,8 +98,8 @@ static bool GetStatusInfo(::pid_t Pid, ProcessInstanceInfo &ProcessInfo,
   StatFields stat_fields;
   if (sscanf(
           Rest.data(),
-          "%d %s %c %d %d %d %d %d %u %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld",
-          &stat_fields.pid, stat_fields.comm, &stat_fields.state,
+          "%d %*s %c %d %d %d %d %d %u %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld",
+          &stat_fields.pid, /* comm, */ &stat_fields.state,
           &stat_fields.ppid, &stat_fields.pgrp, &stat_fields.session,
           &stat_fields.tty_nr, &stat_fields.tpgid, &stat_fields.flags,
           &stat_fields.minflt, &stat_fields.cminflt, &stat_fields.majflt,
diff --git a/lldb/source/Host/windows/Host.cpp b/lldb/source/Host/windows/Host.cpp
index 6908f0003eaf7..642092f61d924 100644
--- a/lldb/source/Host/windows/Host.cpp
+++ b/lldb/source/Host/windows/Host.cpp
@@ -103,7 +103,9 @@ lldb::thread_t Host::GetCurrentThread() {
 }
 
 void Host::Kill(lldb::pid_t pid, int signo) {
-  TerminateProcess((HANDLE)pid, 1);
+  AutoHandle handle(::OpenProcess(PROCESS_TERMINATE, FALSE, pid), nullptr);
+  if (handle.IsValid())
+    ::TerminateProcess(handle.get(), 1);
 }
 
 const char *Host::GetSignalAsCString(int signo) { return NULL; }
diff --git a/lldb/source/Interpreter/CMakeLists.txt b/lldb/source/Interpreter/CMakeLists.txt
index ae79b82d7c3e2..642263a8bda7f 100644
--- a/lldb/source/Interpreter/CMakeLists.txt
+++ b/lldb/source/Interpreter/CMakeLists.txt
@@ -6,6 +6,8 @@ lldb_tablegen(InterpreterPropertiesEnum.inc -gen-lldb-property-enum-defs
   SOURCE InterpreterProperties.td
   TARGET LLDBInterpreterPropertiesEnumGen)
 
+add_subdirectory(Interfaces)
+
 add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   CommandAlias.cpp
   CommandHistory.cpp
@@ -54,6 +56,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   ScriptInterpreter.cpp
 
   LINK_LIBS
+    lldbInterpreterInterfaces
     lldbCommands
     lldbCore
     lldbDataFormatters
@@ -66,6 +69,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   )
 
 add_dependencies(lldbInterpreter
+  lldbInterpreterInterfaces
   LLDBInterpreterPropertiesGen
   LLDBInterpreterPropertiesEnumGen)
 
diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index 4634b75c6a33a..c819024ccf018 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -758,17 +758,23 @@ Target &CommandObject::GetDummyTarget() {
   return m_interpreter.GetDebugger().GetDummyTarget();
 }
 
-Target &CommandObject::GetSelectedOrDummyTarget(bool prefer_dummy) {
-  return m_interpreter.GetDebugger().GetSelectedOrDummyTarget(prefer_dummy);
-}
-
-Target &CommandObject::GetSelectedTarget() {
-  assert(m_flags.AnySet(eCommandRequiresTarget | eCommandProcessMustBePaused |
-                        eCommandProcessMustBeLaunched | eCommandRequiresFrame |
-                        eCommandRequiresThread | eCommandRequiresProcess |
-                        eCommandRequiresRegContext) &&
-         "GetSelectedTarget called from object that may have no target");
-  return *m_interpreter.GetDebugger().GetSelectedTarget();
+Target &CommandObject::GetTarget() {
+  // Prefer the frozen execution context in the command object.
+  if (Target *target = m_exe_ctx.GetTargetPtr())
+    return *target;
+
+  // Fallback to the command interpreter's execution context in case we get
+  // called after DoExecute has finished. For example, when doing multi-line
+  // expression that uses an input reader or breakpoint callbacks.
+  if (Target *target = m_interpreter.GetExecutionContext().GetTargetPtr())
+    return *target;
+
+  // Finally, if we have no other target, get the selected target.
+  if (TargetSP target_sp = m_interpreter.GetDebugger().GetSelectedTarget())
+    return *target_sp;
+
+  // We only have the dummy target.
+  return GetDummyTarget();
 }
 
 Thread *CommandObject::GetDefaultThread() {
diff --git a/lldb/source/Interpreter/Interfaces/CMakeLists.txt b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
new file mode 100644
index 0000000000000..f44672aa50b75
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_lldb_library(lldbInterpreterInterfaces NO_PLUGIN_DEPENDENCIES
+  ScriptedInterfaceUsages.cpp
+
+  LINK_LIBS
+    lldbUtility
+
+  LINK_COMPONENTS
+    Support
+  )
+
diff --git a/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
new file mode 100644
index 0000000000000..05d7a5d852f8c
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
@@ -0,0 +1,37 @@
+//===-- ScriptedInterfaceUsages.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+void ScriptedInterfaceUsages::Dump(Stream &s, UsageKind kind) const {
+  s.IndentMore();
+  s.Indent();
+  llvm::StringRef usage_kind =
+      (kind == UsageKind::CommandInterpreter) ? "Command Interpreter" : "API";
+  s << usage_kind << " Usages:";
+  const std::vector<llvm::StringRef> &usages =
+      (kind == UsageKind::CommandInterpreter) ? GetCommandInterpreterUsages()
+                                              : GetSBAPIUsages();
+  if (usages.empty())
+    s << " None\n";
+  else if (usages.size() == 1)
+    s << " " << usages.front() << '\n';
+  else {
+    s << '\n';
+    for (llvm::StringRef usage : usages) {
+      s.IndentMore();
+      s.Indent();
+      s << usage << '\n';
+      s.IndentLess();
+    }
+  }
+  s.IndentLess();
+}
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
index 0e17d57fa9c4f..3863b6b3520db 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
@@ -609,6 +609,26 @@ void DynamicLoaderDarwin::UpdateDYLDImageInfoFromNewImageInfo(
   }
 }
 
+std::optional<lldb_private::Address> DynamicLoaderDarwin::GetStartAddress() {
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+
+  auto log_err = [log](llvm::StringLiteral err_msg) -> std::nullopt_t {
+    LLDB_LOGV(log, "{}", err_msg);
+    return std::nullopt;
+  };
+
+  ModuleSP dyld_sp = GetDYLDModule();
+  if (!dyld_sp)
+    return log_err("Couldn't retrieve DYLD module. Cannot get `start` symbol.");
+
+  const Symbol *symbol =
+      dyld_sp->FindFirstSymbolWithNameAndType(ConstString("_dyld_start"));
+  if (!symbol)
+    return log_err("Cannot find `start` symbol in DYLD module.");
+
+  return symbol->GetAddress();
+}
+
 void DynamicLoaderDarwin::SetDYLDModule(lldb::ModuleSP &dyld_module_sp) {
   m_dyld_module_wp = dyld_module_sp;
 }
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
index 8f9a29c94173f..3613c4c29b178 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
@@ -56,6 +56,8 @@ class DynamicLoaderDarwin : public lldb_private::DynamicLoader {
 
   virtual bool NeedToDoInitialImageFetch() = 0;
 
+  std::optional<lldb_private::Address> GetStartAddress() override;
+
 protected:
   void PrivateInitialize(lldb_private::Process *process);
 
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 51bd34e95c77d..890db5c274814 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -1696,7 +1696,6 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
   return llvm::StringSwitch<SectionType>(Name)
       .Case(".ARM.exidx", eSectionTypeARMexidx)
       .Case(".ARM.extab", eSectionTypeARMextab)
-      .Cases(".bss", ".tbss", eSectionTypeZeroFill)
       .Case(".ctf", eSectionTypeDebug)
       .Cases(".data", ".tdata", eSectionTypeData)
       .Case(".eh_frame", eSectionTypeEHFrame)
@@ -1713,6 +1712,10 @@ SectionType ObjectFileELF::GetSectionType(const ELFSectionHeaderInfo &H) const {
     if (H.sh_flags & SHF_EXECINSTR)
       return eSectionTypeCode;
     break;
+  case SHT_NOBITS:
+    if (H.sh_flags & SHF_ALLOC)
+      return eSectionTypeZeroFill;
+    break;
   case SHT_SYMTAB:
     return eSectionTypeELFSymbolTable;
   case SHT_DYNSYM:
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 0dcb1bed23548..ce095bcc48374 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -137,6 +137,9 @@ using namespace lldb;
 using namespace lldb_private;
 using namespace llvm::MachO;
 
+static constexpr llvm::StringLiteral g_loader_path = "@loader_path";
+static constexpr llvm::StringLiteral g_executable_path = "@executable_path";
+
 LLDB_PLUGIN_DEFINE(ObjectFileMachO)
 
 static void PrintRegisterValue(RegisterContext *reg_ctx, const char *name,
@@ -5116,123 +5119,121 @@ UUID ObjectFileMachO::GetUUID() {
 }
 
 uint32_t ObjectFileMachO::GetDependentModules(FileSpecList &files) {
+  ModuleSP module_sp = GetModule();
+  if (!module_sp)
+    return 0;
+
   uint32_t count = 0;
-  ModuleSP module_sp(GetModule());
-  if (module_sp) {
-    std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex());
-    llvm::MachO::load_command load_cmd;
-    lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic);
-    std::vector<std::string> rpath_paths;
-    std::vector<std::string> rpath_relative_paths;
-    std::vector<std::string> at_exec_relative_paths;
-    uint32_t i;
-    for (i = 0; i < m_header.ncmds; ++i) {
-      const uint32_t cmd_offset = offset;
-      if (m_data.GetU32(&offset, &load_cmd, 2) == nullptr)
-        break;
+  std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex());
+  llvm::MachO::load_command load_cmd;
+  lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic);
+  std::vector<std::string> rpath_paths;
+  std::vector<std::string> rpath_relative_paths;
+  std::vector<std::string> at_exec_relative_paths;
+  uint32_t i;
+  for (i = 0; i < m_header.ncmds; ++i) {
+    const uint32_t cmd_offset = offset;
+    if (m_data.GetU32(&offset, &load_cmd, 2) == nullptr)
+      break;
 
-      switch (load_cmd.cmd) {
-      case LC_RPATH:
-      case LC_LOAD_DYLIB:
-      case LC_LOAD_WEAK_DYLIB:
-      case LC_REEXPORT_DYLIB:
-      case LC_LOAD_DYLINKER:
-      case LC_LOADFVMLIB:
-      case LC_LOAD_UPWARD_DYLIB: {
-        uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
-        // For LC_LOAD_DYLIB there is an alternate encoding
-        // which adds a uint32_t `flags` field for `DYLD_USE_*`
-        // flags.  This can be detected by a timestamp field with
-        // the `DYLIB_USE_MARKER` constant value.
-        bool is_delayed_init = false;
-        uint32_t use_command_marker = m_data.GetU32(&offset);
-        if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
-          offset += 4; /* uint32_t current_version */
-          offset += 4; /* uint32_t compat_version */
-          uint32_t flags = m_data.GetU32(&offset);
-          // If this LC_LOAD_DYLIB is marked delay-init,
-          // don't report it as a dependent library -- it
-          // may be loaded in the process at some point,
-          // but will most likely not be load at launch.
-          if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
-            is_delayed_init = true;
-        }
-        const char *path = m_data.PeekCStr(name_offset);
-        if (path && !is_delayed_init) {
-          if (load_cmd.cmd == LC_RPATH)
-            rpath_paths.push_back(path);
-          else {
-            if (path[0] == '@') {
-              if (strncmp(path, "@rpath", strlen("@rpath")) == 0)
-                rpath_relative_paths.push_back(path + strlen("@rpath"));
-              else if (strncmp(path, "@executable_path",
-                               strlen("@executable_path")) == 0)
-                at_exec_relative_paths.push_back(path +
-                                                 strlen("@executable_path"));
-            } else {
-              FileSpec file_spec(path);
-              if (files.AppendIfUnique(file_spec))
-                count++;
-            }
+    switch (load_cmd.cmd) {
+    case LC_RPATH:
+    case LC_LOAD_DYLIB:
+    case LC_LOAD_WEAK_DYLIB:
+    case LC_REEXPORT_DYLIB:
+    case LC_LOAD_DYLINKER:
+    case LC_LOADFVMLIB:
+    case LC_LOAD_UPWARD_DYLIB: {
+      uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
+      // For LC_LOAD_DYLIB there is an alternate encoding
+      // which adds a uint32_t `flags` field for `DYLD_USE_*`
+      // flags.  This can be detected by a timestamp field with
+      // the `DYLIB_USE_MARKER` constant value.
+      bool is_delayed_init = false;
+      uint32_t use_command_marker = m_data.GetU32(&offset);
+      if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
+        offset += 4; /* uint32_t current_version */
+        offset += 4; /* uint32_t compat_version */
+        uint32_t flags = m_data.GetU32(&offset);
+        // If this LC_LOAD_DYLIB is marked delay-init,
+        // don't report it as a dependent library -- it
+        // may be loaded in the process at some point,
+        // but will most likely not be load at launch.
+        if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
+          is_delayed_init = true;
+      }
+      const char *path = m_data.PeekCStr(name_offset);
+      if (path && !is_delayed_init) {
+        if (load_cmd.cmd == LC_RPATH)
+          rpath_paths.push_back(path);
+        else {
+          if (path[0] == '@') {
+            if (strncmp(path, "@rpath", strlen("@rpath")) == 0)
+              rpath_relative_paths.push_back(path + strlen("@rpath"));
+            else if (strncmp(path, "@executable_path",
+                             strlen("@executable_path")) == 0)
+              at_exec_relative_paths.push_back(path +
+                                               strlen("@executable_path"));
+          } else {
+            FileSpec file_spec(path);
+            if (files.AppendIfUnique(file_spec))
+              count++;
           }
         }
-      } break;
-
-      default:
-        break;
       }
-      offset = cmd_offset + load_cmd.cmdsize;
-    }
+    } break;
 
-    FileSpec this_file_spec(m_file);
-    FileSystem::Instance().Resolve(this_file_spec);
-
-    if (!rpath_paths.empty()) {
-      // Fixup all LC_RPATH values to be absolute paths
-      std::string loader_path("@loader_path");
-      std::string executable_path("@executable_path");
-      for (auto &rpath : rpath_paths) {
-        if (llvm::StringRef(rpath).starts_with(loader_path)) {
-          rpath.erase(0, loader_path.size());
-          rpath.insert(0, this_file_spec.GetDirectory().GetCString());
-        } else if (llvm::StringRef(rpath).starts_with(executable_path)) {
-          rpath.erase(0, executable_path.size());
-          rpath.insert(0, this_file_spec.GetDirectory().GetCString());
-        }
-      }
+    default:
+      break;
+    }
+    offset = cmd_offset + load_cmd.cmdsize;
+  }
 
-      for (const auto &rpath_relative_path : rpath_relative_paths) {
-        for (const auto &rpath : rpath_paths) {
-          std::string path = rpath;
-          path += rpath_relative_path;
-          // It is OK to resolve this path because we must find a file on disk
-          // for us to accept it anyway if it is rpath relative.
-          FileSpec file_spec(path);
-          FileSystem::Instance().Resolve(file_spec);
-          if (FileSystem::Instance().Exists(file_spec) &&
-              files.AppendIfUnique(file_spec)) {
-            count++;
-            break;
-          }
-        }
-      }
+  FileSpec this_file_spec(m_file);
+  FileSystem::Instance().Resolve(this_file_spec);
+
+  if (!rpath_paths.empty()) {
+    // Fixup all LC_RPATH values to be absolute paths.
+    const std::string this_directory =
+        this_file_spec.GetDirectory().GetString();
+    for (auto &rpath : rpath_paths) {
+      if (llvm::StringRef(rpath).starts_with(g_loader_path))
+        rpath = this_directory + rpath.substr(g_loader_path.size());
+      else if (llvm::StringRef(rpath).starts_with(g_executable_path))
+        rpath = this_directory + rpath.substr(g_executable_path.size());
     }
 
-    // We may have @executable_paths but no RPATHS.  Figure those out here.
-    // Only do this if this object file is the executable.  We have no way to
-    // get back to the actual executable otherwise, so we won't get the right
-    // path.
-    if (!at_exec_relative_paths.empty() && CalculateType() == eTypeExecutable) {
-      FileSpec exec_dir = this_file_spec.CopyByRemovingLastPathComponent();
-      for (const auto &at_exec_relative_path : at_exec_relative_paths) {
-        FileSpec file_spec =
-            exec_dir.CopyByAppendingPathComponent(at_exec_relative_path);
+    for (const auto &rpath_relative_path : rpath_relative_paths) {
+      for (const auto &rpath : rpath_paths) {
+        std::string path = rpath;
+        path += rpath_relative_path;
+        // It is OK to resolve this path because we must find a file on disk
+        // for us to accept it anyway if it is rpath relative.
+        FileSpec file_spec(path);
+        FileSystem::Instance().Resolve(file_spec);
         if (FileSystem::Instance().Exists(file_spec) &&
-            files.AppendIfUnique(file_spec))
+            files.AppendIfUnique(file_spec)) {
           count++;
+          break;
+        }
       }
     }
   }
+
+  // We may have @executable_paths but no RPATHS.  Figure those out here.
+  // Only do this if this object file is the executable.  We have no way to
+  // get back to the actual executable otherwise, so we won't get the right
+  // path.
+  if (!at_exec_relative_paths.empty() && CalculateType() == eTypeExecutable) {
+    FileSpec exec_dir = this_file_spec.CopyByRemovingLastPathComponent();
+    for (const auto &at_exec_relative_path : at_exec_relative_paths) {
+      FileSpec file_spec =
+          exec_dir.CopyByAppendingPathComponent(at_exec_relative_path);
+      if (FileSystem::Instance().Exists(file_spec) &&
+          files.AppendIfUnique(file_spec))
+        count++;
+    }
+  }
   return count;
 }
 
@@ -6519,14 +6520,15 @@ struct page_object {
 };
 
 bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp,
-                               const FileSpec &outfile,
-                               lldb::SaveCoreStyle &core_style, Status &error) {
-  if (!process_sp)
-    return false;
-
-  // Default on macOS is to create a dirty-memory-only corefile.
+                               const lldb_private::SaveCoreOptions &options,
+                               Status &error) {
+  auto core_style = options.GetStyle();
   if (core_style == SaveCoreStyle::eSaveCoreUnspecified)
     core_style = SaveCoreStyle::eSaveCoreDirtyOnly;
+  // The FileSpec and Process are already checked in PluginManager::SaveCore.
+  assert(options.GetOutputFile().has_value());
+  assert(process_sp);
+  const FileSpec outfile = options.GetOutputFile().value();
 
   Target &target = process_sp->GetTarget();
   const ArchSpec target_arch = target.GetArchitecture();
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
index 55bc688126eb3..e7af90e28bc4b 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
@@ -62,8 +62,7 @@ class ObjectFileMachO : public lldb_private::ObjectFile {
                                         lldb_private::ModuleSpecList &specs);
 
   static bool SaveCore(const lldb::ProcessSP &process_sp,
-                       const lldb_private::FileSpec &outfile,
-                       lldb::SaveCoreStyle &core_style,
+                       const lldb_private::SaveCoreOptions &options,
                        lldb_private::Status &error);
 
   static bool MagicBytesMatch(lldb::DataBufferSP data_sp, lldb::addr_t offset,
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
index 7c875aa3223ed..faa144bfb5f6a 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
@@ -56,18 +56,20 @@ size_t ObjectFileMinidump::GetModuleSpecifications(
 }
 
 bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp,
-                                  const lldb_private::FileSpec &outfile,
-                                  lldb::SaveCoreStyle &core_style,
+                                  const lldb_private::SaveCoreOptions &options,
                                   lldb_private::Status &error) {
-  // Set default core style if it isn't set.
+  // Output file and process_sp are both checked in PluginManager::SaveCore.
+  assert(options.GetOutputFile().has_value());
+  assert(process_sp);
+
+  // Minidump defaults to stacks only.
+  SaveCoreStyle core_style = options.GetStyle();
   if (core_style == SaveCoreStyle::eSaveCoreUnspecified)
     core_style = SaveCoreStyle::eSaveCoreStackOnly;
 
-  if (!process_sp)
-    return false;
-
   llvm::Expected<lldb::FileUP> maybe_core_file = FileSystem::Instance().Open(
-      outfile, File::eOpenOptionWriteOnly | File::eOpenOptionCanCreate);
+      options.GetOutputFile().value(),
+      File::eOpenOptionWriteOnly | File::eOpenOptionCanCreate);
   if (!maybe_core_file) {
     error = maybe_core_file.takeError();
     return false;
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
index b5c40445fe742..0cd31a0e482d5 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
@@ -55,8 +55,7 @@ class ObjectFileMinidump : public lldb_private::PluginInterface {
 
   // Saves dump in Minidump file format
   static bool SaveCore(const lldb::ProcessSP &process_sp,
-                       const lldb_private::FileSpec &outfile,
-                       lldb::SaveCoreStyle &core_style,
+                       const lldb_private::SaveCoreOptions &options,
                        lldb_private::Status &error);
 
 private:
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index be0020cad5bee..bda691ade8af0 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -355,11 +355,12 @@ size_t ObjectFilePECOFF::GetModuleSpecifications(
 }
 
 bool ObjectFilePECOFF::SaveCore(const lldb::ProcessSP &process_sp,
-                                const lldb_private::FileSpec &outfile,
-                                lldb::SaveCoreStyle &core_style,
+                                const lldb_private::SaveCoreOptions &options,
                                 lldb_private::Status &error) {
-  core_style = eSaveCoreFull;
-  return SaveMiniDump(process_sp, outfile, error);
+  // Outfile and process_sp are validated by PluginManager::SaveCore
+  assert(options.GetOutputFile().has_value());
+  assert(process_sp);
+  return SaveMiniDump(process_sp, options, error);
 }
 
 bool ObjectFilePECOFF::MagicBytesMatch(DataBufferSP data_sp) {
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
index c59701fa65ec3..2eb2b3b774538 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
@@ -82,8 +82,7 @@ class ObjectFilePECOFF : public lldb_private::ObjectFile {
                                         lldb_private::ModuleSpecList &specs);
 
   static bool SaveCore(const lldb::ProcessSP &process_sp,
-                       const lldb_private::FileSpec &outfile,
-                       lldb::SaveCoreStyle &core_style,
+                       const lldb_private::SaveCoreOptions &options,
                        lldb_private::Status &error);
 
   static bool MagicBytesMatch(lldb::DataBufferSP data_sp);
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.cpp
index e5cb36910dd25..61cd74da22223 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.cpp
@@ -21,11 +21,12 @@
 namespace lldb_private {
 
 bool SaveMiniDump(const lldb::ProcessSP &process_sp,
-                  const lldb_private::FileSpec &outfile,
+                  const SaveCoreOptions &core_options,
                   lldb_private::Status &error) {
   if (!process_sp)
     return false;
 #ifdef _WIN32
+  const auto &outfile = core_options.GetOutputFile().value();
   HANDLE process_handle = ::OpenProcess(
       PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, process_sp->GetID());
   const std::string file_name = outfile.GetPath();
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.h b/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.h
index 04b93e221362a..03c0ece306143 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.h
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/WindowsMiniDump.h
@@ -14,7 +14,7 @@
 namespace lldb_private {
 
 bool SaveMiniDump(const lldb::ProcessSP &process_sp,
-                  const lldb_private::FileSpec &outfile,
+                  const SaveCoreOptions &core_options,
                   lldb_private::Status &error);
 
 } // namespace lldb_private
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm.h b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm.h
index 89ffa617294aa..b9537e6952f6c 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm.h
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm.h
@@ -30,7 +30,7 @@ class NativeProcessFreeBSD;
 class NativeRegisterContextFreeBSD_arm : public NativeRegisterContextFreeBSD {
 public:
   NativeRegisterContextFreeBSD_arm(const ArchSpec &target_arch,
-                                   NativeThreadProtocol &native_thread);
+                                   NativeThreadFreeBSD &native_thread);
 
   uint32_t GetRegisterSetCount() const override;
 
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_mips64.h b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_mips64.h
index 0b4a508a7d5dd..286b4fd8d8b99 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_mips64.h
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_mips64.h
@@ -31,7 +31,7 @@ class NativeRegisterContextFreeBSD_mips64
     : public NativeRegisterContextFreeBSD {
 public:
   NativeRegisterContextFreeBSD_mips64(const ArchSpec &target_arch,
-                                      NativeThreadProtocol &native_thread);
+                                      NativeThreadFreeBSD &native_thread);
 
   uint32_t GetRegisterSetCount() const override;
 
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_powerpc.h b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_powerpc.h
index 3df371036f915..420db822acc0f 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_powerpc.h
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_powerpc.h
@@ -31,7 +31,7 @@ class NativeRegisterContextFreeBSD_powerpc
     : public NativeRegisterContextFreeBSD {
 public:
   NativeRegisterContextFreeBSD_powerpc(const ArchSpec &target_arch,
-                                       NativeThreadProtocol &native_thread);
+                                       NativeThreadFreeBSD &native_thread);
 
   uint32_t GetRegisterSetCount() const override;
 
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 187370eb36cae..5d0a3e31d0437 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -1145,8 +1145,8 @@ Status GDBRemoteCommunication::StartDebugserverProcess(
       if (socket_pipe.CanWrite())
         socket_pipe.CloseWriteFileDescriptor();
       if (socket_pipe.CanRead()) {
-        char port_cstr[PATH_MAX] = {0};
-        port_cstr[0] = '\0';
+        // The port number may be up to "65535\0".
+        char port_cstr[6] = {0};
         size_t num_bytes = sizeof(port_cstr);
         // Read port from pipe with 10 second timeout.
         error = socket_pipe.ReadWithTimeout(
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp
index 5285ec1d3db4e..65f1cc12ba307 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp
@@ -46,11 +46,13 @@ using namespace lldb_private;
 
 GDBRemoteCommunicationServerPlatform::PortMap::PortMap(uint16_t min_port,
                                                        uint16_t max_port) {
+  assert(min_port);
   for (; min_port < max_port; ++min_port)
     m_port_map[min_port] = LLDB_INVALID_PROCESS_ID;
 }
 
 void GDBRemoteCommunicationServerPlatform::PortMap::AllowPort(uint16_t port) {
+  assert(port);
   // Do not modify existing mappings
   m_port_map.insert({port, LLDB_INVALID_PROCESS_ID});
 }
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
index c60e4bb503a37..8c7e92bead32c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
@@ -20,12 +20,9 @@ if (LLDB_ENABLE_LIBEDIT)
 endif()
 
 add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
-  OperatingSystemPythonInterface.cpp
   ScriptedPythonInterface.cpp
   ScriptedProcessPythonInterface.cpp
   ScriptedThreadPythonInterface.cpp
-  ScriptedThreadPlanPythonInterface.cpp
-  ScriptedPlatformPythonInterface.cpp
 
   LINK_LIBS
     lldbCore
@@ -38,3 +35,8 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
   LINK_COMPONENTS
     Support
   )
+
+add_subdirectory(OperatingSystemPythonInterface)
+add_subdirectory(ScriptedPlatformPythonInterface)
+add_subdirectory(ScriptedThreadPlanPythonInterface)
+
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..b48f1e818e5d5
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonOperatingSystemPythonInterface PLUGIN
+
+  OperatingSystemPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
similarity index 75%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
index c162c7367c654..019db269a905b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
@@ -13,11 +14,13 @@
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+//clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "OperatingSystemPythonInterface.h"
 
 using namespace lldb;
@@ -25,6 +28,8 @@ using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
+LLDB_PLUGIN_DEFINE_ADV(OperatingSystemPythonInterface, ScriptInterpreterPythonOperatingSystemPythonInterface)
+
 OperatingSystemPythonInterface::OperatingSystemPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : OperatingSystemInterface(), ScriptedThreadPythonInterface(interpreter) {}
@@ -79,4 +84,18 @@ OperatingSystemPythonInterface::GetRegisterContextForTID(lldb::tid_t tid) {
   return obj->GetAsString()->GetValue().str();
 }
 
+void OperatingSystemPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "settings set target.process.python-os-plugin-path <script-path>",
+      "settings set process.experimental.os-plugin-reports-all-threads [0/1]"};
+  const std::vector<llvm::StringRef> api_usages = {};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(), llvm::StringRef("Mock thread state"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void OperatingSystemPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
similarity index 83%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
index da7bbf13b1d55..6d60f8b437d1c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
@@ -10,17 +10,19 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedThreadPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
+#include "../ScriptedThreadPythonInterface.h"
+
 #include <optional>
 
 namespace lldb_private {
 class OperatingSystemPythonInterface
     : virtual public OperatingSystemInterface,
-      virtual public ScriptedThreadPythonInterface {
+      virtual public ScriptedThreadPythonInterface,
+      public PluginInterface {
 public:
   OperatingSystemPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -41,6 +43,16 @@ class OperatingSystemPythonInterface
   StructuredData::DictionarySP GetRegisterInfo() override;
 
   std::optional<std::string> GetRegisterContextForTID(lldb::tid_t tid) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "OperatingSystemPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..ae5e525229c02
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedPlatformPythonInterface PLUGIN
+
+  ScriptedPlatformPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
similarity index 83%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
index 6e93bec80056e..3586251bd4aac 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
@@ -6,27 +6,31 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
+#include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+//clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPlatformPythonInterface.h"
 
-#include "lldb/Target/ExecutionContext.h"
-
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
+LLDB_PLUGIN_DEFINE_ADV(ScriptedPlatformPythonInterface, ScriptInterpreterPythonScriptedPlatformPythonInterface)
+
 ScriptedPlatformPythonInterface::ScriptedPlatformPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedPlatformInterface(), ScriptedPythonInterface(interpreter) {}
@@ -93,4 +97,14 @@ Status ScriptedPlatformPythonInterface::KillProcess(lldb::pid_t pid) {
   return GetStatusFromMethod("kill_process", pid);
 }
 
+void ScriptedPlatformPythonInterface::Initialize() {
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(), "Mock platform and interact with its processes.",
+      CreateInstance, eScriptLanguagePython, {});
+}
+
+void ScriptedPlatformPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
similarity index 84%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
index 0842d3a003429..01ee40a5a197c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
@@ -10,15 +10,16 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
+#include "../ScriptedPythonInterface.h"
 
 namespace lldb_private {
 class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
-                                        public ScriptedPythonInterface {
+                                        public ScriptedPythonInterface,
+                                        public PluginInterface {
 public:
   ScriptedPlatformPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -43,6 +44,16 @@ class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
   Status LaunchProcess(lldb::ProcessLaunchInfoSP launch_info) override;
 
   Status KillProcess(lldb::pid_t pid) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedPlatformPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..db41da165d275
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedThreadPlanPythonInterface PLUGIN
+
+  ScriptedThreadPlanPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
similarity index 77%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
index f23858c01277c..5f1c7da71bd52 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
@@ -6,23 +6,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+//clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "ScriptedThreadPlanPythonInterface.h"
 
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 
+LLDB_PLUGIN_DEFINE_ADV(ScriptedThreadPlanPythonInterface, ScriptInterpreterPythonScriptedThreadPlanPythonInterface)
+
 ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {}
@@ -102,4 +107,19 @@ ScriptedThreadPlanPythonInterface::GetStopDescription(lldb::StreamSP &stream) {
   return llvm::Error::success();
 }
 
+void ScriptedThreadPlanPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "thread step-scripted -C <script-name> [-k key -v value ...]"};
+  const std::vector<llvm::StringRef> api_usages = {
+      "SBThread.StepUsingScriptedThreadPlan"};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(),
+      llvm::StringRef("Alter thread stepping logic and stop reason"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void ScriptedThreadPlanPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
similarity index 82%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
index 6ec89b9f59253..c0a82f4cbf46a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
@@ -10,16 +10,18 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
+#include "../ScriptedPythonInterface.h"
+
 #include <optional>
 
 namespace lldb_private {
 class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
-                                          public ScriptedPythonInterface {
+                                          public ScriptedPythonInterface,
+                                          public PluginInterface {
 public:
   ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -41,6 +43,16 @@ class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
   lldb::StateType GetRunState() override;
 
   llvm::Error GetStopDescription(lldb::StreamSP &stream) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedThreadPlanPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 70fa6d83e306f..d34fdf14122f2 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -14,11 +14,10 @@
 // LLDB Python header must be included first
 #include "lldb-python.h"
 
-#include "Interfaces/OperatingSystemPythonInterface.h"
-#include "Interfaces/ScriptedPlatformPythonInterface.h"
+#include "Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h"
+#include "Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h"
 #include "Interfaces/ScriptedProcessPythonInterface.h"
-#include "Interfaces/ScriptedThreadPlanPythonInterface.h"
-#include "Interfaces/ScriptedThreadPythonInterface.h"
+#include "Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h"
 #include "PythonDataObjects.h"
 #include "PythonReadline.h"
 #include "SWIGPythonBridge.h"
diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
index c46dc54c912e5..07245479fc2cd 100644
--- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
+++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
@@ -783,7 +783,7 @@ class EnableCommand : public CommandObjectParsed {
 
     // Now check if we have a running process.  If so, we should instruct the
     // process monitor to enable/disable DarwinLog support now.
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
 
     // Grab the active process.
     auto process_sp = target.GetProcessSP();
@@ -865,7 +865,7 @@ class StatusCommand : public CommandObjectParsed {
 
     // Figure out if we've got a process.  If so, we can tell if DarwinLog is
     // available for that process.
-    Target &target = GetSelectedOrDummyTarget();
+    Target &target = GetTarget();
     auto process_sp = target.GetProcessSP();
     if (!process_sp) {
       stream.PutCString("Availability: unknown (requires process)\n");
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
index 409e9bb37ab28..4ed523bbb9e76 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
@@ -37,7 +37,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die,
     if (attributes.Size() == 0)
       continue;
 
-    uint64_t num_elements = 0;
+    std::optional<uint64_t> num_elements;
     uint64_t lower_bound = 0;
     uint64_t upper_bound = 0;
     bool upper_bound_valid = false;
@@ -91,7 +91,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die,
       }
     }
 
-    if (num_elements == 0) {
+    if (!num_elements || *num_elements == 0) {
       if (upper_bound_valid && upper_bound >= lower_bound)
         num_elements = upper_bound - lower_bound + 1;
     }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 85c59a605c675..a4dcde1629fc2 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -1395,20 +1395,20 @@ DWARFASTParserClang::ParseArrayType(const DWARFDIE &die,
   uint64_t array_element_bit_stride = byte_stride * 8 + bit_stride;
   CompilerType clang_type;
   if (array_info && array_info->element_orders.size() > 0) {
-    uint64_t num_elements = 0;
     auto end = array_info->element_orders.rend();
     for (auto pos = array_info->element_orders.rbegin(); pos != end; ++pos) {
-      num_elements = *pos;
-      clang_type = m_ast.CreateArrayType(array_element_type, num_elements,
-                                         attrs.is_vector);
+      clang_type = m_ast.CreateArrayType(
+          array_element_type, /*element_count=*/*pos, attrs.is_vector);
+
+      uint64_t num_elements = pos->value_or(0);
       array_element_type = clang_type;
       array_element_bit_stride = num_elements
                                      ? array_element_bit_stride * num_elements
                                      : array_element_bit_stride;
     }
   } else {
-    clang_type =
-        m_ast.CreateArrayType(array_element_type, 0, attrs.is_vector);
+    clang_type = m_ast.CreateArrayType(
+        array_element_type, /*element_count=*/std::nullopt, attrs.is_vector);
   }
   ConstString empty_name;
   TypeSP type_sp =
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index a6efecb64b268..ae59454193163 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -2233,30 +2233,31 @@ TypeSystemClang::CreateBlockPointerType(const CompilerType &function_type) {
 
 #pragma mark Array Types
 
-CompilerType TypeSystemClang::CreateArrayType(const CompilerType &element_type,
-                                              size_t element_count,
-                                              bool is_vector) {
-  if (element_type.IsValid()) {
-    ASTContext &ast = getASTContext();
+CompilerType
+TypeSystemClang::CreateArrayType(const CompilerType &element_type,
+                                 std::optional<size_t> element_count,
+                                 bool is_vector) {
+  if (!element_type.IsValid())
+    return {};
 
-    if (is_vector) {
-      return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type),
-                                          element_count));
-    } else {
+  ASTContext &ast = getASTContext();
 
-      llvm::APInt ap_element_count(64, element_count);
-      if (element_count == 0) {
-        return GetType(
-            ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type),
-                                       clang::ArraySizeModifier::Normal, 0));
-      } else {
-        return GetType(ast.getConstantArrayType(
-            ClangUtil::GetQualType(element_type), ap_element_count, nullptr,
-            clang::ArraySizeModifier::Normal, 0));
-      }
-    }
-  }
-  return CompilerType();
+  // Unknown number of elements; this is an incomplete array
+  // (e.g., variable length array with non-constant bounds, or
+  // a flexible array member).
+  if (!element_count)
+    return GetType(
+        ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type),
+                                   clang::ArraySizeModifier::Normal, 0));
+
+  if (is_vector)
+    return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type),
+                                        *element_count));
+
+  llvm::APInt ap_element_count(64, *element_count);
+  return GetType(ast.getConstantArrayType(ClangUtil::GetQualType(element_type),
+                                          ap_element_count, nullptr,
+                                          clang::ArraySizeModifier::Normal, 0));
 }
 
 CompilerType TypeSystemClang::CreateStructForIdentifier(
@@ -4725,67 +4726,69 @@ TypeSystemClang::GetFloatTypeSemantics(size_t byte_size) {
   return llvm::APFloatBase::Bogus();
 }
 
+std::optional<uint64_t>
+TypeSystemClang::GetObjCBitSize(QualType qual_type,
+                                ExecutionContextScope *exe_scope) {
+  assert(qual_type->isObjCObjectOrInterfaceType());
+  ExecutionContext exe_ctx(exe_scope);
+  if (Process *process = exe_ctx.GetProcessPtr()) {
+    if (ObjCLanguageRuntime *objc_runtime =
+            ObjCLanguageRuntime::Get(*process)) {
+      if (std::optional<uint64_t> bit_size =
+              objc_runtime->GetTypeBitSize(GetType(qual_type)))
+        return *bit_size;
+    }
+  } else {
+    static bool g_printed = false;
+    if (!g_printed) {
+      StreamString s;
+      DumpTypeDescription(qual_type.getAsOpaquePtr(), s);
+
+      llvm::outs() << "warning: trying to determine the size of type ";
+      llvm::outs() << s.GetString() << "\n";
+      llvm::outs() << "without a valid ExecutionContext. this is not "
+                      "reliable. please file a bug against LLDB.\n";
+      llvm::outs() << "backtrace:\n";
+      llvm::sys::PrintStackTrace(llvm::outs());
+      llvm::outs() << "\n";
+      g_printed = true;
+    }
+  }
+
+  return getASTContext().getTypeSize(qual_type) +
+         getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
+}
+
 std::optional<uint64_t>
 TypeSystemClang::GetBitSize(lldb::opaque_compiler_type_t type,
                             ExecutionContextScope *exe_scope) {
-  if (GetCompleteType(type)) {
-    clang::QualType qual_type(GetCanonicalQualType(type));
-    const clang::Type::TypeClass type_class = qual_type->getTypeClass();
-    switch (type_class) {
-    case clang::Type::Record:
-      if (GetCompleteType(type))
-        return getASTContext().getTypeSize(qual_type);
-      else
-        return std::nullopt;
-      break;
+  if (!GetCompleteType(type))
+    return std::nullopt;
 
-    case clang::Type::ObjCInterface:
-    case clang::Type::ObjCObject: {
-      ExecutionContext exe_ctx(exe_scope);
-      Process *process = exe_ctx.GetProcessPtr();
-      if (process) {
-        if (ObjCLanguageRuntime *objc_runtime =
-                ObjCLanguageRuntime::Get(*process)) {
-          if (std::optional<uint64_t> bit_size =
-                  objc_runtime->GetTypeBitSize(GetType(qual_type)))
-            return *bit_size;
-        }
-      } else {
-        static bool g_printed = false;
-        if (!g_printed) {
-          StreamString s;
-          DumpTypeDescription(type, s);
-
-          llvm::outs() << "warning: trying to determine the size of type ";
-          llvm::outs() << s.GetString() << "\n";
-          llvm::outs() << "without a valid ExecutionContext. this is not "
-                          "reliable. please file a bug against LLDB.\n";
-          llvm::outs() << "backtrace:\n";
-          llvm::sys::PrintStackTrace(llvm::outs());
-          llvm::outs() << "\n";
-          g_printed = true;
-        }
-      }
-    }
-      [[fallthrough]];
-    default:
-      const uint32_t bit_size = getASTContext().getTypeSize(qual_type);
-      if (bit_size == 0) {
-        if (qual_type->isIncompleteArrayType())
-          return getASTContext().getTypeSize(
-              qual_type->getArrayElementTypeNoTypeQual()
-                  ->getCanonicalTypeUnqualified());
-      }
-      if (qual_type->isObjCObjectOrInterfaceType())
-        return bit_size +
-               getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
-      // Function types actually have a size of 0, that's not an error.
-      if (qual_type->isFunctionProtoType())
-        return bit_size;
-      if (bit_size)
-        return bit_size;
-    }
+  clang::QualType qual_type(GetCanonicalQualType(type));
+  const clang::Type::TypeClass type_class = qual_type->getTypeClass();
+  switch (type_class) {
+  case clang::Type::ConstantArray:
+  case clang::Type::FunctionProto:
+  case clang::Type::Record:
+    return getASTContext().getTypeSize(qual_type);
+  case clang::Type::ObjCInterface:
+  case clang::Type::ObjCObject:
+    return GetObjCBitSize(qual_type, exe_scope);
+  case clang::Type::IncompleteArray: {
+    const uint64_t bit_size = getASTContext().getTypeSize(qual_type);
+    if (bit_size == 0)
+      return getASTContext().getTypeSize(
+          qual_type->getArrayElementTypeNoTypeQual()
+              ->getCanonicalTypeUnqualified());
+
+    return bit_size;
   }
+  default:
+    if (const uint64_t bit_size = getASTContext().getTypeSize(qual_type))
+      return bit_size;
+  }
+
   return std::nullopt;
 }
 
@@ -5471,9 +5474,9 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
   case clang::Type::IncompleteArray:
     if (auto array_info =
             GetDynamicArrayInfo(*this, GetSymbolFile(), qual_type, exe_ctx))
-      // Only 1-dimensional arrays are supported.
+      // FIXME: Only 1-dimensional arrays are supported.
       num_children = array_info->element_orders.size()
-                         ? array_info->element_orders.back()
+                         ? array_info->element_orders.back().value_or(0)
                          : 0;
     break;
 
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index d67b7a4c9fe72..56a5c0a516706 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -498,7 +498,8 @@ class TypeSystemClang : public TypeSystem {
   // Array Types
 
   CompilerType CreateArrayType(const CompilerType &element_type,
-                               size_t element_count, bool is_vector);
+                               std::optional<size_t> element_count,
+                               bool is_vector);
 
   // Enumeration Types
   CompilerType CreateEnumerationType(llvm::StringRef name,
@@ -1172,6 +1173,9 @@ class TypeSystemClang : public TypeSystem {
   /// on creation of a new instance.
   void LogCreation() const;
 
+  std::optional<uint64_t> GetObjCBitSize(clang::QualType qual_type,
+                                         ExecutionContextScope *exe_scope);
+
   // Classes that inherit from TypeSystemClang can see and modify these
   std::string m_target_triple;
   std::unique_ptr<clang::ASTContext> m_ast_up;
diff --git a/lldb/source/Symbol/CMakeLists.txt b/lldb/source/Symbol/CMakeLists.txt
index ad3488dcdfcec..e49477df06c94 100644
--- a/lldb/source/Symbol/CMakeLists.txt
+++ b/lldb/source/Symbol/CMakeLists.txt
@@ -6,6 +6,7 @@ add_lldb_library(lldbSymbol NO_PLUGIN_DEPENDENCIES
   CompilerDecl.cpp
   CompilerDeclContext.cpp
   CompilerType.cpp
+  SaveCoreOptions.cpp
   DWARFCallFrameInfo.cpp
   DebugMacros.cpp
   DeclVendor.cpp
diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp
new file mode 100644
index 0000000000000..0f6fdac1ce22e
--- /dev/null
+++ b/lldb/source/Symbol/SaveCoreOptions.cpp
@@ -0,0 +1,53 @@
+//===-- SaveCoreOptions.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Symbol/SaveCoreOptions.h"
+#include "lldb/Core/PluginManager.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+Status SaveCoreOptions::SetPluginName(const char *name) {
+  Status error;
+  if (!name || !name[0]) {
+    m_plugin_name = std::nullopt;
+    return error;
+  }
+
+  if (!PluginManager::IsRegisteredObjectFilePluginName(name)) {
+    error.SetErrorStringWithFormat(
+        "plugin name '%s' is not a valid ObjectFile plugin name", name);
+    return error;
+  }
+
+  m_plugin_name = name;
+  return error;
+}
+
+void SaveCoreOptions::SetStyle(lldb::SaveCoreStyle style) { m_style = style; }
+
+void SaveCoreOptions::SetOutputFile(FileSpec file) { m_file = file; }
+
+std::optional<std::string> SaveCoreOptions::GetPluginName() const {
+  return m_plugin_name;
+}
+
+lldb::SaveCoreStyle SaveCoreOptions::GetStyle() const {
+  return m_style.value_or(lldb::eSaveCoreUnspecified);
+}
+
+const std::optional<lldb_private::FileSpec>
+SaveCoreOptions::GetOutputFile() const {
+  return m_file;
+}
+
+void SaveCoreOptions::Clear() {
+  m_file = std::nullopt;
+  m_plugin_name = std::nullopt;
+  m_style = std::nullopt;
+}
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index e258a4e3d82f2..e2dbd81a82c84 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -46,6 +46,8 @@ operator==(const UnwindPlan::Row::RegisterLocation &rhs) const {
         return !memcmp(m_location.expr.opcodes, rhs.m_location.expr.opcodes,
                        m_location.expr.length);
       break;
+    case isConstant:
+      return m_location.constant_value == rhs.m_location.constant_value;
     }
   }
   return false;
@@ -153,6 +155,9 @@ void UnwindPlan::Row::RegisterLocation::Dump(Stream &s,
     if (m_type == atDWARFExpression)
       s.PutChar(']');
   } break;
+  case isConstant:
+    s.Printf("=0x%" PRIx64, m_location.constant_value);
+    break;
   }
 }
 
@@ -351,6 +356,18 @@ bool UnwindPlan::Row::SetRegisterLocationToSame(uint32_t reg_num,
   return true;
 }
 
+bool UnwindPlan::Row::SetRegisterLocationToIsConstant(uint32_t reg_num,
+                                                      uint64_t constant,
+                                                      bool can_replace) {
+  if (!can_replace &&
+      m_register_locations.find(reg_num) != m_register_locations.end())
+    return false;
+  RegisterLocation reg_loc;
+  reg_loc.SetIsConstant(constant);
+  m_register_locations[reg_num] = reg_loc;
+  return true;
+}
+
 bool UnwindPlan::Row::operator==(const UnwindPlan::Row &rhs) const {
   return m_offset == rhs.m_offset && m_cfa_value == rhs.m_cfa_value &&
          m_afa_value == rhs.m_afa_value &&
diff --git a/lldb/source/Symbol/UnwindTable.cpp b/lldb/source/Symbol/UnwindTable.cpp
index 11bedf3d6052e..da88b0c9c4ea1 100644
--- a/lldb/source/Symbol/UnwindTable.cpp
+++ b/lldb/source/Symbol/UnwindTable.cpp
@@ -30,70 +30,27 @@ using namespace lldb;
 using namespace lldb_private;
 
 UnwindTable::UnwindTable(Module &module)
-    : m_module(module), m_unwinds(), m_initialized(false), m_mutex(),
-      m_object_file_unwind_up(), m_eh_frame_up(), m_compact_unwind_up(),
-      m_arm_unwind_up() {}
+    : m_module(module), m_unwinds(), m_scanned_all_unwind_sources(false),
+      m_mutex(), m_object_file_unwind_up(), m_eh_frame_up(),
+      m_compact_unwind_up(), m_arm_unwind_up() {}
 
 // We can't do some of this initialization when the ObjectFile is running its
 // ctor; delay doing it until needed for something.
-
 void UnwindTable::Initialize() {
-  if (m_initialized)
+  if (m_scanned_all_unwind_sources)
     return;
 
   std::lock_guard<std::mutex> guard(m_mutex);
 
-  if (m_initialized) // check again once we've acquired the lock
-    return;
-  m_initialized = true;
-  ObjectFile *object_file = m_module.GetObjectFile();
-  if (!object_file)
+  if (m_scanned_all_unwind_sources) // check again once we've acquired the lock
     return;
 
-  m_object_file_unwind_up = object_file->CreateCallFrameInfo();
-
-  SectionList *sl = m_module.GetSectionList();
-  if (!sl)
-    return;
-
-  SectionSP sect = sl->FindSectionByType(eSectionTypeEHFrame, true);
-  if (sect.get()) {
-    m_eh_frame_up = std::make_unique<DWARFCallFrameInfo>(
-        *object_file, sect, DWARFCallFrameInfo::EH);
-  }
-
-  sect = sl->FindSectionByType(eSectionTypeDWARFDebugFrame, true);
-  if (sect) {
-    m_debug_frame_up = std::make_unique<DWARFCallFrameInfo>(
-        *object_file, sect, DWARFCallFrameInfo::DWARF);
-  }
-
-  sect = sl->FindSectionByType(eSectionTypeCompactUnwind, true);
-  if (sect) {
-    m_compact_unwind_up =
-        std::make_unique<CompactUnwindInfo>(*object_file, sect);
-  }
-
-  sect = sl->FindSectionByType(eSectionTypeARMexidx, true);
-  if (sect) {
-    SectionSP sect_extab = sl->FindSectionByType(eSectionTypeARMextab, true);
-    if (sect_extab.get()) {
-      m_arm_unwind_up =
-          std::make_unique<ArmUnwindInfo>(*object_file, sect, sect_extab);
-    }
-  }
-}
-
-void UnwindTable::Update() {
-  if (!m_initialized)
-    return Initialize();
-
-  std::lock_guard<std::mutex> guard(m_mutex);
-
   ObjectFile *object_file = m_module.GetObjectFile();
   if (!object_file)
     return;
 
+  m_scanned_all_unwind_sources = true;
+
   if (!m_object_file_unwind_up)
     m_object_file_unwind_up = object_file->CreateCallFrameInfo();
 
@@ -102,22 +59,19 @@ void UnwindTable::Update() {
     return;
 
   SectionSP sect = sl->FindSectionByType(eSectionTypeEHFrame, true);
-  if (!m_eh_frame_up && sect) {
+  if (!m_eh_frame_up && sect)
     m_eh_frame_up = std::make_unique<DWARFCallFrameInfo>(
         *object_file, sect, DWARFCallFrameInfo::EH);
-  }
 
   sect = sl->FindSectionByType(eSectionTypeDWARFDebugFrame, true);
-  if (!m_debug_frame_up && sect) {
+  if (!m_debug_frame_up && sect)
     m_debug_frame_up = std::make_unique<DWARFCallFrameInfo>(
         *object_file, sect, DWARFCallFrameInfo::DWARF);
-  }
 
   sect = sl->FindSectionByType(eSectionTypeCompactUnwind, true);
-  if (!m_compact_unwind_up && sect) {
+  if (!m_compact_unwind_up && sect)
     m_compact_unwind_up =
         std::make_unique<CompactUnwindInfo>(*object_file, sect);
-  }
 
   sect = sl->FindSectionByType(eSectionTypeARMexidx, true);
   if (!m_arm_unwind_up && sect) {
@@ -129,6 +83,11 @@ void UnwindTable::Update() {
   }
 }
 
+void UnwindTable::ModuleWasUpdated() {
+  std::lock_guard<std::mutex> guard(m_mutex);
+  m_scanned_all_unwind_sources = false;
+}
+
 UnwindTable::~UnwindTable() = default;
 
 std::optional<AddressRange>
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index bc8081f4e3b31..a61228d092d89 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -1694,6 +1694,15 @@ RegisterContextUnwind::SavedLocationForRegister(
     return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
   }
 
+  if (unwindplan_regloc.IsConstant()) {
+    regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred;
+    regloc.location.inferred_value = unwindplan_regloc.GetConstant();
+    m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
+    UnwindLogMsg("supplying caller's register %s (%d) via constant value",
+                 regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+    return UnwindLLDB::RegisterSearchResult::eRegisterFound;
+  }
+
   UnwindLogMsg("no save location for %s (%d) in this stack frame",
                regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
 
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index ec0da8a1378a8..129683c43f0c1 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2155,12 +2155,21 @@ bool Target::ReadPointerFromMemory(const Address &addr, Status &error,
   return false;
 }
 
-ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify,
-                                   Status *error_ptr) {
+ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
+                                   bool notify, Status *error_ptr) {
   ModuleSP module_sp;
 
   Status error;
 
+  // Apply any remappings specified in target.object-map:
+  ModuleSpec module_spec(orig_module_spec);
+  PathMappingList &obj_mapping = GetObjectPathMap();
+  if (std::optional<FileSpec> remapped_obj_file =
+          obj_mapping.RemapPath(orig_module_spec.GetFileSpec().GetPath(),
+                                true /* only_if_exists */)) {
+    module_spec.GetFileSpec().SetPath(remapped_obj_file->GetPath());
+  }
+
   // First see if we already have this module in our module list.  If we do,
   // then we're done, we don't need to consult the shared modules list.  But
   // only do this if we are passed a UUID.
@@ -4459,6 +4468,14 @@ PathMappingList &TargetProperties::GetSourcePathMap() const {
   return option_value->GetCurrentValue();
 }
 
+PathMappingList &TargetProperties::GetObjectPathMap() const {
+  const uint32_t idx = ePropertyObjectMap;
+  OptionValuePathMappings *option_value =
+      m_collection_sp->GetPropertyAtIndexAsOptionValuePathMappings(idx);
+  assert(option_value);
+  return option_value->GetCurrentValue();
+}
+
 bool TargetProperties::GetAutoSourceMapRelative() const {
   const uint32_t idx = ePropertyAutoSourceMapRelative;
   return GetPropertyAtIndexAs<bool>(
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index 7f79218e0a6a4..4404a45492254 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -46,6 +46,9 @@ let Definition = "target" in {
   def SourceMap: Property<"source-map", "PathMap">,
     DefaultStringValue<"">,
     Desc<"Source path remappings apply substitutions to the paths of source files, typically needed to debug from a different host than the one that built the target.  The source-map property consists of an array of pairs, the first element is a path prefix, and the second is its replacement.  The syntax is `prefix1 replacement1 prefix2 replacement2...`.  The pairs are checked in order, the first prefix that matches is used, and that prefix is substituted with the replacement.  A common pattern is to use source-map in conjunction with the clang -fdebug-prefix-map flag.  In the build, use `-fdebug-prefix-map=/path/to/build_dir=.` to rewrite the host specific build directory to `.`.  Then for debugging, use `settings set target.source-map . /path/to/local_dir` to convert `.` to a valid local path.">;
+  def ObjectMap: Property<"object-map", "PathMap">,
+    DefaultStringValue<"">,
+    Desc<"Object path remappings apply substitutions to the paths of object files, typically needed to debug from a different host than the one that built the target.  The object-map property consists of an array of pairs, the first element is a path prefix, and the second is its replacement.  The syntax is `prefix1 replacement1 prefix2 replacement2...`.  The pairs are checked in order, the first prefix that matches is used, and that prefix is substituted with the replacement.">;
   def AutoSourceMapRelative: Property<"auto-source-map-relative", "Boolean">,
     DefaultTrue,
     Desc<"Automatically deduce source path mappings based on source file breakpoint resolution. It only deduces source mapping if source file breakpoint request is using full path and if the debug info contains relative paths.">;
diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index fc9ddac9c1d59..bee6e66aa7219 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,15 +12,17 @@
 class TestMultipleSimultaneousDebuggers(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    # This test has been flaky lately on Linux buildbots and Github/Buildkite CI
-    # runs.
-    @skipIfLinux
+    # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162.
     @skipIfNoSBHeaders
     @skipIfWindows
     @skipIfHostIncompatibleWithTarget
     def test_multiple_debuggers(self):
         self.driver_exe = self.getBuildArtifact("multi-process-driver")
-        self.buildDriver("multi-process-driver.cpp", self.driver_exe)
+        self.buildDriver(
+            "multi-process-driver.cpp",
+            self.driver_exe,
+            defines=[("LLDB_HOST_ARCH", lldbplatformutil.getArchitecture())],
+        )
         self.addTearDownHook(lambda: os.remove(self.driver_exe))
 
         self.inferior_exe = self.getBuildArtifact("testprog")
diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index c9c0bcf641e08..64728fb7c29a1 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -16,6 +16,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <inttypes.h>
 
 #include "lldb/API/LLDB.h"
 #include "lldb/API/SBCommandInterpreter.h"
@@ -30,6 +31,9 @@
 
 #define DEBUG 0
 
+#define STR1(x) #x
+#define STR(x) STR1(x)
+
 using namespace lldb;
 
 bool *completed_threads_array = 0;
@@ -102,20 +106,21 @@ void *do_one_debugger (void *in)
     if (debugger.IsValid ())
     {
         debugger.SetAsync (true);
-        SBTarget target = debugger.CreateTargetWithFileAndArch(inferior_process_name, "x86_64");
+        SBTarget target = debugger.CreateTargetWithFileAndArch(inferior_process_name,
+                                                               STR(LLDB_HOST_ARCH));
         SBCommandInterpreter command_interp = debugger.GetCommandInterpreter();
         if (target.IsValid())
         {
             SBBreakpoint bar_br = target.BreakpointCreateByName ("bar", "testprog");
             if (!bar_br.IsValid())
             {
-                printf ("#%lld: failed to set breakpoint on bar, exiting.\n", threadnum);
+                printf ("#%" PRIu64 ": failed to set breakpoint on bar, exiting.\n", threadnum);
                 exit (1);
             }
             SBBreakpoint foo_br = target.BreakpointCreateByName ("foo", "testprog");
             if (!foo_br.IsValid())
             {
-                printf ("#%lld: Failed to set breakpoint on foo()\n", threadnum);
+                printf ("#%" PRIu64 ": Failed to set breakpoint on foo()\n", threadnum);
             }
 
             SBLaunchInfo launch_info (NULL);
@@ -136,15 +141,17 @@ void *do_one_debugger (void *in)
 
                 if (!walk_stack_to_main (process.GetThreadAtIndex(0)))
                 {
-                    printf ("#%lld: backtrace while @ foo() failed\n", threadnum);
+                    printf ("#%" PRIu64 ": backtrace while @ foo() failed\n", threadnum);
                     completed_threads_array[threadnum] = true;
                     return (void *) 1;
                 }
 
-                if (strcmp (process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName(), "foo") != 0)
+                // On Linux the () are included.
+                const char* hit_fn = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName();
+                if (strcmp (hit_fn, "foo") != 0 && strcmp (hit_fn, "foo()") != 0)
                 {
 #if DEBUG == 1
-                    printf ("#%lld: First breakpoint did not stop at foo(), instead stopped at '%s'\n", threadnum, process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName());
+                    printf ("#%" PRIu64 ": First breakpoint did not stop at foo(), instead stopped at '%s'\n", threadnum, process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName());
 #endif
                     completed_threads_array[threadnum] = true;
                     return (void*) 1;
@@ -156,7 +163,7 @@ void *do_one_debugger (void *in)
 
                 if (process.GetState() == StateType::eStateExited)
                 {
-                    printf ("#%lld: Process exited\n", threadnum);
+                    printf ("#%" PRIu64 ": Process exited\n", threadnum);
                     completed_threads_array[threadnum] = true;
                     return (void *) 1;
                 }
@@ -164,14 +171,15 @@ void *do_one_debugger (void *in)
 
                 if (!walk_stack_to_main (process.GetThreadAtIndex(0)))
                 {
-                    printf ("#%lld: backtrace while @ bar() failed\n", threadnum);
+                    printf ("#%" PRIu64 ": backtrace while @ bar() failed\n", threadnum);
                     completed_threads_array[threadnum] = true;
                     return (void *) 1;
                 }
 
-                if (strcmp (process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName(), "bar") != 0)
+                hit_fn = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName();
+                if (strcmp (hit_fn, "bar") != 0 && strcmp (hit_fn, "bar()") != 0)
                 {
-                    printf ("#%lld: First breakpoint did not stop at bar()\n", threadnum);
+                    printf ("#%" PRIu64 ": First breakpoint did not stop at bar()\n", threadnum);
                     completed_threads_array[threadnum] = true;
                     return (void*) 1;
                 }
@@ -183,7 +191,7 @@ void *do_one_debugger (void *in)
                 SBDebugger::Destroy(debugger);
 
 #if DEBUG == 1
-                printf ("#%lld: All good!\n", threadnum);
+                printf ("#%" PRIu64 ": All good!\n", threadnum);
 #endif
                 successful_threads_array[threadnum] = true;
                 completed_threads_array[threadnum] = true;
@@ -191,7 +199,7 @@ void *do_one_debugger (void *in)
             }
             else
             {
-                printf("#%lld: process failed to launch\n", threadnum);
+                printf("#%" PRIu64 ": process failed to launch\n", threadnum);
                 successful_threads_array[threadnum] = false;
                 completed_threads_array[threadnum] = true;
                 return (void*) 0;
@@ -199,7 +207,7 @@ void *do_one_debugger (void *in)
         }
         else
         {
-            printf ("#%lld: did not get valid target\n", threadnum);
+            printf ("#%" PRIu64 ": did not get valid target\n", threadnum);
             successful_threads_array[threadnum] = false;
             completed_threads_array[threadnum] = true;
             return (void*) 0;
@@ -207,7 +215,7 @@ void *do_one_debugger (void *in)
     }
     else
     {
-        printf ("#%lld: did not get debugger\n", threadnum);
+        printf ("#%" PRIu64 ": did not get debugger\n", threadnum);
         successful_threads_array[threadnum] = false;
         completed_threads_array[threadnum] = true;
         return (void*) 0;
diff --git a/lldb/test/API/assert_messages_test/TestAssertMessages.py b/lldb/test/API/assert_messages_test/TestAssertMessages.py
index 3c54b9379d4c1..27d18e5ae99b9 100644
--- a/lldb/test/API/assert_messages_test/TestAssertMessages.py
+++ b/lldb/test/API/assert_messages_test/TestAssertMessages.py
@@ -23,7 +23,7 @@ def assert_expect_fails_with(self, cmd, expect_args, expected_msg):
         else:
             self.fail("Initial expect should have raised AssertionError!")
 
-    @expectedFailureAll(remote=True)
+    @expectedFailureAll(oslist=no_match(["linux"]), remote=True)
     def test_createTestTarget(self):
         try:
             self.createTestTarget("doesnt_exist")
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPlatformKill.py b/lldb/test/API/functionalities/gdb_remote_client/TestPlatformKill.py
index 46cda4d66b488..430f1871d3b30 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestPlatformKill.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestPlatformKill.py
@@ -8,7 +8,6 @@
 
 class TestPlatformKill(GDBRemoteTestBase):
     @skipIfRemote
-    @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr52451")
     def test_kill_different_platform(self):
         """Test connecting to a remote linux platform"""
 
diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
index 0afac26367de0..0b9d17bc9f45e 100644
--- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
+++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
@@ -249,6 +249,32 @@ def test_i386_sysroot(self):
 
         self.dbg.DeleteTarget(target)
 
+    def test_object_map(self):
+        """Test that lldb can find the exe for an i386 linux core file using the object map."""
+
+        # Copy linux-i386.out to lldb_i386_object_map/a.out
+        tmp_object_map_root = os.path.join(self.getBuildDir(), "lldb_i386_object_map")
+        executable = os.path.join(tmp_object_map_root, "a.out")
+        lldbutil.mkdir_p(os.path.dirname(executable))
+        shutil.copyfile("linux-i386.out", executable)
+
+        # Replace the original module path at /home/labath/test and load the core
+        self.runCmd(
+            "settings set target.object-map /home/labath/test {}".format(
+                tmp_object_map_root
+            )
+        )
+
+        target = self.dbg.CreateTarget(None)
+        process = target.LoadCore("linux-i386.core")
+
+        # Check that we did load the mapped executable
+        exe_module_spec = process.GetTarget().GetModuleAtIndex(0).GetFileSpec()
+        self.assertTrue(exe_module_spec.fullpath.startswith(tmp_object_map_root))
+
+        self.check_all(process, self._i386_pid, self._i386_regions, "a.out")
+        self.dbg.DeleteTarget(target)
+
     @skipIfLLVMTargetMissing("X86")
     @skipIfWindows
     def test_x86_64_sysroot(self):
diff --git a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
index c3f0e7597758a..8573d15733927 100644
--- a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
+++ b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
@@ -2,7 +2,6 @@
 Test saving a core file (or mini dump).
 """
 
-
 import os
 import lldb
 from lldbsuite.test.decorators import *
@@ -21,6 +20,8 @@ def test_cannot_save_core_unless_process_stopped(self):
         target = self.dbg.CreateTarget(exe)
         process = target.LaunchSimple(None, None, self.get_process_working_directory())
         self.assertNotEqual(process.GetState(), lldb.eStateStopped)
+        options = lldb.SBSaveCoreOptions()
+        options.SetOutputFile(lldb.SBFileSpec(core))
         error = process.SaveCore(core)
         self.assertTrue(error.Fail())
 
diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
index 1e171e726fb6b..96511d790271f 100644
--- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
+++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
@@ -132,8 +132,13 @@ def test_save_linux_mini_dump(self):
                 stacks_to_sp_map,
             )
 
+            options = lldb.SBSaveCoreOptions()
+            core_sb_stack_spec = lldb.SBFileSpec(core_sb_stack)
+            options.SetOutputFile(core_sb_stack_spec)
+            options.SetPluginName("minidump")
+            options.SetStyle(lldb.eSaveCoreStackOnly)
             # validate saving via SBProcess
-            error = process.SaveCore(core_sb_stack, "minidump", lldb.eSaveCoreStackOnly)
+            error = process.SaveCore(options)
             self.assertTrue(error.Success())
             self.assertTrue(os.path.isfile(core_sb_stack))
             self.verify_core_file(
@@ -144,7 +149,12 @@ def test_save_linux_mini_dump(self):
                 stacks_to_sp_map,
             )
 
-            error = process.SaveCore(core_sb_dirty, "minidump", lldb.eSaveCoreDirtyOnly)
+            options = lldb.SBSaveCoreOptions()
+            core_sb_dirty_spec = lldb.SBFileSpec(core_sb_dirty)
+            options.SetOutputFile(core_sb_dirty_spec)
+            options.SetPluginName("minidump")
+            options.SetStyle(lldb.eSaveCoreDirtyOnly)
+            error = process.SaveCore(options)
             self.assertTrue(error.Success())
             self.assertTrue(os.path.isfile(core_sb_dirty))
             self.verify_core_file(
@@ -157,7 +167,12 @@ def test_save_linux_mini_dump(self):
 
             # Minidump can now save full core files, but they will be huge and
             # they might cause this test to timeout.
-            error = process.SaveCore(core_sb_full, "minidump", lldb.eSaveCoreFull)
+            options = lldb.SBSaveCoreOptions()
+            core_sb_full_spec = lldb.SBFileSpec(core_sb_full)
+            options.SetOutputFile(core_sb_full_spec)
+            options.SetPluginName("minidump")
+            options.SetStyle(lldb.eSaveCoreFull)
+            error = process.SaveCore(options)
             self.assertTrue(error.Success())
             self.assertTrue(os.path.isfile(core_sb_full))
             self.verify_core_file(
diff --git a/lldb/test/API/lang/c/struct_types/main.c b/lldb/test/API/lang/c/struct_types/main.c
index e683c49108976..70217c57bec5f 100644
--- a/lldb/test/API/lang/c/struct_types/main.c
+++ b/lldb/test/API/lang/c/struct_types/main.c
@@ -1,3 +1,4 @@
+// clang-format off
 struct things_to_sum {
     int a;
     int b;
@@ -18,7 +19,7 @@ int main (int argc, char const *argv[])
     }; //% self.expect("frame variable pt.padding[0]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[0] = "])
        //% self.expect("frame variable pt.padding[1]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[1] = "])
        //% self.expect_expr("pt.padding[0]", result_type="char")
-       //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[]'])
+       //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[0]'])
 
     struct {} empty;
     //% self.expect("frame variable empty", substrs = ["empty = {}"])
diff --git a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
index 9e484e0132c83..730537da2fccf 100644
--- a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
@@ -5,9 +5,6 @@
 
 
 class TestCaseClassTemplateNonTypeParameterPack(TestBase):
-    @expectedFailureAll(
-        oslist=["windows"], archs=["i[3-6]86", "x86_64"]
-    )  # Fails to read memory from target.
     @no_debug_info_test
     def test(self):
         self.build()
diff --git a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
index 102c00dc603df..1ed643e6dd85c 100644
--- a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
@@ -5,9 +5,6 @@
 
 
 class TestCaseClassTemplateTypeParameterPack(TestBase):
-    @expectedFailureAll(
-        oslist=["windows"], archs=["i[3-6]86", "x86_64"]
-    )  # Fails to read memory from target.
     @no_debug_info_test
     def test(self):
         self.build()
diff --git a/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py
new file mode 100644
index 0000000000000..c509e81d8951a
--- /dev/null
+++ b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py
@@ -0,0 +1,28 @@
+"""Test the SBSaveCoreOptions APIs."""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+
+
+class SBSaveCoreOptionsAPICase(TestBase):
+    def test_plugin_name_assignment(self):
+        """Test assignment ensuring valid plugin names only."""
+        options = lldb.SBSaveCoreOptions()
+        error = options.SetPluginName(None)
+        self.assertTrue(error.Success())
+        self.assertEqual(options.GetPluginName(), None)
+        error = options.SetPluginName("Not a real plugin")
+        self.assertTrue(error.Fail())
+        self.assertEqual(options.GetPluginName(), None)
+        error = options.SetPluginName("minidump")
+        self.assertTrue(error.Success())
+        self.assertEqual(options.GetPluginName(), "minidump")
+        error = options.SetPluginName("")
+        self.assertTrue(error.Success())
+        self.assertEqual(options.GetPluginName(), None)
+
+    def test_default_corestyle_behavior(self):
+        """Test that the default core style is unspecified."""
+        options = lldb.SBSaveCoreOptions()
+        self.assertEqual(options.GetStyle(), lldb.eSaveCoreUnspecified)
diff --git a/lldb/test/Shell/SymbolFile/DWARF/vla.cpp b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp
new file mode 100644
index 0000000000000..47a4aa1836899
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp
@@ -0,0 +1,83 @@
+// When linking with link.exe, -gdwarf still produces PDB instead.
+// UNSUPPORTED: system-windows
+
+// RUN: %clangxx_host -gdwarf -std=c++11 -o %t %s
+// RUN: %lldb %t \
+// RUN:   -o run \
+// RUN:   -o "frame var --show-types f" \
+// RUN:   -o "frame var vla0" \
+// RUN:   -o "frame var fla0" \
+// RUN:   -o "frame var fla1" \
+// RUN:   -o "frame var vla01" \
+// RUN:   -o "frame var vla10" \
+// RUN:   -o "frame var vlaN" \
+// RUN:   -o "frame var vlaNM" \
+// RUN:   -o exit | FileCheck %s
+
+struct Foo {
+  static constexpr int n = 1;
+  int m_vlaN[n];
+
+  int m_vla0[0];
+};
+
+int main() {
+  Foo f;
+  f.m_vlaN[0] = 60;
+
+  // CHECK:      (lldb) frame var --show-types f
+  // CHECK-NEXT: (Foo) f = {
+  // CHECK-NEXT:   (int[1]) m_vlaN = {
+  // CHECK-NEXT:     (int) [0] = 60
+  // CHECK-NEXT:   }
+  // CHECK-NEXT:   (int[0]) m_vla0 = {}
+  // CHECK-NEXT: }
+
+  int vla0[0] = {};
+
+  // CHECK:      (lldb) frame var vla0
+  // CHECK-NEXT: (int[0]) vla0 = {}
+
+  int fla0[] = {};
+
+  // CHECK:      (lldb) frame var fla0
+  // CHECK-NEXT: (int[0]) fla0 = {}
+
+  int fla1[] = {42};
+
+  // CHECK:      (lldb) frame var fla1
+  // CHECK-NEXT: (int[1]) fla1 = ([0] = 42)
+
+  int vla01[0][1];
+
+  // CHECK:      (lldb) frame var vla01
+  // CHECK-NEXT: (int[0][1]) vla01 = {}
+
+  int vla10[1][0];
+
+  // CHECK:      (lldb) frame var vla10
+  // CHECK-NEXT: (int[1][0]) vla10 = ([0] = int[0]
+
+  int n = 3;
+  int vlaN[n];
+  for (int i = 0; i < n; ++i)
+    vlaN[i] = -i;
+
+  // CHECK:      (lldb) frame var vlaN
+  // CHECK-NEXT: (int[]) vlaN = ([0] = 0, [1] = -1, [2] = -2)
+
+  int m = 2;
+  int vlaNM[n][m];
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j)
+      vlaNM[i][j] = i + j;
+
+  // FIXME: multi-dimensional VLAs aren't well supported
+  // CHECK:      (lldb) frame var vlaNM
+  // CHECK-NEXT: (int[][]) vlaNM = {
+  // CHECK-NEXT:   [0] = ([0] = 0, [1] = 1, [2] = 1)
+  // CHECK-NEXT:   [1] = ([0] = 1, [1] = 1, [2] = 2)
+  // CHECK-NEXT: }
+
+  __builtin_debugtrap();
+}
diff --git a/lldb/tools/debugserver/source/PThreadEvent.cpp b/lldb/tools/debugserver/source/PThreadEvent.cpp
index e77c7511ae5ba..f9166a1b63d06 100644
--- a/lldb/tools/debugserver/source/PThreadEvent.cpp
+++ b/lldb/tools/debugserver/source/PThreadEvent.cpp
@@ -108,79 +108,55 @@ void PThreadEvent::ResetEvents(const uint32_t mask) {
 // Wait until 'timeout_abstime' for any events that are set in
 // 'mask'. If 'timeout_abstime' is NULL, then wait forever.
 uint32_t
-PThreadEvent::WaitForSetEvents(const uint32_t mask,
-                               const struct timespec *timeout_abstime) const {
+PThreadEvent::WaitForEventsImpl(const uint32_t mask,
+                                const struct timespec *timeout_abstime,
+                                std::function<bool()> predicate) const {
   // DNBLogThreadedIf(LOG_EVENTS, "%p PThreadEvent::%s (0x%8.8x, %p)", this,
   // __FUNCTION__, mask, timeout_abstime);
+
   int err = 0;
+
   // pthread_cond_timedwait() or pthread_cond_wait() will atomically
   // unlock the mutex and wait for the condition to be set. When either
   // function returns, they will re-lock the mutex. We use an auto lock/unlock
   // class (PThreadMutex::Locker) to allow us to return at any point in this
   // function and not have to worry about unlocking the mutex.
   PTHREAD_MUTEX_LOCKER(locker, m_mutex);
-  do {
-    // Check our predicate (event bits) in case any are already set
-    if (mask & m_bits) {
-      uint32_t bits_set = mask & m_bits;
-      // Our PThreadMutex::Locker will automatically unlock our mutex
-      return bits_set;
-    }
+
+  // Check the predicate and the error code. The functions below do not return
+  // EINTR so that's not something we need to handle.
+  while (!predicate() && err == 0) {
     if (timeout_abstime) {
       // Wait for condition to get broadcast, or for a timeout. If we get
-      // a timeout we will drop out of the do loop and return false which
-      // is what we want.
+      // a timeout we will drop out of the loop on the next iteration and we
+      // will recompute the mask in case of a race between the condition and the
+      // timeout.
       err = ::pthread_cond_timedwait(m_set_condition.Condition(),
                                      m_mutex.Mutex(), timeout_abstime);
-      // Retest our predicate in case of a race condition right at the end
-      // of the timeout.
-      if (err == ETIMEDOUT) {
-        uint32_t bits_set = mask & m_bits;
-        return bits_set;
-      }
     } else {
-      // Wait for condition to get broadcast. The only error this function
-      // should return is if
+      // Wait for condition to get broadcast.
       err = ::pthread_cond_wait(m_set_condition.Condition(), m_mutex.Mutex());
     }
-  } while (err == 0);
-  return 0;
+  }
+
+  // Either the predicate passed, we hit the specified timeout (ETIMEDOUT) or we
+  // encountered an unrecoverable error (EINVAL, EPERM). Regardless of how we
+  // got here, recompute and return the mask indicating which bits (if any) are
+  // set.
+  return GetBitsMasked(mask);
+}
+
+uint32_t
+PThreadEvent::WaitForSetEvents(const uint32_t mask,
+                               const struct timespec *timeout_abstime) const {
+  auto predicate = [&]() -> uint32_t { return GetBitsMasked(mask) != 0; };
+  return WaitForEventsImpl(mask, timeout_abstime, predicate);
 }
 
-// Wait until 'timeout_abstime' for any events in 'mask' to reset.
-// If 'timeout_abstime' is NULL, then wait forever.
 uint32_t PThreadEvent::WaitForEventsToReset(
     const uint32_t mask, const struct timespec *timeout_abstime) const {
-  // DNBLogThreadedIf(LOG_EVENTS, "%p PThreadEvent::%s (0x%8.8x, %p)", this,
-  // __FUNCTION__, mask, timeout_abstime);
-  int err = 0;
-  // pthread_cond_timedwait() or pthread_cond_wait() will atomically
-  // unlock the mutex and wait for the condition to be set. When either
-  // function returns, they will re-lock the mutex. We use an auto lock/unlock
-  // class (PThreadMutex::Locker) to allow us to return at any point in this
-  // function and not have to worry about unlocking the mutex.
-  PTHREAD_MUTEX_LOCKER(locker, m_mutex);
-  do {
-    // Check our predicate (event bits) each time through this do loop
-    if ((mask & m_bits) == 0) {
-      // All the bits requested have been reset, return zero indicating
-      // which bits from the mask were still set (none of them)
-      return 0;
-    }
-    if (timeout_abstime) {
-      // Wait for condition to get broadcast, or for a timeout. If we get
-      // a timeout we will drop out of the do loop and return false which
-      // is what we want.
-      err = ::pthread_cond_timedwait(m_reset_condition.Condition(),
-                                     m_mutex.Mutex(), timeout_abstime);
-    } else {
-      // Wait for condition to get broadcast. The only error this function
-      // should return is if
-      err = ::pthread_cond_wait(m_reset_condition.Condition(), m_mutex.Mutex());
-    }
-  } while (err == 0);
-  // Return a mask indicating which bits (if any) were still set
-  return mask & m_bits;
+  auto predicate = [&]() -> uint32_t { return GetBitsMasked(mask) == 0; };
+  return WaitForEventsImpl(mask, timeout_abstime, predicate);
 }
 
 uint32_t
diff --git a/lldb/tools/debugserver/source/PThreadEvent.h b/lldb/tools/debugserver/source/PThreadEvent.h
index 20faf82e4fff7..5a8a7dd207493 100644
--- a/lldb/tools/debugserver/source/PThreadEvent.h
+++ b/lldb/tools/debugserver/source/PThreadEvent.h
@@ -16,6 +16,7 @@
 #include "PThreadMutex.h"
 #include <cstdint>
 #include <ctime>
+#include <functional>
 
 class PThreadEvent {
 public:
@@ -53,6 +54,12 @@ class PThreadEvent {
   uint32_t m_validBits;
   uint32_t m_reset_ack_mask;
 
+  uint32_t GetBitsMasked(uint32_t mask) const { return mask & m_bits; }
+
+  uint32_t WaitForEventsImpl(const uint32_t mask,
+                             const struct timespec *timeout_abstime,
+                             std::function<bool()> predicate) const;
+
 private:
   PThreadEvent(const PThreadEvent &) = delete;
   PThreadEvent &operator=(const PThreadEvent &rhs) = delete;
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 8ecbaf7ce9816..11a14d29ab51e 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -157,6 +157,20 @@ locally on port `2345`.
 }
 ```
 
+You can also use the `gdb-remote-port` parameter to send an attach request
+to a debug server running on the current machine,
+instead of using the custom command `attachCommands`.
+
+```javascript
+{
+  "name": "Local Debug Server",
+  "type": "lldb-dap",
+  "request": "attach",
+  "program": "/tmp/a.out",
+  "gdb-remote-port": 2345,
+}
+```
+
 #### Connect to a Debug Server on Another Machine
 
 This connects to a debug server running on another machine with hostname
@@ -173,6 +187,23 @@ port `5678` of that other machine.
 }
 ```
 
+You can also use the `gdb-remote-hostname` and `gdb-remote-port` parameters
+to send an attach request to a debug server running on a different machine,
+instead of custom command `attachCommands`.
+The default hostname being used `localhost`.
+
+
+```javascript
+{
+  "name": "Local Debug Server",
+  "type": "lldb-dap",
+  "request": "attach",
+  "program": "/tmp/a.out",
+  "gdb-remote-port": 5678,
+  "gdb-remote-hostname": "hostname",
+}
+```
+
 ## Custom debugger commands
 
 The `lldb-dap` tool includes additional custom commands to support the Debug
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index fd5de30ed28dd..97e4efe7bac19 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -1,7 +1,7 @@
 {
   "name": "lldb-dap",
   "displayName": "LLDB DAP",
-  "version": "0.2.2",
+  "version": "0.2.3",
   "publisher": "llvm-vs-code-extensions",
   "homepage": "https://lldb.llvm.org",
   "description": "LLDB debugging from VSCode",
@@ -353,7 +353,7 @@
                   "number",
                   "string"
                 ],
-                "description": "TCP/IP port to attach to. Specifying both pid and port is an error."
+                "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
               },
               "gdb-remote-hostname": {
                 "type": "string",
diff --git a/lldb/tools/lldb-instr/Instrument.cpp b/lldb/tools/lldb-instr/Instrument.cpp
index 4b8725396a61f..d07ccf121bdf3 100644
--- a/lldb/tools/lldb-instr/Instrument.cpp
+++ b/lldb/tools/lldb-instr/Instrument.cpp
@@ -1,11 +1,12 @@
 #include "clang/AST/AST.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/RecursiveASTVisitor.h"
-#include "clang/CodeGen/ObjectFilePCHContainerOperations.h"
+#include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Frontend/ASTConsumers.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Rewrite/Core/Rewriter.h"
+#include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "clang/Tooling/CommonOptionsParser.h"
 #include "clang/Tooling/Tooling.h"
 
diff --git a/lldb/tools/lldb-server/LLDBServerUtilities.cpp b/lldb/tools/lldb-server/LLDBServerUtilities.cpp
index c3a8df19e969e..5facfbf3105e9 100644
--- a/lldb/tools/lldb-server/LLDBServerUtilities.cpp
+++ b/lldb/tools/lldb-server/LLDBServerUtilities.cpp
@@ -27,11 +27,13 @@ class TestLogHandler : public LogHandler {
       : m_stream_sp(stream_sp) {}
 
   void Emit(llvm::StringRef message) override {
+    std::lock_guard<std::mutex> guard(m_mutex);
     (*m_stream_sp) << message;
     m_stream_sp->flush();
   }
 
 private:
+  std::mutex m_mutex;
   std::shared_ptr<raw_ostream> m_stream_sp;
 };
 
diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp
index cfd0a3797d810..7148a1d2a3094 100644
--- a/lldb/tools/lldb-server/lldb-platform.cpp
+++ b/lldb/tools/lldb-server/lldb-platform.cpp
@@ -313,9 +313,11 @@ int main_platform(int argc, char *argv[]) {
       GDBRemoteCommunicationServerPlatform::PortMap portmap_for_child;
       llvm::Expected<uint16_t> available_port =
           gdbserver_portmap.GetNextAvailablePort();
-      if (available_port)
-        portmap_for_child.AllowPort(*available_port);
-      else {
+      if (available_port) {
+        // GetNextAvailablePort() may return 0 if gdbserver_portmap is empty.
+        if (*available_port)
+          portmap_for_child.AllowPort(*available_port);
+      } else {
         llvm::consumeError(available_port.takeError());
         fprintf(stderr,
                 "no available gdbserver port for connection - dropping...\n");
@@ -352,7 +354,7 @@ int main_platform(int argc, char *argv[]) {
     if (platform.IsConnected()) {
       if (inferior_arguments.GetArgumentCount() > 0) {
         lldb::pid_t pid = LLDB_INVALID_PROCESS_ID;
-        std::optional<uint16_t> port = 0;
+        std::optional<uint16_t> port;
         std::string socket_name;
         Status error = platform.LaunchGDBServer(inferior_arguments,
                                                 "", // hostname
diff --git a/llvm-spirv/.github/workflows/check-code-style.yml b/llvm-spirv/.github/workflows/check-code-style.yml
index eafb7d7ff38b0..92e951da5a2a0 100644
--- a/llvm-spirv/.github/workflows/check-code-style.yml
+++ b/llvm-spirv/.github/workflows/check-code-style.yml
@@ -24,7 +24,7 @@ env:
   # We need compile command database in order to perform clang-tidy check. So,
   # in order to perform configure step we need to setup llvm-dev package. This
   # env variable used to specify desired version of it
-  LLVM_VERSION: 19
+  LLVM_VERSION: 20
 
 jobs:
   clang-format-and-tidy:
diff --git a/llvm-spirv/.github/workflows/check-in-tree-build.yml b/llvm-spirv/.github/workflows/check-in-tree-build.yml
index c938b497f92e4..9eace466c19eb 100644
--- a/llvm-spirv/.github/workflows/check-in-tree-build.yml
+++ b/llvm-spirv/.github/workflows/check-in-tree-build.yml
@@ -32,7 +32,7 @@ on:
     - cron: 0 0 * * *
 
 env:
-  LLVM_VERSION: 19
+  LLVM_VERSION: 20
 
 jobs:
   build_and_test_linux:
@@ -45,14 +45,14 @@ jobs:
           - build_type: Release
             shared_libs: EnableSharedLibs
       fail-fast: false
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - name: Install dependencies
         run: |
           curl -L "https://apt.llvm.org/llvm-snapshot.gpg.key" | sudo apt-key add -
           curl -L "https://packages.lunarg.com/lunarg-signing-key-pub.asc" | sudo apt-key add -
-          echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal main" | sudo tee -a /etc/apt/sources.list
-          echo "deb https://packages.lunarg.com/vulkan focal main" | sudo tee -a /etc/apt/sources.list
+          echo "deb https://apt.llvm.org/jammy/ llvm-toolchain-jammy main" | sudo tee -a /etc/apt/sources.list
+          echo "deb https://packages.lunarg.com/vulkan jammy main" | sudo tee -a /etc/apt/sources.list
           sudo apt-get update
           sudo apt-get -yq --no-install-suggests --no-install-recommends install \
             clang-${{ env.LLVM_VERSION }} \
diff --git a/llvm-spirv/.github/workflows/check-out-of-tree-build.yml b/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
index 21c6c9895114c..3d10f2e406746 100644
--- a/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
+++ b/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
@@ -29,7 +29,7 @@ on:
     - cron: 0 0 * * *
 
 env:
-  LLVM_VERSION: 19
+  LLVM_VERSION: 20
 
 jobs:
   build_and_test:
diff --git a/llvm-spirv/CMakeLists.txt b/llvm-spirv/CMakeLists.txt
index 6252c5da397cf..a90a7017a218f 100644
--- a/llvm-spirv/CMakeLists.txt
+++ b/llvm-spirv/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13.4)
 
 if(NOT DEFINED BASE_LLVM_VERSION)
-  set (BASE_LLVM_VERSION 19.0.0)
+  set (BASE_LLVM_VERSION 20.0.0)
 endif(NOT DEFINED BASE_LLVM_VERSION)
 set(LLVM_SPIRV_VERSION ${BASE_LLVM_VERSION}.0)
 
diff --git a/llvm-spirv/README.md b/llvm-spirv/README.md
index 88a4ee613203a..803cfe366caca 100644
--- a/llvm-spirv/README.md
+++ b/llvm-spirv/README.md
@@ -30,7 +30,7 @@ The translator can be built with the latest(nightly) package of LLVM. For Ubuntu
 wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
 sudo add-apt-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main"
 sudo apt-get update
-sudo apt-get install llvm-19-dev llvm-19-tools clang-19 libclang-19-dev
+sudo apt-get install llvm-20-dev llvm-20-tools clang-20 libclang-20-dev
 ```
 The installed version of LLVM will be used by default for out-of-tree build of the translator.
 ```
@@ -156,7 +156,7 @@ make test
 ```
 This requires that the `-DLLVM_SPIRV_INCLUDE_TESTS=ON` argument is
 passed to CMake during the build step. Additionally,
-`-DLLVM_EXTERNAL_LIT="/usr/lib/llvm-19/build/utils/lit/lit.py"` is
+`-DLLVM_EXTERNAL_LIT="/usr/lib/llvm-20/build/utils/lit/lit.py"` is
 needed when building with a pre-installed version of LLVM.
 
 The translator test suite can be disabled by passing
diff --git a/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
index 80c93f5a6bf54..be04f0ad91f6d 100644
--- a/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
@@ -485,6 +485,18 @@ CallInst *OCLToSPIRVBase::visitCallAtomicCmpXchg(CallInst *CI) {
     auto Mutator = mutateCallInst(CI, kOCLBuiltinName::AtomicCmpXchgStrong);
     Value *Expected = Mutator.getArg(1);
     Type *MemTy = Mutator.getArg(2)->getType();
+    if (MemTy->isFloatTy() || MemTy->isDoubleTy()) {
+      MemTy =
+          MemTy->isFloatTy() ? Type::getInt32Ty(*Ctx) : Type::getInt64Ty(*Ctx);
+      Mutator.replaceArg(
+          0,
+          {Mutator.getArg(0),
+           TypedPointerType::get(
+               MemTy, Mutator.getArg(0)->getType()->getPointerAddressSpace())});
+      Mutator.mapArg(2, [=](IRBuilder<> &Builder, Value *V) {
+        return Builder.CreateBitCast(V, MemTy);
+      });
+    }
     assert(MemTy->isIntegerTy() &&
            "In SPIR-V 1.0 arguments of OpAtomicCompareExchange must be "
            "an integer type scalars");
diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index eb08935c7a457..da7c79698c4d6 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -3815,8 +3815,8 @@ void SPIRVToLLVM::transIntelFPGADecorations(SPIRVValue *BV, Value *V) {
       GS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
       GS->setSection("llvm.metadata");
 
-      Type *ResType = PointerType::get(GV->getContext(),
-                                       GV->getType()->getPointerAddressSpace());
+      Type *ResType = PointerType::get(
+          GV->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace());
       Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, ResType);
 
       Type *Int8PtrTyPrivate = PointerType::get(*Context, SPIRAS_Private);
@@ -3875,8 +3875,8 @@ void SPIRVToLLVM::transUserSemantic(SPIRV::SPIRVFunction *Fun) {
     GS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
     GS->setSection("llvm.metadata");
 
-    Type *ResType = PointerType::get(V->getContext(),
-                                     V->getType()->getPointerAddressSpace());
+    Type *ResType = PointerType::get(
+        V->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace());
     Constant *C =
         ConstantExpr::getPointerBitCastOrAddrSpaceCast(TransFun, ResType);
 
diff --git a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
index 3caa1cfe285c9..9f5cb04725398 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
@@ -1848,9 +1848,13 @@ std::string decodeSPIRVTypeName(StringRef Name,
 // Returns true if type(s) and number of elements (if vector) is valid
 bool checkTypeForSPIRVExtendedInstLowering(IntrinsicInst *II, SPIRVModule *BM) {
   switch (II->getIntrinsicID()) {
+  case Intrinsic::acos:
+  case Intrinsic::asin:
+  case Intrinsic::atan:
   case Intrinsic::ceil:
   case Intrinsic::copysign:
   case Intrinsic::cos:
+  case Intrinsic::cosh:
   case Intrinsic::exp:
   case Intrinsic::exp2:
   case Intrinsic::fabs:
@@ -1870,8 +1874,10 @@ bool checkTypeForSPIRVExtendedInstLowering(IntrinsicInst *II, SPIRVModule *BM) {
   case Intrinsic::round:
   case Intrinsic::roundeven:
   case Intrinsic::sin:
+  case Intrinsic::sinh:
   case Intrinsic::sqrt:
   case Intrinsic::tan:
+  case Intrinsic::tanh:
   case Intrinsic::trunc: {
     // Although some of the intrinsics above take multiple arguments, it is
     // sufficient to check arg 0 because the LLVM Verifier will have checked
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index 42b8d317edcfa..8dba81c03b56d 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -3885,12 +3885,20 @@ static SPIRVWord getBuiltinIdForIntrinsic(Intrinsic::ID IID) {
   //       and assume that the operations have no side effects (FP status flags
   //       aren't maintained), so the OpenCL builtin behavior should be
   //       acceptable.
+  case Intrinsic::acos:
+    return OpenCLLIB::Acos;
+  case Intrinsic::asin:
+    return OpenCLLIB::Asin;
+  case Intrinsic::atan:
+    return OpenCLLIB::Atan;
   case Intrinsic::ceil:
     return OpenCLLIB::Ceil;
   case Intrinsic::copysign:
     return OpenCLLIB::Copysign;
   case Intrinsic::cos:
     return OpenCLLIB::Cos;
+  case Intrinsic::cosh:
+    return OpenCLLIB::Cosh;
   case Intrinsic::exp:
     return OpenCLLIB::Exp;
   case Intrinsic::exp2:
@@ -3931,10 +3939,14 @@ static SPIRVWord getBuiltinIdForIntrinsic(Intrinsic::ID IID) {
     return OpenCLLIB::Rint;
   case Intrinsic::sin:
     return OpenCLLIB::Sin;
+  case Intrinsic::sinh:
+    return OpenCLLIB::Sinh;
   case Intrinsic::sqrt:
     return OpenCLLIB::Sqrt;
   case Intrinsic::tan:
     return OpenCLLIB::Tan;
+  case Intrinsic::tanh:
+    return OpenCLLIB::Tanh;
   case Intrinsic::trunc:
     return OpenCLLIB::Trunc;
   default:
@@ -4067,8 +4079,12 @@ SPIRVValue *LLVMToSPIRVBase::transIntrinsicInst(IntrinsicInst *II,
   }
 
   // Unary FP intrinsic
+  case Intrinsic::acos:
+  case Intrinsic::asin:
+  case Intrinsic::atan:
   case Intrinsic::ceil:
   case Intrinsic::cos:
+  case Intrinsic::cosh:
   case Intrinsic::exp:
   case Intrinsic::exp2:
   case Intrinsic::fabs:
@@ -4081,8 +4097,10 @@ SPIRVValue *LLVMToSPIRVBase::transIntrinsicInst(IntrinsicInst *II,
   case Intrinsic::round:
   case Intrinsic::roundeven:
   case Intrinsic::sin:
+  case Intrinsic::sinh:
   case Intrinsic::sqrt:
   case Intrinsic::tan:
+  case Intrinsic::tanh:
   case Intrinsic::trunc: {
     if (!checkTypeForSPIRVExtendedInstLowering(II, BM))
       break;
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 0bbfab0b7472a..c243403eb020b 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2799,6 +2799,12 @@ class SPIRVAtomicInstBase : public SPIRVInstTemplateBase {
     for (auto RC : getRequiredCapability())
       Module->addCapability(RC);
   }
+
+  void validate() const override {
+    if (OpCode == OpAtomicCompareExchangeWeak)
+      assert(this->getModule()->getSPIRVVersion() < VersionNumber::SPIRV_1_4 &&
+             "OpAtomicCompareExchangeWeak is removed starting from SPIR-V 1.4");
+  }
 };
 
 class SPIRVAtomicStoreInst : public SPIRVAtomicInstBase {
diff --git a/llvm-spirv/test/llvm-intrinsics/dynamic-memmove.ll b/llvm-spirv/test/llvm-intrinsics/dynamic-memmove.ll
index 17ffee15dd1e2..04bd2333c1e33 100644
--- a/llvm-spirv/test/llvm-intrinsics/dynamic-memmove.ll
+++ b/llvm-spirv/test/llvm-intrinsics/dynamic-memmove.ll
@@ -21,10 +21,10 @@ entry:
   ret void
 
 ; CHECK-LLVM: @memmove_caller(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[N:%.*]])
+; CHECK-LLVM: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-LLVM:  [[SRC_CMP:%.*]] = ptrtoint ptr addrspace(1) [[SRC]] to i64
 ; CHECK-LLVM:  [[DST_CMP:%.*]] = ptrtoint ptr addrspace(1) [[DST]] to i64
 ; CHECK-LLVM: [[COMPARE_SRC_DST:%.*]] = icmp ult i64 [[SRC_CMP]], [[DST_CMP]]
-; CHECK-LLVM-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-LLVM-NEXT: br i1 [[COMPARE_SRC_DST]], label %[[COPY_BACKWARDS:.*]], label %[[COPY_FORWARD:.*]]
 ; CHECK-LLVM: [[COPY_BACKWARDS]]:
 }
diff --git a/llvm-spirv/test/llvm-intrinsics/fp-intrinsics.ll b/llvm-spirv/test/llvm-intrinsics/fp-intrinsics.ll
index 716dcbbfa8854..5f4de31eb50ea 100644
--- a/llvm-spirv/test/llvm-intrinsics/fp-intrinsics.ll
+++ b/llvm-spirv/test/llvm-intrinsics/fp-intrinsics.ll
@@ -344,3 +344,81 @@ entry:
 }
 
 declare float @llvm.fma.f32(float, float, float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] acos [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestAcos(float %x) {
+entry:
+  %t = tail call float @llvm.acos.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.acos.f32(float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] asin [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestAsin(float %x) {
+entry:
+  %t = tail call float @llvm.asin.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.asin.f32(float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] atan [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestAtan(float %x) {
+entry:
+  %t = tail call float @llvm.atan.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.atan.f32(float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] cosh [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestCosh(float %x) {
+entry:
+  %t = tail call float @llvm.cosh.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.cosh.f32(float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] sinh [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestSinh(float %x) {
+entry:
+  %t = tail call float @llvm.sinh.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.sinh.f32(float)
+
+; CHECK: Function
+; CHECK: FunctionParameter {{[0-9]+}} [[x:[0-9]+]]
+; CHECK: ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] tanh [[x]]
+; CHECK: FunctionEnd
+
+define spir_func float @TestTanh(float %x) {
+entry:
+  %t = tail call float @llvm.tanh.f32(float %x)
+  ret float %t
+}
+
+declare float @llvm.tanh.f32(float)
diff --git a/llvm-spirv/test/transcoding/AtomicCompareExchangeExplicit_cl20.cl b/llvm-spirv/test/transcoding/AtomicCompareExchangeExplicit_cl20.cl
index 166cda9284fc4..7eefe0dafd3e0 100644
--- a/llvm-spirv/test/transcoding/AtomicCompareExchangeExplicit_cl20.cl
+++ b/llvm-spirv/test/transcoding/AtomicCompareExchangeExplicit_cl20.cl
@@ -6,46 +6,112 @@
 // RUN: llvm-spirv -r --spirv-target-env=CL2.0 %t.spv -o %t.rev.bc
 // RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-__kernel void testAtomicCompareExchangeExplicit_cl20(
-    volatile global atomic_int* object,
-    global int* expected,
-    int desired)
-{
-  // Values of memory order and memory scope arguments correspond to SPIR-2.0 spec.
-  atomic_compare_exchange_strong_explicit(object, expected, desired,
-                                          memory_order_release, // 3
-                                          memory_order_relaxed  // 0
-                                         ); // by default, assume device scope = 2
-  atomic_compare_exchange_strong_explicit(object, expected, desired,
-                                          memory_order_acq_rel,   // 4
-                                          memory_order_relaxed,   // 0
-                                          memory_scope_work_group // 1
-                                         );
-  atomic_compare_exchange_weak_explicit(object, expected, desired,
-                                        memory_order_release, // 3
-                                        memory_order_relaxed  // 0
-                                         ); // by default, assume device scope = 2
-  atomic_compare_exchange_weak_explicit(object, expected, desired,
-                                        memory_order_acq_rel,   // 4
-                                        memory_order_relaxed,   // 0
-                                        memory_scope_work_group // 1
-                                       );
+#define DEFINE_KERNEL(TYPE)                                                            \
+__kernel void testAtomicCompareExchangeExplicit_cl20_##TYPE(                           \
+    volatile global atomic_##TYPE* object,                                             \
+    global TYPE* expected,                                                             \
+    TYPE desired)                                                                      \
+{                                                                                      \
+  /* Values of memory order and memory scope arguments correspond to SPIR-2.0 spec. */ \
+  atomic_compare_exchange_strong_explicit(object, expected, desired,                   \
+                                          memory_order_release, /* 3 */                \
+                                          memory_order_relaxed  /* 0 */                \
+                                         ); /* by default, assume device scope = 2 */  \
+  atomic_compare_exchange_strong_explicit(object, expected, desired,                   \
+                                          memory_order_acq_rel,   /* 4 */              \
+                                          memory_order_relaxed,   /* 0 */              \
+                                          memory_scope_work_group /* 1 */              \
+                                         );                                            \
+  atomic_compare_exchange_weak_explicit(object, expected, desired,                     \
+                                        memory_order_release, /* 3 */                  \
+                                        memory_order_relaxed  /* 0 */                  \
+                                         ); /* by default, assume device scope = 2 */  \
+  atomic_compare_exchange_weak_explicit(object, expected, desired,                     \
+                                        memory_order_acq_rel,   /* 4 */                \
+                                        memory_order_relaxed,   /* 0 */                \
+                                        memory_scope_work_group /* 1 */                \
+                                       );                                              \
 }
 
-//CHECK-SPIRV: TypeInt [[int:[0-9]+]] 32 0
+DEFINE_KERNEL(int)
+DEFINE_KERNEL(float)
+DEFINE_KERNEL(double)
+
+//CHECK-SPIRV: TypeInt [[int32:[0-9]+]] 32 0
+//CHECK-SPIRV: TypeInt [[int64:[0-9]+]] 64 0
 //; Constants below correspond to the SPIR-V spec
-//CHECK-SPIRV-DAG: Constant [[int]] [[DeviceScope:[0-9]+]] 1
-//CHECK-SPIRV-DAG: Constant [[int]] [[WorkgroupScope:[0-9]+]] 2
-//CHECK-SPIRV-DAG: Constant [[int]] [[ReleaseMemSem:[0-9]+]] 4
-//CHECK-SPIRV-DAG: Constant [[int]] [[RelaxedMemSem:[0-9]+]] 0
-//CHECK-SPIRV-DAG: Constant [[int]] [[AcqRelMemSem:[0-9]+]] 8
-
-//CHECK-SPIRV: AtomicCompareExchange {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
-//CHECK-SPIRV: AtomicCompareExchange {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
-//CHECK-SPIRV: AtomicCompareExchange {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
-//CHECK-SPIRV: AtomicCompareExchange {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
-
-//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected5.as, i32 %desired, i32 3, i32 0, i32 2)
-//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected8.as, i32 %desired, i32 4, i32 0, i32 1)
-//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected11.as, i32 %desired, i32 3, i32 0, i32 2)
-//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected14.as, i32 %desired, i32 4, i32 0, i32 1)
+//CHECK-SPIRV-DAG: Constant [[int32]] [[DeviceScope:[0-9]+]] 1
+//CHECK-SPIRV-DAG: Constant [[int32]] [[WorkgroupScope:[0-9]+]] 2
+//CHECK-SPIRV-DAG: Constant [[int32]] [[ReleaseMemSem:[0-9]+]] 4
+//CHECK-SPIRV-DAG: Constant [[int32]] [[RelaxedMemSem:[0-9]+]] 0
+//CHECK-SPIRV-DAG: Constant [[int32]] [[AcqRelMemSem:[0-9]+]] 8
+
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int32]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+
+//CHECK-SPIRV: AtomicCompareExchange [[int64]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int64]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int64]] {{[0-9]+}} {{[0-9]+}} [[DeviceScope]] [[ReleaseMemSem]] [[RelaxedMemSem]]
+//CHECK-SPIRV: AtomicCompareExchange [[int64]] {{[0-9]+}} {{[0-9]+}} [[WorkgroupScope]] [[AcqRelMemSem]] [[RelaxedMemSem]]
+
+//CHECK-LLVM-LABEL: define spir_kernel void @testAtomicCompareExchangeExplicit_cl20_int(
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected{{.*}}, i32 %desired, i32 3, i32 0, i32 2)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected{{.*}}, i32 %desired, i32 4, i32 0, i32 1)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected{{.*}}, i32 %desired, i32 3, i32 0, i32 2)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) %0, ptr addrspace(4) %expected{{.*}}, i32 %desired, i32 4, i32 0, i32 1)
+
+//CHECK-LLVM-LABEL: define spir_kernel void @testAtomicCompareExchangeExplicit_cl20_float(
+//CHECK-LLVM: [[OBJECT:%[0-9]+]] = addrspacecast ptr addrspace(1) %object to ptr addrspace(4)
+//CHECK-LLVM: [[EXPECTED:%[0-9]+]] = addrspacecast ptr addrspace(1) %expected to ptr addrspace(4)
+//CHECK-LLVM: [[CAST1:%[0-9]+]] = bitcast float %desired to i32
+//CHECK-LLVM: %exp = load i32, ptr addrspace(4) [[EXPECTED]], align 4
+//CHECK-LLVM: store i32 %exp, ptr [[EXPECTED_ALLOCA:%expected[0-9]+]], align 4
+//CHECK-LLVM: [[EXPECTED_AS1:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS1]], i32 [[CAST1]], i32 3, i32 0, i32 2)
+//CHECK-LLVM: [[CAST2:%[0-9]+]] = bitcast float %desired to i32
+//CHECK-LLVM: [[LOAD2:%exp[0-9]+]] = load i32, ptr addrspace(4) [[EXPECTED]], align 4
+//CHECK-LLVM: store i32 [[LOAD2]], ptr [[EXPECTED_ALLOCA2:%expected[0-9]+]], align 4
+//CHECK-LLVM: [[EXPECTED_AS2:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA2]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS2]], i32 [[CAST2]], i32 4, i32 0, i32 1)
+//CHECK-LLVM: [[CAST3:%[0-9]+]] = bitcast float %desired to i32
+//CHECK-LLVM: [[LOAD3:%exp[0-9]+]] = load i32, ptr addrspace(4) [[EXPECTED]], align 4
+//CHECK-LLVM: store i32 [[LOAD3]], ptr [[EXPECTED_ALLOCA3:%expected[0-9]+]], align 4
+//CHECK-LLVM: [[EXPECTED_AS3:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA3]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS3]], i32 [[CAST3]], i32 3, i32 0, i32 2)
+//CHECK-LLVM: [[CAST4:%[0-9]+]] = bitcast float %desired to i32
+//CHECK-LLVM: [[LOAD4:%exp[0-9]+]] = load i32, ptr addrspace(4) [[EXPECTED]], align 4
+//CHECK-LLVM: store i32 [[LOAD4]], ptr [[EXPECTED_ALLOCA4:%expected[0-9]+]], align 4
+//CHECK-LLVM: [[EXPECTED_AS4:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA4]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS4]], i32 [[CAST4]], i32 4, i32 0, i32 1)
+
+
+//CHECK-LLVM-LABEL: define spir_kernel void @testAtomicCompareExchangeExplicit_cl20_double(
+//CHECK-LLVM: [[OBJECT:%[0-9]+]] = addrspacecast ptr addrspace(1) %object to ptr addrspace(4)
+//CHECK-LLVM: [[EXPECTED:%[0-9]+]] = addrspacecast ptr addrspace(1) %expected to ptr addrspace(4)
+//CHECK-LLVM: [[CAST1:%[0-9]+]] = bitcast double %desired to i64
+//CHECK-LLVM: %exp = load i64, ptr addrspace(4) [[EXPECTED]], align 8
+//CHECK-LLVM: store i64 %exp, ptr [[EXPECTED_ALLOCA:%expected[0-9]+]], align 8
+//CHECK-LLVM: [[EXPECTED_AS1:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiclPU3AS4ll12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS1]], i64 [[CAST1]], i32 3, i32 0, i32 2)
+//CHECK-LLVM: [[CAST2:%[0-9]+]] = bitcast double %desired to i64
+//CHECK-LLVM: [[LOAD2:%exp[0-9]+]] = load i64, ptr addrspace(4) [[EXPECTED]], align 8
+//CHECK-LLVM: store i64 [[LOAD2]], ptr [[EXPECTED_ALLOCA2:%expected[0-9]+]], align 8
+//CHECK-LLVM: [[EXPECTED_AS2:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA2]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiclPU3AS4ll12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS2]], i64 [[CAST2]], i32 4, i32 0, i32 1)
+//CHECK-LLVM: [[CAST3:%[0-9]+]] = bitcast double %desired to i64
+//CHECK-LLVM: [[LOAD3:%exp[0-9]+]] = load i64, ptr addrspace(4) [[EXPECTED]], align 8
+//CHECK-LLVM: store i64 [[LOAD3]], ptr [[EXPECTED_ALLOCA3:%expected[0-9]+]], align 8
+//CHECK-LLVM: [[EXPECTED_AS3:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA3]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiclPU3AS4ll12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS3]], i64 [[CAST3]], i32 3, i32 0, i32 2)
+//CHECK-LLVM: [[CAST4:%[0-9]+]] = bitcast double %desired to i64
+//CHECK-LLVM: [[LOAD4:%exp[0-9]+]] = load i64, ptr addrspace(4) [[EXPECTED]], align 8
+//CHECK-LLVM: store i64 [[LOAD4]], ptr [[EXPECTED_ALLOCA4:%expected[0-9]+]], align 8
+//CHECK-LLVM: [[EXPECTED_AS4:%expected.*]] = addrspacecast ptr [[EXPECTED_ALLOCA4]] to ptr addrspace(4)
+//CHECK-LLVM: call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiclPU3AS4ll12memory_orderS4_12memory_scope(ptr addrspace(4) [[OBJECT]], ptr addrspace(4) [[EXPECTED_AS4]], i64 [[CAST4]], i32 4, i32 0, i32 1)
diff --git a/llvm-spirv/test/transcoding/AtomicCompareExchange_cl20.ll b/llvm-spirv/test/transcoding/AtomicCompareExchange_cl20.ll
index 98b59a7364c3b..97c598aa9e292 100644
--- a/llvm-spirv/test/transcoding/AtomicCompareExchange_cl20.ll
+++ b/llvm-spirv/test/transcoding/AtomicCompareExchange_cl20.ll
@@ -26,6 +26,24 @@ target triple = "spir-unknown-unknown"
 ; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicit{{.*}}(ptr {{.*}} %object, ptr {{.*}} [[PTR_STRONG]].as, i32 %desired, i32 5, i32 5, i32 2)
 ; CHECK:         load i32, ptr addrspace(4) [[PTR_STRONG]].as
 
+; CHECK-LABEL:   define spir_func void @test_strong_float
+; CHECK-NEXT:    entry:
+; CHECK:         [[PTR_STRONG:%expected[0-9]*]] = alloca i32, align 4
+; CHECK:         [[DESIRED_CAST:%[0-9]*]] = bitcast float %desired to i32
+; CHECK:         store i32 {{.*}}, ptr [[PTR_STRONG]]
+; CHECK:         [[PTR_STRONG]].as = addrspacecast ptr [[PTR_STRONG]] to ptr addrspace(4)
+; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicit{{.*}}(ptr {{.*}} %object, ptr {{.*}} [[PTR_STRONG]].as, i32 [[DESIRED_CAST]], i32 5, i32 5, i32 2)
+; CHECK:         load i32, ptr addrspace(4) [[PTR_STRONG]].as
+
+; CHECK-LABEL:   define spir_func void @test_strong_double
+; CHECK-NEXT:    entry:
+; CHECK:         [[PTR_STRONG:%expected[0-9]*]] = alloca i64, align 8
+; CHECK:         [[DESIRED_CAST:%[0-9]*]] = bitcast double %desired to i64
+; CHECK:         store i64 {{.*}}, ptr [[PTR_STRONG]]
+; CHECK:         [[PTR_STRONG]].as = addrspacecast ptr [[PTR_STRONG]] to ptr addrspace(4)
+; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicit{{.*}}(ptr {{.*}} %object, ptr {{.*}} [[PTR_STRONG]].as, i64 [[DESIRED_CAST]], i32 5, i32 5, i32 2)
+; CHECK:         load i64, ptr addrspace(4) [[PTR_STRONG]].as
+
 ; CHECK-LABEL:   define spir_func void @test_weak
 ; CHECK-NEXT:    entry:
 ; CHECK:         [[PTR_WEAK:%expected[0-9]*]] = alloca i32, align 4
@@ -34,6 +52,24 @@ target triple = "spir-unknown-unknown"
 ; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope{{.*}}(ptr {{.*}} %object, ptr {{.*}} [[PTR_WEAK]].as, i32 %desired, i32 5, i32 5, i32 2)
 ; CHECK:         load i32, ptr addrspace(4) [[PTR_WEAK]].as
 
+; CHECK-LABEL:   define spir_func void @test_weak_float
+; CHECK-NEXT:    entry:
+; CHECK:         [[PTR_WEAK:%expected[0-9]*]] = alloca i32, align 4
+; CHECK:         [[DESIRED_CAST:%[0-9]*]] = bitcast float %desired to i32
+; CHECK:         store i32 {{.*}}, ptr [[PTR_WEAK]]
+; CHECK:         [[PTR_WEAK]].as = addrspacecast ptr [[PTR_WEAK]] to ptr addrspace(4)
+; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiciPU3AS4ii12memory_orderS4_12memory_scope(ptr {{.*}} %object, ptr {{.*}} [[PTR_WEAK]].as, i32 [[DESIRED_CAST]], i32 5, i32 5, i32 2)
+; CHECK:         load i32, ptr addrspace(4) [[PTR_WEAK]].as
+
+; CHECK-LABEL:   define spir_func void @test_weak_double
+; CHECK-NEXT:    entry:
+; CHECK:         [[PTR_WEAK:%expected[0-9]*]] = alloca i64, align 8
+; CHECK:         [[DESIRED_CAST:%[0-9]*]] = bitcast double %desired to i64
+; CHECK:         store i64 {{.*}}, ptr [[PTR_WEAK]]
+; CHECK:         [[PTR_WEAK]].as = addrspacecast ptr [[PTR_WEAK]] to ptr addrspace(4)
+; CHECK:         call spir_func i1 @_Z39atomic_compare_exchange_strong_explicitPU3AS4VU7_AtomiclPU3AS4ll12memory_orderS4_12memory_scope(ptr {{.*}} %object, ptr {{.*}} [[PTR_WEAK]].as, i64 [[DESIRED_CAST]], i32 5, i32 5, i32 2)
+; CHECK:         load i64, ptr addrspace(4) [[PTR_WEAK]].as
+
 ; Check that alloca for atomic_compare_exchange is being created in the entry block.
 
 ; CHECK-LABEL:   @atomic_in_loop
@@ -52,6 +88,22 @@ entry:
 
 declare spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4), ptr addrspace(4), i32) #1
 
+define spir_func void @test_strong_float(ptr addrspace(4) %object, ptr addrspace(4) %expected, float %desired) #0 {
+entry:
+  %call = call spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPU3AS1VU7_AtomicfPU3AS1ff(ptr addrspace(4) %object, ptr addrspace(4) %expected, float %desired) #2
+  ret void
+}
+
+declare spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPU3AS1VU7_AtomicfPU3AS1ff(ptr addrspace(4), ptr addrspace(4), float)
+
+define spir_func void @test_strong_double(ptr addrspace(4) %object, ptr addrspace(4) %expected, double %desired) #0 {
+entry:
+  %call = call spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPU3AS1VU7_AtomicdPU3AS1dd(ptr addrspace(4) %object, ptr addrspace(4) %expected, double %desired) #2
+  ret void
+}
+
+declare spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPU3AS1VU7_AtomicdPU3AS1dd(ptr addrspace(4), ptr addrspace(4), double)
+
 ; Function Attrs: nounwind
 define spir_func void @test_weak(ptr addrspace(4) %object, ptr addrspace(4) %expected, i32 %desired) #0 {
 entry:
@@ -61,6 +113,22 @@ entry:
 
 declare spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4), ptr addrspace(4), i32) #1
 
+define spir_func void @test_weak_float(ptr addrspace(4) %object, ptr addrspace(4) %expected, float %desired) #0 {
+entry:
+  %call = call spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPU3AS1VU7_AtomicfPU3AS1ff(ptr addrspace(4) %object, ptr addrspace(4) %expected, float %desired) #2
+  ret void
+}
+
+declare spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPU3AS1VU7_AtomicfPU3AS1ff(ptr addrspace(4), ptr addrspace(4), float)
+
+define spir_func void @test_weak_double(ptr addrspace(4) %object, ptr addrspace(4) %expected, double %desired) #0 {
+entry:
+  %call = call spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPU3AS1VU7_AtomicdPU3AS1dd(ptr addrspace(4) %object, ptr addrspace(4) %expected, double %desired) #2
+  ret void
+}
+
+declare spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPU3AS1VU7_AtomicdPU3AS1dd(ptr addrspace(4), ptr addrspace(4), double)
+
 ; Function Attrs: nounwind
 define spir_kernel void @atomic_in_loop(ptr addrspace(1) %destMemory, ptr addrspace(1) %oldValues) #0 {
 entry:
diff --git a/llvm-spirv/test/transcoding/addrspace_global_annotations.ll b/llvm-spirv/test/transcoding/addrspace_global_annotations.ll
new file mode 100644
index 0000000000000..1e07e4b013074
--- /dev/null
+++ b/llvm-spirv/test/transcoding/addrspace_global_annotations.ll
@@ -0,0 +1,24 @@
+; This test checks that we don't face an assertion during the reverse translation:
+;   Assertion `C->getType() == Ty->getElementType() && "Wrong type in array element initializer"' failed
+; It also verifies that all the different address spaces were casted to the "common" one.
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_memory_attributes
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc
+; RUN: FileCheck < %t.rev.ll %s
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+%class.test = type { %"class.test_private" }
+%"class.test_private" = type { i16 }
+
+; CHECK: @llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr addrspacecast (ptr addrspace(1) @samples to ptr), ptr @0, ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr @1, ptr undef, i32 undef, ptr undef }]
+
+@llvm.global.annotations = addrspace(1) global [2 x { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1) }] [ { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1) } { ptr addrspace(1) @samples, ptr addrspace(1) @.str, ptr addrspace(1) null, i32 92, ptr addrspace(1) null }, { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1) } { ptr addrspace(1) addrspacecast (ptr addrspace(2) @foo to ptr addrspace(1)), ptr addrspace(1) @.str.2, ptr addrspace(1) null, i32 30, ptr addrspace(1) null }]
+@samples = external addrspace(1) global { [64 x %class.test] }
+@foo = dso_local addrspace(2) constant i32 1, align 4
+@.str = addrspace(1) constant [13 x i8] c"{register:1}\00"
+@.str.2 = addrspace(1) constant [13 x i8] c"{register:0}\00"
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index eb467a06db827..17ffda0ae6a33 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1431,3 +1431,16 @@ endif()
 if (LLVM_INCLUDE_UTILS AND LLVM_INCLUDE_TOOLS)
   add_subdirectory(utils/llvm-locstats)
 endif()
+
+if (XCODE)
+  # For additional targets that you would like to add schemes, specify e.g:
+  #
+  #   -DLLVM_XCODE_EXTRA_TARGET_SCHEMES="TargetParserTests;SupportTests"
+  #
+  # at CMake configure time.
+  set(LLVM_XCODE_EXTRA_TARGET_SCHEMES "" CACHE STRING "Specifies an extra list of targets to turn into schemes")
+
+  foreach(target ${LLVM_XCODE_EXTRA_TARGET_SCHEMES})
+    set_target_properties(${target} PROPERTIES XCODE_GENERATE_SCHEME ON)
+  endforeach()
+endif()
diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT
index d1620d1cbf870..5b4df555fd666 100644
--- a/llvm/CODE_OWNERS.TXT
+++ b/llvm/CODE_OWNERS.TXT
@@ -263,3 +263,7 @@ D: C-SKY backend (lib/Target/CSKY/*)
 N: Ilia Diachkov
 E: ilia.diachkov@gmail.com
 D: SPIR-V backend (lib/Target/SPIRV/*)
+
+N: Christopher Apple, David Trevelyan
+E: cja-private@pm.me, david.trevelyan@gmail.com
+D: RealtimeSanitizer (LLVM part)
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 43f88f7257924..52d726451ada9 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
   Support)
 
-add_benchmark(DummyYAML DummyYAML.cpp)
+add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
diff --git a/llvm/benchmarks/xxhash.cpp b/llvm/benchmarks/xxhash.cpp
new file mode 100644
index 0000000000000..429cbc0fa87d4
--- /dev/null
+++ b/llvm/benchmarks/xxhash.cpp
@@ -0,0 +1,29 @@
+#include "llvm/Support/xxhash.h"
+#include "benchmark/benchmark.h"
+
+#include <memory>
+
+static uint32_t xorshift(uint32_t State) {
+  State ^= State << 13;
+  State ^= State >> 17;
+  State ^= State << 5;
+  return State;
+}
+
+static void BM_xxh3_64bits(benchmark::State &State) {
+  std::unique_ptr<uint32_t[]> Data(new uint32_t[State.range(0) / 4]);
+
+  uint32_t Prev = 0xcafebabe;
+  for (int64_t I = 0; I < State.range(0) / 4; I++)
+    Data[I] = Prev = xorshift(Prev);
+
+  llvm::ArrayRef DataRef =
+      llvm::ArrayRef(reinterpret_cast<uint8_t *>(Data.get()), State.range(0));
+
+  for (auto _ : State)
+    llvm::xxh3_64bits(DataRef);
+}
+
+BENCHMARK(BM_xxh3_64bits)->Arg(32)->Arg(512)->Arg(64 * 1024)->Arg(1024 * 1024);
+
+BENCHMARK_MAIN();
diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index 86b010e0ac22d..908e6658a89f7 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -519,7 +519,6 @@ external vector_size : lltype -> int = "llvm_vector_size"
 (*--... Operations on other types ..........................................--*)
 external void_type : llcontext -> lltype = "llvm_void_type"
 external label_type : llcontext -> lltype = "llvm_label_type"
-external x86_mmx_type : llcontext -> lltype = "llvm_x86_mmx_type"
 external type_by_name : llmodule -> string -> lltype option = "llvm_type_by_name"
 
 external classify_value : llvalue -> ValueKind.t = "llvm_classify_value"
diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index c16530d3a70cb..b8a430adf6cf2 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -760,10 +760,6 @@ val void_type : llcontext -> lltype
     [llvm::Type::LabelTy]. *)
 val label_type : llcontext -> lltype
 
-(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the
-    context [c]. See [llvm::Type::X86_MMXTy]. *)
-val x86_mmx_type : llcontext -> lltype
-
 (** [type_by_name m name] returns the specified type from the current module
     if it exists.
     See the method [llvm::Module::getTypeByName] *)
diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 4ac824cd6a98a..5906f427e6907 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -686,11 +686,6 @@ value llvm_label_type(value Context) {
   return to_val(LLVMLabelTypeInContext(Context_val(Context)));
 }
 
-/* llcontext -> lltype */
-value llvm_x86_mmx_type(value Context) {
-  return to_val(LLVMX86MMXTypeInContext(Context_val(Context)));
-}
-
 /* llmodule -> string -> lltype option */
 value llvm_type_by_name(value M, value Name) {
   return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name)));
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 03f4e1f190fd9..bb4e9963d0913 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -327,8 +327,8 @@ function(add_link_opts target_name)
       elseif(${CMAKE_SYSTEM_NAME} MATCHES "SunOS" AND LLVM_LINKER_IS_SOLARISLD)
         # Support for ld -z discard-unused=sections was only added in
         # Solaris 11.4.  GNU ld ignores it, but warns every time.
-        include(LLVMCheckLinkerFlag)
-        llvm_check_linker_flag(CXX "-Wl,-z,discard-unused=sections" LINKER_SUPPORTS_Z_DISCARD_UNUSED)
+        include(CheckLinkerFlag)
+        check_linker_flag(CXX "-Wl,-z,discard-unused=sections" LINKER_SUPPORTS_Z_DISCARD_UNUSED)
         if (LINKER_SUPPORTS_Z_DISCARD_UNUSED)
           set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                        LINK_FLAGS " -Wl,-z,discard-unused=sections")
@@ -1452,6 +1452,7 @@ macro(llvm_add_tool project name)
   endif()
   get_subproject_title(subproject_title)
   set_target_properties(${name} PROPERTIES FOLDER "${subproject_title}/Tools")
+  set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
 endmacro(llvm_add_tool project name)
 
 macro(add_llvm_tool name)
@@ -2043,6 +2044,7 @@ function(add_lit_target target comment)
 
   # Tests should be excluded from "Build Solution".
   set_target_properties(${target} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD ON)
+  set_target_properties(${target} PROPERTIES XCODE_GENERATE_SCHEME ON)
 endfunction()
 
 # Convert a target name like check-clang to a variable name like CLANG.
diff --git a/llvm/cmake/modules/AddSecurityFlags.cmake b/llvm/cmake/modules/AddSecurityFlags.cmake
index 38c9c736b6e74..774ca47b5cd6e 100644
--- a/llvm/cmake/modules/AddSecurityFlags.cmake
+++ b/llvm/cmake/modules/AddSecurityFlags.cmake
@@ -20,9 +20,9 @@ macro(add_compile_option_ext flag name)
 endmacro()
 
 macro(add_link_option_ext flag name)
-  include(LLVMCheckLinkerFlag)
+  include(CheckLinkerFlag)
   cmake_parse_arguments(ARG "" "" "" ${ARGN})
-  llvm_check_linker_flag(CXX "${flag}" "LINKER_SUPPORTS_${name}")
+  check_linker_flag(CXX "${flag}" "LINKER_SUPPORTS_${name}")
   if(LINKER_SUPPORTS_${name})
     message(STATUS "Building with ${flag}")
     append("${flag}" ${ARG_UNPARSED_ARGUMENTS})
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 63a5bfe09a037..18c9191fb705b 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -1061,8 +1061,8 @@ if (LLVM_USE_SPLIT_DWARF AND
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR
       CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     add_compile_options($<$<COMPILE_LANGUAGE:C,CXX>:-gsplit-dwarf>)
-    include(LLVMCheckLinkerFlag)
-    llvm_check_linker_flag(CXX "-Wl,--gdb-index" LINKER_SUPPORTS_GDB_INDEX)
+    include(CheckLinkerFlag)
+    check_linker_flag(CXX "-Wl,--gdb-index" LINKER_SUPPORTS_GDB_INDEX)
     append_if(LINKER_SUPPORTS_GDB_INDEX "-Wl,--gdb-index"
       CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
   endif()
@@ -1083,8 +1083,8 @@ endif()
 
 # lld doesn't print colored diagnostics when invoked from Ninja
 if (UNIX AND CMAKE_GENERATOR MATCHES "Ninja")
-  include(LLVMCheckLinkerFlag)
-  llvm_check_linker_flag(CXX "-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
+  include(CheckLinkerFlag)
+  check_linker_flag(CXX "-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
   append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,--color-diagnostics"
     CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
 endif()
diff --git a/llvm/cmake/modules/HandleLLVMStdlib.cmake b/llvm/cmake/modules/HandleLLVMStdlib.cmake
index 7afc10cff74ff..a7e138aa0789b 100644
--- a/llvm/cmake/modules/HandleLLVMStdlib.cmake
+++ b/llvm/cmake/modules/HandleLLVMStdlib.cmake
@@ -13,12 +13,12 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED)
   endfunction()
 
   include(CheckCXXCompilerFlag)
-  include(LLVMCheckLinkerFlag)
+  include(CheckLinkerFlag)
   set(LLVM_LIBCXX_USED 0)
   if(LLVM_ENABLE_LIBCXX)
     if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
       check_cxx_compiler_flag("-stdlib=libc++" CXX_COMPILER_SUPPORTS_STDLIB)
-      llvm_check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB)
+      check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB)
       if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB)
         append("-stdlib=libc++"
           CMAKE_CXX_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS
@@ -36,7 +36,7 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED)
     if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
       check_cxx_compiler_flag("-static-libstdc++"
                               CXX_COMPILER_SUPPORTS_STATIC_STDLIB)
-      llvm_check_linker_flag(CXX "-static-libstdc++" CXX_LINKER_SUPPORTS_STATIC_STDLIB)
+      check_linker_flag(CXX "-static-libstdc++" CXX_LINKER_SUPPORTS_STATIC_STDLIB)
       if(CXX_COMPILER_SUPPORTS_STATIC_STDLIB AND
         CXX_LINKER_SUPPORTS_STATIC_STDLIB)
         append("-static-libstdc++"
diff --git a/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake b/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake
deleted file mode 100644
index e09bbc66f2d26..0000000000000
--- a/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake
+++ /dev/null
@@ -1,28 +0,0 @@
-include(CheckLinkerFlag OPTIONAL)
-
-if (COMMAND check_linker_flag)
-  macro(llvm_check_linker_flag)
-    check_linker_flag(${ARGN})
-  endmacro()
-else()
-  # Until the minimum CMAKE version is 3.18
-
-  include(CheckCXXCompilerFlag)
-
-  # cmake builtin compatible, except we assume lang is C or CXX
-  function(llvm_check_linker_flag lang flag out_var)
-    cmake_policy(PUSH)
-    cmake_policy(SET CMP0056 NEW)
-    set(_CMAKE_EXE_LINKER_FLAGS_SAVE ${CMAKE_EXE_LINKER_FLAGS})
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
-    if("${lang}" STREQUAL "C")
-      check_c_compiler_flag("" ${out_var})
-    elseif("${lang}" STREQUAL "CXX")
-      check_cxx_compiler_flag("" ${out_var})
-    else()
-      message(FATAL_ERROR "\"${lang}\" is not C or CXX")
-    endif()
-    set(CMAKE_EXE_LINKER_FLAGS ${_CMAKE_EXE_LINKER_FLAGS_SAVE})
-    cmake_policy(POP)
-  endfunction()
-endif()
diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst
index 46af2e421a258..1a724a58f58e0 100644
--- a/llvm/docs/BitCodeFormat.rst
+++ b/llvm/docs/BitCodeFormat.rst
@@ -1291,7 +1291,7 @@ TYPE_CODE_X86_MMX Record
 
 ``[X86_MMX]``
 
-The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table.
+The ``X86_MMX`` record (code 17) is deprecated, and imported as a <1 x i64> vector.
 
 TYPE_CODE_STRUCT_ANON Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst
index 799ee34e9f9ff..c9d5baba3e2f4 100644
--- a/llvm/docs/CommandGuide/lit.rst
+++ b/llvm/docs/CommandGuide/lit.rst
@@ -151,6 +151,10 @@ EXECUTION OPTIONS
  feature that can be used to conditionally disable (or expect failure in)
  certain tests.
 
+.. option:: --skip-test-time-recording
+
+ Disable tracking the wall time individual tests take to execute.
+
 .. option:: --time-tests
 
  Track the wall time individual tests take to execute and includes the results
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 8ccb025a3f0f3..7f20a76550c83 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -303,6 +303,15 @@ them.
 
  Shift LMA of non-zero-sized segments by ``<val>``.
 
+.. option:: --change-section-address <section>{=+-}<val>, --adjust-section-vma
+
+ Change the address of sections that match ``<section>`` pattern to the
+ specified value, or apply ``+<val>``/``-<val>`` to the current value. Can be
+ specified multiple times to specify multiple patterns. Each section is only
+ modified by one ``--change-section-address`` argument. If a section name
+ matches multiple patterns, the rightmost change applies. The object file needs
+ to be of ET_REL type.
+
 .. option:: --change-start <incr>, --adjust-start
 
  Add ``<incr>`` to the program's start address. Can be specified multiple
diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst
index 0371d7a3bdfcb..1b1a27255cc40 100644
--- a/llvm/docs/CompileCudaWithLLVM.rst
+++ b/llvm/docs/CompileCudaWithLLVM.rst
@@ -514,7 +514,7 @@ Modern CPUs and GPUs are architecturally quite different, so code that's fast
 on a CPU isn't necessarily fast on a GPU.  We've made a number of changes to
 LLVM to make it generate good GPU code.  Among these changes are:
 
-* `Straight-line scalar optimizations <https://goo.gl/4Rb9As>`_ -- These
+* `Straight-line scalar optimizations <https://docs.google.com/document/d/1momWzKFf4D6h8H3YlfgKQ3qeZy5ayvMRh6yR-Xn2hUE>`_ -- These
   reduce redundancy within straight-line code.
 
 * `Aggressive speculative execution
diff --git a/llvm/docs/DirectX/DXILOpTableGenDesign.rst b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
index 5f3ac75e82494..50d801bd05efd 100644
--- a/llvm/docs/DirectX/DXILOpTableGenDesign.rst
+++ b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
@@ -24,12 +24,12 @@ DXIL Operations are represented in one of the following `two ways
 
    These are  collectively referred to as `LLVM Intrinsics` in this note.
 
-Following is the complete list of attributes of DXIL Ops with the corresponding field name
-as used in ``hctdb.py``. A DXIL Op is represented by a set of associated attributes. These
-are consumed in DXIL backend passes as well as in other usage scenarios such as validation, 
+Following is the complete list of properties of DXIL Ops with the corresponding field name
+as used in ``hctdb.py``. A DXIL Op is represented by a set of associated properties. These
+are consumed in DXIL backend passes as well as in other usage scenarios such as validation,
 DXIL reader, etc.
 
-A. Attributes consumed in DXIL backend passes
+A. Properties consumed in DXIL backend passes
 
    1. Name of operation (``dxil_op``)
    2. A string that documents the operation (``doc``) - This is not strictly necessary but is included
@@ -38,7 +38,7 @@ A. Attributes consumed in DXIL backend passes
    4. Unique Integer ID (``dxil_opid``)
    5. Operation Class signifying the name and function signature of the operation (``dxil_class``).
       This string is an integral part of the DXIL Op function name and is constructed in
-      the format ``dx.op.<class-name>.<overload-type>``. Each DXIL Op call target function name 
+      the format ``dx.op.<class-name>.<overload-type>``. Each DXIL Op call target function name
       is required to conform to this format per existing contract with the driver.
    6. List of valid overload types for the operation (``oload_types``).
    7. Required DXIL Version with support for the operation.
@@ -58,26 +58,26 @@ A. Attributes consumed in DXIL backend passes
 Motivation
 ==========
 
-DXIL backend passes depend on various attributes of DXIL Operations. For example, ``DXILOpLowering``
+DXIL backend passes depend on various properties of DXIL Operations. For example, ``DXILOpLowering``
 pass will need information such as the DXIL operation an LLVM intrinsic is to be lowered to,
-along with valid overload and parameter types etc. The TableGen file -
+along with valid overload and argument types etc. The TableGen file -
 ``llvm/lib/Target/DirectX/DXIL.td`` - is used to represent DXIL Operations
-by specifying their attributes listed above. ``DXIL.td`` is designed to be the single source
-of reference of DXIL Operations primarily for the implementation of passes in DXIL backend in 
-``llvm-project`` repo - analogous to ``hctdb.py`` for ``DirectXShadeCompiler`` repo. However, 
-the current design does not intend to encapsulate various validation rules, present in ``hctdb.py``, 
-but do not pertain to DXIL Operations. It needs to have a rich representation capabilities that 
-TableGen backends (such as ``DXILEmitter``) can rely on. Additionally, the DXIL Op specification 
+by specifying their properties listed above. ``DXIL.td`` is designed to be the single source
+of reference of DXIL Operations primarily for the implementation of passes in DXIL backend in
+``llvm-project`` repo - analogous to ``hctdb.py`` for ``DirectXShadeCompiler`` repo. However,
+the current design does not intend to encapsulate various validation rules, present in ``hctdb.py``,
+but do not pertain to DXIL Operations. It needs to have a rich representation capabilities that
+TableGen backends (such as ``DXILEmitter``) can rely on. Additionally, the DXIL Op specification
 should be easy to read and comprehend.
 
 This note provides the design of the specification DXIL Ops as TableGen class ``DXILOp``
-by specifying its attributes identified above.
+by specifying its properties identified above.
 
 DXIL Operation Specification
 ============================
 
 The DXIL Operation is represented using the TableGen class ``DXILOp``. The DXIL operation
-attributes are specified as fields of the ``DXILOp`` class as described below.
+properties are specified as fields of the ``DXILOp`` class as described below.
 
 1. Each DXIL Operation is represented as a TableGen record. The name of each of the records
    signifies operation name.
@@ -93,76 +93,81 @@ attributes are specified as fields of the ``DXILOp`` class as described below.
         class DXILOpClass;
 
    Concrete operation records, such as ``unary`` are defined by inheriting from ``DXILOpClass``.
-6. Return and argument types of the operation are represented as ``dag``s using the
-   special markers ``out`` and ``ins``. An overload type, if supported by the operation, is
-   denoted as the positional type ``dxil_overload_ty`` in the argument or in the result, where
-   ``dxil_overload_ty`` is defined to be synonymous to ``llvm_any_ty``.
+6. Return type of the operation is represented as ``LLVMType``.
+7. Operation arguments are represented as a list of ``LLVMType`` with each type
+   corresponding to the argument position. An overload type, if supported by the operation, is
+   denoted as the positional type ``overloadTy`` in the argument or in the result, where
+   ``overloadTy`` is defined to be synonymous to ``llvm_any_ty``.
 
    .. code-block::
 
-      defvar dxil_overload_ty = llvm_any_ty
+      defvar overloadTy = llvm_any_ty
 
+   Empty list, ``[]`` represents an operation with no arguments.
 
-7. Valid overload types and shader stages predicated on Shader Model version are specified
-   as a list of ``Constraint`` records. Representation of ``Constraints`` class is described
-   a later section.
-8. Various attributes of the DXIL Operation that are not predicated on Shader Model version
-   are represented as a ``dag`` using the special marker ``attrs``. Representation of ``Attributes`` 
+8. Valid operation overload types predicated on DXIL version are specified as
+   a list of ``Overloads`` records. Representation of ``Overloads``
    class is described in a later section.
+9.  Valid shader stages predicated on DXIL version are specified as a list of
+    ``Stages`` records. Representation of ``Stages`` class is
+    described in a later section.
+10. Various attributes of the DXIL Operation are represented as a ``list`` of
+    ``Attributes`` class records. Representation of ``Attributes``
+    class is described in a later section.
 
-A DXIL Operation is represented by the following TableGen class by encapsulating the various
-TableGen representations of its attributes described above.
+Types specific to DXIL
+----------------------
+
+Type notation used in this document viz., ``<size>Ty`` corresponds to TableGen records for
+LLVM types ``llvm_<size>_ty``. Apart from ``overloadTy`` described above, ``resRetF32Ty`` is
+used to denote resource return type and ``handleTy`` is used to denote handle type.
+
+Specification of DXIL Operation
+================================
+
+A DXIL Operation is represented by the following TableGen class that encapsulates the various
+TableGen representations of its properties described above.
 
 .. code-block::
 
    // Abstraction DXIL Operation
-   class DXILOp {
+   class DXILOp<int opcode, DXILOpClass opclass> {
      // A short description of the operation
      string Doc = "";
 
      // Opcode of DXIL Operation
-     int OpCode = 0;
+     int OpCode = opcode;
 
      // Class of DXIL Operation.
-     DXILOpClass OpClass = UnknownOpClass;
+     DXILOpClass OpClass = opclass;
 
      // LLVM Intrinsic DXIL Operation maps to
      Intrinsic LLVMIntrinsic = ?;
 
-     // Dag containing the arguments of the op. Default to 0 arguments.
-     dag arguments = (ins);
-
-     // Results of the op. Default to 0 results.
-     dag result = (out);
-
-     // List of constraints predicated on Shader Model version
-     list<SMVersionConstraints> sm_constraints;
+     // Result type of the op.
+     LLVMType result;
 
-     // Non-predicated operation attributes
-     dag attrtibutes = (attrs);
-     Version DXILVersion = ?;
-   }
-
-Constraint Specification
-========================
+     // List of argument types of the op. Default to 0 arguments.
+     list<LLVMType> arguments = [];
 
-DXIL Operation attributes such as valid overload types and valid shader stages are
-predicated on Shader Model version. These are represented as list of constrained
-attributes.
+     // List of valid overload types predicated by DXIL version
+     list<Overloads> overloads;
 
-Following is the definition of a generic constraint and the associated predicate
+     // List of valid shader stages predicated by DXIL version
+    list<Stages> stages;
 
-.. code-block::
+     // List of valid attributes predicated by DXIL version
+     list<Attributes> attributes = [];
+   }
 
-   // Primitive predicate
-   class Pred;
+Version Specification
+=====================
 
-   // Generic constraint
-   class Constraint<Pred pred> {
-     Pred predicate = pred;
-   }
+DXIL version is used to specify various version-dependent operation properties in
+place of Shader Model version.
 
-Shader Model version is represented as follows:
+A ``Version`` class encapsulating ``Major`` and ``Minor`` version number is defined
+as follows:
 
 .. code-block::
 
@@ -172,112 +177,272 @@ Shader Model version is represented as follows:
      int Minor = minor;
    }
 
-   // Valid Shader model version records
 
-   // Definition of Shader Model 6.0 - 6.8 and DXIL Version 1.0 - 1.8
+Concrete representations of valid DXIL versions are defined as follows:
+
+.. code-block::
+
+   // Definition of DXIL Version 1.0 - 1.8
    foreach i = 0...8 in {
-     def SM6_#i : Version<6, i>;
-     def DX1_#i : Version<1, i>;
+     def DXIL1_#i : Version<1, i>;
    }
 
-A shader model version predicate class is defined as
+Shader Stage Specification
+==========================
+
+Various shader stages such as ``compute``, ``pixel``, ``vertex``, etc., are represented
+as follows
 
 .. code-block::
 
-   class SMVersion<Version ver> : Pred {
-     Version SMVersion = ver;
-   }
+   // Shader stages
+   class DXILShaderStage;
 
-A constraint class to represent overload types and shader stages predicated on shader
-model version is defined as
+   def compute : DXILShaderStage;
+   def pixel : DXILShaderStage;
+   def vertex : DXILShaderStage;
+   ...
+
+Shader Attribute Specification
+==============================
+
+Various operation memory access and boolean attributes such as ``ReadNone``,
+``IsWave`` etc., are represented as follows
 
 .. code-block::
 
-   class SMVersionConstraints<SMVersion smver, dag oloads, dag stages> : Constraint<smver> {
-     dag overload_types = oloads;
-     dag stage_kinds = stages;
-   }
+  class DXILAttribute;
 
-The ``dag overload_types`` and ``dag shader_kinds`` use a special markers ``overloads``
-and ``stages``, respectively.
+  def ReadOnly : DXILOpAttributes;
+  def ReadNone : DXILOpAttributes;
+  def IsWave : DXILOpAttributes;
+  ...
 
-Examples of Constraints
------------------------
+Versioned Property Specification
+================================
+
+DXIL Operation properties such as valid overload types, shader stages and
+attributes are predicated on DXIL version. These are represented as list of
+versioned properties.
+
+Overload Type Specification
+---------------------------
 
-Consider a DXIL Operation that is valid in Shader Model 6.2 and later,
+``overloads`` field of ``class DXILOp`` is used to represent valid operation
+overloads predicated on DXIL version as list of records of the following class
+
+.. code-block::
 
-1. with valid overload types ``half``, ``float``, ``i16`` and ``i32``
-2. is valid for stages ``pixel`` and ``compute``
-3. with valid overload types ``double`` and ``i614`` if Shader Model version 6.3 and later
-4. is valid for all stages if Shader Model version 6.3 and later
+   class Overloads<Version minver, list<LLVMType> ols> {
+     Version dxil_version = minver;
+     list<LLVMType> overload_types = ols;
+   }
 
-This is represented as
+Following is an example specification of valid overload types for ``DXIL1_0`` and
+``DXIL1_2``.
 
 .. code-block::
 
-   [SMVersionConstraints<SMVersion<SM6_2>,
-                          (overloads llvm_half_ty, llvm_float_ty, llvm_i16_ty, llvm_i32_ty),
-                          (stages pixel, compute)>,
-    SMVersionConstraints<SMVersion<SM6_3>,
-                          (overloads llvm_half_ty, llvm_float_ty, llvm_double_ty,
-                                 llvm_i16_ty, llvm_i32_ty, llvm_i64_ty),
-                          (stages allKinds)>];
+   overloads = [
+                 Overloads<DXIL1_0, [halfTy, floatTy]>,
+                 Overloads<DXIL1_2, [halfTy, floatTy, doubleTy]>
+               ];
+
+An empty list signifies that the operation supports no overload types.
 
-Consider a DXIL operation that is valid in Shader Model version 6.2 and later,
 
-1. with no overload types, i.e., all argument typess and result type are fixed.
-2. is valid for all stages.
+Stages Specification
+--------------------
 
-This is represented as
+``stages`` field of ``class DXILOp`` is used to represent valid operation
+stages predicated on DXIL version as list of records of the following class
 
 .. code-block::
 
-     [SMVersionConstraints<SMVersion<SM6_2>, (overloads), (stages allKinds)>];
+   class Stages<Version minver, list<DXILShaderStage> sts> {
+     Version dxil_version = minver;
+     list<DXILShaderStage> shader_stages = sts;
+   }
 
+Following is an example specification of valid stages for ``DXIL1_0``,
+``DXIL1_2``, ``DXIL1_4`` and ``DXIL1_6``.
+
+.. code-block::
+
+   stages = [
+             Stages<DXIL1_0, [compute, pixel]>,
+             Stages<DXIL1_2, [compute, pixel, mesh]>,
+             Stages<DXIL1_4, [all_stages]>,
+             Stages<DXIL1_6, [removed]>
+            ];
+
+The following two pseudo stage records in addition to standard shader stages
+are defined.
+
+1. ``all_stages`` signifies that the operation is valid for all stages in the
+   specified DXIL version and later.
+2. ``removed`` signifies removal of support for the operation in the specified
+   DXIL version and later.
+
+A non-empty list of supported stages is required to be specified. If an operation
+is supported in all DXIL versions and all stages it is required to be specified as
+
+.. code-block::
+
+   stages = [Stages<DXIL1_0, [all_stages]>];
 
-Specifying attributes predicated on Shader Model version using the single field 
-``sm_constraints`` not only allows for all of them to be specified together but
-also allows for a single place to specify minimum shader model version that supports
-the operation. Thus, a separate fiels is not needed to specify minimum shader model 
-version.
 
 Attribute Specification
-=======================
+-----------------------
 
-DXIL Operation attributes that are not predicated on any constraint, are represented as
-a ``dag`` of Attribute records of the following abstract ``DXILAttributes`` class.
+``attributes`` field of ``class DXILOp`` is used to represent valid operation
+attributes predicated on DXIL version as list of records of the following class
 
 .. code-block::
 
-  class DXILAttributes;
+  class Attributes<MinVersion minver, list<DXILAttribute> attrs> {
+    MinVersion dxil_version = ver;
+    list<DXILAttribute> attributes = attrs;
+  }
 
-Following example records represent memory attributes 
+Following is an example specification of valid attributes for ``DXIL1_0``.
 
 .. code-block::
 
-  def ReadOnly : DXILOpAttributes;
-  def ReadNone : DXILOpAttributes;
+   attributes = [Attributes<DXIL1_0, [ReadNone]];
+
+A null list of ``attributes`` signifies no operation attributes.
 
-DXIL Operation Specification Example
-====================================
-Following illustrates the specification of the DXIL Op ``Sin``
+Interpretation of Multiple Versioned Properties
+-----------------------------------------------
+
+Each of the versioned properties states that the specified overload type, stage or
+attribute records are valid for the predicated DXIL version. Only
+the properties corresponding to latest minimal DXIL version are applicable.
+Note as in the above example, any overload types, stages or attributes,
+that remain valid in a later DXIL version need to be specified in full.
+For example, consider the following specification of valid overload types:
 
 .. code-block::
 
-  def Sin  : DXILOp {
-    let Doc ="Returns sine(theta) for theta in radians.";
-    let OpCode = 13;
-    let OpClass = unary;
+   overloads = [
+                Overloads<DXIL1_0, [halfTy, floatTy]>,
+                Overloads<DXIL1_2, [halfTy, floatTy, doubleTy]>
+               ];
+
+It specifies that the overload types ``halfTy`` and ``floatTy`` are valid for DXIL
+version 1.0 and later. It also specifies that  ``doubleTy`` is additionally supported
+in DXIL version 1.2 and later.
+
+This provides the flexibility to specify properties independent of other
+versioned specifications in the list.
+
+
+DXIL Operation Specification Examples
+=====================================
+
+Following examples illustrate the specification of some of the DXIL Ops.
+
+``Sin`` operation - an operation valid in all DXIL versions and all stages
+and has valid overload types predicated on DXIL version.
+
+.. code-block::
+
+  def Sin : DXILOp<13, unary> {
+    let Doc = "Returns sine(theta) for theta in radians.";
     let LLVMIntrinsic = int_sin;
-    let arguments = (ins LLVMMatchType<0>);
-    let result = (out dxil_overload_ty);
-    let sm_constraints = [SMVersionConstraints<SMVersion<SM6_0>,
-                          (overloads llvm_half_ty, llvm_float_ty),
-                          (stages allKinds)>];
-    let attributes = (attrs ReadNone);
-    let DXILVersion = DX1_0;
+    let result = overloadTy;
+    let arguments = [overloadTy];
+    let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+    let stages = [Stages<DXIL1_0, [all_stages]>];
+    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
   }
 
+``FlattenedThreadIdInGroup`` - an operation with no arguments, no
+overload types, and valid stages and attributes predicated by DXIL Version.
+
+.. code-block::
+
+   def FlattenedThreadIdInGroup :  DXILOp<96, flattenedThreadIdInGroup> {
+    let Doc = "Provides a flattened index for a given thread within a given "
+              "group (SV_GroupIndex)";
+    let LLVMIntrinsic = int_dx_flattened_thread_id_in_group;
+    let result = i32Ty;
+    let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+   }
+
+``RawBufferStore`` - an operation with ``void`` return type, valid overload types
+predicated by DXIL Version and valid in all DXIL versions and stages.
+
+.. code-block::
+
+   def RawBufferStore : DXILOp<140, rawBufferStore> {
+     let Doc = "Writes to a RWByteAddressBuffer or RWStructuredBuffer.";
+     let result = voidTy;
+     let arguments = [dxil_resource_ty, i32Ty, i32Ty, overloadTy,
+                      overloadTy, overloadTy, overloadTy, i8Ty, i32Ty];
+     let overloads = [
+                      Overloads<DXIL1_2, [halfTy, floatTy, i16Ty, i32Ty]>,
+                      Overloads<DXIL1_3>,[halfTy, floatTy, doubleTy,
+                                                   i16Ty, i32Ty, i64Ty]>
+                     ];
+      let stages = [Stages<DXIL1_2, all_stages>];
+      let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
+   }
+
+``DerivCoarseX`` - an operation with no overload types and stages predicated
+by DXIL Version.
+
+.. code-block::
+
+   def DerivCoarseX : DXILOp<83, unary> {
+    let doc = "Computes the rate of change per stamp in x direction.";
+    let LLVMIntrinsic = int_dx_ddx;
+    let result = overloadTy;
+    let arguments = [overloadTy];
+    let stages = [
+                   Stages<DXIL1_0, [library, pixel]>,
+                   Stages<DXIL1_6, [library, pixel, amplification, compute, mesh]>
+                 ];
+    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+   }
+
+``CreateHandle`` - an operation with no overload types, no associated ``LLVMIntrinsic``
+and stages predicated  by DXIL Version.
+
+.. code-block::
+
+   def CreateHandle : DXILOp<57, createHandle> {
+     let doc = "Creates the handle to a resource";
+     let result = i32Ty;
+     let arguments = [i8Ty, i32Ty, i32Ty, i1Ty];
+     let stages = [
+                   Stages<DXIL1_0, [all_stages]>,
+                   Stages<DXIL1_6, [removed]
+                  ];
+     let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
+   }
+
+``Sample`` - an operation with valid overload types, stages and attributes
+predicated by DXIL version.
+
+.. code-block::
+
+   def Sample : DXILOp<60, sample> {
+     let Doc = "Samples a texture";
+     let LLVMIntrinsic = int_dx_sample;
+     let result = resRetF32Ty;
+     let arguments = [handleTy, handleTy, floatTy, floatTy, floatTy, floatTy,
+                      i32Ty, i32Ty, i32Ty, floatTy];
+     let overloads = [Overloads<DXIL1_0, [halfTy, floatTy, i16Ty, i32Ty]>];
+     let stages = [
+                   Stages<DXIL1_0, [library, pixel]>,
+                   Stages<DXIL1_6, [library, pixel, amplification, compute, mesh]>
+                  ];
+     let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
+   }
+
 Summary
 =======
 
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 5bbe902d9d2d7..aef88bc43b224 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -96,7 +96,7 @@ Buffers
 
 .. code-block:: llvm
 
-   target("dx.TypedBuffer", ElementType, IsWriteable, IsROV)
+   target("dx.TypedBuffer", ElementType, IsWriteable, IsROV, IsSigned)
    target("dx.RawBuffer", ElementType, IsWriteable, IsROV)
 
 We need two separate buffer types to account for the differences between the
@@ -106,9 +106,14 @@ used for DXIL's RawBuffers and StructuredBuffers. We call the latter
 "RawBuffer" to match the naming of the operations, but it can represent both
 the Raw and Structured variants.
 
-For TypedBuffer, the element type must be an integer or floating point type.
-For RawBuffer the type can be an integer, floating point, or struct type.
-HLSL's ByteAddressBuffer is represented by an `i8` element type.
+HLSL's Buffer and RWBuffer are represented as a TypedBuffer with an element
+type that is a scalar integer or floating point type, or a vector of at most 4
+such types. HLSL's ByteAddressBuffer is a RawBuffer with an `i8` element type.
+HLSL's StructuredBuffers are RawBuffer with a struct, vector, or scalar type.
+
+One unfortunate necessity here is that TypedBuffer needs an extra parameter to
+differentiate signed vs unsigned ints. The is because in LLVM IR int types
+don't have a sign, so to keep this information we need a side channel.
 
 These types are generally used by BufferLoad and BufferStore operations, as
 well as atomics.
@@ -128,6 +133,8 @@ There are a few fields to describe variants of all of these types:
        writeable) and UAVs (writeable).
    * - IsROV
      - Whether the UAV is a rasterizer ordered view. Always ``0`` for SRVs.
+   * - IsSigned
+     - Whether an int element type is signed ("dx.TypedBuffer" only)
 
 .. _bufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferload
 .. _bufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
@@ -197,23 +204,23 @@ Examples:
 .. code-block:: llvm
 
    ; RWBuffer<float4> Buf : register(u5, space3)
-   %buf = call target("dx.TypedBuffer", float, 1, 0)
+   %buf = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
                @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0(
                    i32 3, i32 5, i32 1, i32 0, i1 false)
 
-   ; RWBuffer<uint> Buf : register(u7, space2)
-   %buf = call target("dx.TypedBuffer", i32, 1, 0)
+   ; RWBuffer<int> Buf : register(u7, space2)
+   %buf = call target("dx.TypedBuffer", i32, 1, 0, 1)
                @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0t(
                    i32 2, i32 7, i32 1, i32 0, i1 false)
 
    ; Buffer<uint4> Buf[24] : register(t3, space5)
-   %buf = call target("dx.TypedBuffer", i32, 0, 0)
+   %buf = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0)
                @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0t(
                    i32 2, i32 7, i32 24, i32 0, i1 false)
 
    ; struct S { float4 a; uint4 b; };
    ; StructuredBuffer<S> Buf : register(t2, space4)
-   %buf = call target("dx.RawBuffer", {<4 x f32>, <4 x i32>}, 0, 0)
+   %buf = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0)
                @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
                    i32 4, i32 2, i32 1, i32 0, i1 false)
 
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 0a1913dca8aac..e03ae5effcdc4 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -291,10 +291,11 @@ uses the package and provides other details.
 =========================================================== ============ ==========================================
 Package                                                     Version      Notes
 =========================================================== ============ ==========================================
-`CMake <http://cmake.org/>`__                               >=3.20.0     Makefile/workspace generator
+`CMake <http://cmake.org/>`_                                >=3.20.0     Makefile/workspace generator
 `python <http://www.python.org/>`_                          >=3.8        Automated test suite\ :sup:`1`
 `zlib <http://zlib.net>`_                                   >=1.2.3.4    Compression library\ :sup:`2`
 `GNU Make <http://savannah.gnu.org/projects/make>`_         3.79, 3.79.1 Makefile/build processor\ :sup:`3`
+`PyYAML <https://pypi.org/project/PyYAML/>`_                >=5.1        Header generator\ :sup:`4`
 =========================================================== ============ ==========================================
 
 .. note::
@@ -305,6 +306,7 @@ Package                                                     Version      Notes
    #. Optional, adds compression / uncompression capabilities to selected LLVM
       tools.
    #. Optional, you can use any other build tool supported by CMake.
+   #. Only needed when building libc with New Headergen. Mainly used by libc.
 
 Additionally, your compilation host is expected to have the usual plethora of
 Unix utilities. Specifically:
diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index 4010958c5d313..14424f722abf6 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -376,10 +376,10 @@ Releases
 
 Backporting Fixes to the Release Branches
 -----------------------------------------
-You can use special comments on issues to make backport requests for the
-release branches.  This is done by making a comment containing the following
-command on any issue that has been added to one of the "X.Y.Z Release"
-milestones.
+You can use special comments on issues or pull requests to make backport
+requests for the release branches.  This is done by making a comment containing
+the following command on any issue or pull request that has been added to one
+of the "X.Y.Z Release" milestones.
 
 ::
 
@@ -388,8 +388,8 @@ milestones.
 This command takes one or more git commit hashes as arguments and will attempt
 to cherry-pick the commit(s) to the release branch.  If the commit(s) fail to
 apply cleanly, then a comment with a link to the failing job will be added to
-the issue.  If the commit(s) do apply cleanly, then a pull request will
-be created with the specified commits.
+the issue/pull request.  If the commit(s) do apply cleanly, then a pull request
+will be created with the specified commits.
 
 If a commit you want to backport does not apply cleanly, you may resolve
 the conflicts locally and then create a pull request against the release
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 18a53a4815722..d32aeff5a69bb 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -348,6 +348,26 @@ G_ICMP
 Perform integer comparison producing non-zero (true) or zero (false). It's
 target specific whether a true value is 1, ~0U, or some other non-zero value.
 
+G_SCMP
+^^^^^^
+
+Perform signed 3-way integer comparison producing -1 (smaller), 0 (equal), or 1 (larger).
+
+.. code-block:: none
+
+  %5:_(s32) = G_SCMP %6, %2
+
+
+G_UCMP
+^^^^^^
+
+Perform unsigned 3-way integer comparison producing -1 (smaller), 0 (equal), or 1 (larger).
+
+.. code-block:: none
+
+  %7:_(s32) = G_UCMP %2, %6
+
+
 G_SELECT
 ^^^^^^^^
 
diff --git a/llvm/docs/GlobalISel/MIRPatterns.rst b/llvm/docs/GlobalISel/MIRPatterns.rst
index 7e6d88683d491..654a60730bb9c 100644
--- a/llvm/docs/GlobalISel/MIRPatterns.rst
+++ b/llvm/docs/GlobalISel/MIRPatterns.rst
@@ -115,7 +115,7 @@ GITypeOf
 ``GITypeOf<"$x">`` is a ``GISpecialType`` that allows for the creation of a
 register or immediate with the same type as another (register) operand.
 
-Operand:
+Type Parameters:
 
 * An operand name as a string, prefixed by ``$``.
 
@@ -143,6 +143,59 @@ Semantics:
     (apply (G_FSUB $dst, $src, $tmp),
            (G_FNEG GITypeOf<"$dst">:$tmp, $src))>;
 
+GIVariadic
+~~~~~~~~~~
+
+``GIVariadic<>`` is a ``GISpecialType`` that allows for matching 1 or
+more operands remaining on an instruction.
+
+Type Parameters:
+
+* The minimum number of additional operands to match. Must be greater than zero.
+
+  * Default is 1.
+
+* The maximum number of additional operands to match. Must be strictly greater
+  than the minimum.
+
+  * 0 can be used to indicate there is no upper limit.
+  * Default is 0.
+
+Semantics:
+
+* ``GIVariadic<>`` operands can only appear on variadic instructions.
+* ``GIVariadic<>`` operands cannot be defs.
+* ``GIVariadic<>`` operands can only appear as the last operand in a 'match' pattern.
+* Each instance within a 'match' pattern must be uniquely named.
+* Re-using a ``GIVariadic<>`` operand in an 'apply' pattern will result in all
+  the matched operands being copied from the original instruction.
+* The min/max operands will result in the matcher checking that the number of operands
+  falls within that range.
+* ``GIVariadic<>`` operands can be used in C++ code within a rule, which will
+  result in the operand name being expanded to a value of type ``ArrayRef<MachineOperand>``.
+
+.. code-block:: text
+
+  // bool checkBuildVectorToUnmerge(ArrayRef<MachineOperand>);
+
+  def build_vector_to_unmerge: GICombineRule <
+    (defs root:$root),
+    (match (G_BUILD_VECTOR $root, GIVariadic<>:$args),
+           [{ return checkBuildVectorToUnmerge(${args}); }]),
+    (apply (G_UNMERGE_VALUES $root, $args))
+  >;
+
+.. code-block:: text
+
+  // Will additionally check the number of operands is >= 3 and <= 5.
+  // ($root is one operand, then 2 to 4 variadic operands).
+  def build_vector_to_unmerge: GICombineRule <
+    (defs root:$root),
+    (match (G_BUILD_VECTOR $root, GIVariadic<2, 4>:$two_to_four),
+           [{ return checkBuildVectorToUnmerge(${two_to_four}); }]),
+    (apply (G_UNMERGE_VALUES $root, $two_to_four))
+  >;
+
 Builtin Operations
 ------------------
 
@@ -240,6 +293,8 @@ This a non-exhaustive list of known issues with MIR patterns at this time.
   match. e.g. if a pattern needs to work on both i32 and i64, you either
   need to leave it untyped and check the type in C++, or duplicate the
   pattern.
+* ``GISpecialType`` operands are not allowed within a ``GICombinePatFrag``.
+* ``GIVariadic<>`` matched operands must each have a unique name.
 
 GICombineRule
 -------------
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 612040bc5506c..41ebbadf4eb1d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3945,24 +3945,6 @@ or constants of this type.
       x86_amx
 
 
-X86_mmx Type
-""""""""""""
-
-:Overview:
-
-The x86_mmx type represents a value held in an MMX register on an x86
-machine. The operations allowed on it are quite limited: parameters and
-return values, load and store, and bitcast. User-specified MMX
-instructions are represented as intrinsic or asm calls with arguments
-and/or results of this type. There are no arrays, vectors or constants
-of this type.
-
-:Syntax:
-
-::
-
-      x86_mmx
-
 
 .. _t_pointer:
 
@@ -4396,7 +4378,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit
 format is represented by ``0xR`` followed by 4 hexadecimal digits. All
 hexadecimal formats are big-endian (sign bit at the left).
 
-There are no constants of type x86_mmx and x86_amx.
+There are no constants of type x86_amx.
 
 .. _complexconstants:
 
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index cf2cc6305f130..e027c902e58e1 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -18,6 +18,9 @@ At the IR level, it is represented using:
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
 * a [signed pointer constant](#constant) (to sign globals)
 * a [call operand bundle](#operand-bundle) (to authenticate called pointers)
+* a [set of function attributes](#function-attributes) (to describe what
+  pointers are signed and how, to control implicit codegen in the backend, as
+  well as preserve invariants in the mid-level optimizer)
 
 The current implementation leverages the
 [Armv8.3-A PAuth/Pointer Authentication Code](#armv8-3-a-pauth-pointer-authentication-code)
@@ -287,6 +290,27 @@ but with the added guarantee that `%fp_i`, `%fp_auth`, and `%fp_auth_p`
 are not stored to (and reloaded from) memory.
 
 
+### Function Attributes
+
+Some function attributes are used to describe other pointer authentication
+operations that are not otherwise explicitly expressed in IR.
+
+#### ``ptrauth-indirect-gotos``
+
+``ptrauth-indirect-gotos`` specifies that indirect gotos in this function
+should authenticate their target.  At the IR level, no other change is needed.
+When lowering [``blockaddress`` constants](https://llvm.org/docs/LangRef.html#blockaddress),
+and [``indirectbr`` instructions](https://llvm.org/docs/LangRef.html#i-indirectbr),
+this tells the backend to respectively sign and authenticate the pointers.
+
+The specific scheme isn't ABI-visible.  Currently, the AArch64 backend
+signs blockaddresses using the `ASIA` key, with an integer discriminator
+derived from the parent function's name, using the SipHash stable discriminator:
+```
+  ptrauth_string_discriminator("<function_name> blockaddress")
+```
+
+
 ## AArch64 Support
 
 AArch64 is currently the only architecture with full support of the pointer
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index e00165caae0c9..231de56ef4cfe 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -1392,6 +1392,7 @@ How to use reduce-chunk-list:
 First, Figure out the number of calls to the debug counter you want to minimize.
 To do so, run the compilation command causing you want to minimize with `-print-debug-counter` adding a `-mllvm` if needed.
 Than find the line with the counter of interest. it should look like:
+
 .. code-block:: none
 
   my-counter               : {5678,empty}
@@ -1400,6 +1401,7 @@ The number of calls to `my-counter` is 5678
 
 Than Find the minimum set of chunks that is interesting, with `reduce-chunk-list`.
 Build a reproducer script like:
+
 .. code-block:: bash
 
   #! /bin/bash
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 4474478b6d3f8..b3c7b0e3883d0 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -153,7 +153,6 @@ on support follow.
      ``Za64rs``        Supported (`See note <#riscv-profiles-extensions-note>`__)
      ``Zaamo``         Assembly Support
      ``Zabha``         Supported
-     ``Zacas``         Supported (`See note <#riscv-zacas-note>`__)
      ``Zalrsc``        Assembly Support
      ``Zama16b``       Supported (`See note <#riscv-profiles-extensions-note>`__)
      ``Zawrs``         Assembly Support
@@ -281,11 +280,6 @@ Supported
 ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare``
   These extensions are defined as part of the `RISC-V Profiles specification <https://github.com/riscv/riscv-profiles/releases/tag/v1.0>`__.  They do not introduce any new features themselves, but instead describe existing hardware features.
 
-  .. _riscv-zacas-note:
-
-``Zacas``
-  amocas.w will be used for i32 cmpxchg. amocas.d will be used i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibilty. These can only be used in the assembler.
-
 Experimental Extensions
 =======================
 
@@ -299,6 +293,9 @@ The primary goal of experimental support is to assist in the process of ratifica
 ``experimental-ssqosid``
   LLVM implements assembler support for the `v1.0-rc1 draft specification <https://github.com/riscv/riscv-ssqosid/releases/tag/v1.0-rc1>`_.
 
+``experimental-zacas``
+  LLVM implements the `1.0 release specification <https://github.com/riscvarchive/riscv-zacas/releases/tag/v1.0>`__. amocas.w will be used for i32 cmpxchg. amocas.d will be used i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibilty. These can only be used in the assembler. The extension will be left as experimental until `an ABI issue <https://github.com/riscv-non-isa/riscv-elf-psabi-doc/issues/444>`__ is resolved.
+
 ``experimental-zalasr``
   LLVM implements the `0.0.5 draft specification <https://github.com/mehnadnerd/riscv-zalasr>`__.
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 827a0fd3606ec..551a9bec3b916 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -47,39 +47,11 @@ Non-comprehensive list of changes in this release
 Update on required toolchains to build LLVM
 -------------------------------------------
 
-* The minimum Python version has been raised from 3.6 to 3.8 across all of LLVM.
-  This enables the use of many new Python features, aligning more closely with
-  modern Python best practices, and improves CI maintainability
-  See `#78828 <https://github.com/llvm/llvm-project/pull/78828>`_ for more info.
-
 Changes to the LLVM IR
 ----------------------
 
-* Added Memory Model Relaxation Annotations (MMRAs).
-* Added ``nusw`` and ``nuw`` flags to ``getelementptr`` instruction.
-* Renamed ``llvm.experimental.vector.reverse`` intrinsic to ``llvm.vector.reverse``.
-* Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``.
-* Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``.
-* Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``.
-* The constant expression variants of the following instructions have been
-  removed:
-
-  * ``icmp``
-  * ``fcmp``
-  * ``shl``
-* LLVM has switched from using debug intrinsics in textual IR to using debug
-  records by default. Details of the change and instructions on how to update
-  any downstream tools and tests can be found in the `migration docs
-  <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
-* Semantics of MC/DC intrinsics have been changed.
-
-  * ``llvm.instprof.mcdc.parameters``: 3rd argument has been changed
-    from bytes to bits.
-  * ``llvm.instprof.mcdc.condbitmap.update``: Removed.
-  * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been
-    removed. The next argument has been changed from byte index to bit
-    index.
-* Added ``llvm.experimental.vector.compress`` intrinsic.
+* The ``x86_mmx`` IR type has been removed. It will be translated to
+  the standard vector type ``<1 x i64>`` in bitcode upgrade.
 
 Changes to LLVM infrastructure
 ------------------------------
@@ -87,69 +59,28 @@ Changes to LLVM infrastructure
 Changes to building LLVM
 ------------------------
 
-* LLVM now has rpmalloc version 1.4.5 in-tree, as a replacement C allocator for
-  hosted toolchains. This supports several host platforms such as Mac or Unix,
-  however currently only the Windows 64-bit LLVM release uses it.
-  This has a great benefit in terms of build times on Windows when using ThinLTO
-  linking, especially on machines with lots of cores, to an order of magnitude
-  or more. Clang compilation is also improved. Please see some build timings in
-  (`#91862 <https://github.com/llvm/llvm-project/pull/91862#issue-2291033962>`_)
-  For more information, refer to the **LLVM_ENABLE_RPMALLOC** option in `CMake variables <https://llvm.org/docs/CMake.html#llvm-related-variables>`_.
-
-* The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on
-  terminfo and now always uses the ``TERM`` environment variable for color
-  support autodetection.
-
 Changes to TableGen
 -------------------
 
-- We can define type aliases via new keyword ``deftype``.
-
 Changes to Interprocedural Optimizations
 ----------------------------------------
 
 Changes to the AArch64 Backend
 ------------------------------
 
-* Added support for Cortex-R82AE, Cortex-A78AE, Cortex-A520AE, Cortex-A720AE,
-  Cortex-A725, Cortex-X925, Neoverse-N3, Neoverse-V3 and Neoverse-V3AE CPUs.
-
-* ``-mbranch-protection=standard`` now enables FEAT_PAuth_LR by
-  default when the feature is enabled. The new behaviour results 
-  in ``standard`` being equal to ``bti+pac-ret+pc`` when ``+pauth-lr``
-  is passed as part of ``-mcpu=`` options.
-
-* SVE and SVE2 have been moved to the default extensions list for ARMv9.0,
-  making them optional per the Arm ARM.  Existing v9.0+ CPUs in the backend that
-  support these extensions continue to have these features enabled by default
-  when specified via ``-march=`` or an ``-mcpu=`` that supports them.  The
-  attribute ``"target-features"="+v9a"`` no longer implies ``"+sve"`` and
-  ``"+sve2"`` respectively.
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
 
 Changes to the AMDGPU Backend
 -----------------------------
 
-* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.
-* Added ``!amdgpu.no.fine.grained.memory`` and
-  ``!amdgpu.no.remote.memory`` metadata to control atomic behavior.
-
-* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
-
-* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
-  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
-  :ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
-  `fmax` with addrspace(3) instead.
-
-* AMDGPUAttributor is no longer run as part of the codegen pass
-  pipeline. It is expected to run as part of the middle end
-  optimizations.
-
 Changes to the ARM Backend
 --------------------------
 
-* Added support for Cortex-R52+ CPU.
-* FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.
-* armv8-r now implies only fp-armv8d16sp, rather than neon and full fp-armv8. These features are still included by default for cortex-r52. The default cpu for armv8-r is now "generic", for compatibility with variants that do not include neon, fp64, and d32.
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
 
 Changes to the AVR Backend
 --------------------------
@@ -163,10 +94,6 @@ Changes to the Hexagon Backend
 Changes to the LoongArch Backend
 --------------------------------
 
-* i32 is now a native type in the datalayout string. This enables
-  LoopStrengthReduce for loops with i32 induction variables, among other
-  optimizations.
-
 Changes to the MIPS Backend
 ---------------------------
 
@@ -176,36 +103,9 @@ Changes to the PowerPC Backend
 Changes to the RISC-V Backend
 -----------------------------
 
-* Added full support for the experimental Zabha (Byte and
-  Halfword Atomic Memory Operations) extension.
-* Added assembler/disassembler support for the experimenatl Zalasr
-  (Load-Acquire and Store-Release) extension.
-* The names of the majority of the S-prefixed (supervisor-level) extension
-  names in the RISC-V profiles specification are now recognised.
-* Codegen support was added for the Zimop (May-Be-Operations) extension.
-* The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 1.0.0 Pointer Masking extensions are supported.
-* The experimental Ssqosid extension is supported.
-* Zacas is no longer experimental.
-* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
-* llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
-  match GNU objdump. The bytes within the groups are in big endian order.
-* Added smstateen extension to -march. CSR names for smstateen were already supported.
-* Zaamo and Zalrsc are no longer experimental.
-* Processors that enable post reg-alloc scheduling (PostMachineScheduler) by default should use the `UsePostRAScheduler` subtarget feature. Setting `PostRAScheduler = 1` in the scheduler model will have no effect on the enabling of the PostMachineScheduler.
-* Zabha is no longer experimental.
-* B (the collection of the Zba, Zbb, Zbs extensions) is supported.
-* Added smcdeleg, ssccfg, smcsrind, and sscsrind extensions to -march.
-* ``-mcpu=syntacore-scr3-rv32`` and ``-mcpu=syntacore-scr3-rv64`` were added.
-* The default atomics mapping was changed to emit an additional trailing fence
-  for sequentially consistent stores, offering compatibility with a future
-  mapping using load-acquire and store-release instructions while remaining
-  fully compatible with objects produced prior to this change. The mapping
-  (ABI) used is recorded as an ELF attribute.
-* Ztso is no longer experimental.
-* The WCH / Nanjing Qinheng Microelectronics QingKe "XW" compressed opcodes are
-  supported under the name "Xwchc".
-* ``-mcpu=native`` now detects available features with hwprobe (RISC-V Hardware Probing Interface) on Linux 6.4 or later.
-* The version of Zicfilp/Zicfiss is updated to 1.0.
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -216,11 +116,18 @@ Changes to the Windows Target
 Changes to the X86 Backend
 --------------------------
 
-- Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1,
-  while assembly encoding/decoding supports are kept.
-
-- Removed ``3DNow!``-specific ISA intrinsics and codegen support. The ``3dnow`` and ``3dnowa`` target features are no longer supported. The intrinsics ``llvm.x86.3dnow.*``, ``llvm.x86.3dnowa.*``, and ``llvm.x86.mmx.femms`` have been removed. Assembly encoding/decoding for the corresponding instructions remains supported.
+* `.balign N, 0x90`, `.p2align N, 0x90`, and `.align N, 0x90` in code sections
+  now fill the required alignment space with repeating `0x90` bytes, rather than
+  using optimised NOP filling. Optimised NOP filling fills the space with NOP
+  instructions of various widths, not just those that use the `0x90` byte
+  encoding. To use optimised NOP filling in a code section, leave off the
+  "fillval" argument, i.e. `.balign N`, `.p2align N` or `.align N` respectively.
 
+* Due to the removal of the ``x86_mmx`` IR type, functions with
+  ``x86_mmx`` arguments or return values will use a different,
+  incompatible, calling convention ABI. Such functions are not
+  generally seen in the wild (Clang never generates them!), so this is
+  not expected to result in real-world compatibility problems.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -231,103 +138,11 @@ Changes to the Python bindings
 Changes to the C API
 --------------------
 
-* Added ``LLVMGetBlockAddressFunction`` and ``LLVMGetBlockAddressBasicBlock``
-  functions for accessing the values in a blockaddress constant.
-
-* Added ``LLVMConstStringInContext2`` function, which better matches the C++
-  API by using ``size_t`` for string length. Deprecated ``LLVMConstStringInContext``.
-
-* Added the following functions for accessing a function's prefix data:
-
-  * ``LLVMHasPrefixData``
-  * ``LLVMGetPrefixData``
-  * ``LLVMSetPrefixData``
-
-* Added the following functions for accessing a function's prologue data:
-
-  * ``LLVMHasPrologueData``
-  * ``LLVMGetPrologueData``
-  * ``LLVMSetPrologueData``
-
-* Deprecated ``LLVMConstNUWNeg`` and ``LLVMBuildNUWNeg``.
-
-* Added ``LLVMAtomicRMWBinOpUIncWrap`` and ``LLVMAtomicRMWBinOpUDecWrap`` to
-  ``LLVMAtomicRMWBinOp`` enum for AtomicRMW instructions.
-
-* Added ``LLVMCreateConstantRangeAttribute`` function for creating ConstantRange Attributes.
-
-* Added the following functions for creating and accessing data for CallBr instructions:
-
-  * ``LLVMBuildCallBr``
-  * ``LLVMGetCallBrDefaultDest``
-  * ``LLVMGetCallBrNumIndirectDests``
-  * ``LLVMGetCallBrIndirectDest``
-
-* The following functions for creating constant expressions have been removed,
-  because the underlying constant expressions are no longer supported. Instead,
-  an instruction should be created using the ``LLVMBuildXYZ`` APIs, which will
-  constant fold the operands if possible and create an instruction otherwise:
+* The following symbols are deleted due to the removal of the ``x86_mmx`` IR type:
 
-  * ``LLVMConstICmp``
-  * ``LLVMConstFCmp``
-  * ``LLVMConstShl``
-
-**Note:** The following changes are due to the removal of the debug info
-intrinsics from LLVM and to the introduction of debug records into LLVM.
-They are described in detail in the `debug info migration guide <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
-
-* Added the following functions to insert before the indicated instruction but
-  after any attached debug records.
-
-  * ``LLVMPositionBuilderBeforeDbgRecords``
-  * ``LLVMPositionBuilderBeforeInstrAndDbgRecords``
-
-  Same as ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` except the
-  insertion position is set to before the debug records that precede the target
-  instruction. ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` are
-  unchanged.
-
-* Added the following functions to get/set the new non-instruction debug info format.
-  They will be deprecated in the future and they are just a transition aid.
-
-  * ``LLVMIsNewDbgInfoFormat``
-  * ``LLVMSetIsNewDbgInfoFormat``
-
-* Added the following functions to insert a debug record (new debug info format).
-
-  * ``LLVMDIBuilderInsertDeclareRecordBefore``
-  * ``LLVMDIBuilderInsertDeclareRecordAtEnd``
-  * ``LLVMDIBuilderInsertDbgValueRecordBefore``
-  * ``LLVMDIBuilderInsertDbgValueRecordAtEnd``
-
-* Deleted the following functions that inserted a debug intrinsic (old debug info format).
-
-  * ``LLVMDIBuilderInsertDeclareBefore``
-  * ``LLVMDIBuilderInsertDeclareAtEnd``
-  * ``LLVMDIBuilderInsertDbgValueBefore``
-  * ``LLVMDIBuilderInsertDbgValueAtEnd``
-
-* Added the following functions for accessing a Target Extension Type's data:
-
-  * ``LLVMGetTargetExtTypeName``
-  * ``LLVMGetTargetExtTypeNumTypeParams``/``LLVMGetTargetExtTypeTypeParam``
-  * ``LLVMGetTargetExtTypeNumIntParams``/``LLVMGetTargetExtTypeIntParam``
-
-* Added the following functions for accessing/setting the no-wrap flags for a
-  GetElementPtr instruction:
-
-  * ``LLVMBuildGEPWithNoWrapFlags``
-  * ``LLVMConstGEPWithNoWrapFlags``
-  * ``LLVMGEPGetNoWrapFlags``
-  * ``LLVMGEPSetNoWrapFlags``
-
-* Added the following functions for creating and accessing data for ConstantPtrAuth constants:
-
-  * ``LLVMConstantPtrAuth``
-  * ``LLVMGetConstantPtrAuthPointer``
-  * ``LLVMGetConstantPtrAuthKey``
-  * ``LLVMGetConstantPtrAuthDiscriminator``
-  * ``LLVMGetConstantPtrAuthAddrDiscriminator``
+  * ``LLVMX86_MMXTypeKind``
+  * ``LLVMX86MMXTypeInContext``
+  * ``LLVMX86MMXType``
 
 Changes to the CodeGen infrastructure
 -------------------------------------
@@ -338,106 +153,14 @@ Changes to the Metadata Info
 Changes to the Debug Info
 ---------------------------------
 
-* LLVM has switched from using debug intrinsics internally to using debug
-  records by default. This should happen transparently when using the DIBuilder
-  to construct debug variable information, but will require changes for any code
-  that interacts with debug intrinsics directly. Debug intrinsics will only be
-  supported on a best-effort basis from here onwards; for more information, see
-  the `migration docs <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
-
-* When emitting DWARF v2 and not in strict DWARF mode, LLVM will now add
-  a ``DW_AT_type`` to instances of ``DW_TAG_enumeration_type``. This is actually
-  a DWARF v3 feature which tells tools what the enum's underlying type is.
-  Emitting this for v2 as well will help users who have to build binaries with
-  DWARF v2 but are using tools that understand newer DWARF standards. Older
-  tools will ignore it. (`#98335 <https://github.com/llvm/llvm-project/pull/98335>`_)
-
 Changes to the LLVM tools
 ---------------------------------
-* llvm-nm and llvm-objdump can now print symbol information from linked
-  WebAssembly binaries, using information from exports or the "name"
-  section for functions, globals and data segments. Symbol addresses and sizes
-  are printed as offsets in the file, allowing for binary size analysis. Wasm
-  files using reference types and GC are also supported (but also only for
-  functions, globals, and data, and only for listing symbols and names).
-
-* llvm-ar now utilizes LLVM_DEFAULT_TARGET_TRIPLE to determine the archive format
-  if it's not specified with the ``--format`` argument and cannot be inferred from
-  input files.
-
-* llvm-ar now allows specifying COFF archive format with ``--format`` argument
-  and uses it by default for COFF targets.
-
-* llvm-ranlib now supports ``-V`` as an alias for ``--version``.
-  ``-v`` (``--verbose`` in llvm-ar) has been removed.
-  (`#87661 <https://github.com/llvm/llvm-project/pull/87661>`_)
-
-* llvm-objcopy now supports ``--set-symbol-visibility`` and
-  ``--set-symbols-visibility`` options for ELF input to change the
-  visibility of symbols.
-
-* llvm-objcopy now supports ``--skip-symbol`` and ``--skip-symbols`` options
-  for ELF input to skip the specified symbols when executing other options
-  that can change a symbol's name, binding or visibility.
-
-* llvm-objcopy now supports ``--compress-sections`` to compress or decompress
-  arbitrary sections not within a segment.
-  (`#85036 <https://github.com/llvm/llvm-project/pull/85036>`_.)
-
-* llvm-profgen now supports COFF+DWARF binaries. This enables Sample-based PGO
-  on Windows using Intel VTune's SEP. For details on usage, see the `end-user
-  documentation for SPGO
-  <https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers>`_.
-
-* llvm-readelf's ``-r`` output for RELR has been improved.
-  (`#89162 <https://github.com/llvm/llvm-project/pull/89162>`_)
-  ``--raw-relr`` has been removed.
-
-* llvm-mca now aborts by default if it is given bad input where previously it
-  would continue. Additionally, it can now continue when it encounters
-  instructions which lack scheduling information. The behaviour can be
-  controlled by the newly introduced
-  ``--skip-unsupported-instructions=<none|lack-sched|parse-failure|any>``, as
-  documented in ``--help`` output and the command guide. (`#90474
-  <https://github.com/llvm/llvm-project/pull/90474>`_)
-
-* llvm-readobj's LLVM output format for ELF core files has been changed.
-  Similarly, the JSON format has been fixed for this case. The NT_FILE note
-  now has a map for the mapped files. (`#92835
-  <https://github.com/llvm/llvm-project/pull/92835>`_).
-
-* llvm-cov now generates HTML report with JavaScript code to allow simple
-  jumping between uncovered parts (lines/regions/branches) of code 
-  using buttons on top-right corner of the page or using keys (L/R/B or 
-  jumping in reverse direction with shift+L/R/B). (`#95662
-  <https://github.com/llvm/llvm-project/pull/95662>`_).
-
-* llvm-objcopy now verifies format of ``.note`` sections for ELF input. This can
-  be disabled by ``--no-verify-note-sections``. (`#90458
-  <https://github.com/llvm/llvm-project/pull/90458>`).
 
 Changes to LLDB
 ---------------------------------
 
-* Register field information is now provided on AArch64 FreeBSD for live
-  processes and core files (previously only provided on AArch64 Linux).
-
-* Register field information can now include enums to represent field
-  values. Enums have been added for ``fpcr.RMode`` and ``mte_ctrl.TCF``
-  for AArch64 targets::
-
-    (lldb) register read fpcr
-        fpcr = 0x00000000
-             = (AHP = 0, DN = 0, FZ = 0, RMode = RN, <...>)
-
-  If you need to know the values of the enum, these can be found in
-  the output of ``register info`` for the same register.
-
 Changes to BOLT
 ---------------------------------
-* Now supports ``--match-profile-with-function-hash`` to match profiled and
-  binary functions with exact hash, allowing for the matching of renamed but
-  identical functions.
 
 Changes to Sanitizers
 ---------------------
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index 0acc929856eb6..c1a95efd2d8bc 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -389,12 +389,12 @@ Compiled to LLVM, this function would be represented like this:
     %X = alloca i32, align 4
     %Y = alloca i32, align 4
     %Z = alloca i32, align 4
-      #dbg_declare(ptr %X, !11, !DIExpression(), !14)
-    store i32 21, i32* %X, align 4, !dbg !14
-      #dbg_declare(ptr %Y, !15, !DIExpression(), !16)
-    store i32 22, i32* %Y, align 4, !dbg !16
-      #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
-    store i32 23, i32* %Z, align 4, !dbg !19
+      #dbg_declare(ptr %X, !11, !DIExpression(), !13)
+    store i32 21, i32* %X, align 4, !dbg !13
+      #dbg_declare(ptr %Y, !14, !DIExpression(), !15)
+    store i32 22, i32* %Y, align 4, !dbg !15
+      #dbg_declare(ptr %Z, !16, !DIExpression(), !18)
+    store i32 23, i32* %Z, align 4, !dbg !18
     %0 = load i32, i32* %X, align 4, !dbg !20
     store i32 %0, i32* %Z, align 4, !dbg !21
     %1 = load i32, i32* %Y, align 4, !dbg !22
@@ -427,9 +427,9 @@ Compiled to LLVM, this function would be represented like this:
   !15 = !DILocation(line: 3, column: 9, scope: !4)
   !16 = !DILocalVariable(name: "Z", scope: !18, file: !1, line: 5, type: !12)
   !17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
-  !18 = !DILocation(line: 5, column: 11, scope: !18)
-  !29 = !DILocation(line: 6, column: 11, scope: !18)
-  !20 = !DILocation(line: 6, column: 9, scope: !18)
+  !18 = !DILocation(line: 5, column: 11, scope: !17)
+  !29 = !DILocation(line: 6, column: 11, scope: !17)
+  !20 = !DILocation(line: 6, column: 9, scope: !17)
   !21 = !DILocation(line: 8, column: 9, scope: !4)
   !22 = !DILocation(line: 8, column: 7, scope: !4)
   !23 = !DILocation(line: 9, column: 3, scope: !4)
@@ -443,21 +443,21 @@ variable definitions, and the code used to implement the function.
 
 .. code-block:: llvm
 
-    #dbg_declare(ptr %X, !11, !DIExpression(), !14)
-    ; [debug line = 2:7] [debug variable = X]
+    #dbg_declare(ptr %X, !11, !DIExpression(), !13)
+    ; [debug line = 2:9] [debug variable = X]
 
 The first record ``#dbg_declare`` encodes debugging information for the
-variable ``X``.  The location ``!14`` at the end of the record provides
+variable ``X``.  The location ``!13`` at the end of the record provides
 scope information for the variable ``X``.
 
 .. code-block:: text
 
-  !14 = !DILocation(line: 2, column: 9, scope: !4)
+  !13 = !DILocation(line: 2, column: 9, scope: !4)
   !4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5,
                               isLocal: false, isDefinition: true, scopeLine: 1,
                               isOptimized: false, retainedNodes: !2)
 
-Here ``!14`` is metadata providing `location information
+Here ``!13`` is metadata providing `location information
 <LangRef.html#dilocation>`_.  In this example, scope is encoded by ``!4``, a
 `subprogram descriptor <LangRef.html#disubprogram>`_.  This way the location
 information parameter to the records indicates that the variable ``X`` is
@@ -467,20 +467,20 @@ Now lets take another example.
 
 .. code-block:: llvm
 
-    #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
-    ; [debug line = 5:9] [debug variable = Z]
+    #dbg_declare(ptr %Z, !16, !DIExpression(), !18)
+    ; [debug line = 5:11] [debug variable = Z]
 
 The third record ``#dbg_declare`` encodes debugging information for
-variable ``Z``.  The metadata ``!19`` at the end of the record provides
+variable ``Z``.  The metadata ``!18`` at the end of the record provides
 scope information for the variable ``Z``.
 
 .. code-block:: text
 
-  !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
-  !19 = !DILocation(line: 5, column: 11, scope: !18)
+  !17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
+  !18 = !DILocation(line: 5, column: 11, scope: !17)
 
-Here ``!19`` indicates that ``Z`` is declared at line number 5 and column
-number 11 inside of lexical scope ``!18``.  The lexical scope itself resides
+Here ``!18`` indicates that ``Z`` is declared at line number 5 and column
+number 11 inside of lexical scope ``!17``.  The lexical scope itself resides
 inside of subprogram ``!4`` described above.
 
 The scope information attached with each instruction provides a straightforward
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 223d8efe57daa..4bde82d816e35 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -146,27 +146,27 @@ typedef enum {
 } LLVMOpcode;
 
 typedef enum {
-  LLVMVoidTypeKind,      /**< type with no size */
-  LLVMHalfTypeKind,      /**< 16 bit floating point type */
-  LLVMFloatTypeKind,     /**< 32 bit floating point type */
-  LLVMDoubleTypeKind,    /**< 64 bit floating point type */
-  LLVMX86_FP80TypeKind,  /**< 80 bit floating point type (X87) */
-  LLVMFP128TypeKind,     /**< 128 bit floating point type (112-bit mantissa)*/
-  LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */
-  LLVMLabelTypeKind,     /**< Labels */
-  LLVMIntegerTypeKind,   /**< Arbitrary bit width integers */
-  LLVMFunctionTypeKind,  /**< Functions */
-  LLVMStructTypeKind,    /**< Structures */
-  LLVMArrayTypeKind,     /**< Arrays */
-  LLVMPointerTypeKind,   /**< Pointers */
-  LLVMVectorTypeKind,    /**< Fixed width SIMD vector type */
-  LLVMMetadataTypeKind,  /**< Metadata */
-  LLVMX86_MMXTypeKind,   /**< X86 MMX */
-  LLVMTokenTypeKind,     /**< Tokens */
-  LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */
-  LLVMBFloatTypeKind,    /**< 16 bit brain floating point type */
-  LLVMX86_AMXTypeKind,   /**< X86 AMX */
-  LLVMTargetExtTypeKind, /**< Target extension type */
+  LLVMVoidTypeKind = 0,     /**< type with no size */
+  LLVMHalfTypeKind = 1,     /**< 16 bit floating point type */
+  LLVMFloatTypeKind = 2,    /**< 32 bit floating point type */
+  LLVMDoubleTypeKind = 3,   /**< 64 bit floating point type */
+  LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */
+  LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/
+  LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */
+  LLVMLabelTypeKind = 7,     /**< Labels */
+  LLVMIntegerTypeKind = 8,   /**< Arbitrary bit width integers */
+  LLVMFunctionTypeKind = 9,  /**< Functions */
+  LLVMStructTypeKind = 10,   /**< Structures */
+  LLVMArrayTypeKind = 11,    /**< Arrays */
+  LLVMPointerTypeKind = 12,  /**< Pointers */
+  LLVMVectorTypeKind = 13,   /**< Fixed width SIMD vector type */
+  LLVMMetadataTypeKind = 14, /**< Metadata */
+                             /* 15 previously used by LLVMX86_MMXTypeKind */
+  LLVMTokenTypeKind = 16,    /**< Tokens */
+  LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */
+  LLVMBFloatTypeKind = 18,         /**< 16 bit brain floating point type */
+  LLVMX86_AMXTypeKind = 19,        /**< X86 AMX */
+  LLVMTargetExtTypeKind = 20,      /**< Target extension type */
 } LLVMTypeKind;
 
 typedef enum {
@@ -1716,11 +1716,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C);
  */
 LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C);
 
-/**
- * Create a X86 MMX type in a context.
- */
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C);
-
 /**
  * Create a X86 AMX type in a context.
  */
@@ -1742,7 +1737,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C);
  */
 LLVMTypeRef LLVMVoidType(void);
 LLVMTypeRef LLVMLabelType(void);
-LLVMTypeRef LLVMX86MMXType(void);
 LLVMTypeRef LLVMX86AMXType(void);
 
 /**
diff --git a/llvm/include/llvm-c/Error.h b/llvm/include/llvm-c/Error.h
index c3baaf65186aa..874bbcfe8f21a 100644
--- a/llvm/include/llvm-c/Error.h
+++ b/llvm/include/llvm-c/Error.h
@@ -51,6 +51,14 @@ LLVMErrorTypeId LLVMGetErrorTypeId(LLVMErrorRef Err);
  */
 void LLVMConsumeError(LLVMErrorRef Err);
 
+/**
+ * Report a fatal error if Err is a failure value.
+ *
+ * This function can be used to wrap calls to fallible functions ONLY when it is
+ * known that the Error will always be a success value.
+ */
+void LLVMCantFail(LLVMErrorRef Err);
+
 /**
  * Returns the given string's error message. This operation consumes the error,
  * and the given LLVMErrorRef value is not usable once this call returns.
diff --git a/llvm/include/llvm-c/Target.h b/llvm/include/llvm-c/Target.h
index 518b46d55bc3c..aef06a5f2486d 100644
--- a/llvm/include/llvm-c/Target.h
+++ b/llvm/include/llvm-c/Target.h
@@ -244,7 +244,7 @@ LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD);
 LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD,
                                          unsigned AS);
 
-/** Computes the size of a type in bytes for a target.
+/** Computes the size of a type in bits for a target.
     See the method llvm::DataLayout::getTypeSizeInBits. */
 unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index bff8e6490d1de..7039e961bff82 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -188,6 +188,9 @@ struct APFloatBase {
     // This format's exponent bias is 11, instead of the 7 (2 ** (4 - 1) - 1)
     // that IEEE precedent would imply.
     S_Float8E4M3B11FNUZ,
+    // 8-bit floating point number following IEEE-754 conventions with bit
+    // layout S1E3M4.
+    S_Float8E3M4,
     // Floating point number that occupies 32 bits or less of storage, providing
     // improved range compared to half (16-bit) formats, at (potentially)
     // greater throughput than single precision (32-bit) formats.
@@ -224,6 +227,7 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
+  static const fltSemantics &Float8E3M4() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
   static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
   static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
@@ -646,6 +650,7 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNAPFloatToAPInt() const;
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
+  APInt convertFloat8E3M4APFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
   APInt convertFloat6E3M2FNAPFloatToAPInt() const;
   APInt convertFloat6E2M3FNAPFloatToAPInt() const;
@@ -665,6 +670,7 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNAPInt(const APInt &api);
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
+  void initFromFloat8E3M4APInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
   void initFromFloat6E3M2FNAPInt(const APInt &api);
   void initFromFloat6E2M3FNAPInt(const APInt &api);
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 7ccc9445c0a7b..f71cd5b4771b7 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -118,12 +118,13 @@ class DenseMapBase : public DebugEpochBase {
       return;
     }
 
-    const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+    const KeyT EmptyKey = getEmptyKey();
     if (std::is_trivially_destructible<ValueT>::value) {
       // Use a simpler loop when values don't need destruction.
       for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
         P->getFirst() = EmptyKey;
     } else {
+      const KeyT TombstoneKey = getTombstoneKey();
       unsigned NumEntries = getNumEntries();
       for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
         if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
@@ -143,8 +144,7 @@ class DenseMapBase : public DebugEpochBase {
 
   /// Return true if the specified key is in the map, false otherwise.
   bool contains(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    return LookupBucketFor(Val, TheBucket);
+    return doFind(Val) != nullptr;
   }
 
   /// Return 1 if the specified key is in the map, 0 otherwise.
@@ -153,21 +153,17 @@ class DenseMapBase : public DebugEpochBase {
   }
 
   iterator find(const_arg_type_t<KeyT> Val) {
-    BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeIterator(TheBucket,
-                          shouldReverseIterate<KeyT>() ? getBuckets()
-                                                       : getBucketsEnd(),
-                          *this, true);
+    if (BucketT *Bucket = doFind(Val))
+      return makeIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
   const_iterator find(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeConstIterator(TheBucket,
-                               shouldReverseIterate<KeyT>() ? getBuckets()
-                                                            : getBucketsEnd(),
-                               *this, true);
+    if (const BucketT *Bucket = doFind(Val))
+      return makeConstIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
 
@@ -178,31 +174,26 @@ class DenseMapBase : public DebugEpochBase {
   /// type used.
   template<class LookupKeyT>
   iterator find_as(const LookupKeyT &Val) {
-    BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeIterator(TheBucket,
-                          shouldReverseIterate<KeyT>() ? getBuckets()
-                                                       : getBucketsEnd(),
-                          *this, true);
+    if (BucketT *Bucket = doFind(Val))
+      return makeIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
   template<class LookupKeyT>
   const_iterator find_as(const LookupKeyT &Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeConstIterator(TheBucket,
-                               shouldReverseIterate<KeyT>() ? getBuckets()
-                                                            : getBucketsEnd(),
-                               *this, true);
+    if (const BucketT *Bucket = doFind(Val))
+      return makeConstIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
 
   /// lookup - Return the entry for the specified key, or a default
   /// constructed value if no such entry exists.
   ValueT lookup(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return TheBucket->getSecond();
+    if (const BucketT *Bucket = doFind(Val))
+      return Bucket->getSecond();
     return ValueT();
   }
 
@@ -343,8 +334,8 @@ class DenseMapBase : public DebugEpochBase {
   }
 
   bool erase(const KeyT &Val) {
-    BucketT *TheBucket;
-    if (!LookupBucketFor(Val, TheBucket))
+    BucketT *TheBucket = doFind(Val);
+    if (!TheBucket)
       return false; // not in map.
 
     TheBucket->getSecond().~ValueT();
@@ -643,6 +634,34 @@ class DenseMapBase : public DebugEpochBase {
     return TheBucket;
   }
 
+  template <typename LookupKeyT> BucketT *doFind(const LookupKeyT &Val) {
+    BucketT *BucketsPtr = getBuckets();
+    const unsigned NumBuckets = getNumBuckets();
+    if (NumBuckets == 0)
+      return nullptr;
+
+    const KeyT EmptyKey = getEmptyKey();
+    unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
+    unsigned ProbeAmt = 1;
+    while (true) {
+      BucketT *Bucket = BucketsPtr + BucketNo;
+      if (LLVM_LIKELY(KeyInfoT::isEqual(Val, Bucket->getFirst())))
+        return Bucket;
+      if (LLVM_LIKELY(KeyInfoT::isEqual(Bucket->getFirst(), EmptyKey)))
+        return nullptr;
+
+      // Otherwise, it's a hash collision or a tombstone, continue quadratic
+      // probing.
+      BucketNo += ProbeAmt++;
+      BucketNo &= NumBuckets - 1;
+    }
+  }
+
+  template <typename LookupKeyT>
+  const BucketT *doFind(const LookupKeyT &Val) const {
+    return const_cast<DenseMapBase *>(this)->doFind(Val); // NOLINT
+  }
+
   /// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in
   /// FoundBucket.  If the bucket contains the key and a value, this returns
   /// true, otherwise it returns a bucket with an empty marker or tombstone and
diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h b/llvm/include/llvm/ADT/GenericCycleImpl.h
index ab9c421a44693..56d5ba6e06077 100644
--- a/llvm/include/llvm/ADT/GenericCycleImpl.h
+++ b/llvm/include/llvm/ADT/GenericCycleImpl.h
@@ -134,6 +134,8 @@ template <typename ContextT> class GenericCycleInfoCompute {
     DFSInfo() = default;
     explicit DFSInfo(unsigned Start) : Start(Start) {}
 
+    explicit operator bool() const { return Start; }
+
     /// Whether this node is an ancestor (or equal to) the node \p Other
     /// in the DFS tree.
     bool isAncestorOf(const DFSInfo &Other) const {
@@ -231,6 +233,8 @@ void GenericCycleInfoCompute<ContextT>::run(BlockT *EntryBlock) {
 
     for (BlockT *Pred : predecessors(HeaderCandidate)) {
       const DFSInfo PredDFSInfo = BlockDFSInfo.lookup(Pred);
+      // This automatically ignores unreachable predecessors since they have
+      // zeros in their DFSInfo.
       if (CandidateInfo.isAncestorOf(PredDFSInfo))
         Worklist.push_back(Pred);
     }
@@ -257,6 +261,10 @@ void GenericCycleInfoCompute<ContextT>::run(BlockT *EntryBlock) {
         const DFSInfo PredDFSInfo = BlockDFSInfo.lookup(Pred);
         if (CandidateInfo.isAncestorOf(PredDFSInfo)) {
           Worklist.push_back(Pred);
+        } else if (!PredDFSInfo) {
+          // Ignore an unreachable predecessor. It will will incorrectly cause
+          // Block to be treated as a cycle entry.
+          LLVM_DEBUG(errs() << " skipped unreachable predecessor.\n");
         } else {
           IsEntry = true;
         }
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index e34068592de81..8f988d01cb2a6 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -2027,6 +2027,12 @@ template <typename L, typename R> bool equal(L &&LRange, R &&RRange) {
                     adl_end(RRange));
 }
 
+template <typename L, typename R, typename BinaryPredicate>
+bool equal(L &&LRange, R &&RRange, BinaryPredicate P) {
+  return std::equal(adl_begin(LRange), adl_end(LRange), adl_begin(RRange),
+                    adl_end(RRange), P);
+}
+
 /// Returns true if all elements in Range are equal or when the Range is empty.
 template <typename R> bool all_equal(R &&Range) {
   auto Begin = adl_begin(Range);
diff --git a/llvm/include/llvm/ADT/SetOperations.h b/llvm/include/llvm/ADT/SetOperations.h
index 2b1a103565f7d..86a27b683ebc1 100644
--- a/llvm/include/llvm/ADT/SetOperations.h
+++ b/llvm/include/llvm/ADT/SetOperations.h
@@ -43,9 +43,8 @@ static constexpr bool HasMemberEraseIter =
 template <class S1Ty, class S2Ty> bool set_union(S1Ty &S1, const S2Ty &S2) {
   bool Changed = false;
 
-  for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE;
-       ++SI)
-    if (S1.insert(*SI).second)
+  for (const auto &E : S2)
+    if (S1.insert(E).second)
       Changed = true;
 
   return Changed;
@@ -61,11 +60,11 @@ template <class S1Ty, class S2Ty> void set_intersect(S1Ty &S1, const S2Ty &S2) {
   if constexpr (detail::HasMemberRemoveIf<S1Ty, decltype(Pred)>) {
     S1.remove_if(Pred);
   } else {
-    for (typename S1Ty::iterator I = S1.begin(); I != S1.end();) {
-      const auto &E = *I;
-      ++I;
-      if (!S2.count(E))
-        S1.erase(E); // Erase element if not in S2
+    typename S1Ty::iterator Next;
+    for (typename S1Ty::iterator I = S1.begin(); I != S1.end(); I = Next) {
+      Next = std::next(I);
+      if (!S2.count(*I))
+        S1.erase(I); // Erase element if not in S2
     }
   }
 }
@@ -73,10 +72,9 @@ template <class S1Ty, class S2Ty> void set_intersect(S1Ty &S1, const S2Ty &S2) {
 template <class S1Ty, class S2Ty>
 S1Ty set_intersection_impl(const S1Ty &S1, const S2Ty &S2) {
   S1Ty Result;
-  for (typename S1Ty::const_iterator SI = S1.begin(), SE = S1.end(); SI != SE;
-       ++SI)
-    if (S2.count(*SI))
-      Result.insert(*SI);
+  for (const auto &E : S1)
+    if (S2.count(E))
+      Result.insert(E);
   return Result;
 }
 
@@ -94,10 +92,9 @@ S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2) {
 template <class S1Ty, class S2Ty>
 S1Ty set_difference(const S1Ty &S1, const S2Ty &S2) {
   S1Ty Result;
-  for (typename S1Ty::const_iterator SI = S1.begin(), SE = S1.end(); SI != SE;
-       ++SI)
-    if (!S2.count(*SI)) // if the element is not in set2
-      Result.insert(*SI);
+  for (const auto &E : S1)
+    if (!S2.count(E)) // if the element is not in set2
+      Result.insert(E);
   return Result;
 }
 
@@ -132,9 +129,8 @@ template <class S1Ty, class S2Ty> void set_subtract(S1Ty &S1, const S2Ty &S2) {
     }
   }
 
-  for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE;
-       ++SI)
-    S1.erase(*SI);
+  for (const auto &E : S2)
+    S1.erase(E);
 }
 
 /// set_subtract(A, B, C, D) - Compute A := A - B, set C to the elements of B
@@ -142,12 +138,11 @@ template <class S1Ty, class S2Ty> void set_subtract(S1Ty &S1, const S2Ty &S2) {
 /// from A (B - A).
 template <class S1Ty, class S2Ty>
 void set_subtract(S1Ty &S1, const S2Ty &S2, S1Ty &Removed, S1Ty &Remaining) {
-  for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE;
-       ++SI)
-    if (S1.erase(*SI))
-      Removed.insert(*SI);
+  for (const auto &E : S2)
+    if (S1.erase(E))
+      Removed.insert(E);
     else
-      Remaining.insert(*SI);
+      Remaining.insert(E);
 }
 
 /// set_is_subset(A, B) - Return true iff A in B
diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h
index 884b5752d9bb0..f675f828f702e 100644
--- a/llvm/include/llvm/ADT/StableHashing.h
+++ b/llvm/include/llvm/ADT/StableHashing.h
@@ -95,18 +95,6 @@ inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) {
     hashing::detail::stable_hash_append(Hash, P[I]);
   return Hash;
 }
-
-inline stable_hash stable_hash_combine_string(const StringRef &S) {
-  return stable_hash_combine_range(S.begin(), S.end());
-}
-
-inline stable_hash stable_hash_combine_string(const char *C) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  while (*C)
-    hashing::detail::stable_hash_append(Hash, *(C++));
-  return Hash;
-}
-
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 812b5a9f72a3a..4140387a1f341 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -244,12 +244,23 @@ class AAQueryInfo {
 public:
   using LocPair = std::pair<AACacheLoc, AACacheLoc>;
   struct CacheEntry {
+    /// Cache entry is neither an assumption nor does it use a (non-definitive)
+    /// assumption.
+    static constexpr int Definitive = -2;
+    /// Cache entry is not an assumption itself, but may be using an assumption
+    /// from higher up the stack.
+    static constexpr int AssumptionBased = -1;
+
     AliasResult Result;
-    /// Number of times a NoAlias assumption has been used.
-    /// 0 for assumptions that have not been used, -1 for definitive results.
+    /// Number of times a NoAlias assumption has been used, 0 for assumptions
+    /// that have not been used. Can also take one of the Definitive or
+    /// AssumptionBased values documented above.
     int NumAssumptionUses;
+
     /// Whether this is a definitive (non-assumption) result.
-    bool isDefinitive() const { return NumAssumptionUses < 0; }
+    bool isDefinitive() const { return NumAssumptionUses == Definitive; }
+    /// Whether this is an assumption that has not been proven yet.
+    bool isAssumption() const { return NumAssumptionUses >= 0; }
   };
 
   // Alias analysis result aggregration using which this query is performed.
diff --git a/llvm/include/llvm/Transforms/Utils/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
similarity index 56%
rename from llvm/include/llvm/Transforms/Utils/DXILResource.h
rename to llvm/include/llvm/Analysis/DXILResource.h
index df01fb457977a..ed9fade3f14f3 100644
--- a/llvm/include/llvm/Transforms/Utils/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -1,4 +1,4 @@
-//===- DXILResource.h - Tools to translate DXIL resources -------*- C++ -*-===//
+//===- DXILResource.h - Representations of DXIL resources -------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,29 +6,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
-#define LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
+#ifndef LLVM_ANALYSIS_DXILRESOURCE_H
+#define LLVM_ANALYSIS_DXILRESOURCE_H
 
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/DXILABI.h"
 
 namespace llvm {
-namespace dxil {
-
-struct ResourceBinding {
-  uint32_t Space;
-  uint32_t LowerBound;
-  uint32_t Size;
+class MDTuple;
 
-  bool operator==(const ResourceBinding &RHS) const {
-    return std::tie(Space, LowerBound, Size) ==
-           std::tie(RHS.Space, RHS.LowerBound, RHS.Size);
-  }
-  bool operator!=(const ResourceBinding &RHS) const { return !(*this == RHS); }
-};
+namespace dxil {
 
 class ResourceInfo {
+  struct ResourceBinding {
+    uint32_t UniqueID;
+    uint32_t Space;
+    uint32_t LowerBound;
+    uint32_t Size;
+
+    bool operator==(const ResourceBinding &RHS) const {
+      return std::tie(UniqueID, Space, LowerBound, Size) ==
+             std::tie(RHS.UniqueID, RHS.Space, RHS.LowerBound, RHS.Size);
+    }
+    bool operator!=(const ResourceBinding &RHS) const {
+      return !(*this == RHS);
+    }
+  };
+
   struct UAVInfo {
     bool GloballyCoherent;
     bool HasCounter;
@@ -43,10 +47,14 @@ class ResourceInfo {
 
   struct StructInfo {
     uint32_t Stride;
-    Align Alignment;
+    // Note: we store an integer here rather than using `MaybeAlign` because in
+    // GCC 7 MaybeAlign isn't trivial so having one in this union would delete
+    // our move constructor.
+    // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0602r4.html
+    uint32_t AlignLog2;
 
     bool operator==(const StructInfo &RHS) const {
-      return std::tie(Stride, Alignment) == std::tie(RHS.Stride, RHS.Alignment);
+      return std::tie(Stride, AlignLog2) == std::tie(RHS.Stride, RHS.AlignLog2);
     }
     bool operator!=(const StructInfo &RHS) const { return !(*this == RHS); }
   };
@@ -80,12 +88,11 @@ class ResourceInfo {
   Value *Symbol;
   StringRef Name;
 
-  ResourceBinding Binding;
-  uint32_t UniqueID;
-
   dxil::ResourceClass RC;
   dxil::ResourceKind Kind;
 
+  ResourceBinding Binding = {};
+
   // Resource class dependent properties.
   // CBuffer, Sampler, and RawBuffer end here.
   union {
@@ -103,6 +110,11 @@ class ResourceInfo {
 
   MSInfo MultiSample;
 
+public:
+  ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol,
+               StringRef Name)
+      : Symbol(Symbol), Name(Name), RC(RC), Kind(Kind) {}
+
   // Conditions to check before accessing union members.
   bool isUAV() const;
   bool isCBuffer() const;
@@ -112,73 +124,89 @@ class ResourceInfo {
   bool isFeedback() const;
   bool isMultiSample() const;
 
-  ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol,
-               StringRef Name, ResourceBinding Binding, uint32_t UniqueID)
-      : Symbol(Symbol), Name(Name), Binding(Binding), UniqueID(UniqueID),
-        RC(RC), Kind(Kind) {}
+  void bind(uint32_t UniqueID, uint32_t Space, uint32_t LowerBound,
+            uint32_t Size) {
+    Binding.UniqueID = UniqueID;
+    Binding.Space = Space;
+    Binding.LowerBound = LowerBound;
+    Binding.Size = Size;
+  }
+  void setUAV(bool GloballyCoherent, bool HasCounter, bool IsROV) {
+    assert(isUAV() && "Not a UAV");
+    UAVFlags.GloballyCoherent = GloballyCoherent;
+    UAVFlags.HasCounter = HasCounter;
+    UAVFlags.IsROV = IsROV;
+  }
+  void setCBuffer(uint32_t Size) {
+    assert(isCBuffer() && "Not a CBuffer");
+    CBufferSize = Size;
+  }
+  void setSampler(dxil::SamplerType Ty) { SamplerTy = Ty; }
+  void setStruct(uint32_t Stride, MaybeAlign Alignment) {
+    assert(isStruct() && "Not a Struct");
+    Struct.Stride = Stride;
+    Struct.AlignLog2 = Alignment ? Log2(*Alignment) : 0;
+  }
+  void setTyped(dxil::ElementType ElementTy, uint32_t ElementCount) {
+    assert(isTyped() && "Not Typed");
+    Typed.ElementTy = ElementTy;
+    Typed.ElementCount = ElementCount;
+  }
+  void setFeedback(dxil::SamplerFeedbackType Type) {
+    assert(isFeedback() && "Not Feedback");
+    Feedback.Type = Type;
+  }
+  void setMultiSample(uint32_t Count) {
+    assert(isMultiSample() && "Not MultiSampled");
+    MultiSample.Count = Count;
+  }
+
+  bool operator==(const ResourceInfo &RHS) const;
 
-public:
   static ResourceInfo SRV(Value *Symbol, StringRef Name,
-                          ResourceBinding Binding, uint32_t UniqueID,
                           dxil::ElementType ElementTy, uint32_t ElementCount,
                           dxil::ResourceKind Kind);
-  static ResourceInfo RawBuffer(Value *Symbol, StringRef Name,
-                                ResourceBinding Binding, uint32_t UniqueID);
+  static ResourceInfo RawBuffer(Value *Symbol, StringRef Name);
   static ResourceInfo StructuredBuffer(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, uint32_t Stride,
-                                       Align Alignment);
+                                       uint32_t Stride, MaybeAlign Alignment);
   static ResourceInfo Texture2DMS(Value *Symbol, StringRef Name,
-                                  ResourceBinding Binding, uint32_t UniqueID,
                                   dxil::ElementType ElementTy,
                                   uint32_t ElementCount, uint32_t SampleCount);
-  static ResourceInfo
-  Texture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                   uint32_t UniqueID, dxil::ElementType ElementTy,
-                   uint32_t ElementCount, uint32_t SampleCount);
+  static ResourceInfo Texture2DMSArray(Value *Symbol, StringRef Name,
+                                       dxil::ElementType ElementTy,
+                                       uint32_t ElementCount,
+                                       uint32_t SampleCount);
 
   static ResourceInfo UAV(Value *Symbol, StringRef Name,
-                          ResourceBinding Binding, uint32_t UniqueID,
                           dxil::ElementType ElementTy, uint32_t ElementCount,
                           bool GloballyCoherent, bool IsROV,
                           dxil::ResourceKind Kind);
   static ResourceInfo RWRawBuffer(Value *Symbol, StringRef Name,
-                                  ResourceBinding Binding, uint32_t UniqueID,
                                   bool GloballyCoherent, bool IsROV);
   static ResourceInfo RWStructuredBuffer(Value *Symbol, StringRef Name,
-                                         ResourceBinding Binding,
-                                         uint32_t UniqueID, uint32_t Stride,
-                                         Align Alignment, bool GloballyCoherent,
-                                         bool IsROV, bool HasCounter);
+                                         uint32_t Stride, MaybeAlign Alignment,
+                                         bool GloballyCoherent, bool IsROV,
+                                         bool HasCounter);
   static ResourceInfo RWTexture2DMS(Value *Symbol, StringRef Name,
-                                    ResourceBinding Binding, uint32_t UniqueID,
                                     dxil::ElementType ElementTy,
                                     uint32_t ElementCount, uint32_t SampleCount,
                                     bool GloballyCoherent);
-  static ResourceInfo
-  RWTexture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                     uint32_t UniqueID, dxil::ElementType ElementTy,
-                     uint32_t ElementCount, uint32_t SampleCount,
-                     bool GloballyCoherent);
+  static ResourceInfo RWTexture2DMSArray(Value *Symbol, StringRef Name,
+                                         dxil::ElementType ElementTy,
+                                         uint32_t ElementCount,
+                                         uint32_t SampleCount,
+                                         bool GloballyCoherent);
   static ResourceInfo FeedbackTexture2D(Value *Symbol, StringRef Name,
-                                        ResourceBinding Binding,
-                                        uint32_t UniqueID,
                                         dxil::SamplerFeedbackType FeedbackTy);
   static ResourceInfo
-  FeedbackTexture2DArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                         uint32_t UniqueID,
+  FeedbackTexture2DArray(Value *Symbol, StringRef Name,
                          dxil::SamplerFeedbackType FeedbackTy);
 
-  static ResourceInfo CBuffer(Value *Symbol, StringRef Name,
-                              ResourceBinding Binding, uint32_t UniqueID,
-                              uint32_t Size);
+  static ResourceInfo CBuffer(Value *Symbol, StringRef Name, uint32_t Size);
 
   static ResourceInfo Sampler(Value *Symbol, StringRef Name,
-                              ResourceBinding Binding, uint32_t UniqueID,
                               dxil::SamplerType SamplerTy);
 
-  bool operator==(const ResourceInfo &RHS) const;
-
   MDTuple *getAsMetadata(LLVMContext &Ctx) const;
 
   ResourceBinding getBinding() const { return Binding; }
@@ -188,4 +216,4 @@ class ResourceInfo {
 } // namespace dxil
 } // namespace llvm
 
-#endif // LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
+#endif // LLVM_ANALYSIS_DXILRESOURCE_H
diff --git a/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h b/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
index 84ed882c6de84..ca4ce68b85cbc 100644
--- a/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
+++ b/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
@@ -232,7 +232,7 @@ class GenericDomTreeUpdater {
   /// insertEdge/deleteEdge or is unnecessary in the batch update.
   bool isUpdateValid(typename DomTreeT::UpdateType Update) const;
 
-  /// Erase Basic Block node that has been unlinked from Function
+  /// Erase Basic Block node before it is unlinked from Function
   /// in the DomTree and PostDomTree.
   void eraseDelBBNode(BasicBlockT *DelBB);
 
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 6da1c5596b10e..1f01ff7027fa9 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -69,8 +69,7 @@ bool isDereferenceableAndAlignedPointer(const Value *V, Align Alignment,
 /// quick local scan of the basic block containing ScanFrom, to determine if
 /// the address is already accessed.
 bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size,
-                                 const DataLayout &DL,
-                                 Instruction *ScanFrom = nullptr,
+                                 const DataLayout &DL, Instruction *ScanFrom,
                                  AssumptionCache *AC = nullptr,
                                  const DominatorTree *DT = nullptr,
                                  const TargetLibraryInfo *TLI = nullptr);
@@ -86,6 +85,11 @@ bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
                                        ScalarEvolution &SE, DominatorTree &DT,
                                        AssumptionCache *AC = nullptr);
 
+/// Return true if the loop \p L cannot fault on any iteration and only
+/// contains read-only memory accesses.
+bool isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE,
+                                   DominatorTree *DT, AssumptionCache *AC);
+
 /// Return true if we know that executing a load from this value cannot trap.
 ///
 /// If DT and ScanFrom are specified this method performs context-sensitive
@@ -95,12 +99,18 @@ bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
 /// quick local scan of the basic block containing ScanFrom, to determine if
 /// the address is already accessed.
 bool isSafeToLoadUnconditionally(Value *V, Type *Ty, Align Alignment,
-                                 const DataLayout &DL,
-                                 Instruction *ScanFrom = nullptr,
+                                 const DataLayout &DL, Instruction *ScanFrom,
                                  AssumptionCache *AC = nullptr,
                                  const DominatorTree *DT = nullptr,
                                  const TargetLibraryInfo *TLI = nullptr);
 
+/// Return true if speculation of the given load must be suppressed to avoid
+/// ordering or interfering with an active sanitizer.  If not suppressed,
+/// dereferenceability and alignment must be proven separately.  Note: This
+/// is only needed for raw reasoning; if you use the interface below
+/// (isSafeToSpeculativelyExecute), this is handled internally.
+bool mustSuppressSpeculation(const LoadInst &LI);
+
 /// The default number of maximum instructions to scan in the block, used by
 /// FindAvailableLoadedValue().
 extern cl::opt<unsigned> DefMaxInstsToScan;
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index afafb74bdcb0a..43eac1474876f 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -199,9 +199,8 @@ class MemoryDepChecker {
   /// Check whether the dependencies between the accesses are safe.
   ///
   /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
-                   const DenseMap<Value *, SmallVector<const Value *, 16>>
-                       &UnderlyingObjects);
+  bool areDepsSafe(const DepCandidates &AccessSets,
+                   const MemAccessInfoList &CheckDeps);
 
   /// No memory dependence was encountered that would inhibit
   /// vectorization.
@@ -351,11 +350,8 @@ class MemoryDepChecker {
   /// element access it records this distance in \p MinDepDistBytes (if this
   /// distance is smaller than any other distance encountered so far).
   /// Otherwise, this function returns true signaling a possible dependence.
-  Dependence::DepType
-  isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
-              unsigned BIdx,
-              const DenseMap<Value *, SmallVector<const Value *, 16>>
-                  &UnderlyingObjects);
+  Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
+                                  const MemAccessInfo &B, unsigned BIdx);
 
   /// Check whether the data dependence could prevent store-load
   /// forwarding.
@@ -392,11 +388,9 @@ class MemoryDepChecker {
   /// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
   /// BIsWrite).
   std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
-  getDependenceDistanceStrideAndSize(
-      const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B,
-      Instruction *BInst,
-      const DenseMap<Value *, SmallVector<const Value *, 16>>
-          &UnderlyingObjects);
+  getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst,
+                                     const MemAccessInfo &B,
+                                     Instruction *BInst);
 };
 
 class RuntimePointerChecking;
@@ -405,14 +399,15 @@ class RuntimePointerChecking;
 struct RuntimeCheckingPtrGroup {
   /// Create a new pointer checking group containing a single
   /// pointer, with index \p Index in RtCheck.
-  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  RuntimeCheckingPtrGroup(unsigned Index,
+                          const RuntimePointerChecking &RtCheck);
 
   /// Tries to add the pointer recorded in RtCheck at index
   /// \p Index to this pointer checking group. We can only add a pointer
   /// to a checking group if we will still be able to get
   /// the upper and lower bounds of the check. Returns true in case
   /// of success, false otherwise.
-  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck);
+  bool addPointer(unsigned Index, const RuntimePointerChecking &RtCheck);
   bool addPointer(unsigned Index, const SCEV *Start, const SCEV *End,
                   unsigned AS, bool NeedsFreeze, ScalarEvolution &SE);
 
@@ -718,8 +713,8 @@ class LoopAccessInfo {
 private:
   /// Analyze the loop. Returns true if all memory access in the loop can be
   /// vectorized.
-  bool analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI,
-                   DominatorTree *DT);
+  bool analyzeLoop(AAResults *AA, const LoopInfo *LI,
+                   const TargetLibraryInfo *TLI, DominatorTree *DT);
 
   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
@@ -730,8 +725,8 @@ class LoopAccessInfo {
   /// LAA does not directly emits the remarks.  Instead it stores it which the
   /// client can retrieve and presents as its own analysis
   /// (e.g. -Rpass-analysis=loop-vectorize).
-  OptimizationRemarkAnalysis &recordAnalysis(StringRef RemarkName,
-                                             Instruction *Instr = nullptr);
+  OptimizationRemarkAnalysis &
+  recordAnalysis(StringRef RemarkName, const Instruction *Instr = nullptr);
 
   /// Collect memory access with loop invariant strides.
   ///
@@ -797,7 +792,8 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                           Value *Ptr);
 
 /// If the pointer has a constant stride return it in units of the access type
-/// size.  Otherwise return std::nullopt.
+/// size. If the pointer is loop-invariant, return 0. Otherwise return
+/// std::nullopt.
 ///
 /// Ensure that it does not wrap in the address space, assuming the predicate
 /// associated with \p PSE is true.
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index d9bfca763819f..1081357e734a6 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1028,6 +1028,11 @@ class ScalarEvolution {
   /// Test if the given expression is known to be non-zero.
   bool isKnownNonZero(const SCEV *S);
 
+  /// Test if the given expression is known to be a power of 2.  OrNegative
+  /// allows matching negative power of 2s, and OrZero allows matching 0.
+  bool isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero = false,
+                              bool OrNegative = false);
+
   /// Splits SCEV expression \p S into two SCEVs. One of them is obtained from
   /// \p S by substitution of all AddRec sub-expression related to loop \p L
   /// with initial value of that SCEV. The second is obtained from \p S by
diff --git a/llvm/include/llvm/Analysis/SimplifyQuery.h b/llvm/include/llvm/Analysis/SimplifyQuery.h
index a560744f01222..e8f43c8c2e91f 100644
--- a/llvm/include/llvm/Analysis/SimplifyQuery.h
+++ b/llvm/include/llvm/Analysis/SimplifyQuery.h
@@ -130,6 +130,12 @@ struct SimplifyQuery {
     Copy.CC = &CC;
     return Copy;
   }
+
+  SimplifyQuery getWithoutCondContext() const {
+    SimplifyQuery Copy(*this);
+    Copy.CC = nullptr;
+    return Copy;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index 717693a7cf63c..754f09c19fb35 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -476,6 +476,21 @@ TLI_DEFINE_ENUM_INTERNAL(atexit)
 TLI_DEFINE_STRING_INTERNAL("atexit")
 TLI_DEFINE_SIG_INTERNAL(Int, Ptr)
 
+/// void abort(void)
+TLI_DEFINE_ENUM_INTERNAL(abort)
+TLI_DEFINE_STRING_INTERNAL("abort")
+TLI_DEFINE_SIG_INTERNAL(Void)
+
+/// void exit(int)
+TLI_DEFINE_ENUM_INTERNAL(exit)
+TLI_DEFINE_STRING_INTERNAL("exit")
+TLI_DEFINE_SIG_INTERNAL(Void, Int)
+
+/// void _Exit(int)
+TLI_DEFINE_ENUM_INTERNAL(Exit)
+TLI_DEFINE_STRING_INTERNAL("_Exit")
+TLI_DEFINE_SIG_INTERNAL(Void, Int)
+
 /// void __cxa_guard_abort(guard_t *guard);
 /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
 TLI_DEFINE_ENUM_INTERNAL(cxa_guard_abort)
@@ -1792,6 +1807,21 @@ TLI_DEFINE_ENUM_INTERNAL(modfl)
 TLI_DEFINE_STRING_INTERNAL("modfl")
 TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, Ptr)
 
+/// double nan(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nan)
+TLI_DEFINE_STRING_INTERNAL("nan")
+TLI_DEFINE_SIG_INTERNAL(Dbl, Ptr)
+
+/// float nanf(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nanf)
+TLI_DEFINE_STRING_INTERNAL("nanf")
+TLI_DEFINE_SIG_INTERNAL(Flt, Ptr)
+
+/// long double nanl(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nanl)
+TLI_DEFINE_STRING_INTERNAL("nanl")
+TLI_DEFINE_SIG_INTERNAL(LDbl, Ptr)
+
 /// double nearbyint(double x);
 TLI_DEFINE_ENUM_INTERNAL(nearbyint)
 TLI_DEFINE_STRING_INTERNAL("nearbyint")
@@ -1953,6 +1983,21 @@ TLI_DEFINE_ENUM_INTERNAL(remainderl)
 TLI_DEFINE_STRING_INTERNAL("remainderl")
 TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl)
 
+/// double remquo(double x, double y, int *quo);
+TLI_DEFINE_ENUM_INTERNAL(remquo)
+TLI_DEFINE_STRING_INTERNAL("remquo")
+TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Dbl, Ptr)
+
+/// float remquof(float x, float y, int *quo);
+TLI_DEFINE_ENUM_INTERNAL(remquof)
+TLI_DEFINE_STRING_INTERNAL("remquof")
+TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Flt, Ptr)
+
+/// long double remquol(long double x, long double y, int *quo);
+TLI_DEFINE_ENUM_INTERNAL(remquol)
+TLI_DEFINE_STRING_INTERNAL("remquol")
+TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl, Ptr)
+
 /// int remove(const char *path);
 TLI_DEFINE_ENUM_INTERNAL(remove)
 TLI_DEFINE_STRING_INTERNAL("remove")
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index cf378008e4c7c..38e8b9da21397 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -419,6 +419,12 @@ class TargetTransformInfo {
   /// this factor, it is very likely to be predicted correctly.
   BranchProbability getPredictableBranchThreshold() const;
 
+  /// Returns estimated penalty of a branch misprediction in latency. Indicates
+  /// how aggressive the target wants for eliminating unpredictable branches. A
+  /// zero return value means extra optimization applied to them should be
+  /// minimal.
+  InstructionCost getBranchMispredictPenalty() const;
+
   /// Return true if branch divergence exists.
   ///
   /// Branch divergence has a significantly negative impact on GPU performance
@@ -1581,7 +1587,7 @@ class TargetTransformInfo {
   /// \returns The type to use in a loop expansion of a memcpy call.
   Type *getMemcpyLoopLoweringType(
       LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicElementSize = std::nullopt) const;
 
   /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
@@ -1593,7 +1599,7 @@ class TargetTransformInfo {
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign,
+      Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicCpySize = std::nullopt) const;
 
   /// \returns True if the two functions have compatible attributes for inlining
@@ -1832,6 +1838,7 @@ class TargetTransformInfo::Concept {
                                              ArrayRef<const Value *> Operands,
                                              TargetCostKind CostKind) = 0;
   virtual BranchProbability getPredictableBranchThreshold() = 0;
+  virtual InstructionCost getBranchMispredictPenalty() = 0;
   virtual bool hasBranchDivergence(const Function *F = nullptr) = 0;
   virtual bool isSourceOfDivergence(const Value *V) = 0;
   virtual bool isAlwaysUniform(const Value *V) = 0;
@@ -2126,13 +2133,13 @@ class TargetTransformInfo::Concept {
                                                    Type *ExpectedType) = 0;
   virtual Type *getMemcpyLoopLoweringType(
       LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicElementSize) const = 0;
 
   virtual void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign,
+      Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicCpySize) const = 0;
   virtual bool areInlineCompatible(const Function *Caller,
                                    const Function *Callee) const = 0;
@@ -2243,6 +2250,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   BranchProbability getPredictableBranchThreshold() override {
     return Impl.getPredictableBranchThreshold();
   }
+  InstructionCost getBranchMispredictPenalty() override {
+    return Impl.getBranchMispredictPenalty();
+  }
   bool hasBranchDivergence(const Function *F = nullptr) override {
     return Impl.hasBranchDivergence(F);
   }
@@ -2828,7 +2838,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   }
   Type *getMemcpyLoopLoweringType(
       LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicElementSize) const override {
     return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
                                           DestAddrSpace, SrcAlign, DestAlign,
@@ -2837,7 +2847,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign,
+      Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicCpySize) const override {
     Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
                                            SrcAddrSpace, DestAddrSpace,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 47fde08735c0c..899c5041aba4d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -99,6 +99,8 @@ class TargetTransformInfoImplBase {
     return BranchProbability(99, 100);
   }
 
+  InstructionCost getBranchMispredictPenalty() const { return 0; }
+
   bool hasBranchDivergence(const Function *F = nullptr) const { return false; }
 
   bool isSourceOfDivergence(const Value *V) const { return false; }
@@ -837,7 +839,7 @@ class TargetTransformInfoImplBase {
   Type *
   getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                             unsigned SrcAddrSpace, unsigned DestAddrSpace,
-                            unsigned SrcAlign, unsigned DestAlign,
+                            Align SrcAlign, Align DestAlign,
                             std::optional<uint32_t> AtomicElementSize) const {
     return AtomicElementSize ? Type::getIntNTy(Context, *AtomicElementSize * 8)
                              : Type::getInt8Ty(Context);
@@ -846,7 +848,7 @@ class TargetTransformInfoImplBase {
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign,
+      Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicCpySize) const {
     unsigned OpSizeInBytes = AtomicCpySize ? *AtomicCpySize : 1;
     Type *OpType = Type::getIntNTy(Context, OpSizeInBytes * 8);
diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h
index b81eb5f60ab7e..fa56d838a3859 100644
--- a/llvm/include/llvm/Analysis/ValueLattice.h
+++ b/llvm/include/llvm/Analysis/ValueLattice.h
@@ -281,18 +281,21 @@ class ValueLatticeElement {
     return std::nullopt;
   }
 
-  ConstantRange asConstantRange(Type *Ty, bool UndefAllowed = false) const {
-    assert(Ty->isIntOrIntVectorTy() && "Must be integer type");
+  ConstantRange asConstantRange(unsigned BW, bool UndefAllowed = false) const {
     if (isConstantRange(UndefAllowed))
       return getConstantRange();
     if (isConstant())
       return getConstant()->toConstantRange();
-    unsigned BW = Ty->getScalarSizeInBits();
     if (isUnknown())
       return ConstantRange::getEmpty(BW);
     return ConstantRange::getFull(BW);
   }
 
+  ConstantRange asConstantRange(Type *Ty, bool UndefAllowed = false) const {
+    assert(Ty->isIntOrIntVectorTy() && "Must be integer type");
+    return asConstantRange(Ty->getScalarSizeInBits(), UndefAllowed);
+  }
+
   bool markOverdefined() {
     if (isOverdefined())
       return false;
@@ -384,7 +387,9 @@ class ValueLatticeElement {
       return true;
     }
 
-    assert(isUnknown() || isUndef());
+    assert(isUnknown() || isUndef() || isConstant());
+    assert((!isConstant() || NewR.contains(getConstant()->toConstantRange())) &&
+           "Constant must be subset of new range");
 
     NumRangeExtensions = 0;
     Tag = NewTag;
@@ -426,6 +431,16 @@ class ValueLatticeElement {
         return false;
       if (RHS.isUndef())
         return false;
+      // If the constant is a vector of integers, try to treat it as a range.
+      if (getConstant()->getType()->isVectorTy() &&
+          getConstant()->getType()->getScalarType()->isIntegerTy()) {
+        ConstantRange L = getConstant()->toConstantRange();
+        ConstantRange NewR = L.unionWith(
+            RHS.asConstantRange(L.getBitWidth(), /*UndefAllowed=*/true));
+        return markConstantRange(
+            std::move(NewR),
+            Opts.setMayIncludeUndef(RHS.isConstantRangeIncludingUndef()));
+      }
       markOverdefined();
       return true;
     }
@@ -444,14 +459,9 @@ class ValueLatticeElement {
       return OldTag != Tag;
     }
 
-    if (!RHS.isConstantRange()) {
-      // We can get here if we've encountered a constantexpr of integer type
-      // and merge it with a constantrange.
-      markOverdefined();
-      return true;
-    }
-
-    ConstantRange NewR = getConstantRange().unionWith(RHS.getConstantRange());
+    const ConstantRange &L = getConstantRange();
+    ConstantRange NewR = L.unionWith(
+        RHS.asConstantRange(L.getBitWidth(), /*UndefAllowed=*/true));
     return markConstantRange(
         std::move(NewR),
         Opts.setMayIncludeUndef(RHS.isConstantRangeIncludingUndef()));
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 354ad5bc95317..96fa16970584d 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -526,16 +526,17 @@ inline KnownFPClass computeKnownFPClass(
 }
 
 /// Wrapper to account for known fast math flags at the use instruction.
-inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF,
-                                        FPClassTest InterestedClasses,
-                                        unsigned Depth,
-                                        const SimplifyQuery &SQ) {
+inline KnownFPClass
+computeKnownFPClass(const Value *V, const APInt &DemandedElts,
+                    FastMathFlags FMF, FPClassTest InterestedClasses,
+                    unsigned Depth, const SimplifyQuery &SQ) {
   if (FMF.noNaNs())
     InterestedClasses &= ~fcNan;
   if (FMF.noInfs())
     InterestedClasses &= ~fcInf;
 
-  KnownFPClass Result = computeKnownFPClass(V, InterestedClasses, Depth, SQ);
+  KnownFPClass Result =
+      computeKnownFPClass(V, DemandedElts, InterestedClasses, Depth, SQ);
 
   if (FMF.noNaNs())
     Result.KnownFPClasses &= ~fcNan;
@@ -544,6 +545,17 @@ inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF,
   return Result;
 }
 
+inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF,
+                                        FPClassTest InterestedClasses,
+                                        unsigned Depth,
+                                        const SimplifyQuery &SQ) {
+  auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
+  APInt DemandedElts =
+      FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
+  return computeKnownFPClass(V, DemandedElts, FMF, InterestedClasses, Depth,
+                             SQ);
+}
+
 /// Return true if we can prove that the specified FP value is never equal to
 /// -0.0. Users should use caution when considering PreserveSign
 /// denormal-fp-math.
@@ -724,6 +736,10 @@ inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) {
   return const_cast<Value *>(getUnderlyingObject(VConst, MaxLookup));
 }
 
+/// Like getUnderlyingObject(), but will try harder to find a single underlying
+/// object. In particular, this function also looks through selects and phis.
+const Value *getUnderlyingObjectAggressive(const Value *V);
+
 /// This method is similar to getUnderlyingObject except that it can
 /// look through phi and select instructions and return multiple objects.
 ///
@@ -754,7 +770,7 @@ inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) {
 /// it shouldn't look through the phi above.
 void getUnderlyingObjects(const Value *V,
                           SmallVectorImpl<const Value *> &Objects,
-                          LoopInfo *LI = nullptr, unsigned MaxLookup = 6);
+                          const LoopInfo *LI = nullptr, unsigned MaxLookup = 6);
 
 /// This is a wrapper around getUnderlyingObjects and adds support for basic
 /// ptrtoint+arithmetic+inttoptr sequences.
@@ -776,13 +792,6 @@ bool onlyUsedByLifetimeMarkers(const Value *V);
 /// droppable instructions.
 bool onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V);
 
-/// Return true if speculation of the given load must be suppressed to avoid
-/// ordering or interfering with an active sanitizer.  If not suppressed,
-/// dereferenceability and alignment must be proven separately.  Note: This
-/// is only needed for raw reasoning; if you use the interface below
-/// (isSafeToSpeculativelyExecute), this is handled internally.
-bool mustSuppressSpeculation(const LoadInst &LI);
-
 /// Return true if the instruction does not have any effects besides
 /// calculating the result and does not have undefined behavior.
 ///
@@ -810,15 +819,25 @@ bool isSafeToSpeculativelyExecute(const Instruction *I,
                                   const Instruction *CtxI = nullptr,
                                   AssumptionCache *AC = nullptr,
                                   const DominatorTree *DT = nullptr,
-                                  const TargetLibraryInfo *TLI = nullptr);
+                                  const TargetLibraryInfo *TLI = nullptr,
+                                  bool UseVariableInfo = true);
+
+inline bool isSafeToSpeculativelyExecute(const Instruction *I,
+                                         BasicBlock::iterator CtxI,
+                                         AssumptionCache *AC = nullptr,
+                                         const DominatorTree *DT = nullptr,
+                                         const TargetLibraryInfo *TLI = nullptr,
+                                         bool UseVariableInfo = true) {
+  // Take an iterator, and unwrap it into an Instruction *.
+  return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI, UseVariableInfo);
+}
 
+/// Don't use information from its non-constant operands. This helper is used
+/// when its operands are going to be replaced.
 inline bool
-isSafeToSpeculativelyExecute(const Instruction *I, BasicBlock::iterator CtxI,
-                             AssumptionCache *AC = nullptr,
-                             const DominatorTree *DT = nullptr,
-                             const TargetLibraryInfo *TLI = nullptr) {
-  // Take an iterator, and unwrap it into an Instruction *.
-  return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI);
+isSafeToSpeculativelyExecuteWithVariableReplaced(const Instruction *I) {
+  return isSafeToSpeculativelyExecute(I, nullptr, nullptr, nullptr, nullptr,
+                                      /*UseVariableInfo=*/false);
 }
 
 /// This returns the same result as isSafeToSpeculativelyExecute if Opcode is
@@ -841,7 +860,7 @@ isSafeToSpeculativelyExecute(const Instruction *I, BasicBlock::iterator CtxI,
 bool isSafeToSpeculativelyExecuteWithOpcode(
     unsigned Opcode, const Instruction *Inst, const Instruction *CtxI = nullptr,
     AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr,
-    const TargetLibraryInfo *TLI = nullptr);
+    const TargetLibraryInfo *TLI = nullptr, bool UseVariableInfo = true);
 
 /// Returns true if the result or effects of the given instructions \p I
 /// depend values not reachable through the def use graph.
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index ffdf8b8c3bc79..7ea010a345f38 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -51,13 +51,19 @@ TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.tan.f32", "vtanf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "vasinf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Hyperbolic Functions
 TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("coshf", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanhf", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4), "_ZGV_LLVM_N4v")
@@ -73,12 +79,18 @@ TLI_DEFINE_VECFUNC("llvm.exp.f32", "_simd_exp_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Trigonometric Functions
 TLI_DEFINE_VECFUNC("acos", "_simd_acos_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_simd_acos_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("acosf", "_simd_acos_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_simd_acos_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asin", "_simd_asin_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_simd_asin_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("asinf", "_simd_asin_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_simd_asin_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("atan", "_simd_atan_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_simd_atan_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("atanf", "_simd_atan_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_simd_atan_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("atan2", "_simd_atan2_d2", FIXED(2), "_ZGV_LLVM_N2vv")
 TLI_DEFINE_VECFUNC("atan2f", "_simd_atan2_f4", FIXED(4), "_ZGV_LLVM_N4vv")
 
@@ -109,11 +121,17 @@ TLI_DEFINE_VECFUNC("llvm.pow.f32", "_simd_pow_f4", FIXED(4), "_ZGV_LLVM_N4vv")
 
 // Hyperbolic Functions
 TLI_DEFINE_VECFUNC("sinh", "_simd_sinh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_simd_sinh_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("sinhf", "_simd_sinh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_simd_sinh_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("cosh", "_simd_cosh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_simd_cosh_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("coshf", "_simd_cosh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_simd_cosh_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanh", "_simd_tanh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_simd_tanh_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("tanhf", "_simd_tanh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_simd_tanh_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asinh", "_simd_asinh_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("asinhf", "_simd_asinh_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acosh", "_simd_acosh_d2", FIXED(2), "_ZGV_LLVM_N2v")
@@ -500,14 +518,17 @@ TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16), "_ZGV_LLVM_N16
 #elif defined(TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS)
 
 TLI_DEFINE_VECFUNC("acos", "_ZGVnN2v_acos", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVnN2v_acos", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("acosh", "_ZGVnN2v_acosh", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("asin", "_ZGVnN2v_asin", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVnN2v_asin", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("asinh", "_ZGVnN2v_asinh", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("atan", "_ZGVnN2v_atan", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVnN2v_atan", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("atan2", "_ZGVnN2vv_atan2", FIXED(2), "_ZGV_LLVM_N2vv")
 
@@ -521,6 +542,7 @@ TLI_DEFINE_VECFUNC("cos", "_ZGVnN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("cosh", "_ZGVnN2v_cosh", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVnN2v_cosh", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("cospi", "_ZGVnN2v_cospi", FIXED(2), "_ZGV_LLVM_N2v")
 
@@ -583,6 +605,7 @@ TLI_DEFINE_VECFUNC("sincos", "_ZGVnN2vl8l8_sincos", FIXED(2), "_ZGV_LLVM_N2vl8l8
 TLI_DEFINE_VECFUNC("sincospi", "_ZGVnN2vl8l8_sincospi", FIXED(2), "_ZGV_LLVM_N2vl8l8")
 
 TLI_DEFINE_VECFUNC("sinh", "_ZGVnN2v_sinh", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVnN2v_sinh", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("sinpi", "_ZGVnN2v_sinpi", FIXED(2), "_ZGV_LLVM_N2v")
 
@@ -592,20 +615,24 @@ TLI_DEFINE_VECFUNC("tan", "_ZGVnN2v_tan", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("tanh", "_ZGVnN2v_tanh", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVnN2v_tanh", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("tgamma", "_ZGVnN2v_tgamma", FIXED(2), "_ZGV_LLVM_N2v")
 
 #elif defined(TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS)
 
 TLI_DEFINE_VECFUNC("acosf", "_ZGVnN4v_acosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN4v_acosf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN4v_acoshf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("asinf", "_ZGVnN4v_asinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN4v_asinf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN4v_asinhf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("atanf", "_ZGVnN4v_atanf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN4v_atanf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN4vv_atan2f", FIXED(4), "_ZGV_LLVM_N4vv")
 
@@ -619,6 +646,7 @@ TLI_DEFINE_VECFUNC("cosf", "_ZGVnN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("coshf", "_ZGVnN4v_coshf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN4v_coshf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("cospif", "_ZGVnN4v_cospif", FIXED(4), "_ZGV_LLVM_N4v")
 
@@ -681,6 +709,7 @@ TLI_DEFINE_VECFUNC("sincosf", "_ZGVnN4vl4l4_sincosf", FIXED(4), "_ZGV_LLVM_N4vl4
 TLI_DEFINE_VECFUNC("sincospif", "_ZGVnN4vl4l4_sincospif", FIXED(4), "_ZGV_LLVM_N4vl4l4")
 
 TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN4v_sinhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN4v_sinhf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("sinpif", "_ZGVnN4v_sinpif", FIXED(4), "_ZGV_LLVM_N4v")
 
@@ -690,6 +719,7 @@ TLI_DEFINE_VECFUNC("tanf", "_ZGVnN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN4v_tanhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("tgammaf", "_ZGVnN4v_tgammaf", FIXED(4), "_ZGV_LLVM_N4v")
 
@@ -697,18 +727,24 @@ TLI_DEFINE_VECFUNC("tgammaf", "_ZGVnN4v_tgammaf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("acosh", "_ZGVsMxv_acosh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("acoshf", "_ZGVsMxv_acoshf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("asinh", "_ZGVsMxv_asinh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("asinhf", "_ZGVsMxv_asinhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2",  SCALABLE(2), MASKED, "_ZGVsMxvv")
 TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv")
@@ -729,6 +765,8 @@ TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsM
 
 TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVsMxv_cosh", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("cospi", "_ZGVsMxv_cospi",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("cospif", "_ZGVsMxv_cospif", SCALABLE(4), MASKED, "_ZGVsMxv")
@@ -826,6 +864,8 @@ TLI_DEFINE_VECFUNC("sincospif", "_ZGVsNxvl4l4_sincospif", SCALABLE(4), NOMASK, "
 
 TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("sinpi", "_ZGVsMxv_sinpi",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("sinpif", "_ZGVsMxv_sinpif", SCALABLE(4), MASKED, "_ZGVsMxv")
@@ -840,6 +880,8 @@ TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsM
 
 TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVsMxv_tanh", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED, "_ZGVsMxv")
@@ -851,6 +893,11 @@ TLI_DEFINE_VECFUNC("acosf", "armpl_vacosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("acos", "armpl_svacos_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("acosf", "armpl_svacos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "armpl_vacosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "armpl_vacosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "armpl_svacos_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "armpl_svacos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("acosh", "armpl_vacoshq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("acoshf", "armpl_vacoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acosh", "armpl_svacosh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -861,6 +908,11 @@ TLI_DEFINE_VECFUNC("asinf", "armpl_vasinq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("asin", "armpl_svasin_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("asinf", "armpl_svasin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "armpl_vasinq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "armpl_vasinq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "armpl_svasin_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "armpl_svasin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("asinh", "armpl_vasinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("asinhf", "armpl_vasinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asinh", "armpl_svasinh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -871,6 +923,11 @@ TLI_DEFINE_VECFUNC("atanf", "armpl_vatanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("atan", "armpl_svatan_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("atanf", "armpl_svatan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "armpl_vatanq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "armpl_vatanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "armpl_svatan_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "armpl_svatan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("atan2", "armpl_vatan2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
 TLI_DEFINE_VECFUNC("atan2f", "armpl_vatan2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
 TLI_DEFINE_VECFUNC("atan2", "armpl_svatan2_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
@@ -906,6 +963,11 @@ TLI_DEFINE_VECFUNC("coshf", "armpl_vcoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("cosh", "armpl_svcosh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("coshf", "armpl_svcosh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "armpl_vcoshq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "armpl_vcoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "armpl_svcosh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "armpl_svcosh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("cospi", "armpl_vcospiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("cospif", "armpl_vcospiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("cospi", "armpl_svcospi_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -1081,6 +1143,11 @@ TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("sinh", "armpl_svsinh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("sinhf", "armpl_svsinh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "armpl_vsinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "armpl_vsinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "armpl_svsinh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "armpl_svsinh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("sinpi", "armpl_vsinpiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("sinpif", "armpl_vsinpiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("sinpi", "armpl_svsinpi_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -1106,6 +1173,11 @@ TLI_DEFINE_VECFUNC("tanhf", "armpl_vtanhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("tanh", "armpl_svtanh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tanhf", "armpl_svtanh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "armpl_vtanhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "armpl_vtanhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "armpl_svtanh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "armpl_svtanh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("tgamma", "armpl_vtgammaq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -1292,9 +1364,17 @@ TLI_DEFINE_VECFUNC("asinf", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("asinf", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("asinf", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
 TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
 TLI_DEFINE_VECFUNC("atan", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("atan", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("atan", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
@@ -1302,12 +1382,25 @@ TLI_DEFINE_VECFUNC("atanf", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("atanf", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("atanf", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
 TLI_DEFINE_VECFUNC("coshf", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("coshf", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
 TLI_DEFINE_VECFUNC("tanhf", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanhf", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs4_tanhf",  FIXED(4),  NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs8_tanhf",  FIXED(8),  NOMASK, "_ZGV_LLVM_N8v")
+
 TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index e381295802009..9e551d3ae2bbe 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -560,8 +560,7 @@ namespace llvm {
     bool parseExceptionArgs(SmallVectorImpl<Value *> &Args,
                             PerFunctionState &PFS);
 
-    bool resolveFunctionType(Type *RetType,
-                             const SmallVector<ParamInfo, 16> &ArgList,
+    bool resolveFunctionType(Type *RetType, ArrayRef<ParamInfo> ArgList,
                              FunctionType *&FuncTy);
 
     // Constant Parsing.
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 456cffff6b4a7..fb39bb4b10b37 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -697,6 +697,19 @@ enum {
 #include "ELFRelocs/SystemZ.def"
 };
 
+// SPARC Specific e_flags
+enum : unsigned {
+  EF_SPARC_EXT_MASK = 0xffff00,
+  EF_SPARC_32PLUS = 0x000100,
+  EF_SPARC_SUN_US1 = 0x000200,
+  EF_SPARC_HAL_R1 = 0x000400,
+  EF_SPARC_SUN_US3 = 0x000800,
+  EF_SPARCV9_MM = 0x3,
+  EF_SPARCV9_TSO = 0x0,
+  EF_SPARCV9_PSO = 0x1,
+  EF_SPARCV9_RMO = 0x2,
+};
+
 // ELF Relocation type for Sparc.
 enum {
 #include "ELFRelocs/Sparc.def"
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 184bbe32df695..fb88f2fe75adb 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -757,6 +757,7 @@ enum AttributeKindCodes {
   ATTR_KIND_RANGE = 92,
   ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 93,
   ATTR_KIND_INITIALIZES = 94,
+  ATTR_KIND_HYBRID_PATCHABLE = 95,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index dc00bd57d655d..36d1b47973870 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -225,6 +225,9 @@ class AsmPrinter : public MachineFunctionPass {
   /// split stack prologue.
   bool HasNoSplitStack = false;
 
+  /// True if debugging information is available in this module.
+  bool DbgInfoAvailable = false;
+
 protected:
   explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 
@@ -430,6 +433,9 @@ class AsmPrinter : public MachineFunctionPass {
   /// Get the CFISection type for the module.
   CFISection getModuleCFISectionType() const { return ModuleCFISection; }
 
+  /// Returns true if valid debug info is present.
+  bool hasDebugInfo() const { return DbgInfoAvailable; }
+
   bool needsSEHMoves();
 
   /// Since emitting CFI unwind information is entangled with supporting the
@@ -577,6 +583,9 @@ class AsmPrinter : public MachineFunctionPass {
     report_fatal_error("ptrauth constant lowering not implemented");
   }
 
+  /// Lower the specified BlockAddress to an MCExpr.
+  virtual const MCExpr *lowerBlockAddressConstant(const BlockAddress &BA);
+
   /// Return true if the basic block has exactly one predecessor and the control
   /// transfer mechanism between the predecessor and this block is a
   /// fall-through.
@@ -892,7 +901,6 @@ class AsmPrinter : public MachineFunctionPass {
   virtual void emitModuleCommandLines(Module &M);
 
   GCMetadataPrinter *getOrCreateGCPrinter(GCStrategy &S);
-  virtual void emitGlobalAlias(const Module &M, const GlobalAlias &GA);
   void emitGlobalIFunc(Module &M, const GlobalIFunc &GI);
 
 private:
@@ -900,6 +908,7 @@ class AsmPrinter : public MachineFunctionPass {
   bool shouldEmitLabelForBasicBlock(const MachineBasicBlock &MBB) const;
 
 protected:
+  virtual void emitGlobalAlias(const Module &M, const GlobalAlias &GA);
   virtual bool shouldEmitWeakSwiftAsyncExtendedFramePointerFlags() const {
     return false;
   }
diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
index 36a844e7087fa..85046c200ff9b 100644
--- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h
+++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
@@ -137,6 +137,8 @@ class DebugHandlerBase {
   void beginBasicBlockSection(const MachineBasicBlock &MBB);
   void endBasicBlockSection(const MachineBasicBlock &MBB);
 
+  virtual void beginCodeAlignment(const MachineBasicBlock &MBB) {}
+
   /// Return Label preceding the instruction.
   MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
 
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 45a47d7333e35..04667c04a4ef4 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -188,6 +188,9 @@ class FunctionLoweringInfo {
   /// SelectionDAGISel::PrepareEHLandingPad().
   unsigned ExceptionPointerVirtReg, ExceptionSelectorVirtReg;
 
+  /// The current call site index being processed, if any. 0 if none.
+  unsigned CurCallSite = 0;
+
   /// Collection of dbg.declare instructions handled after argument
   /// lowering and before ISel proper.
   SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
@@ -281,6 +284,12 @@ class FunctionLoweringInfo {
   Register getCatchPadExceptionPointerVReg(const Value *CPI,
                                            const TargetRegisterClass *RC);
 
+  /// Set the call site currently being processed.
+  void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
+
+  /// Get the call site currently being processed, if any. Return zero if none.
+  unsigned getCurrentCallSite() { return CurCallSite; }
+
 private:
   /// LiveOutRegInfo - Information about live out vregs.
   IndexedMap<LiveOutInfo, VirtReg2IndexFunctor> LiveOutRegInfo;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 37a56e12efcc3..05d7e882f5135 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -129,6 +129,12 @@ class CombinerHelper {
 
   const TargetLowering &getTargetLowering() const;
 
+  const MachineFunction &getMachineFunction() const;
+
+  const DataLayout &getDataLayout() const;
+
+  LLVMContext &getContext() const;
+
   /// \returns true if the combiner is running pre-legalization.
   bool isPreLegalize() const;
 
@@ -390,12 +396,6 @@ class CombinerHelper {
   void applyCombineExtOfExt(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
 
-  /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
-  bool matchCombineTruncOfExt(MachineInstr &MI,
-                              std::pair<Register, unsigned> &MatchInfo);
-  void applyCombineTruncOfExt(MachineInstr &MI,
-                              std::pair<Register, unsigned> &MatchInfo);
-
   /// Transform trunc (shl x, K) to shl (trunc x), K
   ///    if K < VT.getScalarSizeInBits().
   ///
@@ -886,6 +886,13 @@ class CombinerHelper {
 
   bool matchShlOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo);
 
+  /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
+  bool matchTruncateOfExt(const MachineInstr &Root, const MachineInstr &ExtMI,
+                          BuildFnTy &MatchInfo);
+
+  bool matchCastOfSelect(const MachineInstr &Cast, const MachineInstr &SelectMI,
+                         BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -998,6 +1005,8 @@ class CombinerHelper {
 
   // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
   bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
+
+  bool isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const;
 };
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
index cc2dd2f4e489c..7b42722ca8d4f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
@@ -15,11 +15,13 @@
 #ifndef LLVM_CODEGEN_GLOBALISEL_GIMATCHTABLEEXECUTOR_H
 #define LLVM_CODEGEN_GLOBALISEL_GIMATCHTABLEEXECUTOR_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Bitset.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGenTypes/LowLevelType.h"
 #include "llvm/IR/Function.h"
 #include <bitset>
@@ -133,6 +135,12 @@ enum {
   /// - Ops(ULEB128) - Expected number of operands
   GIM_CheckNumOperands,
 
+  /// Check the instruction has a number of operands <= or >= than given number.
+  /// - InsnID(ULEB128) - Instruction ID
+  /// - Ops(ULEB128) - Number of operands
+  GIM_CheckNumOperandsLE,
+  GIM_CheckNumOperandsGE,
+
   /// Check an immediate predicate on the specified instruction
   /// - InsnID(ULEB128) - Instruction ID
   /// - Pred(2) - The predicate to test
@@ -294,12 +302,15 @@ enum {
   /// Check the specified operands are identical.
   /// The IgnoreCopies variant looks through COPY instructions before
   /// comparing the operands.
+  /// The "All" variants check all operands starting from the index.
   /// - InsnID(ULEB128) - Instruction ID
   /// - OpIdx(ULEB128) - Operand index
   /// - OtherInsnID(ULEB128) - Other instruction ID
   /// - OtherOpIdx(ULEB128) - Other operand index
   GIM_CheckIsSameOperand,
   GIM_CheckIsSameOperandIgnoreCopies,
+  GIM_CheckAllSameOperand,
+  GIM_CheckAllSameOperandIgnoreCopies,
 
   /// Check we can replace all uses of a register with another.
   /// - OldInsnID(ULEB128)
@@ -362,6 +373,13 @@ enum {
   /// GIR_Copy but with both New/OldInsnIDs omitted and defaulting to zero.
   GIR_RootToRootCopy,
 
+  /// Copies all operand starting from OpIdx in OldInsnID into the new
+  /// instruction NewInsnID.
+  /// - NewInsnID(ULEB128) - Instruction ID to modify
+  /// - OldInsnID(ULEB128) - Instruction ID to copy from
+  /// - OpIdx(ULEB128) - The first operand to copy
+  GIR_CopyRemaining,
+
   /// Copy an operand to the specified instruction or add a zero register if the
   /// operand is a zero immediate.
   /// - NewInsnID(ULEB128) - Instruction ID to modify
@@ -713,6 +731,12 @@ class GIMatchTableExecutor {
     return Ret;
   }
 
+  static ArrayRef<MachineOperand> getRemainingOperands(const MachineInstr &MI,
+                                                       unsigned FirstVarOp) {
+    auto Operands = drop_begin(MI.operands(), FirstVarOp);
+    return {Operands.begin(), Operands.end()};
+  }
+
 public:
   // Faster ULEB128 decoder tailored for the Match Table Executor.
   //
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 90b4fe5518c87..5a5a750ac6b4a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -309,6 +309,23 @@ bool GIMatchTableExecutor::executeMatchTable(
       break;
     }
 
+    case GIM_CheckNumOperandsGE:
+    case GIM_CheckNumOperandsLE: {
+      uint64_t InsnID = readULEB();
+      uint64_t Expected = readULEB();
+      const bool IsLE = (MatcherOpcode == GIM_CheckNumOperandsLE);
+      DEBUG_WITH_TYPE(TgtExecutor::getName(),
+                      dbgs() << CurrentIdx << ": GIM_CheckNumOperands"
+                             << (IsLE ? "LE" : "GE") << "(MIs[" << InsnID
+                             << "], Expected=" << Expected << ")\n");
+      assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+      const unsigned NumOps = State.MIs[InsnID]->getNumOperands();
+      if (IsLE ? (NumOps <= Expected) : (NumOps >= Expected)) {
+        if (handleReject() == RejectAndGiveUp)
+          return false;
+      }
+      break;
+    }
     case GIM_CheckNumOperands: {
       uint64_t InsnID = readULEB();
       uint64_t Expected = readULEB();
@@ -1081,6 +1098,22 @@ bool GIMatchTableExecutor::executeMatchTable(
       break;
     }
 
+    case GIR_CopyRemaining: {
+      uint64_t NewInsnID = readULEB();
+      uint64_t OldInsnID = readULEB();
+      uint64_t OpIdx = readULEB();
+      assert(OutMIs[NewInsnID] && "Attempted to add to undefined instruction");
+      MachineInstr &OldMI = *State.MIs[OldInsnID];
+      MachineInstrBuilder &NewMI = OutMIs[NewInsnID];
+      for (const auto &Op : drop_begin(OldMI.operands(), OpIdx))
+        NewMI.add(Op);
+      DEBUG_WITH_TYPE(TgtExecutor::getName(),
+                      dbgs() << CurrentIdx << ": GIR_CopyRemaining(OutMIs["
+                             << NewInsnID << "], MIs[" << OldInsnID
+                             << "], /*start=*/" << OpIdx << ")\n");
+      break;
+    }
+
     case GIR_CopyOrAddZeroReg: {
       uint64_t NewInsnID = readULEB();
       uint64_t OldInsnID = readULEB();
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index d9f6f6540bdc8..ef1171d9f1f64 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -900,6 +900,56 @@ class GShl : public GenericMachineInstr {
   };
 };
 
+/// Represents a threeway compare.
+class GSUCmp : public GenericMachineInstr {
+public:
+  Register getLHSReg() const { return getOperand(1).getReg(); }
+  Register getRHSReg() const { return getOperand(2).getReg(); }
+
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SCMP; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_SCMP:
+    case TargetOpcode::G_UCMP:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
+/// Represents an integer-like extending operation.
+class GExtOp : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_SEXT:
+    case TargetOpcode::G_ZEXT:
+    case TargetOpcode::G_ANYEXT:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
+/// Represents an integer-like extending or truncating operation.
+class GExtOrTruncOp : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_SEXT:
+    case TargetOpcode::G_ZEXT:
+    case TargetOpcode::G_ANYEXT:
+    case TargetOpcode::G_TRUNC:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
index cada7f30072e2..8017f09aa3c8b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
@@ -49,8 +49,8 @@ class InstructionSelect : public MachineFunctionPass {
         MachineFunctionProperties::Property::Selected);
   }
 
-  InstructionSelect(CodeGenOptLevel OL);
-  InstructionSelect();
+  InstructionSelect(CodeGenOptLevel OL = CodeGenOptLevel::Default,
+                    char &PassID = ID);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index b17bc9aa2a44e..0b2cd299bde12 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -280,6 +280,9 @@ class LegalizerHelper {
   LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                          MachineInstr &MI,
                                          LostDebugLocObserver &LocObserver);
+  LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver);
 
   MachineInstrBuilder
   getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
@@ -402,6 +405,7 @@ class LegalizerHelper {
 
   LegalizeResult lowerISFPCLASS(MachineInstr &MI);
 
+  LegalizeResult lowerThreewayCompare(MachineInstr &MI);
   LegalizeResult lowerMinMax(MachineInstr &MI);
   LegalizeResult lowerFCopySign(MachineInstr &MI);
   LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index e74136f34b234..56a77b8596a18 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1273,6 +1273,34 @@ class MachineIRBuilder {
                                 const SrcOp &Op0, const SrcOp &Op1,
                                 std::optional<unsigned> Flags = std::nullopt);
 
+  /// Build and insert a \p Res = G_SCMP \p Op0, \p Op1
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+
+  /// \pre \p Res must be a generic virtual register with scalar or
+  ///      vector type. Typically this starts as s2 or <N x s2>.
+  /// \pre \p Op0 and Op1 must be generic virtual registers with the
+  ///      same number of elements as \p Res. If \p Res is a scalar,
+  ///      \p Op0 must be a scalar.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildSCmp(const DstOp &Res, const SrcOp &Op0,
+                                const SrcOp &Op1);
+
+  /// Build and insert a \p Res = G_UCMP \p Op0, \p Op1
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+
+  /// \pre \p Res must be a generic virtual register with scalar or
+  ///      vector type. Typically this starts as s2 or <N x s2>.
+  /// \pre \p Op0 and Op1 must be generic virtual registers with the
+  ///      same number of elements as \p Res. If \p Res is a scalar,
+  ///      \p Op0 must be a scalar.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildUCmp(const DstOp &Res, const SrcOp &Op0,
+                                const SrcOp &Op1);
+
   /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
   MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
                                      unsigned Mask) {
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index daceaf98583bd..5b657fb171296 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -217,9 +217,9 @@ enum NodeType {
   /// UNDEF - An undefined node.
   UNDEF,
 
-  // FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or
-  // is evaluated to UNDEF), or returns VAL otherwise. Note that each
-  // read of UNDEF can yield different value, but FREEZE(UNDEF) cannot.
+  /// FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or
+  /// is evaluated to UNDEF), or returns VAL otherwise. Note that each
+  /// read of UNDEF can yield different value, but FREEZE(UNDEF) cannot.
   FREEZE,
 
   /// EXTRACT_ELEMENT - This is used to get the lower or upper (determined by
@@ -300,7 +300,7 @@ enum NodeType {
   /// it to the add/sub hardware instruction, and then inverting the outgoing
   /// carry/borrow.
   ///
-  /// The use of these opcodes is preferable to adde/sube if the target supports
+  /// The use of these opcodes is preferable to ADDE/SUBE if the target supports
   /// it, as the carry is a regular value rather than a glue, which allows
   /// further optimisation.
   ///
@@ -490,7 +490,7 @@ enum NodeType {
   STRICT_FSETCC,
   STRICT_FSETCCS,
 
-  // FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
+  /// FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
   FPTRUNC_ROUND,
 
   /// FMA - Perform a * b + c with no intermediate rounding step.
@@ -684,10 +684,10 @@ enum NodeType {
   AVGCEILS,
   AVGCEILU,
 
-  // ABDS/ABDU - Absolute difference - Return the absolute difference between
-  // two numbers interpreted as signed/unsigned.
-  // i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1)
-  //  or trunc(abs(zext(Op0) - zext(Op1))) becomes abdu(Op0, Op1)
+  /// ABDS/ABDU - Absolute difference - Return the absolute difference between
+  /// two numbers interpreted as signed/unsigned.
+  /// i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1)
+  ///  or trunc(abs(zext(Op0) - zext(Op1))) becomes abdu(Op0, Op1)
   ABDS,
   ABDU,
 
@@ -728,8 +728,9 @@ enum NodeType {
   /// amount modulo the element size of the first operand.
   ///
   /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
-  /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
-  /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  ///
+  ///     fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  ///     fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
   SHL,
   SRA,
   SRL,
@@ -787,7 +788,8 @@ enum NodeType {
 
   /// SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded
   /// integer shift operations.  The operation ordering is:
-  ///       [Lo,Hi] = op [LoLHS,HiLHS], Amt
+  ///
+  ///     [Lo,Hi] = op [LoLHS,HiLHS], Amt
   SHL_PARTS,
   SRA_PARTS,
   SRL_PARTS,
@@ -998,7 +1000,7 @@ enum NodeType {
 
   /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
   /// values.
-  //
+  ///
   /// In the case where a single input is a NaN (either signaling or quiet),
   /// the non-NaN input is returned.
   ///
@@ -1196,11 +1198,11 @@ enum NodeType {
   VAEND,
   VASTART,
 
-  // PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE
-  // with the preallocated call Value.
+  /// PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE
+  /// with the preallocated call Value.
   PREALLOCATED_SETUP,
-  // PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE
-  // with the preallocated call Value, and a constant int.
+  /// PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE
+  /// with the preallocated call Value, and a constant int.
   PREALLOCATED_ARG,
 
   /// SRCVALUE - This is a node type that holds a Value* that is used to
@@ -1319,24 +1321,24 @@ enum NodeType {
   ATOMIC_LOAD_UINC_WRAP,
   ATOMIC_LOAD_UDEC_WRAP,
 
-  // Masked load and store - consecutive vector load and store operations
-  // with additional mask operand that prevents memory accesses to the
-  // masked-off lanes.
-  //
-  // Val, OutChain = MLOAD(BasePtr, Mask, PassThru)
-  // OutChain = MSTORE(Value, BasePtr, Mask)
+  /// Masked load and store - consecutive vector load and store operations
+  /// with additional mask operand that prevents memory accesses to the
+  /// masked-off lanes.
+  ///
+  ///     Val, OutChain = MLOAD(BasePtr, Mask, PassThru)
+  ///     OutChain = MSTORE(Value, BasePtr, Mask)
   MLOAD,
   MSTORE,
 
-  // Masked gather and scatter - load and store operations for a vector of
-  // random addresses with additional mask operand that prevents memory
-  // accesses to the masked-off lanes.
-  //
-  // Val, OutChain = GATHER(InChain, PassThru, Mask, BasePtr, Index, Scale)
-  // OutChain = SCATTER(InChain, Value, Mask, BasePtr, Index, Scale)
-  //
-  // The Index operand can have more vector elements than the other operands
-  // due to type legalization. The extra elements are ignored.
+  /// Masked gather and scatter - load and store operations for a vector of
+  /// random addresses with additional mask operand that prevents memory
+  /// accesses to the masked-off lanes.
+  ///
+  ///     Val, OutChain = GATHER(InChain, PassThru, Mask, BasePtr, Index, Scale)
+  ///     OutChain = SCATTER(InChain, Value, Mask, BasePtr, Index, Scale)
+  ///
+  /// The Index operand can have more vector elements than the other operands
+  /// due to type legalization. The extra elements are ignored.
   MGATHER,
   MSCATTER,
 
@@ -1385,9 +1387,11 @@ enum NodeType {
   /// pow-of-2 vectors, one valid legalizer expansion is to use a tree
   /// reduction, i.e.:
   /// For RES = VECREDUCE_FADD <8 x f16> SRC_VEC
-  ///   PART_RDX = FADD SRC_VEC[0:3], SRC_VEC[4:7]
-  ///   PART_RDX2 = FADD PART_RDX[0:1], PART_RDX[2:3]
-  ///   RES = FADD PART_RDX2[0], PART_RDX2[1]
+  ///
+  ///     PART_RDX = FADD SRC_VEC[0:3], SRC_VEC[4:7]
+  ///     PART_RDX2 = FADD PART_RDX[0:1], PART_RDX[2:3]
+  ///     RES = FADD PART_RDX2[0], PART_RDX2[1]
+  ///
   /// For non-pow-2 vectors, this can be computed by extracting each element
   /// and performing the operation as if it were scalarized.
   VECREDUCE_FADD,
diff --git a/llvm/include/llvm/CodeGen/MIRPrinter.h b/llvm/include/llvm/CodeGen/MIRPrinter.h
index d0a11e1c4a2fd..85bd674c56a60 100644
--- a/llvm/include/llvm/CodeGen/MIRPrinter.h
+++ b/llvm/include/llvm/CodeGen/MIRPrinter.h
@@ -21,6 +21,7 @@ namespace llvm {
 
 class MachineBasicBlock;
 class MachineFunction;
+class MachineModuleInfo;
 class Module;
 template <typename T> class SmallVectorImpl;
 
@@ -46,7 +47,8 @@ void printMIR(raw_ostream &OS, const Module &M);
 
 /// Print a machine function using the MIR serialization format to the given
 /// output stream.
-void printMIR(raw_ostream &OS, const MachineFunction &MF);
+void printMIR(raw_ostream &OS, const MachineModuleInfo &MMI,
+              const MachineFunction &MF);
 
 /// Determine a possible list of successors of a basic block based on the
 /// basic block machine operand being used inside the block. This should give
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 466fed7fb3a29..213b7ec6b3fbf 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -251,7 +251,7 @@ class MachineFrameInfo {
   /// targets, this value is only used when generating debug info (via
   /// TargetRegisterInfo::getFrameIndexReference); when generating code, the
   /// corresponding adjustments are performed directly.
-  int OffsetAdjustment = 0;
+  int64_t OffsetAdjustment = 0;
 
   /// The prolog/epilog code inserter may process objects that require greater
   /// alignment than the default alignment the target provides.
@@ -280,7 +280,7 @@ class MachineFrameInfo {
   /// setup/destroy pseudo instructions (as defined in the TargetFrameInfo
   /// class).  This information is important for frame pointer elimination.
   /// It is only valid during and after prolog/epilog code insertion.
-  unsigned MaxCallFrameSize = ~0u;
+  uint64_t MaxCallFrameSize = ~UINT64_C(0);
 
   /// The number of bytes of callee saved registers that the target wants to
   /// report for the current function in the CodeView S_FRAMEPROC record.
@@ -593,10 +593,10 @@ class MachineFrameInfo {
   uint64_t estimateStackSize(const MachineFunction &MF) const;
 
   /// Return the correction for frame offsets.
-  int getOffsetAdjustment() const { return OffsetAdjustment; }
+  int64_t getOffsetAdjustment() const { return OffsetAdjustment; }
 
   /// Set the correction for frame offsets.
-  void setOffsetAdjustment(int Adj) { OffsetAdjustment = Adj; }
+  void setOffsetAdjustment(int64_t Adj) { OffsetAdjustment = Adj; }
 
   /// Return the alignment in bytes that this function must be aligned to,
   /// which is greater than the default stack alignment provided by the target.
@@ -663,7 +663,7 @@ class MachineFrameInfo {
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
   /// then only during or after prolog/epilog code insertion.
   ///
-  unsigned getMaxCallFrameSize() const {
+  uint64_t getMaxCallFrameSize() const {
     // TODO: Enable this assert when targets are fixed.
     //assert(isMaxCallFrameSizeComputed() && "MaxCallFrameSize not computed yet");
     if (!isMaxCallFrameSizeComputed())
@@ -671,9 +671,9 @@ class MachineFrameInfo {
     return MaxCallFrameSize;
   }
   bool isMaxCallFrameSizeComputed() const {
-    return MaxCallFrameSize != ~0u;
+    return MaxCallFrameSize != ~UINT64_C(0);
   }
-  void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
+  void setMaxCallFrameSize(uint64_t S) { MaxCallFrameSize = S; }
 
   /// Returns how many bytes of callee-saved registers the target pushed in the
   /// prologue. Only used for debug info.
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6e7292abeddbb..e1d03fe6b92c1 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -59,7 +59,6 @@ class MachineConstantPool;
 class MachineFrameInfo;
 class MachineFunction;
 class MachineJumpTableInfo;
-class MachineModuleInfo;
 class MachineRegisterInfo;
 class MCContext;
 class MCInstrDesc;
@@ -260,7 +259,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   const LLVMTargetMachine &Target;
   const TargetSubtargetInfo *STI;
   MCContext &Ctx;
-  MachineModuleInfo &MMI;
 
   // RegInfo - Information about each register in use in the function.
   MachineRegisterInfo *RegInfo;
@@ -395,15 +393,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
 
   /// \}
 
-  /// Clear all the members of this MachineFunction, but the ones used
-  /// to initialize again the MachineFunction.
-  /// More specifically, this deallocates all the dynamically allocated
-  /// objects and get rid of all the XXXInfo data structure, but keep
-  /// unchanged the references to Fn, Target, MMI, and FunctionNumber.
+  /// Clear all the members of this MachineFunction, but the ones used to
+  /// initialize again the MachineFunction.  More specifically, this deallocates
+  /// all the dynamically allocated objects and get rids of all the XXXInfo data
+  /// structure, but keeps unchanged the references to Fn, Target, and
+  /// FunctionNumber.
   void clear();
   /// Allocate and initialize the different members.
   /// In particular, the XXXInfo data structure.
-  /// \pre Fn, Target, MMI, and FunctionNumber are properly set.
+  /// \pre Fn, Target, and FunctionNumber are properly set.
   void init();
 
 public:
@@ -632,8 +630,8 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   const static unsigned int DebugOperandMemNumber;
 
   MachineFunction(Function &F, const LLVMTargetMachine &Target,
-                  const TargetSubtargetInfo &STI, unsigned FunctionNum,
-                  MachineModuleInfo &MMI);
+                  const TargetSubtargetInfo &STI, MCContext &Ctx,
+                  unsigned FunctionNum);
   MachineFunction(const MachineFunction &) = delete;
   MachineFunction &operator=(const MachineFunction &) = delete;
   ~MachineFunction();
@@ -665,7 +663,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
 
   GISelChangeObserver *getObserver() const { return Observer; }
 
-  MachineModuleInfo &getMMI() const { return MMI; }
   MCContext &getContext() const { return Ctx; }
 
   /// Returns the Section this function belongs to.
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 229f5159fd6ef..04c8144f2fe7a 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1723,7 +1723,7 @@ class MachineInstr
   /// Return true if it is safe to move this instruction. If
   /// SawStore is set to true, it means that there is a store (or call) between
   /// the instruction's location and its intended destination.
-  bool isSafeToMove(AAResults *AA, bool &SawStore) const;
+  bool isSafeToMove(bool &SawStore) const;
 
   /// Returns true if this instruction's memory access aliases the memory
   /// access of Other.
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index 97b439c726b0a..310cc4b2abb77 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -99,27 +99,6 @@ class MachineModuleInfo {
   /// want.
   MachineModuleInfoImpl *ObjFileMMI;
 
-  /// \name Exception Handling
-  /// \{
-
-  /// The current call site index being processed, if any. 0 if none.
-  unsigned CurCallSite = 0;
-
-  /// \}
-
-  // TODO: Ideally, what we'd like is to have a switch that allows emitting
-  // synchronous (precise at call-sites only) CFA into .eh_frame. However,
-  // even under this switch, we'd like .debug_frame to be precise when using
-  // -g. At this moment, there's no way to specify that some CFI directives
-  // go into .eh_frame only, while others go into .debug_frame only.
-
-  /// True if debugging information is available in this module.
-  bool DbgInfoAvailable = false;
-
-  /// True if this module is being built for windows/msvc, and uses floating
-  /// point.  This is used to emit an undefined reference to _fltused.
-  bool UsesMSVCFloatingPoint = false;
-
   /// Maps IR Functions to their corresponding MachineFunctions.
   DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
   /// Next unique number available for a MachineFunction.
@@ -186,23 +165,6 @@ class MachineModuleInfo {
     return const_cast<MachineModuleInfo*>(this)->getObjFileInfo<Ty>();
   }
 
-  /// Returns true if valid debug info is present.
-  bool hasDebugInfo() const { return DbgInfoAvailable; }
-
-  bool usesMSVCFloatingPoint() const { return UsesMSVCFloatingPoint; }
-
-  void setUsesMSVCFloatingPoint(bool b) { UsesMSVCFloatingPoint = b; }
-
-  /// \name Exception Handling
-  /// \{
-
-  /// Set the call site currently being processed.
-  void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
-
-  /// Get the call site currently being processed, if any.  return zero if
-  /// none.
-  unsigned getCurrentCallSite() { return CurCallSite; }
-
   /// \}
 }; // End class MachineModuleInfo
 
diff --git a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
index fc7635edd82c7..3bf0fde1e26ef 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
@@ -33,7 +33,8 @@ class MachineModuleSlotTracker : public ModuleSlotTracker {
                               bool ShouldInitializeAllMetadata);
 
 public:
-  MachineModuleSlotTracker(const MachineFunction *MF,
+  MachineModuleSlotTracker(const MachineModuleInfo &MMI,
+                           const MachineFunction *MF,
                            bool ShouldInitializeAllMetadata = true);
   ~MachineModuleSlotTracker();
 
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 21444cd8649c4..9848bf5d188be 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -205,20 +205,20 @@ namespace llvm {
   /// possible. It is best suited for debug code where live ranges are short.
   ///
   FunctionPass *createFastRegisterAllocator();
-  FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F,
+  FunctionPass *createFastRegisterAllocator(RegAllocFilterFunc F,
                                             bool ClearVirtRegs);
 
   /// BasicRegisterAllocation Pass - This pass implements a degenerate global
   /// register allocator using the basic regalloc framework.
   ///
   FunctionPass *createBasicRegisterAllocator();
-  FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F);
+  FunctionPass *createBasicRegisterAllocator(RegAllocFilterFunc F);
 
   /// Greedy register allocation pass - This pass implements a global register
   /// allocator for optimized builds.
   ///
   FunctionPass *createGreedyRegisterAllocator();
-  FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F);
+  FunctionPass *createGreedyRegisterAllocator(RegAllocFilterFunc F);
 
   /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
   /// Quadratic Prograaming (PBQP) based register allocator.
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index ad533eab1861c..6c5cb295f5290 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -9,18 +9,20 @@
 #ifndef LLVM_CODEGEN_REGALLOCCOMMON_H
 #define LLVM_CODEGEN_REGALLOCCOMMON_H
 
+#include "llvm/CodeGen/Register.h"
 #include <functional>
 
 namespace llvm {
 
 class TargetRegisterClass;
 class TargetRegisterInfo;
+class MachineRegisterInfo;
 
 /// Filter function for register classes during regalloc. Default register class
 /// filter is nullptr, where all registers should be allocated.
 typedef std::function<bool(const TargetRegisterInfo &TRI,
-                           const TargetRegisterClass &RC)>
-    RegClassFilterFunc;
+                           const MachineRegisterInfo &MRI, const Register Reg)>
+    RegAllocFilterFunc;
 }
 
 #endif // LLVM_CODEGEN_REGALLOCCOMMON_H
diff --git a/llvm/include/llvm/CodeGen/RegAllocFast.h b/llvm/include/llvm/CodeGen/RegAllocFast.h
index c62bd14d0b4cb..c99c715daacf9 100644
--- a/llvm/include/llvm/CodeGen/RegAllocFast.h
+++ b/llvm/include/llvm/CodeGen/RegAllocFast.h
@@ -15,7 +15,7 @@
 namespace llvm {
 
 struct RegAllocFastPassOptions {
-  RegClassFilterFunc Filter = nullptr;
+  RegAllocFilterFunc Filter = nullptr;
   StringRef FilterName = "all";
   bool ClearVRegs = true;
 };
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index ce63dcc405fd5..7a13164589392 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_CODEGEN_RUNTIMELIBCALLS_H
 #define LLVM_CODEGEN_RUNTIMELIBCALLS_H
 
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/Support/AtomicOrdering.h"
@@ -90,6 +91,9 @@ Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 /// UNKNOW_LIBCALL if there is none.
 Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 
+/// Initialize the default condition code on the libcalls.
+void initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs);
+
 } // namespace RTLIB
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index a905c85f56b66..ad1c5415055c2 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -490,6 +490,18 @@ m_c_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) {
                                                          CC);
 }
 
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_Select(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+  return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::SELECT, Cond, T, F);
+}
+
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+  return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::VSELECT, Cond, T, F);
+}
+
 // === Binary operations ===
 template <typename LHS_P, typename RHS_P, bool Commutable = false,
           bool ExcludeChain = false>
@@ -722,6 +734,10 @@ inline Or<UnaryOpc_match<Opnd>, Opnd> m_TruncOrSelf(const Opnd &Op) {
   return Or<UnaryOpc_match<Opnd>, Opnd>(m_Trunc(Op), Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_VScale(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::VSCALE, Op);
+}
+
 // === Constants ===
 struct ConstantInt_match {
   APInt *BindVal;
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 16ec65f2e7daa..6a80c8c7216f6 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -76,6 +76,7 @@ struct KnownBits;
 class LLVMContext;
 class MachineBasicBlock;
 class MachineConstantPoolValue;
+class MachineModuleInfo;
 class MCSymbol;
 class OptimizationRemarkEmitter;
 class ProfileSummaryInfo;
@@ -245,6 +246,7 @@ class SelectionDAG {
 
   ProfileSummaryInfo *PSI = nullptr;
   BlockFrequencyInfo *BFI = nullptr;
+  MachineModuleInfo *MMI = nullptr;
 
   /// List of non-single value types.
   FoldingSet<SDVTListNode> VTListMap;
@@ -459,14 +461,15 @@ class SelectionDAG {
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
             Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
             UniformityInfo *UA, ProfileSummaryInfo *PSIin,
-            BlockFrequencyInfo *BFIin, FunctionVarLocs const *FnVarLocs);
+            BlockFrequencyInfo *BFIin, MachineModuleInfo &MMI,
+            FunctionVarLocs const *FnVarLocs);
 
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
             MachineFunctionAnalysisManager &AM,
             const TargetLibraryInfo *LibraryInfo, UniformityInfo *UA,
             ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin,
-            FunctionVarLocs const *FnVarLocs) {
-    init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, FnVarLocs);
+            MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs) {
+    init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs);
     MFAM = &AM;
   }
 
@@ -500,6 +503,7 @@ class SelectionDAG {
   OptimizationRemarkEmitter &getORE() const { return *ORE; }
   ProfileSummaryInfo *getPSI() const { return PSI; }
   BlockFrequencyInfo *getBFI() const { return BFI; }
+  MachineModuleInfo *getMMI() const { return MMI; }
 
   FlagInserter *getFlagInserter() { return Inserter; }
   void setFlagInserter(FlagInserter *FI) { Inserter = FI; }
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index aa0efa5d9bf5d..fc0590b1a1b69 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -48,6 +48,7 @@ class SelectionDAGISel {
   std::unique_ptr<FunctionLoweringInfo> FuncInfo;
   SwiftErrorValueTracking *SwiftError;
   MachineFunction *MF;
+  MachineModuleInfo *MMI;
   MachineRegisterInfo *RegInfo;
   SelectionDAG *CurDAG;
   std::unique_ptr<SelectionDAGBuilder> SDB;
diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
index 8fdce301c0ccb..8b1f67c416c22 100644
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -40,7 +40,6 @@ class TailDuplicator {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const MachineBranchProbabilityInfo *MBPI;
-  const MachineModuleInfo *MMI;
   MachineRegisterInfo *MRI;
   MachineFunction *MF;
   MBFIWrapper *MBFI;
diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
index 0ff4e1fe657f4..cb0055633f4f3 100644
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -45,9 +45,9 @@ namespace ISD {
     unsigned IsHva : 1;        ///< HVA field for
     unsigned IsHvaStart : 1;   ///< HVA structure start
     unsigned IsSecArgPass : 1; ///< Second argument
-    unsigned MemAlign : 4;     ///< Log 2 of alignment when arg is passed in memory
-                               ///< (including byval/byref). The max alignment is
-                               ///< verified in IR verification.
+    unsigned MemAlign : 6; ///< Log 2 of alignment when arg is passed in memory
+                           ///< (including byval/byref). The max alignment is
+                           ///< verified in IR verification.
     unsigned OrigAlign : 5;    ///< Log 2 of original alignment
     unsigned IsInConsecutiveRegsLast : 1;
     unsigned IsInConsecutiveRegs : 1;
@@ -67,7 +67,7 @@ namespace ISD {
           IsSecArgPass(0), MemAlign(0), OrigAlign(0),
           IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0),
           IsCopyElisionCandidate(0), IsPointer(0) {
-      static_assert(sizeof(*this) == 3 * sizeof(unsigned), "flags are too big");
+      static_assert(sizeof(*this) == 4 * sizeof(unsigned), "flags are too big");
     }
 
     bool isZExt() const { return IsZExt; }
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 0b9cacecc7cbe..0656c0d739fdf 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -51,7 +51,7 @@ class TargetFrameLowering {
   // Maps a callee saved register to a stack slot with a fixed offset.
   struct SpillSlot {
     unsigned Reg;
-    int Offset; // Offset relative to stack pointer on function entry.
+    int64_t Offset; // Offset relative to stack pointer on function entry.
   };
 
   struct DwarfFrameBase {
@@ -66,7 +66,7 @@ class TargetFrameLowering {
       // Used with FrameBaseKind::Register.
       unsigned Reg;
       // Used with FrameBaseKind::CFA.
-      int Offset;
+      int64_t Offset;
       struct WasmFrameBase WasmLoc;
     } Location;
   };
@@ -343,6 +343,13 @@ class TargetFrameLowering {
     return getFrameIndexReference(MF, FI, FrameReg);
   }
 
+  /// getFrameIndexReferenceFromSP - This method returns the offset from the
+  /// stack pointer to the slot of the specified index. This function serves to
+  /// provide a comparable offset from a single reference point (the value of
+  /// the stack-pointer at function entry) that can be used for analysis.
+  virtual StackOffset getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                   int FI) const;
+
   /// Returns the callee-saved registers as computed by determineCalleeSaves
   /// in the BitVector \p SavedRegs.
   virtual void getCalleeSaves(const MachineFunction &MF,
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 5c7f6ddc94840..882cadea22369 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -46,6 +46,7 @@ class LiveIntervals;
 class LiveVariables;
 class MachineLoop;
 class MachineMemOperand;
+class MachineModuleInfo;
 class MachineRegisterInfo;
 class MCAsmInfo;
 class MCInst;
@@ -2089,6 +2090,7 @@ class TargetInstrInfo : public MCInstrInfo {
   /// information for a set of outlining candidates. Returns std::nullopt if the
   /// candidates are not suitable for outlining.
   virtual std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
     llvm_unreachable(
         "Target didn't implement TargetInstrInfo::getOutliningCandidateInfo!");
@@ -2103,7 +2105,8 @@ class TargetInstrInfo : public MCInstrInfo {
 protected:
   /// Target-dependent implementation for getOutliningTypeImpl.
   virtual outliner::InstrType
-  getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+  getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                       MachineBasicBlock::iterator &MIT, unsigned Flags) const {
     llvm_unreachable(
         "Target didn't implement TargetInstrInfo::getOutliningTypeImpl!");
   }
@@ -2111,8 +2114,9 @@ class TargetInstrInfo : public MCInstrInfo {
 public:
   /// Returns how or if \p MIT should be outlined. \p Flags is the
   /// target-specific information returned by isMBBSafeToOutlineFrom.
-  outliner::InstrType
-  getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const;
+  outliner::InstrType getOutliningType(const MachineModuleInfo &MMI,
+                                       MachineBasicBlock::iterator &MIT,
+                                       unsigned Flags) const;
 
   /// Optional target hook that returns true if \p MBB is safe to outline from,
   /// and returns any target-specific information in \p Flags.
@@ -2226,6 +2230,12 @@ class TargetInstrInfo : public MCInstrInfo {
     return OptLevel >= CodeGenOptLevel::Aggressive ? 4 : 2;
   }
 
+  /// Returns the target-specific default value for tail merging.
+  /// This value will be used if the tail-merge-size argument is not provided.
+  virtual unsigned getTailMergeSize(const MachineFunction &MF) const {
+    return 3;
+  }
+
   /// Returns the callee operand from the given \p MI.
   virtual const MachineOperand &getCalleeOperand(const MachineInstr &MI) const {
     return MI.getOperand(0);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d4a2166bf768e..9ccdbab008aec 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1640,7 +1640,8 @@ class TargetLoweringBase {
     MVT NVT = VT;
     do {
       NVT = (MVT::SimpleValueType)(NVT.SimpleTy+1);
-      assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid &&
+      assert(NVT.isInteger() == VT.isInteger() &&
+             NVT.isFloatingPoint() == VT.isFloatingPoint() &&
              "Didn't find type to promote to!");
     } while (VTBits >= NVT.getScalarSizeInBits() || !isTypeLegal(NVT) ||
              getOperationAction(Op, NVT) == Promote);
@@ -3431,16 +3432,20 @@ class TargetLoweringBase {
 
   /// Override the default CondCode to be used to test the result of the
   /// comparison libcall against zero.
+  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
   void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
-    Libcalls.setCmpLibcallCC(Call, CC);
+    CmpLibcallCCs[Call] = CC;
   }
 
+
   /// Get the CondCode that's to be used to test the result of the comparison
   /// libcall against zero.
+  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
   ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
-    return Libcalls.getCmpLibcallCC(Call);
+    return CmpLibcallCCs[Call];
   }
 
+
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
     Libcalls.setLibcallCallingConv(Call, CC);
@@ -3630,6 +3635,10 @@ class TargetLoweringBase {
   /// The list of libcalls that the target will use.
   RTLIB::RuntimeLibcallsInfo Libcalls;
 
+  /// The ISD::CondCode that should be used to test the result of each of the
+  /// comparison libcall against zero.
+  ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
+
   /// The bits of IndexedModeActions used to store the legalisation actions
   /// We store the data as   | ML | MS |  L |  S | each taking 4 bits.
   enum IndexedModeActionsBits {
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 963b6a71de380..0883f597dabc1 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -289,34 +289,34 @@ def aarch64svcount
 def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type
 
 let isNormalValueType = false in {
-def token      : ValueType<0, 248>;  // TokenTy
-def MetadataVT : ValueType<0, 249> { // Metadata
+def token      : ValueType<0, 504>;  // TokenTy
+def MetadataVT : ValueType<0, 505> { // Metadata
   let LLVMName = "Metadata";
 }
 
 // Pseudo valuetype mapped to the current pointer size to any address space.
 // Should only be used in TableGen.
-def iPTRAny    : VTAny<250>;
+def iPTRAny    : VTAny<506>;
 
 // Pseudo valuetype to represent "vector of any size"
 // Should only be used in TableGen.
-def vAny       : VTAny<251>;
+def vAny       : VTAny<507>;
 
 // Pseudo valuetype to represent "float of any format"
 // Should only be used in TableGen.
-def fAny       : VTAny<252>;
+def fAny       : VTAny<508>;
 
 // Pseudo valuetype to represent "integer of any bit width"
 // Should only be used in TableGen.
-def iAny       : VTAny<253>;
+def iAny       : VTAny<509>;
 
 // Pseudo valuetype mapped to the current pointer size.
 // Should only be used in TableGen.
-def iPTR       : ValueType<0, 254>;
+def iPTR       : ValueType<0, 510>;
 
 // Pseudo valuetype to represent "any type of any size".
 // Should only be used in TableGen.
-def Any        : VTAny<255>;
+def Any        : VTAny<511>;
 
 } // isNormalValueType = false
 
diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
index 7e7608c34fb70..556531c3147e5 100644
--- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h
+++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
@@ -33,7 +33,7 @@ namespace llvm {
   /// type can be represented by an MVT.
   class MVT {
   public:
-    enum SimpleValueType : uint8_t {
+    enum SimpleValueType : uint16_t {
       // Simple value types that aren't explicitly part of this enumeration
       // are considered extended value types.
       INVALID_SIMPLE_VALUE_TYPE = 0,
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
index 497d3bee048ab..421b84d644db6 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -181,6 +181,8 @@ class DWARFDie {
   DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const;
   DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const;
 
+  DWARFDie resolveTypeUnitReference() const;
+
   /// Extract the range base attribute from this DIE as absolute section offset.
   ///
   /// This is a utility function that checks for either the DW_AT_rnglists_base
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index ed30a792e9e9c..f997faf1ebcb0 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -328,6 +328,7 @@ class StaticLibraryDefinitionGenerator : public DefinitionGenerator {
   std::unique_ptr<MemoryBuffer> ArchiveBuffer;
   std::unique_ptr<object::Archive> Archive;
   DenseMap<SymbolStringPtr, MemoryBufferRef> ObjectFilesMap;
+  BumpPtrAllocator ObjFileNameStorage;
 };
 
 /// A utility class to create COFF dllimport GOT symbols (__imp_*) and PLT
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
index b8b5f90b6b0fb..fa522ff88e1db 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
@@ -184,7 +184,7 @@ template <typename T> class AllocGroupSmallMap {
   iterator end() { return Elems.end(); }
   iterator find(AllocGroup G) {
     auto I = lower_bound(Elems, G, compareKey);
-    return (I->first == G) ? I : end();
+    return (I == end() || I->first == G) ? I : end();
   }
 
   bool empty() const { return Elems.empty(); }
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLResource.h b/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
index 4ed742b4129a2..989893bcaccec 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_FRONTEND_HLSL_HLSLRESOURCE_H
 #define LLVM_FRONTEND_HLSL_HLSLRESOURCE_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DXILABI.h"
 
 namespace llvm {
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h
index f6ee963bd8855..b3a02cd531217 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h
@@ -81,6 +81,10 @@ template <typename ClauseType> struct ConstructCompositionT {
   std::unordered_map<llvm::omp::Clause, ClauseSet> clauseSets;
 };
 
+template <typename ClauseTy>
+ConstructCompositionT(uint32_t, llvm::ArrayRef<DirectiveWithClauses<ClauseTy>>)
+    -> ConstructCompositionT<ClauseTy>;
+
 template <typename C>
 ConstructCompositionT<C>::ConstructCompositionT(
     uint32_t version, llvm::ArrayRef<DirectiveWithClauses<C>> leafs)
@@ -129,7 +133,7 @@ ConstructCompositionT<C>::ConstructCompositionT(
   mergeReduction();
   mergeDSA();
 
-  // Fir the rest of the clauses, just copy them.
+  // For the rest of the clauses, just copy them.
   for (auto &[id, clauses] : clauseSets) {
     // Skip clauses we've already dealt with.
     switch (id) {
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 3feb4bd11c998..b93bc594a82bf 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -236,6 +236,9 @@ struct ConstructDecompositionT {
                    const ClauseTy *);
   bool applyClause(const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
+  bool
+  applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
+              const ClauseTy *);
 
   uint32_t version;
   llvm::omp::Directive construct;
@@ -1101,9 +1104,21 @@ bool ConstructDecompositionT<C, H>::applyClause(
   return applyToOutermost(node);
 }
 
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  return applyToAll(node);
+}
+
 template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   bool success = true;
 
+  auto isImplicit = [this](const ClauseTy *node) {
+    return llvm::any_of(
+        implicit, [node](const ClauseTy &clause) { return &clause == node; });
+  };
+
   for (llvm::omp::Directive leaf :
        llvm::omp::getLeafConstructsOrSelf(construct))
     leafs.push_back(LeafReprInternal{leaf, /*clauses=*/{}});
@@ -1143,9 +1158,10 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   for (const ClauseTy *node : nodes) {
     if (skip(node))
       continue;
-    success =
-        success &&
+    bool result =
         std::visit([&](auto &&s) { return applyClause(s, node); }, node->u);
+    if (!isImplicit(node))
+      success = success && result;
   }
 
   // Apply "allocate".
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h
index 6f7a39acac1d3..54ae672755ba8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace llvm::omp {
 ArrayRef<Directive> getLeafConstructs(Directive D);
@@ -30,6 +31,14 @@ Directive getCompoundConstruct(ArrayRef<Directive> Parts);
 bool isLeafConstruct(Directive D);
 bool isCompositeConstruct(Directive D);
 bool isCombinedConstruct(Directive D);
+
+/// Create a nicer version of a function name for humans to look at.
+std::string prettifyFunctionName(StringRef FunctionName);
+
+/// Deconstruct an OpenMP kernel name into the parent function name and the line
+/// number.
+std::string deconstructOpenMPKernelName(StringRef KernelName, unsigned &LineNo);
+
 } // namespace llvm::omp
 
 #endif // LLVM_FRONTEND_OPENMP_OMP_H
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 005c678302b27..99cef340f40df 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -735,6 +735,10 @@ def OMP_For : Directive<"for"> {
   let association = AS_Loop;
   let category = CA_Executable;
 }
+def OMP_Interchange : Directive<"interchange"> {
+  let association = AS_Loop;
+  let category = CA_Executable;
+}
 def OMP_interop : Directive<"interop"> {
   let allowedClauses = [
     VersionedClause<OMPC_Depend>,
@@ -837,6 +841,10 @@ def OMP_Requires : Directive<"requires"> {
   let association = AS_None;
   let category = CA_Informational;
 }
+def OMP_Reverse : Directive<"reverse"> {
+  let association = AS_Loop;
+  let category = CA_Executable;
+}
 def OMP_Scan : Directive<"scan"> {
   let allowedClauses = [
     VersionedClause<OMPC_Exclusive, 50>,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 02c0265354ccd..9cb311834907b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -196,6 +196,9 @@ class OpenMPIRBuilderConfig {
 /// Data structure to contain the information needed to uniquely identify
 /// a target entry.
 struct TargetRegionEntryInfo {
+  /// The prefix used for kernel names.
+  static constexpr const char *KernelNamePrefix = "__omp_offloading_";
+
   std::string ParentName;
   unsigned DeviceID;
   unsigned FileID;
@@ -2231,6 +2234,8 @@ class OpenMPIRBuilder {
     /// The total number of pointers passed to the runtime library.
     unsigned NumberOfPtrs = 0u;
 
+    bool EmitDebug = false;
+
     explicit TargetDataInfo() {}
     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                             bool SeparateBeginEndCalls)
@@ -2322,6 +2327,26 @@ class OpenMPIRBuilder {
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
 
+  /// Generate a target-task for the target construct
+  ///
+  /// \param OutlinedFn The outlined device/target kernel function.
+  /// \param OutlinedFnID The ooulined function ID.
+  /// \param EmitTargetCallFallbackCB Call back function to generate host
+  ///        fallback code.
+  /// \param Args Data structure holding information about the kernel arguments.
+  /// \param DeviceID Identifier for the device via the 'device' clause.
+  /// \param RTLoc Source location identifier
+  /// \param AllocaIP The insertion point to be used for alloca instructions.
+  /// \param Dependencies Vector of DependData objects holding information of
+  ///        dependencies as specified by the 'depend' clause.
+  /// \param HasNoWait True if the target construct had 'nowait' on it, false
+  ///        otherwise
+  InsertPointTy emitTargetTask(
+      Function *OutlinedFn, Value *OutlinedFnID,
+      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
+      SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
+
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
@@ -2329,7 +2354,6 @@ class OpenMPIRBuilder {
   void emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                     OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
                                     OpenMPIRBuilder::TargetDataInfo &Info,
-                                    bool EmitDebug = false,
                                     bool ForEndCall = false);
 
   /// Emit an array of struct descriptors to be assigned to the offload args.
@@ -2340,13 +2364,28 @@ class OpenMPIRBuilder {
 
   /// Emit the arrays used to pass the captures and map information to the
   /// offloading runtime library. If there is no map or capture information,
-  /// return nullptr by reference.
+  /// return nullptr by reference. Accepts a reference to a MapInfosTy object
+  /// that contains information generated for mappable clauses,
+  /// including base pointers, pointers, sizes, map types, user-defined mappers.
   void emitOffloadingArrays(
       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
       TargetDataInfo &Info, bool IsNonContiguous = false,
       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
 
+  /// Allocates memory for and populates the arrays required for offloading
+  /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
+  /// emits their base addresses as arguments to be passed to the runtime
+  /// library. In essence, this function is a combination of
+  /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
+  /// be preferred by clients of OpenMPIRBuilder.
+  void emitOffloadingArraysAndArgs(
+      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
+      TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
+      bool IsNonContiguous = false, bool ForEndCall = false,
+      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
+      function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
+
   /// Creates offloading entry for the provided entry ID \a ID, address \a
   /// Addr, size \a Size, and flags \a Flags.
   void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
@@ -2805,6 +2844,8 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param ArgAccessorFuncCB Callback that will generate accessors
   /// instructions for passed in target arguments where neccessary
+  /// \param Dependencies A vector of DependData objects that carry
+  // dependency information as passed in the depend clause
   InsertPointTy createTarget(const LocationDescription &Loc,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              OpenMPIRBuilder::InsertPointTy CodeGenIP,
@@ -2813,7 +2854,8 @@ class OpenMPIRBuilder {
                              SmallVectorImpl<Value *> &Inputs,
                              GenMapInfoCallbackTy GenMapInfoCB,
                              TargetBodyGenCallbackTy BodyGenCB,
-                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
+                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+                             SmallVector<DependData> Dependencies = {});
 
   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 0457f0c388d26..e1bd193891c1e 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -112,6 +112,9 @@ def ElementType : TypeAttr<"elementtype", [ParamAttr]>;
 /// symbol.
 def FnRetThunkExtern : EnumAttr<"fn_ret_thunk_extern", [FnAttr]>;
 
+/// Function has a hybrid patchable thunk.
+def HybridPatchable : EnumAttr<"hybrid_patchable", [FnAttr]>;
+
 /// Pass structure in an alloca.
 def InAlloca : TypeAttr<"inalloca", [ParamAttr]>;
 
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 12571d957da60..c7913e60cea08 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -67,6 +67,11 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   bool IsNewDbgInfoFormat;
 
 private:
+  // Allow Function to renumber blocks.
+  friend class Function;
+  /// Per-function unique number.
+  unsigned Number = -1u;
+
   friend class BlockAddress;
   friend class SymbolTableListTraits<BasicBlock>;
 
@@ -96,6 +101,11 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   void setIsNewDbgInfoFormat(bool NewFlag);
   void setNewDbgInfoFormatFlag(bool NewFlag);
 
+  unsigned getNumber() const {
+    assert(getParent() && "only basic blocks in functions have valid numbers");
+    return Number;
+  }
+
   /// Record that the collection of DbgRecords in \p M "trails" after the last
   /// instruction of this block. These are equivalent to dbg.value intrinsics
   /// that exist at the end of a basic block with no terminator (a transient
diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h
index 86d0a6b35d748..d086c25390fd2 100644
--- a/llvm/include/llvm/IR/ConstantRange.h
+++ b/llvm/include/llvm/IR/ConstantRange.h
@@ -501,6 +501,14 @@ class [[nodiscard]] ConstantRange {
   /// TODO: This isn't fully implemented yet.
   ConstantRange shl(const ConstantRange &Other) const;
 
+  /// Return a new range representing the possible values resulting
+  /// from a left shift with wrap type \p NoWrapKind of a value in this
+  /// range and a value in \p Other.
+  /// If the result range is disjoint, the preferred range is determined by the
+  /// \p PreferredRangeType.
+  ConstantRange shlWithNoWrap(const ConstantRange &Other, unsigned NoWrapKind,
+                              PreferredRangeType RangeType = Smallest) const;
+
   /// Return a new range representing the possible values resulting from a
   /// logical right shift of a value in this range and a value in \p Other.
   ConstantRange lshr(const ConstantRange &Other) const;
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index d14adfe1590be..5f7034b5ee36f 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -693,7 +693,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const {
   case Type::FloatTyID:
     return TypeSize::getFixed(32);
   case Type::DoubleTyID:
-  case Type::X86_MMXTyID:
     return TypeSize::getFixed(64);
   case Type::PPC_FP128TyID:
   case Type::FP128TyID:
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index fd7a6aa46eea0..4abf978687d9d 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -75,6 +75,13 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
 private:
   // Important things that make up a function!
   BasicBlockListType BasicBlocks;         ///< The basic blocks
+
+  // Basic blocks need to get their number when added to a function.
+  friend void BasicBlock::setParent(Function *);
+  unsigned NextBlockNum = 0;
+  /// Epoch of block numbers. (Could be shrinked to uint8_t if required.)
+  unsigned BlockNumEpoch = 0;
+
   mutable Argument *Arguments = nullptr;  ///< The formal arguments
   size_t NumArgs;
   std::unique_ptr<ValueSymbolTable>
@@ -810,6 +817,34 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
     return SymTab.get();
   }
 
+  //===--------------------------------------------------------------------===//
+  // Block number functions
+
+  /// Return a value larger than the largest block number. Intended to allocate
+  /// a vector that is sufficiently large to hold all blocks indexed by their
+  /// number.
+  unsigned getMaxBlockNumber() const { return NextBlockNum; }
+
+  /// Renumber basic blocks into a dense value range starting from 0. Be aware
+  /// that other data structures and analyses (e.g., DominatorTree) may depend
+  /// on the value numbers and need to be updated or invalidated.
+  void renumberBlocks();
+
+  /// Return the "epoch" of current block numbers. This will return a different
+  /// value after every renumbering. The intention is: if something (e.g., an
+  /// analysis) uses block numbers, it also stores the number epoch and then
+  /// can assert later on that the epoch didn't change (indicating that the
+  /// numbering is still valid). If the epoch changed, blocks might have been
+  /// assigned new numbers and previous uses of the numbers needs to be
+  /// invalidated. This is solely intended as a debugging feature.
+  unsigned getBlockNumberEpoch() const { return BlockNumEpoch; }
+
+private:
+  /// Assert that all blocks have unique numbers within 0..NextBlockNum. This
+  /// has O(n) runtime complexity.
+  void validateBlockNumbers() const;
+
+public:
   //===--------------------------------------------------------------------===//
   // BasicBlock iterator forwarding functions
   //
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 494fd4bf2633d..a0516de4652ea 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -570,6 +570,10 @@ class VPIntrinsic : public IntrinsicInst {
   /// The llvm.vp.* intrinsics for this instruction Opcode
   static Intrinsic::ID getForOpcode(unsigned OC);
 
+  /// The llvm.vp.* intrinsics for this intrinsic ID \p Id. Return \p Id if it
+  /// is already a VP intrinsic.
+  static Intrinsic::ID getForIntrinsic(Intrinsic::ID Id);
+
   // Whether \p ID is a VP intrinsic ID.
   static bool isVPIntrinsic(Intrinsic::ID);
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e0a8c630a08b2..7445c572b97e9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -769,9 +769,10 @@ class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod,
 
 class AMDGPUDimSampleProfile<string opmod,
                              AMDGPUDimProps dim,
-                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
+                             AMDGPUSampleVariant sample,
+                             bit has_return = true> : AMDGPUDimProfile<opmod, dim> {
   let IsSample = true;
-  let RetTypes = [llvm_any_ty];
+  let RetTypes = !if(has_return, [llvm_any_ty], []);
   let ExtraAddrArgs = sample.ExtraAddrArgs;
   let Offset = sample.Offset;
   let Bias = sample.Bias;
@@ -780,6 +781,12 @@ class AMDGPUDimSampleProfile<string opmod,
   let LodClampMip = sample.LodOrClamp;
 }
 
+class AMDGPUDimSampleNoReturnProfile<string opmod,
+                             AMDGPUDimProps dim,
+                             AMDGPUSampleVariant sample>
+    : AMDGPUDimSampleProfile<opmod, dim, sample, false> {
+}
+
 class AMDGPUDimNoSampleProfile<string opmod,
                                AMDGPUDimProps dim,
                                list<LLVMType> retty,
@@ -970,6 +977,21 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
         AMDGPUImageDMaskIntrinsic;
   }
 
+  multiclass AMDGPUImageDimSampleNoReturnDims<string opmod,
+                                      AMDGPUSampleVariant sample> {
+    foreach dim = AMDGPUDims.NoMsaa in {
+      def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic<
+          AMDGPUDimSampleNoReturnProfile<opmod, dim, sample>,
+          [IntrWillReturn], [SDNPMemOperand]>;
+    }
+  }
+  foreach sample = AMDGPUSampleVariants in {
+    defm int_amdgcn_image_sample # sample.LowerCaseMod
+      : AMDGPUImageDimSampleNoReturnDims<
+        "SAMPLE" # sample.UpperCaseMod # "_nortn", sample>,
+        AMDGPUImageDMaskIntrinsic;
+  }
+
   defm int_amdgcn_image_getlod
     : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
       AMDGPUImageDMaskIntrinsic;
@@ -1116,6 +1138,19 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
 
+class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
+                      //                                       bit 1 = slc,
+                      //                                       bit 2 = dlc on gfx10+),
+                      //                      swizzled buffer (bit 3 = swz))
+  [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
+
 class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
@@ -1134,6 +1169,19 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
 def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
 
+class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [AMDGPUBufferRsrcTy,// rsrc(SGPR)
+   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
+                      //                                       bit 1 = slc,
+                      //                                       bit 2 = dlc on gfx10+),
+                      //                      swizzled buffer (bit 3 = swz))
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad;
+
 class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [llvm_v4i32_ty,    // rsrc(SGPR)
@@ -1152,6 +1200,23 @@ class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
 def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
 
+class AMDGPUStructAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [llvm_v4i32_ty,    // rsrc(SGPR)
+   llvm_i32_ty,      // vindex(VGPR)
+   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
+                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+                     //                bit 3 = swz, bit 4 = scc (gfx90a)
+                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
+                     //                bit 6 = swz
+                     //           all: volatile op (bit 31, stripped at lowering)
+  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_atomic_buffer_load : AMDGPUStructAtomicBufferLoad;
+
 class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
@@ -1171,6 +1236,24 @@ class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIn
 def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
 def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;
 
+class AMDGPUStructPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
+   llvm_i32_ty,           // vindex(VGPR)
+   llvm_i32_ty,           // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,           // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],          // auxiliary/cachepolicy(imm):
+                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+                          //                bit 3 = swz, bit 4 = scc (gfx90a)
+                          //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
+                          //                bit 6 = swz
+                          //           all: volatile op (bit 31, stripped at lowering)
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>,
+   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_ptr_atomic_buffer_load : AMDGPUStructPtrAtomicBufferLoad;
+
 class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [],
   [data_ty,          // vdata(VGPR)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 15bf5da740abc..d02f6e1b9505c 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1631,30 +1631,30 @@ def int_nvvm_mbarrier_pending_count :
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.p">;
 
 // Use for generic pointers
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 2da154c300344..57702aa50774b 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1710,20 +1710,6 @@ let TargetPrefix = "riscv" in {
     defm vsuxseg # nf : RISCVISegStore<nf>;
   }
 
-  // Strided loads/stores for fixed vectors.
-  def int_riscv_masked_strided_load
-        : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                [LLVMMatchType<0>, llvm_anyptr_ty,
-                                 llvm_anyint_ty,
-                                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                                [NoCapture<ArgIndex<1>>, IntrReadMem]>;
-  def int_riscv_masked_strided_store
-        : DefaultAttrsIntrinsic<[],
-                                [llvm_anyvector_ty, llvm_anyptr_ty,
-                                 llvm_anyint_ty,
-                                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                                [NoCapture<ArgIndex<1>>, IntrWriteMem]>;
-
   // Segment loads/stores for fixed vectors.
   foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
     def int_riscv_seg # nf # _load
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 683acf4a6ffa9..ef6ddf12c32f6 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -60,6 +60,7 @@ let TargetPrefix = "spv" in {
       Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
   def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
   def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
+  def int_spv_frac : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>;
   def int_spv_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], 
     [IntrNoMem, IntrWillReturn] >;
   def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 47aab196a6d4f..4d2df1c44ebce 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -363,6 +363,10 @@ def int_wasm_extract_lane_f16x8:
   DefaultAttrsIntrinsic<[llvm_float_ty],
                         [llvm_v8f16_ty, llvm_i32_ty],
                         [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_replace_lane_f16x8:
+  DefaultAttrsIntrinsic<[llvm_v8f16_ty],
+                        [llvm_v8f16_ty, llvm_i32_ty, llvm_float_ty],
+                        [IntrNoMem, IntrSpeculatable]>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index adc46f9789ebb..b6a92136f3828 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -222,11 +222,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse_cvttss2si64 : ClangBuiltin<"__builtin_ia32_cvttss2si64">,
       DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
 
-  def int_x86_sse_cvtps2pi : ClangBuiltin<"__builtin_ia32_cvtps2pi">,
+  def int_x86_sse_cvtps2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttps2pi: ClangBuiltin<"__builtin_ia32_cvttps2pi">,
+  def int_x86_sse_cvttps2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2ps : ClangBuiltin<"__builtin_ia32_cvtpi2ps">,
+  def int_x86_sse_cvtpi2ps :
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -426,11 +426,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_cvtsd2ss : ClangBuiltin<"__builtin_ia32_cvtsd2ss">,
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpd2pi : ClangBuiltin<"__builtin_ia32_cvtpd2pi">,
+  def int_x86_sse_cvtpd2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttpd2pi: ClangBuiltin<"__builtin_ia32_cvttpd2pi">,
+  def int_x86_sse_cvttpd2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2pd : ClangBuiltin<"__builtin_ia32_cvtpi2pd">,
+  def int_x86_sse_cvtpi2pd :
       DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -512,49 +512,49 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_phadd_w         : ClangBuiltin<"__builtin_ia32_phaddw">,
+  def int_x86_ssse3_phadd_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_w_128     : ClangBuiltin<"__builtin_ia32_phaddw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phadd_d         : ClangBuiltin<"__builtin_ia32_phaddd">,
+  def int_x86_ssse3_phadd_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_d_128     : ClangBuiltin<"__builtin_ia32_phaddd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phadd_sw        : ClangBuiltin<"__builtin_ia32_phaddsw">,
+  def int_x86_ssse3_phadd_sw        :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_sw_128    : ClangBuiltin<"__builtin_ia32_phaddsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_w         : ClangBuiltin<"__builtin_ia32_phsubw">,
+  def int_x86_ssse3_phsub_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_w_128     : ClangBuiltin<"__builtin_ia32_phsubw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_d         : ClangBuiltin<"__builtin_ia32_phsubd">,
+  def int_x86_ssse3_phsub_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_d_128     : ClangBuiltin<"__builtin_ia32_phsubd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_sw        : ClangBuiltin<"__builtin_ia32_phsubsw">,
+  def int_x86_ssse3_phsub_sw        :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_sw_128    : ClangBuiltin<"__builtin_ia32_phsubsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pmadd_ub_sw     : ClangBuiltin<"__builtin_ia32_pmaddubsw">,
+  def int_x86_ssse3_pmadd_ub_sw     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pmadd_ub_sw_128 : ClangBuiltin<"__builtin_ia32_pmaddubsw128">,
@@ -564,7 +564,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Packed multiply high with round and scale
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pmul_hr_sw      : ClangBuiltin<"__builtin_ia32_pmulhrsw">,
+  def int_x86_ssse3_pmul_hr_sw      :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   def int_x86_ssse3_pmul_hr_sw_128  : ClangBuiltin<"__builtin_ia32_pmulhrsw128">,
@@ -574,34 +574,34 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Shuffle ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pshuf_b         : ClangBuiltin<"__builtin_ia32_pshufb">,
+  def int_x86_ssse3_pshuf_b         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pshuf_b_128     : ClangBuiltin<"__builtin_ia32_pshufb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse_pshuf_w           : ClangBuiltin<"__builtin_ia32_pshufw">,
+  def int_x86_sse_pshuf_w           :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
                              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Sign ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_psign_b         : ClangBuiltin<"__builtin_ia32_psignb">,
+  def int_x86_ssse3_psign_b         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_b_128     : ClangBuiltin<"__builtin_ia32_psignb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_psign_w         : ClangBuiltin<"__builtin_ia32_psignw">,
+  def int_x86_ssse3_psign_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_w_128     : ClangBuiltin<"__builtin_ia32_psignw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_psign_d         : ClangBuiltin<"__builtin_ia32_psignd">,
+  def int_x86_ssse3_psign_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_d_128     : ClangBuiltin<"__builtin_ia32_psignd128">,
@@ -611,13 +611,13 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Absolute value ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pabs_b     : ClangBuiltin<"__builtin_ia32_pabsb">,
+  def int_x86_ssse3_pabs_b     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pabs_w     : ClangBuiltin<"__builtin_ia32_pabsw">,
+  def int_x86_ssse3_pabs_w     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pabs_d     : ClangBuiltin<"__builtin_ia32_pabsd">,
+  def int_x86_ssse3_pabs_d     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -2260,118 +2260,118 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   // Addition
-  def int_x86_mmx_padd_b : ClangBuiltin<"__builtin_ia32_paddb">,
+  def int_x86_mmx_padd_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_w : ClangBuiltin<"__builtin_ia32_paddw">,
+  def int_x86_mmx_padd_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_d : ClangBuiltin<"__builtin_ia32_paddd">,
+  def int_x86_mmx_padd_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_q : ClangBuiltin<"__builtin_ia32_paddq">,
+  def int_x86_mmx_padd_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_padds_b : ClangBuiltin<"__builtin_ia32_paddsb">,
+  def int_x86_mmx_padds_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_w : ClangBuiltin<"__builtin_ia32_paddsw">,
+  def int_x86_mmx_padds_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_paddus_b : ClangBuiltin<"__builtin_ia32_paddusb">,
+  def int_x86_mmx_paddus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_w : ClangBuiltin<"__builtin_ia32_paddusw">,
+  def int_x86_mmx_paddus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Subtraction
-  def int_x86_mmx_psub_b : ClangBuiltin<"__builtin_ia32_psubb">,
+  def int_x86_mmx_psub_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_w : ClangBuiltin<"__builtin_ia32_psubw">,
+  def int_x86_mmx_psub_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_d : ClangBuiltin<"__builtin_ia32_psubd">,
+  def int_x86_mmx_psub_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_q : ClangBuiltin<"__builtin_ia32_psubq">,
+  def int_x86_mmx_psub_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psubs_b : ClangBuiltin<"__builtin_ia32_psubsb">,
+  def int_x86_mmx_psubs_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubs_w : ClangBuiltin<"__builtin_ia32_psubsw">,
+  def int_x86_mmx_psubs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psubus_b : ClangBuiltin<"__builtin_ia32_psubusb">,
+  def int_x86_mmx_psubus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubus_w : ClangBuiltin<"__builtin_ia32_psubusw">,
+  def int_x86_mmx_psubus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
   // Multiplication
-  def int_x86_mmx_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw">,
+  def int_x86_mmx_pmulh_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmull_w : ClangBuiltin<"__builtin_ia32_pmullw">,
+  def int_x86_mmx_pmull_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw">,
+  def int_x86_mmx_pmulhu_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulu_dq : ClangBuiltin<"__builtin_ia32_pmuludq">,
+  def int_x86_mmx_pmulu_dq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd">,
+  def int_x86_mmx_pmadd_wd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Bitwise operations
-  def int_x86_mmx_pand : ClangBuiltin<"__builtin_ia32_pand">,
+  def int_x86_mmx_pand :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pandn : ClangBuiltin<"__builtin_ia32_pandn">,
+  def int_x86_mmx_pandn :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_por : ClangBuiltin<"__builtin_ia32_por">,
+  def int_x86_mmx_por :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pxor : ClangBuiltin<"__builtin_ia32_pxor">,
+  def int_x86_mmx_pxor :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Averages
-  def int_x86_mmx_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb">,
+  def int_x86_mmx_pavg_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw">,
+  def int_x86_mmx_pavg_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Maximum
-  def int_x86_mmx_pmaxu_b : ClangBuiltin<"__builtin_ia32_pmaxub">,
+  def int_x86_mmx_pmaxu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmaxs_w : ClangBuiltin<"__builtin_ia32_pmaxsw">,
+  def int_x86_mmx_pmaxs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Minimum
-  def int_x86_mmx_pminu_b : ClangBuiltin<"__builtin_ia32_pminub">,
+  def int_x86_mmx_pminu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmins_w : ClangBuiltin<"__builtin_ia32_pminsw">,
+  def int_x86_mmx_pmins_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Packed sum of absolute differences
-  def int_x86_mmx_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw">,
+  def int_x86_mmx_psad_bw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 }
@@ -2379,58 +2379,58 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // Integer shift ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   // Shift left logical
-  def int_x86_mmx_psll_w : ClangBuiltin<"__builtin_ia32_psllw">,
+  def int_x86_mmx_psll_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_d : ClangBuiltin<"__builtin_ia32_pslld">,
+  def int_x86_mmx_psll_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_q : ClangBuiltin<"__builtin_ia32_psllq">,
+  def int_x86_mmx_psll_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw">,
+  def int_x86_mmx_psrl_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_d : ClangBuiltin<"__builtin_ia32_psrld">,
+  def int_x86_mmx_psrl_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq">,
+  def int_x86_mmx_psrl_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psra_w : ClangBuiltin<"__builtin_ia32_psraw">,
+  def int_x86_mmx_psra_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psra_d : ClangBuiltin<"__builtin_ia32_psrad">,
+  def int_x86_mmx_psra_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
   // Oddly these don't require an immediate due to a gcc compatibility issue.
-  def int_x86_mmx_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi">,
+  def int_x86_mmx_pslli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi">,
+  def int_x86_mmx_pslli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi">,
+  def int_x86_mmx_pslli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi">,
+  def int_x86_mmx_psrli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi">,
+  def int_x86_mmx_psrli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi">,
+  def int_x86_mmx_psrli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi">,
+  def int_x86_mmx_psrai_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrai_d : ClangBuiltin<"__builtin_ia32_psradi">,
+  def int_x86_mmx_psrai_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 }
@@ -2475,83 +2475,83 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 }
 // Pack ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_packsswb : ClangBuiltin<"__builtin_ia32_packsswb">,
+  def int_x86_mmx_packsswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packssdw : ClangBuiltin<"__builtin_ia32_packssdw">,
+  def int_x86_mmx_packssdw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packuswb : ClangBuiltin<"__builtin_ia32_packuswb">,
+  def int_x86_mmx_packuswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Unpacking ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_punpckhbw : ClangBuiltin<"__builtin_ia32_punpckhbw">,
+  def int_x86_mmx_punpckhbw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckhwd : ClangBuiltin<"__builtin_ia32_punpckhwd">,
+  def int_x86_mmx_punpckhwd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckhdq : ClangBuiltin<"__builtin_ia32_punpckhdq">,
+  def int_x86_mmx_punpckhdq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpcklbw : ClangBuiltin<"__builtin_ia32_punpcklbw">,
+  def int_x86_mmx_punpcklbw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpcklwd : ClangBuiltin<"__builtin_ia32_punpcklwd">,
+  def int_x86_mmx_punpcklwd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckldq : ClangBuiltin<"__builtin_ia32_punpckldq">,
+  def int_x86_mmx_punpckldq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Integer comparison ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_pcmpeq_b : ClangBuiltin<"__builtin_ia32_pcmpeqb">,
+  def int_x86_mmx_pcmpeq_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_w : ClangBuiltin<"__builtin_ia32_pcmpeqw">,
+  def int_x86_mmx_pcmpeq_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_d : ClangBuiltin<"__builtin_ia32_pcmpeqd">,
+  def int_x86_mmx_pcmpeq_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_pcmpgt_b : ClangBuiltin<"__builtin_ia32_pcmpgtb">,
+  def int_x86_mmx_pcmpgt_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_w : ClangBuiltin<"__builtin_ia32_pcmpgtw">,
+  def int_x86_mmx_pcmpgt_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_d : ClangBuiltin<"__builtin_ia32_pcmpgtd">,
+  def int_x86_mmx_pcmpgt_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_maskmovq : ClangBuiltin<"__builtin_ia32_maskmovq">,
+  def int_x86_mmx_maskmovq :
               Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;
 
-  def int_x86_mmx_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb">,
+  def int_x86_mmx_pmovmskb :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_mmx_movnt_dq : ClangBuiltin<"__builtin_ia32_movntq">,
+  def int_x86_mmx_movnt_dq :
               Intrinsic<[], [llvm_ptr_ty, llvm_x86mmx_ty], []>;
 
-  def int_x86_mmx_palignr_b : ClangBuiltin<"__builtin_ia32_palignr">,
+  def int_x86_mmx_palignr_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
-  def int_x86_mmx_pextr_w : ClangBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+  def int_x86_mmx_pextr_w :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
-  def int_x86_mmx_pinsr_w : ClangBuiltin<"__builtin_ia32_vec_set_v4hi">,
+  def int_x86_mmx_pinsr_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index 22da54a1f03c5..7b54c74fb1b9d 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -846,8 +846,10 @@ struct AAMDNodes {
   AAMDNodes concat(const AAMDNodes &Other) const;
 
   /// Create a new AAMDNode for accessing \p AccessSize bytes of this AAMDNode.
-  /// If his AAMDNode has !tbaa.struct and \p AccessSize matches the size of the
-  /// field at offset 0, get the TBAA tag describing the accessed field.
+  /// If this AAMDNode has !tbaa.struct and \p AccessSize matches the size of
+  /// the field at offset 0, get the TBAA tag describing the accessed field.
+  /// If such an AAMDNode already embeds !tbaa, the existing one is retrieved.
+  /// Finally, !tbaa.struct is zeroed out.
   AAMDNodes adjustForAccess(unsigned AccessSize);
   AAMDNodes adjustForAccess(size_t Offset, Type *AccessTy,
                             const DataLayout &DL);
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 8ae47fb556b25..9c70a60bf50c7 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1550,23 +1550,27 @@ template <typename T> inline Exact_match<T> m_Exact(const T &SubPattern) {
 template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy,
           bool Commutable = false>
 struct CmpClass_match {
-  PredicateTy &Predicate;
+  PredicateTy *Predicate;
   LHS_t L;
   RHS_t R;
 
   // The evaluation order is always stable, regardless of Commutability.
   // The LHS is always matched first.
   CmpClass_match(PredicateTy &Pred, const LHS_t &LHS, const RHS_t &RHS)
-      : Predicate(Pred), L(LHS), R(RHS) {}
+      : Predicate(&Pred), L(LHS), R(RHS) {}
+  CmpClass_match(const LHS_t &LHS, const RHS_t &RHS)
+      : Predicate(nullptr), L(LHS), R(RHS) {}
 
   template <typename OpTy> bool match(OpTy *V) {
     if (auto *I = dyn_cast<Class>(V)) {
       if (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) {
-        Predicate = I->getPredicate();
+        if (Predicate)
+          *Predicate = I->getPredicate();
         return true;
       } else if (Commutable && L.match(I->getOperand(1)) &&
                  R.match(I->getOperand(0))) {
-        Predicate = I->getSwappedPredicate();
+        if (Predicate)
+          *Predicate = I->getSwappedPredicate();
         return true;
       }
     }
@@ -1595,27 +1599,25 @@ m_FCmp(FCmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>
 m_Cmp(const LHS &L, const RHS &R) {
-  CmpInst::Predicate Unused;
-  return CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>(Unused, L, R);
+  return CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>(L, R);
 }
 
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>
 m_ICmp(const LHS &L, const RHS &R) {
-  ICmpInst::Predicate Unused;
-  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>(Unused, L, R);
+  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>(L, R);
 }
 
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>
 m_FCmp(const LHS &L, const RHS &R) {
-  FCmpInst::Predicate Unused;
-  return CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(Unused, L, R);
+  return CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(L, R);
 }
 
 // Same as CmpClass, but instead of saving Pred as out output variable, match a
 // specific input pred for equality.
-template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy>
+template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy,
+          bool Commutable = false>
 struct SpecificCmpClass_match {
   const PredicateTy Predicate;
   LHS_t L;
@@ -1625,9 +1627,17 @@ struct SpecificCmpClass_match {
       : Predicate(Pred), L(LHS), R(RHS) {}
 
   template <typename OpTy> bool match(OpTy *V) {
-    if (auto *I = dyn_cast<Class>(V))
-      return I->getPredicate() == Predicate && L.match(I->getOperand(0)) &&
-             R.match(I->getOperand(1));
+    if (auto *I = dyn_cast<Class>(V)) {
+      if (I->getPredicate() == Predicate && L.match(I->getOperand(0)) &&
+          R.match(I->getOperand(1)))
+        return true;
+      if constexpr (Commutable) {
+        if (I->getPredicate() == Class::getSwappedPredicate(Predicate) &&
+            L.match(I->getOperand(1)) && R.match(I->getOperand(0)))
+          return true;
+      }
+    }
+
     return false;
   }
 };
@@ -1646,6 +1656,13 @@ m_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
       MatchPred, L, R);
 }
 
+template <typename LHS, typename RHS>
+inline SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>
+m_c_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
+  return SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>(
+      MatchPred, L, R);
+}
+
 template <typename LHS, typename RHS>
 inline SpecificCmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>
 m_SpecificFCmp(FCmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
@@ -2681,9 +2698,7 @@ m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>
 m_c_ICmp(const LHS &L, const RHS &R) {
-  ICmpInst::Predicate Unused;
-  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>(Unused,
-                                                                       L, R);
+  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>(L, R);
 }
 
 /// Matches a specific opcode with LHS and RHS in either order.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 3057bff397b2f..b3648f5a31e2a 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -15,7 +15,6 @@
 #define LLVM_IR_RUNTIME_LIBCALLS_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/TargetParser/Triple.h"
@@ -41,7 +40,6 @@ enum Libcall {
 struct RuntimeLibcallsInfo {
   explicit RuntimeLibcallsInfo(const Triple &TT) {
     initLibcalls(TT);
-    initCmpLibcallCCs();
   }
 
   /// Rename the default libcall routine name for the specified libcall.
@@ -59,18 +57,6 @@ struct RuntimeLibcallsInfo {
     return LibcallRoutineNames[Call];
   }
 
-  /// Override the default CondCode to be used to test the result of the
-  /// comparison libcall against zero.
-  void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
-    CmpLibcallCCs[Call] = CC;
-  }
-
-  /// Get the CondCode that's to be used to test the result of the comparison
-  /// libcall against zero.
-  ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
-    return CmpLibcallCCs[Call];
-  }
-
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
     LibcallCallingConvs[Call] = CC;
@@ -90,10 +76,6 @@ struct RuntimeLibcallsInfo {
   /// Stores the name each libcall.
   const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
 
-  /// The ISD::CondCode that should be used to test the result of each of the
-  /// comparison libcall against zero.
-  ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
-
   /// Stores the CallingConv that should be used for each libcall.
   CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
 
@@ -112,9 +94,6 @@ struct RuntimeLibcallsInfo {
     return true;
   }
 
-  /// Sets default libcall calling conventions.
-  void initCmpLibcallCCs();
-
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
   void initLibcalls(const Triple &TT);
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 734f4baf2630e..e80e77f88eaec 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -63,7 +63,6 @@ class Type {
     VoidTyID,      ///< type with no size
     LabelTyID,     ///< Labels
     MetadataTyID,  ///< Metadata
-    X86_MMXTyID,   ///< MMX vectors (64 bits, X86 specific)
     X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
     TokenTyID,     ///< Tokens
 
@@ -197,9 +196,6 @@ class Type {
 
   const fltSemantics &getFltSemantics() const;
 
-  /// Return true if this is X86 MMX.
-  bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; }
-
   /// Return true if this is X86 AMX.
   bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; }
 
@@ -285,8 +281,8 @@ class Type {
   /// Return true if the type is a valid type for a register in codegen. This
   /// includes all first-class types except struct and array types.
   bool isSingleValueType() const {
-    return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() ||
-           isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy();
+    return isFloatingPointTy() || isIntegerTy() || isPointerTy() ||
+           isVectorTy() || isX86_AMXTy() || isTargetExtTy();
   }
 
   /// Return true if the type is an aggregate type. This means it is valid as
@@ -302,8 +298,7 @@ class Type {
   bool isSized(SmallPtrSetImpl<Type*> *Visited = nullptr) const {
     // If it's a primitive, it is always sized.
     if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
-        getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID ||
-        getTypeID() == X86_AMXTyID)
+        getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID)
       return true;
     // If it is not something that can have a size (e.g. a function or label),
     // it doesn't have a size.
@@ -453,7 +448,6 @@ class Type {
   static Type *getX86_FP80Ty(LLVMContext &C);
   static Type *getFP128Ty(LLVMContext &C);
   static Type *getPPC_FP128Ty(LLVMContext &C);
-  static Type *getX86_MMXTy(LLVMContext &C);
   static Type *getX86_AMXTy(LLVMContext &C);
   static Type *getTokenTy(LLVMContext &C);
   static IntegerType *getIntNTy(LLVMContext &C, unsigned N);
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index 6af7f6075551d..dbb9f4c7336d5 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -15,7 +15,6 @@
 #ifndef LLVM_IR_VECTORBUILDER_H
 #define LLVM_IR_VECTORBUILDER_H
 
-#include <llvm/Analysis/IVDescriptors.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/InstrTypes.h>
 #include <llvm/IR/Instruction.h>
@@ -100,11 +99,11 @@ class VectorBuilder {
                                  const Twine &Name = Twine());
 
   /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param Kind        The kind of recurrence
+  /// \param RdxID       The intrinsic ID of llvm.vector.reduce.*
   /// \param ValTy       The type of operand which the reduction operation is
   ///                    performed.
   /// \param VecOpArray  The operand list.
-  Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+  Value *createSimpleTargetReduction(Intrinsic::ID RdxID, Type *ValTy,
                                      ArrayRef<Value *> VecOpArray,
                                      const Twine &Name = Twine());
 };
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 2b047f82f7bb1..c45679c0dec95 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -92,8 +92,7 @@ namespace {
       (void) llvm::createLoopExtractorPass();
       (void) llvm::createLoopSimplifyPass();
       (void) llvm::createLoopStrengthReducePass();
-      (void) llvm::createLoopUnrollPass();
-      (void) llvm::createLowerConstantIntrinsicsPass();
+      (void)llvm::createLoopUnrollPass();
       (void) llvm::createLowerGlobalDtorsLegacyPass();
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index f91c8e1fc9af3..3f88ac02cd92a 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -224,7 +224,7 @@ class MCAsmBackend {
   virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
 
   /// Generate the compact unwind encoding for the CFI instructions.
-  virtual uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  virtual uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                                  const MCContext *Ctxt) const {
     return 0;
   }
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index 4b08d50de9e22..c6fa48128d189 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -15,13 +15,9 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/BinaryFormat/MachO.h"
-#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
-#include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/SMLoc.h"
-#include "llvm/Support/VersionTuple.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -56,25 +52,10 @@ class MCValue;
 
 class MCAssembler {
 public:
+  friend class MCObjectWriter;
   using SectionListType = SmallVector<MCSection *, 0>;
   using const_iterator = pointee_iterator<SectionListType::const_iterator>;
 
-  /// MachO specific deployment target version info.
-  // A Major version of 0 indicates that no version information was supplied
-  // and so the corresponding load command should not be emitted.
-  using VersionInfoType = struct {
-    bool EmitBuildVersion;
-    union {
-      MCVersionMinType Type;          ///< Used when EmitBuildVersion==false.
-      MachO::PlatformType Platform;   ///< Used when EmitBuildVersion==true.
-    } TypeOrPlatform;
-    unsigned Major;
-    unsigned Minor;
-    unsigned Update;
-    /// An optional version of the SDK that was used to build the source.
-    VersionTuple SDKVersion;
-  };
-
 private:
   MCContext &Context;
 
@@ -84,21 +65,11 @@ class MCAssembler {
 
   bool HasLayout = false;
   bool RelaxAll = false;
-  bool SubsectionsViaSymbols = false;
-  bool IncrementalLinkerCompatible = false;
 
   SectionListType Sections;
 
   SmallVector<const MCSymbol *, 0> Symbols;
 
-  /// The list of linker options to propagate into the object file.
-  std::vector<std::vector<std::string>> LinkerOptions;
-
-  /// List of declared file names
-  std::vector<std::pair<std::string, size_t>> FileNames;
-  // Optional compiler version.
-  std::string CompilerVersion;
-
   MCDwarfLineTableParams LTParams;
 
   /// The set of function symbols for which a .thumb_func directive has
@@ -115,20 +86,6 @@ class MCAssembler {
   /// By default it's 0, which means bundling is disabled.
   unsigned BundleAlignSize = 0;
 
-  /// ELF specific e_header flags
-  // It would be good if there were an MCELFAssembler class to hold this.
-  // ELF header flags are used both by the integrated and standalone assemblers.
-  // Access to the flags is necessary in cases where assembler directives affect
-  // which flags to be set.
-  unsigned ELFHeaderEFlags = 0;
-
-  /// Used to communicate Linker Optimization Hint information between
-  /// the Streamer and the .o writer
-  MCLOHContainer LOHContainer;
-
-  VersionInfoType VersionInfo;
-  VersionInfoType DarwinTargetVariantVersionInfo;
-
   /// Evaluate a fixup to a relocatable expression and the value which should be
   /// placed into the fixup.
   ///
@@ -174,15 +131,6 @@ class MCAssembler {
   handleFixup(MCFragment &F, const MCFixup &Fixup, const MCSubtargetInfo *STI);
 
 public:
-  struct Symver {
-    SMLoc Loc;
-    const MCSymbol *Sym;
-    StringRef Name;
-    // True if .symver *, *@@@* or .symver *, *, remove.
-    bool KeepOriginalSym;
-  };
-  std::vector<Symver> Symvers;
-
   /// Construct a new assembler instance.
   //
   // FIXME: How are we going to parameterize this? Two obvious options are stay
@@ -194,7 +142,6 @@ class MCAssembler {
               std::unique_ptr<MCObjectWriter> Writer);
   MCAssembler(const MCAssembler &) = delete;
   MCAssembler &operator=(const MCAssembler &) = delete;
-  ~MCAssembler();
 
   /// Compute the effective fragment size.
   uint64_t computeFragmentSize(const MCFragment &F) const;
@@ -228,48 +175,6 @@ class MCAssembler {
   /// Flag a function symbol as the target of a .thumb_func directive.
   void setIsThumbFunc(const MCSymbol *Func) { ThumbFuncs.insert(Func); }
 
-  /// ELF e_header flags
-  unsigned getELFHeaderEFlags() const { return ELFHeaderEFlags; }
-  void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags; }
-
-  /// MachO deployment target version information.
-  const VersionInfoType &getVersionInfo() const { return VersionInfo; }
-  void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor,
-                     unsigned Update,
-                     VersionTuple SDKVersion = VersionTuple()) {
-    VersionInfo.EmitBuildVersion = false;
-    VersionInfo.TypeOrPlatform.Type = Type;
-    VersionInfo.Major = Major;
-    VersionInfo.Minor = Minor;
-    VersionInfo.Update = Update;
-    VersionInfo.SDKVersion = SDKVersion;
-  }
-  void setBuildVersion(MachO::PlatformType Platform, unsigned Major,
-                       unsigned Minor, unsigned Update,
-                       VersionTuple SDKVersion = VersionTuple()) {
-    VersionInfo.EmitBuildVersion = true;
-    VersionInfo.TypeOrPlatform.Platform = Platform;
-    VersionInfo.Major = Major;
-    VersionInfo.Minor = Minor;
-    VersionInfo.Update = Update;
-    VersionInfo.SDKVersion = SDKVersion;
-  }
-
-  const VersionInfoType &getDarwinTargetVariantVersionInfo() const {
-    return DarwinTargetVariantVersionInfo;
-  }
-  void setDarwinTargetVariantBuildVersion(MachO::PlatformType Platform,
-                                          unsigned Major, unsigned Minor,
-                                          unsigned Update,
-                                          VersionTuple SDKVersion) {
-    DarwinTargetVariantVersionInfo.EmitBuildVersion = true;
-    DarwinTargetVariantVersionInfo.TypeOrPlatform.Platform = Platform;
-    DarwinTargetVariantVersionInfo.Major = Major;
-    DarwinTargetVariantVersionInfo.Minor = Minor;
-    DarwinTargetVariantVersionInfo.Update = Update;
-    DarwinTargetVariantVersionInfo.SDKVersion = SDKVersion;
-  }
-
   /// Reuse an assembler instance
   ///
   void reset();
@@ -280,8 +185,6 @@ class MCAssembler {
 
   MCCodeEmitter *getEmitterPtr() const { return Emitter.get(); }
 
-  MCObjectWriter *getWriterPtr() const { return Writer.get(); }
-
   MCAsmBackend &getBackend() const { return *Backend; }
 
   MCCodeEmitter &getEmitter() const { return *Emitter; }
@@ -289,7 +192,6 @@ class MCAssembler {
   MCObjectWriter &getWriter() const { return *Writer; }
 
   MCDwarfLineTableParams getDWARFLinetableParams() const { return LTParams; }
-  void setDWARFLinetableParams(MCDwarfLineTableParams P) { LTParams = P; }
 
   /// Finish - Do final processing and write the object to the output stream.
   /// \p Writer is used for custom object writer (as the MCJIT does),
@@ -299,17 +201,6 @@ class MCAssembler {
   // Layout all section and prepare them for emission.
   void layout();
 
-  // FIXME: This does not belong here.
-  bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
-  void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }
-
-  bool isIncrementalLinkerCompatible() const {
-    return IncrementalLinkerCompatible;
-  }
-  void setIncrementalLinkerCompatible(bool Value) {
-    IncrementalLinkerCompatible = Value;
-  }
-
   bool hasLayout() const { return HasLayout; }
   bool getRelaxAll() const { return RelaxAll; }
   void setRelaxAll(bool Value) { RelaxAll = Value; }
@@ -333,56 +224,14 @@ class MCAssembler {
     return make_pointee_range(Symbols);
   }
 
-  /// @}
-  /// \name Linker Option List Access
-  /// @{
-
-  std::vector<std::vector<std::string>> &getLinkerOptions() {
-    return LinkerOptions;
-  }
-
-  // FIXME: This is a total hack, this should not be here. Once things are
-  // factored so that the streamer has direct access to the .o writer, it can
-  // disappear.
-  MCLOHContainer &getLOHContainer() { return LOHContainer; }
-  const MCLOHContainer &getLOHContainer() const {
-    return const_cast<MCAssembler *>(this)->getLOHContainer();
-  }
-
-  struct CGProfileEntry {
-    const MCSymbolRefExpr *From;
-    const MCSymbolRefExpr *To;
-    uint64_t Count;
-  };
-  std::vector<CGProfileEntry> CGProfile;
-  /// @}
-  /// \name Backend Data Access
-  /// @{
-
   bool registerSection(MCSection &Section);
   bool registerSymbol(const MCSymbol &Symbol);
 
-  MutableArrayRef<std::pair<std::string, size_t>> getFileNames() {
-    return FileNames;
-  }
-
-  void addFileName(StringRef FileName) {
-    FileNames.emplace_back(std::string(FileName), Symbols.size());
-  }
-
-  void setCompilerVersion(std::string CompilerVers) {
-    if (CompilerVersion.empty())
-      CompilerVersion = std::move(CompilerVers);
-  }
-  StringRef getCompilerVersion() { return CompilerVersion; }
-
   /// Write the necessary bundle padding to \p OS.
   /// Expects a fragment \p F containing instructions and its size \p FSize.
   void writeFragmentPadding(raw_ostream &OS, const MCEncodedFragment &F,
                             uint64_t FSize) const;
 
-  /// @}
-
   void dump() const;
 };
 
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h
index 73fe6361cb866..57ba40f7ac26f 100644
--- a/llvm/include/llvm/MC/MCContext.h
+++ b/llvm/include/llvm/MC/MCContext.h
@@ -468,6 +468,10 @@ class MCContext {
   /// true, behaves like getOrCreateSymbol, prefixed with PrivateLabelPrefix.
   MCSymbol *createBlockSymbol(const Twine &Name, bool AlwaysEmit = false);
 
+  /// Create a local, non-temporary symbol like an ELF mapping symbol. Calling
+  /// the function with the same name will generate new, unique instances.
+  MCSymbol *createLocalSymbol(StringRef Name);
+
   /// Create the definition of a directional local symbol for numbered label
   /// (used for "1:" definitions).
   MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal);
@@ -643,7 +647,7 @@ class MCContext {
   MCSectionXCOFF *getXCOFFSection(
       StringRef Section, SectionKind K,
       std::optional<XCOFF::CsectProperties> CsectProp = std::nullopt,
-      bool MultiSymbolsAllowed = false, const char *BeginSymName = nullptr,
+      bool MultiSymbolsAllowed = false,
       std::optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSubtypeFlags =
           std::nullopt);
 
diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h
index d0e45ab59a92e..7dba67efa22fa 100644
--- a/llvm/include/llvm/MC/MCDwarf.h
+++ b/llvm/include/llvm/MC/MCDwarf.h
@@ -509,11 +509,11 @@ class MCCFIInstruction {
   union {
     struct {
       unsigned Register;
-      int Offset;
+      int64_t Offset;
     } RI;
     struct {
       unsigned Register;
-      int Offset;
+      int64_t Offset;
       unsigned AddressSpace;
     } RIA;
     struct {
@@ -527,7 +527,7 @@ class MCCFIInstruction {
   std::vector<char> Values;
   std::string Comment;
 
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, SMLoc Loc,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, SMLoc Loc,
                    StringRef V = "", StringRef Comment = "")
       : Label(L), Operation(Op), Loc(Loc), Values(V.begin(), V.end()),
         Comment(Comment) {
@@ -539,7 +539,7 @@ class MCCFIInstruction {
     assert(Op == OpRegister);
     U.RR = {R1, R2};
   }
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, unsigned AS,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, unsigned AS,
                    SMLoc Loc)
       : Label(L), Operation(Op), Loc(Loc) {
     assert(Op == OpLLVMDefAspaceCfa);
@@ -555,8 +555,8 @@ class MCCFIInstruction {
 public:
   /// .cfi_def_cfa defines a rule for computing CFA as: take address from
   /// Register and add Offset to it.
-  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int Offset,
-                                    SMLoc Loc = {}) {
+  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register,
+                                    int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfa, L, Register, Offset, Loc);
   }
 
@@ -564,13 +564,13 @@ class MCCFIInstruction {
   /// on Register will be used instead of the old one. Offset remains the same.
   static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register,
                                                SMLoc Loc = {}) {
-    return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, Loc);
+    return MCCFIInstruction(OpDefCfaRegister, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_def_cfa_offset modifies a rule for computing CFA. Register
   /// remains the same, but offset is new. Note that it is the absolute offset
   /// that will be added to a defined register to the compute CFA address.
-  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int Offset,
+  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset,
                                           SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfaOffset, L, 0, Offset, Loc);
   }
@@ -578,7 +578,7 @@ class MCCFIInstruction {
   /// .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
   /// Offset is a relative value that is added/subtracted from the previous
   /// offset.
-  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment,
+  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment,
                                                 SMLoc Loc = {}) {
     return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, Loc);
   }
@@ -588,7 +588,7 @@ class MCCFIInstruction {
   /// be the result of evaluating the DWARF operation expression
   /// `DW_OP_constu AS; DW_OP_aspace_bregx R, B` as a location description.
   static MCCFIInstruction createLLVMDefAspaceCfa(MCSymbol *L, unsigned Register,
-                                                 int Offset,
+                                                 int64_t Offset,
                                                  unsigned AddressSpace,
                                                  SMLoc Loc) {
     return MCCFIInstruction(OpLLVMDefAspaceCfa, L, Register, Offset,
@@ -598,7 +598,7 @@ class MCCFIInstruction {
   /// .cfi_offset Previous value of Register is saved at offset Offset
   /// from CFA.
   static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
-                                       int Offset, SMLoc Loc = {}) {
+                                       int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpOffset, L, Register, Offset, Loc);
   }
 
@@ -606,7 +606,7 @@ class MCCFIInstruction {
   /// Offset from the current CFA register. This is transformed to .cfi_offset
   /// using the known displacement of the CFA register from the CFA.
   static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
-                                          int Offset, SMLoc Loc = {}) {
+                                          int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpRelOffset, L, Register, Offset, Loc);
   }
 
@@ -619,12 +619,12 @@ class MCCFIInstruction {
 
   /// .cfi_window_save SPARC register window is saved.
   static MCCFIInstruction createWindowSave(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpWindowSave, L, 0, 0, Loc);
+    return MCCFIInstruction(OpWindowSave, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_negate_ra_state AArch64 negate RA state.
   static MCCFIInstruction createNegateRAState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpNegateRAState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpNegateRAState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_restore says that the rule for Register is now the same as it
@@ -632,31 +632,31 @@ class MCCFIInstruction {
   /// by .cfi_startproc were executed.
   static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register,
                                         SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestore, L, Register, 0, Loc);
+    return MCCFIInstruction(OpRestore, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_undefined From now on the previous value of Register can't be
   /// restored anymore.
   static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpUndefined, L, Register, 0, Loc);
+    return MCCFIInstruction(OpUndefined, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_same_value Current value of Register is the same as in the
   /// previous frame. I.e., no restoration is needed.
   static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpSameValue, L, Register, 0, Loc);
+    return MCCFIInstruction(OpSameValue, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_remember_state Save all current rules for all registers.
   static MCCFIInstruction createRememberState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRememberState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpRememberState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_restore_state Restore the previously saved state.
   static MCCFIInstruction createRestoreState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestoreState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpRestoreState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_escape Allows the user to add arbitrary bytes to the unwind
@@ -667,7 +667,7 @@ class MCCFIInstruction {
   }
 
   /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE
-  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int Size,
+  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int64_t Size,
                                             SMLoc Loc = {}) {
     return MCCFIInstruction(OpGnuArgsSize, L, 0, Size, Loc);
   }
@@ -702,7 +702,7 @@ class MCCFIInstruction {
     return U.RIA.AddressSpace;
   }
 
-  int getOffset() const {
+  int64_t getOffset() const {
     if (Operation == OpLLVMDefAspaceCfa)
       return U.RIA.Offset;
     assert(Operation == OpDefCfa || Operation == OpOffset ||
@@ -736,7 +736,7 @@ struct MCDwarfFrameInfo {
   unsigned CurrentCfaRegister = 0;
   unsigned PersonalityEncoding = 0;
   unsigned LsdaEncoding = 0;
-  uint32_t CompactUnwindEncoding = 0;
+  uint64_t CompactUnwindEncoding = 0;
   bool IsSignalFrame = false;
   bool IsSimple = false;
   unsigned RAReg = static_cast<unsigned>(INT_MAX);
diff --git a/llvm/include/llvm/MC/MCELFObjectWriter.h b/llvm/include/llvm/MC/MCELFObjectWriter.h
index 12237094ad86a..4952e0383b211 100644
--- a/llvm/include/llvm/MC/MCELFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCELFObjectWriter.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_MC_MCELFOBJECTWRITER_H
 #define LLVM_MC_MCELFOBJECTWRITER_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -16,6 +18,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
+#include <memory>
+#include <optional>
 #include <vector>
 
 namespace llvm {
@@ -25,6 +29,7 @@ class MCContext;
 class MCFixup;
 class MCSymbol;
 class MCSymbolELF;
+class MCTargetOptions;
 class MCValue;
 
 struct ELFRelocationEntry {
@@ -149,20 +154,66 @@ class MCELFObjectTargetWriter : public MCObjectTargetWriter {
   }
 };
 
-/// Construct a new ELF writer instance.
-///
-/// \param MOTW - The target specific ELF writer subclass.
-/// \param OS - The stream to write to.
-/// \returns The constructed object writer.
-std::unique_ptr<MCObjectWriter>
-createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                      raw_pwrite_stream &OS, bool IsLittleEndian);
-
-std::unique_ptr<MCObjectWriter>
-createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                         raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
-                         bool IsLittleEndian);
+class ELFObjectWriter final : public MCObjectWriter {
+  unsigned ELFHeaderEFlags = 0;
 
+public:
+  std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
+  raw_pwrite_stream &OS;
+  raw_pwrite_stream *DwoOS = nullptr;
+
+  DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
+  DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
+  bool IsLittleEndian = false;
+  bool SeenGnuAbi = false;
+  std::optional<uint8_t> OverrideABIVersion;
+
+  struct Symver {
+    SMLoc Loc;
+    const MCSymbol *Sym;
+    StringRef Name;
+    // True if .symver *, *@@@* or .symver *, *, remove.
+    bool KeepOriginalSym;
+  };
+  SmallVector<Symver, 0> Symvers;
+
+  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                  raw_pwrite_stream &OS, bool IsLittleEndian);
+  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                  raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+                  bool IsLittleEndian);
+
+  void reset() override;
+  void executePostLayoutBinding(MCAssembler &Asm) override;
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                              const MCSymbol &SymA,
+                                              const MCFragment &FB, bool InSet,
+                                              bool IsPCRel) const override;
+  uint64_t writeObject(MCAssembler &Asm) override;
+
+  bool hasRelocationAddend() const;
+  bool usesRela(const MCTargetOptions *TO, const MCSectionELF &Sec) const;
+
+  bool shouldRelocateWithSymbol(const MCAssembler &Asm, const MCValue &Val,
+                                const MCSymbolELF *Sym, uint64_t C,
+                                unsigned Type) const;
+
+  bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
+                       const MCSectionELF *To);
+
+  unsigned getELFHeaderEFlags() const { return ELFHeaderEFlags; }
+  void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags; }
+
+  // Mark that we have seen GNU ABI usage (e.g. SHF_GNU_RETAIN, STB_GNU_UNIQUE).
+  void markGnuAbi() { SeenGnuAbi = true; }
+  bool seenGnuAbi() const { return SeenGnuAbi; }
+
+  // Override the default e_ident[EI_ABIVERSION] in the ELF header.
+  void setOverrideABIVersion(uint8_t V) { OverrideABIVersion = V; }
+};
 } // end namespace llvm
 
 #endif // LLVM_MC_MCELFOBJECTWRITER_H
diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h
index 1cd64e406d4d0..94d14088d0f5d 100644
--- a/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/llvm/include/llvm/MC/MCELFStreamer.h
@@ -15,6 +15,7 @@
 
 namespace llvm {
 
+class ELFObjectWriter;
 class MCContext;
 class MCDataFragment;
 class MCFragment;
@@ -42,6 +43,8 @@ class MCELFStreamer : public MCObjectStreamer {
     MCObjectStreamer::reset();
   }
 
+  ELFObjectWriter &getWriter();
+
   /// \name MCStreamer Interface
   /// @{
 
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index ee93cb94100ec..bfd349e8b8e7e 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -38,7 +38,6 @@ class MCFragment {
   enum FragmentType : uint8_t {
     FT_Align,
     FT_Data,
-    FT_CompactEncodedInst,
     FT_Fill,
     FT_Nops,
     FT_Relaxable,
@@ -70,8 +69,15 @@ class MCFragment {
   FragmentType Kind;
 
 protected:
+  /// Used by subclasses for better packing.
+  ///
+  /// MCEncodedFragment
   bool HasInstructions : 1;
+  bool AlignToBundleEnd : 1;
+  /// MCDataFragment
   bool LinkerRelaxable : 1;
+  /// MCRelaxableFragment: x86-specific
+  bool AllowAutoPadding : 1;
 
   MCFragment(FragmentType Kind, bool HasInstructions);
 
@@ -116,9 +122,6 @@ class MCDummyFragment : public MCFragment {
 /// data.
 ///
 class MCEncodedFragment : public MCFragment {
-  /// Should this fragment be aligned to the end of a bundle?
-  bool AlignToBundleEnd = false;
-
   uint8_t BundlePadding = 0;
 
 protected:
@@ -136,7 +139,6 @@ class MCEncodedFragment : public MCFragment {
     default:
       return false;
     case MCFragment::FT_Relaxable:
-    case MCFragment::FT_CompactEncodedInst:
     case MCFragment::FT_Data:
     case MCFragment::FT_Dwarf:
     case MCFragment::FT_DwarfFrame:
@@ -172,29 +174,12 @@ class MCEncodedFragment : public MCFragment {
   }
 };
 
-/// Interface implemented by fragments that contain encoded instructions and/or
-/// data.
-///
-template<unsigned ContentsSize>
-class MCEncodedFragmentWithContents : public MCEncodedFragment {
-  SmallVector<char, ContentsSize> Contents;
-
-protected:
-  MCEncodedFragmentWithContents(MCFragment::FragmentType FType,
-                                bool HasInstructions)
-      : MCEncodedFragment(FType, HasInstructions) {}
-
-public:
-  SmallVectorImpl<char> &getContents() { return Contents; }
-  const SmallVectorImpl<char> &getContents() const { return Contents; }
-};
-
 /// Interface implemented by fragments that contain encoded instructions and/or
 /// data and also have fixups registered.
 ///
-template<unsigned ContentsSize, unsigned FixupsSize>
-class MCEncodedFragmentWithFixups :
-  public MCEncodedFragmentWithContents<ContentsSize> {
+template <unsigned ContentsSize, unsigned FixupsSize>
+class MCEncodedFragmentWithFixups : public MCEncodedFragment {
+  SmallVector<char, ContentsSize> Contents;
 
   /// The list of fixups in this fragment.
   SmallVector<MCFixup, FixupsSize> Fixups;
@@ -202,13 +187,16 @@ class MCEncodedFragmentWithFixups :
 protected:
   MCEncodedFragmentWithFixups(MCFragment::FragmentType FType,
                               bool HasInstructions)
-      : MCEncodedFragmentWithContents<ContentsSize>(FType, HasInstructions) {}
+      : MCEncodedFragment(FType, HasInstructions) {}
 
 public:
 
   using const_fixup_iterator = SmallVectorImpl<MCFixup>::const_iterator;
   using fixup_iterator = SmallVectorImpl<MCFixup>::iterator;
 
+  SmallVectorImpl<char> &getContents() { return Contents; }
+  const SmallVectorImpl<char> &getContents() const { return Contents; }
+
   SmallVectorImpl<MCFixup> &getFixups() { return Fixups; }
   const SmallVectorImpl<MCFixup> &getFixups() const { return Fixups; }
 
@@ -240,30 +228,12 @@ class MCDataFragment : public MCEncodedFragmentWithFixups<32, 4> {
   void setLinkerRelaxable() { LinkerRelaxable = true; }
 };
 
-/// This is a compact (memory-size-wise) fragment for holding an encoded
-/// instruction (non-relaxable) that has no fixups registered. When applicable,
-/// it can be used instead of MCDataFragment and lead to lower memory
-/// consumption.
-///
-class MCCompactEncodedInstFragment : public MCEncodedFragmentWithContents<4> {
-public:
-  MCCompactEncodedInstFragment()
-      : MCEncodedFragmentWithContents(FT_CompactEncodedInst, true) {}
-
-  static bool classof(const MCFragment *F) {
-    return F->getKind() == MCFragment::FT_CompactEncodedInst;
-  }
-};
-
 /// A relaxable fragment holds on to its MCInst, since it may need to be
 /// relaxed during the assembler layout and relaxation stage.
 ///
 class MCRelaxableFragment : public MCEncodedFragmentWithFixups<8, 1> {
-
   /// The instruction this is a fragment for.
   MCInst Inst;
-  /// Can we auto pad the instruction?
-  bool AllowAutoPadding = false;
 
 public:
   MCRelaxableFragment(const MCInst &Inst, const MCSubtargetInfo &STI)
diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h
index c26c75e98844c..77652c7ff53a3 100644
--- a/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -12,11 +12,14 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/VersionTuple.h"
 #include <cstdint>
 #include <memory>
 #include <string>
@@ -80,7 +83,7 @@ class MCMachObjectTargetWriter : public MCObjectTargetWriter {
   /// @}
 };
 
-class MachObjectWriter : public MCObjectWriter {
+class MachObjectWriter final : public MCObjectWriter {
 public:
   struct DataRegionData {
     MachO::DataRegionType Kind;
@@ -88,6 +91,21 @@ class MachObjectWriter : public MCObjectWriter {
     MCSymbol *End;
   };
 
+  // A Major version of 0 indicates that no version information was supplied
+  // and so the corresponding load command should not be emitted.
+  using VersionInfoType = struct {
+    bool EmitBuildVersion;
+    union {
+      MCVersionMinType Type;        ///< Used when EmitBuildVersion==false.
+      MachO::PlatformType Platform; ///< Used when EmitBuildVersion==true.
+    } TypeOrPlatform;
+    unsigned Major;
+    unsigned Minor;
+    unsigned Update;
+    /// An optional version of the SDK that was used to build the source.
+    VersionTuple SDKVersion;
+  };
+
 private:
   /// Helper struct for containing some precomputed information on symbols.
   struct MachSymbolData {
@@ -140,6 +158,15 @@ class MachObjectWriter : public MCObjectWriter {
 
   /// @}
 
+  // Used to communicate Linker Optimization Hint information.
+  MCLOHContainer LOHContainer;
+
+  VersionInfoType VersionInfo{};
+  VersionInfoType TargetVariantVersionInfo{};
+
+  // The list of linker options for LC_LINKER_OPTION.
+  std::vector<std::vector<std::string>> LinkerOptions;
+
   MachSymbolData *findSymbolData(const MCSymbol &Sym);
 
   void writeWithPadding(StringRef Str, uint64_t Size);
@@ -177,6 +204,7 @@ class MachObjectWriter : public MCObjectWriter {
     return SectionOrder;
   }
   SectionAddrMap &getSectionAddressMap() { return SectionAddress; }
+  MCLOHContainer &getLOHContainer() { return LOHContainer; }
 
   uint64_t getSectionAddress(const MCSection *Sec) const {
     return SectionAddress.lookup(Sec);
@@ -192,6 +220,42 @@ class MachObjectWriter : public MCObjectWriter {
 
   bool doesSymbolRequireExternRelocation(const MCSymbol &S);
 
+  /// Mach-O deployment target version information.
+  void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor,
+                     unsigned Update,
+                     VersionTuple SDKVersion = VersionTuple()) {
+    VersionInfo.EmitBuildVersion = false;
+    VersionInfo.TypeOrPlatform.Type = Type;
+    VersionInfo.Major = Major;
+    VersionInfo.Minor = Minor;
+    VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
+  }
+  void setBuildVersion(MachO::PlatformType Platform, unsigned Major,
+                       unsigned Minor, unsigned Update,
+                       VersionTuple SDKVersion = VersionTuple()) {
+    VersionInfo.EmitBuildVersion = true;
+    VersionInfo.TypeOrPlatform.Platform = Platform;
+    VersionInfo.Major = Major;
+    VersionInfo.Minor = Minor;
+    VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
+  }
+  void setTargetVariantBuildVersion(MachO::PlatformType Platform,
+                                    unsigned Major, unsigned Minor,
+                                    unsigned Update, VersionTuple SDKVersion) {
+    TargetVariantVersionInfo.EmitBuildVersion = true;
+    TargetVariantVersionInfo.TypeOrPlatform.Platform = Platform;
+    TargetVariantVersionInfo.Major = Major;
+    TargetVariantVersionInfo.Minor = Minor;
+    TargetVariantVersionInfo.Update = Update;
+    TargetVariantVersionInfo.SDKVersion = SDKVersion;
+  }
+
+  std::vector<std::vector<std::string>> &getLinkerOptions() {
+    return LinkerOptions;
+  }
+
   /// @}
 
   /// \name Target Writer Proxy Accessors
@@ -289,18 +353,6 @@ class MachObjectWriter : public MCObjectWriter {
 
   uint64_t writeObject(MCAssembler &Asm) override;
 };
-
-/// Construct a new Mach-O writer instance.
-///
-/// This routine takes ownership of the target writer subclass.
-///
-/// \param MOTW - The target specific Mach-O writer subclass.
-/// \param OS - The stream to write to.
-/// \returns The constructed object writer.
-std::unique_ptr<MCObjectWriter>
-createMachObjectWriter(std::unique_ptr<MCMachObjectTargetWriter> MOTW,
-                       raw_pwrite_stream &OS, bool IsLittleEndian);
-
 } // end namespace llvm
 
 #endif // LLVM_MC_MCMACHOBJECTWRITER_H
diff --git a/llvm/include/llvm/MC/MCObjectWriter.h b/llvm/include/llvm/MC/MCObjectWriter.h
index 01114469c241e..81ba6ffd5d44e 100644
--- a/llvm/include/llvm/MC/MCObjectWriter.h
+++ b/llvm/include/llvm/MC/MCObjectWriter.h
@@ -32,8 +32,20 @@ class MCValue;
 /// should be emitted as part of writeObject().
 class MCObjectWriter {
 protected:
+  /// List of declared file names
+  SmallVector<std::pair<std::string, size_t>, 0> FileNames;
+  // XCOFF specific: Optional compiler version.
+  std::string CompilerVersion;
   std::vector<const MCSymbol *> AddrsigSyms;
   bool EmitAddrsigSection = false;
+  bool SubsectionsViaSymbols = false;
+
+  struct CGProfileEntry {
+    const MCSymbolRefExpr *From;
+    const MCSymbolRefExpr *To;
+    uint64_t Count;
+  };
+  SmallVector<CGProfileEntry, 0> CGProfile;
 
   MCObjectWriter() = default;
 
@@ -43,7 +55,7 @@ class MCObjectWriter {
   virtual ~MCObjectWriter();
 
   /// lifetime management
-  virtual void reset() {}
+  virtual void reset();
 
   /// \name High-Level API
   /// @{
@@ -81,11 +93,13 @@ class MCObjectWriter {
                                                       bool InSet,
                                                       bool IsPCRel) const;
 
-  /// ELF only. Mark that we have seen GNU ABI usage (e.g. SHF_GNU_RETAIN).
-  virtual void markGnuAbi() {}
-
-  /// ELF only, override the default ABIVersion in the ELF header.
-  virtual void setOverrideABIVersion(uint8_t ABIVersion) {}
+  MutableArrayRef<std::pair<std::string, size_t>> getFileNames() {
+    return FileNames;
+  }
+  void addFileName(MCAssembler &Asm, StringRef FileName);
+  void setCompilerVersion(StringRef CompilerVers) {
+    CompilerVersion = CompilerVers;
+  }
 
   /// Tell the object writer to emit an address-significance table during
   /// writeObject(). If this function is not called, all symbols are treated as
@@ -99,15 +113,12 @@ class MCObjectWriter {
   void addAddrsigSymbol(const MCSymbol *Sym) { AddrsigSyms.push_back(Sym); }
 
   std::vector<const MCSymbol *> &getAddrsigSyms() { return AddrsigSyms; }
+  SmallVector<CGProfileEntry, 0> &getCGProfile() { return CGProfile; }
+
+  // Mach-O specific: Whether .subsections_via_symbols is enabled.
+  bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
+  void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }
 
-  virtual void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
-                                 unsigned LanguageCode, unsigned ReasonCode,
-                                 unsigned FunctionSize, bool hasDebug) {
-    report_fatal_error("addExceptionEntry is only supported on XCOFF targets");
-  }
-  virtual void addCInfoSymEntry(StringRef Name, StringRef Metadata) {
-    report_fatal_error("addCInfoSymEntry is only supported on XCOFF targets");
-  }
   /// Write the object file and returns the number of bytes written.
   ///
   /// This routine is called by the assembler after layout and relaxation is
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 4d82da70f3df8..54a7f832e5008 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -61,6 +61,7 @@
 #include "llvm/IR/PseudoProbe.h"
 #include "llvm/Support/ErrorOr.h"
 #include <list>
+#include <map>
 #include <memory>
 #include <string>
 #include <tuple>
@@ -102,8 +103,7 @@ using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
 using GUIDProbeFunctionMap =
     std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
 // Address to pseudo probes map.
-using AddressProbesMap =
-    std::unordered_map<uint64_t, std::list<MCDecodedPseudoProbe>>;
+using AddressProbesMap = std::map<uint64_t, std::list<MCDecodedPseudoProbe>>;
 
 class MCDecodedPseudoProbeInlineTree;
 
@@ -280,8 +280,6 @@ class MCDecodedPseudoProbeInlineTree
                                          MCDecodedPseudoProbeInlineTree> {
 public:
   InlineSite ISite;
-  // Used for decoding
-  uint32_t ChildrenToProcess = 0;
 
   MCDecodedPseudoProbeInlineTree() = default;
   MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};
diff --git a/llvm/include/llvm/MC/MCSPIRVObjectWriter.h b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
index a8baf96b83842..18bdb8be0145e 100644
--- a/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
+++ b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
@@ -10,6 +10,8 @@
 #define LLVM_MC_MCSPIRVOBJECTWRITER_H
 
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
@@ -26,6 +28,32 @@ class MCSPIRVObjectTargetWriter : public MCObjectTargetWriter {
   }
 };
 
+class SPIRVObjectWriter final : public MCObjectWriter {
+  support::endian::Writer W;
+  std::unique_ptr<MCSPIRVObjectTargetWriter> TargetObjectWriter;
+
+  struct VersionInfoType {
+    unsigned Major = 0;
+    unsigned Minor = 0;
+    unsigned Bound = 0;
+  } VersionInfo;
+
+public:
+  SPIRVObjectWriter(std::unique_ptr<MCSPIRVObjectTargetWriter> MOTW,
+                    raw_pwrite_stream &OS)
+      : W(OS, llvm::endianness::little), TargetObjectWriter(std::move(MOTW)) {}
+
+  void setBuildVersion(unsigned Major, unsigned Minor, unsigned Bound);
+
+private:
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {}
+
+  uint64_t writeObject(MCAssembler &Asm) override;
+  void writeHeader(const MCAssembler &Asm);
+};
+
 /// Construct a new SPIR-V writer instance.
 ///
 /// \param MOTW - The target specific SPIR-V writer subclass.
diff --git a/llvm/include/llvm/MC/MCSymbol.h b/llvm/include/llvm/MC/MCSymbol.h
index 5f06249b479eb..6fd5c02d04531 100644
--- a/llvm/include/llvm/MC/MCSymbol.h
+++ b/llvm/include/llvm/MC/MCSymbol.h
@@ -99,7 +99,7 @@ class MCSymbol {
   /// uses binding instead of this bit.
   mutable unsigned IsExternal : 1;
 
-  /// This symbol is private extern.
+  /// Mach-O specific: This symbol is private extern.
   mutable unsigned IsPrivateExtern : 1;
 
   /// This symbol is weak external.
@@ -403,12 +403,11 @@ class MCSymbol {
     return Fragment;
   }
 
+  // For ELF, use MCSymbolELF::setBinding instead.
   bool isExternal() const { return IsExternal; }
   void setExternal(bool Value) const { IsExternal = Value; }
 
-  bool isPrivateExtern() const { return IsPrivateExtern; }
-  void setPrivateExtern(bool Value) { IsPrivateExtern = Value; }
-
+  // COFF-specific
   bool isWeakExternal() const { return IsWeakExternal; }
 
   /// print - Print the value to the stream \p OS.
diff --git a/llvm/include/llvm/MC/MCSymbolMachO.h b/llvm/include/llvm/MC/MCSymbolMachO.h
index 730fbea0059a6..97c9abfcf5f2c 100644
--- a/llvm/include/llvm/MC/MCSymbolMachO.h
+++ b/llvm/include/llvm/MC/MCSymbolMachO.h
@@ -46,6 +46,9 @@ class MCSymbolMachO : public MCSymbol {
   MCSymbolMachO(const MCSymbolTableEntry *Name, bool isTemporary)
       : MCSymbol(SymbolKindMachO, Name, isTemporary) {}
 
+  bool isPrivateExtern() const { return IsPrivateExtern; }
+  void setPrivateExtern(bool Value) { IsPrivateExtern = Value; }
+
   // Reference type methods.
 
   void clearReferenceType() const {
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
index 307800e73c687..a4ede61e45099 100644
--- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -43,6 +43,37 @@ class MCWinCOFFObjectTargetWriter : public MCObjectTargetWriter {
   virtual bool recordRelocation(const MCFixup &) const { return true; }
 };
 
+class WinCOFFWriter;
+
+class WinCOFFObjectWriter final : public MCObjectWriter {
+  friend class WinCOFFWriter;
+
+  std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
+  std::unique_ptr<WinCOFFWriter> ObjWriter, DwoWriter;
+  bool IncrementalLinkerCompatible = false;
+
+public:
+  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
+                      raw_pwrite_stream &OS);
+  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
+                      raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS);
+
+  // MCObjectWriter interface implementation.
+  void reset() override;
+  void setIncrementalLinkerCompatible(bool Value) {
+    IncrementalLinkerCompatible = Value;
+  }
+  void executePostLayoutBinding(MCAssembler &Asm) override;
+  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                              const MCSymbol &SymA,
+                                              const MCFragment &FB, bool InSet,
+                                              bool IsPCRel) const override;
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+  uint64_t writeObject(MCAssembler &Asm) override;
+};
+
 /// Construct a new Win COFF writer instance.
 ///
 /// \param MOTW - The target specific WinCOFF writer subclass.
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index 893a70f74ee3b..5c39d80538944 100644
--- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -22,6 +22,7 @@ class MCSection;
 class MCSubtargetInfo;
 class MCSymbol;
 class StringRef;
+class WinCOFFObjectWriter;
 class raw_pwrite_stream;
 
 class MCWinCOFFStreamer : public MCObjectStreamer {
@@ -36,6 +37,8 @@ class MCWinCOFFStreamer : public MCObjectStreamer {
     MCObjectStreamer::reset();
   }
 
+  WinCOFFObjectWriter &getWriter();
+
   /// \name MCStreamer interface
   /// \{
 
diff --git a/llvm/include/llvm/MC/MCXCOFFObjectWriter.h b/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
index faad2ceb26910..c0e32a70172d8 100644
--- a/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
@@ -43,6 +43,15 @@ std::unique_ptr<MCObjectWriter>
 createXCOFFObjectWriter(std::unique_ptr<MCXCOFFObjectTargetWriter> MOTW,
                         raw_pwrite_stream &OS);
 
+namespace XCOFF {
+void addExceptionEntry(MCObjectWriter &Writer, const MCSymbol *Symbol,
+                       const MCSymbol *Trap, unsigned LanguageCode,
+                       unsigned ReasonCode, unsigned FunctionSize,
+                       bool hasDebug);
+void addCInfoSymEntry(MCObjectWriter &Writer, StringRef Name,
+                      StringRef Metadata);
+} // namespace XCOFF
+
 } // end namespace llvm
 
 #endif // LLVM_MC_MCXCOFFOBJECTWRITER_H
diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index 5038b87cd1dc9..42d510c17bce3 100644
--- a/llvm/include/llvm/MC/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/TargetParser/Triple.h"
@@ -85,9 +86,8 @@ MCStreamer *createNullStreamer(MCContext &Ctx);
 /// the assembly.
 MCStreamer *
 createAsmStreamer(MCContext &Ctx, std::unique_ptr<formatted_raw_ostream> OS,
-                  bool isVerboseAsm, bool useDwarfDirectory,
                   MCInstPrinter *InstPrint, std::unique_ptr<MCCodeEmitter> &&CE,
-                  std::unique_ptr<MCAsmBackend> &&TAB, bool ShowInst);
+                  std::unique_ptr<MCAsmBackend> &&TAB);
 
 MCStreamer *createELFStreamer(MCContext &Ctx,
                               std::unique_ptr<MCAsmBackend> &&TAB,
@@ -107,10 +107,6 @@ MCStreamer *createWasmStreamer(MCContext &Ctx,
                                std::unique_ptr<MCAsmBackend> &&TAB,
                                std::unique_ptr<MCObjectWriter> &&OW,
                                std::unique_ptr<MCCodeEmitter> &&CE);
-MCStreamer *createXCOFFStreamer(MCContext &Ctx,
-                                std::unique_ptr<MCAsmBackend> &&TAB,
-                                std::unique_ptr<MCObjectWriter> &&OW,
-                                std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createSPIRVStreamer(MCContext &Ctx,
                                 std::unique_ptr<MCAsmBackend> &&TAB,
                                 std::unique_ptr<MCObjectWriter> &&OW,
@@ -194,23 +190,12 @@ class Target {
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
                       std::unique_ptr<MCCodeEmitter> &&Emitter);
-  using GOFFStreamerCtorTy =
-      MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
-                      std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using MachOStreamerCtorTy =
       MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                      bool DWARFMustBeAtTheEnd);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using COFFStreamerCtorTy =
       MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
-                      std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                      bool IncrementalLinkerCompatible);
-  using WasmStreamerCtorTy =
-      MCStreamer *(*)(const Triple &T, MCContext &Ctx,
-                      std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
                       std::unique_ptr<MCCodeEmitter> &&Emitter);
   using XCOFFStreamerCtorTy =
@@ -218,22 +203,11 @@ class Target {
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
                       std::unique_ptr<MCCodeEmitter> &&Emitter);
-  using SPIRVStreamerCtorTy =
-      MCStreamer *(*)(const Triple &T, MCContext &Ctx,
-                      std::unique_ptr<MCAsmBackend> &&TAB,
-                      std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter);
-
-  using DXContainerStreamerCtorTy =
-      MCStreamer *(*)(const Triple &T, MCContext &Ctx,
-                      std::unique_ptr<MCAsmBackend> &&TAB,
-                      std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter);
 
   using NullTargetStreamerCtorTy = MCTargetStreamer *(*)(MCStreamer &S);
-  using AsmTargetStreamerCtorTy = MCTargetStreamer *(*)(
-      MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint,
-      bool IsVerboseAsm);
+  using AsmTargetStreamerCtorTy =
+      MCTargetStreamer *(*)(MCStreamer &S, formatted_raw_ostream &OS,
+                            MCInstPrinter *InstPrint);
   using ObjectTargetStreamerCtorTy = MCTargetStreamer *(*)(
       MCStreamer &S, const MCSubtargetInfo &STI);
   using MCRelocationInfoCtorTy = MCRelocationInfo *(*)(const Triple &TT,
@@ -330,13 +304,9 @@ class Target {
 
   // Construction functions for the various object formats, if registered.
   COFFStreamerCtorTy COFFStreamerCtorFn = nullptr;
-  GOFFStreamerCtorTy GOFFStreamerCtorFn = nullptr;
   MachOStreamerCtorTy MachOStreamerCtorFn = nullptr;
   ELFStreamerCtorTy ELFStreamerCtorFn = nullptr;
-  WasmStreamerCtorTy WasmStreamerCtorFn = nullptr;
   XCOFFStreamerCtorTy XCOFFStreamerCtorFn = nullptr;
-  SPIRVStreamerCtorTy SPIRVStreamerCtorFn = nullptr;
-  DXContainerStreamerCtorTy DXContainerStreamerCtorFn = nullptr;
 
   /// Construction function for this target's null TargetStreamer, if
   /// registered (default = nullptr).
@@ -555,107 +525,36 @@ class Target {
   /// \param TAB The target assembler backend object. Takes ownership.
   /// \param OW The stream object.
   /// \param Emitter The target independent assembler object.Takes ownership.
-  /// \param RelaxAll Relax all fixups?
+  MCStreamer *createMCObjectStreamer(const Triple &T, MCContext &Ctx,
+                                     std::unique_ptr<MCAsmBackend> TAB,
+                                     std::unique_ptr<MCObjectWriter> OW,
+                                     std::unique_ptr<MCCodeEmitter> Emitter,
+                                     const MCSubtargetInfo &STI) const;
+  LLVM_DEPRECATED("Use the overload without the 3 trailing bool", "")
   MCStreamer *createMCObjectStreamer(const Triple &T, MCContext &Ctx,
                                      std::unique_ptr<MCAsmBackend> &&TAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
                                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     const MCSubtargetInfo &STI, bool,
-                                     bool IncrementalLinkerCompatible,
-                                     bool DWARFMustBeAtTheEnd) const {
-    MCStreamer *S = nullptr;
-    switch (T.getObjectFormat()) {
-    case Triple::UnknownObjectFormat:
-      llvm_unreachable("Unknown object format");
-    case Triple::COFF:
-      assert((T.isOSWindows() || T.isUEFI()) &&
-             "only Windows and UEFI COFF are supported");
-      S = COFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                             std::move(Emitter), IncrementalLinkerCompatible);
-      break;
-    case Triple::MachO:
-      if (MachOStreamerCtorFn)
-        S = MachOStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), DWARFMustBeAtTheEnd);
-      else
-        S = createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), DWARFMustBeAtTheEnd);
-      break;
-    case Triple::ELF:
-      if (ELFStreamerCtorFn)
-        S = ELFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                              std::move(Emitter));
-      else
-        S = createELFStreamer(Ctx, std::move(TAB), std::move(OW),
-                              std::move(Emitter));
-      break;
-    case Triple::Wasm:
-      if (WasmStreamerCtorFn)
-        S = WasmStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter));
-      else
-        S = createWasmStreamer(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter));
-      break;
-    case Triple::GOFF:
-      if (GOFFStreamerCtorFn)
-        S = GOFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter));
-      else
-        S = createGOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter));
-      break;
-    case Triple::XCOFF:
-      if (XCOFFStreamerCtorFn)
-        S = XCOFFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter));
-      else
-        S = createXCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter));
-      break;
-    case Triple::SPIRV:
-      if (SPIRVStreamerCtorFn)
-        S = SPIRVStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter));
-      else
-        S = createSPIRVStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter));
-      break;
-    case Triple::DXContainer:
-      if (DXContainerStreamerCtorFn)
-        S = DXContainerStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                                      std::move(Emitter));
-      else
-        S = createDXContainerStreamer(Ctx, std::move(TAB), std::move(OW),
-                                      std::move(Emitter));
-      break;
-    }
-    if (ObjectTargetStreamerCtorFn)
-      ObjectTargetStreamerCtorFn(*S, STI);
-    return S;
-  }
+                                     const MCSubtargetInfo &STI, bool, bool,
+                                     bool) const;
 
   MCStreamer *createAsmStreamer(MCContext &Ctx,
                                 std::unique_ptr<formatted_raw_ostream> OS,
-                                bool IsVerboseAsm, bool UseDwarfDirectory,
-                                MCInstPrinter *InstPrint,
-                                std::unique_ptr<MCCodeEmitter> &&CE,
-                                std::unique_ptr<MCAsmBackend> &&TAB,
-                                bool ShowInst) const {
-    formatted_raw_ostream &OSRef = *OS;
-    MCStreamer *S = llvm::createAsmStreamer(
-        Ctx, std::move(OS), IsVerboseAsm, UseDwarfDirectory, InstPrint,
-        std::move(CE), std::move(TAB), ShowInst);
-    createAsmTargetStreamer(*S, OSRef, InstPrint, IsVerboseAsm);
-    return S;
-  }
+                                MCInstPrinter *IP,
+                                std::unique_ptr<MCCodeEmitter> CE,
+                                std::unique_ptr<MCAsmBackend> TAB) const;
+  LLVM_DEPRECATED("Use the overload without the 3 unused bool", "")
+  MCStreamer *
+  createAsmStreamer(MCContext &Ctx, std::unique_ptr<formatted_raw_ostream> OS,
+                    bool IsVerboseAsm, bool UseDwarfDirectory,
+                    MCInstPrinter *IP, std::unique_ptr<MCCodeEmitter> &&CE,
+                    std::unique_ptr<MCAsmBackend> &&TAB, bool ShowInst) const;
 
   MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
                                             formatted_raw_ostream &OS,
-                                            MCInstPrinter *InstPrint,
-                                            bool IsVerboseAsm) const {
+                                            MCInstPrinter *InstPrint) const {
     if (AsmTargetStreamerCtorFn)
-      return AsmTargetStreamerCtorFn(S, OS, InstPrint, IsVerboseAsm);
+      return AsmTargetStreamerCtorFn(S, OS, InstPrint);
     return nullptr;
   }
 
@@ -1011,10 +910,6 @@ struct TargetRegistry {
     T.COFFStreamerCtorFn = Fn;
   }
 
-  static void RegisterGOFFStreamer(Target &T, Target::GOFFStreamerCtorTy Fn) {
-    T.GOFFStreamerCtorFn = Fn;
-  }
-
   static void RegisterMachOStreamer(Target &T, Target::MachOStreamerCtorTy Fn) {
     T.MachOStreamerCtorFn = Fn;
   }
@@ -1023,18 +918,6 @@ struct TargetRegistry {
     T.ELFStreamerCtorFn = Fn;
   }
 
-  static void RegisterSPIRVStreamer(Target &T, Target::SPIRVStreamerCtorTy Fn) {
-    T.SPIRVStreamerCtorFn = Fn;
-  }
-
-  static void RegisterDXContainerStreamer(Target &T, Target::DXContainerStreamerCtorTy Fn) {
-    T.DXContainerStreamerCtorFn = Fn;
-  }
-
-  static void RegisterWasmStreamer(Target &T, Target::WasmStreamerCtorTy Fn) {
-    T.WasmStreamerCtorFn = Fn;
-  }
-
   static void RegisterXCOFFStreamer(Target &T, Target::XCOFFStreamerCtorTy Fn) {
     T.XCOFFStreamerCtorFn = Fn;
   }
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 7f9d90d528b3e..5ae09760e9a54 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -151,6 +151,18 @@ class NameMatcher {
   }
 };
 
+enum class AdjustKind { Set, Add, Subtract };
+
+struct AddressUpdate {
+  uint64_t Value = 0;
+  AdjustKind Kind = AdjustKind::Add;
+};
+
+struct SectionPatternAddressUpdate {
+  NameMatcher SectionPattern;
+  AddressUpdate Update;
+};
+
 enum class SymbolFlag {
   Global,
   Local,
@@ -219,6 +231,7 @@ struct CommonConfig {
   SmallVector<NewSectionInfo, 0> AddSection;
   SmallVector<StringRef, 0> DumpSection;
   SmallVector<NewSectionInfo, 0> UpdateSection;
+  SmallVector<SectionPatternAddressUpdate, 0> ChangeSectionAddress;
 
   // Section matchers
   NameMatcher KeepSection;
diff --git a/llvm/include/llvm/Object/ArchiveWriter.h b/llvm/include/llvm/Object/ArchiveWriter.h
index e41b3d51173d4..95b81f5f64c8f 100644
--- a/llvm/include/llvm/Object/ArchiveWriter.h
+++ b/llvm/include/llvm/Object/ArchiveWriter.h
@@ -48,25 +48,30 @@ enum class SymtabWritingMode {
   BigArchive64  // Only write the 64-bit symbol table.
 };
 
+void warnToStderr(Error Err);
+
 // Write an archive directly to an output stream.
 Error writeArchiveToStream(raw_ostream &Out,
                            ArrayRef<NewArchiveMember> NewMembers,
                            SymtabWritingMode WriteSymtab,
                            object::Archive::Kind Kind, bool Deterministic,
-                           bool Thin, std::optional<bool> IsEC = std::nullopt);
+                           bool Thin, std::optional<bool> IsEC = std::nullopt,
+                           function_ref<void(Error)> Warn = warnToStderr);
 
 Error writeArchive(StringRef ArcName, ArrayRef<NewArchiveMember> NewMembers,
                    SymtabWritingMode WriteSymtab, object::Archive::Kind Kind,
                    bool Deterministic, bool Thin,
                    std::unique_ptr<MemoryBuffer> OldArchiveBuf = nullptr,
-                   std::optional<bool> IsEC = std::nullopt);
+                   std::optional<bool> IsEC = std::nullopt,
+                   function_ref<void(Error)> Warn = warnToStderr);
 
 // writeArchiveToBuffer is similar to writeArchive but returns the Archive in a
 // buffer instead of writing it out to a file.
 Expected<std::unique_ptr<MemoryBuffer>>
 writeArchiveToBuffer(ArrayRef<NewArchiveMember> NewMembers,
                      SymtabWritingMode WriteSymtab, object::Archive::Kind Kind,
-                     bool Deterministic, bool Thin);
+                     bool Deterministic, bool Thin,
+                     function_ref<void(Error)> Warn = warnToStderr);
 }
 
 #endif
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 6bf42de89e1c4..8a363741ee628 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -127,8 +127,8 @@ template <class T> struct DataRegion {
 };
 
 template <class ELFT>
-std::string getSecIndexForError(const ELFFile<ELFT> &Obj,
-                                const typename ELFT::Shdr &Sec) {
+static std::string getSecIndexForError(const ELFFile<ELFT> &Obj,
+                                       const typename ELFT::Shdr &Sec) {
   auto TableOrErr = Obj.sections();
   if (TableOrErr)
     return "[index " + std::to_string(&Sec - &TableOrErr->front()) + "]";
@@ -151,8 +151,8 @@ static std::string describe(const ELFFile<ELFT> &Obj,
 }
 
 template <class ELFT>
-std::string getPhdrIndexForError(const ELFFile<ELFT> &Obj,
-                                 const typename ELFT::Phdr &Phdr) {
+static std::string getPhdrIndexForError(const ELFFile<ELFT> &Obj,
+                                        const typename ELFT::Phdr &Phdr) {
   auto Headers = Obj.program_headers();
   if (Headers)
     return ("[index " + Twine(&Phdr - &Headers->front()) + "]").str();
@@ -166,8 +166,8 @@ static inline Error defaultWarningHandler(const Twine &Msg) {
 }
 
 template <class ELFT>
-bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
-                         const typename ELFT::Shdr &Sec) {
+static bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
+                                const typename ELFT::Shdr &Sec) {
   // SHT_NOBITS sections don't need to have an offset inside the segment.
   if (Sec.sh_type == ELF::SHT_NOBITS)
     return true;
@@ -184,8 +184,8 @@ bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
 // Check that an allocatable section belongs to a virtual address
 // space of a segment.
 template <class ELFT>
-bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
-                     const typename ELFT::Shdr &Sec) {
+static bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
+                            const typename ELFT::Shdr &Sec) {
   if (!(Sec.sh_flags & ELF::SHF_ALLOC))
     return true;
 
@@ -203,8 +203,8 @@ bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
 }
 
 template <class ELFT>
-bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
-                        const typename ELFT::Shdr &Sec) {
+static bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
+                               const typename ELFT::Shdr &Sec) {
   return checkSectionOffsets<ELFT>(Phdr, Sec) &&
          checkSectionVMA<ELFT>(Phdr, Sec);
 }
@@ -212,7 +212,7 @@ bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
 // HdrHandler is called once with the number of relocations and whether the
 // relocations have addends. EntryHandler is called once per decoded relocation.
 template <bool Is64>
-Error decodeCrel(
+static Error decodeCrel(
     ArrayRef<uint8_t> Content,
     function_ref<void(uint64_t /*relocation count*/, bool /*explicit addends*/)>
         HdrHandler,
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 64e8a0a6c36c2..0f0698d6652ba 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -17,6 +17,7 @@
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)
 #endif
 MODULE_ANALYSIS("collector-metadata", CollectorMetadataAnalysis())
+MODULE_ANALYSIS("machine-module-info", MachineModuleAnalysis())
 MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
 #undef MODULE_ANALYSIS
 
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 06d35b8f4ab10..28fcde45f2fb1 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -398,9 +398,9 @@ class PassBuilder {
   /// returns false.
   Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
-  /// Parse RegClassFilterName to get RegClassFilterFunc.
-  std::optional<RegClassFilterFunc>
-  parseRegAllocFilter(StringRef RegClassFilterName);
+  /// Parse RegAllocFilterName to get RegAllocFilterFunc.
+  std::optional<RegAllocFilterFunc>
+  parseRegAllocFilter(StringRef RegAllocFilterName);
 
   /// Print pass names.
   void printPassNames(raw_ostream &OS);
@@ -594,7 +594,7 @@ class PassBuilder {
   /// needs it. E.g. AMDGPU requires regalloc passes can handle sgpr and vgpr
   /// separately.
   void registerRegClassFilterParsingCallback(
-      const std::function<RegClassFilterFunc(StringRef)> &C) {
+      const std::function<RegAllocFilterFunc(StringRef)> &C) {
     RegClassFilterParsingCallbacks.push_back(C);
   }
 
@@ -815,7 +815,7 @@ class PassBuilder {
               2>
       MachineFunctionPipelineParsingCallbacks;
   // Callbacks to parse `filter` parameter in register allocation passes
-  SmallVector<std::function<RegClassFilterFunc(StringRef)>, 2>
+  SmallVector<std::function<RegAllocFilterFunc(StringRef)>, 2>
       RegClassFilterParsingCallbacks;
 };
 
@@ -952,6 +952,10 @@ class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
     return Result();
   }
 };
+
+/// Common option used by multiple tools to print pipeline passes
+extern cl::opt<bool> PrintPipelinePasses;
+
 }
 
 #endif
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index d449091078c93..824dcf2372c83 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -174,6 +174,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
   return INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_VAR);
 }
 
+inline StringRef getInstrProfBitmapBiasVarName() {
+  return INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_BITMAP_BIAS_VAR);
+}
+
 /// Return the marker used to separate PGO names during serialization.
 inline StringRef getInstrProfNameSeparator() { return "\01"; }
 
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index e9866d94b762c..b9df3266fbcf8 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -738,7 +738,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_VAR __llvm_profile_bitmap_bias
 #define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
+#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
index a19b3f51d642d..28f05e9073a8a 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
@@ -73,7 +73,8 @@ class PGOContextualProfile final {
 };
 
 class PGOCtxProfileReader final {
-  BitstreamCursor &Cursor;
+  StringRef Magic;
+  BitstreamCursor Cursor;
   Expected<BitstreamEntry> advance();
   Error readMetadata();
   Error wrongValue(const Twine &);
@@ -84,7 +85,9 @@ class PGOCtxProfileReader final {
   bool canReadContext();
 
 public:
-  PGOCtxProfileReader(BitstreamCursor &Cursor) : Cursor(Cursor) {}
+  PGOCtxProfileReader(StringRef Buffer)
+      : Magic(Buffer.substr(0, PGOCtxProfileWriter::ContainerMagic.size())),
+        Cursor(Buffer.substr(PGOCtxProfileWriter::ContainerMagic.size())) {}
 
   Expected<std::map<GlobalValue::GUID, PGOContextualProfile>> loadContexts();
 };
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index ecee7a2cfb539..db9a0fd77f835 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 #define LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/Bitstream/BitstreamWriter.h"
 #include "llvm/ProfileData/CtxInstrContextNode.h"
@@ -68,15 +69,7 @@ class PGOCtxProfileWriter final {
 
 public:
   PGOCtxProfileWriter(raw_ostream &Out,
-                      std::optional<unsigned> VersionOverride = std::nullopt)
-      : Writer(Out, 0) {
-    Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID,
-                         CodeLen);
-    const auto Version = VersionOverride ? *VersionOverride : CurrentVersion;
-    Writer.EmitRecord(PGOCtxProfileRecords::Version,
-                      SmallVector<unsigned, 1>({Version}));
-  }
-
+                      std::optional<unsigned> VersionOverride = std::nullopt);
   ~PGOCtxProfileWriter() { Writer.ExitBlock(); }
 
   void write(const ctx_profile::ContextNode &);
@@ -85,6 +78,7 @@ class PGOCtxProfileWriter final {
   static constexpr unsigned CodeLen = 2;
   static constexpr uint32_t CurrentVersion = 1;
   static constexpr unsigned VBREncodingBits = 6;
+  static constexpr StringRef ContainerMagic = "CTXP";
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index c5d59ba47ca31..313b541ddcaca 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -18,29 +18,57 @@
 //
 // namespace sandboxir {
 //
-//        +- Argument                   +- BinaryOperator
-//        |                             |
-// Value -+- BasicBlock                 +- BranchInst
-//        |                             |
-//        +- Function   +- Constant     +- CastInst
-//        |             |               |
-//        +- User ------+- Instruction -+- CallInst
+// Value -+- Argument
+//        |
+//        +- BasicBlock
+//        |
+//        +- User ------+- Constant ------ Function
+//                      |
+//                      +- Instruction -+- BinaryOperator
 //                                      |
-//                                      +- CmpInst
+//                                      +- BranchInst
 //                                      |
-//                                      +- ExtractElementInst
+//                                      +- CastInst --------+- AddrSpaceCastInst
+//                                      |                   |
+//                                      |                   +- BitCastInst
+//                                      |                   |
+//                                      |                   +- FPExtInst
+//                                      |                   |
+//                                      |                   +- FPToSIInst
+//                                      |                   |
+//                                      |                   +- FPToUIInst
+//                                      |                   |
+//                                      |                   +- FPTruncInst
+//                                      |                   |
+//                                      |                   +- IntToPtrInst
+//                                      |                   |
+//                                      |                   +- PtrToIntInst
+//                                      |                   |
+//                                      |                   +- SExtInst
+//                                      |                   |
+//                                      |                   +- SIToFPInst
+//                                      |                   |
+//                                      |                   +- TruncInst
+//                                      |                   |
+//                                      |                   +- UIToFPInst
+//                                      |                   |
+//                                      |                   +- ZExtInst
+//                                      |
+//                                      +- CallBase -----------+- CallBrInst
+//                                      |                      |
+//                                      +- CmpInst             +- CallInst
+//                                      |                      |
+//                                      +- ExtractElementInst  +- InvokeInst
 //                                      |
 //                                      +- GetElementPtrInst
 //                                      |
 //                                      +- InsertElementInst
 //                                      |
-//                                      +- LoadInst
-//                                      |
 //                                      +- OpaqueInst
 //                                      |
 //                                      +- PHINode
 //                                      |
-//                                      +- RetInst
+//                                      +- ReturnInst
 //                                      |
 //                                      +- SelectInst
 //                                      |
@@ -48,6 +76,10 @@
 //                                      |
 //                                      +- StoreInst
 //                                      |
+//                                      +- UnaryInstruction -+- LoadInst
+//                                      |                    |
+//                                      |                    +- CastInst
+//                                      |
 //                                      +- UnaryOperator
 //
 // Use
@@ -59,6 +91,8 @@
 #define LLVM_SANDBOXIR_SANDBOXIR_H
 
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Tracker.h"
@@ -74,8 +108,22 @@ class BasicBlock;
 class Context;
 class Function;
 class Instruction;
+class SelectInst;
+class BranchInst;
+class UnaryInstruction;
+class LoadInst;
+class ReturnInst;
+class StoreInst;
 class User;
 class Value;
+class CallBase;
+class CallInst;
+class InvokeInst;
+class CallBrInst;
+class GetElementPtrInst;
+class CastInst;
+class PtrToIntInst;
+class BitCastInst;
 
 /// Iterator for the `Use` edges of a User's operands.
 /// \Returns the operand `Use` when dereferenced.
@@ -97,12 +145,20 @@ class OperandUseIterator {
   OperandUseIterator() = default;
   value_type operator*() const;
   OperandUseIterator &operator++();
+  OperandUseIterator operator++(int) {
+    auto Copy = *this;
+    this->operator++();
+    return Copy;
+  }
   bool operator==(const OperandUseIterator &Other) const {
     return Use == Other.Use;
   }
   bool operator!=(const OperandUseIterator &Other) const {
     return !(*this == Other);
   }
+  OperandUseIterator operator+(unsigned Num) const;
+  OperandUseIterator operator-(unsigned Num) const;
+  int operator-(const OperandUseIterator &Other) const;
 };
 
 /// Iterator for the `Use` edges of a Value's users.
@@ -129,6 +185,7 @@ class UserUseIterator {
   bool operator!=(const UserUseIterator &Other) const {
     return !(*this == Other);
   }
+  const sandboxir::Use &getUse() const { return Use; }
 };
 
 /// A SandboxIR Value has users. This is the base class.
@@ -170,9 +227,21 @@ class Value {
   /// order.
   llvm::Value *Val = nullptr;
 
-  friend class Context; // For getting `Val`.
-  friend class User;    // For getting `Val`.
-  friend class Use;     // For getting `Val`.
+  friend class Context;           // For getting `Val`.
+  friend class User;              // For getting `Val`.
+  friend class Use;               // For getting `Val`.
+  friend class SelectInst;        // For getting `Val`.
+  friend class BranchInst;        // For getting `Val`.
+  friend class LoadInst;          // For getting `Val`.
+  friend class StoreInst;         // For getting `Val`.
+  friend class ReturnInst;        // For getting `Val`.
+  friend class CallBase;          // For getting `Val`.
+  friend class CallInst;          // For getting `Val`.
+  friend class InvokeInst;        // For getting `Val`.
+  friend class CallBrInst;        // For getting `Val`.
+  friend class GetElementPtrInst; // For getting `Val`.
+  friend class CastInst;          // For getting `Val`.
+  friend class PHINode;           // For getting `Val`.
 
   /// All values point to the context.
   Context &Ctx;
@@ -262,11 +331,14 @@ class Value {
                          llvm::function_ref<bool(const Use &)> ShouldReplace);
   void replaceAllUsesWith(Value *Other);
 
+  /// \Returns the LLVM IR name of the bottom-most LLVM value.
+  StringRef getName() const { return Val->getName(); }
+
 #ifndef NDEBUG
   /// Should crash if there is something wrong with the instruction.
   virtual void verify() const = 0;
-  /// Returns the name in the form 'SB<number>.' like 'SB1.'
-  std::string getName() const;
+  /// Returns the unique id in the form 'SB<number>.' like 'SB1.'
+  std::string getUid() const;
   virtual void dumpCommonHeader(raw_ostream &OS) const;
   void dumpCommonFooter(raw_ostream &OS) const;
   void dumpCommonPrefix(raw_ostream &OS) const;
@@ -331,6 +403,14 @@ class User : public Value {
   virtual unsigned getUseOperandNo(const Use &Use) const = 0;
   friend unsigned Use::getOperandNo() const; // For getUseOperandNo()
 
+  void swapOperandsInternal(unsigned OpIdxA, unsigned OpIdxB) {
+    assert(OpIdxA < getNumOperands() && "OpIdxA out of bounds!");
+    assert(OpIdxB < getNumOperands() && "OpIdxB out of bounds!");
+    auto UseA = getOperandUse(OpIdxA);
+    auto UseB = getOperandUse(OpIdxB);
+    UseA.swap(UseB);
+  }
+
 #ifndef NDEBUG
   void verifyUserOfLLVMUse(const llvm::Use &Use) const;
 #endif // NDEBUG
@@ -395,12 +475,17 @@ class User : public Value {
 class Constant : public sandboxir::User {
   Constant(llvm::Constant *C, sandboxir::Context &SBCtx)
       : sandboxir::User(ClassID::Constant, C, SBCtx) {}
-  friend class Context; // For constructor.
+  Constant(ClassID ID, llvm::Constant *C, sandboxir::Context &SBCtx)
+      : sandboxir::User(ID, C, SBCtx) {}
+  friend class Function; // For constructor
+  friend class Context;  // For constructor.
   Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
     return getOperandUseDefault(OpIdx, Verify);
   }
 
 public:
+  static Constant *createInt(Type *Ty, uint64_t V, Context &Ctx,
+                             bool IsSigned = false);
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
     return From->getSubclassID() == ClassID::Constant ||
@@ -411,7 +496,7 @@ class Constant : public sandboxir::User {
     return getUseOperandNoDefault(Use);
   }
 #ifndef NDEBUG
-  void verify() const final {
+  void verify() const override {
     assert(isa<llvm::Constant>(Val) && "Expected Constant!");
   }
   friend raw_ostream &operator<<(raw_ostream &OS,
@@ -467,14 +552,64 @@ class BBIterator {
   pointer get() const { return getInstr(It); }
 };
 
+/// Contains a list of sandboxir::Instruction's.
+class BasicBlock : public Value {
+  /// Builds a graph that contains all values in \p BB in their original form
+  /// i.e., no vectorization is taking place here.
+  void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
+  friend class Context;     // For `buildBasicBlockFromIR`
+  friend class Instruction; // For LLVM Val.
+
+  BasicBlock(llvm::BasicBlock *BB, Context &SBCtx)
+      : Value(ClassID::Block, BB, SBCtx) {
+    buildBasicBlockFromLLVMIR(BB);
+  }
+
+public:
+  ~BasicBlock() = default;
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == Value::ClassID::Block;
+  }
+  Function *getParent() const;
+  using iterator = BBIterator;
+  iterator begin() const;
+  iterator end() const {
+    auto *BB = cast<llvm::BasicBlock>(Val);
+    return iterator(BB, BB->end(), &Ctx);
+  }
+  std::reverse_iterator<iterator> rbegin() const {
+    return std::make_reverse_iterator(end());
+  }
+  std::reverse_iterator<iterator> rend() const {
+    return std::make_reverse_iterator(begin());
+  }
+  Context &getContext() const { return Ctx; }
+  Instruction *getTerminator() const;
+  bool empty() const { return begin() == end(); }
+  Instruction &front() const;
+  Instruction &back() const;
+
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
+  }
+  friend raw_ostream &operator<<(raw_ostream &OS, const BasicBlock &SBBB) {
+    SBBB.dump(OS);
+    return OS;
+  }
+  void dump(raw_ostream &OS) const final;
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
 /// A sandboxir::User with operands, opcode and linked with previous/next
 /// instructions in an instruction list.
 class Instruction : public sandboxir::User {
 public:
   enum class Opcode {
-#define DEF_VALUE(ID, CLASS)
-#define DEF_USER(ID, CLASS)
 #define OP(OPC) OPC,
+#define OPCODES(...) __VA_ARGS__
 #define DEF_INSTR(ID, OPC, CLASS) OPC
 #include "llvm/SandboxIR/SandboxIRValues.def"
   };
@@ -489,10 +624,22 @@ class Instruction : public sandboxir::User {
   /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
   /// returns its topmost LLVM IR instruction.
   llvm::Instruction *getTopmostLLVMInstruction() const;
+  friend class SelectInst;        // For getTopmostLLVMInstruction().
+  friend class BranchInst;        // For getTopmostLLVMInstruction().
+  friend class LoadInst;          // For getTopmostLLVMInstruction().
+  friend class StoreInst;         // For getTopmostLLVMInstruction().
+  friend class ReturnInst;        // For getTopmostLLVMInstruction().
+  friend class CallInst;          // For getTopmostLLVMInstruction().
+  friend class InvokeInst;        // For getTopmostLLVMInstruction().
+  friend class CallBrInst;        // For getTopmostLLVMInstruction().
+  friend class GetElementPtrInst; // For getTopmostLLVMInstruction().
+  friend class CastInst;          // For getTopmostLLVMInstruction().
+  friend class PHINode;           // For getTopmostLLVMInstruction().
 
   /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program
   /// order.
   virtual SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const = 0;
+  friend class EraseFromParent; // For getLLVMInstrs().
 
 public:
   static const char *getOpcodeName(Opcode Opc);
@@ -552,36 +699,139 @@ class Instruction : public sandboxir::User {
 #endif
 };
 
-/// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to
-/// an OpaqueInstr.
-class OpaqueInst : public sandboxir::Instruction {
-  OpaqueInst(llvm::Instruction *I, sandboxir::Context &Ctx)
-      : sandboxir::Instruction(ClassID::Opaque, Opcode::Opaque, I, Ctx) {}
-  OpaqueInst(ClassID SubclassID, llvm::Instruction *I, sandboxir::Context &Ctx)
-      : sandboxir::Instruction(SubclassID, Opcode::Opaque, I, Ctx) {}
-  friend class Context; // For constructor.
+class SelectInst : public Instruction {
+  /// Use Context::createSelectInst(). Don't call the
+  /// constructor directly.
+  SelectInst(llvm::SelectInst *CI, Context &Ctx)
+      : Instruction(ClassID::Select, Opcode::Select, CI, Ctx) {}
+  friend Context; // for SelectInst()
   Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
     return getOperandUseDefault(OpIdx, Verify);
   }
   SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
     return {cast<llvm::Instruction>(Val)};
   }
+  static Value *createCommon(Value *Cond, Value *True, Value *False,
+                             const Twine &Name, IRBuilder<> &Builder,
+                             Context &Ctx);
 
 public:
-  static bool classof(const sandboxir::Value *From) {
-    return From->getSubclassID() == ClassID::Opaque;
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static Value *create(Value *Cond, Value *True, Value *False,
+                       Instruction *InsertBefore, Context &Ctx,
+                       const Twine &Name = "");
+  static Value *create(Value *Cond, Value *True, Value *False,
+                       BasicBlock *InsertAtEnd, Context &Ctx,
+                       const Twine &Name = "");
+  Value *getCondition() { return getOperand(0); }
+  Value *getTrueValue() { return getOperand(1); }
+  Value *getFalseValue() { return getOperand(2); }
+
+  void setCondition(Value *New) { setOperand(0, New); }
+  void setTrueValue(Value *New) { setOperand(1, New); }
+  void setFalseValue(Value *New) { setOperand(2, New); }
+  void swapValues() { cast<llvm::SelectInst>(Val)->swapValues(); }
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::SelectInst>(Val) && "Expected SelectInst!");
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class BranchInst : public Instruction {
+  /// Use Context::createBranchInst(). Don't call the constructor directly.
+  BranchInst(llvm::BranchInst *BI, Context &Ctx)
+      : Instruction(ClassID::Br, Opcode::Br, BI, Ctx) {}
+  friend Context; // for BranchInst()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
   }
+
+public:
   unsigned getUseOperandNo(const Use &Use) const final {
     return getUseOperandNoDefault(Use);
   }
   unsigned getNumOfIRInstrs() const final { return 1u; }
+  static BranchInst *create(BasicBlock *IfTrue, Instruction *InsertBefore,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                            Value *Cond, Instruction *InsertBefore,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                            Value *Cond, BasicBlock *InsertAtEnd, Context &Ctx);
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+  bool isUnconditional() const {
+    return cast<llvm::BranchInst>(Val)->isUnconditional();
+  }
+  bool isConditional() const {
+    return cast<llvm::BranchInst>(Val)->isConditional();
+  }
+  Value *getCondition() const;
+  void setCondition(Value *V) { setOperand(0, V); }
+  unsigned getNumSuccessors() const { return 1 + isConditional(); }
+  BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  void swapSuccessors() { swapOperandsInternal(1, 2); }
+
+private:
+  struct LLVMBBToSBBB {
+    Context &Ctx;
+    LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
+    BasicBlock *operator()(llvm::BasicBlock *BB) const;
+  };
+
+  struct ConstLLVMBBToSBBB {
+    Context &Ctx;
+    ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
+    const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
+  };
+
+public:
+  using sb_succ_op_iterator =
+      mapped_iterator<llvm::BranchInst::succ_op_iterator, LLVMBBToSBBB>;
+  iterator_range<sb_succ_op_iterator> successors() {
+    iterator_range<llvm::BranchInst::succ_op_iterator> LLVMRange =
+        cast<llvm::BranchInst>(Val)->successors();
+    LLVMBBToSBBB BBMap(Ctx);
+    sb_succ_op_iterator MappedBegin = map_iterator(LLVMRange.begin(), BBMap);
+    sb_succ_op_iterator MappedEnd = map_iterator(LLVMRange.end(), BBMap);
+    return make_range(MappedBegin, MappedEnd);
+  }
+
+  using const_sb_succ_op_iterator =
+      mapped_iterator<llvm::BranchInst::const_succ_op_iterator,
+                      ConstLLVMBBToSBBB>;
+  iterator_range<const_sb_succ_op_iterator> successors() const {
+    iterator_range<llvm::BranchInst::const_succ_op_iterator> ConstLLVMRange =
+        static_cast<const llvm::BranchInst *>(cast<llvm::BranchInst>(Val))
+            ->successors();
+    ConstLLVMBBToSBBB ConstBBMap(Ctx);
+    const_sb_succ_op_iterator ConstMappedBegin =
+        map_iterator(ConstLLVMRange.begin(), ConstBBMap);
+    const_sb_succ_op_iterator ConstMappedEnd =
+        map_iterator(ConstLLVMRange.end(), ConstBBMap);
+    return make_range(ConstMappedBegin, ConstMappedEnd);
+  }
+
 #ifndef NDEBUG
   void verify() const final {
-    // Nothing to do
+    assert(isa<llvm::BranchInst>(Val) && "Expected BranchInst!");
   }
-  friend raw_ostream &operator<<(raw_ostream &OS,
-                                 const sandboxir::OpaqueInst &OI) {
-    OI.dump(OS);
+  friend raw_ostream &operator<<(raw_ostream &OS, const BranchInst &BI) {
+    BI.dump(OS);
     return OS;
   }
   void dump(raw_ostream &OS) const override;
@@ -589,54 +839,821 @@ class OpaqueInst : public sandboxir::Instruction {
 #endif
 };
 
-/// Contains a list of sandboxir::Instruction's.
-class BasicBlock : public Value {
-  /// Builds a graph that contains all values in \p BB in their original form
-  /// i.e., no vectorization is taking place here.
-  void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
-  friend class Context;     // For `buildBasicBlockFromIR`
-  friend class Instruction; // For LLVM Val.
+/// An abstract class, parent of unary instructions.
+class UnaryInstruction : public Instruction {
+protected:
+  UnaryInstruction(ClassID ID, Opcode Opc, llvm::Instruction *LLVMI,
+                   Context &Ctx)
+      : Instruction(ID, Opc, LLVMI, Ctx) {}
 
-  BasicBlock(llvm::BasicBlock *BB, Context &SBCtx)
-      : Value(ClassID::Block, BB, SBCtx) {
-    buildBasicBlockFromLLVMIR(BB);
+public:
+  static bool classof(const Instruction *I) {
+    return isa<LoadInst>(I) || isa<CastInst>(I);
+  }
+  static bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
+};
+
+class LoadInst final : public UnaryInstruction {
+  /// Use LoadInst::create() instead of calling the constructor.
+  LoadInst(llvm::LoadInst *LI, Context &Ctx)
+      : UnaryInstruction(ClassID::Load, Opcode::Load, LI, Ctx) {}
+  friend Context; // for LoadInst()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
   }
 
 public:
-  ~BasicBlock() = default;
+  /// Return true if this is a load from a volatile memory location.
+  bool isVolatile() const { return cast<llvm::LoadInst>(Val)->isVolatile(); }
+
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                          Instruction *InsertBefore, Context &Ctx,
+                          const Twine &Name = "");
+  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                          Instruction *InsertBefore, bool IsVolatile,
+                          Context &Ctx, const Twine &Name = "");
+  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                          BasicBlock *InsertAtEnd, Context &Ctx,
+                          const Twine &Name = "");
+  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                          BasicBlock *InsertAtEnd, bool IsVolatile,
+                          Context &Ctx, const Twine &Name = "");
+
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+  Value *getPointerOperand() const;
+  Align getAlign() const { return cast<llvm::LoadInst>(Val)->getAlign(); }
+  bool isUnordered() const { return cast<llvm::LoadInst>(Val)->isUnordered(); }
+  bool isSimple() const { return cast<llvm::LoadInst>(Val)->isSimple(); }
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::LoadInst>(Val) && "Expected LoadInst!");
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class StoreInst final : public Instruction {
+  /// Use StoreInst::create().
+  StoreInst(llvm::StoreInst *SI, Context &Ctx)
+      : Instruction(ClassID::Store, Opcode::Store, SI, Ctx) {}
+  friend Context; // for StoreInst()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  /// Return true if this is a store from a volatile memory location.
+  bool isVolatile() const { return cast<llvm::StoreInst>(Val)->isVolatile(); }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                           Instruction *InsertBefore, Context &Ctx);
+  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                           Instruction *InsertBefore, bool IsVolatile,
+                           Context &Ctx);
+  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                           BasicBlock *InsertAtEnd, Context &Ctx);
+  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                           BasicBlock *InsertAtEnd, bool IsVolatile,
+                           Context &Ctx);
   /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+  Value *getValueOperand() const;
+  Value *getPointerOperand() const;
+  Align getAlign() const { return cast<llvm::StoreInst>(Val)->getAlign(); }
+  bool isSimple() const { return cast<llvm::StoreInst>(Val)->isSimple(); }
+  bool isUnordered() const { return cast<llvm::StoreInst>(Val)->isUnordered(); }
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::StoreInst>(Val) && "Expected StoreInst!");
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class ReturnInst final : public Instruction {
+  /// Use ReturnInst::create() instead of calling the constructor.
+  ReturnInst(llvm::Instruction *I, Context &Ctx)
+      : Instruction(ClassID::Ret, Opcode::Ret, I, Ctx) {}
+  ReturnInst(ClassID SubclassID, llvm::Instruction *I, Context &Ctx)
+      : Instruction(SubclassID, Opcode::Ret, I, Ctx) {}
+  friend class Context; // For accessing the constructor in create*()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+  static ReturnInst *createCommon(Value *RetVal, IRBuilder<> &Builder,
+                                  Context &Ctx);
+
+public:
+  static ReturnInst *create(Value *RetVal, Instruction *InsertBefore,
+                            Context &Ctx);
+  static ReturnInst *create(Value *RetVal, BasicBlock *InsertAtEnd,
+                            Context &Ctx);
   static bool classof(const Value *From) {
-    return From->getSubclassID() == Value::ClassID::Block;
+    return From->getSubclassID() == ClassID::Ret;
   }
-  Function *getParent() const;
-  using iterator = BBIterator;
-  iterator begin() const;
-  iterator end() const {
-    auto *BB = cast<llvm::BasicBlock>(Val);
-    return iterator(BB, BB->end(), &Ctx);
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
   }
-  std::reverse_iterator<iterator> rbegin() const {
-    return std::make_reverse_iterator(end());
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  /// \Returns null if there is no return value.
+  Value *getReturnValue() const;
+#ifndef NDEBUG
+  void verify() const final {}
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class CallBase : public Instruction {
+  CallBase(ClassID ID, Opcode Opc, llvm::Instruction *I, Context &Ctx)
+      : Instruction(ID, Opc, I, Ctx) {}
+  friend class CallInst;   // For constructor.
+  friend class InvokeInst; // For constructor.
+  friend class CallBrInst; // For constructor.
+
+public:
+  static bool classof(const Value *From) {
+    auto Opc = From->getSubclassID();
+    return Opc == Instruction::ClassID::Call ||
+           Opc == Instruction::ClassID::Invoke ||
+           Opc == Instruction::ClassID::CallBr;
   }
-  std::reverse_iterator<iterator> rend() const {
-    return std::make_reverse_iterator(begin());
+
+  FunctionType *getFunctionType() const {
+    return cast<llvm::CallBase>(Val)->getFunctionType();
   }
-  Context &getContext() const { return Ctx; }
-  Instruction *getTerminator() const;
-  bool empty() const { return begin() == end(); }
-  Instruction &front() const;
-  Instruction &back() const;
+
+  op_iterator data_operands_begin() { return op_begin(); }
+  const_op_iterator data_operands_begin() const {
+    return const_cast<CallBase *>(this)->data_operands_begin();
+  }
+  op_iterator data_operands_end() {
+    auto *LLVMCB = cast<llvm::CallBase>(Val);
+    auto Dist = LLVMCB->data_operands_end() - LLVMCB->data_operands_begin();
+    return op_begin() + Dist;
+  }
+  const_op_iterator data_operands_end() const {
+    auto *LLVMCB = cast<llvm::CallBase>(Val);
+    auto Dist = LLVMCB->data_operands_end() - LLVMCB->data_operands_begin();
+    return op_begin() + Dist;
+  }
+  iterator_range<op_iterator> data_ops() {
+    return make_range(data_operands_begin(), data_operands_end());
+  }
+  iterator_range<const_op_iterator> data_ops() const {
+    return make_range(data_operands_begin(), data_operands_end());
+  }
+  bool data_operands_empty() const {
+    return data_operands_end() == data_operands_begin();
+  }
+  unsigned data_operands_size() const {
+    return std::distance(data_operands_begin(), data_operands_end());
+  }
+  bool isDataOperand(Use U) const {
+    assert(this == U.getUser() &&
+           "Only valid to query with a use of this instruction!");
+    return cast<llvm::CallBase>(Val)->isDataOperand(U.LLVMUse);
+  }
+  unsigned getDataOperandNo(Use U) const {
+    assert(isDataOperand(U) && "Data operand # out of range!");
+    return cast<llvm::CallBase>(Val)->getDataOperandNo(U.LLVMUse);
+  }
+
+  /// Return the total number operands (not operand bundles) used by
+  /// every operand bundle in this OperandBundleUser.
+  unsigned getNumTotalBundleOperands() const {
+    return cast<llvm::CallBase>(Val)->getNumTotalBundleOperands();
+  }
+
+  op_iterator arg_begin() { return op_begin(); }
+  const_op_iterator arg_begin() const { return op_begin(); }
+  op_iterator arg_end() {
+    return data_operands_end() - getNumTotalBundleOperands();
+  }
+  const_op_iterator arg_end() const {
+    return const_cast<CallBase *>(this)->arg_end();
+  }
+  iterator_range<op_iterator> args() {
+    return make_range(arg_begin(), arg_end());
+  }
+  iterator_range<const_op_iterator> args() const {
+    return make_range(arg_begin(), arg_end());
+  }
+  bool arg_empty() const { return arg_end() == arg_begin(); }
+  unsigned arg_size() const { return arg_end() - arg_begin(); }
+
+  Value *getArgOperand(unsigned OpIdx) const {
+    assert(OpIdx < arg_size() && "Out of bounds!");
+    return getOperand(OpIdx);
+  }
+  void setArgOperand(unsigned OpIdx, Value *NewOp) {
+    assert(OpIdx < arg_size() && "Out of bounds!");
+    setOperand(OpIdx, NewOp);
+  }
+
+  Use getArgOperandUse(unsigned Idx) const {
+    assert(Idx < arg_size() && "Out of bounds!");
+    return getOperandUse(Idx);
+  }
+  Use getArgOperandUse(unsigned Idx) {
+    assert(Idx < arg_size() && "Out of bounds!");
+    return getOperandUse(Idx);
+  }
+
+  bool isArgOperand(Use U) const {
+    return cast<llvm::CallBase>(Val)->isArgOperand(U.LLVMUse);
+  }
+  unsigned getArgOperandNo(Use U) const {
+    return cast<llvm::CallBase>(Val)->getArgOperandNo(U.LLVMUse);
+  }
+  bool hasArgument(const Value *V) const { return is_contained(args(), V); }
+
+  Value *getCalledOperand() const;
+  Use getCalledOperandUse() const;
+
+  Function *getCalledFunction() const;
+  bool isIndirectCall() const {
+    return cast<llvm::CallBase>(Val)->isIndirectCall();
+  }
+  bool isCallee(Use U) const {
+    return cast<llvm::CallBase>(Val)->isCallee(U.LLVMUse);
+  }
+  Function *getCaller();
+  const Function *getCaller() const {
+    return const_cast<CallBase *>(this)->getCaller();
+  }
+  bool isMustTailCall() const {
+    return cast<llvm::CallBase>(Val)->isMustTailCall();
+  }
+  bool isTailCall() const { return cast<llvm::CallBase>(Val)->isTailCall(); }
+  Intrinsic::ID getIntrinsicID() const {
+    return cast<llvm::CallBase>(Val)->getIntrinsicID();
+  }
+  void setCalledOperand(Value *V) { getCalledOperandUse().set(V); }
+  void setCalledFunction(Function *F);
+  CallingConv::ID getCallingConv() const {
+    return cast<llvm::CallBase>(Val)->getCallingConv();
+  }
+  bool isInlineAsm() const { return cast<llvm::CallBase>(Val)->isInlineAsm(); }
+};
+
+class CallInst final : public CallBase {
+  /// Use Context::createCallInst(). Don't call the
+  /// constructor directly.
+  CallInst(llvm::Instruction *I, Context &Ctx)
+      : CallBase(ClassID::Call, Opcode::Call, I, Ctx) {}
+  friend class Context; // For accessing the constructor in
+                        // create*()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  static CallInst *create(FunctionType *FTy, Value *Func,
+                          ArrayRef<Value *> Args, BBIterator WhereIt,
+                          BasicBlock *WhereBB, Context &Ctx,
+                          const Twine &NameStr = "");
+  static CallInst *create(FunctionType *FTy, Value *Func,
+                          ArrayRef<Value *> Args, Instruction *InsertBefore,
+                          Context &Ctx, const Twine &NameStr = "");
+  static CallInst *create(FunctionType *FTy, Value *Func,
+                          ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                          Context &Ctx, const Twine &NameStr = "");
+
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::Call;
+  }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+#ifndef NDEBUG
+  void verify() const final {}
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class InvokeInst final : public CallBase {
+  /// Use Context::createInvokeInst(). Don't call the
+  /// constructor directly.
+  InvokeInst(llvm::Instruction *I, Context &Ctx)
+      : CallBase(ClassID::Invoke, Opcode::Invoke, I, Ctx) {}
+  friend class Context; // For accessing the constructor in
+                        // create*()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  static InvokeInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *IfNormal, BasicBlock *IfException,
+                            ArrayRef<Value *> Args, BBIterator WhereIt,
+                            BasicBlock *WhereBB, Context &Ctx,
+                            const Twine &NameStr = "");
+  static InvokeInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *IfNormal, BasicBlock *IfException,
+                            ArrayRef<Value *> Args, Instruction *InsertBefore,
+                            Context &Ctx, const Twine &NameStr = "");
+  static InvokeInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *IfNormal, BasicBlock *IfException,
+                            ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                            Context &Ctx, const Twine &NameStr = "");
+
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::Invoke;
+  }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  BasicBlock *getNormalDest() const;
+  BasicBlock *getUnwindDest() const;
+  void setNormalDest(BasicBlock *BB);
+  void setUnwindDest(BasicBlock *BB);
+  // TODO: Return a `LandingPadInst` once implemented.
+  Instruction *getLandingPadInst() const;
+  BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) {
+    assert(SuccIdx < 2 && "Successor # out of range for invoke!");
+    if (SuccIdx == 0)
+      setNormalDest(NewSucc);
+    else
+      setUnwindDest(NewSucc);
+  }
+  unsigned getNumSuccessors() const {
+    return cast<llvm::InvokeInst>(Val)->getNumSuccessors();
+  }
+#ifndef NDEBUG
+  void verify() const final {}
+  friend raw_ostream &operator<<(raw_ostream &OS, const InvokeInst &I) {
+    I.dump(OS);
+    return OS;
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class CallBrInst final : public CallBase {
+  /// Use Context::createCallBrInst(). Don't call the
+  /// constructor directly.
+  CallBrInst(llvm::Instruction *I, Context &Ctx)
+      : CallBase(ClassID::CallBr, Opcode::CallBr, I, Ctx) {}
+  friend class Context; // For accessing the constructor in
+                        // create*()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  static CallBrInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *DefaultDest,
+                            ArrayRef<BasicBlock *> IndirectDests,
+                            ArrayRef<Value *> Args, BBIterator WhereIt,
+                            BasicBlock *WhereBB, Context &Ctx,
+                            const Twine &NameStr = "");
+  static CallBrInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *DefaultDest,
+                            ArrayRef<BasicBlock *> IndirectDests,
+                            ArrayRef<Value *> Args, Instruction *InsertBefore,
+                            Context &Ctx, const Twine &NameStr = "");
+  static CallBrInst *create(FunctionType *FTy, Value *Func,
+                            BasicBlock *DefaultDest,
+                            ArrayRef<BasicBlock *> IndirectDests,
+                            ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                            Context &Ctx, const Twine &NameStr = "");
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::CallBr;
+  }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  unsigned getNumIndirectDests() const {
+    return cast<llvm::CallBrInst>(Val)->getNumIndirectDests();
+  }
+  Value *getIndirectDestLabel(unsigned Idx) const;
+  Value *getIndirectDestLabelUse(unsigned Idx) const;
+  BasicBlock *getDefaultDest() const;
+  BasicBlock *getIndirectDest(unsigned Idx) const;
+  SmallVector<BasicBlock *, 16> getIndirectDests() const;
+  void setDefaultDest(BasicBlock *BB);
+  void setIndirectDest(unsigned Idx, BasicBlock *BB);
+  BasicBlock *getSuccessor(unsigned Idx) const;
+  unsigned getNumSuccessors() const {
+    return cast<llvm::CallBrInst>(Val)->getNumSuccessors();
+  }
+#ifndef NDEBUG
+  void verify() const final {}
+  friend raw_ostream &operator<<(raw_ostream &OS, const CallBrInst &I) {
+    I.dump(OS);
+    return OS;
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class GetElementPtrInst final : public Instruction {
+  /// Use Context::createGetElementPtrInst(). Don't call
+  /// the constructor directly.
+  GetElementPtrInst(llvm::Instruction *I, Context &Ctx)
+      : Instruction(ClassID::GetElementPtr, Opcode::GetElementPtr, I, Ctx) {}
+  GetElementPtrInst(ClassID SubclassID, llvm::Instruction *I, Context &Ctx)
+      : Instruction(SubclassID, Opcode::GetElementPtr, I, Ctx) {}
+  friend class Context; // For accessing the constructor in
+                        // create*()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+                       BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+                       const Twine &NameStr = "");
+  static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+                       Instruction *InsertBefore, Context &Ctx,
+                       const Twine &NameStr = "");
+  static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+                       BasicBlock *InsertAtEnd, Context &Ctx,
+                       const Twine &NameStr = "");
+
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::GetElementPtr;
+  }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+
+  Type *getSourceElementType() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getSourceElementType();
+  }
+  Type *getResultElementType() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getResultElementType();
+  }
+  unsigned getAddressSpace() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getAddressSpace();
+  }
+
+  inline op_iterator idx_begin() { return op_begin() + 1; }
+  inline const_op_iterator idx_begin() const {
+    return const_cast<GetElementPtrInst *>(this)->idx_begin();
+  }
+  inline op_iterator idx_end() { return op_end(); }
+  inline const_op_iterator idx_end() const {
+    return const_cast<GetElementPtrInst *>(this)->idx_end();
+  }
+  inline iterator_range<op_iterator> indices() {
+    return make_range(idx_begin(), idx_end());
+  }
+  inline iterator_range<const_op_iterator> indices() const {
+    return const_cast<GetElementPtrInst *>(this)->indices();
+  }
+
+  Value *getPointerOperand() const;
+  static unsigned getPointerOperandIndex() {
+    return llvm::GetElementPtrInst::getPointerOperandIndex();
+  }
+  Type *getPointerOperandType() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getPointerOperandType();
+  }
+  unsigned getPointerAddressSpace() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getPointerAddressSpace();
+  }
+  unsigned getNumIndices() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getNumIndices();
+  }
+  bool hasIndices() const {
+    return cast<llvm::GetElementPtrInst>(Val)->hasIndices();
+  }
+  bool hasAllConstantIndices() const {
+    return cast<llvm::GetElementPtrInst>(Val)->hasAllConstantIndices();
+  }
+  GEPNoWrapFlags getNoWrapFlags() const {
+    return cast<llvm::GetElementPtrInst>(Val)->getNoWrapFlags();
+  }
+  bool isInBounds() const {
+    return cast<llvm::GetElementPtrInst>(Val)->isInBounds();
+  }
+  bool hasNoUnsignedSignedWrap() const {
+    return cast<llvm::GetElementPtrInst>(Val)->hasNoUnsignedSignedWrap();
+  }
+  bool hasNoUnsignedWrap() const {
+    return cast<llvm::GetElementPtrInst>(Val)->hasNoUnsignedWrap();
+  }
+  bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const {
+    return cast<llvm::GetElementPtrInst>(Val)->accumulateConstantOffset(DL,
+                                                                        Offset);
+  }
+  // TODO: Add missing member functions.
 
+#ifndef NDEBUG
+  void verify() const final {}
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class CastInst : public UnaryInstruction {
+  static Opcode getCastOpcode(llvm::Instruction::CastOps CastOp) {
+    switch (CastOp) {
+    case llvm::Instruction::ZExt:
+      return Opcode::ZExt;
+    case llvm::Instruction::SExt:
+      return Opcode::SExt;
+    case llvm::Instruction::FPToUI:
+      return Opcode::FPToUI;
+    case llvm::Instruction::FPToSI:
+      return Opcode::FPToSI;
+    case llvm::Instruction::FPExt:
+      return Opcode::FPExt;
+    case llvm::Instruction::PtrToInt:
+      return Opcode::PtrToInt;
+    case llvm::Instruction::IntToPtr:
+      return Opcode::IntToPtr;
+    case llvm::Instruction::SIToFP:
+      return Opcode::SIToFP;
+    case llvm::Instruction::UIToFP:
+      return Opcode::UIToFP;
+    case llvm::Instruction::Trunc:
+      return Opcode::Trunc;
+    case llvm::Instruction::FPTrunc:
+      return Opcode::FPTrunc;
+    case llvm::Instruction::BitCast:
+      return Opcode::BitCast;
+    case llvm::Instruction::AddrSpaceCast:
+      return Opcode::AddrSpaceCast;
+    case llvm::Instruction::CastOpsEnd:
+      llvm_unreachable("Bad CastOp!");
+    }
+    llvm_unreachable("Unhandled CastOp!");
+  }
+  /// Use Context::createCastInst(). Don't call the
+  /// constructor directly.
+  CastInst(llvm::CastInst *CI, Context &Ctx)
+      : UnaryInstruction(ClassID::Cast, getCastOpcode(CI->getOpcode()), CI,
+                         Ctx) {}
+  friend Context;        // for SBCastInstruction()
+  friend class PtrToInt; // For constructor.
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+                       BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+                       const Twine &Name = "");
+  static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+                       Instruction *InsertBefore, Context &Ctx,
+                       const Twine &Name = "");
+  static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+                       BasicBlock *InsertAtEnd, Context &Ctx,
+                       const Twine &Name = "");
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+  Type *getSrcTy() const { return cast<llvm::CastInst>(Val)->getSrcTy(); }
+  Type *getDestTy() const { return cast<llvm::CastInst>(Val)->getDestTy(); }
 #ifndef NDEBUG
   void verify() const final {
-    assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
+    assert(isa<llvm::CastInst>(Val) && "Expected CastInst!");
   }
-  friend raw_ostream &operator<<(raw_ostream &OS, const BasicBlock &SBBB) {
-    SBBB.dump(OS);
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+// Helper class to simplify stamping out CastInst subclasses.
+template <Instruction::Opcode Op> class CastInstImpl : public CastInst {
+public:
+  static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+                       BasicBlock *WhereBB, Context &Ctx,
+                       const Twine &Name = "") {
+    return CastInst::create(DestTy, Op, Src, WhereIt, WhereBB, Ctx, Name);
+  }
+  static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+                       Context &Ctx, const Twine &Name = "") {
+    return create(Src, DestTy, InsertBefore->getIterator(),
+                  InsertBefore->getParent(), Ctx, Name);
+  }
+  static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+                       Context &Ctx, const Twine &Name = "") {
+    return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+  }
+
+  static bool classof(const Value *From) {
+    if (auto *I = dyn_cast<Instruction>(From))
+      return I->getOpcode() == Op;
+    return false;
+  }
+};
+
+class TruncInst final : public CastInstImpl<Instruction::Opcode::Trunc> {};
+class ZExtInst final : public CastInstImpl<Instruction::Opcode::ZExt> {};
+class SExtInst final : public CastInstImpl<Instruction::Opcode::SExt> {};
+class FPTruncInst final : public CastInstImpl<Instruction::Opcode::FPTrunc> {};
+class FPExtInst final : public CastInstImpl<Instruction::Opcode::FPExt> {};
+class UIToFPInst final : public CastInstImpl<Instruction::Opcode::UIToFP> {};
+class SIToFPInst final : public CastInstImpl<Instruction::Opcode::SIToFP> {};
+class FPToUIInst final : public CastInstImpl<Instruction::Opcode::FPToUI> {};
+class FPToSIInst final : public CastInstImpl<Instruction::Opcode::FPToSI> {};
+class IntToPtrInst final : public CastInstImpl<Instruction::Opcode::IntToPtr> {
+};
+class PtrToIntInst final : public CastInstImpl<Instruction::Opcode::PtrToInt> {
+};
+class BitCastInst final : public CastInstImpl<Instruction::Opcode::BitCast> {};
+class AddrSpaceCastInst final
+    : public CastInstImpl<Instruction::Opcode::AddrSpaceCast> {
+public:
+  /// \Returns the pointer operand.
+  Value *getPointerOperand() { return getOperand(0); }
+  /// \Returns the pointer operand.
+  const Value *getPointerOperand() const {
+    return const_cast<AddrSpaceCastInst *>(this)->getPointerOperand();
+  }
+  /// \Returns the operand index of the pointer operand.
+  static unsigned getPointerOperandIndex() { return 0u; }
+  /// \Returns the address space of the pointer operand.
+  unsigned getSrcAddressSpace() const {
+    return getPointerOperand()->getType()->getPointerAddressSpace();
+  }
+  /// \Returns the address space of the result.
+  unsigned getDestAddressSpace() const {
+    return getType()->getPointerAddressSpace();
+  }
+};
+
+class PHINode final : public Instruction {
+  /// Use Context::createPHINode(). Don't call the constructor directly.
+  PHINode(llvm::PHINode *PHI, Context &Ctx)
+      : Instruction(ClassID::PHI, Opcode::PHI, PHI, Ctx) {}
+  friend Context; // for PHINode()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+  /// Helper for mapped_iterator.
+  struct LLVMBBToBB {
+    Context &Ctx;
+    LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {}
+    BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
+  };
+
+public:
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static PHINode *create(Type *Ty, unsigned NumReservedValues,
+                         Instruction *InsertBefore, Context &Ctx,
+                         const Twine &Name = "");
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+
+  using const_block_iterator =
+      mapped_iterator<llvm::PHINode::const_block_iterator, LLVMBBToBB>;
+
+  const_block_iterator block_begin() const {
+    LLVMBBToBB BBGetter(Ctx);
+    return const_block_iterator(cast<llvm::PHINode>(Val)->block_begin(),
+                                BBGetter);
+  }
+  const_block_iterator block_end() const {
+    LLVMBBToBB BBGetter(Ctx);
+    return const_block_iterator(cast<llvm::PHINode>(Val)->block_end(),
+                                BBGetter);
+  }
+  iterator_range<const_block_iterator> blocks() const {
+    return make_range(block_begin(), block_end());
+  }
+
+  op_range incoming_values() { return operands(); }
+
+  const_op_range incoming_values() const { return operands(); }
+
+  unsigned getNumIncomingValues() const {
+    return cast<llvm::PHINode>(Val)->getNumIncomingValues();
+  }
+  Value *getIncomingValue(unsigned Idx) const;
+  void setIncomingValue(unsigned Idx, Value *V);
+  static unsigned getOperandNumForIncomingValue(unsigned Idx) {
+    return llvm::PHINode::getOperandNumForIncomingValue(Idx);
+  }
+  static unsigned getIncomingValueNumForOperand(unsigned Idx) {
+    return llvm::PHINode::getIncomingValueNumForOperand(Idx);
+  }
+  BasicBlock *getIncomingBlock(unsigned Idx) const;
+  BasicBlock *getIncomingBlock(const Use &U) const;
+
+  void setIncomingBlock(unsigned Idx, BasicBlock *BB);
+
+  void addIncoming(Value *V, BasicBlock *BB);
+
+  Value *removeIncomingValue(unsigned Idx);
+  Value *removeIncomingValue(BasicBlock *BB);
+
+  int getBasicBlockIndex(const BasicBlock *BB) const;
+  Value *getIncomingValueForBlock(const BasicBlock *BB) const;
+
+  Value *hasConstantValue() const;
+
+  bool hasConstantOrUndefValue() const {
+    return cast<llvm::PHINode>(Val)->hasConstantOrUndefValue();
+  }
+  bool isComplete() const { return cast<llvm::PHINode>(Val)->isComplete(); }
+  // TODO: Implement the below functions:
+  // void replaceIncomingBlockWith (const BasicBlock *Old, BasicBlock *New);
+  // void copyIncomingBlocks(iterator_range<const_block_iterator> BBRange,
+  //                         uint32_t ToIdx = 0)
+  // void removeIncomingValueIf(function_ref< bool(unsigned)> Predicate,
+  //                            bool DeletePHIIfEmpty=true)
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::PHINode>(Val) && "Expected PHINode!");
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+/// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to
+/// an OpaqueInstr.
+class OpaqueInst : public sandboxir::Instruction {
+  OpaqueInst(llvm::Instruction *I, sandboxir::Context &Ctx)
+      : sandboxir::Instruction(ClassID::Opaque, Opcode::Opaque, I, Ctx) {}
+  OpaqueInst(ClassID SubclassID, llvm::Instruction *I, sandboxir::Context &Ctx)
+      : sandboxir::Instruction(SubclassID, Opcode::Opaque, I, Ctx) {}
+  friend class Context; // For constructor.
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  static bool classof(const sandboxir::Value *From) {
+    return From->getSubclassID() == ClassID::Opaque;
+  }
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+#ifndef NDEBUG
+  void verify() const final {
+    // Nothing to do
+  }
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const sandboxir::OpaqueInst &OI) {
+    OI.dump(OS);
     return OS;
   }
-  void dump(raw_ostream &OS) const final;
-  LLVM_DUMP_METHOD void dump() const final;
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
 #endif
 };
 
@@ -658,6 +1675,7 @@ class Context {
   friend void Instruction::eraseFromParent(); // For detach().
   /// Take ownership of VPtr and store it in `LLVMValueToValueMap`.
   Value *registerValue(std::unique_ptr<Value> &&VPtr);
+  friend class EraseFromParent; // For registerValue().
   /// This is the actual function that creates sandboxir values for \p V,
   /// and among others handles all instruction types.
   Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr);
@@ -675,14 +1693,47 @@ class Context {
   Value *getOrCreateValue(llvm::Value *LLVMV) {
     return getOrCreateValueInternal(LLVMV, 0);
   }
+  /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC.
+  Constant *getOrCreateConstant(llvm::Constant *LLVMC) {
+    return cast<Constant>(getOrCreateValueInternal(LLVMC, 0));
+  }
+  friend class Constant; // For getOrCreateConstant().
   /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will
   /// also create all contents of the block.
   BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
 
   friend class BasicBlock; // For getOrCreateValue().
 
+  IRBuilder<ConstantFolder> LLVMIRBuilder;
+  auto &getLLVMIRBuilder() { return LLVMIRBuilder; }
+
+  SelectInst *createSelectInst(llvm::SelectInst *SI);
+  friend SelectInst; // For createSelectInst()
+  BranchInst *createBranchInst(llvm::BranchInst *I);
+  friend BranchInst; // For createBranchInst()
+  LoadInst *createLoadInst(llvm::LoadInst *LI);
+  friend LoadInst; // For createLoadInst()
+  StoreInst *createStoreInst(llvm::StoreInst *SI);
+  friend StoreInst; // For createStoreInst()
+  ReturnInst *createReturnInst(llvm::ReturnInst *I);
+  friend ReturnInst; // For createReturnInst()
+  CallInst *createCallInst(llvm::CallInst *I);
+  friend CallInst; // For createCallInst()
+  InvokeInst *createInvokeInst(llvm::InvokeInst *I);
+  friend InvokeInst; // For createInvokeInst()
+  CallBrInst *createCallBrInst(llvm::CallBrInst *I);
+  friend CallBrInst; // For createCallBrInst()
+  GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I);
+  friend GetElementPtrInst; // For createGetElementPtrInst()
+  CastInst *createCastInst(llvm::CastInst *I);
+  friend CastInst; // For createCastInst()
+  PHINode *createPHINode(llvm::PHINode *I);
+  friend PHINode; // For createPHINode()
+
 public:
-  Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx) {}
+  Context(LLVMContext &LLVMCtx)
+      : LLVMCtx(LLVMCtx), IRTracker(*this),
+        LLVMIRBuilder(LLVMCtx, ConstantFolder()) {}
 
   Tracker &getTracker() { return IRTracker; }
   /// Convenience function for `getTracker().save()`
@@ -705,7 +1756,7 @@ class Context {
   size_t getNumValues() const { return LLVMValueToValueMap.size(); }
 };
 
-class Function : public sandboxir::Value {
+class Function : public Constant {
   /// Helper for mapped_iterator.
   struct LLVMBBToBB {
     Context &Ctx;
@@ -716,7 +1767,7 @@ class Function : public sandboxir::Value {
   };
   /// Use Context::createFunction() instead.
   Function(llvm::Function *F, sandboxir::Context &Ctx)
-      : sandboxir::Value(ClassID::Function, F, Ctx) {}
+      : Constant(ClassID::Function, F, Ctx) {}
   friend class Context; // For constructor.
 
 public:
@@ -742,6 +1793,9 @@ class Function : public sandboxir::Value {
     LLVMBBToBB BBGetter(Ctx);
     return iterator(cast<llvm::Function>(Val)->end(), BBGetter);
   }
+  FunctionType *getFunctionType() const {
+    return cast<llvm::Function>(Val)->getFunctionType();
+  }
 
 #ifndef NDEBUG
   void verify() const final {
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index b090ade3ea0ca..4cb601128a507 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -23,9 +23,44 @@ DEF_USER(Constant, Constant)
 #ifndef DEF_INSTR
 #define DEF_INSTR(ID, OPCODE, CLASS)
 #endif
-//       ClassID, Opcode(s),  Class
-DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
 
+#ifndef OP
+#define OP(OPCODE)
+#endif
+
+#ifndef OPCODES
+#define OPCODES(...)
+#endif
+// clang-format off
+//       ClassID,        Opcode(s),         Class
+DEF_INSTR(Opaque,        OP(Opaque),        OpaqueInst)
+DEF_INSTR(Select,        OP(Select),        SelectInst)
+DEF_INSTR(Br,            OP(Br),            BranchInst)
+DEF_INSTR(Load,          OP(Load),          LoadInst)
+DEF_INSTR(Store,         OP(Store),         StoreInst)
+DEF_INSTR(Ret,           OP(Ret),           ReturnInst)
+DEF_INSTR(Call,          OP(Call),          CallInst)
+DEF_INSTR(Invoke,        OP(Invoke),        InvokeInst)
+DEF_INSTR(CallBr,        OP(CallBr),        CallBrInst)
+DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst)
+DEF_INSTR(Cast,  OPCODES(\
+                         OP(ZExt)          \
+                         OP(SExt)          \
+                         OP(FPToUI)        \
+                         OP(FPToSI)        \
+                         OP(FPExt)         \
+                         OP(PtrToInt)      \
+                         OP(IntToPtr)      \
+                         OP(SIToFP)        \
+                         OP(UIToFP)        \
+                         OP(Trunc)         \
+                         OP(FPTrunc)       \
+                         OP(BitCast)       \
+                         OP(AddrSpaceCast) \
+                         ),                 CastInst)
+DEF_INSTR(PHI,           OP(PHI),           PHINode)
+  
+// clang-format on
 #ifdef DEF_VALUE
 #undef DEF_VALUE
 #endif
@@ -38,3 +73,6 @@ DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
 #ifdef OP
 #undef OP
 #endif
+#ifdef OPCODES
+#undef OPCODES
+#endif
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index 2d0904f5665b1..238e4e9dacd34 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -40,6 +40,7 @@
 #ifndef LLVM_SANDBOXIR_TRACKER_H
 #define LLVM_SANDBOXIR_TRACKER_H
 
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -52,6 +53,8 @@
 namespace llvm::sandboxir {
 
 class BasicBlock;
+class CallBrInst;
+class Instruction;
 class Tracker;
 
 /// The base class for IR Change classes.
@@ -99,6 +102,195 @@ class UseSet : public IRChangeBase {
 #endif
 };
 
+class PHISetIncoming : public IRChangeBase {
+  PHINode &PHI;
+  unsigned Idx;
+  PointerUnion<Value *, BasicBlock *> OrigValueOrBB;
+
+public:
+  enum class What {
+    Value,
+    Block,
+  };
+  PHISetIncoming(PHINode &PHI, unsigned Idx, What What, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "PHISetIncoming";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class PHIRemoveIncoming : public IRChangeBase {
+  PHINode &PHI;
+  unsigned RemovedIdx;
+  Value *RemovedV;
+  BasicBlock *RemovedBB;
+
+public:
+  PHIRemoveIncoming(PHINode &PHI, unsigned RemovedIdx, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "PHISetIncoming";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class PHIAddIncoming : public IRChangeBase {
+  PHINode &PHI;
+  unsigned Idx;
+
+public:
+  PHIAddIncoming(PHINode &PHI, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "PHISetIncoming";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+/// Tracks swapping a Use with another Use.
+class UseSwap : public IRChangeBase {
+  Use ThisUse;
+  Use OtherUse;
+
+public:
+  UseSwap(const Use &ThisUse, const Use &OtherUse, Tracker &Tracker)
+      : IRChangeBase(Tracker), ThisUse(ThisUse), OtherUse(OtherUse) {
+    assert(ThisUse.getUser() == OtherUse.getUser() && "Expected same user!");
+  }
+  void revert() final { ThisUse.swap(OtherUse); }
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "UseSwap";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class EraseFromParent : public IRChangeBase {
+  /// Contains all the data we need to restore an "erased" (i.e., detached)
+  /// instruction: the instruction itself and its operands in order.
+  struct InstrAndOperands {
+    /// The operands that got dropped.
+    SmallVector<llvm::Value *> Operands;
+    /// The instruction that got "erased".
+    llvm::Instruction *LLVMI;
+  };
+  /// The instruction data is in reverse program order, which helps create the
+  /// original program order during revert().
+  SmallVector<InstrAndOperands> InstrData;
+  /// This is either the next Instruction in the stream, or the parent
+  /// BasicBlock if at the end of the BB.
+  PointerUnion<llvm::Instruction *, llvm::BasicBlock *> NextLLVMIOrBB;
+  /// We take ownership of the "erased" instruction.
+  std::unique_ptr<sandboxir::Value> ErasedIPtr;
+
+public:
+  EraseFromParent(std::unique_ptr<sandboxir::Value> &&IPtr, Tracker &Tracker);
+  void revert() final;
+  void accept() final;
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "EraseFromParent";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+  friend raw_ostream &operator<<(raw_ostream &OS, const EraseFromParent &C) {
+    C.dump(OS);
+    return OS;
+  }
+#endif
+};
+
+class RemoveFromParent : public IRChangeBase {
+  /// The instruction that is about to get removed.
+  Instruction *RemovedI = nullptr;
+  /// This is either the next instr, or the parent BB if at the end of the BB.
+  PointerUnion<Instruction *, BasicBlock *> NextInstrOrBB;
+
+public:
+  RemoveFromParent(Instruction *RemovedI, Tracker &Tracker);
+  void revert() final;
+  void accept() final {};
+  Instruction *getInstruction() const { return RemovedI; }
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "RemoveFromParent";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class CallBrInstSetDefaultDest : public IRChangeBase {
+  CallBrInst *CallBr;
+  BasicBlock *OrigDefaultDest;
+
+public:
+  CallBrInstSetDefaultDest(CallBrInst *CallBr, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "CallBrInstSetDefaultDest";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class CallBrInstSetIndirectDest : public IRChangeBase {
+  CallBrInst *CallBr;
+  unsigned Idx;
+  BasicBlock *OrigIndirectDest;
+
+public:
+  CallBrInstSetIndirectDest(CallBrInst *CallBr, unsigned Idx, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "CallBrInstSetIndirectDest";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class MoveInstr : public IRChangeBase {
+  /// The instruction that moved.
+  Instruction *MovedI;
+  /// This is either the next instruction in the block, or the parent BB if at
+  /// the end of the BB.
+  PointerUnion<Instruction *, BasicBlock *> NextInstrOrBB;
+
+public:
+  MoveInstr(sandboxir::Instruction *I, Tracker &Tracker);
+  void revert() final;
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "MoveInstr";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
 /// The tracker collects all the change objects and implements the main API for
 /// saving / reverting / accepting.
 class Tracker {
@@ -116,6 +308,7 @@ class Tracker {
 #endif
   /// The current state of the tracker.
   TrackerState State = TrackerState::Disabled;
+  Context &Ctx;
 
 public:
 #ifndef NDEBUG
@@ -124,8 +317,9 @@ class Tracker {
   bool InMiddleOfCreatingChange = false;
 #endif // NDEBUG
 
-  Tracker() = default;
+  explicit Tracker(Context &Ctx) : Ctx(Ctx) {}
   ~Tracker();
+  Context &getContext() const { return Ctx; }
   /// Record \p Change and take ownership. This is the main function used to
   /// track Sandbox IR changes.
   void track(std::unique_ptr<IRChangeBase> &&Change);
diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h
index d77b4568d0fab..35d01daf39f6e 100644
--- a/llvm/include/llvm/SandboxIR/Use.h
+++ b/llvm/include/llvm/SandboxIR/Use.h
@@ -21,6 +21,8 @@ namespace llvm::sandboxir {
 class Context;
 class Value;
 class User;
+class CallBase;
+class PHINode;
 
 /// Represents a Def-use/Use-def edge in SandboxIR.
 /// NOTE: Unlike llvm::Use, this is not an integral part of the use-def chains.
@@ -40,6 +42,9 @@ class Use {
   friend class User;               // For constructor
   friend class OperandUseIterator; // For constructor
   friend class UserUseIterator;    // For accessing members
+  friend class CallBase;           // For LLVMUse
+  friend class CallBrInst;         // For constructor
+  friend class PHINode;            // For LLVMUse
 
 public:
   operator Value *() const { return get(); }
@@ -47,6 +52,7 @@ class Use {
   void set(Value *V);
   class User *getUser() const { return Usr; }
   unsigned getOperandNo() const;
+  void swap(Use &OtherUse);
   Context *getContext() const { return Ctx; }
   bool operator==(const Use &Other) const {
     assert(Ctx == Other.Ctx && "Contexts differ!");
diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h
index eb6c4d3668550..568f0d34032fa 100644
--- a/llvm/include/llvm/Support/Allocator.h
+++ b/llvm/include/llvm/Support/Allocator.h
@@ -149,8 +149,7 @@ class BumpPtrAllocatorImpl
     // Keep track of how many bytes we've allocated.
     BytesAllocated += Size;
 
-    size_t Adjustment = offsetToAlignedAddr(CurPtr, Alignment);
-    assert(Adjustment + Size >= Size && "Adjustment + Size must not overflow");
+    uintptr_t AlignedPtr = alignAddr(CurPtr, Alignment);
 
     size_t SizeToAllocate = Size;
 #if LLVM_ADDRESS_SANITIZER_BUILD
@@ -158,19 +157,22 @@ class BumpPtrAllocatorImpl
     SizeToAllocate += RedZoneSize;
 #endif
 
+    uintptr_t AllocEndPtr = AlignedPtr + SizeToAllocate;
+    assert(AllocEndPtr >= uintptr_t(CurPtr) &&
+           "Alignment + Size must not overflow");
+
     // Check if we have enough space.
-    if (LLVM_LIKELY(Adjustment + SizeToAllocate <= size_t(End - CurPtr)
+    if (LLVM_LIKELY(AllocEndPtr <= uintptr_t(End)
                     // We can't return nullptr even for a zero-sized allocation!
                     && CurPtr != nullptr)) {
-      char *AlignedPtr = CurPtr + Adjustment;
-      CurPtr = AlignedPtr + SizeToAllocate;
+      CurPtr = reinterpret_cast<char *>(AllocEndPtr);
       // Update the allocation point of this memory block in MemorySanitizer.
       // Without this, MemorySanitizer messages for values originated from here
       // will point to the allocation of the entire slab.
-      __msan_allocated_memory(AlignedPtr, Size);
+      __msan_allocated_memory(reinterpret_cast<char *>(AlignedPtr), Size);
       // Similarly, tell ASan about this space.
-      __asan_unpoison_memory_region(AlignedPtr, Size);
-      return AlignedPtr;
+      __asan_unpoison_memory_region(reinterpret_cast<char *>(AlignedPtr), Size);
+      return reinterpret_cast<char *>(AlignedPtr);
     }
 
     return AllocateSlow(Size, SizeToAllocate, Alignment);
@@ -435,6 +437,13 @@ template <typename T> class SpecificBumpPtrAllocator {
 
   /// Allocate space for an array of objects without constructing them.
   T *Allocate(size_t num = 1) { return Allocator.Allocate<T>(num); }
+
+  /// \return An index uniquely and reproducibly identifying
+  /// an input pointer \p Ptr in the given allocator.
+  /// Returns an empty optional if the pointer is not found in the allocator.
+  std::optional<int64_t> identifyObject(const void *Ptr) {
+    return Allocator.identifyObject(Ptr);
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h
index d0bed4d5cf383..a2222eec09ba8 100644
--- a/llvm/include/llvm/Support/DXILABI.h
+++ b/llvm/include/llvm/Support/DXILABI.h
@@ -17,7 +17,7 @@
 #ifndef LLVM_SUPPORT_DXILABI_H
 #define LLVM_SUPPORT_DXILABI_H
 
-#include "llvm/ADT/StringSwitch.h"
+#include <cstdint>
 
 namespace llvm {
 namespace dxil {
diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h
index a8e178d6461e0..e05e5f0f842e3 100644
--- a/llvm/include/llvm/Support/GenericDomTree.h
+++ b/llvm/include/llvm/Support/GenericDomTree.h
@@ -90,11 +90,7 @@ template <class NodeT> class DomTreeNodeBase {
   DomTreeNodeBase *getIDom() const { return IDom; }
   unsigned getLevel() const { return Level; }
 
-  std::unique_ptr<DomTreeNodeBase> addChild(
-      std::unique_ptr<DomTreeNodeBase> C) {
-    Children.push_back(C.get());
-    return C;
-  }
+  void addChild(DomTreeNodeBase *C) { Children.push_back(C); }
 
   bool isLeaf() const { return Children.empty(); }
   size_t getNumChildren() const { return Children.size(); }
@@ -636,7 +632,7 @@ class DominatorTreeBase {
     DomTreeNodeBase<NodeT> *IDomNode = getNode(DomBB);
     assert(IDomNode && "Not immediate dominator specified for block!");
     DFSInfoValid = false;
-    return createChild(BB, IDomNode);
+    return createNode(BB, IDomNode);
   }
 
   /// Add a new node to the forward dominator tree and make it a new root.
@@ -655,8 +651,8 @@ class DominatorTreeBase {
     } else {
       assert(Roots.size() == 1);
       NodeT *OldRoot = Roots.front();
-      auto &OldNode = DomTreeNodes[OldRoot];
-      OldNode = NewNode->addChild(std::move(DomTreeNodes[OldRoot]));
+      DomTreeNodeBase<NodeT> *OldNode = getNode(OldRoot);
+      NewNode->addChild(OldNode);
       OldNode->IDom = NewNode;
       OldNode->UpdateLevel();
       Roots[0] = BB;
@@ -695,7 +691,8 @@ class DominatorTreeBase {
       assert(I != IDom->Children.end() &&
              "Not in immediate dominator children set!");
       // I am no longer your child...
-      IDom->Children.erase(I);
+      std::swap(*I, IDom->Children.back());
+      IDom->Children.pop_back();
     }
 
     DomTreeNodes.erase(BB);
@@ -830,16 +827,14 @@ class DominatorTreeBase {
 protected:
   void addRoot(NodeT *BB) { this->Roots.push_back(BB); }
 
-  DomTreeNodeBase<NodeT> *createChild(NodeT *BB, DomTreeNodeBase<NodeT> *IDom) {
-    return (DomTreeNodes[BB] = IDom->addChild(
-                std::make_unique<DomTreeNodeBase<NodeT>>(BB, IDom)))
-        .get();
-  }
-
-  DomTreeNodeBase<NodeT> *createNode(NodeT *BB) {
-    return (DomTreeNodes[BB] =
-                std::make_unique<DomTreeNodeBase<NodeT>>(BB, nullptr))
-        .get();
+  DomTreeNodeBase<NodeT> *createNode(NodeT *BB,
+                                     DomTreeNodeBase<NodeT> *IDom = nullptr) {
+    auto Node = std::make_unique<DomTreeNodeBase<NodeT>>(BB, IDom);
+    auto *NodePtr = Node.get();
+    DomTreeNodes[BB] = std::move(Node);
+    if (IDom)
+      IDom->addChild(NodePtr);
+    return NodePtr;
   }
 
   // NewBB is split and now it has one successor. Update dominator tree to
diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
index 57cbe993d8739..af7ac04a5ab29 100644
--- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
@@ -137,12 +137,12 @@ struct SemiNCAInfo {
     // immediate dominator.
     NodePtr IDom = getIDom(BB);
 
-    assert(IDom || DT.DomTreeNodes[nullptr]);
+    assert(IDom || DT.getNode(nullptr));
     TreeNodePtr IDomNode = getNodeForBlock(IDom, DT);
 
     // Add a new tree node for this NodeT, and link it as a child of
     // IDomNode
-    return DT.createChild(BB, IDomNode);
+    return DT.createNode(BB, IDomNode);
   }
 
   static bool AlwaysDescend(NodePtr, NodePtr) { return true; }
@@ -585,8 +585,8 @@ struct SemiNCAInfo {
     NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock();
     // Loop over all of the discovered blocks in the function...
     for (NodePtr W : llvm::drop_begin(NumToNode)) {
-      // Don't replace this with 'count', the insertion side effect is important
-      if (DT.DomTreeNodes[W]) continue;  // Haven't calculated this node yet?
+      if (DT.getNode(W))
+        continue; // Already calculated the node before
 
       NodePtr ImmDom = getIDom(W);
 
@@ -595,7 +595,7 @@ struct SemiNCAInfo {
 
       // Add a new tree node for this BasicBlock, and link it as a child of
       // IDomNode.
-      DT.createChild(W, IDomNode);
+      DT.createNode(W, IDomNode);
     }
   }
 
@@ -644,7 +644,7 @@ struct SemiNCAInfo {
 
       // The unreachable node becomes a new root -- a tree node for it.
       TreeNodePtr VirtualRoot = DT.getNode(nullptr);
-      FromTN = DT.createChild(From, VirtualRoot);
+      FromTN = DT.createNode(From, VirtualRoot);
       DT.Roots.push_back(From);
     }
 
@@ -1078,10 +1078,9 @@ struct SemiNCAInfo {
     // before deleting their parent.
     for (unsigned i = LastDFSNum; i > 0; --i) {
       const NodePtr N = SNCA.NumToNode[i];
-      const TreeNodePtr TN = DT.getNode(N);
-      LLVM_DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");
-
-      EraseNode(DT, TN);
+      LLVM_DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(DT.getNode(N))
+                        << "\n");
+      DT.eraseNode(N);
     }
 
     // The affected subtree start at the To node -- there's no extra work to do.
@@ -1109,22 +1108,6 @@ struct SemiNCAInfo {
     SNCA.reattachExistingSubtree(DT, PrevIDom);
   }
 
-  // Removes leaf tree nodes from the dominator tree.
-  static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) {
-    assert(TN);
-    assert(TN->getNumChildren() == 0 && "Not a tree leaf");
-
-    const TreeNodePtr IDom = TN->getIDom();
-    assert(IDom);
-
-    auto ChIt = llvm::find(IDom->Children, TN);
-    assert(ChIt != IDom->Children.end());
-    std::swap(*ChIt, IDom->Children.back());
-    IDom->Children.pop_back();
-
-    DT.DomTreeNodes.erase(TN->getBlock());
-  }
-
   //~~
   //===--------------------- DomTree Batch Updater --------------------------===
   //~~
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 0d0fa826f7bba..e568e42afcf4d 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -770,6 +770,14 @@ std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
 #endif
 }
 
+/// Type to force float point values onto the stack, so that x86 doesn't add
+/// hidden precision, avoiding rounding differences on various platforms.
+#if defined(__i386__) || defined(_M_IX86)
+using stack_float_t = volatile float;
+#else
+using stack_float_t = float;
+#endif
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index a6672f87af977..9fb6de49fb205 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -503,6 +503,12 @@ HANDLE_TARGET_OPCODE(G_ICMP)
 /// Generic floating-point comparison, also applicable to vectors.
 HANDLE_TARGET_OPCODE(G_FCMP)
 
+/// Generic signed 3-way comparison.
+HANDLE_TARGET_OPCODE(G_SCMP)
+
+/// Generic unsigned 3-way comparison.
+HANDLE_TARGET_OPCODE(G_UCMP)
+
 /// Generic select.
 HANDLE_TARGET_OPCODE(G_SELECT)
 
diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h
index 31f7df10916db..9e2ba31991f54 100644
--- a/llvm/include/llvm/Support/TimeProfiler.h
+++ b/llvm/include/llvm/Support/TimeProfiler.h
@@ -83,16 +83,28 @@ namespace llvm {
 
 class raw_pwrite_stream;
 
+struct TimeTraceMetadata {
+  std::string Detail;
+  // Source file and line number information for the event.
+  std::string File;
+  int Line = 0;
+
+  bool isEmpty() const { return Detail.empty() && File.empty(); }
+};
+
 struct TimeTraceProfiler;
 TimeTraceProfiler *getTimeTraceProfilerInstance();
 
+bool isTimeTraceVerbose();
+
 struct TimeTraceProfilerEntry;
 
 /// Initialize the time trace profiler.
 /// This sets up the global \p TimeTraceProfilerInstance
 /// variable to be the profiler instance.
 void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
-                                 StringRef ProcName);
+                                 StringRef ProcName,
+                                 bool TimeTraceVerbose = false);
 
 /// Cleanup the time trace profiler, if it was initialized.
 void timeTraceProfilerCleanup();
@@ -128,6 +140,10 @@ TimeTraceProfilerEntry *
 timeTraceProfilerBegin(StringRef Name,
                        llvm::function_ref<std::string()> Detail);
 
+TimeTraceProfilerEntry *
+timeTraceProfilerBegin(StringRef Name,
+                       llvm::function_ref<TimeTraceMetadata()> MetaData);
+
 /// Manually begin a time section, with the given \p Name and \p Detail.
 /// This starts Async Events having \p Name as a category which is shown
 /// separately from other traces. See
@@ -164,6 +180,11 @@ class TimeTraceScope {
     if (getTimeTraceProfilerInstance() != nullptr)
       Entry = timeTraceProfilerBegin(Name, Detail);
   }
+  TimeTraceScope(StringRef Name,
+                 llvm::function_ref<TimeTraceMetadata()> Metadata) {
+    if (getTimeTraceProfilerInstance() != nullptr)
+      Entry = timeTraceProfilerBegin(Name, Metadata);
+  }
   ~TimeTraceScope() {
     if (getTimeTraceProfilerInstance() != nullptr)
       timeTraceProfilerEnd(Entry);
diff --git a/llvm/include/llvm/Support/raw_socket_stream.h b/llvm/include/llvm/Support/raw_socket_stream.h
index eed865fb5af49..6c65a66dec9a4 100644
--- a/llvm/include/llvm/Support/raw_socket_stream.h
+++ b/llvm/include/llvm/Support/raw_socket_stream.h
@@ -92,13 +92,14 @@ class ListeningSocket {
   /// Accepts an incoming connection on the listening socket. This method can
   /// optionally either block until a connection is available or timeout after a
   /// specified amount of time has passed. By default the method will block
-  /// until the socket has recieved a connection.
+  /// until the socket has recieved a connection. If the accept timesout this
+  /// method will return std::errc:timed_out
   ///
   /// \param Timeout An optional timeout duration in milliseconds. Setting
-  /// Timeout to -1 causes accept to block indefinitely
+  /// Timeout to a negative number causes ::accept to block indefinitely
   ///
-  Expected<std::unique_ptr<raw_socket_stream>>
-  accept(std::chrono::milliseconds Timeout = std::chrono::milliseconds(-1));
+  Expected<std::unique_ptr<raw_socket_stream>> accept(
+      const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1));
 
   /// Creates a listening socket bound to the specified file system path.
   /// Handles the socket creation, binding, and immediately starts listening for
@@ -124,11 +125,28 @@ class raw_socket_stream : public raw_fd_stream {
 
 public:
   raw_socket_stream(int SocketFD);
+  ~raw_socket_stream();
+
   /// Create a \p raw_socket_stream connected to the UNIX domain socket at \p
   /// SocketPath.
   static Expected<std::unique_ptr<raw_socket_stream>>
   createConnectedUnix(StringRef SocketPath);
-  ~raw_socket_stream();
+
+  /// Attempt to read from the raw_socket_stream's file descriptor.
+  ///
+  /// This method can optionally either block until data is read or an error has
+  /// occurred or timeout after a specified amount of time has passed. By
+  /// default the method will block until the socket has read data or
+  /// encountered an error. If the read times out this method will return
+  /// std::errc:timed_out
+  ///
+  /// \param Ptr The start of the buffer that will hold any read data
+  /// \param Size The number of bytes to be read
+  /// \param Timeout An optional timeout duration in milliseconds
+  ///
+  ssize_t read(
+      char *Ptr, size_t Size,
+      const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1));
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 7501048dfdd78..36a0a087ba457 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -430,6 +430,20 @@ def G_FCMP : GenericInstruction {
   let hasSideEffects = false;
 }
 
+// Generic signed three-way comparison.
+def G_SCMP : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src1, type1:$src2);
+  let hasSideEffects = false;
+}
+
+// Generic unsigned three-way comparison.
+def G_UCMP : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src1, type1:$src2);
+  let hasSideEffects = false;
+}
+
 // Generic select
 def G_SELECT : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3ef0636ebf1c7..1f26132561cca 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -128,6 +128,16 @@ class GITypeOf<string opName> : GISpecialType {
   string OpName = opName;
 }
 
+// The type of an operand that can match a variable amount of operands.
+// This type contains a minimum and maximum number of operands to match.
+// The minimum must be 1 or more, as we cannot have an operand representing
+// zero operands, and the max can be zero (which means "unlimited") or a value
+// greater than the minimum.
+class GIVariadic<int min = 1, int max = 0> : GISpecialType {
+  int MinArgs = min;
+  int MaxArgs = max;
+}
+
 //===----------------------------------------------------------------------===//
 // Pattern Builtins
 //===----------------------------------------------------------------------===//
@@ -839,15 +849,6 @@ def unmerge_zext_to_zext : GICombineRule<
   (apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
 >;
 
-// Fold trunc ([asz]ext x) -> x or ([asz]ext x) or (trunc x).
-def trunc_ext_fold_matchinfo : GIDefMatchData<"std::pair<Register, unsigned>">;
-def trunc_ext_fold: GICombineRule <
-  (defs root:$root, trunc_ext_fold_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_TRUNC):$root,
-         [{ return Helper.matchCombineTruncOfExt(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyCombineTruncOfExt(*${root}, ${matchinfo}); }])
->;
-
 // Under certain conditions, transform:
 //  trunc (shl x, K)     -> shl (trunc x), K//
 //  trunc ([al]shr x, K) -> (trunc ([al]shr (trunc x), K))
@@ -1768,6 +1769,40 @@ def freeze_combines: GICombineGroup<[
   push_freeze_to_prevent_poison_from_propagating
 ]>;
 
+/// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
+class truncate_of_opcode<Instruction extOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (extOpcode $ext, $src):$ExtMI,
+         (G_TRUNC $root, $ext):$root,
+         [{ return Helper.matchTruncateOfExt(*${root}, *${ExtMI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+def truncate_of_zext : truncate_of_opcode<G_ZEXT>;
+def truncate_of_sext : truncate_of_opcode<G_SEXT>;
+def truncate_of_anyext : truncate_of_opcode<G_ANYEXT>;
+
+// Push cast through select.
+class select_of_opcode<Instruction castOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_SELECT $select, $cond, $true, $false):$Select,
+         (castOpcode $root, $select):$Cast,
+         [{ return Helper.matchCastOfSelect(*${Cast}, *${Select}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>;
+
+def select_of_zext : select_of_opcode<G_ZEXT>;
+def select_of_anyext : select_of_opcode<G_ANYEXT>;
+def select_of_truncate : select_of_opcode<G_TRUNC>;
+
+def cast_combines: GICombineGroup<[
+  truncate_of_zext,
+  truncate_of_sext,
+  truncate_of_anyext,
+  select_of_zext,
+  select_of_anyext,
+  select_of_truncate
+]>;
+
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1828,7 +1863,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 def prefer_sign_combines : GICombineGroup<[nneg_zext]>;
 
 def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
-    vector_ops_combines, freeze_combines,
+    vector_ops_combines, freeze_combines, cast_combines,
     insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
     combine_extracted_vector_load,
     undef_combines, identity_combines, phi_combines,
@@ -1839,7 +1874,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     known_bits_simplifications, ext_ext_fold,
     not_cmp_fold, opt_brcond_by_inverting_cond,
     unmerge_merge, unmerge_cst, unmerge_dead_to_trunc,
-    unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
+    unmerge_zext_to_zext, merge_unmerge, trunc_shift,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
     div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
index b20d124953f88..101a48cbd5399 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
 LOONGARCH_FEATURE("+lbt", FK_LBT)
 LOONGARCH_FEATURE("+lvz", FK_LVZ)
 LOONGARCH_FEATURE("+ual", FK_UAL)
+LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 
 #undef LOONGARCH_FEATURE
 
@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL)
 
 LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
 LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
+LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)
 
 #undef LOONGARCH_ARCH
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index 028844187584b..c0bb15a5163b1 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {
 
   // Allow memory accesses to be unaligned.
   FK_UAL = 1 << 8,
+
+  // Floating-point approximate reciprocal instructions are available.
+  FK_FRECIPE = 1 << 9,
 };
 
 struct FeatureInfo {
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def
index 44e97d56a059c..da4be3e39f2c7 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.def
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def
@@ -40,6 +40,7 @@
 #undef AIX_PPC8_VALUE
 #undef AIX_PPC9_VALUE
 #undef AIX_PPC10_VALUE
+#undef AIX_PPC11_VALUE
 #else
 #ifndef PPC_LNX_FEATURE
 #define PPC_LNX_FEATURE(NAME, DESC, ENUMNAME, ENUMVAL, HWCAPN)
@@ -84,25 +85,75 @@
 #define AIX_PPC8_VALUE 0x00010000
 #define AIX_PPC9_VALUE 0x00020000
 #define AIX_PPC10_VALUE 0x00040000
+#define AIX_PPC11_VALUE 0x00080000
 
 // __builtin_cpu_is() and __builtin_cpu_supports() are supported only on Power7 and up on AIX.
 // PPC_CPU(Name, Linux_SUPPORT_METHOD, LinuxID, AIX_SUPPORT_METHOD, AIXID)
-PPC_CPU("power4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("generic",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("440fp",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("450",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("601",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("602",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603e",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603ev",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("604",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("604e",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("620",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("630",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("7400",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g4",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("7450",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g4+",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("750",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("8548",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc405",SYS_CALL,41,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc464",SYS_CALL,43,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc476",SYS_CALL,44,BUILTIN_PPC_FALSE,0)
+PPC_CPU("970",SYS_CALL,33,BUILTIN_PPC_FALSE,0)
 PPC_CPU("ppc970",SYS_CALL,33,BUILTIN_PPC_FALSE,0)
+PPC_CPU("g5",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("a2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppca2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc-cell-be",SYS_CALL,37,BUILTIN_PPC_FALSE,0)
+PPC_CPU("e500",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("e500mc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("e5500",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("power3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("power4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5",SYS_CALL,34,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power5",SYS_CALL,34,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5+",SYS_CALL,35,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power5+",SYS_CALL,35,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5x",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("power5x",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr6",SYS_CALL,36,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power6",SYS_CALL,36,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc-cell-be",SYS_CALL,37,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr6x",SYS_CALL,38,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power6x",SYS_CALL,38,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr7",SYS_CALL,39,USE_SYS_CONF,AIX_PPC7_VALUE)
 PPC_CPU("power7",SYS_CALL,39,USE_SYS_CONF,AIX_PPC7_VALUE)
-PPC_CPU("ppca2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc405",SYS_CALL,41,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc464",SYS_CALL,43,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc476",SYS_CALL,44,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr8",SYS_CALL,45,USE_SYS_CONF,AIX_PPC8_VALUE)
 PPC_CPU("power8",SYS_CALL,45,USE_SYS_CONF,AIX_PPC8_VALUE)
+PPC_CPU("pwr9",SYS_CALL,46,USE_SYS_CONF,AIX_PPC9_VALUE)
 PPC_CPU("power9",SYS_CALL,46,USE_SYS_CONF,AIX_PPC9_VALUE)
+PPC_CPU("pwr10",SYS_CALL,47,USE_SYS_CONF,AIX_PPC10_VALUE)
 PPC_CPU("power10",SYS_CALL,47,USE_SYS_CONF,AIX_PPC10_VALUE)
+PPC_CPU("pwr11",SYS_CALL,48,USE_SYS_CONF,AIX_PPC11_VALUE)
+PPC_CPU("power11",SYS_CALL,48,USE_SYS_CONF,AIX_PPC11_VALUE)
+PPC_CPU("powerpc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc32",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("powerpc64",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc64",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("powerpc64le",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc64le",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("future",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
 #undef PPC_CPU
 
 // PPC features on Linux:
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
new file mode 100644
index 0000000000000..5f9fe543aff0b
--- /dev/null
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -0,0 +1,40 @@
+//===---- PPCTargetParser - Parser for target features ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise hardware features
+// for PPC CPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
+#define LLVM_TARGETPARSER_PPCTARGETPARSER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/TargetParser/Triple.h"
+
+namespace llvm {
+namespace PPC {
+bool isValidCPU(StringRef CPU);
+void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
+
+// Get target CPU name.
+// If CPUName is empty or generic, return the default CPU name.
+// If CPUName is not empty or generic, return the normalized CPU name.
+StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = "");
+
+// Get the tune CPU name.
+StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = "");
+
+// For PPC, there are some cpu names for same CPU, like pwr10 and power10,
+// normalize them.
+StringRef normalizeCPUName(StringRef CPUName);
+} // namespace PPC
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index d7a08013fa6ac..d71ff174bf0d2 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -80,6 +80,10 @@ class RISCVISAInfo {
                                      std::set<StringRef> &EnabledFeatureNames,
                                      StringMap<StringRef> &DescMap);
 
+  /// Return the bit position (in group 0) of __riscv_feature_bits.  Returns
+  /// -1 if not supported.
+  static int getRISCVFeaturesBitPosition(StringRef Ext);
+
 private:
   RISCVISAInfo(unsigned XLen) : XLen(XLen) {}
 
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index 7421dac2744b6..c75778952e0f5 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -24,6 +24,14 @@ class Triple;
 
 namespace RISCV {
 
+namespace RISCVExtensionBitmaskTable {
+struct RISCVExtensionBitmask {
+  const char *Name;
+  unsigned GroupID;
+  unsigned BitPosition;
+};
+} // namespace RISCVExtensionBitmaskTable
+
 // We use 64 bits as the known part in the scalable vector types.
 static constexpr unsigned RVVBitsPerBlock = 64;
 
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 1dc7261aa8fac..5dcc0e1075df8 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -277,7 +277,7 @@ class Triple {
     Cygnus,
     CoreCLR,
     Simulator, // Simulator variants of other systems, e.g., Apple's iOS
-    MacABI, // Mac Catalyst variant of Apple's iOS deployment target.
+    MacABI,    // Mac Catalyst variant of Apple's iOS deployment target.
 
     // Shader Stages
     // The order of these values matters, and must be kept in sync with the
@@ -301,7 +301,9 @@ class Triple {
     OpenCL,
     OpenHOS,
 
-    LastEnvironmentType = OpenHOS
+    PAuthTest,
+
+    LastEnvironmentType = PAuthTest
   };
   enum ObjectFormatType {
     UnknownObjectFormat,
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 0e4ad873e3639..92798cbe4b4c1 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -175,8 +175,12 @@ X86_FEATURE_COMPAT(AVX512BF16,      "avx512bf16",            34)
 X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 35)
 // Below Features has some missings comparing to gcc, it's because gcc has some
 // not one-to-one mapped in llvm.
-X86_FEATURE_COMPAT(3DNOW,           "3dnow",                  0)
-X86_FEATURE       (3DNOWA,          "3dnowa")
+
+// FIXME: dummy features were added to keep the numeric values of later features
+// stable. Since the values need to be ABI stable, they should be changed to
+// have explicitly assigned values, and then these dummy features removed.
+X86_FEATURE       (DUMMYFEATURE1,   "__dummyfeature1")
+X86_FEATURE       (DUMMYFEATURE2,   "__dummyfeature2")
 X86_FEATURE_COMPAT(ADX,             "adx",                    0)
 X86_FEATURE       (64BIT,           "64bit")
 X86_FEATURE_COMPAT(CLDEMOTE,        "cldemote",               0)
diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h
index ea97ab2562a5b..969c2cd12f3f0 100644
--- a/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation.h
@@ -121,12 +121,18 @@ struct InstrProfOptions {
   // Use BFI to guide register promotion
   bool UseBFIInPromotion = false;
 
+  // Use sampling to reduce the profile instrumentation runtime overhead.
+  bool Sampling = false;
+
   // Name of the profile file to use as output
   std::string InstrProfileOutput;
 
   InstrProfOptions() = default;
 };
 
+// Create the variable for profile sampling.
+void createProfileSamplingVar(Module &M);
+
 // Options for sanitizer coverage instrumentation.
 struct SanitizerCoverageOptions {
   enum Type {
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
index 6dfdfb729cf50..1b85766f6cb7c 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
@@ -1,4 +1,4 @@
-//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
+//===- AddressSanitizer.h - AddressSanitizer instrumentation ----*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
index 41ba05cd67f0c..3256dddd12b38 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
@@ -1,4 +1,4 @@
-//===- DataFlowSanitizer.h - dynamic data flow analysis -------------------===//
+//===- DataFlowSanitizer.h - dynamic data flow analysis ---------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
index 0984e8ec32656..f88d832351118 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
@@ -1,4 +1,4 @@
-//===- Transforms/Instrumentation/MemorySanitizer.h - MSan Pass -----------===//
+//===- MemorySanitizer.h - MemorySanitizer instrumentation ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index 5b1977b7de9a2..7199f27dbc991 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -43,12 +43,14 @@ class FileSystem;
 class PGOInstrumentationGenCreateVar
     : public PassInfoMixin<PGOInstrumentationGenCreateVar> {
 public:
-  PGOInstrumentationGenCreateVar(std::string CSInstrName = "")
-      : CSInstrName(CSInstrName) {}
+  PGOInstrumentationGenCreateVar(std::string CSInstrName = "",
+                                 bool Sampling = false)
+      : CSInstrName(CSInstrName), ProfileSampling(Sampling) {}
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 
 private:
   std::string CSInstrName;
+  bool ProfileSampling;
 };
 
 /// The instrumentation (profile-instr-gen) pass for IR based PGO.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
index fd37130d54596..346951febf735 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
@@ -1,4 +1,4 @@
-//===- Transforms/Instrumentation/ThreadSanitizer.h - TSan Pass -----------===//
+//===- ThreadSanitizer.h - ThreadSanitizer instrumentation ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index f74a49785e11b..98d0adca35521 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -149,13 +149,6 @@ extern char &InferAddressSpacesID;
 //
 FunctionPass *createTLSVariableHoistPass();
 
-//===----------------------------------------------------------------------===//
-//
-// LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and
-// llvm.is.constant intrinsic calls, even for the unknown cases.
-//
-FunctionPass *createLowerConstantIntrinsicsPass();
-
 //===----------------------------------------------------------------------===//
 //
 // PartiallyInlineLibCalls - Tries to inline the fast path of library
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h b/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h
index e8e404bb93d6e..3d4d1034c0b24 100644
--- a/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h
@@ -19,7 +19,12 @@
 
 namespace llvm {
 
+class DominatorTree;
 class Function;
+class TargetLibraryInfo;
+
+bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI,
+                             DominatorTree *DT);
 
 struct LowerConstantIntrinsicsPass :
     PassInfoMixin<LowerConstantIntrinsicsPass> {
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b01a447f3c28b..56880bd4822c7 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -359,6 +359,10 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         SinkAndHoistLICMFlags &LICMFlags,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Returns the llvm.vector.reduce intrinsic that corresponds to the recurrence
+/// kind.
+constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
+
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
 
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index 28ff6c4c7927d..010d6b0de9f6a 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -15,6 +15,7 @@
 #define LLVM_TRANSFORMS_UTILS_SSAUPDATERIMPL_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
@@ -413,26 +414,33 @@ class SSAUpdaterImpl {
   /// FindExistingPHI - Look through the PHI nodes in a block to see if any of
   /// them match what is needed.
   void FindExistingPHI(BlkT *BB, BlockListTy *BlockList) {
+    SmallVector<BBInfo *, 20> TaggedBlocks;
     for (auto &SomePHI : BB->phis()) {
-      if (CheckIfPHIMatches(&SomePHI)) {
+      if (CheckIfPHIMatches(&SomePHI, TaggedBlocks)) {
         RecordMatchingPHIs(BlockList);
         break;
       }
-      // Match failed: clear all the PHITag values.
-      for (typename BlockListTy::iterator I = BlockList->begin(),
-             E = BlockList->end(); I != E; ++I)
-        (*I)->PHITag = nullptr;
     }
   }
 
   /// CheckIfPHIMatches - Check if a PHI node matches the placement and values
   /// in the BBMap.
-  bool CheckIfPHIMatches(PhiT *PHI) {
+  bool CheckIfPHIMatches(PhiT *PHI, SmallVectorImpl<BBInfo *> &TaggedBlocks) {
+    // Match failed: clear all the PHITag values. Only need to clear visited
+    // blocks.
+    auto Cleanup = make_scope_exit([&]() {
+      for (BBInfo *TaggedBlock : TaggedBlocks)
+        TaggedBlock->PHITag = nullptr;
+      TaggedBlocks.clear();
+    });
+
     SmallVector<PhiT *, 20> WorkList;
     WorkList.push_back(PHI);
 
     // Mark that the block containing this PHI has been visited.
-    BBMap[PHI->getParent()]->PHITag = PHI;
+    BBInfo *PHIBlock = BBMap[PHI->getParent()];
+    PHIBlock->PHITag = PHI;
+    TaggedBlocks.push_back(PHIBlock);
 
     while (!WorkList.empty()) {
       PHI = WorkList.pop_back_val();
@@ -465,10 +473,13 @@ class SSAUpdaterImpl {
           return false;
         }
         PredInfo->PHITag = IncomingPHIVal;
+        TaggedBlocks.push_back(PredInfo);
 
         WorkList.push_back(IncomingPHIVal);
       }
     }
+    // Match found, keep PHITags.
+    Cleanup.release();
     return true;
   }
 
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index 8008fc6e8422d..2ea9d64f03cb6 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -30,6 +30,7 @@ struct SimplifyCFGOptions {
   bool SinkCommonInsts = false;
   bool SimplifyCondBranch = true;
   bool SpeculateBlocks = true;
+  bool SpeculateUnpredictables = false;
 
   AssumptionCache *AC = nullptr;
 
@@ -75,6 +76,10 @@ struct SimplifyCFGOptions {
     SpeculateBlocks = B;
     return *this;
   }
+  SimplifyCFGOptions &speculateUnpredictables(bool B) {
+    SpeculateUnpredictables = B;
+    return *this;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index e2682b429e9db..43b5c9250a890 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -205,6 +205,7 @@ class LibCallSimplifier {
   Value *optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBase &B);
   Value *optimizeTrigInversionPairs(CallInst *CI, IRBuilderBase &B);
   Value *optimizeSymmetric(CallInst *CI, LibFunc Func, IRBuilderBase &B);
+  Value *optimizeRemquo(CallInst *CI, IRBuilderBase &B);
   // Wrapper for all floating point library call optimizations
   Value *optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func,
                                       IRBuilderBase &B);
@@ -244,6 +245,9 @@ class LibCallSimplifier {
   Value *optimizeSnPrintFString(CallInst *CI, IRBuilderBase &B);
   Value *optimizeFPrintFString(CallInst *CI, IRBuilderBase &B);
 
+  /// Exit functions
+  Value *optimizeExit(CallInst *CI);
+
   /// hasFloatVersion - Checks if there is a float version of the specified
   /// function by checking for an existing function with name FuncName + f
   bool hasFloatVersion(const Module *M, StringRef FuncName);
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2ff17bd2f7a71..0f4d1355dd2bf 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -349,8 +349,8 @@ class LoopVectorizationLegality {
   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
 
-  /// Returns true if value V is uniform across \p VF lanes, when \p VF is
-  /// provided, and otherwise if \p V is invariant across all loop iterations.
+  /// Returns true if \p V is invariant across all loop iterations according to
+  /// SCEV.
   bool isInvariant(Value *V) const;
 
   /// Returns true if value V is uniform across \p VF lanes, when \p VF is
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 161a3034e4829..e474899fb548e 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1692,9 +1692,12 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   if (!Pair.second) {
     auto &Entry = Pair.first->second;
     if (!Entry.isDefinitive()) {
-      // Remember that we used an assumption.
-      ++Entry.NumAssumptionUses;
+      // Remember that we used an assumption. This may either be a direct use
+      // of an assumption, or a use of an entry that may itself be based on an
+      // assumption.
       ++AAQI.NumAssumptionUses;
+      if (Entry.isAssumption())
+        ++Entry.NumAssumptionUses;
     }
     // Cache contains sorted {V1,V2} pairs but we should return original order.
     auto Result = Entry.Result;
@@ -1722,7 +1725,6 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   Entry.Result = Result;
   // Cache contains sorted {V1,V2} pairs.
   Entry.Result.swap(Swapped);
-  Entry.NumAssumptionUses = -1;
 
   // If the assumption has been disproven, remove any results that may have
   // been based on this assumption. Do this after the Entry updates above to
@@ -1734,8 +1736,26 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   // The result may still be based on assumptions higher up in the chain.
   // Remember it, so it can be purged from the cache later.
   if (OrigNumAssumptionUses != AAQI.NumAssumptionUses &&
-      Result != AliasResult::MayAlias)
+      Result != AliasResult::MayAlias) {
     AAQI.AssumptionBasedResults.push_back(Locs);
+    Entry.NumAssumptionUses = AAQueryInfo::CacheEntry::AssumptionBased;
+  } else {
+    Entry.NumAssumptionUses = AAQueryInfo::CacheEntry::Definitive;
+  }
+
+  // Depth is incremented before this function is called, so Depth==1 indicates
+  // a root query.
+  if (AAQI.Depth == 1) {
+    // Any remaining assumption based results must be based on proven
+    // assumptions, so convert them to definitive results.
+    for (const auto &Loc : AAQI.AssumptionBasedResults) {
+      auto It = AAQI.AliasCache.find(Loc);
+      if (It != AAQI.AliasCache.end())
+        It->second.NumAssumptionUses = AAQueryInfo::CacheEntry::Definitive;
+    }
+    AAQI.AssumptionBasedResults.clear();
+    AAQI.NumAssumptionUses = 0;
+  }
   return Result;
 }
 
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 74476cb5440c6..997bb7a0bb178 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_component_library(LLVMAnalysis
   DomPrinter.cpp
   DomTreeUpdater.cpp
   DominanceFrontier.cpp
+  DXILResource.cpp
   FunctionPropertiesAnalysis.cpp
   GlobalsModRef.cpp
   GuardUtils.cpp
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index df75745645e04..084647b1d9d93 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -564,16 +564,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,
     Type *MapTy = Type::getIntNTy(C->getContext(),
                                   DL.getTypeSizeInBits(LoadTy).getFixedValue());
     if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
-      if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
-          !LoadTy->isX86_AMXTy())
+      if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
         // Materializing a zero can be done trivially without a bitcast
         return Constant::getNullValue(LoadTy);
       Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(LoadTy) : LoadTy;
       Res = FoldBitCast(Res, CastTy, DL);
       if (LoadTy->isPtrOrPtrVectorTy()) {
         // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr
-        if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
-            !LoadTy->isX86_AMXTy())
+        if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
           return Constant::getNullValue(LoadTy);
         if (DL.isNonIntegralPointerType(LoadTy->getScalarType()))
           // Be careful not to replace a load of an addrspace value with an inttoptr here
@@ -764,7 +762,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty,
   // uniform.
   if (!DL.typeSizeEqualsStoreSize(C->getType()))
     return nullptr;
-  if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy())
+  if (C->isNullValue() && !Ty->isX86_AMXTy())
     return Constant::getNullValue(Ty);
   if (C->isAllOnesValue() &&
       (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy()))
@@ -1784,8 +1782,8 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
 }
 
 #if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
-Constant *ConstantFoldFP128(long double (*NativeFP)(long double),
-                            const APFloat &V, Type *Ty) {
+Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V,
+                            Type *Ty) {
   llvm_fenv_clearexcept();
   float128 Result = NativeFP(V.convertToQuad());
   if (llvm_fenv_testexcept()) {
diff --git a/llvm/lib/Transforms/Utils/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
similarity index 64%
rename from llvm/lib/Transforms/Utils/DXILResource.cpp
rename to llvm/lib/Analysis/DXILResource.cpp
index bf45654a591b5..5e8350fd2d251 100644
--- a/llvm/lib/Transforms/Utils/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -1,4 +1,4 @@
-//===- DXILResource.cpp - Tools to translate DXIL resources ---------------===//
+//===- DXILResource.cpp - Representations of DXIL resources ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/DXILResource.h"
+#include "llvm/Analysis/DXILResource.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Metadata.h"
 
 using namespace llvm;
 using namespace dxil;
@@ -63,184 +64,142 @@ bool ResourceInfo::isMultiSample() const {
 }
 
 ResourceInfo ResourceInfo::SRV(Value *Symbol, StringRef Name,
-                               ResourceBinding Binding, uint32_t UniqueID,
                                ElementType ElementTy, uint32_t ElementCount,
                                ResourceKind Kind) {
-  ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name, Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name);
   assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
          "Invalid ResourceKind for SRV constructor.");
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
+  RI.setTyped(ElementTy, ElementCount);
   return RI;
 }
 
-ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name,
-                                     ResourceBinding Binding,
-                                     uint32_t UniqueID) {
-  ResourceInfo RI(ResourceClass::SRV, ResourceKind::RawBuffer, Symbol, Name,
-                  Binding, UniqueID);
+ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name) {
+  ResourceInfo RI(ResourceClass::SRV, ResourceKind::RawBuffer, Symbol, Name);
   return RI;
 }
 
 ResourceInfo ResourceInfo::StructuredBuffer(Value *Symbol, StringRef Name,
-                                            ResourceBinding Binding,
-                                            uint32_t UniqueID, uint32_t Stride,
-                                            Align Alignment) {
+                                            uint32_t Stride,
+                                            MaybeAlign Alignment) {
   ResourceInfo RI(ResourceClass::SRV, ResourceKind::StructuredBuffer, Symbol,
-                  Name, Binding, UniqueID);
-  RI.Struct.Stride = Stride;
-  RI.Struct.Alignment = Alignment;
+                  Name);
+  RI.setStruct(Stride, Alignment);
   return RI;
 }
 
 ResourceInfo ResourceInfo::Texture2DMS(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, ElementType ElementTy,
+                                       ElementType ElementTy,
                                        uint32_t ElementCount,
                                        uint32_t SampleCount) {
-  ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name,
-                  Binding, UniqueID);
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
-  RI.MultiSample.Count = SampleCount;
+  ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name);
+  RI.setTyped(ElementTy, ElementCount);
+  RI.setMultiSample(SampleCount);
   return RI;
 }
 
-ResourceInfo ResourceInfo::Texture2DMSArray(
-    Value *Symbol, StringRef Name, ResourceBinding Binding, uint32_t UniqueID,
-    ElementType ElementTy, uint32_t ElementCount, uint32_t SampleCount) {
+ResourceInfo ResourceInfo::Texture2DMSArray(Value *Symbol, StringRef Name,
+                                            ElementType ElementTy,
+                                            uint32_t ElementCount,
+                                            uint32_t SampleCount) {
   ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMSArray, Symbol,
-                  Name, Binding, UniqueID);
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
-  RI.MultiSample.Count = SampleCount;
+                  Name);
+  RI.setTyped(ElementTy, ElementCount);
+  RI.setMultiSample(SampleCount);
   return RI;
 }
 
 ResourceInfo ResourceInfo::UAV(Value *Symbol, StringRef Name,
-                               ResourceBinding Binding, uint32_t UniqueID,
                                ElementType ElementTy, uint32_t ElementCount,
                                bool GloballyCoherent, bool IsROV,
                                ResourceKind Kind) {
-  ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name, Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name);
   assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
          "Invalid ResourceKind for UAV constructor.");
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
-  RI.UAVFlags.GloballyCoherent = GloballyCoherent;
-  RI.UAVFlags.IsROV = IsROV;
-  RI.UAVFlags.HasCounter = false;
+  RI.setTyped(ElementTy, ElementCount);
+  RI.setUAV(GloballyCoherent, /*HasCounter=*/false, IsROV);
   return RI;
 }
 
 ResourceInfo ResourceInfo::RWRawBuffer(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, bool GloballyCoherent,
-                                       bool IsROV) {
-  ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name,
-                  Binding, UniqueID);
-  RI.UAVFlags.GloballyCoherent = GloballyCoherent;
-  RI.UAVFlags.IsROV = IsROV;
-  RI.UAVFlags.HasCounter = false;
+                                       bool GloballyCoherent, bool IsROV) {
+  ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name);
+  RI.setUAV(GloballyCoherent, /*HasCounter=*/false, IsROV);
   return RI;
 }
 
 ResourceInfo ResourceInfo::RWStructuredBuffer(Value *Symbol, StringRef Name,
-                                              ResourceBinding Binding,
-                                              uint32_t UniqueID,
-                                              uint32_t Stride, Align Alignment,
+                                              uint32_t Stride,
+                                              MaybeAlign Alignment,
                                               bool GloballyCoherent, bool IsROV,
                                               bool HasCounter) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::StructuredBuffer, Symbol,
-                  Name, Binding, UniqueID);
-  RI.Struct.Stride = Stride;
-  RI.Struct.Alignment = Alignment;
-  RI.UAVFlags.GloballyCoherent = GloballyCoherent;
-  RI.UAVFlags.IsROV = IsROV;
-  RI.UAVFlags.HasCounter = HasCounter;
+                  Name);
+  RI.setStruct(Stride, Alignment);
+  RI.setUAV(GloballyCoherent, HasCounter, IsROV);
   return RI;
 }
 
-ResourceInfo
-ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
-                            ResourceBinding Binding, uint32_t UniqueID,
-                            ElementType ElementTy, uint32_t ElementCount,
-                            uint32_t SampleCount, bool GloballyCoherent) {
-  ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name,
-                  Binding, UniqueID);
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
-  RI.UAVFlags.GloballyCoherent = GloballyCoherent;
-  RI.UAVFlags.IsROV = false;
-  RI.UAVFlags.HasCounter = false;
-  RI.MultiSample.Count = SampleCount;
+ResourceInfo ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
+                                         ElementType ElementTy,
+                                         uint32_t ElementCount,
+                                         uint32_t SampleCount,
+                                         bool GloballyCoherent) {
+  ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name);
+  RI.setTyped(ElementTy, ElementCount);
+  RI.setUAV(GloballyCoherent, /*HasCounter=*/false, /*IsROV=*/false);
+  RI.setMultiSample(SampleCount);
   return RI;
 }
 
-ResourceInfo
-ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
-                                 ResourceBinding Binding, uint32_t UniqueID,
-                                 ElementType ElementTy, uint32_t ElementCount,
-                                 uint32_t SampleCount, bool GloballyCoherent) {
+ResourceInfo ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
+                                              ElementType ElementTy,
+                                              uint32_t ElementCount,
+                                              uint32_t SampleCount,
+                                              bool GloballyCoherent) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMSArray, Symbol,
-                  Name, Binding, UniqueID);
-  RI.Typed.ElementTy = ElementTy;
-  RI.Typed.ElementCount = ElementCount;
-  RI.UAVFlags.GloballyCoherent = GloballyCoherent;
-  RI.UAVFlags.IsROV = false;
-  RI.UAVFlags.HasCounter = false;
-  RI.MultiSample.Count = SampleCount;
+                  Name);
+  RI.setTyped(ElementTy, ElementCount);
+  RI.setUAV(GloballyCoherent, /*HasCounter=*/false, /*IsROV=*/false);
+  RI.setMultiSample(SampleCount);
   return RI;
 }
 
 ResourceInfo ResourceInfo::FeedbackTexture2D(Value *Symbol, StringRef Name,
-                                             ResourceBinding Binding,
-                                             uint32_t UniqueID,
                                              SamplerFeedbackType FeedbackTy) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2D, Symbol,
-                  Name, Binding, UniqueID);
-  RI.UAVFlags.GloballyCoherent = false;
-  RI.UAVFlags.IsROV = false;
-  RI.UAVFlags.HasCounter = false;
-  RI.Feedback.Type = FeedbackTy;
+                  Name);
+  RI.setUAV(/*GloballyCoherent=*/false, /*HasCounter=*/false, /*IsROV=*/false);
+  RI.setFeedback(FeedbackTy);
   return RI;
 }
 
 ResourceInfo
 ResourceInfo::FeedbackTexture2DArray(Value *Symbol, StringRef Name,
-                                     ResourceBinding Binding, uint32_t UniqueID,
                                      SamplerFeedbackType FeedbackTy) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2DArray,
-                  Symbol, Name, Binding, UniqueID);
-  RI.UAVFlags.GloballyCoherent = false;
-  RI.UAVFlags.IsROV = false;
-  RI.UAVFlags.HasCounter = false;
-  RI.Feedback.Type = FeedbackTy;
+                  Symbol, Name);
+  RI.setUAV(/*GloballyCoherent=*/false, /*HasCounter=*/false, /*IsROV=*/false);
+  RI.setFeedback(FeedbackTy);
   return RI;
 }
 
 ResourceInfo ResourceInfo::CBuffer(Value *Symbol, StringRef Name,
-                                   ResourceBinding Binding, uint32_t UniqueID,
                                    uint32_t Size) {
-  ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name,
-                  Binding, UniqueID);
-  RI.CBufferSize = Size;
+  ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name);
+  RI.setCBuffer(Size);
   return RI;
 }
 
 ResourceInfo ResourceInfo::Sampler(Value *Symbol, StringRef Name,
-                                   ResourceBinding Binding, uint32_t UniqueID,
                                    SamplerType SamplerTy) {
-  ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name,
-                  Binding, UniqueID);
-  RI.SamplerTy = SamplerTy;
+  ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name);
+  RI.setSampler(SamplerTy);
   return RI;
 }
 
 bool ResourceInfo::operator==(const ResourceInfo &RHS) const {
-  if (std::tie(Symbol, Name, Binding, UniqueID, RC, Kind) !=
-      std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.UniqueID, RHS.RC,
-               RHS.Kind))
+  if (std::tie(Symbol, Name, Binding, RC, Kind) !=
+      std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.RC, RHS.Kind))
     return false;
   if (isCBuffer())
     return CBufferSize == RHS.CBufferSize;
@@ -277,7 +236,7 @@ MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const {
         Constant::getIntegerValue(I1Ty, APInt(1, V)));
   };
 
-  MDVals.push_back(getIntMD(UniqueID));
+  MDVals.push_back(getIntMD(Binding.UniqueID));
   MDVals.push_back(ValueAsMetadata::get(Symbol));
   MDVals.push_back(MDString::get(Ctx, Name));
   MDVals.push_back(getIntMD(Binding.Space));
@@ -327,10 +286,10 @@ MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const {
 
 std::pair<uint32_t, uint32_t> ResourceInfo::getAnnotateProps() const {
   uint32_t ResourceKind = llvm::to_underlying(Kind);
-  uint32_t AlignLog2 = isStruct() ? Log2(Struct.Alignment) : 0;
+  uint32_t AlignLog2 = isStruct() ? Struct.AlignLog2 : 0;
   bool IsUAV = isUAV();
-  bool IsROV = IsUAV ? UAVFlags.IsROV : 0;
-  bool IsGloballyCoherent = IsUAV ? UAVFlags.GloballyCoherent : 0;
+  bool IsROV = IsUAV && UAVFlags.IsROV;
+  bool IsGloballyCoherent = IsUAV && UAVFlags.GloballyCoherent;
   uint8_t SamplerCmpOrHasCounter = 0;
   if (IsUAV)
     SamplerCmpOrHasCounter = UAVFlags.HasCounter;
diff --git a/llvm/lib/Analysis/DomTreeUpdater.cpp b/llvm/lib/Analysis/DomTreeUpdater.cpp
index 6895317c1d03a..351bd66e389bc 100644
--- a/llvm/lib/Analysis/DomTreeUpdater.cpp
+++ b/llvm/lib/Analysis/DomTreeUpdater.cpp
@@ -42,9 +42,8 @@ bool DomTreeUpdater::forceFlushDeletedBB() {
     // delete only has an UnreachableInst inside.
     assert(BB->size() == 1 && isa<UnreachableInst>(BB->getTerminator()) &&
            "DelBB has been modified while awaiting deletion.");
-    BB->removeFromParent();
     eraseDelBBNode(BB);
-    delete BB;
+    BB->eraseFromParent();
   }
   DeletedBBs.clear();
   Callbacks.clear();
@@ -63,9 +62,8 @@ void DomTreeUpdater::deleteBB(BasicBlock *DelBB) {
     return;
   }
 
-  DelBB->removeFromParent();
   eraseDelBBNode(DelBB);
-  delete DelBB;
+  DelBB->eraseFromParent();
 }
 
 void DomTreeUpdater::callbackDeleteBB(
@@ -77,8 +75,8 @@ void DomTreeUpdater::callbackDeleteBB(
     return;
   }
 
-  DelBB->removeFromParent();
   eraseDelBBNode(DelBB);
+  DelBB->removeFromParent();
   Callback(DelBB);
   delete DelBB;
 }
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 3a7ae577bb068..12a3193e63755 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -88,7 +88,7 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
   else
     return nullptr;
 
-  CmpInst::Predicate ExpectedPred, Pred1, Pred2;
+  CmpInst::Predicate ExpectedPred;
   if (BinOpCode == BinaryOperator::Or) {
     ExpectedPred = ICmpInst::ICMP_NE;
   } else if (BinOpCode == BinaryOperator::And) {
@@ -110,10 +110,10 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
   // -->
   // %TV
   Value *X, *Y;
-  if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal),
-                                      m_Specific(FalseVal)),
-                             m_ICmp(Pred2, m_Value(X), m_Value(Y)))) ||
-      Pred1 != Pred2 || Pred1 != ExpectedPred)
+  if (!match(Cond,
+             m_c_BinOp(m_c_SpecificICmp(ExpectedPred, m_Specific(TrueVal),
+                                        m_Specific(FalseVal)),
+                       m_SpecificICmp(ExpectedPred, m_Value(X), m_Value(Y)))))
     return nullptr;
 
   if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal)
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 92389f2896b8e..4c023ed5ed8f0 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -428,6 +428,8 @@ class LazyValueInfoImpl {
   std::optional<ValueLatticeElement> solveBlockValueIntrinsic(IntrinsicInst *II,
                                                               BasicBlock *BB);
   std::optional<ValueLatticeElement>
+  solveBlockValueInsertElement(InsertElementInst *IEI, BasicBlock *BB);
+  std::optional<ValueLatticeElement>
   solveBlockValueExtractValue(ExtractValueInst *EVI, BasicBlock *BB);
   bool isNonNullAtEndOfBlock(Value *Val, BasicBlock *BB);
   void intersectAssumeOrGuardBlockValueConstantRange(Value *Val,
@@ -657,6 +659,9 @@ LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) {
     if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI))
       return solveBlockValueBinaryOp(BO, BB);
 
+    if (auto *IEI = dyn_cast<InsertElementInst>(BBI))
+      return solveBlockValueInsertElement(IEI, BB);
+
     if (auto *EVI = dyn_cast<ExtractValueInst>(BBI))
       return solveBlockValueExtractValue(EVI, BB);
 
@@ -1038,6 +1043,24 @@ LazyValueInfoImpl::solveBlockValueIntrinsic(IntrinsicInst *II, BasicBlock *BB) {
                    MetadataVal);
 }
 
+std::optional<ValueLatticeElement>
+LazyValueInfoImpl::solveBlockValueInsertElement(InsertElementInst *IEI,
+                                                BasicBlock *BB) {
+  std::optional<ValueLatticeElement> OptEltVal =
+      getBlockValue(IEI->getOperand(1), BB, IEI);
+  if (!OptEltVal)
+    return std::nullopt;
+  ValueLatticeElement &Res = *OptEltVal;
+
+  std::optional<ValueLatticeElement> OptVecVal =
+      getBlockValue(IEI->getOperand(0), BB, IEI);
+  if (!OptVecVal)
+    return std::nullopt;
+
+  Res.mergeIn(*OptVecVal);
+  return Res;
+}
+
 std::optional<ValueLatticeElement>
 LazyValueInfoImpl::solveBlockValueExtractValue(ExtractValueInst *EVI,
                                                BasicBlock *BB) {
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 1d54a66705a2a..a88469ab81a8c 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -313,6 +313,13 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
     const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
     const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
     if (StartS->getNumOperands() == 2 && Offset && NewBase) {
+      // The following code below assumes the offset is unsigned, but GEP
+      // offsets are treated as signed so we can end up with a signed value
+      // here too. For example, suppose the initial PHI value is (i8 255),
+      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+      if (Offset->getAPInt().isNegative())
+        return false;
+
       // For the moment, restrict ourselves to the case where the offset is a
       // multiple of the requested alignment and the base is aligned.
       // TODO: generalize if a case found which warrants
@@ -338,6 +345,19 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
                                             HeaderFirstNonPHI, AC, &DT);
 }
 
+static bool suppressSpeculativeLoadForSanitizers(const Instruction &CtxI) {
+  const Function &F = *CtxI.getFunction();
+  // Speculative load may create a race that did not exist in the source.
+  return F.hasFnAttribute(Attribute::SanitizeThread) ||
+         // Speculative load may load data from dirty regions.
+         F.hasFnAttribute(Attribute::SanitizeAddress) ||
+         F.hasFnAttribute(Attribute::SanitizeHWAddress);
+}
+
+bool llvm::mustSuppressSpeculation(const LoadInst &LI) {
+  return !LI.isUnordered() || suppressSpeculativeLoadForSanitizers(LI);
+}
+
 /// Check if executing a load of this pointer value cannot trap.
 ///
 /// If DT and ScanFrom are specified this method performs context-sensitive
@@ -358,8 +378,12 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &S
   // If DT is not specified we can't make context-sensitive query
   const Instruction* CtxI = DT ? ScanFrom : nullptr;
   if (isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, AC, DT,
-                                         TLI))
-    return true;
+                                         TLI)) {
+    // With sanitizers `Dereferenceable` is not always enough for unconditional
+    // load.
+    if (!ScanFrom || !suppressSpeculativeLoadForSanitizers(*ScanFrom))
+      return true;
+  }
 
   if (!ScanFrom)
     return false;
@@ -743,9 +767,8 @@ static bool isPointerAlwaysReplaceable(const Value *From, const Value *To,
   if (isa<Constant>(To) &&
       isDereferenceablePointer(To, Type::getInt8Ty(To->getContext()), DL))
     return true;
-  if (getUnderlyingObject(From) == getUnderlyingObject(To))
-    return true;
-  return false;
+  return getUnderlyingObjectAggressive(From) ==
+         getUnderlyingObjectAggressive(To);
 }
 
 bool llvm::canReplacePointersInUseIfEqual(const Use &U, const Value *To,
@@ -769,3 +792,18 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
 
   return isPointerAlwaysReplaceable(From, To, DL);
 }
+
+bool llvm::isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE,
+                                         DominatorTree *DT,
+                                         AssumptionCache *AC) {
+  for (BasicBlock *BB : L->blocks()) {
+    for (Instruction &I : *BB) {
+      if (auto *LI = dyn_cast<LoadInst>(&I)) {
+        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC))
+          return false;
+      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
+        return false;
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 91994f33f3046..a2b124d3b10bf 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -181,7 +181,7 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 }
 
 RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
-    unsigned Index, RuntimePointerChecking &RtCheck)
+    unsigned Index, const RuntimePointerChecking &RtCheck)
     : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
       AddressSpace(RtCheck.Pointers[Index]
                        .PointerValue->getType()
@@ -280,8 +280,8 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
   if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
     return false;
 
-  PointerInfo *Src = &Pointers[CGI.Members[0]];
-  PointerInfo *Sink = &Pointers[CGJ.Members[0]];
+  const PointerInfo *Src = &Pointers[CGI.Members[0]];
+  const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
 
   // If either pointer is read and written, multiple checks may be needed. Bail
   // out.
@@ -385,7 +385,7 @@ SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
 
       if (needsChecking(CGI, CGJ)) {
         CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
-        Checks.push_back(std::make_pair(&CGI, &CGJ));
+        Checks.emplace_back(&CGI, &CGJ);
       }
     }
   }
@@ -420,8 +420,8 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
   return C->getValue()->isNegative() ? J : I;
 }
 
-bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
-                                         RuntimePointerChecking &RtCheck) {
+bool RuntimeCheckingPtrGroup::addPointer(
+    unsigned Index, const RuntimePointerChecking &RtCheck) {
   return addPointer(
       Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
       RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
@@ -507,7 +507,7 @@ void RuntimePointerChecking::groupChecks(
   // pointers to the same underlying object.
   if (!UseDependencies) {
     for (unsigned I = 0; I < Pointers.size(); ++I)
-      CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
+      CheckingGroups.emplace_back(I, *this);
     return;
   }
 
@@ -575,7 +575,7 @@ void RuntimePointerChecking::groupChecks(
           // We couldn't add this pointer to any existing set or the threshold
           // for the number of comparisons has been reached. Create a new group
           // to hold the current pointer.
-          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+          Groups.emplace_back(Pointer, *this);
       }
     }
 
@@ -605,10 +605,7 @@ bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
     return false;
 
   // Only need to check pointers in the same alias set.
-  if (PointerI.AliasSetId != PointerJ.AliasSetId)
-    return false;
-
-  return true;
+  return PointerI.AliasSetId == PointerJ.AliasSetId;
 }
 
 void RuntimePointerChecking::printChecks(
@@ -658,7 +655,7 @@ class AccessAnalysis {
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
   typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
 
-  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
+  AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
                  MemoryDepChecker::DepCandidates &DA,
                  PredicatedScalarEvolution &PSE,
                  SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
@@ -669,7 +666,7 @@ class AccessAnalysis {
   }
 
   /// Register a load  and whether it is only read from.
-  void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
+  void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
@@ -678,7 +675,7 @@ class AccessAnalysis {
   }
 
   /// Register a store.
-  void addStore(MemoryLocation &Loc, Type *AccessTy) {
+  void addStore(const MemoryLocation &Loc, Type *AccessTy) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
@@ -718,7 +715,7 @@ class AccessAnalysis {
   ///
   /// Note that this can later be cleared if we retry memcheck analysis without
   /// dependency checking (i.e. FoundNonConstantDistanceDependence).
-  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
 
   /// We decided that no dependence analysis would be used.  Reset the state.
   void resetDepChecks(MemoryDepChecker &DepChecker) {
@@ -726,12 +723,7 @@ class AccessAnalysis {
     DepChecker.clearDependences();
   }
 
-  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
-
-  const DenseMap<Value *, SmallVector<const Value *, 16>> &
-  getUnderlyingObjects() {
-    return UnderlyingObjects;
-  }
+  const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
 
 private:
   typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
@@ -786,7 +778,8 @@ class AccessAnalysis {
   //intrinsic property (such as TBAA metadata).
   AliasSetTracker AST;
 
-  LoopInfo *LI;
+  /// The LoopInfo of the loop being checked.
+  const LoopInfo *LI;
 
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
@@ -843,10 +836,8 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE,
     return true;
 
   int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
-  if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
-    return true;
-
-  return false;
+  return Stride == 1 ||
+         PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
 }
 
 static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
@@ -925,7 +916,7 @@ static void findForkedSCEVs(
   unsigned Opcode = I->getOpcode();
   switch (Opcode) {
   case Instruction::GetElementPtr: {
-    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    auto *GEP = cast<GetElementPtrInst>(I);
     Type *SourceTy = GEP->getSourceElementType();
     // We only handle base + single offset GEPs here for now.
     // Not dealing with preexisting gathers yet, so no vectors.
@@ -1080,7 +1071,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
       findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
-  for (auto &P : TranslatedPtrs) {
+  for (const auto &P : TranslatedPtrs) {
     const SCEV *PtrExpr = get<0>(P);
     if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
       return false;
@@ -1145,7 +1136,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
   // We assign a consecutive id to access from different alias sets.
   // Accesses between different groups doesn't need to be checked.
   unsigned ASId = 0;
-  for (auto &AS : AST) {
+  for (const auto &AS : AST) {
     int NumReadPtrChecks = 0;
     int NumWritePtrChecks = 0;
     bool CanDoAliasSetRT = true;
@@ -1195,7 +1186,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                   ShouldCheckWrap, false)) {
           LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                             << *Access.getPointer() << '\n');
-          Retries.push_back({Access, AccessTy});
+          Retries.emplace_back(Access, AccessTy);
           CanDoAliasSetRT = false;
         }
       }
@@ -1426,7 +1417,7 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
   // non-wrapping for the *specific* value of Ptr.
 
   // The arithmetic implied by an inbounds GEP can't overflow.
-  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP || !GEP->isInBounds())
     return false;
 
@@ -1459,22 +1450,23 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
 }
 
 /// Check whether the access through \p Ptr has a constant stride.
-std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
-                                          Type *AccessTy, Value *Ptr,
-                                          const Loop *Lp,
-                                          const DenseMap<Value *, const SCEV *> &StridesMap,
-                                          bool Assume, bool ShouldCheckWrap) {
+std::optional<int64_t>
+llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
+                   const Loop *Lp,
+                   const DenseMap<Value *, const SCEV *> &StridesMap,
+                   bool Assume, bool ShouldCheckWrap) {
+  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+  if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
+    return {0};
+
   Type *Ty = Ptr->getType();
   assert(Ty->isPointerTy() && "Unexpected non-ptr");
-
   if (isa<ScalableVectorType>(AccessTy)) {
     LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
                       << "\n");
     return std::nullopt;
   }
 
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
-
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
   if (Assume && !AR)
     AR = PSE.getAsAddRec(Ptr);
@@ -1503,7 +1495,7 @@ std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
     return std::nullopt;
   }
 
-  auto &DL = Lp->getHeader()->getDataLayout();
+  const auto &DL = Lp->getHeader()->getDataLayout();
   TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
   int64_t Size = AllocSize.getFixedValue();
   const APInt &APStepVal = C->getAPInt();
@@ -1582,8 +1574,10 @@ std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
   unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
 
   APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
-  Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
-  Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+  const Value *PtrA1 =
+      PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  const Value *PtrB1 =
+      PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
   int Val;
   if (PtrA1 == PtrB1) {
@@ -1730,7 +1724,7 @@ bool MemoryDepChecker::Dependence::isBackward() const {
 }
 
 bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
-  return isBackward() || Type == Unknown;
+  return isBackward() || Type == Unknown || Type == IndirectUnsafe;
 }
 
 bool MemoryDepChecker::Dependence::isForward() const {
@@ -1899,27 +1893,15 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
   return ScaledDist % Stride;
 }
 
-/// Returns true if any of the underlying objects has a loop varying address,
-/// i.e. may change in \p L.
-static bool
-isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
-                             ScalarEvolution &SE, const Loop *L) {
-  return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
-    return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
-  });
-}
-
 std::variant<MemoryDepChecker::Dependence::DepType,
              MemoryDepChecker::DepDistanceStrideAndSizeInfo>
 MemoryDepChecker::getDependenceDistanceStrideAndSize(
     const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
-    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
-    const DenseMap<Value *, SmallVector<const Value *, 16>>
-        &UnderlyingObjects) {
-  auto &DL = InnermostLoop->getHeader()->getDataLayout();
+    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
+  const auto &DL = InnermostLoop->getHeader()->getDataLayout();
   auto &SE = *PSE.getSE();
-  auto [APtr, AIsWrite] = A;
-  auto [BPtr, BIsWrite] = B;
+  const auto &[APtr, AIsWrite] = A;
+  const auto &[BPtr, BIsWrite] = B;
 
   // Two reads are independent.
   if (!AIsWrite && !BIsWrite)
@@ -1933,12 +1915,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
       BPtr->getType()->getPointerAddressSpace())
     return MemoryDepChecker::Dependence::Unknown;
 
-  int64_t StrideAPtr =
-      getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true)
-          .value_or(0);
-  int64_t StrideBPtr =
-      getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true)
-          .value_or(0);
+  std::optional<int64_t> StrideAPtr =
+      getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
+  std::optional<int64_t> StrideBPtr =
+      getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
 
   const SCEV *Src = PSE.getSCEV(APtr);
   const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -1946,26 +1926,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // If the induction step is negative we have to invert source and sink of the
   // dependence when measuring the distance between them. We should not swap
   // AIsWrite with BIsWrite, as their uses expect them in program order.
-  if (StrideAPtr < 0) {
+  if (StrideAPtr && *StrideAPtr < 0) {
     std::swap(Src, Sink);
     std::swap(AInst, BInst);
+    std::swap(StrideAPtr, StrideBPtr);
   }
 
   const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
 
   LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
-                    << "(Induction step: " << StrideAPtr << ")\n");
+                    << "\n");
   LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
                     << ": " << *Dist << "\n");
 
-  // Needs accesses where the addresses of the accessed underlying objects do
-  // not change within the loop.
-  if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
-                                   InnermostLoop) ||
-      isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
-                                   InnermostLoop))
-    return MemoryDepChecker::Dependence::IndirectUnsafe;
-
   // Check if we can prove that Sink only accesses memory after Src's end or
   // vice versa. At the moment this is limited to cases where either source or
   // sink are loop invariant to avoid compile-time increases. This is not
@@ -1987,12 +1960,33 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
     }
   }
 
-  // Need accesses with constant strides and the same direction. We don't want
-  // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
-  // could wrap in the address space.
-  if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
-      (StrideAPtr < 0 && StrideBPtr > 0)) {
+  // Need accesses with constant strides and the same direction for further
+  // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
+  // similar code or pointer arithmetic that could wrap in the address space.
+
+  // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
+  // not loop-invariant (stride will be 0 in that case), we cannot analyze the
+  // dependence further and also cannot generate runtime checks.
+  if (!StrideAPtr || !StrideBPtr) {
     LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+    return MemoryDepChecker::Dependence::IndirectUnsafe;
+  }
+
+  int64_t StrideAPtrInt = *StrideAPtr;
+  int64_t StrideBPtrInt = *StrideBPtr;
+  LLVM_DEBUG(dbgs() << "LAA:  Src induction step: " << StrideAPtrInt
+                    << " Sink induction step: " << StrideBPtrInt << "\n");
+  // At least Src or Sink are loop invariant and the other is strided or
+  // invariant. We can generate a runtime check to disambiguate the accesses.
+  if (StrideAPtrInt == 0 || StrideBPtrInt == 0)
+    return MemoryDepChecker::Dependence::Unknown;
+
+  // Both Src and Sink have a constant stride, check if they are in the same
+  // direction.
+  if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) ||
+      (StrideAPtrInt < 0 && StrideBPtrInt > 0)) {
+    LLVM_DEBUG(
+        dbgs() << "Pointer access with strides in different directions\n");
     return MemoryDepChecker::Dependence::Unknown;
   }
 
@@ -2001,22 +1995,20 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
       DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
   if (!HasSameSize)
     TypeByteSize = 0;
-  return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
-                                      std::abs(StrideBPtr), TypeByteSize,
+  return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
+                                      std::abs(StrideBPtrInt), TypeByteSize,
                                       AIsWrite, BIsWrite);
 }
 
-MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
-    const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
-    unsigned BIdx,
-    const DenseMap<Value *, SmallVector<const Value *, 16>>
-        &UnderlyingObjects) {
+MemoryDepChecker::Dependence::DepType
+MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+                              const MemAccessInfo &B, unsigned BIdx) {
   assert(AIdx < BIdx && "Must pass arguments in program order");
 
   // Get the dependence distance, stride, type size and what access writes for
   // the dependence between A and B.
-  auto Res = getDependenceDistanceStrideAndSize(
-      A, InstMap[AIdx], B, InstMap[BIdx], UnderlyingObjects);
+  auto Res =
+      getDependenceDistanceStrideAndSize(A, InstMap[AIdx], B, InstMap[BIdx]);
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 
@@ -2250,10 +2242,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   return Dependence::BackwardVectorizable;
 }
 
-bool MemoryDepChecker::areDepsSafe(
-    DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
-    const DenseMap<Value *, SmallVector<const Value *, 16>>
-        &UnderlyingObjects) {
+bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets,
+                                   const MemAccessInfoList &CheckDeps) {
 
   MinDepDistBytes = -1;
   SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2296,8 +2286,8 @@ bool MemoryDepChecker::areDepsSafe(
             if (*I1 > *I2)
               std::swap(A, B);
 
-            Dependence::DepType Type = isDependent(*A.first, A.second, *B.first,
-                                                   B.second, UnderlyingObjects);
+            Dependence::DepType Type =
+                isDependent(*A.first, A.second, *B.first, B.second);
             mergeInStatus(Dependence::isSafeForVectorization(Type));
 
             // Gather dependences unless we accumulated MaxDependences
@@ -2306,7 +2296,7 @@ bool MemoryDepChecker::areDepsSafe(
             // algorithm.
             if (RecordDependences) {
               if (Type != Dependence::NoDep)
-                Dependences.push_back(Dependence(A.second, B.second, Type));
+                Dependences.emplace_back(A.second, B.second, Type);
 
               if (Dependences.size() >= MaxDependences) {
                 RecordDependences = false;
@@ -2396,7 +2386,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
   return true;
 }
 
-bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
+bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
                                  DominatorTree *DT) {
   // Holds the Load and Store instructions.
@@ -2637,7 +2627,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
                                SymbolicStrides, UncomputablePtr, false);
   if (!CanDoRTIfNeeded) {
-    auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
+    const auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
     recordAnalysis("CantIdentifyArrayBounds", I)
         << "cannot identify array bounds";
     LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
@@ -2652,8 +2642,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   if (Accesses.isDependencyCheckNeeded()) {
     LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
     DepsAreSafe = DepChecker->areDepsSafe(DependentAccesses,
-                                          Accesses.getDependenciesToCheck(),
-                                          Accesses.getUnderlyingObjects());
+                                          Accesses.getDependenciesToCheck());
 
     if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
       LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -2775,15 +2764,15 @@ bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
   assert(TheLoop->contains(BB) && "Unknown block used");
 
   // Blocks that do not dominate the latch need predication.
-  BasicBlock* Latch = TheLoop->getLoopLatch();
+  const BasicBlock *Latch = TheLoop->getLoopLatch();
   return !DT->dominates(BB, Latch);
 }
 
-OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
-                                                           Instruction *I) {
+OptimizationRemarkAnalysis &
+LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
   assert(!Report && "Multiple reports generated");
 
-  Value *CodeRegion = TheLoop->getHeader();
+  const Value *CodeRegion = TheLoop->getHeader();
   DebugLoc DL = TheLoop->getStartLoc();
 
   if (I) {
@@ -2840,7 +2829,7 @@ static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
 /// getGEPInductionOperand. However, if there is some other non-loop-invariant
 /// operand, it returns that instead.
 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP)
     return Ptr;
 
@@ -3077,7 +3066,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
 }
 
 const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
-  auto [It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
+  const auto &[It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
 
   if (Inserted)
     It->second =
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 51cffac808768..21923cc80aa32 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -581,27 +581,10 @@ void SCEVUnknown::allUsesReplacedWith(Value *New) {
 
 /// Compare the two values \p LV and \p RV in terms of their "complexity" where
 /// "complexity" is a partial (and somewhat ad-hoc) relation used to order
-/// operands in SCEV expressions.  \p EqCache is a set of pairs of values that
-/// have been previously deemed to be "equally complex" by this routine.  It is
-/// intended to avoid exponential time complexity in cases like:
-///
-///   %a = f(%x, %y)
-///   %b = f(%a, %a)
-///   %c = f(%b, %b)
-///
-///   %d = f(%x, %y)
-///   %e = f(%d, %d)
-///   %f = f(%e, %e)
-///
-///   CompareValueComplexity(%f, %c)
-///
-/// Since we do not continue running this routine on expression trees once we
-/// have seen unequal values, there is no need to track them in the cache.
-static int
-CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
-                       const LoopInfo *const LI, Value *LV, Value *RV,
-                       unsigned Depth) {
-  if (Depth > MaxValueCompareDepth || EqCacheValue.isEquivalent(LV, RV))
+/// operands in SCEV expressions.
+static int CompareValueComplexity(const LoopInfo *const LI, Value *LV,
+                                  Value *RV, unsigned Depth) {
+  if (Depth > MaxValueCompareDepth)
     return 0;
 
   // Order pointer values after integer values. This helps SCEVExpander form
@@ -660,15 +643,13 @@ CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
       return (int)LNumOps - (int)RNumOps;
 
     for (unsigned Idx : seq(LNumOps)) {
-      int Result =
-          CompareValueComplexity(EqCacheValue, LI, LInst->getOperand(Idx),
-                                 RInst->getOperand(Idx), Depth + 1);
+      int Result = CompareValueComplexity(LI, LInst->getOperand(Idx),
+                                          RInst->getOperand(Idx), Depth + 1);
       if (Result != 0)
         return Result;
     }
   }
 
-  EqCacheValue.unionSets(LV, RV);
   return 0;
 }
 
@@ -679,7 +660,6 @@ CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
 // not know if they are equivalent for sure.
 static std::optional<int>
 CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
-                      EquivalenceClasses<const Value *> &EqCacheValue,
                       const LoopInfo *const LI, const SCEV *LHS,
                       const SCEV *RHS, DominatorTree &DT, unsigned Depth = 0) {
   // Fast-path: SCEVs are uniqued so we can do a quick equality check.
@@ -705,8 +685,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
     const SCEVUnknown *LU = cast<SCEVUnknown>(LHS);
     const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
 
-    int X = CompareValueComplexity(EqCacheValue, LI, LU->getValue(),
-                                   RU->getValue(), Depth + 1);
+    int X =
+        CompareValueComplexity(LI, LU->getValue(), RU->getValue(), Depth + 1);
     if (X == 0)
       EqCacheSCEV.unionSets(LHS, RHS);
     return X;
@@ -773,8 +753,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
       return (int)LNumOps - (int)RNumOps;
 
     for (unsigned i = 0; i != LNumOps; ++i) {
-      auto X = CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LOps[i],
-                                     ROps[i], DT, Depth + 1);
+      auto X = CompareSCEVComplexity(EqCacheSCEV, LI, LOps[i], ROps[i], DT,
+                                     Depth + 1);
       if (X != 0)
         return X;
     }
@@ -802,12 +782,10 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
   if (Ops.size() < 2) return;  // Noop
 
   EquivalenceClasses<const SCEV *> EqCacheSCEV;
-  EquivalenceClasses<const Value *> EqCacheValue;
 
   // Whether LHS has provably less complexity than RHS.
   auto IsLessComplex = [&](const SCEV *LHS, const SCEV *RHS) {
-    auto Complexity =
-        CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LHS, RHS, DT);
+    auto Complexity = CompareSCEVComplexity(EqCacheSCEV, LI, LHS, RHS, DT);
     return Complexity && *Complexity < 0;
   };
   if (Ops.size() == 2) {
@@ -9171,23 +9149,22 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp(
   // behaviour), and we can prove the test sequence produced must repeat
   // the same values on self-wrap of the IV, then we can infer that IV
   // doesn't self wrap because if it did, we'd have an infinite (undefined)
-  // loop.
+  // loop.  Note that a stride of 0 is trivially no-self-wrap by definition.
   if (ControllingFiniteLoop && isLoopInvariant(RHS, L)) {
     // TODO: We can peel off any functions which are invertible *in L*.  Loop
     // invariant terms are effectively constants for our purposes here.
     auto *InnerLHS = LHS;
     if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS))
       InnerLHS = ZExt->getOperand();
-    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(InnerLHS)) {
-      auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
-      if (!AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() &&
-          StrideC && StrideC->getAPInt().isPowerOf2()) {
-        auto Flags = AR->getNoWrapFlags();
-        Flags = setFlags(Flags, SCEV::FlagNW);
-        SmallVector<const SCEV*> Operands{AR->operands()};
-        Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
-        setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
-      }
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(InnerLHS);
+        AR && !AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() &&
+        isKnownToBeAPowerOfTwo(AR->getStepRecurrence(*this), /*OrZero=*/true,
+                               /*OrNegative=*/true)) {
+      auto Flags = AR->getNoWrapFlags();
+      Flags = setFlags(Flags, SCEV::FlagNW);
+      SmallVector<const SCEV *> Operands{AR->operands()};
+      Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
+      setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
     }
   }
 
@@ -10867,6 +10844,26 @@ bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
   return getUnsignedRangeMin(S) != 0;
 }
 
+bool ScalarEvolution::isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero,
+                                             bool OrNegative) {
+  auto NonRecursive = [this, OrNegative](const SCEV *S) {
+    if (auto *C = dyn_cast<SCEVConstant>(S))
+      return C->getAPInt().isPowerOf2() ||
+             (OrNegative && C->getAPInt().isNegatedPowerOf2());
+
+    // The vscale_range indicates vscale is a power-of-two.
+    return isa<SCEVVScale>(S) && F.hasFnAttribute(Attribute::VScaleRange);
+  };
+
+  if (NonRecursive(S))
+    return true;
+
+  auto *Mul = dyn_cast<SCEVMulExpr>(S);
+  if (!Mul)
+    return false;
+  return all_of(Mul->operands(), NonRecursive) && (OrZero || isKnownNonZero(S));
+}
+
 std::pair<const SCEV *, const SCEV *>
 ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) {
   // Compute SCEV on entry of loop L.
@@ -12098,8 +12095,10 @@ bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow(
   // C)".
 
   std::optional<APInt> LDiff = computeConstantDifference(LHS, FoundLHS);
+  if (!LDiff)
+    return false;
   std::optional<APInt> RDiff = computeConstantDifference(RHS, FoundRHS);
-  if (!LDiff || !RDiff || *LDiff != *RDiff)
+  if (!RDiff || *LDiff != *RDiff)
     return false;
 
   if (LDiff->isMinValue())
@@ -12795,8 +12794,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
     if (!isLoopInvariant(RHS, L))
       return false;
 
-    auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
-    if (!StrideC || !StrideC->getAPInt().isPowerOf2())
+    if (!isKnownToBeAPowerOfTwo(AR->getStepRecurrence(*this), /*OrZero=*/true,
+                                /*OrNegative*/ true))
       return false;
 
     if (!ControlsOnlyExit || !loopHasNoAbnormalExits(L))
@@ -13152,52 +13151,50 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
       // "(Start - End) + (Stride - 1)" has unsigned overflow.
       const SCEV *One = getOne(Stride->getType());
       bool MayAddOverflow = [&] {
-        if (auto *StrideC = dyn_cast<SCEVConstant>(Stride)) {
-          if (StrideC->getAPInt().isPowerOf2()) {
-            // Suppose Stride is a power of two, and Start/End are unsigned
-            // integers.  Let UMAX be the largest representable unsigned
-            // integer.
-            //
-            // By the preconditions of this function, we know
-            // "(Start + Stride * N) >= End", and this doesn't overflow.
-            // As a formula:
-            //
-            //   End <= (Start + Stride * N) <= UMAX
-            //
-            // Subtracting Start from all the terms:
-            //
-            //   End - Start <= Stride * N <= UMAX - Start
-            //
-            // Since Start is unsigned, UMAX - Start <= UMAX.  Therefore:
-            //
-            //   End - Start <= Stride * N <= UMAX
-            //
-            // Stride * N is a multiple of Stride. Therefore,
-            //
-            //   End - Start <= Stride * N <= UMAX - (UMAX mod Stride)
-            //
-            // Since Stride is a power of two, UMAX + 1 is divisible by
-            // Stride. Therefore, UMAX mod Stride == Stride - 1.  So we can
-            // write:
-            //
-            //   End - Start <= Stride * N <= UMAX - Stride - 1
-            //
-            // Dropping the middle term:
-            //
-            //   End - Start <= UMAX - Stride - 1
-            //
-            // Adding Stride - 1 to both sides:
-            //
-            //   (End - Start) + (Stride - 1) <= UMAX
-            //
-            // In other words, the addition doesn't have unsigned overflow.
-            //
-            // A similar proof works if we treat Start/End as signed values.
-            // Just rewrite steps before "End - Start <= Stride * N <= UMAX"
-            // to use signed max instead of unsigned max. Note that we're
-            // trying to prove a lack of unsigned overflow in either case.
-            return false;
-          }
+        if (isKnownToBeAPowerOfTwo(Stride)) {
+          // Suppose Stride is a power of two, and Start/End are unsigned
+          // integers.  Let UMAX be the largest representable unsigned
+          // integer.
+          //
+          // By the preconditions of this function, we know
+          // "(Start + Stride * N) >= End", and this doesn't overflow.
+          // As a formula:
+          //
+          //   End <= (Start + Stride * N) <= UMAX
+          //
+          // Subtracting Start from all the terms:
+          //
+          //   End - Start <= Stride * N <= UMAX - Start
+          //
+          // Since Start is unsigned, UMAX - Start <= UMAX.  Therefore:
+          //
+          //   End - Start <= Stride * N <= UMAX
+          //
+          // Stride * N is a multiple of Stride. Therefore,
+          //
+          //   End - Start <= Stride * N <= UMAX - (UMAX mod Stride)
+          //
+          // Since Stride is a power of two, UMAX + 1 is divisible by
+          // Stride. Therefore, UMAX mod Stride == Stride - 1.  So we can
+          // write:
+          //
+          //   End - Start <= Stride * N <= UMAX - Stride - 1
+          //
+          // Dropping the middle term:
+          //
+          //   End - Start <= UMAX - Stride - 1
+          //
+          // Adding Stride - 1 to both sides:
+          //
+          //   (End - Start) + (Stride - 1) <= UMAX
+          //
+          // In other words, the addition doesn't have unsigned overflow.
+          //
+          // A similar proof works if we treat Start/End as signed values.
+          // Just rewrite steps before "End - Start <= Stride * N <= UMAX"
+          // to use signed max instead of unsigned max. Note that we're
+          // trying to prove a lack of unsigned overflow in either case.
+          return false;
         }
         if (Start == Stride || Start == getMinusSCEV(Stride, One)) {
           // If Start is equal to Stride, (End - Start) + (Stride - 1) == End
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index c93fdea8e099a..c5ba6ec203f29 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -316,6 +316,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
       TLI.setUnavailable(LibFunc_modff);
       TLI.setUnavailable(LibFunc_powf);
       TLI.setUnavailable(LibFunc_remainderf);
+      TLI.setUnavailable(LibFunc_remquof);
       TLI.setUnavailable(LibFunc_sinf);
       TLI.setUnavailable(LibFunc_sinhf);
       TLI.setUnavailable(LibFunc_sqrtf);
@@ -346,6 +347,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_modfl);
     TLI.setUnavailable(LibFunc_powl);
     TLI.setUnavailable(LibFunc_remainderl);
+    TLI.setUnavailable(LibFunc_remquol);
     TLI.setUnavailable(LibFunc_sinl);
     TLI.setUnavailable(LibFunc_sinhl);
     TLI.setUnavailable(LibFunc_sqrtl);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 693f7a5bb7af5..dcde78925bfa9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const {
              : TTIImpl->getPredictableBranchThreshold();
 }
 
+InstructionCost TargetTransformInfo::getBranchMispredictPenalty() const {
+  return TTIImpl->getBranchMispredictPenalty();
+}
+
 bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
   return TTIImpl->hasBranchDivergence(F);
 }
@@ -1194,7 +1198,7 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
 
 Type *TargetTransformInfo::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicElementSize) const {
   return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
                                             DestAddrSpace, SrcAlign, DestAlign,
@@ -1204,7 +1208,7 @@ Type *TargetTransformInfo::getMemcpyLoopLoweringType(
 void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-    unsigned SrcAlign, unsigned DestAlign,
+    Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicCpySize) const {
   TTIImpl->getMemcpyLoopResidualLoweringType(
       OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index e1cb63a9ab8f9..0d7eb7da8d6b6 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -822,16 +822,16 @@ MDNode *AAMDNodes::extendToTBAA(MDNode *MD, ssize_t Len) {
 AAMDNodes AAMDNodes::adjustForAccess(unsigned AccessSize) {
   AAMDNodes New = *this;
   MDNode *M = New.TBAAStruct;
-  if (M && M->getNumOperands() >= 3 && M->getOperand(0) &&
+  if (!New.TBAA && M && M->getNumOperands() >= 3 && M->getOperand(0) &&
       mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
       mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
       M->getOperand(1) && mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
       mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
           AccessSize &&
-      M->getOperand(2) && isa<MDNode>(M->getOperand(2))) {
-    New.TBAAStruct = nullptr;
+      M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
     New.TBAA = cast<MDNode>(M->getOperand(2));
-  }
+
+  New.TBAAStruct = nullptr;
   return New;
 }
 
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index f8ec868398323..285284dc27071 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -254,8 +254,7 @@ bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
 
 bool llvm::isOnlyUsedInZeroComparison(const Instruction *I) {
   return !I->user_empty() && all_of(I->users(), [](const User *U) {
-    ICmpInst::Predicate P;
-    return match(U, m_ICmp(P, m_Value(), m_Zero()));
+    return match(U, m_ICmp(m_Value(), m_Zero()));
   });
 }
 
@@ -303,15 +302,21 @@ bool llvm::isKnownNegative(const Value *V, const SimplifyQuery &SQ,
   return computeKnownBits(V, Depth, SQ).isNegative();
 }
 
-static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth,
+static bool isKnownNonEqual(const Value *V1, const Value *V2,
+                            const APInt &DemandedElts, unsigned Depth,
                             const SimplifyQuery &Q);
 
 bool llvm::isKnownNonEqual(const Value *V1, const Value *V2,
                            const DataLayout &DL, AssumptionCache *AC,
                            const Instruction *CxtI, const DominatorTree *DT,
                            bool UseInstrInfo) {
+  assert(V1->getType() == V2->getType() &&
+         "Testing equality of non-equal types!");
+  auto *FVTy = dyn_cast<FixedVectorType>(V1->getType());
+  APInt DemandedElts =
+      FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
   return ::isKnownNonEqual(
-      V1, V2, 0,
+      V1, V2, DemandedElts, 0,
       SimplifyQuery(DL, DT, AC, safeCxtI(V2, V1, CxtI), UseInstrInfo));
 }
 
@@ -928,7 +933,7 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts,
     // Demanded) == (xor(x, x-1) & Demanded). Extend the xor pattern
     // to use arbitrary C if xor(x, x-C) as the same as xor(x, x-1).
     if (HasKnownOne &&
-        match(I, m_c_Xor(m_Value(X), m_c_Add(m_Deferred(X), m_AllOnes())))) {
+        match(I, m_c_Xor(m_Value(X), m_Add(m_Deferred(X), m_AllOnes())))) {
       const KnownBits &XBits = I->getOperand(0) == X ? KnownLHS : KnownRHS;
       KnownOut = XBits.blsmsk();
     }
@@ -1091,15 +1096,15 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::UDiv: {
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-    computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
     Known =
         KnownBits::udiv(Known, Known2, Q.IIQ.isExact(cast<BinaryOperator>(I)));
     break;
   }
   case Instruction::SDiv: {
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-    computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
     Known =
         KnownBits::sdiv(Known, Known2, Q.IIQ.isExact(cast<BinaryOperator>(I)));
     break;
@@ -1107,7 +1112,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
   case Instruction::Select: {
     auto ComputeForArm = [&](Value *Arm, bool Invert) {
       KnownBits Res(Known.getBitWidth());
-      computeKnownBits(Arm, Res, Depth + 1, Q);
+      computeKnownBits(Arm, DemandedElts, Res, Depth + 1, Q);
       adjustKnownBitsForSelectArm(Res, I->getOperand(0), Arm, Invert, Depth, Q);
       return Res;
     };
@@ -1142,7 +1147,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
 
     assert(SrcBitWidth && "SrcBitWidth can't be zero");
     Known = Known.anyextOrTrunc(SrcBitWidth);
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
     if (auto *Inst = dyn_cast<PossiblyNonNegInst>(I);
         Inst && Inst->hasNonNeg() && !Known.isNegative())
       Known.makeNonNegative();
@@ -1164,7 +1169,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
     if (match(I, m_ElementWiseBitCast(m_Value(V))) &&
         V->getType()->isFPOrFPVectorTy()) {
       Type *FPType = V->getType()->getScalarType();
-      KnownFPClass Result = computeKnownFPClass(V, fcAllFlags, Depth + 1, Q);
+      KnownFPClass Result =
+          computeKnownFPClass(V, DemandedElts, fcAllFlags, Depth + 1, Q);
       FPClassTest FPClasses = Result.KnownFPClasses;
 
       // TODO: Treat it as zero/poison if the use of I is unreachable.
@@ -1245,7 +1251,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
     unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
 
     Known = Known.trunc(SrcBitWidth);
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
     // If the sign bit of the input is known set or clear, then we know the
     // top bits of the result.
     Known = Known.sext(BitWidth);
@@ -1305,14 +1311,14 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::SRem:
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-    computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
     Known = KnownBits::srem(Known, Known2);
     break;
 
   case Instruction::URem:
-    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-    computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+    computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+    computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
     Known = KnownBits::urem(Known, Known2);
     break;
   case Instruction::Alloca:
@@ -1428,7 +1434,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
         // inferred hold at original context instruction.  TODO: It may be
         // correct to use the original context.  IF warranted, explore and
         // add sufficient tests to cover.
-        SimplifyQuery RecQ = Q;
+        SimplifyQuery RecQ = Q.getWithoutCondContext();
         RecQ.CxtI = P;
         computeKnownBits(R, DemandedElts, Known2, Depth + 1, RecQ);
         switch (Opcode) {
@@ -1461,21 +1467,21 @@ static void computeKnownBitsFromOperator(const Operator *I,
         // phi. This is important because that is where the value is actually
         // "evaluated" even though it is used later somewhere else. (see also
         // D69571).
-        SimplifyQuery RecQ = Q;
+        SimplifyQuery RecQ = Q.getWithoutCondContext();
 
         unsigned OpNum = P->getOperand(0) == R ? 0 : 1;
         Instruction *RInst = P->getIncomingBlock(OpNum)->getTerminator();
-        Instruction *LInst = P->getIncomingBlock(1-OpNum)->getTerminator();
+        Instruction *LInst = P->getIncomingBlock(1 - OpNum)->getTerminator();
 
         // Ok, we have a PHI of the form L op= R. Check for low
         // zero bits.
         RecQ.CxtI = RInst;
-        computeKnownBits(R, Known2, Depth + 1, RecQ);
+        computeKnownBits(R, DemandedElts, Known2, Depth + 1, RecQ);
 
         // We need to take the minimum number of known bits
         KnownBits Known3(BitWidth);
         RecQ.CxtI = LInst;
-        computeKnownBits(L, Known3, Depth + 1, RecQ);
+        computeKnownBits(L, DemandedElts, Known3, Depth + 1, RecQ);
 
         Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
                                        Known3.countMinTrailingZeros()));
@@ -1539,7 +1545,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
         // phi. This is important because that is where the value is actually
         // "evaluated" even though it is used later somewhere else. (see also
         // D69571).
-        SimplifyQuery RecQ = Q;
+        SimplifyQuery RecQ = Q.getWithoutCondContext();
         RecQ.CxtI = P->getIncomingBlock(u)->getTerminator();
 
         Known2 = KnownBits(BitWidth);
@@ -1548,7 +1554,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         // want to waste time spinning around in loops.
         // TODO: See if we can base recursion limiter on number of incoming phi
         // edges so we don't overly clamp analysis.
-        computeKnownBits(IncValue, Known2, MaxAnalysisRecursionDepth - 1, RecQ);
+        computeKnownBits(IncValue, DemandedElts, Known2,
+                         MaxAnalysisRecursionDepth - 1, RecQ);
 
         // See if we can further use a conditional branch into the phi
         // to help us determine the range of the value.
@@ -1619,9 +1626,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
     }
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
       switch (II->getIntrinsicID()) {
-      default: break;
+      default:
+        break;
       case Intrinsic::abs: {
-        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
         bool IntMinIsPoison = match(II->getArgOperand(1), m_One());
         Known = Known2.abs(IntMinIsPoison);
         break;
@@ -1637,7 +1645,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
         Known.One |= Known2.One.byteSwap();
         break;
       case Intrinsic::ctlz: {
-        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
         // If we have a known 1, its position is our upper bound.
         unsigned PossibleLZ = Known2.countMaxLeadingZeros();
         // If this call is poison for 0 input, the result will be less than 2^n.
@@ -1648,7 +1656,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
       case Intrinsic::cttz: {
-        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
         // If we have a known 1, its position is our upper bound.
         unsigned PossibleTZ = Known2.countMaxTrailingZeros();
         // If this call is poison for 0 input, the result will be less than 2^n.
@@ -1659,7 +1667,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
       case Intrinsic::ctpop: {
-        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
         // We can bound the space the count needs.  Also, bits known to be zero
         // can't contribute to the population.
         unsigned BitsPossiblySet = Known2.countMaxPopulation();
@@ -1681,8 +1689,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
           ShiftAmt = BitWidth - ShiftAmt;
 
         KnownBits Known3(BitWidth);
-        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known3, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known3, Depth + 1, Q);
 
         Known.Zero =
             Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt);
@@ -1691,27 +1699,30 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
       case Intrinsic::uadd_sat:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::uadd_sat(Known, Known2);
         break;
       case Intrinsic::usub_sat:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::usub_sat(Known, Known2);
         break;
       case Intrinsic::sadd_sat:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::sadd_sat(Known, Known2);
         break;
       case Intrinsic::ssub_sat:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::ssub_sat(Known, Known2);
         break;
         // Vec reverse preserves bits from input vec.
       case Intrinsic::vector_reverse:
+        computeKnownBits(I->getOperand(0), DemandedElts.reverseBits(), Known,
+                         Depth + 1, Q);
+        break;
         // for min/max/and/or reduce, any bit common to each element in the
         // input vec is set in the output.
       case Intrinsic::vector_reduce_and:
@@ -1738,31 +1749,31 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
       case Intrinsic::umin:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::umin(Known, Known2);
         break;
       case Intrinsic::umax:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::umax(Known, Known2);
         break;
       case Intrinsic::smin:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::smin(Known, Known2);
         break;
       case Intrinsic::smax:
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
-        computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::smax(Known, Known2);
         break;
       case Intrinsic::ptrmask: {
-        computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+        computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
 
         const Value *Mask = I->getOperand(1);
         Known2 = KnownBits(Mask->getType()->getScalarSizeInBits());
-        computeKnownBits(Mask, Known2, Depth + 1, Q);
+        computeKnownBits(Mask, DemandedElts, Known2, Depth + 1, Q);
         // TODO: 1-extend would be more precise.
         Known &= Known2.anyextOrTrunc(BitWidth);
         break;
@@ -2317,7 +2328,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
     // it is an induction variable where in each step its value is a power of
     // two.
     auto *PN = cast<PHINode>(I);
-    SimplifyQuery RecQ = Q;
+    SimplifyQuery RecQ = Q.getWithoutCondContext();
 
     // Check if it is an induction variable and always power of two.
     if (isPowerOfTwoRecurrence(PN, OrZero, Depth, RecQ))
@@ -2582,10 +2593,10 @@ static bool isNonZeroRecurrence(const PHINode *PN) {
 }
 
 static bool matchOpWithOpEqZero(Value *Op0, Value *Op1) {
-  ICmpInst::Predicate Pred;
-  return (match(Op0, m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op1), m_Zero()))) ||
-          match(Op1, m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op0), m_Zero())))) &&
-         Pred == ICmpInst::ICMP_EQ;
+  return match(Op0, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ,
+                                                m_Specific(Op1), m_Zero()))) ||
+         match(Op1, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ,
+                                                m_Specific(Op0), m_Zero())));
 }
 
 static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth,
@@ -2648,7 +2659,7 @@ static bool isNonZeroSub(const APInt &DemandedElts, unsigned Depth,
     if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Q, Depth))
       return true;
 
-  return ::isKnownNonEqual(X, Y, Depth, Q);
+  return ::isKnownNonEqual(X, Y, DemandedElts, Depth, Q);
 }
 
 static bool isNonZeroMul(const APInt &DemandedElts, unsigned Depth,
@@ -2782,7 +2793,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     if (!isa<ScalableVectorType>(I->getType()) &&
         Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <=
             Q.DL.getTypeSizeInBits(I->getType()).getFixedValue())
-      return isKnownNonZero(I->getOperand(0), Q, Depth);
+      return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
     break;
   case Instruction::PtrToInt:
     // Similar to int2ptr above, we can look through ptr2int here if the cast
@@ -2790,13 +2801,13 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     if (!isa<ScalableVectorType>(I->getType()) &&
         Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <=
             Q.DL.getTypeSizeInBits(I->getType()).getFixedValue())
-      return isKnownNonZero(I->getOperand(0), Q, Depth);
+      return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
     break;
   case Instruction::Trunc:
     // nuw/nsw trunc preserves zero/non-zero status of input.
     if (auto *TI = dyn_cast<TruncInst>(I))
       if (TI->hasNoSignedWrap() || TI->hasNoUnsignedWrap())
-        return isKnownNonZero(TI->getOperand(0), Q, Depth);
+        return isKnownNonZero(TI->getOperand(0), DemandedElts, Q, Depth);
     break;
 
   case Instruction::Sub:
@@ -2817,13 +2828,13 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
   case Instruction::SExt:
   case Instruction::ZExt:
     // ext X != 0 if X != 0.
-    return isKnownNonZero(I->getOperand(0), Q, Depth);
+    return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
 
   case Instruction::Shl: {
     // shl nsw/nuw can't remove any non-zero bits.
     const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(I);
     if (Q.IIQ.hasNoUnsignedWrap(BO) || Q.IIQ.hasNoSignedWrap(BO))
-      return isKnownNonZero(I->getOperand(0), Q, Depth);
+      return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
 
     // shl X, Y != 0 if X is odd.  Note that the value of the shift is undefined
     // if the lowest bit is shifted off the end.
@@ -2839,7 +2850,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     // shr exact can only shift out zero bits.
     const PossiblyExactOperator *BO = cast<PossiblyExactOperator>(I);
     if (BO->isExact())
-      return isKnownNonZero(I->getOperand(0), Q, Depth);
+      return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
 
     // shr X, Y != 0 if X is negative.  Note that the value of the shift is not
     // defined if the sign bit is shifted off the end.
@@ -2931,7 +2942,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
       return true;
 
     // Check if all incoming values are non-zero using recursion.
-    SimplifyQuery RecQ = Q;
+    SimplifyQuery RecQ = Q.getWithoutCondContext();
     unsigned NewDepth = std::max(Depth, MaxAnalysisRecursionDepth - 1);
     return llvm::all_of(PN->operands(), [&](const Use &U) {
       if (U.get() == PN)
@@ -3094,6 +3105,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
                             /*NSW=*/true, /* NUW=*/false);
         // Vec reverse preserves zero/non-zero status from input vec.
       case Intrinsic::vector_reverse:
+        return isKnownNonZero(II->getArgOperand(0), DemandedElts.reverseBits(),
+                              Q, Depth);
         // umin/smin/smax/smin/or of all non-zero elements is always non-zero.
       case Intrinsic::vector_reduce_or:
       case Intrinsic::vector_reduce_umax:
@@ -3418,7 +3431,8 @@ getInvertibleOperands(const Operator *Op1,
 /// Only handle a small subset of binops where (binop V2, X) with non-zero X
 /// implies V2 != V1.
 static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2,
-                                      unsigned Depth, const SimplifyQuery &Q) {
+                                      const APInt &DemandedElts, unsigned Depth,
+                                      const SimplifyQuery &Q) {
   const BinaryOperator *BO = dyn_cast<BinaryOperator>(V1);
   if (!BO)
     return false;
@@ -3438,39 +3452,43 @@ static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2,
       Op = BO->getOperand(0);
     else
       return false;
-    return isKnownNonZero(Op, Q, Depth + 1);
+    return isKnownNonZero(Op, DemandedElts, Q, Depth + 1);
   }
   return false;
 }
 
 /// Return true if V2 == V1 * C, where V1 is known non-zero, C is not 0/1 and
 /// the multiplication is nuw or nsw.
-static bool isNonEqualMul(const Value *V1, const Value *V2, unsigned Depth,
+static bool isNonEqualMul(const Value *V1, const Value *V2,
+                          const APInt &DemandedElts, unsigned Depth,
                           const SimplifyQuery &Q) {
   if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(V2)) {
     const APInt *C;
     return match(OBO, m_Mul(m_Specific(V1), m_APInt(C))) &&
            (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) &&
-           !C->isZero() && !C->isOne() && isKnownNonZero(V1, Q, Depth + 1);
+           !C->isZero() && !C->isOne() &&
+           isKnownNonZero(V1, DemandedElts, Q, Depth + 1);
   }
   return false;
 }
 
 /// Return true if V2 == V1 << C, where V1 is known non-zero, C is not 0 and
 /// the shift is nuw or nsw.
-static bool isNonEqualShl(const Value *V1, const Value *V2, unsigned Depth,
+static bool isNonEqualShl(const Value *V1, const Value *V2,
+                          const APInt &DemandedElts, unsigned Depth,
                           const SimplifyQuery &Q) {
   if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(V2)) {
     const APInt *C;
     return match(OBO, m_Shl(m_Specific(V1), m_APInt(C))) &&
            (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) &&
-           !C->isZero() && isKnownNonZero(V1, Q, Depth + 1);
+           !C->isZero() && isKnownNonZero(V1, DemandedElts, Q, Depth + 1);
   }
   return false;
 }
 
 static bool isNonEqualPHIs(const PHINode *PN1, const PHINode *PN2,
-                           unsigned Depth, const SimplifyQuery &Q) {
+                           const APInt &DemandedElts, unsigned Depth,
+                           const SimplifyQuery &Q) {
   // Check two PHIs are in same block.
   if (PN1->getParent() != PN2->getParent())
     return false;
@@ -3490,16 +3508,17 @@ static bool isNonEqualPHIs(const PHINode *PN1, const PHINode *PN2,
     if (UsedFullRecursion)
       return false;
 
-    SimplifyQuery RecQ = Q;
+    SimplifyQuery RecQ = Q.getWithoutCondContext();
     RecQ.CxtI = IncomBB->getTerminator();
-    if (!isKnownNonEqual(IV1, IV2, Depth + 1, RecQ))
+    if (!isKnownNonEqual(IV1, IV2, DemandedElts, Depth + 1, RecQ))
       return false;
     UsedFullRecursion = true;
   }
   return true;
 }
 
-static bool isNonEqualSelect(const Value *V1, const Value *V2, unsigned Depth,
+static bool isNonEqualSelect(const Value *V1, const Value *V2,
+                             const APInt &DemandedElts, unsigned Depth,
                              const SimplifyQuery &Q) {
   const SelectInst *SI1 = dyn_cast<SelectInst>(V1);
   if (!SI1)
@@ -3510,12 +3529,12 @@ static bool isNonEqualSelect(const Value *V1, const Value *V2, unsigned Depth,
     const Value *Cond2 = SI2->getCondition();
     if (Cond1 == Cond2)
       return isKnownNonEqual(SI1->getTrueValue(), SI2->getTrueValue(),
-                             Depth + 1, Q) &&
+                             DemandedElts, Depth + 1, Q) &&
              isKnownNonEqual(SI1->getFalseValue(), SI2->getFalseValue(),
-                             Depth + 1, Q);
+                             DemandedElts, Depth + 1, Q);
   }
-  return isKnownNonEqual(SI1->getTrueValue(), V2, Depth + 1, Q) &&
-         isKnownNonEqual(SI1->getFalseValue(), V2, Depth + 1, Q);
+  return isKnownNonEqual(SI1->getTrueValue(), V2, DemandedElts, Depth + 1, Q) &&
+         isKnownNonEqual(SI1->getFalseValue(), V2, DemandedElts, Depth + 1, Q);
 }
 
 // Check to see if A is both a GEP and is the incoming value for a PHI in the
@@ -3571,7 +3590,8 @@ static bool isNonEqualPointersWithRecursiveGEP(const Value *A, const Value *B,
 }
 
 /// Return true if it is known that V1 != V2.
-static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth,
+static bool isKnownNonEqual(const Value *V1, const Value *V2,
+                            const APInt &DemandedElts, unsigned Depth,
                             const SimplifyQuery &Q) {
   if (V1 == V2)
     return false;
@@ -3589,40 +3609,44 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth,
   auto *O2 = dyn_cast<Operator>(V2);
   if (O1 && O2 && O1->getOpcode() == O2->getOpcode()) {
     if (auto Values = getInvertibleOperands(O1, O2))
-      return isKnownNonEqual(Values->first, Values->second, Depth + 1, Q);
+      return isKnownNonEqual(Values->first, Values->second, DemandedElts,
+                             Depth + 1, Q);
 
     if (const PHINode *PN1 = dyn_cast<PHINode>(V1)) {
       const PHINode *PN2 = cast<PHINode>(V2);
       // FIXME: This is missing a generalization to handle the case where one is
       // a PHI and another one isn't.
-      if (isNonEqualPHIs(PN1, PN2, Depth, Q))
+      if (isNonEqualPHIs(PN1, PN2, DemandedElts, Depth, Q))
         return true;
     };
   }
 
-  if (isModifyingBinopOfNonZero(V1, V2, Depth, Q) ||
-      isModifyingBinopOfNonZero(V2, V1, Depth, Q))
+  if (isModifyingBinopOfNonZero(V1, V2, DemandedElts, Depth, Q) ||
+      isModifyingBinopOfNonZero(V2, V1, DemandedElts, Depth, Q))
     return true;
 
-  if (isNonEqualMul(V1, V2, Depth, Q) || isNonEqualMul(V2, V1, Depth, Q))
+  if (isNonEqualMul(V1, V2, DemandedElts, Depth, Q) ||
+      isNonEqualMul(V2, V1, DemandedElts, Depth, Q))
     return true;
 
-  if (isNonEqualShl(V1, V2, Depth, Q) || isNonEqualShl(V2, V1, Depth, Q))
+  if (isNonEqualShl(V1, V2, DemandedElts, Depth, Q) ||
+      isNonEqualShl(V2, V1, DemandedElts, Depth, Q))
     return true;
 
   if (V1->getType()->isIntOrIntVectorTy()) {
     // Are any known bits in V1 contradictory to known bits in V2? If V1
     // has a known zero where V2 has a known one, they must not be equal.
-    KnownBits Known1 = computeKnownBits(V1, Depth, Q);
+    KnownBits Known1 = computeKnownBits(V1, DemandedElts, Depth, Q);
     if (!Known1.isUnknown()) {
-      KnownBits Known2 = computeKnownBits(V2, Depth, Q);
+      KnownBits Known2 = computeKnownBits(V2, DemandedElts, Depth, Q);
       if (Known1.Zero.intersects(Known2.One) ||
           Known2.Zero.intersects(Known1.One))
         return true;
     }
   }
 
-  if (isNonEqualSelect(V1, V2, Depth, Q) || isNonEqualSelect(V2, V1, Depth, Q))
+  if (isNonEqualSelect(V1, V2, DemandedElts, Depth, Q) ||
+      isNonEqualSelect(V2, V1, DemandedElts, Depth, Q))
     return true;
 
   if (isNonEqualPointersWithRecursiveGEP(V1, V2, Q) ||
@@ -3634,7 +3658,7 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth,
   // Check PtrToInt type matches the pointer size.
   if (match(V1, m_PtrToIntSameSize(Q.DL, m_Value(A))) &&
       match(V2, m_PtrToIntSameSize(Q.DL, m_Value(B))))
-    return isKnownNonEqual(A, B, Depth + 1, Q);
+    return isKnownNonEqual(A, B, DemandedElts, Depth + 1, Q);
 
   return false;
 }
@@ -3773,7 +3797,8 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     default: break;
     case Instruction::SExt:
       Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
-      return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp;
+      return ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q) +
+             Tmp;
 
     case Instruction::SDiv: {
       const APInt *Denominator;
@@ -3785,7 +3810,8 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
           break;
 
         // Calculate the incoming numerator bits.
-        unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+        unsigned NumBits =
+            ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
 
         // Add floor(log(C)) bits to the numerator bits.
         return std::min(TyBits, NumBits + Denominator->logBase2());
@@ -3794,7 +3820,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     }
 
     case Instruction::SRem: {
-      Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+      Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
 
       const APInt *Denominator;
       // srem X, C -> we know that the result is within [-C+1,C) when C is a
@@ -3825,7 +3851,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     }
 
     case Instruction::AShr: {
-      Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+      Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
       // ashr X, C   -> adds C sign bits.  Vectors too.
       const APInt *ShAmt;
       if (match(U->getOperand(1), m_APInt(ShAmt))) {
@@ -3839,11 +3865,22 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     }
     case Instruction::Shl: {
       const APInt *ShAmt;
+      Value *X = nullptr;
       if (match(U->getOperand(1), m_APInt(ShAmt))) {
         // shl destroys sign bits.
-        Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
-        if (ShAmt->uge(TyBits) ||   // Bad shift.
-            ShAmt->uge(Tmp)) break; // Shifted all sign bits out.
+        if (ShAmt->uge(TyBits))
+          break; // Bad shift.
+        // We can look through a zext (more or less treating it as a sext) if
+        // all extended bits are shifted out.
+        if (match(U->getOperand(0), m_ZExt(m_Value(X))) &&
+            ShAmt->uge(TyBits - X->getType()->getScalarSizeInBits())) {
+          Tmp = ComputeNumSignBits(X, DemandedElts, Depth + 1, Q);
+          Tmp += TyBits - X->getType()->getScalarSizeInBits();
+        } else
+          Tmp =
+              ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
+        if (ShAmt->uge(Tmp))
+          break; // Shifted all sign bits out.
         Tmp2 = ShAmt->getZExtValue();
         return Tmp - Tmp2;
       }
@@ -3853,9 +3890,9 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     case Instruction::Or:
     case Instruction::Xor: // NOT is handled here.
       // Logical binary ops preserve the number of sign bits at the worst.
-      Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+      Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
       if (Tmp != 1) {
-        Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
+        Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q);
         FirstAnswer = std::min(Tmp, Tmp2);
         // We computed what we know about the sign bits as our first
         // answer. Now proceed to the generic code that uses
@@ -3871,9 +3908,10 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
       if (isSignedMinMaxClamp(U, X, CLow, CHigh))
         return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits());
 
-      Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
-      if (Tmp == 1) break;
-      Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q);
+      Tmp = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q);
+      if (Tmp == 1)
+        break;
+      Tmp2 = ComputeNumSignBits(U->getOperand(2), DemandedElts, Depth + 1, Q);
       return std::min(Tmp, Tmp2);
     }
 
@@ -3887,7 +3925,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
       if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
         if (CRHS->isAllOnesValue()) {
           KnownBits Known(TyBits);
-          computeKnownBits(U->getOperand(0), Known, Depth + 1, Q);
+          computeKnownBits(U->getOperand(0), DemandedElts, Known, Depth + 1, Q);
 
           // If the input is known to be 0 or 1, the output is 0/-1, which is
           // all sign bits set.
@@ -3900,19 +3938,21 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
             return Tmp;
         }
 
-      Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
-      if (Tmp2 == 1) break;
+      Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q);
+      if (Tmp2 == 1)
+        break;
       return std::min(Tmp, Tmp2) - 1;
 
     case Instruction::Sub:
-      Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
-      if (Tmp2 == 1) break;
+      Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q);
+      if (Tmp2 == 1)
+        break;
 
       // Handle NEG.
       if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
         if (CLHS->isNullValue()) {
           KnownBits Known(TyBits);
-          computeKnownBits(U->getOperand(1), Known, Depth + 1, Q);
+          computeKnownBits(U->getOperand(1), DemandedElts, Known, Depth + 1, Q);
           // If the input is known to be 0 or 1, the output is 0/-1, which is
           // all sign bits set.
           if ((Known.Zero | 1).isAllOnes())
@@ -3929,17 +3969,22 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
 
       // Sub can have at most one carry bit.  Thus we know that the output
       // is, at worst, one more bit than the inputs.
-      Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
-      if (Tmp == 1) break;
+      Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
+      if (Tmp == 1)
+        break;
       return std::min(Tmp, Tmp2) - 1;
 
     case Instruction::Mul: {
       // The output of the Mul can be at most twice the valid bits in the
       // inputs.
-      unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
-      if (SignBitsOp0 == 1) break;
-      unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
-      if (SignBitsOp1 == 1) break;
+      unsigned SignBitsOp0 =
+          ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
+      if (SignBitsOp0 == 1)
+        break;
+      unsigned SignBitsOp1 =
+          ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q);
+      if (SignBitsOp1 == 1)
+        break;
       unsigned OutValidBits =
           (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1);
       return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1;
@@ -3955,13 +4000,13 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
 
       // Take the minimum of all incoming values.  This can't infinitely loop
       // because of our depth threshold.
-      SimplifyQuery RecQ = Q;
+      SimplifyQuery RecQ = Q.getWithoutCondContext();
       Tmp = TyBits;
       for (unsigned i = 0, e = NumIncomingValues; i != e; ++i) {
         if (Tmp == 1) return Tmp;
         RecQ.CxtI = PN->getIncomingBlock(i)->getTerminator();
-        Tmp = std::min(
-            Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, RecQ));
+        Tmp = std::min(Tmp, ComputeNumSignBits(PN->getIncomingValue(i),
+                                               DemandedElts, Depth + 1, RecQ));
       }
       return Tmp;
     }
@@ -4022,10 +4067,13 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
     case Instruction::Call: {
       if (const auto *II = dyn_cast<IntrinsicInst>(U)) {
         switch (II->getIntrinsicID()) {
-        default: break;
+        default:
+          break;
         case Intrinsic::abs:
-          Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
-          if (Tmp == 1) break;
+          Tmp =
+              ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q);
+          if (Tmp == 1)
+            break;
 
           // Absolute value reduces number of sign bits by at most 1.
           return Tmp - 1;
@@ -5233,8 +5281,9 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
     }
       // reverse preserves all characteristics of the input vec's element.
     case Intrinsic::vector_reverse:
-      Known = computeKnownFPClass(II->getArgOperand(0), II->getFastMathFlags(),
-                                  InterestedClasses, Depth + 1, Q);
+      Known = computeKnownFPClass(
+          II->getArgOperand(0), DemandedElts.reverseBits(),
+          II->getFastMathFlags(), InterestedClasses, Depth + 1, Q);
       break;
     case Intrinsic::trunc:
     case Intrinsic::floor:
@@ -5859,10 +5908,10 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
         // Recurse, but cap the recursion to two levels, because we don't want
         // to waste time spinning around in loops. We need at least depth 2 to
         // detect known sign bits.
-        computeKnownFPClass(
-            IncValue, DemandedElts, InterestedClasses, KnownSrc,
-            PhiRecursionLimit,
-            Q.getWithInstruction(P->getIncomingBlock(U)->getTerminator()));
+        computeKnownFPClass(IncValue, DemandedElts, InterestedClasses, KnownSrc,
+                            PhiRecursionLimit,
+                            Q.getWithoutCondContext().getWithInstruction(
+                                P->getIncomingBlock(U)->getTerminator()));
 
         if (First) {
           Known = KnownSrc;
@@ -6530,7 +6579,7 @@ const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) {
 
 void llvm::getUnderlyingObjects(const Value *V,
                                 SmallVectorImpl<const Value *> &Objects,
-                                LoopInfo *LI, unsigned MaxLookup) {
+                                const LoopInfo *LI, unsigned MaxLookup) {
   SmallPtrSet<const Value *, 4> Visited;
   SmallVector<const Value *, 4> Worklist;
   Worklist.push_back(V);
@@ -6570,6 +6619,48 @@ void llvm::getUnderlyingObjects(const Value *V,
   } while (!Worklist.empty());
 }
 
+const Value *llvm::getUnderlyingObjectAggressive(const Value *V) {
+  const unsigned MaxVisited = 8;
+
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Worklist;
+  Worklist.push_back(V);
+  const Value *Object = nullptr;
+  // Used as fallback if we can't find a common underlying object through
+  // recursion.
+  bool First = true;
+  const Value *FirstObject = getUnderlyingObject(V);
+  do {
+    const Value *P = Worklist.pop_back_val();
+    P = First ? FirstObject : getUnderlyingObject(P);
+    First = false;
+
+    if (!Visited.insert(P).second)
+      continue;
+
+    if (Visited.size() == MaxVisited)
+      return FirstObject;
+
+    if (auto *SI = dyn_cast<SelectInst>(P)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
+
+    if (auto *PN = dyn_cast<PHINode>(P)) {
+      append_range(Worklist, PN->incoming_values());
+      continue;
+    }
+
+    if (!Object)
+      Object = P;
+    else if (Object != P)
+      return FirstObject;
+  } while (!Worklist.empty());
+
+  return Object;
+}
+
 /// This is the function that does the work of looking through basic
 /// ptrtoint+arithmetic+inttoptr sequences.
 static const Value *getUnderlyingObjectFromInt(const Value *V) {
@@ -6706,30 +6797,20 @@ bool llvm::onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V) {
       V, /* AllowLifetime */ true, /* AllowDroppable */ true);
 }
 
-bool llvm::mustSuppressSpeculation(const LoadInst &LI) {
-  if (!LI.isUnordered())
-    return true;
-  const Function &F = *LI.getFunction();
-  // Speculative load may create a race that did not exist in the source.
-  return F.hasFnAttribute(Attribute::SanitizeThread) ||
-    // Speculative load may load data from dirty regions.
-    F.hasFnAttribute(Attribute::SanitizeAddress) ||
-    F.hasFnAttribute(Attribute::SanitizeHWAddress);
-}
-
 bool llvm::isSafeToSpeculativelyExecute(const Instruction *Inst,
                                         const Instruction *CtxI,
                                         AssumptionCache *AC,
                                         const DominatorTree *DT,
-                                        const TargetLibraryInfo *TLI) {
+                                        const TargetLibraryInfo *TLI,
+                                        bool UseVariableInfo) {
   return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI,
-                                                AC, DT, TLI);
+                                                AC, DT, TLI, UseVariableInfo);
 }
 
 bool llvm::isSafeToSpeculativelyExecuteWithOpcode(
     unsigned Opcode, const Instruction *Inst, const Instruction *CtxI,
-    AssumptionCache *AC, const DominatorTree *DT,
-    const TargetLibraryInfo *TLI) {
+    AssumptionCache *AC, const DominatorTree *DT, const TargetLibraryInfo *TLI,
+    bool UseVariableInfo) {
 #ifndef NDEBUG
   if (Inst->getOpcode() != Opcode) {
     // Check that the operands are actually compatible with the Opcode override.
@@ -6781,6 +6862,9 @@ bool llvm::isSafeToSpeculativelyExecuteWithOpcode(
     return false;
   }
   case Instruction::Load: {
+    if (!UseVariableInfo)
+      return false;
+
     const LoadInst *LI = dyn_cast<LoadInst>(Inst);
     if (!LI)
       return false;
@@ -7014,16 +7098,13 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
     if (isGuaranteedNotToBeUndef(LHS, SQ.AC, SQ.CxtI, SQ.DT))
       return OverflowResult::NeverOverflows;
 
-  // Checking for conditions implied by dominating conditions may be expensive.
-  // Limit it to usub_with_overflow calls for now.
-  if (match(SQ.CxtI,
-            m_Intrinsic<Intrinsic::usub_with_overflow>(m_Value(), m_Value())))
-    if (auto C = isImpliedByDomCondition(CmpInst::ICMP_UGE, LHS, RHS, SQ.CxtI,
-                                         SQ.DL)) {
-      if (*C)
-        return OverflowResult::NeverOverflows;
-      return OverflowResult::AlwaysOverflowsLow;
-    }
+  if (auto C = isImpliedByDomCondition(CmpInst::ICMP_UGE, LHS, RHS, SQ.CxtI,
+                                       SQ.DL)) {
+    if (*C)
+      return OverflowResult::NeverOverflows;
+    return OverflowResult::AlwaysOverflowsLow;
+  }
+
   ConstantRange LHSRange =
       computeConstantRangeIncludingKnownBits(LHS, /*ForSigned=*/false, SQ);
   ConstantRange RHSRange =
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 9a34803f18b35..e622d4ef86e90 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -838,7 +838,6 @@ lltok::Kind LLLexer::LexIdentifier() {
   TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
   TYPEKEYWORD("label",     Type::getLabelTy(Context));
   TYPEKEYWORD("metadata",  Type::getMetadataTy(Context));
-  TYPEKEYWORD("x86_mmx",   Type::getX86_MMXTy(Context));
   TYPEKEYWORD("x86_amx",   Type::getX86_AMXTy(Context));
   TYPEKEYWORD("token",     Type::getTokenTy(Context));
 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index a886f6e3a4b93..9358f89e2bf9d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -7260,13 +7260,13 @@ bool LLParser::parseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
 // If RetType is a non-function pointer type, then this is the short syntax
 // for the call, which means that RetType is just the return type.  Infer the
 // rest of the function argument types from the arguments that are present.
-bool LLParser::resolveFunctionType(Type *RetType,
-                                   const SmallVector<ParamInfo, 16> &ArgList,
+bool LLParser::resolveFunctionType(Type *RetType, ArrayRef<ParamInfo> ArgList,
                                    FunctionType *&FuncTy) {
   FuncTy = dyn_cast<FunctionType>(RetType);
   if (!FuncTy) {
     // Pull out the types of all of the arguments...
-    std::vector<Type*> ParamTypes;
+    SmallVector<Type *, 8> ParamTypes;
+    ParamTypes.reserve(ArgList.size());
     for (const ParamInfo &Arg : ArgList)
       ParamTypes.push_back(Arg.V->getType());
 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 84d624f6cf8fa..fd4ae109b4bb8 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2496,7 +2496,9 @@ Error BitcodeReader::parseTypeTableBody() {
       ResultTy = Type::getMetadataTy(Context);
       break;
     case bitc::TYPE_CODE_X86_MMX:   // X86_MMX
-      ResultTy = Type::getX86_MMXTy(Context);
+      // Deprecated: decodes as <1 x i64>
+      ResultTy =
+          llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
       break;
     case bitc::TYPE_CODE_X86_AMX:   // X86_AMX
       ResultTy = Type::getX86_AMXTy(Context);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index b3ebe70e8c52f..52e15e6880ef2 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -727,6 +727,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_HOT;
   case Attribute::ElementType:
     return bitc::ATTR_KIND_ELEMENTTYPE;
+  case Attribute::HybridPatchable:
+    return bitc::ATTR_KIND_HYBRID_PATCHABLE;
   case Attribute::InlineHint:
     return bitc::ATTR_KIND_INLINE_HINT;
   case Attribute::InReg:
@@ -1086,8 +1088,9 @@ void ModuleBitcodeWriter::writeTypeTable() {
     case Type::FP128TyID:     Code = bitc::TYPE_CODE_FP128;     break;
     case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
     case Type::LabelTyID:     Code = bitc::TYPE_CODE_LABEL;     break;
-    case Type::MetadataTyID:  Code = bitc::TYPE_CODE_METADATA;  break;
-    case Type::X86_MMXTyID:   Code = bitc::TYPE_CODE_X86_MMX;   break;
+    case Type::MetadataTyID:
+      Code = bitc::TYPE_CODE_METADATA;
+      break;
     case Type::X86_AMXTyID:   Code = bitc::TYPE_CODE_X86_AMX;   break;
     case Type::TokenTyID:     Code = bitc::TYPE_CODE_TOKEN;     break;
     case Type::IntegerTyID:
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 1f59ec545b4f7..b64fe83959eb1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -436,6 +436,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
   HasSplitStack = false;
   HasNoSplitStack = false;
+  DbgInfoAvailable = !M.debug_compile_units().empty();
 
   AddrLabelSymbols = nullptr;
 
@@ -541,8 +542,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     if (EmitCodeView && TM.getTargetTriple().isOSWindows())
       DebugHandlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
-      assert(MMI && "MMI could not be nullptr here!");
-      if (MMI->hasDebugInfo()) {
+      if (hasDebugInfo()) {
         DD = new DwarfDebug(this);
         DebugHandlers.push_back(std::unique_ptr<DwarfDebug>(DD));
       }
@@ -1277,8 +1277,7 @@ AsmPrinter::getFunctionCFISectionType(const Function &F) const {
   if (MAI->usesCFIWithoutEH() && F.hasUWTable())
     return CFISection::EH;
 
-  assert(MMI != nullptr && "Invalid machine module info");
-  if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
+  if (hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
     return CFISection::Debug;
 
   return CFISection::None;
@@ -1669,10 +1668,9 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
 }
 
 /// Returns true if function begin and end labels should be emitted.
-static bool needFuncLabels(const MachineFunction &MF) {
-  MachineModuleInfo &MMI = MF.getMMI();
-  if (!MF.getLandingPads().empty() || MF.hasEHFunclets() ||
-      MMI.hasDebugInfo() ||
+static bool needFuncLabels(const MachineFunction &MF, const AsmPrinter &Asm) {
+  if (Asm.hasDebugInfo() || !MF.getLandingPads().empty() ||
+      MF.hasEHFunclets() ||
       MF.getFunction().hasMetadata(LLVMContext::MD_pcsections))
     return true;
 
@@ -1944,7 +1942,7 @@ void AsmPrinter::emitFunctionBody() {
   // are automatically sized.
   bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm();
 
-  if (needFuncLabels(*MF) || EmitFunctionSize) {
+  if (EmitFunctionSize || needFuncLabels(*MF, *this)) {
     // Create a symbol for the end of function.
     CurrentFnEnd = createTempSymbol("func_end");
     OutStreamer->emitLabel(CurrentFnEnd);
@@ -2587,8 +2585,9 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   bool NeedsLocalForSize = MAI->needsLocalForSize();
   if (F.hasFnAttribute("patchable-function-entry") ||
       F.hasFnAttribute("function-instrument") ||
-      F.hasFnAttribute("xray-instruction-threshold") || needFuncLabels(MF) ||
-      NeedsLocalForSize || MF.getTarget().Options.EmitStackSizeSection ||
+      F.hasFnAttribute("xray-instruction-threshold") ||
+      needFuncLabels(MF, *this) || NeedsLocalForSize ||
+      MF.getTarget().Options.EmitStackSizeSection ||
       MF.getTarget().Options.BBAddrMap || MF.hasBBLabels()) {
     CurrentFnBegin = createTempSymbol("func_begin");
     if (NeedsLocalForSize)
@@ -2859,8 +2858,8 @@ bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
     auto *Arr = cast<ConstantArray>(GV->getInitializer());
     for (auto &U : Arr->operands()) {
       auto *C = cast<Constant>(U);
-      auto *Src = cast<Function>(C->getOperand(0)->stripPointerCasts());
-      auto *Dst = cast<Function>(C->getOperand(1)->stripPointerCasts());
+      auto *Src = cast<GlobalValue>(C->getOperand(0)->stripPointerCasts());
+      auto *Dst = cast<GlobalValue>(C->getOperand(1)->stripPointerCasts());
       int Kind = cast<ConstantInt>(C->getOperand(2))->getZExtValue();
 
       if (Src->hasDLLImportStorageClass()) {
@@ -3143,7 +3142,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
     return MCSymbolRefExpr::create(getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return MCSymbolRefExpr::create(GetBlockAddressSymbol(BA), Ctx);
+    return lowerBlockAddressConstant(*BA);
 
   if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV))
     return getObjFileLowering().lowerDSOLocalEquivalent(Equiv, TM);
@@ -3825,6 +3824,10 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
   return const_cast<AsmPrinter *>(this)->getAddrLabelSymbol(BB);
 }
 
+const MCExpr *AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) {
+  return MCSymbolRefExpr::create(GetBlockAddressSymbol(&BA), OutContext);
+}
+
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
   if (getSubtargetInfo().getTargetTriple().isWindowsMSVCEnvironment()) {
@@ -3964,6 +3967,9 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
     CurrentSectionBeginSym = MBB.getSymbol();
   }
 
+  for (auto &Handler : DebugHandlers)
+    Handler->beginCodeAlignment(MBB);
+
   // Emit an alignment directive for this block, if needed.
   const Align Alignment = MBB.getAlignment();
   if (Alignment != Align(1))
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index dddc08b3bc016..7700ffd6da803 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -613,7 +613,7 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
 void CodeViewDebug::beginModule(Module *M) {
   // If module doesn't have named metadata anchors or COFF debug section
   // is not available, skip any debug info related stuff.
-  if (!MMI->hasDebugInfo() ||
+  if (!Asm->hasDebugInfo() ||
       !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
     return;
@@ -636,7 +636,7 @@ void CodeViewDebug::beginModule(Module *M) {
 }
 
 void CodeViewDebug::endModule() {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   // The COFF .debug$S section consists of several subsections, each starting
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 6c70c47de8822..de2263c57493b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -241,10 +241,7 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
          Ty->getTag() == dwarf::DW_TAG_unspecified_type;
 }
 
-static bool hasDebugInfo(const MachineModuleInfo *MMI,
-                         const MachineFunction *MF) {
-  if (!MMI->hasDebugInfo())
-    return false;
+static bool hasDebugInfo(const MachineFunction *MF) {
   auto *SP = MF->getFunction().getSubprogram();
   if (!SP)
     return false;
@@ -258,7 +255,7 @@ static bool hasDebugInfo(const MachineModuleInfo *MMI,
 void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
   PrevInstBB = nullptr;
 
-  if (!Asm || !hasDebugInfo(MMI, MF)) {
+  if (!Asm || !hasDebugInfo(MF)) {
     skippedNonDebugFunction();
     return;
   }
@@ -354,7 +351,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
 }
 
 void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   assert(CurMI == nullptr);
@@ -380,7 +377,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
 }
 
 void DebugHandlerBase::endInstruction() {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   assert(CurMI != nullptr);
@@ -415,7 +412,7 @@ void DebugHandlerBase::endInstruction() {
 }
 
 void DebugHandlerBase::endFunction(const MachineFunction *MF) {
-  if (Asm && hasDebugInfo(MMI, MF))
+  if (Asm && hasDebugInfo(MF))
     endFunctionImpl(MF);
   DbgValues.clear();
   DbgLabels.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 49f3fc1a1fa59..087ee02a7f2b3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -90,7 +90,7 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
   shouldEmitLSDA = shouldEmitPersonality &&
     LSDAEncoding != dwarf::DW_EH_PE_omit;
 
-  const MCAsmInfo &MAI = *MF->getMMI().getContext().getAsmInfo();
+  const MCAsmInfo &MAI = *MF->getContext().getAsmInfo();
   if (MAI.getExceptionHandlingType() != ExceptionHandling::None)
     shouldEmitCFI =
         MAI.usesCFIForEH() && (shouldEmitPersonality || shouldEmitMoves);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 80cd5ec501f25..9b1965c7d6c9d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -354,6 +354,9 @@ DwarfDebug::DwarfDebug(AsmPrinter *A)
 
   UseLocSection = !TT.isNVPTX();
 
+  // Always emit .debug_aranges for SCE tuning.
+  UseARangesSection = GenerateARangeSection || tuneForSCE();
+
   HasAppleExtensionAttributes = tuneForLLDB();
 
   // Handle split DWARF.
@@ -1145,14 +1148,15 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
 void DwarfDebug::beginModule(Module *M) {
   DebugHandlerBase::beginModule(M);
 
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm)
     return;
 
   unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
                                        M->debug_compile_units_end());
+  if (NumDebugCUs == 0)
+    return;
+
   assert(NumDebugCUs > 0 && "Asm unexpectedly initialized");
-  assert(MMI->hasDebugInfo() &&
-         "DebugInfoAvailabilty unexpectedly not initialized");
   SingleCU = NumDebugCUs == 1;
   DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
       GVMap;
@@ -1430,7 +1434,7 @@ void DwarfDebug::endModule() {
 
   // If we aren't actually generating debug info (check beginModule -
   // conditionalized on the presence of the llvm.dbg.cu metadata node)
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   // Finalize the debug info for the module.
@@ -1450,7 +1454,7 @@ void DwarfDebug::endModule() {
   emitDebugInfo();
 
   // Emit info into a debug aranges section.
-  if (GenerateARangeSection)
+  if (UseARangesSection)
     emitDebugARanges();
 
   // Emit info into a debug ranges section.
@@ -2990,6 +2994,9 @@ struct ArangeSpan {
 // Emit a debug aranges section, containing a CU lookup for any
 // address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
+  if (ArangeLabels.empty())
+    return;
+
   // Provides a unique id per text section.
   MapVector<MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
 
@@ -3012,8 +3019,7 @@ void DwarfDebug::emitDebugARanges() {
   for (auto &I : SectionMap) {
     MCSection *Section = I.first;
     SmallVector<SymbolCU, 8> &List = I.second;
-    if (List.size() < 1)
-      continue;
+    assert(!List.empty());
 
     // If we have no section (e.g. common), just write out
     // individual spans for each symbol.
@@ -3668,3 +3674,21 @@ bool DwarfDebug::alwaysUseRanges(const DwarfCompileUnit &CU) const {
     return true;
   return false;
 }
+
+void DwarfDebug::beginCodeAlignment(const MachineBasicBlock &MBB) {
+  if (MBB.getAlignment() == Align(1))
+    return;
+
+  auto *SP = MBB.getParent()->getFunction().getSubprogram();
+  bool NoDebug =
+      !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug;
+
+  if (NoDebug)
+    return;
+
+  auto PrevLoc = Asm->OutStreamer->getContext().getCurrentDwarfLoc();
+  Asm->OutStreamer->emitDwarfLocDirective(
+      PrevLoc.getFileNum(), 0, PrevLoc.getColumn(), 0, 0, 0, StringRef());
+  MCDwarfLineEntry::make(Asm->OutStreamer.get(),
+                         Asm->OutStreamer->getCurrentSectionOnly());
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 452485b632c45..6e379396ea079 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -435,6 +435,9 @@ class DwarfDebug : public DebugHandlerBase {
   ///Allow emission of the .debug_loc section.
   bool UseLocSection = true;
 
+  /// Allow emission of .debug_aranges section
+  bool UseARangesSection = false;
+
   /// Generate DWARF v4 type units.
   bool GenerateTypeUnits;
 
@@ -727,6 +730,9 @@ class DwarfDebug : public DebugHandlerBase {
   /// Process beginning of an instruction.
   void beginInstruction(const MachineInstr *MI) override;
 
+  /// Process beginning of code alignment.
+  void beginCodeAlignment(const MachineBasicBlock &MBB) override;
+
   /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits.
   static uint64_t makeTypeSignature(StringRef Identifier);
 
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index c0fc7a2b35ea3..1dc278586f117 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -80,7 +80,6 @@ TailMergeThreshold("tail-merge-threshold",
           cl::init(150), cl::Hidden);
 
 // Heuristic for tail merging (and, inversely, tail duplication).
-// TODO: This should be replaced with a target query.
 static cl::opt<unsigned>
 TailMergeSize("tail-merge-size",
               cl::desc("Min number of instructions to consider tail merging"),
@@ -145,8 +144,6 @@ BranchFolder::BranchFolder(bool DefaultEnableTailMerge, bool CommonHoist,
                            ProfileSummaryInfo *PSI, unsigned MinTailLength)
     : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
       MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) {
-  if (MinCommonTailLength == 0)
-    MinCommonTailLength = TailMergeSize;
   switch (FlagEnableTailMerge) {
   case cl::BOU_UNSET:
     EnableTailMerge = DefaultEnableTailMerge;
@@ -195,6 +192,12 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
   MLI = mli;
   this->MRI = &MRI;
 
+  if (MinCommonTailLength == 0) {
+    MinCommonTailLength = TailMergeSize.getNumOccurrences() > 0
+                              ? TailMergeSize
+                              : TII->getTailMergeSize(MF);
+  }
+
   UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
   if (!UpdateLiveIns)
     MRI.invalidateLiveness();
@@ -1888,7 +1891,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
   // Also avoid moving code above predicated instruction since it's hard to
   // reason about register liveness with predicated instruction.
   bool DontMoveAcrossStore = true;
-  if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+  if (!PI->isSafeToMove(DontMoveAcrossStore) || TII->isPredicated(*PI))
     return MBB->end();
 
   // Find out what registers are live. Note this routine is ignoring other live
@@ -2012,7 +2015,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
       break;
 
     bool DontMoveAcrossStore = true;
-    if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+    if (!TIB->isSafeToMove(DontMoveAcrossStore))
       break;
 
     // Remove kills from ActiveDefsSet, these registers had short live ranges.
diff --git a/llvm/lib/CodeGen/CFGuardLongjmp.cpp b/llvm/lib/CodeGen/CFGuardLongjmp.cpp
index b5d88a7432b17..04de011400568 100644
--- a/llvm/lib/CodeGen/CFGuardLongjmp.cpp
+++ b/llvm/lib/CodeGen/CFGuardLongjmp.cpp
@@ -62,7 +62,7 @@ FunctionPass *llvm::createCFGuardLongjmpPass() { return new CFGuardLongjmp(); }
 bool CFGuardLongjmp::runOnMachineFunction(MachineFunction &MF) {
 
   // Skip modules for which the cfguard flag is not set.
-  if (!MF.getMMI().getModule()->getModuleFlag("cfguard"))
+  if (!MF.getFunction().getParent()->getModuleFlag("cfguard"))
     return false;
 
   // Skip functions that do not have calls to _setjmp.
diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp
index 1ff01ad34b30e..06de92515c044 100644
--- a/llvm/lib/CodeGen/CFIInstrInserter.cpp
+++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -68,9 +68,9 @@ class CFIInstrInserter : public MachineFunctionPass {
   struct MBBCFAInfo {
     MachineBasicBlock *MBB;
     /// Value of cfa offset valid at basic block entry.
-    int IncomingCFAOffset = -1;
+    int64_t IncomingCFAOffset = -1;
     /// Value of cfa offset valid at basic block exit.
-    int OutgoingCFAOffset = -1;
+    int64_t OutgoingCFAOffset = -1;
     /// Value of cfa register valid at basic block entry.
     unsigned IncomingCFARegister = 0;
     /// Value of cfa register valid at basic block exit.
@@ -120,7 +120,7 @@ class CFIInstrInserter : public MachineFunctionPass {
   /// Return the cfa offset value that should be set at the beginning of a MBB
   /// if needed. The negated value is needed when creating CFI instructions that
   /// set absolute offset.
-  int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+  int64_t getCorrectCFAOffset(MachineBasicBlock *MBB) {
     return MBBVector[MBB->getNumber()].IncomingCFAOffset;
   }
 
@@ -175,7 +175,7 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
 
 void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   // Outgoing cfa offset set by the block.
-  int SetOffset = MBBInfo.IncomingCFAOffset;
+  int64_t SetOffset = MBBInfo.IncomingCFAOffset;
   // Outgoing cfa register set by the block.
   unsigned SetRegister = MBBInfo.IncomingCFARegister;
   MachineFunction *MF = MBBInfo.MBB->getParent();
@@ -188,7 +188,7 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   for (MachineInstr &MI : *MBBInfo.MBB) {
     if (MI.isCFIInstruction()) {
       std::optional<unsigned> CSRReg;
-      std::optional<int> CSROffset;
+      std::optional<int64_t> CSROffset;
       unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
       const MCCFIInstruction &CFI = Instrs[CFIIndex];
       switch (CFI.getOperation()) {
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 1d767a3484bca..9d8c9119f7719 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <tuple>
@@ -257,7 +258,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
       return -1.0f;
     }
 
-    float Weight = 1.0f;
+    // Force Weight onto the stack so that x86 doesn't add hidden precision,
+    // similar to HWeight below.
+    stack_float_t Weight = 1.0f;
     if (IsSpillable) {
       // Get loop info for mi.
       if (MI->getParent() != MBB) {
@@ -284,11 +287,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
     Register HintReg = copyHint(MI, LI.reg(), TRI, MRI);
     if (!HintReg)
       continue;
-    // Force hweight onto the stack so that x86 doesn't add hidden precision,
+    // Force HWeight onto the stack so that x86 doesn't add hidden precision,
     // making the comparison incorrectly pass (i.e., 1 > 1 == true??).
-    //
-    // FIXME: we probably shouldn't use floats at all.
-    volatile float HWeight = Hint[HintReg] += Weight;
+    stack_float_t HWeight = Hint[HintReg] += Weight;
     if (HintReg.isVirtual() || MRI.isAllocatable(HintReg))
       CopyHints.insert(CopyHint(HintReg, HWeight));
   }
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 578854cdb4a5d..7fc25cd889a0d 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -92,7 +92,7 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const {
 
   // Don't delete instructions with side effects.
   bool SawStore = false;
-  if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+  if (!MI->isSafeToMove(SawStore) && !MI->isPHI())
     return false;
 
   // Examine each operand.
diff --git a/llvm/lib/CodeGen/EHContGuardCatchret.cpp b/llvm/lib/CodeGen/EHContGuardCatchret.cpp
index f7c6580a73da5..cd1cdb0653618 100644
--- a/llvm/lib/CodeGen/EHContGuardCatchret.cpp
+++ b/llvm/lib/CodeGen/EHContGuardCatchret.cpp
@@ -62,7 +62,7 @@ FunctionPass *llvm::createEHContGuardCatchretPass() {
 bool EHContGuardCatchret::runOnMachineFunction(MachineFunction &MF) {
 
   // Skip modules for which the ehcontguard flag is not set.
-  if (!MF.getMMI().getModule()->getModuleFlag("ehcontguard"))
+  if (!MF.getFunction().getParent()->getModuleFlag("ehcontguard"))
     return false;
 
   // Skip functions that do not have catchret
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index a5c99498921db..0de8112fb72c8 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -181,8 +181,8 @@ class SSAIfConv {
   bool canConvertIf(MachineBasicBlock *MBB, bool Predicate = false);
 
   /// convertIf - If-convert the last block passed to canConvertIf(), assuming
-  /// it is possible. Add any erased blocks to RemovedBlocks.
-  void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+  /// it is possible. Add any blocks that are to be erased to RemoveBlocks.
+  void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
                  bool Predicate = false);
 };
 } // end anonymous namespace
@@ -235,7 +235,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
 
     // We never speculate stores, so an AA pointer isn't necessary.
     bool DontMoveAcrossStore = true;
-    if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+    if (!MI.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "Can't speculate: " << MI);
       return false;
     }
@@ -678,9 +678,9 @@ void SSAIfConv::rewritePHIOperands() {
 /// convertIf - Execute the if conversion after canConvertIf has determined the
 /// feasibility.
 ///
-/// Any basic blocks erased will be added to RemovedBlocks.
+/// Any basic blocks that need to be erased will be added to RemoveBlocks.
 ///
-void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
                           bool Predicate) {
   assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
 
@@ -721,15 +721,18 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
   DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
   TII->removeBranch(*Head);
 
-  // Erase the now empty conditional blocks. It is likely that Head can fall
+  // Mark the now empty conditional blocks for removal and move them to the end.
+  // It is likely that Head can fall
   // through to Tail, and we can join the two blocks.
   if (TBB != Tail) {
-    RemovedBlocks.push_back(TBB);
-    TBB->eraseFromParent();
+    RemoveBlocks.push_back(TBB);
+    if (TBB != &TBB->getParent()->back())
+      TBB->moveAfter(&TBB->getParent()->back());
   }
   if (FBB != Tail) {
-    RemovedBlocks.push_back(FBB);
-    FBB->eraseFromParent();
+    RemoveBlocks.push_back(FBB);
+    if (FBB != &FBB->getParent()->back())
+      FBB->moveAfter(&FBB->getParent()->back());
   }
 
   assert(Head->succ_empty() && "Additional head successors?");
@@ -740,8 +743,9 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
     Head->splice(Head->end(), Tail,
                      Tail->begin(), Tail->end());
     Head->transferSuccessorsAndUpdatePHIs(Tail);
-    RemovedBlocks.push_back(Tail);
-    Tail->eraseFromParent();
+    RemoveBlocks.push_back(Tail);
+    if (Tail != &Tail->getParent()->back())
+      Tail->moveAfter(&Tail->getParent()->back());
   } else {
     // We need a branch to Tail, let code placement work it out later.
     LLVM_DEBUG(dbgs() << "Converting to unconditional branch.\n");
@@ -1062,11 +1066,13 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
   while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
     // If-convert MBB and update analyses.
     invalidateTraces();
-    SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
-    IfConv.convertIf(RemovedBlocks);
+    SmallVector<MachineBasicBlock *, 4> RemoveBlocks;
+    IfConv.convertIf(RemoveBlocks);
     Changed = true;
-    updateDomTree(DomTree, IfConv, RemovedBlocks);
-    updateLoops(Loops, RemovedBlocks);
+    updateDomTree(DomTree, IfConv, RemoveBlocks);
+    for (MachineBasicBlock *MBB : RemoveBlocks)
+      MBB->eraseFromParent();
+    updateLoops(Loops, RemoveBlocks);
   }
   return Changed;
 }
@@ -1200,11 +1206,13 @@ bool EarlyIfPredicator::tryConvertIf(MachineBasicBlock *MBB) {
   bool Changed = false;
   while (IfConv.canConvertIf(MBB, /*Predicate*/ true) && shouldConvertIf()) {
     // If-convert MBB and update analyses.
-    SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
-    IfConv.convertIf(RemovedBlocks, /*Predicate*/ true);
+    SmallVector<MachineBasicBlock *, 4> RemoveBlocks;
+    IfConv.convertIf(RemoveBlocks, /*Predicate*/ true);
     Changed = true;
-    updateDomTree(DomTree, IfConv, RemovedBlocks);
-    updateLoops(Loops, RemovedBlocks);
+    updateDomTree(DomTree, IfConv, RemoveBlocks);
+    for (MachineBasicBlock *MBB : RemoveBlocks)
+      MBB->eraseFromParent();
+    updateLoops(Loops, RemoveBlocks);
   }
   return Changed;
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index 54ac7f72011a6..a15b76440364b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
   GlobalISel.cpp
   Combiner.cpp
   CombinerHelper.cpp
+  CombinerHelperCasts.cpp
   CombinerHelperVectorOps.cpp
   GIMatchTableExecutor.cpp
   GISelChangeObserver.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index dfc3d73e322b8..d930ab2984629 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -68,6 +68,16 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
   return *Builder.getMF().getSubtarget().getTargetLowering();
 }
 
+const MachineFunction &CombinerHelper::getMachineFunction() const {
+  return Builder.getMF();
+}
+
+const DataLayout &CombinerHelper::getDataLayout() const {
+  return getMachineFunction().getDataLayout();
+}
+
+LLVMContext &CombinerHelper::getContext() const { return Builder.getContext(); }
+
 /// \returns The little endian in-memory byte position of byte \p I in a
 /// \p ByteWidth bytes wide type.
 ///
@@ -2582,40 +2592,6 @@ void CombinerHelper::applyCombineExtOfExt(
   }
 }
 
-bool CombinerHelper::matchCombineTruncOfExt(
-    MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
-  Register SrcReg = MI.getOperand(1).getReg();
-  MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
-  unsigned SrcOpc = SrcMI->getOpcode();
-  if (SrcOpc == TargetOpcode::G_ANYEXT || SrcOpc == TargetOpcode::G_SEXT ||
-      SrcOpc == TargetOpcode::G_ZEXT) {
-    MatchInfo = std::make_pair(SrcMI->getOperand(1).getReg(), SrcOpc);
-    return true;
-  }
-  return false;
-}
-
-void CombinerHelper::applyCombineTruncOfExt(
-    MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
-  Register SrcReg = MatchInfo.first;
-  unsigned SrcExtOp = MatchInfo.second;
-  Register DstReg = MI.getOperand(0).getReg();
-  LLT SrcTy = MRI.getType(SrcReg);
-  LLT DstTy = MRI.getType(DstReg);
-  if (SrcTy == DstTy) {
-    MI.eraseFromParent();
-    replaceRegWith(MRI, DstReg, SrcReg);
-    return;
-  }
-  if (SrcTy.getSizeInBits() < DstTy.getSizeInBits())
-    Builder.buildInstr(SrcExtOp, {DstReg}, {SrcReg});
-  else
-    Builder.buildTrunc(DstReg, SrcReg);
-  MI.eraseFromParent();
-}
-
 static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
   const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
   const unsigned TruncSize = TruncTy.getScalarSizeInBits();
@@ -5179,13 +5155,9 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
 
-  unsigned KnownLeadingZeros =
-      KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
   auto &MIB = Builder;
 
   bool UseSRL = false;
-  bool UseNPQ = false;
-  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
   SmallVector<Register, 16> Shifts, Factors;
   auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
   bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
@@ -5213,6 +5185,33 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     return true;
   };
 
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
+    if (!matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern))
+      llvm_unreachable("Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    } else {
+      Shift = Shifts[0];
+      Factor = Factors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRL)
+      Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
+  unsigned KnownLeadingZeros =
+      KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
+
+  bool UseNPQ = false;
+  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
   auto BuildUDIVPattern = [&](const Constant *C) {
     auto *CI = cast<ConstantInt>(C);
     const APInt &Divisor = CI->getValue();
@@ -5258,29 +5257,6 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     return true;
   };
 
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
-    // Collect all magic values from the build vector.
-    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern);
-    (void)Matched;
-    assert(Matched && "Expected unary predicate match to succeed");
-
-    Register Shift, Factor;
-    if (Ty.isVector()) {
-      Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
-      Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
-    } else {
-      Shift = Shifts[0];
-      Factor = Factors[0];
-    }
-
-    Register Res = LHS;
-
-    if (UseSRL)
-      Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
-
-    return MIB.buildMul(Ty, Res, Factor);
-  }
-
   // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
   (void)Matched;
@@ -7457,92 +7433,3 @@ void CombinerHelper::applyExpandFPowI(MachineInstr &MI, int64_t Exponent) {
   Builder.buildCopy(Dst, *Res);
   MI.eraseFromParent();
 }
-
-bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
-                                      BuildFnTy &MatchInfo) {
-  GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
-  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));
-
-  Register Dst = Sext->getReg(0);
-  Register Src = Trunc->getSrcReg();
-
-  LLT DstTy = MRI.getType(Dst);
-  LLT SrcTy = MRI.getType(Src);
-
-  if (DstTy == SrcTy) {
-    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
-    return true;
-  }
-
-  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
-      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
-    MatchInfo = [=](MachineIRBuilder &B) {
-      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
-    };
-    return true;
-  }
-
-  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
-      isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
-    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
-    return true;
-  }
-
-  return false;
-}
-
-bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
-                                      BuildFnTy &MatchInfo) {
-  GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
-  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));
-
-  Register Dst = Zext->getReg(0);
-  Register Src = Trunc->getSrcReg();
-
-  LLT DstTy = MRI.getType(Dst);
-  LLT SrcTy = MRI.getType(Src);
-
-  if (DstTy == SrcTy) {
-    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
-    return true;
-  }
-
-  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
-      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
-    MatchInfo = [=](MachineIRBuilder &B) {
-      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
-    };
-    return true;
-  }
-
-  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
-      isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
-    MatchInfo = [=](MachineIRBuilder &B) {
-      B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
-    };
-    return true;
-  }
-
-  return false;
-}
-
-bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
-                                     BuildFnTy &MatchInfo) {
-  GZext *Zext = cast<GZext>(MRI.getVRegDef(MO.getReg()));
-
-  Register Dst = Zext->getReg(0);
-  Register Src = Zext->getSrcReg();
-
-  LLT DstTy = MRI.getType(Dst);
-  LLT SrcTy = MRI.getType(Src);
-  const auto &TLI = getTargetLowering();
-
-  // Convert zext nneg to sext if sext is the preferred form for the target.
-  if (isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}}) &&
-      TLI.isSExtCheaperThanZExt(getMVTForLLT(SrcTy), getMVTForLLT(DstTy))) {
-    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
-    return true;
-  }
-
-  return false;
-}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
new file mode 100644
index 0000000000000..59295f7a65835
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -0,0 +1,211 @@
+//===- CombinerHelperCasts.cpp---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for G_ANYEXT, G_SEXT, G_TRUNC, and
+// G_ZEXT
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
+  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));
+
+  Register Dst = Sext->getReg(0);
+  Register Src = Trunc->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (DstTy == SrcTy) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
+    };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
+  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));
+
+  Register Dst = Zext->getReg(0);
+  Register Src = Trunc->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (DstTy == SrcTy) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
+    };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
+                                     BuildFnTy &MatchInfo) {
+  GZext *Zext = cast<GZext>(MRI.getVRegDef(MO.getReg()));
+
+  Register Dst = Zext->getReg(0);
+  Register Src = Zext->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+  const auto &TLI = getTargetLowering();
+
+  // Convert zext nneg to sext if sext is the preferred form for the target.
+  if (isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}}) &&
+      TLI.isSExtCheaperThanZExt(getMVTForLLT(SrcTy), getMVTForLLT(DstTy))) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchTruncateOfExt(const MachineInstr &Root,
+                                        const MachineInstr &ExtMI,
+                                        BuildFnTy &MatchInfo) {
+  const GTrunc *Trunc = cast<GTrunc>(&Root);
+  const GExtOp *Ext = cast<GExtOp>(&ExtMI);
+
+  if (!MRI.hasOneNonDBGUse(Ext->getReg(0)))
+    return false;
+
+  Register Dst = Trunc->getReg(0);
+  Register Src = Ext->getSrcReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (SrcTy == DstTy) {
+    // The source and the destination are equally sized. We need to copy.
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+
+    return true;
+  }
+
+  if (SrcTy.getScalarSizeInBits() < DstTy.getScalarSizeInBits()) {
+    // If the source is smaller than the destination, we need to extend.
+
+    if (!isLegalOrBeforeLegalizer({Ext->getOpcode(), {DstTy, SrcTy}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildInstr(Ext->getOpcode(), {Dst}, {Src});
+    };
+
+    return true;
+  }
+
+  if (SrcTy.getScalarSizeInBits() > DstTy.getScalarSizeInBits()) {
+    // If the source is larger than the destination, then we need to truncate.
+
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildTrunc(Dst, Src); };
+
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const {
+  const TargetLowering &TLI = getTargetLowering();
+  const DataLayout &DL = getDataLayout();
+  LLVMContext &Ctx = getContext();
+
+  switch (Opcode) {
+  case TargetOpcode::G_ANYEXT:
+  case TargetOpcode::G_ZEXT:
+    return TLI.isZExtFree(FromTy, ToTy, DL, Ctx);
+  case TargetOpcode::G_TRUNC:
+    return TLI.isTruncateFree(FromTy, ToTy, DL, Ctx);
+  default:
+    return false;
+  }
+}
+
+bool CombinerHelper::matchCastOfSelect(const MachineInstr &CastMI,
+                                       const MachineInstr &SelectMI,
+                                       BuildFnTy &MatchInfo) {
+  const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+  const GSelect *Select = cast<GSelect>(&SelectMI);
+
+  if (!MRI.hasOneNonDBGUse(Select->getReg(0)))
+    return false;
+
+  Register Dst = Cast->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  Register TrueReg = Select->getTrueReg();
+  Register FalseReg = Select->getFalseReg();
+  LLT SrcTy = MRI.getType(TrueReg);
+  Register Cond = Select->getCondReg();
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SELECT, {DstTy, CondTy}}))
+    return false;
+
+  if (!isCastFree(Cast->getOpcode(), DstTy, SrcTy))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto True = B.buildInstr(Cast->getOpcode(), {DstTy}, {TrueReg});
+    auto False = B.buildInstr(Cast->getOpcode(), {DstTy}, {FalseReg});
+    B.buildSelect(Dst, Cond, True, False);
+  };
+
+  return true;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 72dff12423ced..68a8a273a1b47 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2451,8 +2451,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
 
       int FI = getOrCreateFrameIndex(*cast<AllocaInst>(Arg));
       MCSymbol *FrameAllocSym =
-          MF->getMMI().getContext().getOrCreateFrameAllocSymbol(EscapedName,
-                                                                Idx);
+          MF->getContext().getOrCreateFrameAllocSymbol(EscapedName, Idx);
 
       // This should be inserted at the start of the entry block.
       auto LocalEscape =
@@ -2561,6 +2560,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     MIRBuilder.buildVScale(getOrCreateVReg(CI), 1);
     return true;
   }
+  case Intrinsic::scmp:
+    MIRBuilder.buildSCmp(getOrCreateVReg(CI),
+                         getOrCreateVReg(*CI.getOperand(0)),
+                         getOrCreateVReg(*CI.getOperand(1)));
+    return true;
+  case Intrinsic::ucmp:
+    MIRBuilder.buildUCmp(getOrCreateVReg(CI),
+                         getOrCreateVReg(*CI.getOperand(0)),
+                         getOrCreateVReg(*CI.getOperand(1)));
+    return true;
   case Intrinsic::prefetch: {
     Value *Addr = CI.getOperand(0);
     unsigned RW = cast<ConstantInt>(CI.getOperand(1))->getZExtValue();
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 4cb1d01f3e8ca..9a27728dcb4dd 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -62,14 +62,8 @@ INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE,
                     "Select target instructions out of generic instructions",
                     false, false)
 
-InstructionSelect::InstructionSelect(CodeGenOptLevel OL)
-    : MachineFunctionPass(ID), OptLevel(OL) {}
-
-// In order not to crash when calling getAnalysis during testing with -run-pass
-// we use the default opt level here instead of None, so that the addRequired()
-// calls are made in getAnalysisUsage().
-InstructionSelect::InstructionSelect()
-    : MachineFunctionPass(ID), OptLevel(CodeGenOptLevel::Default) {}
+InstructionSelect::InstructionSelect(CodeGenOptLevel OL, char &PassID)
+    : MachineFunctionPass(PassID), OptLevel(OL) {}
 
 void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<TargetPassConfig>();
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 2c77ed8b06008..8fe48195c610b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -194,7 +194,8 @@ LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT MemTy = Query.MMODescrs[MMOIdx].MemoryTy;
     return !MemTy.isByteSized() ||
-           !llvm::has_single_bit<uint32_t>(MemTy.getSizeInBytes());
+           !llvm::has_single_bit<uint32_t>(
+               MemTy.getSizeInBytes().getKnownMinValue());
   };
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bcc30390bc82e..225ec19246231 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -735,8 +735,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
   if (MemType.isVector())
     return RTLIB::UNKNOWN_LIBCALL;
 
-#define LCALLS(A, B)                                                           \
-  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
 #define LCALL5(A)                                                              \
   LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
   switch (Opc) {
@@ -992,6 +991,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                        LocObserver, nullptr);
 }
 
+/// Returns the corresponding libcall for the given Pred and
+/// the ICMP predicate that should be generated to compare with #0
+/// after the libcall.
+static std::pair<RTLIB::Libcall, CmpInst::Predicate>
+getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
+
+  switch (Pred) {
+  case CmpInst::FCMP_OEQ:
+    return {RTLIB::OEQ_F128, CmpInst::ICMP_EQ};
+  case CmpInst::FCMP_UNE:
+    return {RTLIB::UNE_F128, CmpInst::ICMP_NE};
+  case CmpInst::FCMP_OGE:
+    return {RTLIB::OGE_F128, CmpInst::ICMP_SGE};
+  case CmpInst::FCMP_OLT:
+    return {RTLIB::OLT_F128, CmpInst::ICMP_SLT};
+  case CmpInst::FCMP_OLE:
+    return {RTLIB::OLE_F128, CmpInst::ICMP_SLE};
+  case CmpInst::FCMP_OGT:
+    return {RTLIB::OGT_F128, CmpInst::ICMP_SGT};
+  case CmpInst::FCMP_UNO:
+    return {RTLIB::UO_F128, CmpInst::ICMP_NE};
+  default:
+    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
+  }
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver) {
+  auto &MF = MIRBuilder.getMF();
+  auto &Ctx = MF.getFunction().getContext();
+  const GFCmp *Cmp = cast<GFCmp>(&MI);
+
+  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
+  if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
+    return UnableToLegalize;
+
+  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
+
+  // DstReg type is s32
+  const Register DstReg = Cmp->getReg(0);
+  const auto Cond = Cmp->getCond();
+
+  // Reference:
+  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
+  // Generates a libcall followed by ICMP.
+  const auto BuildLibcall =
+      [&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
+          const DstOp &Res = LLT::scalar(32)) -> Register {
+    // FCMP libcall always returns an i32, and needs an ICMP with #0.
+    constexpr LLT TempLLT = LLT::scalar(32);
+    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
+    // Generate libcall, holding result in Temp
+    const auto Status = createLibcall(
+        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
+        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
+        LocObserver, &MI);
+    if (!Status)
+      return {};
+
+    // Compare temp with #0 to get the final result.
+    return MIRBuilder
+        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
+        .getReg(0);
+  };
+
+  // Simple case if we have a direct mapping from predicate to libcall
+  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
+      Libcall != RTLIB::UNKNOWN_LIBCALL &&
+      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
+    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
+      return Legalized;
+    }
+    return UnableToLegalize;
+  }
+
+  // No direct mapping found, should be generated as combination of libcalls.
+
+  switch (Cond) {
+  case CmpInst::FCMP_UEQ: {
+    // FCMP_UEQ: unordered or equal
+    // Convert into (FCMP_OEQ || FCMP_UNO).
+
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
+    if (Oeq && Uno)
+      MIRBuilder.buildOr(DstReg, Oeq, Uno);
+    else
+      return UnableToLegalize;
+
+    break;
+  }
+  case CmpInst::FCMP_ONE: {
+    // FCMP_ONE: ordered and operands are unequal
+    // Convert into (!FCMP_OEQ && !FCMP_UNO).
+
+    // We inverse the predicate instead of generating a NOT
+    // to save one instruction.
+    // On AArch64 isel can even select two cmp into a single ccmp.
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto NotOeq =
+        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto NotUno =
+        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));
+
+    if (NotOeq && NotUno)
+      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
+    else
+      return UnableToLegalize;
+
+    break;
+  }
+  case CmpInst::FCMP_ULT:
+  case CmpInst::FCMP_UGE:
+  case CmpInst::FCMP_UGT:
+  case CmpInst::FCMP_ULE:
+  case CmpInst::FCMP_ORD: {
+    // Convert into: !(inverse(Pred))
+    // E.g. FCMP_ULT becomes !FCMP_OGE
+    // This is equivalent to the following, but saves some instructions.
+    //   MIRBuilder.buildNot(
+    //       PredTy,
+    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
+    //                            Op1, Op2));
+    const auto [InversedLibcall, InversedPred] =
+        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
+    if (!BuildLibcall(InversedLibcall,
+                      CmpInst::getInversePredicate(InversedPred), DstReg))
+      return UnableToLegalize;
+    break;
+  }
+  default:
+    return UnableToLegalize;
+  }
+
+  return Legalized;
+}
+
 // The function is used to legalize operations that set default environment
 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
 // On most targets supported in glibc FE_DFL_MODE is defined as
@@ -1138,6 +1281,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_FCMP: {
+    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
+    if (Status != Legalized)
+      return Status;
+    MI.eraseFromParent();
+    return Status;
+  }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {
     // FIXME: Support other types
@@ -3881,6 +4031,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerFMad(MI);
   case TargetOpcode::G_FFLOOR:
     return lowerFFloor(MI);
+  case TargetOpcode::G_LROUND:
+  case TargetOpcode::G_LLROUND: {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    auto Round = MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_ROUND, {SrcTy},
+                                       {SrcReg});
+    MIRBuilder.buildFPTOSI(DstReg, Round);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_INTRINSIC_ROUND:
     return lowerIntrinsicRound(MI);
   case TargetOpcode::G_FRINT: {
@@ -3889,6 +4050,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
     return Legalized;
   }
+  case TargetOpcode::G_INTRINSIC_LRINT:
+  case TargetOpcode::G_INTRINSIC_LLRINT: {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    auto Round =
+        MIRBuilder.buildInstr(TargetOpcode::G_FRINT, {SrcTy}, {SrcReg});
+    MIRBuilder.buildFPTOSI(DstReg, Round);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
     Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes);
@@ -4006,6 +4178,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
   case G_UMIN:
   case G_UMAX:
     return lowerMinMax(MI);
+  case G_SCMP:
+  case G_UCMP:
+    return lowerThreewayCompare(MI);
   case G_FCOPYSIGN:
     return lowerFCopySign(MI);
   case G_FMINNUM:
@@ -4741,12 +4916,20 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FCEIL:
   case G_FFLOOR:
   case G_FRINT:
+  case G_INTRINSIC_LRINT:
+  case G_INTRINSIC_LLRINT:
   case G_INTRINSIC_ROUND:
   case G_INTRINSIC_ROUNDEVEN:
   case G_INTRINSIC_TRUNC:
   case G_FCOS:
   case G_FSIN:
   case G_FTAN:
+  case G_FACOS:
+  case G_FASIN:
+  case G_FATAN:
+  case G_FCOSH:
+  case G_FSINH:
+  case G_FTANH:
   case G_FSQRT:
   case G_BSWAP:
   case G_BITREVERSE:
@@ -7263,6 +7446,36 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
+  GSUCmp *Cmp = cast<GSUCmp>(&MI);
+
+  Register Dst = Cmp->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT CmpTy = DstTy.changeElementSize(1);
+
+  CmpInst::Predicate LTPredicate = Cmp->isSigned()
+                                       ? CmpInst::Predicate::ICMP_SLT
+                                       : CmpInst::Predicate::ICMP_ULT;
+  CmpInst::Predicate GTPredicate = Cmp->isSigned()
+                                       ? CmpInst::Predicate::ICMP_SGT
+                                       : CmpInst::Predicate::ICMP_UGT;
+
+  auto One = MIRBuilder.buildConstant(DstTy, 1);
+  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
+  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
+                                   Cmp->getRHSReg());
+  auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero);
+
+  auto MinusOne = MIRBuilder.buildConstant(DstTy, -1);
+  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
+                                   Cmp->getRHSReg());
+  MIRBuilder.buildSelect(Dst, IsLT, MinusOne, SelectZeroOrOne);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
   auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 06a6c1f93ef1f..7eb6cd4e0d798 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -911,6 +911,18 @@ MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
   return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildSCmp(const DstOp &Res,
+                                                const SrcOp &Op0,
+                                                const SrcOp &Op1) {
+  return buildInstr(TargetOpcode::G_SCMP, Res, {Op0, Op1});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUCmp(const DstOp &Res,
+                                                const SrcOp &Op0,
+                                                const SrcOp &Op1) {
+  return buildInstr(TargetOpcode::G_UCMP, Res, {Op0, Op1});
+}
+
 MachineInstrBuilder
 MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst,
                               const SrcOp &Op0, const SrcOp &Op1,
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index c906f3a7c9227..080664843aaeb 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -236,7 +236,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
   // If we can move an instruction, we can remove it.  Otherwise, it has
   // a side-effect of some sort.
   bool SawStore = false;
-  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
+  if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
     return false;
 
   // Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index f3789569b78fb..ba5605ad5cb4a 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -2097,7 +2097,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 static bool MaySpeculate(const MachineInstr &MI,
                          SmallSet<MCPhysReg, 4> &LaterRedefs) {
   bool SawStore = true;
-  if (!MI.isSafeToMove(nullptr, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   for (const MachineOperand &MO : MI.operands()) {
diff --git a/llvm/lib/CodeGen/KCFI.cpp b/llvm/lib/CodeGen/KCFI.cpp
index 91c6ac2618279..af19319bc1bb8 100644
--- a/llvm/lib/CodeGen/KCFI.cpp
+++ b/llvm/lib/CodeGen/KCFI.cpp
@@ -89,7 +89,7 @@ bool KCFI::emitCheck(MachineBasicBlock &MBB,
 }
 
 bool KCFI::runOnMachineFunction(MachineFunction &MF) {
-  const Module *M = MF.getMMI().getModule();
+  const Module *M = MF.getFunction().getParent();
   if (!M->getModuleFlag("kcfi"))
     return false;
 
diff --git a/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 1d13173632833..d0dfafeaef561 100644
--- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -167,26 +167,11 @@ Expected<std::unique_ptr<MCStreamer>> LLVMTargetMachine::createMCStreamer(
     if (Options.MCOptions.ShowMCEncoding)
       MCE.reset(getTarget().createMCCodeEmitter(MII, Context));
 
-    bool UseDwarfDirectory = false;
-    switch (Options.MCOptions.MCUseDwarfDirectory) {
-    case MCTargetOptions::DisableDwarfDirectory:
-      UseDwarfDirectory = false;
-      break;
-    case MCTargetOptions::EnableDwarfDirectory:
-      UseDwarfDirectory = true;
-      break;
-    case MCTargetOptions::DefaultDwarfDirectory:
-      UseDwarfDirectory = MAI.enableDwarfFileDirectoryDefault();
-      break;
-    }
-
     std::unique_ptr<MCAsmBackend> MAB(
         getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
     auto FOut = std::make_unique<formatted_raw_ostream>(Out);
     MCStreamer *S = getTarget().createAsmStreamer(
-        Context, std::move(FOut), Options.MCOptions.AsmVerbose,
-        UseDwarfDirectory, InstPrinter, std::move(MCE), std::move(MAB),
-        Options.MCOptions.ShowMCInst);
+        Context, std::move(FOut), InstPrinter, std::move(MCE), std::move(MAB));
     AsmStreamer.reset(S);
     break;
   }
@@ -208,9 +193,7 @@ Expected<std::unique_ptr<MCStreamer>> LLVMTargetMachine::createMCStreamer(
         T, Context, std::unique_ptr<MCAsmBackend>(MAB),
         DwoOut ? MAB->createDwoObjectWriter(Out, *DwoOut)
                : MAB->createObjectWriter(Out),
-        std::unique_ptr<MCCodeEmitter>(MCE), STI, Options.MCOptions.MCRelaxAll,
-        Options.MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ true));
+        std::unique_ptr<MCCodeEmitter>(MCE), STI));
     break;
   }
   case CodeGenFileType::Null:
@@ -276,17 +259,15 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
   const MCRegisterInfo &MRI = *getMCRegisterInfo();
   std::unique_ptr<MCCodeEmitter> MCE(
       getTarget().createMCCodeEmitter(*getMCInstrInfo(), *Ctx));
-  std::unique_ptr<MCAsmBackend> MAB(
-      getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
+  MCAsmBackend *MAB =
+      getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions);
   if (!MCE || !MAB)
     return true;
 
   const Triple &T = getTargetTriple();
   std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
-      T, *Ctx, std::move(MAB), MAB->createObjectWriter(Out), std::move(MCE),
-      STI, Options.MCOptions.MCRelaxAll,
-      Options.MCOptions.MCIncrementalLinkerCompatible,
-      /*DWARFMustBeAtTheEnd*/ true));
+      T, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), MAB->createObjectWriter(Out),
+      std::move(MCE), STI));
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
   FunctionPass *Printer =
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index 247258a1ff553..20d5b2602df12 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -183,6 +183,7 @@ class TransferTracker {
   /// information from it. (XXX make it const?)
   MLocTracker *MTracker;
   MachineFunction &MF;
+  const DebugVariableMap &DVMap;
   bool ShouldEmitDebugEntryValues;
 
   /// Record of all changes in variable locations at a block position. Awkwardly
@@ -191,7 +192,9 @@ class TransferTracker {
   struct Transfer {
     MachineBasicBlock::instr_iterator Pos; /// Position to insert DBG_VALUes
     MachineBasicBlock *MBB; /// non-null if we should insert after.
-    SmallVector<MachineInstr *, 4> Insts; /// Vector of DBG_VALUEs to insert.
+    /// Vector of DBG_VALUEs to insert. Store with their DebugVariableID so that
+    /// they can be sorted into a stable order for emission at a later time.
+    SmallVector<std::pair<DebugVariableID, MachineInstr *>, 4> Insts;
   };
 
   /// Stores the resolved operands (machine locations and constants) and
@@ -227,15 +230,15 @@ class TransferTracker {
   /// Map from LocIdxes to which DebugVariables are based that location.
   /// Mantained while stepping through the block. Not accurate if
   /// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx].
-  DenseMap<LocIdx, SmallSet<DebugVariable, 4>> ActiveMLocs;
+  DenseMap<LocIdx, SmallSet<DebugVariableID, 4>> ActiveMLocs;
 
   /// Map from DebugVariable to it's current location and qualifying meta
   /// information. To be used in conjunction with ActiveMLocs to construct
   /// enough information for the DBG_VALUEs for a particular LocIdx.
-  DenseMap<DebugVariable, ResolvedDbgValue> ActiveVLocs;
+  DenseMap<DebugVariableID, ResolvedDbgValue> ActiveVLocs;
 
   /// Temporary cache of DBG_VALUEs to be entered into the Transfers collection.
-  SmallVector<MachineInstr *, 4> PendingDbgValues;
+  SmallVector<std::pair<DebugVariableID, MachineInstr *>, 4> PendingDbgValues;
 
   /// Record of a use-before-def: created when a value that's live-in to the
   /// current block isn't available in any machine location, but it will be
@@ -244,12 +247,12 @@ class TransferTracker {
     /// Value of this variable, def'd in block.
     SmallVector<DbgOp> Values;
     /// Identity of this variable.
-    DebugVariable Var;
+    DebugVariableID VarID;
     /// Additional variable properties.
     DbgValueProperties Properties;
-    UseBeforeDef(ArrayRef<DbgOp> Values, const DebugVariable &Var,
+    UseBeforeDef(ArrayRef<DbgOp> Values, DebugVariableID VarID,
                  const DbgValueProperties &Properties)
-        : Values(Values.begin(), Values.end()), Var(Var),
+        : Values(Values.begin(), Values.end()), VarID(VarID),
           Properties(Properties) {}
   };
 
@@ -260,15 +263,16 @@ class TransferTracker {
   /// The set of variables that are in UseBeforeDefs and can become a location
   /// once the relevant value is defined. An element being erased from this
   /// collection prevents the use-before-def materializing.
-  DenseSet<DebugVariable> UseBeforeDefVariables;
+  DenseSet<DebugVariableID> UseBeforeDefVariables;
 
   const TargetRegisterInfo &TRI;
   const BitVector &CalleeSavedRegs;
 
   TransferTracker(const TargetInstrInfo *TII, MLocTracker *MTracker,
-                  MachineFunction &MF, const TargetRegisterInfo &TRI,
+                  MachineFunction &MF, const DebugVariableMap &DVMap,
+                  const TargetRegisterInfo &TRI,
                   const BitVector &CalleeSavedRegs, const TargetPassConfig &TPC)
-      : TII(TII), MTracker(MTracker), MF(MF), TRI(TRI),
+      : TII(TII), MTracker(MTracker), MF(MF), DVMap(DVMap), TRI(TRI),
         CalleeSavedRegs(CalleeSavedRegs) {
     TLI = MF.getSubtarget().getTargetLowering();
     auto &TM = TPC.getTM<TargetMachine>();
@@ -352,7 +356,7 @@ class TransferTracker {
   ///    determine the values used by Value.
   void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore,
                     const SmallVectorImpl<ValueLocPair> &ValueToLoc,
-                    DebugVariable Var, DbgValue Value) {
+                    DebugVariableID VarID, DbgValue Value) {
     SmallVector<DbgOp> DbgOps;
     SmallVector<ResolvedDbgOp> ResolvedDbgOps;
     bool IsValueValid = true;
@@ -401,7 +405,7 @@ class TransferTracker {
                                       static_cast<unsigned>(Num.getInst()));
           continue;
         }
-        recoverAsEntryValue(Var, Value.Properties, Num);
+        recoverAsEntryValue(VarID, Value.Properties, Num);
         IsValueValid = false;
         break;
       }
@@ -419,8 +423,7 @@ class TransferTracker {
 
     // Add UseBeforeDef entry for the last value to be defined in this block.
     if (LastUseBeforeDef) {
-      addUseBeforeDef(Var, Value.Properties, DbgOps,
-                      LastUseBeforeDef);
+      addUseBeforeDef(VarID, Value.Properties, DbgOps, LastUseBeforeDef);
       return;
     }
 
@@ -428,13 +431,15 @@ class TransferTracker {
     // the transfer.
     for (const ResolvedDbgOp &Op : ResolvedDbgOps)
       if (!Op.IsConst)
-        ActiveMLocs[Op.Loc].insert(Var);
+        ActiveMLocs[Op.Loc].insert(VarID);
     auto NewValue = ResolvedDbgValue{ResolvedDbgOps, Value.Properties};
-    auto Result = ActiveVLocs.insert(std::make_pair(Var, NewValue));
+    auto Result = ActiveVLocs.insert(std::make_pair(VarID, NewValue));
     if (!Result.second)
       Result.first->second = NewValue;
+    auto &[Var, DILoc] = DVMap.lookupDVID(VarID);
     PendingDbgValues.push_back(
-        MTracker->emitLoc(ResolvedDbgOps, Var, Value.Properties));
+        std::make_pair(VarID, &*MTracker->emitLoc(ResolvedDbgOps, Var, DILoc,
+                                                  Value.Properties)));
   }
 
   /// Load object with live-in variable values. \p mlocs contains the live-in
@@ -445,7 +450,7 @@ class TransferTracker {
   /// FIXME: could just examine mloctracker instead of passing in \p mlocs?
   void
   loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, DbgOpIDMap &DbgOpStore,
-             const SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs,
+             const SmallVectorImpl<std::pair<DebugVariableID, DbgValue>> &VLocs,
              unsigned NumLocs) {
     ActiveMLocs.clear();
     ActiveVLocs.clear();
@@ -506,11 +511,11 @@ class TransferTracker {
 
   /// Record that \p Var has value \p ID, a value that becomes available
   /// later in the function.
-  void addUseBeforeDef(const DebugVariable &Var,
+  void addUseBeforeDef(DebugVariableID VarID,
                        const DbgValueProperties &Properties,
                        const SmallVectorImpl<DbgOp> &DbgOps, unsigned Inst) {
-    UseBeforeDefs[Inst].emplace_back(DbgOps, Var, Properties);
-    UseBeforeDefVariables.insert(Var);
+    UseBeforeDefs[Inst].emplace_back(DbgOps, VarID, Properties);
+    UseBeforeDefVariables.insert(VarID);
   }
 
   /// After the instruction at index \p Inst and position \p pos has been
@@ -529,7 +534,7 @@ class TransferTracker {
     // Populate ValueToLoc with illegal default mappings for every value used by
     // any UseBeforeDef variables for this instruction.
     for (auto &Use : MIt->second) {
-      if (!UseBeforeDefVariables.count(Use.Var))
+      if (!UseBeforeDefVariables.count(Use.VarID))
         continue;
 
       for (DbgOp &Op : Use.Values) {
@@ -568,7 +573,7 @@ class TransferTracker {
     // Using the map of values to locations, produce a final set of values for
     // this variable.
     for (auto &Use : MIt->second) {
-      if (!UseBeforeDefVariables.count(Use.Var))
+      if (!UseBeforeDefVariables.count(Use.VarID))
         continue;
 
       SmallVector<ResolvedDbgOp> DbgOps;
@@ -591,8 +596,9 @@ class TransferTracker {
         continue;
 
       // Otherwise, we're good to go.
-      PendingDbgValues.push_back(
-          MTracker->emitLoc(DbgOps, Use.Var, Use.Properties));
+      auto &[Var, DILoc] = DVMap.lookupDVID(Use.VarID);
+      PendingDbgValues.push_back(std::make_pair(
+          Use.VarID, MTracker->emitLoc(DbgOps, Var, DILoc, Use.Properties)));
     }
     flushDbgValues(pos, nullptr);
   }
@@ -642,7 +648,7 @@ class TransferTracker {
     return Reg != SP && Reg != FP;
   }
 
-  bool recoverAsEntryValue(const DebugVariable &Var,
+  bool recoverAsEntryValue(DebugVariableID VarID,
                            const DbgValueProperties &Prop,
                            const ValueIDNum &Num) {
     // Is this variable location a candidate to be an entry value. First,
@@ -663,6 +669,8 @@ class TransferTracker {
       DIExpr = *NonVariadicExpression;
     }
 
+    auto &[Var, DILoc] = DVMap.lookupDVID(VarID);
+
     // Is the variable appropriate for entry values (i.e., is a parameter).
     if (!isEntryValueVariable(Var, DIExpr))
       return false;
@@ -676,9 +684,8 @@ class TransferTracker {
         DIExpression::prepend(DIExpr, DIExpression::EntryValue);
     Register Reg = MTracker->LocIdxToLocID[Num.getLoc()];
     MachineOperand MO = MachineOperand::CreateReg(Reg, false);
-
-    PendingDbgValues.push_back(
-        emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false}));
+    PendingDbgValues.push_back(std::make_pair(
+        VarID, &*emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false})));
     return true;
   }
 
@@ -687,19 +694,20 @@ class TransferTracker {
     DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
                       MI.getDebugLoc()->getInlinedAt());
     DbgValueProperties Properties(MI);
+    DebugVariableID VarID = DVMap.getDVID(Var);
 
     // Ignore non-register locations, we don't transfer those.
     if (MI.isUndefDebugValue() ||
         all_of(MI.debug_operands(),
                [](const MachineOperand &MO) { return !MO.isReg(); })) {
-      auto It = ActiveVLocs.find(Var);
+      auto It = ActiveVLocs.find(VarID);
       if (It != ActiveVLocs.end()) {
         for (LocIdx Loc : It->second.loc_indices())
-          ActiveMLocs[Loc].erase(Var);
+          ActiveMLocs[Loc].erase(VarID);
         ActiveVLocs.erase(It);
       }
       // Any use-before-defs no longer apply.
-      UseBeforeDefVariables.erase(Var);
+      UseBeforeDefVariables.erase(VarID);
       return;
     }
 
@@ -725,14 +733,15 @@ class TransferTracker {
                 SmallVectorImpl<ResolvedDbgOp> &NewLocs) {
     DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
                       MI.getDebugLoc()->getInlinedAt());
+    DebugVariableID VarID = DVMap.getDVID(Var);
     // Any use-before-defs no longer apply.
-    UseBeforeDefVariables.erase(Var);
+    UseBeforeDefVariables.erase(VarID);
 
     // Erase any previous location.
-    auto It = ActiveVLocs.find(Var);
+    auto It = ActiveVLocs.find(VarID);
     if (It != ActiveVLocs.end()) {
       for (LocIdx Loc : It->second.loc_indices())
-        ActiveMLocs[Loc].erase(Var);
+        ActiveMLocs[Loc].erase(VarID);
     }
 
     // If there _is_ no new location, all we had to do was erase.
@@ -742,7 +751,7 @@ class TransferTracker {
       return;
     }
 
-    SmallVector<std::pair<LocIdx, DebugVariable>> LostMLocs;
+    SmallVector<std::pair<LocIdx, DebugVariableID>> LostMLocs;
     for (ResolvedDbgOp &Op : NewLocs) {
       if (Op.IsConst)
         continue;
@@ -769,17 +778,17 @@ class TransferTracker {
         for (const auto &LostMLoc : LostMLocs)
           ActiveMLocs[LostMLoc.first].erase(LostMLoc.second);
         LostMLocs.clear();
-        It = ActiveVLocs.find(Var);
+        It = ActiveVLocs.find(VarID);
         ActiveMLocs[NewLoc.asU64()].clear();
         VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc);
       }
 
-      ActiveMLocs[NewLoc].insert(Var);
+      ActiveMLocs[NewLoc].insert(VarID);
     }
 
     if (It == ActiveVLocs.end()) {
       ActiveVLocs.insert(
-          std::make_pair(Var, ResolvedDbgValue(NewLocs, Properties)));
+          std::make_pair(VarID, ResolvedDbgValue(NewLocs, Properties)));
     } else {
       It->second.Ops.assign(NewLocs);
       It->second.Properties = Properties;
@@ -822,21 +831,21 @@ class TransferTracker {
     // explicitly undef, then stop here.
     if (!NewLoc && !MakeUndef) {
       // Try and recover a few more locations with entry values.
-      for (const auto &Var : ActiveMLocIt->second) {
-        auto &Prop = ActiveVLocs.find(Var)->second.Properties;
-        recoverAsEntryValue(Var, Prop, OldValue);
+      for (DebugVariableID VarID : ActiveMLocIt->second) {
+        auto &Prop = ActiveVLocs.find(VarID)->second.Properties;
+        recoverAsEntryValue(VarID, Prop, OldValue);
       }
       flushDbgValues(Pos, nullptr);
       return;
     }
 
     // Examine all the variables based on this location.
-    DenseSet<DebugVariable> NewMLocs;
+    DenseSet<DebugVariableID> NewMLocs;
     // If no new location has been found, every variable that depends on this
     // MLoc is dead, so end their existing MLoc->Var mappings as well.
-    SmallVector<std::pair<LocIdx, DebugVariable>> LostMLocs;
-    for (const auto &Var : ActiveMLocIt->second) {
-      auto ActiveVLocIt = ActiveVLocs.find(Var);
+    SmallVector<std::pair<LocIdx, DebugVariableID>> LostMLocs;
+    for (DebugVariableID VarID : ActiveMLocIt->second) {
+      auto ActiveVLocIt = ActiveVLocs.find(VarID);
       // Re-state the variable location: if there's no replacement then NewLoc
       // is std::nullopt and a $noreg DBG_VALUE will be created. Otherwise, a
       // DBG_VALUE identifying the alternative location will be emitted.
@@ -855,19 +864,21 @@ class TransferTracker {
         replace_copy(ActiveVLocIt->second.Ops, DbgOps.begin(), OldOp, NewOp);
       }
 
-      PendingDbgValues.push_back(MTracker->emitLoc(DbgOps, Var, Properties));
+      auto &[Var, DILoc] = DVMap.lookupDVID(VarID);
+      PendingDbgValues.push_back(std::make_pair(
+          VarID, &*MTracker->emitLoc(DbgOps, Var, DILoc, Properties)));
 
       // Update machine locations <=> variable locations maps. Defer updating
       // ActiveMLocs to avoid invalidating the ActiveMLocIt iterator.
       if (!NewLoc) {
         for (LocIdx Loc : ActiveVLocIt->second.loc_indices()) {
           if (Loc != MLoc)
-            LostMLocs.emplace_back(Loc, Var);
+            LostMLocs.emplace_back(Loc, VarID);
         }
         ActiveVLocs.erase(ActiveVLocIt);
       } else {
         ActiveVLocIt->second.Ops = DbgOps;
-        NewMLocs.insert(Var);
+        NewMLocs.insert(VarID);
       }
     }
 
@@ -891,8 +902,8 @@ class TransferTracker {
     // Commit ActiveMLoc changes.
     ActiveMLocIt->second.clear();
     if (!NewMLocs.empty())
-      for (auto &Var : NewMLocs)
-        ActiveMLocs[*NewLoc].insert(Var);
+      for (DebugVariableID VarID : NewMLocs)
+        ActiveMLocs[*NewLoc].insert(VarID);
   }
 
   /// Transfer variables based on \p Src to be based on \p Dst. This handles
@@ -915,17 +926,18 @@ class TransferTracker {
     // For each variable based on Src; create a location at Dst.
     ResolvedDbgOp SrcOp(Src);
     ResolvedDbgOp DstOp(Dst);
-    for (const auto &Var : MovingVars) {
-      auto ActiveVLocIt = ActiveVLocs.find(Var);
+    for (DebugVariableID VarID : MovingVars) {
+      auto ActiveVLocIt = ActiveVLocs.find(VarID);
       assert(ActiveVLocIt != ActiveVLocs.end());
 
       // Update all instances of Src in the variable's tracked values to Dst.
       std::replace(ActiveVLocIt->second.Ops.begin(),
                    ActiveVLocIt->second.Ops.end(), SrcOp, DstOp);
 
-      MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var,
+      auto &[Var, DILoc] = DVMap.lookupDVID(VarID);
+      MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, DILoc,
                                            ActiveVLocIt->second.Properties);
-      PendingDbgValues.push_back(MI);
+      PendingDbgValues.push_back(std::make_pair(VarID, MI));
     }
     ActiveMLocs[Src].clear();
     flushDbgValues(Pos, nullptr);
@@ -1176,11 +1188,9 @@ LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() {
 
 MachineInstrBuilder
 MLocTracker::emitLoc(const SmallVectorImpl<ResolvedDbgOp> &DbgOps,
-                     const DebugVariable &Var,
+                     const DebugVariable &Var, const DILocation *DILoc,
                      const DbgValueProperties &Properties) {
-  DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0,
-                                Var.getVariable()->getScope(),
-                                const_cast<DILocation *>(Var.getInlinedAt()));
+  DebugLoc DL = DebugLoc(DILoc);
 
   const MCInstrDesc &Desc = Properties.IsVariadic
                                 ? TII.get(TargetOpcode::DBG_VALUE_LIST)
@@ -1726,7 +1736,8 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
       LastUseBeforeDef = std::max(LastUseBeforeDef, NewID.getInst());
     }
     if (IsValidUseBeforeDef) {
-      TTracker->addUseBeforeDef(V, {MI.getDebugExpression(), false, true},
+      DebugVariableID VID = DVMap.insertDVID(V, MI.getDebugLoc().get());
+      TTracker->addUseBeforeDef(VID, {MI.getDebugExpression(), false, true},
                                 DbgOps, LastUseBeforeDef);
     }
   }
@@ -1735,9 +1746,11 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
   // This DBG_VALUE is potentially a $noreg / undefined location, if
   // FoundLoc is illegal.
   // (XXX -- could morph the DBG_INSTR_REF in the future).
-  MachineInstr *DbgMI = MTracker->emitLoc(NewLocs, V, Properties);
+  MachineInstr *DbgMI =
+      MTracker->emitLoc(NewLocs, V, MI.getDebugLoc().get(), Properties);
+  DebugVariableID ID = DVMap.getDVID(V);
 
-  TTracker->PendingDbgValues.push_back(DbgMI);
+  TTracker->PendingDbgValues.push_back(std::make_pair(ID, DbgMI));
   TTracker->flushDbgValues(MI.getIterator(), nullptr);
   return true;
 }
@@ -2413,9 +2426,7 @@ bool InstrRefBasedLDV::mlocJoin(
   // as its predecessors. If a PHI is placed, test to see whether it's now a
   // redundant PHI that we can eliminate.
 
-  SmallVector<const MachineBasicBlock *, 8> BlockOrders;
-  for (auto *Pred : MBB.predecessors())
-    BlockOrders.push_back(Pred);
+  SmallVector<const MachineBasicBlock *, 8> BlockOrders(MBB.predecessors());
 
   // Visit predecessors in RPOT order.
   auto Cmp = [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
@@ -3112,7 +3123,8 @@ void InstrRefBasedLDV::getBlocksForScope(
 }
 
 void InstrRefBasedLDV::buildVLocValueMap(
-    const DILocation *DILoc, const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+    const DILocation *DILoc,
+    const SmallSet<DebugVariableID, 4> &VarsWeCareAbout,
     SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks, LiveInsT &Output,
     FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
     SmallVectorImpl<VLocTracker> &AllTheVLocs) {
@@ -3188,7 +3200,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
   // between blocks. This keeps the locality of working on one lexical scope at
   // at time, but avoids re-processing variable values because some other
   // variable has been assigned.
-  for (const auto &Var : VarsWeCareAbout) {
+  for (DebugVariableID VarID : VarsWeCareAbout) {
     // Re-initialize live-ins and live-outs, to clear the remains of previous
     // variables live-ins / live-outs.
     for (unsigned int I = 0; I < NumBlocks; ++I) {
@@ -3202,7 +3214,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
     SmallPtrSet<MachineBasicBlock *, 32> DefBlocks;
     for (const MachineBasicBlock *ExpMBB : BlocksToExplore) {
       auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars;
-      if (TransferFunc.contains(Var))
+      if (TransferFunc.contains(VarID))
         DefBlocks.insert(const_cast<MachineBasicBlock *>(ExpMBB));
     }
 
@@ -3212,7 +3224,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
     // only one value definition, things are very simple.
     if (DefBlocks.size() == 1) {
       placePHIsForSingleVarDefinition(MutBlocksToExplore, *DefBlocks.begin(),
-                                      AllTheVLocs, Var, Output);
+                                      AllTheVLocs, VarID, Output);
       continue;
     }
 
@@ -3254,9 +3266,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
         bool InLocsChanged =
             vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn);
 
-        SmallVector<const MachineBasicBlock *, 8> Preds;
-        for (const auto *Pred : MBB->predecessors())
-          Preds.push_back(Pred);
+        SmallVector<const MachineBasicBlock *, 8> Preds(MBB->predecessors());
 
         // If this block's live-in value is a VPHI, try to pick a machine-value
         // for it. This makes the machine-value available and propagated
@@ -3285,7 +3295,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
 
         // Do transfer function.
         auto &VTracker = AllTheVLocs[MBB->getNumber()];
-        auto TransferIt = VTracker.Vars.find(Var);
+        auto TransferIt = VTracker.Vars.find(VarID);
         if (TransferIt != VTracker.Vars.end()) {
           // Erase on empty transfer (DBG_VALUE $noreg).
           if (TransferIt->second.Kind == DbgValue::Undef) {
@@ -3347,9 +3357,11 @@ void InstrRefBasedLDV::buildVLocValueMap(
         continue;
       if (BlockLiveIn->Kind == DbgValue::VPHI)
         BlockLiveIn->Kind = DbgValue::Def;
+      [[maybe_unused]] auto &[Var, DILoc] = DVMap.lookupDVID(VarID);
       assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() ==
-             Var.getFragment() && "Fragment info missing during value prop");
-      Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn));
+                 Var.getFragment() &&
+             "Fragment info missing during value prop");
+      Output[MBB->getNumber()].push_back(std::make_pair(VarID, *BlockLiveIn));
     }
   } // Per-variable loop.
 
@@ -3360,7 +3372,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
 void InstrRefBasedLDV::placePHIsForSingleVarDefinition(
     const SmallPtrSetImpl<MachineBasicBlock *> &InScopeBlocks,
     MachineBasicBlock *AssignMBB, SmallVectorImpl<VLocTracker> &AllTheVLocs,
-    const DebugVariable &Var, LiveInsT &Output) {
+    DebugVariableID VarID, LiveInsT &Output) {
   // If there is a single definition of the variable, then working out it's
   // value everywhere is very simple: it's every block dominated by the
   // definition. At the dominance frontier, the usual algorithm would:
@@ -3373,7 +3385,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition(
 
   // Pick out the variables value from the block transfer function.
   VLocTracker &VLocs = AllTheVLocs[AssignMBB->getNumber()];
-  auto ValueIt = VLocs.Vars.find(Var);
+  auto ValueIt = VLocs.Vars.find(VarID);
   const DbgValue &Value = ValueIt->second;
 
   // If it's an explicit assignment of "undef", that means there is no location
@@ -3388,7 +3400,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition(
     if (!DomTree->properlyDominates(AssignMBB, ScopeBlock))
       continue;
 
-    Output[ScopeBlock->getNumber()].push_back({Var, Value});
+    Output[ScopeBlock->getNumber()].push_back({VarID, Value});
   }
 
   // All blocks that aren't dominated have no live-in value, thus no variable
@@ -3515,9 +3527,9 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
     const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks,
     LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
     SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF,
-    DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
     const TargetPassConfig &TPC) {
-  TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC);
+  TTracker =
+      new TransferTracker(TII, MTracker, MF, DVMap, *TRI, CalleeSavedRegs, TPC);
   unsigned NumLocs = MTracker->getNumLocs();
   VTracker = nullptr;
 
@@ -3622,31 +3634,24 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
     if (MInLocs.hasTableFor(*MBB))
       EjectBlock(*MBB);
 
-  return emitTransfers(AllVarsNumbering);
+  return emitTransfers();
 }
 
-bool InstrRefBasedLDV::emitTransfers(
-    DenseMap<DebugVariable, unsigned> &AllVarsNumbering) {
+bool InstrRefBasedLDV::emitTransfers() {
   // Go through all the transfers recorded in the TransferTracker -- this is
   // both the live-ins to a block, and any movements of values that happen
   // in the middle.
-  for (const auto &P : TTracker->Transfers) {
+  for (auto &P : TTracker->Transfers) {
     // We have to insert DBG_VALUEs in a consistent order, otherwise they
     // appear in DWARF in different orders. Use the order that they appear
     // when walking through each block / each instruction, stored in
-    // AllVarsNumbering.
-    SmallVector<std::pair<unsigned, MachineInstr *>> Insts;
-    for (MachineInstr *MI : P.Insts) {
-      DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(),
-                        MI->getDebugLoc()->getInlinedAt());
-      Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI);
-    }
-    llvm::sort(Insts, llvm::less_first());
+    // DVMap.
+    llvm::sort(P.Insts, llvm::less_first());
 
     // Insert either before or after the designated point...
     if (P.MBB) {
       MachineBasicBlock &MBB = *P.MBB;
-      for (const auto &Pair : Insts)
+      for (const auto &Pair : P.Insts)
         MBB.insert(P.Pos, Pair.second);
     } else {
       // Terminators, like tail calls, can clobber things. Don't try and place
@@ -3655,7 +3660,7 @@ bool InstrRefBasedLDV::emitTransfers(
         continue;
 
       MachineBasicBlock &MBB = *P.Pos->getParent();
-      for (const auto &Pair : Insts)
+      for (const auto &Pair : P.Insts)
         MBB.insertAfterBundle(P.Pos, Pair.second);
     }
   }
@@ -3710,7 +3715,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
   initialSetup(MF);
 
   MLocTransfer.resize(MaxNumBlocks);
-  vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr));
+  vlocs.resize(MaxNumBlocks, VLocTracker(DVMap, OverlapFragments, EmptyExpr));
   SavedLiveIns.resize(MaxNumBlocks);
 
   produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks);
@@ -3766,10 +3771,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
     MTracker->reset();
   }
 
-  // Number all variables in the order that they appear, to be used as a stable
-  // insertion order later.
-  DenseMap<DebugVariable, unsigned> AllVarsNumbering;
-
   // Map from one LexicalScope to all the variables in that scope.
   ScopeToVarsT ScopeToVars;
 
@@ -3788,16 +3789,15 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
     auto *VTracker = &vlocs[MBB->getNumber()];
     // Collect each variable with a DBG_VALUE in this block.
     for (auto &idx : VTracker->Vars) {
-      const auto &Var = idx.first;
-      const DILocation *ScopeLoc = VTracker->Scopes[Var];
+      DebugVariableID VarID = idx.first;
+      const DILocation *ScopeLoc = VTracker->Scopes[VarID];
       assert(ScopeLoc != nullptr);
       auto *Scope = LS.findLexicalScope(ScopeLoc);
 
       // No insts in scope -> shouldn't have been recorded.
       assert(Scope != nullptr);
 
-      AllVarsNumbering.insert(std::make_pair(Var, AllVarsNumbering.size()));
-      ScopeToVars[Scope].insert(Var);
+      ScopeToVars[Scope].insert(VarID);
       ScopeToAssignBlocks[Scope].insert(VTracker->MBB);
       ScopeToDILocation[Scope] = ScopeLoc;
       ++VarAssignCount;
@@ -3821,7 +3821,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
     // the "else" block of this condition.
     Changed = depthFirstVLocAndEmit(
         MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks,
-        SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC);
+        SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, *TPC);
   }
 
   delete MTracker;
@@ -3840,6 +3840,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
   SeenFragments.clear();
   SeenDbgPHIs.clear();
   DbgOpStore.clear();
+  DVMap.clear();
 
   return Changed;
 }
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
index 8770983481c2f..d9851ad13eab2 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
@@ -35,6 +35,44 @@ class DbgOpIDMap;
 
 using namespace llvm;
 
+using DebugVariableID = unsigned;
+using VarAndLoc = std::pair<DebugVariable, const DILocation *>;
+
+/// Mapping from DebugVariable to/from a unique identifying number. Each
+/// DebugVariable consists of three pointers, and after a small amount of
+/// work to identify overlapping fragments of variables we mostly only use
+/// DebugVariables as identities of variables. It's much more compile-time
+/// efficient to use an ID number instead, which this class provides.
+class DebugVariableMap {
+  DenseMap<DebugVariable, unsigned> VarToIdx;
+  SmallVector<VarAndLoc> IdxToVar;
+
+public:
+  DebugVariableID getDVID(const DebugVariable &Var) const {
+    auto It = VarToIdx.find(Var);
+    assert(It != VarToIdx.end());
+    return It->second;
+  }
+
+  DebugVariableID insertDVID(DebugVariable &Var, const DILocation *Loc) {
+    unsigned Size = VarToIdx.size();
+    auto ItPair = VarToIdx.insert({Var, Size});
+    if (ItPair.second) {
+      IdxToVar.push_back({Var, Loc});
+      return Size;
+    }
+
+    return ItPair.first->second;
+  }
+
+  const VarAndLoc &lookupDVID(DebugVariableID ID) const { return IdxToVar[ID]; }
+
+  void clear() {
+    VarToIdx.clear();
+    IdxToVar.clear();
+  }
+};
+
 /// Handle-class for a particular "location". This value-type uniquely
 /// symbolises a register or stack location, allowing manipulation of locations
 /// without concern for where that location is. Practically, this allows us to
@@ -985,7 +1023,7 @@ class MLocTracker {
   /// information in \pProperties, for variable Var. Don't insert it anywhere,
   /// just return the builder for it.
   MachineInstrBuilder emitLoc(const SmallVectorImpl<ResolvedDbgOp> &DbgOps,
-                              const DebugVariable &Var,
+                              const DebugVariable &Var, const DILocation *DILoc,
                               const DbgValueProperties &Properties);
 };
 
@@ -1003,38 +1041,45 @@ using OverlapMap =
 /// identified.
 class VLocTracker {
 public:
+  /// Ref to function-wide map of DebugVariable <=> ID-numbers.
+  DebugVariableMap &DVMap;
   /// Map DebugVariable to the latest Value it's defined to have.
   /// Needs to be a MapVector because we determine order-in-the-input-MIR from
-  /// the order in this container.
+  /// the order in this container. (FIXME: likely no longer true as the ordering
+  /// is now provided by DebugVariableMap).
   /// We only retain the last DbgValue in each block for each variable, to
   /// determine the blocks live-out variable value. The Vars container forms the
   /// transfer function for this block, as part of the dataflow analysis. The
   /// movement of values between locations inside of a block is handled at a
   /// much later stage, in the TransferTracker class.
-  MapVector<DebugVariable, DbgValue> Vars;
-  SmallDenseMap<DebugVariable, const DILocation *, 8> Scopes;
+  MapVector<DebugVariableID, DbgValue> Vars;
+  SmallDenseMap<DebugVariableID, const DILocation *, 8> Scopes;
   MachineBasicBlock *MBB = nullptr;
   const OverlapMap &OverlappingFragments;
   DbgValueProperties EmptyProperties;
 
 public:
-  VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr)
-      : OverlappingFragments(O), EmptyProperties(EmptyExpr, false, false) {}
+  VLocTracker(DebugVariableMap &DVMap, const OverlapMap &O,
+              const DIExpression *EmptyExpr)
+      : DVMap(DVMap), OverlappingFragments(O),
+        EmptyProperties(EmptyExpr, false, false) {}
 
   void defVar(const MachineInstr &MI, const DbgValueProperties &Properties,
               const SmallVectorImpl<DbgOpID> &DebugOps) {
     assert(MI.isDebugValueLike());
     DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
                       MI.getDebugLoc()->getInlinedAt());
+    // Either insert or fetch an ID number for this variable.
+    DebugVariableID VarID = DVMap.insertDVID(Var, MI.getDebugLoc().get());
     DbgValue Rec = (DebugOps.size() > 0)
                        ? DbgValue(DebugOps, Properties)
                        : DbgValue(Properties, DbgValue::Undef);
 
     // Attempt insertion; overwrite if it's already mapped.
-    auto Result = Vars.insert(std::make_pair(Var, Rec));
+    auto Result = Vars.insert(std::make_pair(VarID, Rec));
     if (!Result.second)
       Result.first->second = Rec;
-    Scopes[Var] = MI.getDebugLoc().get();
+    Scopes[VarID] = MI.getDebugLoc().get();
 
     considerOverlaps(Var, MI.getDebugLoc().get());
   }
@@ -1056,13 +1101,15 @@ class VLocTracker {
 
       DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo,
                                Var.getInlinedAt());
+      // Produce an ID number for this overlapping fragment of a variable.
+      DebugVariableID OverlappedID = DVMap.insertDVID(Overlapped, Loc);
       DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef);
 
       // Attempt insertion; overwrite if it's already mapped.
-      auto Result = Vars.insert(std::make_pair(Overlapped, Rec));
+      auto Result = Vars.insert(std::make_pair(OverlappedID, Rec));
       if (!Result.second)
         Result.first->second = Rec;
-      Scopes[Overlapped] = Loc;
+      Scopes[OverlappedID] = Loc;
     }
   }
 
@@ -1093,7 +1140,7 @@ class InstrRefBasedLDV : public LDVImpl {
   /// variables to their values.
   using LiveIdxT = DenseMap<const MachineBasicBlock *, DbgValue *>;
 
-  using VarAndLoc = std::pair<DebugVariable, DbgValue>;
+  using VarAndLoc = std::pair<DebugVariableID, DbgValue>;
 
   /// Type for a live-in value: the predecessor block, and its value.
   using InValueT = std::pair<MachineBasicBlock *, DbgValue *>;
@@ -1106,7 +1153,8 @@ class InstrRefBasedLDV : public LDVImpl {
   using ScopeToDILocT = DenseMap<const LexicalScope *, const DILocation *>;
 
   /// Mapping from lexical scopes to variables in that scope.
-  using ScopeToVarsT = DenseMap<const LexicalScope *, SmallSet<DebugVariable, 4>>;
+  using ScopeToVarsT =
+      DenseMap<const LexicalScope *, SmallSet<DebugVariableID, 4>>;
 
   /// Mapping from lexical scopes to blocks where variables in that scope are
   /// assigned. Such blocks aren't necessarily "in" the lexical scope, it's
@@ -1200,6 +1248,11 @@ class InstrRefBasedLDV : public LDVImpl {
 
   DbgOpIDMap DbgOpStore;
 
+  /// Mapping between DebugVariables and unique ID numbers. This is a more
+  /// efficient way to represent the identity of a variable, versus a plain
+  /// DebugVariable.
+  DebugVariableMap DVMap;
+
   /// True if we need to examine call instructions for stack clobbers. We
   /// normally assume that they don't clobber SP, but stack probes on Windows
   /// do.
@@ -1330,9 +1383,9 @@ class InstrRefBasedLDV : public LDVImpl {
   /// performance as it doesn't have to find the dominance frontier between
   /// different assignments.
   void placePHIsForSingleVarDefinition(
-          const SmallPtrSetImpl<MachineBasicBlock *> &InScopeBlocks,
-          MachineBasicBlock *MBB, SmallVectorImpl<VLocTracker> &AllTheVLocs,
-          const DebugVariable &Var, LiveInsT &Output);
+      const SmallPtrSetImpl<MachineBasicBlock *> &InScopeBlocks,
+      MachineBasicBlock *MBB, SmallVectorImpl<VLocTracker> &AllTheVLocs,
+      DebugVariableID Var, LiveInsT &Output);
 
   /// Calculate the iterated-dominance-frontier for a set of defs, using the
   /// existing LLVM facilities for this. Works for a single "value" or
@@ -1381,7 +1434,7 @@ class InstrRefBasedLDV : public LDVImpl {
   /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks
   /// locations through.
   void buildVLocValueMap(const DILocation *DILoc,
-                         const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+                         const SmallSet<DebugVariableID, 4> &VarsWeCareAbout,
                          SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks,
                          LiveInsT &Output, FuncValueTable &MOutLocs,
                          FuncValueTable &MInLocs,
@@ -1414,10 +1467,8 @@ class InstrRefBasedLDV : public LDVImpl {
       const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders);
 
   /// Take collections of DBG_VALUE instructions stored in TTracker, and
-  /// install them into their output blocks. Preserves a stable order of
-  /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through
-  /// the AllVarsNumbering order.
-  bool emitTransfers(DenseMap<DebugVariable, unsigned> &AllVarsNumbering);
+  /// install them into their output blocks.
+  bool emitTransfers();
 
   /// Boilerplate computation of some initial sets, artifical blocks and
   /// RPOT block ordering.
@@ -1437,13 +1488,14 @@ class InstrRefBasedLDV : public LDVImpl {
   /// block information can be fully computed before exploration finishes,
   /// allowing us to emit it and free data structures earlier than otherwise.
   /// It's also good for locality.
-  bool depthFirstVLocAndEmit(
-      unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation,
-      const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks,
-      LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
-      SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF,
-      DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
-      const TargetPassConfig &TPC);
+  bool depthFirstVLocAndEmit(unsigned MaxNumBlocks,
+                             const ScopeToDILocT &ScopeToDILocation,
+                             const ScopeToVarsT &ScopeToVars,
+                             ScopeToAssignBlocksT &ScopeToBlocks,
+                             LiveInsT &Output, FuncValueTable &MOutLocs,
+                             FuncValueTable &MInLocs,
+                             SmallVectorImpl<VLocTracker> &AllTheVLocs,
+                             MachineFunction &MF, const TargetPassConfig &TPC);
 
   bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
                     TargetPassConfig *TPC, unsigned InputBBLimit,
@@ -1473,6 +1525,11 @@ class InstrRefBasedLDV : public LDVImpl {
   }
 
   std::optional<LocIdx> findLocationForMemOperand(const MachineInstr &MI);
+
+  // Utility for unit testing, don't use directly.
+  DebugVariableMap &getDVMap() {
+    return DVMap;
+  }
 };
 
 } // namespace LiveDebugValues
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7b7b5459ad7b2..c3c581d42236e 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -238,7 +238,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   // We also need to make sure it is safe to move the load.
   // Assume there are stores between DefMI and UseMI.
   bool SawStore = true;
-  if (!DefMI->isSafeToMove(nullptr, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return false;
 
   LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
@@ -300,7 +300,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
 
   // Use the same criteria as DeadMachineInstructionElim.
   bool SawStore = false;
-  if (!MI->isSafeToMove(nullptr, SawStore)) {
+  if (!MI->isSafeToMove(SawStore)) {
     LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
     return;
   }
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index af7d6c4403b8f..3e3e3e51bfe9c 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -153,7 +153,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
           }
       }
 
-      if (!MI.isSafeToMove(nullptr, SawStore)) {
+      if (!MI.isSafeToMove(SawStore)) {
         // If MI has side effects, it should become a barrier for code motion.
         // IOM is rebuild from the next instruction to prevent later
         // instructions from being moved before this MI.
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 1d16729aa3387..0e2b71729fbf5 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -3362,15 +3362,15 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
   if (parseOptionalAtomicOrdering(FailureOrder))
     return true;
 
-  LLT MemoryType;
   if (Token.isNot(MIToken::IntegerLiteral) &&
       Token.isNot(MIToken::kw_unknown_size) &&
       Token.isNot(MIToken::lparen))
     return error("expected memory LLT, the size integer literal or 'unknown-size' after "
                  "memory operation");
 
-  uint64_t Size = MemoryLocation::UnknownSize;
+  LLT MemoryType;
   if (Token.is(MIToken::IntegerLiteral)) {
+    uint64_t Size;
     if (getUint64(Size))
       return true;
 
@@ -3378,7 +3378,6 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
     MemoryType = LLT::scalar(8 * Size);
     lex();
   } else if (Token.is(MIToken::kw_unknown_size)) {
-    Size = MemoryLocation::UnknownSize;
     lex();
   } else {
     if (expectAndConsume(MIToken::lparen))
@@ -3387,8 +3386,6 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
       return true;
     if (expectAndConsume(MIToken::rparen))
       return true;
-
-    Size = MemoryType.getSizeInBytes();
   }
 
   MachinePointerInfo Ptr = MachinePointerInfo();
@@ -3406,7 +3403,9 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
       return true;
   }
   uint64_t BaseAlignment =
-      (Size != MemoryLocation::UnknownSize ? PowerOf2Ceil(Size) : 1);
+      MemoryType.isValid()
+          ? PowerOf2Ceil(MemoryType.getSizeInBytes().getKnownMinValue())
+          : 1;
   AAMDNodes AAInfo;
   MDNode *Range = nullptr;
   while (consumeIfPresent(MIToken::comma)) {
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 48c3e0d7a97e6..63fb887b09b0d 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -101,13 +101,15 @@ namespace llvm {
 /// format.
 class MIRPrinter {
   raw_ostream &OS;
+  const MachineModuleInfo &MMI;
   DenseMap<const uint32_t *, unsigned> RegisterMaskIds;
   /// Maps from stack object indices to operand indices which will be used when
   /// printing frame index machine operands.
   DenseMap<int, FrameIndexOperand> StackObjectOperandMapping;
 
 public:
-  MIRPrinter(raw_ostream &OS) : OS(OS) {}
+  MIRPrinter(raw_ostream &OS, const MachineModuleInfo &MMI)
+      : OS(OS), MMI(MMI) {}
 
   void print(const MachineFunction &MF);
 
@@ -222,7 +224,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
       MachineFunctionProperties::Property::TracksDebugUserValues);
 
   convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
-  MachineModuleSlotTracker MST(&MF);
+  MachineModuleSlotTracker MST(MMI, &MF);
   MST.incorporateFunction(MF.getFunction());
   convert(MST, YamlMF.FrameInfo, MF.getFrameInfo());
   convertStackObjects(YamlMF, MF, MST);
@@ -1005,12 +1007,13 @@ void llvm::printMIR(raw_ostream &OS, const Module &M) {
   Out << const_cast<Module &>(M);
 }
 
-void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
+void llvm::printMIR(raw_ostream &OS, const MachineModuleInfo &MMI,
+                    const MachineFunction &MF) {
   // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
   // in dbg.value format.
   ScopedDbgInfoFormatSetter FormatSetter(
       const_cast<Function &>(MF.getFunction()), WriteNewDbgInfoFormat);
 
-  MIRPrinter Printer(OS);
+  MIRPrinter Printer(OS, MMI);
   Printer.print(MF);
 }
diff --git a/llvm/lib/CodeGen/MIRPrintingPass.cpp b/llvm/lib/CodeGen/MIRPrintingPass.cpp
index f70c0731ffafa..fc79410f97b58 100644
--- a/llvm/lib/CodeGen/MIRPrintingPass.cpp
+++ b/llvm/lib/CodeGen/MIRPrintingPass.cpp
@@ -13,7 +13,9 @@
 
 #include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
@@ -24,8 +26,13 @@ PreservedAnalyses PrintMIRPreparePass::run(Module &M, ModuleAnalysisManager &) {
 }
 
 PreservedAnalyses PrintMIRPass::run(MachineFunction &MF,
-                                    MachineFunctionAnalysisManager &) {
-  printMIR(OS, MF);
+                                    MachineFunctionAnalysisManager &MFAM) {
+  auto &MAMP = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF);
+  Module *M = MF.getFunction().getParent();
+  const MachineModuleInfo &MMI =
+      MAMP.getCachedResult<MachineModuleAnalysis>(*M)->getMMI();
+
+  printMIR(OS, MMI, MF);
   return PreservedAnalyses::all();
 }
 
@@ -51,7 +58,11 @@ struct MIRPrintingPass : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override {
     std::string Str;
     raw_string_ostream StrOS(Str);
-    printMIR(StrOS, MF);
+
+    const MachineModuleInfo &MMI =
+        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+    printMIR(StrOS, MMI, MF);
     MachineFunctions.append(Str);
     return false;
   }
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 4c864ca15ccc5..6179e0938034a 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -213,6 +213,12 @@ static cl::opt<bool> RenumberBlocksBeforeView(
         "into a dot graph. Only used when a function is being printed."),
     cl::init(false), cl::Hidden);
 
+static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
+    "ext-tsp-block-placement-max-blocks",
+    cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
+             "block placement."),
+    cl::init(UINT_MAX), cl::Hidden);
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -2596,7 +2602,15 @@ void MachineBlockPlacement::rotateLoopWithProfile(
 /// otherwise, collect all blocks in the loop.
 MachineBlockPlacement::BlockFilterSet
 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
-  BlockFilterSet LoopBlockSet;
+  // Collect the blocks in a set ordered by block number, as this gives the same
+  // order as they appear in the function.
+  struct MBBCompare {
+    bool operator()(const MachineBasicBlock *X,
+                    const MachineBasicBlock *Y) const {
+      return X->getNumber() < Y->getNumber();
+    }
+  };
+  std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
 
   // Filter cold blocks off from LoopBlockSet when profile data is available.
   // Collect the sum of frequencies of incoming edges to the loop header from
@@ -2627,7 +2641,11 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   } else
     LoopBlockSet.insert(L.block_begin(), L.block_end());
 
-  return LoopBlockSet;
+  // Copy the blocks into a BlockFilterSet, as iterating it is faster than
+  // std::set. We will only remove blocks and never insert them, which will
+  // preserve the ordering.
+  BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
+  return Ret;
 }
 
 /// Forms basic block chains from the natural loop structures.
@@ -3511,7 +3529,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
 
   // Apply a post-processing optimizing block placement.
   if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData())) {
+      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+      MF.size() <= ExtTspBlockPlacementMaxBlocks) {
     // Find a new placement and modify the layout of the blocks in the function.
     applyExtTsp();
 
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 853de4c88caeb..e4b993850f73d 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -197,7 +197,7 @@ void MachineFrameInfo::computeMaxCallFrameSize(
     for (MachineInstr &MI : MBB) {
       unsigned Opcode = MI.getOpcode();
       if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
-        unsigned Size = TII.getFrameSize(MI);
+        uint64_t Size = TII.getFrameSize(MI);
         MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
         if (FrameSDOps != nullptr)
           FrameSDOps->push_back(&MI);
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 7f6a75208d253..40bde200f37ac 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -162,9 +162,9 @@ static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
 }
 
 MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
-                                 const TargetSubtargetInfo &STI,
-                                 unsigned FunctionNum, MachineModuleInfo &mmi)
-    : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
+                                 const TargetSubtargetInfo &STI, MCContext &Ctx,
+                                 unsigned FunctionNum)
+    : F(F), Target(Target), STI(&STI), Ctx(Ctx) {
   FunctionNumber = FunctionNum;
   init();
 }
@@ -654,9 +654,14 @@ void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
 
 /// True if this function needs frame moves for debug or exceptions.
 bool MachineFunction::needsFrameMoves() const {
-  return getMMI().hasDebugInfo() ||
-         getTarget().Options.ForceDwarfFrameSection ||
-         F.needsUnwindTableEntry();
+  // TODO: Ideally, what we'd like is to have a switch that allows emitting
+  // synchronous (precise at call-sites only) CFA into .eh_frame. However, even
+  // under this switch, we'd like .debug_frame to be precise when using -g. At
+  // this moment, there's no way to specify that some CFI directives go into
+  // .eh_frame only, while others go into .debug_frame only.
+  return getTarget().Options.ForceDwarfFrameSection ||
+         F.needsUnwindTableEntry() ||
+         !F.getParent()->debug_compile_units().empty();
 }
 
 namespace llvm {
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 24eb360723dad..e7a4d6d61e211 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -37,7 +37,7 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
                   .getCachedResult<MachineModuleAnalysis>(*F.getParent())
                   ->getMMI();
   auto MF = std::make_unique<MachineFunction>(
-      F, *TM, STI, Context.generateMachineFunctionNum(F), MMI);
+      F, *TM, STI, MMI.getContext(), Context.generateMachineFunctionNum(F));
   MF->initTargetMachineFunctionInfo(STI);
 
   // MRI callback for target specific initializations.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 2b083e1dc3210..583bc1bc4a42c 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1293,7 +1293,7 @@ void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
 /// isSafeToMove - Return true if it is safe to move this instruction. If
 /// SawStore is set to true, it means that there is a store (or call) between
 /// the instruction's location and its intended destination.
-bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
+bool MachineInstr::isSafeToMove(bool &SawStore) const {
   // Ignore stuff that we obviously can't move.
   //
   // Treat volatile loads as stores. This is not strictly necessary for
@@ -2216,7 +2216,7 @@ void MachineInstr::emitError(StringRef Msg) const {
 
   if (const MachineBasicBlock *MBB = getParent())
     if (const MachineFunction *MF = MBB->getParent())
-      return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
+      return MF->getFunction().getContext().emitError(LocCookie, Msg);
   report_fatal_error(Msg);
 }
 
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index f24ab187ef400..1e4bf4bba5622 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -476,7 +476,7 @@ static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
   RUs |= RUsFromRegsNotInMask;
 }
 
-/// Examine the instruction for potentai LICM candidate. Also
+/// Examine the instruction for potential LICM candidate. Also
 /// gather register def and frame object update information.
 void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
                                 BitVector &RUClobbers,
@@ -1075,7 +1075,7 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
 bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
   // Check if it's safe to move the instruction.
   bool DontMoveAcrossStore = !HoistConstLoads || !AllowedToHoistLoads[CurLoop];
-  if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+  if ((!I.isSafeToMove(DontMoveAcrossStore)) &&
       !(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
     LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
     return false;
diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index 1f596cd1bd2ec..9d650e4bea7e1 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -158,8 +158,7 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
                         Register FrameReg) {
   DefedReg = MCRegister::NoRegister;
   bool SawStore = true;
-  if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
-      MI->isInlineAsm())
+  if (!MI->isSafeToMove(SawStore) || MI->isImplicitDef() || MI->isInlineAsm())
     return false;
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index a03c008e6045a..88ba10fbe29a9 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -287,7 +287,7 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I,
       }
     }
 
-    if (!MO.isUse())
+    if (!MO.readsReg())
       continue;
 
     assert(MRI->getVRegDef(Reg) &&
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index b950f4fdbcf79..c66495969b4e6 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
@@ -16,34 +14,19 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include <algorithm>
 #include <cassert>
-#include <memory>
-#include <utility>
-#include <vector>
 
 using namespace llvm;
 using namespace llvm::dwarf;
 
-static cl::opt<bool>
-    DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
-                             cl::desc("Disable debug info printing"));
-
 // Out of line virtual method.
 MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
 
 void MachineModuleInfo::initialize() {
   ObjFileMMI = nullptr;
-  CurCallSite = 0;
   NextFnNum = 0;
-  UsesMSVCFloatingPoint = false;
-  DbgInfoAvailable = false;
 }
 
 void MachineModuleInfo::finalize() {
@@ -61,7 +44,6 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
       MachineFunctions(std::move(MMI.MachineFunctions)) {
   Context.setObjectFileInfo(TM.getObjFileLowering());
   ObjFileMMI = MMI.ObjFileMMI;
-  CurCallSite = MMI.CurCallSite;
   ExternalContext = MMI.ExternalContext;
   TheModule = MMI.TheModule;
 }
@@ -104,7 +86,7 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
   if (I.second) {
     // No pre-existing machine function, create a new one.
     const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
-    MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+    MF = new MachineFunction(F, TM, STI, getContext(), NextFnNum++);
     MF->initTargetMachineFunctionInfo(STI);
 
     // MRI callback for target specific initializations.
@@ -224,8 +206,6 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable = !DisableDebugInfoPrinting &&
-                         !M.debug_compile_units().empty();
   return false;
 }
 
@@ -250,7 +230,5 @@ MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable =
-      !DisableDebugInfoPrinting && !M.debug_compile_units().empty();
   return Result(MMI);
 }
diff --git a/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp b/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp
index 965539ddaca8d..8f10435e7998e 100644
--- a/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp
+++ b/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp
@@ -64,10 +64,11 @@ void MachineModuleSlotTracker::collectMachineMDNodes(
 }
 
 MachineModuleSlotTracker::MachineModuleSlotTracker(
-    const MachineFunction *MF, bool ShouldInitializeAllMetadata)
+    const MachineModuleInfo &MMI, const MachineFunction *MF,
+    bool ShouldInitializeAllMetadata)
     : ModuleSlotTracker(MF->getFunction().getParent(),
                         ShouldInitializeAllMetadata),
-      TheFunction(MF->getFunction()), TheMMI(MF->getMMI()) {
+      TheFunction(MF->getFunction()), TheMMI(MMI) {
   setProcessHook([this](AbstractSlotTrackerStorage *AST, const Module *M,
                         bool ShouldInitializeAllMetadata) {
     this->processMachineModule(AST, M, ShouldInitializeAllMetadata);
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index c7ccf10e12b12..4b56a467b8d07 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -132,6 +132,7 @@ namespace {
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
 struct InstructionMapper {
+  const MachineModuleInfo &MMI;
 
   /// The next available integer to assign to a \p MachineInstr that
   /// cannot be outlined.
@@ -333,7 +334,7 @@ struct InstructionMapper {
       // which may be outlinable. Check if each instruction is known to be safe.
       for (; It != OutlinableRangeEnd; ++It) {
         // Keep track of where this instruction is in the module.
-        switch (TII.getOutliningType(It, Flags)) {
+        switch (TII.getOutliningType(MMI, It, Flags)) {
         case InstrType::Illegal:
           mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
                                InstrListForMBB);
@@ -382,7 +383,7 @@ struct InstructionMapper {
     }
   }
 
-  InstructionMapper() {
+  InstructionMapper(const MachineModuleInfo &MMI_) : MMI(MMI_) {
     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
     // changed.
     assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
@@ -405,6 +406,8 @@ struct MachineOutliner : public ModulePass {
 
   static char ID;
 
+  MachineModuleInfo *MMI = nullptr;
+
   /// Set to true if the outliner should consider functions with
   /// linkonceodr linkage.
   bool OutlineFromLinkOnceODRs = false;
@@ -489,20 +492,19 @@ struct MachineOutliner : public ModulePass {
 
   /// Populate and \p InstructionMapper with instruction-to-integer mappings.
   /// These are used to construct a suffix tree.
-  void populateMapper(InstructionMapper &Mapper, Module &M,
-                      MachineModuleInfo &MMI);
+  void populateMapper(InstructionMapper &Mapper, Module &M);
 
   /// Initialize information necessary to output a size remark.
   /// FIXME: This should be handled by the pass manager, not the outliner.
   /// FIXME: This is nearly identical to the initSizeRemarkInfo in the legacy
   /// pass manager.
-  void initSizeRemarkInfo(const Module &M, const MachineModuleInfo &MMI,
+  void initSizeRemarkInfo(const Module &M,
                           StringMap<unsigned> &FunctionToInstrCount);
 
   /// Emit the remark.
   // FIXME: This should be handled by the pass manager, not the outliner.
   void
-  emitInstrCountChangedRemark(const Module &M, const MachineModuleInfo &MMI,
+  emitInstrCountChangedRemark(const Module &M,
                               const StringMap<unsigned> &FunctionToInstrCount);
 };
 } // Anonymous namespace.
@@ -669,7 +671,7 @@ void MachineOutliner::findCandidates(
         CandidatesForRepeatedSeq[0].getMF()->getSubtarget().getInstrInfo();
 
     std::optional<OutlinedFunction> OF =
-        TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq);
+        TII->getOutliningCandidateInfo(*MMI, CandidatesForRepeatedSeq);
 
     // If we deleted too many candidates, then there's nothing worth outlining.
     // FIXME: This should take target-specified instruction sizes into account.
@@ -988,8 +990,7 @@ bool MachineOutliner::outline(Module &M,
   return OutlinedSomething;
 }
 
-void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
-                                     MachineModuleInfo &MMI) {
+void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
   // Build instruction mappings for each function in the module. Start by
   // iterating over each Function in M.
   LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n");
@@ -1003,7 +1004,7 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
 
     // There's something in F. Check if it has a MachineFunction associated with
     // it.
-    MachineFunction *MF = MMI.getMachineFunction(F);
+    MachineFunction *MF = MMI->getMachineFunction(F);
 
     // If it doesn't, then there's nothing to outline from. Move to the next
     // Function.
@@ -1062,12 +1063,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
 }
 
 void MachineOutliner::initSizeRemarkInfo(
-    const Module &M, const MachineModuleInfo &MMI,
-    StringMap<unsigned> &FunctionToInstrCount) {
+    const Module &M, StringMap<unsigned> &FunctionToInstrCount) {
   // Collect instruction counts for every function. We'll use this to emit
   // per-function size remarks later.
   for (const Function &F : M) {
-    MachineFunction *MF = MMI.getMachineFunction(F);
+    MachineFunction *MF = MMI->getMachineFunction(F);
 
     // We only care about MI counts here. If there's no MachineFunction at this
     // point, then there won't be after the outliner runs, so let's move on.
@@ -1078,13 +1078,12 @@ void MachineOutliner::initSizeRemarkInfo(
 }
 
 void MachineOutliner::emitInstrCountChangedRemark(
-    const Module &M, const MachineModuleInfo &MMI,
-    const StringMap<unsigned> &FunctionToInstrCount) {
+    const Module &M, const StringMap<unsigned> &FunctionToInstrCount) {
   // Iterate over each function in the module and emit remarks.
   // Note that we won't miss anything by doing this, because the outliner never
   // deletes functions.
   for (const Function &F : M) {
-    MachineFunction *MF = MMI.getMachineFunction(F);
+    MachineFunction *MF = MMI->getMachineFunction(F);
 
     // The outliner never deletes functions. If we don't have a MF here, then we
     // didn't have one prior to outlining either.
@@ -1135,6 +1134,8 @@ bool MachineOutliner::runOnModule(Module &M) {
   if (M.empty())
     return false;
 
+  MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
   // Number to append to the current outlined function.
   unsigned OutlinedFunctionNum = 0;
 
@@ -1158,8 +1159,6 @@ bool MachineOutliner::runOnModule(Module &M) {
 }
 
 bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
-  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
-
   // If the user passed -enable-machine-outliner=always or
   // -enable-machine-outliner, the pass will run on all functions in the module.
   // Otherwise, if the target supports default outlining, it will run on all
@@ -1177,10 +1176,10 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   // If the user specifies that they want to outline from linkonceodrs, set
   // it here.
   OutlineFromLinkOnceODRs = EnableLinkOnceODROutlining;
-  InstructionMapper Mapper;
+  InstructionMapper Mapper(*MMI);
 
   // Prepare instruction mappings for the suffix tree.
-  populateMapper(Mapper, M, MMI);
+  populateMapper(Mapper, M);
   std::vector<OutlinedFunction> FunctionList;
 
   // Find all of the outlining candidates.
@@ -1198,7 +1197,7 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   bool ShouldEmitSizeRemarks = M.shouldEmitInstrCountChangedRemark();
   StringMap<unsigned> FunctionToInstrCount;
   if (ShouldEmitSizeRemarks)
-    initSizeRemarkInfo(M, MMI, FunctionToInstrCount);
+    initSizeRemarkInfo(M, FunctionToInstrCount);
 
   // Outline each of the candidates and return true if something was outlined.
   bool OutlinedSomething =
@@ -1208,7 +1207,7 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   // module. If we've asked for size remarks, then output them.
   // FIXME: This should be in the pass manager.
   if (ShouldEmitSizeRemarks && OutlinedSomething)
-    emitInstrCountChangedRemark(M, MMI, FunctionToInstrCount);
+    emitInstrCountChangedRemark(M, FunctionToInstrCount);
 
   LLVM_DEBUG({
     if (!OutlinedSomething)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 497e282bb9768..5c68711ff6193 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -528,8 +528,16 @@ bool MachinePipeliner::useSwingModuloScheduler() {
 }
 
 bool MachinePipeliner::useWindowScheduler(bool Changed) {
-  // WindowScheduler does not work when it is off or when SwingModuloScheduler
-  // is successfully scheduled.
+  // WindowScheduler does not work for following cases:
+  // 1. when it is off.
+  // 2. when SwingModuloScheduler is successfully scheduled.
+  // 3. when pragma II is enabled.
+  if (II_setByPragma) {
+    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
+                         "llvm.loop.pipeline.initiationinterval is set.\n");
+    return false;
+  }
+
   return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
          (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 4b3ff57fb478a..f10b98cebd133 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -374,7 +374,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
 
   // Check if it's safe to move the instruction.
   bool SawStore = true;
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   // Convergent operations may not be made control-dependent on additional
@@ -687,7 +687,7 @@ void MachineSinking::FindCycleSinkCandidates(
       continue;
     }
     bool DontMoveAcrossStore = true;
-    if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) {
+    if (!MI.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n");
       continue;
     }
@@ -1654,7 +1654,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     return false;
 
   // Check if it's safe to move the instruction.
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   // Convergent operations may not be made control-dependent on additional
@@ -1705,7 +1705,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     bool TryBreak = false;
     bool Store =
         MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
-    if (!MI.isSafeToMove(AA, Store)) {
+    if (!MI.isSafeToMove(Store)) {
       LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
       TryBreak = true;
     }
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 5abfbd5981fba..d2e02a2d739c1 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/xxhash.h"
 
 #define DEBUG_TYPE "machine-stable-hash"
 
@@ -100,8 +101,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
   case MachineOperand::MO_TargetIndex: {
     if (const char *Name = MO.getTargetIndexName())
       return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
-                                 stable_hash_combine_string(Name),
-                                 MO.getOffset());
+                                 xxh3_64bits(Name), MO.getOffset());
     StableHashBailingTargetIndexNoName++;
     return 0;
   }
@@ -113,7 +113,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
 
   case MachineOperand::MO_ExternalSymbol:
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
-                        stable_hash_combine_string(MO.getSymbolName()));
+                        xxh3_64bits(MO.getSymbolName()));
 
   case MachineOperand::MO_RegisterMask:
   case MachineOperand::MO_RegisterLiveOut: {
@@ -151,7 +151,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
   case MachineOperand::MO_MCSymbol: {
     auto SymbolName = MO.getMCSymbol()->getName();
     return hash_combine(MO.getType(), MO.getTargetFlags(),
-                        stable_hash_combine_string(SymbolName));
+                        xxh3_64bits(SymbolName));
   }
   case MachineOperand::MO_CFIIndex:
     return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 0a5b8bdbc9371..d22fbe322ec36 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1544,6 +1544,36 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
 
     break;
   }
+  case TargetOpcode::G_SCMP:
+  case TargetOpcode::G_UCMP: {
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+    LLT SrcTy2 = MRI->getType(MI->getOperand(2).getReg());
+
+    if (SrcTy.isPointerOrPointerVector() || SrcTy2.isPointerOrPointerVector()) {
+      report("Generic scmp/ucmp does not support pointers as operands", MI);
+      break;
+    }
+
+    if (DstTy.isPointerOrPointerVector()) {
+      report("Generic scmp/ucmp does not support pointers as a result", MI);
+      break;
+    }
+
+    if ((DstTy.isVector() != SrcTy.isVector()) ||
+        (DstTy.isVector() &&
+         DstTy.getElementCount() != SrcTy.getElementCount())) {
+      report("Generic vector scmp/ucmp must preserve number of lanes", MI);
+      break;
+    }
+
+    if (SrcTy != SrcTy2) {
+      report("Generic scmp/ucmp must have same input types", MI);
+      break;
+    }
+
+    break;
+  }
   case TargetOpcode::G_EXTRACT: {
     const MachineOperand &SrcOp = MI->getOperand(1);
     if (!SrcOp.isReg()) {
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f29ebe3ee798..dae0cb2c900e5 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -739,7 +739,7 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
       bool SawStore = false;
       // Check if it's safe to remove the instruction due to side effects.
       // We can, and want to, remove Phis here.
-      if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+      if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
         ++MI;
         continue;
       }
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 19950f3eb67ba..51dad439f57d5 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/Analysis/ObjCARCInstKind.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -24,10 +25,12 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 
 using namespace llvm;
@@ -45,6 +48,7 @@ namespace {
 struct PreISelIntrinsicLowering {
   const TargetMachine &TM;
   const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
+  const function_ref<TargetLibraryInfo &(Function &)> LookupTLI;
 
   /// If this is true, assume it's preferably to leave memory intrinsic calls
   /// for replacement with a library call later. Otherwise this depends on
@@ -54,8 +58,9 @@ struct PreISelIntrinsicLowering {
   explicit PreISelIntrinsicLowering(
       const TargetMachine &TM_,
       function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
+      function_ref<TargetLibraryInfo &(Function &)> LookupTLI_,
       bool UseMemIntrinsicLibFunc_ = true)
-      : TM(TM_), LookupTTI(LookupTTI_),
+      : TM(TM_), LookupTTI(LookupTTI_), LookupTLI(LookupTLI_),
         UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}
 
   static bool shouldExpandMemIntrinsicWithSize(Value *Size,
@@ -66,6 +71,27 @@ struct PreISelIntrinsicLowering {
 
 } // namespace
 
+template <class T> static bool forEachCall(Function &Intrin, T Callback) {
+  // Lowering all intrinsics in a function will delete multiple uses, so we
+  // can't use an early-inc-range. In case some remain, we don't want to look
+  // at them again. Unfortunately, Value::UseList is private, so we can't use a
+  // simple Use**. If LastUse is null, the next use to consider is
+  // Intrin.use_begin(), otherwise it's LastUse->getNext().
+  Use *LastUse = nullptr;
+  bool Changed = false;
+  while (!Intrin.use_empty() && (!LastUse || LastUse->getNext())) {
+    Use *U = LastUse ? LastUse->getNext() : &*Intrin.use_begin();
+    bool Removed = false;
+    // An intrinsic cannot have its address taken, so it cannot be an argument
+    // operand. It might be used as operand in debug metadata, though.
+    if (auto CI = dyn_cast<CallInst>(U->getUser()))
+      Changed |= Removed = Callback(CI);
+    if (!Removed)
+      LastUse = U;
+  }
+  return Changed;
+}
+
 static bool lowerLoadRelative(Function &F) {
   if (F.use_empty())
     return false;
@@ -315,6 +341,16 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::load_relative:
       Changed |= lowerLoadRelative(F);
       break;
+    case Intrinsic::is_constant:
+    case Intrinsic::objectsize:
+      Changed |= forEachCall(F, [&](CallInst *CI) {
+        Function *Parent = CI->getParent()->getParent();
+        TargetLibraryInfo &TLI = LookupTLI(*Parent);
+        bool Changed = lowerConstantIntrinsics(*Parent, TLI, /*DT=*/nullptr);
+        assert(Changed && "lowerConstantIntrinsics did not lower intrinsic");
+        return Changed;
+      });
+      break;
     case Intrinsic::objc_autorelease:
       Changed |= lowerObjCCall(F, "objc_autorelease");
       break;
@@ -405,6 +441,7 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
   }
 
@@ -412,9 +449,12 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
     auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
       return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     };
+    auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    };
 
     const auto &TM = getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-    PreISelIntrinsicLowering Lowering(TM, LookupTTI);
+    PreISelIntrinsicLowering Lowering(TM, LookupTTI, LookupTLI);
     return Lowering.lowerIntrinsics(M);
   }
 };
@@ -426,6 +466,7 @@ char PreISelIntrinsicLoweringLegacyPass::ID;
 INITIALIZE_PASS_BEGIN(PreISelIntrinsicLoweringLegacyPass,
                       "pre-isel-intrinsic-lowering",
                       "Pre-ISel Intrinsic Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(PreISelIntrinsicLoweringLegacyPass,
@@ -443,8 +484,11 @@ PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
   auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
     return FAM.getResult<TargetIRAnalysis>(F);
   };
+  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
 
-  PreISelIntrinsicLowering Lowering(TM, LookupTTI);
+  PreISelIntrinsicLowering Lowering(TM, LookupTTI, LookupTLI);
   if (!Lowering.lowerIntrinsics(M))
     return PreservedAnalyses::all();
   else
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 3db5e17615fd4..cd5d877e53d82 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -366,8 +366,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) {
     return;
 
   // (Re-)Compute the MaxCallFrameSize.
-  [[maybe_unused]] uint32_t MaxCFSIn =
-      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT32_MAX;
+  [[maybe_unused]] uint64_t MaxCFSIn =
+      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT64_MAX;
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
   MFI.computeMaxCallFrameSize(MF, &FrameSDOps);
   assert(MFI.getMaxCallFrameSize() <= MaxCFSIn &&
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index fb18f5a8a884a..60deb62bc908a 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -130,7 +130,7 @@ void RegAllocBase::allocatePhysRegs() {
         MI->emitError("inline assembly requires more registers than available");
       } else if (MI) {
         LLVMContext &Context =
-            MI->getParent()->getParent()->getMMI().getModule()->getContext();
+            MI->getParent()->getParent()->getFunction().getContext();
         Context.emitError("ran out of registers during register allocation");
       } else {
         report_fatal_error("ran out of registers during register allocation");
diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h
index 643094671d682..a1ede08a15356 100644
--- a/llvm/lib/CodeGen/RegAllocBase.h
+++ b/llvm/lib/CodeGen/RegAllocBase.h
@@ -72,7 +72,7 @@ class RegAllocBase {
 
 private:
   /// Private, callees should go through shouldAllocateRegister
-  const RegClassFilterFunc ShouldAllocateClass;
+  const RegAllocFilterFunc shouldAllocateRegisterImpl;
 
 protected:
   /// Inst which is a def of an original reg and whose defs are already all
@@ -81,7 +81,8 @@ class RegAllocBase {
   /// always available for the remat of all the siblings of the original reg.
   SmallPtrSet<MachineInstr *, 32> DeadRemats;
 
-  RegAllocBase(const RegClassFilterFunc F = nullptr) : ShouldAllocateClass(F) {}
+  RegAllocBase(const RegAllocFilterFunc F = nullptr)
+      : shouldAllocateRegisterImpl(F) {}
 
   virtual ~RegAllocBase() = default;
 
@@ -90,9 +91,9 @@ class RegAllocBase {
 
   /// Get whether a given register should be allocated
   bool shouldAllocateRegister(Register Reg) {
-    if (!ShouldAllocateClass)
+    if (!shouldAllocateRegisterImpl)
       return true;
-    return ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg));
+    return shouldAllocateRegisterImpl(*TRI, *MRI, Reg);
   }
 
   // The top-level driver. The output is a VirtRegMap that us updated with
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index e78a447969fb2..caf9c32a5a349 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -74,7 +74,7 @@ class RABasic : public MachineFunctionPass,
   void LRE_WillShrinkVirtReg(Register) override;
 
 public:
-  RABasic(const RegClassFilterFunc F = nullptr);
+  RABasic(const RegAllocFilterFunc F = nullptr);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Basic Register Allocator"; }
@@ -168,10 +168,8 @@ void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) {
   enqueue(&LI);
 }
 
-RABasic::RABasic(RegClassFilterFunc F):
-  MachineFunctionPass(ID),
-  RegAllocBase(F) {
-}
+RABasic::RABasic(RegAllocFilterFunc F)
+    : MachineFunctionPass(ID), RegAllocBase(F) {}
 
 void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
@@ -333,6 +331,6 @@ FunctionPass* llvm::createBasicRegisterAllocator() {
   return new RABasic();
 }
 
-FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) {
+FunctionPass *llvm::createBasicRegisterAllocator(RegAllocFilterFunc F) {
   return new RABasic(F);
 }
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index b4660d4085ffd..6e5ce72240d27 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -177,9 +177,9 @@ class InstrPosIndexes {
 
 class RegAllocFastImpl {
 public:
-  RegAllocFastImpl(const RegClassFilterFunc F = nullptr,
+  RegAllocFastImpl(const RegAllocFilterFunc F = nullptr,
                    bool ClearVirtRegs_ = true)
-      : ShouldAllocateClass(F), StackSlotForVirtReg(-1),
+      : ShouldAllocateRegisterImpl(F), StackSlotForVirtReg(-1),
         ClearVirtRegs(ClearVirtRegs_) {}
 
 private:
@@ -188,7 +188,7 @@ class RegAllocFastImpl {
   const TargetRegisterInfo *TRI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   RegisterClassInfo RegClassInfo;
-  const RegClassFilterFunc ShouldAllocateClass;
+  const RegAllocFilterFunc ShouldAllocateRegisterImpl;
 
   /// Basic block currently being allocated.
   MachineBasicBlock *MBB = nullptr;
@@ -397,7 +397,7 @@ class RegAllocFast : public MachineFunctionPass {
 public:
   static char ID;
 
-  RegAllocFast(const RegClassFilterFunc F = nullptr, bool ClearVirtRegs_ = true)
+  RegAllocFast(const RegAllocFilterFunc F = nullptr, bool ClearVirtRegs_ = true)
       : MachineFunctionPass(ID), Impl(F, ClearVirtRegs_) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
@@ -440,10 +440,10 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
 
 bool RegAllocFastImpl::shouldAllocateRegister(const Register Reg) const {
   assert(Reg.isVirtual());
-  if (!ShouldAllocateClass)
+  if (!ShouldAllocateRegisterImpl)
     return true;
-  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
-  return ShouldAllocateClass(*TRI, RC);
+
+  return ShouldAllocateRegisterImpl(*TRI, *MRI, Reg);
 }
 
 void RegAllocFastImpl::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
@@ -1841,7 +1841,7 @@ void RegAllocFastPass::printPipeline(
 
 FunctionPass *llvm::createFastRegisterAllocator() { return new RegAllocFast(); }
 
-FunctionPass *llvm::createFastRegisterAllocator(RegClassFilterFunc Ftor,
+FunctionPass *llvm::createFastRegisterAllocator(RegAllocFilterFunc Ftor,
                                                 bool ClearVirtRegs) {
   return new RegAllocFast(Ftor, ClearVirtRegs);
 }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 310050d92d744..5001b4fec58f2 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -192,14 +192,12 @@ FunctionPass* llvm::createGreedyRegisterAllocator() {
   return new RAGreedy();
 }
 
-FunctionPass *llvm::createGreedyRegisterAllocator(RegClassFilterFunc Ftor) {
+FunctionPass *llvm::createGreedyRegisterAllocator(RegAllocFilterFunc Ftor) {
   return new RAGreedy(Ftor);
 }
 
-RAGreedy::RAGreedy(RegClassFilterFunc F):
-  MachineFunctionPass(ID),
-  RegAllocBase(F) {
-}
+RAGreedy::RAGreedy(RegAllocFilterFunc F)
+    : MachineFunctionPass(ID), RegAllocBase(F) {}
 
 void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
@@ -2306,7 +2304,7 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
     if (Reg.isPhysical())
       continue;
 
-    // This may be a skipped class
+    // This may be a skipped register.
     if (!VRM->hasPhys(Reg)) {
       assert(!shouldAllocateRegister(Reg) &&
              "We have an unallocated variable which should have been handled");
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index ac300c0024f5a..2e7608a53e9ce 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -281,7 +281,7 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
   bool ReverseLocalAssignment = false;
 
 public:
-  RAGreedy(const RegClassFilterFunc F = nullptr);
+  RAGreedy(const RegAllocFilterFunc F = nullptr);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Greedy Register Allocator"; }
diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp
index 72b07eb1902d9..00dcc1fbcd0c7 100644
--- a/llvm/lib/CodeGen/RegisterBankInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp
@@ -215,8 +215,9 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
       }
     }
 
-    unsigned Size = getSizeInBits(Reg, MRI, TRI);
-    const ValueMapping *ValMapping = &getValueMapping(0, Size, *CurRegBank);
+    TypeSize Size = getSizeInBits(Reg, MRI, TRI);
+    const ValueMapping *ValMapping =
+        &getValueMapping(0, Size.getKnownMinValue(), *CurRegBank);
     if (IsCopyLike) {
       if (!OperandsMapping[0]) {
         if (MI.isRegSequence()) {
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 1c35a88b4dc4a..f6c53f3051c2f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1320,7 +1320,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (!definesFullReg(*DefMI, SrcReg))
     return false;
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(AA, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return false;
   const MCInstrDesc &MCID = DefMI->getDesc();
   if (MCID.getNumDefs() != 1)
@@ -3673,6 +3673,13 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
 
     LHSVals.pruneSubRegValues(LHS, ShrinkMask);
     RHSVals.pruneSubRegValues(LHS, ShrinkMask);
+  } else if (TrackSubRegLiveness && !CP.getDstIdx() && CP.getSrcIdx()) {
+    LHS.createSubRangeFrom(LIS->getVNInfoAllocator(),
+                           CP.getNewRC()->getLaneMask(), LHS);
+    mergeSubRangeInto(LHS, RHS, TRI->getSubRegIndexLaneMask(CP.getSrcIdx()), CP,
+                      CP.getDstIdx());
+    LHSVals.pruneMainSegments(LHS, ShrinkMainRange);
+    LHSVals.pruneSubRegValues(LHS, ShrinkMask);
   }
 
   // The merging algorithm in LiveInterval::join() can't handle conflicting
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 30203f9119af7..060e66175d965 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11501,28 +11501,28 @@ static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
   if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
     return SDValue();
 
-  // select Cond, Cond, F --> or Cond, F
-  // select Cond, 1, F    --> or Cond, F
+  // select Cond, Cond, F --> or Cond, freeze(F)
+  // select Cond, 1, F    --> or Cond, freeze(F)
   if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
-    return matcher.getNode(ISD::OR, DL, VT, Cond, F);
+    return matcher.getNode(ISD::OR, DL, VT, Cond, DAG.getFreeze(F));
 
-  // select Cond, T, Cond --> and Cond, T
-  // select Cond, T, 0    --> and Cond, T
+  // select Cond, T, Cond --> and Cond, freeze(T)
+  // select Cond, T, 0    --> and Cond, freeze(T)
   if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
-    return matcher.getNode(ISD::AND, DL, VT, Cond, T);
+    return matcher.getNode(ISD::AND, DL, VT, Cond, DAG.getFreeze(T));
 
-  // select Cond, T, 1 --> or (not Cond), T
+  // select Cond, T, 1 --> or (not Cond), freeze(T)
   if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
     SDValue NotCond =
         matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT));
-    return matcher.getNode(ISD::OR, DL, VT, NotCond, T);
+    return matcher.getNode(ISD::OR, DL, VT, NotCond, DAG.getFreeze(T));
   }
 
-  // select Cond, 0, F --> and (not Cond), F
+  // select Cond, 0, F --> and (not Cond), freeze(F)
   if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
     SDValue NotCond =
         matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT));
-    return matcher.getNode(ISD::AND, DL, VT, NotCond, F);
+    return matcher.getNode(ISD::AND, DL, VT, NotCond, DAG.getFreeze(F));
   }
 
   return SDValue();
@@ -11550,37 +11550,37 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
   else
     return SDValue();
 
-  // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
+  // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & freeze(N1)
   if (isNullOrNullSplat(N2)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
-    return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
+    return DAG.getNode(ISD::AND, DL, VT, Sra, DAG.getFreeze(N1));
   }
 
-  // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
+  // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | freeze(N2)
   if (isAllOnesOrAllOnesSplat(N1)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
-    return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
+    return DAG.getNode(ISD::OR, DL, VT, Sra, DAG.getFreeze(N2));
   }
 
   // If we have to invert the sign bit mask, only do that transform if the
   // target has a bitwise 'and not' instruction (the invert is free).
-  // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
+  // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & freeze(N2)
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
     SDValue Not = DAG.getNOT(DL, Sra, VT);
-    return DAG.getNode(ISD::AND, DL, VT, Not, N2);
+    return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
   }
 
   // TODO: There's another pattern in this family, but it may require
   //       implementing hasOrNot() to check for profitability:
-  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
+  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
 
   return SDValue();
 }
@@ -12812,13 +12812,11 @@ static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) {
 /// This function is called by the DAGCombiner when visiting sext/zext/aext
 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
 static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
-                                         SelectionDAG &DAG,
+                                         SelectionDAG &DAG, const SDLoc &DL,
                                          CombineLevel Level) {
   unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
-  SDLoc DL(N);
-
   assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
           Opcode == ISD::ANY_EXTEND) &&
          "Expected EXTEND dag node in input!");
@@ -13775,7 +13773,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
   }
 
-  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
   return SDValue();
@@ -13784,9 +13782,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
 /// Given an extending node with a pop-count operand, if the target does not
 /// support a pop-count in the narrow source type but does support it in the
 /// destination type, widen the pop-count to the destination type.
-static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
+static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG, const SDLoc &DL) {
   assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
-          Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
+          Extend->getOpcode() == ISD::ANY_EXTEND) &&
+         "Expected extend op");
 
   SDValue CtPop = Extend->getOperand(0);
   if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
@@ -13799,7 +13798,6 @@ static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
     return SDValue();
 
   // zext (ctpop X) --> ctpop (zext X)
-  SDLoc DL(Extend);
   SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
   return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
 }
@@ -13860,8 +13858,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   // fold (zext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
   if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG)
-    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT,
-                       N0.getOperand(0));
+    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, N0.getOperand(0));
 
   // fold (zext (truncate x)) -> (zext x) or
   //      (zext (truncate x)) -> (truncate x)
@@ -14141,13 +14138,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
-  if (SDValue NewCtPop = widenCtPop(N, DAG))
+  if (SDValue NewCtPop = widenCtPop(N, DAG, DL))
     return NewCtPop;
 
   if (SDValue V = widenAbs(N, DAG))
     return V;
 
-  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
   // CSE zext nneg with sext if the zext is not free.
@@ -14319,10 +14316,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       return SCC;
   }
 
-  if (SDValue NewCtPop = widenCtPop(N, DAG))
+  if (SDValue NewCtPop = widenCtPop(N, DAG, DL))
     return NewCtPop;
 
-  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
   return SDValue();
@@ -15656,6 +15653,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     return SDValue();
 
   bool AllowMultipleMaybePoisonOperands =
+      N0.getOpcode() == ISD::SELECT_CC ||
+      N0.getOpcode() == ISD::SETCC ||
       N0.getOpcode() == ISD::BUILD_VECTOR ||
       N0.getOpcode() == ISD::BUILD_PAIR ||
       N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
@@ -21414,7 +21413,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   }
 
   // Turn 'store undef, Ptr' -> nothing.
-  if (Value.isUndef() && ST->isUnindexed())
+  if (Value.isUndef() && ST->isUnindexed() && !ST->isVolatile())
     return Chain;
 
   // Try to infer better alignment information than the store already has.
@@ -22534,6 +22533,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
       Index == VecOp.getOperand(2)) {
     SDValue Elt = VecOp.getOperand(1);
+    AddUsersToWorklist(VecOp.getNode());
     return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index ef9f783355190..e255bbaa92b16 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1203,11 +1203,6 @@ void FastISel::handleDbgInfo(const Instruction *II) {
 
     if (DbgLabelRecord *DLR = dyn_cast<DbgLabelRecord>(&DR)) {
       assert(DLR->getLabel() && "Missing label");
-      if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-        LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DLR << "\n");
-        continue;
-      }
-
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DLR->getDebugLoc(),
               TII.get(TargetOpcode::DBG_LABEL))
           .addMetadata(DLR->getLabel());
@@ -1402,12 +1397,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     assert(DI->getVariable() && "Missing variable");
-    if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
-                        << " (!hasDebugInfo)\n");
-      return true;
-    }
-
     if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
       return true;
 
@@ -1446,11 +1435,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
   case Intrinsic::dbg_label: {
     const DbgLabelInst *DI = cast<DbgLabelInst>(II);
     assert(DI->getLabel() && "Missing label");
-    if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
-      return true;
-    }
-
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
             TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
     return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7f5b46af01c62..bdb7917073020 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3439,6 +3439,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::FP_TO_UINT_SAT:
     Results.push_back(TLI.expandFP_TO_INT_SAT(Node, DAG));
     break;
+  case ISD::LROUND:
+  case ISD::LLROUND: {
+    SDValue Arg = Node->getOperand(0);
+    EVT ArgVT = Arg.getValueType();
+    EVT ResVT = Node->getValueType(0);
+    SDLoc dl(Node);
+    SDValue RoundNode = DAG.getNode(ISD::FROUND, dl, ArgVT, Arg);
+    Results.push_back(DAG.getNode(ISD::FP_TO_SINT, dl, ResVT, RoundNode));
+    break;
+  }
   case ISD::VAARG:
     Results.push_back(DAG.expandVAArg(Node));
     Results.push_back(Results[0].getValue(1));
@@ -4326,6 +4336,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     // targets where it is not needed.
     Results.push_back(Node->getOperand(0));
     break;
+  case ISD::LRINT:
+  case ISD::LLRINT: {
+    SDValue Arg = Node->getOperand(0);
+    EVT ArgVT = Arg.getValueType();
+    EVT ResVT = Node->getValueType(0);
+    SDLoc dl(Node);
+    SDValue RoundNode = DAG.getNode(ISD::FRINT, dl, ArgVT, Arg);
+    Results.push_back(DAG.getNode(ISD::FP_TO_SINT, dl, ResVT, RoundNode));
+    break;
+  }
   case ISD::GLOBAL_OFFSET_TABLE:
   case ISD::GlobalAddress:
   case ISD::GlobalTLSAddress:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index af77b0070df0a..b1ada66aa9aeb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2294,12 +2294,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_CMP(SDNode *N) {
-  SDValue LHS = N->getOpcode() == ISD::UCMP
-                    ? ZExtPromotedInteger(N->getOperand(0))
-                    : SExtPromotedInteger(N->getOperand(0));
-  SDValue RHS = N->getOpcode() == ISD::UCMP
-                    ? ZExtPromotedInteger(N->getOperand(1))
-                    : SExtPromotedInteger(N->getOperand(1));
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (N->getOpcode() == ISD::SCMP) {
+    LHS = SExtPromotedInteger(LHS);
+    RHS = SExtPromotedInteger(RHS);
+  } else {
+    SExtOrZExtPromotedOperands(LHS, RHS);
+  }
 
   return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS), 0);
 }
@@ -3395,13 +3398,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
       Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
       HiOps[2] = Lo.getValue(1);
       Hi = DAG.computeKnownBits(HiOps[2]).isZero()
-               ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2))
+               ? DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2))
                : DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps);
     } else {
       Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
       HiOps[2] = Lo.getValue(1);
       Hi = DAG.computeKnownBits(HiOps[2]).isZero()
-               ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2))
+               ? DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2))
                : DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps);
     }
     return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 92b62ccdc2755..5672b611234b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7299,14 +7299,14 @@ static std::optional<EVT> findMemType(SelectionDAG &DAG,
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
   unsigned AlignInBits = Align*8;
 
-  // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
-  if (!Scalable && Width == WidenEltWidth)
-    return RetVT;
-
   // Don't bother looking for an integer type if the vector is scalable, skip
   // to vector types.
   if (!Scalable) {
+    // If we have one element to load/store, return it.
+    if (Width == WidenEltWidth)
+      return RetVT;
+
     // See if there is larger legal integer than the element type to load/store.
     for (EVT MemVT : reverse(MVT::integer_valuetypes())) {
       unsigned MemVTWidth = MemVT.getSizeInBits();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 02d44cd36ae53..b3ed7f78eb68e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1333,7 +1333,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE, Pass *PassPtr,
                         const TargetLibraryInfo *LibraryInfo,
                         UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
-                        BlockFrequencyInfo *BFIin,
+                        BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
                         FunctionVarLocs const *VarLocs) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
@@ -1345,6 +1345,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
   UA = NewUA;
   PSI = PSIin;
   BFI = BFIin;
+  MMI = &MMIin;
   FnVarLocs = VarLocs;
 }
 
@@ -2482,6 +2483,11 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
     Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
     if (RedAlign2 < RedAlign)
       RedAlign = RedAlign2;
+
+    if (!getMachineFunction().getFrameInfo().isStackRealignable())
+      // If the stack is not realignable, the alignment should be limited to the
+      // StackAlignment
+      RedAlign = std::min(RedAlign, StackAlign);
   }
 
   return RedAlign;
@@ -4705,14 +4711,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
       return 1;  // Early out.
     Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
     return std::min(Tmp, Tmp2);
+  case ISD::SSUBO_CARRY:
+  case ISD::USUBO_CARRY:
+    // sub_carry(x,x,c) -> 0/-1 (sext carry)
+    if (Op.getResNo() == 0 && Op.getOperand(0) == Op.getOperand(1))
+      return VTBits;
+    [[fallthrough]];
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SADDO_CARRY:
   case ISD::UADDO_CARRY:
   case ISD::SSUBO:
   case ISD::USUBO:
-  case ISD::SSUBO_CARRY:
-  case ISD::USUBO_CARRY:
   case ISD::SMULO:
   case ISD::UMULO:
     if (Op.getResNo() != 1)
@@ -11577,6 +11587,19 @@ class RAUOVWUpdateListener : public SelectionDAG::DAGUpdateListener {
 
 } // end anonymous namespace
 
+/// Return true if a glue output should propagate divergence information.
+static bool gluePropagatesDivergence(const SDNode *Node) {
+  switch (Node->getOpcode()) {
+  case ISD::CopyFromReg:
+  case ISD::CopyToReg:
+    return false;
+  default:
+    return true;
+  }
+
+  llvm_unreachable("covered opcode switch");
+}
+
 bool SelectionDAG::calculateDivergence(SDNode *N) {
   if (TLI->isSDNodeAlwaysUniform(N)) {
     assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
@@ -11586,7 +11609,11 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
   if (TLI->isSDNodeSourceOfDivergence(N, FLI, UA))
     return true;
   for (const auto &Op : N->ops()) {
-    if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent())
+    EVT VT = Op.getValueType();
+
+    // Skip Chain. It does not carry divergence.
+    if (VT != MVT::Other && Op.getNode()->isDivergent() &&
+        (VT != MVT::Glue || gluePropagatesDivergence(Op.getNode())))
       return true;
   }
   return false;
@@ -13125,8 +13152,14 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
   for (unsigned I = 0; I != Vals.size(); ++I) {
     Ops[I].setUser(Node);
     Ops[I].setInitial(Vals[I]);
-    if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
-      IsDivergent |= Ops[I].getNode()->isDivergent();
+    EVT VT = Ops[I].getValueType();
+
+    // Skip Chain. It does not carry divergence.
+    if (VT != MVT::Other &&
+        (VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
+        Ops[I].getNode()->isDivergent()) {
+      IsDivergent = true;
+    }
   }
   Node->NumOperands = Vals.size();
   Node->OperandList = Ops;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index bbd4c3521d908..a035fee6aafca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1246,9 +1246,7 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
       SmallVector<Value *> Values(It->Values.location_ops());
       if (!handleDebugValue(Values, Var, It->Expr, It->DL, SDNodeOrder,
                             It->Values.hasArgList())) {
-        SmallVector<Value *, 4> Vals;
-        for (Value *V : It->Values.location_ops())
-          Vals.push_back(V);
+        SmallVector<Value *, 4> Vals(It->Values.location_ops());
         addDanglingDebugInfo(Vals,
                              FnVarLocs->getDILocalVariable(It->VariableID),
                              It->Expr, Vals.size() > 1, It->DL, SDNodeOrder);
@@ -3730,8 +3728,8 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     // ValueTracking's select pattern matching does not account for -0.0,
     // so we can't lower to FMINIMUM/FMAXIMUM because those nodes specify that
     // -0.0 is less than +0.0.
-    Value *LHS, *RHS;
-    auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
+    const Value *LHS, *RHS;
+    auto SPR = matchSelectPattern(&I, LHS, RHS);
     ISD::NodeType Opc = ISD::DELETED_NODE;
     switch (SPR.Flavor) {
     case SPF_UMAX:    Opc = ISD::UMAX; break;
@@ -6707,11 +6705,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                              getValue(I.getArgOperand(0))));
     return;
   case Intrinsic::eh_sjlj_callsite: {
-    MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
-    assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
+    assert(FuncInfo.getCurrentCallSite() == 0 && "Overlapping call sites!");
 
-    MMI.setCurrentCallSite(CI->getZExtValue());
+    FuncInfo.setCurrentCallSite(CI->getZExtValue());
     return;
   }
   case Intrinsic::eh_sjlj_functioncontext: {
@@ -7375,8 +7372,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::codeview_annotation: {
     // Emit a label associated with this metadata.
     MachineFunction &MF = DAG.getMachineFunction();
-    MCSymbol *Label =
-        MF.getMMI().getContext().createTempSymbol("annotation", true);
+    MCSymbol *Label = MF.getContext().createTempSymbol("annotation", true);
     Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata();
     MF.addCodeViewAnnotation(Label, cast<MDNode>(MD));
     Res = DAG.getLabelNode(ISD::ANNOTATION_LABEL, sdl, getRoot(), Label);
@@ -7452,6 +7448,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
         break;
       default: llvm_unreachable("unknown trap intrinsic");
       }
+      DAG.addNoMergeSiteInfo(DAG.getRoot().getNode(),
+                             I.hasFnAttr(Attribute::NoMerge));
       return;
     }
     TargetLowering::ArgListTy Args;
@@ -7468,7 +7466,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
         DAG.getExternalSymbol(TrapFuncName.data(),
                               TLI.getPointerTy(DAG.getDataLayout())),
         std::move(Args));
-
+    CLI.NoMerge = I.hasFnAttr(Attribute::NoMerge);
     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
     DAG.setRoot(Result.second);
     return;
@@ -7644,9 +7642,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       assert(FuncInfo.StaticAllocaMap.count(Slot) &&
              "can only escape static allocas");
       int FI = FuncInfo.StaticAllocaMap[Slot];
-      MCSymbol *FrameAllocSym =
-          MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
-              GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
+      MCSymbol *FrameAllocSym = MF.getContext().getOrCreateFrameAllocSymbol(
+          GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
               TII->get(TargetOpcode::LOCAL_ESCAPE))
           .addSym(FrameAllocSym)
@@ -7665,9 +7662,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
     unsigned IdxVal =
         unsigned(Idx->getLimitedValue(std::numeric_limits<int>::max()));
-    MCSymbol *FrameAllocSym =
-        MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
-            GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
+    MCSymbol *FrameAllocSym = MF.getContext().getOrCreateFrameAllocSymbol(
+        GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
 
     Value *FP = I.getArgOperand(1);
     SDValue FPVal = getValue(FP);
@@ -8622,21 +8618,20 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
                                           const BasicBlock *EHPadBB,
                                           MCSymbol *&BeginLabel) {
   MachineFunction &MF = DAG.getMachineFunction();
-  MachineModuleInfo &MMI = MF.getMMI();
 
   // Insert a label before the invoke call to mark the try range.  This can be
   // used to detect deletion of the invoke via the MachineModuleInfo.
-  BeginLabel = MMI.getContext().createTempSymbol();
+  BeginLabel = MF.getContext().createTempSymbol();
 
   // For SjLj, keep track of which landing pads go with which invokes
   // so as to maintain the ordering of pads in the LSDA.
-  unsigned CallSiteIndex = MMI.getCurrentCallSite();
+  unsigned CallSiteIndex = FuncInfo.getCurrentCallSite();
   if (CallSiteIndex) {
     MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
     LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
 
     // Now that the call site is handled, stop tracking it.
-    MMI.setCurrentCallSite(0);
+    FuncInfo.setCurrentCallSite(0);
   }
 
   return DAG.getEHLabel(getCurSDLoc(), Chain, BeginLabel);
@@ -8648,11 +8643,10 @@ SDValue SelectionDAGBuilder::lowerEndEH(SDValue Chain, const InvokeInst *II,
   assert(BeginLabel && "BeginLabel should've been set");
 
   MachineFunction &MF = DAG.getMachineFunction();
-  MachineModuleInfo &MMI = MF.getMMI();
 
   // Insert a label at the end of the invoke call to mark the try range.  This
   // can be used to detect deletion of the invoke via the MachineModuleInfo.
-  MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+  MCSymbol *EndLabel = MF.getContext().createTempSymbol();
   Chain = DAG.getEHLabel(getCurSDLoc(), Chain, EndLabel);
 
   // Inform MachineModuleInfo of range.
@@ -9570,10 +9564,15 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
   // Otherwise, create a stack slot and emit a store to it before the asm.
   Type *Ty = OpVal->getType();
   auto &DL = DAG.getDataLayout();
-  uint64_t TySize = DL.getTypeAllocSize(Ty);
+  TypeSize TySize = DL.getTypeAllocSize(Ty);
   MachineFunction &MF = DAG.getMachineFunction();
-  int SSFI = MF.getFrameInfo().CreateStackObject(
-      TySize, DL.getPrefTypeAlign(Ty), false);
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  int StackID = 0;
+  if (TySize.isScalable())
+    StackID = TFI->getStackIDForScalableVectors();
+  int SSFI = MF.getFrameInfo().CreateStackObject(TySize.getKnownMinValue(),
+                                                 DL.getPrefTypeAlign(Ty), false,
+                                                 nullptr, StackID);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
   Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, StackSlot,
                             MachinePointerInfo::getFixedStack(MF, SSFI),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index df3d207d85d35..607c8031480c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,30 +417,6 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F,
-                                         MachineModuleInfo &MMI) {
-  // Only needed for MSVC
-  if (!TT.isWindowsMSVCEnvironment())
-    return;
-
-  // If it's already set, nothing to do.
-  if (MMI.usesMSVCFloatingPoint())
-    return;
-
-  for (const Instruction &I : instructions(F)) {
-    if (I.getType()->isFPOrFPVectorTy()) {
-      MMI.setUsesMSVCFloatingPoint(true);
-      return;
-    }
-    for (const auto &Op : I.operands()) {
-      if (Op->getType()->isFPOrFPVectorTy()) {
-        MMI.setUsesMSVCFloatingPoint(true);
-        return;
-      }
-    }
-  }
-}
-
 PreservedAnalyses
 SelectionDAGISelPass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
@@ -509,7 +485,10 @@ void SelectionDAGISel::initializeAnalysisResults(
     FnVarLocs = &FAM.getResult<DebugAssignmentTrackingAnalysis>(Fn);
 
   auto *UA = FAM.getCachedResult<UniformityInfoAnalysis>(Fn);
-  CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, FnVarLocs);
+  MachineModuleInfo &MMI =
+      MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI();
+
+  CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
 
   // Now get the optional analyzes if we want to.
   // This is based on the possibly changed OptLevel (after optnone is taken
@@ -562,7 +541,11 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
   UniformityInfo *UA = nullptr;
   if (auto *UAPass = MFP.getAnalysisIfAvailable<UniformityInfoWrapperPass>())
     UA = &UAPass->getUniformityInfo();
-  CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, FnVarLocs);
+
+  MachineModuleInfo &MMI =
+      MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+  CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
 
   // Now get the optional analyzes if we want to.
   // This is based on the possibly changed OptLevel (after optnone is taken
@@ -795,9 +778,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     }
   }
 
-  // Determine if floating point is used for msvc
-  computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
-
   // Release function-specific state. SDB and CurDAG are already cleared
   // at this point.
   FuncInfo->clear();
@@ -1443,7 +1423,6 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
 
 // Mark and Report IPToState for each Block under IsEHa
 void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) {
-  MachineModuleInfo &MMI = MF->getMMI();
   llvm::WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo();
   if (!EHInfo)
     return;
@@ -1458,8 +1437,8 @@ void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) {
         continue;
 
       // Insert EH Labels
-      MCSymbol *BeginLabel = MMI.getContext().createTempSymbol();
-      MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+      MCSymbol *BeginLabel = MF->getContext().createTempSymbol();
+      MCSymbol *EndLabel = MF->getContext().createTempSymbol();
       EHInfo->addIPToStateRange(State, BeginLabel, EndLabel);
       BuildMI(MBB, MBBb, SDB->getCurDebugLoc(),
               TII->get(TargetOpcode::EH_LABEL))
@@ -2617,6 +2596,17 @@ GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
   return Val;
 }
 
+/// getSimpleVT - Decode a value in MatcherTable, if it's a VBR encoded value,
+/// use GetVBR to decode it.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static MVT::SimpleValueType
+getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex) {
+  unsigned SimpleVT = MatcherTable[MatcherIndex++];
+  if (SimpleVT & 128)
+    SimpleVT = GetVBR(SimpleVT, MatcherTable, MatcherIndex);
+
+  return static_cast<MVT::SimpleValueType>(SimpleVT);
+}
+
 void SelectionDAGISel::Select_JUMP_TABLE_DEBUG_INFO(SDNode *N) {
   SDLoc dl(N);
   CurDAG->SelectNodeTo(N, TargetOpcode::JUMP_TABLE_DEBUG_INFO, MVT::Glue,
@@ -2896,8 +2886,7 @@ CheckChild2CondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
                SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
-  MVT::SimpleValueType VT =
-      static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+  MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
   if (cast<VTSDNode>(N)->getVT() == VT)
     return true;
 
@@ -3027,7 +3016,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
       VT = MVT::i64;
       break;
     default:
-      VT = static_cast<MVT::SimpleValueType>(Table[Index++]);
+      VT = getSimpleVT(Table, Index);
       break;
     }
     Result = !::CheckType(VT, N, SDISel.TLI, SDISel.CurDAG->getDataLayout());
@@ -3035,9 +3024,8 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
   }
   case SelectionDAGISel::OPC_CheckTypeRes: {
     unsigned Res = Table[Index++];
-    Result = !::CheckType(static_cast<MVT::SimpleValueType>(Table[Index++]),
-                          N.getValue(Res), SDISel.TLI,
-                          SDISel.CurDAG->getDataLayout());
+    Result = !::CheckType(getSimpleVT(Table, Index), N.getValue(Res),
+                          SDISel.TLI, SDISel.CurDAG->getDataLayout());
     return Index;
   }
   case SelectionDAGISel::OPC_CheckChild0Type:
@@ -3075,7 +3063,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
       VT = MVT::i64;
       ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0TypeI64;
     } else {
-      VT = static_cast<MVT::SimpleValueType>(Table[Index++]);
+      VT = getSimpleVT(Table, Index);
       ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0Type;
     }
     Result = !::CheckChildType(VT, N, SDISel.TLI,
@@ -3579,7 +3567,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
         VT = MVT::i64;
         break;
       default:
-        VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        VT = getSimpleVT(MatcherTable, MatcherIndex);
         break;
       }
       if (!::CheckType(VT, N, TLI, CurDAG->getDataLayout()))
@@ -3588,9 +3576,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
 
     case OPC_CheckTypeRes: {
       unsigned Res = MatcherTable[MatcherIndex++];
-      if (!::CheckType(
-              static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]),
-              N.getValue(Res), TLI, CurDAG->getDataLayout()))
+      if (!::CheckType(getSimpleVT(MatcherTable, MatcherIndex), N.getValue(Res),
+                       TLI, CurDAG->getDataLayout()))
         break;
       continue;
     }
@@ -3637,8 +3624,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
           CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
         if (CaseSize == 0) break;
 
-        MVT CaseVT =
-            static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        MVT CaseVT = getSimpleVT(MatcherTable, MatcherIndex);
         if (CaseVT == MVT::iPTR)
           CaseVT = TLI->getPointerTy(CurDAG->getDataLayout());
 
@@ -3694,7 +3680,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
         VT = MVT::i64;
         ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0TypeI64;
       } else {
-        VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        VT = getSimpleVT(MatcherTable, MatcherIndex);
         ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0Type;
       }
       if (!::CheckChildType(VT, N, TLI, CurDAG->getDataLayout(), ChildNo))
@@ -3788,7 +3774,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
         VT = MVT::i64;
         break;
       default:
-        VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        VT = getSimpleVT(MatcherTable, MatcherIndex);
         break;
       }
       int64_t Val = MatcherTable[MatcherIndex++];
@@ -3812,7 +3798,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
         VT = MVT::i64;
         break;
       default:
-        VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        VT = getSimpleVT(MatcherTable, MatcherIndex);
         break;
       }
       unsigned RegNo = MatcherTable[MatcherIndex++];
@@ -3824,8 +3810,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       // For targets w/ more than 256 register names, the register enum
       // values are stored in two bytes in the matcher table (just like
       // opcodes).
-      MVT::SimpleValueType VT =
-          static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+      MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
       unsigned RegNo = MatcherTable[MatcherIndex++];
       RegNo |= MatcherTable[MatcherIndex++] << 8;
       RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
@@ -4063,8 +4048,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
         NumVTs = MatcherTable[MatcherIndex++];
       SmallVector<EVT, 4> VTs;
       for (unsigned i = 0; i != NumVTs; ++i) {
-        MVT::SimpleValueType VT =
-            static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+        MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
         if (VT == MVT::iPTR)
           VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy;
         VTs.push_back(VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3a20b5044c5f..7fa83a5999dfe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6483,15 +6483,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
 
   // Try to use leading zeros of the dividend to reduce the multiplier and
   // avoid expensive fixups.
-  // TODO: Support vectors.
-  unsigned LeadingZeros = 0;
-  if (!VT.isVector() && isa<ConstantSDNode>(N1)) {
-    assert(!isOneConstant(N1) && "Unexpected divisor");
-    LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
-    // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros in
-    // the dividend exceeds the leading zeros for the divisor.
-    LeadingZeros = std::min(LeadingZeros, N1->getAsAPIntVal().countl_zero());
-  }
+  unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
@@ -6510,7 +6502,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       MagicFactor = NPQFactor = DAG.getUNDEF(SVT);
     } else {
       UnsignedDivisionByConstantInfo magics =
-          UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
+          UnsignedDivisionByConstantInfo::get(
+              Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
 
       MagicFactor = DAG.getConstant(magics.Magic, dl, SVT);
 
@@ -8563,11 +8556,12 @@ static std::optional<bool> isFCmpEqualZero(FPClassTest Test,
 }
 
 SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
-                                         FPClassTest Test, SDNodeFlags Flags,
-                                         const SDLoc &DL,
+                                         const FPClassTest OrigTestMask,
+                                         SDNodeFlags Flags, const SDLoc &DL,
                                          SelectionDAG &DAG) const {
   EVT OperandVT = Op.getValueType();
   assert(OperandVT.isFloatingPoint());
+  FPClassTest Test = OrigTestMask;
 
   // Degenerated cases.
   if (Test == fcNone)
@@ -8601,9 +8595,21 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
   // exceptions are ignored.
   if (Flags.hasNoFPExcept() &&
       isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) {
+    FPClassTest FPTestMask = Test;
+
     ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ;
     ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ;
 
+    // See if we can fold an | fcNan into an unordered compare.
+    FPClassTest OrderedFPTestMask = FPTestMask & ~fcNan;
+
+    // Can't fold the ordered check if we're only testing for snan or qnan
+    // individually.
+    if ((FPTestMask & fcNan) != fcNan)
+      OrderedFPTestMask = FPTestMask;
+
+    const bool IsOrdered = FPTestMask == OrderedFPTestMask;
+
     if (std::optional<bool> IsCmp0 =
             isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction());
         IsCmp0 && (isCondCodeLegalOrCustom(
@@ -8635,6 +8641,27 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
       return DAG.getSetCC(DL, ResultVT, Abs, Inf,
                           IsInverted ? ISD::SETUNE : ISD::SETOEQ);
     }
+
+    if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) {
+      // TODO: Could handle ordered case, but it produces worse code for
+      // x86. Maybe handle ordered if fabs is free?
+
+      ISD::CondCode OrderedOp = IsInverted ? ISD::SETUGE : ISD::SETOLT;
+      ISD::CondCode UnorderedOp = IsInverted ? ISD::SETOGE : ISD::SETULT;
+
+      if (isCondCodeLegalOrCustom(IsOrdered ? OrderedOp : UnorderedOp,
+                                  OperandVT.getScalarType().getSimpleVT())) {
+        // (issubnormal(x) || iszero(x)) --> fabs(x) < smallest_normal
+
+        // TODO: Maybe only makes sense if fabs is free. Integer test of
+        // exponent bits seems better for x86.
+        SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op);
+        SDValue SmallestNormal = DAG.getConstantFP(
+            APFloat::getSmallestNormalized(Semantics), DL, OperandVT);
+        return DAG.getSetCC(DL, ResultVT, Abs, SmallestNormal,
+                            IsOrdered ? OrderedOp : UnorderedOp);
+      }
+    }
   }
 
   // In the general case use integer operations.
@@ -9352,6 +9379,26 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
     }
   }
 
+  // avgflooru(lhs, rhs) -> or(lshr(add(lhs, rhs),1),shl(overflow, typesize-1))
+  if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) {
+    SDValue UAddWithOverflow =
+        DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS});
+
+    SDValue Sum = UAddWithOverflow.getValue(0);
+    SDValue Overflow = UAddWithOverflow.getValue(1);
+
+    // Right shift the sum by 1
+    SDValue One = DAG.getShiftAmountConstant(1, VT, dl);
+    SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One);
+
+    SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow);
+    SDValue OverflowShl =
+        DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow,
+                    DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+
+    return DAG.getNode(ISD::OR, dl, VT, LShrVal, OverflowShl);
+  }
+
   // avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1))
   // avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1))
   // avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1))
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 5d3903ed84ce8..0a7a6bad4e86d 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -51,6 +51,8 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
 
   enum SlotType {
     Spill,          // a Spill slot
+    Fixed,          // a Fixed slot (e.g. arguments passed on the stack)
+    VariableSized,  // a variable sized object
     StackProtector, // Stack Protector slot
     Variable,       // a slot used to store a local data (could be a tmp)
     Invalid         // It's an error for a slot to have this type
@@ -60,24 +62,43 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     int Slot;
     int Size;
     int Align;
-    int Offset;
+    StackOffset Offset;
     SlotType SlotTy;
+    bool Scalable;
 
-    SlotData(const MachineFrameInfo &MFI, const int ValOffset, const int Idx)
+    SlotData(const MachineFrameInfo &MFI, const StackOffset Offset,
+             const int Idx)
         : Slot(Idx), Size(MFI.getObjectSize(Idx)),
-          Align(MFI.getObjectAlign(Idx).value()),
-          Offset(MFI.getObjectOffset(Idx) - ValOffset), SlotTy(Invalid) {
+          Align(MFI.getObjectAlign(Idx).value()), Offset(Offset),
+          SlotTy(Invalid), Scalable(false) {
+      Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector;
       if (MFI.isSpillSlotObjectIndex(Idx))
         SlotTy = SlotType::Spill;
-      else if (Idx == MFI.getStackProtectorIndex())
+      else if (MFI.isFixedObjectIndex(Idx))
+        SlotTy = SlotType::Fixed;
+      else if (MFI.isVariableSizedObjectIndex(Idx))
+        SlotTy = SlotType::VariableSized;
+      else if (MFI.hasStackProtectorIndex() &&
+               Idx == MFI.getStackProtectorIndex())
         SlotTy = SlotType::StackProtector;
       else
         SlotTy = SlotType::Variable;
     }
 
-    // we use this to sort in reverse order, so that the layout is displayed
-    // correctly
-    bool operator<(const SlotData &Rhs) const { return Offset > Rhs.Offset; }
+    bool isVarSize() const { return SlotTy == SlotType::VariableSized; }
+
+    // We use this to sort in reverse order, so that the layout is displayed
+    // correctly. Variable sized slots are sorted to the end of the list, as
+    // offsets are currently incorrect for these but they reside at the end of
+    // the stack frame. The Slot index is used to ensure deterministic order
+    // when offsets are equal.
+    bool operator<(const SlotData &Rhs) const {
+      return std::make_tuple(!isVarSize(),
+                             Offset.getFixed() + Offset.getScalable(), Slot) >
+             std::make_tuple(!Rhs.isVarSize(),
+                             Rhs.Offset.getFixed() + Rhs.Offset.getScalable(),
+                             Rhs.Slot);
+    }
   };
 
   StackFrameLayoutAnalysisPass() : MachineFunctionPass(ID) {}
@@ -115,6 +136,10 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     switch (Ty) {
     case SlotType::Spill:
       return "Spill";
+    case SlotType::Fixed:
+      return "Fixed";
+    case SlotType::VariableSized:
+      return "VariableSized";
     case SlotType::StackProtector:
       return "Protector";
     case SlotType::Variable:
@@ -143,17 +168,29 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     // For example we store the Offset in YAML as:
     //    ...
     //    - Offset: -8
+    //    - ScalableOffset: -16
+    // Note: the ScalableOffset entries are added only for slots with non-zero
+    // scalable offsets.
     //
-    // But we print it to the CLI as
+    // But we print it to the CLI as:
     //   Offset: [SP-8]
+    //
+    // Or with non-zero scalable offset:
+    //   Offset: [SP-8-16 x vscale]
 
     // Negative offsets will print a leading `-`, so only add `+`
     std::string Prefix =
-        formatv("\nOffset: [SP{0}", (D.Offset < 0) ? "" : "+").str();
-    Rem << Prefix << ore::NV("Offset", D.Offset)
-        << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
+        formatv("\nOffset: [SP{0}", (D.Offset.getFixed() < 0) ? "" : "+").str();
+    Rem << Prefix << ore::NV("Offset", D.Offset.getFixed());
+
+    if (D.Offset.getScalable()) {
+      Rem << ((D.Offset.getScalable() < 0) ? "" : "+")
+          << ore::NV("ScalableOffset", D.Offset.getScalable()) << " x vscale";
+    }
+
+    Rem << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
         << ", Align: " << ore::NV("Align", D.Align)
-        << ", Size: " << ore::NV("Size", D.Size);
+        << ", Size: " << ore::NV("Size", ElementCount::get(D.Size, D.Scalable));
   }
 
   void emitSourceLocRemark(const MachineFunction &MF, const DILocalVariable *N,
@@ -164,17 +201,22 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     Rem << "\n    " << ore::NV("DataLoc", Loc);
   }
 
+  StackOffset getStackOffset(const MachineFunction &MF,
+                             const MachineFrameInfo &MFI,
+                             const TargetFrameLowering *FI, int FrameIdx) {
+    if (!FI)
+      return StackOffset::getFixed(MFI.getObjectOffset(FrameIdx));
+
+    return FI->getFrameIndexReferenceFromSP(MF, FrameIdx);
+  }
+
   void emitStackFrameLayoutRemarks(MachineFunction &MF,
                                    MachineOptimizationRemarkAnalysis &Rem) {
     const MachineFrameInfo &MFI = MF.getFrameInfo();
     if (!MFI.hasStackObjects())
       return;
 
-    // ValOffset is the offset to the local area from the SP at function entry.
-    // To display the true offset from SP, we need to subtract ValOffset from
-    // MFI's ObjectOffset.
     const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
-    const int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
 
     LLVM_DEBUG(dbgs() << "getStackProtectorIndex =="
                       << MFI.getStackProtectorIndex() << "\n");
@@ -188,7 +230,7 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
          Idx != EndIdx; ++Idx) {
       if (MFI.isDeadObjectIndex(Idx))
         continue;
-      SlotInfo.emplace_back(MFI, ValOffset, Idx);
+      SlotInfo.emplace_back(MFI, getStackOffset(MF, MFI, FI, Idx), Idx);
     }
 
     // sort the ordering, to match the actual layout in memory
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 6b860466a8875..c5fa4e6211a63 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -97,7 +97,6 @@ void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
   TII = MF->getSubtarget().getInstrInfo();
   TRI = MF->getSubtarget().getRegisterInfo();
   MRI = &MF->getRegInfo();
-  MMI = &MF->getMMI();
   MBPI = MBPIin;
   MBFI = MBFIin;
   PSI = PSIin;
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 48a2094f5d451..7d054cb7c7c71 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -61,6 +61,20 @@ TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                MFI.getOffsetAdjustment());
 }
 
+/// Returns the offset from the stack pointer to the slot of the specified
+/// index. This function serves to provide a comparable offset from a single
+/// reference point (the value of the stack-pointer at function entry) that can
+/// be used for analysis. This is the default implementation using
+/// MachineFrameInfo offsets.
+StackOffset
+TargetFrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                  int FI) const {
+  // To display the true offset from SP, we need to subtract the offset to the
+  // local area from MFI's ObjectOffset.
+  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI) -
+                               getOffsetOfLocalArea());
+}
+
 bool TargetFrameLowering::needsFrameIndexResolution(
     const MachineFunction &MF) const {
   return MF.getFrameInfo().hasStackObjects();
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 3cd1bb296d288..c2b8c39662bb6 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1809,15 +1809,17 @@ void TargetInstrInfo::mergeOutliningCandidateAttributes(
     F.addFnAttr(Attribute::NoUnwind);
 }
 
-outliner::InstrType TargetInstrInfo::getOutliningType(
-    MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+outliner::InstrType
+TargetInstrInfo::getOutliningType(const MachineModuleInfo &MMI,
+                                  MachineBasicBlock::iterator &MIT,
+                                  unsigned Flags) const {
   MachineInstr &MI = *MIT;
 
   // NOTE: MI.isMetaInstruction() will match CFI_INSTRUCTION, but some targets
   // have support for outlining those. Special-case that here.
   if (MI.isCFIInstruction())
     // Just go right to the target implementation.
-    return getOutliningTypeImpl(MIT, Flags);
+    return getOutliningTypeImpl(MMI, MIT, Flags);
 
   // Be conservative about inline assembly.
   if (MI.isInlineAsm())
@@ -1883,7 +1885,7 @@ outliner::InstrType TargetInstrInfo::getOutliningType(
   }
 
   // If we don't know, delegate to the target-specific hook.
-  return getOutliningTypeImpl(MIT, Flags);
+  return getOutliningTypeImpl(MMI, MIT, Flags);
 }
 
 bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8040f1eeae810..6ca9955993d24 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -574,6 +574,39 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
   }
 }
 
+void RTLIB::initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs) {
+  std::fill(CmpLibcallCCs, CmpLibcallCCs + RTLIB::UNKNOWN_LIBCALL,
+            ISD::SETCC_INVALID);
+  CmpLibcallCCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+  CmpLibcallCCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+  CmpLibcallCCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+  CmpLibcallCCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
+  CmpLibcallCCs[RTLIB::UNE_F32] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UNE_F64] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UNE_F128] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::OGE_F32] = ISD::SETGE;
+  CmpLibcallCCs[RTLIB::OGE_F64] = ISD::SETGE;
+  CmpLibcallCCs[RTLIB::OGE_F128] = ISD::SETGE;
+  CmpLibcallCCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
+  CmpLibcallCCs[RTLIB::OLT_F32] = ISD::SETLT;
+  CmpLibcallCCs[RTLIB::OLT_F64] = ISD::SETLT;
+  CmpLibcallCCs[RTLIB::OLT_F128] = ISD::SETLT;
+  CmpLibcallCCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
+  CmpLibcallCCs[RTLIB::OLE_F32] = ISD::SETLE;
+  CmpLibcallCCs[RTLIB::OLE_F64] = ISD::SETLE;
+  CmpLibcallCCs[RTLIB::OLE_F128] = ISD::SETLE;
+  CmpLibcallCCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
+  CmpLibcallCCs[RTLIB::OGT_F32] = ISD::SETGT;
+  CmpLibcallCCs[RTLIB::OGT_F64] = ISD::SETGT;
+  CmpLibcallCCs[RTLIB::OGT_F128] = ISD::SETGT;
+  CmpLibcallCCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
+  CmpLibcallCCs[RTLIB::UO_F32] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UO_F64] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UO_F128] = ISD::SETNE;
+  CmpLibcallCCs[RTLIB::UO_PPCF128] = ISD::SETNE;
+}
+
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
     : TM(tm), Libcalls(TM.getTargetTriple()) {
@@ -608,6 +641,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
 
   MinCmpXchgSizeInBits = 0;
   SupportsUnalignedAtomics = false;
+
+  RTLIB::initCmpLibcallCCs(CmpLibcallCCs);
 }
 
 void TargetLoweringBase::initActions() {
@@ -789,14 +824,17 @@ void TargetLoweringBase::initActions() {
                      Expand);
 
   // These library functions default to expand.
-  setOperationAction({ISD::FCBRT,      ISD::FLOG,    ISD::FLOG2,  ISD::FLOG10,
-                      ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10, ISD::FFLOOR,
-                      ISD::FNEARBYINT, ISD::FCEIL,   ISD::FRINT,  ISD::FTRUNC,
-                      ISD::LROUND,     ISD::LLROUND, ISD::LRINT,  ISD::LLRINT,
-                      ISD::FROUNDEVEN, ISD::FTAN,    ISD::FACOS,  ISD::FASIN,
-                      ISD::FATAN,      ISD::FCOSH,   ISD::FSINH,  ISD::FTANH},
+  setOperationAction({ISD::FCBRT,      ISD::FLOG,  ISD::FLOG2,  ISD::FLOG10,
+                      ISD::FEXP,       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
+                      ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT,  ISD::FTRUNC,
+                      ISD::FROUNDEVEN, ISD::FTAN,  ISD::FACOS,  ISD::FASIN,
+                      ISD::FATAN,      ISD::FCOSH, ISD::FSINH,  ISD::FTANH},
                      {MVT::f32, MVT::f64, MVT::f128}, Expand);
 
+  // FIXME: Query RuntimeLibCalls to make the decision.
+  setOperationAction({ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+                     {MVT::f32, MVT::f64, MVT::f128}, LibCall);
+
   setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH,
                       ISD::FSINH, ISD::FTANH},
                      MVT::f16, Promote);
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 976691b7de448..0d3e4ba5662e0 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -2323,7 +2323,7 @@ bool TargetLoweringObjectFileXCOFF::ShouldSetSSPCanaryBitInTB(
 
 MCSymbol *
 TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(const MachineFunction *MF) {
-  MCSymbol *EHInfoSym = MF->getMMI().getContext().getOrCreateSymbol(
+  MCSymbol *EHInfoSym = MF->getContext().getOrCreateSymbol(
       "__ehinfo." + Twine(MF->getFunctionNumber()));
   cast<MCSymbolXCOFF>(EHInfoSym)->setEHInfo();
   return EHInfoSym;
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 7173f073c0662..0c92d7c1e45ee 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -845,7 +845,6 @@ void TargetPassConfig::addIRPasses() {
   // TODO: add a pass insertion point here
   addPass(&GCLoweringID);
   addPass(&ShadowStackGCLoweringID);
-  addPass(createLowerConstantIntrinsicsPass());
 
   // For MachO, lower @llvm.global_dtors into @llvm.global_ctors with
   // __cxa_atexit() calls to avoid emitting the deprecated __mod_term_func.
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 665d57841a97b..fb6274b09919b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -948,7 +948,7 @@ bool TwoAddressInstructionImpl::rescheduleMIBelowKill(
     return false;
 
   bool SeenStore = true;
-  if (!MI->isSafeToMove(AA, SeenStore))
+  if (!MI->isSafeToMove(SeenStore))
     return false;
 
   if (TII->getInstrLatency(InstrItins, *MI) > 1)
@@ -1131,7 +1131,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI(
     return false;
 
   bool SeenStore = true;
-  if (!KillMI->isSafeToMove(AA, SeenStore))
+  if (!KillMI->isSafeToMove(SeenStore))
     return false;
 
   SmallVector<Register, 2> Uses;
@@ -1160,7 +1160,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI(
     }
   }
 
-  // Check if the reschedule will not break depedencies.
+  // Check if the reschedule will not break dependencies.
   unsigned NumVisited = 0;
   for (MachineInstr &OtherMI :
        make_range(mi, MachineBasicBlock::iterator(KillMI))) {
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index b0f736a49c20e..0c6b726a28a24 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -207,7 +207,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
     assert(isExtended() && "Type is not extended!");
     return LLVMTy;
   case MVT::isVoid:  return Type::getVoidTy(Context);
-  case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
+  case MVT::x86mmx:  return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
   case MVT::aarch64svcount:
     return TargetExtType::get(Context, "aarch64.svcount");
   case MVT::x86amx:  return Type::getX86_AMXTy(Context);
@@ -241,8 +241,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
   case Type::BFloatTyID:    return MVT(MVT::bf16);
   case Type::FloatTyID:     return MVT(MVT::f32);
   case Type::DoubleTyID:    return MVT(MVT::f64);
-  case Type::X86_FP80TyID:  return MVT(MVT::f80);
-  case Type::X86_MMXTyID:   return MVT(MVT::x86mmx);
+  case Type::X86_FP80TyID:
+    return MVT(MVT::f80);
   case Type::TargetExtTyID: {
     TargetExtType *TargetExtTy = cast<TargetExtType>(Ty);
     if (TargetExtTy->getName() == "aarch64.svcount")
@@ -302,4 +302,3 @@ void MVT::print(raw_ostream &OS) const {
   else
     OS << EVT(*this).getEVTString();
 }
-
diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 0777480499e55..6a2c6537470db 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -232,8 +232,11 @@ bool WindowScheduler::initialize() {
       return false;
     }
     for (auto &Def : MI.all_defs())
-      if (Def.isReg() && Def.getReg().isPhysical())
+      if (Def.isReg() && Def.getReg().isPhysical()) {
+        LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
+                             "window scheduling!\n");
         return false;
+      }
   }
   if (SchedInstrNum <= WindowRegionLimit) {
     LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
@@ -437,12 +440,16 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
       int PredCycle = getOriCycle(PredMI);
       ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
     }
-    // ResourceManager can be used to detect resource conflicts between the
-    // current MI and the previously inserted MIs.
-    while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
-      ++CurCycle;
-      if (CurCycle == (int)WindowIILimit)
-        return CurCycle;
+    // Zero cost instructions do not need to check resource.
+    if (!TII->isZeroCost(MI.getOpcode())) {
+      // ResourceManager can be used to detect resource conflicts between the
+      // current MI and the previously inserted MIs.
+      while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
+        ++CurCycle;
+        if (CurCycle == (int)WindowIILimit)
+          return CurCycle;
+      }
+      RM.reserveResources(*SU, CurCycle);
     }
     RM.reserveResources(*SU, CurCycle);
     OriToCycle[getOriMI(&MI)] = CurCycle;
@@ -485,6 +492,7 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
 // ========================================
 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
   int MaxStallCycle = 0;
+  int CurrentII = MaxCycle + 1;
   auto Range = getScheduleRange(Offset, SchedInstrNum);
   for (auto &MI : Range) {
     auto *SU = TripleDAG->getSUnit(&MI);
@@ -492,8 +500,8 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
     for (auto &Succ : SU->Succs) {
       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
         continue;
-      // If the expected cycle does not exceed MaxCycle, no check is needed.
-      if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+      // If the expected cycle does not exceed CurrentII, no check is needed.
+      if (DefCycle + (int)Succ.getLatency() <= CurrentII)
         continue;
       // If the cycle of the scheduled MI A is less than that of the scheduled
       // MI B, the scheduling will fail because the lifetime of the
@@ -503,7 +511,7 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
       if (DefCycle < UseCycle)
         return WindowIILimit;
       // Get the stall cycle introduced by the register between two trips.
-      int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+      int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
     }
   }
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
index 3ba90f96cc86d..f9d107f52a715 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -11,4 +11,5 @@ add_llvm_component_library(LLVMCodeGenData
   LINK_COMPONENTS
   Core
   Support
+  Object
   )
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
index 74df2eb9d68ae..bca3125368353 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
@@ -62,6 +62,8 @@ Error DwarfStreamer::init(Triple TheTriple,
                              TripleName.c_str());
 
   MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
+  MCOptions.AsmVerbose = true;
+  MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
   MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
   if (!MAI)
     return createStringError(std::errc::invalid_argument,
@@ -101,17 +103,16 @@ Error DwarfStreamer::init(Triple TheTriple,
     MIP = TheTarget->createMCInstPrinter(TheTriple, MAI->getAssemblerDialect(),
                                          *MAI, *MII, *MRI);
     MS = TheTarget->createAsmStreamer(
-        *MC, std::make_unique<formatted_raw_ostream>(OutFile), true, true, MIP,
-        std::unique_ptr<MCCodeEmitter>(MCE), std::unique_ptr<MCAsmBackend>(MAB),
-        true);
+        *MC, std::make_unique<formatted_raw_ostream>(OutFile), MIP,
+        std::unique_ptr<MCCodeEmitter>(MCE),
+        std::unique_ptr<MCAsmBackend>(MAB));
     break;
   }
   case DWARFLinker::OutputFileType::Object: {
     MS = TheTarget->createMCObjectStreamer(
         TheTriple, *MC, std::unique_ptr<MCAsmBackend>(MAB),
         MAB->createObjectWriter(OutFile), std::unique_ptr<MCCodeEmitter>(MCE),
-        *MSTI, MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ false);
+        *MSTI);
     break;
   }
   }
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
index b9edcb63a3401..fbab6b25ca0f1 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
@@ -41,6 +41,8 @@ Error DwarfEmitterImpl::init(Triple TheTriple,
                              TripleName.c_str());
 
   MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
+  MCOptions.AsmVerbose = true;
+  MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
   MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
   if (!MAI)
     return createStringError(std::errc::invalid_argument,
@@ -80,17 +82,16 @@ Error DwarfEmitterImpl::init(Triple TheTriple,
     MIP = TheTarget->createMCInstPrinter(TheTriple, MAI->getAssemblerDialect(),
                                          *MAI, *MII, *MRI);
     MS = TheTarget->createAsmStreamer(
-        *MC, std::make_unique<formatted_raw_ostream>(OutFile), true, true, MIP,
-        std::unique_ptr<MCCodeEmitter>(MCE), std::unique_ptr<MCAsmBackend>(MAB),
-        true);
+        *MC, std::make_unique<formatted_raw_ostream>(OutFile), MIP,
+        std::unique_ptr<MCCodeEmitter>(MCE),
+        std::unique_ptr<MCAsmBackend>(MAB));
     break;
   }
   case DWARFLinker::OutputFileType::Object: {
     MS = TheTarget->createMCObjectStreamer(
         TheTriple, *MC, std::unique_ptr<MCAsmBackend>(MAB),
         MAB->createObjectWriter(OutFile), std::unique_ptr<MCCodeEmitter>(MCE),
-        *MSTI, MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ false);
+        *MSTI);
     break;
   }
   }
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 345a91a6f3585..72e7464b68971 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -103,6 +103,10 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue,
       .print(OS, DumpOpts, U);
 }
 
+static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) {
+  return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference();
+}
+
 static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
                           const DWARFAttribute &AttrValue, unsigned Indent,
                           DIDumpOptions DumpOpts) {
@@ -194,8 +198,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
                 DINameKind::LinkageName))
       OS << Space << "\"" << Name << '\"';
   } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) {
-    if (DWARFDie D = Die.getAttributeValueAsReferencedDie(FormValue);
-        D && !D.isNULL()) {
+    DWARFDie D = resolveReferencedType(Die, FormValue);
+    if (D && !D.isNULL()) {
       OS << Space << "\"";
       dumpTypeQualifiedName(D, OS);
       OS << '"';
@@ -287,12 +291,13 @@ DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
     if (auto Value = Die.find(Attrs))
       return Value;
 
-    for (dwarf::Attribute Attr :
-         {DW_AT_abstract_origin, DW_AT_specification, DW_AT_signature}) {
-      if (auto D = Die.getAttributeValueAsReferencedDie(Attr))
-        if (Seen.insert(D).second)
-          Worklist.push_back(D);
-    }
+    if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
+      if (Seen.insert(D).second)
+        Worklist.push_back(D);
+
+    if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification))
+      if (Seen.insert(D).second)
+        Worklist.push_back(D);
   }
 
   return std::nullopt;
@@ -314,14 +319,21 @@ DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const {
   } else if (Offset = V.getAsDebugInfoReference(); Offset) {
     if (DWARFUnit *SpecUnit = U->getUnitVector().getUnitForOffset(*Offset))
       Result = SpecUnit->getDIEForOffset(*Offset);
-  } else if (std::optional<uint64_t> Sig = V.getAsSignatureReference()) {
-    if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash(
-            U->getVersion(), *Sig, U->isDWOUnit()))
-      Result = TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset());
   }
   return Result;
 }
 
+DWARFDie DWARFDie::resolveTypeUnitReference() const {
+  if (auto Attr = find(DW_AT_signature)) {
+    if (std::optional<uint64_t> Sig = Attr->getAsReferenceUVal()) {
+      if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash(
+              U->getVersion(), *Sig, U->isDWOUnit()))
+        return TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset());
+    }
+  }
+  return *this;
+}
+
 std::optional<uint64_t> DWARFDie::getRangesBaseAttribute() const {
   return toSectionOffset(find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base}));
 }
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp
index fc1aae77a9293..a26431e8313f6 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp
@@ -62,10 +62,17 @@ void DWARFTypePrinter::appendArrayType(const DWARFDie &D) {
   EndedWithTemplate = false;
 }
 
+static DWARFDie resolveReferencedType(DWARFDie D,
+                                      dwarf::Attribute Attr = DW_AT_type) {
+  return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference();
+}
+static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) {
+  return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference();
+}
 DWARFDie DWARFTypePrinter::skipQualifiers(DWARFDie D) {
   while (D && (D.getTag() == DW_TAG_const_type ||
                D.getTag() == DW_TAG_volatile_type))
-    D = D.getAttributeValueAsReferencedDie(DW_AT_type);
+    D = resolveReferencedType(D);
   return D;
 }
 
@@ -96,9 +103,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D,
     return DWARFDie();
   }
   DWARFDie InnerDIE;
-  auto Inner = [&] {
-    return InnerDIE = D.getAttributeValueAsReferencedDie(DW_AT_type);
-  };
+  auto Inner = [&] { return InnerDIE = resolveReferencedType(D); };
   const dwarf::Tag T = D.getTag();
   switch (T) {
   case DW_TAG_pointer_type: {
@@ -129,8 +134,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D,
       OS << '(';
     else if (Word)
       OS << ' ';
-    if (DWARFDie Cont =
-            D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) {
+    if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) {
       appendQualifiedName(Cont);
       EndedWithTemplate = false;
       OS << "::";
@@ -169,8 +173,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D,
   case DW_TAG_base_type:
   */
   default: {
-    const char *NamePtr =
-        dwarf::toString(D.findRecursively(DW_AT_name), nullptr);
+    const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr);
     if (!NamePtr) {
       appendTypeTagName(D.getTag());
       return DWARFDie();
@@ -232,9 +235,9 @@ void DWARFTypePrinter::appendUnqualifiedNameAfter(
   case DW_TAG_pointer_type: {
     if (needsParens(Inner))
       OS << ')';
-    appendUnqualifiedNameAfter(
-        Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type),
-        /*SkipFirstParamIfArtificial=*/D.getTag() == DW_TAG_ptr_to_member_type);
+    appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner),
+                               /*SkipFirstParamIfArtificial=*/D.getTag() ==
+                                   DW_TAG_ptr_to_member_type);
     break;
   }
   case DW_TAG_LLVM_ptrauth_type: {
@@ -338,7 +341,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D,
       appendTemplateParameters(C, FirstParameter);
     }
     if (C.getTag() == dwarf::DW_TAG_template_value_parameter) {
-      DWARFDie T = C.getAttributeValueAsReferencedDie(DW_AT_type);
+      DWARFDie T = resolveReferencedType(C);
       Sep();
       if (T.getTag() == DW_TAG_enumeration_type) {
         OS << '(';
@@ -458,7 +461,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D,
       continue;
     auto TypeAttr = C.find(DW_AT_type);
     Sep();
-    appendQualifiedName(TypeAttr ? C.getAttributeValueAsReferencedDie(*TypeAttr)
+    appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr)
                                  : DWARFDie());
   }
   if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue) {
@@ -470,15 +473,15 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D,
 void DWARFTypePrinter::decomposeConstVolatile(DWARFDie &N, DWARFDie &T,
                                               DWARFDie &C, DWARFDie &V) {
   (N.getTag() == DW_TAG_const_type ? C : V) = N;
-  T = N.getAttributeValueAsReferencedDie(DW_AT_type);
+  T = resolveReferencedType(N);
   if (T) {
     auto Tag = T.getTag();
     if (Tag == DW_TAG_const_type) {
       C = T;
-      T = T.getAttributeValueAsReferencedDie(DW_AT_type);
+      T = resolveReferencedType(T);
     } else if (Tag == DW_TAG_volatile_type) {
       V = T;
-      T = T.getAttributeValueAsReferencedDie(DW_AT_type);
+      T = resolveReferencedType(T);
     }
   }
 }
@@ -488,11 +491,10 @@ void DWARFTypePrinter::appendConstVolatileQualifierAfter(DWARFDie N) {
   DWARFDie T;
   decomposeConstVolatile(N, T, C, V);
   if (T && T.getTag() == DW_TAG_subroutine_type)
-    appendSubroutineNameAfter(T, T.getAttributeValueAsReferencedDie(DW_AT_type),
-                              false, C.isValid(), V.isValid());
+    appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(),
+                              V.isValid());
   else
-    appendUnqualifiedNameAfter(T,
-                               T.getAttributeValueAsReferencedDie(DW_AT_type));
+    appendUnqualifiedNameAfter(T, resolveReferencedType(T));
 }
 void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) {
   DWARFDie C;
@@ -502,7 +504,7 @@ void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) {
   bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type;
   DWARFDie A = T;
   while (A && A.getTag() == DW_TAG_array_type)
-    A = A.getAttributeValueAsReferencedDie(DW_AT_type);
+    A = resolveReferencedType(A);
   bool Leading =
       (!A || (A.getTag() != DW_TAG_pointer_type &&
               A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) &&
@@ -544,7 +546,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter(
     if (P.getTag() != DW_TAG_formal_parameter &&
         P.getTag() != DW_TAG_unspecified_parameters)
       return;
-    DWARFDie T = P.getAttributeValueAsReferencedDie(DW_AT_type);
+    DWARFDie T = resolveReferencedType(P);
     if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) {
       FirstParamIfArtificial = T;
       RealFirst = false;
@@ -565,7 +567,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter(
     if (DWARFDie P = FirstParamIfArtificial) {
       if (P.getTag() == DW_TAG_pointer_type) {
         auto CVStep = [&](DWARFDie CV) {
-          if (DWARFDie U = CV.getAttributeValueAsReferencedDie(DW_AT_type)) {
+          if (DWARFDie U = resolveReferencedType(CV)) {
             Const |= U.getTag() == DW_TAG_const_type;
             Volatile |= U.getTag() == DW_TAG_volatile_type;
             return U;
@@ -651,8 +653,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter(
   if (D.find(DW_AT_rvalue_reference))
     OS << " &&";
 
-  appendUnqualifiedNameAfter(
-      Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type));
+  appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner));
 }
 void DWARFTypePrinter::appendScopes(DWARFDie D) {
   if (D.getTag() == DW_TAG_compile_unit)
@@ -665,6 +666,7 @@ void DWARFTypePrinter::appendScopes(DWARFDie D) {
     return;
   if (D.getTag() == DW_TAG_lexical_block)
     return;
+  D = D.resolveTypeUnitReference();
   if (DWARFDie P = D.getParent())
     appendScopes(P);
   appendUnqualifiedName(D);
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index a804deb446186..8ff6ebd230025 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -1481,9 +1481,9 @@ unsigned DWARFVerifier::verifyNameIndexAttribute(
 
 unsigned
 DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) {
-  if (NI.getLocalTUCount() + NI.getForeignTUCount() > 0) {
-    warn() << formatv("Name Index @ {0:x}: Verifying indexes of type units is "
-                      "not currently supported.\n",
+  if (NI.getForeignTUCount() > 0) {
+    warn() << formatv("Name Index @ {0:x}: Verifying indexes of foreign type "
+                      "units is not currently supported.\n",
                       NI.getUnitOffset());
     return 0;
   }
@@ -1512,10 +1512,12 @@ DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) {
       NumErrors += verifyNameIndexAttribute(NI, Abbrev, AttrEnc);
     }
 
-    if (NI.getCUCount() > 1 && !Attributes.count(dwarf::DW_IDX_compile_unit)) {
+    if (NI.getCUCount() > 1 && !Attributes.count(dwarf::DW_IDX_compile_unit) &&
+        !Attributes.count(dwarf::DW_IDX_type_unit)) {
       ErrorCategory.Report("Abbreviation contains no attribute", [&]() {
         error() << formatv("NameIndex @ {0:x}: Indexing multiple compile units "
-                           "and abbreviation {1:x} has no {2} attribute.\n",
+                           "and abbreviation {1:x} has no DW_IDX_compile_unit "
+                           "or DW_IDX_type_unit attribute.\n",
                            NI.getUnitOffset(), Abbrev.Code,
                            dwarf::DW_IDX_compile_unit);
       });
@@ -1574,8 +1576,8 @@ static SmallVector<std::string, 3> getNames(const DWARFDie &DIE,
 unsigned DWARFVerifier::verifyNameIndexEntries(
     const DWARFDebugNames::NameIndex &NI,
     const DWARFDebugNames::NameTableEntry &NTE) {
-  // Verifying type unit indexes not supported.
-  if (NI.getLocalTUCount() + NI.getForeignTUCount() > 0)
+  // Verifying foreign type unit indexes not supported.
+  if (NI.getForeignTUCount() > 0)
     return 0;
 
   const char *CStr = NTE.getString();
@@ -1596,18 +1598,35 @@ unsigned DWARFVerifier::verifyNameIndexEntries(
   Expected<DWARFDebugNames::Entry> EntryOr = NI.getEntry(&NextEntryID);
   for (; EntryOr; ++NumEntries, EntryID = NextEntryID,
                                 EntryOr = NI.getEntry(&NextEntryID)) {
-    uint32_t CUIndex = *EntryOr->getCUIndex();
-    if (CUIndex > NI.getCUCount()) {
+
+    std::optional<uint64_t> CUIndex = EntryOr->getCUIndex();
+    std::optional<uint64_t> TUIndex = EntryOr->getLocalTUIndex();
+    if (CUIndex && *CUIndex >= NI.getCUCount()) {
       ErrorCategory.Report("Name Index entry contains invalid CU index", [&]() {
         error() << formatv("Name Index @ {0:x}: Entry @ {1:x} contains an "
                            "invalid CU index ({2}).\n",
-                           NI.getUnitOffset(), EntryID, CUIndex);
+                           NI.getUnitOffset(), EntryID, *CUIndex);
       });
       ++NumErrors;
       continue;
     }
-    uint64_t CUOffset = NI.getCUOffset(CUIndex);
-    uint64_t DIEOffset = CUOffset + *EntryOr->getDIEUnitOffset();
+    if (TUIndex && *TUIndex >= NI.getLocalTUCount()) {
+      ErrorCategory.Report("Name Index entry contains invalid TU index", [&]() {
+        error() << formatv("Name Index @ {0:x}: Entry @ {1:x} contains an "
+                           "invalid TU index ({2}).\n",
+                           NI.getUnitOffset(), EntryID, *TUIndex);
+      });
+      ++NumErrors;
+      continue;
+    }
+    std::optional<uint64_t> UnitOffset;
+    if (TUIndex)
+      UnitOffset = NI.getLocalTUOffset(*TUIndex);
+    else if (CUIndex)
+      UnitOffset = NI.getCUOffset(*CUIndex);
+    if (!UnitOffset)
+      continue;
+    uint64_t DIEOffset = *UnitOffset + *EntryOr->getDIEUnitOffset();
     DWARFDie DIE = DCtx.getDIEForOffset(DIEOffset);
     if (!DIE) {
       ErrorCategory.Report("NameIndex references nonexistent DIE", [&]() {
@@ -1618,12 +1637,12 @@ unsigned DWARFVerifier::verifyNameIndexEntries(
       ++NumErrors;
       continue;
     }
-    if (DIE.getDwarfUnit()->getOffset() != CUOffset) {
+    if (DIE.getDwarfUnit()->getOffset() != *UnitOffset) {
       ErrorCategory.Report("Name index contains mismatched CU of DIE", [&]() {
         error() << formatv(
             "Name Index @ {0:x}: Entry @ {1:x}: mismatched CU of "
             "DIE @ {2:x}: index - {3:x}; debug_info - {4:x}.\n",
-            NI.getUnitOffset(), EntryID, DIEOffset, CUOffset,
+            NI.getUnitOffset(), EntryID, DIEOffset, *UnitOffset,
             DIE.getDwarfUnit()->getOffset());
       });
       ++NumErrors;
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
index 1fd2a33d3f11f..92630650ec937 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
@@ -467,7 +467,7 @@ Expected<Symbol *> COFFLinkGraphBuilder::createDefinedSymbol(
     return &G->addDefinedSymbol(
         G->createZeroFillBlock(getCommonSection(), Symbol.getValue(),
                                orc::ExecutorAddr(), Symbol.getValue(), 0),
-        0, SymbolName, Symbol.getValue(), Linkage::Strong, Scope::Default,
+        0, SymbolName, Symbol.getValue(), Linkage::Weak, Scope::Default,
         false, false);
   }
   if (Symbol.isAbsolute())
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index 5dae600629395..c05961c64837c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -472,7 +472,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() {
       Symbol &GSym = G->addDefinedSymbol(
           G->createZeroFillBlock(getCommonSection(), Sym.st_size,
                                  orc::ExecutorAddr(), Sym.getValue(), 0),
-          0, *Name, Sym.st_size, Linkage::Strong, Scope::Default, false, false);
+          0, *Name, Sym.st_size, Linkage::Weak, Scope::Default, false, false);
       setGraphSymbol(SymIndex, GSym);
       continue;
     }
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
index bb21f633d9829..260999b4fb454 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
@@ -366,7 +366,7 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
                                    orc::ExecutorAddrDiff(NSym.Value),
                                    orc::ExecutorAddr(),
                                    1ull << MachO::GET_COMM_ALIGN(NSym.Desc), 0),
-            0, *NSym.Name, orc::ExecutorAddrDiff(NSym.Value), Linkage::Strong,
+            0, *NSym.Name, orc::ExecutorAddrDiff(NSym.Value), Linkage::Weak,
             NSym.S, false, NSym.Desc & MachO::N_NO_DEAD_STRIP);
       } else {
         if (!NSym.Name)
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 869b383dd064f..0d7a51bfe7375 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -260,9 +260,7 @@ void MCJIT::finalizeObject() {
 
   // Generate code for module is going to move objects out of the 'added' list,
   // so we need to copy that out before using it:
-  SmallVector<Module*, 16> ModsToAdd;
-  for (auto *M : OwnedModules.added())
-    ModsToAdd.push_back(M);
+  SmallVector<Module *, 16> ModsToAdd(OwnedModules.added());
 
   for (auto *M : ModsToAdd)
     generateCodeForModule(M);
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 3e6de62c8b7dd..a078bffa86f59 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -932,13 +932,21 @@ Error JITDylib::resolve(MaterializationResponsibility &MR,
           if (SymI->second.getFlags().hasError())
             SymbolsInErrorState.insert(KV.first);
           else {
-            auto Flags = KV.second.getFlags();
-            Flags &= ~JITSymbolFlags::Common;
-            assert(Flags ==
-                       (SymI->second.getFlags() & ~JITSymbolFlags::Common) &&
-                   "Resolved flags should match the declared flags");
+            if (SymI->second.getFlags() & JITSymbolFlags::Common) {
+              [[maybe_unused]] auto WeakOrCommon =
+                  JITSymbolFlags::Weak | JITSymbolFlags::Common;
+              assert((KV.second.getFlags() & WeakOrCommon) &&
+                     "Common symbols must be resolved as common or weak");
+              assert((KV.second.getFlags() & ~WeakOrCommon) ==
+                         (SymI->second.getFlags() & ~JITSymbolFlags::Common) &&
+                     "Resolving symbol with incorrect flags");
 
-            Worklist.push_back({SymI, {KV.second.getAddress(), Flags}});
+            } else
+              assert(KV.second.getFlags() == SymI->second.getFlags() &&
+                     "Resolved flags should match the declared flags");
+
+            Worklist.push_back(
+                {SymI, {KV.second.getAddress(), SymI->second.getFlags()}});
           }
         }
 
@@ -2899,9 +2907,16 @@ Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR,
            "Resolving symbol outside this responsibility set");
     assert(!I->second.hasMaterializationSideEffectsOnly() &&
            "Can't resolve materialization-side-effects-only symbol");
-    assert((KV.second.getFlags() & ~JITSymbolFlags::Common) ==
-               (I->second & ~JITSymbolFlags::Common) &&
-           "Resolving symbol with incorrect flags");
+    if (I->second & JITSymbolFlags::Common) {
+      auto WeakOrCommon = JITSymbolFlags::Weak | JITSymbolFlags::Common;
+      assert((KV.second.getFlags() & WeakOrCommon) &&
+             "Common symbols must be resolved as common or weak");
+      assert((KV.second.getFlags() & ~WeakOrCommon) ==
+                 (I->second & ~JITSymbolFlags::Common) &&
+             "Resolving symbol with incorrect flags");
+    } else
+      assert(KV.second.getFlags() == I->second &&
+             "Resolving symbol with incorrect flags");
   }
 #endif
 
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 8a5986c1b88b1..c1a193f6a2802 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/StringSaver.h"
 #include "llvm/Target/TargetMachine.h"
 #include <string>
 
@@ -422,6 +423,7 @@ Error StaticLibraryDefinitionGenerator::buildObjectFilesMap() {
   DenseMap<uint64_t, MemoryBufferRef> MemoryBuffers;
   DenseSet<uint64_t> Visited;
   DenseSet<uint64_t> Excluded;
+  StringSaver FileNames(ObjFileNameStorage);
   for (auto &S : Archive->symbols()) {
     StringRef SymName = S.getName();
     auto Member = S.getMember();
@@ -438,7 +440,17 @@ Error StaticLibraryDefinitionGenerator::buildObjectFilesMap() {
         Excluded.insert(DataOffset);
         continue;
       }
-      MemoryBuffers[DataOffset] = (*Child)->getMemoryBufferRef();
+
+      // Give members of the archive a name that contains the archive path so
+      // that they can be differentiated from a member with the same name in a
+      // different archive. This also ensure initializer symbols names will be
+      // unique within a JITDylib.
+      StringRef FullName = FileNames.save(Archive->getFileName() + "(" +
+                                          (*Child)->getFileName() + ")");
+      MemoryBufferRef MemBuffer((*Child)->getMemoryBufferRef().getBuffer(),
+                                FullName);
+
+      MemoryBuffers[DataOffset] = MemBuffer;
     }
     if (!Excluded.count(DataOffset))
       ObjectFilesMap[L.getExecutionSession().intern(SymName)] =
diff --git a/llvm/lib/Frontend/OpenMP/CMakeLists.txt b/llvm/lib/Frontend/OpenMP/CMakeLists.txt
index 67aedf5c2b61a..82d2a9ae7c533 100644
--- a/llvm/lib/Frontend/OpenMP/CMakeLists.txt
+++ b/llvm/lib/Frontend/OpenMP/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_component_library(LLVMFrontendOpenMP
   TargetParser
   TransformUtils
   Analysis
+  Demangle
   MC
   Scalar
   BitReader
diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp
index c1556ff3c74d7..5720655442be3 100644
--- a/llvm/lib/Frontend/OpenMP/OMP.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMP.cpp
@@ -10,13 +10,19 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Demangle/Demangle.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/StringSaver.h"
 
 #include <algorithm>
+#include <cstdio>
 #include <iterator>
+#include <string>
 #include <type_traits>
 
 using namespace llvm;
@@ -186,4 +192,42 @@ bool isCombinedConstruct(Directive D) {
   // Otherwise directive-name is a combined construct.
   return !getLeafConstructs(D).empty() && !isCompositeConstruct(D);
 }
+
+std::string prettifyFunctionName(StringRef FunctionName) {
+  // Internalized functions have the right name, but simply a suffix.
+  if (FunctionName.ends_with(".internalized"))
+    return FunctionName.drop_back(sizeof("internalized")).str() +
+           " (internalized)";
+  unsigned LineNo = 0;
+  auto ParentName = deconstructOpenMPKernelName(FunctionName, LineNo);
+  if (LineNo == 0)
+    return FunctionName.str();
+  return ("omp target in " + ParentName + " @ " + std::to_string(LineNo) +
+          " (" + FunctionName + ")")
+      .str();
+}
+
+std::string deconstructOpenMPKernelName(StringRef KernelName,
+                                        unsigned &LineNo) {
+
+  // Only handle functions with an OpenMP kernel prefix for now. Naming scheme:
+  // __omp_offloading_<hex_hash1>_<hex_hash2>_<name>_l<line>_[<count>_]<suffix>
+  if (!KernelName.starts_with(TargetRegionEntryInfo::KernelNamePrefix))
+    return "";
+
+  auto PrettyName = KernelName.drop_front(
+      sizeof(TargetRegionEntryInfo::KernelNamePrefix) - /*'\0'*/ 1);
+  for (int I = 0; I < 3; ++I) {
+    PrettyName = PrettyName.drop_while([](char c) { return c != '_'; });
+    PrettyName = PrettyName.drop_front();
+  }
+
+  // Look for the last '_l<line>'.
+  size_t LineIdx = PrettyName.rfind("_l");
+  if (LineIdx == StringRef::npos)
+    return "";
+  if (PrettyName.drop_front(LineIdx + 2).consumeInteger(10, LineNo))
+    return "";
+  return demangle(PrettyName.take_front(LineIdx));
+}
 } // namespace llvm::omp
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a06e81dd280e0..3f8e64315849e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -360,23 +360,23 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
 // This function creates a fake integer value and a fake use for the integer
 // value. It returns the fake value created. This is useful in modeling the
 // extra arguments to the outlined functions.
-Value *createFakeIntVal(IRBuilder<> &Builder,
+Value *createFakeIntVal(IRBuilderBase &Builder,
                         OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
-                        std::stack<Instruction *> &ToBeDeleted,
+                        llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                         OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                         const Twine &Name = "", bool AsPtr = true) {
   Builder.restoreIP(OuterAllocaIP);
   Instruction *FakeVal;
   AllocaInst *FakeValAddr =
       Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
-  ToBeDeleted.push(FakeValAddr);
+  ToBeDeleted.push_back(FakeValAddr);
 
   if (AsPtr) {
     FakeVal = FakeValAddr;
   } else {
     FakeVal =
         Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
-    ToBeDeleted.push(FakeVal);
+    ToBeDeleted.push_back(FakeVal);
   }
 
   // Generate a fake use of this value
@@ -389,7 +389,7 @@ Value *createFakeIntVal(IRBuilder<> &Builder,
     UseFakeVal =
         cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
   }
-  ToBeDeleted.push(UseFakeVal);
+  ToBeDeleted.push_back(UseFakeVal);
   return FakeVal;
 }
 
@@ -1707,6 +1707,74 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
   emitTaskyieldImpl(Loc);
 }
 
+// Processes the dependencies in Dependencies and does the following
+// - Allocates space on the stack of an array of DependInfo objects
+// - Populates each DependInfo object with relevant information of
+//   the corresponding dependence.
+// - All code is inserted in the entry block of the current function.
+static Value *emitTaskDependencies(
+    OpenMPIRBuilder &OMPBuilder,
+    SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
+  // Early return if we have no dependencies to process
+  if (Dependencies.empty())
+    return nullptr;
+
+  // Given a vector of DependData objects, in this function we create an
+  // array on the stack that holds kmp_dep_info objects corresponding
+  // to each dependency. This is then passed to the OpenMP runtime.
+  // For example, if there are 'n' dependencies then the following psedo
+  // code is generated. Assume the first dependence is on a variable 'a'
+  //
+  // \code{c}
+  // DepArray = alloc(n x sizeof(kmp_depend_info);
+  // idx = 0;
+  // DepArray[idx].base_addr = ptrtoint(&a);
+  // DepArray[idx].len = 8;
+  // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
+  // ++idx;
+  // DepArray[idx].base_addr = ...;
+  // \endcode
+
+  IRBuilderBase &Builder = OMPBuilder.Builder;
+  Type *DependInfo = OMPBuilder.DependInfo;
+  Module &M = OMPBuilder.M;
+
+  Value *DepArray = nullptr;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  Builder.SetInsertPoint(
+      OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
+
+  Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
+  DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
+
+  for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
+    Value *Base =
+        Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
+    // Store the pointer to the variable
+    Value *Addr = Builder.CreateStructGEP(
+        DependInfo, Base,
+        static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
+    Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
+    Builder.CreateStore(DepValPtr, Addr);
+    // Store the size of the variable
+    Value *Size = Builder.CreateStructGEP(
+        DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
+    Builder.CreateStore(
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
+        Size);
+    // Store the dependency kind
+    Value *Flags = Builder.CreateStructGEP(
+        DependInfo, Base,
+        static_cast<unsigned int>(RTLDependInfoFields::Flags));
+    Builder.CreateStore(
+        ConstantInt::get(Builder.getInt8Ty(),
+                         static_cast<unsigned int>(Dep.DepKind)),
+        Flags);
+  }
+  Builder.restoreIP(OldIP);
+  return DepArray;
+}
+
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
                             InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
@@ -1751,7 +1819,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
   OI.ExitBB = TaskExitBB;
 
   // Add the thread ID argument.
-  std::stack<Instruction *> ToBeDeleted;
+  SmallVector<Instruction *, 4> ToBeDeleted;
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
 
@@ -1948,10 +2016,8 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
           Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
     }
 
-    while (!ToBeDeleted.empty()) {
-      ToBeDeleted.top()->eraseFromParent();
-      ToBeDeleted.pop();
-    }
+    llvm::for_each(llvm::reverse(ToBeDeleted),
+                   [](Instruction *I) { I->eraseFromParent(); });
   };
 
   addOutlineInfo(std::move(OI));
@@ -3360,11 +3426,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   else
     Config.setGridValue(getGridValue(T, ReductionFunc));
 
-  uint32_t SrcLocStrSize;
-  Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
-  Value *RTLoc =
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
-
   // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
   // RedList, shuffle_reduce_func, interwarp_copy_func);
   // or
@@ -3417,7 +3478,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
     Value *WcFuncCast =
         Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
-    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
+                     WcFuncCast};
     Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
         RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
     Res = Builder.CreateCall(Pv2Ptr, Args);
@@ -3440,7 +3502,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     Value *KernelTeamsReductionPtr = Builder.CreateCall(
         RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
 
-    Value *Args3[] = {RTLoc,
+    Value *Args3[] = {SrcLocInfo,
                       KernelTeamsReductionPtr,
                       Builder.getInt32(ReductionBufNum),
                       ReductionDataSize,
@@ -5947,7 +6009,17 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
   Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
   Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
 
-  Function *Kernel = Builder.GetInsertBlock()->getParent();
+  Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
+  Function *Kernel = DebugKernelWrapper;
+
+  // We need to strip the debug prefix to get the correct kernel name.
+  StringRef KernelName = Kernel->getName();
+  const std::string DebugPrefix = "_debug__";
+  if (KernelName.ends_with(DebugPrefix)) {
+    KernelName = KernelName.drop_back(DebugPrefix.length());
+    Kernel = M.getFunction(KernelName);
+    assert(Kernel && "Expected the real kernel to exist");
+  }
 
   // Manifest the launch configuration in the metadata matching the kernel
   // environment.
@@ -5969,12 +6041,6 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
   Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
   Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
 
-  // We need to strip the debug prefix to get the correct kernel name.
-  StringRef KernelName = Kernel->getName();
-  const std::string DebugPrefix = "_debug__";
-  if (KernelName.ends_with(DebugPrefix))
-    KernelName = KernelName.drop_back(DebugPrefix.length());
-
   Function *Fn = getOrCreateRuntimeFunctionPtr(
       omp::RuntimeFunction::OMPRTL___kmpc_target_init);
   const DataLayout &DL = Fn->getDataLayout();
@@ -6027,7 +6093,7 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
           ? KernelEnvironmentGV
           : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
                                            KernelEnvironmentPtr);
-  Value *KernelLaunchEnvironment = Kernel->getArg(0);
+  Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
   CallInst *ThreadKind =
       Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
 
@@ -6302,8 +6368,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
                          CustomMapperCB);
 
     TargetDataRTArgs RTArgs;
-    emitOffloadingArraysArgument(Builder, RTArgs, Info,
-                                 !MapInfo->Names.empty());
+    emitOffloadingArraysArgument(Builder, RTArgs, Info);
 
     // Emit the number of elements in the offloading arrays.
     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
@@ -6356,8 +6421,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
   // Generate code for the closing of the data region.
   auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     TargetDataRTArgs RTArgs;
-    emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
-                                 /*ForEndCall=*/true);
+    Info.EmitDebug = !MapInfo->Names.empty();
+    emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
 
     // Emit the number of elements in the offloading arrays.
     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
@@ -6617,6 +6682,91 @@ static Function *createOutlinedFunction(
   return Func;
 }
 
+/// Create an entry point for a target task with the following.
+/// It'll have the following signature
+/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
+/// This function is called from emitTargetTask once the
+/// code to launch the target kernel has been outlined already.
+static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
+                                             IRBuilderBase &Builder,
+                                             CallInst *StaleCI) {
+  Module &M = OMPBuilder.M;
+  // KernelLaunchFunction is the target launch function, i.e.
+  // the function that sets up kernel arguments and calls
+  // __tgt_target_kernel to launch the kernel on the device.
+  //
+  Function *KernelLaunchFunction = StaleCI->getCalledFunction();
+
+  // StaleCI is the CallInst which is the call to the outlined
+  // target kernel launch function. If there are values that the
+  // outlined function uses then these are aggregated into a structure
+  // which is passed as the second argument. If not, then there's
+  // only one argument, the threadID. So, StaleCI can be
+  //
+  // %structArg = alloca { ptr, ptr }, align 8
+  // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+  // store ptr %20, ptr %gep_, align 8
+  // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+  // store ptr %21, ptr %gep_8, align 8
+  // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
+  //
+  // OR
+  //
+  // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
+  OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
+                                    StaleCI->getIterator());
+  LLVMContext &Ctx = StaleCI->getParent()->getContext();
+  Type *ThreadIDTy = Type::getInt32Ty(Ctx);
+  Type *TaskPtrTy = OMPBuilder.TaskPtr;
+  Type *TaskTy = OMPBuilder.Task;
+  auto ProxyFnTy =
+      FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
+                        /* isVarArg */ false);
+  auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
+                                  ".omp_target_task_proxy_func",
+                                  Builder.GetInsertBlock()->getModule());
+  ProxyFn->getArg(0)->setName("thread.id");
+  ProxyFn->getArg(1)->setName("task");
+
+  BasicBlock *EntryBB =
+      BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
+  Builder.SetInsertPoint(EntryBB);
+
+  bool HasShareds = StaleCI->arg_size() > 1;
+  // TODO: This is a temporary assert to prove to ourselves that
+  // the outlined target launch function is always going to have
+  // atmost two arguments if there is any data shared between
+  // host and device.
+  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
+         "StaleCI with shareds should have exactly two arguments.");
+  if (HasShareds) {
+    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+    assert(ArgStructAlloca &&
+           "Unable to find the alloca instruction corresponding to arguments "
+           "for extracted function");
+    auto *ArgStructType =
+        dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+
+    AllocaInst *NewArgStructAlloca =
+        Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
+    Value *TaskT = ProxyFn->getArg(1);
+    Value *ThreadId = ProxyFn->getArg(0);
+    Value *SharedsSize =
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+
+    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+    LoadInst *LoadShared =
+        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+
+    Builder.CreateMemCpy(
+        NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
+        LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
+
+    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
+  }
+  Builder.CreateRetVoid();
+  return ProxyFn;
+}
 static void emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
@@ -6634,25 +6784,302 @@ static void emitTargetOutlinedFunction(
   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                       OutlinedFn, OutlinedFnID);
 }
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
+    Function *OutlinedFn, Value *OutlinedFnID,
+    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+    SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
+    bool HasNoWait) {
+
+  // When we arrive at this function, the target region itself has been
+  // outlined into the function OutlinedFn.
+  // So at ths point, for
+  // --------------------------------------------------
+  //   void user_code_that_offloads(...) {
+  //     omp target depend(..) map(from:a) map(to:b, c)
+  //        a = b + c
+  //   }
+  //
+  // --------------------------------------------------
+  //
+  // we have
+  //
+  // --------------------------------------------------
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     ;; target region has been outlined and now we need to
+  //     ;; offload to it via a target task.
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //     *a = *b + *c
+  //   }
+  //
+  // We have to now do the following
+  // (i)   Make an offloading call to outlined_device_function using the OpenMP
+  //       RTL. See 'kernel_launch_function' in the pseudo code below. This is
+  //       emitted by emitKernelLaunch
+  // (ii)  Create a task entry point function that calls kernel_launch_function
+  //       and is the entry point for the target task. See
+  //       '@.omp_target_task_proxy_func in the pseudocode below.
+  // (iii) Create a task with the task entry point created in (ii)
+  //
+  // That is we create the following
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //
+  //     %structArg = alloca { ptr, ptr, ptr }, align 8
+  //     %strucArg[0] = %.offload_baseptrs
+  //     %strucArg[1] = %.offload_ptrs
+  //     %strucArg[2] = %.offload_mappers
+  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
+  //                                               @.omp_target_task_proxy_func)
+  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+  //     dependencies_array = ...
+  //     ;; if nowait not present
+  //     call @__kmpc_omp_wait_deps(..., dependencies_array)
+  //     call @__kmpc_omp_task_begin_if0(...)
+  //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
+  //     %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
+  //   }
+  //
+  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
+  //                                                     ptr %task) {
+  //       %structArg = alloca {ptr, ptr, ptr}
+  //       %shared_data = load (getelementptr %task, 0, 0)
+  //       mempcy(%structArg, %shared_data, sizeof(structArg))
+  //       kernel_launch_function(%thread.id, %structArg)
+  //   }
+  //
+  //   We need the proxy function because the signature of the task entry point
+  //   expected by kmpc_omp_task is always the same and will be different from
+  //   that of the kernel_launch function.
+  //
+  //   kernel_launch_function is generated by emitKernelLaunch and has the
+  //   always_inline attribute.
+  //   void kernel_launch_function(thread_id,
+  //                               structArg) alwaysinline {
+  //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
+  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
+  //       offload_ptrs = load(getelementptr structArg, 0, 1)
+  //       offload_mappers = load(getelementptr structArg, 0, 2)
+  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
+  //       ; offload_mappers
+  //       call i32 @__tgt_target_kernel(...,
+  //                                     outlined_device_function,
+  //                                     ptr %kernel_args)
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //      *a = *b + *c
+  //   }
+  //
+  BasicBlock *TargetTaskBodyBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
+  BasicBlock *TargetTaskAllocaBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
+
+  InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
+                                   TargetTaskAllocaBB->begin());
+  InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+
+  OutlineInfo OI;
+  OI.EntryBB = TargetTaskAllocaBB;
+  OI.OuterAllocaBB = AllocaIP.getBlock();
+
+  // Add the thread ID argument.
+  SmallVector<Instruction *, 4> ToBeDeleted;
+  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+      Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
+
+  Builder.restoreIP(TargetTaskBodyIP);
+
+  // emitKernelLaunch makes the necessary runtime call to offload the kernel.
+  // We then outline all that code into a separate function
+  // ('kernel_launch_function' in the pseudo code above). This function is then
+  // called by the target task proxy function (see
+  // '@.omp_target_task_proxy_func' in the pseudo code above)
+  // "@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction
+  Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
+                                     EmitTargetCallFallbackCB, Args, DeviceID,
+                                     RTLoc, TargetTaskAllocaIP));
+
+  OI.ExitBB = Builder.saveIP().getBlock();
+  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
+                      HasNoWait](Function &OutlinedFn) mutable {
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "there must be a single user for the outlined function");
+
+    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+    bool HasShareds = StaleCI->arg_size() > 1;
+
+    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
 
-static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
-                           OpenMPIRBuilder::InsertPointTy AllocaIP,
-                           Function *OutlinedFn, Constant *OutlinedFnID,
-                           int32_t NumTeams, int32_t NumThreads,
-                           SmallVectorImpl<Value *> &Args,
-                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
+    LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
+                      << "\n");
+
+    Builder.SetInsertPoint(StaleCI);
+
+    // Gather the arguments for emitting the runtime call.
+    uint32_t SrcLocStrSize;
+    Constant *SrcLocStr =
+        getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
+    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+    // @__kmpc_omp_task_alloc
+    Function *TaskAllocFn =
+        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
+    // call.
+    Value *ThreadID = getOrCreateThreadID(Ident);
+
+    // Argument - `sizeof_kmp_task_t` (TaskSize)
+    // Tasksize refers to the size in bytes of kmp_task_t data structure
+    // including private vars accessed in task.
+    // TODO: add kmp_task_t_with_privates (privates)
+    Value *TaskSize =
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
+
+    // Argument - `sizeof_shareds` (SharedsSize)
+    // SharedsSize refers to the shareds array size in the kmp_task_t data
+    // structure.
+    Value *SharedsSize = Builder.getInt64(0);
+    if (HasShareds) {
+      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+      assert(ArgStructAlloca &&
+             "Unable to find the alloca instruction corresponding to arguments "
+             "for extracted function");
+      auto *ArgStructType =
+          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+      assert(ArgStructType && "Unable to find struct type corresponding to "
+                              "arguments for extracted function");
+      SharedsSize =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+    }
+
+    // Argument - `flags`
+    // Task is tied iff (Flags & 1) == 1.
+    // Task is untied iff (Flags & 1) == 0.
+    // Task is final iff (Flags & 2) == 2.
+    // Task is not final iff (Flags & 2) == 0.
+    // A target task is not final and is untied.
+    Value *Flags = Builder.getInt32(0);
+
+    // Emit the @__kmpc_omp_task_alloc runtime call
+    // The runtime call returns a pointer to an area where the task captured
+    // variables must be copied before the task is run (TaskData)
+    CallInst *TaskData = Builder.CreateCall(
+        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
+                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
+                      /*task_func=*/ProxyFn});
+
+    if (HasShareds) {
+      Value *Shareds = StaleCI->getArgOperand(1);
+      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+                           SharedsSize);
+    }
+
+    Value *DepArray = emitTaskDependencies(*this, Dependencies);
+
+    // ---------------------------------------------------------------
+    // V5.2 13.8 target construct
+    // If the nowait clause is present, execution of the target task
+    // may be deferred. If the nowait clause is not present, the target task is
+    // an included task.
+    // ---------------------------------------------------------------
+    // The above means that the lack of a nowait on the target construct
+    // translates to '#pragma omp task if(0)'
+    if (!HasNoWait) {
+      if (DepArray) {
+        Function *TaskWaitFn =
+            getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+        Builder.CreateCall(
+            TaskWaitFn,
+            {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
+             /*ndeps=*/Builder.getInt32(Dependencies.size()),
+             /*dep_list=*/DepArray,
+             /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
+             /*noalias_dep_list=*/
+             ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+      }
+      // Included task.
+      Function *TaskBeginFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
+      Function *TaskCompleteFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
+      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
+      CallInst *CI = nullptr;
+      if (HasShareds)
+        CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
+      else
+        CI = Builder.CreateCall(ProxyFn, {ThreadID});
+      CI->setDebugLoc(StaleCI->getDebugLoc());
+      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
+    } else if (DepArray) {
+      // HasNoWait - meaning the task may be deferred. Call
+      // __kmpc_omp_task_with_deps if there are dependencies,
+      // else call __kmpc_omp_task
+      Function *TaskFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
+      Builder.CreateCall(
+          TaskFn,
+          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
+           DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
+           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+    } else {
+      // Emit the @__kmpc_omp_task runtime call to spawn the task
+      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
+      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
+    }
+
+    StaleCI->eraseFromParent();
+    llvm::for_each(llvm::reverse(ToBeDeleted),
+                   [](Instruction *I) { I->eraseFromParent(); });
+  };
+  addOutlineInfo(std::move(OI));
+
+  LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
+                    << *(Builder.GetInsertBlock()) << "\n");
+  LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
+                    << *(Builder.GetInsertBlock()->getParent()->getParent())
+                    << "\n");
+  return Builder.saveIP();
+}
+void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
+    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
+    TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
+    bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
+    function_ref<Value *(unsigned int)> CustomMapperCB) {
+  emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
+                       DeviceAddrCB, CustomMapperCB);
+  emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
+}
+
+static void emitTargetCall(
+    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
+    OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
+    Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
+    SmallVectorImpl<Value *> &Args,
+    OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
+    SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
 
   OpenMPIRBuilder::TargetDataInfo Info(
       /*RequiresDevicePointerInfo=*/false,
       /*SeparateBeginEndCalls=*/true);
 
   OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
-  OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
-                                  /*IsNonContiguous=*/true);
-
   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
-  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
-                                          !MapInfo.Names.empty());
+  OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
+                                         RTArgs, MapInfo,
+                                         /*IsNonContiguous=*/true,
+                                         /*ForEndCall=*/false);
 
   //  emitKernelLaunch
   auto &&EmitTargetCallFallbackCB =
@@ -6662,7 +7089,7 @@ static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     return Builder.saveIP();
   };
 
-  unsigned NumTargetItems = MapInfo.BasePointers.size();
+  unsigned NumTargetItems = Info.NumberOfPtrs;
   // TODO: Use correct device ID
   Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
   Value *NumTeamsVal = Builder.getInt32(NumTeams);
@@ -6677,23 +7104,34 @@ static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
   Value *DynCGGroupMem = Builder.getInt32(0);
 
   bool HasNoWait = false;
+  bool HasDependencies = Dependencies.size() > 0;
+  bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
 
   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                           NumTeamsVal, NumThreadsVal,
                                           DynCGGroupMem, HasNoWait);
 
-  Builder.restoreIP(OMPBuilder.emitKernelLaunch(
-      Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
-      DeviceID, RTLoc, AllocaIP));
+  // The presence of certain clauses on the target directive require the
+  // explicit generation of the target task.
+  if (RequiresOuterTargetTask) {
+    Builder.restoreIP(OMPBuilder.emitTargetTask(
+        OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
+        RTLoc, AllocaIP, Dependencies, HasNoWait));
+  } else {
+    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+        Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+        DeviceID, RTLoc, AllocaIP));
+  }
 }
-
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
     int32_t NumThreads, SmallVectorImpl<Value *> &Args,
     GenMapInfoCallbackTy GenMapInfoCB,
     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
-    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
+    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+    SmallVector<DependData> Dependencies) {
+
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -6701,12 +7139,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
 
   Function *OutlinedFn;
   Constant *OutlinedFnID;
+  // The target region is outlined into its own function. The LLVM IR for
+  // the target region itself is generated using the callbacks CBFunc
+  // and ArgAccessorFuncCB
   emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
                              OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+
+  // If we are not on the target device, then we need to generate code
+  // to make a remote call (offload) to the previously outlined function
+  // that represents the target region. Do that now.
   if (!Config.isTargetDevice())
     emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
-                   NumThreads, Args, GenMapInfoCB);
-
+                   NumThreads, Args, GenMapInfoCB, Dependencies);
   return Builder.saveIP();
 }
 
@@ -6839,7 +7283,6 @@ void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                                    TargetDataRTArgs &RTArgs,
                                                    TargetDataInfo &Info,
-                                                   bool EmitDebug,
                                                    bool ForEndCall) {
   assert((!ForEndCall || Info.separateBeginEndCalls()) &&
          "expected region end call to runtime only when end call is separate");
@@ -6879,7 +7322,7 @@ void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
 
   // Only emit the mapper information arrays if debug information is
   // requested.
-  if (!EmitDebug)
+  if (!Info.EmitDebug)
     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
   else
     RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
@@ -7068,9 +7511,11 @@ void OpenMPIRBuilder::emitOffloadingArrays(
     auto *MapNamesArrayGbl =
         createOffloadMapnames(CombinedInfo.Names, MapnamesName);
     Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
+    Info.EmitDebug = true;
   } else {
     Info.RTArgs.MapNamesArray =
         Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
+    Info.EmitDebug = false;
   }
 
   // If there's a present map type modifier, it must not be applied to the end
@@ -7827,7 +8272,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
   OI.OuterAllocaBB = &OuterAllocaBB;
 
   // Insert fake values for global tid and bound tid.
-  std::stack<Instruction *> ToBeDeleted;
+  SmallVector<Instruction *, 8> ToBeDeleted;
   InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
@@ -7842,7 +8287,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    ToBeDeleted.push(StaleCI);
+    ToBeDeleted.push_back(StaleCI);
 
     assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
            "Outlined function must have two or three arguments only");
@@ -7866,10 +8311,9 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                            omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
                        Args);
 
-    while (!ToBeDeleted.empty()) {
-      ToBeDeleted.top()->eraseFromParent();
-      ToBeDeleted.pop();
-    }
+    llvm::for_each(llvm::reverse(ToBeDeleted),
+                   [](Instruction *I) { I->eraseFromParent(); });
+
   };
 
   if (!Config.isTargetDevice())
@@ -8137,7 +8581,7 @@ void TargetRegionEntryInfo::getTargetRegionEntryFnName(
     SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
     unsigned FileID, unsigned Line, unsigned Count) {
   raw_svector_ostream OS(Name);
-  OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
+  OS << KernelNamePrefix << llvm::format("%x", DeviceID)
      << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
   if (Count)
     OS << "_" << Count;
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 6599730590de6..01a16ccd688f4 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -573,8 +573,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
   case Type::FP128TyID:     OS << "fp128"; return;
   case Type::PPC_FP128TyID: OS << "ppc_fp128"; return;
   case Type::LabelTyID:     OS << "label"; return;
-  case Type::MetadataTyID:  OS << "metadata"; return;
-  case Type::X86_MMXTyID:   OS << "x86_mmx"; return;
+  case Type::MetadataTyID:
+    OS << "metadata";
+    return;
   case Type::X86_AMXTyID:   OS << "x86_amx"; return;
   case Type::TokenTyID:     OS << "token"; return;
   case Type::IntegerTyID:
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index bf19934da047c..cf05b11c53963 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -240,6 +240,8 @@ BasicBlock::~BasicBlock() {
 
 void BasicBlock::setParent(Function *parent) {
   // Set Parent=parent, updating instruction symtab entries as appropriate.
+  if (Parent != parent)
+    Number = parent ? parent->NextBlockNum++ : -1u;
   InstList.setSymTabObject(&Parent, parent);
 }
 
@@ -626,9 +628,7 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) {
   // to reflect that the incoming branches will be from the New block and not
   // from predecessors of the 'this' block.
   // Save predecessors to separate vector before modifying them.
-  SmallVector<BasicBlock *, 4> Predecessors;
-  for (BasicBlock *Pred : predecessors(this))
-    Predecessors.push_back(Pred);
+  SmallVector<BasicBlock *, 4> Predecessors(predecessors(this));
   for (BasicBlock *Pred : Predecessors) {
     Instruction *TI = Pred->getTerminator();
     TI->replaceSuccessorWith(this, New);
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 337a600ae3ff1..5f0f164b4067e 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -83,7 +83,6 @@ add_llvm_component_library(LLVMCore
   ${LLVM_PTHREAD_LIB}
 
   DEPENDS
-  vt_gen
   intrinsics_gen
 
   LINK_COMPONENTS
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 693674ae0d06f..05ab0968ef6f3 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
     return UndefValue::get(DestTy);
   }
 
-  if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
+  if (V->isNullValue() && !DestTy->isX86_AMXTy() &&
       opc != Instruction::AddrSpaceCast)
     return Constant::getNullValue(DestTy);
 
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 0ead677422803..50b211a302e8f 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -988,6 +988,8 @@ ConstantRange ConstantRange::overflowingBinaryOp(Instruction::BinaryOps BinOp,
     return subWithNoWrap(Other, NoWrapKind);
   case Instruction::Mul:
     return multiplyWithNoWrap(Other, NoWrapKind);
+  case Instruction::Shl:
+    return shlWithNoWrap(Other, NoWrapKind);
   default:
     // Don't know about this Overflowing Binary Operation.
     // Conservatively fallback to plain binop handling.
@@ -1242,6 +1244,17 @@ ConstantRange::multiplyWithNoWrap(const ConstantRange &Other,
   if (NoWrapKind & OverflowingBinaryOperator::NoUnsignedWrap)
     Result = Result.intersectWith(umul_sat(Other), RangeType);
 
+  // mul nsw nuw X, Y s>= 0 if X s> 1 or Y s> 1
+  if ((NoWrapKind == (OverflowingBinaryOperator::NoSignedWrap |
+                      OverflowingBinaryOperator::NoUnsignedWrap)) &&
+      !Result.isAllNonNegative()) {
+    if (getSignedMin().sgt(1) || Other.getSignedMin().sgt(1))
+      Result = Result.intersectWith(
+          getNonEmpty(APInt::getZero(getBitWidth()),
+                      APInt::getSignedMinValue(getBitWidth())),
+          RangeType);
+  }
+
   return Result;
 }
 
@@ -1604,6 +1617,23 @@ ConstantRange::shl(const ConstantRange &Other) const {
   return ConstantRange::getNonEmpty(std::move(Min), std::move(Max) + 1);
 }
 
+ConstantRange ConstantRange::shlWithNoWrap(const ConstantRange &Other,
+                                           unsigned NoWrapKind,
+                                           PreferredRangeType RangeType) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return getEmpty();
+
+  ConstantRange Result = shl(Other);
+
+  if (NoWrapKind & OverflowingBinaryOperator::NoSignedWrap)
+    Result = Result.intersectWith(sshl_sat(Other), RangeType);
+
+  if (NoWrapKind & OverflowingBinaryOperator::NoUnsignedWrap)
+    Result = Result.intersectWith(ushl_sat(Other), RangeType);
+
+  return Result;
+}
+
 ConstantRange
 ConstantRange::lshr(const ConstantRange &Other) const {
   if (isEmptySet() || Other.isEmptySet())
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 8ee6ab14c1d8c..f4e0101e91e19 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -609,8 +609,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
     return LLVMPointerTypeKind;
   case Type::FixedVectorTyID:
     return LLVMVectorTypeKind;
-  case Type::X86_MMXTyID:
-    return LLVMX86_MMXTypeKind;
   case Type::X86_AMXTyID:
     return LLVMX86_AMXTypeKind;
   case Type::TokenTyID:
@@ -725,9 +723,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) {
 LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) {
   return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C));
 }
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
-  return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
-}
 LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) {
   return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C));
 }
@@ -753,9 +748,6 @@ LLVMTypeRef LLVMFP128Type(void) {
 LLVMTypeRef LLVMPPCFP128Type(void) {
   return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext());
 }
-LLVMTypeRef LLVMX86MMXType(void) {
-  return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
-}
 LLVMTypeRef LLVMX86AMXType(void) {
   return LLVMX86AMXTypeInContext(LLVMGetGlobalContext());
 }
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 2741165332487..17897f77b4edb 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -835,7 +835,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
     // layout.
     return Align(PowerOf2Ceil(BitWidth / 8));
   }
-  case Type::X86_MMXTyID:
   case Type::FixedVectorTyID:
   case Type::ScalableVectorTyID: {
     unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 20871982afb06..69520fdb03dc7 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/Function.h"
 #include "SymbolTableListTraitsImpl.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
@@ -85,6 +86,27 @@ static cl::opt<int> NonGlobalValueMaxNameSize(
 
 extern cl::opt<bool> UseNewDbgInfoFormat;
 
+void Function::renumberBlocks() {
+  validateBlockNumbers();
+
+  NextBlockNum = 0;
+  for (auto &BB : *this)
+    BB.Number = NextBlockNum++;
+  BlockNumEpoch++;
+}
+
+void Function::validateBlockNumbers() const {
+#ifndef NDEBUG
+  BitVector Numbers(NextBlockNum);
+  for (const auto &BB : *this) {
+    unsigned Num = BB.getNumber();
+    assert(Num < NextBlockNum && "out of range block number");
+    assert(!Numbers[Num] && "duplicate block numbers");
+    Numbers.set(Num);
+  }
+#endif
+}
+
 void Function::convertToNewDbgValues() {
   IsNewDbgInfoFormat = true;
   for (auto &BB : *this) {
@@ -509,6 +531,8 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
 }
 
 Function::~Function() {
+  validateBlockNumbers();
+
   dropAllReferences();    // After this it is safe to delete instructions.
 
   // Delete all of the method arguments and unlink from symbol table...
@@ -1052,8 +1076,9 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
     case Type::DoubleTyID:    Result += "f64";      break;
     case Type::X86_FP80TyID:  Result += "f80";      break;
     case Type::FP128TyID:     Result += "f128";     break;
-    case Type::PPC_FP128TyID: Result += "ppcf128";  break;
-    case Type::X86_MMXTyID:   Result += "x86mmx";   break;
+    case Type::PPC_FP128TyID:
+      Result += "ppcf128";
+      break;
     case Type::X86_AMXTyID:   Result += "x86amx";   break;
     case Type::IntegerTyID:
       Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
@@ -1397,7 +1422,8 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   switch (D.Kind) {
   case IITDescriptor::Void: return Type::getVoidTy(Context);
   case IITDescriptor::VarArg: return Type::getVoidTy(Context);
-  case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
+  case IITDescriptor::MMX:
+    return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
   case IITDescriptor::AMX: return Type::getX86_AMXTy(Context);
   case IITDescriptor::Token: return Type::getTokenTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
@@ -1580,7 +1606,11 @@ static bool matchIntrinsicType(
   switch (D.Kind) {
     case IITDescriptor::Void: return !Ty->isVoidTy();
     case IITDescriptor::VarArg: return true;
-    case IITDescriptor::MMX:  return !Ty->isX86_MMXTy();
+    case IITDescriptor::MMX: {
+      FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+      return !VT || VT->getNumElements() != 1 ||
+             !VT->getElementType()->isIntegerTy(64);
+    }
     case IITDescriptor::AMX:  return !Ty->isX86_AMXTy();
     case IITDescriptor::Token: return !Ty->isTokenTy();
     case IITDescriptor::Metadata: return !Ty->isMetadataTy();
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 7a8cf8c230498..58ebe7e95cd06 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3116,9 +3116,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) {
   if (SrcBits != DestBits)
     return false;
 
-  if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy())
-    return false;
-
   return true;
 }
 
@@ -3228,12 +3225,6 @@ CastInst::getCastOpcode(
       return IntToPtr;                              // int -> ptr
     }
     llvm_unreachable("Casting pointer to other than pointer or int");
-  } else if (DestTy->isX86_MMXTy()) {
-    if (SrcTy->isVectorTy()) {
-      assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
-      return BitCast;                               // 64-bit vector to MMX
-    }
-    llvm_unreachable("Illegal cast to X86_MMX");
   }
   llvm_unreachable("Casting to type that is not first-class");
 }
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 61f2046af6e67..6804507af3579 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -666,6 +666,25 @@ Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) {
   return Intrinsic::not_intrinsic;
 }
 
+constexpr static Intrinsic::ID getForIntrinsic(Intrinsic::ID Id) {
+  if (::isVPIntrinsic(Id))
+    return Id;
+
+  switch (Id) {
+  default:
+    break;
+#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break;
+#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) case Intrinsic::INTRIN:
+#define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID;
+#include "llvm/IR/VPIntrinsics.def"
+  }
+  return Intrinsic::not_intrinsic;
+}
+
+Intrinsic::ID VPIntrinsic::getForIntrinsic(Intrinsic::ID Id) {
+  return ::getForIntrinsic(Id);
+}
+
 bool VPIntrinsic::canIgnoreVectorLengthParam() const {
   using namespace PatternMatch;
 
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 9443e939bddf5..761c40cf90655 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -40,9 +40,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
       FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID),
       MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID),
       X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID),
-      PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID),
-      X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8),
-      Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {}
+      PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID),
+      Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32),
+      Int64Ty(C, 64), Int128Ty(C, 128) {}
 
 LLVMContextImpl::~LLVMContextImpl() {
 #ifndef NDEBUG
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 937a87d686175..8e9ca21d149f6 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1582,7 +1582,7 @@ class LLVMContextImpl {
   // Basic type instances.
   Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy,
       TokenTy;
-  Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy;
+  Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy;
   IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
 
   std::unique_ptr<ConstantTokenNone> TheNoneToken;
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 3aec7140510a6..ae5f5de142328 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1287,12 +1287,12 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
     return A;
 
   // First, walk both lists in order of the lower boundary of each interval.
-  // At each step, try to merge the new interval to the last one we adedd.
+  // At each step, try to merge the new interval to the last one we added.
   SmallVector<ConstantInt *, 4> EndPoints;
-  int AI = 0;
-  int BI = 0;
-  int AN = A->getNumOperands() / 2;
-  int BN = B->getNumOperands() / 2;
+  unsigned AI = 0;
+  unsigned BI = 0;
+  unsigned AN = A->getNumOperands() / 2;
+  unsigned BN = B->getNumOperands() / 2;
   while (AI < AN && BI < BN) {
     ConstantInt *ALow = mdconst::extract<ConstantInt>(A->getOperand(2 * AI));
     ConstantInt *BLow = mdconst::extract<ConstantInt>(B->getOperand(2 * BI));
diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp
index 893a5e59c86a6..559b199445366 100644
--- a/llvm/lib/IR/OptBisect.cpp
+++ b/llvm/lib/IR/OptBisect.cpp
@@ -32,6 +32,11 @@ static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden,
                                    }),
                                    cl::desc("Maximum optimization to perform"));
 
+static cl::opt<bool> OptBisectVerbose(
+    "opt-bisect-verbose",
+    cl::desc("Show verbose output when opt-bisect-limit is set"), cl::Hidden,
+    cl::init(true), cl::Optional);
+
 static void printPassMessage(const StringRef &Name, int PassNum,
                              StringRef TargetDesc, bool Running) {
   StringRef Status = Running ? "" : "NOT ";
@@ -45,7 +50,8 @@ bool OptBisect::shouldRunPass(const StringRef PassName,
 
   int CurBisectNum = ++LastBisectNum;
   bool ShouldRun = (BisectLimit == -1 || CurBisectNum <= BisectLimit);
-  printPassMessage(PassName, CurBisectNum, IRDescription, ShouldRun);
+  if (OptBisectVerbose)
+    printPassMessage(PassName, CurBisectNum, IRDescription, ShouldRun);
   return ShouldRun;
 }
 
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index de3db557d8b50..d806f8093459e 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -155,6 +155,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
         break;
       }
       [[fallthrough]];
+    case Triple::DriverKit:
     case Triple::TvOS:
     case Triple::WatchOS:
     case Triple::XROS:
@@ -164,6 +165,10 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     default:
       break;
     }
+  } else if (TT.getOS() == Triple::BridgeOS) {
+    // TODO: BridgeOS should be included in isOSDarwin.
+    setLibcallName(RTLIB::EXP10_F32, "__exp10f");
+    setLibcallName(RTLIB::EXP10_F64, "__exp10");
   } else {
     setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
     setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
@@ -344,36 +349,3 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     }
   }
 }
-
-void RuntimeLibcallsInfo::initCmpLibcallCCs() {
-  std::fill(CmpLibcallCCs, CmpLibcallCCs + RTLIB::UNKNOWN_LIBCALL,
-            ISD::SETCC_INVALID);
-  CmpLibcallCCs[RTLIB::OEQ_F32] = ISD::SETEQ;
-  CmpLibcallCCs[RTLIB::OEQ_F64] = ISD::SETEQ;
-  CmpLibcallCCs[RTLIB::OEQ_F128] = ISD::SETEQ;
-  CmpLibcallCCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
-  CmpLibcallCCs[RTLIB::UNE_F32] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UNE_F64] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UNE_F128] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::OGE_F32] = ISD::SETGE;
-  CmpLibcallCCs[RTLIB::OGE_F64] = ISD::SETGE;
-  CmpLibcallCCs[RTLIB::OGE_F128] = ISD::SETGE;
-  CmpLibcallCCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
-  CmpLibcallCCs[RTLIB::OLT_F32] = ISD::SETLT;
-  CmpLibcallCCs[RTLIB::OLT_F64] = ISD::SETLT;
-  CmpLibcallCCs[RTLIB::OLT_F128] = ISD::SETLT;
-  CmpLibcallCCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
-  CmpLibcallCCs[RTLIB::OLE_F32] = ISD::SETLE;
-  CmpLibcallCCs[RTLIB::OLE_F64] = ISD::SETLE;
-  CmpLibcallCCs[RTLIB::OLE_F128] = ISD::SETLE;
-  CmpLibcallCCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
-  CmpLibcallCCs[RTLIB::OGT_F32] = ISD::SETGT;
-  CmpLibcallCCs[RTLIB::OGT_F64] = ISD::SETGT;
-  CmpLibcallCCs[RTLIB::OGT_F128] = ISD::SETGT;
-  CmpLibcallCCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
-  CmpLibcallCCs[RTLIB::UO_F32] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UO_F64] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UO_F128] = ISD::SETNE;
-  CmpLibcallCCs[RTLIB::UO_PPCF128] = ISD::SETNE;
-}
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index e0c35e2324f62..e3ca35d12b878 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -45,7 +45,6 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
   case PPC_FP128TyID : return getPPC_FP128Ty(C);
   case LabelTyID     : return getLabelTy(C);
   case MetadataTyID  : return getMetadataTy(C);
-  case X86_MMXTyID   : return getX86_MMXTy(C);
   case X86_AMXTyID   : return getX86_AMXTy(C);
   case TokenTyID     : return getTokenTy(C);
   default:
@@ -125,14 +124,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const {
   if (isa<VectorType>(this) && isa<VectorType>(Ty))
     return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits();
 
-  //  64-bit fixed width vector types can be losslessly converted to x86mmx.
-  if (((isa<FixedVectorType>(this)) && Ty->isX86_MMXTy()) &&
-      getPrimitiveSizeInBits().getFixedValue() == 64)
-    return true;
-  if ((isX86_MMXTy() && isa<FixedVectorType>(Ty)) &&
-      Ty->getPrimitiveSizeInBits().getFixedValue() == 64)
-    return true;
-
   //  8192-bit fixed width vector types can be losslessly converted to x86amx.
   if (((isa<FixedVectorType>(this)) && Ty->isX86_AMXTy()) &&
       getPrimitiveSizeInBits().getFixedValue() == 8192)
@@ -179,8 +170,6 @@ TypeSize Type::getPrimitiveSizeInBits() const {
     return TypeSize::getFixed(128);
   case Type::PPC_FP128TyID:
     return TypeSize::getFixed(128);
-  case Type::X86_MMXTyID:
-    return TypeSize::getFixed(64);
   case Type::X86_AMXTyID:
     return TypeSize::getFixed(8192);
   case Type::IntegerTyID:
@@ -245,7 +234,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; }
 Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; }
 Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; }
 Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; }
-Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; }
 Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; }
 
 IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; }
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index 5ff3082879895..8dbf25277bf5d 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -60,60 +60,13 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
   return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
 }
 
-Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+Value *VectorBuilder::createSimpleTargetReduction(Intrinsic::ID RdxID,
+                                                  Type *ValTy,
                                                   ArrayRef<Value *> InstOpArray,
                                                   const Twine &Name) {
-  Intrinsic::ID VPID;
-  switch (Kind) {
-  case RecurKind::Add:
-    VPID = Intrinsic::vp_reduce_add;
-    break;
-  case RecurKind::Mul:
-    VPID = Intrinsic::vp_reduce_mul;
-    break;
-  case RecurKind::And:
-    VPID = Intrinsic::vp_reduce_and;
-    break;
-  case RecurKind::Or:
-    VPID = Intrinsic::vp_reduce_or;
-    break;
-  case RecurKind::Xor:
-    VPID = Intrinsic::vp_reduce_xor;
-    break;
-  case RecurKind::FMulAdd:
-  case RecurKind::FAdd:
-    VPID = Intrinsic::vp_reduce_fadd;
-    break;
-  case RecurKind::FMul:
-    VPID = Intrinsic::vp_reduce_fmul;
-    break;
-  case RecurKind::SMax:
-    VPID = Intrinsic::vp_reduce_smax;
-    break;
-  case RecurKind::SMin:
-    VPID = Intrinsic::vp_reduce_smin;
-    break;
-  case RecurKind::UMax:
-    VPID = Intrinsic::vp_reduce_umax;
-    break;
-  case RecurKind::UMin:
-    VPID = Intrinsic::vp_reduce_umin;
-    break;
-  case RecurKind::FMax:
-    VPID = Intrinsic::vp_reduce_fmax;
-    break;
-  case RecurKind::FMin:
-    VPID = Intrinsic::vp_reduce_fmin;
-    break;
-  case RecurKind::FMaximum:
-    VPID = Intrinsic::vp_reduce_fmaximum;
-    break;
-  case RecurKind::FMinimum:
-    VPID = Intrinsic::vp_reduce_fminimum;
-    break;
-  default:
-    llvm_unreachable("No VPIntrinsic for this reduction");
-  }
+  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
   return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
 }
 
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 75a53c1c99734..c5c407637cbf3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -324,13 +324,6 @@ namespace {
 
 class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   friend class InstVisitor<Verifier>;
-
-  // ISD::ArgFlagsTy::MemAlign only have 4 bits for alignment, so
-  // the alignment size should not exceed 2^15. Since encode(Align)
-  // would plus the shift value by 1, the alignment size should
-  // not exceed 2^14, otherwise it can NOT be properly lowered
-  // in backend.
-  static constexpr unsigned ParamMaxAlignment = 1 << 14;
   DominatorTree DT;
 
   /// When verifying a basic block, keep track of all of the
@@ -2021,31 +2014,43 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
   }
 
   if (isa<PointerType>(Ty)) {
+    if (Attrs.hasAttribute(Attribute::Alignment)) {
+      Align AttrAlign = Attrs.getAlignment().valueOrOne();
+      Check(AttrAlign.value() <= Value::MaximumAlignment,
+            "huge alignment values are unsupported", V);
+    }
     if (Attrs.hasAttribute(Attribute::ByVal)) {
-      if (Attrs.hasAttribute(Attribute::Alignment)) {
-        Align AttrAlign = Attrs.getAlignment().valueOrOne();
-        Align MaxAlign(ParamMaxAlignment);
-        Check(AttrAlign <= MaxAlign,
-              "Attribute 'align' exceed the max size 2^14", V);
-      }
       SmallPtrSet<Type *, 4> Visited;
       Check(Attrs.getByValType()->isSized(&Visited),
             "Attribute 'byval' does not support unsized types!", V);
+      Check(DL.getTypeAllocSize(Attrs.getByValType()).getKnownMinValue() <
+                (1ULL << 32),
+            "huge 'byval' arguments are unsupported", V);
     }
     if (Attrs.hasAttribute(Attribute::ByRef)) {
       SmallPtrSet<Type *, 4> Visited;
       Check(Attrs.getByRefType()->isSized(&Visited),
             "Attribute 'byref' does not support unsized types!", V);
+      Check(DL.getTypeAllocSize(Attrs.getByRefType()).getKnownMinValue() <
+                (1ULL << 32),
+            "huge 'byref' arguments are unsupported", V);
     }
     if (Attrs.hasAttribute(Attribute::InAlloca)) {
       SmallPtrSet<Type *, 4> Visited;
       Check(Attrs.getInAllocaType()->isSized(&Visited),
             "Attribute 'inalloca' does not support unsized types!", V);
+      Check(DL.getTypeAllocSize(Attrs.getInAllocaType()).getKnownMinValue() <
+                (1ULL << 32),
+            "huge 'inalloca' arguments are unsupported", V);
     }
     if (Attrs.hasAttribute(Attribute::Preallocated)) {
       SmallPtrSet<Type *, 4> Visited;
       Check(Attrs.getPreallocatedType()->isSized(&Visited),
             "Attribute 'preallocated' does not support unsized types!", V);
+      Check(
+          DL.getTypeAllocSize(Attrs.getPreallocatedType()).getKnownMinValue() <
+              (1ULL << 32),
+          "huge 'preallocated' arguments are unsupported", V);
     }
   }
 
@@ -3511,12 +3516,15 @@ void Verifier::visitCallBase(CallBase &Call) {
         "not allowed. Please use the @llvm.amdgpu.cs.chain intrinsic instead.",
         Call);
 
+  // Disallow passing/returning values with alignment higher than we can
+  // represent.
+  // FIXME: Consider making DataLayout cap the alignment, so this isn't
+  // necessary.
   auto VerifyTypeAlign = [&](Type *Ty, const Twine &Message) {
     if (!Ty->isSized())
       return;
     Align ABIAlign = DL.getABITypeAlign(Ty);
-    Align MaxAlign(ParamMaxAlignment);
-    Check(ABIAlign <= MaxAlign,
+    Check(ABIAlign.value() <= Value::MaximumAlignment,
           "Incorrect alignment of " + Message + " to called function!", Call);
   };
 
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index d303f228aa72c..bb3c9f7acdb8e 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1360,7 +1360,6 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
 
 SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
   RTLIB::RuntimeLibcallsInfo Libcalls(TT);
-
   SmallVector<const char *> LibcallSymbols;
   copy_if(Libcalls.getLibcallNames(), std::back_inserter(LibcallSymbols),
           [](const char *Name) { return Name; });
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d5d642f0d25e6..effaed2d9bfb6 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -335,6 +335,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
   if (!Conf.DisableVerify)
     MPM.addPass(VerifierPass());
 
+  if (PrintPipelinePasses) {
+    std::string PipelineStr;
+    raw_string_ostream OS(PipelineStr);
+    MPM.printPipeline(OS, [&PIC](StringRef ClassName) {
+      auto PassName = PIC.getPassNameForClassName(ClassName);
+      return PassName.empty() ? ClassName : PassName;
+    });
+    outs() << "pipeline-passes: " << PipelineStr << '\n';
+  }
+
   MPM.run(Mod, MAM);
 }
 
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 59d796b419b35..f958905a26aa4 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -67,7 +67,6 @@ using namespace llvm;
 
 namespace {
 
-class ELFObjectWriter;
 struct ELFWriter;
 
 bool isDwoSection(const MCSectionELF &Sec) {
@@ -199,116 +198,6 @@ struct ELFWriter {
   void writeSection(uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
                     const MCSectionELF &Section);
 };
-
-class ELFObjectWriter : public MCObjectWriter {
-  /// The target specific ELF writer instance.
-  std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
-
-  DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
-
-  DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
-
-  bool SeenGnuAbi = false;
-
-  std::optional<uint8_t> OverrideABIVersion;
-
-  bool hasRelocationAddend() const;
-
-  bool shouldRelocateWithSymbol(const MCAssembler &Asm, const MCValue &Val,
-                                const MCSymbolELF *Sym, uint64_t C,
-                                unsigned Type) const;
-
-public:
-  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW)
-      : TargetObjectWriter(std::move(MOTW)) {}
-
-  void reset() override {
-    SeenGnuAbi = false;
-    OverrideABIVersion.reset();
-    Relocations.clear();
-    Renames.clear();
-    MCObjectWriter::reset();
-  }
-
-  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
-                                              const MCSymbol &SymA,
-                                              const MCFragment &FB, bool InSet,
-                                              bool IsPCRel) const override;
-
-  virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
-                               const MCSectionELF *From,
-                               const MCSectionELF *To) {
-    return true;
-  }
-
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override;
-  bool usesRela(const MCTargetOptions *TO, const MCSectionELF &Sec) const;
-
-  void executePostLayoutBinding(MCAssembler &Asm) override;
-
-  void markGnuAbi() override { SeenGnuAbi = true; }
-  bool seenGnuAbi() const { return SeenGnuAbi; }
-
-  bool seenOverrideABIVersion() const { return OverrideABIVersion.has_value(); }
-  uint8_t getOverrideABIVersion() const { return OverrideABIVersion.value(); }
-  void setOverrideABIVersion(uint8_t V) override { OverrideABIVersion = V; }
-
-  friend struct ELFWriter;
-};
-
-class ELFSingleObjectWriter : public ELFObjectWriter {
-  raw_pwrite_stream &OS;
-  bool IsLittleEndian;
-
-public:
-  ELFSingleObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                        raw_pwrite_stream &OS, bool IsLittleEndian)
-      : ELFObjectWriter(std::move(MOTW)), OS(OS),
-        IsLittleEndian(IsLittleEndian) {}
-
-  uint64_t writeObject(MCAssembler &Asm) override {
-    return ELFWriter(*this, OS, IsLittleEndian, ELFWriter::AllSections)
-        .writeObject(Asm);
-  }
-
-  friend struct ELFWriter;
-};
-
-class ELFDwoObjectWriter : public ELFObjectWriter {
-  raw_pwrite_stream &OS, &DwoOS;
-  bool IsLittleEndian;
-
-public:
-  ELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                     raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
-                     bool IsLittleEndian)
-      : ELFObjectWriter(std::move(MOTW)), OS(OS), DwoOS(DwoOS),
-        IsLittleEndian(IsLittleEndian) {}
-
-  bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
-                       const MCSectionELF *To) override {
-    if (isDwoSection(*From)) {
-      Ctx.reportError(Loc, "A dwo section may not contain relocations");
-      return false;
-    }
-    if (To && isDwoSection(*To)) {
-      Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
-      return false;
-    }
-    return true;
-  }
-
-  uint64_t writeObject(MCAssembler &Asm) override {
-    uint64_t Size = ELFWriter(*this, OS, IsLittleEndian, ELFWriter::NonDwoOnly)
-                        .writeObject(Asm);
-    Size += ELFWriter(*this, DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
-                .writeObject(Asm);
-    return Size;
-  }
-};
-
 } // end anonymous namespace
 
 uint64_t ELFWriter::align(Align Alignment) {
@@ -403,8 +292,8 @@ void ELFWriter::writeHeader(const MCAssembler &Asm) {
                    ? int(ELF::ELFOSABI_GNU)
                    : OSABI);
   // e_ident[EI_ABIVERSION]
-  W.OS << char(OWriter.seenOverrideABIVersion()
-                   ? OWriter.getOverrideABIVersion()
+  W.OS << char(OWriter.OverrideABIVersion
+                   ? *OWriter.OverrideABIVersion
                    : OWriter.TargetObjectWriter->getABIVersion());
 
   W.OS.write_zeros(ELF::EI_NIDENT - ELF::EI_PAD);
@@ -419,7 +308,7 @@ void ELFWriter::writeHeader(const MCAssembler &Asm) {
   WriteWord(0);                     // e_shoff = sec hdr table off in bytes
 
   // e_flags = whatever the target wants
-  W.write<uint32_t>(Asm.getELFHeaderEFlags());
+  W.write<uint32_t>(OWriter.getELFHeaderEFlags());
 
   // e_ehsize = ELF header size
   W.write<uint16_t>(is64Bit() ? sizeof(ELF::Elf64_Ehdr)
@@ -619,7 +508,7 @@ void ELFWriter::computeSymbolTable(MCAssembler &Asm,
   std::vector<ELFSymbolData> LocalSymbolData;
   std::vector<ELFSymbolData> ExternalSymbolData;
   MutableArrayRef<std::pair<std::string, size_t>> FileNames =
-      Asm.getFileNames();
+      OWriter.getFileNames();
   for (const std::pair<std::string, size_t> &F : FileNames)
     StrTabBuilder.add(F.first);
 
@@ -779,7 +668,7 @@ void ELFWriter::computeSymbolTable(MCAssembler &Asm,
 }
 
 void ELFWriter::writeAddrsigSection() {
-  for (const MCSymbol *Sym : OWriter.AddrsigSyms)
+  for (const MCSymbol *Sym : OWriter.getAddrsigSyms())
     if (Sym->getIndex() != 0)
       encodeULEB128(Sym->getIndex(), W.OS);
 }
@@ -1150,7 +1039,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
     StrTabBuilder.finalize();
   } else {
     MCSectionELF *AddrsigSection;
-    if (OWriter.EmitAddrsigSection) {
+    if (OWriter.getEmitAddrsigSection()) {
       AddrsigSection = Ctx.getELFSection(".llvm_addrsig", ELF::SHT_LLVM_ADDRSIG,
                                          ELF::SHF_EXCLUDE);
       addToSectionTable(AddrsigSection);
@@ -1170,7 +1059,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
       RelSection->setOffsets(SecStart, SecEnd);
     }
 
-    if (OWriter.EmitAddrsigSection) {
+    if (OWriter.getEmitAddrsigSection()) {
       uint64_t SecStart = W.OS.tell();
       writeAddrsigSection();
       uint64_t SecEnd = W.OS.tell();
@@ -1215,6 +1104,26 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
   return W.OS.tell() - StartOffset;
 }
 
+ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                                 raw_pwrite_stream &OS, bool IsLittleEndian)
+    : TargetObjectWriter(std::move(MOTW)), OS(OS),
+      IsLittleEndian(IsLittleEndian) {}
+ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                                 raw_pwrite_stream &OS,
+                                 raw_pwrite_stream &DwoOS, bool IsLittleEndian)
+    : TargetObjectWriter(std::move(MOTW)), OS(OS), DwoOS(&DwoOS),
+      IsLittleEndian(IsLittleEndian) {}
+
+void ELFObjectWriter::reset() {
+  ELFHeaderEFlags = 0;
+  SeenGnuAbi = false;
+  OverrideABIVersion.reset();
+  Relocations.clear();
+  Renames.clear();
+  Symvers.clear();
+  MCObjectWriter::reset();
+}
+
 bool ELFObjectWriter::hasRelocationAddend() const {
   return TargetObjectWriter->hasRelocationAddend();
 }
@@ -1222,7 +1131,7 @@ bool ELFObjectWriter::hasRelocationAddend() const {
 void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   // The presence of symbol versions causes undefined symbols and
   // versions declared with @@@ to be renamed.
-  for (const MCAssembler::Symver &S : Asm.Symvers) {
+  for (const Symver &S : Symvers) {
     StringRef AliasName = S.Name;
     const auto &Symbol = cast<MCSymbolELF>(*S.Sym);
     size_t Pos = AliasName.find('@');
@@ -1406,6 +1315,22 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
   return false;
 }
 
+bool ELFObjectWriter::checkRelocation(MCContext &Ctx, SMLoc Loc,
+                                      const MCSectionELF *From,
+                                      const MCSectionELF *To) {
+  if (DwoOS) {
+    if (isDwoSection(*From)) {
+      Ctx.reportError(Loc, "A dwo section may not contain relocations");
+      return false;
+    }
+    if (To && isDwoSection(*To)) {
+      Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
+      return false;
+    }
+  }
+  return true;
+}
+
 void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
                                        const MCFragment *Fragment,
                                        const MCFixup &Fixup, MCValue Target,
@@ -1522,17 +1447,13 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   return &SymA.getSection() == FB.getParent();
 }
 
-std::unique_ptr<MCObjectWriter>
-llvm::createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                            raw_pwrite_stream &OS, bool IsLittleEndian) {
-  return std::make_unique<ELFSingleObjectWriter>(std::move(MOTW), OS,
-                                                  IsLittleEndian);
-}
-
-std::unique_ptr<MCObjectWriter>
-llvm::createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                               raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
-                               bool IsLittleEndian) {
-  return std::make_unique<ELFDwoObjectWriter>(std::move(MOTW), OS, DwoOS,
-                                               IsLittleEndian);
+uint64_t ELFObjectWriter::writeObject(MCAssembler &Asm) {
+  uint64_t Size =
+      ELFWriter(*this, OS, IsLittleEndian,
+                DwoOS ? ELFWriter::NonDwoOnly : ELFWriter::AllSections)
+          .writeObject(Asm);
+  if (DwoOS)
+    Size += ELFWriter(*this, *DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
+                .writeObject(Asm);
+  return Size;
 }
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index fc36f29a90339..bab3485b50614 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -32,16 +32,17 @@ MCAsmBackend::~MCAsmBackend() = default;
 std::unique_ptr<MCObjectWriter>
 MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
   auto TW = createObjectTargetWriter();
+  bool IsLE = Endian == llvm::endianness::little;
   switch (TW->getFormat()) {
-  case Triple::ELF:
-    return createELFObjectWriter(cast<MCELFObjectTargetWriter>(std::move(TW)),
-                                 OS, Endian == llvm::endianness::little);
   case Triple::MachO:
-    return createMachObjectWriter(cast<MCMachObjectTargetWriter>(std::move(TW)),
-                                  OS, Endian == llvm::endianness::little);
+    return std::make_unique<MachObjectWriter>(
+        cast<MCMachObjectTargetWriter>(std::move(TW)), OS, IsLE);
   case Triple::COFF:
     return createWinCOFFObjectWriter(
         cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS);
+  case Triple::ELF:
+    return std::make_unique<ELFObjectWriter>(
+        cast<MCELFObjectTargetWriter>(std::move(TW)), OS, IsLE);
   case Triple::SPIRV:
     return createSPIRVObjectWriter(
         cast<MCSPIRVObjectTargetWriter>(std::move(TW)), OS);
@@ -71,7 +72,7 @@ MCAsmBackend::createDwoObjectWriter(raw_pwrite_stream &OS,
     return createWinCOFFDwoObjectWriter(
         cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS, DwoOS);
   case Triple::ELF:
-    return createELFDwoObjectWriter(
+    return std::make_unique<ELFObjectWriter>(
         cast<MCELFObjectTargetWriter>(std::move(TW)), OS, DwoOS,
         Endian == llvm::endianness::little);
   case Triple::Wasm:
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 24209e456b5e2..9309d5987dc94 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -55,9 +55,9 @@ class MCAsmStreamer final : public MCStreamer {
   raw_svector_ostream CommentStream;
   raw_null_ostream NullStream;
 
-  unsigned IsVerboseAsm : 1;
-  unsigned ShowInst : 1;
-  unsigned UseDwarfDirectory : 1;
+  bool IsVerboseAsm = false;
+  bool ShowInst = false;
+  bool UseDwarfDirectory = false;
 
   void EmitRegisterName(int64_t Register);
   void PrintQuotedString(StringRef Data, raw_ostream &OS) const;
@@ -72,24 +72,40 @@ class MCAsmStreamer final : public MCStreamer {
 
 public:
   MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
-                bool isVerboseAsm, bool useDwarfDirectory,
                 MCInstPrinter *printer, std::unique_ptr<MCCodeEmitter> emitter,
-                std::unique_ptr<MCAsmBackend> asmbackend, bool showInst)
+                std::unique_ptr<MCAsmBackend> asmbackend)
       : MCStreamer(Context), OSOwner(std::move(os)), OS(*OSOwner),
         MAI(Context.getAsmInfo()), InstPrinter(printer),
         Assembler(std::make_unique<MCAssembler>(
             Context, std::move(asmbackend), std::move(emitter),
             (asmbackend) ? asmbackend->createObjectWriter(NullStream)
                          : nullptr)),
-        CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
-        ShowInst(showInst), UseDwarfDirectory(useDwarfDirectory) {
+        CommentStream(CommentToEmit) {
     assert(InstPrinter);
-    if (IsVerboseAsm)
-        InstPrinter->setCommentStream(CommentStream);
     if (Assembler->getBackendPtr())
       setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
 
     Context.setUseNamesOnTempLabels(true);
+
+    auto *TO = Context.getTargetOptions();
+    if (!TO)
+      return;
+    IsVerboseAsm = TO->AsmVerbose;
+    if (IsVerboseAsm)
+      InstPrinter->setCommentStream(CommentStream);
+    ShowInst = TO->ShowMCInst;
+    switch (TO->MCUseDwarfDirectory) {
+    case MCTargetOptions::DisableDwarfDirectory:
+      UseDwarfDirectory = false;
+      break;
+    case MCTargetOptions::EnableDwarfDirectory:
+      UseDwarfDirectory = true;
+      break;
+    case MCTargetOptions::DefaultDwarfDirectory:
+      UseDwarfDirectory =
+          Context.getAsmInfo()->enableDwarfFileDirectoryDefault();
+      break;
+    }
   }
 
   MCAssembler &getAssembler() { return *Assembler; }
@@ -2641,12 +2657,9 @@ void MCAsmStreamer::doFinalizationAtSectionEnd(MCSection *Section) {
 
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
                                     std::unique_ptr<formatted_raw_ostream> OS,
-                                    bool isVerboseAsm, bool useDwarfDirectory,
                                     MCInstPrinter *IP,
                                     std::unique_ptr<MCCodeEmitter> &&CE,
-                                    std::unique_ptr<MCAsmBackend> &&MAB,
-                                    bool ShowInst) {
-  return new MCAsmStreamer(Context, std::move(OS), isVerboseAsm,
-                           useDwarfDirectory, IP, std::move(CE), std::move(MAB),
-                           ShowInst);
+                                    std::unique_ptr<MCAsmBackend> &&MAB) {
+  return new MCAsmStreamer(Context, std::move(OS), IP, std::move(CE),
+                           std::move(MAB));
 }
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index c8d12eb5dcf64..cbeb41f58c99b 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -56,8 +56,6 @@ STATISTIC(EmittedRelaxableFragments,
           "Number of emitted assembler fragments - relaxable");
 STATISTIC(EmittedDataFragments,
           "Number of emitted assembler fragments - data");
-STATISTIC(EmittedCompactEncodedInstFragments,
-          "Number of emitted assembler fragments - compact encoded inst");
 STATISTIC(EmittedAlignFragments,
           "Number of emitted assembler fragments - align");
 STATISTIC(EmittedFillFragments,
@@ -84,38 +82,22 @@ MCAssembler::MCAssembler(MCContext &Context,
                          std::unique_ptr<MCCodeEmitter> Emitter,
                          std::unique_ptr<MCObjectWriter> Writer)
     : Context(Context), Backend(std::move(Backend)),
-      Emitter(std::move(Emitter)), Writer(std::move(Writer)) {
-  VersionInfo.Major = 0; // Major version == 0 for "none specified"
-  DarwinTargetVariantVersionInfo.Major = 0;
-}
-
-MCAssembler::~MCAssembler() = default;
+      Emitter(std::move(Emitter)), Writer(std::move(Writer)) {}
 
 void MCAssembler::reset() {
   RelaxAll = false;
-  SubsectionsViaSymbols = false;
-  IncrementalLinkerCompatible = false;
   Sections.clear();
   Symbols.clear();
-  LinkerOptions.clear();
-  FileNames.clear();
   ThumbFuncs.clear();
   BundleAlignSize = 0;
-  ELFHeaderEFlags = 0;
-  LOHContainer.reset();
-  VersionInfo.Major = 0;
-  VersionInfo.SDKVersion = VersionTuple();
-  DarwinTargetVariantVersionInfo.Major = 0;
-  DarwinTargetVariantVersionInfo.SDKVersion = VersionTuple();
 
   // reset objects owned by us
   if (getBackendPtr())
     getBackendPtr()->reset();
   if (getEmitterPtr())
     getEmitterPtr()->reset();
-  if (getWriterPtr())
-    getWriterPtr()->reset();
-  getLOHContainer().reset();
+  if (Writer)
+    Writer->reset();
 }
 
 bool MCAssembler::registerSection(MCSection &Section) {
@@ -208,9 +190,9 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
       const MCSymbol &SA = A->getSymbol();
       if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {
         IsResolved = false;
-      } else if (auto *Writer = getWriterPtr()) {
+      } else {
         IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) ||
-                     Writer->isSymbolRefDifferenceFullyResolvedImpl(
+                     getWriter().isSymbolRefDifferenceFullyResolvedImpl(
                          *this, SA, *DF, false, true);
       }
     }
@@ -269,8 +251,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
     return cast<MCDataFragment>(F).getContents().size();
   case MCFragment::FT_Relaxable:
     return cast<MCRelaxableFragment>(F).getContents().size();
-  case MCFragment::FT_CompactEncodedInst:
-    return cast<MCCompactEncodedInstFragment>(F).getContents().size();
   case MCFragment::FT_Fill: {
     auto &FF = cast<MCFillFragment>(F);
     int64_t NumValues = 0;
@@ -700,11 +680,6 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
     OS << cast<MCRelaxableFragment>(F).getContents();
     break;
 
-  case MCFragment::FT_CompactEncodedInst:
-    ++stats::EmittedCompactEncodedInstFragments;
-    OS << cast<MCCompactEncodedInstFragment>(F).getContents();
-    break;
-
   case MCFragment::FT_Fill: {
     ++stats::EmittedFillFragments;
     const MCFillFragment &FF = cast<MCFillFragment>(F);
@@ -1132,7 +1107,7 @@ bool MCAssembler::relaxLEB(MCLEBFragment &LF) {
   // Use evaluateKnownAbsolute for Mach-O as a hack: .subsections_via_symbols
   // requires that .uleb128 A-B is foldable where A and B reside in different
   // fragments. This is used by __gcc_except_table.
-  bool Abs = getSubsectionsViaSymbols()
+  bool Abs = getWriter().getSubsectionsViaSymbols()
                  ? LF.getValue().evaluateKnownAbsolute(Value, *this)
                  : LF.getValue().evaluateAsAbsolute(Value, *this);
   if (!Abs) {
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 2fe8369016440..ac3946b6ef46f 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -349,6 +349,11 @@ MCSymbol *MCContext::createNamedTempSymbol() {
   return createNamedTempSymbol("tmp");
 }
 
+MCSymbol *MCContext::createLocalSymbol(StringRef Name) {
+  MCSymbolTableEntry &NameEntry = getSymbolTableEntry(Name);
+  return createSymbolImpl(&NameEntry, /*IsTemporary=*/false);
+}
+
 unsigned MCContext::NextInstance(unsigned LocalLabelVal) {
   MCLabel *&Label = Instances[LocalLabelVal];
   if (!Label)
@@ -692,10 +697,11 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
   MCSymbol *COMDATSymbol = nullptr;
   if (!COMDATSymName.empty()) {
     COMDATSymbol = getOrCreateSymbol(COMDATSymName);
+    assert(COMDATSymbol && "COMDATSymbol is null");
     COMDATSymName = COMDATSymbol->getName();
     // A non-associative COMDAT is considered to define the COMDAT symbol. Check
     // the redefinition error.
-    if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE && COMDATSymbol &&
+    if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE &&
         COMDATSymbol->isDefined() &&
         (!COMDATSymbol->isInSection() ||
          cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() !=
@@ -795,7 +801,6 @@ bool MCContext::hasXCOFFSection(StringRef Section,
 MCSectionXCOFF *MCContext::getXCOFFSection(
     StringRef Section, SectionKind Kind,
     std::optional<XCOFF::CsectProperties> CsectProp, bool MultiSymbolsAllowed,
-    const char *BeginSymName,
     std::optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSectionSubtypeFlags) {
   bool IsDwarfSec = DwarfSectionSubtypeFlags.has_value();
   assert((IsDwarfSec != CsectProp.has_value()) && "Invalid XCOFF section!");
@@ -825,35 +830,29 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
         CachedName + "[" +
         XCOFF::getMappingClassString(CsectProp->MappingClass) + "]"));
 
-  MCSymbol *Begin = nullptr;
-  if (BeginSymName)
-    Begin = createTempSymbol(BeginSymName, false);
-
   // QualName->getUnqualifiedName() and CachedName are the same except when
   // CachedName contains invalid character(s) such as '$' for an XCOFF symbol.
   MCSectionXCOFF *Result = nullptr;
   if (IsDwarfSec)
     Result = new (XCOFFAllocator.Allocate()) MCSectionXCOFF(
         QualName->getUnqualifiedName(), Kind, QualName,
-        *DwarfSectionSubtypeFlags, Begin, CachedName, MultiSymbolsAllowed);
+        *DwarfSectionSubtypeFlags, QualName, CachedName, MultiSymbolsAllowed);
   else
     Result = new (XCOFFAllocator.Allocate())
         MCSectionXCOFF(QualName->getUnqualifiedName(), CsectProp->MappingClass,
-                       CsectProp->Type, Kind, QualName, Begin, CachedName,
+                       CsectProp->Type, Kind, QualName, nullptr, CachedName,
                        MultiSymbolsAllowed);
 
   Entry.second = Result;
 
   auto *F = allocInitialFragment(*Result);
-  if (Begin)
-    Begin->setFragment(F);
 
   // We might miss calculating the symbols difference as absolute value before
   // adding fixups when symbol_A without the fragment set is the csect itself
   // and symbol_B is in it.
-  // TODO: Currently we only set the fragment for XMC_PR csects because we don't
-  // have other cases that hit this problem yet.
-  if (!IsDwarfSec && CsectProp->MappingClass == XCOFF::XMC_PR)
+  // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
+  // sections because we don't have other cases that hit this problem yet.
+  if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR)
     QualName->setFragment(F);
 
   return Result;
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index efafd555c5c5c..1297dc3828b58 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -1299,8 +1299,8 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
 namespace {
 
 class FrameEmitterImpl {
-  int CFAOffset = 0;
-  int InitialCFAOffset = 0;
+  int64_t CFAOffset = 0;
+  int64_t InitialCFAOffset = 0;
   bool IsEH;
   MCObjectStreamer &Streamer;
 
@@ -1414,7 +1414,7 @@ void FrameEmitterImpl::emitCFIInstruction(const MCCFIInstruction &Instr) {
     if (!IsEH)
       Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
 
-    int Offset = Instr.getOffset();
+    int64_t Offset = Instr.getOffset();
     if (IsRelative)
       Offset -= CFAOffset;
     Offset = Offset / dataAlignmentFactor;
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 38ebaccf3b6ff..8dda587ba56b2 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCFragment.h"
@@ -46,6 +47,10 @@ MCELFStreamer::MCELFStreamer(MCContext &Context,
     : MCObjectStreamer(Context, std::move(TAB), std::move(OW),
                        std::move(Emitter)) {}
 
+ELFObjectWriter &MCELFStreamer::getWriter() {
+  return static_cast<ELFObjectWriter &>(getAssembler().getWriter());
+}
+
 bool MCELFStreamer::isBundleLocked() const {
   return getCurrentSectionOnly()->isBundleLocked();
 }
@@ -84,18 +89,6 @@ void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F,
 void MCELFStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
   // Let the target do whatever target specific stuff it needs to do.
   getAssembler().getBackend().handleAssemblerFlag(Flag);
-  // Do any generic stuff we need to do.
-  switch (Flag) {
-  case MCAF_SyntaxUnified: return; // no-op here.
-  case MCAF_Code16: return; // Change parsing mode; no-op here.
-  case MCAF_Code32: return; // Change parsing mode; no-op here.
-  case MCAF_Code64: return; // Change parsing mode; no-op here.
-  case MCAF_SubsectionsViaSymbols:
-    getAssembler().setSubsectionsViaSymbols(true);
-    return;
-  }
-
-  llvm_unreachable("invalid assembler flag!");
 }
 
 // If bundle alignment is used and there are any instructions in the section, it
@@ -120,7 +113,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
   if (Grp)
     Asm.registerSymbol(*Grp);
   if (SectionELF->getFlags() & ELF::SHF_GNU_RETAIN)
-    Asm.getWriter().markGnuAbi();
+    getWriter().markGnuAbi();
 
   changeSectionImpl(Section, Subsection);
   Asm.registerSymbol(*Section->getBeginSymbol());
@@ -188,7 +181,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
   case MCSA_ELF_TypeGnuUniqueObject:
     Symbol->setType(CombineSymbolTypes(Symbol->getType(), ELF::STT_OBJECT));
     Symbol->setBinding(ELF::STB_GNU_UNIQUE);
-    getAssembler().getWriter().markGnuAbi();
+    getWriter().markGnuAbi();
     break;
 
   case MCSA_Global:
@@ -226,7 +219,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
 
   case MCSA_ELF_TypeIndFunction:
     Symbol->setType(CombineSymbolTypes(Symbol->getType(), ELF::STT_GNU_IFUNC));
-    getAssembler().getWriter().markGnuAbi();
+    getWriter().markGnuAbi();
     break;
 
   case MCSA_ELF_TypeObject:
@@ -310,7 +303,7 @@ void MCELFStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
 void MCELFStreamer::emitELFSymverDirective(const MCSymbol *OriginalSym,
                                            StringRef Name,
                                            bool KeepOriginalSym) {
-  getAssembler().Symvers.push_back(MCAssembler::Symver{
+  getWriter().Symvers.push_back(ELFObjectWriter::Symver{
       getStartTokLoc(), OriginalSym, Name, KeepOriginalSym});
 }
 
@@ -343,7 +336,7 @@ void MCELFStreamer::emitValueToAlignment(Align Alignment, int64_t Value,
 void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
                                        const MCSymbolRefExpr *To,
                                        uint64_t Count) {
-  getAssembler().CGProfile.push_back({From, To, Count});
+  getWriter().getCGProfile().push_back({From, To, Count});
 }
 
 void MCELFStreamer::emitIdent(StringRef IdentString) {
@@ -472,8 +465,8 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE,
 }
 
 void MCELFStreamer::finalizeCGProfile() {
-  MCAssembler &Asm = getAssembler();
-  if (Asm.CGProfile.empty())
+  ELFObjectWriter &W = getWriter();
+  if (W.getCGProfile().empty())
     return;
   MCSection *CGProfile = getAssembler().getContext().getELFSection(
       ".llvm.call-graph-profile", ELF::SHT_LLVM_CALL_GRAPH_PROFILE,
@@ -481,7 +474,7 @@ void MCELFStreamer::finalizeCGProfile() {
   pushSection();
   switchSection(CGProfile);
   uint64_t Offset = 0;
-  for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
+  for (auto &E : W.getCGProfile()) {
     finalizeCGProfileEntry(E.From, Offset);
     finalizeCGProfileEntry(E.To, Offset);
     emitIntValue(E.Count, sizeof(uint64_t));
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index b1012507a1242..ac1b5a8fb8274 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -27,7 +27,8 @@
 using namespace llvm;
 
 MCFragment::MCFragment(FragmentType Kind, bool HasInstructions)
-    : Kind(Kind), HasInstructions(HasInstructions), LinkerRelaxable(false) {}
+    : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false),
+      LinkerRelaxable(false), AllowAutoPadding(false) {}
 
 void MCFragment::destroy() {
   switch (Kind) {
@@ -37,9 +38,6 @@ void MCFragment::destroy() {
     case FT_Data:
       cast<MCDataFragment>(this)->~MCDataFragment();
       return;
-    case FT_CompactEncodedInst:
-      cast<MCCompactEncodedInstFragment>(this)->~MCCompactEncodedInstFragment();
-      return;
     case FT_Fill:
       cast<MCFillFragment>(this)->~MCFillFragment();
       return;
@@ -107,8 +105,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   switch (getKind()) {
   case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
   case MCFragment::FT_Data:  OS << "MCDataFragment"; break;
-  case MCFragment::FT_CompactEncodedInst:
-    OS << "MCCompactEncodedInstFragment"; break;
   case MCFragment::FT_Fill:  OS << "MCFillFragment"; break;
   case MCFragment::FT_Nops:
     OS << "MCFNopsFragment";
@@ -168,19 +164,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
     }
     break;
   }
-  case MCFragment::FT_CompactEncodedInst: {
-    const auto *CEIF =
-      cast<MCCompactEncodedInstFragment>(this);
-    OS << "\n       ";
-    OS << " Contents:[";
-    const SmallVectorImpl<char> &Contents = CEIF->getContents();
-    for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
-      if (i) OS << ",";
-      OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
-    }
-    OS << "] (" << Contents.size() << " bytes)";
-    break;
-  }
   case MCFragment::FT_Fill:  {
     const auto *FF = cast<MCFillFragment>(this);
     OS << " Value:" << static_cast<unsigned>(FF->getValue())
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index c8bc819129850..528caa12ec212 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -54,9 +54,6 @@ class MCMachOStreamer : public MCObjectStreamer {
   /// need for local relocations. False by default.
   bool LabelSections;
 
-  bool DWARFMustBeAtTheEnd;
-  bool CreatedADWARFSection;
-
   /// HasSectionLabel - map of which sections have already had a non-local
   /// label emitted to them. Used so we don't emit extraneous linker local
   /// labels in the middle of the section.
@@ -70,16 +67,13 @@ class MCMachOStreamer : public MCObjectStreamer {
 public:
   MCMachOStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
                   std::unique_ptr<MCObjectWriter> OW,
-                  std::unique_ptr<MCCodeEmitter> Emitter,
-                  bool DWARFMustBeAtTheEnd, bool label)
+                  std::unique_ptr<MCCodeEmitter> Emitter, bool label)
       : MCObjectStreamer(Context, std::move(MAB), std::move(OW),
                          std::move(Emitter)),
-        LabelSections(label), DWARFMustBeAtTheEnd(DWARFMustBeAtTheEnd),
-        CreatedADWARFSection(false) {}
+        LabelSections(label) {}
 
   /// state management
   void reset() override {
-    CreatedADWARFSection = false;
     HasSectionLabel.clear();
     MCObjectStreamer::reset();
   }
@@ -124,12 +118,12 @@ class MCMachOStreamer : public MCObjectStreamer {
   }
 
   void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override {
-    getAssembler().getLOHContainer().addDirective(Kind, Args);
+    getWriter().getLOHContainer().addDirective(Kind, Args);
   }
   void emitCGProfileEntry(const MCSymbolRefExpr *From,
                           const MCSymbolRefExpr *To, uint64_t Count) override {
     if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
-      getAssembler().CGProfile.push_back({From, To, Count});
+      getWriter().getCGProfile().push_back({From, To, Count});
   }
 
   void finishImpl() override;
@@ -141,48 +135,9 @@ class MCMachOStreamer : public MCObjectStreamer {
 
 } // end anonymous namespace.
 
-static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
-  // These sections are created by the assembler itself after the end of
-  // the .s file.
-  StringRef SegName = MSec.getSegmentName();
-  StringRef SecName = MSec.getName();
-
-  if (SegName == "__LD" && SecName == "__compact_unwind")
-    return true;
-
-  if (SegName == "__IMPORT") {
-    if (SecName == "__jump_table")
-      return true;
-
-    if (SecName == "__pointers")
-      return true;
-  }
-
-  if (SegName == "__TEXT" && SecName == "__eh_frame")
-    return true;
-
-  if (SegName == "__DATA" &&
-      (SecName == "__llvm_addrsig" || SecName == "__nl_symbol_ptr" ||
-       SecName == "__thread_ptr"))
-    return true;
-  if (SegName == "__LLVM" && (SecName == "__cg_profile"))
-    return true;
-
-  if (SegName == "__DATA" && SecName == "__auth_ptr")
-    return true;
-
-  return false;
-}
-
 void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
   // Change the section normally.
-  bool Created = changeSectionImpl(Section, Subsection);
-  const MCSectionMachO &MSec = *cast<MCSectionMachO>(Section);
-  StringRef SegName = MSec.getSegmentName();
-  if (SegName == "__DWARF")
-    CreatedADWARFSection = true;
-  else if (Created && DWARFMustBeAtTheEnd && !canGoAfterDWARF(MSec))
-    assert(!CreatedADWARFSection && "Creating regular section after DWARF");
+  changeSectionImpl(Section, Subsection);
 
   // Output a linker-local symbol so we don't need section-relative local
   // relocations. The linker hates us when we do that.
@@ -196,12 +151,13 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 
 void MCMachOStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
                                           MCSymbol *EHSymbol) {
+  auto *Sym = cast<MCSymbolMachO>(Symbol);
   getAssembler().registerSymbol(*Symbol);
   if (Symbol->isExternal())
     emitSymbolAttribute(EHSymbol, MCSA_Global);
-  if (cast<MCSymbolMachO>(Symbol)->isWeakDefinition())
+  if (Sym->isWeakDefinition())
     emitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
-  if (Symbol->isPrivateExtern())
+  if (Sym->isPrivateExtern())
     emitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
 }
 
@@ -264,13 +220,13 @@ void MCMachOStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
   case MCAF_Code32: return; // Change parsing mode; no-op here.
   case MCAF_Code64: return; // Change parsing mode; no-op here.
   case MCAF_SubsectionsViaSymbols:
-    getAssembler().setSubsectionsViaSymbols(true);
+    getWriter().setSubsectionsViaSymbols(true);
     return;
   }
 }
 
 void MCMachOStreamer::emitLinkerOptions(ArrayRef<std::string> Options) {
-  getAssembler().getLinkerOptions().push_back(Options);
+  getWriter().getLinkerOptions().push_back(Options);
 }
 
 void MCMachOStreamer::emitDataRegion(MCDataRegionType Kind) {
@@ -296,21 +252,21 @@ void MCMachOStreamer::emitDataRegion(MCDataRegionType Kind) {
 void MCMachOStreamer::emitVersionMin(MCVersionMinType Kind, unsigned Major,
                                      unsigned Minor, unsigned Update,
                                      VersionTuple SDKVersion) {
-  getAssembler().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
+  getWriter().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitBuildVersion(unsigned Platform, unsigned Major,
                                        unsigned Minor, unsigned Update,
                                        VersionTuple SDKVersion) {
-  getAssembler().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
-                                 Update, SDKVersion);
+  getWriter().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
+                              Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitDarwinTargetVariantBuildVersion(
     unsigned Platform, unsigned Major, unsigned Minor, unsigned Update,
     VersionTuple SDKVersion) {
-  getAssembler().setDarwinTargetVariantBuildVersion(
-      (MachO::PlatformType)Platform, Major, Minor, Update, SDKVersion);
+  getWriter().setTargetVariantBuildVersion((MachO::PlatformType)Platform, Major,
+                                           Minor, Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitThumbFunc(MCSymbol *Symbol) {
@@ -550,9 +506,10 @@ void MCMachOStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
 
 void MCMachOStreamer::finalizeCGProfile() {
   MCAssembler &Asm = getAssembler();
-  if (Asm.CGProfile.empty())
+  MCObjectWriter &W = getWriter();
+  if (W.getCGProfile().empty())
     return;
-  for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
+  for (auto &E : W.getCGProfile()) {
     finalizeCGProfileEntry(E.From);
     finalizeCGProfileEntry(E.To);
   }
@@ -564,7 +521,7 @@ void MCMachOStreamer::finalizeCGProfile() {
   changeSection(CGProfileSection);
   // For each entry, reserve space for 2 32-bit indices and a 64-bit count.
   size_t SectionBytes =
-      Asm.CGProfile.size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
+      W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
   cast<MCDataFragment>(*CGProfileSection->begin())
       .getContents()
       .resize(SectionBytes);
@@ -576,9 +533,8 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
                                       std::unique_ptr<MCCodeEmitter> &&CE,
                                       bool DWARFMustBeAtTheEnd,
                                       bool LabelSections) {
-  MCMachOStreamer *S =
-      new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE),
-                          DWARFMustBeAtTheEnd, LabelSections);
+  MCMachOStreamer *S = new MCMachOStreamer(
+      Context, std::move(MAB), std::move(OW), std::move(CE), LabelSections);
   const Triple &Target = Context.getTargetTriple();
   S->emitVersionForTarget(
       Target, Context.getObjectFileInfo()->getSDKVersion(),
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 7a2b43a954b10..6dadd9752646f 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -948,52 +948,52 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) {
   DwarfAbbrevSection = Ctx->getXCOFFSection(
       ".dwabrev", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwabrev", XCOFF::SSUBTYP_DWABREV);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWABREV);
 
   DwarfInfoSection = Ctx->getXCOFFSection(
       ".dwinfo", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwinfo", XCOFF::SSUBTYP_DWINFO);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWINFO);
 
   DwarfLineSection = Ctx->getXCOFFSection(
       ".dwline", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwline", XCOFF::SSUBTYP_DWLINE);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWLINE);
 
   DwarfFrameSection = Ctx->getXCOFFSection(
       ".dwframe", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwframe", XCOFF::SSUBTYP_DWFRAME);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWFRAME);
 
   DwarfPubNamesSection = Ctx->getXCOFFSection(
       ".dwpbnms", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwpbnms", XCOFF::SSUBTYP_DWPBNMS);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWPBNMS);
 
   DwarfPubTypesSection = Ctx->getXCOFFSection(
       ".dwpbtyp", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwpbtyp", XCOFF::SSUBTYP_DWPBTYP);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWPBTYP);
 
   DwarfStrSection = Ctx->getXCOFFSection(
       ".dwstr", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwstr", XCOFF::SSUBTYP_DWSTR);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWSTR);
 
   DwarfLocSection = Ctx->getXCOFFSection(
       ".dwloc", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwloc", XCOFF::SSUBTYP_DWLOC);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWLOC);
 
   DwarfARangesSection = Ctx->getXCOFFSection(
       ".dwarnge", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwarnge", XCOFF::SSUBTYP_DWARNGE);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWARNGE);
 
   DwarfRangesSection = Ctx->getXCOFFSection(
       ".dwrnges", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwrnges", XCOFF::SSUBTYP_DWRNGES);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWRNGES);
 
   DwarfMacinfoSection = Ctx->getXCOFFSection(
       ".dwmac", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, ".dwmac", XCOFF::SSUBTYP_DWMAC);
+      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWMAC);
 }
 
 void MCObjectFileInfo::initDXContainerObjectFileInfo(const Triple &T) {
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index a72e34fe6fd33..9dc3974fd8f0d 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -32,8 +32,8 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
       Assembler(std::make_unique<MCAssembler>(
           Context, std::move(TAB), std::move(Emitter), std::move(OW))),
       EmitEHFrame(true), EmitDebugFrame(false) {
-  if (Assembler->getBackendPtr())
-    setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
+  assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr());
+  setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
   if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll)
     Assembler->setRelaxAll(true);
 }
@@ -784,15 +784,18 @@ void MCObjectStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLength,
 }
 
 void MCObjectStreamer::emitFileDirective(StringRef Filename) {
-  getAssembler().addFileName(Filename);
+  MCAssembler &Asm = getAssembler();
+  Asm.getWriter().addFileName(Asm, Filename);
 }
 
 void MCObjectStreamer::emitFileDirective(StringRef Filename,
                                          StringRef CompilerVersion,
                                          StringRef TimeStamp,
                                          StringRef Description) {
-  getAssembler().addFileName(Filename);
-  getAssembler().setCompilerVersion(CompilerVersion.str());
+  MCObjectWriter &W = getAssembler().getWriter();
+  W.addFileName(getAssembler(), Filename);
+  if (CompilerVersion.size())
+    W.setCompilerVersion(CompilerVersion);
   // TODO: add TimeStamp and Description to .file symbol table entry
   // with the integrated assembler.
 }
diff --git a/llvm/lib/MC/MCObjectWriter.cpp b/llvm/lib/MC/MCObjectWriter.cpp
index d321e581bbf0e..818e703514d61 100644
--- a/llvm/lib/MC/MCObjectWriter.cpp
+++ b/llvm/lib/MC/MCObjectWriter.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCSymbol.h"
@@ -18,6 +19,14 @@ using namespace llvm;
 
 MCObjectWriter::~MCObjectWriter() = default;
 
+void MCObjectWriter::reset() {
+  FileNames.clear();
+  AddrsigSyms.clear();
+  EmitAddrsigSection = false;
+  SubsectionsViaSymbols = false;
+  CGProfile.clear();
+}
+
 bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(
     const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B,
     bool InSet) const {
@@ -44,3 +53,7 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   // On ELF and COFF  A - B is absolute if A and B are in the same section.
   return &SecA == &SecB;
 }
+
+void MCObjectWriter::addFileName(MCAssembler &Asm, StringRef FileName) {
+  FileNames.emplace_back(std::string(FileName), Asm.Symbols.size());
+}
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index d05712bca73cd..1b9c624136e8b 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -658,7 +658,7 @@ class AsmParser : public MCAsmParser {
 
   bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
 
-  bool parseDirectiveAbort(); // ".abort"
+  bool parseDirectiveAbort(SMLoc DirectiveLoc); // ".abort"
   bool parseDirectiveInclude(); // ".include"
   bool parseDirectiveIncbin(); // ".incbin"
 
@@ -2120,7 +2120,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     case DK_LCOMM:
       return parseDirectiveComm(/*IsLocal=*/true);
     case DK_ABORT:
-      return parseDirectiveAbort();
+      return parseDirectiveAbort(IDLoc);
     case DK_INCLUDE:
       return parseDirectiveInclude();
     case DK_INCBIN:
@@ -3462,17 +3462,6 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
     }
   }
 
-  if (HasFillExpr && FillExpr != 0) {
-    MCSection *Sec = getStreamer().getCurrentSectionOnly();
-    if (Sec && Sec->isVirtualSection()) {
-      ReturnVal |=
-          Warning(FillExprLoc, "ignoring non-zero fill value in " +
-                                   Sec->getVirtualSectionKind() + " section '" +
-                                   Sec->getName() + "'");
-      FillExpr = 0;
-    }
-  }
-
   // Diagnose non-sensical max bytes to align.
   if (MaxBytesLoc.isValid()) {
     if (MaxBytesToFill < 1) {
@@ -3489,13 +3478,20 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
     }
   }
 
-  // Check whether we should use optimal code alignment for this .align
-  // directive.
   const MCSection *Section = getStreamer().getCurrentSectionOnly();
   assert(Section && "must have section to emit alignment");
-  bool useCodeAlign = Section->useCodeAlign();
-  if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
-      ValueSize == 1 && useCodeAlign) {
+
+  if (HasFillExpr && FillExpr != 0 && Section->isVirtualSection()) {
+    ReturnVal |=
+        Warning(FillExprLoc, "ignoring non-zero fill value in " +
+                                 Section->getVirtualSectionKind() +
+                                 " section '" + Section->getName() + "'");
+    FillExpr = 0;
+  }
+
+  // Check whether we should use optimal code alignment for this .align
+  // directive.
+  if (Section->useCodeAlign() && !HasFillExpr) {
     getStreamer().emitCodeAlignment(
         Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill);
   } else {
@@ -5095,21 +5091,17 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
 
 /// parseDirectiveAbort
 ///  ::= .abort [... message ...]
-bool AsmParser::parseDirectiveAbort() {
-  // FIXME: Use loc from directive.
-  SMLoc Loc = getLexer().getLoc();
-
+bool AsmParser::parseDirectiveAbort(SMLoc DirectiveLoc) {
   StringRef Str = parseStringToEndOfStatement();
   if (parseEOL())
     return true;
 
   if (Str.empty())
-    return Error(Loc, ".abort detected. Assembly stopping.");
-  else
-    return Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
-  // FIXME: Actually abort assembly here.
+    return Error(DirectiveLoc, ".abort detected. Assembly stopping");
 
-  return false;
+  // FIXME: Actually abort assembly here.
+  return Error(DirectiveLoc,
+               ".abort '" + Str + "' detected. Assembly stopping");
 }
 
 /// parseDirectiveInclude
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 00ca5d2067d58..41043b2aeb93f 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -124,7 +124,7 @@ void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
   if (getKind().isMetadata() && isDwarfSect()) {
     OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags())
        << '\n';
-    OS << MAI.getPrivateLabelPrefix() << getName() << ':' << '\n';
+    OS << getName() << ':' << '\n';
     return;
   }
 
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 538244cb8ba6a..92329e079f93a 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -27,6 +27,8 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbolCOFF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -45,7 +47,15 @@ MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCCodeEmitter> CE,
                                      std::unique_ptr<MCObjectWriter> OW)
     : MCObjectStreamer(Context, std::move(MAB), std::move(OW), std::move(CE)),
-      CurSymbol(nullptr) {}
+      CurSymbol(nullptr) {
+  auto *TO = Context.getTargetOptions();
+  if (TO && TO->MCIncrementalLinkerCompatible)
+    getWriter().setIncrementalLinkerCompatible(true);
+}
+
+WinCOFFObjectWriter &MCWinCOFFStreamer::getWriter() {
+  return static_cast<WinCOFFObjectWriter &>(getAssembler().getWriter());
+}
 
 void MCWinCOFFStreamer::emitInstToData(const MCInst &Inst,
                                        const MCSubtargetInfo &STI) {
@@ -350,7 +360,7 @@ void MCWinCOFFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
                                            uint64_t Count) {
   // Ignore temporary symbols for now.
   if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
-    getAssembler().CGProfile.push_back({From, To, Count});
+    getWriter().getCGProfile().push_back({From, To, Count});
 }
 
 void MCWinCOFFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
@@ -366,8 +376,8 @@ void MCWinCOFFStreamer::finishImpl() {
     switchSection(Asm.getContext().getCOFFSection(".llvm_addrsig",
                                                   COFF::IMAGE_SCN_LNK_REMOVE));
   }
-  if (!Asm.CGProfile.empty()) {
-    for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
+  if (!Asm.getWriter().getCGProfile().empty()) {
+    for (auto &E : Asm.getWriter().getCGProfile()) {
       finalizeCGProfileEntry(E.From);
       finalizeCGProfileEntry(E.To);
     }
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 175d7d6b6c31a..94aa1ebc8f9e1 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionXCOFF.h"
 #include "llvm/MC/MCSymbolXCOFF.h"
+#include "llvm/MC/MCXCOFFObjectWriter.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 
@@ -108,12 +109,14 @@ void MCXCOFFStreamer::emitXCOFFExceptDirective(const MCSymbol *Symbol,
                                                unsigned Lang, unsigned Reason,
                                                unsigned FunctionSize,
                                                bool hasDebug) {
-  getAssembler().getWriter().addExceptionEntry(Symbol, Trap, Lang, Reason,
-                                               FunctionSize, hasDebug);
+  // TODO: Export XCOFFObjectWriter to llvm/MC/MCXCOFFObjectWriter.h and access
+  // it from MCXCOFFStreamer.
+  XCOFF::addExceptionEntry(getAssembler().getWriter(), Symbol, Trap, Lang,
+                           Reason, FunctionSize, hasDebug);
 }
 
 void MCXCOFFStreamer::emitXCOFFCInfoSym(StringRef Name, StringRef Metadata) {
-  getAssembler().getWriter().addCInfoSymEntry(Name, Metadata);
+  XCOFF::addCInfoSymEntry(getAssembler().getWriter(), Name, Metadata);
 }
 
 void MCXCOFFStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -159,15 +162,6 @@ void MCXCOFFStreamer::emitInstToData(const MCInst &Inst,
   DF->getContents().append(Code.begin(), Code.end());
 }
 
-MCStreamer *llvm::createXCOFFStreamer(MCContext &Context,
-                                      std::unique_ptr<MCAsmBackend> &&MAB,
-                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                      std::unique_ptr<MCCodeEmitter> &&CE) {
-  MCXCOFFStreamer *S = new MCXCOFFStreamer(Context, std::move(MAB),
-                                           std::move(OW), std::move(CE));
-  return S;
-}
-
 void MCXCOFFStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
                                                  uint64_t Size,
                                                  MCSymbol *CsectSym,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index e58e095252d05..8596f457574a3 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -55,6 +55,12 @@ void MachObjectWriter::reset() {
   LocalSymbolData.clear();
   ExternalSymbolData.clear();
   UndefinedSymbolData.clear();
+  LOHContainer.reset();
+  VersionInfo.Major = 0;
+  VersionInfo.SDKVersion = VersionTuple();
+  TargetVariantVersionInfo.Major = 0;
+  TargetVariantVersionInfo.SDKVersion = VersionTuple();
+  LinkerOptions.clear();
   MCObjectWriter::reset();
 }
 
@@ -376,7 +382,7 @@ const MCSymbol &MachObjectWriter::findAliasedSymbol(const MCSymbol &Sym) const {
 
 void MachObjectWriter::writeNlist(MachSymbolData &MSD, const MCAssembler &Asm) {
   const MCSymbol *Symbol = MSD.Symbol;
-  const MCSymbol &Data = *Symbol;
+  const auto &Data = cast<MCSymbolMachO>(*Symbol);
   const MCSymbol *AliasedSymbol = &findAliasedSymbol(*Symbol);
   uint8_t SectionIndex = MSD.SectionIndex;
   uint8_t Type = 0;
@@ -738,7 +744,7 @@ bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
     if (!hasReliableSymbolDifference) {
       if (!SA.isInSection() || &SecA != &SecB ||
           (!SA.isTemporary() && FB.getAtom() != SA.getFragment()->getAtom() &&
-           Asm.getSubsectionsViaSymbols()))
+           SubsectionsViaSymbols))
         return false;
       return true;
     }
@@ -786,13 +792,13 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
                      UndefinedSymbolData);
 
-  if (!Asm.CGProfile.empty()) {
+  if (!CGProfile.empty()) {
     MCSection *CGProfileSection = Asm.getContext().getMachOSection(
         "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
     auto &Frag = cast<MCDataFragment>(*CGProfileSection->begin());
     Frag.getContents().clear();
     raw_svector_ostream OS(Frag.getContents());
-    for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
+    for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) {
       uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
       uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
       support::endian::write(OS, FromIndex, W.Endian);
@@ -802,7 +808,6 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   unsigned NumSections = Asm.end() - Asm.begin();
-  const MCAssembler::VersionInfoType &VersionInfo = Asm.getVersionInfo();
 
   // The section data starts after the header, the segment load command (and
   // section headers) and the symbol table.
@@ -820,9 +825,6 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
       LoadCommandsSize += sizeof(MachO::version_min_command);
   }
 
-  const MCAssembler::VersionInfoType &TargetVariantVersionInfo =
-      Asm.getDarwinTargetVariantVersionInfo();
-
   // Add the target variant version info load command size, if used.
   if (TargetVariantVersionInfo.Major != 0) {
     ++NumLoadCommands;
@@ -839,7 +841,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Add the loh load command size, if used.
-  uint64_t LOHRawSize = Asm.getLOHContainer().getEmitSize(Asm, *this);
+  uint64_t LOHRawSize = LOHContainer.getEmitSize(Asm, *this);
   uint64_t LOHSize = alignTo(LOHRawSize, is64Bit() ? 8 : 4);
   if (LOHSize) {
     ++NumLoadCommands;
@@ -856,7 +858,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Add the linker option load commands sizes.
-  for (const auto &Option : Asm.getLinkerOptions()) {
+  for (const auto &Option : LinkerOptions) {
     ++NumLoadCommands;
     LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(Option, is64Bit());
   }
@@ -892,7 +894,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   // Write the prolog, starting with the header and load command...
   writeHeader(MachO::MH_OBJECT, NumLoadCommands, LoadCommandsSize,
-              Asm.getSubsectionsViaSymbols());
+              SubsectionsViaSymbols);
   uint32_t Prot =
       MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
   writeSegmentLoadCommand("", NumSections, 0, VMSize, SectionDataStart,
@@ -927,7 +929,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   // Write out the deployment target information, if it's available.
   auto EmitDeploymentTargetVersion =
-      [&](const MCAssembler::VersionInfoType &VersionInfo) {
+      [&](const VersionInfoType &VersionInfo) {
         auto EncodeVersion = [](VersionTuple V) -> uint32_t {
           assert(!V.empty() && "empty version");
           unsigned Update = V.getSubminor().value_or(0);
@@ -1015,7 +1017,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Write the linker options load commands.
-  for (const auto &Option : Asm.getLinkerOptions())
+  for (const auto &Option : LinkerOptions)
     writeLinkerOptionsLoadCommand(Option);
 
   // Write the actual section data.
@@ -1063,7 +1065,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 #ifndef NDEBUG
     unsigned Start = W.OS.tell();
 #endif
-    Asm.getLOHContainer().emit(Asm, *this);
+    LOHContainer.emit(Asm, *this);
     // Pad to a multiple of the pointer size.
     W.OS.write_zeros(
         offsetToAlignment(LOHRawSize, is64Bit() ? Align(8) : Align(4)));
@@ -1106,10 +1108,3 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   return NumBytesWritten();
 }
-
-std::unique_ptr<MCObjectWriter>
-llvm::createMachObjectWriter(std::unique_ptr<MCMachObjectTargetWriter> MOTW,
-                             raw_pwrite_stream &OS, bool IsLittleEndian) {
-  return std::make_unique<MachObjectWriter>(std::move(MOTW), OS,
-                                             IsLittleEndian);
-}
diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp
index 91bde7ab62c1e..ef3d4b029e43e 100644
--- a/llvm/lib/MC/SPIRVObjectWriter.cpp
+++ b/llvm/lib/MC/SPIRVObjectWriter.cpp
@@ -14,47 +14,27 @@
 
 using namespace llvm;
 
-namespace {
-class SPIRVObjectWriter : public MCObjectWriter {
-  ::support::endian::Writer W;
-
-  /// The target specific SPIR-V writer instance.
-  std::unique_ptr<MCSPIRVObjectTargetWriter> TargetObjectWriter;
-
-public:
-  SPIRVObjectWriter(std::unique_ptr<MCSPIRVObjectTargetWriter> MOTW,
-                    raw_pwrite_stream &OS)
-      : W(OS, llvm::endianness::little), TargetObjectWriter(std::move(MOTW)) {}
-
-  ~SPIRVObjectWriter() override {}
-
-private:
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override {}
-
-  uint64_t writeObject(MCAssembler &Asm) override;
-  void writeHeader(const MCAssembler &Asm);
-};
-} // namespace
-
 void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) {
   constexpr uint32_t MagicNumber = 0x07230203;
   constexpr uint32_t GeneratorID = 43;
   constexpr uint32_t GeneratorMagicNumber =
       (GeneratorID << 16) | (LLVM_VERSION_MAJOR);
   constexpr uint32_t Schema = 0;
-  const MCAssembler::VersionInfoType &VIT = Asm.getVersionInfo();
-  uint32_t VersionNumber = 0 | (VIT.Major << 16) | (VIT.Minor << 8);
-  uint32_t Bound = VIT.Update;
 
   W.write<uint32_t>(MagicNumber);
-  W.write<uint32_t>(VersionNumber);
+  W.write<uint32_t>((VersionInfo.Major << 16) | (VersionInfo.Minor << 8));
   W.write<uint32_t>(GeneratorMagicNumber);
-  W.write<uint32_t>(Bound);
+  W.write<uint32_t>(VersionInfo.Bound);
   W.write<uint32_t>(Schema);
 }
 
+void SPIRVObjectWriter::setBuildVersion(unsigned Major, unsigned Minor,
+                                        unsigned Bound) {
+  VersionInfo.Major = Major;
+  VersionInfo.Minor = Minor;
+  VersionInfo.Bound = Bound;
+}
+
 uint64_t SPIRVObjectWriter::writeObject(MCAssembler &Asm) {
   uint64_t StartOffset = W.OS.tell();
   writeHeader(Asm);
diff --git a/llvm/lib/MC/TargetRegistry.cpp b/llvm/lib/MC/TargetRegistry.cpp
index 0aa48916c7d25..3be6f1d463499 100644
--- a/llvm/lib/MC/TargetRegistry.cpp
+++ b/llvm/lib/MC/TargetRegistry.cpp
@@ -9,6 +9,11 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <vector>
@@ -17,6 +22,94 @@ using namespace llvm;
 // Clients are responsible for avoid race conditions in registration.
 static Target *FirstTarget = nullptr;
 
+MCStreamer *Target::createMCObjectStreamer(
+    const Triple &T, MCContext &Ctx, std::unique_ptr<MCAsmBackend> TAB,
+    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
+    const MCSubtargetInfo &STI) const {
+  MCStreamer *S = nullptr;
+  switch (T.getObjectFormat()) {
+  case Triple::UnknownObjectFormat:
+    llvm_unreachable("Unknown object format");
+  case Triple::COFF:
+    assert((T.isOSWindows() || T.isUEFI()) &&
+           "only Windows and UEFI COFF are supported");
+    S = COFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
+                           std::move(Emitter));
+    break;
+  case Triple::MachO:
+    if (MachOStreamerCtorFn)
+      S = MachOStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
+                              std::move(Emitter));
+    else
+      S = createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
+                              std::move(Emitter), false);
+    break;
+  case Triple::ELF:
+    if (ELFStreamerCtorFn)
+      S = ELFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
+                            std::move(Emitter));
+    else
+      S = createELFStreamer(Ctx, std::move(TAB), std::move(OW),
+                            std::move(Emitter));
+    break;
+  case Triple::Wasm:
+    S = createWasmStreamer(Ctx, std::move(TAB), std::move(OW),
+                           std::move(Emitter));
+    break;
+  case Triple::GOFF:
+    S = createGOFFStreamer(Ctx, std::move(TAB), std::move(OW),
+                           std::move(Emitter));
+    break;
+  case Triple::XCOFF:
+    S = XCOFFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
+                            std::move(Emitter));
+    break;
+  case Triple::SPIRV:
+    S = createSPIRVStreamer(Ctx, std::move(TAB), std::move(OW),
+                            std::move(Emitter));
+    break;
+  case Triple::DXContainer:
+    S = createDXContainerStreamer(Ctx, std::move(TAB), std::move(OW),
+                                  std::move(Emitter));
+    break;
+  }
+  if (ObjectTargetStreamerCtorFn)
+    ObjectTargetStreamerCtorFn(*S, STI);
+  return S;
+}
+
+MCStreamer *Target::createMCObjectStreamer(
+    const Triple &T, MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
+    std::unique_ptr<MCObjectWriter> &&OW,
+    std::unique_ptr<MCCodeEmitter> &&Emitter, const MCSubtargetInfo &STI, bool,
+    bool, bool) const {
+  return createMCObjectStreamer(T, Ctx, std::move(TAB), std::move(OW),
+                                std::move(Emitter), STI);
+}
+
+MCStreamer *Target::createAsmStreamer(MCContext &Ctx,
+                                      std::unique_ptr<formatted_raw_ostream> OS,
+                                      MCInstPrinter *IP,
+                                      std::unique_ptr<MCCodeEmitter> CE,
+                                      std::unique_ptr<MCAsmBackend> TAB) const {
+  formatted_raw_ostream &OSRef = *OS;
+  MCStreamer *S = llvm::createAsmStreamer(Ctx, std::move(OS), IP,
+                                          std::move(CE), std::move(TAB));
+  createAsmTargetStreamer(*S, OSRef, IP);
+  return S;
+}
+
+MCStreamer *Target::createAsmStreamer(MCContext &Ctx,
+                                      std::unique_ptr<formatted_raw_ostream> OS,
+                                      bool IsVerboseAsm, bool UseDwarfDirectory,
+                                      MCInstPrinter *IP,
+                                      std::unique_ptr<MCCodeEmitter> &&CE,
+                                      std::unique_ptr<MCAsmBackend> &&TAB,
+                                      bool ShowInst) const {
+  return createAsmStreamer(Ctx, std::move(OS), IP, std::move(CE),
+                           std::move(TAB));
+}
+
 iterator_range<TargetRegistry::iterator> TargetRegistry::targets() {
   return make_range(iterator(FirstTarget), iterator());
 }
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 7ba38be7edba9..62f53423126ea 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -119,10 +119,9 @@ class COFFSection {
 
   SmallVector<COFFSymbol *, 1> OffsetSymbols;
 };
+} // namespace
 
-class WinCOFFObjectWriter;
-
-class WinCOFFWriter {
+class llvm::WinCOFFWriter {
   WinCOFFObjectWriter &OWriter;
   support::endian::Writer W;
 
@@ -196,41 +195,19 @@ class WinCOFFWriter {
   void assignFileOffsets(MCAssembler &Asm);
 };
 
-class WinCOFFObjectWriter : public MCObjectWriter {
-  friend class WinCOFFWriter;
-
-  std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
-  std::unique_ptr<WinCOFFWriter> ObjWriter, DwoWriter;
-
-public:
-  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
-                      raw_pwrite_stream &OS)
-      : TargetObjectWriter(std::move(MOTW)),
-        ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
-                                                  WinCOFFWriter::AllSections)) {
-  }
-  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
-                      raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS)
-      : TargetObjectWriter(std::move(MOTW)),
-        ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
-                                                  WinCOFFWriter::NonDwoOnly)),
-        DwoWriter(std::make_unique<WinCOFFWriter>(*this, DwoOS,
-                                                  WinCOFFWriter::DwoOnly)) {}
-
-  // MCObjectWriter interface implementation.
-  void reset() override;
-  void executePostLayoutBinding(MCAssembler &Asm) override;
-  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
-                                              const MCSymbol &SymA,
-                                              const MCFragment &FB, bool InSet,
-                                              bool IsPCRel) const override;
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override;
-  uint64_t writeObject(MCAssembler &Asm) override;
-};
-
-} // end anonymous namespace
+WinCOFFObjectWriter::WinCOFFObjectWriter(
+    std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
+    : TargetObjectWriter(std::move(MOTW)),
+      ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
+                                                WinCOFFWriter::AllSections)) {}
+WinCOFFObjectWriter::WinCOFFObjectWriter(
+    std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS,
+    raw_pwrite_stream &DwoOS)
+    : TargetObjectWriter(std::move(MOTW)),
+      ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
+                                                WinCOFFWriter::NonDwoOnly)),
+      DwoWriter(std::make_unique<WinCOFFWriter>(*this, DwoOS,
+                                                WinCOFFWriter::DwoOnly)) {}
 
 static bool isDwoSection(const MCSection &Sec) {
   return Sec.getName().ends_with(".dwo");
@@ -659,7 +636,7 @@ void WinCOFFWriter::writeSection(MCAssembler &Asm, const COFFSection &Sec) {
 
 // Create .file symbols.
 void WinCOFFWriter::createFileSymbols(MCAssembler &Asm) {
-  for (const std::pair<std::string, size_t> &It : Asm.getFileNames()) {
+  for (const std::pair<std::string, size_t> &It : OWriter.getFileNames()) {
     // round up to calculate the number of auxiliary symbols required
     const std::string &Name = It.first;
     unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
@@ -1107,12 +1084,12 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Create the contents of the .llvm.call-graph-profile section.
-  if (Mode != DwoOnly && !Asm.CGProfile.empty()) {
+  if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) {
     auto *Sec = Asm.getContext().getCOFFSection(
         ".llvm.call-graph-profile", COFF::IMAGE_SCN_LNK_REMOVE);
     auto *Frag = cast<MCDataFragment>(Sec->curFragList()->Head);
     raw_svector_ostream OS(Frag->getContents());
-    for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
+    for (const auto &CGPE : OWriter.getCGProfile()) {
       uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
       uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
       support::endian::write(OS, FromIndex, W.Endian);
@@ -1125,7 +1102,7 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
 
   // MS LINK expects to be able to use this timestamp to implement their
   // /INCREMENTAL feature.
-  if (Asm.isIncrementalLinkerCompatible()) {
+  if (OWriter.IncrementalLinkerCompatible) {
     Header.TimeDateStamp = getTime();
   } else {
     // Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
@@ -1174,6 +1151,7 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
 // MCObjectWriter interface implementations
 
 void WinCOFFObjectWriter::reset() {
+  IncrementalLinkerCompatible = false;
   ObjWriter->reset();
   if (DwoWriter)
     DwoWriter->reset();
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 2735142c576b1..124b31e870884 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -296,7 +296,6 @@ class XCOFFObjectWriter : public MCObjectWriter {
   uint64_t SymbolTableOffset = 0;
   uint16_t SectionCount = 0;
   uint32_t PaddingsBeforeDwarf = 0;
-  std::vector<std::pair<std::string, size_t>> FileNames;
   bool HasVisibility = false;
 
   support::endian::Writer W;
@@ -422,16 +421,12 @@ class XCOFFObjectWriter : public MCObjectWriter {
   void finalizeRelocationInfo(SectionEntry *Sec, uint64_t RelCount);
   void calcOffsetToRelocations(SectionEntry *Sec, uint64_t &RawPointer);
 
-  void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
-                         unsigned LanguageCode, unsigned ReasonCode,
-                         unsigned FunctionSize, bool hasDebug) override;
   bool hasExceptionSection() {
     return !ExceptionSection.ExceptionTable.empty();
   }
   unsigned getExceptionSectionSize();
   unsigned getExceptionOffset(const MCSymbol *Symbol);
 
-  void addCInfoSymEntry(StringRef Name, StringRef Metadata) override;
   size_t auxiliaryHeaderSize() const {
     // 64-bit object files have no auxiliary header.
     return HasVisibility && !is64Bit() ? XCOFF::AuxFileHeaderSizeShort : 0;
@@ -444,6 +439,11 @@ class XCOFFObjectWriter : public MCObjectWriter {
   void writeWord(uint64_t Word) {
     is64Bit() ? W.write<uint64_t>(Word) : W.write<uint32_t>(Word);
   }
+
+  void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
+                         unsigned LanguageCode, unsigned ReasonCode,
+                         unsigned FunctionSize, bool hasDebug);
+  void addCInfoSymEntry(StringRef Name, StringRef Metadata);
 };
 
 XCOFFObjectWriter::XCOFFObjectWriter(
@@ -596,6 +596,9 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
     const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(&S);
     const MCSectionXCOFF *ContainingCsect = getContainingCsect(XSym);
 
+    if (ContainingCsect->isDwarfSect())
+      continue;
+
     if (XSym->getVisibilityType() != XCOFF::SYM_V_UNSPECIFIED)
       HasVisibility = true;
 
@@ -634,7 +637,6 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   if (CISI && nameShouldBeInStringTable(CISI->Name))
     Strings.add(CISI->Name);
 
-  FileNames = Asm.getFileNames();
   // Emit ".file" as the source file name when there is no file name.
   if (FileNames.empty())
     FileNames.emplace_back(".file", 0);
@@ -647,7 +649,7 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   // the AUX_FILE auxiliary entry.
   if (nameShouldBeInStringTable(".file"))
     Strings.add(".file");
-  StringRef Vers = Asm.getCompilerVersion();
+  StringRef Vers = CompilerVersion;
   if (auxFileSymNameShouldBeInStringTable(Vers))
     Strings.add(Vers);
 
@@ -825,8 +827,6 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm) {
   // We always emit a timestamp of 0 for reproducibility, so ensure incremental
   // linking is not enabled, in case, like with Windows COFF, such a timestamp
   // is incompatible with incremental linking of XCOFF.
-  if (Asm.isIncrementalLinkerCompatible())
-    report_fatal_error("Incremental linking not supported for XCOFF.");
 
   finalizeSectionInfo();
   uint64_t StartOffset = W.OS.tell();
@@ -1159,7 +1159,7 @@ void XCOFFObjectWriter::writeRelocations() {
 
 void XCOFFObjectWriter::writeSymbolTable(MCAssembler &Asm) {
   // Write C_FILE symbols.
-  StringRef Vers = Asm.getCompilerVersion();
+  StringRef Vers = CompilerVersion;
 
   for (const std::pair<std::string, size_t> &F : FileNames) {
     // The n_name of a C_FILE symbol is the source file's name when no auxiliary
@@ -1415,7 +1415,7 @@ void XCOFFObjectWriter::assignAddressesAndIndices(MCAssembler &Asm) {
   // The symbol table starts with all the C_FILE symbols. Each C_FILE symbol
   // requires 1 or 2 auxiliary entries.
   uint32_t SymbolTableIndex =
-      (2 + (Asm.getCompilerVersion().empty() ? 0 : 1)) * FileNames.size();
+      (2 + (CompilerVersion.empty() ? 0 : 1)) * FileNames.size();
 
   if (CInfoSymSection.Entry)
     SymbolTableIndex++;
@@ -1738,3 +1738,18 @@ llvm::createXCOFFObjectWriter(std::unique_ptr<MCXCOFFObjectTargetWriter> MOTW,
                               raw_pwrite_stream &OS) {
   return std::make_unique<XCOFFObjectWriter>(std::move(MOTW), OS);
 }
+
+// TODO: Export XCOFFObjectWriter to llvm/MC/MCXCOFFObjectWriter.h and remove
+// the forwarders.
+void XCOFF::addExceptionEntry(MCObjectWriter &Writer, const MCSymbol *Symbol,
+                              const MCSymbol *Trap, unsigned LanguageCode,
+                              unsigned ReasonCode, unsigned FunctionSize,
+                              bool hasDebug) {
+  static_cast<XCOFFObjectWriter &>(Writer).addExceptionEntry(
+      Symbol, Trap, LanguageCode, ReasonCode, FunctionSize, hasDebug);
+}
+
+void XCOFF::addCInfoSymEntry(MCObjectWriter &Writer, StringRef Name,
+                             StringRef Metadata) {
+  static_cast<XCOFFObjectWriter &>(Writer).addCInfoSymEntry(Name, Metadata);
+}
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index c542c4e5f0743..78fc0c451e1a3 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -26,7 +26,8 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
       Common.DecompressDebugSections ||
       Common.DiscardMode == DiscardType::Locals ||
       !Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
-      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+      !Common.ChangeSectionAddress.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "option is not supported for COFF");
 
@@ -48,7 +49,8 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
       Common.DecompressDebugSections || Common.StripUnneeded ||
       Common.DiscardMode == DiscardType::Locals ||
       !Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
-      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+      !Common.ChangeSectionAddress.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "option is not supported for MachO");
 
@@ -68,7 +70,8 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
       !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
       !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() ||
       !Common.SymbolsToRename.empty() || Common.GapFill != 0 ||
-      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+      !Common.ChangeSectionAddress.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "only flags for section dumping, removal, and "
                              "addition are supported");
@@ -97,7 +100,8 @@ Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const {
       Common.StripDebug || Common.StripNonAlloc || Common.StripSections ||
       Common.Weaken || Common.StripUnneeded || Common.DecompressDebugSections ||
       Common.GapFill != 0 || Common.PadTo != 0 ||
-      Common.ChangeSectionLMAValAll != 0) {
+      Common.ChangeSectionLMAValAll != 0 ||
+      !Common.ChangeSectionAddress.empty()) {
     return createStringError(
         llvm::errc::invalid_argument,
         "no flags are supported yet, only basic copying is allowed");
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 075455c034154..40598861d4273 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -745,6 +745,56 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
     }
   }
 
+  if (!Config.ChangeSectionAddress.empty()) {
+    if (Obj.Type != ELF::ET_REL)
+      return createStringError(
+          object_error::invalid_file_type,
+          "cannot change section address in a non-relocatable file");
+
+    StringMap<AddressUpdate> SectionsToUpdateAddress;
+    for (const SectionPatternAddressUpdate &PatternUpdate :
+         make_range(Config.ChangeSectionAddress.rbegin(),
+                    Config.ChangeSectionAddress.rend())) {
+      for (SectionBase &Sec : Obj.sections()) {
+        if (PatternUpdate.SectionPattern.matches(Sec.Name) &&
+            SectionsToUpdateAddress.try_emplace(Sec.Name, PatternUpdate.Update)
+                .second) {
+          if (PatternUpdate.Update.Kind == AdjustKind::Subtract &&
+              Sec.Addr < PatternUpdate.Update.Value) {
+            return createStringError(
+                errc::invalid_argument,
+                "address 0x" + Twine::utohexstr(Sec.Addr) +
+                    " cannot be decreased by 0x" +
+                    Twine::utohexstr(PatternUpdate.Update.Value) +
+                    ". The result would underflow");
+          }
+          if (PatternUpdate.Update.Kind == AdjustKind::Add &&
+              Sec.Addr > std::numeric_limits<uint64_t>::max() -
+                             PatternUpdate.Update.Value) {
+            return createStringError(
+                errc::invalid_argument,
+                "address 0x" + Twine::utohexstr(Sec.Addr) +
+                    " cannot be increased by 0x" +
+                    Twine::utohexstr(PatternUpdate.Update.Value) +
+                    ". The result would overflow");
+          }
+
+          switch (PatternUpdate.Update.Kind) {
+          case (AdjustKind::Set):
+            Sec.Addr = PatternUpdate.Update.Value;
+            break;
+          case (AdjustKind::Subtract):
+            Sec.Addr -= PatternUpdate.Update.Value;
+            break;
+          case (AdjustKind::Add):
+            Sec.Addr += PatternUpdate.Update.Value;
+            break;
+          }
+        }
+      }
+    }
+  }
+
   if (Config.OnlyKeepDebug)
     for (auto &Sec : Obj.sections())
       if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE)
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index 34f12cf0111cf..114045561366d 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -482,7 +482,8 @@ static uint64_t computeHeadersSize(object::Archive::Kind Kind,
 }
 
 static Expected<std::unique_ptr<SymbolicFile>>
-getSymbolicFile(MemoryBufferRef Buf, LLVMContext &Context) {
+getSymbolicFile(MemoryBufferRef Buf, LLVMContext &Context,
+                object::Archive::Kind Kind, function_ref<void(Error)> Warn) {
   const file_magic Type = identify_magic(Buf.getBuffer());
   // Don't attempt to read non-symbolic file types.
   if (!object::SymbolicFile::isSymbolicFile(Type, &Context))
@@ -490,8 +491,36 @@ getSymbolicFile(MemoryBufferRef Buf, LLVMContext &Context) {
   if (Type == file_magic::bitcode) {
     auto ObjOrErr = object::SymbolicFile::createSymbolicFile(
         Buf, file_magic::bitcode, &Context);
-    if (!ObjOrErr)
-      return ObjOrErr.takeError();
+    // An error reading a bitcode file most likely indicates that the file
+    // was created by a compiler from the future. Normally we don't try to
+    // implement forwards compatibility for bitcode files, but when creating an
+    // archive we can implement best-effort forwards compatibility by treating
+    // the file as a blob and not creating symbol index entries for it. lld and
+    // mold ignore the archive symbol index, so provided that you use one of
+    // these linkers, LTO will work as long as lld or the gold plugin is newer
+    // than the compiler. We only ignore errors if the archive format is one
+    // that is supported by a linker that is known to ignore the index,
+    // otherwise there's no chance of this working so we may as well error out.
+    // We print a warning on read failure so that users of linkers that rely on
+    // the symbol index can diagnose the issue.
+    //
+    // This is the same behavior as GNU ar when the linker plugin returns an
+    // error when reading the input file. If the bitcode file is actually
+    // malformed, it will be diagnosed at link time.
+    if (!ObjOrErr) {
+      switch (Kind) {
+      case object::Archive::K_BSD:
+      case object::Archive::K_GNU:
+      case object::Archive::K_GNU64:
+        Warn(ObjOrErr.takeError());
+        return nullptr;
+      case object::Archive::K_AIXBIG:
+      case object::Archive::K_COFF:
+      case object::Archive::K_DARWIN:
+      case object::Archive::K_DARWIN64:
+        return ObjOrErr.takeError();
+      }
+    }
     return std::move(*ObjOrErr);
   } else {
     auto ObjOrErr = object::SymbolicFile::createSymbolicFile(Buf);
@@ -751,7 +780,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
                   object::Archive::Kind Kind, bool Thin, bool Deterministic,
                   SymtabWritingMode NeedSymbols, SymMap *SymMap,
                   LLVMContext &Context, ArrayRef<NewArchiveMember> NewMembers,
-                  std::optional<bool> IsEC) {
+                  std::optional<bool> IsEC, function_ref<void(Error)> Warn) {
   static char PaddingData[8] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'};
   uint64_t MemHeadPadSize = 0;
   uint64_t Pos =
@@ -819,8 +848,10 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
 
   if (NeedSymbols != SymtabWritingMode::NoSymtab || isAIXBigArchive(Kind)) {
     for (const NewArchiveMember &M : NewMembers) {
-      Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr =
-          getSymbolicFile(M.Buf->getMemBufferRef(), Context);
+      Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr = getSymbolicFile(
+          M.Buf->getMemBufferRef(), Context, Kind, [&](Error Err) {
+            Warn(createFileError(M.MemberName, std::move(Err)));
+          });
       if (!SymFileOrErr)
         return createFileError(M.MemberName, SymFileOrErr.takeError());
       SymFiles.push_back(std::move(*SymFileOrErr));
@@ -1001,7 +1032,8 @@ Error writeArchiveToStream(raw_ostream &Out,
                            ArrayRef<NewArchiveMember> NewMembers,
                            SymtabWritingMode WriteSymtab,
                            object::Archive::Kind Kind, bool Deterministic,
-                           bool Thin, std::optional<bool> IsEC) {
+                           bool Thin, std::optional<bool> IsEC,
+                           function_ref<void(Error)> Warn) {
   assert((!Thin || !isBSDLike(Kind)) && "Only the gnu format has a thin mode");
 
   SmallString<0> SymNamesBuf;
@@ -1023,7 +1055,7 @@ Error writeArchiveToStream(raw_ostream &Out,
 
   Expected<std::vector<MemberData>> DataOrErr = computeMemberData(
       StringTable, SymNames, Kind, Thin, Deterministic, WriteSymtab,
-      isCOFFArchive(Kind) ? &SymMap : nullptr, Context, NewMembers, IsEC);
+      isCOFFArchive(Kind) ? &SymMap : nullptr, Context, NewMembers, IsEC, Warn);
   if (Error E = DataOrErr.takeError())
     return E;
   std::vector<MemberData> &Data = *DataOrErr;
@@ -1266,11 +1298,15 @@ Error writeArchiveToStream(raw_ostream &Out,
   return Error::success();
 }
 
+void warnToStderr(Error Err) {
+  llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "warning: ");
+}
+
 Error writeArchive(StringRef ArcName, ArrayRef<NewArchiveMember> NewMembers,
                    SymtabWritingMode WriteSymtab, object::Archive::Kind Kind,
                    bool Deterministic, bool Thin,
                    std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                   std::optional<bool> IsEC) {
+                   std::optional<bool> IsEC, function_ref<void(Error)> Warn) {
   Expected<sys::fs::TempFile> Temp =
       sys::fs::TempFile::create(ArcName + ".temp-archive-%%%%%%%.a");
   if (!Temp)
@@ -1278,7 +1314,7 @@ Error writeArchive(StringRef ArcName, ArrayRef<NewArchiveMember> NewMembers,
   raw_fd_ostream Out(Temp->FD, false);
 
   if (Error E = writeArchiveToStream(Out, NewMembers, WriteSymtab, Kind,
-                                     Deterministic, Thin, IsEC)) {
+                                     Deterministic, Thin, IsEC, Warn)) {
     if (Error DiscardError = Temp->discard())
       return joinErrors(std::move(E), std::move(DiscardError));
     return E;
@@ -1302,12 +1338,14 @@ Error writeArchive(StringRef ArcName, ArrayRef<NewArchiveMember> NewMembers,
 Expected<std::unique_ptr<MemoryBuffer>>
 writeArchiveToBuffer(ArrayRef<NewArchiveMember> NewMembers,
                      SymtabWritingMode WriteSymtab, object::Archive::Kind Kind,
-                     bool Deterministic, bool Thin) {
+                     bool Deterministic, bool Thin,
+                     function_ref<void(Error)> Warn) {
   SmallVector<char, 0> ArchiveBufferVector;
   raw_svector_ostream ArchiveStream(ArchiveBufferVector);
 
-  if (Error E = writeArchiveToStream(ArchiveStream, NewMembers, WriteSymtab,
-                                     Kind, Deterministic, Thin, std::nullopt))
+  if (Error E =
+          writeArchiveToStream(ArchiveStream, NewMembers, WriteSymtab, Kind,
+                               Deterministic, Thin, std::nullopt, Warn))
     return std::move(E);
 
   return std::make_unique<SmallVectorMemoryBuffer>(
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index 2458a53cb6d54..e67b02405a3a1 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -729,7 +729,10 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
         else if (Machine == IMAGE_FILE_MACHINE_I386 &&
                  applyNameType(IMPORT_NAME_NOPREFIX, Name) == E.ImportName)
           NameType = IMPORT_NAME_NOPREFIX;
-        else if (Name == E.ImportName)
+        else if (isArm64EC(M)) {
+          NameType = IMPORT_NAME_EXPORTAS;
+          ExportName = E.ImportName;
+        } else if (Name == E.ImportName)
           NameType = IMPORT_NAME;
         else {
           Deferred D;
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 812b2c00ba699..ff55a847c3432 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -5193,11 +5193,6 @@ MachOObjectFile::getDyldChainedFixupTargets() const {
   const char *Symbols = Contents + Header.symbols_offset;
   const char *SymbolsEnd = Contents + DyldChainedFixups.datasize;
 
-  if (ImportsEnd > Symbols)
-    return malformedError("bad chained fixups: imports end " +
-                          Twine(ImportsEndOffset) + " extends past end " +
-                          Twine(DyldChainedFixups.datasize));
-
   if (ImportsEnd > Symbols)
     return malformedError("bad chained fixups: imports end " +
                           Twine(ImportsEndOffset) + " overlaps with symbols");
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1b8110b06f4a2..0602c4a0a75ae 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -862,6 +862,8 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
       Result.hoistCommonInsts(Enable);
     } else if (ParamName == "sink-common-insts") {
       Result.sinkCommonInsts(Enable);
+    } else if (ParamName == "speculate-unpredictables") {
+      Result.speculateUnpredictables(Enable);
     } else if (Enable && ParamName.consume_front("bonus-inst-threshold=")) {
       APInt BonusInstThreshold;
       if (ParamName.getAsInteger(0, BonusInstThreshold))
@@ -1197,7 +1199,7 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) {
     std::tie(ParamName, Params) = Params.split(';');
 
     if (ParamName.consume_front("filter=")) {
-      std::optional<RegClassFilterFunc> Filter =
+      std::optional<RegAllocFilterFunc> Filter =
           PB.parseRegAllocFilter(ParamName);
       if (!Filter) {
         return make_error<StringError>(
@@ -2207,7 +2209,7 @@ Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   return Error::success();
 }
 
-std::optional<RegClassFilterFunc>
+std::optional<RegAllocFilterFunc>
 PassBuilder::parseRegAllocFilter(StringRef FilterName) {
   if (FilterName == "all")
     return nullptr;
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index ef77c88454798..671ceaf51e6bb 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -300,6 +301,9 @@ static cl::opt<AttributorRunOption> AttributorRun(
                clEnumValN(AttributorRunOption::NONE, "none",
                           "disable attributor runs")));
 
+static cl::opt<bool> EnableSampledInstr(
+    "enable-sampled-instrumentation", cl::init(false), cl::Hidden,
+    cl::desc("Enable profile instrumentation sampling (default = off)"));
 static cl::opt<bool> UseLoopVersioningLICM(
     "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
     cl::desc("Enable the experimental Loop Versioning LICM pass"));
@@ -870,6 +874,12 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
   // Do counter promotion at Level greater than O0.
   Options.DoCounterPromotion = true;
   Options.UseBFIInPromotion = IsCS;
+  if (EnableSampledInstr) {
+    Options.Sampling = true;
+    // With sampling, there is little beneifit to enable counter promotion.
+    // But note that sampling does work with counter promotion.
+    Options.DoCounterPromotion = false;
+  }
   Options.Atomic = AtomicCounterUpdate;
   MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
 }
@@ -992,7 +1002,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
 
-  MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
 
   // Make sure we don't affect potential future NoRerun CGSCC adaptors.
   MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1034,8 +1045,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
       buildFunctionSimplificationPipeline(Level, Phase),
       PTO.EagerlyInvalidateAnalyses));
 
-  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-      CoroSplitPass(Level != OptimizationLevel::O0)));
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+        CoroSplitPass(Level != OptimizationLevel::O0)));
 
   return MPM;
 }
@@ -1208,7 +1220,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     MPM.addPass(PGOIndirectCallPromotion(false, false));
 
   if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
-    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
+    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
+                                               EnableSampledInstr));
 
   if (IsMemprofUse)
     MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
@@ -1231,7 +1244,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   // and argument promotion.
   MPM.addPass(DeadArgumentEliminationPass());
 
-  MPM.addPass(CoroCleanupPass());
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MPM.addPass(CoroCleanupPass());
 
   // Optimize globals now that functions are fully simplified.
   MPM.addPass(GlobalOptPass());
@@ -1531,8 +1545,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
-  OptimizePM.addPass(
-      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
+  OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                         .convertSwitchRangeToICmp(true)
+                                         .speculateUnpredictables(true)));
 
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
@@ -1889,6 +1904,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                 PTO.EagerlyInvalidateAnalyses));
 
+  // Lower variadic functions for supported targets prior to inlining.
+  MPM.addPass(ExpandVariadicsPass(ExpandVariadicsMode::Optimize));
+
   // Note: historically, the PruneEH pass was run first to deduce nounwind and
   // generally clean up exception handling overhead. It isn't clear this is
   // valuable as the inliner doesn't currently care whether it is inlining an
@@ -2050,9 +2068,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   LateFPM.addPass(DivRemPairsPass());
 
   // Delete basic blocks, which optimization passes may have killed.
-  LateFPM.addPass(SimplifyCFGPass(
-      SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts(
-          true)));
+  LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                      .convertSwitchRangeToICmp(true)
+                                      .hoistCommonInsts(true)
+                                      .speculateUnpredictables(true)));
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
 
   // Drop bodies of available eternally objects to improve GlobalDCE.
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index fc7b82d522bf0..8f2461f40cb00 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/ADT/Any.h"
-#include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LazyCallGraph.h"
@@ -44,6 +43,7 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -753,28 +753,27 @@ static SmallString<32> getIRFileDisplayName(Any IR) {
   SmallString<32> Result;
   raw_svector_ostream ResultStream(Result);
   const Module *M = unwrapModule(IR);
-  stable_hash NameHash = stable_hash_combine_string(M->getName());
-  unsigned int MaxHashWidth = sizeof(stable_hash) * 8 / 4;
+  uint64_t NameHash = xxh3_64bits(M->getName());
+  unsigned MaxHashWidth = sizeof(uint64_t) * 2;
   write_hex(ResultStream, NameHash, HexPrintStyle::Lower, MaxHashWidth);
   if (unwrapIR<Module>(IR)) {
     ResultStream << "-module";
   } else if (const auto *F = unwrapIR<Function>(IR)) {
     ResultStream << "-function-";
-    stable_hash FunctionNameHash = stable_hash_combine_string(F->getName());
+    auto FunctionNameHash = xxh3_64bits(F->getName());
     write_hex(ResultStream, FunctionNameHash, HexPrintStyle::Lower,
               MaxHashWidth);
   } else if (const auto *C = unwrapIR<LazyCallGraph::SCC>(IR)) {
     ResultStream << "-scc-";
-    stable_hash SCCNameHash = stable_hash_combine_string(C->getName());
+    auto SCCNameHash = xxh3_64bits(C->getName());
     write_hex(ResultStream, SCCNameHash, HexPrintStyle::Lower, MaxHashWidth);
   } else if (const auto *L = unwrapIR<Loop>(IR)) {
     ResultStream << "-loop-";
-    stable_hash LoopNameHash = stable_hash_combine_string(L->getName());
+    auto LoopNameHash = xxh3_64bits(L->getName());
     write_hex(ResultStream, LoopNameHash, HexPrintStyle::Lower, MaxHashWidth);
   } else if (const auto *MF = unwrapIR<MachineFunction>(IR)) {
     ResultStream << "-machine-function-";
-    stable_hash MachineFunctionNameHash =
-        stable_hash_combine_string(MF->getName());
+    auto MachineFunctionNameHash = xxh3_64bits(MF->getName());
     write_hex(ResultStream, MachineFunctionNameHash, HexPrintStyle::Lower,
               MaxHashWidth);
   } else {
diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
index 1b42d8c765f2d..0a0e7db457fa8 100644
--- a/llvm/lib/ProfileData/PGOCtxProfReader.cpp
+++ b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/PGOCtxProfReader.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/Bitstream/BitstreamReader.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -139,6 +140,20 @@ PGOCtxProfileReader::readContext(bool ExpectIndex) {
 }
 
 Error PGOCtxProfileReader::readMetadata() {
+  if (Magic.size() < PGOCtxProfileWriter::ContainerMagic.size() ||
+      Magic != PGOCtxProfileWriter::ContainerMagic)
+    return make_error<InstrProfError>(instrprof_error::invalid_prof,
+                                      "Invalid magic");
+
+  BitstreamEntry Entry;
+  RET_ON_ERR(Cursor.advance().moveInto(Entry));
+  if (Entry.Kind != BitstreamEntry::SubBlock ||
+      Entry.ID != bitc::BLOCKINFO_BLOCK_ID)
+    return unsupported("Expected Block ID");
+  // We don't need the blockinfo to read the rest, it's metadata usable for e.g.
+  // llvm-bcanalyzer.
+  RET_ON_ERR(Cursor.SkipBlock());
+
   EXPECT_OR_RET(Blk, advance());
   if (Blk->Kind != BitstreamEntry::SubBlock)
     return unsupported("Expected Version record");
diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
index 5081797564469..74cd8763cc769 100644
--- a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
+++ b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
@@ -16,6 +16,40 @@
 using namespace llvm;
 using namespace llvm::ctx_profile;
 
+PGOCtxProfileWriter::PGOCtxProfileWriter(
+    raw_ostream &Out, std::optional<unsigned> VersionOverride)
+    : Writer(Out, 0) {
+  static_assert(ContainerMagic.size() == 4);
+  Out.write(ContainerMagic.data(), ContainerMagic.size());
+  Writer.EnterBlockInfoBlock();
+  {
+    auto DescribeBlock = [&](unsigned ID, StringRef Name) {
+      Writer.EmitRecord(bitc::BLOCKINFO_CODE_SETBID,
+                        SmallVector<unsigned, 1>{ID});
+      Writer.EmitRecord(bitc::BLOCKINFO_CODE_BLOCKNAME,
+                        llvm::arrayRefFromStringRef(Name));
+    };
+    SmallVector<uint64_t, 16> Data;
+    auto DescribeRecord = [&](unsigned RecordID, StringRef Name) {
+      Data.clear();
+      Data.push_back(RecordID);
+      llvm::append_range(Data, Name);
+      Writer.EmitRecord(bitc::BLOCKINFO_CODE_SETRECORDNAME, Data);
+    };
+    DescribeBlock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID, "Metadata");
+    DescribeRecord(PGOCtxProfileRecords::Version, "Version");
+    DescribeBlock(PGOCtxProfileBlockIDs::ContextNodeBlockID, "Context");
+    DescribeRecord(PGOCtxProfileRecords::Guid, "GUID");
+    DescribeRecord(PGOCtxProfileRecords::CalleeIndex, "CalleeIndex");
+    DescribeRecord(PGOCtxProfileRecords::Counters, "Counters");
+  }
+  Writer.ExitBlock();
+  Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID, CodeLen);
+  const auto Version = VersionOverride ? *VersionOverride : CurrentVersion;
+  Writer.EmitRecord(PGOCtxProfileRecords::Version,
+                    SmallVector<unsigned, 1>({Version}));
+}
+
 void PGOCtxProfileWriter::writeCounters(const ContextNode &Node) {
   Writer.EmitCode(bitc::UNABBREV_RECORD);
   Writer.EmitVBR(PGOCtxProfileRecords::Counters, VBREncodingBits);
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index 3eb84ba9864cb..892bbcaaf98a9 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -4,9 +4,9 @@ if (NOT TARGET LLVMGenXIntrinsics)
   if (NOT DEFINED LLVMGenXIntrinsics_SOURCE_DIR)
     set(LLVMGenXIntrinsics_GIT_REPO https://github.com/intel/vc-intrinsics.git)
 
-    # Date: 08 May 2024
-    # Fix warning from switch
-    set(LLVMGenXIntrinsics_GIT_TAG b2565a03eb3cac07f5e8000fde971f95dc782c75)
+    # Date: Jul 25, 2024
+    # Support MMX removal in LLVM upstream
+    set(LLVMGenXIntrinsics_GIT_TAG e1e2f81165ae75576a0201f342e7f1ea2ca40273)
 
     message(STATUS "vc-intrinsics repo is missing. Will try to download it from ${LLVMGenXIntrinsics_GIT_REPO}")
     include(FetchContent)
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 944869a37989c..a67cb2a5bb25a 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/SandboxIR/SandboxIR.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/Debug.h"
 #include <sstream>
@@ -16,10 +17,22 @@ using namespace llvm::sandboxir;
 
 Value *Use::get() const { return Ctx->getValue(LLVMUse->get()); }
 
-void Use::set(Value *V) { LLVMUse->set(V->Val); }
+void Use::set(Value *V) {
+  auto &Tracker = Ctx->getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<UseSet>(*this, Tracker));
+  LLVMUse->set(V->Val);
+}
 
 unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); }
 
+void Use::swap(Use &OtherUse) {
+  auto &Tracker = Ctx->getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<UseSwap>(*this, OtherUse, Tracker));
+  LLVMUse->swap(*OtherUse.LLVMUse);
+}
+
 #ifndef NDEBUG
 void Use::dump(raw_ostream &OS) const {
   Value *Def = nullptr;
@@ -77,6 +90,25 @@ UserUseIterator &UserUseIterator::operator++() {
   return *this;
 }
 
+OperandUseIterator OperandUseIterator::operator+(unsigned Num) const {
+  sandboxir::Use U = Use.getUser()->getOperandUseInternal(
+      Use.getOperandNo() + Num, /*Verify=*/true);
+  return OperandUseIterator(U);
+}
+
+OperandUseIterator OperandUseIterator::operator-(unsigned Num) const {
+  assert(Use.getOperandNo() >= Num && "Out of bounds!");
+  sandboxir::Use U = Use.getUser()->getOperandUseInternal(
+      Use.getOperandNo() - Num, /*Verify=*/true);
+  return OperandUseIterator(U);
+}
+
+int OperandUseIterator::operator-(const OperandUseIterator &Other) const {
+  int ThisOpNo = Use.getOperandNo();
+  int OtherOpNo = Other.Use.getOperandNo();
+  return ThisOpNo - OtherOpNo;
+}
+
 Value::Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx)
     : SubclassID(SubclassID), Val(Val), Ctx(Ctx) {
 #ifndef NDEBUG
@@ -140,14 +172,14 @@ void Value::replaceAllUsesWith(Value *Other) {
 }
 
 #ifndef NDEBUG
-std::string Value::getName() const {
+std::string Value::getUid() const {
   std::stringstream SS;
   SS << "SB" << UID << ".";
   return SS.str();
 }
 
 void Value::dumpCommonHeader(raw_ostream &OS) const {
-  OS << getName() << " " << getSubclassIDStr(SubclassID) << " ";
+  OS << getUid() << " " << getSubclassIDStr(SubclassID) << " ";
 }
 
 void Value::dumpCommonFooter(raw_ostream &OS) const {
@@ -167,7 +199,7 @@ void Value::dumpCommonPrefix(raw_ostream &OS) const {
 }
 
 void Value::dumpCommonSuffix(raw_ostream &OS) const {
-  OS << " ; " << getName() << " (" << getSubclassIDStr(SubclassID) << ")";
+  OS << " ; " << getUid() << " (" << getSubclassIDStr(SubclassID) << ")";
 }
 
 void Value::printAsOperandCommon(raw_ostream &OS) const {
@@ -281,11 +313,10 @@ BBIterator &BBIterator::operator--() {
 
 const char *Instruction::getOpcodeName(Opcode Opc) {
   switch (Opc) {
-#define DEF_VALUE(ID, CLASS)
-#define DEF_USER(ID, CLASS)
 #define OP(OPC)                                                                \
   case Opcode::OPC:                                                            \
     return #OPC;
+#define OPCODES(...) __VA_ARGS__
 #define DEF_INSTR(ID, OPC, CLASS) OPC
 #include "llvm/SandboxIR/SandboxIRValues.def"
   }
@@ -333,6 +364,10 @@ Instruction *Instruction::getPrevNode() const {
 }
 
 void Instruction::removeFromParent() {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<RemoveFromParent>(this, Tracker));
+
   // Detach all the LLVM IR instructions from their parent BB.
   for (llvm::Instruction *I : getLLVMInstrs())
     I->removeFromParent();
@@ -341,17 +376,37 @@ void Instruction::removeFromParent() {
 void Instruction::eraseFromParent() {
   assert(users().empty() && "Still connected to users, can't erase!");
   std::unique_ptr<Value> Detached = Ctx.detach(this);
-  // We don't have Tracking yet, so just erase the LLVM IR instructions.
-  // Erase in reverse to avoid erasing nstructions with attached uses.
-  auto Instrs = getLLVMInstrs();
-  for (llvm::Instruction *I : reverse(Instrs))
-    I->eraseFromParent();
+  auto LLVMInstrs = getLLVMInstrs();
+
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking()) {
+    Tracker.track(
+        std::make_unique<EraseFromParent>(std::move(Detached), Tracker));
+    // We don't actually delete the IR instruction, because then it would be
+    // impossible to bring it back from the dead at the same memory location.
+    // Instead we remove it from its BB and track its current location.
+    for (llvm::Instruction *I : LLVMInstrs)
+      I->removeFromParent();
+    // TODO: Multi-instructions need special treatment because some of the
+    // references are internal to the instruction.
+    for (llvm::Instruction *I : LLVMInstrs)
+      I->dropAllReferences();
+  } else {
+    // Erase in reverse to avoid erasing nstructions with attached uses.
+    for (llvm::Instruction *I : reverse(LLVMInstrs))
+      I->eraseFromParent();
+  }
 }
 
 void Instruction::moveBefore(BasicBlock &BB, const BBIterator &WhereIt) {
   if (std::next(getIterator()) == WhereIt)
     // Destination is same as origin, nothing to do.
     return;
+
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<MoveInstr>(this, Tracker));
+
   auto *LLVMBB = cast<llvm::BasicBlock>(BB.Val);
   llvm::BasicBlock::iterator It;
   if (WhereIt == BB.end()) {
@@ -429,6 +484,762 @@ void Instruction::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
+#endif // NDEBUG
+
+Value *SelectInst::createCommon(Value *Cond, Value *True, Value *False,
+                                const Twine &Name, IRBuilder<> &Builder,
+                                Context &Ctx) {
+  llvm::Value *NewV =
+      Builder.CreateSelect(Cond->Val, True->Val, False->Val, Name);
+  if (auto *NewSI = dyn_cast<llvm::SelectInst>(NewV))
+    return Ctx.createSelectInst(NewSI);
+  assert(isa<llvm::Constant>(NewV) && "Expected constant");
+  return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
+}
+
+Value *SelectInst::create(Value *Cond, Value *True, Value *False,
+                          Instruction *InsertBefore, Context &Ctx,
+                          const Twine &Name) {
+  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(BeforeIR);
+  return createCommon(Cond, True, False, Name, Builder, Ctx);
+}
+
+Value *SelectInst::create(Value *Cond, Value *True, Value *False,
+                          BasicBlock *InsertAtEnd, Context &Ctx,
+                          const Twine &Name) {
+  auto *IRInsertAtEnd = cast<llvm::BasicBlock>(InsertAtEnd->Val);
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(IRInsertAtEnd);
+  return createCommon(Cond, True, False, Name, Builder, Ctx);
+}
+
+bool SelectInst::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::Select;
+}
+
+#ifndef NDEBUG
+void SelectInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void SelectInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+BranchInst *BranchInst::create(BasicBlock *IfTrue, Instruction *InsertBefore,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::Instruction>(InsertBefore->Val));
+  llvm::BranchInst *NewBr =
+      Builder.CreateBr(cast<llvm::BasicBlock>(IfTrue->Val));
+  return Ctx.createBranchInst(NewBr);
+}
+
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  llvm::BranchInst *NewBr =
+      Builder.CreateBr(cast<llvm::BasicBlock>(IfTrue->Val));
+  return Ctx.createBranchInst(NewBr);
+}
+
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                               Value *Cond, Instruction *InsertBefore,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::Instruction>(InsertBefore->Val));
+  llvm::BranchInst *NewBr =
+      Builder.CreateCondBr(Cond->Val, cast<llvm::BasicBlock>(IfTrue->Val),
+                           cast<llvm::BasicBlock>(IfFalse->Val));
+  return Ctx.createBranchInst(NewBr);
+}
+
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                               Value *Cond, BasicBlock *InsertAtEnd,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  llvm::BranchInst *NewBr =
+      Builder.CreateCondBr(Cond->Val, cast<llvm::BasicBlock>(IfTrue->Val),
+                           cast<llvm::BasicBlock>(IfFalse->Val));
+  return Ctx.createBranchInst(NewBr);
+}
+
+bool BranchInst::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::Br;
+}
+
+Value *BranchInst::getCondition() const {
+  assert(isConditional() && "Cannot get condition of an uncond branch!");
+  return Ctx.getValue(cast<llvm::BranchInst>(Val)->getCondition());
+}
+
+BasicBlock *BranchInst::getSuccessor(unsigned SuccIdx) const {
+  assert(SuccIdx < getNumSuccessors() &&
+         "Successor # out of range for Branch!");
+  return cast_or_null<BasicBlock>(
+      Ctx.getValue(cast<llvm::BranchInst>(Val)->getSuccessor(SuccIdx)));
+}
+
+void BranchInst::setSuccessor(unsigned Idx, BasicBlock *NewSucc) {
+  assert((Idx == 0 || Idx == 1) && "Out of bounds!");
+  setOperand(2u - Idx, NewSucc);
+}
+
+BasicBlock *BranchInst::LLVMBBToSBBB::operator()(llvm::BasicBlock *BB) const {
+  return cast<BasicBlock>(Ctx.getValue(BB));
+}
+const BasicBlock *
+BranchInst::ConstLLVMBBToSBBB::operator()(const llvm::BasicBlock *BB) const {
+  return cast<BasicBlock>(Ctx.getValue(BB));
+}
+#ifndef NDEBUG
+void BranchInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+void BranchInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                           Instruction *InsertBefore, Context &Ctx,
+                           const Twine &Name) {
+  return create(Ty, Ptr, Align, InsertBefore, /*IsVolatile=*/false, Ctx, Name);
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                           Instruction *InsertBefore, bool IsVolatile,
+                           Context &Ctx, const Twine &Name) {
+  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(BeforeIR);
+  auto *NewLI =
+      Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name);
+  auto *NewSBI = Ctx.createLoadInst(NewLI);
+  return NewSBI;
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                           BasicBlock *InsertAtEnd, Context &Ctx,
+                           const Twine &Name) {
+  return create(Ty, Ptr, Align, InsertAtEnd, /*IsVolatile=*/false, Ctx, Name);
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                           BasicBlock *InsertAtEnd, bool IsVolatile,
+                           Context &Ctx, const Twine &Name) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  auto *NewLI =
+      Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name);
+  auto *NewSBI = Ctx.createLoadInst(NewLI);
+  return NewSBI;
+}
+
+bool LoadInst::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::Load;
+}
+
+Value *LoadInst::getPointerOperand() const {
+  return Ctx.getValue(cast<llvm::LoadInst>(Val)->getPointerOperand());
+}
+
+#ifndef NDEBUG
+void LoadInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void LoadInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+                             Instruction *InsertBefore, Context &Ctx) {
+  return create(V, Ptr, Align, InsertBefore, /*IsVolatile=*/false, Ctx);
+}
+
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+                             Instruction *InsertBefore, bool IsVolatile,
+                             Context &Ctx) {
+  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(BeforeIR);
+  auto *NewSI = Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, IsVolatile);
+  auto *NewSBI = Ctx.createStoreInst(NewSI);
+  return NewSBI;
+}
+
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+                             BasicBlock *InsertAtEnd, Context &Ctx) {
+  return create(V, Ptr, Align, InsertAtEnd, /*IsVolatile=*/false, Ctx);
+}
+
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+                             BasicBlock *InsertAtEnd, bool IsVolatile,
+                             Context &Ctx) {
+  auto *InsertAtEndIR = cast<llvm::BasicBlock>(InsertAtEnd->Val);
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(InsertAtEndIR);
+  auto *NewSI = Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, IsVolatile);
+  auto *NewSBI = Ctx.createStoreInst(NewSI);
+  return NewSBI;
+}
+
+bool StoreInst::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::Store;
+}
+
+Value *StoreInst::getValueOperand() const {
+  return Ctx.getValue(cast<llvm::StoreInst>(Val)->getValueOperand());
+}
+
+Value *StoreInst::getPointerOperand() const {
+  return Ctx.getValue(cast<llvm::StoreInst>(Val)->getPointerOperand());
+}
+
+#ifndef NDEBUG
+void StoreInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void StoreInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+ReturnInst *ReturnInst::createCommon(Value *RetVal, IRBuilder<> &Builder,
+                                     Context &Ctx) {
+  llvm::ReturnInst *NewRI;
+  if (RetVal != nullptr)
+    NewRI = Builder.CreateRet(RetVal->Val);
+  else
+    NewRI = Builder.CreateRetVoid();
+  return Ctx.createReturnInst(NewRI);
+}
+
+ReturnInst *ReturnInst::create(Value *RetVal, Instruction *InsertBefore,
+                               Context &Ctx) {
+  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(BeforeIR);
+  return createCommon(RetVal, Builder, Ctx);
+}
+
+ReturnInst *ReturnInst::create(Value *RetVal, BasicBlock *InsertAtEnd,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  return createCommon(RetVal, Builder, Ctx);
+}
+
+Value *ReturnInst::getReturnValue() const {
+  auto *LLVMRetVal = cast<llvm::ReturnInst>(Val)->getReturnValue();
+  return LLVMRetVal != nullptr ? Ctx.getValue(LLVMRetVal) : nullptr;
+}
+
+#ifndef NDEBUG
+void ReturnInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void ReturnInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *CallBase::getCalledOperand() const {
+  return Ctx.getValue(cast<llvm::CallBase>(Val)->getCalledOperand());
+}
+
+Use CallBase::getCalledOperandUse() const {
+  llvm::Use *LLVMUse = &cast<llvm::CallBase>(Val)->getCalledOperandUse();
+  return Use(LLVMUse, cast<User>(Ctx.getValue(LLVMUse->getUser())), Ctx);
+}
+
+Function *CallBase::getCalledFunction() const {
+  return cast_or_null<Function>(
+      Ctx.getValue(cast<llvm::CallBase>(Val)->getCalledFunction()));
+}
+Function *CallBase::getCaller() {
+  return cast<Function>(Ctx.getValue(cast<llvm::CallBase>(Val)->getCaller()));
+}
+
+void CallBase::setCalledFunction(Function *F) {
+  // F's function type is private, so we rely on `setCalledFunction()` to update
+  // it. But even though we are calling `setCalledFunction()` we also need to
+  // track this change at the SandboxIR level, which is why we call
+  // `setCalledOperand()` here.
+  // Note: This may break if `setCalledFunction()` early returns if `F`
+  // is already set, but we do have a unit test for it.
+  setCalledOperand(F);
+  cast<llvm::CallBase>(Val)->setCalledFunction(F->getFunctionType(),
+                                               cast<llvm::Function>(F->Val));
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+                           ArrayRef<Value *> Args, BasicBlock::iterator WhereIt,
+                           BasicBlock *WhereBB, Context &Ctx,
+                           const Twine &NameStr) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  if (WhereIt != WhereBB->end())
+    Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+  else
+    Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+  SmallVector<llvm::Value *> LLVMArgs;
+  LLVMArgs.reserve(Args.size());
+  for (Value *Arg : Args)
+    LLVMArgs.push_back(Arg->Val);
+  llvm::CallInst *NewCI = Builder.CreateCall(FTy, Func->Val, LLVMArgs, NameStr);
+  return Ctx.createCallInst(NewCI);
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+                           ArrayRef<Value *> Args, Instruction *InsertBefore,
+                           Context &Ctx, const Twine &NameStr) {
+  return CallInst::create(FTy, Func, Args, InsertBefore->getIterator(),
+                          InsertBefore->getParent(), Ctx, NameStr);
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+                           ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                           Context &Ctx, const Twine &NameStr) {
+  return CallInst::create(FTy, Func, Args, InsertAtEnd->end(), InsertAtEnd, Ctx,
+                          NameStr);
+}
+
+#ifndef NDEBUG
+void CallInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void CallInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *IfNormal, BasicBlock *IfException,
+                               ArrayRef<Value *> Args, BBIterator WhereIt,
+                               BasicBlock *WhereBB, Context &Ctx,
+                               const Twine &NameStr) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  if (WhereIt != WhereBB->end())
+    Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+  else
+    Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+  SmallVector<llvm::Value *> LLVMArgs;
+  LLVMArgs.reserve(Args.size());
+  for (Value *Arg : Args)
+    LLVMArgs.push_back(Arg->Val);
+  llvm::InvokeInst *Invoke = Builder.CreateInvoke(
+      FTy, Func->Val, cast<llvm::BasicBlock>(IfNormal->Val),
+      cast<llvm::BasicBlock>(IfException->Val), LLVMArgs, NameStr);
+  return Ctx.createInvokeInst(Invoke);
+}
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *IfNormal, BasicBlock *IfException,
+                               ArrayRef<Value *> Args,
+                               Instruction *InsertBefore, Context &Ctx,
+                               const Twine &NameStr) {
+  return create(FTy, Func, IfNormal, IfException, Args,
+                InsertBefore->getIterator(), InsertBefore->getParent(), Ctx,
+                NameStr);
+}
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *IfNormal, BasicBlock *IfException,
+                               ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                               Context &Ctx, const Twine &NameStr) {
+  return create(FTy, Func, IfNormal, IfException, Args, InsertAtEnd->end(),
+                InsertAtEnd, Ctx, NameStr);
+}
+
+BasicBlock *InvokeInst::getNormalDest() const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::InvokeInst>(Val)->getNormalDest()));
+}
+BasicBlock *InvokeInst::getUnwindDest() const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::InvokeInst>(Val)->getUnwindDest()));
+}
+void InvokeInst::setNormalDest(BasicBlock *BB) {
+  setOperand(1, BB);
+  assert(getNormalDest() == BB && "LLVM IR uses a different operan index!");
+}
+void InvokeInst::setUnwindDest(BasicBlock *BB) {
+  setOperand(2, BB);
+  assert(getUnwindDest() == BB && "LLVM IR uses a different operan index!");
+}
+Instruction *InvokeInst::getLandingPadInst() const {
+  return cast<Instruction>(
+      Ctx.getValue(cast<llvm::InvokeInst>(Val)->getLandingPadInst()));
+  ;
+}
+BasicBlock *InvokeInst::getSuccessor(unsigned SuccIdx) const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::InvokeInst>(Val)->getSuccessor(SuccIdx)));
+}
+
+#ifndef NDEBUG
+void InvokeInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+void InvokeInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *DefaultDest,
+                               ArrayRef<BasicBlock *> IndirectDests,
+                               ArrayRef<Value *> Args, BBIterator WhereIt,
+                               BasicBlock *WhereBB, Context &Ctx,
+                               const Twine &NameStr) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  if (WhereIt != WhereBB->end())
+    Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+  else
+    Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+
+  SmallVector<llvm::BasicBlock *> LLVMIndirectDests;
+  LLVMIndirectDests.reserve(IndirectDests.size());
+  for (BasicBlock *IndDest : IndirectDests)
+    LLVMIndirectDests.push_back(cast<llvm::BasicBlock>(IndDest->Val));
+
+  SmallVector<llvm::Value *> LLVMArgs;
+  LLVMArgs.reserve(Args.size());
+  for (Value *Arg : Args)
+    LLVMArgs.push_back(Arg->Val);
+
+  llvm::CallBrInst *CallBr = Builder.CreateCallBr(
+      FTy, Func->Val, cast<llvm::BasicBlock>(DefaultDest->Val),
+      LLVMIndirectDests, LLVMArgs, NameStr);
+  return Ctx.createCallBrInst(CallBr);
+}
+
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *DefaultDest,
+                               ArrayRef<BasicBlock *> IndirectDests,
+                               ArrayRef<Value *> Args,
+                               Instruction *InsertBefore, Context &Ctx,
+                               const Twine &NameStr) {
+  return create(FTy, Func, DefaultDest, IndirectDests, Args,
+                InsertBefore->getIterator(), InsertBefore->getParent(), Ctx,
+                NameStr);
+}
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+                               BasicBlock *DefaultDest,
+                               ArrayRef<BasicBlock *> IndirectDests,
+                               ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+                               Context &Ctx, const Twine &NameStr) {
+  return create(FTy, Func, DefaultDest, IndirectDests, Args, InsertAtEnd->end(),
+                InsertAtEnd, Ctx, NameStr);
+}
+
+Value *CallBrInst::getIndirectDestLabel(unsigned Idx) const {
+  return Ctx.getValue(cast<llvm::CallBrInst>(Val)->getIndirectDestLabel(Idx));
+}
+Value *CallBrInst::getIndirectDestLabelUse(unsigned Idx) const {
+  return Ctx.getValue(
+      cast<llvm::CallBrInst>(Val)->getIndirectDestLabelUse(Idx));
+}
+BasicBlock *CallBrInst::getDefaultDest() const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::CallBrInst>(Val)->getDefaultDest()));
+}
+BasicBlock *CallBrInst::getIndirectDest(unsigned Idx) const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::CallBrInst>(Val)->getIndirectDest(Idx)));
+}
+llvm::SmallVector<BasicBlock *, 16> CallBrInst::getIndirectDests() const {
+  SmallVector<BasicBlock *, 16> BBs;
+  for (llvm::BasicBlock *LLVMBB :
+       cast<llvm::CallBrInst>(Val)->getIndirectDests())
+    BBs.push_back(cast<BasicBlock>(Ctx.getValue(LLVMBB)));
+  return BBs;
+}
+void CallBrInst::setDefaultDest(BasicBlock *BB) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<CallBrInstSetDefaultDest>(this, Tracker));
+  cast<llvm::CallBrInst>(Val)->setDefaultDest(cast<llvm::BasicBlock>(BB->Val));
+}
+void CallBrInst::setIndirectDest(unsigned Idx, BasicBlock *BB) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(
+        std::make_unique<CallBrInstSetIndirectDest>(this, Idx, Tracker));
+  cast<llvm::CallBrInst>(Val)->setIndirectDest(Idx,
+                                               cast<llvm::BasicBlock>(BB->Val));
+}
+BasicBlock *CallBrInst::getSuccessor(unsigned Idx) const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::CallBrInst>(Val)->getSuccessor(Idx)));
+}
+
+#ifndef NDEBUG
+void CallBrInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+void CallBrInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+                                 ArrayRef<Value *> IdxList,
+                                 BasicBlock::iterator WhereIt,
+                                 BasicBlock *WhereBB, Context &Ctx,
+                                 const Twine &NameStr) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  if (WhereIt != WhereBB->end())
+    Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+  else
+    Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+  SmallVector<llvm::Value *> LLVMIdxList;
+  LLVMIdxList.reserve(IdxList.size());
+  for (Value *Idx : IdxList)
+    LLVMIdxList.push_back(Idx->Val);
+  llvm::Value *NewV = Builder.CreateGEP(Ty, Ptr->Val, LLVMIdxList, NameStr);
+  if (auto *NewGEP = dyn_cast<llvm::GetElementPtrInst>(NewV))
+    return Ctx.createGetElementPtrInst(NewGEP);
+  assert(isa<llvm::Constant>(NewV) && "Expected constant");
+  return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
+}
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+                                 ArrayRef<Value *> IdxList,
+                                 Instruction *InsertBefore, Context &Ctx,
+                                 const Twine &NameStr) {
+  return GetElementPtrInst::create(Ty, Ptr, IdxList,
+                                   InsertBefore->getIterator(),
+                                   InsertBefore->getParent(), Ctx, NameStr);
+}
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+                                 ArrayRef<Value *> IdxList,
+                                 BasicBlock *InsertAtEnd, Context &Ctx,
+                                 const Twine &NameStr) {
+  return GetElementPtrInst::create(Ty, Ptr, IdxList, InsertAtEnd->end(),
+                                   InsertAtEnd, Ctx, NameStr);
+}
+
+Value *GetElementPtrInst::getPointerOperand() const {
+  return Ctx.getValue(cast<llvm::GetElementPtrInst>(Val)->getPointerOperand());
+}
+
+#ifndef NDEBUG
+void GetElementPtrInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void GetElementPtrInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+BasicBlock *PHINode::LLVMBBToBB::operator()(llvm::BasicBlock *LLVMBB) const {
+  return cast<BasicBlock>(Ctx.getValue(LLVMBB));
+}
+
+PHINode *PHINode::create(Type *Ty, unsigned NumReservedValues,
+                         Instruction *InsertBefore, Context &Ctx,
+                         const Twine &Name) {
+  llvm::PHINode *NewPHI = llvm::PHINode::Create(
+      Ty, NumReservedValues, Name, InsertBefore->getTopmostLLVMInstruction());
+  return Ctx.createPHINode(NewPHI);
+}
+
+bool PHINode::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::PHI;
+}
+
+Value *PHINode::getIncomingValue(unsigned Idx) const {
+  return Ctx.getValue(cast<llvm::PHINode>(Val)->getIncomingValue(Idx));
+}
+void PHINode::setIncomingValue(unsigned Idx, Value *V) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<PHISetIncoming>(
+        *this, Idx, PHISetIncoming::What::Value, Tracker));
+
+  cast<llvm::PHINode>(Val)->setIncomingValue(Idx, V->Val);
+}
+BasicBlock *PHINode::getIncomingBlock(unsigned Idx) const {
+  return cast<BasicBlock>(
+      Ctx.getValue(cast<llvm::PHINode>(Val)->getIncomingBlock(Idx)));
+}
+BasicBlock *PHINode::getIncomingBlock(const Use &U) const {
+  llvm::Use *LLVMUse = U.LLVMUse;
+  llvm::BasicBlock *BB = cast<llvm::PHINode>(Val)->getIncomingBlock(*LLVMUse);
+  return cast<BasicBlock>(Ctx.getValue(BB));
+}
+void PHINode::setIncomingBlock(unsigned Idx, BasicBlock *BB) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<PHISetIncoming>(
+        *this, Idx, PHISetIncoming::What::Block, Tracker));
+  cast<llvm::PHINode>(Val)->setIncomingBlock(Idx,
+                                             cast<llvm::BasicBlock>(BB->Val));
+}
+void PHINode::addIncoming(Value *V, BasicBlock *BB) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<PHIAddIncoming>(*this, Tracker));
+
+  cast<llvm::PHINode>(Val)->addIncoming(V->Val,
+                                        cast<llvm::BasicBlock>(BB->Val));
+}
+Value *PHINode::removeIncomingValue(unsigned Idx) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<PHIRemoveIncoming>(*this, Idx, Tracker));
+
+  llvm::Value *LLVMV =
+      cast<llvm::PHINode>(Val)->removeIncomingValue(Idx,
+                                                    /*DeletePHIIfEmpty=*/false);
+  return Ctx.getValue(LLVMV);
+}
+Value *PHINode::removeIncomingValue(BasicBlock *BB) {
+  auto &Tracker = Ctx.getTracker();
+  if (Tracker.isTracking())
+    Tracker.track(std::make_unique<PHIRemoveIncoming>(
+        *this, getBasicBlockIndex(BB), Tracker));
+
+  auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+  llvm::Value *LLVMV =
+      cast<llvm::PHINode>(Val)->removeIncomingValue(LLVMBB,
+                                                    /*DeletePHIIfEmpty=*/false);
+  return Ctx.getValue(LLVMV);
+}
+int PHINode::getBasicBlockIndex(const BasicBlock *BB) const {
+  auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+  return cast<llvm::PHINode>(Val)->getBasicBlockIndex(LLVMBB);
+}
+Value *PHINode::getIncomingValueForBlock(const BasicBlock *BB) const {
+  auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+  llvm::Value *LLVMV =
+      cast<llvm::PHINode>(Val)->getIncomingValueForBlock(LLVMBB);
+  return Ctx.getValue(LLVMV);
+}
+Value *PHINode::hasConstantValue() const {
+  llvm::Value *LLVMV = cast<llvm::PHINode>(Val)->hasConstantValue();
+  return LLVMV != nullptr ? Ctx.getValue(LLVMV) : nullptr;
+}
+
+static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) {
+  switch (Opc) {
+  case Instruction::Opcode::ZExt:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::ZExt);
+  case Instruction::Opcode::SExt:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::SExt);
+  case Instruction::Opcode::FPToUI:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToUI);
+  case Instruction::Opcode::FPToSI:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI);
+  case Instruction::Opcode::FPExt:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt);
+  case Instruction::Opcode::PtrToInt:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt);
+  case Instruction::Opcode::IntToPtr:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::IntToPtr);
+  case Instruction::Opcode::SIToFP:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::SIToFP);
+  case Instruction::Opcode::UIToFP:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::UIToFP);
+  case Instruction::Opcode::Trunc:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::Trunc);
+  case Instruction::Opcode::FPTrunc:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPTrunc);
+  case Instruction::Opcode::BitCast:
+    return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::BitCast);
+  case Instruction::Opcode::AddrSpaceCast:
+    return static_cast<llvm::Instruction::CastOps>(
+        llvm::Instruction::AddrSpaceCast);
+  default:
+    llvm_unreachable("Opcode not suitable for CastInst!");
+  }
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+                        BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+                        const Twine &Name) {
+  assert(getLLVMCastOp(Op) && "Opcode not suitable for CastInst!");
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  if (WhereIt == WhereBB->end())
+    Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+  else
+    Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+  auto *NewV =
+      Builder.CreateCast(getLLVMCastOp(Op), Operand->Val, DestTy, Name);
+  if (auto *NewCI = dyn_cast<llvm::CastInst>(NewV))
+    return Ctx.createCastInst(NewCI);
+  assert(isa<llvm::Constant>(NewV) && "Expected constant");
+  return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+                        Instruction *InsertBefore, Context &Ctx,
+                        const Twine &Name) {
+  return create(DestTy, Op, Operand, InsertBefore->getIterator(),
+                InsertBefore->getParent(), Ctx, Name);
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+                        BasicBlock *InsertAtEnd, Context &Ctx,
+                        const Twine &Name) {
+  return create(DestTy, Op, Operand, InsertAtEnd->end(), InsertAtEnd, Ctx,
+                Name);
+}
+
+bool CastInst::classof(const Value *From) {
+  return From->getSubclassID() == ClassID::Cast;
+}
+
+#ifndef NDEBUG
+void CastInst::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void CastInst::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+
+void PHINode::dump(raw_ostream &OS) const {
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+
+void PHINode::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
 
 void OpaqueInst::dump(raw_ostream &OS) const {
   dumpCommonPrefix(OS);
@@ -439,7 +1250,15 @@ void OpaqueInst::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
+#endif // NDEBUG
+
+Constant *Constant::createInt(Type *Ty, uint64_t V, Context &Ctx,
+                              bool IsSigned) {
+  llvm::Constant *LLVMC = llvm::ConstantInt::get(Ty, V, IsSigned);
+  return Ctx.getOrCreateConstant(LLVMC);
+}
 
+#ifndef NDEBUG
 void Constant::dump(raw_ostream &OS) const {
   dumpCommonPrefix(OS);
   dumpCommonSuffix(OS);
@@ -514,8 +1333,9 @@ Value *Context::registerValue(std::unique_ptr<Value> &&VPtr) {
   assert(VPtr->getSubclassID() != Value::ClassID::User &&
          "Can't register a user!");
   Value *V = VPtr.get();
-  llvm::Value *Key = V->Val;
-  LLVMValueToValueMap[Key] = std::move(VPtr);
+  [[maybe_unused]] auto Pair =
+      LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
+  assert(Pair.second && "Already exists!");
   return V;
 }
 
@@ -526,7 +1346,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     return It->second.get();
 
   if (auto *C = dyn_cast<llvm::Constant>(LLVMV)) {
-    It->second = std::unique_ptr<Constant>(new Constant(C, *this));
+    if (auto *F = dyn_cast<llvm::Function>(LLVMV))
+      It->second = std::unique_ptr<Function>(new Function(F, *this));
+    else
+      It->second = std::unique_ptr<Constant>(new Constant(C, *this));
     auto *NewC = It->second.get();
     for (llvm::Value *COp : C->operands())
       getOrCreateValueInternal(COp, C);
@@ -544,6 +1367,80 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     return nullptr;
   }
   assert(isa<llvm::Instruction>(LLVMV) && "Expected Instruction");
+
+  switch (cast<llvm::Instruction>(LLVMV)->getOpcode()) {
+  case llvm::Instruction::Select: {
+    auto *LLVMSel = cast<llvm::SelectInst>(LLVMV);
+    It->second = std::unique_ptr<SelectInst>(new SelectInst(LLVMSel, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Br: {
+    auto *LLVMBr = cast<llvm::BranchInst>(LLVMV);
+    It->second = std::unique_ptr<BranchInst>(new BranchInst(LLVMBr, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Load: {
+    auto *LLVMLd = cast<llvm::LoadInst>(LLVMV);
+    It->second = std::unique_ptr<LoadInst>(new LoadInst(LLVMLd, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Store: {
+    auto *LLVMSt = cast<llvm::StoreInst>(LLVMV);
+    It->second = std::unique_ptr<StoreInst>(new StoreInst(LLVMSt, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Ret: {
+    auto *LLVMRet = cast<llvm::ReturnInst>(LLVMV);
+    It->second = std::unique_ptr<ReturnInst>(new ReturnInst(LLVMRet, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Call: {
+    auto *LLVMCall = cast<llvm::CallInst>(LLVMV);
+    It->second = std::unique_ptr<CallInst>(new CallInst(LLVMCall, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::Invoke: {
+    auto *LLVMInvoke = cast<llvm::InvokeInst>(LLVMV);
+    It->second = std::unique_ptr<InvokeInst>(new InvokeInst(LLVMInvoke, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::CallBr: {
+    auto *LLVMCallBr = cast<llvm::CallBrInst>(LLVMV);
+    It->second = std::unique_ptr<CallBrInst>(new CallBrInst(LLVMCallBr, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::GetElementPtr: {
+    auto *LLVMGEP = cast<llvm::GetElementPtrInst>(LLVMV);
+    It->second = std::unique_ptr<GetElementPtrInst>(
+        new GetElementPtrInst(LLVMGEP, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::ZExt:
+  case llvm::Instruction::SExt:
+  case llvm::Instruction::FPToUI:
+  case llvm::Instruction::FPToSI:
+  case llvm::Instruction::FPExt:
+  case llvm::Instruction::PtrToInt:
+  case llvm::Instruction::IntToPtr:
+  case llvm::Instruction::SIToFP:
+  case llvm::Instruction::UIToFP:
+  case llvm::Instruction::Trunc:
+  case llvm::Instruction::FPTrunc:
+  case llvm::Instruction::BitCast:
+  case llvm::Instruction::AddrSpaceCast: {
+    auto *LLVMCast = cast<llvm::CastInst>(LLVMV);
+    It->second = std::unique_ptr<CastInst>(new CastInst(LLVMCast, *this));
+    return It->second.get();
+  }
+  case llvm::Instruction::PHI: {
+    auto *LLVMPhi = cast<llvm::PHINode>(LLVMV);
+    It->second = std::unique_ptr<PHINode>(new PHINode(LLVMPhi, *this));
+    return It->second.get();
+  }
+  default:
+    break;
+  }
+
   It->second = std::unique_ptr<OpaqueInst>(
       new OpaqueInst(cast<llvm::Instruction>(LLVMV), *this));
   return It->second.get();
@@ -558,6 +1455,62 @@ BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) {
   return BB;
 }
 
+SelectInst *Context::createSelectInst(llvm::SelectInst *SI) {
+  auto NewPtr = std::unique_ptr<SelectInst>(new SelectInst(SI, *this));
+  return cast<SelectInst>(registerValue(std::move(NewPtr)));
+}
+
+BranchInst *Context::createBranchInst(llvm::BranchInst *BI) {
+  auto NewPtr = std::unique_ptr<BranchInst>(new BranchInst(BI, *this));
+  return cast<BranchInst>(registerValue(std::move(NewPtr)));
+}
+
+LoadInst *Context::createLoadInst(llvm::LoadInst *LI) {
+  auto NewPtr = std::unique_ptr<LoadInst>(new LoadInst(LI, *this));
+  return cast<LoadInst>(registerValue(std::move(NewPtr)));
+}
+
+StoreInst *Context::createStoreInst(llvm::StoreInst *SI) {
+  auto NewPtr = std::unique_ptr<StoreInst>(new StoreInst(SI, *this));
+  return cast<StoreInst>(registerValue(std::move(NewPtr)));
+}
+
+ReturnInst *Context::createReturnInst(llvm::ReturnInst *I) {
+  auto NewPtr = std::unique_ptr<ReturnInst>(new ReturnInst(I, *this));
+  return cast<ReturnInst>(registerValue(std::move(NewPtr)));
+}
+
+CallInst *Context::createCallInst(llvm::CallInst *I) {
+  auto NewPtr = std::unique_ptr<CallInst>(new CallInst(I, *this));
+  return cast<CallInst>(registerValue(std::move(NewPtr)));
+}
+
+InvokeInst *Context::createInvokeInst(llvm::InvokeInst *I) {
+  auto NewPtr = std::unique_ptr<InvokeInst>(new InvokeInst(I, *this));
+  return cast<InvokeInst>(registerValue(std::move(NewPtr)));
+}
+
+CallBrInst *Context::createCallBrInst(llvm::CallBrInst *I) {
+  auto NewPtr = std::unique_ptr<CallBrInst>(new CallBrInst(I, *this));
+  return cast<CallBrInst>(registerValue(std::move(NewPtr)));
+}
+
+GetElementPtrInst *
+Context::createGetElementPtrInst(llvm::GetElementPtrInst *I) {
+  auto NewPtr =
+      std::unique_ptr<GetElementPtrInst>(new GetElementPtrInst(I, *this));
+  return cast<GetElementPtrInst>(registerValue(std::move(NewPtr)));
+}
+
+CastInst *Context::createCastInst(llvm::CastInst *I) {
+  auto NewPtr = std::unique_ptr<CastInst>(new CastInst(I, *this));
+  return cast<CastInst>(registerValue(std::move(NewPtr)));
+}
+PHINode *Context::createPHINode(llvm::PHINode *I) {
+  auto NewPtr = std::unique_ptr<PHINode>(new PHINode(I, *this));
+  return cast<PHINode>(registerValue(std::move(NewPtr)));
+}
+
 Value *Context::getValue(llvm::Value *V) const {
   auto It = LLVMValueToValueMap.find(V);
   if (It != LLVMValueToValueMap.end())
@@ -568,13 +1521,13 @@ Value *Context::getValue(llvm::Value *V) const {
 Function *Context::createFunction(llvm::Function *F) {
   assert(getValue(F) == nullptr && "Already exists!");
   auto NewFPtr = std::unique_ptr<Function>(new Function(F, *this));
+  auto *SBF = cast<Function>(registerValue(std::move(NewFPtr)));
   // Create arguments.
   for (auto &Arg : F->args())
     getOrCreateArgument(&Arg);
   // Create BBs.
   for (auto &BB : *F)
     createBasicBlock(&BB);
-  auto *SBF = cast<Function>(registerValue(std::move(NewFPtr)));
   return SBF;
 }
 
diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp
index 1182f5c55d10b..0310160e8bf35 100644
--- a/llvm/lib/SandboxIR/Tracker.cpp
+++ b/llvm/lib/SandboxIR/Tracker.cpp
@@ -35,12 +35,230 @@ void UseSet::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
+
+void UseSwap::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+PHISetIncoming::PHISetIncoming(PHINode &PHI, unsigned Idx, What What,
+                               Tracker &Tracker)
+    : IRChangeBase(Tracker), PHI(PHI), Idx(Idx) {
+  switch (What) {
+  case What::Value:
+    OrigValueOrBB = PHI.getIncomingValue(Idx);
+    break;
+  case What::Block:
+    OrigValueOrBB = PHI.getIncomingBlock(Idx);
+    break;
+  }
+}
+
+void PHISetIncoming::revert() {
+  if (auto *V = OrigValueOrBB.dyn_cast<Value *>())
+    PHI.setIncomingValue(Idx, V);
+  else
+    PHI.setIncomingBlock(Idx, OrigValueOrBB.get<BasicBlock *>());
+}
+
+#ifndef NDEBUG
+void PHISetIncoming::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+PHIRemoveIncoming::PHIRemoveIncoming(PHINode &PHI, unsigned RemovedIdx,
+                                     Tracker &Tracker)
+    : IRChangeBase(Tracker), PHI(PHI), RemovedIdx(RemovedIdx) {
+  RemovedV = PHI.getIncomingValue(RemovedIdx);
+  RemovedBB = PHI.getIncomingBlock(RemovedIdx);
+}
+
+void PHIRemoveIncoming::revert() {
+  // Special case: if the PHI is now empty, as we don't need to care about the
+  // order of the incoming values.
+  unsigned NumIncoming = PHI.getNumIncomingValues();
+  if (NumIncoming == 0) {
+    PHI.addIncoming(RemovedV, RemovedBB);
+    return;
+  }
+  // Shift all incoming values by one starting from the end until `Idx`.
+  // Start by adding a copy of the last incoming values.
+  unsigned LastIdx = NumIncoming - 1;
+  PHI.addIncoming(PHI.getIncomingValue(LastIdx), PHI.getIncomingBlock(LastIdx));
+  for (unsigned Idx = LastIdx; Idx > RemovedIdx; --Idx) {
+    auto *PrevV = PHI.getIncomingValue(Idx - 1);
+    auto *PrevBB = PHI.getIncomingBlock(Idx - 1);
+    PHI.setIncomingValue(Idx, PrevV);
+    PHI.setIncomingBlock(Idx, PrevBB);
+  }
+  PHI.setIncomingValue(RemovedIdx, RemovedV);
+  PHI.setIncomingBlock(RemovedIdx, RemovedBB);
+}
+
+#ifndef NDEBUG
+void PHIRemoveIncoming::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+PHIAddIncoming::PHIAddIncoming(PHINode &PHI, Tracker &Tracker)
+    : IRChangeBase(Tracker), PHI(PHI), Idx(PHI.getNumIncomingValues()) {}
+
+void PHIAddIncoming::revert() { PHI.removeIncomingValue(Idx); }
+
+#ifndef NDEBUG
+void PHIAddIncoming::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
 #endif // NDEBUG
 
 Tracker::~Tracker() {
   assert(Changes.empty() && "You must accept or revert changes!");
 }
 
+EraseFromParent::EraseFromParent(std::unique_ptr<sandboxir::Value> &&ErasedIPtr,
+                                 Tracker &Tracker)
+    : IRChangeBase(Tracker), ErasedIPtr(std::move(ErasedIPtr)) {
+  auto *I = cast<Instruction>(this->ErasedIPtr.get());
+  auto LLVMInstrs = I->getLLVMInstrs();
+  // Iterate in reverse program order.
+  for (auto *LLVMI : reverse(LLVMInstrs)) {
+    SmallVector<llvm::Value *> Operands;
+    Operands.reserve(LLVMI->getNumOperands());
+    for (auto [OpNum, Use] : enumerate(LLVMI->operands()))
+      Operands.push_back(Use.get());
+    InstrData.push_back({Operands, LLVMI});
+  }
+  assert(is_sorted(InstrData,
+                   [](const auto &D0, const auto &D1) {
+                     return D0.LLVMI->comesBefore(D1.LLVMI);
+                   }) &&
+         "Expected reverse program order!");
+  auto *BotLLVMI = cast<llvm::Instruction>(I->Val);
+  if (BotLLVMI->getNextNode() != nullptr)
+    NextLLVMIOrBB = BotLLVMI->getNextNode();
+  else
+    NextLLVMIOrBB = BotLLVMI->getParent();
+}
+
+void EraseFromParent::accept() {
+  for (const auto &IData : InstrData)
+    IData.LLVMI->deleteValue();
+}
+
+void EraseFromParent::revert() {
+  // Place the bottom-most instruction first.
+  auto [Operands, BotLLVMI] = InstrData[0];
+  if (auto *NextLLVMI = NextLLVMIOrBB.dyn_cast<llvm::Instruction *>()) {
+    BotLLVMI->insertBefore(NextLLVMI);
+  } else {
+    auto *LLVMBB = NextLLVMIOrBB.get<llvm::BasicBlock *>();
+    BotLLVMI->insertInto(LLVMBB, LLVMBB->end());
+  }
+  for (auto [OpNum, Op] : enumerate(Operands))
+    BotLLVMI->setOperand(OpNum, Op);
+
+  // Go over the rest of the instructions and stack them on top.
+  for (auto [Operands, LLVMI] : drop_begin(InstrData)) {
+    LLVMI->insertBefore(BotLLVMI);
+    for (auto [OpNum, Op] : enumerate(Operands))
+      LLVMI->setOperand(OpNum, Op);
+    BotLLVMI = LLVMI;
+  }
+  Parent.getContext().registerValue(std::move(ErasedIPtr));
+}
+
+#ifndef NDEBUG
+void EraseFromParent::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+RemoveFromParent::RemoveFromParent(Instruction *RemovedI, Tracker &Tracker)
+    : IRChangeBase(Tracker), RemovedI(RemovedI) {
+  if (auto *NextI = RemovedI->getNextNode())
+    NextInstrOrBB = NextI;
+  else
+    NextInstrOrBB = RemovedI->getParent();
+}
+
+void RemoveFromParent::revert() {
+  if (auto *NextI = NextInstrOrBB.dyn_cast<Instruction *>()) {
+    RemovedI->insertBefore(NextI);
+  } else {
+    auto *BB = NextInstrOrBB.get<BasicBlock *>();
+    RemovedI->insertInto(BB, BB->end());
+  }
+}
+
+#ifndef NDEBUG
+void RemoveFromParent::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif
+
+CallBrInstSetDefaultDest::CallBrInstSetDefaultDest(CallBrInst *CallBr,
+                                                   Tracker &Tracker)
+    : IRChangeBase(Tracker), CallBr(CallBr) {
+  OrigDefaultDest = CallBr->getDefaultDest();
+}
+void CallBrInstSetDefaultDest::revert() {
+  CallBr->setDefaultDest(OrigDefaultDest);
+}
+#ifndef NDEBUG
+void CallBrInstSetDefaultDest::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif
+
+CallBrInstSetIndirectDest::CallBrInstSetIndirectDest(CallBrInst *CallBr,
+                                                     unsigned Idx,
+                                                     Tracker &Tracker)
+    : IRChangeBase(Tracker), CallBr(CallBr), Idx(Idx) {
+  OrigIndirectDest = CallBr->getIndirectDest(Idx);
+}
+void CallBrInstSetIndirectDest::revert() {
+  CallBr->setIndirectDest(Idx, OrigIndirectDest);
+}
+#ifndef NDEBUG
+void CallBrInstSetIndirectDest::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif
+
+MoveInstr::MoveInstr(Instruction *MovedI, Tracker &Tracker)
+    : IRChangeBase(Tracker), MovedI(MovedI) {
+  if (auto *NextI = MovedI->getNextNode())
+    NextInstrOrBB = NextI;
+  else
+    NextInstrOrBB = MovedI->getParent();
+}
+
+void MoveInstr::revert() {
+  if (auto *NextI = NextInstrOrBB.dyn_cast<Instruction *>()) {
+    MovedI->moveBefore(NextI);
+  } else {
+    auto *BB = NextInstrOrBB.get<BasicBlock *>();
+    MovedI->moveBefore(*BB, BB->end());
+  }
+}
+
+#ifndef NDEBUG
+void MoveInstr::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif
+
 void Tracker::track(std::unique_ptr<IRChangeBase> &&Change) {
   assert(State == TrackerState::Record && "The tracker should be tracking!");
   Changes.push_back(std::move(Change));
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 26b4f8e55448f..7f68c5ab9b7cf 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -143,6 +143,7 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
     7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
     4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
+static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
 static constexpr fltSemantics semFloat6E3M2FN = {
     4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
@@ -217,6 +218,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
     return Float8E4M3FNUZ();
   case S_Float8E4M3B11FNUZ:
     return Float8E4M3B11FNUZ();
+  case S_Float8E3M4:
+    return Float8E3M4();
   case S_FloatTF32:
     return FloatTF32();
   case S_Float6E3M2FN:
@@ -257,6 +260,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
     return S_Float8E4M3FNUZ;
   else if (&Sem == &llvm::APFloat::Float8E4M3B11FNUZ())
     return S_Float8E4M3B11FNUZ;
+  else if (&Sem == &llvm::APFloat::Float8E3M4())
+    return S_Float8E3M4;
   else if (&Sem == &llvm::APFloat::FloatTF32())
     return S_FloatTF32;
   else if (&Sem == &llvm::APFloat::Float6E3M2FN())
@@ -287,6 +292,7 @@ const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; }
 const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
   return semFloat8E4M3B11FNUZ;
 }
+const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; }
 const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
 const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
 const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
@@ -3643,6 +3649,11 @@ APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const {
   return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>();
 }
 
+APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const {
+  assert(partCount() == 1);
+  return convertIEEEFloatToAPInt<semFloat8E3M4>();
+}
+
 APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
   assert(partCount() == 1);
   return convertIEEEFloatToAPInt<semFloatTF32>();
@@ -3704,6 +3715,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
   if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ)
     return convertFloat8E4M3B11FNUZAPFloatToAPInt();
 
+  if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4)
+    return convertFloat8E3M4APFloatToAPInt();
+
   if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
     return convertFloatTF32APFloatToAPInt();
 
@@ -3932,6 +3946,10 @@ void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) {
   initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api);
 }
 
+void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) {
+  initFromIEEEAPInt<semFloat8E3M4>(api);
+}
+
 void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
   initFromIEEEAPInt<semFloatTF32>(api);
 }
@@ -3977,6 +3995,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
     return initFromFloat8E4M3FNUZAPInt(api);
   if (Sem == &semFloat8E4M3B11FNUZ)
     return initFromFloat8E4M3B11FNUZAPInt(api);
+  if (Sem == &semFloat8E3M4)
+    return initFromFloat8E3M4APInt(api);
   if (Sem == &semFloatTF32)
     return initFromFloatTF32APInt(api);
   if (Sem == &semFloat6E3M2FN)
diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp
index 93481ca916d2a..baa3c322e9dae 100644
--- a/llvm/lib/Support/Error.cpp
+++ b/llvm/lib/Support/Error.cpp
@@ -182,6 +182,12 @@ LLVMErrorTypeId LLVMGetErrorTypeId(LLVMErrorRef Err) {
 
 void LLVMConsumeError(LLVMErrorRef Err) { consumeError(unwrap(Err)); }
 
+
+
+void LLVMCantFail(LLVMErrorRef Err) {
+  cantFail(unwrap(Err));
+}
+
 char *LLVMGetErrorMessage(LLVMErrorRef Err) {
   std::string Tmp = toString(unwrap(Err));
   char *ErrMsg = new char[Tmp.size() + 1];
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 9612db7d30f98..c2014028ddadc 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -73,12 +73,20 @@ struct llvm::TimeTraceProfilerEntry {
   const TimePointType Start;
   TimePointType End;
   const std::string Name;
-  const std::string Detail;
+  TimeTraceMetadata Metadata;
+
   const bool AsyncEvent = false;
   TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
                          std::string &&Dt, bool Ae)
+      : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Metadata(),
+        AsyncEvent(Ae) {
+    Metadata.Detail = std::move(Dt);
+  }
+
+  TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
+                         TimeTraceMetadata &&Mt, bool Ae)
       : Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
-        Detail(std::move(Dt)), AsyncEvent(Ae) {}
+        Metadata(std::move(Mt)), AsyncEvent(Ae) {}
 
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
@@ -97,10 +105,12 @@ struct llvm::TimeTraceProfilerEntry {
 };
 
 struct llvm::TimeTraceProfiler {
-  TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
+  TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "",
+                    bool TimeTraceVerbose = false)
       : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()),
         ProcName(ProcName), Pid(sys::Process::getProcessId()),
-        Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) {
+        Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity),
+        TimeTraceVerbose(TimeTraceVerbose) {
     llvm::get_thread_name(ThreadName);
   }
 
@@ -113,6 +123,15 @@ struct llvm::TimeTraceProfiler {
     return Stack.back().get();
   }
 
+  TimeTraceProfilerEntry *
+  begin(std::string Name, llvm::function_ref<TimeTraceMetadata()> Metadata,
+        bool AsyncEvent = false) {
+    Stack.emplace_back(std::make_unique<TimeTraceProfilerEntry>(
+        ClockType::now(), TimePointType(), std::move(Name), Metadata(),
+        AsyncEvent));
+    return Stack.back().get();
+  }
+
   void end() {
     assert(!Stack.empty() && "Must call begin() first");
     end(*Stack.back());
@@ -184,8 +203,15 @@ struct llvm::TimeTraceProfiler {
           J.attribute("dur", DurUs);
         }
         J.attribute("name", E.Name);
-        if (!E.Detail.empty()) {
-          J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
+        if (!E.Metadata.isEmpty()) {
+          J.attributeObject("args", [&] {
+            if (!E.Metadata.Detail.empty())
+              J.attribute("detail", E.Metadata.Detail);
+            if (!E.Metadata.File.empty())
+              J.attribute("file", E.Metadata.File);
+            if (E.Metadata.Line > 0)
+              J.attribute("line", E.Metadata.Line);
+          });
         }
       });
 
@@ -307,14 +333,25 @@ struct llvm::TimeTraceProfiler {
 
   // Minimum time granularity (in microseconds)
   const unsigned TimeTraceGranularity;
+
+  // Make time trace capture verbose event details (e.g. source filenames). This
+  // can increase the size of the output by 2-3 times.
+  const bool TimeTraceVerbose;
 };
 
+bool llvm::isTimeTraceVerbose() {
+  return getTimeTraceProfilerInstance() &&
+         getTimeTraceProfilerInstance()->TimeTraceVerbose;
+}
+
 void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
-                                       StringRef ProcName) {
+                                       StringRef ProcName,
+                                       bool TimeTraceVerbose) {
   assert(TimeTraceProfilerInstance == nullptr &&
          "Profiler should not be initialized");
   TimeTraceProfilerInstance = new TimeTraceProfiler(
-      TimeTraceGranularity, llvm::sys::path::filename(ProcName));
+      TimeTraceGranularity, llvm::sys::path::filename(ProcName),
+      TimeTraceVerbose);
 }
 
 // Removes all TimeTraceProfilerInstances.
@@ -381,6 +418,14 @@ llvm::timeTraceProfilerBegin(StringRef Name,
   return nullptr;
 }
 
+TimeTraceProfilerEntry *
+llvm::timeTraceProfilerBegin(StringRef Name,
+                             llvm::function_ref<TimeTraceMetadata()> Metadata) {
+  if (TimeTraceProfilerInstance != nullptr)
+    return TimeTraceProfilerInstance->begin(std::string(Name), Metadata, false);
+  return nullptr;
+}
+
 TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name,
                                                           StringRef Detail) {
   if (TimeTraceProfilerInstance != nullptr)
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 34d294b232c32..d525f5b16e862 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -482,7 +482,8 @@ static RTL_OSVERSIONINFOEXW GetWindowsVer() {
     HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
     assert(hMod);
 
-    auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+    auto getVer =
+        (RtlGetVersionPtr)(void *)::GetProcAddress(hMod, "RtlGetVersion");
     assert(getVer);
 
     RTL_OSVERSIONINFOEXW info{};
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 29ebf7c696e04..f11ad09f37139 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -171,23 +171,27 @@ static bool load64BitDebugHelp(void) {
   HMODULE hLib =
       ::LoadLibraryExA("Dbghelp.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
   if (hLib) {
-    fMiniDumpWriteDump =
-        (fpMiniDumpWriteDump)::GetProcAddress(hLib, "MiniDumpWriteDump");
-    fStackWalk64 = (fpStackWalk64)::GetProcAddress(hLib, "StackWalk64");
-    fSymGetModuleBase64 =
-        (fpSymGetModuleBase64)::GetProcAddress(hLib, "SymGetModuleBase64");
-    fSymGetSymFromAddr64 =
-        (fpSymGetSymFromAddr64)::GetProcAddress(hLib, "SymGetSymFromAddr64");
-    fSymGetLineFromAddr64 =
-        (fpSymGetLineFromAddr64)::GetProcAddress(hLib, "SymGetLineFromAddr64");
-    fSymGetModuleInfo64 =
-        (fpSymGetModuleInfo64)::GetProcAddress(hLib, "SymGetModuleInfo64");
-    fSymFunctionTableAccess64 = (fpSymFunctionTableAccess64)::GetProcAddress(
-        hLib, "SymFunctionTableAccess64");
-    fSymSetOptions = (fpSymSetOptions)::GetProcAddress(hLib, "SymSetOptions");
-    fSymInitialize = (fpSymInitialize)::GetProcAddress(hLib, "SymInitialize");
-    fEnumerateLoadedModules = (fpEnumerateLoadedModules)::GetProcAddress(
-        hLib, "EnumerateLoadedModules64");
+    fMiniDumpWriteDump = (fpMiniDumpWriteDump)(void *)::GetProcAddress(
+        hLib, "MiniDumpWriteDump");
+    fStackWalk64 = (fpStackWalk64)(void *)::GetProcAddress(hLib, "StackWalk64");
+    fSymGetModuleBase64 = (fpSymGetModuleBase64)(void *)::GetProcAddress(
+        hLib, "SymGetModuleBase64");
+    fSymGetSymFromAddr64 = (fpSymGetSymFromAddr64)(void *)::GetProcAddress(
+        hLib, "SymGetSymFromAddr64");
+    fSymGetLineFromAddr64 = (fpSymGetLineFromAddr64)(void *)::GetProcAddress(
+        hLib, "SymGetLineFromAddr64");
+    fSymGetModuleInfo64 = (fpSymGetModuleInfo64)(void *)::GetProcAddress(
+        hLib, "SymGetModuleInfo64");
+    fSymFunctionTableAccess64 =
+        (fpSymFunctionTableAccess64)(void *)::GetProcAddress(
+            hLib, "SymFunctionTableAccess64");
+    fSymSetOptions =
+        (fpSymSetOptions)(void *)::GetProcAddress(hLib, "SymSetOptions");
+    fSymInitialize =
+        (fpSymInitialize)(void *)::GetProcAddress(hLib, "SymInitialize");
+    fEnumerateLoadedModules =
+        (fpEnumerateLoadedModules)(void *)::GetProcAddress(
+            hLib, "EnumerateLoadedModules64");
   }
   return isDebugHelpInitialized();
 }
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 4cd3d58b80198..04b3233084a41 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -18,6 +18,7 @@
 
 #include <atomic>
 #include <fcntl.h>
+#include <functional>
 #include <thread>
 
 #ifndef _WIN32
@@ -177,70 +178,89 @@ Expected<ListeningSocket> ListeningSocket::createUnix(StringRef SocketPath,
 #endif // _WIN32
 }
 
-Expected<std::unique_ptr<raw_socket_stream>>
-ListeningSocket::accept(std::chrono::milliseconds Timeout) {
-
-  struct pollfd FDs[2];
-  FDs[0].events = POLLIN;
+// If a file descriptor being monitored by ::poll is closed by another thread,
+// the result is unspecified. In the case ::poll does not unblock and return,
+// when ActiveFD is closed, you can provide another file descriptor via CancelFD
+// that when written to will cause poll to return. Typically CancelFD is the
+// read end of a unidirectional pipe.
+//
+// Timeout should be -1 to block indefinitly
+//
+// getActiveFD is a callback to handle ActiveFD's of std::atomic<int> and int
+static std::error_code
+manageTimeout(const std::chrono::milliseconds &Timeout,
+              const std::function<int()> &getActiveFD,
+              const std::optional<int> &CancelFD = std::nullopt) {
+  struct pollfd FD[2];
+  FD[0].events = POLLIN;
 #ifdef _WIN32
-  SOCKET WinServerSock = _get_osfhandle(FD);
-  FDs[0].fd = WinServerSock;
+  SOCKET WinServerSock = _get_osfhandle(getActiveFD());
+  FD[0].fd = WinServerSock;
 #else
-  FDs[0].fd = FD;
+  FD[0].fd = getActiveFD();
 #endif
-  FDs[1].events = POLLIN;
-  FDs[1].fd = PipeFD[0];
-
-  // Keep track of how much time has passed in case poll is interupted by a
-  // signal and needs to be recalled
-  int RemainingTime = Timeout.count();
-  std::chrono::milliseconds ElapsedTime = std::chrono::milliseconds(0);
-  int PollStatus = -1;
-
-  while (PollStatus == -1 && (Timeout.count() == -1 || ElapsedTime < Timeout)) {
-    if (Timeout.count() != -1)
-      RemainingTime -= ElapsedTime.count();
+  uint8_t FDCount = 1;
+  if (CancelFD.has_value()) {
+    FD[1].events = POLLIN;
+    FD[1].fd = CancelFD.value();
+    FDCount++;
+  }
 
-    auto Start = std::chrono::steady_clock::now();
+  // Keep track of how much time has passed in case ::poll or WSAPoll are
+  // interupted by a signal and need to be recalled
+  auto Start = std::chrono::steady_clock::now();
+  auto RemainingTimeout = Timeout;
+  int PollStatus = 0;
+  do {
+    // If Timeout is -1 then poll should block and RemainingTimeout does not
+    // need to be recalculated
+    if (PollStatus != 0 && Timeout != std::chrono::milliseconds(-1)) {
+      auto TotalElapsedTime =
+          std::chrono::duration_cast<std::chrono::milliseconds>(
+              std::chrono::steady_clock::now() - Start);
+
+      if (TotalElapsedTime >= Timeout)
+        return std::make_error_code(std::errc::operation_would_block);
+
+      RemainingTimeout = Timeout - TotalElapsedTime;
+    }
 #ifdef _WIN32
-    PollStatus = WSAPoll(FDs, 2, RemainingTime);
+    PollStatus = WSAPoll(FD, FDCount, RemainingTimeout.count());
+  } while (PollStatus == SOCKET_ERROR &&
+           getLastSocketErrorCode() == std::errc::interrupted);
 #else
-    PollStatus = ::poll(FDs, 2, RemainingTime);
+    PollStatus = ::poll(FD, FDCount, RemainingTimeout.count());
+  } while (PollStatus == -1 &&
+           getLastSocketErrorCode() == std::errc::interrupted);
 #endif
-    // If FD equals -1 then ListeningSocket::shutdown has been called and it is
-    // appropriate to return operation_canceled
-    if (FD.load() == -1)
-      return llvm::make_error<StringError>(
-          std::make_error_code(std::errc::operation_canceled),
-          "Accept canceled");
 
+  // If ActiveFD equals -1 or CancelFD has data to be read then the operation
+  // has been canceled by another thread
+  if (getActiveFD() == -1 || (CancelFD.has_value() && FD[1].revents & POLLIN))
+    return std::make_error_code(std::errc::operation_canceled);
 #if _WIN32
-    if (PollStatus == SOCKET_ERROR) {
+  if (PollStatus == SOCKET_ERROR)
 #else
-    if (PollStatus == -1) {
+  if (PollStatus == -1)
 #endif
-      std::error_code PollErrCode = getLastSocketErrorCode();
-      // Ignore EINTR (signal occured before any request event) and retry
-      if (PollErrCode != std::errc::interrupted)
-        return llvm::make_error<StringError>(PollErrCode, "FD poll failed");
-    }
-    if (PollStatus == 0)
-      return llvm::make_error<StringError>(
-          std::make_error_code(std::errc::timed_out),
-          "No client requests within timeout window");
-
-    if (FDs[0].revents & POLLNVAL)
-      return llvm::make_error<StringError>(
-          std::make_error_code(std::errc::bad_file_descriptor));
+    return getLastSocketErrorCode();
+  if (PollStatus == 0)
+    return std::make_error_code(std::errc::timed_out);
+  if (FD[0].revents & POLLNVAL)
+    return std::make_error_code(std::errc::bad_file_descriptor);
+  return std::error_code();
+}
 
-    auto Stop = std::chrono::steady_clock::now();
-    ElapsedTime +=
-        std::chrono::duration_cast<std::chrono::milliseconds>(Stop - Start);
-  }
+Expected<std::unique_ptr<raw_socket_stream>>
+ListeningSocket::accept(const std::chrono::milliseconds &Timeout) {
+  auto getActiveFD = [this]() -> int { return FD; };
+  std::error_code TimeoutErr = manageTimeout(Timeout, getActiveFD, PipeFD[0]);
+  if (TimeoutErr)
+    return llvm::make_error<StringError>(TimeoutErr, "Timeout error");
 
   int AcceptFD;
 #ifdef _WIN32
-  SOCKET WinAcceptSock = ::accept(WinServerSock, NULL, NULL);
+  SOCKET WinAcceptSock = ::accept(_get_osfhandle(FD), NULL, NULL);
   AcceptFD = _open_osfhandle(WinAcceptSock, 0);
 #else
   AcceptFD = ::accept(FD, NULL, NULL);
@@ -295,6 +315,8 @@ ListeningSocket::~ListeningSocket() {
 raw_socket_stream::raw_socket_stream(int SocketFD)
     : raw_fd_stream(SocketFD, true) {}
 
+raw_socket_stream::~raw_socket_stream() {}
+
 Expected<std::unique_ptr<raw_socket_stream>>
 raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
 #ifdef _WIN32
@@ -306,4 +328,14 @@ raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
   return std::make_unique<raw_socket_stream>(*FD);
 }
 
-raw_socket_stream::~raw_socket_stream() {}
+ssize_t raw_socket_stream::read(char *Ptr, size_t Size,
+                                const std::chrono::milliseconds &Timeout) {
+  auto getActiveFD = [this]() -> int { return this->get_fd(); };
+  std::error_code Err = manageTimeout(Timeout, getActiveFD);
+  // Mimic raw_fd_stream::read error handling behavior
+  if (Err) {
+    raw_fd_stream::error_detected(Err);
+    return -1;
+  }
+  return raw_fd_stream::read(Ptr, Size);
+}
diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp
index 607789b391381..cdb76d57e2c1d 100644
--- a/llvm/lib/Support/xxhash.cpp
+++ b/llvm/lib/Support/xxhash.cpp
@@ -47,6 +47,19 @@
 
 #include <stdlib.h>
 
+#if !defined(LLVM_XXH_USE_NEON)
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&      \
+    !defined(__ARM_BIG_ENDIAN)
+#define LLVM_XXH_USE_NEON 1
+#else
+#define LLVM_XXH_USE_NEON 0
+#endif
+#endif
+
+#if LLVM_XXH_USE_NEON
+#include <arm_neon.h>
+#endif
+
 using namespace llvm;
 using namespace support;
 
@@ -323,6 +336,144 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
   return XXH3_avalanche(acc);
 }
 
+#if LLVM_XXH_USE_NEON
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+
+// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
+// (July 2024), with the following removed:
+// - workaround for suboptimal codegen on older GCC
+// - compiler barriers against instruction reordering
+// - WebAssembly SIMD support
+// - configurable split between NEON and scalar lanes (benchmarking shows no
+//   penalty when fully doing SIMD on the Apple M1)
+
+#if defined(__GNUC__) || defined(__clang__)
+#define XXH_ALIASING __attribute__((__may_alias__))
+#else
+#define XXH_ALIASING /* nothing */
+#endif
+
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
+  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
+                                     const uint8_t *secret) {
+  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+#ifdef __clang__
+#pragma clang loop unroll(full)
+#endif
+  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
+    /* data_vec = input[i]; */
+    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
+    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
+
+    /* key_vec  = secret[i];  */
+    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
+    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
+
+    /* data_swap = swap(data_vec) */
+    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+
+    /* data_key = data_vec ^ key_vec; */
+    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+    /*
+     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+     * get one vector with the low 32 bits of each lane, and one vector
+     * with the high 32 bits of each lane.
+     *
+     * The intrinsic returns a double vector because the original ARMv7-a
+     * instruction modified both arguments in place. AArch64 and SIMD128 emit
+     * two instructions from this intrinsic.
+     *
+     *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+     *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+     */
+    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
+                                      vreinterpretq_u32_u64(data_key_2));
+
+    /* data_key_lo = data_key & 0xFFFFFFFF */
+    uint32x4_t data_key_lo = unzipped.val[0];
+    /* data_key_hi = data_key >> 32 */
+    uint32x4_t data_key_hi = unzipped.val[1];
+
+    /*
+     * Then, we can split the vectors horizontally and multiply which, as for
+     * most widening intrinsics, have a variant that works on both high half
+     * vectors for free on AArch64. A similar instruction is available on
+     * SIMD128.
+     *
+     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+     */
+    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
+                                 vget_low_u32(data_key_hi));
+    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
+                                 vget_high_u32(data_key_hi));
+
+    /* xacc[i] = acc_vec + sum; */
+    xacc[i] = vaddq_u64(xacc[i], sum_1);
+    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
+  }
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
+  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+  /* { prime32_1, prime32_1 } */
+  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
+  /* { 0, prime32_1, 0, prime32_1 } */
+  uint32x4_t const kPrimeHi =
+      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
+
+  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
+    /* xacc[i] ^= (xacc[i] >> 47); */
+    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
+    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+    /* xacc[i] ^= secret[i]; */
+    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
+    uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+
+    /*
+     * xacc[i] *= XXH_PRIME32_1
+     *
+     * Expanded version with portable NEON intrinsics
+     *
+     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+     *
+     * prod_hi = hi(data_key) * lo(prime) << 32
+     *
+     * Since we only need 32 bits of this multiply a trick can be used,
+     * reinterpreting the vector as a uint32x4_t and multiplying by
+     * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
+     * shift.
+     */
+    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
+
+    /* Extract low bits for vmlal_u32  */
+    uint32x2_t data_key_lo = vmovn_u64(data_key);
+
+    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+  }
+}
+#else
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+
 LLVM_ATTRIBUTE_ALWAYS_INLINE
 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                        const uint8_t *secret) {
@@ -335,20 +486,23 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
 }
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
-                                   const uint8_t *secret, size_t nbStripes) {
-  for (size_t n = 0; n < nbStripes; ++n)
-    XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
-                               secret + n * XXH_SECRET_CONSUME_RATE);
-}
-
-static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
+static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
     acc[i] ^= acc[i] >> 47;
     acc[i] ^= endian::read64le(secret + 8 * i);
     acc[i] *= PRIME32_1;
   }
 }
+#endif
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
+                            const uint8_t *secret, size_t nbStripes) {
+  for (size_t n = 0; n < nbStripes; ++n) {
+    XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
+                        secret + n * XXH_SECRET_CONSUME_RATE);
+  }
+}
 
 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
   return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
@@ -375,21 +529,20 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
   };
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                           nbStripesPerBlock);
+    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                             secret + secretSize - XXH_STRIPE_LEN -
-                                 XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                      secret + secretSize - XXH_STRIPE_LEN -
+                          XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -840,21 +993,20 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
   };
 
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                           nbStripesPerBlock);
+    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                             secret + secretSize - XXH_STRIPE_LEN -
-                                 XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                      secret + secretSize - XXH_STRIPE_LEN -
+                          XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   static_assert(sizeof(acc) == 64);
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 4ff52eb252d20..415edb189e60c 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Mangler.h"
@@ -70,15 +71,21 @@ class AArch64Arm64ECCallLowering : public ModulePass {
   Function *buildEntryThunk(Function *F);
   void lowerCall(CallBase *CB);
   Function *buildGuestExitThunk(Function *F);
-  bool processFunction(Function &F, SetVector<Function *> &DirectCalledFns);
+  Function *buildPatchableThunk(GlobalAlias *UnmangledAlias,
+                                GlobalAlias *MangledAlias);
+  bool processFunction(Function &F, SetVector<GlobalValue *> &DirectCalledFns,
+                       DenseMap<GlobalAlias *, GlobalAlias *> &FnsMap);
   bool runOnModule(Module &M) override;
 
 private:
   int cfguard_module_flag = 0;
   FunctionType *GuardFnType = nullptr;
   PointerType *GuardFnPtrType = nullptr;
+  FunctionType *DispatchFnType = nullptr;
+  PointerType *DispatchFnPtrType = nullptr;
   Constant *GuardFnCFGlobal = nullptr;
   Constant *GuardFnGlobal = nullptr;
+  Constant *DispatchFnGlobal = nullptr;
   Module *M = nullptr;
 
   Type *PtrTy;
@@ -672,6 +679,66 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
   return GuestExit;
 }
 
+Function *
+AArch64Arm64ECCallLowering::buildPatchableThunk(GlobalAlias *UnmangledAlias,
+                                                GlobalAlias *MangledAlias) {
+  llvm::raw_null_ostream NullThunkName;
+  FunctionType *Arm64Ty, *X64Ty;
+  Function *F = cast<Function>(MangledAlias->getAliasee());
+  SmallVector<ThunkArgTranslation> ArgTranslations;
+  getThunkType(F->getFunctionType(), F->getAttributes(),
+               Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty,
+               ArgTranslations);
+  std::string ThunkName(MangledAlias->getName());
+  if (ThunkName[0] == '?' && ThunkName.find("@") != std::string::npos) {
+    ThunkName.insert(ThunkName.find("@"), "$hybpatch_thunk");
+  } else {
+    ThunkName.append("$hybpatch_thunk");
+  }
+
+  Function *GuestExit =
+      Function::Create(Arm64Ty, GlobalValue::WeakODRLinkage, 0, ThunkName, M);
+  GuestExit->setComdat(M->getOrInsertComdat(ThunkName));
+  GuestExit->setSection(".wowthk$aa");
+  BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit);
+  IRBuilder<> B(BB);
+
+  // Load the global symbol as a pointer to the check function.
+  LoadInst *DispatchLoad = B.CreateLoad(DispatchFnPtrType, DispatchFnGlobal);
+
+  // Create new dispatch call instruction.
+  Function *ExitThunk =
+      buildExitThunk(F->getFunctionType(), F->getAttributes());
+  CallInst *Dispatch =
+      B.CreateCall(DispatchFnType, DispatchLoad,
+                   {UnmangledAlias, ExitThunk, UnmangledAlias->getAliasee()});
+
+  // Ensure that the first arguments are passed in the correct registers.
+  Dispatch->setCallingConv(CallingConv::CFGuard_Check);
+
+  Value *DispatchRetVal = B.CreateBitCast(Dispatch, PtrTy);
+  SmallVector<Value *> Args;
+  for (Argument &Arg : GuestExit->args())
+    Args.push_back(&Arg);
+  CallInst *Call = B.CreateCall(Arm64Ty, DispatchRetVal, Args);
+  Call->setTailCallKind(llvm::CallInst::TCK_MustTail);
+
+  if (Call->getType()->isVoidTy())
+    B.CreateRetVoid();
+  else
+    B.CreateRet(Call);
+
+  auto SRetAttr = F->getAttributes().getParamAttr(0, Attribute::StructRet);
+  auto InRegAttr = F->getAttributes().getParamAttr(0, Attribute::InReg);
+  if (SRetAttr.isValid() && !InRegAttr.isValid()) {
+    GuestExit->addParamAttr(0, SRetAttr);
+    Call->addParamAttr(0, SRetAttr);
+  }
+
+  MangledAlias->setAliasee(GuestExit);
+  return GuestExit;
+}
+
 // Lower an indirect call with inline code.
 void AArch64Arm64ECCallLowering::lowerCall(CallBase *CB) {
   assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
@@ -727,17 +794,62 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
 
   GuardFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy}, false);
   GuardFnPtrType = PointerType::get(GuardFnType, 0);
+  DispatchFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy, PtrTy}, false);
+  DispatchFnPtrType = PointerType::get(DispatchFnType, 0);
   GuardFnCFGlobal =
       M->getOrInsertGlobal("__os_arm64x_check_icall_cfg", GuardFnPtrType);
   GuardFnGlobal =
       M->getOrInsertGlobal("__os_arm64x_check_icall", GuardFnPtrType);
+  DispatchFnGlobal =
+      M->getOrInsertGlobal("__os_arm64x_dispatch_call", DispatchFnPtrType);
+
+  DenseMap<GlobalAlias *, GlobalAlias *> FnsMap;
+  SetVector<GlobalAlias *> PatchableFns;
+
+  for (Function &F : Mod) {
+    if (!F.hasFnAttribute(Attribute::HybridPatchable) || F.isDeclaration() ||
+        F.hasLocalLinkage() || F.getName().ends_with("$hp_target"))
+      continue;
+
+    // Rename hybrid patchable functions and change callers to use a global
+    // alias instead.
+    if (std::optional<std::string> MangledName =
+            getArm64ECMangledFunctionName(F.getName().str())) {
+      std::string OrigName(F.getName());
+      F.setName(MangledName.value() + "$hp_target");
+
+      // The unmangled symbol is a weak alias to an undefined symbol with the
+      // "EXP+" prefix. This undefined symbol is resolved by the linker by
+      // creating an x86 thunk that jumps back to the actual EC target. Since we
+      // can't represent that in IR, we create an alias to the target instead.
+      // The "EXP+" symbol is set as metadata, which is then used by
+      // emitGlobalAlias to emit the right alias.
+      auto *A =
+          GlobalAlias::create(GlobalValue::LinkOnceODRLinkage, OrigName, &F);
+      F.replaceAllUsesWith(A);
+      F.setMetadata("arm64ec_exp_name",
+                    MDNode::get(M->getContext(),
+                                MDString::get(M->getContext(),
+                                              "EXP+" + MangledName.value())));
+      A->setAliasee(&F);
+
+      if (F.hasDLLExportStorageClass()) {
+        A->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
+        F.setDLLStorageClass(GlobalValue::DefaultStorageClass);
+      }
 
-  SetVector<Function *> DirectCalledFns;
+      FnsMap[A] = GlobalAlias::create(GlobalValue::LinkOnceODRLinkage,
+                                      MangledName.value(), &F);
+      PatchableFns.insert(A);
+    }
+  }
+
+  SetVector<GlobalValue *> DirectCalledFns;
   for (Function &F : Mod)
     if (!F.isDeclaration() &&
         F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native &&
         F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64)
-      processFunction(F, DirectCalledFns);
+      processFunction(F, DirectCalledFns, FnsMap);
 
   struct ThunkInfo {
     Constant *Src;
@@ -755,14 +867,20 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
           {&F, buildEntryThunk(&F), Arm64ECThunkType::Entry});
     }
   }
-  for (Function *F : DirectCalledFns) {
+  for (GlobalValue *O : DirectCalledFns) {
+    auto GA = dyn_cast<GlobalAlias>(O);
+    auto F = dyn_cast<Function>(GA ? GA->getAliasee() : O);
     ThunkMapping.push_back(
-        {F, buildExitThunk(F->getFunctionType(), F->getAttributes()),
+        {O, buildExitThunk(F->getFunctionType(), F->getAttributes()),
          Arm64ECThunkType::Exit});
-    if (!F->hasDLLImportStorageClass())
+    if (!GA && !F->hasDLLImportStorageClass())
       ThunkMapping.push_back(
           {buildGuestExitThunk(F), F, Arm64ECThunkType::GuestExit});
   }
+  for (GlobalAlias *A : PatchableFns) {
+    Function *Thunk = buildPatchableThunk(A, FnsMap[A]);
+    ThunkMapping.push_back({Thunk, A, Arm64ECThunkType::GuestExit});
+  }
 
   if (!ThunkMapping.empty()) {
     SmallVector<Constant *> ThunkMappingArrayElems;
@@ -785,7 +903,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
 }
 
 bool AArch64Arm64ECCallLowering::processFunction(
-    Function &F, SetVector<Function *> &DirectCalledFns) {
+    Function &F, SetVector<GlobalValue *> &DirectCalledFns,
+    DenseMap<GlobalAlias *, GlobalAlias *> &FnsMap) {
   SmallVector<CallBase *, 8> IndirectCalls;
 
   // For ARM64EC targets, a function definition's name is mangled differently
@@ -837,6 +956,16 @@ bool AArch64Arm64ECCallLowering::processFunction(
         continue;
       }
 
+      // Use mangled global alias for direct calls to patchable functions.
+      if (GlobalAlias *A = dyn_cast<GlobalAlias>(CB->getCalledOperand())) {
+        auto I = FnsMap.find(A);
+        if (I != FnsMap.end()) {
+          CB->setCalledOperand(I->second);
+          DirectCalledFns.insert(I->first);
+          continue;
+        }
+      }
+
       IndirectCalls.push_back(CB);
       ++Arm64ECCallsLowered;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 6f6b551993b6d..b51c056e9d53a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -68,6 +68,15 @@
 
 using namespace llvm;
 
+enum PtrauthCheckMode { Default, Unchecked, Poison, Trap };
+static cl::opt<PtrauthCheckMode> PtrauthAuthChecks(
+    "aarch64-ptrauth-auth-checks", cl::Hidden,
+    cl::values(clEnumValN(Unchecked, "none", "don't test for failure"),
+               clEnumValN(Poison, "poison", "poison on failure"),
+               clEnumValN(Trap, "trap", "trap on failure")),
+    cl::desc("Check pointer authentication auth/resign failures"),
+    cl::init(Default));
+
 #define DEBUG_TYPE "asm-printer"
 
 namespace {
@@ -93,6 +102,8 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override;
 
+  const MCExpr *lowerBlockAddressConstant(const BlockAddress &BA) override;
+
   void emitStartOfAsmFile(Module &M) override;
   void emitJumpTableInfo() override;
   std::tuple<const MCSymbol *, uint64_t, const MCSymbol *,
@@ -104,6 +115,8 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);
 
+  void LowerHardenedBRJumpTable(const MachineInstr &MI);
+
   void LowerMOPS(MCStreamer &OutStreamer, const MachineInstr &MI);
 
   void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
@@ -128,8 +141,12 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   void emitSled(const MachineInstr &MI, SledKind Kind);
 
-  // Emit the sequence for BLRA (authenticate + branch).
+  // Emit the sequence for BRA/BLRA (authenticate + branch/call).
   void emitPtrauthBranch(const MachineInstr *MI);
+
+  // Emit the sequence for AUT or AUTPAC.
+  void emitPtrauthAuthResign(const MachineInstr *MI);
+
   // Emit the sequence to compute a discriminator into x17, or reuse AddrDisc.
   unsigned emitPtrauthDiscriminator(uint16_t Disc, unsigned AddrDisc,
                                     unsigned &InstsEmitted);
@@ -143,8 +160,7 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   /// tblgen'erated driver function for lowering simple MI->MC
   /// pseudo instructions.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
   void emitInstruction(const MachineInstr *MI) override;
 
@@ -201,6 +217,7 @@ class AArch64AsmPrinter : public AsmPrinter {
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
 
   void emitFunctionBodyEnd() override;
+  void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override;
 
   MCSymbol *GetCPISymbol(unsigned CPID) const override;
   void emitEndOfAsmFile(Module &M) override;
@@ -1263,6 +1280,32 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
   }
 }
 
+void AArch64AsmPrinter::emitGlobalAlias(const Module &M,
+                                        const GlobalAlias &GA) {
+  if (auto F = dyn_cast_or_null<Function>(GA.getAliasee())) {
+    // Global aliases must point to a definition, but unmangled patchable
+    // symbols are special and need to point to an undefined symbol with "EXP+"
+    // prefix. Such undefined symbol is resolved by the linker by creating
+    // x86 thunk that jumps back to the actual EC target.
+    if (MDNode *Node = F->getMetadata("arm64ec_exp_name")) {
+      StringRef ExpStr = cast<MDString>(Node->getOperand(0))->getString();
+      MCSymbol *ExpSym = MMI->getContext().getOrCreateSymbol(ExpStr);
+      MCSymbol *Sym = MMI->getContext().getOrCreateSymbol(GA.getName());
+      OutStreamer->beginCOFFSymbolDef(Sym);
+      OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
+      OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                      << COFF::SCT_COMPLEX_TYPE_SHIFT);
+      OutStreamer->endCOFFSymbolDef();
+      OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
+      OutStreamer->emitAssignment(
+          Sym, MCSymbolRefExpr::create(ExpSym, MCSymbolRefExpr::VK_None,
+                                       MMI->getContext()));
+      return;
+    }
+  }
+  AsmPrinter::emitGlobalAlias(M, GA);
+}
+
 /// Small jump tables contain an unsigned byte or half, representing the offset
 /// from the lowest-addressed possible destination to the desired basic
 /// block. Since all instructions are 4-byte aligned, this is further compressed
@@ -1327,6 +1370,139 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer,
                                   .addImm(Size == 4 ? 0 : 2));
 }
 
+void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) {
+  unsigned InstsEmitted = 0;
+
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  assert(MJTI && "Can't lower jump-table dispatch without JTI");
+
+  const std::vector<MachineJumpTableEntry> &JTs = MJTI->getJumpTables();
+  assert(!JTs.empty() && "Invalid JT index for jump-table dispatch");
+
+  // Emit:
+  //     mov x17, #<size of table>     ; depending on table size, with MOVKs
+  //     cmp x16, x17                  ; or #imm if table size fits in 12-bit
+  //     csel x16, x16, xzr, ls        ; check for index overflow
+  //
+  //     adrp x17, Ltable@PAGE         ; materialize table address
+  //     add x17, Ltable@PAGEOFF
+  //     ldrsw x16, [x17, x16, lsl #2] ; load table entry
+  //
+  //   Lanchor:
+  //     adr x17, Lanchor              ; compute target address
+  //     add x16, x17, x16
+  //     br x16                        ; branch to target
+
+  MachineOperand JTOp = MI.getOperand(0);
+
+  unsigned JTI = JTOp.getIndex();
+  assert(!AArch64FI->getJumpTableEntryPCRelSymbol(JTI) &&
+         "unsupported compressed jump table");
+
+  const uint64_t NumTableEntries = JTs[JTI].MBBs.size();
+
+  // cmp only supports a 12-bit immediate.  If we need more, materialize the
+  // immediate, using x17 as a scratch register.
+  uint64_t MaxTableEntry = NumTableEntries - 1;
+  if (isUInt<12>(MaxTableEntry)) {
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXri)
+                                     .addReg(AArch64::XZR)
+                                     .addReg(AArch64::X16)
+                                     .addImm(MaxTableEntry)
+                                     .addImm(0));
+    ++InstsEmitted;
+  } else {
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(AArch64::MOVZXi)
+                       .addReg(AArch64::X17)
+                       .addImm(static_cast<uint16_t>(MaxTableEntry))
+                       .addImm(0));
+    ++InstsEmitted;
+    // It's sad that we have to manually materialize instructions, but we can't
+    // trivially reuse the main pseudo expansion logic.
+    // A MOVK sequence is easy enough to generate and handles the general case.
+    for (int Offset = 16; Offset < 64; Offset += 16) {
+      if ((MaxTableEntry >> Offset) == 0)
+        break;
+      EmitToStreamer(*OutStreamer,
+                     MCInstBuilder(AArch64::MOVKXi)
+                         .addReg(AArch64::X17)
+                         .addReg(AArch64::X17)
+                         .addImm(static_cast<uint16_t>(MaxTableEntry >> Offset))
+                         .addImm(Offset));
+      ++InstsEmitted;
+    }
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXrs)
+                                     .addReg(AArch64::XZR)
+                                     .addReg(AArch64::X16)
+                                     .addReg(AArch64::X17)
+                                     .addImm(0));
+    ++InstsEmitted;
+  }
+
+  // This picks entry #0 on failure.
+  // We might want to trap instead.
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::CSELXr)
+                                   .addReg(AArch64::X16)
+                                   .addReg(AArch64::X16)
+                                   .addReg(AArch64::XZR)
+                                   .addImm(AArch64CC::LS));
+  ++InstsEmitted;
+
+  // Prepare the @PAGE/@PAGEOFF low/high operands.
+  MachineOperand JTMOHi(JTOp), JTMOLo(JTOp);
+  MCOperand JTMCHi, JTMCLo;
+
+  JTMOHi.setTargetFlags(AArch64II::MO_PAGE);
+  JTMOLo.setTargetFlags(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+  MCInstLowering.lowerOperand(JTMOHi, JTMCHi);
+  MCInstLowering.lowerOperand(JTMOLo, JTMCLo);
+
+  EmitToStreamer(
+      *OutStreamer,
+      MCInstBuilder(AArch64::ADRP).addReg(AArch64::X17).addOperand(JTMCHi));
+  ++InstsEmitted;
+
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXri)
+                                   .addReg(AArch64::X17)
+                                   .addReg(AArch64::X17)
+                                   .addOperand(JTMCLo)
+                                   .addImm(0));
+  ++InstsEmitted;
+
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
+                                   .addReg(AArch64::X16)
+                                   .addReg(AArch64::X17)
+                                   .addReg(AArch64::X16)
+                                   .addImm(0)
+                                   .addImm(1));
+  ++InstsEmitted;
+
+  MCSymbol *AdrLabel = MF->getContext().createTempSymbol();
+  const auto *AdrLabelE = MCSymbolRefExpr::create(AdrLabel, MF->getContext());
+  AArch64FI->setJumpTableEntryInfo(JTI, 4, AdrLabel);
+
+  OutStreamer->emitLabel(AdrLabel);
+  EmitToStreamer(
+      *OutStreamer,
+      MCInstBuilder(AArch64::ADR).addReg(AArch64::X17).addExpr(AdrLabelE));
+  ++InstsEmitted;
+
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                   .addReg(AArch64::X16)
+                                   .addReg(AArch64::X17)
+                                   .addReg(AArch64::X16)
+                                   .addImm(0));
+  ++InstsEmitted;
+
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::BR).addReg(AArch64::X16));
+  ++InstsEmitted;
+
+  (void)InstsEmitted;
+  assert(STI->getInstrInfo()->getInstSizeInBytes(MI) >= InstsEmitted * 4);
+}
+
 void AArch64AsmPrinter::LowerMOPS(llvm::MCStreamer &OutStreamer,
                                   const llvm::MachineInstr &MI) {
   unsigned Opcode = MI.getOpcode();
@@ -1596,8 +1772,225 @@ unsigned AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
   return AArch64::X17;
 }
 
+void AArch64AsmPrinter::emitPtrauthAuthResign(const MachineInstr *MI) {
+  unsigned InstsEmitted = 0;
+  const bool IsAUTPAC = MI->getOpcode() == AArch64::AUTPAC;
+
+  // We can expand AUT/AUTPAC into 3 possible sequences:
+  // - unchecked:
+  //      autia x16, x0
+  //      pacib x16, x1 ; if AUTPAC
+  //
+  // - checked and clearing:
+  //      mov x17, x0
+  //      movk x17, #disc, lsl #48
+  //      autia x16, x17
+  //      mov x17, x16
+  //      xpaci x17
+  //      cmp x16, x17
+  //      b.eq Lsuccess
+  //      mov x16, x17
+  //      b Lend
+  //     Lsuccess:
+  //      mov x17, x1
+  //      movk x17, #disc, lsl #48
+  //      pacib x16, x17
+  //     Lend:
+  //   Where we only emit the AUT if we started with an AUT.
+  //
+  // - checked and trapping:
+  //      mov x17, x0
+  //      movk x17, #disc, lsl #48
+  //      autia x16, x0
+  //      mov x17, x16
+  //      xpaci x17
+  //      cmp x16, x17
+  //      b.eq Lsuccess
+  //      brk #<0xc470 + aut key>
+  //     Lsuccess:
+  //      mov x17, x1
+  //      movk x17, #disc, lsl #48
+  //      pacib x16, x17 ; if AUTPAC
+  //   Where the b.eq skips over the trap if the PAC is valid.
+  //
+  // This sequence is expensive, but we need more information to be able to
+  // do better.
+  //
+  // We can't TBZ the poison bit because EnhancedPAC2 XORs the PAC bits
+  // on failure.
+  // We can't TST the PAC bits because we don't always know how the address
+  // space is setup for the target environment (and the bottom PAC bit is
+  // based on that).
+  // Either way, we also don't always know whether TBI is enabled or not for
+  // the specific target environment.
+
+  // By default, auth/resign sequences check for auth failures.
+  bool ShouldCheck = true;
+  // In the checked sequence, we only trap if explicitly requested.
+  bool ShouldTrap = MF->getFunction().hasFnAttribute("ptrauth-auth-traps");
+
+  // On an FPAC CPU, you get traps whether you want them or not: there's
+  // no point in emitting checks or traps.
+  if (STI->hasFPAC())
+    ShouldCheck = ShouldTrap = false;
+
+  // However, command-line flags can override this, for experimentation.
+  switch (PtrauthAuthChecks) {
+  case PtrauthCheckMode::Default:
+    break;
+  case PtrauthCheckMode::Unchecked:
+    ShouldCheck = ShouldTrap = false;
+    break;
+  case PtrauthCheckMode::Poison:
+    ShouldCheck = true;
+    ShouldTrap = false;
+    break;
+  case PtrauthCheckMode::Trap:
+    ShouldCheck = ShouldTrap = true;
+    break;
+  }
+
+  auto AUTKey = (AArch64PACKey::ID)MI->getOperand(0).getImm();
+  uint64_t AUTDisc = MI->getOperand(1).getImm();
+  unsigned AUTAddrDisc = MI->getOperand(2).getReg();
+
+  unsigned XPACOpc = getXPACOpcodeForKey(AUTKey);
+
+  // Compute aut discriminator into x17
+  assert(isUInt<16>(AUTDisc));
+  unsigned AUTDiscReg =
+      emitPtrauthDiscriminator(AUTDisc, AUTAddrDisc, InstsEmitted);
+  bool AUTZero = AUTDiscReg == AArch64::XZR;
+  unsigned AUTOpc = getAUTOpcodeForKey(AUTKey, AUTZero);
+
+  //  autiza x16      ; if  AUTZero
+  //  autia x16, x17  ; if !AUTZero
+  MCInst AUTInst;
+  AUTInst.setOpcode(AUTOpc);
+  AUTInst.addOperand(MCOperand::createReg(AArch64::X16));
+  AUTInst.addOperand(MCOperand::createReg(AArch64::X16));
+  if (!AUTZero)
+    AUTInst.addOperand(MCOperand::createReg(AUTDiscReg));
+  EmitToStreamer(*OutStreamer, AUTInst);
+  ++InstsEmitted;
+
+  // Unchecked or checked-but-non-trapping AUT is just an "AUT": we're done.
+  if (!IsAUTPAC && (!ShouldCheck || !ShouldTrap)) {
+    assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
+    return;
+  }
+
+  MCSymbol *EndSym = nullptr;
+
+  // Checked sequences do an additional strip-and-compare.
+  if (ShouldCheck) {
+    MCSymbol *SuccessSym = createTempSymbol("auth_success_");
+
+    // XPAC has tied src/dst: use x17 as a temporary copy.
+    //  mov x17, x16
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
+                                     .addReg(AArch64::X17)
+                                     .addReg(AArch64::XZR)
+                                     .addReg(AArch64::X16)
+                                     .addImm(0));
+    ++InstsEmitted;
+
+    //  xpaci x17
+    EmitToStreamer(
+        *OutStreamer,
+        MCInstBuilder(XPACOpc).addReg(AArch64::X17).addReg(AArch64::X17));
+    ++InstsEmitted;
+
+    //  cmp x16, x17
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXrs)
+                                     .addReg(AArch64::XZR)
+                                     .addReg(AArch64::X16)
+                                     .addReg(AArch64::X17)
+                                     .addImm(0));
+    ++InstsEmitted;
+
+    //  b.eq Lsuccess
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::Bcc)
+                                     .addImm(AArch64CC::EQ)
+                                     .addExpr(MCSymbolRefExpr::create(
+                                         SuccessSym, OutContext)));
+    ++InstsEmitted;
+
+    if (ShouldTrap) {
+      // Trapping sequences do a 'brk'.
+      //  brk #<0xc470 + aut key>
+      EmitToStreamer(*OutStreamer,
+                     MCInstBuilder(AArch64::BRK).addImm(0xc470 | AUTKey));
+      ++InstsEmitted;
+    } else {
+      // Non-trapping checked sequences return the stripped result in x16,
+      // skipping over the PAC if there is one.
+
+      // FIXME: can we simply return the AUT result, already in x16? without..
+      //        ..traps this is usable as an oracle anyway, based on high bits
+      //  mov x17, x16
+      EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
+                                       .addReg(AArch64::X16)
+                                       .addReg(AArch64::XZR)
+                                       .addReg(AArch64::X17)
+                                       .addImm(0));
+      ++InstsEmitted;
+
+      if (IsAUTPAC) {
+        EndSym = createTempSymbol("resign_end_");
+
+        //  b Lend
+        EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B)
+                                         .addExpr(MCSymbolRefExpr::create(
+                                             EndSym, OutContext)));
+        ++InstsEmitted;
+      }
+    }
+
+    // If the auth check succeeds, we can continue.
+    // Lsuccess:
+    OutStreamer->emitLabel(SuccessSym);
+  }
+
+  // We already emitted unchecked and checked-but-non-trapping AUTs.
+  // That left us with trapping AUTs, and AUTPACs.
+  // Trapping AUTs don't need PAC: we're done.
+  if (!IsAUTPAC) {
+    assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
+    return;
+  }
+
+  auto PACKey = (AArch64PACKey::ID)MI->getOperand(3).getImm();
+  uint64_t PACDisc = MI->getOperand(4).getImm();
+  unsigned PACAddrDisc = MI->getOperand(5).getReg();
+
+  // Compute pac discriminator into x17
+  assert(isUInt<16>(PACDisc));
+  unsigned PACDiscReg =
+      emitPtrauthDiscriminator(PACDisc, PACAddrDisc, InstsEmitted);
+  bool PACZero = PACDiscReg == AArch64::XZR;
+  unsigned PACOpc = getPACOpcodeForKey(PACKey, PACZero);
+
+  //  pacizb x16      ; if  PACZero
+  //  pacib x16, x17  ; if !PACZero
+  MCInst PACInst;
+  PACInst.setOpcode(PACOpc);
+  PACInst.addOperand(MCOperand::createReg(AArch64::X16));
+  PACInst.addOperand(MCOperand::createReg(AArch64::X16));
+  if (!PACZero)
+    PACInst.addOperand(MCOperand::createReg(PACDiscReg));
+  EmitToStreamer(*OutStreamer, PACInst);
+  ++InstsEmitted;
+
+  assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
+  //  Lend:
+  if (EndSym)
+    OutStreamer->emitLabel(EndSym);
+}
+
 void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   unsigned InstsEmitted = 0;
+  bool IsCall = MI->getOpcode() == AArch64::BLRA;
   unsigned BrTarget = MI->getOperand(0).getReg();
 
   auto Key = (AArch64PACKey::ID)MI->getOperand(1).getImm();
@@ -1614,10 +2007,17 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   bool IsZeroDisc = DiscReg == AArch64::XZR;
 
   unsigned Opc;
-  if (Key == AArch64PACKey::IA)
-    Opc = IsZeroDisc ? AArch64::BLRAAZ : AArch64::BLRAA;
-  else
-    Opc = IsZeroDisc ? AArch64::BLRABZ : AArch64::BLRAB;
+  if (IsCall) {
+    if (Key == AArch64PACKey::IA)
+      Opc = IsZeroDisc ? AArch64::BLRAAZ : AArch64::BLRAA;
+    else
+      Opc = IsZeroDisc ? AArch64::BLRABZ : AArch64::BLRAB;
+  } else {
+    if (Key == AArch64PACKey::IA)
+      Opc = IsZeroDisc ? AArch64::BRAAZ : AArch64::BRAA;
+    else
+      Opc = IsZeroDisc ? AArch64::BRABZ : AArch64::BRAB;
+  }
 
   MCInst BRInst;
   BRInst.setOpcode(Opc);
@@ -1701,8 +2101,7 @@ void AArch64AsmPrinter::LowerLOADauthptrstatic(const MachineInstr &MI) {
     assert(GAOp.getOffset() == 0 &&
            "non-zero offset for $auth_ptr$ stub slots is not supported");
     const MCSymbol *GASym = TM.getSymbol(GAOp.getGlobal());
-    AuthPtrStubSym =
-        TLOF.getAuthPtrSlotSymbol(TM, &MF->getMMI(), GASym, Key, Disc);
+    AuthPtrStubSym = TLOF.getAuthPtrSlotSymbol(TM, MMI, GASym, Key, Disc);
   } else {
     assert(TM.getTargetTriple().isOSBinFormatMachO() &&
            "LOADauthptrstatic is implemented only for MachO/ELF");
@@ -1713,8 +2112,7 @@ void AArch64AsmPrinter::LowerLOADauthptrstatic(const MachineInstr &MI) {
     assert(GAOp.getOffset() == 0 &&
            "non-zero offset for $auth_ptr$ stub slots is not supported");
     const MCSymbol *GASym = TM.getSymbol(GAOp.getGlobal());
-    AuthPtrStubSym =
-        TLOF.getAuthPtrSlotSymbol(TM, &MF->getMMI(), GASym, Key, Disc);
+    AuthPtrStubSym = TLOF.getAuthPtrSlotSymbol(TM, MMI, GASym, Key, Disc);
   }
 
   MachineOperand StubMOHi =
@@ -1896,6 +2294,19 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) {
   assert(STI->getInstrInfo()->getInstSizeInBytes(MI) >= InstsEmitted * 4);
 }
 
+const MCExpr *
+AArch64AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) {
+  const MCExpr *BAE = AsmPrinter::lowerBlockAddressConstant(BA);
+  const Function &Fn = *BA.getFunction();
+
+  if (std::optional<uint16_t> BADisc =
+          STI->getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn))
+    return AArch64AuthMCExpr::create(BAE, *BADisc, AArch64PACKey::IA,
+                                     /*HasAddressDiversity=*/false, OutContext);
+
+  return BAE;
+}
+
 // Simple pseudo-instructions have their lowering (with expansion to real
 // instructions) auto-generated.
 #include "AArch64GenMCPseudoLowering.inc"
@@ -1904,8 +2315,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
   AArch64_MC::verifyInstructionPredicates(MI->getOpcode(), STI->getFeatureBits());
 
   // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
+  }
 
   if (MI->getOpcode() == AArch64::ADRP) {
     for (auto &Opd : MI->operands()) {
@@ -2031,6 +2444,11 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  case AArch64::AUT:
+  case AArch64::AUTPAC:
+    emitPtrauthAuthResign(MI);
+    return;
+
   case AArch64::LOADauthptrstatic:
     LowerLOADauthptrstatic(*MI);
     return;
@@ -2040,6 +2458,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     LowerMOVaddrPAC(*MI);
     return;
 
+  case AArch64::BRA:
   case AArch64::BLRA:
     emitPtrauthBranch(MI);
     return;
@@ -2207,6 +2626,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     LowerJumpTableDest(*OutStreamer, *MI);
     return;
 
+  case AArch64::BR_JumpTable:
+    LowerHardenedBRJumpTable(*MI);
+    return;
+
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 2f7e226fd09b2..2bbb4997d56a5 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -17,6 +17,11 @@ class CCIfBigEndian<CCAction A> :
 class CCIfILP32<CCAction A> :
   CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>;
 
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+    : CCIf<!strconcat("State.getMachineFunction()"
+                      ".getSubtarget<AArch64Subtarget>().", F),
+           A>;
 
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
@@ -333,7 +338,7 @@ def CC_AArch64_Win64_CFGuard_Check : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
-  CCIfType<[i64], CCAssignToReg<[X11, X10]>>
+  CCIfType<[i64], CCAssignToReg<[X11, X10, X9]>>
 ]>;
 
 let Entry = 1 in
@@ -496,36 +501,44 @@ def CC_AArch64_GHC : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_Preserve_None : CallingConv<[
-    // We can pass arguments in all general registers, except:
-    // - X8, used for sret
-    // - X16/X17, used by the linker as IP0/IP1
-    // - X18, the platform register
-    // - X19, the base pointer
-    // - X29, the frame pointer
-    // - X30, the link register
-    // General registers are not preserved with the exception of
-    // FP, LR, and X18
-    // Non-volatile registers are used first, so functions may call
-    // normal functions without saving and reloading arguments.
-    // X9 is assigned last as it is used in FrameLowering as the first
-    // choice for a scratch register.
-    CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
-                                   W24, W25, W26, W27, W28,
-                                   W0, W1, W2, W3, W4, W5,
-                                   W6, W7, W10, W11,
-                                   W12, W13, W14, W9]>>,
-    CCIfType<[i64], CCAssignToReg<[X20, X21, X22, X23,
-                                   X24, X25, X26, X27, X28,
-                                   X0, X1, X2, X3, X4, X5,
-                                   X6, X7, X10, X11,
-                                   X12, X13, X14, X9]>>,
-
-    // Windows uses X15 for stack allocation
-    CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
-        CCIfType<[i32], CCAssignToReg<[W15]>>>,
-    CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
-        CCIfType<[i64], CCAssignToReg<[X15]>>>,
-    CCDelegateTo<CC_AArch64_AAPCS>
+  // VarArgs are only supported using the C calling convention.
+  // This handles the non-variadic parameter case. Variadic parameters
+  // are handled in CCAssignFnForCall.
+  CCIfVarArg<CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_AArch64_DarwinPCS>>>,
+  CCIfVarArg<CCIfSubtarget<"isTargetWindows()", CCDelegateTo<CC_AArch64_Win64PCS>>>,
+  CCIfVarArg<CCDelegateTo<CC_AArch64_AAPCS>>,
+
+  // We can pass arguments in all general registers, except:
+  // - X8, used for sret
+  // - X16/X17, used by the linker as IP0/IP1
+  // - X18, the platform register
+  // - X19, the base pointer
+  // - X29, the frame pointer
+  // - X30, the link register
+  // General registers are not preserved with the exception of
+  // FP, LR, and X18
+  // Non-volatile registers are used first, so functions may call
+  // normal functions without saving and reloading arguments.
+  // X9 is assigned last as it is used in FrameLowering as the first
+  // choice for a scratch register.
+  CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
+                                 W24, W25, W26, W27, W28,
+                                 W0, W1, W2, W3, W4, W5,
+                                 W6, W7, W10, W11,
+                                 W12, W13, W14, W9]>>,
+  CCIfType<[i64], CCAssignToReg<[X20, X21, X22, X23,
+                                 X24, X25, X26, X27, X28,
+                                 X0, X1, X2, X3, X4, X5,
+                                 X6, X7, X10, X11,
+                                 X12, X13, X14, X9]>>,
+
+  // Windows uses X15 for stack allocation
+  CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
+    CCIfType<[i32], CCAssignToReg<[W15]>>>,
+  CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
+    CCIfType<[i64], CCAssignToReg<[X15]>>>,
+
+  CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
 
 // The order of the callee-saves in this file is important, because the
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index fc4d05c4b8b21..9669a393bc2b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -416,7 +416,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
 
     // We never speculate stores, so an AA pointer isn't necessary.
     bool DontMoveAcrossStore = true;
-    if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+    if (!I.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
       return false;
     }
@@ -711,7 +711,6 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   Head->updateTerminator(CmpBB->getNextNode());
 
   RemovedBlocks.push_back(CmpBB);
-  CmpBB->eraseFromParent();
   LLVM_DEBUG(dbgs() << "Result:\n" << *Head);
   ++NumConverted;
 }
@@ -918,6 +917,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
     CmpConv.convert(RemovedBlocks);
     Changed = true;
     updateDomTree(RemovedBlocks);
+    for (MachineBasicBlock *MBB : RemovedBlocks)
+      MBB->eraseFromParent();
     updateLoops(RemovedBlocks);
   }
   return Changed;
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 527496f1a6374..719d79dec0c31 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -2516,6 +2516,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
   if (AddrReg == 0)
     return false;
 
+  // Authenticated indirectbr is not implemented yet.
+  if (FuncInfo.MF->getFunction().hasFnAttribute("ptrauth-indirect-gotos"))
+    return false;
+
   // Emit the indirect branch.
   const MCInstrDesc &II = TII.get(AArch64::BR);
   AddrReg = constrainOperandRegClass(II, AddrReg,  II.getNumDefs());
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 832e44fe117e2..a1ae0873fc190 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -185,6 +185,9 @@ def FeatureJS : ExtensionWithMArch<"jsconv", "JS", "FEAT_JSCVT",
   "Enable Armv8.3-A JavaScript FP conversion instructions",
   [FeatureFPARMv8]>;
 
+def FeatureFPAC : Extension<"fpac", "FPAC", "FEAT_FPAC",
+  "Enable v8.3-A Pointer Authentication Faulting enhancement">;
+
 def FeatureCCIDX : Extension<"ccidx", "CCIDX", "FEAT_CCIDX",
   "Enable Armv8.3-A Extend of the CCSIDR number of sets">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0589b14949bf4..bd530903bb664 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1714,7 +1714,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
-  MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
   bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
@@ -1882,7 +1881,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
       if (EmitCFI) {
         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+        MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
         // Encode the stack size of the leaf function.
         unsigned CFIIndex = MF.addFrameInst(
             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
@@ -1901,8 +1900,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     return;
   }
 
-  bool IsWin64 =
-      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
 
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
@@ -2308,8 +2306,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // How much of the stack used by incoming arguments this function is expected
   // to restore in this particular epilogue.
   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
-  bool IsWin64 =
-      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+  bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
+                                              MF.getFunction().isVarArg());
   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
 
   int64_t AfterCSRPopSize = ArgumentStackToRestore;
@@ -2605,6 +2603,41 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
       /*ForSimm=*/false);
 }
 
+StackOffset
+AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                   int FI) const {
+  // This function serves to provide a comparable offset from a single reference
+  // point (the value of SP at function entry) that can be used for analysis,
+  // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
+  // correct for all objects in the presence of VLA-area objects or dynamic
+  // stack re-alignment.
+
+  const auto &MFI = MF.getFrameInfo();
+
+  int64_t ObjectOffset = MFI.getObjectOffset(FI);
+
+  // This is correct in the absence of any SVE stack objects.
+  StackOffset SVEStackSize = getSVEStackSize(MF);
+  if (!SVEStackSize)
+    return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
+
+  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+  if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+    return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
+                            ObjectOffset);
+  }
+
+  bool IsFixed = MFI.isFixedObjectIndex(FI);
+  bool IsCSR =
+      !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
+
+  StackOffset ScalableOffset = {};
+  if (!IsFixed && !IsCSR)
+    ScalableOffset = -SVEStackSize;
+
+  return StackOffset::getFixed(ObjectOffset) + ScalableOffset;
+}
+
 StackOffset
 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
                                                      int FI) const {
@@ -2615,8 +2648,8 @@ static StackOffset getFPOffset(const MachineFunction &MF,
                                int64_t ObjectOffset) {
   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  bool IsWin64 =
-      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+  const Function &F = MF.getFunction();
+  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
   unsigned FixedObject =
       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
@@ -2722,9 +2755,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
         // via the frame pointer, so we have to use the FP in the parent
         // function.
         (void) Subtarget;
-        assert(
-            Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
-            "Funclets should only be present on Win64");
+        assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
+                                            MF.getFunction().isVarArg()) &&
+               "Funclets should only be present on Win64");
         UseFP = true;
       } else {
         // We have the choice between FP and (SP or BP).
@@ -3550,7 +3583,8 @@ void AArch64FrameLowering::determineStackHazardSlot(
       for (auto &MI : MBB) {
         std::optional<int> FI = getLdStFrameID(MI, MFI);
         if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
-          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+          if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
+              AArch64InstrInfo::isFpOrNEON(MI))
             FrameObjects[*FI] |= 2;
           else
             FrameObjects[*FI] |= 1;
@@ -4734,7 +4768,8 @@ void AArch64FrameLowering::orderFrameObjects(
       if (AFI.hasStackHazardSlotIndex()) {
         std::optional<int> FI = getLdStFrameID(MI, MFI);
         if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
-          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+          if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
+              AArch64InstrInfo::isFpOrNEON(MI))
             FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
           else
             FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index da315850d6362..0ebab1700e9ce 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -41,6 +41,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
 
   StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
                                      Register &FrameReg) const override;
+  StackOffset getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                           int FI) const override;
   StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
                                          Register &FrameReg, bool PreferFP,
                                          bool ForSimm) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index b9c57d1975b6f..a3db3628c0686 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -239,18 +239,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
     return false;
   }
 
-  bool SelectDupNegativeZero(SDValue N) {
-    switch(N->getOpcode()) {
-    case AArch64ISD::DUP:
-    case ISD::SPLAT_VECTOR: {
-      ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
-      return Const && Const->isZero() && Const->isNegative();
-    }
-    }
-
-    return false;
-  }
-
   template<MVT::SimpleValueType VT>
   bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
     return SelectSVEAddSubImm(N, VT, Imm, Shift);
@@ -366,6 +354,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   bool tryIndexedLoad(SDNode *N);
 
+  void SelectPtrauthAuth(SDNode *N);
+  void SelectPtrauthResign(SDNode *N);
+
   bool trySelectStackSlotTagP(SDNode *N);
   void SelectTagP(SDNode *N);
 
@@ -1481,6 +1472,96 @@ void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
   ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
 }
 
+static std::tuple<SDValue, SDValue>
+extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
+  SDLoc DL(Disc);
+  SDValue AddrDisc;
+  SDValue ConstDisc;
+
+  // If this is a blend, remember the constant and address discriminators.
+  // Otherwise, it's either a constant discriminator, or a non-blended
+  // address discriminator.
+  if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+      Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
+    AddrDisc = Disc->getOperand(1);
+    ConstDisc = Disc->getOperand(2);
+  } else {
+    ConstDisc = Disc;
+  }
+
+  // If the constant discriminator (either the blend RHS, or the entire
+  // discriminator value) isn't a 16-bit constant, bail out, and let the
+  // discriminator be computed separately.
+  auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
+  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
+    return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
+
+  // If there's no address discriminator, use XZR directly.
+  if (!AddrDisc)
+    AddrDisc = DAG->getRegister(AArch64::XZR, MVT::i64);
+
+  return std::make_tuple(
+      DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
+      AddrDisc);
+}
+
+void AArch64DAGToDAGISel::SelectPtrauthAuth(SDNode *N) {
+  SDLoc DL(N);
+  // IntrinsicID is operand #0
+  SDValue Val = N->getOperand(1);
+  SDValue AUTKey = N->getOperand(2);
+  SDValue AUTDisc = N->getOperand(3);
+
+  unsigned AUTKeyC = cast<ConstantSDNode>(AUTKey)->getZExtValue();
+  AUTKey = CurDAG->getTargetConstant(AUTKeyC, DL, MVT::i64);
+
+  SDValue AUTAddrDisc, AUTConstDisc;
+  std::tie(AUTConstDisc, AUTAddrDisc) =
+      extractPtrauthBlendDiscriminators(AUTDisc, CurDAG);
+
+  SDValue X16Copy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
+                                         AArch64::X16, Val, SDValue());
+  SDValue Ops[] = {AUTKey, AUTConstDisc, AUTAddrDisc, X16Copy.getValue(1)};
+
+  SDNode *AUT = CurDAG->getMachineNode(AArch64::AUT, DL, MVT::i64, Ops);
+  ReplaceNode(N, AUT);
+  return;
+}
+
+void AArch64DAGToDAGISel::SelectPtrauthResign(SDNode *N) {
+  SDLoc DL(N);
+  // IntrinsicID is operand #0
+  SDValue Val = N->getOperand(1);
+  SDValue AUTKey = N->getOperand(2);
+  SDValue AUTDisc = N->getOperand(3);
+  SDValue PACKey = N->getOperand(4);
+  SDValue PACDisc = N->getOperand(5);
+
+  unsigned AUTKeyC = cast<ConstantSDNode>(AUTKey)->getZExtValue();
+  unsigned PACKeyC = cast<ConstantSDNode>(PACKey)->getZExtValue();
+
+  AUTKey = CurDAG->getTargetConstant(AUTKeyC, DL, MVT::i64);
+  PACKey = CurDAG->getTargetConstant(PACKeyC, DL, MVT::i64);
+
+  SDValue AUTAddrDisc, AUTConstDisc;
+  std::tie(AUTConstDisc, AUTAddrDisc) =
+      extractPtrauthBlendDiscriminators(AUTDisc, CurDAG);
+
+  SDValue PACAddrDisc, PACConstDisc;
+  std::tie(PACConstDisc, PACAddrDisc) =
+      extractPtrauthBlendDiscriminators(PACDisc, CurDAG);
+
+  SDValue X16Copy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
+                                         AArch64::X16, Val, SDValue());
+
+  SDValue Ops[] = {AUTKey,       AUTConstDisc, AUTAddrDisc,        PACKey,
+                   PACConstDisc, PACAddrDisc,  X16Copy.getValue(1)};
+
+  SDNode *AUTPAC = CurDAG->getMachineNode(AArch64::AUTPAC, DL, MVT::i64, Ops);
+  ReplaceNode(N, AUTPAC);
+  return;
+}
+
 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   if (LD->isUnindexed())
@@ -5437,6 +5518,15 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_tagp:
       SelectTagP(Node);
       return;
+
+    case Intrinsic::ptrauth_auth:
+      SelectPtrauthAuth(Node);
+      return;
+
+    case Intrinsic::ptrauth_resign:
+      SelectPtrauthResign(Node);
+      return;
+
     case Intrinsic::aarch64_neon_tbl2:
       SelectTable(Node, 2,
                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eef83a845e2c3..1e9da9b819bdd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -85,6 +85,7 @@
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SipHash.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -509,6 +510,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+  setOperationAction(ISD::BRIND, MVT::Other, Custom);
   setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
 
   setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);
@@ -732,10 +734,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   for (auto Op : {ISD::FREM,         ISD::FPOW,          ISD::FPOWI,
                   ISD::FCOS,         ISD::FSIN,          ISD::FSINCOS,
+                  ISD::FACOS,        ISD::FASIN,         ISD::FATAN,
+                  ISD::FCOSH,        ISD::FSINH,         ISD::FTANH,
                   ISD::FTAN,         ISD::FEXP,          ISD::FEXP2,
                   ISD::FEXP10,       ISD::FLOG,          ISD::FLOG2,
                   ISD::FLOG10,       ISD::STRICT_FREM,   ISD::STRICT_FPOW,
                   ISD::STRICT_FPOWI, ISD::STRICT_FCOS,   ISD::STRICT_FSIN,
+                  ISD::STRICT_FACOS, ISD::STRICT_FASIN,  ISD::STRICT_FATAN,
+                  ISD::STRICT_FCOSH, ISD::STRICT_FSINH,  ISD::STRICT_FTANH,
                   ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,
                   ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
     setOperationAction(Op, MVT::f16, Promote);
@@ -1074,6 +1080,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   // Try to create BICs for vector ANDs.
   setTargetDAGCombine(ISD::AND);
 
+  // llvm.init.trampoline and llvm.adjust.trampoline
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
   // Vector add and sub nodes may conceal a high-half opportunity.
   // Also, try to fold ADD into CSINC/CSINV..
   setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
@@ -1176,6 +1186,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           ISD::FNEG,              ISD::FABS,           ISD::FCEIL,
           ISD::FSQRT,             ISD::FFLOOR,         ISD::FNEARBYINT,
           ISD::FSIN,              ISD::FCOS,           ISD::FTAN,
+          ISD::FASIN,             ISD::FACOS,          ISD::FATAN,
+          ISD::FSINH,             ISD::FCOSH,          ISD::FTANH,
           ISD::FPOW,              ISD::FLOG,           ISD::FLOG2,          
           ISD::FLOG10,            ISD::FEXP,           ISD::FEXP2,
           ISD::FEXP10,            ISD::FRINT,          ISD::FROUND,
@@ -1615,6 +1627,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
       setOperationAction(ISD::FTAN, VT, Expand);
+      setOperationAction(ISD::FACOS, VT, Expand);
+      setOperationAction(ISD::FASIN, VT, Expand);
+      setOperationAction(ISD::FATAN, VT, Expand);
+      setOperationAction(ISD::FCOSH, VT, Expand);
+      setOperationAction(ISD::FSINH, VT, Expand);
+      setOperationAction(ISD::FTANH, VT, Expand);
       setOperationAction(ISD::FEXP, VT, Expand);
       setOperationAction(ISD::FEXP2, VT, Expand);
       setOperationAction(ISD::FEXP10, VT, Expand);
@@ -1822,6 +1840,12 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
     setOperationAction(ISD::FSIN, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FTAN, VT, Expand);
+    setOperationAction(ISD::FASIN, VT, Expand);
+    setOperationAction(ISD::FACOS, VT, Expand);
+    setOperationAction(ISD::FATAN, VT, Expand);
+    setOperationAction(ISD::FSINH, VT, Expand);
+    setOperationAction(ISD::FCOSH, VT, Expand);
+    setOperationAction(ISD::FTANH, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FLOG, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);
@@ -3385,6 +3409,11 @@ static bool isLegalArithImmed(uint64_t C) {
   return IsLegal;
 }
 
+static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
+  KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
+  return !KnownSrc.getSignedMinValue().isMinSignedValue();
+}
+
 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
 // can be set differently by this operation. It comes down to whether
@@ -3392,12 +3421,14 @@ static bool isLegalArithImmed(uint64_t C) {
 // everything is fine. If not then the optimization is wrong. Thus general
 // comparisons are only valid if op2 != 0.
 //
-// So, finally, the only LLVM-native comparisons that don't mention C and V
-// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
-// the absence of information about op2.
-static bool isCMN(SDValue Op, ISD::CondCode CC) {
+// So, finally, the only LLVM-native comparisons that don't mention C or V
+// are the ones that aren't unsigned comparisons. They're the only ones we can
+// safely use CMN for in the absence of information about op2.
+static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
-         (CC == ISD::SETEQ || CC == ISD::SETNE);
+         (isIntEqualitySetCC(CC) ||
+          (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
+          (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
 }
 
 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
@@ -3442,11 +3473,12 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   // register to WZR/XZR if it ends up being unused.
   unsigned Opcode = AArch64ISD::SUBS;
 
-  if (isCMN(RHS, CC)) {
+  if (isCMN(RHS, CC, DAG)) {
     // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
-  } else if (isCMN(LHS, CC)) {
+  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+             isIntEqualitySetCC(CC)) {
     // As we are looking for EQ/NE compares, the operands can be commuted ; can
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
@@ -3548,13 +3580,15 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
       Opcode = AArch64ISD::CCMN;
       RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
     }
-  } else if (RHS.getOpcode() == ISD::SUB) {
-    SDValue SubOp0 = RHS.getOperand(0);
-    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-      // See emitComparison() on why we can only do this for SETEQ and SETNE.
-      Opcode = AArch64ISD::CCMN;
-      RHS = RHS.getOperand(1);
-    }
+  } else if (isCMN(RHS, CC, DAG)) {
+    Opcode = AArch64ISD::CCMN;
+    RHS = RHS.getOperand(1);
+  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+             isIntEqualitySetCC(CC)) {
+    // As we are looking for EQ/NE compares, the operands can be commuted ; can
+    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
+    Opcode = AArch64ISD::CCMN;
+    LHS = LHS.getOperand(1);
   }
   if (Opcode == 0)
     Opcode = AArch64ISD::CCMP;
@@ -3872,8 +3906,8 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   //    cmp     w12, w11, lsl #1
   if (!isa<ConstantSDNode>(RHS) ||
       !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
-    bool LHSIsCMN = isCMN(LHS, CC);
-    bool RHSIsCMN = isCMN(RHS, CC);
+    bool LHSIsCMN = isCMN(LHS, CC, DAG);
+    bool RHSIsCMN = isCMN(RHS, CC, DAG);
     SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
     SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
 
@@ -3886,7 +3920,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 
   SDValue Cmp;
   AArch64CC::CondCode AArch64CC;
-  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+  if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
 
     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
@@ -4474,11 +4508,19 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
   EVT SrcElementVT = SrcVT.getVectorElementType();
 
   // In the absence of FP16 support, promote f16 to f32 and saturate the result.
+  SDLoc DL(Op);
+  SDValue SrcVal2;
   if ((SrcElementVT == MVT::f16 &&
        (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
       SrcElementVT == MVT::bf16) {
     MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
-    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
+    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
+    // If we are extending to a v8f32, split into two v4f32 to produce legal
+    // types.
+    if (F32VT.getSizeInBits() > 128) {
+      std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
+      F32VT = F32VT.getHalfNumVectorElementsVT();
+    }
     SrcVT = F32VT;
     SrcElementVT = MVT::f32;
     SrcElementWidth = 32;
@@ -4486,9 +4528,8 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
              SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
     return SDValue();
 
-  SDLoc DL(Op);
-  // Expand to f64 if we are saturating to i64, to help produce keep the lanes
-  // the same width and produce a fcvtzu.
+  // Expand to f64 if we are saturating to i64, to help keep the lanes the same
+  // width and produce a fcvtzu.
   if (SatWidth == 64 && SrcElementWidth < 64) {
     MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
     SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
@@ -4497,9 +4538,16 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
     SrcElementWidth = 64;
   }
   // Cases that we can emit directly.
-  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
-    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
-                       DAG.getValueType(DstVT.getScalarType()));
+  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
+    SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
+                              DAG.getValueType(DstVT.getScalarType()));
+    if (SrcVal2) {
+      SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
+                                 DAG.getValueType(DstVT.getScalarType()));
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
+    }
+    return Res;
+  }
 
   // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
   // result. This is only valid if the legal cvt is larger than the saturate
@@ -4511,20 +4559,32 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
   SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
                                   DAG.getValueType(IntVT.getScalarType()));
-  SDValue Sat;
+  SDValue NativeCvt2 =
+      SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
+                            DAG.getValueType(IntVT.getScalarType()))
+              : SDValue();
+  SDValue Sat, Sat2;
   if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
     SDValue MinC = DAG.getConstant(
         APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
     SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
+    SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
     SDValue MaxC = DAG.getConstant(
         APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
     Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
+    Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
   } else {
     SDValue MinC = DAG.getConstant(
         APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
     Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
+    Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
   }
 
+  if (SrcVal2)
+    Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                      IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
+                      Sat, Sat2);
+
   return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
 }
 
@@ -6658,6 +6718,56 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
   return Final;
 }
 
+SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
+  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+    report_fatal_error(
+        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
+
+  return Op.getOperand(0);
+}
+
+SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+
+  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
+  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+    report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl(Op);
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp;
+  Args.push_back(Entry);
+  Entry.Node = DAG.getConstant(20, dl, MVT::i64);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr;
+  Args.push_back(Entry);
+  Entry.Node = Nest;
+  Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -6675,6 +6785,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerGlobalTLSAddress(Op, DAG);
   case ISD::PtrAuthGlobalAddress:
     return LowerPtrAuthGlobalAddress(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:
+    return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:
+    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::SETCC:
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS:
@@ -6693,6 +6807,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerJumpTable(Op, DAG);
   case ISD::BR_JT:
     return LowerBR_JT(Op, DAG);
+  case ISD::BRIND:
+    return LowerBRIND(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
@@ -7091,7 +7207,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::GHC:
     return CC_AArch64_GHC;
   case CallingConv::PreserveNone:
-    return CC_AArch64_Preserve_None;
+    // The VarArg implementation makes assumptions about register
+    // argument passing that do not hold for preserve_none, so we
+    // instead fall back to C argument passing.
+    // The non-vararg case is handled in the CC function itself.
+    if (!IsVarArg)
+      return CC_AArch64_Preserve_None;
+    [[fallthrough]];
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::PreserveMost:
@@ -7164,7 +7286,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &F = MF.getFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
+  bool IsWin64 =
+      Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
   bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
                     (isVarArg && Subtarget->isWindowsArm64EC());
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -7616,7 +7739,9 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
-  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
+  Function &F = MF.getFunction();
+  bool IsWin64 =
+      Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
 
   SmallVector<SDValue, 8> MemOps;
 
@@ -7787,6 +7912,21 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
   }
 }
 
+/// Return true if the call convention supports varargs
+/// Currently only those that pass varargs like the C
+/// calling convention does are eligible
+/// Calling conventions listed in this function must also
+/// be properly handled in AArch64Subtarget::isCallingConvWin64
+static bool callConvSupportsVarArgs(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::PreserveNone:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
                                 const AArch64Subtarget *Subtarget,
                                 const TargetLowering::CallLoweringInfo &CLI,
@@ -7795,7 +7935,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
   CallingConv::ID CalleeCC = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
   const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
 
   // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
   // for the shadow store.
@@ -7923,8 +8063,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
 
   // I want anyone implementing a new calling convention to think long and hard
   // about this assert.
-  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
-         "Unexpected variadic calling convention");
+  if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
+    report_fatal_error("Unsupported variadic calling convention");
 
   LLVMContext &C = *DAG.getContext();
   // Check that the call results are passed in the same way.
@@ -10690,6 +10830,30 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
   auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
   AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
 
+  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
+  // sequence later, to guarantee the integrity of the intermediate values.
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          "aarch64-jump-table-hardening")) {
+    CodeModel::Model CM = getTargetMachine().getCodeModel();
+    if (Subtarget->isTargetMachO()) {
+      if (CM != CodeModel::Small && CM != CodeModel::Large)
+        report_fatal_error("Unsupported code-model for hardened jump-table");
+    } else {
+      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
+      assert(Subtarget->isTargetELF() &&
+             "jump table hardening only supported on MachO/ELF");
+      if (CM != CodeModel::Small)
+        report_fatal_error("Unsupported code-model for hardened jump-table");
+    }
+
+    SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
+                                       Entry, SDValue());
+    SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
+                                   DAG.getTargetJumpTable(JTI, MVT::i32),
+                                   X16Copy.getValue(0), X16Copy.getValue(1));
+    return SDValue(B, 0);
+  }
+
   SDNode *Dest =
       DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                          Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
@@ -10697,6 +10861,33 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
   return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
 }
 
+SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(1);
+
+  // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
+  // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
+  if (Dest->isMachineOpcode() &&
+      Dest->getMachineOpcode() == AArch64::JumpTableDest32)
+    return SDValue();
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  std::optional<uint16_t> BADisc =
+      Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
+  if (!BADisc)
+    return SDValue();
+
+  SDLoc DL(Op);
+
+  SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
+  SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
+  SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
+
+  SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
+                                   {Dest, Key, Disc, AddrDisc, Chain});
+  return SDValue(BrA, 0);
+}
+
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@@ -10716,15 +10907,37 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
 
 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
-  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
+  BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *BA = BAN->getBlockAddress();
+
+  if (std::optional<uint16_t> BADisc =
+          Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
+              *BA->getFunction())) {
+    SDLoc DL(Op);
+
+    // This isn't cheap, but BRIND is rare.
+    SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
+
+    SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
+
+    SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
+    SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
+
+    SDNode *MOV =
+        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
+                           {TargetBA, Key, AddrDisc, Disc});
+    return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
+                              SDValue(MOV, 1));
+  }
+
   CodeModel::Model CM = getTargetMachine().getCodeModel();
   if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
     if (!getTargetMachine().isPositionIndependent())
-      return getAddrLarge(BA, DAG);
+      return getAddrLarge(BAN, DAG);
   } else if (CM == CodeModel::Tiny) {
-    return getAddrTiny(BA, DAG);
+    return getAddrTiny(BAN, DAG);
   }
-  return getAddr(BA, DAG);
+  return getAddr(BAN, DAG);
 }
 
 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
@@ -10854,8 +11067,9 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                             SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  Function &F = MF.getFunction();
 
-  if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
+  if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
     return LowerWin64_VASTART(Op, DAG);
   else if (Subtarget->isTargetDarwin())
     return LowerDarwin_VASTART(Op, DAG);
@@ -13162,7 +13376,7 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
   if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
     return SDValue();
 
-  // The DUPQ operation is indepedent of element type so normalise to i64s.
+  // The DUPQ operation is independent of element type so normalise to i64s.
   SDValue Idx128 = Op.getOperand(2);
 
   // DUPQ can be used when idx is in range.
@@ -27529,6 +27743,20 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
   }
 
+  // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
+  if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
+      VecOp.getOpcode() == ISD::ZERO_EXTEND) {
+    SDValue BoolVec = VecOp.getOperand(0);
+    if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
+      // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
+      SDValue CntpOp = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+          DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
+          BoolVec, BoolVec);
+      return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
+    }
+  }
+
   // UADDV always returns an i64 result.
   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                    SrcVT.getVectorElementType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fcdd47541be82..81e15185f985d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1143,8 +1143,11 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
+  SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRIND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1b301a4a05fc5..6cd9a1a817086 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8286,6 +8286,7 @@ static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
 
 std::optional<outliner::OutlinedFunction>
 AArch64InstrInfo::getOutliningCandidateInfo(
+    const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
   unsigned SequenceSize = 0;
   for (auto &MI : RepeatedSequenceLocs[0])
@@ -8339,7 +8340,8 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     NumBytesToCreateFrame += 8;
 
     // PAuth is enabled - set extra tail call cost, if any.
-    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
+    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
+        *RepeatedSequenceLocs[0].getMF());
     NumBytesToCheckLRInTCEpilogue =
         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
     // Checking the authenticated LR value may significantly impact
@@ -8700,6 +8702,10 @@ void AArch64InstrInfo::mergeOutliningCandidateAttributes(
   // behaviour of one of them
   const auto &CFn = Candidates.front().getMF()->getFunction();
 
+  if (CFn.hasFnAttribute("ptrauth-returns"))
+    F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
+  if (CFn.hasFnAttribute("ptrauth-auth-traps"))
+    F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
   // Since all candidates belong to the same module, just copy the
   // function-level attributes of an arbitrary function.
   if (CFn.hasFnAttribute("sign-return-address"))
@@ -8861,8 +8867,9 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
 }
 
 outliner::InstrType
-AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
-                                   unsigned Flags) const {
+AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                       MachineBasicBlock::iterator &MIT,
+                                       unsigned Flags) const {
   MachineInstr &MI = *MIT;
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction *MF = MBB->getParent();
@@ -8974,7 +8981,7 @@ AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
 
     // We have a function we have information about. Check it if it's something
     // can safely outline.
-    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
+    MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
 
     // We don't know what's going on with the callee at all. Don't touch it.
     if (!CalleeMF)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 69ee0a70765e1..7e5ac423f09ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -466,11 +466,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
                                    bool OutlineFromLinkOnceODRs) const override;
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
   void mergeOutliningCandidateAttributes(
       Function &F, std::vector<outliner::Candidate> &Candidates) const override;
-  outliner::InstrType
-  getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
+  outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                           MachineBasicBlock::iterator &MIT,
+                                           unsigned Flags) const override;
   SmallVector<
       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
   getOutlinableRanges(MachineBasicBlock &MBB, unsigned &Flags) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index dd11f74882115..1053ba9242768 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -276,6 +276,8 @@ def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
                        AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
 def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">,
                        AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
+def HasFPAC          : Predicate<"Subtarget->hasFPAC())">,
+                       AssemblerPredicateWithAll<(all_of FeatureFPAC), "fpac">;
 def HasXS            : Predicate<"Subtarget->hasXS()">,
                        AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
 def HasWFxT          : Predicate<"Subtarget->hasWFxT()">,
@@ -1143,6 +1145,33 @@ def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
                      Sched<[]>;
 }
 
+// A hardened but more expensive version of jump-table dispatch.
+// This combines the target address computation (otherwise done using the
+// JumpTableDest pseudos above) with the branch itself (otherwise done using
+// a plain BR) in a single non-attackable sequence.
+//
+// We take the final entry index as an operand to allow isel freedom. This does
+// mean that the index can be attacker-controlled.  To address that, we also do
+// limited checking of the offset, mainly ensuring it still points within the
+// jump-table array.  When it doesn't, this branches to the first entry.
+// We might want to trap instead.
+//
+// This is intended for use in conjunction with ptrauth for other code pointers,
+// to avoid signing jump-table entries and turning them into pointers.
+//
+// Entry index is passed in x16.  Clobbers x16/x17/nzcv.
+let isNotDuplicable = 1 in
+def BR_JumpTable : Pseudo<(outs), (ins i32imm:$jti), []>, Sched<[]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isBarrier = 1;
+  let isNotDuplicable = 1;
+  let Defs = [X16,X17,NZCV];
+  let Uses = [X16];
+  let Size = 44; // 28 fixed + 16 variable, for table size materialization
+}
+
 // Space-consuming pseudo to aid testing of placement and reachability
 // algorithms. Immediate operand is the number of bytes this "instruction"
 // occupies; register operands can be used to enforce dependency and constrain
@@ -1766,6 +1795,24 @@ let Predicates = [HasPAuth] in {
     let Uses = [SP];
   }
 
+  // BRA pseudo, generalized version of BRAA/BRAB/Z.
+  // This directly manipulates x16/x17, which are the only registers the OS
+  // guarantees are safe to use for sensitive operations.
+  def BRA : Pseudo<(outs), (ins GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc,
+                                GPR64noip:$AddrDisc), []>, Sched<[]> {
+    let isCodeGenOnly = 1;
+    let hasNoSchedulingInfo = 1;
+    let hasSideEffects = 1;
+    let mayStore = 0;
+    let mayLoad = 0;
+    let isBranch = 1;
+    let isTerminator = 1;
+    let isBarrier = 1;
+    let isIndirectBranch = 1;
+    let Size = 12; // 4 fixed + 8 variable, to compute discriminator.
+    let Defs = [X17];
+  }
+
   let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
     def RETAA   : AuthReturn<0b010, 0, "retaa">;
     def RETAB   : AuthReturn<0b010, 1, "retab">;
@@ -1776,6 +1823,37 @@ let Predicates = [HasPAuth] in {
   defm LDRAA  : AuthLoad<0, "ldraa", simm10Scaled>;
   defm LDRAB  : AuthLoad<1, "ldrab", simm10Scaled>;
 
+  // AUT pseudo.
+  // This directly manipulates x16/x17, which are the only registers the OS
+  // guarantees are safe to use for sensitive operations.
+  def AUT : Pseudo<(outs), (ins i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc),
+                   []>, Sched<[WriteI, ReadI]> {
+    let isCodeGenOnly = 1;
+    let hasSideEffects = 1;
+    let mayStore = 0;
+    let mayLoad = 0;
+    let Size = 32;
+    let Defs = [X16,X17,NZCV];
+    let Uses = [X16];
+  }
+
+  // AUT and re-PAC a value, using different keys/data.
+  // This directly manipulates x16/x17, which are the only registers the OS
+  // guarantees are safe to use for sensitive operations.
+  def AUTPAC
+      : Pseudo<(outs),
+               (ins i32imm:$AUTKey, i64imm:$AUTDisc, GPR64noip:$AUTAddrDisc,
+                    i32imm:$PACKey, i64imm:$PACDisc, GPR64noip:$PACAddrDisc),
+               []>, Sched<[WriteI, ReadI]> {
+    let isCodeGenOnly = 1;
+    let hasSideEffects = 1;
+    let mayStore = 0;
+    let mayLoad = 0;
+    let Size = 48;
+    let Defs = [X16,X17,NZCV];
+    let Uses = [X16];
+  }
+
   // Materialize a signed global address, with adrp+add and PAC.
   def MOVaddrPAC : Pseudo<(outs),
                           (ins i64imm:$Addr, i32imm:$Key,
@@ -1833,7 +1911,6 @@ let Predicates = [HasPAuth] in {
                                 tcGPR64:$AddrDisc),
               (AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key,
                                  imm:$Disc, tcGPR64:$AddrDisc)>;
-
 }
 
 // v9.5-A pointer authentication extensions
@@ -5363,6 +5440,17 @@ def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)),
                               (v4i32 VImm8000)))),
           (SQXTNv4i16 V128:$Vn)>;
 
+// concat_vectors(Vd, trunc(umin(X, 255))) -> UQXTRN(Vd, Vn)
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 V64:$Vd),
+                 (v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))))),
+          (UQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
+// concat_vectors(Vd, trunc(umin(X, 65535))) -> UQXTRN(Vd, Vn)
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 V64:$Vd),
+                 (v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))))),
+          (UQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
+
 // concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn)
 // with reversed min/max
 def : Pat<(v16i8 (concat_vectors
@@ -6084,6 +6172,34 @@ def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
 def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
           (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
 }
+
+// int -> float conversion of value in lane 0 of simd vector should use 
+// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
+def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+          (SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f32 (uint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+          (UCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+          (SCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+          (UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+// fp16: integer extraction from vector must be at least 32-bits to be legal.
+// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract 
+                (v8i16 FPR128:$Rn), (i64 0))), i16)))), 
+          (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+
+// unsigned 32-bit extracted element is truncated to 16-bits using AND
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract 
+                (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
+          (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+}
+
 // If an integer is about to be converted to a floating point value,
 // just load it on the floating point unit.
 // Here are the patterns for 8 and 16-bits to float.
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index bd11bc4dd6e3f..0dbfc0d64376e 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -52,14 +52,14 @@
 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
 //   %2:fpr64 = MOVID 0
 //   %4:fpr128 = IMPLICIT_DEF
-//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
 //   %6:fpr128 = IMPLICIT_DEF
-//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
-//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
+//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
 //   ==>
 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
 //   %6:fpr128 = IMPLICIT_DEF
-//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
 //
 //===----------------------------------------------------------------------===//
 
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
   bool visitINSvi64lane(MachineInstr &MI);
   bool visitFMOVDr(MachineInstr &MI);
+  bool visitCopy(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -690,6 +691,40 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
   return true;
 }
 
+// Across a basic-block we might have in i32 extract from a value that only
+// operates on upper bits (for example a sxtw). We can replace the COPY with a
+// new version skipping the sxtw.
+bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
+  Register InputReg = MI.getOperand(1).getReg();
+  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
+      !MRI->hasOneNonDBGUse(InputReg))
+    return false;
+
+  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
+  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
+  DeadInstrs.insert(SrcMI);
+  while (SrcMI && SrcMI->isFullCopy() &&
+         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
+    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
+    DeadInstrs.insert(SrcMI);
+  }
+
+  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
+      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
+    return false;
+
+  Register SrcReg = SrcMI->getOperand(1).getReg();
+  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
+  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
+  MI.getOperand(1).setReg(SrcReg);
+  LLVM_DEBUG(dbgs() << "        to: " << MI);
+  for (auto *DeadMI : DeadInstrs) {
+    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
+    DeadMI->eraseFromParent();
+  }
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -771,6 +806,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
       case AArch64::FMOVDr:
         Changed |= visitFMOVDr(MI);
         break;
+      case AArch64::COPY:
+        Changed |= visitCopy(MI);
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 201e8047b3686..e96c5a953ff2b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -38,6 +38,8 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
 }
 
 static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
+  if (F.hasFnAttribute("ptrauth-returns"))
+    return {true, false}; // non-leaf
   // The function should be signed in the following situations:
   // - sign-return-address=all
   // - sign-return-address=non-leaf and the functions spills the LR
@@ -56,6 +58,8 @@ static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
 }
 
 static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) {
+  if (F.hasFnAttribute("ptrauth-returns"))
+    return true;
   if (!F.hasFnAttribute("sign-return-address-key")) {
     if (STI.getTargetTriple().isOSWindows())
       return true;
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index eb0ff73200407..92ab4b5c3d251 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -116,7 +116,7 @@ void AArch64PointerAuth::signLR(MachineFunction &MF,
   // PAuthLR authentication instructions need to know the value of PC at the
   // point of signing (PACI*).
   if (MFnI.branchProtectionPAuthLR()) {
-    MCSymbol *PACSym = MF.getMMI().getContext().createTempSymbol();
+    MCSymbol *PACSym = MF.getContext().createTempSymbol();
     MFnI.setSigningInstrLabel(PACSym);
   }
 
@@ -341,7 +341,8 @@ bool AArch64PointerAuth::checkAuthenticatedLR(
   AArch64PACKey::ID KeyId =
       MFnI->shouldSignWithBKey() ? AArch64PACKey::IB : AArch64PACKey::IA;
 
-  AuthCheckMethod Method = Subtarget->getAuthenticatedLRCheckMethod();
+  AuthCheckMethod Method =
+      Subtarget->getAuthenticatedLRCheckMethod(*TI->getMF());
 
   if (Method == AuthCheckMethod::None)
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 1e069f4790c53..435cc18cdea62 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -611,7 +611,8 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
                                              MCRegister Reg) const {
   CallingConv::ID CC = MF.getFunction().getCallingConv();
   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
-  bool IsVarArg = STI.isCallingConvWin64(MF.getFunction().getCallingConv());
+  bool IsVarArg = STI.isCallingConvWin64(MF.getFunction().getCallingConv(),
+                                         MF.getFunction().isVarArg());
 
   auto HasReg = [](ArrayRef<MCRegister> RegList, MCRegister Reg) {
     return llvm::is_contained(RegList, Reg);
@@ -623,7 +624,9 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
   case CallingConv::GHC:
     return HasReg(CC_AArch64_GHC_ArgRegs, Reg);
   case CallingConv::PreserveNone:
-    return HasReg(CC_AArch64_Preserve_None_ArgRegs, Reg);
+    if (!MF.getFunction().isVarArg())
+      return HasReg(CC_AArch64_Preserve_None_ArgRegs, Reg);
+    [[fallthrough]];
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::PreserveMost:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index a3c41f2e052cd..19c03011e07b2 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -213,18 +213,10 @@ def AArch64fadd_p_contract : PatFrag<(ops node:$op1, node:$op2, node:$op3),
                                      (AArch64fadd_p node:$op1, node:$op2, node:$op3), [{
   return N->getFlags().hasAllowContract();
 }]>;
-def AArch64fadd_p_nsz : PatFrag<(ops node:$op1, node:$op2, node:$op3),
-                                (AArch64fadd_p node:$op1, node:$op2, node:$op3), [{
-  return N->getFlags().hasNoSignedZeros();
-}]>;
 def AArch64fsub_p_contract : PatFrag<(ops node:$op1, node:$op2, node:$op3),
                                      (AArch64fsub_p node:$op1, node:$op2, node:$op3), [{
   return N->getFlags().hasAllowContract();
 }]>;
-def AArch64fsub_p_nsz : PatFrag<(ops node:$op1, node:$op2, node:$op3),
-                                (AArch64fsub_p node:$op1, node:$op2, node:$op3), [{
-  return N->getFlags().hasNoSignedZeros();
-}]>;
 
 def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3,i32>,
@@ -281,15 +273,11 @@ def AArch64not_mt  : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_
 def AArch64fmul_m1 : VSelectPredOrPassthruPatFrags<int_aarch64_sve_fmul, AArch64fmul_p>;
 def AArch64fadd_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [
     (int_aarch64_sve_fadd node:$pg, node:$op1, node:$op2),
-    (vselect node:$pg, (AArch64fadd_p (SVEAllActive), node:$op1, node:$op2), node:$op1),
-    (AArch64fadd_p_nsz (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDup0))),
-    (AArch64fadd_p (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDupNeg0)))
+    (vselect node:$pg, (AArch64fadd_p (SVEAllActive), node:$op1, node:$op2), node:$op1)
 ]>;
 def AArch64fsub_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [
     (int_aarch64_sve_fsub node:$pg, node:$op1, node:$op2),
-    (vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op1, node:$op2), node:$op1),
-    (AArch64fsub_p (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDup0))),
-    (AArch64fsub_p_nsz (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDupNeg0)))
+    (vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op1, node:$op2), node:$op1)
 ]>;
 
 def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 1fad1d5ca6d7d..642006e706c13 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/SipHash.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 
 using namespace llvm;
@@ -564,8 +565,13 @@ bool AArch64Subtarget::useAA() const { return UseAA; }
 // exception on its own. Later, if the callee spills the signed LR value and
 // neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
 // the higher bits of LR thus hiding the authentication failure.
-AArch64PAuth::AuthCheckMethod
-AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
+AArch64PAuth::AuthCheckMethod AArch64Subtarget::getAuthenticatedLRCheckMethod(
+    const MachineFunction &MF) const {
+  // TODO: Check subtarget for the scheme. Present variant is a default for
+  // pauthtest ABI.
+  if (MF.getFunction().hasFnAttribute("ptrauth-returns") &&
+      MF.getFunction().hasFnAttribute("ptrauth-auth-traps"))
+    return AArch64PAuth::AuthCheckMethod::HighBitsNoTBI;
   if (AuthenticatedLRCheckMethod.getNumOccurrences())
     return AuthenticatedLRCheckMethod;
 
@@ -574,6 +580,17 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
   return AArch64PAuth::AuthCheckMethod::None;
 }
 
+std::optional<uint16_t>
+AArch64Subtarget::getPtrAuthBlockAddressDiscriminatorIfEnabled(
+    const Function &ParentFn) const {
+  if (!ParentFn.hasFnAttribute("ptrauth-indirect-gotos"))
+    return std::nullopt;
+  // We currently have one simple mechanism for all targets.
+  // This isn't ABI, so we can always do better in the future.
+  return getPointerAuthStableSipHash(
+      (Twine(ParentFn.getName()) + " blockaddress").str());
+}
+
 bool AArch64Subtarget::enableMachinePipeliner() const {
   return getSchedModel().hasInstrSchedModel();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 4b840b24ba134..0f3a637f98fbe 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -322,13 +322,15 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 
   std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
 
-  bool isCallingConvWin64(CallingConv::ID CC) const {
+  bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const {
     switch (CC) {
     case CallingConv::C:
     case CallingConv::Fast:
     case CallingConv::Swift:
     case CallingConv::SwiftTail:
       return isTargetWindows();
+    case CallingConv::PreserveNone:
+      return IsVarArg && isTargetWindows();
     case CallingConv::Win64:
       return true;
     default:
@@ -411,7 +413,18 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }
 
   /// Choose a method of checking LR before performing a tail call.
-  AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod() const;
+  AArch64PAuth::AuthCheckMethod
+  getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
+
+  /// Compute the integer discriminator for a given BlockAddress constant, if
+  /// blockaddress signing is enabled, or std::nullopt otherwise.
+  /// Blockaddress signing is controlled by the function attribute
+  /// "ptrauth-indirect-gotos" on the parent function.
+  /// Note that this assumes the discriminator is independent of the indirect
+  /// goto branch site itself, i.e., it's the same for all BlockAddresses in
+  /// a function.
+  std::optional<uint16_t>
+  getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const;
 
   const PseudoSourceValue *getAddressCheckPSV() const {
     return AddressCheckPSV.get();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 45148449dfb82..79c0e45e3aa5b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -748,22 +748,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // output are the same, or we are using cvt f64->i32 or f32->i64.
     if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
          LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
-         LT.second == MVT::v2f64) &&
-        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
-         (LT.second == MVT::f64 && MTy == MVT::i32) ||
-         (LT.second == MVT::f32 && MTy == MVT::i64)))
-      return LT.first;
-    // Similarly for fp16 sizes
-    if (ST->hasFullFP16() &&
-        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
-         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
-          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
+         LT.second == MVT::v2f64)) {
+      if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
+           (LT.second == MVT::f64 && MTy == MVT::i32) ||
+           (LT.second == MVT::f32 && MTy == MVT::i64)))
+        return LT.first;
+      // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
+      if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
+          MTy.getScalarSizeInBits() == 64)
+        return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
+    }
+    // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
+    // f32.
+    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
+      return LT.first + getIntrinsicInstrCost(
+                            {ICA.getID(),
+                             RetTy,
+                             {ICA.getArgTypes()[0]->getWithNewType(
+                                 Type::getFloatTy(RetTy->getContext()))}},
+                            CostKind);
+    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
+        (LT.second == MVT::f16 && MTy == MVT::i64) ||
+        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
+         (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
       return LT.first;
-
-    // Otherwise we use a legal convert followed by a min+max
+    // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
+    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
+        MTy.getScalarSizeInBits() == 32)
+      return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
+    // Extending vector types v8f16->v8i32. These current scalarize but the
+    // codegen could be better.
+    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
+        MTy.getScalarSizeInBits() == 64)
+      return MTy.getVectorNumElements() * 3;
+
+    // If we can we use a legal convert followed by a min+max
     if ((LT.second.getScalarType() == MVT::f32 ||
          LT.second.getScalarType() == MVT::f64 ||
-         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
+         LT.second.getScalarType() == MVT::f16) &&
         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
       Type *LegalTy =
           Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
@@ -776,9 +798,33 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
-      return LT.first * Cost;
+      return LT.first * Cost +
+             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
+                                                                           : 1);
     }
-    break;
+    // Otherwise we need to follow the default expansion that clamps the value
+    // using a float min/max with a fcmp+sel for nan handling when signed.
+    Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
+    RetTy = RetTy->getScalarType();
+    if (LT.second.isVector()) {
+      FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
+      RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
+    }
+    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
+    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
+    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
+    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+    Cost +=
+        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
+    if (IsSigned) {
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
+                                 CmpInst::FCMP_UNO, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+                                 CmpInst::FCMP_UNO, CostKind);
+    }
+    return LT.first * Cost;
   }
   case Intrinsic::fshl:
   case Intrinsic::fshr: {
@@ -4065,15 +4111,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
           NMask.push_back(MaskElt % LTNumElts);
       }
       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
-      // getShuffleCost. If not then cost it using the worst case.
+      // getShuffleCost. If not then cost it using the worst case as the number
+      // of element moves into a new vector.
       if (NumSources <= 2)
         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
-      else if (any_of(enumerate(NMask), [&](const auto &ME) {
-                 return ME.value() % LTNumElts == ME.index();
-               }))
-        Cost += LTNumElts - 1;
       else
         Cost += LTNumElts;
     }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 5206ba46260ed..b4d2a3388c1df 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -117,7 +117,9 @@ struct AArch64OutgoingValueAssigner
                  CCValAssign::LocInfo LocInfo,
                  const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
                  CCState &State) override {
-    bool IsCalleeWin = Subtarget.isCallingConvWin64(State.getCallingConv());
+    const Function &F = State.getMachineFunction().getFunction();
+    bool IsCalleeWin =
+        Subtarget.isCallingConvWin64(State.getCallingConv(), F.isVarArg());
     bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
 
     bool Res;
@@ -557,8 +559,8 @@ void AArch64CallLowering::saveVarArgRegisters(
   MachineFrameInfo &MFI = MF.getFrameInfo();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  bool IsWin64CC =
-      Subtarget.isCallingConvWin64(CCInfo.getCallingConv());
+  bool IsWin64CC = Subtarget.isCallingConvWin64(CCInfo.getCallingConv(),
+                                                MF.getFunction().isVarArg());
   const LLT p0 = LLT::pointer(0, 64);
   const LLT s64 = LLT::scalar(64);
 
@@ -653,7 +655,9 @@ bool AArch64CallLowering::lowerFormalArguments(
       F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64)
     return false;
 
-  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv()) && !Subtarget.isWindowsArm64EC();
+  bool IsWin64 =
+      Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()) &&
+      !Subtarget.isWindowsArm64EC();
 
   SmallVector<ArgInfo, 8> SplitArgs;
   SmallVector<std::pair<Register, Register>> BoolArgs;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 24d65624e09e9..e9e6b6cb68d0d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -414,8 +414,13 @@ class AArch64InstructionSelector : public InstructionSelector {
     return selectAddrModeIndexed(Root, Width / 8);
   }
 
+  std::optional<bool>
+  isWorthFoldingIntoAddrMode(MachineInstr &MI,
+                             const MachineRegisterInfo &MRI) const;
+
   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
-                                     const MachineRegisterInfo &MRI) const;
+                                     const MachineRegisterInfo &MRI,
+                                     bool IsAddrOperand) const;
   ComplexRendererFns
   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                   unsigned SizeInBytes) const;
@@ -2001,7 +2006,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin(
 
   int FrameIdx = FuncInfo->getVarArgsStackIndex();
   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
-          MF.getFunction().getCallingConv())) {
+          MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) {
     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
                    ? FuncInfo->getVarArgsGPRIndex()
                    : FuncInfo->getVarArgsStackIndex();
@@ -2279,8 +2284,9 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
     Register Dst = I.getOperand(0).getReg();
     auto *CV = ConstantDataVector::getSplat(
         MRI.getType(Dst).getNumElements(),
-        ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
-                         ValAndVReg->Value));
+        ConstantInt::get(
+            Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()),
+            ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits())));
     if (!emitConstantVector(Dst, CV, MIB, MRI))
       return false;
     I.eraseFromParent();
@@ -2547,6 +2553,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return selectCompareBranch(I, MF, MRI);
 
   case TargetOpcode::G_BRINDIRECT: {
+    const Function &Fn = MF.getFunction();
+    if (std::optional<uint16_t> BADisc =
+            STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
+      auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
+      MI.addImm(AArch64PACKey::IA);
+      MI.addImm(*BADisc);
+      MI.addReg(/*AddrDisc=*/AArch64::XZR);
+      I.eraseFromParent();
+      return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+    }
     I.setDesc(TII.get(AArch64::BR));
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
@@ -3461,6 +3477,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return true;
   }
   case TargetOpcode::G_BLOCK_ADDR: {
+    Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
+    if (std::optional<uint16_t> BADisc =
+            STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
+      MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
+      MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
+      MIB.buildInstr(AArch64::MOVaddrPAC)
+          .addBlockAddress(I.getOperand(1).getBlockAddress())
+          .addImm(AArch64PACKey::IA)
+          .addReg(/*AddrDisc=*/AArch64::XZR)
+          .addImm(*BADisc)
+          .constrainAllUses(TII, TRI, RBI);
+      MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
+      RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+                                   AArch64::GPR64RegClass, MRI);
+      I.eraseFromParent();
+      return true;
+    }
     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
       I.eraseFromParent();
@@ -3597,10 +3630,33 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
   unsigned JTI = I.getOperand(1).getIndex();
   Register Index = I.getOperand(2).getReg();
 
+  MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
+
+  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
+  // sequence later, to guarantee the integrity of the intermediate values.
+  if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
+    CodeModel::Model CM = TM.getCodeModel();
+    if (STI.isTargetMachO()) {
+      if (CM != CodeModel::Small && CM != CodeModel::Large)
+        report_fatal_error("Unsupported code-model for hardened jump-table");
+    } else {
+      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
+      assert(STI.isTargetELF() &&
+             "jump table hardening only supported on MachO/ELF");
+      if (CM != CodeModel::Small)
+        report_fatal_error("Unsupported code-model for hardened jump-table");
+    }
+
+    MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
+    MIB.buildInstr(AArch64::BR_JumpTable)
+        .addJumpTableIndex(I.getOperand(1).getIndex());
+    I.eraseFromParent();
+    return true;
+  }
+
   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
 
-  MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
                                       {TargetReg, ScratchReg}, {JTAddr, Index})
                            .addJumpTableIndex(JTI);
@@ -5559,7 +5615,8 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
   }
 
   if (CV->getSplatValue()) {
-    APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger());
+    APInt DefBits = APInt::getSplat(
+        DstSize, CV->getUniqueInteger().trunc(DstTy.getScalarSizeInBits()));
     auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
       MachineInstr *NewOp;
       bool Inv = false;
@@ -6502,6 +6559,64 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
+  case Intrinsic::ptrauth_resign: {
+    Register DstReg = I.getOperand(0).getReg();
+    Register ValReg = I.getOperand(2).getReg();
+    uint64_t AUTKey = I.getOperand(3).getImm();
+    Register AUTDisc = I.getOperand(4).getReg();
+    uint64_t PACKey = I.getOperand(5).getImm();
+    Register PACDisc = I.getOperand(6).getReg();
+
+    Register AUTAddrDisc = AUTDisc;
+    uint16_t AUTConstDiscC = 0;
+    std::tie(AUTConstDiscC, AUTAddrDisc) =
+        extractPtrauthBlendDiscriminators(AUTDisc, MRI);
+
+    Register PACAddrDisc = PACDisc;
+    uint16_t PACConstDiscC = 0;
+    std::tie(PACConstDiscC, PACAddrDisc) =
+        extractPtrauthBlendDiscriminators(PACDisc, MRI);
+
+    MIB.buildCopy({AArch64::X16}, {ValReg});
+    MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
+    MIB.buildInstr(AArch64::AUTPAC)
+        .addImm(AUTKey)
+        .addImm(AUTConstDiscC)
+        .addUse(AUTAddrDisc)
+        .addImm(PACKey)
+        .addImm(PACConstDiscC)
+        .addUse(PACAddrDisc)
+        .constrainAllUses(TII, TRI, RBI);
+    MIB.buildCopy({DstReg}, Register(AArch64::X16));
+
+    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
+    I.eraseFromParent();
+    return true;
+  }
+  case Intrinsic::ptrauth_auth: {
+    Register DstReg = I.getOperand(0).getReg();
+    Register ValReg = I.getOperand(2).getReg();
+    uint64_t AUTKey = I.getOperand(3).getImm();
+    Register AUTDisc = I.getOperand(4).getReg();
+
+    Register AUTAddrDisc = AUTDisc;
+    uint16_t AUTConstDiscC = 0;
+    std::tie(AUTConstDiscC, AUTAddrDisc) =
+        extractPtrauthBlendDiscriminators(AUTDisc, MRI);
+
+    MIB.buildCopy({AArch64::X16}, {ValReg});
+    MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
+    MIB.buildInstr(AArch64::AUT)
+        .addImm(AUTKey)
+        .addImm(AUTConstDiscC)
+        .addUse(AUTAddrDisc)
+        .constrainAllUses(TII, TRI, RBI);
+    MIB.buildCopy({DstReg}, Register(AArch64::X16));
+
+    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
+    I.eraseFromParent();
+    return true;
+  }
   case Intrinsic::frameaddress:
   case Intrinsic::returnaddress: {
     MachineFunction &MF = *I.getParent()->getParent();
@@ -6869,19 +6984,70 @@ AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
   return select12BitValueWithLeftShift(Immed);
 }
 
+/// Checks if we are sure that folding MI into load/store addressing mode is
+/// beneficial or not.
+///
+/// Returns:
+/// - true if folding MI would be beneficial.
+/// - false if folding MI would be bad.
+/// - std::nullopt if it is not sure whether folding MI is beneficial.
+///
+/// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
+///
+/// %13:gpr(s64) = G_CONSTANT i64 1
+/// %8:gpr(s64) = G_SHL %6, %13(s64)
+/// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
+/// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
+std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
+    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+  if (MI.getOpcode() == AArch64::G_SHL) {
+    // Address operands with shifts are free, except for running on subtargets
+    // with AddrLSLSlow14.
+    if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
+            MI.getOperand(2).getReg(), MRI)) {
+      const APInt ShiftVal = ValAndVeg->Value;
+
+      // Don't fold if we know this will be slow.
+      return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
+    }
+  }
+  return std::nullopt;
+}
+
 /// Return true if it is worth folding MI into an extended register. That is,
 /// if it's safe to pull it into the addressing mode of a load or store as a
 /// shift.
+/// \p IsAddrOperand whether the def of MI is used as an address operand
+/// (e.g. feeding into an LDR/STR).
 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
-    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+    MachineInstr &MI, const MachineRegisterInfo &MRI,
+    bool IsAddrOperand) const {
+
   // Always fold if there is one use, or if we're optimizing for size.
   Register DefReg = MI.getOperand(0).getReg();
   if (MRI.hasOneNonDBGUse(DefReg) ||
       MI.getParent()->getParent()->getFunction().hasOptSize())
     return true;
 
-  // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
-  // appropriate.
+  if (IsAddrOperand) {
+    // If we are already sure that folding MI is good or bad, return the result.
+    if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
+      return *Worth;
+
+    // Fold G_PTR_ADD if its offset operand can be folded
+    if (MI.getOpcode() == AArch64::G_PTR_ADD) {
+      MachineInstr *OffsetInst =
+          getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+
+      // Note, we already know G_PTR_ADD is used by at least two instructions.
+      // If we are also sure about whether folding is beneficial or not,
+      // return the result.
+      if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
+        return *Worth;
+    }
+  }
+
+  // FIXME: Consider checking HasALULSLFast as appropriate.
 
   // We have a fastpath, so folding a shift in and potentially computing it
   // many times may be beneficial. Check if this is only used in memory ops.
@@ -6929,7 +7095,7 @@ AArch64InstructionSelector::selectExtendedSHL(
   int64_t LegalShiftVal = Log2_32(SizeInBytes);
   if (LegalShiftVal == 0)
     return std::nullopt;
-  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
+  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
     return std::nullopt;
 
   // Now, try to find the specific G_CONSTANT. Start by assuming that the
@@ -7036,7 +7202,7 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
   // Check if we can find the G_PTR_ADD.
   MachineInstr *PtrAdd =
       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
-  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
+  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
     return std::nullopt;
 
   // Now, try to match an opcode which will match our specific offset.
@@ -7170,7 +7336,7 @@ AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
 
   MachineInstr *PtrAdd =
       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
-  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
+  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
     return std::nullopt;
 
   MachineOperand &LHS = PtrAdd->getOperand(1);
@@ -7201,7 +7367,7 @@ AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
   //
   // e.g.
   // ldr something, [base_reg, ext_reg, sxtw]
-  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
+  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
     return std::nullopt;
 
   // Check if this is an extend. We'll get an extend type if it is.
@@ -7396,7 +7562,7 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
     return std::nullopt;
   if (ShType == AArch64_AM::ROR && !AllowROR)
     return std::nullopt;
-  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
+  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
     return std::nullopt;
 
   // Need an immediate on the RHS.
@@ -7510,7 +7676,7 @@ AArch64InstructionSelector::selectArithExtendedRegister(
   if (!RootDef)
     return std::nullopt;
 
-  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
+  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
     return std::nullopt;
 
   // Check if we can fold a shift and an extend.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a73c971020bd8..d3c5742cee3eb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -267,8 +267,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .libcallFor({{s64, s128}})
       .minScalarOrElt(1, MinFPScalar);
 
-  getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
-                               G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10})
+  getActionDefinitionsBuilder(
+      {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP,
+       G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FCOSH, G_FSINH, G_FTANH})
       // We need a call for these, so we always need to scalarize.
       .scalarize(0)
       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
@@ -560,7 +561,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       })
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
-      .clampScalarOrElt(1, MinFPScalar, s64)
+      .minScalarOrElt(1, MinFPScalar)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -572,7 +574,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(1, v4s16, v8s16)
       .clampNumElements(1, v2s32, v4s32)
       .clampMaxNumElements(1, s64, 2)
-      .moreElementsToNextPow2(1);
+      .moreElementsToNextPow2(1)
+      .libcallFor({{s32, s128}});
 
   // Extensions
   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
@@ -1287,6 +1290,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_PREFETCH).custom();
 
+  getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
+
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 5616d063f70bc..23e135063147a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -42,6 +42,7 @@
 #include "AArch64GenRegisterBankInfo.def"
 
 using namespace llvm;
+static const unsigned CustomMappingID = 1;
 
 AArch64RegisterBankInfo::AArch64RegisterBankInfo(
     const TargetRegisterInfo &TRI) {
@@ -240,57 +241,12 @@ unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
 
 const RegisterBank &
 AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                                LLT) const {
+                                                LLT Ty) const {
   switch (RC.getID()) {
-  case AArch64::FPR8RegClassID:
-  case AArch64::FPR16RegClassID:
-  case AArch64::FPR16_loRegClassID:
-  case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID:
-  case AArch64::FPR32RegClassID:
-  case AArch64::FPR64RegClassID:
-  case AArch64::FPR128RegClassID:
-  case AArch64::FPR64_loRegClassID:
-  case AArch64::FPR128_loRegClassID:
-  case AArch64::FPR128_0to7RegClassID:
-  case AArch64::DDRegClassID:
-  case AArch64::DDDRegClassID:
-  case AArch64::DDDDRegClassID:
-  case AArch64::QQRegClassID:
-  case AArch64::QQQRegClassID:
-  case AArch64::QQQQRegClassID:
-  case AArch64::ZPRRegClassID:
-  case AArch64::ZPR_3bRegClassID:
-    return getRegBank(AArch64::FPRRegBankID);
-  case AArch64::GPR32commonRegClassID:
-  case AArch64::GPR32RegClassID:
-  case AArch64::GPR32spRegClassID:
-  case AArch64::GPR32sponlyRegClassID:
-  case AArch64::GPR32argRegClassID:
-  case AArch64::GPR32allRegClassID:
-  case AArch64::GPR64commonRegClassID:
-  case AArch64::GPR64RegClassID:
-  case AArch64::GPR64spRegClassID:
   case AArch64::GPR64sponlyRegClassID:
-  case AArch64::GPR64argRegClassID:
-  case AArch64::GPR64allRegClassID:
-  case AArch64::GPR64noipRegClassID:
-  case AArch64::GPR64common_and_GPR64noipRegClassID:
-  case AArch64::GPR64noip_and_tcGPR64RegClassID:
-  case AArch64::tcGPR64RegClassID:
-  case AArch64::tcGPRx16x17RegClassID:
-  case AArch64::tcGPRx17RegClassID:
-  case AArch64::tcGPRnotx16RegClassID:
-  case AArch64::WSeqPairsClassRegClassID:
-  case AArch64::XSeqPairsClassRegClassID:
-  case AArch64::MatrixIndexGPR32_8_11RegClassID:
-  case AArch64::MatrixIndexGPR32_12_15RegClassID:
-  case AArch64::GPR64_with_sub_32_in_MatrixIndexGPR32_8_11RegClassID:
-  case AArch64::GPR64_with_sub_32_in_MatrixIndexGPR32_12_15RegClassID:
     return getRegBank(AArch64::GPRRegBankID);
-  case AArch64::CCRRegClassID:
-    return getRegBank(AArch64::CCRegBankID);
   default:
-    llvm_unreachable("Register class not supported");
+    return AArch64GenRegisterBankInfo::getRegBankFromRegClass(RC, Ty);
   }
 }
 
@@ -424,6 +380,26 @@ void AArch64RegisterBankInfo::applyMappingImpl(
     MI.getOperand(2).setReg(Ext.getReg(0));
     return applyDefaultMapping(OpdMapper);
   }
+  case AArch64::G_DUP: {
+    // Extend smaller gpr to 32-bits
+    assert(MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() < 32 &&
+           "Expected sources smaller than 32-bits");
+    Builder.setInsertPt(*MI.getParent(), MI.getIterator());
+
+    Register ConstReg;
+    auto ConstMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+    if (ConstMI->getOpcode() == TargetOpcode::G_CONSTANT) {
+      auto CstVal = ConstMI->getOperand(1).getCImm()->getValue();
+      ConstReg =
+          Builder.buildConstant(LLT::scalar(32), CstVal.sext(32)).getReg(0);
+    } else {
+      ConstReg = Builder.buildAnyExt(LLT::scalar(32), MI.getOperand(1).getReg())
+                     .getReg(0);
+    }
+    MRI.setRegBank(ConstReg, getRegBank(AArch64::GPRRegBankID));
+    MI.getOperand(1).setReg(ConstReg);
+    return applyDefaultMapping(OpdMapper);
+  }
   default:
     llvm_unreachable("Don't know how to handle that operation");
   }
@@ -792,8 +768,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
              (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
               onlyDefinesFP(*ScalarDef, MRI, TRI)))
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
-    else
+    else {
+      if (ScalarTy.getSizeInBits() < 32 &&
+          getRegBank(ScalarReg, MRI, TRI) == &AArch64::GPRRegBank) {
+        // Calls applyMappingImpl()
+        MappingID = CustomMappingID;
+      }
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+    }
     break;
   }
   case TargetOpcode::G_TRUNC: {
@@ -1014,8 +996,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       // If the type is i8/i16, and the regank will be GPR, then we change the
       // type to i32 in applyMappingImpl.
       LLT Ty = MRI.getType(MI.getOperand(2).getReg());
-      if (Ty.getSizeInBits() == 8 || Ty.getSizeInBits() == 16)
-        MappingID = 1;
+      if (Ty.getSizeInBits() == 8 || Ty.getSizeInBits() == 16) {
+        // Calls applyMappingImpl()
+        MappingID = CustomMappingID;
+      }
       OpRegBankIdx[2] = PMI_FirstGPR;
     }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 0d89f540650a9..941499b08d05d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -150,7 +150,7 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
                     TypeSize Size) const override;
 
   const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
+                                             LLT Ty) const override;
 
   InstructionMappings
   getInstrAlternativeMappings(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index be470c71ae8b6..be34a649e1c4b 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -599,7 +599,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
   }
 
   /// Generate the compact unwind encoding from the CFI directives.
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty())
@@ -609,10 +609,10 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
       return CU::UNWIND_ARM64_MODE_DWARF;
 
     bool HasFP = false;
-    unsigned StackSize = 0;
+    uint64_t StackSize = 0;
 
-    uint32_t CompactUnwindEncoding = 0;
-    int CurOffset = 0;
+    uint64_t CompactUnwindEncoding = 0;
+    int64_t CurOffset = 0;
     for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
       const MCCFIInstruction &Inst = Instrs[i];
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 7dba22c066dcd..ed670bce594ec 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -181,13 +181,13 @@ class AArch64ELFStreamer : public MCELFStreamer {
                      std::unique_ptr<MCCodeEmitter> Emitter)
       : MCELFStreamer(Context, std::move(TAB), std::move(OW),
                       std::move(Emitter)),
-        MappingSymbolCounter(0), LastEMS(EMS_None) {}
+        LastEMS(EMS_None) {}
 
   void changeSection(MCSection *Section, uint32_t Subsection = 0) override {
     // We have to keep track of the mapping symbol state of any sections we
     // use. Each one should start off as EMS_None, which is provided as the
     // default constructor by DenseMap::lookup.
-    LastMappingSymbols[getPreviousSection().first] = LastEMS;
+    LastMappingSymbols[getCurrentSection().first] = LastEMS;
     LastEMS = LastMappingSymbols.lookup(Section);
 
     MCELFStreamer::changeSection(Section, Subsection);
@@ -195,7 +195,6 @@ class AArch64ELFStreamer : public MCELFStreamer {
 
   // Reset state between object emissions
   void reset() override {
-    MappingSymbolCounter = 0;
     MCELFStreamer::reset();
     LastMappingSymbols.clear();
     LastEMS = EMS_None;
@@ -271,16 +270,12 @@ class AArch64ELFStreamer : public MCELFStreamer {
   }
 
   void emitMappingSymbol(StringRef Name) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
-        Name + "." + Twine(MappingSymbolCounter++)));
+    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
     emitLabel(Symbol);
     Symbol->setType(ELF::STT_NOTYPE);
     Symbol->setBinding(ELF::STB_LOCAL);
-    Symbol->setExternal(false);
   }
 
-  int64_t MappingSymbolCounter;
-
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS;
 };
@@ -332,8 +327,7 @@ void AArch64TargetELFStreamer::finish() {
 
 MCTargetStreamer *
 llvm::createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
-                                     MCInstPrinter *InstPrint,
-                                     bool isVerboseAsm) {
+                                     MCInstPrinter *InstPrint) {
   return new AArch64TargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index f05e5e6df7f8e..97c5f96388abe 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -386,24 +386,21 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
                                   std::move(Emitter));
 }
 
-static MCStreamer *createMachOStreamer(MCContext &Ctx,
-                                       std::unique_ptr<MCAsmBackend> &&TAB,
-                                       std::unique_ptr<MCObjectWriter> &&OW,
-                                       std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                       bool DWARFMustBeAtTheEnd) {
+static MCStreamer *
+createMachOStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
+                    std::unique_ptr<MCObjectWriter> &&OW,
+                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
-                             std::move(Emitter), DWARFMustBeAtTheEnd,
+                             std::move(Emitter), /*ignore=*/false,
                              /*LabelSections*/ true);
 }
 
 static MCStreamer *
 createWinCOFFStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                      bool IncrementalLinkerCompatible) {
+                      std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                                      std::move(Emitter),
-                                      IncrementalLinkerCompatible);
+                                      std::move(Emitter));
 }
 
 namespace {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 7b4f102840aa5..91bdc880998b2 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -58,8 +58,7 @@ createAArch64WinCOFFObjectWriter(const Triple &TheTriple);
 
 MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm);
+                                                 MCInstPrinter *InstPrint);
 
 namespace AArch64_MC {
 void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index c25cc2e99adca..208d43502cb88 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -291,12 +291,11 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveAnyRegQPX(unsigned Reg,
   emitARM64WinUnwindCode(Win64EH::UOP_SaveAnyRegQPX, Reg, Offset);
 }
 
-MCWinCOFFStreamer *llvm::createAArch64WinCOFFStreamer(
-    MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
-    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool IncrementalLinkerCompatible) {
-  auto *S = new AArch64WinCOFFStreamer(Context, std::move(MAB),
-                                       std::move(Emitter), std::move(OW));
-  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
-  return S;
+MCWinCOFFStreamer *
+llvm::createAArch64WinCOFFStreamer(MCContext &Context,
+                                   std::unique_ptr<MCAsmBackend> MAB,
+                                   std::unique_ptr<MCObjectWriter> OW,
+                                   std::unique_ptr<MCCodeEmitter> Emitter) {
+  return new AArch64WinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
+                                    std::move(OW));
 }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index a13b1a451be5f..5caf520a3aa37 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -20,8 +20,7 @@ namespace llvm {
 
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
-    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool IncrementalLinkerCompatible);
+    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter);
 } // end llvm namespace
 
 #endif
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fc7d3cdda4acd..482d3a0664d72 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -446,7 +446,6 @@ multiclass SVE_1_Op_PassthruUndef_Round_Pat<ValueType vtd, SDPatternOperator op,
 }
 
 def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
-def SVEDupNeg0 : ComplexPattern<vAny, 0, "SelectDupNegativeZero", []>;
 
 class SVE_1_Op_PassthruZero_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                    ValueType vt2, Instruction inst>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index dfc8eaea66f7b..7906e0ee9d785 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -947,6 +947,12 @@ def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
   "Has restricted SOffset (immediate not supported)."
 >;
 
+def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
+  "HasRequiredExportPriority",
+  "true",
+  "Export priority must be explicitly manipulated on GFX11.5"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -1567,7 +1573,8 @@ def FeatureISAVersion11_Generic: FeatureSet<
      FeatureUserSGPRInit16Bug,
      FeatureMADIntraFwdBug,
      FeaturePrivEnabledTrap2NopBug,
-     FeatureRequiresCOV6])>;
+     FeatureRequiresCOV6,
+     FeatureRequiredExportPriority])>;
 
 def FeatureISAVersion11_0_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
@@ -1597,20 +1604,23 @@ def FeatureISAVersion11_5_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
-     FeatureVGPRSingleUseHintInsts])>;
+     FeatureVGPRSingleUseHintInsts,
+     FeatureRequiredExportPriority])>;
 
 def FeatureISAVersion11_5_1 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
      FeatureVGPRSingleUseHintInsts,
-     Feature1_5xVGPRs])>;
+     Feature1_5xVGPRs,
+     FeatureRequiredExportPriority])>;
 
 def FeatureISAVersion11_5_2 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
-     FeatureVGPRSingleUseHintInsts])>;
+     FeatureVGPRSingleUseHintInsts,
+     FeatureRequiredExportPriority])>;
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
@@ -2016,6 +2026,8 @@ def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
 
 def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
 
+def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">;
+
 def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
   AssemblerPredicate<(all_of Feature16BitInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
new file mode 100644
index 0000000000000..593fca5bc3ed6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
@@ -0,0 +1,332 @@
+//===AMDGPUAsanInstrumentation.cpp - ASAN related helper functions===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------===//
+
+#include "AMDGPUAsanInstrumentation.h"
+
+#define DEBUG_TYPE "amdgpu-asan-instrumentation"
+
+using namespace llvm;
+
+namespace llvm {
+namespace AMDGPU {
+
+static uint64_t getRedzoneSizeForScale(int AsanScale) {
+  // Redzone used for stack and globals is at least 32 bytes.
+  // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+  return std::max(32U, 1U << AsanScale);
+}
+
+static uint64_t getMinRedzoneSizeForGlobal(int AsanScale) {
+  return getRedzoneSizeForScale(AsanScale);
+}
+
+uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes) {
+  constexpr uint64_t kMaxRZ = 1 << 18;
+  const uint64_t MinRZ = getMinRedzoneSizeForGlobal(AsanScale);
+
+  uint64_t RZ = 0;
+  if (SizeInBytes <= MinRZ / 2) {
+    // Reduce redzone size for small size objects, e.g. int, char[1]. MinRZ is
+    // at least 32 bytes, optimize when SizeInBytes is less than or equal to
+    // half of MinRZ.
+    RZ = MinRZ - SizeInBytes;
+  } else {
+    // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+    RZ = std::clamp((SizeInBytes / MinRZ / 4) * MinRZ, MinRZ, kMaxRZ);
+
+    // Round up to multiple of MinRZ.
+    if (SizeInBytes % MinRZ)
+      RZ += MinRZ - (SizeInBytes % MinRZ);
+  }
+
+  assert((RZ + SizeInBytes) % MinRZ == 0);
+
+  return RZ;
+}
+
+static size_t TypeStoreSizeToSizeIndex(uint32_t TypeSize) {
+  size_t Res = llvm::countr_zero(TypeSize / 8);
+  return Res;
+}
+
+static Instruction *genAMDGPUReportBlock(Module &M, IRBuilder<> &IRB,
+                                         Value *Cond, bool Recover) {
+  Value *ReportCond = Cond;
+  if (!Recover) {
+    auto *Ballot =
+        IRB.CreateIntrinsic(Intrinsic::amdgcn_ballot, IRB.getInt64Ty(), {Cond});
+    ReportCond = IRB.CreateIsNotNull(Ballot);
+  }
+
+  auto *Trm = SplitBlockAndInsertIfThen(
+      ReportCond, &*IRB.GetInsertPoint(), false,
+      MDBuilder(M.getContext()).createUnlikelyBranchWeights());
+  Trm->getParent()->setName("asan.report");
+
+  if (Recover)
+    return Trm;
+
+  Trm = SplitBlockAndInsertIfThen(Cond, Trm, false);
+  IRB.SetInsertPoint(Trm);
+  return IRB.CreateIntrinsic(Intrinsic::amdgcn_unreachable, {}, {});
+}
+
+static Value *createSlowPathCmp(Module &M, IRBuilder<> &IRB, Type *IntptrTy,
+                                Value *AddrLong, Value *ShadowValue,
+                                uint32_t TypeStoreSize, int AsanScale) {
+  uint64_t Granularity = static_cast<uint64_t>(1) << AsanScale;
+  // Addr & (Granularity - 1)
+  Value *LastAccessedByte =
+      IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+  // (Addr & (Granularity - 1)) + size - 1
+  if (TypeStoreSize / 8 > 1)
+    LastAccessedByte = IRB.CreateAdd(
+        LastAccessedByte, ConstantInt::get(IntptrTy, TypeStoreSize / 8 - 1));
+  // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+  LastAccessedByte =
+      IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+  // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+  return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+static Instruction *generateCrashCode(Module &M, IRBuilder<> &IRB,
+                                      Type *IntptrTy, Instruction *InsertBefore,
+                                      Value *Addr, bool IsWrite,
+                                      size_t AccessSizeIndex,
+                                      Value *SizeArgument, bool Recover) {
+  IRB.SetInsertPoint(InsertBefore);
+  CallInst *Call = nullptr;
+  SmallString<128> kAsanReportErrorTemplate{"__asan_report_"};
+  SmallString<64> TypeStr{IsWrite ? "store" : "load"};
+  SmallString<64> EndingStr{Recover ? "_noabort" : ""};
+
+  SmallString<128> AsanErrorCallbackSizedString;
+  raw_svector_ostream AsanErrorCallbackSizedOS(AsanErrorCallbackSizedString);
+  AsanErrorCallbackSizedOS << kAsanReportErrorTemplate << TypeStr << "_n"
+                           << EndingStr;
+
+  SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+  AttributeList AL2;
+  FunctionCallee AsanErrorCallbackSized = M.getOrInsertFunction(
+      AsanErrorCallbackSizedOS.str(),
+      FunctionType::get(IRB.getVoidTy(), Args2, false), AL2);
+  SmallVector<Type *, 2> Args1{1, IntptrTy};
+  AttributeList AL1;
+
+  SmallString<128> AsanErrorCallbackString;
+  raw_svector_ostream AsanErrorCallbackOS(AsanErrorCallbackString);
+  AsanErrorCallbackOS << kAsanReportErrorTemplate << TypeStr
+                      << (1ULL << AccessSizeIndex) << EndingStr;
+
+  FunctionCallee AsanErrorCallback = M.getOrInsertFunction(
+      AsanErrorCallbackOS.str(),
+      FunctionType::get(IRB.getVoidTy(), Args1, false), AL1);
+  if (SizeArgument) {
+    Call = IRB.CreateCall(AsanErrorCallbackSized, {Addr, SizeArgument});
+  } else {
+    Call = IRB.CreateCall(AsanErrorCallback, Addr);
+  }
+
+  Call->setCannotMerge();
+  return Call;
+}
+
+static Value *memToShadow(Module &M, IRBuilder<> &IRB, Type *IntptrTy,
+                          Value *Shadow, int AsanScale, uint32_t AsanOffset) {
+  // Shadow >> scale
+  Shadow = IRB.CreateLShr(Shadow, AsanScale);
+  if (AsanOffset == 0)
+    return Shadow;
+  // (Shadow >> scale) | offset
+  Value *ShadowBase = ConstantInt::get(IntptrTy, AsanOffset);
+  return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
+void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
+                       Instruction *InsertBefore, Value *Addr,
+                       MaybeAlign Alignment, uint32_t TypeStoreSize,
+                       bool IsWrite, Value *SizeArgument, bool UseCalls,
+                       bool Recover, int AsanScale, int AsanOffset) {
+  Type *AddrTy = Addr->getType();
+  Type *IntptrTy = M.getDataLayout().getIntPtrType(
+      M.getContext(), AddrTy->getPointerAddressSpace());
+  IRB.SetInsertPoint(InsertBefore);
+  size_t AccessSizeIndex = TypeStoreSizeToSizeIndex(TypeStoreSize);
+  Type *ShadowTy = IntegerType::get(M.getContext(),
+                                    std::max(8U, TypeStoreSize >> AsanScale));
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy);
+  Value *ShadowPtr =
+      memToShadow(M, IRB, IntptrTy, AddrLong, AsanScale, AsanOffset);
+  const uint64_t ShadowAlign =
+      std::max<uint64_t>(Alignment.valueOrOne().value() >> AsanScale, 1);
+  Value *ShadowValue = IRB.CreateAlignedLoad(
+      ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy), Align(ShadowAlign));
+  Value *Cmp = IRB.CreateIsNotNull(ShadowValue);
+  auto *Cmp2 = createSlowPathCmp(M, IRB, IntptrTy, AddrLong, ShadowValue,
+                                 TypeStoreSize, AsanScale);
+  Cmp = IRB.CreateAnd(Cmp, Cmp2);
+  Instruction *CrashTerm = genAMDGPUReportBlock(M, IRB, Cmp, Recover);
+  Instruction *Crash =
+      generateCrashCode(M, IRB, IntptrTy, CrashTerm, AddrLong, IsWrite,
+                        AccessSizeIndex, SizeArgument, Recover);
+  Crash->setDebugLoc(OrigIns->getDebugLoc());
+  return;
+}
+
+void getInterestingMemoryOperands(
+    Module &M, Instruction *I,
+    SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
+  const DataLayout &DL = M.getDataLayout();
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+                             LI->getType(), LI->getAlign());
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+                             SI->getValueOperand()->getType(), SI->getAlign());
+  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+    Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+                             RMW->getValOperand()->getType(), std::nullopt);
+  } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+    Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+                             XCHG->getCompareOperand()->getType(),
+                             std::nullopt);
+  } else if (auto CI = dyn_cast<CallInst>(I)) {
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::masked_load:
+    case Intrinsic::masked_store:
+    case Intrinsic::masked_gather:
+    case Intrinsic::masked_scatter: {
+      bool IsWrite = CI->getType()->isVoidTy();
+      // Masked store has an initial operand for the value.
+      unsigned OpOffset = IsWrite ? 1 : 0;
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = Align(1);
+      // Otherwise no alignment guarantees. We probably got Undef.
+      if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+        Alignment = Op->getMaybeAlignValue();
+      Value *Mask = CI->getOperand(2 + OpOffset);
+      Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
+      break;
+    }
+    case Intrinsic::masked_expandload:
+    case Intrinsic::masked_compressstore: {
+      bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_compressstore;
+      unsigned OpOffset = IsWrite ? 1 : 0;
+      auto BasePtr = CI->getOperand(OpOffset);
+      MaybeAlign Alignment = BasePtr->getPointerAlignment(DL);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      IRBuilder<> IB(I);
+      Value *Mask = CI->getOperand(1 + OpOffset);
+      Type *IntptrTy = M.getDataLayout().getIntPtrType(
+          M.getContext(), BasePtr->getType()->getPointerAddressSpace());
+      // Use the popcount of Mask as the effective vector length.
+      Type *ExtTy = VectorType::get(IntptrTy, cast<VectorType>(Ty));
+      Value *ExtMask = IB.CreateZExt(Mask, ExtTy);
+      Value *EVL = IB.CreateAddReduce(ExtMask);
+      Value *TrueMask = ConstantInt::get(Mask->getType(), 1);
+      Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, TrueMask,
+                               EVL);
+      break;
+    }
+    case Intrinsic::vp_load:
+    case Intrinsic::vp_store:
+    case Intrinsic::experimental_vp_strided_load:
+    case Intrinsic::experimental_vp_strided_store: {
+      auto *VPI = cast<VPIntrinsic>(CI);
+      unsigned IID = CI->getIntrinsicID();
+      bool IsWrite = CI->getType()->isVoidTy();
+      unsigned PtrOpNo = *VPI->getMemoryPointerParamPos(IID);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = VPI->getOperand(PtrOpNo)->getPointerAlignment(DL);
+      Value *Stride = nullptr;
+      if (IID == Intrinsic::experimental_vp_strided_store ||
+          IID == Intrinsic::experimental_vp_strided_load) {
+        Stride = VPI->getOperand(PtrOpNo + 1);
+        // Use the pointer alignment as the element alignment if the stride is a
+        // mutiple of the pointer alignment. Otherwise, the element alignment
+        // should be Align(1).
+        unsigned PointerAlign = Alignment.valueOrOne().value();
+        if (!isa<ConstantInt>(Stride) ||
+            cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
+          Alignment = Align(1);
+      }
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment,
+                               VPI->getMaskParam(), VPI->getVectorLengthParam(),
+                               Stride);
+      break;
+    }
+    case Intrinsic::vp_gather:
+    case Intrinsic::vp_scatter: {
+      auto *VPI = cast<VPIntrinsic>(CI);
+      unsigned IID = CI->getIntrinsicID();
+      bool IsWrite = IID == Intrinsic::vp_scatter;
+      unsigned PtrOpNo = *VPI->getMemoryPointerParamPos(IID);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = VPI->getPointerAlignment();
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment,
+                               VPI->getMaskParam(),
+                               VPI->getVectorLengthParam());
+      break;
+    }
+    case Intrinsic::amdgcn_raw_buffer_load:
+    case Intrinsic::amdgcn_raw_ptr_buffer_load:
+    case Intrinsic::amdgcn_raw_buffer_load_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+    case Intrinsic::amdgcn_raw_tbuffer_load:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
+    case Intrinsic::amdgcn_struct_buffer_load:
+    case Intrinsic::amdgcn_struct_ptr_buffer_load:
+    case Intrinsic::amdgcn_struct_buffer_load_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
+    case Intrinsic::amdgcn_struct_tbuffer_load:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
+    case Intrinsic::amdgcn_s_buffer_load:
+    case Intrinsic::amdgcn_global_load_tr_b64:
+    case Intrinsic::amdgcn_global_load_tr_b128: {
+      unsigned PtrOpNo = 0;
+      bool IsWrite = false;
+      Type *Ty = CI->getType();
+      Value *Ptr = CI->getArgOperand(PtrOpNo);
+      MaybeAlign Alignment = Ptr->getPointerAlignment(DL);
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment);
+      break;
+    }
+    case Intrinsic::amdgcn_raw_tbuffer_store:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_raw_buffer_store:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store:
+    case Intrinsic::amdgcn_raw_buffer_store_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_buffer_store:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store:
+    case Intrinsic::amdgcn_struct_buffer_store_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_tbuffer_store:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
+      unsigned PtrOpNo = 1;
+      bool IsWrite = true;
+      Value *Ptr = CI->getArgOperand(PtrOpNo);
+      Type *Ty = Ptr->getType();
+      MaybeAlign Alignment = Ptr->getPointerAlignment(DL);
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment);
+      break;
+    }
+    default:
+      for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
+        if (Type *Ty = CI->getParamByRefType(ArgNo)) {
+          Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+        } else if (Type *Ty = CI->getParamByValType(ArgNo)) {
+          Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+        }
+      }
+    }
+  }
+}
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h
new file mode 100644
index 0000000000000..64d78c4aeb692
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h
@@ -0,0 +1,60 @@
+//===AMDGPUAsanInstrumentation.h - ASAN helper functions -*- C++- *===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+
+/// Given SizeInBytes of the Value to be instrunmented,
+/// Returns the redzone size corresponding to it.
+uint64_t getRedzoneSizeForGlobal(int Scale, uint64_t SizeInBytes);
+
+/// Instrument the memory operand Addr.
+/// Generates report blocks that catch the addressing errors.
+void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
+                       Instruction *InsertBefore, Value *Addr,
+                       MaybeAlign Alignment, uint32_t TypeStoreSize,
+                       bool IsWrite, Value *SizeArgument, bool UseCalls,
+                       bool Recover, int Scale, int Offset);
+
+/// Get all the memory operands from the instruction
+/// that needs to be instrumented
+void getInterestingMemoryOperands(
+    Module &M, Instruction *I,
+    SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 3154dc6fe433d..e64e28e01d3d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1487,6 +1487,10 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
   if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
     return;
 
+  // Currently non-kernel functions have no resources to emit.
+  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
+    return;
+
   auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                      StringRef RemarkLabel, auto Argument) {
     // Add an indent for every line besides the line with the kernel name. This
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f70a60aef0073..f66bbde42ce27 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -109,8 +109,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 
   /// tblgen'erated driver function for lowering simple MI->MC pseudo
   /// instructions.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
   /// Implemented in AMDGPUMCInstLower.cpp
   void emitInstruction(const MachineInstr *MI) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 537d3a43aa9fa..825c1de6c91f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -398,8 +398,10 @@ def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">,
 def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
   GISDNodeXFormEquiv<NegateImm>;
 
-def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
+def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastFPImm32">,
   GISDNodeXFormEquiv<bitcast_fpimm_to_i32>;
+def gi_bitcast_fpimm_to_i64 : GICustomOperandRenderer<"renderBitcastFPImm64">,
+  GISDNodeXFormEquiv<bitcast_fpimm_to_i64>;
 
 def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
   GISDNodeXFormEquiv<IMMPopCount>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b7471bab12850..e36ec9595e85a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1270,8 +1270,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
     // (add n0, c0)
     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
       Base = N0;
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
       return true;
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
@@ -1306,8 +1306,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
 
           Base = SDValue(MachineSub, 0);
-          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+          Offset0 =
+              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+          Offset1 =
+              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
           return true;
         }
       }
@@ -1321,8 +1323,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
       MachineSDNode *MovZero =
           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
       Base = SDValue(MovZero, 0);
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
       return true;
     }
   }
@@ -1330,8 +1332,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
   // default case
 
   Base = Addr;
-  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
-  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
+  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39ae7c96cf772..862b5c7e3e3d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -397,6 +397,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
+  setOperationAction({ISD::LROUND, ISD::LLROUND},
+                     {MVT::f16, MVT::f32, MVT::f64}, Expand);
 
   setOperationAction(
       {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
@@ -406,6 +408,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
+  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
+                     Expand);
+
   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
   if (Subtarget->has16BitInsts())
@@ -439,19 +444,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                      Custom);
 
-  // FIXME: Why is v8f16/v8bf16 missing?
   setOperationAction(
       ISD::EXTRACT_SUBVECTOR,
-      {MVT::v2f16,  MVT::v2bf16, MVT::v2i16,  MVT::v4f16,  MVT::v4bf16,
-       MVT::v4i16,  MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,
-       MVT::v4f32,  MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32,
-       MVT::v6i32,  MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32,
-       MVT::v9f32,  MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32,
-       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
-       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
+      {MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,
+       MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,
+       MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,
+       MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
+       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
        MVT::v2f64,  MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64,
-       MVT::v4i64,  MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64,
-       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
+       MVT::v4i64,  MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64},
       Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -1240,10 +1241,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
 
       // Round up vec3/vec5 argument.
       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
-        assert(MemVT.getVectorNumElements() == 3 ||
-               MemVT.getVectorNumElements() == 5 ||
-               (MemVT.getVectorNumElements() >= 9 &&
-                MemVT.getVectorNumElements() <= 12));
         MemVT = MemVT.getPow2VectorType(State.getContext());
       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
         MemVT = MemVT.getRoundIntegerType(State.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 93bca4402ed23..9197404309663 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1102,6 +1102,78 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     break;
   }
+  case Intrinsic::amdgcn_trig_preop: {
+    // The intrinsic is declared with name mangling, but currently the
+    // instruction only exists for f64
+    if (!II.getType()->isDoubleTy())
+      break;
+
+    Value *Src = II.getArgOperand(0);
+    Value *Segment = II.getArgOperand(1);
+    if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
+      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+
+    if (isa<UndefValue>(Src)) {
+      auto *QNaN = ConstantFP::get(
+          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
+      return IC.replaceInstUsesWith(II, QNaN);
+    }
+
+    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
+    if (!Csrc)
+      break;
+
+    if (II.isStrictFP())
+      break;
+
+    const APFloat &Fsrc = Csrc->getValueAPF();
+    if (Fsrc.isNaN()) {
+      auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
+      return IC.replaceInstUsesWith(II, Quieted);
+    }
+
+    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
+    if (!Cseg)
+      break;
+
+    unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
+    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
+    unsigned Shift = SegmentVal * 53;
+    if (Exponent > 1077)
+      Shift += Exponent - 1077;
+
+    // 2.0/PI table.
+    static const uint32_t TwoByPi[] = {
+        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
+        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
+        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
+        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+        0x56033046};
+
+    // Return 0 for outbound segment (hardware behavior).
+    unsigned Idx = Shift >> 5;
+    if (Idx + 2 >= std::size(TwoByPi)) {
+      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
+      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
+    }
+
+    unsigned BShift = Shift & 0x1f;
+    uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
+    uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
+    if (BShift)
+      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
+    Thi = Thi >> 11;
+    APFloat Result = APFloat((double)Thi);
+
+    int Scale = -53 - Shift;
+    if (Exponent >= 1968)
+      Scale += 128;
+
+    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
+    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
+  }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index da3e8c0a62b08..73f3921b2ff4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1870,6 +1870,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
       VDataIn = MI.getOperand(1).getReg();
       VDataTy = MRI->getType(VDataIn);
       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
+    } else if (BaseOpcode->NoReturn) {
+      NumVDataDwords = 0;
     } else {
       VDataOut = MI.getOperand(0).getReg();
       VDataTy = MRI->getType(VDataOut);
@@ -2167,27 +2169,6 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
   return Ret;
 }
 
-static int sizeToSubRegIndex(unsigned Size) {
-  switch (Size) {
-  case 32:
-    return AMDGPU::sub0;
-  case 64:
-    return AMDGPU::sub0_sub1;
-  case 96:
-    return AMDGPU::sub0_sub1_sub2;
-  case 128:
-    return AMDGPU::sub0_sub1_sub2_sub3;
-  case 256:
-    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
-  default:
-    if (Size < 32)
-      return AMDGPU::sub0;
-    if (Size > 256)
-      return -1;
-    return sizeToSubRegIndex(llvm::bit_ceil(Size));
-  }
-}
-
 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   Register SrcReg = I.getOperand(1).getReg();
@@ -2291,8 +2272,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
     return false;
 
   if (SrcSize > 32) {
-    int SubRegIdx = sizeToSubRegIndex(DstSize);
-    if (SubRegIdx == -1)
+    unsigned SubRegIdx =
+        DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
+    if (SubRegIdx == AMDGPU::NoSubRegister)
       return false;
 
     // Deal with weird cases where the class only partially supports the subreg
@@ -2521,89 +2503,6 @@ bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
   return false;
 }
 
-bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
-  MachineBasicBlock *BB = I.getParent();
-  MachineOperand &ImmOp = I.getOperand(1);
-  Register DstReg = I.getOperand(0).getReg();
-  unsigned Size = MRI->getType(DstReg).getSizeInBits();
-  bool IsFP = false;
-
-  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
-  if (ImmOp.isFPImm()) {
-    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
-    ImmOp.ChangeToImmediate(Imm.getZExtValue());
-    IsFP = true;
-  } else if (ImmOp.isCImm()) {
-    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
-  } else {
-    llvm_unreachable("Not supported by g_constants");
-  }
-
-  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
-  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
-
-  unsigned Opcode;
-  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
-    Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-  } else if (Size == 64 &&
-             AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
-    Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
-    I.setDesc(TII.get(Opcode));
-    I.addImplicitDefUseOperands(*MF);
-    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-  } else {
-    Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
-
-    // We should never produce s1 values on banks other than VCC. If the user of
-    // this already constrained the register, we may incorrectly think it's VCC
-    // if it wasn't originally.
-    if (Size == 1)
-      return false;
-  }
-
-  if (Size != 64) {
-    I.setDesc(TII.get(Opcode));
-    I.addImplicitDefUseOperands(*MF);
-    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-  }
-
-  const DebugLoc &DL = I.getDebugLoc();
-
-  APInt Imm(Size, I.getOperand(1).getImm());
-
-  MachineInstr *ResInst;
-  if (IsSgpr && TII.isInlineConstant(Imm)) {
-    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
-      .addImm(I.getOperand(1).getImm());
-  } else {
-    const TargetRegisterClass *RC = IsSgpr ?
-      &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
-    Register LoReg = MRI->createVirtualRegister(RC);
-    Register HiReg = MRI->createVirtualRegister(RC);
-
-    BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
-      .addImm(Imm.trunc(32).getZExtValue());
-
-    BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
-      .addImm(Imm.ashr(32).getZExtValue());
-
-    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
-      .addReg(LoReg)
-      .addImm(AMDGPU::sub0)
-      .addReg(HiReg)
-      .addImm(AMDGPU::sub1);
-  }
-
-  // We can't call constrainSelectedInstRegOperands here, because it doesn't
-  // work for target independent opcodes
-  I.eraseFromParent();
-  const TargetRegisterClass *DstRC =
-    TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
-  if (!DstRC)
-    return true;
-  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
-}
-
 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
   // Only manually handle the f64 SGPR case.
   //
@@ -3530,9 +3429,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_PTRTOINT:
   case TargetOpcode::G_FREEZE:
     return selectCOPY(I);
-  case TargetOpcode::G_CONSTANT:
-  case TargetOpcode::G_FCONSTANT:
-    return selectG_CONSTANT(I);
   case TargetOpcode::G_FNEG:
     if (selectImpl(I, *CoverageInfo))
       return true;
@@ -3616,6 +3512,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
     return selectG_INSERT_VECTOR_ELT(I);
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
     const AMDGPU::ImageDimIntrinsicInfo *Intr =
@@ -3637,6 +3534,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
     return selectStackRestore(I);
   case AMDGPU::G_PHI:
     return selectPHI(I);
+  case TargetOpcode::G_CONSTANT:
+  case TargetOpcode::G_FCONSTANT:
   default:
     return selectImpl(I, *CoverageInfo);
   }
@@ -5623,18 +5522,12 @@ void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
 }
 
-void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
-                                                 const MachineInstr &MI,
-                                                 int OpIdx) const {
-  assert(OpIdx == -1);
-
+void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
+                                                   const MachineInstr &MI,
+                                                   int OpIdx) const {
   const MachineOperand &Op = MI.getOperand(1);
-  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
-    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
-  else {
-    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
-    MIB.addImm(Op.getCImm()->getSExtValue());
-  }
+  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
+  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
 }
 
 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 43ed210508d33..7fff7d2685e7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -91,7 +91,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectG_TRUNC(MachineInstr &I) const;
   bool selectG_SZA_EXT(MachineInstr &I) const;
   bool selectG_FPEXT(MachineInstr &I) const;
-  bool selectG_CONSTANT(MachineInstr &I) const;
   bool selectG_FNEG(MachineInstr &I) const;
   bool selectG_FABS(MachineInstr &I) const;
   bool selectG_AND_OR_XOR(MachineInstr &I) const;
@@ -333,8 +332,17 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
 
-  void renderBitcastImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
-                        int OpIdx) const;
+  void renderBitcastFPImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                          int OpIdx) const;
+
+  void renderBitcastFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                            int OpIdx) const {
+    renderBitcastFPImm(MIB, MI, OpIdx);
+  }
+  void renderBitcastFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                            int OpIdx) const {
+    renderBitcastFPImm(MIB, MI, OpIdx);
+  }
 
   void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 2cc95f81d2f94..7bf5170794cd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -360,34 +360,65 @@ bool LiveRegOptimizer::optimizeLiveType(
         Type *NewType = calculateConvertType(Phi->getType());
         NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                             Phi->getIncomingBlock(I));
-      } else if (ValMap.contains(IncVal))
+      } else if (ValMap.contains(IncVal) && ValMap[IncVal])
         NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
       else
         MissingIncVal = true;
     }
-    Instruction *DeadInst = Phi;
     if (MissingIncVal) {
-      DeadInst = cast<Instruction>(ValMap[Phi]);
-      // Do not use the dead phi
-      ValMap[Phi] = Phi;
+      Value *DeadVal = ValMap[Phi];
+      // The coercion chain of the PHI is broken. Delete the Phi
+      // from the ValMap and any connected / user Phis.
+      SmallVector<Value *, 4> PHIWorklist;
+      SmallPtrSet<Value *, 4> VisitedPhis;
+      PHIWorklist.push_back(DeadVal);
+      while (!PHIWorklist.empty()) {
+        Value *NextDeadValue = PHIWorklist.pop_back_val();
+        VisitedPhis.insert(NextDeadValue);
+        auto OriginalPhi =
+            std::find_if(PhiNodes.begin(), PhiNodes.end(),
+                         [this, &NextDeadValue](PHINode *CandPhi) {
+                           return ValMap[CandPhi] == NextDeadValue;
+                         });
+        // This PHI may have already been removed from maps when
+        // unwinding a previous Phi
+        if (OriginalPhi != PhiNodes.end())
+          ValMap.erase(*OriginalPhi);
+
+        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));
+
+        for (User *U : NextDeadValue->users()) {
+          if (!VisitedPhis.contains(cast<PHINode>(U)))
+            PHIWorklist.push_back(U);
+        }
+      }
+    } else {
+      DeadInsts.emplace_back(cast<Instruction>(Phi));
     }
-    DeadInsts.emplace_back(DeadInst);
   }
   // Coerce back to the original type and replace the uses.
   for (Instruction *U : Uses) {
     // Replace all converted operands for a use.
     for (auto [OpIdx, Op] : enumerate(U->operands())) {
-      if (ValMap.contains(Op)) {
+      if (ValMap.contains(Op) && ValMap[Op]) {
         Value *NewVal = nullptr;
         if (BBUseValMap.contains(U->getParent()) &&
             BBUseValMap[U->getParent()].contains(ValMap[Op]))
           NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
         else {
           BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
-          NewVal =
-              convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
-                                 InsertPt, U->getParent());
-          BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+          // We may pick up ops that were previously converted for users in
+          // other blocks. If there is an originally typed definition of the Op
+          // already in this block, simply reuse it.
+          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
+              U->getParent() == cast<Instruction>(Op)->getParent()) {
+            NewVal = Op;
+          } else {
+            NewVal =
+                convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
+                                   InsertPt, U->getParent());
+            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+          }
         }
         assert(NewVal);
         U->setOperand(OpIdx, NewVal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 88e40da110555..c6c4b8f930647 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1131,6 +1131,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
        .scalarize(0)
        .lower();
 
+  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
+      .clampScalar(0, S16, S64)
+      .scalarize(0)
+      .lower();
+
   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
       .customFor({S16, S32})
       .scalarize(0)
@@ -1141,6 +1146,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
 
+  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
+      .clampScalar(0, S16, S64)
+      .scalarize(0)
+      .lower();
+
   if (ST.has16BitInsts()) {
     getActionDefinitionsBuilder(
         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
@@ -6334,8 +6344,13 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
   const LLT V2S16 = LLT::fixed_vector(2, 16);
 
   unsigned DMask = 0;
-  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
-  LLT Ty = MRI->getType(VData);
+  Register VData;
+  LLT Ty;
+
+  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
+    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
+    Ty = MRI->getType(VData);
+  }
 
   const bool IsAtomicPacked16Bit =
       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
@@ -6373,7 +6388,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
-  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
+  unsigned NewOpcode = LoadOpcode;
+  if (BaseOpcode->Store)
+    NewOpcode = StoreOpcode;
+  else if (BaseOpcode->NoReturn)
+    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
 
   // Track that we legalized this
   MI.setDesc(B.getTII().get(NewOpcode));
@@ -6503,7 +6522,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     Flags |= 2;
   MI.addOperand(MachineOperand::CreateImm(Flags));
 
-  if (BaseOpcode->Store) { // No TFE for stores?
+  if (BaseOpcode->NoReturn) { // No TFE for stores?
     // TODO: Handle dmask trim
     if (!Ty.isVector() || !IsD16)
       return true;
@@ -7357,8 +7376,12 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeBufferStore(MI, MRI, B, true, true);
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
+  case Intrinsic::amdgcn_struct_atomic_buffer_load:
+  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 6be9be21a8a86..77971323aa1ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1092,8 +1092,9 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
 
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   if (isa<LoadInst>(I))
-    // TODO: Do we need to do something about atomic loads?
-    IID = Intrinsic::amdgcn_raw_ptr_buffer_load;
+    IID = Order == AtomicOrdering::NotAtomic
+              ? Intrinsic::amdgcn_raw_ptr_buffer_load
+              : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
   else if (isa<StoreInst>(I))
     IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
   else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index e91d05954a1c9..e724c978c44b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -225,10 +225,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
                            : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
 
       for (User *ICmp : BlockCount->users()) {
-        ICmpInst::Predicate Pred;
-        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
-          if (Pred != ICmpInst::ICMP_ULT)
-            continue;
+        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+                                       m_Specific(BlockCount)))) {
           ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
           MadeChange = true;
         }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c24d39b9e5fdd..f5b5e9e427598 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -96,7 +96,6 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
   }
   case MachineOperand::MO_ExternalSymbol: {
     MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
-    Sym->setExternal(true);
     const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
     MCOp = MCOperand::createExpr(Expr);
     return true;
@@ -119,7 +118,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
   unsigned Opcode = MI->getOpcode();
   const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
 
-  // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+  // FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
   // need to select it to the subtarget specific version, and there's no way to
   // do that with a single pseudo source operation.
   if (Opcode == AMDGPU::S_SETPC_B64_return)
@@ -188,8 +187,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
   //                                        getSubtargetInfo().getFeatureBits());
 
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
+  }
 
   const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
   AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 42a6bac4fa6f2..02b0d436451a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -437,7 +437,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
     return false;
 
   auto PrintfFunction = M.getFunction("printf");
-  if (!PrintfFunction || !PrintfFunction->isDeclaration())
+  if (!PrintfFunction || !PrintfFunction->isDeclaration() ||
+      M.getModuleFlag("openmp"))
     return false;
 
   for (auto &U : PrintfFunction->uses()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 73796edb5d3e3..9a6ba5ac68084 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1059,6 +1059,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
   Register DstReg = MI.getOperand(0).getReg();
   const LLT LoadTy = MRI.getType(DstReg);
   unsigned LoadSize = LoadTy.getSizeInBits();
+  MachineMemOperand *MMO = *MI.memoperands_begin();
   const unsigned MaxNonSmrdLoadSize = 128;
 
   const RegisterBank *DstBank =
@@ -1069,7 +1070,6 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
     if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
       return false;
 
-    MachineMemOperand *MMO = *MI.memoperands_begin();
     const unsigned MemSize = 8 * MMO->getSize().getValue();
     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
@@ -1141,25 +1141,29 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
   if (SrcRegs.empty())
     SrcRegs.push_back(MI.getOperand(1).getReg());
 
-  assert(LoadSize % MaxNonSmrdLoadSize == 0);
-
   // RegBankSelect only emits scalar types, so we need to reset the pointer
   // operand to a pointer type.
   Register BasePtrReg = SrcRegs[0];
   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
   MRI.setType(BasePtrReg, PtrTy);
 
-  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
-  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
-  ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
-  LegalizerHelper Helper(B.getMF(), O, B);
-
-  if (LoadTy.isVector()) {
-    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
-      return false;
-  } else {
-    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
-      return false;
+  // The following are the loads not splitted enough during legalization
+  // because it was not clear they are smem-load or vmem-load
+  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
+      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
+    assert(LoadSize % MaxNonSmrdLoadSize == 0);
+    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
+    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
+    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
+    LegalizerHelper Helper(B.getMF(), O, B);
+    if (LoadTy.isVector()) {
+      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
+          LegalizerHelper::Legalized)
+        return false;
+    } else {
+      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+        return false;
+    }
   }
 
   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
@@ -3172,6 +3176,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   }
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
@@ -4055,6 +4060,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_FCONSTANT:
   case AMDGPU::G_CONSTANT:
   case AMDGPU::G_GLOBAL_VALUE:
+  case AMDGPU::G_FRAME_INDEX:
   case AMDGPU::G_BLOCK_ADDR:
   case AMDGPU::G_READSTEADYCOUNTER:
   case AMDGPU::G_READCYCLECOUNTER: {
@@ -4062,13 +4068,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
-  case AMDGPU::G_FRAME_INDEX: {
-    // TODO: This should be the same as other constants, but eliminateFrameIndex
-    // currently assumes VALU uses.
-    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
-    break;
-  }
   case AMDGPU::G_DYN_STACKALLOC: {
     // Result is always uniform, and a wave reduction is needed for the source.
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
@@ -4842,6 +4841,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
     auto IntrID = AMDGPU::getIntrinsicID(MI);
@@ -4983,6 +4983,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     case Intrinsic::amdgcn_raw_buffer_load:
     case Intrinsic::amdgcn_raw_ptr_buffer_load:
+    case Intrinsic::amdgcn_raw_atomic_buffer_load:
+    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
     case Intrinsic::amdgcn_raw_tbuffer_load:
     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
       // FIXME: Should make intrinsic ID the last operand of the instruction,
@@ -5016,7 +5018,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_struct_buffer_load:
     case Intrinsic::amdgcn_struct_ptr_buffer_load:
     case Intrinsic::amdgcn_struct_tbuffer_load:
-    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
+    case Intrinsic::amdgcn_struct_atomic_buffer_load:
+    case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 146649a7e2d54..0aca99a82d197 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -85,14 +85,9 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
                                    ST.getTargetID().isXnackOnOrAny());
 }
 
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
-    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
-  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
-}
-
 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
     const GCNSubtarget &ST) const {
-  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
+  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
 }
 
 bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 16dcc28bcf88b..7f71de6749dce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -45,8 +45,6 @@ struct AMDGPUResourceUsageAnalysis : public ModulePass {
     int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
     // Total number of VGPRs is actually a combination of AGPR and VGPR
     // depending on architecture - and some alignment constraints
-    int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR,
-                             int32_t NumVGPR) const;
     int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
   };
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5e3216af8834f..f160c626452d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -87,16 +87,19 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
 };
 
 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
-                              const TargetRegisterClass &RC) {
-  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
 }
 
 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
-                              const TargetRegisterClass &RC) {
-  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
 }
 
-
 /// -{sgpr|vgpr}-regalloc=... command line option.
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
 
@@ -759,7 +762,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
       });
 
   PB.registerRegClassFilterParsingCallback(
-      [](StringRef FilterName) -> RegClassFilterFunc {
+      [](StringRef FilterName) -> RegAllocFilterFunc {
         if (FilterName == "sgpr")
           return onlyAllocateSGPRs;
         if (FilterName == "vgpr")
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0b1ecc002ae25..8d4ff64ac5adc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -418,19 +418,19 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
 // FIXME: This could use fine tuning and microbenchmarks.
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicElementSize) const {
 
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  unsigned MinAlign = std::min(SrcAlign, DestAlign);
+  Align MinAlign = std::min(SrcAlign, DestAlign);
 
   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
   // hardware into byte accesses. If you assume all alignments are equally
   // probable, it's more efficient on average to use short accesses for this
   // case.
-  if (MinAlign == 2)
+  if (MinAlign == Align(2))
     return Type::getInt16Ty(Context);
 
   // Not all subtargets have 128-bit DS instructions, and we currently don't
@@ -450,7 +450,7 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-    unsigned SrcAlign, unsigned DestAlign,
+    Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicCpySize) const {
   assert(RemainingBytes < 16);
 
@@ -459,9 +459,9 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
         OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
         DestAlign, AtomicCpySize);
 
-  unsigned MinAlign = std::min(SrcAlign, DestAlign);
+  Align MinAlign = std::min(SrcAlign, DestAlign);
 
-  if (MinAlign != 2) {
+  if (MinAlign != Align(2)) {
     Type *I64Ty = Type::getInt64Ty(Context);
     while (RemainingBytes >= 8) {
       OpsOut.push_back(I64Ty);
@@ -686,7 +686,10 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
 // instructions for an intrinsic, even if it requires nontrivial legalization.
 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::fma: // TODO: fmuladd
+  case Intrinsic::fma:
+  case Intrinsic::fmuladd:
+  case Intrinsic::copysign:
+  case Intrinsic::canonicalize:
   // There's a small benefit to using vector ops in the legalized code.
   case Intrinsic::round:
   case Intrinsic::uadd_sat:
@@ -730,18 +733,33 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
   switch (ICA.getID()) {
   case Intrinsic::fma:
-    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
-                                   : getQuarterRateInstrCost(CostKind);
+  case Intrinsic::fmuladd:
+    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
+      InstRate = getFullRateInstrCost();
+    else {
+      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+                                     : getQuarterRateInstrCost(CostKind);
+    }
     break;
+  case Intrinsic::copysign:
+    return NElts * getFullRateInstrCost();
+  case Intrinsic::canonicalize: {
+    InstRate =
+        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
+    break;
+  }
   case Intrinsic::uadd_sat:
   case Intrinsic::usub_sat:
   case Intrinsic::sadd_sat:
-  case Intrinsic::ssub_sat:
+  case Intrinsic::ssub_sat: {
     static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
       NElts = 1;
     break;
   }
+  default:
+    break;
+  }
 
   return LT.first * NElts * InstRate;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca..01df2e6caaba1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -137,15 +137,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                     unsigned AddrSpace) const;
 
   int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
-  Type *getMemcpyLoopLoweringType(
-      LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
-      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
-      std::optional<uint32_t> AtomicElementSize) const;
+  Type *
+  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
+                            Align SrcAlign, Align DestAlign,
+                            std::optional<uint32_t> AtomicElementSize) const;
 
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign,
+      Align SrcAlign, Align DestAlign,
       std::optional<uint32_t> AtomicCpySize) const;
   unsigned getMaxInterleaveFactor(ElementCount VF);
 
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 217487b2cc7e6..26a839a95df96 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1746,6 +1746,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateMIMGDataSize(const MCInst &Inst, const SMLoc &IDLoc);
   bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
   bool validateMIMGD16(const MCInst &Inst);
+  bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
   bool validateMIMGMSAA(const MCInst &Inst);
   bool validateOpSel(const MCInst &Inst);
   bool validateNeg(const MCInst &Inst, int OpName);
@@ -3868,7 +3869,8 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst,
   int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
   int TFEIdx   = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
 
-  assert(VDataIdx != -1);
+  if (VDataIdx == -1 && isGFX10Plus()) // no return image_sample
+    return true;
 
   if ((DMaskIdx == -1 || TFEIdx == -1) && isGFX10_AEncoding()) // intersect_ray
     return true;
@@ -4010,6 +4012,29 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
   return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
 }
 
+bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst,
+                                      const OperandVector &Operands) {
+  if (!isGFX10Plus())
+    return true;
+
+  const unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &Desc = MII.get(Opc);
+
+  if ((Desc.TSFlags & MIMGFlags) == 0)
+    return true;
+
+  // image_bvh_intersect_ray instructions do not have dim
+  if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
+    return true;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+    if (Op.isDim())
+      return true;
+  }
+  return false;
+}
+
 bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) {
   const unsigned Opc = Inst.getOpcode();
   const MCInstrDesc &Desc = MII.get(Opc);
@@ -5098,6 +5123,10 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
       "d16 modifier is not supported on this GPU");
     return false;
   }
+  if (!validateMIMGDim(Inst, Operands)) {
+    Error(IDLoc, "missing dim operand");
+    return false;
+  }
   if (!validateMIMGMSAA(Inst)) {
     Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
           "invalid dim; must be MSAA type");
@@ -8648,10 +8677,8 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
       Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
     } else if (Op.isImmModifier()) {
       OptionalIdx[Op.getImmTy()] = I;
-    } else if (Op.isRegOrImm()) {
-      Op.addRegOrImmOperands(Inst, 1);
     } else {
-      llvm_unreachable("unhandled operand type");
+      Op.addRegOrImmOperands(Inst, 1);
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a904c8483dbf5..c6668b24f4ef6 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -139,6 +139,7 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let IsAtomicNoRet      = ps.IsAtomicNoRet;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   bits<12> offset;
   bits<5>  cpol;
@@ -355,6 +356,7 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let LGKM_CNT             = ps.LGKM_CNT;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   bits<12> offset;
   bits<5>  cpol;
@@ -2435,6 +2437,7 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
   let MTBUF              = ps.MTBUF;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   bits<24> offset;
   bits<8>  vaddr;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 2407ac2b872ad..5294d5a1d64f6 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAnnotateKernelFeatures.cpp
   AMDGPUAnnotateUniformValues.cpp
   AMDGPUArgumentUsageInfo.cpp
+  AMDGPUAsanInstrumentation.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 219246b71fe80..3c76fb2f7961f 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -73,6 +73,7 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
   let IsAtomicNoRet      = ps.IsAtomicNoRet;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   let Constraints = ps.Constraints;
   let DisableEncoding = ps.DisableEncoding;
@@ -852,24 +853,24 @@ def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
 }
 
 class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
-  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1))),
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
 class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
-  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
   (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
               (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
               (i1 0))
 >;
 
 class DS128Bit8ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
-  (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+  (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i32:$offset0, i32:$offset1))),
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
 class DS128Bit8ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
-  (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
   (inst $ptr, (i64 (EXTRACT_SUBREG VReg_128:$value, sub0_sub1)),
               (i64 (EXTRACT_SUBREG VReg_128:$value, sub2_sub3)), $offset0, $offset1,
               (i1 0))
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 16dc019ede810..351563657aeb5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -104,6 +104,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let VALU                 = ps.VALU;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding fields
   bits<8> vaddr;
@@ -169,6 +170,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let VALU                 = ps.VALU;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   bits<7> saddr;
   bits<8> vdst;
@@ -1589,11 +1591,13 @@ let OtherPredicates = [isGFX12Plus] in {
 
   let WaveSizePredicate = isWave32 in {
     defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
-    defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
+    foreach vt = [v8i16, v8f16, v8bf16] in
+      defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, vt>;
   }
   let WaveSizePredicate = isWave64 in {
     defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
-    defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>;
+    foreach vt = [v4i16, v4f16, v4bf16] in
+      defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a402fc6d7e611..a8b171aa82840 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,6 +14,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/TargetParser/TargetParser.h"
@@ -1104,6 +1105,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixWMMAHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
+  fixRequiredExportPriority(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2895,3 +2897,113 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
 
   return true;
 }
+
+static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
+                               const SIInstrInfo &TII) {
+  MachineBasicBlock &EntryMBB = MF->front();
+  if (EntryMBB.begin() != EntryMBB.end()) {
+    auto &EntryMI = *EntryMBB.begin();
+    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
+        EntryMI.getOperand(0).getImm() >= Priority)
+      return false;
+  }
+
+  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
+      .addImm(Priority);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
+  if (!ST.hasRequiredExportPriority())
+    return false;
+
+  // Assume the following shader types will never have exports,
+  // and avoid adding or adjusting S_SETPRIO.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  auto CC = MF->getFunction().getCallingConv();
+  switch (CC) {
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_CS_Chain:
+  case CallingConv::AMDGPU_CS_ChainPreserve:
+  case CallingConv::AMDGPU_KERNEL:
+    return false;
+  default:
+    break;
+  }
+
+  const int MaxPriority = 3;
+  const int NormalPriority = 2;
+  const int PostExportPriority = 0;
+
+  auto It = MI->getIterator();
+  switch (MI->getOpcode()) {
+  case AMDGPU::S_ENDPGM:
+  case AMDGPU::S_ENDPGM_SAVED:
+  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
+  case AMDGPU::SI_RETURN_TO_EPILOG:
+    // Ensure shader with calls raises priority at entry.
+    // This ensures correct priority if exports exist in callee.
+    if (MF->getFrameInfo().hasCalls())
+      return ensureEntrySetPrio(MF, NormalPriority, TII);
+    return false;
+  case AMDGPU::S_SETPRIO: {
+    // Raise minimum priority unless in workaround.
+    auto &PrioOp = MI->getOperand(0);
+    int Prio = PrioOp.getImm();
+    bool InWA = (Prio == PostExportPriority) &&
+                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
+    if (InWA || Prio >= NormalPriority)
+      return false;
+    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
+    return true;
+  }
+  default:
+    if (!TII.isEXP(*MI))
+      return false;
+    break;
+  }
+
+  // Check entry priority at each export (as there will only be a few).
+  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
+  bool Changed = false;
+  if (CC != CallingConv::AMDGPU_Gfx)
+    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
+
+  auto NextMI = std::next(It);
+  bool EndOfShader = false;
+  if (NextMI != MBB->end()) {
+    // Only need WA at end of sequence of exports.
+    if (TII.isEXP(*NextMI))
+      return Changed;
+    // Assume appropriate S_SETPRIO after export means WA already applied.
+    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
+        NextMI->getOperand(0).getImm() == PostExportPriority)
+      return Changed;
+    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
+  }
+
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  // Lower priority.
+  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
+      .addImm(PostExportPriority);
+
+  if (!EndOfShader) {
+    // Wait for exports to complete.
+    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
+        .addReg(AMDGPU::SGPR_NULL)
+        .addImm(0);
+  }
+
+  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
+  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
+
+  if (!EndOfShader) {
+    // Return to normal (higher) priority.
+    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
+        .addImm(NormalPriority);
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 3ccca527c626b..f2a64ab48e180 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -107,6 +107,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixWMMAHazards(MachineInstr *MI);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
+  bool fixRequiredExportPriority(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e5817594a4521..def89c785b855 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -238,6 +238,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasVOPDInsts = false;
   bool HasVALUTransUseHazard = false;
   bool HasForceStoreSC0SC1 = false;
+  bool HasRequiredExportPriority = false;
 
   bool RequiresCOV6 = false;
 
@@ -1282,6 +1283,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
 
+  bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
+
   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 68774c12516ca..c2d2ca0f90f93 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/AMDGPUFixupKinds.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 37bb9675d8c1d..94bf5e4b95270 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -47,12 +47,6 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
   printAnnotation(OS, Annot);
 }
 
-void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
-                                          const MCSubtargetInfo &STI,
-                                          raw_ostream &O) {
-  O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
 void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                            const MCSubtargetInfo &STI,
                                            raw_ostream &O) {
@@ -71,16 +65,6 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
     printU32ImmOperand(MI, OpNo, STI, O);
 }
 
-void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
-void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
-}
-
 void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
                                               raw_ostream &O) {
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
@@ -135,24 +119,6 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
   }
 }
 
-void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " offset0:";
-    printU8ImmDecOperand(MI, OpNo, O);
-  }
-}
-
-void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " offset1:";
-    printU8ImmDecOperand(MI, OpNo, O);
-  }
-}
-
 void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
@@ -165,13 +131,6 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo,
   O << formatHex(MI->getOperand(OpNo).getImm());
 }
 
-void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
-                                           const MCSubtargetInfo &STI,
-                                           raw_ostream &O) {
-  O << " offset:";
-  printSMEMOffset(MI, OpNo, STI, O);
-}
-
 void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
                                                const MCSubtargetInfo &STI,
                                                raw_ostream &O) {
@@ -289,14 +248,6 @@ void AMDGPUInstPrinter::printScope(int64_t Scope, raw_ostream &O) {
   return;
 }
 
-void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI, raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " dmask:";
-    printU16ImmOperand(MI, OpNo, STI, O);
-  }
-}
-
 void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
                                  const MCSubtargetInfo &STI, raw_ostream &O) {
   unsigned Dim = MI->getOperand(OpNo).getImm();
@@ -372,8 +323,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
   case AMDGPU::SP_REG:
   case AMDGPU::PRIVATE_RSRC_REG:
     llvm_unreachable("pseudo-register should not ever be emitted");
-  case AMDGPU::SCC:
-    llvm_unreachable("pseudo scc should not ever be emitted");
   default:
     break;
   }
@@ -698,26 +647,6 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
   O << " blgp:" << Imm;
 }
 
-void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo,
-                                  const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  if (!Imm)
-    return;
-
-  O << " cbsz:" << Imm;
-}
-
-void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
-                                  const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  if (!Imm)
-    return;
-
-  O << " abid:" << Imm;
-}
-
 void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
                                                const MCSubtargetInfo &STI,
                                                raw_ostream &O) {
@@ -731,34 +660,6 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
     O << ", ";
 }
 
-void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
-                                      const MCSubtargetInfo &STI,
-                                      raw_ostream &O) {
-  O << " wait_vdst:";
-  printU4ImmDecOperand(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  O << " wait_va_vdst:";
-  printU4ImmDecOperand(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  O << " wait_vm_vsrc:";
-  printU4ImmDecOperand(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
-                                    const MCSubtargetInfo &STI,
-                                    raw_ostream &O) {
-  O << " wait_exp:";
-  printU4ImmDecOperand(MI, OpNo, O);
-}
-
 bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
                                         unsigned OpNo) const {
   return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) &&
@@ -1080,16 +981,13 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
     O << formatDec((Imm & 0xc0) >> 6) << ']';
   } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
              (Imm <= DppCtrl::ROW_SHL_LAST)) {
-    O << "row_shl:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_shl:" << formatDec(Imm - DppCtrl::ROW_SHL0);
   } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
              (Imm <= DppCtrl::ROW_SHR_LAST)) {
-    O << "row_shr:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_shr:" << formatDec(Imm - DppCtrl::ROW_SHR0);
   } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
              (Imm <= DppCtrl::ROW_ROR_LAST)) {
-    O << "row_ror:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_ror:" << formatDec(Imm - DppCtrl::ROW_ROR0);
   } else if (Imm == DppCtrl::WAVE_SHL1) {
     if (AMDGPU::isGFX10Plus(STI)) {
       O << "/* wave_shl is not supported starting from GFX10 */";
@@ -1141,34 +1039,19 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
            "than GFX90A/GFX10 */";
       return;
     }
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << formatDec(Imm - DppCtrl::ROW_SHARE_FIRST);
   } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
              (Imm <= DppCtrl::ROW_XMASK_LAST)) {
     if (!AMDGPU::isGFX10Plus(STI)) {
       O << "/* row_xmask is not supported on ASICs earlier than GFX10 */";
       return;
     }
-    O << "row_xmask:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_xmask:" << formatDec(Imm - DppCtrl::ROW_XMASK_FIRST);
   } else {
     O << "/* Invalid dpp_ctrl value */";
   }
 }
 
-void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  O << " row_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  O << " bank_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
-}
-
 void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
                                           const MCSubtargetInfo &STI,
                                           raw_ostream &O) {
@@ -1812,14 +1695,13 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
   O << ' ' << formatDec(Imm);
 }
 
-void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  uint8_t Imm = MI->getOperand(OpNo).getImm();
-  if (!Imm)
-    return;
-
-  O << " byte_sel:" << formatDec(Imm);
+void AMDGPUInstPrinter::printNamedInt(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O, StringRef Prefix,
+                                      bool PrintInHex, bool AlwaysPrint) {
+  int64_t V = MI->getOperand(OpNo).getImm();
+  if (AlwaysPrint || V != 0)
+    O << ' ' << Prefix << ':' << (PrintInHex ? formatHex(V) : formatDec(V));
 }
 
 #include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index d6d7fd34b68cc..4d44db5d9d818 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -36,12 +36,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                               const MCRegisterInfo &MRI);
 
 private:
-  void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
-                         const MCSubtargetInfo &STI, raw_ostream &O);
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
-  void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
@@ -52,24 +48,16 @@ class AMDGPUInstPrinter : public MCInstPrinter {
   void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                        raw_ostream &O);
 
-  void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
   void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
                        const MCSubtargetInfo &STI, raw_ostream &O);
   void printSMEMOffset(const MCInst *MI, unsigned OpNo,
                        const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
-                          const MCSubtargetInfo &STI, raw_ostream &O);
   void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
                               const MCSubtargetInfo &STI, raw_ostream &O);
   void printCPol(const MCInst *MI, unsigned OpNo,
                  const MCSubtargetInfo &STI, raw_ostream &O);
   void printTH(const MCInst *MI, int64_t TH, int64_t Scope, raw_ostream &O);
   void printScope(int64_t Scope, raw_ostream &O);
-  void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
   void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                 raw_ostream &O);
   void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -114,10 +102,6 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                  raw_ostream &O);
   void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
-  void printDppRowMask(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printDppBankMask(const MCInst *MI, unsigned OpNo,
-                        const MCSubtargetInfo &STI, raw_ostream &O);
   void printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
                          const MCSubtargetInfo &STI, raw_ostream &O);
   void printDppFI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -158,21 +142,9 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                        const MCSubtargetInfo &STI, raw_ostream &O);
   void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                  raw_ostream &O);
-  void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
-  void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
   bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const;
   void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI,
                               raw_ostream &O);
-  void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printWaitVAVDst(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
 
   void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O, unsigned N);
@@ -186,8 +158,9 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                     const MCSubtargetInfo &STI, raw_ostream &O);
   void printExpTgt(const MCInst *MI, unsigned OpNo,
                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
+  void printNamedInt(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O,
+                     StringRef Prefix, bool PrintInHex, bool AlwaysPrint);
 
 public:
   static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index d2ac5a7ebb2fb..37eb0b57fb153 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -91,10 +91,9 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
   return new AMDGPUInstPrinter(MAI, MII, MRI);
 }
 
-static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
-                                                      formatted_raw_ostream &OS,
-                                                      MCInstPrinter *InstPrint,
-                                                      bool isVerboseAsm) {
+static MCTargetStreamer *
+createAMDGPUAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                              MCInstPrinter *InstPrint) {
   return new AMDGPUTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 531031b580347..8c78db8e83c42 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -19,8 +19,8 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDGPUMetadata.h"
@@ -605,9 +605,9 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
 // We use it for emitting the accumulated PAL metadata as a .note record.
 // The PAL metadata is reset after it is emitted.
 void AMDGPUTargetELFStreamer::finish() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  MCA.setELFHeaderEFlags(getEFlags());
-  MCA.getWriter().setOverrideABIVersion(
+  ELFObjectWriter &W = getStreamer().getWriter();
+  W.setELFHeaderEFlags(getEFlags());
+  W.setOverrideABIVersion(
       getELFABIVersion(STI.getTargetTriple(), CodeObjectVersion));
 
   std::string Blob;
@@ -828,10 +828,8 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
   MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
   SymbolELF->setType(ELF::STT_OBJECT);
 
-  if (!SymbolELF->isBindingSet()) {
+  if (!SymbolELF->isBindingSet())
     SymbolELF->setBinding(ELF::STB_GLOBAL);
-    SymbolELF->setExternal(true);
-  }
 
   if (SymbolELF->declareCommon(Size, Alignment, true)) {
     report_fatal_error("Symbol: " + Symbol->getName() +
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 15fd36ebd10a4..b4e58cfd98a23 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -51,6 +51,7 @@ class MIMGBaseOpcode : PredicateControl {
   bit MSAA = 0;
   bit BVH = 0;
   bit A16 = 0;
+  bit NoReturn = 0;
 }
 
 def MIMGBaseOpcode : GenericEnum {
@@ -62,7 +63,7 @@ def MIMGBaseOpcodesTable : GenericTable {
   let CppTypeName = "MIMGBaseOpcodeInfo";
   let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
                 "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
-                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16"];
+                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn"];
   string TypeOf_BaseOpcode = "MIMGBaseOpcode";
 
   let PrimaryKey = ["BaseOpcode"];
@@ -521,6 +522,25 @@ class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
                     #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
+class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode,
+                            int num_addrs, RegisterClass Addr3RC = VGPR_32,
+                            string dns="">
+  : VSAMPLE_gfx12<op.GFX12, (outs), num_addrs, dns, Addr3RC> {
+  let InOperandList = !con(AddrIns,
+                           (ins SReg_256:$rsrc),
+                           !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)),
+                           (ins DMask:$dmask, Dim:$dim, UNorm:$unorm,
+                                CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe,
+                                LWE:$lwe),
+                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+  let AsmString = opcode#" off, "#AddrAsm#", $rsrc"
+                    #!if(BaseOpcode.Sampler, ", $samp", "")
+                    #"$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+                    #!if(BaseOpcode.HasD16, "$d16", "");
+  // Force vdata to VGPR0 as no result will be returned.
+  let vdata = 0;
+}
+
 multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
                                       RegisterClass dst_rc, bit enableDisasm,
                                       bit ExtendedImageInst = 1,
@@ -835,6 +855,7 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
     let Store = 1;
     let LodOrClampOrMip = mip;
     let HasD16 = has_d16;
+    let NoReturn = 1;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
@@ -1136,44 +1157,62 @@ class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
                       #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
+class MIMG_Sampler_OpList_gfx10p<dag OpPrefix, bit HasD16> {
+  dag ret = !con(OpPrefix,
+                 (ins SReg_256:$srsrc, SReg_128:$ssamp,
+                  DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+                  R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
+                 !if(HasD16, (ins D16:$d16), (ins)));
+}
+
+class MIMG_Sampler_Asm_gfx10p<string opcode, string AsmPrefix, bit HasD16> {
+  string ret = opcode#" "#AsmPrefix#", $srsrc, $ssamp$dmask$dim$unorm"
+               #"$cpol$r128$a16$tfe$lwe"
+               #!if(HasD16, "$d16", "");
+}
+
 class MIMG_Sampler_gfx10<mimgopc op, string opcode,
                          RegisterClass DataRC, RegisterClass AddrRC,
                          string dns="">
   : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
-  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
-                                DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
-                                R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
-                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
-                    #"$cpol$r128$a16$tfe$lwe"
-                    #!if(BaseOpcode.HasD16, "$d16", "");
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, "$vdata, $vaddr0", BaseOpcode.HasD16>.ret;
 }
 
 class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
                              RegisterClass DataRC, int num_addrs,
                              string dns="">
   : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
-  let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
-                                Dim:$dim, UNorm:$unorm, CPol:$cpol,
-                                R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
-                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
-                    #"$cpol$r128$a16$tfe$lwe"
-                    #!if(BaseOpcode.HasD16, "$d16", "");
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, " $vdata, "#AddrAsm, BaseOpcode.HasD16>.ret;
+}
+
+class MIMG_Sampler_nortn_gfx10<mimgopc op, string opcode,
+                         RegisterClass AddrRC,
+                         string dns="">
+  : MIMG_gfx10<op.GFX10M, (outs), dns> {
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, "off, $vaddr0", BaseOpcode.HasD16>.ret;
+  // Force vdata to VGPR0 as no result will be returned.
+  let vdata = 0;
+}
+
+class MIMG_Sampler_nortn_nsa_gfx10<mimgopc op, string opcode,
+                         int num_addrs,
+                         string dns="">
+  : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> {
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, " off, "#AddrAsm, BaseOpcode.HasD16>.ret;
+  // Force vdata to VGPR0 as no result will be returned.
+  let vdata = 0;
 }
 
 class MIMG_Sampler_gfx11<mimgopc op, string opcode,
                          RegisterClass DataRC, RegisterClass AddrRC,
                          string dns="">
   : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
-  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
-                                DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
-                                R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
-                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
-                    #"$cpol$r128$a16$tfe$lwe"
-                    #!if(BaseOpcode.HasD16, "$d16", "");
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, "$vdata, $vaddr0", BaseOpcode.HasD16>.ret;
 }
 
 class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
@@ -1181,14 +1220,26 @@ class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
                              RegisterClass LastVAddrSize, string dns="">
   : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [],
                    LastVAddrSize> {
-  let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
-                                Dim:$dim, UNorm:$unorm, CPol:$cpol,
-                                R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
-                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
-                    #"$cpol$r128$a16$tfe$lwe"
-                    #!if(BaseOpcode.HasD16, "$d16", "");
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, " $vdata, "#AddrAsm, BaseOpcode.HasD16>.ret;
+}
+
+class MIMG_Sampler_nortn_gfx11<mimgopc op, string opcode,
+                                  RegisterClass AddrRC,
+                                  string dns="">
+  : MIMG_gfx11<op.GFX11, (outs), dns> {
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, "off, $vaddr0", BaseOpcode.HasD16>.ret;
+  let vdata = 0;
+}
+
+class MIMG_Sampler_nortn_nsa_gfx11<mimgopc op, string opcode,
+                                      int num_addrs,
+                                      RegisterClass LastVAddrSize, string dns="">
+  : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns, [], LastVAddrSize> {
+  let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
+  let AsmString = MIMG_Sampler_Asm_gfx10p<opcode, "off, "#AddrAsm, BaseOpcode.HasD16>.ret;
+  let vdata = 0;
 }
 
 class MIMGAddrSize<int dw, bit enable_disasm, int AddrDW = dw> {
@@ -1366,6 +1417,57 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
   let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
 }
 
+multiclass MIMG_Sampler_NoReturn <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0, bit isG16, string asm> {
+  def "" : MIMG_Sampler_BaseOpcode<sample> {
+    let HasD16 = 1;
+    let G16 = isG16;
+    let NoReturn = 1;
+  }
+
+  let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+      mayLoad = 1, mayStore = 1, VDataDwords = 0 in {
+    foreach addr = MIMG_Sampler_AddrSizes<sample, isG16>.MachineInstrs in {
+      let VAddrDwords = addr.NumWords in {
+        if op.HAS_GFX10M then {
+          def _V # addr.NumWords # _gfx10
+            : MIMG_Sampler_nortn_gfx10 <op, asm, addr.RegClass>;
+        }
+        if op.HAS_GFX11 then {
+          def _V # addr.NumWords # _gfx11
+            : MIMG_Sampler_nortn_gfx11 <op, asm, addr.RegClass>;
+        }
+      }
+    }
+
+    foreach addr = MIMG_Sampler_AddrSizes<sample, isG16>.NSAInstrs in {
+      let VAddrDwords = addr.NumWords in {
+        if op.HAS_GFX10M then {
+          def _V # addr.NumWords # _nsa_gfx10
+            : MIMG_Sampler_nortn_nsa_gfx10<op, asm, addr.NumWords>;
+        }
+      }
+    }
+
+    foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 5/*MaxNSASize*/>.PartialNSAInstrs in {
+      let VAddrDwords = addr.NumWords in {
+        if op.HAS_GFX11 then {
+          def _V # addr.NumWords # _nsa_gfx11
+            : MIMG_Sampler_nortn_nsa_gfx11<op, asm, addr.NumWords, addr.RegClass>;
+        }
+      }
+    }
+
+    foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 4/*MaxNSASize*/, 1>.PartialNSAInstrs in {
+      let VAddrDwords = addr.NumWords in {
+        if op.HAS_GFX12 then {
+          def _V # addr.NumWords # _gfx12
+            : VSAMPLE_Sampler_nortn_gfx12<op, asm, addr.NumWords, addr.RegClass>;
+        }
+      }
+    }
+  }
+}
+
 multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
                          bit isG16 = 0, bit isGetLod = 0,
                          string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
@@ -1388,6 +1490,9 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
     let VDataDwords = 5 in
     defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst, isG16>;
   }
+
+  if !not(isGetLod) then
+  defm "_nortn" : MIMG_Sampler_NoReturn <op, sample, wqm, isG16, asm>;
 }
 
 multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
@@ -1755,6 +1860,10 @@ def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>;
 def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>;
 def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>;
 def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
+def : MIMGLZMapping<IMAGE_SAMPLE_L_nortn, IMAGE_SAMPLE_LZ_nortn>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L_nortn, IMAGE_SAMPLE_C_LZ_nortn>;
+def : MIMGLZMapping<IMAGE_SAMPLE_L_O_nortn, IMAGE_SAMPLE_LZ_O_nortn>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L_O_nortn, IMAGE_SAMPLE_C_LZ_O_nortn>;
 
 // MIP to NONMIP Optimization Mapping
 def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
@@ -1777,6 +1886,14 @@ def : MIMGBiasMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_O>;
 def : MIMGBiasMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_CL_O>;
 def : MIMGBiasMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_O>;
 def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_CL_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_nortn, IMAGE_SAMPLE_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_nortn, IMAGE_SAMPLE_CL_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_nortn, IMAGE_SAMPLE_C_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_nortn, IMAGE_SAMPLE_C_CL_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_O_nortn, IMAGE_SAMPLE_O_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O_nortn, IMAGE_SAMPLE_CL_O_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O_nortn, IMAGE_SAMPLE_C_O_nortn>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O_nortn, IMAGE_SAMPLE_C_CL_O_nortn>;
 
 // Offset to NoOffset Optimization Mapping
 def : MIMGOffsetMapping<IMAGE_SAMPLE_O, IMAGE_SAMPLE>;
@@ -1819,6 +1936,34 @@ def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16, IMAGE_SAMPLE_CD_G16>;
 def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16, IMAGE_SAMPLE_CD_CL_G16>;
 def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16, IMAGE_SAMPLE_C_CD_G16>;
 def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16, IMAGE_SAMPLE_C_CD_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_O_nortn, IMAGE_SAMPLE_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CL_O_nortn, IMAGE_SAMPLE_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_nortn, IMAGE_SAMPLE_D_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_nortn, IMAGE_SAMPLE_D_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_G16_nortn, IMAGE_SAMPLE_D_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_G16_nortn, IMAGE_SAMPLE_D_CL_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_L_O_nortn, IMAGE_SAMPLE_L_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_O_nortn, IMAGE_SAMPLE_B_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_CL_O_nortn, IMAGE_SAMPLE_B_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_LZ_O_nortn, IMAGE_SAMPLE_LZ_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_O_nortn, IMAGE_SAMPLE_C_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CL_O_nortn, IMAGE_SAMPLE_C_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_nortn, IMAGE_SAMPLE_C_D_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_nortn, IMAGE_SAMPLE_C_D_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_G16_nortn, IMAGE_SAMPLE_C_D_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_G16_nortn, IMAGE_SAMPLE_C_D_CL_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_L_O_nortn, IMAGE_SAMPLE_C_L_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_CL_O_nortn, IMAGE_SAMPLE_C_B_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_O_nortn, IMAGE_SAMPLE_C_B_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_LZ_O_nortn, IMAGE_SAMPLE_C_LZ_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16_nortn, IMAGE_SAMPLE_CD_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16_nortn, IMAGE_SAMPLE_CD_CL_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16_nortn, IMAGE_SAMPLE_C_CD_G16_nortn>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16_nortn, IMAGE_SAMPLE_C_CD_CL_G16_nortn>;
 
 // G to G16 Optimization Mapping
 def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
@@ -1837,3 +1982,19 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD_O_G16>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL_O_G16>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD_O_G16>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_nortn, IMAGE_SAMPLE_D_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_nortn, IMAGE_SAMPLE_D_CL_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_nortn, IMAGE_SAMPLE_C_D_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_nortn, IMAGE_SAMPLE_C_D_CL_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_O_nortn, IMAGE_SAMPLE_D_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_O_nortn, IMAGE_SAMPLE_D_CL_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_O_nortn, IMAGE_SAMPLE_C_D_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_O_nortn, IMAGE_SAMPLE_C_D_CL_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_nortn, IMAGE_SAMPLE_CD_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_nortn, IMAGE_SAMPLE_CD_CL_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_nortn, IMAGE_SAMPLE_C_CD_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_nortn, IMAGE_SAMPLE_C_CD_CL_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0e8c96625b221..32ecf350db59c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1361,7 +1361,9 @@ bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
     return false;
 
   Register Dst = MI.getOperand(0).getReg();
-  MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
+  MRI->replaceRegWith(Dst, Src1);
+  if (!MI.getOperand(2).isKill())
+    MRI->clearKillFlags(Src1);
   MI.eraseFromParent();
   return true;
 }
@@ -1579,7 +1581,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
 
   // Clamp is applied after omod, so it is OK if omod is set.
   DefClamp->setImm(1);
-  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+
+  Register DefReg = Def->getOperand(0).getReg();
+  Register MIDstReg = MI.getOperand(0).getReg();
+  if (TRI->isSGPRReg(*MRI, DefReg)) {
+    // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
+    // instruction with a VGPR dst.
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
+            MIDstReg)
+        .addReg(DefReg);
+  } else {
+    MRI->replaceRegWith(MIDstReg, DefReg);
+  }
   MI.eraseFromParent();
 
   // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d5ffb4478bee1..e70a420ab566c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -629,10 +629,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
         case ISD::EXTRACT_VECTOR_ELT:
         case ISD::INSERT_VECTOR_ELT:
         case ISD::INSERT_SUBVECTOR:
-        case ISD::EXTRACT_SUBVECTOR:
         case ISD::SCALAR_TO_VECTOR:
         case ISD::IS_FPCLASS:
           break;
+        case ISD::EXTRACT_SUBVECTOR:
         case ISD::CONCAT_VECTORS:
           setOperationAction(Op, VT, Custom);
           break;
@@ -1190,8 +1190,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     // TODO: Should images get their own address space?
     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
 
-    if (RsrcIntr->IsImage)
+    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
+    if (RsrcIntr->IsImage) {
+      const AMDGPU::ImageDimIntrinsicInfo *Intr =
+          AMDGPU::getImageDimIntrinsicInfo(IntrID);
+      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
       Info.align.reset();
+    }
 
     Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
     if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
@@ -1212,11 +1217,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       if (RsrcIntr->IsImage) {
         unsigned MaxNumLanes = 4;
 
-        const AMDGPU::ImageDimIntrinsicInfo *Intr
-          = AMDGPU::getImageDimIntrinsicInfo(IntrID);
-        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-          AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
-
         if (!BaseOpcode->Gather4) {
           // If this isn't a gather, we may have excess loaded elements in the
           // IR type. Check the dmask for the real number of elements loaded.
@@ -1250,7 +1250,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
 
       Info.flags |= MachineMemOperand::MOStore;
     } else {
-      // Atomic
+      // Atomic or NoReturn Sampler
       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
                                             ISD::INTRINSIC_W_CHAIN;
       Info.flags |= MachineMemOperand::MOLoad |
@@ -1259,9 +1259,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
 
       switch (IntrID) {
       default:
-        Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
-        // XXX - Should this be volatile without known ordering?
-        Info.flags |= MachineMemOperand::MOVolatile;
+        if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
+          // Fake memory access type for no return sampler intrinsics
+          Info.memVT = MVT::i32;
+        } else {
+          // XXX - Should this be volatile without known ordering?
+          Info.flags |= MachineMemOperand::MOVolatile;
+          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+        }
         break;
       case Intrinsic::amdgcn_raw_buffer_load_lds:
       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1272,6 +1277,16 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
         Info.ptrVal = CI.getArgOperand(1);
         return true;
       }
+      case Intrinsic::amdgcn_raw_atomic_buffer_load:
+      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
+      case Intrinsic::amdgcn_struct_atomic_buffer_load:
+      case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
+        Info.memVT =
+            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                    std::numeric_limits<unsigned>::max());
+        Info.flags &= ~MachineMemOperand::MOStore;
+        return true;
+      }
       }
     }
     return true;
@@ -7900,7 +7915,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   bool IsG16 = false;
   bool IsA16 = false;
   SDValue VData;
-  int NumVDataDwords;
+  int NumVDataDwords = 0;
   bool AdjustRetType = false;
   bool IsAtomicPacked16Bit = false;
 
@@ -7949,7 +7964,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       }
 
       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
-    } else {
+    } else if (!BaseOpcode->NoReturn) {
       // Work out the num dwords based on the dmask popcount and underlying type
       // and whether packing is supported.
       MVT LoadVT = ResultTypes[0].getSimpleVT();
@@ -8242,7 +8257,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
   }
-  if (BaseOpcode->Store)
+  if (BaseOpcode->NoReturn)
     return SDValue(NewNode, 0);
   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
@@ -8884,6 +8899,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
     const bool IsFormat =
@@ -8910,7 +8927,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load_format:
-  case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
+  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_struct_atomic_buffer_load:
+  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
     const bool IsFormat =
         IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
         IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
@@ -16099,6 +16118,34 @@ static bool isBFloat2(Type *Ty) {
   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
 }
 
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+                                        const AtomicRMWInst *RMW,
+                                        bool HasSystemScope) {
+  // The remote/fine-grained access logic is different from the integer
+  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+  // fine-grained access does not work, even for a device local allocation.
+  //
+  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+  // allocations work.
+  if (HasSystemScope) {
+    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+        RMW->hasMetadata("amdgpu.no.remote.memory"))
+      return true;
+  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+    return true;
+
+  if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return true;
+
+  // TODO: Auto-upgrade this attribute to the metadata in function body and stop
+  // checking it.
+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
@@ -16249,37 +16296,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     Type *Ty = RMW->getType();
 
     // LDS float and double fmin/fmax were always supported.
-    if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
-      return AtomicExpansionKind::None;
-
-    if (unsafeFPAtomicsDisabled(RMW->getFunction()))
-      return AtomicExpansionKind::CmpXChg;
-
-    // Always expand system scope fp atomics.
-    if (HasSystemScope)
-      return AtomicExpansionKind::CmpXChg;
+    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+                                                 : AtomicExpansionKind::CmpXChg;
+    }
 
-    // For flat and global cases:
-    // float, double in gfx7. Manual claims denormal support.
-    // Removed in gfx8.
-    // float, double restored in gfx10.
-    // double removed again in gfx11, so only f32 for gfx11/gfx12.
-    //
-    // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
-    // f32.
-    //
-    // FIXME: Check scope and fine grained memory
-    if (AS == AMDGPUAS::FLAT_ADDRESS) {
-      if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
-               AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+      // For flat and global cases:
+      // float, double in gfx7. Manual claims denormal support.
+      // Removed in gfx8.
+      // float, double restored in gfx10.
+      // double removed again in gfx11, so only f32 for gfx11/gfx12.
+      //
+      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+      // no f32.
+      if (AS == AMDGPUAS::FLAT_ADDRESS) {
+        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      }
     }
 
     return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a18da72b02ebe..1315aa0855788 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -699,7 +699,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     // these should use VM_CNT.
     if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
       return VMEM_ACCESS;
-    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
+    if (Inst.mayStore() &&
+        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
       // FLAT and SCRATCH instructions may access scratch. Other VMEM
       // instructions do not.
       if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 1fe8beafd5e5d..9b506eb0a711a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -321,6 +321,12 @@ def VOPDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
   let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
 }
 
+// Source-encoded destination operand for instructions like v_swap_b16.
+def VOPSrcEncodedDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
+  let EncoderMethod = VSrcT_b16_Lo128.EncoderMethod;
+  let DecoderMethod = VSrcT_b16_Lo128.DecoderMethod;
+}
+
 class VINTRPe <bits<2> op> : Enc32 {
   bits<8> vdst;
   bits<8> vsrc;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 52044791e6c66..463737f645d45 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -472,6 +472,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
     Offset = 0;
     // Get appropriate operand, and compute width accordingly.
     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+    if (DataOpIdx == -1)
+      return false; // no return sampler
     Width = getOpSize(LdSt, DataOpIdx);
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d2838349340d2..b97256bc1ac78 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -828,7 +828,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
 
 class VGPRImm <dag frag> : PatLeaf<frag, [{
   return isVGPRImm(N);
-}]>;
+}]> {
+  let GISelPredicateCode = [{return true;}];
+}
 
 def NegateImm : SDNodeXForm<imm, [{
   return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
@@ -1015,18 +1017,29 @@ def SDWAVopcDst : BoolRC {
   let PrintMethod = "printVOPDst";
 }
 
-class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
+class NamedIntOperand<ValueType Type, string prefix, bit Optional = 1,
                       string name = NAME>
     : CustomOperand<Type, Optional, name> {
+  string Prefix = prefix;
+
   let PredicateMethod =
     "getPredicate([](const AMDGPUOperand &Op) -> bool { "#
     "return Op.isImmTy(AMDGPUOperand::"#ImmTy#"); })";
+
   string Validator = "[](int64_t V) { return true; }";
   string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }";
   let ParserMethod =
     "[this](OperandVector &Operands) -> ParseStatus { "#
     "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
     "AMDGPUOperand::"#ImmTy#", "#ConvertMethod#"); }";
+
+  bit PrintInHex = 0;
+  bit AlwaysPrint = 0;
+  let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "
+                    "const MCSubtargetInfo &STI, raw_ostream &O) { "
+                    "printNamedInt(MI, OpNo, STI, O, \""#Prefix#"\", "#
+                    !if(PrintInHex, "true", "false")#", "#
+                    !if(AlwaysPrint, "true", "false")#"); }";
 }
 
 class NamedBitOperand<string Id, string Name = NAME>
@@ -1065,10 +1078,11 @@ class ArrayOperand0<string Id, string Name = NAME>
 
 let ImmTy = "ImmTyOffset" in
 def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
+let PrintMethod = "printOffset" in
 def Offset : NamedIntOperand<i32, "offset">;
 let Validator = "isUInt<8>" in {
-def Offset0 : NamedIntOperand<i8, "offset0">;
-def Offset1 : NamedIntOperand<i8, "offset1">;
+def Offset0 : NamedIntOperand<i32, "offset0">;
+def Offset1 : NamedIntOperand<i32, "offset1">;
 }
 
 def gds : NamedBitOperand<"gds", "GDS">;
@@ -1103,8 +1117,10 @@ def exp_vm : NamedBitOperand<"vm", "ExpVM">;
 
 def FORMAT : CustomOperand<i8>;
 
+let PrintInHex = 1 in
 def DMask : NamedIntOperand<i16, "dmask">;
-def Dim : CustomOperand<i8>;
+
+def Dim : CustomOperand<i8, /*optional=*/1>;
 
 def dst_sel : SDWAOperand<"dst_sel", "SDWADstSel">;
 def src0_sel : SDWAOperand<"src0_sel", "SDWASrc0Sel">;
@@ -1122,16 +1138,18 @@ def IndexKey8bit : CustomOperand<i32, 1>;
 def dpp8 : CustomOperand<i32, 0, "DPP8">;
 def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
 
-let DefaultValue = "0xf" in {
+let DefaultValue = "0xf", PrintInHex = 1, AlwaysPrint = 1 in {
 def DppRowMask : NamedIntOperand<i32, "row_mask">;
 def DppBankMask : NamedIntOperand<i32, "bank_mask">;
 }
 def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl"> {
   let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }";
+  let PrintMethod = "printDppBoundCtrl";
 }
 
-let DecoderMethod = "decodeDpp8FI" in
+let DecoderMethod = "decodeDpp8FI", PrintMethod = "printDppFI" in
 def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
+let PrintMethod = "printDppFI" in
 def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
 
 def blgp : CustomOperand<i32, 1, "BLGP">;
@@ -1145,6 +1163,7 @@ def hwreg : CustomOperand<i32, 0, "Hwreg">;
 
 def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
 
+let AlwaysPrint = 1 in {
 def WaitVDST : NamedIntOperand<i8, "wait_vdst"> {
   let Validator = "isUInt<4>";
 }
@@ -1157,6 +1176,7 @@ def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst"> {
 def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc"> {
   let Validator = "isUInt<1>";
 }
+} // End AlwaysPrint = 1
 
 def ByteSel : NamedIntOperand<i8, "byte_sel"> {
   let Validator = "isUInt<2>";
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f2721fbd164bf..c41850ab55f75 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -212,11 +212,13 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
 }
 
 let usesCustomInserter = 1 in {
+let WaveSizePredicate = isWave32 in
 def S_INVERSE_BALLOT_U32 : SPseudoInstSI<
   (outs SReg_32:$sdst), (ins SSrc_b32:$mask),
   [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))]
 >;
 
+let WaveSizePredicate = isWave64 in
 def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
   (outs SReg_64:$sdst), (ins SSrc_b64:$mask),
   [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))]
@@ -2136,19 +2138,28 @@ def : GCNPat <
 /********** Immediate Patterns **********/
 /********** ================== **********/
 
-def : GCNPat <
-  (VGPRImm<(i32 imm)>:$imm),
-  (V_MOV_B32_e32 imm:$imm)
->;
+// FIXME: Remove VGPRImm. Should be inferrable from register bank.
+
+foreach vt = [i32, p3, p5, p6, p2] in {
+  def : GCNPat <
+    (VGPRImm<(vt imm)>:$imm),
+    (V_MOV_B32_e32 imm:$imm)
+  >;
+
+  def : GCNPat <
+    (vt imm:$imm),
+    (S_MOV_B32 imm:$imm)
+  >;
+}
 
 def : GCNPat <
-  (VGPRImm<(f32 fpimm)>:$imm),
-  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+  (p5 frameindex:$fi),
+  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
 >;
 
 def : GCNPat <
-  (i32 imm:$imm),
-  (S_MOV_B32 imm:$imm)
+  (p5 frameindex:$fi),
+  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
 >;
 
 def : GCNPat <
@@ -2161,18 +2172,39 @@ def : GCNPat <
   (S_MOV_B32 $ga)
 >;
 
-// FIXME: Workaround for ordering issue with peephole optimizer where
-// a register class copy interferes with immediate folding.  Should
-// use s_mov_b32, which can be shrunk to s_movk_i32
-def : GCNPat <
-  (VGPRImm<(f16 fpimm)>:$imm),
-  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
->;
+foreach pred = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in {
+  let True16Predicate = pred in {
+    def : GCNPat <
+      (VGPRImm<(i16 imm)>:$imm),
+      (V_MOV_B32_e32 imm:$imm)
+    >;
+  }
 
-def : GCNPat <
-  (VGPRImm<(bf16 fpimm)>:$imm),
-  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
->;
+  // FIXME: Workaround for ordering issue with peephole optimizer where
+  // a register class copy interferes with immediate folding.  Should
+  // use s_mov_b32, which can be shrunk to s_movk_i32
+
+  foreach vt = [f16, bf16] in {
+    def : GCNPat <
+      (VGPRImm<(f16 fpimm)>:$imm),
+      (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm)))
+    >;
+  }
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+  def : GCNPat <
+    (VGPRImm<(i16 imm)>:$imm),
+    (V_MOV_B16_t16_e64 0, imm:$imm, 0)
+  >;
+
+  foreach vt = [f16, bf16] in {
+    def : GCNPat <
+      (VGPRImm<(vt fpimm)>:$imm),
+      (V_MOV_B16_t16_e64 0, $imm, 0)
+    >;
+  }
+}
 
 // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
 // immediate and wil be expanded as needed, but we will only use these patterns
@@ -2207,48 +2239,72 @@ def : GCNPat <
   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+def : GCNPat <
+  (VGPRImm<(bf16 fpimm)>:$imm),
+  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 def : GCNPat <
   (bf16 fpimm:$imm),
   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : GCNPat <
-  (p5 frameindex:$fi),
-  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
+  (VGPRImm<(f32 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : GCNPat <
-  (p5 frameindex:$fi),
-  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
+  (f32 fpimm:$imm),
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+foreach vt = [i64, p1, p0, p4] in { // FIXME: Should accept arbitrary addrspace
+  def : GCNPat <
+    (VGPRImm<(vt imm)>:$imm),
+    (V_MOV_B64_PSEUDO imm:$imm)
+  >;
+
+  def : GCNPat <
+    (vt InlineImm64:$imm),
+    (S_MOV_B64 InlineImm64:$imm)
+  >;
+
+  def : GCNPat <
+    (vt imm:$imm),
+    (S_MOV_B64_IMM_PSEUDO imm:$imm)
+  >;
+}
+
 def : GCNPat <
-  (i64 InlineImm64:$imm),
-  (S_MOV_B64 InlineImm64:$imm)
+  (VGPRImm<(f64 fpimm)>:$imm),
+  (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
 >;
 
-// XXX - Should this use a s_cmp to set SCC?
+// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
+// immediate and wil be expanded as needed, but we will only use these patterns
+// for values which can be encoded.
+def : GCNPat <
+  (f64 InlineImmFP64:$imm),
+  (S_MOV_B64 (i64 (bitcast_fpimm_to_i64 $imm)))
+>;
 
-// Set to sign-extended 64-bit value (true = -1, false = 0)
 def : GCNPat <
-  (i1 imm:$imm),
-  (S_MOV_B64 (i64 (as_i64imm $imm)))
-> {
+  (f64 fpimm:$imm),
+  (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
+>;
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : GCNPat <(i1 imm:$imm),
+              (S_MOV_B64 imm:$imm)> {
   let WaveSizePredicate = isWave64;
 }
 
-def : GCNPat <
-  (i1 imm:$imm),
-  (S_MOV_B32 (i32 (as_i32imm $imm)))
-> {
+def : GCNPat <(i1 imm:$imm),
+              (S_MOV_B32 imm:$imm)> {
   let WaveSizePredicate = isWave32;
 }
 
-def : GCNPat <
-  (f64 InlineImmFP64:$imm),
-  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
->;
-
 /********** ================== **********/
 /********** Intrinsic Patterns **********/
 /********** ================== **********/
@@ -3951,6 +4007,14 @@ def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
   let mayStore = 1;
 }
 
+def G_AMDGPU_INTRIN_IMAGE_LOAD_NORET : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins unknown:$intrin, variable_ops);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
 // This is equivalent to the G_INTRINSIC*, but the operands may have
 // been legalized depending on the subtarget requirements.
 def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8b42d4a1dee7a..ae537b194f50c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -353,6 +353,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +364,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +375,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +386,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +511,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +599,10 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +715,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -1212,8 +1228,14 @@ void SILoadStoreOptimizer::copyToDestRegs(
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  // The constrained sload instructions in S_LOAD_IMM class will have
+  // `early-clobber` flag in the dst operand. Remove the flag before using the
+  // MOs in copies.
+  Dest0->setIsEarlyClobber(false);
+  Dest1->setIsEarlyClobber(false);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1700,19 +1722,29 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     case 8:
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
-  case S_LOAD_IMM:
+  case S_LOAD_IMM: {
+    // If XNACK is enabled, use the constrained opcodes when the first load is
+    // under-aligned.
+    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+    bool NeedsConstrainedOpc =
+        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
     switch (Width) {
     default:
       return 0;
     case 2:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
     case 3:
-      return AMDGPU::S_LOAD_DWORDX3_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
     case 4:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
     case 8:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
     }
+  }
   case GLOBAL_LOAD:
     switch (Width) {
     default:
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 452dac4b00993..bd4203ccd6fe4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -217,7 +217,7 @@ class SIMemOpInfo final {
 
 class SIMemOpAccess final {
 private:
-  AMDGPUMachineModuleInfo *MMI = nullptr;
+  const AMDGPUMachineModuleInfo *MMI = nullptr;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -241,7 +241,7 @@ class SIMemOpAccess final {
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(MachineFunction &MF);
+  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -806,9 +806,8 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
   return SIAtomicAddrSpace::OTHER;
 }
 
-SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
-  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
-}
+SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
+    : MMI(&MMI_) {}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -2802,7 +2801,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
-  SIMemOpAccess MOA(MF);
+  const MachineModuleInfo &MMI =
+      getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
 
   for (auto &MBB : MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8a315aa822786..e824e95610a65 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                             ? &AMDGPU::SReg_32RegClass
                                             : &AMDGPU::VGPR_32RegClass;
         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                      MI->getOpcode() == AMDGPU::S_MOV_B32;
         Register ResultReg =
             IsCopy ? MI->getOperand(0).getReg()
                    : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (Offset == 0) {
           unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                                : AMDGPU::V_LSHRREV_B32_e64;
-          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+          Register TmpResultReg = ResultReg;
+          if (IsSALU && LiveSCC) {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0);
+          }
+
+          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
           if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
             // For V_LSHRREV, the operands are reversed (the shift count goes
             // first).
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           if (IsSALU && !LiveSCC)
             Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
           if (IsSALU && LiveSCC) {
-            Register NewDest = RS->scavengeRegisterBackwards(
-                AMDGPU::SReg_32RegClass, Shift, false, 0);
+            Register NewDest =
+                IsCopy ? ResultReg
+                       : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                       Shift, false, 0);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                     NewDest)
-                .addReg(ResultReg);
+                .addReg(TmpResultReg);
             ResultReg = NewDest;
           }
         } else {
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
             // We may have 1 free scratch SGPR even though a carry out is
             // unavailable. Only one additional mov is needed.
-            Register TmpScaledReg = RS->scavengeRegisterBackwards(
-                AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+            Register TmpScaledReg = IsCopy && IsSALU
+                                        ? ResultReg
+                                        : RS->scavengeRegisterBackwards(
+                                              AMDGPU::SReg_32_XM0RegClass, MI,
+                                              false, 0, /*AllowSpill=*/false);
+            Register ScaledReg =
+                TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+            Register TmpResultReg = ScaledReg;
+
+            if (!LiveSCC) {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+                  .addReg(FrameReg)
+                  .addImm(ST.getWavefrontSizeLog2());
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+                  .addReg(TmpResultReg, RegState::Kill)
+                  .addImm(Offset);
+            } else {
+              TmpResultReg = RS->scavengeRegisterBackwards(
+                  AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+              MachineInstrBuilder Add;
+              if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+                BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                        TmpResultReg)
+                    .addImm(ST.getWavefrontSizeLog2())
+                    .addReg(FrameReg);
+                if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+                  BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                          ResultReg)
+                      .addImm(Offset);
+                  Add.addReg(ResultReg, RegState::Kill)
+                      .addReg(TmpResultReg, RegState::Kill)
+                      .addImm(0);
+                } else
+                  Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+              } else {
+                BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                        TmpResultReg)
+                    .addImm(Offset);
+                assert(Offset > 0 &&
+                       isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                       "offset is unsafe for v_mad_u32_u24");
+                // We start with a frame pointer with a wave space value, and an
+                // offset in lane-space. We are materializing a lane space
+                // value. We can either do a right shift of the frame pointer to
+                // get to lane space, or a left shift of the offset to get to
+                // wavespace. We can right shift after the computation to get
+                // back to the desired per-lane value.
+                // We are using the mad_u32_u24 primarily as an add with no
+                // carry out clobber.
+                Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                              TmpResultReg)
+                          .addReg(TmpResultReg, RegState::Kill)
+                          .addImm(ST.getWavefrontSize())
+                          .addReg(FrameReg)
+                          .addImm(0);
+                BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                        TmpResultReg)
+                    .addImm(ST.getWavefrontSizeLog2())
+                    .addReg(FrameReg);
+              }
 
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-                .addReg(ScaledReg, RegState::Kill)
-                .addImm(Offset);
+              Register NewDest = IsCopy ? ResultReg
+                                        : RS->scavengeRegisterBackwards(
+                                              AMDGPU::SReg_32RegClass, *Add,
+                                              false, 0, /*AllowSpill=*/true);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                      NewDest)
+                  .addReg(TmpResultReg);
+              ResultReg = NewDest;
+            }
             if (!IsSALU)
               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-                  .addReg(ScaledReg, RegState::Kill);
+                  .addReg(TmpResultReg, RegState::Kill);
             else
-              ResultReg = ScaledReg;
-
+              ResultReg = TmpResultReg;
             // If there were truly no free SGPRs, we need to undo everything.
             if (!TmpScaledReg.isValid()) {
               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index df7906ebd8a7e..9a51cbbb9f6b8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1364,8 +1364,26 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
         llvm_unreachable("Unknown state");
         break;
       }
+      char StartState = State & StateStrict ? NonStrictState : State;
+      bool WQMToExact =
+          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
+      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
+                        !(Needs & StateExact);
+      bool PreferLast = Needs == StateWQM;
+      // Exact regions in divergent control flow may run at EXEC=0, so try to
+      // exclude instructions with unexpected effects from them.
+      // FIXME: ideally we would branch over these when EXEC=0,
+      // but this requires updating implicit values, live intervals and CFG.
+      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
+        for (MachineBasicBlock::iterator I = First; I != II; ++I) {
+          if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
+            PreferLast = WQMToExact;
+            break;
+          }
+        }
+      }
       MachineBasicBlock::iterator Before =
-          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
+          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
 
       if (State & StateStrict) {
         assert(State == StateStrictWWM || State == StateStrictWQM);
@@ -1385,9 +1403,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
 
         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
         State = Needs;
-
       } else {
-        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
+        if (WQMToExact) {
           if (!WQMFromExec && (OutNeeds & StateWQM)) {
             assert(!SavedWQMReg);
             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
@@ -1395,8 +1412,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
 
           toExact(MBB, Before, SavedWQMReg);
           State = StateExact;
-        } else if (State == StateExact && (Needs & StateWQM) &&
-                   !(Needs & StateExact)) {
+        } else if (ExactToWQM) {
           assert(WQMFromExec == (SavedWQMReg == 0));
 
           toWQM(MBB, Before, SavedWQMReg);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index df1722b1f7fb4..8cc963a6c1bb5 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -11,7 +11,10 @@ def smrd_offset_8 : ImmOperand<i32, "SMRDOffset8", 1>;
 let EncoderMethod = "getSMEMOffsetEncoding",
     DecoderMethod = "decodeSMEMOffset" in {
 def SMEMOffset : ImmOperand<i32, "SMEMOffset", 1>;
-def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0>;
+def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0> {
+  let AlwaysPrint = 1;
+  let PrintInHex = 1;
+}
 def OptSMEMOffsetMod : NamedIntOperand<i32, "offset"> {
   let ImmTy = SMEMOffsetMod.ImmTy;
   let PredicateMethod = SMEMOffsetMod.PredicateMethod;
@@ -73,6 +76,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
   let IsAtomicNoRet        = ps.IsAtomicNoRet;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   let TSFlags = ps.TSFlags;
 
@@ -161,12 +165,25 @@ class SM_Discard_Pseudo <string opName, OffsetMode offsets>
   let has_soffset = offsets.HasSOffset;
 }
 
+multiclass SM_Load_Pseudos<string op, RegisterClass baseClass,
+                           RegisterClass dstClass, OffsetMode offsets> {
+  defvar opName = !tolower(op);
+  def "" : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;
+
+  // The constrained multi-dword load equivalents with early clobber flag at
+  // the dst operands. They are needed only for codegen and there is no need
+  // for their real opcodes.
+  if !gt(dstClass.RegTypes[0].Size, 32) then
+    let Constraints = "@earlyclobber $sdst",
+        PseudoInstr = op # offsets.Variant in
+      def "" # _ec : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;
+}
+
 multiclass SM_Pseudo_Loads<RegisterClass baseClass,
                            RegisterClass dstClass> {
-  defvar opName = !tolower(NAME);
-  def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
-  def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
-  def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
+  defm _IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, IMM_Offset>;
+  defm _SGPR : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_Offset>;
+  defm _SGPR_IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_IMM_Offset>;
 }
 
 multiclass SM_Pseudo_Stores<RegisterClass baseClass,
@@ -853,45 +870,74 @@ def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
 
-multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+  // Returns true if it is a single dword load or naturally aligned multi-dword load.
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  unsigned Size = Ld->getMemoryVT().getStoreSize();
+  return Size <= 4 || Ld->getAlign().value() >= Size;
+}]> {
+  let GISelPredicateCode = [{
+  auto &Ld = cast<GLoad>(MI);
+  TypeSize Size = Ld.getMMO().getSize().getValue();
+  return Size <= 4 || Ld.getMMO().getAlign().value() >= Size;
+  }];
+}
+
+def aligned_smrd_load : SMRDAlignedLoadPat<smrd_load>;
 
+multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
+                          bit immci = true, string suffix = ""> {
   // 1. IMM offset
   def : GCNPat <
-    (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
-  >;
+    (frag (SMRDImm i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) $sbase, $offset, 0))>;
 
   // 2. 32-bit IMM offset on CI
   if immci then def : GCNPat <
-    (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
-    (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
-    let OtherPredicates = [isGFX7Only];
+    (frag (SMRDImm32 i64:$sbase, i32:$offset)),
+    (vt (!cast<InstSI>(Instr#"_IMM_ci"#suffix) $sbase, $offset, 0))> {
+    let SubtargetPredicate = isGFX7Only;
   }
 
   // 3. SGPR offset
   def : GCNPat <
-    (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
-    let OtherPredicates = [isNotGFX9Plus];
+    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR"#suffix) $sbase, $soffset, 0))> {
+    let SubtargetPredicate = isNotGFX9Plus;
   }
   def : GCNPat <
-    (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
-    let OtherPredicates = [isGFX9Plus];
+    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+    let SubtargetPredicate = isGFX9Plus;
   }
 
   // 4. SGPR+IMM offset
   def : GCNPat <
-    (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
-    let OtherPredicates = [isGFX9Plus];
+    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+    let SubtargetPredicate = isGFX9Plus;
   }
 
   // 5. No offset
   def : GCNPat <
-    (vt (smrd_load (i64 SReg_64:$sbase))),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
-  >;
+    (vt (frag (i64 SReg_64:$sbase))),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) i64:$sbase, 0, 0))>;
+}
+
+multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
+  // High priority when XNACK is enabled and the load was naturally aligned.
+  let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 102 in
+    defm: SMRD_Patterns <Instr, vt, aligned_smrd_load, immci>;
+
+  // XNACK is enabled and the load wasn't naturally aligned. The constrained sload variant.
+  if !gt(vt.Size, 32) then {
+    let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 101 in
+      defm: SMRD_Patterns <Instr, vt, smrd_load, /*immci=*/false, /*suffix=*/"_ec">;
+  }
+
+  // XNACK is disabled.
+  let AddedComplexity = 100 in
+    defm: SMRD_Patterns <Instr, vt, smrd_load, immci>;
 }
 
 multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
@@ -907,7 +953,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
     (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
     (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
                                     (extract_cpol $cachepolicy))> {
-    let OtherPredicates = [isGFX7Only];
+    let SubtargetPredicate = isGFX7Only;
     let AddedComplexity = 1;
   }
 
@@ -915,12 +961,12 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
   def : GCNPat <
     (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
     (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isNotGFX9Plus];
+    let SubtargetPredicate = isNotGFX9Plus;
   }
   def : GCNPat <
     (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isGFX9Plus];
+    let SubtargetPredicate = isGFX9Plus;
   }
 
   // 4. Offset as an 32-bit SGPR + immediate
@@ -929,7 +975,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
                     timm:$cachepolicy),
     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
                                              (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isGFX9Plus];
+    let SubtargetPredicate = isGFX9Plus;
   }
 }
 
@@ -938,28 +984,28 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
    def : GCNPat <
      (node (SMRDImm i64:$sbase, i32:$offset)),
      (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
-       let OtherPredicates = [isGFX12Plus];
+       let SubtargetPredicate = isGFX12Plus;
    }
 
    // 2. SGPR offset
    def : GCNPat <
      (node (SMRDSgpr i64:$sbase, i32:$soffset)),
      (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
-       let OtherPredicates = [isGFX12Plus];
+       let SubtargetPredicate = isGFX12Plus;
    }
 
    // 3. SGPR+IMM offset
    def : GCNPat <
      (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
      (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
-       let OtherPredicates = [isGFX12Plus];
+       let SubtargetPredicate = isGFX12Plus;
    }
 
    // 4. No offset
    def : GCNPat <
      (vt (node (i64 SReg_64:$sbase))),
      (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
-       let OtherPredicates = [isGFX12Plus];
+       let SubtargetPredicate = isGFX12Plus;
   }
 }
 
@@ -969,14 +1015,14 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
   def : GCNPat <
     (name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
     (i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isGFX12Plus];
+    let SubtargetPredicate = isGFX12Plus;
   }
 
   // 2. Offset as an 32-bit SGPR
   def : GCNPat <
     (name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
     (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isGFX12Plus];
+    let SubtargetPredicate = isGFX12Plus;
   }
 
   // 3. Offset as an 32-bit SGPR + immediate
@@ -985,7 +1031,7 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
                     timm:$cachepolicy),
     (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
                                              (extract_cpol $cachepolicy)))> {
-    let OtherPredicates = [isGFX12Plus];
+    let SubtargetPredicate = isGFX12Plus;
   }
 }
 
@@ -1005,6 +1051,8 @@ defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">;
 defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">;
 defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">;
 
+} // End let AddedComplexity = 100
+
 foreach vt = Reg32Types.types in {
 defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
 }
@@ -1029,7 +1077,6 @@ foreach vt = SReg_512.RegTypes in {
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
 }
 
-} // End let AddedComplexity = 100
 
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD",     i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2i32>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0d6883254c350..de3191bd91df6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -67,6 +67,7 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
   let isBarrier          = ps.isBarrier;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   // encoding
   bits<7> sdst;
@@ -579,6 +580,7 @@ class SOP2_Real<SOP_Pseudo ps, string name = ps.Mnemonic> :
   let DisableEncoding      = ps.DisableEncoding;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding
   bits<7> sdst;
@@ -996,6 +998,7 @@ class SOPK_Real<SOPK_Pseudo ps, string name = ps.Mnemonic> :
   let isBarrier          = ps.isBarrier;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   // encoding
   bits<7>  sdst;
@@ -1262,6 +1265,7 @@ class SOPC_Real<bits<7> op, SOPC_Pseudo ps> :
   let mayStore             = ps.mayStore;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding
   bits<8> src0;
@@ -1459,6 +1463,7 @@ class SOPP_Real<SOPP_Pseudo ps, string name = ps.Mnemonic> :
   let isBarrier            = ps.isBarrier;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
   bits <16> simm16;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index af2f0bc1a6306..429c3ad335d21 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -404,6 +404,7 @@ struct MIMGBaseOpcodeInfo {
   bool MSAA;
   bool BVH;
   bool A16;
+  bool NoReturn;
 };
 
 LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 09b8da9f5dd48..056d6b9bbcc42 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -15,6 +15,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
   MCParser
   Support
   TargetParser
+  TransformUtils
 
   ADD_TO_COMPONENT
   AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2c0d61ee4afa1..0a2e338b34787 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -88,6 +88,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP1_Real_Gen <VOP1_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
@@ -729,7 +730,21 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
   let isAsCheapAsAMove = 1;
 }
 
+def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
+  let Outs32 = (outs VOPDstOperand_t16Lo128:$vdst,
+                     VOPSrcEncodedDstOperand_t16Lo128:$vdst1);
+  let Ins32 = (ins VOPSrcEncodedDstOperand_t16Lo128:$src0,
+                   VOPDstOperand_t16Lo128:$src1);
+  let Asm32 = " $vdst, $src0";
+}
+
 let SubtargetPredicate = isGFX11Plus in {
+  def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */true> {
+    let Constraints = "$vdst = $src1, $vdst1 = $src0";
+    let DisableEncoding = "$vdst1, $src1";
+    let SchedRW = [Write64Bit, Write64Bit];
+    let True16Predicate = UseRealTrue16Insts;
+  }
   // Restrict src0 to be VGPR
   def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
                                       [], /*VOP1Only=*/ 1> {
@@ -952,7 +967,8 @@ defm V_CTZ_I32_B32         : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a,
   "V_FFBL_B32", "v_ctz_i32_b32">;
 defm V_CLS_I32             : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
   "V_FFBH_I32", "v_cls_i32">;
-defm V_PERMLANE64_B32      : VOP1Only_Real_gfx11_gfx12<0x067>;
+defm V_SWAP_B16              : VOP1Only_Real_gfx11_gfx12<0x066>;
+defm V_PERMLANE64_B32        : VOP1Only_Real_gfx11_gfx12<0x067>;
 defm V_MOV_B16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
 defm V_NOT_B16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
 defm V_CVT_I32_I16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9989752c2f6bc..44eb5f5abafe0 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -107,6 +107,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let SchedRW              = ps.SchedRW;
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP2_Real_Gen <VOP2_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 3bcee28a2cb78..62ca6261c47c8 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -180,6 +180,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
   let SchedRW              = ps.SchedRW;
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..3851415ab0cae 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -194,6 +194,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 
   VOPProfile Pfl = ps.Pfl;
 }
@@ -653,6 +654,7 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -689,6 +691,7 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -889,6 +892,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP_DPP_Base <string OpName, VOPProfile P,
diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/llvm/lib/Target/ARC/ARCFrameLowering.cpp
index 6b5802d972eb7..1227fae13211a 100644
--- a/llvm/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCFrameLowering.cpp
@@ -116,8 +116,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   LLVM_DEBUG(dbgs() << "Emit Prologue: " << MF.getName() << "\n");
   auto *AFI = MF.getInfo<ARCFunctionInfo>();
-  MachineModuleInfo &MMI = MF.getMMI();
-  MCContext &Context = MMI.getContext();
+  MCContext &Context = MF.getContext();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARCInstrInfo *TII = MF.getSubtarget<ARCSubtarget>().getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.begin();
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index 36b00af2c0b48..8cd39668311fd 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -77,8 +77,7 @@ ARCTargetStreamer::~ARCTargetStreamer() = default;
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm) {
+                                                 MCInstPrinter *InstPrint) {
   return new ARCTargetStreamer(S);
 }
 
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 642739a29d6b0..d9a8789315552 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1447,8 +1447,10 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitUnwindingInstruction(MI);
 
   // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
+  }
 
   assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
          "Pseudo flag setting opcode should be expanded early");
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 33b4417aa9b80..c4503d952105e 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -126,9 +126,8 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
 
   void EmitUnwindingInstruction(const MachineInstr *MI);
 
-  // emitPseudoExpansionLowering - tblgen'erated.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  // tblgen'erated.
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
 public:
   unsigned getISAEncoding() override {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index bf6bb8c295efe..3b9195b4e3d85 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2311,7 +2311,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
@@ -5873,6 +5873,7 @@ static bool isLRAvailable(const TargetRegisterInfo &TRI,
 
 std::optional<outliner::OutlinedFunction>
 ARMBaseInstrInfo::getOutliningCandidateInfo(
+    const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
   unsigned SequenceSize = 0;
   for (auto &MI : RepeatedSequenceLocs[0])
@@ -6278,8 +6279,9 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
 }
 
 outliner::InstrType
-ARMBaseInstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
-                                   unsigned Flags) const {
+ARMBaseInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                       MachineBasicBlock::iterator &MIT,
+                                       unsigned Flags) const {
   MachineInstr &MI = *MIT;
   const TargetRegisterInfo *TRI = &getRegisterInfo();
 
@@ -6350,8 +6352,7 @@ ARMBaseInstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
 
     // We have a function we have information about.  Check if it's something we
     // can safely outline.
-    MachineFunction *MF = MI.getParent()->getParent();
-    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
+    MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
 
     // We don't know what's going on with the callee at all.  Don't touch it.
     if (!CalleeMF)
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index a3c2684ac1fb9..8521e3ef91399 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -356,11 +356,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
   bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
                                    bool OutlineFromLinkOnceODRs) const override;
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
   void mergeOutliningCandidateAttributes(
       Function &F, std::vector<outliner::Candidate> &Candidates) const override;
-  outliner::InstrType getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
-                                       unsigned Flags) const override;
+  outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                           MachineBasicBlock::iterator &MIT,
+                                           unsigned Flags) const override;
   bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
                               unsigned &Flags) const override;
   void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index e94b0f6e1a44f..40354f9955989 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -735,8 +735,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo  &MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  MachineModuleInfo &MMI = MF.getMMI();
-  MCContext &Context = MMI.getContext();
+  MCContext &Context = MF.getContext();
   const TargetMachine &TM = MF.getTarget();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
@@ -1167,7 +1166,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
         if (STI.splitFramePushPop(MF)) {
           unsigned DwarfReg = MRI->getDwarfRegNum(
               Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, true);
-          unsigned Offset = MFI.getObjectOffset(FI);
+          int64_t Offset = MFI.getObjectOffset(FI);
           unsigned CFIIndex = MF.addFrameInst(
               MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
           BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1189,7 +1188,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
       if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
           (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
         unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
-        unsigned Offset = MFI.getObjectOffset(FI);
+        int64_t Offset = MFI.getObjectOffset(FI);
         unsigned CFIIndex = MF.addFrameInst(
             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
         BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -2995,8 +2994,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
     report_fatal_error("Segmented stacks not supported on this platform.");
 
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  MCContext &Context = MMI.getContext();
+  MCContext &Context = MF.getContext();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseInstrInfo &TII =
       *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 2c2853223ba56..c6d4aa9ba835c 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -218,7 +218,7 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
   // Emit "B #20" instruction, which jumps over the next 24 bytes (because
   // register pc is 8 bytes ahead of the jump instruction by the moment CPU
   // is executing it).
-  // By analogy to ARMAsmPrinter::emitPseudoExpansionLowering() |case ARM::B|.
+  // By analogy to ARMAsmPrinter::lowerPseudoInstExpansion() |case ARM::B|.
   // It is not clear why |addReg(0)| is needed (the last operand).
   EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc).addImm(20)
     .addImm(ARMCC::AL).addReg(0));
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 9234881c9407e..447db18b8defa 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -170,44 +170,6 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) {
   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
 }
 
-const RegisterBank &
-ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                            LLT) const {
-  using namespace ARM;
-
-  switch (RC.getID()) {
-  case GPRRegClassID:
-  case GPRwithAPSRRegClassID:
-  case GPRnoipRegClassID:
-  case GPRnopcRegClassID:
-  case GPRnoip_and_GPRnopcRegClassID:
-  case rGPRRegClassID:
-  case GPRspRegClassID:
-  case tcGPRRegClassID:
-  case tcGPRnotr12RegClassID:
-  case tGPRRegClassID:
-  case tGPREvenRegClassID:
-  case tGPROddRegClassID:
-  case tGPR_and_tGPREvenRegClassID:
-  case tGPR_and_tGPROddRegClassID:
-  case tGPREven_and_tcGPRRegClassID:
-  case tGPROdd_and_tcGPRRegClassID:
-  case tGPREven_and_tcGPRnotr12RegClassID:
-    return getRegBank(ARM::GPRRegBankID);
-  case HPRRegClassID:
-  case SPR_8RegClassID:
-  case SPRRegClassID:
-  case DPR_8RegClassID:
-  case DPRRegClassID:
-  case QPRRegClassID:
-    return getRegBank(ARM::FPRRegBankID);
-  default:
-    llvm_unreachable("Unsupported register kind");
-  }
-
-  llvm_unreachable("Switch should handle all register classes");
-}
-
 const RegisterBankInfo::InstructionMapping &
 ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   auto Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
index c56134aab38c6..2694174623c5c 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -32,9 +32,6 @@ class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
 public:
   ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 };
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b5ca045058cbc..e940dce7faa14 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1963,7 +1963,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
       return LT.first * ST->getMVEVectorCostFactor(CostKind);
 
-    // Otherwise we use a legal convert followed by a min+max
+    // If we can we use a legal convert followed by a min+max
     if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
          (ST->hasFP64() && LT.second == MVT::f64) ||
          (ST->hasFullFP16() && LT.second == MVT::f16) ||
@@ -1984,7 +1984,25 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
       return LT.first * Cost;
     }
-    break;
+    // Otherwise we need to follow the default expansion that clamps the value
+    // using a float min/max with a fcmp+sel for nan handling when signed.
+    Type *FPTy = ICA.getArgTypes()[0];
+    Type *RetTy = ICA.getReturnType();
+    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
+    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
+    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
+    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+    Cost +=
+        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
+    if (IsSigned) {
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
+                                 CmpInst::FCMP_UNO, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+                                 CmpInst::FCMP_UNO, CostKind);
+    }
+    return Cost;
   }
   }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index eb55a2b5e70b8..994b43f1abb49 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1146,7 +1146,7 @@ enum CompactUnwindEncodings {
 /// instructions. If the CFI instructions describe a frame that cannot be
 /// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
 /// tells the runtime to fallback and unwind using dwarf.
-uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
     const MCDwarfFrameInfo *FI, const MCContext *Ctxt) const {
   DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
   // Only armv7k uses CFI based unwinding.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index ac0c9b101cae1..9c958003ca756 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -34,7 +34,7 @@ class ARMAsmBackendDarwin : public ARMAsmBackend {
         /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype);
   }
 
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override;
 };
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 31b577b9c301f..9df752f8eb680 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
@@ -114,15 +115,14 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
 
 public:
   ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
-                       MCInstPrinter &InstPrinter, bool VerboseAsm);
+                       MCInstPrinter &InstPrinter);
 };
 
 ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
                                            formatted_raw_ostream &OS,
-                                           MCInstPrinter &InstPrinter,
-                                           bool VerboseAsm)
+                                           MCInstPrinter &InstPrinter)
     : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
-      IsVerboseAsm(VerboseAsm) {}
+      IsVerboseAsm(S.isVerboseAsm()) {}
 
 void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
 void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
@@ -671,8 +671,7 @@ class ARMELFStreamer : public MCELFStreamer {
   }
 
   void EmitMappingSymbol(StringRef Name) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
-        Name + "." + Twine(MappingSymbolCounter++)));
+    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
     emitLabel(Symbol);
 
     Symbol->setType(ELF::STT_NOTYPE);
@@ -680,8 +679,7 @@ class ARMELFStreamer : public MCELFStreamer {
   }
 
   void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
-        Name + "." + Twine(MappingSymbolCounter++)));
+    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
     emitLabelAtPos(Symbol, SMLoc(), F, Offset);
     Symbol->setType(ELF::STT_NOTYPE);
     Symbol->setBinding(ELF::STB_LOCAL);
@@ -711,7 +709,6 @@ class ARMELFStreamer : public MCELFStreamer {
 
   bool IsThumb;
   bool IsAndroid;
-  int64_t MappingSymbolCounter = 0;
 
   DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>>
       LastMappingSymbols;
@@ -1122,14 +1119,13 @@ void ARMELFStreamer::reset() {
   MCTargetStreamer &TS = *getTargetStreamer();
   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
   ATS.reset();
-  MappingSymbolCounter = 0;
   MCELFStreamer::reset();
   LastMappingSymbols.clear();
   LastEMSInfo.reset();
   // MCELFStreamer clear's the assembler's e_flags. However, for
   // arm we manually set the ABI version on streamer creation, so
   // do the same here
-  getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+  getWriter().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 }
 
 inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
@@ -1462,9 +1458,8 @@ namespace llvm {
 
 MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
                                              formatted_raw_ostream &OS,
-                                             MCInstPrinter *InstPrint,
-                                             bool isVerboseAsm) {
-  return new ARMTargetAsmStreamer(S, OS, *InstPrint, isVerboseAsm);
+                                             MCInstPrinter *InstPrint) {
+  return new ARMTargetAsmStreamer(S, OS, *InstPrint);
 }
 
 MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) {
@@ -1486,7 +1481,7 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
   // FIXME: This should eventually end up somewhere else where more
   // intelligent flag decisions can be made. For now we are just maintaining
   // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-  S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+  S->getWriter().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 
   return S;
 }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 20603b6cf1b0b..cf4fc37f84553 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -369,10 +369,9 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
 static MCStreamer *
 createARMMachOStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&MAB,
                        std::unique_ptr<MCObjectWriter> &&OW,
-                       std::unique_ptr<MCCodeEmitter> &&Emitter,
-                       bool DWARFMustBeAtTheEnd) {
+                       std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createMachOStreamer(Ctx, std::move(MAB), std::move(OW),
-                             std::move(Emitter), DWARFMustBeAtTheEnd);
+                             std::move(Emitter), false);
 }
 
 static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index a673d590419ec..39b189780f1ff 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -67,8 +67,7 @@ MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU,
 MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S);
 MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
                                              formatted_raw_ostream &OS,
-                                             MCInstPrinter *InstPrint,
-                                             bool isVerboseAsm);
+                                             MCInstPrinter *InstPrint);
 MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
                                                 const MCSubtargetInfo &STI);
 MCTargetStreamer *createARMObjectTargetELFStreamer(MCStreamer &S);
@@ -93,8 +92,7 @@ MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
 MCStreamer *createARMWinCOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     bool IncrementalLinkerCompatible);
+                                     std::unique_ptr<MCCodeEmitter> &&Emitter);
 
 /// Construct an ELF Mach-O object writer.
 std::unique_ptr<MCObjectTargetWriter> createARMELFObjectWriter(uint8_t OSABI);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index c4427948d3b89..5a1ae90b84d74 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -61,8 +61,8 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx,
 
   switch (FixupKind) {
   default: {
-    const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
-    report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+    Ctx.reportError(Fixup.getLoc(), "unsupported relocation type");
+    return COFF::IMAGE_REL_ARM_ABSOLUTE;
   }
   case FK_Data_4:
     switch (Modifier) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index 0fcf6eb1a5abb..e66059c2a0e09 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -70,12 +70,9 @@ MCStreamer *
 llvm::createARMWinCOFFStreamer(MCContext &Context,
                                std::unique_ptr<MCAsmBackend> &&MAB,
                                std::unique_ptr<MCObjectWriter> &&OW,
-                               std::unique_ptr<MCCodeEmitter> &&Emitter,
-                               bool IncrementalLinkerCompatible) {
-  auto *S = new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
-                                   std::move(OW));
-  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
-  return S;
+                               std::unique_ptr<MCCodeEmitter> &&Emitter) {
+  return new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
+                                std::move(OW));
 }
 
 namespace {
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index e908f1fb95124..cb9ded7dee57b 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -149,8 +149,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   const ThumbRegisterInfo *RegInfo =
       static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
   const Thumb1InstrInfo &TII =
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index d520880d73bbd..0d29912bee264 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -13,6 +13,7 @@
 #include "MCTargetDesc/AVRAsmBackend.h"
 #include "MCTargetDesc/AVRFixupKinds.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index 4ac54c8876d72..e32d2d42717f3 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -1,13 +1,12 @@
 #include "AVRELFStreamer.h"
-
+#include "AVRMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 
-#include "AVRMCTargetDesc.h"
-
 namespace llvm {
 
 static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
@@ -56,14 +55,13 @@ static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
 
 AVRELFStreamer::AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI)
     : AVRTargetStreamer(S) {
-
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned EFlags = W.getELFHeaderEFlags();
 
   EFlags |= getEFlagsForFeatureSet(STI.getFeatureBits());
   EFlags |= ELF::EF_AVR_LINKRELAX_PREPARED;
 
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index 119baff83dae4..3153dfb3b827f 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -84,8 +84,7 @@ createAVRObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
 
 static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
                                                    formatted_raw_ostream &OS,
-                                                   MCInstPrinter *InstPrint,
-                                                   bool isVerboseAsm) {
+                                                   MCInstPrinter *InstPrint) {
   return new AVRTargetAsmStreamer(S);
 }
 
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
index 7d121b8d24f05..7f0c2d1dad70b 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -145,8 +145,10 @@ void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
                                        getSubtargetInfo().getFeatureBits());
 
   // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
+  }
 
   // If we just ended a constant pool, mark it as such.
   if (InConstantPool && MI->getOpcode() != CSKY::CONSTPOOL_ENTRY) {
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
index 379189512405a..da47b650f1e19 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
@@ -41,8 +41,7 @@ class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter {
 
   /// tblgen'erated driver function for lowering simple MI->MC
   /// pseudo instructions.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
   void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
 
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
index 059c3c143c311..346b123781541 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
@@ -17,6 +17,7 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -31,10 +32,10 @@ using namespace llvm;
 CSKYTargetELFStreamer::CSKYTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
     : CSKYTargetStreamer(S), CurrentVendor("csky") {
-  MCAssembler &MCA = getStreamer().getAssembler();
+  ELFObjectWriter &W = getStreamer().getWriter();
   const FeatureBitset &Features = STI.getFeatureBits();
 
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  unsigned EFlags = W.getELFHeaderEFlags();
 
   EFlags |= ELF::EF_CSKY_ABIV2;
 
@@ -62,7 +63,7 @@ CSKYTargetELFStreamer::CSKYTargetELFStreamer(MCStreamer &S,
 
   EFlags |= ELF::EF_CSKY_EFV1;
 
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 }
 
 MCELFStreamer &CSKYTargetELFStreamer::getStreamer() {
@@ -168,8 +169,7 @@ void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) {
 
   State = (Name == "$t" ? EMS_Text : EMS_Data);
 
-  auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
-      Name + "." + Twine(MappingSymbolCounter++)));
+  auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
   emitLabel(Symbol);
 
   Symbol->setType(ELF::STT_NOTYPE);
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
index b7931e9222792..1392fc9b5cca9 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
@@ -99,8 +99,6 @@ class CSKYTargetELFStreamer : public CSKYTargetStreamer {
 };
 
 class CSKYELFStreamer : public MCELFStreamer {
-  int64_t MappingSymbolCounter = 0;
-
   void EmitMappingSymbol(StringRef Name);
 
 public:
@@ -138,7 +136,6 @@ class CSKYELFStreamer : public MCELFStreamer {
     MCELFStreamer::emitValueImpl(Value, Size, Loc);
   }
   void reset() override {
-    MappingSymbolCounter = 0;
     State = EMS_None;
     MCELFStreamer::reset();
   }
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index c3403ade389c4..bb122f683644b 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -95,10 +95,9 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
   return S;
 }
 
-static MCTargetStreamer *createCSKYAsmTargetStreamer(MCStreamer &S,
-                                                     formatted_raw_ostream &OS,
-                                                     MCInstPrinter *InstPrinter,
-                                                     bool isVerboseAsm) {
+static MCTargetStreamer *
+createCSKYAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                            MCInstPrinter *InstPrinter) {
   return new CSKYTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index adaaa2a6e0d4e..a66f5b6470934 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -13,14 +13,32 @@
 
 include "llvm/IR/Intrinsics.td"
 
-class DXILOpClass;
+// Abstract class to represent major and minor version values
+class Version<int major, int minor> {
+  int Major = major;
+  int Minor = minor;
+}
+
+// Valid DXIL Version records
+foreach i = 0...8 in {
+  def DXIL1_ #i : Version<1, i>;
+}
 
-// Following is a set of DXIL Operation classes whose names appear to be
-// arbitrary, yet need to be a substring of the function name used during
-// lowering to DXIL Operation calls. These class name strings are specified
-// as the third argument of add_dixil_op in utils/hct/hctdb.py and case converted
-// in utils/hct/hctdb_instrhelp.py of DirectXShaderCompiler repo. The function
-// name has the format "dx.op.<class-name>.<return-type>".
+// Overload type alias of llvm_any_ty
+defvar overloadTy = llvm_any_ty;
+
+// Type aliases for DXIL Op types to LLVM Types.
+// TODO: Define DXIL Types independent of LLVM types
+defvar i1Ty = llvm_i1_ty;
+defvar i8Ty = llvm_i8_ty;
+defvar i16Ty = llvm_i16_ty;
+defvar i32Ty = llvm_i32_ty;
+defvar i64Ty = llvm_i64_ty;
+defvar halfTy = llvm_half_ty;
+defvar floatTy = llvm_float_ty;
+defvar doubleTy = llvm_double_ty;
+
+class DXILOpClass;
 
 defset list<DXILOpClass> OpClasses = {
   def acceptHitAndEndSearch : DXILOpClass;
@@ -206,160 +224,482 @@ defset list<DXILOpClass> OpClasses = {
   def writeSamplerFeedbackGrad : DXILOpClass;
   def writeSamplerFeedbackLevel: DXILOpClass;
 
-  // This is a sentinel definition. Hence placed at the end of the list
-  // and not as part of the above alphabetically sorted valid definitions.
+  // This is a sentinel definition. Hence placed at the end here and
+  // not as part of the above alphabetically sorted valid definitions.
+  // It is never used to construct the name of DXIL Op call name.
   // Additionally it is capitalized unlike all the others.
-  def UnknownOpClass: DXILOpClass;
+  def UnknownOpClass : DXILOpClass;
 }
 
-// Several of the overloaded DXIL Operations support for data types
-// that are a subset of the overloaded LLVM intrinsics that they map to.
-// For e.g., llvm.sin.* intrinsic operates on any floating-point type and
-// maps for lowering to DXIL Op Sin. However, valid overloads of DXIL Sin
-// operation overloads are half (f16) and float (f32) only.
-//
-// The following abstracts overload types specific to DXIL operations.
-
-class DXILType : LLVMType<OtherVT> {
-  let isAny = 1;
-  int isI16OrI32 = 0;
-  int isHalfOrFloat = 0;
-}
-
-// Concrete records for various overload types supported specifically by
-// DXIL Operations.
-let isI16OrI32 = 1 in
-  def llvm_i16ori32_ty : DXILType;
-
-let isHalfOrFloat = 1 in
-  def llvm_halforfloat_ty : DXILType;
-
-// Abstraction DXIL Operation to LLVM intrinsic
-class DXILOpMappingBase {
-  int OpCode = 0;                      // Opcode of DXIL Operation
-  DXILOpClass OpClass = UnknownOpClass;// Class of DXIL Operation.
-  Intrinsic LLVMIntrinsic = ?;         // LLVM Intrinsic DXIL Operation maps to
-  string Doc = "";                     // A short description of the operation
-  list<LLVMType> OpTypes = ?;          // Valid types of DXIL Operation in the
-                                       // format [returnTy, param1ty, ...]
-}
-
-class DXILOpMapping<int opCode, DXILOpClass opClass,
-                    Intrinsic intrinsic, string doc,
-                    list<LLVMType> opTys = []> : DXILOpMappingBase {
-  int OpCode = opCode;                 // Opcode corresponding to DXIL Operation
-  DXILOpClass OpClass = opClass;       // Class of DXIL Operation.
-  Intrinsic LLVMIntrinsic = intrinsic; // LLVM Intrinsic the DXIL Operation maps
-  string Doc = doc;                    // to a short description of the operation
-  list<LLVMType> OpTypes = !if(!eq(!size(opTys), 0), LLVMIntrinsic.Types, opTys);
-}
-
-// Concrete definition of DXIL Operation mapping to corresponding LLVM intrinsic
-def Abs : DXILOpMapping<6, unary, int_fabs,
-                         "Returns the absolute value of the input.">;
-def IsInf : DXILOpMapping<9, isSpecialFloat, int_dx_isinf,
-                         "Determines if the specified value is infinite.",
-                         [llvm_i1_ty, llvm_halforfloat_ty]>;
-def Cos  : DXILOpMapping<12, unary, int_cos,
-                         "Returns cosine(theta) for theta in radians.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Sin  : DXILOpMapping<13, unary, int_sin,
-                         "Returns sine(theta) for theta in radians.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Tan  : DXILOpMapping<14, unary, int_tan,
-                         "Returns tangent(theta) for theta in radians.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ACos  : DXILOpMapping<15, unary, int_acos,
-                         "Returns the arccosine of each component of input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ASin  : DXILOpMapping<16, unary, int_asin,
-                         "Returns the arcsine of each component of input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ATan  : DXILOpMapping<17, unary, int_atan,
-                         "Returns the arctangent of each component of input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HCos  : DXILOpMapping<18, unary, int_cosh,
-                         "Returns the hyperbolic cosine of the specified value.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HSin  : DXILOpMapping<19, unary, int_sinh,
-                         "Returns the hyperbolic sine of the specified value.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HTan  : DXILOpMapping<20, unary, int_tanh,
-                         "Returns the hyperbolic tan of the specified value.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-
-def Exp2 : DXILOpMapping<21, unary, int_exp2,
-                         "Returns the base 2 exponential, or 2**x, of the specified value."
-                         "exp2(x) = 2**x.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Frac : DXILOpMapping<22, unary, int_dx_frac,
-                         "Returns a fraction from 0 to 1 that represents the "
-                         "decimal part of the input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Log2 : DXILOpMapping<23, unary, int_log2,
-                         "Returns the base-2 logarithm of the specified value.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Sqrt : DXILOpMapping<24, unary, int_sqrt,
-                         "Returns the square root of the specified floating-point"
-                         "value, per component.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def RSqrt : DXILOpMapping<25, unary, int_dx_rsqrt,
-                         "Returns the reciprocal of the square root of the specified value."
-                         "rsqrt(x) = 1 / sqrt(x).",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Round : DXILOpMapping<26, unary, int_roundeven,
-                         "Returns the input rounded to the nearest integer"
-                         "within a floating-point type.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Floor : DXILOpMapping<27, unary, int_floor,
-                         "Returns the largest integer that is less than or equal to the input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Ceil : DXILOpMapping<28, unary, int_ceil,
-                         "Returns the smallest integer that is greater than or equal to the input.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Trunc : DXILOpMapping<29, unary, int_trunc,
-                         "Returns the specified value truncated to the integer component.",
-                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Rbits : DXILOpMapping<30, unary, int_bitreverse,
-                         "Returns the specified value with its bits reversed.",
-                         [llvm_anyint_ty, LLVMMatchType<0>]>;
-def FMax : DXILOpMapping<35, binary, int_maxnum,
-                         "Float maximum. FMax(a,b) = a > b ? a : b">;
-def FMin : DXILOpMapping<36, binary, int_minnum,
-                         "Float minimum. FMin(a,b) = a < b ? a : b">;
-def SMax : DXILOpMapping<37, binary, int_smax,
-                         "Signed integer maximum. SMax(a,b) = a > b ? a : b">;
-def SMin : DXILOpMapping<38, binary, int_smin,
-                         "Signed integer minimum. SMin(a,b) = a < b ? a : b">;
-def UMax : DXILOpMapping<39, binary, int_umax,
-                         "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">;
-def UMin : DXILOpMapping<40, binary, int_umin,
-                         "Unsigned integer minimum. UMin(a,b) = a < b ? a : b">;
-def FMad : DXILOpMapping<46, tertiary, int_fmuladd,
-                         "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m * a + b.">;
-def IMad : DXILOpMapping<48, tertiary, int_dx_imad,
-                         "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m * a + b.">;
-def UMad : DXILOpMapping<49, tertiary, int_dx_umad,
-                         "Unsigned integer arithmetic multiply/add operation. umad(m,a,b) = m * a + b.">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 4)) in
-  def Dot2 : DXILOpMapping<54, dot2, int_dx_dot2,
-                           "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 1">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 6)) in
-  def Dot3 : DXILOpMapping<55, dot3, int_dx_dot3,
-                           "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 2">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 8)) in
-  def Dot4 : DXILOpMapping<56, dot4, int_dx_dot4,
-                           "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 3">;
-def ThreadId : DXILOpMapping<93, threadId, int_dx_thread_id,
-                             "Reads the thread ID">;
-def GroupId  : DXILOpMapping<94, groupId, int_dx_group_id,
-                             "Reads the group ID (SV_GroupID)">;
-def ThreadIdInGroup : DXILOpMapping<95, threadIdInGroup,
-                                    int_dx_thread_id_in_group,
-                                    "Reads the thread ID within the group "
-                                    "(SV_GroupThreadID)">;
-def FlattenedThreadIdInGroup : DXILOpMapping<96, flattenedThreadIdInGroup,
-                                             int_dx_flattened_thread_id_in_group,
-                                             "Provides a flattened index for a "
-                                             "given thread within a given "
-                                             "group (SV_GroupIndex)">;
+class DXILShaderStage;
+
+def compute : DXILShaderStage;
+def domain : DXILShaderStage;
+def hull : DXILShaderStage;
+def pixel : DXILShaderStage;
+def vertex : DXILShaderStage;
+def geometry : DXILShaderStage;
+def library : DXILShaderStage;
+def amplification : DXILShaderStage;
+def mesh : DXILShaderStage;
+def node : DXILShaderStage;
+def raygeneration : DXILShaderStage;
+def intersection : DXILShaderStage;
+def anyhit : DXILShaderStage;
+def closesthit : DXILShaderStage;
+def callable : DXILShaderStage;
+def miss : DXILShaderStage;
+
+// Pseudo-stages
+// Denote DXIL Op to be supported in all stages
+def all_stages : DXILShaderStage;
+// Denote support for DXIL Op to have been removed
+def removed : DXILShaderStage;
+// DXIL Op attributes
+
+class DXILAttribute;
+
+def ReadOnly : DXILAttribute;
+def ReadNone : DXILAttribute;
+def IsDerivative : DXILAttribute;
+def IsGradient : DXILAttribute;
+def IsFeedback : DXILAttribute;
+def IsWave : DXILAttribute;
+def NeedsUniformInputs : DXILAttribute;
+def IsBarrier : DXILAttribute;
+
+class Overloads<Version ver, list<LLVMType> ols> {
+  Version dxil_version = ver;
+  list<LLVMType> overload_types = ols;
+}
+
+class Stages<Version ver, list<DXILShaderStage> st> {
+  Version dxil_version = ver;
+  list<DXILShaderStage> shader_stages = st;
+}
+
+class Attributes<Version ver = DXIL1_0, list<DXILAttribute> attrs> {
+  Version dxil_version = ver;
+  list<DXILAttribute> op_attrs = attrs;
+}
+
+// Abstraction DXIL Operation
+class DXILOp<int opcode, DXILOpClass opclass> {
+  // A short description of the operation
+  string Doc = "";
+
+  // Opcode of DXIL Operation
+  int OpCode = opcode;
+
+  // Class of DXIL Operation.
+  DXILOpClass OpClass = opclass;
+
+  // LLVM Intrinsic DXIL Operation maps to
+  Intrinsic LLVMIntrinsic = ?;
+
+  // Result type of the op
+  LLVMType result;
+
+  // List of argument types of the op. Default to 0 arguments.
+  list<LLVMType> arguments = [];
+
+  // List of valid overload types predicated by DXIL version
+  list<Overloads> overloads = [];
+
+  // List of valid shader stages predicated by DXIL version
+  list<Stages> stages;
+
+  // Versioned attributes of operation
+  list<Attributes> attributes = [];
+}
+
+// Concrete definitions of DXIL Operations
+
+def Abs :  DXILOp<6, unary> {
+  let Doc = "Returns the absolute value of the input.";
+  let LLVMIntrinsic = int_fabs;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def IsInf :  DXILOp<9, isSpecialFloat> {
+  let Doc = "Determines if the specified value is infinite.";
+  let LLVMIntrinsic = int_dx_isinf;
+  let arguments = [overloadTy];
+  let result = i1Ty;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Cos :  DXILOp<12, unary> {
+  let Doc = "Returns cosine(theta) for theta in radians.";
+  let LLVMIntrinsic = int_cos;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Sin :  DXILOp<13, unary> {
+  let Doc = "Returns sine(theta) for theta in radians.";
+  let LLVMIntrinsic = int_sin;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Tan :  DXILOp<14, unary> {
+  let Doc = "Returns tangent(theta) for theta in radians.";
+  let LLVMIntrinsic = int_tan;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ACos :  DXILOp<15, unary> {
+  let Doc = "Returns the arccosine of the specified value.";
+  let LLVMIntrinsic = int_acos;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ASin :  DXILOp<16, unary> {
+  let Doc = "Returns the arcsine of the specified value.";
+  let LLVMIntrinsic = int_asin;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ATan :  DXILOp<17, unary> {
+  let Doc = "Returns the arctangent of the specified value.";
+  let LLVMIntrinsic = int_atan;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HCos :  DXILOp<18, unary> {
+  let Doc = "Returns the hyperbolic cosine of the specified value.";
+  let LLVMIntrinsic = int_cosh;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HSin :  DXILOp<19, unary> {
+  let Doc = "Returns the hyperbolic sine of the specified value.";
+  let LLVMIntrinsic = int_sinh;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HTan :  DXILOp<20, unary> {
+  let Doc = "Returns the hyperbolic tan of the specified value.";
+  let LLVMIntrinsic = int_tanh;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Exp2 :  DXILOp<21, unary> {
+  let Doc = "Returns the base 2 exponential, or 2**x, of the specified value. "
+            "exp2(x) = 2**x.";
+  let LLVMIntrinsic = int_exp2;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Frac :  DXILOp<22, unary> {
+  let Doc = "Returns a fraction from 0 to 1 that represents the decimal part "
+            "of the input.";
+  let LLVMIntrinsic = int_dx_frac;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Log2 :  DXILOp<23, unary> {
+  let Doc = "Returns the base-2 logarithm of the specified value.";
+  let LLVMIntrinsic = int_log2;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Sqrt :  DXILOp<24, unary> {
+  let Doc = "Returns the square root of the specified floating-point value, "
+            "per component.";
+  let LLVMIntrinsic = int_sqrt;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def RSqrt :  DXILOp<25, unary> {
+  let Doc = "Returns the reciprocal of the square root of the specified value. "
+            "rsqrt(x) = 1 / sqrt(x).";
+  let LLVMIntrinsic = int_dx_rsqrt;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Round :  DXILOp<26, unary> {
+  let Doc = "Returns the input rounded to the nearest integer within a "
+            "floating-point type.";
+  let LLVMIntrinsic = int_roundeven;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Floor :  DXILOp<27, unary> {
+  let Doc =
+      "Returns the largest integer that is less than or equal to the input.";
+  let LLVMIntrinsic = int_floor;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Ceil :  DXILOp<28, unary> {
+  let Doc = "Returns the smallest integer that is greater than or equal to the "
+            "input.";
+  let LLVMIntrinsic = int_ceil;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Trunc :  DXILOp<29, unary> {
+  let Doc = "Returns the specified value truncated to the integer component.";
+  let LLVMIntrinsic = int_trunc;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Rbits :  DXILOp<30, unary> {
+  let Doc = "Returns the specified value with its bits reversed.";
+  let LLVMIntrinsic = int_bitreverse;
+  let arguments = [LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMax :  DXILOp<35, binary> {
+  let Doc = "Float maximum. FMax(a,b) = a > b ? a : b";
+  let LLVMIntrinsic = int_maxnum;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMin :  DXILOp<36, binary> {
+  let Doc = "Float minimum. FMin(a,b) = a < b ? a : b";
+  let LLVMIntrinsic = int_minnum;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def SMax :  DXILOp<37, binary> {
+  let Doc = "Signed integer maximum. SMax(a,b) = a > b ? a : b";
+  let LLVMIntrinsic = int_smax;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def SMin :  DXILOp<38, binary> {
+  let Doc = "Signed integer minimum. SMin(a,b) = a < b ? a : b";
+  let LLVMIntrinsic = int_smin;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMax :  DXILOp<39, binary> {
+  let Doc = "Unsigned integer maximum. UMax(a,b) = a > b ? a : b";
+  let LLVMIntrinsic = int_umax;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMin :  DXILOp<40, binary> {
+  let Doc = "Unsigned integer minimum. UMin(a,b) = a < b ? a : b";
+  let LLVMIntrinsic = int_umin;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMad :  DXILOp<46, tertiary> {
+  let Doc = "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m "
+            "* a + b.";
+  let LLVMIntrinsic = int_fmuladd;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def IMad :  DXILOp<48, tertiary> {
+  let Doc = "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m "
+            "* a + b.";
+  let LLVMIntrinsic = int_dx_imad;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMad :  DXILOp<49, tertiary> {
+  let Doc = "Unsigned integer arithmetic multiply/add operation. umad(m,a, = m "
+            "* a + b.";
+  let LLVMIntrinsic = int_dx_umad;
+  let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+  let result = overloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot2 :  DXILOp<54, dot2> {
+  let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+            "a[n]*b[n] where n is between 0 and 1";
+  let LLVMIntrinsic = int_dx_dot2;
+  let arguments = !listsplat(overloadTy, 4);
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot3 :  DXILOp<55, dot3> {
+  let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+            "a[n]*b[n] where n is between 0 and 2";
+  let LLVMIntrinsic = int_dx_dot3;
+  let arguments = !listsplat(overloadTy, 6);
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot4 :  DXILOp<56, dot4> {
+  let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+            "a[n]*b[n] where n is between 0 and 3";
+  let LLVMIntrinsic = int_dx_dot4;
+  let arguments = !listsplat(overloadTy, 8);
+  let result = overloadTy;
+  let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ThreadId :  DXILOp<93, threadId> {
+  let Doc = "Reads the thread ID";
+  let LLVMIntrinsic = int_dx_thread_id;
+  let arguments = [i32Ty];
+  let result = i32Ty;
+  let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def GroupId :  DXILOp<94, groupId> {
+  let Doc = "Reads the group ID (SV_GroupID)";
+  let LLVMIntrinsic = int_dx_group_id;
+  let arguments = [i32Ty];
+  let result = i32Ty;
+  let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ThreadIdInGroup :  DXILOp<95, threadIdInGroup> {
+  let Doc = "Reads the thread ID within the group  (SV_GroupThreadID)";
+  let LLVMIntrinsic = int_dx_thread_id_in_group;
+  let arguments = [i32Ty];
+  let result = i32Ty;
+  let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FlattenedThreadIdInGroup :  DXILOp<96, flattenedThreadIdInGroup> {
+  let Doc = "Provides a flattened index for a given thread within a given "
+            "group (SV_GroupIndex)";
+  let LLVMIntrinsic = int_dx_flattened_thread_id_in_group;
+  let result = i32Ty;
+  let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
diff --git a/llvm/lib/Target/DirectX/DXILConstants.h b/llvm/lib/Target/DirectX/DXILConstants.h
index f7e37ba16d68d..0c9c1ac38fdbc 100644
--- a/llvm/lib/Target/DirectX/DXILConstants.h
+++ b/llvm/lib/Target/DirectX/DXILConstants.h
@@ -15,9 +15,15 @@
 namespace llvm {
 namespace dxil {
 
-#define DXIL_OP_ENUM
+enum class OpCode : unsigned {
+#define DXIL_OPCODE(Op, Name) Name = Op,
 #include "DXILOperation.inc"
-#undef DXIL_OP_ENUM
+};
+
+enum class OpCodeClass : unsigned {
+#define DXIL_OPCLASS(Name) Name,
+#include "DXILOperation.inc"
+};
 
 } // namespace dxil
 } // namespace llvm
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 0b3982ea0f438..a03701be743c7 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/DXILABI.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <optional>
 
 using namespace llvm;
 using namespace llvm::dxil;
@@ -22,8 +23,8 @@ using namespace llvm::dxil;
 constexpr StringLiteral DXILOpNamePrefix = "dx.op.";
 
 namespace {
-
 enum OverloadKind : uint16_t {
+  UNDEFINED = 0,
   VOID = 1,
   HALF = 1 << 1,
   FLOAT = 1 << 2,
@@ -36,9 +37,27 @@ enum OverloadKind : uint16_t {
   UserDefineType = 1 << 9,
   ObjectType = 1 << 10,
 };
+struct Version {
+  unsigned Major = 0;
+  unsigned Minor = 0;
+};
 
+struct OpOverload {
+  Version DXILVersion;
+  uint16_t ValidTys;
+};
 } // namespace
 
+struct OpStage {
+  Version DXILVersion;
+  uint32_t ValidStages;
+};
+
+struct OpAttribute {
+  Version DXILVersion;
+  uint32_t ValidAttrs;
+};
+
 static const char *getOverloadTypeName(OverloadKind Kind) {
   switch (Kind) {
   case OverloadKind::HALF:
@@ -58,12 +77,13 @@ static const char *getOverloadTypeName(OverloadKind Kind) {
   case OverloadKind::I64:
     return "i64";
   case OverloadKind::VOID:
+  case OverloadKind::UNDEFINED:
+    return "void";
   case OverloadKind::ObjectType:
   case OverloadKind::UserDefineType:
     break;
   }
   llvm_unreachable("invalid overload type for name");
-  return "void";
 }
 
 static OverloadKind getOverloadKind(Type *Ty) {
@@ -131,8 +151,9 @@ struct OpCodeProperty {
   dxil::OpCodeClass OpCodeClass;
   // Offset in DXILOpCodeClassNameTable.
   unsigned OpCodeClassNameOffset;
-  uint16_t OverloadTys;
-  llvm::Attribute::AttrKind FuncAttr;
+  llvm::SmallVector<OpOverload> Overloads;
+  llvm::SmallVector<OpStage> Stages;
+  llvm::SmallVector<OpAttribute> Attributes;
   int OverloadParamIndex;        // parameter index which control the overload.
                                  // When < 0, should be only 1 overload type.
   unsigned NumOfParameters;      // Number of parameters include return value.
@@ -221,6 +242,45 @@ static Type *getTypeFromParameterKind(ParameterKind Kind, Type *OverloadTy) {
   return nullptr;
 }
 
+static ShaderKind getShaderKindEnum(Triple::EnvironmentType EnvType) {
+  switch (EnvType) {
+  case Triple::Pixel:
+    return ShaderKind::pixel;
+  case Triple::Vertex:
+    return ShaderKind::vertex;
+  case Triple::Geometry:
+    return ShaderKind::geometry;
+  case Triple::Hull:
+    return ShaderKind::hull;
+  case Triple::Domain:
+    return ShaderKind::domain;
+  case Triple::Compute:
+    return ShaderKind::compute;
+  case Triple::Library:
+    return ShaderKind::library;
+  case Triple::RayGeneration:
+    return ShaderKind::raygeneration;
+  case Triple::Intersection:
+    return ShaderKind::intersection;
+  case Triple::AnyHit:
+    return ShaderKind::anyhit;
+  case Triple::ClosestHit:
+    return ShaderKind::closesthit;
+  case Triple::Miss:
+    return ShaderKind::miss;
+  case Triple::Callable:
+    return ShaderKind::callable;
+  case Triple::Mesh:
+    return ShaderKind::mesh;
+  case Triple::Amplification:
+    return ShaderKind::amplification;
+  default:
+    break;
+  }
+  llvm_unreachable(
+      "Shader Kind Not Found - Invalid DXIL Environment Specified");
+}
+
 /// Construct DXIL function type. This is the type of a function with
 /// the following prototype
 ///     OverloadType dx.op.<opclass>.<return-type>(int opcode, <param types>)
@@ -232,7 +292,7 @@ static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop,
                                            Type *ReturnTy, Type *OverloadTy) {
   SmallVector<Type *> ArgTys;
 
-  auto ParamKinds = getOpCodeParameterKind(*Prop);
+  const ParameterKind *ParamKinds = getOpCodeParameterKind(*Prop);
 
   // Add ReturnTy as return type of the function
   ArgTys.emplace_back(ReturnTy);
@@ -249,17 +309,103 @@ static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop,
       ArgTys[0], ArrayRef<Type *>(&ArgTys[1], ArgTys.size() - 1), false);
 }
 
+/// Get index of the property from PropList valid for the most recent
+/// DXIL version not greater than DXILVer.
+/// PropList is expected to be sorted in ascending order of DXIL version.
+template <typename T>
+static std::optional<size_t> getPropIndex(ArrayRef<T> PropList,
+                                          const VersionTuple DXILVer) {
+  size_t Index = PropList.size() - 1;
+  for (auto Iter = PropList.rbegin(); Iter != PropList.rend();
+       Iter++, Index--) {
+    const T &Prop = *Iter;
+    if (VersionTuple(Prop.DXILVersion.Major, Prop.DXILVersion.Minor) <=
+        DXILVer) {
+      return Index;
+    }
+  }
+  return std::nullopt;
+}
+
 namespace llvm {
 namespace dxil {
 
+// No extra checks on TargetTriple need be performed to verify that the
+// Triple is well-formed or that the target is supported since these checks
+// would have been done at the time the module M is constructed in the earlier
+// stages of compilation.
+DXILOpBuilder::DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {
+  Triple TT(Triple(M.getTargetTriple()));
+  DXILVersion = TT.getDXILVersion();
+  ShaderStage = TT.getEnvironment();
+  // Ensure Environment type is known
+  if (ShaderStage == Triple::UnknownEnvironment) {
+    report_fatal_error(
+        Twine(DXILVersion.getAsString()) +
+            ": Unknown Compilation Target Shader Stage specified ",
+        /*gen_crash_diag*/ false);
+  }
+}
+
 CallInst *DXILOpBuilder::createDXILOpCall(dxil::OpCode OpCode, Type *ReturnTy,
                                           Type *OverloadTy,
                                           SmallVector<Value *> Args) {
+
   const OpCodeProperty *Prop = getOpCodeProperty(OpCode);
+  std::optional<size_t> OlIndexOrErr =
+      getPropIndex(ArrayRef(Prop->Overloads), DXILVersion);
+  if (!OlIndexOrErr.has_value()) {
+    report_fatal_error(Twine(getOpCodeName(OpCode)) +
+                           ": No valid overloads found for DXIL Version - " +
+                           DXILVersion.getAsString(),
+                       /*gen_crash_diag*/ false);
+  }
+  uint16_t ValidTyMask = Prop->Overloads[*OlIndexOrErr].ValidTys;
 
   OverloadKind Kind = getOverloadKind(OverloadTy);
-  if ((Prop->OverloadTys & (uint16_t)Kind) == 0) {
-    report_fatal_error("Invalid Overload Type", /* gen_crash_diag=*/false);
+
+  // Check if the operation supports overload types and OverloadTy is valid
+  // per the specified types for the operation
+  if ((ValidTyMask != OverloadKind::UNDEFINED) &&
+      (ValidTyMask & (uint16_t)Kind) == 0) {
+    report_fatal_error(Twine("Invalid Overload Type for DXIL operation - ") +
+                           getOpCodeName(OpCode),
+                       /* gen_crash_diag=*/false);
+  }
+
+  // Perform necessary checks to ensure Opcode is valid in the targeted shader
+  // kind
+  std::optional<size_t> StIndexOrErr =
+      getPropIndex(ArrayRef(Prop->Stages), DXILVersion);
+  if (!StIndexOrErr.has_value()) {
+    report_fatal_error(Twine(getOpCodeName(OpCode)) +
+                           ": No valid stages found for DXIL Version - " +
+                           DXILVersion.getAsString(),
+                       /*gen_crash_diag*/ false);
+  }
+  uint16_t ValidShaderKindMask = Prop->Stages[*StIndexOrErr].ValidStages;
+
+  // Ensure valid shader stage properties are specified
+  if (ValidShaderKindMask == ShaderKind::removed) {
+    report_fatal_error(
+        Twine(DXILVersion.getAsString()) +
+            ": Unsupported Target Shader Stage for DXIL operation - " +
+            getOpCodeName(OpCode),
+        /*gen_crash_diag*/ false);
+  }
+
+  // Shader stage need not be validated since getShaderKindEnum() fails
+  // for unknown shader stage.
+
+  // Verify the target shader stage is valid for the DXIL operation
+  ShaderKind ModuleStagekind = getShaderKindEnum(ShaderStage);
+  if (!(ValidShaderKindMask & ModuleStagekind)) {
+    auto ShaderEnvStr = Triple::getEnvironmentTypeName(ShaderStage);
+    report_fatal_error(Twine(ShaderEnvStr) +
+                           " : Invalid Shader Stage for DXIL operation - " +
+                           getOpCodeName(OpCode) + " for DXIL Version " +
+                           DXILVersion.getAsString(),
+                       /*gen_crash_diag*/ false);
   }
 
   std::string DXILFnName = constructOverloadName(Kind, OverloadTy, *Prop);
@@ -282,40 +428,18 @@ Type *DXILOpBuilder::getOverloadTy(dxil::OpCode OpCode, FunctionType *FT) {
   // If DXIL Op has no overload parameter, just return the
   // precise return type specified.
   if (Prop->OverloadParamIndex < 0) {
-    auto &Ctx = FT->getContext();
-    switch (Prop->OverloadTys) {
-    case OverloadKind::VOID:
-      return Type::getVoidTy(Ctx);
-    case OverloadKind::HALF:
-      return Type::getHalfTy(Ctx);
-    case OverloadKind::FLOAT:
-      return Type::getFloatTy(Ctx);
-    case OverloadKind::DOUBLE:
-      return Type::getDoubleTy(Ctx);
-    case OverloadKind::I1:
-      return Type::getInt1Ty(Ctx);
-    case OverloadKind::I8:
-      return Type::getInt8Ty(Ctx);
-    case OverloadKind::I16:
-      return Type::getInt16Ty(Ctx);
-    case OverloadKind::I32:
-      return Type::getInt32Ty(Ctx);
-    case OverloadKind::I64:
-      return Type::getInt64Ty(Ctx);
-    default:
-      llvm_unreachable("invalid overload type");
-      return nullptr;
-    }
+    return FT->getReturnType();
   }
 
-  // Prop->OverloadParamIndex is 0, overload type is FT->getReturnType().
+  // Consider FT->getReturnType() as default overload type, unless
+  // Prop->OverloadParamIndex != 0.
   Type *OverloadType = FT->getReturnType();
   if (Prop->OverloadParamIndex != 0) {
     // Skip Return Type.
     OverloadType = FT->getParamType(Prop->OverloadParamIndex - 1);
   }
 
-  auto ParamKinds = getOpCodeParameterKind(*Prop);
+  const ParameterKind *ParamKinds = getOpCodeParameterKind(*Prop);
   auto Kind = ParamKinds[Prop->OverloadParamIndex];
   // For ResRet and CBufferRet, OverloadTy is in field of StructType.
   if (Kind == ParameterKind::CBufferRet ||
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
index 5babeae470178..abb9a8d2b4cf8 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.h
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -14,6 +14,7 @@
 
 #include "DXILConstants.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/TargetParser/Triple.h"
 
 namespace llvm {
 class Module;
@@ -28,11 +29,16 @@ namespace dxil {
 
 class DXILOpBuilder {
 public:
-  DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {}
+  DXILOpBuilder(Module &M, IRBuilderBase &B);
   /// Create an instruction that calls DXIL Op with return type, specified
-  /// opcode, and call arguments. \param OpCode Opcode of the DXIL Op call
-  /// constructed \param ReturnTy Return type of the DXIL Op call constructed
+  /// opcode, and call arguments.
+  ///
+  /// \param OpCode Opcode of the DXIL Op call constructed
+  /// \param SMVer Shader Model Version of DXIL Module being constructed.
+  /// \param StageKind Shader Stage for DXIL Module being constructed.
+  /// \param ReturnTy Return type of the DXIL Op call constructed
   /// \param OverloadTy Overload type of the DXIL Op call constructed
+  /// \param Args Arguments for the DXIL Op call constructed
   /// \return DXIL Op call constructed
   CallInst *createDXILOpCall(dxil::OpCode OpCode, Type *ReturnTy,
                              Type *OverloadTy, SmallVector<Value *> Args);
@@ -42,6 +48,8 @@ class DXILOpBuilder {
 private:
   Module &M;
   IRBuilderBase &B;
+  VersionTuple DXILVersion;
+  Triple::EnvironmentType ShaderStage;
 };
 
 } // namespace dxil
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 1329308ffec26..0c0edde43a277 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -104,20 +104,19 @@ static void lowerIntrinsic(dxil::OpCode DXILOp, Function &F, Module &M) {
 static bool lowerIntrinsics(Module &M) {
   bool Updated = false;
 
-#define DXIL_OP_INTRINSIC_MAP
-#include "DXILOperation.inc"
-#undef DXIL_OP_INTRINSIC_MAP
-
   for (Function &F : make_early_inc_range(M.functions())) {
     if (!F.isDeclaration())
       continue;
     Intrinsic::ID ID = F.getIntrinsicID();
-    if (ID == Intrinsic::not_intrinsic)
+    switch (ID) {
+    default:
       continue;
-    auto LowerIt = LowerMap.find(ID);
-    if (LowerIt == LowerMap.end())
-      continue;
-    lowerIntrinsic(LowerIt->second, F, M);
+#define DXIL_OP_INTRINSIC(OpCode, Intrin)                                      \
+  case Intrin:                                                                 \
+    lowerIntrinsic(OpCode, F, M);                                              \
+    break;
+#include "DXILOperation.inc"
+    }
     Updated = true;
   }
   return Updated;
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 889de3a81536d..56098864e987f 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -246,7 +246,7 @@ class DXILPrepareModule : public ModulePass {
   DXILPrepareModule() : ModulePass(ID) {}
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addPreserved<ShaderFlagsAnalysisWrapper>();
-    AU.addPreserved<DXILResourceWrapper>();
+    AU.addPreserved<DXILResourceMDWrapper>();
   }
   static char ID; // Pass identification.
 };
diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
index 7ae568c5ae53a..99cc4067b1d62 100644
--- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
@@ -41,7 +41,7 @@ class DXILPrettyPrinter : public llvm::ModulePass {
   bool runOnModule(Module &M) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
-    AU.addRequired<DXILResourceWrapper>();
+    AU.addRequired<DXILResourceMDWrapper>();
   }
 };
 } // namespace
@@ -49,12 +49,12 @@ class DXILPrettyPrinter : public llvm::ModulePass {
 char DXILPrettyPrinter::ID = 0;
 INITIALIZE_PASS_BEGIN(DXILPrettyPrinter, "dxil-pretty-printer",
                       "DXIL Metadata Pretty Printer", true, true)
-INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper)
 INITIALIZE_PASS_END(DXILPrettyPrinter, "dxil-pretty-printer",
                     "DXIL Metadata Pretty Printer", true, true)
 
 bool DXILPrettyPrinter::runOnModule(Module &M) {
-  dxil::Resources &Res = getAnalysis<DXILResourceWrapper>().getDXILResource();
+  dxil::Resources &Res = getAnalysis<DXILResourceMDWrapper>().getDXILResource();
   Res.print(OS);
   return false;
 }
diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp b/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
index 0b2f0d827ebbc..33e0119807bb8 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
@@ -18,35 +18,35 @@ using namespace llvm;
 
 #define DEBUG_TYPE "dxil-resource-analysis"
 
-dxil::Resources DXILResourceAnalysis::run(Module &M,
-                                          ModuleAnalysisManager &AM) {
+dxil::Resources DXILResourceMDAnalysis::run(Module &M,
+                                            ModuleAnalysisManager &AM) {
   dxil::Resources R;
   R.collect(M);
   return R;
 }
 
-AnalysisKey DXILResourceAnalysis::Key;
+AnalysisKey DXILResourceMDAnalysis::Key;
 
-PreservedAnalyses DXILResourcePrinterPass::run(Module &M,
-                                               ModuleAnalysisManager &AM) {
-  dxil::Resources Res = AM.getResult<DXILResourceAnalysis>(M);
+PreservedAnalyses DXILResourceMDPrinterPass::run(Module &M,
+                                                 ModuleAnalysisManager &AM) {
+  dxil::Resources Res = AM.getResult<DXILResourceMDAnalysis>(M);
   Res.print(OS);
   return PreservedAnalyses::all();
 }
 
-char DXILResourceWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(DXILResourceWrapper, DEBUG_TYPE,
+char DXILResourceMDWrapper::ID = 0;
+INITIALIZE_PASS_BEGIN(DXILResourceMDWrapper, DEBUG_TYPE,
                       "DXIL resource Information", true, true)
-INITIALIZE_PASS_END(DXILResourceWrapper, DEBUG_TYPE,
+INITIALIZE_PASS_END(DXILResourceMDWrapper, DEBUG_TYPE,
                     "DXIL resource Information", true, true)
 
-bool DXILResourceWrapper::runOnModule(Module &M) {
+bool DXILResourceMDWrapper::runOnModule(Module &M) {
   Resources.collect(M);
   return false;
 }
 
-DXILResourceWrapper::DXILResourceWrapper() : ModulePass(ID) {}
+DXILResourceMDWrapper::DXILResourceMDWrapper() : ModulePass(ID) {}
 
-void DXILResourceWrapper::print(raw_ostream &OS, const Module *) const {
+void DXILResourceMDWrapper::print(raw_ostream &OS, const Module *) const {
   Resources.print(OS);
 }
diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
index bce41160b95ec..3a2b8a9fd39d5 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
+++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
@@ -20,8 +20,9 @@
 
 namespace llvm {
 /// Analysis pass that exposes the \c DXILResource for a module.
-class DXILResourceAnalysis : public AnalysisInfoMixin<DXILResourceAnalysis> {
-  friend AnalysisInfoMixin<DXILResourceAnalysis>;
+class DXILResourceMDAnalysis
+    : public AnalysisInfoMixin<DXILResourceMDAnalysis> {
+  friend AnalysisInfoMixin<DXILResourceMDAnalysis>;
   static AnalysisKey Key;
 
 public:
@@ -29,25 +30,26 @@ class DXILResourceAnalysis : public AnalysisInfoMixin<DXILResourceAnalysis> {
   dxil::Resources run(Module &M, ModuleAnalysisManager &AM);
 };
 
-/// Printer pass for the \c DXILResourceAnalysis results.
-class DXILResourcePrinterPass : public PassInfoMixin<DXILResourcePrinterPass> {
+/// Printer pass for the \c DXILResourceMDAnalysis results.
+class DXILResourceMDPrinterPass
+    : public PassInfoMixin<DXILResourceMDPrinterPass> {
   raw_ostream &OS;
 
 public:
-  explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {}
+  explicit DXILResourceMDPrinterPass(raw_ostream &OS) : OS(OS) {}
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
   static bool isRequired() { return true; }
 };
 
 /// The legacy pass manager's analysis pass to compute DXIL resource
 /// information.
-class DXILResourceWrapper : public ModulePass {
+class DXILResourceMDWrapper : public ModulePass {
   dxil::Resources Resources;
 
 public:
   static char ID; // Pass identification, replacement for typeid
 
-  DXILResourceWrapper();
+  DXILResourceMDWrapper();
 
   dxil::Resources &getDXILResource() { return Resources; }
   const dxil::Resources &getDXILResource() const { return Resources; }
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index ae6d6f96904c8..583bce0f50e70 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -33,7 +33,7 @@ class DXILTranslateMetadata : public ModulePass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
-    AU.addRequired<DXILResourceWrapper>();
+    AU.addRequired<DXILResourceMDWrapper>();
     AU.addRequired<ShaderFlagsAnalysisWrapper>();
   }
 
@@ -51,7 +51,7 @@ bool DXILTranslateMetadata::runOnModule(Module &M) {
   dxil::createDXILVersionMD(M);
 
   const dxil::Resources &Res =
-      getAnalysis<DXILResourceWrapper>().getDXILResource();
+      getAnalysis<DXILResourceMDWrapper>().getDXILResource();
   Res.write(M);
 
   const uint64_t Flags = static_cast<uint64_t>(
@@ -69,7 +69,7 @@ ModulePass *llvm::createDXILTranslateMetadataPass() {
 
 INITIALIZE_PASS_BEGIN(DXILTranslateMetadata, "dxil-metadata-emit",
                       "DXIL Metadata Emit", false, false)
-INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper)
 INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper)
 INITIALIZE_PASS_END(DXILTranslateMetadata, "dxil-metadata-emit",
                     "DXIL Metadata Emit", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 3433408f05171..cd0d6d34e9a67 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() {
     case Type::MetadataTyID:
       Code = bitc::TYPE_CODE_METADATA;
       break;
-    case Type::X86_MMXTyID:
-      Code = bitc::TYPE_CODE_X86_MMX;
-      break;
     case Type::IntegerTyID:
       // INTEGER: [width]
       Code = bitc::TYPE_CODE_INTEGER;
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
index 11b5412c21d78..d056ae2bc488e 100644
--- a/llvm/lib/Target/DirectX/DirectX.h
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -47,7 +47,7 @@ void initializeDXILTranslateMetadataPass(PassRegistry &);
 ModulePass *createDXILTranslateMetadataPass();
 
 /// Initializer for DXILTranslateMetadata.
-void initializeDXILResourceWrapperPass(PassRegistry &);
+void initializeDXILResourceMDWrapperPass(PassRegistry &);
 
 /// Pass to pretty print DXIL metadata.
 ModulePass *createDXILPrettyPrinterPass(raw_ostream &OS);
diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
index 1b326d020d511..7544172ab94e4 100644
--- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def
+++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
@@ -17,7 +17,7 @@
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)
 #endif
 MODULE_ANALYSIS("dx-shader-flags", dxil::ShaderFlagsAnalysis())
-MODULE_ANALYSIS("dxil-resource", DXILResourceAnalysis())
+MODULE_ANALYSIS("dxil-resource-md", DXILResourceMDAnalysis())
 #undef MODULE_ANALYSIS
 
 #ifndef MODULE_PASS
@@ -25,5 +25,5 @@ MODULE_ANALYSIS("dxil-resource", DXILResourceAnalysis())
 #endif
 // TODO: rename to print<foo> after NPM switch
 MODULE_PASS("print-dx-shader-flags", dxil::ShaderFlagsAnalysisPrinter(dbgs()))
-MODULE_PASS("print-dxil-resource", DXILResourcePrinterPass(dbgs()))
+MODULE_PASS("print-dxil-resource-md", DXILResourceMDPrinterPass(dbgs()))
 #undef MODULE_PASS
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index e6dbb25b710ec..92bd69b69684f 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -46,7 +46,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   initializeDXContainerGlobalsPass(*PR);
   initializeDXILOpLoweringLegacyPass(*PR);
   initializeDXILTranslateMetadataPass(*PR);
-  initializeDXILResourceWrapperPass(*PR);
+  initializeDXILResourceMDWrapperPass(*PR);
   initializeShaderFlagsAnalysisWrapperPass(*PR);
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6926b02701771..1922675917752 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1037,7 +1037,7 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
     if (MI->isInlineAsm())
       continue;
     // Delete PHIs if possible.
-    if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+    if (!MI->isPHI() && !MI->isSafeToMove(Store))
       continue;
 
     bool AllDead = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index f0933765bbcbd..86ce6b4e05ed2 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -1223,6 +1223,10 @@ void HCE::recordExtender(MachineInstr &MI, unsigned OpNum) {
   if (ER.Kind == MachineOperand::MO_GlobalAddress)
     if (ER.V.GV->getName().empty())
       return;
+  // Ignore block address that points to block in another function
+  if (ER.Kind == MachineOperand::MO_BlockAddress)
+    if (ER.V.BA->getFunction() != &(MI.getMF()->getFunction()))
+      return;
   Extenders.push_back(ED);
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index f4f84beea734d..05357de40e3a9 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1032,7 +1032,6 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
       MachineBasicBlock::iterator At) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
   auto &HST = MF.getSubtarget<HexagonSubtarget>();
   auto &HII = *HST.getInstrInfo();
   auto &HRI = *HST.getRegisterInfo();
@@ -1043,7 +1042,7 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
   DebugLoc DL;
   const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION);
 
-  MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+  MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
   bool HasFP = hasFP(MF);
 
   if (HasFP) {
@@ -1660,7 +1659,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
   using SpillSlot = TargetFrameLowering::SpillSlot;
 
   unsigned NumFixed;
-  int MinOffset = 0;  // CS offsets are negative.
+  int64_t MinOffset = 0; // CS offsets are negative.
   const SpillSlot *FixedSlots = getCalleeSavedSpillSlots(NumFixed);
   for (const SpillSlot *S = FixedSlots; S != FixedSlots+NumFixed; ++S) {
     if (!SRegs[S->Reg])
@@ -1679,7 +1678,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
     Register R = x;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
     unsigned Size = TRI->getSpillSize(*RC);
-    int Off = MinOffset - Size;
+    int64_t Off = MinOffset - Size;
     Align Alignment = std::min(TRI->getSpillAlign(*RC), getStackAlign());
     Off &= -Alignment.value();
     int FI = MFI.CreateFixedSpillStackObject(Size, Off);
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 8840c272057ab..5e52cf03cfbc7 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1451,7 +1451,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
         Opc == TargetOpcode::LIFETIME_END)
       continue;
     bool Store = false;
-    if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+    if (MI->isInlineAsm() || !MI->isSafeToMove(Store))
       continue;
 
     bool AllDead = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 81035849491bc..39b8c829a0b21 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1748,7 +1748,7 @@ HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
 
   unsigned InpLen = ty(Op.getOperand(0)).getVectorNumElements();
   MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
-  SDValue S = DAG.getConstant(InpLen*BitBytes, dl, MVT::i32);
+  SDValue S = DAG.getConstant(HwLen - InpLen*BitBytes, dl, MVT::i32);
   SDValue Res = getZero(dl, ByteTy, DAG);
   for (unsigned i = 0, e = Prefixes.size(); i != e; ++i) {
     Res = DAG.getNode(HexagonISD::VROR, dl, ByteTy, Res, S);
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index b75cd1beadc58..4ef009c87a1e6 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -829,7 +829,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
     return false;
 
   Value *X = nullptr;
-  if (!match(C, m_c_And(m_Value(X), m_One())))
+  if (!match(C, m_And(m_Value(X), m_One())))
     return false;
   // Matched: select (X & 1) == +++ ? ... : ...
   //          select (X & 1) != +++ ? ... : ...
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e5d10a75728bf..0c1b0aea41f41 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
   case Type::PPC_FP128TyID:
   case Type::LabelTyID:
   case Type::MetadataTyID:
-  case Type::X86_MMXTyID:
   case Type::X86_AMXTyID:
   case Type::TokenTyID:
   case Type::TypedPointerTyID:
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 22c82bf5ac1b4..ddc6898d53dbf 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -97,10 +97,8 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
   StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"};
 
   auto ELFSymbol = cast<MCSymbolELF>(Symbol);
-  if (!ELFSymbol->isBindingSet()) {
+  if (!ELFSymbol->isBindingSet())
     ELFSymbol->setBinding(ELF::STB_GLOBAL);
-    ELFSymbol->setExternal(true);
-  }
 
   ELFSymbol->setType(ELF::STT_OBJECT);
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index eab7647e633bc..aa86b2df85629 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -26,10 +26,10 @@
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -226,12 +226,11 @@ namespace {
 
 class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
   formatted_raw_ostream &OS;
-  bool IsVerboseAsm;
 
 public:
   HexagonTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
-                           bool IsVerboseAsm, MCInstPrinter &IP)
-      : HexagonTargetStreamer(S), OS(OS), IsVerboseAsm(IsVerboseAsm) {}
+                           MCInstPrinter &IP)
+      : HexagonTargetStreamer(S), OS(OS) {}
 
   void prettyPrintAsm(MCInstPrinter &InstPrinter, uint64_t Address,
                       const MCInst &Inst, const MCSubtargetInfo &STI,
@@ -275,7 +274,7 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
 
   void emitAttribute(unsigned Attribute, unsigned Value) override {
     OS << "\t.attribute\t" << Attribute << ", " << Twine(Value);
-    if (IsVerboseAsm) {
+    if (getStreamer().isVerboseAsm()) {
       StringRef Name = ELFAttrs::attrTypeAsString(
           Attribute, HexagonAttrs::getHexagonAttributeTags());
       if (!Name.empty())
@@ -292,8 +291,7 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
   }
   HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
       : HexagonTargetStreamer(S) {
-    MCAssembler &MCA = getStreamer().getAssembler();
-    MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
+    getStreamer().getWriter().setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
   }
 
   void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
@@ -376,10 +374,10 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
     return nullptr;
 }
 
-static MCTargetStreamer *
-createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
-                          MCInstPrinter *IP, bool IsVerboseAsm) {
-  return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *IP);
+static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS,
+                                                   MCInstPrinter *IP) {
+  return new HexagonTargetAsmStreamer(S, OS, *IP);
 }
 
 static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index b8a37435f5a64..9f3d9b7aaa6f9 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -483,7 +483,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 73cb80245bca4..f52e188f87792 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -245,6 +245,16 @@ class LoongArchOperand : public MCParsedAsmOperand {
            VK == LoongArchMCExpr::VK_LoongArch_None;
   }
 
+  bool isTPRelAddSymbol() const {
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    // Must be of 'immediate' type but not a constant.
+    if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
+      return false;
+    return LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+           VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_ADD_R;
+  }
+
   bool isUImm1() const { return isUImm<1>(); }
   bool isUImm2() const { return isUImm<2>(); }
   bool isUImm2plus1() const { return isUImm<2, 1>(); }
@@ -276,6 +286,7 @@ class LoongArchOperand : public MCParsedAsmOperand {
                        VK == LoongArchMCExpr::VK_LoongArch_PCALA_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD;
     return IsConstantImm
@@ -391,6 +402,7 @@ class LoongArchOperand : public MCParsedAsmOperand {
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_HI20;
     return IsConstantImm
                ? isInt<20>(Imm) && IsValidKind
@@ -438,6 +450,24 @@ class LoongArchOperand : public MCParsedAsmOperand {
                      IsValidKind;
   }
 
+  bool isSImm20pcaddi() const {
+    if (!isImm())
+      return false;
+
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+                       VK == LoongArchMCExpr::VK_LoongArch_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2;
+    return IsConstantImm
+               ? isInt<20>(Imm) && IsValidKind
+               : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+                     IsValidKind;
+  }
+
   bool isSImm21lsl2() const {
     if (!isImm())
       return false;
@@ -1664,6 +1694,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         /*Upper=*/(1 << 19) - 1,
         "operand must be a symbol with modifier (e.g. %call36) or an integer "
         "in the range");
+  case Match_InvalidSImm20pcaddi:
+    return generateImmOutOfRangeError(
+        Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+        /*Upper=*/(1 << 19) - 1,
+        "operand must be a symbol with modifier (e.g. %pcrel_20) or an integer "
+        "in the range");
   case Match_InvalidSImm21lsl2:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
@@ -1686,6 +1722,10 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be a bare symbol name");
   }
+  case Match_InvalidTPRelAddSymbol: {
+    SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return Error(ErrorLoc, "operand must be a symbol with %le_add_r modifier");
+  }
   }
   llvm_unreachable("Unknown match type detected!");
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 8a628157c6018..ddb27dc6404fa 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -106,11 +106,6 @@ def FeatureRelax
     : SubtargetFeature<"relax", "HasLinkerRelax", "true",
                        "Enable Linker relaxation">;
 
-// Experimental auto vectorization
-def FeatureAutoVec
-    : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
-                       "Experimental auto vectorization">;
-
 // Floating point approximation operation
 def FeatureFrecipe
     : SubtargetFeature<"frecipe", "HasFrecipe", "true",
@@ -151,6 +146,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
                                              FeatureExtLVZ,
                                              FeatureExtLBT]>;
 
+def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
+                                             FeatureUAL,
+                                             FeatureExtLASX,
+                                             FeatureExtLVZ,
+                                             FeatureExtLBT,
+                                             FeatureFrecipe]>;
+
 //===----------------------------------------------------------------------===//
 // Define the LoongArch target.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 27979a830b10e..f478870217ec6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -34,8 +34,10 @@ void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) {
       MI->getOpcode(), getSubtargetInfo().getFeatureBits());
 
   // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
+  }
 
   switch (MI->getOpcode()) {
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
index 693456443c7a4..9da90886627ec 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
@@ -47,8 +47,8 @@ class LLVM_LIBRARY_VISIBILITY LoongArchAsmPrinter : public AsmPrinter {
   void emitSled(const MachineInstr &MI, SledKind Kind);
 
   // tblgen'erated function.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
+
   // Wrapper needed for tblgenned pseudo lowering.
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
     return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this);
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index c136f5b3e515d..33b93e42bb5c4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -62,27 +62,47 @@ class LoongArchPreRAExpandPseudo : public MachineFunctionPass {
                                MachineBasicBlock::iterator &NextMBBI,
                                unsigned FlagsHi, unsigned SecondOpcode,
                                unsigned FlagsLo);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO,
+                              const MachineOperand &Symbol, Register DestReg,
+                              bool EraseFromParent);
   bool expandLoadAddressPcrel(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI);
+                              MachineBasicBlock::iterator &NextMBBI,
+                              bool Large = false);
   bool expandLoadAddressGot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
-                            MachineBasicBlock::iterator &NextMBBI);
+                            MachineBasicBlock::iterator &NextMBBI,
+                            bool Large = false);
   bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI);
+                              MachineBasicBlock::iterator &NextMBBI,
+                              bool Large = false);
   bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI);
+                              MachineBasicBlock::iterator &NextMBBI,
+                              bool Large = false);
   bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI);
+                              MachineBasicBlock::iterator &NextMBBI,
+                              bool Large = false);
   bool expandLoadAddressTLSDesc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MBBI,
-                                MachineBasicBlock::iterator &NextMBBI);
+                                MachineBasicBlock::iterator &NextMBBI,
+                                bool Large = false);
+  bool expandFunctionCALL(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          MachineBasicBlock::iterator &NextMBBI,
+                          bool IsTailCall);
 };
 
 char LoongArchPreRAExpandPseudo::ID = 0;
@@ -115,18 +135,38 @@ bool LoongArchPreRAExpandPseudo::expandMI(
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoLA_PCREL:
     return expandLoadAddressPcrel(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_PCREL_LARGE:
+    return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_GOT:
     return expandLoadAddressGot(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_GOT_LARGE:
+    return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LE:
     return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoLA_TLS_IE:
     return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_IE_LARGE:
+    return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LD:
     return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_LD_LARGE:
+    return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_GD:
     return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_GD_LARGE:
+    return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_DESC_PC:
     return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
+    return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI, /*Large=*/true);
+  case LoongArch::PseudoCALL:
+  case LoongArch::PseudoCALL_MEDIUM:
+  case LoongArch::PseudoCALL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+  case LoongArch::PseudoTAIL:
+  case LoongArch::PseudoTAIL_MEDIUM:
+  case LoongArch::PseudoTAIL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
   return false;
 }
@@ -159,9 +199,118 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(
   return true;
 }
 
+bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO) {
+  MachineInstr &MI = *MBBI;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
+                                MI.getOperand(2), MI.getOperand(0).getReg(),
+                                true);
+}
+
+bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
+    bool EraseFromParent) {
+  // Code Sequence:
+  //
+  // Part1: pcalau12i  $scratch, %MO1(sym)
+  // Part0: addi.d     $dest, $zero, %MO0(sym)
+  // Part2: lu32i.d    $dest, %MO2(sym)
+  // Part3: lu52i.d    $dest, $dest, %MO3(sym)
+  // Fin:   LastOpcode $dest, $dest, $scratch
+
+  unsigned MO0, MO1, MO2, MO3;
+  switch (IdentifyingMO) {
+  default:
+    llvm_unreachable("unsupported identifying MO");
+  case LoongArchII::MO_PCREL_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_PCREL_HI;
+    MO2 = LoongArchII::MO_PCREL64_LO;
+    MO3 = LoongArchII::MO_PCREL64_HI;
+    break;
+  case LoongArchII::MO_GOT_PC_HI:
+  case LoongArchII::MO_LD_PC_HI:
+  case LoongArchII::MO_GD_PC_HI:
+    // These cases relocate just like the GOT case, except for Part1.
+    MO0 = LoongArchII::MO_GOT_PC_LO;
+    MO1 = IdentifyingMO;
+    MO2 = LoongArchII::MO_GOT_PC64_LO;
+    MO3 = LoongArchII::MO_GOT_PC64_HI;
+    break;
+  case LoongArchII::MO_IE_PC_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_IE_PC_HI;
+    MO2 = LoongArchII::MO_IE_PC64_LO;
+    MO3 = LoongArchII::MO_IE_PC64_HI;
+    break;
+  }
+
+  MachineFunction *MF = MBB.getParent();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+         "Large code model requires LA64");
+
+  Register TmpPart1 =
+      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+  Register TmpPart0 =
+      DestReg.isVirtual()
+          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+          : DestReg;
+  Register TmpParts02 =
+      DestReg.isVirtual()
+          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+          : DestReg;
+  Register TmpParts023 =
+      DestReg.isVirtual()
+          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+          : DestReg;
+
+  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1);
+  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0)
+                   .addReg(LoongArch::R0);
+  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02)
+                   // "rj" is needed due to InstrInfo pattern requirement.
+                   .addReg(TmpPart0, RegState::Kill);
+  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023)
+                   .addReg(TmpParts02, RegState::Kill);
+  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
+      .addReg(TmpParts023)
+      .addReg(TmpPart1, RegState::Kill);
+
+  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
+    const char *SymName = Symbol.getSymbolName();
+    Part0.addExternalSymbol(SymName, MO0);
+    Part1.addExternalSymbol(SymName, MO1);
+    Part2.addExternalSymbol(SymName, MO2);
+    Part3.addExternalSymbol(SymName, MO3);
+  } else {
+    Part0.addDisp(Symbol, 0, MO0);
+    Part1.addDisp(Symbol, 0, MO1);
+    Part2.addDisp(Symbol, 0, MO2);
+    Part3.addDisp(Symbol, 0, MO3);
+  }
+
+  if (EraseFromParent)
+    MI.eraseFromParent();
+
+  return true;
+}
+
 bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
+  if (Large)
+    // Emit the 5-insn large address load sequence with the `%pc` family of
+    // relocs.
+    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                  LoongArchII::MO_PCREL_LO);
+
   // Code Sequence:
   // pcalau12i $rd, %pc_hi20(sym)
   // addi.w/d $rd, $rd, %pc_lo12(sym)
@@ -174,7 +323,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
+  if (Large)
+    // Emit the 5-insn large address load sequence with the `%got_pc` family
+    // of relocs, loading the result from GOT with `ldx.d` in the end.
+    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+                                  LoongArchII::MO_GOT_PC_HI);
+
   // Code Sequence:
   // pcalau12i $rd, %got_pc_hi20(sym)
   // ld.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -235,7 +390,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
+  if (Large)
+    // Emit the 5-insn large address load sequence with the `%ie_pc` family
+    // of relocs, loading the result with `ldx.d` in the end.
+    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+                                  LoongArchII::MO_IE_PC_LO);
+
   // Code Sequence:
   // pcalau12i $rd, %ie_pc_hi20(sym)
   // ld.w/d $rd, $rd, %ie_pc_lo12(sym)
@@ -248,7 +409,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
+  if (Large)
+    // Emit the 5-insn large address load sequence with the `%got_pc` family
+    // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
+    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                  LoongArchII::MO_LD_PC_HI);
+
   // Code Sequence:
   // pcalau12i $rd, %ld_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -261,7 +428,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
+  if (Large)
+    // Emit the 5-insn large address load sequence with the `%got_pc` family
+    // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
+    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                  LoongArchII::MO_GD_PC_HI);
+
   // Code Sequence:
   // pcalau12i $rd, %gd_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -274,13 +447,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Code Sequence:
-  // pcalau12i $a0, %desc_pc_hi20(sym)
-  // addi.w/d  $a0, $a0, %desc_pc_lo12(sym)
-  // ld.w/d    $ra, $a0, %desc_ld(sym)
-  // jirl      $ra, $ra, %desc_ld(sym)
-  // add.d     $dst, $a0, $tp
+    MachineBasicBlock::iterator &NextMBBI, bool Large) {
   MachineFunction *MF = MBB.getParent();
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
@@ -291,25 +458,62 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
   unsigned LD = STI.is64Bit() ? LoongArch::LD_D : LoongArch::LD_W;
 
   Register DestReg = MI.getOperand(0).getReg();
-  Register ScratchReg =
+  Register Tmp1Reg =
       MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
-  MachineOperand &Symbol = MI.getOperand(1);
+  MachineOperand &Symbol = MI.getOperand(Large ? 2 : 1);
 
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg)
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), Tmp1Reg)
       .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
 
-  BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4)
-      .addReg(ScratchReg)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+  if (Large) {
+    // Code Sequence:
+    //
+    // pcalau12i  $a0, %desc_pc_hi20(sym)
+    // addi.d     $a1, $zero, %desc_pc_lo12(sym)
+    // lu32i.d    $a1, %desc64_pc_lo20(sym)
+    // lu52i.d    $a1, $a1, %desc64_pc_hi12(sym)
+    // add.d      $a0, $a0, $a1
+    // ld.d       $ra, $a0, %desc_ld(sym)
+    // jirl       $ra, $ra, %desc_call(sym)
+    // add.d      $dst, $a0, $tp
+    assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+           "Large code model requires LA64");
+    Register Tmp2Reg =
+        MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+    Register Tmp3Reg =
+        MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+    Register Tmp4Reg =
+        MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), Tmp2Reg)
+        .addReg(LoongArch::R0)
+        .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), Tmp3Reg)
+        .addReg(Tmp2Reg, RegState::Kill)
+        .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_LO);
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), Tmp4Reg)
+        .addReg(Tmp3Reg)
+        .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_HI);
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), LoongArch::R4)
+        .addReg(Tmp1Reg)
+        .addReg(Tmp4Reg);
+  } else {
+    // Code Sequence:
+    // pcalau12i $a0, %desc_pc_hi20(sym)
+    // addi.w/d  $a0, $a0, %desc_pc_lo12(sym)
+    // ld.w/d    $ra, $a0, %desc_ld(sym)
+    // jirl      $ra, $ra, %desc_ld(sym)
+    // add.d     $dst, $a0, $tp
+    BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4)
+        .addReg(Tmp1Reg)
+        .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+  }
 
   BuildMI(MBB, MBBI, DL, TII->get(LD), LoongArch::R1)
       .addReg(LoongArch::R4)
       .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
-
   BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
       .addReg(LoongArch::R1)
       .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
-
   BuildMI(MBB, MBBI, DL, TII->get(ADD), DestReg)
       .addReg(LoongArch::R4)
       .addReg(LoongArch::R2);
@@ -318,6 +522,85 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
   return true;
 }
 
+bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
+  MachineFunction *MF = MBB.getParent();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Func = MI.getOperand(0);
+  MachineInstrBuilder CALL;
+  unsigned Opcode;
+
+  switch (MF->getTarget().getCodeModel()) {
+  default:
+    report_fatal_error("Unsupported code model");
+    break;
+  case CodeModel::Small: {
+    // CALL:
+    // bl func
+    // TAIL:
+    // b func
+    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
+    break;
+  }
+  case CodeModel::Medium: {
+    // CALL:
+    // pcaddu18i $ra, %call36(func)
+    // jirl      $ra, $ra, 0
+    // TAIL:
+    // pcaddu18i $scratch, %call36(func)
+    // jirl      $r0, $scratch, 0
+    Opcode =
+        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+    Register ScratchReg =
+        IsTailCall
+            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+            : LoongArch::R1;
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
+
+    CALL =
+        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
+
+    if (Func.isSymbol())
+      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
+    else
+      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
+    break;
+  }
+  case CodeModel::Large: {
+    // Emit the 5-insn large address load sequence, either directly or
+    // indirectly in case of going through the GOT, then JIRL_TAIL or
+    // JIRL_CALL to $addr.
+    Opcode =
+        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+    Register AddrReg =
+        IsTailCall
+            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+            : LoongArch::R1;
+
+    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
+    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
+    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
+    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
+                           false);
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
+    break;
+  }
+  }
+
+  // Transfer implicit operands.
+  CALL.copyImplicitOps(MI);
+
+  // Transfer MI flags.
+  CALL.setMIFlags(MI.getFlags());
+
+  MI.eraseFromParent();
+  return true;
+}
+
 class LoongArchExpandPseudo : public MachineFunctionPass {
 public:
   const LoongArchInstrInfo *TII;
@@ -339,38 +622,6 @@ class LoongArchExpandPseudo : public MachineFunctionPass {
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineBasicBlock::iterator &NextMBBI);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO,
-                              const MachineOperand &Symbol, Register DestReg,
-                              bool EraseFromParent);
-  bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   MachineBasicBlock::iterator &NextMBBI);
-  bool expandLoadAddressGotLarge(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 MachineBasicBlock::iterator &NextMBBI);
-  bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   MachineBasicBlock::iterator &NextMBBI);
-  bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   MachineBasicBlock::iterator &NextMBBI);
-  bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   MachineBasicBlock::iterator &NextMBBI);
-  bool expandLoadAddressTLSDescPcLarge(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator MBBI,
-                                       MachineBasicBlock::iterator &NextMBBI);
-  bool expandFunctionCALL(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          MachineBasicBlock::iterator &NextMBBI,
-                          bool IsTailCall);
 };
 
 char LoongArchExpandPseudo::ID = 0;
@@ -405,26 +656,6 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoCopyCFR:
     return expandCopyCFR(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_PCREL_LARGE:
-    return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_GOT_LARGE:
-    return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_IE_LARGE:
-    return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_LD_LARGE:
-    return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_GD_LARGE:
-    return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
-    return expandLoadAddressTLSDescPcLarge(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoCALL:
-  case LoongArch::PseudoCALL_MEDIUM:
-  case LoongArch::PseudoCALL_LARGE:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
-  case LoongArch::PseudoTAIL:
-  case LoongArch::PseudoTAIL_MEDIUM:
-  case LoongArch::PseudoTAIL_LARGE:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
 
   return false;
@@ -483,264 +714,6 @@ bool LoongArchExpandPseudo::expandCopyCFR(
   return true;
 }
 
-bool LoongArchExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO) {
-  MachineInstr &MI = *MBBI;
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
-                                MI.getOperand(2), MI.getOperand(0).getReg(),
-                                true);
-}
-
-bool LoongArchExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
-    bool EraseFromParent) {
-  // Code Sequence:
-  //
-  // Part1: pcalau12i  $dst, %MO1(sym)
-  // Part0: addi.d     $t8, $zero, %MO0(sym)
-  // Part2: lu32i.d    $t8, %MO2(sym)
-  // Part3: lu52i.d    $t8, $t8, %MO3(sym)
-  // Fin:   LastOpcode $dst, $t8, $dst
-
-  unsigned MO0, MO1, MO2, MO3;
-  switch (IdentifyingMO) {
-  default:
-    llvm_unreachable("unsupported identifying MO");
-  case LoongArchII::MO_PCREL_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_PCREL_HI;
-    MO2 = LoongArchII::MO_PCREL64_LO;
-    MO3 = LoongArchII::MO_PCREL64_HI;
-    break;
-  case LoongArchII::MO_GOT_PC_HI:
-  case LoongArchII::MO_LD_PC_HI:
-  case LoongArchII::MO_GD_PC_HI:
-    // These cases relocate just like the GOT case, except for Part1.
-    MO0 = LoongArchII::MO_GOT_PC_LO;
-    MO1 = IdentifyingMO;
-    MO2 = LoongArchII::MO_GOT_PC64_LO;
-    MO3 = LoongArchII::MO_GOT_PC64_HI;
-    break;
-  case LoongArchII::MO_IE_PC_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_IE_PC_HI;
-    MO2 = LoongArchII::MO_IE_PC64_LO;
-    MO3 = LoongArchII::MO_IE_PC64_HI;
-    break;
-  }
-
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-  Register ScratchReg = LoongArch::R20; // $t8
-
-  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
-         "Large code model requires LA64");
-
-  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
-  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
-                   .addReg(LoongArch::R0);
-  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
-                   // "rj" is needed due to InstrInfo pattern requirement.
-                   .addReg(ScratchReg);
-  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
-                   .addReg(ScratchReg);
-  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
-      .addReg(ScratchReg)
-      .addReg(DestReg);
-
-  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
-    const char *SymName = Symbol.getSymbolName();
-    Part0.addExternalSymbol(SymName, MO0);
-    Part1.addExternalSymbol(SymName, MO1);
-    Part2.addExternalSymbol(SymName, MO2);
-    Part3.addExternalSymbol(SymName, MO3);
-  } else {
-    Part0.addDisp(Symbol, 0, MO0);
-    Part1.addDisp(Symbol, 0, MO1);
-    Part2.addDisp(Symbol, 0, MO2);
-    Part3.addDisp(Symbol, 0, MO3);
-  }
-
-  if (EraseFromParent)
-    MI.eraseFromParent();
-
-  return true;
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Emit the 5-insn large address load sequence with the `%pc` family of
-  // relocs.
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                LoongArchII::MO_PCREL_LO);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressGotLarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Emit the 5-insn large address load sequence with the `%got_pc` family
-  // of relocs, loading the result from GOT with `ldx.d` in the end.
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                LoongArchII::MO_GOT_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Emit the 5-insn large address load sequence with the `%ie_pc` family
-  // of relocs, loading the result with `ldx.d` in the end.
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                LoongArchII::MO_IE_PC_LO);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Emit the 5-insn large address load sequence with the `%got_pc` family
-  // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                LoongArchII::MO_LD_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Emit the 5-insn large address load sequence with the `%got_pc` family
-  // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                LoongArchII::MO_GD_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSDescPcLarge(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
-  // Code Sequence:
-  //
-  // pcalau12i  $a0, %desc_pc_hi20(sym)
-  // addi.d     $t8, $zero, %desc_pc_lo12(sym)
-  // lu32i.d    $t8, %desc64_pc_lo20(sym)
-  // lu52i.d    $t8, $t8, %desc64_pc_hi12(sym)
-  // add.d      $a0, $a0, $t8
-  // ld.d       $ra, $a0, %desc_ld(sym)
-  // jirl       $ra, $ra, %desc_call(sym)
-  // add.d      $dst, $a0, $tp
-
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-  Register DestReg = MI.getOperand(0).getReg();
-  MachineOperand &Symbol = MI.getOperand(2);
-  Register ScratchReg = LoongArch::R20; // $t8
-
-  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
-         "Large code model requires LA64");
-
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), LoongArch::R4)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
-      .addReg(LoongArch::R0)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
-      .addReg(ScratchReg)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_LO);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
-      .addReg(ScratchReg)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_HI);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), LoongArch::R4)
-      .addReg(ScratchReg)
-      .addReg(LoongArch::R4);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LD_D), LoongArch::R1)
-      .addReg(LoongArch::R4)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
-      .addReg(LoongArch::R1)
-      .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), DestReg)
-      .addReg(LoongArch::R4)
-      .addReg(LoongArch::R2);
-
-  MI.eraseFromParent();
-
-  return true;
-}
-
-bool LoongArchExpandPseudo::expandFunctionCALL(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
-  MachineFunction *MF = MBB.getParent();
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Func = MI.getOperand(0);
-  MachineInstrBuilder CALL;
-  unsigned Opcode;
-
-  switch (MF->getTarget().getCodeModel()) {
-  default:
-    report_fatal_error("Unsupported code model");
-    break;
-  case CodeModel::Small: {
-    // CALL:
-    // bl func
-    // TAIL:
-    // b func
-    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
-    break;
-  }
-  case CodeModel::Medium: {
-    // CALL:
-    // pcaddu18i  $ra, %call36(func)
-    // jirl       $ra, $ra, 0
-    // TAIL:
-    // pcaddu18i  $t8, %call36(func)
-    // jr         $t8
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1;
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
-
-    CALL =
-        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
-
-    if (Func.isSymbol())
-      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
-    else
-      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
-    break;
-  }
-  case CodeModel::Large: {
-    // Emit the 5-insn large address load sequence, either directly or
-    // indirectly in case of going through the GOT, then JIRL_TAIL or
-    // JIRL_CALL to $addr.
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1;
-
-    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
-    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
-    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
-    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
-                           false);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
-    break;
-  }
-  }
-
-  // Transfer implicit operands.
-  CALL.copyImplicitOps(MI);
-
-  // Transfer MI flags.
-  CALL.setMIFlags(MI.getFlags());
-
-  MI.eraseFromParent();
-  return true;
-}
-
 } // end namespace
 
 INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 6072e5e244263..bdbb7ab0f5139 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsLoongArch.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Debug.h"
@@ -252,9 +253,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -298,9 +299,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -428,9 +429,926 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
   return SDValue();
 }
 
+/// Determine whether a range fits a regular pattern of values.
+/// This function accounts for the possibility of jumping over the End iterator.
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  auto &I = Begin;
+
+  while (I != End) {
+    if (*I != -1 && *I != ExpectedIndex)
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Incrementing past End is undefined behaviour so we must increment one
+    // step at a time and check for End at each step.
+    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
+      ; // Empty loop body.
+  }
+  return true;
+}
+
+/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
+///
+/// VREPLVEI performs vector broadcast based on an element specified by an
+/// integer immediate, with its mask being similar to:
+///   <x, x, x, ...>
+/// where x is any valid index.
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  int SplatIndex = -1;
+  for (const auto &M : Mask) {
+    if (M != -1) {
+      SplatIndex = M;
+      break;
+    }
+  }
+
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
+    APInt Imm(64, SplatIndex);
+    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                       DAG.getConstant(Imm, DL, MVT::i64));
+  }
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
+///
+/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
+/// elements according to a <4 x i2> constant (encoded as an integer immediate).
+///
+/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
+///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+/// When undef's appear they are treated as if they were whatever value is
+/// necessary in order to fit the above forms.
+///
+/// For example:
+///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+///                                 i32 7, i32 6, i32 5, i32 4>
+/// is lowered to:
+///   (VSHUF4I_H $v0, $v1, 27)
+/// where the 27 comes from:
+///   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  // When the size is less than 4, lower cost instructions may be used.
+  if (Mask.size() < 4)
+    return SDValue();
+
+  int SubMask[4] = {-1, -1, -1, -1};
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Mask.size(); j += 4) {
+      int Idx = Mask[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SubMask[i] == -1)
+        SubMask[i] = Idx;
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      else if (Idx != -1 && Idx != SubMask[i])
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(64, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SubMask[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
+                     DAG.getConstant(Imm, DL, MVT::i64));
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
+///
+/// VPACKEV interleaves the even elements from each vector.
+///
+/// It is possible to lower into VPACKEV when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 2, 2, 4, 4, ...>
+///   <0, n, 2, n+2, 4, n+4, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKOD (if possible).
+///
+/// VPACKOD interleaves the odd elements from each vector.
+///
+/// It is possible to lower into VPACKOD when the mask consists of two of the
+/// following forms interleaved:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 1, 3, 3, 5, 5, ...>
+///   <1, n+1, 3, n+3, 5, n+5, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVH (if possible).
+///
+/// VILVH interleaves consecutive elements from the left (highest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVH when the mask consists of two of the
+/// following forms interleaved:
+///   <x, x+1, x+2, ...>
+///   <n+x, n+x+1, n+x+2, ...>
+/// where n is the number of elements in the vector and x is half n.
+/// For example:
+///   <x, x, x+1, x+1, x+2, x+2, ...>
+///   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + HalfSize,
+                                   1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVL (if possible).
+///
+/// VILVL interleaves consecutive elements from the right (lowest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVL when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 1, 2, ...>
+///   <n, n+1, n+2, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 1, 1, 2, 2, ...>
+///   <0, n, 1, n+1, 2, n+2, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKEV (if possible).
+///
+/// VPICKEV copies the even elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKEV when the mask consists of two of the
+/// following forms concatenated:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 2, 4, ..., 0, 2, 4, ...>
+///   <0, 2, 4, ..., n, n+2, n+4, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size(), 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKOD (if possible).
+///
+/// VPICKOD copies the odd elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKOD when the mask consists of two of the
+/// following forms concatenated:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 3, 5, ..., 1, 3, 5, ...>
+///   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF.
+///
+/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and
+/// adding it as an operand to the resulting VSHUF.
+static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  SmallVector<SDValue, 16> Ops;
+  for (auto M : Mask)
+    Ops.push_back(DAG.getConstant(M, DL, MVT::i64));
+
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
+
+  // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion.
+  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // VSHF concatenates the vectors in a bitwise fashion:
+  // <0b00, 0b01> + <0b10, 0b11> ->
+  // 0b0100       + 0b1110       -> 0b01001110
+  //                                <0b10, 0b11, 0b00, 0b01>
+  // We must therefore swap the operands to get the correct result.
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Dispatching routine to lower various 128-bit LoongArch vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
+          VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
+          VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
+         "Vector type is unsupported for lsx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+
+  SDValue Result;
+  // TODO: Add more comparison patterns.
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+
+    // TODO: This comment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible).
+///
+/// It is a XVREPLVEI when the mask is:
+///   <x, x, x, ..., x+n, x+n, x+n, ...>
+/// where the number of x is equal to n and n is half the length of vector.
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
+                                             ArrayRef<int> Mask, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             SelectionDAG &DAG) {
+  int SplatIndex = -1;
+  for (const auto &M : Mask) {
+    if (M != -1) {
+      SplatIndex = M;
+      break;
+    }
+  }
+
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize,
+                              0)) {
+    APInt Imm(64, SplatIndex);
+    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                       DAG.getConstant(Imm, DL, MVT::i64));
+  }
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  // When the size is less than or equal to 4, lower cost instructions may be
+  // used.
+  if (Mask.size() <= 4)
+    return SDValue();
+  return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVH (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  unsigned LeftSize = HalfSize / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize,
+                              1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVL (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, Mask.size(), 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, Mask.size(),
+                                   1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 0, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 0, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize, 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 1, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize + 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize + 1,
+                                   2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 1, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize + 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize + 1,
+                                   2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + HalfSize;
+  const auto &End = Mask.end();
+
+  // VECTOR_SHUFFLE concatenates the vectors:
+  //  <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15>
+  //  shuffling ->
+  //  <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15>
+  //
+  // XVSHUF concatenates the vectors:
+  //  <a0, a1, a2, a3, b0, b1, b2, b3> + <a4, a5, a6, a7, b4, b5, b6, b7>
+  //  shuffling ->
+  //  <a0, a1, a2, a3, a4, a5, a6, a7> + <b0, b1, b2, b3, b4, b5, b6, b7>
+  SmallVector<SDValue, 8> MaskAlloc;
+  for (auto it = Begin; it < Mid; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= 0 && *it < HalfSize) ||
+             (*it >= MaskSize && *it <= MaskSize + HalfSize)) {
+      int M = *it < HalfSize ? *it : *it - HalfSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == HalfSize && "xvshuf convert failed!");
+
+  for (auto it = Mid; it < End; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= HalfSize && *it < MaskSize) ||
+             (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) {
+      int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == MaskSize && "xvshuf convert failed!");
+
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc);
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Shuffle vectors by lane to generate more optimized instructions.
+/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles.
+///
+/// Therefore, except for the following four cases, other cases are regarded
+/// as cross-lane shuffles, where optimization is relatively limited.
+///
+/// - Shuffle high, low lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6>
+/// - Shuffle low, high lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5>
+/// - Shuffle low, low lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6>
+/// - Shuffle high, high lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5>
+///
+/// The first case is the closest to LoongArch instructions and the other
+/// cases need to be converted to it for processing.
+///
+/// This function may modify V1, V2 and Mask
+static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
+                                            MutableArrayRef<int> Mask, MVT VT,
+                                            SDValue &V1, SDValue &V2,
+                                            SelectionDAG &DAG) {
+
+  enum HalfMaskType { HighLaneTy, LowLaneTy, None };
+
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+
+  HalfMaskType preMask = None, postMask = None;
+
+  if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    preMask = HighLaneTy;
+  else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    preMask = LowLaneTy;
+
+  if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    postMask = HighLaneTy;
+  else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    postMask = LowLaneTy;
+
+  // The pre-half of mask is high lane type, and the post-half of mask
+  // is low lane type, which is closest to the LoongArch instructions.
+  //
+  // Note: In the LoongArch architecture, the high lane of mask corresponds
+  // to the lower 128-bit of vector register, and the low lane of mask
+  // corresponds the higher 128-bit of vector register.
+  if (preMask == HighLaneTy && postMask == LowLaneTy) {
+    return;
+  }
+  if (preMask == LowLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01001110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01001110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else if (preMask == LowLaneTy && postMask == LowLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b11101110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b11101110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+  } else if (preMask == HighLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01000100, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01000100, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else { // cross-lane
+    return;
+  }
+}
+
+/// Dispatching routine to lower various 256-bit LoongArch vector shuffles.
+///
+/// This routine breaks down the specific type of 256-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 ||
+          VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 ||
+          VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) &&
+         "Vector type is unsupported for lasx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+  assert(Mask.size() >= 4 && "Mask size is less than 4.");
+
+  // canonicalize non cross-lane shuffle vector
+  SmallVector<int> NewMask(Mask);
+  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG);
+
+  SDValue Result;
+  // TODO: Add more comparison patterns.
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
+      return Result;
+
+    // TODO: This comment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  // TODO: custom shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc DL(Op);
+
+  bool V1IsUndef = V1.isUndef();
+  bool V2IsUndef = V2.isUndef();
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef &&
+      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+    SmallVector<int, 8> NewMask(OrigMask);
+    for (int &M : NewMask)
+      if (M >= NumElements)
+        M = -1;
+    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+  }
+
+  // Check for illegal shuffle mask element index values.
+  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+  (void)MaskUpperLimit;
+  assert(llvm::all_of(OrigMask,
+                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+         "Out of bounds shuffle index");
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.is128BitVector())
+    return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+
+  if (VT.is256BitVector())
+    return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+
   return SDValue();
 }
 
@@ -2675,6 +3593,10 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
       !checkValueWidth(TruncInputValue2, ExtType2))
     return SDValue();
 
+  if (TruncInputValue1->getValueType(0) != TruncInputValue2->getValueType(0) ||
+      AndNode->getValueType(0) != TruncInputValue1->getValueType(0))
+    return SDValue();
+
   if ((ExtType2 != ISD::ZEXTLOAD) &&
       ((ExtType2 != ISD::SEXTLOAD) && (ExtType1 != ISD::SEXTLOAD)))
     return SDValue();
@@ -3702,6 +4624,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(MOVFCSR2GR)
     NODE_NAME_CASE(CACOP_D)
     NODE_NAME_CASE(CACOP_W)
+    NODE_NAME_CASE(VSHUF)
+    NODE_NAME_CASE(VPICKEV)
+    NODE_NAME_CASE(VPICKOD)
+    NODE_NAME_CASE(VPACKEV)
+    NODE_NAME_CASE(VPACKOD)
+    NODE_NAME_CASE(VILVL)
+    NODE_NAME_CASE(VILVH)
+    NODE_NAME_CASE(VSHUF4I)
+    NODE_NAME_CASE(VREPLVEI)
+    NODE_NAME_CASE(XVPERMI)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
@@ -5231,3 +6163,23 @@ bool LoongArchTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
     return false;
   return true;
 }
+
+// memcpy, and other memory intrinsics, typically tries to use wider load/store
+// if the source/dest is aligned and the copy size is large enough. We therefore
+// want to align such objects passed to memory intrinsics.
+bool LoongArchTargetLowering::shouldAlignPointerArgs(CallInst *CI,
+                                                     unsigned &MinSize,
+                                                     Align &PrefAlign) const {
+  if (!isa<MemIntrinsic>(CI))
+    return false;
+
+  if (Subtarget.is64Bit()) {
+    MinSize = 8;
+    PrefAlign = Align(8);
+  } else {
+    MinSize = 4;
+    PrefAlign = Align(4);
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f4c57f80fdbe4..d834a5d8587fd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -120,6 +120,16 @@ enum NodeType : unsigned {
 
   // Vector Shuffle
   VREPLVE,
+  VSHUF,
+  VPICKEV,
+  VPICKOD,
+  VPACKEV,
+  VPACKOD,
+  VILVL,
+  VILVH,
+  VSHUF4I,
+  VREPLVEI,
+  XVPERMI,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
@@ -247,6 +257,9 @@ class LoongArchTargetLowering : public TargetLowering {
   bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
   bool shouldExtendTypeInLibCall(EVT Type) const override;
 
+  bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+                              Align &PrefAlign) const override;
+
 private:
   /// Target-specific function used to lower LoongArch calling conventions.
   typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index a85b054a85d72..90d94e96b0efd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -347,6 +347,83 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
   }
 }
 
+bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                              const MachineBasicBlock *MBB,
+                                              const MachineFunction &MF) const {
+  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+    return true;
+
+  auto MII = MI.getIterator();
+  auto MIE = MBB->end();
+
+  // According to psABI v2.30:
+  //
+  // https://github.com/loongson/la-abi-specs/releases/tag/v2.30
+  //
+  // The following instruction patterns are prohibited from being reordered:
+  //
+  // * pcaddu18 $ra, %call36(s)
+  //   jirl     $ra, $ra, 0
+  //
+  // * pcalau12i $a0, %pc_hi20(s)
+  //   addi.d $a1, $zero, %pc_lo12(s)
+  //   lu32i.d $a1, %pc64_lo20(s)
+  //   lu52i.d $a1, $a1, %pc64_hi12(s)
+  //
+  // * pcalau12i $a0, %got_pc_hi20(s) | %ld_pc_hi20(s) | %gd_pc_hi20(s)
+  //   addi.d $a1, $zero, %got_pc_lo12(s)
+  //   lu32i.d $a1, %got64_pc_lo20(s)
+  //   lu52i.d $a1, $a1, %got64_pc_hi12(s)
+  //
+  // * pcalau12i $a0, %ie_pc_hi20(s)
+  //   addi.d $a1, $zero, %ie_pc_lo12(s)
+  //   lu32i.d $a1, %ie64_pc_lo20(s)
+  //   lu52i.d $a1, $a1, %ie64_pc_hi12(s)
+  //
+  // For simplicity, only pcalau12i and lu52i.d are marked as scheduling
+  // boundaries, and the instructions between them are guaranteed to be
+  // ordered according to data dependencies.
+  switch (MI.getOpcode()) {
+  case LoongArch::PCADDU18I:
+    if (MI.getOperand(1).getTargetFlags() == LoongArchII::MO_CALL36)
+      return true;
+    break;
+  case LoongArch::PCALAU12I: {
+    auto AddI = std::next(MII);
+    if (AddI == MIE || AddI->getOpcode() != LoongArch::ADDI_D)
+      break;
+    auto Lu32I = std::next(AddI);
+    if (Lu32I == MIE || Lu32I->getOpcode() != LoongArch::LU32I_D)
+      break;
+    auto MO0 = MI.getOperand(1).getTargetFlags();
+    auto MO1 = AddI->getOperand(2).getTargetFlags();
+    auto MO2 = Lu32I->getOperand(2).getTargetFlags();
+    if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO &&
+        MO2 == LoongArchII::MO_PCREL64_LO)
+      return true;
+    if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI ||
+         MO0 == LoongArchII::MO_GD_PC_HI) &&
+        MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO)
+      return true;
+    if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO &&
+        MO2 == LoongArchII::MO_IE_PC64_LO)
+      return true;
+    break;
+  }
+  case LoongArch::LU52I_D: {
+    auto MO = MI.getOperand(2).getTargetFlags();
+    if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI ||
+        MO == LoongArchII::MO_IE_PC64_HI)
+      return true;
+    break;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
 unsigned LoongArchInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                           int *BytesRemoved) const {
   if (BytesRemoved)
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index eb19051e380ca..d66b2cb8efb33 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -64,6 +64,10 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo {
   bool isBranchOffsetInRange(unsigned BranchOpc,
                              int64_t BrOffset) const override;
 
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
   unsigned removeBranch(MachineBasicBlock &MBB,
                         int *BytesRemoved = nullptr) const override;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 67c3dfd3b2599..0ab429817724c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -397,6 +397,10 @@ def simm20_pcaddu18i : SImm20Operand {
   let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
 }
 
+def simm20_pcaddi : SImm20Operand {
+  let ParserMatchClass = SImmAsmOperand<20, "pcaddi">;
+}
+
 def simm21_lsl2 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
   let EncoderMethod = "getImmOpValueAsr<2>";
@@ -456,6 +460,19 @@ def bare_symbol : Operand<GRLenVT> {
   let ParserMatchClass = BareSymbol;
 }
 
+def TPRelAddSymbol : AsmOperandClass {
+  let Name = "TPRelAddSymbol";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "InvalidTPRelAddSymbol";
+  let ParserMethod = "parseOperandWithModifier";
+}
+
+// A bare symbol with the %le_add_r variant.
+def tprel_add_symbol : Operand<GRLenVT> {
+  let ParserMatchClass = TPRelAddSymbol;
+}
+
+
 // Standalone (codegen-only) immleaf patterns.
 
 // A 12-bit signed immediate plus one where the imm range will be [-2047, 2048].
@@ -741,7 +758,7 @@ def SLT  : ALU_3R<0x00120000>;
 def SLTU : ALU_3R<0x00128000>;
 def SLTI  : ALU_2RI12<0x02000000, simm12>;
 def SLTUI : ALU_2RI12<0x02400000, simm12>;
-def PCADDI    : ALU_1RI20<0x18000000, simm20>;
+def PCADDI    : ALU_1RI20<0x18000000, simm20_pcaddi>;
 def PCADDU12I : ALU_1RI20<0x1c000000, simm20>;
 def PCALAU12I : ALU_1RI20<0x1a000000, simm20_pcalau12i>;
 def AND  : ALU_3R<0x00148000>;
@@ -1106,7 +1123,7 @@ def : PatGprGpr<urem, MOD_WU>;
 def : PatGprGpr<mul, MUL_W>;
 def : PatGprGpr<mulhs, MULH_W>;
 def : PatGprGpr<mulhu, MULH_WU>;
-def : PatGprGpr<rotr, ROTR_W>;
+def : PatGprGpr<shiftop<rotr>, ROTR_W>;
 def : PatGprImm<rotr, ROTRI_W, uimm5>;
 
 foreach Idx = 1...3 in {
@@ -1129,9 +1146,8 @@ def : PatGprGpr<srem, MOD_D>;
 def : PatGprGpr_32<srem, MOD_W>;
 def : PatGprGpr<urem, MOD_DU>;
 def : PatGprGpr<loongarch_mod_wu, MOD_WU>;
-def : PatGprGpr<rotr, ROTR_D>;
-def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
-def : PatGprGpr_32<rotr, ROTR_W>;
+def : PatGprGpr<shiftop<rotr>, ROTR_D>;
+def : PatGprGpr<shiftopw<loongarch_rotr_w>, ROTR_W>;
 def : PatGprImm<rotr, ROTRI_D, uimm6>;
 def : PatGprImm_32<rotr, ROTRI_W, uimm5>;
 def : PatGprImm<loongarch_rotr_w, ROTRI_W, uimm5>;
@@ -1465,7 +1481,7 @@ def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
 
 // Function call with 'Medium' code model.
-let isCall = 1, Defs = [R1, R20], Size = 8 in
+let isCall = 1, Defs = [R1] in
 def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
 
 let Predicates = [IsLA64] in {
@@ -1476,7 +1492,7 @@ def : Pat<(loongarch_call_medium texternalsym:$func),
 } // Predicates = [IsLA64]
 
 // Function call with 'Large' code model.
-let isCall = 1, Defs = [R1, R20], Size = 24 in
+let isCall = 1, Defs = [R1] in
 def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
 
 let Predicates = [IsLA64] in {
@@ -1514,8 +1530,7 @@ def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
           (PseudoTAIL texternalsym:$dst)>;
 
 // Tail call with 'Medium' code model.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
-    Uses = [R3], Defs = [R20], Size = 8 in
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
 def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
 
 let Predicates = [IsLA64] in {
@@ -1526,8 +1541,7 @@ def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
 } // Predicates = [IsLA64]
 
 // Tail call with 'Large' code model.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
-    Uses = [R3], Defs = [R19, R20], Size = 24 in
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
 def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
 
 let Predicates = [IsLA64] in {
@@ -1559,17 +1573,32 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
 
 /// call36/taill36 macro instructions
 let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1,
-    Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
+    Defs = [R1], hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
 def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [],
                           "call36", "$dst">,
                    Requires<[IsLA64]>;
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3],
-    isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0,
+    isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0,
     mayStore = 0, mayLoad = 0 in
 def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [],
                           "tail36", "$tmp, $dst">,
                    Requires<[IsLA64]>;
 
+// This is a special case of the ADD_W/D instruction used to facilitate the use
+// of a fourth operand to emit a relocation on a symbol relating to this
+// instruction. The relocation does not affect any bits of the instruction itself
+// but is used as a hint to the linker.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0 in {
+def PseudoAddTPRel_W : Pseudo<(outs GPR:$rd),
+                              (ins GPR:$rj, GPR:$rk, tprel_add_symbol:$sym), [],
+                              "add.w", "$rd, $rj, $rk, $sym">,
+                              Requires<[IsLA32]>;
+def PseudoAddTPRel_D : Pseudo<(outs GPR:$rd),
+                              (ins GPR:$rj, GPR:$rk, tprel_add_symbol:$sym), [],
+                              "add.d", "$rd, $rj, $rk, $sym">,
+                              Requires<[IsLA64]>;
+}
+
 /// Load address (la*) macro instructions.
 
 // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser.
@@ -1586,7 +1615,6 @@ def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                              "la.tls.ld", "$dst, $src">;
 def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                              "la.tls.gd", "$dst, $src">;
-let Defs = [R20], Size = 20 in {
 def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
                                   (ins GPR:$tmp, bare_symbol:$src), [],
                                   "la.pcrel", "$dst, $tmp, $src">,
@@ -1601,7 +1629,6 @@ def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.gd", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-} // Defs = [R20], Size = 20
 }
 let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
     isAsmParserOnly = 1 in {
@@ -1609,7 +1636,6 @@ def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                           "la.got", "$dst, $src">;
 def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                              "la.tls.ie", "$dst, $src">;
-let Defs = [R20], Size = 20 in {
 def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
                                 (ins GPR:$tmp, bare_symbol:$src), [],
                                 "la.got", "$dst, $tmp, $src">,
@@ -1618,7 +1644,6 @@ def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.ie", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-} // Defs = [R20], Size = 20
 }
 
 // Used for expand PseudoLA_TLS_DESC_* instructions.
@@ -1643,7 +1668,7 @@ def PseudoLA_TLS_DESC_PC : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
 }
 
 let isCall = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0,
-    isCodeGenOnly = 0, isAsmParserOnly = 1, Defs = [R1, R4, R20], Size = 32 in
+    isCodeGenOnly = 0, isAsmParserOnly = 1, Defs = [R1, R4] in
 def PseudoLA_TLS_DESC_PC_LARGE : Pseudo<(outs GPR:$dst),
                                         (ins GPR:$tmp, bare_symbol:$src), [],
                                         "la.tls.desc", "$dst, $tmp, $src">,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 3de1fe2b722e5..6f1969bf8cae0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>;
+
 def lasxsplati8
   : PatFrag<(ops node:$e0),
             (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0,
@@ -1575,6 +1577,134 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk),
           (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>;
 
+// XVSHUF_{B/H/W/D}
+def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk),
+          (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>;
+def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk),
+          (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk),
+          (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk),
+          (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk),
+          (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk),
+          (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKEV_{B/H/W/D}
+def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk),
+          (XVPICKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk),
+          (XVPICKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk),
+          (XVPICKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk),
+          (XVPICKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk),
+          (XVPICKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk),
+          (XVPICKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKOD_{B/H/W/D}
+def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk),
+          (XVPICKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk),
+          (XVPICKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk),
+          (XVPICKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk),
+          (XVPICKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk),
+          (XVPICKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk),
+          (XVPICKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKEV_{B/H/W/D}
+def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk),
+          (XVPACKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk),
+          (XVPACKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk),
+          (XVPACKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk),
+          (XVPACKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk),
+          (XVPACKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk),
+          (XVPACKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKOD_{B/H/W/D}
+def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk),
+          (XVPACKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk),
+          (XVPACKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk),
+          (XVPACKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk),
+          (XVPACKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk),
+          (XVPACKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk),
+          (XVPACKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVL_{B/H/W/D}
+def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk),
+          (XVILVL_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk),
+          (XVILVL_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk),
+          (XVILVL_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk),
+          (XVILVL_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk),
+          (XVILVL_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk),
+          (XVILVL_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVH_{B/H/W/D}
+def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk),
+          (XVILVH_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk),
+          (XVILVH_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk),
+          (XVILVH_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk),
+          (XVILVH_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk),
+          (XVILVH_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk),
+          (XVILVH_D v4f64:$xj, v4f64:$xk)>;
+
+// XVSHUF4I_{B/H/W}
+def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8),
+          (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8),
+        (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8),
+        (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8),
+        (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>;
+
+// XVREPL128VEI_{B/H/W/D}
+def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4),
+          (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3),
+        (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2),
+        (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1),
+        (XVREPL128VEI_D v4i64:$xj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2),
+        (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1),
+        (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>;
+
+// XVPERMI_D
+def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
+          (XVPERMI_D v4i64:$xj, immZExt8: $ui8)>;
+def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
+          (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 39ee861cd0565..0580683c3ce30 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -15,6 +15,15 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
                                          SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
 def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
 
+def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                         SDTCisInt<1>, SDTCisVec<1>,
+                                         SDTCisSameAs<0, 2>,
+                                         SDTCisSameAs<2, 3>]>;
+def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_loongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
+
 // Target nodes.
 def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>;
 def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO",
@@ -31,6 +40,23 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT",
 def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT",
                                       SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>;
 
+def loongarch_vshuf: SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>;
+def loongarch_vpickev: SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>;
+def loongarch_vpickod: SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>;
+def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>;
+def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>;
+def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
+def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;
+
+def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>;
+def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>;
+
+def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
+
 class VecCond<SDPatternOperator OpNode, ValueType TyNode,
               RegisterClass RC = LSX128>
     : Pseudo<(outs GPR:$rd), (ins RC:$vj),
@@ -1682,6 +1708,128 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk),
           (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>;
 
+// VSHUF_{B/H/W/D}
+def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk),
+          (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>;
+def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk),
+          (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk),
+          (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk),
+          (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4f32:$vj, v4f32:$vk),
+          (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk),
+          (VSHUF_D v2i64:$vd, v2f64:$vj, v2f64:$vk)>;
+
+// VPICKEV_{B/H/W/D}
+def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk),
+          (VPICKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk),
+          (VPICKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk),
+          (VPICKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk),
+          (VPICKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk),
+          (VPICKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk),
+          (VPICKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPICKOD_{B/H/W/D}
+def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk),
+          (VPICKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk),
+          (VPICKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk),
+          (VPICKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk),
+          (VPICKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk),
+          (VPICKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk),
+          (VPICKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKEV_{B/H/W/D}
+def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk),
+          (VPACKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk),
+          (VPACKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk),
+          (VPACKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk),
+          (VPACKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk),
+          (VPACKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk),
+          (VPACKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKOD_{B/H/W/D}
+def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk),
+          (VPACKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk),
+          (VPACKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk),
+          (VPACKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk),
+          (VPACKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk),
+          (VPACKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk),
+          (VPACKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVL_{B/H/W/D}
+def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk),
+          (VILVL_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk),
+          (VILVL_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk),
+          (VILVL_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk),
+          (VILVL_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk),
+          (VILVL_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk),
+          (VILVL_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVH_{B/H/W/D}
+def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk),
+          (VILVH_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk),
+          (VILVH_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk),
+          (VILVH_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk),
+          (VILVH_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
+          (VILVH_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
+          (VILVH_D v2f64:$vj, v2f64:$vk)>;
+
+// VSHUF4I_{B/H/W}
+def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
+          (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
+        (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
+        (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
+        (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
+
+// VREPLVEI_{B/H/W/D}
+def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
+          (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
+        (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
+        (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
+        (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
+        (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
+        (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
+
 // VREPLVEI_{W/D}
 def : Pat<(lsxsplatf32 FPR32:$fj),
           (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index 710650acba304..2c7b0bfeaaad5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
   case TargetTransformInfo::RGK_FixedWidthVector:
-    if (!ST->hasExpAutoVec())
-      return DefSize;
     if (ST->hasExtLASX())
       return TypeSize::getFixed(256);
     if (ST->hasExtLSX())
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
index a8a4f1ed327d3..eee85822398b1 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
@@ -16,7 +16,7 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 
 using namespace llvm;
 
@@ -37,7 +37,7 @@ MCELFStreamer &LoongArchTargetELFStreamer::getStreamer() {
 
 void LoongArchTargetELFStreamer::finish() {
   LoongArchTargetStreamer::finish();
-  MCAssembler &MCA = getStreamer().getAssembler();
+  ELFObjectWriter &W = getStreamer().getWriter();
   LoongArchABI::ABI ABI = getTargetABI();
 
   // Figure out the e_flags.
@@ -48,7 +48,7 @@ void LoongArchTargetELFStreamer::finish() {
   // based relocs from day one.
   //
   // Refer to LoongArch ELF psABI v2.01 for details.
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  unsigned EFlags = W.getELFHeaderEFlags();
   EFlags |= ELF::EF_LOONGARCH_OBJABI_V1;
   switch (ABI) {
   case LoongArchABI::ABI_ILP32S:
@@ -66,7 +66,7 @@ void LoongArchTargetELFStreamer::finish() {
   case LoongArchABI::ABI_Unknown:
     llvm_unreachable("Improperly initialized target ABI");
   }
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 }
 
 namespace {
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index fb0587bf3bed7..370f5b0189b51 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,8 @@ enum Fixups {
   fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
   // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
   fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+  // 20-bit fixup corresponding to %pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_pcrel20_s2,
   // 36-bit fixup corresponding to %call36(foo) for a pair instructions:
   // pcaddu18i+jirl.
   fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
@@ -136,6 +138,18 @@ enum Fixups {
   fixup_loongarch_tls_desc_ld,
   // 12-bit fixup corresponding to %desc_call(foo) for instruction jirl.
   fixup_loongarch_tls_desc_call,
+  // 20-bit fixup corresponding to %le_hi20_r(foo) for instruction lu12i.w.
+  fixup_loongarch_tls_le_hi20_r,
+  // Fixup corresponding to %le_add_r(foo) for instruction PseudoAddTPRel_W/D.
+  fixup_loongarch_tls_le_add_r,
+  // 12-bit fixup corresponding to %le_lo12_r(foo) for instruction addi.w/d.
+  fixup_loongarch_tls_le_lo12_r,
+  // 20-bit fixup corresponding to %ld_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_ld_pcrel20_s2,
+  // 20-bit fixup corresponding to %gd_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_gd_pcrel20_s2,
+  // 20-bit fixup corresponding to %desc_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_desc_pcrel20_s2,
 };
 } // end namespace LoongArch
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 83812dc3c62a8..4f7f93fa4783e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -49,6 +49,10 @@ class LoongArchMCCodeEmitter : public MCCodeEmitter {
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const;
 
+  void expandAddTPRel(const MCInst &MI, SmallVectorImpl<char> &CB,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const;
+
   /// TableGen'erated function for getting the binary encoding for an
   /// instruction.
   uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -134,6 +138,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_None:
     case LoongArchMCExpr::VK_LoongArch_Invalid:
       llvm_unreachable("Unhandled fixup kind!");
+    case LoongArchMCExpr::VK_LoongArch_TLS_LE_ADD_R:
+      llvm_unreachable("VK_LoongArch_TLS_LE_ADD_R should not represent an "
+                       "instruction operand");
     case LoongArchMCExpr::VK_LoongArch_B16:
       FixupKind = LoongArch::fixup_loongarch_b16;
       break;
@@ -274,6 +281,24 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL:
       FixupKind = LoongArch::fixup_loongarch_tls_desc_call;
       break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R:
+      FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R:
+      FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_ld_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_gd_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_pcrel20_s2;
+      break;
     }
   } else if (Kind == MCExpr::SymbolRef &&
              cast<MCSymbolRefExpr>(Expr)->getKind() ==
@@ -346,6 +371,38 @@ void LoongArchMCCodeEmitter::expandToVectorLDI(
   support::endian::write(CB, Binary, llvm::endianness::little);
 }
 
+void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI,
+                                            SmallVectorImpl<char> &CB,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  MCOperand Rd = MI.getOperand(0);
+  MCOperand Rj = MI.getOperand(1);
+  MCOperand Rk = MI.getOperand(2);
+  MCOperand Symbol = MI.getOperand(3);
+  assert(Symbol.isExpr() &&
+         "Expected expression as third input to TP-relative add");
+
+  const LoongArchMCExpr *Expr = dyn_cast<LoongArchMCExpr>(Symbol.getExpr());
+  assert(Expr &&
+         Expr->getKind() == LoongArchMCExpr::VK_LoongArch_TLS_LE_ADD_R &&
+         "Expected %le_add_r relocation on TP-relative symbol");
+
+  // Emit the correct %le_add_r relocation for the symbol.
+  // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled.
+  Fixups.push_back(MCFixup::create(
+      0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r),
+      MI.getLoc()));
+
+  // Emit a normal ADD instruction with the given operands.
+  unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D
+                     ? LoongArch::ADD_D
+                     : LoongArch::ADD_W;
+  MCInst TmpInst =
+      MCInstBuilder(ADD).addOperand(Rd).addOperand(Rj).addOperand(Rk);
+  uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+  support::endian::write(CB, Binary, llvm::endianness::little);
+}
+
 void LoongArchMCCodeEmitter::encodeInstruction(
     const MCInst &MI, SmallVectorImpl<char> &CB,
     SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
@@ -366,6 +423,9 @@ void LoongArchMCCodeEmitter::encodeInstruction(
   case LoongArch::PseudoXVREPLI_W:
   case LoongArch::PseudoXVREPLI_D:
     return expandToVectorLDI<LoongArch::XVLDI>(MI, CB, Fixups, STI);
+  case LoongArch::PseudoAddTPRel_W:
+  case LoongArch::PseudoAddTPRel_D:
+    return expandAddTPRel(MI, CB, Fixups, STI);
   }
 
   switch (Size) {
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 9b537794c7ab6..53d46cca913e5 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -160,6 +160,20 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
     return "desc_ld";
   case VK_LoongArch_TLS_DESC_CALL:
     return "desc_call";
+  case VK_LoongArch_TLS_LE_HI20_R:
+    return "le_hi20_r";
+  case VK_LoongArch_TLS_LE_ADD_R:
+    return "le_add_r";
+  case VK_LoongArch_TLS_LE_LO12_R:
+    return "le_lo12_r";
+  case VK_LoongArch_PCREL20_S2:
+    return "pcrel_20";
+  case VK_LoongArch_TLS_LD_PCREL20_S2:
+    return "ld_pcrel_20";
+  case VK_LoongArch_TLS_GD_PCREL20_S2:
+    return "gd_pcrel_20";
+  case VK_LoongArch_TLS_DESC_PCREL20_S2:
+    return "desc_pcrel_20";
   }
 }
 
@@ -213,6 +227,13 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
       .Case("desc64_hi12", VK_LoongArch_TLS_DESC64_HI12)
       .Case("desc_ld", VK_LoongArch_TLS_DESC_LD)
       .Case("desc_call", VK_LoongArch_TLS_DESC_CALL)
+      .Case("le_hi20_r", VK_LoongArch_TLS_LE_HI20_R)
+      .Case("le_add_r", VK_LoongArch_TLS_LE_ADD_R)
+      .Case("le_lo12_r", VK_LoongArch_TLS_LE_LO12_R)
+      .Case("pcrel_20", VK_LoongArch_PCREL20_S2)
+      .Case("ld_pcrel_20", VK_LoongArch_TLS_LD_PCREL20_S2)
+      .Case("gd_pcrel_20", VK_LoongArch_TLS_GD_PCREL20_S2)
+      .Case("desc_pcrel_20", VK_LoongArch_TLS_DESC_PCREL20_S2)
       .Default(VK_LoongArch_Invalid);
 }
 
@@ -255,6 +276,9 @@ void LoongArchMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case VK_LoongArch_TLS_GD_HI20:
   case VK_LoongArch_TLS_DESC_PC_HI20:
   case VK_LoongArch_TLS_DESC_HI20:
+  case VK_LoongArch_TLS_LD_PCREL20_S2:
+  case VK_LoongArch_TLS_GD_PCREL20_S2:
+  case VK_LoongArch_TLS_DESC_PCREL20_S2:
     break;
   }
   fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 148fea309d6fc..91215b863ccc2 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -72,6 +72,13 @@ class LoongArchMCExpr : public MCTargetExpr {
     VK_LoongArch_TLS_DESC64_HI12,
     VK_LoongArch_TLS_DESC_LD,
     VK_LoongArch_TLS_DESC_CALL,
+    VK_LoongArch_TLS_LE_HI20_R,
+    VK_LoongArch_TLS_LE_ADD_R,
+    VK_LoongArch_TLS_LE_LO12_R,
+    VK_LoongArch_PCREL20_S2,
+    VK_LoongArch_TLS_LD_PCREL20_S2,
+    VK_LoongArch_TLS_GD_PCREL20_S2,
+    VK_LoongArch_TLS_DESC_PCREL20_S2,
     VK_LoongArch_Invalid // Must be the last item.
   };
 
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
index e7e5bb19c3a07..16d009ab81adc 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
@@ -58,12 +58,6 @@ const RegisterBankInfo::ValueMapping ValueMappings[] = {
 M68kRegisterBankInfo::M68kRegisterBankInfo(const TargetRegisterInfo &TRI)
     : M68kGenRegisterBankInfo() {}
 
-const RegisterBank &
-M68kRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const {
-  return getRegBank(M68k::GPRRegBankID);
-}
-
 const RegisterBankInfo::InstructionMapping &
 M68kRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   auto Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
index 493c139f018cd..6122db8b48989 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
@@ -35,9 +35,6 @@ class M68kRegisterBankInfo final : public M68kGenRegisterBankInfo {
 public:
   M68kRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 };
diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
index 6347a52e19188..1445bac0b92e8 100644
--- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
@@ -246,9 +246,7 @@ MachineBasicBlock::iterator M68kFrameLowering::eliminateCallFramePseudoInstr(
     unsigned StackAlign = getStackAlignment();
     Amount = alignTo(Amount, StackAlign);
 
-    MachineModuleInfo &MMI = MF.getMMI();
-    const auto &Fn = MF.getFunction();
-    bool DwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry();
+    bool DwarfCFI = MF.needsFrameMoves();
 
     // If we have any exception handlers in this function, and we adjust
     // the SP before calls, we may need to indicate this to the unwinder
@@ -452,8 +450,7 @@ void M68kFrameLowering::emitPrologueCalleeSavedFrameMoves(
     const DebugLoc &DL) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
 
   // Add callee saved registers to move list.
   const auto &CSI = MFI.getCalleeSavedInfo();
@@ -478,13 +475,11 @@ void M68kFrameLowering::emitPrologue(MachineFunction &MF,
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  const auto &Fn = MF.getFunction();
-  MachineModuleInfo &MMI = MF.getMMI();
   M68kMachineFunctionInfo *MMFI = MF.getInfo<M68kMachineFunctionInfo>();
   uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
   uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
   bool HasFP = hasFP(MF);
-  bool NeedsDwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry();
+  bool NeedsDwarfCFI = MF.needsFrameMoves();
   Register FramePtr = TRI->getFrameRegister(MF);
   const unsigned MachineFramePtr = FramePtr;
   unsigned BasePtr = TRI->getBaseRegister();
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
index 0cdb3a595f719..6c7922a655d9f 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -35,10 +35,6 @@ class MSP430TargetELFStreamer : public MCTargetStreamer {
 MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
                                                  const MCSubtargetInfo &STI)
     : MCTargetStreamer(S) {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned EFlags = MCA.getELFHeaderEFlags();
-  MCA.setELFHeaderEFlags(EFlags);
-
   // Emit build attributes section according to
   // MSP430 EABI (slaa534.pdf, part 13).
   MCSection *AttributeSection = getStreamer().getContext().getELFSection(
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index 176387d71fcb6..d0dc6dd146efd 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -59,8 +59,7 @@ void MSP430FrameLowering::emitCalleeSavedFrameMoves(
     const DebugLoc &DL, bool IsPrologue) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
 
   // Add callee saved registers to move list.
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
@@ -294,7 +293,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!hasFP(MF)) {
     MBBI = FirstCSPop;
-    int64_t Offset = -CSSize - 2;
+    int64_t Offset = -(int64_t)CSSize - 2;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index fc95b61fd4df5..f8172e576ce4c 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 499cbd873e299..ca95f67174da1 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -117,8 +117,7 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
 
 static MCTargetStreamer *createMipsAsmTargetStreamer(MCStreamer &S,
                                                      formatted_raw_ostream &OS,
-                                                     MCInstPrinter *InstPrint,
-                                                     bool isVerboseAsm) {
+                                                     MCInstPrinter *InstPrint) {
   return new MipsTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 3093b9fddc73a..cca4a9cc80566 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -19,6 +19,7 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -801,6 +802,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
     : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
   MCAssembler &MCA = getStreamer().getAssembler();
+  ELFObjectWriter &W = getStreamer().getWriter();
 
   // It's possible that MCObjectFileInfo isn't fully initialized at this point
   // due to an initialization order problem where LLVMTargetMachine creates the
@@ -824,7 +826,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   // We can fix this by making the target streamer construct
   // the ABI, but this is fraught with wide ranging dependency
   // issues as well.
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  unsigned EFlags = W.getELFHeaderEFlags();
 
   // FIXME: Fix a dependency issue by instantiating the ABI object to some
   // default based off the triple. The triple doesn't describe the target
@@ -873,7 +875,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   if (Features[Mips::FeatureNaN2008])
     EFlags |= ELF::EF_MIPS_NAN2008;
 
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 }
 
 void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
@@ -889,6 +891,7 @@ void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
 
 void MipsTargetELFStreamer::finish() {
   MCAssembler &MCA = getStreamer().getAssembler();
+  ELFObjectWriter &W = getStreamer().getWriter();
   const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
   MCELFStreamer &S = getStreamer();
 
@@ -925,7 +928,7 @@ void MipsTargetELFStreamer::finish() {
 
   // Update e_header flags. See the FIXME and comment above in
   // the constructor for a full rundown on this.
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  unsigned EFlags = W.getELFHeaderEFlags();
 
   // ABI
   // N64 does not require any ABI bits.
@@ -948,7 +951,7 @@ void MipsTargetELFStreamer::finish() {
   if (Pic)
     EFlags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
 
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 
   // Emit all the option records.
   // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
@@ -988,25 +991,25 @@ void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
 }
 
 void MipsTargetELFStreamer::setUsesMicroMips() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_MICROMIPS;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetMips16() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
   forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_NOREORDER;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
   forbidModuleDirective();
 }
 
@@ -1063,45 +1066,45 @@ void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
 }
 
 void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveNaN2008() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_NAN2008;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Flags &= ~ELF::EF_MIPS_NAN2008;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   // This option overrides other PIC options like -KPIC.
   Pic = false;
   Flags &= ~ELF::EF_MIPS_PIC;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
-  MCAssembler &MCA = getStreamer().getAssembler();
-  unsigned Flags = MCA.getELFHeaderEFlags();
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned Flags = W.getELFHeaderEFlags();
   Pic = true;
   // NOTE: We are following the GAS behaviour here which means the directive
   // 'pic2' also sets the CPIC bit in the ELF header. This is different from
   // what is stated in the SYSV ABI which consider the bits EF_MIPS_PIC and
   // EF_MIPS_CPIC to be mutually exclusive.
   Flags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
-  MCA.setELFHeaderEFlags(Flags);
+  W.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveInsn() {
diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
index 10c953bb344a8..f72c49eb707db 100644
--- a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -54,8 +54,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
   // No need to allocate space on the stack.
   if (StackSize == 0 && !MFI.adjustsStack()) return;
 
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
 
   // Adjust stack.
   TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 018c620f5c84c..e267a6d0844c6 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -254,8 +254,10 @@ void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   do {
     // Do any auto-generated pseudo lowerings.
-    if (emitPseudoExpansionLowering(*OutStreamer, &*I))
+    if (MCInst OutInst; lowerPseudoInstExpansion(&*I, OutInst)) {
+      EmitToStreamer(*OutStreamer, OutInst);
       continue;
+    }
 
     // Skip the BUNDLE pseudo instruction and lower the contents
     if (I->isBundle())
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index 0b55089385d79..d53a0f6a39667 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -69,8 +69,7 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
   void EmitSled(const MachineInstr &MI, SledKind Kind);
 
   // tblgen'erated function.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
   // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
   // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 62b58cba9f24a..4aecaf18db480 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -75,35 +75,6 @@ using namespace llvm;
 
 MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI) {}
 
-const RegisterBank &
-MipsRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const {
-  using namespace Mips;
-
-  switch (RC.getID()) {
-  case Mips::GPR32RegClassID:
-  case Mips::CPU16Regs_and_GPRMM16ZeroRegClassID:
-  case Mips::GPRMM16MovePPairFirstRegClassID:
-  case Mips::CPU16Regs_and_GPRMM16MovePPairSecondRegClassID:
-  case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID:
-  case Mips::GPRMM16MovePPairFirst_and_GPRMM16MovePPairSecondRegClassID:
-  case Mips::SP32RegClassID:
-  case Mips::GP32RegClassID:
-    return getRegBank(Mips::GPRBRegBankID);
-  case Mips::FGRCCRegClassID:
-  case Mips::FGR32RegClassID:
-  case Mips::FGR64RegClassID:
-  case Mips::AFGR64RegClassID:
-  case Mips::MSA128BRegClassID:
-  case Mips::MSA128HRegClassID:
-  case Mips::MSA128WRegClassID:
-  case Mips::MSA128DRegClassID:
-    return getRegBank(Mips::FPRBRegBankID);
-  default:
-    llvm_unreachable("Register class not supported");
-  }
-}
-
 // Instructions where use operands are floating point registers.
 // Def operands are general purpose.
 static bool isFloatingPointOpcodeUse(unsigned Opc) {
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
index bc424b93f6056..1e07962fd02b3 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -32,9 +32,6 @@ class MipsRegisterBankInfo final : public MipsGenRegisterBankInfo {
 public:
   MipsRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index 38f6889a52358..ceb0668c455d6 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -427,8 +427,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
   // No need to allocate space on the stack.
   if (StackSize == 0 && !MFI.adjustsStack()) return;
 
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
 
   // Adjust stack.
   TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 50e7cc6371253..1cafd236a2925 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -62,7 +62,7 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &,
-                                                 MCInstPrinter *, bool) {
+                                                 MCInstPrinter *) {
   return new NVPTXAsmTargetStreamer(S);
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0b654abd2814c..d7197a7923eaf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -493,7 +493,7 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
   // Emit initial .loc debug directive for correct relocation symbol data.
   if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
     assert(SP->getUnit());
-    if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+    if (!SP->getUnit()->isDebugDirectivesOnly())
       emitInitialRawDwarfLocDirective(*MF);
   }
 }
@@ -912,7 +912,7 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
     if (HasFullDebugInfo)
       break;
   }
-  if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
+  if (HasFullDebugInfo)
     O << ", debug";
 
   O << "\n";
@@ -928,8 +928,6 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
 }
 
 bool NVPTXAsmPrinter::doFinalization(Module &M) {
-  bool HasDebugInfo = MMI && MMI->hasDebugInfo();
-
   // If we did not emit any functions, then the global declarations have not
   // yet been emitted.
   if (!GlobalsEmitted) {
@@ -945,7 +943,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
   auto *TS =
       static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer());
   // Close the last emitted section
-  if (HasDebugInfo) {
+  if (hasDebugInfo()) {
     TS->closeLastSection();
     // Emit empty .debug_loc section for better support of the empty files.
     OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}");
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 10ae81e0460e3..9abe0e3186f20 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -93,5 +93,8 @@ MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
 
 TargetFrameLowering::DwarfFrameBase
 NVPTXFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
-  return {DwarfFrameBase::CFA, {0}};
+  DwarfFrameBase FrameBase;
+  FrameBase.Kind = DwarfFrameBase::CFA;
+  FrameBase.Location.Offset = 0;
+  return FrameBase;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..96456ad0547ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -736,8 +736,9 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
   // | No      | Yes      | Generic,Shared,    | .volatile  | .volatile                    |
   // |         |          | Global [0]         |            |                              |
   // | No      | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
-  // | Relaxed | No       | Generic,Shared,    |            |                              |
-  // |         |          | Global [0]         | .volatile  | <atomic sem>                 |
+  // | Unorder | Yes/No   | All                | == Relaxed | == Relaxed                   |
+  // | Relaxed | No       | Generic,Shared,    | .volatile  | <atomic sem>                 |
+  // |         |          | Global [0]         |            |                              |
   // | Other   | No       | Generic,Shared,    | Error [2]  | <atomic sem>                 |
   // |         |          | Global [0]         |            |                              |
   // | Yes     | No       | Local,Const,Param  | plain [1]  | .weak [1]                    |
@@ -794,9 +795,10 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
     return NVPTX::PTXLdStInstCode::NotAtomic;
   }
 
-  // [2]: Atomics with Ordering different than Relaxed are not supported on
-  //      sm_60 and older; this includes volatile atomics.
+  // [2]: Atomics with Ordering different than Unordered or Relaxed are not
+  //      supported on sm_60 and older; this includes volatile atomics.
   if (!(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Unordered ||
         Ordering == AtomicOrdering::Monotonic) &&
       !HasMemoryOrdering) {
     SmallString<256> Msg;
@@ -826,6 +828,9 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
     return N->isVolatile() && AddrGenericOrGlobalOrShared
                ? NVPTX::PTXLdStInstCode::Volatile
                : NVPTX::PTXLdStInstCode::NotAtomic;
+  case AtomicOrdering::Unordered:
+    // We lower unordered in the exact same way as 'monotonic' to respect
+    // LLVM IR atomicity requirements.
   case AtomicOrdering::Monotonic:
     if (N->isVolatile())
       return UseRelaxedMMIO                ? NVPTX::PTXLdStInstCode::RelaxedMMIO
@@ -866,7 +871,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
     report_fatal_error(OS.str());
   }
   case AtomicOrdering::SequentiallyConsistent:
-  case AtomicOrdering::Unordered:
     // TODO: support AcquireRelease and SequentiallyConsistent
     SmallString<256> Msg;
     raw_svector_ostream OS(Msg);
@@ -3914,8 +3918,14 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
       else
         Base = Addr.getOperand(0);
-      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
-                                         mvt);
+
+      // Offset must fit in a 32-bit signed int in PTX [register+offset] address
+      // mode
+      if (!CN->getAPIntValue().isSignedIntN(32))
+        return false;
+
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
+                                         MVT::i32);
       return true;
     }
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a2181b478c269..6975412ce5d35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -629,6 +629,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   for (MVT VT : MVT::integer_valuetypes()) {
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     setTruncStoreAction(VT, MVT::i1, Expand);
   }
 
@@ -1428,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  const Function *F = CB.getFunction();
   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
@@ -1470,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
+    // Indirect calls need strict ABI alignment so we disable optimizations by
+    // not providing a function to optimize.
     Type *ETy = Args[i].IndirectType;
     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
     Align ParamByValAlign =
-        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
 
     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
     O << "_";
@@ -2920,9 +2922,10 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
   assert(Node->getValueType(0) == MVT::i1 &&
          "Custom lowering for i1 load only");
-  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
-                              LD->getPointerInfo(), LD->getAlign(),
-                              LD->getMemOperand()->getFlags());
+  SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
+                                 LD->getBasePtr(), LD->getPointerInfo(),
+                                 MVT::i8, LD->getAlign(),
+                                 LD->getMemOperand()->getFlags());
   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
   // The legalizer (the caller) is expecting two values from the legalized
   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
@@ -5165,9 +5168,12 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
   // - [areg+immoff]
   // - [immAddr]
 
-  if (AM.BaseGV) {
+  // immoff must fit in a signed 32-bit int
+  if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
+    return false;
+
+  if (AM.BaseGV)
     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
-  }
 
   switch (AM.Scale) {
   case 0: // "r", "r+i" or "i" is allowed
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3133801ee35d2..e6efcf4d8d41d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -269,16 +269,12 @@ multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
   }
 }
 
-// Template for instructions which take three fp64 or fp32 args.  The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+// Template for minimum/maximum instructions.
 //
 // Also defines ftz (flush subnormal inputs and results to sign-preserving
 // zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that cannot be folded into FMAs.
-// For nodes that can be folded into FMAs (i.e. adds and muls), use
-// F3_fma_component.
-multiclass F3<string OpcStr, SDNode OpNode> {
+multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
+  if !not(NaN) then {
    def f64rr :
      NVPTXInst<(outs Float64Regs:$dst),
                (ins Float64Regs:$a, Float64Regs:$b),
@@ -289,6 +285,7 @@ multiclass F3<string OpcStr, SDNode OpNode> {
                (ins Float64Regs:$a, f64imm:$b),
                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+  }
    def f32rr_ftz :
      NVPTXInst<(outs Float32Regs:$dst),
                (ins Float32Regs:$a, Float32Regs:$b),
@@ -323,45 +320,45 @@ multiclass F3<string OpcStr, SDNode OpNode> {
                (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
-               Requires<[useFP16Math]>;
+               Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
 
    def f16x2rr_ftz :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
-               Requires<[useFP16Math, doF32FTZ]>;
+               Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
    def f16x2rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
-               Requires<[useFP16Math]>;
+               Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
    def bf16rr_ftz :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
-               Requires<[hasBF16Math, doF32FTZ]>;
+               Requires<[hasBF16Math, doF32FTZ, hasSM<80>, hasPTX<70>]>;
    def bf16rr :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
-               Requires<[hasBF16Math]>;
+               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
 
    def bf16x2rr_ftz :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
-               Requires<[hasBF16Math, doF32FTZ]>;
+               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
    def bf16x2rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
-               Requires<[hasBF16Math]>;
+               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
 }
 
 // Template for instructions which take three FP args.  The
@@ -1179,11 +1176,10 @@ defm FADD : F3_fma_component<"add", fadd>;
 defm FSUB : F3_fma_component<"sub", fsub>;
 defm FMUL : F3_fma_component<"mul", fmul>;
 
-defm FMIN : F3<"min", fminnum>;
-defm FMAX : F3<"max", fmaxnum>;
-// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
-defm FMINNAN : F3<"min.NaN", fminimum>;
-defm FMAXNAN : F3<"max.NaN", fmaximum>;
+defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
+defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
+defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
+defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
 
 defm FABS  : F2<"abs", fabs>;
 defm FNEG  : F2<"neg", fneg>;
@@ -3909,3 +3905,31 @@ def : Pat <
   (V2I32toI64
     (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
     (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX Fence instructions
+////////////////////////////////////////////////////////////////////////////////
+
+def atomic_thread_fence_seq_cst_sys :
+  NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
+  Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_sys :
+  NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
+  Requires<[hasPTX<60>, hasSM<70>]>;
+
+def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+
+
+// If PTX<60 or SM<70, we fall back to MEMBAR:
+def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
+def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
+def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1)
+def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1)
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
index 125a49de7b27d..4a004f8960562 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
@@ -34,13 +34,6 @@ const RegisterBank &
 PPCRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                             LLT Ty) const {
   switch (RC.getID()) {
-  case PPC::G8RCRegClassID:
-  case PPC::G8RC_NOX0RegClassID:
-  case PPC::G8RC_and_G8RC_NOX0RegClassID:
-  case PPC::GPRCRegClassID:
-  case PPC::GPRC_NOR0RegClassID:
-  case PPC::GPRC_and_GPRC_NOR0RegClassID:
-    return getRegBank(PPC::GPRRegBankID);
   case PPC::VSFRCRegClassID:
   case PPC::SPILLTOVSRRC_and_VSFRCRegClassID:
   case PPC::SPILLTOVSRRC_and_VFRCRegClassID:
@@ -50,19 +43,8 @@ PPCRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case PPC::VSSRCRegClassID:
   case PPC::F4RCRegClassID:
     return getRegBank(PPC::FPRRegBankID);
-  case PPC::VSRCRegClassID:
-  case PPC::VRRCRegClassID:
-  case PPC::VRRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-  case PPC::VSRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-  case PPC::SPILLTOVSRRCRegClassID:
-  case PPC::VSLRCRegClassID:
-  case PPC::VSLRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-    return getRegBank(PPC::VECRegBankID);
-  case PPC::CRRCRegClassID:
-  case PPC::CRBITRCRegClassID:
-    return getRegBank(PPC::CRRegBankID);
   default:
-    llvm_unreachable("Unexpected register class");
+    return PPCGenRegisterBankInfo::getRegBankFromRegClass(RC, Ty);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
index 1477fdca917d7..332a34fe022dd 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
@@ -67,6 +67,7 @@ class PPCRegisterBankInfo final : public PPCGenRegisterBankInfo {
 
   const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
                                              LLT Ty) const override;
+
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 9106e6ab6397d..7e79d2728e674 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -25,11 +25,11 @@
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionXCOFF.h"
 #include "llvm/MC/MCStreamer.h"
@@ -298,15 +298,14 @@ class PPCTargetELFStreamer : public PPCTargetStreamer {
   }
 
   void emitAbiVersion(int AbiVersion) override {
-    MCAssembler &MCA = getStreamer().getAssembler();
-    unsigned Flags = MCA.getELFHeaderEFlags();
+    ELFObjectWriter &W = getStreamer().getWriter();
+    unsigned Flags = W.getELFHeaderEFlags();
     Flags &= ~ELF::EF_PPC64_ABI;
     Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
-    MCA.setELFHeaderEFlags(Flags);
+    W.setELFHeaderEFlags(Flags);
   }
 
   void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
-    MCAssembler &MCA = getStreamer().getAssembler();
 
     // encodePPC64LocalEntryOffset will report an error if it cannot
     // encode LocalOffset.
@@ -319,9 +318,10 @@ class PPCTargetELFStreamer : public PPCTargetStreamer {
 
     // For GAS compatibility, unless we already saw a .abiversion directive,
     // set e_flags to indicate ELFv2 ABI.
-    unsigned Flags = MCA.getELFHeaderEFlags();
+    ELFObjectWriter &W = getStreamer().getWriter();
+    unsigned Flags = W.getELFHeaderEFlags();
     if ((Flags & ELF::EF_PPC64_ABI) == 0)
-      MCA.setELFHeaderEFlags(Flags | 2);
+      W.setELFHeaderEFlags(Flags | 2);
   }
 
   void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
@@ -439,8 +439,7 @@ class PPCTargetXCOFFStreamer : public PPCTargetStreamer {
 
 static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm) {
+                                                 MCInstPrinter *InstPrint) {
   return new PPCTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 84ef582c029d3..da31a993b9c69 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -52,6 +52,7 @@ def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
 def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
 def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
 def DirectivePwr10: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR10", "">;
+def DirectivePwr11: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR11", "">;
 def DirectivePwrFuture
     : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
 
@@ -467,13 +468,25 @@ def ProcessorFeatures {
   list<SubtargetFeature> P10Features =
     !listconcat(P10InheritableFeatures, P10SpecificFeatures);
 
-  // Future
-  // For future CPU we assume that all of the existing features from Power10
+  // Power11
+  // For P11 CPU we assume that all the existing features from Power10
   // still exist with the exception of those we know are Power10 specific.
+  list<SubtargetFeature> P11AdditionalFeatures =
+    [DirectivePwr11];
+  list<SubtargetFeature> P11SpecificFeatures =
+    [];
+  list<SubtargetFeature> P11InheritableFeatures =
+    !listconcat(P10InheritableFeatures, P11AdditionalFeatures);
+  list<SubtargetFeature> P11Features =
+    !listconcat(P11InheritableFeatures, P11SpecificFeatures);
+
+  // Future
+  // For future CPU we assume that all of the existing features from Power11
+  // still exist with the exception of those we know are Power11 specific.
   list<SubtargetFeature> FutureAdditionalFeatures = [FeatureISAFuture];
   list<SubtargetFeature> FutureSpecificFeatures = [];
   list<SubtargetFeature> FutureInheritableFeatures =
-    !listconcat(P10InheritableFeatures, FutureAdditionalFeatures);
+    !listconcat(P11InheritableFeatures, FutureAdditionalFeatures);
   list<SubtargetFeature> FutureFeatures =
     !listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
 }
@@ -672,6 +685,7 @@ def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
 def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
 def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
 def : ProcessorModel<"pwr10", P10Model, ProcessorFeatures.P10Features>;
+def : ProcessorModel<"pwr11", P10Model, ProcessorFeatures.P11Features>;
 // No scheduler model for future CPU.
 def : ProcessorModel<"future", NoSchedModel,
                   ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index d74143b487880..dcde86388dcd9 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -909,6 +909,23 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // Lower multi-instruction pseudo operations.
   switch (MI->getOpcode()) {
   default: break;
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
+    assert(!Subtarget->isAIXABI() &&
+           "AIX does not support patchable function entry!");
+    // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is
+    // handled in PPCLinuxAsmPrinter.
+    if (MAI->isLittleEndian())
+      return;
+    const Function &F = MF->getFunction();
+    unsigned Num = 0;
+    (void)F.getFnAttribute("patchable-function-entry")
+        .getValueAsString()
+        .getAsInteger(10, Num);
+    if (!Num)
+      return;
+    emitNops(Num);
+    return;
+  }
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
   case TargetOpcode::STACKMAP:
@@ -1780,7 +1797,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   switch (MI->getOpcode()) {
   default:
-    return PPCAsmPrinter::emitInstruction(MI);
+    break;
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
     // .begin:
     //   b .end # lis 0, FuncId[16..32]
@@ -1793,6 +1810,9 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     //
     // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
     // of instructions change.
+    // XRAY is only supported on PPC Linux little endian.
+    if (!MAI->isLittleEndian())
+      break;
     MCSymbol *BeginOfSled = OutContext.createTempSymbol();
     MCSymbol *EndOfSled = OutContext.createTempSymbol();
     OutStreamer->emitLabel(BeginOfSled);
@@ -1834,7 +1854,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
       IsConditional = false;
     } else {
       EmitToStreamer(*OutStreamer, RetInst);
-      break;
+      return;
     }
 
     MCSymbol *FallthroughLabel;
@@ -1899,7 +1919,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (IsConditional)
       OutStreamer->emitLabel(FallthroughLabel);
     recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT, 2);
-    break;
+    return;
   }
   case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
     llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted");
@@ -1909,6 +1929,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     llvm_unreachable("Tail call is handled in the normal case. See comments "
                      "around this assert.");
   }
+  return PPCAsmPrinter::emitInstruction(MI);
 }
 
 void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) {
@@ -3116,11 +3137,11 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
       break;
     MCSymbol *TempSym = OutContext.createNamedTempSymbol();
     OutStreamer->emitLabel(TempSym);
-    OutStreamer->emitXCOFFExceptDirective(CurrentFnSym, TempSym,
-                 LangMO.getImm(), ReasonMO.getImm(),
-                 Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8 :
-                 MI->getMF()->getInstructionCount() * 4,
-		 MMI->hasDebugInfo());
+    OutStreamer->emitXCOFFExceptDirective(
+        CurrentFnSym, TempSym, LangMO.getImm(), ReasonMO.getImm(),
+        Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8
+                             : MI->getMF()->getInstructionCount() * 4,
+        hasDebugInfo());
     break;
   }
   case PPC::GETtlsMOD32AIX:
@@ -3178,7 +3199,7 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
 bool PPCAIXAsmPrinter::doFinalization(Module &M) {
   // Do streamer related finalization for DWARF.
-  if (!MAI->usesDwarfFileAndLocDirectives() && MMI->hasDebugInfo())
+  if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo())
     OutStreamer->doFinalizationAtSectionEnd(
         OutStreamer->getContext().getObjectFileInfo()->getTextSection());
 
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 277d708013c78..1963582ce6863 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -613,8 +613,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
   const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
 
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   DebugLoc dl;
   // AIX assembler does not support cfi directives.
   const bool needsCFI = MF.needsFrameMoves() && !Subtarget.isAIXABI();
@@ -1239,8 +1238,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
   const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
   const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   // AIX assembler does not support cfi directives.
   const bool needsCFI = MF.needsFrameMoves() && !Subtarget.isAIXABI();
   auto StackAllocMIPos = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
diff --git a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
index 00931b1f63b29..e43437d00e91d 100644
--- a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
@@ -123,9 +123,7 @@ bool PPCGenScalarMASSEntries::runOnModule(Module &M) {
     // The call to createScalarMASSCall() invalidates the iterator over users
     // upon replacing the users. Precomputing the current list of users allows
     // us to replace all the call sites.
-    SmallVector<User *, 4> TheUsers;
-    for (auto *User : Func.users())
-      TheUsers.push_back(User);
+    SmallVector<User *, 4> TheUsers(Func.users());
 
     for (auto *User : TheUsers)
       if (auto *CI = dyn_cast_or_null<CallInst>(User)) {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index a11ab93b8db3c..1686ec572c855 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1469,6 +1469,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
   case PPC::DIR_PWR10:
+  case PPC::DIR_PWR11:
   case PPC::DIR_PWR_FUTURE:
     setPrefLoopAlignment(Align(16));
     setPrefFunctionAlignment(Align(16));
@@ -5559,7 +5560,7 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
       // C-linkage name. A Qualname is returned here because an external
       // function entry point is a csect with XTY_ER property.
       const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
-        auto &Context = DAG.getMachineFunction().getMMI().getContext();
+        auto &Context = DAG.getMachineFunction().getContext();
         MCSectionXCOFF *Sec = Context.getXCOFFSection(
             (Twine(".") + Twine(SymName)).str(), SectionKind::getMetadata(),
             XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
@@ -7214,6 +7215,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   // Reserve space for the linkage area on the stack.
   const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
+  uint64_t SaveStackPos = CCInfo.getStackSize();
+  bool SaveParams = MF.getFunction().hasFnAttribute("save-reg-params");
   CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
 
   SmallVector<SDValue, 8> MemOps;
@@ -7232,6 +7235,27 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
     if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
       continue;
 
+    if (SaveParams && VA.isRegLoc() && !Flags.isByVal() && !VA.needsCustom()) {
+      const TargetRegisterClass *RegClass = getRegClassForSVT(
+          LocVT.SimpleTy, IsPPC64, Subtarget.hasP8Vector(), Subtarget.hasVSX());
+      // On PPC64, debugger assumes extended 8-byte values are stored from GPR.
+      MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
+      const Register VReg = MF.addLiveIn(VA.getLocReg(), RegClass);
+      SDValue Parm = DAG.getCopyFromReg(Chain, dl, VReg, SaveVT);
+      int FI = MFI.CreateFixedObject(SaveVT.getStoreSize(), SaveStackPos, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      SDValue StoreReg = DAG.getStore(Chain, dl, Parm, FIN,
+                                      MachinePointerInfo(), Align(PtrByteSize));
+      SaveStackPos = alignTo(SaveStackPos + SaveVT.getStoreSize(), PtrByteSize);
+      MemOps.push_back(StoreReg);
+    }
+
+    if (SaveParams && (VA.isMemLoc() || Flags.isByVal()) && !VA.needsCustom()) {
+      unsigned StoreSize =
+          Flags.isByVal() ? Flags.getByValSize() : LocVT.getStoreSize();
+      SaveStackPos = alignTo(SaveStackPos + StoreSize, PtrByteSize);
+    }
+
     auto HandleMemLoc = [&]() {
       const unsigned LocSize = LocVT.getStoreSize();
       const unsigned ValSize = ValVT.getStoreSize();
@@ -16664,6 +16688,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
   case PPC::DIR_PWR10:
+  case PPC::DIR_PWR11:
   case PPC::DIR_PWR_FUTURE: {
     if (!ML)
       break;
@@ -18046,6 +18071,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
       return true;
     case PPC::DIR_PWR9:
     case PPC::DIR_PWR10:
+    case PPC::DIR_PWR11:
     case PPC::DIR_PWR_FUTURE:
       //  type        mul     add    shl
       // scalar        5       2      2
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 2d3c520429f2a..81f16eb1a905b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -3485,6 +3485,7 @@ unsigned PPCInstrInfo::getSpillTarget() const {
   // With P10, we may need to spill paired vector registers or accumulator
   // registers. MMA implies paired vectors, so we can just check that.
   bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops();
+  // P11 uses the P10 target.
   return Subtarget.isISAFuture() ? 3 : IsP10Variant ?
                                    2 : Subtarget.hasP9Vector() ?
                                    1 : 0;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index fdbdc14736c86..3cb7cd9d8f229 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -17,6 +17,7 @@ def sub_un : SubRegIndex<1, 3>;
 def sub_32 : SubRegIndex<32>;
 def sub_32_hi_phony : SubRegIndex<32,32>;
 def sub_64 : SubRegIndex<64>;
+def sub_64_hi_phony : SubRegIndex<64,64>;
 def sub_vsx0 : SubRegIndex<128>;
 def sub_vsx1 : SubRegIndex<128, 128>;
 def sub_gp8_x0 : SubRegIndex<64>;
@@ -77,19 +78,19 @@ class VF<bits<5> num, string n> : PPCReg<n> {
 }
 
 // VR - One of the 32 128-bit vector registers
-class VR<VF SubReg, string n> : PPCReg<n> {
+class VR<VF SubReg, VF SubRegH, string n> : PPCReg<n> {
   let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
   let HWEncoding{5} = 0;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_64];
+  let SubRegs = [SubReg, SubRegH];
+  let SubRegIndices = [sub_64, sub_64_hi_phony];
 }
 
 // VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
 // floating-point registers.
-class VSRL<FPR SubReg, string n> : PPCReg<n> {
+class VSRL<FPR SubReg, FPR SubRegH, string n> : PPCReg<n> {
   let HWEncoding = SubReg.HWEncoding;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_64];
+  let SubRegs = [SubReg, SubRegH];
+  let SubRegIndices = [sub_64, sub_64_hi_phony];
 }
 
 // VSXReg - One of the VSX registers in the range vs32-vs63 with numbering
@@ -155,6 +156,22 @@ foreach Index = 0-31 in {
                 DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
 }
 
+// The FH and VFH registers have been marked as Artifical because there are no
+// instructions on PowerPC that use those register classes. They only exist
+// in order to ensure that the super registers (V and VSL) are covered by their
+// subregisters and have correct subregister lane masks.
+let isArtificial = 1 in {
+  foreach Index = 0-31 in {
+    def FH#Index : FPR<-1, "">;
+    def VFH#Index : VF<-1, "">;
+  }
+}
+
+let isAllocatable = 0, CopyCost = -1 in {
+  def VFHRC : RegisterClass<"PPC", [f64], 64, (sequence "VFH%u", 0, 31)>;
+  def FHRC : RegisterClass<"PPC", [f64], 64, (sequence "FH%u", 0, 31)>;
+}
+
 // Floating-point pair registers
 foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
   def Fpair#Index : FPPair<"fp"#Index, Index>;
@@ -168,17 +185,19 @@ foreach Index = 0-31 in {
                  DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
 }
 
+let CoveredBySubRegs = 1 in {
 // Vector registers
 foreach Index = 0-31 in {
-  def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+  def V#Index : VR<!cast<VF>("VF"#Index), !cast<VF>("VFH"#Index), "v"#Index>,
                 DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
 }
 
 // VSX registers
 foreach Index = 0-31 in {
-  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+  def VSL#Index : VSRL<!cast<FPR>("F"#Index), !cast<FPR>("FH"#Index), "vs"#Index>,
                   DwarfRegAlias<!cast<FPR>("F"#Index)>;
 }
+}
 
 // Dummy VSX registers, this defines string: "vs32"-"vs63", and is only used for
 // asm printing.
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index bf35f8ec151b1..2079dc0acc3cf 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -61,6 +61,7 @@ enum {
   DIR_PWR8,
   DIR_PWR9,
   DIR_PWR10,
+  DIR_PWR11,
   DIR_PWR_FUTURE,
   DIR_64
 };
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 3fa35efc2d159..b7bdbeb535d52 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -504,7 +504,7 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
   // Assume that Future CPU has the same cache line size as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
       Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
-      Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR11 || Directive == PPC::DIR_PWR_FUTURE)
     return 128;
 
   // On other processors return a default of 64 bytes.
@@ -538,7 +538,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
   // Assume that future is the same as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
       Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
-      Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR11 || Directive == PPC::DIR_PWR_FUTURE)
     return 12;
 
   // For most things, modern systems have two execution units (and
diff --git a/llvm/lib/Target/README.txt b/llvm/lib/Target/README.txt
index 05e590e71fe10..adf75c3368677 100644
--- a/llvm/lib/Target/README.txt
+++ b/llvm/lib/Target/README.txt
@@ -2,20 +2,6 @@ Target Independent Opportunities:
 
 //===---------------------------------------------------------------------===//
 
-We should recognized various "overflow detection" idioms and translate them into
-llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
-
-unsigned int mul(unsigned int a,unsigned int b) {
- if ((unsigned long long)a*b>0xffffffff)
-   exit(0);
-  return a*b;
-}
-
-The legalization code for mul-with-overflow needs to be made more robust before
-this can be implemented though.
-
-//===---------------------------------------------------------------------===//
-
 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
 safe in general, even on darwin.  See the libm implementation of hypot for
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index b5f8715598f3a..f7fa0e170fd29 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -329,7 +329,7 @@ static bool isLegalElementTypeForRVV(Type *EltTy,
   if (EltTy->isHalfTy())
     return Subtarget.hasVInstructionsF16();
   if (EltTy->isBFloatTy())
-    return Subtarget.hasVInstructionsBF16();
+    return Subtarget.hasVInstructionsBF16Minimal();
   if (EltTy->isFloatTy())
     return Subtarget.hasVInstructionsF32();
   if (EltTy->isDoubleTy())
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index f033ea7250030..4e583d96335d9 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -67,6 +68,17 @@ typeIsLegalBoolVec(unsigned TypeIdx, std::initializer_list<LLT> BoolVecTys,
   return all(typeInSet(TypeIdx, BoolVecTys), P);
 }
 
+static LegalityPredicate typeIsLegalPtrVec(unsigned TypeIdx,
+                                           std::initializer_list<LLT> PtrVecTys,
+                                           const RISCVSubtarget &ST) {
+  LegalityPredicate P = [=, &ST](const LegalityQuery &Query) {
+    return ST.hasVInstructions() &&
+           (Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 1 ||
+            ST.getELen() == 64);
+  };
+  return all(typeInSet(TypeIdx, PtrVecTys), P);
+}
+
 RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
     : STI(ST), XLen(STI.getXLen()), sXLen(LLT::scalar(XLen)) {
   const LLT sDoubleXLen = LLT::scalar(2 * XLen);
@@ -111,6 +123,11 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   const LLT nxv4s64 = LLT::scalable_vector(4, s64);
   const LLT nxv8s64 = LLT::scalable_vector(8, s64);
 
+  const LLT nxv1p0 = LLT::scalable_vector(1, p0);
+  const LLT nxv2p0 = LLT::scalable_vector(2, p0);
+  const LLT nxv4p0 = LLT::scalable_vector(4, p0);
+  const LLT nxv8p0 = LLT::scalable_vector(8, p0);
+
   using namespace TargetOpcode;
 
   auto BoolVecTys = {nxv1s1, nxv2s1, nxv4s1, nxv8s1, nxv16s1, nxv32s1, nxv64s1};
@@ -120,6 +137,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
                         nxv32s16, nxv1s32, nxv2s32, nxv4s32, nxv8s32, nxv16s32,
                         nxv1s64,  nxv2s64, nxv4s64, nxv8s64};
 
+  auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0};
+
   getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
       .legalFor({s32, sXLen})
       .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
@@ -266,6 +285,23 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
                                      {s32, p0, s16, 16},
                                      {s32, p0, s32, 32},
                                      {p0, p0, sXLen, XLen}});
+  if (ST.hasVInstructions())
+    LoadStoreActions.legalForTypesWithMemDesc({{nxv2s8, p0, nxv2s8, 8},
+                                               {nxv4s8, p0, nxv4s8, 8},
+                                               {nxv8s8, p0, nxv8s8, 8},
+                                               {nxv16s8, p0, nxv16s8, 8},
+                                               {nxv32s8, p0, nxv32s8, 8},
+                                               {nxv64s8, p0, nxv64s8, 8},
+                                               {nxv2s16, p0, nxv2s16, 16},
+                                               {nxv4s16, p0, nxv4s16, 16},
+                                               {nxv8s16, p0, nxv8s16, 16},
+                                               {nxv16s16, p0, nxv16s16, 16},
+                                               {nxv32s16, p0, nxv32s16, 16},
+                                               {nxv2s32, p0, nxv2s32, 32},
+                                               {nxv4s32, p0, nxv4s32, 32},
+                                               {nxv8s32, p0, nxv8s32, 32},
+                                               {nxv16s32, p0, nxv16s32, 32}});
+
   auto &ExtLoadActions =
       getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
           .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 16}});
@@ -279,7 +315,28 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   } else if (ST.hasStdExtD()) {
     LoadStoreActions.legalForTypesWithMemDesc({{s64, p0, s64, 64}});
   }
-  LoadStoreActions.clampScalar(0, s32, sXLen).lower();
+  if (ST.hasVInstructions() && ST.getELen() == 64)
+    LoadStoreActions.legalForTypesWithMemDesc({{nxv1s8, p0, nxv1s8, 8},
+                                               {nxv1s16, p0, nxv1s16, 16},
+                                               {nxv1s32, p0, nxv1s32, 32}});
+
+  if (ST.hasVInstructionsI64())
+    LoadStoreActions.legalForTypesWithMemDesc({{nxv1s64, p0, nxv1s64, 64},
+
+                                               {nxv2s64, p0, nxv2s64, 64},
+                                               {nxv4s64, p0, nxv4s64, 64},
+                                               {nxv8s64, p0, nxv8s64, 64}});
+
+  LoadStoreActions.widenScalarToNextPow2(0, /* MinSize = */ 8)
+      .lowerIfMemSizeNotByteSizePow2()
+      // we will take the custom lowering logic if we have scalable vector types
+      // with non-standard alignments
+      .customIf(LegalityPredicate(
+          LegalityPredicates::any(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+                                  typeIsLegalPtrVec(0, PtrVecTys, ST))))
+      .clampScalar(0, s32, sXLen)
+      .lower();
+
   ExtLoadActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen).lower();
 
   getActionDefinitionsBuilder({G_PTR_ADD, G_PTRMASK}).legalFor({{p0, sXLen}});
@@ -651,6 +708,46 @@ bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
   return true;
 }
 
+bool RISCVLegalizerInfo::legalizeLoadStore(MachineInstr &MI,
+                                           LegalizerHelper &Helper,
+                                           MachineIRBuilder &MIB) const {
+  assert((isa<GLoad>(MI) || isa<GStore>(MI)) &&
+         "Machine instructions must be Load/Store.");
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  MachineFunction *MF = MI.getMF();
+  const DataLayout &DL = MIB.getDataLayout();
+  LLVMContext &Ctx = MF->getFunction().getContext();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DataTy = MRI.getType(DstReg);
+  if (!DataTy.isVector())
+    return false;
+
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+
+  const auto *TLI = STI.getTargetLowering();
+  EVT VT = EVT::getEVT(getTypeForLLT(DataTy, Ctx));
+
+  if (TLI->allowsMemoryAccessForAlignment(Ctx, DL, VT, *MMO))
+    return true;
+
+  unsigned EltSizeBits = DataTy.getScalarSizeInBits();
+  assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+         "Unexpected unaligned RVV load type");
+
+  // Calculate the new vector type with i8 elements
+  unsigned NumElements =
+      DataTy.getElementCount().getKnownMinValue() * (EltSizeBits / 8);
+  LLT NewDataTy = LLT::scalable_vector(NumElements, 8);
+
+  Helper.bitcast(MI, 0, NewDataTy);
+
+  return true;
+}
+
 /// Return the type of the mask type suitable for masking the provided
 /// vector type.  This is simply an i1 element type vector of the same
 /// (possibly scalable) length.
@@ -828,6 +925,9 @@ bool RISCVLegalizerInfo::legalizeCustom(
     return legalizeExt(MI, MIRBuilder);
   case TargetOpcode::G_SPLAT_VECTOR:
     return legalizeSplatVector(MI, MIRBuilder);
+  case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_STORE:
+    return legalizeLoadStore(MI, Helper, MIRBuilder);
   }
 
   llvm_unreachable("expected switch to return");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index 5bb1e7a728278..2fc28615e7630 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
 #define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
 
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/Register.h"
 
@@ -45,6 +46,8 @@ class RISCVLegalizerInfo : public LegalizerInfo {
   bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
   bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
   bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
+  bool legalizeLoadStore(MachineInstr &MI, LegalizerHelper &Helper,
+                         MachineIRBuilder &MIB) const;
 };
 } // end namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index d25e96525399e..5369be24f0e7c 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -112,51 +112,6 @@ using namespace llvm;
 RISCVRegisterBankInfo::RISCVRegisterBankInfo(unsigned HwMode)
     : RISCVGenRegisterBankInfo(HwMode) {}
 
-const RegisterBank &
-RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                              LLT Ty) const {
-  switch (RC.getID()) {
-  default:
-    llvm_unreachable("Register class not supported");
-  case RISCV::GPRRegClassID:
-  case RISCV::GPRF16RegClassID:
-  case RISCV::GPRF32RegClassID:
-  case RISCV::GPRNoX0RegClassID:
-  case RISCV::GPRNoX0X2RegClassID:
-  case RISCV::GPRJALRRegClassID:
-  case RISCV::GPRJALRNonX7RegClassID:
-  case RISCV::GPRTCRegClassID:
-  case RISCV::GPRTCNonX7RegClassID:
-  case RISCV::GPRC_and_GPRTCRegClassID:
-  case RISCV::GPRCRegClassID:
-  case RISCV::GPRC_and_SR07RegClassID:
-  case RISCV::SR07RegClassID:
-  case RISCV::SPRegClassID:
-  case RISCV::GPRX0RegClassID:
-    return getRegBank(RISCV::GPRBRegBankID);
-  case RISCV::FPR64RegClassID:
-  case RISCV::FPR16RegClassID:
-  case RISCV::FPR32RegClassID:
-  case RISCV::FPR64CRegClassID:
-  case RISCV::FPR32CRegClassID:
-    return getRegBank(RISCV::FPRBRegBankID);
-  case RISCV::VMRegClassID:
-  case RISCV::VRRegClassID:
-  case RISCV::VRNoV0RegClassID:
-  case RISCV::VRM2RegClassID:
-  case RISCV::VRM2NoV0RegClassID:
-  case RISCV::VRM4RegClassID:
-  case RISCV::VRM4NoV0RegClassID:
-  case RISCV::VMV0RegClassID:
-  case RISCV::VRM2_with_sub_vrm1_0_in_VMV0RegClassID:
-  case RISCV::VRM4_with_sub_vrm1_0_in_VMV0RegClassID:
-  case RISCV::VRM8RegClassID:
-  case RISCV::VRM8NoV0RegClassID:
-  case RISCV::VRM8_with_sub_vrm1_0_in_VMV0RegClassID:
-    return getRegBank(RISCV::VRBRegBankID);
-  }
-}
-
 static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) {
   unsigned Idx;
   switch (Size) {
@@ -355,10 +310,19 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   switch (Opc) {
   case TargetOpcode::G_LOAD: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    OpdsMapping[0] = GPRValueMapping;
+    TypeSize Size = Ty.getSizeInBits();
+
     OpdsMapping[1] = GPRValueMapping;
+
+    if (Ty.isVector()) {
+      OpdsMapping[0] = getVRBValueMapping(Size.getKnownMinValue());
+      break;
+    }
+
+    OpdsMapping[0] = GPRValueMapping;
+
     // Use FPR64 for s64 loads on rv32.
-    if (GPRSize == 32 && Ty.getSizeInBits() == 64) {
+    if (GPRSize == 32 && Size.getFixedValue() == 64) {
       assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD());
       OpdsMapping[0] = getFPValueMapping(Ty.getSizeInBits());
       break;
@@ -378,10 +342,19 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case TargetOpcode::G_STORE: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    OpdsMapping[0] = GPRValueMapping;
+    TypeSize Size = Ty.getSizeInBits();
+
     OpdsMapping[1] = GPRValueMapping;
+
+    if (Ty.isVector()) {
+      OpdsMapping[0] = getVRBValueMapping(Size.getKnownMinValue());
+      break;
+    }
+
+    OpdsMapping[0] = GPRValueMapping;
+
     // Use FPR64 for s64 stores on rv32.
-    if (GPRSize == 32 && Ty.getSizeInBits() == 64) {
+    if (GPRSize == 32 && Size.getFixedValue() == 64) {
       assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD());
       OpdsMapping[0] = getFPValueMapping(Ty.getSizeInBits());
       break;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
index abd0837395f66..79dddb73a2373 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
@@ -33,9 +33,6 @@ class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo {
 public:
   RISCVRegisterBankInfo(unsigned HwMode);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT Ty) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index cb64b64c503e9..237a8eaca05a0 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -560,11 +560,7 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm,
   if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined())
     return false;
 
-  auto *Writer = Asm.getWriterPtr();
-  if (!Writer)
-    return false;
-
-  bool IsResolved = Writer->isSymbolRefDifferenceFullyResolvedImpl(
+  bool IsResolved = Asm.getWriter().isSymbolRefDifferenceFullyResolvedImpl(
       Asm, SA, *AUIPCDF, false, true);
   if (!IsResolved)
     return false;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 87c5a756e0258..26a3ee43ef7cf 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -19,7 +19,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCValue.h"
@@ -87,10 +87,10 @@ void RISCVTargetELFStreamer::finishAttributeSection() {
 
 void RISCVTargetELFStreamer::finish() {
   RISCVTargetStreamer::finish();
-  MCAssembler &MCA = getStreamer().getAssembler();
+  ELFObjectWriter &W = getStreamer().getWriter();
   RISCVABI::ABI ABI = getTargetABI();
 
-  unsigned EFlags = MCA.getELFHeaderEFlags();
+  unsigned EFlags = W.getELFHeaderEFlags();
 
   if (hasRVC())
     EFlags |= ELF::EF_RISCV_RVC;
@@ -117,7 +117,7 @@ void RISCVTargetELFStreamer::finish() {
     llvm_unreachable("Improperly initialised target ABI");
   }
 
-  MCA.setELFHeaderEFlags(EFlags);
+  W.setELFHeaderEFlags(EFlags);
 }
 
 void RISCVTargetELFStreamer::reset() {
@@ -132,7 +132,6 @@ void RISCVTargetELFStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) {
 void RISCVELFStreamer::reset() {
   static_cast<RISCVTargetStreamer *>(getTargetStreamer())->reset();
   MCELFStreamer::reset();
-  MappingSymbolCounter = 0;
   LastMappingSymbols.clear();
   LastEMS = EMS_None;
 }
@@ -152,8 +151,7 @@ void RISCVELFStreamer::emitInstructionsMappingSymbol() {
 }
 
 void RISCVELFStreamer::emitMappingSymbol(StringRef Name) {
-  auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
-      Name + "." + Twine(MappingSymbolCounter++)));
+  auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
   emitLabel(Symbol);
   Symbol->setType(ELF::STT_NOTYPE);
   Symbol->setBinding(ELF::STB_LOCAL);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 40c6b5ac3dcc8..b06760859483d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -22,7 +22,6 @@ class RISCVELFStreamer : public MCELFStreamer {
 
   enum ElfMappingSymbol { EMS_None, EMS_Instructions, EMS_Data };
 
-  int64_t MappingSymbolCounter = 0;
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS = EMS_None;
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 691a5892ae827..e051312d61a7b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -110,10 +110,9 @@ createRISCVObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
   return nullptr;
 }
 
-static MCTargetStreamer *createRISCVAsmTargetStreamer(MCStreamer &S,
-                                                      formatted_raw_ostream &OS,
-                                                      MCInstPrinter *InstPrint,
-                                                      bool isVerboseAsm) {
+static MCTargetStreamer *
+createRISCVAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                             MCInstPrinter *InstPrint) {
   return new RISCVTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index c0a4d0e9c520f..93677433c0440 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -84,8 +84,8 @@ class RISCVAsmPrinter : public AsmPrinter {
 
   // Returns whether Inst is compressed.
   bool EmitToStreamer(MCStreamer &S, const MCInst &Inst);
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
+
+  bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
 
   typedef std::tuple<unsigned, uint32_t> HwasanMemaccessTuple;
   std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
@@ -291,9 +291,10 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
   emitNTLHint(MI);
 
   // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    EmitToStreamer(*OutStreamer, OutInst);
     return;
-
+  }
 
   switch (MI->getOpcode()) {
   case RISCV::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
@@ -975,7 +976,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
     if (hasVLOutput && OpNo == 1)
       continue;
 
-    // Skip merge op. It should be the first operand after the defs.
+    // Skip passthru op. It should be the first operand after the defs.
     if (OpNo == MI->getNumExplicitDefs() && MO.isReg() && MO.isTied()) {
       assert(MCID.getOperandConstraint(OpNo, MCOI::TIED_TO) == 0 &&
              "Expected tied to first def.");
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 0a66a38f6d5ab..be2e880ecd3a9 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -187,25 +187,10 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
   auto *VTy = cast<VectorType>(II.getType());
 
   IRBuilder<> Builder(&II);
-
-  // Extend VL from i32 to XLen if needed.
-  if (ST->is64Bit())
-    VL = Builder.CreateZExt(VL, Builder.getInt64Ty());
-
   Type *STy = VTy->getElementType();
   Value *Val = Builder.CreateLoad(STy, BasePtr);
-  const auto &TLI = *ST->getTargetLowering();
-  Value *Res;
-
-  // TODO: Also support fixed/illegal vector types to splat with evl = vl.
-  if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
-    unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
-                                              : Intrinsic::riscv_vmv_v_x;
-    Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
-                                  {PoisonValue::get(VTy), Val, VL});
-  } else {
-    Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
-  }
+  Value *Res = Builder.CreateIntrinsic(Intrinsic::experimental_vp_splat, {VTy},
+                                       {Val, II.getOperand(2), VL});
 
   II.replaceAllUsesWith(Res);
   II.eraseFromParent();
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index c9979b2b36fc3..e278fa3fe3176 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -37,6 +37,15 @@ class RISCVExtension<string name, int major, int minor, string desc,
   bit Experimental = false;
 }
 
+// The groupID/bitmask of RISCVExtension is used to retrieve a specific bit value 
+// from __riscv_feature_bits based on the groupID and bitmask.
+// groupID - groupID of extension
+// bitPos  - bit position of extension bitmask
+class RISCVExtensionBitmask<bits<3> groupID, int bitPos> {
+    int GroupID = groupID;
+    int BitPos = bitPos;
+}
+
 // Version of RISCVExtension to be used for Experimental extensions. This
 // sets the Experimental flag and prepends experimental- to the -mattr name.
 class RISCVExperimentalExtension<string name, int major, int minor, string desc,
@@ -52,7 +61,8 @@ class RISCVExperimentalExtension<string name, int major, int minor, string desc,
 
 def FeatureStdExtI
     : RISCVExtension<"i", 2, 1,
-                     "'I' (Base Integer Instruction Set)">;
+                     "'I' (Base Integer Instruction Set)">,
+      RISCVExtensionBitmask<0, 8>;
 
 def FeatureStdExtE
     : RISCVExtension<"e", 2, 0,
@@ -78,7 +88,8 @@ def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">,
 
 def FeatureStdExtZicboz
     : RISCVExtension<"zicboz", 1, 0,
-                     "'Zicboz' (Cache-Block Zero Instructions)">;
+                     "'Zicboz' (Cache-Block Zero Instructions)">,
+      RISCVExtensionBitmask<0, 37>;
 def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">,
                       AssemblerPredicate<(all_of FeatureStdExtZicboz),
                           "'Zicboz' (Cache-Block Zero Instructions)">;
@@ -113,7 +124,8 @@ def FeatureStdExtZicntr
 
 def FeatureStdExtZicond
     : RISCVExtension<"zicond", 1, 0,
-                     "'Zicond' (Integer Conditional Operations)">;
+                     "'Zicond' (Integer Conditional Operations)">,
+      RISCVExtensionBitmask<0, 38>;
 def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">,
                       AssemblerPredicate<(all_of FeatureStdExtZicond),
                           "'Zicond' (Integer Conditional Operations)">;
@@ -127,14 +139,16 @@ def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">,
 
 def FeatureStdExtZihintpause
     : RISCVExtension<"zihintpause", 2, 0,
-                     "'Zihintpause' (Pause Hint)">;
+                     "'Zihintpause' (Pause Hint)">,
+      RISCVExtensionBitmask<0, 40>;
 def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">,
                            AssemblerPredicate<(all_of FeatureStdExtZihintpause),
                                               "'Zihintpause' (Pause Hint)">;
 
 def FeatureStdExtZihintntl
     : RISCVExtension<"zihintntl", 1, 0,
-                     "'Zihintntl' (Non-Temporal Locality Hints)">;
+                     "'Zihintntl' (Non-Temporal Locality Hints)">,
+      RISCVExtensionBitmask<0, 39>;
 def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">,
                          AssemblerPredicate<(all_of FeatureStdExtZihintntl),
                              "'Zihintntl' (Non-Temporal Locality Hints)">;
@@ -181,7 +195,8 @@ def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">,
 def FeatureStdExtM
     : RISCVExtension<"m", 2, 0,
                      "'M' (Integer Multiplication and Division)",
-                     [FeatureStdExtZmmul]>;
+                     [FeatureStdExtZmmul]>,
+      RISCVExtensionBitmask<0, 12>;
 def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
                  AssemblerPredicate<(all_of FeatureStdExtM),
                      "'M' (Integer Multiplication and Division)">;
@@ -190,14 +205,16 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
 
 def FeatureStdExtA
     : RISCVExtension<"a", 2, 1,
-                     "'A' (Atomic Instructions)">;
+                     "'A' (Atomic Instructions)">,
+      RISCVExtensionBitmask<0, 0>;
 def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">,
                  AssemblerPredicate<(all_of FeatureStdExtA),
                                     "'A' (Atomic Instructions)">;
 
 def FeatureStdExtZtso
     : RISCVExtension<"ztso", 1, 0,
-                     "'Ztso' (Memory Model - Total Store Order)">;
+                     "'Ztso' (Memory Model - Total Store Order)">,
+      RISCVExtensionBitmask<0, 47>;
 def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">,
                     AssemblerPredicate<(all_of FeatureStdExtZtso),
                         "'Ztso' (Memory Model - Total Store Order)">;
@@ -226,8 +243,9 @@ def HasStdExtZabha : Predicate<"Subtarget->hasStdExtZabha()">,
                          "'Zabha' (Byte and Halfword Atomic Memory Operations)">;
 
 def FeatureStdExtZacas
-    : RISCVExtension<"zacas", 1, 0,
-                     "'Zacas' (Atomic Compare-And-Swap Instructions)">;
+    : RISCVExperimentalExtension<"zacas", 1, 0,
+                                 "'Zacas' (Atomic Compare-And-Swap Instructions)">,
+      RISCVExtensionBitmask<0, 26>;
 def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">,
                      AssemblerPredicate<(all_of FeatureStdExtZacas),
                          "'Zacas' (Atomic Compare-And-Swap Instructions)">;
@@ -264,7 +282,8 @@ def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">,
 def FeatureStdExtF
     : RISCVExtension<"f", 2, 2,
                      "'F' (Single-Precision Floating-Point)",
-                     [FeatureStdExtZicsr]>;
+                     [FeatureStdExtZicsr]>,
+      RISCVExtensionBitmask<0, 5>;
 def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">,
                  AssemblerPredicate<(all_of FeatureStdExtF),
                                     "'F' (Single-Precision Floating-Point)">;
@@ -272,7 +291,8 @@ def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">,
 def FeatureStdExtD
     : RISCVExtension<"d", 2, 2,
                      "'D' (Double-Precision Floating-Point)",
-                     [FeatureStdExtF]>;
+                     [FeatureStdExtF]>,
+      RISCVExtensionBitmask<0, 3>;
 def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
                  AssemblerPredicate<(all_of FeatureStdExtD),
                                     "'D' (Double-Precision Floating-Point)">;
@@ -280,7 +300,8 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
 def FeatureStdExtZfhmin
     : RISCVExtension<"zfhmin", 1, 0,
                      "'Zfhmin' (Half-Precision Floating-Point Minimal)",
-                     [FeatureStdExtF]>;
+                     [FeatureStdExtF]>,
+      RISCVExtensionBitmask<0, 36>;
 def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
                       AssemblerPredicate<(all_of FeatureStdExtZfhmin),
                           "'Zfh' (Half-Precision Floating-Point) or "
@@ -289,7 +310,8 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
 def FeatureStdExtZfh
     : RISCVExtension<"zfh", 1, 0,
                      "'Zfh' (Half-Precision Floating-Point)",
-                     [FeatureStdExtZfhmin]>;
+                     [FeatureStdExtZfhmin]>,
+      RISCVExtensionBitmask<0, 35>;
 def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
                    AssemblerPredicate<(all_of FeatureStdExtZfh),
                        "'Zfh' (Half-Precision Floating-Point)">;
@@ -313,7 +335,8 @@ def HasHalfFPLoadStoreMove
 def FeatureStdExtZfa
     : RISCVExtension<"zfa", 1, 0,
                      "'Zfa' (Additional Floating-Point)",
-                     [FeatureStdExtF]>;
+                     [FeatureStdExtF]>,
+      RISCVExtensionBitmask<0, 34>;
 def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">,
                    AssemblerPredicate<(all_of FeatureStdExtZfa),
                                       "'Zfa' (Additional Floating-Point)">;
@@ -356,7 +379,8 @@ def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">;
 
 def FeatureStdExtC
     : RISCVExtension<"c", 2, 0,
-                     "'C' (Compressed Instructions)">;
+                     "'C' (Compressed Instructions)">,
+      RISCVExtensionBitmask<0, 2>;
 def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
                  AssemblerPredicate<(all_of FeatureStdExtC),
                                     "'C' (Compressed Instructions)">;
@@ -445,7 +469,8 @@ def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">,
 
 def FeatureStdExtZba
     : RISCVExtension<"zba", 1, 0,
-                     "'Zba' (Address Generation Instructions)">;
+                     "'Zba' (Address Generation Instructions)">,
+      RISCVExtensionBitmask<0, 27>;
 def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">,
                    AssemblerPredicate<(all_of FeatureStdExtZba),
                                       "'Zba' (Address Generation Instructions)">;
@@ -453,7 +478,8 @@ def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">;
 
 def FeatureStdExtZbb
     : RISCVExtension<"zbb", 1, 0,
-                     "'Zbb' (Basic Bit-Manipulation)">;
+                     "'Zbb' (Basic Bit-Manipulation)">,
+      RISCVExtensionBitmask<0, 28>;
 def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbb),
                                       "'Zbb' (Basic Bit-Manipulation)">;
@@ -462,14 +488,16 @@ def NoStdExtZbb : Predicate<"!Subtarget->hasStdExtZbb()">,
 
 def FeatureStdExtZbc
     : RISCVExtension<"zbc", 1, 0,
-                     "'Zbc' (Carry-Less Multiplication)">;
+                     "'Zbc' (Carry-Less Multiplication)">,
+      RISCVExtensionBitmask<0, 29>;
 def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbc),
                                       "'Zbc' (Carry-Less Multiplication)">;
 
 def FeatureStdExtZbs
     : RISCVExtension<"zbs", 1, 0,
-                     "'Zbs' (Single-Bit Instructions)">;
+                     "'Zbs' (Single-Bit Instructions)">,
+      RISCVExtensionBitmask<0, 33>;
 def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbs),
                                       "'Zbs' (Single-Bit Instructions)">;
@@ -486,14 +514,16 @@ def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
 
 def FeatureStdExtZbkb
     : RISCVExtension<"zbkb", 1, 0,
-                     "'Zbkb' (Bitmanip instructions for Cryptography)">;
+                     "'Zbkb' (Bitmanip instructions for Cryptography)">,
+      RISCVExtensionBitmask<0, 30>;
 def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZbkb),
                         "'Zbkb' (Bitmanip instructions for Cryptography)">;
 
 def FeatureStdExtZbkx
     : RISCVExtension<"zbkx", 1, 0,
-                     "'Zbkx' (Crossbar permutation instructions)">;
+                     "'Zbkx' (Crossbar permutation instructions)">,
+      RISCVExtensionBitmask<0, 32>;
 def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">,
                     AssemblerPredicate<(all_of FeatureStdExtZbkx),
                         "'Zbkx' (Crossbar permutation instructions)">;
@@ -510,7 +540,8 @@ def HasStdExtZbbOrZbkb
 def FeatureStdExtZbkc
     : RISCVExtension<"zbkc", 1, 0,
                      "'Zbkc' (Carry-less multiply instructions for "
-                     "Cryptography)">;
+                     "Cryptography)">,
+      RISCVExtensionBitmask<0, 31>;
 def HasStdExtZbkc
     : Predicate<"Subtarget->hasStdExtZbkc()">,
       AssemblerPredicate<(all_of FeatureStdExtZbkc),
@@ -527,14 +558,16 @@ def HasStdExtZbcOrZbkc
 
 def FeatureStdExtZknd
     : RISCVExtension<"zknd", 1, 0,
-                     "'Zknd' (NIST Suite: AES Decryption)">;
+                     "'Zknd' (NIST Suite: AES Decryption)">,
+      RISCVExtensionBitmask<0, 41>;
 def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">,
                     AssemblerPredicate<(all_of FeatureStdExtZknd),
                                        "'Zknd' (NIST Suite: AES Decryption)">;
 
 def FeatureStdExtZkne
     : RISCVExtension<"zkne", 1, 0,
-                     "'Zkne' (NIST Suite: AES Encryption)">;
+                     "'Zkne' (NIST Suite: AES Encryption)">,
+      RISCVExtensionBitmask<0, 42>;
 def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">,
                     AssemblerPredicate<(all_of FeatureStdExtZkne),
                                        "'Zkne' (NIST Suite: AES Encryption)">;
@@ -549,21 +582,24 @@ def HasStdExtZkndOrZkne
 
 def FeatureStdExtZknh
     : RISCVExtension<"zknh", 1, 0,
-                     "'Zknh' (NIST Suite: Hash Function Instructions)">;
+                     "'Zknh' (NIST Suite: Hash Function Instructions)">,
+      RISCVExtensionBitmask<0, 43>;
 def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">,
                     AssemblerPredicate<(all_of FeatureStdExtZknh),
                         "'Zknh' (NIST Suite: Hash Function Instructions)">;
 
 def FeatureStdExtZksed
     : RISCVExtension<"zksed", 1, 0,
-                     "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
+                     "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">,
+      RISCVExtensionBitmask<0, 44>;
 def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">,
                      AssemblerPredicate<(all_of FeatureStdExtZksed),
                          "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
 
 def FeatureStdExtZksh
     : RISCVExtension<"zksh", 1, 0,
-                     "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
+                     "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">,
+      RISCVExtensionBitmask<0, 45>;
 def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">,
                     AssemblerPredicate<(all_of FeatureStdExtZksh),
                         "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
@@ -596,7 +632,8 @@ def FeatureStdExtZks
 
 def FeatureStdExtZkt
     : RISCVExtension<"zkt", 1, 0,
-                     "'Zkt' (Data Independent Execution Latency)">;
+                     "'Zkt' (Data Independent Execution Latency)">,
+      RISCVExtensionBitmask<0, 46>;
 
 def FeatureStdExtZk
     : RISCVExtension<"zk", 1, 0,
@@ -626,6 +663,7 @@ def FeatureStdExtZve32x
                      "with maximal 32 EEW)",
                      [FeatureStdExtZicsr, FeatureStdExtZvl32b]>;
 
+
 def FeatureStdExtZve32f
     : RISCVExtension<"zve32f", 1, 0,
                      "'Zve32f' (Vector Extensions for Embedded Processors "
@@ -653,7 +691,8 @@ def FeatureStdExtZve64d
 def FeatureStdExtV
     : RISCVExtension<"v", 1, 0,
                      "'V' (Vector Extension for Application Processors)",
-                     [FeatureStdExtZvl128b, FeatureStdExtZve64d]>;
+                     [FeatureStdExtZvl128b, FeatureStdExtZve64d]>,
+      RISCVExtensionBitmask<0, 21>;
 
 def FeatureStdExtZvfbfmin
     : RISCVExtension<"zvfbfmin", 1, 0, "'Zvbfmin' (Vector BF16 Converts)",
@@ -673,12 +712,14 @@ def HasStdExtZvfbfwma : Predicate<"Subtarget->hasStdExtZvfbfwma()">,
 def FeatureStdExtZvfhmin
     : RISCVExtension<"zvfhmin", 1, 0,
                      "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)",
-                     [FeatureStdExtZve32f]>;
+                     [FeatureStdExtZve32f]>,
+      RISCVExtensionBitmask<0, 51>;
 
 def FeatureStdExtZvfh
     : RISCVExtension<"zvfh", 1, 0,
                      "'Zvfh' (Vector Half-Precision Floating-Point)",
-                     [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>;
+                     [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>,
+      RISCVExtensionBitmask<0, 50>;
 
 def HasStdExtZfhOrZvfh
     : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZvfh()">,
@@ -690,7 +731,8 @@ def HasStdExtZfhOrZvfh
 
 def FeatureStdExtZvkb
     : RISCVExtension<"zvkb", 1, 0,
-                     "'Zvkb' (Vector Bit-manipulation used in Cryptography)">;
+                     "'Zvkb' (Vector Bit-manipulation used in Cryptography)">,
+      RISCVExtensionBitmask<0, 52>;
 def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvkb),
                         "'Zvkb' (Vector Bit-manipulation used in Cryptography)">;
@@ -698,35 +740,40 @@ def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">,
 def FeatureStdExtZvbb
     : RISCVExtension<"zvbb", 1, 0,
                      "'Zvbb' (Vector basic bit-manipulation instructions)",
-                     [FeatureStdExtZvkb]>;
+                     [FeatureStdExtZvkb]>,
+      RISCVExtensionBitmask<0, 48>;
 def HasStdExtZvbb : Predicate<"Subtarget->hasStdExtZvbb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvbb),
                         "'Zvbb' (Vector basic bit-manipulation instructions)">;
 
 def FeatureStdExtZvbc
     : RISCVExtension<"zvbc", 1, 0,
-                     "'Zvbc' (Vector Carryless Multiplication)">;
+                     "'Zvbc' (Vector Carryless Multiplication)">,
+      RISCVExtensionBitmask<0, 49>;
 def HasStdExtZvbc : Predicate<"Subtarget->hasStdExtZvbc()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvbc),
                         "'Zvbc' (Vector Carryless Multiplication)">;
 
 def FeatureStdExtZvkg
     : RISCVExtension<"zvkg", 1, 0,
-                     "'Zvkg' (Vector GCM instructions for Cryptography)">;
+                     "'Zvkg' (Vector GCM instructions for Cryptography)">,
+      RISCVExtensionBitmask<0, 53>;
 def HasStdExtZvkg : Predicate<"Subtarget->hasStdExtZvkg()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvkg),
                         "'Zvkg' (Vector GCM instructions for Cryptography)">;
 
 def FeatureStdExtZvkned
     : RISCVExtension<"zvkned", 1, 0,
-                     "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">;
+                     "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">,
+      RISCVExtensionBitmask<0, 54>;
 def HasStdExtZvkned : Predicate<"Subtarget->hasStdExtZvkned()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvkned),
                           "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">;
 
 def FeatureStdExtZvknha
     : RISCVExtension<"zvknha", 1, 0,
-                     "'Zvknha' (Vector SHA-2 (SHA-256 only))">;
+                     "'Zvknha' (Vector SHA-2 (SHA-256 only))">,
+      RISCVExtensionBitmask<0, 55>;
 def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvknha),
                           "'Zvknha' (Vector SHA-2 (SHA-256 only))">;
@@ -734,7 +781,8 @@ def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">,
 def FeatureStdExtZvknhb
     : RISCVExtension<"zvknhb", 1, 0,
                      "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))",
-                     [FeatureStdExtZve64x]>;
+                     [FeatureStdExtZve64x]>,
+      RISCVExtensionBitmask<0, 56>;
 def HasStdExtZvknhb : Predicate<"Subtarget->hasStdExtZvknhb()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvknhb),
                           "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))">;
@@ -745,21 +793,24 @@ def HasStdExtZvknhaOrZvknhb : Predicate<"Subtarget->hasStdExtZvknha() || Subtarg
 
 def FeatureStdExtZvksed
     : RISCVExtension<"zvksed", 1, 0,
-                     "'Zvksed' (SM4 Block Cipher Instructions)">;
+                     "'Zvksed' (SM4 Block Cipher Instructions)">,
+      RISCVExtensionBitmask<0, 57>;
 def HasStdExtZvksed : Predicate<"Subtarget->hasStdExtZvksed()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvksed),
                           "'Zvksed' (SM4 Block Cipher Instructions)">;
 
 def FeatureStdExtZvksh
     : RISCVExtension<"zvksh", 1, 0,
-                     "'Zvksh' (SM3 Hash Function Instructions)">;
+                     "'Zvksh' (SM3 Hash Function Instructions)">,
+      RISCVExtensionBitmask<0, 58>;
 def HasStdExtZvksh : Predicate<"Subtarget->hasStdExtZvksh()">,
                      AssemblerPredicate<(all_of FeatureStdExtZvksh),
                          "'Zvksh' (SM3 Hash Function Instructions)">;
 
 def FeatureStdExtZvkt
     : RISCVExtension<"zvkt", 1, 0,
-                     "'Zvkt' (Vector Data-Independent Execution Latency)">;
+                     "'Zvkt' (Vector Data-Independent Execution Latency)">,
+      RISCVExtensionBitmask<0, 59>;
 
 // Zvk short-hand extensions
 
@@ -796,7 +847,6 @@ def FeatureStdExtZvksg
     : RISCVExtension<"zvksg", 1, 0,
                      "'Zvksg' (shorthand for 'Zvks' and 'Zvkg')",
                      [FeatureStdExtZvks, FeatureStdExtZvkg]>;
-
 // Vector instruction predicates
 
 def HasVInstructions    : Predicate<"Subtarget->hasVInstructions()">,
@@ -820,7 +870,7 @@ def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minima
                          "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal) or "
                          "'Zvfh' (Vector Half-Precision Floating-Point)">;
 
-def HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">;
+def HasVInstructionsBF16Minimal : Predicate<"Subtarget->hasVInstructionsBF16Minimal()">;
 def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">;
 def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">;
 
@@ -1320,9 +1370,7 @@ def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()"
 def NoConditionalMoveFusion  : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
 
 def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
-                                   "SiFive 7-Series processors",
-                                   [TuneNoDefaultUnroll,
-                                    TuneShortForwardBranchOpt]>;
+                                   "SiFive 7-Series processors">;
 
 def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
                                          "Ventana Veyron-Series processors">;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index e676c2f94583d..7abd5a49a1b5f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1535,6 +1535,7 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
   const MachineFrameInfo &MFI = MF->getFrameInfo();
   RISCVMachineFunctionInfo *RVFI = MF->getInfo<RISCVMachineFunctionInfo>();
   const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo());
@@ -1554,12 +1555,22 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
     // Insert the spill to the stack frame.
     int FI = CS.getFrameIdx();
     if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) {
-      unsigned CFIIndex = MF->addFrameInst(
-          createDefCFAOffset(*STI.getRegisterInfo(), CS.getReg(), -FixedSize,
-                             MFI.getObjectOffset(FI) / 8));
-      BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlag(MachineInstr::FrameSetup);
+      MCRegister BaseReg = TRI.getSubReg(CS.getReg(), RISCV::sub_vrm1_0);
+      // If it's not a grouped vector register, it doesn't have subregister, so
+      // the base register is just itself.
+      if (BaseReg == RISCV::NoRegister)
+        BaseReg = CS.getReg();
+      unsigned NumRegs = RISCV::VRRegClass.contains(CS.getReg())     ? 1
+                         : RISCV::VRM2RegClass.contains(CS.getReg()) ? 2
+                         : RISCV::VRM4RegClass.contains(CS.getReg()) ? 4
+                                                                     : 8;
+      for (unsigned i = 0; i < NumRegs; ++i) {
+        unsigned CFIIndex = MF->addFrameInst(createDefCFAOffset(
+            TRI, BaseReg + i, -FixedSize, MFI.getObjectOffset(FI) / 8 + i));
+        BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
     }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index d9971791a2cfa..881be28bfe79e 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -515,17 +515,23 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
 
   Builder.SetInsertPoint(II);
 
+  Value *EVL = Builder.CreateElementCount(
+      IntegerType::get(Ctx, 32), cast<VectorType>(DataType)->getElementCount());
+
   CallInst *Call;
-  if (II->getIntrinsicID() == Intrinsic::masked_gather)
+  if (II->getIntrinsicID() == Intrinsic::masked_gather) {
     Call = Builder.CreateIntrinsic(
-        Intrinsic::riscv_masked_strided_load,
+        Intrinsic::experimental_vp_strided_load,
         {DataType, BasePtr->getType(), Stride->getType()},
-        {II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
-  else
+        {BasePtr, Stride, II->getArgOperand(2), EVL});
+    Call = Builder.CreateIntrinsic(
+        Intrinsic::vp_select, {DataType},
+        {II->getOperand(2), Call, II->getArgOperand(3), EVL});
+  } else
     Call = Builder.CreateIntrinsic(
-        Intrinsic::riscv_masked_strided_store,
+        Intrinsic::experimental_vp_strided_store,
         {DataType, BasePtr->getType(), Stride->getType()},
-        {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
+        {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL});
 
   Call->takeName(II);
   II->replaceAllUsesWith(Call);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index eef6ae677ac85..4de38db6e1fe9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1393,6 +1393,18 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
           ReplaceNode(Node, SLLI);
           return;
         }
+        // If we have 32 bits in the mask, we can use SLLI_UW instead of SLLI.
+        if (Trailing > 0 && Leading + Trailing == 32 && C2 + Trailing < XLen &&
+            OneUseOrZExtW && Subtarget->hasStdExtZba()) {
+          SDNode *SRLI = CurDAG->getMachineNode(
+              RISCV::SRLI, DL, VT, X,
+              CurDAG->getTargetConstant(C2 + Trailing, DL, VT));
+          SDNode *SLLI_UW = CurDAG->getMachineNode(
+              RISCV::SLLI_UW, DL, VT, SDValue(SRLI, 0),
+              CurDAG->getTargetConstant(Trailing, DL, VT));
+          ReplaceNode(Node, SLLI_UW);
+          return;
+        }
       }
 
       // Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a
@@ -1528,7 +1540,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (tryIndexedLoad(Node))
       return;
 
-    if (Subtarget->hasVendorXCVmem()) {
+    if (Subtarget->hasVendorXCVmem() && !Subtarget->is64Bit()) {
       // We match post-incrementing load here
       LoadSDNode *Load = cast<LoadSDNode>(Node);
       if (Load->getAddressingMode() != ISD::POST_INC)
@@ -3621,7 +3633,7 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
 #endif
 
   SmallVector<SDValue, 8> Ops;
-  // Skip the merge operand at index 0 if !UseTUPseudo.
+  // Skip the passthru operand at index 0 if !UseTUPseudo.
   for (unsigned I = !UseTUPseudo, E = N->getNumOperands(); I != E; I++) {
     // Skip the mask, and the Glue.
     SDValue Op = N->getOperand(I);
@@ -3684,9 +3696,9 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
 // ->
 // %x = PseudoVADD_VV_MASK %false, ..., %mask
 //
-// We can only fold if vmerge's merge operand, vmerge's false operand and
-// %true's merge operand (if it has one) are the same. This is because we have
-// to consolidate them into one merge operand in the result.
+// We can only fold if vmerge's passthru operand, vmerge's false operand and
+// %true's passthru operand (if it has one) are the same. This is because we
+// have to consolidate them into one passthru operand in the result.
 //
 // If %true is masked, then we can use its mask instead of vmerge's if vmerge's
 // mask is all ones.
@@ -3697,12 +3709,12 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
 // The resulting VL is the minimum of the two VLs.
 //
 // The resulting policy is the effective policy the vmerge would have had,
-// i.e. whether or not it's merge operand was implicit-def.
+// i.e. whether or not it's passthru operand was implicit-def.
 bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
-  SDValue Merge, False, True, VL, Mask, Glue;
+  SDValue Passthru, False, True, VL, Mask, Glue;
   // A vmv.v.v is equivalent to a vmerge with an all-ones mask.
   if (IsVMv(N)) {
-    Merge = N->getOperand(0);
+    Passthru = N->getOperand(0);
     False = N->getOperand(0);
     True = N->getOperand(1);
     VL = N->getOperand(2);
@@ -3710,7 +3722,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
     // mask later below.
   } else {
     assert(IsVMerge(N));
-    Merge = N->getOperand(0);
+    Passthru = N->getOperand(0);
     False = N->getOperand(1);
     True = N->getOperand(2);
     Mask = N->getOperand(3);
@@ -3721,9 +3733,13 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   assert(!Mask || cast<RegisterSDNode>(Mask)->getReg() == RISCV::V0);
   assert(!Glue || Glue.getValueType() == MVT::Glue);
 
-  // We require that either merge and false are the same, or that merge
+  // If the EEW of True is different from vmerge's SEW, then we can't fold.
+  if (True.getSimpleValueType() != N->getSimpleValueType(0))
+    return false;
+
+  // We require that either passthru and false are the same, or that passthru
   // is undefined.
-  if (Merge != False && !isImplicitDef(Merge))
+  if (Passthru != False && !isImplicitDef(Passthru))
     return false;
 
   assert(True.getResNo() == 0 &&
@@ -3753,11 +3769,11 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   if (!Info)
     return false;
 
-  // If True has a merge operand then it needs to be the same as vmerge's False,
-  // since False will be used for the result's merge operand.
+  // If True has a passthru operand then it needs to be the same as vmerge's
+  // False, since False will be used for the result's passthru operand.
   if (HasTiedDest && !isImplicitDef(True->getOperand(0))) {
-    SDValue MergeOpTrue = True->getOperand(0);
-    if (False != MergeOpTrue)
+    SDValue PassthruOpTrue = True->getOperand(0);
+    if (False != PassthruOpTrue)
       return false;
   }
 
@@ -3765,7 +3781,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   // 1s mask, since we're going to keep the mask from True.
   if (IsMasked && Mask) {
     // FIXME: Support mask agnostic True instruction which would have an
-    // undef merge operand.
+    // undef passthru operand.
     SDValue TrueMask =
         getMaskSetter(True->getOperand(Info->MaskOpIdx),
                       True->getOperand(True->getNumOperands() - 1));
@@ -3823,8 +3839,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
     return CLHS->getZExtValue() <= CRHS->getZExtValue() ? LHS : RHS;
   };
 
-  // Because N and True must have the same merge operand (or True's operand is
-  // implicit_def), the "effective" body is the minimum of their VLs.
+  // Because N and True must have the same passthru operand (or True's operand
+  // is implicit_def), the "effective" body is the minimum of their VLs.
   SDValue OrigVL = VL;
   VL = GetMinVL(TrueVL, VL);
   if (!VL)
@@ -3883,7 +3899,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
          "Expected instructions with mask have a tied dest.");
 #endif
 
-  // Use a tumu policy, relaxing it to tail agnostic provided that the merge
+  // Use a tumu policy, relaxing it to tail agnostic provided that the passthru
   // operand is undefined.
   //
   // However, if the VL became smaller than what the vmerge had originally, then
@@ -3891,7 +3907,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   // to the tail. In that case we always need to use tail undisturbed to
   // preserve them.
   bool MergeVLShrunk = VL != OrigVL;
-  uint64_t Policy = (isImplicitDef(Merge) && !MergeVLShrunk)
+  uint64_t Policy = (isImplicitDef(Passthru) && !MergeVLShrunk)
                         ? RISCVII::TAIL_AGNOSTIC
                         : /*TUMU*/ 0;
   SDValue PolicyOp =
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 21193ebe1eb94..68b614d1d3fdc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -198,7 +198,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       for (MVT VT : F16VecVTs)
         addRegClassForRVV(VT);
 
-    if (Subtarget.hasVInstructionsBF16())
+    if (Subtarget.hasVInstructionsBF16Minimal())
       for (MVT VT : BF16VecVTs)
         addRegClassForRVV(VT);
 
@@ -250,14 +250,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (RV64LegalI32 && Subtarget.is64Bit())
     setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
 
-  if (!Subtarget.hasVendorXCValu())
-    setCondCodeAction(ISD::SETLE, XLenVT, Expand);
   setCondCodeAction(ISD::SETGT, XLenVT, Custom);
   setCondCodeAction(ISD::SETGE, XLenVT, Expand);
-  if (!Subtarget.hasVendorXCValu())
-    setCondCodeAction(ISD::SETULE, XLenVT, Expand);
   setCondCodeAction(ISD::SETUGT, XLenVT, Custom);
   setCondCodeAction(ISD::SETUGE, XLenVT, Expand);
+  if (!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
+    setCondCodeAction(ISD::SETULE, XLenVT, Expand);
+    setCondCodeAction(ISD::SETLE, XLenVT, Expand);
+  }
 
   if (RV64LegalI32 && Subtarget.is64Bit())
     setOperationAction(ISD::SETCC, MVT::i32, Promote);
@@ -273,7 +273,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
 
-  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb())
+  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() &&
+      !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
     setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
 
   if (Subtarget.is64Bit()) {
@@ -343,7 +344,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     if (Subtarget.is64Bit())
       setOperationAction({ISD::ROTL, ISD::ROTR}, MVT::i32, Custom);
     setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Custom);
-  } else if (Subtarget.hasVendorXCVbitmanip()) {
+  } else if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
     setOperationAction(ISD::ROTL, XLenVT, Expand);
   } else {
     setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Expand);
@@ -366,7 +367,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                            : Expand);
 
 
-  if (Subtarget.hasVendorXCVbitmanip()) {
+  if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
     setOperationAction(ISD::BITREVERSE, XLenVT, Legal);
   } else {
     // Zbkb can use rev8+brev8 to implement bitreverse.
@@ -374,27 +375,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                        Subtarget.hasStdExtZbkb() ? Custom : Expand);
   }
 
-  if (Subtarget.hasStdExtZbb()) {
+  if (Subtarget.hasStdExtZbb() ||
+      (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
     setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,
                        Legal);
     if (RV64LegalI32 && Subtarget.is64Bit())
       setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, MVT::i32,
                          Promote);
+  }
 
+  if (Subtarget.hasStdExtZbb() ||
+      (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
     if (Subtarget.is64Bit()) {
       if (RV64LegalI32)
         setOperationAction(ISD::CTTZ, MVT::i32, Legal);
       else
         setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
     }
-  } else if (!Subtarget.hasVendorXCVbitmanip()) {
+  } else {
     setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
     if (RV64LegalI32 && Subtarget.is64Bit())
       setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
   }
 
   if (Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
-      Subtarget.hasVendorXCVbitmanip()) {
+      (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
     // We need the custom lowering to make sure that the resulting sequence
     // for the 32bit case is efficient on 64bit targets.
     if (Subtarget.is64Bit()) {
@@ -412,13 +417,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::CTLZ, MVT::i32, Expand);
   }
 
-  if (!RV64LegalI32 && Subtarget.is64Bit() &&
-      !Subtarget.hasShortForwardBranchOpt())
-    setOperationAction(ISD::ABS, MVT::i32, Custom);
-
-  // We can use PseudoCCSUB to implement ABS.
-  if (Subtarget.hasShortForwardBranchOpt())
+  if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) {
     setOperationAction(ISD::ABS, XLenVT, Legal);
+  } else {
+    if (!RV64LegalI32 && Subtarget.is64Bit() &&
+        !Subtarget.hasShortForwardBranchOpt())
+      setOperationAction(ISD::ABS, MVT::i32, Custom);
+
+    // We can use PseudoCCSUB to implement ABS.
+    if (Subtarget.hasShortForwardBranchOpt())
+      setOperationAction(ISD::ABS, XLenVT, Legal);
+  }
 
   if (!Subtarget.hasVendorXTHeadCondMov()) {
     setOperationAction(ISD::SELECT, XLenVT, Custom);
@@ -1092,7 +1101,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     }
 
     // TODO: Could we merge some code with zvfhmin?
-    if (Subtarget.hasVInstructionsBF16()) {
+    if (Subtarget.hasVInstructionsBF16Minimal()) {
       for (MVT VT : BF16VecVTs) {
         if (!isTypeLegal(VT))
           continue;
@@ -1439,7 +1448,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     }
   }
 
-  if (Subtarget.hasVendorXCVmem()) {
+  if (Subtarget.hasVendorXCVmem() && !Subtarget.is64Bit()) {
     setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
     setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
     setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
@@ -1449,16 +1458,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
   }
 
-  if (Subtarget.hasVendorXCValu()) {
-    setOperationAction(ISD::ABS, XLenVT, Legal);
-    setOperationAction(ISD::SMIN, XLenVT, Legal);
-    setOperationAction(ISD::UMIN, XLenVT, Legal);
-    setOperationAction(ISD::SMAX, XLenVT, Legal);
-    setOperationAction(ISD::UMAX, XLenVT, Legal);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
-  }
-
   // Function alignments.
   const Align FunctionAlignment(Subtarget.hasStdExtCOrZca() ? 2 : 4);
   setMinFunctionAlignment(FunctionAlignment);
@@ -1473,13 +1472,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setTargetDAGCombine(ISD::SRA);
 
   if (Subtarget.hasStdExtFOrZfinx())
-    setTargetDAGCombine({ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM});
+    setTargetDAGCombine({ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM, ISD::FMUL});
 
   if (Subtarget.hasStdExtZbb())
     setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
 
   if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
-      Subtarget.hasStdExtV())
+      Subtarget.hasVInstructions())
     setTargetDAGCombine(ISD::TRUNCATE);
 
   if (Subtarget.hasStdExtZbkb())
@@ -1622,12 +1621,6 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
     return true;
-  case Intrinsic::riscv_masked_strided_load:
-    return SetRVVLoadStoreInfo(/*PtrOp*/ 1, /*IsStore*/ false,
-                               /*IsUnitStrided*/ false);
-  case Intrinsic::riscv_masked_strided_store:
-    return SetRVVLoadStoreInfo(/*PtrOp*/ 1, /*IsStore*/ true,
-                               /*IsUnitStrided*/ false);
   case Intrinsic::riscv_seg2_load:
   case Intrinsic::riscv_seg3_load:
   case Intrinsic::riscv_seg4_load:
@@ -1841,6 +1834,10 @@ bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
   if (AM.BaseGV)
     return false;
 
+  // None of our addressing modes allows a scalable offset
+  if (AM.ScalableOffset)
+    return false;
+
   // RVV instructions only support register addressing.
   if (Subtarget.hasVInstructions() && isa<VectorType>(Ty))
     return AM.HasBaseReg && AM.Scale == 0 && !AM.BaseOffs;
@@ -1898,7 +1895,7 @@ bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
 bool RISCVTargetLowering::isTruncateFree(SDValue Val, EVT VT2) const {
   EVT SrcVT = Val.getValueType();
   // free truncate from vnsrl and vnsra
-  if (Subtarget.hasStdExtV() &&
+  if (Subtarget.hasVInstructions() &&
       (Val.getOpcode() == ISD::SRL || Val.getOpcode() == ISD::SRA) &&
       SrcVT.isVector() && VT2.isVector()) {
     unsigned SrcBits = SrcVT.getVectorElementType().getSizeInBits();
@@ -1934,12 +1931,13 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const {
 }
 
 bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
-  return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXCVbitmanip();
+  return Subtarget.hasStdExtZbb() ||
+         (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
 }
 
 bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
-         Subtarget.hasVendorXCVbitmanip();
+         (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
 }
 
 bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial(
@@ -2643,7 +2641,7 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
       return false;
     break;
   case MVT::bf16:
-    if (!Subtarget.hasVInstructionsBF16())
+    if (!Subtarget.hasVInstructionsBF16Minimal())
       return false;
     break;
   case MVT::f32:
@@ -2764,19 +2762,6 @@ static SDValue getAllOnesMask(MVT VecVT, SDValue VL, const SDLoc &DL,
   return DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
 }
 
-static SDValue getVLOp(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
-                       SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
-  // If we know the exact VLEN, and our VL is exactly equal to VLMAX,
-  // canonicalize the representation.  InsertVSETVLI will pick the immediate
-  // encoding later if profitable.
-  const auto [MinVLMAX, MaxVLMAX] =
-      RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
-  if (MinVLMAX == MaxVLMAX && NumElts == MinVLMAX)
-    return DAG.getRegister(RISCV::X0, Subtarget.getXLenVT());
-
-  return DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
-}
-
 static std::pair<SDValue, SDValue>
 getDefaultScalableVLOps(MVT VecVT, const SDLoc &DL, SelectionDAG &DAG,
                         const RISCVSubtarget &Subtarget) {
@@ -2790,7 +2775,7 @@ static std::pair<SDValue, SDValue>
 getDefaultVLOps(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
                 SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
   assert(ContainerVT.isScalableVector() && "Expecting scalable container type");
-  SDValue VL = getVLOp(NumElts, ContainerVT, DL, DAG, Subtarget);
+  SDValue VL = DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
   SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG);
   return {Mask, VL};
 }
@@ -2955,10 +2940,6 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
   if (SatVT != DstEltVT)
     return SDValue();
 
-  // FIXME: Don't support narrowing by more than 1 steps for now.
-  if (SrcEltSize > (2 * DstEltSize))
-    return SDValue();
-
   MVT DstContainerVT = DstVT;
   MVT SrcContainerVT = SrcVT;
   if (DstVT.isFixedLengthVector()) {
@@ -2986,9 +2967,25 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
     Src = DAG.getNode(RISCVISD::FP_EXTEND_VL, DL, InterVT, Src, Mask, VL);
   }
 
+  MVT CvtContainerVT = DstContainerVT;
+  MVT CvtEltVT = DstEltVT;
+  if (SrcEltSize > (2 * DstEltSize)) {
+    CvtEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
+    CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
+  }
+
   unsigned RVVOpc =
       IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL;
-  SDValue Res = DAG.getNode(RVVOpc, DL, DstContainerVT, Src, Mask, VL);
+  SDValue Res = DAG.getNode(RVVOpc, DL, CvtContainerVT, Src, Mask, VL);
+
+  while (CvtContainerVT != DstContainerVT) {
+    CvtEltVT = MVT::getIntegerVT(CvtEltVT.getSizeInBits() / 2);
+    CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
+    // Rounding mode here is arbitrary since we aren't shifting out any bits.
+    unsigned ClipOpc = IsSigned ? RISCVISD::TRUNCATE_VECTOR_VL_SSAT
+                                : RISCVISD::TRUNCATE_VECTOR_VL_USAT;
+    Res = DAG.getNode(ClipOpc, DL, CvtContainerVT, Res, Mask, VL);
+  }
 
   SDValue SplatZero = DAG.getNode(
       RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT),
@@ -3300,25 +3297,25 @@ static SDValue lowerVectorXRINT(SDValue Op, SelectionDAG &DAG,
 
 static SDValue
 getVSlidedown(SelectionDAG &DAG, const RISCVSubtarget &Subtarget,
-              const SDLoc &DL, EVT VT, SDValue Merge, SDValue Op,
+              const SDLoc &DL, EVT VT, SDValue Passthru, SDValue Op,
               SDValue Offset, SDValue Mask, SDValue VL,
               unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
-  if (Merge.isUndef())
+  if (Passthru.isUndef())
     Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
   SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
-  SDValue Ops[] = {Merge, Op, Offset, Mask, VL, PolicyOp};
+  SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
   return DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VT, Ops);
 }
 
 static SDValue
 getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
-            EVT VT, SDValue Merge, SDValue Op, SDValue Offset, SDValue Mask,
+            EVT VT, SDValue Passthru, SDValue Op, SDValue Offset, SDValue Mask,
             SDValue VL,
             unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
-  if (Merge.isUndef())
+  if (Passthru.isUndef())
     Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
   SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
-  SDValue Ops[] = {Merge, Op, Offset, Mask, VL, PolicyOp};
+  SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
   return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
 }
 
@@ -3336,8 +3333,8 @@ struct VIDSequence {
   int64_t Addend;
 };
 
-static std::optional<uint64_t> getExactInteger(const APFloat &APF,
-                                               uint32_t BitWidth) {
+static std::optional<APInt> getExactInteger(const APFloat &APF,
+                                            uint32_t BitWidth) {
   // We will use a SINT_TO_FP to materialize this constant so we should use a
   // signed APSInt here.
   APSInt ValInt(BitWidth, /*IsUnsigned*/ false);
@@ -3353,7 +3350,7 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
        APFloatBase::opInvalidOp) ||
       !IsExact)
     return std::nullopt;
-  return ValInt.extractBitsAsZExtValue(BitWidth, 0);
+  return ValInt.extractBits(BitWidth, 0);
 }
 
 // Try to match an arithmetic-sequence BUILD_VECTOR [X,X+S,X+2*S,...,X+(N-1)*S]
@@ -3366,6 +3363,9 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
 // Note that this method will also match potentially unappealing index
 // sequences, like <i32 0, i32 50939494>, however it is left to the caller to
 // determine whether this is worth generating code for.
+//
+// EltSizeInBits is the size of the type that the sequence will be calculated
+// in, i.e. SEW for build_vectors or XLEN for address calculations.
 static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
                                                       unsigned EltSizeInBits) {
   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
@@ -3374,13 +3374,14 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
   bool IsInteger = Op.getValueType().isInteger();
 
   std::optional<unsigned> SeqStepDenom;
-  std::optional<int64_t> SeqStepNum, SeqAddend;
-  std::optional<std::pair<uint64_t, unsigned>> PrevElt;
+  std::optional<APInt> SeqStepNum;
+  std::optional<APInt> SeqAddend;
+  std::optional<std::pair<APInt, unsigned>> PrevElt;
   assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
 
   // First extract the ops into a list of constant integer values. This may not
   // be possible for floats if they're not all representable as integers.
-  SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands());
+  SmallVector<std::optional<APInt>> Elts(Op.getNumOperands());
   const unsigned OpSize = Op.getScalarValueSizeInBits();
   for (auto [Idx, Elt] : enumerate(Op->op_values())) {
     if (Elt.isUndef()) {
@@ -3388,7 +3389,7 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
       continue;
     }
     if (IsInteger) {
-      Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize);
+      Elts[Idx] = Elt->getAsAPIntVal().trunc(OpSize).zext(EltSizeInBits);
     } else {
       auto ExactInteger =
           getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
@@ -3408,7 +3409,7 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
       // Calculate the step since the last non-undef element, and ensure
       // it's consistent across the entire sequence.
       unsigned IdxDiff = Idx - PrevElt->second;
-      int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits);
+      APInt ValDiff = *Elt - PrevElt->first;
 
       // A zero-value value difference means that we're somewhere in the middle
       // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
@@ -3416,13 +3417,13 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
       if (ValDiff == 0)
         continue;
 
-      int64_t Remainder = ValDiff % IdxDiff;
+      int64_t Remainder = ValDiff.srem(IdxDiff);
       // Normalize the step if it's greater than 1.
-      if (Remainder != ValDiff) {
+      if (Remainder != ValDiff.getSExtValue()) {
         // The difference must cleanly divide the element span.
         if (Remainder != 0)
           return std::nullopt;
-        ValDiff /= IdxDiff;
+        ValDiff = ValDiff.sdiv(IdxDiff);
         IdxDiff = 1;
       }
 
@@ -3451,9 +3452,10 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
   for (auto [Idx, Elt] : enumerate(Elts)) {
     if (!Elt)
       continue;
-    uint64_t ExpectedVal =
-        (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
-    int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits);
+    APInt ExpectedVal =
+        (APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom);
+
+    APInt Addend = *Elt - ExpectedVal;
     if (!SeqAddend)
       SeqAddend = Addend;
     else if (Addend != SeqAddend)
@@ -3462,7 +3464,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
 
   assert(SeqAddend && "Must have an addend if we have a step");
 
-  return VIDSequence{*SeqStepNum, *SeqStepDenom, *SeqAddend};
+  return VIDSequence{SeqStepNum->getSExtValue(), *SeqStepDenom,
+                     SeqAddend->getSExtValue()};
 }
 
 // Match a splatted value (SPLAT_VECTOR/BUILD_VECTOR) of an EXTRACT_VECTOR_ELT
@@ -6086,8 +6089,8 @@ static unsigned getRISCVVLOp(SDValue Op) {
 #undef VP_CASE
 }
 
-/// Return true if a RISC-V target specified op has a merge operand.
-static bool hasMergeOp(unsigned Opcode) {
+/// Return true if a RISC-V target specified op has a passthru operand.
+static bool hasPassthruOp(unsigned Opcode) {
   assert(Opcode > RISCVISD::FIRST_NUMBER &&
          Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
          "not a RISC-V target specific op");
@@ -6841,7 +6844,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
           Subtarget.hasStdExtZfhminOrZhinxmin() &&
           !Subtarget.hasVInstructionsF16())) ||
         (Op.getValueType().getScalarType() == MVT::bf16 &&
-         (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin()))) {
+         (Subtarget.hasVInstructionsBF16Minimal() &&
+          Subtarget.hasStdExtZfbfmin()))) {
       if (Op.getValueType() == MVT::nxv32f16 ||
           Op.getValueType() == MVT::nxv32bf16)
         return SplitVectorOp(Op, DAG);
@@ -8273,11 +8277,9 @@ SDValue RISCVTargetLowering::lowerVectorTruncLike(SDValue Op,
         getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget);
   }
 
-  LLVMContext &Context = *DAG.getContext();
-  const ElementCount Count = ContainerVT.getVectorElementCount();
   do {
     SrcEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
-    EVT ResultVT = EVT::getVectorVT(Context, SrcEltVT, Count);
+    MVT ResultVT = ContainerVT.changeVectorElementType(SrcEltVT);
     Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, ResultVT, Result,
                          Mask, VL);
   } while (SrcEltVT != DstEltVT);
@@ -8855,14 +8857,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
         I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT);
       } else if (AVLInt >= 2 * MaxVLMAX) {
         // Just set vl to VLMAX in this situation
-        RISCVII::VLMUL Lmul = RISCVTargetLowering::getLMUL(I32VT);
-        SDValue LMUL = DAG.getConstant(Lmul, DL, XLenVT);
-        unsigned Sew = RISCVVType::encodeSEW(I32VT.getScalarSizeInBits());
-        SDValue SEW = DAG.getConstant(Sew, DL, XLenVT);
-        SDValue SETVLMAX = DAG.getTargetConstant(
-            Intrinsic::riscv_vsetvlimax, DL, MVT::i32);
-        I32VL = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XLenVT, SETVLMAX, SEW,
-                            LMUL);
+        I32VL = DAG.getRegister(RISCV::X0, XLenVT);
       } else {
         // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
         // is related to the hardware implementation.
@@ -9404,81 +9399,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   switch (IntNo) {
   default:
     break;
-  case Intrinsic::riscv_masked_strided_load: {
-    SDLoc DL(Op);
-    MVT XLenVT = Subtarget.getXLenVT();
-
-    // If the mask is known to be all ones, optimize to an unmasked intrinsic;
-    // the selection of the masked intrinsics doesn't do this for us.
-    SDValue Mask = Op.getOperand(5);
-    bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
-
-    MVT VT = Op->getSimpleValueType(0);
-    MVT ContainerVT = VT;
-    if (VT.isFixedLengthVector())
-      ContainerVT = getContainerForFixedLengthVector(VT);
-
-    SDValue PassThru = Op.getOperand(2);
-    if (!IsUnmasked) {
-      MVT MaskVT = getMaskTypeFor(ContainerVT);
-      if (VT.isFixedLengthVector()) {
-        Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
-        PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
-      }
-    }
-
-    auto *Load = cast<MemIntrinsicSDNode>(Op);
-    SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
-    SDValue Ptr = Op.getOperand(3);
-    SDValue Stride = Op.getOperand(4);
-    SDValue Result, Chain;
-
-    // TODO: We restrict this to unmasked loads currently in consideration of
-    // the complexity of handling all falses masks.
-    MVT ScalarVT = ContainerVT.getVectorElementType();
-    if (IsUnmasked && isNullConstant(Stride) && ContainerVT.isInteger()) {
-      SDValue ScalarLoad =
-          DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
-                         ScalarVT, Load->getMemOperand());
-      Chain = ScalarLoad.getValue(1);
-      Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
-                                Subtarget);
-    } else if (IsUnmasked && isNullConstant(Stride) && isTypeLegal(ScalarVT)) {
-      SDValue ScalarLoad = DAG.getLoad(ScalarVT, DL, Load->getChain(), Ptr,
-                                       Load->getMemOperand());
-      Chain = ScalarLoad.getValue(1);
-      Result = DAG.getSplat(ContainerVT, DL, ScalarLoad);
-    } else {
-      SDValue IntID = DAG.getTargetConstant(
-          IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
-          XLenVT);
-
-      SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
-      if (IsUnmasked)
-        Ops.push_back(DAG.getUNDEF(ContainerVT));
-      else
-        Ops.push_back(PassThru);
-      Ops.push_back(Ptr);
-      Ops.push_back(Stride);
-      if (!IsUnmasked)
-        Ops.push_back(Mask);
-      Ops.push_back(VL);
-      if (!IsUnmasked) {
-        SDValue Policy =
-            DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
-        Ops.push_back(Policy);
-      }
-
-      SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
-      Result =
-          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
-                                  Load->getMemoryVT(), Load->getMemOperand());
-      Chain = Result.getValue(1);
-    }
-    if (VT.isFixedLengthVector())
-      Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
-    return DAG.getMergeValues({Result, Chain}, DL);
-  }
   case Intrinsic::riscv_seg2_load:
   case Intrinsic::riscv_seg3_load:
   case Intrinsic::riscv_seg4_load:
@@ -9498,8 +9418,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     MVT VT = Op->getSimpleValueType(0);
     MVT ContainerVT = getContainerForFixedLengthVector(VT);
 
-    SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
     SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT);
     auto *Load = cast<MemIntrinsicSDNode>(Op);
     SmallVector<EVT, 9> ContainerVTs(NF, ContainerVT);
@@ -9558,47 +9477,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   switch (IntNo) {
   default:
     break;
-  case Intrinsic::riscv_masked_strided_store: {
-    SDLoc DL(Op);
-    MVT XLenVT = Subtarget.getXLenVT();
-
-    // If the mask is known to be all ones, optimize to an unmasked intrinsic;
-    // the selection of the masked intrinsics doesn't do this for us.
-    SDValue Mask = Op.getOperand(5);
-    bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
-
-    SDValue Val = Op.getOperand(2);
-    MVT VT = Val.getSimpleValueType();
-    MVT ContainerVT = VT;
-    if (VT.isFixedLengthVector()) {
-      ContainerVT = getContainerForFixedLengthVector(VT);
-      Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
-    }
-    if (!IsUnmasked) {
-      MVT MaskVT = getMaskTypeFor(ContainerVT);
-      if (VT.isFixedLengthVector())
-        Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
-    }
-
-    SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
-
-    SDValue IntID = DAG.getTargetConstant(
-        IsUnmasked ? Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL,
-        XLenVT);
-
-    auto *Store = cast<MemIntrinsicSDNode>(Op);
-    SmallVector<SDValue, 8> Ops{Store->getChain(), IntID};
-    Ops.push_back(Val);
-    Ops.push_back(Op.getOperand(3)); // Ptr
-    Ops.push_back(Op.getOperand(4)); // Stride
-    if (!IsUnmasked)
-      Ops.push_back(Mask);
-    Ops.push_back(VL);
-
-    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(),
-                                   Ops, Store->getMemoryVT(),
-                                   Store->getMemOperand());
-  }
   case Intrinsic::riscv_seg2_store:
   case Intrinsic::riscv_seg3_store:
   case Intrinsic::riscv_seg4_store:
@@ -9619,8 +9497,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MVT VT = Op->getOperand(2).getSimpleValueType();
     MVT ContainerVT = getContainerForFixedLengthVector(VT);
 
-    SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
     SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
     SDValue Ptr = Op->getOperand(NF + 2);
 
@@ -10086,7 +9963,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     // Set the vector length to only the number of elements we care about. Note
     // that for slideup this includes the offset.
     unsigned EndIndex = OrigIdx + SubVecVT.getVectorNumElements();
-    SDValue VL = getVLOp(EndIndex, ContainerVT, DL, DAG, Subtarget);
+    SDValue VL = DAG.getConstant(EndIndex, DL, XLenVT);
 
     // Use tail agnostic policy if we're inserting over Vec's tail.
     unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
@@ -10323,8 +10200,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
         getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
     // Set the vector length to only the number of elements we care about. This
     // avoids sliding down elements we're going to discard straight away.
-    SDValue VL = getVLOp(SubVecVT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(SubVecVT.getVectorNumElements(), DL, XLenVT);
     SDValue SlidedownAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
     SDValue Slidedown =
         getVSlidedown(DAG, Subtarget, DL, ContainerVT,
@@ -10399,8 +10275,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   SDValue SlidedownAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
   auto [Mask, VL] = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
   if (SubVecVT.isFixedLengthVector())
-    VL = getVLOp(SubVecVT.getVectorNumElements(), InterSubVT, DL, DAG,
-                 Subtarget);
+    VL = DAG.getConstant(SubVecVT.getVectorNumElements(), DL, XLenVT);
   SDValue Slidedown =
       getVSlidedown(DAG, Subtarget, DL, InterSubVT, DAG.getUNDEF(InterSubVT),
                     Vec, SlidedownAmt, Mask, VL);
@@ -10780,7 +10655,7 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
     return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL);
   }
 
-  SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG, Subtarget);
+  SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
 
   bool IsMaskOp = VT.getVectorElementType() == MVT::i1;
   SDValue IntID = DAG.getTargetConstant(
@@ -10827,7 +10702,6 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
   SDValue NewValue =
       convertToScalableVector(ContainerVT, StoreVal, DAG, Subtarget);
 
-
   // If we know the exact VLEN and our fixed length vector completely fills
   // the container, use a whole register store instead.
   const auto [MinVLMAX, MaxVLMAX] =
@@ -10840,8 +10714,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
                         MMO->getFlags(), MMO->getAAInfo());
   }
 
-  SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                       Subtarget);
+  SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
 
   bool IsMaskOp = VT.getVectorElementType() == MVT::i1;
   SDValue IntID = DAG.getTargetConstant(
@@ -11076,7 +10949,7 @@ SDValue RISCVTargetLowering::lowerVectorStrictFSetcc(SDValue Op,
          True, VL});
     Mask =
         DAG.getNode(RISCVISD::VMAND_VL, DL, MaskVT, OrderMask1, OrderMask2, VL);
-    // Use Mask as the merge operand to let the result be 0 if either of the
+    // Use Mask as the passthru operand to let the result be 0 if either of the
     // inputs is unordered.
     Res = DAG.getNode(RISCVISD::STRICT_FSETCCS_VL, DL,
                       DAG.getVTList(MaskVT, MVT::Other),
@@ -11181,7 +11054,7 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(
 SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
                                                SelectionDAG &DAG) const {
   unsigned NewOpc = getRISCVVLOp(Op);
-  bool HasMergeOp = hasMergeOp(NewOpc);
+  bool HasPassthruOp = hasPassthruOp(NewOpc);
   bool HasMask = hasMaskOp(NewOpc);
 
   MVT VT = Op.getSimpleValueType();
@@ -11206,7 +11079,7 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
 
   SDLoc DL(Op);
   auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-  if (HasMergeOp)
+  if (HasPassthruOp)
     Ops.push_back(DAG.getUNDEF(ContainerVT));
   if (HasMask)
     Ops.push_back(Mask);
@@ -11234,7 +11107,7 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
 //   types.
 SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
   unsigned RISCVISDOpc = getRISCVVLOp(Op);
-  bool HasMergeOp = hasMergeOp(RISCVISDOpc);
+  bool HasPassthruOp = hasPassthruOp(RISCVISDOpc);
 
   SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
@@ -11247,9 +11120,9 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
   for (const auto &OpIdx : enumerate(Op->ops())) {
     SDValue V = OpIdx.value();
     assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
-    // Add dummy merge value before the mask. Or if there isn't a mask, before
-    // EVL.
-    if (HasMergeOp) {
+    // Add dummy passthru value before the mask. Or if there isn't a mask,
+    // before EVL.
+    if (HasPassthruOp) {
       auto MaskIdx = ISD::getVPMaskIdx(Op.getOpcode());
       if (MaskIdx) {
         if (*MaskIdx == OpIdx.index())
@@ -14440,6 +14313,14 @@ struct NodeExtensionHelper {
       return RISCVISD::VFWSUB_VL;
     case RISCVISD::FMUL_VL:
       return RISCVISD::VFWMUL_VL;
+    case RISCVISD::VFMADD_VL:
+      return RISCVISD::VFWMADD_VL;
+    case RISCVISD::VFMSUB_VL:
+      return RISCVISD::VFWMSUB_VL;
+    case RISCVISD::VFNMADD_VL:
+      return RISCVISD::VFWNMADD_VL;
+    case RISCVISD::VFNMSUB_VL:
+      return RISCVISD::VFWNMSUB_VL;
     default:
       llvm_unreachable("Unexpected opcode");
     }
@@ -14633,6 +14514,11 @@ struct NodeExtensionHelper {
              Subtarget.hasStdExtZvbb();
     case RISCVISD::SHL_VL:
       return Subtarget.hasStdExtZvbb();
+    case RISCVISD::VFMADD_VL:
+    case RISCVISD::VFNMSUB_VL:
+    case RISCVISD::VFNMADD_VL:
+    case RISCVISD::VFMSUB_VL:
+      return true;
     default:
       return false;
     }
@@ -14713,6 +14599,10 @@ struct NodeExtensionHelper {
     case RISCVISD::FADD_VL:
     case RISCVISD::FMUL_VL:
     case RISCVISD::VFWADD_W_VL:
+    case RISCVISD::VFMADD_VL:
+    case RISCVISD::VFNMSUB_VL:
+    case RISCVISD::VFNMADD_VL:
+    case RISCVISD::VFMSUB_VL:
       return true;
     case ISD::SUB:
     case RISCVISD::SUB_VL:
@@ -14764,25 +14654,25 @@ struct CombineResult {
   /// The actual replacement is *not* done in that method.
   SDValue materialize(SelectionDAG &DAG,
                       const RISCVSubtarget &Subtarget) const {
-    SDValue Mask, VL, Merge;
+    SDValue Mask, VL, Passthru;
     std::tie(Mask, VL) =
         NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget);
     switch (Root->getOpcode()) {
     default:
-      Merge = Root->getOperand(2);
+      Passthru = Root->getOperand(2);
       break;
     case ISD::ADD:
     case ISD::SUB:
     case ISD::MUL:
     case ISD::OR:
     case ISD::SHL:
-      Merge = DAG.getUNDEF(Root->getValueType(0));
+      Passthru = DAG.getUNDEF(Root->getValueType(0));
       break;
     }
     return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
                        LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, LHSExt),
                        RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, RHSExt),
-                       Merge, Mask, VL);
+                       Passthru, Mask, VL);
   }
 };
 
@@ -14928,6 +14818,10 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
     Strategies.push_back(canFoldToVW_W);
     break;
   case RISCVISD::FMUL_VL:
+  case RISCVISD::VFMADD_VL:
+  case RISCVISD::VFMSUB_VL:
+  case RISCVISD::VFNMADD_VL:
+  case RISCVISD::VFNMSUB_VL:
     Strategies.push_back(canFoldToVWWithSameExtension);
     break;
   case ISD::MUL:
@@ -14964,7 +14858,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
 }
 } // End anonymous namespace.
 
-/// Combine a binary operation to its equivalent VW or VW_W form.
+/// Combine a binary or FMA operation to its equivalent VW or VW_W form.
 /// The supported combines are:
 /// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w
 /// sub | sub_vl -> vwsub(u) | vwsub(u)_w
@@ -14977,9 +14871,9 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
 /// vwsub_w(u) -> vwsub(u)
 /// vfwadd_w -> vfwadd
 /// vfwsub_w -> vfwsub
-static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           const RISCVSubtarget &Subtarget) {
+static SDValue combineOp_VLToVWOp_VL(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const RISCVSubtarget &Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   if (DCI.isBeforeLegalize())
     return SDValue();
@@ -14995,19 +14889,26 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
 
   while (!Worklist.empty()) {
     SDNode *Root = Worklist.pop_back_val();
-    if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget))
-      return SDValue();
 
-    NodeExtensionHelper LHS(N, 0, DAG, Subtarget);
-    NodeExtensionHelper RHS(N, 1, DAG, Subtarget);
-    auto AppendUsersIfNeeded = [&Worklist,
+    NodeExtensionHelper LHS(Root, 0, DAG, Subtarget);
+    NodeExtensionHelper RHS(Root, 1, DAG, Subtarget);
+    auto AppendUsersIfNeeded = [&Worklist, &Subtarget,
                                 &Inserted](const NodeExtensionHelper &Op) {
       if (Op.needToPromoteOtherUsers()) {
-        for (SDNode *TheUse : Op.OrigOperand->uses()) {
+        for (SDNode::use_iterator UI = Op.OrigOperand->use_begin(),
+                                  UE = Op.OrigOperand->use_end();
+             UI != UE; ++UI) {
+          SDNode *TheUse = *UI;
+          if (!NodeExtensionHelper::isSupportedRoot(TheUse, Subtarget))
+            return false;
+          // We only support the first 2 operands of FMA.
+          if (UI.getOperandNo() >= 2)
+            return false;
           if (Inserted.insert(TheUse).second)
             Worklist.push_back(TheUse);
         }
       }
+      return true;
     };
 
     // Control the compile time by limiting the number of node we look at in
@@ -15016,18 +14917,18 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
       return SDValue();
 
     SmallVector<NodeExtensionHelper::CombineToTry> FoldingStrategies =
-        NodeExtensionHelper::getSupportedFoldings(N);
+        NodeExtensionHelper::getSupportedFoldings(Root);
 
     assert(!FoldingStrategies.empty() && "Nothing to be folded");
     bool Matched = false;
     for (int Attempt = 0;
-         (Attempt != 1 + NodeExtensionHelper::isCommutative(N)) && !Matched;
+         (Attempt != 1 + NodeExtensionHelper::isCommutative(Root)) && !Matched;
          ++Attempt) {
 
       for (NodeExtensionHelper::CombineToTry FoldingStrategy :
            FoldingStrategies) {
         std::optional<CombineResult> Res =
-            FoldingStrategy(N, LHS, RHS, DAG, Subtarget);
+            FoldingStrategy(Root, LHS, RHS, DAG, Subtarget);
         if (Res) {
           Matched = true;
           CombinesToApply.push_back(*Res);
@@ -15035,9 +14936,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
           // we would be leaving the old input (since it is may still be used),
           // and the new one.
           if (Res->LHSExt.has_value())
-            AppendUsersIfNeeded(LHS);
+            if (!AppendUsersIfNeeded(LHS))
+              return SDValue();
           if (Res->RHSExt.has_value())
-            AppendUsersIfNeeded(RHS);
+            if (!AppendUsersIfNeeded(RHS))
+              return SDValue();
           break;
         }
       }
@@ -15124,7 +15027,7 @@ static SDValue performVWADDSUBW_VLCombine(SDNode *N,
   assert(Opc == RISCVISD::VWADD_W_VL || Opc == RISCVISD::VWADDU_W_VL ||
          Opc == RISCVISD::VWSUB_W_VL || Opc == RISCVISD::VWSUBU_W_VL);
 
-  if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+  if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
     return V;
 
   return combineVWADDSUBWSelect(N, DCI.DAG);
@@ -15539,8 +15442,11 @@ static SDValue combineVFMADD_VLWithVFNEG_VL(SDNode *N, SelectionDAG &DAG) {
                      VL);
 }
 
-static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performVFMADD_VLCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
                                        const RISCVSubtarget &Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+
   if (SDValue V = combineVFMADD_VLWithVFNEG_VL(N, DAG))
     return V;
 
@@ -15552,50 +15458,7 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
   if (N->isTargetStrictFPOpcode())
     return SDValue();
 
-  // Try to form widening FMA.
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
-  SDValue Mask = N->getOperand(3);
-  SDValue VL = N->getOperand(4);
-
-  if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL ||
-      Op1.getOpcode() != RISCVISD::FP_EXTEND_VL)
-    return SDValue();
-
-  // TODO: Refactor to handle more complex cases similar to
-  // combineBinOp_VLToVWBinOp_VL.
-  if ((!Op0.hasOneUse() || !Op1.hasOneUse()) &&
-      (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0)))
-    return SDValue();
-
-  // Check the mask and VL are the same.
-  if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL ||
-      Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
-    return SDValue();
-
-  unsigned NewOpc;
-  switch (N->getOpcode()) {
-  default:
-    llvm_unreachable("Unexpected opcode");
-  case RISCVISD::VFMADD_VL:
-    NewOpc = RISCVISD::VFWMADD_VL;
-    break;
-  case RISCVISD::VFNMSUB_VL:
-    NewOpc = RISCVISD::VFWNMSUB_VL;
-    break;
-  case RISCVISD::VFNMADD_VL:
-    NewOpc = RISCVISD::VFWNMADD_VL;
-    break;
-  case RISCVISD::VFMSUB_VL:
-    NewOpc = RISCVISD::VFWMSUB_VL;
-    break;
-  }
-
-  Op0 = Op0.getOperand(0);
-  Op1 = Op1.getOperand(0);
-
-  return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0), Op0, Op1,
-                     N->getOperand(2), Mask, VL);
+  return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
 }
 
 static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
@@ -16292,8 +16155,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
   SDValue MulOp = N->getOperand(1);
 
   if (N->getOpcode() == RISCVISD::ADD_VL) {
-    SDValue AddMergeOp = N->getOperand(2);
-    if (!AddMergeOp.isUndef())
+    SDValue AddPassthruOp = N->getOperand(2);
+    if (!AddPassthruOp.isUndef())
       return SDValue();
   }
 
@@ -16314,9 +16177,9 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
   if (!IsVWMulOpc(MulOp.getOpcode()))
     return SDValue();
 
-  SDValue MulMergeOp = MulOp.getOperand(2);
+  SDValue MulPassthruOp = MulOp.getOperand(2);
 
-  if (!MulMergeOp.isUndef())
+  if (!MulPassthruOp.isUndef())
     return SDValue();
 
   auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG,
@@ -16629,9 +16492,9 @@ static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
   SDValue Val;
   unsigned ClipOpc;
   if ((Val = DetectUSatPattern(Src)))
-    ClipOpc = RISCVISD::VNCLIPU_VL;
+    ClipOpc = RISCVISD::TRUNCATE_VECTOR_VL_USAT;
   else if ((Val = DetectSSatPattern(Src)))
-    ClipOpc = RISCVISD::VNCLIP_VL;
+    ClipOpc = RISCVISD::TRUNCATE_VECTOR_VL_SSAT;
   else
     return SDValue();
 
@@ -16640,12 +16503,7 @@ static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
   do {
     MVT ValEltVT = MVT::getIntegerVT(ValVT.getScalarSizeInBits() / 2);
     ValVT = ValVT.changeVectorElementType(ValEltVT);
-    // Rounding mode here is arbitrary since we aren't shifting out any bits.
-    Val = DAG.getNode(
-        ClipOpc, DL, ValVT,
-        {Val, DAG.getConstant(0, DL, ValVT), DAG.getUNDEF(VT), Mask,
-         DAG.getTargetConstant(RISCVVXRndMode::RNU, DL, Subtarget.getXLenVT()),
-         VL});
+    Val = DAG.getNode(ClipOpc, DL, ValVT, Val, Mask, VL);
   } while (ValVT != VT);
 
   return Val;
@@ -16797,28 +16655,28 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
   case ISD::ADD: {
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
       return V;
     return performADDCombine(N, DCI, Subtarget);
   }
   case ISD::SUB: {
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     return performSUBCombine(N, DAG, Subtarget);
   }
   case ISD::AND:
     return performANDCombine(N, DCI, Subtarget);
   case ISD::OR: {
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     return performORCombine(N, DCI, Subtarget);
   }
   case ISD::XOR:
     return performXORCombine(N, DAG, Subtarget);
   case ISD::MUL:
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     return performMULCombine(N, DAG, DCI, Subtarget);
   case ISD::SDIV:
@@ -16828,6 +16686,25 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     if (SDValue V = combineBinOpOfZExt(N, DAG))
       return V;
     break;
+  case ISD::FMUL: {
+    // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    if (N0->getOpcode() != ISD::FCOPYSIGN)
+      std::swap(N0, N1);
+    if (N0->getOpcode() != ISD::FCOPYSIGN)
+      return SDValue();
+    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
+    if (!C || !C->getValueAPF().isExactlyValue(+1.0))
+      return SDValue();
+    EVT VT = N->getValueType(0);
+    if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
+      return SDValue();
+    SDValue Sign = N0->getOperand(1);
+    if (Sign.getValueType() != VT)
+      return SDValue();
+    return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1));
+  }
   case ISD::FADD:
   case ISD::UMAX:
   case ISD::UMIN:
@@ -17243,7 +17120,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
   case RISCVISD::SHL_VL:
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     [[fallthrough]];
   case RISCVISD::SRA_VL:
@@ -17268,7 +17145,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SRL:
   case ISD::SHL: {
     if (N->getOpcode() == ISD::SHL) {
-      if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+      if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
         return V;
     }
     SDValue ShAmt = N->getOperand(1);
@@ -17284,7 +17161,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
   case RISCVISD::ADD_VL:
-    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+    if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
       return V;
     return combineToVWMACC(N, DAG, Subtarget);
   case RISCVISD::VWADD_W_VL:
@@ -17294,7 +17171,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     return performVWADDSUBW_VLCombine(N, DCI, Subtarget);
   case RISCVISD::SUB_VL:
   case RISCVISD::MUL_VL:
-    return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+    return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
   case RISCVISD::VFMADD_VL:
   case RISCVISD::VFNMADD_VL:
   case RISCVISD::VFMSUB_VL:
@@ -17303,7 +17180,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case RISCVISD::STRICT_VFNMADD_VL:
   case RISCVISD::STRICT_VFMSUB_VL:
   case RISCVISD::STRICT_VFNMSUB_VL:
-    return performVFMADD_VLCombine(N, DAG, Subtarget);
+    return performVFMADD_VLCombine(N, DCI, Subtarget);
   case RISCVISD::FADD_VL:
   case RISCVISD::FSUB_VL:
   case RISCVISD::FMUL_VL:
@@ -17312,7 +17189,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     if (N->getValueType(0).getVectorElementType() == MVT::f32 &&
         !Subtarget.hasVInstructionsF16())
       return SDValue();
-    return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+    return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
   }
   case ISD::LOAD:
   case ISD::STORE: {
@@ -17546,43 +17423,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       // By default we do not combine any intrinsic.
     default:
       return SDValue();
-    case Intrinsic::riscv_masked_strided_load: {
-      MVT VT = N->getSimpleValueType(0);
-      auto *Load = cast<MemIntrinsicSDNode>(N);
-      SDValue PassThru = N->getOperand(2);
-      SDValue Base = N->getOperand(3);
-      SDValue Stride = N->getOperand(4);
-      SDValue Mask = N->getOperand(5);
-
-      // If the stride is equal to the element size in bytes,  we can use
-      // a masked.load.
-      const unsigned ElementSize = VT.getScalarStoreSize();
-      if (auto *StrideC = dyn_cast<ConstantSDNode>(Stride);
-          StrideC && StrideC->getZExtValue() == ElementSize)
-        return DAG.getMaskedLoad(VT, DL, Load->getChain(), Base,
-                                 DAG.getUNDEF(XLenVT), Mask, PassThru,
-                                 Load->getMemoryVT(), Load->getMemOperand(),
-                                 ISD::UNINDEXED, ISD::NON_EXTLOAD);
-      return SDValue();
-    }
-    case Intrinsic::riscv_masked_strided_store: {
-      auto *Store = cast<MemIntrinsicSDNode>(N);
-      SDValue Value = N->getOperand(2);
-      SDValue Base = N->getOperand(3);
-      SDValue Stride = N->getOperand(4);
-      SDValue Mask = N->getOperand(5);
-
-      // If the stride is equal to the element size in bytes,  we can use
-      // a masked.store.
-      const unsigned ElementSize = Value.getValueType().getScalarStoreSize();
-      if (auto *StrideC = dyn_cast<ConstantSDNode>(Stride);
-          StrideC && StrideC->getZExtValue() == ElementSize)
-        return DAG.getMaskedStore(Store->getChain(), DL, Value, Base,
-                                  DAG.getUNDEF(XLenVT), Mask,
-                                  Value.getValueType(), Store->getMemOperand(),
-                                  ISD::UNINDEXED, false);
-      return SDValue();
-    }
     case Intrinsic::riscv_vcpop:
     case Intrinsic::riscv_vcpop_mask:
     case Intrinsic::riscv_vfirst:
@@ -18418,6 +18258,15 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   // Select_FPRX_ (rs1, rs2, imm, rs4, (Select_FPRX_ rs1, rs2, imm, rs4, rs5))
   // is checked here and handled by a separate function -
   // EmitLoweredCascadedSelect.
+
+  auto Next = next_nodbg(MI.getIterator(), BB->instr_end());
+  if ((MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR &&
+       MI.getOpcode() != RISCV::Select_GPR_Using_CC_Imm) &&
+      Next != BB->end() && Next->getOpcode() == MI.getOpcode() &&
+      Next->getOperand(5).getReg() == MI.getOperand(0).getReg() &&
+      Next->getOperand(5).isKill())
+    return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget);
+
   Register LHS = MI.getOperand(1).getReg();
   Register RHS;
   if (MI.getOperand(2).isReg())
@@ -18429,15 +18278,6 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   SelectDests.insert(MI.getOperand(0).getReg());
 
   MachineInstr *LastSelectPseudo = &MI;
-  auto Next = next_nodbg(MI.getIterator(), BB->instr_end());
-  if ((MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR &&
-       MI.getOpcode() != RISCV::Select_GPR_Using_CC_Imm) &&
-      Next != BB->end() && Next->getOpcode() == MI.getOpcode() &&
-      Next->getOperand(5).getReg() == MI.getOperand(0).getReg() &&
-      Next->getOperand(5).isKill()) {
-    return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget);
-  }
-
   for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
        SequenceMBBI != E; ++SequenceMBBI) {
     if (SequenceMBBI->isDebugInstr())
@@ -18478,6 +18318,11 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   F->insert(I, IfFalseMBB);
   F->insert(I, TailMBB);
 
+  // Set the call frame size on entry to the new basic blocks.
+  unsigned CallFrameSize = TII.getCallFrameSizeAt(*LastSelectPseudo);
+  IfFalseMBB->setCallFrameSize(CallFrameSize);
+  TailMBB->setCallFrameSize(CallFrameSize);
+
   // Transfer debug instructions associated with the selects to TailMBB.
   for (MachineInstr *DebugInstr : SelectDebugValues) {
     TailMBB->push_back(DebugInstr->removeFromParent());
@@ -20410,6 +20255,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FP_EXTEND_BF16)
   NODE_NAME_CASE(FROUND)
   NODE_NAME_CASE(FCLASS)
+  NODE_NAME_CASE(FSGNJX)
   NODE_NAME_CASE(FMAX)
   NODE_NAME_CASE(FMIN)
   NODE_NAME_CASE(READ_COUNTER_WIDE)
@@ -20444,6 +20290,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SPLAT_VECTOR_SPLIT_I64_VL)
   NODE_NAME_CASE(READ_VLENB)
   NODE_NAME_CASE(TRUNCATE_VECTOR_VL)
+  NODE_NAME_CASE(TRUNCATE_VECTOR_VL_SSAT)
+  NODE_NAME_CASE(TRUNCATE_VECTOR_VL_USAT)
   NODE_NAME_CASE(VSLIDEUP_VL)
   NODE_NAME_CASE(VSLIDE1UP_VL)
   NODE_NAME_CASE(VSLIDEDOWN_VL)
@@ -20487,8 +20335,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(UADDSAT_VL)
   NODE_NAME_CASE(SSUBSAT_VL)
   NODE_NAME_CASE(USUBSAT_VL)
-  NODE_NAME_CASE(VNCLIP_VL)
-  NODE_NAME_CASE(VNCLIPU_VL)
   NODE_NAME_CASE(FADD_VL)
   NODE_NAME_CASE(FSUB_VL)
   NODE_NAME_CASE(FMUL_VL)
@@ -21240,7 +21086,7 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
-  if (Subtarget.hasVendorXCVmem()) {
+  if (Subtarget.hasVendorXCVmem() && !Subtarget.is64Bit()) {
     if (Op->getOpcode() != ISD::ADD)
       return false;
 
@@ -21339,37 +21185,37 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned)
 bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                                  SDValue C) const {
   // Check integral scalar types.
-  const bool HasZmmul = Subtarget.hasStdExtZmmul();
   if (!VT.isScalarInteger())
     return false;
 
   // Omit the optimization if the sub target has the M extension and the data
   // size exceeds XLen.
+  const bool HasZmmul = Subtarget.hasStdExtZmmul();
   if (HasZmmul && VT.getSizeInBits() > Subtarget.getXLen())
     return false;
 
-  if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
-    // Break the MUL to a SLLI and an ADD/SUB.
-    const APInt &Imm = ConstNode->getAPIntValue();
-    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
-        (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
-      return true;
+  auto *ConstNode = cast<ConstantSDNode>(C);
+  const APInt &Imm = ConstNode->getAPIntValue();
 
-    // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12.
-    if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) &&
-        ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() ||
-         (Imm - 8).isPowerOf2()))
-      return true;
+  // Break the MUL to a SLLI and an ADD/SUB.
+  if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
+      (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
+    return true;
 
-    // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
-    // a pair of LUI/ADDI.
-    if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 &&
-        ConstNode->hasOneUse()) {
-      APInt ImmS = Imm.ashr(Imm.countr_zero());
-      if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() ||
-          (1 - ImmS).isPowerOf2())
-        return true;
-    }
+  // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12.
+  if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) &&
+      ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() ||
+       (Imm - 8).isPowerOf2()))
+    return true;
+
+  // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
+  // a pair of LUI/ADDI.
+  if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 &&
+      ConstNode->hasOneUse()) {
+    APInt ImmS = Imm.ashr(Imm.countr_zero());
+    if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() ||
+        (1 - ImmS).isPowerOf2())
+      return true;
   }
 
   return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b0ad9229f0b3..d1d0760d8ffd1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -128,6 +128,7 @@ enum NodeType : unsigned {
   FROUND,
 
   FCLASS,
+  FSGNJX,
 
   // Floating point fmax and fmin matching the RISC-V instruction semantics.
   FMAX, FMIN,
@@ -181,6 +182,12 @@ enum NodeType : unsigned {
   // Truncates a RVV integer vector by one power-of-two. Carries both an extra
   // mask and VL operand.
   TRUNCATE_VECTOR_VL,
+  // Truncates a RVV integer vector by one power-of-two. If the value doesn't
+  // fit in the destination type, the result is saturated. These correspond to
+  // vnclip and vnclipu with a shift of 0. Carries both an extra mask and VL
+  // operand.
+  TRUNCATE_VECTOR_VL_SSAT,
+  TRUNCATE_VECTOR_VL_USAT,
   // Matches the semantics of vslideup/vslidedown. The first operand is the
   // pass-thru operand, the second is the source vector, the third is the XLenVT
   // index (either constant or non-constant), the fourth is the mask, the fifth
@@ -231,7 +238,7 @@ enum NodeType : unsigned {
   VECREDUCE_FMIN_VL,
   VECREDUCE_FMAX_VL,
 
-  // Vector binary ops with a merge as a third operand, a mask as a fourth
+  // Vector binary ops with a passthru as a third operand, a mask as a fourth
   // operand, and VL as a fifth operand.
   ADD_VL,
   AND_VL,
@@ -273,10 +280,6 @@ enum NodeType : unsigned {
   // Rounding averaging adds of unsigned integers.
   AVGCEILU_VL,
 
-  // Operands are (source, shift, merge, mask, roundmode, vl)
-  VNCLIPU_VL,
-  VNCLIP_VL,
-
   MULHS_VL,
   MULHU_VL,
   FADD_VL,
@@ -291,7 +294,7 @@ enum NodeType : unsigned {
   FABS_VL,
   FSQRT_VL,
   FCLASS_VL,
-  FCOPYSIGN_VL, // Has a merge operand
+  FCOPYSIGN_VL, // Has a passthru operand
   VFCVT_RTZ_X_F_VL,
   VFCVT_RTZ_XU_F_VL,
   VFCVT_X_F_VL,
@@ -319,7 +322,7 @@ enum NodeType : unsigned {
   VFWMSUB_VL,
   VFWNMSUB_VL,
 
-  // Widening instructions with a merge value a third operand, a mask as a
+  // Widening instructions with a passthru value a third operand, a mask as a
   // fourth operand, and VL as a fifth operand.
   VWMUL_VL,
   VWMULU_VL,
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index d5b3df48d53b4..af2279f1023b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -182,7 +182,7 @@ static bool isMaskRegOp(const MachineInstr &MI) {
 /// Note that this is different from "agnostic" as defined by the vector
 /// specification.  Agnostic requires each lane to either be undisturbed, or
 /// take the value -1; no other value is allowed.
-static bool hasUndefinedMergeOp(const MachineInstr &MI) {
+static bool hasUndefinedPassthru(const MachineInstr &MI) {
 
   unsigned UseOpIdx;
   if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -443,13 +443,13 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
     Res.LMUL = DemandedFields::LMULNone;
     Res.SEWLMULRatio = false;
     Res.VLAny = false;
-    // For vmv.s.x and vfmv.s.f, if the merge operand is *undefined*, we don't
+    // For vmv.s.x and vfmv.s.f, if the passthru is *undefined*, we don't
     // need to preserve any other bits and are thus compatible with any larger,
     // etype and can disregard policy bits.  Warning: It's tempting to try doing
     // this for any tail agnostic operation, but we can't as TA requires
     // tail lanes to either be the original value or -1.  We are writing
     // unknown bits to the lanes here.
-    if (hasUndefinedMergeOp(MI)) {
+    if (hasUndefinedPassthru(MI)) {
       if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
         Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
       else
@@ -458,7 +458,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
     }
   }
 
-  // vmv.x.s, and vmv.f.s are unconditional and ignore everything except SEW.
+  // vmv.x.s, and vfmv.f.s are unconditional and ignore everything except SEW.
   if (isScalarExtractInstr(MI)) {
     assert(!RISCVII::hasVLOp(TSFlags));
     Res.LMUL = DemandedFields::LMULNone;
@@ -469,7 +469,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
 
   if (RISCVII::hasVLOp(MI.getDesc().TSFlags)) {
     const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
-    // A slidedown/slideup with an *undefined* merge op can freely clobber
+    // A slidedown/slideup with an *undefined* passthru can freely clobber
     // elements not copied from the source vector (e.g. masked off, tail, or
     // slideup's prefix). Notes:
     // * We can't modify SEW here since the slide amount is in units of SEW.
@@ -478,7 +478,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
     // * The LMUL1 restriction is for machines whose latency may depend on VL.
     // * As above, this is only legal for tail "undefined" not "agnostic".
     if (isVSlideInstr(MI) && VLOp.isImm() && VLOp.getImm() == 1 &&
-        hasUndefinedMergeOp(MI)) {
+        hasUndefinedPassthru(MI)) {
       Res.VLAny = false;
       Res.VLZeroness = true;
       Res.LMUL = DemandedFields::LMULLessThanOrEqualToM1;
@@ -492,7 +492,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
     // careful to not increase the number of active vector registers (unlike for
     // vmv.s.x.)
     if (isScalarSplatInstr(MI) && VLOp.isImm() && VLOp.getImm() == 1 &&
-        hasUndefinedMergeOp(MI)) {
+        hasUndefinedPassthru(MI)) {
       Res.LMUL = DemandedFields::LMULLessThanOrEqualToM1;
       Res.SEWLMULRatio = false;
       Res.VLAny = false;
@@ -1000,7 +1000,7 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
 
   bool TailAgnostic = true;
   bool MaskAgnostic = true;
-  if (!hasUndefinedMergeOp(MI)) {
+  if (!hasUndefinedPassthru(MI)) {
     // Start with undisturbed.
     TailAgnostic = false;
     MaskAgnostic = false;
@@ -1645,24 +1645,23 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
   Used.demandVTYPE();
   SmallVector<MachineInstr*> ToDelete;
 
-  // Update LIS and cleanup dead AVLs given a value which has
-  // has had one use (as an AVL) removed.
-  auto afterDroppedAVLUse = [&](Register OldVLReg) {
+  auto dropAVLUse = [&](MachineOperand &MO) {
+    if (!MO.isReg() || !MO.getReg().isVirtual())
+      return;
+    Register OldVLReg = MO.getReg();
+    MO.setReg(RISCV::NoRegister);
+
     if (LIS)
       LIS->shrinkToUses(&LIS->getInterval(OldVLReg));
 
     MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
     if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
-        MRI->use_nodbg_empty(OldVLReg)) {
-      if (LIS) {
-        LIS->removeInterval(OldVLReg);
-        LIS->RemoveMachineInstrFromMaps(*VLOpDef);
-      }
-      VLOpDef->eraseFromParent();
-    }
+        MRI->use_nodbg_empty(OldVLReg))
+      ToDelete.push_back(VLOpDef);
   };
 
-  for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+  for (MachineInstr &MI :
+       make_early_inc_range(make_range(MBB.rbegin(), MBB.rend()))) {
 
     if (!isVectorConfigInstr(MI)) {
       Used.doUnion(getDemanded(MI, ST));
@@ -1678,7 +1677,11 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
 
     if (NextMI) {
       if (!Used.usedVL() && !Used.usedVTYPE()) {
-        ToDelete.push_back(&MI);
+        dropAVLUse(MI.getOperand(1));
+        if (LIS)
+          LIS->RemoveMachineInstrFromMaps(MI);
+        MI.eraseFromParent();
+        NumCoalescedVSETVL++;
         // Leave NextMI unchanged
         continue;
       }
@@ -1707,20 +1710,22 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
             LIS->shrinkToUses(&DefLI);
           }
 
-          Register OldVLReg;
-          if (MI.getOperand(1).isReg())
-            OldVLReg = MI.getOperand(1).getReg();
+          dropAVLUse(MI.getOperand(1));
           if (NextMI->getOperand(1).isImm())
             MI.getOperand(1).ChangeToImmediate(NextMI->getOperand(1).getImm());
           else
-            MI.getOperand(1).ChangeToRegister(NextMI->getOperand(1).getReg(), false);
-          if (OldVLReg && OldVLReg.isVirtual())
-            afterDroppedAVLUse(OldVLReg);
+            MI.getOperand(1).ChangeToRegister(NextMI->getOperand(1).getReg(),
+                                              false);
 
           MI.setDesc(NextMI->getDesc());
         }
         MI.getOperand(2).setImm(NextMI->getOperand(2).getImm());
-        ToDelete.push_back(NextMI);
+
+        dropAVLUse(NextMI->getOperand(1));
+        if (LIS)
+          LIS->RemoveMachineInstrFromMaps(*NextMI);
+        NextMI->eraseFromParent();
+        NumCoalescedVSETVL++;
         // fallthrough
       }
     }
@@ -1728,16 +1733,14 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
     Used = getDemanded(MI, ST);
   }
 
-  NumCoalescedVSETVL += ToDelete.size();
+  // Loop over the dead AVL values, and delete them now.  This has
+  // to be outside the above loop to avoid invalidating iterators.
   for (auto *MI : ToDelete) {
-    if (LIS)
+    if (LIS) {
+      LIS->removeInterval(MI->getOperand(0).getReg());
       LIS->RemoveMachineInstrFromMaps(*MI);
-    Register OldAVLReg;
-    if (MI->getOperand(1).isReg())
-      OldAVLReg = MI->getOperand(1).getReg();
+    }
     MI->eraseFromParent();
-    if (OldAVLReg && OldAVLReg.isVirtual())
-      afterDroppedAVLUse(OldAVLReg);
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index ba3b4bd701d63..9dd79027d7a16 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1365,7 +1365,7 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
@@ -2830,6 +2830,7 @@ bool RISCVInstrInfo::shouldOutlineFromFunctionByDefault(
 
 std::optional<outliner::OutlinedFunction>
 RISCVInstrInfo::getOutliningCandidateInfo(
+    const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
 
   // First we need to filter out candidates where the X5 register (IE t0) can't
@@ -2868,8 +2869,9 @@ RISCVInstrInfo::getOutliningCandidateInfo(
 }
 
 outliner::InstrType
-RISCVInstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MBBI,
-                                 unsigned Flags) const {
+RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     unsigned Flags) const {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock *MBB = MI.getParent();
   const TargetRegisterInfo *TRI =
@@ -3761,6 +3763,12 @@ RISCVInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   return ArrayRef(TargetFlags);
 }
 
+unsigned RISCVInstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
+  return OptLevel >= CodeGenOptLevel::Aggressive
+             ? STI.getTailDupAggressiveThreshold()
+             : 2;
+}
+
 // Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
 bool RISCV::isSEXT_W(const MachineInstr &MI) {
   return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 025cc36d19eb7..1612f56a8b506 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -206,11 +206,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
 
   // Calculate target-specific information for a set of outlining candidates.
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
 
   // Return if/how a given MachineInstr should be outlined.
   virtual outliner::InstrType
-  getOutliningTypeImpl(MachineBasicBlock::iterator &MBBI,
+  getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                       MachineBasicBlock::iterator &MBBI,
                        unsigned Flags) const override;
 
   // Insert a custom frame for outlined functions.
@@ -286,6 +288,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
   ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
   getSerializableMachineMemOperandTargetFlags() const override;
 
+  unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override;
+
   unsigned getUndefInitOpcode(unsigned RegClassID) const override {
     switch (RegClassID) {
     case RISCV::VRRegClassID:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 04054d2c3feee..cebe95ca16506 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -593,8 +593,7 @@ class ALUW_rr<bits<7> funct7, bits<3> funct3, string opcodestr,
 
 let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
 class Priv<string opcodestr, bits<7> funct7>
-    : RVInstR<funct7, 0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1, GPR:$rs2),
-              opcodestr, "">;
+    : RVInstR<funct7, 0b000, OPC_SYSTEM, (outs), (ins), opcodestr, "">;
 
 let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
 class Priv_rr<string opcodestr, bits<7> funct7>
@@ -1543,8 +1542,8 @@ def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), [],
 def : Pat<(riscv_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
 
-def : Pat<(riscv_sret_glue), (SRET (XLenVT X0), (XLenVT X0))>;
-def : Pat<(riscv_mret_glue), (MRET (XLenVT X0), (XLenVT X0))>;
+def : Pat<(riscv_sret_glue), (SRET)>;
+def : Pat<(riscv_mret_glue), (MRET)>;
 
 let isCall = 1, Defs = [X1] in {
 let Predicates = [NoStdExtZicfilp] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 9257ee5a09a8e..3f279b7a58ca6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -764,9 +764,9 @@ def InsnCR : DirectiveInsnCR<(outs AnyReg:$rd), (ins uimm2_opcode:$opcode,
                                                      uimm4:$funct4,
                                                      AnyReg:$rs2),
                              "$opcode, $funct4, $rd, $rs2">;
-def InsnCI : DirectiveInsnCI<(outs AnyRegC:$rd), (ins uimm2_opcode:$opcode,
-                                                      uimm3:$funct3,
-                                                      simm6:$imm6),
+def InsnCI : DirectiveInsnCI<(outs AnyReg:$rd), (ins uimm2_opcode:$opcode,
+                                                     uimm3:$funct3,
+                                                     simm6:$imm6),
                              "$opcode, $funct3, $rd, $imm6">;
 def InsnCIW : DirectiveInsnCIW<(outs AnyRegC:$rd), (ins uimm2_opcode:$opcode,
                                                         uimm3:$funct3,
@@ -818,7 +818,7 @@ def : InstAlias<".insn_cr $opcode, $funct4, $rd, $rs2",
                 (InsnCR AnyReg:$rd, uimm2_opcode:$opcode, uimm4:$funct4,
                         AnyReg:$rs2)>;
 def : InstAlias<".insn_ci $opcode, $funct3, $rd, $imm6",
-                (InsnCI AnyRegC:$rd, uimm2_opcode:$opcode, uimm3:$funct3,
+                (InsnCI AnyReg:$rd, uimm2_opcode:$opcode, uimm3:$funct3,
                         simm6:$imm6)>;
 def : InstAlias<".insn_ciw $opcode, $funct3, $rd, $imm8",
                 (InsnCIW AnyRegC:$rd, uimm2_opcode:$opcode, uimm3:$funct3,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 8efefee383a6a..35ab277fa3505 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -282,6 +282,7 @@ def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>;
 def : Pat<(riscv_fclass FPR64:$rs1), (FCLASS_D $rs1)>;
 
 def : PatFprFpr<fcopysign, FSGNJ_D, FPR64, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D, FPR64, f64>;
 def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>;
 def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2,
                                                               FRM_RNE))>;
@@ -318,6 +319,7 @@ def : Pat<(fabs FPR64INX:$rs1), (FSGNJX_D_INX $rs1, $rs1)>;
 def : Pat<(riscv_fclass FPR64INX:$rs1), (FCLASS_D_INX $rs1)>;
 
 def : PatFprFpr<fcopysign, FSGNJ_D_INX, FPR64INX, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_INX, FPR64INX, f64>;
 def : Pat<(fcopysign FPR64INX:$rs1, (fneg FPR64INX:$rs2)),
           (FSGNJN_D_INX $rs1, $rs2)>;
 def : Pat<(fcopysign FPR64INX:$rs1, FPR32INX:$rs2),
@@ -355,6 +357,7 @@ def : Pat<(fabs FPR64IN32X:$rs1), (FSGNJX_D_IN32X $rs1, $rs1)>;
 def : Pat<(riscv_fclass FPR64IN32X:$rs1), (FCLASS_D_IN32X $rs1)>;
 
 def : PatFprFpr<fcopysign, FSGNJ_D_IN32X, FPR64IN32X, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_IN32X, FPR64IN32X, f64>;
 def : Pat<(fcopysign FPR64IN32X:$rs1, (fneg FPR64IN32X:$rs2)),
           (FSGNJN_D_IN32X $rs1, $rs2)>;
 def : Pat<(fcopysign FPR64IN32X:$rs1, FPR32INX:$rs2),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 7d89608de1223..e6c25e0844fb2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -31,6 +31,8 @@ def SDT_RISCVFROUND
                            SDTCisVT<3, XLenVT>]>;
 def SDT_RISCVFCLASS
     : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>;
+def SDT_RISCVFSGNJX
+    : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>;
 
 def riscv_fclass
     : SDNode<"RISCVISD::FCLASS", SDT_RISCVFCLASS>;
@@ -38,6 +40,9 @@ def riscv_fclass
 def riscv_fround
     : SDNode<"RISCVISD::FROUND", SDT_RISCVFROUND>;
 
+def riscv_fsgnjx
+    : SDNode<"RISCVISD::FSGNJX", SDT_RISCVFSGNJX>;
+
 def riscv_fmv_w_x_rv64
     : SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
 def riscv_fmv_x_anyextw_rv64
@@ -539,8 +544,10 @@ def : Pat<(fabs FPR32INX:$rs1), (FSGNJX_S_INX $rs1, $rs1)>;
 def : Pat<(riscv_fclass FPR32INX:$rs1), (FCLASS_S_INX $rs1)>;
 } // Predicates = [HasStdExtZfinx]
 
-foreach Ext = FExts in
+foreach Ext = FExts in {
 defm : PatFprFpr_m<fcopysign, FSGNJ_S, Ext>;
+defm : PatFprFpr_m<riscv_fsgnjx, FSGNJX_S, Ext>;
+}
 
 let Predicates = [HasStdExtF] in {
 def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index b5817237b7fd2..5580504061637 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -92,34 +92,34 @@ def simm5_plus1_nonzero : ImmLeaf<XLenVT,
 //===----------------------------------------------------------------------===//
 
 // Common class of scheduling definitions.
-// `ReadVMergeOp` will be prepended to reads if instruction is masked.
+// `ReadVPassthru` will be prepended to reads if instruction is masked.
 // `ReadVMask` will be appended to reads if instruction is masked.
 // Operands:
 //   `writes`       SchedWrites that are listed for each explicit def operand
 //                  in order.
 //   `reads`        SchedReads that are listed for each explicit use operand.
 //   `forceMasked`  Forced to be masked (e.g. Add-with-Carry Instructions).
-//   `forceMergeOpRead` Force to have read for merge operand.
+//   `forcePassthruRead` Force to have read for passthru operand.
 class SchedCommon<list<SchedWrite> writes, list<SchedRead> reads,
                   string mx = "WorstCase", int sew = 0, bit forceMasked = 0,
-                  bit forceMergeOpRead = 0> : Sched<[]> {
+                  bit forcePassthruRead = 0> : Sched<[]> {
   defvar isMasked = !ne(!find(NAME, "_MASK"), -1);
   defvar isMaskedOrForceMasked = !or(forceMasked, isMasked);
-  defvar mergeRead = !if(!or(!eq(mx, "WorstCase"), !eq(sew, 0)),
-                            !cast<SchedRead>("ReadVMergeOp_" # mx),
-                            !cast<SchedRead>("ReadVMergeOp_" # mx # "_E" #sew));
-  defvar needsMergeRead = !or(isMaskedOrForceMasked, forceMergeOpRead);
+  defvar passthruRead = !if(!or(!eq(mx, "WorstCase"), !eq(sew, 0)),
+                            !cast<SchedRead>("ReadVPassthru_" # mx),
+                            !cast<SchedRead>("ReadVPassthru_" # mx # "_E" #sew));
+  defvar needsPassthruRead = !or(isMaskedOrForceMasked, forcePassthruRead);
   defvar readsWithMask =
       !if(isMaskedOrForceMasked, !listconcat(reads, [ReadVMask]), reads);
   defvar allReads =
-      !if(needsMergeRead, !listconcat([mergeRead], readsWithMask), reads);
+      !if(needsPassthruRead, !listconcat([passthruRead], readsWithMask), reads);
   let SchedRW = !listconcat(writes, allReads);
 }
 
 // Common class of scheduling definitions for n-ary instructions.
 // The scheudling resources are relevant to LMUL and may be relevant to SEW.
 class SchedNary<string write, list<string> reads, string mx, int sew = 0,
-                bit forceMasked = 0, bit forceMergeOpRead = 0>
+                bit forceMasked = 0, bit forcePassthruRead = 0>
     : SchedCommon<[!cast<SchedWrite>(
                       !if(sew,
                           write # "_" # mx # "_E" # sew,
@@ -127,7 +127,7 @@ class SchedNary<string write, list<string> reads, string mx, int sew = 0,
                   !foreach(read, reads,
                            !cast<SchedRead>(!if(sew, read #"_" #mx #"_E" #sew,
                                                  read #"_" #mx))),
-                  mx, sew, forceMasked, forceMergeOpRead>;
+                  mx, sew, forceMasked, forcePassthruRead>;
 
 // Classes with postfix "MC" are only used in MC layer.
 // For these classes, we assume that they are with the worst case costs and
@@ -135,22 +135,22 @@ class SchedNary<string write, list<string> reads, string mx, int sew = 0,
 
 // For instructions with no operand.
 class SchedNullary<string write, string mx, int sew = 0, bit forceMasked = 0,
-                   bit forceMergeOpRead = 0>:
-  SchedNary<write, [], mx, sew, forceMasked, forceMergeOpRead>;
+                   bit forcePassthruRead = 0>:
+  SchedNary<write, [], mx, sew, forceMasked, forcePassthruRead>;
 class SchedNullaryMC<string write, bit forceMasked = 1>:
   SchedNullary<write, "WorstCase", forceMasked=forceMasked>;
 
 // For instructions with one operand.
 class SchedUnary<string write, string read0, string mx, int sew = 0,
-                 bit forceMasked = 0, bit forceMergeOpRead = 0>:
-  SchedNary<write, [read0], mx, sew, forceMasked, forceMergeOpRead>;
+                 bit forceMasked = 0, bit forcePassthruRead = 0>:
+  SchedNary<write, [read0], mx, sew, forceMasked, forcePassthruRead>;
 class SchedUnaryMC<string write, string read0, bit forceMasked = 1>:
   SchedUnary<write, read0, "WorstCase", forceMasked=forceMasked>;
 
 // For instructions with two operands.
 class SchedBinary<string write, string read0, string read1, string mx,
-                  int sew = 0, bit forceMasked = 0, bit forceMergeOpRead = 0>
-    : SchedNary<write, [read0, read1], mx, sew, forceMasked, forceMergeOpRead>;
+                  int sew = 0, bit forceMasked = 0, bit forcePassthruRead = 0>
+    : SchedNary<write, [read0, read1], mx, sew, forceMasked, forcePassthruRead>;
 class SchedBinaryMC<string write, string read0, string read1,
                     bit forceMasked = 1>:
   SchedBinary<write, read0, read1, "WorstCase", forceMasked=forceMasked>;
@@ -165,9 +165,9 @@ class SchedTernaryMC<string write, string read0, string read1, string read2,
 
 // For reduction instructions.
 class SchedReduction<string write, string read, string mx, int sew,
-                     bit forceMergeOpRead = 0>
+                     bit forcePassthruRead = 0>
     : SchedCommon<[!cast<SchedWrite>(write #"_" #mx #"_E" #sew)],
-                  !listsplat(!cast<SchedRead>(read), 3), mx, sew, forceMergeOpRead>;
+                  !listsplat(!cast<SchedRead>(read), 3), mx, sew, forcePassthruRead>;
 class SchedReductionMC<string write, string readV, string readV0>:
   SchedCommon<[!cast<SchedWrite>(write # "_WorstCase")],
               [!cast<SchedRead>(readV), !cast<SchedRead>(readV0)],
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index b860273d639ee..239d66b723439 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -751,7 +751,7 @@ class VPseudo<Instruction instr, LMULInfo m, dag outs, dag ins, int sew = 0> :
 
 class GetVTypePredicates<VTypeInfo vti> {
   list<Predicate> Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16],
-                                     !eq(vti.Scalar, bf16) : [HasVInstructionsBF16],
+                                     !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal],
                                      !eq(vti.Scalar, f32) : [HasVInstructionsAnyF],
                                      !eq(vti.Scalar, f64) : [HasVInstructionsF64],
                                      !eq(vti.SEW, 64) : [HasVInstructionsI64],
@@ -777,7 +777,7 @@ class VPseudoUSLoadNoMask<VReg RetClass,
 class VPseudoUSLoadMask<VReg RetClass,
                         int EEW> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   GPRMem:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -785,7 +785,7 @@ class VPseudoUSLoadMask<VReg RetClass,
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -811,7 +811,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
 class VPseudoUSLoadFFMask<VReg RetClass,
                           int EEW> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   GPRMem:$rs1,
                   VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -819,7 +819,7 @@ class VPseudoUSLoadFFMask<VReg RetClass,
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -845,7 +845,7 @@ class VPseudoSLoadNoMask<VReg RetClass,
 class VPseudoSLoadMask<VReg RetClass,
                        int EEW> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   GPRMem:$rs1, GPR:$rs2,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -853,7 +853,7 @@ class VPseudoSLoadMask<VReg RetClass,
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -890,7 +890,7 @@ class VPseudoILoadMask<VReg RetClass,
                        bit EarlyClobber,
                        int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   GPRMem:$rs1, IdxClass:$rs2,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -898,7 +898,7 @@ class VPseudoILoadMask<VReg RetClass,
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $merge", "$rd = $merge");
+  let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $passthru", "$rd = $passthru");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -963,13 +963,13 @@ class VPseudoSStoreMask<VReg StClass,
 
 class VPseudoNullaryNoMask<VReg RegClass> :
       Pseudo<(outs RegClass:$rd),
-             (ins RegClass:$merge,
+             (ins RegClass:$passthru,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -977,13 +977,13 @@ class VPseudoNullaryNoMask<VReg RegClass> :
 
 class VPseudoNullaryMask<VReg RegClass> :
       Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
-             (ins GetVRegNoV0<RegClass>.R:$merge,
+             (ins GetVRegNoV0<RegClass>.R:$passthru,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints ="$rd = $merge";
+  let Constraints ="$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let UsesMaskPolicy = 1;
@@ -1012,13 +1012,13 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
                          string Constraint = "",
                          int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, OpClass:$rs2,
+             (ins RetClass:$passthru, OpClass:$rs2,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1046,13 +1046,13 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
                                      string Constraint = "",
                                      int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, OpClass:$rs2, ixlenimm:$rm,
+             (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$rm,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1066,13 +1066,13 @@ class VPseudoUnaryMask<VReg RetClass,
                        string Constraint = "",
                        int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1085,14 +1085,14 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
                                    string Constraint = "",
                                    int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
                   VMaskOp:$vm, ixlenimm:$rm,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1106,12 +1106,12 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
                                 VReg OpClass,
                                 string Constraint = ""> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []> {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1124,13 +1124,13 @@ class VPseudoUnaryNoMask_FRM<VReg RetClass,
                              string Constraint = "",
                              int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, OpClass:$rs2, ixlenimm:$frm,
+             (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$frm,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1143,14 +1143,14 @@ class VPseudoUnaryMask_FRM<VReg RetClass,
                            string Constraint = "",
                            int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
                   VMaskOp:$vm, ixlenimm:$frm,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1185,13 +1185,13 @@ class VPseudoUnaryMaskGPROut :
 class VPseudoUnaryAnyMask<VReg RetClass,
                           VReg Op1Class> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, Op1Class:$rs2,
+             (ins RetClass:$passthru, Op1Class:$rs2,
                   VR:$vm, AVL:$vl, ixlenimm:$sew), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "@earlyclobber $rd, $rd = $merge";
+  let Constraints = "@earlyclobber $rd, $rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
 }
@@ -1219,13 +1219,13 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
                                 string Constraint,
                                 int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
+             (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
                   ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1239,12 +1239,12 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass,
                                       int UsesVXRM_ = 1,
                                       int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, ixlenimm:$rm,
+             (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, ixlenimm:$rm,
                   AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1260,14 +1260,14 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
                                           int UsesVXRM_,
                                           int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op1Class:$rs2, Op2Class:$rs1,
                   VMaskOp:$vm, ixlenimm:$rm, AVL:$vl,
                   ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1358,14 +1358,14 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
                               string Constraint,
                               int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op1Class:$rs2, Op2Class:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1377,14 +1377,14 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
                                RegisterClass Op1Class,
                                DAGOperand Op2Class> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op1Class:$rs2, Op2Class:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1394,7 +1394,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
                                            RegisterClass Op1Class,
                                            DAGOperand Op2Class> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op1Class:$rs2, Op2Class:$rs1,
                   VMaskOp:$vm,
                   ixlenimm:$rm,
@@ -1403,7 +1403,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1418,14 +1418,14 @@ class VPseudoBinaryMOutMask<VReg RetClass,
                             string Constraint,
                             int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge,
+             (ins RetClass:$passthru,
                   Op1Class:$rs2, Op2Class:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1440,14 +1440,14 @@ class VPseudoTiedBinaryMask<VReg RetClass,
                             string Constraint,
                             int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op2Class:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1461,7 +1461,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
                                         string Constraint,
                                         int TargetConstraintType = 1> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge,
+             (ins GetVRegNoV0<RetClass>.R:$passthru,
                   Op2Class:$rs1,
                   VMaskOp:$vm,
                   ixlenimm:$rm,
@@ -1470,7 +1470,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+  let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1511,13 +1511,13 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
                                LMULInfo MInfo,
                                int TargetConstraintType = 1> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
+             (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
                   VMV0:$carry, AVL:$vl, ixlenimm:$sew), []>,
       RISCVVPseudo {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let TargetOverlapConstraintType = TargetConstraintType;
   let HasVLOp = 1;
   let HasSEWOp = 1;
@@ -1602,14 +1602,14 @@ class VPseudoUSSegLoadMask<VReg RetClass,
                            int EEW,
                            bits<4> NF> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
                   VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
       RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1637,14 +1637,14 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
                              int EEW,
                              bits<4> NF> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
-             (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
                   VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
       RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1655,7 +1655,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
                             int EEW,
                             bits<4> NF> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, GPRMem:$rs1, GPR:$offset, AVL:$vl,
+             (ins RetClass:$passthru, GPRMem:$rs1, GPR:$offset, AVL:$vl,
              ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
       RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
@@ -1665,14 +1665,14 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
 }
 
 class VPseudoSSegLoadMask<VReg RetClass,
                           int EEW,
                           bits<4> NF> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
                   GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
                   ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -1680,7 +1680,7 @@ class VPseudoSSegLoadMask<VReg RetClass,
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let Constraints = "$rd = $merge";
+  let Constraints = "$rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1694,7 +1694,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
                             bits<4> NF,
                             bit Ordered> :
       Pseudo<(outs RetClass:$rd),
-             (ins RetClass:$merge, GPRMem:$rs1, IdxClass:$offset, AVL:$vl,
+             (ins RetClass:$passthru, GPRMem:$rs1, IdxClass:$offset, AVL:$vl,
                   ixlenimm:$sew, ixlenimm:$policy), []>,
       RISCVVPseudo,
       RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
@@ -1703,7 +1703,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
   let hasSideEffects = 0;
   // For vector indexed segment loads, the destination vector register groups
   // cannot overlap the source vector register group
-  let Constraints = "@earlyclobber $rd, $rd = $merge";
+  let Constraints = "@earlyclobber $rd, $rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -1716,7 +1716,7 @@ class VPseudoISegLoadMask<VReg RetClass,
                           bits<4> NF,
                           bit Ordered> :
       Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
-             (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+             (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
                   IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
                   ixlenimm:$policy), []>,
       RISCVVPseudo,
@@ -1726,7 +1726,7 @@ class VPseudoISegLoadMask<VReg RetClass,
   let hasSideEffects = 0;
   // For vector indexed segment loads, the destination vector register groups
   // cannot overlap the source vector register group
-  let Constraints = "@earlyclobber $rd, $rd = $merge";
+  let Constraints = "@earlyclobber $rd, $rd = $passthru";
   let HasVLOp = 1;
   let HasSEWOp = 1;
   let HasVecPolicyOp = 1;
@@ -2024,11 +2024,11 @@ multiclass VPseudoVSFS_M {
     let VLMul = mti.LMul.value in {
       def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>,
                            SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
-                                      forceMergeOpRead=true>;
+                                      forcePassthruRead=true>;
       let ForceTailAgnostic = true in
       def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>,
                                      SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
-                                                forceMergeOpRead=true>;
+                                                forcePassthruRead=true>;
     }
   }
 }
@@ -2038,11 +2038,11 @@ multiclass VPseudoVID_V {
     defvar mx = m.MX;
     let VLMul = m.value in {
       def "_V_" # mx : VPseudoNullaryNoMask<m.vrclass>,
-                         SchedNullary<"WriteVIdxV", mx, forceMergeOpRead=true>;
+                         SchedNullary<"WriteVIdxV", mx, forcePassthruRead=true>;
       def "_V_" # mx # "_MASK" : VPseudoNullaryMask<m.vrclass>,
                                    RISCVMaskedPseudo<MaskIdx=1>,
                                    SchedNullary<"WriteVIdxV", mx,
-                                                forceMergeOpRead=true>;
+                                                forcePassthruRead=true>;
     }
   }
 }
@@ -2063,11 +2063,11 @@ multiclass VPseudoVIOTA_M {
     let VLMul = m.value in {
       def "_" # mx : VPseudoUnaryNoMask<m.vrclass, VR, constraint>,
                        SchedUnary<"WriteVIotaV", "ReadVIotaV", mx,
-                                  forceMergeOpRead=true>;
+                                  forcePassthruRead=true>;
       def "_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>,
                                  RISCVMaskedPseudo<MaskIdx=2, ActiveAffectsRes=true>,
                                  SchedUnary<"WriteVIotaV", "ReadVIotaV", mx,
-                                            forceMergeOpRead=true>;
+                                            forcePassthruRead=true>;
     }
   }
 }
@@ -2227,7 +2227,7 @@ multiclass VPseudoVGTR_EI16_VV {
               : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul,
                                   constraint, e>,
                 SchedBinary<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data",
-                            "ReadVRGatherEI16VV_index", mx, e, forceMergeOpRead=true>;
+                            "ReadVRGatherEI16VV_index", mx, e, forcePassthruRead=true>;
         }
       }
     }
@@ -2246,7 +2246,7 @@ multiclass VPseudoVSLD1_VX<string Constraint = ""> {
   foreach m = MxList in {
     defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>,
                  SchedBinary<"WriteVISlide1X", "ReadVISlideV", "ReadVISlideX",
-                             m.MX, forceMergeOpRead=true>;
+                             m.MX, forcePassthruRead=true>;
   }
 }
 
@@ -2267,7 +2267,7 @@ multiclass VPseudoVSLD1_VF<string Constraint = ""> {
       defm "_V" #f.FX
           : VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>,
             SchedBinary<"WriteVFSlide1F", "ReadVFSlideV", "ReadVFSlideF", m.MX,
-                      forceMergeOpRead=true>;
+                      forcePassthruRead=true>;
     }
   }
 }
@@ -2445,7 +2445,7 @@ multiclass VPseudoVMRG_FM {
           : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, m.vrclass,
                                      f.fprclass, m>,
           SchedBinary<"WriteVFMergeV", "ReadVFMergeV", "ReadVFMergeF", mx,
-                      forceMasked=1, forceMergeOpRead=true>;
+                      forceMasked=1, forcePassthruRead=true>;
     }
   }
 }
@@ -2472,13 +2472,13 @@ multiclass VPseudoUnaryVMV_V_X_I {
       let VLMul = m.value in {
         def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
                          SchedUnary<"WriteVIMovV", "ReadVIMovV", mx,
-                                    forceMergeOpRead=true>;
+                                    forcePassthruRead=true>;
         def "_X_" # mx : VPseudoUnaryNoMask<m.vrclass, GPR>,
                          SchedUnary<"WriteVIMovX", "ReadVIMovX", mx,
-                                    forceMergeOpRead=true>;
+                                    forcePassthruRead=true>;
         def "_I_" # mx : VPseudoUnaryNoMask<m.vrclass, simm5>,
                          SchedNullary<"WriteVIMovI", mx,
-                                      forceMergeOpRead=true>;
+                                      forcePassthruRead=true>;
       }
     }
   }
@@ -2491,7 +2491,7 @@ multiclass VPseudoVMV_F {
       let VLMul = m.value in {
         def "_" # f.FX # "_" # mx :
           VPseudoUnaryNoMask<m.vrclass, f.fprclass>,
-          SchedUnary<"WriteVFMovV", "ReadVFMovF", mx, forceMergeOpRead=true>;
+          SchedUnary<"WriteVFMovV", "ReadVFMovF", mx, forcePassthruRead=true>;
       }
     }
   }
@@ -2503,11 +2503,11 @@ multiclass VPseudoVCLS_V {
     let VLMul = m.value in {
       def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
                        SchedUnary<"WriteVFClassV", "ReadVFClassV", mx,
-                                  forceMergeOpRead=true>;
+                                  forcePassthruRead=true>;
       def "_V_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>,
                                  RISCVMaskedPseudo<MaskIdx=2>,
                                  SchedUnary<"WriteVFClassV", "ReadVFClassV", mx,
-                                            forceMergeOpRead=true>;
+                                            forcePassthruRead=true>;
     }
   }
 }
@@ -2523,12 +2523,12 @@ multiclass VPseudoVSQR_V_RM {
         let SEW = e in {
           def "_V" # suffix : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>,
                               SchedUnary<"WriteVFSqrtV", "ReadVFSqrtV", mx, e,
-                                         forceMergeOpRead=true>;
+                                         forcePassthruRead=true>;
           def "_V" #suffix # "_MASK"
               : VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>,
                 RISCVMaskedPseudo<MaskIdx = 2>,
                 SchedUnary<"WriteVFSqrtV", "ReadVFSqrtV", mx, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
         }
       }
   }
@@ -2541,11 +2541,11 @@ multiclass VPseudoVRCP_V {
       let VLMul = m.value in {
         def "_V_" # mx # "_E" # e
             : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
-              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
         def "_V_" # mx # "_E" # e # "_MASK"
             : VPseudoUnaryMask<m.vrclass, m.vrclass>,
               RISCVMaskedPseudo<MaskIdx = 2>,
-              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
       }
     }
   }
@@ -2558,11 +2558,11 @@ multiclass VPseudoVRCP_V_RM {
       let VLMul = m.value in {
         def "_V_" # mx # "_E" # e
             : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>,
-              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
         def "_V_" # mx # "_E" # e # "_MASK"
             : VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>,
               RISCVMaskedPseudo<MaskIdx = 2>,
-              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+              SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
       }
     }
   }
@@ -2575,11 +2575,11 @@ multiclass PseudoVEXT_VF2 {
     defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF4"), !eq(mx, "MF2"), !eq(mx, "M1")), 1, 3);
     let VLMul = m.value in {
       def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
-                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
       def "_" # mx # "_MASK" :
         VPseudoUnaryMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
         RISCVMaskedPseudo<MaskIdx=2>,
-        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
     }
   }
 }
@@ -2591,11 +2591,11 @@ multiclass PseudoVEXT_VF4 {
     defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF2"), !eq(mx, "M1"), !eq(mx, "M2")), 1, 3);
     let VLMul = m.value in {
       def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
-                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
       def "_" # mx # "_MASK" :
         VPseudoUnaryMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
         RISCVMaskedPseudo<MaskIdx=2>,
-        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
     }
   }
 }
@@ -2607,11 +2607,11 @@ multiclass PseudoVEXT_VF8 {
     defvar CurrTypeConstraints = !if(!or(!eq(mx, "M1"), !eq(mx, "M2"), !eq(mx, "M4")), 1, 3);
     let VLMul = m.value in {
       def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
-                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+                     SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
       def "_" # mx # "_MASK" :
         VPseudoUnaryMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
         RISCVMaskedPseudo<MaskIdx=2>,
-        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+        SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
     }
   }
 }
@@ -2657,16 +2657,16 @@ multiclass VPseudoVGTR_VV_VX_VI {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VX<m, constraint>,
               SchedBinary<"WriteVRGatherVX", "ReadVRGatherVX_data",
-                          "ReadVRGatherVX_index", mx, forceMergeOpRead=true>;
+                          "ReadVRGatherVX_index", mx, forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI<uimm5, m, constraint>,
               SchedUnary<"WriteVRGatherVI", "ReadVRGatherVI_data", mx,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
 
     defvar sews = SchedSEWSet<mx>.val;
     foreach e = sews in {
       defm "" : VPseudoBinaryV_VV<m, constraint, e>,
                 SchedBinary<"WriteVRGatherVV", "ReadVRGatherVV_data",
-                              "ReadVRGatherVV_index", mx, e, forceMergeOpRead=true>;
+                              "ReadVRGatherVV_index", mx, e, forcePassthruRead=true>;
     }
   }
 }
@@ -2676,12 +2676,12 @@ multiclass VPseudoVSALU_VV_VX_VI<bit Commutable = 0> {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
               SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVSALUX", "ReadVSALUV", "ReadVSALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI<simm5, m>,
-              SchedUnary<"WriteVSALUI", "ReadVSALUV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVSALUI", "ReadVSALUV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -2691,12 +2691,12 @@ multiclass VPseudoVSHT_VV_VX_VI {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m>,
               SchedBinary<"WriteVShiftV", "ReadVShiftV", "ReadVShiftV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVShiftX", "ReadVShiftV", "ReadVShiftX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI<uimm5, m>,
-              SchedUnary<"WriteVShiftI", "ReadVShiftV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVShiftI", "ReadVShiftV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -2705,12 +2705,12 @@ multiclass VPseudoVSSHT_VV_VX_VI_RM {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV_RM<m>,
               SchedBinary<"WriteVSShiftV", "ReadVSShiftV", "ReadVSShiftV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX_RM<m>,
               SchedBinary<"WriteVSShiftX", "ReadVSShiftV", "ReadVSShiftX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI_RM<uimm5, m>,
-              SchedUnary<"WriteVSShiftI", "ReadVSShiftV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVSShiftI", "ReadVSShiftV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -2719,12 +2719,12 @@ multiclass VPseudoVALU_VV_VX_VI<bit Commutable = 0> {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
             SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
-                        forceMergeOpRead=true>;
+                        forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
             SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
-                        forceMergeOpRead=true>;
+                        forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI<simm5, m>,
-            SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
+            SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -2733,10 +2733,10 @@ multiclass VPseudoVSALU_VV_VX {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m>,
               SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVSALUX", "ReadVSALUV", "ReadVSALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2745,10 +2745,10 @@ multiclass VPseudoVSMUL_VV_VX_RM {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV_RM<m, Commutable=1>,
               SchedBinary<"WriteVSMulV", "ReadVSMulV", "ReadVSMulV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX_RM<m>,
               SchedBinary<"WriteVSMulX", "ReadVSMulV", "ReadVSMulX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2757,10 +2757,10 @@ multiclass VPseudoVAALU_VV_VX_RM<bit Commutable = 0> {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV_RM<m, Commutable=Commutable>,
               SchedBinary<"WriteVAALUV", "ReadVAALUV", "ReadVAALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX_RM<m>,
               SchedBinary<"WriteVAALUX", "ReadVAALUV", "ReadVAALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2802,14 +2802,14 @@ multiclass VPseudoVFMUL_VV_VF_RM {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
       defm "" : VPseudoBinaryFV_VV_RM<m, e>,
                 SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX, e,
-                            forceMergeOpRead=true>;
+                            forcePassthruRead=true>;
   }
 
   foreach f = FPList in {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
                 SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX,
-                            f.SEW, forceMergeOpRead=true>;
+                            f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2821,7 +2821,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
     foreach e = sews in {
       defm "" : VPseudoBinaryFV_VV_RM<m, e>,
                 SchedBinary<"WriteVFDivV", "ReadVFDivV", "ReadVFDivV", mx, e,
-                            forceMergeOpRead=true>;
+                            forcePassthruRead=true>;
     }
   }
 
@@ -2829,7 +2829,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
                 SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
-                            forceMergeOpRead=true>;
+                            forcePassthruRead=true>;
     }
   }
 }
@@ -2839,7 +2839,7 @@ multiclass VPseudoVFRDIV_VF_RM {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
                 SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
-                            forceMergeOpRead=true>;
+                            forcePassthruRead=true>;
     }
   }
 }
@@ -2848,10 +2848,10 @@ multiclass VPseudoVALU_VV_VX {
  foreach m = MxList in {
     defm "" : VPseudoBinaryV_VV<m>,
             SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
-                        forceMergeOpRead=true>;
+                        forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
             SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
-                        forceMergeOpRead=true>;
+                        forcePassthruRead=true>;
   }
 }
 
@@ -2860,14 +2860,14 @@ multiclass VPseudoVSGNJ_VV_VF {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
     defm "" : VPseudoBinaryV_VV<m, sew=e>,
               SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX,
-                          e, forceMergeOpRead=true>;
+                          e, forcePassthruRead=true>;
   }
 
   foreach f = FPList in {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF<m, f, sew=f.SEW>,
                 SchedBinary<"WriteVFSgnjF", "ReadVFSgnjV", "ReadVFSgnjF", m.MX,
-                            f.SEW, forceMergeOpRead=true>;
+                            f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2877,14 +2877,14 @@ multiclass VPseudoVMAX_VV_VF {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
       defm "" : VPseudoBinaryV_VV<m, sew=e>,
                 SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV",
-                            m.MX, e, forceMergeOpRead=true>;
+                            m.MX, e, forcePassthruRead=true>;
   }
 
   foreach f = FPList in {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF<m, f, sew=f.SEW>,
                 SchedBinary<"WriteVFMinMaxF", "ReadVFMinMaxV", "ReadVFMinMaxF",
-                            m.MX, f.SEW, forceMergeOpRead=true>;
+                            m.MX, f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2894,14 +2894,14 @@ multiclass VPseudoVALU_VV_VF_RM {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
       defm "" : VPseudoBinaryFV_VV_RM<m, e>,
                 SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX, e,
-                            forceMergeOpRead=true>;
+                            forcePassthruRead=true>;
   }
 
   foreach f = FPList in {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
                 SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
-                            f.SEW, forceMergeOpRead=true>;
+                            f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2911,7 +2911,7 @@ multiclass VPseudoVALU_VF_RM {
     foreach m = f.MxList in {
       defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
                 SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
-                            f.SEW, forceMergeOpRead=true>;
+                            f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2921,9 +2921,9 @@ multiclass VPseudoVALU_VX_VI {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VI<simm5, m>,
-              SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -2932,10 +2932,10 @@ multiclass VPseudoVWALU_VV_VX<bit Commutable = 0> {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryW_VV<m, Commutable=Commutable>,
               SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryW_VX<m>, 
               SchedBinary<"WriteVIWALUX", "ReadVIWALUV", "ReadVIWALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2944,10 +2944,10 @@ multiclass VPseudoVWMUL_VV_VX<bit Commutable = 0> {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryW_VV<m, Commutable=Commutable>,
               SchedBinary<"WriteVIWMulV", "ReadVIWMulV", "ReadVIWMulV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryW_VX<m>,
               SchedBinary<"WriteVIWMulX", "ReadVIWMulV", "ReadVIWMulX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2956,14 +2956,14 @@ multiclass VPseudoVWMUL_VV_VF_RM {
     foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
     defm "" : VPseudoBinaryW_VV_RM<m, sew=e>,
               SchedBinary<"WriteVFWMulV", "ReadVFWMulV", "ReadVFWMulV", m.MX,
-                          e, forceMergeOpRead=true>;
+                          e, forcePassthruRead=true>;
   }
 
   foreach f = FPListW in {
     foreach m = f.MxListFW in {
       defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
                 SchedBinary<"WriteVFWMulF", "ReadVFWMulV", "ReadVFWMulF", m.MX,
-                          f.SEW, forceMergeOpRead=true>;
+                          f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -2973,10 +2973,10 @@ multiclass VPseudoVWALU_WV_WX {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryW_WV<m>,
               SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryW_WX<m>,
               SchedBinary<"WriteVIWALUX", "ReadVIWALUV", "ReadVIWALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -2985,14 +2985,14 @@ multiclass VPseudoVFWALU_VV_VF_RM {
     foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
       defm "" : VPseudoBinaryW_VV_RM<m, sew=e>,
                 SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
-                            e, forceMergeOpRead=true>;
+                            e, forcePassthruRead=true>;
   }
 
   foreach f = FPListW in {
     foreach m = f.MxListFW in {
       defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
                 SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
-                          f.SEW, forceMergeOpRead=true>;
+                          f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -3002,13 +3002,13 @@ multiclass VPseudoVFWALU_WV_WF_RM {
     foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
       defm "" : VPseudoBinaryW_WV_RM<m, sew=e>,
                 SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
-                            e, forceMergeOpRead=true>;
+                            e, forcePassthruRead=true>;
   }
   foreach f = FPListW in {
     foreach m = f.MxListFW in {
       defm "" : VPseudoBinaryW_WF_RM<m, f, sew=f.SEW>,
                 SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
-                            f.SEW, forceMergeOpRead=true>;
+                            f.SEW, forcePassthruRead=true>;
     }
   }
 }
@@ -3020,17 +3020,17 @@ multiclass VPseudoVMRG_VM_XM_IM {
       VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
                                m.vrclass, m.vrclass, m>,
       SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     def "_VXM" # "_" # m.MX:
       VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
                                m.vrclass, GPR, m>,
       SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     def "_VIM" # "_" # m.MX:
       VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
                                m.vrclass, simm5, m>,
       SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3039,13 +3039,13 @@ multiclass VPseudoVCALU_VM_XM_IM {
     defvar mx = m.MX;
     defm "" : VPseudoTiedBinaryV_VM<m, Commutable=1>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoTiedBinaryV_XM<m>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoTiedBinaryV_IM<m>,
               SchedUnary<"WriteVICALUI", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3054,10 +3054,10 @@ multiclass VPseudoVCALU_VM_XM {
     defvar mx = m.MX;
     defm "" : VPseudoTiedBinaryV_VM<m>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoTiedBinaryV_XM<m>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3068,13 +3068,13 @@ multiclass VPseudoVCALUM_VM_XM_IM {
     defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
                                 Commutable=1, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
               SchedUnary<"WriteVICALUI", "ReadVICALUV", mx, forceMasked=1,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3085,11 +3085,11 @@ multiclass VPseudoVCALUM_VM_XM {
     defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
                                 TargetConstraintType=2>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
                                 TargetConstraintType=2>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3100,13 +3100,13 @@ multiclass VPseudoVCALUM_V_X_I {
     defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint,
                                 Commutable=1, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=0, Constraint=constraint>,
               SchedUnary<"WriteVICALUI", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3116,10 +3116,10 @@ multiclass VPseudoVCALUM_V_X {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
               SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3128,13 +3128,13 @@ multiclass VPseudoVNCLP_WV_WX_WI_RM {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_WV_RM<m>,
               SchedBinary<"WriteVNClipV", "ReadVNClipV", "ReadVNClipV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_WX_RM<m>,
               SchedBinary<"WriteVNClipX", "ReadVNClipV", "ReadVNClipX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_WI_RM<m>,
               SchedUnary<"WriteVNClipI", "ReadVNClipV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3143,13 +3143,13 @@ multiclass VPseudoVNSHT_WV_WX_WI {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_WV<m>,
               SchedBinary<"WriteVNShiftV", "ReadVNShiftV", "ReadVNShiftV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_WX<m>,
               SchedBinary<"WriteVNShiftX", "ReadVNShiftV", "ReadVNShiftX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_WI<m>,
               SchedUnary<"WriteVNShiftI", "ReadVNShiftV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -3591,7 +3591,7 @@ multiclass VPseudoVCVTI_V {
   foreach m = MxListF in {
     defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>,
               SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3599,7 +3599,7 @@ multiclass VPseudoVCVTI_V_RM {
   foreach m = MxListF in {
     defm _V : VPseudoConversionRoundingMode<m.vrclass, m.vrclass, m>,
               SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3607,7 +3607,7 @@ multiclass VPseudoVCVTI_RM_V {
   foreach m = MxListF in {
     defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m>,
               SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3615,7 +3615,7 @@ multiclass VPseudoVFROUND_NOEXCEPT_V {
   foreach m = MxListF in {
     defm _V : VPseudoConversionNoExcept<m.vrclass, m.vrclass, m>,
               SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3624,7 +3624,7 @@ multiclass VPseudoVCVTF_V_RM {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
       defm _V : VPseudoConversionRoundingMode<m.vrclass, m.vrclass, m, sew=e>,
                 SchedUnary<"WriteVFCvtIToFV", "ReadVFCvtIToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3633,7 +3633,7 @@ multiclass VPseudoVCVTF_RM_V {
     foreach e = SchedSEWSet<m.MX, isF=1>.val in
       defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m, sew=e>,
                 SchedUnary<"WriteVFCvtIToFV", "ReadVFCvtIToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3642,7 +3642,7 @@ multiclass VPseudoVWCVTI_V {
   foreach m = MxListFW in {
     defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, TargetConstraintType=3>,
               SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3651,7 +3651,7 @@ multiclass VPseudoVWCVTI_V_RM {
   foreach m = MxListFW in {
     defm _V : VPseudoConversionRoundingMode<m.wvrclass, m.vrclass, m, constraint, TargetConstraintType=3>,
               SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3660,7 +3660,7 @@ multiclass VPseudoVWCVTI_RM_V {
   foreach m = MxListFW in {
     defm _V : VPseudoConversionRM<m.wvrclass, m.vrclass, m, constraint>,
               SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3671,7 +3671,7 @@ multiclass VPseudoVWCVTF_V {
       defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=e,
                                   TargetConstraintType=3>,
                 SchedUnary<"WriteVFWCvtIToFV", "ReadVFWCvtIToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3682,7 +3682,7 @@ multiclass VPseudoVWCVTD_V {
       defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=e,
                                   TargetConstraintType=3>,
                 SchedUnary<"WriteVFWCvtFToFV", "ReadVFWCvtFToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3691,7 +3691,7 @@ multiclass VPseudoVNCVTI_W {
   foreach m = MxListW in {
     defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
               SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3700,7 +3700,7 @@ multiclass VPseudoVNCVTI_W_RM {
   foreach m = MxListW in {
     defm _W : VPseudoConversionRoundingMode<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
               SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3709,7 +3709,7 @@ multiclass VPseudoVNCVTI_RM_W {
   foreach m = MxListW in {
     defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
               SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -3721,7 +3721,7 @@ multiclass VPseudoVNCVTF_W_RM {
                                               constraint, sew=e,
                                               TargetConstraintType=2>,
                 SchedUnary<"WriteVFNCvtIToFV", "ReadVFNCvtIToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3731,7 +3731,7 @@ multiclass VPseudoVNCVTF_RM_W {
     foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
       defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint, sew=e>,
                 SchedUnary<"WriteVFNCvtIToFV", "ReadVFNCvtIToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3742,7 +3742,7 @@ multiclass VPseudoVNCVTD_W {
       defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, sew=e,
                                   TargetConstraintType=2>,
                 SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3754,7 +3754,7 @@ multiclass VPseudoVNCVTD_W_RM {
                                               constraint, sew=e,
                                               TargetConstraintType=2>,
                 SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, e,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -3922,14 +3922,14 @@ class VPatUnaryNoMask<string intrinsic_name,
                       VReg op2_reg_class,
                       bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    VLOpFrag)),
                    (!cast<Instruction>(
                      !if(isSEWAware,
                          inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
                          inst#"_"#kind#"_"#vlmul.MX))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    GPR:$vl, log2sew, TU_MU)>;
 
@@ -3944,7 +3944,7 @@ class VPatUnaryNoMaskRoundingMode<string intrinsic_name,
                                   VReg op2_reg_class,
                                   bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (XLenVT timm:$round),
                    VLOpFrag)),
@@ -3952,7 +3952,7 @@ class VPatUnaryNoMaskRoundingMode<string intrinsic_name,
                       !if(isSEWAware,
                           inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
                           inst#"_"#kind#"_"#vlmul.MX))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (XLenVT timm:$round),
                    GPR:$vl, log2sew, TU_MU)>;
@@ -3968,7 +3968,7 @@ class VPatUnaryNoMaskRTZ<string intrinsic_name,
                          VReg op2_reg_class,
                          bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (XLenVT 0b001),
                    VLOpFrag)),
@@ -3976,7 +3976,7 @@ class VPatUnaryNoMaskRTZ<string intrinsic_name,
                       !if(isSEWAware,
                           inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
                           inst#"_"#kind#"_"#vlmul.MX))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    GPR:$vl, log2sew, TU_MU)>;
 
@@ -3992,7 +3992,7 @@ class VPatUnaryMask<string intrinsic_name,
                     VReg op2_reg_class,
                     bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
                    VLOpFrag, (XLenVT timm:$policy))),
@@ -4000,7 +4000,7 @@ class VPatUnaryMask<string intrinsic_name,
                       !if(isSEWAware,
                           inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                           inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
 
@@ -4016,7 +4016,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
                                 VReg op2_reg_class,
                                 bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
                    (XLenVT timm:$round),
@@ -4025,7 +4025,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
                       !if(isSEWAware,
                           inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                           inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
                    (XLenVT timm:$round),
@@ -4043,7 +4043,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
                        VReg op2_reg_class,
                        bit isSEWAware = 0> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
                    (XLenVT 0b001),
@@ -4052,7 +4052,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
                       !if(isSEWAware,
                           inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                           inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
                    GPR:$vl, log2sew, (XLenVT timm:$policy))>;
@@ -4071,12 +4071,12 @@ class VPatMaskUnaryMask<string intrinsic_name,
                         string inst,
                         MTypeInfo mti> :
   Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name#"_mask")
-                (mti.Mask VR:$merge),
+                (mti.Mask VR:$passthru),
                 (mti.Mask VR:$rs2),
                 (mti.Mask V0),
                 VLOpFrag)),
                 (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
-                (mti.Mask VR:$merge),
+                (mti.Mask VR:$passthru),
                 (mti.Mask VR:$rs2),
                 (mti.Mask V0), GPR:$vl, mti.Log2SEW, TU_MU)>;
 
@@ -4091,12 +4091,12 @@ class VPatUnaryAnyMask<string intrinsic,
                        VReg result_reg_class,
                        VReg op1_reg_class> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (mask_type VR:$rs2),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew))
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (mask_type VR:$rs2),
                    GPR:$vl, log2sew)>;
@@ -4128,12 +4128,12 @@ class VPatBinaryNoMaskTU<string intrinsic_name,
                          VReg op1_reg_class,
                          DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    VLOpFrag)),
                    (!cast<Instruction>(inst)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    GPR:$vl, sew, TU_MU)>;
@@ -4148,13 +4148,13 @@ class VPatBinaryNoMaskTURoundingMode<string intrinsic_name,
                                      VReg op1_reg_class,
                                      DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (XLenVT timm:$round),
                    VLOpFrag)),
                    (!cast<Instruction>(inst)
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (XLenVT timm:$round),
@@ -4190,13 +4190,13 @@ class VPatBinaryMask<string intrinsic_name,
                      VReg op1_reg_class,
                      DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_MASK")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0), GPR:$vl, sew)>;
@@ -4212,13 +4212,13 @@ class VPatBinaryMaskPolicy<string intrinsic_name,
                            VReg op1_reg_class,
                            DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
@@ -4234,14 +4234,14 @@ class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
                                        VReg op1_reg_class,
                                        DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    (XLenVT timm:$round),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
@@ -4260,13 +4260,13 @@ class VPatBinaryMaskSwapped<string intrinsic_name,
                             VReg op1_reg_class,
                             DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (op1_type op1_reg_class:$rs1),
                    (mask_type V0),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_MASK")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0), GPR:$vl, sew)>;
@@ -4315,12 +4315,12 @@ class VPatTiedBinaryNoMaskTU<string intrinsic_name,
                              VReg result_reg_class,
                              DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_TIED")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    GPR:$vl, sew, TU_MU)>;
 
@@ -4332,13 +4332,13 @@ class VPatTiedBinaryNoMaskTURoundingMode<string intrinsic_name,
                                          VReg result_reg_class,
                                          DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type result_reg_class:$merge),
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (XLenVT timm:$round),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_TIED")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (XLenVT timm:$round),
                    GPR:$vl, sew, TU_MU)>;
@@ -4352,13 +4352,13 @@ class VPatTiedBinaryMask<string intrinsic_name,
                          VReg result_reg_class,
                          DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK_TIED")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
 
@@ -4371,14 +4371,14 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
                                      VReg result_reg_class,
                                      DAGOperand op2_kind> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
-                   (result_type result_reg_class:$merge),
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    (XLenVT timm:$round),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK_TIED")
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
                    (XLenVT timm:$round),
@@ -4678,15 +4678,15 @@ multiclass VPatNullaryV<string intrinsic, string instruction> {
   foreach vti = AllIntegerVectors in {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic)
-                            (vti.Vector vti.RegClass:$merge),
+                            (vti.Vector vti.RegClass:$passthru),
                             VLOpFrag)),
                             (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX)
-                            vti.RegClass:$merge, GPR:$vl, vti.Log2SEW, TU_MU)>;
+                            vti.RegClass:$passthru, GPR:$vl, vti.Log2SEW, TU_MU)>;
       def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
-                            (vti.Vector vti.RegClass:$merge),
+                            (vti.Vector vti.RegClass:$passthru),
                             (vti.Mask V0), VLOpFrag, (XLenVT timm:$policy))),
                             (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
-                            vti.RegClass:$merge, (vti.Mask V0),
+                            vti.RegClass:$passthru, (vti.Mask V0),
                             GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
   }
   }
@@ -4781,13 +4781,13 @@ multiclass VPatBinaryCarryInTAIL<string intrinsic,
                                  VReg op1_reg_class,
                                  DAGOperand op2_kind> {
   def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
-                         (result_type result_reg_class:$merge),
+                         (result_type result_reg_class:$passthru),
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
                          (mask_type V0),
                          VLOpFrag)),
                          (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
-                         (result_type result_reg_class:$merge),
+                         (result_type result_reg_class:$passthru),
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
                          (mask_type V0), GPR:$vl, sew)>;
@@ -6065,12 +6065,12 @@ multiclass VPatCompare_VI<string intrinsic, string inst,
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX#"_MASK");
     let Predicates = GetVTypePredicates<vti>.Predicates in
-    def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$merge),
+    def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$passthru),
                                   (vti.Vector vti.RegClass:$rs1),
                                   (vti.Scalar ImmType:$rs2),
                                   (vti.Mask V0),
                                   VLOpFrag)),
-              (PseudoMask VR:$merge, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
+              (PseudoMask VR:$passthru, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
                           (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
   }
 }
@@ -6215,24 +6215,24 @@ foreach vti = AllIntegerVectors in {
   // to use a more complex splat sequence. Add the pattern for all VTs for
   // consistency.
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$passthru),
                                            (vti.Vector vti.RegClass:$rs2),
                                            (vti.Vector vti.RegClass:$rs1),
                                            VLOpFrag)),
               (!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX)
-                                                        vti.RegClass:$merge,
+                                                        vti.RegClass:$passthru,
                                                         vti.RegClass:$rs1,
                                                         vti.RegClass:$rs2,
                                                         GPR:$vl,
                                                         vti.Log2SEW, TU_MU)>;
-    def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$passthru),
                                                 (vti.Vector vti.RegClass:$rs2),
                                                 (vti.Vector vti.RegClass:$rs1),
                                                 (vti.Mask V0),
                                                 VLOpFrag,
                                                 (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK")
-                                                        vti.RegClass:$merge,
+                                                        vti.RegClass:$passthru,
                                                         vti.RegClass:$rs1,
                                                         vti.RegClass:$rs2,
                                                         (vti.Mask V0),
@@ -6241,24 +6241,24 @@ foreach vti = AllIntegerVectors in {
                                                         (XLenVT timm:$policy))>;
 
     // Match VSUB with a small immediate to vadd.vi by negating the immediate.
-    def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$passthru),
                                           (vti.Vector vti.RegClass:$rs1),
                                           (vti.Scalar simm5_plus1:$rs2),
                                           VLOpFrag)),
               (!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX)
-                                                      vti.RegClass:$merge,
+                                                      vti.RegClass:$passthru,
                                                       vti.RegClass:$rs1,
                                                       (NegImm simm5_plus1:$rs2),
                                                       GPR:$vl,
                                                       vti.Log2SEW, TU_MU)>;
-    def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$passthru),
                                                (vti.Vector vti.RegClass:$rs1),
                                                (vti.Scalar simm5_plus1:$rs2),
                                                (vti.Mask V0),
                                                VLOpFrag,
                                                (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX#"_MASK")
-                                                        vti.RegClass:$merge,
+                                                        vti.RegClass:$passthru,
                                                         vti.RegClass:$rs1,
                                                         (NegImm simm5_plus1:$rs2),
                                                         (vti.Mask V0),
@@ -6781,26 +6781,20 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 let Predicates = [HasVInstructionsAnyF] in {
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
   foreach f = FPList in {
-    foreach m = f.MxList in {
-      defvar mx = m.MX;
-      let VLMul = m.value in {
-        let HasSEWOp = 1, BaseInstr = VFMV_F_S in
-        def "PseudoVFMV_" # f.FX # "_S_" # mx :
-          Pseudo<(outs f.fprclass:$rd),
-                 (ins m.vrclass:$rs2, ixlenimm:$sew), []>,
-          Sched<[WriteVMovFS, ReadVMovFS]>,
-          RISCVVPseudo;
-        let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
-            Constraints = "$rd = $rs1" in
-        def "PseudoVFMV_S_" # f.FX # "_" # mx :
-                                          Pseudo<(outs m.vrclass:$rd),
-                                                 (ins m.vrclass:$rs1, f.fprclass:$rs2,
-                                                      AVL:$vl, ixlenimm:$sew),
-                                                 []>,
-          Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
-          RISCVVPseudo;
-      }
-    }
+    let HasSEWOp = 1, BaseInstr = VFMV_F_S in
+    def "PseudoVFMV_" # f.FX # "_S" :
+      Pseudo<(outs f.fprclass:$rd),
+             (ins VR:$rs2, ixlenimm:$sew), []>,
+      Sched<[WriteVMovFS, ReadVMovFS]>,
+      RISCVVPseudo;
+    let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
+        Constraints = "$rd = $rs1" in
+    def "PseudoVFMV_S_" # f.FX :
+      Pseudo<(outs VR:$rd),
+             (ins VR:$rs1, f.fprclass:$rs2, AVL:$vl, ixlenimm:$sew),
+             []>,
+      Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
+      RISCVVPseudo;
   }
 }
 } // Predicates = [HasVInstructionsAnyF]
@@ -6907,20 +6901,20 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
 foreach vti = AllIntegerVectors in {
   // Emit shift by 1 as an add since it might be faster.
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$passthru),
                                           (vti.Vector vti.RegClass:$rs1),
                                           (XLenVT 1), VLOpFrag)),
               (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX)
-                 vti.RegClass:$merge, vti.RegClass:$rs1,
+                 vti.RegClass:$passthru, vti.RegClass:$rs1,
                  vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$passthru),
                                                (vti.Vector vti.RegClass:$rs1),
                                                (XLenVT 1),
                                                (vti.Mask V0),
                                                VLOpFrag,
                                                (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX#"_MASK")
-                                                          vti.RegClass:$merge,
+                                                          vti.RegClass:$passthru,
                                                           vti.RegClass:$rs1,
                                                           vti.RegClass:$rs1,
                                                           (vti.Mask V0),
@@ -7040,7 +7034,8 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
 // 11.16. Vector Integer Move Instructions
 //===----------------------------------------------------------------------===//
 foreach vti = AllVectors in {
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
+  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
+                       GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru),
                                              (vti.Vector vti.RegClass:$rs1),
                                              VLOpFrag)),
@@ -7235,34 +7230,30 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
 // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses
 // int_riscv_vmerge. Support both for compatibility.
 foreach vti = AllFloatVectors in {
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
+  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
+                       GetVTypePredicates<vti>.Predicates) in
     defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM",
                                  vti.Vector,
                                  vti.Vector, vti.Vector, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
                                  vti.RegClass, vti.RegClass>;
-    defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVMERGE", "VVM",
-                                 vti.Vector,
-                                 vti.Vector, vti.Vector, vti.Mask,
-                                 vti.Log2SEW, vti.LMul, vti.RegClass,
-                                 vti.RegClass, vti.RegClass>;
+  let Predicates = GetVTypePredicates<vti>.Predicates in
     defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE",
                                  "V"#vti.ScalarSuffix#"M",
                                  vti.Vector,
                                  vti.Vector, vti.Scalar, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
                                  vti.RegClass, vti.ScalarRegClass>;
-  }
 }
 
 foreach fvti = AllFloatVectors in {
   defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
   let Predicates = GetVTypePredicates<fvti>.Predicates in
-  def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$merge),
+  def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru),
                                             (fvti.Vector fvti.RegClass:$rs2),
                                             (fvti.Scalar (fpimm0)),
                                             (fvti.Mask V0), VLOpFrag)),
-            (instr fvti.RegClass:$merge, fvti.RegClass:$rs2, 0,
+            (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0,
                    (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 7afd6def4e4d2..8d64788b3cb7d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -41,24 +41,6 @@ multiclass VPatUSLoadStoreSDNode<ValueType type,
             (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>;
 }
 
-multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
-                                        int log2sew,
-                                        LMULInfo vlmul,
-                                        VReg reg_class,
-                                        int sew = !shl(1, log2sew)> {
-  defvar load_instr =
-    !cast<Instruction>("VL"#!substr(vlmul.MX, 1)#"RE"#sew#"_V");
-  defvar store_instr =
-    !cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V");
-
-  // Load
-  def : Pat<(type (load GPR:$rs1)),
-            (load_instr GPR:$rs1)>;
-  // Store
-  def : Pat<(store type:$rs2, GPR:$rs1),
-            (store_instr reg_class:$rs2, GPR:$rs1)>;
-}
-
 multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> {
   defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX);
   defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX);
@@ -895,23 +877,11 @@ multiclass VPatAVGADD_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
 //===----------------------------------------------------------------------===//
 
 // 7.4. Vector Unit-Stride Instructions
-foreach vti = !listconcat(FractionalGroupIntegerVectors,
-                          FractionalGroupFloatVectors,
-                          FractionalGroupBFloatVectors) in
+foreach vti = AllVectors in
   let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
                        GetVTypePredicates<vti>.Predicates) in 
   defm : VPatUSLoadStoreSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
                                vti.AVL, vti.RegClass>;
-foreach vti = [VI8M1, VI16M1, VI32M1, VI64M1, VBF16M1, VF16M1, VF32M1, VF64M1] in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in 
-  defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
-                                      vti.RegClass>;
-foreach vti = !listconcat(GroupIntegerVectors, GroupFloatVectors, GroupBFloatVectors) in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in 
-  defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
-                                      vti.RegClass>;
 foreach mti = AllMasks in
   let Predicates = [HasVInstructions] in
   defm : VPatUSLoadStoreMaskSDNode<mti>;
@@ -1460,7 +1430,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
 foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = [HasVInstructionsBF16] in
+  let Predicates = [HasVInstructionsBF16Minimal] in
   def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
             (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
                 (fvti.Vector (IMPLICIT_DEF)),
@@ -1474,14 +1444,14 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
 //===----------------------------------------------------------------------===//
 // Vector Element Extracts
 //===----------------------------------------------------------------------===//
-foreach vti = AllFloatVectors in {
-  defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
+foreach vti = NoGroupFloatVectors in {
+  defvar vfmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
                                                        vti.ScalarSuffix,
-                                                       "_S_", vti.LMul.MX));
+                                                       "_S"));
   // Only pattern-match extract-element operations where the index is 0. Any
   // other index will have been custom-lowered to slide the vector correctly
   // into place.
   let Predicates = GetVTypePredicates<vti>.Predicates in
   def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
-            (vmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
+            (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index cc294bf9254e8..699536b186969 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -36,7 +36,7 @@ def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
                                                 SDTCisSameNumEltsAs<0, 4>,
                                                 SDTCisVT<5, XLenVT>]>;
 
-// Input: (vector, vector/scalar, merge, mask, roundmode, vl)
+// Input: (vector, vector/scalar, passthru, mask, roundmode, vl)
 def SDT_RISCVVNBinOp_RM_VL : SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisInt<0>,
                                                   SDTCisSameAs<0, 3>,
                                                   SDTCisSameNumEltsAs<0, 1>,
@@ -132,9 +132,6 @@ def riscv_uaddsat_vl   : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [S
 def riscv_ssubsat_vl   : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>;
 def riscv_usubsat_vl   : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>;
 
-def riscv_vnclipu_vl : SDNode<"RISCVISD::VNCLIPU_VL", SDT_RISCVVNBinOp_RM_VL>;
-def riscv_vnclip_vl : SDNode<"RISCVISD::VNCLIP_VL", SDT_RISCVVNBinOp_RM_VL>;
-
 def riscv_fadd_vl  : SDNode<"RISCVISD::FADD_VL",  SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
 def riscv_fsub_vl  : SDNode<"RISCVISD::FSUB_VL",  SDT_RISCVFPBinOp_VL>;
 def riscv_fmul_vl  : SDNode<"RISCVISD::FMUL_VL",  SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
@@ -152,18 +149,18 @@ def riscv_strict_fmul_vl  : SDNode<"RISCVISD::STRICT_FMUL_VL",  SDT_RISCVFPBinOp
 def riscv_strict_fdiv_vl  : SDNode<"RISCVISD::STRICT_FDIV_VL",  SDT_RISCVFPBinOp_VL, [SDNPHasChain]>;
 def riscv_strict_fsqrt_vl : SDNode<"RISCVISD::STRICT_FSQRT_VL", SDT_RISCVFPUnOp_VL, [SDNPHasChain]>;
 
-def any_riscv_fadd_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                        [(riscv_fadd_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                         (riscv_strict_fadd_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fsub_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                        [(riscv_fsub_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                         (riscv_strict_fsub_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fmul_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                        [(riscv_fmul_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                         (riscv_strict_fmul_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fdiv_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                        [(riscv_fdiv_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
-                         (riscv_strict_fdiv_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
+def any_riscv_fadd_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                        [(riscv_fadd_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                         (riscv_strict_fadd_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fsub_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                        [(riscv_fsub_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                         (riscv_strict_fsub_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fmul_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                        [(riscv_fmul_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                         (riscv_strict_fmul_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fdiv_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                        [(riscv_fdiv_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+                         (riscv_strict_fdiv_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
 def any_riscv_fsqrt_vl : PatFrags<(ops node:$src, node:$mask, node:$vl),
                         [(riscv_fsqrt_vl node:$src, node:$mask, node:$vl),
                          (riscv_strict_fsqrt_vl node:$src, node:$mask, node:$vl)]>;
@@ -321,12 +318,12 @@ def any_riscv_vfround_noexcept_vl : PatFrags<(ops node:$src, node:$mask, node:$v
 def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL", SDT_RISCVSETCCOP_VL>;
 def riscv_strict_fsetcc_vl : SDNode<"RISCVISD::STRICT_FSETCC_VL", SDT_RISCVSETCCOP_VL, [SDNPHasChain]>;
 def riscv_strict_fsetccs_vl : SDNode<"RISCVISD::STRICT_FSETCCS_VL", SDT_RISCVSETCCOP_VL, [SDNPHasChain]>;
-def any_riscv_fsetcc_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
-                            [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
-                             (riscv_strict_fsetcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fsetccs_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
-                            [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
-                             (riscv_strict_fsetccs_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl)]>;
+def any_riscv_fsetcc_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+                            [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+                             (riscv_strict_fsetcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fsetccs_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+                            [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+                             (riscv_strict_fsetccs_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl)]>;
 
 def riscv_vrgather_vx_vl : SDNode<"RISCVISD::VRGATHER_VX_VL",
                                   SDTypeProfile<1, 5, [SDTCisVec<0>,
@@ -408,12 +405,17 @@ def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C),
                             [(riscv_sext_vl node:$A, node:$B, node:$C),
                              (riscv_zext_vl node:$A, node:$B, node:$C)]>;
 
+def SDT_RISCVVTRUNCATE_VL : SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                                 SDTCisSameNumEltsAs<0, 1>,
+                                                 SDTCisSameNumEltsAs<0, 2>,
+                                                 SDTCVecEltisVT<2, i1>,
+                                                 SDTCisVT<3, XLenVT>]>;
 def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
-                                   SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                                        SDTCisSameNumEltsAs<0, 1>,
-                                                        SDTCisSameNumEltsAs<0, 2>,
-                                                        SDTCVecEltisVT<2, i1>,
-                                                        SDTCisVT<3, XLenVT>]>>;
+                                   SDT_RISCVVTRUNCATE_VL>;
+def riscv_trunc_vector_vl_ssat : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL_SSAT",
+                                        SDT_RISCVVTRUNCATE_VL>;
+def riscv_trunc_vector_vl_usat : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL_USAT",
+                                        SDT_RISCVVTRUNCATE_VL>;
 
 def SDT_RISCVVWIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>,
                                                   SDTCisInt<1>,
@@ -638,46 +640,18 @@ class VPatBinaryVL_V<SDPatternOperator vop,
     : Pat<(result_type (vop
                        (op1_type op1_reg_class:$rs1),
                        (op2_type op2_reg_class:$rs2),
-                       (result_type result_reg_class:$merge),
+                       (result_type result_reg_class:$passthru),
                        (mask_type V0),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
                        instruction_name#"_"#suffix#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                        instruction_name#"_"#suffix#"_"#vlmul.MX#"_MASK"))
-                   result_reg_class:$merge,
+                   result_reg_class:$passthru,
                    op1_reg_class:$rs1,
                    op2_reg_class:$rs2,
                    (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
-multiclass VPatBinaryRM_VL_V<SDNode vop,
-                             string instruction_name,
-                             string suffix,
-                             ValueType result_type,
-                             ValueType op1_type,
-                             ValueType op2_type,
-                             ValueType mask_type,
-                             int sew,
-                             LMULInfo vlmul,
-                             VReg result_reg_class,
-                             VReg op1_reg_class,
-                             VReg op2_reg_class> {
-  def : Pat<(result_type (vop
-                         (op1_type op1_reg_class:$rs1),
-                         (op2_type op2_reg_class:$rs2),
-                         (result_type result_reg_class:$merge),
-                         (mask_type V0),
-                         (XLenVT timm:$roundmode),
-                         VLOpFrag)),
-        (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK")
-                     result_reg_class:$merge,
-                     op1_reg_class:$rs1,
-                     op2_reg_class:$rs2,
-                     (mask_type V0),
-                     (XLenVT timm:$roundmode),
-                     GPR:$vl, sew, TAIL_AGNOSTIC)>;
-}
-
 class VPatBinaryVL_V_RM<SDPatternOperator vop,
                         string instruction_name,
                         string suffix,
@@ -694,14 +668,14 @@ class VPatBinaryVL_V_RM<SDPatternOperator vop,
     : Pat<(result_type (vop
                        (op1_type op1_reg_class:$rs1),
                        (op2_type op2_reg_class:$rs2),
-                       (result_type result_reg_class:$merge),
+                       (result_type result_reg_class:$passthru),
                        (mask_type V0),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
                        instruction_name#"_"#suffix#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                        instruction_name#"_"#suffix#"_"#vlmul.MX#"_MASK"))
-                   result_reg_class:$merge,
+                   result_reg_class:$passthru,
                    op1_reg_class:$rs1,
                    op2_reg_class:$rs2,
                    (mask_type V0),
@@ -826,47 +800,18 @@ class VPatBinaryVL_XI<SDPatternOperator vop,
     : Pat<(result_type (vop
                    (vop1_type vop_reg_class:$rs1),
                    (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
-                   (result_type result_reg_class:$merge),
+                   (result_type result_reg_class:$passthru),
                    (mask_type V0),
                    VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
                        instruction_name#_#suffix#_#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                        instruction_name#_#suffix#_#vlmul.MX#"_MASK"))
-                   result_reg_class:$merge,
+                   result_reg_class:$passthru,
                    vop_reg_class:$rs1,
                    xop_kind:$rs2,
                    (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
-multiclass VPatBinaryRM_VL_XI<SDNode vop,
-                              string instruction_name,
-                              string suffix,
-                              ValueType result_type,
-                              ValueType vop1_type,
-                              ValueType vop2_type,
-                              ValueType mask_type,
-                              int sew,
-                              LMULInfo vlmul,
-                              VReg result_reg_class,
-                              VReg vop_reg_class,
-                              ComplexPattern SplatPatKind,
-                              DAGOperand xop_kind> {
-  def : Pat<(result_type (vop
-                     (vop1_type vop_reg_class:$rs1),
-                     (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
-                     (result_type result_reg_class:$merge),
-                     (mask_type V0),
-                     (XLenVT timm:$roundmode),
-                     VLOpFrag)),
-        (!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX#"_MASK")
-                     result_reg_class:$merge,
-                     vop_reg_class:$rs1,
-                     xop_kind:$rs2,
-                     (mask_type V0),
-                     (XLenVT timm:$roundmode),
-                     GPR:$vl, sew, TAIL_AGNOSTIC)>;
-}
-
 multiclass VPatBinaryVL_VV_VX<SDPatternOperator vop, string instruction_name,
                               list<VTypeInfo> vtilist = AllIntegerVectors,
                               bit isSEWAware = 0> {
@@ -965,24 +910,6 @@ multiclass VPatBinaryNVL_WV_WX_WI<SDPatternOperator vop, string instruction_name
   }
 }
 
-multiclass VPatBinaryRM_NVL_WV_WX_WI<SDNode vop, string instruction_name> {
-  foreach VtiToWti = AllWidenableIntVectors in {
-    defvar vti = VtiToWti.Vti;
-    defvar wti = VtiToWti.Wti;
-    defm : VPatBinaryRM_VL_V<vop, instruction_name, "WV",
-                             vti.Vector, wti.Vector, vti.Vector, vti.Mask,
-                             vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass, vti.RegClass>;
-    defm : VPatBinaryRM_VL_XI<vop, instruction_name, "WX",
-                              vti.Vector, wti.Vector, vti.Vector, vti.Mask,
-                              vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass, SplatPat, GPR>;
-    defm : VPatBinaryRM_VL_XI<vop, instruction_name, "WI",
-                              vti.Vector, wti.Vector, vti.Vector, vti.Mask,
-                              vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass,
-                              !cast<ComplexPattern>(SplatPat#_#uimm5),
-                              uimm5>;
-  }
-}
-
 class VPatBinaryVL_VF<SDPatternOperator vop,
                       string instruction_name,
                       ValueType result_type,
@@ -997,14 +924,14 @@ class VPatBinaryVL_VF<SDPatternOperator vop,
                       bit isSEWAware = 0>
     : Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
                        (vop2_type (SplatFPOp scalar_reg_class:$rs2)),
-                       (result_type result_reg_class:$merge),
+                       (result_type result_reg_class:$passthru),
                        (mask_type V0),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
                        instruction_name#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                        instruction_name#"_"#vlmul.MX#"_MASK"))
-                   result_reg_class:$merge,
+                   result_reg_class:$passthru,
                    vop_reg_class:$rs1,
                    scalar_reg_class:$rs2,
                    (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
@@ -1023,14 +950,14 @@ class VPatBinaryVL_VF_RM<SDPatternOperator vop,
                       bit isSEWAware = 0>
     : Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
                        (vop2_type (SplatFPOp scalar_reg_class:$rs2)),
-                       (result_type result_reg_class:$merge),
+                       (result_type result_reg_class:$passthru),
                        (mask_type V0),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
                        instruction_name#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
                        instruction_name#"_"#vlmul.MX#"_MASK"))
-                   result_reg_class:$merge,
+                   result_reg_class:$passthru,
                    vop_reg_class:$rs1,
                    scalar_reg_class:$rs2,
                    (mask_type V0),
@@ -1077,14 +1004,14 @@ multiclass VPatBinaryFPVL_R_VF<SDPatternOperator vop, string instruction_name,
     let Predicates = GetVTypePredicates<fvti>.Predicates in
     def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 fvti.RegClass:$rs1,
-                                (fvti.Vector fvti.RegClass:$merge),
+                                (fvti.Vector fvti.RegClass:$passthru),
                                 (fvti.Mask V0),
                                 VLOpFrag)),
               (!cast<Instruction>(
                            !if(isSEWAware,
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK",
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
-                           fvti.RegClass:$merge,
+                           fvti.RegClass:$passthru,
                            fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
                            (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
   }
@@ -1096,14 +1023,14 @@ multiclass VPatBinaryFPVL_R_VF_RM<SDPatternOperator vop, string instruction_name
     let Predicates = GetVTypePredicates<fvti>.Predicates in
     def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 fvti.RegClass:$rs1,
-                                (fvti.Vector fvti.RegClass:$merge),
+                                (fvti.Vector fvti.RegClass:$passthru),
                                 (fvti.Mask V0),
                                 VLOpFrag)),
               (!cast<Instruction>(
                            !if(isSEWAware,
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK",
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
-                           fvti.RegClass:$merge,
+                           fvti.RegClass:$passthru,
                            fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
                            (fvti.Mask V0),
                            // Value to indicate no rounding mode change in
@@ -1117,11 +1044,11 @@ multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name,
                                  CondCode cc> {
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       vti.RegClass:$rs2, cc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
             (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
-                         VR:$merge,
+                         VR:$passthru,
                          vti.RegClass:$rs1,
                          vti.RegClass:$rs2,
                          (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
@@ -1133,11 +1060,11 @@ multiclass VPatIntegerSetCCVL_VV_Swappable<VTypeInfo vti, string instruction_nam
     : VPatIntegerSetCCVL_VV<vti, instruction_name, cc> {
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs2),
                                       vti.RegClass:$rs1, invcc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
             (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
-                         VR:$merge, vti.RegClass:$rs1,
+                         VR:$passthru, vti.RegClass:$rs1,
                          vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
 }
 
@@ -1146,17 +1073,17 @@ multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_nam
   defvar instruction_masked = !cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK");
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (SplatPat (XLenVT GPR:$rs2)), cc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
   def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
 }
 
@@ -1165,20 +1092,20 @@ multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_nam
   defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK");
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (SplatPat_simm5 simm5:$rs2), cc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 XLenVT:$rs2, (vti.Mask V0), GPR:$vl,
                                 vti.Log2SEW)>;
 
   // FIXME: Can do some canonicalization to remove these patterns.
   def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 simm5:$rs2, (vti.Mask V0), GPR:$vl,
                                 vti.Log2SEW)>;
 }
@@ -1190,20 +1117,20 @@ multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti,
   defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK");
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (splatpat_kind simm5:$rs2), cc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
                                 vti.Log2SEW)>;
 
   // FIXME: Can do some canonicalization to remove these patterns.
   def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
-                                      VR:$merge,
+                                      VR:$passthru,
                                       (vti.Mask V0),
                                       VLOpFrag)),
-            (instruction_masked VR:$merge, vti.RegClass:$rs1,
+            (instruction_masked VR:$passthru, vti.RegClass:$rs1,
                                 (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
                                 vti.Log2SEW)>;
 }
@@ -1216,31 +1143,31 @@ multiclass VPatFPSetCCVL_VV_VF_FV<SDPatternOperator vop, CondCode cc,
       def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1),
                                  fvti.RegClass:$rs2,
                                  cc,
-                                 VR:$merge,
+                                 VR:$passthru,
                                  (fvti.Mask V0),
                                  VLOpFrag)),
                 (!cast<Instruction>(inst_name#"_VV_"#fvti.LMul.MX#"_MASK")
-                    VR:$merge, fvti.RegClass:$rs1,
+                    VR:$passthru, fvti.RegClass:$rs1,
                     fvti.RegClass:$rs2, (fvti.Mask V0),
                     GPR:$vl, fvti.Log2SEW)>;
       def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1),
                                 (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 cc,
-                                VR:$merge,
+                                VR:$passthru,
                                 (fvti.Mask V0),
                                 VLOpFrag)),
                 (!cast<Instruction>(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
-                    VR:$merge, fvti.RegClass:$rs1,
+                    VR:$passthru, fvti.RegClass:$rs1,
                     fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
                     GPR:$vl, fvti.Log2SEW)>;
       def : Pat<(fvti.Mask (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 (fvti.Vector fvti.RegClass:$rs1),
                                 cc,
-                                VR:$merge,
+                                VR:$passthru,
                                 (fvti.Mask V0),
                                 VLOpFrag)),
                 (!cast<Instruction>(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
-                    VR:$merge, fvti.RegClass:$rs1,
+                    VR:$passthru, fvti.RegClass:$rs1,
                     fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
                     GPR:$vl, fvti.Log2SEW)>;
     }
@@ -1510,12 +1437,12 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
   foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
     defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
     let Predicates = GetVTypePredicates<vti>.Predicates in {
-      def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
+      def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru),
                                    (vti.Vector vti.RegClass:$rs1), VR:$rs2,
                                    (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
           (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-              (vti_m1.Vector VR:$merge),
+              (vti_m1.Vector VR:$passthru),
               (vti.Vector vti.RegClass:$rs1),
               (vti_m1.Vector VR:$rs2),
               (vti.Mask V0), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
@@ -1527,12 +1454,12 @@ multiclass VPatReductionVL_RM<SDNode vop, string instruction_name, bit is_float>
   foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
     defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
     let Predicates = GetVTypePredicates<vti>.Predicates in {
-      def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
+      def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru),
                                    (vti.Vector vti.RegClass:$rs1), VR:$rs2,
                                    (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
           (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-              (vti_m1.Vector VR:$merge),
+              (vti_m1.Vector VR:$passthru),
               (vti.Vector vti.RegClass:$rs1),
               (vti_m1.Vector VR:$rs2),
               (vti.Mask V0),
@@ -1592,12 +1519,12 @@ multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_n
     defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<wti>.Predicates) in {
-      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
                                    VR:$rs2, (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                  (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                  (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
                   (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
                   (XLenVT timm:$policy))>;
     }
@@ -1611,12 +1538,12 @@ multiclass VPatWidenReductionVL_RM<SDNode vop, PatFrags extop, string instructio
     defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<wti>.Predicates) in {
-      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
                                    VR:$rs2, (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                  (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                  (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
                   (wti_m1.Vector VR:$rs2), (vti.Mask V0),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
@@ -1634,12 +1561,12 @@ multiclass VPatWidenReductionVL_Ext_VL<SDNode vop, PatFrags extop, string instru
     defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<wti>.Predicates) in {
-      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
                                    VR:$rs2, (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                  (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                  (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
                   (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
                   (XLenVT timm:$policy))>;
     }
@@ -1653,12 +1580,12 @@ multiclass VPatWidenReductionVL_Ext_VL_RM<SDNode vop, PatFrags extop, string ins
     defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<wti>.Predicates) in {
-      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+      def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
                                    VR:$rs2, (vti.Mask V0), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                  (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                  (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
                   (wti_m1.Vector VR:$rs2), (vti.Mask V0),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
@@ -2171,15 +2098,15 @@ multiclass VPatAVGADDVL_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector vti.RegClass:$rs2),
-                     vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                     vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
                 (!cast<Instruction>("PseudoVAADD"#suffix#"_VV_"#vti.LMul.MX#"_MASK")
-                  vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
+                  vti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs2,
                   (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
-                     vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                     vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
                 (!cast<Instruction>("PseudoVAADD"#suffix#"_VX_"#vti.LMul.MX#"_MASK")
-                  vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
+                  vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2,
                   (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     }
   }
@@ -2200,15 +2127,15 @@ foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
                             (vti.Vector vti.RegClass:$rs1),
-                            vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                            vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
+                   vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
                             (vti.Vector vti.RegClass:$rs1),
-                            vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                            vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs1, simm5:$rs2,
+                   vti.RegClass:$passthru, vti.RegClass:$rs1, simm5:$rs2,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
@@ -2230,18 +2157,18 @@ foreach vtiToWti = AllWidenableIntVectors in {
                               (vti.Mask V0), VLOpFrag)),
                             (wti.Vector (riscv_vmv_v_x_vl
                               (wti.Vector undef), 1, VLOpFrag)),
-                              wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                              wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK")
-               wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
+               wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1,
                (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse
                               (vti.Vector vti.RegClass:$rs1),
                               (vti.Mask V0), VLOpFrag)),
                             (wti.Vector (riscv_vmv_v_x_vl
                               (wti.Vector undef), 1, VLOpFrag)),
-                              wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                              wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK")
-               wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
+               wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1,
                (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
@@ -2406,28 +2333,28 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
                                            vti.RegClass:$rs1,
                                            vti.RegClass:$rs2,
-                                           vti.RegClass:$merge,
+                                           vti.RegClass:$passthru,
                                            VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
-                  vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                  vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                   (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
 
     def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
                                             (SplatPat XLenVT:$rs1),
                                             vti.RegClass:$rs2,
-                                            vti.RegClass:$merge,
+                                            vti.RegClass:$passthru,
                                             VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
-                  vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                  vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                   (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
 
     def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
                                            (SplatPat_simm5 simm5:$rs1),
                                            vti.RegClass:$rs2,
-                                           vti.RegClass:$merge,
+                                           vti.RegClass:$passthru,
                                            VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
-                  vti.RegClass:$merge, vti.RegClass:$rs2, simm5:$rs1,
+                  vti.RegClass:$passthru, vti.RegClass:$rs2, simm5:$rs1,
                   (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
   }
 }
@@ -2442,14 +2369,16 @@ foreach vti = AllVectors in {
 }
 
 foreach vti = AllIntegerVectors in {
-    def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, GPR:$rs2, VLOpFrag)),
-              (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
-               vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
-    defvar ImmPat = !cast<ComplexPattern>("sew"#vti.SEW#"simm5");
-    def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, (ImmPat simm5:$imm5),
-                                                VLOpFrag)),
-              (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
-               vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW, TU_MU)>;
+  let Predicates = GetVTypePredicates<vti>.Predicates in {
+      def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, GPR:$rs2, VLOpFrag)),
+                (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
+                 vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
+      defvar ImmPat = !cast<ComplexPattern>("sew"#vti.SEW#"simm5");
+      def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, (ImmPat simm5:$imm5),
+                                                  VLOpFrag)),
+                (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
+                 vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW, TU_MU)>;
+    }
   }
 }
 
@@ -2468,8 +2397,26 @@ defm : VPatAVGADDVL_VV_VX_RM<riscv_avgceils_vl, 0b00>;
 defm : VPatAVGADDVL_VV_VX_RM<riscv_avgceilu_vl, 0b00, suffix="U">;
 
 // 12.5. Vector Narrowing Fixed-Point Clip Instructions
-defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclip_vl, "PseudoVNCLIP">;
-defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclipu_vl, "PseudoVNCLIPU">;
+foreach vtiTowti = AllWidenableIntVectors in {
+  defvar vti = vtiTowti.Vti;
+  defvar wti = vtiTowti.Wti;
+  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+                               GetVTypePredicates<wti>.Predicates) in {
+    // Rounding mode here is arbitrary since we aren't shifting out any bits.
+    def : Pat<(vti.Vector (riscv_trunc_vector_vl_ssat (wti.Vector wti.RegClass:$rs1),
+                                                      (vti.Mask V0),
+                                                      VLOpFrag)),
+              (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
+                  (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+                  (vti.Mask V0), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+    def : Pat<(vti.Vector (riscv_trunc_vector_vl_usat (wti.Vector wti.RegClass:$rs1),
+                                                      (vti.Mask V0),
+                                                      VLOpFrag)),
+              (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
+                  (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+                  (vti.Mask V0), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+  }
+}
 
 // 13. Vector Floating-Point Instructions
 
@@ -2560,11 +2507,11 @@ foreach vti = AllFloatVectors in {
 
     def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
                                   (vti.Vector vti.RegClass:$rs2),
-                                  vti.RegClass:$merge,
+                                  vti.RegClass:$passthru,
                                   (vti.Mask V0),
                                   VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs1,
                    vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
                    TAIL_AGNOSTIC)>;
 
@@ -2581,11 +2528,11 @@ foreach vti = AllFloatVectors in {
 
     def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
                                   (SplatFPOp vti.ScalarRegClass:$rs2),
-                                  vti.RegClass:$merge,
+                                  vti.RegClass:$passthru,
                                   (vti.Mask V0),
                                   VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs1,
                    vti.ScalarRegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
                    TAIL_AGNOSTIC)>;
 
@@ -2614,29 +2561,29 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
   def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
                                           fvti.RegClass:$rs1,
                                           fvti.RegClass:$rs2,
-                                          fvti.RegClass:$merge,
+                                          fvti.RegClass:$passthru,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+                 fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
                  GPR:$vl, fvti.Log2SEW)>;
 
   def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
                                           (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
                                           fvti.RegClass:$rs2,
-                                          fvti.RegClass:$merge,
+                                          fvti.RegClass:$passthru,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
+                 fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
                  GPR:$vl, fvti.Log2SEW)>;
 
 
   def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
                                           (SplatFPOp (fvti.Scalar fpimm0)),
                                           fvti.RegClass:$rs2,
-                                          fvti.RegClass:$merge,
+                                          fvti.RegClass:$passthru,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
+                 fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
                  GPR:$vl, fvti.Log2SEW)>;
   }
 }
@@ -2646,10 +2593,10 @@ foreach fvti = AllFloatVectors in {
     def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
                                             (SplatFPOp fvti.ScalarRegClass:$rs1),
                                             fvti.RegClass:$rs2,
-                                            fvti.RegClass:$merge,
+                                            fvti.RegClass:$passthru,
                                             VLOpFrag)),
               (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
-                   fvti.RegClass:$merge, fvti.RegClass:$rs2,
+                   fvti.RegClass:$passthru, fvti.RegClass:$rs2,
                    (fvti.Scalar fvti.ScalarRegClass:$rs1),
                    (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
   }
@@ -2728,7 +2675,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
 foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = [HasVInstructionsBF16] in
+  let Predicates = [HasVInstructionsBF16Minimal] in
   def : Pat<(fwti.Vector (any_riscv_fpextend_vl
                              (fvti.Vector fvti.RegClass:$rs1),
                              (fvti.Mask V0),
@@ -2786,7 +2733,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
 foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = [HasVInstructionsBF16] in
+  let Predicates = [HasVInstructionsBF16Minimal] in
     def : Pat<(fvti.Vector (any_riscv_fpround_vl
                                (fwti.Vector fwti.RegClass:$rs1),
                                (fwti.Mask V0), VLOpFrag)),
@@ -2921,10 +2868,10 @@ foreach mti = AllMasks in {
 // 16.1. Integer Scalar Move Instructions
 foreach vti = NoGroupIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$passthru),
                                             vti.ScalarRegClass:$rs1,
                                             VLOpFrag)),
-              (PseudoVMV_S_X $merge, vti.ScalarRegClass:$rs1, GPR:$vl,
+              (PseudoVMV_S_X $passthru, vti.ScalarRegClass:$rs1, GPR:$vl,
                              vti.Log2SEW)>;
   }
 }
@@ -2934,26 +2881,26 @@ foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
                                                 vti.RegClass:$rs1,
-                                                vti.RegClass:$merge,
+                                                vti.RegClass:$passthru,
                                                 (vti.Mask V0),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
-                                                vti.RegClass:$merge,
+                                                vti.RegClass:$passthru,
                                                 (vti.Mask V0),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2,
                                                 uimm5:$imm,
-                                                vti.RegClass:$merge,
+                                                vti.RegClass:$passthru,
                                                 (vti.Mask V0),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 
@@ -2969,11 +2916,11 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector
                (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
                                          (ivti.Vector ivti.RegClass:$rs1),
-                                         vti.RegClass:$merge,
+                                         vti.RegClass:$passthru,
                                          (vti.Mask V0),
                                          VLOpFrag)),
               (!cast<Instruction>(inst#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
@@ -2981,52 +2928,50 @@ foreach vti = AllIntegerVectors in {
 // 16.2. Floating-Point Scalar Move Instructions
 foreach vti = NoGroupFloatVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
                                              (vti.Scalar (fpimm0)),
                                              VLOpFrag)),
-              (PseudoVMV_S_X $merge, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
-    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+              (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
+    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
                                              (vti.Scalar (SelectFPImm (XLenVT GPR:$imm))),
                                              VLOpFrag)),
-              (PseudoVMV_S_X $merge, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
-  }
-}
-
-foreach vti = AllFloatVectors in {
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+              (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
+    def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
                                              vti.ScalarRegClass:$rs1,
                                              VLOpFrag)),
-              (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_"#vti.LMul.MX)
-                  vti.RegClass:$merge,
+              (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix)
+                  vti.RegClass:$passthru,
                   (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
   }
+}
+
+foreach vti = AllFloatVectors in {
   defvar ivti = GetIntVTypeInfo<vti>.Vti;
   let Predicates = GetVTypePredicates<ivti>.Predicates in {
     def : Pat<(vti.Vector
                (riscv_vrgather_vv_vl vti.RegClass:$rs2,
                                      (ivti.Vector vti.RegClass:$rs1),
-                                     vti.RegClass:$merge,
+                                     vti.RegClass:$passthru,
                                      (vti.Mask V0),
                                      VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
-                                                vti.RegClass:$merge,
+                                                vti.RegClass:$passthru,
                                                 (vti.Mask V0),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector
                (riscv_vrgather_vx_vl vti.RegClass:$rs2,
                                      uimm5:$imm,
-                                     vti.RegClass:$merge,
+                                     vti.RegClass:$passthru,
                                      (vti.Mask V0),
                                      VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 
@@ -3042,11 +2987,11 @@ foreach vti = AllFloatVectors in {
     def : Pat<(vti.Vector
                (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
                                          (ivti.Vector ivti.RegClass:$rs1),
-                                         vti.RegClass:$merge,
+                                         vti.RegClass:$passthru,
                                          (vti.Mask V0),
                                          VLOpFrag)),
               (!cast<Instruction>(inst#"_MASK")
-                   vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+                   vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1,
                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
index d0a798ef475c4..b1a7a18a3bf85 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
@@ -14,7 +14,7 @@
 // XVentanaCondOps
 //===----------------------------------------------------------------------===//
 
-let Predicates = [IsRV64, HasVendorXVentanaCondOps], hasSideEffects = 0,
+let Predicates = [HasVendorXVentanaCondOps], hasSideEffects = 0,
     mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, DecoderNamespace = "XVentana" in
 class VTMaskedMove<bits<3> funct3, string opcodestr>
     : RVInstR<0b0000000, funct3, OPC_CUSTOM_3, (outs GPR:$rd),
@@ -28,18 +28,18 @@ def VT_MASKC : VTMaskedMove<0b110, "vt.maskc">,
 def VT_MASKCN : VTMaskedMove<0b111, "vt.maskcn">,
            Sched<[WriteIALU, ReadIALU, ReadIALU]>;
 
-let Predicates = [IsRV64, HasVendorXVentanaCondOps] in {
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, GPR:$rc)),
+let Predicates = [HasVendorXVentanaCondOps] in {
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, GPR:$rc)),
           (VT_MASKC GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, GPR:$rc)),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, GPR:$rc)),
           (VT_MASKCN GPR:$rs1, GPR:$rc)>;
 
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, (riscv_setne (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, (riscv_setne (XLenVT GPR:$rc)))),
           (VT_MASKC GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, (riscv_seteq (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, (riscv_seteq (XLenVT GPR:$rc)))),
           (VT_MASKCN GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, (riscv_setne (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, (riscv_setne (XLenVT GPR:$rc)))),
           (VT_MASKCN GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, (riscv_seteq (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, (riscv_seteq (XLenVT GPR:$rc)))),
           (VT_MASKC GPR:$rs1, GPR:$rc)>;
-} // Predicates = [IsRV64, HasVendorXVentanaCondOps]
+} // Predicates = [HasVendorXVentanaCondOps]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index e0f1c71548344..85715ca9145c3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -272,6 +272,7 @@ def : Pat<(f16 (fabs FPR16:$rs1)), (FSGNJX_H $rs1, $rs1)>;
 def : Pat<(riscv_fclass (f16 FPR16:$rs1)), (FCLASS_H $rs1)>;
 
 def : PatFprFpr<fcopysign, FSGNJ_H, FPR16, f16>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_H, FPR16, f16>;
 def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), (FSGNJN_H $rs1, $rs2)>;
 def : Pat<(f16 (fcopysign FPR16:$rs1, FPR32:$rs2)),
           (FSGNJ_H $rs1, (FCVT_H_S $rs2, FRM_DYN))>;
@@ -314,6 +315,7 @@ def : Pat<(fabs FPR16INX:$rs1), (FSGNJX_H_INX $rs1, $rs1)>;
 def : Pat<(riscv_fclass FPR16INX:$rs1), (FCLASS_H_INX $rs1)>;
 
 def : PatFprFpr<fcopysign, FSGNJ_H_INX, FPR16INX, f16>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_H_INX, FPR16INX, f16>;
 def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), (FSGNJN_H_INX $rs1, $rs2)>;
 def : Pat<(fcopysign FPR16INX:$rs1, FPR32INX:$rs2),
           (FSGNJ_H_INX $rs1, (FCVT_H_S_INX $rs2, FRM_DYN))>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 75fcc1e7cb110..cd03ac257e321 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -349,7 +349,7 @@ multiclass VPseudoVAESKF1 {
     defvar mx = m.MX;
     defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
                SchedBinary<"WriteVAESKF1V", "ReadVAESKF1V", "ReadVAESKF1V", mx,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -384,7 +384,7 @@ multiclass VPseudoVSM4K {
     defvar mx = m.MX;
     defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
                SchedBinary<"WriteVSM4KV", "ReadVSM4KV", "ReadVSM4KV", mx,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -393,7 +393,7 @@ multiclass VPseudoVSM3ME {
     defvar mx = m.MX;
     defm _VV : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
                SchedBinary<"WriteVSM3MEV", "ReadVSM3MEV", "ReadVSM3MEV", mx,
-                           forceMergeOpRead=true>;
+                           forcePassthruRead=true>;
   }
 }
 
@@ -402,10 +402,10 @@ multiclass VPseudoVCLMUL_VV_VX {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m>,
               SchedBinary<"WriteVCLMULV", "ReadVCLMULV", "ReadVCLMULV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVCLMULX", "ReadVCLMULV", "ReadVCLMULX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -422,7 +422,7 @@ multiclass VPseudoVBREV {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVBREVV", "ReadVBREVV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVBREVV", "ReadVBREVV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -430,7 +430,7 @@ multiclass VPseudoVCLZ {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVCLZV", "ReadVCLZV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVCLZV", "ReadVCLZV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -438,7 +438,7 @@ multiclass VPseudoVCTZ {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVCTZV", "ReadVCTZV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVCTZV", "ReadVCTZV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -446,7 +446,7 @@ multiclass VPseudoVCPOP {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVCPOPV", "ReadVCPOPV", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVCPOPV", "ReadVCPOPV", mx, forcePassthruRead=true>;
   }
 }
 
@@ -455,13 +455,13 @@ multiclass VPseudoVWSLL {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryW_VV<m>,
               SchedBinary<"WriteVWSLLV", "ReadVWSLLV", "ReadVWSLLV", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryW_VX<m>,
               SchedBinary<"WriteVWSLLX", "ReadVWSLLV", "ReadVWSLLX", mx,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryW_VI<uimm5, m>,
               SchedUnary<"WriteVWSLLI", "ReadVWSLLV", mx,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -469,10 +469,10 @@ multiclass VPseudoVANDN {
  foreach m = MxList in {
     defm "" : VPseudoBinaryV_VV<m>,
               SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -480,7 +480,7 @@ multiclass VPseudoVBREV8 {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVBREV8V", "ReadVBREV8V", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVBREV8V", "ReadVBREV8V", mx, forcePassthruRead=true>;
   }
 }
 
@@ -488,7 +488,7 @@ multiclass VPseudoVREV8 {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVREV8V", "ReadVREV8V", mx, forceMergeOpRead=true>;
+              SchedUnary<"WriteVREV8V", "ReadVREV8V", mx, forcePassthruRead=true>;
   }
 }
 
@@ -496,10 +496,10 @@ multiclass VPseudoVROT_VV_VX {
  foreach m = MxList in {
     defm "" : VPseudoBinaryV_VV<m>,
               SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", m.MX,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
               SchedBinary<"WriteVRotX", "ReadVRotV", "ReadVRotX", m.MX,
-                          forceMergeOpRead=true>;
+                          forcePassthruRead=true>;
   }
 }
 
@@ -508,7 +508,7 @@ multiclass VPseudoVROT_VV_VX_VI
   foreach m = MxList in {
     defm "" : VPseudoBinaryV_VI<uimm6, m>,
               SchedUnary<"WriteVRotI", "ReadVRotV", m.MX,
-                         forceMergeOpRead=true>;
+                         forcePassthruRead=true>;
   }
 }
 
@@ -691,11 +691,11 @@ multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name,
     let Predicates = !listconcat([predicate],
                                  GetVTypePredicates<vti>.Predicates) in {
       def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1),
-                                (vti.Vector vti.RegClass:$merge),
+                                (vti.Vector vti.RegClass:$passthru),
                                 (vti.Mask V0),
                                 VLOpFrag)),
                 (!cast<Instruction>(instruction_name#"_V_"#vti.LMul.MX#"_MASK")
-                   vti.RegClass:$merge,
+                   vti.RegClass:$passthru,
                    vti.RegClass:$rs1,
                    (vti.Mask V0),
                    GPR:$vl,
@@ -711,15 +711,15 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (riscv_and_vl (riscv_xor_vl
                                            (vti.Vector vti.RegClass:$rs1),
                                            (riscv_splat_vector -1),
-                                           (vti.Vector vti.RegClass:$merge),
+                                           (vti.Vector vti.RegClass:$passthru),
                                            (vti.Mask V0),
                                            VLOpFrag),
                                         (vti.Vector vti.RegClass:$rs2),
-                                        (vti.Vector vti.RegClass:$merge),
+                                        (vti.Vector vti.RegClass:$passthru),
                                         (vti.Mask V0),
                                         VLOpFrag)),
               (!cast<Instruction>("PseudoVANDN_VV_"#vti.LMul.MX#"_MASK")
-                 vti.RegClass:$merge,
+                 vti.RegClass:$passthru,
                  vti.RegClass:$rs2,
                  vti.RegClass:$rs1,
                  (vti.Mask V0),
@@ -730,11 +730,11 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (riscv_and_vl (riscv_splat_vector
                                            (not vti.ScalarRegClass:$rs1)),
                                         (vti.Vector vti.RegClass:$rs2),
-                                        (vti.Vector vti.RegClass:$merge),
+                                        (vti.Vector vti.RegClass:$passthru),
                                         (vti.Mask V0),
                                         VLOpFrag)),
               (!cast<Instruction>("PseudoVANDN_VX_"#vti.LMul.MX#"_MASK")
-                 vti.RegClass:$merge,
+                 vti.RegClass:$passthru,
                  vti.RegClass:$rs2,
                  vti.ScalarRegClass:$rs1,
                  (vti.Mask V0),
@@ -758,10 +758,10 @@ foreach vti = AllIntegerVectors in {
                                GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(riscv_rotl_vl vti.RegClass:$rs2,
                              (vti.Vector (SplatPat_uimm6 uimm6:$rs1)),
-                             (vti.Vector vti.RegClass:$merge),
+                             (vti.Vector vti.RegClass:$passthru),
                              (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVROR_VI_"#vti.LMul.MX#"_MASK")
-                 vti.RegClass:$merge,
+                 vti.RegClass:$passthru,
                  vti.RegClass:$rs2,
                  (!cast<SDNodeXForm>("InvRot" # vti.SEW # "Imm") uimm6:$rs1),
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
@@ -778,10 +778,10 @@ foreach vtiToWti = AllWidenableIntVectors in {
     def : Pat<(riscv_shl_vl
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (ext_oneuse (vti.Vector vti.RegClass:$rs1))),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
@@ -791,19 +791,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
                  (wti.Vector (riscv_ext_vl_oneuse
                                 (vti.Vector vti.RegClass:$rs1),
                                 (vti.Mask V0), VLOpFrag)),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
@@ -811,19 +811,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
                                 (vti.Vector vti.RegClass:$rs2),
                                 (vti.Mask V0), VLOpFrag)),
                  (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
@@ -831,37 +831,37 @@ foreach vtiToWti = AllWidenableIntVectors in {
                                 (vti.Vector vti.RegClass:$rs2),
                                 (vti.Mask V0), VLOpFrag)),
                  (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector vti.RegClass:$rs1),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector (SplatPat_uimm5 uimm5:$rs1)),
-                 (wti.Vector wti.RegClass:$merge),
+                 (wti.Vector wti.RegClass:$passthru),
                  (vti.Mask V0), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
-                 wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+                 wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
@@ -989,11 +989,11 @@ multiclass VPatBinaryV_VI_VROL<string intrinsic, string instruction,
         !if(isSEWAware, instruction#"_VI_"#vti.LMul.MX#"_E"#vti.SEW,
                         instruction#"_VI_"#vti.LMul.MX));
     let Predicates = GetVTypePredicates<vti>.Predicates in
-    def : Pat<(vti.Vector (Intr (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (Intr (vti.Vector vti.RegClass:$passthru),
                           (vti.Vector vti.RegClass:$rs2),
                           (XLenVT uimm6:$rs1),
                           VLOpFrag)),
-                          (Pseudo (vti.Vector vti.RegClass:$merge),
+                          (Pseudo (vti.Vector vti.RegClass:$passthru),
                           (vti.Vector vti.RegClass:$rs2),
                           (InvRot64Imm uimm6:$rs1),
                           GPR:$vl, vti.Log2SEW, TU_MU)>;
@@ -1003,12 +1003,12 @@ multiclass VPatBinaryV_VI_VROL<string intrinsic, string instruction,
         !if(isSEWAware, instruction#"_VI_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK",
                         instruction#"_VI_"#vti.LMul.MX#"_MASK"));
     let Predicates = GetVTypePredicates<vti>.Predicates in
-    def : Pat<(vti.Vector (IntrMask (vti.Vector vti.RegClass:$merge),
+    def : Pat<(vti.Vector (IntrMask (vti.Vector vti.RegClass:$passthru),
                           (vti.Vector vti.RegClass:$rs2),
                           (XLenVT uimm6:$rs1),
                           (vti.Mask V0),
                           VLOpFrag, (XLenVT timm:$policy))),
-                          (PseudoMask (vti.Vector vti.RegClass:$merge),
+                          (PseudoMask (vti.Vector vti.RegClass:$passthru),
                           (vti.Vector vti.RegClass:$rs2),
                           (InvRot64Imm uimm6:$rs1),
                           (vti.Mask V0),
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index fecc83a821f42..b6ac3384e7d3e 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -429,8 +429,16 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
         NumOps = Flags.getNumOperandRegisters();
 
         // Memory constraints have two operands.
-        if (NumOps != 2 || !Flags.isMemKind())
+        if (NumOps != 2 || !Flags.isMemKind()) {
+          // If the register is used by something other than a memory contraint,
+          // we should not fold.
+          for (unsigned J = 0; J < NumOps; ++J) {
+            const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
+            if (MO.isReg() && MO.getReg() == DestReg)
+              return false;
+          }
           continue;
+        }
 
         // We can't do this for constraint A because AMO instructions don't have
         // an immediate offset field.
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6eed2ae01f646..25b24980e0bf6 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -21,6 +21,9 @@ class RISCVTuneInfo {
   bits<32> MaxPrefetchIterationsAhead = -1;
 
   bits<32> MinimumJumpTableEntries = 5;
+
+  // Tail duplication threshold at -O3.
+  bits<32> TailDupAggressiveThreshold = 6;
 }
 
 def RISCVTuneInfoTable : GenericTable {
@@ -29,7 +32,7 @@ def RISCVTuneInfoTable : GenericTable {
   let Fields = ["Name", "PrefFunctionAlignment", "PrefLoopAlignment",
                 "CacheLineSize", "PrefetchDistance",
                 "MinPrefetchStride", "MaxPrefetchIterationsAhead",
-                "MinimumJumpTableEntries"];
+                "MinimumJumpTableEntries", "TailDupAggressiveThreshold"];
 }
 
 def getRISCVTuneInfo : SearchIndex {
@@ -83,9 +86,11 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
 def ROCKET : RISCVTuneProcessorModel<"rocket",
                                      RocketModel>;
 
+defvar SiFive7TuneFeatures = [TuneSiFive7, TuneNoDefaultUnroll,
+                              TuneShortForwardBranchOpt,
+                              FeaturePostRAScheduler];
 def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
-                                       SiFive7Model,
-                                       [TuneSiFive7, FeaturePostRAScheduler]>;
+                                       SiFive7Model, SiFive7TuneFeatures>;
 
 def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
                                      RocketModel,
@@ -145,7 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
                                       FeatureStdExtA,
                                       FeatureStdExtF,
                                       FeatureStdExtC],
-                                     [TuneSiFive7, FeaturePostRAScheduler]>;
+                                     SiFive7TuneFeatures>;
 
 def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
                                      RocketModel,
@@ -189,7 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
                                       FeatureStdExtD,
                                       FeatureStdExtC,
                                       FeatureStdExtZihintpause],
-                                     [TuneSiFive7, FeaturePostRAScheduler]>;
+                                     SiFive7TuneFeatures>;
 
 def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
                                      RocketModel,
@@ -212,8 +217,11 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC],
-                                     [TuneSiFive7, FeaturePostRAScheduler]>;
+                                     SiFive7TuneFeatures>;
 
+defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
+                                            [TuneDLenFactor2,
+                                             TuneOptimizedZeroStrideLoad]);
 def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
                                       [Feature64Bit,
                                        FeatureStdExtI,
@@ -229,10 +237,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
                                        FeatureStdExtZvfh,
                                        FeatureStdExtZba,
                                        FeatureStdExtZbb],
-                                      [TuneSiFive7,
-                                       FeaturePostRAScheduler,
-                                       TuneDLenFactor2,
-                                       TuneOptimizedZeroStrideLoad]>;
+                                      SiFiveX280TuneFeatures>;
 
 def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                       [Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index b2991145ee65c..0b0ac0c368d07 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -1287,11 +1287,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
 
 // Others
 def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
 foreach mx = SchedMxList in {
-  def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
   foreach sew = SchedSEWSet<mx>.val in
-    def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx  # "_E" # sew), 0>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index ba062f33b929a..59972d781a315 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -1086,11 +1086,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
 
 // Others
 def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
 foreach mx = SchedMxList in {
-  def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
   foreach sew = SchedSEWSet<mx>.val in
-    def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx  # "_E" # sew), 0>;
 }
 
 // Vector Crypto Extensions
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 449611c583036..95fde1e53c805 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -766,11 +766,11 @@ def ReadVMov8V        : SchedRead;
 
 // Others
 def ReadVMask         : SchedRead;
-def ReadVMergeOp_WorstCase : SchedRead;
+def ReadVPassthru_WorstCase : SchedRead;
 foreach mx = SchedMxList in {
-  def ReadVMergeOp_ # mx : SchedRead;
+  def ReadVPassthru_ # mx : SchedRead;
   foreach sew = SchedSEWSet<mx>.val in
-    def ReadVMergeOp_ # mx  # "_E" # sew : SchedRead;
+    def ReadVPassthru_ # mx  # "_E" # sew : SchedRead;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1139,11 +1139,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
 
 // Others
 def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
 foreach mx = SchedMxList in {
-  def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
   foreach sew = SchedSEWSet<mx>.val in
-    def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx  # "_E" # sew), 0>;
 }
 
 } // Unsupported
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 377d080ad4bfc..ea54ff1df0b7c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -50,6 +50,9 @@ struct RISCVTuneInfo {
   unsigned MaxPrefetchIterationsAhead;
 
   unsigned MinimumJumpTableEntries;
+
+  // Tail duplication threshold at -O3.
+  unsigned TailDupAggressiveThreshold;
 };
 
 #define GET_RISCVTuneInfoTable_DECL
@@ -225,7 +228,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   bool hasVInstructionsI64() const { return HasStdExtZve64x; }
   bool hasVInstructionsF16Minimal() const { return HasStdExtZvfhmin; }
   bool hasVInstructionsF16() const { return HasStdExtZvfh; }
-  bool hasVInstructionsBF16() const { return HasStdExtZvfbfmin; }
+  bool hasVInstructionsBF16Minimal() const { return HasStdExtZvfbfmin; }
   bool hasVInstructionsF32() const { return HasStdExtZve32f; }
   bool hasVInstructionsF64() const { return HasStdExtZve64d; }
   // F16 and F64 both require F32.
@@ -300,6 +303,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
 
   unsigned getMinimumJumpTableEntries() const;
 
+  unsigned getTailDupAggressiveThreshold() const {
+    return TuneInfo->TailDupAggressiveThreshold;
+  }
+
   bool supportsInitUndef() const override { return hasVInstructions(); }
 };
 } // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index bc3699db6f91e..21fbf47875e68 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -279,8 +279,10 @@ class RVVRegisterRegAlloc : public RegisterRegAllocBase<RVVRegisterRegAlloc> {
 };
 
 static bool onlyAllocateRVVReg(const TargetRegisterInfo &TRI,
-                               const TargetRegisterClass &RC) {
-  return RISCVRegisterInfo::isRVVRegClass(&RC);
+                               const MachineRegisterInfo &MRI,
+                               const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return RISCVRegisterInfo::isRVVRegClass(RC);
 }
 
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index f9eef60f77b7a..4cd904c039a98 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -280,7 +280,7 @@ bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
-  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
+  return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
              ? TTI::PSK_FastHardware
              : TTI::PSK_Software;
 }
@@ -1100,30 +1100,33 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
+    // For fp vector to mask, we use:
+    // vfncvt.rtz.x.f.w v9, v8
+    // vand.vi v8, v9, 1
+    // vmsne.vi v0, v8, 0
+    if (Dst->getScalarSizeInBits() == 1)
+      return 3;
+
+    if (std::abs(PowDiff) <= 1)
+      return 1;
+
+    // Counts of narrow/widen instructions.
+    return std::abs(PowDiff);
+
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
-    if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
-      // The cost of convert from or to mask vector is different from other
-      // cases. We could not use PowDiff to calculate it.
-      // For mask vector to fp, we should use the following instructions:
-      // vmv.v.i v8, 0
-      // vmerge.vim v8, v8, -1, v0
-      // vfcvt.f.x.v v8, v8
-
-      // And for fp vector to mask, we use:
-      // vfncvt.rtz.x.f.w v9, v8
-      // vand.vi v8, v9, 1
-      // vmsne.vi v0, v8, 0
+    // For mask vector to fp, we should use the following instructions:
+    // vmv.v.i v8, 0
+    // vmerge.vim v8, v8, -1, v0
+    // vfcvt.f.x.v v8, v8
+    if (Src->getScalarSizeInBits() == 1)
       return 3;
-    }
+
     if (std::abs(PowDiff) <= 1)
       return 1;
     // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
     // so it only need two conversion.
-    if (Src->isIntOrIntVectorTy())
-      return 2;
-    // Counts of narrow/widen instructions.
-    return std::abs(PowDiff);
+    return 2;
   }
   return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
 }
@@ -1390,14 +1393,32 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   InstructionCost Cost = 0;
   if (Opcode == Instruction::Store && OpInfo.isConstant())
     Cost += getStoreImmCost(Src, OpInfo, CostKind);
-  InstructionCost BaseCost =
-    BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                           CostKind, OpInfo, I);
+
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
+
+  InstructionCost BaseCost = [&]() {
+    InstructionCost Cost = LT.first;
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost;
+
+    // Our actual lowering for the case where a wider legal type is available
+    // uses the a VL predicated load on the wider type.  This is reflected in
+    // the result of getTypeLegalizationCost, but BasicTTI assumes the
+    // widened cases are scalarized.
+    const DataLayout &DL = this->getDataLayout();
+    if (Src->isVectorTy() && LT.second.isVector() &&
+        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
+                            LT.second.getSizeInBits()))
+        return Cost;
+
+    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                  CostKind, OpInfo, I);
+  }();
+
   // Assume memory ops cost scale with the number of vector registers
   // possible accessed by the instruction.  Note that BasicTTI already
   // handles the LT.first term for us.
-  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
-      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
+  if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
     BaseCost *= TLI->getLMULCost(LT.second);
   return Cost + BaseCost;
 
@@ -1688,7 +1709,6 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                          Args, CxtI);
 
-
   auto getConstantMatCost =
     [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
     if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
@@ -1760,8 +1780,14 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
                                                            Op1Info, Op2Info,
                                                            Args, CxtI);
   }
-  return ConstantMatCost +
-         LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+
+  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
+  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
+  // ops are twice as expensive as integer ops. Do the same for vectors so
+  // scalar floating point ops aren't cheaper than their vector equivalents.
+  if (Ty->isFPOrFPVectorTy())
+    InstrCost *= 2;
+  return ConstantMatCost + LT.first * InstrCost;
 }
 
 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 3d8fb6b7bbbef..979677ee92332 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -47,6 +47,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   MachineRegisterInfo *MRI;
   const TargetRegisterInfo *TRI;
+  const RISCVSubtarget *ST;
   RISCVVectorPeephole() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -55,14 +56,18 @@ class RISCVVectorPeephole : public MachineFunctionPass {
         MachineFunctionProperties::Property::IsSSA);
   }
 
-  StringRef getPassName() const override { return "RISC-V Fold Masks"; }
+  StringRef getPassName() const override {
+    return "RISC-V Vector Peephole Optimization";
+  }
 
 private:
   bool convertToVLMAX(MachineInstr &MI) const;
+  bool convertToWholeRegister(MachineInstr &MI) const;
   bool convertToUnmasked(MachineInstr &MI) const;
   bool convertVMergeToVMv(MachineInstr &MI) const;
 
   bool isAllOnesMask(const MachineInstr *MaskDef) const;
+  std::optional<unsigned> getConstant(const MachineOperand &VL) const;
 
   /// Maps uses of V0 to the corresponding def of V0.
   DenseMap<const MachineInstr *, const MachineInstr *> V0Defs;
@@ -75,13 +80,44 @@ char RISCVVectorPeephole::ID = 0;
 INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
                 false)
 
-// If an AVL is a VLENB that's possibly scaled to be equal to VLMAX, convert it
-// to the VLMAX sentinel value.
+/// Check if an operand is an immediate or a materialized ADDI $x0, imm.
+std::optional<unsigned>
+RISCVVectorPeephole::getConstant(const MachineOperand &VL) const {
+  if (VL.isImm())
+    return VL.getImm();
+
+  MachineInstr *Def = MRI->getVRegDef(VL.getReg());
+  if (!Def || Def->getOpcode() != RISCV::ADDI ||
+      Def->getOperand(1).getReg() != RISCV::X0)
+    return std::nullopt;
+  return Def->getOperand(2).getImm();
+}
+
+/// Convert AVLs that are known to be VLMAX to the VLMAX sentinel.
 bool RISCVVectorPeephole::convertToVLMAX(MachineInstr &MI) const {
   if (!RISCVII::hasVLOp(MI.getDesc().TSFlags) ||
       !RISCVII::hasSEWOp(MI.getDesc().TSFlags))
     return false;
+
+  auto LMUL = RISCVVType::decodeVLMUL(RISCVII::getLMul(MI.getDesc().TSFlags));
+  // Fixed-point value, denominator=8
+  unsigned LMULFixed = LMUL.second ? (8 / LMUL.first) : 8 * LMUL.first;
+  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+  // A Log2SEW of 0 is an operation on mask registers only
+  unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
+  assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+  assert(8 * LMULFixed / SEW > 0);
+
+  // If the exact VLEN is known then we know VLMAX, check if the AVL == VLMAX.
   MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (auto VLen = ST->getRealVLen(), AVL = getConstant(VL);
+      VLen && AVL && (*VLen * LMULFixed) / SEW == *AVL * 8) {
+    VL.ChangeToImmediate(RISCV::VLMaxSentinel);
+    return true;
+  }
+
+  // If an AVL is a VLENB that's possibly scaled to be equal to VLMAX, convert
+  // it to the VLMAX sentinel value.
   if (!VL.isReg())
     return false;
   MachineInstr *Def = MRI->getVRegDef(VL.getReg());
@@ -104,15 +140,6 @@ bool RISCVVectorPeephole::convertToVLMAX(MachineInstr &MI) const {
   if (!Def || Def->getOpcode() != RISCV::PseudoReadVLENB)
     return false;
 
-  auto LMUL = RISCVVType::decodeVLMUL(RISCVII::getLMul(MI.getDesc().TSFlags));
-  // Fixed-point value, denominator=8
-  unsigned LMULFixed = LMUL.second ? (8 / LMUL.first) : 8 * LMUL.first;
-  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
-  // A Log2SEW of 0 is an operation on mask registers only
-  unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
-  assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
-  assert(8 * LMULFixed / SEW > 0);
-
   // AVL = (VLENB * Scale)
   //
   // VLMAX = (VLENB * 8 * LMUL) / SEW
@@ -155,6 +182,58 @@ bool RISCVVectorPeephole::isAllOnesMask(const MachineInstr *MaskDef) const {
   }
 }
 
+/// Convert unit strided unmasked loads and stores to whole-register equivalents
+/// to avoid the dependency on $vl and $vtype.
+///
+/// %x = PseudoVLE8_V_M1 %passthru, %ptr, %vlmax, policy
+/// PseudoVSE8_V_M1 %v, %ptr, %vlmax
+///
+/// ->
+///
+/// %x = VL1RE8_V %ptr
+/// VS1R_V %v, %ptr
+bool RISCVVectorPeephole::convertToWholeRegister(MachineInstr &MI) const {
+#define CASE_WHOLE_REGISTER_LMUL_SEW(lmul, sew)                                \
+  case RISCV::PseudoVLE##sew##_V_M##lmul:                                      \
+    NewOpc = RISCV::VL##lmul##RE##sew##_V;                                     \
+    break;                                                                     \
+  case RISCV::PseudoVSE##sew##_V_M##lmul:                                      \
+    NewOpc = RISCV::VS##lmul##R_V;                                             \
+    break;
+#define CASE_WHOLE_REGISTER_LMUL(lmul)                                         \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 8)                                        \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 16)                                       \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 32)                                       \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 64)
+
+  unsigned NewOpc;
+  switch (MI.getOpcode()) {
+    CASE_WHOLE_REGISTER_LMUL(1)
+    CASE_WHOLE_REGISTER_LMUL(2)
+    CASE_WHOLE_REGISTER_LMUL(4)
+    CASE_WHOLE_REGISTER_LMUL(8)
+  default:
+    return false;
+  }
+
+  MachineOperand &VLOp = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel)
+    return false;
+
+  // Whole register instructions aren't pseudos so they don't have
+  // policy/SEW/AVL ops, and they don't have passthrus.
+  if (RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags))
+    MI.removeOperand(RISCVII::getVecPolicyOpNum(MI.getDesc()));
+  MI.removeOperand(RISCVII::getSEWOpNum(MI.getDesc()));
+  MI.removeOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()))
+    MI.removeOperand(1);
+
+  MI.setDesc(TII->get(NewOpc));
+
+  return true;
+}
+
 // Transform (VMERGE_VVM_<LMUL> false, false, true, allones, vl, sew) to
 // (VMV_V_V_<LMUL> false, true, vl, sew). It may decrease uses of VMSET.
 bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
@@ -175,11 +254,12 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
     CASE_VMERGE_TO_VMV(M8)
   }
 
-  Register MergeReg = MI.getOperand(1).getReg();
+  Register PassthruReg = MI.getOperand(1).getReg();
   Register FalseReg = MI.getOperand(2).getReg();
-  // Check merge == false (or merge == undef)
-  if (MergeReg != RISCV::NoRegister && TRI->lookThruCopyLike(MergeReg, MRI) !=
-                                           TRI->lookThruCopyLike(FalseReg, MRI))
+  // Check passthru == false (or passthru == undef)
+  if (PassthruReg != RISCV::NoRegister &&
+      TRI->lookThruCopyLike(PassthruReg, MRI) !=
+          TRI->lookThruCopyLike(FalseReg, MRI))
     return false;
 
   assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0);
@@ -187,14 +267,14 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
     return false;
 
   MI.setDesc(TII->get(NewOpc));
-  MI.removeOperand(1);  // Merge operand
+  MI.removeOperand(1);  // Passthru operand
   MI.tieOperands(0, 1); // Tie false to dest
   MI.removeOperand(3);  // Mask operand
   MI.addOperand(
       MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED));
 
   // vmv.v.v doesn't have a mask operand, so we may be able to inflate the
-  // register class for the destination and merge operands e.g. VRNoV0 -> VR
+  // register class for the destination and passthru operands e.g. VRNoV0 -> VR
   MRI->recomputeRegClass(MI.getOperand(0).getReg());
   MRI->recomputeRegClass(MI.getOperand(1).getReg());
   return true;
@@ -249,11 +329,11 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   // Skip if the vector extension is not enabled.
-  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
-  if (!ST.hasVInstructions())
+  ST = &MF.getSubtarget<RISCVSubtarget>();
+  if (!ST->hasVInstructions())
     return false;
 
-  TII = ST.getInstrInfo();
+  TII = ST->getInstrInfo();
   MRI = &MF.getRegInfo();
   TRI = MRI->getTargetRegisterInfo();
 
@@ -281,6 +361,7 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
     for (MachineInstr &MI : MBB) {
       Changed |= convertToVLMAX(MI);
       Changed |= convertToUnmasked(MI);
+      Changed |= convertToWholeRegister(MI);
       Changed |= convertVMergeToVMv(MI);
     }
   }
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
index 74ebaa9d0c004..21a952649ff51 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -49,18 +49,9 @@ createSPIRVMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
   return createSPIRVMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
 }
 
-static MCStreamer *
-createSPIRVMCStreamer(const Triple &T, MCContext &Ctx,
-                      std::unique_ptr<MCAsmBackend> &&MAB,
-                      std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter) {
-  return createSPIRVStreamer(Ctx, std::move(MAB), std::move(OW),
-                             std::move(Emitter));
-}
-
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &,
-                                                 MCInstPrinter *, bool) {
+                                                 MCInstPrinter *) {
   return new SPIRVTargetStreamer(S);
 }
 
@@ -94,7 +85,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetMC() {
     TargetRegistry::RegisterMCInstrInfo(*T, createSPIRVMCInstrInfo);
     TargetRegistry::RegisterMCRegInfo(*T, createSPIRVMCRegisterInfo);
     TargetRegistry::RegisterMCSubtargetInfo(*T, createSPIRVMCSubtargetInfo);
-    TargetRegistry::RegisterSPIRVStreamer(*T, createSPIRVMCStreamer);
     TargetRegistry::RegisterMCInstPrinter(*T, createSPIRVMCInstPrinter);
     TargetRegistry::RegisterMCInstrAnalysis(*T, createSPIRVInstrAnalysis);
     TargetRegistry::RegisterMCCodeEmitter(*T, createSPIRVMCCodeEmitter);
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 3206c264f99d3..7fe8e11aaa420 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSPIRVObjectWriter.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -116,8 +117,8 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
   // number and number of generated OpLabels
   unsigned Bound = 2 * (ST->getBound() + 1) + NLabels;
   if (MCAssembler *Asm = OutStreamer->getAssemblerPtr())
-    Asm->setBuildVersion(static_cast<MachO::PlatformType>(0), Major, Minor,
-                         Bound, VersionTuple(Major, Minor, 0, Bound));
+    static_cast<SPIRVObjectWriter &>(Asm->getWriter())
+        .setBuildVersion(Major, Minor, Bound);
 }
 
 void SPIRVAsmPrinter::emitFunctionHeader() {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 2344ec529e16d..ae4e039744286 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -196,7 +196,7 @@ static void validateGroupWaitEventsPtr(const SPIRVSubtarget &STI,
   if (!ElemType || ElemType->getOpcode() == SPIRV::OpTypeEvent)
     return;
   // Insert a bitcast before the instruction to keep SPIR-V code valid.
-  LLVMContext &Context = MF->getMMI().getModule()->getContext();
+  LLVMContext &Context = MF->getFunction().getContext();
   SPIRVType *NewPtrType =
       createNewPtrType(GR, I, OpType, false, true, nullptr,
                        TargetExtType::get(Context, "spirv.Event"));
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 04def5ef01e7b..ed786bd33aa05 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -32,27 +32,26 @@
 #include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/Support/Debug.h"
 
-namespace llvm {
+namespace {
 
-class SPIRVMachineModuleInfo : public MachineModuleInfoImpl {
-public:
-  SyncScope::ID Work_ItemSSID;
-  SyncScope::ID WorkGroupSSID;
-  SyncScope::ID DeviceSSID;
-  SyncScope::ID AllSVMDevicesSSID;
-  SyncScope::ID SubGroupSSID;
-
-  SPIRVMachineModuleInfo(const MachineModuleInfo &MMI) {
-    LLVMContext &CTX = MMI.getModule()->getContext();
-    Work_ItemSSID = CTX.getOrInsertSyncScopeID("work_item");
-    WorkGroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
-    DeviceSSID = CTX.getOrInsertSyncScopeID("device");
-    AllSVMDevicesSSID = CTX.getOrInsertSyncScopeID("all_svm_devices");
-    SubGroupSSID = CTX.getOrInsertSyncScopeID("sub_group");
+struct SyncScopeIDs {
+  llvm::SyncScope::ID Work_ItemSSID;
+  llvm::SyncScope::ID WorkGroupSSID;
+  llvm::SyncScope::ID DeviceSSID;
+  llvm::SyncScope::ID AllSVMDevicesSSID;
+  llvm::SyncScope::ID SubGroupSSID;
+
+  SyncScopeIDs() {}
+  SyncScopeIDs(llvm::LLVMContext &Context) {
+    Work_ItemSSID = Context.getOrInsertSyncScopeID("work_item");
+    WorkGroupSSID = Context.getOrInsertSyncScopeID("workgroup");
+    DeviceSSID = Context.getOrInsertSyncScopeID("device");
+    AllSVMDevicesSSID = Context.getOrInsertSyncScopeID("all_svm_devices");
+    SubGroupSSID = Context.getOrInsertSyncScopeID("sub_group");
   }
 };
 
-} // end namespace llvm
+} // namespace
 
 #define DEBUG_TYPE "spirv-isel"
 
@@ -76,7 +75,7 @@ class SPIRVInstructionSelector : public InstructionSelector {
   const RegisterBankInfo &RBI;
   SPIRVGlobalRegistry &GR;
   MachineRegisterInfo *MRI;
-  SPIRVMachineModuleInfo *MMI = nullptr;
+  SyncScopeIDs SSIDs;
 
   /// We need to keep track of the number we give to anonymous global values to
   /// generate the same name every time when this is needed.
@@ -173,6 +172,9 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectFmix(Register ResVReg, const SPIRVType *ResType,
                   MachineInstr &I) const;
 
+  bool selectFrac(Register ResVReg, const SPIRVType *ResType,
+                  MachineInstr &I) const;
+
   bool selectRsqrt(Register ResVReg, const SPIRVType *ResType,
                    MachineInstr &I) const;
 
@@ -277,7 +279,7 @@ void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
-  MMI = &MF.getMMI().getObjFileInfo<SPIRVMachineModuleInfo>();
+  SSIDs = SyncScopeIDs(MF.getFunction().getContext());
   MRI = &MF.getRegInfo();
   GR.setCurrentFunc(MF);
   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
@@ -718,16 +720,16 @@ bool SPIRVInstructionSelector::selectBitcast(Register ResVReg,
 }
 
 static SPIRV::Scope::Scope getScope(SyncScope::ID Ord,
-                                    SPIRVMachineModuleInfo *MMI) {
-  if (Ord == SyncScope::SingleThread || Ord == MMI->Work_ItemSSID)
+                                    const SyncScopeIDs &SSIDs) {
+  if (Ord == SyncScope::SingleThread || Ord == SSIDs.Work_ItemSSID)
     return SPIRV::Scope::Invocation;
-  else if (Ord == SyncScope::System || Ord == MMI->DeviceSSID)
+  else if (Ord == SyncScope::System || Ord == SSIDs.DeviceSSID)
     return SPIRV::Scope::Device;
-  else if (Ord == MMI->WorkGroupSSID)
+  else if (Ord == SSIDs.WorkGroupSSID)
     return SPIRV::Scope::Workgroup;
-  else if (Ord == MMI->AllSVMDevicesSSID)
+  else if (Ord == SSIDs.AllSVMDevicesSSID)
     return SPIRV::Scope::CrossDevice;
-  else if (Ord == MMI->SubGroupSSID)
+  else if (Ord == SSIDs.SubGroupSSID)
     return SPIRV::Scope::Subgroup;
   else
     // OpenCL approach is: "The functions that do not have memory_scope argument
@@ -893,7 +895,7 @@ bool SPIRVInstructionSelector::selectAtomicRMW(Register ResVReg,
   assert(I.hasOneMemOperand());
   const MachineMemOperand *MemOp = *I.memoperands_begin();
   uint32_t Scope =
-      static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), MMI));
+      static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), SSIDs));
   Register ScopeReg = buildI32Constant(Scope, I);
 
   Register Ptr = I.getOperand(1).getReg();
@@ -964,7 +966,7 @@ bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const {
   uint32_t MemSem = static_cast<uint32_t>(getMemSemantics(AO));
   Register MemSemReg = buildI32Constant(MemSem, I);
   SyncScope::ID Ord = SyncScope::ID(I.getOperand(1).getImm());
-  uint32_t Scope = static_cast<uint32_t>(getScope(Ord, MMI));
+  uint32_t Scope = static_cast<uint32_t>(getScope(Ord, SSIDs));
   Register ScopeReg = buildI32Constant(Scope, I);
   MachineBasicBlock &BB = *I.getParent();
   return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpMemoryBarrier))
@@ -984,7 +986,7 @@ bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg,
     assert(I.hasOneMemOperand());
     const MachineMemOperand *MemOp = *I.memoperands_begin();
     unsigned Scope =
-        static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), MMI));
+        static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), SSIDs));
     ScopeReg = buildI32Constant(Scope, I);
 
     unsigned ScSem = static_cast<uint32_t>(
@@ -1330,6 +1332,23 @@ bool SPIRVInstructionSelector::selectFmix(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
+bool SPIRVInstructionSelector::selectFrac(Register ResVReg,
+                                           const SPIRVType *ResType,
+                                           MachineInstr &I) const {
+
+  assert(I.getNumOperands() == 3);
+  assert(I.getOperand(2).isReg());
+  MachineBasicBlock &BB = *I.getParent();
+
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
+      .addImm(GL::Fract)
+      .addUse(I.getOperand(2).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
 bool SPIRVInstructionSelector::selectRsqrt(Register ResVReg,
                                            const SPIRVType *ResType,
                                            MachineInstr &I) const {
@@ -2059,6 +2078,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectAny(ResVReg, ResType, I);
   case Intrinsic::spv_lerp:
     return selectFmix(ResVReg, ResType, I);
+  case Intrinsic::spv_frac:
+    return selectFrac(ResVReg, ResType, I);
   case Intrinsic::spv_rsqrt:
     return selectRsqrt(ResVReg, ResType, I);
   case Intrinsic::spv_lifetime_start:
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
index ecd99f1840d7e..0ed45466788a0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "SPIRVRegisterBankInfo.h"
 #include "SPIRVRegisterInfo.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/RegisterBank.h"
 
 #define GET_REGINFO_ENUM
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 29282582b82bd..66826fadddd2e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -132,7 +132,7 @@ namespace {
   class SparcAsmBackend : public MCAsmBackend {
   protected:
     bool Is64Bit;
-    bool HasV9;
+    bool IsV8Plus;
 
   public:
     SparcAsmBackend(const MCSubtargetInfo &STI)
@@ -140,7 +140,7 @@ namespace {
                            ? llvm::endianness::little
                            : llvm::endianness::big),
           Is64Bit(STI.getTargetTriple().isArch64Bit()),
-          HasV9(STI.hasFeature(Sparc::FeatureV9)) {}
+          IsV8Plus(STI.hasFeature(Sparc::FeatureV8Plus)) {}
 
     unsigned getNumFixupKinds() const override {
       return Sparc::NumTargetFixupKinds;
@@ -359,7 +359,7 @@ namespace {
     std::unique_ptr<MCObjectTargetWriter>
     createObjectTargetWriter() const override {
       uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
-      return createSparcELFObjectWriter(Is64Bit, HasV9, OSABI);
+      return createSparcELFObjectWriter(Is64Bit, IsV8Plus, OSABI);
     }
   };
 
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index bfd71af736231..601199d7170a7 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -21,11 +21,11 @@ using namespace llvm;
 namespace {
   class SparcELFObjectWriter : public MCELFObjectTargetWriter {
   public:
-    SparcELFObjectWriter(bool Is64Bit, bool HasV9, uint8_t OSABI)
+    SparcELFObjectWriter(bool Is64Bit, bool IsV8Plus, uint8_t OSABI)
         : MCELFObjectTargetWriter(
               Is64Bit, OSABI,
               Is64Bit ? ELF::EM_SPARCV9
-                      : (HasV9 ? ELF::EM_SPARC32PLUS : ELF::EM_SPARC),
+                      : (IsV8Plus ? ELF::EM_SPARC32PLUS : ELF::EM_SPARC),
               /*HasRelocationAddend*/ true) {}
 
     ~SparcELFObjectWriter() override = default;
@@ -148,6 +148,6 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCValue &,
 }
 
 std::unique_ptr<MCObjectTargetWriter>
-llvm::createSparcELFObjectWriter(bool Is64Bit, bool HasV9, uint8_t OSABI) {
-  return std::make_unique<SparcELFObjectWriter>(Is64Bit, HasV9, OSABI);
+llvm::createSparcELFObjectWriter(bool Is64Bit, bool IsV8Plus, uint8_t OSABI) {
+  return std::make_unique<SparcELFObjectWriter>(Is64Bit, IsV8Plus, OSABI);
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index ad6ca0911adb9..72f9b3bcd9681 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -86,13 +86,12 @@ createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
 
 static MCTargetStreamer *
 createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
-  return new SparcTargetELFStreamer(S);
+  return new SparcTargetELFStreamer(S, STI);
 }
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm) {
+                                                 MCInstPrinter *InstPrint) {
   return new SparcTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index 63419663b722c..a7b0538d683b6 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -35,7 +35,7 @@ MCAsmBackend *createSparcAsmBackend(const Target &T, const MCSubtargetInfo &STI,
                                     const MCRegisterInfo &MRI,
                                     const MCTargetOptions &Options);
 std::unique_ptr<MCObjectTargetWriter>
-createSparcELFObjectWriter(bool Is64Bit, bool HasV9, uint8_t OSABI);
+createSparcELFObjectWriter(bool Is64Bit, bool IsV8Plus, uint8_t OSABI);
 
 // Defines symbolic names for Sparc v9 ASI tag names.
 namespace SparcASITag {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
index d2dcf200aef04..3205d06b295ee 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -12,11 +12,24 @@
 
 #include "SparcTargetStreamer.h"
 #include "SparcInstPrinter.h"
+#include "SparcMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/FormattedStream.h"
 
 using namespace llvm;
 
+static unsigned getEFlagsForFeatureSet(const MCSubtargetInfo &STI) {
+  unsigned EFlags = 0;
+
+  if (STI.hasFeature(Sparc::FeatureV8Plus))
+    EFlags |= ELF::EF_SPARC_32PLUS;
+
+  return EFlags;
+}
+
 // pin vtable to this file
 SparcTargetStreamer::SparcTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
 
@@ -38,8 +51,16 @@ void SparcTargetAsmStreamer::emitSparcRegisterScratch(unsigned reg) {
      << ", #scratch\n";
 }
 
-SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S)
-    : SparcTargetStreamer(S) {}
+SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S,
+                                               const MCSubtargetInfo &STI)
+    : SparcTargetStreamer(S) {
+  ELFObjectWriter &W = getStreamer().getWriter();
+  unsigned EFlags = W.getELFHeaderEFlags();
+
+  EFlags |= getEFlagsForFeatureSet(STI);
+
+  W.setELFHeaderEFlags(EFlags);
+}
 
 MCELFStreamer &SparcTargetELFStreamer::getStreamer() {
   return static_cast<MCELFStreamer &>(Streamer);
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
index ef28afa06bffb..bd9954ffb687b 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
@@ -40,7 +40,7 @@ class SparcTargetAsmStreamer : public SparcTargetStreamer {
 // This part is for ELF object output
 class SparcTargetELFStreamer : public SparcTargetStreamer {
 public:
-  SparcTargetELFStreamer(MCStreamer &S);
+  SparcTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
   MCELFStreamer &getStreamer();
   void emitSparcRegisterIgnore(unsigned reg) override {}
   void emitSparcRegisterScratch(unsigned reg) override {}
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 65f372f4376b1..8b1122741b661 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -34,6 +34,9 @@ def FeatureNoFMULS
 def FeatureV9
   : SubtargetFeature<"v9", "IsV9", "true",
                      "Enable SPARC-V9 instructions">;
+def FeatureV8Plus
+  : SubtargetFeature<"v8plus", "IsV8Plus", "true",
+                     "Enable V8+ mode, allowing use of 64-bit V9 instructions in 32-bit code">;
 def FeatureV8Deprecated
   : SubtargetFeature<"deprecated-v8", "UseV8DeprecatedInsts", "true",
                      "Enable deprecated V8 instructions in V9 mode">;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 50aa19446f880..42b8248006d1f 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1731,16 +1731,12 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
 
-  setOperationAction(ISD::ADDC, MVT::i32, Custom);
-  setOperationAction(ISD::ADDE, MVT::i32, Custom);
-  setOperationAction(ISD::SUBC, MVT::i32, Custom);
-  setOperationAction(ISD::SUBE, MVT::i32, Custom);
+  setOperationAction(ISD::ADDC, MVT::i32, Legal);
+  setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);
 
   if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::ADDC, MVT::i64, Custom);
-    setOperationAction(ISD::ADDE, MVT::i64, Custom);
-    setOperationAction(ISD::SUBC, MVT::i64, Custom);
-    setOperationAction(ISD::SUBE, MVT::i64, Custom);
     setOperationAction(ISD::BITCAST, MVT::f64, Expand);
     setOperationAction(ISD::BITCAST, MVT::i64, Expand);
     setOperationAction(ISD::SELECT, MVT::i64, Expand);
@@ -1855,9 +1851,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::MULHU,     MVT::i64, Expand);
     setOperationAction(ISD::MULHS,     MVT::i64, Expand);
 
-    setOperationAction(ISD::UMULO,     MVT::i64, Custom);
-    setOperationAction(ISD::SMULO,     MVT::i64, Custom);
-
     setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
     setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
     setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
@@ -3105,110 +3098,6 @@ static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
   return DstReg128;
 }
 
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
-
-  if (Op.getValueType() != MVT::i64)
-    return Op;
-
-  SDLoc dl(Op);
-  SDValue Src1 = Op.getOperand(0);
-  SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
-  SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
-                               DAG.getConstant(32, dl, MVT::i64));
-  Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
-
-  SDValue Src2 = Op.getOperand(1);
-  SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
-  SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
-                               DAG.getConstant(32, dl, MVT::i64));
-  Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
-
-
-  bool hasChain = false;
-  unsigned hiOpc = Op.getOpcode();
-  switch (Op.getOpcode()) {
-  default: llvm_unreachable("Invalid opcode");
-  case ISD::ADDC: hiOpc = ISD::ADDE; break;
-  case ISD::ADDE: hasChain = true; break;
-  case ISD::SUBC: hiOpc = ISD::SUBE; break;
-  case ISD::SUBE: hasChain = true; break;
-  }
-  SDValue Lo;
-  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
-  if (hasChain) {
-    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
-                     Op.getOperand(2));
-  } else {
-    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
-  }
-  SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
-  SDValue Carry = Hi.getValue(1);
-
-  Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
-  Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
-  Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
-                   DAG.getConstant(32, dl, MVT::i64));
-
-  SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
-  SDValue Ops[2] = { Dst, Carry };
-  return DAG.getMergeValues(Ops, dl);
-}
-
-// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
-// in LegalizeDAG.cpp except the order of arguments to the library function.
-static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
-                                const SparcTargetLowering &TLI)
-{
-  unsigned opcode = Op.getOpcode();
-  assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
-
-  bool isSigned = (opcode == ISD::SMULO);
-  EVT VT = MVT::i64;
-  EVT WideVT = MVT::i128;
-  SDLoc dl(Op);
-  SDValue LHS = Op.getOperand(0);
-
-  if (LHS.getValueType() != VT)
-    return Op;
-
-  SDValue ShiftAmt = DAG.getConstant(63, dl, VT);
-
-  SDValue RHS = Op.getOperand(1);
-  SDValue HiLHS, HiRHS;
-  if (isSigned) {
-    HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
-    HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
-  } else {
-    HiLHS = DAG.getConstant(0, dl, VT);
-    HiRHS = DAG.getConstant(0, dl, MVT::i64);
-  }
-
-  SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
-
-  TargetLowering::MakeLibCallOptions CallOptions;
-  CallOptions.setSExt(isSigned);
-  SDValue MulResult = TLI.makeLibCall(DAG,
-                                      RTLIB::MUL_I128, WideVT,
-                                      Args, CallOptions, dl).first;
-  SDValue BottomHalf, TopHalf;
-  std::tie(BottomHalf, TopHalf) = DAG.SplitScalar(MulResult, dl, VT, VT);
-  if (isSigned) {
-    SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
-    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
-  } else {
-    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT),
-                           ISD::SETNE);
-  }
-  // MulResult is a node with an illegal type. Because such things are not
-  // generally permitted during this phase of legalization, ensure that
-  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
-  // been folded.
-  assert(MulResult->use_empty() && "Illegally typed node still in use!");
-
-  SDValue Ops[2] = { BottomHalf, TopHalf } ;
-  return DAG.getMergeValues(Ops, dl);
-}
-
 static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) {
     // Expand with a fence.
@@ -3283,12 +3172,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FNEG:               return LowerFNEGorFABS(Op, DAG, isV9);
   case ISD::FP_EXTEND:          return LowerF128_FPEXTEND(Op, DAG, *this);
   case ISD::FP_ROUND:           return LowerF128_FPROUND(Op, DAG, *this);
-  case ISD::ADDC:
-  case ISD::ADDE:
-  case ISD::SUBC:
-  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
-  case ISD::UMULO:
-  case ISD::SMULO:              return LowerUMULO_SMULO(Op, DAG, *this);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:       return LowerATOMIC_LOAD_STORE(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 8e86c59bf9194..f58674ee118ee 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -223,11 +223,9 @@ class SystemZTargetELFStreamer : public SystemZTargetStreamer {
 };
 } // end namespace
 
-static MCTargetStreamer *
-createAsmTargetStreamer(MCStreamer &S,
-                        formatted_raw_ostream &OS,
-                        MCInstPrinter *InstPrint,
-                        bool isVerboseAsm) {
+static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint) {
   return new SystemZTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 3d025a99b3d83..ed400e9eceb9c 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -346,9 +346,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
             *OutStreamer,
             MCInstBuilder(SystemZ::LLILF).addReg(TargetReg).addImm(Disp));
       } else
-        EmitToStreamer(
-            *OutStreamer,
-            MCInstBuilder(SystemZ::ALGFI).addReg(TargetReg).addImm(Disp));
+        EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::ALGFI)
+                                         .addReg(TargetReg)
+                                         .addReg(TargetReg)
+                                         .addImm(Disp));
       Disp = 0;
       Op = Op0;
     }
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a769fdeff5684..8c53b8dffc2fa 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -517,8 +517,7 @@ static void buildDefCFAReg(MachineBasicBlock &MBB,
                            const DebugLoc &DL, unsigned Reg,
                            const SystemZInstrInfo *ZII) {
   MachineFunction &MF = *MBB.getParent();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
   unsigned CFIIndex = MF.addFrameInst(
                         MCCFIInstruction::createDefCfaRegister(nullptr, RegNum));
@@ -535,8 +534,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
   auto *ZII = static_cast<const SystemZInstrInfo *>(STI.getInstrInfo());
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFFrame.getCalleeSavedInfo();
   bool HasFP = hasFP(MF);
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index b2b88143354a5..383393914a169 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6653,7 +6653,8 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
 
           // Defer the creation of the bitcast from X to combineExtract,
           // which might be able to optimize the extraction.
-          VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+          VecVT = EVT::getVectorVT(*DCI.DAG.getContext(),
+                                   MVT::getIntegerVT(TruncBytes * 8),
                                    VecVT.getStoreSize() / TruncBytes);
           EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
           return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
index d2e05b63c6c63..ea194a38090db 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -209,13 +209,23 @@ def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, z_load, 8>;
 def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, z_load, 8>;
 
 // Fused multiply-add (unnormalized).
-def MAYR  : TernaryRRD<"mayr",  0xB33A, null_frag, FP128, FP64>;
 def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64,  FP64>;
 def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64,  FP64>;
-def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, z_load, 8>;
 def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, z_load, 8>;
 def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, z_load, 8>;
 
+// MAY and MAYR allow the user to specify the floating point register pair
+// making up the FP128 register by either the lower-numbered register or the
+// higher-numbered register, in contrast to all other floating point
+// instructions.
+// For this reason, the defs below accept `FP64,FP64` instead of `FP128,FP64`.
+// This is ok since these instructions are not used in code generation.
+// If and when code generation is enabled, the code gen variants should be
+// split out from this and use the proper register classes, while these should
+// remain for the Assembler and Disassembler to remain compliant with the POP.
+def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP64, FP64, z_load, 8>;
+def MAYR  : TernaryRRD<"mayr",  0xB33A, null_frag, FP64, FP64>;
+
 // Division.
 def DER : BinaryRR <"der", 0x3D,   null_frag, FP32,  FP32>;
 def DDR : BinaryRR <"ddr", 0x2D,   null_frag, FP64,  FP64>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 16bbfd44ef8a9..d0758891fe570 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -631,8 +631,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(nullptr, SawStore) ||
-      !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+  if (!DefMI->isSafeToMove(SawStore) || !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
     return nullptr;
 
   int UseOpIdx =
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 7ab0b36636304..95ed1a00e603f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -298,13 +298,13 @@ let Predicates = [IsTargetXPLINK64] in {
   }
 
   let hasNoSchedulingInfo = 1, Defs = [CC] in {
-    def ADA_ENTRY : Alias<12, (outs GR64:$Reg), (ins adasym:$addr,
+    def ADA_ENTRY : Alias<12, (outs ADDR64:$Reg), (ins adasym:$addr,
                               ADDR64:$ADA, imm64:$Offset),
                             [(set i64:$Reg, (z_ada_entry i64:$addr,
                               i64:$ADA, i64:$Offset))]>;
   }
   let mayLoad = 1, AddedComplexity = 20, hasNoSchedulingInfo = 1, Defs = [CC] in {
-    def ADA_ENTRY_VALUE : Alias<12, (outs GR64:$Reg), (ins adasym:$addr,
+    def ADA_ENTRY_VALUE : Alias<12, (outs ADDR64:$Reg), (ins adasym:$addr,
                               ADDR64:$ADA, imm64:$Offset),
                             [(set i64:$Reg, (z_load (z_ada_entry
                               iPTR:$addr, iPTR:$ADA, i64:$Offset)))]>;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index 6611e4c42eb2b..019748413d32e 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -68,8 +68,7 @@ createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm) {
+                                                 MCInstPrinter *InstPrint) {
   return new VETargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 71dfe1062956e..cea282f62fe15 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -111,10 +111,9 @@ createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
   return new WebAssemblyTargetWasmStreamer(S);
 }
 
-static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
-                                                 formatted_raw_ostream &OS,
-                                                 MCInstPrinter * /*InstPrint*/,
-                                                 bool /*isVerboseAsm*/) {
+static MCTargetStreamer *
+createAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                        MCInstPrinter * /*InstPrint*/) {
   return new WebAssemblyTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index aa3aa1b007a53..ea795cd00ed5b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -886,7 +886,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
     MIB.addImm(0);
     // The table into which this call_indirect indexes.
     MCSymbolWasm *Table = WebAssembly::getOrCreateFunctionTableSymbol(
-        MF->getMMI().getContext(), Subtarget);
+        MF->getContext(), Subtarget);
     if (Subtarget->hasReferenceTypes()) {
       MIB.addSym(Table);
     } else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
index 4252fc1f55fc9..d8fb15666662c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
@@ -42,7 +42,7 @@ class WebAssemblyFixBrTableDefaults final : public MachineFunctionPass {
 
 char WebAssemblyFixBrTableDefaults::ID = 0;
 
-// Target indepedent selection dag assumes that it is ok to use PointerTy
+// Target independent selection dag assumes that it is ok to use PointerTy
 // as the index for a "switch", whereas Wasm so far only has a 32-bit br_table.
 // See e.g. SelectionDAGBuilder::visitJumpTableHeader
 // We have a 64-bit br_table in the tablegen defs as a result, which does get
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 3d37eb2fa27bc..bb36ce7650183 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -78,7 +78,7 @@ def HasSignExt :
 
 def HasSIMD128 :
     Predicate<"Subtarget->hasSIMD128()">,
-    AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
+    AssemblerPredicate<(any_of FeatureSIMD128, FeatureRelaxedSIMD), "simd128">;
 
 def HasTailCall :
     Predicate<"Subtarget->hasTailCall()">,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 2ee430c88169d..26fe61b1d6051 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -702,6 +702,19 @@ defm "" : ReplaceLane<I64x2, 30>;
 defm "" : ReplaceLane<F32x4, 32>;
 defm "" : ReplaceLane<F64x2, 34>;
 
+// For now use an instrinsic for f16x8.replace_lane instead of ReplaceLane above
+// since LLVM IR generated with half type arguments is not well supported and
+// creates conversions from f16->f32.
+defm REPLACE_LANE_F16x8 :
+  HALF_PRECISION_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx, F32:$x),
+                   (outs), (ins vec_i8imm_op:$idx),
+                   [(set (v8f16 V128:$dst), (int_wasm_replace_lane_f16x8
+                     (v8f16 V128:$vec),
+                     (i32 LaneIdx8:$idx),
+                     (f32 F32:$x)))],
+                   "f16x8.replace_lane\t$dst, $vec, $idx, $x",
+                   "f16x8.replace_lane\t$idx", 0x122>;
+
 // Lower undef lane indices to zero
 def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
           (REPLACE_LANE_I8x16 $vec, 0, $x)>;
@@ -1270,7 +1283,11 @@ def pmin : PatFrags<(ops node:$lhs, node:$rhs), [
                     (vselect (setolt $rhs, $lhs), $rhs, $lhs),
                     (vselect (setole $rhs, $lhs), $rhs, $lhs),
                     (vselect (setogt $lhs, $rhs), $rhs, $lhs),
-                    (vselect (setoge $lhs, $rhs), $rhs, $lhs)
+                    (vselect (setoge $lhs, $rhs), $rhs, $lhs),
+                    (vselect (setlt $lhs, $rhs), $lhs, $rhs),
+                    (vselect (setle $lhs, $rhs), $lhs, $rhs),
+                    (vselect (setgt $lhs, $rhs), $rhs, $lhs),
+                    (vselect (setge $lhs, $rhs), $rhs, $lhs)
 ]>;
 defm PMIN : SIMDBinaryFP<pmin, "pmin", 234>;
 
@@ -1279,7 +1296,11 @@ def pmax : PatFrags<(ops node:$lhs, node:$rhs), [
                     (vselect (setogt $rhs, $lhs), $rhs, $lhs),
                     (vselect (setoge $rhs, $lhs), $rhs, $lhs),
                     (vselect (setolt $lhs, $rhs), $rhs, $lhs),
-                    (vselect (setole $lhs, $rhs), $rhs, $lhs)
+                    (vselect (setole $lhs, $rhs), $rhs, $lhs),
+                    (vselect (setgt $lhs, $rhs), $lhs, $rhs),
+                    (vselect (setge $lhs, $rhs), $lhs, $rhs),
+                    (vselect (setlt $lhs, $rhs), $rhs, $lhs),
+                    (vselect (setle $lhs, $rhs), $rhs, $lhs)
 ]>;
 defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 7cc030460e30f..17bec8e2a6a45 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -777,6 +777,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
   SSAUpdaterBulk SSA;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
+      if (I.getType()->isVoidTy())
+        continue;
       unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
       // If a value is defined by an invoke instruction, it is only available in
       // its normal destination and not in its unwind destination.
@@ -1687,6 +1689,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
     }
   }
 
+  SmallDenseMap<BasicBlock *, SmallSetVector<BasicBlock *, 4>, 4>
+      UnwindDestToNewPreds;
   for (auto *CI : LongjmpableCalls) {
     // Even if the callee function has attribute 'nounwind', which is true for
     // all C functions, it can longjmp, which means it can throw a Wasm
@@ -1724,6 +1728,11 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
     }
     if (!UnwindDest)
       UnwindDest = CatchDispatchLongjmpBB;
+    // Because we are changing a longjmpable call to an invoke, its unwind
+    // destination can be an existing EH pad that already have phis, and the BB
+    // with the newly created invoke will become a new predecessor of that EH
+    // pad. In this case we need to add the new predecessor to those phis.
+    UnwindDestToNewPreds[UnwindDest].insert(CI->getParent());
     changeToInvokeAndSplitBasicBlock(CI, UnwindDest);
   }
 
@@ -1752,4 +1761,45 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
 
   for (Instruction *I : ToErase)
     I->eraseFromParent();
+
+  // Add entries for new predecessors to phis in unwind destinations. We use
+  // 'undef' as a placeholder value. We should make sure the phis have a valid
+  // set of predecessors before running SSAUpdater, because SSAUpdater
+  // internally can use existing phis to gather predecessor info rather than
+  // scanning the actual CFG (See FindPredecessorBlocks in SSAUpdater.cpp for
+  // details).
+  for (auto &[UnwindDest, NewPreds] : UnwindDestToNewPreds) {
+    for (PHINode &PN : UnwindDest->phis()) {
+      for (auto *NewPred : NewPreds) {
+        assert(PN.getBasicBlockIndex(NewPred) == -1);
+        PN.addIncoming(UndefValue::get(PN.getType()), NewPred);
+      }
+    }
+  }
+
+  // For unwind destinations for newly added invokes to longjmpable functions,
+  // calculate incoming values for the newly added predecessors using
+  // SSAUpdater. We add existing values in the phis to SSAUpdater as available
+  // values and let it calculate what the value should be at the end of new
+  // incoming blocks.
+  for (auto &[UnwindDest, NewPreds] : UnwindDestToNewPreds) {
+    for (PHINode &PN : UnwindDest->phis()) {
+      SSAUpdater SSA;
+      SSA.Initialize(PN.getType(), PN.getName());
+      for (unsigned Idx = 0, E = PN.getNumIncomingValues(); Idx != E; ++Idx) {
+        if (NewPreds.contains(PN.getIncomingBlock(Idx)))
+          continue;
+        Value *V = PN.getIncomingValue(Idx);
+        if (auto *II = dyn_cast<InvokeInst>(V))
+          SSA.AddAvailableValue(II->getNormalDest(), II);
+        else if (auto *I = dyn_cast<Instruction>(V))
+          SSA.AddAvailableValue(I->getParent(), I);
+        else
+          SSA.AddAvailableValue(PN.getIncomingBlock(Idx), V);
+      }
+      for (auto *NewPred : NewPreds)
+        PN.setIncomingValueForBlock(NewPred, SSA.GetValueAtEndOfBlock(NewPred));
+      assert(PN.isComplete());
+    }
+  }
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index e7810e18d44d4..ba3ab5164af26 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -204,6 +204,24 @@ struct RuntimeLibcallSignatureTable {
     Table[RTLIB::TAN_F32] = f32_func_f32;
     Table[RTLIB::TAN_F64] = f64_func_f64;
     Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::ASIN_F32] = f32_func_f32;
+    Table[RTLIB::ASIN_F64] = f64_func_f64;
+    Table[RTLIB::ASIN_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::ACOS_F32] = f32_func_f32;
+    Table[RTLIB::ACOS_F64] = f64_func_f64;
+    Table[RTLIB::ACOS_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::ATAN_F32] = f32_func_f32;
+    Table[RTLIB::ATAN_F64] = f64_func_f64;
+    Table[RTLIB::ATAN_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::SINH_F32] = f32_func_f32;
+    Table[RTLIB::SINH_F64] = f64_func_f64;
+    Table[RTLIB::SINH_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::COSH_F32] = f32_func_f32;
+    Table[RTLIB::COSH_F64] = f64_func_f64;
+    Table[RTLIB::COSH_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::TANH_F32] = f32_func_f32;
+    Table[RTLIB::TANH_F64] = f64_func_f64;
+    Table[RTLIB::TANH_F128] = i64_i64_func_i64_i64;
     Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR;
     Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR;
     Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR;
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c7f88fed9b128..6e17150edf278 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3776,6 +3776,17 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
   if (X86::optimizeShiftRotateWithImmediateOne(Inst))
     return true;
 
+  auto replaceWithCCMPCTEST = [&](unsigned Opcode) -> bool {
+    if (ForcedOpcodePrefix == OpcodePrefix_EVEX) {
+      Inst.setFlags(~(X86::IP_USE_EVEX)&Inst.getFlags());
+      Inst.setOpcode(Opcode);
+      Inst.addOperand(MCOperand::createImm(0));
+      Inst.addOperand(MCOperand::createImm(10));
+      return true;
+    }
+    return false;
+  };
+
   switch (Inst.getOpcode()) {
   default: return false;
   case X86::JMP_1:
@@ -3807,6 +3818,61 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
     Inst.setOpcode(X86::INT3);
     return true;
   }
+  // `{evex} cmp <>, <>` is alias of `ccmpt {dfv=} <>, <>`, and
+  // `{evex} test <>, <>` is alias of `ctest {dfv=} <>, <>`
+#define FROM_TO(FROM, TO)                                                      \
+  case X86::FROM:                                                              \
+    return replaceWithCCMPCTEST(X86::TO);
+    FROM_TO(CMP64rr, CCMP64rr)
+    FROM_TO(CMP64mi32, CCMP64mi32)
+    FROM_TO(CMP64mi8, CCMP64mi8)
+    FROM_TO(CMP64mr, CCMP64mr)
+    FROM_TO(CMP64ri32, CCMP64ri32)
+    FROM_TO(CMP64ri8, CCMP64ri8)
+    FROM_TO(CMP64rm, CCMP64rm)
+
+    FROM_TO(CMP32rr, CCMP32rr)
+    FROM_TO(CMP32mi, CCMP32mi)
+    FROM_TO(CMP32mi8, CCMP32mi8)
+    FROM_TO(CMP32mr, CCMP32mr)
+    FROM_TO(CMP32ri, CCMP32ri)
+    FROM_TO(CMP32ri8, CCMP32ri8)
+    FROM_TO(CMP32rm, CCMP32rm)
+
+    FROM_TO(CMP16rr, CCMP16rr)
+    FROM_TO(CMP16mi, CCMP16mi)
+    FROM_TO(CMP16mi8, CCMP16mi8)
+    FROM_TO(CMP16mr, CCMP16mr)
+    FROM_TO(CMP16ri, CCMP16ri)
+    FROM_TO(CMP16ri8, CCMP16ri8)
+    FROM_TO(CMP16rm, CCMP16rm)
+
+    FROM_TO(CMP8rr, CCMP8rr)
+    FROM_TO(CMP8mi, CCMP8mi)
+    FROM_TO(CMP8mr, CCMP8mr)
+    FROM_TO(CMP8ri, CCMP8ri)
+    FROM_TO(CMP8rm, CCMP8rm)
+
+    FROM_TO(TEST64rr, CTEST64rr)
+    FROM_TO(TEST64mi32, CTEST64mi32)
+    FROM_TO(TEST64mr, CTEST64mr)
+    FROM_TO(TEST64ri32, CTEST64ri32)
+
+    FROM_TO(TEST32rr, CTEST32rr)
+    FROM_TO(TEST32mi, CTEST32mi)
+    FROM_TO(TEST32mr, CTEST32mr)
+    FROM_TO(TEST32ri, CTEST32ri)
+
+    FROM_TO(TEST16rr, CTEST16rr)
+    FROM_TO(TEST16mi, CTEST16mi)
+    FROM_TO(TEST16mr, CTEST16mr)
+    FROM_TO(TEST16ri, CTEST16ri)
+
+    FROM_TO(TEST8rr, CTEST8rr)
+    FROM_TO(TEST8mi, CTEST8mi)
+    FROM_TO(TEST8mr, CTEST8mr)
+    FROM_TO(TEST8ri, CTEST8ri)
+#undef FROM_TO
   }
 }
 
@@ -4158,7 +4224,10 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
       return Match_Unsupported;
     break;
   case OpcodePrefix_EVEX:
-    if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
+    if (is64BitMode() && (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
+        !X86::isCMP(Opc) && !X86::isTEST(Opc))
+      return Match_Unsupported;
+    if (!is64BitMode() && (TSFlags & X86II::EncodingMask) != X86II::EVEX)
       return Match_Unsupported;
     break;
   }
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index dd8ecf6ef7fc7..7169d588548b0 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -315,7 +315,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   getActionDefinitionsBuilder(G_PHI)
       .legalIf([=](const LegalityQuery &Query) -> bool {
         return typeInSet(0, {s8, s16, s32, p0})(Query) ||
-               (Is64Bit && typeInSet(0, {s64})(Query)) ||
+               (UseX87 && typeIs(0, s80)(Query)) ||
+               (Is64Bit && typeIs(0, s64)(Query)) ||
                (HasSSE1 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query)) ||
                (HasAVX && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query)) ||
                (HasAVX512 &&
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index 9e85424e76e62..61633a09d93cf 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -44,33 +44,6 @@ X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) {
          "GPRs should hold up to 64-bit");
 }
 
-const RegisterBank &
-X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                            LLT) const {
-
-  if (X86::GR8RegClass.hasSubClassEq(&RC) ||
-      X86::GR16RegClass.hasSubClassEq(&RC) ||
-      X86::GR32RegClass.hasSubClassEq(&RC) ||
-      X86::GR64RegClass.hasSubClassEq(&RC) ||
-      X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) ||
-      X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::GPRRegBankID);
-
-  if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
-      X86::FR64XRegClass.hasSubClassEq(&RC) ||
-      X86::VR128XRegClass.hasSubClassEq(&RC) ||
-      X86::VR256XRegClass.hasSubClassEq(&RC) ||
-      X86::VR512RegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::VECRRegBankID);
-
-  if (X86::RFP80RegClass.hasSubClassEq(&RC) ||
-      X86::RFP32RegClass.hasSubClassEq(&RC) ||
-      X86::RFP64RegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::PSRRegBankID);
-
-  llvm_unreachable("Unsupported register kind yet.");
-}
-
 // \returns true if a given intrinsic only uses and defines FPRs.
 static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
                           const MachineInstr &MI) {
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
index 8f38e717e36b0..f30e9fea1a597 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
@@ -81,9 +81,6 @@ class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
 public:
   X86RegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   InstructionMappings
   getInstrAlternativeMappings(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 09a6d57afc0ba..cf0cb92424c16 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -437,8 +437,6 @@ static size_t getSizeForInstFragment(const MCFragment *F) {
     return cast<MCDataFragment>(*F).getContents().size();
   case MCFragment::FT_Relaxable:
     return cast<MCRelaxableFragment>(*F).getContents().size();
-  case MCFragment::FT_CompactEncodedInst:
-    return cast<MCCompactEncodedInstFragment>(*F).getContents().size();
   }
 }
 
@@ -884,9 +882,7 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm) const {
       if (LabeledFragments.count(&F))
         Relaxable.clear();
 
-      if (F.getKind() == MCFragment::FT_Data ||
-          F.getKind() == MCFragment::FT_CompactEncodedInst)
-        // Skip and ignore
+      if (F.getKind() == MCFragment::FT_Data) // Skip and ignore
         continue;
 
       if (F.getKind() == MCFragment::FT_Relaxable) {
@@ -1328,7 +1324,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
 
   /// Implementation of algorithm to generate the compact unwind encoding
   /// for the CFI instructions.
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty()) return 0;
@@ -1343,13 +1339,13 @@ class DarwinX86AsmBackend : public X86AsmBackend {
     bool HasFP = false;
 
     // Encode that we are using EBP/RBP as the frame pointer.
-    uint32_t CompactUnwindEncoding = 0;
+    uint64_t CompactUnwindEncoding = 0;
 
     unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
     unsigned InstrOffset = 0;
     unsigned StackAdjust = 0;
-    unsigned StackSize = 0;
-    int MinAbsOffset = std::numeric_limits<int>::max();
+    uint64_t StackSize = 0;
+    int64_t MinAbsOffset = std::numeric_limits<int64_t>::max();
 
     for (const MCCFIInstruction &Inst : Instrs) {
       switch (Inst.getOperation()) {
@@ -1376,7 +1372,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         memset(SavedRegs, 0, sizeof(SavedRegs));
         StackAdjust = 0;
         SavedRegIdx = 0;
-        MinAbsOffset = std::numeric_limits<int>::max();
+        MinAbsOffset = std::numeric_limits<int64_t>::max();
         InstrOffset += MoveInstrSize;
         break;
       }
@@ -1419,7 +1415,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
         SavedRegs[SavedRegIdx++] = Reg;
         StackAdjust += OffsetSize;
-        MinAbsOffset = std::min(MinAbsOffset, abs(Inst.getOffset()));
+        MinAbsOffset = std::min(MinAbsOffset, std::abs(Inst.getOffset()));
         InstrOffset += PushInstrSize(Reg);
         break;
       }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 3cd0af0c7f546..6553e1cc4a930 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -691,9 +691,12 @@ void X86MCCodeEmitter::emitMemModRMByte(
 
   unsigned BaseRegNo = BaseReg ? getX86RegNum(Base) : -1U;
 
+  bool IsAdSize16 = STI.hasFeature(X86::Is32Bit) &&
+                    (TSFlags & X86II::AdSizeMask) == X86II::AdSize16;
+
   // 16-bit addressing forms of the ModR/M byte have a different encoding for
   // the R/M field and are far more limited in which registers can be used.
-  if (X86_MC::is16BitMemOperand(MI, Op, STI)) {
+  if (IsAdSize16 || X86_MC::is16BitMemOperand(MI, Op, STI)) {
     if (BaseReg) {
       // For 32-bit addressing, the row and column values in Table 2-2 are
       // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 96a7823b04ad8..e166b68668d9b 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -114,8 +114,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T,
 /// Implements X86-only directives for assembly emission.
 MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S,
                                              formatted_raw_ostream &OS,
-                                             MCInstPrinter *InstPrinter,
-                                             bool IsVerboseAsm);
+                                             MCInstPrinter *InstPrinter);
 
 /// Implements X86-only directives for object files.
 MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
@@ -128,8 +127,7 @@ MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
 MCStreamer *createX86WinCOFFStreamer(MCContext &C,
                                      std::unique_ptr<MCAsmBackend> &&AB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&CE,
-                                     bool IncrementalLinkerCompatible);
+                                     std::unique_ptr<MCCodeEmitter> &&CE);
 
 MCStreamer *createX86ELFStreamer(const Triple &T, MCContext &Context,
                                  std::unique_ptr<MCAsmBackend> &&MAB,
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index b1e5362c5d24b..1ef10928c05d8 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -72,14 +72,9 @@ void X86WinCOFFStreamer::finishImpl() {
 }
 } // namespace
 
-MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
-                                           std::unique_ptr<MCAsmBackend> &&AB,
-                                           std::unique_ptr<MCObjectWriter> &&OW,
-                                           std::unique_ptr<MCCodeEmitter> &&CE,
-                                           bool IncrementalLinkerCompatible) {
-  X86WinCOFFStreamer *S =
-      new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
-  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
-  return S;
+MCStreamer *
+llvm::createX86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> &&AB,
+                               std::unique_ptr<MCObjectWriter> &&OW,
+                               std::unique_ptr<MCCodeEmitter> &&CE) {
+  return new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
 }
-
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index 19427617a3d3e..7f8da407f8543 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -445,8 +445,7 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
 
 MCTargetStreamer *llvm::createX86AsmTargetStreamer(MCStreamer &S,
                                                    formatted_raw_ostream &OS,
-                                                   MCInstPrinter *InstPrinter,
-                                                   bool IsVerboseAsm) {
+                                                   MCInstPrinter *InstPrinter) {
   // FIXME: This makes it so we textually assemble COFF directives on ELF.
   // That's kind of nonsensical.
   return new X86WinCOFFAsmTargetStreamer(S, OS, *InstPrinter);
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 3395a13545e45..957eb21cf4f8c 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGenTypes/MachineValueType.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -66,11 +67,10 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
       *Subtarget->getInstrInfo(), MF.getContext()));
 
-  EmitFPOData =
-      Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag();
+  const Module *M = MF.getFunction().getParent();
+  EmitFPOData = Subtarget->isTargetWin32() && M->getCodeViewFlag();
 
-  IndCSPrefix =
-      MF.getMMI().getModule()->getModuleFlag("indirect_branch_cs_prefix");
+  IndCSPrefix = M->getModuleFlag("indirect_branch_cs_prefix");
 
   SetupMachineFunction(MF);
 
@@ -976,6 +976,33 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
   }
 }
 
+/// True if this module is being built for windows/msvc, and uses floating
+/// point. This is used to emit an undefined reference to _fltused. This is
+/// needed in Windows kernel or driver contexts to find and prevent code from
+/// modifying non-GPR registers.
+///
+/// TODO: It would be better if this was computed from MIR by looking for
+/// selected floating-point instructions.
+static bool usesMSVCFloatingPoint(const Triple &TT, const Module &M) {
+  // Only needed for MSVC
+  if (!TT.isWindowsMSVCEnvironment())
+    return false;
+
+  for (const Function &F : M) {
+    for (const Instruction &I : instructions(F)) {
+      if (I.getType()->isFPOrFPVectorTy())
+        return true;
+
+      for (const auto &Op : I.operands()) {
+        if (Op->getType()->isFPOrFPVectorTy())
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
   const Triple &TT = TM.getTargetTriple();
 
@@ -994,7 +1021,7 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
     // safe to set.
     OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   } else if (TT.isOSBinFormatCOFF()) {
-    if (MMI->usesMSVCFloatingPoint()) {
+    if (usesMSVCFloatingPoint(TT, M)) {
       // In Windows' libcmt.lib, there is a file which is linked in only if the
       // symbol _fltused is referenced. Linking this in causes some
       // side-effects:
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 9ec68bfb8e0f7..c55ff3dfc9c8e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[
     CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
     CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
 
-    // MMX type gets 8 byte slot in stack , while alignment depends on target
-    CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
-    CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
     // float 128 get stack slots whose size and alignment depends 
     // on the subtarget.
     CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
@@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[
   CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
             CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
 
-  // MMX vector types are always returned in MM0. If the target doesn't have
-  // MM0, it doesn't support these vector types.
-  CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
-
   // Long double types are always returned in FP0 (even with SSE),
   // except on Win64.
   CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
@@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
   CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
 
-  // MMX vector types are always returned in XMM0.
-  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
-
   // Pointers are always returned in full 64-bit registers.
   CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
 
@@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[
 
 // X86-Win64 C return-value convention.
 def RetCC_X86_Win64_C : CallingConv<[
-  // The X86-Win64 calling convention always returns __m64 values in RAX.
-  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
   // GCC returns FP values in RAX on Win64.
   CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
   CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
@@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
   CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
-  // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
-  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
@@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[
 
   CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
 
-  // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
-  CCIfType<[x86mmx],
-            CCIfSubtarget<"isTargetDarwin()",
-            CCIfSubtarget<"hasSSE2()",
-            CCPromoteToType<v2i64>>>>,
-
   // Boolean vectors of AVX-512 are passed in SIMD registers.
   // The call from AVX to AVX-512 function should work,
   // since the boolean types in AVX/AVX2 are promoted by default.
@@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[
   // Long doubles are passed by pointer
   CCIfType<[f80], CCPassIndirect<i64>>,
 
-  // The first 4 MMX vector arguments are passed in GPRs.
-  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
   // If SSE was disabled, pass FP values smaller than 64-bits as integers in
   // GPRs or on the stack.
   CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
@@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[
 
   CCIfNotVarArg<CCIfInReg<CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
 
-  // The first 3 __m64 vector arguments are passed in mmx registers if the
-  // call is not a vararg call.
-  CCIfNotVarArg<CCIfType<[x86mmx],
-                CCAssignToReg<[MM0, MM1, MM2]>>>,
-
   CCIfType<[f16], CCAssignToStack<4, 4>>,
 
   // Integer/Float values get stored in stack slots that are 4 bytes in
@@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[
   CCIfType<[v32i1], CCPromoteToType<v32i8>>,
   CCIfType<[v64i1], CCPromoteToType<v64i8>>,
 
-  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
-  // passed in the parameter area.
-  CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
   // Darwin passes vectors in a form that differs from the i386 psABI
   CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
 
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 8a263e1a0678b..bdc9a0d29670a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -466,15 +466,14 @@ void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
     emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
     return;
   }
-  const MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   const Register FramePtr = TRI->getFrameRegister(MF);
   const Register MachineFramePtr =
       STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
                                : FramePtr;
   unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
   // Offset = space for return address + size of the frame pointer itself.
-  unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+  int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
   BuildCFI(MBB, MBBI, DebugLoc{},
            MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
   emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
@@ -485,8 +484,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
     const DebugLoc &DL, bool IsPrologue) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
   // Add callee saved registers to move list.
@@ -1532,7 +1530,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const Function &Fn = MF.getFunction();
-  MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
   uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
@@ -1547,8 +1544,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   bool IsWin64Prologue = isWin64Prologue(MF);
   bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
   // FIXME: Emit FPO data for EH funclets.
-  bool NeedsWinFPO =
-      !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
+  bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() &&
+                     MF.getFunction().getParent()->getCodeViewFlag();
   bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
   bool NeedsDwarfCFI = needsDwarfCFI(MF);
   Register FramePtr = TRI->getFrameRegister(MF);
@@ -2556,7 +2553,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!HasFP && NeedsDwarfCFI) {
     MBBI = FirstCSPop;
-    int64_t Offset = -CSSize - SlotSize;
+    int64_t Offset = -(int64_t)CSSize - SlotSize;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
@@ -3523,7 +3520,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
 
   // HiPE-specific values
   NamedMDNode *HiPELiteralsMD =
-      MF.getMMI().getModule()->getNamedMetadata("hipe.literals");
+      MF.getFunction().getParent()->getNamedMetadata("hipe.literals");
   if (!HiPELiteralsMD)
     report_fatal_error(
         "Can't generate HiPE prologue without runtime parameters");
@@ -3882,8 +3879,7 @@ bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
   // If we may need to emit frameless compact unwind information, give
   // up as this is currently broken: PR25614.
   bool CompactUnwind =
-      MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
-      nullptr;
+      MF.getContext().getObjectFileInfo()->getCompactUnwindSection() != nullptr;
   return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
           !CompactUnwind) &&
          // The lowering of segmented stack and HiPE only support entry
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index c91bd576dc9f6..74804e5c9783d 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -922,7 +922,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
       if (Imm == EndbrImm || isEndbrImm64(Imm)) {
         // Check that the cf-protection-branch is enabled.
         Metadata *CFProtectionBranch =
-          MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
+            MF->getFunction().getParent()->getModuleFlag(
+                "cf-protection-branch");
         if (CFProtectionBranch || IndirectBranchTracking) {
           SDLoc dl(N);
           SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 881e06e5f78b4..73405397aa6e8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -849,8 +849,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
-    setOperationAction(ISD::LROUND, MVT::f80, Expand);
-    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+    setOperationAction(ISD::LROUND, MVT::f80, LibCall);
+    setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
     setOperationAction(ISD::LRINT, MVT::f80, Custom);
     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
 
@@ -2475,8 +2475,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
     // clang-format off
    for (ISD::NodeType Op :
-         {ISD::FCEIL,  ISD::STRICT_FCEIL,
+         {ISD::FACOS,  ISD::STRICT_FACOS,
+          ISD::FASIN,  ISD::STRICT_FASIN,
+          ISD::FATAN,  ISD::STRICT_FATAN,
+          ISD::FCEIL,  ISD::STRICT_FCEIL,
           ISD::FCOS,   ISD::STRICT_FCOS,
+          ISD::FCOSH,  ISD::STRICT_FCOSH,
           ISD::FEXP,   ISD::STRICT_FEXP,
           ISD::FFLOOR, ISD::STRICT_FFLOOR,
           ISD::FREM,   ISD::STRICT_FREM,
@@ -2484,7 +2488,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
           ISD::FLOG10, ISD::STRICT_FLOG10,
           ISD::FPOW,   ISD::STRICT_FPOW,
           ISD::FSIN,   ISD::STRICT_FSIN,
-          ISD::FTAN,   ISD::STRICT_FTAN})
+          ISD::FSINH,  ISD::STRICT_FSINH,
+          ISD::FTAN,   ISD::STRICT_FTAN,
+          ISD::FTANH,  ISD::STRICT_FTANH})
       if (isOperationExpand(Op, MVT::f32))
         setOperationAction(Op, MVT::f32, Promote);
   // clang-format on
@@ -2554,7 +2560,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FP_EXTEND,
                        ISD::STRICT_FP_EXTEND,
                        ISD::FP_ROUND,
-                       ISD::STRICT_FP_ROUND});
+                       ISD::STRICT_FP_ROUND,
+                       ISD::INTRINSIC_VOID,
+                       ISD::INTRINSIC_WO_CHAIN,
+                       ISD::INTRINSIC_W_CHAIN});
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -3432,12 +3441,9 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
   if (BaseCost >= 0 && Subtarget.hasCCMP())
     BaseCost += BrMergingCcmpBias;
   // a == b && a == c is a fast pattern on x86.
-  ICmpInst::Predicate Pred;
   if (BaseCost >= 0 && Opc == Instruction::And &&
-      match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
-      Pred == ICmpInst::ICMP_EQ &&
-      match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
-      Pred == ICmpInst::ICMP_EQ)
+      match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
+      match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
     BaseCost += 1;
   return {BaseCost, BrMergingLikelyBias.getValue(),
           BrMergingUnlikelyBias.getValue()};
@@ -25446,9 +25452,8 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
 
   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
   // registration, or the .set_setframe offset.
-  MCSymbol *OffsetSym =
-      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
-          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
+  MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
+      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   SDValue ParentFrameOffset =
       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -26340,7 +26345,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     MachineFunction &MF = DAG.getMachineFunction();
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
-    auto &Context = MF.getMMI().getContext();
+    auto &Context = MF.getContext();
     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                             Twine(MF.getFunctionNumber()));
     return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
@@ -26352,7 +26357,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     MachineFunction &MF = DAG.getMachineFunction();
     SDValue Op1 = Op.getOperand(1);
     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
-    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+    MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
 
     // Generate a simple absolute symbol reference. This intrinsic is only
@@ -27270,6 +27275,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       llvm_unreachable("Unsupported truncstore intrinsic");
     }
   }
+  case INTR_TYPE_CAST_MMX:
+    return SDValue(); // handled in combineINTRINSIC_*
   }
 }
 
@@ -29096,6 +29103,20 @@ uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
   llvm_unreachable("Unsupported GFNI opcode");
 }
 
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT,
+                        unsigned Amt = 0) {
+  assert(VT.getVectorElementType() == MVT::i8 &&
+         (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
+  uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
+  SmallVector<SDValue> MaskBits;
+  for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
+    uint64_t Bits = (Imm >> (I % 64)) & 255;
+    MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
+  }
+  return DAG.getBuildVector(VT, DL, MaskBits);
+}
+
 // Return true if the required (according to Opcode) shift-imm form is natively
 // supported by the Subtarget
 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29284,9 +29305,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
       return SDValue();
 
     if (Subtarget.hasGFNI()) {
-      uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
-      MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
-      SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
+      SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
       return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
                          DAG.getTargetConstant(0, dl, MVT::i8));
     }
@@ -30191,9 +30210,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
     uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
-    uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
-    MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
+    SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
                        DAG.getTargetConstant(0, DL, MVT::i8));
   }
@@ -30746,10 +30763,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
     if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
     if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
-        return Pred == CmpInst::ICMP_SLT;
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
-        return Pred == CmpInst::ICMP_SGT;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+        return true;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+        return true;
     }
     return false;
   }
@@ -30757,10 +30776,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
     if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
     if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
-        return Pred == CmpInst::ICMP_SLT;
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
-        return Pred == CmpInst::ICMP_SGT;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+        return true;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+        return true;
     }
     return false;
   }
@@ -30771,18 +30792,21 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
     if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
              Pred == CmpInst::ICMP_SLT;
-    if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
-      return Pred == CmpInst::ICMP_SGT;
+    if (match(I->user_back(),
+              m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+      return true;
     return false;
   }
   if (Opc == AtomicRMWInst::Xor) {
     if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
     if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
-        return Pred == CmpInst::ICMP_SLT;
-      if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
-        return Pred == CmpInst::ICMP_SGT;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+        return true;
+      if (match(I->user_back(),
+                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+        return true;
     }
     return false;
   }
@@ -31528,10 +31552,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
 
   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
   if (Subtarget.hasGFNI()) {
-    MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
-    SDValue Matrix =
-        DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
-    Matrix = DAG.getBitcast(VT, Matrix);
+    SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
                        DAG.getTargetConstant(0, DL, MVT::i8));
   }
@@ -35876,7 +35897,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
     MIB.addMBB(restoreMBB);
   MIB.setMemRefs(MMOs);
 
-  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
     emitSetJmpShadowStackFix(MI, thisMBB);
   }
 
@@ -36152,7 +36173,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   MachineBasicBlock *thisMBB = MBB;
 
   // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
-  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
   }
 
@@ -43309,6 +43330,9 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
 
   switch (Op.getOpcode()) {
+  // SSE vector multiplies are either inbounds or saturate.
+  case X86ISD::VPMADDUBSW:
+  case X86ISD::VPMADDWD:
   // SSE vector shifts handle out of bounds shift amounts.
   case X86ISD::VSHLI:
   case X86ISD::VSRLI:
@@ -43319,6 +43343,16 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
     return false;
+  case ISD::INTRINSIC_WO_CHAIN:
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_pmadd_wd:
+    case Intrinsic::x86_avx2_pmadd_wd:
+    case Intrinsic::x86_avx512_pmaddw_d_512:
+    case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+    case Intrinsic::x86_avx2_pmadd_ub_sw:
+    case Intrinsic::x86_avx512_pmaddubs_w_512:
+      return false;
+    }
   }
   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
@@ -53473,13 +53507,6 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   if (SDValue Not = IsNOT(N0, DAG))
     return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
 
-  // Fold for better commutativity:
-  // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
-  if (N1->hasOneUse())
-    if (SDValue Not = IsNOT(N1, DAG))
-      return DAG.getNOT(
-          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
-
   // Constant Folding
   APInt Undefs0, Undefs1;
   SmallVector<APInt> EltBits0, EltBits1;
@@ -53557,6 +53584,28 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Folds for better commutativity:
+  if (N1->hasOneUse()) {
+    // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
+    if (SDValue Not = IsNOT(N1, DAG))
+      return DAG.getNOT(
+          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
+
+    // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
+    // Zero out elements by setting the PSHUFB mask value to 0xFF.
+    if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
+      SDValue BC1 = peekThroughOneUseBitcasts(N1);
+      if (BC1.getOpcode() == X86ISD::PSHUFB) {
+        EVT ShufVT = BC1.getValueType();
+        SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
+                                      DAG.getBitcast(ShufVT, N0));
+        SDValue NewShuf =
+            DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
+        return DAG.getBitcast(VT, NewShuf);
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -56159,6 +56208,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       SmallVector<SDValue> Subs;
       for (SDValue SubOp : SubOps)
         Subs.push_back(SubOp.getOperand(I));
+      // Attempt to peek through bitcasts and concat the original subvectors.
+      EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
+      if (SubVT.isSimple() && SubVT.isVector()) {
+        EVT ConcatVT =
+            EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(),
+                             SubVT.getVectorElementCount() * Subs.size());
+        for (SDValue &Sub : Subs)
+          Sub = DAG.getBitcast(SubVT, Sub);
+        return DAG.getBitcast(
+            VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
+      }
       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
     };
     auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
@@ -57716,6 +57776,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
+// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
+// use x86mmx instead.
+static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
+  SDLoc dl(N);
+
+  bool MadeChange = false, CastReturnVal = false;
+  SmallVector<SDValue, 8> Args;
+  for (const SDValue &Arg : N->op_values()) {
+    if (Arg.getValueType() == MVT::v1i64) {
+      MadeChange = true;
+      Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
+    } else
+      Args.push_back(Arg);
+  }
+  SDVTList VTs = N->getVTList();
+  SDVTList NewVTs = VTs;
+  if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
+    SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
+    NewVTArr[0] = MVT::x86mmx;
+    NewVTs = DAG.getVTList(NewVTArr);
+    MadeChange = true;
+    CastReturnVal = true;
+  }
+
+  if (MadeChange) {
+    SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
+    if (CastReturnVal) {
+      SmallVector<SDValue, 2> Returns;
+      for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
+        Returns.push_back(Result.getValue(i));
+      Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
+      return DAG.getMergeValues(Returns, dl);
+    }
+    return Result;
+  }
+  return SDValue();
+}
+static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(0);
+  const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
+static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(1);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
+static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(1);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -57906,7 +58046,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
-  // clang-format on
+  case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
+  case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
+  case ISD::INTRINSIC_VOID:  return combineINTRINSIC_VOID(N, DAG, DCI);
+    // clang-format on
   }
 
   return SDValue();
@@ -57976,7 +58119,7 @@ SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
                                                   SDValue Value, SDValue Addr,
                                                   int JTI,
                                                   SelectionDAG &DAG) const {
-  const Module *M = DAG.getMachineFunction().getMMI().getModule();
+  const Module *M = DAG.getMachineFunction().getFunction().getParent();
   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
   if (IsCFProtectionSupported) {
     // In case control-flow branch protection is enabled, we need to add
@@ -58358,7 +58501,7 @@ X86TargetLowering::getSingleConstraintMatchWeight(
       Wt = CW_SpecificReg;
     break;
   case 'y':
-    if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
+    if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
       Wt = CW_SpecificReg;
     break;
   case 'Y':
@@ -58381,8 +58524,8 @@ X86TargetLowering::getSingleConstraintMatchWeight(
       return CW_Invalid;
     // Any MMX reg
     case 'm':
-      if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
-        return Wt;
+      if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
+        return CW_SpecificReg;
       return CW_Invalid;
     // Any SSE reg when ISA >= SSE2, same as 'x'
     case 'i':
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 3c49b6ea8ec7b..f659c168b86e0 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2016,7 +2016,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
   bool IsCFICall = IsIndirectCall && CLI.CFIType;
-  const Module *M = MF.getMMI().getModule();
+  const Module *M = MF.getFunction().getParent();
   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
 
   MachineFunction::CallSiteInfo CSInfo;
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 381286a3bbfc0..7740a174af4f3 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -116,7 +116,7 @@ static bool needsPrologueENDBR(MachineFunction &MF, const Module *M) {
 bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
 
-  const Module *M = MF.getMMI().getModule();
+  const Module *M = MF.getFunction().getParent();
   // Check that the cf-protection-branch is enabled.
   Metadata *isCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
 
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 322cb6f6f5819..793d62ba2a8e7 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
   if (isa<UndefValue>(Arg))
     return Constant::getNullValue(ResTy);
 
-  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
-  // We can't easily peek through x86_mmx types.
-  if (!ArgTy)
+  // Preserve previous behavior and give up.
+  // TODO: treat as <8 x i8>.
+  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
     return nullptr;
 
+  auto *ArgTy = cast<FixedVectorType>(Arg->getType());
+
   // Expand MOVMSK to compare/bitcast/zext:
   // e.g. PMOVMSKB(v16i8 x):
   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index 493adbb5b9d41..423ee0e8c9bda 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -157,6 +157,14 @@ def : InstAlias<"ctest"#Cond#"{l} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST32mr i32mem:$src1, GR32:$src2, cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{q} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST64mr i64mem:$src1, GR64:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{b} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST8mr  i8mem:$src1,  GR8:$src2,  cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{w} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST16mr i16mem:$src1, GR16:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{l} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST32mr i32mem:$src1, GR32:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{q} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST64mr i64mem:$src1, GR64:$src2, cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{b} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST8ri  GR8:$src1,  i8imm:$src2,  cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{w} $dcf\t{$src2, $src1|$src1, $src2}",
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index fab7c167e385f..7fc786b1e570b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3121,9 +3121,9 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
 
 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
   unsigned Opcode = MCID.getOpcode();
-  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) ||
-        X86::isCFCMOVCC(Opcode) || X86::isCCMPCC(Opcode) ||
-        X86::isCTESTCC(Opcode)))
+  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
+        X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
+        X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
     return -1;
   // Assume that condition code is always the last use operand.
   unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
@@ -3145,8 +3145,9 @@ X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
 }
 
 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
-  return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
-                                      : X86::COND_INVALID;
+  return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
+             ? X86::getCondFromMI(MI)
+             : X86::COND_INVALID;
 }
 
 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
@@ -5588,7 +5589,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(nullptr, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return nullptr;
 
   // Collect information about virtual register operands of MI.
@@ -10468,6 +10469,7 @@ enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
 
 std::optional<outliner::OutlinedFunction>
 X86InstrInfo::getOutliningCandidateInfo(
+    const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
   unsigned SequenceSize = 0;
   for (auto &MI : RepeatedSequenceLocs[0]) {
@@ -10544,7 +10546,8 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(
 }
 
 outliner::InstrType
-X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
+X86InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                   MachineBasicBlock::iterator &MIT,
                                    unsigned Flags) const {
   MachineInstr &MI = *MIT;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index eaa3dd0893948..3100a9e5699f0 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -585,13 +585,15 @@ class X86InstrInfo final : public X86GenInstrInfo {
   getSerializableDirectMachineOperandTargetFlags() const override;
 
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
 
   bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
                                    bool OutlineFromLinkOnceODRs) const override;
 
-  outliner::InstrType
-  getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
+  outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI,
+                                           MachineBasicBlock::iterator &MIT,
+                                           unsigned Flags) const override;
 
   void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
                           const outliner::OutlinedFunction &OF) const override;
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 8d6bc8d0ee2cf..60dfe66960507 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -576,3 +576,13 @@ def : Pat<(x86mmx (MMX_X86movdq2q
                    (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
           (MMX_CVTTPD2PIrr VR128:$src)>;
 }
+
+let AddedComplexity = 400 in { // Prefer non-temporal version
+
+// Non-temporal store (no alignment required).
+let Predicates = [HasMMX] in {
+  def : Pat<(nontemporalstore (x86mmx VR64:$src), addr:$dst),
+            (MMX_MOVNTQmr addr:$dst, VR64:$src)>;
+}
+
+} // AddedComplexity
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 717541cf6c559..685daca360e08 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -73,7 +73,8 @@ enum IntrinsicType : uint16_t {
   GATHER_AVX2,
   ROUNDP,
   ROUNDS,
-  RDPRU
+  RDPRU,
+  INTR_TYPE_CAST_MMX
 };
 
 struct IntrinsicData {
@@ -323,6 +324,8 @@ static const IntrinsicData IntrinsicsWithChain[] = {
     X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
     X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
     X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+    X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
     X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0),
     X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
@@ -1495,6 +1498,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
                        0),
+
+    X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pavg_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0),
+
     X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
     X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
     X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
@@ -1503,8 +1575,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
     X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
     X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+    X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
     X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+    X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
     X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
     X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
@@ -1512,6 +1590,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
     X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
     X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+    X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
     X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
     X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
@@ -1593,14 +1672,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
     X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
     X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+    X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW,
                        0),
+    X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+    X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
     X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
     X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index df20ecd1b9b21..77ddd2366e629 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -147,7 +147,7 @@ X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
       AsmPrinter(asmprinter) {}
 
 MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
-  return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>();
 }
 
 /// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
@@ -203,7 +203,7 @@ MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
     break;
   case X86II::MO_COFFSTUB: {
     MachineModuleInfoCOFF &MMICOFF =
-        MF.getMMI().getObjFileInfo<MachineModuleInfoCOFF>();
+        AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoCOFF>();
     MachineModuleInfoImpl::StubValueTy &StubSym = MMICOFF.getGVStubEntry(Sym);
     if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index ee5fadf470dde..1d1885a3dcd24 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -45,7 +45,7 @@ using namespace llvm;
 #define DEBUG_TYPE "tile-pre-config"
 
 static void emitErrorMsg(MachineFunction &MF) {
-  LLVMContext &Context = MF.getMMI().getModule()->getContext();
+  LLVMContext &Context = MF.getFunction().getContext();
   Context.emitError(
       MF.getName() +
       ": Failed to config tile register, please define the shape earlier");
diff --git a/llvm/lib/Target/X86/X86ReturnThunks.cpp b/llvm/lib/Target/X86/X86ReturnThunks.cpp
index fe89238f26f97..c40b4f371fb31 100644
--- a/llvm/lib/Target/X86/X86ReturnThunks.cpp
+++ b/llvm/lib/Target/X86/X86ReturnThunks.cpp
@@ -78,7 +78,7 @@ bool X86ReturnThunks::runOnMachineFunction(MachineFunction &MF) {
         Rets.push_back(&Term);
 
   bool IndCS =
-      MF.getMMI().getModule()->getModuleFlag("indirect_branch_cs_prefix");
+      MF.getFunction().getParent()->getModuleFlag("indirect_branch_cs_prefix");
   const MCInstrDesc &CS = ST.getInstrInfo()->get(X86::CS_PREFIX);
   const MCInstrDesc &JMP = ST.getInstrInfo()->get(X86::TAILJMPd);
 
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 63ac91028ac93..697d30a8a9548 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -792,10 +792,10 @@ def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
 def: InstRW<[BWWriteResGroup28], (instrs VPBROADCASTBrr,
                                          VPBROADCASTWrr)>;
 
-def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5]> {
   let Latency = 3;
-  let NumMicroOps = 3;
-  let ReleaseAtCycles = [2,1];
+  let NumMicroOps = 2;
+  let ReleaseAtCycles = [2];
 }
 def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr,
                                          MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 516dc62f1b6d5..c4d2ad7681c43 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1247,10 +1247,10 @@ def: InstRW<[HWWriteResGroup53_1], (instrs VPMOVSXBWYrm,
                                            VPMOVSXWDYrm,
                                            VPMOVZXWDYrm)>;
 
-def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5]> {
   let Latency = 3;
-  let NumMicroOps = 3;
-  let ReleaseAtCycles = [2,1];
+  let NumMicroOps = 2;
+  let ReleaseAtCycles = [2];
 }
 def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr,
                                          MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7b33aed6351c1..6966400eff2c0 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -638,7 +638,8 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
                                         MMX_PALIGNRrri,
                                         MMX_PSIGNBrr,
                                         MMX_PSIGNDrr,
-                                        MMX_PSIGNWrr)>;
+                                        MMX_PSIGNWrr,
+                                        MMX_PSUBQrr)>;
 
 def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
   let Latency = 2;
@@ -898,7 +899,8 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
   let NumMicroOps = 2;
   let ReleaseAtCycles = [1,1];
 }
-def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>;
+def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm,
+                                         MMX_PSUBQrm)>;
 
 def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
   let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 3ee931fe5ed9c..d764cb3fe3130 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -404,7 +404,7 @@ defm : SKLWriteResPair<WritePSADBW,  [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW
 defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
 defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
 defm : X86WriteResPairUnsupported<WritePSADBWZ>;
-defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
 
 // Vector integer shifts.
 defm : SKLWriteResPair<WriteVecShift,     [SKLPort0], 1, [1], 1, 5>;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 4e8e04b1112c0..7d5af2ede99cf 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -279,6 +279,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
         FullFS += ",+evex512";
   }
 
+  // Disable 64-bit only features in non-64-bit mode.
+  SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+      "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
+  if (FullFS.find("-64bit-mode") != std::string::npos)
+    llvm::for_each(FeaturesIn64BitOnly,
+                   [&](StringRef F) { FullFS += ",-" + F.str(); });
+
   // Parse features string and set the CPU.
   ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
 
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 4c77f40fd32a3..d2d59ff3b93cf 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -676,8 +676,10 @@ std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
 }
 
 static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
-                                      const TargetRegisterClass &RC) {
-  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC);
+                                      const MachineRegisterInfo &MRI,
+                                      const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(RC);
 }
 
 bool X86PassConfig::addRegAssignAndRewriteOptimized() {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 32a3683355b72..02267c14896c9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3590,22 +3590,26 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { X86ISD::VROTLI,  MVT::v64i8,   {  2,  9,  3,  4 } },
     { X86ISD::VROTLI,  MVT::v32i8,   {  1,  9,  3,  4 } },
     { X86ISD::VROTLI,  MVT::v16i8,   {  1,  8,  3,  4 } },
-    { ISD::SADDSAT,    MVT::v32i16,  {  1 } },
-    { ISD::SADDSAT,    MVT::v64i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
-    { ISD::SSUBSAT,    MVT::v32i16,  {  1 } },
-    { ISD::SSUBSAT,    MVT::v64i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v32i16,  {  1 } },
-    { ISD::UADDSAT,    MVT::v64i8,   {  1 } },
+    { ISD::SMULO,      MVT::v32i16,  {  3,  6,  4,  4 } },
+    { ISD::SMULO,      MVT::v64i8,   {  8, 21, 17, 18 } },
+    { ISD::UMULO,      MVT::v32i16,  {  2,  5,  3,  3 } },
+    { ISD::UMULO,      MVT::v64i8,   {  8, 15, 15, 16 } },
+    { ISD::SSUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::SSUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v32i16,  {  1 } },
-    { ISD::USUBSAT,    MVT::v64i8,   {  1 } },
+    { ISD::USUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::USUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
   };
   static const CostKindTblEntry AVX512CostTbl[] = {
     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
@@ -3654,6 +3658,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { X86ISD::VROTLI,  MVT::v16i32,  {  1,  1,  1,  1 } },
     { X86ISD::VROTLI,  MVT::v8i32,   {  1,  1,  1,  1 } },
     { X86ISD::VROTLI,  MVT::v4i32,   {  1,  1,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   {  3,  3,  8,  9 } },
+    { ISD::SADDSAT,    MVT::v4i64,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v8i64,   {  3,  3,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v8i32,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v16i32,  {  3,  3,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::SADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
@@ -3666,6 +3678,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
+    { ISD::SMULO,      MVT::v8i64,   { 44, 44, 81, 93 } },
+    { ISD::SMULO,      MVT::v16i32,  {  5, 12,  9, 11 } },
+    { ISD::SMULO,      MVT::v32i16,  {  6, 12, 17, 17 } },
+    { ISD::SMULO,      MVT::v64i8,   { 22, 28, 42, 42 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   {  2, 13,  9, 10 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   {  2, 15,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v8i64,   {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v8i32,   {  2, 15,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v16i32,  {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::SSUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
@@ -3678,22 +3702,25 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v16i32,  {  2 } }, // pmaxud + psubd
-    { ISD::USUBSAT,    MVT::v2i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::USUBSAT,    MVT::v4i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::USUBSAT,    MVT::v8i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::UADDSAT,    MVT::v16i32,  {  3 } }, // not + pminud + paddd
-    { ISD::UADDSAT,    MVT::v2i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::UADDSAT,    MVT::v4i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::UADDSAT,    MVT::v8i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::SADDSAT,    MVT::v32i16,  {  2 } },
-    { ISD::SADDSAT,    MVT::v64i8,   {  2 } },
-    { ISD::SSUBSAT,    MVT::v32i16,  {  2 } },
-    { ISD::SSUBSAT,    MVT::v64i8,   {  2 } },
-    { ISD::UADDSAT,    MVT::v32i16,  {  2 } },
-    { ISD::UADDSAT,    MVT::v64i8,   {  2 } },
-    { ISD::USUBSAT,    MVT::v32i16,  {  2 } },
-    { ISD::USUBSAT,    MVT::v64i8,   {  2 } },
+    { ISD::UMULO,      MVT::v8i64,   { 52, 52, 95, 104} },
+    { ISD::UMULO,      MVT::v16i32,  {  5, 12,  8, 10 } },
+    { ISD::UMULO,      MVT::v32i16,  {  5, 13, 16, 16 } },
+    { ISD::UMULO,      MVT::v64i8,   { 18, 24, 30, 30 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v8i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  1,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v8i32,   {  1,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v16i32,  {  2,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::UADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v8i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v8i32,   {  1,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v16i32,  {  1,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::FMAXNUM,    MVT::f32,     {  2,  2,  3,  3 } },
     { ISD::FMAXNUM,    MVT::v4f32,   {  1,  1,  3,  3 } },
     { ISD::FMAXNUM,    MVT::v8f32,   {  2,  2,  3,  3 } },
@@ -3797,8 +3824,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
     { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
     { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
-    { ISD::SADDSAT,    MVT::v16i16,  {  1 } },
-    { ISD::SADDSAT,    MVT::v32i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   {  4, 13,  8, 11 } },
+    { ISD::SADDSAT,    MVT::v4i64,   {  3, 10,  8, 12 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  2,  6,  7,  9 } },
+    { ISD::SADDSAT,    MVT::v8i32,   {  4,  6,  7, 13 } },
+    { ISD::SADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::SADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
@@ -3809,11 +3840,25 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
     { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
     { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
-    { ISD::SSUBSAT,    MVT::v16i16,  {  1 } },
-    { ISD::SSUBSAT,    MVT::v32i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v16i16,  {  1 } },
-    { ISD::UADDSAT,    MVT::v32i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v8i32,   {  3 } }, // not + pminud + paddd
+    { ISD::SMULO,      MVT::v4i64,   { 20, 20, 33, 37 } },
+    { ISD::SMULO,      MVT::v2i64,   {  8,  8, 13, 15 } },
+    { ISD::SMULO,      MVT::v8i32,   {  8, 20, 13, 24 } },
+    { ISD::SMULO,      MVT::v4i32,   {  5, 15, 11, 12 } },
+    { ISD::SMULO,      MVT::v16i16,  {  4, 14,  8, 14 } },
+    { ISD::SMULO,      MVT::v8i16,   {  3,  9,  6,  6 } },
+    { ISD::SMULO,      MVT::v32i8,   {  9, 15, 18, 35 } },
+    { ISD::SMULO,      MVT::v16i8,   {  6, 22, 14, 21 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   {  4, 13,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   {  4, 15,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  3, 14,  9, 11 } },
+    { ISD::SSUBSAT,    MVT::v8i32,   {  4, 15,  9, 16 } },
+    { ISD::SSUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::SSUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  2,  8,  6,  6 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  3,  8,  6, 10 } },
+    { ISD::UADDSAT,    MVT::v8i32,   {  2,  2,  4,  8 } },
+    { ISD::UADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::UADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
     { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
     { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
@@ -3824,9 +3869,19 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
     { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
     { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
-    { ISD::USUBSAT,    MVT::v16i16,  {  1 } },
-    { ISD::USUBSAT,    MVT::v32i8,   {  1 } },
-    { ISD::USUBSAT,    MVT::v8i32,   {  2 } }, // pmaxud + psubd
+    { ISD::UMULO,      MVT::v4i64,   { 24, 24, 39, 43 } },
+    { ISD::UMULO,      MVT::v2i64,   { 10, 10, 15, 19 } },
+    { ISD::UMULO,      MVT::v8i32,   {  8, 11, 13, 23 } },
+    { ISD::UMULO,      MVT::v4i32,   {  5, 12, 11, 12 } },
+    { ISD::UMULO,      MVT::v16i16,  {  4,  6,  8, 13 } },
+    { ISD::UMULO,      MVT::v8i16,   {  2,  8,  6,  6 } },
+    { ISD::UMULO,      MVT::v32i8,   {  9, 13, 17, 33 } },
+    { ISD::UMULO,      MVT::v16i8,   {  6, 19, 13, 20 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  2,  7,  6,  6 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  3,  7,  6, 10 } },
+    { ISD::USUBSAT,    MVT::v8i32,   {  2,  2,  2,  4 } },
+    { ISD::USUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::USUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::FMAXNUM,    MVT::f32,     {  2,  7,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  2,  7,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v8f32,   {  3,  7,  3,  6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
@@ -3883,8 +3938,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
     { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
     { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
-    { ISD::SADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v2i64,   {  6, 13,  8, 11 } },
+    { ISD::SADDSAT,    MVT::v4i64,   { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v8i32,   { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
     { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
@@ -3895,11 +3953,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SSUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SSUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v8i32,   {  8 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SMULO,      MVT::v4i64,   { 20, 20, 33, 37 } },
+    { ISD::SMULO,      MVT::v2i64,   {  9,  9, 13, 17 } },
+    { ISD::SMULO,      MVT::v8i32,   { 15, 20, 24, 29 } },
+    { ISD::SMULO,      MVT::v4i32,   {  7, 15, 11, 13 } },
+    { ISD::SMULO,      MVT::v16i16,  {  8, 14, 14, 15 } },
+    { ISD::SMULO,      MVT::v8i16,   {  3,  9,  6,  6 } },
+    { ISD::SMULO,      MVT::v32i8,   { 20, 20, 37, 39 } },
+    { ISD::SMULO,      MVT::v16i8,   {  9, 22, 18, 21 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   {  7, 13,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v8i32,   { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v2i64,   {  3,  8,  6,  6 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v8i32,   {  6,  6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
     { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
@@ -3910,9 +3981,20 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v8i32,   {  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UMULO,      MVT::v4i64,   { 24, 26, 39, 45 } },
+    { ISD::UMULO,      MVT::v2i64,   { 10, 12, 15, 20 } },
+    { ISD::UMULO,      MVT::v8i32,   { 14, 15, 23, 28 } },
+    { ISD::UMULO,      MVT::v4i32,   {  7, 12, 11, 13 } },
+    { ISD::UMULO,      MVT::v16i16,  {  7, 11, 13, 14 } },
+    { ISD::UMULO,      MVT::v8i16,   {  3,  8,  6,  6 } },
+    { ISD::UMULO,      MVT::v32i8,   { 19, 19, 35, 37 } },
+    { ISD::UMULO,      MVT::v16i8,   {  9, 19, 17, 20 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  3,  7,  6,  6 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v8i32,   {  4,  4,  7,  8 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v8i32,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::FMAXNUM,    MVT::f32,     {  3,  6,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  3,  6,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v8f32,   {  5,  7,  3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
@@ -3963,8 +4045,6 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
   };
   static const CostKindTblEntry SSE42CostTbl[] = {
-    { ISD::USUBSAT,    MVT::v4i32,   {  2 } }, // pmaxud + psubd
-    { ISD::UADDSAT,    MVT::v4i32,   {  3 } }, // not + pminud + paddd
     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  4,  4,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
@@ -3974,18 +4054,34 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   };
   static const CostKindTblEntry SSE41CostTbl[] = {
     { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
+    { ISD::SADDSAT,    MVT::v2i64,   { 10, 14, 17, 21 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  5, 11,  8, 10 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   { 12, 19, 25, 29 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 10, 12 } },
     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
     { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
+    { ISD::SMULO,      MVT::v2i64,   {  9, 11, 13, 17 } },
+    { ISD::SMULO,      MVT::v4i32,   { 20, 24, 13, 19 } },
+    { ISD::SMULO,      MVT::v8i16,   {  5,  9,  8,  8 } },
+    { ISD::SMULO,      MVT::v16i8,   { 13, 22, 24, 25 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  6, 13, 14, 14 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  2,  2,  4,  4 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  6, 10, 14, 14 } },
+    { ISD::USUBSAT,    MVT::v4i32,   {  1,  2,  2,  2 } },
     { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
     { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v2i64,   {  2, 11,  6,  7 } },
     { ISD::UMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
+    { ISD::UMULO,      MVT::v2i64,   { 14, 20, 15, 20 } },
+    { ISD::UMULO,      MVT::v4i32,   { 19, 22, 12, 18 } },
+    { ISD::UMULO,      MVT::v8i16,   {  4,  9,  7,  7 } },
+    { ISD::UMULO,      MVT::v16i8,   { 13, 19, 18, 20 } },
   };
   static const CostKindTblEntry SSSE3CostTbl[] = {
     { ISD::ABS,        MVT::v4i32,   {  1,  2,  1,  1 } },
@@ -4035,8 +4131,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
     { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
     { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
-    { ISD::SADDSAT,    MVT::v8i16,   {  1 } },
-    { ISD::SADDSAT,    MVT::v16i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   { 12, 14, 24, 24 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  6, 11, 11, 12 } },
+    { ISD::SADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
     { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
     { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
@@ -4045,10 +4143,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
     { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
-    { ISD::SSUBSAT,    MVT::v8i16,   {  1 } },
-    { ISD::SSUBSAT,    MVT::v16i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v8i16,   {  1 } },
-    { ISD::UADDSAT,    MVT::v16i8,   {  1 } },
+    { ISD::SMULO,      MVT::v2i64,   { 30, 33, 13, 23 } },
+    { ISD::SMULO,      MVT::v4i32,   { 20, 24, 23, 23 } },
+    { ISD::SMULO,      MVT::v8i16,   {  5, 10,  8,  8 } },
+    { ISD::SMULO,      MVT::v16i8,   { 13, 23, 24, 25 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   { 16, 19, 31, 31 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 12, 13 } },
+    { ISD::SSUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::SSUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  7, 13, 14, 14 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  4,  5,  7,  7 } },
+    { ISD::UADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
     { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
     { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
@@ -4057,8 +4163,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
     { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
     { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v8i16,   {  1 } },
-    { ISD::USUBSAT,    MVT::v16i8,   {  1 } },
+    { ISD::UMULO,      MVT::v2i64,   { 30, 33, 15, 29 } },
+    { ISD::UMULO,      MVT::v4i32,   { 19, 22, 14, 18 } },
+    { ISD::UMULO,      MVT::v8i16,   {  4,  9,  7,  7 } },
+    { ISD::UMULO,      MVT::v16i8,   { 13, 19, 20, 20 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  7, 10, 14, 14 } },
+    { ISD::USUBSAT,    MVT::v4i32,   {  4,  4,  7,  7 } },
+    { ISD::USUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::USUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } },
     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  6,  6,  6 } },
     { ISD::FSQRT,      MVT::f64,     { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
@@ -4103,17 +4215,22 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSR
     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
-    { ISD::ROTL,       MVT::i64,     {  2, 3, 1, 3 } },
-    { ISD::ROTR,       MVT::i64,     {  2, 3, 1, 3 } },
-    { X86ISD::VROTLI,  MVT::i64,     {  1, 1, 1, 1 } },
-    { ISD::FSHL,       MVT::i64,     {  4, 4, 1, 4 } },
+    { ISD::ROTL,       MVT::i64,     {  2,  3,  1,  3 } },
+    { ISD::ROTR,       MVT::i64,     {  2,  3,  1,  3 } },
+    { X86ISD::VROTLI,  MVT::i64,     {  1,  1,  1,  1 } },
+    { ISD::FSHL,       MVT::i64,     {  4,  4,  1,  4 } },
+    { ISD::SADDSAT,    MVT::i64,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i64,     {  4,  5,  8, 11 } },
+    { ISD::UADDSAT,    MVT::i64,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i64,     {  2,  3,  4,  7 } },
     { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
-    { ISD::SADDO,      MVT::i64,     {  1 } },
-    { ISD::UADDO,      MVT::i64,     {  1 } },
-    { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
+    { ISD::SADDO,      MVT::i64,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i64,     {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i64,     {  4,  4,  4,  6 } },
+    { ISD::UMULO,      MVT::i64,     {  8,  8,  4,  7 } },
   };
   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
     { ISD::ABS,        MVT::i32,     {  1,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
@@ -4151,6 +4268,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
     { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
     { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
+    { ISD::SADDSAT,    MVT::i32,     {  3,  4,  6,  9 } },
+    { ISD::SADDSAT,    MVT::i16,     {  4,  4,  7, 10 } },
+    { ISD::SADDSAT,    MVT::i8,      {  4,  5,  8, 11 } },
+    { ISD::SSUBSAT,    MVT::i32,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i16,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i8,      {  4,  5,  8, 11 } },
+    { ISD::UADDSAT,    MVT::i32,     {  2,  3,  4,  7 } },
+    { ISD::UADDSAT,    MVT::i16,     {  2,  3,  4,  7 } },
+    { ISD::UADDSAT,    MVT::i8,      {  3,  3,  5,  8 } },
+    { ISD::USUBSAT,    MVT::i32,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i16,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i8,      {  3,  3,  5,  8 } },
     { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
     { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
     { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
@@ -4163,15 +4292,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
     { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
     { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
-    { ISD::SADDO,      MVT::i32,     {  1 } },
-    { ISD::SADDO,      MVT::i16,     {  1 } },
-    { ISD::SADDO,      MVT::i8,      {  1 } },
-    { ISD::UADDO,      MVT::i32,     {  1 } },
-    { ISD::UADDO,      MVT::i16,     {  1 } },
-    { ISD::UADDO,      MVT::i8,      {  1 } },
-    { ISD::UMULO,      MVT::i32,     {  2 } }, // mul + seto
-    { ISD::UMULO,      MVT::i16,     {  2 } },
-    { ISD::UMULO,      MVT::i8,      {  2 } },
+    { ISD::SADDO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::SADDO,      MVT::i16,     {  2,  2,  4,  6 } },
+    { ISD::SADDO,      MVT::i8,      {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i16,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i8,      {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i16,     {  5,  5,  4,  6 } },
+    { ISD::SMULO,      MVT::i8,      {  6,  6,  4,  6 } },
+    { ISD::UMULO,      MVT::i32,     {  6,  6,  4,  8 } },
+    { ISD::UMULO,      MVT::i16,     {  6,  6,  4,  9 } },
+    { ISD::UMULO,      MVT::i8,      {  6,  6,  4,  6 } },
   };
 
   Type *RetTy = ICA.getReturnType();
@@ -4284,9 +4416,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     ISD = ISD::UADDO;
     OpTy = RetTy->getContainedType(0);
     break;
-  case Intrinsic::umul_with_overflow:
   case Intrinsic::smul_with_overflow:
-    // SMULO has same costs so don't duplicate.
+    ISD = ISD::SMULO;
+    OpTy = RetTy->getContainedType(0);
+    break;
+  case Intrinsic::umul_with_overflow:
     ISD = ISD::UMULO;
     OpTy = RetTy->getContainedType(0);
     break;
@@ -6756,3 +6890,8 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
     return AM.Scale != 0;
   return -1;
 }
+
+InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
+  // TODO: Hook MispredictPenalty of SchedMachineModel into this.
+  return 14;
+}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 5eccb1aea308d..b619090e8e1e0 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -294,6 +294,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
 
+  InstructionCost getBranchMispredictPenalty() const;
+
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index bc69e1868b88f..a3c48735a98a3 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -117,8 +117,7 @@ void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
-                                                 MCInstPrinter *InstPrint,
-                                                 bool isVerboseAsm) {
+                                                 MCInstPrinter *InstPrint) {
   return new XCoreTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index 8cb9413f96526..b3753692ac2a0 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -225,8 +225,7 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineModuleInfo *MMI = &MF.getMMI();
-  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
   const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   // Debug location must be unknown since the first debug location is used
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 87ef66ba742b6..2653c293dc0c4 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -66,7 +66,7 @@ createXtensaMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
 
 static MCTargetStreamer *
 createXtensaAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
-                              MCInstPrinter *InstPrint, bool isVerboseAsm) {
+                              MCInstPrinter *InstPrint) {
   return new XtensaTargetAsmStreamer(S, OS);
 }
 
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
index 445fd75e29f37..e24cb7714d364 100644
--- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
@@ -41,8 +41,7 @@ void XtensaFrameLowering::emitPrologue(MachineFunction &MF,
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   MCRegister SP = Xtensa::SP;
   MCRegister FP = TRI->getFrameRegister(MF);
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
 
   // First, compute final stack size.
   uint64_t StackSize = MFI.getStackSize();
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 4b5d582d57a42..8ec32f7410566 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMTargetParser
   CSKYTargetParser.cpp
   Host.cpp
   LoongArchTargetParser.cpp
+  PPCTargetParser.cpp
   RISCVISAInfo.cpp
   RISCVTargetParser.cpp
   SubtargetFeature.cpp
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 82c1731f58f0a..68aed69ee574b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -50,6 +50,11 @@
 #if defined(__sun__) && defined(__svr4__)
 #include <kstat.h>
 #endif
+#if defined(__GNUC__) || defined(__clang__)
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+#include <cpuid.h>
+#endif
+#endif
 
 #define DEBUG_TYPE "host-detection"
 
@@ -150,6 +155,7 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
       .Case("POWER8NVL", "pwr8")
       .Case("POWER9", "pwr9")
       .Case("POWER10", "pwr10")
+      .Case("POWER11", "pwr11")
       // FIXME: If we get a simulator or machine with the capabilities of
       // mcpu=future, we should revisit this and add the name reported by the
       // simulator/machine.
@@ -521,68 +527,15 @@ StringRef sys::detail::getHostCPUNameForBPF() {
 #endif
 }
 
-#if defined(__i386__) || defined(_M_IX86) || \
-    defined(__x86_64__) || defined(_M_X64)
-
-// The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max).
-// Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID
-// support. Consequently, for i386, the presence of CPUID is checked first
-// via the corresponding eflags bit.
-// Removal of cpuid.h header motivated by PR30384
-// Header cpuid.h and method __get_cpuid_max are not used in llvm, clang, openmp
-// or test-suite, but are used in external projects e.g. libstdcxx
-static bool isCpuIdSupported() {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__i386__)
-  int __cpuid_supported;
-  __asm__("  pushfl\n"
-          "  popl   %%eax\n"
-          "  movl   %%eax,%%ecx\n"
-          "  xorl   $0x00200000,%%eax\n"
-          "  pushl  %%eax\n"
-          "  popfl\n"
-          "  pushfl\n"
-          "  popl   %%eax\n"
-          "  movl   $0,%0\n"
-          "  cmpl   %%eax,%%ecx\n"
-          "  je     1f\n"
-          "  movl   $1,%0\n"
-          "1:"
-          : "=r"(__cpuid_supported)
-          :
-          : "eax", "ecx");
-  if (!__cpuid_supported)
-    return false;
-#endif
-  return true;
-#endif
-  return true;
-}
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
+    defined(_M_X64)
 
 /// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in
 /// the specified arguments.  If we can't run cpuid on the host, return true.
 static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
                                unsigned *rECX, unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__x86_64__)
-  // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
-  // FIXME: should we save this for Clang?
-  __asm__("movq\t%%rbx, %%rsi\n\t"
-          "cpuid\n\t"
-          "xchgq\t%%rbx, %%rsi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value));
-  return false;
-#elif defined(__i386__)
-  __asm__("movl\t%%ebx, %%esi\n\t"
-          "cpuid\n\t"
-          "xchgl\t%%ebx, %%esi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value));
-  return false;
-#else
-  return true;
-#endif
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+  return !__get_cpuid(value, rEAX, rEBX, rECX, rEDX);
 #elif defined(_MSC_VER)
   // The MSVC intrinsic is portable across x86 and x64.
   int registers[4];
@@ -609,9 +562,6 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
   else
     *MaxLeaf = 0;
 
-  if (!isCpuIdSupported())
-    return VendorSignatures::UNKNOWN;
-
   if (getX86CpuIDAndInfo(0, MaxLeaf, &EBX, &ECX, &EDX) || *MaxLeaf < 1)
     return VendorSignatures::UNKNOWN;
 
@@ -639,26 +589,12 @@ using namespace llvm::sys::detail::x86;
 static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
                                  unsigned *rEAX, unsigned *rEBX, unsigned *rECX,
                                  unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__x86_64__)
-  // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
-  // FIXME: should we save this for Clang?
-  __asm__("movq\t%%rbx, %%rsi\n\t"
-          "cpuid\n\t"
-          "xchgq\t%%rbx, %%rsi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value), "c"(subleaf));
-  return false;
-#elif defined(__i386__)
-  __asm__("movl\t%%ebx, %%esi\n\t"
-          "cpuid\n\t"
-          "xchgl\t%%ebx, %%esi\n\t"
-          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
-          : "a"(value), "c"(subleaf));
-  return false;
-#else
-  return true;
-#endif
+  // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
+  // are such that __cpuidex is defined within cpuid.h for both, we can remove
+  // the __get_cpuid_count function and share the MSVC implementation between
+  // all three.
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+  return !__get_cpuid_count(value, subleaf, rEAX, rEBX, rECX, rEDX);
 #elif defined(_MSC_VER)
   int registers[4];
   __cpuidex(registers, value, subleaf);
@@ -674,6 +610,9 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
 
 // Read control register 0 (XCR0). Used to detect features such as AVX.
 static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) {
+  // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
+  // are such that _xgetbv is supported by both, we can unify the implementation
+  // with MSVC and remove all inline assembly.
 #if defined(__GNUC__) || defined(__clang__)
   // Check xgetbv; this uses a .byte sequence instead of the instruction
   // directly because older assemblers do not include support for xgetbv and
@@ -1549,6 +1488,12 @@ StringRef sys::getHostCPUName() {
   case 0x40000:
 #endif
     return "pwr10";
+#ifdef POWER_11
+  case POWER_11:
+#else
+  case 0x80000:
+#endif
+    return "pwr11";
   default:
     return "generic";
   }
@@ -1562,6 +1507,8 @@ StringRef sys::getHostCPUName() {
   switch (processor_id & 0xf000) {
   case 0xc000: // Loongson 64bit, 4-issue
     return "la464";
+  case 0xd000: // Loongson 64bit, 6-issue
+    return "la664";
   // TODO: Others.
   default:
     break;
@@ -1879,7 +1826,9 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   Features["ppx"] = HasAPXF;
   Features["ndd"] = HasAPXF;
   Features["ccmp"] = HasAPXF;
+  Features["nf"] = HasAPXF;
   Features["cf"] = HasAPXF;
+  Features["zu"] = HasAPXF;
 
   bool HasLeafD = MaxLevel >= 0xd &&
                   !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
@@ -2067,7 +2016,8 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   Features["zvfhmin"] = ExtMask & (1ULL << 31); // RISCV_HWPROBE_EXT_ZVFHMIN
   Features["zfa"] = ExtMask & (1ULL << 32);     // RISCV_HWPROBE_EXT_ZFA
   Features["ztso"] = ExtMask & (1ULL << 33);    // RISCV_HWPROBE_EXT_ZTSO
-  Features["zacas"] = ExtMask & (1ULL << 34);   // RISCV_HWPROBE_EXT_ZACAS
+  // TODO: Re-enable zacas when it is marked non-experimental again.
+  // Features["zacas"] = ExtMask & (1ULL << 34);   // RISCV_HWPROBE_EXT_ZACAS
   Features["zicond"] = ExtMask & (1ULL << 35);  // RISCV_HWPROBE_EXT_ZICOND
   Features["zihintpause"] =
       ExtMask & (1ULL << 36); // RISCV_HWPROBE_EXT_ZIHINTPAUSE
diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
index 772d24c5ce3de..8e86d18de2ad9 100644
--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
@@ -44,6 +44,17 @@ bool LoongArch::getArchFeatures(StringRef Arch,
       return true;
     }
   }
+
+  if (Arch == "la64v1.0" || Arch == "la64v1.1") {
+    Features.push_back("+64bit");
+    Features.push_back("+d");
+    Features.push_back("+lsx");
+    Features.push_back("+ual");
+    if (Arch == "la64v1.1")
+      Features.push_back("+frecipe");
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
new file mode 100644
index 0000000000000..422d758c772e1
--- /dev/null
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -0,0 +1,121 @@
+//===---- PPCTargetParser.cpp - Parser for target features ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise hardware features
+// for PPC CPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TargetParser/PPCTargetParser.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/TargetParser/Host.h"
+
+namespace llvm {
+namespace PPC {
+
+struct CPUInfo {
+  StringLiteral Name;
+  // FIXME: add the features field for this CPU.
+};
+
+constexpr CPUInfo PPCCPUInfo[] = {
+#define PPC_CPU(Name, Linux_SUPPORT_METHOD, LinuxID, AIX_SUPPORT_METHOD,       \
+                AIXID)                                                         \
+  {Name},
+#include "llvm/TargetParser/PPCTargetParser.def"
+};
+
+static const CPUInfo *getCPUInfoByName(StringRef CPU) {
+  for (auto &C : PPCCPUInfo)
+    if (C.Name == CPU)
+      return &C;
+  return nullptr;
+}
+
+StringRef normalizeCPUName(StringRef CPUName) {
+  // Clang/LLVM does not actually support code generation
+  // for the 405 CPU. However, there are uses of this CPU ID
+  // in projects that previously used GCC and rely on Clang
+  // accepting it. Clang has always ignored it and passed the
+  // generic CPU ID to the back end.
+  return StringSwitch<StringRef>(CPUName)
+      .Cases("common", "405", "generic")
+      .Cases("ppc440", "440fp", "440")
+      .Cases("630", "power3", "pwr3")
+      .Case("G3", "g3")
+      .Case("G4", "g4")
+      .Case("G4+", "g4+")
+      .Case("8548", "e500")
+      .Case("ppc970", "970")
+      .Case("G5", "g5")
+      .Case("ppca2", "a2")
+      .Case("power4", "pwr4")
+      .Case("power5", "pwr5")
+      .Case("power5x", "pwr5x")
+      .Case("power5+", "pwr5+")
+      .Case("power6", "pwr6")
+      .Case("power6x", "pwr6x")
+      .Case("power7", "pwr7")
+      .Case("power8", "pwr8")
+      .Case("power9", "pwr9")
+      .Case("power10", "pwr10")
+      .Case("power11", "pwr11")
+      .Cases("powerpc", "powerpc32", "ppc")
+      .Case("powerpc64", "ppc64")
+      .Case("powerpc64le", "ppc64le")
+      .Default(CPUName);
+}
+
+void fillValidCPUList(SmallVectorImpl<StringRef> &Values) {
+  for (const auto &C : PPCCPUInfo)
+    Values.emplace_back(C.Name);
+}
+
+void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) {
+  for (const auto &C : PPCCPUInfo)
+    Values.emplace_back(C.Name);
+}
+
+bool isValidCPU(StringRef CPU) {
+  const CPUInfo *Info = getCPUInfoByName(CPU);
+  if (!Info)
+    return false;
+  return true;
+}
+
+StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName) {
+  if (!CPUName.empty()) {
+    if (CPUName == "native") {
+      StringRef CPU = sys::getHostCPUName();
+      if (!CPU.empty() && CPU != "generic")
+        return CPU;
+    }
+
+    StringRef CPU = normalizeCPUName(CPUName);
+    if (CPU != "generic" && CPU != "native")
+      return CPU;
+  }
+
+  // LLVM may default to generating code for the native CPU, but, like gcc, we
+  // default to a more generic option for each architecture. (except on AIX)
+  if (T.isOSAIX())
+    return "pwr7";
+  else if (T.getArch() == Triple::ppc64le)
+    return "ppc64le";
+  else if (T.getArch() == Triple::ppc64)
+    return "ppc64";
+
+  return "ppc";
+}
+
+StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
+  return getNormalizedPPCTargetCPU(T, CPUName);
+}
+
+} // namespace PPC
+} // namespace llvm
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index e6ddbb4dc28d5..9f650b87f2b39 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -1020,3 +1020,43 @@ std::string RISCVISAInfo::getTargetFeatureForExtension(StringRef Ext) {
   return isExperimentalExtension(Name) ? "experimental-" + Name.str()
                                        : Name.str();
 }
+
+struct RISCVExtBit {
+  const StringLiteral ext;
+  uint8_t bitpos;
+};
+
+/// Maps extensions with assigned bit positions within group 0 of
+/// __riscv_features_bits to their respective bit position.  At the
+/// moment all extensions are within group 0.
+constexpr static RISCVExtBit RISCVGroup0BitPositions[] = {
+    {"a", 0},          {"c", 2},
+    {"d", 3},          {"f", 5},
+    {"i", 8},          {"m", 12},
+    {"v", 21},         {"zacas", 26},
+    {"zba", 27},       {"zbb", 28},
+    {"zbc", 29},       {"zbkb", 30},
+    {"zbkc", 31},      {"zbkx", 32},
+    {"zbs", 33},       {"zfa", 34},
+    {"zfh", 35},       {"zfhmin", 36},
+    {"zicboz", 37},    {"zicond", 38},
+    {"zihintntl", 39}, {"zihintpause", 40},
+    {"zknd", 41},      {"zkne", 42},
+    {"zknh", 43},      {"zksed", 44},
+    {"zksh", 45},      {"zkt", 46},
+    {"ztso", 47},      {"zvbb", 48},
+    {"zvbc", 49},      {"zvfh", 50},
+    {"zvfhmin", 51},   {"zvkb", 52},
+    {"zvkg", 53},      {"zvkned", 54},
+    {"zvknha", 55},    {"zvknhb", 56},
+    {"zvksed", 57},    {"zvksh", 58},
+    {"zvkt", 59}};
+int RISCVISAInfo::getRISCVFeaturesBitPosition(StringRef Ext) {
+  // Note that this code currently accepts mixed case extension names, but
+  // does not handle extension versions at all.  That's probably fine because
+  // there's only one extension version in the __riscv_feature_bits vector.
+  for (auto E : RISCVGroup0BitPositions)
+    if (E.ext.equals_insensitive(Ext))
+      return E.bitpos;
+  return -1;
+}
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index db1b5f689d7da..49a35bfcf4b9b 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -128,6 +128,22 @@ void getFeaturesForCPU(StringRef CPU,
     else
       EnabledFeatures.push_back(F.substr(1));
 }
+
+namespace RISCVExtensionBitmaskTable {
+#define GET_RISCVExtensionBitmaskTable_IMPL
+#include "llvm/TargetParser/RISCVTargetParserDef.inc"
+
+} // namespace RISCVExtensionBitmaskTable
+
+namespace {
+struct LessExtName {
+  bool operator()(const RISCVExtensionBitmaskTable::RISCVExtensionBitmask &LHS,
+                  StringRef RHS) {
+    return StringRef(LHS.Name) < RHS;
+  }
+};
+} // namespace
+
 } // namespace RISCV
 
 namespace RISCVVType {
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index e34ed32df1588..cec9e25079fa4 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -356,6 +356,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
   case OpenCL:
     return "opencl";
   case OpenHOS: return "ohos";
+  case PAuthTest:
+    return "pauthtest";
   }
 
   llvm_unreachable("Invalid EnvironmentType!");
@@ -737,6 +739,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
       .StartsWith("amplification", Triple::Amplification)
       .StartsWith("opencl", Triple::OpenCL)
       .StartsWith("ohos", Triple::OpenHOS)
+      .StartsWith("pauthtest", Triple::PAuthTest)
       .Default(Triple::UnknownEnvironment);
 }
 
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 141ecb936b708..dcf9130052ac1 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -171,14 +171,14 @@ constexpr FeatureBitset FeaturesClearwaterforest =
 
 // Geode Processor.
 constexpr FeatureBitset FeaturesGeode =
-    FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | Feature3DNOW | Feature3DNOWA;
+    FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | FeaturePRFCHW;
 
 // K6 processor.
 constexpr FeatureBitset FeaturesK6 = FeatureX87 | FeatureCMPXCHG8B | FeatureMMX;
 
 // K7 and K8 architecture processors.
 constexpr FeatureBitset FeaturesAthlon =
-    FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | Feature3DNOW | Feature3DNOWA;
+    FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | FeaturePRFCHW;
 constexpr FeatureBitset FeaturesAthlonXP =
     FeaturesAthlon | FeatureFXSR | FeatureSSE;
 constexpr FeatureBitset FeaturesK8 =
@@ -256,8 +256,8 @@ constexpr ProcInfo Processors[] = {
   // i486-generation processors.
   { {"i486"}, CK_i486, ~0U, FeatureX87, '\0', false },
   { {"winchip-c6"}, CK_WinChipC6, ~0U, FeaturesPentiumMMX, '\0', false },
-  { {"winchip2"}, CK_WinChip2, ~0U, FeaturesPentiumMMX | Feature3DNOW, '\0', false },
-  { {"c3"}, CK_C3, ~0U, FeaturesPentiumMMX | Feature3DNOW, '\0', false },
+  { {"winchip2"}, CK_WinChip2, ~0U, FeaturesPentiumMMX | FeaturePRFCHW, '\0', false },
+  { {"c3"}, CK_C3, ~0U, FeaturesPentiumMMX | FeaturePRFCHW, '\0', false },
   // i586-generation processors, P5 microarchitecture based.
   { {"i586"}, CK_i586, ~0U, FeatureX87 | FeatureCMPXCHG8B, '\0', false },
   { {"pentium"}, CK_Pentium, ~0U, FeatureX87 | FeatureCMPXCHG8B, 'B', false },
@@ -386,8 +386,8 @@ constexpr ProcInfo Processors[] = {
   { {"lakemont"}, CK_Lakemont, ~0U, FeatureCMPXCHG8B, '\0', false },
   // K6 architecture processors.
   { {"k6"}, CK_K6, ~0U, FeaturesK6, '\0', false },
-  { {"k6-2"}, CK_K6_2, ~0U, FeaturesK6 | Feature3DNOW, '\0', false },
-  { {"k6-3"}, CK_K6_3, ~0U, FeaturesK6 | Feature3DNOW, '\0', false },
+  { {"k6-2"}, CK_K6_2, ~0U, FeaturesK6 | FeaturePRFCHW, '\0', false },
+  { {"k6-3"}, CK_K6_3, ~0U, FeaturesK6 | FeaturePRFCHW, '\0', false },
   // K7 architecture processors.
   { {"athlon"}, CK_Athlon, ~0U, FeaturesAthlon, '\0', false },
   { {"athlon-tbird"}, CK_Athlon, ~0U, FeaturesAthlon, '\0', false },
@@ -493,6 +493,7 @@ constexpr FeatureBitset ImpliedFeaturesFXSR = {};
 constexpr FeatureBitset ImpliedFeaturesINVPCID = {};
 constexpr FeatureBitset ImpliedFeaturesLWP = {};
 constexpr FeatureBitset ImpliedFeaturesLZCNT = {};
+constexpr FeatureBitset ImpliedFeaturesMMX = {};
 constexpr FeatureBitset ImpliedFeaturesMWAITX = {};
 constexpr FeatureBitset ImpliedFeaturesMOVBE = {};
 constexpr FeatureBitset ImpliedFeaturesMOVDIR64B = {};
@@ -520,6 +521,8 @@ constexpr FeatureBitset ImpliedFeaturesWBNOINVD = {};
 constexpr FeatureBitset ImpliedFeaturesVZEROUPPER = {};
 constexpr FeatureBitset ImpliedFeaturesX87 = {};
 constexpr FeatureBitset ImpliedFeaturesXSAVE = {};
+constexpr FeatureBitset ImpliedFeaturesDUMMYFEATURE1 = {};
+constexpr FeatureBitset ImpliedFeaturesDUMMYFEATURE2 = {};
 
 // Not really CPU features, but need to be in the table because clang uses
 // target features to communicate them to the backend.
@@ -534,11 +537,6 @@ constexpr FeatureBitset ImpliedFeaturesXSAVEC = FeatureXSAVE;
 constexpr FeatureBitset ImpliedFeaturesXSAVEOPT = FeatureXSAVE;
 constexpr FeatureBitset ImpliedFeaturesXSAVES = FeatureXSAVE;
 
-// MMX->3DNOW->3DNOWA chain.
-constexpr FeatureBitset ImpliedFeaturesMMX = {};
-constexpr FeatureBitset ImpliedFeatures3DNOW = FeatureMMX;
-constexpr FeatureBitset ImpliedFeatures3DNOWA = Feature3DNOW;
-
 // SSE/AVX/AVX512F chain.
 constexpr FeatureBitset ImpliedFeaturesSSE = {};
 constexpr FeatureBitset ImpliedFeaturesSSE2 = FeatureSSE;
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index d5a38ec17a2a8..09ffc2d184f18 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -135,15 +135,12 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
   if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
     return false;
 
-  ICmpInst::Predicate Pred;
   BasicBlock *PhiBB = Phi.getParent();
-  if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
+  if (!match(TermI, m_Br(m_SpecificICmp(CmpInst::ICMP_EQ, m_Specific(ShAmt),
+                                        m_ZeroInt()),
                          m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
     return false;
 
-  if (Pred != CmpInst::ICMP_EQ)
-    return false;
-
   IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
 
   if (ShVal0 == ShVal1)
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 489106422e199..d8e827e9cebcf 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -148,6 +148,7 @@ void Lowerer::lowerCoroNoop(IntrinsicInst *II) {
     NoopCoro = new GlobalVariable(M, NoopCoroConst->getType(), /*isConstant=*/true,
                                 GlobalVariable::PrivateLinkage, NoopCoroConst,
                                 "NoopCoro.Frame.Const");
+    cast<GlobalVariable>(NoopCoro)->setNoSanitizeMetadata();
   }
 
   Builder.SetInsertPoint(II);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 2816a85743faa..cd31c4be1c1da 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12491,6 +12491,8 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
   void initialize(Attributor &A) override {
     assert(getAssociatedType()->isPtrOrPtrVectorTy() &&
            "Associated value is not a pointer");
+    if (getAssociatedType()->getPointerAddressSpace())
+      indicateOptimisticFixpoint();
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index a63883c72bee8..ccda7052419a3 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1050,8 +1050,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
       } else if (NewCB->getType()->isVoidTy()) {
         // If the return value is dead, replace any uses of it with poison
         // (any non-debug value uses will get removed later on).
-        if (!CB.getType()->isX86_MMXTy())
-          CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
+        CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
       } else {
         assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
                "Return type changed, but not into a void. The old return type"
@@ -1115,8 +1114,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
     } else {
       // If this argument is dead, replace any uses of it with poison
       // (any non-debug value uses will get removed later on).
-      if (!I->getType()->isX86_MMXTy())
-        I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+      I->replaceAllUsesWith(PoisonValue::get(I->getType()));
     }
 
   // If we change the return value of the function we must rewrite any return
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 7b419d0f098b5..d50218aaa3b6c 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -118,9 +118,9 @@ static void addLocAccess(MemoryEffects &ME, const MemoryLocation &Loc,
   if (isNoModRef(MR))
     return;
 
-  const Value *UO = getUnderlyingObject(Loc.Ptr);
-  assert(!isa<AllocaInst>(UO) &&
-         "Should have been handled by getModRefInfoMask()");
+  const Value *UO = getUnderlyingObjectAggressive(Loc.Ptr);
+  if (isa<AllocaInst>(UO))
+    return;
   if (isa<Argument>(UO)) {
     ME |= MemoryEffects::argMemOnly(MR);
     return;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 038785114a0cf..6c4f4001aab35 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/AutoUpgrade.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalObject.h"
@@ -30,6 +29,7 @@
 #include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/IRMover.h"
+#include "llvm/ProfileData/PGOCtxProfReader.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -174,6 +174,21 @@ static cl::opt<std::string> WorkloadDefinitions(
              "}"),
     cl::Hidden);
 
+static cl::opt<bool> ImportAssumeUniqueLocal(
+    "import-assume-unique-local", cl::init(false),
+    cl::desc(
+        "By default, a local-linkage global variable won't be imported in the "
+        "edge mod1:func -> mod2:local-var (from value profiles) since compiler "
+        "cannot assume mod2 is compiled with full path which gives local-var a "
+        "program-wide unique GUID. Set this option to true will help cross "
+        "module import of such variables. This is only safe if the compiler "
+        "user specify the full module path."),
+    cl::Hidden);
+
+static cl::opt<std::string>
+    ContextualProfile("thinlto-pgo-ctx-prof",
+                      cl::desc("Path to a contextual profile."), cl::Hidden);
+
 namespace llvm {
 extern cl::opt<bool> EnableMemProfContextDisambiguation;
 }
@@ -196,6 +211,23 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
   return Result;
 }
 
+static bool shouldSkipLocalInAnotherModule(const GlobalValueSummary *RefSummary,
+                                           size_t NumDefs,
+                                           StringRef ImporterModule) {
+  // We can import a local from another module if all inputs are compiled
+  // with full paths or when there is one definition.
+  if (ImportAssumeUniqueLocal || NumDefs == 1)
+    return false;
+  // In other cases, make sure we import the copy in the caller's module if the
+  // referenced value has local linkage. The only time a local variable can
+  // share an entry in the index is if there is a local with the same name in
+  // another module that had the same source file name (in a different
+  // directory), where each was compiled in their own directory so there was not
+  // distinguishing path.
+  return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
+         RefSummary->modulePath() != ImporterModule;
+}
+
 /// Given a list of possible callee implementation for a call site, qualify the
 /// legality of importing each. The return is a range of pairs. Each pair
 /// corresponds to a candidate. The first value is the ImportFailureReason for
@@ -228,19 +260,20 @@ static auto qualifyCalleeCandidates(
         if (!Summary)
           return {FunctionImporter::ImportFailureReason::GlobalVar, GVSummary};
 
-        // If this is a local function, make sure we import the copy
-        // in the caller's module. The only time a local function can
-        // share an entry in the index is if there is a local with the same name
-        // in another module that had the same source file name (in a different
-        // directory), where each was compiled in their own directory so there
-        // was not distinguishing path.
-        // However, do the import from another module if there is only one
-        // entry in the list - in that case this must be a reference due
-        // to indirect call profile data, since a function pointer can point to
-        // a local in another module.
-        if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
-            CalleeSummaryList.size() > 1 &&
-            Summary->modulePath() != CallerModulePath)
+        // If this is a local function, make sure we import the copy in the
+        // caller's module. The only time a local function can share an entry in
+        // the index is if there is a local with the same name in another module
+        // that had the same source file name (in a different directory), where
+        // each was compiled in their own directory so there was not
+        // distinguishing path.
+        // If the local function is from another module, it must be a reference
+        // due to indirect call profile data since a function pointer can point
+        // to a local in another module. Do the import from another module if
+        // there is only one entry in the list or when all files in the program
+        // are compiled with full path - in both cases the local function has
+        // unique PGO name and GUID.
+        if (shouldSkipLocalInAnotherModule(Summary, CalleeSummaryList.size(),
+                                           CallerModulePath))
           return {
               FunctionImporter::ImportFailureReason::LocalLinkageNotInModule,
               GVSummary};
@@ -359,18 +392,6 @@ class GlobalsImporter final {
 
       LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
 
-      // If this is a local variable, make sure we import the copy
-      // in the caller's module. The only time a local variable can
-      // share an entry in the index is if there is a local with the same name
-      // in another module that had the same source file name (in a different
-      // directory), where each was compiled in their own directory so there
-      // was not distinguishing path.
-      auto LocalNotInModule =
-          [&](const GlobalValueSummary *RefSummary) -> bool {
-        return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
-               RefSummary->modulePath() != Summary.modulePath();
-      };
-
       for (const auto &RefSummary : VI.getSummaryList()) {
         const auto *GVS = dyn_cast<GlobalVarSummary>(RefSummary.get());
         // Functions could be referenced by global vars - e.g. a vtable; but we
@@ -379,7 +400,8 @@ class GlobalsImporter final {
         // based on profile information). Should we decide to handle them here,
         // we can refactor accordingly at that time.
         if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) ||
-            LocalNotInModule(GVS))
+            shouldSkipLocalInAnotherModule(GVS, VI.getSummaryList().size(),
+                                           Summary.modulePath()))
           continue;
 
         // If there isn't an entry for GUID, insert <GUID, Definition> pair.
@@ -586,13 +608,7 @@ class WorkloadImportsManager : public ModuleImportsManager {
     LLVM_DEBUG(dbgs() << "[Workload] Done\n");
   }
 
-public:
-  WorkloadImportsManager(
-      function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
-          IsPrevailing,
-      const ModuleSummaryIndex &Index,
-      DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists)
-      : ModuleImportsManager(IsPrevailing, Index, ExportLists) {
+  void loadFromJson() {
     // Since the workload def uses names, we need a quick lookup
     // name->ValueInfo.
     StringMap<ValueInfo> NameToValueInfo;
@@ -662,15 +678,81 @@ class WorkloadImportsManager : public ModuleImportsManager {
         }
         Set.insert(ElemIt->second);
       }
-      LLVM_DEBUG({
+    }
+  }
+
+  void loadFromCtxProf() {
+    std::error_code EC;
+    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ContextualProfile);
+    if (std::error_code EC = BufferOrErr.getError()) {
+      report_fatal_error("Failed to open contextual profile file");
+      return;
+    }
+    auto Buffer = std::move(BufferOrErr.get());
+
+    PGOCtxProfileReader Reader(Buffer->getBuffer());
+    auto Ctx = Reader.loadContexts();
+    if (!Ctx) {
+      report_fatal_error("Failed to parse contextual profiles");
+      return;
+    }
+    const auto &CtxMap = *Ctx;
+    DenseSet<GlobalValue::GUID> ContainedGUIDs;
+    for (const auto &[RootGuid, Root] : CtxMap) {
+      // Avoid ContainedGUIDs to get in/out of scope. Reuse its memory for
+      // subsequent roots, but clear its contents.
+      ContainedGUIDs.clear();
+
+      auto RootVI = Index.getValueInfo(RootGuid);
+      if (!RootVI) {
+        LLVM_DEBUG(dbgs() << "[Workload] Root " << RootGuid
+                          << " not found in this linkage unit.\n");
+        continue;
+      }
+      if (RootVI.getSummaryList().size() != 1) {
+        LLVM_DEBUG(dbgs() << "[Workload] Root " << RootGuid
+                          << " should have exactly one summary, but has "
+                          << RootVI.getSummaryList().size() << ". Skipping.\n");
+        continue;
+      }
+      StringRef RootDefiningModule =
+          RootVI.getSummaryList().front()->modulePath();
+      LLVM_DEBUG(dbgs() << "[Workload] Root defining module for " << RootGuid
+                        << " is : " << RootDefiningModule << "\n");
+      auto &Set = Workloads[RootDefiningModule];
+      Root.getContainedGuids(ContainedGUIDs);
+      for (auto Guid : ContainedGUIDs)
+        if (auto VI = Index.getValueInfo(Guid))
+          Set.insert(VI);
+    }
+  }
+
+public:
+  WorkloadImportsManager(
+      function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+          IsPrevailing,
+      const ModuleSummaryIndex &Index,
+      DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists)
+      : ModuleImportsManager(IsPrevailing, Index, ExportLists) {
+    if (ContextualProfile.empty() == WorkloadDefinitions.empty()) {
+      report_fatal_error(
+          "Pass only one of: -thinlto-pgo-ctx-prof or -thinlto-workload-def");
+      return;
+    }
+    if (!ContextualProfile.empty())
+      loadFromCtxProf();
+    else
+      loadFromJson();
+    LLVM_DEBUG({
+      for (const auto &[Root, Set] : Workloads) {
         dbgs() << "[Workload] Root: " << Root << " we have " << Set.size()
                << " distinct callees.\n";
         for (const auto &VI : Set) {
           dbgs() << "[Workload] Root: " << Root
                  << " Would include: " << VI.getGUID() << "\n";
         }
-      });
-    }
+      }
+    });
   }
 };
 
@@ -679,7 +761,7 @@ std::unique_ptr<ModuleImportsManager> ModuleImportsManager::create(
         IsPrevailing,
     const ModuleSummaryIndex &Index,
     DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists) {
-  if (WorkloadDefinitions.empty()) {
+  if (WorkloadDefinitions.empty() && ContextualProfile.empty()) {
     LLVM_DEBUG(dbgs() << "[Workload] Using the regular imports manager.\n");
     return std::unique_ptr<ModuleImportsManager>(
         new ModuleImportsManager(IsPrevailing, Index, ExportLists));
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index 5aefcbf13182c..2ec5da4886839 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -39,6 +39,7 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/EHPersonalities.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -168,10 +169,24 @@ static bool mayExtractBlock(const BasicBlock &BB) {
   //
   // Resumes that are not reachable from a cleanup landing pad are considered to
   // be unreachable. It’s not safe to split them out either.
+
   if (BB.hasAddressTaken() || BB.isEHPad())
     return false;
   auto Term = BB.getTerminator();
-  return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
+  if (isa<InvokeInst>(Term) || isa<ResumeInst>(Term))
+    return false;
+
+  // Do not outline basic blocks that have token type instructions. e.g.,
+  // exception:
+  // %0 = cleanuppad within none []
+  // call void @"?terminate@@YAXXZ"() [ "funclet"(token %0) ]
+  // br label %continue-exception
+  if (llvm::any_of(
+          BB, [](const Instruction &I) { return I.getType()->isTokenTy(); })) {
+    return false;
+  }
+
+  return true;
 }
 
 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
@@ -258,6 +273,11 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
       F.hasFnAttribute(Attribute::SanitizeMemory))
     return false;
 
+  // Do not outline scoped EH personality functions.
+  if (F.hasPersonalityFn())
+    if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+      return false;
+
   return true;
 }
 
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 66bd786c85df5..c9de9c964bba0 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -242,9 +242,16 @@ class CallsiteContextGraph {
     // recursion.
     bool Recursive = false;
 
-    // The corresponding allocation or interior call.
+    // The corresponding allocation or interior call. This is the primary call
+    // for which we have created this node.
     CallInfo Call;
 
+    // List of other calls that can be treated the same as the primary call
+    // through cloning. I.e. located in the same function and have the same
+    // (possibly pruned) stack ids. They will be updated the same way as the
+    // primary call when assigning to function clones.
+    std::vector<CallInfo> MatchingCalls;
+
     // For alloc nodes this is a unique id assigned when constructed, and for
     // callsite stack nodes it is the original stack id when the node is
     // constructed from the memprof MIB metadata on the alloc nodes. Note that
@@ -457,6 +464,9 @@ class CallsiteContextGraph {
   /// iteration.
   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
 
+  /// Records the function each call is located in.
+  DenseMap<CallInfo, const FuncTy *> CallToFunc;
+
   /// Map from callsite node to the enclosing caller function.
   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
 
@@ -474,7 +484,8 @@ class CallsiteContextGraph {
   /// StackIdToMatchingCalls map.
   void assignStackNodesPostOrder(
       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
-      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
+      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
+      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
 
   /// Duplicates the given set of context ids, updating the provided
   /// map from each original id with the newly generated context ids,
@@ -1230,10 +1241,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
-    assignStackNodesPostOrder(ContextNode *Node,
-                              DenseSet<const ContextNode *> &Visited,
-                              DenseMap<uint64_t, std::vector<CallContextInfo>>
-                                  &StackIdToMatchingCalls) {
+    assignStackNodesPostOrder(
+        ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+        DenseMap<uint64_t, std::vector<CallContextInfo>>
+            &StackIdToMatchingCalls,
+        DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
   auto Inserted = Visited.insert(Node);
   if (!Inserted.second)
     return;
@@ -1246,7 +1258,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // Skip any that have been removed during the recursion.
     if (!Edge)
       continue;
-    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
+    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
+                              CallToMatchingCall);
   }
 
   // If this node's stack id is in the map, update the graph to contain new
@@ -1289,8 +1302,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     // Skip any for which we didn't assign any ids, these don't get a node in
     // the graph.
-    if (SavedContextIds.empty())
+    if (SavedContextIds.empty()) {
+      // If this call has a matching call (located in the same function and
+      // having the same stack ids), simply add it to the context node created
+      // for its matching call earlier. These can be treated the same through
+      // cloning and get updated at the same time.
+      if (!CallToMatchingCall.contains(Call))
+        continue;
+      auto MatchingCall = CallToMatchingCall[Call];
+      assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+      NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
+          Call);
       continue;
+    }
 
     assert(LastId == Ids.back());
 
@@ -1422,6 +1446,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // there is more than one call with the same stack ids. Their (possibly newly
   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
+  // Save a map from each call to any that are found to match it. I.e. located
+  // in the same function and have the same (possibly pruned) stack ids. We use
+  // this to avoid creating extra graph nodes as they can be treated the same.
+  DenseMap<CallInfo, CallInfo> CallToMatchingCall;
   for (auto &It : StackIdToMatchingCalls) {
     auto &Calls = It.getSecond();
     // Skip single calls with a single stack id. These don't need a new node.
@@ -1460,6 +1488,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
     assert(!LastNodeContextIds.empty());
 
+    // Map from function to the first call from the below list (with matching
+    // stack ids) found in that function. Note that calls from different
+    // functions can have the same stack ids because this is the list of stack
+    // ids that had (possibly pruned) nodes after building the graph from the
+    // allocation MIBs.
+    DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+
     for (unsigned I = 0; I < Calls.size(); I++) {
       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
       assert(SavedContextIds.empty());
@@ -1533,6 +1568,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
           continue;
       }
 
+      const FuncTy *CallFunc = CallToFunc[Call];
+
+      // If the prior call had the same stack ids this map would not be empty.
+      // Check if we already have a call that "matches" because it is located
+      // in the same function.
+      if (FuncToCallMap.contains(CallFunc)) {
+        // Record the matching call found for this call, and skip it. We
+        // will subsequently combine it into the same node.
+        CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
+        continue;
+      }
+
       // Check if the next set of stack ids is the same (since the Calls vector
       // of tuples is sorted by the stack ids we can just look at the next one).
       bool DuplicateContextIds = false;
@@ -1562,7 +1609,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
         set_subtract(LastNodeContextIds, StackSequenceContextIds);
         if (LastNodeContextIds.empty())
           break;
-      }
+        // No longer possibly in a sequence of calls with duplicate stack ids,
+        // clear the map.
+        FuncToCallMap.clear();
+      } else
+        // Record the call with its function, so we can locate it the next time
+        // we find a call from this function when processing the calls with the
+        // same stack ids.
+        FuncToCallMap[CallFunc] = Call;
     }
   }
 
@@ -1579,7 +1633,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // associated context ids over to the new nodes.
   DenseSet<const ContextNode *> Visited;
   for (auto &Entry : AllocationCallToContextNodeMap)
-    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
+    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
+                              CallToMatchingCall);
   if (VerifyCCG)
     check();
 }
@@ -1679,6 +1734,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           continue;
         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
           CallsWithMetadata.push_back(&I);
+          CallToFunc[&I] = &F;
           auto *AllocNode = addAllocNode(&I, &F);
           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
           assert(CallsiteMD);
@@ -1700,8 +1756,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           I.setMetadata(LLVMContext::MD_callsite, nullptr);
         }
         // For callsite metadata, add to list for this function for later use.
-        else if (I.getMetadata(LLVMContext::MD_callsite))
+        else if (I.getMetadata(LLVMContext::MD_callsite)) {
           CallsWithMetadata.push_back(&I);
+          CallToFunc[&I] = &F;
+        }
       }
     }
     if (!CallsWithMetadata.empty())
@@ -1756,8 +1814,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
           // correlate properly in applyImport in the backends.
           if (AN.MIBs.empty())
             continue;
-          CallsWithMetadata.push_back({&AN});
-          auto *AllocNode = addAllocNode({&AN}, FS);
+          IndexCall AllocCall(&AN);
+          CallsWithMetadata.push_back(AllocCall);
+          CallToFunc[AllocCall] = FS;
+          auto *AllocNode = addAllocNode(AllocCall, FS);
           // Pass an empty CallStack to the CallsiteContext (second)
           // parameter, since for ThinLTO we already collapsed out the inlined
           // stack ids on the allocation call during ModuleSummaryAnalysis.
@@ -1788,8 +1848,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
       }
       // For callsite metadata, add to list for this function for later use.
       if (!FS->callsites().empty())
-        for (auto &SN : FS->mutableCallsites())
-          CallsWithMetadata.push_back({&SN});
+        for (auto &SN : FS->mutableCallsites()) {
+          IndexCall StackNodeCall(&SN);
+          CallsWithMetadata.push_back(StackNodeCall);
+          CallToFunc[StackNodeCall] = FS;
+        }
 
       if (!CallsWithMetadata.empty())
         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
@@ -2046,7 +2109,7 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc(
     Instruction *Call, const Function *Func, const Function *CallerFunc,
     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
   auto *CB = dyn_cast<CallBase>(Call);
-  if (!CB->getCalledOperand())
+  if (!CB->getCalledOperand() || CB->isIndirectCall())
     return false;
   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
@@ -2225,6 +2288,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
   if (Recursive)
     OS << " (recursive)";
   OS << "\n";
+  if (!MatchingCalls.empty()) {
+    OS << "\tMatchingCalls:\n";
+    for (auto &MatchingCall : MatchingCalls) {
+      OS << "\t";
+      MatchingCall.print(OS);
+      OS << "\n";
+    }
+  }
   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
   OS << "\tContextIds:";
   // Make a copy of the computed context ids that we can sort for stability.
@@ -2478,6 +2549,7 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
   ContextNode *Clone = NodeOwner.back().get();
   Node->addClone(Clone);
+  Clone->MatchingCalls = Node->MatchingCalls;
   assert(NodeToCallingFunc.count(Node));
   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
@@ -3021,6 +3093,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
         if (CallMap.count(Call))
           CallClone = CallMap[Call];
         CallsiteClone->setCall(CallClone);
+        // Need to do the same for all matching calls.
+        for (auto &MatchingCall : Node->MatchingCalls) {
+          CallInfo CallClone(MatchingCall);
+          if (CallMap.count(MatchingCall))
+            CallClone = CallMap[MatchingCall];
+          // Updates the call in the list.
+          MatchingCall = CallClone;
+        }
       };
 
       // Keep track of the clones of callsite Node that need to be assigned to
@@ -3187,6 +3267,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
               CallInfo NewCall(CallMap[OrigCall]);
               assert(NewCall);
               NewClone->setCall(NewCall);
+              // Need to do the same for all matching calls.
+              for (auto &MatchingCall : NewClone->MatchingCalls) {
+                CallInfo OrigMatchingCall(MatchingCall);
+                OrigMatchingCall.setCloneNo(0);
+                assert(CallMap.count(OrigMatchingCall));
+                CallInfo NewCall(CallMap[OrigMatchingCall]);
+                assert(NewCall);
+                // Updates the call in the list.
+                MatchingCall = NewCall;
+              }
             }
           }
           // Fall through to handling below to perform the recording of the
@@ -3373,6 +3463,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
 
     if (Node->IsAllocation) {
       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      assert(Node->MatchingCalls.empty());
       return;
     }
 
@@ -3381,6 +3472,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
 
     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
     updateCall(Node->Call, CalleeFunc);
+    // Update all the matching calls as well.
+    for (auto &Call : Node->MatchingCalls)
+      updateCall(Call, CalleeFunc);
   };
 
   // Performs DFS traversal starting from allocation nodes to update calls to
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 5cc2911a1a80e..6af284d513efc 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -439,7 +439,10 @@ struct CandidateComparer {
 
     const FunctionSamples *LCS = LHS.CalleeSamples;
     const FunctionSamples *RCS = RHS.CalleeSamples;
-    assert(LCS && RCS && "Expect non-null FunctionSamples");
+    // In inline replay mode, CalleeSamples may be null and the order doesn't
+    // matter.
+    if (!LCS || !RCS)
+      return LCS;
 
     // Tie breaker using number of samples try to favor smaller functions first
     if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0a55f4762fdf0..3bd086230cbec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1694,12 +1694,10 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
 
   // Canonicalize signum variant that ends in add:
   // (A s>> (BW - 1)) + (zext (A s> 0)) --> (A s>> (BW - 1)) | (zext (A != 0))
-  ICmpInst::Predicate Pred;
   uint64_t BitWidth = Ty->getScalarSizeInBits();
   if (match(LHS, m_AShr(m_Value(A), m_SpecificIntAllowPoison(BitWidth - 1))) &&
-      match(RHS, m_OneUse(m_ZExt(
-                     m_OneUse(m_ICmp(Pred, m_Specific(A), m_ZeroInt()))))) &&
-      Pred == CmpInst::ICMP_SGT) {
+      match(RHS, m_OneUse(m_ZExt(m_OneUse(m_SpecificICmp(
+                     CmpInst::ICMP_SGT, m_Specific(A), m_ZeroInt())))))) {
     Value *NotZero = Builder.CreateIsNotNull(A, "isnotnull");
     Value *Zext = Builder.CreateZExt(NotZero, Ty, "isnotnull.zext");
     return BinaryOperator::CreateOr(LHS, Zext);
@@ -1711,12 +1709,13 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
     // (add X, (sext/zext (icmp eq X, C)))
     //    -> (select (icmp eq X, C), (add C, (sext/zext 1)), X)
     auto CondMatcher = m_CombineAnd(
-        m_Value(Cond), m_ICmp(Pred, m_Deferred(A), m_ImmConstant(C)));
+        m_Value(Cond),
+        m_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A), m_ImmConstant(C)));
 
     if (match(&I,
               m_c_Add(m_Value(A),
                       m_CombineAnd(m_Value(Ext), m_ZExtOrSExt(CondMatcher)))) &&
-        Pred == ICmpInst::ICMP_EQ && Ext->hasOneUse()) {
+        Ext->hasOneUse()) {
       Value *Add = isa<ZExtInst>(Ext) ? InstCombiner::AddOne(C)
                                       : InstCombiner::SubOne(C);
       return replaceInstUsesWith(I, Builder.CreateSelect(Cond, Add, A));
@@ -1791,6 +1790,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   // -->
   // BW - ctlz(A - 1, false)
   const APInt *XorC;
+  ICmpInst::Predicate Pred;
   if (match(&I,
             m_c_Add(
                 m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index f9caa4da44931..4ca12d5b92f18 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -818,11 +818,11 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
   // Match  icmp ult (add %arg, C01), C1   (C1 == C01 << 1; powers of two)
   auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
                                             APInt &SignBitMask) -> bool {
-    CmpInst::Predicate Pred;
     const APInt *I01, *I1; // powers of two; I1 == I01 << 1
-    if (!(match(ICmp,
-                m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
-          Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
+    if (!(match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT,
+                                     m_Add(m_Value(X), m_Power2(I01)),
+                                     m_Power2(I1))) &&
+          I1->ugt(*I01) && I01->shl(1) == *I1))
       return false;
     // Which bit is the new sign bit as per the 'signed truncation' pattern?
     SignBitMask = *I01;
@@ -936,20 +936,21 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
     std::swap(Cmp0, Cmp1);
 
   // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
-  CmpInst::Predicate Pred0, Pred1;
   Value *X;
-  if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
-      match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
-                         m_SpecificInt(2))) &&
-      Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
+  if (JoinedByAnd &&
+      match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(X), m_ZeroInt())) &&
+      match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT,
+                                 m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+                                 m_SpecificInt(2)))) {
     Value *CtPop = Cmp1->getOperand(0);
     return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
   }
   // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
-  if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
-      match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
-                         m_SpecificInt(1))) &&
-      Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
+  if (!JoinedByAnd &&
+      match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(X), m_ZeroInt())) &&
+      match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT,
+                                 m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+                                 m_SpecificInt(1)))) {
     Value *CtPop = Cmp1->getOperand(0);
     return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
   }
@@ -1608,31 +1609,30 @@ static Instruction *reassociateFCmps(BinaryOperator &BO,
   // There are 4 commuted variants of the pattern. Canonicalize operands of this
   // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
   Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
-  FCmpInst::Predicate Pred;
-  if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
+  if (match(Op1, m_FCmp(m_Value(), m_AnyZeroFP())))
     std::swap(Op0, Op1);
 
   // Match inner binop and the predicate for combining 2 NAN checks into 1.
   Value *BO10, *BO11;
   FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
                                                            : FCmpInst::FCMP_UNO;
-  if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
+  if (!match(Op0, m_SpecificFCmp(NanPred, m_Value(X), m_AnyZeroFP())) ||
       !match(Op1, m_BinOp(Opcode, m_Value(BO10), m_Value(BO11))))
     return nullptr;
 
   // The inner logic op must have a matching fcmp operand.
   Value *Y;
-  if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
-      Pred != NanPred || X->getType() != Y->getType())
+  if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) ||
+      X->getType() != Y->getType())
     std::swap(BO10, BO11);
 
-  if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
-      Pred != NanPred || X->getType() != Y->getType())
+  if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) ||
+      X->getType() != Y->getType())
     return nullptr;
 
   // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
   // or  (fcmp uno X, 0), (or  (fcmp uno Y, 0), Z) --> or  (fcmp uno X, Y), Z
-  Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
+  Value *NewFCmp = Builder.CreateFCmp(NanPred, X, Y);
   if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
     // Intersect FMF from the 2 source fcmps.
     NewFCmpInst->copyIRFlags(Op0);
@@ -1744,14 +1744,13 @@ Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) {
   //   -> zext(bitwise(A < 0, icmp))
   auto FoldBitwiseICmpZeroWithICmp = [&](Value *Op0,
                                          Value *Op1) -> Instruction * {
-    ICmpInst::Predicate Pred;
     Value *A;
     bool IsMatched =
         match(Op0,
               m_OneUse(m_LShr(
                   m_Value(A),
                   m_SpecificInt(Op0->getType()->getScalarSizeInBits() - 1)))) &&
-        match(Op1, m_OneUse(m_ZExt(m_ICmp(Pred, m_Value(), m_Value()))));
+        match(Op1, m_OneUse(m_ZExt(m_ICmp(m_Value(), m_Value()))));
 
     if (!IsMatched)
       return nullptr;
@@ -3878,14 +3877,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (match(&I,
             m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
                                 m_Value(Ov)),
-                   m_CombineAnd(m_ICmp(Pred,
-                                       m_CombineAnd(m_ExtractValue<0>(
-                                                        m_Deferred(UMulWithOv)),
-                                                    m_Value(Mul)),
-                                       m_ZeroInt()),
-                                m_Value(MulIsNotZero)))) &&
-      (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
-      Pred == CmpInst::ICMP_NE) {
+                   m_CombineAnd(
+                       m_SpecificICmp(ICmpInst::ICMP_NE,
+                                      m_CombineAnd(m_ExtractValue<0>(
+                                                       m_Deferred(UMulWithOv)),
+                                                   m_Value(Mul)),
+                                      m_ZeroInt()),
+                       m_Value(MulIsNotZero)))) &&
+      (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse()))) {
     Value *A, *B;
     if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
                               m_Value(A), m_Value(B)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9b3d69df1ce7f..2ca7c9b0cd1ef 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -171,7 +171,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
   IntegerType *IntType = IntegerType::get(MI->getContext(), Size << 3);
 
   // If the memcpy has metadata describing the members, see if we can get the
-  // TBAA tag describing our copy.
+  // TBAA, scope and noalias tags describing our copy.
   AAMDNodes AACopyMD = MI->getAAMetadata().adjustForAccess(Size);
 
   Value *Src = MI->getArgOperand(1);
@@ -692,12 +692,24 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
                             Ty);
 
   // Add range attribute since known bits can't completely reflect what we know.
-  if (BitWidth != 1 && !II.hasRetAttr(Attribute::Range) &&
-      !II.getMetadata(LLVMContext::MD_range)) {
-    ConstantRange Range(APInt(BitWidth, Known.countMinPopulation()),
-                        APInt(BitWidth, Known.countMaxPopulation() + 1));
-    II.addRangeRetAttr(Range);
-    return &II;
+  if (BitWidth != 1) {
+    ConstantRange OldRange =
+        II.getRange().value_or(ConstantRange::getFull(BitWidth));
+
+    unsigned Lower = Known.countMinPopulation();
+    unsigned Upper = Known.countMaxPopulation() + 1;
+
+    if (Lower == 0 && OldRange.contains(APInt::getZero(BitWidth)) &&
+        isKnownNonZero(Op0, IC.getSimplifyQuery().getWithInstruction(&II)))
+      Lower = 1;
+
+    ConstantRange Range(APInt(BitWidth, Lower), APInt(BitWidth, Upper));
+    Range = Range.intersectWith(OldRange, ConstantRange::Unsigned);
+
+    if (Range != OldRange) {
+      II.addRangeRetAttr(Range);
+      return &II;
+    }
   }
 
   return nullptr;
@@ -1500,10 +1512,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   // Don't try to simplify calls without uses. It will not do anything useful,
   // but will result in the following folds being skipped.
   if (!CI.use_empty()) {
-    SmallVector<Value *, 4> Args;
-    Args.reserve(CI.arg_size());
-    for (Value *Op : CI.args())
-      Args.push_back(Op);
+    SmallVector<Value *, 8> Args(CI.args());
     if (Value *V = simplifyCall(&CI, CI.getCalledOperand(), Args,
                                 SQ.getWithInstruction(&CI)))
       return replaceInstUsesWith(CI, V);
@@ -1575,16 +1584,30 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         return eraseInstFromFunction(CI);
     }
 
+    auto IsPointerUndefined = [MI](Value *Ptr) {
+      return isa<ConstantPointerNull>(Ptr) &&
+             !NullPointerIsDefined(
+                 MI->getFunction(),
+                 cast<PointerType>(Ptr->getType())->getAddressSpace());
+    };
+    bool SrcIsUndefined = false;
     // If we can determine a pointer alignment that is bigger than currently
     // set, update the alignment.
     if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
       if (Instruction *I = SimplifyAnyMemTransfer(MTI))
         return I;
+      SrcIsUndefined = IsPointerUndefined(MTI->getRawSource());
     } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
       if (Instruction *I = SimplifyAnyMemSet(MSI))
         return I;
     }
 
+    // If src/dest is null, this memory intrinsic must be a noop.
+    if (SrcIsUndefined || IsPointerUndefined(MI->getRawDest())) {
+      Builder.CreateAssumption(Builder.CreateIsNull(MI->getLength()));
+      return eraseInstFromFunction(CI);
+    }
+
     if (Changed) return II;
   }
 
@@ -2480,6 +2503,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // fma fneg(x), fneg(y), z -> fma x, y, z
     Value *Src0 = II->getArgOperand(0);
     Value *Src1 = II->getArgOperand(1);
+    Value *Src2 = II->getArgOperand(2);
     Value *X, *Y;
     if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
       replaceOperand(*II, 0, X);
@@ -2497,10 +2521,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // Try to simplify the underlying FMul. We can only apply simplifications
     // that do not require rounding.
-    if (Value *V = simplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
-                                   II->getFastMathFlags(),
+    if (Value *V = simplifyFMAFMul(Src0, Src1, II->getFastMathFlags(),
                                    SQ.getWithInstruction(II))) {
-      auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
+      auto *FAdd = BinaryOperator::CreateFAdd(V, Src2);
       FAdd->copyFastMathFlags(II);
       return FAdd;
     }
@@ -2508,11 +2531,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // fma x, y, 0 -> fmul x, y
     // This is always valid for -0.0, but requires nsz for +0.0 as
     // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
-    if (match(II->getArgOperand(2), m_NegZeroFP()) ||
-        (match(II->getArgOperand(2), m_PosZeroFP()) &&
-         II->getFastMathFlags().noSignedZeros()))
+    if (match(Src2, m_NegZeroFP()) ||
+        (match(Src2, m_PosZeroFP()) && II->getFastMathFlags().noSignedZeros()))
       return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
 
+    // fma x, -1.0, y -> fsub y, x
+    if (match(Src1, m_SpecificFP(-1.0)))
+      return BinaryOperator::CreateFSubFMF(Src2, Src0, II);
+
     break;
   }
   case Intrinsic::copysign: {
@@ -3028,10 +3054,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // assume( (load addr) != null ) -> add 'nonnull' metadata to load
     // (if assume is valid at the load)
-    CmpInst::Predicate Pred;
     Instruction *LHS;
-    if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
-        Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_NE, m_Instruction(LHS),
+                                        m_Zero())) &&
+        LHS->getOpcode() == Instruction::Load &&
         LHS->getType()->isPointerTy() &&
         isValidAssumeForContext(II, LHS, &DT)) {
       MDNode *MD = MDNode::get(II->getContext(), std::nullopt);
@@ -3070,8 +3096,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // into
     // call void @llvm.assume(i1 true) [ "nonnull"(i32* %PTR) ]
     if (EnableKnowledgeRetention &&
-        match(IIOperand, m_Cmp(Pred, m_Value(A), m_Zero())) &&
-        Pred == CmpInst::ICMP_NE && A->getType()->isPointerTy()) {
+        match(IIOperand,
+              m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(A), m_Zero())) &&
+        A->getType()->isPointerTy()) {
       if (auto *Replacement = buildAssumeFromKnowledge(
               {RetainedKnowledge{Attribute::NonNull, 0, A}}, Next, &AC, &DT)) {
 
@@ -3091,9 +3118,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     uint64_t AlignMask;
     if (EnableKnowledgeRetention &&
         match(IIOperand,
-              m_Cmp(Pred, m_And(m_Value(A), m_ConstantInt(AlignMask)),
-                    m_Zero())) &&
-        Pred == CmpInst::ICMP_EQ) {
+              m_SpecificICmp(ICmpInst::ICMP_EQ,
+                             m_And(m_Value(A), m_ConstantInt(AlignMask)),
+                             m_Zero()))) {
       if (isPowerOf2_64(AlignMask + 1)) {
         uint64_t Offset = 0;
         match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 08d5b2f6e8794..ed93a0fba2b63 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2680,6 +2680,27 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
   return RetVal;
 }
 
+/// Fold (bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp) to
+/// copysign((bitcast Y to fp), X)
+static Value *foldCopySignIdioms(BitCastInst &CI,
+                                 InstCombiner::BuilderTy &Builder,
+                                 const SimplifyQuery &SQ) {
+  Value *X, *Y;
+  Type *FTy = CI.getType();
+  if (!FTy->isFPOrFPVectorTy())
+    return nullptr;
+  if (!match(&CI, m_ElementWiseBitCast(m_c_Or(
+                      m_And(m_ElementWiseBitCast(m_Value(X)), m_SignMask()),
+                      m_Value(Y)))))
+    return nullptr;
+  if (X->getType() != FTy)
+    return nullptr;
+  if (!isKnownNonNegative(Y, SQ))
+    return nullptr;
+
+  return Builder.CreateCopySign(Builder.CreateBitCast(Y, FTy), X);
+}
+
 Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
   // If the operands are integer typed then apply the integer transforms,
   // otherwise just apply the common ones.
@@ -2692,14 +2713,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
   if (DestTy == Src->getType())
     return replaceInstUsesWith(CI, Src);
 
-  if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
-    // Beware: messing with this target-specific oddity may cause trouble.
-    if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
-      Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
-      return InsertElementInst::Create(PoisonValue::get(DestTy), Elem,
-                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
-    }
-
+  if (isa<FixedVectorType>(DestTy)) {
     if (isa<IntegerType>(SrcTy)) {
       // If this is a cast from an integer to vector, check to see if the input
       // is a trunc or zext of a bitcast from vector.  If so, we can replace all
@@ -2828,6 +2842,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
   if (Instruction *I = foldBitCastSelect(CI, Builder))
     return I;
 
+  if (Value *V = foldCopySignIdioms(CI, Builder, SQ.getWithInstruction(&CI)))
+    return replaceInstUsesWith(CI, V);
+
   return commonCastTransforms(CI);
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index abadf54a96767..3b6df2760ecc2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5772,8 +5772,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
   //    -> icmp eq/ne X, rotate-left(X)
   // We generally try to convert rotate-right -> rotate-left, this just
   // canonicalizes another case.
-  CmpInst::Predicate PredUnused = Pred;
-  if (match(&I, m_c_ICmp(PredUnused, m_Value(A),
+  if (match(&I, m_c_ICmp(m_Value(A),
                          m_OneUse(m_Intrinsic<Intrinsic::fshr>(
                              m_Deferred(A), m_Deferred(A), m_Value(B))))))
     return new ICmpInst(
@@ -5783,8 +5782,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
   // Canonicalize:
   // icmp eq/ne OneUse(A ^ Cst), B --> icmp eq/ne (A ^ B), Cst
   Constant *Cst;
-  if (match(&I, m_c_ICmp(PredUnused,
-                         m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))),
+  if (match(&I, m_c_ICmp(m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))),
                          m_CombineAnd(m_Value(B), m_Unless(m_ImmConstant())))))
     return new ICmpInst(Pred, Builder.CreateXor(A, B), Cst);
 
@@ -5795,13 +5793,12 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
                                 m_c_Xor(m_Value(B), m_Deferred(A))),
                     m_Sub(m_Value(B), m_Deferred(A)));
     std::optional<bool> IsZero = std::nullopt;
-    if (match(&I, m_c_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)),
+    if (match(&I, m_c_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)),
                            m_Deferred(A))))
       IsZero = false;
     // (icmp eq/ne (and (add/sub/xor X, P2), P2), 0)
     else if (match(&I,
-                   m_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)),
-                          m_Zero())))
+                   m_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)), m_Zero())))
       IsZero = true;
 
     if (IsZero && isKnownToBeAPowerOfTwo(A, /* OrZero */ true, /*Depth*/ 0, &I))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index e387034110df9..a22ee1de0ac21 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1247,8 +1247,11 @@ bool InstCombinerImpl::replaceInInstruction(Value *V, Value *Old, Value *New,
   if (Depth == 2)
     return false;
 
+  assert(!isa<Constant>(Old) && "Only replace non-constant values");
+
   auto *I = dyn_cast<Instruction>(V);
-  if (!I || !I->hasOneUse() || !isSafeToSpeculativelyExecute(I))
+  if (!I || !I->hasOneUse() ||
+      !isSafeToSpeculativelyExecuteWithVariableReplaced(I))
     return false;
 
   bool Changed = false;
@@ -1331,7 +1334,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
     // profitability is not clear for other cases.
     // FIXME: Support vectors.
     if (OldOp == CmpLHS && match(NewOp, m_ImmConstant()) &&
-        !match(OldOp, m_ImmConstant()) && !Cmp.getType()->isVectorTy() &&
+        !match(OldOp, m_Constant()) && !Cmp.getType()->isVectorTy() &&
         isGuaranteedNotToBeUndef(NewOp, SQ.AC, &Sel, &DT))
       if (replaceInInstruction(TrueVal, OldOp, NewOp))
         return &Sel;
@@ -2481,9 +2484,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
 
   // Finally, see if the select is filtering out a shift-by-zero.
   Value *Cond = Sel.getCondition();
-  ICmpInst::Predicate Pred;
-  if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
-      Pred != ICmpInst::ICMP_EQ)
+  if (!match(Cond, m_OneUse(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(ShAmt),
+                                           m_ZeroInt()))))
     return nullptr;
 
   // If this is not a rotate then the select was blocking poison from the
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 081e783c964fd..c494fec84c1e6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -388,8 +388,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
       // invert the transform that reduces set bits and infinite-loop.
       Value *X;
       const APInt *CmpC;
-      ICmpInst::Predicate Pred;
-      if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
+      if (!match(I->getOperand(0), m_ICmp(m_Value(X), m_APInt(CmpC))) ||
           isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
         return ShrinkDemandedConstant(I, OpNo, DemandedMask);
 
@@ -806,34 +805,30 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
 
       // Signed shift right.
       APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
-      // If any of the high bits are demanded, we should set the sign bit as
-      // demanded.
-      if (DemandedMask.countl_zero() <= ShiftAmt)
+      // If any of the bits being shifted in are demanded, then we should set
+      // the sign bit as demanded.
+      bool ShiftedInBitsDemanded = DemandedMask.countl_zero() < ShiftAmt;
+      if (ShiftedInBitsDemanded)
         DemandedMaskIn.setSignBit();
-
       if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1, Q)) {
         // exact flag may not longer hold.
         I->dropPoisonGeneratingFlags();
         return I;
       }
 
-      Known = KnownBits::ashr(
-          Known, KnownBits::makeConstant(APInt(BitWidth, ShiftAmt)),
-          ShiftAmt != 0, I->isExact());
-
-      // If the input sign bit is known to be zero, or if none of the top bits
-      // are demanded, turn this into an unsigned shift right.
-      assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
-      APInt HighBits(APInt::getHighBitsSet(
-          BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
-      if (Known.Zero[BitWidth-ShiftAmt-1] ||
-          !DemandedMask.intersects(HighBits)) {
+      // If the input sign bit is known to be zero, or if none of the shifted in
+      // bits are demanded, turn this into an unsigned shift right.
+      if (Known.Zero[BitWidth - 1] || !ShiftedInBitsDemanded) {
         BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
                                                           I->getOperand(1));
         LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
         LShr->takeName(I);
         return InsertNewInstWith(LShr, I->getIterator());
       }
+
+      Known = KnownBits::ashr(
+          Known, KnownBits::makeConstant(APInt(BitWidth, ShiftAmt)),
+          ShiftAmt != 0, I->isExact());
     } else {
       llvm::computeKnownBits(I, Known, Depth, Q);
     }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 56e347dbf66a7..5b8931e735c9a 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1651,14 +1651,14 @@ static Constant *constantFoldOperationIntoSelectOperand(Instruction &I,
                                                         bool IsTrueArm) {
   SmallVector<Constant *> ConstOps;
   for (Value *Op : I.operands()) {
-    CmpInst::Predicate Pred;
     Constant *C = nullptr;
     if (Op == SI) {
       C = dyn_cast<Constant>(IsTrueArm ? SI->getTrueValue()
                                        : SI->getFalseValue());
     } else if (match(SI->getCondition(),
-                     m_ICmp(Pred, m_Specific(Op), m_Constant(C))) &&
-               Pred == (IsTrueArm ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) &&
+                     m_SpecificICmp(IsTrueArm ? ICmpInst::ICMP_EQ
+                                              : ICmpInst::ICMP_NE,
+                                    m_Specific(Op), m_Constant(C))) &&
                isGuaranteedNotToBeUndefOrPoison(C)) {
       // Pass
     } else {
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b936e2c8e38b5..5e6f01fe682cc 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -109,7 +109,7 @@ static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
 static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
 static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
 static const uint64_t kLoongArch64_ShadowOffset64 = 1ULL << 46;
-static const uint64_t kRISCV64_ShadowOffset64 = 0xd55550000;
+static const uint64_t kRISCV64_ShadowOffset64 = kDynamicShadowSentinel;
 static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
 static const uint64_t kFreeBSDAArch64_ShadowOffset64 = 1ULL << 47;
@@ -3421,6 +3421,8 @@ bool AddressSanitizer::instrumentFunction(Function &F,
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().starts_with("__asan_")) return false;
+  if (F.isPresplitCoroutine())
+    return false;
 
   if (TargetTriple.isSPIROrSPIRV()) {
     if (F.getName().contains("__sycl_service_kernel__"))
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index a0e63bf12400e..812874ff3c173 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1528,11 +1528,7 @@ static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE,
 
 bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
     Function &F, FunctionAnalysisManager &FAM) const {
-  bool Skip = [&]() {
-    if (ClRandomSkipRate.getNumOccurrences()) {
-      std::bernoulli_distribution D(ClRandomSkipRate);
-      return !D(*Rng);
-    }
+  auto SkipHot = [&]() {
     if (!ClHotPercentileCutoff.getNumOccurrences())
       return false;
     auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
@@ -1544,7 +1540,16 @@ bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
     }
     return PSI->isFunctionHotInCallGraphNthPercentile(
         ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F));
-  }();
+  };
+
+  auto SkipRandom = [&]() {
+    if (!ClRandomSkipRate.getNumOccurrences())
+      return false;
+    std::bernoulli_distribution D(ClRandomSkipRate);
+    return !D(*Rng);
+  };
+
+  bool Skip = SkipRandom() || SkipHot();
   emitRemark(F, FAM.getResult<OptimizationRemarkEmitterAnalysis>(F), Skip);
   return Skip;
 }
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f994f8a62c320..1805ea89272ec 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -170,6 +170,30 @@ cl::opt<bool> SkipRetExitBlock(
     "skip-ret-exit-block", cl::init(true),
     cl::desc("Suppress counter promotion if exit blocks contain ret."));
 
+static cl::opt<bool> SampledInstr("sampled-instrumentation", cl::ZeroOrMore,
+                                  cl::init(false),
+                                  cl::desc("Do PGO instrumentation sampling"));
+
+static cl::opt<unsigned> SampledInstrPeriod(
+    "sampled-instr-period",
+    cl::desc("Set the profile instrumentation sample period. For each sample "
+             "period, a fixed number of consecutive samples will be recorded. "
+             "The number is controlled by 'sampled-instr-burst-duration' flag. "
+             "The default sample period of 65535 is optimized for generating "
+             "efficient code that leverages unsigned integer wrapping in "
+             "overflow."),
+    cl::init(65535));
+
+static cl::opt<unsigned> SampledInstrBurstDuration(
+    "sampled-instr-burst-duration",
+    cl::desc("Set the profile instrumentation burst duration, which can range "
+             "from 0 to one less than the value of 'sampled-instr-period'. "
+             "This number of samples will be recorded for each "
+             "'sampled-instr-period' count update. Setting to 1 enables "
+             "simple sampling, in which case it is recommended to set "
+             "'sampled-instr-period' to a prime number."),
+    cl::init(200));
+
 using LoadStorePair = std::pair<Instruction *, Instruction *>;
 
 static uint64_t getIntModuleFlagOrZero(const Module &M, StringRef Flag) {
@@ -260,6 +284,9 @@ class InstrLowerer final {
   /// Returns true if profile counter update register promotion is enabled.
   bool isCounterPromotionEnabled() const;
 
+  /// Return true if profile sampling is enabled.
+  bool isSamplingEnabled() const;
+
   /// Count the number of instrumented value sites for the function.
   void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins);
 
@@ -291,6 +318,9 @@ class InstrLowerer final {
   /// acts on.
   Value *getCounterAddress(InstrProfCntrInstBase *I);
 
+  /// Lower the incremental instructions under profile sampling predicates.
+  void doSampling(Instruction *I);
+
   /// Get the region counters for an increment, creating them if necessary.
   ///
   /// If the counter array doesn't yet exist, the profile data variables
@@ -635,33 +665,169 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
   return PreservedAnalyses::none();
 }
 
+//
+// Perform instrumentation sampling.
+//
+// There are 3 favors of sampling:
+// (1) Full burst sampling: We transform:
+//   Increment_Instruction;
+// to:
+//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
+//     Increment_Instruction;
+//   }
+//   __llvm_profile_sampling__ += 1;
+//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
+//     __llvm_profile_sampling__ = 0;
+//   }
+//
+// "__llvm_profile_sampling__" is a thread-local global shared by all PGO
+// counters (value-instrumentation and edge instrumentation).
+//
+// (2) Fast burst sampling:
+// "__llvm_profile_sampling__" variable is an unsigned type, meaning it will
+// wrap around to zero when overflows. In this case, the second check is
+// unnecessary, so we won't generate check2 when the SampledInstrPeriod is
+// set to 65535 (64K - 1). The code after:
+//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
+//     Increment_Instruction;
+//   }
+//   __llvm_profile_sampling__ += 1;
+//
+// (3) Simple sampling:
+// When SampledInstrBurstDuration sets to 1, we do a simple sampling:
+//   __llvm_profile_sampling__ += 1;
+//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
+//     __llvm_profile_sampling__ = 0;
+//     Increment_Instruction;
+//   }
+//
+// Note that, the code snippet after the transformation can still be counter
+// promoted. However, with sampling enabled, counter updates are expected to
+// be infrequent, making the benefits of counter promotion negligible.
+// Moreover, counter promotion can potentially cause issues in server
+// applications, particularly when the counters are dumped without a clean
+// exit. To mitigate this risk, counter promotion is disabled by default when
+// sampling is enabled. This behavior can be overridden using the internal
+// option.
+void InstrLowerer::doSampling(Instruction *I) {
+  if (!isSamplingEnabled())
+    return;
+
+  unsigned SampledBurstDuration = SampledInstrBurstDuration.getValue();
+  unsigned SampledPeriod = SampledInstrPeriod.getValue();
+  if (SampledBurstDuration >= SampledPeriod) {
+    report_fatal_error(
+        "SampledPeriod needs to be greater than SampledBurstDuration");
+  }
+  bool UseShort = (SampledPeriod <= USHRT_MAX);
+  bool IsSimpleSampling = (SampledBurstDuration == 1);
+  // If (SampledBurstDuration == 1 && SampledPeriod == 65535), generate
+  // the simple sampling style code.
+  bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
+
+  auto GetConstant = [UseShort](IRBuilder<> &Builder, uint32_t C) {
+    if (UseShort)
+      return Builder.getInt16(C);
+    else
+      return Builder.getInt32(C);
+  };
+
+  IntegerType *SamplingVarTy;
+  if (UseShort)
+    SamplingVarTy = Type::getInt16Ty(M.getContext());
+  else
+    SamplingVarTy = Type::getInt32Ty(M.getContext());
+  auto *SamplingVar =
+      M.getGlobalVariable(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
+  assert(SamplingVar && "SamplingVar not set properly");
+
+  // Create the condition for checking the burst duration.
+  Instruction *SamplingVarIncr;
+  Value *NewSamplingVarVal;
+  MDBuilder MDB(I->getContext());
+  MDNode *BranchWeight;
+  IRBuilder<> CondBuilder(I);
+  auto *LoadSamplingVar = CondBuilder.CreateLoad(SamplingVarTy, SamplingVar);
+  if (IsSimpleSampling) {
+    // For the simple sampling, just create the load and increments.
+    IRBuilder<> IncBuilder(I);
+    NewSamplingVarVal =
+        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
+    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
+  } else {
+    // For the bust-sampling, create the conditonal update.
+    auto *DurationCond = CondBuilder.CreateICmpULE(
+        LoadSamplingVar, GetConstant(CondBuilder, SampledBurstDuration));
+    BranchWeight = MDB.createBranchWeights(
+        SampledBurstDuration, SampledPeriod + 1 - SampledBurstDuration);
+    Instruction *ThenTerm = SplitBlockAndInsertIfThen(
+        DurationCond, I, /* Unreachable */ false, BranchWeight);
+    IRBuilder<> IncBuilder(I);
+    NewSamplingVarVal =
+        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
+    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
+    I->moveBefore(ThenTerm);
+  }
+
+  if (IsFastSampling)
+    return;
+
+  // Create the condtion for checking the period.
+  Instruction *ThenTerm, *ElseTerm;
+  IRBuilder<> PeriodCondBuilder(SamplingVarIncr);
+  auto *PeriodCond = PeriodCondBuilder.CreateICmpUGE(
+      NewSamplingVarVal, GetConstant(PeriodCondBuilder, SampledPeriod));
+  BranchWeight = MDB.createBranchWeights(1, SampledPeriod);
+  SplitBlockAndInsertIfThenElse(PeriodCond, SamplingVarIncr, &ThenTerm,
+                                &ElseTerm, BranchWeight);
+
+  // For the simple sampling, the counter update happens in sampling var reset.
+  if (IsSimpleSampling)
+    I->moveBefore(ThenTerm);
+
+  IRBuilder<> ResetBuilder(ThenTerm);
+  ResetBuilder.CreateStore(GetConstant(ResetBuilder, 0), SamplingVar);
+  SamplingVarIncr->moveBefore(ElseTerm);
+}
+
 bool InstrLowerer::lowerIntrinsics(Function *F) {
   bool MadeChange = false;
   PromotionCandidates.clear();
+  SmallVector<InstrProfInstBase *, 8> InstrProfInsts;
+
+  // To ensure compatibility with sampling, we save the intrinsics into
+  // a buffer to prevent potential breakage of the iterator (as the
+  // intrinsics will be moved to a different BB).
   for (BasicBlock &BB : *F) {
     for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
-      if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
-        lowerIncrement(IPIS);
-        MadeChange = true;
-      } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
-        lowerIncrement(IPI);
-        MadeChange = true;
-      } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(&Instr)) {
-        lowerTimestamp(IPC);
-        MadeChange = true;
-      } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(&Instr)) {
-        lowerCover(IPC);
-        MadeChange = true;
-      } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
-        lowerValueProfileInst(IPVP);
-        MadeChange = true;
-      } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(&Instr)) {
-        IPMP->eraseFromParent();
-        MadeChange = true;
-      } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(&Instr)) {
-        lowerMCDCTestVectorBitmapUpdate(IPBU);
-        MadeChange = true;
-      }
+      if (auto *IP = dyn_cast<InstrProfInstBase>(&Instr))
+        InstrProfInsts.push_back(IP);
+    }
+  }
+
+  for (auto *Instr : InstrProfInsts) {
+    doSampling(Instr);
+    if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(Instr)) {
+      lowerIncrement(IPIS);
+      MadeChange = true;
+    } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(Instr)) {
+      lowerIncrement(IPI);
+      MadeChange = true;
+    } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(Instr)) {
+      lowerTimestamp(IPC);
+      MadeChange = true;
+    } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(Instr)) {
+      lowerCover(IPC);
+      MadeChange = true;
+    } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+      lowerValueProfileInst(IPVP);
+      MadeChange = true;
+    } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(Instr)) {
+      IPMP->eraseFromParent();
+      MadeChange = true;
+    } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(Instr)) {
+      lowerMCDCTestVectorBitmapUpdate(IPBU);
+      MadeChange = true;
     }
   }
 
@@ -684,6 +850,12 @@ bool InstrLowerer::isRuntimeCounterRelocationEnabled() const {
   return TT.isOSFuchsia();
 }
 
+bool InstrLowerer::isSamplingEnabled() const {
+  if (SampledInstr.getNumOccurrences() > 0)
+    return SampledInstr;
+  return Options.Sampling;
+}
+
 bool InstrLowerer::isCounterPromotionEnabled() const {
   if (DoCounterPromotion.getNumOccurrences() > 0)
     return DoCounterPromotion;
@@ -754,6 +926,9 @@ bool InstrLowerer::lower() {
   if (NeedsRuntimeHook)
     MadeChange = emitRuntimeHook();
 
+  if (!IsCS && isSamplingEnabled())
+    createProfileSamplingVar(M);
+
   bool ContainsProfiling = containsProfilingIntrinsics(M);
   GlobalVariable *CoverageNamesVar =
       M.getNamedGlobal(getCoverageUnusedNamesVarName());
@@ -945,6 +1120,9 @@ Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
     IRBuilder<> EntryBuilder(&Fn->getEntryBlock().front());
     auto *Bias = getOrCreateBiasVar(getInstrProfCounterBiasVarName());
     BiasLI = EntryBuilder.CreateLoad(Int64Ty, Bias, "profc_bias");
+    // Bias doesn't change after startup.
+    BiasLI->setMetadata(LLVMContext::MD_invariant_load,
+                        MDNode::get(M.getContext(), std::nullopt));
   }
   auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), BiasLI);
   return Builder.CreateIntToPtr(Add, Addr->getType());
@@ -1014,18 +1192,22 @@ CallInst *InstrLowerer::getRMWOrCall(Value *Addr, Value *Val) {
 
 Value *InstrLowerer::getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I) {
   auto *Bitmaps = getOrCreateRegionBitmaps(I);
-  IRBuilder<> Builder(I);
-
-  if (isRuntimeCounterRelocationEnabled()) {
-    LLVMContext &Ctx = M.getContext();
-    Ctx.diagnose(DiagnosticInfoPGOProfile(
-        M.getName().data(),
-        Twine("Runtime counter relocation is presently not supported for MC/DC "
-              "bitmaps."),
-        DS_Warning));
-  }
+  if (!isRuntimeCounterRelocationEnabled())
+    return Bitmaps;
 
-  return Bitmaps;
+  // Put BiasLI onto the entry block.
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  Function *Fn = I->getFunction();
+  IRBuilder<> EntryBuilder(&Fn->getEntryBlock().front());
+  auto *Bias = getOrCreateBiasVar(getInstrProfBitmapBiasVarName());
+  auto *BiasLI = EntryBuilder.CreateLoad(Int64Ty, Bias, "profbm_bias");
+  // Assume BiasLI invariant (in the function at least)
+  BiasLI->setMetadata(LLVMContext::MD_invariant_load,
+                      MDNode::get(M.getContext(), std::nullopt));
+
+  // Add Bias to Bitmaps and put it before the intrinsic.
+  IRBuilder<> Builder(I);
+  return Builder.CreatePtrAdd(Bitmaps, BiasLI, "profbm_addr");
 }
 
 void InstrLowerer::lowerCover(InstrProfCoverInst *CoverInstruction) {
@@ -1952,3 +2134,29 @@ void InstrLowerer::emitInitialization() {
 
   appendToGlobalCtors(M, F, 0);
 }
+
+namespace llvm {
+// Create the variable for profile sampling.
+void createProfileSamplingVar(Module &M) {
+  const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
+  IntegerType *SamplingVarTy;
+  Constant *ValueZero;
+  if (SampledInstrPeriod.getValue() <= USHRT_MAX) {
+    SamplingVarTy = Type::getInt16Ty(M.getContext());
+    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(16, 0));
+  } else {
+    SamplingVarTy = Type::getInt32Ty(M.getContext());
+    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(32, 0));
+  }
+  auto SamplingVar = new GlobalVariable(
+      M, SamplingVarTy, false, GlobalValue::WeakAnyLinkage, ValueZero, VarName);
+  SamplingVar->setVisibility(GlobalValue::DefaultVisibility);
+  SamplingVar->setThreadLocal(true);
+  Triple TT(M.getTargetTriple());
+  if (TT.supportsCOMDAT()) {
+    SamplingVar->setLinkage(GlobalValue::ExternalLinkage);
+    SamplingVar->setComdat(M.getOrInsertComdat(VarName));
+  }
+  appendToCompilerUsed(M, SamplingVar);
+}
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 0115809e939e8..19cf7dc7e7544 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -76,13 +76,25 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
   SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue;
   std::unique_ptr<RandomNumberGenerator> Rng;
 
-  auto ShouldRemove = [&](bool IsHot) {
-    if (!RandomRate.getNumOccurrences())
-      return IsHot;
+  auto GetRng = [&]() -> RandomNumberGenerator & {
     if (!Rng)
       Rng = F.getParent()->createRNG(F.getName());
-    std::bernoulli_distribution D(RandomRate);
-    return !D(*Rng);
+    return *Rng;
+  };
+
+  auto ShouldRemoveHot = [&](const BasicBlock &BB) {
+    return HotPercentileCutoff.getNumOccurrences() && PSI &&
+           PSI->isHotCountNthPercentile(
+               HotPercentileCutoff, BFI.getBlockProfileCount(&BB).value_or(0));
+  };
+
+  auto ShouldRemoveRandom = [&]() {
+    return RandomRate.getNumOccurrences() &&
+           !std::bernoulli_distribution(RandomRate)(GetRng());
+  };
+
+  auto ShouldRemove = [&](const BasicBlock &BB) {
+    return ShouldRemoveRandom() || ShouldRemoveHot(BB);
   };
 
   for (BasicBlock &BB : F) {
@@ -96,13 +108,7 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
       case Intrinsic::allow_runtime_check: {
         ++NumChecksTotal;
 
-        bool IsHot = false;
-        if (PSI) {
-          uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0);
-          IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count);
-        }
-
-        bool ToRemove = ShouldRemove(IsHot);
+        bool ToRemove = ShouldRemove(BB);
         ReplaceWithValue.push_back({
             II,
             ToRemove,
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 2c5d749d4a67a..445bf0bb26cc4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -61,6 +61,9 @@ constexpr int LLVM_MEM_PROFILER_VERSION = 1;
 // Size of memory mapped to a single shadow location.
 constexpr uint64_t DefaultMemGranularity = 64;
 
+// Size of memory mapped to a single histogram bucket.
+constexpr uint64_t HistogramGranularity = 8;
+
 // Scale from granularity down to shadow size.
 constexpr uint64_t DefaultShadowScale = 3;
 
@@ -192,7 +195,7 @@ namespace {
 struct ShadowMapping {
   ShadowMapping() {
     Scale = ClMappingScale;
-    Granularity = ClMappingGranularity;
+    Granularity = ClHistogram ? HistogramGranularity : ClMappingGranularity;
     Mask = ~(Granularity - 1);
   }
 
@@ -276,6 +279,8 @@ MemProfilerPass::MemProfilerPass() = default;
 
 PreservedAnalyses MemProfilerPass::run(Function &F,
                                        AnalysisManager<Function> &AM) {
+  assert((!ClHistogram || ClMappingGranularity == DefaultMemGranularity) &&
+         "Memprof with histogram only supports default mapping granularity");
   Module &M = *F.getParent();
   MemProfiler Profiler(M);
   if (Profiler.instrumentFunction(F))
@@ -288,10 +293,6 @@ ModuleMemProfilerPass::ModuleMemProfilerPass() = default;
 PreservedAnalyses ModuleMemProfilerPass::run(Module &M,
                                              AnalysisManager<Module> &AM) {
 
-  assert((!ClHistogram || (ClHistogram && ClUseCalls)) &&
-         "Cannot use -memprof-histogram without Callbacks. Set "
-         "memprof-use-callbacks");
-
   ModuleMemProfiler Profiler(M);
   if (Profiler.instrumentModule(M))
     return PreservedAnalyses::none();
@@ -489,14 +490,21 @@ void MemProfiler::instrumentAddress(Instruction *OrigIns,
     return;
   }
 
-  // Create an inline sequence to compute shadow location, and increment the
-  // value by one.
-  Type *ShadowTy = Type::getInt64Ty(*C);
+  Type *ShadowTy = ClHistogram ? Type::getInt8Ty(*C) : Type::getInt64Ty(*C);
   Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+
   Value *ShadowPtr = memToShadow(AddrLong, IRB);
   Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
   Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
-  Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1);
+  // If we are profiling with histograms, add overflow protection at 255.
+  if (ClHistogram) {
+    Value *MaxCount = ConstantInt::get(Type::getInt8Ty(*C), 255);
+    Value *Cmp = IRB.CreateICmpULT(ShadowValue, MaxCount);
+    Instruction *IncBlock =
+        SplitBlockAndInsertIfThen(Cmp, InsertBefore, /*Unreachable=*/false);
+    IRB.SetInsertPoint(IncBlock);
+  }
+  Value *Inc = ConstantInt::get(ShadowTy, 1);
   ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
   IRB.CreateStore(ShadowValue, ShadowAddr);
 }
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c7d41f6298372..45b3edf2b8f23 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -11,11 +11,11 @@
 /// reads.
 ///
 /// The algorithm of the tool is similar to Memcheck
-/// (http://goo.gl/QKbem). We associate a few shadow bits with every
-/// byte of the application memory, poison the shadow of the malloc-ed
-/// or alloca-ed memory, load the shadow bits on every memory read,
-/// propagate the shadow bits through some of the arithmetic
-/// instruction (including MOV), store the shadow bits on every memory
+/// (https://static.usenix.org/event/usenix05/tech/general/full_papers/seward/seward_html/usenix2005.html)
+/// We associate a few shadow bits with every byte of the application memory,
+/// poison the shadow of the malloc-ed or alloca-ed memory, load the shadow,
+/// bits on every memory read, propagate the shadow bits through some of the
+/// arithmetic instruction (including MOV), store the shadow bits on every memory
 /// write, report a bug on some other instructions (e.g. JMP) if the
 /// associated shadow is poisoned.
 ///
@@ -181,6 +181,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
@@ -2498,6 +2499,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         MSV->setOrigin(I, Origin);
       }
     }
+
+    /// Store the current combined value at the specified origin
+    /// location.
+    void DoneAndStoreOrigin(TypeSize TS, Value *OriginPtr) {
+      if (MSV->MS.TrackOrigins) {
+        assert(Origin);
+        MSV->paintOrigin(IRB, Origin, OriginPtr, TS, kMinOriginAlignment);
+      }
+    }
   };
 
   using ShadowAndOriginCombiner = Combiner<true>;
@@ -2977,8 +2987,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// Caller guarantees that this intrinsic does not access memory.
   bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
     Type *RetTy = I.getType();
-    if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() ||
-          RetTy->isX86_MMXTy()))
+    if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy()))
       return false;
 
     unsigned NumArgOperands = I.arg_size();
@@ -3208,7 +3217,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an X86_MMX-sized vector type.
+  // Get an MMX-sized vector type.
   Type *getMMXVectorTy(unsigned EltSizeInBits) {
     const unsigned X86_MMXSizeInBits = 64;
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
@@ -3254,20 +3263,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   // packs elements of 2 input vectors into half as many bits with saturation.
   // Shadow is propagated with the signed variant of the same intrinsic applied
   // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
-  // EltSizeInBits is used only for x86mmx arguments.
-  void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+  // MMXEltSizeInBits is used only for x86mmx arguments.
+  void handleVectorPackIntrinsic(IntrinsicInst &I,
+                                 unsigned MMXEltSizeInBits = 0) {
     assert(I.arg_size() == 2);
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
     IRBuilder<> IRB(&I);
     Value *S1 = getShadow(&I, 0);
     Value *S2 = getShadow(&I, 1);
-    assert(isX86_MMX || S1->getType()->isVectorTy());
+    assert(S1->getType()->isVectorTy());
 
     // SExt and ICmpNE below must apply to individual elements of input vectors.
     // In case of x86mmx arguments, cast them to appropriate vector types and
     // back.
-    Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
-    if (isX86_MMX) {
+    Type *T =
+        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType();
+    if (MMXEltSizeInBits) {
       S1 = IRB.CreateBitCast(S1, T);
       S2 = IRB.CreateBitCast(S2, T);
     }
@@ -3275,10 +3285,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
     Value *S2_ext =
         IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
-    if (isX86_MMX) {
-      Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
-      S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
-      S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+    if (MMXEltSizeInBits) {
+      S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64));
+      S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64));
     }
 
     Function *ShadowFn = Intrinsic::getDeclaration(
@@ -3286,7 +3295,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     Value *S =
         IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
-    if (isX86_MMX)
+    if (MMXEltSizeInBits)
       S = IRB.CreateBitCast(S, getShadowTy(&I));
     setShadow(&I, S);
     setOriginForNaryOp(I);
@@ -3393,10 +3402,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   // Instrument sum-of-absolute-differences intrinsic.
-  void handleVectorSadIntrinsic(IntrinsicInst &I) {
+  void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) {
     const unsigned SignificantBitsPerResultElement = 16;
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
-    Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+    Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType();
     unsigned ZeroBitsPerResultElement =
         ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
 
@@ -3415,9 +3423,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   // Instrument multiply-add intrinsic.
   void handleVectorPmaddIntrinsic(IntrinsicInst &I,
-                                  unsigned EltSizeInBits = 0) {
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
-    Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+                                  unsigned MMXEltSizeInBits = 0) {
+    Type *ResTy =
+        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
     IRBuilder<> IRB(&I);
     auto *Shadow0 = getShadow(&I, 0);
     auto *Shadow1 = getShadow(&I, 1);
@@ -3865,6 +3873,85 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  /// Handle Arm NEON vector store intrinsics (vst{2,3,4} and vst1x_{2,3,4}).
+  ///
+  /// Arm NEON vector store intrinsics have the output address (pointer) as the
+  /// last argument, with the initial arguments being the inputs. They return
+  /// void.
+  ///
+  /// - st4 interleaves the output e.g., st4 (inA, inB, inC, inD, outP) writes
+  ///   abcdabcdabcdabcd... into *outP
+  /// - st1_x4 is non-interleaved e.g., st1_x4 (inA, inB, inC, inD, outP)
+  ///   writes aaaa...bbbb...cccc...dddd... into *outP
+  /// These instructions can all be instrumented with essentially the same
+  /// MSan logic, simply by applying the corresponding intrinsic to the shadow.
+  void handleNEONVectorStoreIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+
+    // Don't use getNumOperands() because it includes the callee
+    int numArgOperands = I.arg_size();
+    assert(numArgOperands >= 1);
+
+    // The last arg operand is the output
+    Value *Addr = I.getArgOperand(numArgOperands - 1);
+    assert(Addr->getType()->isPointerTy());
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    SmallVector<Value *, 8> Shadows;
+    // Every arg operand, other than the last one, is an input vector
+    for (int i = 0; i < numArgOperands - 1; i++) {
+      assert(isa<FixedVectorType>(I.getArgOperand(i)->getType()));
+      Value *Shadow = getShadow(&I, i);
+      Shadows.append(1, Shadow);
+    }
+
+    // MSan's GetShadowTy assumes the LHS is the type we want the shadow for
+    // e.g., for:
+    //     [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+    // we know the type of the output (and its shadow) is <16 x i8>.
+    //
+    // Arm NEON VST is unusual because the last argument is the output address:
+    //     define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) {
+    //         call void @llvm.aarch64.neon.st2.v16i8.p0
+    //                   (<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
+    // and we have no type information about P's operand. We must manually
+    // compute the type (<16 x i8> x 2).
+    FixedVectorType *OutputVectorTy = FixedVectorType::get(
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getElementType(),
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements() *
+            (numArgOperands - 1));
+    Type *OutputShadowTy = getShadowTy(OutputVectorTy);
+
+    Value *OutputShadowPtr, *OutputOriginPtr;
+    // AArch64 NEON does not need alignment (unless OS requires it)
+    std::tie(OutputShadowPtr, OutputOriginPtr) = getShadowOriginPtr(
+        Addr, IRB, OutputShadowTy, Align(1), /*isStore*/ true);
+    Shadows.append(1, OutputShadowPtr);
+
+    // CreateIntrinsic will select the correct (integer) type for the
+    // intrinsic; the original instruction I may have either integer- or
+    // float-type inputs.
+    CallInst *CI =
+        IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), Shadows);
+    setShadow(&I, CI);
+
+    if (MS.TrackOrigins) {
+      // TODO: if we modelled the vst* instruction more precisely, we could
+      // more accurately track the origins (e.g., if both inputs are
+      // uninitialized for vst2, we currently blame the second input, even
+      // though part of the output depends only on the first input).
+      OriginCombiner OC(this, IRB);
+      for (int i = 0; i < numArgOperands - 1; i++)
+        OC.Add(I.getArgOperand(i));
+
+      const DataLayout &DL = F.getDataLayout();
+      OC.DoneAndStoreOrigin(DL.getTypeStoreSize(OutputVectorTy),
+                            OutputOriginPtr);
+    }
+  }
+
   void visitIntrinsicInst(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case Intrinsic::uadd_with_overflow:
@@ -4088,6 +4175,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
 
     case Intrinsic::x86_mmx_psad_bw:
+      handleVectorSadIntrinsic(I, true);
+      break;
     case Intrinsic::x86_sse2_psad_bw:
     case Intrinsic::x86_avx2_psad_bw:
       handleVectorSadIntrinsic(I);
@@ -4204,6 +4293,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setOrigin(&I, getCleanOrigin());
       break;
 
+    case Intrinsic::aarch64_neon_st1x2:
+    case Intrinsic::aarch64_neon_st1x3:
+    case Intrinsic::aarch64_neon_st1x4:
+    case Intrinsic::aarch64_neon_st2:
+    case Intrinsic::aarch64_neon_st3:
+    case Intrinsic::aarch64_neon_st4: {
+      handleNEONVectorStoreIntrinsic(I);
+      break;
+    }
+
     default:
       if (!handleUnknownIntrinsic(I))
         visitInstruction(I);
@@ -4968,7 +5067,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase {
     Type *T = arg->getType();
     if (T->isX86_FP80Ty())
       return AK_Memory;
-    if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+    if (T->isFPOrFPVectorTy())
       return AK_FloatingPoint;
     if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
       return AK_GeneralPurpose;
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index 99b1c779f3167..5872396669435 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -493,6 +493,60 @@ class ValueToShadowMap {
   DenseMap<Value *, Value *> Map;
 };
 
+class NsanMemOpFn {
+public:
+  NsanMemOpFn(Module &M, ArrayRef<StringRef> Sized, StringRef Fallback,
+              size_t NumArgs);
+  FunctionCallee getFunctionFor(uint64_t MemOpSize) const;
+  FunctionCallee getFallback() const;
+
+private:
+  SmallVector<FunctionCallee> Funcs;
+  size_t NumSizedFuncs;
+};
+
+NsanMemOpFn::NsanMemOpFn(Module &M, ArrayRef<StringRef> Sized,
+                         StringRef Fallback, size_t NumArgs) {
+  LLVMContext &Ctx = M.getContext();
+  AttributeList Attr;
+  Attr = Attr.addFnAttribute(Ctx, Attribute::NoUnwind);
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  IntegerType *IntptrTy = M.getDataLayout().getIntPtrType(Ctx);
+  FunctionType *SizedFnTy = nullptr;
+
+  NumSizedFuncs = Sized.size();
+
+  // First entry is fallback function
+  if (NumArgs == 3) {
+    Funcs.push_back(
+        M.getOrInsertFunction(Fallback, Attr, VoidTy, PtrTy, PtrTy, IntptrTy));
+    SizedFnTy = FunctionType::get(VoidTy, {PtrTy, PtrTy}, false);
+  } else if (NumArgs == 2) {
+    Funcs.push_back(
+        M.getOrInsertFunction(Fallback, Attr, VoidTy, PtrTy, IntptrTy));
+    SizedFnTy = FunctionType::get(VoidTy, {PtrTy}, false);
+  } else {
+    llvm_unreachable("Unexpected value of sized functions arguments");
+  }
+
+  for (size_t i = 0; i < NumSizedFuncs; ++i)
+    Funcs.push_back(M.getOrInsertFunction(Sized[i], SizedFnTy, Attr));
+}
+
+FunctionCallee NsanMemOpFn::getFunctionFor(uint64_t MemOpSize) const {
+  // Now `getFunctionFor` operates on `Funcs` of size 4 (at least) and the
+  // following code assumes that the number of functions in `Func` is sufficient
+  assert(NumSizedFuncs >= 3 && "Unexpected number of sized functions");
+
+  size_t Idx =
+      MemOpSize == 4 ? 1 : (MemOpSize == 8 ? 2 : (MemOpSize == 16 ? 3 : 0));
+
+  return Funcs[Idx];
+}
+
+FunctionCallee NsanMemOpFn::getFallback() const { return Funcs[0]; }
+
 /// Instantiating NumericalStabilitySanitizer inserts the nsan runtime library
 /// API function declarations into the module if they don't exist already.
 /// Instantiating ensures the __nsan_init function is in the list of global
@@ -550,12 +604,16 @@ class NumericalStabilitySanitizer {
   LLVMContext &Context;
   MappingConfig Config;
   IntegerType *IntptrTy = nullptr;
+
+  // TODO: Use std::array instead?
   FunctionCallee NsanGetShadowPtrForStore[FTValueType::kNumValueTypes] = {};
   FunctionCallee NsanGetShadowPtrForLoad[FTValueType::kNumValueTypes] = {};
   FunctionCallee NsanCheckValue[FTValueType::kNumValueTypes] = {};
   FunctionCallee NsanFCmpFail[FTValueType::kNumValueTypes] = {};
-  FunctionCallee NsanCopyValues;
-  FunctionCallee NsanSetValueUnknown;
+
+  NsanMemOpFn NsanCopyFns;
+  NsanMemOpFn NsanSetUnknownFns;
+
   FunctionCallee NsanGetRawShadowTypePtr;
   FunctionCallee NsanGetRawShadowPtr;
   GlobalValue *NsanShadowRetTag = nullptr;
@@ -598,7 +656,14 @@ static GlobalValue *createThreadLocalGV(const char *Name, Module &M, Type *Ty) {
 }
 
 NumericalStabilitySanitizer::NumericalStabilitySanitizer(Module &M)
-    : DL(M.getDataLayout()), Context(M.getContext()), Config(Context) {
+    : DL(M.getDataLayout()), Context(M.getContext()), Config(Context),
+      NsanCopyFns(M, {"__nsan_copy_4", "__nsan_copy_8", "__nsan_copy_16"},
+                  "__nsan_copy_values", /*NumArgs=*/3),
+      NsanSetUnknownFns(M,
+                        {"__nsan_set_value_unknown_4",
+                         "__nsan_set_value_unknown_8",
+                         "__nsan_set_value_unknown_16"},
+                        "__nsan_set_value_unknown", /*NumArgs=*/2) {
   IntptrTy = DL.getIntPtrType(Context);
   Type *PtrTy = PointerType::getUnqual(Context);
   Type *Int32Ty = Type::getInt32Ty(Context);
@@ -634,11 +699,6 @@ NumericalStabilitySanitizer::NumericalStabilitySanitizer(Module &M)
         Attr, VoidTy, VTTy, VTTy, ShadowTy, ShadowTy, Int32Ty, Int1Ty, Int1Ty);
   }
 
-  NsanCopyValues = M.getOrInsertFunction("__nsan_copy_values", Attr, VoidTy,
-                                         PtrTy, PtrTy, IntptrTy);
-  NsanSetValueUnknown = M.getOrInsertFunction("__nsan_set_value_unknown", Attr,
-                                              VoidTy, PtrTy, IntptrTy);
-
   // TODO: Add attributes nofree, nosync, readnone, readonly,
   NsanGetRawShadowTypePtr = M.getOrInsertFunction(
       "__nsan_internal_get_raw_shadow_type_ptr", Attr, PtrTy, PtrTy);
@@ -1655,7 +1715,7 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
                                Map.getShadow(BinOp->getOperand(1)));
 
   if (isa<UIToFPInst>(&Inst) || isa<SIToFPInst>(&Inst)) {
-    auto *Cast = dyn_cast<CastInst>(&Inst);
+    auto *Cast = cast<CastInst>(&Inst);
     return Builder.CreateCast(Cast->getOpcode(), Cast->getOperand(0),
                               ExtendedVT);
   }
@@ -1665,6 +1725,9 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
                                 Map.getShadow(S->getTrueValue()),
                                 Map.getShadow(S->getFalseValue()));
 
+  if (auto *Freeze = dyn_cast<FreezeInst>(&Inst))
+    return Builder.CreateFreeze(Map.getShadow(Freeze->getOperand(0)));
+
   if (auto *Extract = dyn_cast<ExtractElementInst>(&Inst))
     return Builder.CreateExtractElement(
         Map.getShadow(Extract->getVectorOperand()), Extract->getIndexOperand());
@@ -1880,7 +1943,7 @@ void NumericalStabilitySanitizer::propagateNonFTStore(
     }
   }
   // All other stores just reset the shadow value to unknown.
-  Builder.CreateCall(NsanSetValueUnknown, {Dst, ValueSize});
+  Builder.CreateCall(NsanSetUnknownFns.getFallback(), {Dst, ValueSize});
 }
 
 void NumericalStabilitySanitizer::propagateShadowValues(
@@ -2105,7 +2168,7 @@ bool NumericalStabilitySanitizer::sanitizeFunction(
 
   // The last pass populates shadow phis with shadow values.
   for (PHINode *Phi : OriginalPhis) {
-    PHINode *ShadowPhi = dyn_cast<PHINode>(ValueToShadow.getShadow(Phi));
+    PHINode *ShadowPhi = cast<PHINode>(ValueToShadow.getShadow(Phi));
     for (unsigned I : seq(Phi->getNumOperands())) {
       Value *V = Phi->getOperand(I);
       Value *Shadow = ValueToShadow.getShadow(V);
@@ -2123,21 +2186,45 @@ bool NumericalStabilitySanitizer::sanitizeFunction(
   return !ValueToShadow.empty();
 }
 
+static uint64_t GetMemOpSize(Value *V) {
+  uint64_t OpSize = 0;
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    auto *CInt = dyn_cast<ConstantInt>(C);
+    if (CInt && CInt->getValue().getBitWidth() <= 64)
+      OpSize = CInt->getValue().getZExtValue();
+  }
+
+  return OpSize;
+}
+
 // Instrument the memory intrinsics so that they properly modify the shadow
 // memory.
 bool NumericalStabilitySanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
   IRBuilder<> Builder(MI);
   if (auto *M = dyn_cast<MemSetInst>(MI)) {
-    Builder.CreateCall(
-        NsanSetValueUnknown,
-        {/*Address=*/M->getArgOperand(0),
-         /*Size=*/Builder.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    FunctionCallee SetUnknownFn =
+        NsanSetUnknownFns.getFunctionFor(GetMemOpSize(M->getArgOperand(2)));
+    if (SetUnknownFn.getFunctionType()->getNumParams() == 1)
+      Builder.CreateCall(SetUnknownFn, {/*Address=*/M->getArgOperand(0)});
+    else
+      Builder.CreateCall(SetUnknownFn,
+                         {/*Address=*/M->getArgOperand(0),
+                          /*Size=*/Builder.CreateIntCast(M->getArgOperand(2),
+                                                         IntptrTy, false)});
+
   } else if (auto *M = dyn_cast<MemTransferInst>(MI)) {
-    Builder.CreateCall(
-        NsanCopyValues,
-        {/*Destination=*/M->getArgOperand(0),
-         /*Source=*/M->getArgOperand(1),
-         /*Size=*/Builder.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    FunctionCallee CopyFn =
+        NsanCopyFns.getFunctionFor(GetMemOpSize(M->getArgOperand(2)));
+
+    if (CopyFn.getFunctionType()->getNumParams() == 2)
+      Builder.CreateCall(CopyFn, {/*Destination=*/M->getArgOperand(0),
+                                  /*Source=*/M->getArgOperand(1)});
+    else
+      Builder.CreateCall(CopyFn, {/*Destination=*/M->getArgOperand(0),
+                                  /*Source=*/M->getArgOperand(1),
+                                  /*Size=*/
+                                  Builder.CreateIntCast(M->getArgOperand(2),
+                                                        IntptrTy, false)});
   }
   return false;
 }
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 35b1bbf21be97..1ce8f58c1aa14 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1096,7 +1096,7 @@ class PGOUseFunc {
       : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
         FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS,
                  InstrumentFuncEntry, HasSingleByteCoverage),
-        FreqAttr(FFA_Normal), IsCS(IsCS) {}
+        FreqAttr(FFA_Normal), IsCS(IsCS), VPC(Func, TLI) {}
 
   void handleInstrProfError(Error Err, uint64_t MismatchedFuncSum);
 
@@ -1178,6 +1178,8 @@ class PGOUseFunc {
   // Is to use the context sensitive profile.
   bool IsCS;
 
+  ValueProfileCollector VPC;
+
   // Find the Instrumented BB and set the value. Return false on error.
   bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
 
@@ -1755,8 +1757,23 @@ void PGOUseFunc::annotateValueSites() {
 void PGOUseFunc::annotateValueSites(uint32_t Kind) {
   assert(Kind <= IPVK_Last);
   unsigned ValueSiteIndex = 0;
-  auto &ValueSites = FuncInfo.ValueSites[Kind];
+
   unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+
+  // Since there isn't a reliable or fast way for profile reader to tell if a
+  // profile is generated with `-enable-vtable-value-profiling` on, we run the
+  // value profile collector over the function IR to find the instrumented sites
+  // iff function profile records shows the number of instrumented vtable sites
+  // is not zero. Function cfg already takes the number of instrumented
+  // indirect call sites into account so it doesn't hash the number of
+  // instrumented vtables; as a side effect it makes it easier to enable
+  // profiling and profile use in two steps if needed.
+  // TODO: Remove this if/when -enable-vtable-value-profiling is on by default.
+  if (NumValueSites > 0 && Kind == IPVK_VTableTarget &&
+      NumValueSites != FuncInfo.ValueSites[IPVK_VTableTarget].size() &&
+      MaxNumVTableAnnotations != 0)
+    FuncInfo.ValueSites[IPVK_VTableTarget] = VPC.get(IPVK_VTableTarget);
+  auto &ValueSites = FuncInfo.ValueSites[Kind];
   if (NumValueSites != ValueSites.size()) {
     auto &Ctx = M->getContext();
     Ctx.diagnose(DiagnosticInfoPGOProfile(
@@ -1875,6 +1892,8 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &MAM) {
   // The variable in a comdat may be discarded by LTO. Ensure the declaration
   // will be retained.
   appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true));
+  if (ProfileSampling)
+    createProfileSamplingVar(M);
   PreservedAnalyses PA;
   PA.preserve<FunctionAnalysisManagerModuleProxy>();
   PA.preserveSet<AllAnalysesOn<Function>>();
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 4371b821eae63..d2268f0a30a25 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -149,9 +149,7 @@ class DFAJumpThreading {
   unfoldSelectInstrs(DominatorTree *DT,
                      const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
     DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-    SmallVector<SelectInstToUnfold, 4> Stack;
-    for (SelectInstToUnfold SIToUnfold : SelectInsts)
-      Stack.push_back(SIToUnfold);
+    SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts);
 
     while (!Stack.empty()) {
       SelectInstToUnfold SIToUnfold = Stack.pop_back_val();
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 931606c6f8fe1..992139a95a43d 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1889,12 +1889,12 @@ struct DSEState {
         return true;
       auto *Ptr = Memset->getArgOperand(0);
       auto *TI = MallocBB->getTerminator();
-      ICmpInst::Predicate Pred;
       BasicBlock *TrueBB, *FalseBB;
-      if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
-                          FalseBB)))
+      if (!match(TI, m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Ptr),
+                                         m_Zero()),
+                          TrueBB, FalseBB)))
         return false;
-      if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+      if (MemsetBB != FalseBB)
         return false;
       return true;
     };
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 91ef2b4b7c183..329b3ef0c8e4b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
 STATISTIC(NumIntAssociationsHoisted,
           "Number of invariant int expressions "
           "reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+                                    "reassociated and hoisted out of the loop");
 
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
@@ -2779,6 +2781,68 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   return true;
 }
 
+/// Reassociate associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants that we want to hoist.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+                               ICFLoopSafetyInfo &SafetyInfo,
+                               MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+                               DominatorTree *DT) {
+  auto *BO = dyn_cast<BinaryOperator>(&I);
+  if (!BO || !BO->isAssociative())
+    return false;
+
+  // Only fold ADDs for now.
+  Instruction::BinaryOps Opcode = BO->getOpcode();
+  if (Opcode != Instruction::Add)
+    return false;
+
+  auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(0));
+  if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative())
+    return false;
+
+  // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+  Value *LV = BO0->getOperand(0);
+  Value *C1 = BO0->getOperand(1);
+  Value *C2 = BO->getOperand(1);
+
+  if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2))
+    return false;
+
+  auto *Preheader = L.getLoopPreheader();
+  assert(Preheader && "Loop is not in simplify form?");
+
+  auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op",
+                                     Preheader->getTerminator());
+  auto *NewBO =
+      BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);
+
+  // Copy NUW for ADDs if both instructions have it.
+  if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() &&
+      BO0->hasNoUnsignedWrap()) {
+    Inv->setHasNoUnsignedWrap(true);
+    NewBO->setHasNoUnsignedWrap(true);
+  }
+
+  BO->replaceAllUsesWith(NewBO);
+  eraseInstruction(*BO, SafetyInfo, MSSAU);
+
+  // (LV op C1) might not be erased if it has more uses than the one we just
+  // replaced.
+  if (BO0->use_empty())
+    eraseInstruction(*BO0, SafetyInfo, MSSAU);
+
+  return true;
+}
+
 static bool hoistArithmetics(Instruction &I, Loop &L,
                              ICFLoopSafetyInfo &SafetyInfo,
                              MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2816,6 +2880,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
     return true;
   }
 
+  if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+    ++NumHoisted;
+    ++NumBOAssociationsHoisted;
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 6092cd1bc08be..ff077624802be 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -160,9 +160,8 @@ static bool isProcessableCondBI(const ScalarEvolution &SE,
                                 const BranchInst *BI) {
   BasicBlock *TrueSucc = nullptr;
   BasicBlock *FalseSucc = nullptr;
-  ICmpInst::Predicate Pred;
   Value *LHS, *RHS;
-  if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+  if (!match(BI, m_Br(m_ICmp(m_Value(LHS), m_Value(RHS)),
                       m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc))))
     return false;
 
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 489f12e689d31..db82f75bad5f3 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -192,7 +192,7 @@ class LoadEliminationForLoop {
     // forward and backward dependences qualify.  Disqualify loads that have
     // other unknown dependences.
 
-    SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
+    SmallPtrSet<Instruction *, 4> LoadsWithUnknownDependence;
 
     for (const auto &Dep : *Deps) {
       Instruction *Source = Dep.getSource(DepChecker);
@@ -201,9 +201,9 @@ class LoadEliminationForLoop {
       if (Dep.Type == MemoryDepChecker::Dependence::Unknown ||
           Dep.Type == MemoryDepChecker::Dependence::IndirectUnsafe) {
         if (isa<LoadInst>(Source))
-          LoadsWithUnknownDepedence.insert(Source);
+          LoadsWithUnknownDependence.insert(Source);
         if (isa<LoadInst>(Destination))
-          LoadsWithUnknownDepedence.insert(Destination);
+          LoadsWithUnknownDependence.insert(Destination);
         continue;
       }
 
@@ -231,9 +231,9 @@ class LoadEliminationForLoop {
       Candidates.emplace_front(Load, Store);
     }
 
-    if (!LoadsWithUnknownDepedence.empty())
+    if (!LoadsWithUnknownDependence.empty())
       Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
-        return LoadsWithUnknownDepedence.count(C.Load);
+        return LoadsWithUnknownDependence.count(C.Load);
       });
 
     return Candidates;
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 6eedf95e7575e..5c6ed8487bbd1 100644
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -144,7 +144,23 @@ findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
         BBsToSinkInto.erase(DominatedBB);
       }
       BBsToSinkInto.insert(ColdestBB);
+      continue;
     }
+    // Otherwise, see if we can stop the search through the cold BBs early.
+    // Since the ColdLoopBBs list is sorted in increasing magnitude of
+    // frequency the cold BB frequencies can only get larger. The
+    // BBsToSinkInto set can only get smaller and have a smaller
+    // adjustedSumFreq, due to the earlier checking. So once we find a cold BB
+    // with a frequency at least as large as the adjustedSumFreq of the
+    // current BBsToSinkInto set, the earlier frequency check can never be
+    // true for a future iteration. Note we could do check this more
+    // aggressively earlier, but in practice this ended up being more
+    // expensive overall (added checking to the critical path through the loop
+    // that often ended up continuing early due to an empty
+    // BBsDominatedByColdestBB set, and the frequency check there was false
+    // most of the time anyway).
+    if (adjustedSumFreq(BBsToSinkInto, BFI) <= BFI.getBlockFreq(ColdestBB))
+      break;
   }
 
   // Can't sink into blocks that have no valid insertion point.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 11f9f7822a15c..91461d1ed2759 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -946,13 +946,15 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
-  } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+  } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+    if (EnableVScaleImmediates && M->getNumOperands() == 2) {
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);
           return Immediate::getScalable(C->getValue()->getSExtValue());
         }
+    }
+  }
   return Immediate::getZero();
 }
 
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index bd7895feb64a7..8dd72b8f1414e 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -99,8 +99,8 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
   return HasDeadBlocks;
 }
 
-static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI,
-                                    DominatorTree *DT) {
+bool llvm::lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI,
+                                   DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
   if (DT)
     DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
@@ -168,45 +168,3 @@ LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
 
   return PreservedAnalyses::all();
 }
-
-namespace {
-/// Legacy pass for lowering is.constant intrinsics out of the IR.
-///
-/// When this pass is run over a function it converts is.constant intrinsics
-/// into 'true' or 'false'. This complements the normal constant folding
-/// to 'true' as part of Instruction Simplify passes.
-class LowerConstantIntrinsics : public FunctionPass {
-public:
-  static char ID;
-  LowerConstantIntrinsics() : FunctionPass(ID) {
-    initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    const TargetLibraryInfo &TLI =
-        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-    DominatorTree *DT = nullptr;
-    if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-      DT = &DTWP->getDomTree();
-    return lowerConstantIntrinsics(F, TLI, DT);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
-  }
-};
-} // namespace
-
-char LowerConstantIntrinsics::ID = 0;
-INITIALIZE_PASS_BEGIN(LowerConstantIntrinsics, "lower-constant-intrinsics",
-                      "Lower constant intrinsics", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(LowerConstantIntrinsics, "lower-constant-intrinsics",
-                    "Lower constant intrinsics", false, false)
-
-FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
-  return new LowerConstantIntrinsics();
-}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 4d8fd5d3b9f5c..c738a2a6f39a4 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -4967,32 +4967,218 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   return NewAI;
 }
 
-static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig,
-                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
-                             Instruction *BeforeInst) {
-  DIB.insertDeclare(NewAddr, Orig->getVariable(), NewFragmentExpr,
+// There isn't a shared interface to get the "address" parts out of a
+// dbg.declare and dbg.assign, so provide some wrappers now for
+// both debug intrinsics and records.
+const Value *getAddress(const DbgVariableIntrinsic *DVI) {
+  if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
+    return DAI->getAddress();
+  return cast<DbgDeclareInst>(DVI)->getAddress();
+}
+
+const Value *getAddress(const DbgVariableRecord *DVR) {
+  assert(DVR->getType() == DbgVariableRecord::LocationType::Declare ||
+         DVR->getType() == DbgVariableRecord::LocationType::Assign);
+  return DVR->getAddress();
+}
+
+bool isKillAddress(const DbgVariableIntrinsic *DVI) {
+  if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
+    return DAI->isKillAddress();
+  return cast<DbgDeclareInst>(DVI)->isKillLocation();
+}
+
+bool isKillAddress(const DbgVariableRecord *DVR) {
+  assert(DVR->getType() == DbgVariableRecord::LocationType::Declare ||
+         DVR->getType() == DbgVariableRecord::LocationType::Assign);
+  if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
+    return DVR->isKillAddress();
+  return DVR->isKillLocation();
+}
+
+const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) {
+  if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
+    return DAI->getAddressExpression();
+  return cast<DbgDeclareInst>(DVI)->getExpression();
+}
+
+const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
+  assert(DVR->getType() == DbgVariableRecord::LocationType::Declare ||
+         DVR->getType() == DbgVariableRecord::LocationType::Assign);
+  if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
+    return DVR->getAddressExpression();
+  return DVR->getExpression();
+}
+
+/// Create or replace an existing fragment in a DIExpression with \p Frag.
+/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
+/// operation, add \p BitExtractOffset to the offset part.
+///
+/// Returns the new expression, or nullptr if this fails (see details below).
+///
+/// This function is similar to DIExpression::createFragmentExpression except
+/// for 3 important distinctions:
+///   1. The new fragment isn't relative to an existing fragment.
+///   2. It assumes the computed location is a memory location. This means we
+///      don't need to perform checks that creating the fragment preserves the
+///      expression semantics.
+///   3. Existing extract_bits are modified independently of fragment changes
+///      using \p BitExtractOffset. A change to the fragment offset or size
+///      may affect a bit extract. But a bit extract offset can change
+///      independently of the fragment dimensions.
+///
+/// Returns the new expression, or nullptr if one couldn't be created.
+/// Ideally this is only used to signal that a bit-extract has become
+/// zero-sized (and thus the new debug record has no size and can be
+/// dropped), however, it fails for other reasons too - see the FIXME below.
+///
+/// FIXME: To keep the change that introduces this function NFC it bails
+/// in some situations unecessarily, e.g. when fragment and bit extract
+/// sizes differ.
+static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
+                                             DIExpression::FragmentInfo Frag,
+                                             int64_t BitExtractOffset) {
+  SmallVector<uint64_t, 8> Ops;
+  bool HasFragment = false;
+  bool HasBitExtract = false;
+
+  for (auto &Op : Expr->expr_ops()) {
+    if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+      HasFragment = true;
+      continue;
+    }
+    if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
+        Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
+      HasBitExtract = true;
+      int64_t ExtractOffsetInBits = Op.getArg(0);
+      int64_t ExtractSizeInBits = Op.getArg(1);
+
+      // DIExpression::createFragmentExpression doesn't know how to handle
+      // a fragment that is smaller than the extract. Copy the behaviour
+      // (bail) to avoid non-NFC changes.
+      // FIXME: Don't do this.
+      if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
+        return nullptr;
+
+      assert(BitExtractOffset <= 0);
+      int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
+
+      // DIExpression::createFragmentExpression doesn't know what to do
+      // if the new extract starts "outside" the existing one. Copy the
+      // behaviour (bail) to avoid non-NFC changes.
+      // FIXME: Don't do this.
+      if (AdjustedOffset < 0)
+        return nullptr;
+
+      Ops.push_back(Op.getOp());
+      Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
+      Ops.push_back(ExtractSizeInBits);
+      continue;
+    }
+    Op.appendToVector(Ops);
+  }
+
+  // Unsupported by createFragmentExpression, so don't support it here yet to
+  // preserve NFC-ness.
+  if (HasFragment && HasBitExtract)
+    return nullptr;
+
+  if (!HasBitExtract) {
+    Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+    Ops.push_back(Frag.OffsetInBits);
+    Ops.push_back(Frag.SizeInBits);
+  }
+  return DIExpression::get(Expr->getContext(), Ops);
+}
+
+/// Insert a new dbg.declare.
+/// \p Orig Original to copy debug loc and variable from.
+/// \p NewAddr Location's new base address.
+/// \p NewAddrExpr New expression to apply to address.
+/// \p BeforeInst Insert position.
+/// \p NewFragment New fragment (absolute, non-relative).
+/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
+static void
+insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
+                 DIExpression *NewAddrExpr, Instruction *BeforeInst,
+                 std::optional<DIExpression::FragmentInfo> NewFragment,
+                 int64_t BitExtractAdjustment) {
+  if (NewFragment)
+    NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment,
+                                          BitExtractAdjustment);
+  if (!NewAddrExpr)
+    return;
+
+  DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr,
                     Orig->getDebugLoc(), BeforeInst);
 }
-static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
-                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
-                             Instruction *BeforeInst) {
+
+/// Insert a new dbg.assign.
+/// \p Orig Original to copy debug loc, variable, value and value expression
+///    from.
+/// \p NewAddr Location's new base address.
+/// \p NewAddrExpr New expression to apply to address.
+/// \p BeforeInst Insert position.
+/// \p NewFragment New fragment (absolute, non-relative).
+/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
+static void
+insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr,
+                 DIExpression *NewAddrExpr, Instruction *BeforeInst,
+                 std::optional<DIExpression::FragmentInfo> NewFragment,
+                 int64_t BitExtractAdjustment) {
+  // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr.
   (void)BeforeInst;
+
+  // A dbg.assign puts fragment info in the value expression only. The address
+  // expression has already been built: NewAddrExpr.
+  DIExpression *NewFragmentExpr = Orig->getExpression();
+  if (NewFragment)
+    NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
+                                              BitExtractAdjustment);
+  if (!NewFragmentExpr)
+    return;
+
+  // Apply a DIAssignID to the store if it doesn't already have it.
   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
+
   Instruction *NewAssign =
       DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(),
-                          NewFragmentExpr, NewAddr,
-                          Orig->getAddressExpression(), Orig->getDebugLoc())
+                          NewFragmentExpr, NewAddr, NewAddrExpr,
+                          Orig->getDebugLoc())
           .get<Instruction *>();
   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
   (void)NewAssign;
 }
-static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig,
-                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
-                             Instruction *BeforeInst) {
+
+/// Insert a new DbgRecord.
+/// \p Orig Original to copy record type, debug loc and variable from, and
+///    additionally value and value expression for dbg_assign records.
+/// \p NewAddr Location's new base address.
+/// \p NewAddrExpr New expression to apply to address.
+/// \p BeforeInst Insert position.
+/// \p NewFragment New fragment (absolute, non-relative).
+/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
+static void
+insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
+                 DIExpression *NewAddrExpr, Instruction *BeforeInst,
+                 std::optional<DIExpression::FragmentInfo> NewFragment,
+                 int64_t BitExtractAdjustment) {
   (void)DIB;
+
+  // A dbg_assign puts fragment info in the value expression only. The address
+  // expression has already been built: NewAddrExpr. A dbg_declare puts the
+  // new fragment info into NewAddrExpr (as it only has one expression).
+  DIExpression *NewFragmentExpr =
+      Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
+  if (NewFragment)
+    NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
+                                              BitExtractAdjustment);
+  if (!NewFragmentExpr)
+    return;
+
   if (Orig->isDbgDeclare()) {
     DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
@@ -5000,13 +5186,16 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig,
                                                    BeforeInst->getIterator());
     return;
   }
+
+  // Apply a DIAssignID to the store if it doesn't already have it.
   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
+
   DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
-      Orig->getAddressExpression(), Orig->getDebugLoc());
+      NewAddrExpr, Orig->getDebugLoc());
   LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
   (void)NewAssign;
 }
@@ -5019,7 +5208,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
 
   unsigned NumPartitions = 0;
   bool Changed = false;
-  const DataLayout &DL = AI.getDataLayout();
+  const DataLayout &DL = AI.getModule()->getDataLayout();
 
   // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
@@ -5113,54 +5302,78 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   // Migrate debug information from the old alloca to the new alloca(s)
   // and the individual partitions.
   auto MigrateOne = [&](auto *DbgVariable) {
-    auto *Expr = DbgVariable->getExpression();
-    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
-    uint64_t AllocaSize =
-        DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue();
-    for (auto Fragment : Fragments) {
-      // Create a fragment expression describing the new partition or reuse AI's
-      // expression if there is only one partition.
-      auto *FragmentExpr = Expr;
-      if (Fragment.Size < AllocaSize || Expr->isFragment()) {
-        // If this alloca is already a scalar replacement of a larger aggregate,
-        // Fragment.Offset describes the offset inside the scalar.
-        auto ExprFragment = Expr->getFragmentInfo();
-        uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
-        uint64_t Start = Offset + Fragment.Offset;
-        uint64_t Size = Fragment.Size;
-        if (ExprFragment) {
-          uint64_t AbsEnd =
-              ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
-          if (Start >= AbsEnd) {
-            // No need to describe a SROAed padding.
-            continue;
-          }
-          Size = std::min(Size, AbsEnd - Start);
-        }
-        // The new, smaller fragment is stenciled out from the old fragment.
-        if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
-          assert(Start >= OrigFragment->OffsetInBits &&
-                 "new fragment is outside of original fragment");
-          Start -= OrigFragment->OffsetInBits;
-        }
+    // Can't overlap with undef memory.
+    if (isKillAddress(DbgVariable))
+      return;
 
-        // The alloca may be larger than the variable.
-        auto VarSize = DbgVariable->getVariable()->getSizeInBits();
-        if (VarSize) {
-          if (Size > *VarSize)
-            Size = *VarSize;
-          if (Size == 0 || Start + Size > *VarSize)
-            continue;
-        }
+    const Value *DbgPtr = getAddress(DbgVariable);
+    DIExpression::FragmentInfo VarFrag =
+        DbgVariable->getFragmentOrEntireVariable();
+    // Get the address expression constant offset if one exists and the ops
+    // that come after it.
+    int64_t CurrentExprOffsetInBytes = 0;
+    SmallVector<uint64_t> PostOffsetOps;
+    if (!getAddressExpression(DbgVariable)
+             ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
+      return; // Couldn't interpret this DIExpression - drop the var.
+
+    // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
+    int64_t ExtractOffsetInBits = 0;
+    for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
+      if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
+          Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
+        ExtractOffsetInBits = Op.getArg(0);
+        break;
+      }
+    }
 
-        // Avoid creating a fragment expression that covers the entire variable.
-        if (!VarSize || *VarSize != Size) {
-          if (auto E =
-                  DIExpression::createFragmentExpression(Expr, Start, Size))
-            FragmentExpr = *E;
-          else
-            continue;
-        }
+    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+    for (auto Fragment : Fragments) {
+      int64_t OffsetFromLocationInBits;
+      std::optional<DIExpression::FragmentInfo> NewDbgFragment;
+      // Find the variable fragment that the new alloca slice covers.
+      // Drop debug info for this variable fragment if we can't compute an
+      // intersect between it and the alloca slice.
+      if (!DIExpression::calculateFragmentIntersect(
+              DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
+              CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
+              NewDbgFragment, OffsetFromLocationInBits))
+        continue; // Do not migrate this fragment to this slice.
+
+      // Zero sized fragment indicates there's no intersect between the variable
+      // fragment and the alloca slice. Skip this slice for this variable
+      // fragment.
+      if (NewDbgFragment && !NewDbgFragment->SizeInBits)
+        continue; // Do not migrate this fragment to this slice.
+
+      // No fragment indicates DbgVariable's variable or fragment exactly
+      // overlaps the slice; copy its fragment (or nullopt if there isn't one).
+      if (!NewDbgFragment)
+        NewDbgFragment = DbgVariable->getFragment();
+
+      // Reduce the new expression offset by the bit-extract offset since
+      // we'll be keeping that.
+      int64_t OffestFromNewAllocaInBits =
+          OffsetFromLocationInBits - ExtractOffsetInBits;
+      // We need to adjust an existing bit extract if the offset expression
+      // can't eat the slack (i.e., if the new offset would be negative).
+      int64_t BitExtractOffset =
+          std::min<int64_t>(0, OffestFromNewAllocaInBits);
+      // The magnitude of a negative value indicates the number of bits into
+      // the existing variable fragment that the memory region begins. The new
+      // variable fragment already excludes those bits - the new DbgPtr offset
+      // only needs to be applied if it's positive.
+      OffestFromNewAllocaInBits =
+          std::max(int64_t(0), OffestFromNewAllocaInBits);
+
+      // Rebuild the expression:
+      //    {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
+      // Add NewDbgFragment later, because dbg.assigns don't want it in the
+      // address expression but the value expression instead.
+      DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
+      if (OffestFromNewAllocaInBits > 0) {
+        int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
+        NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
       }
 
       // Remove any existing intrinsics on the new alloca describing
@@ -5177,7 +5390,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
       for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
 
-      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
+      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
+                       NewDbgFragment, BitExtractOffset);
     }
   };
 
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index cb1456b146325..86669e8c5aa49 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -32,7 +32,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopStrengthReducePass(Registry);
   initializeLoopUnrollPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
-  initializeLowerConstantIntrinsicsPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 6a3b0e1b43e21..c235d2fb2a5bd 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -632,7 +632,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
     } else {
       // Create a new unconditional branch that will continue the loop as a new
       // terminator.
-      BranchInst::Create(ContinueBB, ParentBB);
+      Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB);
+      NewBI->setDebugLoc(BI.getDebugLoc());
     }
     BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
     BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
@@ -666,10 +667,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
   // Finish updating dominator tree and memory ssa for full unswitch.
   if (FullUnswitch) {
     if (MSSAU) {
-      // Remove the cloned branch instruction.
-      ParentBB->getTerminator()->eraseFromParent();
-      // Create unconditional branch now.
-      BranchInst::Create(ContinueBB, ParentBB);
+      Instruction *Term = ParentBB->getTerminator();
+      // Remove the cloned branch instruction and create unconditional branch
+      // now.
+      Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB);
+      NewBI->setDebugLoc(Term->getDebugLoc());
+      Term->eraseFromParent();
       MSSAU->removeEdge(ParentBB, LoopExitBB);
     }
     DT.deleteEdge(ParentBB, LoopExitBB);
@@ -861,8 +864,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
   OldPH->getTerminator()->eraseFromParent();
 
-  // Now add the unswitched switch.
+  // Now add the unswitched switch. This new switch instruction inherits the
+  // debug location of the old switch, because it semantically replace the old
+  // one.
   auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+  NewSI->setDebugLoc(SIW->getDebugLoc());
   SwitchInstProfUpdateWrapper NewSIW(*NewSI);
 
   // Rewrite the IR for the unswitched basic blocks. This requires two steps.
@@ -972,8 +978,9 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
                                       /*KeepOneInputPHIs*/ true);
     }
     // Now nuke the switch and replace it with a direct branch.
+    Instruction *NewBI = BranchInst::Create(CommonSuccBB, BB);
+    NewBI->setDebugLoc(SIW->getDebugLoc());
     SIW.eraseFromParent();
-    BranchInst::Create(CommonSuccBB, BB);
   } else if (DefaultExitBB) {
     assert(SI.getNumCases() > 0 &&
            "If we had no cases we'd have a common successor!");
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index b7baf34f27c21..11de37f7a7c10 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -77,6 +77,9 @@ static cl::opt<bool> UserSinkCommonInsts(
     "sink-common-insts", cl::Hidden, cl::init(false),
     cl::desc("Sink common instructions (default = false)"));
 
+static cl::opt<bool> UserSpeculateUnpredictables(
+    "speculate-unpredictables", cl::Hidden, cl::init(false),
+    cl::desc("Speculate unpredictable branches (default = false)"));
 
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
@@ -325,6 +328,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
     Options.HoistCommonInsts = UserHoistCommonInsts;
   if (UserSinkCommonInsts.getNumOccurrences())
     Options.SinkCommonInsts = UserSinkCommonInsts;
+  if (UserSpeculateUnpredictables.getNumOccurrences())
+    Options.SpeculateUnpredictables = UserSpeculateUnpredictables;
 }
 
 SimplifyCFGPass::SimplifyCFGPass() {
@@ -351,7 +356,9 @@ void SimplifyCFGPass::printPipeline(
   OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
   OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;";
   OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;";
-  OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch";
+  OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;";
+  OS << (Options.SpeculateUnpredictables ? "" : "no-")
+     << "speculate-unpredictables";
   OS << '>';
 }
 
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 9be60721bebb5..75585fcc80266 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -425,14 +425,12 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
 
 // Returns true if A matches B + C where C is constant.
 static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
-  return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
-          match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+  return match(A, m_c_Add(m_Value(B), m_ConstantInt(C)));
 }
 
 // Returns true if A matches B | C where C is constant.
 static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
-  return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
-          match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+  return match(A, m_c_Or(m_Value(B), m_ConstantInt(C)));
 }
 
 void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 79911bf563ea4..c78965aaeba3c 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1909,22 +1909,29 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
     auto NewPhi =
         PHINode::Create(Phi->getType(), Incoming.size(),
                         Phi->getName() + ".moved", FirstGuardBlock->begin());
+    bool AllUndef = true;
     for (auto *In : Incoming) {
       Value *V = UndefValue::get(Phi->getType());
       if (In == Out) {
         V = NewPhi;
       } else if (Phi->getBasicBlockIndex(In) != -1) {
         V = Phi->removeIncomingValue(In, false);
+        AllUndef &= isa<UndefValue>(V);
       }
       NewPhi->addIncoming(V, In);
     }
     assert(NewPhi->getNumIncomingValues() == Incoming.size());
+    Value *NewV = NewPhi;
+    if (AllUndef) {
+      NewPhi->eraseFromParent();
+      NewV = UndefValue::get(Phi->getType());
+    }
     if (Phi->getNumOperands() == 0) {
-      Phi->replaceAllUsesWith(NewPhi);
+      Phi->replaceAllUsesWith(NewV);
       I = Phi->eraseFromParent();
       continue;
     }
-    Phi->addIncoming(NewPhi, GuardBlock);
+    Phi->addIncoming(NewV, GuardBlock);
     ++I;
   }
 }
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index e97506b4bbd95..30a343b2c564e 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
 STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns");
 STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
 STATISTIC(NumWillReturn, "Number of functions inferred as willreturn");
+STATISTIC(NumCold, "Number of functions inferred as cold");
 
 static bool setDoesNotAccessMemory(Function &F) {
   if (F.doesNotAccessMemory())
@@ -57,6 +58,14 @@ static bool setDoesNotAccessMemory(Function &F) {
   return true;
 }
 
+static bool setIsCold(Function &F) {
+  if (F.hasFnAttribute(Attribute::Cold))
+    return false;
+  F.addFnAttr(Attribute::Cold);
+  ++NumCold;
+  return true;
+}
+
 static bool setOnlyAccessesInaccessibleMemory(Function &F) {
   if (F.onlyAccessesInaccessibleMemory())
     return false;
@@ -270,6 +279,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
     Changed |= setNonLazyBind(F);
 
   switch (TheLibFunc) {
+  case LibFunc_nan:
+  case LibFunc_nanf:
+  case LibFunc_nanl:
   case LibFunc_strlen:
   case LibFunc_strnlen:
   case LibFunc_wcslen:
@@ -1087,6 +1099,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
     Changed |= setOnlyWritesMemory(F, 0);
     Changed |= setDoesNotThrow(F);
     break;
+  case LibFunc_abort:
+    Changed |= setIsCold(F);
+    break;
   // int __nvvm_reflect(const char *)
   case LibFunc_nvvm_reflect:
     Changed |= setRetAndArgsNoUndef(F);
@@ -1098,6 +1113,11 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
   case LibFunc_ldexpl:
     Changed |= setWillReturn(F);
     break;
+  case LibFunc_remquo:
+  case LibFunc_remquof:
+  case LibFunc_remquol:
+    Changed |= setDoesNotCapture(F, 2);
+    [[fallthrough]];
   case LibFunc_abs:
   case LibFunc_acos:
   case LibFunc_acosf:
@@ -1195,6 +1215,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
   case LibFunc_pow:
   case LibFunc_powf:
   case LibFunc_powl:
+  case LibFunc_remainder:
+  case LibFunc_remainderf:
+  case LibFunc_remainderl:
   case LibFunc_rint:
   case LibFunc_rintf:
   case LibFunc_rintl:
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 1b811c7cebef9..51e8821773c3a 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_component_library(LLVMTransformUtils
   CountVisits.cpp
   Debugify.cpp
   DemoteRegToStack.cpp
-  DXILResource.cpp
   DXILUpgrade.cpp
   EntryExitInstrumenter.cpp
   EscapeEnumerator.cpp
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index a05e955943f7c..5bca5cf8ff91f 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -932,6 +932,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       case Attribute::DisableSanitizerInstrumentation:
       case Attribute::FnRetThunkExtern:
       case Attribute::Hot:
+      case Attribute::HybridPatchable:
       case Attribute::NoRecurse:
       case Attribute::InlineHint:
       case Attribute::MinSize:
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index ab1edf47d8db0..f92e921949a08 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -71,24 +71,26 @@ static bool isExitBlock(BasicBlock *BB,
   return is_contained(ExitBlocks, BB);
 }
 
+// Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
+// lot of instructions within the same loops, computing the exit blocks is
+// expensive, and we're not mutating the loop structure.
+using LoopExitBlocksTy = SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>>;
+
 /// For every instruction from the worklist, check to see if it has any uses
 /// that are outside the current loop.  If so, insert LCSSA PHI nodes and
 /// rewrite the uses.
-bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
-                                    const DominatorTree &DT, const LoopInfo &LI,
-                                    ScalarEvolution *SE,
-                                    SmallVectorImpl<PHINode *> *PHIsToRemove,
-                                    SmallVectorImpl<PHINode *> *InsertedPHIs) {
+static bool
+formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist,
+                             const DominatorTree &DT, const LoopInfo &LI,
+                             ScalarEvolution *SE,
+                             SmallVectorImpl<PHINode *> *PHIsToRemove,
+                             SmallVectorImpl<PHINode *> *InsertedPHIs,
+                             LoopExitBlocksTy &LoopExitBlocks) {
   SmallVector<Use *, 16> UsesToRewrite;
   SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
   PredIteratorCache PredCache;
   bool Changed = false;
 
-  // Cache the Loop ExitBlocks across this loop.  We expect to get a lot of
-  // instructions within the same loops, computing the exit blocks is
-  // expensive, and we're not mutating the loop structure.
-  SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
-
   while (!Worklist.empty()) {
     UsesToRewrite.clear();
 
@@ -317,13 +319,28 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
   return Changed;
 }
 
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop.  If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+                                    const DominatorTree &DT, const LoopInfo &LI,
+                                    ScalarEvolution *SE,
+                                    SmallVectorImpl<PHINode *> *PHIsToRemove,
+                                    SmallVectorImpl<PHINode *> *InsertedPHIs) {
+  LoopExitBlocksTy LoopExitBlocks;
+
+  return formLCSSAForInstructionsImpl(Worklist, DT, LI, SE, PHIsToRemove,
+                                      InsertedPHIs, LoopExitBlocks);
+}
+
 // Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
 static void computeBlocksDominatingExits(
-    Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+    Loop &L, const DominatorTree &DT,
+    const SmallVectorImpl<BasicBlock *> &ExitBlocks,
     SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
   // We start from the exit blocks, as every block trivially dominates itself
   // (not strictly).
-  SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks);
+  SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks.begin(), ExitBlocks.end());
 
   while (!BBWorklist.empty()) {
     BasicBlock *BB = BBWorklist.pop_back_val();
@@ -360,8 +377,9 @@ static void computeBlocksDominatingExits(
   }
 }
 
-bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
-                     ScalarEvolution *SE) {
+static bool formLCSSAImpl(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+                          ScalarEvolution *SE,
+                          LoopExitBlocksTy &LoopExitBlocks) {
   bool Changed = false;
 
 #ifdef EXPENSIVE_CHECKS
@@ -372,8 +390,9 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
   }
 #endif
 
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L.getExitBlocks(ExitBlocks);
+  if (!LoopExitBlocks.count(&L))
+    L.getExitBlocks(LoopExitBlocks[&L]);
+  const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[&L];
   if (ExitBlocks.empty())
     return false;
 
@@ -414,26 +433,43 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
     }
   }
 
-  Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE);
+  Changed = formLCSSAForInstructionsImpl(Worklist, DT, *LI, SE, nullptr,
+                                         nullptr, LoopExitBlocks);
 
   assert(L.isLCSSAForm(DT));
 
   return Changed;
 }
 
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+                     ScalarEvolution *SE) {
+  LoopExitBlocksTy LoopExitBlocks;
+
+  return formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
+}
+
 /// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
-                                const LoopInfo *LI, ScalarEvolution *SE) {
+static bool formLCSSARecursivelyImpl(Loop &L, const DominatorTree &DT,
+                                     const LoopInfo *LI, ScalarEvolution *SE,
+                                     LoopExitBlocksTy &LoopExitBlocks) {
   bool Changed = false;
 
   // Recurse depth-first through inner loops.
   for (Loop *SubLoop : L.getSubLoops())
-    Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+    Changed |= formLCSSARecursivelyImpl(*SubLoop, DT, LI, SE, LoopExitBlocks);
 
-  Changed |= formLCSSA(L, DT, LI, SE);
+  Changed |= formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
   return Changed;
 }
 
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+                                const LoopInfo *LI, ScalarEvolution *SE) {
+  LoopExitBlocksTy LoopExitBlocks;
+
+  return formLCSSARecursivelyImpl(L, DT, LI, SE, LoopExitBlocks);
+}
+
 /// Process all loops in the function, inner-most out.
 static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
                                 ScalarEvolution *SE) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 0afa1ef780504..a0406111ecbf3 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1091,8 +1091,8 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
   assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 
-  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
-    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
+    MDNode *MD = dyn_cast<MDNode>(MDO);
     if (!MD)
       continue;
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 4609376a748f9..0abf6d77496dc 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -918,6 +918,44 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
   return true;
 }
 
+constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
+  switch (RK) {
+  default:
+    llvm_unreachable("Unexpected recurrence kind");
+  case RecurKind::Add:
+    return Intrinsic::vector_reduce_add;
+  case RecurKind::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case RecurKind::And:
+    return Intrinsic::vector_reduce_and;
+  case RecurKind::Or:
+    return Intrinsic::vector_reduce_or;
+  case RecurKind::Xor:
+    return Intrinsic::vector_reduce_xor;
+  case RecurKind::FMulAdd:
+  case RecurKind::FAdd:
+    return Intrinsic::vector_reduce_fadd;
+  case RecurKind::FMul:
+    return Intrinsic::vector_reduce_fmul;
+  case RecurKind::SMax:
+    return Intrinsic::vector_reduce_smax;
+  case RecurKind::SMin:
+    return Intrinsic::vector_reduce_smin;
+  case RecurKind::UMax:
+    return Intrinsic::vector_reduce_umax;
+  case RecurKind::UMin:
+    return Intrinsic::vector_reduce_umin;
+  case RecurKind::FMax:
+    return Intrinsic::vector_reduce_fmax;
+  case RecurKind::FMin:
+    return Intrinsic::vector_reduce_fmin;
+  case RecurKind::FMaximum:
+    return Intrinsic::vector_reduce_fmaximum;
+  case RecurKind::FMinimum:
+    return Intrinsic::vector_reduce_fminimum;
+  }
+}
+
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {
   case Intrinsic::vector_reduce_fadd:
@@ -1215,12 +1253,13 @@ Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
   RecurKind Kind = Desc.getRecurrenceKind();
   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
          "AnyOf reduction is not supported.");
+  Intrinsic::ID Id = getReductionIntrinsicID(Kind);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Type *SrcEltTy = SrcTy->getElementType();
   Value *Iden =
       Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
   Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleTargetReduction(Kind, SrcTy, Ops);
+  return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops);
 }
 
 Value *llvm::createTargetReduction(IRBuilderBase &B,
@@ -1260,9 +1299,10 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
 
+  Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleTargetReduction(RecurKind::FAdd, SrcTy, Ops);
+  return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops);
 }
 
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index b38db412f786a..f0c7e31b9c223 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -45,8 +45,7 @@ void llvm::createMemCpyLoopKnownSize(
 
   Type *TypeOfCopyLen = CopyLen->getType();
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
-      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
-      AtomicElementSize);
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
 
@@ -111,8 +110,8 @@ void llvm::createMemCpyLoopKnownSize(
 
     SmallVector<Type *, 5> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
-                                          SrcAS, DstAS, SrcAlign.value(),
-                                          DstAlign.value(), AtomicElementSize);
+                                          SrcAS, DstAS, SrcAlign, DstAlign,
+                                          AtomicElementSize);
 
     for (auto *OpTy : RemainingOps) {
       Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
@@ -197,8 +196,7 @@ void llvm::createMemCpyLoopUnknownSize(
   unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
 
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
-      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
-      AtomicElementSize);
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
@@ -347,6 +345,30 @@ void llvm::createMemCpyLoopUnknownSize(
   }
 }
 
+// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
+// addresspacecast to obtain a pair of pointers in the same addressspace. The
+// caller needs to ensure that addrspacecasting is possible.
+// No-op if the pointers are in the same address space.
+static std::pair<Value *, Value *>
+tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
+                               const TargetTransformInfo &TTI) {
+  Value *ResAddr1 = Addr1;
+  Value *ResAddr2 = Addr2;
+
+  unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
+  unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
+  if (AS1 != AS2) {
+    if (TTI.isValidAddrSpaceCast(AS2, AS1))
+      ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
+    else if (TTI.isValidAddrSpaceCast(AS1, AS2))
+      ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
+    else
+      llvm_unreachable("Can only lower memmove between address spaces if they "
+                       "support addrspacecast");
+  }
+  return {ResAddr1, ResAddr2};
+}
+
 // Lower memmove to IR. memmove is required to correctly copy overlapping memory
 // regions; therefore, it has to check the relative positions of the source and
 // destination pointers and choose the copy direction accordingly.
@@ -369,17 +391,61 @@ void llvm::createMemCpyLoopUnknownSize(
 //   }
 //   return dst;
 // }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
-                              Value *DstAddr, Value *CopyLen, Align SrcAlign,
-                              Align DstAlign, bool SrcIsVolatile,
-                              bool DstIsVolatile,
-                              const TargetTransformInfo &TTI) {
+//
+// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
+// used for the memory accesses in the loops. Then, additional loops with
+// byte-wise accesses are added for the remaining bytes.
+static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
+                                         Value *SrcAddr, Value *DstAddr,
+                                         Value *CopyLen, Align SrcAlign,
+                                         Align DstAlign, bool SrcIsVolatile,
+                                         bool DstIsVolatile,
+                                         const TargetTransformInfo &TTI) {
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
   const DataLayout &DL = F->getDataLayout();
-  // TODO: Use different element type if possible?
-  Type *EltTy = Type::getInt8Ty(F->getContext());
+  LLVMContext &Ctx = OrigBB->getContext();
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+                                                   SrcAlign, DstAlign);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  bool LoopOpIsInt8 = LoopOpType == Int8Type;
+
+  // If the memory accesses are wider than one byte, residual loops with
+  // i8-accesses are required to move remaining bytes.
+  bool RequiresResidual = !LoopOpIsInt8;
+
+  Type *ResidualLoopOpType = Int8Type;
+  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+  // Calculate the loop trip count and remaining bytes to copy after the loop.
+  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+  ConstantInt *One = ConstantInt::get(ILengthType, 1);
+
+  IRBuilder<> PLBuilder(InsertBefore);
+
+  Value *RuntimeLoopCount = CopyLen;
+  Value *RuntimeLoopRemainder = nullptr;
+  Value *RuntimeBytesCopiedMainLoop = CopyLen;
+  Value *SkipResidualCondition = nullptr;
+  if (RequiresResidual) {
+    RuntimeLoopCount =
+        getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+    RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+                                                   CILoopOpSize, LoopOpSize);
+    RuntimeBytesCopiedMainLoop =
+        PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+    SkipResidualCondition =
+        PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
+  }
+  Value *SkipMainCondition =
+      PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
 
   // Create the a comparison of src and dst, based on which we jump to either
   // the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -387,76 +453,374 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
   // structure. Its block terminators (unconditional branches) are replaced by
   // the appropriate conditional branches when the loop is built.
-  ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
-                                      SrcAddr, DstAddr, "compare_src_dst");
+  // If the pointers are in different address spaces, they need to be converted
+  // to a compatible one. Cases where memory ranges in the different address
+  // spaces cannot overlap are lowered as memcpy and not handled here.
+  auto [CmpSrcAddr, CmpDstAddr] =
+      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+  Value *PtrCompare =
+      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
   Instruction *ThenTerm, *ElseTerm;
-  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
-                                &ElseTerm);
-
-  // Each part of the function consists of two blocks:
-  //   copy_backwards:        used to skip the loop when n == 0
-  //   copy_backwards_loop:   the actual backwards loop BB
-  //   copy_forward:          used to skip the loop when n == 0
-  //   copy_forward_loop:     the actual forward loop BB
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+                                &ThenTerm, &ElseTerm);
+
+  // If the LoopOpSize is greater than 1, each part of the function consist of
+  // four blocks:
+  //   memmove_copy_backwards:
+  //       skip the residual loop when 0 iterations are required
+  //   memmove_bwd_residual_loop:
+  //       copy the last few bytes individually so that the remaining length is
+  //       a multiple of the LoopOpSize
+  //   memmove_bwd_middle: skip the main loop when 0 iterations are required
+  //   memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
+  //   memmove_copy_forward: skip the main loop when 0 iterations are required
+  //   memmove_fwd_main_loop: the actual forward loop BB with wide accesses
+  //   memmove_fwd_middle: skip the residual loop when 0 iterations are required
+  //   memmove_fwd_residual_loop: copy the last few bytes individually
+  //
+  // The main and residual loop are switched between copying forward and
+  // backward so that the residual loop always operates on the end of the moved
+  // range. This is based on the assumption that buffers whose start is aligned
+  // with the LoopOpSize are more common than buffers whose end is.
+  //
+  // If the LoopOpSize is 1, each part of the function consists of two blocks:
+  //   memmove_copy_backwards: skip the loop when 0 iterations are required
+  //   memmove_bwd_main_loop: the actual backwards loop BB
+  //   memmove_copy_forward: skip the loop when 0 iterations are required
+  //   memmove_fwd_main_loop: the actual forward loop BB
   BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
-  CopyBackwardsBB->setName("copy_backwards");
+  CopyBackwardsBB->setName("memmove_copy_backwards");
   BasicBlock *CopyForwardBB = ElseTerm->getParent();
-  CopyForwardBB->setName("copy_forward");
+  CopyForwardBB->setName("memmove_copy_forward");
   BasicBlock *ExitBB = InsertBefore->getParent();
   ExitBB->setName("memmove_done");
 
-  unsigned PartSize = DL.getTypeStoreSize(EltTy);
-  Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
-  Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
 
-  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
-  // between both backwards and forward copy clauses.
-  ICmpInst *CompareN =
-      new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
-                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+  // Accesses in the residual loops do not share the same alignment as those in
+  // the main loops.
+  Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
+  Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
 
   // Copying backwards.
-  BasicBlock *LoopBB =
-    BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
-  IRBuilder<> LoopBuilder(LoopBB);
+  {
+    BasicBlock *MainLoopBB = BasicBlock::Create(
+        F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
+
+    // The predecessor of the memmove_bwd_main_loop. Updated in the
+    // following if a residual loop is emitted first.
+    BasicBlock *PredBB = CopyBackwardsBB;
+
+    if (RequiresResidual) {
+      // backwards residual loop
+      BasicBlock *ResidualLoopBB = BasicBlock::Create(
+          F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
+      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
+      Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
+          ResidualLoopPhi, One, "bwd_residual_index");
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, SrcAddr, ResidualIndex);
+      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+          "element");
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, DstAddr, ResidualIndex);
+      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+                                             ResidualDstAlign, DstIsVolatile);
+
+      // After the residual loop, go to an intermediate block.
+      BasicBlock *IntermediateBB = BasicBlock::Create(
+          F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
+      // Later code expects a terminator in the PredBB.
+      IRBuilder<> IntermediateBuilder(IntermediateBB);
+      IntermediateBuilder.CreateUnreachable();
+      ResidualLoopBuilder.CreateCondBr(
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
+                                           RuntimeBytesCopiedMainLoop),
+          IntermediateBB, ResidualLoopBB);
+
+      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+
+      // How to get to the residual:
+      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
+                         ThenTerm->getIterator());
+      ThenTerm->eraseFromParent();
+
+      PredBB = IntermediateBB;
+    }
 
-  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  Value *IndexPtr = LoopBuilder.CreateSub(
-      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
-  Value *Element = LoopBuilder.CreateAlignedLoad(
-      EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
-      PartSrcAlign, SrcIsVolatile, "element");
-  LoopBuilder.CreateAlignedStore(
-      Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
-      PartDstAlign, DstIsVolatile);
-  LoopBuilder.CreateCondBr(
-      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
-      ExitBB, LoopBB);
-  LoopPhi->addIncoming(IndexPtr, LoopBB);
-  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
-  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
-  ThenTerm->eraseFromParent();
+    // main loop
+    IRBuilder<> MainLoopBuilder(MainLoopBB);
+    PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
+    Value *MainIndex =
+        MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+    Value *LoadGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+    Value *Element = MainLoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                       DstIsVolatile);
+    MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
+                                 ExitBB, MainLoopBB);
+    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+    MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+
+    // How to get to the main loop:
+    Instruction *PredBBTerm = PredBB->getTerminator();
+    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
+                       PredBBTerm->getIterator());
+    PredBBTerm->eraseFromParent();
+  }
 
   // Copying forward.
-  BasicBlock *FwdLoopBB =
-    BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
-  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
-  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
-  Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
-  Value *FwdElement = FwdLoopBuilder.CreateAlignedLoad(
-      EltTy, SrcGEP, PartSrcAlign, SrcIsVolatile, "element");
-  Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
-  FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign,
-                                    DstIsVolatile);
-  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
-      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
-  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
-                              ExitBB, FwdLoopBB);
-  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
-  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
-  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
-  ElseTerm->eraseFromParent();
+  // main loop
+  {
+    BasicBlock *MainLoopBB =
+        BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
+    IRBuilder<> MainLoopBuilder(MainLoopBB);
+    PHINode *MainLoopPhi =
+        MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
+    Value *LoadGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+    Value *Element = MainLoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                       DstIsVolatile);
+    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+    MainLoopPhi->addIncoming(Zero, CopyForwardBB);
+
+    Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
+    BasicBlock *SuccessorBB = ExitBB;
+    if (RequiresResidual)
+      SuccessorBB =
+          BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
+
+    // leaving or staying in the main loop
+    MainLoopBuilder.CreateCondBr(
+        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopCount), SuccessorBB,
+        MainLoopBB);
+
+    // getting in or skipping the main loop
+    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+                       CopyFwdBBTerm->getIterator());
+    CopyFwdBBTerm->eraseFromParent();
+
+    if (RequiresResidual) {
+      BasicBlock *IntermediateBB = SuccessorBB;
+      IRBuilder<> IntermediateBuilder(IntermediateBB);
+      BasicBlock *ResidualLoopBB = BasicBlock::Create(
+          F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
+      IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
+                                       ResidualLoopBB);
+
+      // Residual loop
+      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      PHINode *ResidualLoopPhi =
+          ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, SrcAddr, ResidualLoopPhi);
+      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+          "element");
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, DstAddr, ResidualLoopPhi);
+      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+                                             ResidualDstAlign, DstIsVolatile);
+      Value *ResidualIndex =
+          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, One);
+      ResidualLoopBuilder.CreateCondBr(
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
+          ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(RuntimeBytesCopiedMainLoop, IntermediateBB);
+    }
+  }
+}
+
+// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
+// compile time, obsolete loops and branches are omitted, and the residual code
+// is straight-line code instead of a loop.
+static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
+                                       Value *SrcAddr, Value *DstAddr,
+                                       ConstantInt *CopyLen, Align SrcAlign,
+                                       Align DstAlign, bool SrcIsVolatile,
+                                       bool DstIsVolatile,
+                                       const TargetTransformInfo &TTI) {
+  // No need to expand zero length moves.
+  if (CopyLen->isZero())
+    return;
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getDataLayout();
+  LLVMContext &Ctx = OrigBB->getContext();
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+                                                   SrcAlign, DstAlign);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+  // Calculate the loop trip count and remaining bytes to copy after the loop.
+  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+  uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize;
+  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
+
+  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+  ConstantInt *One = ConstantInt::get(ILengthType, 1);
+  ConstantInt *TripCount = ConstantInt::get(ILengthType, LoopEndCount);
+
+  IRBuilder<> PLBuilder(InsertBefore);
+
+  auto [CmpSrcAddr, CmpDstAddr] =
+      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+  Value *PtrCompare =
+      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
+  Instruction *ThenTerm, *ElseTerm;
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+                                &ThenTerm, &ElseTerm);
+
+  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+  BasicBlock *CopyForwardBB = ElseTerm->getParent();
+  BasicBlock *ExitBB = InsertBefore->getParent();
+  ExitBB->setName("memmove_done");
+
+  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+  // Helper function to generate a load/store pair of a given type in the
+  // residual. Used in the forward and backward branches.
+  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
+                                      uint64_t &BytesCopied) {
+    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+    // Calculate the new index
+    unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+
+    uint64_t GepIndex = BytesCopied / OperandSize;
+    assert(GepIndex * OperandSize == BytesCopied &&
+           "Division should have no Remainder!");
+
+    Value *SrcGEP = Builder.CreateInBoundsGEP(
+        OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+    LoadInst *Load =
+        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
+    Value *DstGEP = Builder.CreateInBoundsGEP(
+        OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
+    BytesCopied += OperandSize;
+  };
+
+  // Copying backwards.
+  if (RemainingBytes != 0) {
+    CopyBackwardsBB->setName("memmove_bwd_residual");
+    uint64_t BytesCopied = BytesCopiedInLoop;
+
+    // Residual code is required to move the remaining bytes. We need the same
+    // instructions as in the forward case, only in reverse. So we generate code
+    // the same way, except that we change the IRBuilder insert point for each
+    // load/store pair so that each one is inserted before the previous one
+    // instead of after it.
+    IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI());
+    SmallVector<Type *, 5> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAS, DstAS, PartSrcAlign,
+                                          PartDstAlign);
+    for (auto *OpTy : RemainingOps) {
+      // reverse the order of the emitted operations
+      BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI());
+      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
+    }
+  }
+  if (LoopEndCount != 0) {
+    BasicBlock *LoopBB = CopyBackwardsBB;
+    BasicBlock *PredBB = OrigBB;
+    if (RemainingBytes != 0) {
+      // if we introduce residual code, it needs its separate BB
+      LoopBB = CopyBackwardsBB->splitBasicBlock(
+          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
+      PredBB = CopyBackwardsBB;
+    } else {
+      CopyBackwardsBB->setName("memmove_bwd_loop");
+    }
+    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
+    Value *Index = LoopBuilder.CreateSub(LoopPhi, One, "bwd_index");
+    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, Index);
+    Value *Element = LoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, Index);
+    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                   DstIsVolatile);
+
+    // Replace the unconditional branch introduced by
+    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
+    Instruction *UncondTerm = LoopBB->getTerminator();
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
+                             LoopBB);
+    UncondTerm->eraseFromParent();
+
+    LoopPhi->addIncoming(Index, LoopBB);
+    LoopPhi->addIncoming(TripCount, PredBB);
+  }
+
+  // Copying forward.
+  BasicBlock *FwdResidualBB = CopyForwardBB;
+  if (LoopEndCount != 0) {
+    CopyForwardBB->setName("memmove_fwd_loop");
+    BasicBlock *LoopBB = CopyForwardBB;
+    BasicBlock *SuccBB = ExitBB;
+    if (RemainingBytes != 0) {
+      // if we introduce residual code, it needs its separate BB
+      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
+                                              "memmove_fwd_residual");
+      FwdResidualBB = SuccBB;
+    }
+    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
+    Value *LoadGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopPhi);
+    Value *Element = LoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopPhi);
+    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                   DstIsVolatile);
+    Value *Index = LoopBuilder.CreateAdd(LoopPhi, One);
+    LoopPhi->addIncoming(Index, LoopBB);
+    LoopPhi->addIncoming(Zero, OrigBB);
+
+    // Replace the unconditional branch to turn LoopBB into a loop.
+    Instruction *UncondTerm = LoopBB->getTerminator();
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, TripCount), SuccBB,
+                             LoopBB);
+    UncondTerm->eraseFromParent();
+  }
+
+  if (RemainingBytes != 0) {
+    uint64_t BytesCopied = BytesCopiedInLoop;
+
+    // Residual code is required to move the remaining bytes. In the forward
+    // case, we emit it in the normal order.
+    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+    SmallVector<Type *, 5> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAS, DstAS, PartSrcAlign,
+                                          PartDstAlign);
+    for (auto *OpTy : RemainingOps)
+      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
+  }
 }
 
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -572,11 +936,8 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
       return true;
     }
 
-    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
-      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
-    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
-      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
-    else {
+    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
+          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
       // We don't know generically if it's legal to introduce an
       // addrspacecast. We need to know either if it's legal to insert an
       // addrspacecast, or if the address spaces cannot alias.
@@ -587,9 +948,15 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
     }
   }
 
-  createMemMoveLoop(
-      /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
-      SrcIsVolatile, DstIsVolatile, TTI);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+    createMemMoveLoopKnownSize(
+        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
+        SrcIsVolatile, DstIsVolatile, TTI);
+  } else {
+    createMemMoveLoopUnknownSize(
+        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+        SrcIsVolatile, DstIsVolatile, TTI);
+  }
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 546a6cd56b250..61183752ab905 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -361,8 +362,7 @@ struct PromoteMem2Reg {
   ///
   /// That map is used to simplify some Phi nodes as we iterate over it, so
   /// it should have deterministic iterators.  We could use a MapVector, but
-  /// since we already maintain a map from BasicBlock* to a stable numbering
-  /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+  /// since basic blocks have numbers, using these are more efficient.
   DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
 
   /// For each PHI node, keep track of which entry in Allocas it corresponds
@@ -384,14 +384,11 @@ struct PromoteMem2Reg {
   SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete;
 
   /// The set of basic blocks the renamer has already visited.
-  SmallPtrSet<BasicBlock *, 16> Visited;
+  BitVector Visited;
 
-  /// Contains a stable numbering of basic blocks to avoid non-determinstic
-  /// behavior.
-  DenseMap<BasicBlock *, unsigned> BBNumbers;
-
-  /// Lazily compute the number of predecessors a block has.
-  DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+  /// Lazily compute the number of predecessors a block has, indexed by block
+  /// number.
+  SmallVector<unsigned> BBNumPreds;
 
   /// Whether the function has the no-signed-zeros-fp-math attribute set.
   bool NoSignedZeros = false;
@@ -414,7 +411,8 @@ struct PromoteMem2Reg {
   }
 
   unsigned getNumPreds(const BasicBlock *BB) {
-    unsigned &NP = BBNumPreds[BB];
+    // BBNumPreds is resized to getMaxBlockNumber() at the beginning.
+    unsigned &NP = BBNumPreds[BB->getNumber()];
     if (NP == 0)
       NP = pred_size(BB) + 1;
     return NP - 1;
@@ -795,13 +793,9 @@ void PromoteMem2Reg::run() {
       continue;
     }
 
-    // If we haven't computed a numbering for the BB's in the function, do so
-    // now.
-    if (BBNumbers.empty()) {
-      unsigned ID = 0;
-      for (auto &BB : F)
-        BBNumbers[&BB] = ID++;
-    }
+    // Initialize BBNumPreds lazily
+    if (BBNumPreds.empty())
+      BBNumPreds.resize(F.getMaxBlockNumber());
 
     // Remember the dbg.declare intrinsic describing this alloca, if any.
     if (!Info.DbgUsers.empty())
@@ -831,8 +825,8 @@ void PromoteMem2Reg::run() {
     IDF.setDefiningBlocks(DefBlocks);
     SmallVector<BasicBlock *, 32> PHIBlocks;
     IDF.calculate(PHIBlocks);
-    llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
-      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    llvm::sort(PHIBlocks, [](BasicBlock *A, BasicBlock *B) {
+      return A->getNumber() < B->getNumber();
     });
 
     unsigned CurrentVersion = 0;
@@ -857,6 +851,9 @@ void PromoteMem2Reg::run() {
   // locations until proven otherwise.
   RenamePassData::LocationVector Locations(Allocas.size());
 
+  // The renamer uses the Visited set to avoid infinite loops.
+  Visited.resize(F.getMaxBlockNumber());
+
   // Walks all basic blocks in the function performing the SSA rename algorithm
   // and inserting the phi nodes we marked as necessary
   std::vector<RenamePassData> RenamePassWorkList;
@@ -869,9 +866,6 @@ void PromoteMem2Reg::run() {
     RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
   } while (!RenamePassWorkList.empty());
 
-  // The renamer uses the Visited set to avoid infinite loops.  Clear it now.
-  Visited.clear();
-
   // Remove the allocas themselves from the function.
   for (Instruction *A : Allocas) {
     // Remove dbg.assigns linked to the alloca as these are now redundant.
@@ -954,8 +948,8 @@ void PromoteMem2Reg::run() {
     // Ok, now we know that all of the PHI nodes are missing entries for some
     // basic blocks.  Start by sorting the incoming predecessors for efficient
     // access.
-    auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
-      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    auto CompareBBNumbers = [](BasicBlock *A, BasicBlock *B) {
+      return A->getNumber() < B->getNumber();
     };
     llvm::sort(Preds, CompareBBNumbers);
 
@@ -1067,7 +1061,7 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
 bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
                                   unsigned &Version) {
   // Look up the basic-block in question.
-  PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
+  PHINode *&PN = NewPhiNodes[std::make_pair(BB->getNumber(), AllocaNo)];
 
   // If the BB already has a phi node added for the i'th alloca then we're done!
   if (PN)
@@ -1165,8 +1159,9 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
   }
 
   // Don't revisit blocks.
-  if (!Visited.insert(BB).second)
+  if (Visited.test(BB->getNumber()))
     return;
+  Visited.set(BB->getNumber());
 
   for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
     Instruction *I = &*II++; // get the instruction, increment iterator
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 7bfff4dfa67ad..c944859cc69b8 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -228,6 +228,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
   NewInst->takeName(&Inst);
   InsertedValues.insert(NewInst);
   Inst.replaceAllUsesWith(NewInst);
+  NewInst->setDebugLoc(Inst.getDebugLoc());
   Solver.removeLatticeValueFor(&Inst);
   Inst.eraseFromParent();
   return true;
@@ -307,7 +308,8 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
       Updates.push_back({DominatorTree::Delete, BB, Succ});
     }
 
-    BranchInst::Create(OnlyFeasibleSuccessor, BB);
+    Instruction *BI = BranchInst::Create(OnlyFeasibleSuccessor, BB);
+    BI->setDebugLoc(TI->getDebugLoc());
     TI->eraseFromParent();
     DTU.applyUpdatesPermissive(Updates);
   } else if (FeasibleSuccessors.size() > 1) {
@@ -1501,7 +1503,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
     Value *V2 = SCCPSolver::isConstant(V2State)
                     ? getConstant(V2State, I.getOperand(1)->getType())
                     : I.getOperand(1);
-    Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
+    Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL, &I));
     auto *C = dyn_cast_or_null<Constant>(R);
     if (C) {
       // Conservatively assume that the result may be based on operands that may
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 8f717cb43bcb4..f23e28888931d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3476,7 +3476,8 @@ static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
 /// Given a BB that starts with the specified two-entry PHI node,
 /// see if we can eliminate it.
 static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
-                                DomTreeUpdater *DTU, const DataLayout &DL) {
+                                DomTreeUpdater *DTU, const DataLayout &DL,
+                                bool SpeculateUnpredictables) {
   // Ok, this is a two entry PHI node.  Check to see if this is a simple "if
   // statement", which has a very simple dominance structure.  Basically, we
   // are trying to find the condition that is being branched on, which
@@ -3508,7 +3509,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // jump to one specific 'then' block (if we have two of them).
   // It isn't beneficial to speculatively execute the code
   // from the block that we know is predictably not entered.
-  if (!DomBI->getMetadata(LLVMContext::MD_unpredictable)) {
+  bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable);
+  if (!IsUnpredictable) {
     uint64_t TWeight, FWeight;
     if (extractBranchWeights(*DomBI, TWeight, FWeight) &&
         (TWeight + FWeight) != 0) {
@@ -3551,6 +3553,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   InstructionCost Cost = 0;
   InstructionCost Budget =
       TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+  if (SpeculateUnpredictables && IsUnpredictable)
+    Budget += TTI.getBranchMispredictPenalty();
 
   bool Changed = false;
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
@@ -3620,8 +3624,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
              [](BasicBlock *IfBlock) { return IfBlock->hasAddressTaken(); }))
     return Changed;
 
-  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond
-                    << "  T: " << IfTrue->getName()
+  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond;
+             if (IsUnpredictable) dbgs() << " (unpredictable)";
+             dbgs() << "  T: " << IfTrue->getName()
                     << "  F: " << IfFalse->getName() << "\n");
 
   // If we can still promote the PHI nodes after this gauntlet of tests,
@@ -7814,7 +7819,8 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
     // eliminate it, do so now.
     if (auto *PN = dyn_cast<PHINode>(BB->begin()))
       if (PN->getNumIncomingValues() == 2)
-        if (FoldTwoEntryPHINode(PN, TTI, DTU, DL))
+        if (FoldTwoEntryPHINode(PN, TTI, DTU, DL,
+                                Options.SpeculateUnpredictables))
           return true;
   }
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 89c8c5bf08954..ded9209cfe249 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -3018,6 +3018,37 @@ void LibCallSimplifier::classifyArgUse(
   }
 }
 
+/// Constant folds remquo
+Value *LibCallSimplifier::optimizeRemquo(CallInst *CI, IRBuilderBase &B) {
+  const APFloat *X, *Y;
+  if (!match(CI->getArgOperand(0), m_APFloat(X)) ||
+      !match(CI->getArgOperand(1), m_APFloat(Y)))
+    return nullptr;
+
+  APFloat::opStatus Status;
+  APFloat Quot = *X;
+  Status = Quot.divide(*Y, APFloat::rmNearestTiesToEven);
+  if (Status != APFloat::opOK && Status != APFloat::opInexact)
+    return nullptr;
+  APFloat Rem = *X;
+  if (Rem.remainder(*Y) != APFloat::opOK)
+    return nullptr;
+
+  // TODO: We can only keep at least the three of the last bits of x/y
+  unsigned IntBW = TLI->getIntSize();
+  APSInt QuotInt(IntBW, /*isUnsigned=*/false);
+  bool IsExact;
+  Status =
+      Quot.convertToInteger(QuotInt, APFloat::rmNearestTiesToEven, &IsExact);
+  if (Status != APFloat::opOK && Status != APFloat::opInexact)
+    return nullptr;
+
+  B.CreateAlignedStore(
+      ConstantInt::get(B.getIntNTy(IntBW), QuotInt.getExtValue()),
+      CI->getArgOperand(2), CI->getParamAlign(2));
+  return ConstantFP::get(CI->getType(), Rem);
+}
+
 //===----------------------------------------------------------------------===//
 // Integer Library Call Optimizations
 //===----------------------------------------------------------------------===//
@@ -3698,6 +3729,17 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
   return nullptr;
 }
 
+Value *LibCallSimplifier::optimizeExit(CallInst *CI) {
+
+  // Mark 'exit' as cold if its not exit(0) (success).
+  const APInt *C;
+  if (!CI->hasFnAttr(Attribute::Cold) &&
+      match(CI->getArgOperand(0), m_APInt(C)) && !C->isZero()) {
+    CI->addFnAttr(Attribute::Cold);
+  }
+  return nullptr;
+}
+
 Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
   // bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
   return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1),
@@ -3816,6 +3858,22 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
   return nullptr;
 }
 
+/// Constant folding nan/nanf/nanl.
+static Value *optimizeNaN(CallInst *CI) {
+  StringRef CharSeq;
+  if (!getConstantStringInfo(CI->getArgOperand(0), CharSeq))
+    return nullptr;
+
+  APInt Fill;
+  // Treat empty strings as if they were zero.
+  if (CharSeq.empty())
+    Fill = APInt(32, 0);
+  else if (CharSeq.getAsInteger(0, Fill))
+    return nullptr;
+
+  return ConstantFP::getQNaN(CI->getType(), /*Negative=*/false, &Fill);
+}
+
 Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
                                                        LibFunc Func,
                                                        IRBuilderBase &Builder) {
@@ -3926,6 +3984,14 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
   case LibFunc_cabsf:
   case LibFunc_cabsl:
     return optimizeCAbs(CI, Builder);
+  case LibFunc_remquo:
+  case LibFunc_remquof:
+  case LibFunc_remquol:
+    return optimizeRemquo(CI, Builder);
+  case LibFunc_nan:
+  case LibFunc_nanf:
+  case LibFunc_nanl:
+    return optimizeNaN(CI);
   default:
     return nullptr;
   }
@@ -4049,6 +4115,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
     case LibFunc_vfprintf:
     case LibFunc_fiprintf:
       return optimizeErrorReporting(CI, Builder, 0);
+    case LibFunc_exit:
+    case LibFunc_Exit:
+      return optimizeExit(CI);
     default:
       return nullptr;
     }
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 1696e9c726735..5af56b01cb69c 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -565,9 +565,8 @@ void Mapper::remapDbgRecord(DbgRecord &DR) {
   }
 
   // Find Value operands and remap those.
-  SmallVector<Value *, 4> Vals, NewVals;
-  for (Value *Val : V.location_ops())
-    Vals.push_back(Val);
+  SmallVector<Value *, 4> Vals(V.location_ops());
+  SmallVector<Value *, 4> NewVals;
   for (Value *Val : Vals)
     NewVals.push_back(mapValue(Val));
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 64e04cae2773f..cb31e2a2ecaec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -268,25 +268,25 @@ bool LoopIdiomVectorize::recognizeByteCompare() {
             return false;
 
   // Match the branch instruction for the header
-  ICmpInst::Predicate Pred;
   Value *MaxLen;
   BasicBlock *EndBB, *WhileBB;
   if (!match(Header->getTerminator(),
-             m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Index),
+                                 m_Value(MaxLen)),
                   m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
-      Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB))
+      !CurLoop->contains(WhileBB))
     return false;
 
   // WhileBB should contain the pattern of load & compare instructions. Match
   // the pattern and find the GEP instructions used by the loads.
-  ICmpInst::Predicate WhilePred;
   BasicBlock *FoundBB;
   BasicBlock *TrueBB;
   Value *LoadA, *LoadB;
   if (!match(WhileBB->getTerminator(),
-             m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(LoadA),
+                                 m_Value(LoadB)),
                   m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
-      WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB))
+      !CurLoop->contains(TrueBB))
     return false;
 
   Value *A, *B;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f54eebb2874ab..5babb1d4c1f96 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -47,7 +47,7 @@ cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                          cl::desc("Allow enabling loop hints to reorder "
                                   "FP operations during vectorization."));
-}
+} // namespace llvm
 
 // TODO: Move size-based thresholds out of legality checking, make cost based
 // decisions instead of hard thresholds.
@@ -216,20 +216,19 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
                                       TheLoop->getStartLoc(),
                                       TheLoop->getHeader())
              << "loop not vectorized: vectorization is explicitly disabled";
-    else {
-      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
-                                 TheLoop->getStartLoc(), TheLoop->getHeader());
-      R << "loop not vectorized";
-      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
-        R << " (Force=" << NV("Force", true);
-        if (Width.Value != 0)
-          R << ", Vector Width=" << NV("VectorWidth", getWidth());
-        if (getInterleave() != 0)
-          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
-        R << ")";
-      }
-      return R;
+
+    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
+                               TheLoop->getHeader());
+    R << "loop not vectorized";
+    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+      R << " (Force=" << NV("Force", true);
+      if (Width.Value != 0)
+        R << ", Vector Width=" << NV("VectorWidth", getWidth());
+      if (getInterleave() != 0)
+        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
+      R << ")";
     }
+    return R;
   });
 }
 
@@ -261,20 +260,20 @@ void LoopVectorizeHints::getHintsFromMetadata() {
   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
   assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 
-  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
     const MDString *S = nullptr;
     SmallVector<Metadata *, 4> Args;
 
     // The expected hint is either a MDString or a MDNode with the first
     // operand a MDString.
-    if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
       if (!MD || MD->getNumOperands() == 0)
         continue;
       S = dyn_cast<MDString>(MD->getOperand(0));
-      for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
-        Args.push_back(MD->getOperand(i));
+      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
+        Args.push_back(MD->getOperand(Idx));
     } else {
-      S = dyn_cast<MDString>(LoopID->getOperand(i));
+      S = dyn_cast<MDString>(MDO);
       assert(Args.size() == 0 && "too many arguments for MDString");
     }
 
@@ -444,10 +443,7 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
     return true;
 
   // Otherwise compare address SCEVs
-  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
-    return true;
-
-  return false;
+  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
 }
 
 int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
@@ -734,26 +730,21 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
   BasicBlock *Header = TheLoop->getHeader();
 
   // Returns true if a given Phi is a supported induction.
-  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
     InductionDescriptor ID;
     if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
         ID.getKind() == InductionDescriptor::IK_IntInduction) {
       addInductionPhi(&Phi, ID, AllowedExit);
       return true;
-    } else {
-      // Bail out for any Phi in the outer loop header that is not a supported
-      // induction.
-      LLVM_DEBUG(
-          dbgs()
-          << "LV: Found unsupported PHI for outer loop vectorization.\n");
-      return false;
     }
+    // Bail out for any Phi in the outer loop header that is not a supported
+    // induction.
+    LLVM_DEBUG(
+        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
+    return false;
   };
 
-  if (llvm::all_of(Header->phis(), isSupportedPhi))
-    return true;
-  else
-    return false;
+  return llvm::all_of(Header->phis(), IsSupportedPhi);
 }
 
 /// Checks if a function is scalarizable according to the TLI, in
@@ -837,13 +828,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // historical vectorizer behavior after a generalization of the
         // IVDescriptor code.  The intent is to remove this check, but we
         // have to fix issues around code quality for such loops first.
-        auto isDisallowedStridedPointerInduction =
-          [](const InductionDescriptor &ID) {
-          if (AllowStridedPointerIVs)
-            return false;
-          return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
-            ID.getConstIntStepValue() == nullptr;
-        };
+        auto IsDisallowedStridedPointerInduction =
+            [](const InductionDescriptor &ID) {
+              if (AllowStridedPointerIVs)
+                return false;
+              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+                     ID.getConstIntStepValue() == nullptr;
+            };
 
         // TODO: Instead of recording the AllowedExit, it would be good to
         // record the complementary set: NotAllowedExit. These include (but may
@@ -861,7 +852,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // of these NotAllowedExit.
         InductionDescriptor ID;
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
-            !isDisallowedStridedPointerInduction(ID)) {
+            !IsDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           Requirements->addExactFPMathInst(ID.getExactFPMathInst());
           continue;
@@ -876,7 +867,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // As a last resort, coerce the PHI to a AddRec expression
         // and re-try classifying it a an induction PHI.
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
-            !isDisallowedStridedPointerInduction(ID)) {
+            !IsDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           continue;
         }
@@ -932,9 +923,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI) {
         auto *SE = PSE.getSE();
         Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
-        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
-          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
-            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
+          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
+            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
+                                     TheLoop)) {
               reportVectorizationFailure("Found unvectorizable intrinsic",
                   "intrinsic instruction cannot be vectorized",
                   "CantVectorizeIntrinsic", ORE, TheLoop, CI);
@@ -1035,14 +1027,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           "loop induction variable could not be identified",
           "NoInductionVariable", ORE, TheLoop);
       return false;
-    } else if (!WidestIndTy) {
+    }
+    if (!WidestIndTy) {
       reportVectorizationFailure("Did not find one integer induction var",
           "integer loop induction variable could not be identified",
           "NoIntegerInductionVariable", ORE, TheLoop);
       return false;
-    } else {
-      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     }
+    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
   }
 
   // Now we know the widest induction type, check if our found induction
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c63cf0c37f2f9..ca20bc3ada799 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -365,8 +365,8 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Return the most profitable plan and fix its VF to the most profitable one.
-  VPlan &getBestPlan() const;
+  /// Return the most profitable vectorization factor.
+  ElementCount getBestVF() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
@@ -411,6 +411,9 @@ class LoopVectorizationPlanner {
   VectorizationFactor
   selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);
 
+  /// Emit remarks for recipes with invalid costs in the available VPlans.
+  void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+
 protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 40919c944d21f..6daa8043a3fbf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -75,6 +75,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -137,6 +138,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
@@ -436,37 +438,6 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
   return std::nullopt;
 }
 
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
-                                const Twine &Name) {
-  unsigned Factor = Vals.size();
-  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
-  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
-  for (Value *Val : Vals)
-    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
-  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
-  // must use intrinsics to interleave.
-  if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
-                                   Vals,
-                                   /*FMFSource=*/nullptr, Name);
-  }
-
-  // Fixed length. Start by concatenating all vectors into a wide vector.
-  Value *WideVec = concatenateVectors(Builder, Vals);
-
-  // Interleave the elements into the wide vector.
-  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
-  return Builder.CreateShuffleVector(
-      WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
 namespace {
 // Forward declare GeneratedRTChecks.
 class GeneratedRTChecks;
@@ -548,16 +519,6 @@ class InnerLoopVectorizer {
                             const VPIteration &Instance,
                             VPTransformState &State);
 
-  /// Try to vectorize interleaved access group \p Group with the base address
-  /// given in \p Addr, optionally masking the vector operations if \p
-  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
-  /// values in the vectorized loop.
-  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
-                                ArrayRef<VPValue *> VPDefs,
-                                VPTransformState &State, VPValue *Addr,
-                                ArrayRef<VPValue *> StoredValues,
-                                VPValue *BlockInMask, bool NeedsMaskForGaps);
-
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
 
@@ -606,11 +567,6 @@ class InnerLoopVectorizer {
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
-  /// Returns a bitcasted value to the requested vector type.
-  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
-  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
-                                const DataLayout &DL);
-
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitIterationCountCheck(BasicBlock *Bypass);
@@ -934,20 +890,18 @@ static void debugVectorizationMessage(const StringRef Prefix,
 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
-/// the location of the remark.  \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
-    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
-  Value *CodeRegion = TheLoop->getHeader();
-  DebugLoc DL = TheLoop->getStartLoc();
-
-  if (I) {
-    CodeRegion = I->getParent();
-    // If there is no debug location attached to the instruction, revert back to
-    // using the loop's.
-    if (I->getDebugLoc())
-      DL = I->getDebugLoc();
-  }
+/// the location of the remark. If \p DL is passed, use it as debug location for
+/// the remark. \return the remark object that can be streamed to.
+static OptimizationRemarkAnalysis
+createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
+                 Instruction *I, DebugLoc DL = {}) {
+  Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
+  // If debug location is attached to the instruction, use it. Otherwise if DL
+  // was not provided, use the loop's.
+  if (I && I->getDebugLoc())
+    DL = I->getDebugLoc();
+  else if (!DL)
+    DL = TheLoop->getStartLoc();
 
   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
 }
@@ -988,15 +942,17 @@ void reportVectorizationFailure(const StringRef DebugMsg,
 
 /// Reports an informative message: print \p Msg for debugging purposes as well
 /// as an optimization remark. Uses either \p I as location of the remark, or
-/// otherwise \p TheLoop.
+/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
+/// remark. If \p DL is passed, use it as debug location for the remark.
 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
-                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
-                             Instruction *I = nullptr) {
+                                    OptimizationRemarkEmitter *ORE,
+                                    Loop *TheLoop, Instruction *I = nullptr,
+                                    DebugLoc DL = {}) {
   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
-  ORE->emit(
-      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
-      << Msg);
+  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
+                             I, DL)
+            << Msg);
 }
 
 /// Report successful vectorization of the loop. In case an outer loop is
@@ -1583,12 +1539,8 @@ class LoopVectorizationCostModel {
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width. If \p Invalid is not nullptr, this function
-  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
-  /// each instruction that has an Invalid cost for the given VF.
-  InstructionCost
-  expectedCost(ElementCount VF,
-               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
+  /// the factor width.
+  InstructionCost expectedCost(ElementCount VF);
 
   bool hasPredStores() const { return NumPredStores > 0; }
 
@@ -2388,275 +2340,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-//   for (i = 0; i < N; i+=3) {
-//     R = Pic[i];             // Member of index 0
-//     G = Pic[i+1];           // Member of index 1
-//     B = Pic[i+2];           // Member of index 2
-//     ... // do something to R, G, B
-//   }
-// To:
-//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
-//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
-//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
-//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-//   for (i = 0; i < N; i+=3) {
-//     ... do something to R, G, B
-//     Pic[i]   = R;           // Member of index 0
-//     Pic[i+1] = G;           // Member of index 1
-//     Pic[i+2] = B;           // Member of index 2
-//   }
-// To:
-//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
-//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
-    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
-    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
-    VPValue *BlockInMask, bool NeedsMaskForGaps) {
-  Instruction *Instr = Group->getInsertPos();
-  const DataLayout &DL = Instr->getDataLayout();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
-
-  // Prepare for the new pointers.
-  SmallVector<Value *, 2> AddrParts;
-  unsigned Index = Group->getIndex(Instr);
-
-  // TODO: extend the masked interleaved-group support to reversed access.
-  assert((!BlockInMask || !Group->isReverse()) &&
-         "Reversed masked interleave-group not supported.");
-
-  Value *Idx;
-  // If the group is reverse, adjust the index to refer to the last vector lane
-  // instead of the first. We adjust the index from the first vector lane,
-  // rather than directly getting the pointer for lane VF - 1, because the
-  // pointer operand of the interleaved access is supposed to be uniform. For
-  // uniform instructions, we're only required to generate a value for the
-  // first vector lane in each unroll iteration.
-  if (Group->isReverse()) {
-    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
-    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
-    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
-    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
-    Idx = Builder.CreateNeg(Idx);
-  } else
-    Idx = Builder.getInt32(-Index);
-
-  for (unsigned Part = 0; Part < State.UF; Part++) {
-    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
-    if (auto *I = dyn_cast<Instruction>(AddrPart))
-      State.setDebugLocFrom(I->getDebugLoc());
-
-    // Notice current instruction could be any index. Need to adjust the address
-    // to the member of index 0.
-    //
-    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-    //       b = A[i];       // Member of index 0
-    // Current pointer is pointed to A[i+1], adjust it to A[i].
-    //
-    // E.g.  A[i+1] = a;     // Member of index 1
-    //       A[i]   = b;     // Member of index 0
-    //       A[i+2] = c;     // Member of index 2 (Current instruction)
-    // Current pointer is pointed to A[i+2], adjust it to A[i].
-
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
-    AddrParts.push_back(AddrPart);
-  }
-
-  State.setDebugLocFrom(Instr->getDebugLoc());
-  Value *PoisonVec = PoisonValue::get(VecTy);
-
-  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
-                             unsigned Part, Value *MaskForGaps) -> Value * {
-    if (State.VF.isScalable()) {
-      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      auto *BlockInMaskPart = State.get(BlockInMask, Part);
-      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
-      auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
-                                     State.VF.getKnownMinValue() * 2, true);
-      return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
-                                     /*FMFSource=*/nullptr, "interleaved.mask");
-    }
-
-    if (!BlockInMask)
-      return MaskForGaps;
-
-    Value *BlockInMaskPart = State.get(BlockInMask, Part);
-    Value *ShuffledMask = Builder.CreateShuffleVector(
-        BlockInMaskPart,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
-        "interleaved.mask");
-    return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
-                                             MaskForGaps)
-                       : ShuffledMask;
-  };
-
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    Value *MaskForGaps = nullptr;
-    if (NeedsMaskForGaps) {
-      MaskForGaps =
-          createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group);
-      assert(MaskForGaps && "Mask for Gaps is required but it is null");
-    }
-
-    // For each unroll part, create a wide load for the group.
-    SmallVector<Value *, 2> NewLoads;
-    for (unsigned Part = 0; Part < State.UF; Part++) {
-      Instruction *NewLoad;
-      if (BlockInMask || MaskForGaps) {
-        assert(useMaskedInterleavedAccesses(*TTI) &&
-               "masked interleaved groups are not allowed.");
-        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
-        NewLoad =
-            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
-                                     GroupMask, PoisonVec, "wide.masked.vec");
-      }
-      else
-        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
-                                            Group->getAlign(), "wide.vec");
-      Group->addMetadata(NewLoad);
-      NewLoads.push_back(NewLoad);
-    }
-
-    if (VecTy->isScalableTy()) {
-      assert(InterleaveFactor == 2 &&
-             "Unsupported deinterleave factor for scalable vectors");
-
-      for (unsigned Part = 0; Part < State.UF; ++Part) {
-        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-        // so must use intrinsics to deinterleave.
-        Value *DI = Builder.CreateIntrinsic(
-            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
-            /*FMFSource=*/nullptr, "strided.vec");
-        unsigned J = 0;
-        for (unsigned I = 0; I < InterleaveFactor; ++I) {
-          Instruction *Member = Group->getMember(I);
-
-          if (!Member)
-            continue;
-
-          Value *StridedVec = Builder.CreateExtractValue(DI, I);
-          // If this member has different type, cast the result type.
-          if (Member->getType() != ScalarTy) {
-            VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-            StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
-          }
-
-          if (Group->isReverse())
-            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
-          State.set(VPDefs[J], StridedVec, Part);
-          ++J;
-        }
-      }
-
-      return;
-    }
-
-    // For each member in the group, shuffle out the appropriate data from the
-    // wide loads.
-    unsigned J = 0;
-    for (unsigned I = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      auto StrideMask =
-          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
-      for (unsigned Part = 0; Part < State.UF; Part++) {
-        Value *StridedVec = Builder.CreateShuffleVector(
-            NewLoads[Part], StrideMask, "strided.vec");
-
-        // If this member has different type, cast the result type.
-        if (Member->getType() != ScalarTy) {
-          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
-          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
-        }
-
-        if (Group->isReverse())
-          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
-        State.set(VPDefs[J], StridedVec, Part);
-      }
-      ++J;
-    }
-    return;
-  }
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-
-  // Vectorize the interleaved store group.
-  Value *MaskForGaps =
-      createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group);
-  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
-         "masked interleaved groups are not allowed.");
-  assert((!MaskForGaps || !State.VF.isScalable()) &&
-         "masking gaps for scalable vectors is not yet supported.");
-  for (unsigned Part = 0; Part < State.UF; Part++) {
-    // Collect the stored vector from each member.
-    SmallVector<Value *, 4> StoredVecs;
-    unsigned StoredIdx = 0;
-    for (unsigned i = 0; i < InterleaveFactor; i++) {
-      assert((Group->getMember(i) || MaskForGaps) &&
-             "Fail to get a member from an interleaved store group");
-      Instruction *Member = Group->getMember(i);
-
-      // Skip the gaps in the group.
-      if (!Member) {
-        Value *Undef = PoisonValue::get(SubVT);
-        StoredVecs.push_back(Undef);
-        continue;
-      }
-
-      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
-      ++StoredIdx;
-
-      if (Group->isReverse())
-        StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
-
-      // If this member has different type, cast it to a unified type.
-
-      if (StoredVec->getType() != SubVT)
-        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
-      StoredVecs.push_back(StoredVec);
-    }
-
-    // Interleave all the smaller vectors into one wider vector.
-    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
-    Instruction *NewStoreInstr;
-    if (BlockInMask || MaskForGaps) {
-      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
-      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
-                                                Group->getAlign(), GroupMask);
-    } else
-      NewStoreInstr =
-          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
-    Group->addMetadata(NewStoreInstr);
-  }
-}
-
 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
@@ -2764,37 +2447,6 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
   return VectorTripCount;
 }
 
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
-                                                   const DataLayout &DL) {
-  // Verify that V is a vector type with same number of elements as DstVTy.
-  auto *DstFVTy = cast<VectorType>(DstVTy);
-  auto VF = DstFVTy->getElementCount();
-  auto *SrcVecTy = cast<VectorType>(V->getType());
-  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
-  Type *SrcElemTy = SrcVecTy->getElementType();
-  Type *DstElemTy = DstFVTy->getElementType();
-  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
-         "Vector elements must have same size");
-
-  // Do a direct cast if element types are castable.
-  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
-    return Builder.CreateBitOrPointerCast(V, DstFVTy);
-  }
-  // V cannot be directly casted to desired vector type.
-  // May happen when V is a floating point vector but DstVTy is a vector of
-  // pointers or vice-versa. Handle this using a two-step bitcast using an
-  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
-  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
-         "Only one type should be a pointer type");
-  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
-         "Only one type should be a floating point type");
-  Type *IntTy =
-      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
-  auto *VecIntTy = VectorType::get(IntTy, VF);
-  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
-  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
-}
-
 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   Value *Count = getTripCount();
   // Reuse existing vector loop preheader for TC checks.
@@ -3167,10 +2819,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     // In this case, if IV1 has an external use, we need to avoid adding both
     // "last value of IV1" and "penultimate value of IV2". So, verify that we
     // don't already have an incoming value for the middle block.
-    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
+    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
-      Plan.removeLiveOut(PHI);
-    }
   }
 }
 
@@ -3684,45 +3334,54 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
   }
 }
 
+// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
-  if (!blockNeedsPredicationForAnyReason(I->getParent()))
+  // If predication is not needed, avoid it.
+  // TODO: We can use the loop-preheader as context point here and get
+  // context sensitive reasoning for isSafeToSpeculativelyExecute.
+  if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
+      isSafeToSpeculativelyExecute(I) ||
+      (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
+      isa<BranchInst, PHINode>(I))
     return false;
 
-  // Can we prove this instruction is safe to unconditionally execute?
-  // If not, we must use some form of predication.
+  // If the instruction was executed conditionally in the original scalar loop,
+  // predication is needed with a mask whose lanes are all possibly inactive.
+  if (Legal->blockNeedsPredication(I->getParent()))
+    return true;
+
+  // All that remain are instructions with side-effects originally executed in
+  // the loop unconditionally, but now execute under a tail-fold mask (only)
+  // having at least one active lane (the first). If the side-effects of the
+  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
+  // - it will cause the same side-effects as when masked.
   switch(I->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable(
+        "instruction should have been considered by earlier checks");
+  case Instruction::Call:
+    // Side-effects of a Call are assumed to be non-invariant, needing a
+    // (fold-tail) mask.
+    assert(Legal->isMaskRequired(I) &&
+           "should have returned earlier for calls not needing a mask");
+    return true;
   case Instruction::Load:
+    // If the address is loop invariant no predication is needed.
+    return !Legal->isInvariant(getLoadStorePointerOperand(I));
   case Instruction::Store: {
-    if (!Legal->isMaskRequired(I))
-      return false;
-    // When we know the load's address is loop invariant and the instruction
-    // in the original scalar loop was unconditionally executed then we
-    // don't need to mark it as a predicated instruction. Tail folding may
-    // introduce additional predication, but we're guaranteed to always have
-    // at least one active lane.  We call Legal->blockNeedsPredication here
-    // because it doesn't query tail-folding.  For stores, we need to prove
-    // both speculation safety (which follows from the same argument as loads),
-    // but also must prove the value being stored is correct.  The easiest
-    // form of the later is to require that all values stored are the same.
-    if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
-        (isa<LoadInst>(I) ||
-         (isa<StoreInst>(I) &&
-          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
-        !Legal->blockNeedsPredication(I->getParent()))
-      return false;
-    return true;
+    // For stores, we need to prove both speculation safety (which follows from
+    // the same argument as loads), but also must prove the value being stored
+    // is correct.  The easiest form of the later is to require that all values
+    // stored are the same.
+    return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
+             TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    // TODO: We can use the loop-preheader as context point here and get
-    // context sensitive reasoning
-    return !isSafeToSpeculativelyExecute(I);
-  case Instruction::Call:
-    return Legal->isMaskRequired(I);
+    // If the divisor is loop-invariant no predication is needed.
+    return !TheLoop->isLoopInvariant(I->getOperand(1));
   }
 }
 
@@ -3915,19 +3574,19 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   SetVector<Instruction *> Worklist;
 
   // Add uniform instructions demanding lane 0 to the worklist. Instructions
-  // that are scalar with predication must not be considered uniform after
+  // that require predication must not be considered uniform after
   // vectorization, because that would create an erroneous replicating region
   // where only a single instance out of VF should be formed.
-  // TODO: optimize such seldom cases if found important, see PR40816.
   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
     if (isOutOfScope(I)) {
       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                         << *I << "\n");
       return;
     }
-    if (isScalarWithPredication(I, VF)) {
-      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
-                        << *I << "\n");
+    if (isPredicatedInst(I)) {
+      LLVM_DEBUG(
+          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
+                 << "\n");
       return;
     }
     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
@@ -4197,14 +3856,11 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
     return false;
   }
 
-  if (!Legal->isSafeForAnyVectorWidth()) {
-    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
-    if (!MaxVScale) {
-      reportVectorizationInfo(
-          "The target does not provide maximum vscale value.",
-          "ScalableVFUnfeasible", ORE, TheLoop);
-      return false;
-    }
+  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
+    reportVectorizationInfo("The target does not provide maximum vscale value "
+                            "for safe distance analysis.",
+                            "ScalableVFUnfeasible", ORE, TheLoop);
+    return false;
   }
 
   IsScalableVectorizationAllowed = true;
@@ -4593,15 +4249,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
 
     // Select the largest VF which doesn't require more registers than existing
     // ones.
-    for (int i = RUs.size() - 1; i >= 0; --i) {
-      bool Selected = true;
-      for (auto &pair : RUs[i].MaxLocalUsers) {
-        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
-        if (pair.second > TargetNumRegisters)
-          Selected = false;
-      }
-      if (Selected) {
-        MaxVF = VFs[i];
+    for (int I = RUs.size() - 1; I >= 0; --I) {
+      const auto &MLU = RUs[I].MaxLocalUsers;
+      if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
+            return LU.second <= TTI.getNumberOfRegisters(LU.first);
+          })) {
+        MaxVF = VFs[I];
         break;
       }
     }
@@ -4694,24 +4347,38 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   return CmpFn(RTCostA, RTCostB);
 }
 
-static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
-                                   OptimizationRemarkEmitter *ORE,
-                                   Loop *TheLoop) {
+void LoopVectorizationPlanner::emitInvalidCostRemarks(
+    OptimizationRemarkEmitter *ORE) {
+  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
+  LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+  SmallVector<RecipeVFPair> InvalidCosts;
+  for (const auto &Plan : VPlans) {
+    for (ElementCount VF : Plan->vectorFactors()) {
+      VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
+                            CM);
+      auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
+      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+        for (auto &R : *VPBB) {
+          if (!R.cost(VF, CostCtx).isValid())
+            InvalidCosts.emplace_back(&R, VF);
+        }
+      }
+    }
+  }
   if (InvalidCosts.empty())
     return;
 
   // Emit a report of VFs with invalid costs in the loop.
 
-  // Group the remarks per instruction, keeping the instruction order from
-  // InvalidCosts.
-  std::map<Instruction *, unsigned> Numbering;
+  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
+  DenseMap<VPRecipeBase *, unsigned> Numbering;
   unsigned I = 0;
   for (auto &Pair : InvalidCosts)
     if (!Numbering.count(Pair.first))
       Numbering[Pair.first] = I++;
 
-  // Sort the list, first on instruction(number) then on VF.
-  sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+  // Sort the list, first on recipe(number) then on VF.
+  sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
     if (Numbering[A.first] != Numbering[B.first])
       return Numbering[A.first] < Numbering[B.first];
     const auto &LHS = A.second;
@@ -4720,38 +4387,64 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
   });
 
-  // For a list of ordered instruction-vf pairs:
-  //   [(load, vf1), (load, vf2), (store, vf1)]
-  // Group the instructions together to emit separate remarks for:
-  //   load  (vf1, vf2)
-  //   store (vf1)
-  auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
-  auto Subset = ArrayRef<InstructionVFPair>();
+  // For a list of ordered recipe-VF pairs:
+  //   [(load, VF1), (load, VF2), (store, VF1)]
+  // group the recipes together to emit separate remarks for:
+  //   load  (VF1, VF2)
+  //   store (VF1)
+  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
+  auto Subset = ArrayRef<RecipeVFPair>();
   do {
     if (Subset.empty())
       Subset = Tail.take_front(1);
 
-    Instruction *I = Subset.front().first;
-
-    // If the next instruction is different, or if there are no other pairs,
+    VPRecipeBase *R = Subset.front().first;
+
+    unsigned Opcode =
+        TypeSwitch<const VPRecipeBase *, unsigned>(R)
+            .Case<VPHeaderPHIRecipe>(
+                [](const auto *R) { return Instruction::PHI; })
+            .Case<VPWidenSelectRecipe>(
+                [](const auto *R) { return Instruction::Select; })
+            .Case<VPWidenStoreRecipe>(
+                [](const auto *R) { return Instruction::Store; })
+            .Case<VPWidenLoadRecipe>(
+                [](const auto *R) { return Instruction::Load; })
+            .Case<VPWidenCallRecipe>(
+                [](const auto *R) { return Instruction::Call; })
+            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
+                  VPWidenCastRecipe>(
+                [](const auto *R) { return R->getOpcode(); })
+            .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
+              return R->getStoredValues().empty() ? Instruction::Load
+                                                  : Instruction::Store;
+            });
+
+    // If the next recipe is different, or if there are no other pairs,
     // emit a remark for the collated subset. e.g.
-    //   [(load, vf1), (load, vf2))]
+    //   [(load, VF1), (load, VF2))]
     // to emit:
-    //  remark: invalid costs for 'load' at VF=(vf, vf2)
-    if (Subset == Tail || Tail[Subset.size()].first != I) {
+    //  remark: invalid costs for 'load' at VF=(VF1, VF2)
+    if (Subset == Tail || Tail[Subset.size()].first != R) {
       std::string OutString;
       raw_string_ostream OS(OutString);
       assert(!Subset.empty() && "Unexpected empty range");
-      OS << "Instruction with invalid costs prevented vectorization at VF=(";
+      OS << "Recipe with invalid costs prevented vectorization at VF=(";
       for (const auto &Pair : Subset)
         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
       OS << "):";
-      if (auto *CI = dyn_cast<CallInst>(I))
-        OS << " call to " << CI->getCalledFunction()->getName();
-      else
-        OS << " " << I->getOpcodeName();
+      if (Opcode == Instruction::Call) {
+        auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
+        Function *CalledFn =
+            WidenCall ? WidenCall->getCalledScalarFunction()
+                      : cast<Function>(R->getOperand(R->getNumOperands() - 1)
+                                           ->getLiveInIRValue());
+        OS << " call to " << CalledFn->getName();
+      } else
+        OS << " " << Instruction::getOpcodeName(Opcode);
       OS.flush();
-      reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+      reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
+                              R->getDebugLoc());
       Tail = Tail.drop_front(Subset.size());
       Subset = {};
     } else
@@ -4880,14 +4573,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     ChosenFactor.Cost = InstructionCost::getMax();
   }
 
-  SmallVector<InstructionVFPair> InvalidCosts;
   for (auto &P : VPlans) {
     for (ElementCount VF : P->vectorFactors()) {
       // The cost for scalar VF=1 is already calculated, so ignore it.
       if (VF.isScalar())
         continue;
 
-      InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+      InstructionCost C = CM.expectedCost(VF);
       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
 
 #ifndef NDEBUG
@@ -4922,8 +4614,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     }
   }
 
-  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
-
   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
     reportVectorizationFailure(
         "There are conditional stores.",
@@ -5828,8 +5518,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
   return Discount;
 }
 
-InstructionCost LoopVectorizationCostModel::expectedCost(
-    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
+InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
   InstructionCost Cost;
 
   // For each block.
@@ -5849,10 +5538,6 @@ InstructionCost LoopVectorizationCostModel::expectedCost(
       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
         C = InstructionCost(ForceTargetInstructionCost);
 
-      // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.isValid())
-        Invalid->emplace_back(&I, VF);
-
       BlockCost += C;
       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                         << VF << " For instruction: " << I << '\n');
@@ -7028,6 +6713,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
 
   SmallVector<Value *, 4> DeadInterleavePointerOps;
+  SmallVector<Value *, 4> DeadOps;
+
+  // If a scalar epilogue is required, users outside the loop won't use
+  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
+  // that is the case.
+  bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
+  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
+    return RequiresScalarEpilogue &&
+           !TheLoop->contains(cast<Instruction>(U)->getParent());
+  };
   for (BasicBlock *BB : TheLoop->blocks())
     for (Instruction &I : *BB) {
       // Find all stores to invariant variables. Since they are going to sink
@@ -7037,6 +6732,18 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
         ValuesToIgnore.insert(&I);
 
+      if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
+        continue;
+
+      // Add instructions that would be trivially dead and are only used by
+      // values already ignored to DeadOps to seed worklist.
+      if (wouldInstructionBeTriviallyDead(&I, TLI) &&
+          all_of(I.users(), [this, IsLiveOutDead](User *U) {
+            return VecValuesToIgnore.contains(U) ||
+                   ValuesToIgnore.contains(U) || IsLiveOutDead(U);
+          }))
+        DeadOps.push_back(&I);
+
       // For interleave groups, we only create a pointer for the start of the
       // interleave group. Queue up addresses of group members except the insert
       // position for further processing.
@@ -7064,6 +6771,35 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
   }
 
+  // Mark ops that would be trivially dead and are only used by ignored
+  // instructions as free.
+  BasicBlock *Header = TheLoop->getHeader();
+  for (unsigned I = 0; I != DeadOps.size(); ++I) {
+    auto *Op = dyn_cast<Instruction>(DeadOps[I]);
+    // Skip any op that shouldn't be considered dead.
+    if (!Op || !TheLoop->contains(Op) ||
+        (isa<PHINode>(Op) && Op->getParent() == Header) ||
+        !wouldInstructionBeTriviallyDead(Op, TLI) ||
+        any_of(Op->users(), [this, IsLiveOutDead](User *U) {
+          return !VecValuesToIgnore.contains(U) && ValuesToIgnore.contains(U) &&
+                 !IsLiveOutDead(U);
+        }))
+      continue;
+
+    if (!TheLoop->contains(Op->getParent()))
+      continue;
+
+    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
+    // which applies for both scalar and vector versions. Otherwise it is only
+    // dead in vector versions, so only add it to VecValuesToIgnore.
+    if (all_of(Op->users(),
+               [this](User *U) { return ValuesToIgnore.contains(U); }))
+      ValuesToIgnore.insert(Op);
+
+    VecValuesToIgnore.insert(Op);
+    DeadOps.append(Op->op_begin(), Op->op_end());
+  }
+
   // Ignore type-promoting instructions we identified during reduction
   // detection.
   for (const auto &Reduction : Legal->getReductionVars()) {
@@ -7455,13 +7191,12 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
   return Cost;
 }
 
-VPlan &LoopVectorizationPlanner::getBestPlan() const {
+ElementCount LoopVectorizationPlanner::getBestVF() const {
   // If there is a single VPlan with a single VF, return it directly.
   VPlan &FirstPlan = *VPlans[0];
   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
-    return FirstPlan;
+    return *FirstPlan.vectorFactors().begin();
 
-  VPlan *BestPlan = &FirstPlan;
   ElementCount ScalarVF = ElementCount::getFixed(1);
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
@@ -7492,14 +7227,11 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
 
       InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
-      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+      if (isMoreProfitable(CurrentFactor, BestFactor))
         BestFactor = CurrentFactor;
-        BestPlan = &*P;
-      }
     }
   }
-  BestPlan->setVF(BestFactor.Width);
-  return *BestPlan;
+  return BestFactor.Width;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -8581,6 +8313,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
     BlockInMask = getBlockInMask(I->getParent());
   }
 
+  // Note that there is some custom logic to mark some intrinsics as uniform
+  // manually above for scalable vectors, which this assert needs to account for
+  // as well.
+  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
+          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
+         "Should not predicate a uniform recipe");
   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
                                        IsUniform, BlockInMask);
   return Recipe;
@@ -8712,8 +8450,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
 // original exit block.
-static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
-                                VPRecipeBuilder &Builder, VPlan &Plan) {
+static void addUsersInExitBlock(
+    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
+    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
   // Only handle single-exit loops with unique exit blocks for now.
@@ -8731,7 +8470,12 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
     // live-outs.
     if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
          !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-        isa<VPWidenPointerInductionRecipe>(V))
+        isa<VPWidenPointerInductionRecipe>(V) ||
+        (isa<Instruction>(IncomingValue) &&
+         any_of(IncomingValue->users(), [&Inductions](User *U) {
+           auto *P = dyn_cast<PHINode>(U);
+           return P && Inductions.contains(P);
+         })))
       continue;
     Plan.addLiveOut(&ExitPhi, V);
   }
@@ -8946,7 +8690,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
   } else
-    addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
+    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
+                        Legal->getInductionVars());
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8979,6 +8724,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
     bool NeedsMaskForGaps =
         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
+    assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
+           "masked interleaved groups are not allowed.");
     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                         Recipe->getMask(), NeedsMaskForGaps);
     VPIG->insertBefore(Recipe);
@@ -9390,37 +9137,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   VPlanTransforms::clearReductionWrapFlags(*Plan);
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
-                               VPSlotTracker &SlotTracker) const {
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  VPValue *Mask = getMask();
-  if (Mask) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  store ";
-      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
          "Not a pointer induction according to InductionDescriptor!");
@@ -9504,16 +9220,11 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
   State.set(this, DerivedIV, VPIteration(0, 0));
 }
 
-void VPInterleaveRecipe::execute(VPTransformState &State) {
-  assert(!State.Instance && "Interleave group being replicated.");
-  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
-                                      getStoredValues(), getMask(),
-                                      NeedsMaskForGaps);
-}
-
 void VPReplicateRecipe::execute(VPTransformState &State) {
   Instruction *UI = getUnderlyingInstr();
   if (State.Instance) { // Generate a single instance.
+    assert((State.VF.isScalar() || !isUniform()) &&
+           "uniform recipe shouldn't be predicated");
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
     // Insert scalar instance packing it into a vector.
@@ -10185,6 +9896,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Plan how to best vectorize, return the best VF and its cost.
   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
 
+  if (ORE->allowExtraAnalysis(LV_NAME))
+    LVP.emitInvalidCostRemarks(ORE);
+
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
@@ -10315,10 +10029,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                  &CM, BFI, PSI, Checks);
 
-      VPlan &BestPlan = LVP.getBestPlan();
-      assert(BestPlan.hasScalarVFOnly() &&
+      ElementCount BestVF = LVP.getBestVF();
+      assert(BestVF.isScalar() &&
              "VPlan cost model and legacy cost model disagreed");
-      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+      LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
 
       ORE->emit([&]() {
         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10329,20 +10044,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
 
+      ElementCount BestVF = LVP.getBestVF();
+      LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
+      assert(VF.Width == BestVF &&
+             "VPlan cost model and legacy cost model disagreed");
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+          LVP.selectEpilogueVectorizationFactor(BestVF, IC);
       if (EpilogueVF.Width.isVector()) {
 
         // The first pass vectorizes the main loop and creates a scalar epilogue
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
-        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+        EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
-        std::unique_ptr<VPlan> BestMainPlan(
-            LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
+        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
@@ -10433,18 +10152,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!MainILV.areSafetyChecksAdded())
           DisableRuntimeUnroll = true;
       } else {
-        VPlan &BestPlan = LVP.getBestPlan();
-        assert(size(BestPlan.vectorFactors()) == 1 &&
-               "Plan should have a single VF");
-        ElementCount Width = *BestPlan.vectorFactors().begin();
-        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
-                          << "\n");
-        assert(VF.Width == Width &&
-               "VPlan cost model and legacy cost model disagreed");
-        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
+        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
-        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
+        LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b994645cece61..8d2ce6bad6af7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -209,7 +209,7 @@ static const unsigned AliasedCheckLimit = 10;
 
 // Limit of the number of uses for potentially transformed instructions/values,
 // used in checks to avoid compile-time explode.
-static constexpr int UsesLimit = 8;
+static constexpr int UsesLimit = 64;
 
 // Another limit for the alias checks: The maximum distance between load/store
 // instructions where alias checks are done.
@@ -4842,11 +4842,46 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
   if (!AnyConsecutive)
     return false;
 
+  // If we have a better order, also sort the base pointers by increasing
+  // (variable) values if possible, to try and keep the order more regular. In
+  // order to create a valid strict-weak order we cluster by the Root of gep
+  // chains and sort within each.
+  SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
   for (auto &Base : Bases) {
-    for (auto &T : Base.second)
-      SortedIndices.push_back(std::get<2>(T));
+    Value *Strip = Base.first->stripInBoundsConstantOffsets();
+    Value *Root = Strip;
+    while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
+      Root = Gep->getOperand(0);
+    SortedBases.emplace_back(Base.first, Strip, Root);
+  }
+  auto *Begin = SortedBases.begin();
+  auto *End = SortedBases.end();
+  while (Begin != End) {
+    Value *Root = std::get<2>(*Begin);
+    auto *Mid = std::stable_partition(
+        Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
+    DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
+    for (auto I = Begin; I < Mid; ++I)
+      LessThan.try_emplace(std::get<1>(*I));
+    for (auto I = Begin; I < Mid; ++I) {
+      Value *V = std::get<1>(*I);
+      while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
+        V = Gep->getOperand(0);
+        if (LessThan.contains(V))
+          LessThan[V][std::get<1>(*I)] = true;
+      }
+    }
+    std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
+      return LessThan[std::get<1>(V1)][std::get<1>(V2)];
+    });
+    Begin = Mid;
   }
 
+  // Collect the final order of sorted indices
+  for (auto Base : SortedBases)
+    for (auto &T : Bases[std::get<0>(Base)])
+      SortedIndices.push_back(std::get<2>(T));
+
   assert(SortedIndices.size() == VL.size() &&
          "Expected SortedIndices to be the size of VL");
   return true;
@@ -8363,6 +8398,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                   : TTI.getStridedMemoryOpCost(
                         Instruction::Load, LoadTy, LI->getPointerOperand(),
                         /*VariableMask=*/false, Alignment, CostKind, LI);
+          // Add external uses costs.
+          for (auto [Idx, V] : enumerate(VL.slice(
+                   P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+            if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+              GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                                   LoadTy, CostKind, Idx);
           // Estimate GEP cost.
           SmallVector<Value *> PointerOps(VF);
           for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
@@ -9699,7 +9740,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
               CanonicalType->getContext(),
               DL->getTypeSizeInBits(CanonicalType->getScalarType())));
-        IntrinsicCostAttributes CostAttrs(MinMaxID, VecTy, {VecTy, VecTy});
+        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
+                                          {CanonicalType, CanonicalType});
         InstructionCost IntrinsicCost =
             TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
         // If the selects are the only uses of the compares, they will be
@@ -9712,6 +9754,23 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         }
         VecCost = std::min(VecCost, IntrinsicCost);
       }
+      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
+        auto *CondType =
+            getWidenedType(SI->getCondition()->getType(), VL.size());
+        unsigned CondNumElements = CondType->getNumElements();
+        unsigned VecTyNumElements = getNumElements(VecTy);
+        assert(VecTyNumElements >= CondNumElements &&
+               VecTyNumElements % CondNumElements == 0 &&
+               "Cannot vectorize Instruction::Select");
+        if (CondNumElements != VecTyNumElements) {
+          // When the return type is i1 but the source is fixed vector type, we
+          // need to duplicate the condition value.
+          VecCost += TTI->getShuffleCost(
+              TTI::SK_PermuteSingleSrc, CondType,
+              createReplicatedMask(VecTyNumElements / CondNumElements,
+                                   CondNumElements));
+        }
+      }
       return VecCost + CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -12042,6 +12101,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   /// Adds 2 input vectors and the mask for their shuffling.
   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
     assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+    assert(isa<FixedVectorType>(V1->getType()) &&
+           isa<FixedVectorType>(V2->getType()) &&
+           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
     V1 = castToScalarTyElem(V1);
     V2 = castToScalarTyElem(V2);
     if (InVectors.empty()) {
@@ -12071,13 +12133,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   }
   /// Adds another one input vector and the mask for the shuffling.
   void add(Value *V1, ArrayRef<int> Mask, bool = false) {
+    assert(isa<FixedVectorType>(V1->getType()) &&
+           "castToScalarTyElem expects V1 to be FixedVectorType");
     V1 = castToScalarTyElem(V1);
     if (InVectors.empty()) {
-      if (!isa<FixedVectorType>(V1->getType())) {
-        V1 = createShuffle(V1, nullptr, CommonMask);
-        CommonMask.assign(Mask.size(), PoisonMaskElem);
-        transformMaskAfterShuffle(CommonMask, Mask);
-      }
       InVectors.push_back(V1);
       CommonMask.assign(Mask.begin(), Mask.end());
       return;
@@ -12085,8 +12144,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     const auto *It = find(InVectors, V1);
     if (It == InVectors.end()) {
       if (InVectors.size() == 2 ||
-          InVectors.front()->getType() != V1->getType() ||
-          !isa<FixedVectorType>(V1->getType())) {
+          InVectors.front()->getType() != V1->getType()) {
         Value *V = InVectors.front();
         if (InVectors.size() == 2) {
           V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
@@ -12120,9 +12178,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
           break;
         }
     }
-    int VF = CommonMask.size();
-    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
-      VF = FTy->getNumElements();
+    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
       if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
         CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
@@ -13198,6 +13254,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
       }
 
+      unsigned CondNumElements = getNumElements(Cond->getType());
+      unsigned TrueNumElements = getNumElements(True->getType());
+      assert(TrueNumElements >= CondNumElements &&
+             TrueNumElements % CondNumElements == 0 &&
+             "Cannot vectorize Instruction::Select");
+      assert(TrueNumElements == getNumElements(False->getType()) &&
+             "Cannot vectorize Instruction::Select");
+      if (CondNumElements != TrueNumElements) {
+        // When the return type is i1 but the source is fixed vector type, we
+        // need to duplicate the condition value.
+        Cond = Builder.CreateShuffleVector(
+            Cond, createReplicatedMask(TrueNumElements / CondNumElements,
+                                       CondNumElements));
+      }
+      assert(getNumElements(Cond->getType()) == TrueNumElements &&
+             "Cannot vectorize Instruction::Select");
       Value *V = Builder.CreateSelect(Cond, True, False);
       V = FinalShuffle(V, E, VecTy);
 
@@ -13499,7 +13571,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         }
         ScalarArg = CEI->getArgOperand(I);
         if (cast<VectorType>(OpVec->getType())->getElementType() !=
-                ScalarArg->getType() &&
+                ScalarArg->getType()->getScalarType() &&
             It == MinBWs.end()) {
           auto *CastTy =
               getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
@@ -13888,11 +13960,18 @@ Value *BoUpSLP::vectorizeTree(
         }
         if (!Ex) {
           // "Reuse" the existing extract to improve final codegen.
-          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+              ES && isa<Instruction>(Vec)) {
             Value *V = ES->getVectorOperand();
+            auto *IVec = cast<Instruction>(Vec);
             if (const TreeEntry *ETE = getTreeEntry(V))
               V = ETE->VectorizedValue;
-            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+            if (auto *IV = dyn_cast<Instruction>(V);
+                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
+                IV->comesBefore(IVec))
+              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+            else
+              Ex = Builder.CreateExtractElement(Vec, Lane);
           } else if (ReplaceGEP) {
             // Leave the GEPs as is, they are free in most cases and better to
             // keep them as GEPs.
@@ -15529,7 +15608,23 @@ void BoUpSLP::computeMinimumValueSizes() {
     // Check if the root is trunc and the next node is gather/buildvector, then
     // keep trunc in scalars, which is free in most cases.
     if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
-        E.Idx > (IsStoreOrInsertElt ? 2 : 1)) {
+        E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
+        all_of(E.Scalars, [&](Value *V) {
+          return V->hasOneUse() || isa<Constant>(V) ||
+                 (!V->hasNUsesOrMore(UsesLimit) &&
+                  none_of(V->users(), [&](User *U) {
+                    const TreeEntry *TE = getTreeEntry(U);
+                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
+                    if (TE == UserTE || !TE)
+                      return false;
+                    unsigned UserTESz = DL->getTypeSizeInBits(
+                        UserTE->Scalars.front()->getType());
+                    auto It = MinBWs.find(TE);
+                    if (It != MinBWs.end() && It->second.first > UserTESz)
+                      return true;
+                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
+                  }));
+        })) {
       ToDemote.push_back(E.Idx);
       const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
       auto It = MinBWs.find(UserTE);
@@ -16624,8 +16719,6 @@ class HorizontalReduction {
   SmallVector<SmallVector<Value *>> ReducedVals;
   /// Maps reduced value to the corresponding reduction operation.
   DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
-  // Use map vector to make stable output.
-  MapVector<Instruction *, Value *> ExtraArgs;
   WeakTrackingVH ReductionRoot;
   /// The type of reduction operation.
   RecurKind RdxKind;
@@ -16958,30 +17051,26 @@ class HorizontalReduction {
     // gather all the reduced values, sorting them by their value id.
     BasicBlock *BB = Root->getParent();
     bool IsCmpSelMinMax = isCmpSelMinMax(Root);
-    SmallVector<Instruction *> Worklist(1, Root);
+    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
+        1, std::make_pair(Root, 0));
     // Checks if the operands of the \p TreeN instruction are also reduction
     // operations or should be treated as reduced values or an extra argument,
     // which is not part of the reduction.
     auto CheckOperands = [&](Instruction *TreeN,
-                             SmallVectorImpl<Value *> &ExtraArgs,
                              SmallVectorImpl<Value *> &PossibleReducedVals,
-                             SmallVectorImpl<Instruction *> &ReductionOps) {
+                             SmallVectorImpl<Instruction *> &ReductionOps,
+                             unsigned Level) {
       for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                     getNumberOfOperands(TreeN)))) {
         Value *EdgeVal = getRdxOperand(TreeN, I);
         ReducedValsToOps[EdgeVal].push_back(TreeN);
         auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
-        // Edge has wrong parent - mark as an extra argument.
-        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
-            !hasSameParent(EdgeInst, BB)) {
-          ExtraArgs.push_back(EdgeVal);
-          continue;
-        }
         // If the edge is not an instruction, or it is different from the main
         // reduction opcode or has too many uses - possible reduced value.
         // Also, do not try to reduce const values, if the operation is not
         // foldable.
-        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+        if (!EdgeInst || Level > RecursionMaxDepth ||
+            getRdxKind(EdgeInst) != RdxKind ||
             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
             !isVectorizable(RdxKind, EdgeInst) ||
@@ -17005,6 +17094,7 @@ class HorizontalReduction {
     SmallSet<size_t, 2> LoadKeyUsed;
 
     auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+      Key = hash_combine(hash_value(LI->getParent()), Key);
       Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
       if (LoadKeyUsed.contains(Key)) {
         auto LIt = LoadsMap.find(Ptr);
@@ -17035,40 +17125,23 @@ class HorizontalReduction {
     };
 
     while (!Worklist.empty()) {
-      Instruction *TreeN = Worklist.pop_back_val();
-      SmallVector<Value *> Args;
+      auto [TreeN, Level] = Worklist.pop_back_val();
       SmallVector<Value *> PossibleRedVals;
       SmallVector<Instruction *> PossibleReductionOps;
-      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
-      // If too many extra args - mark the instruction itself as a reduction
-      // value, not a reduction operation.
-      if (Args.size() < 2) {
-        addReductionOps(TreeN);
-        // Add extra args.
-        if (!Args.empty()) {
-          assert(Args.size() == 1 && "Expected only single argument.");
-          ExtraArgs[TreeN] = Args.front();
-        }
-        // Add reduction values. The values are sorted for better vectorization
-        // results.
-        for (Value *V : PossibleRedVals) {
-          size_t Key, Idx;
-          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
-                                                 /*AllowAlternate=*/false);
-          ++PossibleReducedVals[Key][Idx]
-                .insert(std::make_pair(V, 0))
-                .first->second;
-        }
-        Worklist.append(PossibleReductionOps.rbegin(),
-                        PossibleReductionOps.rend());
-      } else {
+      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
+      addReductionOps(TreeN);
+      // Add reduction values. The values are sorted for better vectorization
+      // results.
+      for (Value *V : PossibleRedVals) {
         size_t Key, Idx;
-        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
+        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                /*AllowAlternate=*/false);
         ++PossibleReducedVals[Key][Idx]
-              .insert(std::make_pair(TreeN, 0))
+              .insert(std::make_pair(V, 0))
               .first->second;
       }
+      for (Instruction *I : reverse(PossibleReductionOps))
+        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
     }
     auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
     // Sort values by the total number of values kinds to start the reduction
@@ -17145,18 +17218,9 @@ class HorizontalReduction {
 
     // Track the reduced values in case if they are replaced by extractelement
     // because of the vectorization.
-    DenseMap<Value *, WeakTrackingVH> TrackedVals(
-        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
-    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
+                                                  ReducedVals.front().size());
     SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
-    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
-    // The same extra argument may be used several times, so log each attempt
-    // to use it.
-    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
-      assert(Pair.first && "DebugLoc must be set.");
-      ExternallyUsedValues[Pair.second].push_back(Pair.first);
-      TrackedVals.try_emplace(Pair.second, Pair.second);
-    }
 
     // The compare instruction of a min/max is the insertion point for new
     // instructions and may be replaced with a new compare instruction.
@@ -17191,13 +17255,9 @@ class HorizontalReduction {
       // Initialize the final value in the reduction.
       return Res;
     };
-    bool AnyBoolLogicOp =
-        any_of(ReductionOps.back(), [](Value *V) {
-          return isBoolLogicOp(cast<Instruction>(V));
-        });
-    // The reduction root is used as the insertion point for new instructions,
-    // so set it as externally used to prevent it from being deleted.
-    ExternallyUsedValues[ReductionRoot];
+    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
+      return isBoolLogicOp(cast<Instruction>(V));
+    });
     SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                       ReductionOps.front().size());
     for (ReductionOpsType &RdxOps : ReductionOps)
@@ -17419,8 +17479,11 @@ class HorizontalReduction {
         V.reorderBottomToTop(/*IgnoreReorder=*/true);
         // Keep extracted other reduction values, if they are used in the
         // vectorization trees.
-        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
-            ExternallyUsedValues);
+        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
+        // The reduction root is used as the insertion point for new
+        // instructions, so set it as externally used to prevent it from being
+        // deleted.
+        LocalExternallyUsedValues[ReductionRoot];
         for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
           if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
             continue;
@@ -17467,23 +17530,6 @@ class HorizontalReduction {
         for (Value *RdxVal : VL)
           if (RequiredExtract.contains(RdxVal))
             LocalExternallyUsedValues[RdxVal];
-        // Update LocalExternallyUsedValues for the scalar, replaced by
-        // extractelement instructions.
-        DenseMap<Value *, Value *> ReplacementToExternal;
-        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
-          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
-        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
-          Value *Ext = Pair.first;
-          auto RIt = ReplacementToExternal.find(Ext);
-          while (RIt != ReplacementToExternal.end()) {
-            Ext = RIt->second;
-            RIt = ReplacementToExternal.find(Ext);
-          }
-          auto *It = ExternallyUsedValues.find(Ext);
-          if (It == ExternallyUsedValues.end())
-            continue;
-          LocalExternallyUsedValues[Pair.second].append(It->second);
-        }
         V.buildExternalUses(LocalExternallyUsedValues);
 
         V.computeMinimumValueSizes();
@@ -17685,11 +17731,6 @@ class HorizontalReduction {
             ExtraReductions.emplace_back(RedOp, RdxVal);
         }
       }
-      for (auto &Pair : ExternallyUsedValues) {
-        // Add each externally used value to the final reduction.
-        for (auto *I : Pair.second)
-          ExtraReductions.emplace_back(I, Pair.first);
-      }
       // Iterate through all not-vectorized reduction values/extra arguments.
       bool InitStep = true;
       while (ExtraReductions.size() > 1) {
@@ -17841,6 +17882,8 @@ class HorizontalReduction {
     assert(IsSupportedHorRdxIdentityOp &&
            "The optimization of matched scalar identity horizontal reductions "
            "must be supported.");
+    if (Cnt == 1)
+      return VectorizedValue;
     switch (RdxKind) {
     case RecurKind::Add: {
       // res = mul vv, n
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 83a035fb4df86..2a9e4e12190cd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -254,6 +254,7 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
 Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
   if (NeedsScalar) {
     assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
+            !vputils::onlyFirstLaneUsed(Def) ||
             (hasScalarValue(Def, VPIteration(Part, 0)) &&
              Data.PerPartScalars[Def][Part].size() == 1)) &&
            "Trying to access a single scalar per part but has multiple scalars "
@@ -1623,7 +1624,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   return Expanded;
 }
 
-bool vputils::isHeaderMask(VPValue *V, VPlan &Plan) {
+bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   if (isa<VPActiveLaneMaskPHIRecipe>(V))
     return true;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 805d9d91fc186..e65184aba6c3f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1399,9 +1399,10 @@ class VPInstruction : public VPRecipeWithIRFlags {
   bool isSingleScalar() const;
 };
 
-/// VPWidenRecipe is a recipe for producing a copy of vector type its
-/// ingredient. This recipe covers most of the traditional vectorization cases
-/// where each ingredient transforms into a vectorized version of itself.
+/// VPWidenRecipe is a recipe for producing a widened instruction using the
+/// opcode and operands of the recipe. This recipe covers most of the
+/// traditional vectorization cases where each recipe transforms into a
+/// vectorized version of itself.
 class VPWidenRecipe : public VPRecipeWithIRFlags {
   unsigned Opcode;
 
@@ -1421,7 +1422,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   VP_CLASSOF_IMPL(VPDef::VPWidenSC)
 
-  /// Produce widened copies of all Ingredients.
+  /// Produce a widened instruction using the opcode and operands of the recipe,
+  /// processing State.VF elements.
   void execute(VPTransformState &State) override;
 
   unsigned getOpcode() const { return Opcode; }
@@ -2244,12 +2246,12 @@ class VPReductionRecipe : public VPSingleDefRecipe {
 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
 class VPReductionEVLRecipe : public VPReductionRecipe {
 public:
-  VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+  VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
       : VPReductionRecipe(
-            VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
-            cast_or_null<Instruction>(R->getUnderlyingValue()),
-            ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
-            R->isOrdered()) {}
+            VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+            cast_or_null<Instruction>(R.getUnderlyingValue()),
+            ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
+            R.isOrdered()) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -2556,10 +2558,10 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
-  VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
-                            {L->getAddr(), EVL}, L->isConsecutive(),
-                            L->isReverse(), L->getDebugLoc()),
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
+                            {L.getAddr(), &EVL}, L.isConsecutive(),
+                            L.isReverse(), L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2632,11 +2634,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 /// using the value to store, the address to store to, the explicit vector
 /// length and an optional mask.
 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
-  VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
-                            {S->getAddr(), S->getStoredValue(), EVL},
-                            S->isConsecutive(), S->isReverse(),
-                            S->getDebugLoc()) {
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+                            {S.getAddr(), S.getStoredValue(), &EVL},
+                            S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -3469,11 +3470,6 @@ class VPlan {
 
   void addLiveOut(PHINode *PN, VPValue *V);
 
-  void removeLiveOut(PHINode *PN) {
-    delete LiveOuts[PN];
-    LiveOuts.erase(PN);
-  }
-
   const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
     return LiveOuts;
   }
@@ -3803,12 +3799,12 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
                                        ScalarEvolution &SE);
 
 /// Returns true if \p VPV is uniform after vectorization.
-inline bool isUniformAfterVectorization(VPValue *VPV) {
+inline bool isUniformAfterVectorization(const VPValue *VPV) {
   // A value defined outside the vector region must be uniform after
   // vectorization inside a vector region.
   if (VPV->isDefinedOutsideVectorRegions())
     return true;
-  VPRecipeBase *Def = VPV->getDefiningRecipe();
+  const VPRecipeBase *Def = VPV->getDefiningRecipe();
   assert(Def && "Must have definition for value defined inside vector region");
   if (auto Rep = dyn_cast<VPReplicateRecipe>(Def))
     return Rep->isUniform();
@@ -3820,7 +3816,7 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
 }
 
 /// Return true if \p V is a header mask in \p Plan.
-bool isHeaderMask(VPValue *V, VPlan &Plan);
+bool isHeaderMask(const VPValue *V, VPlan &Plan);
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4b1ac79bbfdd4..2d6d67a55c17d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -283,12 +283,12 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) {
 }
 
 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
-  if (auto *UI = getInstructionForCost(this))
-    if (Ctx.skipCostComputation(UI, VF.isVector()))
-      return 0;
+  auto *UI = getInstructionForCost(this);
+  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
+    return 0;
 
   InstructionCost RecipeCost = computeCost(VF, Ctx);
-  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
       RecipeCost.isValid())
     RecipeCost = InstructionCost(ForceTargetInstructionCost);
 
@@ -2037,6 +2037,373 @@ void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
+                                     VectorType *DstVTy, const DataLayout &DL) {
+  // Verify that V is a vector type with same number of elements as DstVTy.
+  auto VF = DstVTy->getElementCount();
+  auto *SrcVecTy = cast<VectorType>(V->getType());
+  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
+  Type *SrcElemTy = SrcVecTy->getElementType();
+  Type *DstElemTy = DstVTy->getElementType();
+  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+         "Vector elements must have same size");
+
+  // Do a direct cast if element types are castable.
+  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+    return Builder.CreateBitOrPointerCast(V, DstVTy);
+  }
+  // V cannot be directly casted to desired vector type.
+  // May happen when V is a floating point vector but DstVTy is a vector of
+  // pointers or vice-versa. Handle this using a two-step bitcast using an
+  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+         "Only one type should be a pointer type");
+  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+         "Only one type should be a floating point type");
+  Type *IntTy =
+      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+  auto *VecIntTy = VectorType::get(IntTy, VF);
+  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
+
+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+                                const Twine &Name) {
+  unsigned Factor = Vals.size();
+  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+  for (Value *Val : Vals)
+    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+  // must use intrinsics to interleave.
+  if (VecTy->isScalableTy()) {
+    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+                                   Vals,
+                                   /*FMFSource=*/nullptr, Name);
+  }
+
+  // Fixed length. Start by concatenating all vectors into a wide vector.
+  Value *WideVec = concatenateVectors(Builder, Vals);
+
+  // Interleave the elements into the wide vector.
+  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+  return Builder.CreateShuffleVector(
+      WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+//   for (i = 0; i < N; i+=3) {
+//     R = Pic[i];             // Member of index 0
+//     G = Pic[i+1];           // Member of index 1
+//     B = Pic[i+2];           // Member of index 2
+//     ... // do something to R, G, B
+//   }
+// To:
+//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
+//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
+//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
+//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
+//
+// Or translate following interleaved store group (factor = 3):
+//   for (i = 0; i < N; i+=3) {
+//     ... do something to R, G, B
+//     Pic[i]   = R;           // Member of index 0
+//     Pic[i+1] = G;           // Member of index 1
+//     Pic[i+2] = B;           // Member of index 2
+//   }
+// To:
+//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
+//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
+//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "Interleave group being replicated.");
+  const InterleaveGroup<Instruction> *Group = IG;
+  Instruction *Instr = Group->getInsertPos();
+
+  // Prepare for the vector type of the interleaved load/store.
+  Type *ScalarTy = getLoadStoreType(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
+
+  // Prepare for the new pointers.
+  SmallVector<Value *, 2> AddrParts;
+  unsigned Index = Group->getIndex(Instr);
+
+  // TODO: extend the masked interleaved-group support to reversed access.
+  VPValue *BlockInMask = getMask();
+  assert((!BlockInMask || !Group->isReverse()) &&
+         "Reversed masked interleave-group not supported.");
+
+  Value *Idx;
+  // If the group is reverse, adjust the index to refer to the last vector lane
+  // instead of the first. We adjust the index from the first vector lane,
+  // rather than directly getting the pointer for lane VF - 1, because the
+  // pointer operand of the interleaved access is supposed to be uniform. For
+  // uniform instructions, we're only required to generate a value for the
+  // first vector lane in each unroll iteration.
+  if (Group->isReverse()) {
+    Value *RuntimeVF =
+        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+    Idx = State.Builder.CreateMul(Idx,
+                                  State.Builder.getInt32(Group->getFactor()));
+    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
+    Idx = State.Builder.CreateNeg(Idx);
+  } else
+    Idx = State.Builder.getInt32(-Index);
+
+  VPValue *Addr = getAddr();
+  for (unsigned Part = 0; Part < State.UF; Part++) {
+    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
+    if (auto *I = dyn_cast<Instruction>(AddrPart))
+      State.setDebugLocFrom(I->getDebugLoc());
+
+    // Notice current instruction could be any index. Need to adjust the address
+    // to the member of index 0.
+    //
+    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
+    //       b = A[i];       // Member of index 0
+    // Current pointer is pointed to A[i+1], adjust it to A[i].
+    //
+    // E.g.  A[i+1] = a;     // Member of index 1
+    //       A[i]   = b;     // Member of index 0
+    //       A[i+2] = c;     // Member of index 2 (Current instruction)
+    // Current pointer is pointed to A[i+2], adjust it to A[i].
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
+    AddrParts.push_back(AddrPart);
+  }
+
+  State.setDebugLocFrom(Instr->getDebugLoc());
+  Value *PoisonVec = PoisonValue::get(VecTy);
+
+  auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
+                             unsigned Part, Value *MaskForGaps) -> Value * {
+    if (State.VF.isScalable()) {
+      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
+      assert(InterleaveFactor == 2 &&
+             "Unsupported deinterleave factor for scalable vectors");
+      auto *BlockInMaskPart = State.get(BlockInMask, Part);
+      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
+      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
+                                     State.VF.getKnownMinValue() * 2, true);
+      return State.Builder.CreateIntrinsic(
+          MaskTy, Intrinsic::vector_interleave2, Ops,
+          /*FMFSource=*/nullptr, "interleaved.mask");
+    }
+
+    if (!BlockInMask)
+      return MaskForGaps;
+
+    Value *BlockInMaskPart = State.get(BlockInMask, Part);
+    Value *ShuffledMask = State.Builder.CreateShuffleVector(
+        BlockInMaskPart,
+        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        "interleaved.mask");
+    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
+                                                   ShuffledMask, MaskForGaps)
+                       : ShuffledMask;
+  };
+
+  const DataLayout &DL = Instr->getDataLayout();
+  // Vectorize the interleaved load group.
+  if (isa<LoadInst>(Instr)) {
+    Value *MaskForGaps = nullptr;
+    if (NeedsMaskForGaps) {
+      MaskForGaps = createBitMaskForGaps(State.Builder,
+                                         State.VF.getKnownMinValue(), *Group);
+      assert(MaskForGaps && "Mask for Gaps is required but it is null");
+    }
+
+    // For each unroll part, create a wide load for the group.
+    SmallVector<Value *, 2> NewLoads;
+    for (unsigned Part = 0; Part < State.UF; Part++) {
+      Instruction *NewLoad;
+      if (BlockInMask || MaskForGaps) {
+        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
+        NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
+                                                 Group->getAlign(), GroupMask,
+                                                 PoisonVec, "wide.masked.vec");
+      } else
+        NewLoad = State.Builder.CreateAlignedLoad(
+            VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
+      Group->addMetadata(NewLoad);
+      NewLoads.push_back(NewLoad);
+    }
+
+    ArrayRef<VPValue *> VPDefs = definedValues();
+    const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
+    if (VecTy->isScalableTy()) {
+      assert(InterleaveFactor == 2 &&
+             "Unsupported deinterleave factor for scalable vectors");
+
+      for (unsigned Part = 0; Part < State.UF; ++Part) {
+        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+        // so must use intrinsics to deinterleave.
+        Value *DI = State.Builder.CreateIntrinsic(
+            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
+            /*FMFSource=*/nullptr, "strided.vec");
+        unsigned J = 0;
+        for (unsigned I = 0; I < InterleaveFactor; ++I) {
+          Instruction *Member = Group->getMember(I);
+
+          if (!Member)
+            continue;
+
+          Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+          // If this member has different type, cast the result type.
+          if (Member->getType() != ScalarTy) {
+            VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+            StridedVec =
+                createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+          }
+
+          if (Group->isReverse())
+            StridedVec =
+                State.Builder.CreateVectorReverse(StridedVec, "reverse");
+
+          State.set(VPDefs[J], StridedVec, Part);
+          ++J;
+        }
+      }
+
+      return;
+    }
+
+    // For each member in the group, shuffle out the appropriate data from the
+    // wide loads.
+    unsigned J = 0;
+    for (unsigned I = 0; I < InterleaveFactor; ++I) {
+      Instruction *Member = Group->getMember(I);
+
+      // Skip the gaps in the group.
+      if (!Member)
+        continue;
+
+      auto StrideMask =
+          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+      for (unsigned Part = 0; Part < State.UF; Part++) {
+        Value *StridedVec = State.Builder.CreateShuffleVector(
+            NewLoads[Part], StrideMask, "strided.vec");
+
+        // If this member has different type, cast the result type.
+        if (Member->getType() != ScalarTy) {
+          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+          StridedVec =
+              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+        }
+
+        if (Group->isReverse())
+          StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
+
+        State.set(VPDefs[J], StridedVec, Part);
+      }
+      ++J;
+    }
+    return;
+  }
+
+  // The sub vector type for current instruction.
+  auto *SubVT = VectorType::get(ScalarTy, State.VF);
+
+  // Vectorize the interleaved store group.
+  Value *MaskForGaps =
+      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
+  assert((!MaskForGaps || !State.VF.isScalable()) &&
+         "masking gaps for scalable vectors is not yet supported.");
+  ArrayRef<VPValue *> StoredValues = getStoredValues();
+  for (unsigned Part = 0; Part < State.UF; Part++) {
+    // Collect the stored vector from each member.
+    SmallVector<Value *, 4> StoredVecs;
+    unsigned StoredIdx = 0;
+    for (unsigned i = 0; i < InterleaveFactor; i++) {
+      assert((Group->getMember(i) || MaskForGaps) &&
+             "Fail to get a member from an interleaved store group");
+      Instruction *Member = Group->getMember(i);
+
+      // Skip the gaps in the group.
+      if (!Member) {
+        Value *Undef = PoisonValue::get(SubVT);
+        StoredVecs.push_back(Undef);
+        continue;
+      }
+
+      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
+      ++StoredIdx;
+
+      if (Group->isReverse())
+        StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
+
+      // If this member has different type, cast it to a unified type.
+
+      if (StoredVec->getType() != SubVT)
+        StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+      StoredVecs.push_back(StoredVec);
+    }
+
+    // Interleave all the smaller vectors into one wider vector.
+    Value *IVec =
+        interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+    Instruction *NewStoreInstr;
+    if (BlockInMask || MaskForGaps) {
+      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
+      NewStoreInstr = State.Builder.CreateMaskedStore(
+          IVec, AddrParts[Part], Group->getAlign(), GroupMask);
+    } else
+      NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
+                                                       Group->getAlign());
+
+    Group->addMetadata(NewStoreInstr);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+                               VPSlotTracker &SlotTracker) const {
+  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+  IG->getInsertPos()->printAsOperand(O, false);
+  O << ", ";
+  getAddr()->printAsOperand(O, SlotTracker);
+  VPValue *Mask = getMask();
+  if (Mask) {
+    O << ", ";
+    Mask->printAsOperand(O, SlotTracker);
+  }
+
+  unsigned OpIdx = 0;
+  for (unsigned i = 0; i < IG->getFactor(); ++i) {
+    if (!IG->getMember(i))
+      continue;
+    if (getNumStoreOperands() > 0) {
+      O << "\n" << Indent << "  store ";
+      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
+      O << " to index " << i;
+    } else {
+      O << "\n" << Indent << "  ";
+      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+      O << " = load from index " << i;
+    }
+    ++OpIdx;
+  }
+}
+#endif
+
 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
   Value *Start = getStartValue()->getLiveInIRValue();
   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d668ae2aa5c08..045f6c356669f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -278,6 +278,11 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
         return UI && UI->getParent() == Then2;
       });
 
+      // Remove phi recipes that are unused after merging the regions.
+      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
+        Phi1ToMove.eraseFromParent();
+        continue;
+      }
       Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
     }
 
@@ -1483,13 +1488,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
       if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
         VPValue *NewMask = GetNewMask(MemR->getMask());
         if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
-          NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+          NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask);
         else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
-          NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+          NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
         else
           llvm_unreachable("unsupported recipe");
       } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
-        NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
+        NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
                                              GetNewMask(RedR->getCondOp()));
       }
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 537e2851176ea..7a680414da358 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -118,7 +118,7 @@ class VectorCombine {
   bool foldShuffleOfShuffles(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
-  bool foldTruncFromReductions(Instruction &I);
+  bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
 
   void replaceValue(Value &Old, Value &New) {
@@ -2119,15 +2119,20 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 
 /// Determine if its more efficient to fold:
 ///   reduce(trunc(x)) -> trunc(reduce(x)).
-bool VectorCombine::foldTruncFromReductions(Instruction &I) {
+///   reduce(sext(x))  -> sext(reduce(x)).
+///   reduce(zext(x))  -> zext(reduce(x)).
+bool VectorCombine::foldCastFromReductions(Instruction &I) {
   auto *II = dyn_cast<IntrinsicInst>(&I);
   if (!II)
     return false;
 
+  bool TruncOnly = false;
   Intrinsic::ID IID = II->getIntrinsicID();
   switch (IID) {
   case Intrinsic::vector_reduce_add:
   case Intrinsic::vector_reduce_mul:
+    TruncOnly = true;
+    break;
   case Intrinsic::vector_reduce_and:
   case Intrinsic::vector_reduce_or:
   case Intrinsic::vector_reduce_xor:
@@ -2139,35 +2144,37 @@ bool VectorCombine::foldTruncFromReductions(Instruction &I) {
   unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
   Value *ReductionSrc = I.getOperand(0);
 
-  Value *TruncSrc;
-  if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(TruncSrc)))))
+  Value *Src;
+  if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
+      (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
     return false;
 
-  auto *TruncSrcTy = cast<VectorType>(TruncSrc->getType());
+  auto CastOpc =
+      (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
+
+  auto *SrcTy = cast<VectorType>(Src->getType());
   auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
   Type *ResultTy = I.getType();
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost = TTI.getArithmeticReductionCost(
       ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
-  if (auto *Trunc = dyn_cast<CastInst>(ReductionSrc))
-    OldCost +=
-        TTI.getCastInstrCost(Instruction::Trunc, ReductionSrcTy, TruncSrcTy,
-                             TTI::CastContextHint::None, CostKind, Trunc);
+  OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
+                                  TTI::CastContextHint::None, CostKind,
+                                  cast<CastInst>(ReductionSrc));
   InstructionCost NewCost =
-      TTI.getArithmeticReductionCost(ReductionOpc, TruncSrcTy, std::nullopt,
+      TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
                                      CostKind) +
-      TTI.getCastInstrCost(Instruction::Trunc, ResultTy,
-                           ReductionSrcTy->getScalarType(),
+      TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
                            TTI::CastContextHint::None, CostKind);
 
   if (OldCost <= NewCost || !NewCost.isValid())
     return false;
 
-  Value *NewReduction = Builder.CreateIntrinsic(
-      TruncSrcTy->getScalarType(), II->getIntrinsicID(), {TruncSrc});
-  Value *NewTruncation = Builder.CreateTrunc(NewReduction, ResultTy);
-  replaceValue(I, *NewTruncation);
+  Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
+                                                II->getIntrinsicID(), {Src});
+  Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
+  replaceValue(I, *NewCast);
   return true;
 }
 
@@ -2565,7 +2572,7 @@ bool VectorCombine::run() {
       switch (Opcode) {
       case Instruction::Call:
         MadeChange |= foldShuffleFromReductions(I);
-        MadeChange |= foldTruncFromReductions(I);
+        MadeChange |= foldCastFromReductions(I);
         break;
       case Instruction::ICmp:
       case Instruction::FCmp:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
index e4e29143985b2..c45b6c3c5dcab 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
@@ -34,8 +34,8 @@ define void @casts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
@@ -54,8 +54,8 @@ define void @casts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
@@ -74,8 +74,8 @@ define void @casts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
@@ -94,8 +94,8 @@ define void @casts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
@@ -223,56 +223,56 @@ define void @casts() {
 
 define void @fp16() {
 ; CHECK-NOFP16-LABEL: 'fp16'
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 149 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 147 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 171 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 342 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 650 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 286 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-FP16-LABEL: 'fp16'
@@ -284,48 +284,48 @@ define void @fp16() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index d67f056366104..7f4030a81e749 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -367,7 +367,7 @@ define void @multipart() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -409,15 +409,15 @@ define void @vst3(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
@@ -453,15 +453,15 @@ define void @vst4(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/abs.ll b/llvm/test/Analysis/CostModel/AMDGPU/abs.ll
new file mode 100644
index 0000000000000..f65615b07abc0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/abs.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s
+; END.
+
+define void @abs_nonpoison() {
+; FAST-LABEL: 'abs_nonpoison'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'abs_nonpoison'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'abs_nonpoison'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'abs_nonpoison'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+  %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+  %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+  %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+  %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+
+  %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+  %V2I32  = call <2 x i32>  @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+  %V4I32  = call <4 x i32>  @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+  %V8I32  = call <8 x i32>  @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+  %V9I32  = call <9 x i32>  @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+  %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+
+  %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+  %V2I16  = call <2 x i16>  @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+  %V4I16  = call <4 x i16>  @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+  %V8I16  = call <8 x i16>  @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+  %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+  %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+  %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+
+  %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+  %V2I8  = call <2 x i8>  @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+  %V4I8  = call <4 x i8>  @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+  %V8I8  = call <8 x i8>  @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+  %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+  %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+  %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+  %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+
+  ret void
+}
+
+define void @abs_poison() {
+; FAST-LABEL: 'abs_poison'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'abs_poison'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'abs_poison'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'abs_poison'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+  %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+  %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+  %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+  %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+
+  %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+  %V2I32  = call <2 x i32>  @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+  %V4I32  = call <4 x i32>  @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+  %V8I32  = call <8 x i32>  @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+  %V9I32  = call <9 x i32>  @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+  %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+
+  %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+  %V2I16  = call <2 x i16>  @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+  %V4I16  = call <4 x i16>  @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+  %V8I16  = call <8 x i16>  @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+  %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+  %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+  %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+
+  %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+  %V2I8  = call <2 x i8>  @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+  %V4I8  = call <4 x i8>  @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+  %V8I8  = call <8 x i8>  @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+  %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+  %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+  %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+  %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
deleted file mode 100644
index 72a3392891592..0000000000000
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
-
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; END.
-
-define i32 @fcopysign(i32 %arg) {
-; ALL-LABEL: 'fcopysign'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
-;
-; ALL-SIZE-LABEL: 'fcopysign'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-  %F32 = call float @llvm.copysign.f32(float undef, float undef)
-  %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-  %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-  %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-
-  %F64 = call double @llvm.copysign.f64(double undef, double undef)
-  %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-  %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-  %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-
-  ret i32 undef
-}
-
-define i32 @fsqrt(i32 %arg) {
-; ALL-LABEL: 'fsqrt'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
-;
-; ALL-SIZE-LABEL: 'fsqrt'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-  %F32 = call float @llvm.sqrt.f32(float undef)
-  %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-  %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-  %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-
-  %F64 = call double @llvm.sqrt.f64(double undef)
-  %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-  %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-  %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-
-  ret i32 undef
-}
-
-declare float @llvm.copysign.f32(float, float)
-declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.copysign.v16f32(<16 x float>, <16 x float>)
-
-declare double @llvm.copysign.f64(double, double)
-declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.copysign.v8f64(<8 x double>, <8 x double>)
-
-declare float @llvm.sqrt.f32(float)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
-
-declare double @llvm.sqrt.f64(double)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll
new file mode 100644
index 0000000000000..2cee15193a503
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+
+define void @arithmetic_fence_f16() {
+; ALL-LABEL: 'arithmetic_fence_f16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @arithmetic_fence_bf16() {
+; ALL-LABEL: 'arithmetic_fence_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_bf16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @arithmetic_fence_f32() {
+; ALL-LABEL: 'arithmetic_fence_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @arithmetic_fence_f64() {
+; ALL-LABEL: 'arithmetic_fence_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
new file mode 100644
index 0000000000000..e162edbf611e2
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @canonicalize_f16() {
+; BASE-LABEL: 'canonicalize_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'canonicalize_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'canonicalize_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'canonicalize_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'canonicalize_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'canonicalize_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'canonicalize_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'canonicalize_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.canonicalize.f16(half undef) #1
+  %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) #1
+  %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) #1
+  %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) #1
+  %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) #1
+  %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) #1
+  %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) #1
+  ret void
+}
+
+define void @canonicalize_bf16() {
+; BASE-LABEL: 'canonicalize_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'canonicalize_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'canonicalize_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'canonicalize_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'canonicalize_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'canonicalize_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'canonicalize_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'canonicalize_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
+  %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
+  %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
+  %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) #1
+  %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) #1
+  %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) #1
+  %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) #1
+  ret void
+}
+
+define void @canonicalize_f32() {
+; ALL-LABEL: 'canonicalize_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'canonicalize_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.canonicalize.f32(float undef) #1
+  %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
+  %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
+  %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) #1
+  %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) #1
+  %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) #1
+  %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) #1
+  %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) #1
+  ret void
+}
+
+define void @canonicalize_f64() {
+; ALL-LABEL: 'canonicalize_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'canonicalize_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.canonicalize.f64(double undef) #1
+  %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
+  %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) #1
+  %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) #1
+  %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) #1
+  %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) #1
+  %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
+  ret void
+}
+
+
+
+
+
+
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
new file mode 100644
index 0000000000000..06a058ff2e7b1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @copysign_f16() {
+; BASE-LABEL: 'copysign_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'copysign_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'copysign_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'copysign_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'copysign_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'copysign_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'copysign_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'copysign_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.copysign.f16(half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+  %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+  %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+  ret void
+}
+
+define void @copysign_f32() {
+; ALL-LABEL: 'copysign_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'copysign_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.copysign.f32(float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+  %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+  %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+  ret void
+}
+
+define void @copysign_bf16() {
+; BASE-LABEL: 'copysign_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'copysign_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'copysign_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'copysign_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'copysign_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'copysign_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'copysign_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'copysign_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  ret void
+}
+
+define void @copysign_f64() {
+; ALL-LABEL: 'copysign_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'copysign_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.copysign.f64(double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+  %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+  %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll
new file mode 100644
index 0000000000000..a94b794f89a3d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp_f16() {
+; BASE-LABEL: 'exp_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.exp.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @exp_bf16() {
+; BASE-LABEL: 'exp_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @exp_f32() {
+; ALL-LABEL: 'exp_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.exp.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @exp_f64() {
+; ALL-LABEL: 'exp_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.exp.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll
new file mode 100644
index 0000000000000..0fea64d0b94f7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp10_f16() {
+; BASE-LABEL: 'exp10_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp10_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp10_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp10_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp10_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp10_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp10_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp10_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.exp10.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @exp10_bf16() {
+; BASE-LABEL: 'exp10_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp10_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp10_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp10_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp10_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp10_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp10_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp10_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @exp10_f32() {
+; ALL-LABEL: 'exp10_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp10_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.exp10.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @exp10_f64() {
+; ALL-LABEL: 'exp10_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp10.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp10_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp10.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.exp10.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll
new file mode 100644
index 0000000000000..d1ff5e1551db3
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp2_f16() {
+; BASE-LABEL: 'exp2_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp2_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp2_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp2_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp2_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp2_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp2_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp2_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.exp2.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @exp2_bf16() {
+; BASE-LABEL: 'exp2_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp2_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp2_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp2_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp2_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp2_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp2_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp2_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @exp2_f32() {
+; ALL-LABEL: 'exp2_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp2_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.exp2.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @exp2_f64() {
+; ALL-LABEL: 'exp2_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp2.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp2_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp2.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.exp2.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
index daad19e42f3c4..da198ad9c028c 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -1,116 +1,139 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; END.
 
-define amdgpu_kernel void @fabs_f32() #0 {
-; ALL-LABEL: 'fabs_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2
+define void @fabs_f16() {
+; ALL-LABEL: 'fabs_f16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-SIZE-LABEL: 'fabs_f32'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2
+; ALL-SIZE-LABEL: 'fabs_f16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = call float @llvm.fabs.f32(float undef) #1
-  %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1
-  %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1
-  %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #1
-  %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1
-  %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #1
-  %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #1
+  %f16 = call half @llvm.fabs.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
   ret void
 }
 
-define amdgpu_kernel void @fabs_f64() #0 {
-; ALL-LABEL: 'fabs_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2
+define void @fabs_bf16() {
+; ALL-LABEL: 'fabs_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-SIZE-LABEL: 'fabs_f64'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2
+; ALL-SIZE-LABEL: 'fabs_bf16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = call double @llvm.fabs.f64(double undef) #1
-  %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1
-  %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1
-  %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #1
-  %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #1
+  %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
   ret void
 }
 
-define amdgpu_kernel void @fabs_f16() #0 {
-; ALL-LABEL: 'fabs_f16'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+define void @fabs_f32() {
+; ALL-LABEL: 'fabs_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-SIZE-LABEL: 'fabs_f16'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+; ALL-SIZE-LABEL: 'fabs_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = call half @llvm.fabs.f16(half undef) #1
-  %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1
-  %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1
-  %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #1
-  %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #1
-  %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #1
-  %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #1
+  %f32 = call float @llvm.fabs.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
   ret void
 }
 
-declare float @llvm.fabs.f32(float) #1
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
-declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1
-declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
-declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #1
-declare <9 x float> @llvm.fabs.v9f32(<9 x float>) #1
-
-declare double @llvm.fabs.f64(double) #1
-declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
-declare <3 x double> @llvm.fabs.v3f64(<3 x double>) #1
-declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #1
-declare <5 x double> @llvm.fabs.v5f64(<5 x double>) #1
-
-declare half @llvm.fabs.f16(half) #1
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
-declare <3 x half> @llvm.fabs.v3f16(<3 x half>) #1
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
-declare <5 x half> @llvm.fabs.v5f16(<5 x half>) #1
-declare <16 x half> @llvm.fabs.v16f16(<16 x half>) #1
-declare <17 x half> @llvm.fabs.v17f16(<17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fabs_f64() {
+; ALL-LABEL: 'fabs_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'fabs_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.fabs.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index ab4e98201f6d7..2ff9d4f7f5e38 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -1,214 +1,167 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,FASTF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s
-; END.
 
-define amdgpu_kernel void @fma_f32() #0 {
-; SLOWF64-LABEL: 'fma_f32'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f32'
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fma_f16() {
+; FAST-LABEL: 'fma_f16'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOW-LABEL: 'fma_f32'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-LABEL: 'fma_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOWF64-SIZE-LABEL: 'fma_f32'
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f32'
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_f16'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; SLOW-SIZE-LABEL: 'fma_f32'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1
-  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1
-  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1
-  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #1
-  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1
-  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #1
-  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #1
+  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
   ret void
 }
 
-define amdgpu_kernel void @fma_f64() #0 {
-; SLOWF64-LABEL: 'fma_f64'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f64'
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+define void @fma_bf16() {
+; FAST-LABEL: 'fma_bf16'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOW-LABEL: 'fma_f64'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-LABEL: 'fma_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOWF64-SIZE-LABEL: 'fma_f64'
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f64'
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_bf16'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; SLOW-SIZE-LABEL: 'fma_f64'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-SIZE-LABEL: 'fma_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1
-  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1
-  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1
-  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #1
-  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #1
+  %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
   ret void
 }
 
-define amdgpu_kernel void @fma_f16() #0 {
-; FAST-LABEL: 'fma_f16'
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; SLOW-LABEL: 'fma_f16'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+define void @fma_f32() {
+; SLOW-LABEL: 'fma_f32'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; FAST-SIZE-LABEL: 'fma_f16'
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SLOW-SIZE-LABEL: 'fma_f16'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f32'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1
-  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1
-  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1
-  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #1
-  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #1
-  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #1
-  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #1
+  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
   ret void
 }
 
-declare float @llvm.fma.f32(float, float, float) #1
-declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
-declare <3 x float> @llvm.fma.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
-declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) #1
-declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
-declare <9 x float> @llvm.fma.v9f32(<9 x float>, <9 x float>, <9 x float>) #1
-
-declare double @llvm.fma.f64(double, double, double) #1
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
-declare <3 x double> @llvm.fma.v3f64(<3 x double>, <3 x double>, <3 x double>) #1
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #1
-declare <5 x double> @llvm.fma.v5f64(<5 x double>, <5 x double>, <5 x double>) #1
-
-declare half @llvm.fma.f16(half, half, half) #1
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
-declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #1
-declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
-declare <5 x half> @llvm.fma.v5f16(<5 x half>, <5 x half>, <5 x half>) #1
-declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) #1
-declare <17 x half> @llvm.fma.v17f16(<17 x half>, <17 x half>, <17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fma_f64() {
+; SLOW-LABEL: 'fma_f64'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fma_f64'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 2e4a9c70f3717..adc4eea309a58 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -158,4 +158,55 @@ define amdgpu_kernel void @fmul_f16() #0 {
   ret void
 }
 
+define amdgpu_kernel void @fmul_bf16() #0 {
+; GFX9-LABEL: 'fmul_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmul_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'fmul_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmul_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = fmul bfloat undef, undef
+  %v2bf16 = fmul <2 x bfloat> undef, undef
+  %v3bf16 = fmul <3 x bfloat> undef, undef
+  %v4bf16 = fmul <4 x bfloat> undef, undef
+  %v5bf16 = fmul <5 x bfloat> undef, undef
+  %v16bf16 = fmul <16 x bfloat> undef, undef
+  %v17bf16 = fmul <17 x bfloat> undef, undef
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
new file mode 100644
index 0000000000000..c6153bbd94103
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fmuladd_f16() {
+; FAST-LABEL: 'fmuladd_f16'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_f16'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+  %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+  ret void
+}
+
+define void @fmuladd_bf16() {
+; FAST-LABEL: 'fmuladd_bf16'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_bf16'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+  ret void
+}
+
+define void @fmuladd_f32() {
+; SLOW-LABEL: 'fmuladd_f32'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f32'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+  %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+  ret void
+}
+
+define void @fmuladd_f64() {
+; SLOW-LABEL: 'fmuladd_f64'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f64'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
index 725466a3c1a05..9af5dd352d588 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
@@ -3,7 +3,75 @@
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE %s
 ; END.
 
-define amdgpu_kernel void @fneg_f32() {
+define void @fneg_f16() {
+; CHECK-LABEL: 'fneg_f16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'fneg_f16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = fneg half undef
+  %v2f16 = fneg <2 x half> undef
+  %v3f16 = fneg <3 x half> undef
+  %v4f16 = fneg <4 x half> undef
+  %v5f16 = fneg <5 x half> undef
+  %v8f16 = fneg <8 x half> undef
+  %v16f16 = fneg <16 x half> undef
+  %v17f16 = fneg <17 x half> undef
+  ret void
+}
+
+define void @fneg_bf16() {
+; CHECK-LABEL: 'fneg_bf16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'fneg_bf16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = fneg bfloat undef
+  %v2f16 = fneg <2 x bfloat> undef
+  %v3f16 = fneg <3 x bfloat> undef
+  %v4f16 = fneg <4 x bfloat> undef
+  %v5f16 = fneg <5 x bfloat> undef
+  %v8f16 = fneg <8 x bfloat> undef
+  %v16f16 = fneg <16 x bfloat> undef
+  %v17f16 = fneg <17 x bfloat> undef
+  ret void
+}
+
+define void @fneg_f32() {
 ; CHECK-LABEL: 'fneg_f32'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f32 = fneg float undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = fneg <2 x float> undef
@@ -12,6 +80,7 @@ define amdgpu_kernel void @fneg_f32() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = fneg <9 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fneg_f32'
@@ -22,6 +91,7 @@ define amdgpu_kernel void @fneg_f32() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = fneg <9 x float> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fneg float undef
@@ -31,16 +101,19 @@ define amdgpu_kernel void @fneg_f32() {
   %v5f32 = fneg <5 x float> undef
   %v8f32 = fneg <8 x float> undef
   %v9f32 = fneg <9 x float> undef
+  %v16f32 = fneg <16 x float> undef
   ret void
 }
 
-define amdgpu_kernel void @fneg_f64() {
+define void @fneg_f64() {
 ; CHECK-LABEL: 'fneg_f64'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f64 = fneg double undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = fneg <2 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fneg_f64'
@@ -49,6 +122,8 @@ define amdgpu_kernel void @fneg_f64() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = fneg double undef
@@ -56,37 +131,8 @@ define amdgpu_kernel void @fneg_f64() {
   %v3f64 = fneg <3 x double> undef
   %v4f64 = fneg <4 x double> undef
   %v5f64 = fneg <5 x double> undef
-  ret void
-}
-
-define amdgpu_kernel void @fneg_f16() {
-; CHECK-LABEL: 'fneg_f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; SIZE-LABEL: 'fneg_f16'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-  %f16 = fneg half undef
-  %v2f16 = fneg <2 x half> undef
-  %v3f16 = fneg <3 x half> undef
-  %v4f16 = fneg <4 x half> undef
-  %v5f16 = fneg <5 x half> undef
-  %v16f16 = fneg <16 x half> undef
-  %v17f16 = fneg <17 x half> undef
+  %v8f64 = fneg <8 x double> undef
+  %v16f64 = fneg <16 x double> undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
new file mode 100644
index 0000000000000..22134d042fabb
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+
+define void @frexp_f16_i32() {
+; GFX7-LABEL: 'frexp_f16_i32'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_f16_i32'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_f16_i32'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i32'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+  %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+  %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+  %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+  %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+  %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+  %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+  %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+  ret void
+}
+
+define void @frexp_f16_i16() {
+; GFX7-LABEL: 'frexp_f16_i16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_f16_i16'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_f16_i16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+  %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+  %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+  %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+  %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+  %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+  %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+  %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+  ret void
+}
+
+define void @frexp_bf16() {
+; GFX7-LABEL: 'frexp_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_bf16'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_bf16'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+  %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+  %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+  %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+  %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+  %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+  %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+  %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+  ret void
+}
+
+define void @frexp_f32() {
+; ALL-LABEL: 'frexp_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'frexp_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+  %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+  %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+  %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+  %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+  %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+  %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+  %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+  ret void
+}
+
+define void @frexp_f64() {
+; ALL-LABEL: 'frexp_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'frexp_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+  %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+  %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+  %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+  %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+  %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+  %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+  %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll
new file mode 100644
index 0000000000000..fc7af38e4729f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+
+define void @is_fpclass_f16() {
+; ALL-LABEL: 'is_fpclass_f16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+  %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+  %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+  %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+  %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+  %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+  %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+  %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+  ret void
+}
+
+define void @is_fpclass_bf16() {
+; ALL-LABEL: 'is_fpclass_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_bf16'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+  %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+  %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+  %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+  %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+  %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+  %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+  %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+  ret void
+}
+
+define void @is_fpclass_f32() {
+; ALL-LABEL: 'is_fpclass_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+  %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+  %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+  %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+  %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+  %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+  %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+  %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+  ret void
+}
+
+define void @is_fpclass_f64() {
+; ALL-LABEL: 'is_fpclass_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+  %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+  %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+  %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+  %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+  %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+  %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+  %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll
new file mode 100644
index 0000000000000..2b1b5906ad017
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+
+define void @ldexp_f16_i32() {
+; GFX7-LABEL: 'ldexp_f16_i32'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_f16_i32'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_f16_i32'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i32'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+  %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+  %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+  %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+  %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+  %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+  %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+  %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+  ret void
+}
+
+define void @ldexp_f16_i16() {
+; GFX7-LABEL: 'ldexp_f16_i16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_f16_i16'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_f16_i16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i16'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+  %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+  %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+  %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+  %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+  %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+  %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+  %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+  ret void
+}
+
+define void @ldexp_bf16() {
+; GFX7-LABEL: 'ldexp_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_bf16'
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_bf16'
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+  %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+  ret void
+}
+
+define void @ldexp_f32() {
+; ALL-LABEL: 'ldexp_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'ldexp_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+  %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+  %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+  %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+  %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+  %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+  %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+  %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+  ret void
+}
+
+define void @ldexp_f64() {
+; ALL-LABEL: 'ldexp_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'ldexp_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+  %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+  %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+  %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+  %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+  %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+  %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+  %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log.ll b/llvm/test/Analysis/CostModel/AMDGPU/log.ll
new file mode 100644
index 0000000000000..2cf7039982f26
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log_f16() {
+; BASE-LABEL: 'log_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.log.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @log_bf16() {
+; BASE-LABEL: 'log_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @log_f32() {
+; ALL-LABEL: 'log_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.log.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @log_f64() {
+; ALL-LABEL: 'log_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.log.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log10.ll b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll
new file mode 100644
index 0000000000000..d807c6cd63648
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log10_f16() {
+; BASE-LABEL: 'log10_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log10_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log10_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log10_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log10_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log10_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log10_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log10_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.log10.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @log10_bf16() {
+; BASE-LABEL: 'log10_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log10_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log10_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log10_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log10_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log10_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log10_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log10_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @log10_f32() {
+; ALL-LABEL: 'log10_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log10_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.log10.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @log10_f64() {
+; ALL-LABEL: 'log10_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log10.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log10_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log10.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.log10.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log2.ll b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll
new file mode 100644
index 0000000000000..1ef3977fe4a8f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log2_f16() {
+; BASE-LABEL: 'log2_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log2_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log2_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log2_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log2_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log2_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log2_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log2_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.log2.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @log2_bf16() {
+; BASE-LABEL: 'log2_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log2_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log2_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log2_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log2_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log2_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log2_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log2_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @log2_f32() {
+; ALL-LABEL: 'log2_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log2_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.log2.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @log2_f64() {
+; ALL-LABEL: 'log2_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log2.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log2_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log2.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.log2.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
new file mode 100644
index 0000000000000..0f6f01b9c1d36
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @maximum_f16() {
+; GFX9-LABEL: 'maximum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maximum_f16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maximum_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maximum_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %f16 = call half @llvm.maximum.f16(half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+  %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+  ret void
+}
+
+define void @maximum_bf16() {
+; GFX9-LABEL: 'maximum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maximum_bf16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maximum_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maximum_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  ret void
+}
+
+define void @maximum_f32() {
+; ALL-LABEL: 'maximum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maximum.f32(float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+  %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+  ret void
+}
+
+define void @maximum_f64() {
+; ALL-LABEL: 'maximum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maximum.f64(double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll
new file mode 100644
index 0000000000000..0d423fec7adcc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @maxnum_f16() {
+; GFX9-LABEL: 'maxnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maxnum_f16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maxnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maxnum_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maxnum_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+  %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+  ret void
+}
+
+define void @maxnum_bf16() {
+; GFX9-LABEL: 'maxnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maxnum_bf16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maxnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maxnum_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maxnum_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  ret void
+}
+
+define void @maxnum_f32() {
+; ALL-LABEL: 'maxnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maxnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+  %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+  ret void
+}
+
+define void @maxnum_f64() {
+; ALL-LABEL: 'maxnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maxnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
new file mode 100644
index 0000000000000..b6e52cfaaaada
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @minimum_f16() {
+; GFX9-LABEL: 'minimum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minimum_f16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minimum_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minimum_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %f16 = call half @llvm.minimum.f16(half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+  %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+  ret void
+}
+
+define void @minimum_bf16() {
+; GFX9-LABEL: 'minimum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minimum_bf16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minimum_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minimum_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  ret void
+}
+
+define void @minimum_f32() {
+; ALL-LABEL: 'minimum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minimum.f32(float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+  %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+  ret void
+}
+
+define void @minimum_f64() {
+; ALL-LABEL: 'minimum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.minimum.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.minimum.f64(double undef, double undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minimum.f64(double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll
new file mode 100644
index 0000000000000..61432ba440920
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @minnum_f16() {
+; GFX9-LABEL: 'minnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minnum_f16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minnum_f16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minnum_f16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %f16 = call half @llvm.minnum.f16(half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+  %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+  ret void
+}
+
+define void @minnum_bf16() {
+; GFX9-LABEL: 'minnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minnum_bf16'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minnum_bf16'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minnum_bf16'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+  %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  ret void
+}
+
+define void @minnum_f32() {
+; ALL-LABEL: 'minnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minnum.f32(float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+  %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+  ret void
+}
+
+define void @minnum_f64() {
+; ALL-LABEL: 'minnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minnum.f64(double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll
new file mode 100644
index 0000000000000..8600dd70da620
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+; END.
+
+define ptr @ptrmask_p0_i64(ptr %ptr, i64 %mask) {
+; ALL-LABEL: 'ptrmask_p0_i64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p0_i64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr %result
+;
+  %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+  ret ptr %result
+}
+
+define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> %ptr, <2 x i64> %mask) {
+; ALL-LABEL: 'ptrmask_v2p0_v2i64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v2p0_v2i64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr> %result
+;
+  %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+  ret <2 x ptr> %result
+}
+
+define ptr addrspace(1) @ptrmask_p1_i64(ptr addrspace(1) %ptr, i64 %mask) {
+; ALL-LABEL: 'ptrmask_p1_i64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(1) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p1_i64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(1) %result
+;
+  %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+  ret ptr addrspace(1) %result
+}
+
+define ptr addrspace(5) @ptrmask_p5_i32(ptr addrspace(5) %ptr, i32 %mask) {
+; ALL-LABEL: 'ptrmask_p5_i32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(5) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p5_i32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(5) %result
+;
+  %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+  ret ptr addrspace(5) %result
+}
+
+define ptr addrspace(3) @ptrmask_p3_i32(ptr addrspace(3) %ptr, i32 %mask) {
+; ALL-LABEL: 'ptrmask_p3_i32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(3) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p3_i32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(3) %result
+;
+  %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+  ret ptr addrspace(3) %result
+}
+
+define <2 x ptr addrspace(5)> @ptrmask_v2p5_v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) {
+; ALL-LABEL: 'ptrmask_v2p5_v2i32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr addrspace(5)> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v2p5_v2i32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr addrspace(5)> %result
+;
+  %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+  ret <2 x ptr addrspace(5)> %result
+}
+
+define <3 x ptr> @ptrmask_v3p0_v3i64(<3 x ptr> %ptr, <3 x i64> %mask) {
+; ALL-LABEL: 'ptrmask_v3p0_v3i64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v3p0_v3i64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr> %result
+;
+  %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+  ret <3 x ptr> %result
+}
+
+define <3 x ptr addrspace(5)> @ptrmask_v3p5_v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) {
+; ALL-LABEL: 'ptrmask_v3p5_v3i32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr addrspace(5)> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v3p5_v3i32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr addrspace(5)> %result
+;
+  %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+  ret <3 x ptr addrspace(5)> %result
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll
new file mode 100644
index 0000000000000..7136b70a04190
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @sqrt_f16() {
+; BASE-LABEL: 'sqrt_f16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'sqrt_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'sqrt_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'sqrt_f16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'sqrt_f16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'sqrt_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'sqrt_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'sqrt_f16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.sqrt.f16(half undef)
+  %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+  %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+  ret void
+}
+
+define void @sqrt_bf16() {
+; BASE-LABEL: 'sqrt_bf16'
+; BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'sqrt_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'sqrt_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'sqrt_bf16'
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'sqrt_bf16'
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'sqrt_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'sqrt_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'sqrt_bf16'
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+  ret void
+}
+
+define void @sqrt_f32() {
+; ALL-LABEL: 'sqrt_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'sqrt_f32'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.sqrt.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+  ret void
+}
+
+define void @sqrt_f64() {
+; ALL-LABEL: 'sqrt_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'sqrt_f64'
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.sqrt.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index d1e8bb015491e..5236f5a3bae95 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -8,36 +8,36 @@ define i32 @fadd() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fadd <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fadd <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fadd <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fadd <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fadd <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fadd <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fadd <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fadd <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fadd <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fadd <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fadd <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fadd <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fadd <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fadd <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fadd <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fadd <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fadd <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fadd <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fadd <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fadd <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fadd half undef, undef
@@ -88,36 +88,36 @@ define i32 @fsub() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fsub <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fsub <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fsub <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fsub <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fsub <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fsub <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fsub <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fsub <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fsub <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fsub <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fsub <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fsub <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fsub <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fsub <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fsub <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fsub <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fsub <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fsub <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fsub <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fsub <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fsub <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fsub <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fsub <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fsub <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fsub <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fsub <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fsub <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fsub <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fsub <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fsub half undef, undef
@@ -168,36 +168,36 @@ define i32 @fmul() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fmul <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fmul <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fmul <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fmul <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fmul <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fmul <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fmul <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fmul <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fmul <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fmul <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fmul <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fmul <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fmul <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fmul <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fmul <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fmul <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fmul <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fmul <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fmul <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fmul <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fmul <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fmul <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fmul <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fmul <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fmul <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fmul <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fmul <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fmul <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fmul <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fmul <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fmul half undef, undef
@@ -248,36 +248,36 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fdiv <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fdiv <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fdiv <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fdiv <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fdiv <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fdiv <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fdiv <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fdiv <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fdiv <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fdiv half undef, undef
@@ -408,36 +408,36 @@ define i32 @fneg() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fneg <1 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg <4 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg <8 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fneg <32 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fneg <vscale x 1 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fneg <vscale x 2 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fneg <vscale x 4 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fneg <vscale x 8 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fneg <vscale x 16 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fneg <vscale x 32 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fneg <1 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg <4 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fneg <16 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fneg <vscale x 1 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fneg <vscale x 2 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fneg <vscale x 4 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fneg <vscale x 8 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fneg <vscale x 16 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fneg <1 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg <4 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fneg <8 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fneg <vscale x 1 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fneg <vscale x 2 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fneg <vscale x 4 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fneg <vscale x 8 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg <vscale x 1 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg <vscale x 2 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg <vscale x 4 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg <vscale x 8 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg <vscale x 16 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg <vscale x 32 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fneg <16 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fneg <vscale x 1 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fneg <vscale x 2 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fneg <vscale x 4 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fneg <vscale x 8 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fneg <vscale x 16 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fneg <1 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fneg <8 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fneg <vscale x 1 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fneg <vscale x 2 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fneg <vscale x 4 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg <vscale x 8 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fneg half undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index 6ddd57a24c51f..669e7028ff54d 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -4314,3 +4314,66 @@ define void @uitofp() {
 
   ret void
 }
+
+define void @oddvec_sizes() {
+; CHECK-LABEL: 'oddvec_sizes'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %1 = sext <3 x i8> undef to <3 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %2 = sext <7 x i8> undef to <7 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %3 = sext <15 x i8> undef to <15 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %4 = zext <3 x i8> undef to <3 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %5 = zext <7 x i8> undef to <7 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %6 = zext <15 x i8> undef to <15 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %7 = trunc <3 x i32> undef to <3 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %8 = trunc <7 x i32> undef to <7 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %9 = trunc <15 x i32> undef to <15 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = bitcast <3 x i32> undef to <3 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = bitcast <7 x i32> undef to <7 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = bitcast <15 x i32> undef to <15 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %13 = sitofp <3 x i32> undef to <3 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %14 = sitofp <7 x i32> undef to <7 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %15 = sitofp <15 x i32> undef to <15 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %16 = uitofp <3 x i32> undef to <3 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %17 = uitofp <7 x i32> undef to <7 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %18 = uitofp <15 x i32> undef to <15 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = fptosi <3 x float> undef to <3 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = fptosi <7 x float> undef to <7 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = fptosi <15 x float> undef to <15 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = fptoui <3 x float> undef to <3 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = fptoui <7 x float> undef to <7 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = fptoui <15 x float> undef to <15 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  sext <3 x i8> undef to <3 x i16>
+  sext <7 x i8> undef to <7 x i32>
+  sext <15 x i8> undef to <15 x i32>
+
+  zext <3 x i8> undef to <3 x i16>
+  zext <7 x i8> undef to <7 x i32>
+  zext <15 x i8> undef to <15 x i32>
+
+  trunc <3 x i32> undef to <3 x i8>
+  trunc <7 x i32> undef to <7 x i8>
+  trunc <15 x i32> undef to <15 x i8>
+
+  bitcast <3 x i32> undef to <3 x float>
+  bitcast <7 x i32> undef to <7 x float>
+  bitcast <15 x i32> undef to <15 x float>
+
+  sitofp <3 x i32> undef to <3 x float>
+  sitofp <7 x i32> undef to <7 x float>
+  sitofp <15 x i32> undef to <15 x float>
+
+  uitofp <3 x i32> undef to <3 x float>
+  uitofp <7 x i32> undef to <7 x float>
+  uitofp <15 x i32> undef to <15 x float>
+
+  fptosi <3 x float> undef to <3 x i32>
+  fptosi <7 x float> undef to <7 x i32>
+  fptosi <15 x float> undef to <15 x i32>
+
+  fptoui <3 x float> undef to <3 x i32>
+  fptoui <7 x float> undef to <7 x i32>
+  fptoui <15 x float> undef to <15 x i32>
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 87ffb23dcb88e..67c081ba5d3c6 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -21,7 +21,7 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 
 define void @powi(<vscale x 4 x float> %vec) {
 ; CHECK-LABEL: 'powi'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'powi'
@@ -1383,73 +1383,73 @@ define void @reduce_fadd() {
 
 define void @vp_fadd(){
 ; CHECK-LABEL: 'vp_fadd'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'vp_fadd'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
index d24a29dab03d9..2448056076d8b 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -1,146 +1,186 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefixes=CODESIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CODESIZE
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
 define void @load(ptr %p) {
 ; CHECK-LABEL: 'load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr %p, align 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i8>, ptr %p, align 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i8>, ptr %p, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i8>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <32 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i8>, ptr %p, align 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i8>, ptr %p, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i8>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = load <vscale x 16 x i8>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = load <vscale x 32 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load i16, ptr %p, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i16>, ptr %p, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i16>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i16>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = load <16 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = load <32 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i16>, ptr %p, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i16>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = load <vscale x 8 x i16>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = load <vscale x 16 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %26 = load <vscale x 32 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load i32, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i32>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i32>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i32>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %33 = load <32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i32>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i32>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %36 = load <vscale x 4 x i32>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = load <8 x i64>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = load <16 x i64>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = load <32 x i64>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i64>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %48 = load <vscale x 2 x i64>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %49 = load <vscale x 4 x i64>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %50 = load <vscale x 8 x i64>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %51 = load <vscale x 16 x i64>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %52 = load <vscale x 32 x i64>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %53 = load ptr, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x ptr>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x ptr>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x ptr>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %61 = load <vscale x 2 x ptr>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %62 = load <vscale x 4 x ptr>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %63 = load <vscale x 8 x ptr>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %64 = load <vscale x 16 x ptr>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %65 = load <vscale x 32 x ptr>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i1, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i1>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i1>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = load <vscale x 16 x i1>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = load <vscale x 32 x i1>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load i8, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i8>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i8>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i8>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i8>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i8>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i8>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i8>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = load <vscale x 16 x i8>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %26 = load <vscale x 32 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load i16, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i16>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i16>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i16>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %32 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %33 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i16>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i16>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %37 = load <vscale x 8 x i16>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %38 = load <vscale x 16 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %39 = load <vscale x 32 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i32, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i32>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %44 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %45 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %46 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i32>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %49 = load <vscale x 4 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %50 = load <vscale x 8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %51 = load <vscale x 16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %52 = load <vscale x 32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %53 = load i64, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x i64>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x i64>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x i64>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x i64>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %61 = load <vscale x 2 x i64>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %62 = load <vscale x 4 x i64>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %63 = load <vscale x 8 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %64 = load <vscale x 16 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %65 = load <vscale x 32 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = load ptr, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %67 = load <1 x ptr>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %68 = load <2 x ptr>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %69 = load <4 x ptr>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %70 = load <8 x ptr>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %71 = load <16 x ptr>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = load <32 x ptr>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %73 = load <vscale x 1 x ptr>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %74 = load <vscale x 2 x ptr>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %75 = load <vscale x 4 x ptr>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %76 = load <vscale x 8 x ptr>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = load <vscale x 16 x ptr>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = load <vscale x 32 x ptr>, ptr %p, align 256
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CODESIZE-LABEL: 'load'
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr %p, align 1
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i8>, ptr %p, align 1
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i8>, ptr %p, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i8>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i8>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <32 x i8>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i8>, ptr %p, align 1
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i8>, ptr %p, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i8>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i8>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <vscale x 16 x i8>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <vscale x 32 x i8>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load i16, ptr %p, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i16>, ptr %p, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i16>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i16>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i16>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = load <16 x i16>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = load <32 x i16>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i16>, ptr %p, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i16>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i16>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i16>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %25 = load <vscale x 16 x i16>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = load <vscale x 32 x i16>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load i32, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i32>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i32>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = load <16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %33 = load <32 x i32>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i32>, ptr %p, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i32>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = load <8 x i64>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = load <16 x i64>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = load <32 x i64>, ptr %p, align 256
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i64>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i64>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %49 = load <vscale x 4 x i64>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %50 = load <vscale x 8 x i64>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %51 = load <vscale x 16 x i64>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = load <vscale x 32 x i64>, ptr %p, align 256
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %53 = load ptr, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x ptr>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x ptr>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x ptr>, ptr %p, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = load <vscale x 2 x ptr>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %62 = load <vscale x 4 x ptr>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %63 = load <vscale x 8 x ptr>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %64 = load <vscale x 16 x ptr>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %65 = load <vscale x 32 x ptr>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i1, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <vscale x 16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <vscale x 32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load i8, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i8>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i8>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i8>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i8>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i8>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i8>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i8>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i8>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %25 = load <vscale x 16 x i8>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = load <vscale x 32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load i16, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i16>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i16>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i16>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i16>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = load <16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = load <32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i16>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i16>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i16>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = load <vscale x 8 x i16>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = load <vscale x 16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %39 = load <vscale x 32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i32, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %44 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %45 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %49 = load <vscale x 4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %50 = load <vscale x 8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = load <vscale x 16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %52 = load <vscale x 32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %53 = load i64, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x i64>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x i64>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = load <4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %57 = load <8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %58 = load <16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %59 = load <32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x i64>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = load <vscale x 2 x i64>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %62 = load <vscale x 4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %63 = load <vscale x 8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %64 = load <vscale x 16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %65 = load <vscale x 32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = load ptr, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %67 = load <1 x ptr>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %68 = load <2 x ptr>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %69 = load <4 x ptr>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %70 = load <8 x ptr>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %71 = load <16 x ptr>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %72 = load <32 x ptr>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %73 = load <vscale x 1 x ptr>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %74 = load <vscale x 2 x ptr>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %75 = load <vscale x 4 x ptr>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %76 = load <vscale x 8 x ptr>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %77 = load <vscale x 16 x ptr>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %78 = load <vscale x 32 x ptr>, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+  load i1, ptr %p
+  load <1 x i1>, ptr %p
+  load <2 x i1>, ptr %p
+  load <4 x i1>, ptr %p
+  load <8 x i1>, ptr %p
+  load <16 x i1>, ptr %p
+  load <32 x i1>, ptr %p
+  load <vscale x 1 x i1>, ptr %p
+  load <vscale x 2 x i1>, ptr %p
+  load <vscale x 4 x i1>, ptr %p
+  load <vscale x 8 x i1>, ptr %p
+  load <vscale x 16 x i1>, ptr %p
+  load <vscale x 32 x i1>, ptr %p
+
   load i8, ptr %p
   load <1 x i8>, ptr %p
   load <2 x i8>, ptr %p
@@ -217,6 +257,19 @@ define void @load(ptr %p) {
 
 define void @store(ptr %p) {
 ; CHECK-LABEL: 'store'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i1 undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 32 x i1> undef, ptr %p, align 4
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr %p, align 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i8> undef, ptr %p, align 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr %p, align 2
@@ -285,13 +338,26 @@ define void @store(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CODESIZE-LABEL: 'store'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i1 undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 32 x i1> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr %p, align 1
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i8> undef, ptr %p, align 1
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i8> undef, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i8> undef, ptr %p, align 1
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr %p, align 4
@@ -303,8 +369,8 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i16> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i16> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> undef, ptr %p, align 64
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i16> undef, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i16> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i16> undef, ptr %p, align 8
@@ -315,9 +381,9 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> undef, ptr %p, align 128
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i32> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i32> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i32> undef, ptr %p, align 16
@@ -327,10 +393,10 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i64> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i64> undef, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i64> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i64> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i64> undef, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i64> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i64> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i64> undef, ptr %p, align 32
@@ -340,10 +406,10 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store ptr undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x ptr> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x ptr> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x ptr> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x ptr> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x ptr> undef, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x ptr> undef, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x ptr> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x ptr> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x ptr> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x ptr> undef, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x ptr> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x ptr> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x ptr> undef, ptr %p, align 32
@@ -352,6 +418,20 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 32 x ptr> undef, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+  store i1 undef, ptr %p
+  store <1 x i1> undef, ptr %p
+  store <2 x i1> undef, ptr %p
+  store <4 x i1> undef, ptr %p
+  store <8 x i1> undef, ptr %p
+  store <16 x i1> undef, ptr %p
+  store <32 x i1> undef, ptr %p
+  store <vscale x 1 x i1> undef, ptr %p
+  store <vscale x 2 x i1> undef, ptr %p
+  store <vscale x 4 x i1> undef, ptr %p
+  store <vscale x 8 x i1> undef, ptr %p
+  store <vscale x 16 x i1> undef, ptr %p
+  store <vscale x 32 x i1> undef, ptr %p
+
   store i8 undef, ptr %p
   store <1 x i8> undef, ptr %p
   store <2 x i8> undef, ptr %p
@@ -451,9 +531,9 @@ define void @store_of_constant(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> zeroinitializer, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> zeroinitializer, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> zeroinitializer, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> <i32 4096, i32 4096, i32 4096, i32 4096>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> <i32 1, i32 1, i32 2, i32 1>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> <i32 2, i32 1, i32 1, i32 1>, ptr %p, align 16
@@ -493,3 +573,179 @@ define void @store_of_constant(ptr %p) {
 
   ret void
 }
+
+
+define void @load_oddsize_vectors(ptr %p) {
+; CHECK-LABEL: 'load_oddsize_vectors'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i1>, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i1>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i1>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i1>, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i1>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i1>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <1 x i32>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <2 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <3 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = load <5 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = load <7 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %22 = load <9 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %23 = load <15 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %24 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %25 = load <31 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %26 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'load_oddsize_vectors'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i1>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i1>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i1>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <3 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <5 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <6 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <7 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <9 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <15 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %25 = load <31 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+
+  load <1 x i1>, ptr %p
+  load <2 x i1>, ptr %p
+  load <3 x i1>, ptr %p
+  load <4 x i1>, ptr %p
+  load <5 x i1>, ptr %p
+  load <6 x i1>, ptr %p
+  load <7 x i1>, ptr %p
+  load <8 x i1>, ptr %p
+  load <9 x i1>, ptr %p
+  load <15 x i1>, ptr %p
+  load <16 x i1>, ptr %p
+  load <31 x i1>, ptr %p
+  load <32 x i1>, ptr %p
+
+  load <1 x i32>, ptr %p
+  load <2 x i32>, ptr %p
+  load <3 x i32>, ptr %p
+  load <4 x i32>, ptr %p
+  load <5 x i32>, ptr %p
+  load <6 x i32>, ptr %p
+  load <7 x i32>, ptr %p
+  load <8 x i32>, ptr %p
+  load <9 x i32>, ptr %p
+  load <15 x i32>, ptr %p
+  load <16 x i32>, ptr %p
+  load <31 x i32>, ptr %p
+  load <32 x i32>, ptr %p
+
+  ret void
+}
+
+define void @store_oddsize_vectors(ptr %p) {
+; CHECK-LABEL: 'store_oddsize_vectors'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <5 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <6 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <7 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <15 x i1> undef, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <31 x i1> undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'store_oddsize_vectors'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <5 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <6 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <7 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <15 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <31 x i1> undef, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  store <1 x i1> undef, ptr %p
+  store <2 x i1> undef, ptr %p
+  store <3 x i1> undef, ptr %p
+  store <4 x i1> undef, ptr %p
+  store <5 x i1> undef, ptr %p
+  store <6 x i1> undef, ptr %p
+  store <7 x i1> undef, ptr %p
+  store <8 x i1> undef, ptr %p
+  store <15 x i1> undef, ptr %p
+  store <16 x i1> undef, ptr %p
+  store <31 x i1> undef, ptr %p
+  store <32 x i1> undef, ptr %p
+
+  store <1 x i32> undef, ptr %p
+  store <2 x i32> undef, ptr %p
+  store <3 x i32> undef, ptr %p
+  store <4 x i32> undef, ptr %p
+  store <5 x i32> undef, ptr %p
+  store <6 x i32> undef, ptr %p
+  store <7 x i32> undef, ptr %p
+  store <8 x i32> undef, ptr %p
+  store <15 x i32> undef, ptr %p
+  store <16 x i32> undef, ptr %p
+  store <31 x i32> undef, ptr %p
+  store <32 x i32> undef, ptr %p
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index ba745262d1890..d99a6a793f964 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -37,190 +37,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @sadd(i32 %arg) {
 ; SSSE3-LABEL: 'sadd'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sadd'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sadd'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sadd'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sadd'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sadd'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sadd'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sadd'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sadd'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sadd'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -275,190 +275,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @uadd(i32 %arg) {
 ; SSSE3-LABEL: 'uadd'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'uadd'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'uadd'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'uadd'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uadd'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'uadd'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'uadd'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'uadd'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'uadd'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'uadd'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -513,190 +513,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @ssub(i32 %arg) {
 ; SSSE3-LABEL: 'ssub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'ssub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ssub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ssub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ssub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ssub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'ssub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'ssub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'ssub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'ssub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -751,190 +751,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @usub(i32 %arg) {
 ; SSSE3-LABEL: 'usub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'usub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'usub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'usub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'usub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'usub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'usub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'usub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'usub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'usub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -989,193 +989,193 @@ declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @smul(i32 %arg) {
 ; SSSE3-LABEL: 'smul'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'smul'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'smul'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'smul'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'smul'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'smul'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'smul'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'smul'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'smul'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'smul'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -1227,193 +1227,193 @@ declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @umul(i32 %arg) {
 ; SSSE3-LABEL: 'umul'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'umul'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'umul'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'umul'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'umul'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'umul'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'umul'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'umul'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'umul'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'umul'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
index 21c99555c5df3..e3ade3986398d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
@@ -35,211 +35,211 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,103 +287,103 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -391,18 +391,18 @@ define i32 @sub(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -410,18 +410,18 @@ define i32 @sub(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -429,70 +429,70 @@ define i32 @sub(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
index 1d398d9555fbf..a08785189a4b8 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
index 4d1fa7c5cf55c..070c70e1fe018 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
index 4bbd28edb434b..e8c8666617b83 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
@@ -35,13 +35,13 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -54,13 +54,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -73,13 +73,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -92,32 +92,32 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -130,13 +130,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -149,13 +149,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -168,13 +168,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -187,13 +187,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -206,13 +206,13 @@ define i32 @add(i32 %arg) {
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -225,21 +225,21 @@ define i32 @add(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -288,13 +288,13 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -307,13 +307,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'sub'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -326,13 +326,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSE42-LABEL: 'sub'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -345,32 +345,32 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -383,13 +383,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -402,13 +402,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -421,13 +421,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -440,13 +440,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -459,13 +459,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; GLM-LABEL: 'sub'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -478,21 +478,21 @@ define i32 @sub(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
index e92b8102a5a3f..fcb03da681e2a 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
index 9d9e5843c2285..b5a1321531f5c 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
@@ -35,154 +35,154 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
@@ -191,55 +191,55 @@ define i32 @add(i32 %arg) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -292,17 +292,17 @@ define i32 @sub(i32 %arg) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
@@ -311,188 +311,188 @@ define i32 @sub(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
index 1912822bdcb65..c4b0f7528cb1d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat.ll b/llvm/test/Analysis/CostModel/X86/arith-usat.ll
index 75950e40c2a00..95dc22d4a1714 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat.ll
@@ -35,18 +35,18 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -54,18 +54,18 @@ define i32 @add(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -77,14 +77,14 @@ define i32 @add(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -92,37 +92,37 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -130,18 +130,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -149,18 +149,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -168,18 +168,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -187,18 +187,18 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -210,14 +210,14 @@ define i32 @add(i32 %arg) {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -225,21 +225,21 @@ define i32 @add(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -288,18 +288,18 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -307,18 +307,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'sub'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -330,14 +330,14 @@ define i32 @sub(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -345,37 +345,37 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -383,18 +383,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -402,18 +402,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -421,18 +421,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -440,18 +440,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -463,14 +463,14 @@ define i32 @sub(i32 %arg) {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -478,21 +478,21 @@ define i32 @sub(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/costmodel.ll b/llvm/test/Analysis/CostModel/X86/costmodel.ll
index 7e5ee3bb62799..e536f341019af 100644
--- a/llvm/test/Analysis/CostModel/X86/costmodel.ll
+++ b/llvm/test/Analysis/CostModel/X86/costmodel.ll
@@ -29,7 +29,7 @@ define i64 @foo(i64 %arg) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %I2P = inttoptr i64 undef to ptr
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %P2I = ptrtoint ptr undef to i64
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %TC = trunc i64 undef to i32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void undef()
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 06429a5107113..88a22f98feb25 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -50,23 +50,23 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 
 define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-LABEL: 'umul'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'umul'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'umul'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'umul'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll b/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll
new file mode 100644
index 0000000000000..36a2115bd7a94
--- /dev/null
+++ b/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -disable-output -passes='print<cycles>' 2>&1 | FileCheck %s
+; CHECK-LABEL: CycleInfo for function: unreachable
+; CHECK:    depth=1: entries(loop.body) loop.latch inner.block
+define void @unreachable(i32 %n) {
+entry:
+  br label %loop.body
+
+loop.body:
+  br label %inner.block
+
+; This branch should not cause %inner.block to appear as an entry.
+unreachable.block:
+  br label %inner.block
+
+inner.block:
+  br i1 undef, label %loop.exit, label %loop.latch
+
+loop.latch:
+  br label %loop.body
+
+loop.exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index f1ae1a897fff8..7fc9958dba552 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes='print<access-info>' -disable-output  < %s 2>&1 | FileCheck %s
 
-; Check that loop-indepedent forward dependences are discovered properly.
+; Check that loop-independent forward dependences are discovered properly.
 ;
 ; FIXME: This does not actually always work which is pretty confusing.  Right
-; now there is hack in LAA that tries to figure out loop-indepedent forward
+; now there is hack in LAA that tries to figure out loop-independent forward
 ; dependeces *outside* of the MemoryDepChecker logic (i.e. proper dependence
 ; analysis).
 ;
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
index 2e61a28039846..6d8e296ec72fa 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
@@ -9,21 +9,19 @@
 define void @B_indices_loaded_in_loop_A_stored(ptr %A, ptr noalias %B, i64 %N, i64 %off) {
 ; CHECK-LABEL: 'B_indices_loaded_in_loop_A_stored'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %l = load i32, ptr %gep.B, align 4 ->
+; CHECK-NEXT:            store i32 %inc, ptr %gep.B, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %indices = load i8, ptr %gep.A, align 1 ->
+; CHECK-NEXT:            store i32 %l, ptr %gep.C, align 4
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
-; CHECK-NEXT:      Check 0:
-; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.C = getelementptr inbounds i32, ptr %A, i64 %iv
-; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv.off
 ; CHECK-NEXT:      Grouped accesses:
-; CHECK-NEXT:        Group [[GRP1]]:
-; CHECK-NEXT:          (Low: %A High: ((4 * %N) + %A))
-; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop>
-; CHECK-NEXT:        Group [[GRP2]]:
-; CHECK-NEXT:          (Low: (%off + %A) High: (%N + %off + %A))
-; CHECK-NEXT:            Member: {(%off + %A),+,1}<nw><%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -59,9 +57,9 @@ define void @B_indices_loaded_in_loop_A_not_stored(ptr %A, ptr noalias %B, i64 %
 ; CHECK-LABEL: 'B_indices_loaded_in_loop_A_not_stored'
 ; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        IndirectUnsafe:
 ; CHECK-NEXT:            %l = load i32, ptr %gep.B, align 4 ->
 ; CHECK-NEXT:            store i32 %inc, ptr %gep.B, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
index 546a75cf4efd5..28ee6c6f0a89a 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
@@ -13,9 +13,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK-NEXT: for.body:
 ; CHECK-NEXT:   Report: unsafe dependent memory operations in loop
 ; CHECK-NOT:    Report: cannot identify array bounds
-; CHECK-NEXT:   Unknown data dependence.
+; CHECK-NEXT:   Unsafe indirect dependence.
 ; CHECK-NEXT:     Dependences:
-; CHECK-NEXT:       Unknown:
+; CHECK-NEXT:       IndirectUnsafe:
 ; CHECK-NEXT:         %loadA = load i16, ptr %arrayidxA, align 2 ->
 ; CHECK-NEXT:         store i16 %mul, ptr %arrayidxA, align 2
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
index 18e45f469b4a3..8ca30383092c6 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
@@ -9,8 +9,9 @@
 ; CHECK-LABEL: 'negative_step'
 ; CHECK: LAA: Found an analyzable loop: loop
 ; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>(Induction step: -1)
+; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>
 ; CHECK-NEXT: LAA: Distance for   store i32 %add, ptr %gep.A.plus.1, align 4 to   %l = load i32, ptr %gep.A, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: -1 Sink induction step: -1
 ; CHECK-NEXT: LAA: Dependence is negative
 
 define void @negative_step(ptr nocapture %A) {
@@ -41,8 +42,9 @@ exit:
 ; CHECK-LABEL: 'positive_step'
 ; CHECK: LAA: Found an analyzable loop: loop
 ; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>(Induction step: 1)
+; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>
 ; CHECK-NEXT: LAA: Distance for   %l = load i32, ptr %gep.A, align 4 to   store i32 %add, ptr %gep.A.minus.1, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: 1 Sink induction step: 1
 ; CHECK-NEXT: LAA: Dependence is negative
 
 define void @positive_step(ptr nocapture %A) {
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 60fe8b4fcbed4..8bef7583c35c0 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -5,9 +5,9 @@ define void @test(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        IndirectUnsafe:
 ; CHECK-NEXT:            %load = load double, ptr %gep.sel, align 8 ->
 ; CHECK-NEXT:            store double %load, ptr %gep.sel2, align 8
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 7c1b11e22aef2..f0aed2421a96e 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -276,9 +276,9 @@ define void @single_stride_used_for_trip_count(ptr noalias %A, ptr noalias %B, i
 ; CHECK-LABEL: 'single_stride_used_for_trip_count'
 ; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        IndirectUnsafe:
 ; CHECK-NEXT:            %load = load i32, ptr %gep.A, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep.A.next, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
index b3e3b208bf908..471954f44311d 100644
--- a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
@@ -56,6 +56,12 @@ define void @sle_pre_inc_infinite(i32 %len) {
 ; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
 ; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
 ; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nssw>
 ;
 entry:
   br label %for.body
@@ -121,6 +127,12 @@ define void @ule_pre_inc_infinite(i32 %len) {
 ; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
 ; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
 ; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
index 82b4d0e4fb483..49288c85897fd 100644
--- a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
@@ -240,6 +240,9 @@ define void @test_zext(i64 %N) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (%N /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,2}<nuw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2)
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,2}<nuw><%for.body> Added Flags: <nusw>
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
index e9c13f551b4d0..64306ac28cf27 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
@@ -61,6 +61,9 @@ define void @nw_implies_nsw(i16 %n) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (128 + (-128 smax %n))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {-128,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (128 + (-128 smax %n))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {-128,+,1}<%for.body> Added Flags: <nssw>
 ;
 entry:
   br label %for.body
@@ -107,6 +110,9 @@ define void @actually_infinite() {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is i16 257
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is i16 257
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
 ;
 entry:
   br label %for.body
@@ -132,6 +138,9 @@ define void @rhs_mustexit_1(i16 %n.raw) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<nw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<nw><%for.body> Added Flags: <nusw>
 ;
 entry:
   %n.and = and i16 %n.raw, 255
@@ -233,6 +242,9 @@ define void @neg_rhs_wrong_range(i16 %n.raw) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is ((-1 + (2 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>)) /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {2,+,2}<nw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is ((-1 + (2 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>)) /u 2)
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {2,+,2}<nw><%for.body> Added Flags: <nusw>
 ;
 entry:
   %n.and = and i16 %n.raw, 255
@@ -260,6 +272,9 @@ define void @neg_rhs_maybe_infinite(i16 %n.raw) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
 ;
 entry:
   %n.and = and i16 %n.raw, 255
@@ -382,6 +397,9 @@ define void @ult_constant_rhs_stride2_neg(i16 %n.raw, i8 %start) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
index befcabd911467..6f26a8a64e718 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
@@ -362,3 +362,166 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
+
+; The next two cases check to see if we can infer the flags on the IV
+; of a countup loop using vscale strides.  vscale is a power of two
+; and these are finite loops by assumption.
+
+define void @vscale_slt_noflags(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscale_slt_noflags'
+; CHECK-NEXT:  Classifying expressions for: @vscale_slt_noflags
+; CHECK-NEXT:    %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT:    %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    --> {0,+,vscale}<%for.body> U: full-set S: full-set Exits: (vscale * ((-1 + %n) /u vscale))<nuw> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+; CHECK-NEXT:    --> {%A,+,(4 * vscale)<nuw><nsw>}<%for.body> U: full-set S: full-set Exits: ((4 * vscale * ((-1 + %n) /u vscale)) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %add = add i32 %i.05, %vscale
+; CHECK-NEXT:    --> {vscale,+,vscale}<nw><%for.body> U: full-set S: full-set Exits: (vscale * (1 + ((-1 + %n) /u vscale))<nuw>) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @vscale_slt_noflags
+; CHECK-NEXT:  Loop %for.body: backedge-taken count is ((-1 + %n) /u vscale)
+; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 1073741822
+; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is ((-1 + %n) /u vscale)
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+;
+entry:
+  %vscale = call i32 @llvm.vscale.i32()
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+  %0 = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+  %inc = add nsw <vscale x 4 x i32> %0, splat (i32 1)
+  store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+  %add = add i32 %i.05, %vscale
+  %cmp = icmp slt i32 %add, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define void @vscalex4_ult_noflags(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscalex4_ult_noflags'
+; CHECK-NEXT:  Classifying expressions for: @vscalex4_ult_noflags
+; CHECK-NEXT:    %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT:    %VF = mul i32 %vscale, 4
+; CHECK-NEXT:    --> (4 * vscale)<nuw><nsw> U: [8,4097) S: [8,4097)
+; CHECK-NEXT:    %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    --> {0,+,(4 * vscale)<nuw><nsw>}<%for.body> U: [0,-3) S: [-2147483648,2147483645) Exits: (4 * vscale * ((-1 + %n) /u (4 * vscale)<nuw><nsw>)) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+; CHECK-NEXT:    --> {%A,+,(16 * vscale)<nuw><nsw>}<%for.body> U: full-set S: full-set Exits: ((16 * vscale * ((-1 + %n) /u (4 * vscale)<nuw><nsw>)) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %add = add i32 %i.05, %VF
+; CHECK-NEXT:    --> {(4 * vscale)<nuw><nsw>,+,(4 * vscale)<nuw><nsw>}<nw><%for.body> U: [0,-3) S: [-2147483648,2147483645) Exits: (vscale * (4 + (4 * ((-1 + %n) /u (4 * vscale)<nuw><nsw>))<nuw><nsw>)<nuw>) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @vscalex4_ult_noflags
+; CHECK-NEXT:  Loop %for.body: backedge-taken count is ((-1 + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 536870910
+; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is ((-1 + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+;
+entry:
+  %vscale = call i32 @llvm.vscale.i32()
+  %VF = mul i32 %vscale, 4
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+  %0 = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+  %inc = add nsw <vscale x 4 x i32> %0, splat (i32 1)
+  store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+  %add = add i32 %i.05, %VF
+  %cmp = icmp ult i32 %add, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; The next two cases check to see if we can infer the flags on the IV
+; of a countdown loop using vscale strides.
+
+define void @vscale_countdown_ne(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscale_countdown_ne'
+; CHECK-NEXT:  Classifying expressions for: @vscale_countdown_ne
+; CHECK-NEXT:    %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT:    %start = sub i32 %n, %vscale
+; CHECK-NEXT:    --> ((-1 * vscale)<nsw> + %n) U: full-set S: full-set
+; CHECK-NEXT:    %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+; CHECK-NEXT:    --> {((-1 * vscale)<nsw> + %n),+,(-1 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: ((vscale * (-1 + (-1 * (((-2 * vscale)<nsw> + %n) /u vscale))<nsw>)<nsw>) + %n) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+; CHECK-NEXT:    --> {((4 * %n) + (-4 * vscale)<nsw> + %A),+,(-4 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: ((4 * %n) + (vscale * (-4 + (-4 * (((-2 * vscale)<nsw> + %n) /u vscale)))) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %sub = sub i32 %iv, %vscale
+; CHECK-NEXT:    --> {((-2 * vscale)<nsw> + %n),+,(-1 * vscale)<nsw>}<nw><%for.body> U: full-set S: full-set Exits: ((vscale * (-2 + (-1 * (((-2 * vscale)<nsw> + %n) /u vscale))<nsw>)) + %n) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @vscale_countdown_ne
+; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-2 * vscale)<nsw> + %n) /u vscale)
+; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 2147483647
+; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-2 * vscale)<nsw> + %n) /u vscale)
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+;
+entry:
+  %vscale = call i32 @llvm.vscale.i32()
+  %cmp4 = icmp sgt i32 %n, 0
+  %start = sub i32 %n, %vscale
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+  %ld = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+  %inc = add nsw <vscale x 4 x i32> %ld, splat (i32 1)
+  store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+  %sub = sub i32 %iv, %vscale
+  %cmp = icmp ne i32 %sub, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define void @vscalex4_countdown_ne(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscalex4_countdown_ne'
+; CHECK-NEXT:  Classifying expressions for: @vscalex4_countdown_ne
+; CHECK-NEXT:    %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT:    %VF = shl i32 %vscale, 2
+; CHECK-NEXT:    --> (4 * vscale)<nuw><nsw> U: [8,4097) S: [8,4097)
+; CHECK-NEXT:    %start = sub i32 %n, %VF
+; CHECK-NEXT:    --> ((-4 * vscale)<nsw> + %n) U: full-set S: full-set
+; CHECK-NEXT:    %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+; CHECK-NEXT:    --> {((-4 * vscale)<nsw> + %n),+,(-4 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: ((vscale * (-4 + (-4 * (((-8 * vscale)<nsw> + %n) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) + %n) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+; CHECK-NEXT:    --> {((4 * %n) + (-16 * vscale)<nsw> + %A),+,(-16 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: ((4 * %n) + (vscale * (-16 + (-16 * (((-8 * vscale)<nsw> + %n) /u (4 * vscale)<nuw><nsw>)))) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:    %sub = sub i32 %iv, %VF
+; CHECK-NEXT:    --> {((-8 * vscale)<nsw> + %n),+,(-4 * vscale)<nsw>}<nw><%for.body> U: full-set S: full-set Exits: ((vscale * (-8 + (-4 * (((-8 * vscale)<nsw> + %n) /u (4 * vscale)<nuw><nsw>))<nsw>)) + %n) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @vscalex4_countdown_ne
+; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-8 * vscale)<nsw> + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 536870911
+; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-8 * vscale)<nsw> + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+;
+entry:
+  %vscale = call i32 @llvm.vscale.i32()
+  %VF = shl i32 %vscale, 2
+  %cmp4 = icmp sgt i32 %n, 0
+  %start = sub i32 %n, %VF
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+  %ld = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+  %inc = add nsw <vscale x 4 x i32> %ld, splat (i32 1)
+  store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+  %sub = sub i32 %iv, %VF
+  %cmp = icmp ne i32 %sub, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 680c998a4b39f..b215fc2c2ae74 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -213,6 +213,22 @@ bb:
   ret void
 }
 
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
+  store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
+  store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
+  ret void
+}
+
 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr)
 define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 bb:
@@ -229,6 +245,22 @@ bb:
   ret void
 }
 
+; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
+  store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
+  store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
+  ret void
+}
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -258,8 +290,12 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1))
+declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1))
 declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))
 declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
+declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
+declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
 
 attributes #0 = { nounwind convergent }
 attributes #1 = { nounwind readnone convergent }
diff --git a/llvm/test/Analysis/ValueTracking/known-bits.ll b/llvm/test/Analysis/ValueTracking/known-bits.ll
index 035ccf8d42d13..9d0b153d8ccfc 100644
--- a/llvm/test/Analysis/ValueTracking/known-bits.ll
+++ b/llvm/test/Analysis/ValueTracking/known-bits.ll
@@ -23,3 +23,29 @@ define <4 x i1> @vec_reverse_known_bits_fail(<4 x i8> %xx) {
   %r = icmp slt <4 x i8> %rev, zeroinitializer
   ret <4 x i1> %r
 }
+
+define i1 @vec_reverse_known_bits_demanded(<4 x i8> %xx) {
+; CHECK-LABEL: @vec_reverse_known_bits_demanded(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = or <4 x i8> %xx, <i8 127, i8 55, i8 128, i8 123>
+  %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x)
+  %ele = extractelement <4 x i8> %rev, i64 1
+  %r = icmp slt i8 %ele, 0
+  ret i1 %r
+}
+
+define i1 @vec_reverse_known_bits_demanded_fail(<4 x i8> %xx) {
+; CHECK-LABEL: @vec_reverse_known_bits_demanded_fail(
+; CHECK-NEXT:    [[X:%.*]] = or <4 x i8> [[XX:%.*]], <i8 127, i8 55, i8 -128, i8 123>
+; CHECK-NEXT:    [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]])
+; CHECK-NEXT:    [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[ELE]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = or <4 x i8> %xx, <i8 127, i8 55, i8 128, i8 123>
+  %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x)
+  %ele = extractelement <4 x i8> %rev, i64 2
+  %r = icmp slt i8 %ele, 0
+  ret i1 %r
+}
diff --git a/llvm/test/Analysis/ValueTracking/known-fpclass.ll b/llvm/test/Analysis/ValueTracking/known-fpclass.ll
index 59f3eed715b52..2b8e6298d746a 100644
--- a/llvm/test/Analysis/ValueTracking/known-fpclass.ll
+++ b/llvm/test/Analysis/ValueTracking/known-fpclass.ll
@@ -24,3 +24,33 @@ define <4 x i1> @vector_reverse_fpclass2(<4 x double> nofpclass(nzero) %x) {
   ret <4 x i1> %cmp
 }
 
+define i1 @vector_reverse_fpclass_demanded(<4 x double> %vec, double nofpclass(nzero nan) %x) {
+; CHECK-LABEL: @vector_reverse_fpclass_demanded(
+; CHECK-NEXT:    ret i1 true
+;
+
+  %x.abs = call double @llvm.fabs.f64(double %x)
+  %vec.x = insertelement <4 x double> %vec, double %x.abs, i64 1
+  %rev = call <4 x double> @llvm.vector.reverse(<4 x double> %vec.x)
+  %ele = extractelement <4 x double> %rev, i64 2
+  %cmp = fcmp oge double %ele, 0.0
+  ret i1 %cmp
+}
+
+define i1 @vector_reverse_fpclass_demanded_fail(<4 x double> %vec, double nofpclass(nzero nan) %x) {
+; CHECK-LABEL: @vector_reverse_fpclass_demanded_fail(
+; CHECK-NEXT:    [[X_ABS:%.*]] = call double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[VEC_X:%.*]] = insertelement <4 x double> [[VEC:%.*]], double [[X_ABS]], i64 1
+; CHECK-NEXT:    [[REV:%.*]] = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> [[VEC_X]])
+; CHECK-NEXT:    [[ELE:%.*]] = extractelement <4 x double> [[REV]], i64 1
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge double [[ELE]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+
+  %x.abs = call double @llvm.fabs.f64(double %x)
+  %vec.x = insertelement <4 x double> %vec, double %x.abs, i64 1
+  %rev = call <4 x double> @llvm.vector.reverse(<4 x double> %vec.x)
+  %ele = extractelement <4 x double> %rev, i64 1
+  %cmp = fcmp oge double %ele, 0.0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
index 5704586d92300..db2c4f3a1ed65 100644
--- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll
+++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
@@ -1520,4 +1520,30 @@ define <4 x i1> @vec_reverse_non_zero_fail(<4 x i8> %xx) {
   ret <4 x i1> %r
 }
 
+define i1 @vec_reverse_non_zero_demanded(<4 x i8> %xx) {
+; CHECK-LABEL: @vec_reverse_non_zero_demanded(
+; CHECK-NEXT:    ret i1 false
+;
+  %x = add nuw <4 x i8> %xx, <i8 1, i8 0, i8 0, i8 0>
+  %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x)
+  %ele = extractelement <4 x i8> %rev, i64 3
+  %r = icmp eq i8 %ele, 0
+  ret i1 %r
+}
+
+define i1 @vec_reverse_non_zero_demanded_fail(<4 x i8> %xx) {
+; CHECK-LABEL: @vec_reverse_non_zero_demanded_fail(
+; CHECK-NEXT:    [[X:%.*]] = add nuw <4 x i8> [[XX:%.*]], <i8 1, i8 0, i8 0, i8 0>
+; CHECK-NEXT:    [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]])
+; CHECK-NEXT:    [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[ELE]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = add nuw <4 x i8> %xx, <i8 1, i8 0, i8 0, i8 0>
+  %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x)
+  %ele = extractelement <4 x i8> %rev, i64 2
+  %r = icmp eq i8 %ele, 0
+  ret i1 %r
+}
+
 declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
diff --git a/llvm/test/Analysis/ValueTracking/numsignbits-shl.ll b/llvm/test/Analysis/ValueTracking/numsignbits-shl.ll
new file mode 100644
index 0000000000000..e86d28ebbc1d2
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/numsignbits-shl.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+declare void @escape(i16 %add)
+declare void @escape2(<2 x i16> %add)
+
+define void @numsignbits_shl_zext(i8 %x) {
+; CHECK-LABEL: define void @numsignbits_shl_zext(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i8 [[X]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[ASHR]] to i16
+; CHECK-NEXT:    [[NSB4:%.*]] = shl i16 [[ZEXT]], 10
+; CHECK-NEXT:    [[ADD14:%.*]] = and i16 [[NSB4]], 15360
+; CHECK-NEXT:    call void @escape(i16 [[ADD14]])
+; CHECK-NEXT:    [[ADD13:%.*]] = and i16 [[NSB4]], 7168
+; CHECK-NEXT:    call void @escape(i16 [[ADD13]])
+; CHECK-NEXT:    [[ADD12:%.*]] = and i16 [[NSB4]], 3072
+; CHECK-NEXT:    call void @escape(i16 [[ADD12]])
+; CHECK-NEXT:    [[AND11:%.*]] = and i16 [[NSB4]], 2048
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i16 [[AND11]], [[NSB4]]
+; CHECK-NEXT:    call void @escape(i16 [[ADD11]])
+; CHECK-NEXT:    ret void
+;
+  %ashr = ashr i8 %x, 5
+  %zext = zext i8 %ashr to i16
+  %nsb4 = shl i16 %zext, 10
+  ; Validate ComputeNumSignBits using this simplification:
+  ;   (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  ; 4 sign bits: Goal is to fold away the add for bits 12-14.
+  %and14 = and i16 %nsb4, 16384
+  %add14 = add i16 %and14, %nsb4
+  call void @escape(i16 %add14)
+  %and13 = and i16 %nsb4, 8192
+  %add13 = add i16 %and13, %nsb4
+  call void @escape(i16 %add13)
+  %and12 = and i16 %nsb4, 4096
+  %add12 = add i16 %and12, %nsb4
+  call void @escape(i16 %add12)
+  %and11 = and i16 %nsb4, 2048
+  %add11 = add i16 %and11, %nsb4
+  call void @escape(i16 %add11)
+  ret void
+}
+
+define void @numsignbits_shl_zext_shift_amounr_matches_extend(i8 %x) {
+; CHECK-LABEL: define void @numsignbits_shl_zext_shift_amounr_matches_extend(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i8 [[X]], 2
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[ASHR]] to i16
+; CHECK-NEXT:    [[NSB3:%.*]] = shl nuw i16 [[ZEXT]], 8
+; CHECK-NEXT:    [[ADD14:%.*]] = and i16 [[NSB3]], 16128
+; CHECK-NEXT:    call void @escape(i16 [[ADD14]])
+; CHECK-NEXT:    [[ADD13:%.*]] = and i16 [[NSB3]], 7936
+; CHECK-NEXT:    call void @escape(i16 [[ADD13]])
+; CHECK-NEXT:    [[AND12:%.*]] = and i16 [[NSB3]], 4096
+; CHECK-NEXT:    [[ADD12:%.*]] = add nsw i16 [[AND12]], [[NSB3]]
+; CHECK-NEXT:    call void @escape(i16 [[ADD12]])
+; CHECK-NEXT:    ret void
+;
+  %ashr = ashr i8 %x, 2
+  %zext = zext i8 %ashr to i16
+  %nsb3 = shl i16 %zext, 8
+  ; Validate ComputeNumSignBits using this simplification:
+  ;   (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  ; 3 sign bits: Goal is to fold away the add for bits 13-14.
+  %and14 = and i16 %nsb3, 16384
+  %add14 = add i16 %and14, %nsb3
+  call void @escape(i16 %add14)
+  %and13 = and i16 %nsb3, 8192
+  %add13 = add i16 %and13, %nsb3
+  call void @escape(i16 %add13)
+  %and12 = and i16 %nsb3, 4096
+  %add12 = add i16 %and12, %nsb3
+  call void @escape(i16 %add12)
+  ret void
+}
+
+define void @numsignbits_shl_zext_extended_bits_remains(i8 %x) {
+; CHECK-LABEL: define void @numsignbits_shl_zext_extended_bits_remains(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i8 [[X]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[ASHR]] to i16
+; CHECK-NEXT:    [[NSB1:%.*]] = shl nuw nsw i16 [[ZEXT]], 7
+; CHECK-NEXT:    [[AND14:%.*]] = and i16 [[NSB1]], 16384
+; CHECK-NEXT:    [[ADD14:%.*]] = add nuw i16 [[AND14]], [[NSB1]]
+; CHECK-NEXT:    call void @escape(i16 [[ADD14]])
+; CHECK-NEXT:    ret void
+;
+  %ashr = ashr i8 %x, 5
+  %zext = zext i8 %ashr to i16
+  %nsb1 = shl i16 %zext, 7
+  ; Validate ComputeNumSignBits using this simplification:
+  ;   (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  ; 1 sign bit: The add can't be folded away here.
+  %and14 = and i16 %nsb1, 16384
+  %add14 = add i16 %and14, %nsb1
+  call void @escape(i16 %add14)
+  ret void
+}
+
+define void @numsignbits_shl_zext_all_bits_shifted_out(i8 %x) {
+; CHECK-LABEL: define void @numsignbits_shl_zext_all_bits_shifted_out(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ASHR:%.*]] = lshr i8 [[X]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg i8 [[ASHR]] to i16
+; CHECK-NEXT:    [[NSB1:%.*]] = shl i16 [[ZEXT]], 14
+; CHECK-NEXT:    [[AND14:%.*]] = and i16 [[NSB1]], 16384
+; CHECK-NEXT:    [[ADD14:%.*]] = add i16 [[AND14]], [[NSB1]]
+; CHECK-NEXT:    call void @escape(i16 [[ADD14]])
+; CHECK-NEXT:    ret void
+;
+  %ashr = ashr i8 %x, 5
+  %zext = zext i8 %ashr to i16
+  %nsb1 = shl i16 %zext, 14
+  ; Validate ComputeNumSignBits using this simplification:
+  ;   (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  ; 1 sign bit: The add can't be folded away here.
+  %and14 = and i16 %nsb1, 16384
+  %add14 = add i16 %and14, %nsb1
+  call void @escape(i16 %add14)
+  ret void
+}
+
+define void @numsignbits_shl_zext_vector(<2 x i8> %x) {
+; CHECK-LABEL: define void @numsignbits_shl_zext_vector(
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr <2 x i8> [[X]], <i8 5, i8 5>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i8> [[ASHR]] to <2 x i16>
+; CHECK-NEXT:    [[NSB4:%.*]] = shl <2 x i16> [[ZEXT]], <i16 10, i16 10>
+; CHECK-NEXT:    [[ADD14:%.*]] = and <2 x i16> [[NSB4]], <i16 15360, i16 15360>
+; CHECK-NEXT:    call void @escape2(<2 x i16> [[ADD14]])
+; CHECK-NEXT:    [[ADD13:%.*]] = and <2 x i16> [[NSB4]], <i16 7168, i16 7168>
+; CHECK-NEXT:    call void @escape2(<2 x i16> [[ADD13]])
+; CHECK-NEXT:    [[ADD12:%.*]] = and <2 x i16> [[NSB4]], <i16 3072, i16 3072>
+; CHECK-NEXT:    call void @escape2(<2 x i16> [[ADD12]])
+; CHECK-NEXT:    [[AND11:%.*]] = and <2 x i16> [[NSB4]], <i16 2048, i16 2048>
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw <2 x i16> [[AND11]], [[NSB4]]
+; CHECK-NEXT:    call void @escape2(<2 x i16> [[ADD11]])
+; CHECK-NEXT:    ret void
+;
+  %ashr = ashr <2 x i8> %x, <i8 5, i8 5>
+  %zext = zext <2 x i8> %ashr to <2 x i16>
+  %nsb4 = shl <2 x i16> %zext, <i16 10, i16 10>
+  ; Validate ComputeNumSignBits using this simplification:
+  ;   (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  ; 4 sign bits: Goal is to fold away the add for bits 12-14.
+  %and14 = and <2 x i16> %nsb4, <i16 16384, i16 16384>
+  %add14 = add <2 x i16> %and14, %nsb4
+  call void @escape2(<2 x i16> %add14)
+  %and13 = and <2 x i16> %nsb4, <i16 8192, i16 8192>
+  %add13 = add <2 x i16> %and13, %nsb4
+  call void @escape2(<2 x i16> %add13)
+  %and12 = and <2 x i16> %nsb4, <i16 4096, i16 4096>
+  %add12 = add <2 x i16> %and12, %nsb4
+  call void @escape2(<2 x i16> %add12)
+  %and11 = and <2 x i16> %nsb4, <i16 2048, i16 2048>
+  %add11 = add <2 x i16> %and11, %nsb4
+  call void @escape2(<2 x i16> %add11)
+  ret void
+}
diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll
deleted file mode 100644
index 608347e0fceb1..0000000000000
--- a/llvm/test/Assembler/x86mmx.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | FileCheck %s
-; RUN: verify-uselistorder %s
-; Basic smoke test for x86_mmx type.
-
-; CHECK: define x86_mmx @sh16
-define x86_mmx  @sh16(x86_mmx %A) {
-; CHECK: ret x86_mmx %A
-        ret x86_mmx %A
-}
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index ab9acbc0a66a5..45e3d0357ebdf 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -70,7 +70,7 @@ define void @types() {
   %9 = alloca [3 x i22], align 4
   %10 = alloca ptr addrspace(5), align 8
   %11 = alloca <5 x ptr>, align 64
-  %12 = alloca x86_mmx, align 8
+  %12 = alloca <1 x i64>, align 8
   ret void
 }
 
diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll
index cbe6f5d22c947..f1732db174c29 100644
--- a/llvm/test/Bitcode/bcanalyzer-types.ll
+++ b/llvm/test/Bitcode/bcanalyzer-types.ll
@@ -3,7 +3,6 @@
 ; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID)
 ; CHECK: BFLOAT
 ; CHECK: TOKEN
-; CHECK: X86_MMX
 ; CHECK: HALF
 ; CHECK: Block ID
 
@@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) {
   ret half %a
 }
 
-define x86_mmx @test_mmx(<2 x i32> %x) {
-  %a = bitcast <2 x i32> %x to x86_mmx
-  ret x86_mmx %a
-}
-
 define bfloat @test_bfloat(i16 %x) {
   %a = bitcast i16 %x to bfloat
   ret bfloat %a
diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll
index 2190e2fbccf28..37a87eea41ad3 100644
--- a/llvm/test/Bitcode/compatibility-3.6.ll
+++ b/llvm/test/Bitcode/compatibility-3.6.ll
@@ -645,7 +645,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll
index 7e59b5c1be6e2..8de2132d7ec89 100644
--- a/llvm/test/Bitcode/compatibility-3.7.ll
+++ b/llvm/test/Bitcode/compatibility-3.7.ll
@@ -689,7 +689,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll
index ebd1f2fff8c94..7f766aa34a005 100644
--- a/llvm/test/Bitcode/compatibility-3.8.ll
+++ b/llvm/test/Bitcode/compatibility-3.8.ll
@@ -742,7 +742,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll
index c34f04ceb0de3..c8309175e063f 100644
--- a/llvm/test/Bitcode/compatibility-3.9.ll
+++ b/llvm/test/Bitcode/compatibility-3.9.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll
index 05bffda1d117a..adbd91ac6c7fe 100644
--- a/llvm/test/Bitcode/compatibility-4.0.ll
+++ b/llvm/test/Bitcode/compatibility-4.0.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll
index 0c872289c62ba..1b500da69568a 100644
--- a/llvm/test/Bitcode/compatibility-5.0.ll
+++ b/llvm/test/Bitcode/compatibility-5.0.ll
@@ -820,7 +820,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll
index 44c680885be34..c1abbf0cda6eb 100644
--- a/llvm/test/Bitcode/compatibility-6.0.ll
+++ b/llvm/test/Bitcode/compatibility-6.0.ll
@@ -830,7 +830,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index e437c37d8d1c8..e5592b347425a 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1112,8 +1112,6 @@ define void @typesystem() {
   ; CHECK: %t5 = alloca x86_fp80
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
-  %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
   %t8 = alloca ptr
   ; CHECK: %t8 = alloca ptr
   %t9 = alloca <4 x i32>
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 603f33edaf761..81fa72618bfd1 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -78,6 +78,7 @@ set(LLVM_TEST_DEPENDS
           llvm-cfi-verify
           llvm-config
           llvm-cov
+          llvm-ctxprof-util
           llvm-cvtres
           llvm-cxxdump
           llvm-cxxfilt
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index b619aac709d98..de3f323891a36 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -2659,9 +2659,9 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB35_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldaxrb w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-NOLSE-O1-NEXT:    stlxrb w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB35_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2674,9 +2674,9 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB35_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldaxrb w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-OUTLINE-O1-NEXT:    stlxrb w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB35_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2781,9 +2781,9 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB36_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrb w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-NOLSE-O1-NEXT:    stxrb w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB36_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2796,9 +2796,9 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB36_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldxrb w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-OUTLINE-O1-NEXT:    stxrb w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB36_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3714,9 +3714,9 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB45_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldaxrh w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-NOLSE-O1-NEXT:    stlxrh w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB45_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3729,9 +3729,9 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB45_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldaxrh w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-OUTLINE-O1-NEXT:    stlxrh w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB45_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3836,9 +3836,9 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB46_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrh w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-NOLSE-O1-NEXT:    stxrh w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB46_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3851,9 +3851,9 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB46_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldxrh w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-OUTLINE-O1-NEXT:    stxrh w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB46_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index eb94cc5d0fb61..314c5458e3090 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -2321,6 +2321,54 @@ define float @test_tan_f32(float %x) {
   ret float %y
 }
 
+declare float @llvm.acos.f32(float)
+define float @test_acos_f32(float %x) {
+  ; CHECK-LABEL: name:            test_acos_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FACOS %{{[0-9]+}}
+  %y = call float @llvm.acos.f32(float %x)
+  ret float %y
+}
+
+declare float @llvm.asin.f32(float)
+define float @test_asin_f32(float %x) {
+  ; CHECK-LABEL: name:            test_asin_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FASIN %{{[0-9]+}}
+  %y = call float @llvm.asin.f32(float %x)
+  ret float %y
+}
+
+declare float @llvm.atan.f32(float)
+define float @test_atan_f32(float %x) {
+  ; CHECK-LABEL: name:            test_atan_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FATAN %{{[0-9]+}}
+  %y = call float @llvm.atan.f32(float %x)
+  ret float %y
+}
+
+declare float @llvm.cosh.f32(float)
+define float @test_cosh_f32(float %x) {
+  ; CHECK-LABEL: name:            test_cosh_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FCOSH %{{[0-9]+}}
+  %y = call float @llvm.cosh.f32(float %x)
+  ret float %y
+}
+
+declare float @llvm.sinh.f32(float)
+define float @test_sinh_f32(float %x) {
+  ; CHECK-LABEL: name:            test_sinh_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FSINH %{{[0-9]+}}
+  %y = call float @llvm.sinh.f32(float %x)
+  ret float %y
+}
+
+declare float @llvm.tanh.f32(float)
+define float @test_tanh_f32(float %x) {
+  ; CHECK-LABEL: name:            test_tanh_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FTANH %{{[0-9]+}}
+  %y = call float @llvm.tanh.f32(float %x)
+  ret float %y
+}
+
 declare float @llvm.sqrt.f32(float)
 define float @test_sqrt_f32(float %x) {
   ; CHECK-LABEL: name:            test_sqrt_f32
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index c8d313cf31afd..c6819ff39ed33 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -926,16 +926,16 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 7
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umin ptr %ptr, i8 %rhs seq_cst, !pcsections !0
   ret i8 %res
@@ -954,16 +954,16 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 7
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umax ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
@@ -1179,16 +1179,16 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 15
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umin ptr %ptr, i16 %rhs seq_cst, !pcsections !0
   ret i16 %res
@@ -1207,16 +1207,16 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 15
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umax ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
new file mode 100644
index 0000000000000..0f436127ea2eb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRE
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-POST
+
+---
+name:            test_combine_trunc_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: %rhs:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %rhs(s64)
+    ; CHECK-PRE-NEXT: %small:_(s32) = G_SELECT %cond(s32), [[TRUNC]], [[TRUNC1]]
+    ; CHECK-PRE-NEXT: $w0 = COPY %small(s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %rhs:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %small:_(s32) = G_TRUNC %res(s64)
+    ; CHECK-POST-NEXT: $w0 = COPY %small(s32)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = COPY $x0
+    %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name:            test_combine_zext_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_zext_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %lhs(s32)
+    ; CHECK-PRE-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT %rhs(s32)
+    ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK-PRE-NEXT: $x0 = COPY %big(s64)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_zext_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %big:_(s64) = G_ZEXT %res(s32)
+    ; CHECK-POST-NEXT: $x0 = COPY %big(s64)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ZEXT %res(s32)
+    $x0 = COPY %big(s64)
+...
+---
+name:            test_combine_anyzext_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_anyzext_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %lhs(s32)
+    ; CHECK-PRE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT %rhs(s32)
+    ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ANYEXT]], [[ANYEXT1]]
+    ; CHECK-PRE-NEXT: $x0 = COPY %big(s64)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_anyzext_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %big:_(s64) = G_ANYEXT %res(s32)
+    ; CHECK-POST-NEXT: $x0 = COPY %big(s64)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ANYEXT %res(s32)
+    $x0 = COPY %big(s64)
+...
+---
+name:            test_combine_anyzext_select_multi_use
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_anyzext_select_multi_use
+    ; CHECK: %cond:_(s32) = COPY $w0
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-NEXT: %big:_(s64) = G_ANYEXT %res(s32)
+    ; CHECK-NEXT: $x0 = COPY %big(s64)
+    ; CHECK-NEXT: $w0 = COPY %res(s32)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ANYEXT %res(s32)
+    $x0 = COPY %big(s64)
+    $w0 = COPY %res(s32)
+...
+---
+name:            test_combine_trunc_select_vector_out_of_budget
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_select_vector_out_of_budget
+    ; CHECK: %cond:_(<2 x s32>) = COPY $x0
+    ; CHECK-NEXT: %arg1:_(s64) = COPY $x0
+    ; CHECK-NEXT: %arg2:_(s64) = COPY $x0
+    ; CHECK-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    ; CHECK-NEXT: %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64)
+    ; CHECK-NEXT: %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
+    ; CHECK-NEXT: %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %small(<2 x s32>)
+    %cond:_(<2 x s32>) = COPY $x0
+    %arg1:_(s64) = COPY $x0
+    %arg2:_(s64) = COPY $x0
+    %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64)
+    %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
+    %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
+    $x0 = COPY %small(<2 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
index 9f1d403340f49..4a38b5d4c63dd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
@@ -1,36 +1,51 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRE
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-POST
+
 ---
 name:            test_combine_trunc_undef
+legalized: true
 body:             |
   bb.1:
     ; CHECK-LABEL: name: test_combine_trunc_undef
     ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK: $w0 = COPY [[DEF]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
     %0:_(s64) = G_IMPLICIT_DEF
     %1:_(s32) = G_TRUNC %0(s64)
     $w0 = COPY %1(s32)
 ...
 ---
 name:            test_combine_trunc_undef_vec
+legalized: true
 body:             |
   bb.1:
     ; CHECK-LABEL: name: test_combine_trunc_undef_vec
     ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
-    ; CHECK: $x0 = COPY [[DEF]](<2 x s32>)
+    ; CHECK-NEXT: $x0 = COPY [[DEF]](<2 x s32>)
     %0:_(<2 x s64>) = G_IMPLICIT_DEF
     %1:_(<2 x s32>) = G_TRUNC %0(<2 x s64>)
     $x0 = COPY %1(<2 x s32>)
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[ANYEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_ANYEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -38,13 +53,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s16_vec
+legalized: true
 body:             |
   bb.1:
   liveins: $s0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>)
-    ; CHECK: $x0 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
+    ; CHECK-PRE: liveins: $s0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>)
+    ; CHECK-PRE-NEXT: $x0 = COPY [[ANYEXT]](<2 x s32>)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
+    ; CHECK-POST: liveins: $s0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s64>) = G_ANYEXT [[COPY]](<2 x s16>)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[ANYEXT]](<2 x s64>)
+    ; CHECK-POST-NEXT: $x0 = COPY [[TRUNC]](<2 x s32>)
     %0:_(<2 x s16>) = COPY $s0
     %1:_(<2 x s64>) = G_ANYEXT %0(<2 x s16>)
     %2:_(<2 x s32>) = G_TRUNC %1(<2 x s64>)
@@ -52,13 +78,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_sext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_sext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[SEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_sext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[SEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_sext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_SEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -66,13 +103,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_zext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_zext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[ZEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_zext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[ZEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_zext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ZEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_ZEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -80,12 +128,23 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s32
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s32
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s32
+    ; CHECK-PRE: liveins: $w0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: $w0 = COPY [[COPY]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s32
+    ; CHECK-POST: liveins: $w0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s32) = COPY $w0
     %1:_(s64) = G_ANYEXT %0(s32)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -93,13 +152,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s64
+legalized: true
 body:             |
   bb.1:
   liveins: $x0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s64
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s64
+    ; CHECK-PRE: liveins: $x0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s64
+    ; CHECK-POST: liveins: $x0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s128) = G_ANYEXT [[COPY]](s64)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s128)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s64) = COPY $x0
     %1:_(s128) = G_ANYEXT %0(s64)
     %2:_(s32) = G_TRUNC %1(s128)
@@ -107,15 +177,27 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_shl_s32_by_2
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
-    ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_2
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK: $h0 = COPY [[SHL]](s16)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_shl_s32_by_2
+    ; CHECK-PRE: liveins: $w0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-PRE-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s32)
+    ; CHECK-PRE-NEXT: $h0 = COPY [[SHL]](s16)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_shl_s32_by_2
+    ; CHECK-POST: liveins: $w0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-POST-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; CHECK-POST-NEXT: $h0 = COPY [[TRUNC]](s16)
     %0:_(s32) = COPY $w0
     %1:_(s32) = G_CONSTANT i32 2
     %2:_(s32) = G_SHL %0(s32), %1(s32)
@@ -124,18 +206,41 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_shl_s32_by_17
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
     ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_17
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
-    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
-    ; CHECK: $h0 = COPY [[TRUNC]](s16)
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; CHECK-NEXT: $h0 = COPY [[TRUNC]](s16)
     %0:_(s32) = COPY $w0
     %1:_(s32) = G_CONSTANT i32 17
     %2:_(s32) = G_SHL %0(s32), %1(s32)
     %3:_(s16) = G_TRUNC %2(s32)
     $h0 = COPY %3(s16)
 ...
+---
+name:            test_combine_trunc_multi_use
+legalized: true
+body:             |
+  bb.1:
+  liveins: $w0
+    ; CHECK-LABEL: name: test_combine_trunc_multi_use
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ZEXT]](s64)
+    ; CHECK-NEXT: $h0 = COPY [[TRUNC]](s16)
+    ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64)
+    %0:_(s32) = COPY $w0
+    %2:_(s64) = G_ZEXT %0(s32)
+    %3:_(s16) = G_TRUNC %2(s64)
+    $h0 = COPY %3(s16)
+    $x0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-sucmp.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-sucmp.ll
new file mode 100644
index 0000000000000..1fa21bfb733e8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-sucmp.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -global-isel -stop-after=irtranslator %s -o - | FileCheck %s
+
+define void @scmp_i32(i32 %arg1, i32 %arg2) {
+  ; CHECK-LABEL: name: scmp_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+  ; CHECK-NEXT:   [[SCMP:%[0-9]+]]:_(s4) = G_SCMP [[COPY]](s32), [[COPY1]]
+  ; CHECK-NEXT:   RET_ReallyLR
+  %res4 = call i4 @llvm.scmp.i4.i32(i32 %arg1, i32 %arg2)
+  ret void
+}
+
+define void @scmp_4_32i(<4 x i32> %arg1, <4 x i32> %arg2) {
+  ; CHECK-LABEL: name: scmp_4_32i
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK-NEXT:   [[SCMP:%[0-9]+]]:_(<4 x s32>) = G_SCMP [[COPY]](<4 x s32>), [[COPY1]]
+  ; CHECK-NEXT:   RET_ReallyLR
+  %res4 = call <4 x i32> @llvm.scmp.v4i32.i32(<4 x i32> %arg1, <4 x i32> %arg2)
+  ret void
+}
+
+define void @ucmp_i32(i32 %arg1, i32 %arg2) {
+  ; CHECK-LABEL: name: ucmp_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+  ; CHECK-NEXT:   [[UCMP:%[0-9]+]]:_(s4) = G_UCMP [[COPY]](s32), [[COPY1]]
+  ; CHECK-NEXT:   RET_ReallyLR
+  %res4 = call i4 @llvm.ucmp.i4.i32(i32 %arg1, i32 %arg2)
+  ret void
+}
+
+define void @ucmp_4_32i(<4 x i32> %arg1, <4 x i32> %arg2) {
+  ; CHECK-LABEL: name: ucmp_4_32i
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK-NEXT:   [[UCMP:%[0-9]+]]:_(<4 x s32>) = G_UCMP [[COPY]](<4 x s32>), [[COPY1]]
+  ; CHECK-NEXT:   RET_ReallyLR
+  %res4 = call <4 x i32> @llvm.ucmp.v4i32.i32(<4 x i32> %arg1, <4 x i32> %arg2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-acos.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-acos.mir
new file mode 100644
index 0000000000000..fd33a4198da6b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-acos.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.acos
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.acos
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &acosf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &acosf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &acosf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &acosf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FACOS %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.acos
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.acos
+
+    ; This is big, so let's just check for the 8 calls to acosf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: BL &acosf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FACOS %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.acos
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.acos
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FACOS %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.acos
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.acos
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &acosf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FACOS %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.acos
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.acos
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &acos
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &acos
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FACOS %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_acos_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_acos_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &acosf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FACOS %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-asin.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-asin.mir
new file mode 100644
index 0000000000000..981a3ff542042
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-asin.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.asin
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.asin
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &asinf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &asinf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &asinf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &asinf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FASIN %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.asin
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.asin
+
+    ; This is big, so let's just check for the 8 calls to asinf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: BL &asinf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FASIN %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.asin
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.asin
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FASIN %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.asin
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.asin
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &asinf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FASIN %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.asin
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.asin
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &asin
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &asin
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FASIN %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_asin_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_asin_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &asinf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FASIN %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-atan.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-atan.mir
new file mode 100644
index 0000000000000..7e0015ff7c797
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-atan.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.atan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.atan
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &atanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &atanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &atanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &atanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FATAN %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.atan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.atan
+
+    ; This is big, so let's just check for the 8 calls to atanf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: BL &atanf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FATAN %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.atan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.atan
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FATAN %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.atan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.atan
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &atanf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FATAN %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.atan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.atan
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &atan
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &atan
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FATAN %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_atan_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_atan_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &atanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FATAN %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cosh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cosh.mir
new file mode 100644
index 0000000000000..653b447ed24b8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cosh.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.cosh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.cosh
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &coshf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &coshf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &coshf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &coshf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FCOSH %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.cosh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.cosh
+
+    ; This is big, so let's just check for the 8 calls to coshf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: BL &coshf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FCOSH %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.cosh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.cosh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FCOSH %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.cosh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.cosh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &coshf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FCOSH %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.cosh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.cosh
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &cosh
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &cosh
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FCOSH %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_cosh_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_cosh_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &coshf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FCOSH %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-sinh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-sinh.mir
new file mode 100644
index 0000000000000..8cf3409fd0d2d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-sinh.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.sinh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.sinh
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &sinhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &sinhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &sinhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &sinhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FSINH %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.sinh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.sinh
+
+    ; This is big, so let's just check for the 8 calls to sinhf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: BL &sinhf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FSINH %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.sinh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.sinh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FSINH %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.sinh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.sinh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &sinhf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FSINH %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.sinh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.sinh
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &sinh
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &sinh
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FSINH %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_sinh_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_sinh_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &sinhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FSINH %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tanh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tanh.mir
new file mode 100644
index 0000000000000..2cab146996faa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tanh.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.tanh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.tanh
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &tanhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &tanhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &tanhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &tanhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FTANH %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.tanh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.tanh
+
+    ; This is big, so let's just check for the 8 calls to tanhf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: BL &tanhf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FTANH %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.tanh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.tanh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FTANH %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.tanh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.tanh
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &tanhf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FTANH %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.tanh
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.tanh
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &tanh
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &tanh
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FTANH %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_tanh_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_tanh_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &tanhf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FTANH %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
new file mode 100644
index 0000000000000..e69f79bdd187a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
@@ -0,0 +1,145 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s
+---
+name:            test_scmp
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_scmp
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[C2]], [[SELECT]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SELECT1]], 2
+    ; CHECK-NEXT: $w0 = COPY [[SEXT_INREG]](s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %4:_(s2) = G_SCMP %0(s64), %1
+    %14:_(s32) = G_SEXT %4(s2)
+    $w0 = COPY %14(s32)
+
+...
+---
+name:            test_ucmp
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_ucmp
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[C2]], [[SELECT]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SELECT1]], 2
+    ; CHECK-NEXT: $w0 = COPY [[SEXT_INREG]](s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %4:_(s2) = G_UCMP %0(s64), %1
+    %14:_(s32) = G_SEXT %4(s2)
+    $w0 = COPY %14(s32)
+
+...
+---
+name:            test_ucmp_vector
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_ucmp_vector
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[TRUNC]], [[UV]]
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR3]](<8 x s8>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s16>), [[UV3:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV2]], [[TRUNC1]]
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR4]](<8 x s8>)
+    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV4]], [[XOR]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C3]](s8), [[C3]](s8), [[C3]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR5]](<8 x s8>)
+    ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s16>), [[UV7:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT3]](<8 x s16>)
+    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(<4 x s16>) = G_XOR [[TRUNC2]], [[UV6]]
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C3]](s8), [[C3]](s8), [[C3]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR6]](<8 x s8>)
+    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT4]](<8 x s16>)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV8]], [[TRUNC3]]
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<4 x s16>) = G_AND [[OR]], [[XOR1]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND2]], [[AND3]]
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR1]](<4 x s16>)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(<4 x s32>) = G_SEXT_INREG [[ANYEXT5]], 2
+    ; CHECK-NEXT: $q0 = COPY [[SEXT_INREG]](<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s32) = COPY $w0
+    %6:_(s32) = COPY $w1
+    %7:_(s32) = COPY $w2
+    %8:_(s32) = COPY $w3
+    %9:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
+    %10:_(<4 x s2>) = G_UCMP %4(<4 x s32>), %9
+    %11:_(<4 x s32>) = G_SEXT %10(<4 x s2>)
+    $q0 = COPY %11(<4 x s32>)
+
+...
+---
+name:            test_ucmp_i128
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_ucmp_i128
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s32), [[ICMP5]], [[ICMP3]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[SELECT2]](s32), [[C2]], [[SELECT1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SELECT3]], 2
+    ; CHECK-NEXT: $w0 = COPY [[SEXT_INREG]](s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %l:_(s128) = G_ANYEXT %0
+    %r:_(s128) = G_ANYEXT %1
+    %4:_(s2) = G_UCMP %l(s128), %r
+    %14:_(s32) = G_SEXT %4(s2)
+    $w0 = COPY %14(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 61ea3fb998374..87a415b45cca9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -351,6 +351,13 @@
 # DEBUG-NEXT: G_FCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: G_SCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: G_UCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_SELECT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
@@ -686,23 +693,29 @@
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FACOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FASIN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FATAN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCOSH (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSINH (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FTANH (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 3af2aaf57eed8..dc2e1c5dc28d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -535,13 +535,13 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
     ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
-    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
-    ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]]
+    ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
     ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
-    ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
-    ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+    ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
     ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]]
     ; CHECK-NEXT: RET_ReallyLR implicit $x2
     %0:gpr(s64) = COPY $x0
@@ -571,19 +571,36 @@ body:             |
     liveins: $x0, $x1, $x2
     liveins: $w1, $x0
 
-    ; CHECK-LABEL: name: ldrhrox_more_than_one_mem_use_shl
-    ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
-    ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
-    ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
-    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
-    ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103
-    ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
-    ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
-    ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]]
-    ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]]
+    ; CHECK-FAST-LABEL: name: ldrhrox_more_than_one_mem_use_shl
+    ; CHECK-FAST: liveins: $x0, $x1, $x2, $w1, $x0
+    ; CHECK-FAST-NEXT: {{  $}}
+    ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-FAST-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
+    ; CHECK-FAST-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
+    ; CHECK-FAST-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
+    ; CHECK-FAST-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103
+    ; CHECK-FAST-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
+    ; CHECK-FAST-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
+    ; CHECK-FAST-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]]
+    ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDWrr]]
+    ;
+    ; CHECK-SLOW-LABEL: name: ldrhrox_more_than_one_mem_use_shl
+    ; CHECK-SLOW: liveins: $x0, $x1, $x2, $w1, $x0
+    ; CHECK-SLOW-NEXT: {{  $}}
+    ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-SLOW-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
+    ; CHECK-SLOW-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
+    ; CHECK-SLOW-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
+    ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]].sub_32
+    ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]]
+    ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY [[COPY]]
+    ; CHECK-SLOW-NEXT: [[ADDXrx:%[0-9]+]]:gpr64sp = ADDXrx [[COPY4]], [[COPY3]], 1
+    ; CHECK-SLOW-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16))
+    ; CHECK-SLOW-NEXT: [[LDRHHui1:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16))
+    ; CHECK-SLOW-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHui]], [[LDRHHui1]]
+    ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDWrr]]
     %0:gpr(p0) = COPY $x0
     %1:gpr(s32) = COPY $w1
     %15:gpr(s64) = G_CONSTANT i64 9
@@ -612,19 +629,36 @@ body:             |
     liveins: $x0, $x1, $x2
     liveins: $w1, $x0
 
-    ; CHECK-LABEL: name: ldrhrox_more_than_one_use_shl
-    ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
-    ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
-    ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
-    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
-    ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103
-    ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
-    ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
-    ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]]
-    ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]]
+    ; CHECK-FAST-LABEL: name: ldrhrox_more_than_one_use_shl
+    ; CHECK-FAST: liveins: $x0, $x1, $x2, $w1, $x0
+    ; CHECK-FAST-NEXT: {{  $}}
+    ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-FAST-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
+    ; CHECK-FAST-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
+    ; CHECK-FAST-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
+    ; CHECK-FAST-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103
+    ; CHECK-FAST-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
+    ; CHECK-FAST-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16))
+    ; CHECK-FAST-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]]
+    ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDWrr]]
+    ;
+    ; CHECK-SLOW-LABEL: name: ldrhrox_more_than_one_use_shl
+    ; CHECK-SLOW: liveins: $x0, $x1, $x2, $w1, $x0
+    ; CHECK-SLOW-NEXT: {{  $}}
+    ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-SLOW-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31
+    ; CHECK-SLOW-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0
+    ; CHECK-SLOW-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
+    ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]].sub_32
+    ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]]
+    ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY [[COPY]]
+    ; CHECK-SLOW-NEXT: [[ADDXrx:%[0-9]+]]:gpr64sp = ADDXrx [[COPY4]], [[COPY3]], 1
+    ; CHECK-SLOW-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16))
+    ; CHECK-SLOW-NEXT: [[LDRHHui1:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16))
+    ; CHECK-SLOW-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHui]], [[LDRHHui1]]
+    ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDWrr]]
     %0:gpr(p0) = COPY $x0
     %1:gpr(s32) = COPY $w1
     %15:gpr(s64) = G_CONSTANT i64 9
@@ -656,15 +690,15 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
     ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 62, 61
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
-    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
-    ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[ADDXrr]], 0 :: (load (s32) from %ir.addr)
-    ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[LDRWui]], 0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]]
+    ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY1]], [[COPY]], 0, 1 :: (load (s32) from %ir.addr)
+    ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[LDRWroX]], 0
     ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32
     ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 2, 0
     ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[SUBREG_TO_REG]], [[ADDXri]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
-    ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+    ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
     ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]]
     ; CHECK-NEXT: RET_ReallyLR implicit $x2
     %0:gpr(s64) = COPY $x0
@@ -692,21 +726,37 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     liveins: $x0, $x1, $x2
-    ; CHECK-LABEL: name: ldrqrox_more_than_one_use_shl
-    ; CHECK: liveins: $x0, $x1, $x2
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
-    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
-    ; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADDXrr]], 0 :: (load (s128) from %ir.addr)
-    ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[LDRQui]].dsub
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]]
-    ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY3]], [[ADDXri]]
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
-    ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXrr1]]
-    ; CHECK-NEXT: RET_ReallyLR implicit [[ADDXrr2]]
+    ; CHECK-FAST-LABEL: name: ldrqrox_more_than_one_use_shl
+    ; CHECK-FAST: liveins: $x0, $x1, $x2
+    ; CHECK-FAST-NEXT: {{  $}}
+    ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-FAST-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59
+    ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]]
+    ; CHECK-FAST-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]]
+    ; CHECK-FAST-NEXT: [[LDRQroX:%[0-9]+]]:fpr128 = LDRQroX [[COPY1]], [[COPY]], 0, 1 :: (load (s128) from %ir.addr)
+    ; CHECK-FAST-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0
+    ; CHECK-FAST-NEXT: [[COPY3:%[0-9]+]]:fpr64 = COPY [[LDRQroX]].dsub
+    ; CHECK-FAST-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[COPY3]]
+    ; CHECK-FAST-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXri]]
+    ; CHECK-FAST-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
+    ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDXrr2]]
+    ;
+    ; CHECK-SLOW-LABEL: name: ldrqrox_more_than_one_use_shl
+    ; CHECK-SLOW: liveins: $x0, $x1, $x2
+    ; CHECK-SLOW-NEXT: {{  $}}
+    ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-SLOW-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59
+    ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK-SLOW-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK-SLOW-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADDXrr]], 0 :: (load (s128) from %ir.addr)
+    ; CHECK-SLOW-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0
+    ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[LDRQui]].dsub
+    ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]]
+    ; CHECK-SLOW-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY3]], [[ADDXri]]
+    ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+    ; CHECK-SLOW-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXrr1]]
+    ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDXrr2]]
     %0:gpr(s64) = COPY $x0
     %1:gpr(s64) = G_CONSTANT i64 4
     %2:gpr(s64) = G_SHL %0, %1(s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
index 4cd6eef531ce0..66c8c2efda9bc 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
@@ -16,10 +16,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s32_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $w0
     %4:_(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
@@ -37,10 +38,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s64_gpr
     ; CHECK: liveins: $x0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $x0
     %4:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
@@ -58,10 +60,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s32_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
-    ; CHECK: RET_ReallyLR implicit $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(s32) = COPY $w0
     %4:_(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
@@ -79,10 +82,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s32_fpr
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $s0
     %4:_(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
@@ -100,10 +104,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s64_fpr
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $d0
     %4:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
@@ -121,10 +126,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s32_fpr
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
-    ; CHECK: RET_ReallyLR implicit $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(s32) = COPY $s0
     %4:_(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
@@ -142,10 +148,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s64_fpr_copy
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $d0
     %6:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %6(<2 x s64>)
@@ -163,11 +170,13 @@ body:             |
 
     ; CHECK-LABEL: name: v416s8_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: %trunc:gpr(s8) = G_TRUNC [[COPY]](s32)
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<16 x s8>) = G_DUP %trunc(s8)
-    ; CHECK: $q0 = COPY [[DUP]](<16 x s8>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: %trunc:gpr(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:gpr(s32) = G_ANYEXT %trunc(s8)
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<16 x s8>) = G_DUP [[ANYEXT]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $w0
     %trunc:_(s8) = G_TRUNC %0(s32)
     %1:_(<16 x s8>) = G_DUP %trunc(s8)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
index 809bdceb4aa25..cf2bab78fe5a6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
@@ -453,3 +453,22 @@ body:             |
     %dup:fpr(<2 x p0>) = G_DUP %cst(p0)
     $q0 = COPY %dup(<2 x p0>)
     RET_ReallyLR implicit $q0
+...
+---
+name:            cstv4i16gpri32
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins:
+    ; CHECK-LABEL: name: cstv4i16gpri32
+    ; CHECK: %dup:fpr64 = MOVIv4i16 3, 0
+    ; CHECK-NEXT: $d0 = COPY %dup
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %cst:gpr(s32) = G_CONSTANT i32 3
+    %dup:fpr(<4 x s16>) = G_DUP %cst(s32)
+    $d0 = COPY %dup(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir
index 62ebe86504bfa..94af12a91ae97 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir
@@ -241,16 +241,28 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     liveins: $x0, $x1, $x2
-    ; CHECK-LABEL: name: shl_slow_1_more_than_one_use
-    ; CHECK: liveins: $x0, $x1, $x2
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
-    ; CHECK-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr)
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
-    ; CHECK-NEXT: STRHHroX [[COPY4]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr)
+    ; CHECK-FAST-LABEL: name: shl_slow_1_more_than_one_use
+    ; CHECK-FAST: liveins: $x0, $x1, $x2
+    ; CHECK-FAST-NEXT: {{  $}}
+    ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2
+    ; CHECK-FAST-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
+    ; CHECK-FAST-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr)
+    ; CHECK-FAST-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
+    ; CHECK-FAST-NEXT: STRHHroX [[COPY4]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr)
+    ;
+    ; CHECK-SLOW-LABEL: name: shl_slow_1_more_than_one_use
+    ; CHECK-SLOW: liveins: $x0, $x1, $x2
+    ; CHECK-SLOW-NEXT: {{  $}}
+    ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK-SLOW-NEXT: %ptr:gpr64common = ADDXrs [[COPY1]], [[COPY]], 1
+    ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2
+    ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
+    ; CHECK-SLOW-NEXT: STRHHui [[COPY3]], %ptr, 0 :: (store (s16) into %ir.addr)
+    ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32
+    ; CHECK-SLOW-NEXT: STRHHui [[COPY4]], %ptr, 0 :: (store (s16) into %ir.addr)
     %0:gpr(s64) = COPY $x0
     %1:gpr(s64) = G_CONSTANT i64 1
     %2:gpr(s64) = G_SHL %0, %1(s64)
@@ -296,14 +308,24 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     liveins: $x0, $x1, $x2, $q0
-    ; CHECK-LABEL: name: shl_slow_4_more_than_one_use
-    ; CHECK: liveins: $x0, $x1, $x2, $q0
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr)
-    ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr)
+    ; CHECK-FAST-LABEL: name: shl_slow_4_more_than_one_use
+    ; CHECK-FAST: liveins: $x0, $x1, $x2, $q0
+    ; CHECK-FAST-NEXT: {{  $}}
+    ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK-FAST-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr)
+    ; CHECK-FAST-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr)
+    ;
+    ; CHECK-SLOW-LABEL: name: shl_slow_4_more_than_one_use
+    ; CHECK-SLOW: liveins: $x0, $x1, $x2, $q0
+    ; CHECK-SLOW-NEXT: {{  $}}
+    ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK-SLOW-NEXT: %ptr:gpr64common = ADDXrs [[COPY1]], [[COPY]], 4
+    ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK-SLOW-NEXT: STRQui [[COPY2]], %ptr, 0 :: (store (s128) into %ir.addr)
+    ; CHECK-SLOW-NEXT: STRQui [[COPY2]], %ptr, 0 :: (store (s128) into %ir.addr)
     %0:gpr(s64) = COPY $x0
     %1:gpr(s64) = G_CONSTANT i64 4
     %2:gpr(s64) = G_SHL %0, %1(s64)
@@ -339,7 +361,3 @@ body:             |
     %4:gpr(p0) = COPY $x2
     G_STORE %4, %ptr :: (store (p0) into %ir.addr)
 ...
-
-# NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# CHECK-FAST: {{.*}}
-# CHECK-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index 506bf1dcfb24e..18cdbada1ed83 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -21,7 +21,6 @@
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Instrument function entry/exit with calls to e.g. mcount() (post inlining)
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 12b319f91bfdc..64836b061be49 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -52,7 +52,6 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 614ac15d959f0..63dcafed2320a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -28,17 +28,16 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
 ;
 ; CHECK0-GISEL-LABEL: halfword:
 ; CHECK0-GISEL:       // %bb.0:
-; CHECK0-GISEL-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-GISEL-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK0-GISEL-NEXT:    lsr w8, w1, #9
 ; CHECK0-GISEL-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-GISEL-NEXT:    mov x19, x0
-; CHECK0-GISEL-NEXT:    and x21, x8, #0xff
-; CHECK0-GISEL-NEXT:    ldrh w20, [x0, x21, lsl #1]
+; CHECK0-GISEL-NEXT:    add x20, x0, w8, uxtb #1
+; CHECK0-GISEL-NEXT:    ldrh w19, [x20]
 ; CHECK0-GISEL-NEXT:    bl foo
-; CHECK0-GISEL-NEXT:    mov w0, w20
-; CHECK0-GISEL-NEXT:    strh w20, [x19, x21, lsl #1]
+; CHECK0-GISEL-NEXT:    mov w0, w19
+; CHECK0-GISEL-NEXT:    strh w19, [x20]
 ; CHECK0-GISEL-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-GISEL-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK0-GISEL-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK0-GISEL-NEXT:    ret
 ;
 ; CHECK3-SDAG-LABEL: halfword:
@@ -248,27 +247,23 @@ define i16 @multi_use_half_word(ptr %ctx, i32 %xor72) {
 ;
 ; CHECK0-GISEL-LABEL: multi_use_half_word:
 ; CHECK0-GISEL:       // %bb.0: // %entry
-; CHECK0-GISEL-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK0-GISEL-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK0-GISEL-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK0-GISEL-NEXT:    .cfi_def_cfa_offset 48
+; CHECK0-GISEL-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-GISEL-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-GISEL-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK0-GISEL-NEXT:    .cfi_offset w19, -8
 ; CHECK0-GISEL-NEXT:    .cfi_offset w20, -16
 ; CHECK0-GISEL-NEXT:    .cfi_offset w21, -24
-; CHECK0-GISEL-NEXT:    .cfi_offset w22, -32
-; CHECK0-GISEL-NEXT:    .cfi_offset w30, -48
+; CHECK0-GISEL-NEXT:    .cfi_offset w30, -32
 ; CHECK0-GISEL-NEXT:    lsr w8, w1, #9
-; CHECK0-GISEL-NEXT:    mov x19, x0
-; CHECK0-GISEL-NEXT:    and x21, x8, #0xff
-; CHECK0-GISEL-NEXT:    ldrh w20, [x0, x21, lsl #1]
-; CHECK0-GISEL-NEXT:    add w22, w20, #1
+; CHECK0-GISEL-NEXT:    add x20, x0, w8, uxtb #1
+; CHECK0-GISEL-NEXT:    ldrh w19, [x20]
+; CHECK0-GISEL-NEXT:    add w21, w19, #1
 ; CHECK0-GISEL-NEXT:    bl foo
-; CHECK0-GISEL-NEXT:    strh w20, [x19, x21, lsl #1]
-; CHECK0-GISEL-NEXT:    mov w0, w20
-; CHECK0-GISEL-NEXT:    strh w22, [x19, x21, lsl #1]
-; CHECK0-GISEL-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK0-GISEL-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK0-GISEL-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK0-GISEL-NEXT:    strh w19, [x20]
+; CHECK0-GISEL-NEXT:    mov w0, w19
+; CHECK0-GISEL-NEXT:    strh w21, [x20]
+; CHECK0-GISEL-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-GISEL-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK0-GISEL-NEXT:    ret
 ;
 ; CHECK3-SDAG-LABEL: multi_use_half_word:
@@ -387,14 +382,14 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) {
 ;
 ; CHECK0-GISEL-LABEL: gep4:
 ; CHECK0-GISEL:       // %bb.0:
-; CHECK0-GISEL-NEXT:    ldr q1, [x0, x4, lsl #4]
+; CHECK0-GISEL-NEXT:    add x8, x0, x4, lsl #4
 ; CHECK0-GISEL-NEXT:    mov v0.d[0], x2
-; CHECK0-GISEL-NEXT:    mov x8, x0
+; CHECK0-GISEL-NEXT:    ldr q1, [x8]
 ; CHECK0-GISEL-NEXT:    mov d2, v1.d[1]
-; CHECK0-GISEL-NEXT:    fmov x0, d1
 ; CHECK0-GISEL-NEXT:    mov v0.d[1], x3
+; CHECK0-GISEL-NEXT:    fmov x0, d1
 ; CHECK0-GISEL-NEXT:    fmov x1, d2
-; CHECK0-GISEL-NEXT:    str q0, [x8, x4, lsl #4]
+; CHECK0-GISEL-NEXT:    str q0, [x8]
 ; CHECK0-GISEL-NEXT:    ret
 ;
 ; CHECK3-SDAG-LABEL: gep4:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
index 9b0701ab148dc..c57c95fd3b531 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
@@ -281,8 +281,7 @@ define i64 @smull_ldrsw_shift(ptr %x0, i64 %x1) {
 ; CHECK-LABEL: smull_ldrsw_shift:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrsw x8, [x0]
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    smull x0, w8, w9
+; CHECK-NEXT:    smull x0, w8, w1
 ; CHECK-NEXT:    ret
 entry:
   %ext64 = load i32, ptr %x0
@@ -490,8 +489,7 @@ define i64 @smaddl_ldrsw_shift(ptr %x0, i64 %x1, i64 %x2) {
 ; CHECK-LABEL: smaddl_ldrsw_shift:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrsw x8, [x0]
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    smaddl x0, w8, w9, x2
+; CHECK-NEXT:    smaddl x0, w8, w1, x2
 ; CHECK-NEXT:    ret
 entry:
   %ext64 = load i32, ptr %x0
@@ -654,8 +652,7 @@ define i64 @smnegl_ldrsw_shift(ptr %x0, i64 %x1) {
 ; CHECK-LABEL: smnegl_ldrsw_shift:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrsw x8, [x0]
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    smnegl x0, w8, w9
+; CHECK-NEXT:    smnegl x0, w8, w1
 ; CHECK-NEXT:    ret
 entry:
   %ext64 = load i32, ptr %x0
@@ -818,8 +815,7 @@ define i64 @smsubl_ldrsw_shift(ptr %x0, i64 %x1, i64 %x2) {
 ; CHECK-LABEL: smsubl_ldrsw_shift:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrsw x8, [x0]
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    smsubl x0, w8, w9, x2
+; CHECK-NEXT:    smsubl x0, w8, w1, x2
 ; CHECK-NEXT:    ret
 entry:
   %ext64 = load i32, ptr %x0
@@ -1451,3 +1447,21 @@ define i64 @umaddl_and_and(i64 %x, i64 %y, i64 %a) {
     %add = add i64 %a, %mul
     ret i64 %add
 }
+
+; Check which can contain multiple copies that should all be removed.
+define i32 @f(i32 %0) {
+entry:
+  %1 = sext i32 %0 to i64
+  br label %A
+
+A:
+  %2 = trunc i64 %1 to i32
+  %a69.us = sub i32 0, %2
+  %a69.us.fr = freeze i32 %a69.us
+  %3 = zext i32 %a69.us.fr to i64
+  br label %B
+
+B:
+  %t = icmp eq i64 0, %3
+  br i1 %t, label %A, label %B
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index b6702bba1598c..50afc79a5a576 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
 target triple = "arm64-apple-ios"
 
 define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
@@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
 ; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.
 
 define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: f128_select_and_olt_oge:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT:    bl ___lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w21, lt
-; CHECK-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
-; CHECK-NEXT:    bl ___getf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w8, ge
-; CHECK-NEXT:    tst w8, w21
-; CHECK-NEXT:    csel w0, w20, w19, ne
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: f128_select_and_olt_oge:
+; SDISEL:       ; %bb.0:
+; SDISEL-NEXT:    sub sp, sp, #80
+; SDISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; SDISEL-NEXT:    mov x19, x1
+; SDISEL-NEXT:    mov x20, x0
+; SDISEL-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
+; SDISEL-NEXT:    bl ___lttf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w21, lt
+; SDISEL-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
+; SDISEL-NEXT:    bl ___getf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w8, ge
+; SDISEL-NEXT:    tst w8, w21
+; SDISEL-NEXT:    csel w0, w20, w19, ne
+; SDISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; SDISEL-NEXT:    add sp, sp, #80
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: f128_select_and_olt_oge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #80
+; GISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp q3, q2, [sp] ; 32-byte Folded Spill
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    mov x20, x1
+; GISEL-NEXT:    bl ___lttf2
+; GISEL-NEXT:    mov x21, x0
+; GISEL-NEXT:    ldp q1, q0, [sp] ; 32-byte Folded Reload
+; GISEL-NEXT:    bl ___getf2
+; GISEL-NEXT:    cmp w21, #0
+; GISEL-NEXT:    ccmp w0, #0, #8, lt
+; GISEL-NEXT:    csel w0, w19, w20, ge
+; GISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT:    add sp, sp, #80
+; GISEL-NEXT:    ret
   %c0 = fcmp olt fp128 %v0, %v1
   %c1 = fcmp oge fp128 %v2, %v3
   %cr = and i1 %c1, %c0
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54830..0291f8c912304 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -103,6 +103,19 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
   ret <4 x i32> %tmp4
 }
 
+define <4 x i16> @v_dup16_const(i16 %y, ptr %p) {
+; CHECK-LABEL: v_dup16_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi.4h v0, #10
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    strh w8, [x1]
+; CHECK-NEXT:    ret
+    %i = insertelement <4 x i16> undef, i16 10, i32 0
+    %lo = shufflevector <4 x i16> %i, <4 x i16> undef, <4 x i32> zeroinitializer
+    store i16 10, ptr %p
+    ret <4 x i16> %lo
+}
+
 define <4 x float> @v_dupQfloat(float %A) nounwind {
 ; CHECK-LABEL: v_dupQfloat:
 ; CHECK:       // %bb.0:
@@ -420,9 +433,9 @@ define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) n
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -443,9 +456,9 @@ define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI35_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -462,9 +475,9 @@ define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) n
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -481,9 +494,9 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -503,12 +516,12 @@ define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: disguised_dup:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_1
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_1
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_0]
 ; CHECK-GI-NEXT:    tbl.16b v2, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    str q0, [x0]
 ; CHECK-GI-NEXT:    str q2, [x1]
@@ -531,8 +544,8 @@ define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
 ;
 ; CHECK-GI-LABEL: dup_const2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI39_0]
 ; CHECK-GI-NEXT:    add.2s v0, v0, v1
 ; CHECK-GI-NEXT:    ret
   %tmp2 = add <2 x i32> %A, <i32 8421378, i32 8421378>
@@ -550,8 +563,8 @@ define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
 ;
 ; CHECK-GI-LABEL: dup_const4_ext:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
 ; CHECK-GI-NEXT:    add.4s v0, v0, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -575,12 +588,12 @@ define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind
 ;
 ; CHECK-GI-LABEL: dup_const24:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI40_1
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_1
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    ldr d3, [x8, :lo12:.LCPI40_1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
+; CHECK-GI-NEXT:    ldr d3, [x8, :lo12:.LCPI41_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
 ; CHECK-GI-NEXT:    add.2s v0, v0, v3
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI40_0]
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI41_0]
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-GI-NEXT:    add.4s v1, v2, v3
 ; CHECK-GI-NEXT:    eor.16b v0, v1, v0
@@ -687,3 +700,17 @@ define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
   ret <8 x i16> %r
 }
 
+define <4 x i16> @dup_i16_v4i16_constant() {
+; CHECK-SD-LABEL: dup_i16_v4i16_constant:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #9211 // =0x23fb
+; CHECK-SD-NEXT:    dup.4h v0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: dup_i16_v4i16_constant:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI50_0
+; CHECK-GI-NEXT:    ldr d0, [x8, :lo12:.LCPI50_0]
+; CHECK-GI-NEXT:    ret
+    ret <4 x i16> <i16 9211, i16 9211, i16 9211, i16 9211>
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 43d5ab5ab54e1..c56f4409e3a62 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1127,8 +1127,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov8i8:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.8b, v0.8b
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1147,8 +1146,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov4i16:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.4h, v0.4h
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1167,8 +1165,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov2i32:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.2s, v0.2s
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1187,8 +1184,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov1i64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg d0, d0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1209,8 +1205,7 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    scvtf d0, d0
 ; CHECK-NEXT:    fneg v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/arm64-sqxtn2-combine.ll b/llvm/test/CodeGen/AArch64/arm64-sqxtn2-combine.ll
deleted file mode 100644
index f09109c282a20..0000000000000
--- a/llvm/test/CodeGen/AArch64/arm64-sqxtn2-combine.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
-
-; Test the (concat_vectors (X), (trunc(smin(smax(Y, -2^n), 2^n-1))) pattern.
-
-define <16 x i8> @test_combine_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
-; CHECK-LABEL: test_combine_v8i16_to_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqxtn2 v0.16b, v1.8h
-; CHECK-NEXT:    ret
-entry:
-  %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %y, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>)
-  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %min, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>)
-  %trunc = trunc <8 x i16> %max to <8 x i8>
-  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i8> %shuffle
-}
-
-define <8 x i16> @test_combine_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
-; CHECK-LABEL: test_combine_v4i32_to_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqxtn2 v0.8h, v1.4s
-; CHECK-NEXT:    ret
-entry:
-  %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
-  %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %max, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
-  %trunc = trunc <4 x i32> %min to <4 x i16>
-  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %shuffle
-}
-
-declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 00cc6b21ccaf8..abf2e1272d645 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -951,10 +951,8 @@ define <1 x i128> @sext_v1x64(<1 x i64> %arg) {
 ; CHECK-SD-LABEL: sext_v1x64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    asr x1, x8, #63
-; CHECK-SD-NEXT:    mov.d v0[1], x1
 ; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    asr x1, x0, #63
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v1x64:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 178c229d04e47..62a79e3547b29 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -1802,28 +1802,25 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    mov.d x8, v0[1]
 ; CHECK-NEXT:    mov.d x9, v1[1]
 ; CHECK-NEXT:    fmov x10, d0
-; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    asr x12, x10, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    subs x10, x10, x11
+; CHECK-NEXT:    fmov x12, d1
+; CHECK-NEXT:    asr x14, x10, #63
 ; CHECK-NEXT:    asr x11, x8, #63
-; CHECK-NEXT:    asr x14, x9, #63
-; CHECK-NEXT:    sbc x12, x12, x13
+; CHECK-NEXT:    asr x13, x9, #63
+; CHECK-NEXT:    asr x15, x12, #63
 ; CHECK-NEXT:    subs x8, x8, x9
-; CHECK-NEXT:    sbc x9, x11, x14
-; CHECK-NEXT:    asr x13, x12, #63
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    eor x10, x10, x13
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    subs x2, x8, x11
-; CHECK-NEXT:    eor x8, x12, x13
-; CHECK-NEXT:    sbc x3, x9, x11
-; CHECK-NEXT:    subs x9, x10, x13
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    sbc x1, x8, x13
-; CHECK-NEXT:    mov.d v0[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    sbc x9, x11, x13
+; CHECK-NEXT:    subs x10, x10, x12
+; CHECK-NEXT:    sbc x11, x14, x15
+; CHECK-NEXT:    asr x13, x9, #63
+; CHECK-NEXT:    asr x12, x11, #63
+; CHECK-NEXT:    eor x8, x8, x13
+; CHECK-NEXT:    eor x9, x9, x13
+; CHECK-NEXT:    eor x10, x10, x12
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    subs x0, x10, x12
+; CHECK-NEXT:    sbc x1, x11, x12
+; CHECK-NEXT:    subs x2, x8, x13
+; CHECK-NEXT:    sbc x3, x9, x13
 ; CHECK-NEXT:    ret
   %aext = sext <2 x i64> %a to <2 x i128>
   %bext = sext <2 x i64> %b to <2 x i128>
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
new file mode 100644
index 0000000000000..64fb5b36b2c62
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
@@ -0,0 +1,315 @@
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc -filetype=obj -o %t.o < %s
+; RUN: llvm-objdump -t %t.o | FileCheck --check-prefix=SYM %s
+
+define dso_local ptr @func() hybrid_patchable nounwind {
+; SYM: [ 8](sec  4)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$hp_target
+; CHECK-LABEL:     .def    "#func$hp_target";
+; CHECK:           .section        .text,"xr",discard,"#func$hp_target"
+; CHECK-NEXT:      .globl  "#func$hp_target"               // -- Begin function #func$hp_target
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#func$hp_target":                      // @"#func$hp_target"
+; CHECK-NEXT:      // %bb.0:
+; CHECK-NEXT:      adrp x0, func
+; CHECK-NEXT:      add x0, x0, :lo12:func
+; CHECK-NEXT:      ret
+  ret ptr @func
+}
+
+define void @has_varargs(...) hybrid_patchable nounwind {
+; SYM: [11](sec  5)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_varargs$hp_target
+; CHECK-LABEL:     .def "#has_varargs$hp_target";
+; CHECK:           .section .text,"xr",discard,"#has_varargs$hp_target"
+; CHECK-NEXT:      .globl  "#has_varargs$hp_target"        // -- Begin function #has_varargs$hp_target
+; CHECK-NEXT:      .p2align 2
+; CHECK-NEXT:  "#has_varargs$hp_target":               // @"#has_varargs$hp_target"
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      sub     sp, sp, #32
+; CHECK-NEXT:      stp     x0, x1, [x4, #-32]
+; CHECK-NEXT:      stp     x2, x3, [x4, #-16]
+; CHECK-NEXT:      add     sp, sp, #32
+; CHECK-NEXT:      ret
+  ret void
+}
+
+define void @has_sret(ptr sret([100 x i8])) hybrid_patchable nounwind {
+; SYM: [14](sec  6)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_sret$hp_target
+; CHECK-LABEL:     .def    "#has_sret$hp_target";
+; CHECK:           .section        .text,"xr",discard,"#has_sret$hp_target"
+; CHECK-NEXT:      .globl  "#has_sret$hp_target"           // -- Begin function #has_sret$hp_target
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#has_sret$hp_target":                  // @"#has_sret$hp_target"
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      ret
+  ret void
+}
+
+define dllexport void @exp() hybrid_patchable nounwind {
+; CHECK-LABEL:     .def    "#exp$hp_target";
+; CHECK:           .section        .text,"xr",discard,"#exp$hp_target"
+; CHECK-NEXT:      .globl  "#exp$hp_target"                // -- Begin function #exp$hp_target
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#exp$hp_target":                       // @"#exp$hp_target"
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      ret
+  ret void
+}
+
+; hybrid_patchable attribute is ignored on internal functions
+define internal i32 @static_func() hybrid_patchable nounwind {
+; CHECK-LABEL:     .def    static_func;
+; CHECK:       static_func:                            // @static_func
+; CHECK-NEXT:      // %bb.0:
+; CHECK-NEXT:      mov     w0, #2                          // =0x2
+; CHECK-NEXT:      ret
+  ret i32 2
+}
+
+define dso_local void @caller() nounwind {
+; CHECK-LABEL:     .def    "#caller";
+; CHECK:           .section        .text,"xr",discard,"#caller"
+; CHECK-NEXT:      .globl  "#caller"                       // -- Begin function #caller
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#caller":                              // @"#caller"
+; CHECK-NEXT:      .weak_anti_dep  caller
+; CHECK-NEXT:  .set caller, "#caller"{{$}}
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      str     x30, [sp, #-16]!                // 8-byte Folded Spill
+; CHECK-NEXT:      bl      "#func"
+; CHECK-NEXT:      bl      static_func
+; CHECK-NEXT:      adrp    x8, __os_arm64x_check_icall
+; CHECK-NEXT:      adrp    x11, func
+; CHECK-NEXT:      add     x11, x11, :lo12:func
+; CHECK-NEXT:      ldr     x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT:      adrp    x10, ($iexit_thunk$cdecl$v$v)
+; CHECK-NEXT:      add     x10, x10, :lo12:($iexit_thunk$cdecl$v$v)
+; CHECK-NEXT:      str     x11, [sp, #8]
+; CHECK-NEXT:      blr     x8
+; CHECK-NEXT:      blr     x11
+; CHECK-NEXT:      ldr     x30, [sp], #16                  // 8-byte Folded Reload
+; CHECK-NEXT:      ret
+  %1 = call i32 @func()
+  %2 = call i32 @static_func()
+  %3 = alloca ptr, align 8
+  store ptr @func, ptr %3, align 8
+  %4 = load ptr, ptr %3, align 8
+  call void %4()
+  ret void
+}
+
+; CHECK-LABEL:       def    "#func$hybpatch_thunk";
+; CHECK:            .section        .wowthk$aa,"xr",discard,"#func$hybpatch_thunk"
+; CHECK-NEXT:       .globl  "#func$hybpatch_thunk"          // -- Begin function #func$hybpatch_thunk
+; CHECK-NEXT:       .p2align        2
+; CHECK-NEXT:   "#func$hybpatch_thunk":                 // @"#func$hybpatch_thunk"
+; CHECK-NEXT:   .seh_proc "#func$hybpatch_thunk"
+; CHECK-NEXT:   // %bb.0:
+; CHECK-NEXT:       str     x30, [sp, #-16]!                // 8-byte Folded Spill
+; CHECK-NEXT:       .seh_save_reg_x x30, 16
+; CHECK-NEXT:       .seh_endprologue
+; CHECK-NEXT:       adrp    x8, __os_arm64x_dispatch_call
+; CHECK-NEXT:       adrp    x11, func
+; CHECK-NEXT:       add     x11, x11, :lo12:func
+; CHECK-NEXT:       ldr     x8, [x8, :lo12:__os_arm64x_dispatch_call]
+; CHECK-NEXT:       adrp    x10, ($iexit_thunk$cdecl$i8$v)
+; CHECK-NEXT:       add     x10, x10, :lo12:($iexit_thunk$cdecl$i8$v)
+; CHECK-NEXT:       adrp    x9, "#func$hp_target"
+; CHECK-NEXT:       add     x9, x9, :lo12:"#func$hp_target"
+; CHECK-NEXT:       blr     x8
+; CHECK-NEXT:       .seh_startepilogue
+; CHECK-NEXT:       ldr     x30, [sp], #16                  // 8-byte Folded Reload
+; CHECK-NEXT:       .seh_save_reg_x x30, 16
+; CHECK-NEXT:       .seh_endepilogue
+; CHECK-NEXT:       br      x11
+; CHECK-NEXT:       .seh_endfunclet
+; CHECK-NEXT:       .seh_endproc
+
+; CHECK-LABEL:      .def    "#has_varargs$hybpatch_thunk";
+; CHECK:            .section        .wowthk$aa,"xr",discard,"#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:       .globl  "#has_varargs$hybpatch_thunk"   // -- Begin function #has_varargs$hybpatch_thunk
+; CHECK-NEXT:       .p2align        2
+; CHECK-NEXT:"#has_varargs$hybpatch_thunk":          // @"#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:.seh_proc "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:// %bb.0:
+; CHECK-NEXT:       str     x30, [sp, #-16]!                // 8-byte Folded Spill
+; CHECK-NEXT:       .seh_save_reg_x x30, 16
+; CHECK-NEXT:       .seh_endprologue
+; CHECK-NEXT:       adrp    x8, __os_arm64x_dispatch_call
+; CHECK-NEXT:       adrp    x11, has_varargs
+; CHECK-NEXT:       add     x11, x11, :lo12:has_varargs
+; CHECK-NEXT:       ldr     x8, [x8, :lo12:__os_arm64x_dispatch_call]
+; CHECK-NEXT:       adrp    x10, ($iexit_thunk$cdecl$v$varargs)
+; CHECK-NEXT:       add     x10, x10, :lo12:($iexit_thunk$cdecl$v$varargs)
+; CHECK-NEXT:       adrp    x9, "#has_varargs$hp_target"
+; CHECK-NEXT:       add     x9, x9, :lo12:"#has_varargs$hp_target"
+; CHECK-NEXT:       blr     x8
+; CHECK-NEXT:       .seh_startepilogue
+; CHECK-NEXT:       ldr     x30, [sp], #16                  // 8-byte Folded Reload
+; CHECK-NEXT:       .seh_save_reg_x x30, 16
+; CHECK-NEXT:       .seh_endepilogue
+; CHECK-NEXT:       br      x11
+; CHECK-NEXT:       .seh_endfunclet
+; CHECK-NEXT:       .seh_endproc
+
+; CHECK-LABEL:     .def    "#has_sret$hybpatch_thunk";
+; CHECK:           .section        .wowthk$aa,"xr",discard,"#has_sret$hybpatch_thunk"
+; CHECK-NEXT:      .globl  "#has_sret$hybpatch_thunk"      // -- Begin function #has_sret$hybpatch_thunk
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#has_sret$hybpatch_thunk":             // @"#has_sret$hybpatch_thunk"
+; CHECK-NEXT:  .seh_proc "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      str     x30, [sp, #-16]!                // 8-byte Folded Spill
+; CHECK-NEXT:      .seh_save_reg_x x30, 16
+; CHECK-NEXT:      .seh_endprologue
+; CHECK-NEXT:      adrp    x9, __os_arm64x_dispatch_call
+; CHECK-NEXT:      adrp    x11, has_sret
+; CHECK-NEXT:      add     x11, x11, :lo12:has_sret
+; CHECK-NEXT:      ldr     x12, [x9, :lo12:__os_arm64x_dispatch_call]
+; CHECK-NEXT:      adrp    x10, ($iexit_thunk$cdecl$m100$v)
+; CHECK-NEXT:      add     x10, x10, :lo12:($iexit_thunk$cdecl$m100$v)
+; CHECK-NEXT:      adrp    x9, "#has_sret$hp_target"
+; CHECK-NEXT:      add     x9, x9, :lo12:"#has_sret$hp_target"
+; CHECK-NEXT:      blr     x12
+; CHECK-NEXT:      .seh_startepilogue
+; CHECK-NEXT:      ldr     x30, [sp], #16                  // 8-byte Folded Reload
+; CHECK-NEXT:      .seh_save_reg_x x30, 16
+; CHECK-NEXT:      .seh_endepilogue
+; CHECK-NEXT:      br      x11
+; CHECK-NEXT:      .seh_endfunclet
+; CHECK-NEXT:      .seh_endproc
+
+; CHECK-LABEL:     .def    "#exp$hybpatch_thunk";
+; CHECK:           .section        .wowthk$aa,"xr",discard,"#exp$hybpatch_thunk"
+; CHECK-NEXT:      .globl  "#exp$hybpatch_thunk"           // -- Begin function #exp$hybpatch_thunk
+; CHECK-NEXT:      .p2align        2
+; CHECK-NEXT:  "#exp$hybpatch_thunk":                // @"#exp$hybpatch_thunk"
+; CHECK-NEXT:  .seh_proc "#exp$hybpatch_thunk"
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:      str     x30, [sp, #-16]!                // 8-byte Folded Spill
+; CHECK-NEXT:      .seh_save_reg_x x30, 16
+; CHECK-NEXT:      .seh_endprologue
+; CHECK-NEXT:      adrp    x8, __os_arm64x_dispatch_call
+; CHECK-NEXT:      adrp    x11, exp
+; CHECK-NEXT:      add     x11, x11, :lo12:exp
+; CHECK-NEXT:      ldr     x8, [x8, :lo12:__os_arm64x_dispatch_call]
+; CHECK-NEXT:      adrp    x10, ($iexit_thunk$cdecl$v$v)
+; CHECK-NEXT:      add     x10, x10, :lo12:($iexit_thunk$cdecl$v$v)
+; CHECK-NEXT:      adrp    x9, "#exp$hp_target"
+; CHECK-NEXT:      add     x9, x9, :lo12:"#exp$hp_target"
+; CHECK-NEXT:      blr     x8
+; CHECK-NEXT:      .seh_startepilogue
+; CHECK-NEXT:      ldr     x30, [sp], #16                  // 8-byte Folded Reload
+; CHECK-NEXT:      .seh_save_reg_x x30, 16
+; CHECK-NEXT:      .seh_endepilogue
+; CHECK-NEXT:      br      x11
+; CHECK-NEXT:      .seh_endfunclet
+; CHECK-NEXT:      .seh_endproc
+
+; Verify the hybrid bitmap
+; CHECK-LABEL:     .section        .hybmp$x,"yi"
+; CHECK-NEXT:      .symidx "#func$hp_target"
+; CHECK-NEXT:      .symidx $ientry_thunk$cdecl$i8$v
+; CHECK-NEXT:      .word   1
+; CHECK-NEXT:      .symidx "#has_varargs$hp_target"
+; CHECK-NEXT:      .symidx $ientry_thunk$cdecl$v$varargs
+; CHECK-NEXT:      .word   1
+; CHECK-NEXT:      .symidx "#has_sret$hp_target"
+; CHECK-NEXT:      .symidx $ientry_thunk$cdecl$m100$v
+; CHECK-NEXT:      .word   1
+; CHECK-NEXT:      .symidx "#exp$hp_target"
+; CHECK-NEXT:      .symidx $ientry_thunk$cdecl$v$v
+; CHECK-NEXT:      .word   1
+; CHECK-NEXT:      .symidx "#caller"
+; CHECK-NEXT:      .symidx $ientry_thunk$cdecl$v$v
+; CHECK-NEXT:      .word   1
+; CHECK-NEXT:      .symidx func
+; CHECK-NEXT:      .symidx $iexit_thunk$cdecl$i8$v
+; CHECK-NEXT:      .word   4
+; CHECK-NEXT:      .symidx "#func$hybpatch_thunk"
+; CHECK-NEXT:      .symidx func
+; CHECK-NEXT:      .word   0
+; CHECK-NEXT:      .symidx "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:      .symidx has_varargs
+; CHECK-NEXT:      .word   0
+; CHECK-NEXT:      .symidx "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:      .symidx has_sret
+; CHECK-NEXT:      .word   0
+; CHECK-NEXT:      .symidx "#exp$hybpatch_thunk"
+; CHECK-NEXT:      .symidx exp
+; CHECK-NEXT:      .word   0
+; CHECK-NEXT:      .section        .drectve,"yni"
+; CHECK-NEXT:      .ascii  " /EXPORT:exp"
+
+; CHECK-NEXT:      .def    func;
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:      .weak  func
+; CHECK-NEXT:  .set func, "EXP+#func"{{$}}
+; CHECK-NEXT:      .weak  "#func"
+; CHECK-NEXT:      .def    "#func";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:  .set "#func", "#func$hybpatch_thunk"{{$}}
+; CHECK-NEXT:      .def    has_varargs;
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:      .weak   has_varargs
+; CHECK-NEXT:  .set has_varargs, "EXP+#has_varargs"
+; CHECK-NEXT:      .weak   "#has_varargs"
+; CHECK-NEXT:      .def    "#has_varargs";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:  .set "#has_varargs", "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:      .def    has_sret;
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:      .weak   has_sret
+; CHECK-NEXT:  .set has_sret, "EXP+#has_sret"
+; CHECK-NEXT:      .weak   "#has_sret"
+; CHECK-NEXT:      .def    "#has_sret";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:  .set "#has_sret", "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:      .def    exp;
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:      .weak   exp
+; CHECK-NEXT:  .set exp, "EXP+#exp"
+; CHECK-NEXT:      .weak   "#exp"
+; CHECK-NEXT:      .def    "#exp";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
+; CHECK-NEXT:  .set "#exp", "#exp$hybpatch_thunk"
+
+; SYM:      [53](sec 15)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$hybpatch_thunk
+; SYM:      [58](sec 16)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_varargs$hybpatch_thunk
+; SYM:      [68](sec 18)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_sret$hybpatch_thunk
+; SYM:      [78](sec 20)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #exp$hybpatch_thunk
+; SYM:      [110](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 func
+; SYM-NEXT: AUX indx 112 srch 3
+; SYM-NEXT: [112](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#func
+; SYM:      [116](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #func
+; SYM-NEXT: AUX indx 53 srch 3
+; SYM:      [122](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 has_varargs
+; SYM-NEXT: AUX indx 124 srch 3
+; SYM-NEXT: [124](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#has_varargs
+; SYM-NEXT: [125](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 has_sret
+; SYM-NEXT: AUX indx 127 srch 3
+; SYM-NEXT: [127](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#has_sret
+; SYM-NEXT: [128](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 exp
+; SYM-NEXT: AUX indx 130 srch 3
+; SYM-NEXT: [130](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#exp
+; SYM-NEXT: [131](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #has_varargs
+; SYM-NEXT: AUX indx 58 srch 3
+; SYM-NEXT: [133](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #has_sret
+; SYM-NEXT: AUX indx 68 srch 3
+; SYM-NEXT: [135](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #exp
+; SYM-NEXT: AUX indx 78 srch 3
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
index e601f03d524a4..8b6740667a229 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
@@ -7,10 +7,10 @@
 @l = common hidden local_unnamed_addr global i32 0, align 4
 
 ; CHECK-LABEL: <test1>:
-; CHECK-LABEL: <$d.1>:
-; CHECK-LABEL: <$x.2>:
+; CHECK-LABEL: <$d>:
+; CHECK-LABEL: <$x>:
 ; CHECK-NEXT:    b 0x2c <test1+0x2c>
-; CHECK-LABEL: <$x.4>:
+; CHECK-LABEL: <$x>:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ldr x30, [sp], #16
 ; CHECK-NEXT:    ret
@@ -41,8 +41,8 @@ declare dso_local i32 @i(...) local_unnamed_addr
 
 ; CHECK-LABEL: <test2>:
 ; CHECK:         b {{.*}} <test2+0x1c>
-; CHECK-LABEL: <$d.5>:
-; CHECK-LABEL: <$x.6>:
+; CHECK-LABEL: <$d>:
+; CHECK-LABEL: <$x>:
 ; CHECK-NEXT:    b {{.*}} <test2+0x18>
 define hidden i32 @test2() local_unnamed_addr {
   %1 = load i32, ptr @l, align 4
@@ -70,10 +70,10 @@ define hidden i32 @test2() local_unnamed_addr {
 }
 
 ; CHECK-LABEL: <test3>:
-; CHECK-LABEL: <$d.9>:
-; CHECK-LABEL: <$x.10>:
+; CHECK-LABEL: <$d>:
+; CHECK-LABEL: <$x>:
 ; CHECK-NEXT:    b {{.*}} <test3+0x34>
-; CHECK-LABEL: <$x.12>:
+; CHECK-LABEL: <$x>:
 ; CHECK-NEXT:    ldr x30, [sp], #16
 ; CHECK-NEXT:    ret
 define internal i1 @test3() {
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index 14cb0c82b1c03..4b816df75a730 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -258,3 +258,246 @@ define i32 @neg_range_int(i32 %a, i32 %b, i32 %c) {
   ret i32 %retval.0
 }
 
+; (b > -(d | 1) && a < c)
+define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #4, lt
+; SDISEL-NEXT:    csel w0, w1, w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #4, lt
+; GISEL-NEXT:    csel w0, w1, w0, gt
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp sgt i32 %b, %negd
+  %cmp1 = icmp slt i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b >u -(d | 1) && a < c)
+define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp_u:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #0, lt
+; SDISEL-NEXT:    csel w0, w1, w0, hi
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp_u:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #0, lt
+; GISEL-NEXT:    csel w0, w1, w0, hi
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp ugt i32 %b, %negd
+  %cmp1 = icmp slt i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b > -(d | 1) && a u < c)
+define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp_ua:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #4, lo
+; SDISEL-NEXT:    csel w0, w1, w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp_ua:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #4, lo
+; GISEL-NEXT:    csel w0, w1, w0, gt
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp sgt i32 %b, %negd
+  %cmp1 = icmp ult i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b <= -3 && a > c)
+define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) {
+; SDISEL-LABEL: neg_range_int_2:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, #4, #4, gt
+; SDISEL-NEXT:    csel w0, w1, w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_2:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    ccmn w1, #3, #8, gt
+; GISEL-NEXT:    csel w0, w1, w0, ge
+; GISEL-NEXT:    ret
+  %cmp = icmp sge i32 %b, -3
+  %cmp1 = icmp sgt i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b < -(d | 1) && a >= c)
+define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp2:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #0, ge
+; SDISEL-NEXT:    csel w0, w1, w0, lt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp2:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #0, ge
+; GISEL-NEXT:    csel w0, w1, w0, lt
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp slt i32 %b, %negd
+  %cmp1 = icmp sge i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b <u -(d | 1) && a > c)
+define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp_u2:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #2, gt
+; SDISEL-NEXT:    csel w0, w1, w0, lo
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp_u2:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #2, gt
+; GISEL-NEXT:    csel w0, w1, w0, lo
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp ult i32 %b, %negd
+  %cmp1 = icmp sgt i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b > -(d | 1) && a u > c)
+define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp_ua2:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #4, hi
+; SDISEL-NEXT:    csel w0, w1, w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp_ua2:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #4, hi
+; GISEL-NEXT:    csel w0, w1, w0, gt
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp sgt i32 %b, %negd
+  %cmp1 = icmp ugt i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; (b > -(d | 1) && a u == c)
+define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SDISEL-LABEL: neg_range_int_comp_ua3:
+; SDISEL:       // %bb.0:
+; SDISEL-NEXT:    orr w8, w3, #0x1
+; SDISEL-NEXT:    cmp w0, w2
+; SDISEL-NEXT:    ccmn w1, w8, #4, eq
+; SDISEL-NEXT:    csel w0, w1, w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_comp_ua3:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    orr w8, w3, #0x1
+; GISEL-NEXT:    cmp w0, w2
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    ccmp w1, w8, #4, eq
+; GISEL-NEXT:    csel w0, w1, w0, gt
+; GISEL-NEXT:    ret
+  %dor = or i32 %d, 1
+  %negd = sub i32 0, %dor
+  %cmp = icmp sgt i32 %b, %negd
+  %cmp1 = icmp eq i32 %a, %c
+  %or.cond = and i1 %cmp, %cmp1
+  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
+  ret i32 %retval.0
+}
+
+; -(a | 1) > (b | 3) && a < c
+define i32 @neg_range_int_c(i32 %a, i32 %b, i32 %c) {
+; SDISEL-LABEL: neg_range_int_c:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    orr w8, w0, #0x1
+; SDISEL-NEXT:    orr w9, w1, #0x3
+; SDISEL-NEXT:    cmn w9, w8
+; SDISEL-NEXT:    ccmp w2, w0, #2, lo
+; SDISEL-NEXT:    cset w0, lo
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: neg_range_int_c:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    orr w8, w0, #0x1
+; GISEL-NEXT:    orr w9, w1, #0x3
+; GISEL-NEXT:    neg w8, w8
+; GISEL-NEXT:    cmp w9, w8
+; GISEL-NEXT:    cset w8, lo
+; GISEL-NEXT:    cmp w2, w0
+; GISEL-NEXT:    cset w9, lo
+; GISEL-NEXT:    and w0, w8, w9
+; GISEL-NEXT:    ret
+entry:
+  %or = or i32 %a, 1
+  %sub = sub i32 0, %or
+  %or1 = or i32 %b, 3
+  %cmp = icmp ult i32 %or1, %sub
+  %cmp2 = icmp ult i32 %c, %a
+  %0 = and i1 %cmp, %cmp2
+  %land.ext = zext i1 %0 to i32
+  ret i32 %land.ext
+}
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index 09a6e26fe5a40..b4f179e992a0d 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -241,25 +241,228 @@ define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) {
 define <4 x i65> @sign_4xi65(<4 x i65> %a) {
 ; CHECK-LABEL: sign_4xi65:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sbfx x8, x1, #0, #1
-; CHECK-NEXT:    sbfx x9, x5, #0, #1
-; CHECK-NEXT:    sbfx x10, x3, #0, #1
-; CHECK-NEXT:    lsr x1, x8, #63
-; CHECK-NEXT:    orr x8, x8, #0x1
-; CHECK-NEXT:    lsr x3, x10, #63
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    sbfx x8, x7, #0, #1
-; CHECK-NEXT:    lsr x5, x9, #63
-; CHECK-NEXT:    orr x2, x10, #0x1
-; CHECK-NEXT:    orr x4, x9, #0x1
-; CHECK-NEXT:    lsr x7, x8, #63
-; CHECK-NEXT:    orr x6, x8, #0x1
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    sbfx x8, x5, #0, #1
+; CHECK-NEXT:    sbfx x9, x3, #0, #1
+; CHECK-NEXT:    sbfx x10, x1, #0, #1
+; CHECK-NEXT:    sbfx x11, x7, #0, #1
+; CHECK-NEXT:    lsr x1, x10, #63
+; CHECK-NEXT:    lsr x3, x9, #63
+; CHECK-NEXT:    lsr x5, x8, #63
+; CHECK-NEXT:    lsr x7, x11, #63
+; CHECK-NEXT:    orr x0, x10, #0x1
+; CHECK-NEXT:    orr x2, x9, #0x1
+; CHECK-NEXT:    orr x4, x8, #0x1
+; CHECK-NEXT:    orr x6, x11, #0x1
 ; CHECK-NEXT:    ret
   %c = icmp sgt <4 x i65> %a, <i65 -1, i65 -1, i65 -1, i65 -1>
   %res = select <4 x i1> %c, <4 x i65> <i65 1, i65 1, i65 1, i65 1>, <4 x i65 > <i65 -1, i65 -1, i65 -1, i65 -1>
   ret <4 x i65> %res
 }
 
+define i32 @or_neg(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp sgt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp ugt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; Negative test
+
+define i32 @or_neg_no_smin(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_no_smin:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w0
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %4 = sub i32 0, %x
+  %5 = icmp sgt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; Negative test
+
+define i32 @or_neg_ult_no_zero(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_ult_no_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w0
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %4 = sub i32 0, %x
+  %5 = icmp ult i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_no_smin_but_zero(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_no_smin_but_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic w8, w0, w0, asr #31
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %3 = call i32 @llvm.smax.i32(i32 %x, i32 0)
+  %4 = sub i32 0, %3
+  %5 = icmp sgt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_slt_zero_but_no_smin(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_slt_zero_but_no_smin:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9 // =0x9
+; CHECK-NEXT:    cmp w0, #9
+; CHECK-NEXT:    csel w8, w0, w8, lo
+; CHECK-NEXT:    neg w8, w8
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %3 = call i32 @llvm.umin.i32(i32 %x, i32 9)
+  %4 = sub i32 0, %3
+  %5 = icmp ugt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg2(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, le
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp sge i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg3(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp slt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg4(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, ge
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp sle i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %3 = or i32 %x, 1
+  %4 = sub i32 0, %3
+  %5 = icmp ugt i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_no_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_no_smin2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w0
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, ge
+; CHECK-NEXT:    ret
+  %4 = sub i32 0, %x
+  %5 = icmp sge i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; Negative test
+
+define i32 @or_neg_ult_no_zero2(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_ult_no_zero2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w0
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %4 = sub i32 0, %x
+  %5 = icmp ult i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_no_smin_but_zero2(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_no_smin_but_zero2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic w8, w0, w0, asr #31
+; CHECK-NEXT:    cmn w1, w8
+; CHECK-NEXT:    cset w0, ge
+; CHECK-NEXT:    ret
+  %3 = call i32 @llvm.smax.i32(i32 %x, i32 0)
+  %4 = sub i32 0, %3
+  %5 = icmp sle i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+define i32 @or_neg_slt_zero_but_no_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg_slt_zero_but_no_smin2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9 // =0x9
+; CHECK-NEXT:    cmp w0, #9
+; CHECK-NEXT:    csel w8, w0, w8, lo
+; CHECK-NEXT:    neg w8, w8
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    cset w0, hs
+; CHECK-NEXT:    ret
+  %3 = call i32 @llvm.umin.i32(i32 %x, i32 9)
+  %4 = sub i32 0, %3
+  %5 = icmp uge i32 %4, %y
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
 declare void @use_4xi1(<4 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 2ef35283568c3..1c8a8d635274e 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -229,7 +229,8 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x2, x2, x11
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    cmpne p2.d, p0/z, z2.d, #0
+; CHECK-NEXT:    and p1.b, p1/z, p1.b, p2.b
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
 ; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #1, mul vl]
diff --git a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
index f53fd441781a5..6e603b7064f8f 100644
--- a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
+++ b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
@@ -7,6 +7,12 @@
 ; RUN: llc -mtriple=aarch64-apple-tvos6.0 < %s | FileCheck -check-prefix=APPLE %s
 ; RUN: llc -mtriple=aarch64-apple-xros6.0 < %s | FileCheck -check-prefix=APPLE %s
 ; RUN: llc -mtriple=aarch64-apple-xros1.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit1.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit24.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-bridgeos < %s | FileCheck -check-prefix=BRIDGEOS %s
+; RUN: llc -mtriple=arm64-apple-bridgeos1.0 < %s | FileCheck -check-prefix=BRIDGEOS %s
+; RUN: llc -mtriple=arm64-apple-bridgeos9.0 < %s | FileCheck -check-prefix=BRIDGEOS %s
 
 ; RUN: not llc -mtriple=aarch64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
 ; RUN: not llc -mtriple=aarch64-apple-ios6.0 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
@@ -23,6 +29,11 @@ define float @test_exp10_f32(float %x) {
 ; APPLE-LABEL: test_exp10_f32:
 ; APPLE:       ; %bb.0:
 ; APPLE-NEXT:    b ___exp10f
+;
+; BRIDGEOS-LABEL: test_exp10_f32:
+; BRIDGEOS:       // %bb.0:
+; BRIDGEOS-NEXT:    b __exp10f
+;
   %ret = call float @llvm.exp10.f32(float %x)
   ret float %ret
 }
@@ -35,6 +46,11 @@ define double @test_exp10_f64(double %x) {
 ; APPLE-LABEL: test_exp10_f64:
 ; APPLE:       ; %bb.0:
 ; APPLE-NEXT:    b ___exp10
+;
+; BRIDGEOS-LABEL: test_exp10_f64:
+; BRIDGEOS:       // %bb.0:
+; BRIDGEOS-NEXT:    b __exp10
+;
   %ret = call double @llvm.exp10.f64(double %x)
   ret double %ret
 }
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 998f7dd38c3f5..8710703ab970e 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -760,6 +760,12 @@ declare half @llvm.powi.f16.i32(half %a, i32 %b) #0
 declare half @llvm.sin.f16(half %a) #0
 declare half @llvm.cos.f16(half %a) #0
 declare half @llvm.tan.f16(half %a) #0
+declare half @llvm.asin.f16(half %a) #0
+declare half @llvm.acos.f16(half %a) #0
+declare half @llvm.atan.f16(half %a) #0
+declare half @llvm.sinh.f16(half %a) #0
+declare half @llvm.cosh.f16(half %a) #0
+declare half @llvm.tanh.f16(half %a) #0
 declare half @llvm.pow.f16(half %a, half %b) #0
 declare half @llvm.exp.f16(half %a) #0
 declare half @llvm.exp2.f16(half %a) #0
@@ -896,6 +902,156 @@ define half @test_tan(half %a) #0 {
   ret half %r
 }
 
+; FALLBACK-NOT: remark:{{.*}}test_acos
+; FALLBACK-FP16-NOT: remark:{{.*}}test_acos
+
+; CHECK-COMMON-LABEL: test_acos:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}acosf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_acos:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}acosf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_acos(half %a) #0 {
+  %r = call half @llvm.acos.f16(half %a)
+  ret half %r
+}
+
+; FALLBACK-NOT: remark:{{.*}}test_asin
+; FALLBACK-FP16-NOT: remark:{{.*}}test_asin
+
+; CHECK-COMMON-LABEL: test_asin:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}asinf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_asin:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}asinf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_asin(half %a) #0 {
+  %r = call half @llvm.asin.f16(half %a)
+  ret half %r
+}
+
+; FALLBACK-NOT: remark:{{.*}}test_atan
+; FALLBACK-FP16-NOT: remark:{{.*}}test_atan
+
+; CHECK-COMMON-LABEL: test_atan:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}atanf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_atan:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}atanf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_atan(half %a) #0 {
+  %r = call half @llvm.atan.f16(half %a)
+  ret half %r
+}
+
+; FALLBACK-NOT: remark:{{.*}}test_cosh
+; FALLBACK-FP16-NOT: remark:{{.*}}test_cosh
+
+; CHECK-COMMON-LABEL: test_cosh:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}coshf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_cosh:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}coshf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_cosh(half %a) #0 {
+  %r = call half @llvm.cosh.f16(half %a)
+  ret half %r
+}
+
+; FALLBACK-NOT: remark:{{.*}}test_sinh
+; FALLBACK-FP16-NOT: remark:{{.*}}test_sinh
+
+; CHECK-COMMON-LABEL: test_sinh:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}sinhf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_sinh:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}sinhf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_sinh(half %a) #0 {
+  %r = call half @llvm.sinh.f16(half %a)
+  ret half %r
+}
+
+; FALLBACK-NOT: remark:{{.*}}test_tanh
+; FALLBACK-FP16-NOT: remark:{{.*}}test_tanh
+
+; CHECK-COMMON-LABEL: test_tanh:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}tanhf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_tanh:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}tanhf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_tanh(half %a) #0 {
+  %r = call half @llvm.tanh.f16(half %a)
+  ret half %r
+}
+
 ; CHECK-COMMON-LABEL: test_pow:
 ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
 ; CHECK-COMMON-NEXT: mov  x29, sp
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
index 65701343ccc1e..6e55bf4968e78 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
@@ -1,15 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SDAGISEL
 ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FASTISEL
 ; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
 
 ; First test the different supported value types for select.
 define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
-; GISEL-LABEL: select_i1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    csel w0, w1, w2, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_i1:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_i1:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -27,11 +28,12 @@ define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
 }
 
 define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
-; GISEL-LABEL: select_i8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    csel w0, w1, w2, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_i8:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_i8:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -49,11 +51,12 @@ define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
 }
 
 define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
-; GISEL-LABEL: select_i16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    csel w0, w1, w2, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_i16:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_i16:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -71,11 +74,12 @@ define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
 }
 
 define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
-; GISEL-LABEL: select_i32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    csel w0, w1, w2, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_i32:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_i32:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -92,11 +96,12 @@ define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
 }
 
 define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
-; GISEL-LABEL: select_i64:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    csel x0, x1, x2, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_i64:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    csel x0, x1, x2, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_i64:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -113,11 +118,12 @@ define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
 }
 
 define float @select_f32(i1 zeroext %c, float %a, float %b) {
-; GISEL-LABEL: select_f32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    fcsel s0, s0, s1, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_f32:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_f32:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -134,11 +140,12 @@ define float @select_f32(i1 zeroext %c, float %a, float %b) {
 }
 
 define double @select_f64(i1 zeroext %c, double %a, double %b) {
-; GISEL-LABEL: select_f64:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    tst w0, #0x1
-; GISEL-NEXT:    fcsel d0, d0, d1, ne
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_f64:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    cmp w0, #0
+; CHECK-SDAGISEL-NEXT:    fcsel d0, d0, d1, ne
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_f64:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -156,6 +163,11 @@ define double @select_f64(i1 zeroext %c, double %a, double %b) {
 
 ; Now test the folding of all compares.
 define float @select_fcmp_false(float %x, float %a, float %b) {
+; CHECK-SDAGISEL-LABEL: select_fcmp_false:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    fmov s0, s2
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_fcmp_false:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fmov s0, s2
@@ -166,11 +178,6 @@ define float @select_fcmp_false(float %x, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    fcmp s0, s0
 ; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, gt
 ; CHECK-GISEL-NEXT:    ret
-; GISEL-LABEL: select_fcmp_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s0
-; GISEL-NEXT:    fcsel s0, s1, s2, gt
-; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -182,11 +189,6 @@ define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, gt
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ogt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, gt
-; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -198,11 +200,6 @@ define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ge
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_oge:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, ge
-; GISEL-NEXT:    ret
   %1 = fcmp oge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -214,11 +211,6 @@ define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, mi
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_olt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, mi
-; GISEL-NEXT:    ret
   %1 = fcmp olt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -230,17 +222,19 @@ define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ls
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ole:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, ls
-; GISEL-NEXT:    ret
   %1 = fcmp ole float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
+; CHECK-SDAGISEL-LABEL: select_fcmp_one:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    fcmp s0, s1
+; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s3, mi
+; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s0, gt
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_fcmp_one:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fcmp s0, s1
@@ -257,15 +251,6 @@ define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-GISEL-NEXT:    ret
-; GISEL-LABEL: select_fcmp_one:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    cset w8, mi
-; GISEL-NEXT:    cset w9, gt
-; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    fcsel s0, s2, s3, ne
-; GISEL-NEXT:    ret
   %1 = fcmp one float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -277,11 +262,6 @@ define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, vc
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ord:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, vc
-; GISEL-NEXT:    ret
   %1 = fcmp ord float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -293,17 +273,19 @@ define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, vs
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_uno:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, vs
-; GISEL-NEXT:    ret
   %1 = fcmp uno float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
+; CHECK-SDAGISEL-LABEL: select_fcmp_ueq:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    fcmp s0, s1
+; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s3, eq
+; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s0, vs
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_fcmp_ueq:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fcmp s0, s1
@@ -320,15 +302,6 @@ define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-GISEL-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ueq:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    cset w8, eq
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    fcsel s0, s2, s3, ne
-; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -340,11 +313,6 @@ define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, hi
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ugt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, hi
-; GISEL-NEXT:    ret
   %1 = fcmp ugt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -356,11 +324,6 @@ define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, pl
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_uge:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, pl
-; GISEL-NEXT:    ret
   %1 = fcmp uge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -372,11 +335,6 @@ define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, lt
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ult:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, lt
-; GISEL-NEXT:    ret
   %1 = fcmp ult float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -389,11 +347,6 @@ define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, le
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_ule:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, le
-; GISEL-NEXT:    ret
   %1 = fcmp ule float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -405,17 +358,17 @@ define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_fcmp_une:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s1
-; GISEL-NEXT:    fcsel s0, s2, s3, ne
-; GISEL-NEXT:    ret
   %1 = fcmp une float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_true(float %x, float %a, float %b) {
+; CHECK-SDAGISEL-LABEL: select_fcmp_true:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    fmov s0, s1
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_fcmp_true:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fmov s0, s1
@@ -430,15 +383,6 @@ define float @select_fcmp_true(float %x, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, ne
 ; CHECK-GISEL-NEXT:    ret
-; GISEL-LABEL: select_fcmp_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    fcmp s0, s0
-; GISEL-NEXT:    cset w8, eq
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    fcsel s0, s1, s2, ne
-; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -450,11 +394,6 @@ define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, eq
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_eq:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, eq
-; GISEL-NEXT:    ret
   %1 = icmp eq i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -466,11 +405,6 @@ define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_ne:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, ne
-; GISEL-NEXT:    ret
   %1 = icmp ne i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -482,11 +416,6 @@ define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, hi
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_ugt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, hi
-; GISEL-NEXT:    ret
   %1 = icmp ugt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -498,11 +427,6 @@ define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, hs
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_uge:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, hs
-; GISEL-NEXT:    ret
   %1 = icmp uge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -514,11 +438,6 @@ define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, lo
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_ult:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, lo
-; GISEL-NEXT:    ret
   %1 = icmp ult i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -530,11 +449,6 @@ define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ls
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_ule:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, ls
-; GISEL-NEXT:    ret
   %1 = icmp ule i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -546,11 +460,6 @@ define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, gt
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_sgt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, gt
-; GISEL-NEXT:    ret
   %1 = icmp sgt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -562,11 +471,6 @@ define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ge
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_sge:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, ge
-; GISEL-NEXT:    ret
   %1 = icmp sge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -578,11 +482,6 @@ define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, lt
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_slt:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, lt
-; GISEL-NEXT:    ret
   %1 = icmp slt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -594,11 +493,6 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, le
 ; CHECK-NEXT:    ret
-; GISEL-LABEL: select_icmp_sle:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmp w0, w1
-; GISEL-NEXT:    fcsel s0, s0, s1, le
-; GISEL-NEXT:    ret
   %1 = icmp sle i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -606,33 +500,22 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
 
 ; Test peephole optimizations for select.
 define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
-; GISEL-LABEL: select_opt1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    orr w8, w0, w1
-; GISEL-NEXT:    and w0, w8, #0x1
-; GISEL-NEXT:    ret
-; CHECK-FASTISEL-LABEL: select_opt1:
-; CHECK-FASTISEL:       ; %bb.0:
-; CHECK-FASTISEL-NEXT:    orr w8, w0, w1
-; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
-; CHECK-FASTISEL-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: select_opt1:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    orr w8, w0, w1
-; CHECK-GISEL-NEXT:    and w0, w8, #0x1
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: select_opt1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
   %1 = select i1 %c, i1 true, i1 %a
   ret i1 %1
 }
 
 define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
-; GISEL-LABEL: select_opt2:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    eor w8, w0, #0x1
-; GISEL-NEXT:    orr w8, w8, w1
-; GISEL-NEXT:    and w0, w8, #0x1
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_opt2:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    orn w8, w1, w0
+; CHECK-SDAGISEL-NEXT:    and w0, w8, #0x1
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_opt2:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    eor w8, w0, #0x1
@@ -651,11 +534,12 @@ define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
 }
 
 define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
-; GISEL-LABEL: select_opt3:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    eor w8, w0, #0x1
-; GISEL-NEXT:    and w0, w8, w1
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_opt3:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    eor w8, w0, #0x1
+; CHECK-SDAGISEL-NEXT:    and w0, w8, w1
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_opt3:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    bic w8, w1, w0
@@ -672,10 +556,11 @@ define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
 }
 
 define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
-; GISEL-LABEL: select_opt4:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    and w0, w0, w1
-; GISEL-NEXT:    ret
+; CHECK-SDAGISEL-LABEL: select_opt4:
+; CHECK-SDAGISEL:       ; %bb.0:
+; CHECK-SDAGISEL-NEXT:    and w0, w0, w1
+; CHECK-SDAGISEL-NEXT:    ret
+;
 ; CHECK-FASTISEL-LABEL: select_opt4:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    and w8, w0, w1
diff --git a/llvm/test/CodeGen/AArch64/fcmp-fp128.ll b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
new file mode 100644
index 0000000000000..503cb8c533bab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
@@ -0,0 +1,560 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-GI
+
+; Checks generated libcalls for fp128 types
+
+define double @oeq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: oeq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: oeq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp oeq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ogt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ogt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ogt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ogt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @olt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: olt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: olt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp olt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ole(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ole:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ole:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ole fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @one(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: one:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w19, ne
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w8, eq
+; CHECK-SD-NEXT:    tst w8, w19
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: one:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp one fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ord(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ord:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ord:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ord fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uno(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uno:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uno:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uno fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ueq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ueq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ueq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ueq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ugt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ugt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ugt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ugt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uge(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uge:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ge
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uge:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ge
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uge fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ult(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ult:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ult:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ult fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ule(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ule:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ule:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ule fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @une(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: une:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __netf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: une:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __netf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp une fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a76d0b36fa1aa..a5d7ae147ffda 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,37 +4,49 @@
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-
-; CHECK-GI:      warning: Instruction selection used fallback path for f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_float
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_half
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_fp128
+; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_double
-
 
 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
-; CHECK-LABEL: f128_fp128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    b.ge .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %entry
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:  .LBB0_2: // %entry
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_fp128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    b.ge .LBB0_2
+; CHECK-SD-NEXT:  // %bb.1: // %entry
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:  .LBB0_2: // %entry
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_fp128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov d0, v2.d[1]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fcsel d2, d2, d3, lt
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    fcsel d1, d0, d1, lt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, fp128 %d, fp128 %e
@@ -42,37 +54,61 @@ entry:
 }
 
 define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
-; CHECK-LABEL: f128_i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    mov x21, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel x20, x22, x20, lt
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    csel x1, x21, x19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov x21, x1
+; CHECK-SD-NEXT:    mov x22, x0
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel x20, x22, x20, lt
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    mov w8, w0
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w8, #0
+; CHECK-SD-NEXT:    csel x1, x21, x19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x21, x2
+; CHECK-GI-NEXT:    mov x22, x3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x0, x19, x21, lt
+; CHECK-GI-NEXT:    csel x1, x20, x22, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i128 %d, i128 %e
@@ -80,22 +116,39 @@ entry:
 }
 
 define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
-; CHECK-LABEL: f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov d8, d3
-; CHECK-NEXT:    fmov d9, d2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel d0, d9, d8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, double %d, double %e
@@ -103,22 +156,39 @@ entry:
 }
 
 define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
-; CHECK-LABEL: f128_float:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov s8, s3
-; CHECK-NEXT:    fmov s9, s2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_float:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov s8, s3
+; CHECK-SD-NEXT:    fmov s9, s2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel s0, s9, s8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_float:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel s0, s8, s9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, float %d, float %e
@@ -126,22 +196,39 @@ entry:
 }
 
 define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
-; CHECK-LABEL: f128_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov w19, w1
-; CHECK-NEXT:    mov w20, w0
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel w0, w20, w19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov w19, w1
+; CHECK-SD-NEXT:    mov w20, w0
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel w0, w20, w19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    mov w20, w1
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel w0, w19, w20, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i32 %d, i32 %e
@@ -184,40 +271,26 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
 ; CHECK-SD-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-NOFP16-LABEL: f128_half:
-; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-NOFP16-NEXT:    fmov s8, s3
-; CHECK-GI-NOFP16-NEXT:    fmov s9, s2
-; CHECK-GI-NOFP16-NEXT:    bl __lttf2
-; CHECK-GI-NOFP16-NEXT:    cmp w0, #0
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-GI-NOFP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ret
-;
-; CHECK-GI-FP16-LABEL: f128_half:
-; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-FP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-FP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-FP16-NEXT:    fmov s8, s3
-; CHECK-GI-FP16-NEXT:    fmov s9, s2
-; CHECK-GI-FP16-NEXT:    bl __lttf2
-; CHECK-GI-FP16-NEXT:    cmp w0, #0
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    fcsel h0, h9, h8, lt
-; CHECK-GI-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ret
+; CHECK-GI-LABEL: f128_half:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov w8, s8
+; CHECK-GI-NEXT:    fmov w9, s9
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    csel w8, w8, w9, lt
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, half %d, half %e
@@ -436,37 +509,64 @@ entry:
 
 
 define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> %d, <2 x double> %e) {
-; CHECK-LABEL: v2f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
-; CHECK-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
-; CHECK-NEXT:    add sp, sp, #96
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #96
+; CHECK-SD-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    add sp, sp, #96
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v2.16b
+; CHECK-GI-NEXT:    stp q5, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w19, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <2 x fp128> %a, %b
   %s = select <2 x i1> %c, <2 x double> %d, <2 x double> %e
@@ -474,61 +574,122 @@ entry:
 }
 
 define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> %d, <3 x double> %e) {
-; CHECK-LABEL: v3f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #160
-; CHECK-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 160
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-NEXT:    ldr d5, [sp, #184]
-; CHECK-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp d3, d2, [sp, #168]
-; CHECK-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr d5, [sp, #160]
-; CHECK-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    bsl v2.16b, v4.16b, v3.16b
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    add sp, sp, #160
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #160
+; CHECK-SD-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    ldr d5, [sp, #184]
+; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d5, [sp, #160]
+; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d1, x8
+; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    add sp, sp, #160
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #176
+; CHECK-GI-NEXT:    str x30, [sp, #128] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 176
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    stp q4, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    ldr x19, [sp, #176]
+; CHECK-GI-NEXT:    stp q5, q2, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #184]
+; CHECK-GI-NEXT:    ldr x20, [sp, #200]
+; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    str q7, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q2, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #192]
+; CHECK-GI-NEXT:    stp q6, q2, [sp, #80] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w21, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w22, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q0, q2, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x8, x21, #0, #1
+; CHECK-GI-NEXT:    ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x9, x22, #0, #1
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v2.d[1], v0.d[0]
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    and x9, x19, x8
+; CHECK-GI-NEXT:    bic x8, x20, x8
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    orr x8, x9, x8
+; CHECK-GI-NEXT:    bic v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    add sp, sp, #176
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x fp128> %a, %b
   %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e
diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 62669a6d99eae..251d7a424175b 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -466,72 +466,19 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    movi v1.8h, #68, lsl #8
 ; CHECK-NO16-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-NO16-NEXT:    mov w8, #32767 // =0x7fff
 ; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v0.8h
-; CHECK-NO16-NEXT:    mov w11, #-32768 // =0xffff8000
 ; CHECK-NO16-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-NO16-NEXT:    fcvtl2 v1.4s, v1.8h
 ; CHECK-NO16-NEXT:    fmul v2.4s, v2.4s, v3.4s
 ; CHECK-NO16-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-NO16-NEXT:    fcvtn v1.4h, v2.4s
 ; CHECK-NO16-NEXT:    fcvtn2 v1.8h, v0.4s
-; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v1.8h
-; CHECK-NO16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-NO16-NEXT:    mov s2, v0.s[1]
-; CHECK-NO16-NEXT:    fcvtzs w10, s0
-; CHECK-NO16-NEXT:    fcvtzs w15, s1
-; CHECK-NO16-NEXT:    fcvtzs w9, s2
-; CHECK-NO16-NEXT:    mov s2, v0.s[2]
-; CHECK-NO16-NEXT:    mov s0, v0.s[3]
-; CHECK-NO16-NEXT:    cmp w9, w8
-; CHECK-NO16-NEXT:    fcvtzs w12, s2
-; CHECK-NO16-NEXT:    mov s2, v1.s[1]
-; CHECK-NO16-NEXT:    csel w9, w9, w8, lt
-; CHECK-NO16-NEXT:    fcvtzs w13, s0
-; CHECK-NO16-NEXT:    mov s0, v1.s[2]
-; CHECK-NO16-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w9, w9, w11, gt
-; CHECK-NO16-NEXT:    cmp w10, w8
-; CHECK-NO16-NEXT:    csel w10, w10, w8, lt
-; CHECK-NO16-NEXT:    fcvtzs w14, s2
-; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    fcvtzs w16, s0
-; CHECK-NO16-NEXT:    mov s0, v1.s[3]
-; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
-; CHECK-NO16-NEXT:    cmp w12, w8
-; CHECK-NO16-NEXT:    csel w12, w12, w8, lt
-; CHECK-NO16-NEXT:    fmov s1, w10
-; CHECK-NO16-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w12, w12, w11, gt
-; CHECK-NO16-NEXT:    cmp w13, w8
-; CHECK-NO16-NEXT:    csel w13, w13, w8, lt
-; CHECK-NO16-NEXT:    mov v1.s[1], w9
-; CHECK-NO16-NEXT:    fcvtzs w9, s0
-; CHECK-NO16-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w13, w13, w11, gt
-; CHECK-NO16-NEXT:    cmp w14, w8
-; CHECK-NO16-NEXT:    csel w14, w14, w8, lt
-; CHECK-NO16-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    mov v1.s[2], w12
-; CHECK-NO16-NEXT:    csel w14, w14, w11, gt
-; CHECK-NO16-NEXT:    cmp w15, w8
-; CHECK-NO16-NEXT:    csel w15, w15, w8, lt
-; CHECK-NO16-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w10, w15, w11, gt
-; CHECK-NO16-NEXT:    cmp w16, w8
-; CHECK-NO16-NEXT:    mov v1.s[3], w13
-; CHECK-NO16-NEXT:    fmov s2, w10
-; CHECK-NO16-NEXT:    csel w10, w16, w8, lt
-; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
-; CHECK-NO16-NEXT:    cmp w9, w8
-; CHECK-NO16-NEXT:    mov v2.s[1], w14
-; CHECK-NO16-NEXT:    csel w8, w9, w8, lt
-; CHECK-NO16-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w8, w8, w11, gt
-; CHECK-NO16-NEXT:    mov v2.s[2], w10
-; CHECK-NO16-NEXT:    mov v2.s[3], w8
-; CHECK-NO16-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-NO16-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-NO16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NO16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NO16-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NO16-NEXT:    sqxtn v0.4h, v0.4s
+; CHECK-NO16-NEXT:    sqxtn2 v0.8h, v1.4s
 ; CHECK-NO16-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_v8f16_sat:
diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
index dff216192a6c3..ab7c2afbbf871 100644
--- a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -101,4 +101,162 @@ define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
   ret <8 x half> %v
 }
 
+; int-to-fp conversion of element in lane 0 should apply
+; cvtf on vector subregister to avoid fpr->gpr trip
+define float @l0_extract_f_v2s(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 0
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf to use ssub for bottom 32-bits from v2i32
+define float @l0_extract_f_v2u(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 0
+  %f = uitofp i32 %i to float
+  ret float %f
+}
+
+; Pattern should only apply when it is known to be lane 0
+define float @ln_extract_f_v2s(<2 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    bfi x8, x0, #2, #1
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 %n
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf to use ssub for bottom 32-bits from v4i32
+define float @l0_extract_f_v4s(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 0
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+define float @l0_extract_f_v4u(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 0
+  %f = uitofp i32 %i to float
+  ret float %f
+}
+
+define float @ln_extract_f_v4s(<4 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 %n
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf to use dsub for bottom 64-bits from v2i64
+define double @l0_extract_d_v2s(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf d0, d0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 0
+  %f = sitofp i64 %i to double
+  ret double %f
+}
+
+define double @l0_extract_d_v2u(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf d0, d0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 0
+  %f = uitofp i64 %i to double
+  ret double %f
+}
+
+define double @ln_extract_d_v2s(<2 x i64> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_d_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #3, #1
+; CHECK-NEXT:    ldr d0, [x8]
+; CHECK-NEXT:    scvtf d0, d0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 %n
+  %f = sitofp i64 %i to double
+  ret double %f
+}
+
+; (fullfp16) cvtf to use hsub for bottom 16-bits from v8i16
+define half @l0_extract_h_v8s(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf h0, h0
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 0
+  %f = sitofp i16 %i to half
+  ret half %f
+}
+
+define half @l0_extract_h_v8u(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf h0, h0
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 0
+  %f = uitofp i16 %i to half
+  ret half %f
+}
+
+define half @ln_extract_h_v8u(<8 x i16> %u, i32 %n) #0 {
+; CHECK-LABEL: ln_extract_h_v8u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #1, #3
+; CHECK-NEXT:    ldrh w8, [x8]
+; CHECK-NEXT:    ucvtf h0, w8
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 %n
+  %f = uitofp i16 %i to half
+  ret half %f
+}
+
 attributes #0 = { "target-features"="+fullfp16"}
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
index b09ed8d3eb764..cbdfb4c932775 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
@@ -353,6 +353,96 @@ define half @tan_f16(half %x) #0 {
   ret half %val
 }
 
+define half @asin_f16(half %x) #0 {
+; CHECK-LABEL: asin_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl asinf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.asin.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @acos_f16(half %x) #0 {
+; CHECK-LABEL: acos_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl acosf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.acos.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @atan_f16(half %x) #0 {
+; CHECK-LABEL: atan_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl atanf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.atan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sinh_f16(half %x) #0 {
+; CHECK-LABEL: sinh_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sinhf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.sinh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @cosh_f16(half %x) #0 {
+; CHECK-LABEL: cosh_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl coshf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.cosh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @tanh_f16(half %x) #0 {
+; CHECK-LABEL: tanh_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl tanhf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.tanh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
 define half @pow_f16(half %x, half %y) #0 {
 ; CHECK-LABEL: pow_f16:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
index 67d0b63f4076f..62b4a79b26d8e 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
@@ -153,6 +153,48 @@ define float @tan_f32(float %x) #0 {
   ret float %val
 }
 
+; CHECK-LABEL: asin_f32:
+; CHECK: bl asinf
+define float @asin_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: acos_f32:
+; CHECK: bl acosf
+define float @acos_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: atan_f32:
+; CHECK: bl atanf
+define float @atan_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: sinh_f32:
+; CHECK: bl sinhf
+define float @sinh_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: cosh_f32:
+; CHECK: bl coshf
+define float @cosh_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: tanh_f32:
+; CHECK: bl tanhf
+define float @tanh_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
 ; CHECK-LABEL: pow_f32:
 ; CHECK: bl powf
 define float @pow_f32(float %x, float %y) #0 {
@@ -644,6 +686,48 @@ define double @tan_f64(double %x) #0 {
   ret double %val
 }
 
+; CHECK-LABEL: asin_f64:
+; CHECK: bl asin
+define double @asin_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.asin.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: acos_f64:
+; CHECK: bl acos
+define double @acos_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.acos.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: atan_f64:
+; CHECK: bl atan
+define double @atan_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.atan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: sinh_f64:
+; CHECK: bl sinh
+define double @sinh_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.sinh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: cosh_f64:
+; CHECK: bl cosh
+define double @cosh_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.cosh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: tanh_f64:
+; CHECK: bl tanh
+define double @tanh_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.tanh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
 ; CHECK-LABEL: pow_f64:
 ; CHECK: bl pow
 define double @pow_f64(double %x, double %y) #0 {
@@ -1135,6 +1219,48 @@ define fp128 @tan_f128(fp128 %x) #0 {
   ret fp128 %val
 }
 
+; CHECK-LABEL: asin_f128:
+; CHECK: bl asinl
+define fp128 @asin_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.asin.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
+; CHECK-LABEL: acos_f128:
+; CHECK: bl acosl
+define fp128 @acos_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.acos.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
+; CHECK-LABEL: atan_f128:
+; CHECK: bl atanl
+define fp128 @atan_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.atan.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
+; CHECK-LABEL: sinh_f128:
+; CHECK: bl sinhl
+define fp128 @sinh_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.sinh.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
+; CHECK-LABEL: cosh_f128:
+; CHECK: bl coshl
+define fp128 @cosh_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.cosh.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
+; CHECK-LABEL: tanh_f128:
+; CHECK: bl tanhl
+define fp128 @tanh_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.tanh.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
 ; CHECK-LABEL: pow_f128:
 ; CHECK: bl powl
 define fp128 @pow_f128(fp128 %x, fp128 %y) #0 {
@@ -1519,6 +1645,48 @@ define <1 x double> @tan_v1f64(<1 x double> %x, <1 x double> %y) #0 {
   ret <1 x double> %val
 }
 
+; CHECK-LABEL: asin_v1f64:
+; CHECK: bl asin
+define <1 x double> @asin_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.asin.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: acos_v1f64:
+; CHECK: bl acos
+define <1 x double> @acos_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.acos.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: atan_v1f64:
+; CHECK: bl atan
+define <1 x double> @atan_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.atan.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: sinh_v1f64:
+; CHECK: bl sinh
+define <1 x double> @sinh_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.sinh.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: cosh_v1f64:
+; CHECK: bl cosh
+define <1 x double> @cosh_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.cosh.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: tanh_v1f64:
+; CHECK: bl tanh
+define <1 x double> @tanh_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.tanh.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
 ; CHECK-LABEL: pow_v1f64:
 ; CHECK: bl pow
 define <1 x double> @pow_v1f64(<1 x double> %x, <1 x double> %y) #0 {
@@ -1584,6 +1752,12 @@ declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, meta
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata)
@@ -1629,6 +1803,12 @@ declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, me
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
@@ -1674,6 +1854,12 @@ declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, met
 declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.cos.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.asin.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.acos.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.atan.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.sinh.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.cosh.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.tanh.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.log.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.log10.f128(fp128, metadata, metadata)
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 4e8bfcd9d7516..0138bef9c3845 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -283,14 +283,12 @@ entry:
 define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-CVT-LABEL: utesth_f16i16:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
-; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v0.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v2.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: utesth_f16i16:
@@ -308,14 +306,12 @@ entry:
 define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-CVT-LABEL: ustest_f16i16:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
-; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v0.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v2.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: ustest_f16i16:
@@ -909,14 +905,12 @@ entry:
 define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-CVT-LABEL: utesth_f16i16_mm:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
-; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v0.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v2.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: utesth_f16i16_mm:
@@ -933,14 +927,12 @@ entry:
 define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-CVT-LABEL: ustest_f16i16_mm:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
-; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v0.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v2.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: ustest_f16i16_mm:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 4723ac01d6021..0c880592d955b 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2287,20 +2287,19 @@ define <2 x i128> @fptos_v2f64_v2i128(<2 x double> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w20, -16
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov d0, v0.d[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixdfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    mov d0, v0.d[1]
 ; CHECK-SD-NEXT:    bl __fixdfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -2345,20 +2344,19 @@ define <2 x i128> @fptou_v2f64_v2i128(<2 x double> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w20, -16
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov d0, v0.d[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixunsdfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    mov d0, v0.d[1]
 ; CHECK-SD-NEXT:    bl __fixunsdfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -2407,28 +2405,26 @@ define <3 x i128> @fptos_v3f64_v3i128(<3 x double> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -48
 ; CHECK-SD-NEXT:    .cfi_offset b8, -56
 ; CHECK-SD-NEXT:    .cfi_offset b9, -64
-; CHECK-SD-NEXT:    fmov d9, d0
-; CHECK-SD-NEXT:    fmov d0, d1
 ; CHECK-SD-NEXT:    fmov d8, d2
+; CHECK-SD-NEXT:    fmov d9, d1
 ; CHECK-SD-NEXT:    bl __fixdfti
-; CHECK-SD-NEXT:    fmov d0, d8
+; CHECK-SD-NEXT:    fmov d0, d9
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
 ; CHECK-SD-NEXT:    bl __fixdfti
-; CHECK-SD-NEXT:    fmov d0, d9
+; CHECK-SD-NEXT:    fmov d0, d8
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
 ; CHECK-SD-NEXT:    bl __fixdfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ret
 ;
@@ -2488,28 +2484,26 @@ define <3 x i128> @fptou_v3f64_v3i128(<3 x double> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -48
 ; CHECK-SD-NEXT:    .cfi_offset b8, -56
 ; CHECK-SD-NEXT:    .cfi_offset b9, -64
-; CHECK-SD-NEXT:    fmov d9, d0
-; CHECK-SD-NEXT:    fmov d0, d1
 ; CHECK-SD-NEXT:    fmov d8, d2
+; CHECK-SD-NEXT:    fmov d9, d1
 ; CHECK-SD-NEXT:    bl __fixunsdfti
-; CHECK-SD-NEXT:    fmov d0, d8
+; CHECK-SD-NEXT:    fmov d0, d9
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
 ; CHECK-SD-NEXT:    bl __fixunsdfti
-; CHECK-SD-NEXT:    fmov d0, d9
+; CHECK-SD-NEXT:    fmov d0, d8
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
 ; CHECK-SD-NEXT:    bl __fixunsdfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3694,20 +3688,19 @@ define <2 x i128> @fptos_v2f32_v2i128(<2 x float> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov s0, v0.s[1]
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixsfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    mov s0, v0.s[1]
 ; CHECK-SD-NEXT:    bl __fixsfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3754,20 +3747,19 @@ define <2 x i128> @fptou_v2f32_v2i128(<2 x float> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov s0, v0.s[1]
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixunssfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    mov s0, v0.s[1]
 ; CHECK-SD-NEXT:    bl __fixunssfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3822,23 +3814,22 @@ define <3 x i128> @fptos_v3f32_v3i128(<3 x float> %a) {
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    mov s0, v0.s[1]
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixsfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    mov s0, v0.s[1]
 ; CHECK-SD-NEXT:    bl __fixsfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x21
-; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
 ; CHECK-SD-NEXT:    mov x4, x19
 ; CHECK-SD-NEXT:    mov x5, x20
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #64
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3904,23 +3895,22 @@ define <3 x i128> @fptou_v3f32_v3i128(<3 x float> %a) {
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    mov s0, v0.s[1]
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixunssfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    mov s0, v0.s[1]
 ; CHECK-SD-NEXT:    bl __fixunssfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x21
-; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
 ; CHECK-SD-NEXT:    mov x4, x19
 ; CHECK-SD-NEXT:    mov x5, x20
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #64
 ; CHECK-SD-NEXT:    ret
 ;
@@ -7034,20 +7024,19 @@ define <2 x i128> @fptos_v2f16_v2i128(<2 x half> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h0, v0.h[1]
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixhfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    mov h0, v0.h[1]
 ; CHECK-SD-NEXT:    bl __fixhfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -7089,20 +7078,19 @@ define <2 x i128> @fptou_v2f16_v2i128(<2 x half> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -32
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h0, v0.h[1]
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixunshfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    mov h0, v0.h[1]
 ; CHECK-SD-NEXT:    bl __fixunshfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #48
 ; CHECK-SD-NEXT:    ret
 ;
@@ -7147,28 +7135,27 @@ define <3 x i128> @fptos_v3f16_v3i128(<3 x half> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -48
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h0, v0.h[1]
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixhfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    mov h0, v0.h[2]
+; CHECK-SD-NEXT:    mov h0, v0.h[1]
 ; CHECK-SD-NEXT:    bl __fixhfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    mov h0, v0.h[2]
 ; CHECK-SD-NEXT:    bl __fixhfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #64
 ; CHECK-SD-NEXT:    ret
 ;
@@ -7220,28 +7207,27 @@ define <3 x i128> @fptou_v3f16_v3i128(<3 x half> %a) {
 ; CHECK-SD-NEXT:    .cfi_offset w30, -48
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h0, v0.h[1]
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECK-SD-NEXT:    bl __fixunshfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x19, x0
 ; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    mov h0, v0.h[2]
+; CHECK-SD-NEXT:    mov h0, v0.h[1]
 ; CHECK-SD-NEXT:    bl __fixunshfti
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    mov x21, x0
 ; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    mov h0, v0.h[2]
 ; CHECK-SD-NEXT:    bl __fixunshfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x19
+; CHECK-SD-NEXT:    mov x1, x20
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
 ; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    add sp, sp, #64
 ; CHECK-SD-NEXT:    ret
 ;
@@ -8083,260 +8069,136 @@ entry:
 }
 
 define <2 x i128> @fptos_v2f128_v2i128(<2 x fp128> %a) {
-; CHECK-SD-LABEL: fptos_v2f128_v2i128:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #48
-; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT:    .cfi_offset w19, -8
-; CHECK-SD-NEXT:    .cfi_offset w20, -16
-; CHECK-SD-NEXT:    .cfi_offset w30, -32
-; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov v0.16b, v1.16b
-; CHECK-SD-NEXT:    bl __fixtfti
-; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x19, x0
-; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    bl __fixtfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    add sp, sp, #48
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptos_v2f128_v2i128:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #48
-; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT:    .cfi_offset w19, -8
-; CHECK-GI-NEXT:    .cfi_offset w20, -16
-; CHECK-GI-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __fixtfti
-; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    mov x20, x1
-; CHECK-GI-NEXT:    bl __fixtfti
-; CHECK-GI-NEXT:    mov x2, x0
-; CHECK-GI-NEXT:    mov x3, x1
-; CHECK-GI-NEXT:    mov x0, x19
-; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    add sp, sp, #48
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptos_v2f128_v2i128:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    bl __fixtfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    bl __fixtfti
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    mov x3, x1
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
 entry:
   %c = fptosi <2 x fp128> %a to <2 x i128>
   ret <2 x i128> %c
 }
 
 define <2 x i128> @fptou_v2f128_v2i128(<2 x fp128> %a) {
-; CHECK-SD-LABEL: fptou_v2f128_v2i128:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #48
-; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT:    .cfi_offset w19, -8
-; CHECK-SD-NEXT:    .cfi_offset w20, -16
-; CHECK-SD-NEXT:    .cfi_offset w30, -32
-; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov v0.16b, v1.16b
-; CHECK-SD-NEXT:    bl __fixunstfti
-; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x19, x0
-; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    bl __fixunstfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    add sp, sp, #48
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptou_v2f128_v2i128:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #48
-; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT:    .cfi_offset w19, -8
-; CHECK-GI-NEXT:    .cfi_offset w20, -16
-; CHECK-GI-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __fixunstfti
-; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    mov x20, x1
-; CHECK-GI-NEXT:    bl __fixunstfti
-; CHECK-GI-NEXT:    mov x2, x0
-; CHECK-GI-NEXT:    mov x3, x1
-; CHECK-GI-NEXT:    mov x0, x19
-; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    add sp, sp, #48
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptou_v2f128_v2i128:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    bl __fixunstfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    bl __fixunstfti
+; CHECK-NEXT:    mov x2, x0
+; CHECK-NEXT:    mov x3, x1
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
 entry:
   %c = fptoui <2 x fp128> %a to <2 x i128>
   ret <2 x i128> %c
 }
 
 define <3 x i128> @fptos_v3f128_v3i128(<3 x fp128> %a) {
-; CHECK-SD-LABEL: fptos_v3f128_v3i128:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #80
-; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-SD-NEXT:    .cfi_offset w19, -8
-; CHECK-SD-NEXT:    .cfi_offset w20, -16
-; CHECK-SD-NEXT:    .cfi_offset w21, -24
-; CHECK-SD-NEXT:    .cfi_offset w22, -32
-; CHECK-SD-NEXT:    .cfi_offset w30, -48
-; CHECK-SD-NEXT:    stp q2, q0, [sp] // 32-byte Folded Spill
-; CHECK-SD-NEXT:    mov v0.16b, v1.16b
-; CHECK-SD-NEXT:    bl __fixtfti
-; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x19, x0
-; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    bl __fixtfti
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x21, x0
-; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    bl __fixtfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    add sp, sp, #80
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptos_v3f128_v3i128:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #80
-; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-GI-NEXT:    .cfi_offset w19, -8
-; CHECK-GI-NEXT:    .cfi_offset w20, -16
-; CHECK-GI-NEXT:    .cfi_offset w21, -24
-; CHECK-GI-NEXT:    .cfi_offset w22, -32
-; CHECK-GI-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NEXT:    stp q1, q2, [sp] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    bl __fixtfti
-; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    mov x20, x1
-; CHECK-GI-NEXT:    bl __fixtfti
-; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x21, x0
-; CHECK-GI-NEXT:    mov x22, x1
-; CHECK-GI-NEXT:    bl __fixtfti
-; CHECK-GI-NEXT:    mov x4, x0
-; CHECK-GI-NEXT:    mov x5, x1
-; CHECK-GI-NEXT:    mov x0, x19
-; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    mov x2, x21
-; CHECK-GI-NEXT:    mov x3, x22
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    add sp, sp, #80
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptos_v3f128_v3i128:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    stp q1, q2, [sp] // 32-byte Folded Spill
+; CHECK-NEXT:    bl __fixtfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    bl __fixtfti
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x21, x0
+; CHECK-NEXT:    mov x22, x1
+; CHECK-NEXT:    bl __fixtfti
+; CHECK-NEXT:    mov x4, x0
+; CHECK-NEXT:    mov x5, x1
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
 entry:
   %c = fptosi <3 x fp128> %a to <3 x i128>
   ret <3 x i128> %c
 }
 
 define <3 x i128> @fptou_v3f128_v3i128(<3 x fp128> %a) {
-; CHECK-SD-LABEL: fptou_v3f128_v3i128:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #80
-; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-SD-NEXT:    .cfi_offset w19, -8
-; CHECK-SD-NEXT:    .cfi_offset w20, -16
-; CHECK-SD-NEXT:    .cfi_offset w21, -24
-; CHECK-SD-NEXT:    .cfi_offset w22, -32
-; CHECK-SD-NEXT:    .cfi_offset w30, -48
-; CHECK-SD-NEXT:    stp q2, q0, [sp] // 32-byte Folded Spill
-; CHECK-SD-NEXT:    mov v0.16b, v1.16b
-; CHECK-SD-NEXT:    bl __fixunstfti
-; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x19, x0
-; CHECK-SD-NEXT:    mov x20, x1
-; CHECK-SD-NEXT:    bl __fixunstfti
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov x21, x0
-; CHECK-SD-NEXT:    mov x22, x1
-; CHECK-SD-NEXT:    bl __fixunstfti
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov x2, x19
-; CHECK-SD-NEXT:    mov x3, x20
-; CHECK-SD-NEXT:    mov x4, x21
-; CHECK-SD-NEXT:    mov x5, x22
-; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    mov v0.d[1], x1
-; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    add sp, sp, #80
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptou_v3f128_v3i128:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #80
-; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-GI-NEXT:    .cfi_offset w19, -8
-; CHECK-GI-NEXT:    .cfi_offset w20, -16
-; CHECK-GI-NEXT:    .cfi_offset w21, -24
-; CHECK-GI-NEXT:    .cfi_offset w22, -32
-; CHECK-GI-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NEXT:    stp q1, q2, [sp] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    bl __fixunstfti
-; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    mov x20, x1
-; CHECK-GI-NEXT:    bl __fixunstfti
-; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov x21, x0
-; CHECK-GI-NEXT:    mov x22, x1
-; CHECK-GI-NEXT:    bl __fixunstfti
-; CHECK-GI-NEXT:    mov x4, x0
-; CHECK-GI-NEXT:    mov x5, x1
-; CHECK-GI-NEXT:    mov x0, x19
-; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    mov x2, x21
-; CHECK-GI-NEXT:    mov x3, x22
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    add sp, sp, #80
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptou_v3f128_v3i128:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    stp q1, q2, [sp] // 32-byte Folded Spill
+; CHECK-NEXT:    bl __fixunstfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    bl __fixunstfti
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x21, x0
+; CHECK-NEXT:    mov x22, x1
+; CHECK-NEXT:    bl __fixunstfti
+; CHECK-NEXT:    mov x4, x0
+; CHECK-NEXT:    mov x5, x1
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
 entry:
   %c = fptoui <3 x fp128> %a to <3 x i128>
   ret <3 x i128> %c
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index d620a8851ee44..4626fd7f2b3dd 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -819,47 +819,43 @@ define <2 x i100> @test_signed_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -56
 ; CHECK-NEXT:    .cfi_offset b10, -64
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s8, v0.s[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
-; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    fmov s10, w8
+; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    fcmp s0, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    fcmp s8, s8
+; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    csel x19, xzr, x8, vs
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x21, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s0, s10
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x2, xzr, x8, vs
+; CHECK-NEXT:    csel x3, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
     %x = call <2 x i100> @llvm.fptosi.sat.v2f32.v2i100(<2 x float> %f)
@@ -885,47 +881,43 @@ define <2 x i128> @test_signed_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -56
 ; CHECK-NEXT:    .cfi_offset b10, -64
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s8, v0.s[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
-; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    fmov s10, w8
+; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    fcmp s0, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    fcmp s8, s8
+; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    csel x19, xzr, x8, vs
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x21, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s0, s10
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x2, xzr, x8, vs
+; CHECK-NEXT:    csel x3, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
     %x = call <2 x i128> @llvm.fptosi.sat.v2f32.v2i128(<2 x float> %f)
@@ -1068,15 +1060,15 @@ define <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
 define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    str d10, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #40] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #56] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    str d10, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #24] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -1089,28 +1081,40 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -80
 ; CHECK-NEXT:    .cfi_offset b9, -88
 ; CHECK-NEXT:    .cfi_offset b10, -96
-; CHECK-NEXT:    mov s8, v0.s[1]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
-; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    mov x26, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, s9
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    fcmp s0, s10
+; CHECK-NEXT:    csel x9, x26, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    csel x19, xzr, x8, vs
+; CHECK-NEXT:    csel x20, xzr, x9, vs
+; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    csel x9, x26, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    csel x19, xzr, x8, vs
-; CHECK-NEXT:    csel x20, xzr, x9, vs
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    csel x21, xzr, x8, vs
+; CHECK-NEXT:    csel x22, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s0, s9
@@ -1122,48 +1126,32 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s0, s0
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    csel x21, xzr, x8, vs
-; CHECK-NEXT:    csel x22, xzr, x9, vs
+; CHECK-NEXT:    csel x23, xzr, x8, vs
+; CHECK-NEXT:    csel x24, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, x26, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    csel x23, xzr, x8, vs
-; CHECK-NEXT:    csel x24, xzr, x9, vs
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s0, s10
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x26, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x6, xzr, x8, vs
+; CHECK-NEXT:    csel x7, xzr, x9, vs
+; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
     %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f)
     ret <4 x i100> %x
@@ -1172,15 +1160,15 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    str d10, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #40] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #56] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    str d10, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #24] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -1193,28 +1181,40 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -80
 ; CHECK-NEXT:    .cfi_offset b9, -88
 ; CHECK-NEXT:    .cfi_offset b10, -96
-; CHECK-NEXT:    mov s8, v0.s[1]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
-; CHECK-NEXT:    mov x25, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x25, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x26, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, s9
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    fcmp s0, s10
+; CHECK-NEXT:    csel x9, x26, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp s0, s0
+; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    csel x19, xzr, x8, vs
+; CHECK-NEXT:    csel x20, xzr, x9, vs
+; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    csel x9, x26, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    csel x19, xzr, x8, vs
-; CHECK-NEXT:    csel x20, xzr, x9, vs
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    csel x21, xzr, x8, vs
+; CHECK-NEXT:    csel x22, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s0, s9
@@ -1226,48 +1226,32 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s0, s0
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    csel x21, xzr, x8, vs
-; CHECK-NEXT:    csel x22, xzr, x9, vs
+; CHECK-NEXT:    csel x23, xzr, x8, vs
+; CHECK-NEXT:    csel x24, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, x26, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    csel x23, xzr, x8, vs
-; CHECK-NEXT:    csel x24, xzr, x9, vs
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s0, s10
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x26, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x6, xzr, x8, vs
+; CHECK-NEXT:    csel x7, xzr, x9, vs
+; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
     %x = call <4 x i128> @llvm.fptosi.sat.v4f32.v4i128(<4 x float> %f)
     ret <4 x i128> %x
@@ -1465,48 +1449,44 @@ define <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -48
 ; CHECK-NEXT:    .cfi_offset b9, -56
 ; CHECK-NEXT:    .cfi_offset b10, -64
-; CHECK-NEXT:    mov d8, v0.d[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov d0, d8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    mov x8, #-4170333254945079296 // =0xc620000000000000
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
-; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
 ; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    mov x8, #5053038781909696511 // =0x461fffffffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
 ; CHECK-NEXT:    fmov d10, x8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    mov d8, v0.d[1]
+; CHECK-NEXT:    fcmp d0, d9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp d8, d10
+; CHECK-NEXT:    fcmp d0, d10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    fcmp d8, d8
+; CHECK-NEXT:    fcmp d0, d0
+; CHECK-NEXT:    fmov d0, d8
 ; CHECK-NEXT:    csel x19, xzr, x8, vs
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp d8, d9
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp d0, d9
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x21, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp d0, d10
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
+; CHECK-NEXT:    fcmp d8, d10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
-; CHECK-NEXT:    fcmp d0, d0
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp d8, d8
 ; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x2, xzr, x8, vs
+; CHECK-NEXT:    csel x3, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
     %x = call <2 x i100> @llvm.fptosi.sat.v2f64.v2i100(<2 x double> %f)
@@ -1531,48 +1511,44 @@ define <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -48
 ; CHECK-NEXT:    .cfi_offset b9, -56
 ; CHECK-NEXT:    .cfi_offset b10, -64
-; CHECK-NEXT:    mov d8, v0.d[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov d0, d8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    mov x8, #-4044232465378705408 // =0xc7e0000000000000
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
 ; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    mov x8, #5179139571476070399 // =0x47dfffffffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
 ; CHECK-NEXT:    fmov d10, x8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    mov d8, v0.d[1]
+; CHECK-NEXT:    fcmp d0, d9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp d8, d10
+; CHECK-NEXT:    fcmp d0, d10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    fcmp d8, d8
+; CHECK-NEXT:    fcmp d0, d0
+; CHECK-NEXT:    fmov d0, d8
 ; CHECK-NEXT:    csel x19, xzr, x8, vs
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp d8, d9
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp d0, d9
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x21, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp d0, d10
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
+; CHECK-NEXT:    fcmp d8, d10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
-; CHECK-NEXT:    fcmp d0, d0
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    fcmp d8, d8
 ; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x2, xzr, x8, vs
+; CHECK-NEXT:    csel x3, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
     %x = call <2 x i128> @llvm.fptosi.sat.v2f64.v2i128(<2 x double> %f)
@@ -1838,9 +1814,8 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -88
 ; CHECK-NEXT:    .cfi_offset b10, -96
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fcvt s8, h1
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
@@ -1849,7 +1824,7 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    mov x26, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
@@ -1864,7 +1839,7 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1878,6 +1853,7 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1890,30 +1866,27 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x26, x8, gt
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x26, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    csel x6, xzr, x8, vs
+; CHECK-NEXT:    csel x7, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
     %x = call <4 x i100> @llvm.fptosi.sat.v4f16.v4i100(<4 x half> %f)
@@ -1945,9 +1918,8 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -88
 ; CHECK-NEXT:    .cfi_offset b10, -96
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fcvt s8, h1
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
@@ -1956,7 +1928,7 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x25, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x26, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
@@ -1971,7 +1943,7 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1985,6 +1957,7 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1997,30 +1970,27 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
 ; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x26, x8, gt
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x9, x26, x9, gt
+; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    csel x6, xzr, x8, vs
+; CHECK-NEXT:    csel x7, xzr, x9, vs
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
     %x = call <4 x i128> @llvm.fptosi.sat.v4f16.v4i128(<4 x half> %f)
@@ -2044,47 +2014,17 @@ declare <8 x i128> @llvm.fptosi.sat.v8f16.v8i128(<8 x half>)
 define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v8f16_v8i1:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w9, s1
-; CHECK-CVT-NEXT:    fcvtzs w13, s0
-; CHECK-CVT-NEXT:    fcvtzs w8, s2
-; CHECK-CVT-NEXT:    mov s2, v1.s[2]
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    ands w8, w8, w8, asr #31
-; CHECK-CVT-NEXT:    fcvtzs w10, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s1
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT:    ands w9, w9, w9, asr #31
-; CHECK-CVT-NEXT:    csinv w9, w9, wzr, ge
-; CHECK-CVT-NEXT:    ands w10, w10, w10, asr #31
-; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    fcvtzs w14, s1
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fcvtzs w9, s0
-; CHECK-CVT-NEXT:    csinv w10, w10, wzr, ge
-; CHECK-CVT-NEXT:    ands w11, w11, w11, asr #31
-; CHECK-CVT-NEXT:    csinv w11, w11, wzr, ge
-; CHECK-CVT-NEXT:    ands w12, w12, w12, asr #31
-; CHECK-CVT-NEXT:    mov v1.s[1], w8
-; CHECK-CVT-NEXT:    csinv w12, w12, wzr, ge
-; CHECK-CVT-NEXT:    ands w13, w13, w13, asr #31
-; CHECK-CVT-NEXT:    csinv w13, w13, wzr, ge
-; CHECK-CVT-NEXT:    ands w8, w14, w14, asr #31
-; CHECK-CVT-NEXT:    mov v1.s[2], w10
-; CHECK-CVT-NEXT:    fmov s2, w13
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT:    mov v2.s[1], w12
-; CHECK-CVT-NEXT:    mov v1.s[3], w11
-; CHECK-CVT-NEXT:    mov v2.s[2], w8
-; CHECK-CVT-NEXT:    ands w8, w9, w9, asr #31
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-CVT-NEXT:    movi v3.2d, #0xffffffffffffffff
+; CHECK-CVT-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT:    smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    smax v1.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    smax v0.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -2104,65 +2044,17 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v8f16_v8i8:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT:    mov w8, #127 // =0x7f
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w11, #-128 // =0xffffff80
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w10, s1
-; CHECK-CVT-NEXT:    fcvtzs w15, s0
-; CHECK-CVT-NEXT:    fcvtzs w9, s2
-; CHECK-CVT-NEXT:    mov s2, v1.s[2]
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    cmp w9, #127
-; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    cmn w9, #128
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
-; CHECK-CVT-NEXT:    cmp w10, #127
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
-; CHECK-CVT-NEXT:    cmn w10, #128
-; CHECK-CVT-NEXT:    fcvtzs w16, s1
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w12, #127
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    cmn w12, #128
-; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
-; CHECK-CVT-NEXT:    cmp w13, #127
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    fcvtzs w9, s0
-; CHECK-CVT-NEXT:    cmn w13, #128
-; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
-; CHECK-CVT-NEXT:    cmp w14, #127
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
-; CHECK-CVT-NEXT:    cmn w14, #128
-; CHECK-CVT-NEXT:    mov v1.s[2], w12
-; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
-; CHECK-CVT-NEXT:    cmp w15, #127
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    cmn w15, #128
-; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
-; CHECK-CVT-NEXT:    cmp w16, #127
-; CHECK-CVT-NEXT:    mov v1.s[3], w13
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
-; CHECK-CVT-NEXT:    cmn w10, #128
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w9, #127
-; CHECK-CVT-NEXT:    mov v2.s[1], w14
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
-; CHECK-CVT-NEXT:    cmn w8, #128
-; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.4s, #127
+; CHECK-CVT-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT:    smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    mvni v1.4s, #127
+; CHECK-CVT-NEXT:    smax v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -2178,65 +2070,17 @@ define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v8f16_v8i13:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT:    mov w8, #4095 // =0xfff
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w11, #-4096 // =0xfffff000
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w10, s1
-; CHECK-CVT-NEXT:    fcvtzs w15, s0
-; CHECK-CVT-NEXT:    fcvtzs w9, s2
-; CHECK-CVT-NEXT:    mov s2, v1.s[2]
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    cmp w9, #4095
-; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    cmn w9, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
-; CHECK-CVT-NEXT:    cmp w10, #4095
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
-; CHECK-CVT-NEXT:    cmn w10, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    fcvtzs w16, s1
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w12, #4095
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    cmn w12, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
-; CHECK-CVT-NEXT:    cmp w13, #4095
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    fcvtzs w9, s0
-; CHECK-CVT-NEXT:    cmn w13, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
-; CHECK-CVT-NEXT:    cmp w14, #4095
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
-; CHECK-CVT-NEXT:    cmn w14, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    mov v1.s[2], w12
-; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
-; CHECK-CVT-NEXT:    cmp w15, #4095
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    cmn w15, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
-; CHECK-CVT-NEXT:    cmp w16, #4095
-; CHECK-CVT-NEXT:    mov v1.s[3], w13
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
-; CHECK-CVT-NEXT:    cmn w10, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w9, #4095
-; CHECK-CVT-NEXT:    mov v2.s[1], w14
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
-; CHECK-CVT-NEXT:    cmn w8, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.4s, #15, msl #8
+; CHECK-CVT-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT:    smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    mvni v1.4s, #15, msl #8
+; CHECK-CVT-NEXT:    smax v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i13:
@@ -2254,65 +2098,12 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 define <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v8f16_v8i16:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w11, #-32768 // =0xffff8000
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w10, s1
-; CHECK-CVT-NEXT:    fcvtzs w15, s0
-; CHECK-CVT-NEXT:    fcvtzs w9, s2
-; CHECK-CVT-NEXT:    mov s2, v1.s[2]
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtzs w16, s1
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
-; CHECK-CVT-NEXT:    cmp w13, w8
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    fcvtzs w9, s0
-; CHECK-CVT-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
-; CHECK-CVT-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v1.s[2], w12
-; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
-; CHECK-CVT-NEXT:    cmp w16, w8
-; CHECK-CVT-NEXT:    mov v1.s[3], w13
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    mov v2.s[1], w14
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
-; CHECK-CVT-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-CVT-NEXT:    sqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    fcvtzs v1.4s, v2.4s
+; CHECK-CVT-NEXT:    sqxtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i16:
@@ -3014,123 +2805,27 @@ define <16 x i16> @test_signed_v16f32_v16i16(<16 x float> %f) {
 define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v16f16_v16i8:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT:    mov w8, #127 // =0x7f
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
 ; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w10, s2
-; CHECK-CVT-NEXT:    fcvtzs w16, s1
-; CHECK-CVT-NEXT:    fcvtzs w9, s3
-; CHECK-CVT-NEXT:    mov s3, v2.s[2]
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    cmp w9, #127
-; CHECK-CVT-NEXT:    fcvtzs w12, s3
-; CHECK-CVT-NEXT:    mov s3, v1.s[1]
-; CHECK-CVT-NEXT:    csel w11, w9, w8, lt
-; CHECK-CVT-NEXT:    mov w9, #-128 // =0xffffff80
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
-; CHECK-CVT-NEXT:    cmn w11, #128
-; CHECK-CVT-NEXT:    mov s2, v1.s[2]
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w10, #127
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w15, s3
-; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT:    cmn w10, #128
+; CHECK-CVT-NEXT:    fcvtl2 v4.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    csel w13, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w12, #127
-; CHECK-CVT-NEXT:    fcvtzs w17, s1
-; CHECK-CVT-NEXT:    csel w10, w12, w8, lt
-; CHECK-CVT-NEXT:    cmn w10, #128
-; CHECK-CVT-NEXT:    mov s1, v3.s[2]
-; CHECK-CVT-NEXT:    fcvtzs w0, s3
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w14, #127
-; CHECK-CVT-NEXT:    fcvtzs w4, s0
-; CHECK-CVT-NEXT:    csel w12, w14, w8, lt
-; CHECK-CVT-NEXT:    cmn w12, #128
-; CHECK-CVT-NEXT:    csel w12, w12, w9, gt
-; CHECK-CVT-NEXT:    cmp w15, #127
-; CHECK-CVT-NEXT:    fcvtzs w1, s1
-; CHECK-CVT-NEXT:    csel w14, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w15, s2
-; CHECK-CVT-NEXT:    mov s2, v3.s[1]
-; CHECK-CVT-NEXT:    cmn w14, #128
-; CHECK-CVT-NEXT:    mov s1, v0.s[1]
-; CHECK-CVT-NEXT:    csel w14, w14, w9, gt
-; CHECK-CVT-NEXT:    cmp w16, #127
-; CHECK-CVT-NEXT:    csel w16, w16, w8, lt
-; CHECK-CVT-NEXT:    cmn w16, #128
-; CHECK-CVT-NEXT:    fcvtzs w18, s2
-; CHECK-CVT-NEXT:    mov s2, v3.s[3]
-; CHECK-CVT-NEXT:    csel w16, w16, w9, gt
-; CHECK-CVT-NEXT:    cmp w15, #127
-; CHECK-CVT-NEXT:    fcvtzs w3, s1
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    cmn w15, #128
-; CHECK-CVT-NEXT:    csel w15, w15, w9, gt
-; CHECK-CVT-NEXT:    cmp w17, #127
-; CHECK-CVT-NEXT:    fcvtzs w2, s2
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w13
-; CHECK-CVT-NEXT:    cmn w17, #128
-; CHECK-CVT-NEXT:    csel w17, w17, w9, gt
-; CHECK-CVT-NEXT:    cmp w18, #127
-; CHECK-CVT-NEXT:    csel w18, w18, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[1], w11
-; CHECK-CVT-NEXT:    cmn w18, #128
-; CHECK-CVT-NEXT:    csel w18, w18, w9, gt
-; CHECK-CVT-NEXT:    cmp w0, #127
-; CHECK-CVT-NEXT:    csel w0, w0, w8, lt
-; CHECK-CVT-NEXT:    cmn w0, #128
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    csel w0, w0, w9, gt
-; CHECK-CVT-NEXT:    cmp w1, #127
-; CHECK-CVT-NEXT:    csel w1, w1, w8, lt
-; CHECK-CVT-NEXT:    fmov s3, w0
-; CHECK-CVT-NEXT:    cmn w1, #128
-; CHECK-CVT-NEXT:    csel w1, w1, w9, gt
-; CHECK-CVT-NEXT:    cmp w2, #127
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    csel w2, w2, w8, lt
-; CHECK-CVT-NEXT:    mov v3.s[1], w18
-; CHECK-CVT-NEXT:    cmn w2, #128
-; CHECK-CVT-NEXT:    csel w2, w2, w9, gt
-; CHECK-CVT-NEXT:    cmp w3, #127
-; CHECK-CVT-NEXT:    csel w3, w3, w8, lt
-; CHECK-CVT-NEXT:    cmn w3, #128
-; CHECK-CVT-NEXT:    mov v3.s[2], w1
-; CHECK-CVT-NEXT:    csel w13, w3, w9, gt
-; CHECK-CVT-NEXT:    cmp w4, #127
-; CHECK-CVT-NEXT:    csel w3, w4, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w4, s1
-; CHECK-CVT-NEXT:    fmov s1, w16
-; CHECK-CVT-NEXT:    cmn w3, #128
-; CHECK-CVT-NEXT:    csel w11, w3, w9, gt
-; CHECK-CVT-NEXT:    mov v3.s[3], w2
-; CHECK-CVT-NEXT:    fmov s4, w11
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    cmp w4, #127
-; CHECK-CVT-NEXT:    mov v4.s[1], w13
-; CHECK-CVT-NEXT:    csel w13, w4, w8, lt
-; CHECK-CVT-NEXT:    cmn w13, #128
-; CHECK-CVT-NEXT:    mov v1.s[2], w15
-; CHECK-CVT-NEXT:    csel w10, w13, w9, gt
-; CHECK-CVT-NEXT:    cmp w11, #127
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v4.s[2], w10
-; CHECK-CVT-NEXT:    cmn w8, #128
-; CHECK-CVT-NEXT:    csel w8, w8, w9, gt
-; CHECK-CVT-NEXT:    mov v1.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
-; CHECK-CVT-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127
+; CHECK-CVT-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-CVT-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzs v4.4s, v4.4s
+; CHECK-CVT-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT:    smin v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    smin v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    mvni v2.4s, #127
+; CHECK-CVT-NEXT:    smax v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    smax v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-CVT-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v16f16_v16i8:
@@ -3147,122 +2842,18 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
 define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v16f16_v16i16:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w10, s2
-; CHECK-CVT-NEXT:    fcvtzs w16, s0
-; CHECK-CVT-NEXT:    fcvtzs w9, s3
-; CHECK-CVT-NEXT:    mov s3, v2.s[2]
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    fcvtzs w12, s3
-; CHECK-CVT-NEXT:    mov s3, v0.s[1]
-; CHECK-CVT-NEXT:    csel w11, w9, w8, lt
-; CHECK-CVT-NEXT:    mov w9, #-32768 // =0xffff8000
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
-; CHECK-CVT-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s2, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w15, s3
-; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    csel w13, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    fcvtzs w17, s0
-; CHECK-CVT-NEXT:    csel w10, w12, w8, lt
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s0, v3.s[2]
-; CHECK-CVT-NEXT:    fcvtzs w0, s3
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    fcvtzs w4, s1
-; CHECK-CVT-NEXT:    csel w12, w14, w8, lt
-; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w12, w12, w9, gt
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    fcvtzs w1, s0
-; CHECK-CVT-NEXT:    csel w14, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w15, s2
-; CHECK-CVT-NEXT:    mov s2, v3.s[1]
-; CHECK-CVT-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s0, v1.s[1]
-; CHECK-CVT-NEXT:    csel w14, w14, w9, gt
-; CHECK-CVT-NEXT:    cmp w16, w8
-; CHECK-CVT-NEXT:    csel w16, w16, w8, lt
-; CHECK-CVT-NEXT:    cmn w16, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtzs w18, s2
-; CHECK-CVT-NEXT:    mov s2, v3.s[3]
-; CHECK-CVT-NEXT:    csel w16, w16, w9, gt
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    fcvtzs w3, s0
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    mov s0, v1.s[2]
-; CHECK-CVT-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w15, w15, w9, gt
-; CHECK-CVT-NEXT:    cmp w17, w8
-; CHECK-CVT-NEXT:    fcvtzs w2, s2
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w13
-; CHECK-CVT-NEXT:    cmn w17, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w17, w17, w9, gt
-; CHECK-CVT-NEXT:    cmp w18, w8
-; CHECK-CVT-NEXT:    csel w18, w18, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[1], w11
-; CHECK-CVT-NEXT:    cmn w18, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w18, w18, w9, gt
-; CHECK-CVT-NEXT:    cmp w0, w8
-; CHECK-CVT-NEXT:    csel w0, w0, w8, lt
-; CHECK-CVT-NEXT:    cmn w0, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    csel w0, w0, w9, gt
-; CHECK-CVT-NEXT:    cmp w1, w8
-; CHECK-CVT-NEXT:    csel w1, w1, w8, lt
-; CHECK-CVT-NEXT:    fmov s3, w0
-; CHECK-CVT-NEXT:    cmn w1, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w1, w1, w9, gt
-; CHECK-CVT-NEXT:    cmp w2, w8
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    csel w2, w2, w8, lt
-; CHECK-CVT-NEXT:    mov v3.s[1], w18
-; CHECK-CVT-NEXT:    cmn w2, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w2, w2, w9, gt
-; CHECK-CVT-NEXT:    cmp w3, w8
-; CHECK-CVT-NEXT:    csel w3, w3, w8, lt
-; CHECK-CVT-NEXT:    cmn w3, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v3.s[2], w1
-; CHECK-CVT-NEXT:    csel w13, w3, w9, gt
-; CHECK-CVT-NEXT:    cmp w4, w8
-; CHECK-CVT-NEXT:    csel w3, w4, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w4, s0
-; CHECK-CVT-NEXT:    mov s0, v1.s[3]
-; CHECK-CVT-NEXT:    cmn w3, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fmov s1, w16
-; CHECK-CVT-NEXT:    csel w11, w3, w9, gt
-; CHECK-CVT-NEXT:    mov v3.s[3], w2
-; CHECK-CVT-NEXT:    fmov s4, w11
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    cmp w4, w8
-; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    mov v4.s[1], w13
-; CHECK-CVT-NEXT:    csel w13, w4, w8, lt
-; CHECK-CVT-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w10, w13, w9, gt
-; CHECK-CVT-NEXT:    mov v1.s[2], w15
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v4.s[2], w10
-; CHECK-CVT-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w8, w8, w9, gt
-; CHECK-CVT-NEXT:    mov v1.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzs v1.4s, v3.4s
+; CHECK-CVT-NEXT:    fcvtzs v3.4s, v5.4s
+; CHECK-CVT-NEXT:    sqxtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtzs v2.4s, v4.4s
+; CHECK-CVT-NEXT:    sqxtn v1.4h, v1.4s
+; CHECK-CVT-NEXT:    sqxtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT:    sqxtn2 v1.8h, v3.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v16f16_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 16e04070b6543..a3b94bcf18ab4 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -729,37 +729,33 @@ define <2 x i100> @test_unsigned_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s8, v0.s[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
-; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    mov x21, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    fcmp s0, s9
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    csel x19, x21, x9, gt
 ; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x20
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x1, x21, x9, gt
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    mov x1, x19
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x3, x21, x9, gt
 ; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x2, x8, xzr, le
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
     %x = call <2 x i100> @llvm.fptoui.sat.v2f32.v2i100(<2 x float> %f)
@@ -780,36 +776,32 @@ define <2 x i128> @test_unsigned_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s8, v0.s[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
-; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    mov s8, v0.s[1]
+; CHECK-NEXT:    fcmp s0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    fcmp s0, s9
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    csinv x19, x9, xzr, le
 ; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s0, s9
+; CHECK-NEXT:    csel x8, xzr, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csinv x1, x9, xzr, le
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    csinv x2, x9, xzr, le
+; CHECK-NEXT:    csinv x3, x8, xzr, le
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
     %x = call <2 x i128> @llvm.fptoui.sat.v2f32.v2i128(<2 x float> %f)
@@ -935,13 +927,13 @@ define <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
 define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x25, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -952,23 +944,32 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset w30, -64
 ; CHECK-NEXT:    .cfi_offset b8, -72
 ; CHECK-NEXT:    .cfi_offset b9, -80
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
+; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    mov s8, v0.s[1]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    csel x19, x25, x9, gt
+; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    csel x21, x25, x9, gt
+; CHECK-NEXT:    csinv x22, x8, xzr, le
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    csel x19, x25, x9, gt
-; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov s8, v0.s[1]
@@ -977,40 +978,27 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    csel x21, x25, x9, gt
-; CHECK-NEXT:    csinv x22, x8, xzr, le
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x23, x25, x9, gt
 ; CHECK-NEXT:    csinv x24, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x20
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    mov x4, x22
-; CHECK-NEXT:    mov x5, x21
-; CHECK-NEXT:    mov x6, x24
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    mov x7, x23
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, #0.0
+; CHECK-NEXT:    mov x2, x22
+; CHECK-NEXT:    mov x3, x21
+; CHECK-NEXT:    mov x4, x24
+; CHECK-NEXT:    mov x5, x23
+; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x1, x25, x9, gt
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ldp x30, x25, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    mov x1, x19
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x7, x25, x9, gt
+; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x6, x8, xzr, le
+; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
     %x = call <4 x i100> @llvm.fptoui.sat.v4f32.v4i100(<4 x float> %f)
     ret <4 x i100> %x
@@ -1019,13 +1007,13 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -1035,22 +1023,31 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset w30, -64
 ; CHECK-NEXT:    .cfi_offset b8, -72
 ; CHECK-NEXT:    .cfi_offset b9, -80
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
+; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    mov s8, v0.s[1]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    csel x8, xzr, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    csinv x19, x9, xzr, le
+; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov s9, w8
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    csinv x21, x9, xzr, le
+; CHECK-NEXT:    csinv x22, x8, xzr, le
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    csinv x19, x9, xzr, le
-; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov s8, v0.s[1]
@@ -1059,40 +1056,27 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    csinv x21, x9, xzr, le
-; CHECK-NEXT:    csinv x22, x8, xzr, le
+; CHECK-NEXT:    csinv x23, x9, xzr, le
+; CHECK-NEXT:    csinv x24, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csinv x23, x9, xzr, le
-; CHECK-NEXT:    csinv x24, x8, xzr, le
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csinv x1, x9, xzr, le
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x6, x9, xzr, le
+; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x7, x8, xzr, le
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
     %x = call <4 x i128> @llvm.fptoui.sat.v4f32.v4i128(<4 x float> %f)
     ret <4 x i128> %x
@@ -1261,37 +1245,33 @@ define <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    mov d8, v0.d[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov d0, d8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    bl __fixunsdfti
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x8, #5057542381537067007 // =0x462fffffffffffff
-; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    mov x21, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    fmov d9, x8
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    mov d8, v0.d[1]
+; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    fcmp d0, d9
+; CHECK-NEXT:    fmov d0, d8
 ; CHECK-NEXT:    csel x19, x21, x9, gt
 ; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x20
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp d0, d9
-; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x1, x21, x9, gt
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    mov x1, x19
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x3, x21, x9, gt
 ; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x2, x8, xzr, le
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
     %x = call <2 x i100> @llvm.fptoui.sat.v2f64.v2i100(<2 x double> %f)
@@ -1311,36 +1291,32 @@ define <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    mov d8, v0.d[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov d0, d8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    mov x8, #5183643171103440895 // =0x47efffffffffffff
-; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x8, #5183643171103440895 // =0x47efffffffffffff
 ; CHECK-NEXT:    fmov d9, x8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    mov d8, v0.d[1]
+; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    fcmp d0, d9
+; CHECK-NEXT:    fmov d0, d8
 ; CHECK-NEXT:    csinv x19, x9, xzr, le
 ; CHECK-NEXT:    csinv x20, x8, xzr, le
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    fcmp d0, #0.0
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp d0, d9
+; CHECK-NEXT:    csel x8, xzr, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csinv x1, x9, xzr, le
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    csinv x2, x9, xzr, le
+; CHECK-NEXT:    csinv x3, x8, xzr, le
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
     %x = call <2 x i128> @llvm.fptoui.sat.v2f64.v2i128(<2 x double> %f)
@@ -1570,7 +1546,7 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -72
 ; CHECK-NEXT:    .cfi_offset b9, -80
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[2]
+; CHECK-NEXT:    mov h1, v0.h[1]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    fcvt s8, h1
 ; CHECK-NEXT:    fmov s0, s8
@@ -1580,7 +1556,7 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
-; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1589,9 +1565,8 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    csinv x20, x9, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1600,8 +1575,9 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    csinv x22, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, #0.0
+; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1611,25 +1587,22 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov x2, x22
-; CHECK-NEXT:    mov x3, x21
-; CHECK-NEXT:    mov x4, x20
-; CHECK-NEXT:    mov x5, x19
-; CHECK-NEXT:    mov x6, x24
-; CHECK-NEXT:    mov x7, x23
+; CHECK-NEXT:    mov x2, x20
+; CHECK-NEXT:    mov x3, x19
+; CHECK-NEXT:    mov x4, x22
+; CHECK-NEXT:    mov x5, x21
 ; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov x0, x24
+; CHECK-NEXT:    mov x1, x23
 ; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x7, x25, x9, gt
 ; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x1, x25, x9, gt
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ldp x30, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    csinv x6, x8, xzr, le
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
     %x = call <4 x i100> @llvm.fptoui.sat.v4f16.v4i100(<4 x half> %f)
@@ -1656,16 +1629,15 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -72
 ; CHECK-NEXT:    .cfi_offset b9, -80
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fcvt s8, h1
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1676,7 +1648,7 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1685,8 +1657,9 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    csinv x22, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmp s8, #0.0
+; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -1696,25 +1669,22 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    mov x6, x23
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    csel x8, xzr, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csinv x1, x9, xzr, le
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    csinv x6, x9, xzr, le
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    csinv x7, x8, xzr, le
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
     %x = call <4 x i128> @llvm.fptoui.sat.v4f16.v4i128(<4 x half> %f)
@@ -1738,47 +1708,14 @@ declare <8 x i128> @llvm.fptoui.sat.v8f16.v8i128(<8 x half>)
 define <8 x i1> @test_unsigned_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i1:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w9, s1
-; CHECK-CVT-NEXT:    fcvtzu w13, s0
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w8, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w10, s3
-; CHECK-CVT-NEXT:    fcvtzu w11, s4
-; CHECK-CVT-NEXT:    fcvtzu w14, s1
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    cmp w8, #1
-; CHECK-CVT-NEXT:    fcvtzu w12, s2
-; CHECK-CVT-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-CVT-NEXT:    cmp w9, #1
-; CHECK-CVT-NEXT:    csinc w9, w9, wzr, lo
-; CHECK-CVT-NEXT:    cmp w10, #1
-; CHECK-CVT-NEXT:    csinc w10, w10, wzr, lo
-; CHECK-CVT-NEXT:    cmp w11, #1
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    csinc w11, w11, wzr, lo
-; CHECK-CVT-NEXT:    cmp w12, #1
-; CHECK-CVT-NEXT:    csinc w12, w12, wzr, lo
-; CHECK-CVT-NEXT:    cmp w13, #1
-; CHECK-CVT-NEXT:    csinc w13, w13, wzr, lo
-; CHECK-CVT-NEXT:    mov v1.s[1], w8
-; CHECK-CVT-NEXT:    cmp w14, #1
-; CHECK-CVT-NEXT:    fmov s2, w13
-; CHECK-CVT-NEXT:    fcvtzu w8, s0
-; CHECK-CVT-NEXT:    csinc w9, w14, wzr, lo
-; CHECK-CVT-NEXT:    mov v2.s[1], w12
-; CHECK-CVT-NEXT:    mov v1.s[2], w10
-; CHECK-CVT-NEXT:    cmp w8, #1
-; CHECK-CVT-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-CVT-NEXT:    mov v2.s[2], w9
-; CHECK-CVT-NEXT:    mov v1.s[3], w11
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -1796,48 +1733,14 @@ define <8 x i1> @test_unsigned_v8f16_v8i1(<8 x half> %f) {
 define <8 x i8> @test_unsigned_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i8:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w8, #255 // =0xff
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w14, s0
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w11, s3
-; CHECK-CVT-NEXT:    fcvtzu w12, s4
-; CHECK-CVT-NEXT:    fcvtzu w15, s1
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    cmp w9, #255
-; CHECK-CVT-NEXT:    fcvtzu w13, s2
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
-; CHECK-CVT-NEXT:    cmp w10, #255
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, #255
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w12, #255
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, #255
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, #255
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    cmp w15, #255
-; CHECK-CVT-NEXT:    fmov s2, w14
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[1], w13
-; CHECK-CVT-NEXT:    mov v1.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, #255
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w12
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -1853,48 +1756,14 @@ define <8 x i8> @test_unsigned_v8f16_v8i8(<8 x half> %f) {
 define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i13:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w8, #8191 // =0x1fff
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w14, s0
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w11, s3
-; CHECK-CVT-NEXT:    fcvtzu w12, s4
-; CHECK-CVT-NEXT:    fcvtzu w15, s1
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    fcvtzu w13, s2
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, w8
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    fmov s2, w14
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[1], w13
-; CHECK-CVT-NEXT:    mov v1.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w12
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    movi v1.4s, #31, msl #8
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT:    umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i13:
@@ -1910,48 +1779,12 @@ define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
 define <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i16:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w14, s0
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w11, s3
-; CHECK-CVT-NEXT:    fcvtzu w12, s4
-; CHECK-CVT-NEXT:    fcvtzu w15, s1
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    fcvtzu w13, s2
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, w8
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    mov v1.s[1], w9
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    fmov s2, w14
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[1], w13
-; CHECK-CVT-NEXT:    mov v1.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[2], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w12
-; CHECK-CVT-NEXT:    mov v2.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v2.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i16:
@@ -2509,12 +2342,10 @@ define <16 x i8> @test_unsigned_v16f32_v16i8(<16 x float> %f) {
 define <8 x i16> @test_unsigned_v8f32_v8i16(<8 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    uqxtn v0.4h, v0.4s
+; CHECK-NEXT:    uqxtn2 v0.8h, v1.4s
 ; CHECK-NEXT:    ret
     %x = call <8 x i16> @llvm.fptoui.sat.v8f32.v8i16(<8 x float> %f)
     ret <8 x i16> %x
@@ -2523,17 +2354,14 @@ define <8 x i16> @test_unsigned_v8f32_v8i16(<8 x float> %f) {
 define <16 x i16> @test_unsigned_v16f32_v16i16(<16 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v16f32_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v4.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    fcvtzu v3.4s, v3.4s
 ; CHECK-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-NEXT:    umin v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    umin v3.4s, v3.4s, v4.4s
-; CHECK-NEXT:    umin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-NEXT:    fcvtzu v4.4s, v1.4s
+; CHECK-NEXT:    uqxtn v0.4h, v0.4s
+; CHECK-NEXT:    uqxtn v1.4h, v2.4s
+; CHECK-NEXT:    fcvtzu v2.4s, v3.4s
+; CHECK-NEXT:    uqxtn2 v0.8h, v4.4s
+; CHECK-NEXT:    uqxtn2 v1.8h, v2.4s
 ; CHECK-NEXT:    ret
     %x = call <16 x i16> @llvm.fptoui.sat.v16f32.v16i16(<16 x float> %f)
     ret <16 x i16> %x
@@ -2544,100 +2372,30 @@ define <16 x i16> @test_unsigned_v16f32_v16i16(<16 x float> %f) {
 define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i8:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
 ; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    mov w8, #255 // =0xff
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    mov s5, v2.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w10, s2
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtzu w13, s1
+; CHECK-CVT-NEXT:    fcvtl2 v4.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    fcvtzu w9, s3
-; CHECK-CVT-NEXT:    mov s3, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w11, s4
-; CHECK-CVT-NEXT:    mov s4, v1.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w12, s5
-; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w18, s2
-; CHECK-CVT-NEXT:    fcvtzu w3, s0
-; CHECK-CVT-NEXT:    fcvtzu w14, s3
-; CHECK-CVT-NEXT:    cmp w9, #255
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
-; CHECK-CVT-NEXT:    cmp w10, #255
-; CHECK-CVT-NEXT:    fcvtzu w15, s4
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, #255
-; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w12, #255
-; CHECK-CVT-NEXT:    fcvtzu w16, s1
-; CHECK-CVT-NEXT:    mov s1, v2.s[3]
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, #255
-; CHECK-CVT-NEXT:    fcvtzu w17, s3
-; CHECK-CVT-NEXT:    mov s3, v0.s[1]
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, #255
-; CHECK-CVT-NEXT:    fcvtzu w0, s4
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w15, #255
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lo
-; CHECK-CVT-NEXT:    cmp w16, #255
-; CHECK-CVT-NEXT:    fcvtzu w1, s1
-; CHECK-CVT-NEXT:    csel w16, w16, w8, lo
-; CHECK-CVT-NEXT:    cmp w17, #255
-; CHECK-CVT-NEXT:    fcvtzu w2, s3
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, #255
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    csel w18, w18, w8, lo
-; CHECK-CVT-NEXT:    cmp w0, #255
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    csel w0, w0, w8, lo
-; CHECK-CVT-NEXT:    cmp w1, #255
-; CHECK-CVT-NEXT:    fmov s3, w18
-; CHECK-CVT-NEXT:    csel w10, w1, w8, lo
-; CHECK-CVT-NEXT:    cmp w2, #255
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w9, w2, w8, lo
-; CHECK-CVT-NEXT:    cmp w3, #255
-; CHECK-CVT-NEXT:    fcvtzu w2, s1
-; CHECK-CVT-NEXT:    csel w1, w3, w8, lo
-; CHECK-CVT-NEXT:    fmov s1, w13
-; CHECK-CVT-NEXT:    mov v3.s[1], w17
-; CHECK-CVT-NEXT:    fmov s4, w1
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    cmp w2, #255
-; CHECK-CVT-NEXT:    mov v4.s[1], w9
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    csel w11, w2, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[2], w0
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v1.s[2], w15
-; CHECK-CVT-NEXT:    mov v4.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, #255
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[3], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w16
-; CHECK-CVT-NEXT:    mov v4.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
-; CHECK-CVT-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-CVT-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-CVT-NEXT:    fcvtzu v3.4s, v3.4s
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtzu v4.4s, v4.4s
+; CHECK-CVT-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT:    umin v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    umin v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-CVT-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i8:
 ; CHECK-FP16:       // %bb.0:
-; CHECK-FP16-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-FP16-NEXT:    fcvtzu v1.8h, v1.8h
 ; CHECK-FP16-NEXT:    fcvtzu v0.8h, v0.8h
-; CHECK-FP16-NEXT:    umin v1.8h, v1.8h, v2.8h
-; CHECK-FP16-NEXT:    umin v0.8h, v0.8h, v2.8h
-; CHECK-FP16-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-FP16-NEXT:    fcvtzu v1.8h, v1.8h
+; CHECK-FP16-NEXT:    uqxtn v0.8b, v0.8h
+; CHECK-FP16-NEXT:    uqxtn2 v0.16b, v1.8h
 ; CHECK-FP16-NEXT:    ret
     %x = call <16 x i8> @llvm.fptoui.sat.v16f16.v16i8(<16 x half> %f)
     ret <16 x i8> %x
@@ -2646,89 +2404,18 @@ define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
 define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i16:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    mov s5, v2.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w10, s2
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT:    fcvtzu w13, s0
-; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    fcvtzu w9, s3
-; CHECK-CVT-NEXT:    mov s3, v0.s[1]
-; CHECK-CVT-NEXT:    fcvtzu w11, s4
-; CHECK-CVT-NEXT:    mov s4, v0.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w12, s5
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w18, s2
-; CHECK-CVT-NEXT:    fcvtzu w3, s1
-; CHECK-CVT-NEXT:    fcvtzu w14, s3
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    fcvtzu w15, s4
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    fcvtzu w16, s0
-; CHECK-CVT-NEXT:    mov s0, v2.s[3]
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    fcvtzu w17, s3
-; CHECK-CVT-NEXT:    mov s3, v1.s[1]
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, w8
-; CHECK-CVT-NEXT:    fcvtzu w0, s4
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    csel w15, w15, w8, lo
-; CHECK-CVT-NEXT:    cmp w16, w8
-; CHECK-CVT-NEXT:    fcvtzu w1, s0
-; CHECK-CVT-NEXT:    csel w16, w16, w8, lo
-; CHECK-CVT-NEXT:    cmp w17, w8
-; CHECK-CVT-NEXT:    fcvtzu w2, s3
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, w8
-; CHECK-CVT-NEXT:    mov s0, v1.s[2]
-; CHECK-CVT-NEXT:    csel w18, w18, w8, lo
-; CHECK-CVT-NEXT:    cmp w0, w8
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    csel w0, w0, w8, lo
-; CHECK-CVT-NEXT:    cmp w1, w8
-; CHECK-CVT-NEXT:    fmov s3, w18
-; CHECK-CVT-NEXT:    csel w10, w1, w8, lo
-; CHECK-CVT-NEXT:    cmp w2, w8
-; CHECK-CVT-NEXT:    csel w9, w2, w8, lo
-; CHECK-CVT-NEXT:    cmp w3, w8
-; CHECK-CVT-NEXT:    fcvtzu w2, s0
-; CHECK-CVT-NEXT:    csel w1, w3, w8, lo
-; CHECK-CVT-NEXT:    mov s0, v1.s[3]
-; CHECK-CVT-NEXT:    fmov s1, w13
-; CHECK-CVT-NEXT:    fmov s4, w1
-; CHECK-CVT-NEXT:    mov v3.s[1], w17
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    cmp w2, w8
-; CHECK-CVT-NEXT:    mov v4.s[1], w9
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    csel w11, w2, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[2], w0
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v1.s[2], w15
-; CHECK-CVT-NEXT:    mov v4.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[3], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w16
-; CHECK-CVT-NEXT:    mov v4.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT:    fcvtzu v1.4s, v3.4s
+; CHECK-CVT-NEXT:    fcvtzu v3.4s, v5.4s
+; CHECK-CVT-NEXT:    uqxtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtzu v2.4s, v4.4s
+; CHECK-CVT-NEXT:    uqxtn v1.4h, v1.4s
+; CHECK-CVT-NEXT:    uqxtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT:    uqxtn2 v1.8h, v3.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll b/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll
new file mode 100644
index 0000000000000..f20ea4db7baeb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll
@@ -0,0 +1,133 @@
+; RUN: rm -rf %t && split-file %s %t
+
+;--- err1.ll
+
+; RUN: not --crash llc %t/err1.ll -mtriple=aarch64-elf \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -code-model=large \
+; RUN:   -o - -verify-machineinstrs 2>&1 | FileCheck %s --check-prefix=ERR1
+
+; RUN: not --crash llc %t/err1.ll -mtriple=aarch64-elf \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -global-isel -global-isel-abort=1 \
+; RUN:   -code-model=large \
+; RUN:   -o - -verify-machineinstrs 2>&1 | FileCheck %s --check-prefix=ERR1
+
+; ERR1: LLVM ERROR: Unsupported code-model for hardened jump-table
+define i32 @test_jumptable(i32 %in) "aarch64-jump-table-hardening" {
+
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+  ]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+}
+
+;--- test.ll
+
+; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -global-isel -global-isel-abort=1 \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -code-model=large \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -global-isel -global-isel-abort=1 \
+; RUN:   -code-model=large \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc %t/test.ll -mtriple=aarch64-elf \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=ELF
+
+; RUN: llc %t/test.ll -mtriple=aarch64-elf \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -global-isel -global-isel-abort=1 \
+; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=ELF
+
+; MACHO-LABEL: test_jumptable:
+; MACHO:        mov   w16, w0
+; MACHO:        cmp   x16, #5
+; MACHO:        csel  x16, x16, xzr, ls
+; MACHO-NEXT:   adrp  x17, LJTI0_0@PAGE
+; MACHO-NEXT:   add   x17, x17, LJTI0_0@PAGEOFF
+; MACHO-NEXT:   ldrsw x16, [x17, x16, lsl #2]
+; MACHO-NEXT:  Ltmp0:
+; MACHO-NEXT:   adr   x17, Ltmp0
+; MACHO-NEXT:   add   x16, x17, x16
+; MACHO-NEXT:   br    x16
+
+; ELF-LABEL: test_jumptable:
+; ELF:        mov   w16, w0
+; ELF:        cmp   x16, #5
+; ELF:        csel  x16, x16, xzr, ls
+; ELF-NEXT:   adrp  x17, .LJTI0_0
+; ELF-NEXT:   add   x17, x17, :lo12:.LJTI0_0
+; ELF-NEXT:   ldrsw x16, [x17, x16, lsl #2]
+; ELF-NEXT:  .Ltmp0:
+; ELF-NEXT:   adr   x17, .Ltmp0
+; ELF-NEXT:   add   x16, x17, x16
+; ELF-NEXT:   br    x16
+
+define i32 @test_jumptable(i32 %in) "aarch64-jump-table-hardening" {
+
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+    i32 5, label %lbl5
+  ]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+lbl5:
+  ret i32 10
+
+}
+
+; MACHO-LABEL: LJTI0_0:
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
+
+; ELF-LABEL: .LJTI0_0:
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
+; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
diff --git a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
index 3281a98767795..08f6bb6f28532 100644
--- a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -178,7 +178,132 @@ define void @test_tan(float %float, double %double, fp128 %fp128) {
    store fp128 %tanfp128, ptr @varfp128
 ; CHECK: bl tanl
   ret void
+}
+
+declare float @llvm.acos.f32(float)
+declare double @llvm.acos.f64(double)
+declare fp128 @llvm.acos.f128(fp128)
+
+define void @test_acos(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_acos:
+
+   %acosfloat = call float @llvm.acos.f32(float %float)
+   store float %acosfloat, ptr @varfloat
+; CHECK: bl acosf
+
+   %acosdouble = call double @llvm.acos.f64(double %double)
+   store double %acosdouble, ptr @vardouble
+; CHECK: bl acos
+
+   %acosfp128 = call fp128 @llvm.acos.f128(fp128 %fp128)
+   store fp128 %acosfp128, ptr @varfp128
+; CHECK: bl acosl
+  ret void
+}
+
+declare float @llvm.asin.f32(float)
+declare double @llvm.asin.f64(double)
+declare fp128 @llvm.asin.f128(fp128)
+
+define void @test_asin(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_asin:
+
+   %asinfloat = call float @llvm.asin.f32(float %float)
+   store float %asinfloat, ptr @varfloat
+; CHECK: bl asinf
 
+   %asindouble = call double @llvm.asin.f64(double %double)
+   store double %asindouble, ptr @vardouble
+; CHECK: bl asin
+
+   %asinfp128 = call fp128 @llvm.asin.f128(fp128 %fp128)
+   store fp128 %asinfp128, ptr @varfp128
+; CHECK: bl asinl
+  ret void
+}
+
+declare float @llvm.atan.f32(float)
+declare double @llvm.atan.f64(double)
+declare fp128 @llvm.atan.f128(fp128)
+
+define void @test_atan(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_atan:
+
+   %atanfloat = call float @llvm.atan.f32(float %float)
+   store float %atanfloat, ptr @varfloat
+; CHECK: bl atanf
+
+   %atandouble = call double @llvm.atan.f64(double %double)
+   store double %atandouble, ptr @vardouble
+; CHECK: bl atan
+
+   %atanfp128 = call fp128 @llvm.atan.f128(fp128 %fp128)
+   store fp128 %atanfp128, ptr @varfp128
+; CHECK: bl atanl
+  ret void
+}
+
+declare float @llvm.cosh.f32(float)
+declare double @llvm.cosh.f64(double)
+declare fp128 @llvm.cosh.f128(fp128)
+
+define void @test_cosh(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_cosh:
+
+   %coshfloat = call float @llvm.cosh.f32(float %float)
+   store float %coshfloat, ptr @varfloat
+; CHECK: bl coshf
+
+   %coshdouble = call double @llvm.cosh.f64(double %double)
+   store double %coshdouble, ptr @vardouble
+; CHECK: bl cosh
+
+   %coshfp128 = call fp128 @llvm.cosh.f128(fp128 %fp128)
+   store fp128 %coshfp128, ptr @varfp128
+; CHECK: bl coshl
+  ret void
+}
+
+declare float @llvm.sinh.f32(float)
+declare double @llvm.sinh.f64(double)
+declare fp128 @llvm.sinh.f128(fp128)
+
+define void @test_sinh(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_sinh:
+
+   %sinhfloat = call float @llvm.sinh.f32(float %float)
+   store float %sinhfloat, ptr @varfloat
+; CHECK: bl sinhf
+
+   %sinhdouble = call double @llvm.sinh.f64(double %double)
+   store double %sinhdouble, ptr @vardouble
+; CHECK: bl sinh
+
+   %sinhfp128 = call fp128 @llvm.sinh.f128(fp128 %fp128)
+   store fp128 %sinhfp128, ptr @varfp128
+; CHECK: bl sinhl
+  ret void
+}
+
+declare float @llvm.tanh.f32(float)
+declare double @llvm.tanh.f64(double)
+declare fp128 @llvm.tanh.f128(fp128)
+
+define void @test_tanh(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_tanh:
+
+   %tanhfloat = call float @llvm.tanh.f32(float %float)
+   store float %tanhfloat, ptr @varfloat
+; CHECK: bl tanhf
+
+   %tanhdouble = call double @llvm.tanh.f64(double %double)
+   store double %tanhdouble, ptr @vardouble
+; CHECK: bl tanh
+
+   %tanhfp128 = call fp128 @llvm.tanh.f128(fp128 %fp128)
+   store fp128 %tanhfp128, ptr @varfp128
+; CHECK: bl tanhl
+  ret void
 }
 
 declare float @llvm.pow.f32(float, float)
diff --git a/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
index f2b5e5f0064b7..f3623928d67bf 100644
--- a/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
+++ b/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
@@ -16,7 +16,7 @@ entry:
 ; CHECK:        58000040                                         ldr    x0, 0x10
 ; CHECK:        d65f03c0                                         ret
 ; Make sure the constant pool entry comes after the return
-; CHECK-LABEL:        <$d.1>:
+; CHECK-LABEL:        <$d>:
 define i32 @bar() nounwind {
 entry:
   %0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index cdf2a962f9322..66d670d0b796b 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -319,8 +319,9 @@ define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
 define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: ctz_and_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z1.b
 ; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    cmpne p2.b, p1/z, z0.b, z1.b
+; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
 ; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cntp x0, p0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll
index f743bae84053d..18364bdecee02 100644
--- a/llvm/test/CodeGen/AArch64/neon-abd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-abd.ll
@@ -332,8 +332,6 @@ define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) #0 {
 ; CHECK-NEXT:    ngc x9, xzr
 ; CHECK-NEXT:    subs x10, x10, x11
 ; CHECK-NEXT:    ngc x11, xzr
-; CHECK-NEXT:    asr x9, x9, #63
-; CHECK-NEXT:    asr x11, x11, #63
 ; CHECK-NEXT:    eor x8, x8, x9
 ; CHECK-NEXT:    eor x10, x10, x11
 ; CHECK-NEXT:    sub x8, x8, x9
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index 170ba7292ae60..ff80ff097b28f 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -109,29 +109,11 @@ define <4 x i32> @movi4s_lsl16() {
 }
 
 define <4 x i32> @movi4s_fneg() {
-; CHECK-NOFP16-SD-LABEL: movi4s_fneg:
-; CHECK-NOFP16-SD:       // %bb.0:
-; CHECK-NOFP16-SD-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-NOFP16-SD-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NOFP16-SD-NEXT:    ret
-;
-; CHECK-FP16-SD-LABEL: movi4s_fneg:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-FP16-SD-NEXT:    fneg v0.4s, v0.4s
-; CHECK-FP16-SD-NEXT:    ret
-;
-; CHECK-NOFP16-GI-LABEL: movi4s_fneg:
-; CHECK-NOFP16-GI:       // %bb.0:
-; CHECK-NOFP16-GI-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-NOFP16-GI-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: movi4s_fneg:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-FP16-GI-NEXT:    fneg v0.4s, v0.4s
-; CHECK-FP16-GI-NEXT:    ret
+; CHECK-LABEL: movi4s_fneg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #240, lsl #8
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    ret
    ret <4 x i32> <i32 2147545088, i32 2147545088, i32 2147545088, i32 2147545088>
 }
 
@@ -308,23 +290,17 @@ define <8 x i16> @mvni8h_neg() {
 ; CHECK-NOFP16-SD-NEXT:    dup v0.8h, w8
 ; CHECK-NOFP16-SD-NEXT:    ret
 ;
-; CHECK-FP16-SD-LABEL: mvni8h_neg:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.8h, #240
-; CHECK-FP16-SD-NEXT:    fneg v0.8h, v0.8h
-; CHECK-FP16-SD-NEXT:    ret
+; CHECK-FP16-LABEL: mvni8h_neg:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    movi v0.8h, #240
+; CHECK-FP16-NEXT:    fneg v0.8h, v0.8h
+; CHECK-FP16-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: mvni8h_neg:
 ; CHECK-NOFP16-GI:       // %bb.0:
 ; CHECK-NOFP16-GI-NEXT:    adrp x8, .LCPI32_0
 ; CHECK-NOFP16-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI32_0]
 ; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: mvni8h_neg:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.8h, #240
-; CHECK-FP16-GI-NEXT:    fneg v0.8h, v0.8h
-; CHECK-FP16-GI-NEXT:    ret
    ret <8 x i16> <i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008>
 }
 
@@ -494,29 +470,11 @@ define <2 x double> @fmov2d() {
 }
 
 define <2 x double> @fmov2d_neg0() {
-; CHECK-NOFP16-SD-LABEL: fmov2d_neg0:
-; CHECK-NOFP16-SD:       // %bb.0:
-; CHECK-NOFP16-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOFP16-SD-NEXT:    fneg v0.2d, v0.2d
-; CHECK-NOFP16-SD-NEXT:    ret
-;
-; CHECK-FP16-SD-LABEL: fmov2d_neg0:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-FP16-SD-NEXT:    fneg v0.2d, v0.2d
-; CHECK-FP16-SD-NEXT:    ret
-;
-; CHECK-NOFP16-GI-LABEL: fmov2d_neg0:
-; CHECK-NOFP16-GI:       // %bb.0:
-; CHECK-NOFP16-GI-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOFP16-GI-NEXT:    fneg v0.2d, v0.2d
-; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: fmov2d_neg0:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-FP16-GI-NEXT:    fneg v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT:    ret
+; CHECK-LABEL: fmov2d_neg0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fneg v0.2d, v0.2d
+; CHECK-NEXT:    ret
 	ret <2 x double> <double -0.0, double -0.0>
 }
 
@@ -581,5 +539,4 @@ define <2 x i32> @movi1d() {
   ret <2 x i32> %1
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-FP16: {{.*}}
 ; CHECK-NOFP16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index e93fb40897078..ed84ec2ba9eae 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -473,52 +473,37 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) {
 define <4 x i65> @test_ldnp_v4i65(ptr %A) {
 ; CHECK-LABEL: test_ldnp_v4i65:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0, #8]
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr x10, [x0, #24]
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
 ; CHECK-NEXT:    ldrb w11, [x0, #32]
-; CHECK-NEXT:    and x1, x8, #0x1
-; CHECK-NEXT:    extr x2, x9, x8, #1
-; CHECK-NEXT:    extr x4, x10, x9, #2
-; CHECK-NEXT:    mov.d v0[1], x1
-; CHECK-NEXT:    extr x6, x11, x10, #3
-; CHECK-NEXT:    ubfx x3, x9, #1, #1
-; CHECK-NEXT:    ubfx x5, x10, #2, #1
+; CHECK-NEXT:    ldp x0, x10, [x0]
 ; CHECK-NEXT:    ubfx x7, x11, #3, #1
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    extr x4, x9, x8, #2
+; CHECK-NEXT:    extr x6, x11, x9, #3
+; CHECK-NEXT:    ubfx x3, x8, #1, #1
+; CHECK-NEXT:    extr x2, x8, x10, #1
+; CHECK-NEXT:    ubfx x5, x9, #2, #1
+; CHECK-NEXT:    and x1, x10, #0x1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i65:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ldp x10, x9, [x0]
-; CHECK-BE-NEXT:    ldrb w8, [x0, #32]
-; CHECK-BE-NEXT:    ldp x12, x11, [x0, #16]
-; CHECK-BE-NEXT:    lsr x13, x10, #56
-; CHECK-BE-NEXT:    orr x7, x8, x11, lsl #8
-; CHECK-BE-NEXT:    extr x8, x10, x9, #56
-; CHECK-BE-NEXT:    extr x11, x12, x11, #56
-; CHECK-BE-NEXT:    lsr x14, x12, #56
-; CHECK-BE-NEXT:    extr x15, x9, x12, #56
-; CHECK-BE-NEXT:    lsr x10, x10, #59
-; CHECK-BE-NEXT:    extr x1, x13, x8, #3
-; CHECK-BE-NEXT:    lsr x8, x9, #56
-; CHECK-BE-NEXT:    ubfx x12, x12, #57, #1
-; CHECK-BE-NEXT:    ubfx x9, x9, #58, #1
-; CHECK-BE-NEXT:    extr x5, x14, x11, #1
-; CHECK-BE-NEXT:    and x11, x11, #0x1
-; CHECK-BE-NEXT:    fmov d0, x10
-; CHECK-BE-NEXT:    fmov d2, x12
-; CHECK-BE-NEXT:    fmov d3, x11
-; CHECK-BE-NEXT:    fmov d1, x9
-; CHECK-BE-NEXT:    extr x3, x8, x15, #2
-; CHECK-BE-NEXT:    mov v0.d[1], x1
-; CHECK-BE-NEXT:    mov v2.d[1], x5
-; CHECK-BE-NEXT:    mov v3.d[1], x7
-; CHECK-BE-NEXT:    mov v1.d[1], x3
-; CHECK-BE-NEXT:    fmov x0, d0
-; CHECK-BE-NEXT:    fmov x4, d2
-; CHECK-BE-NEXT:    fmov x6, d3
-; CHECK-BE-NEXT:    fmov x2, d1
+; CHECK-BE-NEXT:    ldp x9, x8, [x0]
+; CHECK-BE-NEXT:    ldrb w12, [x0, #32]
+; CHECK-BE-NEXT:    ldp x10, x11, [x0, #16]
+; CHECK-BE-NEXT:    extr x13, x9, x8, #56
+; CHECK-BE-NEXT:    lsr x14, x9, #56
+; CHECK-BE-NEXT:    lsr x16, x8, #56
+; CHECK-BE-NEXT:    extr x15, x8, x10, #56
+; CHECK-BE-NEXT:    orr x7, x12, x11, lsl #8
+; CHECK-BE-NEXT:    extr x11, x10, x11, #56
+; CHECK-BE-NEXT:    lsr x12, x10, #56
+; CHECK-BE-NEXT:    extr x1, x14, x13, #3
+; CHECK-BE-NEXT:    lsr x0, x9, #59
+; CHECK-BE-NEXT:    ubfx x2, x8, #58, #1
+; CHECK-BE-NEXT:    ubfx x4, x10, #57, #1
+; CHECK-BE-NEXT:    extr x3, x16, x15, #2
+; CHECK-BE-NEXT:    extr x5, x12, x11, #1
+; CHECK-BE-NEXT:    and x6, x11, #0x1
 ; CHECK-BE-NEXT:    ret
   %lv = load <4 x i65>, ptr %A, align 8, !nontemporal !0
   ret <4 x i65> %lv
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll
new file mode 100644
index 0000000000000..3f92943b11eb1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) {
+; CHECK-LABEL: peephole_csel:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w2, #0x1
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    csel x9, xzr, xzr, eq
+; CHECK-NEXT:    tst w1, #0x1
+; CHECK-NEXT:    csel x8, x8, x9, eq
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+entry:
+  br i1 %0, label %then, label %exit
+
+then:                                             ; preds = %entry
+  ; The donothing() is needed to make make this block less interesting to
+  ; SimplifyCFG. Otherwise we may not get the csel that we want to test.
+  call void @llvm.donothing()
+  br i1 %cmp, label %true, label %exit
+
+true:                                             ; preds = %then
+  ; Same as above
+  call void @llvm.donothing()
+  br label %exit
+
+exit: ; preds = %true, %then, %entry
+  %x = phi i64 [ 0, %true ], [ 0, %then ], [ 1, %entry ]
+  store i64 %x, ptr %dst, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.mir b/llvm/test/CodeGen/AArch64/peephole-csel.mir
new file mode 100644
index 0000000000000..5077441a33788
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.mir
@@ -0,0 +1,112 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=aarch64-unknown-linux -run-pass=aarch64-mi-peephole-opt -verify-machineinstrs | FileCheck %s
+
+---
+name:            peephole_cselxr_same
+registers:
+  - { id: 1, class: gpr64, preferred-register: '' }
+  - { id: 2, class: gpr64, preferred-register: '' }
+liveins:
+  - { reg: '$x0', virtual-reg: '%1' }
+  - { reg: '$x1', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: peephole_cselxr_same
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-NEXT: $xzr = ANDSXri [[COPY]], 0, implicit-def $nzcv
+    ; CHECK-NEXT: [[CSELXr:%[0-9]+]]:gpr64 = CSELXr [[COPY1]], [[COPY1]], 0, implicit $nzcv
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:gpr64 = COPY $x1
+    %4:gpr64 = COPY $x0
+    $xzr = ANDSXri %3, 0, implicit-def $nzcv
+    %5:gpr64 = CSELXr %4, %4, 0, implicit $nzcv
+    RET_ReallyLR
+
+...
+---
+name:            peephole_cselwr_same
+registers:
+  - { id: 1, class: gpr32, preferred-register: '' }
+  - { id: 2, class: gpr32, preferred-register: '' }
+liveins:
+  - { reg: '$w0', virtual-reg: '%1' }
+  - { reg: '$w1', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: peephole_cselwr_same
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0
+    ; CHECK-NEXT: $wzr = ANDSWri [[COPY]], 0, implicit-def $nzcv
+    ; CHECK-NEXT: [[CSELWr:%[0-9]+]]:gpr32 = CSELWr [[COPY1]], [[COPY1]], 0, implicit $nzcv
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:gpr32 = COPY $w1
+    %4:gpr32 = COPY $w0
+    $wzr = ANDSWri %3, 0, implicit-def $nzcv
+    %5:gpr32 = CSELWr %4, %4, 0, implicit $nzcv
+    RET_ReallyLR
+
+...
+---
+name:            peephole_cselxr_different
+registers:
+  - { id: 1, class: gpr64, preferred-register: '' }
+  - { id: 2, class: gpr64, preferred-register: '' }
+liveins:
+  - { reg: '$x0', virtual-reg: '%1' }
+  - { reg: '$x1', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: peephole_cselxr_different
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-NEXT: $xzr = ANDSXri [[COPY]], 0, implicit-def $nzcv
+    ; CHECK-NEXT: [[CSELXr:%[0-9]+]]:gpr64 = CSELXr [[COPY]], [[COPY1]], 0, implicit $nzcv
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:gpr64 = COPY $x1
+    %4:gpr64 = COPY $x0
+    $xzr = ANDSXri %3, 0, implicit-def $nzcv
+    %5:gpr64 = CSELXr %3, %4, 0, implicit $nzcv
+    RET_ReallyLR
+
+...
+---
+name:            peephole_cselwr_different
+registers:
+  - { id: 1, class: gpr32, preferred-register: '' }
+  - { id: 2, class: gpr32, preferred-register: '' }
+liveins:
+  - { reg: '$w0', virtual-reg: '%1' }
+  - { reg: '$w1', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: peephole_cselwr_different
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0
+    ; CHECK-NEXT: $wzr = ANDSWri [[COPY]], 0, implicit-def $nzcv
+    ; CHECK-NEXT: [[CSELWr:%[0-9]+]]:gpr32 = CSELWr [[COPY]], [[COPY1]], 0, implicit $nzcv
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:gpr32 = COPY $w1
+    %4:gpr32 = COPY $w0
+    $wzr = ANDSWri %3, 0, implicit-def $nzcv
+    %5:gpr32 = CSELWr %3, %4, 0, implicit $nzcv
+    RET_ReallyLR
+
+...
+
diff --git a/llvm/test/CodeGen/AArch64/peephole-sxtw.mir b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir
new file mode 100644
index 0000000000000..274d434bbec67
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s
+
+---
+name: removeSxtw
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x0
+    ; CHECK-LABEL: name: removeSxtw
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32sp = COPY [[COPY]].sub_32
+    ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY1]], 1, 0
+    ; CHECK-NEXT: $w0 = COPY [[ADDWri]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:gpr64 = COPY $x0
+    %1:gpr64 = SBFMXri %0:gpr64, 0, 31
+    %2:gpr32sp = COPY %1.sub_32:gpr64
+    %3:gpr32sp = ADDWri %2:gpr32sp, 1, 0
+    $w0 = COPY %3:gpr32sp
+    RET_ReallyLR implicit $w0
+...
+---
+name: extraCopy
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x0
+    ; CHECK-LABEL: name: extraCopy
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32sp = COPY [[COPY]].sub_32
+    ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY1]], 1, 0
+    ; CHECK-NEXT: $w0 = COPY [[ADDWri]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:gpr64 = COPY $x0
+    %1:gpr64 = SBFMXri %0:gpr64, 0, 31
+    %2:gpr64all = COPY %1:gpr64
+    %3:gpr32sp = COPY %2.sub_32:gpr64all
+    %4:gpr32sp = ADDWri %3:gpr32sp, 1, 0
+    $w0 = COPY %4:gpr32sp
+    RET_ReallyLR implicit $w0
+...
+---
+name: multipleCopies
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: multipleCopies
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:gpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub_32
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY [[INSERT_SUBREG]].sub_32
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
+  ; CHECK-NEXT:   [[SUBWrr:%[0-9]+]]:gpr32 = SUBWrr [[COPY2]], [[COPY1]]
+  ; CHECK-NEXT:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[SUBWrr]], %subreg.sub_32
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.1(0x04000000), %bb.2(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZX [[SUBREG_TO_REG]], %bb.1
+  ; CHECK-NEXT:   B %bb.2
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $w0
+
+    %2:gpr32 = COPY $w0
+    %4:gpr64all = IMPLICIT_DEF
+    %3:gpr64 = INSERT_SUBREG %4, %2, %subreg.sub_32
+    %5:gpr64 = SBFMXri killed %3, 0, 31
+    %0:gpr64all = COPY %5
+    %6:gpr64all = COPY %0
+    %7:gpr32 = COPY %6.sub_32
+    %8:gpr32 = COPY $wzr
+    %9:gpr32 = SUBWrr %8, %7
+    %10:gpr32 = ORRWrs $wzr, %9, 0
+    %1:gpr64 = SUBREG_TO_REG 0, %10, %subreg.sub_32
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    successors: %bb.1(0x04000000), %bb.2(0x7c000000)
+
+    CBZX %1, %bb.1
+    B %bb.2
+
+...
diff --git a/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_aapcs.ll b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_aapcs.ll
new file mode 100644
index 0000000000000..48898719f40ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_aapcs.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+%va_list = type { ptr, ptr, ptr, i32, i32 }
+
+define preserve_nonecc i32 @callee(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, ...) nounwind noinline ssp {
+; CHECK-LABEL: callee:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #192
+; CHECK-NEXT:    mov x8, #-24 // =0xffffffffffffffe8
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x10, sp, #136
+; CHECK-NEXT:    movk x8, #65408, lsl #32
+; CHECK-NEXT:    add x9, x9, #128
+; CHECK-NEXT:    stp x6, x7, [sp, #144]
+; CHECK-NEXT:    stp x9, x8, [sp, #176]
+; CHECK-NEXT:    add x9, x10, #24
+; CHECK-NEXT:    add x10, sp, #192
+; CHECK-NEXT:    mov w8, #-24 // =0xffffffe8
+; CHECK-NEXT:    str x5, [sp, #136]
+; CHECK-NEXT:    stp q0, q1, [sp]
+; CHECK-NEXT:    stp q2, q3, [sp, #32]
+; CHECK-NEXT:    stp q4, q5, [sp, #64]
+; CHECK-NEXT:    stp q6, q7, [sp, #96]
+; CHECK-NEXT:    stp x10, x9, [sp, #160]
+; CHECK-NEXT:    tbz w8, #31, .LBB0_3
+; CHECK-NEXT:  // %bb.1: // %maybe_reg
+; CHECK-NEXT:    add w9, w8, #8
+; CHECK-NEXT:    cmp w9, #0
+; CHECK-NEXT:    str w9, [sp, #184]
+; CHECK-NEXT:    b.gt .LBB0_3
+; CHECK-NEXT:  // %bb.2: // %in_reg
+; CHECK-NEXT:    ldr x9, [sp, #168]
+; CHECK-NEXT:    add x8, x9, w8, sxtw
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_3: // %on_stack
+; CHECK-NEXT:    ldr x8, [sp, #160]
+; CHECK-NEXT:    add x9, x8, #8
+; CHECK-NEXT:    str x9, [sp, #160]
+; CHECK-NEXT:  .LBB0_4: // %end
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #192
+; CHECK-NEXT:    ret
+entry:
+  %args = alloca %va_list, align 8
+  call void @llvm.va_start(ptr %args)
+  %gr_offs_p = getelementptr inbounds %va_list, ptr %args, i32 0, i32 3
+  %gr_offs = load i32, ptr %gr_offs_p, align 8
+  %0 = icmp sge i32 %gr_offs, 0
+  br i1 %0, label %on_stack, label %maybe_reg
+
+maybe_reg:
+  %new_reg_offs = add i32 %gr_offs, 8
+  store i32 %new_reg_offs, ptr %gr_offs_p, align 8
+  %inreg = icmp sle i32 %new_reg_offs, 0
+  br i1 %inreg, label %in_reg, label %on_stack
+
+in_reg:
+  %reg_top_p = getelementptr inbounds %va_list, ptr %args, i32 0, i32 1
+  %reg_top = load ptr, ptr %reg_top_p, align 8
+  %reg = getelementptr inbounds i8, ptr %reg_top, i32 %gr_offs
+  br label %end
+
+on_stack:
+  %stack_p = getelementptr inbounds %va_list, ptr %args, i32 0, i32 0
+  %stack = load ptr, ptr %stack_p, align 8
+  %new_stack = getelementptr inbounds i8, ptr %stack, i64 8
+  store ptr %new_stack, ptr %stack_p, align 8
+  br label %end
+
+end:
+  %p = phi ptr [ %reg, %in_reg ], [ %stack, %on_stack ]
+  %10 = load i32, ptr %p, align 8
+  call void @llvm.va_end.p0(ptr %args)
+  ret i32 %10
+}
+
+declare void @llvm.va_start(ptr) nounwind
+declare void @llvm.va_end(ptr) nounwind
+
+define i32 @caller() nounwind ssp {
+; CHECK-LABEL: caller:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #176
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    mov w9, #9 // =0x9
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    mov w2, #3 // =0x3
+; CHECK-NEXT:    mov w3, #4 // =0x4
+; CHECK-NEXT:    mov w4, #5 // =0x5
+; CHECK-NEXT:    mov w5, #6 // =0x6
+; CHECK-NEXT:    mov w6, #7 // =0x7
+; CHECK-NEXT:    mov w7, #8 // =0x8
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x28, x27, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT:    str w8, [sp, #8]
+; CHECK-NEXT:    str w9, [sp]
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #176
+; CHECK-NEXT:    ret
+  %r = tail call preserve_nonecc i32 (i32, i32, i32, i32, i32, ...) @callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10)
+  ret i32 %r
+}
+
diff --git a/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll
new file mode 100644
index 0000000000000..e227f14542cc1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define preserve_nonecc i32 @callee(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, ...) nounwind noinline ssp {
+; CHECK-LABEL: callee:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    ldr w0, [sp, #16]
+; CHECK-NEXT:    orr x8, x8, #0x8
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %args = alloca ptr, align 8
+  call void @llvm.va_start(ptr %args)
+  %10 = va_arg ptr %args, i32
+  call void @llvm.va_end(ptr %args)
+  ret i32 %10
+}
+
+declare void @llvm.va_start(ptr) nounwind
+declare void @llvm.va_end(ptr) nounwind
+
+define i32 @caller() nounwind ssp {
+; CHECK-LABEL: caller:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #208
+; CHECK-NEXT:    mov w8, #10 ; =0xa
+; CHECK-NEXT:    mov w9, #9 ; =0x9
+; CHECK-NEXT:    mov w0, #1 ; =0x1
+; CHECK-NEXT:    stp x9, x8, [sp, #24]
+; CHECK-NEXT:    mov w8, #8 ; =0x8
+; CHECK-NEXT:    mov w9, #6 ; =0x6
+; CHECK-NEXT:    str x8, [sp, #16]
+; CHECK-NEXT:    mov w8, #7 ; =0x7
+; CHECK-NEXT:    mov w1, #2 ; =0x2
+; CHECK-NEXT:    mov w2, #3 ; =0x3
+; CHECK-NEXT:    mov w3, #4 ; =0x4
+; CHECK-NEXT:    mov w4, #5 ; =0x5
+; CHECK-NEXT:    stp d15, d14, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #80] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #96] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x28, x27, [sp, #112] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #128] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #144] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #160] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #176] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #192] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x8, [sp]
+; CHECK-NEXT:    bl _callee
+; CHECK-NEXT:    ldp x29, x30, [sp, #192] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #176] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #160] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #144] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #128] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #112] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #96] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #80] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #64] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #48] ; 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #208
+; CHECK-NEXT:    ret
+  %r = tail call preserve_nonecc i32 (i32, i32, i32, i32, i32, ...) @callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10)
+  ret i32 %r
+}
+
diff --git a/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_win64.ll b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_win64.ll
new file mode 100644
index 0000000000000..83dd240a6540f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_win64.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-pc-windows < %s | FileCheck %s
+
+define preserve_nonecc i32 @callee(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, ...) nounwind noinline ssp {
+; CHECK-LABEL: callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    mov x0, x5
+; CHECK-NEXT:    add x8, sp, #24
+; CHECK-NEXT:    stp x6, x7, [sp, #32]
+; CHECK-NEXT:    str x5, [sp, #24]
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %args = alloca ptr, align 8
+  call void @llvm.va_start(ptr %args)
+  %p = load ptr, ptr %args, align 8
+  %10 = load i32, ptr %p, align 8
+  call void @llvm.va_end(ptr %args)
+  ret i32 %10
+}
+
+declare void @llvm.va_start(ptr) nounwind
+declare void @llvm.va_end(ptr) nounwind
+
+define i32 @caller() nounwind ssp {
+; CHECK-LABEL: caller:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #176
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    mov w9, #9 // =0x9
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    mov w1, #2 // =0x2
+; CHECK-NEXT:    mov w2, #3 // =0x3
+; CHECK-NEXT:    mov w3, #4 // =0x4
+; CHECK-NEXT:    mov w4, #5 // =0x5
+; CHECK-NEXT:    mov w5, #6 // =0x6
+; CHECK-NEXT:    mov w6, #7 // =0x7
+; CHECK-NEXT:    mov w7, #8 // =0x8
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x28, x27, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT:    str w8, [sp, #8]
+; CHECK-NEXT:    str w9, [sp]
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #176
+; CHECK-NEXT:    ret
+  %r = tail call preserve_nonecc i32 (i32, i32, i32, i32, i32, ...) @callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10)
+  ret i32 %r
+}
+
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll b/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
new file mode 100644
index 0000000000000..d5340dcebad57
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
@@ -0,0 +1,376 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple arm64e-apple-darwin                          -verify-machineinstrs | FileCheck %s -DL="L"  --check-prefixes=ALL,NOFPAC
+; RUN: llc < %s -mtriple arm64e-apple-darwin             -mattr=+fpac -verify-machineinstrs | FileCheck %s -DL="L"  --check-prefixes=ALL,FPAC
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth              -verify-machineinstrs | FileCheck %s -DL=".L" --check-prefixes=ALL,NOFPAC
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -mattr=+fpac -verify-machineinstrs | FileCheck %s -DL=".L" --check-prefixes=ALL,FPAC
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
+; ALL-LABEL: test_auth_ia:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autia x16, x1
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ia_zero(i64 %arg) {
+; ALL-LABEL: test_auth_ia_zero:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autiza x16
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
+; ALL-LABEL: test_auth_ib:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autib x16, x1
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ib_zero(i64 %arg) {
+; ALL-LABEL: test_auth_ib_zero:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autizb x16
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_da(i64 %arg, i64 %arg1) {
+; ALL-LABEL: test_auth_da:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autda x16, x1
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_da_zero(i64 %arg) {
+; ALL-LABEL: test_auth_da_zero:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autdza x16
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_db(i64 %arg, i64 %arg1) {
+; ALL-LABEL: test_auth_db:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autdb x16, x1
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_db_zero(i64 %arg) {
+; ALL-LABEL: test_auth_db_zero:
+; ALL:       %bb.0:
+; ALL-NEXT:    mov x16, x0
+; ALL-NEXT:    autdzb x16
+; ALL-NEXT:    mov x0, x16
+; ALL-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 0)
+  ret i64 %tmp
+}
+
+; Note that this might seem like a no-op but is actually a valid way to enforce
+; the validity of a signature.
+define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_ia_ia:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autia x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpaci x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_0
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_0
+; NOFPAC-NEXT:  Lauth_success_0:
+; NOFPAC-NEXT:    pacia x16, x2
+; NOFPAC-NEXT:  Lresign_end_0:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_ia_ia:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autia x16, x1
+; FPAC-NEXT:    pacia x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_ib_ia:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autib x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpaci x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_1
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_1
+; NOFPAC-NEXT:  Lauth_success_1:
+; NOFPAC-NEXT:    pacia x16, x2
+; NOFPAC-NEXT:  Lresign_end_1:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_ib_ia:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autib x16, x1
+; FPAC-NEXT:    pacia x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 1, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_da_ia:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autda x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_2
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_2
+; NOFPAC-NEXT:  Lauth_success_2:
+; NOFPAC-NEXT:    pacia x16, x2
+; NOFPAC-NEXT:  Lresign_end_2:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_da_ia:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autda x16, x1
+; FPAC-NEXT:    pacia x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_db_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_db_ia:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autdb x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_3
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_3
+; NOFPAC-NEXT:  Lauth_success_3:
+; NOFPAC-NEXT:    pacia x16, x2
+; NOFPAC-NEXT:  Lresign_end_3:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_db_ia:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autdb x16, x1
+; FPAC-NEXT:    pacia x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_db_ib(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_db_ib:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autdb x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_4
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_4
+; NOFPAC-NEXT:  Lauth_success_4:
+; NOFPAC-NEXT:    pacib x16, x2
+; NOFPAC-NEXT:  Lresign_end_4:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_db_ib:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autdb x16, x1
+; FPAC-NEXT:    pacib x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 1, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_db_da:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autdb x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_5
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_5
+; NOFPAC-NEXT:  Lauth_success_5:
+; NOFPAC-NEXT:    pacda x16, x2
+; NOFPAC-NEXT:  Lresign_end_5:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_db_da:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autdb x16, x1
+; FPAC-NEXT:    pacda x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 2, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_db_db(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_db_db:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autdb x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_6
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_6
+; NOFPAC-NEXT:  Lauth_success_6:
+; NOFPAC-NEXT:    pacdb x16, x2
+; NOFPAC-NEXT:  Lresign_end_6:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_db_db:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autdb x16, x1
+; FPAC-NEXT:    pacdb x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 3, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_iza_db:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autiza x16
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpaci x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_7
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_7
+; NOFPAC-NEXT:  Lauth_success_7:
+; NOFPAC-NEXT:    pacdb x16, x2
+; NOFPAC-NEXT:  Lresign_end_7:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_iza_db:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autiza x16
+; FPAC-NEXT:    pacdb x16, x2
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 0, i32 3, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
+; NOFPAC-LABEL: test_resign_da_dzb:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autda x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpacd x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_8
+; NOFPAC-NEXT:    mov x16, x17
+; NOFPAC-NEXT:    b [[L]]resign_end_8
+; NOFPAC-NEXT:  Lauth_success_8:
+; NOFPAC-NEXT:    pacdzb x16
+; NOFPAC-NEXT:  Lresign_end_8:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_resign_da_dzb:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autda x16, x1
+; FPAC-NEXT:    pacdzb x16
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 3, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
+; NOFPAC-LABEL: test_auth_trap_attribute:
+; NOFPAC:       %bb.0:
+; NOFPAC-NEXT:    mov x16, x0
+; NOFPAC-NEXT:    autia x16, x1
+; NOFPAC-NEXT:    mov x17, x16
+; NOFPAC-NEXT:    xpaci x17
+; NOFPAC-NEXT:    cmp x16, x17
+; NOFPAC-NEXT:    b.eq [[L]]auth_success_9
+; NOFPAC-NEXT:    brk #0xc470
+; NOFPAC-NEXT:  Lauth_success_9:
+; NOFPAC-NEXT:    mov x0, x16
+; NOFPAC-NEXT:    ret
+;
+; FPAC-LABEL: test_auth_trap_attribute:
+; FPAC:       %bb.0:
+; FPAC-NEXT:    mov x16, x0
+; FPAC-NEXT:    autia x16, x1
+; FPAC-NEXT:    mov x0, x16
+; FPAC-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
+  ret i64 %tmp
+}
+
+declare i64 @llvm.ptrauth.auth(i64, i32, i64)
+declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll b/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll
new file mode 100644
index 0000000000000..94de1b4f949e4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple arm64e-apple-darwin \
+; RUN:   -aarch64-enable-collect-loh=false \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc -mtriple arm64e-apple-darwin \
+; RUN:   -fast-isel \
+; RUN:   -aarch64-enable-collect-loh=false \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc -mtriple arm64e-apple-darwin \
+; RUN:   -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-enable-collect-loh=false \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
+
+; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
+
+; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -fast-isel \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
+
+; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
+; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
+
+;; The discriminator is the same for all blockaddresses in the function.
+;; ptrauth_string_discriminator("test_indirectbr blockaddress") == 34947
+
+define i32 @test_indirectbr() #0 {
+; MACHO-LABEL: test_indirectbr:
+; MACHO:       ; %bb.0: ; %entry
+; MACHO-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; MACHO-NEXT:    adrp x16, Ltmp0@PAGE
+; MACHO-NEXT:    add x16, x16, Ltmp0@PAGEOFF
+; MACHO-NEXT:    mov x17, #34947 ; =0x8883
+; MACHO-NEXT:    pacia x16, x17
+; MACHO-NEXT:    mov x0, x16
+; MACHO-NEXT:    adrp x16, Ltmp1@PAGE
+; MACHO-NEXT:    add x16, x16, Ltmp1@PAGEOFF
+; MACHO-NEXT:    mov x17, #34947 ; =0x8883
+; MACHO-NEXT:    pacia x16, x17
+; MACHO-NEXT:    mov x1, x16
+; MACHO-NEXT:    bl _dummy_choose
+; MACHO-NEXT:    mov x17, #34947 ; =0x8883
+; MACHO-NEXT:    braa x0, x17
+; MACHO-NEXT:  Ltmp0: ; Block address taken
+; MACHO-NEXT:  LBB0_1: ; %bb1
+; MACHO-NEXT:    mov w0, #1 ; =0x1
+; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; MACHO-NEXT:    ret
+; MACHO-NEXT:  Ltmp1: ; Block address taken
+; MACHO-NEXT:  LBB0_2: ; %bb2
+; MACHO-NEXT:    mov w0, #2 ; =0x2
+; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; MACHO-NEXT:    ret
+;
+; ELF-LABEL: test_indirectbr:
+; ELF:       // %bb.0: // %entry
+; ELF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ELF-NEXT:    adrp x16, .Ltmp0
+; ELF-NEXT:    add x16, x16, :lo12:.Ltmp0
+; ELF-NEXT:    mov x17, #34947 // =0x8883
+; ELF-NEXT:    pacia x16, x17
+; ELF-NEXT:    mov x0, x16
+; ELF-NEXT:    adrp x16, .Ltmp1
+; ELF-NEXT:    add x16, x16, :lo12:.Ltmp1
+; ELF-NEXT:    mov x17, #34947 // =0x8883
+; ELF-NEXT:    pacia x16, x17
+; ELF-NEXT:    mov x1, x16
+; ELF-NEXT:    bl dummy_choose
+; ELF-NEXT:    mov x17, #34947 // =0x8883
+; ELF-NEXT:    braa x0, x17
+; ELF-NEXT:  .Ltmp0: // Block address taken
+; ELF-NEXT:  .LBB0_1: // %bb1
+; ELF-NEXT:    mov w0, #1 // =0x1
+; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ELF-NEXT:    ret
+; ELF-NEXT:  .Ltmp1: // Block address taken
+; ELF-NEXT:  .LBB0_2: // %bb2
+; ELF-NEXT:    mov w0, #2 // =0x2
+; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ELF-NEXT:    ret
+entry:
+  %tmp0 = call ptr @dummy_choose(ptr blockaddress(@test_indirectbr, %bb1), ptr blockaddress(@test_indirectbr, %bb2))
+  indirectbr ptr %tmp0, [label %bb1, label %bb2]
+
+bb1:
+  ret i32 1
+
+bb2:
+  ret i32 2
+}
+
+define ptr @test_indirectbr_other_function() #0 {
+; MACHO-LABEL: test_indirectbr_other_function:
+; MACHO:       ; %bb.0:
+; MACHO-NEXT:    adrp x16, Ltmp0@PAGE
+; MACHO-NEXT:    add x16, x16, Ltmp0@PAGEOFF
+; MACHO-NEXT:    mov x17, #34947 ; =0x8883
+; MACHO-NEXT:    pacia x16, x17
+; MACHO-NEXT:    mov x0, x16
+; MACHO-NEXT:    ret
+;
+; ELF-LABEL: test_indirectbr_other_function:
+; ELF:       // %bb.0:
+; ELF-NEXT:    adrp x16, .Ltmp0
+; ELF-NEXT:    add x16, x16, :lo12:.Ltmp0
+; ELF-NEXT:    mov x17, #34947 // =0x8883
+; ELF-NEXT:    pacia x16, x17
+; ELF-NEXT:    mov x0, x16
+; ELF-NEXT:    ret
+  ret ptr blockaddress(@test_indirectbr, %bb1)
+}
+
+;; Test another function to compare the discriminator.
+;; ptrauth_string_discriminator("test_indirectbr_2 blockaddress") == 40224
+
+define i32 @test_indirectbr_2() #0 {
+; MACHO-LABEL: test_indirectbr_2:
+; MACHO:       ; %bb.0: ; %entry
+; MACHO-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; MACHO-NEXT:    adrp x16, Ltmp2@PAGE
+; MACHO-NEXT:    add x16, x16, Ltmp2@PAGEOFF
+; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
+; MACHO-NEXT:    pacia x16, x17
+; MACHO-NEXT:    mov x0, x16
+; MACHO-NEXT:    adrp x16, Ltmp3@PAGE
+; MACHO-NEXT:    add x16, x16, Ltmp3@PAGEOFF
+; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
+; MACHO-NEXT:    pacia x16, x17
+; MACHO-NEXT:    mov x1, x16
+; MACHO-NEXT:    bl _dummy_choose
+; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
+; MACHO-NEXT:    braa x0, x17
+; MACHO-NEXT:  Ltmp2: ; Block address taken
+; MACHO-NEXT:  LBB2_1: ; %bb1
+; MACHO-NEXT:    mov w0, #1 ; =0x1
+; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; MACHO-NEXT:    ret
+; MACHO-NEXT:  Ltmp3: ; Block address taken
+; MACHO-NEXT:  LBB2_2: ; %bb2
+; MACHO-NEXT:    mov w0, #2 ; =0x2
+; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; MACHO-NEXT:    ret
+;
+; ELF-LABEL: test_indirectbr_2:
+; ELF:       // %bb.0: // %entry
+; ELF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ELF-NEXT:    adrp x16, .Ltmp2
+; ELF-NEXT:    add x16, x16, :lo12:.Ltmp2
+; ELF-NEXT:    mov x17, #40224 // =0x9d20
+; ELF-NEXT:    pacia x16, x17
+; ELF-NEXT:    mov x0, x16
+; ELF-NEXT:    adrp x16, .Ltmp3
+; ELF-NEXT:    add x16, x16, :lo12:.Ltmp3
+; ELF-NEXT:    mov x17, #40224 // =0x9d20
+; ELF-NEXT:    pacia x16, x17
+; ELF-NEXT:    mov x1, x16
+; ELF-NEXT:    bl dummy_choose
+; ELF-NEXT:    mov x17, #40224 // =0x9d20
+; ELF-NEXT:    braa x0, x17
+; ELF-NEXT:  .Ltmp2: // Block address taken
+; ELF-NEXT:  .LBB2_1: // %bb1
+; ELF-NEXT:    mov w0, #1 // =0x1
+; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ELF-NEXT:    ret
+; ELF-NEXT:  .Ltmp3: // Block address taken
+; ELF-NEXT:  .LBB2_2: // %bb2
+; ELF-NEXT:    mov w0, #2 // =0x2
+; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ELF-NEXT:    ret
+entry:
+  %tmp0 = call ptr @dummy_choose(ptr blockaddress(@test_indirectbr_2, %bb1), ptr blockaddress(@test_indirectbr_2, %bb2))
+  indirectbr ptr %tmp0, [label %bb1, label %bb2]
+
+bb1:
+  ret i32 1
+
+bb2:
+  ret i32 2
+}
+
+;; Check we don't interfere with jump-table BRIND lowering.
+
+; MACHO-LABEL: test_jumptable:
+; MACHO:        adrp x9, LJTI3_0@PAGE
+; MACHO-NEXT:   add x9, x9, LJTI3_0@PAGEOFF
+; MACHO-NEXT:   adr x10, LBB3_2
+; MACHO-NEXT:   ldrb w11, [x9, x8]
+; MACHO-NEXT:   add x10, x10, x11, lsl #2
+; MACHO-NEXT:   br x10
+
+; ELF-LABEL: test_jumptable:
+; ELF:        adrp x9, .LJTI3_0
+; ELF-NEXT:   add x9, x9, :lo12:.LJTI3_0
+; ELF-NEXT:   adr x10, .LBB3_2
+; ELF-NEXT:   ldrb w11, [x9, x8]
+; ELF-NEXT:   add x10, x10, x11, lsl #2
+; ELF-NEXT:   br x10
+define i32 @test_jumptable(i32 %in) #0 {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+  ]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+}
+
+; MACHO-LABEL: .globl _test_indirectbr_array
+; MACHO-NEXT:  .p2align 4
+; MACHO-NEXT:  _test_indirectbr_array:
+; MACHO-NEXT:   .quad Ltmp0@AUTH(ia,34947)
+; MACHO-NEXT:   .quad Ltmp1@AUTH(ia,34947)
+; MACHO-NEXT:   .quad Ltmp2@AUTH(ia,40224)
+; MACHO-NEXT:   .quad Ltmp3@AUTH(ia,40224)
+
+; ELF-LABEL: .globl test_indirectbr_array
+; ELF-NEXT:  .p2align 4, 0x0
+; ELF-NEXT:  test_indirectbr_array:
+; ELF-NEXT:   .xword .Ltmp0@AUTH(ia,34947)
+; ELF-NEXT:   .xword .Ltmp1@AUTH(ia,34947)
+; ELF-NEXT:   .xword .Ltmp2@AUTH(ia,40224)
+; ELF-NEXT:   .xword .Ltmp3@AUTH(ia,40224)
+; ELF-NEXT:   .size test_indirectbr_array, 32
+
+@test_indirectbr_array = constant [4 x ptr] [
+  ptr blockaddress(@test_indirectbr, %bb1), ptr blockaddress(@test_indirectbr, %bb2),
+  ptr blockaddress(@test_indirectbr_2, %bb1), ptr blockaddress(@test_indirectbr_2, %bb2)
+]
+
+declare ptr @dummy_choose(ptr, ptr)
+
+attributes #0 = { "ptrauth-indirect-gotos" nounwind }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
new file mode 100644
index 0000000000000..74d2370c74c54
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
@@ -0,0 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefixes=UNCHECKED,UNCHECKED-DARWIN
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefixes=UNCHECKED,UNCHECKED-DARWIN
+
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL="L" --check-prefixes=CHECKED,CHECKED-DARWIN
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL="L" --check-prefixes=CHECKED,CHECKED-DARWIN
+
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefixes=TRAP,TRAP-DARWIN
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefixes=TRAP,TRAP-DARWIN
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefixes=UNCHECKED,UNCHECKED-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefixes=UNCHECKED,UNCHECKED-ELF
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL=".L" --check-prefixes=CHECKED,CHECKED-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL=".L" --check-prefixes=CHECKED,CHECKED-ELF
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefixes=TRAP,TRAP-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefixes=TRAP,TRAP-ELF
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_auth_blend:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    mov x17, x1
+; UNCHECKED-NEXT:    movk x17, #65535, lsl #48
+; UNCHECKED-NEXT:    autda x16, x17
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_blend:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    mov x17, x1
+; CHECKED-NEXT:    movk x17, #65535, lsl #48
+; CHECKED-NEXT:    autda x16, x17
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_blend:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    mov x17, x1
+; TRAP-NEXT:    movk x17, #65535, lsl #48
+; TRAP-NEXT:    autda x16, x17
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_0
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_0:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 65535)
+  %tmp1 = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %tmp0)
+  ret i64 %tmp1
+}
+
+define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_blend:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    mov x17, x1
+; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
+; UNCHECKED-NEXT:    autda x16, x17
+; UNCHECKED-NEXT:    mov x17, x2
+; UNCHECKED-NEXT:    movk x17, #56789, lsl #48
+; UNCHECKED-NEXT:    pacdb x16, x17
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_blend:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    mov x17, x1
+; CHECKED-NEXT:    movk x17, #12345, lsl #48
+; CHECKED-NEXT:    autda x16, x17
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_0
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_0
+; CHECKED-NEXT:  Lauth_success_0:
+; CHECKED-NEXT:    mov x17, x2
+; CHECKED-NEXT:    movk x17, #56789, lsl #48
+; CHECKED-NEXT:    pacdb x16, x17
+; CHECKED-NEXT:  Lresign_end_0:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_blend:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    mov x17, x1
+; TRAP-NEXT:    movk x17, #12345, lsl #48
+; TRAP-NEXT:    autda x16, x17
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_1
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_1:
+; TRAP-NEXT:    mov x17, x2
+; TRAP-NEXT:    movk x17, #56789, lsl #48
+; TRAP-NEXT:    pacdb x16, x17
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
+  %tmp1 = call i64 @llvm.ptrauth.blend(i64 %arg2, i64 56789)
+  %tmp2 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 %tmp1)
+  ret i64 %tmp2
+}
+
+define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_resign_blend_and_const:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    mov x17, x1
+; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
+; UNCHECKED-NEXT:    autda x16, x17
+; UNCHECKED-NEXT:    mov x17, #56789
+; UNCHECKED-NEXT:    pacdb x16, x17
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_blend_and_const:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    mov x17, x1
+; CHECKED-NEXT:    movk x17, #12345, lsl #48
+; CHECKED-NEXT:    autda x16, x17
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_1
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_1
+; CHECKED-NEXT:  Lauth_success_1:
+; CHECKED-NEXT:    mov x17, #56789
+; CHECKED-NEXT:    pacdb x16, x17
+; CHECKED-NEXT:  Lresign_end_1:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_blend_and_const:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    mov x17, x1
+; TRAP-NEXT:    movk x17, #12345, lsl #48
+; TRAP-NEXT:    autda x16, x17
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_2
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_2:
+; TRAP-NEXT:    mov x17, #56789
+; TRAP-NEXT:    pacdb x16, x17
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
+  %tmp1 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 56789)
+  ret i64 %tmp1
+}
+
+define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_blend_and_addr:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    mov x17, x1
+; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
+; UNCHECKED-NEXT:    autda x16, x17
+; UNCHECKED-NEXT:    pacdb x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_blend_and_addr:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    mov x17, x1
+; CHECKED-NEXT:    movk x17, #12345, lsl #48
+; CHECKED-NEXT:    autda x16, x17
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_2
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_2
+; CHECKED-NEXT:  Lauth_success_2:
+; CHECKED-NEXT:    pacdb x16, x2
+; CHECKED-NEXT:  Lresign_end_2:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_blend_and_addr:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    mov x17, x1
+; TRAP-NEXT:    movk x17, #12345, lsl #48
+; TRAP-NEXT:    autda x16, x17
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_3
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_3:
+; TRAP-NEXT:    pacdb x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
+  %tmp1 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 %arg2)
+  ret i64 %tmp1
+}
+
+define i64 @test_auth_too_large_discriminator(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL:     test_auth_too_large_discriminator:
+; UNCHECKED:           %bb.0:
+; UNCHECKED-NEXT:        mov w8, #65536
+; UNCHECKED-DARWIN-NEXT: bfi x1, x8, #48, #16
+; UNCHECKED-DARWIN-NEXT: mov x16, x0
+; UNCHECKED-ELF-NEXT:    mov x16, x0
+; UNCHECKED-ELF-NEXT:    bfi x1, x8, #48, #16
+; UNCHECKED-NEXT:        autda x16, x1
+; UNCHECKED-NEXT:        mov x0, x16
+; UNCHECKED-NEXT:        ret
+;
+; CHECKED-LABEL: test_auth_too_large_discriminator:
+; CHECKED:           %bb.0:
+; CHECKED-NEXT:        mov w8, #65536
+; CHECKED-DARWIN-NEXT: bfi x1, x8, #48, #16
+; CHECKED-DARWIN-NEXT: mov x16, x0
+; CHECKED-ELF-NEXT:    mov x16, x0
+; CHECKED-ELF-NEXT:    bfi x1, x8, #48, #16
+; CHECKED-NEXT:        autda x16, x1
+; CHECKED-NEXT:        mov x0, x16
+; CHECKED-NEXT:        ret
+;
+; TRAP-LABEL: test_auth_too_large_discriminator:
+; TRAP:           %bb.0:
+; TRAP-NEXT:        mov w8, #65536
+; TRAP-DARWIN-NEXT: bfi x1, x8, #48, #16
+; TRAP-DARWIN-NEXT: mov x16, x0
+; TRAP-ELF-NEXT:    mov x16, x0
+; TRAP-ELF-NEXT:    bfi x1, x8, #48, #16
+; TRAP-NEXT:        autda x16, x1
+; TRAP-NEXT:        mov x17, x16
+; TRAP-NEXT:        xpacd x17
+; TRAP-NEXT:        cmp x16, x17
+; TRAP-NEXT:        b.eq [[L]]auth_success_4
+; TRAP-NEXT:        brk #0xc472
+; TRAP-NEXT:      Lauth_success_4:
+; TRAP-NEXT:        mov x0, x16
+; TRAP-NEXT:        ret
+  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 65536)
+  %tmp1 = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %tmp0)
+  ret i64 %tmp1
+}
+
+declare i64 @llvm.ptrauth.auth(i64, i32, i64)
+declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
+declare i64 @llvm.ptrauth.blend(i64, i64)
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
new file mode 100644
index 0000000000000..fdd5ae29f35ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
@@ -0,0 +1,653 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefix=UNCHECKED
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefix=UNCHECKED
+
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL="L" --check-prefix=CHECKED
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL="L" --check-prefix=CHECKED
+
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefix=TRAP
+; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefix=TRAP
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefix=UNCHECKED
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefix=UNCHECKED
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL=".L" --check-prefix=CHECKED
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:                                     | FileCheck %s -DL=".L" --check-prefix=CHECKED
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0                    -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefix=TRAP
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefix=TRAP
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_auth_ia:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autia x16, x1
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_ia:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autia x16, x1
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_ia:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autia x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_0
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_0:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ia_zero(i64 %arg) {
+; UNCHECKED-LABEL: test_auth_ia_zero:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autiza x16
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_ia_zero:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autiza x16
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_ia_zero:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autiza x16
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_1
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_1:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_auth_ib:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autib x16, x1
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_ib:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autib x16, x1
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_ib:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autib x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_2
+; TRAP-NEXT:    brk #0xc471
+; TRAP-NEXT:  Lauth_success_2:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ib_zero(i64 %arg) {
+; UNCHECKED-LABEL: test_auth_ib_zero:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autizb x16
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_ib_zero:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autizb x16
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_ib_zero:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autizb x16
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_3
+; TRAP-NEXT:    brk #0xc471
+; TRAP-NEXT:  Lauth_success_3:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_da(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_auth_da:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autda x16, x1
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_da:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autda x16, x1
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_da:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autda x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_4
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_4:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_da_zero(i64 %arg) {
+; UNCHECKED-LABEL: test_auth_da_zero:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autdza x16
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_da_zero:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autdza x16
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_da_zero:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autdza x16
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_5
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_5:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_db(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_auth_db:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autdb x16, x1
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_db:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autdb x16, x1
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_db:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autdb x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_6
+; TRAP-NEXT:    brk #0xc473
+; TRAP-NEXT:  Lauth_success_6:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_db_zero(i64 %arg) {
+; UNCHECKED-LABEL: test_auth_db_zero:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autdzb x16
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_db_zero:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autdzb x16
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_db_zero:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autdzb x16
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_7
+; TRAP-NEXT:    brk #0xc473
+; TRAP-NEXT:  Lauth_success_7:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 0)
+  ret i64 %tmp
+}
+
+;; Note that this might seem like a no-op but is actually a valid way to enforce
+;; the validity of a signature.
+define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_ia_ia:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autia x16, x1
+; UNCHECKED-NEXT:    pacia x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_ia_ia:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autia x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpaci x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_0
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_0
+; CHECKED-NEXT:  Lauth_success_0:
+; CHECKED-NEXT:    pacia x16, x2
+; CHECKED-NEXT:  Lresign_end_0:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_ia_ia:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autia x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_8
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_8:
+; TRAP-NEXT:    pacia x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_ib_ia:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autib x16, x1
+; UNCHECKED-NEXT:    pacia x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_ib_ia:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autib x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpaci x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_1
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_1
+; CHECKED-NEXT:  Lauth_success_1:
+; CHECKED-NEXT:    pacia x16, x2
+; CHECKED-NEXT:  Lresign_end_1:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_ib_ia:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autib x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_9
+; TRAP-NEXT:    brk #0xc471
+; TRAP-NEXT:  Lauth_success_9:
+; TRAP-NEXT:    pacia x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 1, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_da_ia:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autda x16, x1
+; UNCHECKED-NEXT:    pacia x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_da_ia:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autda x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_2
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_2
+; CHECKED-NEXT:  Lauth_success_2:
+; CHECKED-NEXT:    pacia x16, x2
+; CHECKED-NEXT:  Lresign_end_2:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_da_ia:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autda x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_10
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_10:
+; TRAP-NEXT:    pacia x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 0, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_db_da:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autdb x16, x1
+; UNCHECKED-NEXT:    pacda x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_db_da:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autdb x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_3
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_3
+; CHECKED-NEXT:  Lauth_success_3:
+; CHECKED-NEXT:    pacda x16, x2
+; CHECKED-NEXT:  Lresign_end_3:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_db_da:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autdb x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_11
+; TRAP-NEXT:    brk #0xc473
+; TRAP-NEXT:  Lauth_success_11:
+; TRAP-NEXT:    pacda x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 2, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_iza_db:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autiza x16
+; UNCHECKED-NEXT:    pacdb x16, x2
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_iza_db:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autiza x16
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpaci x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_4
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_4
+; CHECKED-NEXT:  Lauth_success_4:
+; CHECKED-NEXT:    pacdb x16, x2
+; CHECKED-NEXT:  Lresign_end_4:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_iza_db:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autiza x16
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_12
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_12:
+; TRAP-NEXT:    pacdb x16, x2
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 0, i32 3, i64 %arg2)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
+; UNCHECKED-LABEL: test_resign_da_dzb:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autda x16, x1
+; UNCHECKED-NEXT:    pacdzb x16
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_da_dzb:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autda x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_5
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_5
+; CHECKED-NEXT:  Lauth_success_5:
+; CHECKED-NEXT:    pacdzb x16
+; CHECKED-NEXT:  Lresign_end_5:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_da_dzb:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autda x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_13
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_13:
+; TRAP-NEXT:    pacdzb x16
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 3, i64 0)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
+; UNCHECKED-LABEL: test_auth_trap_attribute:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autia x16, x1
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_trap_attribute:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autia x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpaci x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_6
+; CHECKED-NEXT:    brk #0xc470
+; CHECKED-NEXT:  Lauth_success_6:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_trap_attribute:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autia x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_14
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_14:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
+  ret i64 %tmp
+}
+
+define i64 @test_auth_ia_constdisc(i64 %arg) {
+; UNCHECKED-LABEL: test_auth_ia_constdisc:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    mov x17, #256
+; UNCHECKED-NEXT:    autia x16, x17
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_auth_ia_constdisc:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    mov x17, #256
+; CHECKED-NEXT:    autia x16, x17
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_auth_ia_constdisc:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    mov x17, #256
+; TRAP-NEXT:    autia x16, x17
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpaci x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_15
+; TRAP-NEXT:    brk #0xc470
+; TRAP-NEXT:  Lauth_success_15:
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 256)
+  ret i64 %tmp
+}
+
+define i64 @test_resign_da_constdisc(i64 %arg, i64 %arg1) {
+; UNCHECKED-LABEL: test_resign_da_constdisc:
+; UNCHECKED:       %bb.0:
+; UNCHECKED-NEXT:    mov x16, x0
+; UNCHECKED-NEXT:    autda x16, x1
+; UNCHECKED-NEXT:    mov x17, #256
+; UNCHECKED-NEXT:    pacda x16, x17
+; UNCHECKED-NEXT:    mov x0, x16
+; UNCHECKED-NEXT:    ret
+;
+; CHECKED-LABEL: test_resign_da_constdisc:
+; CHECKED:       %bb.0:
+; CHECKED-NEXT:    mov x16, x0
+; CHECKED-NEXT:    autda x16, x1
+; CHECKED-NEXT:    mov x17, x16
+; CHECKED-NEXT:    xpacd x17
+; CHECKED-NEXT:    cmp x16, x17
+; CHECKED-NEXT:    b.eq [[L]]auth_success_7
+; CHECKED-NEXT:    mov x16, x17
+; CHECKED-NEXT:    b [[L]]resign_end_6
+; CHECKED-NEXT:  Lauth_success_7:
+; CHECKED-NEXT:    mov x17, #256
+; CHECKED-NEXT:    pacda x16, x17
+; CHECKED-NEXT:  Lresign_end_6:
+; CHECKED-NEXT:    mov x0, x16
+; CHECKED-NEXT:    ret
+;
+; TRAP-LABEL: test_resign_da_constdisc:
+; TRAP:       %bb.0:
+; TRAP-NEXT:    mov x16, x0
+; TRAP-NEXT:    autda x16, x1
+; TRAP-NEXT:    mov x17, x16
+; TRAP-NEXT:    xpacd x17
+; TRAP-NEXT:    cmp x16, x17
+; TRAP-NEXT:    b.eq [[L]]auth_success_16
+; TRAP-NEXT:    brk #0xc472
+; TRAP-NEXT:  Lauth_success_16:
+; TRAP-NEXT:    mov x17, #256
+; TRAP-NEXT:    pacda x16, x17
+; TRAP-NEXT:    mov x0, x16
+; TRAP-NEXT:    ret
+  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 2, i64 256)
+  ret i64 %tmp
+}
+
+declare i64 @llvm.ptrauth.auth(i64, i32, i64)
+declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll b/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll
new file mode 100644
index 0000000000000..42a3050eda112
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+pauth -asm-verbose=false -disable-post-ra -o - %s | FileCheck %s
+
+; CHECK-LABEL:  test_tailcall:
+; CHECK-NEXT:   pacibsp
+; CHECK-NEXT:   str x30, [sp, #-16]!
+; CHECK-NEXT:   bl bar
+; CHECK-NEXT:   ldr x30, [sp], #16
+; CHECK-NEXT:   autibsp
+; CHECK-NEXT:   eor x16, x30, x30, lsl #1
+; CHECK-NEXT:   tbnz x16, #62, [[BAD:.L.*]]
+; CHECK-NEXT:   b bar
+; CHECK-NEXT:   [[BAD]]:
+; CHECK-NEXT:   brk #0xc471
+define i32 @test_tailcall() #0 {
+  call i32 @bar()
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+; CHECK-LABEL: test_tailcall_noframe:
+; CHECK-NEXT:  b bar
+define i32 @test_tailcall_noframe() #0 {
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+; CHECK-LABEL: test_tailcall_indirect:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         br      x0
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_tailcall_indirect(ptr %fptr) #0 {
+  call i32 @test_tailcall()
+  tail call void %fptr()
+  ret void
+}
+
+; CHECK-LABEL: test_tailcall_indirect_in_x9:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         br      x9
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_tailcall_indirect_in_x9(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in)
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         mov x16, #42
+; CHECK:         braa      x0, x16
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect(ptr %fptr) #0 {
+  call i32 @test_tailcall()
+  tail call void %fptr() [ "ptrauth"(i32 0, i64 42) ]
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect_in_x9:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         brabz      x9
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect_in_x9(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in) [ "ptrauth"(i32 1, i64 0) ]
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect_bti:
+; CHECK:         autibsp
+; CHECK:         eor     x17, x30, x30, lsl #1
+; CHECK:         tbnz    x17, #62, [[BAD:.L.*]]
+; CHECK:         brabz      x16
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect_bti(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 "branch-target-enforcement"="true" {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in) [ "ptrauth"(i32 1, i64 0) ]
+  ret void
+}
+
+declare i32 @bar()
+
+attributes #0 = { nounwind "ptrauth-returns" "ptrauth-auth-traps" }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-ret.ll b/llvm/test/CodeGen/AArch64/ptrauth-ret.ll
new file mode 100644
index 0000000000000..61f5f6d9d23b7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-ret.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -disable-post-ra \
+; RUN:   -global-isel=0 -o - %s | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -disable-post-ra \
+; RUN:   -global-isel=1 -global-isel-abort=1 -o - %s | FileCheck %s
+
+define i32 @test() #0 {
+; CHECK-LABEL: test:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    str x19, [sp, #-16]!
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr x19, [sp], #16
+; CHECK-NEXT:    ret
+  call void asm sideeffect "", "~{x19}"()
+  ret i32 0
+}
+
+define i32 @test_alloca() #0 {
+; CHECK-LABEL: test_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 32
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_realign_alloca() #0 {
+; CHECK-LABEL: test_realign_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub x9, sp, #112
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffff80
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16
+; CHECK-NEXT:    retab
+  %p = alloca i8, i32 32, align 128
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_big_alloca() #0 {
+; CHECK-LABEL: test_big_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]!
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    ldr x29, [sp], #16
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 1024
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_var_alloca(i32 %s) #0 {
+  %p = alloca i8, i32 %s
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_noframe_saved(ptr %p) #0 {
+; CHECK-LABEL: test_noframe_saved:
+; CHECK:       %bb.0:
+
+
+; CHECK-NEXT:  str     x29, [sp, #-96]!
+; CHECK-NEXT:  stp     x28, x27, [sp, #16]
+; CHECK-NEXT:  stp     x26, x25, [sp, #32]
+; CHECK-NEXT:  stp     x24, x23, [sp, #48]
+; CHECK-NEXT:  stp     x22, x21, [sp, #64]
+; CHECK-NEXT:  stp     x20, x19, [sp, #80]
+; CHECK-NEXT:  ldr     w29, [x0]
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  mov     w0, w29
+; CHECK-NEXT:  ldp     x20, x19, [sp, #80]
+; CHECK-NEXT:  ldp     x22, x21, [sp, #64]
+; CHECK-NEXT:  ldp     x24, x23, [sp, #48]
+; CHECK-NEXT:  ldp     x26, x25, [sp, #32]
+; CHECK-NEXT:  ldp     x28, x27, [sp, #16]
+; CHECK-NEXT:  ldr     x29, [sp], #96
+; CHECK-NEXT:  ret
+  %v = load i32, ptr %p
+  call void asm sideeffect "", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"()
+  ret i32 %v
+}
+
+define void @test_noframe() #0 {
+; CHECK-LABEL: test_noframe:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    ret
+  ret void
+}
+
+; FIXME: Inefficient lowering of @llvm.returnaddress
+define ptr @test_returnaddress_0() #0 {
+; CHECK-LABEL: test_returnaddress_0:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    xpaci x30
+; CHECK-NEXT:    mov x0, x30
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  %r = call ptr @llvm.returnaddress(i32 0)
+  ret ptr %r
+}
+
+define ptr @test_returnaddress_1() #0 {
+; CHECK-LABEL: test_returnaddress_1:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    ldr x8, [x29]
+; CHECK-NEXT:    ldr x0, [x8, #8]
+; CHECK-NEXT:    xpaci x0
+; CHECK-NEXT:    ldp x29, x30, [sp], #16
+; CHECK-NEXT:    retab
+  %r = call ptr @llvm.returnaddress(i32 1)
+  ret ptr %r
+}
+
+define void @test_noframe_alloca() #0 {
+; CHECK-LABEL: test_noframe_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    add x8, sp, #12
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 1
+  call void asm sideeffect "", "r"(ptr %p)
+  ret void
+}
+
+define void @test_call() #0 {
+; CHECK-LABEL: test_call:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  call i32 @bar()
+  ret void
+}
+
+define void @test_call_alloca() #0 {
+; CHECK-LABEL: test_call_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  alloca i8
+  call i32 @bar()
+  ret void
+}
+
+define void @test_call_shrinkwrapping(i1 %c) #0 {
+; CHECK-LABEL: test_call_shrinkwrapping:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    tbz w0, #0, .LBB12_2
+; CHECK-NEXT:  %bb.1:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    autibsp
+; CHECK-NEXT:  LBB12_2:
+; CHECK-NEXT:    ret
+  br i1 %c, label %tbb, label %fbb
+tbb:
+  call i32 @bar()
+  br label %fbb
+fbb:
+  ret void
+}
+
+define i32 @test_tailcall() #0 {
+; CHECK-LABEL: test_tailcall:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    autibsp
+; CHECK-NEXT:    b bar
+  call i32 @bar()
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+define i32 @test_tailcall_noframe() #0 {
+; CHECK-LABEL: test_tailcall_noframe:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    b bar
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+declare i32 @bar()
+
+declare ptr @llvm.returnaddress(i32)
+
+attributes #0 = { nounwind "ptrauth-returns" }
diff --git a/llvm/test/CodeGen/AArch64/qmovn.ll b/llvm/test/CodeGen/AArch64/qmovn.ll
index f4fdb1b97e292..35c172adbad3d 100644
--- a/llvm/test/CodeGen/AArch64/qmovn.ll
+++ b/llvm/test/CodeGen/AArch64/qmovn.ll
@@ -84,9 +84,9 @@ entry:
 define <2 x i32> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_smaxmin:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2147483647
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    mov x8, #-2147483648
+; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
 ; CHECK-NEXT:    cmgt v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    dup v1.2d, x8
@@ -106,9 +106,9 @@ entry:
 define <2 x i32> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_sminmax:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #-2147483648
+; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    mov w8, #2147483647
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
 ; CHECK-NEXT:    cmgt v2.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    dup v1.2d, x8
@@ -140,3 +140,210 @@ entry:
   %t = trunc <2 x i64> %s1 to <2 x i32>
   ret <2 x i32> %t
 }
+
+; Test the (concat_vectors (X), (trunc(smin(smax(Y, -2^n), 2^n-1))) pattern.
+
+define <16 x i8> @signed_minmax_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: signed_minmax_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtn2 v0.16b, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %y, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>)
+  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %min, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>)
+  %trunc = trunc <8 x i16> %max to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @signed_minmax_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: signed_minmax_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %y, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+  %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %min, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+  %trunc = trunc <4 x i32> %max to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @signed_minmax_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: signed_minmax_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
+; CHECK-NEXT:    cmgt v3.2d, v2.2d, v1.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    cmgt v3.2d, v1.2d, v2.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %min = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %y, <2 x i64> <i64 2147483647, i64 2147483647>)
+  %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %min, <2 x i64> <i64 -2147483648, i64 -2147483648>)
+  %trunc = trunc <2 x i64> %max to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+; Test the (concat_vectors (X), (trunc(smax(smin(Y, 2^n-1), -2^n))) pattern.
+
+define <16 x i8> @signed_maxmin_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: signed_maxmin_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtn2 v0.16b, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %y, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>)
+  %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %max, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>)
+  %trunc = trunc <8 x i16> %min to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @signed_maxmin_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: signed_maxmin_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+  %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %max, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+  %trunc = trunc <4 x i32> %min to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @signed_maxmin_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: signed_maxmin_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT:    cmgt v3.2d, v1.2d, v2.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    cmgt v3.2d, v2.2d, v1.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> <i64 -2147483648, i64 -2147483648>)
+  %min = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %max, <2 x i64> <i64 2147483647, i64 2147483647>)
+  %trunc = trunc <2 x i64> %min to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+; Test the (concat_vectors (X), (trunc(umin(Y, 2^n)))) pattern.
+
+define <16 x i8> @unsigned_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: unsigned_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    uqxtn2 v0.16b, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %min = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>)
+  %trunc = trunc <8 x i16> %min to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @unsigned_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: unsigned_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    uqxtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %min = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %trunc = trunc <4 x i32> %min to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @unsigned_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: unsigned_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    cmhi v2.2d, v2.2d, v1.2d
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orn v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %min = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %y, <2 x i64> <i64 4294967295, i64 4294967295>)
+  %trunc = trunc <2 x i64> %min to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+; Test the (concat_vectors (X), (trunc(umin(smax(Y, 0), 2^n))))) pattern.
+
+define <16 x i8> @us_maxmin_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: us_maxmin_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0xff00ff00ff00ff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smax v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    smin v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    xtn2 v0.16b, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %y, <8 x i16> zeroinitializer)
+  %min = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %max, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>)
+  %trunc = trunc <8 x i16> %min to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @us_maxmin_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: us_maxmin_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    xtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> zeroinitializer)
+  %min = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %max, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %trunc = trunc <4 x i32> %min to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @us_maxmin_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: us_maxmin_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmgt v2.2d, v1.2d, #0
+; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmgt v2.2d, v3.2d, v1.2d
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orn v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> zeroinitializer)
+  %min = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %max, <2 x i64> <i64 4294967295, i64 4294967295>)
+  %trunc = trunc <2 x i64> %min to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
index 7d23e870637a8..f7e95008b7123 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
@@ -15,7 +15,8 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [36 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vtanq_f64, ptr @armpl_vtanq_f32, ptr @armpl_svtan_f64_x, ptr @armpl_svtan_f32_x], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [60 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vtanq_f64, ptr @armpl_vtanq_f32, ptr @armpl_svtan_f64_x, ptr @armpl_svtan_f32_x, ptr @armpl_vacosq_f64, ptr @armpl_vacosq_f32, ptr @armpl_svacos_f64_x, ptr @armpl_svacos_f32_x, ptr @armpl_vasinq_f64, ptr @armpl_vasinq_f32, ptr @armpl_svasin_f64_x, ptr @armpl_svasin_f32_x, ptr @armpl_vatanq_f64, ptr @armpl_vatanq_f32, ptr @armpl_svatan_f64_x, ptr @armpl_svatan_f32_x, ptr @armpl_vcoshq_f64, ptr @armpl_vcoshq_f32, ptr @armpl_svcosh_f64_x, ptr @armpl_svcosh_f32_x, ptr @armpl_vsinhq_f64, ptr @armpl_vsinhq_f32, ptr @armpl_svsinh_f64_x, ptr @armpl_svsinh_f32_x, ptr @armpl_vtanhq_f64, ptr @armpl_vtanhq_f32, ptr @armpl_svtanh_f64_x, ptr @armpl_svtanh_f32_x], section "llvm.metadata"
+
 ;.
 define <2 x double> @llvm_cos_f64(<2 x double> %in) {
 ; CHECK-LABEL: define <2 x double> @llvm_cos_f64
@@ -469,6 +470,276 @@ define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) #0 {
   ret <vscale x 4 x float> %1
 }
 
+declare <2 x double> @llvm.acos.v2f64(<2 x double>)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.acos.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.acos.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_acos_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_acos_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vacosq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.acos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_acos_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_acos_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vacosq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_acos_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_acos_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.acos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_acos_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_acos_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.acos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <2 x double> @llvm.asin.v2f64(<2 x double>)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.asin.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.asin.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_asin_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_asin_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vasinq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.asin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_asin_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_asin_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.asin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_asin_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_asin_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.asin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_asin_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_asin_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.asin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <2 x double> @llvm.atan.v2f64(<2 x double>)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.atan.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.atan.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_atan_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_atan_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vatanq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_atan_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_atan_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_atan_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <2 x double> @llvm.cosh.v2f64(<2 x double>)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.cosh.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.cosh.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_cosh_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_cosh_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vcoshq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cosh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cosh_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_cosh_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vcoshq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cosh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cosh_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_cosh_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cosh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cosh_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_cosh_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cosh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <2 x double> @llvm.sinh.v2f64(<2 x double>)
+declare <4 x float> @llvm.sinh.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.sinh.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.sinh.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_sinh_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_sinh_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vsinhq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sinh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sinh_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_sinh_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinhq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sinh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sinh_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_sinh_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sinh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sinh_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_sinh_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sinh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <2 x double> @llvm.tanh.v2f64(<2 x double>)
+declare <4 x float> @llvm.tanh.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.tanh.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.tanh.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_tanh_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_tanh_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vtanhq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tanh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tanh_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_tanh_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanhq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tanh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tanh_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_tanh_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tanh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_tanh_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tanh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
 attributes #0 = { "target-features"="+sve" }
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
index 15d100a518c15..f3f27344ad80e 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [30 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxv_acos, ptr @_ZGVsMxv_acosf, ptr @_ZGVsMxv_asin, ptr @_ZGVsMxv_asinf, ptr @_ZGVsMxv_atan, ptr @_ZGVsMxv_atanf, ptr @_ZGVsMxv_cosh, ptr @_ZGVsMxv_coshf, ptr @_ZGVsMxv_sinh, ptr @_ZGVsMxv_sinhf, ptr @_ZGVsMxv_tanh, ptr @_ZGVsMxv_tanhf], section "llvm.metadata"
 ;.
 define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_ceil_vscale_f64(
@@ -384,6 +384,114 @@ define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) {
   ret <vscale x 4 x float> %1
 }
 
+define <vscale x 2 x double> @llvm_acos_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.acos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_acos_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.acos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_asin_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.asin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_asin_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.asin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cosh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cosh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cosh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cosh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sinh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sinh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sinh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sinh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tanh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tanh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tanh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
 
 define <vscale x 2 x double> @llvm_trunc_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_trunc_vscale_f64(
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
index a3da3b8120218..59c2f94e16763 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [30 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2v_acos, ptr @_ZGVnN4v_acosf, ptr @_ZGVnN2v_asin, ptr @_ZGVnN4v_asinf, ptr @_ZGVnN2v_atan, ptr @_ZGVnN4v_atanf, ptr @_ZGVnN2v_cosh, ptr @_ZGVnN4v_coshf, ptr @_ZGVnN2v_sinh, ptr @_ZGVnN4v_sinhf, ptr @_ZGVnN2v_tanh, ptr @_ZGVnN4v_tanhf], section "llvm.metadata"
 ;.
 define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
 ; CHECK-LABEL: @llvm_ceil_f64(
@@ -384,6 +384,114 @@ define <4 x float> @llvm_tan_f32(<4 x float> %in) {
   ret <4 x float> %1
 }
 
+define <2 x double> @llvm_acos_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_acos(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.acos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_acos_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_acosf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_asin_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_asin(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.asin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_asin_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_asinf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.asin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_atan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_atan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_atanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_cosh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cosh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cosh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cosh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_coshf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cosh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sinh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sinh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sinh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sinh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sinh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_tanh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tanh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tanh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tanh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tanh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
 define <2 x double> @llvm_trunc_f64(<2 x double> %in) {
 ; CHECK-LABEL: @llvm_trunc_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> [[IN:%.*]])
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 84179d3694a9d..fa0447c2c5d79 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -483,21 +483,18 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    adds x8, x0, x4
+; CHECK-NEXT:    adcs x9, x1, x5
+; CHECK-NEXT:    asr x10, x9, #63
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x10, x8, vs
+; CHECK-NEXT:    csel x1, x11, x9, vs
 ; CHECK-NEXT:    adds x8, x2, x6
 ; CHECK-NEXT:    adcs x9, x3, x7
 ; CHECK-NEXT:    asr x10, x9, #63
 ; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    csel x2, x10, x8, vs
 ; CHECK-NEXT:    csel x3, x11, x9, vs
-; CHECK-NEXT:    adds x8, x0, x4
-; CHECK-NEXT:    adcs x9, x1, x5
-; CHECK-NEXT:    asr x10, x9, #63
-; CHECK-NEXT:    csel x8, x10, x8, vs
-; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    csel x1, x11, x9, vs
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/scmp.ll b/llvm/test/CodeGen/AArch64/scmp.ll
index a7abc5eadaff6..bcad1c1a11aa7 100644
--- a/llvm/test/CodeGen/AArch64/scmp.ll
+++ b/llvm/test/CodeGen/AArch64/scmp.ll
@@ -1,26 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scmp.8.8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w1, sxtb
-; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scmp.8.8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    cmp w8, w1, sxtb
+; CHECK-SD-NEXT:    cset w8, gt
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scmp.8.8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w9, w1
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, gt
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
   ret i8 %1
 }
 
 define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scmp.8.16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w1, sxth
-; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scmp.8.16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    cmp w8, w1, sxth
+; CHECK-SD-NEXT:    cset w8, gt
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scmp.8.16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    sxth w9, w1
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, gt
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
   ret i8 %1
 }
@@ -48,15 +67,35 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
 }
 
 define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: scmp.8.128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x2, x0
-; CHECK-NEXT:    sbcs xzr, x3, x1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmp x0, x2
-; CHECK-NEXT:    sbcs xzr, x1, x3
-; CHECK-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scmp.8.128:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x2, x0
+; CHECK-SD-NEXT:    sbcs xzr, x3, x1
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    cmp x0, x2
+; CHECK-SD-NEXT:    sbcs xzr, x1, x3
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scmp.8.128:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    cset w8, gt
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w9, hi
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    csel w8, w9, w8, eq
+; CHECK-GI-NEXT:    tst w8, #0x1
+; CHECK-GI-NEXT:    cset w8, ne
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    cset w9, lt
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w10, lo
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    csel w9, w10, w9, eq
+; CHECK-GI-NEXT:    tst w9, #0x1
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, eq
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
   ret i8 %1
 }
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index aca9e58c1a1ba..d8b2762cf15e9 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -486,21 +486,18 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    subs x8, x0, x4
+; CHECK-NEXT:    sbcs x9, x1, x5
+; CHECK-NEXT:    asr x10, x9, #63
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x10, x8, vs
+; CHECK-NEXT:    csel x1, x11, x9, vs
 ; CHECK-NEXT:    subs x8, x2, x6
 ; CHECK-NEXT:    sbcs x9, x3, x7
 ; CHECK-NEXT:    asr x10, x9, #63
 ; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    csel x2, x10, x8, vs
 ; CHECK-NEXT:    csel x3, x11, x9, vs
-; CHECK-NEXT:    subs x8, x0, x4
-; CHECK-NEXT:    sbcs x9, x1, x5
-; CHECK-NEXT:    asr x10, x9, #63
-; CHECK-NEXT:    csel x8, x10, x8, vs
-; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    csel x1, x11, x9, vs
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 5bb012ae57503..573fe3d8b8a77 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -827,8 +827,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
   %res = uitofp <1 x i64> %op1 to <1 x double>
   ret <1 x double> %res
@@ -1752,8 +1751,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    scvtf d0, d0
 ; CHECK-NEXT:    ret
   %res = sitofp <1 x i64> %op1 to <1 x double>
   ret <1 x double> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index afe13851f0b95..0d7f230062650 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -24,7 +24,8 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z4.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z3.s
 ; CHECK-NEXT:    add z0.d, z2.d, z1.d
-; CHECK-NEXT:    bic p2.b, p1/z, p1.b, p2.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    and p2.b, p1/z, p1.b, p2.b
 ; CHECK-NEXT:    mov z0.d, p2/m, z2.d
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z2.d
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
diff --git a/llvm/test/CodeGen/AArch64/sve-i1-add-reduce.ll b/llvm/test/CodeGen/AArch64/sve-i1-add-reduce.ll
new file mode 100644
index 0000000000000..a748cf732e090
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-i1-add-reduce.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define i8 @uaddv_zexti8_nxv16i1(<vscale x 16 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti8_nxv16i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.b
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 16 x i1> %v to <vscale x 16 x i8>
+  %4 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %3)
+  ret i8 %4
+}
+
+define i8 @uaddv_zexti8_nxv8i1(<vscale x 8 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti8_nxv8i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 8 x i1> %v to <vscale x 8 x i8>
+  %4 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %3)
+  ret i8 %4
+}
+
+define i16 @uaddv_zexti16_nxv8i1(<vscale x 8 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti16_nxv8i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 8 x i1> %v to <vscale x 8 x i16>
+  %4 = tail call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %3)
+  ret i16 %4
+}
+
+define i8 @uaddv_zexti8_nxv4i1(<vscale x 4 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti8_nxv4i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 4 x i1> %v to <vscale x 4 x i8>
+  %4 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %3)
+  ret i8 %4
+}
+
+define i16 @uaddv_zexti16_nxv4i1(<vscale x 4 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti16_nxv4i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 4 x i1> %v to <vscale x 4 x i16>
+  %4 = tail call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %3)
+  ret i16 %4
+}
+
+define i32 @uaddv_zexti32_nxv4i1(<vscale x 4 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti32_nxv4i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 4 x i1> %v to <vscale x 4 x i32>
+  %4 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %3)
+  ret i32 %4
+}
+
+define i8 @uaddv_zexti8_nxv2i1(<vscale x 2 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti8_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 2 x i1> %v to <vscale x 2 x i8>
+  %4 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %3)
+  ret i8 %4
+}
+
+define i16 @uaddv_zexti16_nxv2i1(<vscale x 2 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti16_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 2 x i1> %v to <vscale x 2 x i16>
+  %4 = tail call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %3)
+  ret i16 %4
+}
+
+define i32 @uaddv_zexti32_nxv2i1(<vscale x 2 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti32_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 2 x i1> %v to <vscale x 2 x i32>
+  %4 = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %3)
+  ret i32 %4
+}
+
+define i64 @uaddv_zexti64_nxv2i1(<vscale x 2 x i1> %v) {
+; CHECK-LABEL: uaddv_zexti64_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    ret
+entry:
+  %3 = zext <vscale x 2 x i1> %v to <vscale x 2 x i64>
+  %4 = tail call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %3)
+  ret i64 %4
+}
+
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>)
+declare i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16>)
+declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>)
+declare i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>)
+declare i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
new file mode 100644
index 0000000000000..431c9dc76508f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -0,0 +1,604 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-16 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -8
+; CHECK-NEXT:    .cfi_offset b8, -16
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    str d0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ldr x29, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_fp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #16
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [x8, #28]
+; CHECK-NEXT:    sub x8, x29, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; In the presence of dynamic stack-realignment we emit correct offsets for
+; objects which are not realigned. For realigned objects, e.g. the i32 alloca
+; in this test, we emit the correct offset ignoring the re-alignment (i.e. the
+; offset if the alignment requirement is already satisfied).
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_dynamicrealign
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128-16 x vscale], Type: Variable, Align: 128, Size: 4
+
+define i32 @csr_d8_allocnxv4i32i32f64_dynamicrealign(double %d) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_dynamicrealign:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #96
+; CHECK-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #16
+; CHECK-NEXT:    addvl x9, x9, #-1
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffff80
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sub x8, x29, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp]
+; CHECK-NEXT:    stur d0, [x29, #-8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    sub sp, x29, #16
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32, align 128
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; In the presence of VLA-area objects, we emit correct offsets for all objects
+; except for these VLA objects.
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_vla
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0
+
+define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #8
+; CHECK-NEXT:    str x19, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 24
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset w29, -24
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    ubfiz x8, x0, #2, #32
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x8, x8, #15
+; CHECK-NEXT:    and x8, x8, #0x7fffffff0
+; CHECK-NEXT:    sub x9, x9, x8
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    sub x8, x10, x8
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    sub x8, x29, #8
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str wzr, [x9]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    str d0, [x19, #8]
+; CHECK-NEXT:    sub sp, x29, #8
+; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %0 = zext i32 %i to i64
+  %vla0 = alloca i32, i64 %0
+  %vla1 = alloca i32, i64 %0
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %vla0
+  store i32 zeroinitializer, ptr %vla1
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_stackargsi32f64
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+8], Type: Fixed, Align: 8, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Fixed, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-16 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @csr_d8_allocnxv4i32i32f64_stackargsi32f64(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_stackargsi32f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -8
+; CHECK-NEXT:    .cfi_offset b8, -16
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    str d0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ldr x29, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d0, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-32 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @svecc_z8_allocnxv4i32i32f64_fp(double %d, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_fp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x29, #-2, mul vl]
+; CHECK-NEXT:    str d0, [sp], #16
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> %v, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Fixed, Align: 16, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-32 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @svecc_z8_allocnxv4i32i32f64_stackargsi32_fp(double %d, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all"{
+; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x29, #-2, mul vl]
+; CHECK-NEXT:    str d0, [sp], #16
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> %v, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-258 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-260 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-262 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-264 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-266 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-268 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-270 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-272 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-274 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-276 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-278 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-280 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: svecc_call:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w27, -16
+; CHECK-NEXT:    .cfi_offset w28, -24
+; CHECK-NEXT:    .cfi_offset w30, -40
+; CHECK-NEXT:    .cfi_offset w29, -48
+; CHECK-NEXT:    addvl sp, sp, #-18
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG
+; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    tbz w19, #0, .LBB7_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB7_2: // %entry
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    mov w1, #45 // =0x2d
+; CHECK-NEXT:    mov w2, #37 // =0x25
+; CHECK-NEXT:    bl memset
+; CHECK-NEXT:    tbz w19, #0, .LBB7_4
+; CHECK-NEXT:  // %bb.3: // %entry
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB7_4: // %entry
+; CHECK-NEXT:    mov w0, #22647 // =0x5877
+; CHECK-NEXT:    movk w0, #59491, lsl #16
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #18
+; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w27
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
+  ret i32 -396142473
+}
+declare ptr @memset(ptr, i32, i32)
+
+; The VA register currently ends up in VLA space - in the presence of VLA-area
+; objects, we emit correct offsets for all objects except for these VLA objects.
+
+; CHECK-FRAMELAYOUT-LABEL: Function: vastate
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-72], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-80], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-88], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-96], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-104], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-112], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: Variable, Align: 16, Size: 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: VariableSized, Align: 16, Size: 0
+
+define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" {
+; CHECK-LABEL: vastate:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa w29, 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -40
+; CHECK-NEXT:    .cfi_offset w29, -48
+; CHECK-NEXT:    .cfi_offset b8, -56
+; CHECK-NEXT:    .cfi_offset b9, -64
+; CHECK-NEXT:    .cfi_offset b10, -72
+; CHECK-NEXT:    .cfi_offset b11, -80
+; CHECK-NEXT:    .cfi_offset b12, -88
+; CHECK-NEXT:    .cfi_offset b13, -96
+; CHECK-NEXT:    .cfi_offset b14, -104
+; CHECK-NEXT:    .cfi_offset b15, -112
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w20, w0
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh wzr, [x29, #-70]
+; CHECK-NEXT:    stur wzr, [x29, #-68]
+; CHECK-NEXT:    sturh w8, [x29, #-72]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl other
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #80
+; CHECK-NEXT:    cbnz x8, .LBB8_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB8_2: // %entry
+; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    .cfi_def_cfa wsp, 112
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w20
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+entry:
+  tail call void @other()
+  ret i32 %x
+}
+declare void @other()
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
new file mode 100644
index 0000000000000..293e538a7459d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+
+declare void @llvm.init.trampoline(ptr, ptr, ptr);
+declare ptr @llvm.adjust.trampoline(ptr);
+
+define i64 @f(ptr nest %c, i64 %x, i64 %y) {
+  %sum = add i64 %x, %y
+  ret i64 %sum
+}
+
+define i64 @main() {
+  %val = alloca i64
+  %nval = bitcast ptr %val to ptr
+  %tramp = alloca [36 x i8], align 8
+  ; CHECK:	bl	__trampoline_setup
+  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval)
+  %fp = call ptr @llvm.adjust.trampoline(ptr %tramp)
+  ret i64 0
+}
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 3c6c1f1618a95..afc0d8704ebac 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -477,17 +477,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    adds x8, x0, x4
+; CHECK-NEXT:    adcs x9, x1, x5
+; CHECK-NEXT:    csinv x0, x8, xzr, lo
+; CHECK-NEXT:    csinv x1, x9, xzr, lo
 ; CHECK-NEXT:    adds x8, x2, x6
 ; CHECK-NEXT:    adcs x9, x3, x7
 ; CHECK-NEXT:    csinv x2, x8, xzr, lo
 ; CHECK-NEXT:    csinv x3, x9, xzr, lo
-; CHECK-NEXT:    adds x8, x0, x4
-; CHECK-NEXT:    adcs x9, x1, x5
-; CHECK-NEXT:    csinv x8, x8, xzr, lo
-; CHECK-NEXT:    csinv x1, x9, xzr, lo
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll
index 351d440243b70..1a7f0be11cee6 100644
--- a/llvm/test/CodeGen/AArch64/ucmp.ll
+++ b/llvm/test/CodeGen/AArch64/ucmp.ll
@@ -1,26 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: ucmp.8.8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    cmp w8, w1, uxtb
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ucmp.8.8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    cmp w8, w1, uxtb
+; CHECK-SD-NEXT:    cset w8, hi
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ucmp.8.8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, hi
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
   ret i8 %1
 }
 
 define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: ucmp.8.16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    cmp w8, w1, uxth
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ucmp.8.16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    cmp w8, w1, uxth
+; CHECK-SD-NEXT:    cset w8, hi
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ucmp.8.16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    and w8, w0, #0xffff
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, hi
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
   ret i8 %1
 }
@@ -48,15 +67,35 @@ define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
 }
 
 define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: ucmp.8.128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x2, x0
-; CHECK-NEXT:    sbcs xzr, x3, x1
-; CHECK-NEXT:    cset w8, lo
-; CHECK-NEXT:    cmp x0, x2
-; CHECK-NEXT:    sbcs xzr, x1, x3
-; CHECK-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ucmp.8.128:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x2, x0
+; CHECK-SD-NEXT:    sbcs xzr, x3, x1
+; CHECK-SD-NEXT:    cset w8, lo
+; CHECK-SD-NEXT:    cmp x0, x2
+; CHECK-SD-NEXT:    sbcs xzr, x1, x3
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ucmp.8.128:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    cset w8, hi
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w9, hi
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    csel w8, w9, w8, eq
+; CHECK-GI-NEXT:    tst w8, #0x1
+; CHECK-GI-NEXT:    cset w8, ne
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w10, lo
+; CHECK-GI-NEXT:    cmp x1, x3
+; CHECK-GI-NEXT:    csel w9, w10, w9, eq
+; CHECK-GI-NEXT:    tst w9, #0x1
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, eq
+; CHECK-GI-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
   ret i8 %1
 }
@@ -95,18 +134,41 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
 }
 
 define <1 x i64> @ucmp.1.64.65(<1 x i65> %x, <1 x i65> %y) {
-; CHECK-LABEL: ucmp.1.64.65:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x1, #0x1
-; CHECK-NEXT:    and x9, x3, #0x1
-; CHECK-NEXT:    cmp x2, x0
-; CHECK-NEXT:    sbcs xzr, x9, x8
-; CHECK-NEXT:    cset x10, lo
-; CHECK-NEXT:    cmp x0, x2
-; CHECK-NEXT:    sbcs xzr, x8, x9
-; CHECK-NEXT:    csinv x8, x10, xzr, hs
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ucmp.1.64.65:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and x8, x1, #0x1
+; CHECK-SD-NEXT:    and x9, x3, #0x1
+; CHECK-SD-NEXT:    cmp x2, x0
+; CHECK-SD-NEXT:    sbcs xzr, x9, x8
+; CHECK-SD-NEXT:    cset x10, lo
+; CHECK-SD-NEXT:    cmp x0, x2
+; CHECK-SD-NEXT:    sbcs xzr, x8, x9
+; CHECK-SD-NEXT:    csinv x8, x10, xzr, hs
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ucmp.1.64.65:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    and x8, x1, #0x1
+; CHECK-GI-NEXT:    and x9, x3, #0x1
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    cset w10, hi
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w11, hi
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    csel w10, w11, w10, eq
+; CHECK-GI-NEXT:    tst w10, #0x1
+; CHECK-GI-NEXT:    cset x10, ne
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    cset w11, lo
+; CHECK-GI-NEXT:    cmp x0, x2
+; CHECK-GI-NEXT:    cset w12, lo
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    csel w8, w12, w11, eq
+; CHECK-GI-NEXT:    tst w8, #0x1
+; CHECK-GI-NEXT:    csinv x8, x10, xzr, eq
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
   %1 = call <1 x i64> @llvm.ucmp(<1 x i65> %x, <1 x i65> %y)
   ret <1 x i64> %1
 }
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 363c12e12cb8b..dfcbe96ea948a 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -475,17 +475,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    subs x8, x0, x4
+; CHECK-NEXT:    sbcs x9, x1, x5
+; CHECK-NEXT:    csel x0, xzr, x8, lo
+; CHECK-NEXT:    csel x1, xzr, x9, lo
 ; CHECK-NEXT:    subs x8, x2, x6
 ; CHECK-NEXT:    sbcs x9, x3, x7
 ; CHECK-NEXT:    csel x2, xzr, x8, lo
 ; CHECK-NEXT:    csel x3, xzr, x9, lo
-; CHECK-NEXT:    subs x8, x0, x4
-; CHECK-NEXT:    sbcs x9, x1, x5
-; CHECK-NEXT:    csel x8, xzr, x8, lo
-; CHECK-NEXT:    csel x1, xzr, x9, lo
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/vec-libcalls.ll b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
index 9bbac5d69c28c..43c1839818173 100644
--- a/llvm/test/CodeGen/AArch64/vec-libcalls.ll
+++ b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
@@ -21,6 +21,12 @@ declare <3 x float> @llvm.fabs.v3f32(<3 x float>)
 declare <3 x float> @llvm.ceil.v3f32(<3 x float>)
 declare <3 x float> @llvm.cos.v3f32(<3 x float>)
 declare <3 x float> @llvm.tan.v3f32(<3 x float>)
+declare <3 x float> @llvm.asin.v3f32(<3 x float>)
+declare <3 x float> @llvm.acos.v3f32(<3 x float>)
+declare <3 x float> @llvm.atan.v3f32(<3 x float>)
+declare <3 x float> @llvm.sinh.v3f32(<3 x float>)
+declare <3 x float> @llvm.cosh.v3f32(<3 x float>)
+declare <3 x float> @llvm.tanh.v3f32(<3 x float>)
 declare <3 x float> @llvm.exp.v3f32(<3 x float>)
 declare <3 x float> @llvm.exp2.v3f32(<3 x float>)
 declare <3 x float> @llvm.floor.v3f32(<3 x float>)
@@ -329,6 +335,191 @@ define <3 x float> @tan_v3f32(<3 x float> %x) nounwind {
   ret <3 x float> %r
 }
 
+define <3 x float> @asin_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: asin_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl asinf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl asinf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl asinf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.asin.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <3 x float> @acos_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: acos_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl acosf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl acosf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl acosf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.acos.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <3 x float> @atan_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: atan_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl atanf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl atanf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl atanf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.atan.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <3 x float> @sinh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl sinhf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl sinhf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl sinhf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.sinh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+define <3 x float> @cosh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl coshf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl coshf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl coshf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.cosh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <3 x float> @tanh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl tanhf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl tanhf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl tanhf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.tanh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
 define <3 x float> @exp_v3f32(<3 x float> %x) nounwind {
 ; CHECK-LABEL: exp_v3f32:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ec832ed0f7f3a..63f5464371cc6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1845,39 +1845,37 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT:    s_ashr_i32 s8, s5, 31
+; GCN-NEXT:    s_ashr_i32 s7, s5, 31
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[8:9]
+; GCN-NEXT:    s_cselect_b32 s2, s6, s7
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i65:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GFX10PLUS-NEXT:    s_sub_i32 s12, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s8, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s2, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s14, 1, 0
-; GFX10PLUS-NEXT:    s_ashr_i64 s[6:7], s[4:5], s3
-; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10PLUS-NEXT:    s_ashr_i32 s10, s5, 31
-; GFX10PLUS-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10PLUS-NEXT:    s_ashr_i64 s[4:5], s[4:5], s12
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s11, s10
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[10:11]
+; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
+; GFX10PLUS-NEXT:    s_ashr_i64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT:    s_ashr_i32 s3, s5, 31
+; GFX10PLUS-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index c701e873fdd2c..61d2c854dffa5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -318,29 +318,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -371,55 +353,21 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -500,27 +448,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -534,28 +465,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -585,53 +499,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -709,26 +591,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -770,27 +635,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -824,54 +672,19 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -929,34 +742,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v10, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v8
-; GFX7-NEXT:    v_mov_b32_e32 v1, v9
-; GFX7-NEXT:    v_mov_b32_e32 v2, v10
-; GFX7-NEXT:    v_mov_b32_e32 v3, v11
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -997,25 +790,10 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v11, v5
-; GFX7-NEXT:    v_mov_b32_e32 v10, v4
-; GFX7-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v9
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v0, v3
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v1
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v1, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, v4
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v4
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f32_e32 v3, v0, v0
-; GFX10-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v0
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    v_mov_b32_e32 v2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, v1
-; GFX940-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX940-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v10, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX10-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX90A-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v10, v1
-; GFX7-NEXT:    v_mov_b32_e32 v9, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX7-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v7
-; GFX7-NEXT:    v_mov_b32_e32 v1, v8
-; GFX7-NEXT:    v_mov_b32_e32 v2, v9
-; GFX7-NEXT:    v_mov_b32_e32 v3, v10
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v7
-; GFX10-NEXT:    v_mov_b32_e32 v3, v8
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v10, v3
-; GFX7-NEXT:    v_mov_b32_e32 v9, v2
-; GFX7-NEXT:    v_mov_b32_e32 v8, v1
-; GFX7-NEXT:    v_mov_b32_e32 v7, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v8
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 90110e6e0c09e..83be67a9138f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -318,29 +318,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -371,55 +353,21 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -500,27 +448,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -534,28 +465,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -585,53 +499,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -709,26 +591,9 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -770,27 +635,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -824,54 +672,19 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -929,34 +742,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v10, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v8
-; GFX7-NEXT:    v_mov_b32_e32 v1, v9
-; GFX7-NEXT:    v_mov_b32_e32 v2, v10
-; GFX7-NEXT:    v_mov_b32_e32 v3, v11
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -997,25 +790,10 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v11, v5
-; GFX7-NEXT:    v_mov_b32_e32 v10, v4
-; GFX7-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v9
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v4, v0, v3
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v1
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v1, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, v4
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v4
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f32_e32 v3, v0, v0
-; GFX10-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v0
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    v_mov_b32_e32 v2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, v1
-; GFX940-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX940-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v10, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX10-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX90A-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v10, v1
-; GFX7-NEXT:    v_mov_b32_e32 v9, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX7-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v7
-; GFX7-NEXT:    v_mov_b32_e32 v1, v8
-; GFX7-NEXT:    v_mov_b32_e32 v2, v9
-; GFX7-NEXT:    v_mov_b32_e32 v3, v10
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v7
-; GFX10-NEXT:    v_mov_b32_e32 v3, v8
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v10, v3
-; GFX7-NEXT:    v_mov_b32_e32 v9, v2
-; GFX7-NEXT:    v_mov_b32_e32 v8, v1
-; GFX7-NEXT:    v_mov_b32_e32 v7, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v8
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
index 48916d8d9b2c5..84378bcb70684 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
@@ -10,9 +10,11 @@ define amdgpu_kernel void @stack_write_fi() {
 ; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b32 s5, 0
+; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s5
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index d4d5cb18bbd30..966a481b6594d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -226,15 +226,16 @@ exit:
 define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
 ; GFX10-LABEL: single_lane_execution_attribute:
 ; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_mov_b32 s6, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_mov_b32 s12, 0
-; GFX10-NEXT:    s_mov_b32 s13, -1
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
-; GFX10-NEXT:    s_mov_b32 s3, s12
+; GFX10-NEXT:    s_mov_b32 s7, -1
+; GFX10-NEXT:    s_mov_b32 s2, s1
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10-NEXT:    s_or_b64 s[12:13], s[4:5], s[0:1]
+; GFX10-NEXT:    s_mov_b32 s3, -1
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[12:13], 0x0
 ; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v1, -1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
@@ -248,8 +249,8 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_4
 ; GFX10-NEXT:  ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT:    v_mov_b32_e32 v3, s12
-; GFX10-NEXT:    v_mov_b32_e32 v4, s12
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:  .LBB4_2: ; %.preheader
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    buffer_load_dword v5, v3, s[4:7], 0 offen
@@ -261,17 +262,17 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_2
 ; GFX10-NEXT:  ; %bb.3: ; %.preheader._crit_edge
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT:    s_mov_b32 s13, 0
-; GFX10-NEXT:    s_or_b32 s2, s0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    s_or_b32 s1, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
 ; GFX10-NEXT:  .LBB4_4: ; %Flow
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s13
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s3
 ; GFX10-NEXT:    s_cbranch_vccz .LBB4_6
 ; GFX10-NEXT:  ; %bb.5: ; %.19
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_or_b32_e32 v3, 2, v1
 ; GFX10-NEXT:  .LBB4_6: ; %.22
-; GFX10-NEXT:    v_add_lshl_u32 v0, v0, s1, 2
+; GFX10-NEXT:    v_add_lshl_u32 v0, v0, s2, 2
 ; GFX10-NEXT:    buffer_store_dword v3, v0, s[8:11], 0 offen
 ; GFX10-NEXT:    s_endpgm
 .entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 5515de0cd2fee..f52b7c635a66f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -193,12 +193,12 @@ bb12:
 define amdgpu_kernel void @break_loop(i32 %arg) {
 ; CHECK-LABEL: break_loop:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
+; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; CHECK-NEXT:    ; implicit-def: $vgpr1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT:    v_subrev_u32_e32 v0, s0, v0
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_branch .LBB5_3
 ; CHECK-NEXT:  .LBB5_1: ; %bb4
 ; CHECK-NEXT:    ; in Loop: Header=BB5_3 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b4b95fdab4ab2..4fdb4082346af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -10,11 +10,13 @@ define amdgpu_ps void @amdgpu_ps() {
 ; MESA-LABEL: amdgpu_ps:
 ; MESA:       ; %bb.0:
 ; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT:    v_mov_b32_e32 v0, 0
-; MESA-NEXT:    v_mov_b32_e32 v1, s1
+; MESA-NEXT:    s_mov_b32 s0, 0
+; MESA-NEXT:    s_mov_b64 s[2:3], src_private_base
+; MESA-NEXT:    s_mov_b32 s1, s3
+; MESA-NEXT:    v_mov_b32_e32 v0, s0
 ; MESA-NEXT:    v_mov_b32_e32 v2, 0
+; MESA-NEXT:    v_mov_b32_e32 v1, s1
 ; MESA-NEXT:    flat_store_dword v[0:1], v2
 ; MESA-NEXT:    s_waitcnt vmcnt(0)
 ; MESA-NEXT:    s_endpgm
@@ -24,13 +26,15 @@ define amdgpu_ps void @amdgpu_ps() {
 ; PAL-NEXT:    s_getpc_b64 s[2:3]
 ; PAL-NEXT:    s_mov_b32 s2, s0
 ; PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; PAL-NEXT:    v_mov_b32_e32 v0, 0
 ; PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT:    s_mov_b32 s0, 0
+; PAL-NEXT:    s_mov_b64 s[2:3], src_private_base
+; PAL-NEXT:    s_mov_b32 s1, s3
+; PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; PAL-NEXT:    flat_store_dword v[0:1], v2
 ; PAL-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a5e4151bf3695..f4fd803c8dda8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s0, 0
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_add_i32 s0, s0, 0
+; GFX11-NEXT:    s_add_i32 s1, s1, 0
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 0
+; GFX12-NEXT:    s_add_co_i32 s1, s1, 0
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:256 sc0 sc1
+; GFX940-NEXT:    s_addk_i32 s0, 0x100
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    scratch_load_b32 v2, off, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_addk_i32 s0, 0x100
+; GFX11-NEXT:    s_addk_i32 s1, 0x100
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:256 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x100
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x100
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    s_movk_i32 s0, 0x4004
-; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
+; GFX940-NEXT:    s_addk_i32 s0, 0x4004
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_addk_i32 s0, 0x4004
+; GFX11-NEXT:    s_addk_i32 s1, 0x4004
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_movk_i32 s0, 0x4004
-; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x4000
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
+; GFX940-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 15
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:4 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:4 dlc
+; GFX11-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
+; GFX940-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX940-NEXT:    s_add_i32 s1, s32, 4
 ; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 15
+; GFX940-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-NEXT:    scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX11-NEXT:    s_add_i32 s1, s32, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:4 dlc
+; GFX11-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
index 34635b077cd92..e19e7823053c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -8,9 +8,9 @@ define double @v_floor_f64_ieee(double %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -31,9 +31,9 @@ define double @v_floor_f64_ieee_nnan(double %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -51,9 +51,9 @@ define double @v_floor_f64_ieee_fneg(double %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -75,9 +75,9 @@ define double @v_floor_f64_nonieee(double %x) #1 {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -98,9 +98,9 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -118,9 +118,9 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -142,9 +142,9 @@ define double @v_floor_f64_fabs(double %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], |v[0:1]|
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -166,9 +166,9 @@ define double @v_floor_f64_fneg_fabs(double %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -|v[0:1]|
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, -1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -190,9 +190,9 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e32 v[0:1], s[2:3]
-; GFX6-NEXT:    s_mov_b32 s0, -1
-; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
@@ -214,9 +214,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -s[2:3]
-; GFX6-NEXT:    s_mov_b32 s0, -1
-; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
@@ -239,9 +239,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], |s[2:3]|
-; GFX6-NEXT:    s_mov_b32 s0, -1
-; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
@@ -264,9 +264,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -|s[2:3]|
-; GFX6-NEXT:    s_mov_b32 s0, -1
-; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 453b229bf62bd..7525e00e6f401 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB39_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB40_2:
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB41_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB42_2:
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB49_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB49_2:
@@ -1761,19 +1761,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1842,19 +1842,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1884,19 +1884,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index b54aec935bd5f..3c57832a25e32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -434,8 +434,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_ITERATIVE-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
   ; GFX90A_ITERATIVE-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-  ; GFX90A_ITERATIVE-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+  ; GFX90A_ITERATIVE-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
   ; GFX90A_ITERATIVE-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -446,8 +446,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_ITERATIVE-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-  ; GFX90A_ITERATIVE-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX90A_ITERATIVE-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   $vcc = COPY [[V_CMP_NE_U64_e64_]]
   ; GFX90A_ITERATIVE-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -615,8 +615,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX940_ITERATIVE-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
   ; GFX940_ITERATIVE-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX940_ITERATIVE-NEXT:   [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
-  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-  ; GFX940_ITERATIVE-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+  ; GFX940_ITERATIVE-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
   ; GFX940_ITERATIVE-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
   ; GFX940_ITERATIVE-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -627,8 +627,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX940_ITERATIVE-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-  ; GFX940_ITERATIVE-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX940_ITERATIVE-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
   ; GFX940_ITERATIVE-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   $vcc = COPY [[V_CMP_NE_U64_e64_]]
   ; GFX940_ITERATIVE-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -837,8 +837,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
   ; GFX90A_ITERATIVE-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
   ; GFX90A_ITERATIVE-NEXT:   [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-  ; GFX90A_ITERATIVE-NEXT:   [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+  ; GFX90A_ITERATIVE-NEXT:   [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
   ; GFX90A_ITERATIVE-NEXT:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -849,8 +849,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
   ; GFX90A_ITERATIVE-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-  ; GFX90A_ITERATIVE-NEXT:   [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+  ; GFX90A_ITERATIVE-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX90A_ITERATIVE-NEXT:   [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
   ; GFX90A_ITERATIVE-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec
   ; GFX90A_ITERATIVE-NEXT:   $vcc = COPY [[V_CMP_NE_U64_e64_]]
   ; GFX90A_ITERATIVE-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -1089,8 +1089,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
   ; GFX940_ITERATIVE-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
   ; GFX940_ITERATIVE-NEXT:   [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX940_ITERATIVE-NEXT:   [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec
-  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-  ; GFX940_ITERATIVE-NEXT:   [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+  ; GFX940_ITERATIVE-NEXT:   [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
   ; GFX940_ITERATIVE-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
   ; GFX940_ITERATIVE-NEXT:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -1101,8 +1101,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
   ; GFX940_ITERATIVE-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-  ; GFX940_ITERATIVE-NEXT:   [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+  ; GFX940_ITERATIVE-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX940_ITERATIVE-NEXT:   [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
   ; GFX940_ITERATIVE-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec
   ; GFX940_ITERATIVE-NEXT:   $vcc = COPY [[V_CMP_NE_U64_e64_]]
   ; GFX940_ITERATIVE-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index db944b98a3013..4fcde0f2fc7cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -11,12 +11,11 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
 ; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[52:67], s[22:23], 0x40
 ; GCN-NEXT:    s_load_dwordx16 s[4:19], s[22:23], 0x80
-; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NEXT:    v_mov_b32_e32 v1, s37
@@ -143,16 +142,17 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:240
 ; GCN-NEXT:    v_mov_b32_e32 v0, s49
+; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:244
 ; GCN-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NEXT:    s_and_b32 s4, s25, 63
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:248
 ; GCN-NEXT:    v_mov_b32_e32 v0, s51
-; GCN-NEXT:    s_lshl_b32 s4, s4, 2
+; GCN-NEXT:    s_add_u32 s4, 0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:252
-; GCN-NEXT:    v_add_u32_e32 v0, s4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s24
-; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 6cb1a12464cda..9a1e1ed9208c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
 
 ---
 name: fcmp_false_f16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
index 7179b9f2cac32..cbf82daca0d2a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
index 390541ae76849..2368ea38e2d2b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
@@ -142,13 +142,9 @@ body: |
     ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
     ;
     ; WAVE32-LABEL: name: constant_v_s64
     ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -157,13 +153,9 @@ body: |
     ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
     %0:vgpr(s64) = G_CONSTANT i64 0
     %1:vgpr(s64) = G_CONSTANT i64 1
     %2:vgpr(s64) = G_CONSTANT i64 -1
@@ -184,34 +176,26 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_s_s64
-    ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
-    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
-    ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
-    ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+    ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; WAVE64-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
+    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
     ;
     ; WAVE32-LABEL: name: constant_s_s64
-    ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
-    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
-    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
-    ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
-    ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+    ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; WAVE32-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
+    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
     %0:sgpr(s64) = G_CONSTANT i64 0
     %1:sgpr(s64) = G_CONSTANT i64 1
     %2:sgpr(s64) = G_CONSTANT i64 -1
@@ -310,6 +294,195 @@ body: |
     S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
 ...
 
+---
+name:            constant_s_p2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; WAVE64-LABEL: name: constant_s_p2
+    ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_s_p2
+    ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    %0:sgpr(p2) = G_CONSTANT i32 0
+    %1:sgpr(p2) = G_CONSTANT i32 1
+    %2:sgpr(p2) = G_CONSTANT i32 -1
+    %3:sgpr(p2) = G_CONSTANT i32 -54
+    %4:sgpr(p2) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            constant_v_p2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; WAVE64-LABEL: name: constant_v_p2
+    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_v_p2
+    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    %0:vgpr(p2) = G_CONSTANT i32 0
+    %1:vgpr(p2) = G_CONSTANT i32 1
+    %2:vgpr(p2) = G_CONSTANT i32 -1
+    %3:vgpr(p2) = G_CONSTANT i32 -54
+    %4:vgpr(p2) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            constant_s_p5
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; WAVE64-LABEL: name: constant_s_p5
+    ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_s_p5
+    ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    %0:sgpr(p5) = G_CONSTANT i32 0
+    %1:sgpr(p5) = G_CONSTANT i32 1
+    %2:sgpr(p5) = G_CONSTANT i32 -1
+    %3:sgpr(p5) = G_CONSTANT i32 -54
+    %4:sgpr(p5) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            constant_v_p5
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; WAVE64-LABEL: name: constant_v_p5
+    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_v_p5
+    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    %0:vgpr(p5) = G_CONSTANT i32 0
+    %1:vgpr(p5) = G_CONSTANT i32 1
+    %2:vgpr(p5) = G_CONSTANT i32 -1
+    %3:vgpr(p5) = G_CONSTANT i32 -54
+    %4:vgpr(p5) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            constant_s_p6
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; WAVE64-LABEL: name: constant_s_p6
+    ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_s_p6
+    ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+    %0:sgpr(p6) = G_CONSTANT i32 0
+    %1:sgpr(p6) = G_CONSTANT i32 1
+    %2:sgpr(p6) = G_CONSTANT i32 -1
+    %3:sgpr(p6) = G_CONSTANT i32 -54
+    %4:sgpr(p6) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            constant_v_p6
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; WAVE64-LABEL: name: constant_v_p6
+    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    ;
+    ; WAVE32-LABEL: name: constant_v_p6
+    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+    %0:vgpr(p6) = G_CONSTANT i32 0
+    %1:vgpr(p6) = G_CONSTANT i32 1
+    %2:vgpr(p6) = G_CONSTANT i32 -1
+    %3:vgpr(p6) = G_CONSTANT i32 -54
+    %4:vgpr(p6) = G_CONSTANT i32 27
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
 ---
 name:            constant_s_p1
 legalized:       true
@@ -325,13 +498,9 @@ body: |
     ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
     ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
     ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
     ;
     ; WAVE32-LABEL: name: constant_s_p1
     ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
@@ -340,13 +509,9 @@ body: |
     ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
     ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
     ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
     %0:sgpr(p1) = G_CONSTANT i64 0
     %1:sgpr(p1) = G_CONSTANT i64 1
     %2:sgpr(p1) = G_CONSTANT i64 -1
@@ -373,13 +538,9 @@ body: |
     ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
     ;
     ; WAVE32-LABEL: name: constant_v_p1
     ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -388,13 +549,9 @@ body: |
     ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
     %0:vgpr(p1) = G_CONSTANT i64 0
     %1:vgpr(p1) = G_CONSTANT i64 1
     %2:vgpr(p1) = G_CONSTANT i64 -1
@@ -407,98 +564,161 @@ body: |
 ...
 
 ---
-name:            constant_s_p999
+name:            constant_s_p0
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 
 body: |
   bb.0:
-    ; WAVE64-LABEL: name: constant_s_p999
+    ; WAVE64-LABEL: name: constant_s_p0
     ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
     ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
     ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
     ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
     ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
     ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
     ;
-    ; WAVE32-LABEL: name: constant_s_p999
+    ; WAVE32-LABEL: name: constant_s_p0
     ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
     ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
     ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
     ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
     ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
     ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
-    %0:sgpr(p999) = G_CONSTANT i64 0
-    %1:sgpr(p999) = G_CONSTANT i64 1
-    %2:sgpr(p999) = G_CONSTANT i64 -1
-    %3:sgpr(p999) = G_CONSTANT i64 -54
-    %4:sgpr(p999) = G_CONSTANT i64 27
-    %5:sgpr(p999) = G_CONSTANT i64 4294967295
-    %6:sgpr(p999) = G_CONSTANT i64 4294967296
-    %7:sgpr(p999) = G_CONSTANT i64 18446744004990098135
+    ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+    %0:sgpr(p0) = G_CONSTANT i64 0
+    %1:sgpr(p0) = G_CONSTANT i64 1
+    %2:sgpr(p0) = G_CONSTANT i64 -1
+    %3:sgpr(p0) = G_CONSTANT i64 -54
+    %4:sgpr(p0) = G_CONSTANT i64 27
+    %5:sgpr(p0) = G_CONSTANT i64 4294967295
+    %6:sgpr(p0) = G_CONSTANT i64 4294967296
+    %7:sgpr(p0) = G_CONSTANT i64 18446744004990098135
     S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
 ...
 
 ---
-name:            constant_v_p999
+name:            constant_v_p0
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 
 body: |
   bb.0:
-    ; WAVE64-LABEL: name: constant_v_p999
+    ; WAVE64-LABEL: name: constant_v_p0
     ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+    ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
     ;
-    ; WAVE32-LABEL: name: constant_v_p999
+    ; WAVE32-LABEL: name: constant_v_p0
     ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
     ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
-    %0:vgpr(p999) = G_CONSTANT i64 0
-    %1:vgpr(p999) = G_CONSTANT i64 1
-    %2:vgpr(p999) = G_CONSTANT i64 -1
-    %3:vgpr(p999) = G_CONSTANT i64 -54
-    %4:vgpr(p999) = G_CONSTANT i64 27
-    %5:vgpr(p999) = G_CONSTANT i64 4294967295
-    %6:vgpr(p999) = G_CONSTANT i64 4294967296
-    %7:vgpr(p999) = G_CONSTANT i64 18446744004990098135
+    ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+    %0:vgpr(p0) = G_CONSTANT i64 0
+    %1:vgpr(p0) = G_CONSTANT i64 1
+    %2:vgpr(p0) = G_CONSTANT i64 -1
+    %3:vgpr(p0) = G_CONSTANT i64 -54
+    %4:vgpr(p0) = G_CONSTANT i64 27
+    %5:vgpr(p0) = G_CONSTANT i64 4294967295
+    %6:vgpr(p0) = G_CONSTANT i64 4294967296
+    %7:vgpr(p0) = G_CONSTANT i64 18446744004990098135
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+---
+name:            constant_s_p4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; WAVE64-LABEL: name: constant_s_p4
+    ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+    ;
+    ; WAVE32-LABEL: name: constant_s_p4
+    ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+    ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+    %0:sgpr(p4) = G_CONSTANT i64 0
+    %1:sgpr(p4) = G_CONSTANT i64 1
+    %2:sgpr(p4) = G_CONSTANT i64 -1
+    %3:sgpr(p4) = G_CONSTANT i64 -54
+    %4:sgpr(p4) = G_CONSTANT i64 27
+    %5:sgpr(p4) = G_CONSTANT i64 4294967295
+    %6:sgpr(p4) = G_CONSTANT i64 4294967296
+    %7:sgpr(p4) = G_CONSTANT i64 18446744004990098135
+    S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+
+---
+name:            constant_v_p4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; WAVE64-LABEL: name: constant_v_p4
+    ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+    ;
+    ; WAVE32-LABEL: name: constant_v_p4
+    ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+    %0:vgpr(p4) = G_CONSTANT i64 0
+    %1:vgpr(p4) = G_CONSTANT i64 1
+    %2:vgpr(p4) = G_CONSTANT i64 -1
+    %3:vgpr(p4) = G_CONSTANT i64 -54
+    %4:vgpr(p4) = G_CONSTANT i64 27
+    %5:vgpr(p4) = G_CONSTANT i64 4294967295
+    %6:vgpr(p4) = G_CONSTANT i64 4294967296
+    %7:vgpr(p4) = G_CONSTANT i64 18446744004990098135
     S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index 6047bda19753a..3428230606080 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
index 942a0a3bd5ff7..13e29f15504be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
@@ -87,13 +87,13 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GCN-LABEL: name: fconstant_s_s64
-    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4607182418800017408
-    ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896
-    ; GCN-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4611686018427387904
-    ; GCN-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288
-    ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B]]
-    ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B1]]
-    ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408
+    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896
+    ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904
+    ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288
+    ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]]
+    ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B]]
+    ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B1]]
     %0:sgpr(s64) = G_FCONSTANT double 1.0
     %1:sgpr(s64) = G_FCONSTANT double 8.0
     %2:sgpr(s64) = G_FCONSTANT double -2.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 5d4816812e6c0..6a1e52cd29fd9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
 
 ---
 name:            fract_f64_neg
@@ -12,23 +12,41 @@ body: |
   bb.1:
     liveins: $sgpr0_sgpr1
 
-    ; CHECK-LABEL: name: fract_f64_neg
-    ; CHECK: liveins: $sgpr0_sgpr1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; GFX10-LABEL: name: fract_f64_neg
+    ; GFX10: liveins: $sgpr0_sgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; GFX10-NEXT: S_ENDPGM 0
+    ;
+    ; GFX11-LABEL: name: fract_f64_neg
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; GFX11-NEXT: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
     %8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
@@ -60,23 +78,41 @@ body: |
   bb.1:
     liveins: $sgpr0_sgpr1
 
-    ; CHECK-LABEL: name: fract_f64_neg_abs
-    ; CHECK: liveins: $sgpr0_sgpr1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; GFX10-LABEL: name: fract_f64_neg_abs
+    ; GFX10: liveins: $sgpr0_sgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; GFX10-NEXT: S_ENDPGM 0
+    ;
+    ; GFX11-LABEL: name: fract_f64_neg_abs
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; GFX11-NEXT: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
     %8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index 504f7697a0fcc..ada80da490fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -3,7 +3,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
 
 ---
 
@@ -44,6 +44,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -89,6 +96,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s16_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -133,6 +147,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -174,8 +195,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s32_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -217,8 +245,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s16_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -261,8 +296,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s32_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x  s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -307,6 +349,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -349,8 +398,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_s64_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -393,8 +449,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -439,6 +502,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
+    ; GFX11-LABEL: name: load_constant_v2p1
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -483,6 +553,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
+    ;
+    ; GFX11-LABEL: name: load_constant_s128_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -527,6 +604,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_p3_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -571,6 +655,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_p4_from_8
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p4) = G_LOAD %0 :: (load (p4), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -615,6 +706,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
+    ;
+    ; GFX11-LABEL: name: load_constant_p999_from_8
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -659,6 +757,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX11-LABEL: name: load_constant_v2p3
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -703,6 +808,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -747,6 +859,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -789,8 +908,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -833,8 +959,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -877,8 +1010,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v16s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -921,8 +1061,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -971,6 +1118,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1020
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1020
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1018,6 +1172,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1024
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1024
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1067,6 +1228,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048575
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048575
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1116,6 +1285,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048576
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048576
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1166,6 +1343,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1073741823
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1073741823
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1189,11 +1374,11 @@ body: |
     ; GFX6: liveins: $sgpr0_sgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
     ; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1204,11 +1389,11 @@ body: |
     ; GFX7: liveins: $sgpr0_sgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
     ; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1219,11 +1404,11 @@ body: |
     ; GFX8: liveins: $sgpr0_sgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
     ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1234,16 +1419,31 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_1
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+    ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1322,6 +1522,21 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_524288
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -524288
     %2:sgpr(p4) = G_PTR_ADD %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 85ac53387f4d9..cf4e6c8b85e3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -741,17 +741,15 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -759,17 +757,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -777,17 +773,15 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -795,17 +789,15 @@ body: |
     ; GFX12: liveins: $sgpr0_sgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -831,35 +823,31 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
-    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -867,17 +855,15 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
-    ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -885,17 +871,15 @@ body: |
     ; GFX12: liveins: $sgpr0_sgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
-    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -921,17 +905,15 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -939,17 +921,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -957,17 +937,15 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -975,17 +953,15 @@ body: |
     ; GFX12: liveins: $sgpr0_sgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -1010,17 +986,15 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -1028,17 +1002,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -1046,17 +1018,15 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -1064,17 +1034,15 @@ body: |
     ; GFX12: liveins: $sgpr0_sgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 4309f4863bf91..6f971788727b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -45,9 +45,7 @@ regBankSelected: true
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
 
 # Max immediate for CI
-# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
-# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 3
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# SIVI: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 17179869180
 # SIVI-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
 # SIVI-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
 # SIVI-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
@@ -59,9 +57,7 @@ regBankSelected: true
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
 
 # Immediate overflow for CI
-# GCN: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-# GCN: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# GCN: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 17179869184
 # GCN-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
 # GCN-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
 # GCN-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
@@ -77,9 +73,7 @@ regBankSelected: true
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
 
 # Overflow 32-bit byte offset
-# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# SIVI: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
 # SIVI-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
 # SIVI-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
 # SIVI-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
index 5ade8ae6de1bf..b9b024519f61e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
index de29037674ce6..41e416e3f5d72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
@@ -2,10 +2,10 @@
 # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE64 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE64 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE32 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE32 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX10-WAVE32 %s
 
 ---
 name:  gep_p0_sgpr_sgpr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
index 2cfbb68b8affb..2a3d97d603b13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
@@ -314,8 +314,8 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
-    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc
+    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B64_]], implicit-def dead $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
     %0:sgpr(p0) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 0
@@ -337,10 +337,8 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4042322160
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -252645136
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[REG_SEQUENCE]], implicit-def dead $scc
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1085102592571150096
+    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
     %0:sgpr(p0) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1085102592571150096
@@ -362,9 +360,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
-    ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
     ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], %const, implicit-def dead $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
     %0:sgpr(p0) = COPY $sgpr0_sgpr1
@@ -387,9 +383,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -416,9 +410,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
     ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], %const, implicit-def dead $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
     %0:sgpr(p0) = COPY $sgpr0_sgpr1
@@ -441,7 +433,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -2
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -2
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -468,7 +460,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -4
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -495,7 +487,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -8
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -8
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -522,7 +514,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -16
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -16
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -743,17 +735,15 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4042322160, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -252645136, implicit $exec
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1085102592571150096, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[COPY3]], implicit $exec
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], [[COPY4]], implicit $exec
-    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -1085102592571150096
     %2:vgpr(p0) = G_PTRMASK %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index b90fe2f88874b..60357abbc7721 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
index 229a84e60b129..f2daa23db47a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s  | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck -check-prefix=WAVE32 %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
index 9be5e14cdc711..57bbe020dca85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
 
 ---
 name: legal_brcond_vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 46036681e905e..96cab200b61cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
 declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 90f34acaa17aa..f88125ea02937 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -48,16 +48,16 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX10-LABEL: test_div_scale_f32_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v2, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_1:
@@ -133,16 +133,16 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX10-LABEL: test_div_scale_f32_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v2, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_2:
@@ -1266,14 +1266,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
 ;
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, 1.0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
@@ -1337,14 +1337,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
 ;
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, 2.0, 2.0, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
@@ -1416,17 +1416,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
 ;
 ; GFX10-LABEL: test_div_scale_f32_fabs_num:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v2, v2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_fabs_num:
@@ -1508,17 +1508,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
 ;
 ; GFX10-LABEL: test_div_scale_f32_fabs_den:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v2
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_fabs_den:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 2a260823732ca..6415e185446f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index ec069c10a8d21..81c73c789239d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 69f9a5712b0b5..614f59c564df6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -681,7 +681,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -789,7 +789,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -906,28 +906,26 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_mov_b32 s16, 0xb36211c7
 ; GFX11-NEXT:    s_mov_b32 s6, 2.0
-; GFX11-NEXT:    s_movk_i32 s17, 0x102
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x34
 ; GFX11-NEXT:    s_mov_b32 s8, 0x40400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0xb36211c7 :: v_dual_lshlrev_b32 v2, 2, v0
 ; GFX11-NEXT:    s_mov_b32 s12, 0x40c00000
 ; GFX11-NEXT:    s_mov_b32 s10, 0x40a00000
 ; GFX11-NEXT:    s_mov_b32 s9, 4.0
 ; GFX11-NEXT:    s_mov_b32 s14, 0x41000000
 ; GFX11-NEXT:    s_mov_b32 s13, 0x40e00000
-; GFX11-NEXT:    v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s12
-; GFX11-NEXT:    v_mov_b32_e32 v4, s9
-; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT:    v_bfrev_b32_e32 v10, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b32 s5, 1.0
-; GFX11-NEXT:    v_mov_b32_e32 v7, s13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_b32 v11, v[0:1]
@@ -1012,17 +1010,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_mov_b32 s12, 0xb36211c6
 ; GFX11-NEXT:    s_mov_b32 s6, 2.0
-; GFX11-NEXT:    s_movk_i32 s13, 0x102
 ; GFX11-NEXT:    s_mov_b32 s8, 0x42004600
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_mov_b32 s9, 0x44004700
 ; GFX11-NEXT:    s_mov_b32 s10, 0x45004800
-; GFX11-NEXT:    v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT:    v_mov_b32_e32 v7, s13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x34
-; GFX11-NEXT:    v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0xb36211c6
+; GFX11-NEXT:    v_bfrev_b32_e32 v7, 4.0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s8
+; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    s_mov_b32 s4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 4d012796693cb..79164037bc6aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -326,7 +326,6 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
-; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
@@ -334,6 +333,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
 ; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
 ; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index aa21e67544d65..4946d3759c2ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -63,14 +63,14 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
 ;
 ; GFX10-LABEL: mov_dpp64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24 ; encoding: [0x01,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
 ; GFX10-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
 ; GFX11-LABEL: mov_dpp64_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 3def36766fbe0..9ef54ed724ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -77,11 +77,12 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s4, -1
-; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
-; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT:    s_mov_b32 s5, 0xffefffff
-; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_rsq_clamp_f64:
@@ -92,13 +93,14 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s0, -1
-; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   ret double %rsq_clamp
@@ -115,11 +117,12 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
-; VI-NEXT:    s_mov_b32 s4, -1
-; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
-; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT:    s_mov_b32 s5, 0xffefffff
-; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_rsq_clamp_fabs_f64:
@@ -130,13 +133,14 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
-; GFX12-NEXT:    s_mov_b32 s0, -1
-; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
@@ -185,11 +189,12 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], s[4:5]
-; VI-NEXT:    s_mov_b32 s4, -1
-; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
-; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT:    s_mov_b32 s5, 0xffefffff
-; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_rsq_clamp_undef_f64:
@@ -200,13 +205,14 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
-; GFX12-NEXT:    s_mov_b32 s0, -1
-; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
   ret double %rsq_clamp
@@ -254,11 +260,12 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s4, -1
-; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
-; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT:    s_mov_b32 s5, 0xffefffff
-; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
@@ -269,13 +276,14 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s0, -1
-; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT:    s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   ret double %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 5074f8814546e..8f88aaedf7e95 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -117,10 +117,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT:    s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, 0xcccccccd
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x4010cccc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 2198ba9f1d964..1e8209bd3fc6b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: dpp_test:
@@ -64,16 +64,16 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
 ;
 ; GFX10-LABEL: update_dppi64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppi64_test:
@@ -120,16 +120,16 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 ;
 ; GFX10-LABEL: update_dppf64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppf64_test:
@@ -176,16 +176,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ;
 ; GFX10-LABEL: update_dppv2i32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
@@ -232,16 +232,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ;
 ; GFX10-LABEL: update_dppv2f32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
@@ -288,16 +288,16 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
 ;
 ; GFX10-LABEL: update_dpp_p0_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dpp_p0_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 4e44ef33cd13f..cc0e34be02a76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index a72b7e4f36396..112a7d98ce7ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 2d9fc9b25cfc8..0f60f40bd337b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
 
 define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
 ; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index 4d4da869d7507..de9af5209db16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -9,56 +9,71 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
 ; LOOP:       ; %bb.0:
 ; LOOP-NEXT:    v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
 ; LOOP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; LOOP-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
-; LOOP-NEXT:    s_cbranch_execz .LBB0_3
-; LOOP-NEXT:  ; %bb.1: ; %copy_forward
-; LOOP-NEXT:    s_mov_b64 s[6:7], 0
+; LOOP-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; LOOP-NEXT:    s_cbranch_execnz .LBB0_3
+; LOOP-NEXT:  ; %bb.1: ; %Flow
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT:    s_cbranch_execnz .LBB0_4
+; LOOP-NEXT:  .LBB0_2: ; %memmove_done
+; LOOP-NEXT:    s_endpgm
+; LOOP-NEXT:  .LBB0_3:
+; LOOP-NEXT:    s_mov_b32 s6, 0
+; LOOP-NEXT:    s_mov_b32 s7, 0xf000
+; LOOP-NEXT:    s_mov_b64 s[4:5], 0
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; LOOP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT:    s_cbranch_execz .LBB0_2
+; LOOP-NEXT:  .LBB0_4: ; %memmove_bwd_residual
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
-; LOOP-NEXT:    v_mov_b32_e32 v4, s6
-; LOOP-NEXT:    v_mov_b32_e32 v5, s7
-; LOOP-NEXT:  .LBB0_2: ; %copy_forward_loop
-; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT:    s_waitcnt expcnt(2)
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v0, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v1, v5, vcc
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v4
+; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    s_cbranch_vccnz .LBB0_2
-; LOOP-NEXT:  .LBB0_3: ; %Flow17
-; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT:    s_cbranch_execz .LBB0_6
-; LOOP-NEXT:  ; %bb.4: ; %copy_backwards
-; LOOP-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; LOOP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; LOOP-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; LOOP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; LOOP-NEXT:    s_mov_b32 s0, -4
-; LOOP-NEXT:    s_mov_b32 s6, 0
-; LOOP-NEXT:    s_mov_b32 s7, 0xf000
-; LOOP-NEXT:    s_mov_b64 s[4:5], 0
-; LOOP-NEXT:    v_mov_b32_e32 v4, s0
-; LOOP-NEXT:  .LBB0_5: ; %copy_backwards_loop
-; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT:    s_and_b64 vcc, vcc, exec
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[4:7], 0 addr64
-; LOOP-NEXT:    v_add_i32_e64 v0, s[0:1], -1, v0
-; LOOP-NEXT:    v_addc_u32_e64 v1, s[0:1], -1, v1, s[0:1]
-; LOOP-NEXT:    v_add_i32_e64 v2, s[0:1], -1, v2
-; LOOP-NEXT:    v_addc_u32_e64 v3, s[0:1], -1, v3, s[0:1]
-; LOOP-NEXT:    s_cbranch_vccz .LBB0_5
-; LOOP-NEXT:  .LBB0_6: ; %memmove_done
+; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
 ; LOOP-NEXT:    s_endpgm
 ;
 ; UNROLL-LABEL: memmove_p1i8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 3edd2e0914a6e..7cd3babc70909 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -7,12 +7,11 @@ declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
 define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
 ; LOOP-LABEL: memset_p1i8:
 ; LOOP:       ; %bb.0: ; %loadstoreloop.preheader
-; LOOP-NEXT:    s_mov_b64 s[4:5], 0
+; LOOP-NEXT:    s_mov_b64 s[0:1], 0
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
-; LOOP-NEXT:    s_mov_b64 s[0:1], 0
-; LOOP-NEXT:    v_mov_b32_e32 v3, s4
-; LOOP-NEXT:    v_mov_b32_e32 v4, s5
+; LOOP-NEXT:    v_mov_b32_e32 v4, s1
+; LOOP-NEXT:    v_mov_b32_e32 v3, s0
 ; LOOP-NEXT:  .LBB0_1: ; %loadstoreloop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; LOOP-NEXT:    v_add_i32_e32 v5, vcc, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 980ba3da4bac7..5dd4fa0809131 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1766,7 +1766,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GCN-NEXT:    s_cselect_b32 s2, s6, 0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i65:
@@ -1788,7 +1788,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 48217b9bb0d93..4c34209752c01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -76,29 +76,25 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 0
-; GFX7-NEXT:    s_mov_b32 s5, 4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -120,29 +116,25 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 4
-; GFX6-NEXT:    s_mov_b32 s5, s4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 4
-; GFX7-NEXT:    s_mov_b32 s5, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -234,9 +226,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    s_mov_b32 s1, 4
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s0
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -244,21 +236,16 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    s_mov_b32 s1, 4
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s0
 ; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967296:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_mov_b32 s1, 4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX12-NEXT:    s_nop 0
@@ -273,7 +260,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 4
-; GFX6-NEXT:    s_mov_b32 s1, s0
+; GFX6-NEXT:    s_mov_b32 s1, 4
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -283,7 +270,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s0, 4
-; GFX7-NEXT:    s_mov_b32 s1, s0
+; GFX7-NEXT:    s_mov_b32 s1, 4
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -292,13 +279,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ;
 ; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 4
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX12-NEXT:    s_nop 0
@@ -715,28 +697,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 0
-; GFX7-NEXT:    s_mov_b32 s5, 4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -758,28 +736,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 4
-; GFX6-NEXT:    s_mov_b32 s5, s4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 4
-; GFX7-NEXT:    s_mov_b32 s5, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -868,8 +842,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    s_mov_b32 s1, 4
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s0
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -878,21 +852,16 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    s_mov_b32 s1, 4
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967296:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_mov_b32 s1, 4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -905,7 +874,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 4
-; GFX6-NEXT:    s_mov_b32 s1, s0
+; GFX6-NEXT:    s_mov_b32 s1, 4
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -915,7 +884,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s0, 4
-; GFX7-NEXT:    s_mov_b32 s1, s0
+; GFX7-NEXT:    s_mov_b32 s1, 4
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -924,13 +893,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ;
 ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 4
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -1307,15 +1271,13 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 2
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -1324,15 +1286,13 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ;
 ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 0
-; GFX7-NEXT:    s_mov_b32 s5, 4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 2
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -1404,8 +1364,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    s_mov_b32 s1, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 2
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s0
 ; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -1418,8 +1378,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    s_mov_b32 s1, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 2
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s0
 ; GFX7-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -1428,13 +1388,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ;
 ; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_mov_b32 s1, 4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1549,15 +1504,13 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr, i32 %old, i32 %in) {
 ; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -1567,15 +1520,13 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ;
 ; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 0
-; GFX7-NEXT:    s_mov_b32 s5, 4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -1649,8 +1600,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_mov_b32 s1, 4
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s0
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -1663,8 +1614,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_mov_b32 s1, 4
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s0
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -1673,13 +1624,9 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ;
 ; GFX12-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_mov_b32 s1, 4
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v2
-; GFX12-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 577a7d0b4cba0..489f46d1237a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -7,18 +7,18 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; GFX10-LABEL: v_mul_i64_no_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[4:5]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v0, v2, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
-; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i64_no_zext:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index b0f3eee3c7363..42f1bf84c0420 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2559,47 +2559,47 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX8-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT:    s_mulk_i32 s2, 0x50
-; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_mulk_i32 s0, 0x50
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s3, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s2, s3, 0x50
-; GFX10-NEXT:    s_mul_hi_u32 s3, s3, 0x50
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_mul_i32 s0, s1, 0x50
+; GFX10-NEXT:    s_mul_hi_u32 s1, s1, 0x50
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_mul_u64_zext_with_sregs:
@@ -2738,56 +2738,56 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX8-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX8-NEXT:    s_mulk_i32 s2, 0x50
-; GFX8-NEXT:    s_mulk_i32 s3, 0x50
-; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    s_add_u32 s3, s3, s4
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_mulk_i32 s0, 0x50
+; GFX8-NEXT:    s_mulk_i32 s1, 0x50
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_add_u32 s1, s1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT:    s_mulk_i32 s4, 0x50
-; GFX9-NEXT:    s_add_u32 s3, s4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX9-NEXT:    s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT:    s_mulk_i32 s2, 0x50
+; GFX9-NEXT:    s_add_u32 s1, s2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX10-NEXT:    s_mul_hi_u32 s4, s2, 0x50
-; GFX10-NEXT:    s_mulk_i32 s3, 0x50
-; GFX10-NEXT:    s_mulk_i32 s2, 0x50
-; GFX10-NEXT:    s_add_i32 s3, s4, s3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_mul_hi_u32 s2, s0, 0x50
+; GFX10-NEXT:    s_mulk_i32 s1, 0x50
+; GFX10-NEXT:    s_mulk_i32 s0, 0x50
+; GFX10-NEXT:    s_add_i32 s1, s2, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_mul_u64_sext_with_sregs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
index 14f69c301ec38..76994c5cccf5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
@@ -2,22 +2,45 @@
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
 
---- |
-  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-  define void @test_frame_index_p5() {
-      %ptr0 = alloca i32, addrspace(5)
-     ret void
-    }
-...
 ---
 name: test_frame_index_p5
 legalized:       true
 stack:
-  - { id: 0, name: ptr0, offset: 0, size: 4, alignment: 4 }
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
 body: |
   bb.0:
     ; CHECK-LABEL: name: test_frame_index_p5
-    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:vgpr(p5) = G_FRAME_INDEX %stack.0.ptr0
-    %0:_(p5) = G_FRAME_INDEX %stack.0.ptr0
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+
+...
+
+---
+name: test_frame_index_p5_sgpr_use
+legalized:       true
+stack:
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_frame_index_p5_sgpr_use
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: $sgpr0 = COPY [[FRAME_INDEX]](p5)
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+    $sgpr0 = COPY %0
+
+...
+
+---
+name: test_frame_index_p5_vgpr_use
+legalized:       true
+stack:
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_frame_index_p5_vgpr_use
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5)
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+    $vgpr0 = COPY %0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 377fa24cb4755..404e726246f4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -218,6 +218,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s11
+; CHECK-NEXT:    s_xor_b64 s[6:7], s[6:7], s[8:9]
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v1
@@ -326,10 +327,9 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
-; CHECK-NEXT:    v_xor_b32_e32 v0, s0, v0
-; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; CHECK-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index cf69c50ed9357..7d7f450e590fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -305,25 +305,25 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdivrem_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
-; GFX9-NEXT:    s_add_u32 s0, s8, s2
-; GFX9-NEXT:    s_addc_u32 s1, s9, s2
-; GFX9-NEXT:    s_add_u32 s8, s10, s12
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s9, s11, s12
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
+; GFX9-NEXT:    s_add_u32 s0, s12, s2
+; GFX9-NEXT:    s_addc_u32 s1, s13, s2
+; GFX9-NEXT:    s_add_u32 s6, s14, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s7, s15, s4
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s14, 0, s8
-; GFX9-NEXT:    s_subb_u32 s15, 0, s9
+; GFX9-NEXT:    s_sub_u32 s14, 0, s6
+; GFX9-NEXT:    s_subb_u32 s15, 0, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
@@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s12, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX9-NEXT:    v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s13, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s6, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
-; GFX10-NEXT:    s_add_u32 s0, s8, s2
-; GFX10-NEXT:    s_addc_u32 s1, s9, s2
-; GFX10-NEXT:    s_add_u32 s8, s10, s12
-; GFX10-NEXT:    s_mov_b32 s13, s12
-; GFX10-NEXT:    s_addc_u32 s9, s11, s12
+; GFX10-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX10-NEXT:    s_ashr_i32 s4, s15, 31
+; GFX10-NEXT:    s_add_u32 s0, s12, s2
+; GFX10-NEXT:    s_addc_u32 s1, s13, s2
+; GFX10-NEXT:    s_add_u32 s6, s14, s4
+; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    s_addc_u32 s7, s15, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT:    s_sub_u32 s10, 0, s8
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT:    s_sub_u32 s12, 0, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -484,11 +484,12 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s11, s10, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s11, 0, s9
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s13, s12, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
+; GFX10-NEXT:    s_subb_u32 s13, 0, s7
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
+; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
@@ -510,28 +511,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s12, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2]
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v6, s10, v7, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v6, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v6, s12, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v2, s12, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
@@ -540,71 +541,70 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v3
+; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v0, s12, v5, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v0, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v5, s10, v0, v2
+; GFX10-NEXT:    v_add_co_u32 v0, s12, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v5, s12, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v5, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s12, s6, v5, 0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2]
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s8
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s6
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s6
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_xor_b32_e32 v2, s8, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s9, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s5, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s2, v1
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, s4
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv i64 %x, %y
   store i64 %div, ptr addrspace(1) %out0
@@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s8
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s9
-; GFX9-NEXT:    s_xor_b32 s7, s7, s9
+; GFX9-NEXT:    s_ashr_i32 s0, s14, 31
+; GFX9-NEXT:    s_add_i32 s1, s14, s0
+; GFX9-NEXT:    s_xor_b32 s1, s1, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX9-NEXT:    s_add_i32 s3, s15, s2
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_i32 s12, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s1
+; GFX9-NEXT:    s_ashr_i32 s4, s12, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s10
-; GFX9-NEXT:    s_xor_b32 s4, s4, s10
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s12, 0, s7
+; GFX9-NEXT:    s_add_i32 s6, s12, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX9-NEXT:    s_add_i32 s7, s13, s5
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s11
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
+; GFX9-NEXT:    s_xor_b32 s7, s7, s5
+; GFX9-NEXT:    s_xor_b32 s0, s4, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s4, s10, s8
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
-; GFX9-NEXT:    s_xor_b32 s4, s11, s9
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    s_xor_b32 s0, s5, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s10, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX10-NEXT:    s_add_i32 s0, s10, s1
-; GFX10-NEXT:    s_add_i32 s3, s11, s2
-; GFX10-NEXT:    s_xor_b32 s10, s0, s1
+; GFX10-NEXT:    s_ashr_i32 s1, s14, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX10-NEXT:    s_add_i32 s0, s14, s1
+; GFX10-NEXT:    s_add_i32 s3, s15, s2
+; GFX10-NEXT:    s_xor_b32 s4, s0, s1
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s11, 0, s3
-; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX10-NEXT:    s_sub_i32 s0, 0, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_add_i32 s7, s13, s6
+; GFX10-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v1
-; GFX10-NEXT:    s_ashr_i32 s11, s8, 31
-; GFX10-NEXT:    s_add_i32 s0, s8, s11
-; GFX10-NEXT:    s_add_i32 s8, s9, s12
-; GFX10-NEXT:    s_xor_b32 s0, s0, s11
-; GFX10-NEXT:    s_xor_b32 s8, s8, s12
+; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
+; GFX10-NEXT:    s_ashr_i32 s5, s12, 31
+; GFX10-NEXT:    s_add_i32 s0, s12, s5
+; GFX10-NEXT:    s_xor_b32 s1, s5, s1
+; GFX10-NEXT:    s_xor_b32 s0, s0, s5
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s1, s11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s12, s2
+; GFX10-NEXT:    s_xor_b32 s0, s6, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc44587..4cf1c92539c36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1733,9 +1733,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s10
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[8:9]
+; GCN-NEXT:    s_cselect_b32 s3, s6, s8
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT:    s_cselect_b32 s2, s2, s3
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i65:
@@ -1753,9 +1753,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[0:1], s10
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10PLUS-NEXT:    s_cselect_b32 s3, s4, s6
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 9ee0acf2aa2db..8ea5fc25d95de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -92,8 +92,8 @@ entry:
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
-; GFX9_10: s_add_u32 s2, s2, -4
-; GFX9_10: s_addc_u32 s3, s3, -1
+; GFX9_10: s_add_u32 s0, s6, -4
+; GFX9_10: s_addc_u32 s1, s7, -1
 ; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 83ebc84e1f84a..5b94e71ecf52e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -193,7 +193,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_mov_b32 s7, -1
 ; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[0:1], 0
-; CHECK-NEXT:    s_mov_b32 s0, 1
+; CHECK-NEXT:    s_mov_b32 s7, 1
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
@@ -212,6 +212,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s9
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v1
@@ -272,43 +273,43 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s10, v1
-; CHECK-NEXT:    v_mul_hi_u32 v4, s10, v0
+; CHECK-NEXT:    v_mul_hi_u32 v5, s10, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT:    v_mul_hi_u32 v5, s11, v1
+; CHECK-NEXT:    v_mul_hi_u32 v6, s11, v1
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s11, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, s11, v1
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s10, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v0, v2
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT:    v_mov_b32_e32 v5, s11
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; CHECK-NEXT:    v_mov_b32_e32 v3, s9
-; CHECK-NEXT:    v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
+; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; CHECK-NEXT:    v_mov_b32_e32 v4, s9
+; CHECK-NEXT:    v_subb_u32_e64 v2, s[0:1], v3, v1, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v0
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v3, v5, s[0:1]
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
+; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, v4, v5, s[0:1]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -321,12 +322,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:  .LBB1_3: ; %Flow
-; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_xor_b32 s0, s7, 1
 ; CHECK-NEXT:    s_and_b32 s0, s0, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index d15551365707b..e31d8e95bd608 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -191,7 +191,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_mov_b32 s7, -1
 ; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT:    s_mov_b32 s4, 1
+; CHECK-NEXT:    s_mov_b32 s6, 1
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
@@ -199,6 +199,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -317,12 +318,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:  .LBB1_3: ; %Flow
-; CHECK-NEXT:    s_xor_b32 s1, s4, 1
+; CHECK-NEXT:    s_xor_b32 s1, s6, 1
 ; CHECK-NEXT:    s_and_b32 s1, s1, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 8b94f93e44e56..5aef667934709 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -251,12 +251,12 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ;
 ; GFX9-LABEL: udivrem_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX9-NEXT:    s_sub_u32 s2, 0, s10
-; GFX9-NEXT:    s_subb_u32 s3, 0, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT:    s_sub_u32 s2, 0, s14
+; GFX9-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
+; GFX9-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s9
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s12, v0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v6
-; GFX9-NEXT:    v_sub_u32_e32 v0, s9, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v6
+; GFX9-NEXT:    v_sub_u32_e32 v0, s13, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s10, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s14, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s14, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX10-NEXT:    s_sub_u32 s0, 0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX10-NEXT:    s_sub_u32 s0, 0, s14
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s0, v3, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s1, 0, s11
+; GFX10-NEXT:    s_subb_u32 s1, 0, s15
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
@@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v2, s13, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, s9, v1
+; GFX10-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT:    v_mul_lo_u32 v5, s13, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX10-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v1
+; GFX10-NEXT:    v_mul_hi_u32 v2, s13, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v5, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s14, v5, 0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s8, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v1
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
+; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s12, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s13, v1
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v9
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v8
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
@@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s0
-; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v10, v[2:3], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v10, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv i64 %x, %y
   store i64 %div, ptr addrspace(1) %out0
@@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_sub_i32 s0, 0, s10
-; GFX9-NEXT:    s_sub_i32 s1, 0, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX9-NEXT:    s_sub_i32 s0, 0, s14
+; GFX9-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s12, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s14
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s12, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s1, 0, s11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX10-NEXT:    s_sub_i32 s0, 0, s14
+; GFX10-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s8, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s12, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s13, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
@@ -1248,16 +1248,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
+; GFX9-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x20
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s13
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
-; GFX9-NEXT:    s_sub_u32 s2, 0, s12
-; GFX9-NEXT:    s_subb_u32 s3, 0, s13
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s17
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s16
+; GFX9-NEXT:    s_sub_u32 s2, 0, s16
+; GFX9-NEXT:    s_subb_u32 s3, 0, s17
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT:    s_sub_u32 s2, 0, s14
+; GFX9-NEXT:    s_sub_u32 s2, 0, s18
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_subb_u32 s3, 0, s15
+; GFX9-NEXT:    s_subb_u32 s3, 0, s19
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1317,48 +1317,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT:    v_mul_hi_u32 v5, s13, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, s17
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v9, v3, v0, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s12, v1
 ; GFX9-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s19
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s18
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s12, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
@@ -1370,13 +1369,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, v5, v17, s[0:1]
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e64 v18, s[0:1], 1, v13
@@ -1385,7 +1384,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v15, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s12, v11
+; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s16, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v16, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
@@ -1441,55 +1440,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v16, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v5
-; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, s11, v5
+; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, s15, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v6
 ; GFX9-NEXT:    v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v6
-; GFX9-NEXT:    v_mul_hi_u32 v13, s11, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, s14, v6
+; GFX9-NEXT:    v_mul_hi_u32 v13, s15, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v10, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v1, v11, v9
 ; GFX9-NEXT:    v_add3_u32 v9, v1, v2, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v10, s11
-; GFX9-NEXT:    v_mov_b32_e32 v6, s15
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v10, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s19
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s14, v5
 ; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v10
-; GFX9-NEXT:    v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s19, v10
+; GFX9-NEXT:    v_sub_u32_e32 v1, s15, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s18, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s19, v10
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s14, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s18, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v12
 ; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s19, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s18, v11
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v13
-; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s14, v11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s19, v13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s18, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v14
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -1504,22 +1503,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT:    global_store_dwordx4 v0, v[3:6], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v0, v[7:10], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[3:6], s[8:9]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[7:10], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x20
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s13
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s12
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s14
-; GFX10-NEXT:    s_sub_u32 s0, 0, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s17
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s19
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s16
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s18
+; GFX10-NEXT:    s_sub_u32 s0, 0, s16
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_subb_u32 s1, 0, s13
+; GFX10-NEXT:    s_subb_u32 s1, 0, s17
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1539,13 +1540,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT:    s_sub_u32 s2, 0, s14
+; GFX10-NEXT:    s_sub_u32 s2, 0, s18
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s3, s2, v8, 0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT:    s_subb_u32 s3, 0, s15
+; GFX10-NEXT:    s_subb_u32 s3, 0, s19
 ; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5]
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v7, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6]
@@ -1592,7 +1593,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s0, v7, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, s2, v8, 0
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
@@ -1641,21 +1641,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v8, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u32 v3, s9, v4
-; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v4
-; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v2
-; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v11, s9, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s10, v0
-; GFX10-NEXT:    v_mul_hi_u32 v7, s10, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT:    v_mul_lo_u32 v12, s11, v0
-; GFX10-NEXT:    v_mul_hi_u32 v13, s10, v0
-; GFX10-NEXT:    v_mul_hi_u32 v14, s11, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s13, v4
+; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v2
+; GFX10-NEXT:    v_mul_hi_u32 v5, s12, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, s13, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v2
+; GFX10-NEXT:    v_mul_lo_u32 v6, s15, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v2
+; GFX10-NEXT:    v_mul_hi_u32 v11, s13, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s14, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s14, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, s15, v0
+; GFX10-NEXT:    v_mul_hi_u32 v13, s14, v0
+; GFX10-NEXT:    v_mul_hi_u32 v14, s15, v0
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v3, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v9, v4
@@ -1678,77 +1677,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_co_u32 v8, s0, v4, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s12, v8, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s16, v8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s14, v10, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s18, v10, 0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v7
 ; GFX10-NEXT:    v_add3_u32 v9, v5, v4, v11
 ; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v8, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v6, v14
-; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
-; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v12, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s8, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s9, v3
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s12, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s16, v14
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s13, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT:    v_sub_co_u32 v15, s0, s10, v2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v15
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s11, v0
+; GFX10-NEXT:    v_sub_co_u32 v15, s0, s14, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v15
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s12
+; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s16
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v5
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v18
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v17
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v18
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v16
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s19, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s12
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s16
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s14
+; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s18
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v17, v0, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s19, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v6
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v10, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v13, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s14
+; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v14, v17, vcc_lo
@@ -1759,8 +1758,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v13, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v6, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[8:9]
+; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index cc0f7e2ca5a54..f30b278b3e611 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -188,13 +188,14 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_mov_b32 s7, -1
 ; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT:    s_mov_b32 s4, 1
+; CHECK-NEXT:    s_mov_b32 s6, 1
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
+; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s3
@@ -313,12 +314,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:  .LBB1_3: ; %Flow
-; CHECK-NEXT:    s_xor_b32 s1, s4, 1
+; CHECK-NEXT:    s_xor_b32 s1, s6, 1
 ; CHECK-NEXT:    s_and_b32 s1, s1, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index c9a9eb9d91724..1f1c2659e8110 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -479,33 +479,33 @@ return:
 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_chain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
 ; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
 ; GFX906-NEXT:  .LBB8_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -533,31 +533,31 @@ bb.3:
 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_multi_block:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[6:7]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
 ; GFX906-NEXT:  .LBB9_3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index ef2e57eafbf13..f36dcb487e915 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
 ;
 ; GFX9-LABEL: constant_load_i8_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i8_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in, align 4
   store i8 %ld, ptr addrspace(1) %out, align 4
@@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
 ;
 ; GFX9-LABEL: constant_load_i16_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_short v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i16_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in, align 4
   store i16 %ld, ptr addrspace(1) %out, align 4
@@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: sextload_i8_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_sext_i32_i8 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sextload_i8_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i8 s2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 4
   %sext = sext i8 %load to i32
@@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: sextload_i16_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sextload_i16_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_sext_i32_i16 s0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 4
   %sext = sext i16 %load to i32
@@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: zextload_i8_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zextload_i8_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 4
   %zext = zext i8 %load to i32
@@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: zextload_i16_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zextload_i16_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 4
   %zext = zext i16 %load to i32
@@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: constant_load_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   store i8 %load, ptr addrspace(1) %out, align 2
@@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: constant_load_i16_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i16_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 2
   store i16 %load, ptr addrspace(1) %out, align 2
@@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: constant_sextload_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_sbyte v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_sbyte v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_sextload_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_sbyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_sbyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   %sextload = sext i8 %load to i32
@@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: constant_zextload_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_zextload_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   %zextload = zext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 6c232b680ebf5..08a5d6660bab2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 717a4fc823d51..738671cf7c648 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index 1ef50cbd0fc7e..25f4145f9724c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 0bd255e5e1af4..87fe8334d059f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index 7399fa0a341e2..a03180f039526 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index ae20ab1de3a2d..e53653408feb4 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -255,7 +255,6 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
   %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
   %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
   %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
-  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index e9797fa1fc309..94d704fa3f92d 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -35,26 +35,26 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: s_add_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_add_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_i32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_add_i32:
@@ -125,30 +125,30 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_add_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s2, s5, s7
-; GFX9-NEXT:    s_add_i32 s3, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_add_i32 s1, s1, s3
+; GFX9-NEXT:    s_add_i32 s0, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_add_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_i32 s2, s4, s6
-; GFX10-NEXT:    s_add_i32 s3, s5, s7
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_add_i32 s0, s0, s2
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_add_v2i32:
@@ -813,30 +813,30 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_add_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_add_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_add_i32:
@@ -922,26 +922,26 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_add_imm_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x7b, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_add_imm_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_add_imm_i32:
@@ -1240,50 +1240,50 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: add64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_add_u32 s4, s4, s6
-; GFX9-NEXT:    s_addc_u32 s5, s5, s7
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT:    s_add_u32 s0, s8, s10
+; GFX9-NEXT:    s_addc_u32 s1, s9, s11
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB9_3
 ; GFX9-NEXT:  .LBB9_2: ; %if
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:  .LBB9_3: ; %endif
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:  .LBB9_4:
-; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; GFX9-NEXT:    s_branch .LBB9_2
 ;
 ; GFX10-LABEL: add64_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_add_u32 s4, s4, s6
-; GFX10-NEXT:    s_addc_u32 s5, s5, s7
+; GFX10-NEXT:    s_add_u32 s0, s8, s10
+; GFX10-NEXT:    s_addc_u32 s1, s9, s11
 ; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX10-NEXT:  .LBB9_2: ; %if
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:  .LBB9_3: ; %endif
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ; GFX10-NEXT:  .LBB9_4:
-; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; GFX10-NEXT:    s_branch .LBB9_2
 ;
 ; GFX11-LABEL: add64_in_branch:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index b751be51a9739..4cc384e9d2718 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -186,24 +186,24 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: s_test_add_self_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_add_u16 v1, s2, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_pk_add_u16 v1, s0, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_self_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s2, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_add_u16 v1, s0, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_self_v2i16:
@@ -245,21 +245,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_pk_add_u16 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_add_u16 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
@@ -300,27 +300,27 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_add_v2i16_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0x1c8007b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s2, 0x1c8007b
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_constant:
@@ -369,27 +369,27 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0xfc21fcb3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s2, 0xfc21fcb3
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
@@ -437,26 +437,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, -1
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, -1
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
@@ -503,26 +503,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 32
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 32
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
@@ -570,26 +570,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 1.0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 1.0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index def6df9adf597..88203202a320d 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
 ; GFX9-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:  .LBB2_2: ; %then
@@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
 ; GFX10-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ; GFX10-NEXT:  .LBB2_2: ; %then
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 77976e470fc78..95f59479c73e8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
 ; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
 ; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
 ; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
+; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
 ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
 ; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
+; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
 define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
                                                     ptr addrspace(1) inreg %out) {
   %v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 7cf18171a6cd7..96e92bb3dce0d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s4, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s7
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_sub_i32 s5, 0, s4
-; GFX9-NEXT:    s_xor_b32 s3, s2, s3
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_xor_b32 s1, s6, s7
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s5, s4
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s3, s0
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s4
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s3, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_add_i32 s6, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s3, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -5486,13 +5486,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s3, s3, 12
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_add_i32 s0, s7, 12
+; GFX9-NEXT:    s_lshr_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
@@ -5528,14 +5528,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5574,18 +5574,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_mul_hi_u32 s1, s7, 0x100101
+; GFX9-NEXT:    s_sub_i32 s2, s7, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5879,14 +5879,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_add_i32 s3, s3, -1
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_add_i32 s0, s0, -1
+; GFX9-NEXT:    s_and_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
@@ -5922,14 +5922,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6239,41 +6239,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX9-NEXT:    s_add_i32 s3, s6, s2
+; GFX9-NEXT:    s_sub_i32 s6, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT:    s_mul_i32 s8, s6, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s0
+; GFX9-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_sub_i32 s8, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_sub_i32 s8, s3, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
 ; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
-; GFX9-NEXT:    s_xor_b32 s3, s5, s4
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s7, s6
+; GFX9-NEXT:    s_xor_b32 s1, s2, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6315,20 +6315,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_add_i32 s3, s3, s5
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6370,21 +6370,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_add_i32 s5, s5, s3
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
-; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_add_i32 s4, s4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_mul_hi_i32 s1, s7, 0x80080081
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_i32 s1, s1, s7
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_add_i32 s1, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6753,38 +6753,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_add_i32 s2, s6, s1
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -6828,22 +6828,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s4, s2, s4
-; GFX9-NEXT:    s_add_i32 s5, s3, s5
-; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -7290,13 +7290,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: udiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -7871,12 +7871,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: urem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -8138,58 +8138,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: sdiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT:    s_add_u32 s4, 0x396, s4
+; GFX9-NEXT:    s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT:    s_add_u32 s0, 0x396, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT:    s_addc_u32 s5, 0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
-; GFX9-NEXT:    s_sub_i32 s5, s7, s6
-; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT:    s_mul_i32 s12, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT:    s_add_u32 s6, s6, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT:    s_mul_i32 s10, s4, s8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_sub_i32 s1, s3, s2
+; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT:    s_mul_i32 s12, s2, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT:    s_add_u32 s2, s2, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT:    s_mul_i32 s10, s0, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s11
-; GFX9-NEXT:    s_add_u32 s6, s6, s10
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT:    s_addc_u32 s6, s8, s9
-; GFX9-NEXT:    s_addc_u32 s7, s7, 0
-; GFX9-NEXT:    s_mul_i32 s5, s4, s5
-; GFX9-NEXT:    s_add_u32 s5, s6, s5
-; GFX9-NEXT:    s_addc_u32 s6, 0, s7
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    s_add_u32 s2, s2, s10
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT:    s_addc_u32 s2, s8, s9
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    s_mul_i32 s1, s0, s1
+; GFX9-NEXT:    s_add_u32 s1, s2, s1
+; GFX9-NEXT:    s_addc_u32 s2, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s6, s4, s6
+; GFX9-NEXT:    s_addc_u32 s8, s0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
+; GFX9-NEXT:    s_add_u32 s2, s6, s0
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_addc_u32 s3, s7, s0
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s8, s2, s6
+; GFX9-NEXT:    s_mul_i32 s7, s2, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT:    s_mul_hi_u32 s7, s2, s6
-; GFX9-NEXT:    s_add_u32 s8, s10, s8
-; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s8
+; GFX9-NEXT:    s_add_u32 s7, s10, s7
+; GFX9-NEXT:    s_addc_u32 s6, 0, s6
 ; GFX9-NEXT:    s_mul_hi_u32 s11, s3, s9
 ; GFX9-NEXT:    s_mul_i32 s9, s3, s9
-; GFX9-NEXT:    s_add_u32 s8, s8, s9
-; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s6
-; GFX9-NEXT:    s_addc_u32 s7, s7, s11
-; GFX9-NEXT:    s_addc_u32 s8, s10, 0
-; GFX9-NEXT:    s_mul_i32 s6, s3, s6
-; GFX9-NEXT:    s_add_u32 s6, s7, s6
-; GFX9-NEXT:    s_addc_u32 s7, 0, s8
+; GFX9-NEXT:    s_add_u32 s7, s7, s9
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT:    s_addc_u32 s6, s6, s11
+; GFX9-NEXT:    s_addc_u32 s7, s10, 0
+; GFX9-NEXT:    s_mul_i32 s8, s3, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s8
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
 ; GFX9-NEXT:    s_add_u32 s8, s6, 1
 ; GFX9-NEXT:    s_addc_u32 s9, s7, 0
 ; GFX9-NEXT:    s_add_u32 s10, s6, 2
@@ -8222,13 +8222,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s7
 ; GFX9-NEXT:    s_cselect_b32 s2, s8, s6
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT:    s_sub_u32 s2, s2, s4
-; GFX9-NEXT:    s_subb_u32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_u32 s2, s2, s0
+; GFX9-NEXT:    s_subb_u32 s3, s3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
@@ -8261,17 +8261,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: sdiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_u32 s0, s6, s0
+; GFX9-NEXT:    s_addc_u32 s1, s7, 0
+; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -9132,19 +9132,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s8
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], 0x1000, s10
-; GFX9-NEXT:    s_ashr_i32 s8, s1, 31
-; GFX9-NEXT:    s_add_u32 s0, s0, s8
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_addc_u32 s1, s1, s8
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[8:9]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX9-NEXT:    s_sub_u32 s0, 0, s12
-; GFX9-NEXT:    s_subb_u32 s1, 0, s13
+; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
+; GFX9-NEXT:    s_add_u32 s0, s0, s12
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s1, s1, s12
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX9-NEXT:    s_sub_u32 s0, 0, s14
+; GFX9-NEXT:    s_subb_u32 s1, 0, s15
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9153,60 +9153,60 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX9-NEXT:    s_mul_i32 s16, s0, s14
-; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s15
-; GFX9-NEXT:    s_mul_i32 s17, s1, s15
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s16, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s3
+; GFX9-NEXT:    s_mul_i32 s17, s1, s3
 ; GFX9-NEXT:    s_add_i32 s16, s18, s16
-; GFX9-NEXT:    s_mul_i32 s19, s0, s15
+; GFX9-NEXT:    s_mul_i32 s19, s0, s3
 ; GFX9-NEXT:    s_add_i32 s16, s16, s17
-; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
-; GFX9-NEXT:    s_mul_i32 s18, s15, s16
-; GFX9-NEXT:    s_mul_hi_u32 s15, s15, s19
-; GFX9-NEXT:    s_add_u32 s15, s15, s18
+; GFX9-NEXT:    s_mul_hi_u32 s17, s3, s16
+; GFX9-NEXT:    s_mul_i32 s18, s3, s16
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s19
+; GFX9-NEXT:    s_add_u32 s3, s3, s18
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s17
-; GFX9-NEXT:    s_mul_hi_u32 s20, s14, s19
-; GFX9-NEXT:    s_mul_i32 s19, s14, s19
-; GFX9-NEXT:    s_add_u32 s15, s15, s19
-; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s16
-; GFX9-NEXT:    s_addc_u32 s15, s17, s20
+; GFX9-NEXT:    s_mul_hi_u32 s20, s2, s19
+; GFX9-NEXT:    s_mul_i32 s19, s2, s19
+; GFX9-NEXT:    s_add_u32 s3, s3, s19
+; GFX9-NEXT:    s_mul_hi_u32 s18, s2, s16
+; GFX9-NEXT:    s_addc_u32 s3, s17, s20
 ; GFX9-NEXT:    s_addc_u32 s17, s18, 0
-; GFX9-NEXT:    s_mul_i32 s16, s14, s16
-; GFX9-NEXT:    s_add_u32 s15, s15, s16
+; GFX9-NEXT:    s_mul_i32 s16, s2, s16
+; GFX9-NEXT:    s_add_u32 s3, s3, s16
 ; GFX9-NEXT:    s_addc_u32 s16, 0, s17
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s15, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s14, s14, s16
+; GFX9-NEXT:    s_addc_u32 s2, s2, s16
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX9-NEXT:    s_mul_i32 s15, s0, s14
+; GFX9-NEXT:    s_mul_i32 s3, s0, s2
 ; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s16
-; GFX9-NEXT:    s_add_i32 s15, s17, s15
+; GFX9-NEXT:    s_add_i32 s3, s17, s3
 ; GFX9-NEXT:    s_mul_i32 s1, s1, s16
-; GFX9-NEXT:    s_add_i32 s15, s15, s1
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s16
-; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s0
-; GFX9-NEXT:    s_mul_i32 s18, s14, s0
-; GFX9-NEXT:    s_mul_i32 s20, s16, s15
+; GFX9-NEXT:    s_mul_hi_u32 s17, s2, s0
+; GFX9-NEXT:    s_mul_i32 s18, s2, s0
+; GFX9-NEXT:    s_mul_i32 s20, s16, s3
 ; GFX9-NEXT:    s_mul_hi_u32 s0, s16, s0
-; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s15
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s3
 ; GFX9-NEXT:    s_add_u32 s0, s0, s20
 ; GFX9-NEXT:    s_addc_u32 s16, 0, s19
 ; GFX9-NEXT:    s_add_u32 s0, s0, s18
-; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s15
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
 ; GFX9-NEXT:    s_addc_u32 s0, s16, s17
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_mul_i32 s15, s14, s15
-; GFX9-NEXT:    s_add_u32 s0, s0, s15
+; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s3
 ; GFX9-NEXT:    s_addc_u32 s1, 0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s16, s14, s1
-; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX9-NEXT:    s_add_u32 s0, s4, s14
-; GFX9-NEXT:    s_mov_b32 s15, s14
-; GFX9-NEXT:    s_addc_u32 s1, s5, s14
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
+; GFX9-NEXT:    s_addc_u32 s16, s2, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    s_add_u32 s0, s4, s2
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s1, s5, s2
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v0
 ; GFX9-NEXT:    s_mul_i32 s1, s4, s16
 ; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s17
@@ -9222,24 +9222,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s16, s5, s16
 ; GFX9-NEXT:    s_add_u32 s16, s0, s16
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s1
-; GFX9-NEXT:    s_mul_i32 s0, s12, s17
-; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s16
+; GFX9-NEXT:    s_mul_i32 s0, s14, s17
+; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s16
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
-; GFX9-NEXT:    s_mul_i32 s1, s13, s16
+; GFX9-NEXT:    s_mul_i32 s1, s15, s16
 ; GFX9-NEXT:    s_add_i32 s18, s0, s1
-; GFX9-NEXT:    s_mul_i32 s1, s12, s16
+; GFX9-NEXT:    s_mul_i32 s1, s14, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_sub_i32 s0, s5, s18
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s4, s0, s13
-; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
+; GFX9-NEXT:    s_subb_u32 s4, s0, s15
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s14, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_subb_u32 s4, s4, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s15
 ; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s4, s13
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s4, s15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -9257,35 +9257,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX9-NEXT:    s_subb_u32 s0, s5, s18
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s15
 ; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
-; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX9-NEXT:    s_add_u32 s8, s10, s4
+; GFX9-NEXT:    s_add_u32 s4, s10, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s9, s11, s4
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s5, s11, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_sub_u32 s0, 0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    s_subb_u32 s1, 0, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -9362,24 +9362,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s12, s7, s12
 ; GFX9-NEXT:    s_add_u32 s12, s0, s12
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s1
-; GFX9-NEXT:    s_mul_i32 s0, s8, s13
-; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
+; GFX9-NEXT:    s_mul_i32 s0, s4, s13
+; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s12
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
-; GFX9-NEXT:    s_mul_i32 s1, s9, s12
+; GFX9-NEXT:    s_mul_i32 s1, s5, s12
 ; GFX9-NEXT:    s_add_i32 s14, s0, s1
-; GFX9-NEXT:    s_mul_i32 s1, s8, s12
+; GFX9-NEXT:    s_mul_i32 s1, s4, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_sub_i32 s0, s7, s14
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s6, s0, s9
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v2
+; GFX9-NEXT:    s_subb_u32 s6, s0, s5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_subb_u32 s6, s6, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
 ; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
-; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
+; GFX9-NEXT:    s_cmp_eq_u32 s6, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -9397,10 +9397,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
 ; GFX9-NEXT:    s_subb_u32 s0, s7, s14
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s5
 ; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
-; GFX9-NEXT:    s_cmp_eq_u32 s0, s9
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -9410,13 +9410,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -9525,100 +9526,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: srem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT:    s_add_u32 s4, 0x396, s4
+; GFX9-NEXT:    s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT:    s_add_u32 s0, 0x396, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT:    s_addc_u32 s5, 0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
-; GFX9-NEXT:    s_sub_i32 s5, s7, s6
-; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT:    s_mul_i32 s12, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT:    s_add_u32 s6, s6, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT:    s_mul_i32 s10, s4, s8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_sub_i32 s1, s3, s2
+; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT:    s_mul_i32 s12, s2, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT:    s_add_u32 s2, s2, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT:    s_mul_i32 s10, s0, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s11
-; GFX9-NEXT:    s_add_u32 s6, s6, s10
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT:    s_addc_u32 s6, s8, s9
-; GFX9-NEXT:    s_addc_u32 s7, s7, 0
-; GFX9-NEXT:    s_mul_i32 s5, s4, s5
-; GFX9-NEXT:    s_add_u32 s5, s6, s5
-; GFX9-NEXT:    s_addc_u32 s6, 0, s7
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    s_add_u32 s2, s2, s10
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT:    s_addc_u32 s2, s8, s9
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    s_mul_i32 s1, s0, s1
+; GFX9-NEXT:    s_add_u32 s1, s2, s1
+; GFX9-NEXT:    s_addc_u32 s2, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s6, s4, s6
+; GFX9-NEXT:    s_addc_u32 s8, s0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s7, s2, s6
-; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_add_u32 s7, s9, s7
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
-; GFX9-NEXT:    s_mul_i32 s8, s3, s8
-; GFX9-NEXT:    s_add_u32 s7, s7, s8
-; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s6
-; GFX9-NEXT:    s_addc_u32 s5, s5, s10
-; GFX9-NEXT:    s_addc_u32 s7, s9, 0
-; GFX9-NEXT:    s_mul_i32 s6, s3, s6
-; GFX9-NEXT:    s_add_u32 s5, s5, s6
-; GFX9-NEXT:    s_addc_u32 s6, 0, s7
-; GFX9-NEXT:    s_mul_hi_u32 s8, s5, 0x12d8fb
-; GFX9-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
+; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
+; GFX9-NEXT:    s_add_u32 s2, s6, s0
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_addc_u32 s3, s7, s0
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s7
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s8
+; GFX9-NEXT:    s_add_u32 s6, s9, s6
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s7
+; GFX9-NEXT:    s_mul_i32 s7, s3, s7
+; GFX9-NEXT:    s_add_u32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s8
+; GFX9-NEXT:    s_addc_u32 s1, s1, s10
+; GFX9-NEXT:    s_addc_u32 s6, s9, 0
+; GFX9-NEXT:    s_mul_i32 s7, s3, s8
+; GFX9-NEXT:    s_add_u32 s1, s1, s7
+; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_mul_hi_u32 s8, s1, 0x12d8fb
+; GFX9-NEXT:    s_mul_i32 s1, s1, 0x12d8fb
 ; GFX9-NEXT:    s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s3, s8
+; GFX9-NEXT:    s_subb_u32 s1, s3, s8
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s7, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s3, s2, 0
+; GFX9-NEXT:    s_subb_u32 s2, s1, 0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s7, v1
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s5, s3, 0
+; GFX9-NEXT:    s_subb_u32 s3, s2, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0x12d8fa
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
@@ -9653,19 +9654,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: srem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_add_u32 s4, s2, s4
-; GFX9-NEXT:    s_addc_u32 s5, s3, 0
-; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT:    s_sub_u32 s2, s2, s4
-; GFX9-NEXT:    s_subb_u32 s3, s3, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_u32 s0, s6, s0
+; GFX9-NEXT:    s_addc_u32 s1, s7, 0
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_sub_u32 s0, s6, s0
+; GFX9-NEXT:    s_subb_u32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 6eb7a4a9a90d6..ebbab5c2b9508 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -634,11 +634,11 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
-; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
 ; CHECK-NEXT:    v_exp_f16_e32 v1, v1
-; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -669,9 +669,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_exp_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f800000
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; CHECK-NEXT:    s_brev_b32 s4, 1
+; CHECK-NEXT:    s_brev_b32 s4, -2
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
-; CHECK-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 91abbfff7f2de..74a7ca243b56a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index d6137597293f6..c6233642110ea 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
 ; SI: s_load_dword [[B:s[0-9]+]]
 ; SI: s_load_dwordx2
 ; SI-NOT: and
-; SI: s_lshl_b32 [[A]], [[A]], 1
-; SI: s_lshl_b32 [[B]], [[B]], 1
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
@@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
 ; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
 ; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index b1134ae78cb97..17fe3adc22169 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
@@ -427,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT:    s_mov_b32 null, 0
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1921,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB6_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index bc5d2662dcb45..16f3ff4be6b50 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,21 +2,21 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -89,101 +89,101 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: add_i32_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB0_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s2
-; GFX1064-NEXT:    s_mov_b32 s9, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s6
+; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB0_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s2
-; GFX1032-NEXT:    s_mov_b32 s9, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s6
+; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB0_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_constant:
@@ -766,137 +766,137 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-LABEL: add_i32_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX9_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s4
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s4
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, m0
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s14, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[12:15], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB2_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v1
-; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s0, v1
+; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s7
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, s6
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s8, s8, s7
 ; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB2_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s0, v1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_add_i32 s4, s4, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s4, s1
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s4
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB2_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s0, v1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i32_varying:
@@ -1180,17 +1180,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: add_i32_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1202,33 +1202,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB2_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
-; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: add_i32_varying:
@@ -1249,50 +1249,50 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 15
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 16
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 32
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s9
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1064_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1064_DPP-NEXT:    buffer_atomic_add v0, off, s[0:3], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB2_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: add_i32_varying:
@@ -1309,44 +1309,44 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1032_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1032_DPP-NEXT:    buffer_atomic_add v0, off, s[0:3], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB2_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: add_i32_varying:
@@ -1712,110 +1712,109 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: add_i64_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB3_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_nop 2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s2
-; GFX1064-NEXT:    s_mov_b32 s9, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s6
+; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB3_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i64_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s2
-; GFX1032-NEXT:    s_mov_b32 s9, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s6
+; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB3_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i64_constant:
@@ -2480,160 +2479,160 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_ITERATIVE-LABEL: add_i64_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
-; GFX9_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
-; GFX9_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
+; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
+; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB5_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
 ; GFX9_ITERATIVE-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
-; GFX1064_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
-; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
+; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB5_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
-; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s0, v1
+; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s1, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1032_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s1
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s1
-; GFX1032_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
-; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s5
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s5
+; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
+; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s6, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB5_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v1
+; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i64_varying:
@@ -2987,22 +2986,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: add_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -3050,39 +3049,39 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX9_DPP-NEXT:    buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB5_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s1, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: add_i64_varying:
@@ -3145,59 +3144,59 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v4, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v4, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v3, 15
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v3, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v3, 47
-; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 16
-; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s2, 16
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v4, 47
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v4, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s11, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s10, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB5_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v10
-; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s2, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: add_i64_varying:
@@ -3248,51 +3247,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v3, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB5_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
-; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: add_i64_varying:
@@ -3808,104 +3807,104 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: sub_i32_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB6_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s2
-; GFX1064-NEXT:    s_mov_b32 s9, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s6
+; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB6_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s2
-; GFX1032-NEXT:    s_mov_b32 s9, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s6
+; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB6_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i32_constant:
@@ -4498,137 +4497,137 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX9_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s4
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s4
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, m0
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s14, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[12:15], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB8_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v1
-; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s0, v1
+; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s7
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, s6
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s8, s8, s7
 ; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB8_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s0, v1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_add_i32 s4, s4, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s4, s1
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s4
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB8_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s0, v1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
@@ -4912,17 +4911,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: sub_i32_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4934,33 +4933,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB8_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s0, v0
+; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: sub_i32_varying:
@@ -4981,50 +4980,50 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 15
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 16
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 32
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s9
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1064_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1064_DPP-NEXT:    buffer_atomic_sub v0, off, s[0:3], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB8_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: sub_i32_varying:
@@ -5041,44 +5040,44 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1032_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1032_DPP-NEXT:    buffer_atomic_sub v0, off, s[0:3], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB8_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: sub_i32_varying:
@@ -5445,117 +5444,117 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: sub_i64_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB9_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s2
-; GFX1064-NEXT:    s_mov_b32 s9, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s6
+; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB9_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i64_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s2
-; GFX1032-NEXT:    s_mov_b32 s9, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s6
+; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB9_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i64_constant:
@@ -6253,160 +6252,160 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
-; GFX9_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
-; GFX9_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
+; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
+; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB11_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
 ; GFX9_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
-; GFX1064_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
-; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
+; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB11_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
-; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s0, v1
+; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1032_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s1
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s1
-; GFX1032_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
-; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s5
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s5
+; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
+; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s6, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB11_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v1
+; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
@@ -6760,22 +6759,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: sub_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -6823,39 +6822,39 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX9_DPP-NEXT:    buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB11_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s1, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX9_DPP-NEXT:    v_subb_co_u32_e32 v8, vcc, v0, v8, vcc
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
@@ -6918,59 +6917,59 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v4, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v4, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v3, 15
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v3, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v3, 47
-; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 16
-; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s2, 16
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v4, 47
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v4, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s11, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s10, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1064_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1064_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB11_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v10
-; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s2, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: sub_i64_varying:
@@ -7021,51 +7020,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v3, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
-; GFX1032_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
+; GFX1032_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB11_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
-; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s2, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
-; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
+; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: sub_i64_varying:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 1439d4b40c951..c7296185422ce 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,17 +2,17 @@
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -1839,111 +1839,112 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
 ;
 ; GFX9-LABEL: add_i64_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s7, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_i32 s6, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mul_i32 s3, s7, s2
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT:    s_add_i32 s8, s8, s3
+; GFX9-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB5_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_nop 2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
-; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
-; GFX1064-NEXT:    s_add_i32 s8, s8, s7
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    s_mul_i32 s3, s7, s2
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064-NEXT:    s_add_i32 s8, s8, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB5_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
-; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i64_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
-; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
-; GFX1032-NEXT:    s_add_i32 s7, s7, s6
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032-NEXT:    s_mul_i32 s2, s7, s1
+; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT:    s_mul_i32 s1, s6, s1
+; GFX1032-NEXT:    s_add_i32 s3, s3, s2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB5_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
-; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i64_uniform:
@@ -5501,119 +5502,119 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ;
 ; GFX9-LABEL: sub_i64_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s7, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_i32 s6, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mul_i32 s3, s7, s2
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT:    s_add_i32 s8, s8, s3
+; GFX9-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB13_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v3
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
-; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
-; GFX1064-NEXT:    s_add_i32 s8, s8, s7
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    s_mul_i32 s3, s7, s2
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064-NEXT:    s_add_i32 s8, s8, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i64_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
-; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
-; GFX1032-NEXT:    s_add_i32 s7, s7, s6
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032-NEXT:    s_mul_i32 s2, s7, s1
+; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT:    s_mul_i32 s1, s6, s1
+; GFX1032-NEXT:    s_add_i32 s3, s3, s2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s0, s6, v2, 0
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i64_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 29704959fc176..f67fcd6e0caf5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
 ; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 
 declare i1 @llvm.amdgcn.wqm.vote(i1)
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index f636fa5d83a57..3a2efadac067d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
@@ -426,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT:    s_mov_b32 null, 0
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1504,14 +1505,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB5_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 3e8565d34c6be..d0c0b62c78e42 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
@@ -439,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT:    s_mov_b32 null, 0
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1674,14 +1675,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB6_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 2a7206c6ac43f..bc206656cf181 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope  -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 16ffdd7ebe421..d732da1a67bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -2131,14 +2131,26 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
 ; GFX10-NEXT:    global_store_short v[2:3], v5, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_store_fpimm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x4228
-; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
-; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: test_store_fpimm:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x3f80
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0x4228
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11TRUE16-NEXT:    global_store_b16 v[0:1], v5, off
+; GFX11TRUE16-NEXT:    global_store_b16 v[2:3], v4, off
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: test_store_fpimm:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX11FAKE16-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GFX11FAKE16-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX11FAKE16-NEXT:    global_store_b16 v[2:3], v5, off
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store bfloat 1.0, ptr addrspace(1) %ptr0
   store bfloat 42.0, ptr addrspace(1) %ptr1
   ret void
@@ -9310,6 +9322,72 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fadd_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fadd_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -13012,6 +13090,72 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fsub_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fsub_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -13615,6 +13759,72 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fmul_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fmul_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -17609,6 +17819,72 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_minnum_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_minnum_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -21162,6 +21438,72 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -24674,6 +25016,41 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_frexp_bf16_i16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_frexp_bf16_i16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11FAKE16-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
   ret { bfloat, i16 } %op
 }
@@ -30041,6 +30418,72 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX11TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX11FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -30501,6 +30944,60 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v2, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -31325,6 +31822,144 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX11TRUE16-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX11TRUE16-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GFX11TRUE16-NEXT:    v_cls_i32_e32 v10, v1
+; GFX11TRUE16-NEXT:    v_cls_i32_e32 v6, v5
+; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX11TRUE16-NEXT:    v_cls_i32_e32 v11, v3
+; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v8, v10, v8
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v7, v11, v9
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GFX11FAKE16-NEXT:    v_cls_i32_e32 v10, v1
+; GFX11FAKE16-NEXT:    v_cls_i32_e32 v6, v5
+; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX11FAKE16-NEXT:    v_cls_i32_e32 v11, v3
+; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v8, v10, v8
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v7, v11, v9
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -32056,6 +32691,74 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -32518,6 +33221,60 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v2, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -33204,6 +33961,120 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v6, v1
+; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v7, v5
+; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v8, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v1, v1, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v6, v1
+; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v7, v5
+; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v8, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v1, v1, v7
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -39040,6 +39911,84 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v3, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v6, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v6
+; GFX11TRUE16-NEXT:    v_bfe_u32 v0, v5, 16, 1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v5
+; GFX11TRUE16-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v3, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v6, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v6
+; GFX11FAKE16-NEXT:    v_bfe_u32 v0, v5, 16, 1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v5
+; GFX11FAKE16-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -39908,6 +40857,132 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11TRUE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11TRUE16-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11TRUE16-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11FAKE16-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11FAKE16-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 80732d5de1e20..ca339938161bd 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(
 define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
   %bc = bitcast i64 %undef to <2 x i32>
-  store volatile <2 x i32> %bc, ptr addrspace(1) %out
+  store <2 x i32> %bc, ptr addrspace(1) %out
   ret void
 }
 
@@ -83,7 +83,7 @@ define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractel
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
   %bc = bitcast i64 %undef to <2 x i32>
   %elt1 = extractelement <2 x i32> %bc, i32 1
-  store volatile i32 %elt1, ptr addrspace(1) %out
+  store i32 %elt1, ptr addrspace(1) %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 0bd030f1a3750..04d72691a088a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ec0408236975d..172ce4c065e13 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, exec_lo
@@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, exec_lo
@@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
@@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
@@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
@@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
@@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT:    v_max_f32_e32 v4, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, s18
-; GFX908-NEXT:    v_mov_b32_e32 v3, v1
-; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v1
-; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v7
-; GFX908-NEXT:    v_mov_b32_e32 v1, v8
-; GFX908-NEXT:    v_mov_b32_e32 v2, v9
-; GFX908-NEXT:    v_mov_b32_e32 v3, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
@@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v7
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NEXT:    v_mov_b32_e32 v3, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
@@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s6
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s6
+; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, s18
-; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v10, v3
-; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_mov_b32_e32 v8, v1
-; GFX908-NEXT:    v_mov_b32_e32 v7, v0
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v10, v3
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT:    ; implicit-def: $vgpr4
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
-; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
-; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX12-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v7, v6
-; GFX940-NEXT:    v_mov_b32_e32 v6, v5
-; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
-; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT:    ; implicit-def: $vgpr4
-; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX940-NEXT:  ; %bb.2:
-; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v6
-; GFX940-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX11-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX10-NEXT:    ; implicit-def: $vgpr4
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2:
-; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
-; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT:    ; implicit-def: $vgpr4
-; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT:  ; %bb.2:
-; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v8, v3
-; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
-; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    ; implicit-def: $vgpr4
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX908-NEXT:  ; %bb.2:
-; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX908-NEXT:    s_mov_b64 s[12:13], exec
-; GFX908-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v11
-; GFX908-NEXT:    v_mov_b32_e32 v1, v12
-; GFX908-NEXT:    v_mov_b32_e32 v2, v13
-; GFX908-NEXT:    v_mov_b32_e32 v3, v14
-; GFX908-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s18
+; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v3
+; GFX908-NEXT:    v_mov_b32_e32 v9, v2
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, v3
+; GFX8-NEXT:    v_mov_b32_e32 v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
+; GFX12-NEXT:    ; implicit-def: $vgpr4
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Loop Header: Depth=1
+; GFX12-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v6
+; GFX940-NEXT:    v_mov_b32_e32 v6, v5
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT:    ; implicit-def: $vgpr4
+; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2:
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v7
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Loop Header: Depth=1
+; GFX11-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4
+; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2:
+; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v8, v3
+; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    ; implicit-def: $vgpr4
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2:
+; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Loop Header: Depth=1
+; GFX908-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX908-NEXT:    s_mov_b64 s[12:13], exec
+; GFX908-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v11
+; GFX908-NEXT:    v_mov_b32_e32 v1, v12
+; GFX908-NEXT:    v_mov_b32_e32 v2, v13
+; GFX908-NEXT:    v_mov_b32_e32 v3, v14
+; GFX908-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX908-NEXT:    v_mov_b32_e32 v14, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v13, v0
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    ; implicit-def: $vgpr4
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v1, v12
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v14
+; GFX8-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX8-NEXT:    v_mov_b32_e32 v14, v1
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v13, v0
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX7-NEXT:    ; implicit-def: $vgpr4
+; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2:
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v5
+; GFX7-NEXT:    v_mov_b32_e32 v1, v6
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX6-NEXT:    ; implicit-def: $vgpr4
+; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:  ; %bb.2:
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v1, v6
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x800
+; GFX10-NEXT:    v_mov_b32_e32 v6, s4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10
+; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mov_b32_e32 v6, s6
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v10, v1
+; GFX7-NEXT:    v_mov_b32_e32 v9, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v7
+; GFX7-NEXT:    v_mov_b32_e32 v1, v8
+; GFX7-NEXT:    v_mov_b32_e32 v2, v9
+; GFX7-NEXT:    v_mov_b32_e32 v3, v10
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX6-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v9, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX908-NEXT:    v_mov_b32_e32 v14, v1
-; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
-; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    ; implicit-def: $vgpr4
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX8-NEXT:  ; %bb.2:
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX8-NEXT:    s_mov_b64 s[12:13], exec
-; GFX8-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v11
-; GFX8-NEXT:    v_mov_b32_e32 v1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v14
-; GFX8-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX8-NEXT:    v_mov_b32_e32 v14, v1
-; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX7-NEXT:    ; implicit-def: $vgpr4
-; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2:
-; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX6-NEXT:    ; implicit-def: $vgpr4
-; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX6-NEXT:  ; %bb.2:
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
 ; half
 ; --------------------------------------------------------------------
 
-define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_max_num_f16_e32 v10, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX12-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX940-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX940-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_max_f16_e32 v10, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX11-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX10-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX10-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX90A-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX90A-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX908-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX908-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX8-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX8-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX7-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX7-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX6-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX6-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
@@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX940-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
@@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v4, v11
@@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX908-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_max_f32_e32 v4, v4, v10
@@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v10
@@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX7-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX6-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
@@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v1, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v0
@@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
@@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
@@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
@@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_pk_max_num_f16 v8, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v6, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX940-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_pk_max_f16 v6, v4, v9
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_pk_max_f16 v8, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX90A-NEXT:    v_pk_max_f16 v6, v4, v9
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX908-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
@@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
@@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
@@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
-; GFX6-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v4
@@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
@@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v0
@@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v0
@@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
@@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
@@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
@@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v0
@@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
@@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX12-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
@@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
-; GFX940-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX940-NEXT:    v_max_f32_e32 v4, v4, v9
@@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
 ; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX940-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
@@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
@@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v4, v9
@@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX90A-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
@@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX908-NEXT:    v_max_f32_e32 v4, v4, v8
@@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v8
@@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
@@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; misc
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v0, v2
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
-; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v0
@@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s18
-; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s6
-; GFX7-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s6
-; GFX6-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v1, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-
+attributes #0 = { nounwind }
 
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index cd01cc7309fcd..61ee956747135 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, exec_lo
@@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, exec_lo
@@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
@@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
@@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
@@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
@@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT:    v_min_f32_e32 v4, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, s18
-; GFX908-NEXT:    v_mov_b32_e32 v3, v1
-; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v1
-; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v7
-; GFX908-NEXT:    v_mov_b32_e32 v1, v8
-; GFX908-NEXT:    v_mov_b32_e32 v2, v9
-; GFX908-NEXT:    v_mov_b32_e32 v3, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
@@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v7
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NEXT:    v_mov_b32_e32 v3, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
@@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s6
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s6
+; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, s18
-; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v10, v3
-; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_mov_b32_e32 v8, v1
-; GFX908-NEXT:    v_mov_b32_e32 v7, v0
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v10, v3
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT:    ; implicit-def: $vgpr4
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
-; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
-; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX12-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v7, v6
-; GFX940-NEXT:    v_mov_b32_e32 v6, v5
-; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
-; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT:    ; implicit-def: $vgpr4
-; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX940-NEXT:  ; %bb.2:
-; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v6
-; GFX940-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX11-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX10-NEXT:    ; implicit-def: $vgpr4
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2:
-; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
-; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT:    ; implicit-def: $vgpr4
-; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT:  ; %bb.2:
-; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v8, v3
-; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
-; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    ; implicit-def: $vgpr4
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX908-NEXT:  ; %bb.2:
-; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX908-NEXT:    s_mov_b64 s[12:13], exec
-; GFX908-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v11
-; GFX908-NEXT:    v_mov_b32_e32 v1, v12
-; GFX908-NEXT:    v_mov_b32_e32 v2, v13
-; GFX908-NEXT:    v_mov_b32_e32 v3, v14
-; GFX908-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s18
+; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v3
+; GFX908-NEXT:    v_mov_b32_e32 v9, v2
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, v3
+; GFX8-NEXT:    v_mov_b32_e32 v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
+; GFX12-NEXT:    ; implicit-def: $vgpr4
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Loop Header: Depth=1
+; GFX12-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v6
+; GFX940-NEXT:    v_mov_b32_e32 v6, v5
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT:    ; implicit-def: $vgpr4
+; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2:
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v7
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Loop Header: Depth=1
+; GFX11-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4
+; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2:
+; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v8, v3
+; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    ; implicit-def: $vgpr4
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2:
+; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Loop Header: Depth=1
+; GFX908-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX908-NEXT:    s_mov_b64 s[12:13], exec
+; GFX908-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v11
+; GFX908-NEXT:    v_mov_b32_e32 v1, v12
+; GFX908-NEXT:    v_mov_b32_e32 v2, v13
+; GFX908-NEXT:    v_mov_b32_e32 v3, v14
+; GFX908-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX908-NEXT:    v_mov_b32_e32 v14, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v13, v0
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    ; implicit-def: $vgpr4
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v1, v12
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v14
+; GFX8-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX8-NEXT:    v_mov_b32_e32 v14, v1
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v13, v0
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX7-NEXT:    ; implicit-def: $vgpr4
+; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2:
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v5
+; GFX7-NEXT:    v_mov_b32_e32 v1, v6
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX6-NEXT:    ; implicit-def: $vgpr4
+; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:  ; %bb.2:
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v1, v6
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x800
+; GFX10-NEXT:    v_mov_b32_e32 v6, s4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10
+; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mov_b32_e32 v6, s6
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v10, v1
+; GFX7-NEXT:    v_mov_b32_e32 v9, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v7
+; GFX7-NEXT:    v_mov_b32_e32 v1, v8
+; GFX7-NEXT:    v_mov_b32_e32 v2, v9
+; GFX7-NEXT:    v_mov_b32_e32 v3, v10
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX6-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v9, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX908-NEXT:    v_mov_b32_e32 v14, v1
-; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
-; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    ; implicit-def: $vgpr4
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX8-NEXT:  ; %bb.2:
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX8-NEXT:    s_mov_b64 s[12:13], exec
-; GFX8-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v11
-; GFX8-NEXT:    v_mov_b32_e32 v1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v14
-; GFX8-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX8-NEXT:    v_mov_b32_e32 v14, v1
-; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX7-NEXT:    ; implicit-def: $vgpr4
-; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2:
-; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX6-NEXT:    ; implicit-def: $vgpr4
-; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX6-NEXT:  ; %bb.2:
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
 ; half
 ; --------------------------------------------------------------------
 
-define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_max_num_f16_e32 v10, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX12-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX940-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX940-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_max_f16_e32 v10, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX11-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX10-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX10-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX90A-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX90A-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX908-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX908-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX8-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX8-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX7-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX7-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX6-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX6-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
@@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX940-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
@@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v11
@@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX908-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_min_f32_e32 v4, v4, v10
@@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v10
@@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX7-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX6-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
@@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v1, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v0
@@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
@@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
@@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
@@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_pk_max_num_f16 v8, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v6, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX940-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_pk_min_f16 v6, v4, v9
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_pk_max_f16 v8, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX90A-NEXT:    v_pk_min_f16 v6, v4, v9
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX908-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
@@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
@@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
@@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
-; GFX6-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v4
@@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
@@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v0
@@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v0
@@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
@@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
@@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
@@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v0
@@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
@@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX12-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
@@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
-; GFX940-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX940-NEXT:    v_min_f32_e32 v4, v4, v9
@@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
 ; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX940-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
@@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
@@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v9
@@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX90A-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
@@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX908-NEXT:    v_min_f32_e32 v4, v4, v8
@@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v8
@@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
@@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; misc
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v4, v0, v2
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
-; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v0
@@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s18
-; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s6
-; GFX7-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s6
-; GFX6-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v1, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-
+attributes #0 = { nounwind }
 
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 8293280609517..13c4ff8b2ff30 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX940:       ; %bb.0: ; %entry
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s0, s7, 16
+; GFX940-NEXT:    s_lshl_b32 s1, s6, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 entry:
   %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index f248708d16ea2..0cc10512af5cd 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -157,26 +157,26 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: sadd64ri:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s2, 0x56789876
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0x1234
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s6, 0x56789876
+; GFX9-NEXT:    s_addc_u32 s1, s7, 0x1234
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: sadd64ri:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_add_u32 s2, s2, 0x56789876
-; GFX1010-NEXT:    s_addc_u32 s3, s3, 0x1234
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    s_add_u32 s0, s6, 0x56789876
+; GFX1010-NEXT:    s_addc_u32 s1, s7, 0x1234
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: sadd64ri:
@@ -255,23 +255,23 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: vadd64rr:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vadd64rr:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_add_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    v_add_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vadd64rr:
@@ -679,34 +679,34 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX9-LABEL: suaddo64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s6, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_addc_u32 s7, s5, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_add_u32 s0, s8, s10
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: suaddo64:
 ; GFX1010:       ; %bb.0:
-; GFX1010-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_add_u32 s6, s4, s6
-; GFX1010-NEXT:    s_addc_u32 s7, s5, s7
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1010-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT:    s_add_u32 s0, s8, s10
+; GFX1010-NEXT:    s_addc_u32 s1, s9, s11
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: suaddo64:
@@ -1048,26 +1048,26 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: ssub64ri:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s2, 0x56789876, s2
-; GFX9-NEXT:    s_subb_u32 s3, 0x1234, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_sub_u32 s0, 0x56789876, s6
+; GFX9-NEXT:    s_subb_u32 s1, 0x1234, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: ssub64ri:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_sub_u32 s2, 0x56789876, s2
-; GFX1010-NEXT:    s_subb_u32 s3, 0x1234, s3
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    s_sub_u32 s0, 0x56789876, s6
+; GFX1010-NEXT:    s_subb_u32 s1, 0x1234, s7
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: ssub64ri:
@@ -1146,23 +1146,23 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: vsub64rr:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vsub64rr:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_sub_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    v_sub_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vsub64rr:
@@ -1571,34 +1571,34 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX9-LABEL: susubo64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s6, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_subb_u32 s7, s5, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_sub_u32 s0, s8, s10
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_subb_u32 s1, s9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: susubo64:
 ; GFX1010:       ; %bb.0:
-; GFX1010-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_sub_u32 s6, s4, s6
-; GFX1010-NEXT:    s_subb_u32 s7, s5, s7
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1010-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT:    s_sub_u32 s0, s8, s10
+; GFX1010-NEXT:    s_subb_u32 s1, s9, s11
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: susubo64:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index e1717a816de0d..45324392aacde 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -40,13 +40,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_add_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f32:
@@ -117,14 +117,14 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_multi_use_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -194,13 +194,13 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_dbg_use_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_dbg_use_src_f32:
@@ -267,14 +267,14 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_add_neg_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_neg_src_f32:
@@ -342,14 +342,14 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_non_clamp_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_non_clamp_max_f32:
@@ -413,13 +413,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_add_src_f32_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f32_denormals:
@@ -485,13 +485,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_add_src_f16_denorm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f16_denorm:
@@ -557,13 +557,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
@@ -629,14 +629,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f32:
@@ -701,13 +701,13 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_add_src_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f64:
@@ -866,13 +866,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
@@ -947,13 +947,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
@@ -1038,14 +1038,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
@@ -1124,14 +1124,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
@@ -1212,14 +1212,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
@@ -1382,14 +1382,14 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
@@ -1469,14 +1469,14 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
@@ -1553,15 +1553,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 9b6c50c10d90d..94482d7f0d809 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -41,13 +41,13 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32:
@@ -126,13 +126,13 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f32:
@@ -212,13 +212,13 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f32:
@@ -304,15 +304,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_negzero_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negzero_f32:
@@ -400,15 +400,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
@@ -498,15 +498,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_multi_use_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
 ; GFX9-NEXT:    v_min_f32_e32 v2, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -600,13 +600,13 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f16:
@@ -686,13 +686,13 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f16:
@@ -773,13 +773,13 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f16:
@@ -861,13 +861,13 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f64:
@@ -946,13 +946,13 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f64:
@@ -1032,13 +1032,13 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f64:
@@ -1122,14 +1122,14 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_brev_b32 s0, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_brev_b32 s2, 1
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_med3_f32 v1, s2, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_med3_f32 v1, s0, 1.0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
@@ -1206,13 +1206,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_f32:
@@ -1289,13 +1289,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_bay_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bay_f32:
@@ -1372,13 +1372,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_yab_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yab_f32:
@@ -1455,13 +1455,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_yba_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yba_f32:
@@ -1538,13 +1538,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_ayb_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_ayb_f32:
@@ -1621,13 +1621,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_bya_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bya_f32:
@@ -2091,14 +2091,14 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
@@ -2179,13 +2179,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
@@ -2267,14 +2267,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
@@ -2356,14 +2356,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
@@ -2444,13 +2444,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
@@ -2527,13 +2527,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
@@ -2610,13 +2610,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
@@ -2693,13 +2693,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 1.0, 0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
@@ -2776,13 +2776,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, 0, v1, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
@@ -2859,13 +2859,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, 1.0, v1, 0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
@@ -3078,13 +3078,13 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16:
@@ -3180,13 +3180,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_elt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_elt:
@@ -3277,15 +3277,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_clamp_v2f16_not_zero:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 2.0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_not_zero:
@@ -3381,15 +3381,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_v2f16_not_one:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_not_one:
@@ -3483,13 +3483,13 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_clamp_neg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_v2f16:
@@ -3578,14 +3578,14 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_negabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_v2f16:
@@ -3678,13 +3678,13 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_neglo_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neglo_v2f16:
@@ -3774,13 +3774,13 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_neghi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neghi_v2f16:
@@ -3870,13 +3870,13 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_v2f16_shuffle:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_shuffle:
@@ -3973,13 +3973,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
@@ -4075,13 +4075,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
@@ -4163,18 +4163,18 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_diff_source_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
-; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_add_f32_e32 v1, s0, v1
+; GFX9-NEXT:    v_add_f32_e32 v2, s0, v2
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v2 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:12
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5] offset:12
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_diff_source_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index fad1d47f55fd7..bada3d904fbe3 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX10-LABEL: cluster_load_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_clause 0x3
@@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 16
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    s_add_u32 s2, s2, 24
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    s_add_u32 s0, s6, 16
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
@@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX10-LABEL: cluster_load_valu_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    s_add_u32 s4, s2, 16
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s5, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_add_u32 s0, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index e2b4865410db8..3216e71e6221a 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -96,16 +96,29 @@ define void @private_alloca_to_flat(ptr %ptr) {
 ; OPT-NEXT:    store volatile i32 7, ptr [[TMP1]], align 4
 ; OPT-NEXT:    ret void
 ;
-; ASM-LABEL: private_alloca_to_flat:
-; ASM:       ; %bb.0:
-; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
-; ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; ASM-NEXT:    v_mov_b32_e32 v1, s5
-; ASM-NEXT:    v_mov_b32_e32 v2, 7
-; ASM-NEXT:    flat_store_dword v[0:1], v2
-; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; ASM-NEXT:    s_setpc_b64 s[30:31]
+; DAGISEL-ASM-LABEL: private_alloca_to_flat:
+; DAGISEL-ASM:       ; %bb.0:
+; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-ASM-LABEL: private_alloca_to_flat:
+; GISEL-ASM:       ; %bb.0:
+; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT:    s_lshr_b32 s4, s32, 6
+; GISEL-ASM-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GISEL-ASM-NEXT:    s_mov_b32 s5, s7
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i8, addrspace(5)
   %x = addrspacecast ptr addrspace(5) %alloca to ptr
   store volatile i32 7, ptr %x
@@ -224,8 +237,9 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
 ; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-ASM-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-ASM-NEXT:    s_lshr_b32 s6, s32, 6
 ; GISEL-ASM-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, s6
 ; GISEL-ASM-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GISEL-ASM-NEXT:  ; %bb.1: ; %then
 ; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 0xffff, v1
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index df223b3ec1354..b5e0589cf9e46 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_zext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_sext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 48bd8f9b80799..1e2a2dcf1e333 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -4,13 +4,13 @@
 define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadCombine:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -37,14 +37,14 @@ entry:
 define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadShuffle:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x7050604
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    s_mov_b32 s0, 0x7050604
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index c57ee9cc6a1e2..ce181b6223e41 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:252
-; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], vcc
-; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; GCN-NEXT:    s_cselect_b32 s2, 2, 3
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, 2, 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 entry:                                             ; preds = %1009
   %0 = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index af4f236c783c6..b2e4117096ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -154,4 +154,226 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
   ret <2 x double> %pow_sign1
 }
 
+define float @copysign_f32_f32_sign_known_p0_or_n0(float %x, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define double @copysign_f64_f32_sign_known_p0_or_n0(double %x, i32 %y.i) {
+; GFX9-LABEL: copysign_f64_f32_sign_known_p0_or_n0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %y.even.as.f32.fpext = fpext float %y.even.as.f32 to double
+  %copysign = call double @llvm.copysign.f64(double %x, double %y.even.as.f32.fpext)
+  ret double %copysign
+}
+
+define half @copysign_f16_f32_sign_known_p0_or_n0(half %x, i32 %y.i) {
+; GFX9-LABEL: copysign_f16_f32_sign_known_p0_or_n0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %y.even.as.f32.fptrunc = fptrunc float %y.even.as.f32 to half
+  %copysign = call half @llvm.copysign.f16(half %x, half %y.even.as.f32.fptrunc)
+  ret half %copysign
+}
+
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs(float %x.arg, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x = call float @llvm.fabs.f32(float %x.arg)
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select(float %x.arg, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x.ule.0 = fcmp ule float %x.arg, 0.0
+  %x = select i1 %x.ule.0, float 0.0, float %x.arg
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_sqrt(float %x.arg, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_sqrt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_sqrt_f32_e32 v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; GFX9-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x260
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x = call nnan nsz float @llvm.sqrt.f32(float %x.arg)
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt(float %x.arg, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_sqrt_f32_e32 v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; GFX9-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x260
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x = call nsz float @llvm.sqrt.f32(float %x.arg)
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqrt(float %x.arg, i32 %y.i) {
+; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqrt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_sqrt_f32_e32 v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; GFX9-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x260
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x = call nnan float @llvm.sqrt.f32(float %x.arg)
+  %y.even = shl i32 %y.i, 31
+  %y.even.as.f32 = bitcast i32 %y.even to float
+  %copysign = call float @llvm.copysign.f32(float %x, float %y.even.as.f32)
+  ret float %copysign
+}
+
+define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
+; GFX9-LABEL: test_copysign_pow_fast_f32__integral_y:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0x800000
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; GFX9-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; GFX9-NEXT:    v_log_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; GFX9-NEXT:    v_fma_f32 v2, v2, v1, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %y = sitofp i32 %y.i to float
+  %y.fptosi = fptosi float %y to i32
+  %fabs = call fast float @llvm.fabs.f32(float %x)
+  %log2 = call fast float @llvm.log2.f32(float %fabs)
+  %pownI2F = sitofp i32 %y.i to float
+  %ylogx = fmul fast float %log2, %pownI2F
+  %exp2 = call fast float @llvm.exp2.f32(float %ylogx)
+  %yeven = shl i32 %y.fptosi, 31
+  %x.i32 = bitcast float %x to i32
+  %pow_sign = and i32 %yeven, %x.i32
+  %pow_sign.f32 = bitcast i32 %pow_sign to float
+  %pow_sign1 = call fast float @llvm.copysign.f32(float %exp2, float %pow_sign.f32)
+  ret float %pow_sign1
+}
+
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 63b9d68123fa4..a0b549711f339 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32:
@@ -278,32 +278,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
@@ -414,11 +414,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -428,16 +428,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -447,7 +447,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
@@ -555,28 +555,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_ctlz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8:
@@ -742,24 +742,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
@@ -852,25 +852,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -878,7 +878,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64:
@@ -981,33 +981,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
@@ -1099,29 +1099,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1208,29 +1208,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1326,32 +1326,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1450,32 +1450,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1569,22 +1569,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1593,9 +1593,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1691,25 +1691,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1812,23 +1812,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1841,7 +1841,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index f16f05811c185..2168e7fe1dd28 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -561,14 +561,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
   %ctlz_ret = icmp ne i64 %val, 0
@@ -649,17 +649,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
@@ -753,11 +753,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
@@ -765,7 +765,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
@@ -869,13 +869,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -886,7 +886,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -1050,17 +1050,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1081,7 +1081,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1158,11 +1158,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1282,12 +1282,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
@@ -1364,17 +1364,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1454,17 +1454,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1534,16 +1534,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1613,16 +1613,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1697,21 +1697,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1799,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1888,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1972,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2057,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2142,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 02b0b1cc28fa8..14e6c4bcf6d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_cttz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %cttz to i32
@@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
-; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, s0
+; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
@@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 2491abe4bc1ce..81ed823bad204 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
   %cttz_ret = icmp ne i64 %val, 0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
@@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
@@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
@@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 96969a12b2c58..f3f749b5c054b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX10-LABEL: load_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_i8_to_f32:
@@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
@@ -1104,30 +1104,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
@@ -1198,32 +1198,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
@@ -1318,15 +1318,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -1335,19 +1335,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1356,7 +1356,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
@@ -1481,15 +1481,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX10-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
+; GFX10-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 8, v1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1499,21 +1499,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v7, v4, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dword v7, v4, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_mov_b32 s0, 0x4000405
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
-; GFX9-NEXT:    s_mov_b32 s4, 0x4000405
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v3, 8, v1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1522,9 +1522,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s4
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dword v5, v4, s[2:3]
+; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s0
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dword v5, v4, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1653,12 +1653,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
@@ -1676,22 +1676,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dword v4, v5, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s5, 0x900
+; GFX9-NEXT:    s_movk_i32 s0, 0xff00
+; GFX9-NEXT:    s_movk_i32 s1, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1699,17 +1698,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
-; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v5, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
@@ -1851,17 +1849,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
@@ -1875,22 +1873,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
@@ -1904,8 +1902,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
@@ -2004,11 +2002,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2018,17 +2016,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -2038,8 +2036,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
@@ -2114,28 +2112,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
@@ -2201,26 +2199,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
@@ -2285,24 +2283,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
@@ -2388,15 +2386,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -2405,19 +2403,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -2426,7 +2424,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
@@ -2500,26 +2498,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
@@ -2583,26 +2581,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
@@ -2667,26 +2665,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
@@ -2751,26 +2749,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
@@ -2840,30 +2838,30 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX10-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX10-NEXT:    global_load_dword v0, v0, s[4:5]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX10-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 6799980c18439..de14d64dbf7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -8,13 +8,13 @@
 define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: add:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: sub:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: or:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
 define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xor:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: nand:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_not_b32_e32 v0, v3
 ; CHECK-NEXT:    v_or_b32_e32 v2, -2, v0
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
 define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
 define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
 define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
 define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: cmpxchg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(
 define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xchg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_swap v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_swap v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: inc:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_inc v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_inc v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: dec:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_dec v2, v0, v1, s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    global_atomic_dec v2, v0, v1, s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fadd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, 1.0, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB18_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fsub:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, -1.0, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmin:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmax:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
index b0b9bbe4835ed..dfc28539ea814 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
@@ -28,3 +28,20 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
   store i32 %maybe.not.uniform.load, ptr addrspace(1) undef
   ret void
 }
+
+; This decomposes into a sequence of divergent sub carries. The first
+; subs in the sequence are divergent from the value inputs, but the
+; last values are divergent due to the carry in glue (such that
+; divergence needs to propagate through glue if there are any non-void
+; outputs)
+; GCN-LABEL: {{^}}wide_carry_divergence_error:
+; GCN: v_sub_u32_e32
+; GCN: v_subb_u32_e32
+; GCN: v_subbrev_u32_e32
+; GCN: v_subbrev_u32_e32
+define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
+  %i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
+  %i1 = sub i128 0, %i
+  %i2 = insertelement <2 x i128> zeroinitializer, i128 %i1, i64 0
+  ret <2 x i128> %i2
+}
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index f0ab3a5342e01..fea1303d0a2b7 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -482,28 +482,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -514,7 +507,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1048,10 +1040,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -2695,28 +2687,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -2727,7 +2712,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -3261,10 +3245,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index f298a95c63485..5fae60a7acac1 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX9-LABEL: uniform_vec_i16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s0
 ; GFX9-NEXT:    ;;#ASMEND
@@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX906-LABEL: uniform_vec_i16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
@@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
@@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
@@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX9-LABEL: uniform_vec_f16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s0
 ; GFX9-NEXT:    ;;#ASMEND
@@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX906-LABEL: uniform_vec_f16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index ae4d302e04a7c..5b39cc2e185b7 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -15,13 +15,16 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
   ; GCN-NEXT:   [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; GCN-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -65,15 +68,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY4]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY killed [[S_LOAD_DWORDX2_IMM1]]
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY5]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY3]], killed [[S_MOV_B32_2]], implicit-def $scc
   ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -122,13 +126,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
   ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
   ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY7]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
-  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
-  ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
+  ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index b4218bc2afc7f..60ce730c3eed3 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX940
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
 
 ; GCN-LABEL: {{^}}dpp64_ceil:
 ; GCN:           global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
@@ -69,6 +69,27 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
   ret void
 }
 
+; GCN-LABEL: {{^}}dpp64_loop:
+; GCN: v_mov_b32_dpp
+; DPP64: v_mov_b32_dpp
+; GFX90A: v_add_co_u32_e32
+; GFX90A: v_addc_co_u32_e32
+; GFX940: v_lshl_add_u64
+; GFX10: v_mov_b32_dpp
+; GFX10: v_add_co_u32
+; GFX10: v_add_co_ci_u32_e32
+; GFX11: v_add_co_u32_e64_dpp
+; GFX11: v_add_co_ci_u32_e32
+define amdgpu_cs void @dpp64_loop(i64 %arg) {
+bb:
+  br label %bb1
+bb1:
+  %i = call i64 @llvm.amdgcn.update.dpp.i64(i64 0, i64 0, i32 0, i32 0, i32 0, i1 false)
+  %i2 = add i64 %i, %arg
+  %i3 = atomicrmw add ptr addrspace(1) null, i64 %i2 monotonic, align 8
+  br label %bb1
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
 declare double @llvm.ceil.f64(double)
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 9f191fa69f654..9649cdc6001cd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
@@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
@@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
 ;
 ; GFX9-LABEL: simple_write2_two_val_too_far_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
 ;
 ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX9-ALIGNED:       ; %bb.0:
-; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x8
-; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
+; GFX9-ALIGNED-NEXT:    s_load_dword s6, s[2:3], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
@@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
 ;
 ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x8
-; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
+; GFX9-UNALIGNED-NEXT:    s_load_dword s6, s[2:3], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
index 99ea18c37050b..77bc9729ee845 100644
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
 
 --- |
   define amdgpu_ps void @early_term_scc0_end_block() {
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
new file mode 100644
index 0000000000000..78fb25a76d25e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -0,0 +1,1072 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX8 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX900 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX90A %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1010 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1100 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1200 %s
+
+---
+name: s_copy_frame_index_elimination_failure_pei
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: default, offset: 8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: '', type: default, offset: 24, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX8-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX8: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX8-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX8-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+    ; GFX8-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; GFX8-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX8-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX8-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+    ; GFX8-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX8-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX8-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX8-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+    ; GFX8-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX8-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX8-NEXT: SI_RETURN
+    ;
+    ; GFX900-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX900: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX900-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX900-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+    ; GFX900-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; GFX900-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX900-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX900-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+    ; GFX900-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX900-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX900-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX900-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+    ; GFX900-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX900-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX900-NEXT: SI_RETURN
+    ;
+    ; GFX90A-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX90A: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX90A-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+    ; GFX90A-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; GFX90A-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX90A-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX90A-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+    ; GFX90A-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX90A-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX90A-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GFX90A-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+    ; GFX90A-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX90A-NEXT: SI_RETURN
+    ;
+    ; GFX1010-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX1010: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX1010-NEXT: {{  $}}
+    ; GFX1010-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX1010-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX1010-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+    ; GFX1010-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc
+    ; GFX1010-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX1010-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc
+    ; GFX1010-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+    ; GFX1010-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX1010-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX1010-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc
+    ; GFX1010-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+    ; GFX1010-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX1010-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX1010-NEXT: SI_RETURN
+    ;
+    ; GFX1100-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX1100: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX1100-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX1100-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+    ; GFX1100-NEXT: renamable $sgpr24 = S_MOV_B32 $sgpr32
+    ; GFX1100-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX1100-NEXT: $sgpr22 = S_ADD_I32 $sgpr32, 4, implicit-def $scc
+    ; GFX1100-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
+    ; GFX1100-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX1100-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+    ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
+    ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+    ; GFX1100-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX1100-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX1100-NEXT: SI_RETURN
+    ;
+    ; GFX1200-LABEL: name: s_copy_frame_index_elimination_failure_pei
+    ; GFX1200: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    ; GFX1200-NEXT: renamable $sgpr17 = S_MOV_B32 0
+    ; GFX1200-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+    ; GFX1200-NEXT: renamable $sgpr24 = S_MOV_B32 $sgpr32
+    ; GFX1200-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+    ; GFX1200-NEXT: $sgpr22 = S_ADD_I32 $sgpr32, 4, implicit-def $scc
+    ; GFX1200-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
+    ; GFX1200-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    ; GFX1200-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+    ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
+    ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+    ; GFX1200-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    ; GFX1200-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    ; GFX1200-NEXT: SI_RETURN
+    renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+    renamable $sgpr17 = S_MOV_B32 0
+    undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+    renamable $sgpr24 = S_MOV_B32 %stack.0
+    renamable $sgpr29 = COPY undef renamable $sgpr30
+    renamable $sgpr20 = S_MOV_B32 %stack.1
+    undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+    undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+    renamable $sgpr31 = S_MOV_B32 %stack.2
+    renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+    renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+    dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+    SI_RETURN
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:   |
+  bb.0:
+    liveins: $sgpr4, $sgpr5, $vgpr0
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+    ; GFX8-NEXT: $vgpr0, dead $sgpr0_sgpr1 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1010-NEXT: {{  $}}
+    ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr0 = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr0, 0, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr0 = S_BITSET0_B32 0, $sgpr0
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr0
+    ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+    ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr0 = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr0, 0, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr0 = S_BITSET0_B32 0, $sgpr0
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr0
+    ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+  V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
+
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: '$sgpr32'
+body:   |
+  bb.0:
+    liveins: $sgpr4, $sgpr5, $vgpr0
+
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128
+    ; GFX8-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GFX8-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GFX900-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+    ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+    ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+    ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+    ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+    ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+    ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+    ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+    ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+    ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+    ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+    ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+    ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+    ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+    ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+    ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 64, killed $vgpr1, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+    ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+    ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+    ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+    ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+    ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+    ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+    ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+    ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+    ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+    ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+    ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+    ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+    ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+    ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX1010-NEXT: {{  $}}
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1010-NEXT: $vgpr64 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX1010-NEXT: $vgpr64 = V_ADD_U32_e32 128, killed $vgpr64, implicit $exec
+    ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr64, implicit $exec
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1010-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX1010-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX1010-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX1010-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX1010-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX1010-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX1010-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX1010-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX1010-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX1010-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX1010-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX1010-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX1010-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX1010-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX1010-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX1010-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr5 = S_ADDC_U32 $sgpr32, 128, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr5, 0, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr5 = S_BITSET0_B32 0, $sgpr5
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr5
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1100-NEXT: $vgpr63 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX1100-NEXT: $vgpr62 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX1100-NEXT: $vgpr61 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX1100-NEXT: $vgpr60 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX1100-NEXT: $vgpr59 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX1100-NEXT: $vgpr58 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX1100-NEXT: $vgpr57 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX1100-NEXT: $vgpr56 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX1100-NEXT: $vgpr47 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX1100-NEXT: $vgpr46 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX1100-NEXT: $vgpr45 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX1100-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX1100-NEXT: $vgpr43 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX1100-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX1100-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX1100-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+    ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr5 = S_ADDC_U32 $sgpr32, 128, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr5, 0, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr5 = S_BITSET0_B32 0, $sgpr5
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr5
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX1200-NEXT: $vgpr63 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX1200-NEXT: $vgpr62 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX1200-NEXT: $vgpr61 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX1200-NEXT: $vgpr60 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX1200-NEXT: $vgpr59 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX1200-NEXT: $vgpr58 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX1200-NEXT: $vgpr57 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX1200-NEXT: $vgpr56 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX1200-NEXT: $vgpr47 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX1200-NEXT: $vgpr46 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX1200-NEXT: $vgpr45 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX1200-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX1200-NEXT: $vgpr43 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX1200-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX1200-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX1200-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+  S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+  S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+  S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+  S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+  S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+
+  V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+
+  S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+  S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+  S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+  S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+  S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+...
+
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:   |
+  bb.0:
+    liveins: $sgpr4, $sgpr5, $vgpr0
+
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1010-NEXT: {{  $}}
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+    ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+    ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+    ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+  S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+  S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+  S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+  S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+  S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+  S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+
+  V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+
+  S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+  S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+  S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+  S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+  S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+  S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5, $vgpr0
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+    ;
+    ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1010-NEXT: {{  $}}
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+    ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+    ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+    ;
+    ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+    ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+    ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+    ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+    ; GFX1200-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+    ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+  S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+  S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+  S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+  S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+  S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+  S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+
+  V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+
+  S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+  S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+  S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+  S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+  S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+  S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index bd483f4c07071..fe672f1b3b131 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1842,21 +1842,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; GFX9-LABEL: s_copysign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s6, 16
+; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index f53d3cf33c9cc..7c89efd0a713c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -197,24 +197,24 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ;
 ; GFX9-LABEL: v_rcp_f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16:
@@ -293,24 +293,24 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_abs:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, |v1|
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_abs:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, |v1|
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_abs:
@@ -392,24 +392,24 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
 ;
 ; GFX9-LABEL: reciprocal_f16_rounded:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: reciprocal_f16_rounded:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: reciprocal_f16_rounded:
@@ -475,24 +475,24 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_afn:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_afn:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_afn:
@@ -571,24 +571,24 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_neg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_neg:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_neg:
@@ -670,24 +670,24 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ;
 ; GFX9-LABEL: v_rsq_f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16:
@@ -771,26 +771,26 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rsq_f16_neg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_neg:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_neg:
@@ -879,28 +879,28 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
 ;
 ; GFX9-LABEL: v_rsq_f16_multi_use:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v2, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_multi_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v2, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_multi_use:
@@ -987,26 +987,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
 ;
 ; GFX9-LABEL: v_rsq_f16_missing_contract0:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract0:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract0:
@@ -1092,26 +1092,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
 ;
 ; GFX9-LABEL: v_rsq_f16_missing_contract1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract1:
@@ -1197,26 +1197,26 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
 ;
 ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index c6b730e3fd5d6..93105e57a5918 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
@@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX10-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
@@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
-; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
+; GFX10-NEXT:    v_mul_f32_e32 v1, s7, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
@@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s3
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s2
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s7
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
@@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
@@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index a3424793fdc4d..ea2427a3c420f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -929,29 +929,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -982,56 +964,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX7-NEXT:    flat_load_dword v3, v[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
@@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v4, v[0:1]
+; GFX7-NEXT:    flat_load_dword v5, v[5:6]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v7, v5
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
+; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
@@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 
 !0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 0d954e277cdd5..2767b66e44703 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -929,29 +929,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -982,56 +964,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX7-NEXT:    flat_load_dword v3, v[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
@@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v4, v[0:1]
+; GFX7-NEXT:    flat_load_dword v5, v[5:6]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v7, v5
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
+; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
@@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 
 !0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 087d38ce7b004..89da9b8e75bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -37,17 +37,17 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff1_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -78,12 +78,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -114,8 +116,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -166,13 +169,13 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -208,10 +211,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -246,10 +251,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -300,13 +307,13 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -342,10 +349,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -380,10 +389,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -434,18 +445,18 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff2_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -478,14 +489,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -519,8 +531,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -571,14 +584,14 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -615,12 +628,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -656,12 +670,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -712,14 +727,14 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -756,12 +771,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -797,12 +813,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -853,18 +870,18 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff4_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -897,14 +914,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -938,8 +956,9 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -990,14 +1009,14 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -1034,12 +1053,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -1075,12 +1095,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -1130,14 +1151,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -1174,12 +1195,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -1215,12 +1237,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index c9618d43943ef..e0377ddad14c0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5279,15 +5279,15 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5328,15 +5328,15 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5689,15 +5689,15 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5738,15 +5738,15 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6099,15 +6099,15 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i8_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6148,15 +6148,15 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i8:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6394,15 +6394,15 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i16_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6443,15 +6443,15 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i16:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -7979,15 +7979,15 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f16_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %gep = getelementptr half, ptr %in, i64 8
@@ -8027,15 +8027,15 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f16:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %val = load atomic half, ptr %in seq_cst, align 2
@@ -8078,15 +8078,15 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_bf16_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr %in, i64 8
@@ -8126,15 +8126,15 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_bf16:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %val = load atomic bfloat, ptr %in seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 4d80e9124f41f..9c2faf622623d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 7e4a36b7dc11b..3fd624b592cd4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -4439,23 +4439,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB89_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -4654,23 +4654,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ;
 ; GCN3-LABEL: atomic_max_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN3-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5821,23 +5821,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB103_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT:    s_cbranch_execnz .LBB103_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5934,23 +5934,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ;
 ; GCN3-LABEL: atomic_umax_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB104_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN3-NEXT:    s_cbranch_execnz .LBB104_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -8045,23 +8045,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB126_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    s_cbranch_execnz .LBB126_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -8148,20 +8148,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: atomic_min_i64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v5, s1
-; GCN3-NEXT:    v_mov_b32_e32 v6, s3
-; GCN3-NEXT:    v_mov_b32_e32 v7, s2
-; GCN3-NEXT:    v_mov_b32_e32 v4, s0
+; GCN3-NEXT:    v_mov_b32_e32 v4, s4
+; GCN3-NEXT:    v_mov_b32_e32 v6, s7
+; GCN3-NEXT:    v_mov_b32_e32 v7, s6
+; GCN3-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN3-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB127_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
@@ -8253,23 +8253,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ;
 ; GCN3-LABEL: atomic_min_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    v_mov_b32_e32 v5, s4
+; GCN3-NEXT:    v_mov_b32_e32 v4, s9
+; GCN3-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN3-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN3-NEXT:    s_cbranch_execnz .LBB128_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 84852c2632f67..f86f5305e6ba1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_0_f32:
@@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max3_f32 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_1_f32:
@@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_0_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max3_f16 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_0_f16:
@@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_1_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max3_f16 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_1_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 84099e472d65f..6634d36122d0a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -83,14 +83,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
@@ -195,14 +195,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32:
@@ -308,14 +308,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
@@ -421,14 +421,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
@@ -538,15 +538,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 4.0, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 2.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
@@ -668,33 +668,33 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ;
 ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, 2.0, v1
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, 4.0, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, 2.0, v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -831,15 +831,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_fmed3_r_i_i_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 2.0
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_r_i_i_f64:
@@ -943,13 +943,13 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
@@ -1057,29 +1057,29 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-GISEL-NEXT:    v_cmp_nlt_f32_e32 vcc, 2.0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 2.0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc, 4.0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
@@ -1244,33 +1244,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
@@ -1438,33 +1438,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, -v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v2, -v2, -v2
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
@@ -1632,33 +1632,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, v2, -v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -v3, -v3
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
@@ -1828,34 +1828,34 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, |v2|, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
@@ -2034,35 +2034,35 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v2, -|v2|, -|v2|
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
@@ -2250,20 +2250,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0:
@@ -2415,17 +2415,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0:
@@ -2569,17 +2569,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_nnan_call_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_call_med3_f32_pat0:
@@ -2723,17 +2723,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_fast_call_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_fast_call_med3_f32_pat0:
@@ -2889,17 +2889,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
@@ -3043,17 +3043,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
@@ -3199,33 +3199,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
@@ -3391,17 +3391,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
@@ -3545,17 +3545,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
@@ -3699,17 +3699,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
@@ -3853,17 +3853,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
@@ -4007,17 +4007,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
@@ -4161,17 +4161,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
@@ -4315,17 +4315,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
@@ -4469,17 +4469,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
@@ -4623,17 +4623,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
@@ -4777,17 +4777,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
@@ -4931,17 +4931,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
@@ -5085,17 +5085,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
@@ -5239,17 +5239,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
@@ -5393,17 +5393,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
@@ -5550,17 +5550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
@@ -5743,14 +5743,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -5761,7 +5761,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
@@ -5947,14 +5947,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -5965,7 +5965,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
@@ -6178,14 +6178,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
@@ -6370,14 +6370,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -6386,7 +6386,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0:
@@ -6569,20 +6569,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
@@ -6746,20 +6746,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
@@ -6923,20 +6923,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
@@ -7090,33 +7090,33 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
@@ -7295,39 +7295,39 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_min_f32_e64 v4, -v1, v2
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v4, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v4, v2
 ; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
@@ -7502,18 +7502,18 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_global_nnans_min_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_min_max_f32:
@@ -7637,14 +7637,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f16 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
@@ -7825,20 +7825,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f16_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f16_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f16 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
@@ -7963,15 +7963,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: two_non_inline_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 0x41800000, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: two_non_inline_constant:
@@ -8114,16 +8114,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: one_non_inline_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v3, 0.5, v1
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
 ; GFX9-NEXT:    v_med3_f32 v2, v3, 1.0, v2
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -8271,18 +8271,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ;
 ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x41000000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x41000000
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v3, 0.5, v1
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v4, 0x41800000, v1
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s2, v2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s0, v2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v1, off
@@ -8291,18 +8291,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ;
 ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v1
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v1
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v5, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v1, off
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 3a55b2d50a5e5..2025ddb07e83a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f32:
@@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_min3_f32 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f32:
@@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_min3_f16 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f16:
@@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_min3_f16 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f16:
@@ -680,36 +680,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
-; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f64:
@@ -827,36 +827,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s2
-; GFX9-NEXT:    s_mov_b32 s13, s3
-; GFX9-NEXT:    s_mov_b32 s16, s4
-; GFX9-NEXT:    s_mov_b32 s17, s5
-; GFX9-NEXT:    s_mov_b32 s18, s10
-; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s16, s8
+; GFX9-NEXT:    s_mov_b32 s17, s9
+; GFX9-NEXT:    s_mov_b32 s18, s2
+; GFX9-NEXT:    s_mov_b32 s19, s3
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s6
-; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
-; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 7c1c970b3fef7..7b8384f317c6c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -111,23 +111,41 @@ define amdgpu_kernel void @fmul_f16_imm_a(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fmul_f16_imm_a:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fmul_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmul_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -178,23 +196,41 @@ define amdgpu_kernel void @fmul_f16_imm_b(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fmul_f16_imm_b:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_mul_f16_e32 v0, 4.0, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fmul_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmul_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
@@ -390,21 +426,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
 ;
 ; GFX9-LABEL: fmul_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v2f16_imm_a:
@@ -485,21 +521,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
 ;
 ; GFX9-LABEL: fmul_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v2f16_imm_b:
@@ -725,23 +761,23 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmul_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s2, 0x44004200
-; GFX9-NEXT:    s_mov_b32 s3, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s6, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s7, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s3
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s6
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s7
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 9300dfcb16e8a..2b556a0be2b16 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -62,32 +62,32 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x2
-; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16:
@@ -176,48 +176,48 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-FLUSH-LABEL: fmul_fadd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    s_clause 0x2
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    s_clause 0x2
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmul_fadd_f16:
@@ -326,32 +326,32 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ;
 ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x2
-; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[10:11]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
@@ -1425,49 +1425,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
@@ -1600,49 +1600,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
@@ -1775,49 +1775,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
@@ -1951,49 +1951,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
@@ -2127,49 +2127,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
@@ -2304,49 +2304,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir
new file mode 100644
index 0000000000000..baaca76bfd8a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+---
+name:            fold_zero_high_bits_src1_alive
+tracksRegLiveness: true
+
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: fold_zero_high_bits_src1_alive
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], 1, 0, implicit $exec
+    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GCN-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U16_e64_]], 1, 0, implicit $exec
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GCN-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[V_ADD_U16_e64_]], [[COPY1]], 0, implicit $exec
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = S_MOV_B32 1
+    %2:vgpr_32 = V_ADD_U16_e64 %0:vgpr_32, %1:sreg_32, 0, implicit $exec
+    %3:sreg_32 = S_MOV_B32 65535
+    %4:vgpr_32 = V_AND_B32_e64 %3:sreg_32, %2:vgpr_32, implicit $exec
+    %5:vgpr_32 = V_MUL_U32_U24_e64 killed %4:vgpr_32, %1:sreg_32, 0, implicit $exec
+    %6:vgpr_32 = COPY $vgpr1
+    %7:vgpr_32 = V_SUB_U16_e64 %2:vgpr_32, %6:vgpr_32, 0, implicit $exec
+...
+
+---
+name:            fold_zero_high_bits_src1_killed
+tracksRegLiveness: true
+
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: fold_zero_high_bits_src1_killed
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], 1, 0, implicit $exec
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GCN-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[V_ADD_U16_e64_]], [[COPY1]], 0, implicit $exec
+    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GCN-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 killed [[V_ADD_U16_e64_]], 1, 0, implicit $exec
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = S_MOV_B32 1
+    %2:vgpr_32 = V_ADD_U16_e64 %0:vgpr_32, %1:sreg_32, 0, implicit $exec
+    %6:vgpr_32 = COPY $vgpr1
+    %7:vgpr_32 = V_SUB_U16_e64 %2:vgpr_32, %6:vgpr_32, 0, implicit $exec
+    %3:sreg_32 = S_MOV_B32 65535
+    %4:vgpr_32 = V_AND_B32_e64 %3:sreg_32, killed %2:vgpr_32, implicit $exec
+    %5:vgpr_32 = V_MUL_U32_U24_e64 killed %4:vgpr_32, %1:sreg_32, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 105d9246880a4..742aeb96fcc2c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -271,12 +271,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b32 v1, v0
 ; GFX10-NEXT:    s_endpgm
@@ -657,14 +657,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
@@ -736,14 +736,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index e124aadf4e8c2..950d228f29929 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -248,12 +248,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b32 v1, v0
 ; GFX10-NEXT:    s_endpgm
@@ -600,14 +600,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
@@ -665,14 +665,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index ce1fcccf4a17c..d964cf4858f09 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1022,22 +1022,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1047,22 +1047,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
+; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1072,22 +1072,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
+; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB39_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB39_2:
@@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB40_2:
@@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB40_2:
@@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB41_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB42_2:
@@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB42_2:
@@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB49_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB49_2:
@@ -1539,12 +1539,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1610,12 +1610,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1760,23 +1760,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NEXT:    v_mov_b32_e32 v3, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1806,12 +1806,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1846,23 +1846,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NEXT:    v_mov_b32_e32 v3, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1892,23 +1892,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NEXT:    v_mov_b32_e32 v3, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index f18f5752269e0..90c1759070a59 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 6a2a8c3ce595d..ff6700d10ff53 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 8c6dc4395839c..75f4dff14fcbd 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -23,23 +23,41 @@ define amdgpu_kernel void @fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -89,24 +107,43 @@ define amdgpu_kernel void @fpext_f16_to_f64(
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fpext_f16_to_f64:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fpext_f16_to_f64:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fpext_f16_to_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_f16_to_f64:
 ; GFX11:       ; %bb.0: ; %entry
@@ -159,24 +196,43 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fpext_v2f16_to_v2f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_dword v1, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; GFX89-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fpext_v2f16_to_v2f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fpext_v2f16_to_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_v2f16_to_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -232,26 +288,47 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fpext_v2f16_to_v2f64:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX89-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fpext_v2f16_to_v2f64:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fpext_v2f16_to_v2f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_v2f16_to_v2f64:
 ; GFX11:       ; %bb.0: ; %entry
@@ -350,23 +427,41 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fneg_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fneg_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fneg_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -416,23 +511,41 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fabs_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fabs_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fabs_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -482,23 +595,41 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fneg_fabs_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fneg_fabs_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -555,27 +686,49 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; GFX89-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fneg_multi_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fneg_multi_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -636,27 +789,49 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; GFX89-NEXT:    v_mul_f16_e64 v0, -v0, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; VI-NEXT:    v_mul_f16_e64 v0, -v0, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; GFX9-NEXT:    v_mul_f16_e64 v0, -v0, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -716,27 +891,49 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; GFX89-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fabs_multi_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fabs_multi_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -797,27 +994,49 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; GFX89-NEXT:    v_mul_f16_e64 v0, |v0|, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; VI-NEXT:    v_mul_f16_e64 v0, |v0|, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; GFX9-NEXT:    v_mul_f16_e64 v0, |v0|, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -877,27 +1096,49 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; GFX89-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -959,27 +1200,49 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; GFX89-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
-; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; VI-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; GFX9-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -1020,6 +1283,3 @@ entry:
 declare half @llvm.fabs.f16(half) #1
 
 attributes #1 = { nounwind readnone }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX9: {{.*}}
-; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 99818df6175bd..667a3f398c08a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -22,13 +22,13 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -394,13 +394,13 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -765,13 +765,13 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1123,13 +1123,13 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1509,13 +1509,13 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1860,13 +1860,13 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s5, -1
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT:    s_mov_b32 s7, -1
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
 ; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 65ac2e240469d..0817ac1b3cd67 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -71,32 +71,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
@@ -208,34 +208,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[0:1]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
@@ -358,36 +358,36 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -519,40 +519,40 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[2:3]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -673,32 +673,32 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s2
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s0
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
@@ -807,32 +807,32 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
@@ -941,32 +941,32 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -|v0|
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s2|
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s0|
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
@@ -1076,32 +1076,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
@@ -1215,32 +1215,32 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
@@ -1359,34 +1359,34 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index eeddc2211ea97..4215ae43345fd 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -310,8 +310,8 @@ ret:
 
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-DAG:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
-; GFX11-DAG:     v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
+; GFX11:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
+; GFX11:     v_dual_mov_b32 [[C:v[0-9]+]], 0x7b :: v_dual_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
 ; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index d8736c5276a26..34c7614ae36f9 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX8,GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX900,GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX90A,GCN %s
 
 ---
 name: func_add_constant_to_fi_divergent_i32
@@ -211,3 +213,602 @@ body:             |
     renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
 ...
 
+---
+name: materialize_fi_s_mov_b32_offset_0_dead_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_dead_scc
+    ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+  renamable $sgpr4 = S_MOV_B32 %stack.0
+  S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5
+
+    ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc
+    ; GCN: liveins: $sgpr4, $sgpr5
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GCN-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.0
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FI#0 is filler to get a non-0 offset for FI#1
+---
+name: materialize_fi_s_mov_b32_offset_64_dead_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_64_dead_scc
+    ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 64, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+  S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_dead_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_68_dead_scc
+    ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+  S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+    ; GFX8: liveins: $sgpr4, $sgpr5
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+    ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+    ; GFX900: liveins: $sgpr4, $sgpr5
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+    ; GFX90A: liveins: $sgpr4, $sgpr5
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_live_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+    ; GFX8: liveins: $sgpr4, $sgpr5
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 68
+    ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+    ; GFX900: liveins: $sgpr4, $sgpr5
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+    ; GFX90A: liveins: $sgpr4, $sgpr5
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+  S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+  renamable $sgpr4 = S_MOV_B32 %stack.1
+  S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is finding a VGPR
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 4, alignment: 16, stack-id: default }
+machineFunctionInfo:
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy: 10
+
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+    ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+    ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+    ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+    ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+    ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+    ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+    ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+    ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+    ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+    ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+    ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+    ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+    ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+    ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+    ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+    ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+    ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+    ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+    ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+    ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+    ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+    ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+    ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+    ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+    ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+    ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+    ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+    ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+    ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+    ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+    ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+
+    renamable $sgpr4 = S_MOV_B32 %stack.0
+
+    S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is clobbering scc
+---
+name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+  - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr4, $sgpr5
+
+    ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+    ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128
+    ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+    ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 128, killed $vgpr0, implicit $exec
+    ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX900-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    ;
+    ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+    ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+    ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+    ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+    ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+    ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+    ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+    ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+    ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+    ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+    ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+    ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+    ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+    ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+    ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+    ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+    ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+    ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    ; GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+    ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+    ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+    ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+    ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+    ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+    ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+    ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+    ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+    ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+    ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+    ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+    ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+    ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+    ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+    S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+
+    renamable $sgpr4 = S_MOV_B32 %stack.1
+
+    S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+    S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+    S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+    S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+    S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index ea588df86b846..4f230140f7ba4 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -135,12 +135,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshl_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -157,11 +157,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshl_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 25
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_i32_imm:
@@ -732,15 +732,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: orxor2or1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX9-NEXT:    s_or_b32 s4, s3, s4
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX9-NEXT:    s_or_b32 s0, s7, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: orxor2or1:
@@ -759,15 +759,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX10-LABEL: orxor2or1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX10-NEXT:    s_or_b32 s4, s3, s4
-; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX10-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: orxor2or1:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index dbcebe6e07e3f..31f574d44ab8c 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -127,12 +127,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshr_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -149,11 +149,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshr_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index f6df1cbbdd06b..2a79793443fb2 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -111,23 +111,41 @@ define amdgpu_kernel void @fsub_f16_imm_a(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fsub_f16_imm_a:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_sub_f16_e32 v0, 1.0, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fsub_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_sub_f16_e32 v0, 1.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fsub_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_sub_f16_e32 v0, 1.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -178,23 +196,41 @@ define amdgpu_kernel void @fsub_f16_imm_b(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: fsub_f16_imm_b:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_add_f16_e32 v0, -2.0, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: fsub_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_add_f16_e32 v0, -2.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fsub_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_add_f16_e32 v0, -2.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
@@ -390,21 +426,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
 ;
 ; GFX9-LABEL: fsub_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x40003c00
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_v2f16_imm_a:
@@ -485,21 +521,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
 ;
 ; GFX9-LABEL: fsub_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0xbc00c000
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0xbc00c000
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_v2f16_imm_b:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 4f7b6164936f8..06d971febd038 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -28,7 +28,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -53,7 +53,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -63,7 +63,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -73,7 +73,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -97,7 +97,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -121,7 +121,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -145,7 +145,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -158,7 +158,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -171,12 +171,12 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -190,7 +190,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -215,7 +215,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -225,7 +225,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -235,7 +235,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -259,7 +259,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -283,7 +283,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -308,7 +308,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -321,7 +321,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -335,12 +335,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -354,7 +354,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -379,7 +379,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -389,7 +389,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -399,7 +399,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -423,7 +423,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -447,7 +447,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -472,7 +472,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -485,7 +485,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -499,12 +499,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32:
+define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -518,7 +518,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -542,7 +542,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -552,7 +552,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -562,7 +562,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -585,7 +585,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -608,7 +608,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -631,7 +631,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -643,7 +643,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -655,12 +655,12 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -674,7 +674,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -698,7 +698,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -708,7 +708,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -718,7 +718,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -741,7 +741,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -764,7 +764,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -789,7 +789,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -801,7 +801,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -814,12 +814,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -833,7 +833,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -857,7 +857,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -867,7 +867,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -877,7 +877,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -900,7 +900,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -923,7 +923,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -948,7 +948,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -960,7 +960,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -973,44 +973,26 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
-; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v4, v3
-; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
-  ret float %result
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
@@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
@@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
+}
+
+define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
-  ret void
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64:
+define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v7, v5
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
-; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
-define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64:
+define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
@@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v10, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    v_mov_b32_e32 v3, v11
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v1
+; GFX6-NEXT:    v_mov_b32_e32 v10, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
-define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
 ; --------------------------------------------------------------------
 ; half
 ; --------------------------------------------------------------------
 
-define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16:
+define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX7-NEXT:    v_not_b32_e32 v7, v2
-; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX6-NEXT:    v_not_b32_e32 v7, v2
-; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
  }
 
-define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16:
+define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    v_not_b32_e32 v6, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    v_not_b32_e32 v6, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    v_not_b32_e32 v6, v3
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v3, v3
@@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v4, v2, v2
-; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16:
+define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
  }
 
-define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16:
+define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16:
+define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 591e01b11bd24..65df8f07fb8b3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -28,7 +28,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -53,7 +53,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -63,7 +63,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -73,7 +73,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -97,7 +97,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -121,7 +121,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -145,7 +145,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -158,7 +158,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -171,12 +171,12 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -190,7 +190,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -215,7 +215,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -225,7 +225,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -235,7 +235,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -259,7 +259,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -283,7 +283,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -308,7 +308,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -321,7 +321,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -335,12 +335,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -354,7 +354,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -379,7 +379,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -389,7 +389,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -399,7 +399,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -423,7 +423,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -447,7 +447,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -472,7 +472,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -485,7 +485,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -499,12 +499,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32:
+define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -518,7 +518,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -542,7 +542,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -552,7 +552,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -562,7 +562,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -585,7 +585,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -608,7 +608,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -631,7 +631,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -643,7 +643,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -655,12 +655,12 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -674,7 +674,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -698,7 +698,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -708,7 +708,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -718,7 +718,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -741,7 +741,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -764,7 +764,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -789,7 +789,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -801,7 +801,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -814,12 +814,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -833,7 +833,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -857,7 +857,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -867,7 +867,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -877,7 +877,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -900,7 +900,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -923,7 +923,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -948,7 +948,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -960,7 +960,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -973,44 +973,26 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
-; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v4, v3
-; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
-  ret float %result
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
@@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
@@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
+}
+
+define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
-  ret void
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64:
+define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v7, v5
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
-; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
-define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64:
+define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
@@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v10, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    v_mov_b32_e32 v3, v11
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v1
+; GFX6-NEXT:    v_mov_b32_e32 v10, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
-define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
 ; --------------------------------------------------------------------
 ; half
 ; --------------------------------------------------------------------
 
-define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16:
+define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX7-NEXT:    v_not_b32_e32 v7, v2
-; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX6-NEXT:    v_not_b32_e32 v7, v2
-; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
  }
 
-define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16:
+define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    v_not_b32_e32 v6, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    v_not_b32_e32 v6, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    v_not_b32_e32 v6, v3
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v3, v3
@@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v4, v2, v2
-; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16:
+define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
  }
 
-define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16:
+define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16:
+define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index 7f6a3ad5c9346..31069e567d86d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v0, s0
-; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    v_mov_b32_e32 v0, s4
+; GFX908-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX908-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX908-NEXT:    s_endpgm
 ;
@@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 9bee539b1e4e5..16c09cf600080 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5105,13 +5105,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %in, i64 4
@@ -5159,13 +5159,13 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_i32_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
@@ -5211,13 +5211,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_f32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr addrspace(1) %in, i64 4
@@ -5261,13 +5261,13 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
 ;
 ; GFX9-LABEL: atomic_load_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -5812,13 +5812,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
 ;
 ; GFX9-LABEL: atomic_load_i8_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(1) %in, i64 16
@@ -5866,13 +5866,13 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
 ;
 ; GFX9-LABEL: atomic_load_i8_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
@@ -5993,13 +5993,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i16_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i16, ptr addrspace(1) %in, i64 8
@@ -6047,13 +6047,13 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_i16_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
@@ -7074,13 +7074,13 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_f16_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr half, ptr addrspace(1) %in, i64 8
   %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7127,13 +7127,13 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_f16_negoffset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr half, ptr addrspace(1) %in, i64 -256
   %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7176,13 +7176,13 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
 ;
 ; GFX9-LABEL: atomic_load_bf16_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
   %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7229,13 +7229,13 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
 ;
 ; GFX9-LABEL: atomic_load_bf16_negoffset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
   %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index a7ba8a084272b..c7fa2a2ede388 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 40f0acf3d5d09..d1a371fc4356f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_add_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -259,18 +259,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
@@ -331,12 +331,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_add_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -550,18 +550,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_add_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_ret_addr64:
@@ -617,12 +617,12 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_and_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -843,18 +843,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_and_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1134,18 +1134,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_and_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_ret_addr64:
@@ -1201,12 +1201,12 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_sub_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1427,18 +1427,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
@@ -1499,12 +1499,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_sub_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1718,18 +1718,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_sub_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64:
@@ -1781,12 +1781,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_max_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_offset:
@@ -1994,17 +1994,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -2061,12 +2061,12 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_max_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64:
@@ -2267,17 +2267,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_ret_addr64:
@@ -2329,12 +2329,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_umax_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_offset:
@@ -2542,17 +2542,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -2609,12 +2609,12 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_umax_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64:
@@ -2815,17 +2815,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64:
@@ -2877,12 +2877,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_min_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_offset:
@@ -3090,17 +3090,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -3157,12 +3157,12 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_min_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64:
@@ -3363,17 +3363,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_ret_addr64:
@@ -3425,12 +3425,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_umin_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_offset:
@@ -3638,17 +3638,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
@@ -3705,12 +3705,12 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_umin_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64:
@@ -3911,17 +3911,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umin_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64:
@@ -3977,12 +3977,12 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_or_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4203,18 +4203,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
@@ -4275,12 +4275,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_or_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4494,18 +4494,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: atomic_or_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_ret_addr64:
@@ -4561,12 +4561,12 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_xchg_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4617,12 +4617,12 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ;
 ; GFX9-LABEL: atomic_xchg_f64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4673,12 +4673,12 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_pointer_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4899,18 +4899,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
@@ -4971,12 +4971,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_xchg_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5190,18 +5190,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
@@ -5257,12 +5257,12 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_xor_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5483,18 +5483,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
@@ -5555,12 +5555,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_xor_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5774,18 +5774,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
@@ -6001,17 +6001,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
@@ -6079,16 +6079,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -6329,17 +6329,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret:
@@ -6404,16 +6404,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -6573,13 +6573,13 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64_offset:
@@ -6640,13 +6640,13 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ;
 ; GFX9-LABEL: atomic_load_i64_neg_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64_neg_offset:
@@ -6703,13 +6703,13 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ;
 ; GFX9-LABEL: atomic_load_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64:
@@ -7005,12 +7005,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_store_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_store_i64_offset:
@@ -7057,12 +7057,12 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: atomic_store_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_store_i64:
@@ -7320,12 +7320,12 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_inc_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -7528,12 +7528,12 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_dec_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 8897ad3e950a5..9af7e0978f9db 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -5064,37 +5064,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5300,37 +5300,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6526,37 +6526,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB103_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB103_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6650,37 +6650,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB104_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB104_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8862,37 +8862,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB126_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB126_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8972,29 +8972,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_min_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB127_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -9087,37 +9087,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT:    s_add_u32 s0, s0, s6
-; GFX9-NEXT:    s_addc_u32 s1, s1, s7
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c05f9c679979d..600c35b6862f6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 46f0bb0393885..31717bf1c8253 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32, -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare float @div.double.value()
@@ -1148,55 +1148,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT:  .LBB2_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB2_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT:  .LBB2_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB2_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1206,60 +1180,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT:  .LBB2_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB2_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT:  .LBB2_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB2_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1330,55 +1277,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT:  .LBB2_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB2_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT:  .LBB2_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB2_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1388,60 +1309,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT:  .LBB2_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB2_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT:  .LBB2_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB2_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1623,18 +1517,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1642,27 +1536,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT:  .LBB3_5:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB3_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1693,45 +1573,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT:  .LBB3_5:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB3_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1752,13 +1618,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -1766,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1775,29 +1641,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT:  .LBB3_5:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB3_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1818,13 +1668,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1832,36 +1682,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT:  .LBB3_5:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB3_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2080,27 +1915,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT:  .LBB3_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2162,29 +1983,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT:  .LBB3_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2252,34 +2059,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT:  .LBB3_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2335,34 +2126,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT:  .LBB3_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -2439,55 +2214,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT:  .LBB4_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB4_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT:  .LBB4_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB4_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2497,60 +2246,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT:  .LBB4_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB4_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT:  .LBB4_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB4_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2621,55 +2343,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT:  .LBB4_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB4_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT:  .LBB4_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB4_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2679,60 +2375,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT:  .LBB4_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB4_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT:  .LBB4_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB4_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
@@ -2913,18 +2582,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2932,27 +2601,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT:  .LBB5_5:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB5_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -2983,45 +2638,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT:  .LBB5_5:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB5_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3042,13 +2683,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -3056,7 +2697,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3065,29 +2706,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT:  .LBB5_5:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB5_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3108,13 +2733,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3122,36 +2747,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT:  .LBB5_5:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB5_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3370,27 +2980,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT:  .LBB5_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3452,29 +3048,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT:  .LBB5_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3542,34 +3124,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT:  .LBB5_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3625,34 +3191,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT:  .LBB5_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4
@@ -6003,59 +5553,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT:  .LBB8_3:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB8_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT:  .LBB8_3:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB8_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6197,59 +5719,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT:  .LBB8_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT:  .LBB8_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6503,20 +5997,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6524,28 +6018,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1064-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1064-NEXT:  .LBB9_5:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB9_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -6576,48 +6055,33 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1032-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1032-NEXT:  .LBB9_5:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB9_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7008,28 +6472,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[13:14], v[11:12], v[11:12]
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[13:14], v[9:10]
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1064-DPP-NEXT:  .LBB9_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB9_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7104,30 +6553,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[9:10], v[0:1]
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1032-DPP-NEXT:  .LBB9_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB9_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -9680,55 +9114,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT:  .LBB12_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB12_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT:  .LBB12_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB12_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9738,60 +9146,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT:  .LBB12_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB12_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT:  .LBB12_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB12_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9862,55 +9243,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT:  .LBB12_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB12_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT:  .LBB12_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB12_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9920,60 +9275,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT:  .LBB12_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB12_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT:  .LBB12_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB12_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -10048,55 +9376,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT:  .LBB13_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT:  .LBB13_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10106,60 +9408,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT:  .LBB13_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB13_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT:  .LBB13_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB13_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10230,55 +9505,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT:  .LBB13_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB13_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT:  .LBB13_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB13_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10288,60 +9537,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT:  .LBB13_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB13_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT:  .LBB13_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB13_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd5e589ec2be7..6f2702eae9302 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare float @div.double.value()
@@ -1148,55 +1148,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT:  .LBB2_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB2_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT:  .LBB2_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB2_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1206,60 +1180,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT:  .LBB2_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB2_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT:  .LBB2_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB2_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1330,55 +1277,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT:  .LBB2_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB2_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT:  .LBB2_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB2_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1388,60 +1309,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT:  .LBB2_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB2_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT:  .LBB2_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB2_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1623,18 +1517,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1642,27 +1536,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT:  .LBB3_5:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB3_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1693,45 +1573,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT:  .LBB3_5:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB3_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1752,13 +1618,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -1766,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1775,29 +1641,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT:  .LBB3_5:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB3_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1818,13 +1668,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1832,36 +1682,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT:  .LBB3_5:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB3_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2080,27 +1915,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT:  .LBB3_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2162,29 +1983,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT:  .LBB3_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2252,34 +2059,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT:  .LBB3_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2335,34 +2126,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT:  .LBB3_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -2439,55 +2214,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT:  .LBB4_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB4_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT:  .LBB4_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB4_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2497,60 +2246,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT:  .LBB4_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB4_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT:  .LBB4_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB4_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2621,55 +2343,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT:  .LBB4_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB4_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT:  .LBB4_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB4_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2679,60 +2375,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT:  .LBB4_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB4_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT:  .LBB4_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB4_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
@@ -2913,18 +2582,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2932,27 +2601,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT:  .LBB5_5:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB5_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -2983,45 +2638,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT:  .LBB5_5:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB5_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3042,13 +2683,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -3056,7 +2697,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3065,29 +2706,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT:  .LBB5_5:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB5_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3108,13 +2733,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3122,36 +2747,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT:  .LBB5_5:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB5_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3370,27 +2980,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT:  .LBB5_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3452,29 +3048,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT:  .LBB5_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3542,34 +3124,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT:  .LBB5_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3625,34 +3191,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT:  .LBB5_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4
@@ -6003,59 +5553,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT:  .LBB8_3:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB8_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT:  .LBB8_3:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB8_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6197,59 +5719,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT:  .LBB8_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT:  .LBB8_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6503,20 +5997,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6524,28 +6018,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1064-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1064-NEXT:  .LBB9_5:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB9_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -6576,48 +6055,33 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1032-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1032-NEXT:  .LBB9_5:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB9_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7008,28 +6472,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[13:14], v[11:12], v[11:12]
-; GFX1064-DPP-NEXT:    v_min_f64 v[9:10], v[13:14], v[9:10]
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1064-DPP-NEXT:  .LBB9_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB9_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7104,30 +6553,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
-; GFX1032-DPP-NEXT:    v_min_f64 v[9:10], v[9:10], v[0:1]
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1032-DPP-NEXT:  .LBB9_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB9_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -9680,55 +9114,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT:  .LBB12_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB12_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT:  .LBB12_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB12_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9738,60 +9146,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT:  .LBB12_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB12_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT:  .LBB12_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB12_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9862,55 +9243,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT:  .LBB12_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB12_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT:  .LBB12_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB12_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9920,60 +9275,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT:  .LBB12_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB12_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT:  .LBB12_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB12_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -10048,55 +9376,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT:  .LBB13_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT:  .LBB13_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10106,60 +9408,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT:  .LBB13_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB13_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT:  .LBB13_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB13_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10230,55 +9505,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT:  .LBB13_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB13_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT:  .LBB13_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB13_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10288,60 +9537,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT:  .LBB13_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB13_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT:  .LBB13_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB13_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5ffa71d37164c..cbe243a949115 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index c17feaed83f6c..e2635fc6ecbbb 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
 
 # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle
 # GCN: bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index 8d02f0614dfc3..401f6e303e796 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
 
 # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle
 # GCN:          S_LOAD_DWORDX2_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
index 1cb27dc6fc45c..1801082b60030 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
@@ -6,13 +6,13 @@
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck -check-prefix=CU %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic < %s | FileCheck -check-prefix=CU %s
 
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
 
 ; Checks 10.1, 10.3 and 11 generic targets allow cumode/wave64.
 
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
index 3dc9f209d01e2..431ba38f60a02 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
 
 ; GCN:      amdhsa.kernels:
 ; GCN:      .name: wavefrontsize
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 487e62b6c3495..37476203fbfad 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -5,10 +5,10 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index b9269e2b24284..2cb440b1b7a01 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b5180dfe9b..5abd4c9069c91 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT:    s_movk_i32 s4, 0x130
-; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_movk_i32 s20, 0x130
+; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
 ; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
@@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
 ; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
 ; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
 ; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
 ; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
@@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
 ; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
 ; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
-; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
 ; CHECK-NEXT:    v_writelane_b32 v8, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
 ; CHECK-NEXT:    v_writelane_b32 v8, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
+; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
 ; CHECK-NEXT:    v_writelane_b32 v8, s54, 20
-; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
-; CHECK-NEXT:    s_mov_b32 s4, 48
-; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
+; CHECK-NEXT:    s_mov_b32 s26, 48
+; CHECK-NEXT:    s_mov_b32 s27, s24
 ; CHECK-NEXT:    v_writelane_b32 v8, s55, 21
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v8, s56, 22
 ; CHECK-NEXT:    v_writelane_b32 v8, s57, 23
 ; CHECK-NEXT:    v_writelane_b32 v8, s58, 24
@@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v8, s65, 31
 ; CHECK-NEXT:    v_writelane_b32 v4, s9, 37
 ; CHECK-NEXT:    v_writelane_b32 v8, s66, 32
-; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
-; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s27, s24
+; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
+; CHECK-NEXT:    s_movk_i32 s30, 0x2f0
 ; CHECK-NEXT:    s_mov_b32 s29, s24
+; CHECK-NEXT:    s_mov_b32 s31, s24
 ; CHECK-NEXT:    v_writelane_b32 v4, s10, 38
 ; CHECK-NEXT:    v_writelane_b32 v8, s67, 33
 ; CHECK-NEXT:    v_writelane_b32 v4, s11, 39
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[28:29], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 78653d7e21ad8..81210d8f5d0ca 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -638,9 +638,9 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s4, 0x400
+; GFX9-NEXT:    s_movk_i32 s2, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -651,14 +651,14 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
 ; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    global_store_short v5, v4, s[2:3]
+; GFX9-NEXT:    global_store_short v5, v4, s[4:5]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 1d68b0ba0a280..fee59455da4c8 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -2044,13 +2044,13 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -2062,20 +2062,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -2084,20 +2084,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v3, v1, s1
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2107,7 +2106,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v3, v0
-; GFX10-DL-NEXT:    global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v2, v1, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src:
@@ -2250,13 +2249,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
@@ -2267,21 +2266,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -2291,20 +2290,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2315,7 +2313,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
@@ -2856,13 +2854,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 8, 8
@@ -2873,21 +2871,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
@@ -2897,20 +2895,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2921,7 +2918,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index fb94b504781b1..2894ae76c0be4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -3494,13 +3494,13 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -3512,20 +3512,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -3534,20 +3534,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v3, v1, s1
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -3556,7 +3555,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src:
@@ -3700,13 +3699,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
@@ -3717,21 +3716,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -3741,20 +3740,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -3764,7 +3762,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020100
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
@@ -4306,13 +4304,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
@@ -4323,21 +4321,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
@@ -4347,20 +4345,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -4370,7 +4367,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020101
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[10:11]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
@@ -5597,15 +5594,16 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
 ;
 ; GFX10-DL-LABEL: idot4_acc32_v8i8:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_v8i8:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 3cabe41afb05a..44e8ae01fd692 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
   ; GCN-NEXT:   liveins: $vgpr0, $sgpr2_sgpr3
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
-  ; GCN-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
   ; GCN-NEXT:   renamable $sgpr6 = COPY renamable $sgpr1
   ; GCN-NEXT:   renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
   ; GCN-NEXT:   renamable $sgpr4 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 555af5013bc4e..881433f48c76f 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -314,3 +314,52 @@ exit:
   tail call void asm sideeffect "; use $0", "v"(i64 %v1)
   ret void
 }
+
+; CHECK-LABEL: {{^}}scc_as_i32:
+; CHECK: ; def scc
+; CHECK: ; use scc
+define void @scc_as_i32() {
+  %scc = call i32 asm sideeffect "; def $0", "={scc}"()
+  call void asm sideeffect "; use $0 ", "{scc}"(i32 %scc)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}scc_as_i1:
+; CHECK: ; def scc
+; CHECK: ; use scc
+define void @scc_as_i1() {
+  %scc = call i1 asm sideeffect "; def $0", "={scc}"()
+  call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
+  ret void
+}
+
+; Make sure the SGPR def is treated as a uniform value when the inline
+; assembly also defines a divergent value. The add should be scalar
+; and not introduce illegal vgpr to sgpr copies.
+; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
+; CHECK: ; def v0 s[4:5]
+; CHECK: s_add_u32
+; CHECK-NEXT: s_addc_u32
+; CHECK: ; use s[4:5]
+define void @mixed_def_vgpr_sgpr_def_asm() {
+  %vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
+  %vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
+  %sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
+  %sgpr.add = add i64 %sgpr, 2
+  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
+; CHECK: ; def s[4:5] v0
+; CHECK: s_add_u32
+; CHECK-NEXT: s_addc_u32
+; CHECK: ; use s[4:5]
+define void @mixed_def_sgpr_vgpr_def_asm() {
+  %sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
+  %sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
+  %vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
+  %sgpr.add = add i64 %sgpr, 2
+  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
index 18bc442ae98e0..266da50f6e543 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
index af2e6001502f6..af7ba4e3fe653 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 0bdd2494e7ad6..5c9c0d1119163 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index b62bf890e65fe..247ec407df5fd 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -974,19 +974,19 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s3, 3
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_lshrrev_b16_e64 v4, 2, s2
+; GCN-NEXT:    v_lshrrev_b16_e64 v5, 3, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    buffer_store_byte v1, off, s[12:15], 0
-; GCN-NEXT:    buffer_store_byte v4, off, s[12:15], 0 offset:3
-; GCN-NEXT:    buffer_store_byte v3, off, s[12:15], 0 offset:2
+; GCN-NEXT:    v_and_b32_e32 v4, 3, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
+; GCN-NEXT:    buffer_store_byte v3, off, s[12:15], 0
+; GCN-NEXT:    buffer_store_byte v5, off, s[12:15], 0 offset:3
+; GCN-NEXT:    buffer_store_byte v4, off, s[12:15], 0 offset:2
 ; GCN-NEXT:    buffer_store_byte v2, off, s[12:15], 0 offset:1
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    buffer_store_byte v1, v0, s[12:15], 0 offen
 ; GCN-NEXT:    buffer_load_ubyte v0, off, s[12:15], 0
 ; GCN-NEXT:    buffer_load_ubyte v1, off, s[12:15], 0 offset:1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index b9dc27cb7e019..17fe1721f73d7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -438,121 +438,121 @@ entry:
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: udiv_i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX90A-NEXT:    s_add_i32 s5, s5, s4
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
-; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
-; GFX90A-NEXT:    s_add_i32 s6, s4, 1
-; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX90A-NEXT:    s_add_i32 s5, s4, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX90A-NEXT:    s_add_i32 s1, s1, s0
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX90A-NEXT:    s_mul_i32 s1, s0, s7
+; GFX90A-NEXT:    s_sub_i32 s1, s6, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_sub_i32 s3, s1, s7
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mul_i32 s5, s5, s4
-; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GFX10-NEXT:    s_add_i32 s4, s4, s5
-; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
-; GFX10-NEXT:    s_mul_i32 s5, s4, s3
-; GFX10-NEXT:    s_sub_i32 s2, s2, s5
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_sub_i32 s6, s2, s3
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_mul_i32 s1, s1, s0
+; GFX10-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX10-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX10-NEXT:    s_mul_i32 s1, s0, s7
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_sub_i32 s1, s6, s1
+; GFX10-NEXT:    s_sub_i32 s3, s1, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: udiv_i32:
 ; GFX9-FLATSCR:       ; %bb.0:
-; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
@@ -1660,3 +1660,4 @@ entry:
   %bc = bitcast <2 x i32> %r.1 to <2 x float>
   ret <2 x float> %bc
 }
+
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 1fd0c67737a27..05fd883768480 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index fa447f981bf3a..28492f1426921 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index ab160ffc10ed0..acd48a64dea1f 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 ; Test formal argument lowering as well as calls to amdgpu_gfx functions.
 
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ba946fe00a8c5..c62b4e565078e 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
 
 ; We only care about which physical registers the parameters are copied from;
 ; the function bodies are just some arbitrary uses.
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index f74f9a8f2bdd8..1a0aa09e2e40a 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -27,6 +27,48 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}v5i8_arg:
+; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
+define amdgpu_kernel void @v5i8_arg(<5 x i8> %in) nounwind {
+  store <5 x i8> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i8_arg:
+; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
+define amdgpu_kernel void @v6i8_arg(<6 x i8> %in) nounwind {
+  store <6 x i8> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i16_arg:
+; GCN: s_load_dwordx4 s[0:3], s[8:9], 0x0
+define amdgpu_kernel void @v5i16_arg(<5 x i16> %in) nounwind {
+  store <5 x i16> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i16_arg:
+; GCN-DAG: s_load_dwordx4 s[0:3], s[8:9], 0x0
+define amdgpu_kernel void @v6i16_arg(<6 x i16> %in) nounwind {
+  store <6 x i16> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i32_arg:
+; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
+define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind {
+  store <5 x i32> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i32_arg:
+; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
+define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind {
+  store <6 x i32> %in, ptr addrspace(1) null
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}i65_arg:
 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 266ab687cd8d5..d51ace630f692 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -94,14 +94,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
 ; CHECK-NEXT:    s_add_i32 s0, s0, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b32 v2, v1
 ; CHECK-NEXT:    s_endpgm
   store i16 2, ptr addrspace(3) @kernel_normal
 
@@ -134,14 +134,14 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_add_i32 s4, s4, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
-; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:2
+; CHECK-NEXT:    ds_write_b32 v3, v1
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
   store i16 1, ptr addrspace(3) @module_variable
@@ -157,14 +157,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
 ; CHECK-NEXT:    s_add_i32 s0, s0, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b32 v2, v1
 ; CHECK-NEXT:    s_endpgm
   store i16 2, ptr addrspace(3) @kernel_overalign
 
@@ -197,14 +197,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_add_i32 s4, s4, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
-; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:4
+; CHECK-NEXT:    ds_write_b32 v3, v1
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
   store i16 1, ptr addrspace(3) @module_variable
@@ -220,14 +220,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
 ; CHECK-NEXT:    s_add_i32 s0, s0, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b32 v2, v1
 ; CHECK-NEXT:    s_endpgm
   store i16 2, ptr addrspace(3) @kernel_normal
 
@@ -260,14 +260,14 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_add_i32 s4, s4, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
-; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:2
+; CHECK-NEXT:    ds_write_b32 v3, v1
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
   store i16 1, ptr addrspace(3) @module_variable
@@ -283,14 +283,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
 ; CHECK-NEXT:    s_add_i32 s0, s0, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b32 v2, v1
 ; CHECK-NEXT:    s_endpgm
   store i16 2, ptr addrspace(3) @kernel_overalign
 
@@ -323,14 +323,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_add_i32 s4, s4, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
-; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:4
+; CHECK-NEXT:    ds_write_b32 v3, v1
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
   store i16 1, ptr addrspace(3) @module_variable
@@ -368,11 +368,11 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 2
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 0
 ; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v3, v4
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -408,12 +408,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -445,11 +445,11 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 2
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 2
 ; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v3, v4
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -485,12 +485,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:4
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -522,11 +522,11 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 2
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 1
 ; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v3, v4
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -562,12 +562,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -599,11 +599,11 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 2
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 3
 ; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v3, v4
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -639,12 +639,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 2
-; CHECK-NEXT:    ds_write_b16 v0, v1
-; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    ds_write_b16 v1, v0
+; CHECK-NEXT:    ds_write_b16 v1, v2 offset:4
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 3b3e107a62967..397502711283e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -98,8 +98,8 @@ define void @func_use_lds_global() {
   ret void
 }
 
-; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (): local memory global used by non-kernel function
-define void @func_use_lds_global_constexpr_cast() {
+; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (ptr addrspace(1)): local memory global used by non-kernel function
+define void @func_use_lds_global_constexpr_cast(ptr addrspace(1) %out) {
 ; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -153,7 +153,7 @@ define void @func_use_lds_global_constexpr_cast() {
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB1_2:
 ; GISEL-NEXT:    s_endpgm
-  store volatile i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
+  store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) %out, align 4
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 23692e456bd9a..81d8945d243c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -42,7 +42,6 @@
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand Atomic instructions
-; GCN-O0-NEXT:      Lower constant intrinsics
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O0-NEXT:      Expand vector predication intrinsics
 ; GCN-O0-NEXT:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
@@ -219,7 +218,6 @@
 ; GCN-O1-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O1-NEXT:      Expand memcmp() to load/stores
-; GCN-O1-NEXT:      Lower constant intrinsics
 ; GCN-O1-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-NEXT:      Natural Loop Information
 ; GCN-O1-NEXT:      Post-Dominator Tree Construction
@@ -510,7 +508,6 @@
 ; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:      Expand memcmp() to load/stores
-; GCN-O1-OPTS-NEXT:      Lower constant intrinsics
 ; GCN-O1-OPTS-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
 ; GCN-O1-OPTS-NEXT:      Post-Dominator Tree Construction
@@ -820,7 +817,6 @@
 ; GCN-O2-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O2-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O2-NEXT:      Expand memcmp() to load/stores
-; GCN-O2-NEXT:      Lower constant intrinsics
 ; GCN-O2-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O2-NEXT:      Natural Loop Information
 ; GCN-O2-NEXT:      Post-Dominator Tree Construction
@@ -1138,7 +1134,6 @@
 ; GCN-O3-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O3-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O3-NEXT:      Expand memcmp() to load/stores
-; GCN-O3-NEXT:      Lower constant intrinsics
 ; GCN-O3-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O3-NEXT:      Natural Loop Information
 ; GCN-O3-NEXT:      Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 3ff34c0c2eaff..047b35b8c0f9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
 declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index e1caf3bea6119..eed85345b3b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
@@ -281,24 +281,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
@@ -361,24 +361,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 224de9512c493..c120c58ce6f33 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -1,14 +1,15 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2
 
 ; GCN-LABEL: {{^}}test_export_zeroes_f32:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off done{{$}}
 define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
@@ -23,7 +24,7 @@ define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -34,7 +35,7 @@ define amdgpu_kernel void @test_export_en_src0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -45,7 +46,7 @@ define amdgpu_kernel void @test_export_en_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -56,7 +57,7 @@ define amdgpu_kernel void @test_export_en_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -67,7 +68,7 @@ define amdgpu_kernel void @test_export_en_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -78,7 +79,7 @@ define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -89,8 +90,8 @@ define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -102,8 +103,8 @@ define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -112,8 +113,8 @@ define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_export_mrt7_f32:
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
 define amdgpu_kernel void @test_export_mrt7_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false)
@@ -125,8 +126,8 @@ define amdgpu_kernel void @test_export_mrt7_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_z_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -138,8 +139,8 @@ define amdgpu_kernel void @test_export_z_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_null_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -151,8 +152,8 @@ define amdgpu_kernel void @test_export_null_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved10_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -164,8 +165,8 @@ define amdgpu_kernel void @test_export_reserved10_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved11_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -177,8 +178,8 @@ define amdgpu_kernel void @test_export_reserved11_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -190,8 +191,8 @@ define amdgpu_kernel void @test_export_pos0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -203,8 +204,8 @@ define amdgpu_kernel void @test_export_pos3_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -216,8 +217,8 @@ define amdgpu_kernel void @test_export_param0_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param31_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -229,10 +230,10 @@ define amdgpu_kernel void @test_export_param31_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_vm_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true)
@@ -254,8 +255,8 @@ define amdgpu_kernel void @test_export_vm_f32() #0 {
 
 
 ; GCN-LABEL: {{^}}test_export_zeroes_i32:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off done{{$}}
 define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
@@ -270,7 +271,7 @@ define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -281,7 +282,7 @@ define amdgpu_kernel void @test_export_en_src0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -292,7 +293,7 @@ define amdgpu_kernel void @test_export_en_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -303,7 +304,7 @@ define amdgpu_kernel void @test_export_en_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -314,7 +315,7 @@ define amdgpu_kernel void @test_export_en_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -325,7 +326,7 @@ define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -336,8 +337,8 @@ define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -349,8 +350,8 @@ define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -359,8 +360,8 @@ define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
 
 ; GCN-LABEL: {{^}}test_export_mrt7_i32:
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
 define amdgpu_kernel void @test_export_mrt7_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false)
@@ -372,8 +373,8 @@ define amdgpu_kernel void @test_export_mrt7_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_z_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -385,8 +386,8 @@ define amdgpu_kernel void @test_export_z_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_null_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -398,8 +399,8 @@ define amdgpu_kernel void @test_export_null_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved10_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -411,8 +412,8 @@ define amdgpu_kernel void @test_export_reserved10_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved11_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -424,8 +425,8 @@ define amdgpu_kernel void @test_export_reserved11_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -437,8 +438,8 @@ define amdgpu_kernel void @test_export_pos0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -450,8 +451,8 @@ define amdgpu_kernel void @test_export_pos3_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -463,8 +464,8 @@ define amdgpu_kernel void @test_export_param0_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param31_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -476,10 +477,10 @@ define amdgpu_kernel void @test_export_param31_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_vm_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true)
@@ -549,12 +550,20 @@ end:
 ; GCN-LABEL: {{^}}test_export_clustering:
 ; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
 ; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
-; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
-; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
-; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
-; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
-; PREGFX11-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
+
+; GFX8-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s1
+; GFX8-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s0
+; GFX8-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
+; GFX8-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
+; GFX8: {{exp|export}} param0 [[Y]], [[X]], [[Z0]], [[W0]]{{$}}
+; GFX8-NEXT: {{exp|export}} param1 [[Y]], [[X]], [[Z1]], [[W1]] done{{$}}
+
+; GFX10-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
+; GFX10-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
+; GFX10-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
+; GFX10-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
+; GFX10: {{exp|export}} param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
+; GFX10-NEXT: {{exp|export}} param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
 define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)
@@ -564,9 +573,9 @@ define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param:
-; PREGFX11: exp pos0
+; PREGFX11: {{exp|export}} pos0
 ; PREGFX11-NOT: s_waitcnt
-; PREGFX11: exp param0
+; PREGFX11: {{exp|export}} param0
 define amdgpu_kernel void @test_export_pos_before_param(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -576,9 +585,9 @@ define amdgpu_kernel void @test_export_pos_before_param(float %x, float %y) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_export_pos4_before_param:
-; GFX10: exp pos4
+; GFX10: {{exp|export}} pos4
 ; GFX10-NOT: s_waitcnt
-; GFX10: exp param0
+; GFX10: {{exp|export}} param0
 define amdgpu_kernel void @test_export_pos4_before_param(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -588,13 +597,13 @@ define amdgpu_kernel void @test_export_pos4_before_param(float %x, float %y) #0
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param_ordered:
-; PREGFX11: exp pos0
-; PREGFX11: exp pos1
-; PREGFX11: exp pos2
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11: {{exp|export}} pos1
+; PREGFX11: {{exp|export}} pos2
 ; PREGFX11-NOT: s_waitcnt
-; PREGFX11: exp param0
-; PREGFX11: exp param1
-; PREGFX11: exp param2
+; PREGFX11: {{exp|export}} param0
+; PREGFX11: {{exp|export}} param1
+; PREGFX11: {{exp|export}} param2
 define amdgpu_kernel void @test_export_pos_before_param_ordered(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -608,9 +617,9 @@ define amdgpu_kernel void @test_export_pos_before_param_ordered(float %x, float
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param_across_load:
-; PREGFX11: exp pos0
-; PREGFX11-NEXT: exp param0
-; PREGFX11-NEXT: exp param1
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11-NEXT: {{exp|export}} param0
+; PREGFX11-NEXT: {{exp|export}} param1
 define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false)
@@ -622,9 +631,9 @@ define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0
 ; GCN-LABEL: {{^}}test_export_across_store_load:
 ; PREGFX11: buffer_store
 ; PREGFX11: buffer_load
-; PREGFX11: exp pos0
-; PREGFX11: exp param0
-; PREGFX11: exp param1
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11: {{exp|export}} param0
+; PREGFX11: {{exp|export}} param1
 define amdgpu_kernel void @test_export_across_store_load(i32 %idx, float %v) #0 {
   %data0 = alloca <4 x float>, align 8, addrspace(5)
   %data1 = alloca <4 x float>, align 8, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
index 52441bcb82f5c..429ee21598384 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
@@ -1,12 +1,13 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=NOPRIM %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
 
 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
 
 ; GCN-LABEL: {{^}}test_export_prim_i32:
 ; NOPRIM: exp invalid_target_20 v0, off, off, off done{{$}}
-; PRIM: exp prim v0, off, off, off done{{$}}
+; PRIM: {{exp|export}} prim v0, off, off, off done{{$}}
 define amdgpu_gs void @test_export_prim_i32(i32 inreg %a) #0 {
   call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %a, i32 undef, i32 undef, i32 undef, i1 true, i1 false)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index 18c711d0b2aec..c62a8882c4245 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
 
 declare void @llvm.amdgcn.exp.row.i32(i32, i32, i32, i32, i32, i32, i1, i32)
 declare void @llvm.amdgcn.exp.row.f32(i32, i32, float, float, float, float, i1, i32)
@@ -13,6 +15,13 @@ define amdgpu_kernel void @undef_i32() #0 {
 ; GFX11-NEXT:    exp pos0 off, off, off, off row_en
 ; GFX11-NEXT:    exp pos1 off, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: undef_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 off, off, off, off row_en
+; GFX12-NEXT:    export pos1 off, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.i32(i32 13, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 true, i32 0)
   ret void
@@ -25,6 +34,13 @@ define amdgpu_kernel void @undef_f32() #0 {
 ; GFX11-NEXT:    exp pos0 off, off, off, off row_en
 ; GFX11-NEXT:    exp pos1 off, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: undef_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 off, off, off, off row_en
+; GFX12-NEXT:    export pos1 off, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.f32(i32 12, i32 0, float undef, float undef, float undef, float undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.f32(i32 13, i32 0, float undef, float undef, float undef, float undef, i1 true, i32 0)
   ret void
@@ -38,6 +54,14 @@ define amdgpu_kernel void @zero_i32() #0 {
 ; GFX11-NEXT:    exp pos0 v0, v0, v0, off row_en
 ; GFX11-NEXT:    exp pos1 v0, v0, v0, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: zero_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, v0, v0, off row_en
+; GFX12-NEXT:    export pos1 v0, v0, v0, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.i32(i32 13, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 true, i32 0)
   ret void
@@ -51,6 +75,14 @@ define amdgpu_kernel void @one_f32() #0 {
 ; GFX11-NEXT:    exp pos0 v0, v0, v0, off row_en
 ; GFX11-NEXT:    exp pos1 v0, v0, v0, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: one_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, v0, v0, off row_en
+; GFX12-NEXT:    export pos1 v0, v0, v0, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.f32(i32 12, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.f32(i32 13, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 true, i32 0)
   ret void
@@ -63,6 +95,13 @@ define amdgpu_kernel void @id_i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 m0, 0
 ; GFX11-NEXT:    exp pos0 v0, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: id_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 0)
   ret void
@@ -77,6 +116,15 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 {
 ; GFX11-NEXT:    s_mov_b32 m0, s0
 ; GFX11-NEXT:    exp pos0 v0, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: id_arg_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 m0, s0
+; GFX12-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 %row)
   ret void
@@ -102,6 +150,25 @@ define amdgpu_kernel void @id_row_i32() #0 {
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GFX11-GISEL-NEXT:    exp pos0 v1, off, off, off done row_en
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: id_row_i32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0x63
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: id_row_i32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x63
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX12-GISEL-NEXT:    export pos0 v1, off, off, off done row_en
+; GFX12-GISEL-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index a26b84e17374a..dad77d1efd3a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
@@ -973,12 +973,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
@@ -996,12 +996,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
   store i32 %result, ptr addrspace(1) %out
@@ -1023,12 +1023,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_one:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_one:
@@ -1046,12 +1046,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_one:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
   store i32 %result, ptr addrspace(1) %out
@@ -1073,12 +1073,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
@@ -1096,12 +1096,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
   store i32 %result, ptr addrspace(1) %out
@@ -1123,12 +1123,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
@@ -1146,12 +1146,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
   store i32 %result, ptr addrspace(1) %out
@@ -1173,12 +1173,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
@@ -1196,12 +1196,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
   store i32 %result, ptr addrspace(1) %out
@@ -1223,12 +1223,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
@@ -1246,12 +1246,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
   store i32 %result, ptr addrspace(1) %out
@@ -1273,12 +1273,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
@@ -1296,12 +1296,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
   store i32 %result, ptr addrspace(1) %out
@@ -1323,12 +1323,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_o:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_o:
@@ -1346,12 +1346,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_o:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
   store i32 %result, ptr addrspace(1) %out
@@ -1373,12 +1373,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
@@ -1396,12 +1396,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
   store i32 %result, ptr addrspace(1) %out
@@ -1423,12 +1423,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_une:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_une:
@@ -1446,12 +1446,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_une:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
   store i32 %result, ptr addrspace(1) %out
@@ -1473,12 +1473,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
@@ -1496,12 +1496,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
   store i32 %result, ptr addrspace(1) %out
@@ -1523,12 +1523,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
@@ -1546,12 +1546,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
   store i32 %result, ptr addrspace(1) %out
@@ -1573,12 +1573,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
@@ -1596,12 +1596,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
   store i32 %result, ptr addrspace(1) %out
@@ -1623,12 +1623,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
@@ -1646,12 +1646,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
   store i32 %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 7e78d8b05d09f..d1883d9196af6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -1059,15 +1059,15 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_oeq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oeq:
@@ -1119,15 +1119,15 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_one:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_one:
@@ -1179,15 +1179,15 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ogt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ogt:
@@ -1239,15 +1239,15 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_oge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oge:
@@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_olt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_olt:
@@ -1359,15 +1359,15 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ole:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ole:
@@ -1419,15 +1419,15 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ueq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ueq:
@@ -1479,15 +1479,15 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_o:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_o:
@@ -1539,15 +1539,15 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_uo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uo:
@@ -1599,15 +1599,15 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_une:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_une:
@@ -1659,15 +1659,15 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ugt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ugt:
@@ -1719,15 +1719,15 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_uge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uge:
@@ -1779,15 +1779,15 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ult:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ult:
@@ -1839,15 +1839,15 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ule:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 78d5da8dda177..453913b334a49 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -45,17 +45,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
 ; GFX11-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX11-NEXT:    scratch_store_b16 off, v0, s0
 ; GFX11-NEXT:    s_endpgm
-; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
-; GISEL-GFX11:       ; %bb.0: ; %entry
-; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    scratch_load_b32 v0, off, s1
-; GISEL-GFX11-NEXT:    scratch_load_b32 v1, off, s2
-; GISEL-GFX11-NEXT:    scratch_load_u16 v2, off, s3
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-GFX11-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-NEXT:    scratch_store_b16 off, v0, s0
-; GISEL-GFX11-NEXT:    s_endpgm
     ptr addrspace(5) %r,
     ptr addrspace(5) %a,
     ptr addrspace(5) %b,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index d7dd0ce58a08f..4a66b761306f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
 
 ; GCN-LABEL: {{^}}global_atomic_csub_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -15,7 +15,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -24,7 +24,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
@@ -34,7 +34,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 291c249e4b738..88e3929819313 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
+declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1))
 
-define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-LABEL: global_load_tr_b64:
+define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b64_v2i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
@@ -24,8 +26,8 @@ entry:
   ret void
 }
 
-define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-LABEL: global_load_tr_b128:
+define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v8i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
@@ -42,3 +44,41 @@ entry:
   store <8 x i16> %val, ptr addrspace(1) %use
   ret void
 }
+
+define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v8f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
+  store <8 x half> %val, ptr addrspace(1) %use
+  ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v8bf16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
+  store <8 x bfloat> %val, ptr addrspace(1) %use
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 12742f4f7127b..a5841e04f6e07 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
 declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
+declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1))
+declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1))
 
-define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-LABEL: global_load_tr_b64:
+define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b64_i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -24,8 +26,8 @@ entry:
   ret void
 }
 
-define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-LABEL: global_load_tr_b128:
+define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v4i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
@@ -42,3 +44,41 @@ entry:
   store <4 x i16> %val, ptr addrspace(1) %use
   ret void
 }
+
+define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v4f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep)
+  store <4 x half> %val, ptr addrspace(1) %use
+  ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128_v4bf16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) %gep)
+  store <4 x bfloat> %val, ptr addrspace(1) %use
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 9e3e393d82e22..d0d759a57a682 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -644,12 +644,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_eq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_eq:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_eq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
   store i32 %result, ptr addrspace(1) %out
@@ -694,12 +694,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_ne:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_ne:
@@ -717,12 +717,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_ne:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
   store i32 %result, ptr addrspace(1) %out
@@ -744,12 +744,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
@@ -767,12 +767,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
   store i32 %result, ptr addrspace(1) %out
@@ -794,12 +794,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_uge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_uge:
@@ -817,12 +817,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_uge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
   store i32 %result, ptr addrspace(1) %out
@@ -844,12 +844,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ult:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ult:
@@ -867,12 +867,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ult:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
   store i32 %result, ptr addrspace(1) %out
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ule:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ule:
@@ -917,12 +917,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ule:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
   store i32 %result, ptr addrspace(1) %out
@@ -944,12 +944,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
@@ -967,12 +967,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
   store i32 %result, ptr addrspace(1) %out
@@ -994,12 +994,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sge:
@@ -1017,12 +1017,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
   store i32 %result, ptr addrspace(1) %out
@@ -1044,12 +1044,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_slt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_slt:
@@ -1067,12 +1067,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_slt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
   store i32 %result, ptr addrspace(1) %out
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sle:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sle:
@@ -1117,12 +1117,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sle:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
   store i32 %result, ptr addrspace(1) %out
@@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    s_cmp_gt_u32 s3, 2
-; GFX10-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX10-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 60e242bf5b0e8..cad3c54ae54b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -716,15 +716,15 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_eq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_eq:
@@ -776,15 +776,15 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_ne:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_ne:
@@ -836,15 +836,15 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ugt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ugt:
@@ -896,15 +896,15 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_uge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_uge:
@@ -956,15 +956,15 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ult:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ult:
@@ -1016,15 +1016,15 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ule:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ule:
@@ -1076,15 +1076,15 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sgt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sgt:
@@ -1136,15 +1136,15 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sge:
@@ -1196,15 +1196,15 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_slt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_slt:
@@ -1256,15 +1256,15 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sle:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sle:
@@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX9-LABEL: v_icmp_i1_ne0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX9-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s7, 2
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
new file mode 100644
index 0000000000000..90dfab501d0a4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @sample_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-LABEL: sample_1d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_1d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+; GFX10PLUS-LABEL: sample_2d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_2d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.2d.nortn.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_3d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+; GFX10PLUS-LABEL: sample_3d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_3d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.3d.nortn.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_cube_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+; GFX10PLUS-LABEL: sample_cube_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_cube_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.cube.nortn.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_1darray_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+; GFX10PLUS-LABEL: sample_1darray_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_1darray_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.1darray.nortn.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_2darray_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
+; GFX10PLUS-LABEL: sample_2darray_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_2darray_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.2darray.nortn.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_b_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+; GFX10PLUS-LABEL: sample_b_1d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample_b off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_b_1d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample_b off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.b.1d.nortn.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_b_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+; GFX10PLUS-LABEL: sample_b_2d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample_b off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_b_2d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample_b off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.b.2d.nortn.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_c_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+; GFX10PLUS-LABEL: sample_c_1d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample_c off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_c_1d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample_c off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.c.1d.nortn.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_c_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+; GFX10PLUS-LABEL: sample_c_2d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample_c off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_c_2d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample_c off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.c.2d.nortn.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_d_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
+; GFX10PLUS-LABEL: sample_d_1d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_sample_d off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_d_1d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    image_sample_d off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.d.1d.nortn.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_d_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+; GFX10PLUS-LABEL: sample_d_2d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_sample_d off, v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_d_2d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    image_sample_d off, [v0, v1, v2, v[3:5]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.d.2d.nortn.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_l_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
+; GFX10PLUS-LABEL: sample_l_1d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_sample_l off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_l_1d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    image_sample_l off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.l.1d.nortn.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_l_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
+; GFX10PLUS-LABEL: sample_l_2d_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_sample_l off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_l_2d_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    image_sample_l off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.l.2d.nortn.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps <4 x float> @sample_nortn_mix_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-LABEL: sample_nortn_mix_1:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: sample_nortn_mix_1:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_nortn_mix_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-LABEL: sample_nortn_mix_2:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10PLUS-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: sample_nortn_mix_2:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    v_mov_b32_e32 v4, v0
+; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_nortn_mix_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-SDAG-LABEL: sample_nortn_mix_3:
+; GFX10PLUS-SDAG:       ; %bb.0: ; %main_body
+; GFX10PLUS-SDAG-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-SDAG-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-SDAG-NEXT:    image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10PLUS-SDAG-NEXT:    image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-GISEL-LABEL: sample_nortn_mix_3:
+; GFX10PLUS-GISEL:       ; %bb.0: ; %main_body
+; GFX10PLUS-GISEL-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-GISEL-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-GISEL-NEXT:    image_sample v[1:4], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10PLUS-GISEL-NEXT:    image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-GISEL-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: sample_nortn_mix_3:
+; GFX12-SDAG:       ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-SDAG-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-SDAG-NEXT:    image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x1
+; GFX12-SDAG-NEXT:    image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: sample_nortn_mix_3:
+; GFX12-GISEL:       ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-GISEL-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-GISEL-NEXT:    image_sample v[1:4], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT:    image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %v.0 = extractelement <4 x float> %v, i32 0
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %u = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %v.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %u
+}
+
+define amdgpu_ps <4 x float> @sample_nortn_mix_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-SDAG-LABEL: sample_nortn_mix_4:
+; GFX10PLUS-SDAG:       ; %bb.0: ; %main_body
+; GFX10PLUS-SDAG-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-SDAG-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-SDAG-NEXT:    image_sample v4, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10PLUS-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-GISEL-LABEL: sample_nortn_mix_4:
+; GFX10PLUS-GISEL:       ; %bb.0: ; %main_body
+; GFX10PLUS-GISEL-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10PLUS-GISEL-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-GISEL-NEXT:    image_sample v[4:7], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10PLUS-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-GISEL-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: sample_nortn_mix_4:
+; GFX12-SDAG:       ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-SDAG-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-SDAG-NEXT:    image_sample v4, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x1
+; GFX12-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x2
+; GFX12-SDAG-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: sample_nortn_mix_4:
+; GFX12-GISEL:       ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-GISEL-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-GISEL-NEXT:    image_sample v[4:7], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x2
+; GFX12-GISEL-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %v.0 = extractelement <4 x float> %v, i32 0
+  %v.1 = extractelement <4 x float> %v, i32 0
+  %v.2 = extractelement <4 x float> %v, i32 0
+  %v.3 = extractelement <4 x float> %v, i32 0
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %v.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %u = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %v.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %u.0 = extractelement <4 x float> %u, i32 0
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %v.2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %v.3, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 15, float %u.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %u
+}
+
+define amdgpu_ps void @sample_d_1d_g16_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+; GFX10PLUS-LABEL: sample_d_1d_g16_nortn:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_d_1d_g16_nortn:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.image.sample.1d.nortn.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.2d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.3d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.cube.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.1darray.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.2darray.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.b.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.b.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.c.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.c.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.d.1d.f32.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.d.2d.f32.nortn.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.l.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.l.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
+; GFX11: {{.*}}
+; GFX11-GISEL: {{.*}}
+; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index 3a540bdf5b53d..b61ca566bdfad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}full_mask:
 ; GCN: s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
index d3961c04838cf..d4ae040ce6128 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
 
 ; GCN-LABEL: {{^}}test_init_exec:
 ; GFX1032: s_mov_b32 exec_lo, 0x12345
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 2d01703c78d78..0393a551dcd41 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -163,7 +163,7 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
@@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -182,7 +182,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -262,15 +262,15 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -278,7 +278,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -352,9 +352,10 @@ main_body:
 define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1013-NEXT:    s_clause 0x1
+; GFX1013-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x34
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
@@ -365,13 +366,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT:    v_add_co_u32 v0, s0, s0, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1013-NEXT:    v_bfrev_b32_e32 v1, 4.0
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -447,9 +448,10 @@ main_body:
 define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1013-NEXT:    s_clause 0x1
+; GFX1013-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x34
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
@@ -457,13 +459,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x46004500
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT:    v_add_co_u32 v0, s0, s0, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 ; GFX1013-NEXT:    v_bfrev_b32_e32 v1, 4.0
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
index 71ed71cd84bcd..3781faa54e7dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11,SDAG %s
+
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=1 < %s 2>&1 | FileCheck  -check-prefix=GISEL-ERR %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=0 < %s 2>&1 | FileCheck  -check-prefix=SDAG-ERR %s
+
+; GISEL-ERR: LLVM ERROR: cannot select: {{.*}}  = G_INTRINSIC intrinsic(@llvm.amdgcn.inverse.ballot)
+; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.inverse.ballot
 
 declare i1 @llvm.amdgcn.inverse.ballot(i32)
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index 2e3dc11feed1e..29218a3625216 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
+
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -global-isel=1 < %s 2>&1 | FileCheck  -check-prefix=GISEL-ERR %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -global-isel=0 < %s 2>&1 | FileCheck  -check-prefix=SDAG-ERR %s
+
+; GISEL-ERR: LLVM ERROR: cannot select: {{.*}}  = G_INTRINSIC intrinsic(@llvm.amdgcn.inverse.ballot)
+; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.inverse.ballot
 
 declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 63d272607563c..e0dacd47e51c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}gs_const:
 ; GCN-NOT: v_cmpx
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 8e9a652ae8a8e..aad74410d1453 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -384,26 +384,26 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64:
@@ -469,26 +469,26 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64:
@@ -601,28 +601,28 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64:
@@ -742,28 +742,28 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
 define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64:
@@ -933,30 +933,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s4, s5
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s4, s5
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s5
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s4, s5
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlane16_b32_vvv_i64:
@@ -1104,30 +1104,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s4, s5
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s4, s5
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s5
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s4, s5
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlane16_b32_vvv_f64:
@@ -1179,24 +1179,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl
 define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32:
@@ -1337,24 +1337,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %
 define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32:
@@ -1493,27 +1493,16 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlane16_b32_vsv_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -1693,27 +1682,16 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlane16_b32_vsv_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -3117,26 +3095,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64:
@@ -3202,26 +3180,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64:
@@ -3380,28 +3358,28 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64:
@@ -3475,28 +3453,28 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64:
@@ -3762,30 +3740,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlanex16_b32_vvv_i64:
@@ -3837,30 +3815,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlanex16_b32_vvv_f64:
@@ -3912,24 +3890,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub
 define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
@@ -3996,24 +3974,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
@@ -4226,27 +4204,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -4314,27 +4281,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
 ; GFX11-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
new file mode 100644
index 0000000000000..419e19083f85e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -0,0 +1,313 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB1_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB2_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB3_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:  .LBB4_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_or_b32 s0, s1, s0
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i64:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB5_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.zext = zext i32 %id to i64
+  br label %bb1
+bb1:
+  %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i64 %load, %id.zext
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB6_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %bitcast = bitcast <2 x i16> %load to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB7_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+  %bitcast = bitcast <2 x i16> %shortened to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB8_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %extracted = extractelement <4 x i32> %load, i32 3
+  %cmp = icmp eq i32 %extracted, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_ptr:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB9_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %elem = load i32, ptr %load
+  %cmp = icmp eq i32 %elem, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
+declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
+declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
new file mode 100644
index 0000000000000..6541ac9553231
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -0,0 +1,313 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB1_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB2_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 4, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB3_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 4)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:  .LBB4_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_or_b32 s0, s1, s0
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB5_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.zext = zext i32 %id to i64
+  br label %bb1
+bb1:
+  %load = call i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i64 %load, %id.zext
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB6_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
+  %bitcast = bitcast <2 x i16> %load to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB7_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
+  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+  %bitcast = bitcast <2 x i16> %shortened to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB8_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
+  %extracted = extractelement <4 x i32> %load, i32 3
+  %cmp = icmp eq i32 %extracted, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB9_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
+  %elem = load i32, ptr %load
+  %cmp = icmp eq i32 %elem, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.amdgcn.raw.ptr.atom.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
+declare i64 @llvm.amdgcn.raw.ptr.atom.buffer.load.i64(ptr addrspace(8), i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.raw.ptr.atom.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32 immarg)
+declare <4 x i16> @llvm.amdgcn.raw.ptr.atom.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32 immarg)
+declare <4 x i32> @llvm.amdgcn.raw.ptr.atom.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32 immarg)
+declare ptr @llvm.amdgcn.raw.ptr.atom.buffer.load.ptr(ptr addrspace(8), i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index e2f494283a3f2..7371d498a7070 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -808,11 +808,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
 ; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1032GISEL-NEXT:    s_endpgm
 ;
 ; GFX1164DAGISEL-LABEL: divergent_cfg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 5304188e02f84..60af21524a04a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -809,11 +809,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
 ; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1032GISEL-NEXT:    s_endpgm
 ;
 ; GFX1164DAGISEL-LABEL: divergent_cfg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 24b8a3c2dc873..b8a4674833cee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -29,19 +29,19 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -83,33 +83,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -151,14 +151,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -194,10 +194,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
@@ -208,10 +208,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:80
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
@@ -228,8 +228,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; GCN-NEXT:    v_mul_lo_u32 v18, v18, v18
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
@@ -245,14 +245,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -261,12 +261,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -275,10 +275,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
@@ -289,10 +289,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(2)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
@@ -309,8 +309,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v18, v18, v18
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
@@ -326,14 +326,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -385,12 +385,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -398,8 +398,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -407,30 +407,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:80
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -442,8 +442,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -455,8 +455,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -468,18 +468,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
@@ -487,8 +487,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
@@ -496,30 +496,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -531,8 +531,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -544,8 +544,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -557,7 +557,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
new file mode 100644
index 0000000000000..2a979976d806c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+
+define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %addr) {
+; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB1_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 15, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_i32_off:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB2_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_i32_soff:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB3_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 4, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB4_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 4)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_nonatomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:  .LBB5_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_or_b32 s0, s1, s0
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_i64:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB6_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.zext = zext i32 %id to i64
+  br label %bb1
+bb1:
+  %load = call i64 @llvm.amdgcn.struct.atomic.buffer.load.i64(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i64 %load, %id.zext
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_v2i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB7_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <2 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
+  %bitcast = bitcast <2 x i16> %load to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_v4i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB8_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
+  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+  %bitcast = bitcast <2 x i16> %shortened to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_v4i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB9_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i32> @llvm.amdgcn.struct.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
+  %extracted = extractelement <4 x i32> %load, i32 3
+  %cmp = icmp eq i32 %extracted, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %index) {
+; CHECK-LABEL: struct_atomic_buffer_load_ptr:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB10_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_load_b32 v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB10_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call ptr @llvm.amdgcn.struct.atomic.buffer.load.ptr(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
+  %elem = load i32, ptr %load
+  %cmp = icmp eq i32 %elem, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.struct.atom.buffer.load.i32(<4 x i32>, i32, i32, i32, i32 immarg)
+declare i64 @llvm.amdgcn.struct.atom.buffer.load.i64(<4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.struct.atom.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32 immarg)
+declare <4 x i16> @llvm.amdgcn.struct.atom.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32 immarg)
+declare <4 x i32> @llvm.amdgcn.struct.atom.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32 immarg)
+declare ptr @llvm.amdgcn.struct.atom.buffer.load.ptr(<4 x i32>, i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
new file mode 100644
index 0000000000000..70296a0a7bec6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 %index, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrspace(8) %ptr) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB1_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 15, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB2_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 %index, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB3_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 4, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB4_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 4)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_nonatomic_buffer_load_i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:  .LBB5_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_or_b32 s0, s1, s0
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB6_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.zext = zext i32 %id to i64
+  br label %bb1
+bb1:
+  %load = call i64 @llvm.amdgcn.struct.ptr.atomic.buffer.load.i64(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i64 %load, %id.zext
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB7_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <2 x i16> @llvm.amdgcn.struct.ptr.atomic.buffer.load.v2i16(ptr addrspace(8) %ptr, i32 %index, i32 0, i32 0, i32 1)
+  %bitcast = bitcast <2 x i16> %load to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB8_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i16> @llvm.amdgcn.struct.ptr.atomic.buffer.load.v4i16(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 1)
+  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+  %bitcast = bitcast <2 x i16> %shortened to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB9_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i32> @llvm.amdgcn.struct.ptr.atomic.buffer.load.v4i32(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 1)
+  %extracted = extractelement <4 x i32> %load, i32 3
+  %cmp = icmp eq i32 %extracted, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr, i32 %index) {
+; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:  .LBB10_1: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_load_b32 v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB10_1
+; CHECK-NEXT:  ; %bb.2: ; %bb2
+; CHECK-NEXT:    s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call ptr @llvm.amdgcn.struct.ptr.atomic.buffer.load.ptr(ptr addrspace(8) %ptr, i32 %index, i32 4, i32 0, i32 1)
+  %elem = load i32, ptr %load
+  %cmp = icmp eq i32 %elem, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.struct.ptr.atom.buffer.load.i32(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare i64 @llvm.amdgcn.struct.ptr.atom.buffer.load.i64(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.struct.ptr.atom.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare <4 x i16> @llvm.amdgcn.struct.ptr.atom.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare <4 x i32> @llvm.amdgcn.struct.ptr.atom.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare ptr @llvm.amdgcn.struct.ptr.atom.buffer.load.ptr(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index b678378e55545..a9a710f9b6723 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -287,7 +287,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -300,7 +300,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -314,7 +314,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -327,7 +327,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -405,7 +405,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -418,7 +418,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -431,7 +431,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -444,7 +444,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -522,7 +522,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -535,7 +535,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -548,7 +548,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -561,7 +561,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index a538364bf2ec4..3d5e6dca45618 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 368d637f8ac1b..d11aaf0036b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index d37b4b41fac3d..684ca3aac7c31 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
 
 define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
 ; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 5514efa6838e7..0b18e5f35a310 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -46,26 +46,26 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: cos_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX9-NEXT:    v_cos_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cos_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX10-NEXT:    v_cos_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cos_f16:
@@ -142,34 +142,34 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: cos_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_cos_f16_e32 v2, v3
 ; GFX9-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cos_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cos_f16_e32 v2, v3
 ; GFX10-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cos_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 142145098df87..4bed23487445a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX900-SDAG-LABEL: s_exp_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 4d981d27c309e..ec7e52532cd32 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -444,59 +444,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp10_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -506,18 +506,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc23369f4
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 9f80e66e8f873..32b599e63c61d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -271,25 +271,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
@@ -538,10 +538,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-LABEL: s_exp2_v3f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -562,7 +562,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_exp2_v3f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index edcdd323cb0ae..92658e660ea67 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -104,60 +104,60 @@ define amdgpu_kernel void @fmuladd_f16(
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s3
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s9
 ; GFX10-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; GFX10-FLUSH-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s3
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s11
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
-; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[8:11], 0
+; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[0:3], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16:
@@ -722,60 +722,60 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_v2f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s3
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s9
 ; GFX10-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; GFX10-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT:    buffer_load_dword v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT:    buffer_load_dword v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX10-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_v2f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s3
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s11
 ; GFX10-DENORM-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; GFX10-DENORM-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; GFX10-DENORM-NEXT:    buffer_load_dword v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-DENORM-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX10-DENORM-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index a807885e0d853..e05f3f1e65ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -139,12 +139,12 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 ;
 ; GFX9CHECK-LABEL: sgpr_isnan_f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9CHECK-NEXT:    s_endpgm
 ;
 ; GFX10CHECK-LABEL: sgpr_isnan_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 7ca04cc235605..50c52037dc4d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -314,25 +314,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_log2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
@@ -630,10 +630,10 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-LABEL: s_log2_v3f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
@@ -654,7 +654,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: s_log2_v3f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index c7913f638798a..2c9ce001b8c4d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -194,40 +194,40 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ;
 ; GFX9-LABEL: maxnum_f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_f16_imm_a:
@@ -302,40 +302,40 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ;
 ; GFX9-LABEL: maxnum_f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_f16_imm_b:
@@ -524,29 +524,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
 ;
 ; GFX9-LABEL: maxnum_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_v2f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_v2f16_imm_a:
@@ -614,29 +614,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
 ;
 ; GFX9-LABEL: maxnum_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_v2f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_v2f16_imm_b:
@@ -1007,36 +1007,36 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmax_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
 ; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
 ; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fmax_v4f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
 ; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmax_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 0a004fd7701cf..59508e049a3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -221,40 +221,40 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ;
 ; GFX9-LABEL: minnum_f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_f16_imm_a:
@@ -328,40 +328,40 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ;
 ; GFX9-LABEL: minnum_f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_f16_imm_b:
@@ -583,29 +583,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
 ;
 ; GFX9-LABEL: minnum_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_v2f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_v2f16_imm_a:
@@ -672,29 +672,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
 ;
 ; GFX9-LABEL: minnum_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_v2f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_v2f16_imm_b:
@@ -1062,36 +1062,36 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmin_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
 ; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
 ; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fmin_v4f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
 ; GFX10-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmin_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 53ea253035655..2d3b18dcb121c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ;
 ; GFX9-LABEL: umulo_i64_s:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s7, s0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT:    s_add_u32 s9, s8, s7
-; GFX9-NEXT:    s_mul_i32 s6, s1, s2
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT:    s_add_u32 s9, s9, s6
-; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
-; GFX9-NEXT:    s_addc_u32 s4, s5, s4
-; GFX9-NEXT:    s_addc_u32 s5, s10, 0
-; GFX9-NEXT:    s_mul_i32 s1, s1, s3
-; GFX9-NEXT:    s_add_u32 s4, s4, s1
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_add_i32 s1, s8, s7
-; GFX9-NEXT:    s_add_i32 s1, s1, s6
-; GFX9-NEXT:    s_mul_i32 s0, s0, s2
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
-; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_mul_i32 s3, s4, s7
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT:    s_add_u32 s9, s8, s3
+; GFX9-NEXT:    s_mul_i32 s2, s5, s6
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT:    s_add_u32 s9, s9, s2
+; GFX9-NEXT:    s_mul_hi_u32 s10, s5, s7
+; GFX9-NEXT:    s_addc_u32 s0, s1, s0
+; GFX9-NEXT:    s_addc_u32 s1, s10, 0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s7
+; GFX9-NEXT:    s_add_u32 s0, s0, s5
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_add_i32 s3, s8, s3
+; GFX9-NEXT:    s_add_i32 s3, s3, s2
+; GFX9-NEXT:    s_mul_i32 s2, s4, s6
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s3
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: umulo_i64_s:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s7, s0, s3
-; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_mul_i32 s6, s1, s2
-; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
-; GFX10-NEXT:    s_mul_i32 s1, s1, s3
-; GFX10-NEXT:    s_add_u32 s3, s8, s7
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_add_u32 s3, s3, s6
-; GFX10-NEXT:    s_addc_u32 s3, s5, s4
-; GFX10-NEXT:    s_addc_u32 s5, s9, 0
-; GFX10-NEXT:    s_add_u32 s4, s3, s1
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_add_i32 s1, s8, s7
-; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    s_add_i32 s1, s1, s6
-; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX10-NEXT:    s_mul_i32 s3, s4, s7
+; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT:    s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT:    s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT:    s_mul_i32 s2, s5, s6
+; GFX10-NEXT:    s_mul_hi_u32 s9, s5, s7
+; GFX10-NEXT:    s_mul_i32 s5, s5, s7
+; GFX10-NEXT:    s_add_u32 s7, s8, s3
+; GFX10-NEXT:    s_addc_u32 s1, 0, s1
+; GFX10-NEXT:    s_add_u32 s7, s7, s2
+; GFX10-NEXT:    s_addc_u32 s0, s1, s0
+; GFX10-NEXT:    s_addc_u32 s1, s9, 0
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, 0, s1
+; GFX10-NEXT:    s_add_i32 s3, s8, s3
+; GFX10-NEXT:    s_mul_i32 s4, s4, s6
+; GFX10-NEXT:    s_add_i32 s3, s3, s2
+; GFX10-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s4
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
@@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ;
 ; GFX9-LABEL: smulo_i64_s:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s7, s0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT:    s_add_u32 s9, s8, s7
-; GFX9-NEXT:    s_mul_i32 s6, s1, s2
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT:    s_add_u32 s9, s9, s6
-; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT:    s_addc_u32 s4, s5, s4
-; GFX9-NEXT:    s_addc_u32 s5, s10, 0
-; GFX9-NEXT:    s_mul_i32 s9, s1, s3
-; GFX9-NEXT:    s_add_u32 s4, s4, s9
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_sub_u32 s9, s4, s2
-; GFX9-NEXT:    s_subb_u32 s10, s5, 0
-; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_cselect_b32 s1, s10, s5
-; GFX9-NEXT:    s_sub_u32 s9, s4, s0
-; GFX9-NEXT:    s_subb_u32 s5, s1, 0
-; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s1
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_add_i32 s1, s8, s7
-; GFX9-NEXT:    s_add_i32 s1, s1, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    s_mul_i32 s0, s0, s2
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
-; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_mul_i32 s3, s4, s7
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT:    s_add_u32 s9, s8, s3
+; GFX9-NEXT:    s_mul_i32 s2, s5, s6
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT:    s_add_u32 s9, s9, s2
+; GFX9-NEXT:    s_mul_hi_i32 s10, s5, s7
+; GFX9-NEXT:    s_addc_u32 s0, s1, s0
+; GFX9-NEXT:    s_addc_u32 s1, s10, 0
+; GFX9-NEXT:    s_mul_i32 s9, s5, s7
+; GFX9-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_sub_u32 s9, s0, s6
+; GFX9-NEXT:    s_subb_u32 s10, s1, 0
+; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s9, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s10, s1
+; GFX9-NEXT:    s_sub_u32 s5, s0, s4
+; GFX9-NEXT:    s_subb_u32 s9, s1, 0
+; GFX9-NEXT:    s_cmp_lt_i32 s7, 0
+; GFX9-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX9-NEXT:    s_add_i32 s3, s8, s3
+; GFX9-NEXT:    s_add_i32 s5, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_mul_i32 s4, s4, s6
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s5
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: smulo_i64_s:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s7, s0, s3
-; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT:    s_mul_i32 s6, s1, s2
-; GFX10-NEXT:    s_add_u32 s11, s8, s7
-; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT:    s_add_u32 s11, s11, s6
-; GFX10-NEXT:    s_mul_i32 s10, s1, s3
-; GFX10-NEXT:    s_addc_u32 s4, s5, s4
-; GFX10-NEXT:    s_addc_u32 s5, s9, 0
-; GFX10-NEXT:    s_add_u32 s4, s4, s10
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_sub_u32 s9, s4, s2
-; GFX10-NEXT:    s_subb_u32 s10, s5, 0
-; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX10-NEXT:    s_cselect_b32 s1, s9, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s10, s5
-; GFX10-NEXT:    s_sub_u32 s9, s1, s0
-; GFX10-NEXT:    s_subb_u32 s5, s4, 0
-; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s9, s1
-; GFX10-NEXT:    s_add_i32 s1, s8, s7
-; GFX10-NEXT:    s_add_i32 s1, s1, s6
-; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX10-NEXT:    s_mov_b32 s7, s6
-; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX10-NEXT:    s_mul_i32 s3, s4, s7
+; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT:    s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT:    s_mul_i32 s2, s5, s6
+; GFX10-NEXT:    s_add_u32 s11, s8, s3
+; GFX10-NEXT:    s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT:    s_addc_u32 s1, 0, s1
+; GFX10-NEXT:    s_mul_hi_i32 s9, s5, s7
+; GFX10-NEXT:    s_add_u32 s11, s11, s2
+; GFX10-NEXT:    s_mul_i32 s10, s5, s7
+; GFX10-NEXT:    s_addc_u32 s0, s1, s0
+; GFX10-NEXT:    s_addc_u32 s1, s9, 0
+; GFX10-NEXT:    s_add_u32 s0, s0, s10
+; GFX10-NEXT:    s_addc_u32 s1, 0, s1
+; GFX10-NEXT:    s_sub_u32 s9, s0, s6
+; GFX10-NEXT:    s_subb_u32 s10, s1, 0
+; GFX10-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s0, s9, s0
+; GFX10-NEXT:    s_cselect_b32 s1, s10, s1
+; GFX10-NEXT:    s_sub_u32 s5, s0, s4
+; GFX10-NEXT:    s_subb_u32 s9, s1, 0
+; GFX10-NEXT:    s_cmp_lt_i32 s7, 0
+; GFX10-NEXT:    s_mul_i32 s4, s4, s6
+; GFX10-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX10-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX10-NEXT:    s_add_i32 s3, s8, s3
+; GFX10-NEXT:    s_add_i32 s5, s3, s2
+; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX10-NEXT:    s_mov_b32 s3, s2
+; GFX10-NEXT:    s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s4
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 47dd0263d020e..bf7dbcde62fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -28,23 +28,41 @@ define amdgpu_kernel void @rint_f16(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: rint_f16:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; VI-LABEL: rint_f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_rndne_f16_e32 v0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: rint_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rint_f16:
 ; GFX11:       ; %bb.0: ; %entry
@@ -131,22 +149,22 @@ define amdgpu_kernel void @rint_v2f16(
 ;
 ; GFX9-LABEL: rint_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rint_v2f16:
@@ -180,3 +198,5 @@ entry:
   store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index d5b4f879bf8a0..e7b17c30cf753 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
 
@@ -24,23 +24,41 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dword s6, s[2:3], 0x2c
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX89-NEXT:    s_brev_b32 s4, -2
-; GFX89-NEXT:    v_mov_b32_e32 v2, s6
-; GFX89-NEXT:    v_bfi_b32 v1, s4, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_f32:
 ; GFX11:       ; %bb.0:
@@ -115,31 +133,57 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_v2f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    s_brev_b32 s8, -2
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX89-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s3
-; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s2
-; GFX89-NEXT:    v_sub_f32_e32 v2, s2, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v3, s2
-; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    s_brev_b32 s8, -2
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX8-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s2
+; GFX8-NEXT:    v_sub_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_brev_b32 s8, -2
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v2, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
@@ -235,44 +279,83 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_v4f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX89-NEXT:    s_brev_b32 s10, -2
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s7
-; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s6
-; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX89-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX89-NEXT:    v_mov_b32_e32 v4, s5
-; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v4
-; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX89-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX89-NEXT:    v_mov_b32_e32 v5, s4
-; GFX89-NEXT:    v_bfi_b32 v4, s10, v4, v5
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_v4f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8-NEXT:    s_brev_b32 s10, -2
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX8-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX8-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v4
+; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX8-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_bfi_b32 v4, s10, v4, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_brev_b32 s10, -2
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX9-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX9-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    v_bfi_b32 v4, s10, v4, v5
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v4f32:
 ; GFX11:       ; %bb.0:
@@ -421,73 +504,141 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_v8f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
-; GFX89-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
-; GFX89-NEXT:    s_brev_b32 s2, -2
-; GFX89-NEXT:    s_mov_b32 s15, 0xf000
-; GFX89-NEXT:    s_mov_b32 s14, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s7
-; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s6
-; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX89-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v4, s5
-; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v4
-; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX89-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v5, s4
-; GFX89-NEXT:    v_bfi_b32 v4, s2, v4, v5
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX89-NEXT:    v_trunc_f32_e32 v4, s11
-; GFX89-NEXT:    v_sub_f32_e32 v5, s11, v4
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v6, s11
-; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX89-NEXT:    v_add_f32_e32 v7, v4, v5
-; GFX89-NEXT:    v_trunc_f32_e32 v4, s10
-; GFX89-NEXT:    v_sub_f32_e32 v5, s10, v4
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v6, s10
-; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX89-NEXT:    v_add_f32_e32 v6, v4, v5
-; GFX89-NEXT:    v_trunc_f32_e32 v4, s9
-; GFX89-NEXT:    v_sub_f32_e32 v5, s9, v4
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v8, s9
-; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v8
-; GFX89-NEXT:    v_add_f32_e32 v5, v4, v5
-; GFX89-NEXT:    v_trunc_f32_e32 v4, s8
-; GFX89-NEXT:    v_sub_f32_e32 v8, s8, v4
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v9, s8
-; GFX89-NEXT:    v_bfi_b32 v8, s2, v8, v9
-; GFX89-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
-; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_v8f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
+; GFX8-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
+; GFX8-NEXT:    s_brev_b32 s2, -2
+; GFX8-NEXT:    s_mov_b32 s15, 0xf000
+; GFX8-NEXT:    s_mov_b32 s14, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX8-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX8-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v4
+; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX8-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_bfi_b32 v4, s2, v4, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_trunc_f32_e32 v4, s11
+; GFX8-NEXT:    v_sub_f32_e32 v5, s11, v4
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v6, s11
+; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX8-NEXT:    v_add_f32_e32 v7, v4, v5
+; GFX8-NEXT:    v_trunc_f32_e32 v4, s10
+; GFX8-NEXT:    v_sub_f32_e32 v5, s10, v4
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v6, s10
+; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX8-NEXT:    v_add_f32_e32 v6, v4, v5
+; GFX8-NEXT:    v_trunc_f32_e32 v4, s9
+; GFX8-NEXT:    v_sub_f32_e32 v5, s9, v4
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s9
+; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v8
+; GFX8-NEXT:    v_add_f32_e32 v5, v4, v5
+; GFX8-NEXT:    v_trunc_f32_e32 v4, s8
+; GFX8-NEXT:    v_sub_f32_e32 v8, s8, v4
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v9, s8
+; GFX8-NEXT:    v_bfi_b32 v8, s2, v8, v9
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v8
+; GFX8-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_v8f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
+; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
+; GFX9-NEXT:    s_brev_b32 s2, -2
+; GFX9-NEXT:    s_mov_b32 s15, 0xf000
+; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX9-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX9-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    v_bfi_b32 v4, s2, v4, v5
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, s11
+; GFX9-NEXT:    v_sub_f32_e32 v5, s11, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX9-NEXT:    v_add_f32_e32 v7, v4, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v4, s10
+; GFX9-NEXT:    v_sub_f32_e32 v5, s10, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX9-NEXT:    v_add_f32_e32 v6, v4, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v4, s9
+; GFX9-NEXT:    v_sub_f32_e32 v5, s9, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v8
+; GFX9-NEXT:    v_add_f32_e32 v5, v4, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v4, s8
+; GFX9-NEXT:    v_sub_f32_e32 v8, s8, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v9, s8
+; GFX9-NEXT:    v_bfi_b32 v8, s2, v8, v9
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v8f32:
 ; GFX11:       ; %bb.0:
@@ -646,24 +797,43 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_f16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dword s4, s[2:3], 0x2c
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX89-NEXT:    v_mov_b32_e32 v0, 0x3c00
-; GFX89-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f16_e32 v1, s4
-; GFX89-NEXT:    v_sub_f16_e32 v2, s4, v1
-; GFX89-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX89-NEXT:    v_mov_b32_e32 v2, s4
-; GFX89-NEXT:    v_bfi_b32 v0, s5, v0, v2
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f16_e32 v1, s4
+; GFX8-NEXT:    v_sub_f16_e32 v2, s4, v1
+; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_bfi_b32 v0, s5, v0, v2
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, s4
+; GFX9-NEXT:    v_sub_f16_e32 v2, s4, v1
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v2
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_f16:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index a70f4d8d90065..3ae0cf65eb00f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -46,26 +46,26 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: sin_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sin_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: sin_f16:
@@ -142,34 +142,34 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: sin_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_sin_f16_e32 v2, v3
 ; GFX9-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sin_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_sin_f16_e32 v2, v3
 ; GFX10-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: sin_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
index 66c68f7cc731e..57028a0f9b14f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
@@ -17,10 +17,10 @@ define void @loads(ptr addrspace(8) %buf) {
 ; CHECK-NEXT:    [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]]
 ; CHECK-NEXT:    fence syncscope("wavefront") release
-; CHECK-NEXT:    [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
+; CHECK-NEXT:    [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
 ; CHECK-NEXT:    fence syncscope("wavefront") acquire
-; CHECK-NEXT:    [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
-; CHECK-NEXT:    [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
+; CHECK-NEXT:    [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
+; CHECK-NEXT:    [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
 ; CHECK-NEXT:    fence acquire
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 7998d430d5f90..0fb9e2572446b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,18 +19,18 @@ $_f2 = comdat any
 define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 {
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 2
-; GCN-NEXT:    ds_write_b8 v0, v1
-; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
-; GCN-NEXT:    ds_read_u16 v3, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, 2
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    ds_write_b8 v1, v0
+; GCN-NEXT:    ds_read_u8 v2, v1 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v1
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
-; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
+; GCN-NEXT:    ds_write_b8 v1, v2 offset:6
+; GCN-NEXT:    ds_write_b16 v1, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GCN-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: define protected amdgpu_kernel void @test(
 ; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
@@ -47,7 +47,6 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
 ; CHECK-NEXT:    [[FROMBOOL8:%.*]] = zext i1 [[TMP2]] to i8
 ; CHECK-NEXT:    store i8 [[FROMBOOL8]], ptr addrspace(1) [[PTR_COERCE]], align 1
 ; CHECK-NEXT:    ret void
-;
 entry:
   store i8 3, ptr addrspace(3) @_f1, align 1
   tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) @_f2, ptr addrspace(3) noundef align 1 dereferenceable(3) @_f1, i64 3, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 5cb57ee112b3a..9e2e37a886d1f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
 
 declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
 declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
 declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
@@ -87,30 +90,25 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 ;
 ; ALL-LABEL: @max_size_small_static_memmove_caller0(
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
-; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -121,30 +119,37 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
 ; OPT-LABEL: @min_size_large_static_memmove_caller0(
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; OPT-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
-; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; OPT:       copy_backwards:
-; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; OPT:       copy_backwards_loop:
-; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
-; OPT-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; OPT:       copy_forward:
-; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; OPT:       copy_forward_loop:
-; OPT-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
-; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_residual:
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 64
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_fwd_residual:
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_DONE]]
 ; OPT:       memmove_done:
 ; OPT-NEXT:    ret void
 ;
@@ -1333,30 +1338,25 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
 ; ALL-LABEL: @memmove_flat_align1_global_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1372,30 +1372,25 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
 ; ALL-LABEL: @memmove_global_align1_flat_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1411,30 +1406,25 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
 ; ALL-LABEL: @memmove_flat_align1_private_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1450,30 +1440,25 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
 ; ALL-LABEL: @memmove_private_align1_flat_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1734,6 +1719,477 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
   ret void
 }
 
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_flat_align1_local_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_flat_align1_local_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr [[TMP7]], align 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) {
+; MAX1024-LABEL: @memmove_local_align1_flat_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_local_align1_flat_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_local_align1_local_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_local_align1_local_align1(
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 32
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_private_align1_private_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_private_align1_private_align1(
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 16
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
+; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 15
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_empty(
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 65, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 65
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_full(
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_residual:
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 64
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_fwd_residual:
+; OPT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
+; OPT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
+; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
+; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
+; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
+; OPT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
+; OPT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT:    [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
+; OPT-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT:    store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_DONE]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false)
+  ret void
+}
+
 define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
 ; OPT-LABEL: @test_umin(
 ; OPT-NEXT:  entry:
@@ -1783,30 +2239,25 @@ define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace
 ;
 ; ALL-LABEL: @memmove_volatile(
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 64, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 64, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store volatile i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load volatile i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store volatile i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 64
-; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 4, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 4
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 8009f917aef5a..f90753652baa5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
@@ -126,10 +126,21 @@ define amdgpu_cs void @caller() {
 declare amdgpu_gfx void @callee(i32)
 
 define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
-; GFX9-LABEL: workgroup_ids_gfx:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: workgroup_ids_gfx:
+; GFX9-SDAG:           ; %bb.0:
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[0:1], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_gfx:
+; GFX9-GISEL:          ; %bb.0:
+; GFX9-GISEL-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:       s_setpc_b64 s[30:31]
 ;
 ; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
 ; GFX9ARCH-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lrint.ll b/llvm/test/CodeGen/AMDGPU/lrint.ll
new file mode 100644
index 0000000000000..31e6cf6ea645c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lrint.ll
@@ -0,0 +1,771 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+
+declare float @llvm.rint.f32(float)
+declare i32 @llvm.lrint.i32.f32(float)
+declare i32 @llvm.lrint.i32.f64(double)
+declare i64 @llvm.lrint.i64.f32(float)
+declare i64 @llvm.lrint.i64.f64(double)
+declare i64 @llvm.llrint.i64.f32(float)
+declare half @llvm.rint.f16(half)
+declare i32 @llvm.lrint.i32.f16(half %arg)
+declare <2 x float> @llvm.rint.v2f32.v2f32(<2 x float> %arg)
+declare <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float> %arg)
+declare <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %arg)
+
+define float @intrinsic_frint(float %arg) {
+; GCN-LABEL: intrinsic_frint:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call float @llvm.rint.f32(float %arg)
+  ret float %res
+}
+
+define i32 @intrinsic_lrint_i32_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f32(float %arg)
+  ret i32 %res
+}
+
+define i32 @intrinsic_lrint_i32_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f64(double %arg)
+  ret i32 %res
+}
+
+define i64 @intrinsic_lrint_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lrint.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_lrint_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lrint.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llrint_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llrint.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llrint_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llrint.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define half @intrinsic_frint_half(half %arg) {
+; GCN-LABEL: intrinsic_frint_half:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call half @llvm.rint.f16(half %arg)
+  ret half %res
+}
+
+define i32 @intrinsic_lrint_i32_f16(half %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f16(half %arg)
+  ret i32 %res
+}
+
+define <2 x float> @intrinsic_frint_v2f32_v2f32(<2 x float> %arg) {
+; GCN-LABEL: intrinsic_frint_v2f32_v2f32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_rndne_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x float> @llvm.rint.v2f32.v2f32(<2 x float> %arg)
+  ret <2 x float> %res
+}
+
+define <2 x i32> @intrinsic_lrint_v2i32_v2f32(<2 x float> %arg) {
+; GFX9-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float> %arg)
+  ret <2 x i32> %res
+}
+
+define <2 x i64> @intrinsic_lrint_v2i64_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v2, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v2, s5, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v4
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v2|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s5, |v2|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v6, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v4, v5, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v4, |v2|, v3
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v2, v4, v5, |v2|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v6
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v4, v6
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v4, v1
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v4
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v3, |v1|, v3
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v3, v5, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v6, vcc
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v5, v4
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v3, v3
+; GFX10-SDAG-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v5, v5
+; GFX10-GISEL-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX10-GISEL-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v3, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX11-GISEL-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %arg)
+  ret <2 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
new file mode 100644
index 0000000000000..d45d83026013d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -0,0 +1,1036 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
+
+declare float @llvm.round.f32(float)
+declare i32 @llvm.lround.i32.f32(float)
+declare i32 @llvm.lround.i32.f64(double)
+declare i64 @llvm.lround.i64.f32(float)
+declare i64 @llvm.lround.i64.f64(double)
+declare i64 @llvm.llround.i64.f32(float)
+declare half @llvm.round.f16(half)
+declare i32 @llvm.lround.i32.f16(half %arg)
+
+define float @intrinsic_fround(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_fround:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_fround:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_fround:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_fround:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_fround:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_fround:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call float @llvm.round.f32(float %arg)
+  ret float %res
+}
+
+define i32 @intrinsic_lround_i32_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lround.i32.f32(float %arg)
+  ret i32 %res
+}
+
+define i32 @intrinsic_lround_i32_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lround.i32.f64(double %arg)
+  ret i32 %res
+}
+
+define i64 @intrinsic_lround_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lround.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_lround_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lround.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llround_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llround.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llround_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llround.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define half @intrinsic_fround_half(half %arg) {
+; GFX9-SDAG-LABEL: intrinsic_fround_half:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_fround_half:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_fround_half:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_fround_half:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_fround_half:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_fround_half:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call half @llvm.round.f16(half %arg)
+  ret half %res
+}
+
+define i32 @intrinsic_lround_i32_f16(half %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lround.i32.f16(half %arg)
+  ret i32 %res
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e9a1b38eee157..40cfa4e7e4dfc 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
 define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_ss:
 ; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: s_addc_u32 s1, s5, s7
   %add = add i64 %v, %a
   store i64 %add, ptr undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index e8ac1b2887c36..a4bde5c9d8215 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s7, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
@@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
@@ -79,13 +79,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v2i16:
@@ -131,13 +131,13 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-LABEL: v_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_lshr_v2i16:
@@ -363,13 +363,13 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_imm_v_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_imm_v_v2i16:
@@ -414,13 +414,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_imm_v_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_imm_v_v2i16:
@@ -450,13 +450,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_v_imm_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_imm_v2i16:
@@ -497,13 +497,13 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_v_imm_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_v_imm_v2i16:
@@ -533,14 +533,14 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_lshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v4i16:
@@ -596,14 +596,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-LABEL: v_lshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_lshr_v4i16:
@@ -636,14 +636,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_v_imm_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_imm_v4i16:
@@ -689,14 +689,14 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_v_imm_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_v_imm_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir
new file mode 100644
index 0000000000000..07eaa9f95604d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=early-machinelicm %s -o - | FileCheck %s
+
+# Issue #100115: test that MachineLICM does not assert on the undef use operand
+# of the REG_SEQUENCE instruction.
+---
+name: test_undef_use
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: test_undef_use
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, undef [[DEF]], %subreg.sub1
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.5(0x04000000), %bb.1(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc_lo = COPY undef [[DEF1]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.5, implicit $vcc_lo
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_LOOP undef [[DEF1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %2:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, undef %0, %subreg.sub1
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.5(0x04000000), %bb.1(0x7c000000)
+
+    $vcc_lo = COPY undef %1
+    S_CBRANCH_VCCNZ %bb.5, implicit $vcc
+    S_BRANCH %bb.1
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+    S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+
+    SI_LOOP undef %1, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.5:
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 3032b1028dc2d..a3e7bf2caf772 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -35,34 +35,34 @@ define amdgpu_kernel void @mad_u16(
 ;
 ; GFX9-LABEL: mad_u16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[8:9] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v3, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mad_u16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v0, s[4:5] glc dlc
+; GFX10-NEXT:    global_load_ushort v2, v0, s[8:9] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v3, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v3, v0, s[10:11] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mad_u16:
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index b8b4d4440d580..f2815915b8425 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -225,40 +225,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX9-LABEL: madak_2_use_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
+; GFX9-NEXT:    global_store_dword v0, v4, s[6:7] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_2_use_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
@@ -289,41 +289,41 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
-; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_2_use_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
@@ -408,24 +408,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-LABEL: madak_m_inline_imm_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
@@ -446,25 +446,25 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
@@ -961,23 +961,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX9-LABEL: s_s_madak_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mac_f32_e32 v1, s6, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s6, v0, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
@@ -994,23 +994,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s6, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
new file mode 100644
index 0000000000000..94d1eca05ed0e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -0,0 +1,1836 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10_1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10_3 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+; We aren't pressuring the SGPRs, so this can use the add with carry out pre-gfx9.
+define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v0
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v0
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    s_addc_u32 s0, s32, 0x4040
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_bitset0_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s0, s32, 0x4000
+; GFX12-NEXT:    v_mov_b32_e32 v0, s32
+; GFX12-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX12-NEXT:    s_bitset0_b32 s0, 0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_mov_b32 s59, s0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    s_movk_i32 s59, 0x4040
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    s_addc_u32 s0, s32, 0x4040
+; GFX940-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX940-NEXT:    s_bitset0_b32 s0, 0
+; GFX940-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+  ret void
+}
+
+; %alloca1 should end up materializing with s_mov_b32, and scc is
+; available.
+define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT:    s_addk_i32 s59, 0x4040
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v0
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT:    s_addk_i32 s59, 0x4040
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v0
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    v_mov_b32_e32 v0, s32
+; GFX12-NEXT:    s_mov_b32 s59, s0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    s_addk_i32 s59, 0x4040
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    s_addk_i32 s59, 0x4040
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4040
+; GFX940-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca1)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_mov_b32 s5, s33
+; GFX10_1-NEXT:    s_mov_b32 s33, s32
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80880
+; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v0
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80880
+; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT:    s_mov_b32 s33, s5
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_mov_b32 s5, s33
+; GFX10_3-NEXT:    s_mov_b32 s33, s32
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80880
+; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v0
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80880
+; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT:    s_mov_b32 s33, s5
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s1, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4044
+; GFX11-NEXT:    scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_addk_i32 s32, 0x4080
+; GFX11-NEXT:    s_add_i32 s0, s33, 64
+; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    s_addc_u32 s0, s33, 0x4040
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_bitset0_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4044
+; GFX11-NEXT:    scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX11-NEXT:    s_mov_b32 s33, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s1, s33
+; GFX12-NEXT:    s_mov_b32 s33, s32
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s0, s33, 0x4000
+; GFX12-NEXT:    v_mov_b32_e32 v0, s33
+; GFX12-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX12-NEXT:    s_bitset0_b32 s0, 0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_mov_b32 s59, s0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT:    s_mov_b32 s33, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s7, s33, 0x101100
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX8-NEXT:    s_movk_i32 s59, 0x4040
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s7, s33, 0x101100
+; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_mov_b32 s6, s33
+; GFX900-NEXT:    s_mov_b32 s33, s32
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s7, s33, 0x101100
+; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s7, s33, 0x101100
+; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT:    s_mov_b32 s33, s6
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b32 s2, s33
+; GFX940-NEXT:    s_mov_b32 s33, s32
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s3, s33, 0x4044
+; GFX940-NEXT:    scratch_store_dword off, v1, s3 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_addk_i32 s32, 0x4080
+; GFX940-NEXT:    s_add_i32 s0, s33, 64
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    s_addc_u32 s0, s33, 0x4040
+; GFX940-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX940-NEXT:    s_bitset0_b32 s0, 0
+; GFX940-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v1, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s3, s33, 0x4044
+; GFX940-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX940-NEXT:    s_mov_b32 s33, s2
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT:    scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX11-NEXT:    s_addc_u32 s0, s32, 64
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_bitset0_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX12-NEXT:    s_mov_b32 s59, s32
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT:    s_mov_b32 s59, 64
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s59, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT:    scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    s_addc_u32 s0, s32, 64
+; GFX940-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX940-NEXT:    s_bitset0_b32 s0, 0
+; GFX940-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT:    scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT:    s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT:    s_add_i32 s59, s59, 64
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT:    s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT:    s_add_i32 s59, s59, 64
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT:    scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX12-NEXT:    s_mov_b32 s59, s32
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
+; GFX8-NEXT:    s_add_i32 s59, s59, 64
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
+; GFX900-NEXT:    s_add_i32 s59, s59, 64
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT:    scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT:    scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_mov_b32 s5, s33
+; GFX10_1-NEXT:    s_mov_b32 s33, s32
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
+; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT:    s_mov_b32 s33, s5
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_mov_b32 s5, s33
+; GFX10_3-NEXT:    s_mov_b32 s33, s32
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
+; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT:    s_mov_b32 s33, s5
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s1, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT:    scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_addk_i32 s32, 0x4080
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX11-NEXT:    s_addc_u32 s0, s33, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_bitset0_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s59, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT:    scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX11-NEXT:    s_mov_b32 s33, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s1, s33
+; GFX12-NEXT:    s_mov_b32 s33, s32
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT:    s_mov_b32 s59, s33
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT:    s_mov_b32 s33, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s7, s33, 0x101000
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX8-NEXT:    s_mov_b32 s59, 64
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s59, v1
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s7, s33, 0x101000
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_mov_b32 s6, s33
+; GFX900-NEXT:    s_mov_b32 s33, s32
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s7, s33, 0x101000
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s7, s33, 0x101000
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT:    s_mov_b32 s33, s6
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b32 s2, s33
+; GFX940-NEXT:    s_mov_b32 s33, s32
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s3, s33, 0x4040
+; GFX940-NEXT:    scratch_store_dword off, v0, s3 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_addk_i32 s32, 0x4080
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    s_addc_u32 s0, s33, 64
+; GFX940-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX940-NEXT:    s_bitset0_b32 s0, 0
+; GFX940-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s3, s33, 0x4040
+; GFX940-NEXT:    scratch_load_dword v0, off, s3 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX940-NEXT:    s_mov_b32 s33, s2
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_mov_b32 s4, s33
+; GFX10_1-NEXT:    s_mov_b32 s33, s32
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s5, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    s_lshr_b32 s59, s33, 5
+; GFX10_1-NEXT:    s_add_i32 s59, s59, 64
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s5, -1
+; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT:    s_mov_b32 s33, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_mov_b32 s4, s33
+; GFX10_3-NEXT:    s_mov_b32 s33, s32
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s5, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    s_lshr_b32 s59, s33, 5
+; GFX10_3-NEXT:    s_add_i32 s59, s59, 64
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s5, -1
+; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT:    s_mov_b32 s33, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT:    scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX11-NEXT:    s_addk_i32 s32, 0x4080
+; GFX11-NEXT:    s_add_i32 s1, s33, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s59, s1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT:    scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s0, s33
+; GFX12-NEXT:    s_mov_b32 s33, s32
+; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX12-NEXT:    s_mov_b32 s59, s33
+; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT:    s_mov_b32 s33, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s4, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; GFX8-NEXT:    s_add_i32 s5, s33, 0x101000
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX8-NEXT:    s_lshr_b32 s59, s33, 6
+; GFX8-NEXT:    s_add_i32 s59, s59, 64
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; GFX8-NEXT:    s_add_i32 s5, s33, 0x101000
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT:    s_mov_b32 s33, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_mov_b32 s4, s33
+; GFX900-NEXT:    s_mov_b32 s33, s32
+; GFX900-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; GFX900-NEXT:    s_add_i32 s5, s33, 0x101000
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX900-NEXT:    s_lshr_b32 s59, s33, 6
+; GFX900-NEXT:    s_add_i32 s59, s59, 64
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; GFX900-NEXT:    s_add_i32 s5, s33, 0x101000
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT:    s_mov_b32 s33, s4
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b32 s0, s33
+; GFX940-NEXT:    s_mov_b32 s33, s32
+; GFX940-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT:    s_add_i32 s1, s33, 0x4040
+; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_addk_i32 s32, 0x4080
+; GFX940-NEXT:    s_add_i32 s1, s33, 64
+; GFX940-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX940-NEXT:    s_mov_b32 s59, s1
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v0, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT:    s_add_i32 s1, s33, 0x4040
+; GFX940-NEXT:    scratch_load_dword v0, off, s1 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_addk_i32 s32, 0xbf80
+; GFX940-NEXT:    s_mov_b32 s33, s0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v1
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v1
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT:    scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v1
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT:    scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v1
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, vcc_lo, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3ec, v0
+; GFX8-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 64, v1
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v1
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT:    buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x3ec, v0
+; GFX900-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v1
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT:    scratch_store_dword off, v2, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4040
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x3ec, v0
+; GFX940-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v1
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT:    scratch_load_dword v2, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+  %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 251
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+  ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset(i32 inreg %soffset) #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v3, 5, s32
+; GFX10_1-NEXT:    s_lshl_b32 s4, s6, 2
+; GFX10_1-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v0
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, s4, v3
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v3, 5, s32
+; GFX10_3-NEXT:    s_lshl_b32 s4, s6, 2
+; GFX10_3-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v0
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, s4, v3
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX11-NEXT:    scratch_store_b32 off, v2, s2 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_add_i32 s1, s32, 64
+; GFX11-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_add_nc_u32_e64 v1, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT:    scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT:    scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_add_co_i32 s1, s32, 0x4000
+; GFX12-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX12-NEXT:    v_add_nc_u32_e64 v1, s0, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s32
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s7, s32, 0x201000
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT:    s_lshl_b32 s4, s6, 2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, vcc_lo, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 64, v1
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v1
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s7, s32, 0x201000
+; GFX900-NEXT:    buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    s_lshl_b32 s4, s6, 2
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX900-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v1
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX940-NEXT:    scratch_store_dword off, v2, s1 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX940-NEXT:    s_add_i32 s1, s32, 0x4040
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT:    v_writelane_b32 v2, s59, 0
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v1
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s59, v2, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT:    scratch_load_dword v2, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+  %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 %soffset
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+  ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" "frame-pointer"="all" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
new file mode 100644
index 0000000000000..9cd92dcd5c94d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -0,0 +1,2323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <7 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <5 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+                     <16 x i32>, <6 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+; %alloca1 should end up materializing with s_mov_b32, but scc is
+; unavailable.
+;
+; This is primarily to test gfx7 and gfx8, which do not have vector
+; add with no carry.
+;
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX7-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX7-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
+; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX7-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use alloca0 v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4040
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 64, s32
+; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX7-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX7-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX7-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX7-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX7-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX8-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4040
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 64, s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX8-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX8-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX8-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX8-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX900-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX900-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX900-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX900-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX900-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX940-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX940-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX940-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX940-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX940-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX940-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX940-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX940-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX940-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX940-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX940-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX940-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX940-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX940-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX940-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX940-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX940-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX940-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX940-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX940-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX940-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX940-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX940-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX940-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX940-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX940-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX940-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX940-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX940-NEXT:    v_writelane_b32 v23, s60, 29
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_writelane_b32 v23, s61, 30
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_addc_u32 s60, s32, 0x4040
+; GFX940-NEXT:    s_bitcmp1_b32 s60, 0
+; GFX940-NEXT:    s_bitset0_b32 s60, 0
+; GFX940-NEXT:    s_mov_b32 s59, s60
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s61, v23, 30
+; GFX940-NEXT:    v_readlane_b32 s60, v23, 29
+; GFX940-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX940-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX940-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX940-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX940-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX940-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX940-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX940-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX940-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX940-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX940-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX940-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX940-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX940-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX940-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX940-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX940-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX940-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX940-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX940-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX940-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX940-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX940-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX940-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX940-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX940-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX940-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX940-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX940-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT:    scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v0
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX10_1-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX10_1-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX10_1-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX10_1-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v24, 5, s32
+; GFX10_1-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v24, 0x4040, v24
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v24
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX10_1-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX10_1-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX10_1-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX10_1-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v0
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX10_3-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX10_3-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX10_3-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX10_3-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v24, 5, s32
+; GFX10_3-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v24, 0x4040, v24
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v24
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX10_3-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX10_3-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX10_3-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX10_3-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX11-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX11-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX11-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_addc_u32 s32, s32, 0x4040
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    s_mov_b32 s59, s32
+; GFX11-NEXT:    s_addc_u32 s32, s32, 0xffffbfc0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX11-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX11-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX11-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX11-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT:    scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s32
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX12-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX12-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX12-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX12-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX12-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX12-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX12-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX12-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_add_co_ci_u32 s32, s32, 0x4000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX12-NEXT:    v_writelane_b32 v23, s59, 28
+; GFX12-NEXT:    s_bitset0_b32 s32, 0
+; GFX12-NEXT:    s_mov_b32 s59, s32
+; GFX12-NEXT:    s_add_co_ci_u32 s32, s32, 0xffffc000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX12-NEXT:    s_bitset0_b32 s32, 0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_readlane_b32 s59, v23, 28
+; GFX12-NEXT:    v_readlane_b32 s58, v23, 27
+; GFX12-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX12-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX12-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX12-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX12-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX12-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX12-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX12-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX12-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+  ; Force no SGPRs to be available for the carry-out of the vector add.
+  %asm = call %asm.output asm sideeffect
+    "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+    "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+  %s0 = extractvalue %asm.output %asm, 0
+  %s1 = extractvalue %asm.output %asm, 1
+  %s2 = extractvalue %asm.output %asm, 2
+  %s3 = extractvalue %asm.output %asm, 3
+  %s4 = extractvalue %asm.output %asm, 4
+  %s5 = extractvalue %asm.output %asm, 5
+
+  %v0 = extractvalue %asm.output %asm, 6
+  %v1 = extractvalue %asm.output %asm, 7
+
+  %vcc = extractvalue %asm.output %asm, 8
+
+  ; scc is unavailable since it is live in
+  call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+                           "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+    <16 x i32> %s0,
+    <16 x i32> %s1,
+    <16 x i32> %s2,
+    <8 x i32> %s3,
+    <2 x i32> %s4,
+    i32 %s5,
+    <16 x i32> %v0,
+    <7 x i32> %v1,
+    i64 %vcc,
+    ptr addrspace(5) %alloca1,
+    i32 0) ; use of scc
+
+  ret void
+}
+
+; FIXME: This would have test FI at offset 0, but other objects get
+; assigned there. This shows a non-0, but inline immediate that can
+; fold directly into the address computation.
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset() #1 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX7-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX7-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX7-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX7-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX7-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX7-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX7-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX7-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX7-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX7-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX7-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX7-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX7-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX7-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX7-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX7-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX7-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX7-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX7-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX7-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX7-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX7-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX7-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX7-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX7-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX7-NEXT:    v_mov_b32_e32 v22, 16
+; GFX7-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    v_mad_u32_u24 v22, v22, 64, s32
+; GFX7-NEXT:    v_lshr_b32_e64 v22, s32, 6
+; GFX7-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX7-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX7-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX7-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX7-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX7-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX7-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX7-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX7-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX7-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX7-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX7-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX7-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX7-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX7-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX7-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX7-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX7-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX7-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX7-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX7-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX7-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX7-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX7-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX7-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX7-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX7-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX7-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX7-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX8-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX8-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX8-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX8-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX8-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX8-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX8-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX8-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX8-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX8-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX8-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX8-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX8-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX8-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX8-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX8-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX8-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX8-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX8-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX8-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX8-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX8-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX8-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX8-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX8-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_mov_b32_e32 v22, 16
+; GFX8-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_mad_u32_u24 v22, v22, 64, s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v22, 6, s32
+; GFX8-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX8-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX8-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX8-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX8-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX8-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX8-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX8-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX8-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX8-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX8-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX8-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX8-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX8-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX8-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX8-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX8-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX8-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX8-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX8-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX8-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX8-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX8-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX8-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX8-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX8-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX8-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX900-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX900-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX900-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX900-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX900-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX900-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX900-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX900-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX900-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX900-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX900-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX900-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX900-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX900-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX900-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX900-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX900-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX900-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX900-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX900-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX900-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX900-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX900-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX900-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX900-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX900-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX900-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_lshrrev_b32_e64 v22, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v22, 16, v22
+; GFX900-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX900-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX900-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX900-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX900-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX900-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX900-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX900-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX900-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX900-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX900-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX900-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX900-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX900-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX900-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX900-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX900-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX900-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX900-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX900-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX900-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX900-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX900-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX900-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX900-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX900-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX900-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX900-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
+; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4010
+; GFX940-NEXT:    scratch_store_dword off, v21, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX940-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX940-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX940-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX940-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX940-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX940-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX940-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX940-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX940-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX940-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX940-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX940-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX940-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX940-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX940-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX940-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX940-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX940-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX940-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX940-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX940-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX940-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX940-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX940-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX940-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX940-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX940-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX940-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX940-NEXT:    v_writelane_b32 v21, s60, 29
+; GFX940-NEXT:    v_writelane_b32 v21, s61, 30
+; GFX940-NEXT:    s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_addc_u32 s60, s32, 16
+; GFX940-NEXT:    s_bitcmp1_b32 s60, 0
+; GFX940-NEXT:    s_bitset0_b32 s60, 0
+; GFX940-NEXT:    s_mov_b32 s59, s60
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s61, v21, 30
+; GFX940-NEXT:    v_readlane_b32 s60, v21, 29
+; GFX940-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX940-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX940-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX940-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX940-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX940-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX940-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX940-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX940-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX940-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX940-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX940-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX940-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX940-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX940-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX940-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX940-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX940-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX940-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX940-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX940-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX940-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX940-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX940-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX940-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX940-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX940-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX940-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX940-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4010
+; GFX940-NEXT:    scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80200
+; GFX10_1-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX10_1-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX10_1-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX10_1-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX10_1-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX10_1-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX10_1-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX10_1-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX10_1-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX10_1-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX10_1-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX10_1-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v22, 5, s32
+; GFX10_1-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v22, 16, v22
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX10_1-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX10_1-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX10_1-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX10_1-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX10_1-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX10_1-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX10_1-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX10_1-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX10_1-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX10_1-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX10_1-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80200
+; GFX10_1-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
+; GFX10_3-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX10_3-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX10_3-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX10_3-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX10_3-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX10_3-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX10_3-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX10_3-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX10_3-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX10_3-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX10_3-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX10_3-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v22, 5, s32
+; GFX10_3-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v22, 16, v22
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX10_3-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX10_3-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX10_3-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX10_3-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX10_3-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX10_3-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX10_3-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX10_3-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX10_3-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX10_3-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX10_3-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
+; GFX10_3-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
+; GFX11-NEXT:    scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX11-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX11-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX11-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX11-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX11-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX11-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX11-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX11-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX11-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX11-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX11-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX11-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX11-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX11-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX11-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX11-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX11-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX11-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX11-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX11-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX11-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX11-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX11-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX11-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX11-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
+; GFX11-NEXT:    s_addc_u32 s32, s32, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    s_mov_b32 s59, s32
+; GFX11-NEXT:    s_addc_u32 s32, s32, -16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
+; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX11-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX11-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX11-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX11-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX11-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX11-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX11-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX11-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX11-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX11-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX11-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX11-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX11-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX11-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX11-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX11-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX11-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX11-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX11-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX11-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX11-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX11-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX11-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX11-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX11-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX11-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
+; GFX11-NEXT:    scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v21, s31, 1
+; GFX12-NEXT:    v_writelane_b32 v21, s33, 2
+; GFX12-NEXT:    v_writelane_b32 v21, s34, 3
+; GFX12-NEXT:    v_writelane_b32 v21, s35, 4
+; GFX12-NEXT:    v_writelane_b32 v21, s36, 5
+; GFX12-NEXT:    v_writelane_b32 v21, s37, 6
+; GFX12-NEXT:    v_writelane_b32 v21, s38, 7
+; GFX12-NEXT:    v_writelane_b32 v21, s39, 8
+; GFX12-NEXT:    v_writelane_b32 v21, s40, 9
+; GFX12-NEXT:    v_writelane_b32 v21, s41, 10
+; GFX12-NEXT:    v_writelane_b32 v21, s42, 11
+; GFX12-NEXT:    v_writelane_b32 v21, s43, 12
+; GFX12-NEXT:    v_writelane_b32 v21, s44, 13
+; GFX12-NEXT:    v_writelane_b32 v21, s45, 14
+; GFX12-NEXT:    v_writelane_b32 v21, s46, 15
+; GFX12-NEXT:    v_writelane_b32 v21, s47, 16
+; GFX12-NEXT:    v_writelane_b32 v21, s48, 17
+; GFX12-NEXT:    v_writelane_b32 v21, s49, 18
+; GFX12-NEXT:    v_writelane_b32 v21, s50, 19
+; GFX12-NEXT:    v_writelane_b32 v21, s51, 20
+; GFX12-NEXT:    v_writelane_b32 v21, s52, 21
+; GFX12-NEXT:    v_writelane_b32 v21, s53, 22
+; GFX12-NEXT:    v_writelane_b32 v21, s54, 23
+; GFX12-NEXT:    v_writelane_b32 v21, s55, 24
+; GFX12-NEXT:    v_writelane_b32 v21, s56, 25
+; GFX12-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX12-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_writelane_b32 v21, s59, 28
+; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
+; GFX12-NEXT:    s_mov_b32 s59, s32
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s59, v21, 28
+; GFX12-NEXT:    v_readlane_b32 s58, v21, 27
+; GFX12-NEXT:    v_readlane_b32 s57, v21, 26
+; GFX12-NEXT:    v_readlane_b32 s56, v21, 25
+; GFX12-NEXT:    v_readlane_b32 s55, v21, 24
+; GFX12-NEXT:    v_readlane_b32 s54, v21, 23
+; GFX12-NEXT:    v_readlane_b32 s53, v21, 22
+; GFX12-NEXT:    v_readlane_b32 s52, v21, 21
+; GFX12-NEXT:    v_readlane_b32 s51, v21, 20
+; GFX12-NEXT:    v_readlane_b32 s50, v21, 19
+; GFX12-NEXT:    v_readlane_b32 s49, v21, 18
+; GFX12-NEXT:    v_readlane_b32 s48, v21, 17
+; GFX12-NEXT:    v_readlane_b32 s47, v21, 16
+; GFX12-NEXT:    v_readlane_b32 s46, v21, 15
+; GFX12-NEXT:    v_readlane_b32 s45, v21, 14
+; GFX12-NEXT:    v_readlane_b32 s44, v21, 13
+; GFX12-NEXT:    v_readlane_b32 s43, v21, 12
+; GFX12-NEXT:    v_readlane_b32 s42, v21, 11
+; GFX12-NEXT:    v_readlane_b32 s41, v21, 10
+; GFX12-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX12-NEXT:    v_readlane_b32 s39, v21, 8
+; GFX12-NEXT:    v_readlane_b32 s38, v21, 7
+; GFX12-NEXT:    v_readlane_b32 s37, v21, 6
+; GFX12-NEXT:    v_readlane_b32 s36, v21, 5
+; GFX12-NEXT:    v_readlane_b32 s35, v21, 4
+; GFX12-NEXT:    v_readlane_b32 s34, v21, 3
+; GFX12-NEXT:    v_readlane_b32 s33, v21, 2
+; GFX12-NEXT:    v_readlane_b32 s31, v21, 1
+; GFX12-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 16, addrspace(5)
+
+  ; Force no SGPRs to be available for the carry-out of the vector add.
+  %asm = call %asm.output2 asm sideeffect
+    "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+    "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:20]},={vcc}"()
+
+  %s0 = extractvalue %asm.output2 %asm, 0
+  %s1 = extractvalue %asm.output2 %asm, 1
+  %s2 = extractvalue %asm.output2 %asm, 2
+  %s3 = extractvalue %asm.output2 %asm, 3
+  %s4 = extractvalue %asm.output2 %asm, 4
+  %s5 = extractvalue %asm.output2 %asm, 5
+
+  %v0 = extractvalue %asm.output2 %asm, 6
+  %v1 = extractvalue %asm.output2 %asm, 7
+
+  %vcc = extractvalue %asm.output2 %asm, 8
+
+  ; scc is unavailable since it is live in
+  call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+                           "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:20]},{vcc},{s59},{scc}"(
+    <16 x i32> %s0,
+    <16 x i32> %s1,
+    <16 x i32> %s2,
+    <8 x i32> %s3,
+    <2 x i32> %s4,
+    i32 %s5,
+    <16 x i32> %v0,
+    <5 x i32> %v1,
+    i64 %vcc,
+    ptr addrspace(5) %alloca0,
+    i32 0) ; use of scc
+
+  ret void
+}
+
+; This case isn't using SGPRs yet.
+; FIXME: Should also use one more VGPR, but currently fails to allocate on gfx8.
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset() #0 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX7-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x201200
+; GFX7-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    v_writelane_b32 v23, s28, 28
+; GFX7-NEXT:    v_writelane_b32 v23, s29, 29
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
+; GFX7-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use alloca0 v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    ; implicit-def: $vgpr22
+; GFX7-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX7-NEXT:    v_writelane_b32 v22, vcc_lo, 0
+; GFX7-NEXT:    v_writelane_b32 v22, vcc_hi, 1
+; GFX7-NEXT:    s_or_saveexec_b64 s[28:29], -1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0x8044
+; GFX7-NEXT:    buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    v_lshr_b32_e64 v22, s32, 6
+; GFX7-NEXT:    s_movk_i32 vcc_lo, 0x4040
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, vcc_lo, v22
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, 0x200, v22
+; GFX7-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX7-NEXT:    s_and_b64 vcc, 0, exec
+; GFX7-NEXT:    s_mov_b64 s[28:29], exec
+; GFX7-NEXT:    s_mov_b64 exec, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0x8044
+; GFX7-NEXT:    buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_readlane_b32 vcc_lo, v22, 0
+; GFX7-NEXT:    v_readlane_b32 vcc_hi, v22, 1
+; GFX7-NEXT:    s_mov_b64 s[28:29], exec
+; GFX7-NEXT:    s_mov_b64 exec, -1
+; GFX7-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX7-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX7-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX7-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    ; kill: killed $vgpr22
+; GFX7-NEXT:    v_readlane_b32 s28, v23, 28
+; GFX7-NEXT:    v_readlane_b32 s29, v23, 29
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_add_i32 s6, s32, 0x201200
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201200
+; GFX8-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v23, s58, 28
+; GFX8-NEXT:    v_writelane_b32 v23, s59, 29
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use alloca0 v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    ; implicit-def: $vgpr22
+; GFX8-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v22, vcc_lo, 0
+; GFX8-NEXT:    v_writelane_b32 v22, vcc_hi, 1
+; GFX8-NEXT:    s_or_saveexec_b64 s[58:59], -1
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x8044
+; GFX8-NEXT:    buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[58:59]
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v22, 6, s32
+; GFX8-NEXT:    s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, vcc_lo, v22
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0x200, v22
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX8-NEXT:    s_and_b64 vcc, 0, exec
+; GFX8-NEXT:    s_mov_b64 s[58:59], exec
+; GFX8-NEXT:    s_mov_b64 exec, -1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x8044
+; GFX8-NEXT:    buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[58:59]
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_readlane_b32 vcc_lo, v22, 0
+; GFX8-NEXT:    v_readlane_b32 vcc_hi, v22, 1
+; GFX8-NEXT:    s_mov_b64 s[58:59], exec
+; GFX8-NEXT:    s_mov_b64 exec, -1
+; GFX8-NEXT:    s_mov_b64 exec, s[58:59]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX8-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX8-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX8-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    ; kill: killed $vgpr22
+; GFX8-NEXT:    v_readlane_b32 s58, v23, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v23, 29
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_add_i32 s6, s32, 0x201200
+; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX900-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX900-NEXT:    v_writelane_b32 v22, s33, 2
+; GFX900-NEXT:    v_writelane_b32 v22, s34, 3
+; GFX900-NEXT:    v_writelane_b32 v22, s35, 4
+; GFX900-NEXT:    v_writelane_b32 v22, s36, 5
+; GFX900-NEXT:    v_writelane_b32 v22, s37, 6
+; GFX900-NEXT:    v_writelane_b32 v22, s38, 7
+; GFX900-NEXT:    v_writelane_b32 v22, s39, 8
+; GFX900-NEXT:    v_writelane_b32 v22, s40, 9
+; GFX900-NEXT:    v_writelane_b32 v22, s41, 10
+; GFX900-NEXT:    v_writelane_b32 v22, s42, 11
+; GFX900-NEXT:    v_writelane_b32 v22, s43, 12
+; GFX900-NEXT:    v_writelane_b32 v22, s44, 13
+; GFX900-NEXT:    v_writelane_b32 v22, s45, 14
+; GFX900-NEXT:    v_writelane_b32 v22, s46, 15
+; GFX900-NEXT:    v_writelane_b32 v22, s47, 16
+; GFX900-NEXT:    v_writelane_b32 v22, s48, 17
+; GFX900-NEXT:    v_writelane_b32 v22, s49, 18
+; GFX900-NEXT:    v_writelane_b32 v22, s50, 19
+; GFX900-NEXT:    v_writelane_b32 v22, s51, 20
+; GFX900-NEXT:    v_writelane_b32 v22, s52, 21
+; GFX900-NEXT:    v_writelane_b32 v22, s53, 22
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_writelane_b32 v22, s54, 23
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT:    v_writelane_b32 v22, s55, 24
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use alloca0 v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_writelane_b32 v22, s56, 25
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT:    v_writelane_b32 v22, s57, 26
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_writelane_b32 v22, s59, 27
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s59, v22, 27
+; GFX900-NEXT:    v_readlane_b32 s57, v22, 26
+; GFX900-NEXT:    v_readlane_b32 s56, v22, 25
+; GFX900-NEXT:    v_readlane_b32 s55, v22, 24
+; GFX900-NEXT:    v_readlane_b32 s54, v22, 23
+; GFX900-NEXT:    v_readlane_b32 s53, v22, 22
+; GFX900-NEXT:    v_readlane_b32 s52, v22, 21
+; GFX900-NEXT:    v_readlane_b32 s51, v22, 20
+; GFX900-NEXT:    v_readlane_b32 s50, v22, 19
+; GFX900-NEXT:    v_readlane_b32 s49, v22, 18
+; GFX900-NEXT:    v_readlane_b32 s48, v22, 17
+; GFX900-NEXT:    v_readlane_b32 s47, v22, 16
+; GFX900-NEXT:    v_readlane_b32 s46, v22, 15
+; GFX900-NEXT:    v_readlane_b32 s45, v22, 14
+; GFX900-NEXT:    v_readlane_b32 s44, v22, 13
+; GFX900-NEXT:    v_readlane_b32 s43, v22, 12
+; GFX900-NEXT:    v_readlane_b32 s42, v22, 11
+; GFX900-NEXT:    v_readlane_b32 s41, v22, 10
+; GFX900-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX900-NEXT:    v_readlane_b32 s39, v22, 8
+; GFX900-NEXT:    v_readlane_b32 s38, v22, 7
+; GFX900-NEXT:    v_readlane_b32 s37, v22, 6
+; GFX900-NEXT:    v_readlane_b32 s36, v22, 5
+; GFX900-NEXT:    v_readlane_b32 s35, v22, 4
+; GFX900-NEXT:    v_readlane_b32 s34, v22, 3
+; GFX900-NEXT:    v_readlane_b32 s33, v22, 2
+; GFX900-NEXT:    v_readlane_b32 s31, v22, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT:    buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT:    scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX940-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX940-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX940-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX940-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX940-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX940-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX940-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX940-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX940-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX940-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX940-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX940-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX940-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX940-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX940-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX940-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX940-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX940-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX940-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX940-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX940-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX940-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX940-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX940-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX940-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX940-NEXT:    s_add_i32 s0, s32, 64
+; GFX940-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use alloca0 v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_add_i32 s59, s32, 0x4040
+; GFX940-NEXT:    v_mov_b32_e32 v22, s59
+; GFX940-NEXT:    v_writelane_b32 v23, s60, 28
+; GFX940-NEXT:    v_add_u32_e32 v22, 0x200, v22
+; GFX940-NEXT:    v_writelane_b32 v23, s61, 29
+; GFX940-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX940-NEXT:    s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_readlane_b32 s61, v23, 29
+; GFX940-NEXT:    v_readlane_b32 s60, v23, 28
+; GFX940-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX940-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX940-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX940-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX940-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX940-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX940-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX940-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX940-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX940-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX940-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX940-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX940-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX940-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX940-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX940-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX940-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX940-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX940-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX940-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX940-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX940-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX940-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX940-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX940-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX940-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX940-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX940-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX940-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT:    scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX10_1:       ; %bb.0:
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use alloca0 v1
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX10_1-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX10_1-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX10_1-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX10_1-NEXT:    ;;#ASMSTART
+; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX10_1-NEXT:    ;;#ASMEND
+; GFX10_1-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX10_1-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX10_1-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX10_1-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX10_3:       ; %bb.0:
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use alloca0 v1
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX10_3-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX10_3-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX10_3-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX10_3-NEXT:    ;;#ASMSTART
+; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX10_3-NEXT:    ;;#ASMEND
+; GFX10_3-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX10_3-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX10_3-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX10_3-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT:    scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x200, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use alloca0 v1
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX11-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX11-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX11-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX11-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX11-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX11-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT:    scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use alloca0 v1
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_add_nc_u32_e32 v22, 0x200, v0
+; GFX12-NEXT:    v_writelane_b32 v23, s33, 2
+; GFX12-NEXT:    v_writelane_b32 v23, s34, 3
+; GFX12-NEXT:    v_writelane_b32 v23, s35, 4
+; GFX12-NEXT:    v_writelane_b32 v23, s36, 5
+; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
+; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
+; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
+; GFX12-NEXT:    v_writelane_b32 v23, s40, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s41, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s42, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s43, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s44, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s45, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s46, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s47, 16
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 17
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 18
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 19
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 20
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 21
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 22
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 23
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX12-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX12-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX12-NEXT:    v_readfirstlane_b32 s59, v22
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_readlane_b32 s59, v23, 27
+; GFX12-NEXT:    v_readlane_b32 s57, v23, 26
+; GFX12-NEXT:    v_readlane_b32 s56, v23, 25
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 24
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 23
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 22
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 21
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 20
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 19
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 18
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 17
+; GFX12-NEXT:    v_readlane_b32 s47, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s46, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s45, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s44, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s43, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s42, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s41, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
+; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
+; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
+; GFX12-NEXT:    v_readlane_b32 s36, v23, 5
+; GFX12-NEXT:    v_readlane_b32 s35, v23, 4
+; GFX12-NEXT:    v_readlane_b32 s34, v23, 3
+; GFX12-NEXT:    v_readlane_b32 s33, v23, 2
+; GFX12-NEXT:    v_readlane_b32 s31, v23, 1
+; GFX12-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+  ; Force no SGPRs to be available for the carry-out of the vector add.
+  %asm = call %asm.output3 asm sideeffect
+    "; def $0, $1, $2, $3, $4, $5, $6, $7",
+    "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={v[0:15]},={v[16:21]},={vcc}"()
+
+  %s0 = extractvalue %asm.output3 %asm, 0
+  %s1 = extractvalue %asm.output3 %asm, 1
+  %s2 = extractvalue %asm.output3 %asm, 2
+  %s3 = extractvalue %asm.output3 %asm, 3
+  %s4 = extractvalue %asm.output3 %asm, 4
+
+  %v0 = extractvalue %asm.output3 %asm, 5
+  %v1 = extractvalue %asm.output3 %asm, 6
+
+  %vcc = extractvalue %asm.output3 %asm, 7
+
+  %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 128
+
+  ; scc is unavailable since it is live in
+  call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9",
+                           "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{v[0:15]},{v[16:21]},{vcc},{s59},{scc}"(
+    <16 x i32> %s0,
+    <16 x i32> %s1,
+    <16 x i32> %s2,
+    <8 x i32> %s3,
+    <2 x i32> %s4,
+    <16 x i32> %v0,
+    <6 x i32> %v1,
+    i64 %vcc,
+    ptr addrspace(5) %alloca1.offset,
+    i32 0) ; use of scc
+
+  ret void
+}
+
+; For gfx8/gfx9, this should enforce a budget of 24 VGPRs, and 60 SGPRs (4
+; are reserved at the end for xnack + vcc).
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index dd892ec3d59b3..99090da4da513 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -9,64 +9,64 @@ declare hidden i32 @strnlen(ptr nocapture, i32) #1
 declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1
 
 
-; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr): unsupported call to function memcmp
+; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr, ptr addrspace(1)): unsupported call to function memcmp
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp@rel32@hi+12
-define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
+define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p, ptr addrspace(1) %out) #0 {
 entry:
   %cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
-  store volatile i32 %cmp, ptr addrspace(1) undef
+  store i32 %cmp, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64): unsupported call to function memchr
+; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64, ptr addrspace(1)): unsupported call to function memchr
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr@rel32@hi+12
-define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
+define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len, ptr addrspace(1) %out) #0 {
   %res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
-  store volatile ptr addrspace(1) %res, ptr addrspace(1) undef
+  store ptr addrspace(1) %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr): unsupported call to function strcpy
+; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcpy
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy@rel32@hi+12
-define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
+define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src, ptr addrspace(1) %out) #0 {
   %res = call ptr @strcpy(ptr %dst, ptr %src)
-  store volatile ptr %res, ptr addrspace(1) undef
+  store ptr %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr): unsupported call to function strcmp
+; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcmp
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp@rel32@hi+12
-define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
+define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1, ptr addrspace(1) %out) #0 {
   %res = call i32 @strcmp(ptr %src0, ptr %src1)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr): unsupported call to function strlen
+; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr, ptr addrspace(1)): unsupported call to function strlen
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen@rel32@hi+12
-define amdgpu_kernel void @test_strlen(ptr %src) #0 {
+define amdgpu_kernel void @test_strlen(ptr %src, ptr addrspace(1) %out) #0 {
   %res = call i32 @strlen(ptr %src)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32): unsupported call to function strnlen
+; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32, ptr addrspace(1)): unsupported call to function strnlen
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen@rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen@rel32@hi+12
-define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
+define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size, ptr addrspace(1) %out) #0 {
   %res = call i32 @strnlen(ptr %src, i32 %size)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 3a065d518f0a9..1de9206801e2a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,21 +5,21 @@
 define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index b08da2e1848ff..28d30a70e50c1 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX11
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12
 
@@ -6,11 +7,23 @@
 name: merge_s_load_x1_x1
 body: |
   bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x1_x1
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    ; GFX10-LABEL: name: merge_s_load_x1_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x1_x1
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x1_x1
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -46,6 +59,13 @@ body: |
 name: merge_s_load_x1_x1_x1
 body: |
   bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
@@ -70,6 +90,16 @@ body: |
 name: merge_s_load_x1_x1_x1_x1
 body: |
   bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
+    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
@@ -100,6 +130,24 @@ body: |
 name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
 body: |
   bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
+    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
+    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0
+    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1
+    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0
+    ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1
+    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
@@ -150,6 +198,11 @@ body: |
 name: merge_s_load_x2_x1
 body: |
   bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x2_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+    ;
     ; GFX11-LABEL: name: merge_s_load_x2_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
@@ -169,11 +222,23 @@ body: |
 name: merge_s_load_x2_x2
 body: |
   bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x2_x2
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; GFX10-LABEL: name: merge_s_load_x2_x2
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x2_x2
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2_x2
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -183,15 +248,35 @@ body: |
 name: merge_s_load_x2_x2_x2_x2
 body: |
   bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ; GFX10-LABEL: name: merge_s_load_x2_x2_x2_x2
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x2_x2_x2_x2
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2_x2_x2_x2
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -217,12 +302,181 @@ body: |
 name: merge_s_load_x4_x4
 body: |
   bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x4_x4
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX10-LABEL: name: merge_s_load_x4_x4
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x4_x4
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x4_x4
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
     %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
 ...
+
+# The constrained multi-dword scalar load merge tests.
+---
+name: merge_s_load_x1_x2ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x1_x2ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+    ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64))
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x1_x3ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x1_x3ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+    ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96))
+...
+
+---
+name: merge_s_load_x2ec_x1
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x2ec_x1
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x2ec_x1
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2ec_x1
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: merge_s_load_x2ec_x2ec
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x3ec_x1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x3ec_x1
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: merge_s_load_x4ec_x4ec
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: merge_s_load_x4ec_x4ec
+    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+    ;
+    ; GFX11-LABEL: name: merge_s_load_x4ec_x4ec
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x4ec_x4ec
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
+    early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index a77892c8f5fc7..686797f290b97 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2513,38 +2513,38 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s9, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[12:13], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[14:15], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lt_u32 s8, s9
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s4, s8, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    s_cmp_lt_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_store_dword v0, v2, s[8:9]
+; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x0
-; GFX10-NEXT:    s_load_dword s9, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX10-NEXT:    s_load_dword s1, s[14:15], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lt_u32 s8, s9
-; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT:    s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT:    s_cselect_b32 s4, s8, s9
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
-; GFX10-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT:    s_cmp_lt_u32 s0, s1
+; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v1, v2, s[8:9]
+; GFX10-NEXT:    global_store_byte v1, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
@@ -2665,33 +2665,33 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[4:5]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT:    global_store_short v0, v1, s[8:9]
+; GFX10-NEXT:    global_store_byte v0, v2, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll
index d9b48f79739b6..0167fcbc4ab7c 100644
--- a/llvm/test/CodeGen/AMDGPU/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/mmra.ll
@@ -17,10 +17,9 @@ define void @fence_loads(ptr %ptr) {
   ; CHECK-NEXT:   ATOMIC_FENCE 5, 1, mmra !0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
   ; CHECK-NEXT:   [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4)
-  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1, mmra !2
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, mmra !2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], mmra !2
-  ; CHECK-NEXT:   FLAT_STORE_BYTE [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
+  ; CHECK-NEXT:   FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
   ; CHECK-NEXT:   SI_RETURN
   fence release,                                        !mmra !0
   %ld = load atomic i8, ptr %ptr acquire, align 4,      !mmra !2
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index ece7e28c763fb..b696f097d05b7 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
@@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index b5ee6689f8dc3..06ebd86b79510 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 162c47f879465..2591ff4bd2538 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index ece2e1b653d34..c702de6285d9b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,10 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
 
 # Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 #
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 0889f8ef6316e..9b44b58c4a01e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -50,40 +50,40 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_mul_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: test_mul_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_mul_v2i32:
@@ -201,47 +201,47 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_mul_v4i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_v4i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_v4i32:
@@ -857,41 +857,41 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: v_mul64_sext_c:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x50
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_movk_i32 s0, 0x50
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_sext_c:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_sext_c:
@@ -1004,41 +1004,41 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: v_mul64_zext_c:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x50
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_movk_i32 s0, 0x50
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_zext_c:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_u32 v1, 0x50, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_zext_c:
@@ -1149,40 +1149,40 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_mul64_sext_inline_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_sext_inline_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_sext_inline_imm:
@@ -1396,38 +1396,38 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_mul_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i32:
@@ -1662,43 +1662,43 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ;
 ; GFX9-LABEL: v_mul_i1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX10-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i1:
@@ -2211,67 +2211,67 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_mul_i32 s6, s0, s1
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_mul_i32 s8, s0, s1
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_branch .LBB15_3
 ; GFX9-NEXT:  .LBB15_2:
-; GFX9-NEXT:    s_mov_b64 s[4:5], -1
-; GFX9-NEXT:    ; implicit-def: $sgpr6
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-NEXT:  .LBB15_3: ; %Flow
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB15_5
 ; GFX9-NEXT:  ; %bb.4: ; %if
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s2
-; GFX9-NEXT:    s_mov_b32 s5, s3
-; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_branch .LBB15_6
 ; GFX9-NEXT:  .LBB15_5:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:  .LBB15_6: ; %endif
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mul32_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_mul_i32 s5, s0, s1
+; GFX10-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10-NEXT:    s_branch .LBB15_3
 ; GFX10-NEXT:  .LBB15_2:
-; GFX10-NEXT:    s_mov_b32 s4, -1
-; GFX10-NEXT:    ; implicit-def: $sgpr5
+; GFX10-NEXT:    s_mov_b32 s8, -1
+; GFX10-NEXT:    ; implicit-def: $sgpr0
 ; GFX10-NEXT:  .LBB15_3: ; %Flow
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB15_5
 ; GFX10-NEXT:  ; %bb.4: ; %if
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, s2
-; GFX10-NEXT:    s_mov_b32 s5, s3
-; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    s_mov_b32 s0, s6
+; GFX10-NEXT:    s_mov_b32 s1, s7
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_branch .LBB15_6
 ; GFX10-NEXT:  .LBB15_5:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:  .LBB15_6: ; %endif
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mul32_in_branch:
@@ -2472,72 +2472,72 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: mul64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_mul_i32 s7, s4, s7
-; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s6
-; GFX9-NEXT:    s_add_i32 s7, s10, s7
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_add_i32 s5, s7, s5
-; GFX9-NEXT:    s_mul_i32 s4, s4, s6
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT:    s_mul_i32 s2, s8, s11
+; GFX9-NEXT:    s_mul_hi_u32 s3, s8, s10
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    s_mul_i32 s3, s9, s10
+; GFX9-NEXT:    s_add_i32 s3, s2, s3
+; GFX9-NEXT:    s_mul_i32 s2, s8, s10
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB16_4
 ; GFX9-NEXT:  .LBB16_2: ; %if
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s4, s2
-; GFX9-NEXT:    s_mov_b32 s5, s3
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_branch .LBB16_5
 ; GFX9-NEXT:  .LBB16_3:
-; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; GFX9-NEXT:    s_branch .LBB16_2
 ; GFX9-NEXT:  .LBB16_4:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB16_5: ; %endif
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mul64_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_mul_i32 s7, s4, s7
-; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT:    s_mul_i32 s5, s5, s6
-; GFX10-NEXT:    s_add_i32 s7, s8, s7
-; GFX10-NEXT:    s_mul_i32 s4, s4, s6
-; GFX10-NEXT:    s_add_i32 s5, s7, s5
+; GFX10-NEXT:    s_mul_i32 s0, s8, s11
+; GFX10-NEXT:    s_mul_hi_u32 s1, s8, s10
+; GFX10-NEXT:    s_mul_i32 s2, s9, s10
+; GFX10-NEXT:    s_add_i32 s0, s1, s0
+; GFX10-NEXT:    s_add_i32 s1, s0, s2
+; GFX10-NEXT:    s_mul_i32 s0, s8, s10
 ; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
 ; GFX10-NEXT:  .LBB16_2: ; %if
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s4, s2
-; GFX10-NEXT:    s_mov_b32 s5, s3
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s0, s6
+; GFX10-NEXT:    s_mov_b32 s1, s7
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_branch .LBB16_5
 ; GFX10-NEXT:  .LBB16_3:
-; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; GFX10-NEXT:    s_branch .LBB16_2
 ; GFX10-NEXT:  .LBB16_4:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:  .LBB16_5: ; %endif
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mul64_in_branch:
@@ -3102,12 +3102,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ;
 ; GFX9-LABEL: v_mul_i128:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v14, v5, v2
@@ -3128,18 +3128,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX9-NEXT:    v_add3_u32 v3, v16, v3, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i128:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 842dc36e00154..be77a10380c49 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_smul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smul24_i32:
@@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 0c0bb830ba847..a1099554559af 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_umul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %a, 8
@@ -158,18 +158,18 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: test_umul24_i16_vgpr_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT:    global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v3
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -279,17 +279,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_umul24_i16_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT:    global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v3
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.24 = and i32 %a, 16777215
@@ -661,14 +661,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.16 = and i32 %a, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 92b2f516277bc..10d08032bf59a 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
 
 ; Add an extra verifier runs. There were some cases where invalid IR
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 5c09d2bd61a39..31e8a49b16ca1 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -858,14 +858,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
@@ -892,39 +892,17 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -936,13 +914,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -962,14 +935,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
@@ -996,39 +969,17 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1040,13 +991,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1066,14 +1012,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
@@ -1100,39 +1046,17 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1144,13 +1068,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1161,32 +1080,32 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
 define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1201,46 +1120,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1248,13 +1127,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1274,14 +1148,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
@@ -1308,39 +1182,17 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1352,13 +1204,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1369,32 +1216,32 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
 define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1409,46 +1256,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1456,13 +1263,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1483,23 +1285,23 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1517,43 +1319,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1561,13 +1333,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1588,23 +1355,23 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1622,43 +1389,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1666,13 +1403,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1693,23 +1425,23 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1727,43 +1459,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1771,13 +1473,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1798,23 +1495,23 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1832,43 +1529,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1876,13 +1543,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1903,23 +1565,23 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
@@ -1937,43 +1599,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1981,13 +1613,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2008,23 +1635,23 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
@@ -2042,43 +1669,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2086,13 +1683,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4187,3 +3779,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
   store i8 %load, ptr undef
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index b5b8213bcd57e..548c196df1da5 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -924,12 +924,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -937,12 +933,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -950,13 +942,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -968,13 +955,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1028,39 +1010,26 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1072,13 +1041,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1092,15 +1056,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1132,12 +1087,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1145,12 +1096,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1158,13 +1105,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1176,13 +1118,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1233,45 +1170,32 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
 define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1280,44 +1204,12 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1340,12 +1232,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1353,12 +1241,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1366,13 +1250,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1384,13 +1263,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1441,45 +1315,32 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
 define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1488,44 +1349,12 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1548,12 +1377,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1561,12 +1387,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1574,13 +1396,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1592,13 +1409,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1653,39 +1465,27 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1697,13 +1497,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1718,15 +1513,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,12 +1544,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1771,12 +1554,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1784,13 +1563,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1802,13 +1576,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1863,42 +1632,30 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1907,13 +1664,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1928,24 +1680,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1968,12 +1702,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1981,12 +1712,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1994,13 +1721,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2012,13 +1734,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2073,42 +1790,30 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2117,13 +1822,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2138,24 +1838,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index d73b1bd29c981..7e5def00ee7cb 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)
 define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_0x8000000000000000:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s5, 0x80000000
 ; GCN-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; GCN-NEXT:    s_cselect_b32 s0, 22, 33
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GCN-NEXT:    s_endpgm
   %and = and i64 %arg, 9223372036854775808
   %cmp = icmp eq i64 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index c72a7ba3eee83..d6fc52bdb9322 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -4,17 +4,17 @@
 define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -35,17 +35,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -67,17 +67,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -99,17 +99,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -132,17 +132,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -163,17 +163,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -194,16 +194,16 @@ bb:
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -222,11 +222,11 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1
-; GCN-NEXT:    global_store_dword v4, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v4, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -261,10 +261,10 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out,
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, v1
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v2, v0
-; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    ; kill: killed $vgpr0_vgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off glc
@@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index f53ca53518a17..40542bc6f05a7 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
 ; GCN-LABEL: dbg_clause:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
+; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GCN-NEXT:    ;DEBUG_VALUE: foo:a <- $vgpr1
-; GCN-NEXT:    global_load_dword v2, v0, s[2:3] offset:32
+; GCN-NEXT:    global_load_dword v2, v0, s[6:7] offset:32
 ; GCN-NEXT:    ;DEBUG_VALUE: foo:b <- $vgpr2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index dabb9d43bf3d6..0c8dbd1db8ec6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0
 ; GCN-LABEL: load_v3i32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
   %vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
   store <3 x i32> %vec, ptr addrspace(1) undef, align 8
   ret void
@@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0
 ; GCN-LABEL: load_v3f32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
   %vec = load <3 x float>, ptr addrspace(1) %arg, align 8
   store <3 x float> %vec, ptr addrspace(1) undef, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 9a8d5acfbe3e9..f916c9375bc6d 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -5,28 +5,28 @@
 define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) {
 ; SDAG-LABEL: buffers_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
 ; SDAG-LABEL: buffers_from_flat_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; SDAG-NEXT:    s_mov_b32 s7, 0
-; SDAG-NEXT:    s_mov_b32 s6, 16
+; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-NEXT:    s_mov_b32 s3, 0
+; SDAG-NEXT:    s_mov_b32 s2, 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s0
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s2
+; SDAG-NEXT:    s_and_b32 s1, s5, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s4
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GISEL-NEXT:    s_mov_b32 s7, 0
-; GISEL-NEXT:    s_mov_b32 s6, 16
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_mov_b32 s3, 0
+; GISEL-NEXT:    s_mov_b32 s2, 16
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s0
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s2
+; GISEL-NEXT:    s_and_b32 s1, s5, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s4
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    s_and_b32 s1, s7, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s6
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
@@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
 define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) {
 ; SDAG-LABEL: buffers_might_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:12
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_might_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
-; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:12
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac
 define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) {
 ; SDAG-LABEL: independent_offsets:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 1.0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; SDAG-NEXT:    buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; SDAG-NEXT:    buffer_store_dword v2, v0, s[4:7], 0 offen
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
-; SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; SDAG-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: independent_offsets:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GISEL-NEXT:    buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
 ; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GISEL-NEXT:    buffer_store_dword v2, v0, s[4:7], 0 offen
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GISEL-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
 ; GISEL-NEXT:    s_endpgm
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %idx = shl i32 %lane, 2
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6b036f675929e..fe093d4ac8515 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -520,28 +520,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -552,7 +545,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1086,10 +1078,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1918,28 +1910,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -1950,7 +1935,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -2484,10 +2468,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
new file mode 100644
index 0000000000000..ebc209bd4d451
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @test_export_zeroes_f32() #0 {
+; GCN-LABEL: test_export_zeroes_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    exp mrt0 off, off, off, off
+; GCN-NEXT:    exp mrt0 off, off, off, off done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_ps void @test_export_en_src0_f32() #0 {
+; GCN-LABEL: test_export_en_src0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
+; GCN-NEXT:    exp mrt0 v3, off, off, off done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_export_gs() #0 {
+; GCN-LABEL: test_export_gs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
+; GCN-NEXT:    exp mrt0 off, v2, off, off done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_hs void @test_export_hs() #0 {
+; GCN-LABEL: test_export_hs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
+; GCN-NEXT:    exp mrt0 off, v2, off, off done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gfx void @test_export_gfx(float %v) #0 {
+; GCN-LABEL: test_export_gfx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
+; GCN-NEXT:    exp mrt0 off, v3, off, off done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %v, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_cs void @test_export_cs() #0 {
+; GCN-LABEL: test_export_cs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
+; GCN-NEXT:    exp mrt0 off, v2, off, off done
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @test_export_kernel() #0 {
+; GCN-LABEL: test_export_kernel:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
+; GCN-NEXT:    exp mrt0 off, v2, off, off done
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gfx void @test_no_export_gfx(float %v) #0 {
+; GCN-LABEL: test_no_export_gfx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  ret void
+}
+
+define amdgpu_ps void @test_no_export_ps(float %v) #0 {
+; GCN-LABEL: test_no_export_ps:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_endpgm
+  ret void
+}
+
+define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+; GCN-LABEL: test_if_export_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_mov_b32 s0, exec_lo
+; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GCN-NEXT:    s_cbranch_execz .LBB9_2
+; GCN-NEXT:  ; %bb.1: ; %exp
+; GCN-NEXT:    exp mrt0 v1, v2, v3, v4
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:  .LBB9_2: ; %end
+; GCN-NEXT:    s_endpgm
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+; GCN-LABEL: test_if_export_vm_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_mov_b32 s0, exec_lo
+; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GCN-NEXT:    s_cbranch_execz .LBB10_2
+; GCN-NEXT:  ; %bb.1: ; %exp
+; GCN-NEXT:    exp mrt0 v1, v2, v3, v4
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:  .LBB10_2: ; %end
+; GCN-NEXT:    s_endpgm
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
+define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+; GCN-LABEL: test_if_export_done_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_mov_b32 s0, exec_lo
+; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GCN-NEXT:    s_cbranch_execz .LBB11_2
+; GCN-NEXT:  ; %bb.1: ; %exp
+; GCN-NEXT:    exp mrt0 v1, v2, v3, v4 done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:  .LBB11_2: ; %end
+; GCN-NEXT:    s_endpgm
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+; GCN-LABEL: test_if_export_vm_done_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_mov_b32 s0, exec_lo
+; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GCN-NEXT:    s_cbranch_execz .LBB12_2
+; GCN-NEXT:  ; %bb.1: ; %exp
+; GCN-NEXT:    exp mrt0 v1, v2, v3, v4 done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:  .LBB12_2: ; %end
+; GCN-NEXT:    s_endpgm
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
+define amdgpu_ps void @test_export_pos_before_param_across_load(i32 %idx) #0 {
+; GCN-LABEL: test_export_pos_before_param_across_load:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0.5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    exp pos0 v1, v1, v1, v0 done
+; GCN-NEXT:    exp invalid_target_32 v2, v2, v2, v2
+; GCN-NEXT:    exp invalid_target_33 v2, v2, v2, v3
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false)
+  %load = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0)
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 {
+; GCN-LABEL: test_export_across_store_load:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    v_mov_b32_e32 v2, 16
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    scratch_store_b32 v0, v1, off
+; GCN-NEXT:    scratch_load_b32 v0, off, off
+; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
+; GCN-NEXT:    exp pos0 v2, v2, v2, v1 done
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    exp invalid_target_32 v0, v2, v1, v2
+; GCN-NEXT:    exp invalid_target_33 v0, v2, v1, v2
+; GCN-NEXT:    s_setprio 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_endpgm
+  %data0 = alloca <4 x float>, align 8, addrspace(5)
+  %data1 = alloca <4 x float>, align 8, addrspace(5)
+  %cmp = icmp eq i32 %idx, 1
+  %data = select i1 %cmp, ptr addrspace(5) %data0, ptr addrspace(5) %data1
+  store float %v, ptr addrspace(5) %data, align 8
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false)
+  %load0 = load float, ptr addrspace(5) %data0, align 8
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  ret void
+}
+
+define amdgpu_ps void @test_export_in_callee(float %v) #0 {
+; GCN-LABEL: test_export_in_callee:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_getpc_b64 s[0:1]
+; GCN-NEXT:    s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GCN-NEXT:    s_endpgm
+  %x = fadd float %v, 1.0
+  call void @test_export_gfx(float %x)
+  ret void
+}
+
+define amdgpu_ps void @test_export_in_callee_prio(float %v) #0 {
+; GCN-LABEL: test_export_in_callee_prio:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    s_setprio 2
+; GCN-NEXT:    s_getpc_b64 s[0:1]
+; GCN-NEXT:    s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GCN-NEXT:    s_endpgm
+  %x = fadd float %v, 1.0
+  call void @llvm.amdgcn.s.setprio(i16 0)
+  call void @test_export_gfx(float %x)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
+declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2
+declare void @llvm.amdgcn.s.setprio(i16)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind inaccessiblememonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.mir b/llvm/test/CodeGen/AMDGPU/required-export-priority.mir
new file mode 100644
index 0000000000000..392519171cd71
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.mir
@@ -0,0 +1,294 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GFX1150 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GFX1150 %s
+
+--- |
+  define amdgpu_ps void @end_of_shader() {
+    ret void
+  }
+  define amdgpu_ps void @end_of_shader_return_to_epilogue() {
+    ret void
+  }
+  define amdgpu_ps void @end_of_block() {
+    ret void
+  }
+  define amdgpu_ps void @start_of_block() {
+    ret void
+  }
+  define amdgpu_ps void @block_of_exports() {
+    ret void
+  }
+  define amdgpu_ps void @sparse_exports() {
+    ret void
+  }
+  define amdgpu_ps void @existing_setprio_1() {
+    ret void
+  }
+  define amdgpu_ps void @existing_setprio_2() {
+    ret void
+  }
+...
+
+---
+name: end_of_shader
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX1150-LABEL: name: end_of_shader
+    ; GFX1150: liveins: $vgpr0
+    ; GFX1150-NEXT: {{  $}}
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_ENDPGM 0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: end_of_shader_return_to_epilogue
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX1150-LABEL: name: end_of_shader_return_to_epilogue
+    ; GFX1150: liveins: $vgpr0
+    ; GFX1150-NEXT: {{  $}}
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    SI_RETURN_TO_EPILOG $vgpr0
+...
+
+---
+name: end_of_block
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  ; GFX1150-LABEL: name: end_of_block
+  ; GFX1150: bb.0:
+  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; GFX1150-NEXT:   S_SETPRIO 0
+  ; GFX1150-NEXT:   S_WAITCNT_EXPCNT $sgpr_null, 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.1:
+  ; GFX1150-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+
+  bb.1:
+    S_ENDPGM 0
+...
+
+---
+name: start_of_block
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  ; GFX1150-LABEL: name: start_of_block
+  ; GFX1150: bb.0:
+  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.1:
+  ; GFX1150-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; GFX1150-NEXT:   S_SETPRIO 0
+  ; GFX1150-NEXT:   S_WAITCNT_EXPCNT $sgpr_null, 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.2:
+  ; GFX1150-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0
+
+  bb.1:
+    liveins: $vgpr0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: block_of_exports
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX1150-LABEL: name: block_of_exports
+    ; GFX1150: liveins: $vgpr0
+    ; GFX1150-NEXT: {{  $}}
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_ENDPGM 0
+    EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: sparse_exports
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX1150-LABEL: name: sparse_exports
+    ; GFX1150: liveins: $vgpr0
+    ; GFX1150-NEXT: {{  $}}
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
+    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
+    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_ENDPGM 0
+    EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
+    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: existing_setprio_1
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  ; GFX1150-LABEL: name: existing_setprio_1
+  ; GFX1150: bb.0:
+  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT:   $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.1:
+  ; GFX1150-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   S_SETPRIO 3
+  ; GFX1150-NEXT:   $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.2:
+  ; GFX1150-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   S_SETPRIO 3
+  ; GFX1150-NEXT:   $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec
+  ; GFX1150-NEXT:   S_SETPRIO 2
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT: bb.3:
+  ; GFX1150-NEXT:   liveins: $vgpr0
+  ; GFX1150-NEXT: {{  $}}
+  ; GFX1150-NEXT:   EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; GFX1150-NEXT:   S_SETPRIO 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_NOP 0
+  ; GFX1150-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
+
+  bb.1:
+    liveins: $vgpr0
+    S_SETPRIO 3
+    $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
+    S_SETPRIO 0
+
+  bb.2:
+    liveins: $vgpr0
+    S_SETPRIO 1
+    $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec
+    S_SETPRIO 0
+
+  bb.3:
+    liveins: $vgpr0
+    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: existing_setprio_2
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0' }
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX1150-LABEL: name: existing_setprio_2
+    ; GFX1150: liveins: $vgpr0
+    ; GFX1150-NEXT: {{  $}}
+    ; GFX1150-NEXT: S_SETPRIO 3
+    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX1150-NEXT: S_SETPRIO 0
+    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_NOP 0
+    ; GFX1150-NEXT: S_SETPRIO 2
+    ; GFX1150-NEXT: S_SETPRIO 3
+    ; GFX1150-NEXT: S_ENDPGM 0
+    S_SETPRIO 3
+    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_SETPRIO 3
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index a640ac985ade4..002de8bb4eb51 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -113,16 +113,7 @@ define amdgpu_kernel void @test_kernel() !dbg !3 {
   ret void
 }
 
-; STDERR: remark: foo.cl:42:0: Function Name: test_func
-; STDERR-NEXT: remark: foo.cl:42:0:     SGPRs: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     VGPRs: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     AGPRs: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:42:0:     Occupancy [waves/SIMD]: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     SGPRs Spill: 0
-; STDERR-NEXT: remark: foo.cl:42:0:     VGPRs Spill: 0
-; STDERR-NOT: LDS Size
+; STDERR-NOT: test_func
 define void @test_func() !dbg !6 {
   call void asm sideeffect "; clobber v17", "~{v17}"()
   call void asm sideeffect "; clobber s11", "~{s11}"()
@@ -144,15 +135,7 @@ define amdgpu_kernel void @empty_kernel() !dbg !7 {
   ret void
 }
 
-; STDERR: remark: foo.cl:52:0: Function Name: empty_func
-; STDERR-NEXT: remark: foo.cl:52:0:     SGPRs: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     VGPRs: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     AGPRs: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:52:0:     Occupancy [waves/SIMD]: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     SGPRs Spill: 0
-; STDERR-NEXT: remark: foo.cl:52:0:     VGPRs Spill: 0
+; STDERR-NOT: empty_func
 define void @empty_func() !dbg !8 {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index fdce4431fbbf2..f692584ef92be 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotl_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s3, 32, s3
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_sub_i32 s0, 32, s7
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 0e1dd69d930ae..a368aa1055b28 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotr_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 528dcfc6bc038..0f95c0255d3ab 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -1008,10 +1008,10 @@ define double @v_roundeven_f64(double %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    v_or_b32_e32 v3, 0x43300000, v3
 ; GFX6-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT:    v_mov_b32_e32 v6, -1
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x432fffff
 ; GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
-; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[6:7]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1087,17 +1087,17 @@ define double @v_roundeven_f64_fneg(double %x) {
 ; GFX6-LABEL: v_roundeven_f64_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x80000000, v6
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x80000000, v8
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    v_or_b32_e32 v3, 0x43300000, v3
 ; GFX6-NEXT:    v_add_f64 v[4:5], -v[0:1], v[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, -1
-; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT:    v_mov_b32_e32 v6, -1
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x432fffff
 ; GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
-; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[6:7]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f64_fneg:
diff --git a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
index 9ad1660264519..c1121f902982a 100644
--- a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
@@ -1,5 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
 
 ---
 
@@ -124,4 +124,4 @@ body: |
   bb.0:
     liveins: $sgpr0, $sgpr1
     renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
-    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
\ No newline at end of file
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 684279a3776fc..57c936d4689eb 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -288,35 +288,35 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_saddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_saddo_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX10-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_i32 v3, v1, v2 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v2, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_saddo_i32:
@@ -401,38 +401,38 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_saddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s8, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_addc_u32 s9, s5, s7
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s8, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
+; GFX9-NEXT:    s_addc_u32 s1, s9, s11
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_saddo_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s8, s4, s6
-; GFX10-NEXT:    s_addc_u32 s9, s5, s7
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_xor_b32 s4, s6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX10-NEXT:    s_add_u32 s0, s8, s10
+; GFX10-NEXT:    s_addc_u32 s1, s9, s11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_xor_b32 s0, s2, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_saddo_i64:
@@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_saddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
@@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_saddo_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[4:5]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v1, v3
 ; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
@@ -691,8 +691,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[2:3]
+; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_saddo_v2i32:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
index 6d243e4bddac5..47e4406762872 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
@@ -1,10 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}scalar_to_vector_i16:
-; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 42
-; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
-; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 42
+; GCN:   v_mov_b32_e32 [[V:v[0-9]+]], 42
 ; GCN: buffer_store_short [[V]],
 define void @scalar_to_vector_i16() {
   %tmp = load <2 x i16>, ptr addrspace(5) undef
@@ -14,9 +12,7 @@ define void @scalar_to_vector_i16() {
 }
 
 ; GCN-LABEL: {{^}}scalar_to_vector_f16:
-; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 0x3c00
-; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
-; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
+; GCN:   v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
 ; GCN: buffer_store_short [[V]],
 define void @scalar_to_vector_f16() {
   %tmp = load <2 x half>, ptr addrspace(5) undef
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index a3489a8bb1403..695d5225421de 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -13,7 +13,7 @@
 ; GCN-DEFAULT:      t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
 ; GCN-DEFAULT:    t6: f32 = fadd # D:1 t5, t4
 ; GCN-DEFAULT:  t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
-; GCN-DEFAULT:  t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1
+; GCN-DEFAULT:  t9: ch = RETURN_TO_EPILOG t8, Register:f32 $vgpr0, t8:1
 
 ; GCN-VERBOSE:  t0: ch,glue = EntryToken # D:0
 ; GCN-VERBOSE:  t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
@@ -21,7 +21,7 @@
 ; GCN-VERBOSE:      t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
 ; GCN-VERBOSE:    t6: f32 = fadd [ORD=3] # D:1 t5, t4
 ; GCN-VERBOSE:  t8: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
-; GCN-VERBOSE:  t9: ch = RETURN_TO_EPILOG [ORD=4] # D:1 t8, Register:f32 $vgpr0 # D:0, t8:1
+; GCN-VERBOSE:  t9: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t8, Register:f32 $vgpr0 # D:0, t8:1
 
 define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector)  {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index b57a51f1382ae..c10600a78e3e7 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -241,23 +241,23 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; GFX9-LABEL: sdiv_i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_i32_4:
@@ -339,25 +339,25 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: slow_sdiv_i32_3435:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s2, 0x98a1930b
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, 0x98a1930b
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
+; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: slow_sdiv_i32_3435:
@@ -732,17 +732,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdiv_v2i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
@@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v2i32_4:
@@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdiv_v4i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
@@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v4i32_4:
@@ -1515,18 +1515,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_sdiv_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
 ; GFX9-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_sdiv_i8:
@@ -2253,21 +2253,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
 ;
 ; GFX9-LABEL: scalarize_mulhs_4xi32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT:    s_mov_b32 s0, 0x1389c755
-; GFX9-NEXT:    s_mov_b32 s4, s2
-; GFX9-NEXT:    s_mov_b32 s5, s3
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, 0x1389c755
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s0
-; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s0
-; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s0
-; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s0
+; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s4
+; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s4
+; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
@@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: scalarize_mulhs_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 669ed915a002a..64b3317edc519 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: add_shr_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: add_shr_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
@@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: sub_shr_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sub_shr_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
 ;
 ; GFX9-LABEL: sitofp_v2i16_to_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_i16_e32 v2, v1
 ; GFX9-NEXT:    v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sitofp_v2i16_to_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f16_i16_e32 v2, v1
 ; GFX10-NEXT:    v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: immediate_mul_v2i16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0x141007b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x141007b
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: immediate_mul_v2i16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, 0x141007b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ;
 ; GFX9-LABEL: pulled_out_test:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX9-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
@@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: pulled_out_test:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %idxprom = ashr exact i64 15, 32
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index f19b0a5f53e95..eaa3d22f0465c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-MUBUF %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-FLATSCR %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -filetype=obj -verify-machineinstrs -start-before=prologepilog %s -o /dev/null
 # Check not crashing when emitting ISA
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 8c663d963b73e..9b9f03ff74aa3 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s7, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_v2i16:
@@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_shl_v2i16:
@@ -90,13 +90,13 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v2i16:
@@ -142,13 +142,13 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_shl_v2i16:
@@ -374,13 +374,13 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_imm_v_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_imm_v_v2i16:
@@ -426,13 +426,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_imm_v_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_imm_v_v2i16:
@@ -462,13 +462,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_v_imm_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_imm_v2i16:
@@ -510,13 +510,13 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_v_imm_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_v_imm_v2i16:
@@ -546,14 +546,14 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_shl_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v4i16:
@@ -609,14 +609,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_shl_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_shl_v4i16:
@@ -649,14 +649,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
 define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_v_imm_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_imm_v4i16:
@@ -708,14 +708,14 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_v_imm_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_v_imm_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index ddf331816694a..88e2bb772a2d3 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -84,24 +84,24 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_x_sub_64:
@@ -223,35 +223,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -359,24 +359,24 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_64_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_64_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_64_sub_x:
@@ -474,46 +474,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
@@ -626,24 +626,24 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_65_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0x41, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_65_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_65_sub_x:
@@ -741,46 +741,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
@@ -893,24 +893,24 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, -16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg16_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_neg16_sub_x:
@@ -1008,46 +1008,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
@@ -1160,24 +1160,24 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0xffffffef, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg17_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_neg17_sub_x:
@@ -1330,24 +1330,24 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i16_x_sub_64:
@@ -1451,27 +1451,27 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
@@ -1597,35 +1597,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
 ; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX10-NEXT:    v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1748,24 +1748,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
@@ -1878,37 +1878,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x400007
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x400007
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x400007
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
@@ -2021,37 +2021,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x7b0040
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x7b0040
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b0040
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
@@ -2161,24 +2161,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
@@ -2286,24 +2286,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
@@ -2410,37 +2410,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_brev_b32 s0, 35
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_brev_b32 s2, 35
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 35
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
@@ -2547,37 +2547,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_brev_b32 s0, 34
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_brev_b32 s2, 34
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 34
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x44000000
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
@@ -2691,24 +2691,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
@@ -2815,24 +2815,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
@@ -2941,24 +2941,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
@@ -3072,24 +3072,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
@@ -3196,24 +3196,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
@@ -3322,24 +3322,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
@@ -3452,48 +3452,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0xc400
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0xc400
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc400c400
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
@@ -3621,48 +3621,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0x4400
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x44004400
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
@@ -3790,24 +3790,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
@@ -3920,24 +3920,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
@@ -4043,24 +4043,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
@@ -4163,47 +4163,47 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_not_b32_e32 v2, 31
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir b/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir
new file mode 100644
index 0000000000000..1f4d046a8739f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[V_S_RSQ_F32_e64_:%[0-9]+]]:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_S_RSQ_F32_e64_]]
+    ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[COPY1]], [[COPY1]], implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e32_]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, %0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %1, 0, %1, -1, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = nofpexcept V_ADD_F32_e32 %2:vgpr_32, %2:vgpr_32, implicit $mode, implicit $exec
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index ba1caf376975c..bf21ed6898ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -375,11 +375,8 @@ declare float @_Z4pownfi(float, i32)
 ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
 ; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
-; GCN: %[[r0:.*]] = bitcast float %tmp to i32
-; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp)
+; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -435,11 +432,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
 ; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
 ; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
-; GCN: %1 = bitcast half %x to i16
-; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x)
 define half @test_pow_fast_f16__y_13(half %x) {
   %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
   ret half %powr
@@ -450,11 +443,7 @@ define half @test_pow_fast_f16__y_13(half %x) {
 ; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
 ; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
 ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
-; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
-; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %1 = tail call <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %x)
 define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
   %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
   ret <2 x half> %powr
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
index 266e67364c82b..ff87c28bb7ec5 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
 
 # Ensure that PostRA Machine Sink does not sink instructions
 # past block prologues which would overwrite their uses.
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a440b87efc81f..eebd32cd67e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 
 define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
 ; GCN-LABEL: test_kill_depth_0_imm_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
index 0f692107711df..2f1eeb82bf648 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
 
 # GCN-LABEL: name: hazard_smem_war
 # GCN:        S_LOAD_DWORD_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 52db7fea08e05..c9c00a84e0f4b 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -92,8 +92,8 @@ entry:
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
-; GFX9_10: s_add_u32 s2, s2, -4
-; GFX9_10: s_addc_u32 s3, s3, -1
+; GFX9_10: s_add_u32 s0, s6, -4
+; GFX9_10: s_addc_u32 s1, s7, -1
 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 537aca12bdb70..639bf6a6d550c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
 
 --- |
   define amdgpu_kernel void @check_vcc() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index abf013e39eefa..7b0241984a349 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -7,20 +7,20 @@
 define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i16_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_movk_i32 s0, 0x4925
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GCN-NEXT:    s_movk_i32 s2, 0x4925
+; GCN-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 17, v2
 ; GCN-NEXT:    v_add_u16_e32 v2, v2, v3
 ; GCN-NEXT:    v_mul_lo_u16_e32 v2, 7, v2
 ; GCN-NEXT:    v_sub_u16_e32 v1, v1, v2
-; GCN-NEXT:    global_store_short v0, v1, s[0:1]
+; GCN-NEXT:    global_store_short v0, v1, s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i16_7:
@@ -113,38 +113,38 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v1
-; GCN-NEXT:    s_abs_i32 s2, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-NEXT:    v_readfirstlane_b32 s3, v0
-; GCN-NEXT:    s_sub_i32 s5, 0, s2
-; GCN-NEXT:    s_ashr_i32 s4, s3, 31
+; GCN-NEXT:    v_readfirstlane_b32 s0, v1
+; GCN-NEXT:    s_abs_i32 s0, s0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    s_sub_i32 s3, 0, s0
+; GCN-NEXT:    s_ashr_i32 s2, s1, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_abs_i32 s3, s3
+; GCN-NEXT:    s_abs_i32 s1, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NEXT:    s_mul_i32 s5, s5, s6
-; GCN-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GCN-NEXT:    s_add_i32 s6, s6, s5
-; GCN-NEXT:    s_mul_hi_u32 s5, s3, s6
-; GCN-NEXT:    s_mul_i32 s5, s5, s2
-; GCN-NEXT:    s_sub_i32 s3, s3, s5
-; GCN-NEXT:    s_sub_i32 s5, s3, s2
-; GCN-NEXT:    s_cmp_ge_u32 s3, s2
-; GCN-NEXT:    s_cselect_b32 s3, s5, s3
-; GCN-NEXT:    s_sub_i32 s5, s3, s2
-; GCN-NEXT:    s_cmp_ge_u32 s3, s2
-; GCN-NEXT:    s_cselect_b32 s2, s5, s3
-; GCN-NEXT:    s_xor_b32 s2, s2, s4
-; GCN-NEXT:    s_sub_i32 s2, s2, s4
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    s_mul_i32 s3, s3, s6
+; GCN-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GCN-NEXT:    s_add_i32 s6, s6, s3
+; GCN-NEXT:    s_mul_hi_u32 s3, s1, s6
+; GCN-NEXT:    s_mul_i32 s3, s3, s0
+; GCN-NEXT:    s_sub_i32 s1, s1, s3
+; GCN-NEXT:    s_sub_i32 s3, s1, s0
+; GCN-NEXT:    s_cmp_ge_u32 s1, s0
+; GCN-NEXT:    s_cselect_b32 s1, s3, s1
+; GCN-NEXT:    s_sub_i32 s3, s1, s0
+; GCN-NEXT:    s_cmp_ge_u32 s1, s0
+; GCN-NEXT:    s_cselect_b32 s0, s3, s1
+; GCN-NEXT:    s_xor_b32 s0, s0, s2
+; GCN-NEXT:    s_sub_i32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32:
@@ -277,17 +277,17 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
+; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
 ; GCN-NEXT:    v_add_u32_e32 v2, v1, v2
 ; GCN-NEXT:    v_and_b32_e32 v2, -4, v2
 ; GCN-NEXT:    v_sub_u32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32_4:
@@ -363,20 +363,20 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s0, 0x92492493
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
-; GCN-NEXT:    s_mov_b32 s2, 0x92492493
+; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_hi_i32 v2, v1, s2
+; GCN-NEXT:    v_mul_hi_i32 v2, v1, s0
 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, 7
 ; GCN-NEXT:    v_sub_u32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32_7:
@@ -459,64 +459,64 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v2
-; GCN-NEXT:    s_abs_i32 s2, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GCN-NEXT:    v_readfirstlane_b32 s3, v0
-; GCN-NEXT:    s_sub_i32 s6, 0, s2
-; GCN-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN-NEXT:    v_readfirstlane_b32 s0, v2
+; GCN-NEXT:    s_abs_i32 s0, s0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    s_sub_i32 s6, 0, s0
+; GCN-NEXT:    s_ashr_i32 s3, s1, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    s_abs_i32 s3, s3
-; GCN-NEXT:    v_readfirstlane_b32 s4, v3
+; GCN-NEXT:    s_abs_i32 s1, s1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v3
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s3, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s2
-; GCN-NEXT:    s_sub_i32 s3, s3, s6
-; GCN-NEXT:    s_sub_i32 s6, s3, s2
-; GCN-NEXT:    s_cmp_ge_u32 s3, s2
-; GCN-NEXT:    s_cselect_b32 s3, s6, s3
-; GCN-NEXT:    s_sub_i32 s6, s3, s2
-; GCN-NEXT:    s_cmp_ge_u32 s3, s2
-; GCN-NEXT:    s_cselect_b32 s2, s6, s3
-; GCN-NEXT:    s_abs_i32 s3, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GCN-NEXT:    s_xor_b32 s2, s2, s5
-; GCN-NEXT:    s_sub_i32 s7, 0, s3
-; GCN-NEXT:    s_sub_i32 s2, s2, s5
+; GCN-NEXT:    s_mul_hi_u32 s6, s1, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s0
+; GCN-NEXT:    s_sub_i32 s1, s1, s6
+; GCN-NEXT:    s_sub_i32 s6, s1, s0
+; GCN-NEXT:    s_cmp_ge_u32 s1, s0
+; GCN-NEXT:    s_cselect_b32 s1, s6, s1
+; GCN-NEXT:    s_sub_i32 s6, s1, s0
+; GCN-NEXT:    s_cmp_ge_u32 s1, s0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s1
+; GCN-NEXT:    s_abs_i32 s1, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GCN-NEXT:    s_xor_b32 s0, s0, s3
+; GCN-NEXT:    s_sub_i32 s7, 0, s1
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s4, v1
-; GCN-NEXT:    s_ashr_i32 s6, s4, 31
-; GCN-NEXT:    s_abs_i32 s4, s4
+; GCN-NEXT:    v_readfirstlane_b32 s2, v1
+; GCN-NEXT:    s_ashr_i32 s6, s2, 31
+; GCN-NEXT:    s_abs_i32 s2, s2
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_mul_i32 s7, s7, s5
-; GCN-NEXT:    s_mul_hi_u32 s7, s5, s7
-; GCN-NEXT:    s_add_i32 s5, s5, s7
-; GCN-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GCN-NEXT:    s_mul_i32 s5, s5, s3
-; GCN-NEXT:    s_sub_i32 s4, s4, s5
-; GCN-NEXT:    s_sub_i32 s5, s4, s3
-; GCN-NEXT:    s_cmp_ge_u32 s4, s3
-; GCN-NEXT:    s_cselect_b32 s4, s5, s4
-; GCN-NEXT:    s_sub_i32 s5, s4, s3
-; GCN-NEXT:    s_cmp_ge_u32 s4, s3
-; GCN-NEXT:    s_cselect_b32 s3, s5, s4
-; GCN-NEXT:    s_xor_b32 s3, s3, s6
-; GCN-NEXT:    s_sub_i32 s3, s3, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_mul_i32 s7, s7, s3
+; GCN-NEXT:    s_mul_hi_u32 s7, s3, s7
+; GCN-NEXT:    s_add_i32 s3, s3, s7
+; GCN-NEXT:    s_mul_hi_u32 s3, s2, s3
+; GCN-NEXT:    s_mul_i32 s3, s3, s1
+; GCN-NEXT:    s_sub_i32 s2, s2, s3
+; GCN-NEXT:    s_sub_i32 s3, s2, s1
+; GCN-NEXT:    s_cmp_ge_u32 s2, s1
+; GCN-NEXT:    s_cselect_b32 s2, s3, s2
+; GCN-NEXT:    s_sub_i32 s3, s2, s1
+; GCN-NEXT:    s_cmp_ge_u32 s2, s1
+; GCN-NEXT:    s_cselect_b32 s1, s3, s2
+; GCN-NEXT:    s_xor_b32 s1, s1, s6
+; GCN-NEXT:    s_sub_i32 s1, s1, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i32:
@@ -723,26 +723,26 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s3, v1
-; GCN-NEXT:    s_ashr_i32 s4, s2, 31
-; GCN-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN-NEXT:    s_lshr_b32 s4, s4, 30
-; GCN-NEXT:    s_lshr_b32 s5, s5, 30
-; GCN-NEXT:    s_add_i32 s4, s2, s4
-; GCN-NEXT:    s_add_i32 s5, s3, s5
-; GCN-NEXT:    s_and_b32 s4, s4, -4
-; GCN-NEXT:    s_and_b32 s5, s5, -4
-; GCN-NEXT:    s_sub_i32 s2, s2, s4
-; GCN-NEXT:    s_sub_i32 s3, s3, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-NEXT:    s_ashr_i32 s2, s0, 31
+; GCN-NEXT:    s_ashr_i32 s3, s1, 31
+; GCN-NEXT:    s_lshr_b32 s2, s2, 30
+; GCN-NEXT:    s_lshr_b32 s3, s3, 30
+; GCN-NEXT:    s_add_i32 s2, s0, s2
+; GCN-NEXT:    s_add_i32 s3, s1, s3
+; GCN-NEXT:    s_and_b32 s2, s2, -4
+; GCN-NEXT:    s_and_b32 s3, s3, -4
+; GCN-NEXT:    s_sub_i32 s0, s0, s2
+; GCN-NEXT:    s_sub_i32 s1, s1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i32_4:
@@ -842,118 +842,118 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[1:4], v0, s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[5:8], v0, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v1
-; GCN-NEXT:    s_abs_i32 s2, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-NEXT:    s_sub_i32 s6, 0, s2
+; GCN-NEXT:    v_readfirstlane_b32 s0, v1
+; GCN-NEXT:    s_abs_i32 s0, s0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GCN-NEXT:    s_sub_i32 s6, 0, s0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s4, v5
-; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    v_readfirstlane_b32 s2, v5
+; GCN-NEXT:    s_ashr_i32 s3, s2, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_abs_i32 s4, s4
-; GCN-NEXT:    v_readfirstlane_b32 s3, v2
+; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_readfirstlane_b32 s1, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v1
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s4, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s2
-; GCN-NEXT:    s_sub_i32 s4, s4, s6
-; GCN-NEXT:    s_sub_i32 s6, s4, s2
-; GCN-NEXT:    s_cmp_ge_u32 s4, s2
-; GCN-NEXT:    s_cselect_b32 s4, s6, s4
-; GCN-NEXT:    s_sub_i32 s6, s4, s2
-; GCN-NEXT:    s_cmp_ge_u32 s4, s2
-; GCN-NEXT:    s_cselect_b32 s2, s6, s4
-; GCN-NEXT:    s_abs_i32 s3, s3
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_xor_b32 s2, s2, s5
-; GCN-NEXT:    s_sub_i32 s8, 0, s3
-; GCN-NEXT:    s_sub_i32 s2, s2, s5
+; GCN-NEXT:    s_mul_hi_u32 s6, s2, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s0
+; GCN-NEXT:    s_sub_i32 s2, s2, s6
+; GCN-NEXT:    s_sub_i32 s6, s2, s0
+; GCN-NEXT:    s_cmp_ge_u32 s2, s0
+; GCN-NEXT:    s_cselect_b32 s2, s6, s2
+; GCN-NEXT:    s_sub_i32 s6, s2, s0
+; GCN-NEXT:    s_cmp_ge_u32 s2, s0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s2
+; GCN-NEXT:    s_abs_i32 s1, s1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GCN-NEXT:    s_xor_b32 s0, s0, s3
+; GCN-NEXT:    s_sub_i32 s8, 0, s1
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v6
 ; GCN-NEXT:    s_ashr_i32 s7, s6, 31
 ; GCN-NEXT:    s_abs_i32 s6, s6
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v3
-; GCN-NEXT:    v_readfirstlane_b32 s5, v1
-; GCN-NEXT:    s_mul_i32 s8, s8, s5
-; GCN-NEXT:    s_mul_hi_u32 s8, s5, s8
-; GCN-NEXT:    s_add_i32 s5, s5, s8
-; GCN-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GCN-NEXT:    s_mul_i32 s5, s5, s3
-; GCN-NEXT:    s_sub_i32 s5, s6, s5
-; GCN-NEXT:    s_sub_i32 s6, s5, s3
-; GCN-NEXT:    s_cmp_ge_u32 s5, s3
-; GCN-NEXT:    s_cselect_b32 s5, s6, s5
-; GCN-NEXT:    s_sub_i32 s6, s5, s3
-; GCN-NEXT:    s_cmp_ge_u32 s5, s3
-; GCN-NEXT:    s_cselect_b32 s3, s6, s5
-; GCN-NEXT:    s_abs_i32 s4, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GCN-NEXT:    s_xor_b32 s3, s3, s7
-; GCN-NEXT:    s_sub_i32 s9, 0, s4
-; GCN-NEXT:    s_sub_i32 s3, s3, s7
+; GCN-NEXT:    v_readfirstlane_b32 s2, v3
+; GCN-NEXT:    v_readfirstlane_b32 s3, v1
+; GCN-NEXT:    s_mul_i32 s8, s8, s3
+; GCN-NEXT:    s_mul_hi_u32 s8, s3, s8
+; GCN-NEXT:    s_add_i32 s3, s3, s8
+; GCN-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GCN-NEXT:    s_mul_i32 s3, s3, s1
+; GCN-NEXT:    s_sub_i32 s3, s6, s3
+; GCN-NEXT:    s_sub_i32 s6, s3, s1
+; GCN-NEXT:    s_cmp_ge_u32 s3, s1
+; GCN-NEXT:    s_cselect_b32 s3, s6, s3
+; GCN-NEXT:    s_sub_i32 s6, s3, s1
+; GCN-NEXT:    s_cmp_ge_u32 s3, s1
+; GCN-NEXT:    s_cselect_b32 s1, s6, s3
+; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GCN-NEXT:    s_xor_b32 s1, s1, s7
+; GCN-NEXT:    s_sub_i32 s9, 0, s2
+; GCN-NEXT:    s_sub_i32 s1, s1, s7
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v7
 ; GCN-NEXT:    s_ashr_i32 s8, s6, 31
 ; GCN-NEXT:    s_abs_i32 s6, s6
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v4
+; GCN-NEXT:    v_readfirstlane_b32 s3, v4
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v1
 ; GCN-NEXT:    s_mul_i32 s9, s9, s7
 ; GCN-NEXT:    s_mul_hi_u32 s9, s7, s9
 ; GCN-NEXT:    s_add_i32 s7, s7, s9
 ; GCN-NEXT:    s_mul_hi_u32 s7, s6, s7
-; GCN-NEXT:    s_mul_i32 s7, s7, s4
+; GCN-NEXT:    s_mul_i32 s7, s7, s2
 ; GCN-NEXT:    s_sub_i32 s6, s6, s7
-; GCN-NEXT:    s_sub_i32 s7, s6, s4
-; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_sub_i32 s7, s6, s2
+; GCN-NEXT:    s_cmp_ge_u32 s6, s2
 ; GCN-NEXT:    s_cselect_b32 s6, s7, s6
-; GCN-NEXT:    s_sub_i32 s7, s6, s4
-; GCN-NEXT:    s_cmp_ge_u32 s6, s4
-; GCN-NEXT:    s_cselect_b32 s4, s7, s6
-; GCN-NEXT:    s_abs_i32 s5, s5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-NEXT:    s_sub_i32 s7, s6, s2
+; GCN-NEXT:    s_cmp_ge_u32 s6, s2
+; GCN-NEXT:    s_cselect_b32 s2, s7, s6
+; GCN-NEXT:    s_abs_i32 s3, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v8
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_ashr_i32 s2, s6, 31
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_ashr_i32 s0, s6, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    s_abs_i32 s3, s6
-; GCN-NEXT:    s_sub_i32 s6, 0, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_abs_i32 s1, s6
+; GCN-NEXT:    s_sub_i32 s6, 0, s3
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    s_xor_b32 s4, s4, s8
-; GCN-NEXT:    s_sub_i32 s4, s4, s8
+; GCN-NEXT:    s_xor_b32 s2, s2, s8
+; GCN-NEXT:    s_sub_i32 s2, s2, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v3
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s3, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s5
-; GCN-NEXT:    s_sub_i32 s3, s3, s6
-; GCN-NEXT:    s_sub_i32 s6, s3, s5
-; GCN-NEXT:    s_cmp_ge_u32 s3, s5
-; GCN-NEXT:    s_cselect_b32 s3, s6, s3
-; GCN-NEXT:    s_sub_i32 s6, s3, s5
-; GCN-NEXT:    s_cmp_ge_u32 s3, s5
-; GCN-NEXT:    s_cselect_b32 s3, s6, s3
-; GCN-NEXT:    s_xor_b32 s3, s3, s2
-; GCN-NEXT:    s_sub_i32 s2, s3, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
+; GCN-NEXT:    s_mul_hi_u32 s6, s1, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s3
+; GCN-NEXT:    s_sub_i32 s1, s1, s6
+; GCN-NEXT:    s_sub_i32 s6, s1, s3
+; GCN-NEXT:    s_cmp_ge_u32 s1, s3
+; GCN-NEXT:    s_cselect_b32 s1, s6, s1
+; GCN-NEXT:    s_sub_i32 s6, s1, s3
+; GCN-NEXT:    s_cmp_ge_u32 s1, s3
+; GCN-NEXT:    s_cselect_b32 s1, s6, s1
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i32:
@@ -1317,40 +1317,40 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s3, v1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v2
-; GCN-NEXT:    v_readfirstlane_b32 s5, v3
-; GCN-NEXT:    s_ashr_i32 s6, s2, 31
-; GCN-NEXT:    s_ashr_i32 s7, s3, 31
-; GCN-NEXT:    s_ashr_i32 s8, s4, 31
-; GCN-NEXT:    s_ashr_i32 s9, s5, 31
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v2
+; GCN-NEXT:    v_readfirstlane_b32 s3, v3
+; GCN-NEXT:    s_ashr_i32 s6, s0, 31
+; GCN-NEXT:    s_ashr_i32 s7, s1, 31
+; GCN-NEXT:    s_ashr_i32 s8, s2, 31
+; GCN-NEXT:    s_ashr_i32 s9, s3, 31
 ; GCN-NEXT:    s_lshr_b32 s6, s6, 30
 ; GCN-NEXT:    s_lshr_b32 s7, s7, 30
 ; GCN-NEXT:    s_lshr_b32 s8, s8, 30
 ; GCN-NEXT:    s_lshr_b32 s9, s9, 30
-; GCN-NEXT:    s_add_i32 s6, s2, s6
-; GCN-NEXT:    s_add_i32 s7, s3, s7
-; GCN-NEXT:    s_add_i32 s8, s4, s8
-; GCN-NEXT:    s_add_i32 s9, s5, s9
+; GCN-NEXT:    s_add_i32 s6, s0, s6
+; GCN-NEXT:    s_add_i32 s7, s1, s7
+; GCN-NEXT:    s_add_i32 s8, s2, s8
+; GCN-NEXT:    s_add_i32 s9, s3, s9
 ; GCN-NEXT:    s_and_b32 s6, s6, -4
 ; GCN-NEXT:    s_and_b32 s7, s7, -4
 ; GCN-NEXT:    s_and_b32 s8, s8, -4
 ; GCN-NEXT:    s_and_b32 s9, s9, -4
-; GCN-NEXT:    s_sub_i32 s2, s2, s6
-; GCN-NEXT:    s_sub_i32 s3, s3, s7
-; GCN-NEXT:    s_sub_i32 s4, s4, s8
-; GCN-NEXT:    s_sub_i32 s5, s5, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-NEXT:    s_sub_i32 s0, s0, s6
+; GCN-NEXT:    s_sub_i32 s1, s1, s7
+; GCN-NEXT:    s_sub_i32 s2, s2, s8
+; GCN-NEXT:    s_sub_i32 s3, s3, s9
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i32_4:
@@ -2589,10 +2589,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
@@ -2601,7 +2601,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_and_b32_e32 v3, -4, v3
 ; GCN-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i64_4:
@@ -4733,10 +4733,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
@@ -4752,7 +4752,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GCN-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
 ; GCN-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i64_4:
@@ -8883,11 +8883,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 30, v9
@@ -8918,8 +8918,8 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v15, vcc
 ; GCN-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v12
 ; GCN-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v16, vcc
-; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
+; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i64_4:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 6044873563254..df12e32d971a2 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: s_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i32:
@@ -144,13 +144,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: test_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_i32:
@@ -208,13 +208,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: test_sub_imm_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0x7b, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_imm_i32:
@@ -272,14 +272,14 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v2i32:
@@ -350,17 +350,17 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v6, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
-; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v4i32:
@@ -432,16 +432,16 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: test_sub_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_sub_u16_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_i16:
@@ -517,14 +517,14 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v2i16:
@@ -608,15 +608,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index fe234a82ba6f7..5a821db6ff040 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -216,15 +216,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
@@ -248,14 +248,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_sub_i16 v0, s6, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
@@ -279,16 +279,16 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT:    s_mov_b32 s0, 0x1c8007b
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_constant:
@@ -312,16 +312,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_constant:
@@ -353,16 +353,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT:    s_mov_b32 s0, 0xfc21fcb3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_neg_constant:
@@ -386,16 +386,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
@@ -426,15 +426,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_neg1:
@@ -458,16 +458,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
@@ -498,15 +498,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
@@ -529,16 +529,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
@@ -570,15 +570,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 1.0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
@@ -601,16 +601,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 1.0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index e668c1d2b7f3d..d9e0e0298e072 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
   ; CHECK-NEXT:   undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+  ; CHECK-NEXT:   early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
   ; CHECK-NEXT:   $sgpr0 = COPY killed [[COPY3]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
   ; CHECK-NEXT:   $sgpr1 = COPY killed [[COPY4]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   $sgpr2 = COPY killed [[COPY5]]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 03a1b3598024b..c08571a733cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_uaddo_i32_novcc:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s6, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_addc_u32 s7, s5, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_add_u32 s0, s8, s10
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_addc_u32 s1, s9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v1, v2
 ; GFX9-NEXT:    v_cmp_lt_u16_e32 vcc, v2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_uaddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index dacc986205983..36a0cbd3f0970 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_usubo_i32_novcc:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s6, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_subb_u32 s7, s5, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_sub_u32 s0, s8, s10
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_subb_u32 s1, s9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v1, v2
 ; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, v2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
-; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_usubo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index 2210b6c0d3c3a..1bb76cf547e25 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
 define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_add_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT:    global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 4b9b5f9ffdf84..39c7538738eb1 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
 
 define amdgpu_kernel void @icmp_test() {
 ; CHECK-LABEL: icmp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index a8f3635416cff..d1bf5ecb56984 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -137,13 +137,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; GFX10-LABEL: v_cnd_nan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cnd_nan:
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 9f6d27802e184..5a7cce39f6103 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ;
 ; GISEL-LABEL: v_pack_b32_v2f16:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16_sub:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ;
 ; GISEL-LABEL: v_pack_b32_v2f16_sub:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 define amdgpu_kernel void @fptrunc(
 ; GCN-LABEL: fptrunc:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0x31016000
-; GCN-NEXT:    s_mov_b32 s10, s6
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x31016000
+; GCN-NEXT:    s_mov_b32 s10, s2
+; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s2
-; GCN-NEXT:    s_mov_b32 s9, s3
-; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s8, s6
+; GCN-NEXT:    s_mov_b32 s9, s7
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fptrunc:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GISEL-NEXT:    s_mov_b32 s6, -1
+; GISEL-NEXT:    s_mov_b32 s7, 0x31016000
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
-; GISEL-NEXT:    s_mov_b32 s2, -1
-; GISEL-NEXT:    s_mov_b32 s3, 0x31016000
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s1
 ; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc(
 define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ;
 ; GISEL-LABEL: v_pack_b32.fabs:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ;
 ; GISEL-LABEL: v_pack_b32.fneg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 8579cbdf47137..ee99fcc586334 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_med3_i16 v2, s2, 0, v1
-; SDAG-GFX9-NEXT:    v_med3_i16 v1, s3, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v2, s6, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v1, s7, 0, v1
 ; SDAG-GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SDAG-GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX9-NEXT:    s_endpgm
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
@@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s4, 0
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s5, 0xff
+; GISEL-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, 0
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s1, 0xff
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s6
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s7
+; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s0
+; GISEL-GFX9-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s4
-; GISEL-GFX9-NEXT:    s_max_i32 s3, s3, s4
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s5
-; GISEL-GFX9-NEXT:    s_min_i32 s3, s3, s5
-; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s1
+; GISEL-GFX9-NEXT:    s_min_i32 s0, s0, s1
+; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
+; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index 02a6024f858e9..973b7f5c04987 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
 define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_sub_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT:    global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index e143611ff9b01..17e3d93ed393b 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=tahiti -o - %s | FileCheck %s -check-prefixes=CHECK,SI
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
 ---
 # CHECK-LABEL: name: vccz_corrupt_workaround
 # CHECK: $vcc = V_CMP_EQ_F32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
index 4592498199d18..47f13cb5ebe16 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
 ---
 
 # After the Optimize exec masking (post-RA) pass, there's a change of having v_cmpx instructions
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
index 645d473819139..e97c518da1e3c 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
 # GCN:        $sgpr0 = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
index 81d17a8fd0f90..76713dd2996dc 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
@@ -1,9 +1,9 @@
 # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
 # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
 
 # GFX9-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX9-ERR: $vgpr0 = V_CNDMASK_B32_e64 0, $sgpr0, 0, -1, killed $sgpr0_sgpr1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
index 6c55183bb5287..bbd0817d5995a 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
 
 # GFX12-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX12-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
index dc7d4afa85741..b310f55d5b86b 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
 
 # GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 02da6deb96f1f..2a280bcda42f5 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.7(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
-  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec
-  ; SI-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; SI-NEXT:   early-clobber %33:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %33.sub0, killed %54, 0, implicit $exec
+  ; SI-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %33.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
   ; SI-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
+  ; SI-NEXT:   early-clobber %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
   ; SI-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
-  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %10, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
   ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
   ; SI-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 20dc5ad5c8665..50927a2cf21af 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -25,8 +25,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-NEXT:    ; implicit-def: $sgpr4
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s4
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    ds_write_b8 v1, v2
 ; CHECK-NEXT:    s_mov_b64 s[4:5], exec
 ; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 7301b341cbc71..33c06e5d1e3a5 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -405,33 +405,33 @@ return:
 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_chain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
 ; GFX906-NEXT:  .LBB8_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -460,28 +460,28 @@ bb.3:
 define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_zeroinit:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[4:5]
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
-; GFX906-NEXT:    s_mov_b32 s2, 0
-; GFX906-NEXT:    s_mov_b32 s3, s2
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[6:7]
+; GFX906-NEXT:    s_mov_b32 s4, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    s_mov_b32 s5, s4
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_mov_b32_e32 v4, s3
-; GFX906-NEXT:    v_mov_b32_e32 v3, s2
+; GFX906-NEXT:    v_mov_b32_e32 v3, s4
+; GFX906-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX906-NEXT:  .LBB9_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
@@ -489,12 +489,12 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -522,8 +522,9 @@ bb.3:
 define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_const:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    ; implicit-def: $vgpr3
 ; GFX906-NEXT:    ; implicit-def: $vgpr13
@@ -533,8 +534,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    ; implicit-def: $vgpr12
 ; GFX906-NEXT:    ; implicit-def: $vgpr16
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1]
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -542,13 +542,13 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
-; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v4, s[2:3]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v4, s[6:7]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX906-NEXT:    v_mov_b32_e32 v10, 2
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 3
@@ -557,7 +557,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_mov_b32_e32 v7, 6
 ; GFX906-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 8
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
@@ -566,7 +566,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX906-NEXT:  .LBB10_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
@@ -581,7 +581,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
 ; GFX906-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX906-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX906-NEXT:    v_mov_b32_e32 v11, v9
@@ -603,7 +603,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -631,31 +631,31 @@ bb.3:
 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_multi_block:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[0:1]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[2:3]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
 ; GFX906-NEXT:  .LBB11_3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB11_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[6:7]
+; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[10:11]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
index 5d2e299aa854a..1e60261df1304 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
@@ -349,4 +349,223 @@ bb.2:
   ret void
 }
 
+; Should not produce a broken phi
+
+define void @broken_phi() {
+; GFX906-LABEL: define void @broken_phi(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT:  bb:
+; GFX906-NEXT:    br label [[BB1:%.*]]
+; GFX906:       bb1:
+; GFX906-NEXT:    [[I:%.*]] = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ]
+; GFX906-NEXT:    br i1 false, label [[BB3:%.*]], label [[BB2:%.*]]
+; GFX906:       bb2:
+; GFX906-NEXT:    br label [[BB3]]
+; GFX906:       bb3:
+; GFX906-NEXT:    [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ]
+; GFX906-NEXT:    br i1 false, label [[BB7]], label [[BB5:%.*]]
+; GFX906:       bb5:
+; GFX906-NEXT:    [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer)
+; GFX906-NEXT:    br label [[BB7]]
+; GFX906:       bb7:
+; GFX906-NEXT:    [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ]
+; GFX906-NEXT:    br label [[BB1]]
+;
+bb:
+  br label %bb1
+bb1:
+  %i = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, %bb ], [ %i8, %bb7 ]
+  br i1 false, label %bb3, label %bb2
+bb2:
+  br label %bb3
+bb3:
+  %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ]
+  br i1 false, label %bb7, label %bb5
+bb5:
+  %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer)
+  br label %bb7
+bb7:
+  %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ]
+  br label %bb1
+}
+
+; %sel1 should just use %sel0 instead of trying to convert back the
+; converted version of %sel0
+
+define amdgpu_kernel void @reuseOp() {
+; GFX906-LABEL: define amdgpu_kernel void @reuseOp(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
+; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
+; GFX906-NEXT:    br label [[BB_1:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
+; GFX906-NEXT:    [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+; GFX906-NEXT:    [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
+; GFX906-NEXT:    [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
+; GFX906-NEXT:    br label [[BB_2:%.*]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
+; GFX906-NEXT:    [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
+; GFX906-NEXT:    ret void
+;
+entry:
+  %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
+  br label %bb.1
+
+bb.1:
+  %sel0 = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+  %sel1 = select i1 false, <16 x i8> %vec1, <16 x i8> %sel0
+  br label %bb.2
+
+bb.2:
+  %val = extractelement <16 x i8> %sel0, i64 0
+  ret void
+}
+
+
+define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) {
+; GFX906-LABEL: define amdgpu_kernel void @deletedPHI(
+; GFX906-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    br label [[BB_1:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ]
+; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    br label [[BB_3]]
+; GFX906:       bb.3:
+; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
+; GFX906:       bb.4:
+; GFX906-NEXT:    [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0
+; GFX906-NEXT:    br label [[BB_5]]
+; GFX906:       bb.5:
+; GFX906-NEXT:    [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
+; GFX906:       bb.6:
+; GFX906-NEXT:    br label [[BB_7]]
+; GFX906:       bb.7:
+; GFX906-NEXT:    [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_9:%.*]], label [[BB_8:%.*]]
+; GFX906:       bb.8:
+; GFX906-NEXT:    br label [[BB_9]]
+; GFX906:       bb.9:
+; GFX906-NEXT:    [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]]
+; GFX906:       bb.10:
+; GFX906-NEXT:    br label [[BB_11]]
+; GFX906:       bb.11:
+; GFX906-NEXT:    [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ]
+; GFX906-NEXT:    [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
+; GFX906-NEXT:    br label [[BB_1]]
+;
+entry:
+  br label %bb.1
+
+bb.1:
+  %phi0 = phi i32 [ 0, %entry ], [ 1, %bb.11 ]
+  %phi1 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %vec1, %bb.11 ]
+  br i1 %cmp, label %bb.3, label %bb.2
+
+bb.2:
+  br label %bb.3
+
+bb.3:
+  %phi2 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi1, %bb.1 ]
+  br i1 %cmp, label %bb.5, label %bb.4
+
+bb.4:
+  %vec0 = insertelement <10 x i8> %phi2, i8 0, i64 0
+  br label %bb.5
+
+bb.5:                               ; preds = %bb.4, %bb.3
+  %phi3 = phi <10 x i8> [ %vec0, %bb.4 ], [ %phi2, %bb.3 ]
+  br i1 %cmp, label %bb.7, label %bb.6
+
+bb.6:
+  br label %bb.7
+
+bb.7:                               ; preds = %bb.6, %bb.5
+  %phi4 = phi <10 x i8> [ %invec0, %bb.6 ], [ %phi3, %bb.5 ]
+  br i1 %cmp, label %bb.9, label %bb.8
+
+bb.8:
+  br label %bb.9
+
+bb.9:
+  %phi5 = phi <10 x i8> [ %invec0, %bb.8 ], [ %phi4, %bb.7 ]
+  br i1 %cmp, label %bb.11, label %bb.10
+
+bb.10:
+  br label %bb.11
+
+bb.11:
+  %phi6 = phi <10 x i8> [ zeroinitializer, %bb.10 ], [ %phi5, %bb.9 ]
+  %vec1 = shufflevector <10 x i8> %phi6, <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
+  br label %bb.1
+}
+
+define amdgpu_kernel void @multiple_unwind(i1 %cmp, <10 x i8> %invec) {
+; GFX906-LABEL: define amdgpu_kernel void @multiple_unwind(
+; GFX906-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:  entry:
+; GFX906-NEXT:    br label [[BB_1:%.*]]
+; GFX906:       bb.1:
+; GFX906-NEXT:    [[PHI0:%.*]] = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
+; GFX906:       bb.2:
+; GFX906-NEXT:    br label [[BB_3]]
+; GFX906:       bb.3:
+; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
+; GFX906:       bb.4:
+; GFX906-NEXT:    br label [[BB_5]]
+; GFX906:       bb.5:
+; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ]
+; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
+; GFX906:       bb.6:
+; GFX906-NEXT:    br label [[BB_7]]
+; GFX906:       bb.7:
+; GFX906-NEXT:    [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ]
+; GFX906-NEXT:    br label [[BB_8]]
+; GFX906:       bb.8:
+; GFX906-NEXT:    br label [[BB_1]]
+;
+entry:
+  br label %bb.1
+
+bb.1:
+  %phi0 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %phi3, %bb.8 ]
+  br i1 %cmp, label %bb.3, label %bb.2
+
+bb.2:
+  br label %bb.3
+
+bb.3:
+  %phi1 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi0, %bb.1 ]
+  br i1 %cmp, label %bb.5, label %bb.4
+
+bb.4:
+  br label %bb.5
+
+bb.5:
+  %phi2 = phi <10 x i8> [ %phi0, %bb.4 ], [ %phi1, %bb.3 ]
+  br i1 %cmp, label %bb.7, label %bb.6
+
+bb.6:                              ; preds = %bb.5
+  br label %bb.7
+
+bb.7:
+  %phi3 = phi <10 x i8> [ %invec, %bb.6 ], [ %phi2, %bb.5 ]
+  br label %bb.8
+
+bb.8:
+  br label %bb.1
+}
+
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 715d4c0c42b76..3df757a426ae9 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -1,23 +1,66 @@
-;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=FUNC %s
-;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_select_v2i32:
-
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-
-; VI: s_cmp_gt_i32
-; VI: s_cselect_b32
-; VI: s_cmp_gt_i32
-; VI: s_cselect_b32
-
-; SI-DAG: s_cmp_gt_i32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cmp_gt_i32
-; SI-DAG: s_cselect_b32
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
+;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s
 
 define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
+; SI-LABEL: test_select_v2i32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_gt_i32 s9, s5
+; SI-NEXT:    s_cselect_b32 s5, s7, s9
+; SI-NEXT:    s_cmp_gt_i32 s8, s4
+; SI-NEXT:    s_cselect_b32 s4, s6, s8
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: test_select_v2i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_cmp_gt_i32 s9, s5
+; VI-NEXT:    s_cselect_b32 s5, s7, s9
+; VI-NEXT:    s_cmp_gt_i32 s8, s4
+; VI-NEXT:    s_cselect_b32 s4, s6, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: test_select_v2i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @6
+; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     MOV T0.X, KC0[2].Z,
+; EG-NEXT:     MOV * T1.X, KC0[2].W,
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     SETGT_INT * T0.W, T0.Y, T1.Y,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
+; EG-NEXT:     SETGT_INT * T0.W, T0.X, T1.X,
+; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %load0 = load <2 x i32>, ptr addrspace(1) %in0
   %load1 = load <2 x i32>, ptr addrspace(1) %in1
@@ -27,17 +70,72 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_select_v2f32:
-
-; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_cmp_neq_f32_e32 vcc
-; SI: v_cndmask_b32_e32
-; SI: v_cmp_neq_f32_e32 vcc
-; SI: v_cndmask_b32_e32
-
 define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: test_select_v2f32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s3
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v1
+; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: test_select_v2f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: test_select_v2f32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @6
+; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     MOV T0.X, KC0[2].Z,
+; EG-NEXT:     MOV * T1.X, KC0[2].W,
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     SETNE_DX10 * T0.W, T0.Y, T1.Y,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
+; EG-NEXT:     SETNE_DX10 * T0.W, T0.X, T1.X,
+; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %0 = load <2 x float>, ptr addrspace(1) %in0
   %1 = load <2 x float>, ptr addrspace(1) %in1
@@ -47,24 +145,86 @@ entry:
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_select_v4i32:
-
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[4].X
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].W
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-
 define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
+; SI-LABEL: test_select_v4i32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x0
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x11
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_gt_i32 s10, s14
+; SI-NEXT:    s_cselect_b32 s2, s2, s10
+; SI-NEXT:    s_cmp_gt_i32 s9, s13
+; SI-NEXT:    s_cselect_b32 s1, s1, s9
+; SI-NEXT:    s_cmp_gt_i32 s11, s15
+; SI-NEXT:    s_cselect_b32 s3, s3, s11
+; SI-NEXT:    s_cmp_gt_i32 s8, s12
+; SI-NEXT:    s_cselect_b32 s0, s0, s8
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: test_select_v4i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x44
+; VI-NEXT:    s_mov_b32 s8, s4
+; VI-NEXT:    s_mov_b32 s9, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_cmp_gt_i32 s14, s18
+; VI-NEXT:    s_cselect_b32 s2, s2, s14
+; VI-NEXT:    s_cmp_gt_i32 s13, s17
+; VI-NEXT:    s_cselect_b32 s1, s1, s13
+; VI-NEXT:    s_cmp_gt_i32 s15, s19
+; VI-NEXT:    s_cselect_b32 s3, s3, s15
+; VI-NEXT:    s_cmp_gt_i32 s12, s16
+; VI-NEXT:    s_cselect_b32 s0, s0, s12
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: test_select_v4i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @6
+; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     MOV T0.X, KC0[2].Z,
+; EG-NEXT:     MOV * T1.X, KC0[2].W,
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     SETGT_INT T1.W, T0.W, T1.W,
+; EG-NEXT:     SETGT_INT * T2.W, T0.Z, T1.Z,
+; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, KC0[4].X,
+; EG-NEXT:     CNDE_INT T0.Z, T2.W, T0.Z, KC0[3].W,
+; EG-NEXT:     SETGT_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
+; EG-NEXT:     SETGT_INT * T1.W, T0.X, T1.X,
+; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %load0 = load <4 x i32>, ptr addrspace(1) %in0
   %load1 = load <4 x i32>, ptr addrspace(1) %in1
@@ -74,17 +234,92 @@ entry:
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_select_v4f32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
 define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: test_select_v4f32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s3
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v3
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s2
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s1, v1
+; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s0, v0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: test_select_v4f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v2
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: test_select_v4f32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @6
+; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     MOV T0.X, KC0[2].Z,
+; EG-NEXT:     MOV * T1.X, KC0[2].W,
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     SETNE_DX10 T2.W, T0.W, T1.W,
+; EG-NEXT:     SETNE_DX10 * T3.W, T0.Z, T1.Z,
+; EG-NEXT:     CNDE_INT * T0.W, PV.W, T1.W, T0.W,
+; EG-NEXT:     CNDE_INT T0.Z, T3.W, T1.Z, T0.Z,
+; EG-NEXT:     SETNE_DX10 * T1.W, T0.Y, T1.Y,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
+; EG-NEXT:     SETNE_DX10 * T1.W, T0.X, T1.X,
+; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %0 = load <4 x float>, ptr addrspace(1) %in0
   %1 = load <4 x float>, ptr addrspace(1) %in1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index dae46361b9bcc..c3a81771a2790 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -200,30 +200,30 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0
 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
 ; GFX1032-NEXT:    v_cmp_nle_f32_e64 s0, 1.0, v1
 ; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
 ; GFX1064-NEXT:    v_cmp_nle_f32_e64 s[0:1], 1.0, v1
 ; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
@@ -239,30 +239,30 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
 define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
 ; GFX1032-NEXT:    v_cmp_gt_i32_e64 s0, 1, v1
 ; GFX1032-NEXT:    s_xor_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX1064-NEXT:    v_cmp_gt_i32_e64 s[0:1], 1, v1
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -278,30 +278,30 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
 define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v1
 ; GFX1032-NEXT:    v_cmp_gt_u32_e64 s0, 2, v1
 ; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v1
 ; GFX1064-NEXT:    v_cmp_gt_u32_e64 s[0:1], 2, v1
 ; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -631,26 +631,26 @@ bb8:
 define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_addc_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_addc_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -664,26 +664,26 @@ bb:
 define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_subbrev_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_subbrev_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,26 +697,26 @@ bb:
 define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_subb_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s6, v0
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_subb_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s6, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -730,18 +730,18 @@ bb:
 define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-LABEL: test_udiv64:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_or_b64 s[8:9], s[6:7], s[4:5]
-; GFX1032-NEXT:    s_mov_b32 s8, 0
-; GFX1032-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX1032-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_4
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX1032-NEXT:    s_sub_u32 s9, 0, s4
+; GFX1032-NEXT:    s_sub_u32 s3, 0, s4
 ; GFX1032-NEXT:    s_subb_u32 s10, 0, s5
 ; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1032-NEXT:    v_rcp_f32_e32 v0, v0
@@ -753,11 +753,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s13, s9, s1
+; GFX1032-NEXT:    s_mul_i32 s11, s3, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s13, s3, s1
 ; GFX1032-NEXT:    s_mul_i32 s12, s10, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s13, s11
-; GFX1032-NEXT:    s_mul_i32 s14, s9, s1
+; GFX1032-NEXT:    s_mul_i32 s14, s3, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s11, s12
 ; GFX1032-NEXT:    s_mul_hi_u32 s13, s1, s14
 ; GFX1032-NEXT:    s_mul_hi_u32 s15, s0, s14
@@ -777,46 +777,46 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032-NEXT:    s_addc_u32 s0, s0, s11
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s12, s9, s1
+; GFX1032-NEXT:    s_mul_i32 s11, s3, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s12, s3, s1
 ; GFX1032-NEXT:    s_mul_i32 s10, s10, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s12, s11
-; GFX1032-NEXT:    s_mul_i32 s9, s9, s1
+; GFX1032-NEXT:    s_mul_i32 s3, s3, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s11, s10
-; GFX1032-NEXT:    s_mul_hi_u32 s12, s0, s9
-; GFX1032-NEXT:    s_mul_i32 s13, s0, s9
-; GFX1032-NEXT:    s_mul_hi_u32 s9, s1, s9
+; GFX1032-NEXT:    s_mul_hi_u32 s12, s0, s3
+; GFX1032-NEXT:    s_mul_i32 s13, s0, s3
+; GFX1032-NEXT:    s_mul_hi_u32 s3, s1, s3
 ; GFX1032-NEXT:    s_mul_hi_u32 s14, s1, s11
 ; GFX1032-NEXT:    s_mul_i32 s1, s1, s11
 ; GFX1032-NEXT:    s_mul_hi_u32 s10, s0, s11
-; GFX1032-NEXT:    s_add_u32 s1, s9, s1
-; GFX1032-NEXT:    s_addc_u32 s9, 0, s14
+; GFX1032-NEXT:    s_add_u32 s1, s3, s1
+; GFX1032-NEXT:    s_addc_u32 s3, 0, s14
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s13
 ; GFX1032-NEXT:    s_mul_i32 s11, s0, s11
-; GFX1032-NEXT:    s_addc_u32 s1, s9, s12
-; GFX1032-NEXT:    s_addc_u32 s9, s10, 0
+; GFX1032-NEXT:    s_addc_u32 s1, s3, s12
+; GFX1032-NEXT:    s_addc_u32 s3, s10, 0
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s11
-; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
+; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
 ; GFX1032-NEXT:    v_add_co_u32 v0, s1, v0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT:    s_addc_u32 s0, s0, s9
+; GFX1032-NEXT:    s_addc_u32 s0, s0, s3
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX1032-NEXT:    s_mul_i32 s10, s6, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s9, s6, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s0
 ; GFX1032-NEXT:    s_mul_hi_u32 s11, s7, s0
 ; GFX1032-NEXT:    s_mul_i32 s0, s7, s0
 ; GFX1032-NEXT:    s_mul_hi_u32 s12, s6, s1
 ; GFX1032-NEXT:    s_mul_hi_u32 s13, s7, s1
 ; GFX1032-NEXT:    s_mul_i32 s1, s7, s1
 ; GFX1032-NEXT:    s_add_u32 s10, s12, s10
-; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
+; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
 ; GFX1032-NEXT:    s_add_u32 s1, s10, s1
-; GFX1032-NEXT:    s_addc_u32 s1, s9, s13
-; GFX1032-NEXT:    s_addc_u32 s9, s11, 0
+; GFX1032-NEXT:    s_addc_u32 s1, s3, s13
+; GFX1032-NEXT:    s_addc_u32 s3, s11, 0
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s0
-; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
+; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
 ; GFX1032-NEXT:    s_mul_hi_u32 s0, s4, s1
-; GFX1032-NEXT:    s_mul_i32 s11, s4, s9
+; GFX1032-NEXT:    s_mul_i32 s11, s4, s3
 ; GFX1032-NEXT:    s_mul_i32 s12, s4, s1
 ; GFX1032-NEXT:    s_add_i32 s0, s0, s11
 ; GFX1032-NEXT:    v_sub_co_u32 v0, s11, s6, s12
@@ -836,9 +836,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1032-NEXT:    s_add_u32 s10, s1, 1
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX1032-NEXT:    s_addc_u32 s12, s9, 0
+; GFX1032-NEXT:    s_addc_u32 s12, s3, 0
 ; GFX1032-NEXT:    s_add_u32 s13, s1, 2
-; GFX1032-NEXT:    s_addc_u32 s14, s9, 0
+; GFX1032-NEXT:    s_addc_u32 s14, s3, 0
 ; GFX1032-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v0
 ; GFX1032-NEXT:    s_subb_u32 s0, s7, s0
@@ -854,9 +854,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v2, s10, v2, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
 ; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
+; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s2
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB15_3
 ; GFX1032-NEXT:  .LBB15_2:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -870,21 +870,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_add_i32 s0, s0, s1
 ; GFX1032-NEXT:    s_mul_hi_u32 s0, s6, s0
 ; GFX1032-NEXT:    s_mul_i32 s1, s0, s4
-; GFX1032-NEXT:    s_add_i32 s5, s0, 1
+; GFX1032-NEXT:    s_add_i32 s2, s0, 1
 ; GFX1032-NEXT:    s_sub_i32 s1, s6, s1
-; GFX1032-NEXT:    s_sub_i32 s6, s1, s4
+; GFX1032-NEXT:    s_sub_i32 s3, s1, s4
 ; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
-; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
-; GFX1032-NEXT:    s_cselect_b32 s1, s6, s1
-; GFX1032-NEXT:    s_add_i32 s5, s0, 1
+; GFX1032-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX1032-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX1032-NEXT:    s_add_i32 s2, s0, 1
 ; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
 ; GFX1032-NEXT:    s_mov_b32 s1, 0
-; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX1032-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB15_3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
 ; GFX1032-NEXT:    s_endpgm
 ; GFX1032-NEXT:  .LBB15_4:
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -892,9 +892,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX1064-LABEL: test_udiv64:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
 ; GFX1064-NEXT:    s_mov_b32 s0, 0
@@ -903,7 +903,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX1064-NEXT:    s_sub_u32 s9, 0, s4
+; GFX1064-NEXT:    s_sub_u32 s3, 0, s4
 ; GFX1064-NEXT:    s_subb_u32 s10, 0, s5
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1064-NEXT:    v_rcp_f32_e32 v0, v0
@@ -913,92 +913,92 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
-; GFX1064-NEXT:    s_mul_hi_u32 s12, s9, s0
+; GFX1064-NEXT:    s_mul_i32 s1, s3, s2
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s3, s0
 ; GFX1064-NEXT:    s_mul_i32 s11, s10, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s12, s1
-; GFX1064-NEXT:    s_mul_i32 s13, s9, s0
+; GFX1064-NEXT:    s_mul_i32 s13, s3, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s1, s11
 ; GFX1064-NEXT:    s_mul_hi_u32 s12, s0, s13
-; GFX1064-NEXT:    s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT:    s_mul_i32 s11, s8, s13
+; GFX1064-NEXT:    s_mul_hi_u32 s14, s2, s13
+; GFX1064-NEXT:    s_mul_i32 s11, s2, s13
 ; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
 ; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
-; GFX1064-NEXT:    s_mul_hi_u32 s15, s8, s1
+; GFX1064-NEXT:    s_mul_hi_u32 s15, s2, s1
 ; GFX1064-NEXT:    s_add_u32 s0, s12, s0
 ; GFX1064-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s11
-; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
+; GFX1064-NEXT:    s_mul_i32 s1, s2, s1
 ; GFX1064-NEXT:    s_addc_u32 s0, s12, s14
 ; GFX1064-NEXT:    s_addc_u32 s11, s15, 0
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s1
 ; GFX1064-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    s_addc_u32 s8, s8, s11
+; GFX1064-NEXT:    s_addc_u32 s2, s2, s11
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s9, s0
+; GFX1064-NEXT:    s_mul_i32 s1, s3, s2
+; GFX1064-NEXT:    s_mul_hi_u32 s11, s3, s0
 ; GFX1064-NEXT:    s_mul_i32 s10, s10, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s11, s1
-; GFX1064-NEXT:    s_mul_i32 s9, s9, s0
+; GFX1064-NEXT:    s_mul_i32 s3, s3, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s1, s10
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s8, s9
-; GFX1064-NEXT:    s_mul_i32 s12, s8, s9
-; GFX1064-NEXT:    s_mul_hi_u32 s9, s0, s9
+; GFX1064-NEXT:    s_mul_hi_u32 s11, s2, s3
+; GFX1064-NEXT:    s_mul_i32 s12, s2, s3
+; GFX1064-NEXT:    s_mul_hi_u32 s3, s0, s3
 ; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
 ; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
-; GFX1064-NEXT:    s_mul_hi_u32 s10, s8, s1
-; GFX1064-NEXT:    s_add_u32 s0, s9, s0
-; GFX1064-NEXT:    s_addc_u32 s9, 0, s13
+; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s1
+; GFX1064-NEXT:    s_add_u32 s0, s3, s0
+; GFX1064-NEXT:    s_addc_u32 s3, 0, s13
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s12
-; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
-; GFX1064-NEXT:    s_addc_u32 s0, s9, s11
-; GFX1064-NEXT:    s_addc_u32 s9, s10, 0
+; GFX1064-NEXT:    s_mul_i32 s1, s2, s1
+; GFX1064-NEXT:    s_addc_u32 s0, s3, s11
+; GFX1064-NEXT:    s_addc_u32 s3, s10, 0
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s1
-; GFX1064-NEXT:    s_addc_u32 s9, 0, s9
+; GFX1064-NEXT:    s_addc_u32 s3, 0, s3
 ; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    s_addc_u32 s0, s8, s9
+; GFX1064-NEXT:    s_addc_u32 s0, s2, s3
 ; GFX1064-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1064-NEXT:    s_mul_i32 s9, s6, s0
-; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s0
+; GFX1064-NEXT:    s_mul_i32 s3, s6, s0
+; GFX1064-NEXT:    s_mul_hi_u32 s2, s6, s0
 ; GFX1064-NEXT:    s_mul_hi_u32 s10, s7, s0
 ; GFX1064-NEXT:    s_mul_i32 s0, s7, s0
 ; GFX1064-NEXT:    s_mul_hi_u32 s11, s6, s1
 ; GFX1064-NEXT:    s_mul_hi_u32 s12, s7, s1
 ; GFX1064-NEXT:    s_mul_i32 s1, s7, s1
-; GFX1064-NEXT:    s_add_u32 s9, s11, s9
-; GFX1064-NEXT:    s_addc_u32 s8, 0, s8
-; GFX1064-NEXT:    s_add_u32 s1, s9, s1
-; GFX1064-NEXT:    s_addc_u32 s1, s8, s12
-; GFX1064-NEXT:    s_addc_u32 s8, s10, 0
+; GFX1064-NEXT:    s_add_u32 s3, s11, s3
+; GFX1064-NEXT:    s_addc_u32 s2, 0, s2
+; GFX1064-NEXT:    s_add_u32 s1, s3, s1
+; GFX1064-NEXT:    s_addc_u32 s1, s2, s12
+; GFX1064-NEXT:    s_addc_u32 s2, s10, 0
 ; GFX1064-NEXT:    s_add_u32 s10, s1, s0
-; GFX1064-NEXT:    s_addc_u32 s11, 0, s8
+; GFX1064-NEXT:    s_addc_u32 s11, 0, s2
 ; GFX1064-NEXT:    s_mul_hi_u32 s0, s4, s10
 ; GFX1064-NEXT:    s_mul_i32 s1, s4, s11
-; GFX1064-NEXT:    s_mul_i32 s9, s4, s10
+; GFX1064-NEXT:    s_mul_i32 s3, s4, s10
 ; GFX1064-NEXT:    s_add_i32 s12, s0, s1
-; GFX1064-NEXT:    v_sub_co_u32 v0, s[0:1], s6, s9
-; GFX1064-NEXT:    s_mul_i32 s8, s5, s10
-; GFX1064-NEXT:    s_add_i32 s12, s12, s8
-; GFX1064-NEXT:    v_sub_co_u32 v1, s[8:9], v0, s4
+; GFX1064-NEXT:    v_sub_co_u32 v0, s[0:1], s6, s3
+; GFX1064-NEXT:    s_mul_i32 s2, s5, s10
+; GFX1064-NEXT:    s_add_i32 s12, s12, s2
+; GFX1064-NEXT:    v_sub_co_u32 v1, s[2:3], v0, s4
 ; GFX1064-NEXT:    s_sub_i32 s13, s7, s12
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_subb_u32 s13, s13, s5
-; GFX1064-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GFX1064-NEXT:    s_subb_u32 s8, s13, 0
-; GFX1064-NEXT:    s_cmp_ge_u32 s8, s5
+; GFX1064-NEXT:    s_subb_u32 s2, s13, 0
+; GFX1064-NEXT:    s_cmp_ge_u32 s2, s5
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX1064-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT:    s_cmp_eq_u32 s8, s5
+; GFX1064-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1064-NEXT:    s_cmp_eq_u32 s2, s5
 ; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX1064-NEXT:    s_add_u32 s8, s10, 1
-; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
-; GFX1064-NEXT:    s_addc_u32 s9, s11, 0
+; GFX1064-NEXT:    s_add_u32 s2, s10, 1
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT:    s_addc_u32 s3, s11, 0
 ; GFX1064-NEXT:    s_add_u32 s13, s10, 2
 ; GFX1064-NEXT:    s_addc_u32 s14, s11, 0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
@@ -1013,8 +1013,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
 ; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s11, v1, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s10, v2, vcc
@@ -1031,21 +1031,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_add_i32 s0, s0, s1
 ; GFX1064-NEXT:    s_mul_hi_u32 s0, s6, s0
 ; GFX1064-NEXT:    s_mul_i32 s1, s0, s4
-; GFX1064-NEXT:    s_add_i32 s5, s0, 1
+; GFX1064-NEXT:    s_add_i32 s2, s0, 1
 ; GFX1064-NEXT:    s_sub_i32 s1, s6, s1
-; GFX1064-NEXT:    s_sub_i32 s6, s1, s4
+; GFX1064-NEXT:    s_sub_i32 s3, s1, s4
 ; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
-; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
-; GFX1064-NEXT:    s_cselect_b32 s1, s6, s1
-; GFX1064-NEXT:    s_add_i32 s5, s0, 1
+; GFX1064-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX1064-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX1064-NEXT:    s_add_i32 s2, s0, 1
 ; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
 ; GFX1064-NEXT:    s_mov_b32 s1, 0
-; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX1064-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB15_3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
 ; GFX1064-NEXT:    s_endpgm
 ; GFX1064-NEXT:  .LBB15_4:
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1063,30 +1063,30 @@ bb:
 define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX1032-LABEL: test_div_scale_f32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1032-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1032-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_div_scale_f32 v1, s2, v2, v2, v1
-; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT:    v_div_scale_f32 v1, s0, v2, v2, v1
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_div_scale_f32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1064-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1064-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_div_scale_f32 v1, s[2:3], v2, v2, v1
-; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], v2, v2, v1
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1263,9 +1263,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
 ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x34
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
 ; GFX1032-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1032-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1274,7 +1275,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX1032-NEXT:  ; %bb.1: ; %bb
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX1032-NEXT:    global_load_dword v0, v0, s[8:9] glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_b32 vcc_lo, vcc_lo, exec_lo
@@ -1343,40 +1344,40 @@ exit:
 define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 ; GFX1032-LABEL: fdiv_f32:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1032-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fdiv_f32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v0, s[0:1], s7, s7, s6
 ; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s6, s7, s6
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1064-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %fdiv = fdiv float %a, %b
@@ -1703,30 +1704,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
 ; GFX1032-LABEL: test_set_inactive_64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_set_inactive_64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
   store i64 %tmp, ptr addrspace(1) %out
@@ -2137,23 +2138,23 @@ main_body:
 define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -2194,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
 define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
@@ -2353,42 +2354,42 @@ define amdgpu_ps float @test_ps_live() #0 {
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_neq_f64_e64 s4, s[2:3], 1.0
-; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s4
+; GFX1032-NEXT:    v_cmp_neq_f64_e64 s2, s[0:1], 1.0
+; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB47_2
 ; GFX1032-NEXT:  ; %bb.1: ; %if
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[0:1], s[0:1]
 ; GFX1032-NEXT:    s_branch .LBB47_3
 ; GFX1032-NEXT:  .LBB47_2:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB47_3: ; %endif
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
-; GFX1064-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX1064-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0
+; GFX1064-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_vccnz .LBB47_2
 ; GFX1064-NEXT:  ; %bb.1: ; %if
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[0:1], s[0:1]
 ; GFX1064-NEXT:    s_branch .LBB47_3
 ; GFX1064-NEXT:  .LBB47_2:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB47_3: ; %endif
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %v = load double, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 13b37b40ee95c..46254994580d2 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -152,4 +152,90 @@ end:
   ret float %r
 }
 
+; Two chains of phi network that have the same value from %if block.
+define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 %z, ptr addrspace(1) %p) #0 {
+; GCN-LABEL: while_break_two_chains_of_phi:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_branch .LBB2_2
+; GCN-NEXT:  .LBB2_1: ; %Flow1
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GCN-NEXT:    s_and_b32 s1, exec_lo, s4
+; GCN-NEXT:    s_or_b32 s2, s1, s2
+; GCN-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GCN-NEXT:    s_cbranch_execz .LBB2_6
+; GCN-NEXT:  .LBB2_2: ; %header
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_cmp_ge_i32_e64 s3, s0, v1
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v1
+; GCN-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GCN-NEXT:    s_cbranch_execz .LBB2_4
+; GCN-NEXT:  ; %bb.3: ; %if
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[0:1], 2
+; GCN-NEXT:    s_andn2_b32 s1, s3, exec_lo
+; GCN-NEXT:    v_add_co_u32 v6, vcc_lo, v4, s6
+; GCN-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s7, v5, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v2
+; GCN-NEXT:    global_load_dword v0, v[6:7], off
+; GCN-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
+; GCN-NEXT:    s_or_b32 s3, s1, s3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-NEXT:  .LBB2_4: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    s_mov_b32 s4, -1
+; GCN-NEXT:    s_and_saveexec_b32 s1, s3
+; GCN-NEXT:    s_cbranch_execz .LBB2_1
+; GCN-NEXT:  ; %bb.5: ; %latch
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v3
+; GCN-NEXT:    v_mov_b32_e32 v7, v0
+; GCN-NEXT:    s_add_i32 s0, s0, 1
+; GCN-NEXT:    s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT:    s_branch .LBB2_1
+; GCN-NEXT:  .LBB2_6: ; %end
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, v7
+; GCN-NEXT:    v_mov_b32_e32 v1, v6
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  br label %header
+
+header:
+  %v.1 = phi float [ %v, %entry ], [ %v.2, %latch ]
+  %v.copy = phi float [ 0.0, %entry ], [ %v.copy.2, %latch ]
+  %ind = phi i32 [ 0, %entry], [ %ind.inc, %latch ]
+  %cc = icmp slt i32 %ind, %x
+  br i1 %cc, label %if, label %latch
+
+if:
+  %v.ptr = getelementptr float, ptr addrspace(1) %p, i32 %ind
+  %v.load = load float, ptr addrspace(1) %v.ptr
+  %v.if = fadd float %v.load, 1.0
+  %cc2 = icmp slt i32 %ind, %y
+  br i1 %cc2, label %latch, label %end
+
+latch:
+  %v.2 = phi float [ %v.1, %header ], [ %v.if, %if ]
+  %v.copy.2 = phi float [ %v.copy, %header ], [ %v.if, %if ]
+  %ind.inc = add i32 %ind, 1
+  %cc3 = icmp slt i32 %ind, %z
+  br i1 %cc3, label %end, label %header
+
+end:
+  %r = phi float [ %v.2, %latch ], [ %v.if, %if ]
+  %r2 = phi float [ %v.copy.2, %latch ], [ %v.if, %if ]
+  %packed0 = insertelement < 2 x float > poison, float %r, i32 0
+  %packed1 = insertelement < 2 x float > %packed0, float %r2, i32 1
+  ret < 2 x float> %packed1
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index ab1121a705529..645f4981ba317 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index 829d77018ff96..ad1d66e448b24 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 161d222d10ff7..c52b079aec2f7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 511a116a78be5..ebbb3d1fe7dea 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 5fde11cb4b1b1..0f8df5a6e3d2d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 277db33e940dd..4073964e2b038 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
 
 # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
 #  $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 801bb06aacb63..2efbd3277f209 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 12b65986cf1c7..ac218a2b19c67 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index b43969dd4f5c0..9ec2d8f33edaa 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 11003c4c9edfd..6b4c2da772cdc 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
 
 ; Check that WQM isn't triggered by image load/store intrinsics.
 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
@@ -3538,6 +3538,155 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
   ret void
 }
 
+; Check that exact regions with execz affected instructions are as short as possible
+define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
+; GFX9-W64-LABEL: short_exact_regions:
+; GFX9-W64:       ; %bb.0: ; %main_body
+; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
+; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-W64-NEXT:    s_cbranch_execz .LBB59_2
+; GFX9-W64-NEXT:  ; %bb.1: ; %if
+; GFX9-W64-NEXT:    global_load_dword v0, v[1:2], off
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX9-W64-NEXT:    s_buffer_load_dword s16, s[8:11], s16 offset:0x0
+; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
+; GFX9-W64-NEXT:    buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX9-W64-NEXT:  .LBB59_2: ; %endif
+; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT:    ; return to shader part epilog
+;
+; GFX10-W32-LABEL: short_exact_regions:
+; GFX10-W32:       ; %bb.0: ; %main_body
+; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
+; GFX10-W32-NEXT:    v_cmpx_gt_u32_e32 16, v0
+; GFX10-W32-NEXT:    s_cbranch_execz .LBB59_2
+; GFX10-W32-NEXT:  ; %bb.1: ; %if
+; GFX10-W32-NEXT:    global_load_dword v0, v[1:2], off
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX10-W32-NEXT:    s_buffer_load_dword s14, s[8:11], s14 offset:0x0
+; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s14
+; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
+; GFX10-W32-NEXT:    buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
+; GFX10-W32-NEXT:  .LBB59_2: ; %endif
+; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT:    ; return to shader part epilog
+main_body:
+  %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 16
+  br i1 %cc, label %endif, label %if
+
+if:
+  %idx1 = extractelement <4 x i32> %idx0, i64 0
+  %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
+  %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
+
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
+  br label %endif
+
+endif:
+  %d = extractelement <4 x float> %tex1, i64 0
+  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %r0 = extractelement <4 x float> %tex1, i64 1
+  %r1 = extractelement <4 x float> %tex2, i64 2
+  %r2 = fadd float %r0, %r1
+  %out = call float @llvm.amdgcn.wqm.f32(float %r2)
+
+  ret float %out
+}
+
+; Check that exact regions shortening doesn't prevent early WQM exit
+define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
+; GFX9-W64-LABEL: short_exact_regions_2:
+; GFX9-W64:       ; %bb.0: ; %main_body
+; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT:    global_load_dword v0, v[1:2], off
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-W64-NEXT:    image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
+; GFX9-W64-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-W64-NEXT:    ; kill: killed $vgpr3
+; GFX9-W64-NEXT:    ; kill: killed $vgpr1 killed $vgpr2
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-W64-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[8:11], s0 offset:0x0
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_add_f32_e32 v0, v4, v5
+; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX9-W64-NEXT:    ; return to shader part epilog
+;
+; GFX10-W32-LABEL: short_exact_regions_2:
+; GFX10-W32:       ; %bb.0: ; %main_body
+; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT:    global_load_dword v0, v[1:2], off
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-W32-NEXT:    image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-W32-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_add_f32_e32 v0, v4, v1
+; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[8:11], s0 offset:0x0
+; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX10-W32-NEXT:    ; return to shader part epilog
+main_body:
+  %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %idx1 = extractelement <4 x i32> %idx0, i64 0
+  %d = extractelement <4 x float> %tex1, i64 0
+
+  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+
+  %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
+  %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
+
+  %r0 = extractelement <4 x float> %tex1, i64 1
+  %r1 = extractelement <4 x float> %tex2, i64 2
+  %r2 = fadd float %r0, %r1
+  %out = fadd float %r2, %idx3
+
+  ret float %out
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
 
@@ -3577,6 +3726,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f9137b075e462..af50e09f509a3 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
 
 ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 121e5f3beec6f..4a08f49b8da2f 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -30,7 +30,6 @@
 ; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
-; CHECK-NEXT:      Lower constant intrinsics
 ; CHECK-NEXT:      Remove unreachable blocks from the CFG
 ; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/block-order.mir b/llvm/test/CodeGen/ARM/block-order.mir
new file mode 100644
index 0000000000000..ecc749382f1f7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/block-order.mir
@@ -0,0 +1,529 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -o - %s -mtriple=thumbv7em-arm-none-eabihf -run-pass=block-placement -verify-machineinstrs | FileCheck %s
+# RUN: llc -o - %s -mtriple=thumbv7em-arm-none-eabihf -run-pass=block-placement -force-loop-cold-block -verify-machineinstrs | FileCheck %s
+---
+name:            fn
+tracksRegLiveness: true
+jumpTable:
+  kind:            inline
+  entries:
+    - id:              0
+      blocks:          [ '%bb.7', '%bb.16', '%bb.25', '%bb.32' ]
+body:             |
+  ; CHECK-LABEL: name: fn
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x5c0b8170), %bb.2(0x23f47e90)
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr
+  ; CHECK-NEXT:   $sp = frame-setup tSUBspi $sp, 3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r5 = tMOVr $r1, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r1 = nsw t2SUBri killed $r1, 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r7 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r8 = tMOVr $r2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r1 = t2ADDrs killed renamable $r1, killed renamable $r7, 235, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r7 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   $r9 = tMOVr killed $r3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r4 = nuw nsw t2ADDrs killed renamable $r7, killed renamable $r1, 25, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r1 = nuw nsw t2MUL renamable $r4, killed $r2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2CMPri renamable $r5, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.2, 10 /* CC::ge */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   $sp = frame-destroy tADDspi $sp, 3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT:   liveins: $r1, $r4, $r5, $r8, $r9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r0 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   $lr = tMOVr killed $r5, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r10 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r2 = t2ADDrs renamable $r1, killed renamable $r0, 235, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r0 = nuw nsw t2LSLri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r2 = t2BICri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r1 = nsw t2MUL renamable $r0, killed renamable $r1, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x04000000), %bb.4(0x7c000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   t2CMPrr renamable $r10, renamable $lr, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.1, 0 /* CC::eq */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2LDRi12 $sp, 4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r7 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r4 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r1 = t2MUL renamable $r10, killed renamable $r1, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r2 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 219, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r7 = t2LDRi12 $sp, 0, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2B %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.29:
+  ; CHECK-NEXT:   successors: %bb.30(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.30:
+  ; CHECK-NEXT:   successors: %bb.32(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.32:
+  ; CHECK-NEXT:   successors: %bb.7(0x7c000000), %bb.3(0x04000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r10, $r2, $r4, $r7, $r8, $r9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   t2CMPrr renamable $r4, renamable $r8, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.7, 11 /* CC::lt */, killed $cpsr
+  ; CHECK-NEXT:   t2B %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x7c000000), %bb.3(0x04000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r2, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = t2MLA renamable $r1, renamable $r0, killed renamable $r7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2CMPrr renamable $r4, renamable $r8, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.3, 10 /* CC::ge */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.5(0x30000000), %bb.8(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r2, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r3, renamable $r2 = t2LDR_POST killed renamable $r2, 4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2CMPri renamable $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.5, 0 /* CC::eq */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.32(0x19999998), %bb.9(0x66666668)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2RORri killed renamable $r1, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   t2CMPri renamable $r1, 3, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 8 /* CC::hi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.10(0x20000000), %bb.17(0x20000000), %bb.24(0x20000000), %bb.30(0x20000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r12 = t2LEApcrelJT %jump-table.0, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $r5 = t2ADDrs killed renamable $r12, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   t2BR_JT killed renamable $r5, killed renamable $r1, %jump-table.0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.11(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 31, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.12, 0 /* CC::eq */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11:
+  ; CHECK-NEXT:   successors: %bb.12(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r5 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12:
+  ; CHECK-NEXT:   successors: %bb.32(0x30000000), %bb.13(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 4 /* CC::mi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13:
+  ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.14(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 30, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.15, 5 /* CC::pl */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14:
+  ; CHECK-NEXT:   successors: %bb.15(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.15:
+  ; CHECK-NEXT:   successors: %bb.32(0x30000000), %bb.17(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 4 /* CC::mi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.17:
+  ; CHECK-NEXT:   successors: %bb.19(0x40000000), %bb.18(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 23, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.19, 5 /* CC::pl */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.18:
+  ; CHECK-NEXT:   successors: %bb.19(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r6 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.19:
+  ; CHECK-NEXT:   successors: %bb.32(0x30000000), %bb.20(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 4 /* CC::mi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.20:
+  ; CHECK-NEXT:   successors: %bb.22(0x40000000), %bb.21(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 22, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.22, 5 /* CC::pl */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.21:
+  ; CHECK-NEXT:   successors: %bb.22(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.22:
+  ; CHECK-NEXT:   successors: %bb.32(0x30000000), %bb.24(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 4 /* CC::mi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.24:
+  ; CHECK-NEXT:   successors: %bb.26(0x40000000), %bb.25(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 15, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.26, 5 /* CC::pl */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.25:
+  ; CHECK-NEXT:   successors: %bb.26(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r6 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.26:
+  ; CHECK-NEXT:   successors: %bb.32(0x30000000), %bb.27(0x50000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.32, 4 /* CC::mi */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.27:
+  ; CHECK-NEXT:   successors: %bb.29(0x40000000), %bb.28(0x40000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead renamable $r1 = t2LSLri renamable $r3, 14, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.29, 5 /* CC::pl */, killed $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.28:
+  ; CHECK-NEXT:   successors: %bb.29(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   t2B %bb.29, 14 /* CC::al */, $noreg
+  bb.0:
+    successors: %bb.37(0x80000000), %bb.1(0x32000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr
+
+    $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr
+    $sp = frame-setup tSUBspi $sp, 3, 14 /* CC::al */, $noreg
+    $r5 = tMOVr $r1, 14 /* CC::al */, $noreg
+    renamable $r1 = nsw t2SUBri killed $r1, 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $r7 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+    $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
+    $r8 = tMOVr $r2, 14 /* CC::al */, $noreg
+    renamable $r1 = t2ADDrs killed renamable $r1, killed renamable $r7, 235, 14 /* CC::al */, $noreg, $noreg
+    renamable $r7 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    $r9 = tMOVr killed $r3, 14 /* CC::al */, $noreg
+    renamable $r4 = nuw nsw t2ADDrs killed renamable $r7, killed renamable $r1, 25, 14 /* CC::al */, $noreg, $noreg
+    renamable $r1 = nuw nsw t2MUL renamable $r4, killed $r2, 14 /* CC::al */, $noreg
+    t2CMPri renamable $r5, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.37, 11 /* CC::lt */, killed $cpsr
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $r1, $r4, $r5, $r8, $r9
+
+    renamable $r0 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+    $lr = tMOVr killed $r5, 14 /* CC::al */, $noreg
+    renamable $r10 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    renamable $r2 = t2ADDrs renamable $r1, killed renamable $r0, 235, 14 /* CC::al */, $noreg, $noreg
+    renamable $r0 = nuw nsw t2LSLri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r2 = t2BICri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg
+    renamable $r1 = nsw t2MUL renamable $r0, killed renamable $r1, 14 /* CC::al */, $noreg
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+    liveins: $lr, $r0, $r8, $r9, $r10
+
+    renamable $r1 = t2LDRi12 $sp, 4, 14 /* CC::al */, $noreg
+    renamable $r7 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg
+    renamable $r4 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    renamable $r1 = t2MUL renamable $r10, killed renamable $r1, 14 /* CC::al */, $noreg
+    renamable $r2 = t2ASRri renamable $r1, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 219, 14 /* CC::al */, $noreg, $noreg
+    renamable $r7 = t2LDRi12 $sp, 0, 14 /* CC::al */, $noreg
+
+  bb.3:
+    successors: %bb.34(0x30000000), %bb.4(0x50000000)
+    liveins: $lr, $r0, $r1, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r3, renamable $r2 = t2LDR_POST killed renamable $r2, 4, 14 /* CC::al */, $noreg
+    t2CMPri renamable $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.34, 0 /* CC::eq */, killed $cpsr
+
+  bb.4:
+    successors: %bb.5(0x19999998), %bb.6(0x66666668)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2RORri killed renamable $r1, 3, 14 /* CC::al */, $noreg, $noreg
+    t2CMPri renamable $r1, 3, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.6, 9 /* CC::ls */, killed $cpsr
+
+  bb.5:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.6:
+    successors: %bb.7(0x20000000), %bb.16(0x20000000), %bb.25(0x20000000), %bb.32(0x20000000)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r12 = t2LEApcrelJT %jump-table.0, 14 /* CC::al */, $noreg
+    renamable $r5 = t2ADDrs killed renamable $r12, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
+    t2BR_JT killed renamable $r5, killed renamable $r1, %jump-table.0
+
+  bb.7:
+    successors: %bb.9(0x40000000), %bb.8(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 31, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr
+
+  bb.8:
+    successors: %bb.9(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r5 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.9:
+    successors: %bb.10(0x30000000), %bb.11(0x50000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.11, 5 /* CC::pl */, killed $cpsr
+
+  bb.10:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.11:
+    successors: %bb.13(0x40000000), %bb.12(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 30, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.13, 5 /* CC::pl */, killed $cpsr
+
+  bb.12:
+    successors: %bb.13(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.13:
+    successors: %bb.14(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+
+  bb.14:
+    successors: %bb.15(0x30000000), %bb.16(0x50000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.16, 5 /* CC::pl */, killed $cpsr
+
+  bb.15:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.16:
+    successors: %bb.18(0x40000000), %bb.17(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 23, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.18, 5 /* CC::pl */, killed $cpsr
+
+  bb.17:
+    successors: %bb.18(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r6 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.18:
+    successors: %bb.19(0x30000000), %bb.20(0x50000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.20, 5 /* CC::pl */, killed $cpsr
+
+  bb.19:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.20:
+    successors: %bb.22(0x40000000), %bb.21(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 22, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.22, 5 /* CC::pl */, killed $cpsr
+
+  bb.21:
+    successors: %bb.22(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.22:
+    successors: %bb.23(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+
+  bb.23:
+    successors: %bb.24(0x30000000), %bb.25(0x50000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.25, 5 /* CC::pl */, killed $cpsr
+
+  bb.24:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.25:
+    successors: %bb.27(0x40000000), %bb.26(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 15, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.27, 5 /* CC::pl */, killed $cpsr
+
+  bb.26:
+    successors: %bb.27(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r6 = t2ASRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.27:
+    successors: %bb.28(0x30000000), %bb.29(0x50000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.29, 5 /* CC::pl */, killed $cpsr
+
+  bb.28:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.29:
+    successors: %bb.31(0x40000000), %bb.30(0x40000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    dead renamable $r1 = t2LSLri renamable $r3, 14, 14 /* CC::al */, $noreg, def $cpsr
+    t2Bcc %bb.31, 5 /* CC::pl */, killed $cpsr
+
+  bb.30:
+    successors: %bb.31(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r1 = t2ANDri renamable $r7, 31, 14 /* CC::al */, $noreg, $noreg
+    renamable $r6 = t2LSRri renamable $r7, 5, 14 /* CC::al */, $noreg, $noreg
+
+  bb.31:
+    successors: %bb.32(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+
+  bb.32:
+    successors: %bb.33(0x80000000)
+    liveins: $lr, $r0, $r2, $r3, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, def $cpsr
+
+  bb.33:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = nsw t2SUBrr killed renamable $r7, renamable $r0, 14 /* CC::al */, $noreg, $noreg
+    renamable $r1 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.35, 14 /* CC::al */, $noreg
+
+  bb.34:
+    successors: %bb.35(0x80000000)
+    liveins: $lr, $r0, $r1, $r2, $r4, $r7, $r8, $r9, $r10
+
+    renamable $r7 = t2MLA renamable $r1, renamable $r0, killed renamable $r7, 14 /* CC::al */, $noreg
+
+  bb.35:
+    successors: %bb.3(0x7c000000), %bb.36(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r4, $r7, $r8, $r9, $r10
+
+    t2CMPrr renamable $r4, renamable $r8, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.3, 11 /* CC::lt */, killed $cpsr
+
+  bb.36:
+    successors: %bb.37(0x04000000), %bb.2(0x7c000000)
+    liveins: $lr, $r0, $r8, $r9, $r10
+
+    t2CMPrr renamable $r10, renamable $lr, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 1 /* CC::ne */, killed $cpsr
+
+  bb.37:
+    $sp = frame-destroy tADDspi $sp, 3, 14 /* CC::al */, $noreg
+    $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
+
+...
diff --git a/llvm/test/CodeGen/ARM/neon_vabd.ll b/llvm/test/CodeGen/ARM/neon_vabd.ll
index 8695c3e5f3db9..cdfc48468e044 100644
--- a/llvm/test/CodeGen/ARM/neon_vabd.ll
+++ b/llvm/test/CodeGen/ARM/neon_vabd.ll
@@ -340,20 +340,20 @@ define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    sbcs r2, r3, r12
 ; CHECK-NEXT:    sbcs r3, r1, #0
 ; CHECK-NEXT:    sbc r3, r1, #0
-; CHECK-NEXT:    eor r0, r0, r3, asr #31
-; CHECK-NEXT:    eor r2, r2, r3, asr #31
-; CHECK-NEXT:    subs r0, r0, r3, asr #31
-; CHECK-NEXT:    sbc r2, r2, r3, asr #31
+; CHECK-NEXT:    eor r0, r0, r3
+; CHECK-NEXT:    eor r2, r2, r3
+; CHECK-NEXT:    subs r0, r0, r3
+; CHECK-NEXT:    sbc r2, r2, r3
 ; CHECK-NEXT:    subs r3, r4, lr
 ; CHECK-NEXT:    sbcs r6, r5, r6
 ; CHECK-NEXT:    vmov.32 d1[0], r0
 ; CHECK-NEXT:    sbcs r5, r1, #0
 ; CHECK-NEXT:    sbc r1, r1, #0
-; CHECK-NEXT:    eor r3, r3, r1, asr #31
-; CHECK-NEXT:    subs r0, r3, r1, asr #31
+; CHECK-NEXT:    eor r3, r3, r1
+; CHECK-NEXT:    subs r0, r3, r1
 ; CHECK-NEXT:    vmov.32 d0[0], r0
-; CHECK-NEXT:    eor r0, r6, r1, asr #31
-; CHECK-NEXT:    sbc r0, r0, r1, asr #31
+; CHECK-NEXT:    eor r0, r6, r1
+; CHECK-NEXT:    sbc r0, r0, r1
 ; CHECK-NEXT:    vmov.32 d1[1], r2
 ; CHECK-NEXT:    vmov.32 d0[1], r0
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index 37f511fcc68cc..9f0edb7117bd1 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -1,9 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes='print<cost-model>' -mtriple=arm-apple-ios6.0.0 -mcpu=cortex-a8 2>&1 -disable-output | FileCheck %s --check-prefix=COST
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 ; Make sure that ARM backend with NEON handles vselect.
 
 define void @vmax_v4i32(ptr %m, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: vmax.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vmax_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r1, sp, #8
+; CHECK-NEXT:    vldr d17, [sp]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmax.s32 q8, q8, q9
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
     %cmpres = icmp sgt <4 x i32> %a, %b
     %maxres = select <4 x i1> %cmpres, <4 x i32> %a,  <4 x i32> %b
     store <4 x i32> %maxres, ptr %m
@@ -12,51 +21,84 @@ define void @vmax_v4i32(ptr %m, <4 x i32> %a, <4 x i32> %b) {
 
 %T0_10 = type <16 x i16>
 %T1_10 = type <16 x i1>
+define void @func_blend10(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend10:
-define void @func_blend10(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
-  %v0 = load %T0_10, ptr %loadaddr
-  %v1 = load %T0_10, ptr %loadaddr2
-  %c = icmp slt %T0_10 %v0, %v1
-; CHECK: vmin.s16
-; CHECK: vmin.s16
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16, d17}, [r1:128]!
+; CHECK-NEXT:    vld1.16 {d18, d19}, [r0:128]!
+; CHECK-NEXT:    vmin.s16 q8, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r1:128]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT:    vmin.s16 q9, q9, q10
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r3:128]!
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT:    mov pc, lr
 ; COST: func_blend10
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 4 {{.*}} select
+
+  %v0 = load %T0_10, ptr %loadaddr
+  %v1 = load %T0_10, ptr %loadaddr2
+  %c = icmp slt %T0_10 %v0, %v1
   %r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
   store %T0_10 %r, ptr %storeaddr
   ret void
 }
+
 %T0_14 = type <8 x i32>
 %T1_14 = type <8 x i1>
+define void @func_blend14(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend14:
-define void @func_blend14(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
-  %v0 = load %T0_14, ptr %loadaddr
-  %v1 = load %T0_14, ptr %loadaddr2
-  %c = icmp slt %T0_14 %v0, %v1
-; CHECK: vmin.s32
-; CHECK: vmin.s32
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16, d17}, [r1:128]!
+; CHECK-NEXT:    vld1.32 {d18, d19}, [r0:128]!
+; CHECK-NEXT:    vmin.s32 q8, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r1:128]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT:    vmin.s32 q9, q9, q10
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r3:128]!
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT:    mov pc, lr
 ; COST: func_blend14
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 4 {{.*}} select
+  %v0 = load %T0_14, ptr %loadaddr
+  %v1 = load %T0_14, ptr %loadaddr2
+  %c = icmp slt %T0_14 %v0, %v1
   %r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
   store %T0_14 %r, ptr %storeaddr
   ret void
 }
+
 %T0_15 = type <16 x i32>
 %T1_15 = type <16 x i1>
+define void @func_blend15(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend15:
-define void @func_blend15(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
-; CHECK: vmin.s32
-; CHECK: vmin.s32
-  %v0 = load %T0_15, ptr %loadaddr
-  %v1 = load %T0_15, ptr %loadaddr2
-  %c = icmp slt %T0_15 %v0, %v1
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16, d17}, [r1:128]!
+; CHECK-NEXT:    vld1.32 {d18, d19}, [r0:128]!
+; CHECK-NEXT:    vmin.s32 q8, q9, q8
+; CHECK-NEXT:    vld1.32 {d20, d21}, [r1:128]!
+; CHECK-NEXT:    vld1.32 {d22, d23}, [r0:128]!
+; CHECK-NEXT:    vmin.s32 q10, q11, q10
+; CHECK-NEXT:    vld1.32 {d24, d25}, [r1:128]!
+; CHECK-NEXT:    vld1.32 {d26, d27}, [r0:128]!
+; CHECK-NEXT:    vmin.s32 q12, q13, q12
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r0:128]
+; CHECK-NEXT:    vmin.s32 q9, q11, q9
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r3:128]!
+; CHECK-NEXT:    vst1.32 {d20, d21}, [r3:128]!
+; CHECK-NEXT:    vst1.32 {d24, d25}, [r3:128]!
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT:    mov pc, lr
 ; COST: func_blend15
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 8 {{.*}} select
+
+  %v0 = load %T0_15, ptr %loadaddr
+  %v1 = load %T0_15, ptr %loadaddr2
+  %c = icmp slt %T0_15 %v0, %v1
   %r = select %T1_15 %c, %T0_15 %v0, %T0_15 %v1
   store %T0_15 %r, ptr %storeaddr
   ret void
@@ -66,8 +108,7 @@ define void @func_blend15(ptr %loadaddr, ptr %loadaddr2,
 ; lowering we also need to adjust the cost.
 %T0_18 = type <4 x i64>
 %T1_18 = type <4 x i1>
-define void @func_blend18(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
+define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend18:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
@@ -118,20 +159,20 @@ define void @func_blend18(ptr %loadaddr, ptr %loadaddr2,
 ; CHECK-NEXT:    vst1.64 {d18, d19}, [r3:128]
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %v0 = load %T0_18, ptr %loadaddr
-  %v1 = load %T0_18, ptr %loadaddr2
-  %c = icmp slt %T0_18 %v0, %v1
 ; COST: func_blend18
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 21 {{.*}} select
+  %v0 = load %T0_18, ptr %loadaddr
+  %v1 = load %T0_18, ptr %loadaddr2
+  %c = icmp slt %T0_18 %v0, %v1
   %r = select %T1_18 %c, %T0_18 %v0, %T0_18 %v1
   store %T0_18 %r, ptr %storeaddr
   ret void
 }
+
 %T0_19 = type <8 x i64>
 %T1_19 = type <8 x i1>
-define void @func_blend19(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
+define void @func_blend19(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend19:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
@@ -226,20 +267,20 @@ define void @func_blend19(ptr %loadaddr, ptr %loadaddr2,
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r3:128]
 ; CHECK-NEXT:    pop {r4, r5, r6, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %v0 = load %T0_19, ptr %loadaddr
-  %v1 = load %T0_19, ptr %loadaddr2
-  %c = icmp slt %T0_19 %v0, %v1
 ; COST: func_blend19
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 54 {{.*}} select
+  %v0 = load %T0_19, ptr %loadaddr
+  %v1 = load %T0_19, ptr %loadaddr2
+  %c = icmp slt %T0_19 %v0, %v1
   %r = select %T1_19 %c, %T0_19 %v0, %T0_19 %v1
   store %T0_19 %r, ptr %storeaddr
   ret void
 }
+
 %T0_20 = type <16 x i64>
 %T1_20 = type <16 x i1>
-define void @func_blend20(ptr %loadaddr, ptr %loadaddr2,
-                           ptr %blend, ptr %storeaddr) {
+define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
 ; CHECK-LABEL: func_blend20:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
@@ -435,12 +476,12 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2,
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %v0 = load %T0_20, ptr %loadaddr
-  %v1 = load %T0_20, ptr %loadaddr2
-  %c = icmp slt %T0_20 %v0, %v1
 ; COST: func_blend20
 ; COST: cost of 0 {{.*}} icmp
 ; COST: cost of 108 {{.*}} select
+  %v0 = load %T0_20, ptr %loadaddr
+  %v1 = load %T0_20, ptr %loadaddr2
+  %c = icmp slt %T0_20 %v0, %v1
   %r = select %T1_20 %c, %T0_20 %v0, %T0_20 %v1
   store %T0_20 %r, ptr %storeaddr
   ret void
diff --git a/llvm/test/CodeGen/DirectX/UAVMetadata.ll b/llvm/test/CodeGen/DirectX/UAVMetadata.ll
index 0bc8a8cfcd713..bdad9fd40c9bd 100644
--- a/llvm/test/CodeGen/DirectX/UAVMetadata.ll
+++ b/llvm/test/CodeGen/DirectX/UAVMetadata.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s
-; RUN: opt -S --passes="print-dxil-resource" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
 ; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
 
 target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
diff --git a/llvm/test/CodeGen/DirectX/abs.ll b/llvm/test/CodeGen/DirectX/abs.ll
index 822580e8c089a..85090a51c55a4 100644
--- a/llvm/test/CodeGen/DirectX/abs.ll
+++ b/llvm/test/CodeGen/DirectX/abs.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S  -dxil-intrinsic-expansion  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure dxil operation function calls for abs are generated for int16_t/int/int64_t.
 
diff --git a/llvm/test/CodeGen/DirectX/acos.ll b/llvm/test/CodeGen/DirectX/acos.ll
index 31b08833f45a1..cc32182395627 100644
--- a/llvm/test/CodeGen/DirectX/acos.ll
+++ b/llvm/test/CodeGen/DirectX/acos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for acos are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/acos_error.ll b/llvm/test/CodeGen/DirectX/acos_error.ll
index e0474e9b758e7..4125709a57e7a 100644
--- a/llvm/test/CodeGen/DirectX/acos_error.ll
+++ b/llvm/test/CodeGen/DirectX/acos_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation acos does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/asin.ll b/llvm/test/CodeGen/DirectX/asin.ll
index 56c2d86be3547..06e3bab545a6a 100644
--- a/llvm/test/CodeGen/DirectX/asin.ll
+++ b/llvm/test/CodeGen/DirectX/asin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for asin are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/asin_error.ll b/llvm/test/CodeGen/DirectX/asin_error.ll
index ddd4b2e424f62..de63b0d6be027 100644
--- a/llvm/test/CodeGen/DirectX/asin_error.ll
+++ b/llvm/test/CodeGen/DirectX/asin_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation asin does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/atan.ll b/llvm/test/CodeGen/DirectX/atan.ll
index 7aa4418a59813..d7c4cd00e286a 100644
--- a/llvm/test/CodeGen/DirectX/atan.ll
+++ b/llvm/test/CodeGen/DirectX/atan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for atan are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/atan_error.ll b/llvm/test/CodeGen/DirectX/atan_error.ll
index 1880b1d38ba3c..c320868ef4e57 100644
--- a/llvm/test/CodeGen/DirectX/atan_error.ll
+++ b/llvm/test/CodeGen/DirectX/atan_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation atan does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/cbuf.ll b/llvm/test/CodeGen/DirectX/cbuf.ll
index d07cc1e880b1a..38f08fad995d1 100644
--- a/llvm/test/CodeGen/DirectX/cbuf.ll
+++ b/llvm/test/CodeGen/DirectX/cbuf.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD
-; RUN: opt -S --passes="print-dxil-resource" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
 
 target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
 target triple = "dxil-unknown-shadermodel6.7-library"
diff --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll
index 1585471467801..48bc5495a8e05 100644
--- a/llvm/test/CodeGen/DirectX/ceil.ll
+++ b/llvm/test/CodeGen/DirectX/ceil.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for ceil are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/ceil_error.ll b/llvm/test/CodeGen/DirectX/ceil_error.ll
index 1b554d8715566..da6f083550186 100644
--- a/llvm/test/CodeGen/DirectX/ceil_error.ll
+++ b/llvm/test/CodeGen/DirectX/ceil_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation ceil does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/clamp.ll b/llvm/test/CodeGen/DirectX/clamp.ll
index f122313b8d7dc..2f29e4479f9ca 100644
--- a/llvm/test/CodeGen/DirectX/clamp.ll
+++ b/llvm/test/CodeGen/DirectX/clamp.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for clamp/uclamp are generated for half/float/double/i16/i32/i64.
 
diff --git a/llvm/test/CodeGen/DirectX/comput_ids.ll b/llvm/test/CodeGen/DirectX/comput_ids.ll
index 553994094d71e..976b3ea5c6ecd 100644
--- a/llvm/test/CodeGen/DirectX/comput_ids.ll
+++ b/llvm/test/CodeGen/DirectX/comput_ids.ll
@@ -1,9 +1,9 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower  %s | FileCheck %s
 
 ; Make sure dxil operation function calls for all ComputeID dxil operations are generated.
 
 target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
+target triple = "dxil-pc-shadermodel6.7-compute"
 
 ; CHECK-LABEL: @test_thread_id(
 ; Function Attrs: noinline nounwind optnone
diff --git a/llvm/test/CodeGen/DirectX/cos.ll b/llvm/test/CodeGen/DirectX/cos.ll
index 00f2e2c3f6e5a..72f4bfca23f9d 100644
--- a/llvm/test/CodeGen/DirectX/cos.ll
+++ b/llvm/test/CodeGen/DirectX/cos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cos are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/cos_error.ll b/llvm/test/CodeGen/DirectX/cos_error.ll
index a074f5b493dfd..6bb85a7cec1e3 100644
--- a/llvm/test/CodeGen/DirectX/cos_error.ll
+++ b/llvm/test/CodeGen/DirectX/cos_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation cos does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/cosh.ll b/llvm/test/CodeGen/DirectX/cosh.ll
index 4fe22f0a38ce1..91aaf893f3997 100644
--- a/llvm/test/CodeGen/DirectX/cosh.ll
+++ b/llvm/test/CodeGen/DirectX/cosh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cosh are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/cosh_error.ll b/llvm/test/CodeGen/DirectX/cosh_error.ll
index cf66c54db1a08..4c5f0c7146ab5 100644
--- a/llvm/test/CodeGen/DirectX/cosh_error.ll
+++ b/llvm/test/CodeGen/DirectX/cosh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation cosh does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot2_error.ll b/llvm/test/CodeGen/DirectX/dot2_error.ll
index a27bfaedacd57..54780d18e71fb 100644
--- a/llvm/test/CodeGen/DirectX/dot2_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation dot2 does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot3_error.ll b/llvm/test/CodeGen/DirectX/dot3_error.ll
index eb69fb145038a..242716b0b71ba 100644
--- a/llvm/test/CodeGen/DirectX/dot3_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot3_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation dot3 does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot4_error.ll b/llvm/test/CodeGen/DirectX/dot4_error.ll
index 5cd632684c0c0..731adda153def 100644
--- a/llvm/test/CodeGen/DirectX/dot4_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot4_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation dot4 does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/exp.ll b/llvm/test/CodeGen/DirectX/exp.ll
index fdafc1438cf0e..f67e2744c4ee3 100644
--- a/llvm/test/CodeGen/DirectX/exp.ll
+++ b/llvm/test/CodeGen/DirectX/exp.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for exp are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/exp2_error.ll b/llvm/test/CodeGen/DirectX/exp2_error.ll
index 6b9126785fd4b..4d13f936eb6be 100644
--- a/llvm/test/CodeGen/DirectX/exp2_error.ll
+++ b/llvm/test/CodeGen/DirectX/exp2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation exp2 does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/fabs.ll b/llvm/test/CodeGen/DirectX/fabs.ll
index 3b3f8aa9a4a92..becbdf8d68aeb 100644
--- a/llvm/test/CodeGen/DirectX/fabs.ll
+++ b/llvm/test/CodeGen/DirectX/fabs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for abs are generated for float, half, and double.
 
diff --git a/llvm/test/CodeGen/DirectX/fdot.ll b/llvm/test/CodeGen/DirectX/fdot.ll
index 3e13b2ad2650c..56817a172ff9e 100644
--- a/llvm/test/CodeGen/DirectX/fdot.ll
+++ b/llvm/test/CodeGen/DirectX/fdot.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s
+; RUN: opt -S  -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for dot are generated for int/uint vectors.
 
diff --git a/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll b/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll
new file mode 100644
index 0000000000000..9abea5e82868f
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower  %s 2>&1 | FileCheck %s
+
+; DXIL operation sin is not valid in library stage
+; CHECK: LLVM ERROR: library : Invalid Shader Stage for DXIL operation - FlattenedThreadIdInGroup
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_flattened_thread_id_in_group() #0 {
+entry:
+  %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/floor.ll b/llvm/test/CodeGen/DirectX/floor.ll
index b033e2eaa491e..f667cab4aa249 100644
--- a/llvm/test/CodeGen/DirectX/floor.ll
+++ b/llvm/test/CodeGen/DirectX/floor.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for floor are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/floor_error.ll b/llvm/test/CodeGen/DirectX/floor_error.ll
index 3b51a4b543b7f..e3190e5afb63f 100644
--- a/llvm/test/CodeGen/DirectX/floor_error.ll
+++ b/llvm/test/CodeGen/DirectX/floor_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation floor does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/fmax.ll b/llvm/test/CodeGen/DirectX/fmax.ll
index aff722c29309c..05852ee33486d 100644
--- a/llvm/test/CodeGen/DirectX/fmax.ll
+++ b/llvm/test/CodeGen/DirectX/fmax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for fmax are generated for half/float/double.
 
diff --git a/llvm/test/CodeGen/DirectX/fmin.ll b/llvm/test/CodeGen/DirectX/fmin.ll
index 2f7c209f0278a..1c6c7ca3f2e38 100644
--- a/llvm/test/CodeGen/DirectX/fmin.ll
+++ b/llvm/test/CodeGen/DirectX/fmin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for fmin are generated for half/float/double.
 
diff --git a/llvm/test/CodeGen/DirectX/frac_error.ll b/llvm/test/CodeGen/DirectX/frac_error.ll
index ebce76105ad4d..1bc3558ab0c9a 100644
--- a/llvm/test/CodeGen/DirectX/frac_error.ll
+++ b/llvm/test/CodeGen/DirectX/frac_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation frac does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/group_id_error.ll b/llvm/test/CodeGen/DirectX/group_id_error.ll
new file mode 100644
index 0000000000000..2a6adcf2a9362
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/group_id_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower  %s 2>&1 | FileCheck %s
+
+; DXIL operation not valid for pixel stage
+; CHECK: LLVM ERROR: pixel : Invalid Shader Stage for DXIL operation - GroupId
+
+target triple = "dxil-pc-shadermodel6.7-pixel"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_group_id(i32 %a) #0 {
+entry:
+  %0 = call i32 @llvm.dx.group.id(i32 %a)
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/idot.ll b/llvm/test/CodeGen/DirectX/idot.ll
index 9f89a8d6d340d..eac1b91106dde 100644
--- a/llvm/test/CodeGen/DirectX/idot.ll
+++ b/llvm/test/CodeGen/DirectX/idot.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S  -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure dxil operation function calls for dot are generated for int/uint vectors.
 
diff --git a/llvm/test/CodeGen/DirectX/isinf.ll b/llvm/test/CodeGen/DirectX/isinf.ll
index e2975da90bfc1..295776b089347 100644
--- a/llvm/test/CodeGen/DirectX/isinf.ll
+++ b/llvm/test/CodeGen/DirectX/isinf.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for isinf are generated for float and half.
 ; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
diff --git a/llvm/test/CodeGen/DirectX/isinf_error.ll b/llvm/test/CodeGen/DirectX/isinf_error.ll
index 95b2d0cabcc43..39b83554d74d0 100644
--- a/llvm/test/CodeGen/DirectX/isinf_error.ll
+++ b/llvm/test/CodeGen/DirectX/isinf_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation isinf does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/log.ll b/llvm/test/CodeGen/DirectX/log.ll
index 172c3bfed3b77..ee289088d243b 100644
--- a/llvm/test/CodeGen/DirectX/log.ll
+++ b/llvm/test/CodeGen/DirectX/log.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S  -dxil-intrinsic-expansion  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure dxil operation function calls for log are generated.
 
diff --git a/llvm/test/CodeGen/DirectX/log10.ll b/llvm/test/CodeGen/DirectX/log10.ll
index d4f827a0d1af8..a69f270f9dc88 100644
--- a/llvm/test/CodeGen/DirectX/log10.ll
+++ b/llvm/test/CodeGen/DirectX/log10.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S  -dxil-intrinsic-expansion  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower  -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure dxil operation function calls for log10 are generated.
 
diff --git a/llvm/test/CodeGen/DirectX/log2.ll b/llvm/test/CodeGen/DirectX/log2.ll
index 2164d4db9396d..d6a7ba0b7dda7 100644
--- a/llvm/test/CodeGen/DirectX/log2.ll
+++ b/llvm/test/CodeGen/DirectX/log2.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for log2 are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/log2_error.ll b/llvm/test/CodeGen/DirectX/log2_error.ll
index a26f6e8c3117f..b8876854d389f 100644
--- a/llvm/test/CodeGen/DirectX/log2_error.ll
+++ b/llvm/test/CodeGen/DirectX/log2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation log2 does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/pow.ll b/llvm/test/CodeGen/DirectX/pow.ll
index 25ce0fe731d0b..0e83c4ff8add6 100644
--- a/llvm/test/CodeGen/DirectX/pow.ll
+++ b/llvm/test/CodeGen/DirectX/pow.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S  -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure dxil operation function calls for pow are generated.
 
diff --git a/llvm/test/CodeGen/DirectX/reversebits.ll b/llvm/test/CodeGen/DirectX/reversebits.ll
index b6a7a1bc6152e..1ade57b40100f 100644
--- a/llvm/test/CodeGen/DirectX/reversebits.ll
+++ b/llvm/test/CodeGen/DirectX/reversebits.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for reversebits are generated for all integer types.
 
diff --git a/llvm/test/CodeGen/DirectX/round.ll b/llvm/test/CodeGen/DirectX/round.ll
index e0a3772ebca8f..db953fb29c204 100644
--- a/llvm/test/CodeGen/DirectX/round.ll
+++ b/llvm/test/CodeGen/DirectX/round.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for round are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/round_error.ll b/llvm/test/CodeGen/DirectX/round_error.ll
index 2d27fbb5ee20d..9d2a4e778a924 100644
--- a/llvm/test/CodeGen/DirectX/round_error.ll
+++ b/llvm/test/CodeGen/DirectX/round_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; This test is expected to fail with the following error
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/rsqrt.ll b/llvm/test/CodeGen/DirectX/rsqrt.ll
index 52af0e62220b3..054c84483ef82 100644
--- a/llvm/test/CodeGen/DirectX/rsqrt.ll
+++ b/llvm/test/CodeGen/DirectX/rsqrt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for rsqrt are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/rsqrt_error.ll b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
index 9cd5002c20f7e..5e29e37113d19 100644
--- a/llvm/test/CodeGen/DirectX/rsqrt_error.ll
+++ b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation rsqrt does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index 1f285c433581c..f309a36c6b8e6 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sin are generated for float and half.
 ; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
diff --git a/llvm/test/CodeGen/DirectX/sin_error.ll b/llvm/test/CodeGen/DirectX/sin_error.ll
index ece0e530315b2..0e20031501365 100644
--- a/llvm/test/CodeGen/DirectX/sin_error.ll
+++ b/llvm/test/CodeGen/DirectX/sin_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation sin does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll b/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll
new file mode 100644
index 0000000000000..673fc22c32cd0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0 %s 2>&1 | FileCheck %s
+
+; Shader Stage is required to ensure the operation is supported.
+; CHECK: LLVM ERROR: 1.0: Unknown Compilation Target Shader Stage specified
+
+define noundef float @sin_float(float noundef %a) #0 {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  %1 = call float @llvm.sin.f32(float %0)
+  ret float %1
+}
diff --git a/llvm/test/CodeGen/DirectX/sinh.ll b/llvm/test/CodeGen/DirectX/sinh.ll
index 76d189836f393..d4d3eda9eccb6 100644
--- a/llvm/test/CodeGen/DirectX/sinh.ll
+++ b/llvm/test/CodeGen/DirectX/sinh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sinh are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/sinh_error.ll b/llvm/test/CodeGen/DirectX/sinh_error.ll
index 6a021ce88eb3b..06aeca0339261 100644
--- a/llvm/test/CodeGen/DirectX/sinh_error.ll
+++ b/llvm/test/CodeGen/DirectX/sinh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation sinh does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/smax.ll b/llvm/test/CodeGen/DirectX/smax.ll
index 8b2406782c093..bcda51cb0bfba 100644
--- a/llvm/test/CodeGen/DirectX/smax.ll
+++ b/llvm/test/CodeGen/DirectX/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for smax are generated for i16/i32/i64.
 
diff --git a/llvm/test/CodeGen/DirectX/smin.ll b/llvm/test/CodeGen/DirectX/smin.ll
index b2b40a1b62433..8d4884704df21 100644
--- a/llvm/test/CodeGen/DirectX/smin.ll
+++ b/llvm/test/CodeGen/DirectX/smin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for smin are generated for i16/i32/i64.
 
diff --git a/llvm/test/CodeGen/DirectX/sqrt.ll b/llvm/test/CodeGen/DirectX/sqrt.ll
index 76a572efd2055..792fbc8d0614d 100644
--- a/llvm/test/CodeGen/DirectX/sqrt.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sqrt are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/sqrt_error.ll b/llvm/test/CodeGen/DirectX/sqrt_error.ll
index fffa2e19b80fa..1477abc62c13a 100644
--- a/llvm/test/CodeGen/DirectX/sqrt_error.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation sqrt does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/tan.ll b/llvm/test/CodeGen/DirectX/tan.ll
index 567ab02d40f91..6f7beb592339a 100644
--- a/llvm/test/CodeGen/DirectX/tan.ll
+++ b/llvm/test/CodeGen/DirectX/tan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tan are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/tan_error.ll b/llvm/test/CodeGen/DirectX/tan_error.ll
index c870c36f54925..fa03e531bd672 100644
--- a/llvm/test/CodeGen/DirectX/tan_error.ll
+++ b/llvm/test/CodeGen/DirectX/tan_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation tan does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/tanh.ll b/llvm/test/CodeGen/DirectX/tanh.ll
index d0313178c7ac3..e6642d9a74c8a 100644
--- a/llvm/test/CodeGen/DirectX/tanh.ll
+++ b/llvm/test/CodeGen/DirectX/tanh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tanh are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/tanh_error.ll b/llvm/test/CodeGen/DirectX/tanh_error.ll
index a1b8cbf0e13bc..933ffbc87e23f 100644
--- a/llvm/test/CodeGen/DirectX/tanh_error.ll
+++ b/llvm/test/CodeGen/DirectX/tanh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation tanh does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/thread_id_error.ll b/llvm/test/CodeGen/DirectX/thread_id_error.ll
new file mode 100644
index 0000000000000..6928932328fbe
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/thread_id_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower  %s 2>&1 | FileCheck %s
+
+; DXIL operation not valid for library stage
+; CHECK: LLVM ERROR: library : Invalid Shader Stage for DXIL operation - ThreadId
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_thread_id(i32 %a) #0 {
+entry:
+  %0 = call i32 @llvm.dx.thread.id(i32 %a)
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll b/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll
new file mode 100644
index 0000000000000..8b63fd7bae543
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower  %s 2>&1 | FileCheck %s
+
+; DXIL operation sin is not valid in vertex stage
+; CHECK: LLVM ERROR: vertex : Invalid Shader Stage for DXIL operation - ThreadIdInGroup
+
+target triple = "dxil-pc-shadermodel6.7-vertex"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_thread_id_in_group(i32 %a) #0 {
+entry:
+  %0 = call i32 @llvm.dx.thread.id.in.group(i32 %a)
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/trunc.ll b/llvm/test/CodeGen/DirectX/trunc.ll
index 2072f28cef50a..f00b737da4dbb 100644
--- a/llvm/test/CodeGen/DirectX/trunc.ll
+++ b/llvm/test/CodeGen/DirectX/trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for trunc are generated for float and half.
 
diff --git a/llvm/test/CodeGen/DirectX/trunc_error.ll b/llvm/test/CodeGen/DirectX/trunc_error.ll
index 751b0b94c280d..ccc7b1df879ee 100644
--- a/llvm/test/CodeGen/DirectX/trunc_error.ll
+++ b/llvm/test/CodeGen/DirectX/trunc_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation trunc does not support double overload type
 ; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/umax.ll b/llvm/test/CodeGen/DirectX/umax.ll
index be0f557fc8da6..a4bd66ef0bd6c 100644
--- a/llvm/test/CodeGen/DirectX/umax.ll
+++ b/llvm/test/CodeGen/DirectX/umax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for umax are generated for i16/i32/i64.
 
diff --git a/llvm/test/CodeGen/DirectX/umin.ll b/llvm/test/CodeGen/DirectX/umin.ll
index 5051c71174489..a551f8ff3bfa9 100644
--- a/llvm/test/CodeGen/DirectX/umin.ll
+++ b/llvm/test/CodeGen/DirectX/umin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for umin are generated for i16/i32/i64.
 
diff --git a/llvm/test/CodeGen/Generic/disable-debug-info-print.ll b/llvm/test/CodeGen/Generic/disable-debug-info-print.ll
deleted file mode 100644
index befa91c15d3c8..0000000000000
--- a/llvm/test/CodeGen/Generic/disable-debug-info-print.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -disable-debug-info-print=true -exception-model=dwarf -o - %s | FileCheck %s
-; RUN: llc -disable-debug-info-print=true -exception-model=sjlj -o - %s | FileCheck %s --check-prefix=SJLJ-CHECK
-
-define i16 @main() nounwind !dbg !7 {
-entry:
-  ret i16 0, !dbg !9
-}
-
-define i16 @helper() !dbg !10 {
-entry:
-  ret i16 0, !dbg !11
-}
-
-
-; CHECK: main
-; CHECK-NOT: cfi_startproc
-; CHECK-NOT: .file
-; CHECK-NOT: .loc
-; CHECK: helper
-; CHECK: cfi_startproc
-; CHECK-NOT: .file
-; CHECK-NOT: .loc
-; CHECK: cfi_endproc
-
-; SJLJ-CHECK: main
-; SJLJ-CHECK-NOT: cfi_startproc
-; SJLJ-CHECK-NOT: .file
-; SJLJ-CHECK-NOT: .loc
-; SJLJ-CHECK: helper
-; SJLJ-CHECK-NOT: cfi_startproc
-; SJLJ-CHECK-NOT: .file
-; SJLJ-CHECK-NOT: .loc
-
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "unwind-tables.c", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 7, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 12.0.0"}
-!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 2, column: 3, scope: !7)
-!10 = distinct !DISubprogram(name: "helper", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!11 = !DILocation(line: 2, column: 3, scope: !10)
diff --git a/llvm/test/CodeGen/Hexagon/cext-opt-block-addr.mir b/llvm/test/CodeGen/Hexagon/cext-opt-block-addr.mir
new file mode 100644
index 0000000000000..9f140132dcd6c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/cext-opt-block-addr.mir
@@ -0,0 +1,173 @@
+# REQUIRES: asserts
+# RUN: llc -march=hexagon -run-pass hexagon-cext-opt %s -o - | FileCheck %s
+
+# Check that the HexagonConstantExtenders pass does not assert when block
+# addresses from different functions are used
+# CHECK-LABEL: name: wibble
+# CHECK: A2_tfrsi blockaddress(@baz
+# CHECK: A2_tfrsi blockaddress(@wibble
+
+--- |
+  target triple = "hexagon"
+
+  define dso_local void @baz() {
+  bb:
+    br label %bb1
+
+  bb1:                                              ; preds = %bb
+    %call = tail call fastcc i32 @wibble(i32 poison)
+    ret void
+  }
+
+  define internal fastcc i32 @wibble(i32 %arg) {
+  bb:
+    %call = tail call i32 @eggs(i32 noundef ptrtoint (ptr blockaddress(@baz, %bb1) to i32))
+    br label %bb1
+
+  bb1:                                              ; preds = %bb
+    tail call void @baz.1(i32 noundef ptrtoint (ptr blockaddress(@wibble, %bb1) to i32))
+    ret i32 %call
+  }
+
+  declare i32 @eggs(i32 noundef) local_unnamed_addr
+
+  declare void @baz.1(i32 noundef) local_unnamed_addr
+
+...
+---
+name:            baz
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '' }
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     true
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.bb:
+    successors: %bb.1(0x80000000)
+
+  bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+    %0:intregs = IMPLICIT_DEF
+    $r0 = COPY %0
+    PS_tailcall_i @wibble, hexagoncsr, implicit $r0
+
+...
+---
+name:            wibble
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '' }
+  - { id: 1, class: intregs, preferred-register: '' }
+  - { id: 2, class: intregs, preferred-register: '' }
+  - { id: 3, class: intregs, preferred-register: '' }
+  - { id: 4, class: intregs, preferred-register: '' }
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.bb:
+    successors: %bb.1(0x80000000)
+
+    %2:intregs = A2_tfrsi blockaddress(@baz, %ir-block.bb1)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+    $r0 = COPY %2
+    J2_call @eggs, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit $r0, implicit-def $r29, implicit-def $r0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+    %3:intregs = COPY $r0
+
+  bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+    %4:intregs = A2_tfrsi blockaddress(@wibble, %ir-block.bb1)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+    $r0 = COPY %4
+    J2_call @baz.1, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit $r0, implicit-def $r29
+    ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+    $r0 = COPY %3
+    PS_jmpret $r31, implicit-def dead $pc, implicit $r0
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/hvx-concat-lower.ll b/llvm/test/CodeGen/Hexagon/hvx-concat-lower.ll
new file mode 100644
index 0000000000000..2037493972fc7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/hvx-concat-lower.ll
@@ -0,0 +1,135 @@
+; During lowering of HVX instruction for 64B vector, the rotation
+; direction for VROR (as part of concat of vectors lowering) is fixed.
+
+; RUN: llc -march=hexagon -O2 %s -o - | FileCheck %s
+
+; CHECK: vec.epilog.ph
+; CHECK: r{{.*}} = {{.*}}#48
+; CHECK: vec.epilog.vector.body
+
+
+%struct.str = type { i8, i8, i8 }
+
+define dso_local void @foo(i16* nocapture noundef writeonly %pOut, i16* nocapture noundef readonly %Coefs, %struct.str* nocapture noundef readonly %pQ, i32 noundef %Intra) local_unnamed_addr #0 {
+entry:
+  %Coefs13 = ptrtoint i16* %Coefs to i32
+  %pOut12 = ptrtoint i16* %pOut to i32
+  %cmp10 = icmp slt i32 %Intra, 16
+  br i1 %cmp10, label %iter.check, label %for.end
+
+iter.check:                                       ; preds = %entry
+  %Q = getelementptr inbounds %struct.str, %struct.str* %pQ, i32 0, i32 0
+  %0 = load i8, i8* %Q, align 1
+  %conv3 = zext i8 %0 to i32
+  %1 = sub nsw i32 0, %conv3
+  %2 = sub i32 16, %Intra
+  %min.iters.check = icmp ult i32 %2, 8
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %iter.check
+  %3 = shl i32 %Intra, 1
+  %4 = add i32 %3, %pOut12
+  %5 = add i32 %3, %Coefs13
+  %6 = sub i32 %4, %5
+  %diff.check = icmp ult i32 %6, 128
+  br i1 %diff.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check:                      ; preds = %vector.memcheck
+  %min.iters.check14 = icmp ult i32 %2, 64
+  br i1 %min.iters.check14, label %vec.epilog.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %vector.main.loop.iter.check
+  %n.vec = and i32 %2, -64
+  %bd.spinsert = insertelement <32 x i32> poison, i32 %1, i64 0
+  %bd.sp = shufflevector <32 x i32> %bd.spinsert, <32 x i32> poison, <32 x i32> zeroinitializer
+  %bd.spinsert16 = insertelement <32 x i32> poison, i32 %conv3, i64 0
+  %bd.sp17 = shufflevector <32 x i32> %bd.spinsert16, <32 x i32> poison, <32 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %offset.idx = add i32 %index, %Intra
+  %7 = getelementptr inbounds i16, i16* %Coefs, i32 %offset.idx
+  %8 = bitcast i16* %7 to <32 x i16>*
+  %wide.load = load <32 x i16>, <32 x i16>* %8, align 2
+  %9 = getelementptr inbounds i16, i16* %7, i32 32
+  %10 = bitcast i16* %9 to <32 x i16>*
+  %wide.load15 = load <32 x i16>, <32 x i16>* %10, align 2
+  %11 = icmp slt <32 x i16> %wide.load, zeroinitializer
+  %12 = icmp slt <32 x i16> %wide.load15, zeroinitializer
+  %13 = select <32 x i1> %11, <32 x i32> %bd.sp, <32 x i32> %bd.sp17
+  %14 = select <32 x i1> %12, <32 x i32> %bd.sp, <32 x i32> %bd.sp17
+  %15 = trunc <32 x i32> %13 to <32 x i16>
+  %16 = trunc <32 x i32> %14 to <32 x i16>
+  %17 = getelementptr inbounds i16, i16* %pOut, i32 %offset.idx
+  %18 = bitcast i16* %17 to <32 x i16>*
+  store <32 x i16> %15, <32 x i16>* %18, align 2
+  %19 = getelementptr inbounds i16, i16* %17, i32 32
+  %20 = bitcast i16* %19 to <32 x i16>*
+  store <32 x i16> %16, <32 x i16>* %20, align 2
+  %index.next = add nuw i32 %index, 64
+  %21 = icmp eq i32 %index.next, %n.vec
+  br i1 %21, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i32 %2, %n.vec
+  br i1 %cmp.n, label %for.end, label %vec.epilog.iter.check
+
+vec.epilog.iter.check:                            ; preds = %middle.block
+  %ind.end24 = add i32 %n.vec, %Intra
+  %n.vec.remaining = and i32 %2, 56
+  %min.epilog.iters.check = icmp eq i32 %n.vec.remaining, 0
+  br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph:                                    ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+  %vec.epilog.resume.val = phi i32 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+  %n.vec23 = and i32 %2, -8
+  %ind.end = add i32 %n.vec23, %Intra
+  %bd.spinsert29 = insertelement <8 x i32> poison, i32 %1, i64 0
+  %bd.sp30 = shufflevector <8 x i32> %bd.spinsert29, <8 x i32> poison, <8 x i32> zeroinitializer
+  %bd.spinsert31 = insertelement <8 x i32> poison, i32 %conv3, i64 0
+  %bd.sp32 = shufflevector <8 x i32> %bd.spinsert31, <8 x i32> poison, <8 x i32> zeroinitializer
+  br label %vec.epilog.vector.body
+
+vec.epilog.vector.body:                           ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+  %index26 = phi i32 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next33, %vec.epilog.vector.body ]
+  %offset.idx27 = add i32 %index26, %Intra
+  %22 = getelementptr inbounds i16, i16* %Coefs, i32 %offset.idx27
+  %23 = bitcast i16* %22 to <8 x i16>*
+  %wide.load28 = load <8 x i16>, <8 x i16>* %23, align 2
+  %24 = icmp slt <8 x i16> %wide.load28, zeroinitializer
+  %25 = select <8 x i1> %24, <8 x i32> %bd.sp30, <8 x i32> %bd.sp32
+  %26 = trunc <8 x i32> %25 to <8 x i16>
+  %27 = getelementptr inbounds i16, i16* %pOut, i32 %offset.idx27
+  %28 = bitcast i16* %27 to <8 x i16>*
+  store <8 x i16> %26, <8 x i16>* %28, align 2
+  %index.next33 = add nuw i32 %index26, 8
+  %29 = icmp eq i32 %index.next33, %n.vec23
+  br i1 %29, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block:                          ; preds = %vec.epilog.vector.body
+  %cmp.n25 = icmp eq i32 %2, %n.vec23
+  br i1 %cmp.n25, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %vector.memcheck, %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+  %i.011.ph = phi i32 [ %Intra, %iter.check ], [ %Intra, %vector.memcheck ], [ %ind.end24, %vec.epilog.iter.check ], [ %ind.end, %vec.epilog.middle.block ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %Coefs, i32 %i.011
+  %30 = load i16, i16* %arrayidx, align 2
+  %cmp1 = icmp slt i16 %30, 0
+  %31 = select i1 %cmp1, i32 %1, i32 %conv3
+  %conv4 = trunc i32 %31 to i16
+  %arrayidx5 = getelementptr inbounds i16, i16* %pOut, i32 %i.011
+  store i16 %conv4, i16* %arrayidx5, align 2
+  %inc = add i32 %i.011, 1
+  %exitcond.not = icmp eq i32 %inc, 16
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+  ret void
+}
+
+attributes #0 = { argmemonly nofree norecurse nosync nounwind "target-cpu"="hexagonv66" "target-features"="+hvx-length64b,+hvxv66,+v66" }
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
index 601b98dca8e20..be75301b016ed 100644
--- a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
@@ -3,6 +3,7 @@
 # RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
 # RUN: | FileCheck %s
 
+# CHECK: Physical registers are not supported in window scheduling!
 # CHECK: The WindowScheduler failed to initialize!
 
 ---
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
new file mode 100644
index 0000000000000..6e69a76290fb1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
@@ -0,0 +1,83 @@
+# RUN: llc  --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+# REQUIRES: asserts
+
+# Test that checks no window scheduler is performed if the II set by pragma was
+# enabled
+
+# CHECK: Window scheduling is disabled when llvm.loop.pipeline.initiationinterval is set.
+
+--- |
+  define void @test_pragma_ii_fail(ptr %a0, i32 %a1) {
+  b0:
+    %v0 = icmp sgt i32 %a1, 1
+    br i1 %v0, label %b1, label %b4
+
+  b1:                                               ; preds = %b0
+    %v1 = load i32, ptr %a0, align 4
+    %v2 = add i32 %v1, 10
+    %v4 = add i32 %a1, -1
+    %cgep = getelementptr i32, ptr %a0, i32 1
+    br label %b2
+
+  b2:                                               ; preds = %b2, %b1
+    %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ]
+    %v6 = phi ptr [ %cgep2, %b2 ], [ %cgep, %b1 ]
+    %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ]
+    store i32 %v7, ptr %v6, align 4
+    %v8 = add i32 %v7, 10
+    %cgep1 = getelementptr i32, ptr %v6, i32 -1
+    store i32 %v8, ptr %cgep1, align 4
+    %v10 = add i32 %v7, 10
+    %v12 = add i32 %v5, -1
+    %v13 = icmp eq i32 %v12, 0
+    %cgep2 = getelementptr i32, ptr %v6, i32 1
+    br i1 %v13, label %b4, label %b2, !llvm.loop !0
+
+  b4:                                               ; preds = %b2, %b0
+    ret void
+  }
+
+  !0 = distinct !{!0, !1}
+  !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 2}
+...
+---
+name:            test_pragma_ii_fail
+tracksRegLiveness: true
+body:             |
+  bb.0.b0:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    liveins: $r0, $r1
+  
+    %0:intregs = COPY $r1
+    %1:intregs = COPY $r0
+    %2:predregs = C2_cmpgti %0, 1
+    J2_jumpf %2, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.b1:
+    successors: %bb.2(0x80000000)
+  
+    %3:intregs, %4:intregs = L2_loadri_pi %1, 4
+    %5:intregs = A2_addi killed %3, 10
+    %6:intregs = A2_addi %0, -1
+    %7:intregs = COPY %6
+    J2_loop0r %bb.2, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2.b2 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %8:intregs = PHI %4, %bb.1, %9, %bb.2
+    %10:intregs = PHI %5, %bb.1, %11, %bb.2
+    S2_storeri_io %8, 0, %10
+    %11:intregs = A2_addi %10, 10
+    S2_storeri_io %8, -4, %11
+    %9:intregs = A2_addi %8, 4
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3.b4:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
new file mode 100644
index 0000000000000..ddba67d78eb58
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
@@ -0,0 +1,59 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs \
+# RUN: -window-region-limit=1 -window-search-ratio=100 -window-diff-limit=0 \
+# RUN: 2>&1 | FileCheck %s
+
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+
+---
+name:            test_window_stall_cycle
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $r0, $r1
+  
+    %0:intregs = COPY $r1
+    %1:intregs = COPY $r0
+    %2:intregs = nsw A2_add %0, %1
+    %3:intregs = S2_lsr_i_r_acc %2, %2, 31
+    %4:intregs = S2_asr_i_r killed %3, 1
+    %5:predregs = C2_cmpgt %1, %4
+    %6:intregs = A2_tfrsi 0
+    J2_jumpt killed %5, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1:
+    successors: %bb.2(0x80000000)
+  
+    %7:intregs = A2_addi %4, 2
+    %8:intregs = A2_tfrsi 0
+    %9:intregs = A2_sub %4, %1
+    %10:intregs = A2_addi %9, 1
+    %11:intregs = COPY %10
+    J2_loop0r %bb.2, %11, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %12:intregs = PHI %7, %bb.1, %13, %bb.2
+    %14:intregs = PHI %8, %bb.1, %15, %bb.2
+    %16:intregs = PHI %8, %bb.1, %17, %bb.2
+    %18:intregs, %13:intregs = L2_loadri_pi %12, -4
+    %17:intregs = nsw A2_add killed %18, %16
+    %15:intregs = A2_max %17, %14
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3:
+    %19:intregs = PHI %6, %bb.0, %15, %bb.2
+    $r0 = COPY %19
+    PS_jmpret $r31, implicit-def dead $pc, implicit $r0
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
new file mode 100644
index 0000000000000..ecf49a83c69e1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
@@ -0,0 +1,45 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+
+# CHECK-NOT: Can't find a valid II. Keep searching...
+# CHECK: Start analyzing II
+# CHECK: Start scheduling Phis
+# CHECK: Current window Offset is {{[0-9]+}} and II is {{[0-9]+}}
+
+---
+name:            relu
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  bb.1:
+    successors: %bb.3(0x80000000)
+    %4:hvxvr = V6_vd0
+    %5:intregs = A2_addi %2, 31
+    %6:intregs = S2_lsr_i_r %5, 5
+    %7:intregs = COPY %6
+    J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  bb.2:
+    PS_jmpret $r31, implicit-def dead $pc
+  bb.3 (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+    %8:intregs = PHI %1, %bb.1, %9, %bb.3
+    %10:intregs = PHI %0, %bb.1, %14, %bb.3
+    %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
+    %12:intregs = COPY %10
+    %13:hvxvr = V6_vmaxw killed %11, %4
+    %14:intregs = V6_vS32b_pi %12, 128, killed %13
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
+
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
index 13f774c96d5b1..138f0c81238b3 100644
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -25,7 +25,6 @@
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Instrument function entry/exit with calls to e.g. mcount() (post inlining)
diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll
index 6d07e7a947297..67d62eca914e9 100644
--- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll
+++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll
@@ -577,3 +577,24 @@ define i1 @andn_icmp_ult_i8_nn(i8 %a, i8 %b) nounwind {
   %cmp = icmp ult i8 %and, %b
   ret i1 %cmp
 }
+
+define i1 @andn_icmp_eq_i8_i32(i8 signext %a, i8 signext %b) nounwind {
+; LA32-LABEL: andn_icmp_eq_i8_i32:
+; LA32:       # %bb.0:
+; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    sltui $a0, $a0, 1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: andn_icmp_eq_i8_i32:
+; LA64:       # %bb.0:
+; LA64-NEXT:    andn $a0, $a1, $a0
+; LA64-NEXT:    andi $a0, $a0, 255
+; LA64-NEXT:    sltui $a0, $a0, 1
+; LA64-NEXT:    ret
+  %x = zext i8 %a to i32
+  %y = zext i8 %b to i32
+  %not = xor i32 %x, -1
+  %and = and i32 %not, %y
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index 4b2b72afaee17..7bc7a982db86d 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; LARGE:       # %bb.0:
 ; LARGE-NEXT:    addi.d $sp, $sp, -16
 ; LARGE-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LARGE-NEXT:    pcalau12i $ra, %got_pc_hi20(callee)
-; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee)
-; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee)
-; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee)
-; LARGE-NEXT:    ldx.d $ra, $t8, $ra
+; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee)
+; LARGE-NEXT:    addi.d $ra, $zero, %got_pc_lo12(callee)
+; LARGE-NEXT:    lu32i.d $ra, %got64_pc_lo20(callee)
+; LARGE-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(callee)
+; LARGE-NEXT:    ldx.d $ra, $ra, $a1
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) {
 ; LARGE-NEXT:    .cfi_offset 1, -8
 ; LARGE-NEXT:    ori $a2, $zero, 1000
 ; LARGE-NEXT:    move $a1, $zero
-; LARGE-NEXT:    pcalau12i $ra, %pc_hi20(memset)
-; LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(memset)
-; LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(memset)
-; LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(memset)
-; LARGE-NEXT:    add.d $ra, $t8, $ra
+; LARGE-NEXT:    pcalau12i $a3, %pc_hi20(memset)
+; LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(memset)
+; LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(memset)
+; LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(memset)
+; LARGE-NEXT:    add.d $ra, $ra, $a3
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind {
 ;
 ; MEDIUM-LABEL: caller_tail:
 ; MEDIUM:       # %bb.0: # %entry
-; MEDIUM-NEXT:    pcaddu18i $t8, %call36(callee_tail)
-; MEDIUM-NEXT:    jr $t8
+; MEDIUM-NEXT:    pcaddu18i $a1, %call36(callee_tail)
+; MEDIUM-NEXT:    jr $a1
 ;
 ; LARGE-LABEL: caller_tail:
 ; LARGE:       # %bb.0: # %entry
-; LARGE-NEXT:    pcalau12i $t7, %got_pc_hi20(callee_tail)
-; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee_tail)
-; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee_tail)
-; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail)
-; LARGE-NEXT:    ldx.d $t7, $t8, $t7
-; LARGE-NEXT:    jr $t7
+; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee_tail)
+; LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(callee_tail)
+; LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(callee_tail)
+; LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail)
+; LARGE-NEXT:    ldx.d $a1, $a2, $a1
+; LARGE-NEXT:    jr $a1
 entry:
   %r = tail call i32 @callee_tail(i32 %i)
   ret i32 %r
diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll
index 41ff1be496e3b..be3e05e394ff9 100644
--- a/llvm/test/CodeGen/LoongArch/cpus.ll
+++ b/llvm/test/CodeGen/LoongArch/cpus.ll
@@ -3,6 +3,7 @@
 
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=loongarch64 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=la464 2>&1 | FileCheck %s
+; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=la664 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d 2>&1 | FileCheck %s
 
 ; CHECK-NOT: {{.*}} is not a recognized processor for this target
@@ -18,3 +19,7 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" {
 define void @tune_cpu_la464() "tune-cpu"="la464" {
   ret void
 }
+
+define void @tune_cpu_la664() "tune-cpu"="la664" {
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll
index 8c21adbcbb55c..d221200401bc5 100644
--- a/llvm/test/CodeGen/LoongArch/expand-call.ll
+++ b/llvm/test/CodeGen/LoongArch/expand-call.ll
@@ -1,6 +1,6 @@
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --stop-before loongarch-prera-expand-pseudo \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND
-; RUN: llc --mtriple=loongarch64 -mattr=+d --stop-before machine-opt-remark-emitter \
+; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND
 
 declare void @callee()
diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll
index fb2929572f31a..2423dd81a4d3a 100644
--- a/llvm/test/CodeGen/LoongArch/global-address.ll
+++ b/llvm/test/CodeGen/LoongArch/global-address.ll
@@ -53,32 +53,32 @@ define void @foo() nounwind {
 ; LA64LARGENOPIC-LABEL: foo:
 ; LA64LARGENOPIC:       # %bb.0:
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
-; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
-; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
-; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %pc64_lo20(g)
-; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
-; LA64LARGENOPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
+; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %pc64_lo20(g)
+; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
+; LA64LARGENOPIC-NEXT:    add.d $a0, $a1, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGENOPIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: foo:
 ; LA64LARGEPIC:       # %bb.0:
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
-; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGEPIC-NEXT:    ret
   %V = load volatile i32, ptr @G
diff --git a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
index 277b0b9061398..2b7a862ecde11 100644
--- a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
+++ b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
@@ -20,10 +20,10 @@ define dso_local signext i32 @local_large() #0 {
 ; CHECK-LABEL: local_large:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(b)
-; CHECK-NEXT:    addi.d $t8, $zero, %pc_lo12(b)
-; CHECK-NEXT:    lu32i.d $t8, %pc64_lo20(b)
-; CHECK-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(b)
-; CHECK-NEXT:    add.d $a0, $t8, $a0
+; CHECK-NEXT:    addi.d $a1, $zero, %pc_lo12(b)
+; CHECK-NEXT:    lu32i.d $a1, %pc64_lo20(b)
+; CHECK-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(b)
+; CHECK-NEXT:    add.d $a0, $a1, $a0
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
   %1 = load i32, ptr @b, align 4
diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
index 281d52c47b685..becb3cae46b8c 100644
--- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
+++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
@@ -141,3 +141,49 @@ define i32 @m_offset_2048(ptr %p) nounwind {
   %2 = call i32 asm "ld.w $0, $1", "=r,*m"(ptr elementtype(i32) %1)
   ret i32 %2
 }
+
+@g_i32 = dso_local global i32 0
+
+define i32 @m_addr_pcrel() nounwind {
+; LA32-LABEL: m_addr_pcrel:
+; LA32:       # %bb.0:
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT:    addi.w $a1, $a0, %pc_lo12(g_i32)
+; LA32-NEXT:    #APP
+; LA32-NEXT:    ld.w $a0, $a1, 0
+; LA32-NEXT:    #NO_APP
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: m_addr_pcrel:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT:    addi.d $a1, $a0, %pc_lo12(g_i32)
+; LA64-NEXT:    #APP
+; LA64-NEXT:    ld.w $a0, $a1, 0
+; LA64-NEXT:    #NO_APP
+; LA64-NEXT:    ret
+  %1 = tail call i32 asm sideeffect "ld.w $0, $1", "=&r,*m"(ptr nonnull elementtype(i32) @g_i32)
+  ret i32 %1
+}
+
+define i32 @m_addr_should_not_fold() nounwind {
+; LA32-LABEL: m_addr_should_not_fold:
+; LA32:       # %bb.0:
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT:    addi.w $a1, $a0, %pc_lo12(g_i32)
+; LA32-NEXT:    #APP
+; LA32-NEXT:    ld.w $a0, $a1, 0
+; LA32-NEXT:    #NO_APP
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: m_addr_should_not_fold:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT:    addi.d $a1, $a0, %pc_lo12(g_i32)
+; LA64-NEXT:    #APP
+; LA64-NEXT:    ld.w $a0, $a1, 0
+; LA64-NEXT:    #NO_APP
+; LA64-NEXT:    ret
+  %1 = tail call i32 asm sideeffect "ld.w $0, $1, 0", "=&r,r,~{memory}"(ptr nonnull @g_i32)
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
new file mode 100644
index 0000000000000..22ab19b9fa446
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvilvl.b
+define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39,
+                                                               i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvilvl.h
+define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+    ret <16 x i16> %c
+}
+
+;; xvilvl.w
+define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.b
+define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47,
+                                                               i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvilvh.h
+define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvilvh.w
+define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.w
+define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
new file mode 100644
index 0000000000000..2ff9af4069b9b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpackev.b
+define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46,
+                                                               i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpackev.h
+define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpackev.w
+define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d
+define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i64> %c
+}
+
+;; xvpackev.w
+define <8 x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d
+define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x double> %c
+}
+
+;; xvpackod.b
+define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47,
+                                                              i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpackod.h
+define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpackod.w
+define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d
+define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvpackod.w
+define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x float> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d
+define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
new file mode 100644
index 0000000000000..294d292d17640
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpickev.b
+define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
+                                                               i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpickev.h
+define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpickev.w
+define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.w
+define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickod.b
+define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47,
+                                                               i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpickod.h
+define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpickod.w
+define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.w
+define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
new file mode 100644
index 0000000000000..dce1e4b777e29
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvrepl128vei.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.b $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                                                               i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+    ret <32 x i8> %c
+}
+
+;; xvrepl128vei.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.h $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+                                                                 i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+    ret <16 x i16> %c
+}
+
+;; xvrepl128vei.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 3, i32 3, i32 3, i32 3>
+    ret <8 x i32> %c
+}
+
+;; xvrepl128vei.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+    ret <4 x i64> %c
+}
+
+;; xvrepl128vei.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+    ret <8 x float> %c
+}
+
+;; xvrepl128vei.d
+define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr1, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 5, i32 7, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
new file mode 100644
index 0000000000000..4cc819018f0a8
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvshuf.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.b $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvshuf.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 78
+; CHECK-NEXT:    xvshuf.h $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 27, i32 26, i32 25, i32 24,
+                                                                 i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3>
+    ret <16 x i16> %c
+}
+
+;; xvshuf.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 68
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 8, i32 9, i32 3, i32 2>
+    ret <8 x i32> %c
+}
+
+;; xvshuf.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 238
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 238
+; CHECK-NEXT:    xvshuf.d $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvshuf.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 0, i32 10, i32 9, i32 4, i32 5, i32 12, i32 13>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
new file mode 100644
index 0000000000000..dc4532a7292ab
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xxvshuf4i.b
+define <32 x i8> @shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12,
+                                                               i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 26, i32 25, i32 24, i32 31, i32 30, i32 29, i32 28>
+    ret <32 x i8> %c
+}
+
+;; xvshuf4i.h
+define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.h $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i16> %c
+}
+
+;; xvshuf4i.w
+define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i32> %c
+}
+
+;; xvshuf4i.w
+define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
new file mode 100644
index 0000000000000..31398c6081c0a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vilvl.b
+define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+    ret <16 x i8> %c
+}
+
+;; vilvl.h
+define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+    ret <8 x i16> %c
+}
+
+;; vilvl.w
+define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x i32> %c
+}
+
+;; vilvl.w
+define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x float> %c
+}
+
+;; vilvh.b
+define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vilvh.h
+define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vilvh.w
+define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vilvh.w
+define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
new file mode 100644
index 0000000000000..171e68306cd11
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpackev.b
+define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpackev.h
+define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpackev.w
+define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x i64> %c
+}
+
+;; vpackev.w
+define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x double> %c
+}
+
+;; vpackod.b
+define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpackod.h
+define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpackod.w
+define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x i64> @shufflodector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vpackod.w
+define <4 x float> @shufflodector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x float> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x double> @shufflodector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
new file mode 100644
index 0000000000000..ca636d942b583
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpickev.b
+define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpickev.h
+define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpickev.w
+define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.w
+define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickod.b
+define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpickod.h
+define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpickod.w
+define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.w
+define <4 x float> @shufflodector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
new file mode 100644
index 0000000000000..10510786f3216
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vreplvei.b
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.b $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+    ret <16 x i8> %c
+}
+
+;; vreplvei.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.h $vr0, $vr1, 2
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+    ret <8 x i16> %c
+}
+
+;; vreplvei.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vreplvei.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x i64> %c
+}
+
+;; vreplvei.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x float> %c
+}
+
+;; vreplvei.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
new file mode 100644
index 0000000000000..55800b31446b3
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vshuf.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vshuf.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vshuf.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vshuf.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
+
+;; vshuf.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
new file mode 100644
index 0000000000000..660b9581c3d1f
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vilvh.b
+define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i8> %c
+}
+
+;; vilvh.h
+define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v8i4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i16> %c
+}
+
+;; vilvh.w
+define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vilvh.w
+define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index ed1a24e82b4e4..5248468b6027d 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -41,12 +41,12 @@ define void @test_la_pcrel(i32 signext %n) {
 ;
 ; LA64LARGE-LABEL: test_la_pcrel:
 ; LA64LARGE:       # %bb.0: # %entry
-; LA64LARGE-NEXT:    move $a1, $zero
 ; LA64LARGE-NEXT:    pcalau12i $a2, %pc_hi20(l)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(l)
-; LA64LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(l)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(l)
-; LA64LARGE-NEXT:    add.d $a2, $t8, $a2
+; LA64LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(l)
+; LA64LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(l)
+; LA64LARGE-NEXT:    lu52i.d $a3, $a1, %pc64_hi12(l)
+; LA64LARGE-NEXT:    move $a1, $zero
+; LA64LARGE-NEXT:    add.d $a2, $a3, $a2
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB0_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -103,10 +103,10 @@ define void @test_la_got(i32 signext %n) {
 ; LA64LARGE-LABEL: test_la_got:
 ; LA64LARGE:       # %bb.0: # %entry
 ; LA64LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(g)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(g)
-; LA64LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(g)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(g)
-; LA64LARGE-NEXT:    ldx.d $a1, $t8, $a1
+; LA64LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(g)
+; LA64LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(g)
+; LA64LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(g)
+; LA64LARGE-NEXT:    ldx.d $a1, $a2, $a1
 ; LA64LARGE-NEXT:    move $a2, $zero
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB1_1: # %loop
@@ -165,10 +165,10 @@ define void @test_la_tls_ie(i32 signext %n) {
 ; LA64LARGE-LABEL: test_la_tls_ie:
 ; LA64LARGE:       # %bb.0: # %entry
 ; LA64LARGE-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGE-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGE-NEXT:    ldx.d $a1, $t8, $a1
+; LA64LARGE-NEXT:    addi.d $a2, $zero, %ie_pc_lo12(ie)
+; LA64LARGE-NEXT:    lu32i.d $a2, %ie64_pc_lo20(ie)
+; LA64LARGE-NEXT:    lu52i.d $a2, $a2, %ie64_pc_hi12(ie)
+; LA64LARGE-NEXT:    ldx.d $a1, $a2, $a1
 ; LA64LARGE-NEXT:    move $a2, $zero
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB2_1: # %loop
@@ -272,21 +272,21 @@ define void @test_la_tls_ld(i32 signext %n) {
 ; LA64LARGE-NEXT:    .cfi_offset 23, -24
 ; LA64LARGE-NEXT:    .cfi_offset 24, -32
 ; LA64LARGE-NEXT:    move $fp, $a0
+; LA64LARGE-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
+; LA64LARGE-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
+; LA64LARGE-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
+; LA64LARGE-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
 ; LA64LARGE-NEXT:    move $s1, $zero
-; LA64LARGE-NEXT:    pcalau12i $s0, %ld_pc_hi20(ld)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
-; LA64LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LA64LARGE-NEXT:    add.d $s0, $t8, $s0
+; LA64LARGE-NEXT:    add.d $s0, $a1, $a0
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB3_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64LARGE-NEXT:    move $a0, $s0
-; LA64LARGE-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGE-NEXT:    add.d $ra, $t8, $ra
+; LA64LARGE-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGE-NEXT:    add.d $ra, $ra, $a1
 ; LA64LARGE-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGE-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGE-NEXT:    addi.w $s1, $s1, 1
@@ -438,21 +438,21 @@ define void @test_la_tls_gd(i32 signext %n) nounwind {
 ; LA64LARGE-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGE-NEXT:    st.d $s1, $sp, 0 # 8-byte Folded Spill
 ; LA64LARGE-NEXT:    move $fp, $a0
+; LA64LARGE-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
+; LA64LARGE-NEXT:    addi.d $a1, $zero, %got_pc_lo12(gd)
+; LA64LARGE-NEXT:    lu32i.d $a1, %got64_pc_lo20(gd)
+; LA64LARGE-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(gd)
 ; LA64LARGE-NEXT:    move $s1, $zero
-; LA64LARGE-NEXT:    pcalau12i $s0, %gd_pc_hi20(gd)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(gd)
-; LA64LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(gd)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LA64LARGE-NEXT:    add.d $s0, $t8, $s0
+; LA64LARGE-NEXT:    add.d $s0, $a1, $a0
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB5_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64LARGE-NEXT:    move $a0, $s0
-; LA64LARGE-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGE-NEXT:    add.d $ra, $t8, $ra
+; LA64LARGE-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGE-NEXT:    add.d $ra, $ra, $a1
 ; LA64LARGE-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGE-NEXT:    ld.w $zero, $a0, 0
 ; LA64LARGE-NEXT:    addi.w $s1, $s1, 1
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
new file mode 100644
index 0000000000000..48d18dbedcaf2
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
@@ -0,0 +1,1107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 --mattr=+d --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefix=LA64 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d --verify-machineinstrs \
+; RUN:   --code-model=large < %s | FileCheck --check-prefix=LA64-LARGE %s
+
+@g_i8 = dso_local global i8 0
+
+define dso_local signext i8 @load_s8() nounwind {
+; LA32-LABEL: load_s8:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT:    ld.b $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_s8:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT:    ld.b $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_s8:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.b $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i8, ptr @g_i8
+  ret i8 %0
+}
+
+define dso_local zeroext i8 @load_u8() nounwind {
+; LA32-LABEL: load_u8:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT:    ld.bu $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_u8:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT:    ld.bu $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_u8:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.bu $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i8, ptr @g_i8
+  ret i8 %0
+}
+
+define dso_local void @store_i8() nounwind {
+; LA32-LABEL: store_i8:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.b $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_i8:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    st.b $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_i8:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    st.b $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i8 1, ptr @g_i8
+  ret void
+}
+
+@g_i16 = dso_local global i16 0
+
+define dso_local signext i16 @load_s16() nounwind {
+; LA32-LABEL: load_s16:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT:    ld.h $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_s16:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT:    ld.h $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_s16:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.h $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i16, ptr @g_i16
+  ret i16 %0
+}
+
+define dso_local zeroext i16 @load_u16() nounwind {
+; LA32-LABEL: load_u16:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT:    ld.hu $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_u16:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT:    ld.hu $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_u16:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.hu $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i16, ptr @g_i16
+  ret i16 %0
+}
+
+define dso_local void @store_i16() nounwind {
+; LA32-LABEL: store_i16:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.h $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_i16:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    st.h $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_i16:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    st.h $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i16 1, ptr @g_i16
+  ret void
+}
+
+@g_i32 = dso_local global i32 0
+
+define dso_local signext i32 @load_s32() nounwind {
+; LA32-LABEL: load_s32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_s32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT:    ld.w $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_s32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.w $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i32, ptr @g_i32
+  ret i32 %0
+}
+
+define dso_local zeroext i32 @load_u32() nounwind {
+; LA32-LABEL: load_u32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_u32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT:    ld.wu $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_u32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.wu $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i32, ptr @g_i32
+  ret i32 %0
+}
+
+define dso_local void @store_i32() nounwind {
+; LA32-LABEL: store_i32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_i32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    st.w $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_i32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    st.w $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i32 1, ptr @g_i32
+  ret void
+}
+
+@g_i64 = dso_local global i64 0
+
+define dso_local i64 @load_64() nounwind {
+; LA32-LABEL: load_64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA32-NEXT:    addi.w $a1, $a0, %pc_lo12(g_i64)
+; LA32-NEXT:    ld.w $a0, $a1, 0
+; LA32-NEXT:    ld.w $a1, $a1, 4
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i64)
+; LA64-NEXT:    ld.d $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_64:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.d $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i64, ptr @g_i64
+  ret i64 %0
+}
+
+define dso_local void @store_i64() nounwind {
+; LA32-LABEL: store_i64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_i64)
+; LA32-NEXT:    st.w $zero, $a0, 4
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_i64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_i64)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    st.d $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_i64:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_i64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_i64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_i64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    st.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i64 1, ptr @g_i64
+  ret void
+}
+
+@g_f32 = dso_local global float 0.0
+
+define dso_local float @load_f32() nounwind {
+; LA32-LABEL: load_f32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_f32)
+; LA32-NEXT:    fld.s $fa0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_f32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_f32)
+; LA64-NEXT:    fld.s $fa0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_f32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_f32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_f32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_f32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    fld.s $fa0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load float, ptr @g_f32
+  ret float %0
+}
+
+define dso_local void @store_f32() nounwind {
+; LA32-LABEL: store_f32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_f32)
+; LA32-NEXT:    lu12i.w $a1, 260096
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_f32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_f32)
+; LA64-NEXT:    lu12i.w $a1, 260096
+; LA64-NEXT:    st.w $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_f32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_f32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_f32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_f32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu12i.w $a1, 260096
+; LA64-LARGE-NEXT:    st.w $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store float 1.0, ptr @g_f32
+  ret void
+}
+
+@g_f64 = dso_local global double 0.0
+
+define dso_local double @load_f64() nounwind {
+; LA32-LABEL: load_f64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_f64)
+; LA32-NEXT:    fld.d $fa0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_f64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_f64)
+; LA64-NEXT:    fld.d $fa0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_f64:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_f64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_f64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_f64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    fld.d $fa0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load double, ptr @g_f64
+  ret double %0
+}
+
+define dso_local void @store_f64() nounwind {
+; LA32-LABEL: store_f64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_f64)
+; LA32-NEXT:    addi.w $a1, $zero, 1
+; LA32-NEXT:    movgr2fr.w $fa0, $a1
+; LA32-NEXT:    ffint.s.w $fa0, $fa0
+; LA32-NEXT:    fcvt.d.s $fa0, $fa0
+; LA32-NEXT:    fst.d $fa0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_f64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_f64)
+; LA64-NEXT:    lu52i.d $a1, $zero, 1023
+; LA64-NEXT:    st.d $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_f64:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_f64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_f64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_f64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu52i.d $a1, $zero, 1023
+; LA64-LARGE-NEXT:    st.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store double 1.0, ptr @g_f64
+  ret void
+}
+
+@g_m64 = dso_local global i64 0
+
+define dso_local void @store_multi() nounwind {
+; LA32-LABEL: store_multi:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_m64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_m64)
+; LA32-NEXT:    st.w $zero, $a0, 4
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    st.w $zero, $a0, 4
+; LA32-NEXT:    ori $a1, $zero, 2
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_multi:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_m64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_m64)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    st.d $a1, $a0, 0
+; LA64-NEXT:    ori $a1, $zero, 2
+; LA64-NEXT:    st.d $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_multi:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_m64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_m64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_m64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_m64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    st.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 2
+; LA64-LARGE-NEXT:    st.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  store volatile i64 1, ptr @g_m64
+  store volatile i64 2, ptr @g_m64
+  ret void
+}
+
+@g_sf32 = dso_local global float 0.0
+
+define dso_local void @store_sf32() nounwind {
+; LA32-LABEL: store_sf32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_sf32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_sf32)
+; LA32-NEXT:    fld.s $fa0, $a0, 0
+; LA32-NEXT:    fst.s $fa0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_sf32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_sf32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_sf32)
+; LA64-NEXT:    fld.s $fa0, $a0, 0
+; LA64-NEXT:    fst.s $fa0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_sf32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_sf32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_sf32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_sf32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_sf32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    fld.s $fa0, $a0, 0
+; LA64-LARGE-NEXT:    fst.s $fa0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load float, ptr @g_sf32
+  store volatile float %0, ptr @g_sf32
+  ret void
+}
+
+@g_sf64 = dso_local global double 0.0
+
+define dso_local void @store_sf64() nounwind {
+; LA32-LABEL: store_sf64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_sf64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_sf64)
+; LA32-NEXT:    fld.d $fa0, $a0, 0
+; LA32-NEXT:    fst.d $fa0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_sf64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_sf64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_sf64)
+; LA64-NEXT:    fld.d $fa0, $a0, 0
+; LA64-NEXT:    fst.d $fa0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_sf64:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_sf64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_sf64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_sf64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_sf64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    fld.d $fa0, $a0, 0
+; LA64-LARGE-NEXT:    fst.d $fa0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load double, ptr @g_sf64
+  store volatile double %0, ptr @g_sf64
+  ret void
+}
+
+@g_rmw = dso_local global i64 0
+
+define dso_local void @rmw() nounwind {
+; LA32-LABEL: rmw:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_rmw)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_rmw)
+; LA32-NEXT:    ld.w $a1, $a0, 0
+; LA32-NEXT:    ld.w $a2, $a0, 4
+; LA32-NEXT:    addi.w $a1, $a1, 1
+; LA32-NEXT:    sltui $a3, $a1, 1
+; LA32-NEXT:    add.w $a2, $a2, $a3
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: rmw:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_rmw)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_rmw)
+; LA64-NEXT:    ld.d $a1, $a0, 0
+; LA64-NEXT:    addi.d $a1, $a1, 1
+; LA64-NEXT:    st.d $a1, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: rmw:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_rmw)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_rmw)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_rmw)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_rmw)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    addi.d $a1, $a1, 1
+; LA64-LARGE-NEXT:    st.d $a1, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i64, ptr @g_rmw
+  %1 = add i64 %0, 1
+  store i64 %1, ptr @g_rmw
+  ret void
+}
+
+@g_a32 = dso_local global [2048 x i32] zeroinitializer, align 4
+
+define dso_local void @store_a32() nounwind {
+; LA32-LABEL: store_a32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT:    lu12i.w $a1, 1
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_a32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    stptr.w $a1, $a0, 4096
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_a32:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    stptr.w $a1, $a0, 4096
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4
+  ret void
+}
+
+define dso_local void @store_a32_2() nounwind {
+; LA32-LABEL: store_a32_2:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT:    lu12i.w $a1, 1
+; LA32-NEXT:    add.w $a2, $a0, $a1
+; LA32-NEXT:    ori $a3, $zero, 1
+; LA32-NEXT:    st.w $a3, $a2, 0
+; LA32-NEXT:    ori $a1, $a1, 8
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ori $a1, $zero, 2
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: store_a32_2:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    stptr.w $a1, $a0, 4096
+; LA64-NEXT:    ori $a1, $zero, 2
+; LA64-NEXT:    stptr.w $a1, $a0, 4104
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: store_a32_2:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    stptr.w $a1, $a0, 4096
+; LA64-LARGE-NEXT:    ori $a1, $zero, 2
+; LA64-LARGE-NEXT:    stptr.w $a1, $a0, 4104
+; LA64-LARGE-NEXT:    ret
+entry:
+  store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4
+  store i32 2, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1026), align 4
+  ret void
+}
+
+define dso_local void @control_flow_with_mem_access() nounwind {
+; LA32-LABEL: control_flow_with_mem_access:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT:    ld.w $a1, $a0, 4
+; LA32-NEXT:    ori $a2, $zero, 1
+; LA32-NEXT:    blt $a1, $a2, .LBB21_2
+; LA32-NEXT:  # %bb.1: # %if.then
+; LA32-NEXT:    ori $a1, $zero, 10
+; LA32-NEXT:    st.w $a1, $a0, 4
+; LA32-NEXT:  .LBB21_2: # %if.end
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: control_flow_with_mem_access:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT:    ld.w $a1, $a0, 4
+; LA64-NEXT:    ori $a2, $zero, 1
+; LA64-NEXT:    blt $a1, $a2, .LBB21_2
+; LA64-NEXT:  # %bb.1: # %if.then
+; LA64-NEXT:    ori $a1, $zero, 10
+; LA64-NEXT:    st.w $a1, $a0, 4
+; LA64-NEXT:  .LBB21_2: # %if.end
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: control_flow_with_mem_access:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.w $a0, $a0, 4
+; LA64-LARGE-NEXT:    ori $a1, $zero, 1
+; LA64-LARGE-NEXT:    blt $a0, $a1, .LBB21_2
+; LA64-LARGE-NEXT:  # %bb.1: # %if.then
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ori $a1, $zero, 10
+; LA64-LARGE-NEXT:    st.w $a1, $a0, 4
+; LA64-LARGE-NEXT:  .LBB21_2: # %if.end
+; LA64-LARGE-NEXT:    ret
+entry:
+  %0 = load i32, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1), align 4
+  %cmp = icmp sgt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 10, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1), align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define dso_local ptr @load_ba_1() nounwind {
+; LA32-LABEL: load_ba_1:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:  .Ltmp0: # Block address taken
+; LA32-NEXT:  # %bb.1: # %label
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(.Ltmp0)
+; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_ba_1:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:  .Ltmp0: # Block address taken
+; LA64-NEXT:  # %bb.1: # %label
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(.Ltmp0)
+; LA64-NEXT:    ld.d $a0, $a0, 0
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_ba_1:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:  .Ltmp0: # Block address taken
+; LA64-LARGE-NEXT:  # %bb.1: # %label
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(.Ltmp0)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(.Ltmp0)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Ltmp0)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.d $a0, $a0, 0
+; LA64-LARGE-NEXT:    ret
+entry:
+  br label %label
+label:
+  %0 = load ptr, ptr blockaddress(@load_ba_1, %label)
+  ret ptr %0
+}
+
+define dso_local ptr @load_ba_2() nounwind {
+; LA32-LABEL: load_ba_2:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:  .Ltmp1: # Block address taken
+; LA32-NEXT:  # %bb.1: # %label
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(.Ltmp1)
+; LA32-NEXT:    ld.w $a0, $a0, 8
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_ba_2:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:  .Ltmp1: # Block address taken
+; LA64-NEXT:  # %bb.1: # %label
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(.Ltmp1)
+; LA64-NEXT:    ld.d $a0, $a0, 8
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_ba_2:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:  .Ltmp1: # Block address taken
+; LA64-LARGE-NEXT:  # %bb.1: # %label
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(.Ltmp1)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(.Ltmp1)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    ld.d $a0, $a0, 8
+; LA64-LARGE-NEXT:    ret
+entry:
+  br label %label
+label:
+  %0 = load ptr, ptr getelementptr inbounds (i8, ptr blockaddress(@load_ba_2, %label), i32 8)
+  ret ptr %0
+}
+
+@g_a64 = dso_local global [614750729487779976 x i64] zeroinitializer, align 8
+
+define dso_local ptr @load_addr_offset_1() nounwind {
+; LA32-LABEL: load_addr_offset_1:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, 8
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_1:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, 8
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    addi.d $a0, $a0, 8
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1)
+}
+
+define dso_local ptr @load_addr_offset_257() nounwind {
+; LA32-LABEL: load_addr_offset_257:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, 2047
+; LA32-NEXT:    addi.w $a0, $a0, 9
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_257:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, 2047
+; LA64-NEXT:    addi.d $a0, $a0, 9
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_257:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    addi.d $a0, $a0, 2047
+; LA64-LARGE-NEXT:    addi.d $a0, $a0, 9
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257)
+}
+
+define dso_local ptr @load_addr_offset_1048576() nounwind {
+; LA32-LABEL: load_addr_offset_1048576:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 2048
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_1048576:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    addu16i.d $a0, $a0, 128
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1048576:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    addu16i.d $a0, $a0, 128
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576)
+}
+
+define dso_local ptr @load_addr_offset_1048577() nounwind {
+; LA32-LABEL: load_addr_offset_1048577:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 2048
+; LA32-NEXT:    ori $a1, $a1, 8
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_1048577:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    addu16i.d $a0, $a0, 128
+; LA64-NEXT:    addi.d $a0, $a0, 8
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1048577:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    addu16i.d $a0, $a0, 128
+; LA64-LARGE-NEXT:    addi.d $a0, $a0, 8
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577)
+}
+
+define dso_local ptr @load_addr_offset_268432896() nounwind {
+; LA32-LABEL: load_addr_offset_268432896:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 524283
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_268432896:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    lu12i.w $a1, 524283
+; LA64-NEXT:    add.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_268432896:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu12i.w $a1, 524283
+; LA64-LARGE-NEXT:    add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896)
+}
+
+define dso_local ptr @load_addr_offset_268432897() nounwind {
+; LA32-LABEL: load_addr_offset_268432897:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 524283
+; LA32-NEXT:    ori $a1, $a1, 8
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_268432897:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    lu12i.w $a1, 524283
+; LA64-NEXT:    ori $a1, $a1, 8
+; LA64-NEXT:    add.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_268432897:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu12i.w $a1, 524283
+; LA64-LARGE-NEXT:    ori $a1, $a1, 8
+; LA64-LARGE-NEXT:    add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897)
+}
+
+define dso_local ptr @load_addr_offset_9380351707272() nounwind {
+; LA32-LABEL: load_addr_offset_9380351707272:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 279556
+; LA32-NEXT:    ori $a1, $a1, 1088
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_9380351707272:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    lu12i.w $a1, 279556
+; LA64-NEXT:    ori $a1, $a1, 1088
+; LA64-NEXT:    lu32i.d $a1, 17472
+; LA64-NEXT:    add.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_9380351707272:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu12i.w $a1, 279556
+; LA64-LARGE-NEXT:    ori $a1, $a1, 1088
+; LA64-LARGE-NEXT:    lu32i.d $a1, 17472
+; LA64-LARGE-NEXT:    add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 9380351707272)
+}
+
+define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
+; LA32-LABEL: load_addr_offset_614750729487779976:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT:    lu12i.w $a1, 279556
+; LA32-NEXT:    ori $a1, $a1, 1088
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: load_addr_offset_614750729487779976:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT:    lu12i.w $a1, 279556
+; LA64-NEXT:    ori $a1, $a1, 1088
+; LA64-NEXT:    lu32i.d $a1, 17472
+; LA64-NEXT:    lu52i.d $a1, $a1, 1092
+; LA64-NEXT:    add.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_614750729487779976:
+; LA64-LARGE:       # %bb.0: # %entry
+; LA64-LARGE-NEXT:    pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT:    addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT:    lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT:    lu12i.w $a1, 279556
+; LA64-LARGE-NEXT:    ori $a1, $a1, 1088
+; LA64-LARGE-NEXT:    lu32i.d $a1, 17472
+; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, 1092
+; LA64-LARGE-NEXT:    add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT:    ret
+entry:
+  ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 614750729487779976)
+}
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index 4e5a5433596db..c5c5342e303c7 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -53,7 +53,6 @@
 ; LAXX-NEXT:       Expand memcmp() to load/stores
 ; LAXX-NEXT:       Lower Garbage Collection Instructions
 ; LAXX-NEXT:       Shadow Stack GC Lowering
-; LAXX-NEXT:       Lower constant intrinsics
 ; LAXX-NEXT:       Remove unreachable blocks from the CFG
 ; LAXX-NEXT:       Natural Loop Information
 ; LAXX-NEXT:       Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index 3e8ef59a75633..b03a523fb79a9 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --post-RA-scheduler=0 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --post-RA-scheduler=1 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --post-RA-scheduler=0 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --post-RA-scheduler=1 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
 
 @g = dso_local global i64 zeroinitializer, align 4
@@ -24,21 +24,25 @@ define void @foo() nounwind {
 ; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(.Lg$local)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
 ; MEDIUM_NO_SCH-NEXT:    ori $a0, $zero, 1
 ; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
-; MEDIUM_NO_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ld)
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
-; MEDIUM_NO_SCH-NEXT:    ld.d $a2, $a2, %ie_pc_lo12(ie)
-; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(gd)
+; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
+; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(ld)
+; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
+; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ie)
+; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
 ; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a1, $tp
-; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; MEDIUM_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM_NO_SCH-NEXT:    ret
@@ -50,21 +54,25 @@ define void @foo() nounwind {
 ; MEDIUM_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
 ; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(.Lg$local)
 ; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
 ; MEDIUM_SCH-NEXT:    ori $a0, $zero, 1
 ; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; MEDIUM_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
-; MEDIUM_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
-; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
-; MEDIUM_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ld)
-; MEDIUM_SCH-NEXT:    ld.d $a2, $a2, %ie_pc_lo12(ie)
-; MEDIUM_SCH-NEXT:    ldx.d $zero, $a0, $tp
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(gd)
+; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
+; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(ld)
+; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
+; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
+; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUM_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ie)
 ; MEDIUM_SCH-NEXT:    ldx.d $zero, $a1, $tp
-; MEDIUM_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; MEDIUM_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM_SCH-NEXT:    ret
@@ -74,42 +82,54 @@ define void @foo() nounwind {
 ; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, -16
 ; LARGE_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
-; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $a1, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
 ; LARGE_NO_SCH-NEXT:    ori $a0, $zero, 1
-; LARGE_NO_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
-; LARGE_NO_SCH-NEXT:    ldx.d $ra, $t8, $ra
+; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_NO_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_NO_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_NO_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_NO_SCH-NEXT:    ldx.d $ra, $ra, $a1
 ; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
-; LARGE_NO_SCH-NEXT:    ldx.d $a1, $t8, $a1
-; LARGE_NO_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_NO_SCH-NEXT:    ldx.d $a2, $t8, $a2
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    add.d $ra, $ra, $a1
+; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT:    add.d $ra, $ra, $a1
+; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
 ; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a0, $tp
-; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a1, $tp
-; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; LARGE_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, 16
 ; LARGE_NO_SCH-NEXT:    ret
@@ -119,42 +139,54 @@ define void @foo() nounwind {
 ; LARGE_SCH-NEXT:    addi.d $sp, $sp, -16
 ; LARGE_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
 ; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
-; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LARGE_SCH-NEXT:    add.d $a0, $a1, $a0
 ; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
 ; LARGE_SCH-NEXT:    ori $a0, $zero, 1
-; LARGE_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
-; LARGE_SCH-NEXT:    ldx.d $ra, $t8, $ra
+; LARGE_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_SCH-NEXT:    ldx.d $ra, $ra, $a1
 ; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
-; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
-; LARGE_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
-; LARGE_SCH-NEXT:    ldx.d $a1, $t8, $a1
-; LARGE_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_SCH-NEXT:    ldx.d $a2, $t8, $a2
+; LARGE_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(gd)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(gd)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(gd)
+; LARGE_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_SCH-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_SCH-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_SCH-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_SCH-NEXT:    add.d $ra, $ra, $a1
+; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_SCH-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_SCH-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_SCH-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_SCH-NEXT:    add.d $ra, $ra, $a1
+; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
 ; LARGE_SCH-NEXT:    ldx.d $zero, $a0, $tp
-; LARGE_SCH-NEXT:    ldx.d $zero, $a1, $tp
-; LARGE_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; LARGE_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_SCH-NEXT:    addi.d $sp, $sp, 16
 ; LARGE_SCH-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
index b2d46f5c088ba..774cf614f6099 100644
--- a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
+++ b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
@@ -5,15 +5,13 @@
 define signext i32 @rotl_32(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotl_32:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    ori $a2, $zero, 32
-; LA32-NEXT:    sub.w $a1, $a2, $a1
+; LA32-NEXT:    sub.w $a1, $zero, $a1
 ; LA32-NEXT:    rotr.w $a0, $a0, $a1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: rotl_32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a2, $zero, 32
-; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    sub.d $a1, $zero, $a1
 ; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %z = sub i32 32, %y
@@ -80,8 +78,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ;
 ; LA64-LABEL: rotl_64:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a2, $zero, 64
-; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    sub.d $a1, $zero, $a1
 ; LA64-NEXT:    rotr.d $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %z = sub i64 64, %y
@@ -149,8 +146,7 @@ define signext i32 @rotl_32_mask(i32 signext %x, i32 signext %y) nounwind {
 ;
 ; LA64-LABEL: rotl_32_mask:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a2, $zero, 32
-; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    sub.d $a1, $zero, $a1
 ; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %z = sub i32 0, %y
@@ -170,8 +166,7 @@ define signext i32 @rotl_32_mask_and_63_and_31(i32 signext %x, i32 signext %y) n
 ;
 ; LA64-LABEL: rotl_32_mask_and_63_and_31:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a2, $zero, 32
-; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    sub.d $a1, $zero, $a1
 ; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %a = and i32 %y, 63
@@ -192,8 +187,7 @@ define signext i32 @rotl_32_mask_or_64_or_32(i32 signext %x, i32 signext %y) nou
 ;
 ; LA64-LABEL: rotl_32_mask_or_64_or_32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a2, $zero, 32
-; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    sub.d $a1, $zero, $a1
 ; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %a = or i32 %y, 64
@@ -504,6 +498,42 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
   ret i64 %f
 }
 
+define signext i32 @rotr_64_trunc_32(i64 %x, i64 %y) nounwind {
+; LA32-LABEL: rotr_64_trunc_32:
+; LA32:       # %bb.0:
+; LA32-NEXT:    srl.w $a3, $a0, $a2
+; LA32-NEXT:    xori $a4, $a2, 31
+; LA32-NEXT:    slli.w $a5, $a1, 1
+; LA32-NEXT:    sll.w $a4, $a5, $a4
+; LA32-NEXT:    or $a3, $a3, $a4
+; LA32-NEXT:    addi.w $a4, $a2, -32
+; LA32-NEXT:    slti $a5, $a4, 0
+; LA32-NEXT:    maskeqz $a3, $a3, $a5
+; LA32-NEXT:    srl.w $a1, $a1, $a4
+; LA32-NEXT:    masknez $a1, $a1, $a5
+; LA32-NEXT:    or $a1, $a3, $a1
+; LA32-NEXT:    sub.w $a3, $zero, $a2
+; LA32-NEXT:    sll.w $a0, $a0, $a3
+; LA32-NEXT:    ori $a3, $zero, 32
+; LA32-NEXT:    sub.w $a2, $a3, $a2
+; LA32-NEXT:    srai.w $a2, $a2, 31
+; LA32-NEXT:    and $a0, $a2, $a0
+; LA32-NEXT:    or $a0, $a1, $a0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: rotr_64_trunc_32:
+; LA64:       # %bb.0:
+; LA64-NEXT:    rotr.d $a0, $a0, $a1
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    ret
+  %z = sub i64 64, %y
+  %b = lshr i64 %x, %y
+  %c = shl i64 %x, %z
+  %d = or i64 %b, %c
+  %e = trunc i64 %d to i32
+  ret i32 %e
+}
+
 define signext i32 @rotri_i32(i32 signext %a) nounwind {
 ; LA32-LABEL: rotri_i32:
 ; LA32:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index bb89794d1c843..4ac6201fdd9d4 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -51,15 +51,15 @@ define ptr @f1() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %gd_pc_hi20(unspecified)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(unspecified)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(unspecified)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(unspecified)
-; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
+; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(unspecified)
+; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(unspecified)
+; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(unspecified)
+; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -82,10 +82,10 @@ define ptr @f1() nounwind {
 ; LA64LARGENOPIC-LABEL: f1:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(unspecified)
-; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(unspecified)
-; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(unspecified)
-; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(unspecified)
+; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(unspecified)
+; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 ;
@@ -120,14 +120,13 @@ define ptr @f1() nounwind {
 ; DESC64-NEXT:    addi.d $sp, $sp, -16
 ; DESC64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; DESC64-NEXT:    pcalau12i $a0, %desc_pc_hi20(unspecified)
-; DESC64-NEXT:    addi.d $t8, $zero, %desc_pc_lo12(unspecified)
-; DESC64-NEXT:    lu32i.d $t8, %desc64_pc_lo20(unspecified)
-; DESC64-NEXT:    lu52i.d $t8, $t8, %desc64_pc_hi12(unspecified)
-; DESC64-NEXT:    add.d $a0, $t8, $a0
+; DESC64-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(unspecified)
+; DESC64-NEXT:    lu32i.d $a1, %desc64_pc_lo20(unspecified)
+; DESC64-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(unspecified)
+; DESC64-NEXT:    add.d $a0, $a0, $a1
 ; DESC64-NEXT:    ld.d $ra, $a0, %desc_ld(unspecified)
 ; DESC64-NEXT:    jirl $ra, $ra, %desc_call(unspecified)
-; DESC64-NEXT:    add.d $a1, $a0, $tp
-; DESC64-NEXT:    move $a0, $a1
+; DESC64-NEXT:    add.d $a0, $a0, $tp
 ; DESC64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; DESC64-NEXT:    addi.d $sp, $sp, 16
 ; DESC64-NEXT:    ret
@@ -165,15 +164,15 @@ define ptr @f2() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
+; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
+; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
+; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -196,10 +195,10 @@ define ptr @f2() nounwind {
 ; LA64LARGENOPIC-LABEL: f2:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
-; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
-; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
-; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
+; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
+; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 ;
@@ -234,14 +233,13 @@ define ptr @f2() nounwind {
 ; DESC64-NEXT:    addi.d $sp, $sp, -16
 ; DESC64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; DESC64-NEXT:    pcalau12i $a0, %desc_pc_hi20(ld)
-; DESC64-NEXT:    addi.d $t8, $zero, %desc_pc_lo12(ld)
-; DESC64-NEXT:    lu32i.d $t8, %desc64_pc_lo20(ld)
-; DESC64-NEXT:    lu52i.d $t8, $t8, %desc64_pc_hi12(ld)
-; DESC64-NEXT:    add.d $a0, $t8, $a0
+; DESC64-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(ld)
+; DESC64-NEXT:    lu32i.d $a1, %desc64_pc_lo20(ld)
+; DESC64-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(ld)
+; DESC64-NEXT:    add.d $a0, $a0, $a1
 ; DESC64-NEXT:    ld.d $ra, $a0, %desc_ld(ld)
 ; DESC64-NEXT:    jirl $ra, $ra, %desc_call(ld)
-; DESC64-NEXT:    add.d $a1, $a0, $tp
-; DESC64-NEXT:    move $a0, $a1
+; DESC64-NEXT:    add.d $a0, $a0, $tp
 ; DESC64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; DESC64-NEXT:    addi.d $sp, $sp, 16
 ; DESC64-NEXT:    ret
@@ -269,10 +267,10 @@ define ptr @f3() nounwind {
 ; LA64LARGEPIC-LABEL: f3:
 ; LA64LARGEPIC:       # %bb.0: # %entry
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGEPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LA64LARGEPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGEPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGEPIC-NEXT:    ret
 ;
@@ -293,10 +291,10 @@ define ptr @f3() nounwind {
 ; LA64LARGENOPIC-LABEL: f3:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 ;
@@ -317,10 +315,10 @@ define ptr @f3() nounwind {
 ; DESC64-LABEL: f3:
 ; DESC64:       # %bb.0: # %entry
 ; DESC64-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; DESC64-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
-; DESC64-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
-; DESC64-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; DESC64-NEXT:    ldx.d $a0, $t8, $a0
+; DESC64-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; DESC64-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; DESC64-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; DESC64-NEXT:    ldx.d $a0, $a1, $a0
 ; DESC64-NEXT:    add.d $a0, $a0, $tp
 ; DESC64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/ucmp.ll b/llvm/test/CodeGen/LoongArch/ucmp.ll
index 548c5bd0db72b..b91d3bf15d812 100644
--- a/llvm/test/CodeGen/LoongArch/ucmp.ll
+++ b/llvm/test/CodeGen/LoongArch/ucmp.ll
@@ -26,8 +26,8 @@ define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
 define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ucmp.8.32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    addi.w $a1, $a1, 0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    sltu $a2, $a0, $a1
 ; CHECK-NEXT:    sltu $a0, $a1, $a0
 ; CHECK-NEXT:    sub.d $a0, $a0, $a2
@@ -71,8 +71,8 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ucmp.32.32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    addi.w $a1, $a1, 0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    sltu $a2, $a0, $a1
 ; CHECK-NEXT:    sltu $a0, $a1, $a0
 ; CHECK-NEXT:    sub.d $a0, $a0, $a2
diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
index 37afe7e3ed2ac..ad48778d2d0ba 100644
--- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
+++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
@@ -94,4 +94,50 @@ entry:
   ret void
 }
 
+@.str = private constant [22 x i8] c"preemption imbalance \00", align 1
+
+define void @t3() {
+; LA32-LABEL: t3:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    .cfi_def_cfa_offset 64
+; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.L.str)
+; LA32-NEXT:    addi.w $a0, $a0, %pc_lo12(.L.str)
+; LA32-NEXT:    ld.h $a1, $a0, 20
+; LA32-NEXT:    ld.w $a2, $a0, 16
+; LA32-NEXT:    st.h $a1, $sp, 20
+; LA32-NEXT:    st.w $a2, $sp, 16
+; LA32-NEXT:    ld.w $a1, $a0, 12
+; LA32-NEXT:    ld.w $a2, $a0, 8
+; LA32-NEXT:    ld.w $a3, $a0, 4
+; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    st.w $a1, $sp, 12
+; LA32-NEXT:    st.w $a2, $sp, 8
+; LA32-NEXT:    st.w $a3, $sp, 4
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: t3:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -64
+; LA64-NEXT:    .cfi_def_cfa_offset 64
+; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.L.str)
+; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(.L.str)
+; LA64-NEXT:    ld.h $a1, $a0, 20
+; LA64-NEXT:    ld.w $a2, $a0, 16
+; LA64-NEXT:    ld.d $a3, $a0, 8
+; LA64-NEXT:    ld.d $a0, $a0, 0
+; LA64-NEXT:    st.h $a1, $sp, 20
+; LA64-NEXT:    st.w $a2, $sp, 16
+; LA64-NEXT:    st.d $a3, $sp, 8
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    ret
+entry:
+  %msgbuf = alloca [64 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %msgbuf, ptr align 1 @.str, i64 22, i1 false)
+  ret void
+}
+
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll
index e7d0b20640f7a..0481d5c18e0c4 100644
--- a/llvm/test/CodeGen/M68k/pipeline.ll
+++ b/llvm/test/CodeGen/M68k/pipeline.ll
@@ -24,7 +24,6 @@
 ; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
-; CHECK-NEXT:      Lower constant intrinsics
 ; CHECK-NEXT:      Remove unreachable blocks from the CFG
 ; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/NVPTX/addr-mode.ll b/llvm/test/CodeGen/NVPTX/addr-mode.ll
new file mode 100644
index 0000000000000..a6a085c0e2e33
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/addr-mode.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_addr_mode_i64(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i64_param_0];
+; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %addr = getelementptr i32, ptr %x, i64 -1
+  %res = load i32, ptr %addr
+  ret i32 %res
+}
+
+define i32 @test_addr_mode_i32(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i32_param_0];
+; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %addr = getelementptr i32, ptr %x, i32 -1
+  %res = load i32, ptr %addr
+  ret i32 %res
+}
+
+define i32 @test_addr_mode_i16(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i16_param_0];
+; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %addr = getelementptr i32, ptr %x, i16 -1
+  %res = load i32, ptr %addr
+  ret i32 %res
+}
+
+define i32 @test_addr_mode_i8(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i8_param_0];
+; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %addr = getelementptr i32, ptr %x, i8 -1
+  %res = load i32, ptr %addr
+  ret i32 %res
+}
+
+define i32 @test_addr_mode_i64_large(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i64_large(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0];
+; CHECK-NEXT:    add.s64 %rd2, %rd1, 17179869172;
+; CHECK-NEXT:    ld.u32 %r1, [%rd2];
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %addr = getelementptr i32, ptr %x, i64 4294967293
+  %res = load i32, ptr %addr
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
new file mode 100644
index 0000000000000..d3aace95e9665
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s  --check-prefix=SM70
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+
+; CHECK-LABEL: fence_sc_sys
+define void @fence_sc_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.sc.sys
+  fence seq_cst
+  ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_sys
+define void @fence_acq_rel_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys
+  fence acq_rel
+  ret void  
+}
+
+; CHECK-LABEL: fence_release_sys
+define void @fence_release_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys  
+  fence release
+  ret void  
+}
+
+; CHECK-LABEL: fence_acquire_sys
+define void @fence_acquire_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys  
+  fence acquire
+  ret void  
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
new file mode 100644
index 0000000000000..b775e40470047
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\{\}\(\)]" --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
+; CHECK-LABEL: foo(
+; CHECK:    .reg .b16 %rs<2>;
+; CHECK:    .reg .b32 %r<4>;
+; CHECK:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK:    ld.param.u64 %rd1, [foo_param_0];
+; CHECK:    ld.param.u64 %rd2, [foo_param_1];
+; CHECK:    cvta.to.global.u64 %rd3, %rd2;
+; CHECK:    cvta.to.global.u64 %rd4, %rd1;
+; CHECK:    ld.global.nc.u8 %rs1, [%rd4];
+; CHECK:    cvt.u32.u8 %r1, %rs1;
+; CHECK:    add.s32 %r2, %r1, 1;
+; CHECK:    and.b32 %r3, %r2, 1;
+; CHECK:    st.global.u32 [%rd3], %r3;
+; CHECK:    ret;
+  %ld = load i1, ptr %ptr, align 1
+  %zext = zext i1 %ld to i32
+  %add = add i32 %zext, 1
+  %and = and i32 %add, 1
+  store i32 %and, ptr %retval
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+
+!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
new file mode 100644
index 0000000000000..d1f99b5724de8
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\(\)\{\}]" --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+@i1g = addrspace(1) global i1 false, align 2
+
+define void @foo() {
+; CHECK-LABEL: foo(
+; CHECK:    .reg .pred %p<2>;
+; CHECK:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK:    ld.global.u8 %rs1, [i1g];
+; CHECK:    and.b16 %rs2, %rs1, 1;
+; CHECK:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK:    @%p1 bra $L__BB0_2;
+; CHECK:    mov.u16 %rs3, 1;
+; CHECK:    st.global.u8 [i1g], %rs3;
+; CHECK:    ret;
+  %tmp = load i1, ptr addrspace(1) @i1g, align 2
+  br i1 %tmp, label %if.end, label %if.then
+
+if.then:
+  store i1 true, ptr addrspace(1) @i1g, align 2
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 4449e4f2ea4ed..396c29512933c 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -7,20 +7,20 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<19>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<127>;
+; CHECK-NEXT:    .reg .b64 %rd<129>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd46, 63;
-; CHECK-NEXT:    mov.u64 %rd117, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd52, %rd117, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd53, %rd117, %rd46;
+; CHECK-NEXT:    mov.u64 %rd119, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd52, %rd119, %rd45;
+; CHECK-NEXT:    subc.cc.s64 %rd53, %rd119, %rd46;
 ; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
 ; CHECK-NEXT:    selp.b64 %rd4, %rd53, %rd46, %p1;
 ; CHECK-NEXT:    selp.b64 %rd3, %rd52, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd54, %rd117, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd55, %rd117, %rd50;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd119, %rd49;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd119, %rd50;
 ; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
 ; CHECK-NEXT:    selp.b64 %rd6, %rd55, %rd50, %p2;
 ; CHECK-NEXT:    selp.b64 %rd5, %rd54, %rd49, %p2;
@@ -43,109 +43,109 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
 ; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
 ; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    sub.cc.s64 %rd7, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd117, 0;
-; CHECK-NEXT:    setp.eq.s64 %p8, %rd8, 0;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd8, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd66, %rd61, %rd65;
+; CHECK-NEXT:    subc.cc.s64 %rd67, %rd119, 0;
+; CHECK-NEXT:    setp.eq.s64 %p8, %rd67, 0;
+; CHECK-NEXT:    setp.ne.s64 %p9, %rd67, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p9;
-; CHECK-NEXT:    setp.gt.u64 %p10, %rd7, 127;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd66, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p10;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p8;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p11, %r8, 1;
 ; CHECK-NEXT:    or.pred %p12, %p5, %p11;
-; CHECK-NEXT:    xor.b64 %rd66, %rd7, 127;
-; CHECK-NEXT:    or.b64 %rd67, %rd66, %rd8;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd67, 0;
-; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p12;
-; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p12;
+; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd69, 0;
+; CHECK-NEXT:    selp.b64 %rd128, 0, %rd4, %p12;
+; CHECK-NEXT:    selp.b64 %rd127, 0, %rd3, %p12;
 ; CHECK-NEXT:    or.pred %p14, %p12, %p13;
 ; CHECK-NEXT:    @%p14 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd7, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd70, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.s64 %p15, %rd70, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd7;
+; CHECK-NEXT:    add.cc.s64 %rd121, %rd66, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd122, %rd67, 0;
+; CHECK-NEXT:    or.b64 %rd72, %rd121, %rd122;
+; CHECK-NEXT:    setp.eq.s64 %p15, %rd72, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd66;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd71, %rd4, %r11;
+; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd72, %rd3, %r13;
-; CHECK-NEXT:    or.b64 %rd73, %rd71, %rd72;
+; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r13;
+; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd74, %rd3, %r15;
+; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd124, %rd74, %rd73, %p16;
-; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r11;
-; CHECK-NEXT:    mov.u64 %rd114, %rd117;
+; CHECK-NEXT:    selp.b64 %rd126, %rd76, %rd75, %p16;
+; CHECK-NEXT:    shl.b64 %rd125, %rd3, %r11;
+; CHECK-NEXT:    mov.u64 %rd116, %rd119;
 ; CHECK-NEXT:    @%p15 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd119;
-; CHECK-NEXT:    shr.u64 %rd77, %rd3, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd121;
+; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd78, %rd4, %r18;
-; CHECK-NEXT:    or.b64 %rd79, %rd77, %rd78;
+; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r18;
+; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd80, %rd4, %r19;
+; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd121, %rd80, %rd79, %p17;
-; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r16;
+; CHECK-NEXT:    selp.b64 %rd123, %rd82, %rd81, %p17;
+; CHECK-NEXT:    shr.u64 %rd124, %rd4, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.u64 %rd114, 0;
-; CHECK-NEXT:    mov.u64 %rd117, %rd114;
+; CHECK-NEXT:    mov.u64 %rd116, 0;
+; CHECK-NEXT:    mov.u64 %rd119, %rd116;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd81, %rd121, 63;
-; CHECK-NEXT:    shl.b64 %rd82, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd83, %rd82, %rd81;
-; CHECK-NEXT:    shl.b64 %rd84, %rd121, 1;
-; CHECK-NEXT:    shr.u64 %rd85, %rd124, 63;
-; CHECK-NEXT:    or.b64 %rd86, %rd84, %rd85;
-; CHECK-NEXT:    shr.u64 %rd87, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd90;
-; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd89;
-; CHECK-NEXT:    sub.cc.s64 %rd91, %rd35, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd92, %rd36, %rd83;
-; CHECK-NEXT:    shr.s64 %rd93, %rd92, 63;
-; CHECK-NEXT:    and.b64 %rd117, %rd93, 1;
-; CHECK-NEXT:    and.b64 %rd94, %rd93, %rd5;
-; CHECK-NEXT:    and.b64 %rd95, %rd93, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd121, %rd86, %rd94;
-; CHECK-NEXT:    subc.cc.s64 %rd122, %rd83, %rd95;
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
-; CHECK-NEXT:    or.b64 %rd96, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.s64 %p18, %rd96, 0;
+; CHECK-NEXT:    shr.u64 %rd83, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd84, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT:    shl.b64 %rd86, %rd123, 1;
+; CHECK-NEXT:    shr.u64 %rd87, %rd126, 63;
+; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT:    shr.u64 %rd89, %rd125, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd126, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd125, 1;
+; CHECK-NEXT:    or.b64 %rd125, %rd119, %rd92;
+; CHECK-NEXT:    or.b64 %rd126, %rd116, %rd91;
+; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT:    and.b64 %rd119, %rd95, 1;
+; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd123, %rd88, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd124, %rd85, %rd97;
+; CHECK-NEXT:    add.cc.s64 %rd121, %rd121, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd122, %rd122, -1;
+; CHECK-NEXT:    or.b64 %rd98, %rd121, %rd122;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd98, 0;
 ; CHECK-NEXT:    @%p18 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd97, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd98, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd99, %rd98, %rd97;
-; CHECK-NEXT:    shl.b64 %rd100, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd100;
-; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd99;
+; CHECK-NEXT:    shr.u64 %rd99, %rd125, 63;
+; CHECK-NEXT:    shl.b64 %rd100, %rd126, 1;
+; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT:    shl.b64 %rd102, %rd125, 1;
+; CHECK-NEXT:    or.b64 %rd127, %rd119, %rd102;
+; CHECK-NEXT:    or.b64 %rd128, %rd116, %rd101;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd101, %rd5, %rd125;
-; CHECK-NEXT:    mul.lo.s64 %rd102, %rd5, %rd126;
-; CHECK-NEXT:    add.s64 %rd103, %rd101, %rd102;
-; CHECK-NEXT:    mul.lo.s64 %rd104, %rd6, %rd125;
+; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd127;
+; CHECK-NEXT:    mul.lo.s64 %rd104, %rd5, %rd128;
 ; CHECK-NEXT:    add.s64 %rd105, %rd103, %rd104;
-; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
-; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd111, %rd112};
+; CHECK-NEXT:    mul.lo.s64 %rd106, %rd6, %rd127;
+; CHECK-NEXT:    add.s64 %rd107, %rd105, %rd106;
+; CHECK-NEXT:    mul.lo.s64 %rd108, %rd5, %rd127;
+; CHECK-NEXT:    sub.cc.s64 %rd109, %rd3, %rd108;
+; CHECK-NEXT:    subc.cc.s64 %rd110, %rd4, %rd107;
+; CHECK-NEXT:    xor.b64 %rd111, %rd109, %rd2;
+; CHECK-NEXT:    xor.b64 %rd112, %rd110, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd113, %rd111, %rd2;
+; CHECK-NEXT:    subc.cc.s64 %rd114, %rd112, %rd2;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd113, %rd114};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -156,7 +156,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<17>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<113>;
+; CHECK-NEXT:    .reg .b64 %rd<115>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -180,106 +180,106 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.u64 %rd103, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd103, 0;
-; CHECK-NEXT:    setp.eq.s64 %p6, %rd6, 0;
-; CHECK-NEXT:    setp.ne.s64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.u64 %rd105, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd105, 0;
+; CHECK-NEXT:    setp.eq.s64 %p6, %rd57, 0;
+; CHECK-NEXT:    setp.ne.s64 %p7, %rd57, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p7;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd5, 127;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd56, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p8;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p6;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p9, %r8, 1;
 ; CHECK-NEXT:    or.pred %p10, %p3, %p9;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.s64 %p11, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p10;
-; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p10;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd114, 0, %rd42, %p10;
+; CHECK-NEXT:    selp.b64 %rd113, 0, %rd41, %p10;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    @%p12 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd107, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd108, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd107, %rd108;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd56;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r11;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r13;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r13;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r15;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p14, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd110, %rd64, %rd63, %p14;
-; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r11;
-; CHECK-NEXT:    mov.u64 %rd100, %rd103;
+; CHECK-NEXT:    selp.b64 %rd112, %rd66, %rd65, %p14;
+; CHECK-NEXT:    shl.b64 %rd111, %rd41, %r11;
+; CHECK-NEXT:    mov.u64 %rd102, %rd105;
 ; CHECK-NEXT:    @%p13 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd105;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd107;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r18;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r18;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r19;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd107, %rd70, %rd69, %p15;
-; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r16;
+; CHECK-NEXT:    selp.b64 %rd109, %rd72, %rd71, %p15;
+; CHECK-NEXT:    shr.u64 %rd110, %rd42, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.u64 %rd100, 0;
-; CHECK-NEXT:    mov.u64 %rd103, %rd100;
+; CHECK-NEXT:    mov.u64 %rd102, 0;
+; CHECK-NEXT:    mov.u64 %rd105, %rd102;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd107, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd110, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd80;
-; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd103, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd109, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd112, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd111, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd112, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd111, 1;
+; CHECK-NEXT:    or.b64 %rd111, %rd105, %rd82;
+; CHECK-NEXT:    or.b64 %rd112, %rd102, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd105, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd109, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd110, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd107, %rd107, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd108, %rd108, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd107, %rd108;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd88, 0;
 ; CHECK-NEXT:    @%p16 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd90;
-; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd111, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd112, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd111, 1;
+; CHECK-NEXT:    or.b64 %rd113, %rd105, %rd92;
+; CHECK-NEXT:    or.b64 %rd114, %rd102, %rd91;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd91, %rd3, %rd111;
-; CHECK-NEXT:    mul.lo.s64 %rd92, %rd3, %rd112;
-; CHECK-NEXT:    add.s64 %rd93, %rd91, %rd92;
-; CHECK-NEXT:    mul.lo.s64 %rd94, %rd4, %rd111;
+; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd113;
+; CHECK-NEXT:    mul.lo.s64 %rd94, %rd3, %rd114;
 ; CHECK-NEXT:    add.s64 %rd95, %rd93, %rd94;
-; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
-; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd97, %rd98};
+; CHECK-NEXT:    mul.lo.s64 %rd96, %rd4, %rd113;
+; CHECK-NEXT:    add.s64 %rd97, %rd95, %rd96;
+; CHECK-NEXT:    mul.lo.s64 %rd98, %rd3, %rd113;
+; CHECK-NEXT:    sub.cc.s64 %rd99, %rd41, %rd98;
+; CHECK-NEXT:    subc.cc.s64 %rd100, %rd42, %rd97;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd99, %rd100};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -325,19 +325,19 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<19>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<120>;
+; CHECK-NEXT:    .reg .b64 %rd<122>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
-; CHECK-NEXT:    mov.u64 %rd110, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd52, %rd110, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd53, %rd110, %rd46;
+; CHECK-NEXT:    mov.u64 %rd112, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd52, %rd112, %rd45;
+; CHECK-NEXT:    subc.cc.s64 %rd53, %rd112, %rd46;
 ; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
 ; CHECK-NEXT:    selp.b64 %rd2, %rd53, %rd46, %p1;
 ; CHECK-NEXT:    selp.b64 %rd1, %rd52, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd54, %rd110, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd55, %rd110, %rd50;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd112, %rd49;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd112, %rd50;
 ; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
 ; CHECK-NEXT:    selp.b64 %rd4, %rd55, %rd50, %p2;
 ; CHECK-NEXT:    selp.b64 %rd3, %rd54, %rd49, %p2;
@@ -362,101 +362,101 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd64, %r4;
 ; CHECK-NEXT:    add.s64 %rd65, %rd64, 64;
 ; CHECK-NEXT:    selp.b64 %rd66, %rd63, %rd65, %p7;
-; CHECK-NEXT:    sub.cc.s64 %rd7, %rd62, %rd66;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd110, 0;
-; CHECK-NEXT:    setp.eq.s64 %p8, %rd8, 0;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd8, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd67, %rd62, %rd66;
+; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT:    setp.eq.s64 %p8, %rd68, 0;
+; CHECK-NEXT:    setp.ne.s64 %p9, %rd68, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p9;
-; CHECK-NEXT:    setp.gt.u64 %p10, %rd7, 127;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd67, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p10;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p8;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p11, %r8, 1;
 ; CHECK-NEXT:    or.pred %p12, %p5, %p11;
-; CHECK-NEXT:    xor.b64 %rd67, %rd7, 127;
-; CHECK-NEXT:    or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd68, 0;
-; CHECK-NEXT:    selp.b64 %rd119, 0, %rd2, %p12;
-; CHECK-NEXT:    selp.b64 %rd118, 0, %rd1, %p12;
+; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd70, 0;
+; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p12;
+; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p12;
 ; CHECK-NEXT:    or.pred %p14, %p12, %p13;
 ; CHECK-NEXT:    @%p14 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd112, %rd7, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd113, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd71, %rd112, %rd113;
-; CHECK-NEXT:    setp.eq.s64 %p15, %rd71, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd7;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.s64 %p15, %rd73, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd67;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd72, %rd2, %r11;
+; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd73, %rd1, %r13;
-; CHECK-NEXT:    or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r13;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd75, %rd1, %r15;
+; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd117, %rd75, %rd74, %p16;
-; CHECK-NEXT:    shl.b64 %rd116, %rd1, %r11;
-; CHECK-NEXT:    mov.u64 %rd107, %rd110;
+; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p16;
+; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r11;
+; CHECK-NEXT:    mov.u64 %rd109, %rd112;
 ; CHECK-NEXT:    @%p15 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd112;
-; CHECK-NEXT:    shr.u64 %rd78, %rd1, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd114;
+; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd79, %rd2, %r18;
-; CHECK-NEXT:    or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r18;
+; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd81, %rd2, %r19;
+; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd114, %rd81, %rd80, %p17;
-; CHECK-NEXT:    shr.u64 %rd115, %rd2, %r16;
+; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p17;
+; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.u64 %rd107, 0;
-; CHECK-NEXT:    mov.u64 %rd110, %rd107;
+; CHECK-NEXT:    mov.u64 %rd109, 0;
+; CHECK-NEXT:    mov.u64 %rd112, %rd109;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd82, %rd114, 63;
-; CHECK-NEXT:    shl.b64 %rd83, %rd115, 1;
-; CHECK-NEXT:    or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT:    shl.b64 %rd85, %rd114, 1;
-; CHECK-NEXT:    shr.u64 %rd86, %rd117, 63;
-; CHECK-NEXT:    or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT:    shr.u64 %rd88, %rd116, 63;
-; CHECK-NEXT:    shl.b64 %rd89, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT:    shl.b64 %rd91, %rd116, 1;
-; CHECK-NEXT:    or.b64 %rd116, %rd110, %rd91;
-; CHECK-NEXT:    or.b64 %rd117, %rd107, %rd90;
-; CHECK-NEXT:    sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT:    subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT:    shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT:    and.b64 %rd110, %rd94, 1;
-; CHECK-NEXT:    and.b64 %rd95, %rd94, %rd3;
-; CHECK-NEXT:    and.b64 %rd96, %rd94, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd114, %rd87, %rd95;
-; CHECK-NEXT:    subc.cc.s64 %rd115, %rd84, %rd96;
-; CHECK-NEXT:    add.cc.s64 %rd112, %rd112, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT:    or.b64 %rd97, %rd112, %rd113;
-; CHECK-NEXT:    setp.eq.s64 %p18, %rd97, 0;
+; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd99, 0;
 ; CHECK-NEXT:    @%p18 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd98, %rd116, 63;
-; CHECK-NEXT:    shl.b64 %rd99, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT:    shl.b64 %rd101, %rd116, 1;
-; CHECK-NEXT:    or.b64 %rd118, %rd110, %rd101;
-; CHECK-NEXT:    or.b64 %rd119, %rd107, %rd100;
+; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd102, %rd118, %rd5;
-; CHECK-NEXT:    xor.b64 %rd103, %rd119, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd104, %rd102, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd104, %rd105};
+; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
+; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd106, %rd107};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -467,7 +467,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<17>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<105>;
+; CHECK-NEXT:    .reg .b64 %rd<107>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -491,98 +491,98 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.u64 %rd95, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT:    setp.eq.s64 %p6, %rd6, 0;
-; CHECK-NEXT:    setp.ne.s64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.u64 %rd97, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT:    setp.eq.s64 %p6, %rd57, 0;
+; CHECK-NEXT:    setp.ne.s64 %p7, %rd57, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p7;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd5, 127;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd56, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p8;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p6;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p9, %r8, 1;
 ; CHECK-NEXT:    or.pred %p10, %p3, %p9;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.s64 %p11, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd104, 0, %rd42, %p10;
-; CHECK-NEXT:    selp.b64 %rd103, 0, %rd41, %p10;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p10;
+; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p10;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    @%p12 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd56;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r11;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r13;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r13;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r15;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p14, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd102, %rd64, %rd63, %p14;
-; CHECK-NEXT:    shl.b64 %rd101, %rd41, %r11;
-; CHECK-NEXT:    mov.u64 %rd92, %rd95;
+; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p14;
+; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r11;
+; CHECK-NEXT:    mov.u64 %rd94, %rd97;
 ; CHECK-NEXT:    @%p13 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd97;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd99;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r18;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r18;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r19;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd99, %rd70, %rd69, %p15;
-; CHECK-NEXT:    shr.u64 %rd100, %rd42, %r16;
+; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p15;
+; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.u64 %rd92, 0;
-; CHECK-NEXT:    mov.u64 %rd95, %rd92;
+; CHECK-NEXT:    mov.u64 %rd94, 0;
+; CHECK-NEXT:    mov.u64 %rd97, %rd94;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT:    or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd88, 0;
 ; CHECK-NEXT:    @%p16 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT:    or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd103, %rd104};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd105, %rd106};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 0000000000000..ac6c4e262fd60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 1 .b8 param0[1];
+; CHECK-NEXT:    st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_0;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.S, align 1
+  %agg.tmp = alloca %struct.S, align 1
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+  ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK:         // @bar
+; CHECK-NEXT:  {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_1;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.U, align 8
+  %agg.tmp = alloca %struct.U, align 8
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+  ret i32 %call
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 7cdced1778a53..68915b0f2698b 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -83,6 +83,47 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
   ret void
 }
 
+; CHECK-LABEL: generic_unordered
+define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: generic_monotonic
 define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -165,6 +206,47 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
   ret void
 }
 
+; CHECK-LABEL: generic_unordered_volatile
+define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: generic_monotonic_volatile
 define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -290,6 +372,47 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs
   ret void
 }
 
+; CHECK-LABEL: global_unordered
+define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: global_monotonic
 define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -331,6 +454,47 @@ define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr
   ret void
 }
 
+; CHECK-LABEL: global_unordered_volatile
+define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: global_monotonic_volatile
 define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -538,6 +702,88 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
   ret void
 }
 
+; CHECK-LABEL: shared_unordered
+define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile
+define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: shared_monotonic
 define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -786,6 +1032,88 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp
   ret void
 }
 
+; CHECK-LABEL: local_unordered
+define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile
+define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
 ; CHECK-LABEL: local_monotonic
 define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
   ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index 27065f5eca9f4..4c5e0920ce1ae 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -167,6 +167,88 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e)
   ret void
 }
 
+; CHECK-LABEL: generic_unordered
+define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile
+define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
 ;; global statespace
 
 ; CHECK-LABEL: global_plain
@@ -333,6 +415,88 @@ define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b,
   ret void
 }
 
+; CHECK-LABEL: global_unordered
+define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile
+define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
 ;; shared statespace
 
 ; CHECK-LABEL: shared_plain
@@ -499,6 +663,88 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b,
   ret void
 }
 
+; CHECK-LABEL: shared_unordered
+define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile
+define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
 ;; local statespace
 
 ; CHECK-LABEL: local_plain
@@ -664,3 +910,85 @@ define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b,
 
   ret void
 }
+
+; CHECK-LABEL: local_unordered
+define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile
+define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index afa7fde6c842b..59e755aee215c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -154,7 +154,7 @@ entry:
 ; IR-LABEL:   @memmove_caller
 ; IR:         icmp ult ptr %src, %dst
 ; IR:         [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
-; IR-NEXT:    %index_ptr = sub i64 [[PHIVAL]], 1
+; IR-NEXT:    %bwd_main_index = sub i64 [[PHIVAL]], 1
 ; IR:         [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
 ; IR:         {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1
 
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll
index 34b9c08509326..3e6477c199577 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll
@@ -6,40 +6,16 @@ declare i32 @llvm.nvvm.abs.bf16x2(i32)
 declare i16 @llvm.nvvm.neg.bf16(i16)
 declare i32 @llvm.nvvm.neg.bf16x2(i32)
 
-declare float @llvm.nvvm.fmin.nan.f(float, float)
-declare float @llvm.nvvm.fmin.ftz.nan.f(float, float)
-declare half @llvm.nvvm.fmin.f16(half, half)
-declare half @llvm.nvvm.fmin.ftz.f16(half, half)
-declare half @llvm.nvvm.fmin.nan.f16(half, half)
-declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half)
-declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>)
 declare i16 @llvm.nvvm.fmin.bf16(i16, i16)
 declare i16 @llvm.nvvm.fmin.nan.bf16(i16, i16)
 declare i32 @llvm.nvvm.fmin.bf16x2(i32, i32)
 declare i32 @llvm.nvvm.fmin.nan.bf16x2(i32, i32)
 
-declare float @llvm.nvvm.fmax.nan.f(float, float)
-declare float @llvm.nvvm.fmax.ftz.nan.f(float, float)
-declare half @llvm.nvvm.fmax.f16(half, half)
-declare half @llvm.nvvm.fmax.ftz.f16(half, half)
-declare half @llvm.nvvm.fmax.nan.f16(half, half)
-declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half)
-declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>)
 declare i16 @llvm.nvvm.fmax.bf16(i16, i16)
 declare i16 @llvm.nvvm.fmax.nan.bf16(i16, i16)
 declare i32 @llvm.nvvm.fmax.bf16x2(i32, i32)
 declare i32 @llvm.nvvm.fmax.nan.bf16x2(i32, i32)
 
-declare half @llvm.nvvm.fma.rn.relu.f16(half, half, half)
-declare half @llvm.nvvm.fma.rn.ftz.relu.f16(half, half, half)
-declare <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half>, <2 x half>, <2 x half>)
-declare <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2(<2 x half>, <2 x half>, <2 x half>)
 declare i16 @llvm.nvvm.fma.rn.bf16(i16, i16, i16)
 declare i16 @llvm.nvvm.fma.rn.relu.bf16(i16, i16, i16)
 declare i32 @llvm.nvvm.fma.rn.bf16x2(i32, i32, i32)
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index d31844549a322..fcc4ec6e4017f 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -1,6 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
-; RUN: llc < %s -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
-; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
+; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
+; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-SM80-NOF16
 ; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %}
@@ -29,330 +30,1571 @@ declare half @llvm.minnum.f16(half, half) #0
 declare float @llvm.minnum.f32(float, float) #0
 declare double @llvm.minnum.f64(double, double) #0
 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.minimum.f16(half, half) #0
+declare float @llvm.minimum.f32(float, float) #0
+declare double @llvm.minimum.f64(double, double) #0
+declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>) #0
 declare half @llvm.maxnum.f16(half, half) #0
 declare float @llvm.maxnum.f32(float, float) #0
 declare double @llvm.maxnum.f64(double, double) #0
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.maximum.f16(half, half) #0
+declare float @llvm.maximum.f32(float, float) #0
+declare double @llvm.maximum.f64(double, double) #0
+declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0
 declare float @llvm.fma.f32(float, float, float) #0
 declare double @llvm.fma.f64(double, double, double) #0
 
 ; ---- ceil ----
 
-; CHECK-LABEL: ceil_float
 define float @ceil_float(float %a) {
-  ; CHECK: cvt.rpi.f32.f32
+; CHECK-LABEL: ceil_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [ceil_float_param_0];
+; CHECK-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: ceil_float_ftz
 define float @ceil_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rpi.ftz.f32.f32
+; CHECK-LABEL: ceil_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [ceil_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rpi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: ceil_double
 define double @ceil_double(double %a) {
-  ; CHECK: cvt.rpi.f64.f64
+; CHECK-LABEL: ceil_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [ceil_double_param_0];
+; CHECK-NEXT:    cvt.rpi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.ceil.f64(double %a)
   ret double %b
 }
 
 ; ---- floor ----
 
-; CHECK-LABEL: floor_float
 define float @floor_float(float %a) {
-  ; CHECK: cvt.rmi.f32.f32
+; CHECK-LABEL: floor_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [floor_float_param_0];
+; CHECK-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: floor_float_ftz
 define float @floor_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rmi.ftz.f32.f32
+; CHECK-LABEL: floor_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [floor_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rmi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: floor_double
 define double @floor_double(double %a) {
-  ; CHECK: cvt.rmi.f64.f64
+; CHECK-LABEL: floor_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [floor_double_param_0];
+; CHECK-NEXT:    cvt.rmi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.floor.f64(double %a)
   ret double %b
 }
 
 ; ---- round ----
 
-; CHECK-LABEL: round_float
 define float @round_float(float %a) {
 ; check the use of sign mask and 0.5 to implement round
-; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK: or.b32 {{.*}}, [[R1]], 1056964608;
+; CHECK-LABEL: round_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [round_float_param_0];
+; CHECK-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT:    mov.b32 %f2, %r3;
+; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
+; CHECK-NEXT:    abs.f32 %f5, %f1;
+; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
+; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: round_float_ftz
 define float @round_float_ftz(float %a) #1 {
 ; check the use of sign mask and 0.5 to implement round
-; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK: or.b32 {{.*}}, [[R1]], 1056964608;
+; CHECK-LABEL: round_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [round_float_ftz_param_0];
+; CHECK-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT:    mov.b32 %f2, %r3;
+; CHECK-NEXT:    add.rn.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f4, %f3;
+; CHECK-NEXT:    abs.ftz.f32 %f5, %f1;
+; CHECK-NEXT:    setp.gt.ftz.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f7, %f1;
+; CHECK-NEXT:    setp.lt.ftz.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: round_double
 define double @round_double(double %a) {
 ; check the use of 0.5 to implement round
-; CHECK: setp.lt.f64 {{.*}}, [[R:%fd[0-9]+]], 0d3FE0000000000000;
-; CHECK: add.rn.f64 {{.*}}, [[R]], 0d3FE0000000000000;
+; CHECK-LABEL: round_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .f64 %fd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [round_double_param_0];
+; CHECK-NEXT:    abs.f64 %fd2, %fd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %fd2, 0d3FE0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FE0000000000000;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
+; CHECK-NEXT:    selp.f64 %fd5, 0d0000000000000000, %fd4, %p1;
+; CHECK-NEXT:    abs.f64 %fd6, %fd5;
+; CHECK-NEXT:    neg.f64 %fd7, %fd6;
+; CHECK-NEXT:    mov.b64 %rd1, %fd1;
+; CHECK-NEXT:    shr.u64 %rd2, %rd1, 63;
+; CHECK-NEXT:    and.b64 %rd3, %rd2, 1;
+; CHECK-NEXT:    setp.eq.b64 %p2, %rd3, 1;
+; CHECK-NEXT:    selp.f64 %fd8, %fd7, %fd6, %p2;
+; CHECK-NEXT:    setp.gt.f64 %p3, %fd2, 0d4330000000000000;
+; CHECK-NEXT:    selp.f64 %fd9, %fd1, %fd8, %p3;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd9;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.round.f64(double %a)
   ret double %b
 }
 
 ; ---- nearbyint ----
 
-; CHECK-LABEL: nearbyint_float
 define float @nearbyint_float(float %a) {
-  ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: nearbyint_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [nearbyint_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: nearbyint_float_ftz
 define float @nearbyint_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: nearbyint_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [nearbyint_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: nearbyint_double
 define double @nearbyint_double(double %a) {
-  ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: nearbyint_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [nearbyint_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.nearbyint.f64(double %a)
   ret double %b
 }
 
 ; ---- rint ----
 
-; CHECK-LABEL: rint_float
 define float @rint_float(float %a) {
-  ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: rint_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [rint_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: rint_float_ftz
 define float @rint_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: rint_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [rint_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: rint_double
 define double @rint_double(double %a) {
-  ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: rint_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [rint_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.rint.f64(double %a)
   ret double %b
 }
 
 ; ---- roundeven ----
 
-; CHECK-LABEL: roundeven_float
 define float @roundeven_float(float %a) {
-  ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: roundeven_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [roundeven_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: roundeven_float_ftz
 define float @roundeven_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: roundeven_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [roundeven_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: roundeven_double
 define double @roundeven_double(double %a) {
-  ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: roundeven_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [roundeven_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.roundeven.f64(double %a)
   ret double %b
 }
 
 ; ---- trunc ----
 
-; CHECK-LABEL: trunc_float
 define float @trunc_float(float %a) {
-  ; CHECK: cvt.rzi.f32.f32
+; CHECK-LABEL: trunc_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [trunc_float_param_0];
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: trunc_float_ftz
 define float @trunc_float_ftz(float %a) #1 {
-  ; CHECK: cvt.rzi.ftz.f32.f32
+; CHECK-LABEL: trunc_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [trunc_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: trunc_double
 define double @trunc_double(double %a) {
-  ; CHECK: cvt.rzi.f64.f64
+; CHECK-LABEL: trunc_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [trunc_double_param_0];
+; CHECK-NEXT:    cvt.rzi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.trunc.f64(double %a)
   ret double %b
 }
 
 ; ---- abs ----
 
-; CHECK-LABEL: abs_float
 define float @abs_float(float %a) {
-  ; CHECK: abs.f32
+; CHECK-LABEL: abs_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [abs_float_param_0];
+; CHECK-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: abs_float_ftz
 define float @abs_float_ftz(float %a) #1 {
-  ; CHECK: abs.ftz.f32
+; CHECK-LABEL: abs_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [abs_float_ftz_param_0];
+; CHECK-NEXT:    abs.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: abs_double
 define double @abs_double(double %a) {
-  ; CHECK: abs.f64
+; CHECK-LABEL: abs_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [abs_double_param_0];
+; CHECK-NEXT:    abs.f64 %fd2, %fd1;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT:    ret;
   %b = call double @llvm.fabs.f64(double %a)
   ret double %b
 }
 
-; ---- min ----
+; ---- minnum ----
 
-; CHECK-LABEL: min_half
-define half @min_half(half %a, half %b) {
-  ; CHECK-NOF16: min.f32
-  ; CHECK-F16: min.f16
+define half @minnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minnum_half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minnum_half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-F16-NEXT:    ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-F16-NEXT:    min.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minnum_half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call half @llvm.minnum.f16(half %a, half %b)
   ret half %x
 }
 
-; CHECK-LABEL: min_float
-define float @min_float(float %a, float %b) {
-  ; CHECK: min.f32
+define float @minnum_float(float %a, float %b) {
+; CHECK-LABEL: minnum_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [minnum_float_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [minnum_float_param_1];
+; CHECK-NEXT:    min.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
 }
 
-; CHECK-LABEL: min_imm1
-define float @min_imm1(float %a) {
-  ; CHECK: min.f32
+define float @minnum_imm1(float %a) {
+; CHECK-LABEL: minnum_imm1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [minnum_imm1_param_0];
+; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float 0.0)
   ret float %x
 }
 
-; CHECK-LABEL: min_imm2
-define float @min_imm2(float %a) {
-  ; CHECK: min.f32
+define float @minnum_imm2(float %a) {
+; CHECK-LABEL: minnum_imm2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [minnum_imm2_param_0];
+; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float 0.0, float %a)
   ret float %x
 }
 
-; CHECK-LABEL: min_float_ftz
-define float @min_float_ftz(float %a, float %b) #1 {
-  ; CHECK: min.ftz.f32
+define float @minnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: minnum_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [minnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [minnum_float_ftz_param_1];
+; CHECK-NEXT:    min.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
 }
 
-; CHECK-LABEL: min_double
-define double @min_double(double %a, double %b) {
-  ; CHECK: min.f64
+define double @minnum_double(double %a, double %b) {
+; CHECK-LABEL: minnum_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [minnum_double_param_0];
+; CHECK-NEXT:    ld.param.f64 %fd2, [minnum_double_param_1];
+; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT:    ret;
   %x = call double @llvm.minnum.f64(double %a, double %b)
   ret double %x
 }
 
-; CHECK-LABEL: min_v2half
-define <2 x half> @min_v2half(<2 x half> %a, <2 x half> %b) {
-  ; CHECK-NOF16: min.f32
-  ; CHECK-NOF16: min.f32
-  ; CHECK-F16: min.f16x2
+define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minnum_v2half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [minnum_v2half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [minnum_v2half_param_1];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    min.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minnum_v2half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minnum_v2half_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [minnum_v2half_param_0];
+; CHECK-F16-NEXT:    min.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minnum_v2half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %f6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
 }
 
-; ---- max ----
+; ---- minimum ----
 
-; CHECK-LABEL: max_half
-define half @max_half(half %a, half %b) {
-  ; CHECK-NOF16: max.f32
-  ; CHECK-F16: max.f16
+define half @minimum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minimum_half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<6>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<10>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs3, [minimum_half_param_1];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs3, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs5;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-F16-NEXT:    ld.param.b16 %rs2, [minimum_half_param_1];
+; CHECK-F16-NEXT:    min.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<6>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<10>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs3, [minimum_half_param_1];
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs3, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs5;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call half @llvm.minimum.f16(half %a, half %b)
+  ret half %x
+}
+
+define float @minimum_float(float %a, float %b) {
+; CHECK-NOF16-LABEL: minimum_float(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT:    min.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_float(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-F16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_float(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %x
+}
+
+define float @minimum_imm1(float %a) {
+; CHECK-NOF16-LABEL: minimum_imm1(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<2>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<6>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_imm1(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_imm1(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.minimum.f32(float %a, float 0.0)
+  ret float %x
+}
+
+define float @minimum_imm2(float %a) {
+; CHECK-NOF16-LABEL: minimum_imm2(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<2>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<6>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_imm2(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_imm2(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.minimum.f32(float 0.0, float %a)
+  ret float %x
+}
+
+define float @minimum_float_ftz(float %a, float %b) #1 {
+; CHECK-NOF16-LABEL: minimum_float_ftz(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT:    min.ftz.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_float_ftz(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-F16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_float_ftz(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %x
+}
+
+define double @minimum_double(double %a, double %b) {
+; CHECK-LABEL: minimum_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .f64 %fd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [minimum_double_param_0];
+; CHECK-NEXT:    mov.b64 %rd1, %fd1;
+; CHECK-NEXT:    ld.param.f64 %fd2, [minimum_double_param_1];
+; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
+; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
+; CHECK-NEXT:    setp.eq.s64 %p2, %rd1, -9223372036854775808;
+; CHECK-NEXT:    selp.f64 %fd5, %fd1, %fd4, %p2;
+; CHECK-NEXT:    mov.b64 %rd2, %fd2;
+; CHECK-NEXT:    setp.eq.s64 %p3, %rd2, -9223372036854775808;
+; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
+; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
+; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT:    ret;
+  %x = call double @llvm.minimum.f64(double %a, double %b)
+  ret double %x
+}
+
+define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minimum_v2half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<11>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<19>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [minimum_v2half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [minimum_v2half_param_1];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT:    selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, -32768;
+; CHECK-NOF16-NEXT:    selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs13;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs18, %rs11};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: minimum_v2half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minimum_v2half_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [minimum_v2half_param_0];
+; CHECK-F16-NEXT:    min.NaN.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_v2half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<11>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<19>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minimum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, -32768;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f6, %rs13;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs18, %rs11};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %x
+}
+
+; ---- maxnum ----
+
+define half @maxnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maxnum_half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maxnum_half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-F16-NEXT:    ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-F16-NEXT:    max.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maxnum_half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call half @llvm.maxnum.f16(half %a, half %b)
   ret half %x
 }
 
-; CHECK-LABEL: max_imm1
-define float @max_imm1(float %a) {
-  ; CHECK: max.f32
+define float @maxnum_imm1(float %a) {
+; CHECK-LABEL: maxnum_imm1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_imm1_param_0];
+; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float 0.0)
   ret float %x
 }
 
-; CHECK-LABEL: max_imm2
-define float @max_imm2(float %a) {
-  ; CHECK: max.f32
+define float @maxnum_imm2(float %a) {
+; CHECK-LABEL: maxnum_imm2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_imm2_param_0];
+; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float 0.0, float %a)
   ret float %x
 }
 
-; CHECK-LABEL: max_float
-define float @max_float(float %a, float %b) {
-  ; CHECK: max.f32
+define float @maxnum_float(float %a, float %b) {
+; CHECK-LABEL: maxnum_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_float_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [maxnum_float_param_1];
+; CHECK-NEXT:    max.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
 }
 
-; CHECK-LABEL: max_float_ftz
-define float @max_float_ftz(float %a, float %b) #1 {
-  ; CHECK: max.ftz.f32
+define float @maxnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: maxnum_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [maxnum_float_ftz_param_1];
+; CHECK-NEXT:    max.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
 }
 
-; CHECK-LABEL: max_double
-define double @max_double(double %a, double %b) {
-  ; CHECK: max.f64
+define double @maxnum_double(double %a, double %b) {
+; CHECK-LABEL: maxnum_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [maxnum_double_param_0];
+; CHECK-NEXT:    ld.param.f64 %fd2, [maxnum_double_param_1];
+; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT:    ret;
   %x = call double @llvm.maxnum.f64(double %a, double %b)
   ret double %x
 }
 
-; CHECK-LABEL: max_v2half
-define <2 x half> @max_v2half(<2 x half> %a, <2 x half> %b) {
-  ; CHECK-NOF16: max.f32
-  ; CHECK-NOF16: max.f32
-  ; CHECK-F16: max.f16x2
+define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maxnum_v2half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maxnum_v2half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [maxnum_v2half_param_1];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    max.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maxnum_v2half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maxnum_v2half_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [maxnum_v2half_param_0];
+; CHECK-F16-NEXT:    max.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maxnum_v2half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maxnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maxnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %f6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
 }
 
+; ---- maximum ----
+
+define half @maximum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maximum_half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<6>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<10>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs3, [maximum_half_param_1];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs3, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs5;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-F16-NEXT:    ld.param.b16 %rs2, [maximum_half_param_1];
+; CHECK-F16-NEXT:    max.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT:    st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<6>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<10>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs3, [maximum_half_param_1];
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs3, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs5;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call half @llvm.maximum.f16(half %a, half %b)
+  ret half %x
+}
+
+define float @maximum_imm1(float %a) {
+; CHECK-NOF16-LABEL: maximum_imm1(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_imm1(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_imm1(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.maximum.f32(float %a, float 0.0)
+  ret float %x
+}
+
+define float @maximum_imm2(float %a) {
+; CHECK-NOF16-LABEL: maximum_imm2(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_imm2(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_imm2(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.maximum.f32(float 0.0, float %a)
+  ret float %x
+}
+
+define float @maximum_float(float %a, float %b) {
+; CHECK-NOF16-LABEL: maximum_float(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT:    max.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_float(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-F16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_float(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %x
+}
+
+define float @maximum_float_ftz(float %a, float %b) #1 {
+; CHECK-NOF16-LABEL: maximum_float_ftz(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT:    max.ftz.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_float_ftz(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-F16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_float_ftz(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %x
+}
+
+define double @maximum_double(double %a, double %b) {
+; CHECK-LABEL: maximum_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .f64 %fd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [maximum_double_param_0];
+; CHECK-NEXT:    mov.b64 %rd1, %fd1;
+; CHECK-NEXT:    ld.param.f64 %fd2, [maximum_double_param_1];
+; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
+; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
+; CHECK-NEXT:    setp.eq.s64 %p2, %rd1, 0;
+; CHECK-NEXT:    selp.f64 %fd5, %fd1, %fd4, %p2;
+; CHECK-NEXT:    mov.b64 %rd2, %fd2;
+; CHECK-NEXT:    setp.eq.s64 %p3, %rd2, 0;
+; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
+; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
+; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT:    ret;
+  %x = call double @llvm.maximum.f64(double %a, double %b)
+  ret double %x
+}
+
+define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maximum_v2half(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<11>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<19>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maximum_v2half_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [maximum_v2half_param_1];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT:    selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, 0;
+; CHECK-NOF16-NEXT:    selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs13;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs18, %rs11};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-LABEL: maximum_v2half(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maximum_v2half_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [maximum_v2half_param_0];
+; CHECK-F16-NEXT:    max.NaN.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_v2half(
+; CHECK-SM80-NOF16:       {
+; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<11>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<19>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT:  // %bb.0:
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maximum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, 0;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f6, %rs13;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs18, %rs11};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT:    ret;
+  %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %x
+}
+
 ; ---- fma ----
 
-; CHECK-LABEL: @fma_float
 define float @fma_float(float %a, float %b, float %c) {
-  ; CHECK: fma.rn.f32
+; CHECK-LABEL: fma_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fma_float_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [fma_float_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [fma_float_param_2];
+; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
 }
 
-; CHECK-LABEL: @fma_float_ftz
 define float @fma_float_ftz(float %a, float %b, float %c) #1 {
-  ; CHECK: fma.rn.ftz.f32
+; CHECK-LABEL: fma_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fma_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [fma_float_ftz_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [fma_float_ftz_param_2];
+; CHECK-NEXT:    fma.rn.ftz.f32 %f4, %f1, %f2, %f3;
+; CHECK-NEXT:    st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
 }
 
-; CHECK-LABEL: @fma_double
 define double @fma_double(double %a, double %b, double %c) {
-  ; CHECK: fma.rn.f64
+; CHECK-LABEL: fma_double(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [fma_double_param_0];
+; CHECK-NEXT:    ld.param.f64 %fd2, [fma_double_param_1];
+; CHECK-NEXT:    ld.param.f64 %fd3, [fma_double_param_2];
+; CHECK-NEXT:    fma.rn.f64 %fd4, %fd1, %fd2, %fd3;
+; CHECK-NEXT:    st.param.f64 [func_retval0+0], %fd4;
+; CHECK-NEXT:    ret;
   %x = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %x
 }
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 4941760a78c79..109d28a3e3c59 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -90,3 +90,61 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
   store %struct.T %S2, ptr %out
   ret void
 }
+
+define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
+; CHECK-LABEL: test_store_volatile_undef(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<23>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1];
+; CHECK-NEXT:    ld.param.v4.u32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd3], {%r11, %r12, %r13, %r14};
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10};
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r15, %r16, %r17, %r18};
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r19, %r20, %r21, %r22};
+; CHECK-NEXT:    ret;
+  store volatile %struct.T undef, ptr %out
+  store volatile <8 x i32> %vec, ptr undef
+  store volatile <8 x i32> undef, ptr %out
+  ret void
+}
+
+define void @test_store_volatile_of_poison(ptr %out) {
+; CHECK-LABEL: test_store_volatile_of_poison(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_of_poison_param_0];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ret;
+  store volatile %struct.T poison, ptr %out
+  ret void
+}
+
+define void @test_store_volatile_to_poison(%struct.T %param) {
+; CHECK-LABEL: test_store_volatile_to_poison(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_to_poison_param_0];
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8];
+; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd2], {%r3, %r4, %r5, %r6};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd3], {%r1, %r2};
+; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd1;
+; CHECK-NEXT:    ret;
+  store volatile %struct.T %param, ptr poison
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
index cd37bee4592d6..f1c9da078a16e 100644
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -24,7 +24,6 @@
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Instrument function entry/exit with calls to e.g. mcount() (post inlining)
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 9660102ffee3c..bb9aa6bec3981 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -54,7 +54,6 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
index aae23265710ce..afc7a39e18dc8 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
@@ -750,25 +750,21 @@ entry:
 define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
 ; CHECK-64-LABEL: testDoubleImm1:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-64-NEXT:    xxpermdi 34, 1, 34, 1
 ; CHECK-64-NEXT:    blr
 ;
 ; CHECK-32-LABEL: testDoubleImm1:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-32-NEXT:    xxpermdi 34, 1, 34, 1
 ; CHECK-32-NEXT:    blr
 ;
 ; CHECK-64-P10-LABEL: testDoubleImm1:
 ; CHECK-64-P10:       # %bb.0: # %entry
-; CHECK-64-P10-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-64-P10-NEXT:    xxpermdi 34, 1, 34, 1
 ; CHECK-64-P10-NEXT:    blr
 ;
 ; CHECK-32-P10-LABEL: testDoubleImm1:
 ; CHECK-32-P10:       # %bb.0: # %entry
-; CHECK-32-P10-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-32-P10-NEXT:    xxpermdi 34, 1, 34, 1
 ; CHECK-32-P10-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
index 19e298a633e0b..2f543da9b29a0 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
@@ -1099,7 +1099,6 @@ define double @getd1(<2 x double> %vd) {
 ; CHECK-LABEL: getd1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xxswapd 1, 34
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 entry:
   %vecext = extractelement <2 x double> %vd, i32 1
@@ -1115,7 +1114,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
 ; CHECK-NEXT:    lvsl 3, 0, 3
 ; CHECK-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-NEXT:    xxlor 1, 34, 34
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 entry:
   %vecext = extractelement <2 x double> %vd, i32 %i
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index f729018dd4106..91431ed15f6a7 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1319,11 +1319,7 @@ entry:
 define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
 ; P9BE-LABEL: fromRegsConvftoi:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9BE-NEXT:    xxmrghd vs0, vs2, vs4
-; P9BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xvcvdpsxws v2, vs0
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs3
 ; P9BE-NEXT:    xvcvdpsxws v3, vs0
@@ -1332,11 +1328,7 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
 ;
 ; P9LE-LABEL: fromRegsConvftoi:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs3, vs1
-; P9LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9LE-NEXT:    xvcvdpsxws v2, vs0
 ; P9LE-NEXT:    xxmrghd vs0, vs4, vs2
 ; P9LE-NEXT:    xvcvdpsxws v3, vs0
@@ -1345,10 +1337,6 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
 ;
 ; P8BE-LABEL: fromRegsConvftoi:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs2, vs4
 ; P8BE-NEXT:    xxmrghd vs1, vs1, vs3
 ; P8BE-NEXT:    xvcvdpsxws v2, vs0
@@ -1358,10 +1346,6 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
 ;
 ; P8LE-LABEL: fromRegsConvftoi:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs3, vs1
 ; P8LE-NEXT:    xxmrghd vs1, vs4, vs2
 ; P8LE-NEXT:    xvcvdpsxws v2, vs0
@@ -1773,11 +1757,7 @@ entry:
 define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
 ; P9BE-LABEL: fromRegsConvdtoi:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9BE-NEXT:    xxmrghd vs0, vs2, vs4
-; P9BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xvcvdpsxws v2, vs0
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs3
 ; P9BE-NEXT:    xvcvdpsxws v3, vs0
@@ -1786,11 +1766,7 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
 ;
 ; P9LE-LABEL: fromRegsConvdtoi:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs3, vs1
-; P9LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9LE-NEXT:    xvcvdpsxws v2, vs0
 ; P9LE-NEXT:    xxmrghd vs0, vs4, vs2
 ; P9LE-NEXT:    xvcvdpsxws v3, vs0
@@ -1799,10 +1775,6 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
 ;
 ; P8BE-LABEL: fromRegsConvdtoi:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs2, vs4
 ; P8BE-NEXT:    xxmrghd vs1, vs1, vs3
 ; P8BE-NEXT:    xvcvdpsxws v2, vs0
@@ -1812,10 +1784,6 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
 ;
 ; P8LE-LABEL: fromRegsConvdtoi:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs3, vs1
 ; P8LE-NEXT:    xxmrghd vs1, vs4, vs2
 ; P8LE-NEXT:    xvcvdpsxws v2, vs0
@@ -2839,11 +2807,7 @@ entry:
 define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
 ; P9BE-LABEL: fromRegsConvftoui:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9BE-NEXT:    xxmrghd vs0, vs2, vs4
-; P9BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xvcvdpuxws v2, vs0
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs3
 ; P9BE-NEXT:    xvcvdpuxws v3, vs0
@@ -2852,11 +2816,7 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
 ;
 ; P9LE-LABEL: fromRegsConvftoui:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs3, vs1
-; P9LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9LE-NEXT:    xvcvdpuxws v2, vs0
 ; P9LE-NEXT:    xxmrghd vs0, vs4, vs2
 ; P9LE-NEXT:    xvcvdpuxws v3, vs0
@@ -2865,10 +2825,6 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
 ;
 ; P8BE-LABEL: fromRegsConvftoui:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs2, vs4
 ; P8BE-NEXT:    xxmrghd vs1, vs1, vs3
 ; P8BE-NEXT:    xvcvdpuxws v2, vs0
@@ -2878,10 +2834,6 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
 ;
 ; P8LE-LABEL: fromRegsConvftoui:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs3, vs1
 ; P8LE-NEXT:    xxmrghd vs1, vs4, vs2
 ; P8LE-NEXT:    xvcvdpuxws v2, vs0
@@ -3294,11 +3246,7 @@ entry:
 define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) {
 ; P9BE-LABEL: fromRegsConvdtoui:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9BE-NEXT:    xxmrghd vs0, vs2, vs4
-; P9BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xvcvdpuxws v2, vs0
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs3
 ; P9BE-NEXT:    xvcvdpuxws v3, vs0
@@ -3307,11 +3255,7 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
 ;
 ; P9LE-LABEL: fromRegsConvdtoui:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs3, vs1
-; P9LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
 ; P9LE-NEXT:    xvcvdpuxws v2, vs0
 ; P9LE-NEXT:    xxmrghd vs0, vs4, vs2
 ; P9LE-NEXT:    xvcvdpuxws v3, vs0
@@ -3320,10 +3264,6 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
 ;
 ; P8BE-LABEL: fromRegsConvdtoui:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs2, vs4
 ; P8BE-NEXT:    xxmrghd vs1, vs1, vs3
 ; P8BE-NEXT:    xvcvdpuxws v2, vs0
@@ -3333,10 +3273,6 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
 ;
 ; P8LE-LABEL: fromRegsConvdtoui:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT:    # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs3, vs1
 ; P8LE-NEXT:    xxmrghd vs1, vs4, vs2
 ; P8LE-NEXT:    xvcvdpuxws v2, vs0
@@ -4269,32 +4205,24 @@ entry:
 define <2 x i64> @fromRegsConvftoll(float %a, float %b) {
 ; P9BE-LABEL: fromRegsConvftoll:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P9BE-NEXT:    xvcvdpsxds v2, vs0
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: fromRegsConvftoll:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P9LE-NEXT:    xvcvdpsxds v2, vs0
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: fromRegsConvftoll:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P8BE-NEXT:    xvcvdpsxds v2, vs0
 ; P8BE-NEXT:    blr
 ;
 ; P8LE-LABEL: fromRegsConvftoll:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P8LE-NEXT:    xvcvdpsxds v2, vs0
 ; P8LE-NEXT:    blr
@@ -4630,32 +4558,24 @@ entry:
 define <2 x i64> @fromRegsConvdtoll(double %a, double %b) {
 ; P9BE-LABEL: fromRegsConvdtoll:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P9BE-NEXT:    xvcvdpsxds v2, vs0
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: fromRegsConvdtoll:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P9LE-NEXT:    xvcvdpsxds v2, vs0
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: fromRegsConvdtoll:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P8BE-NEXT:    xvcvdpsxds v2, vs0
 ; P8BE-NEXT:    blr
 ;
 ; P8LE-LABEL: fromRegsConvdtoll:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P8LE-NEXT:    xvcvdpsxds v2, vs0
 ; P8LE-NEXT:    blr
@@ -5451,32 +5371,24 @@ entry:
 define <2 x i64> @fromRegsConvftoull(float %a, float %b) {
 ; P9BE-LABEL: fromRegsConvftoull:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P9BE-NEXT:    xvcvdpuxds v2, vs0
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: fromRegsConvftoull:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P9LE-NEXT:    xvcvdpuxds v2, vs0
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: fromRegsConvftoull:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P8BE-NEXT:    xvcvdpuxds v2, vs0
 ; P8BE-NEXT:    blr
 ;
 ; P8LE-LABEL: fromRegsConvftoull:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P8LE-NEXT:    xvcvdpuxds v2, vs0
 ; P8LE-NEXT:    blr
@@ -5812,32 +5724,24 @@ entry:
 define <2 x i64> @fromRegsConvdtoull(double %a, double %b) {
 ; P9BE-LABEL: fromRegsConvdtoull:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P9BE-NEXT:    xvcvdpuxds v2, vs0
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: fromRegsConvdtoull:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P9LE-NEXT:    xvcvdpuxds v2, vs0
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: fromRegsConvdtoull:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8BE-NEXT:    xxmrghd vs0, vs1, vs2
 ; P8BE-NEXT:    xvcvdpuxds v2, vs0
 ; P8BE-NEXT:    blr
 ;
 ; P8LE-LABEL: fromRegsConvdtoull:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8LE-NEXT:    xxmrghd vs0, vs2, vs1
 ; P8LE-NEXT:    xvcvdpuxds v2, vs0
 ; P8LE-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
index 7aa8b0e7e8327..798c23cd6b961 100644
--- a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
+++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
@@ -22,7 +22,6 @@ define dso_local double @insert_exp(double %d, i64 %ull) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mffprd 3, 1
 ; CHECK-NEXT:    xsiexpdp 1, 3, 4
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 entry:
   %0 = tail call double @llvm.ppc.insert.exp(double %d, i64 %ull)
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index f2bd4c7f40a46..c26f98c5b0495 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -565,7 +565,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; CHECK-P8-NEXT:    bl dummy
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    xxlxor f0, f0, f0
-; CHECK-P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
 ; CHECK-P8-NEXT:    xxswapd vs0, vs0
 ; CHECK-P8-NEXT:    stxvd2x vs0, 0, r30
@@ -580,7 +579,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; CHECK-P9-NEXT:    bl dummy
 ; CHECK-P9-NEXT:    nop
 ; CHECK-P9-NEXT:    xxlxor f0, f0, f0
-; CHECK-P9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
 ; CHECK-P9-NEXT:    stxv vs0, 0(r30)
 ;
@@ -594,7 +592,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; CHECK-P9-BE-NEXT:    bl dummy
 ; CHECK-P9-BE-NEXT:    nop
 ; CHECK-P9-BE-NEXT:    xxlxor f0, f0, f0
-; CHECK-P9-BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-P9-BE-NEXT:    xxmrghd vs0, vs0, vs1
 ; CHECK-P9-BE-NEXT:    stxv vs0, 0(r30)
 ;
@@ -621,7 +618,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; CHECK-P7-NEXT:    bl dummy
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    xxlxor f0, f0, f0
-; CHECK-P7-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-P7-NEXT:    xxmrghd vs0, vs1, vs0
 ; CHECK-P7-NEXT:    xxswapd vs0, vs0
 ; CHECK-P7-NEXT:    stxvd2x vs0, 0, r30
@@ -636,7 +632,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; P8-AIX-64-NEXT:    bl .dummy[PR]
 ; P8-AIX-64-NEXT:    nop
 ; P8-AIX-64-NEXT:    xxlxor f0, f0, f0
-; P8-AIX-64-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-AIX-64-NEXT:    xxmrghd vs0, vs0, vs1
 ; P8-AIX-64-NEXT:    stxvd2x vs0, 0, r31
 ;
@@ -650,7 +645,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
 ; P8-AIX-32-NEXT:    bl .dummy[PR]
 ; P8-AIX-32-NEXT:    nop
 ; P8-AIX-32-NEXT:    xxlxor f0, f0, f0
-; P8-AIX-32-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-AIX-32-NEXT:    xxmrghd vs0, vs0, vs1
 ; P8-AIX-32-NEXT:    stxvd2x vs0, 0, r31
 test_entry:
diff --git a/llvm/test/CodeGen/PowerPC/check-cpu.ll b/llvm/test/CodeGen/PowerPC/check-cpu.ll
index e1a201427a410..1dc532cb428f4 100644
--- a/llvm/test/CodeGen/PowerPC/check-cpu.ll
+++ b/llvm/test/CodeGen/PowerPC/check-cpu.ll
@@ -3,6 +3,10 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:     -mcpu=future < %s 2>&1 | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr11 < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr11 < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:     -mcpu=pwr10 < %s 2>&1 | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:     -mcpu=pwr10 < %s 2>&1 | FileCheck %s
@@ -13,7 +17,7 @@
 
 
 
-; Test -mcpu=[pwr9|pwr10|future] is recognized on PowerPC.
+; Test -mcpu=[pwr9|pwr10|pwr11|future] is recognized on PowerPC.
 
 ; CHECK-NOT: is not a recognized processor for this target
 ; CHECK:     .text
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
index a72abf7007e8d..04af0947c7a33 100644
--- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -6,7 +6,6 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) {
 ; CHECK-LABEL: fneg_fdiv_splat:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxspltd 0, 1, 0
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_0@toc@l
 ; CHECK-NEXT:    xvredp 1, 0
diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index 5f8c21e30f8fd..ccf0e4520f468 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -642,8 +642,8 @@ define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2
 ; CHECK-NEXT:    cmpdi r7, 0
 ; CHECK-NEXT:    ble cr0, .LBB6_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    add r6, r6, r4
 ; CHECK-NEXT:    add r5, r5, r4
+; CHECK-NEXT:    add r6, r6, r4
 ; CHECK-NEXT:    mtctr r7
 ; CHECK-NEXT:    sldi r4, r4, 1
 ; CHECK-NEXT:    add r5, r3, r5
@@ -743,214 +743,219 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64
 ; CHECK-NEXT:    std r9, -184(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r8, -176(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r7, -168(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r3, -160(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r4, -160(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    ble cr0, .LBB7_7
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    sldi r6, r6, 2
-; CHECK-NEXT:    li r7, 1
-; CHECK-NEXT:    mr r30, r10
-; CHECK-NEXT:    cmpdi r6, 1
-; CHECK-NEXT:    iselgt r7, r6, r7
-; CHECK-NEXT:    addi r8, r7, -1
-; CHECK-NEXT:    clrldi r6, r7, 63
-; CHECK-NEXT:    cmpldi r8, 3
+; CHECK-NEXT:    sldi r4, r6, 2
+; CHECK-NEXT:    li r6, 1
+; CHECK-NEXT:    mr r0, r10
+; CHECK-NEXT:    std r10, -192(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    cmpdi r4, 1
+; CHECK-NEXT:    iselgt r4, r4, r6
+; CHECK-NEXT:    addi r7, r4, -1
+; CHECK-NEXT:    clrldi r6, r4, 63
+; CHECK-NEXT:    cmpldi r7, 3
 ; CHECK-NEXT:    blt cr0, .LBB7_4
 ; CHECK-NEXT:  # %bb.2: # %for.body.preheader.new
-; CHECK-NEXT:    ld r14, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    mulli r24, r30, 24
-; CHECK-NEXT:    ld r16, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r15, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r3, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    rldicl r0, r7, 62, 2
-; CHECK-NEXT:    sldi r11, r30, 5
-; CHECK-NEXT:    sldi r19, r30, 4
-; CHECK-NEXT:    sldi r7, r14, 3
-; CHECK-NEXT:    add r14, r30, r14
-; CHECK-NEXT:    sldi r10, r16, 3
-; CHECK-NEXT:    sldi r12, r15, 3
-; CHECK-NEXT:    add r16, r30, r16
-; CHECK-NEXT:    add r15, r30, r15
-; CHECK-NEXT:    add r27, r11, r7
-; CHECK-NEXT:    add r22, r24, r7
-; CHECK-NEXT:    add r17, r19, r7
-; CHECK-NEXT:    sldi r2, r14, 3
-; CHECK-NEXT:    add r26, r24, r10
-; CHECK-NEXT:    add r25, r24, r12
-; CHECK-NEXT:    add r21, r19, r10
-; CHECK-NEXT:    add r20, r19, r12
-; CHECK-NEXT:    add r8, r11, r10
-; CHECK-NEXT:    sldi r16, r16, 3
-; CHECK-NEXT:    add r29, r5, r27
-; CHECK-NEXT:    add r28, r4, r27
-; CHECK-NEXT:    add r27, r3, r27
-; CHECK-NEXT:    add r24, r5, r22
-; CHECK-NEXT:    add r23, r4, r22
-; CHECK-NEXT:    add r22, r3, r22
-; CHECK-NEXT:    add r19, r5, r17
-; CHECK-NEXT:    add r18, r4, r17
-; CHECK-NEXT:    add r17, r3, r17
-; CHECK-NEXT:    add r14, r5, r2
-; CHECK-NEXT:    add r31, r4, r2
-; CHECK-NEXT:    add r2, r3, r2
-; CHECK-NEXT:    add r9, r5, r8
-; CHECK-NEXT:    add r8, r11, r12
+; CHECK-NEXT:    ld r0, -192(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r30, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r8, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    rldicl r7, r4, 62, 2
+; CHECK-NEXT:    ld r9, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r11, r0, r30
+; CHECK-NEXT:    add r4, r0, r0
+; CHECK-NEXT:    mulli r23, r0, 24
+; CHECK-NEXT:    add r14, r0, r8
+; CHECK-NEXT:    sldi r12, r0, 5
+; CHECK-NEXT:    add r31, r0, r9
+; CHECK-NEXT:    sldi r9, r9, 3
+; CHECK-NEXT:    sldi r18, r0, 4
+; CHECK-NEXT:    sldi r8, r8, 3
+; CHECK-NEXT:    add r10, r4, r4
+; CHECK-NEXT:    sldi r4, r30, 3
+; CHECK-NEXT:    sldi r11, r11, 3
+; CHECK-NEXT:    add r26, r12, r9
+; CHECK-NEXT:    add r16, r18, r9
+; CHECK-NEXT:    add r29, r12, r8
+; CHECK-NEXT:    add r19, r18, r8
+; CHECK-NEXT:    add r30, r12, r4
+; CHECK-NEXT:    mr r20, r4
+; CHECK-NEXT:    std r4, -200(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    ld r4, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r15, r5, r11
+; CHECK-NEXT:    sldi r11, r14, 3
+; CHECK-NEXT:    add r29, r5, r29
+; CHECK-NEXT:    add r28, r3, r26
+; CHECK-NEXT:    add r19, r5, r19
+; CHECK-NEXT:    add r21, r23, r9
+; CHECK-NEXT:    add r24, r23, r8
+; CHECK-NEXT:    add r14, r5, r11
+; CHECK-NEXT:    sldi r11, r31, 3
+; CHECK-NEXT:    add r25, r23, r20
+; CHECK-NEXT:    add r20, r18, r20
+; CHECK-NEXT:    add r30, r5, r30
+; CHECK-NEXT:    add r18, r3, r16
+; CHECK-NEXT:    add r24, r5, r24
+; CHECK-NEXT:    add r23, r3, r21
+; CHECK-NEXT:    add r27, r4, r26
+; CHECK-NEXT:    add r22, r4, r21
+; CHECK-NEXT:    add r17, r4, r16
+; CHECK-NEXT:    add r2, r4, r11
+; CHECK-NEXT:    rldicl r4, r7, 2, 1
+; CHECK-NEXT:    sub r7, r8, r9
+; CHECK-NEXT:    ld r8, -200(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    add r26, r5, r26
 ; CHECK-NEXT:    add r25, r5, r25
 ; CHECK-NEXT:    add r21, r5, r21
 ; CHECK-NEXT:    add r20, r5, r20
 ; CHECK-NEXT:    add r16, r5, r16
-; CHECK-NEXT:    add r8, r5, r8
-; CHECK-NEXT:    rldicl r3, r0, 2, 1
-; CHECK-NEXT:    addi r3, r3, -4
-; CHECK-NEXT:    sub r0, r12, r7
-; CHECK-NEXT:    sub r12, r10, r7
-; CHECK-NEXT:    li r7, 0
-; CHECK-NEXT:    mr r10, r30
-; CHECK-NEXT:    sldi r15, r15, 3
-; CHECK-NEXT:    add r15, r5, r15
-; CHECK-NEXT:    rldicl r3, r3, 62, 2
-; CHECK-NEXT:    addi r3, r3, 1
-; CHECK-NEXT:    mtctr r3
+; CHECK-NEXT:    add r31, r5, r11
+; CHECK-NEXT:    add r11, r3, r11
+; CHECK-NEXT:    addi r4, r4, -4
+; CHECK-NEXT:    rldicl r4, r4, 62, 2
+; CHECK-NEXT:    sub r8, r8, r9
+; CHECK-NEXT:    li r9, 0
+; CHECK-NEXT:    addi r4, r4, 1
+; CHECK-NEXT:    mtctr r4
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB7_3: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lfd f0, 0(r2)
-; CHECK-NEXT:    lfd f1, 0(r31)
-; CHECK-NEXT:    add r3, r10, r30
-; CHECK-NEXT:    add r3, r3, r30
+; CHECK-NEXT:    lfd f0, 0(r11)
+; CHECK-NEXT:    lfd f1, 0(r2)
+; CHECK-NEXT:    add r0, r0, r10
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfd f1, 0(r14)
-; CHECK-NEXT:    add r3, r3, r30
-; CHECK-NEXT:    add r10, r3, r30
+; CHECK-NEXT:    lfd f1, 0(r31)
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfd f0, 0(r14)
-; CHECK-NEXT:    add r14, r14, r11
-; CHECK-NEXT:    lfdx f0, r2, r0
-; CHECK-NEXT:    lfdx f1, r31, r0
+; CHECK-NEXT:    stfd f0, 0(r31)
+; CHECK-NEXT:    add r31, r31, r12
+; CHECK-NEXT:    lfdx f0, r11, r7
+; CHECK-NEXT:    lfdx f1, r2, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r15, r7
+; CHECK-NEXT:    lfdx f1, r14, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r15, r7
-; CHECK-NEXT:    lfdx f0, r2, r12
-; CHECK-NEXT:    lfdx f1, r31, r12
-; CHECK-NEXT:    add r2, r2, r11
-; CHECK-NEXT:    add r31, r31, r11
+; CHECK-NEXT:    stfdx f0, r14, r9
+; CHECK-NEXT:    lfdx f0, r11, r8
+; CHECK-NEXT:    lfdx f1, r2, r8
+; CHECK-NEXT:    add r11, r11, r12
+; CHECK-NEXT:    add r2, r2, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r16, r7
+; CHECK-NEXT:    lfdx f1, r15, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r16, r7
-; CHECK-NEXT:    lfd f0, 0(r17)
-; CHECK-NEXT:    lfd f1, 0(r18)
+; CHECK-NEXT:    stfdx f0, r15, r9
+; CHECK-NEXT:    lfd f0, 0(r18)
+; CHECK-NEXT:    lfd f1, 0(r17)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r19, r7
+; CHECK-NEXT:    lfdx f1, r16, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r19, r7
-; CHECK-NEXT:    lfdx f0, r17, r0
-; CHECK-NEXT:    lfdx f1, r18, r0
+; CHECK-NEXT:    stfdx f0, r16, r9
+; CHECK-NEXT:    lfdx f0, r18, r7
+; CHECK-NEXT:    lfdx f1, r17, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r20, r7
+; CHECK-NEXT:    lfdx f1, r19, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r20, r7
-; CHECK-NEXT:    lfdx f0, r17, r12
-; CHECK-NEXT:    lfdx f1, r18, r12
-; CHECK-NEXT:    add r17, r17, r11
-; CHECK-NEXT:    add r18, r18, r11
+; CHECK-NEXT:    stfdx f0, r19, r9
+; CHECK-NEXT:    lfdx f0, r18, r8
+; CHECK-NEXT:    lfdx f1, r17, r8
+; CHECK-NEXT:    add r18, r18, r12
+; CHECK-NEXT:    add r17, r17, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r21, r7
+; CHECK-NEXT:    lfdx f1, r20, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r21, r7
-; CHECK-NEXT:    lfd f0, 0(r22)
-; CHECK-NEXT:    lfd f1, 0(r23)
+; CHECK-NEXT:    stfdx f0, r20, r9
+; CHECK-NEXT:    lfd f0, 0(r23)
+; CHECK-NEXT:    lfd f1, 0(r22)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r24, r7
+; CHECK-NEXT:    lfdx f1, r21, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r24, r7
-; CHECK-NEXT:    lfdx f0, r22, r0
-; CHECK-NEXT:    lfdx f1, r23, r0
+; CHECK-NEXT:    stfdx f0, r21, r9
+; CHECK-NEXT:    lfdx f0, r23, r7
+; CHECK-NEXT:    lfdx f1, r22, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r25, r7
+; CHECK-NEXT:    lfdx f1, r24, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r25, r7
-; CHECK-NEXT:    lfdx f0, r22, r12
-; CHECK-NEXT:    lfdx f1, r23, r12
-; CHECK-NEXT:    add r22, r22, r11
-; CHECK-NEXT:    add r23, r23, r11
+; CHECK-NEXT:    stfdx f0, r24, r9
+; CHECK-NEXT:    lfdx f0, r23, r8
+; CHECK-NEXT:    lfdx f1, r22, r8
+; CHECK-NEXT:    add r23, r23, r12
+; CHECK-NEXT:    add r22, r22, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r26, r7
+; CHECK-NEXT:    lfdx f1, r25, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r26, r7
-; CHECK-NEXT:    lfd f0, 0(r27)
-; CHECK-NEXT:    lfd f1, 0(r28)
+; CHECK-NEXT:    stfdx f0, r25, r9
+; CHECK-NEXT:    lfd f0, 0(r28)
+; CHECK-NEXT:    lfd f1, 0(r27)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r29, r7
+; CHECK-NEXT:    lfdx f1, r26, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r29, r7
-; CHECK-NEXT:    lfdx f0, r27, r0
-; CHECK-NEXT:    lfdx f1, r28, r0
+; CHECK-NEXT:    stfdx f0, r26, r9
+; CHECK-NEXT:    lfdx f0, r28, r7
+; CHECK-NEXT:    lfdx f1, r27, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r8, r7
+; CHECK-NEXT:    lfdx f1, r29, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r8, r7
-; CHECK-NEXT:    lfdx f0, r27, r12
-; CHECK-NEXT:    lfdx f1, r28, r12
-; CHECK-NEXT:    add r27, r27, r11
-; CHECK-NEXT:    add r28, r28, r11
+; CHECK-NEXT:    stfdx f0, r29, r9
+; CHECK-NEXT:    lfdx f0, r28, r8
+; CHECK-NEXT:    lfdx f1, r27, r8
+; CHECK-NEXT:    add r28, r28, r12
+; CHECK-NEXT:    add r27, r27, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r9, r7
+; CHECK-NEXT:    lfdx f1, r30, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r9, r7
-; CHECK-NEXT:    add r7, r7, r11
+; CHECK-NEXT:    stfdx f0, r30, r9
+; CHECK-NEXT:    add r9, r9, r12
 ; CHECK-NEXT:    bdnz .LBB7_3
 ; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT:    ld r7, -192(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    cmpldi r6, 0
 ; CHECK-NEXT:    beq cr0, .LBB7_7
 ; CHECK-NEXT:  # %bb.5: # %for.body.epil.preheader
-; CHECK-NEXT:    ld r3, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r0, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    sldi r8, r30, 3
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r7, r5, r3
-; CHECK-NEXT:    add r9, r4, r3
-; CHECK-NEXT:    add r11, r0, r3
-; CHECK-NEXT:    ld r3, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r12, r5, r3
-; CHECK-NEXT:    add r30, r4, r3
-; CHECK-NEXT:    add r29, r0, r3
-; CHECK-NEXT:    ld r3, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    li r10, 0
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r5, r5, r3
-; CHECK-NEXT:    add r4, r4, r3
-; CHECK-NEXT:    add r3, r0, r3
+; CHECK-NEXT:    ld r4, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r29, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    mr r30, r3
+; CHECK-NEXT:    sldi r7, r7, 3
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r4, r4, 3
+; CHECK-NEXT:    add r3, r5, r4
+; CHECK-NEXT:    add r8, r29, r4
+; CHECK-NEXT:    add r9, r30, r4
+; CHECK-NEXT:    ld r4, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r4, r4, 3
+; CHECK-NEXT:    add r10, r5, r4
+; CHECK-NEXT:    add r11, r29, r4
+; CHECK-NEXT:    add r12, r30, r4
+; CHECK-NEXT:    ld r4, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r0, r4, 3
+; CHECK-NEXT:    add r5, r5, r0
+; CHECK-NEXT:    add r4, r29, r0
+; CHECK-NEXT:    add r30, r30, r0
+; CHECK-NEXT:    li r0, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB7_6: # %for.body.epil
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lfdx f0, r3, r10
-; CHECK-NEXT:    lfdx f1, r4, r10
+; CHECK-NEXT:    lfdx f0, r30, r0
+; CHECK-NEXT:    lfdx f1, r4, r0
 ; CHECK-NEXT:    addi r6, r6, -1
 ; CHECK-NEXT:    cmpldi r6, 0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
 ; CHECK-NEXT:    lfd f1, 0(r5)
 ; CHECK-NEXT:    xsadddp f0, f1, f0
 ; CHECK-NEXT:    stfd f0, 0(r5)
-; CHECK-NEXT:    add r5, r5, r8
-; CHECK-NEXT:    lfdx f0, r29, r10
-; CHECK-NEXT:    lfdx f1, r30, r10
+; CHECK-NEXT:    add r5, r5, r7
+; CHECK-NEXT:    lfdx f0, r12, r0
+; CHECK-NEXT:    lfdx f1, r11, r0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r12, r10
+; CHECK-NEXT:    lfdx f1, r10, r0
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r12, r10
-; CHECK-NEXT:    lfdx f0, r11, r10
-; CHECK-NEXT:    lfdx f1, r9, r10
+; CHECK-NEXT:    stfdx f0, r10, r0
+; CHECK-NEXT:    lfdx f0, r9, r0
+; CHECK-NEXT:    lfdx f1, r8, r0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r7, r10
+; CHECK-NEXT:    lfdx f1, r3, r0
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r7, r10
-; CHECK-NEXT:    add r10, r10, r8
+; CHECK-NEXT:    stfdx f0, r3, r0
+; CHECK-NEXT:    add r0, r0, r7
 ; CHECK-NEXT:    bne cr0, .LBB7_6
 ; CHECK-NEXT:  .LBB7_7: # %for.cond.cleanup
 ; CHECK-NEXT:    ld r2, -152(r1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/constant-pool.ll b/llvm/test/CodeGen/PowerPC/constant-pool.ll
index a9feb93627b06..2ded7215d8fd6 100644
--- a/llvm/test/CodeGen/PowerPC/constant-pool.ll
+++ b/llvm/test/CodeGen/PowerPC/constant-pool.ll
@@ -11,7 +11,6 @@
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xxsplti32dx vs1, 0, 940572664
 ; CHECK-NEXT:    xxsplti32dx vs1, 1, 1073741824
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: FloatConstantPool:
@@ -28,7 +27,6 @@ entry:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xxsplti32dx vs1, 0, 1048574
 ; CHECK-NEXT:    xxsplti32dx vs1, 1, 780229072
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: DoubleConstantPool:
@@ -47,8 +45,6 @@ entry:
 ; CHECK-NEXT:    xxsplti32dx vs2, 0, -2146625897
 ; CHECK-NEXT:    xxsplti32dx vs1, 1, -609716532
 ; CHECK-NEXT:    xxsplti32dx vs2, 1, 1339675259
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: LongDoubleConstantPool:
@@ -224,13 +220,11 @@ define double @two_constants_two_bb(i32 %m, double %a) {
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    xxsplti32dx vs1, 0, 1074935889
 ; CHECK-NEXT:    xxsplti32dx vs1, 1, -343597384
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB12_2: # %if.end
 ; CHECK-NEXT:    xxsplti32dx vs0, 0, 1076085391
 ; CHECK-NEXT:    xxsplti32dx vs0, 1, 1546188227
 ; CHECK-NEXT:    xsadddp f1, f1, f0
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: two_constants_two_bb:
@@ -369,12 +363,10 @@ define ppc_fp128 @three_constants_ppcf128(ppc_fp128 %a, ppc_fp128 %c) {
 ; CHECK-NEXT:    stxv vs63, 32(r1) # 16-byte Folded Spill
 ; CHECK-NEXT:    xxsplti32dx vs63, 0, 1074935889
 ; CHECK-NEXT:    xxsplti32dx vs3, 1, -343597384
-; CHECK-NEXT:    # kill: def $f3 killed $f3 killed $vsl3
 ; CHECK-NEXT:    bl __gcc_qadd@notoc
 ; CHECK-NEXT:    xxsplti32dx vs3, 0, 1074935889
 ; CHECK-NEXT:    xxlxor f4, f4, f4
 ; CHECK-NEXT:    xxsplti32dx vs3, 1, -1719329096
-; CHECK-NEXT:    # kill: def $f3 killed $f3 killed $vsl3
 ; CHECK-NEXT:    bl __gcc_qadd@notoc
 ; CHECK-NEXT:    xxsplti32dx vs63, 1, 8724152
 ; CHECK-NEXT:    xxlxor f4, f4, f4
diff --git a/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll b/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
index fc0bfef11d7a6..9d537d89c009d 100644
--- a/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
+++ b/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
@@ -403,11 +403,10 @@ define void @call_test_byval_mem32_2() #0 {
 ; CHECK-NEXT:    std 0, 48(1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    addis 3, 2, .LC5@toc@ha
 ; CHECK-NEXT:    vspltisw 2, 1
+; CHECK-NEXT:    addis 3, 2, .LC5@toc@ha
 ; CHECK-NEXT:    ld 3, .LC5@toc@l(3)
 ; CHECK-NEXT:    xvcvsxwdp 1, 34
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    ld 7, 24(3)
 ; CHECK-NEXT:    ld 6, 16(3)
 ; CHECK-NEXT:    ld 5, 8(3)
@@ -453,9 +452,7 @@ define void @call_test_byval_mem32_3() #0 {
 ; CHECK-NEXT:    li 7, 2
 ; CHECK-NEXT:    ld 3, .LC5@toc@l(3)
 ; CHECK-NEXT:    xvcvsxwdp 1, 34
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    xvcvsxwdp 2, 35
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; CHECK-NEXT:    lxvd2x 0, 3, 4
 ; CHECK-NEXT:    li 4, 88
 ; CHECK-NEXT:    stxvd2x 0, 1, 4
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
index b39c9327bc8ab..3d45e9a3a509c 100644
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -202,7 +202,6 @@ define dso_local double @getNegatedExpression_crash(double %x, double %y) {
 ; CHECK-FAST-NEXT:    xvcvsxwdp 4, 34
 ; CHECK-FAST-NEXT:    lfs 3, .LCPI5_0@toc@l(3)
 ; CHECK-FAST-NEXT:    xssubdp 0, 1, 4
-; CHECK-FAST-NEXT:    # kill: def $f4 killed $f4 killed $vsl4
 ; CHECK-FAST-NEXT:    xsmaddadp 4, 1, 3
 ; CHECK-FAST-NEXT:    xsmaddadp 0, 4, 2
 ; CHECK-FAST-NEXT:    fmr 1, 0
@@ -226,7 +225,6 @@ define dso_local double @getNegatedExpression_crash(double %x, double %y) {
 ; CHECK-NEXT:    xvcvsxwdp 4, 34
 ; CHECK-NEXT:    lfs 3, .LCPI5_0@toc@l(3)
 ; CHECK-NEXT:    xssubdp 0, 1, 4
-; CHECK-NEXT:    # kill: def $f4 killed $f4 killed $vsl4
 ; CHECK-NEXT:    xsmaddadp 4, 1, 3
 ; CHECK-NEXT:    xsmaddadp 0, 4, 2
 ; CHECK-NEXT:    fmr 1, 0
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
index 4c8729b9f43a5..eac4fb6f98bf7 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
@@ -229,7 +229,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
 ; P8-NEXT:    xscvspdpn f1, vs0
 ; P8-NEXT:    bl nearbyintf
 ; P8-NEXT:    nop
-; P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-NEXT:    xxmrghd vs0, vs1, v30
 ; P8-NEXT:    xscvspdpn f1, v31
 ; P8-NEXT:    xvcvdpsp v29, vs0
@@ -240,7 +239,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
 ; P8-NEXT:    xscvspdpn f1, vs0
 ; P8-NEXT:    bl nearbyintf
 ; P8-NEXT:    nop
-; P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-NEXT:    xxmrghd vs0, v30, vs1
 ; P8-NEXT:    li r3, 160
 ; P8-NEXT:    xvcvdpsp v2, vs0
@@ -278,7 +276,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
 ; P9-NEXT:    xscvspdpn f1, vs0
 ; P9-NEXT:    bl nearbyintf
 ; P9-NEXT:    nop
-; P9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9-NEXT:    xxmrghd vs0, vs1, v30
 ; P9-NEXT:    xscvspdpn f1, v31
 ; P9-NEXT:    xvcvdpsp v29, vs0
@@ -289,7 +286,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
 ; P9-NEXT:    xscvspdpn f1, vs0
 ; P9-NEXT:    bl nearbyintf
 ; P9-NEXT:    nop
-; P9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9-NEXT:    xxmrghd vs0, v30, vs1
 ; P9-NEXT:    lxv v31, 64(r1) # 16-byte Folded Reload
 ; P9-NEXT:    lxv v30, 48(r1) # 16-byte Folded Reload
@@ -327,11 +323,9 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric
 ; P8-NEXT:    nop
 ; P8-NEXT:    xxlor v30, f1, f1
 ; P8-NEXT:    xxswapd vs1, v31
-; P8-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; P8-NEXT:    bl nearbyint
 ; P8-NEXT:    nop
 ; P8-NEXT:    li r3, 144
-; P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-NEXT:    xxmrghd v2, v30, vs1
 ; P8-NEXT:    lxvd2x v31, r1, r3 # 16-byte Folded Reload
 ; P8-NEXT:    li r3, 128
@@ -358,10 +352,8 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric
 ; P9-NEXT:    nop
 ; P9-NEXT:    xscpsgndp v30, f1, f1
 ; P9-NEXT:    xxswapd vs1, v31
-; P9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; P9-NEXT:    bl nearbyint
 ; P9-NEXT:    nop
-; P9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P9-NEXT:    xxmrghd v2, v30, vs1
 ; P9-NEXT:    lxv v31, 48(r1) # 16-byte Folded Reload
 ; P9-NEXT:    lxv v30, 32(r1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e60f7f9b..19b4b1c9cdf95 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -70,7 +70,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    xscvspdpn 2, 0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 0, 1, 61
 ; CHECK-NEXT:    xscvspdpn 1, 62
 ; CHECK-NEXT:    xscvspdpn 2, 63
@@ -84,7 +83,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    xscvspdpn 2, 0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 0, 61, 1
 ; CHECK-NEXT:    lxv 63, 80(1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv 62, 64(1) # 16-byte Folded Reload
@@ -124,11 +122,8 @@ define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-NEXT:    xscpsgndp 61, 1, 1
 ; CHECK-NEXT:    xxswapd 1, 62
 ; CHECK-NEXT:    xxswapd 2, 63
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; CHECK-NEXT:    bl fmod
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 34, 61, 1
 ; CHECK-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
index 13f70f420400b..4256933300243 100644
--- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
+++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
@@ -666,7 +666,6 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 {
 ; P8-NEXT:    bl __gnu_h2f_ieee
 ; P8-NEXT:    nop
 ; P8-NEXT:    li r3, 80
-; P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-NEXT:    xxmrghd vs0, vs61, vs1
 ; P8-NEXT:    xxmrghd vs1, vs63, vs62
 ; P8-NEXT:    ld r30, 96(r1) # 8-byte Folded Reload
@@ -776,7 +775,6 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 {
 ; P8-NEXT:    nop
 ; P8-NEXT:    li r3, 80
 ; P8-NEXT:    xxmrghd vs35, vs63, vs62
-; P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; P8-NEXT:    xxmrghd vs34, vs61, vs1
 ; P8-NEXT:    ld r30, 96(r1) # 8-byte Folded Reload
 ; P8-NEXT:    lxvd2x vs63, r1, r3 # 16-byte Folded Reload
@@ -1005,11 +1003,10 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
 ; P8-NEXT:    stdu r1, -128(r1)
 ; P8-NEXT:    li r3, 48
 ; P8-NEXT:    std r0, 144(r1)
-; P8-NEXT:    std r27, 88(r1) # 8-byte Folded Spill
 ; P8-NEXT:    xxswapd vs1, vs34
+; P8-NEXT:    std r27, 88(r1) # 8-byte Folded Spill
 ; P8-NEXT:    std r28, 96(r1) # 8-byte Folded Spill
 ; P8-NEXT:    std r29, 104(r1) # 8-byte Folded Spill
-; P8-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; P8-NEXT:    std r30, 112(r1) # 8-byte Folded Spill
 ; P8-NEXT:    mr r30, r7
 ; P8-NEXT:    stxvd2x vs62, r1, r3 # 16-byte Folded Spill
@@ -1019,9 +1016,8 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
 ; P8-NEXT:    vmr v31, v3
 ; P8-NEXT:    bl __truncdfhf2
 ; P8-NEXT:    nop
-; P8-NEXT:    mr r29, r3
 ; P8-NEXT:    xxswapd vs1, vs63
-; P8-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; P8-NEXT:    mr r29, r3
 ; P8-NEXT:    bl __truncdfhf2
 ; P8-NEXT:    nop
 ; P8-NEXT:    xxlor f1, vs62, vs62
@@ -1238,7 +1234,6 @@ define half @PR40273(half) #0 {
 ; P8-NEXT:    vspltisw v2, 1
 ; P8-NEXT:    xvcvsxwdp vs1, vs34
 ; P8-NEXT:  .LBB20_2:
-; P8-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; P8-NEXT:    addi r1, r1, 32
 ; P8-NEXT:    ld r0, 16(r1)
 ; P8-NEXT:    mtlr r0
@@ -1253,12 +1248,10 @@ define half @PR40273(half) #0 {
 ; CHECK-NEXT:    mtfprwz f0, r3
 ; CHECK-NEXT:    xscvhpdp f0, f0
 ; CHECK-NEXT:    fcmpu cr0, f0, f1
-; CHECK-NEXT:    beq cr0, .LBB20_2
+; CHECK-NEXT:    beqlr cr0
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vspltisw v2, 1
 ; CHECK-NEXT:    xvcvsxwdp vs1, vs34
-; CHECK-NEXT:  .LBB20_2:
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; SOFT-LABEL: PR40273:
diff --git a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
index f1039df6f549a..78bdac021ac8a 100644
--- a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
+++ b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
@@ -18,7 +18,7 @@ define void @foo(i8 %x) {
 ; CHECK-LE-NEXT:    oris 0, 0, 65535
 ; CHECK-LE-NEXT:    ori 0, 0, 65504
 ; CHECK-LE-NEXT:    stdux 1, 1, 0
-; CHECK-LE-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-LE-NEXT:    .cfi_def_cfa_offset 4294967328
 ; CHECK-LE-NEXT:    li 4, 1
 ; CHECK-LE-NEXT:    addi 5, 1, 32
 ; CHECK-LE-NEXT:    stb 3, 32(1)
diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll
index ed8089b4b303e..151df6096b30b 100644
--- a/llvm/test/CodeGen/PowerPC/ldexp.ll
+++ b/llvm/test/CodeGen/PowerPC/ldexp.ll
@@ -13,7 +13,6 @@ define float @ldexp_f32(i8 zeroext %x) {
 ; CHECK-NEXT:    vspltisw v2, 1
 ; CHECK-NEXT:    mr r4, r3
 ; CHECK-NEXT:    xvcvsxwdp vs1, v2
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl ldexpf
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
@@ -36,7 +35,6 @@ define double @ldexp_f64(i8 zeroext %x) {
 ; CHECK-NEXT:    vspltisw v2, 1
 ; CHECK-NEXT:    mr r4, r3
 ; CHECK-NEXT:    xvcvsxwdp vs1, v2
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl ldexp
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
@@ -120,7 +118,6 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) {
 ; CHECK-NEXT:    vextuwrx r4, r3, v31
 ; CHECK-NEXT:    bl ldexpf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd vs0, v29, vs1
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    vextuwrx r4, r3, v31
@@ -135,7 +132,6 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) {
 ; CHECK-NEXT:    xscvspdpn f1, vs0
 ; CHECK-NEXT:    bl ldexpf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd vs0, vs1, v29
 ; CHECK-NEXT:    lxv v31, 80(r1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv v30, 64(r1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
index 8d03594fe1bfd..681f81d74794d 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
@@ -6,6 +6,13 @@
 ; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
 ; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
 
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
 declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
 declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
 declare void @foo()
diff --git a/llvm/test/CodeGen/PowerPC/p10-constants.ll b/llvm/test/CodeGen/PowerPC/p10-p11-constants.ll
similarity index 94%
rename from llvm/test/CodeGen/PowerPC/p10-constants.ll
rename to llvm/test/CodeGen/PowerPC/p10-p11-constants.ll
index 77472afd9c3d4..5f6a345bdd938 100644
--- a/llvm/test/CodeGen/PowerPC/p10-constants.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-p11-constants.ll
@@ -8,7 +8,17 @@
 ; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
 ; RUN:   FileCheck %s --check-prefix=CHECK32
 
-; These test cases aim to test constant materialization using the pli instruction on Power10.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr11 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr11 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu \
+; RUN:   -mcpu=pwr11 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK32
+
+; These test cases aim to test constant materialization using the pli instruction on Power10 and Power11.
 
 define  signext i32 @t_16BitsMinRequiring34Bits() {
 ; CHECK-LABEL: t_16BitsMinRequiring34Bits:
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
index 7373a328a4f05..842cb929541cf 100644
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
@@ -124,21 +124,18 @@ define dso_local double @testDoubleNonRepresentableScalar() local_unnamed_addr {
 ; CHECK-LE:       # %bb.0: # %entry
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 0, 1081435463
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 1, -1374389535
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-BE-LABEL: testDoubleNonRepresentableScalar:
 ; CHECK-NOPCREL-BE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 0, 1081435463
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 1, -1374389535
-; CHECK-NOPCREL-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-BE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LE-LABEL: testDoubleNonRepresentableScalar:
 ; CHECK-NOPCREL-LE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 0, 1081435463
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 1, -1374389535
-; CHECK-NOPCREL-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-LE-NEXT:    blr
 ;
 ; CHECK-NOPREFIX-LABEL: testDoubleNonRepresentableScalar:
@@ -151,7 +148,6 @@ define dso_local double @testDoubleNonRepresentableScalar() local_unnamed_addr {
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 0, 1081435463
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 1, -1374389535
-; CHECK-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-BE-NEXT:    blr
 entry:
   ret double 3.423300e+02
@@ -162,21 +158,18 @@ define dso_local float @testFloatDenormScalar() local_unnamed_addr {
 ; CHECK-LE:       # %bb.0: # %entry
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-BE-LABEL: testFloatDenormScalar:
 ; CHECK-NOPCREL-BE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-BE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LE-LABEL: testFloatDenormScalar:
 ; CHECK-NOPCREL-LE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-LE-NEXT:    blr
 ;
 ; CHECK-NOPREFIX-LABEL: testFloatDenormScalar:
@@ -189,7 +182,6 @@ define dso_local float @testFloatDenormScalar() local_unnamed_addr {
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-BE-NEXT:    blr
 entry:
   ret float 0x380B38FB80000000
@@ -200,21 +192,18 @@ define dso_local double @testFloatDenormToDoubleScalar() local_unnamed_addr {
 ; CHECK-LE:       # %bb.0: # %entry
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-LE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-BE-LABEL: testFloatDenormToDoubleScalar:
 ; CHECK-NOPCREL-BE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-BE-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LE-LABEL: testFloatDenormToDoubleScalar:
 ; CHECK-NOPCREL-LE:       # %bb.0: # %entry
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NOPCREL-LE-NEXT:    blr
 ;
 ; CHECK-NOPREFIX-LABEL: testFloatDenormToDoubleScalar:
@@ -227,7 +216,6 @@ define dso_local double @testFloatDenormToDoubleScalar() local_unnamed_addr {
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 0, 940259579
 ; CHECK-BE-NEXT:    xxsplti32dx vs1, 1, -2147483648
-; CHECK-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-BE-NEXT:    blr
 entry:
   ret double 0x380B38FB80000000
diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 87b8a64cc67bd..8f12b182283f5 100644
--- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -2416,7 +2416,6 @@ define double @getd0(<2 x double> %vd) {
 ; CHECK-LE-LABEL: getd0:
 ; CHECK-LE:       # %bb.0: # %entry
 ; CHECK-LE-NEXT:    xxswapd vs1, v2
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
 ;
 ; CHECK-AIX-LABEL: getd0:
@@ -2435,7 +2434,6 @@ define double @getd1(<2 x double> %vd) {
 ; CHECK-LABEL: getd1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xxswapd vs1, v2
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-LE-LABEL: getd1:
@@ -2446,7 +2444,6 @@ define double @getd1(<2 x double> %vd) {
 ; CHECK-AIX-LABEL: getd1:
 ; CHECK-AIX:       # %bb.0: # %entry
 ; CHECK-AIX-NEXT:    xxswapd 1, 34
-; CHECK-AIX-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-AIX-NEXT:    blr
 entry:
   %vecext = extractelement <2 x double> %vd, i32 1
@@ -2462,7 +2459,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
 ; CHECK-NEXT:    lvsl v3, 0, r3
 ; CHECK-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-NEXT:    xxlor vs1, v2, v2
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-LE-LABEL: getveld:
@@ -2474,7 +2470,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
 ; CHECK-LE-NEXT:    lvsl v3, 0, r3
 ; CHECK-LE-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-LE-NEXT:    xxlor vs1, v2, v2
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
 ;
 ; CHECK-AIX-LABEL: getveld:
@@ -2484,7 +2479,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
 ; CHECK-AIX-NEXT:    lvsl 3, 0, 3
 ; CHECK-AIX-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-AIX-NEXT:    xxlor 1, 34, 34
-; CHECK-AIX-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-AIX-NEXT:    blr
 entry:
   %vecext = extractelement <2 x double> %vd, i32 %i
diff --git a/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll b/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll
new file mode 100644
index 0000000000000..0c2d2829a6d4b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=powerpc %s -o - | FileCheck %s --check-prefixes=CHECK,PPC32
+; RUN: llc -mtriple=powerpc64 %s -o - | FileCheck %s --check-prefixes=CHECK,PPC64
+
+@a = global i32 0, align 4
+
+define void @f0() {
+; CHECK-LABEL: f0:
+; CHECK-NOT:   nop
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+; CHECK-NOT:   .section    __patchable_function_entries
+  ret void
+}
+
+define void @f1() "patchable-function-entry"="0" {
+; CHECK-LABEL: f1:
+; CHECK-NOT:   nop
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+; CHECK-NOT:   .section    __patchable_function_entries
+  ret void
+}
+
+define void @f2() "patchable-function-entry"="1" {
+; CHECK-LABEL: f2:
+; CHECK-LABEL-NEXT:  .Lfunc_begin2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    blr
+; CHECK:       .section    __patchable_function_entries
+; PPC32:       .p2align    2, 0x0
+; PPC64:       .p2align    3, 0x0
+; PPC32-NEXT:  .long   .Lfunc_begin2
+; PPC64-NEXT:  .quad   .Lfunc_begin2
+  ret void
+}
+
+define i32 @f3() "patchable-function-entry"="1" "patchable-function-prefix"="2" {
+; CHECK-LABEL: .Ltmp0:
+; CHECK-COUNT-2: nop
+; CHECK-LABEL: f3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nop
+; PPC32:         lis 3, a@ha
+; PPC32-NEXT:    lwz 3, a@l(3)
+; PPC64:         addis 3, 2, .LC0@toc@ha
+; PPC64-NEXT:    ld 3, .LC0@toc@l(3)
+; PPC64-NEXT:    lwz 3, 0(3)
+; CHECK:         blr
+; CHECK:       .section    __patchable_function_entries
+; PPC32:       .p2align    2, 0x0
+; PPC64:       .p2align    3, 0x0
+; PPC32-NEXT:  .long   .Ltmp0
+; PPC64-NEXT:  .quad   .Ltmp0
+entry:
+  %0 = load i32, ptr @a, align 4
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
index 541b2c46dd395..0b1047bb6cfbe 100644
--- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
@@ -175,7 +175,6 @@ define dso_local double @UsesX2AsConstPoolTOC() local_unnamed_addr {
 ; CHECK-ALL:       # %bb.0: # %entry
 ; CHECK-S-NEXT:    xxsplti32dx vs1, 0, 1078011044
 ; CHECK-S-NEXT:    xxsplti32dx vs1, 1, -337824948
-; CHECK-S-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-S-NEXT:    blr
 entry:
   ret double 0x404124A4EBDD334C
diff --git a/llvm/test/CodeGen/PowerPC/save-reg-params.ll b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
new file mode 100644
index 0000000000000..da4cd51c864ea
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
@@ -0,0 +1,865 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=32BIT
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=64BIT
+
+define void @i64_join(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+; 32BIT-LABEL: i64_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i64_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = add nsw i64 %b, %a
+  %add1 = add nsw i64 %add, %c
+  %add2 = add nsw i64 %add1, %d
+  %add3 = add nsw i64 %add2, %e
+  %add4 = add nsw i64 %add3, %f
+  %add5 = add nsw i64 %add4, %g
+  %add6 = add nsw i64 %add5, %h
+  %add7 = add nsw i64 %add6, %i
+  %add8 = add nsw i64 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i64_join_missing(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+; 32BIT-LABEL: i64_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i64_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add1 = mul nsw i64 %a, 3
+  %add2 = add nsw i64 %add1, %d
+  %add3 = add nsw i64 %add2, %e
+  %add4 = add nsw i64 %add3, %f
+  %add5 = add nsw i64 %add4, %g
+  %add6 = add nsw i64 %add5, %h
+  %add7 = add nsw i64 %add6, %i
+  %add8 = add nsw i64 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i32_join(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+; 32BIT-LABEL: i32_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i32_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = add nsw i32 %b, %a
+  %add1 = add nsw i32 %add, %c
+  %add2 = add nsw i32 %add1, %d
+  %add3 = add nsw i32 %add2, %e
+  %add4 = add nsw i32 %add3, %f
+  %add5 = add nsw i32 %add4, %g
+  %add6 = add nsw i32 %add5, %h
+  %add7 = add nsw i32 %add6, %i
+  %add8 = add nsw i32 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i32_join_missing(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+; 32BIT-LABEL: i32_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i32_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add1 = mul nsw i32 %a, 3
+  %add2 = add nsw i32 %add1, %d
+  %add3 = add nsw i32 %add2, %e
+  %add4 = add nsw i32 %add3, %f
+  %add5 = add nsw i32 %add4, %g
+  %add6 = add nsw i32 %add5, %h
+  %add7 = add nsw i32 %add6, %i
+  %add8 = add nsw i32 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f32_join(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+; 32BIT-LABEL: f32_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f32_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd float %a, %b
+  %add1 = fadd float %add, %c
+  %add2 = fadd float %add1, %d
+  %add3 = fadd float %add2, %e
+  %add4 = fadd float %add3, %f
+  %add5 = fadd float %add4, %g
+  %add6 = fadd float %add5, %h
+  %add7 = fadd float %add6, %i
+  %add8 = fadd float %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f32_join_missing(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+; 32BIT-LABEL: f32_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f32_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd float %a, %a
+  %add1 = fadd float %add, %a
+  %add2 = fadd float %add1, %d
+  %add3 = fadd float %add2, %e
+  %add4 = fadd float %add3, %f
+  %add5 = fadd float %add4, %g
+  %add6 = fadd float %add5, %h
+  %add7 = fadd float %add6, %i
+  %add8 = fadd float %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f64_join(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+; 32BIT-LABEL: f64_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f64_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %b
+  %add1 = fadd double %add, %c
+  %add2 = fadd double %add1, %d
+  %add3 = fadd double %add2, %e
+  %add4 = fadd double %add3, %f
+  %add5 = fadd double %add4, %g
+  %add6 = fadd double %add5, %h
+  %add7 = fadd double %add6, %i
+  %add8 = fadd double %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f64_missing(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+; 32BIT-LABEL: f64_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f64_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %a
+  %add1 = fadd double %add, %a
+  %add2 = fadd double %add1, %d
+  %add3 = fadd double %add2, %e
+  %add4 = fadd double %add3, %f
+  %add5 = fadd double %add4, %g
+  %add6 = fadd double %add5, %h
+  %add7 = fadd double %add6, %i
+  %add8 = fadd double %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @mixed_1(double %a, i64 %b, i64 %c, i32 signext %d, i64 %e, float %f, float %g, double %h, i32 signext %i, double %j) #0 {
+; 32BIT-LABEL: mixed_1:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -112(1)
+; 32BIT-NEXT:    stw 0, 120(1)
+; 32BIT-NEXT:    stfd 1, 136(1)
+; 32BIT-NEXT:    xsadddp 1, 1, 5
+; 32BIT-NEXT:    stw 24, 64(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    lwz 24, 168(1)
+; 32BIT-NEXT:    stw 25, 68(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    lwz 25, 188(1)
+; 32BIT-NEXT:    stw 26, 72(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 27, 76(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 28, 80(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 29, 84(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 30, 88(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 31, 92(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stfd 30, 96(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd 31, 104(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    fmr 31, 3
+; 32BIT-NEXT:    fmr 30, 2
+; 32BIT-NEXT:    mr 31, 10
+; 32BIT-NEXT:    mr 30, 9
+; 32BIT-NEXT:    mr 29, 8
+; 32BIT-NEXT:    mr 28, 7
+; 32BIT-NEXT:    mr 27, 6
+; 32BIT-NEXT:    mr 26, 5
+; 32BIT-NEXT:    stfd 5, 192(1)
+; 32BIT-NEXT:    stfd 4, 180(1)
+; 32BIT-NEXT:    stfs 3, 176(1)
+; 32BIT-NEXT:    stfs 2, 172(1)
+; 32BIT-NEXT:    stw 10, 164(1)
+; 32BIT-NEXT:    stw 9, 160(1)
+; 32BIT-NEXT:    stw 8, 156(1)
+; 32BIT-NEXT:    stw 7, 152(1)
+; 32BIT-NEXT:    stw 6, 148(1)
+; 32BIT-NEXT:    stw 5, 144(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    fadds 1, 30, 31
+; 32BIT-NEXT:    bl .consume_f32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addc 3, 29, 27
+; 32BIT-NEXT:    adde 4, 28, 26
+; 32BIT-NEXT:    srawi 5, 30, 31
+; 32BIT-NEXT:    addc 3, 3, 30
+; 32BIT-NEXT:    adde 5, 4, 5
+; 32BIT-NEXT:    addc 4, 3, 24
+; 32BIT-NEXT:    adde 3, 5, 31
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    add 3, 25, 30
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lfd 31, 104(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd 30, 96(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz 31, 92(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 30, 88(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 29, 84(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 28, 80(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 27, 76(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 26, 72(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 25, 68(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 24, 64(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    addi 1, 1, 112
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_1:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -176(1)
+; 64BIT-NEXT:    std 0, 192(1)
+; 64BIT-NEXT:    stfd 1, 224(1)
+; 64BIT-NEXT:    xsadddp 1, 1, 5
+; 64BIT-NEXT:    std 27, 120(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    lwz 27, 292(1)
+; 64BIT-NEXT:    std 28, 128(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 29, 136(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 30, 144(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 31, 152(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 30, 160(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 31, 168(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    fmr 31, 3
+; 64BIT-NEXT:    fmr 30, 2
+; 64BIT-NEXT:    mr 31, 7
+; 64BIT-NEXT:    mr 30, 6
+; 64BIT-NEXT:    mr 29, 5
+; 64BIT-NEXT:    mr 28, 4
+; 64BIT-NEXT:    stfd 5, 296(1)
+; 64BIT-NEXT:    stfd 4, 280(1)
+; 64BIT-NEXT:    stfs 3, 272(1)
+; 64BIT-NEXT:    stfs 2, 264(1)
+; 64BIT-NEXT:    std 7, 256(1)
+; 64BIT-NEXT:    std 6, 248(1)
+; 64BIT-NEXT:    std 5, 240(1)
+; 64BIT-NEXT:    std 4, 232(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    fadds 1, 30, 31
+; 64BIT-NEXT:    bl .consume_f32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    add 3, 29, 28
+; 64BIT-NEXT:    add 3, 3, 30
+; 64BIT-NEXT:    add 3, 3, 31
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    add 3, 27, 30
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lfd 31, 168(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd 30, 160(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 31, 152(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 30, 144(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 29, 136(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 28, 128(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 27, 120(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    addi 1, 1, 176
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %j
+  tail call void @consume_f64(double %add)
+  %add1 = fadd float %f, %g
+  tail call void @consume_f32(float %add1)
+  %add2 = add nsw i64 %c, %b
+  %conv = sext i32 %d to i64
+  %add3 = add nsw i64 %add2, %conv
+  %add4 = add nsw i64 %add3, %e
+  tail call void @consume_i64(i64 %add4)
+  %add5 = add nsw i32 %i, %d
+  tail call void @consume_i32(i32 signext %add5)
+  ret void
+}
+
+define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) #0 {
+; 32BIT-LABEL: mixed_2:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    li 5, 64
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    stw 4, 140(1)
+; 32BIT-NEXT:    stw 3, 136(1)
+; 32BIT-NEXT:    stxvd2x 34, 1, 5 # 16-byte Folded Spill
+; 32BIT-NEXT:    addi 5, 1, 120
+; 32BIT-NEXT:    stxvw4x 35, 0, 5
+; 32BIT-NEXT:    addi 5, 1, 104
+; 32BIT-NEXT:    stxvd2x 34, 0, 5
+; 32BIT-NEXT:    lwz 5, 120(1)
+; 32BIT-NEXT:    srawi 6, 5, 31
+; 32BIT-NEXT:    addc 4, 5, 4
+; 32BIT-NEXT:    adde 3, 6, 3
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    li 3, 64
+; 32BIT-NEXT:    lxvd2x 1, 1, 3 # 16-byte Folded Reload
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_2:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -144(1)
+; 64BIT-NEXT:    li 4, 128
+; 64BIT-NEXT:    std 0, 160(1)
+; 64BIT-NEXT:    std 3, 224(1)
+; 64BIT-NEXT:    stxvd2x 34, 1, 4 # 16-byte Folded Spill
+; 64BIT-NEXT:    addi 4, 1, 208
+; 64BIT-NEXT:    stxvw4x 35, 0, 4
+; 64BIT-NEXT:    addi 4, 1, 192
+; 64BIT-NEXT:    stxvd2x 34, 0, 4
+; 64BIT-NEXT:    lwa 4, 208(1)
+; 64BIT-NEXT:    add 3, 4, 3
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    li 3, 128
+; 64BIT-NEXT:    lxvd2x 1, 1, 3 # 16-byte Folded Reload
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 144
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %b, i64 0
+  %conv = sext i32 %vecext to i64
+  %add = add nsw i64 %conv, %c
+  tail call void @consume_i64(i64 %add)
+  %vecext1 = extractelement <2 x double> %a, i64 0
+  tail call void @consume_f64(double %vecext1)
+  ret void
+}
+
+%struct.foo = type <{ [3 x i32], double, [12 x i8], <4 x i32> }>
+
+define void @mixed_3(<2 x double> %a, i64 %b, double %c, float %d, i32 signext %e, double %f, ...) #0 {
+; 32BIT-LABEL: mixed_3:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    xsadddp 0, 34, 3
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    stfd 1, 128(1)
+; 32BIT-NEXT:    stw 29, 60(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    addi 3, 1, 104
+; 32BIT-NEXT:    lwz 29, 148(1)
+; 32BIT-NEXT:    stw 30, 64(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 31, 68(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    fmr 31, 2
+; 32BIT-NEXT:    mr 31, 10
+; 32BIT-NEXT:    mr 30, 9
+; 32BIT-NEXT:    xsadddp 1, 0, 1
+; 32BIT-NEXT:    stxvd2x 34, 0, 3
+; 32BIT-NEXT:    stfd 3, 144(1)
+; 32BIT-NEXT:    stfs 2, 136(1)
+; 32BIT-NEXT:    stw 10, 124(1)
+; 32BIT-NEXT:    stw 9, 120(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    mr 3, 30
+; 32BIT-NEXT:    mr 4, 31
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    fmr 1, 31
+; 32BIT-NEXT:    bl .consume_f32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    mr 3, 29
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz 31, 68(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 30, 64(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 29, 60(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+; 32BIT: NumOfGPRsSaved = 3
+;
+; 64BIT-LABEL: mixed_3:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -144(1)
+; 64BIT-NEXT:    xsadddp 0, 34, 3
+; 64BIT-NEXT:    std 0, 160(1)
+; 64BIT-NEXT:    stfd 1, 216(1)
+; 64BIT-NEXT:    addi 3, 1, 192
+; 64BIT-NEXT:    std 30, 120(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 31, 128(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 31, 136(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    mr 31, 8
+; 64BIT-NEXT:    fmr 31, 2
+; 64BIT-NEXT:    mr 30, 5
+; 64BIT-NEXT:    stxvd2x 34, 0, 3
+; 64BIT-NEXT:    xsadddp 1, 0, 1
+; 64BIT-NEXT:    std 10, 248(1)
+; 64BIT-NEXT:    stfd 3, 240(1)
+; 64BIT-NEXT:    std 8, 232(1)
+; 64BIT-NEXT:    stfs 2, 224(1)
+; 64BIT-NEXT:    std 5, 208(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    mr 3, 30
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    fmr 1, 31
+; 64BIT-NEXT:    bl .consume_f32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    mr 3, 31
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lfd 31, 136(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 31, 128(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 30, 120(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    addi 1, 1, 144
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+; 64BIT: NumOfGPRsSaved = 2
+entry:
+  %vecext = extractelement <2 x double> %a, i64 0
+  %add = fadd double %vecext, %f
+  %add1 = fadd double %add, %c
+  tail call void @consume_f64(double %add1)
+  tail call void @consume_i64(i64 %b)
+  tail call void @consume_f32(float %d)
+  tail call void @consume_i32(i32 signext %e)
+  ret void
+}
+
+define signext i32 @mixed_4(ptr byval(%struct.foo) align 16 %foo, i32 %sec) #0 {
+; 32BIT-LABEL: mixed_4:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    lfd 0, 44(1)
+; 32BIT-NEXT:    addi 3, 1, -4
+; 32BIT-NEXT:    xscvdpsxws 0, 0
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    stfiwx 0, 0, 3
+; 32BIT-NEXT:    lwz 3, -4(1)
+; 32BIT-NEXT:    lwz 4, 76(1)
+; 32BIT-NEXT:    add 3, 5, 3
+; 32BIT-NEXT:    add 3, 3, 4
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_4:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    lfd 0, 60(1)
+; 64BIT-NEXT:    addi 4, 1, -4
+; 64BIT-NEXT:    xscvdpsxws 0, 0
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    std 7, 80(1)
+; 64BIT-NEXT:    std 8, 88(1)
+; 64BIT-NEXT:    std 9, 96(1)
+; 64BIT-NEXT:    rldicl 3, 3, 32, 32
+; 64BIT-NEXT:    stfiwx 0, 0, 4
+; 64BIT-NEXT:    lwz 4, -4(1)
+; 64BIT-NEXT:    add 3, 3, 4
+; 64BIT-NEXT:    add 3, 3, 8
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    blr
+entry:
+  %0 = load i32, ptr %foo, align 16
+  %x = getelementptr inbounds i8, ptr %foo, i64 12
+  %1 = load double, ptr %x, align 4
+  %conv = fptosi double %1 to i32
+  %add = add nsw i32 %0, %conv
+  %2 = getelementptr inbounds i8, ptr %foo, i64 44
+  %vecext = load i32, ptr %2, align 4
+  %add1 = add nsw i32 %add, %vecext
+  ret i32 %add1
+}
+
+%struct.bar = type { i8, i32, <4 x i32>, ptr, i8 }
+
+define void @mixed_5(ptr byref(%struct.bar) align 16 %r, ptr byval(%struct.bar) align 16 %x, i32 signext %y, ptr byval(%struct.foo) align 16 %f) #0 {
+; 32BIT-LABEL: mixed_5:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    lfd 1, 172(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lwz 3, 100(1)
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_5:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    lfd 1, 252(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lwa 3, 180(1)
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %d = getelementptr inbounds i8, ptr %f, i64 12
+  %0 = load double, ptr %d, align 4
+  tail call void @consume_f64(double %0)
+  %i = getelementptr inbounds i8, ptr %x, i64 4
+  %1 = load i32, ptr %i, align 4
+  tail call void @consume_i32(i32 signext %1)
+  ret void
+}
+
+declare void @foo()
+declare void @consume_f64(double)
+declare void @consume_f32(float)
+declare void @consume_i64(i64)
+declare void @consume_i32(i32 signext)
+
+attributes #0 = { nofree noinline nounwind "save-reg-params" }
diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll
index ca4be83cc16ac..a48d6968aafbf 100644
--- a/llvm/test/CodeGen/PowerPC/select_const.ll
+++ b/llvm/test/CodeGen/PowerPC/select_const.ll
@@ -845,12 +845,10 @@ define double @sel_constants_frem_constant(i1 %cond) {
 ; ALL-NEXT:  # %bb.1:
 ; ALL-NEXT:    addis 3, 2, .LCPI48_0@toc@ha
 ; ALL-NEXT:    lfd 1, .LCPI48_0@toc@l(3)
-; ALL-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; ALL-NEXT:    blr
 ; ALL-NEXT:  .LBB48_2:
 ; ALL-NEXT:    vspltisw 2, -4
 ; ALL-NEXT:    xvcvsxwdp 1, 34
-; ALL-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; ALL-NEXT:    blr
   %sel = select i1 %cond, double -4.0, double 23.3
   %bo = frem double %sel, 5.1
diff --git a/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir b/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir
new file mode 100644
index 0000000000000..31407e0d44cfb
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s \
+# RUN:   -verify-coalescing --run-pass=register-coalescer -o - | FileCheck %s
+
+# Check that the register coalescer correctly handles merging live ranges over
+# SUBREG_TO_REG on PowerPC. The -verify-coalescing option will give an error if
+# this is incorrect.
+
+---
+name: check_subregs
+alignment:       16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x3
+
+    ; CHECK-LABEL: name: check_subregs
+    ; CHECK: liveins: $x3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x3
+    ; CHECK-NEXT: [[LFSUX:%[0-9]+]]:f8rc, dead [[LFSUX1:%[0-9]+]]:g8rc_and_g8rc_nox0 = LFSUX [[COPY]], [[COPY]]
+    ; CHECK-NEXT: undef [[FRSP:%[0-9]+]].sub_64:vslrc = FRSP [[LFSUX]], implicit $rm
+    ; CHECK-NEXT: [[XVCVDPSP:%[0-9]+]]:vrrc = XVCVDPSP [[FRSP]], implicit $rm
+    ; CHECK-NEXT: $v2 = COPY [[XVCVDPSP]]
+    ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $v2
+    %0:g8rc_and_g8rc_nox0 = COPY $x3
+    %1:f8rc, %2:g8rc_and_g8rc_nox0 = LFSUX %0, %0
+    %3:f4rc = FRSP killed %1, implicit $rm
+    %4:vslrc = SUBREG_TO_REG 1, %3, %subreg.sub_64
+    %5:vrrc = XVCVDPSP killed %4, implicit $rm
+    $v2 = COPY %5
+    BLR8 implicit $lr8, implicit $rm, implicit $v2
+...
diff --git a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
index e1fd6180b2662..cf69d3ad09878 100644
--- a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
+++ b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
@@ -5,21 +5,18 @@
 
 # Keep track of all of the lanemasks for various subregsiters.
 #
-# TODO: The mask for %6.sub_vsx1:accrc is the same as the mask for %10.sub_vsx1_then_sub_64:accrc.
-#       Ideally on PowerPC these masks should be different. To be addressed in a later patch.
-#
-# CHECK: %3 [80r,80d:0) 0@80r  L0000000000000004 [80r,80d:0) 0@80r  weight:0.000000e+00
-# CHECK: %4 [96r,96d:0) 0@96r  L0000000000000800 [96r,96d:0) 0@96r  weight:0.000000e+00
-# CHECK: %5 [112r,112d:0) 0@112r  L0000000000000004 [112r,112d:0) 0@112r  weight:0.000000e+00
-# CHECK: %6 [128r,128d:0) 0@128r  L0000000000000800 [128r,128d:0) 0@128r  weight:0.000000e+00
+# CHECK: %3 [80r,80d:0) 0@80r  L000000000000000C [80r,80d:0) 0@80r  weight:0.000000e+00
+# CHECK: %4 [96r,96d:0) 0@96r  L0000000000003000 [96r,96d:0) 0@96r  weight:0.000000e+00
+# CHECK: %5 [112r,112d:0) 0@112r  L000000000000000C [112r,112d:0) 0@112r  weight:0.000000e+00
+# CHECK: %6 [128r,128d:0) 0@128r  L0000000000003000 [128r,128d:0) 0@128r  weight:0.000000e+00
 # CHECK: %7 [144r,144d:0) 0@144r  L0000000000000004 [144r,144d:0) 0@144r  weight:0.000000e+00
-# CHECK: %8 [160r,160d:0) 0@160r  L0000000000000800 [160r,160d:0) 0@160r  weight:0.000000e+00
+# CHECK: %8 [160r,160d:0) 0@160r  L0000000000001000 [160r,160d:0) 0@160r  weight:0.000000e+00
 # CHECK: %9 [176r,176d:0) 0@176r  L0000000000000004 [176r,176d:0) 0@176r  weight:0.000000e+00
-# CHECK: %10 [192r,192d:0) 0@192r  L0000000000000800 [192r,192d:0) 0@192r  weight:0.000000e+00
-# CHECK: %11 [208r,208d:0) 0@208r  L0000000000001000 [208r,208d:0) 0@208r  weight:0.000000e+00
-# CHECK: %12 [224r,224d:0) 0@224r  L0000000000002000 [224r,224d:0) 0@224r  weight:0.000000e+00
-# CHECK: %13 [240r,240d:0) 0@240r  L0000000000000804 [240r,240d:0) 0@240r  weight:0.000000e+00
-# CHECK: %14 [256r,256d:0) 0@256r  L0000000000003000 [256r,256d:0) 0@256r  weight:0.000000e+00
+# CHECK: %10 [192r,192d:0) 0@192r  L0000000000001000 [192r,192d:0) 0@192r  weight:0.000000e+00
+# CHECK: %11 [208r,208d:0) 0@208r  L0000000000004000 [208r,208d:0) 0@208r  weight:0.000000e+00
+# CHECK: %12 [224r,224d:0) 0@224r  L0000000000010000 [224r,224d:0) 0@224r  weight:0.000000e+00
+# CHECK: %13 [240r,240d:0) 0@240r  L000000000000300C [240r,240d:0) 0@240r  weight:0.000000e+00
+# CHECK: %14 [256r,256d:0) 0@256r  L000000000003C000 [256r,256d:0) 0@256r  weight:0.000000e+00
 
 
 # CHECK:       0B bb.0
diff --git a/llvm/test/CodeGen/PowerPC/toc-float.ll b/llvm/test/CodeGen/PowerPC/toc-float.ll
index 1d6f1f71a2383..943edd5948429 100644
--- a/llvm/test/CodeGen/PowerPC/toc-float.ll
+++ b/llvm/test/CodeGen/PowerPC/toc-float.ll
@@ -9,14 +9,12 @@ define double @doubleConstant1() {
 ; CHECK-P9:       # %bb.0:
 ; CHECK-P9-NEXT:    vspltisw 2, 14
 ; CHECK-P9-NEXT:    xvcvsxwdp 1, 34
-; CHECK-P9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: doubleConstant1:
 ; CHECK-P8:       # %bb.0:
 ; CHECK-P8-NEXT:    vspltisw 2, 14
 ; CHECK-P8-NEXT:    xvcvsxwdp 1, 34
-; CHECK-P8-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-P8-NEXT:    blr
   ret double 1.400000e+01
 }
diff --git a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index 49c80a91f8f99..d0dda1a071754 100644
--- a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -122,7 +122,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
 ; CHECK-NEXT:    lvsl 3, 0, 3
 ; CHECK-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-NEXT:    xxlor 1, 34, 34
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: getd:
@@ -132,7 +131,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
 ; CHECK-BE-NEXT:    lvsl 3, 0, 3
 ; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-BE-NEXT:    xxlor 1, 34, 34
-; CHECK-BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-BE-NEXT:    blr
 ;
 ; CHECK-P7-LABEL: getd:
@@ -142,7 +140,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
 ; CHECK-P7-NEXT:    lvsl 3, 0, 3
 ; CHECK-P7-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-P7-NEXT:    xxlor 1, 34, 34
-; CHECK-P7-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-P7-NEXT:    blr
 entry:
   %vecext = extractelement <2 x double> %a, i32 %b
diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
index b98aed8616509..291a9c1f978da 100644
--- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
@@ -940,25 +940,21 @@ entry:
 define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
 ; CHECK-LABEL: testDoubleImm1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd v2, v2, vs1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: testDoubleImm1:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-BE-NEXT:    xxpermdi v2, vs1, v2, 1
 ; CHECK-BE-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: testDoubleImm1:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-P9-NEXT:    xxpermdi v2, vs1, v2, 1
 ; CHECK-P9-NEXT:    blr
 ;
 ; AIX-P8-LABEL: testDoubleImm1:
 ; AIX-P8:       # %bb.0: # %entry
-; AIX-P8-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; AIX-P8-NEXT:    xxpermdi v2, vs1, v2, 1
 ; AIX-P8-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index f217162782bfd..aedb1a9c65cf8 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -107,32 +107,20 @@ entry:
 define <3 x double> @constrained_vector_fdiv_v3f64(<3 x double> %x, <3 x double> %y) #0 {
 ; PC64LE-LABEL: constrained_vector_fdiv_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    xsdivdp 3, 3, 6
 ; PC64LE-NEXT:    xvdivdp 2, 1, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_fdiv_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    xsdivdp 3, 3, 6
 ; PC64LE9-NEXT:    xvdivdp 2, 1, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64(
@@ -217,13 +205,10 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 63
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl fmod
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -252,11 +237,8 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double>
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
 ; PC64LE9-NEXT:    xxswapd 2, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl fmod
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 61, 1
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
@@ -408,7 +390,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    fmr 2, 30
 ; PC64LE-NEXT:    bl fmod
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 29
 ; PC64LE-NEXT:    fmr 2, 31
@@ -423,7 +404,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    lfd 29, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lfd 28, 64(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    addi 1, 1, 96
 ; PC64LE-NEXT:    ld 0, 16(1)
 ; PC64LE-NEXT:    mtlr 0
@@ -451,7 +431,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    fmr 2, 30
 ; PC64LE9-NEXT:    bl fmod
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 29
 ; PC64LE9-NEXT:    fmr 2, 31
@@ -462,7 +441,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 29, 56(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 28, 48(1) # 8-byte Folded Reload
@@ -505,12 +483,9 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 59, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 60
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 62
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl fmod
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 59, 1
 ; PC64LE-NEXT:    xxlor 1, 61, 61
 ; PC64LE-NEXT:    xxlor 2, 63, 63
@@ -518,14 +493,11 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 60, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 61
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 63
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl fmod
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 112
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 60, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 96
@@ -562,11 +534,8 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE9-NEXT:    xscpsgndp 59, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 60
 ; PC64LE9-NEXT:    xxswapd 2, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl fmod
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 59, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 61, 61
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
@@ -575,11 +544,8 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE9-NEXT:    xscpsgndp 60, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 61
 ; PC64LE9-NEXT:    xxswapd 2, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl fmod
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 60, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 96(1) # 16-byte Folded Reload
@@ -704,32 +670,20 @@ entry:
 define <3 x double> @constrained_vector_fmul_v3f64(<3 x double> %x, <3 x double> %y) #0 {
 ; PC64LE-LABEL: constrained_vector_fmul_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    xsmuldp 3, 3, 6
 ; PC64LE-NEXT:    xvmuldp 2, 1, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_fmul_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    xsmuldp 3, 3, 6
 ; PC64LE9-NEXT:    xvmuldp 2, 1, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(
@@ -866,32 +820,20 @@ entry:
 define <3 x double> @constrained_vector_fadd_v3f64(<3 x double> %x, <3 x double> %y) #0 {
 ; PC64LE-LABEL: constrained_vector_fadd_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    xsadddp 3, 3, 6
 ; PC64LE-NEXT:    xvadddp 2, 1, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_fadd_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    xsadddp 3, 3, 6
 ; PC64LE9-NEXT:    xvadddp 2, 1, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
@@ -1028,32 +970,20 @@ entry:
 define <3 x double> @constrained_vector_fsub_v3f64(<3 x double> %x, <3 x double> %y) #0 {
 ; PC64LE-LABEL: constrained_vector_fsub_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    xssubdp 3, 3, 6
 ; PC64LE-NEXT:    xvsubdp 2, 1, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_fsub_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    xssubdp 3, 3, 6
 ; PC64LE9-NEXT:    xvsubdp 2, 1, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(
@@ -1175,26 +1105,18 @@ entry:
 define <3 x double> @constrained_vector_sqrt_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_sqrt_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xssqrtdp 3, 3
 ; PC64LE-NEXT:    xvsqrtdp 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_sqrt_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xssqrtdp 3, 3
 ; PC64LE9-NEXT:    xvsqrtdp 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64(
@@ -1277,13 +1199,10 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 63
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl pow
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -1312,11 +1231,8 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double>
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
 ; PC64LE9-NEXT:    xxswapd 2, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl pow
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 61, 1
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
@@ -1468,7 +1384,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    fmr 2, 30
 ; PC64LE-NEXT:    bl pow
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 29
 ; PC64LE-NEXT:    fmr 2, 31
@@ -1483,7 +1398,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    lfd 29, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lfd 28, 64(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    addi 1, 1, 96
 ; PC64LE-NEXT:    ld 0, 16(1)
 ; PC64LE-NEXT:    mtlr 0
@@ -1511,7 +1425,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    fmr 2, 30
 ; PC64LE9-NEXT:    bl pow
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 29
 ; PC64LE9-NEXT:    fmr 2, 31
@@ -1522,7 +1435,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 29, 56(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 28, 48(1) # 8-byte Folded Reload
@@ -1565,12 +1477,9 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 59, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 60
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 62
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl pow
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 59, 1
 ; PC64LE-NEXT:    xxlor 1, 61, 61
 ; PC64LE-NEXT:    xxlor 2, 63, 63
@@ -1578,14 +1487,11 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 60, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 61
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxswapd 2, 63
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    bl pow
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 112
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 60, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 96
@@ -1622,11 +1528,8 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE9-NEXT:    xscpsgndp 59, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 60
 ; PC64LE9-NEXT:    xxswapd 2, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl pow
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 59, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 61, 61
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
@@ -1635,11 +1538,8 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
 ; PC64LE9-NEXT:    xscpsgndp 60, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 61
 ; PC64LE9-NEXT:    xxswapd 2, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    bl pow
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 60, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 96(1) # 16-byte Folded Reload
@@ -1712,14 +1612,12 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 {
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    ld 30, 80(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
@@ -1744,13 +1642,11 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 {
 ; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -1894,7 +1790,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
 ; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    mr 4, 30
@@ -1907,7 +1802,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 80(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    ld 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 96
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -1934,7 +1828,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
 ; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    mr 4, 30
@@ -1945,7 +1838,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    ld 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 80
@@ -1981,27 +1873,23 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 {
 ; PC64LE-NEXT:    vmr 31, 3
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    mr 4, 30
 ; PC64LE-NEXT:    bl __powidf2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    ld 30, 96(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
@@ -2030,25 +1918,21 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 {
 ; PC64LE9-NEXT:    vmr 31, 3
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    mr 4, 30
 ; PC64LE9-NEXT:    bl __powidf2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -2116,11 +2000,9 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl sin
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -2143,10 +2025,8 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl sin
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -2269,7 +2149,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl sin
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl sin
@@ -2280,7 +2159,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -2303,7 +2181,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl sin
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl sin
@@ -2313,7 +2190,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -2346,22 +2222,18 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl sin
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl sin
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl sin
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -2388,20 +2260,16 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl sin
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl sin
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl sin
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -2467,11 +2335,9 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl cos
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -2494,10 +2360,8 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl cos
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -2620,7 +2484,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl cos
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl cos
@@ -2631,7 +2494,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -2654,7 +2516,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl cos
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl cos
@@ -2664,7 +2525,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -2697,22 +2557,18 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl cos
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl cos
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl cos
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -2739,20 +2595,16 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl cos
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl cos
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl cos
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -2818,11 +2670,9 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -2845,10 +2695,8 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -2971,7 +2819,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl exp
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl exp
@@ -2982,7 +2829,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -3005,7 +2851,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl exp
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl exp
@@ -3015,7 +2860,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -3048,22 +2892,18 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl exp
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -3090,20 +2930,16 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl exp
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -3169,11 +3005,9 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -3196,10 +3030,8 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -3322,7 +3154,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl exp2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl exp2
@@ -3333,7 +3164,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -3356,7 +3186,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl exp2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl exp2
@@ -3366,7 +3195,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -3399,22 +3227,18 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl exp2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl exp2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -3441,20 +3265,16 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl exp2
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl exp2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -3520,11 +3340,9 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -3547,10 +3365,8 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -3673,7 +3489,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl log
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl log
@@ -3684,7 +3499,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -3707,7 +3521,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl log
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl log
@@ -3717,7 +3530,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -3750,22 +3562,18 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl log
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -3792,20 +3600,16 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl log
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -3871,11 +3675,9 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log10
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -3898,10 +3700,8 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log10
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -4024,7 +3824,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl log10
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl log10
@@ -4035,7 +3834,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -4058,7 +3856,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl log10
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl log10
@@ -4068,7 +3865,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -4101,22 +3897,18 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log10
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl log10
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log10
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -4143,20 +3935,16 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log10
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl log10
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log10
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -4222,11 +4010,9 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -4249,10 +4035,8 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -4375,7 +4159,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl log2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl log2
@@ -4386,7 +4169,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -4409,7 +4191,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl log2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl log2
@@ -4419,7 +4200,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -4452,22 +4232,18 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log2
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl log2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl log2
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -4494,20 +4270,16 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl log2
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl log2
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -4615,26 +4387,18 @@ define <3 x float> @constrained_vector_rint_v3f32(<3 x float> %x) #0 {
 define <3 x double> @constrained_vector_rint_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_rint_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xsrdpic 3, 3
 ; PC64LE-NEXT:    xvrdpic 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_rint_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xsrdpic 3, 3
 ; PC64LE9-NEXT:    xvrdpic 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64(
@@ -4712,11 +4476,9 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl nearbyint
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -4739,10 +4501,8 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl nearbyint
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -4865,7 +4625,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl nearbyint
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl nearbyint
@@ -4876,7 +4635,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -4899,7 +4657,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl nearbyint
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl nearbyint
@@ -4909,7 +4666,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -4942,22 +4698,18 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl nearbyint
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl nearbyint
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl nearbyint
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -4984,20 +4736,16 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl nearbyint
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl nearbyint
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl nearbyint
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
@@ -5179,10 +4927,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    mflr 0
 ; PC64LE-NEXT:    stdu 1, -64(1)
 ; PC64LE-NEXT:    li 3, 48
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    std 0, 80(1)
@@ -5195,7 +4939,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    li 3, 48
 ; PC64LE-NEXT:    fmr 3, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 64
@@ -5207,10 +4950,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9:       # %bb.0: # %entry
 ; PC64LE9-NEXT:    mflr 0
 ; PC64LE9-NEXT:    stdu 1, -48(1)
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    std 0, 64(1)
@@ -5224,7 +4963,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    xxswapd 1, 63
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    addi 1, 1, 48
 ; PC64LE9-NEXT:    ld 0, 16(1)
 ; PC64LE9-NEXT:    mtlr 0
@@ -5421,10 +5159,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    mflr 0
 ; PC64LE-NEXT:    stdu 1, -64(1)
 ; PC64LE-NEXT:    li 3, 48
-; PC64LE-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE-NEXT:    std 0, 80(1)
@@ -5437,7 +5171,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE-NEXT:    li 3, 48
 ; PC64LE-NEXT:    fmr 3, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 64
@@ -5449,10 +5182,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9:       # %bb.0: # %entry
 ; PC64LE9-NEXT:    mflr 0
 ; PC64LE9-NEXT:    stdu 1, -48(1)
-; PC64LE9-NEXT:    # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT:    # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 5, 4
 ; PC64LE9-NEXT:    xxmrghd 1, 2, 1
 ; PC64LE9-NEXT:    std 0, 64(1)
@@ -5466,7 +5195,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
 ; PC64LE9-NEXT:    xxswapd 1, 63
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    addi 1, 1, 48
 ; PC64LE9-NEXT:    ld 0, 16(1)
 ; PC64LE9-NEXT:    mtlr 0
@@ -6792,26 +6520,18 @@ entry:
 define <3 x double> @constrained_vector_ceil_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_ceil_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xsrdpip 3, 3
 ; PC64LE-NEXT:    xvrdpip 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_ceil_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xsrdpip 3, 3
 ; PC64LE9-NEXT:    xvrdpip 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64(
@@ -6908,26 +6628,18 @@ entry:
 define <3 x double> @constrained_vector_floor_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_floor_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xsrdpim 3, 3
 ; PC64LE-NEXT:    xvrdpim 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_floor_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xsrdpim 3, 3
 ; PC64LE9-NEXT:    xvrdpim 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64(
@@ -7024,26 +6736,18 @@ entry:
 define <3 x double> @constrained_vector_round_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_round_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xsrdpi 3, 3
 ; PC64LE-NEXT:    xvrdpi 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_round_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xsrdpi 3, 3
 ; PC64LE9-NEXT:    xvrdpi 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %round = call <3 x double> @llvm.experimental.constrained.round.v3f64(
@@ -7139,26 +6843,18 @@ entry:
 define <3 x double> @constrained_vector_trunc_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_trunc_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE-NEXT:    xsrdpiz 3, 3
 ; PC64LE-NEXT:    xvrdpiz 2, 0
 ; PC64LE-NEXT:    xxswapd 1, 2
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_trunc_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 0, 2, 1
 ; PC64LE9-NEXT:    xsrdpiz 3, 3
 ; PC64LE9-NEXT:    xvrdpiz 2, 0
 ; PC64LE9-NEXT:    xxswapd 1, 2
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PC64LE9-NEXT:    blr
 entry:
   %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64(
@@ -8350,11 +8046,9 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 62, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl tan
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 64
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 48
@@ -8377,10 +8071,8 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl tan
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 34, 62, 1
 ; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
@@ -8503,7 +8195,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    fmr 1, 30
 ; PC64LE-NEXT:    bl tan
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE-NEXT:    fmr 1, 31
 ; PC64LE-NEXT:    bl tan
@@ -8514,7 +8205,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
 ; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
 ; PC64LE-NEXT:    xxlor 2, 63, 63
 ; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    addi 1, 1, 80
 ; PC64LE-NEXT:    ld 0, 16(1)
@@ -8537,7 +8227,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    fmr 1, 30
 ; PC64LE9-NEXT:    bl tan
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 63, 1, 63
 ; PC64LE9-NEXT:    fmr 1, 31
 ; PC64LE9-NEXT:    bl tan
@@ -8547,7 +8236,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
 ; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
 ; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
 ; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
 ; PC64LE9-NEXT:    addi 1, 1, 64
 ; PC64LE9-NEXT:    ld 0, 16(1)
@@ -8580,22 +8268,18 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 {
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 62
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl tan
 ; PC64LE-NEXT:    nop
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE-NEXT:    xxlor 1, 63, 63
 ; PC64LE-NEXT:    bl tan
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    xxlor 61, 1, 1
 ; PC64LE-NEXT:    xxswapd 1, 63
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE-NEXT:    bl tan
 ; PC64LE-NEXT:    nop
 ; PC64LE-NEXT:    li 3, 80
 ; PC64LE-NEXT:    vmr 2, 30
-; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
 ; PC64LE-NEXT:    li 3, 64
@@ -8622,20 +8306,16 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 {
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 62
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl tan
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 62, 61, 1
 ; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
 ; PC64LE9-NEXT:    bl tan
 ; PC64LE9-NEXT:    nop
 ; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
 ; PC64LE9-NEXT:    xxswapd 1, 63
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PC64LE9-NEXT:    bl tan
 ; PC64LE9-NEXT:    nop
-; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; PC64LE9-NEXT:    xxmrghd 35, 61, 1
 ; PC64LE9-NEXT:    vmr 2, 30
 ; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
index 4321b213b631c..190cf6fe1eaad 100644
--- a/llvm/test/CodeGen/PowerPC/vector-llrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
@@ -4465,9 +4465,8 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 136(r1)
@@ -4496,7 +4495,6 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4544,18 +4542,16 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; BE-NEXT:    vmr v31, v3
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v30
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    std r3, 136(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 152(r1)
@@ -4592,7 +4588,6 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v30
 ; CHECK-NEXT:    mtvsrd v30, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4602,7 +4597,6 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4670,36 +4664,32 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; BE-NEXT:    vmr v31, v5
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v28
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v29, v29
 ; BE-NEXT:    std r3, 136(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    xxswapd vs1, v29
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v30, v30
 ; BE-NEXT:    std r3, 152(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 160(r1)
 ; BE-NEXT:    xxswapd vs1, v30
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 160(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    std r3, 168(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 176(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 176(r1)
 ; BE-NEXT:    bl llrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 184(r1)
@@ -4752,7 +4742,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v28
 ; CHECK-NEXT:    mtvsrd v28, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4762,7 +4751,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v29
 ; CHECK-NEXT:    mtvsrd v29, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4772,7 +4760,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v30
 ; CHECK-NEXT:    mtvsrd v30, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4782,7 +4769,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl llrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
index 9667a26120149..b6d0bd5c05894 100644
--- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
@@ -4476,9 +4476,8 @@ define <2 x i64> @lrint_v2f64(<2 x double> %x) {
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 136(r1)
@@ -4507,7 +4506,6 @@ define <2 x i64> @lrint_v2f64(<2 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4555,18 +4553,16 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
 ; BE-NEXT:    vmr v31, v3
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v30
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    std r3, 136(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 152(r1)
@@ -4603,7 +4599,6 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v30
 ; CHECK-NEXT:    mtvsrd v30, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4613,7 +4608,6 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4681,36 +4675,32 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
 ; BE-NEXT:    vmr v31, v5
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    xxswapd vs1, v28
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 128(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v29, v29
 ; BE-NEXT:    std r3, 136(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    xxswapd vs1, v29
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 144(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v30, v30
 ; BE-NEXT:    std r3, 152(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 160(r1)
 ; BE-NEXT:    xxswapd vs1, v30
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 160(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    xxlor f1, v31, v31
 ; BE-NEXT:    std r3, 168(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
-; BE-NEXT:    std r3, 176(r1)
 ; BE-NEXT:    xxswapd vs1, v31
-; BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT:    std r3, 176(r1)
 ; BE-NEXT:    bl lrint
 ; BE-NEXT:    nop
 ; BE-NEXT:    std r3, 184(r1)
@@ -4763,7 +4753,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v28
 ; CHECK-NEXT:    mtvsrd v28, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4773,7 +4762,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v29
 ; CHECK-NEXT:    mtvsrd v29, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4783,7 +4771,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v30
 ; CHECK-NEXT:    mtvsrd v30, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
@@ -4793,7 +4780,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    xxswapd vs1, v31
 ; CHECK-NEXT:    mtvsrd v31, r3
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mtfprd f0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index edd3fb7b1754b..4a036a7868c1a 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -1081,14 +1081,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs0, v2
 ; PWR9LE-NEXT:    xvadddp vs0, v2, vs0
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v2f64_fast:
 ; PWR9BE:       # %bb.0: # %entry
 ; PWR9BE-NEXT:    xxswapd vs0, v2
 ; PWR9BE-NEXT:    xvadddp vs1, v2, vs0
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v2f64_fast:
@@ -1096,14 +1094,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs0, v2
 ; PWR10LE-NEXT:    xvadddp vs0, v2, vs0
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v2f64_fast:
 ; PWR10BE:       # %bb.0: # %entry
 ; PWR10BE-NEXT:    xxswapd vs0, v2
 ; PWR10BE-NEXT:    xvadddp vs1, v2, vs0
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a)
@@ -1203,7 +1199,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v4f64_fast:
@@ -1211,7 +1206,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvadddp vs0, v2, v3
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v4f64_fast:
@@ -1220,7 +1214,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v4f64_fast:
@@ -1228,7 +1221,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvadddp vs0, v2, v3
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a)
@@ -1378,7 +1370,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v8f64_fast:
@@ -1388,7 +1379,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v8f64_fast:
@@ -1399,7 +1389,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v8f64_fast:
@@ -1409,7 +1398,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a)
@@ -1659,7 +1647,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v16f64_fast:
@@ -1673,7 +1660,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvadddp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v16f64_fast:
@@ -1688,7 +1674,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v16f64_fast:
@@ -1702,7 +1687,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvadddp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a)
@@ -2188,7 +2172,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v32f64_fast:
@@ -2214,7 +2197,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvadddp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v32f64_fast:
@@ -2241,7 +2223,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v32f64_fast:
@@ -2267,7 +2248,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvadddp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a)
@@ -3297,7 +3277,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v64f64_fast:
@@ -3355,7 +3334,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v64f64_fast:
@@ -3414,7 +3392,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v64f64_fast:
@@ -3472,7 +3449,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a)
@@ -3660,8 +3636,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
 ; PWR9LE-NEXT:    stfd f1, 32(r1)
 ; PWR9LE-NEXT:    lxv vs1, 32(r1)
 ; PWR9LE-NEXT:    xxswapd vs2, vs1
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR9LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR9LE-NEXT:    addi r1, r1, 64
 ; PWR9LE-NEXT:    ld r0, 16(r1)
 ; PWR9LE-NEXT:    mtlr r0
@@ -3678,8 +3652,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
 ; PWR9BE-NEXT:    stfd f1, 112(r1)
 ; PWR9BE-NEXT:    lxv vs1, 112(r1)
 ; PWR9BE-NEXT:    xxswapd vs2, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR9BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR9BE-NEXT:    addi r1, r1, 144
 ; PWR9BE-NEXT:    ld r0, 16(r1)
 ; PWR9BE-NEXT:    mtlr r0
@@ -3695,8 +3667,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
 ; PWR10LE-NEXT:    stfd f1, 32(r1)
 ; PWR10LE-NEXT:    lxv vs1, 32(r1)
 ; PWR10LE-NEXT:    xxswapd vs2, vs1
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR10LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR10LE-NEXT:    addi r1, r1, 64
 ; PWR10LE-NEXT:    ld r0, 16(r1)
 ; PWR10LE-NEXT:    mtlr r0
@@ -3713,8 +3683,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
 ; PWR10BE-NEXT:    stfd f1, 112(r1)
 ; PWR10BE-NEXT:    lxv vs1, 112(r1)
 ; PWR10BE-NEXT:    xxswapd vs2, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR10BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR10BE-NEXT:    addi r1, r1, 144
 ; PWR10BE-NEXT:    ld r0, 16(r1)
 ; PWR10BE-NEXT:    mtlr r0
@@ -4077,8 +4045,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
 ; PWR9LE-NEXT:    stfd f1, 32(r1)
 ; PWR9LE-NEXT:    lxv vs1, 32(r1)
 ; PWR9LE-NEXT:    xxswapd vs2, vs1
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR9LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR9LE-NEXT:    addi r1, r1, 96
 ; PWR9LE-NEXT:    ld r0, 16(r1)
 ; PWR9LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
@@ -4133,8 +4099,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
 ; PWR9BE-NEXT:    lfd f28, 144(r1) # 8-byte Folded Reload
 ; PWR9BE-NEXT:    lfd f27, 136(r1) # 8-byte Folded Reload
 ; PWR9BE-NEXT:    lfd f26, 128(r1) # 8-byte Folded Reload
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR9BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR9BE-NEXT:    addi r1, r1, 176
 ; PWR9BE-NEXT:    ld r0, 16(r1)
 ; PWR9BE-NEXT:    mtlr r0
@@ -4174,8 +4138,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
 ; PWR10LE-NEXT:    stfd f1, 32(r1)
 ; PWR10LE-NEXT:    lxv vs1, 32(r1)
 ; PWR10LE-NEXT:    xxswapd vs2, vs1
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR10LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR10LE-NEXT:    addi r1, r1, 96
 ; PWR10LE-NEXT:    ld r0, 16(r1)
 ; PWR10LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
@@ -4230,8 +4192,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
 ; PWR10BE-NEXT:    lfd f26, 128(r1) # 8-byte Folded Reload
 ; PWR10BE-NEXT:    lxv vs1, 112(r1)
 ; PWR10BE-NEXT:    xxswapd vs2, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; PWR10BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; PWR10BE-NEXT:    addi r1, r1, 176
 ; PWR10BE-NEXT:    ld r0, 16(r1)
 ; PWR10BE-NEXT:    mtlr r0
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
index b1f72f694aea5..7d024144b7c32 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
@@ -635,14 +635,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs0, v2
 ; PWR9LE-NEXT:    xvmaxdp vs0, v2, vs0
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v2f64_fast:
 ; PWR9BE:       # %bb.0: # %entry
 ; PWR9BE-NEXT:    xxswapd vs0, v2
 ; PWR9BE-NEXT:    xvmaxdp vs1, v2, vs0
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v2f64_fast:
@@ -650,14 +648,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs0, v2
 ; PWR10LE-NEXT:    xvmaxdp vs0, v2, vs0
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v2f64_fast:
 ; PWR10BE:       # %bb.0: # %entry
 ; PWR10BE-NEXT:    xxswapd vs0, v2
 ; PWR10BE-NEXT:    xvmaxdp vs1, v2, vs0
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
@@ -704,7 +700,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v4f64_fast:
@@ -712,7 +707,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmaxdp vs0, v2, v3
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v4f64_fast:
@@ -721,7 +715,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v4f64_fast:
@@ -729,7 +722,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmaxdp vs0, v2, v3
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a)
@@ -786,7 +778,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v8f64_fast:
@@ -796,7 +787,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v8f64_fast:
@@ -807,7 +797,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v8f64_fast:
@@ -817,7 +806,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a)
@@ -894,7 +882,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v16f64_fast:
@@ -908,7 +895,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v16f64_fast:
@@ -923,7 +909,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v16f64_fast:
@@ -937,7 +922,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a)
@@ -1074,7 +1058,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v32f64_fast:
@@ -1100,7 +1083,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v32f64_fast:
@@ -1127,7 +1109,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v32f64_fast:
@@ -1153,7 +1134,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
index e806a702cd62b..9b01889b91f6e 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
@@ -635,14 +635,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs0, v2
 ; PWR9LE-NEXT:    xvmindp vs0, v2, vs0
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v2f64_fast:
 ; PWR9BE:       # %bb.0: # %entry
 ; PWR9BE-NEXT:    xxswapd vs0, v2
 ; PWR9BE-NEXT:    xvmindp vs1, v2, vs0
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v2f64_fast:
@@ -650,14 +648,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs0, v2
 ; PWR10LE-NEXT:    xvmindp vs0, v2, vs0
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v2f64_fast:
 ; PWR10BE:       # %bb.0: # %entry
 ; PWR10BE-NEXT:    xxswapd vs0, v2
 ; PWR10BE-NEXT:    xvmindp vs1, v2, vs0
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
@@ -704,7 +700,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v4f64_fast:
@@ -712,7 +707,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmindp vs0, v2, v3
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v4f64_fast:
@@ -721,7 +715,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v4f64_fast:
@@ -729,7 +722,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmindp vs0, v2, v3
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a)
@@ -786,7 +778,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v8f64_fast:
@@ -796,7 +787,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v8f64_fast:
@@ -807,7 +797,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v8f64_fast:
@@ -817,7 +806,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a)
@@ -894,7 +882,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v16f64_fast:
@@ -908,7 +895,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmindp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v16f64_fast:
@@ -923,7 +909,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v16f64_fast:
@@ -937,7 +922,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmindp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a)
@@ -1074,7 +1058,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v32f64_fast:
@@ -1100,7 +1083,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmindp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v32f64_fast:
@@ -1127,7 +1109,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v32f64_fast:
@@ -1153,7 +1134,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmindp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
index e123f5c2056d6..b566bb9d2d911 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
@@ -1081,14 +1081,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs0, v2
 ; PWR9LE-NEXT:    xvmuldp vs0, v2, vs0
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v2f64_fast:
 ; PWR9BE:       # %bb.0: # %entry
 ; PWR9BE-NEXT:    xxswapd vs0, v2
 ; PWR9BE-NEXT:    xvmuldp vs1, v2, vs0
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v2f64_fast:
@@ -1096,14 +1094,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs0, v2
 ; PWR10LE-NEXT:    xvmuldp vs0, v2, vs0
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v2f64_fast:
 ; PWR10BE:       # %bb.0: # %entry
 ; PWR10BE-NEXT:    xxswapd vs0, v2
 ; PWR10BE-NEXT:    xvmuldp vs1, v2, vs0
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a)
@@ -1203,7 +1199,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v4f64_fast:
@@ -1211,7 +1206,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmuldp vs0, v2, v3
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v4f64_fast:
@@ -1220,7 +1214,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v4f64_fast:
@@ -1228,7 +1221,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmuldp vs0, v2, v3
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a)
@@ -1378,7 +1370,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v8f64_fast:
@@ -1388,7 +1379,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmuldp vs0, vs1, vs0
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v8f64_fast:
@@ -1399,7 +1389,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v8f64_fast:
@@ -1409,7 +1398,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmuldp vs0, vs1, vs0
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a)
@@ -1659,7 +1647,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
 ; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR9LE-NEXT:    xxswapd vs1, vs0
-; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9LE-NEXT:    blr
 ;
 ; PWR9BE-LABEL: v16f64_fast:
@@ -1673,7 +1660,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR9BE-NEXT:    xvmuldp vs0, vs0, vs2
 ; PWR9BE-NEXT:    xxswapd vs1, vs0
 ; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR9BE-NEXT:    blr
 ;
 ; PWR10LE-LABEL: v16f64_fast:
@@ -1688,7 +1674,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
 ; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
 ; PWR10LE-NEXT:    xxswapd vs1, vs0
-; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10LE-NEXT:    blr
 ;
 ; PWR10BE-LABEL: v16f64_fast:
@@ -1702,7 +1687,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    xvmuldp vs0, vs0, vs2
 ; PWR10BE-NEXT:    xxswapd vs1, vs0
 ; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; PWR10BE-NEXT:    blr
 entry:
   %0 = call fast double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 32cbfd6d810ac..d1d29a0f884c6 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -1993,7 +1993,6 @@ define double @test63(<2 x double> %a) {
 ; CHECK-LE-LABEL: test63:
 ; CHECK-LE:       # %bb.0:
 ; CHECK-LE-NEXT:    xxswapd vs1, v2
-; CHECK-LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-LE-NEXT:    blr
   %v = extractelement <2 x double> %a, i32 0
   ret double %v
@@ -2006,13 +2005,11 @@ define double @test64(<2 x double> %a) {
 ; CHECK-LABEL: test64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xxswapd vs1, v2
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-REG-LABEL: test64:
 ; CHECK-REG:       # %bb.0:
 ; CHECK-REG-NEXT:    xxswapd vs1, v2
-; CHECK-REG-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
 ; CHECK-REG-NEXT:    blr
 ;
 ; CHECK-FISL-LABEL: test64:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir
new file mode 100644
index 0000000000000..12f218863e400
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir
@@ -0,0 +1,1043 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+
+  define <vscale x 1 x i8> @vload_nxv1i8(ptr %pa) #0 {
+    %va = load <vscale x 1 x i8>, ptr %pa, align 1
+    ret <vscale x 1 x i8> %va
+  }
+
+  define <vscale x 2 x i8> @vload_nxv2i8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i8>, ptr %pa, align 2
+    ret <vscale x 2 x i8> %va
+  }
+
+  define <vscale x 4 x i8> @vload_nxv4i8(ptr %pa) #0 {
+    %va = load <vscale x 4 x i8>, ptr %pa, align 4
+    ret <vscale x 4 x i8> %va
+  }
+
+  define <vscale x 8 x i8> @vload_nxv8i8(ptr %pa) #0 {
+    %va = load <vscale x 8 x i8>, ptr %pa, align 8
+    ret <vscale x 8 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nxv16i8(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 16
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 32 x i8> @vload_nxv32i8(ptr %pa) #0 {
+    %va = load <vscale x 32 x i8>, ptr %pa, align 32
+    ret <vscale x 32 x i8> %va
+  }
+
+  define <vscale x 64 x i8> @vload_nxv64i8(ptr %pa) #0 {
+    %va = load <vscale x 64 x i8>, ptr %pa, align 64
+    ret <vscale x 64 x i8> %va
+  }
+
+  define <vscale x 1 x i16> @vload_nxv1i16(ptr %pa) #0 {
+    %va = load <vscale x 1 x i16>, ptr %pa, align 2
+    ret <vscale x 1 x i16> %va
+  }
+
+  define <vscale x 2 x i16> @vload_nxv2i16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i16>, ptr %pa, align 4
+    ret <vscale x 2 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 8
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 8 x i16> @vload_nxv8i16(ptr %pa) #0 {
+    %va = load <vscale x 8 x i16>, ptr %pa, align 16
+    ret <vscale x 8 x i16> %va
+  }
+
+  define <vscale x 16 x i16> @vload_nxv16i16(ptr %pa) #0 {
+    %va = load <vscale x 16 x i16>, ptr %pa, align 32
+    ret <vscale x 16 x i16> %va
+  }
+
+  define <vscale x 32 x i16> @vload_nxv32i16(ptr %pa) #0 {
+    %va = load <vscale x 32 x i16>, ptr %pa, align 64
+    ret <vscale x 32 x i16> %va
+  }
+
+  define <vscale x 1 x i32> @vload_nxv1i32(ptr %pa) #0 {
+    %va = load <vscale x 1 x i32>, ptr %pa, align 4
+    ret <vscale x 1 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 8
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 4 x i32> @vload_nxv4i32(ptr %pa) #0 {
+    %va = load <vscale x 4 x i32>, ptr %pa, align 16
+    ret <vscale x 4 x i32> %va
+  }
+
+  define <vscale x 8 x i32> @vload_nxv8i32(ptr %pa) #0 {
+    %va = load <vscale x 8 x i32>, ptr %pa, align 32
+    ret <vscale x 8 x i32> %va
+  }
+
+  define <vscale x 16 x i32> @vload_nxv16i32(ptr %pa) #0 {
+    %va = load <vscale x 16 x i32>, ptr %pa, align 64
+    ret <vscale x 16 x i32> %va
+  }
+
+  define <vscale x 1 x i64> @vload_nxv1i64(ptr %pa) #0 {
+    %va = load <vscale x 1 x i64>, ptr %pa, align 8
+    ret <vscale x 1 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nxv2i64(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 16
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 4 x i64> @vload_nxv4i64(ptr %pa) #0 {
+    %va = load <vscale x 4 x i64>, ptr %pa, align 32
+    ret <vscale x 4 x i64> %va
+  }
+
+  define <vscale x 8 x i64> @vload_nxv8i64(ptr %pa) #0 {
+    %va = load <vscale x 8 x i64>, ptr %pa, align 64
+    ret <vscale x 8 x i64> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nxv16i8_align1(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 1
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nxv16i8_align2(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 2
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nxv16i8_align16(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 16
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nxv16i8_align64(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 64
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16_align1(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 1
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16_align2(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 2
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16_align4(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 4
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16_align8(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 8
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nxv4i16_align16(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 16
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32_align2(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 2
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32_align4(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 4
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32_align8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 8
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32_align16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 16
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nxv2i32_align256(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 256
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nxv2i64_align4(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 4
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nxv2i64_align8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 8
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nxv2i64_align16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 16
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nxv2i64_align32(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 32
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 1 x ptr> @vload_nxv1ptr(ptr %pa) #0 {
+    %va = load <vscale x 1 x ptr>, ptr %pa, align 4
+    ret <vscale x 1 x ptr> %va
+  }
+
+  define <vscale x 2 x ptr> @vload_nxv2ptr(ptr %pa) #0 {
+    %va = load <vscale x 2 x ptr>, ptr %pa, align 8
+    ret <vscale x 2 x ptr> %va
+  }
+
+  define <vscale x 8 x ptr> @vload_nxv8ptr(ptr %pa) #0 {
+    %va = load <vscale x 8 x ptr>, ptr %pa, align 32
+    ret <vscale x 8 x ptr> %va
+  }
+
+  attributes #0 = { "target-features"="+v" }
+
+...
+---
+name:            vload_nxv1i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv1i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv8i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv8i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv16i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv32i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv32i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 32 x s8>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nxv64i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv64i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 64 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 64 x s8>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nxv1i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv1i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv8i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv8i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv16i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nxv32i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv32i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 32 x s16>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nxv1i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv1i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv8i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv8i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nxv16i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nxv1i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv1i64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv4i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+    ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nxv8i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv8i64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+    ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nxv16i8_align1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i8_align1
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv16i8_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i8_align2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv16i8_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i8_align16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv16i8_align64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv16i8_align64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv4i16_align1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16_align1
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i16_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16_align2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i16_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16_align4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i16_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16_align8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv4i16_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv4i16_align16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32_align2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32_align4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32_align8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32_align16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i32_align256
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i32_align256
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2i64_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i64_align4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv2i64_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i64_align8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv2i64_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i64_align16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv2i64_align32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2i64_align32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+    ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nxv1ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv1ptr
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x p0>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv2ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv2ptr
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+    ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x p0>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nxv8ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; CHECK-LABEL: name: vload_nxv8ptr
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+    ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 8 x p0>)
+    PseudoRET implicit $v8m4
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir
new file mode 100644
index 0000000000000..b91d25509646f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir
@@ -0,0 +1,1043 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+
+  define void @vstore_nx1i8(ptr %pa, <vscale x 1 x i8> %b) #0 {
+    store <vscale x 1 x i8> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx2i8(ptr %pa, <vscale x 2 x i8> %b) #0 {
+    store <vscale x 2 x i8> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx4i8(ptr %pa, <vscale x 4 x i8> %b) #0 {
+    store <vscale x 4 x i8> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx8i8(ptr %pa, <vscale x 8 x i8> %b) #0 {
+    store <vscale x 8 x i8> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx16i8(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx32i8(ptr %pa, <vscale x 32 x i8> %b) #0 {
+    store <vscale x 32 x i8> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx64i8(ptr %pa, <vscale x 64 x i8> %b) #0 {
+    store <vscale x 64 x i8> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i16(ptr %pa, <vscale x 1 x i16> %b) #0 {
+    store <vscale x 1 x i16> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx2i16(ptr %pa, <vscale x 2 x i16> %b) #0 {
+    store <vscale x 2 x i16> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx4i16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx8i16(ptr %pa, <vscale x 8 x i16> %b) #0 {
+    store <vscale x 8 x i16> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx16i16(ptr %pa, <vscale x 16 x i16> %b) #0 {
+    store <vscale x 16 x i16> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx32i16(ptr %pa, <vscale x 32 x i16> %b) #0 {
+    store <vscale x 32 x i16> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i32(ptr %pa, <vscale x 1 x i32> %b) #0 {
+    store <vscale x 1 x i32> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i32(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx4i32(ptr %pa, <vscale x 4 x i32> %b) #0 {
+    store <vscale x 4 x i32> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx8i32(ptr %pa, <vscale x 8 x i32> %b) #0 {
+    store <vscale x 8 x i32> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx16i32(ptr %pa, <vscale x 16 x i32> %b) #0 {
+    store <vscale x 16 x i32> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i64(ptr %pa, <vscale x 1 x i64> %b) #0 {
+    store <vscale x 1 x i64> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i64(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx4i64(ptr %pa, <vscale x 4 x i64> %b) #0 {
+    store <vscale x 4 x i64> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx8i64(ptr %pa, <vscale x 8 x i64> %b) #0 {
+    store <vscale x 8 x i64> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx16i8_align1(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx16i8_align2(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx16i8_align16(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx16i8_align64(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx4i16_align1(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx4i16_align2(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx4i16_align4(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx4i16_align8(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx4i16_align16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i32_align2(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx2i32_align4(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i32_align8(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i32_align16(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i32_align256(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 256
+    ret void
+  }
+
+  define void @vstore_nx2i64_align4(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i64_align8(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i64_align16(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i64_align32(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx1ptr(ptr %pa, <vscale x 1 x ptr> %b) #0 {
+    store <vscale x 1 x ptr> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2ptr(ptr %pa, <vscale x 2 x ptr> %b) #0 {
+    store <vscale x 2 x ptr> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx8ptr(ptr %pa, <vscale x 8 x ptr> %b) #0 {
+    store <vscale x 8 x ptr> %b, ptr %pa, align 32
+    ret void
+  }
+
+  attributes #0 = { "target-features"="+v" }
+
+...
+---
+name:            vstore_nx1i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx1i8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s8>), %0(p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s8>), %0(p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s8>), %0(p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx8i8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx16i8
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx32i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; CHECK-LABEL: name: vstore_nx32i8
+    ; CHECK: liveins: $x10, $v8m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s8>) = COPY $v8m4
+    G_STORE %1(<vscale x 32 x s8>), %0(p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx64i8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; CHECK-LABEL: name: vstore_nx64i8
+    ; CHECK: liveins: $x10, $v8m8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v8m8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 64 x s8>) = COPY $v8m8
+    G_STORE %1(<vscale x 64 x s8>), %0(p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx1i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s16>), %0(p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s16>), %0(p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx8i16
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s16>) = COPY $v8m2
+    G_STORE %1(<vscale x 8 x s16>), %0(p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; CHECK-LABEL: name: vstore_nx16i16
+    ; CHECK: liveins: $x10, $v8m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s16>) = COPY $v8m4
+    G_STORE %1(<vscale x 16 x s16>), %0(p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx32i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; CHECK-LABEL: name: vstore_nx32i16
+    ; CHECK: liveins: $x10, $v8m8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v8m8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s16>) = COPY $v8m8
+    G_STORE %1(<vscale x 32 x s16>), %0(p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx1i32
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s32>), %0(p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx4i32
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s32>) = COPY $v8m2
+    G_STORE %1(<vscale x 4 x s32>), %0(p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; CHECK-LABEL: name: vstore_nx8i32
+    ; CHECK: liveins: $x10, $v8m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s32>) = COPY $v8m4
+    G_STORE %1(<vscale x 8 x s32>), %0(p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; CHECK-LABEL: name: vstore_nx16i32
+    ; CHECK: liveins: $x10, $v8m8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v8m8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s32>) = COPY $v8m8
+    G_STORE %1(<vscale x 16 x s32>), %0(p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx1i64
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s64>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s64>), %0(p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2i64
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; CHECK-LABEL: name: vstore_nx4i64
+    ; CHECK: liveins: $x10, $v8m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s64>) = COPY $v8m4
+    G_STORE %1(<vscale x 4 x s64>), %0(p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; CHECK-LABEL: name: vstore_nx8i64
+    ; CHECK: liveins: $x10, $v8m8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v8m8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s64>) = COPY $v8m8
+    G_STORE %1(<vscale x 8 x s64>), %0(p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx16i8_align1
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx16i8_align2
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx16i8_align16
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align64
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx16i8_align64
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16_align1
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 1)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16_align2
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16_align4
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16_align8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx4i16_align16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align2
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32_align2
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32_align4
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32_align8
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32_align16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align256
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx2i32_align256
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align4
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2i64_align4
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align8
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2i64_align8
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2i64_align16
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align32
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2i64_align32
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: vstore_nx1ptr
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x p0>) = COPY $v8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x p0>) = COPY $v8
+    G_STORE %1(<vscale x 1 x p0>), %0(p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: vstore_nx2ptr
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $v8m2
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x p0>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x p0>), %0(p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8ptr
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; CHECK-LABEL: name: vstore_nx8ptr
+    ; CHECK: liveins: $x10, $v8m8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x p0>) = COPY $v8m8
+    ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+    ; CHECK-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x p0>) = COPY $v8m8
+    G_STORE %1(<vscale x 8 x p0>), %0(p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+    PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir
new file mode 100644
index 0000000000000..5c02c720822b1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir
@@ -0,0 +1,1481 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN:   -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN:   -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck -check-prefix=RV64I %s
+--- |
+
+  define <vscale x 1 x i8> @vload_nx1i8(ptr %pa) #0 {
+    %va = load <vscale x 1 x i8>, ptr %pa, align 1
+    ret <vscale x 1 x i8> %va
+  }
+
+  define <vscale x 2 x i8> @vload_nx2i8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i8>, ptr %pa, align 2
+    ret <vscale x 2 x i8> %va
+  }
+
+  define <vscale x 4 x i8> @vload_nx4i8(ptr %pa) #0 {
+    %va = load <vscale x 4 x i8>, ptr %pa, align 4
+    ret <vscale x 4 x i8> %va
+  }
+
+  define <vscale x 8 x i8> @vload_nx8i8(ptr %pa) #0 {
+    %va = load <vscale x 8 x i8>, ptr %pa, align 8
+    ret <vscale x 8 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nx16i8(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 16
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 32 x i8> @vload_nx32i8(ptr %pa) #0 {
+    %va = load <vscale x 32 x i8>, ptr %pa, align 32
+    ret <vscale x 32 x i8> %va
+  }
+
+  define <vscale x 64 x i8> @vload_nx64i8(ptr %pa) #0 {
+    %va = load <vscale x 64 x i8>, ptr %pa, align 64
+    ret <vscale x 64 x i8> %va
+  }
+
+  define <vscale x 1 x i16> @vload_nx1i16(ptr %pa) #0 {
+    %va = load <vscale x 1 x i16>, ptr %pa, align 2
+    ret <vscale x 1 x i16> %va
+  }
+
+  define <vscale x 2 x i16> @vload_nx2i16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i16>, ptr %pa, align 4
+    ret <vscale x 2 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 8
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 8 x i16> @vload_nx8i16(ptr %pa) #0 {
+    %va = load <vscale x 8 x i16>, ptr %pa, align 16
+    ret <vscale x 8 x i16> %va
+  }
+
+  define <vscale x 16 x i16> @vload_nx16i16(ptr %pa) #0 {
+    %va = load <vscale x 16 x i16>, ptr %pa, align 32
+    ret <vscale x 16 x i16> %va
+  }
+
+  define <vscale x 32 x i16> @vload_nx32i16(ptr %pa) #0 {
+    %va = load <vscale x 32 x i16>, ptr %pa, align 64
+    ret <vscale x 32 x i16> %va
+  }
+
+  define <vscale x 1 x i32> @vload_nx1i32(ptr %pa) #0 {
+    %va = load <vscale x 1 x i32>, ptr %pa, align 4
+    ret <vscale x 1 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 8
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 4 x i32> @vload_nx4i32(ptr %pa) #0 {
+    %va = load <vscale x 4 x i32>, ptr %pa, align 16
+    ret <vscale x 4 x i32> %va
+  }
+
+  define <vscale x 8 x i32> @vload_nx8i32(ptr %pa) #0 {
+    %va = load <vscale x 8 x i32>, ptr %pa, align 32
+    ret <vscale x 8 x i32> %va
+  }
+
+  define <vscale x 16 x i32> @vload_nx16i32(ptr %pa) #0 {
+    %va = load <vscale x 16 x i32>, ptr %pa, align 64
+    ret <vscale x 16 x i32> %va
+  }
+
+  define <vscale x 1 x i64> @vload_nx1i64(ptr %pa) #0 {
+    %va = load <vscale x 1 x i64>, ptr %pa, align 8
+    ret <vscale x 1 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nx2i64(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 16
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 4 x i64> @vload_nx4i64(ptr %pa) #0 {
+    %va = load <vscale x 4 x i64>, ptr %pa, align 32
+    ret <vscale x 4 x i64> %va
+  }
+
+  define <vscale x 8 x i64> @vload_nx8i64(ptr %pa) #0 {
+    %va = load <vscale x 8 x i64>, ptr %pa, align 64
+    ret <vscale x 8 x i64> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nx16i8_align1(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 1
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nx16i8_align2(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 2
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nx16i8_align16(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 16
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 16 x i8> @vload_nx16i8_align64(ptr %pa) #0 {
+    %va = load <vscale x 16 x i8>, ptr %pa, align 64
+    ret <vscale x 16 x i8> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16_align1(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 1
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16_align2(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 2
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16_align4(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 4
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16_align8(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 8
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 4 x i16> @vload_nx4i16_align16(ptr %pa) #0 {
+    %va = load <vscale x 4 x i16>, ptr %pa, align 16
+    ret <vscale x 4 x i16> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32_align2(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 2
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32_align4(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 4
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32_align8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 8
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32_align16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 16
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i32> @vload_nx2i32_align256(ptr %pa) #0 {
+    %va = load <vscale x 2 x i32>, ptr %pa, align 256
+    ret <vscale x 2 x i32> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nx2i64_align4(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 4
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nx2i64_align8(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 8
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nx2i64_align16(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 16
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 2 x i64> @vload_nx2i64_align32(ptr %pa) #0 {
+    %va = load <vscale x 2 x i64>, ptr %pa, align 32
+    ret <vscale x 2 x i64> %va
+  }
+
+  define <vscale x 1 x ptr> @vload_nx1ptr(ptr %pa) #0 {
+    %va = load <vscale x 1 x ptr>, ptr %pa, align 4
+    ret <vscale x 1 x ptr> %va
+  }
+
+  define <vscale x 2 x ptr> @vload_nx2ptr(ptr %pa) #0 {
+    %va = load <vscale x 2 x ptr>, ptr %pa, align 8
+    ret <vscale x 2 x ptr> %va
+  }
+
+  define <vscale x 8 x ptr> @vload_nx8ptr(ptr %pa) #0 {
+    %va = load <vscale x 8 x ptr>, ptr %pa, align 32
+    ret <vscale x 8 x ptr> %va
+  }
+
+...
+---
+name:            vload_nx1i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx1i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx1i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx8i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx8i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx8i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx16i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx16i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx32i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx32i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m4
+    ;
+    ; RV64I-LABEL: name: vload_nx32i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 32 x s8>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nx64i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx64i8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m8
+    ;
+    ; RV64I-LABEL: name: vload_nx64i8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 64 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 64 x s8>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nx1i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx1i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx1i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx8i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx8i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx8i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx16i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m4
+    ;
+    ; RV64I-LABEL: name: vload_nx16i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nx32i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx32i16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m8
+    ;
+    ; RV64I-LABEL: name: vload_nx32i16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 32 x s16>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nx1i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx1i32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx1i32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx4i32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx8i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx8i32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m4
+    ;
+    ; RV64I-LABEL: name: vload_nx8i32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nx16i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m8
+    ;
+    ; RV64I-LABEL: name: vload_nx16i32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nx1i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx1i64
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx1i64
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i64
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx2i64
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx4i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i64
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+    ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m4
+    ;
+    ; RV64I-LABEL: name: vload_nx4i64
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+    ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+
+...
+---
+name:            vload_nx8i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx8i64
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+    ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m8
+    ;
+    ; RV64I-LABEL: name: vload_nx8i64
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+    ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+    $v8m8 = COPY %1(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+
+...
+---
+name:            vload_nx16i8_align1
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i8_align1
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx16i8_align1
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx16i8_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i8_align2
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx16i8_align2
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx16i8_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i8_align16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx16i8_align16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx16i8_align64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx16i8_align64
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx16i8_align64
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+    $v8m2 = COPY %1(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx4i16_align1
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16_align1
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; RV32I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16_align1
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; RV64I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %2:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+    %1:_(<vscale x 4 x s16>) = G_BITCAST %2(<vscale x 8 x s8>)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i16_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16_align2
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16_align2
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i16_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16_align4
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16_align4
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i16_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16_align8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16_align8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx4i16_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx4i16_align16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx4i16_align16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+    $v8 = COPY %1(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32_align2
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; RV32I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32_align2
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+    ; RV64I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %2:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+    %1:_(<vscale x 2 x s32>) = G_BITCAST %2(<vscale x 8 x s8>)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32_align4
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32_align4
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32_align8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32_align8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32_align16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32_align16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i32_align256
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i32_align256
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2i32_align256
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2i64_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i64_align4
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+    ; RV32I-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx2i64_align4
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+    ; RV64I-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %2:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+    %1:_(<vscale x 2 x s64>) = G_BITCAST %2(<vscale x 16 x s8>)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx2i64_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i64_align8
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx2i64_align8
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx2i64_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i64_align16
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx2i64_align16
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx2i64_align32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2i64_align32
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+    ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m2
+    ;
+    ; RV64I-LABEL: name: vload_nx2i64_align32
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+    ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m2
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...
+---
+name:            vload_nx1ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx1ptr
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx1ptr
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 1 x p0>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx2ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx2ptr
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+    ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+    ; RV32I-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64I-LABEL: name: vload_nx2ptr
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+    ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+    ; RV64I-NEXT: PseudoRET implicit $v8
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+    $v8 = COPY %1(<vscale x 2 x p0>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            vload_nx8ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10
+
+    ; RV32I-LABEL: name: vload_nx8ptr
+    ; RV32I: liveins: $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+    ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+    ; RV32I-NEXT: PseudoRET implicit $v8m4
+    ;
+    ; RV64I-LABEL: name: vload_nx8ptr
+    ; RV64I: liveins: $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+    ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+    ; RV64I-NEXT: PseudoRET implicit $v8m4
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+    $v8m4 = COPY %1(<vscale x 8 x p0>)
+    PseudoRET implicit $v8m4
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir
new file mode 100644
index 0000000000000..0bcef4efea36c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir
@@ -0,0 +1,1481 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN:   -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN:   -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck -check-prefix=RV64I %s
+--- |
+
+  define void @vstore_nx1i8(ptr %pa, <vscale x 1 x i8> %b) #0 {
+    store <vscale x 1 x i8> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx2i8(ptr %pa, <vscale x 2 x i8> %b) #0 {
+    store <vscale x 2 x i8> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx4i8(ptr %pa, <vscale x 4 x i8> %b) #0 {
+    store <vscale x 4 x i8> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx8i8(ptr %pa, <vscale x 8 x i8> %b) #0 {
+    store <vscale x 8 x i8> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx16i8(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx32i8(ptr %pa, <vscale x 32 x i8> %b) #0 {
+    store <vscale x 32 x i8> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx64i8(ptr %pa, <vscale x 64 x i8> %b) #0 {
+    store <vscale x 64 x i8> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i16(ptr %pa, <vscale x 1 x i16> %b) #0 {
+    store <vscale x 1 x i16> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx2i16(ptr %pa, <vscale x 2 x i16> %b) #0 {
+    store <vscale x 2 x i16> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx4i16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx8i16(ptr %pa, <vscale x 8 x i16> %b) #0 {
+    store <vscale x 8 x i16> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx16i16(ptr %pa, <vscale x 16 x i16> %b) #0 {
+    store <vscale x 16 x i16> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx32i16(ptr %pa, <vscale x 32 x i16> %b) #0 {
+    store <vscale x 32 x i16> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i32(ptr %pa, <vscale x 1 x i32> %b) #0 {
+    store <vscale x 1 x i32> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i32(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx4i32(ptr %pa, <vscale x 4 x i32> %b) #0 {
+    store <vscale x 4 x i32> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx8i32(ptr %pa, <vscale x 8 x i32> %b) #0 {
+    store <vscale x 8 x i32> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx16i32(ptr %pa, <vscale x 16 x i32> %b) #0 {
+    store <vscale x 16 x i32> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx1i64(ptr %pa, <vscale x 1 x i64> %b) #0 {
+    store <vscale x 1 x i64> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i64(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx4i64(ptr %pa, <vscale x 4 x i64> %b) #0 {
+    store <vscale x 4 x i64> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx8i64(ptr %pa, <vscale x 8 x i64> %b) #0 {
+    store <vscale x 8 x i64> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx16i8_align1(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx16i8_align2(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx16i8_align16(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx16i8_align64(ptr %pa, <vscale x 16 x i8> %b) #0 {
+    store <vscale x 16 x i8> %b, ptr %pa, align 64
+    ret void
+  }
+
+  define void @vstore_nx4i16_align1(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 1
+    ret void
+  }
+
+  define void @vstore_nx4i16_align2(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx4i16_align4(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx4i16_align8(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx4i16_align16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+    store <vscale x 4 x i16> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i32_align2(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 2
+    ret void
+  }
+
+  define void @vstore_nx2i32_align4(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i32_align8(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i32_align16(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i32_align256(ptr %pa, <vscale x 2 x i32> %b) #0 {
+    store <vscale x 2 x i32> %b, ptr %pa, align 256
+    ret void
+  }
+
+  define void @vstore_nx2i64_align4(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2i64_align8(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx2i64_align16(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 16
+    ret void
+  }
+
+  define void @vstore_nx2i64_align32(ptr %pa, <vscale x 2 x i64> %b) #0 {
+    store <vscale x 2 x i64> %b, ptr %pa, align 32
+    ret void
+  }
+
+  define void @vstore_nx1ptr(ptr %pa, <vscale x 1 x ptr> %b) #0 {
+    store <vscale x 1 x ptr> %b, ptr %pa, align 4
+    ret void
+  }
+
+  define void @vstore_nx2ptr(ptr %pa, <vscale x 2 x ptr> %b) #0 {
+    store <vscale x 2 x ptr> %b, ptr %pa, align 8
+    ret void
+  }
+
+  define void @vstore_nx8ptr(ptr %pa, <vscale x 8 x ptr> %b) #0 {
+    store <vscale x 8 x ptr> %b, ptr %pa, align 32
+    ret void
+  }
+
+...
+---
+name:            vstore_nx1i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx1i8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx1i8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s8>), %0(p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s8>), %0(p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s8>), %0(p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx8i8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx8i8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s8>) = COPY $v8
+    G_STORE %1(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx16i8
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i8
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx32i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; RV32I-LABEL: name: vstore_nx32i8
+    ; RV32I: liveins: $x10, $v8m4
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx32i8
+    ; RV64I: liveins: $x10, $v8m4
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s8>) = COPY $v8m4
+    G_STORE %1(<vscale x 32 x s8>), %0(p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx64i8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; RV32I-LABEL: name: vstore_nx64i8
+    ; RV32I: liveins: $x10, $v8m8
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 64 x s8>) = COPY $v8m8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx64i8
+    ; RV64I: liveins: $x10, $v8m8
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 64 x s8>) = COPY $v8m8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 64 x s8>) = COPY $v8m8
+    G_STORE %1(<vscale x 64 x s8>), %0(p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx1i16
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx1i16
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s16>), %0(p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i16
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i16
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s16>), %0(p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx8i16
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx8i16
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s16>) = COPY $v8m2
+    G_STORE %1(<vscale x 8 x s16>), %0(p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; RV32I-LABEL: name: vstore_nx16i16
+    ; RV32I: liveins: $x10, $v8m4
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i16
+    ; RV64I: liveins: $x10, $v8m4
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s16>) = COPY $v8m4
+    G_STORE %1(<vscale x 16 x s16>), %0(p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx32i16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; RV32I-LABEL: name: vstore_nx32i16
+    ; RV32I: liveins: $x10, $v8m8
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s16>) = COPY $v8m8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx32i16
+    ; RV64I: liveins: $x10, $v8m8
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s16>) = COPY $v8m8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 32 x s16>) = COPY $v8m8
+    G_STORE %1(<vscale x 32 x s16>), %0(p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx1i32
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx1i32
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s32>), %0(p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx4i32
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i32
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s32>) = COPY $v8m2
+    G_STORE %1(<vscale x 4 x s32>), %0(p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; RV32I-LABEL: name: vstore_nx8i32
+    ; RV32I: liveins: $x10, $v8m4
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx8i32
+    ; RV64I: liveins: $x10, $v8m4
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s32>) = COPY $v8m4
+    G_STORE %1(<vscale x 8 x s32>), %0(p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; RV32I-LABEL: name: vstore_nx16i32
+    ; RV32I: liveins: $x10, $v8m8
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s32>) = COPY $v8m8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i32
+    ; RV64I: liveins: $x10, $v8m8
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s32>) = COPY $v8m8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s32>) = COPY $v8m8
+    G_STORE %1(<vscale x 16 x s32>), %0(p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx1i64
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s64>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx1i64
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s64>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x s64>) = COPY $v8
+    G_STORE %1(<vscale x 1 x s64>), %0(p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx2i64
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i64
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; RV32I-LABEL: name: vstore_nx4i64
+    ; RV32I: liveins: $x10, $v8m4
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s64>) = COPY $v8m4
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i64
+    ; RV64I: liveins: $x10, $v8m4
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s64>) = COPY $v8m4
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s64>) = COPY $v8m4
+    G_STORE %1(<vscale x 4 x s64>), %0(p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8i64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m8
+
+    ; RV32I-LABEL: name: vstore_nx8i64
+    ; RV32I: liveins: $x10, $v8m8
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s64>) = COPY $v8m8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx8i64
+    ; RV64I: liveins: $x10, $v8m8
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s64>) = COPY $v8m8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x s64>) = COPY $v8m8
+    G_STORE %1(<vscale x 8 x s64>), %0(p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align1
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx16i8_align1
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i8_align1
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx16i8_align2
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i8_align2
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx16i8_align16
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i8_align16
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx16i8_align64
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx16i8_align64
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx16i8_align64
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 16 x s8>) = COPY $v8m2
+    G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align1
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16_align1
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+    ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16_align1
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+    ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    %2:_(<vscale x 8 x s8>) = G_BITCAST %1(<vscale x 4 x s16>)
+    G_STORE %2(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16_align2
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16_align2
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16_align4
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16_align4
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16_align8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16_align8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx4i16_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx4i16_align16
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx4i16_align16
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 4 x s16>) = COPY $v8
+    G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32_align2
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+    ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32_align2
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+    ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    %2:_(<vscale x 8 x s8>) = G_BITCAST %1(<vscale x 2 x s32>)
+    G_STORE %2(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32_align4
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32_align4
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32_align8
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32_align8
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32_align16
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32_align16
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i32_align256
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2i32_align256
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i32_align256
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s32>) = COPY $v8
+    G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align4
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx2i64_align4
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+    ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i64_align4
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+    ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    %2:_(<vscale x 16 x s8>) = G_BITCAST %1(<vscale x 2 x s64>)
+    G_STORE %2(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align8
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx2i64_align8
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i64_align8
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx2i64_align16
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i64_align16
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2i64_align32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m2
+
+    ; RV32I-LABEL: name: vstore_nx2i64_align32
+    ; RV32I: liveins: $x10, $v8m2
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2i64_align32
+    ; RV64I: liveins: $x10, $v8m2
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x s64>) = COPY $v8m2
+    G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+    PseudoRET
+
+...
+---
+name:            vstore_nx1ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx1ptr
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x p0>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx1ptr
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x p0>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 1 x p0>) = COPY $v8
+    G_STORE %1(<vscale x 1 x p0>), %0(p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx2ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $v8, $x10
+
+    ; RV32I-LABEL: name: vstore_nx2ptr
+    ; RV32I: liveins: $v8, $x10
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x p0>) = COPY $v8
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx2ptr
+    ; RV64I: liveins: $v8, $x10
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x p0>) = COPY $v8
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 2 x p0>) = COPY $v8
+    G_STORE %1(<vscale x 2 x p0>), %0(p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+    PseudoRET
+
+...
+---
+name:            vstore_nx8ptr
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x10, $v8m4
+
+    ; RV32I-LABEL: name: vstore_nx8ptr
+    ; RV32I: liveins: $x10, $v8m4
+    ; RV32I-NEXT: {{  $}}
+    ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x p0>) = COPY $v8m4
+    ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+    ; RV32I-NEXT: PseudoRET
+    ;
+    ; RV64I-LABEL: name: vstore_nx8ptr
+    ; RV64I: liveins: $x10, $v8m4
+    ; RV64I-NEXT: {{  $}}
+    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x p0>) = COPY $v8m4
+    ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+    ; RV64I-NEXT: PseudoRET
+    %0:_(p0) = COPY $x10
+    %1:_(<vscale x 8 x p0>) = COPY $v8m4
+    G_STORE %1(<vscale x 8 x p0>), %0(p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+    PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 88a35127c2926..3c63d3b11cbb7 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -25,7 +25,6 @@
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Instrument function entry/exit with calls to e.g. mcount() (post inlining)
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3611d92826235..d6d0cca6ddae7 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -54,7 +54,6 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
@@ -90,7 +89,7 @@
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       RISC-V DAG->DAG Pattern Instruction Selection
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT:       RISC-V Fold Masks
+; CHECK-NEXT:       RISC-V Vector Peephole Optimization
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Early Tail Duplication
 ; CHECK-NEXT:       Optimize machine instruction PHIs
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
index e6462ef93998f..b98d2d57a0b52 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=NOZACAS,RV32IA %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV32IA-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=NOZACAS,RV64IA %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV64IA-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV64IA-ZABHA %s
 
 ; Test cmpxchg followed by a branch on the cmpxchg success value to see if the
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
index e336246b450a4..c47db319fc2c3 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -3,25 +3,25 @@
 ; RUN:   | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-WMO %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO-ZACAS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-TSO %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-WMO-ZABHA %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-TSO-ZABHA %s
 
 define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index c7c9c339a8880..4223440b9cb88 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -12,22 +12,22 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS,RV64IA-TSO,RV64IA-TSO-NOZACAS %s
 
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO,RV32IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO,RV32IA-TSO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO,RV64IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO,RV64IA-TSO-ZACAS %s
 
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-NOZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-NOZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-ZACAS %s
 
 define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index f7268f6288127..775c17c3ceb3f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -3,13 +3,13 @@
 ; RUN:   | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-NOZACAS %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS %s
 
 define signext i8 @atomic_load_i8_unordered(ptr %a) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index c9fe1059b1378..4f841683a868c 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -120,7 +120,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
+; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV32ZAMA16B %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s
@@ -259,7 +259,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
+; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV64ZALASR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV64ZICFILP %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha %s -o - | FileCheck --check-prefix=RV64ZABHA %s
diff --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll
index b58aaab6aaf4a..fa88c3760e455 100644
--- a/llvm/test/CodeGen/RISCV/avgflooru.ll
+++ b/llvm/test/CodeGen/RISCV/avgflooru.ll
@@ -164,18 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    xor a4, a0, a2
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a0, a2, a1
+; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a2, a4, a1
+; RV32I-NEXT:    beq a2, a3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    slli a1, a1, 31
+; RV32I-NEXT:    srli a3, a2, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_fixed_i64:
@@ -195,18 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    xor a4, a0, a2
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a0, a2, a1
+; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a2, a4, a1
+; RV32I-NEXT:    beq a2, a3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    slli a1, a1, 31
+; RV32I-NEXT:    srli a3, a2, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_ext_i64:
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 101cb5aeeb094..622365cf13bce 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV32I
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV32XVENTANACONDOPS
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV
 ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND
@@ -20,6 +21,12 @@ define i64 @zero1(i64 %rs1, i1 zeroext %rc) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a0, a1
@@ -58,6 +65,12 @@ define i64 @zero2(i64 %rs1, i1 zeroext %rc) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a1
@@ -98,6 +111,13 @@ define i64 @zero_singlebit1(i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero_singlebit1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    bexti a2, a2, 12
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero_singlebit1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    bexti a1, a1, 12
@@ -145,6 +165,13 @@ define i64 @zero_singlebit2(i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero_singlebit2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    bexti a2, a2, 12
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero_singlebit2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    bexti a1, a1, 12
@@ -195,6 +222,16 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: add1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: add1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
@@ -246,6 +283,16 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: add2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: add2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
@@ -297,6 +344,16 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: add3:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: add3:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a0
@@ -348,6 +405,16 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: add4:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: add4:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
@@ -400,6 +467,17 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    sub a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: sub1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: sub1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
@@ -453,6 +531,17 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    sub a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: sub2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: sub2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a0
@@ -503,6 +592,15 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: or1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: or1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
@@ -551,6 +649,15 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: or2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: or2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
@@ -599,6 +706,15 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: or3:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: or3:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a0
@@ -647,6 +763,15 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: or4:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: or4:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
@@ -695,6 +820,15 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    xor a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: xor1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: xor1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
@@ -743,6 +877,15 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    xor a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: xor2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: xor2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
@@ -791,6 +934,15 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    xor a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: xor3:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: xor3:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a0
@@ -839,6 +991,15 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    xor a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: xor4:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: xor4:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
@@ -891,6 +1052,17 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: and1:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: and1:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    and a2, a1, a2
@@ -948,6 +1120,17 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: and2:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: and2:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    and a1, a1, a2
@@ -1005,6 +1188,17 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: and3:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: and3:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    and a2, a1, a2
@@ -1062,6 +1256,17 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: and4:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: and4:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    and a1, a1, a2
@@ -1119,6 +1324,17 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: basic:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: basic:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
@@ -1177,6 +1393,19 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: seteq:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: seteq:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -1241,6 +1470,19 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setne:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setne:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -1309,6 +1551,22 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setgt:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setgt:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    slt a0, a1, a0
@@ -1380,6 +1638,22 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setge:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setge:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    slt a0, a0, a1
@@ -1451,6 +1725,22 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setlt:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setlt:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    slt a0, a0, a1
@@ -1522,6 +1812,22 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setle:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setle:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    slt a0, a1, a0
@@ -1593,6 +1899,22 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setugt:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setugt:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    sltu a0, a1, a0
@@ -1664,6 +1986,22 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setuge:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setuge:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    sltu a0, a0, a1
@@ -1735,6 +2073,22 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setult:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setult:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    sltu a0, a0, a1
@@ -1806,6 +2160,22 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setule:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setule:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    sltu a0, a1, a0
@@ -1871,6 +2241,17 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: seteq_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: seteq_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
@@ -1928,6 +2309,17 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setne_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setne_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
@@ -1987,6 +2379,18 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: seteq_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 123
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: seteq_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, -123
@@ -2050,6 +2454,18 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setne_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 456
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setne_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, -456
@@ -2113,6 +2529,18 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: seteq_2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    binvi a0, a0, 11
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: seteq_2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, -2048
@@ -2177,6 +2605,19 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: seteq_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: seteq_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -2242,6 +2683,19 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setne_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setne_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -2302,6 +2756,15 @@ define i64 @zero1_seteq(i64 %a, i64 %b, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_seteq:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -2354,6 +2817,15 @@ define i64 @zero2_seteq(i64 %a, i64 %b, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_seteq:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -2406,6 +2878,15 @@ define i64 @zero1_setne(i64 %a, i64 %b, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_setne:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -2458,6 +2939,15 @@ define i64 @zero2_setne(i64 %a, i64 %b, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_setne:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xor a0, a0, a1
@@ -2507,6 +2997,13 @@ define i64 @zero1_seteq_zero(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_seteq_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
@@ -2551,6 +3048,13 @@ define i64 @zero2_seteq_zero(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_seteq_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
@@ -2595,6 +3099,13 @@ define i64 @zero1_setne_zero(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_setne_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
@@ -2639,6 +3150,13 @@ define i64 @zero2_setne_zero(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_zero:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_setne_zero:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
@@ -2686,6 +3204,15 @@ define i64 @zero1_seteq_constant(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -231
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_seteq_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, 231
@@ -2737,6 +3264,14 @@ define i64 @zero2_seteq_constant(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 546
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_seteq_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, -546
@@ -2787,6 +3322,14 @@ define i64 @zero1_setne_constant(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 321
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_setne_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, -321
@@ -2838,6 +3381,15 @@ define i64 @zero2_setne_constant(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_constant:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -654
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_setne_constant:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi a0, a0, 654
@@ -2890,6 +3442,15 @@ define i64 @zero1_seteq_neg2048(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_seteq_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -2942,6 +3503,15 @@ define i64 @zero2_seteq_neg2048(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_seteq_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -2994,6 +3564,15 @@ define i64 @zero1_setne_neg2048(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero1_setne_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -3046,6 +3625,15 @@ define i64 @zero2_setne_neg2048(i64 %a, i64 %rs1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_neg2048:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    not a1, a1
+; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: zero2_setne_neg2048:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
@@ -3125,6 +3713,28 @@ define void @sextw_removal_maskc(i1 %c, i32 signext %arg, i32 signext %arg1) nou
 ; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: sextw_removal_maskc:
+; RV32XVENTANACONDOPS:       # %bb.0: # %bb
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    mv s0, a2
+; RV32XVENTANACONDOPS-NEXT:    andi a0, a0, 1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc s1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:  .LBB56_1: # %bb2
+; RV32XVENTANACONDOPS-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, s1
+; RV32XVENTANACONDOPS-NEXT:    call bar
+; RV32XVENTANACONDOPS-NEXT:    sll s1, s1, s0
+; RV32XVENTANACONDOPS-NEXT:    bnez a0, .LBB56_1
+; RV32XVENTANACONDOPS-NEXT:  # %bb.2: # %bb7
+; RV32XVENTANACONDOPS-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: sextw_removal_maskc:
 ; RV64XVENTANACONDOPS:       # %bb.0: # %bb
 ; RV64XVENTANACONDOPS-NEXT:    addi sp, sp, -32
@@ -3276,6 +3886,28 @@ define void @sextw_removal_maskcn(i1 %c, i32 signext %arg, i32 signext %arg1) no
 ; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: sextw_removal_maskcn:
+; RV32XVENTANACONDOPS:       # %bb.0: # %bb
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    mv s0, a2
+; RV32XVENTANACONDOPS-NEXT:    andi a0, a0, 1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn s1, a1, a0
+; RV32XVENTANACONDOPS-NEXT:  .LBB57_1: # %bb2
+; RV32XVENTANACONDOPS-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32XVENTANACONDOPS-NEXT:    mv a0, s1
+; RV32XVENTANACONDOPS-NEXT:    call bar
+; RV32XVENTANACONDOPS-NEXT:    sll s1, s1, s0
+; RV32XVENTANACONDOPS-NEXT:    bnez a0, .LBB57_1
+; RV32XVENTANACONDOPS-NEXT:  # %bb.2: # %bb7
+; RV32XVENTANACONDOPS-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: sextw_removal_maskcn:
 ; RV64XVENTANACONDOPS:       # %bb.0: # %bb
 ; RV64XVENTANACONDOPS-NEXT:    addi sp, sp, -32
@@ -3398,6 +4030,14 @@ define i32 @setune_32(float %a, float %b, i32 %rs1, i32 %rs2) {
 ; RV64I-NEXT:  .LBB58_2:
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setune_32:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    feq.s a2, fa0, fa1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setune_32:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    feq.s a2, fa0, fa1
@@ -3452,6 +4092,17 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
 ; RV64I-NEXT:  .LBB59_2:
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: setune_64:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    feq.s a4, fa0, fa1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a4
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a4
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: setune_64:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    feq.s a2, fa0, fa1
@@ -3534,6 +4185,25 @@ define signext i16 @numsignbits(i16 signext %0, i16 signext %1, i16 signext %2,
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: numsignbits:
+; RV32XVENTANACONDOPS:       # %bb.0:
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn s0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or s0, s0, a2
+; RV32XVENTANACONDOPS-NEXT:    beqz a1, .LBB60_2
+; RV32XVENTANACONDOPS-NEXT:  # %bb.1:
+; RV32XVENTANACONDOPS-NEXT:    mv a0, s0
+; RV32XVENTANACONDOPS-NEXT:    call bat
+; RV32XVENTANACONDOPS-NEXT:  .LBB60_2:
+; RV32XVENTANACONDOPS-NEXT:    mv a0, s0
+; RV32XVENTANACONDOPS-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT:    addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: numsignbits:
 ; RV64XVENTANACONDOPS:       # %bb.0:
 ; RV64XVENTANACONDOPS-NEXT:    addi sp, sp, -16
@@ -3639,6 +4309,13 @@ define i64 @single_bit(i64 %x) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: single_bit:
+; RV32XVENTANACONDOPS:       # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT:    andi a2, a0, 1024
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: single_bit:
 ; RV64XVENTANACONDOPS:       # %bb.0: # %entry
 ; RV64XVENTANACONDOPS-NEXT:    andi a1, a0, 1024
@@ -3688,6 +4365,13 @@ define i64 @single_bit2(i64 %x) {
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: single_bit2:
+; RV32XVENTANACONDOPS:       # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT:    bexti a2, a0, 11
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: single_bit2:
 ; RV64XVENTANACONDOPS:       # %bb.0: # %entry
 ; RV64XVENTANACONDOPS-NEXT:    bexti a1, a0, 11
@@ -3738,6 +4422,14 @@ define i64 @single_bit3(i80 %x, i64 %y) {
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
+; RV32XVENTANACONDOPS-LABEL: single_bit3:
+; RV32XVENTANACONDOPS:       # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT:    lw a0, 8(a0)
+; RV32XVENTANACONDOPS-NEXT:    andi a3, a0, 1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a3
+; RV32XVENTANACONDOPS-NEXT:    ret
+;
 ; RV64XVENTANACONDOPS-LABEL: single_bit3:
 ; RV64XVENTANACONDOPS:       # %bb.0: # %entry
 ; RV64XVENTANACONDOPS-NEXT:    andi a1, a1, 1
diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index ced6ff66ef678..ee54501fe59a8 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -1497,3 +1497,51 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
   %2 = fsub contract double %c, %1
   ret double %2
 }
+
+define double @fsgnjx_f64(double %x, double %y) nounwind {
+; CHECKIFD-LABEL: fsgnjx_f64:
+; CHECKIFD:       # %bb.0:
+; CHECKIFD-NEXT:    fsgnjx.d fa0, fa1, fa0
+; CHECKIFD-NEXT:    ret
+;
+; RV32IZFINXZDINX-LABEL: fsgnjx_f64:
+; RV32IZFINXZDINX:       # %bb.0:
+; RV32IZFINXZDINX-NEXT:    fsgnjx.d a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    ret
+;
+; RV64IZFINXZDINX-LABEL: fsgnjx_f64:
+; RV64IZFINXZDINX:       # %bb.0:
+; RV64IZFINXZDINX-NEXT:    fsgnjx.d a0, a1, a0
+; RV64IZFINXZDINX-NEXT:    ret
+;
+; RV32I-LABEL: fsgnjx_f64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lui a0, 524288
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lui a1, 261888
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    li a0, 0
+; RV32I-NEXT:    call __muldf3
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: fsgnjx_f64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    srli a0, a0, 63
+; RV64I-NEXT:    slli a0, a0, 63
+; RV64I-NEXT:    li a2, 1023
+; RV64I-NEXT:    slli a2, a2, 52
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    call __muldf3
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %z = call double @llvm.copysign.f64(double 1.0, double %x)
+  %mul = fmul double %z, %y
+  ret double %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index 6024a29da33d2..feea4f19720b0 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -742,9 +742,8 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI12_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB12_2
+; RV32IZFINXZDINX-NEXT:    beqz a2, .LBB12_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1: # %start
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_2: # %start
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 927eee2e9e545..0839f61b2d793 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -102,30 +102,29 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI1_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI1_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB1_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB1_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI1_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI1_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI1_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB1_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -347,30 +346,29 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI5_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB5_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB5_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI5_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI5_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI5_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB5_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -592,30 +590,29 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI9_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI9_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB9_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB9_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI9_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI9_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI9_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB9_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -837,30 +834,29 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI13_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI13_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB13_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB13_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI13_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI13_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI13_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB13_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1082,30 +1078,29 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI17_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI17_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB17_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB17_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI17_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI17_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI17_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB17_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1327,30 +1322,29 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI21_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI21_1)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_1+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_1)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB21_2
+; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB21_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_2:
-; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI21_1)
-; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI21_1)(a1)
-; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI21_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB21_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_4:
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll
index 7a7ebe651c08e..931f73a94170a 100644
--- a/llvm/test/CodeGen/RISCV/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/float-arith.ll
@@ -1195,3 +1195,44 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
   %2 = fsub contract float %c, %1
   ret float %2
 }
+
+define float @fsgnjx_f32(float %x, float %y) nounwind {
+; CHECKIF-LABEL: fsgnjx_f32:
+; CHECKIF:       # %bb.0:
+; CHECKIF-NEXT:    fsgnjx.s fa0, fa1, fa0
+; CHECKIF-NEXT:    ret
+;
+; CHECKIZFINX-LABEL: fsgnjx_f32:
+; CHECKIZFINX:       # %bb.0:
+; CHECKIZFINX-NEXT:    fsgnjx.s a0, a1, a0
+; CHECKIZFINX-NEXT:    ret
+;
+; RV32I-LABEL: fsgnjx_f32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lui a2, 524288
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 260096
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    call __mulsf3
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: fsgnjx_f32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a2, 524288
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 260096
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    call __mulsf3
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %z = call float @llvm.copysign.f32(float 1.0, float %x)
+  %mul = fmul float %z, %y
+  ret float %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index f54adaa24b1bb..10e63e3a9f748 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -3104,3 +3104,112 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind {
   %2 = fsub contract half %c, %1
   ret half %2
 }
+
+define half @fsgnjx_f16(half %x, half %y) nounwind {
+; CHECKIZFH-LABEL: fsgnjx_f16:
+; CHECKIZFH:       # %bb.0:
+; CHECKIZFH-NEXT:    fsgnjx.h fa0, fa1, fa0
+; CHECKIZFH-NEXT:    ret
+;
+; CHECK-ZHINX-LABEL: fsgnjx_f16:
+; CHECK-ZHINX:       # %bb.0:
+; CHECK-ZHINX-NEXT:    fsgnjx.h a0, a1, a0
+; CHECK-ZHINX-NEXT:    ret
+;
+; RV32I-LABEL: fsgnjx_f16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li a2, 15
+; RV32I-NEXT:    slli a2, a2, 10
+; RV32I-NEXT:    or s1, a0, a2
+; RV32I-NEXT:    slli a0, a1, 16
+; RV32I-NEXT:    srli a0, a0, 16
+; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui a0, 12
+; RV32I-NEXT:    addi a0, a0, -1024
+; RV32I-NEXT:    and a0, s1, a0
+; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    call __mulsf3
+; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: fsgnjx_f16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    li a2, 15
+; RV64I-NEXT:    slli a2, a2, 10
+; RV64I-NEXT:    or s1, a0, a2
+; RV64I-NEXT:    slli a0, a1, 48
+; RV64I-NEXT:    srli a0, a0, 48
+; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lui a0, 12
+; RV64I-NEXT:    addiw a0, a0, -1024
+; RV64I-NEXT:    and a0, s1, a0
+; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    mv a1, s0
+; RV64I-NEXT:    call __mulsf3
+; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; CHECK-RV32-FSGNJ-LABEL: fsgnjx_f16:
+; CHECK-RV32-FSGNJ:       # %bb.0:
+; CHECK-RV32-FSGNJ-NEXT:    addi sp, sp, -16
+; CHECK-RV32-FSGNJ-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-RV32-FSGNJ-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
+; CHECK-RV32-FSGNJ-NEXT:    fsh fa0, 12(sp)
+; CHECK-RV32-FSGNJ-NEXT:    fsh fa5, 8(sp)
+; CHECK-RV32-FSGNJ-NEXT:    lbu a0, 13(sp)
+; CHECK-RV32-FSGNJ-NEXT:    lbu a1, 9(sp)
+; CHECK-RV32-FSGNJ-NEXT:    andi a0, a0, 128
+; CHECK-RV32-FSGNJ-NEXT:    andi a1, a1, 127
+; CHECK-RV32-FSGNJ-NEXT:    or a0, a1, a0
+; CHECK-RV32-FSGNJ-NEXT:    sb a0, 9(sp)
+; CHECK-RV32-FSGNJ-NEXT:    flh fa5, 8(sp)
+; CHECK-RV32-FSGNJ-NEXT:    fcvt.s.h fa4, fa1
+; CHECK-RV32-FSGNJ-NEXT:    fcvt.s.h fa5, fa5
+; CHECK-RV32-FSGNJ-NEXT:    fmul.s fa5, fa5, fa4
+; CHECK-RV32-FSGNJ-NEXT:    fcvt.h.s fa0, fa5
+; CHECK-RV32-FSGNJ-NEXT:    addi sp, sp, 16
+; CHECK-RV32-FSGNJ-NEXT:    ret
+;
+; CHECK-RV64-FSGNJ-LABEL: fsgnjx_f16:
+; CHECK-RV64-FSGNJ:       # %bb.0:
+; CHECK-RV64-FSGNJ-NEXT:    addi sp, sp, -16
+; CHECK-RV64-FSGNJ-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-RV64-FSGNJ-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
+; CHECK-RV64-FSGNJ-NEXT:    fsh fa0, 8(sp)
+; CHECK-RV64-FSGNJ-NEXT:    fsh fa5, 0(sp)
+; CHECK-RV64-FSGNJ-NEXT:    lbu a0, 9(sp)
+; CHECK-RV64-FSGNJ-NEXT:    lbu a1, 1(sp)
+; CHECK-RV64-FSGNJ-NEXT:    andi a0, a0, 128
+; CHECK-RV64-FSGNJ-NEXT:    andi a1, a1, 127
+; CHECK-RV64-FSGNJ-NEXT:    or a0, a1, a0
+; CHECK-RV64-FSGNJ-NEXT:    sb a0, 1(sp)
+; CHECK-RV64-FSGNJ-NEXT:    flh fa5, 0(sp)
+; CHECK-RV64-FSGNJ-NEXT:    fcvt.s.h fa4, fa1
+; CHECK-RV64-FSGNJ-NEXT:    fcvt.s.h fa5, fa5
+; CHECK-RV64-FSGNJ-NEXT:    fmul.s fa5, fa5, fa4
+; CHECK-RV64-FSGNJ-NEXT:    fcvt.h.s fa0, fa5
+; CHECK-RV64-FSGNJ-NEXT:    addi sp, sp, 16
+; CHECK-RV64-FSGNJ-NEXT:    ret
+  %z = call half @llvm.copysign.f16(half 1.0, half %x)
+  %mul = fmul half %z, %y
+  ret half %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
index 52d0dabf18839..6666d92feaac2 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
@@ -2252,3 +2252,53 @@ label:
   call void asm "lw zero, $0", "*A"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_A_with_local_3, %label), i32 2000))
   ret void
 }
+
+@_ZN5repro9MY_BUFFER17hb0f674501d5980a6E = external global <{ [16 x i8] }>
+
+; Address is not used by a memory constraint.
+define void @should_not_fold() {
+; RV32I-LABEL: should_not_fold:
+; RV32I:       # %bb.0: # %start
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lui a0, %hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-NEXT:    addi a0, a0, %lo(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-NEXT:    #APP
+; RV32I-NEXT:    ecall
+; RV32I-NEXT:    #NO_APP
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: should_not_fold:
+; RV64I:       # %bb.0: # %start
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    lui a0, %hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-NEXT:    addi a0, a0, %lo(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-NEXT:    #APP
+; RV64I-NEXT:    ecall
+; RV64I-NEXT:    #NO_APP
+; RV64I-NEXT:    ret
+;
+; RV32I-MEDIUM-LABEL: should_not_fold:
+; RV32I-MEDIUM:       # %bb.0: # %start
+; RV32I-MEDIUM-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-MEDIUM-NEXT:  .Lpcrel_hi39:
+; RV32I-MEDIUM-NEXT:    auipc a0, %pcrel_hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-MEDIUM-NEXT:    addi a0, a0, %pcrel_lo(.Lpcrel_hi39)
+; RV32I-MEDIUM-NEXT:    #APP
+; RV32I-MEDIUM-NEXT:    ecall
+; RV32I-MEDIUM-NEXT:    #NO_APP
+; RV32I-MEDIUM-NEXT:    ret
+;
+; RV64I-MEDIUM-LABEL: should_not_fold:
+; RV64I-MEDIUM:       # %bb.0: # %start
+; RV64I-MEDIUM-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-MEDIUM-NEXT:  .Lpcrel_hi39:
+; RV64I-MEDIUM-NEXT:    auipc a0, %pcrel_hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-MEDIUM-NEXT:    addi a0, a0, %pcrel_lo(.Lpcrel_hi39)
+; RV64I-MEDIUM-NEXT:    #APP
+; RV64I-MEDIUM-NEXT:    ecall
+; RV64I-MEDIUM-NEXT:    #NO_APP
+; RV64I-MEDIUM-NEXT:    ret
+start:
+  %0 = tail call ptr asm sideeffect alignstack "ecall", "=&{x10},0,~{vtype},~{vl},~{vxsat},~{vxrm},~{memory}"(ptr @_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index 3fa494e1a57dd..f9b9c8a69d431 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -54,7 +54,8 @@ define i1 @pr85190(i64 %a) {
 ; CHECK-ZBB-NEXT:    li a2, -1
 ; CHECK-ZBB-NEXT:    slli a2, a2, 63
 ; CHECK-ZBB-NEXT:    sub a2, a2, a1
-; CHECK-ZBB-NEXT:    slt a0, a0, a2
+; CHECK-ZBB-NEXT:    min a1, a2, zero
+; CHECK-ZBB-NEXT:    slt a0, a0, a1
 ; CHECK-ZBB-NEXT:    ret
   %or = or i64 %a, 7
   %cmp1 = icmp slt i64 %a, 0
diff --git a/llvm/test/CodeGen/RISCV/pr88365.ll b/llvm/test/CodeGen/RISCV/pr88365.ll
index 73010fdf40447..4e4dead98ee69 100644
--- a/llvm/test/CodeGen/RISCV/pr88365.ll
+++ b/llvm/test/CodeGen/RISCV/pr88365.ll
@@ -10,7 +10,7 @@ define void @foo() {
 ; CHECK-NEXT:    .cfi_offset ra, -4
 ; CHECK-NEXT:    li a0, -2048
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa_offset -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 4294967280
 ; CHECK-NEXT:    addi a0, sp, 4
 ; CHECK-NEXT:    call use
 ; CHECK-NEXT:    li a0, -2048
diff --git a/llvm/test/CodeGen/RISCV/pr89833.ll b/llvm/test/CodeGen/RISCV/pr89833.ll
deleted file mode 100644
index 54a985040e758..0000000000000
--- a/llvm/test/CodeGen/RISCV/pr89833.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
-
-declare void @llvm.riscv.masked.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8>, ptr, i64, <vscale x 16 x i1>)
-
-define void @test(<vscale x 16 x i16> %value, <vscale x 16 x i1> %mask) {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-NEXT:    vse8.v v12, (zero), v0.t
-; CHECK-NEXT:    ret
-  %trunc = trunc <vscale x 16 x i16> %value to <vscale x 16 x i8>
-  call void @llvm.riscv.masked.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8> %trunc, ptr null, i64 1, <vscale x 16 x i1> %mask)
-  ret void
-}
diff --git a/llvm/test/CodeGen/RISCV/pr94265.ll b/llvm/test/CodeGen/RISCV/pr94265.ll
index cb41e22381d19..f92cdb4ca7395 100644
--- a/llvm/test/CodeGen/RISCV/pr94265.ll
+++ b/llvm/test/CodeGen/RISCV/pr94265.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=riscv32-- -mattr=+v | FileCheck -check-prefix=RV32I %s
 ; RUN: llc < %s -mtriple=riscv64-- -mattr=+v | FileCheck -check-prefix=RV64I %s
+; RUN: llc < %s -mtriple=riscv32-- -mattr=+zve32x,+zvl128b | FileCheck -check-prefix=RV32I %s
+; RUN: llc < %s -mtriple=riscv64-- -mattr=+zve32x,+zvl128b | FileCheck -check-prefix=RV64I %s
 
 define <8 x i16> @PR94265(<8 x i32> %a0) #0 {
 ; RV32I-LABEL: PR94265:
diff --git a/llvm/test/CodeGen/RISCV/pr97304.ll b/llvm/test/CodeGen/RISCV/pr97304.ll
new file mode 100644
index 0000000000000..120a0e787384d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr97304.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s
+
+define i32 @_ZNK2cv12LMSolverImpl3runERKNS_17_InputOutputArrayE(i1 %cmp436) {
+  ; CHECK-LABEL: name: _ZNK2cv12LMSolverImpl3runERKNS_17_InputOutputArrayE
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY [[COPY]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY1]], 1
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 8, 0, implicit-def dead $x2, implicit $x2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprjalr = COPY $x0
+  ; CHECK-NEXT:   SD [[COPY3]], [[COPY2]], 0 :: (store (s64))
+  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 32
+  ; CHECK-NEXT:   BNE [[ANDI]], $x0, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.for.cond (call-frame-size 8):
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.for.cond (call-frame-size 8):
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[ADDI1]], %bb.1, [[ADDI]], %bb.2
+  ; CHECK-NEXT:   $x10 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x11 = COPY [[PHI]]
+  ; CHECK-NEXT:   $x12 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x13 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x14 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x15 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x16 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $x17 = COPY [[COPY3]]
+  ; CHECK-NEXT:   PseudoCALLIndirect [[COPY3]], csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x2, implicit-def $x10
+  ; CHECK-NEXT:   ADJCALLSTACKUP 8, 0, implicit-def dead $x2, implicit $x2
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x10
+  ; CHECK-NEXT:   PseudoBR %bb.1
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %conv = select i1 %cmp436, i32 32, i32 1
+  %call479 = call i32 (ptr, ...) null(ptr null, i32 %conv, i32 0, i32 0, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
+  br label %for.cond
+}
diff --git a/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll b/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll
new file mode 100644
index 0000000000000..0508016736004
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m -O2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -O3 < %s | FileCheck %s --check-prefix=CHECK-O3
+
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-size=2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-placement-threshold=2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-placement-threshold=6 < %s | FileCheck %s --check-prefix=CHECK-O3
+
+@a = external global i32
+@b = external global i32
+@c = external global i32
+
+declare i32 @foo(i32)
+
+define i32 @test(i32 %n) {
+; CHECK-O2-LABEL: test:
+; CHECK-O2:       # %bb.0: # %entry
+; CHECK-O2-NEXT:    sext.w a1, a0
+; CHECK-O2-NEXT:    blez a1, .LBB0_2
+; CHECK-O2-NEXT:  # %bb.1: # %if.then
+; CHECK-O2-NEXT:    lui a1, %hi(a)
+; CHECK-O2-NEXT:    lw a1, %lo(a)(a1)
+; CHECK-O2-NEXT:    mul a0, a1, a0
+; CHECK-O2-NEXT:    j .LBB0_3
+; CHECK-O2-NEXT:  .LBB0_2: # %if.else
+; CHECK-O2-NEXT:    lui a1, %hi(b)
+; CHECK-O2-NEXT:    lw a1, %lo(b)(a1)
+; CHECK-O2-NEXT:    divw a0, a1, a0
+; CHECK-O2-NEXT:  .LBB0_3: # %if.end
+; CHECK-O2-NEXT:    lui a1, %hi(c)
+; CHECK-O2-NEXT:    lw a1, %lo(c)(a1)
+; CHECK-O2-NEXT:    addi a0, a0, -1
+; CHECK-O2-NEXT:    mulw a0, a0, a1
+; CHECK-O2-NEXT:    tail foo
+;
+; CHECK-O3-LABEL: test:
+; CHECK-O3:       # %bb.0: # %entry
+; CHECK-O3-NEXT:    sext.w a1, a0
+; CHECK-O3-NEXT:    blez a1, .LBB0_2
+; CHECK-O3-NEXT:  # %bb.1: # %if.then
+; CHECK-O3-NEXT:    lui a1, %hi(a)
+; CHECK-O3-NEXT:    lw a1, %lo(a)(a1)
+; CHECK-O3-NEXT:    mul a0, a1, a0
+; CHECK-O3-NEXT:    lui a1, %hi(c)
+; CHECK-O3-NEXT:    lw a1, %lo(c)(a1)
+; CHECK-O3-NEXT:    addi a0, a0, -1
+; CHECK-O3-NEXT:    mulw a0, a0, a1
+; CHECK-O3-NEXT:    tail foo
+; CHECK-O3-NEXT:  .LBB0_2: # %if.else
+; CHECK-O3-NEXT:    lui a1, %hi(b)
+; CHECK-O3-NEXT:    lw a1, %lo(b)(a1)
+; CHECK-O3-NEXT:    divw a0, a1, a0
+; CHECK-O3-NEXT:    lui a1, %hi(c)
+; CHECK-O3-NEXT:    lw a1, %lo(c)(a1)
+; CHECK-O3-NEXT:    addi a0, a0, -1
+; CHECK-O3-NEXT:    mulw a0, a0, a1
+; CHECK-O3-NEXT:    tail foo
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %va = load i32, ptr @a
+  %mul = mul nsw i32 %va, %n
+  br label %if.end
+
+if.else:
+  %vb = load i32, ptr @b
+  %div = sdiv i32 %vb, %n
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %mul, %if.then ], [ %div, %if.else ]
+  %vc = load i32, ptr @c
+  %add = add nsw i32 %phi, -1
+  %arg = mul i32 %add, %vc
+  %ret = tail call i32 @foo(i32 %arg)
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7cb2452e1a148..20a0484464018 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -2879,8 +2879,8 @@ entry:
   ret ptr %5
 }
 
-define i64 @srli_slliw(i64 %1) {
-; RV64I-LABEL: srli_slliw:
+define i64 @srli_slliuw(i64 %1) {
+; RV64I-LABEL: srli_slliuw:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    li a1, 1
@@ -2889,7 +2889,7 @@ define i64 @srli_slliw(i64 %1) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64ZBA-LABEL: srli_slliw:
+; RV64ZBA-LABEL: srli_slliuw:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    srli a0, a0, 2
 ; RV64ZBA-NEXT:    slli.uw a0, a0, 4
@@ -2901,8 +2901,8 @@ entry:
   ret i64 %4
 }
 
-define i64 @srli_slliw_canonical(i64 %0) {
-; RV64I-LABEL: srli_slliw_canonical:
+define i64 @srli_slliuw_canonical(i64 %0) {
+; RV64I-LABEL: srli_slliuw_canonical:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    li a1, 1
@@ -2911,7 +2911,7 @@ define i64 @srli_slliw_canonical(i64 %0) {
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64ZBA-LABEL: srli_slliw_canonical:
+; RV64ZBA-LABEL: srli_slliuw_canonical:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    srli a0, a0, 2
 ; RV64ZBA-NEXT:    slli.uw a0, a0, 4
@@ -2949,3 +2949,46 @@ entry:
   %4 = shl i64 %3, 4
   ret i64 %4
 }
+
+define i64 @srli_slliuw_2(i64 %1) {
+; RV64I-LABEL: srli_slliuw_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srli a0, a0, 15
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    addi a1, a1, -8
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: srli_slliuw_2:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    srli a0, a0, 18
+; RV64ZBA-NEXT:    slli.uw a0, a0, 3
+; RV64ZBA-NEXT:    ret
+entry:
+  %2 = lshr i64 %1, 18
+  %3 = and i64 %2, 4294967295
+  %4 = shl i64 %3, 3
+  ret i64 %4
+}
+
+define i64 @srli_slliuw_canonical_2(i64 %0) {
+; RV64I-LABEL: srli_slliuw_canonical_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srli a0, a0, 15
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    addi a1, a1, -8
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: srli_slliuw_canonical_2:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    srli a0, a0, 18
+; RV64ZBA-NEXT:    slli.uw a0, a0, 3
+; RV64ZBA-NEXT:    ret
+entry:
+  %1 = lshr i64 %0, 15
+  %2 = and i64 %1, 34359738360
+  ret i64 %2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
index c99388cbdaf44..93fe66695b70e 100644
--- a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
+++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
@@ -27,8 +27,12 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
 ; OMIT-FP-NEXT:    addi a0, sp, 16
 ; OMIT-FP-NEXT:    vs4r.v v4, (a0) # Unknown-size Folded Spill
 ; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x61, 0x08, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 2 * vlenb
-; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 4 * vlenb
-; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 8 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2 @ cfa - 4 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x63, 0x08, 0x11, 0x7d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v3 @ cfa - 3 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4 @ cfa - 8 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x65, 0x08, 0x11, 0x79, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v5 @ cfa - 7 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x66, 0x08, 0x11, 0x7a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v6 @ cfa - 6 * vlenb
+; OMIT-FP-NEXT:    .cfi_escape 0x10, 0x67, 0x08, 0x11, 0x7b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v7 @ cfa - 5 * vlenb
 ; OMIT-FP-NEXT:    #APP
 ; OMIT-FP-NEXT:    #NO_APP
 ; OMIT-FP-NEXT:    csrr a0, vlenb
@@ -79,8 +83,12 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
 ; NO-OMIT-FP-NEXT:    addi a0, a0, -32
 ; NO-OMIT-FP-NEXT:    vs4r.v v4, (a0) # Unknown-size Folded Spill
 ; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x61, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 32 - 2 * vlenb
-; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 32 - 4 * vlenb
-; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 32 - 8 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2 @ cfa - 32 - 4 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x63, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v3 @ cfa - 32 - 3 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4 @ cfa - 32 - 8 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x65, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x79, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v5 @ cfa - 32 - 7 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x66, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v6 @ cfa - 32 - 6 * vlenb
+; NO-OMIT-FP-NEXT:    .cfi_escape 0x10, 0x67, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v7 @ cfa - 32 - 5 * vlenb
 ; NO-OMIT-FP-NEXT:    #APP
 ; NO-OMIT-FP-NEXT:    #NO_APP
 ; NO-OMIT-FP-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 05d6716e47192..cd2208e31eb6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
 
 declare <vscale x 1 x i8> @llvm.vp.abs.nxv1i8(<vscale x 1 x i8>, i1 immarg, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 5217148ba4f4e..5709de567c18d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index aadd9852af11e..6917d7e44a8e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvkb,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable.ll b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
index 5bca2eeb3fddf..e26c467f025bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/commutable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f,+d,+zvfh,+v \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f,+d,+zvfh,+v \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 ; vadd.vv
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index b5cafe410ae8d..01aac122d5957 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index 3b7952f9f5e6d..209a37bf66ae3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -7,7 +7,7 @@
 define half @extractelt_nxv1f16_0(<vscale x 1 x half> %v) {
 ; CHECK-LABEL: extractelt_nxv1f16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %r = extractelement <vscale x 1 x half> %v, i32 0
@@ -39,7 +39,7 @@ define half @extractelt_nxv1f16_idx(<vscale x 1 x half> %v, i32 zeroext %idx) {
 define half @extractelt_nxv2f16_0(<vscale x 2 x half> %v) {
 ; CHECK-LABEL: extractelt_nxv2f16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %r = extractelement <vscale x 2 x half> %v, i32 0
@@ -199,7 +199,7 @@ define half @extractelt_nxv32f16_idx(<vscale x 32 x half> %v, i32 zeroext %idx)
 define float @extractelt_nxv1f32_0(<vscale x 1 x float> %v) {
 ; CHECK-LABEL: extractelt_nxv1f32_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %r = extractelement <vscale x 1 x float> %v, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index df9949e617b80..4d6bc349ffacb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32NOM
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32M
 
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index a96cf5807e6c1..aba0ad022005b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64NOM
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64M
 
 define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index f3e823562888f..84da351de76ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
 
 declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
index 5252eb71c383d..f124d550df16d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -265,13 +265,13 @@ define i64 @bitcast_v1i64_i64(<1 x i64> %a) {
 define half @bitcast_v2i8_f16(<2 x i8> %a) {
 ; CHECK-LABEL: bitcast_v2i8_f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 ;
 ; ELEN32-LABEL: bitcast_v2i8_f16:
 ; ELEN32:       # %bb.0:
-; ELEN32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ELEN32-NEXT:    vfmv.f.s fa0, v8
 ; ELEN32-NEXT:    ret
   %b = bitcast <2 x i8> %a to half
@@ -281,13 +281,13 @@ define half @bitcast_v2i8_f16(<2 x i8> %a) {
 define half @bitcast_v1i16_f16(<1 x i16> %a) {
 ; CHECK-LABEL: bitcast_v1i16_f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 ;
 ; ELEN32-LABEL: bitcast_v1i16_f16:
 ; ELEN32:       # %bb.0:
-; ELEN32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ELEN32-NEXT:    vfmv.f.s fa0, v8
 ; ELEN32-NEXT:    ret
   %b = bitcast <1 x i16> %a to half
@@ -297,7 +297,7 @@ define half @bitcast_v1i16_f16(<1 x i16> %a) {
 define float @bitcast_v4i8_f32(<4 x i8> %a) {
 ; CHECK-LABEL: bitcast_v4i8_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 ;
@@ -313,7 +313,7 @@ define float @bitcast_v4i8_f32(<4 x i8> %a) {
 define float @bitcast_v2i16_f32(<2 x i16> %a) {
 ; CHECK-LABEL: bitcast_v2i16_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 ;
@@ -329,7 +329,7 @@ define float @bitcast_v2i16_f32(<2 x i16> %a) {
 define float @bitcast_v1i32_f32(<1 x i32> %a) {
 ; CHECK-LABEL: bitcast_v1i32_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 068c25b821002..90bedf87e04d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.bitreverse.v2i8(<2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index 1490738687322..6f2e86097d6ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index 3286c336a0fd1..809884cb18129 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
 
 define <4 x i8> @ret_v4i8(ptr %p) {
 ; CHECK-LABEL: ret_v4i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index b42fb8c686164..f5e6b92905193 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 5fceab869ab85..e90e52fba642b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.ctpop.v2i8(<2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index e7736e7f360f3..dfad7881066a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
index 369f90521cf00..bb2b57fbcc3b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
 
 define <2 x i16> @sextload_v2i1_v2i16(ptr %x) {
 ; CHECK-LABEL: sextload_v2i1_v2i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 386c71cf665ce..493481ad129d2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32ZBS
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZBS
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32ZBS
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZBS
 
 define i1 @extractelt_v1i1(ptr %x, i64 %idx) nounwind {
 ; CHECK-LABEL: extractelt_v1i1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
index 2fe08fc4c2129..5f5015c9ad169 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
@@ -19,7 +19,7 @@ define i16 @bitcast_v1f16_i16(<1 x half> %a) {
 define half @bitcast_v1f16_f16(<1 x half> %a) {
 ; CHECK-LABEL: bitcast_v1f16_f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %b = bitcast <1 x half> %a to half
@@ -49,7 +49,7 @@ define i32 @bitcast_v1f32_i32(<1 x float> %a) {
 define float @bitcast_v2f16_f32(<2 x half> %a) {
 ; CHECK-LABEL: bitcast_v2f16_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %b = bitcast <2 x half> %a to float
@@ -59,7 +59,7 @@ define float @bitcast_v2f16_f32(<2 x half> %a) {
 define float @bitcast_v1f32_f32(<1 x float> %a) {
 ; CHECK-LABEL: bitcast_v1f32_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %b = bitcast <1 x float> %a to float
@@ -237,7 +237,7 @@ define <1 x double> @bitcast_i64_v1f64(i64 %a) {
 define <1 x i16> @bitcast_f16_v1i16(half %a) {
 ; CHECK-LABEL: bitcast_f16_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast half %a to <1 x i16>
@@ -247,7 +247,7 @@ define <1 x i16> @bitcast_f16_v1i16(half %a) {
 define <1 x half> @bitcast_f16_v1f16(half %a) {
 ; CHECK-LABEL: bitcast_f16_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast half %a to <1 x half>
@@ -257,7 +257,7 @@ define <1 x half> @bitcast_f16_v1f16(half %a) {
 define <2 x i16> @bitcast_f32_v2i16(float %a) {
 ; CHECK-LABEL: bitcast_f32_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast float %a to <2 x i16>
@@ -267,7 +267,7 @@ define <2 x i16> @bitcast_f32_v2i16(float %a) {
 define <2 x half> @bitcast_f32_v2f16(float %a) {
 ; CHECK-LABEL: bitcast_f32_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast float %a to <2 x half>
@@ -277,7 +277,7 @@ define <2 x half> @bitcast_f32_v2f16(float %a) {
 define <1 x i32> @bitcast_f32_v1i32(float %a) {
 ; CHECK-LABEL: bitcast_f32_v1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast float %a to <1 x i32>
@@ -287,7 +287,7 @@ define <1 x i32> @bitcast_f32_v1i32(float %a) {
 define <1 x float> @bitcast_f32_v1f32(float %a) {
 ; CHECK-LABEL: bitcast_f32_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %b = bitcast float %a to <1 x float>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index bc46e7d264bc0..d92dc3edecb0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -187,63 +187,20 @@ define void @fp2ui_v2f16_v2i64(ptr %x, ptr %y) {
 declare <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half>)
 
 define void @fp2si_v2f64_v2i8(ptr %x, ptr %y) {
-; RV32-LABEL: fp2si_v2f64_v2i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32-NEXT:    fld fa4, %lo(.LCPI10_0)(a0)
-; RV32-NEXT:    lui a0, %hi(.LCPI10_1)
-; RV32-NEXT:    fld fa3, %lo(.LCPI10_1)(a0)
-; RV32-NEXT:    feq.d a0, fa5, fa5
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    fmax.d fa5, fa5, fa4
-; RV32-NEXT:    fmin.d fa5, fa5, fa3
-; RV32-NEXT:    fcvt.w.d a2, fa5, rtz
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    feq.d a2, fa5, fa5
-; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    fmax.d fa5, fa5, fa4
-; RV32-NEXT:    fmin.d fa5, fa5, fa3
-; RV32-NEXT:    fcvt.w.d a3, fa5, rtz
-; RV32-NEXT:    and a2, a2, a3
-; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vse8.v v8, (a1)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fp2si_v2f64_v2i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV64-NEXT:    fld fa4, %lo(.LCPI10_0)(a0)
-; RV64-NEXT:    lui a0, %hi(.LCPI10_1)
-; RV64-NEXT:    fld fa3, %lo(.LCPI10_1)(a0)
-; RV64-NEXT:    feq.d a0, fa5, fa5
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    fmax.d fa5, fa5, fa4
-; RV64-NEXT:    fmin.d fa5, fa5, fa3
-; RV64-NEXT:    fcvt.l.d a2, fa5, rtz
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    feq.d a2, fa5, fa5
-; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    fmax.d fa5, fa5, fa4
-; RV64-NEXT:    fmin.d fa5, fa5, fa3
-; RV64-NEXT:    fcvt.l.d a3, fa5, rtz
-; RV64-NEXT:    and a2, a2, a3
-; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vse8.v v8, (a1)
-; RV64-NEXT:    ret
+; CHECK-LABEL: fp2si_v2f64_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vnclip.wi v8, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
   %a = load <2 x double>, ptr %x
   %d = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> %a)
   store <2 x i8> %d, ptr %y
@@ -252,49 +209,20 @@ define void @fp2si_v2f64_v2i8(ptr %x, ptr %y) {
 declare <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double>)
 
 define void @fp2ui_v2f64_v2i8(ptr %x, ptr %y) {
-; RV32-LABEL: fp2ui_v2f64_v2i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; RV32-NEXT:    vfmv.f.s fa4, v9
-; RV32-NEXT:    fcvt.d.w fa3, zero
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT:    vfmv.f.s fa4, v8
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa5, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a2, fa5, rtz
-; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vse8.v v8, (a1)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fp2ui_v2f64_v2i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    lui a0, %hi(.LCPI11_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; RV64-NEXT:    vfmv.f.s fa4, v9
-; RV64-NEXT:    fmv.d.x fa3, zero
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT:    vfmv.f.s fa4, v8
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa5, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a2, fa5, rtz
-; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vse8.v v8, (a1)
-; RV64-NEXT:    ret
+; CHECK-LABEL: fp2ui_v2f64_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
   %a = load <2 x double>, ptr %x
   %d = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> %a)
   store <2 x i8> %d, ptr %y
@@ -304,203 +232,20 @@ declare <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double>)
 
 define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ;
-; RV32-LABEL: fp2si_v8f64_v8i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa3, v10
-; RV32-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
-; RV32-NEXT:    lui a0, %hi(.LCPI12_1)
-; RV32-NEXT:    fld fa4, %lo(.LCPI12_1)(a0)
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vfmv.f.s fa3, v8
-; RV32-NEXT:    feq.d a2, fa3, fa3
-; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a3, fa3, rtz
-; RV32-NEXT:    and a2, a2, a3
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a2
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vfmv.f.s fa3, v12
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vfmv.f.s fa3, v8
-; RV32-NEXT:    feq.d a2, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a3, fa3, rtz
-; RV32-NEXT:    fld fa3, 40(sp)
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    neg a0, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    feq.d a2, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a3, fa3, rtz
-; RV32-NEXT:    fld fa3, 32(sp)
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    neg a0, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    feq.d a2, fa3, fa3
-; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a3, fa3, rtz
-; RV32-NEXT:    fld fa3, 48(sp)
-; RV32-NEXT:    and a2, a2, a3
-; RV32-NEXT:    vmv.v.x v9, a2
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
-; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    fld fa3, 56(sp)
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    fmax.d fa5, fa3, fa5
-; RV32-NEXT:    fmin.d fa5, fa5, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa5, rtz
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vse8.v v9, (a1)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fp2si_v8f64_v8i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa3, v10
-; RV64-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
-; RV64-NEXT:    lui a0, %hi(.LCPI12_1)
-; RV64-NEXT:    fld fa4, %lo(.LCPI12_1)(a0)
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vfmv.f.s fa3, v8
-; RV64-NEXT:    feq.d a2, fa3, fa3
-; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a3, fa3, rtz
-; RV64-NEXT:    and a2, a2, a3
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a2
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vfmv.f.s fa3, v12
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vfmv.f.s fa3, v8
-; RV64-NEXT:    feq.d a2, fa3, fa3
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a3, fa3, rtz
-; RV64-NEXT:    fld fa3, 40(sp)
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-NEXT:    neg a0, a2
-; RV64-NEXT:    and a0, a0, a3
-; RV64-NEXT:    feq.d a2, fa3, fa3
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a3, fa3, rtz
-; RV64-NEXT:    fld fa3, 32(sp)
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    neg a0, a2
-; RV64-NEXT:    and a0, a0, a3
-; RV64-NEXT:    feq.d a2, fa3, fa3
-; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a3, fa3, rtz
-; RV64-NEXT:    fld fa3, 48(sp)
-; RV64-NEXT:    and a2, a2, a3
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
-; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    fld fa3, 56(sp)
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    fmax.d fa5, fa3, fa5
-; RV64-NEXT:    fmin.d fa5, fa5, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa5, rtz
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vmv.v.i v0, 15
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    vslidedown.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vse8.v v9, (a1)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: fp2si_v8f64_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclip.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
   %a = load <8 x double>, ptr %x
   %d = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> %a)
   store <8 x i8> %d, ptr %y
@@ -510,151 +255,20 @@ declare <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double>)
 
 define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ;
-; RV32-LABEL: fp2ui_v8f64_v8i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    lui a0, %hi(.LCPI13_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    fcvt.d.w fa3, zero
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT:    vfmv.f.s fa4, v8
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a2, fa4, rtz
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a3, fa4, rtz
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vfmv.f.s fa4, v8
-; RV32-NEXT:    fld fa2, 40(sp)
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a4, fa4, rtz
-; RV32-NEXT:    fmax.d fa4, fa2, fa3
-; RV32-NEXT:    fld fa2, 32(sp)
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a5, fa4, rtz
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    fmax.d fa4, fa2, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a2, fa4, rtz
-; RV32-NEXT:    fld fa4, 48(sp)
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a4
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT:    fld fa4, 56(sp)
-; RV32-NEXT:    vmv.v.x v9, a2
-; RV32-NEXT:    vslide1down.vx v9, v9, a5
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fmin.d fa5, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a0, fa5, rtz
-; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vse8.v v9, (a1)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fp2ui_v8f64_v8i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    lui a0, %hi(.LCPI13_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
-; RV64-NEXT:    vfmv.f.s fa4, v10
-; RV64-NEXT:    fmv.d.x fa3, zero
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT:    vfmv.f.s fa4, v8
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a2, fa4, rtz
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vfmv.f.s fa4, v10
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a3, fa4, rtz
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vfmv.f.s fa4, v8
-; RV64-NEXT:    fld fa2, 40(sp)
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a4, fa4, rtz
-; RV64-NEXT:    fmax.d fa4, fa2, fa3
-; RV64-NEXT:    fld fa2, 32(sp)
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a5, fa4, rtz
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    fmax.d fa4, fa2, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a2, fa4, rtz
-; RV64-NEXT:    fld fa4, 48(sp)
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
-; RV64-NEXT:    vslide1down.vx v8, v8, a4
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT:    fld fa4, 56(sp)
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    vslide1down.vx v9, v9, a5
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fmin.d fa5, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a0, fa5, rtz
-; RV64-NEXT:    vmv.v.i v0, 15
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    vslidedown.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vse8.v v9, (a1)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: fp2ui_v8f64_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
   %a = load <8 x double>, ptr %x
   %d = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> %a)
   store <8 x i8> %d, ptr %y
@@ -697,3 +311,6 @@ define void @fp2ui_v2f64_v2i32(ptr %x, ptr %y) {
   ret void
 }
 declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index be602e36c4e85..f65431bf470aa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -915,6 +915,152 @@ define <16 x i8> @buildvec_not_vid_v16i8() {
   ret <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 3, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0>
 }
 
+define <512 x i8> @buildvec_vid_v512i8_indices_overflow() vscale_range(16, 1024) {
+; CHECK-LABEL: buildvec_vid_v512i8_indices_overflow:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    ret
+  ret <512 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 128, i8 129, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137, i8 138, i8 139, i8 140, i8 141, i8 142, i8 143, i8 144, i8 145, i8 146, i8 147, i8 148, i8 149, i8 150, i8 151, i8 152, i8 153, i8 154, i8 155, i8 156, i8 157, i8 158, i8 159, i8 160, i8 161, i8 162, i8 163, i8 164, i8 165, i8 166, i8 167, i8 168, i8 169, i8 170, i8 171, i8 172, i8 173, i8 174, i8 175, i8 176, i8 177, i8 178, i8 179, i8 180, i8 181, i8 182, i8 183, i8 184, i8 185, i8 186, i8 187, i8 188, i8 189, i8 190, i8 191, i8 192, i8 193, i8 194, i8 195, i8 196, i8 197, i8 198, i8 199, i8 200, i8 201, i8 202, i8 203, i8 204, i8 205, i8 206, i8 207, i8 208, i8 209, i8 210, i8 211, i8 212, i8 213, i8 214, i8 215, i8 216, i8 217, i8 218, i8 219, i8 220, i8 221, i8 222, i8 223, i8 224, i8 225, i8 226, i8 227, i8 228, i8 229, i8 230, i8 231, i8 232, i8 233, i8 234, i8 235, i8 236, i8 237, i8 238, i8 239, i8 240, i8 241, i8 242, i8 243, i8 244, i8 245, i8 246, i8 247, i8 248, i8 249, i8 250, i8 251, i8 252, i8 253, i8 254, i8 255, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 128, i8 129, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137, i8 138, i8 139, i8 140, i8 141, i8 142, i8 143, i8 144, i8 145, i8 146, i8 147, i8 148, i8 149, i8 150, i8 151, i8 152, i8 153, i8 154, i8 155, i8 156, i8 157, i8 158, i8 159, i8 160, i8 161, i8 162, i8 163, i8 164, i8 165, i8 166, i8 167, i8 168, i8 169, i8 170, i8 171, i8 172, i8 173, i8 174, i8 175, i8 176, i8 177, i8 178, i8 179, i8 180, i8 181, i8 182, i8 183, i8 184, i8 185, i8 186, i8 187, i8 188, i8 189, i8 190, i8 191, i8 192, i8 193, i8 194, i8 195, i8 196, i8 197, i8 198, i8 199, i8 200, i8 201, i8 202, i8 203, i8 204, i8 205, i8 206, i8 207, i8 208, i8 209, i8 210, i8 211, i8 212, i8 213, i8 214, i8 215, i8 216, i8 217, i8 218, i8 219, i8 220, i8 221, i8 222, i8 223, i8 224, i8 225, i8 226, i8 227, i8 228, i8 229, i8 230, i8 231, i8 232, i8 233, i8 234, i8 235, i8 236, i8 237, i8 238, i8 239, i8 240, i8 241, i8 242, i8 243, i8 244, i8 245, i8 246, i8 247, i8 248, i8 249, i8 250, i8 251, i8 252, i8 253, i8 254, i8 255>
+}
+
+define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16, 1024) {
+; RV32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a0, 512
+; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsrl.vi v8, v8, 3
+; RV32-NEXT:    vadd.vi v0, v8, -1
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v8, 1
+; RV32-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV32-NEXT:    ret
+;
+; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 2
+; RV64V-NEXT:    vadd.vi v0, v8, -1
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmv.v.i v8, 1
+; RV64V-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV64V-NEXT:    ret
+;
+; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV64ZVE32:       # %bb.0:
+; RV64ZVE32-NEXT:    li a0, 512
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT:    vid.v v8
+; RV64ZVE32-NEXT:    vsrl.vi v8, v8, 3
+; RV64ZVE32-NEXT:    vadd.vi v0, v8, -1
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v8, 1
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV64ZVE32-NEXT:    ret
+  ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+}
+
+define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, 1024) {
+; RV32-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 15
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    li a0, 512
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, 3
+; RV32-NEXT:    li a1, 240
+; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    li a1, 15
+; RV32-NEXT:    slli a1, a1, 8
+; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV32-NEXT:    ret
+;
+; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT:    vmv.v.i v0, 3
+; RV64V-NEXT:    vmv.v.i v9, 0
+; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmv.v.i v12, 3
+; RV64V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64V-NEXT:    vmv.v.i v8, 12
+; RV64V-NEXT:    vmv1r.v v0, v10
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV64V-NEXT:    vmv1r.v v0, v8
+; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT:    li a1, 48
+; RV64V-NEXT:    vmv.s.x v8, a1
+; RV64V-NEXT:    vmv.v.v v0, v10
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64V-NEXT:    vmv1r.v v0, v8
+; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64V-NEXT:    vmv.v.v v0, v8
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV64V-NEXT:    ret
+;
+; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV64ZVE32:       # %bb.0:
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32-NEXT:    vmv.v.i v9, 0
+; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    li a0, 512
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v12, 3
+; RV64ZVE32-NEXT:    li a1, 240
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v8, a1
+; RV64ZVE32-NEXT:    vmv1r.v v0, v10
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    li a1, 15
+; RV64ZVE32-NEXT:    slli a1, a1, 8
+; RV64ZVE32-NEXT:    vmv.s.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.v v0, v10
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64ZVE32-NEXT:    vmv.v.v v0, v8
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV64ZVE32-NEXT:    ret
+  ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+}
+
 define <8 x i32> @prefix_overwrite(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: prefix_overwrite:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index 2d3865ba4533d..901be442c0012 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -11,7 +11,7 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vfmv.f.s fa0, v8
 ; RV32-NEXT:    call llrintf
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -23,10 +23,10 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
 ;
 ; RV64-LABEL: llrint_v1i64_v1f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmv.s.x v8, a0
 ; RV64-NEXT:    ret
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
@@ -47,7 +47,7 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 2 * vlenb
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vfmv.f.s fa0, v8
 ; RV32-NEXT:    call llrintf
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index de47d8572017b..a90ee3ebb8766 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -9,7 +9,7 @@
 define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ; RV32-LABEL: lrint_v1f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vmv.s.x v8, a0
@@ -17,7 +17,7 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ;
 ; RV64-i32-LABEL: lrint_v1f32:
 ; RV64-i32:       # %bb.0:
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vmv.s.x v8, a0
@@ -25,10 +25,10 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ;
 ; RV64-i64-LABEL: lrint_v1f32:
 ; RV64-i64:       # %bb.0:
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vmv.s.x v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index 7adaaa05f9dd9..793e8eb5aee6a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -13,7 +13,7 @@ declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32)
 define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_fadd_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -39,7 +39,7 @@ define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroex
 define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_ord_fadd_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
@@ -67,7 +67,7 @@ declare half @llvm.vp.reduce.fadd.v4f16(half, <4 x half>, <4 x i1>, i32)
 define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_fadd_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -93,7 +93,7 @@ define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroex
 define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_ord_fadd_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
@@ -121,7 +121,7 @@ declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
 define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_fadd_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -134,7 +134,7 @@ define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zer
 define float @vpreduce_ord_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_ord_fadd_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v9, v8, v9, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
index d528970def626..a6bbbaa71d238 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
 
 define <8 x i8> @v8i8_from_v16xi8_low(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: v8i8_from_v16xi8_low:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index ab5885a604443..d723c2f6df1af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -16,7 +16,8 @@ define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B)
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -58,7 +59,8 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> [[MASKEDOFF:%.*]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -100,7 +102,8 @@ define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapt
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr [[TMP0]], i64 -5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 -5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -142,7 +145,8 @@ define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr [[TMP0]], i64 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -190,9 +194,10 @@ define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B
 ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
 ; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -232,9 +237,10 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
 ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> [[MASKEDOFF:%.*]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
 ; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
 ; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -276,7 +282,8 @@ define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonl
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -321,7 +328,8 @@ define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP1]], i64 [[TMP0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 [[TMP0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
@@ -418,9 +426,10 @@ define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readon
 ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I]], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0.i64(<8 x i32> [[I4]], ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I4]], ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
 ; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -468,8 +477,10 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[B]], i64 [[VEC_IND_SCALAR1]], i32 1
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 4
 ; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr [[I2]], i64 8
@@ -537,29 +548,37 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP0]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP2]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I3:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0.i64(<8 x i32> [[I3]], ptr [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR3]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP2]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR5]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP3]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I3]], ptr [[TMP2]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP4]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP5]], <8 x i32> undef, i32 8)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP6]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP7]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0.i64(<8 x i32> [[I8]], ptr [[TMP3]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR7]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP4]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR9]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP5]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I8]], ptr [[TMP6]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP8]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP9]], <8 x i32> undef, i32 8)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP10]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP11]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0.i64(<8 x i32> [[I13]], ptr [[TMP5]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR11]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP6]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR13]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0.i64(<8 x i32> undef, ptr [[TMP7]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I13]], ptr [[TMP10]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP12]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP13]], <8 x i32> undef, i32 8)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP14]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> [[TMP15]], <8 x i32> undef, i32 8)
 ; CHECK-NEXT:    [[I18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]]
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0.i64(<8 x i32> [[I18]], ptr [[TMP7]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I18]], ptr [[TMP14]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
@@ -636,8 +655,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; V-NEXT:    [[I3_SCALAR1:%.*]] = phi i64 [ 10, [[BB]] ], [ [[I16_SCALAR2:%.*]], [[BB2]] ]
 ; V-NEXT:    [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG1:%.*]], i64 [[I3_SCALAR]]
 ; V-NEXT:    [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[I3_SCALAR1]]
-; V-NEXT:    [[I9:%.*]] = call <2 x ptr> @llvm.riscv.masked.strided.load.v2p0.p0.i64(<2 x ptr> undef, ptr [[TMP0]], i64 40, <2 x i1> <i1 true, i1 true>)
-; V-NEXT:    [[I10:%.*]] = call <2 x ptr> @llvm.riscv.masked.strided.load.v2p0.p0.i64(<2 x ptr> undef, ptr [[TMP1]], i64 40, <2 x i1> <i1 true, i1 true>)
+; V-NEXT:    [[TMP2:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP0]], i64 40, <2 x i1> <i1 true, i1 true>, i32 2)
+; V-NEXT:    [[I9:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> <i1 true, i1 true>, <2 x ptr> [[TMP2]], <2 x ptr> undef, i32 2)
+; V-NEXT:    [[TMP3:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP1]], i64 40, <2 x i1> <i1 true, i1 true>, i32 2)
+; V-NEXT:    [[I10:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> <i1 true, i1 true>, <2 x ptr> [[TMP3]], <2 x ptr> undef, i32 2)
 ; V-NEXT:    [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]]
 ; V-NEXT:    store <2 x ptr> [[I9]], ptr [[I11]], align 8
 ; V-NEXT:    [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2
@@ -717,8 +738,8 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; V-NEXT:    [[I9:%.*]] = load <2 x ptr>, ptr [[I7]], align 8
 ; V-NEXT:    [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i64 [[I3_SCALAR]]
 ; V-NEXT:    [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG]], i64 [[I3_SCALAR1]]
-; V-NEXT:    call void @llvm.riscv.masked.strided.store.v2p0.p0.i64(<2 x ptr> [[I6]], ptr [[TMP0]], i64 40, <2 x i1> <i1 true, i1 true>)
-; V-NEXT:    call void @llvm.riscv.masked.strided.store.v2p0.p0.i64(<2 x ptr> [[I9]], ptr [[TMP1]], i64 40, <2 x i1> <i1 true, i1 true>)
+; V-NEXT:    call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I6]], ptr [[TMP0]], i64 40, <2 x i1> <i1 true, i1 true>, i32 2)
+; V-NEXT:    call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I9]], ptr [[TMP1]], i64 40, <2 x i1> <i1 true, i1 true>, i32 2)
 ; V-NEXT:    [[I15]] = add nuw i64 [[I]], 4
 ; V-NEXT:    [[I16_SCALAR]] = add i64 [[I3_SCALAR]], 20
 ; V-NEXT:    [[I16_SCALAR2]] = add i64 [[I3_SCALAR1]], 20
@@ -801,7 +822,8 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
 ; CHECK-NEXT:    [[I17_SCALAR:%.*]] = phi i64 [ [[START]], [[BB9]] ], [ [[I28_SCALAR:%.*]], [[BB15]] ]
 ; CHECK-NEXT:    [[I18:%.*]] = add i64 [[I16]], [[I4]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I17_SCALAR]]
-; CHECK-NEXT:    [[I21:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 32)
+; CHECK-NEXT:    [[I21:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
 ; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I18]]
 ; CHECK-NEXT:    [[I24:%.*]] = load <32 x i8>, ptr [[I22]], align 1
 ; CHECK-NEXT:    [[I25:%.*]] = add <32 x i8> [[I24]], [[I21]]
@@ -909,7 +931,8 @@ define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr
 ; CHECK-NEXT:    [[I5:%.*]] = phi i64 [ [[I13:%.*]], [[BB4]] ], [ 0, [[BB2]] ]
 ; CHECK-NEXT:    [[I6_SCALAR:%.*]] = phi i64 [ 0, [[BB2]] ], [ [[I14_SCALAR:%.*]], [[BB4]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I6_SCALAR]]
-; CHECK-NEXT:    [[I9:%.*]] = call <16 x i8> @llvm.riscv.masked.strided.load.v16i8.p0.i64(<16 x i8> undef, ptr [[TMP0]], i64 5, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr [[TMP0]], i64 5, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 16)
+; CHECK-NEXT:    [[I9:%.*]] = call <16 x i8> @llvm.vp.select.v16i8(<16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> [[TMP1]], <16 x i8> undef, i32 16)
 ; CHECK-NEXT:    [[I10:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I5]]
 ; CHECK-NEXT:    [[I11:%.*]] = load <16 x i8>, ptr [[I10]], align 1
 ; CHECK-NEXT:    [[I12:%.*]] = add <16 x i8> [[I11]], [[I9]]
@@ -951,8 +974,9 @@ bb16:                                             ; preds = %bb4, %bb
 define <8 x i8> @broadcast_ptr_base(ptr %a) {
 ; CHECK-LABEL: @broadcast_ptr_base(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.riscv.masked.strided.load.v8i8.p0.i64(<8 x i8> poison, ptr [[A:%.*]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr [[A:%.*]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.vp.select.v8i8(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[TMP0]], <8 x i8> poison, i32 8)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
 entry:
   %0 = insertelement <8 x ptr> poison, ptr %a, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index b8c7037580c46..849f98c26f459 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -638,14 +638,14 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
 define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
 ; CHECK-OPT:       # %bb.0:
-; CHECK-OPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
 ; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
 ; CHECK-OPT-NEXT:    ret
 ;
 ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
 ; CHECK-NO-OPT:       # %bb.0:
 ; CHECK-NO-OPT-NEXT:    lbu a0, 0(a0)
-; CHECK-NO-OPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
 ; CHECK-NO-OPT-NEXT:    vmv.v.x v8, a0
 ; CHECK-NO-OPT-NEXT:    ret
   %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3)
@@ -657,14 +657,14 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
 ; CHECK-OPT:       # %bb.0:
-; CHECK-OPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
 ; CHECK-OPT-NEXT:    ret
 ;
 ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
 ; CHECK-NO-OPT:       # %bb.0:
 ; CHECK-NO-OPT-NEXT:    flh fa5, 0(a0)
-; CHECK-NO-OPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; CHECK-NO-OPT-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NO-OPT-NEXT:    ret
   %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
index fdb6bfe1baa77..4334f293d1e85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
@@ -34,14 +34,14 @@ declare <1 x i7> @llvm.experimental.constrained.fptosi.v1i7.v1f16(<1 x half>, me
 define <1 x i7> @vfptosi_v1f16_v1i7(<1 x half> %va) strictfp {
 ; RV32-LABEL: vfptosi_v1f16_v1i7:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    fcvt.w.h a0, fa5, rtz
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vfptosi_v1f16_v1i7:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    fcvt.l.h a0, fa5, rtz
 ; RV64-NEXT:    ret
@@ -53,14 +53,14 @@ declare <1 x i7> @llvm.experimental.constrained.fptoui.v1i7.v1f16(<1 x half>, me
 define <1 x i7> @vfptoui_v1f16_v1i7(<1 x half> %va) strictfp {
 ; RV32-LABEL: vfptoui_v1f16_v1i7:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    fcvt.wu.h a0, fa5, rtz
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vfptoui_v1f16_v1i7:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    fcvt.lu.h a0, fa5, rtz
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 26f77225dbb0e..cb50ca4a72120 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
 ; Check that the default value enables the web folding and
 ; that it is bigger than 3.
@@ -23,6 +23,17 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
 ; NO_FOLDING-NEXT:    vse32.v v8, (a2)
 ; NO_FOLDING-NEXT:    ret
 ;
+; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT:    vfwmul.vv v11, v8, v9
+; ZVFH-NEXT:    vfwadd.vv v12, v8, v10
+; ZVFH-NEXT:    vfwsub.vv v8, v9, v10
+; ZVFH-NEXT:    vse32.v v11, (a0)
+; ZVFH-NEXT:    vse32.v v12, (a1)
+; ZVFH-NEXT:    vse32.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
 ; ZVFHMIN-LABEL: vfwmul_v2f116_multiple_users:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
@@ -86,3 +97,112 @@ define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a,
   store <2 x double> %g, ptr %z
   ret void
 }
+
+define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2, <2 x double> %w) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v12, v8
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT:    vfmul.vv v10, v12, v8
+; NO_FOLDING-NEXT:    vfmadd.vv v12, v9, v11
+; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT:    vse64.v v10, (a0)
+; NO_FOLDING-NEXT:    vse64.v v12, (a1)
+; NO_FOLDING-NEXT:    vse64.v v8, (a2)
+; NO_FOLDING-NEXT:    ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; FOLDING:       # %bb.0:
+; FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT:    vfwmul.vv v12, v8, v9
+; FOLDING-NEXT:    vfwmacc.vv v11, v8, v10
+; FOLDING-NEXT:    vfwsub.vv v8, v9, v10
+; FOLDING-NEXT:    vse64.v v12, (a0)
+; FOLDING-NEXT:    vse64.v v11, (a1)
+; FOLDING-NEXT:    vse64.v v8, (a2)
+; FOLDING-NEXT:    ret
+  %c = fpext <2 x float> %a to <2 x double>
+  %d = fpext <2 x float> %b to <2 x double>
+  %d2 = fpext <2 x float> %b2 to <2 x double>
+  %e = fmul <2 x double> %c, %d
+  %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %w)
+  %g = fsub <2 x double> %d, %d2
+  store <2 x double> %e, ptr %x
+  store <2 x double> %f, ptr %y
+  store <2 x double> %g, ptr %z
+  ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_multiple_users_addend_user(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT:    vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT:    vfmadd.vv v11, v9, v8
+; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT:    vse64.v v10, (a0)
+; NO_FOLDING-NEXT:    vse64.v v11, (a1)
+; NO_FOLDING-NEXT:    vse64.v v8, (a2)
+; NO_FOLDING-NEXT:    ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; FOLDING:       # %bb.0:
+; FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
+; FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
+; FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT:    vfmul.vv v10, v11, v8
+; FOLDING-NEXT:    vfmadd.vv v11, v9, v8
+; FOLDING-NEXT:    vfsub.vv v8, v8, v9
+; FOLDING-NEXT:    vse64.v v10, (a0)
+; FOLDING-NEXT:    vse64.v v11, (a1)
+; FOLDING-NEXT:    vse64.v v8, (a2)
+; FOLDING-NEXT:    ret
+  %c = fpext <2 x float> %a to <2 x double>
+  %d = fpext <2 x float> %b to <2 x double>
+  %d2 = fpext <2 x float> %b2 to <2 x double>
+  %e = fmul <2 x double> %c, %d
+  %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %d)
+  %g = fsub <2 x double> %d, %d2
+  store <2 x double> %e, ptr %x
+  store <2 x double> %f, ptr %y
+  store <2 x double> %g, ptr %z
+  ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_addend_user(ptr %x, <2 x float> %a, <2 x float> %b) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v10, v8
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT:    vfmadd.vv v8, v10, v8
+; NO_FOLDING-NEXT:    vse64.v v8, (a0)
+; NO_FOLDING-NEXT:    ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; FOLDING:       # %bb.0:
+; FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT:    vfwcvt.f.f.v v10, v8
+; FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT:    vfmadd.vv v8, v10, v8
+; FOLDING-NEXT:    vse64.v v8, (a0)
+; FOLDING-NEXT:    ret
+  %c = fpext <2 x float> %a to <2 x double>
+  %d = fpext <2 x float> %b to <2 x double>
+  %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %d)
+  store <2 x double> %f, ptr %x
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index bcf5d7b3365c3..5140d89b78307 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -2027,3 +2027,63 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
   %vg = call <8 x double> @llvm.fma.v8f64(<8 x double> %vd, <8 x double> %vf, <8 x double> %va)
   ret <8 x double> %vg
 }
+
+define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwmacc_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = call <2 x float> @llvm.fma.v2f32(<2 x float> %vd, <2 x float> %splat, <2 x float> %va)
+  ret <2 x float> %vf
+}
+
+define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwmsac_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %ve = fneg <2 x float> %va
+  %vf = call <2 x float> @llvm.fma.v2f32(<2 x float> %vd, <2 x float> %splat, <2 x float> %ve)
+  ret <2 x float> %vf
+}
+
+define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwnmacc_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = fneg <2 x float> %va
+  %vg = fneg <2 x float> %vd
+  %vh = call <2 x float> @llvm.fma.v2f32(<2 x float> %vg, <2 x float> %splat, <2 x float> %vf)
+  ret <2 x float> %vh
+}
+
+define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwnmsac_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = fneg <2 x float> %vd
+  %vg = call <2 x float> @llvm.fma.v2f32(<2 x float> %vf, <2 x float> %splat, <2 x float> %va)
+  ret <2 x float> %vg
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
index 2181fd8498f5a..4805d6782a3b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
index 695fba6d54e03..805e2e2e6bd35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp-bf16.ll
index a0301bbf4f746..31ab6699d7c51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp-bf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp-bf16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+m,+zfbfmin,+zvfbfmin -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+m,+zvfbfmin -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+m,+zfbfmin,+zvfbfmin -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+m,+zvfbfmin -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
 
 declare <2 x bfloat> @llvm.vp.select.v2bf16(<2 x i1>, <2 x bfloat>, <2 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index 9d63b8f31a3e8..97c7f101c2582 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -882,3 +882,40 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) {
   %g = mul <2 x i64> %e, %f
   ret <2 x i64> %g
 }
+
+define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) {
+; CHECK-LABEL: vwmul_v2i16_multiuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a1)
+; CHECK-NEXT:    vle8.v v10, (a2)
+; CHECK-NEXT:    vle8.v v11, (a3)
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v9
+; CHECK-NEXT:    vsext.vf2 v9, v10
+; CHECK-NEXT:    vsext.vf2 v10, v11
+; CHECK-NEXT:    vmul.vv v11, v12, v10
+; CHECK-NEXT:    vmul.vv v10, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v9, v11, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
+; CHECK-NEXT:    ret
+  %a = load <2 x i8>, ptr %x
+  %b = load <2 x i8>, ptr %y
+  %c = load <2 x i8>, ptr %z
+  %d = load <2 x i8>, ptr %w
+
+  %as = sext <2 x i8> %a to <2 x i16>
+  %bs = sext <2 x i8> %b to <2 x i16>
+  %cs = sext <2 x i8> %c to <2 x i16>
+  %ds = sext <2 x i8> %d to <2 x i16>
+
+  %e = mul <2 x i16> %as, %ds
+  %f = mul <2 x i16> %bs, %ds ; shares 1 use with %e
+  %g = udiv <2 x i16> %bs, %cs ; shares 1 use with %f, and no uses with %e
+
+  %h = or <2 x i16> %e, %f
+  %i = or <2 x i16> %h, %g
+  ret <2 x i16> %i
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 3e2db3fa4685d..1395dc914bb40 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2670,7 +2670,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    li a2, -1
@@ -2803,7 +2803,7 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    snez a1, a1
@@ -2915,7 +2915,7 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    mv a2, s1
@@ -5985,7 +5985,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    li a2, -1
@@ -6111,7 +6111,7 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    mv s0, a0
@@ -6219,7 +6219,7 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    mv a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
index c45af61ced94f..393cd5c7f52e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
@@ -154,19 +154,12 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI10_1)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vfmax.vf v12, v8, fa5
-; CHECK-NEXT:    vfmin.vf v12, v12, fa4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v16, v12
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    vnclip.wi v8, v12, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
@@ -176,19 +169,12 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI11_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI11_1)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vf v16, v8, fa5
-; CHECK-NEXT:    vfmin.vf v16, v16, fa4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v24, v16
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0
+; CHECK-NEXT:    vnclip.wi v8, v16, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f64.nxv8i16(<vscale x 8 x double> %f)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
index 52322f64416ce..a7efa4b3de940 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
@@ -152,65 +152,31 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 }
 
 define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
-; CHECK32-LABEL: test_signed_v4f64_v4i16:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    lui a0, %hi(.LCPI10_0)
-; CHECK32-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
-; CHECK32-NEXT:    fcvt.d.w fa4, zero
-; CHECK32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK32-NEXT:    vfmax.vf v8, v8, fa4
-; CHECK32-NEXT:    vfmin.vf v8, v8, fa5
-; CHECK32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK32-NEXT:    vfncvt.rtz.xu.f.w v12, v8
-; CHECK32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK32-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK32-NEXT:    ret
-;
-; CHECK64-LABEL: test_signed_v4f64_v4i16:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    lui a0, %hi(.LCPI10_0)
-; CHECK64-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
-; CHECK64-NEXT:    fmv.d.x fa4, zero
-; CHECK64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK64-NEXT:    vfmax.vf v8, v8, fa4
-; CHECK64-NEXT:    vfmin.vf v8, v8, fa5
-; CHECK64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK64-NEXT:    vfncvt.rtz.xu.f.w v12, v8
-; CHECK64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK64-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK64-NEXT:    ret
+; CHECK-LABEL: test_signed_v4f64_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
     ret <vscale x 4 x i16> %x
 }
 
 define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
-; CHECK32-LABEL: test_signed_v8f64_v8i16:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK32-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; CHECK32-NEXT:    fcvt.d.w fa4, zero
-; CHECK32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK32-NEXT:    vfmax.vf v8, v8, fa4
-; CHECK32-NEXT:    vfmin.vf v8, v8, fa5
-; CHECK32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK32-NEXT:    vfncvt.rtz.xu.f.w v16, v8
-; CHECK32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK32-NEXT:    vnsrl.wi v8, v16, 0
-; CHECK32-NEXT:    ret
-;
-; CHECK64-LABEL: test_signed_v8f64_v8i16:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK64-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; CHECK64-NEXT:    fmv.d.x fa4, zero
-; CHECK64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK64-NEXT:    vfmax.vf v8, v8, fa4
-; CHECK64-NEXT:    vfmin.vf v8, v8, fa5
-; CHECK64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK64-NEXT:    vfncvt.rtz.xu.f.w v16, v8
-; CHECK64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK64-NEXT:    vnsrl.wi v8, v16, 0
-; CHECK64-NEXT:    ret
+; CHECK-LABEL: test_signed_v8f64_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v16, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8f64.nxv8i16(<vscale x 8 x double> %f)
     ret <vscale x 8 x i16> %x
 }
@@ -342,3 +308,6 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
     ret <vscale x 4 x i64> %x
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK32: {{.*}}
+; CHECK64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
index 060e99691cb13..8cfa88e6f9569 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
@@ -7,7 +7,7 @@
 define <vscale x 1 x half> @insertelt_nxv1f16_0(<vscale x 1 x half> %v, half %elt) {
 ; CHECK-LABEL: insertelt_nxv1f16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, tu, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %r = insertelement <vscale x 1 x half> %v, half %elt, i32 0
@@ -29,7 +29,7 @@ define <vscale x 1 x half> @insertelt_nxv1f16_idx(<vscale x 1 x half> %v, half %
 ; CHECK-LABEL: insertelt_nxv1f16_idx:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, 1
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
@@ -41,7 +41,7 @@ define <vscale x 1 x half> @insertelt_nxv1f16_idx(<vscale x 1 x half> %v, half %
 define <vscale x 2 x half> @insertelt_nxv2f16_0(<vscale x 2 x half> %v, half %elt) {
 ; CHECK-LABEL: insertelt_nxv2f16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, tu, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %r = insertelement <vscale x 2 x half> %v, half %elt, i32 0
@@ -63,7 +63,7 @@ define <vscale x 2 x half> @insertelt_nxv2f16_idx(<vscale x 2 x half> %v, half %
 ; CHECK-LABEL: insertelt_nxv2f16_idx:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, 1
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
@@ -211,7 +211,7 @@ define <vscale x 32 x half> @insertelt_nxv32f16_idx(<vscale x 32 x half> %v, hal
 define <vscale x 1 x float> @insertelt_nxv1f32_0(<vscale x 1 x float> %v, float %elt) {
 ; CHECK-LABEL: insertelt_nxv1f32_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, tu, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
   %r = insertelement <vscale x 1 x float> %v, float %elt, i32 0
@@ -233,7 +233,7 @@ define <vscale x 1 x float> @insertelt_nxv1f32_idx(<vscale x 1 x float> %v, floa
 ; CHECK-LABEL: insertelt_nxv1f32_idx:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, 1
-; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
index 410ef8a02383f..c3cc90c6a8de3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 1 x i8> @insertelt_nxv1i8_0(<vscale x 1 x i8> %v, i8 signext %elt) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
index 5bba1c549972b..1523126af2124 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 1 x i8> @insertelt_nxv1i8_0(<vscale x 1 x i8> %v, i8 signext %elt) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 4b4cffc461d46..77438ee53b634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s
 
 ; Check that we are able to legalize scalable-vector stores that require widening.
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
deleted file mode 100644
index ef5ad1f651fa9..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
+++ /dev/null
@@ -1,728 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-define <vscale x 1 x i8> @sext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i8>
-  ret <vscale x 1 x i8> %r
-}
-
-define <vscale x 1 x i8> @zext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i8>
-  ret <vscale x 1 x i8> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i8_nxv1i1(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: trunc_nxv1i8_nxv1i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 1 x i8> %v to <vscale x 1 x i1>
-  ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i8> @sext_nxv2i1_nxv2i8(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i8>
-  ret <vscale x 2 x i8> %r
-}
-
-define <vscale x 2 x i8> @zext_nxv2i1_nxv2i8(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i8>
-  ret <vscale x 2 x i8> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i8_nxv2i1(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: trunc_nxv2i8_nxv2i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 2 x i8> %v to <vscale x 2 x i1>
-  ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i8> @sext_nxv4i1_nxv4i8(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i8>
-  ret <vscale x 4 x i8> %r
-}
-
-define <vscale x 4 x i8> @zext_nxv4i1_nxv4i8(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i8>
-  ret <vscale x 4 x i8> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i8_nxv4i1(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: trunc_nxv4i8_nxv4i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 4 x i8> %v to <vscale x 4 x i1>
-  ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i8> @sext_nxv8i1_nxv8i8(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i8>
-  ret <vscale x 8 x i8> %r
-}
-
-define <vscale x 8 x i8> @zext_nxv8i1_nxv8i8(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i8>
-  ret <vscale x 8 x i8> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i8_nxv8i1(<vscale x 8 x i8> %v) {
-; CHECK-LABEL: trunc_nxv8i8_nxv8i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 8 x i8> %v to <vscale x 8 x i1>
-  ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i8> @sext_nxv16i1_nxv16i8(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i8>
-  ret <vscale x 16 x i8> %r
-}
-
-define <vscale x 16 x i8> @zext_nxv16i1_nxv16i8(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i8>
-  ret <vscale x 16 x i8> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i8_nxv16i1(<vscale x 16 x i8> %v) {
-; CHECK-LABEL: trunc_nxv16i8_nxv16i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 16 x i8> %v to <vscale x 16 x i1>
-  ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 32 x i8> @sext_nxv32i1_nxv32i8(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: sext_nxv32i1_nxv32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 32 x i1> %v to <vscale x 32 x i8>
-  ret <vscale x 32 x i8> %r
-}
-
-define <vscale x 32 x i8> @zext_nxv32i1_nxv32i8(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: zext_nxv32i1_nxv32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 32 x i1> %v to <vscale x 32 x i8>
-  ret <vscale x 32 x i8> %r
-}
-
-define <vscale x 32 x i1> @trunc_nxv32i8_nxv32i1(<vscale x 32 x i8> %v) {
-; CHECK-LABEL: trunc_nxv32i8_nxv32i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 32 x i8> %v to <vscale x 32 x i1>
-  ret <vscale x 32 x i1> %r
-}
-
-define <vscale x 64 x i8> @sext_nxv64i1_nxv64i8(<vscale x 64 x i1> %v) {
-; CHECK-LABEL: sext_nxv64i1_nxv64i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 64 x i1> %v to <vscale x 64 x i8>
-  ret <vscale x 64 x i8> %r
-}
-
-define <vscale x 64 x i8> @zext_nxv64i1_nxv64i8(<vscale x 64 x i1> %v) {
-; CHECK-LABEL: zext_nxv64i1_nxv64i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 64 x i1> %v to <vscale x 64 x i8>
-  ret <vscale x 64 x i8> %r
-}
-
-define <vscale x 64 x i1> @trunc_nxv64i8_nxv64i1(<vscale x 64 x i8> %v) {
-; CHECK-LABEL: trunc_nxv64i8_nxv64i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 64 x i8> %v to <vscale x 64 x i1>
-  ret <vscale x 64 x i1> %r
-}
-
-define <vscale x 1 x i16> @sext_nxv1i1_nxv1i16(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i16>
-  ret <vscale x 1 x i16> %r
-}
-
-define <vscale x 1 x i16> @zext_nxv1i1_nxv1i16(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i16>
-  ret <vscale x 1 x i16> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i16_nxv1i1(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: trunc_nxv1i16_nxv1i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 1 x i16> %v to <vscale x 1 x i1>
-  ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i16> @sext_nxv2i1_nxv2i16(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i16>
-  ret <vscale x 2 x i16> %r
-}
-
-define <vscale x 2 x i16> @zext_nxv2i1_nxv2i16(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i16>
-  ret <vscale x 2 x i16> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i16_nxv2i1(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: trunc_nxv2i16_nxv2i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 2 x i16> %v to <vscale x 2 x i1>
-  ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i16> @sext_nxv4i1_nxv4i16(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i16>
-  ret <vscale x 4 x i16> %r
-}
-
-define <vscale x 4 x i16> @zext_nxv4i1_nxv4i16(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i16>
-  ret <vscale x 4 x i16> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i16_nxv4i1(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: trunc_nxv4i16_nxv4i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 4 x i16> %v to <vscale x 4 x i1>
-  ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i16> @sext_nxv8i1_nxv8i16(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i16>
-  ret <vscale x 8 x i16> %r
-}
-
-define <vscale x 8 x i16> @zext_nxv8i1_nxv8i16(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i16>
-  ret <vscale x 8 x i16> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i16_nxv8i1(<vscale x 8 x i16> %v) {
-; CHECK-LABEL: trunc_nxv8i16_nxv8i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 8 x i16> %v to <vscale x 8 x i1>
-  ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i16> @sext_nxv16i1_nxv16i16(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i16>
-  ret <vscale x 16 x i16> %r
-}
-
-define <vscale x 16 x i16> @zext_nxv16i1_nxv16i16(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i16>
-  ret <vscale x 16 x i16> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i16_nxv16i1(<vscale x 16 x i16> %v) {
-; CHECK-LABEL: trunc_nxv16i16_nxv16i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 16 x i16> %v to <vscale x 16 x i1>
-  ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 32 x i16> @sext_nxv32i1_nxv32i16(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: sext_nxv32i1_nxv32i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 32 x i1> %v to <vscale x 32 x i16>
-  ret <vscale x 32 x i16> %r
-}
-
-define <vscale x 32 x i16> @zext_nxv32i1_nxv32i16(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: zext_nxv32i1_nxv32i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 32 x i1> %v to <vscale x 32 x i16>
-  ret <vscale x 32 x i16> %r
-}
-
-define <vscale x 32 x i1> @trunc_nxv32i16_nxv32i1(<vscale x 32 x i16> %v) {
-; CHECK-LABEL: trunc_nxv32i16_nxv32i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 32 x i16> %v to <vscale x 32 x i1>
-  ret <vscale x 32 x i1> %r
-}
-
-define <vscale x 1 x i32> @sext_nxv1i1_nxv1i32(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i32>
-  ret <vscale x 1 x i32> %r
-}
-
-define <vscale x 1 x i32> @zext_nxv1i1_nxv1i32(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i32>
-  ret <vscale x 1 x i32> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i32_nxv1i1(<vscale x 1 x i32> %v) {
-; CHECK-LABEL: trunc_nxv1i32_nxv1i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 1 x i32> %v to <vscale x 1 x i1>
-  ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i32> @sext_nxv2i1_nxv2i32(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i32>
-  ret <vscale x 2 x i32> %r
-}
-
-define <vscale x 2 x i32> @zext_nxv2i1_nxv2i32(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i32>
-  ret <vscale x 2 x i32> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i32_nxv2i1(<vscale x 2 x i32> %v) {
-; CHECK-LABEL: trunc_nxv2i32_nxv2i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 2 x i32> %v to <vscale x 2 x i1>
-  ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i32> @sext_nxv4i1_nxv4i32(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i32>
-  ret <vscale x 4 x i32> %r
-}
-
-define <vscale x 4 x i32> @zext_nxv4i1_nxv4i32(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i32>
-  ret <vscale x 4 x i32> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i32_nxv4i1(<vscale x 4 x i32> %v) {
-; CHECK-LABEL: trunc_nxv4i32_nxv4i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 4 x i32> %v to <vscale x 4 x i1>
-  ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i32> @sext_nxv8i1_nxv8i32(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i32>
-  ret <vscale x 8 x i32> %r
-}
-
-define <vscale x 8 x i32> @zext_nxv8i1_nxv8i32(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i32>
-  ret <vscale x 8 x i32> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i32_nxv8i1(<vscale x 8 x i32> %v) {
-; CHECK-LABEL: trunc_nxv8i32_nxv8i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 8 x i32> %v to <vscale x 8 x i1>
-  ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i32> @sext_nxv16i1_nxv16i32(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i32>
-  ret <vscale x 16 x i32> %r
-}
-
-define <vscale x 16 x i32> @zext_nxv16i1_nxv16i32(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i32>
-  ret <vscale x 16 x i32> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i32_nxv16i1(<vscale x 16 x i32> %v) {
-; CHECK-LABEL: trunc_nxv16i32_nxv16i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 16 x i32> %v to <vscale x 16 x i1>
-  ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 1 x i64> @sext_nxv1i1_nxv1i64(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i64>
-  ret <vscale x 1 x i64> %r
-}
-
-define <vscale x 1 x i64> @zext_nxv1i1_nxv1i64(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i64>
-  ret <vscale x 1 x i64> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i64_nxv1i1(<vscale x 1 x i64> %v) {
-; CHECK-LABEL: trunc_nxv1i64_nxv1i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 1 x i64> %v to <vscale x 1 x i1>
-  ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i64> @sext_nxv2i1_nxv2i64(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i64>
-  ret <vscale x 2 x i64> %r
-}
-
-define <vscale x 2 x i64> @zext_nxv2i1_nxv2i64(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i64>
-  ret <vscale x 2 x i64> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i64_nxv2i1(<vscale x 2 x i64> %v) {
-; CHECK-LABEL: trunc_nxv2i64_nxv2i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 2 x i64> %v to <vscale x 2 x i1>
-  ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i64> @sext_nxv4i1_nxv4i64(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i64>
-  ret <vscale x 4 x i64> %r
-}
-
-define <vscale x 4 x i64> @zext_nxv4i1_nxv4i64(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i64>
-  ret <vscale x 4 x i64> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i64_nxv4i1(<vscale x 4 x i64> %v) {
-; CHECK-LABEL: trunc_nxv4i64_nxv4i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 4 x i64> %v to <vscale x 4 x i1>
-  ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i64> @sext_nxv8i1_nxv8i64(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT:    ret
-  %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i64>
-  ret <vscale x 8 x i64> %r
-}
-
-define <vscale x 8 x i64> @zext_nxv8i1_nxv8i64(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    ret
-  %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i64>
-  ret <vscale x 8 x i64> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i64_nxv8i1(<vscale x 8 x i64> %v) {
-; CHECK-LABEL: trunc_nxv8i64_nxv8i1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
-  %r = trunc <vscale x 8 x i64> %v to <vscale x 8 x i1>
-  ret <vscale x 8 x i1> %r
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll
similarity index 99%
rename from llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll
index aaac0c7b5a41a..023a704be79f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s
 
 define <vscale x 1 x i8> @sext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
 ; CHECK-LABEL: sext_nxv1i1_nxv1i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
index 1c3b429202adf..e686ac881fabe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64
 
 %struct = type { i64, i64, ptr, i32, i32, i32, [4 x i32] }
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr83017.ll b/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
index 3719a2ad994d6..beca480378a35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
@@ -35,11 +35,11 @@ define void @aliasing(ptr %p) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vs1r.v v8, (a2)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vs4r.v v12, (a0)
 ; CHECK-NEXT:    addi a2, a0, 64
 ; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    sw a1, 84(a0)
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i64 84
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr90559.ll b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
index 8d330b12055ae..7e109f307c4a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
@@ -32,11 +32,11 @@ define void @f(ptr %p) vscale_range(2,2) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vs1r.v v8, (a2)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vs4r.v v12, (a0)
 ; CHECK-NEXT:    addi a2, a0, 64
 ; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    sw a1, 84(a0)
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i64 84
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr99782.ll b/llvm/test/CodeGen/RISCV/rvv/pr99782.ll
new file mode 100644
index 0000000000000..a4875c63da449
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr99782.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -stop-after=finalize-isel | FileCheck %s
+
+define void @vslidedown() {
+  ; CHECK-LABEL: name: vslidedown
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI %stack.0.v, 0
+  ; CHECK-NEXT:   [[PseudoVLE8_V_M8_:%[0-9]+]]:vrm8 = PseudoVLE8_V_M8 $noreg, killed [[ADDI]], -1, 3 /* e8 */, 3 /* ta, ma */ :: (load (<vscale x 1 x s512>) from %ir.v, align 1)
+  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI %stack.1, 0
+  ; CHECK-NEXT:   PseudoVSE8_V_M8 killed [[PseudoVLE8_V_M8_]], killed [[ADDI1]], -1, 3 /* e8 */ :: (store (<vscale x 1 x s512>) into %stack.1)
+  ; CHECK-NEXT:   INLINEASM &"vadd.vv $0, $0, $0", 25 /* sideeffect mayload maystore attdialect */, 262166 /* mem:m */, %stack.0.v, 0, 262166 /* mem:m */, %stack.1, 0
+  ; CHECK-NEXT:   PseudoRET
+entry:
+  %v = alloca <vscale x 64 x i8>, align 1
+  %0 = load <vscale x 64 x i8>, ptr %v, align 1
+  call void asm sideeffect "vadd.vv $0, $0, $0", "=*imr,imr"(ptr elementtype(<vscale x 64 x i8>) %v, <vscale x 64 x i8> %0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll b/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
index 64bc1ef156bea..b6f9d319fe57d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m \
 ; RUN:     -regalloc=fast -verify-machineinstrs < %s | FileCheck %s
 
 ; This test previously crashed with an error "ran out of registers during register allocation"
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll
index d5ba11c8d19d0..3850d5d6d2c74 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll
@@ -20,7 +20,8 @@ define dso_local void @lots_args(i32 signext %x0, i32 signext %x1, <vscale x 16
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a0, s0, a0
 ; CHECK-NEXT:    addi a0, a0, -64
-; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    sw a2, -36(s0)
 ; CHECK-NEXT:    sw a3, -40(s0)
 ; CHECK-NEXT:    sw a4, -44(s0)
@@ -85,7 +86,8 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    slli s1, s1, 3
 ; CHECK-NEXT:    sub s1, s0, s1
 ; CHECK-NEXT:    addi s1, s1, -112
-; CHECK-NEXT:    vs8r.v v8, (s1)
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (s1)
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    sw a0, -76(s0)
 ; CHECK-NEXT:    sw a0, -80(s0)
@@ -99,7 +101,7 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    sw a0, -112(s0)
 ; CHECK-NEXT:    lw a0, -76(s0)
 ; CHECK-NEXT:    lw a1, -80(s0)
-; CHECK-NEXT:    vl8re32.v v8, (s1)
+; CHECK-NEXT:    vle32.v v8, (s1)
 ; CHECK-NEXT:    lw a2, -84(s0)
 ; CHECK-NEXT:    lw a3, -88(s0)
 ; CHECK-NEXT:    lw a4, -92(s0)
@@ -115,7 +117,8 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    lw a0, -76(s0)
 ; CHECK-NEXT:    lw a1, -80(s0)
-; CHECK-NEXT:    vl8re32.v v8, (s1)
+; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v8, (s1)
 ; CHECK-NEXT:    lw a2, -84(s0)
 ; CHECK-NEXT:    lw a3, -88(s0)
 ; CHECK-NEXT:    lw a4, -92(s0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
index 31fd5bdbd31fd..c01cbf49483b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
@@ -17,7 +17,7 @@ define void @vpmerge_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
   ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8)
-  ; CHECK-NEXT:   VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
+  ; CHECK-NEXT:   PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
   ; CHECK-NEXT:   PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
   %b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
@@ -36,7 +36,7 @@ define void @vpselect_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
   ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8)
-  ; CHECK-NEXT:   VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
+  ; CHECK-NEXT:   PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
   ; CHECK-NEXT:   PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
   %b = call <vscale x 2 x i32> @llvm.vp.select.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index a08bcae074b9b..259515f160048 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -1196,3 +1196,24 @@ define <vscale x 2 x i32> @true_mask_vmerge_implicit_passthru(<vscale x 2 x i32>
   )
   ret <vscale x 2 x i32> %b
 }
+
+
+define <vscale x 2 x i32> @unfoldable_mismatched_sew(<vscale x 2 x i32> %passthru, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, <vscale x 2 x i1> %mask, i64 %avl) {
+; CHECK-LABEL: unfoldable_mismatched_sew:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v9, v9, v10
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, i64 %avl)
+  %a.bitcast = bitcast <vscale x 1 x i64> %a to <vscale x 2 x i32>
+  %b = call <vscale x 2 x i32> @llvm.riscv.vmerge.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %passthru,
+    <vscale x 2 x i32> %passthru,
+    <vscale x 2 x i32> %a.bitcast,
+    <vscale x 2 x i1> splat (i1 true),
+    i64 %avl
+  )
+  ret <vscale x 2 x i32> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll
deleted file mode 100644
index f8a5b0f9d03a6..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v | FileCheck %s
-
-declare <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8>, ptr, i64, <32 x i1>)
-declare <2 x i64> @llvm.riscv.masked.strided.load.v2i64.p0.i64(<2 x i64>, ptr, i64, <2 x i1>)
-
-declare void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8>, ptr, i64, <32 x i1>)
-declare void @llvm.riscv.masked.strided.store.v2i64.p0.i64(<2 x i64>, ptr, i64, <2 x i1>)
-
-define <32 x i8> @strided_load_i8(ptr %p, i64 %stride, <32 x i1> %m) {
-; CHECK-LABEL: strided_load_i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %res = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr %p, i64 %stride, <32 x i1> %m)
-  ret <32 x i8> %res
-}
-
-define <2 x i64> @strided_load_i64(ptr %p, i64 %stride, <2 x i1> %m) {
-; CHECK-LABEL: strided_load_i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %res = call <2 x i64> @llvm.riscv.masked.strided.load.v2i64.p0.i64(<2 x i64> undef, ptr %p, i64 %stride, <2 x i1> %m)
-  ret <2 x i64> %res
-}
-
-define <32 x i8> @strided_load_i8_splat(ptr %p, <32 x i1> %m) {
-; CHECK-LABEL: strided_load_i8_splat:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    ret
-  %res = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr %p, i64 0, <32 x i1> %m)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @strided_load_i8_reverse(ptr %p, <32 x i1> %m) {
-; CHECK-LABEL: strided_load_i8_reverse:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    li a2, -1
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a0), a2, v0.t
-; CHECK-NEXT:    ret
-  %res = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr %p, i64 -1, <32 x i1> %m)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @strided_load_i8_nostride(ptr %p, <32 x i1> %m) {
-; CHECK-LABEL: strided_load_i8_nostride:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0), v0.t
-; CHECK-NEXT:    ret
-  %res = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0.i64(<32 x i8> undef, ptr %p, i64 1, <32 x i1> %m)
-  ret <32 x i8> %res
-}
-
-
-define void @strided_store_i8(ptr %p, <32 x i8> %v, i64 %stride, <32 x i1> %m) {
-; CHECK-LABEL: strided_store_i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> %v, ptr %p, i64 %stride, <32 x i1> %m)
-  ret void
-}
-
-define void @strided_store_i8_zero(ptr %p, <32 x i8> %v, <32 x i1> %m) {
-; CHECK-LABEL: strided_store_i8_zero:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    ret
-  call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> %v, ptr %p, i64 0, <32 x i1> %m)
-  ret void
-}
-
-define void @strided_store_i8_nostride(ptr %p, <32 x i8> %v, <32 x i1> %m) {
-; CHECK-LABEL: strided_store_i8_nostride:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a0), v0.t
-; CHECK-NEXT:    ret
-  call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> %v, ptr %p, i64 1, <32 x i1> %m)
-  ret void
-}
-
-define void @strided_store_i8_reverse(ptr %p, <32 x i8> %v, <32 x i1> %m) {
-; CHECK-LABEL: strided_store_i8_reverse:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    li a2, -1
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a0), a2, v0.t
-; CHECK-NEXT:    ret
-  call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> %v, ptr %p, i64 -1, <32 x i1> %m)
-  ret void
-}
-
-declare void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64>, ptr, i64, <vscale x 1 x i1>)
-
-declare <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64>, ptr, i64, <vscale x 1 x i1>)
-
-define <vscale x 1 x i64> @strided_load_vscale_i64(ptr %p, i64 %stride, <vscale x 1 x i1> %m) {
-; CHECK-LABEL: strided_load_vscale_i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> undef, ptr %p, i64 %stride, <vscale x 1 x i1> %m)
-  ret <vscale x 1 x i64> %res
-}
-
-define void @strided_store_vscale_i64(ptr %p, <vscale x 1 x i64> %v, i64 %stride, <vscale x 1 x i1> %m) {
-; CHECK-LABEL: strided_store_vscale_i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> %v, ptr %p, i64 %stride, <vscale x 1 x i1> %m)
-  ret void
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index 70412de1d0e91..381d1183995be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -16,12 +16,14 @@ define <vscale x 1 x i64> @gather(ptr %a, i32 %len) {
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3
-; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> undef, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i64> undef, i32 [[TMP2]])
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[ACCUM_NEXT]]
 ;
@@ -59,7 +61,9 @@ define <vscale x 1 x i64> @gather_disjoint_or(ptr %a, i64 %len) {
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP0]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> poison, i32 [[TMP1]])
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 2
@@ -111,11 +115,12 @@ define void @scatter(ptr %a, i32 %len) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;
@@ -144,7 +149,9 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define <vscale x 1 x i64> @gather_loopless(ptr %p, i64 %stride) {
 ; CHECK-LABEL: @gather_loopless(
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE:%.*]], 4
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[P:%.*]], i64 [[TMP1]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[P:%.*]], i64 [[TMP1]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i64> poison, i32 [[TMP2]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -164,7 +171,9 @@ define <vscale x 1 x i64> @gather_loopless(ptr %p, i64 %stride) {
 define <vscale x 1 x i64> @straightline_offset_add(ptr %p, i64 %offset) {
 ; CHECK-LABEL: @straightline_offset_add(
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET:%.*]]
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP1]], i64 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP1]], i64 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i64> poison, i32 [[TMP2]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -184,7 +193,9 @@ define <vscale x 1 x i64> @straightline_offset_add(ptr %p, i64 %offset) {
 define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i64 %offset) {
 ; CHECK-LABEL: @straightline_offset_disjoint_or(
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP1]], i64 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP1]], i64 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i64> poison, i32 [[TMP2]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -202,7 +213,9 @@ define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i64 %offset)
 
 define <vscale x 1 x i64> @straightline_offset_shl(ptr %p) {
 ; CHECK-LABEL: @straightline_offset_shl(
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[P:%.*]], i64 32, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[P:%.*]], i64 32, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> poison, i32 [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -245,7 +258,9 @@ define <vscale x 1 x i64> @straightline_offset_shl_nonc(ptr %p, i64 %shift) {
 ; CHECK-LABEL: @straightline_offset_shl_nonc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 1, [[SHIFT:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[P:%.*]], i64 [[TMP2]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[P:%.*]], i64 [[TMP2]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP3]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP4]], <vscale x 1 x i64> poison, i32 [[TMP3]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -265,7 +280,8 @@ define <vscale x 1 x i64> @straightline_offset_shl_nonc(ptr %p, i64 %shift) {
 define void @scatter_loopless(<vscale x 1 x i64> %x, ptr %p, i64 %stride) {
 ; CHECK-LABEL: @scatter_loopless(
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE:%.*]], 4
-; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> [[X:%.*]], ptr [[P:%.*]], i64 [[TMP1]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> [[X:%.*]], ptr [[P:%.*]], i64 [[TMP1]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -301,8 +317,10 @@ define void @constant_stride(<vscale x 1 x i64> %x, ptr %p, i64 %stride) {
 
 define <vscale x 1 x i64> @vector_base_scalar_offset(ptr %p, i64 %offset) {
 ; CHECK-LABEL: @vector_base_scalar_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET:%.*]]
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP1]], i64 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[PTRS2OFFSET:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[PTRS2OFFSET]], i64 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> poison, i32 [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
@@ -319,8 +337,10 @@ define <vscale x 1 x i64> @vector_base_scalar_offset(ptr %p, i64 %offset) {
 
 define <vscale x 1 x i64> @splat_base_scalar_offset(ptr %p, i64 %offset) {
 ; CHECK-LABEL: @splat_base_scalar_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET:%.*]]
-; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP1]], i64 0, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT:    [[PTRSOFFSET:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[PTRSOFFSET]], i64 0, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> poison, i32 [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 1 x i64> [[X]]
 ;
   %head = insertelement <vscale x 1 x ptr> poison, ptr %p, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 0010f64a93fd6..14976f21b7dbb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT,CHECK-OPT-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT,CHECK-OPT-RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-RV64
 
 declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
 
@@ -823,15 +823,15 @@ define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
   ret <vscale x 1 x half> %load
 }
 
-define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr) {
-; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+define <vscale x 1 x i64> @zero_strided_vadd_nxv1i64(<vscale x 1 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv1i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv1i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
@@ -842,3 +842,69 @@ define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr)
   %w = add <vscale x 1 x i64> %v, %load
   ret <vscale x 1 x i64> %w
 }
+
+define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a1, vlenb
+; CHECK-RV32-NEXT:    srli a2, a1, 3
+; CHECK-RV32-NEXT:    sub a3, a2, a1
+; CHECK-RV32-NEXT:    sltu a4, a2, a3
+; CHECK-RV32-NEXT:    addi a4, a4, -1
+; CHECK-RV32-NEXT:    and a3, a4, a3
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v24, (a0), zero
+; CHECK-RV32-NEXT:    bltu a2, a1, .LBB55_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a2, a1
+; CHECK-RV32-NEXT:  .LBB55_2:
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v0, (a0), zero
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vadd.vv v16, v16, v24
+; CHECK-RV32-NEXT:    vadd.vv v8, v8, v0
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT:    vadd.vx v16, v16, a0
+; CHECK-RV64-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %load = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i32(ptr %ptr, i32 0, <vscale x 16 x i1> splat (i1 true), i32 %vscale)
+  %w = add <vscale x 16 x i64> %v, %load
+  ret <vscale x 16 x i64> %w
+}
+
+define <vscale x 1 x ptr> @zero_strided_vadd_nxv1p0(<vscale x 1 x ptr> %v, ptr %ptr) {
+; CHECK-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV32:       # %bb.0:
+; CHECK-OPT-RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-OPT-RV32-NEXT:    vlse32.v v8, (a0), zero
+; CHECK-OPT-RV32-NEXT:    ret
+;
+; CHECK-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV64:       # %bb.0:
+; CHECK-OPT-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-OPT-RV64-NEXT:    vlse64.v v8, (a0), zero
+; CHECK-OPT-RV64-NEXT:    ret
+;
+; CHECK-NO-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV32:       # %bb.0:
+; CHECK-NO-OPT-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-NO-OPT-RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NO-OPT-RV32-NEXT:    vmv.v.x v8, a0
+; CHECK-NO-OPT-RV32-NEXT:    ret
+;
+; CHECK-NO-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV64:       # %bb.0:
+; CHECK-NO-OPT-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-NO-OPT-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NO-OPT-RV64-NEXT:    vmv.v.x v8, a0
+; CHECK-NO-OPT-RV64-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %load = call <vscale x 1 x ptr> @llvm.experimental.vp.strided.load.nxv1p0.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %vscale)
+  ret <vscale x 1 x ptr> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
index 28d7588b9347a..d2f73826e4e9e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zve64x,+zvl128b | FileCheck %s
 
 define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
 ; CHECK-LABEL: test_v4i16_v4i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
index 503085f01f812..ae8c36a7cb5e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
@@ -928,7 +928,7 @@ declare <vscale x 1 x half> @llvm.riscv.vfmv.s.f.nxv1f16(<vscale x 1 x half>, ha
 define <vscale x 1 x half> @intrinsic_vfmv.s.f_f_nxv1f16(half %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
index 439301ff40110..1f027aef3103d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
@@ -2576,21 +2576,21 @@ entry:
   ret <vscale x 8 x double> %a
 }
 
-declare <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+declare <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x i1>,
   iXLen);
 
-define <vscale x 1 x half> @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+  %a = call <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
     <vscale x 1 x half> %0,
     <vscale x 1 x half> %1,
     <vscale x 1 x half> %2,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
index d9df1d4d30130..e47c2a47d6c64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
@@ -4,21 +4,21 @@
 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
-declare <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+declare <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x i1>,
   iXLen);
 
-define <vscale x 1 x half> @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+  %a = call <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
     <vscale x 1 x half> %1,
@@ -52,21 +52,21 @@ entry:
   ret <vscale x 1 x half> %a
 }
 
-declare <vscale x 2 x half> @llvm.riscv.vfmerge.nxv2f16.nxv2f16(
+declare <vscale x 2 x half> @llvm.riscv.vmerge.nxv2f16.nxv2f16(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
   <vscale x 2 x half>,
   <vscale x 2 x i1>,
   iXLen);
 
-define <vscale x 2 x half> @intrinsic_vfmerge_vvm_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f16_nxv2f16_nxv2f16:
+define <vscale x 2 x half> @intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfmerge.nxv2f16.nxv2f16(
+  %a = call <vscale x 2 x half> @llvm.riscv.vmerge.nxv2f16.nxv2f16(
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
     <vscale x 2 x half> %1,
@@ -100,21 +100,21 @@ entry:
   ret <vscale x 2 x half> %a
 }
 
-declare <vscale x 4 x half> @llvm.riscv.vfmerge.nxv4f16.nxv4f16(
+declare <vscale x 4 x half> @llvm.riscv.vmerge.nxv4f16.nxv4f16(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
   <vscale x 4 x half>,
   <vscale x 4 x i1>,
   iXLen);
 
-define <vscale x 4 x half> @intrinsic_vfmerge_vvm_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f16_nxv4f16_nxv4f16:
+define <vscale x 4 x half> @intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfmerge.nxv4f16.nxv4f16(
+  %a = call <vscale x 4 x half> @llvm.riscv.vmerge.nxv4f16.nxv4f16(
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
     <vscale x 4 x half> %1,
@@ -148,21 +148,21 @@ entry:
   ret <vscale x 4 x half> %a
 }
 
-declare <vscale x 8 x half> @llvm.riscv.vfmerge.nxv8f16.nxv8f16(
+declare <vscale x 8 x half> @llvm.riscv.vmerge.nxv8f16.nxv8f16(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
   <vscale x 8 x half>,
   <vscale x 8 x i1>,
   iXLen);
 
-define <vscale x 8 x half> @intrinsic_vfmerge_vvm_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f16_nxv8f16_nxv8f16:
+define <vscale x 8 x half> @intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfmerge.nxv8f16.nxv8f16(
+  %a = call <vscale x 8 x half> @llvm.riscv.vmerge.nxv8f16.nxv8f16(
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
     <vscale x 8 x half> %1,
@@ -196,21 +196,21 @@ entry:
   ret <vscale x 8 x half> %a
 }
 
-declare <vscale x 16 x half> @llvm.riscv.vfmerge.nxv16f16.nxv16f16(
+declare <vscale x 16 x half> @llvm.riscv.vmerge.nxv16f16.nxv16f16(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
   <vscale x 16 x half>,
   <vscale x 16 x i1>,
   iXLen);
 
-define <vscale x 16 x half> @intrinsic_vfmerge_vvm_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv16f16_nxv16f16_nxv16f16:
+define <vscale x 16 x half> @intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfmerge.nxv16f16.nxv16f16(
+  %a = call <vscale x 16 x half> @llvm.riscv.vmerge.nxv16f16.nxv16f16(
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
     <vscale x 16 x half> %1,
@@ -244,21 +244,21 @@ entry:
   ret <vscale x 16 x half> %a
 }
 
-declare <vscale x 32 x half> @llvm.riscv.vfmerge.nxv32f16.nxv32f16(
+declare <vscale x 32 x half> @llvm.riscv.vmerge.nxv32f16.nxv32f16(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
   <vscale x 32 x half>,
   <vscale x 32 x i1>,
   iXLen);
 
-define <vscale x 32 x half> @intrinsic_vfmerge_vvm_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv32f16_nxv32f16_nxv32f16:
+define <vscale x 32 x half> @intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfmerge.nxv32f16.nxv32f16(
+  %a = call <vscale x 32 x half> @llvm.riscv.vmerge.nxv32f16.nxv32f16(
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
     <vscale x 32 x half> %1,
@@ -292,21 +292,21 @@ entry:
   ret <vscale x 32 x half> %a
 }
 
-declare <vscale x 1 x float> @llvm.riscv.vfmerge.nxv1f32.nxv1f32(
+declare <vscale x 1 x float> @llvm.riscv.vmerge.nxv1f32.nxv1f32(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
   <vscale x 1 x float>,
   <vscale x 1 x i1>,
   iXLen);
 
-define <vscale x 1 x float> @intrinsic_vfmerge_vvm_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f32_nxv1f32_nxv1f32:
+define <vscale x 1 x float> @intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfmerge.nxv1f32.nxv1f32(
+  %a = call <vscale x 1 x float> @llvm.riscv.vmerge.nxv1f32.nxv1f32(
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
     <vscale x 1 x float> %1,
@@ -340,21 +340,21 @@ entry:
   ret <vscale x 1 x float> %a
 }
 
-declare <vscale x 2 x float> @llvm.riscv.vfmerge.nxv2f32.nxv2f32(
+declare <vscale x 2 x float> @llvm.riscv.vmerge.nxv2f32.nxv2f32(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
   <vscale x 2 x float>,
   <vscale x 2 x i1>,
   iXLen);
 
-define <vscale x 2 x float> @intrinsic_vfmerge_vvm_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f32_nxv2f32_nxv2f32:
+define <vscale x 2 x float> @intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfmerge.nxv2f32.nxv2f32(
+  %a = call <vscale x 2 x float> @llvm.riscv.vmerge.nxv2f32.nxv2f32(
     <vscale x 2 x float> undef,
     <vscale x 2 x float> %0,
     <vscale x 2 x float> %1,
@@ -388,21 +388,21 @@ entry:
   ret <vscale x 2 x float> %a
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vfmerge.nxv4f32.nxv4f32(
+declare <vscale x 4 x float> @llvm.riscv.vmerge.nxv4f32.nxv4f32(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
   <vscale x 4 x float>,
   <vscale x 4 x i1>,
   iXLen);
 
-define <vscale x 4 x float> @intrinsic_vfmerge_vvm_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f32_nxv4f32_nxv4f32:
+define <vscale x 4 x float> @intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfmerge.nxv4f32.nxv4f32(
+  %a = call <vscale x 4 x float> @llvm.riscv.vmerge.nxv4f32.nxv4f32(
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
     <vscale x 4 x float> %1,
@@ -436,21 +436,21 @@ entry:
   ret <vscale x 4 x float> %a
 }
 
-declare <vscale x 8 x float> @llvm.riscv.vfmerge.nxv8f32.nxv8f32(
+declare <vscale x 8 x float> @llvm.riscv.vmerge.nxv8f32.nxv8f32(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
   <vscale x 8 x float>,
   <vscale x 8 x i1>,
   iXLen);
 
-define <vscale x 8 x float> @intrinsic_vfmerge_vvm_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f32_nxv8f32_nxv8f32:
+define <vscale x 8 x float> @intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfmerge.nxv8f32.nxv8f32(
+  %a = call <vscale x 8 x float> @llvm.riscv.vmerge.nxv8f32.nxv8f32(
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
     <vscale x 8 x float> %1,
@@ -484,21 +484,21 @@ entry:
   ret <vscale x 8 x float> %a
 }
 
-declare <vscale x 16 x float> @llvm.riscv.vfmerge.nxv16f32.nxv16f32(
+declare <vscale x 16 x float> @llvm.riscv.vmerge.nxv16f32.nxv16f32(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
   <vscale x 16 x float>,
   <vscale x 16 x i1>,
   iXLen);
 
-define <vscale x 16 x float> @intrinsic_vfmerge_vvm_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv16f32_nxv16f32_nxv16f32:
+define <vscale x 16 x float> @intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfmerge.nxv16f32.nxv16f32(
+  %a = call <vscale x 16 x float> @llvm.riscv.vmerge.nxv16f32.nxv16f32(
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
     <vscale x 16 x float> %1,
@@ -532,21 +532,21 @@ entry:
   ret <vscale x 16 x float> %a
 }
 
-declare <vscale x 1 x double> @llvm.riscv.vfmerge.nxv1f64.nxv1f64(
+declare <vscale x 1 x double> @llvm.riscv.vmerge.nxv1f64.nxv1f64(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
   <vscale x 1 x double>,
   <vscale x 1 x i1>,
   iXLen);
 
-define <vscale x 1 x double> @intrinsic_vfmerge_vvm_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f64_nxv1f64_nxv1f64:
+define <vscale x 1 x double> @intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfmerge.nxv1f64.nxv1f64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vmerge.nxv1f64.nxv1f64(
     <vscale x 1 x double> undef,
     <vscale x 1 x double> %0,
     <vscale x 1 x double> %1,
@@ -587,14 +587,14 @@ declare <vscale x 2 x double> @llvm.riscv.vfmerge.nxv2f64.nxv2f64(
   <vscale x 2 x i1>,
   iXLen);
 
-define <vscale x 2 x double> @intrinsic_vfmerge_vvm_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f64_nxv2f64_nxv2f64:
+define <vscale x 2 x double> @intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfmerge.nxv2f64.nxv2f64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vmerge.nxv2f64.nxv2f64(
     <vscale x 2 x double> undef,
     <vscale x 2 x double> %0,
     <vscale x 2 x double> %1,
@@ -628,21 +628,21 @@ entry:
   ret <vscale x 2 x double> %a
 }
 
-declare <vscale x 4 x double> @llvm.riscv.vfmerge.nxv4f64.nxv4f64(
+declare <vscale x 4 x double> @llvm.riscv.vmerge.nxv4f64.nxv4f64(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
   <vscale x 4 x double>,
   <vscale x 4 x i1>,
   iXLen);
 
-define <vscale x 4 x double> @intrinsic_vfmerge_vvm_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f64_nxv4f64_nxv4f64:
+define <vscale x 4 x double> @intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfmerge.nxv4f64.nxv4f64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vmerge.nxv4f64.nxv4f64(
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
     <vscale x 4 x double> %1,
@@ -676,21 +676,21 @@ entry:
   ret <vscale x 4 x double> %a
 }
 
-declare <vscale x 8 x double> @llvm.riscv.vfmerge.nxv8f64.nxv8f64(
+declare <vscale x 8 x double> @llvm.riscv.vmerge.nxv8f64.nxv8f64(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
   <vscale x 8 x double>,
   <vscale x 8 x i1>,
   iXLen);
 
-define <vscale x 8 x double> @intrinsic_vfmerge_vvm_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f64_nxv8f64_nxv8f64:
+define <vscale x 8 x double> @intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfmerge.nxv8f64.nxv8f64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vmerge.nxv8f64.nxv8f64(
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
     <vscale x 8 x double> %1,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
index 47b4b61231122..af1c378c56812 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
@@ -7,7 +7,7 @@ declare half @llvm.riscv.vfmv.f.s.nxv1f16(<vscale x 1 x half>)
 define half @intrinsic_vfmv.f.s_s_nxv1f16(<vscale x 1 x half> %0) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 entry:
@@ -20,7 +20,7 @@ declare half @llvm.riscv.vfmv.f.s.nxv2f16(<vscale x 2 x half>)
 define half @intrinsic_vfmv.f.s_s_nxv2f16(<vscale x 2 x half> %0) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 entry:
@@ -85,7 +85,7 @@ declare float @llvm.riscv.vfmv.f.s.nxv1f32(<vscale x 1 x float>)
 define float @intrinsic_vfmv.f.s_s_nxv1f32(<vscale x 1 x float> %0) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
index b3aab2382370d..1e863a4adbc21 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
@@ -9,7 +9,7 @@ declare <vscale x 1 x half> @llvm.riscv.vfmv.s.f.nxv1f16(<vscale x 1 x half>, ha
 define <vscale x 1 x half> @intrinsic_vfmv.s.f_f_nxv1f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
 entry:
@@ -22,7 +22,7 @@ declare <vscale x 2 x half> @llvm.riscv.vfmv.s.f.nxv2f16(<vscale x 2 x half>, ha
 define <vscale x 2 x half> @intrinsic_vfmv.s.f_f_nxv2f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
 entry:
@@ -87,7 +87,7 @@ declare <vscale x 1 x float> @llvm.riscv.vfmv.s.f.nxv1f32(<vscale x 1 x float>,
 define <vscale x 1 x float> @intrinsic_vfmv.s.f_f_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vle.ll b/llvm/test/CodeGen/RISCV/rvv/vle.ll
index 4f2e490f36ec8..bc92dfd16c880 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vle.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,,+zfbfmin,+zvfbfmin \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,,+zfbfmin,+zvfbfmin \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 390647fd9e6c6..578b5dc6a2560 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -23,7 +23,7 @@ entry:
   ret i64 %1
 }
 
-define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %merge, ptr %p, i64 %vl) {
+define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl) {
   ; CHECK-LABEL: name: test_vleff_nxv8i8_tu
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   liveins: $v8, $x10, $x11
@@ -35,7 +35,7 @@ define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %merge, ptr %p, i64 %vl) {
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
-  %0 = call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff.nxv8i8(<vscale x 8 x i8> %merge, ptr %p, i64 %vl)
+  %0 = call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff.nxv8i8(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl)
   %1 = extractvalue { <vscale x 8 x i8>, i64 } %0, 1
   ret i64 %1
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff.ll b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
index 7a8ed4153c352..39e0a0d02e88d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare { <vscale x 1 x i64>, iXLen } @llvm.riscv.vleff.nxv1i64(
@@ -1636,13 +1636,13 @@ entry:
   ret <vscale x 32 x i16> %b
 }
 
-declare { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.nxv1f16(
+declare { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.nxv1bf16(
   <vscale x 1 x half>,
   ptr,
   iXLen);
 
-define <vscale x 1 x half> @intrinsic_vleff_v_nxv1half_nxv1f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv1half_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vleff_v_nxv1half_nxv1bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv1half_nxv1bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1650,7 +1650,7 @@ define <vscale x 1 x half> @intrinsic_vleff_v_nxv1half_nxv1f16(ptr %0, iXLen %1,
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv1half_nxv1f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv1half_nxv1bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1658,7 +1658,7 @@ define <vscale x 1 x half> @intrinsic_vleff_v_nxv1half_nxv1f16(ptr %0, iXLen %1,
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.nxv1f16(
+  %a = call { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.nxv1bf16(
     <vscale x 1 x half> undef,
     ptr %0,
     iXLen %1)
@@ -1668,15 +1668,15 @@ entry:
   ret <vscale x 1 x half> %b
 }
 
-declare { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.mask.nxv1f16(
+declare { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.mask.nxv1bf16(
   <vscale x 1 x half>,
   ptr,
   <vscale x 1 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 1 x half> @intrinsic_vleff_mask_v_nxv1half_nxv1f16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv1half_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vleff_mask_v_nxv1half_nxv1bf16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv1half_nxv1bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1684,7 +1684,7 @@ define <vscale x 1 x half> @intrinsic_vleff_mask_v_nxv1half_nxv1f16(<vscale x 1
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv1half_nxv1f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv1half_nxv1bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1692,7 +1692,7 @@ define <vscale x 1 x half> @intrinsic_vleff_mask_v_nxv1half_nxv1f16(<vscale x 1
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.mask.nxv1f16(
+  %a = call { <vscale x 1 x half>, iXLen } @llvm.riscv.vleff.mask.nxv1bf16(
     <vscale x 1 x half> %0,
     ptr %1,
     <vscale x 1 x i1> %2,
@@ -1704,13 +1704,13 @@ entry:
   ret <vscale x 1 x half> %b
 }
 
-declare { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.nxv2f16(
+declare { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.nxv2bf16(
   <vscale x 2 x half>,
   ptr,
   iXLen);
 
-define <vscale x 2 x half> @intrinsic_vleff_v_nxv2half_nxv2f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv2half_nxv2f16:
+define <vscale x 2 x half> @intrinsic_vleff_v_nxv2half_nxv2bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv2half_nxv2bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1718,7 +1718,7 @@ define <vscale x 2 x half> @intrinsic_vleff_v_nxv2half_nxv2f16(ptr %0, iXLen %1,
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv2half_nxv2f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv2half_nxv2bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1726,7 +1726,7 @@ define <vscale x 2 x half> @intrinsic_vleff_v_nxv2half_nxv2f16(ptr %0, iXLen %1,
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.nxv2f16(
+  %a = call { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.nxv2bf16(
     <vscale x 2 x half> undef,
     ptr %0,
     iXLen %1)
@@ -1736,15 +1736,15 @@ entry:
   ret <vscale x 2 x half> %b
 }
 
-declare { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.mask.nxv2f16(
+declare { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.mask.nxv2bf16(
   <vscale x 2 x half>,
   ptr,
   <vscale x 2 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 2 x half> @intrinsic_vleff_mask_v_nxv2half_nxv2f16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv2half_nxv2f16:
+define <vscale x 2 x half> @intrinsic_vleff_mask_v_nxv2half_nxv2bf16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv2half_nxv2bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1752,7 +1752,7 @@ define <vscale x 2 x half> @intrinsic_vleff_mask_v_nxv2half_nxv2f16(<vscale x 2
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv2half_nxv2f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv2half_nxv2bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1760,7 +1760,7 @@ define <vscale x 2 x half> @intrinsic_vleff_mask_v_nxv2half_nxv2f16(<vscale x 2
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.mask.nxv2f16(
+  %a = call { <vscale x 2 x half>, iXLen } @llvm.riscv.vleff.mask.nxv2bf16(
     <vscale x 2 x half> %0,
     ptr %1,
     <vscale x 2 x i1> %2,
@@ -1772,13 +1772,13 @@ entry:
   ret <vscale x 2 x half> %b
 }
 
-declare { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.nxv4f16(
+declare { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.nxv4bf16(
   <vscale x 4 x half>,
   ptr,
   iXLen);
 
-define <vscale x 4 x half> @intrinsic_vleff_v_nxv4half_nxv4f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv4half_nxv4f16:
+define <vscale x 4 x half> @intrinsic_vleff_v_nxv4half_nxv4bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv4half_nxv4bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1786,7 +1786,7 @@ define <vscale x 4 x half> @intrinsic_vleff_v_nxv4half_nxv4f16(ptr %0, iXLen %1,
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv4half_nxv4f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv4half_nxv4bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1794,7 +1794,7 @@ define <vscale x 4 x half> @intrinsic_vleff_v_nxv4half_nxv4f16(ptr %0, iXLen %1,
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.nxv4f16(
+  %a = call { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.nxv4bf16(
     <vscale x 4 x half> undef,
     ptr %0,
     iXLen %1)
@@ -1804,15 +1804,15 @@ entry:
   ret <vscale x 4 x half> %b
 }
 
-declare { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.mask.nxv4f16(
+declare { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.mask.nxv4bf16(
   <vscale x 4 x half>,
   ptr,
   <vscale x 4 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 4 x half> @intrinsic_vleff_mask_v_nxv4half_nxv4f16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv4half_nxv4f16:
+define <vscale x 4 x half> @intrinsic_vleff_mask_v_nxv4half_nxv4bf16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv4half_nxv4bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1820,7 +1820,7 @@ define <vscale x 4 x half> @intrinsic_vleff_mask_v_nxv4half_nxv4f16(<vscale x 4
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv4half_nxv4f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv4half_nxv4bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1828,7 +1828,7 @@ define <vscale x 4 x half> @intrinsic_vleff_mask_v_nxv4half_nxv4f16(<vscale x 4
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.mask.nxv4f16(
+  %a = call { <vscale x 4 x half>, iXLen } @llvm.riscv.vleff.mask.nxv4bf16(
     <vscale x 4 x half> %0,
     ptr %1,
     <vscale x 4 x i1> %2,
@@ -1840,13 +1840,13 @@ entry:
   ret <vscale x 4 x half> %b
 }
 
-declare { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.nxv8f16(
+declare { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.nxv8bf16(
   <vscale x 8 x half>,
   ptr,
   iXLen);
 
-define <vscale x 8 x half> @intrinsic_vleff_v_nxv8half_nxv8f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv8half_nxv8f16:
+define <vscale x 8 x half> @intrinsic_vleff_v_nxv8half_nxv8bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv8half_nxv8bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1854,7 +1854,7 @@ define <vscale x 8 x half> @intrinsic_vleff_v_nxv8half_nxv8f16(ptr %0, iXLen %1,
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv8half_nxv8f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv8half_nxv8bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1862,7 +1862,7 @@ define <vscale x 8 x half> @intrinsic_vleff_v_nxv8half_nxv8f16(ptr %0, iXLen %1,
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.nxv8f16(
+  %a = call { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.nxv8bf16(
     <vscale x 8 x half> undef,
     ptr %0,
     iXLen %1)
@@ -1872,15 +1872,15 @@ entry:
   ret <vscale x 8 x half> %b
 }
 
-declare { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.mask.nxv8f16(
+declare { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.mask.nxv8bf16(
   <vscale x 8 x half>,
   ptr,
   <vscale x 8 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 8 x half> @intrinsic_vleff_mask_v_nxv8half_nxv8f16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv8half_nxv8f16:
+define <vscale x 8 x half> @intrinsic_vleff_mask_v_nxv8half_nxv8bf16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv8half_nxv8bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1888,7 +1888,7 @@ define <vscale x 8 x half> @intrinsic_vleff_mask_v_nxv8half_nxv8f16(<vscale x 8
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv8half_nxv8f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv8half_nxv8bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1896,7 +1896,7 @@ define <vscale x 8 x half> @intrinsic_vleff_mask_v_nxv8half_nxv8f16(<vscale x 8
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.mask.nxv8f16(
+  %a = call { <vscale x 8 x half>, iXLen } @llvm.riscv.vleff.mask.nxv8bf16(
     <vscale x 8 x half> %0,
     ptr %1,
     <vscale x 8 x i1> %2,
@@ -1908,13 +1908,13 @@ entry:
   ret <vscale x 8 x half> %b
 }
 
-declare { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.nxv16f16(
+declare { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.nxv16bf16(
   <vscale x 16 x half>,
   ptr,
   iXLen);
 
-define <vscale x 16 x half> @intrinsic_vleff_v_nxv16half_nxv16f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv16half_nxv16f16:
+define <vscale x 16 x half> @intrinsic_vleff_v_nxv16half_nxv16bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv16half_nxv16bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1922,7 +1922,7 @@ define <vscale x 16 x half> @intrinsic_vleff_v_nxv16half_nxv16f16(ptr %0, iXLen
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv16half_nxv16f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv16half_nxv16bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1930,7 +1930,7 @@ define <vscale x 16 x half> @intrinsic_vleff_v_nxv16half_nxv16f16(ptr %0, iXLen
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.nxv16f16(
+  %a = call { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.nxv16bf16(
     <vscale x 16 x half> undef,
     ptr %0,
     iXLen %1)
@@ -1940,15 +1940,15 @@ entry:
   ret <vscale x 16 x half> %b
 }
 
-declare { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.mask.nxv16f16(
+declare { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.mask.nxv16bf16(
   <vscale x 16 x half>,
   ptr,
   <vscale x 16 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 16 x half> @intrinsic_vleff_mask_v_nxv16half_nxv16f16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv16half_nxv16f16:
+define <vscale x 16 x half> @intrinsic_vleff_mask_v_nxv16half_nxv16bf16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv16half_nxv16bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1956,7 +1956,7 @@ define <vscale x 16 x half> @intrinsic_vleff_mask_v_nxv16half_nxv16f16(<vscale x
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv16half_nxv16f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv16half_nxv16bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -1964,7 +1964,7 @@ define <vscale x 16 x half> @intrinsic_vleff_mask_v_nxv16half_nxv16f16(<vscale x
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.mask.nxv16f16(
+  %a = call { <vscale x 16 x half>, iXLen } @llvm.riscv.vleff.mask.nxv16bf16(
     <vscale x 16 x half> %0,
     ptr %1,
     <vscale x 16 x i1> %2,
@@ -1976,13 +1976,13 @@ entry:
   ret <vscale x 16 x half> %b
 }
 
-declare { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.nxv32f16(
+declare { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.nxv32bf16(
   <vscale x 32 x half>,
   ptr,
   iXLen);
 
-define <vscale x 32 x half> @intrinsic_vleff_v_nxv32half_nxv32f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
-; RV32-LABEL: intrinsic_vleff_v_nxv32half_nxv32f16:
+define <vscale x 32 x half> @intrinsic_vleff_v_nxv32half_nxv32bf16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv32half_nxv32bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV32-NEXT:    vle16ff.v v8, (a0)
@@ -1990,7 +1990,7 @@ define <vscale x 32 x half> @intrinsic_vleff_v_nxv32half_nxv32f16(ptr %0, iXLen
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_v_nxv32half_nxv32f16:
+; RV64-LABEL: intrinsic_vleff_v_nxv32half_nxv32bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV64-NEXT:    vle16ff.v v8, (a0)
@@ -1998,7 +1998,7 @@ define <vscale x 32 x half> @intrinsic_vleff_v_nxv32half_nxv32f16(ptr %0, iXLen
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.nxv32f16(
+  %a = call { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.nxv32bf16(
     <vscale x 32 x half> undef,
     ptr %0,
     iXLen %1)
@@ -2008,15 +2008,15 @@ entry:
   ret <vscale x 32 x half> %b
 }
 
-declare { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.mask.nxv32f16(
+declare { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.mask.nxv32bf16(
   <vscale x 32 x half>,
   ptr,
   <vscale x 32 x i1>,
   iXLen,
   iXLen);
 
-define <vscale x 32 x half> @intrinsic_vleff_mask_v_nxv32half_nxv32f16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3, iXLen* %4) nounwind {
-; RV32-LABEL: intrinsic_vleff_mask_v_nxv32half_nxv32f16:
+define <vscale x 32 x half> @intrinsic_vleff_mask_v_nxv32half_nxv32bf16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv32half_nxv32bf16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -2024,7 +2024,7 @@ define <vscale x 32 x half> @intrinsic_vleff_mask_v_nxv32half_nxv32f16(<vscale x
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: intrinsic_vleff_mask_v_nxv32half_nxv32f16:
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv32half_nxv32bf16:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
@@ -2032,7 +2032,7 @@ define <vscale x 32 x half> @intrinsic_vleff_mask_v_nxv32half_nxv32f16(<vscale x
 ; RV64-NEXT:    sd a0, 0(a2)
 ; RV64-NEXT:    ret
 entry:
-  %a = call { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.mask.nxv32f16(
+  %a = call { <vscale x 32 x half>, iXLen } @llvm.riscv.vleff.mask.nxv32bf16(
     <vscale x 32 x half> %0,
     ptr %1,
     <vscale x 32 x i1> %2,
@@ -2044,6 +2044,414 @@ entry:
   ret <vscale x 32 x half> %b
 }
 
+declare { <vscale x 1 x bfloat>, iXLen } @llvm.riscv.vleff.nxv1f16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vleff_v_nxv1bfloat_nxv1f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv1bfloat_nxv1f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv1bfloat_nxv1f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 1 x bfloat>, iXLen } @llvm.riscv.vleff.nxv1f16(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 1 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 1 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 1 x bfloat> %b
+}
+
+declare { <vscale x 1 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv1f16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vleff_mask_v_nxv1bfloat_nxv1f16(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv1bfloat_nxv1f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv1bfloat_nxv1f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 1 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv1f16(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 1 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 1 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 1 x bfloat> %b
+}
+
+declare { <vscale x 2 x bfloat>, iXLen } @llvm.riscv.vleff.nxv2f16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vleff_v_nxv2bfloat_nxv2f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv2bfloat_nxv2f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv2bfloat_nxv2f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 2 x bfloat>, iXLen } @llvm.riscv.vleff.nxv2f16(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 2 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 2 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 2 x bfloat> %b
+}
+
+declare { <vscale x 2 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv2f16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vleff_mask_v_nxv2bfloat_nxv2f16(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv2bfloat_nxv2f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv2bfloat_nxv2f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 2 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv2f16(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 2 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 2 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 2 x bfloat> %b
+}
+
+declare { <vscale x 4 x bfloat>, iXLen } @llvm.riscv.vleff.nxv4f16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vleff_v_nxv4bfloat_nxv4f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv4bfloat_nxv4f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv4bfloat_nxv4f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 4 x bfloat>, iXLen } @llvm.riscv.vleff.nxv4f16(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 4 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 4 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 4 x bfloat> %b
+}
+
+declare { <vscale x 4 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv4f16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vleff_mask_v_nxv4bfloat_nxv4f16(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv4bfloat_nxv4f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv4bfloat_nxv4f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 4 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv4f16(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 4 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 4 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 4 x bfloat> %b
+}
+
+declare { <vscale x 8 x bfloat>, iXLen } @llvm.riscv.vleff.nxv8f16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vleff_v_nxv8bfloat_nxv8f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv8bfloat_nxv8f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv8bfloat_nxv8f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 8 x bfloat>, iXLen } @llvm.riscv.vleff.nxv8f16(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 8 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 8 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 8 x bfloat> %b
+}
+
+declare { <vscale x 8 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv8f16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vleff_mask_v_nxv8bfloat_nxv8f16(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv8bfloat_nxv8f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv8bfloat_nxv8f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 8 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv8f16(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 8 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 8 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 8 x bfloat> %b
+}
+
+declare { <vscale x 16 x bfloat>, iXLen } @llvm.riscv.vleff.nxv16f16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vleff_v_nxv16bfloat_nxv16f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv16bfloat_nxv16f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv16bfloat_nxv16f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 16 x bfloat>, iXLen } @llvm.riscv.vleff.nxv16f16(
+    <vscale x 16 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 16 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 16 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 16 x bfloat> %b
+}
+
+declare { <vscale x 16 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv16f16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vleff_mask_v_nxv16bfloat_nxv16f16(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv16bfloat_nxv16f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv16bfloat_nxv16f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 16 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv16f16(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 16 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 16 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 16 x bfloat> %b
+}
+
+declare { <vscale x 32 x bfloat>, iXLen } @llvm.riscv.vleff.nxv32f16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vleff_v_nxv32bfloat_nxv32f16(ptr %0, iXLen %1, iXLen* %2) nounwind {
+; RV32-LABEL: intrinsic_vleff_v_nxv32bfloat_nxv32f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vle16ff.v v8, (a0)
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_v_nxv32bfloat_nxv32f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT:    vle16ff.v v8, (a0)
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 32 x bfloat>, iXLen } @llvm.riscv.vleff.nxv32f16(
+    <vscale x 32 x bfloat> undef,
+    ptr %0,
+    iXLen %1)
+  %b = extractvalue { <vscale x 32 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 32 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %2
+  ret <vscale x 32 x bfloat> %b
+}
+
+declare { <vscale x 32 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv32f16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vleff_mask_v_nxv32bfloat_nxv32f16(<vscale x 32 x bfloat> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3, iXLen* %4) nounwind {
+; RV32-LABEL: intrinsic_vleff_mask_v_nxv32bfloat_nxv32f16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV32-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV32-NEXT:    csrr a0, vl
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vleff_mask_v_nxv32bfloat_nxv32f16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV64-NEXT:    vle16ff.v v8, (a0), v0.t
+; RV64-NEXT:    csrr a0, vl
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    ret
+entry:
+  %a = call { <vscale x 32 x bfloat>, iXLen } @llvm.riscv.vleff.mask.nxv32f16(
+    <vscale x 32 x bfloat> %0,
+    ptr %1,
+    <vscale x 32 x i1> %2,
+    iXLen %3, iXLen 1)
+  %b = extractvalue { <vscale x 32 x bfloat>, iXLen } %a, 0
+  %c = extractvalue { <vscale x 32 x bfloat>, iXLen } %a, 1
+  store iXLen %c, iXLen* %4
+
+  ret <vscale x 32 x bfloat> %b
+}
+
 declare { <vscale x 1 x i8>, iXLen } @llvm.riscv.vleff.nxv1i8(
   <vscale x 1 x i8>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vloxei-rv64.ll
index c56ee04fb6f60..5d28534972b3f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vloxei-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vloxei-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh,+f,+d -verify-machineinstrs \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs \
 ; RUN:   < %s | FileCheck %s
 
 ; The intrinsics are not supported with RV32.
@@ -960,6 +960,198 @@ entry:
   ret <vscale x 8 x half> %a
 }
 
+declare <vscale x 1 x bfloat> @llvm.riscv.vloxei.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define <vscale x 1 x bfloat> @intrinsic_vloxei_v_nxv1bf16_nxv1bf16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei64.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vloxei.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    <vscale x 1 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vloxei.mask.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64,
+  i64);
+
+define <vscale x 1 x bfloat> @intrinsic_vloxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vloxei.mask.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vloxei.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define <vscale x 2 x bfloat> @intrinsic_vloxei_v_nxv2bf16_nxv2bf16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei64.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vloxei.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    <vscale x 2 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vloxei.mask.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64,
+  i64);
+
+define <vscale x 2 x bfloat> @intrinsic_vloxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vloxei.mask.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vloxei.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define <vscale x 4 x bfloat> @intrinsic_vloxei_v_nxv4bf16_nxv4bf16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei64.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vloxei.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    <vscale x 4 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vloxei.mask.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64,
+  i64);
+
+define <vscale x 4 x bfloat> @intrinsic_vloxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vloxei.mask.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vloxei.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define <vscale x 8 x bfloat> @intrinsic_vloxei_v_nxv8bf16_nxv8bf16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei64.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vloxei.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    <vscale x 8 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vloxei.mask.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64,
+  i64);
+
+define <vscale x 8 x bfloat> @intrinsic_vloxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vloxei.mask.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
 declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
index 8f0141526a62b..65eedbb0cc898 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32(
@@ -4631,6 +4631,246 @@ entry:
   ret <vscale x 32 x half> %a
 }
 
+declare <vscale x 1 x bfloat> @llvm.riscv.vloxei.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vloxei_v_nxv1bf16_nxv1bf16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vloxei.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vloxei.mask.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vloxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vloxei.mask.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vloxei.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vloxei_v_nxv2bf16_nxv2bf16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vloxei.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vloxei.mask.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vloxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vloxei.mask.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vloxei.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vloxei_v_nxv4bf16_nxv4bf16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vloxei.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vloxei.mask.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vloxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vloxei.mask.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vloxei.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vloxei_v_nxv8bf16_nxv8bf16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vloxei.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vloxei.mask.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vloxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vloxei.mask.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vloxei.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vloxei_v_nxv16bf16_nxv16bf16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vloxei.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> undef,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vloxei.mask.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vloxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vloxei.mask.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
 declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlse.ll b/llvm/test/CodeGen/RISCV/rvv/vlse.ll
index 3b191a7f8bb80..3dcd254d0c195 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlse.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare <vscale x 1 x i64> @llvm.riscv.vlse.nxv1i64(
@@ -1414,6 +1414,288 @@ entry:
   ret <vscale x 32 x half> %a
 }
 
+declare <vscale x 1 x bfloat> @llvm.riscv.vlse.nxv1bf16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vlse_v_nxv1bf16_nxv1bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv1bf16_nxv1bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vlse.nxv1bf16(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vlse.mask.nxv1bf16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vlse_mask_v_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv1bf16_nxv1bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vlse.mask.nxv1bf16(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vlse.nxv2bf16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vlse_v_nxv2bf16_nxv2bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv2bf16_nxv2bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vlse.nxv2bf16(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vlse.mask.nxv2bf16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vlse_mask_v_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv2bf16_nxv2bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vlse.mask.nxv2bf16(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vlse.nxv4bf16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vlse_v_nxv4bf16_nxv4bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv4bf16_nxv4bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vlse.nxv4bf16(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vlse.mask.nxv4bf16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vlse_mask_v_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv4bf16_nxv4bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vlse.mask.nxv4bf16(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vlse.nxv8bf16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vlse_v_nxv8bf16_nxv8bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv8bf16_nxv8bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vlse.nxv8bf16(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vlse.mask.nxv8bf16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vlse_mask_v_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv8bf16_nxv8bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vlse.mask.nxv8bf16(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vlse.nxv16bf16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vlse_v_nxv16bf16_nxv16bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv16bf16_nxv16bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vlse.nxv16bf16(
+    <vscale x 16 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vlse.mask.nxv16bf16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vlse_mask_v_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv16bf16_nxv16bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vlse.mask.nxv16bf16(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vlse.nxv32bf16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vlse_v_nxv32bf16_nxv32bf16(ptr %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vlse_v_nxv32bf16_nxv32bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x bfloat> @llvm.riscv.vlse.nxv32bf16(
+    <vscale x 32 x bfloat> undef,
+    ptr %0,
+    iXLen %1,
+    iXLen %2)
+
+  ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vlse.mask.nxv32bf16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vlse_mask_v_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vlse_mask_v_nxv32bf16_nxv32bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, mu
+; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x bfloat> @llvm.riscv.vlse.mask.nxv32bf16(
+    <vscale x 32 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 32 x bfloat> %a
+}
+
 declare <vscale x 1 x i8> @llvm.riscv.vlse.nxv1i8(
   <vscale x 1 x i8>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
index 64e33916ad646..9588c85fbdbdd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+zve64x \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare {<vscale x 16 x i16>,<vscale x 16 x i16>, i32} @llvm.riscv.vlseg2ff.nxv16i16(<vscale x 16 x i16>,<vscale x 16 x i16>, ptr , i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
index 2cc924e67e938..02c2994d96622 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh \
+; RUN: llc -mtriple=riscv64 -mattr=+zve64x \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare {<vscale x 16 x i16>,<vscale x 16 x i16>, i64} @llvm.riscv.vlseg2ff.nxv16i16(<vscale x 16 x i16>,<vscale x 16 x i16>, ptr , i64)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vluxei-rv64.ll
index 93c821b5357c8..b102a12dea9a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vluxei-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vluxei-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh,+f,+d -verify-machineinstrs \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs \
 ; RUN:   < %s | FileCheck %s
 
 ; The intrinsics are not supported with RV32.
@@ -960,6 +960,198 @@ entry:
   ret <vscale x 8 x half> %a
 }
 
+declare <vscale x 1 x bfloat> @llvm.riscv.vluxei.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define <vscale x 1 x bfloat> @intrinsic_vluxei_v_nxv1bf16_nxv1bf16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vluxei64.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vluxei.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    <vscale x 1 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vluxei.mask.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64,
+  i64);
+
+define <vscale x 1 x bfloat> @intrinsic_vluxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vluxei.mask.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vluxei.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define <vscale x 2 x bfloat> @intrinsic_vluxei_v_nxv2bf16_nxv2bf16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vluxei64.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vluxei.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    <vscale x 2 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vluxei.mask.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64,
+  i64);
+
+define <vscale x 2 x bfloat> @intrinsic_vluxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vluxei.mask.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vluxei.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define <vscale x 4 x bfloat> @intrinsic_vluxei_v_nxv4bf16_nxv4bf16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vluxei64.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vluxei.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    <vscale x 4 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vluxei.mask.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64,
+  i64);
+
+define <vscale x 4 x bfloat> @intrinsic_vluxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vluxei.mask.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vluxei.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define <vscale x 8 x bfloat> @intrinsic_vluxei_v_nxv8bf16_nxv8bf16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vluxei64.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vluxei.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    <vscale x 8 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vluxei.mask.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64,
+  i64);
+
+define <vscale x 8 x bfloat> @intrinsic_vluxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vluxei.mask.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
 declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
index 679726fe66695..240f377be1ce3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32(
@@ -1151,6 +1151,246 @@ entry:
   ret <vscale x 16 x half> %a
 }
 
+declare <vscale x 1 x bfloat> @llvm.riscv.vluxei.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vluxei_v_nxv1bf16_nxv1bf16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vluxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vluxei.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> undef,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vluxei.mask.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vluxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x bfloat> @llvm.riscv.vluxei.mask.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vluxei.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vluxei_v_nxv2bf16_nxv2bf16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vluxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vluxei.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> undef,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vluxei.mask.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vluxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x bfloat> @llvm.riscv.vluxei.mask.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vluxei.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vluxei_v_nxv4bf16_nxv4bf16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vluxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vluxei.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> undef,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vluxei.mask.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vluxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x bfloat> @llvm.riscv.vluxei.mask.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vluxei.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vluxei_v_nxv8bf16_nxv8bf16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vluxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vluxei.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> undef,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vluxei.mask.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vluxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x bfloat> @llvm.riscv.vluxei.mask.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vluxei.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vluxei_v_nxv16bf16_nxv16bf16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vluxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vluxei.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> undef,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vluxei.mask.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vluxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x bfloat> @llvm.riscv.vluxei.mask.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x bfloat> %a
+}
+
 declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
index 5aba7ef4cc5b6..e232ac255c56f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
index 22ed56afbd94e..5401bf7db49e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vmerge.ll
index 6550a26d8b885..c6d3941958147 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmerge.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vmerge.nxv1i8.nxv1i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll
deleted file mode 100644
index 0c6600080858b..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll
+++ /dev/null
@@ -1,316 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, i32)
-
-define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, i32)
-
-define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, i32)
-
-define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, i32)
-
-define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, i32)
-
-define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, i32)
-
-define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, i32)
-
-define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, i32 %2)
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, i32)
-
-define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, i32)
-
-define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, i32)
-
-define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, i32)
-
-define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, i32)
-
-define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, i32)
-
-define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, i32 %2)
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, i32)
-
-define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, i32 %2)
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, i32)
-
-define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, i32 %2)
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, i32)
-
-define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, i32 %2)
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, i32)
-
-define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, i32 %2)
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, i32)
-
-define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, i32 %2)
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, i32);
-
-define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vmseq.vi v0, v9, 0
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, i32 %2)
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, i32);
-
-define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vmseq.vi v0, v10, 0
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, i32 %2)
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, i32);
-
-define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vmseq.vi v0, v12, 0
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, i32 %2)
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, i32);
-
-define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vmseq.vi v0, v16, 0
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, i32 %2)
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
similarity index 61%
rename from llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
index 163eb73ee4654..afb9cba328e7e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
@@ -1,302 +1,372 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
-declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, i64);
+declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, iXLen);
 
-define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 1 x i8> %a
 }
 
-declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, i64);
+declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, iXLen);
 
-define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 2 x i8> %a
 }
 
-declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, i64);
+declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, iXLen);
 
-define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 4 x i8> %a
 }
 
-declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, i64);
+declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, iXLen);
 
-define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 8 x i8> %a
 }
 
-declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, i64);
+declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, iXLen);
 
-define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 16 x i8> %a
 }
 
-declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, i64);
+declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, iXLen);
 
-define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 32 x i8> %a
 }
 
-declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, i64);
+declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, iXLen);
 
-define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, i64 %2)
+  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2)
   ret <vscale x 64 x i8> %a
 }
 
-declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, i64);
+declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, iXLen);
 
-define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 1 x i16> %a
 }
 
-declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, i64);
+declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, iXLen);
 
-define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 2 x i16> %a
 }
 
-declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, i64);
+declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, iXLen);
 
-define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 4 x i16> %a
 }
 
-declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, i64);
+declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, iXLen);
 
-define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 8 x i16> %a
 }
 
-declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, i64);
+declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, iXLen);
 
-define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 16 x i16> %a
 }
 
-declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, i64);
+declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, iXLen);
 
-define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, i64 %2)
+  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2)
   ret <vscale x 32 x i16> %a
 }
 
-declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, i64);
+declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, iXLen);
 
-define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, i64 %2)
+  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2)
   ret <vscale x 1 x i32> %a
 }
 
-declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, i64);
+declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, iXLen);
 
-define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, i64 %2)
+  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2)
   ret <vscale x 2 x i32> %a
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, i64);
+declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, iXLen);
 
-define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, i64 %2)
+  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2)
   ret <vscale x 4 x i32> %a
 }
 
-declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, i64);
+declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, iXLen);
 
-define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, i64 %2)
+  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2)
   ret <vscale x 8 x i32> %a
 }
 
-declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, i64);
+declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, iXLen);
 
-define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, i64 %2)
+  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2)
   ret <vscale x 16 x i32> %a
 }
 
-declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
+declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    vmseq.vi v0, v9, 0
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, i64 %2)
+  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2)
   ret <vscale x 1 x i64> %a
 }
 
-declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
+declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vid.v v10
+; RV32-NEXT:    vmseq.vi v0, v10, 0
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, i64 %2)
+  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2)
   ret <vscale x 2 x i64> %a
 }
 
-declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
+declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vid.v v12
+; RV32-NEXT:    vmseq.vi v0, v12, 0
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, i64 %2)
+  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2)
   ret <vscale x 4 x i64> %a
 }
 
-declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
+declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vid.v v16
+; RV32-NEXT:    vmseq.vi v0, v16, 0
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, i64 %2)
+  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2)
   ret <vscale x 8 x i64> %a
 }
 
 ; We should not emit a tail agnostic vlse for a tail undisturbed vmv.s.x
 define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64_bug(<vscale x 1 x i64> %0, ptr %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    ld a0, 0(a0)
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    vmseq.vi v0, v9, 0
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = load i64, ptr %1, align 8
-  %b = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %a, i64 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %a, iXLen 1)
   ret <vscale x 1 x i64> %b
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
similarity index 68%
rename from llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
rename to llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
index ec03f773c7108..3952e48c5c28f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
@@ -2,20 +2,14 @@
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
 
-declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, iXLen)
-
-declare <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, iXLen)
-
-define <vscale x 4 x i32> @vadd(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+define <vscale x 4 x i32> @vadd(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
 ; CHECK-LABEL: vadd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v12
 ; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
-  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl)
   ret <vscale x 4 x i32> %w
 }
 
@@ -30,23 +24,31 @@ define <vscale x 4 x i32> @vadd_mask(<vscale x 4 x i32> %passthru, <vscale x 4 x
   ret <vscale x 4 x i32> %w
 }
 
-define <vscale x 4 x i32> @vadd_undef(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+define <vscale x 4 x i32> @vadd_undef(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
 ; CHECK-LABEL: vadd_undef:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
-  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, iXLen %vl2)
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, iXLen %vl)
   ret <vscale x 4 x i32> %w
 }
 
-; TODO: Is this correct if there's already a passthru in the src?
-define <vscale x 4 x i32> @vadd_same_passthru(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+define <vscale x 4 x i32> @vadd_same_passthru(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
 ; CHECK-LABEL: vadd_same_passthru:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v12
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl)
+  ret <vscale x 4 x i32> %w
+}
+
+define <vscale x 4 x i32> @unfoldable_diff_avl_unknown(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+; CHECK-LABEL: unfoldable_diff_avl_unknown:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v14, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
 ; CHECK-NEXT:    vadd.vv v14, v10, v12
@@ -58,36 +60,50 @@ define <vscale x 4 x i32> @vadd_same_passthru(<vscale x 4 x i32> %passthru, <vsc
   ret <vscale x 4 x i32> %w
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, iXLen, iXLen)
+define <vscale x 4 x i32> @diff_avl_known(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: diff_avl_known:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v12
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 4)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen 8)
+  ret <vscale x 4 x i32> %w
+}
+
+define <vscale x 4 x i32> @diff_avl_vlmax(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
+; CHECK-LABEL: diff_avl_vlmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v12
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl)
+  ret <vscale x 4 x i32> %w
+}
 
-define <vscale x 4 x i32> @vadd_mask_ma(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen %vl2) {
+define <vscale x 4 x i32> @vadd_mask_ma(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl) {
 ; CHECK-LABEL: vadd_mask_ma:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen 2)
-  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl, iXLen 2)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl)
   ret <vscale x 4 x i32> %w
 }
 
-define <vscale x 4 x i32> @vadd_mask_mu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen %vl2) {
+define <vscale x 4 x i32> @vadd_mask_mu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl) {
 ; CHECK-LABEL: vadd_mask_mu:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen 0)
-  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
+  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl, iXLen 0)
+  %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl)
   ret <vscale x 4 x i32> %w
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vle.nxv4i32(<vscale x 4 x i32>, ptr, iXLen)
-
 define <vscale x 4 x i32> @foldable_load(<vscale x 4 x i32> %passthru, ptr %p) {
 ; CHECK-LABEL: foldable_load:
 ; CHECK:       # %bb.0:
@@ -113,10 +129,6 @@ define <vscale x 4 x i32> @unfoldable_load(<vscale x 4 x i32> %passthru, ptr %p,
   ret <vscale x 4 x i32> %w
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, iXLen)
-
-declare <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, iXLen, iXLen)
-
 define <vscale x 4 x float> @unfoldable_vfadd(<vscale x 4 x float> %passthru, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: unfoldable_vfadd:
 ; CHECK:       # %bb.0:
@@ -168,3 +180,17 @@ define <vscale x 2 x i32> @unfoldable_vredsum(<vscale x 2 x i32> %passthru, <vsc
   %b = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32> %passthru, <vscale x 2 x i32> %a, iXLen 1)
   ret <vscale x 2 x i32> %b
 }
+
+define <vscale x 2 x i32> @unfoldable_mismatched_sew(<vscale x 2 x i32> %passthru, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, iXLen %avl) {
+; CHECK-LABEL: unfoldable_mismatched_sew:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v9, v9, v10
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, iXLen %avl)
+  %a.bitcast = bitcast <vscale x 1 x i64> %a to <vscale x 2 x i32>
+  %b = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32> %passthru, <vscale x 2 x i32> %a.bitcast, iXLen %avl)
+  ret <vscale x 2 x i32> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll
deleted file mode 100644
index 6560063ddee26..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll
+++ /dev/null
@@ -1,743 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i64 %1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i64 %1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i64 %1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i64 %1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i64 %1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i64 %1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i64 %1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  i64);
-
-define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
-    <vscale x 1 x half> undef,
-    <vscale x 1 x half> %0,
-    i64 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  i64);
-
-define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
-    <vscale x 2 x half> undef,
-    <vscale x 2 x half> %0,
-    i64 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  i64);
-
-define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
-    <vscale x 4 x half> undef,
-    <vscale x 4 x half> %0,
-    i64 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  i64);
-
-define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
-    <vscale x 8 x half> undef,
-    <vscale x 8 x half> %0,
-    i64 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  i64);
-
-define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
-    <vscale x 16 x half> undef,
-    <vscale x 16 x half> %0,
-    i64 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  i64);
-
-define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
-    <vscale x 32 x half> undef,
-    <vscale x 32 x half> %0,
-    i64 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  i64);
-
-define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
-    <vscale x 1 x float> undef,
-    <vscale x 1 x float> %0,
-    i64 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  i64);
-
-define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
-    <vscale x 2 x float> undef,
-    <vscale x 2 x float> %0,
-    i64 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  i64);
-
-define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
-    <vscale x 4 x float> undef,
-    <vscale x 4 x float> %0,
-    i64 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  i64);
-
-define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
-    <vscale x 8 x float> undef,
-    <vscale x 8 x float> %0,
-    i64 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  i64);
-
-define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
-    <vscale x 16 x float> undef,
-    <vscale x 16 x float> %0,
-    i64 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  i64);
-
-define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
-    <vscale x 1 x double> undef,
-    <vscale x 1 x double> %0,
-    i64 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  i64);
-
-define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
-    <vscale x 2 x double> undef,
-    <vscale x 2 x double> %0,
-    i64 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  i64);
-
-define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
-    <vscale x 4 x double> undef,
-    <vscale x 4 x double> %0,
-    i64 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  i64);
-
-define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
-    <vscale x 8 x double> undef,
-    <vscale x 8 x double> %0,
-    i64 %1)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll
similarity index 87%
rename from llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll
rename to llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll
index ca317209055a2..a061f1829b80a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll
@@ -1,13 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin \
+; RUN:   -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin \
+; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -17,7 +19,7 @@ entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -25,9 +27,9 @@ entry:
 declare <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -37,7 +39,7 @@ entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -45,9 +47,9 @@ entry:
 declare <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -57,7 +59,7 @@ entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -65,9 +67,9 @@ entry:
 declare <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -77,7 +79,7 @@ entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -85,9 +87,9 @@ entry:
 declare <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -97,7 +99,7 @@ entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -105,9 +107,9 @@ entry:
 declare <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -117,7 +119,7 @@ entry:
   %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -125,9 +127,9 @@ entry:
 declare <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i32);
+  iXLen);
 
-define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -137,7 +139,7 @@ entry:
   %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -145,9 +147,9 @@ entry:
 declare <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -157,7 +159,7 @@ entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -165,9 +167,9 @@ entry:
 declare <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -177,7 +179,7 @@ entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -185,9 +187,9 @@ entry:
 declare <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -197,7 +199,7 @@ entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -205,9 +207,9 @@ entry:
 declare <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -217,7 +219,7 @@ entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -225,9 +227,9 @@ entry:
 declare <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -237,7 +239,7 @@ entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -245,9 +247,9 @@ entry:
 declare <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i32);
+  iXLen);
 
-define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -257,7 +259,7 @@ entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -265,9 +267,9 @@ entry:
 declare <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -277,7 +279,7 @@ entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -285,9 +287,9 @@ entry:
 declare <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -297,7 +299,7 @@ entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -305,9 +307,9 @@ entry:
 declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -317,7 +319,7 @@ entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -325,9 +327,9 @@ entry:
 declare <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -337,7 +339,7 @@ entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -345,9 +347,9 @@ entry:
 declare <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i32);
+  iXLen);
 
-define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -357,7 +359,7 @@ entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -365,9 +367,9 @@ entry:
 declare <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -377,7 +379,7 @@ entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -385,9 +387,9 @@ entry:
 declare <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -397,7 +399,7 @@ entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -405,9 +407,9 @@ entry:
 declare <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -417,7 +419,7 @@ entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -425,9 +427,9 @@ entry:
 declare <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -437,7 +439,7 @@ entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -445,9 +447,9 @@ entry:
 declare <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
+define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -457,7 +459,7 @@ entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x half> %a
 }
@@ -465,9 +467,9 @@ entry:
 declare <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
+define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -477,7 +479,7 @@ entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x half> %a
 }
@@ -485,9 +487,9 @@ entry:
 declare <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
+define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -497,7 +499,7 @@ entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x half> %a
 }
@@ -505,9 +507,9 @@ entry:
 declare <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
+define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -517,7 +519,7 @@ entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x half> %a
 }
@@ -525,9 +527,9 @@ entry:
 declare <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
+define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 16 x half> %a
 }
@@ -545,9 +547,9 @@ entry:
 declare <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
-  i32);
+  iXLen);
 
-define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
+define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -557,7 +559,7 @@ entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 32 x half> %a
 }
@@ -565,9 +567,9 @@ entry:
 declare <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
+define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -577,7 +579,7 @@ entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x float> %a
 }
@@ -585,9 +587,9 @@ entry:
 declare <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
+define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -597,7 +599,7 @@ entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
     <vscale x 2 x float> undef,
     <vscale x 2 x float> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x float> %a
 }
@@ -605,9 +607,9 @@ entry:
 declare <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
+define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -617,7 +619,7 @@ entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x float> %a
 }
@@ -625,9 +627,9 @@ entry:
 declare <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
+define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -637,7 +639,7 @@ entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x float> %a
 }
@@ -645,9 +647,9 @@ entry:
 declare <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
-  i32);
+  iXLen);
 
-define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
+define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -657,7 +659,7 @@ entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 16 x float> %a
 }
@@ -665,9 +667,9 @@ entry:
 declare <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
-  i32);
+  iXLen);
 
-define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
+define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -677,7 +679,7 @@ entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
     <vscale x 1 x double> undef,
     <vscale x 1 x double> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 1 x double> %a
 }
@@ -685,9 +687,9 @@ entry:
 declare <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
   <vscale x 2 x double>,
   <vscale x 2 x double>,
-  i32);
+  iXLen);
 
-define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
+define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -697,7 +699,7 @@ entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
     <vscale x 2 x double> undef,
     <vscale x 2 x double> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 2 x double> %a
 }
@@ -705,9 +707,9 @@ entry:
 declare <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
-  i32);
+  iXLen);
 
-define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
+define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -717,7 +719,7 @@ entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 4 x double> %a
 }
@@ -725,9 +727,9 @@ entry:
 declare <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
-  i32);
+  iXLen);
 
-define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
+define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -737,7 +739,7 @@ entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
-    i32 %1)
+    iXLen %1)
 
   ret <vscale x 8 x double> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll
deleted file mode 100644
index 3c2856853e983..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll
+++ /dev/null
@@ -1,853 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
-  <vscale x 1 x i8>,
-  i8,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
-    <vscale x 1 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
-  <vscale x 2 x i8>,
-  i8,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
-    <vscale x 2 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
-  <vscale x 4 x i8>,
-  i8,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
-    <vscale x 4 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
-  <vscale x 8 x i8>,
-  i8,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
-    <vscale x 8 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
-  <vscale x 16 x i8>,
-  i8,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
-    <vscale x 16 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
-  <vscale x 32 x i8>,
-  i8,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
-    <vscale x 32 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
-  <vscale x 64 x i8>,
-  i8,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
-    <vscale x 64 x i8> undef,
-    i8 %0,
-    i32 %1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
-  <vscale x 1 x i16>,
-  i16,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
-    <vscale x 1 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
-  <vscale x 2 x i16>,
-  i16,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
-    <vscale x 2 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
-  <vscale x 4 x i16>,
-  i16,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
-    <vscale x 4 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
-  <vscale x 8 x i16>,
-  i16,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
-    <vscale x 8 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
-  <vscale x 16 x i16>,
-  i16,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
-    <vscale x 16 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
-  <vscale x 32 x i16>,
-  i16,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
-    <vscale x 32 x i16> undef,
-    i16 %0,
-    i32 %1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
-    <vscale x 1 x i32> undef,
-    i32 %0,
-    i32 %1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
-    <vscale x 2 x i32> undef,
-    i32 %0,
-    i32 %1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
-    <vscale x 4 x i32> undef,
-    i32 %0,
-    i32 %1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
-    <vscale x 8 x i32> undef,
-    i32 %0,
-    i32 %1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
-    <vscale x 16 x i32> undef,
-    i32 %0,
-    i32 %1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
-  <vscale x 1 x i64>,
-  i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), zero
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
-    <vscale x 1 x i64> undef,
-    i64 %0,
-    i32 %1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
-  <vscale x 2 x i64>,
-  i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), zero
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
-    <vscale x 2 x i64> undef,
-    i64 %0,
-    i32 %1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
-  <vscale x 4 x i64>,
-  i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), zero
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
-    <vscale x 4 x i64> undef,
-    i64 %0,
-    i32 %1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
-  <vscale x 8 x i64>,
-  i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), zero
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
-    <vscale x 8 x i64> undef,
-    i64 %0,
-    i32 %1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
-    <vscale x 1 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
-    <vscale x 2 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
-    <vscale x 4 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
-    <vscale x 8 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
-    <vscale x 16 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
-    <vscale x 32 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
-    <vscale x 64 x i8> undef,
-    i8 9,
-    i32 %0)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
-    <vscale x 1 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
-    <vscale x 2 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
-    <vscale x 4 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
-    <vscale x 8 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
-    <vscale x 16 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
-    <vscale x 32 x i16> undef,
-    i16 9,
-    i32 %0)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
-    <vscale x 1 x i32> undef,
-    i32 9,
-    i32 %0)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
-    <vscale x 2 x i32> undef,
-    i32 9,
-    i32 %0)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
-    <vscale x 4 x i32> undef,
-    i32 9,
-    i32 %0)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
-    <vscale x 8 x i32> undef,
-    i32 9,
-    i32 %0)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
-    <vscale x 16 x i32> undef,
-    i32 9,
-    i32 %0)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
-    <vscale x 1 x i64> undef,
-    i64 9,
-    i32 %0)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
-    <vscale x 2 x i64> undef,
-    i64 9,
-    i32 %0)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
-    <vscale x 4 x i64> undef,
-    i64 9,
-    i32 %0)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
-    <vscale x 8 x i64> undef,
-    i64 9,
-    i32 %0)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64_vlmax:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 3
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
-    <vscale x 1 x i64> undef,
-    i64 12884901891,
-    i32 -1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64_vlmax:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 3
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
-    <vscale x 2 x i64> undef,
-    i64 12884901891,
-    i32 -1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64_vlmax:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 3
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
-    <vscale x 4 x i64> undef,
-    i64 12884901891,
-    i32 -1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64_vlmax:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 3
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
-    <vscale x 8 x i64> undef,
-    i64 12884901891,
-    i32 -1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll
similarity index 74%
rename from llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll
index feb05fe4fea39..4fa95fb2d945d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll
@@ -1,13 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
   <vscale x 1 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -17,7 +19,7 @@ entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
     <vscale x 1 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -25,9 +27,9 @@ entry:
 declare <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
   <vscale x 2 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -37,7 +39,7 @@ entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
     <vscale x 2 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -45,9 +47,9 @@ entry:
 declare <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
   <vscale x 4 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -57,7 +59,7 @@ entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
     <vscale x 4 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -65,9 +67,9 @@ entry:
 declare <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
   <vscale x 8 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -77,7 +79,7 @@ entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
     <vscale x 8 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -85,9 +87,9 @@ entry:
 declare <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
   <vscale x 16 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -97,7 +99,7 @@ entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
     <vscale x 16 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -105,9 +107,9 @@ entry:
 declare <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
   <vscale x 32 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -117,7 +119,7 @@ entry:
   %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
     <vscale x 32 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -125,9 +127,9 @@ entry:
 declare <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
   <vscale x 64 x i8>,
   i8,
-  i64);
+  iXLen);
 
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -137,7 +139,7 @@ entry:
   %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
     <vscale x 64 x i8> undef,
     i8 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -145,9 +147,9 @@ entry:
 declare <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
   <vscale x 1 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -157,7 +159,7 @@ entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
     <vscale x 1 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -165,9 +167,9 @@ entry:
 declare <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
   <vscale x 2 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -177,7 +179,7 @@ entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
     <vscale x 2 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -185,9 +187,9 @@ entry:
 declare <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
   <vscale x 4 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -197,7 +199,7 @@ entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
     <vscale x 4 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -205,9 +207,9 @@ entry:
 declare <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
   <vscale x 8 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -217,7 +219,7 @@ entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
     <vscale x 8 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -225,9 +227,9 @@ entry:
 declare <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
   <vscale x 16 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -237,7 +239,7 @@ entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
     <vscale x 16 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -245,9 +247,9 @@ entry:
 declare <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
   <vscale x 32 x i16>,
   i16,
-  i64);
+  iXLen);
 
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -257,7 +259,7 @@ entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
     <vscale x 32 x i16> undef,
     i16 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -265,9 +267,9 @@ entry:
 declare <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
   <vscale x 1 x i32>,
   i32,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -277,7 +279,7 @@ entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
     <vscale x 1 x i32> undef,
     i32 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -285,9 +287,9 @@ entry:
 declare <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
   <vscale x 2 x i32>,
   i32,
-  i64);
+  iXLen);
 
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -297,7 +299,7 @@ entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
     <vscale x 2 x i32> undef,
     i32 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -305,9 +307,9 @@ entry:
 declare <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
   <vscale x 4 x i32>,
   i32,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -317,7 +319,7 @@ entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
     <vscale x 4 x i32> undef,
     i32 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -325,9 +327,9 @@ entry:
 declare <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
   <vscale x 8 x i32>,
   i32,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -337,7 +339,7 @@ entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
     <vscale x 8 x i32> undef,
     i32 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -345,9 +347,9 @@ entry:
 declare <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
   <vscale x 16 x i32>,
   i32,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -357,7 +359,7 @@ entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
     <vscale x 16 x i32> undef,
     i32 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -365,19 +367,30 @@ entry:
 declare <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
   <vscale x 1 x i64>,
   i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
     <vscale x 1 x i64> undef,
     i64 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -385,19 +398,30 @@ entry:
 declare <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
   <vscale x 2 x i64>,
   i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
     <vscale x 2 x i64> undef,
     i64 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -405,19 +429,30 @@ entry:
 declare <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
   <vscale x 4 x i64>,
   i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
     <vscale x 4 x i64> undef,
     i64 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -425,24 +460,35 @@ entry:
 declare <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
   <vscale x 8 x i64>,
   i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
     <vscale x 8 x i64> undef,
     i64 %0,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(i64 %0) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -452,12 +498,12 @@ entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
     <vscale x 1 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(i64 %0) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -467,12 +513,12 @@ entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
     <vscale x 2 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(i64 %0) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -482,12 +528,12 @@ entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
     <vscale x 4 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(i64 %0) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -497,12 +543,12 @@ entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
     <vscale x 8 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(i64 %0) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -512,12 +558,12 @@ entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
     <vscale x 16 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(i64 %0) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -527,12 +573,12 @@ entry:
   %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
     <vscale x 32 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(i64 %0) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -542,12 +588,12 @@ entry:
   %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
     <vscale x 64 x i8> undef,
     i8 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(i64 %0) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -557,12 +603,12 @@ entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
     <vscale x 1 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(i64 %0) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -572,12 +618,12 @@ entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
     <vscale x 2 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(i64 %0) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -587,12 +633,12 @@ entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
     <vscale x 4 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(i64 %0) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -602,12 +648,12 @@ entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
     <vscale x 8 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(i64 %0) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -617,12 +663,12 @@ entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
     <vscale x 16 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(i64 %0) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -632,12 +678,12 @@ entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
     <vscale x 32 x i16> undef,
     i16 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(i64 %0) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -647,12 +693,12 @@ entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
     <vscale x 1 x i32> undef,
     i32 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(i64 %0) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -662,12 +708,12 @@ entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
     <vscale x 2 x i32> undef,
     i32 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(i64 %0) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -677,12 +723,12 @@ entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
     <vscale x 4 x i32> undef,
     i32 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(i64 %0) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -692,12 +738,12 @@ entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
     <vscale x 8 x i32> undef,
     i32 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(i64 %0) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -707,12 +753,12 @@ entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
     <vscale x 16 x i32> undef,
     i32 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(i64 %0) nounwind {
+define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -722,12 +768,12 @@ entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
     <vscale x 1 x i64> undef,
     i64 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(i64 %0) nounwind {
+define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -737,12 +783,12 @@ entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
     <vscale x 2 x i64> undef,
     i64 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(i64 %0) nounwind {
+define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -752,12 +798,12 @@ entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
     <vscale x 4 x i64> undef,
     i64 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(i64 %0) nounwind {
+define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(iXLen %0) nounwind {
 ; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -767,7 +813,7 @@ entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
     <vscale x 8 x i64> undef,
     i64 9,
-    i64 %0)
+    iXLen %0)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll
deleted file mode 100644
index 180554baa44a4..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll
+++ /dev/null
@@ -1,300 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-declare i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv1i8(<vscale x 1 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv2i8(<vscale x 2 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv2i8(<vscale x 2 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv2i8(<vscale x 2 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv4i8(<vscale x 4 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv4i8(<vscale x 4 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv4i8(<vscale x 4 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv8i8(<vscale x 8 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv8i8(<vscale x 8 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv8i8(<vscale x 8 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv16i8(<vscale x 16 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv16i8(<vscale x 16 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv16i8(<vscale x 16 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv32i8(<vscale x 32 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv32i8(<vscale x 32 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv32i8(<vscale x 32 x i8> %0)
-  ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv64i8(<vscale x 64 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv64i8(<vscale x 64 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i8 @llvm.riscv.vmv.x.s.nxv64i8(<vscale x 64 x i8> %0)
-  ret i8 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv1i16(<vscale x 1 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv1i16(<vscale x 1 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv1i16(<vscale x 1 x i16> %0)
-  ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv2i16(<vscale x 2 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv2i16(<vscale x 2 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv2i16(<vscale x 2 x i16> %0)
-  ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv4i16(<vscale x 4 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv4i16(<vscale x 4 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv4i16(<vscale x 4 x i16> %0)
-  ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv8i16(<vscale x 8 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv8i16(<vscale x 8 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv8i16(<vscale x 8 x i16> %0)
-  ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv16i16(<vscale x 16 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv16i16(<vscale x 16 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv16i16( <vscale x 16 x i16> %0)
-  ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv32i16( <vscale x 32 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv32i16(<vscale x 32 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i16 @llvm.riscv.vmv.x.s.nxv32i16( <vscale x 32 x i16> %0)
-  ret i16 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv1i32( <vscale x 1 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv1i32(<vscale x 1 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i32 @llvm.riscv.vmv.x.s.nxv1i32( <vscale x 1 x i32> %0)
-  ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv2i32( <vscale x 2 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv2i32(<vscale x 2 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i32 @llvm.riscv.vmv.x.s.nxv2i32( <vscale x 2 x i32> %0)
-  ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv4i32( <vscale x 4 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv4i32(<vscale x 4 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i32 @llvm.riscv.vmv.x.s.nxv4i32( <vscale x 4 x i32> %0)
-  ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv8i32( <vscale x 8 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv8i32(<vscale x 8 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i32 @llvm.riscv.vmv.x.s.nxv8i32( <vscale x 8 x i32> %0)
-  ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv16i32( <vscale x 16 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv16i32(<vscale x 16 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i32 @llvm.riscv.vmv.x.s.nxv16i32( <vscale x 16 x i32> %0)
-  ret i32 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv1i64(<vscale x 1 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vsrl.vx v9, v8, a0
-; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64> %0)
-  ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv2i64(<vscale x 2 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v10, v8, a0
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64> %0)
-  ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv4i64(<vscale x 4 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; CHECK-NEXT:    vsrl.vx v12, v8, a0
-; CHECK-NEXT:    vmv.x.s a1, v12
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64> %0)
-  ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv8i64(<vscale x 8 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT:    vsrl.vx v16, v8, a0
-; CHECK-NEXT:    vmv.x.s a1, v16
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-entry:
-  %a = call i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64> %0)
-  ret i64 %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll
similarity index 80%
rename from llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll
index 8c6c010a85c63..0ec9439e04a08 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8>)
 
@@ -238,11 +241,20 @@ entry:
 declare i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64>)
 
 define i64 @intrinsic_vmv.x.s_s_nxv1i64(<vscale x 1 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
 entry:
   %a = call i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64> %0)
   ret i64 %a
@@ -251,11 +263,20 @@ entry:
 declare i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64>)
 
 define i64 @intrinsic_vmv.x.s_s_nxv2i64(<vscale x 2 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
 entry:
   %a = call i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64> %0)
   ret i64 %a
@@ -264,11 +285,20 @@ entry:
 declare i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64>)
 
 define i64 @intrinsic_vmv.x.s_s_nxv4i64(<vscale x 4 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
 entry:
   %a = call i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64> %0)
   ret i64 %a
@@ -277,11 +307,20 @@ entry:
 declare i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64>)
 
 define i64 @intrinsic_vmv.x.s_s_nxv8i64(<vscale x 8 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
 entry:
   %a = call i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64> %0)
   ret i64 %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
index f958fe815caaa..30edcaf9b15b5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index c0a210e680c79..d4f117fad37ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -536,3 +536,14 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
   store <vscale x 1 x double> %hi, ptr %out
   ret <vscale x 16 x double> %lo
 }
+
+define <vscale x 8 x i8> @vpload_all_active_nxv8i8(ptr %ptr) {
+; CHECK-LABEL: vpload_all_active_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %evl = mul i32 %vscale, 8
+  %load = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 8 x i8> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index ce0ee38bc7047..015d7645aaa29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -459,3 +459,14 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
   call void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 %evl)
   ret void
 }
+
+define void @vpstore_all_active_nxv8i8(<vscale x 8 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: vpstore_all_active_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %evl = mul i32 %vscale, 8
+  call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %val, ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 3ccdb5dfa5e00..30e31cecbf2c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -111,9 +111,9 @@ define float @vreduce_ord_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
 define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
 ; CHECK-LABEL: vreduce_fwadd_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwredusum.vs v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
@@ -126,9 +126,9 @@ define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
 define float @vreduce_ord_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
 ; CHECK-LABEL: vreduce_ord_fwadd_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwredosum.vs v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index f21b42e9519b6..7f2e3cdbfd0e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -13,7 +13,7 @@ declare half @llvm.vp.reduce.fadd.nxv1f16(half, <vscale x 1 x half>, <vscale x 1
 define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_fadd_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -39,7 +39,7 @@ define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1
 define half @vpreduce_ord_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_ord_fadd_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
@@ -67,7 +67,7 @@ declare half @llvm.vp.reduce.fadd.nxv2f16(half, <vscale x 2 x half>, <vscale x 2
 define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_fadd_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -93,7 +93,7 @@ define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2
 define half @vpreduce_ord_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vpreduce_ord_fadd_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfmv.s.f v9, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
@@ -389,7 +389,7 @@ declare float @llvm.vp.reduce.fadd.nxv1f32(float, <vscale x 1 x float>, <vscale
 define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_fadd_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v9, v8, v9, v0.t
@@ -402,7 +402,7 @@ define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x
 define float @vpreduce_ord_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_ord_fadd_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v9, v8, v9, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll
deleted file mode 100644
index 0a8b051336e5c..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll
+++ /dev/null
@@ -1,4299 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i32(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i32(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i32(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i32(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i32(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i32(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i32(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i32(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i32(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i32(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i32(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i32(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i32(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i32(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i32(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i32(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i32(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i32(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i32(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i32(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i32(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i32(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i32(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i32(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i32(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i32(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i32(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i32(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i32(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i32(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i32(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i32(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i32(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i32(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i32(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i32(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i32(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i32(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i32(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i32(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i32(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i32(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i32(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i32(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i32(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i32(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i32(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i32(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i32(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i32(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i32(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i32(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i32(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i32(
-    <vscale x 1 x half> undef,
-    <vscale x 1 x half> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i32(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i32(
-    <vscale x 1 x half> %0,
-    <vscale x 1 x half> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i32(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i32(
-    <vscale x 2 x half> undef,
-    <vscale x 2 x half> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i32(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i32(
-    <vscale x 2 x half> %0,
-    <vscale x 2 x half> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i32(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i32(
-    <vscale x 4 x half> undef,
-    <vscale x 4 x half> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i32(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i32(
-    <vscale x 4 x half> %0,
-    <vscale x 4 x half> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i32(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i32(
-    <vscale x 8 x half> undef,
-    <vscale x 8 x half> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i32(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i32(
-    <vscale x 8 x half> %0,
-    <vscale x 8 x half> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i32(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i32(
-    <vscale x 16 x half> undef,
-    <vscale x 16 x half> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i32(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i32(
-    <vscale x 16 x half> %0,
-    <vscale x 16 x half> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i32(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i32(
-    <vscale x 32 x half> undef,
-    <vscale x 32 x half> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i32(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i32(
-    <vscale x 32 x half> %0,
-    <vscale x 32 x half> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i32>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i32(
-    <vscale x 1 x float> undef,
-    <vscale x 1 x float> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i32(
-    <vscale x 1 x float> %0,
-    <vscale x 1 x float> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i32>,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i32(
-    <vscale x 2 x float> undef,
-    <vscale x 2 x float> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i32(
-    <vscale x 2 x float> %0,
-    <vscale x 2 x float> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i32>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i32(
-    <vscale x 4 x float> undef,
-    <vscale x 4 x float> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i32(
-    <vscale x 4 x float> %0,
-    <vscale x 4 x float> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i32>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i32(
-    <vscale x 8 x float> undef,
-    <vscale x 8 x float> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i32(
-    <vscale x 8 x float> %0,
-    <vscale x 8 x float> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i32>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i32(
-    <vscale x 16 x float> undef,
-    <vscale x 16 x float> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i32(
-    <vscale x 16 x float> %0,
-    <vscale x 16 x float> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i32(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i64>,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i32(
-    <vscale x 1 x double> undef,
-    <vscale x 1 x double> %0,
-    <vscale x 1 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i32(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i32(
-    <vscale x 1 x double> %0,
-    <vscale x 1 x double> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i32(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i64>,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i32(
-    <vscale x 2 x double> undef,
-    <vscale x 2 x double> %0,
-    <vscale x 2 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i32(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i32(
-    <vscale x 2 x double> %0,
-    <vscale x 2 x double> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i32(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i64>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i32(
-    <vscale x 4 x double> undef,
-    <vscale x 4 x double> %0,
-    <vscale x 4 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i32(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i32(
-    <vscale x 4 x double> %0,
-    <vscale x 4 x double> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i32(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i64>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i32(
-    <vscale x 8 x double> undef,
-    <vscale x 8 x double> %0,
-    <vscale x 8 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i32(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i32(
-    <vscale x 8 x double> %0,
-    <vscale x 8 x double> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  i32,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
-    <vscale x 1 x half> undef,
-    <vscale x 1 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, <vscale x 1 x half> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
-    <vscale x 1 x half> %0,
-    <vscale x 1 x half> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  i32,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
-    <vscale x 2 x half> undef,
-    <vscale x 2 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, <vscale x 2 x half> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
-    <vscale x 2 x half> %0,
-    <vscale x 2 x half> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  i32,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
-    <vscale x 4 x half> undef,
-    <vscale x 4 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, <vscale x 4 x half> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
-    <vscale x 4 x half> %0,
-    <vscale x 4 x half> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  i32,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
-    <vscale x 8 x half> undef,
-    <vscale x 8 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, <vscale x 8 x half> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
-    <vscale x 8 x half> %0,
-    <vscale x 8 x half> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  i32,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
-    <vscale x 16 x half> undef,
-    <vscale x 16 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, <vscale x 16 x half> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
-    <vscale x 16 x half> %0,
-    <vscale x 16 x half> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  i32,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
-    <vscale x 32 x half> undef,
-    <vscale x 32 x half> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  i32,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, <vscale x 32 x half> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
-    <vscale x 32 x half> %0,
-    <vscale x 32 x half> %1,
-    i32 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  i32,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
-    <vscale x 1 x float> undef,
-    <vscale x 1 x float> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
-    <vscale x 1 x float> %0,
-    <vscale x 1 x float> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  i32,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
-    <vscale x 2 x float> undef,
-    <vscale x 2 x float> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
-    <vscale x 2 x float> %0,
-    <vscale x 2 x float> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  i32,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
-    <vscale x 4 x float> undef,
-    <vscale x 4 x float> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
-    <vscale x 4 x float> %0,
-    <vscale x 4 x float> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  i32,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
-    <vscale x 8 x float> undef,
-    <vscale x 8 x float> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
-    <vscale x 8 x float> %0,
-    <vscale x 8 x float> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  i32,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
-    <vscale x 16 x float> undef,
-    <vscale x 16 x float> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
-    <vscale x 16 x float> %0,
-    <vscale x 16 x float> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  i32,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vrgather.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
-    <vscale x 1 x double> undef,
-    <vscale x 1 x double> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
-    <vscale x 1 x double> %0,
-    <vscale x 1 x double> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  i32,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
-    <vscale x 2 x double> undef,
-    <vscale x 2 x double> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
-    <vscale x 2 x double> %0,
-    <vscale x 2 x double> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  i32,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vrgather.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
-    <vscale x 4 x double> undef,
-    <vscale x 4 x double> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
-    <vscale x 4 x double> %0,
-    <vscale x 4 x double> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  i32,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vrgather.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
-    <vscale x 8 x double> undef,
-    <vscale x 8 x double> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
-    <vscale x 8 x double> %0,
-    <vscale x 8 x double> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x double> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 9,
-    <vscale x 64 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
-    <vscale x 1 x half> undef,
-    <vscale x 1 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
-    <vscale x 1 x half> %0,
-    <vscale x 1 x half> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x half> %a
-}
-
-define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
-    <vscale x 2 x half> undef,
-    <vscale x 2 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
-    <vscale x 2 x half> %0,
-    <vscale x 2 x half> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x half> %a
-}
-
-define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
-    <vscale x 4 x half> undef,
-    <vscale x 4 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
-    <vscale x 4 x half> %0,
-    <vscale x 4 x half> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x half> %a
-}
-
-define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
-    <vscale x 8 x half> undef,
-    <vscale x 8 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
-    <vscale x 8 x half> %0,
-    <vscale x 8 x half> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x half> %a
-}
-
-define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
-    <vscale x 16 x half> undef,
-    <vscale x 16 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
-    <vscale x 16 x half> %0,
-    <vscale x 16 x half> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x half> %a
-}
-
-define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
-    <vscale x 32 x half> undef,
-    <vscale x 32 x half> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
-    <vscale x 32 x half> %0,
-    <vscale x 32 x half> %1,
-    i32 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x half> %a
-}
-
-define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
-    <vscale x 1 x float> undef,
-    <vscale x 1 x float> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
-    <vscale x 1 x float> %0,
-    <vscale x 1 x float> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x float> %a
-}
-
-define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
-    <vscale x 2 x float> undef,
-    <vscale x 2 x float> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
-    <vscale x 2 x float> %0,
-    <vscale x 2 x float> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x float> %a
-}
-
-define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
-    <vscale x 4 x float> undef,
-    <vscale x 4 x float> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
-    <vscale x 4 x float> %0,
-    <vscale x 4 x float> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x float> %a
-}
-
-define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
-    <vscale x 8 x float> undef,
-    <vscale x 8 x float> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
-    <vscale x 8 x float> %0,
-    <vscale x 8 x float> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x float> %a
-}
-
-define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
-    <vscale x 16 x float> undef,
-    <vscale x 16 x float> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
-    <vscale x 16 x float> %0,
-    <vscale x 16 x float> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x float> %a
-}
-
-define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
-    <vscale x 1 x double> undef,
-    <vscale x 1 x double> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
-    <vscale x 1 x double> %0,
-    <vscale x 1 x double> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x double> %a
-}
-
-define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
-    <vscale x 2 x double> undef,
-    <vscale x 2 x double> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
-    <vscale x 2 x double> %0,
-    <vscale x 2 x double> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x double> %a
-}
-
-define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
-    <vscale x 4 x double> undef,
-    <vscale x 4 x double> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
-    <vscale x 4 x double> %0,
-    <vscale x 4 x double> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x double> %a
-}
-
-define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 9
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
-    <vscale x 8 x double> undef,
-    <vscale x 8 x double> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x double> %a
-}
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
-    <vscale x 8 x double> %0,
-    <vscale x 8 x double> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll
similarity index 72%
rename from llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vrgather.ll
index f6c48690b9bdb..d11e172b25037 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \
+; RUN:   -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \
+; RUN:   -verify-machineinstrs | FileCheck %s
 
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.iXLen(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -16,47 +18,47 @@ define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.iXLen(
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
 
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.iXLen(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.iXLen(
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
 
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.iXLen(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -64,47 +66,47 @@ define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.iXLen(
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
 
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.iXLen(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.iXLen(
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
 
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.iXLen(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -112,47 +114,47 @@ define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.iXLen(
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
 
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.iXLen(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.iXLen(
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
 
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.iXLen(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -160,47 +162,47 @@ define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.iXLen(
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
 
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.iXLen(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.iXLen(
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
 
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.iXLen(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -208,47 +210,47 @@ define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.iXLen(
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
 
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.iXLen(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.iXLen(
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
 
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.iXLen(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -256,47 +258,47 @@ define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.iXLen(
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
 
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.iXLen(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.iXLen(
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
 
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.iXLen(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -304,24 +306,24 @@ define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.iXLen(
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
 
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.iXLen(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -329,23 +331,23 @@ define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<v
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.iXLen(
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
 
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.iXLen(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -353,47 +355,47 @@ define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.iXLen(
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
 
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.iXLen(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.iXLen(
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
 
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.iXLen(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -401,47 +403,47 @@ define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.iXLen(
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
 
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.iXLen(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.iXLen(
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
 
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.iXLen(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -449,47 +451,47 @@ define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.iXLen(
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
 
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.iXLen(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.iXLen(
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
 
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.iXLen(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -497,47 +499,47 @@ define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.iXLen(
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
 
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.iXLen(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.iXLen(
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
 
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.iXLen(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -545,47 +547,47 @@ define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vs
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.iXLen(
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
 
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.iXLen(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.iXLen(
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
 
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.iXLen(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -593,24 +595,24 @@ define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vs
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.iXLen(
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
 
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.iXLen(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -618,23 +620,23 @@ define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i1
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.iXLen(
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
 
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.iXLen(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -642,47 +644,47 @@ define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.iXLen(
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
 
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.iXLen(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.iXLen(
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
 
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.iXLen(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -690,47 +692,47 @@ define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.iXLen(
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
 
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.iXLen(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.iXLen(
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.iXLen(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -738,47 +740,47 @@ define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.iXLen(
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.iXLen(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.iXLen(
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
 
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.iXLen(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -786,47 +788,47 @@ define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.iXLen(
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
 
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.iXLen(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.iXLen(
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
 
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.iXLen(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -834,24 +836,24 @@ define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vs
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.iXLen(
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
 
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.iXLen(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -859,23 +861,23 @@ define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i3
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.iXLen(
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
 
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.iXLen(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -883,47 +885,47 @@ define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.iXLen(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
 
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.iXLen(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.iXLen(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
 
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.iXLen(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -931,47 +933,47 @@ define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.iXLen(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
 
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.iXLen(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.iXLen(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
 
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.iXLen(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -979,47 +981,47 @@ define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.iXLen(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
 
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.iXLen(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.iXLen(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
 
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.iXLen(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1027,24 +1029,24 @@ define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.iXLen(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
 
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.iXLen(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1052,23 +1054,23 @@ define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<v
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.iXLen(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
 
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.iXLen(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -1076,47 +1078,47 @@ define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscal
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.iXLen(
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x half> %a
 }
 
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.iXLen(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.iXLen(
     <vscale x 1 x half> %0,
     <vscale x 1 x half> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x half> %a
 }
 
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.iXLen(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -1124,47 +1126,47 @@ define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscal
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.iXLen(
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x half> %a
 }
 
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.iXLen(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.iXLen(
     <vscale x 2 x half> %0,
     <vscale x 2 x half> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x half> %a
 }
 
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.iXLen(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -1172,47 +1174,47 @@ define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscal
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.iXLen(
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x half> %a
 }
 
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.iXLen(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.iXLen(
     <vscale x 4 x half> %0,
     <vscale x 4 x half> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x half> %a
 }
 
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.iXLen(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -1220,47 +1222,47 @@ define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscal
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.iXLen(
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x half> %a
 }
 
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.iXLen(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.iXLen(
     <vscale x 8 x half> %0,
     <vscale x 8 x half> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x half> %a
 }
 
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.iXLen(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -1268,47 +1270,47 @@ define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<v
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.iXLen(
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x half> %a
 }
 
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.iXLen(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.iXLen(
     <vscale x 16 x half> %0,
     <vscale x 16 x half> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x half> %a
 }
 
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.iXLen(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -1316,24 +1318,24 @@ define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<v
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.iXLen(
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x half> %a
 }
 
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.iXLen(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -1341,23 +1343,23 @@ define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.iXLen(
     <vscale x 32 x half> %0,
     <vscale x 32 x half> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x half> %a
 }
 
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.iXLen(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
   <vscale x 1 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -1365,47 +1367,47 @@ define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vsca
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.iXLen(
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
     <vscale x 1 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x float> %a
 }
 
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.iXLen(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.iXLen(
     <vscale x 1 x float> %0,
     <vscale x 1 x float> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x float> %a
 }
 
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.iXLen(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
   <vscale x 2 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -1413,47 +1415,47 @@ define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vsca
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.iXLen(
     <vscale x 2 x float> undef,
     <vscale x 2 x float> %0,
     <vscale x 2 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x float> %a
 }
 
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.iXLen(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.iXLen(
     <vscale x 2 x float> %0,
     <vscale x 2 x float> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x float> %a
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.iXLen(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
   <vscale x 4 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -1461,47 +1463,47 @@ define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vsca
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.iXLen(
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
     <vscale x 4 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x float> %a
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.iXLen(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.iXLen(
     <vscale x 4 x float> %0,
     <vscale x 4 x float> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x float> %a
 }
 
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.iXLen(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
   <vscale x 8 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -1509,47 +1511,47 @@ define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vsca
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.iXLen(
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
     <vscale x 8 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x float> %a
 }
 
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.iXLen(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.iXLen(
     <vscale x 8 x float> %0,
     <vscale x 8 x float> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x float> %a
 }
 
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.iXLen(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
   <vscale x 16 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1557,24 +1559,24 @@ define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.iXLen(
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
     <vscale x 16 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x float> %a
 }
 
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.iXLen(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -1582,23 +1584,23 @@ define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.iXLen(
     <vscale x 16 x float> %0,
     <vscale x 16 x float> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x float> %a
 }
 
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.iXLen(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
   <vscale x 1 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -1606,47 +1608,47 @@ define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vsc
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.iXLen(
     <vscale x 1 x double> undef,
     <vscale x 1 x double> %0,
     <vscale x 1 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x double> %a
 }
 
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.iXLen(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.iXLen(
     <vscale x 1 x double> %0,
     <vscale x 1 x double> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x double> %a
 }
 
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.iXLen(
   <vscale x 2 x double>,
   <vscale x 2 x double>,
   <vscale x 2 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -1654,47 +1656,47 @@ define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vsc
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.iXLen(
     <vscale x 2 x double> undef,
     <vscale x 2 x double> %0,
     <vscale x 2 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x double> %a
 }
 
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.iXLen(
   <vscale x 2 x double>,
   <vscale x 2 x double>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.iXLen(
     <vscale x 2 x double> %0,
     <vscale x 2 x double> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x double> %a
 }
 
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.iXLen(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
   <vscale x 4 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -1702,47 +1704,47 @@ define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vsc
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.iXLen(
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
     <vscale x 4 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x double> %a
 }
 
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.iXLen(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.iXLen(
     <vscale x 4 x double> %0,
     <vscale x 4 x double> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x double> %a
 }
 
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.iXLen(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
   <vscale x 8 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1750,24 +1752,24 @@ define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vsc
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.iXLen(
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
     <vscale x 8 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x double> %a
 }
 
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.iXLen(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1775,3046 +1777,3046 @@ define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64
 ; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.iXLen(
     <vscale x 8 x double> %0,
     <vscale x 8 x double> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x double> %a
 }
 
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
 
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
 
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
 
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
 
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
 
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
 
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
 
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
 
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
 
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64,
+  iXLen,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
 
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
 
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64,
+  iXLen,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
 
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
 
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64,
+  iXLen,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
 
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
 
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
 
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
 
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
 
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
 
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
 
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
 
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
 
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
 
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64,
+  iXLen,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
 
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
 
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64,
+  iXLen,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
 
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
 
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
 
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
 
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
 
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
 
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
 
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
 
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
 
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64,
+  iXLen,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
 
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vrgather_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_vx_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
 
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
 
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vrgather_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_vx_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
 
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
 
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vrgather_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_vx_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
 
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
 
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vrgather_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_vx_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
 
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
 
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x half> %a
 }
 
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, <vscale x 1 x half> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
     <vscale x 1 x half> %0,
     <vscale x 1 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x half> %a
 }
 
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x half> %a
 }
 
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, <vscale x 2 x half> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
     <vscale x 2 x half> %0,
     <vscale x 2 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x half> %a
 }
 
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x half> %a
 }
 
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, <vscale x 4 x half> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
     <vscale x 4 x half> %0,
     <vscale x 4 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x half> %a
 }
 
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x half> %a
 }
 
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, <vscale x 8 x half> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
     <vscale x 8 x half> %0,
     <vscale x 8 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x half> %a
 }
 
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 16 x half> %a
 }
 
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
-  i64,
+  iXLen,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, <vscale x 16 x half> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
     <vscale x 16 x half> %0,
     <vscale x 16 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x half> %a
 }
 
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 32 x half> %a
 }
 
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
-  i64,
+  iXLen,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, <vscale x 32 x half> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
     <vscale x 32 x half> %0,
     <vscale x 32 x half> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x half> %a
 }
 
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x float> %a
 }
 
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
     <vscale x 1 x float> %0,
     <vscale x 1 x float> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x float> %a
 }
 
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
     <vscale x 2 x float> undef,
     <vscale x 2 x float> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x float> %a
 }
 
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
   <vscale x 2 x float>,
   <vscale x 2 x float>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, <vscale x 2 x float> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
     <vscale x 2 x float> %0,
     <vscale x 2 x float> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x float> %a
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x float> %a
 }
 
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, <vscale x 4 x float> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
     <vscale x 4 x float> %0,
     <vscale x 4 x float> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x float> %a
 }
 
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x float> %a
 }
 
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, <vscale x 8 x float> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
     <vscale x 8 x float> %0,
     <vscale x 8 x float> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x float> %a
 }
 
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 16 x float> %a
 }
 
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
-  i64,
+  iXLen,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, <vscale x 16 x float> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
     <vscale x 16 x float> %0,
     <vscale x 16 x float> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x float> %a
 }
 
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vx v9, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
     <vscale x 1 x double> undef,
     <vscale x 1 x double> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 1 x double> %a
 }
 
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
   <vscale x 1 x double>,
   <vscale x 1 x double>,
-  i64,
+  iXLen,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
     <vscale x 1 x double> %0,
     <vscale x 1 x double> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x double> %a
 }
 
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
   <vscale x 2 x double>,
   <vscale x 2 x double>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
     <vscale x 2 x double> undef,
     <vscale x 2 x double> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 2 x double> %a
 }
 
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
   <vscale x 2 x double>,
   <vscale x 2 x double>,
-  i64,
+  iXLen,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
     <vscale x 2 x double> %0,
     <vscale x 2 x double> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x double> %a
 }
 
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 4 x double> %a
 }
 
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
-  i64,
+  iXLen,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
     <vscale x 4 x double> %0,
     <vscale x 4 x double> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x double> %a
 }
 
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
-    i64 %1,
-    i64 %2)
+    iXLen %1,
+    iXLen %2)
 
   ret <vscale x 8 x double> %a
 }
 
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
-  i64,
+  iXLen,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
     <vscale x 8 x double> %0,
     <vscale x 8 x double> %1,
-    i64 %2,
+    iXLen %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x double> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+  %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+  %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+  %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+  %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+  %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+  %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+  %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 64 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+  %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+  %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+  %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+  %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+  %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+  %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+  %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+  %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+  %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+  %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+  %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vrgather_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_vi_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+  %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vrgather_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_vi_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+  %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vrgather_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_vi_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+  %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vrgather_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_vi_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+  %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x half> %a
 }
 
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+  %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
     <vscale x 1 x half> %0,
     <vscale x 1 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x half> %a
 }
 
-define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x half> %a
 }
 
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+  %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
     <vscale x 2 x half> %0,
     <vscale x 2 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x half> %a
 }
 
-define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x half> %a
 }
 
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+  %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
     <vscale x 4 x half> %0,
     <vscale x 4 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x half> %a
 }
 
-define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x half> %a
 }
 
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+  %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
     <vscale x 8 x half> %0,
     <vscale x 8 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x half> %a
 }
 
-define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 16 x half> %a
 }
 
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+  %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
     <vscale x 16 x half> %0,
     <vscale x 16 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x half> %a
 }
 
-define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 32 x half> %a
 }
 
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+  %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
     <vscale x 32 x half> %0,
     <vscale x 32 x half> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x half> %a
 }
 
-define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x float> %a
 }
 
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+  %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
     <vscale x 1 x float> %0,
     <vscale x 1 x float> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x float> %a
 }
 
-define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
     <vscale x 2 x float> undef,
     <vscale x 2 x float> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x float> %a
 }
 
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+  %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
     <vscale x 2 x float> %0,
     <vscale x 2 x float> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x float> %a
 }
 
-define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x float> %a
 }
 
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+  %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
     <vscale x 4 x float> %0,
     <vscale x 4 x float> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x float> %a
 }
 
-define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x float> %a
 }
 
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+  %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
     <vscale x 8 x float> %0,
     <vscale x 8 x float> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x float> %a
 }
 
-define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 16 x float> %a
 }
 
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+  %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
     <vscale x 16 x float> %0,
     <vscale x 16 x float> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x float> %a
 }
 
-define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v9, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
     <vscale x 1 x double> undef,
     <vscale x 1 x double> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 1 x double> %a
 }
 
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+  %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
     <vscale x 1 x double> %0,
     <vscale x 1 x double> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x double> %a
 }
 
-define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
     <vscale x 2 x double> undef,
     <vscale x 2 x double> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 2 x double> %a
 }
 
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+  %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
     <vscale x 2 x double> %0,
     <vscale x 2 x double> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x double> %a
 }
 
-define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 4 x double> %a
 }
 
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+  %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
     <vscale x 4 x double> %0,
     <vscale x 4 x double> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x double> %a
 }
 
-define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
-    i64 9,
-    i64 %1)
+    iXLen 9,
+    iXLen %1)
 
   ret <vscale x 8 x double> %a
 }
 
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+  %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
     <vscale x 8 x double> %0,
     <vscale x 8 x double> %1,
-    i64 9,
+    iXLen 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x double> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll
deleted file mode 100644
index ce14b19f89a35..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll
+++ /dev/null
@@ -1,1449 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl4re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl2re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
-    <vscale x 1 x half> undef,
-    <vscale x 1 x half> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
-    <vscale x 1 x half> %0,
-    <vscale x 1 x half> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
-    <vscale x 2 x half> undef,
-    <vscale x 2 x half> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
-    <vscale x 2 x half> %0,
-    <vscale x 2 x half> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
-    <vscale x 4 x half> undef,
-    <vscale x 4 x half> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
-    <vscale x 4 x half> %0,
-    <vscale x 4 x half> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
-    <vscale x 8 x half> undef,
-    <vscale x 8 x half> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
-    <vscale x 8 x half> %0,
-    <vscale x 8 x half> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
-    <vscale x 16 x half> undef,
-    <vscale x 16 x half> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
-    <vscale x 16 x half> %0,
-    <vscale x 16 x half> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
-    <vscale x 32 x half> undef,
-    <vscale x 32 x half> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
-    <vscale x 32 x half> %0,
-    <vscale x 32 x half> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv1r.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
-    <vscale x 1 x float> undef,
-    <vscale x 1 x float> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
-    <vscale x 1 x float> %0,
-    <vscale x 1 x float> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
-    <vscale x 4 x float> undef,
-    <vscale x 4 x float> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
-    <vscale x 4 x float> %0,
-    <vscale x 4 x float> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
-    <vscale x 8 x float> undef,
-    <vscale x 8 x float> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
-    <vscale x 8 x float> %0,
-    <vscale x 8 x float> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
-    <vscale x 16 x float> undef,
-    <vscale x 16 x float> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl4re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
-    <vscale x 16 x float> %0,
-    <vscale x 16 x float> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
-    <vscale x 4 x double> undef,
-    <vscale x 4 x double> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
-    <vscale x 4 x double> %0,
-    <vscale x 4 x double> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v24
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
-    <vscale x 8 x double> undef,
-    <vscale x 8 x double> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl2re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
-    <vscale x 8 x double> %0,
-    <vscale x 8 x double> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
similarity index 87%
rename from llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
index ea54912a22619..d1e947e2f3367 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \
+; RUN:   -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \
+; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -20,7 +22,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -30,10 +32,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -45,7 +47,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -54,9 +56,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -68,7 +70,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -78,10 +80,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -93,7 +95,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -102,9 +104,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -116,7 +118,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -126,10 +128,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -141,7 +143,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -150,9 +152,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -164,7 +166,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -174,10 +176,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -189,7 +191,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -198,9 +200,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -212,7 +214,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -222,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -237,7 +239,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -246,9 +248,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -260,7 +262,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -270,10 +272,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -285,7 +287,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -294,9 +296,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -308,7 +310,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -318,10 +320,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -333,7 +335,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -342,9 +344,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -356,7 +358,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -366,10 +368,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -381,7 +383,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -390,9 +392,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -404,7 +406,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -414,10 +416,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -429,7 +431,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -438,9 +440,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -452,7 +454,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -462,10 +464,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -477,7 +479,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -486,9 +488,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -500,7 +502,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -510,10 +512,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -525,7 +527,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -534,9 +536,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -548,7 +550,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -558,10 +560,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -574,7 +576,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -583,9 +585,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -597,7 +599,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -607,10 +609,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -622,7 +624,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -631,9 +633,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -645,7 +647,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -655,10 +657,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -670,7 +672,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -679,9 +681,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -693,7 +695,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -703,10 +705,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -718,7 +720,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -727,9 +729,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -741,7 +743,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -751,10 +753,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re16.v v24, (a0)
@@ -767,7 +769,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -776,9 +778,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -790,7 +792,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -800,10 +802,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -815,7 +817,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -824,9 +826,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -838,7 +840,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -848,10 +850,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl2re16.v v24, (a0)
@@ -864,7 +866,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -873,9 +875,9 @@ declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
   <vscale x 1 x half>,
   <vscale x 1 x half>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -887,7 +889,7 @@ entry:
     <vscale x 1 x half> undef,
     <vscale x 1 x half> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x half> %a
 }
@@ -897,10 +899,10 @@ declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
   <vscale x 1 x half>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -912,7 +914,7 @@ entry:
     <vscale x 1 x half> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x half> %a
 }
@@ -921,9 +923,9 @@ declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
   <vscale x 2 x half>,
   <vscale x 2 x half>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -935,7 +937,7 @@ entry:
     <vscale x 2 x half> undef,
     <vscale x 2 x half> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x half> %a
 }
@@ -945,10 +947,10 @@ declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
   <vscale x 2 x half>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -960,7 +962,7 @@ entry:
     <vscale x 2 x half> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x half> %a
 }
@@ -969,9 +971,9 @@ declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
   <vscale x 4 x half>,
   <vscale x 4 x half>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -983,7 +985,7 @@ entry:
     <vscale x 4 x half> undef,
     <vscale x 4 x half> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x half> %a
 }
@@ -993,10 +995,10 @@ declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
   <vscale x 4 x half>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -1008,7 +1010,7 @@ entry:
     <vscale x 4 x half> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x half> %a
 }
@@ -1017,9 +1019,9 @@ declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
   <vscale x 8 x half>,
   <vscale x 8 x half>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -1031,7 +1033,7 @@ entry:
     <vscale x 8 x half> undef,
     <vscale x 8 x half> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x half> %a
 }
@@ -1041,10 +1043,10 @@ declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
   <vscale x 8 x half>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -1056,7 +1058,7 @@ entry:
     <vscale x 8 x half> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x half> %a
 }
@@ -1065,9 +1067,9 @@ declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
   <vscale x 16 x half>,
   <vscale x 16 x half>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -1079,7 +1081,7 @@ entry:
     <vscale x 16 x half> undef,
     <vscale x 16 x half> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x half> %a
 }
@@ -1089,10 +1091,10 @@ declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
   <vscale x 16 x half>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -1104,7 +1106,7 @@ entry:
     <vscale x 16 x half> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x half> %a
 }
@@ -1113,9 +1115,9 @@ declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
   <vscale x 32 x half>,
   <vscale x 32 x half>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -1127,7 +1129,7 @@ entry:
     <vscale x 32 x half> undef,
     <vscale x 32 x half> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x half> %a
 }
@@ -1137,10 +1139,10 @@ declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
   <vscale x 32 x half>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -1153,7 +1155,7 @@ entry:
     <vscale x 32 x half> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x half> %a
 }
@@ -1162,9 +1164,9 @@ declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
   <vscale x 1 x float>,
   <vscale x 1 x float>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -1176,7 +1178,7 @@ entry:
     <vscale x 1 x float> undef,
     <vscale x 1 x float> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x float> %a
 }
@@ -1186,10 +1188,10 @@ declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
   <vscale x 1 x float>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -1201,7 +1203,7 @@ entry:
     <vscale x 1 x float> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x float> %a
 }
@@ -1210,9 +1212,9 @@ declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
   <vscale x 4 x float>,
   <vscale x 4 x float>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -1224,7 +1226,7 @@ entry:
     <vscale x 4 x float> undef,
     <vscale x 4 x float> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x float> %a
 }
@@ -1234,10 +1236,10 @@ declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
   <vscale x 4 x float>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -1249,7 +1251,7 @@ entry:
     <vscale x 4 x float> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x float> %a
 }
@@ -1258,9 +1260,9 @@ declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
   <vscale x 8 x float>,
   <vscale x 8 x float>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -1272,7 +1274,7 @@ entry:
     <vscale x 8 x float> undef,
     <vscale x 8 x float> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x float> %a
 }
@@ -1282,10 +1284,10 @@ declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
   <vscale x 8 x float>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -1297,7 +1299,7 @@ entry:
     <vscale x 8 x float> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x float> %a
 }
@@ -1306,9 +1308,9 @@ declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
   <vscale x 16 x float>,
   <vscale x 16 x float>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1320,7 +1322,7 @@ entry:
     <vscale x 16 x float> undef,
     <vscale x 16 x float> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x float> %a
 }
@@ -1330,10 +1332,10 @@ declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
   <vscale x 16 x float>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re16.v v24, (a0)
@@ -1346,7 +1348,7 @@ entry:
     <vscale x 16 x float> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x float> %a
 }
@@ -1355,9 +1357,9 @@ declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
   <vscale x 4 x double>,
   <vscale x 4 x double>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -1369,7 +1371,7 @@ entry:
     <vscale x 4 x double> undef,
     <vscale x 4 x double> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x double> %a
 }
@@ -1379,10 +1381,10 @@ declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
   <vscale x 4 x double>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -1394,7 +1396,7 @@ entry:
     <vscale x 4 x double> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x double> %a
 }
@@ -1403,9 +1405,9 @@ declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
   <vscale x 8 x double>,
   <vscale x 8 x double>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen);
 
-define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1417,7 +1419,7 @@ entry:
     <vscale x 8 x double> undef,
     <vscale x 8 x double> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x double> %a
 }
@@ -1427,10 +1429,10 @@ declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
   <vscale x 8 x double>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen);
 
-define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl2re16.v v24, (a0)
@@ -1443,7 +1445,7 @@ entry:
     <vscale x 8 x double> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x double> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll
deleted file mode 100644
index c2586e4bc2d84..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll
+++ /dev/null
@@ -1,2849 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsadd.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v8, v9
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v8, v10
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v8, v12
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v8, v16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vlse64.v v24, (a0), zero
-; CHECK-NEXT:    vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 9,
-    <vscale x 64 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsadd.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd.ll
similarity index 83%
rename from llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vsadd.ll
index ca56ad2122c1f..a108d98c1731b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v9, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vsadd.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v10, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vsadd.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v12, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vsadd.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vsadd.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -2054,27 +2140,39 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
+; RV32-NEXT:    vsadd.vv v8, v16, v24, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vsadd.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -2085,12 +2183,12 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -2102,12 +2200,12 @@ entry:
     <vscale x 1 x i8> %1,
     i8 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -2118,12 +2216,12 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -2135,12 +2233,12 @@ entry:
     <vscale x 2 x i8> %1,
     i8 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -2151,12 +2249,12 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -2168,12 +2266,12 @@ entry:
     <vscale x 4 x i8> %1,
     i8 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -2184,12 +2282,12 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -2201,12 +2299,12 @@ entry:
     <vscale x 8 x i8> %1,
     i8 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -2217,12 +2315,12 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -2234,12 +2332,12 @@ entry:
     <vscale x 16 x i8> %1,
     i8 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -2250,12 +2348,12 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -2267,12 +2365,12 @@ entry:
     <vscale x 32 x i8> %1,
     i8 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -2283,12 +2381,12 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
@@ -2300,12 +2398,12 @@ entry:
     <vscale x 64 x i8> %1,
     i8 9,
     <vscale x 64 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -2316,12 +2414,12 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -2333,12 +2431,12 @@ entry:
     <vscale x 1 x i16> %1,
     i16 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -2349,12 +2447,12 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -2366,12 +2464,12 @@ entry:
     <vscale x 2 x i16> %1,
     i16 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -2382,12 +2480,12 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -2399,12 +2497,12 @@ entry:
     <vscale x 4 x i16> %1,
     i16 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -2415,12 +2513,12 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -2432,12 +2530,12 @@ entry:
     <vscale x 8 x i16> %1,
     i16 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -2448,12 +2546,12 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -2465,12 +2563,12 @@ entry:
     <vscale x 16 x i16> %1,
     i16 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -2481,12 +2579,12 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
@@ -2498,12 +2596,12 @@ entry:
     <vscale x 32 x i16> %1,
     i16 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -2514,12 +2612,12 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -2531,12 +2629,12 @@ entry:
     <vscale x 1 x i32> %1,
     i32 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -2547,12 +2645,12 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -2564,12 +2662,12 @@ entry:
     <vscale x 2 x i32> %1,
     i32 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -2580,12 +2678,12 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -2597,12 +2695,12 @@ entry:
     <vscale x 4 x i32> %1,
     i32 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -2613,12 +2711,12 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -2630,12 +2728,12 @@ entry:
     <vscale x 8 x i32> %1,
     i32 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -2646,12 +2744,12 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
@@ -2663,12 +2761,12 @@ entry:
     <vscale x 16 x i32> %1,
     i32 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -2679,12 +2777,12 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -2696,12 +2794,12 @@ entry:
     <vscale x 1 x i64> %1,
     i64 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -2712,12 +2810,12 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -2729,12 +2827,12 @@ entry:
     <vscale x 2 x i64> %1,
     i64 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -2745,12 +2843,12 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -2762,12 +2860,12 @@ entry:
     <vscale x 4 x i64> %1,
     i64 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2778,12 +2876,12 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
@@ -2795,7 +2893,7 @@ entry:
     <vscale x 8 x i64> %1,
     i64 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll
deleted file mode 100644
index b5fa9a921d46c..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll
+++ /dev/null
@@ -1,2849 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v8, v9
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v8, v10
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v8, v12
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v8, v16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vlse64.v v24, (a0), zero
-; CHECK-NEXT:    vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 9,
-    i32 %1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 9,
-    <vscale x 64 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 9,
-    i32 %1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 9,
-    <vscale x 32 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 9,
-    i32 %1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 9,
-    <vscale x 16 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 9,
-    <vscale x 1 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 9,
-    <vscale x 2 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 9,
-    <vscale x 4 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vi v8, v8, 9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 9,
-    i32 %1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 9,
-    <vscale x 8 x i1> %2,
-    i32 %3, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu.ll
similarity index 83%
rename from llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vsaddu.ll
index 077e45f6408d9..57a89d6fe7d23 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v9, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vsaddu.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v10, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vsaddu.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v12, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vsaddu.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vsaddu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -2054,27 +2140,39 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
+; RV32-NEXT:    vsaddu.vv v8, v16, v24, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vsaddu.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -2085,12 +2183,12 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -2102,12 +2200,12 @@ entry:
     <vscale x 1 x i8> %1,
     i8 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -2118,12 +2216,12 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -2135,12 +2233,12 @@ entry:
     <vscale x 2 x i8> %1,
     i8 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -2151,12 +2249,12 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -2168,12 +2266,12 @@ entry:
     <vscale x 4 x i8> %1,
     i8 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -2184,12 +2282,12 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -2201,12 +2299,12 @@ entry:
     <vscale x 8 x i8> %1,
     i8 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -2217,12 +2315,12 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -2234,12 +2332,12 @@ entry:
     <vscale x 16 x i8> %1,
     i8 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -2250,12 +2348,12 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -2267,12 +2365,12 @@ entry:
     <vscale x 32 x i8> %1,
     i8 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -2283,12 +2381,12 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
@@ -2300,12 +2398,12 @@ entry:
     <vscale x 64 x i8> %1,
     i8 9,
     <vscale x 64 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -2316,12 +2414,12 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -2333,12 +2431,12 @@ entry:
     <vscale x 1 x i16> %1,
     i16 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -2349,12 +2447,12 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -2366,12 +2464,12 @@ entry:
     <vscale x 2 x i16> %1,
     i16 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -2382,12 +2480,12 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -2399,12 +2497,12 @@ entry:
     <vscale x 4 x i16> %1,
     i16 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -2415,12 +2513,12 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -2432,12 +2530,12 @@ entry:
     <vscale x 8 x i16> %1,
     i16 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -2448,12 +2546,12 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -2465,12 +2563,12 @@ entry:
     <vscale x 16 x i16> %1,
     i16 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -2481,12 +2579,12 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
@@ -2498,12 +2596,12 @@ entry:
     <vscale x 32 x i16> %1,
     i16 9,
     <vscale x 32 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -2514,12 +2612,12 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -2531,12 +2629,12 @@ entry:
     <vscale x 1 x i32> %1,
     i32 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -2547,12 +2645,12 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -2564,12 +2662,12 @@ entry:
     <vscale x 2 x i32> %1,
     i32 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -2580,12 +2678,12 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -2597,12 +2695,12 @@ entry:
     <vscale x 4 x i32> %1,
     i32 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -2613,12 +2711,12 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -2630,12 +2728,12 @@ entry:
     <vscale x 8 x i32> %1,
     i32 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -2646,12 +2744,12 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
@@ -2663,12 +2761,12 @@ entry:
     <vscale x 16 x i32> %1,
     i32 9,
     <vscale x 16 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -2679,12 +2777,12 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -2696,12 +2794,12 @@ entry:
     <vscale x 1 x i64> %1,
     i64 9,
     <vscale x 1 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -2712,12 +2810,12 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -2729,12 +2827,12 @@ entry:
     <vscale x 2 x i64> %1,
     i64 9,
     <vscale x 2 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -2745,12 +2843,12 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -2762,12 +2860,12 @@ entry:
     <vscale x 4 x i64> %1,
     i64 9,
     <vscale x 4 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2778,12 +2876,12 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 9,
-    i64 %1)
+    iXLen %1)
 
   ret <vscale x 8 x i64> %a
 }
 
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
@@ -2795,7 +2893,7 @@ entry:
     <vscale x 8 x i64> %1,
     i64 9,
     <vscale x 8 x i1> %2,
-    i64 %3, i64 1)
+    iXLen %3, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vse.ll b/llvm/test/CodeGen/RISCV/rvv/vse.ll
index 6b404db77e3a3..f2ae2136078c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vse.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare void @llvm.riscv.vse.nxv1i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index f8274b4cf8037..b1980fcf420a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -487,42 +487,18 @@ define <vscale x 8 x double> @vfmerge_nzv_nxv8f64(<vscale x 8 x double> %va, <vs
 define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %va, <vscale x 16 x double> %vb) {
 ; CHECK-LABEL: vselect_combine_regression:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmseq.vi v24, v16, 0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v0, v16, 0
+; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmseq.vi v7, v24, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vle64.v v8, (a0), v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vle64.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
   %cond = icmp eq <vscale x 16 x i64> %va, zeroinitializer
   %sel = select <vscale x 16 x i1> %cond, <vscale x 16 x double> %vb, <vscale x 16 x double> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp-bf16.ll
index bf1f52050ebf9..76fd1e1d8293f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp-bf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp-bf16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+v,+zfbfmin,+zvfbfmin -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+v,+zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+v,+zfbfmin,+zvfbfmin -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+v,+zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 1 x bfloat> @llvm.vp.select.nxv1bf16(<vscale x 1 x i1>, <vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
index 4115e6a91f965..fd90e67b1fb2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
@@ -51,7 +51,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2(<vsc
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -84,7 +84,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl3(<vsc
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl3:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -117,7 +117,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl8(<vsc
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl8:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -152,7 +152,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl9(<vsc
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl9:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -187,7 +187,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl15(<vs
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl15:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -213,14 +213,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16(<vs
 ;
 ; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16:
 ; CHECK-512:       # %bb.0: # %entry
-; CHECK-512-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; CHECK-512-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-512-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-512-NEXT:    ret
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -247,14 +247,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047(<
 ;
 ; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047:
 ; CHECK-512:       # %bb.0: # %entry
-; CHECK-512-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; CHECK-512-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-512-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-512-NEXT:    ret
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-64-NEXT:    ret
@@ -269,12 +269,26 @@ entry:
 }
 
 define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    ret
+; CHECK-128-65536-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-128-65536:       # %bb.0: # %entry
+; CHECK-128-65536-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-128-65536-NEXT:    vslide1down.vx v8, v8, a0
+; CHECK-128-65536-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-128-65536-NEXT:    ret
+;
+; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-512:       # %bb.0: # %entry
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; CHECK-512-NEXT:    vslide1down.vx v8, v8, a0
+; CHECK-512-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-512-NEXT:    ret
+;
+; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-64-NEXT:    vslide1down.vx v8, v8, a0
+; CHECK-64-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
     <vscale x 1 x i64> undef,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll
deleted file mode 100644
index 0699737eccdf3..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll
+++ /dev/null
@@ -1,1069 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+f -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m1, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m1, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v9, v9, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m2, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m2, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v10, v10, a0
-; CHECK-NEXT:    vslide1down.vx v10, v10, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m4, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m4, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v12, v12, a0
-; CHECK-NEXT:    vslide1down.vx v12, v12, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m8, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m8, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v16, v16, a0
-; CHECK-NEXT:    vslide1down.vx v16, v16, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down.ll
similarity index 74%
rename from llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vslide1down.ll
index ccb107bf160d4..e7ca2762d7908 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1down.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -326,7 +328,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -335,9 +337,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -348,7 +350,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -358,10 +360,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -373,7 +375,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -382,9 +384,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -395,7 +397,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -405,10 +407,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -420,7 +422,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -429,9 +431,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -442,7 +444,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -452,10 +454,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -467,7 +469,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -476,9 +478,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -489,7 +491,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -499,10 +501,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -514,7 +516,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -523,9 +525,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -536,7 +538,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -546,10 +548,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -561,7 +563,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -570,9 +572,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -583,7 +585,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -593,10 +595,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -608,7 +610,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -617,9 +619,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -630,7 +632,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -640,10 +642,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -655,7 +657,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -664,9 +666,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -677,7 +679,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -687,10 +689,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -702,7 +704,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -711,9 +713,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -724,7 +726,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -734,10 +736,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -749,7 +751,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -758,9 +760,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -771,7 +773,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -781,10 +783,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -796,7 +798,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -805,9 +807,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -818,7 +820,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -828,10 +830,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -843,7 +845,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -852,20 +854,29 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m1, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -875,22 +886,33 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m1, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
+; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v9, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vslide1down.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -899,20 +921,29 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m2, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -922,22 +953,33 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m2, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a0
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vslide1down.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -946,20 +988,29 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m4, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -969,22 +1020,33 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m4, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
+; RV32-NEXT:    vslide1down.vx v12, v12, a0
+; RV32-NEXT:    vslide1down.vx v12, v12, a1
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v12, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vslide1down.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -993,20 +1055,29 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m8, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1016,22 +1087,33 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m8, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    vslide1down.vx v16, v16, a1
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vslide1down.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
index f0d621bef2b91..b26f1cab97c77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
@@ -51,7 +51,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2(<vscal
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -84,7 +84,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl3(<vscal
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl3:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -117,7 +117,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl8(<vscal
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl8:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -152,7 +152,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl9(<vscal
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl9:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -187,7 +187,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl15(<vsca
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl15:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -213,14 +213,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16(<vsca
 ;
 ; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16:
 ; CHECK-512:       # %bb.0: # %entry
-; CHECK-512-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; CHECK-512-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-512-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-512-NEXT:    ret
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -247,14 +247,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047(<vs
 ;
 ; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047:
 ; CHECK-512:       # %bb.0: # %entry
-; CHECK-512-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; CHECK-512-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-512-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-512-NEXT:    ret
 ;
 ; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
 ; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
 ; CHECK-64-NEXT:    ret
@@ -269,12 +269,26 @@ entry:
 }
 
 define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a1
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0
-; CHECK-NEXT:    ret
+; CHECK-128-65536-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-128-65536:       # %bb.0: # %entry
+; CHECK-128-65536-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-128-65536-NEXT:    vslide1up.vx v9, v8, a1
+; CHECK-128-65536-NEXT:    vslide1up.vx v8, v9, a0
+; CHECK-128-65536-NEXT:    ret
+;
+; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-512:       # %bb.0: # %entry
+; CHECK-512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; CHECK-512-NEXT:    vslide1up.vx v9, v8, a1
+; CHECK-512-NEXT:    vslide1up.vx v8, v9, a0
+; CHECK-512-NEXT:    ret
+;
+; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-64-NEXT:    vslide1up.vx v9, v8, a1
+; CHECK-64-NEXT:    vslide1up.vx v8, v9, a0
+; CHECK-64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
     <vscale x 1 x i64> undef,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll
deleted file mode 100644
index 6c82149da5fd4..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll
+++ /dev/null
@@ -1,1059 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i64,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v8, a0
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up.ll
similarity index 75%
rename from llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll
rename to llvm/test/CodeGen/RISCV/rvv/vslide1up.ll
index 55f0196dbeb7d..e1f020af4d5cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1up.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+f -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -20,7 +22,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -30,10 +32,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -45,7 +47,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -54,9 +56,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -68,7 +70,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -78,10 +80,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -93,7 +95,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -102,9 +104,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -116,7 +118,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -126,10 +128,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -141,7 +143,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -150,9 +152,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -164,7 +166,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -174,10 +176,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -189,7 +191,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -198,9 +200,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -212,7 +214,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -222,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -237,7 +239,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -246,9 +248,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -260,7 +262,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -270,10 +272,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -285,7 +287,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -294,9 +296,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -308,7 +310,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -318,10 +320,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -333,7 +335,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -342,9 +344,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -356,7 +358,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -366,10 +368,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -381,7 +383,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -390,9 +392,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -404,7 +406,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -414,10 +416,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -429,7 +431,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -438,9 +440,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -452,7 +454,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -462,10 +464,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -477,7 +479,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -486,9 +488,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -500,7 +502,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -510,10 +512,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -525,7 +527,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -534,9 +536,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -548,7 +550,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -558,10 +560,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -573,7 +575,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -582,9 +584,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -596,7 +598,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -606,10 +608,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -621,7 +623,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -630,9 +632,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -644,7 +646,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -654,10 +656,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -669,7 +671,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -678,9 +680,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -692,7 +694,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -702,10 +704,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -717,7 +719,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -726,9 +728,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -740,7 +742,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -750,10 +752,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -765,7 +767,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -774,9 +776,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -788,7 +790,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -798,10 +800,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -813,7 +815,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -822,9 +824,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -836,7 +838,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -846,10 +848,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -861,7 +863,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -870,23 +872,30 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m1, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v9, v8, a1
-; CHECK-NEXT:    vslide1up.vx v8, v9, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m1, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT:    vslide1up.vx v9, v8, a1
+; RV32-NEXT:    vslide1up.vx v8, v9, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vslide1up.vx v9, v8, a0
+; RV64-NEXT:    vmv.v.v v8, v9
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -896,27 +905,33 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m1, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v9, a1
-; CHECK-NEXT:    vslide1up.vx v9, v10, a0
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m1, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
+; RV32-NEXT:    vslide1up.vx v10, v9, a1
+; RV32-NEXT:    vslide1up.vx v9, v10, a0
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v9, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vslide1up.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -925,23 +940,30 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m2, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v10, v8, a1
-; CHECK-NEXT:    vslide1up.vx v8, v10, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m2, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; RV32-NEXT:    vslide1up.vx v10, v8, a1
+; RV32-NEXT:    vslide1up.vx v8, v10, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vslide1up.vx v10, v8, a0
+; RV64-NEXT:    vmv.v.v v8, v10
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -951,27 +973,33 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m2, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v10, a1
-; CHECK-NEXT:    vslide1up.vx v10, v12, a0
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m2, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m2, ta, ma
+; RV32-NEXT:    vslide1up.vx v12, v10, a1
+; RV32-NEXT:    vslide1up.vx v10, v12, a0
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vslide1up.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -980,23 +1008,30 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m4, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v12, v8, a1
-; CHECK-NEXT:    vslide1up.vx v8, v12, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m4, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; RV32-NEXT:    vslide1up.vx v12, v8, a1
+; RV32-NEXT:    vslide1up.vx v8, v12, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vslide1up.vx v12, v8, a0
+; RV64-NEXT:    vmv.v.v v8, v12
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -1006,27 +1041,33 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m4, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v12, a1
-; CHECK-NEXT:    vslide1up.vx v12, v16, a0
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m4, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
+; RV32-NEXT:    vslide1up.vx v16, v12, a1
+; RV32-NEXT:    vslide1up.vx v12, v16, a0
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v12, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vslide1up.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -1035,23 +1076,30 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a2, a2, e64, m8, ta, ma
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v16, v8, a1
-; CHECK-NEXT:    vslide1up.vx v8, v16, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a2, a2, e64, m8, ta, ma
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vslide1up.vx v16, v8, a1
+; RV32-NEXT:    vslide1up.vx v8, v16, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vslide1up.vx v16, v8, a0
+; RV64-NEXT:    vmv.v.v v8, v16
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1061,27 +1109,33 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a3, a2, e64, m8, ta, ma
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslide1up.vx v24, v16, a1
-; CHECK-NEXT:    vslide1up.vx v16, v24, a0
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetvli a3, a2, e64, m8, ta, ma
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT:    vslide1up.vx v24, v16, a1
+; RV32-NEXT:    vslide1up.vx v16, v24, a0
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vslide1up.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
deleted file mode 100644
index e7d8ae635f75c..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
+++ /dev/null
@@ -1,2166 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-; RUN: not --crash llc -mtriple=riscv32  -mattr=+zve64d 2>&1 \
-; RUN:   < %s | FileCheck %s --check-prefixes=ZVE64D
-
-; ZVE64D: LLVM ERROR: Cannot select: intrinsic %llvm.riscv.vsmul
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i32, i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i32, i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32, i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsmul.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i32, i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i32, i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i32, i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v8, v9
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i32, i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v8, v10
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i32, i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v8, v12
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i32, i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v8, v16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i32 0, i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i32, i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vlse64.v v24, (a0), zero
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i32 0, i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul.ll
similarity index 79%
rename from llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vsmul.ll
index 66bc5c9103a48..bc53bce889ddb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsmul.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-; RUN: not --crash llc -mtriple=riscv64  -mattr=+zve64d 2>&1 \
-; RUN:   < %s | FileCheck %s --check-prefixes=ZVE64D
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: sed 's/iXLen/i64/g' %s | not --crash llc -mtriple=riscv64 \
+; RUN:   -mattr=+zve64d 2>&1 | FileCheck %s --check-prefixes=ZVE64D
 
 ; ZVE64D: LLVM ERROR: Cannot select: intrinsic %llvm.riscv.vsmul
 
@@ -10,9 +12,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -24,7 +26,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -34,9 +36,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -49,7 +51,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -58,9 +60,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -72,7 +74,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -82,9 +84,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -97,7 +99,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -106,9 +108,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -120,7 +122,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -130,9 +132,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -145,7 +147,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -154,9 +156,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -168,7 +170,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -178,9 +180,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -193,7 +195,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -202,9 +204,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -216,7 +218,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -226,9 +228,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -241,7 +243,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -250,9 +252,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -264,7 +266,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -274,9 +276,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -289,7 +291,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -298,9 +300,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -312,7 +314,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -322,9 +324,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -338,7 +340,7 @@ entry:
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -347,9 +349,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -361,7 +363,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -371,9 +373,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -386,7 +388,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -395,9 +397,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -409,7 +411,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -419,9 +421,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -434,7 +436,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -443,9 +445,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -457,7 +459,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -467,9 +469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -482,7 +484,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -491,9 +493,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -505,7 +507,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -515,9 +517,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -530,7 +532,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -539,9 +541,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -553,7 +555,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -563,9 +565,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -578,7 +580,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -587,9 +589,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -601,7 +603,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -611,9 +613,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -627,7 +629,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -636,9 +638,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -650,7 +652,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -660,9 +662,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -675,7 +677,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -684,9 +686,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -698,7 +700,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -708,9 +710,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -723,7 +725,7 @@ entry:
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -732,9 +734,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -746,7 +748,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -756,9 +758,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -771,7 +773,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -780,9 +782,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -794,7 +796,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -804,9 +806,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -819,7 +821,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -828,9 +830,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -842,7 +844,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -852,9 +854,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -868,7 +870,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -877,9 +879,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -891,7 +893,7 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -901,9 +903,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -916,7 +918,7 @@ entry:
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -925,9 +927,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -939,7 +941,7 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -949,9 +951,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -964,7 +966,7 @@ entry:
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -973,9 +975,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -987,7 +989,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -997,9 +999,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1012,7 +1014,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -1021,9 +1023,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1035,7 +1037,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1045,9 +1047,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1061,7 +1063,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1070,9 +1072,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1084,7 +1086,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1094,9 +1096,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1109,7 +1111,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1118,9 +1120,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1132,7 +1134,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1142,9 +1144,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1157,7 +1159,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1166,9 +1168,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1180,7 +1182,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1190,9 +1192,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1205,7 +1207,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1214,9 +1216,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1228,7 +1230,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1238,9 +1240,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1253,7 +1255,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1262,9 +1264,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1276,7 +1278,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1286,9 +1288,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1301,7 +1303,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1310,9 +1312,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1324,7 +1326,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1334,9 +1336,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1349,7 +1351,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1358,9 +1360,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1372,7 +1374,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1382,9 +1384,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1397,7 +1399,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1406,9 +1408,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1420,7 +1422,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1430,9 +1432,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1445,7 +1447,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1454,9 +1456,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1468,7 +1470,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1478,9 +1480,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1493,7 +1495,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1502,9 +1504,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1516,7 +1518,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1526,9 +1528,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1541,7 +1543,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1550,9 +1552,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1564,7 +1566,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1574,9 +1576,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1589,7 +1591,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1598,9 +1600,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1612,7 +1614,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1622,9 +1624,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1637,7 +1639,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1646,9 +1648,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1660,7 +1662,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1670,9 +1672,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1685,7 +1687,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1694,9 +1696,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1708,7 +1710,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1718,9 +1720,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1733,7 +1735,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1742,9 +1744,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1756,7 +1758,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1766,9 +1768,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1781,7 +1783,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1790,9 +1792,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1804,7 +1806,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1814,9 +1816,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1829,7 +1831,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1838,9 +1840,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1852,7 +1854,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1862,9 +1864,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1877,7 +1879,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1886,9 +1888,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i64, i64);
+  iXLen, iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1900,7 +1902,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1910,9 +1912,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i64, i64, i64);
+  iXLen, iXLen, iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrwi vxrm, 0
@@ -1925,7 +1927,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1933,21 +1935,35 @@ entry:
 declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  i64,
+  iXLen, iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsmul.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1957,22 +1973,35 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i64, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen, iXLen, iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v9, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vsmul.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1980,21 +2009,35 @@ entry:
 declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  i64,
+  iXLen, iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsmul.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -2004,22 +2047,35 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i64, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen, iXLen, iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v10, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vsmul.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -2027,21 +2083,35 @@ entry:
 declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  i64,
+  iXLen, iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsmul.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2051,22 +2121,35 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i64, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen, iXLen, iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v12, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vsmul.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2074,21 +2157,35 @@ entry:
 declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vsmul.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  i64,
+  iXLen, iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsmul.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i64 0, i64 %2)
+    iXLen 0, iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -2098,22 +2195,35 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i64, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen, iXLen, iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsmul.vv v8, v16, v24, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vsmul.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i64 0, i64 %4, i64 1)
+    iXLen 0, iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsoxei-rv64.ll
index 168d71dab92d9..f3ad06529210a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsoxei-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsoxei-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh,+f -verify-machineinstrs \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs \
 ; RUN:   < %s | FileCheck %s
 
 ; The intrinsics are not supported with RV32.
@@ -924,6 +924,190 @@ entry:
   ret void
 }
 
+declare void @llvm.riscv.vsoxei.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
 declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/rvv/vsoxei.ll
index bcb00242741cb..89222711d4d91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsoxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsoxei.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32(
@@ -4466,6 +4466,236 @@ entry:
   ret void
 }
 
+declare void @llvm.riscv.vsoxei.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
 declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsse.ll b/llvm/test/CodeGen/RISCV/rvv/vsse.ll
index 9b627bcd66467..6908a2000653b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsse.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare void @llvm.riscv.vsse.nxv1i64(
@@ -1384,6 +1384,282 @@ entry:
   ret void
 }
 
+declare void @llvm.riscv.vsse.nxv1bf16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1bf16_nxv1bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1bf16(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1bf16(
+  <vscale x 1 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1bf16_nxv1bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1bf16(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2bf16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2bf16_nxv2bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2bf16(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2bf16(
+  <vscale x 2 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2bf16_nxv2bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2bf16(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4bf16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4bf16_nxv4bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4bf16(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4bf16(
+  <vscale x 4 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4bf16_nxv4bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4bf16(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8bf16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8bf16_nxv8bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8bf16(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8bf16(
+  <vscale x 8 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8bf16_nxv8bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8bf16(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16bf16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16bf16_nxv16bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16bf16(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16bf16(
+  <vscale x 16 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16bf16_nxv16bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16bf16(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv32bf16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv32bf16_nxv32bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv32bf16(
+    <vscale x 32 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv32bf16(
+  <vscale x 32 x bfloat>,
+  ptr,
+  iXLen,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32bf16_nxv32bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv32bf16(
+    <vscale x 32 x bfloat> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
 declare void @llvm.riscv.vsse.nxv1i8(
   <vscale x 1 x i8>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll
deleted file mode 100644
index 3928e6fbd1f75..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll
+++ /dev/null
@@ -1,2075 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i64 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i64,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i64 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i64 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i64 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i64 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i64 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i64 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vssub.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i64 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i64,
-  i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i64 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i64,
-  i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i64 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i64,
-  i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vssub.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i64 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssub.ll
similarity index 80%
rename from llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll
rename to llvm/test/CodeGen/RISCV/rvv/vssub.ll
index f3ba5daafb683..50fca5e832af5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i32);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i32);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i32);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i32);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i32,
-  i32);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1890,26 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v8, v9
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1919,28 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v9, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vssub.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1949,26 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v8, v10
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1978,28 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v10, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vssub.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -2008,26 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v8, v12
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2037,28 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v12, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vssub.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2067,26 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v8, v16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i32 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -2096,28 +2140,34 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vlse64.v v24, (a0), zero
-; CHECK-NEXT:    vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
+; RV32-NEXT:    vssub.vv v8, v16, v24, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vssub.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll
deleted file mode 100644
index 5e7147158d0f1..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll
+++ /dev/null
@@ -1,2123 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    <vscale x 1 x i8> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    <vscale x 2 x i8> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    <vscale x 4 x i8> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    <vscale x 8 x i8> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    <vscale x 16 x i8> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    <vscale x 32 x i8> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    <vscale x 64 x i8> %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    <vscale x 1 x i16> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    <vscale x 2 x i16> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    <vscale x 4 x i16> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    <vscale x 8 x i16> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    <vscale x 16 x i16> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    <vscale x 32 x i16> %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    <vscale x 1 x i32> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    <vscale x 2 x i32> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    <vscale x 4 x i32> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    <vscale x 8 x i32> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    <vscale x 16 x i32> %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    <vscale x 1 x i64> %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v10
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    <vscale x 2 x i64> %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v12
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    <vscale x 4 x i64> %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vssubu.vv v8, v8, v16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    <vscale x 8 x i64> %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
-    <vscale x 1 x i8> undef,
-    <vscale x 1 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
-  <vscale x 1 x i8>,
-  <vscale x 1 x i8>,
-  i8,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
-    <vscale x 1 x i8> %0,
-    <vscale x 1 x i8> %1,
-    i8 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
-    <vscale x 2 x i8> undef,
-    <vscale x 2 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
-  <vscale x 2 x i8>,
-  <vscale x 2 x i8>,
-  i8,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
-    <vscale x 2 x i8> %0,
-    <vscale x 2 x i8> %1,
-    i8 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
-    <vscale x 4 x i8> undef,
-    <vscale x 4 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
-  <vscale x 4 x i8>,
-  <vscale x 4 x i8>,
-  i8,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
-    <vscale x 4 x i8> %0,
-    <vscale x 4 x i8> %1,
-    i8 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
-    <vscale x 8 x i8> undef,
-    <vscale x 8 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
-  <vscale x 8 x i8>,
-  <vscale x 8 x i8>,
-  i8,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
-    <vscale x 8 x i8> %0,
-    <vscale x 8 x i8> %1,
-    i8 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
-    <vscale x 16 x i8> undef,
-    <vscale x 16 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
-  <vscale x 16 x i8>,
-  <vscale x 16 x i8>,
-  i8,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
-    <vscale x 16 x i8> %0,
-    <vscale x 16 x i8> %1,
-    i8 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
-    <vscale x 32 x i8> undef,
-    <vscale x 32 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
-  <vscale x 32 x i8>,
-  <vscale x 32 x i8>,
-  i8,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
-    <vscale x 32 x i8> %0,
-    <vscale x 32 x i8> %1,
-    i8 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
-    <vscale x 64 x i8> undef,
-    <vscale x 64 x i8> %0,
-    i8 %1,
-    i32 %2)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
-  <vscale x 64 x i8>,
-  <vscale x 64 x i8>,
-  i8,
-  <vscale x 64 x i1>,
-  i32,
-  i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
-    <vscale x 64 x i8> %0,
-    <vscale x 64 x i8> %1,
-    i8 %2,
-    <vscale x 64 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
-    <vscale x 1 x i16> undef,
-    <vscale x 1 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
-  <vscale x 1 x i16>,
-  <vscale x 1 x i16>,
-  i16,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
-    <vscale x 1 x i16> %0,
-    <vscale x 1 x i16> %1,
-    i16 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
-    <vscale x 2 x i16> undef,
-    <vscale x 2 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
-  <vscale x 2 x i16>,
-  <vscale x 2 x i16>,
-  i16,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
-    <vscale x 2 x i16> %0,
-    <vscale x 2 x i16> %1,
-    i16 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
-    <vscale x 4 x i16> undef,
-    <vscale x 4 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
-  <vscale x 4 x i16>,
-  <vscale x 4 x i16>,
-  i16,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
-    <vscale x 4 x i16> %0,
-    <vscale x 4 x i16> %1,
-    i16 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
-    <vscale x 8 x i16> undef,
-    <vscale x 8 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
-  <vscale x 8 x i16>,
-  <vscale x 8 x i16>,
-  i16,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
-    <vscale x 8 x i16> %0,
-    <vscale x 8 x i16> %1,
-    i16 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
-    <vscale x 16 x i16> undef,
-    <vscale x 16 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
-  <vscale x 16 x i16>,
-  <vscale x 16 x i16>,
-  i16,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
-    <vscale x 16 x i16> %0,
-    <vscale x 16 x i16> %1,
-    i16 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
-    <vscale x 32 x i16> undef,
-    <vscale x 32 x i16> %0,
-    i16 %1,
-    i32 %2)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
-  <vscale x 32 x i16>,
-  <vscale x 32 x i16>,
-  i16,
-  <vscale x 32 x i1>,
-  i32,
-  i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
-    <vscale x 32 x i16> %0,
-    <vscale x 32 x i16> %1,
-    i16 %2,
-    <vscale x 32 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
-    <vscale x 1 x i32> undef,
-    <vscale x 1 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
-  <vscale x 1 x i32>,
-  <vscale x 1 x i32>,
-  i32,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
-    <vscale x 1 x i32> %0,
-    <vscale x 1 x i32> %1,
-    i32 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
-    <vscale x 2 x i32> undef,
-    <vscale x 2 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
-  <vscale x 2 x i32>,
-  <vscale x 2 x i32>,
-  i32,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
-    <vscale x 2 x i32> %0,
-    <vscale x 2 x i32> %1,
-    i32 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
-    <vscale x 4 x i32> undef,
-    <vscale x 4 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
-  <vscale x 4 x i32>,
-  <vscale x 4 x i32>,
-  i32,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
-    <vscale x 4 x i32> %0,
-    <vscale x 4 x i32> %1,
-    i32 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
-    <vscale x 8 x i32> undef,
-    <vscale x 8 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
-  <vscale x 8 x i32>,
-  <vscale x 8 x i32>,
-  i32,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
-    <vscale x 8 x i32> %0,
-    <vscale x 8 x i32> %1,
-    i32 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
-    <vscale x 16 x i32> undef,
-    <vscale x 16 x i32> %0,
-    i32 %1,
-    i32 %2)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
-  <vscale x 16 x i32>,
-  <vscale x 16 x i32>,
-  i32,
-  <vscale x 16 x i1>,
-  i32,
-  i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
-    <vscale x 16 x i32> %0,
-    <vscale x 16 x i32> %1,
-    i32 %2,
-    <vscale x 16 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v8, v9
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
-    <vscale x 1 x i64> undef,
-    <vscale x 1 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
-  <vscale x 1 x i64>,
-  <vscale x 1 x i64>,
-  i64,
-  <vscale x 1 x i1>,
-  i32,
-  i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
-    <vscale x 1 x i64> %0,
-    <vscale x 1 x i64> %1,
-    i64 %2,
-    <vscale x 1 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v8, v10
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
-    <vscale x 2 x i64> undef,
-    <vscale x 2 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
-  <vscale x 2 x i64>,
-  <vscale x 2 x i64>,
-  i64,
-  <vscale x 2 x i1>,
-  i32,
-  i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
-    <vscale x 2 x i64> %0,
-    <vscale x 2 x i64> %1,
-    i64 %2,
-    <vscale x 2 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v8, v12
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
-    <vscale x 4 x i64> undef,
-    <vscale x 4 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
-  <vscale x 4 x i64>,
-  <vscale x 4 x i64>,
-  i64,
-  <vscale x 4 x i1>,
-  i32,
-  i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
-    <vscale x 4 x i64> %0,
-    <vscale x 4 x i64> %1,
-    i64 %2,
-    <vscale x 4 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v8, v16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
-    <vscale x 8 x i64> undef,
-    <vscale x 8 x i64> %0,
-    i64 %1,
-    i32 %2)
-
-  ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
-  <vscale x 8 x i64>,
-  <vscale x 8 x i64>,
-  i64,
-  <vscale x 8 x i1>,
-  i32,
-  i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT:    vlse64.v v24, (a0), zero
-; CHECK-NEXT:    vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-entry:
-  %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
-    <vscale x 8 x i64> %0,
-    <vscale x 8 x i64> %1,
-    i64 %2,
-    <vscale x 8 x i1> %3,
-    i32 %4, i32 1)
-
-  ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu.ll
similarity index 80%
rename from llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll
rename to llvm/test/CodeGen/RISCV/rvv/vssubu.ll
index 71b623c555a5f..db1b4ce34e9b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN:   < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs  | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     <vscale x 1 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
     <vscale x 1 x i8> %1,
     <vscale x 1 x i8> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     <vscale x 2 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
     <vscale x 2 x i8> %1,
     <vscale x 2 x i8> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     <vscale x 4 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
     <vscale x 4 x i8> %1,
     <vscale x 4 x i8> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     <vscale x 8 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
     <vscale x 8 x i8> %1,
     <vscale x 8 x i8> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     <vscale x 16 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
     <vscale x 16 x i8> %1,
     <vscale x 16 x i8> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     <vscale x 32 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
     <vscale x 32 x i8> %1,
     <vscale x 32 x i8> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     <vscale x 64 x i8> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
     <vscale x 64 x i8> %1,
     <vscale x 64 x i8> %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     <vscale x 1 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
     <vscale x 1 x i16> %1,
     <vscale x 1 x i16> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     <vscale x 2 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
     <vscale x 2 x i16> %1,
     <vscale x 2 x i16> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     <vscale x 4 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
     <vscale x 4 x i16> %1,
     <vscale x 4 x i16> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     <vscale x 8 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
     <vscale x 8 x i16> %1,
     <vscale x 8 x i16> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     <vscale x 16 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
     <vscale x 16 x i16> %1,
     <vscale x 16 x i16> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     <vscale x 32 x i16> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
     <vscale x 32 x i16> %1,
     <vscale x 32 x i16> %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     <vscale x 1 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
     <vscale x 1 x i32> %1,
     <vscale x 1 x i32> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     <vscale x 2 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
     <vscale x 2 x i32> %1,
     <vscale x 2 x i32> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     <vscale x 4 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
     <vscale x 4 x i32> %1,
     <vscale x 4 x i32> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     <vscale x 8 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
     <vscale x 8 x i32> %1,
     <vscale x 8 x i32> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     <vscale x 16 x i32> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
     <vscale x 16 x i32> %1,
     <vscale x 16 x i32> %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
     <vscale x 1 x i64> %1,
     <vscale x 1 x i64> %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
     <vscale x 2 x i64> %1,
     <vscale x 2 x i64> %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
     <vscale x 4 x i64> %1,
     <vscale x 4 x i64> %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
     <vscale x 8 x i64> %1,
     <vscale x 8 x i64> %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
   <vscale x 1 x i8>,
   <vscale x 1 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
     <vscale x 1 x i8> undef,
     <vscale x 1 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
   <vscale x 1 x i8>,
   i8,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
     <vscale x 1 x i8> %1,
     i8 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i8> %a
 }
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
   <vscale x 2 x i8>,
   <vscale x 2 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
     <vscale x 2 x i8> undef,
     <vscale x 2 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
   <vscale x 2 x i8>,
   i8,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
     <vscale x 2 x i8> %1,
     i8 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i8> %a
 }
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
   <vscale x 4 x i8>,
   <vscale x 4 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
     <vscale x 4 x i8> undef,
     <vscale x 4 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
   <vscale x 4 x i8>,
   i8,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
     <vscale x 4 x i8> %1,
     i8 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i8> %a
 }
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
   <vscale x 8 x i8>,
   <vscale x 8 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
     <vscale x 8 x i8> undef,
     <vscale x 8 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
   <vscale x 8 x i8>,
   i8,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
     <vscale x 8 x i8> %1,
     i8 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i8> %a
 }
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
   <vscale x 16 x i8>,
   <vscale x 16 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
     <vscale x 16 x i8> undef,
     <vscale x 16 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
   <vscale x 16 x i8>,
   i8,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
     <vscale x 16 x i8> %1,
     i8 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i8> %a
 }
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
   <vscale x 32 x i8>,
   <vscale x 32 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
     <vscale x 32 x i8> undef,
     <vscale x 32 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
   <vscale x 32 x i8>,
   i8,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
     <vscale x 32 x i8> %1,
     i8 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i8> %a
 }
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
   <vscale x 64 x i8>,
   <vscale x 64 x i8>,
   i8,
-  i64);
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
     <vscale x 64 x i8> undef,
     <vscale x 64 x i8> %0,
     i8 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
   <vscale x 64 x i8>,
   i8,
   <vscale x 64 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
     <vscale x 64 x i8> %1,
     i8 %2,
     <vscale x 64 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 64 x i8> %a
 }
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
   <vscale x 1 x i16>,
   <vscale x 1 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
     <vscale x 1 x i16> undef,
     <vscale x 1 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
   <vscale x 1 x i16>,
   i16,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
     <vscale x 1 x i16> %1,
     i16 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i16> %a
 }
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
   <vscale x 2 x i16>,
   <vscale x 2 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
     <vscale x 2 x i16> undef,
     <vscale x 2 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
   <vscale x 2 x i16>,
   i16,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
     <vscale x 2 x i16> %1,
     i16 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i16> %a
 }
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
   <vscale x 4 x i16>,
   <vscale x 4 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
     <vscale x 4 x i16> undef,
     <vscale x 4 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
   <vscale x 4 x i16>,
   i16,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
     <vscale x 4 x i16> %1,
     i16 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i16> %a
 }
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
   <vscale x 8 x i16>,
   <vscale x 8 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
     <vscale x 8 x i16> undef,
     <vscale x 8 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
   <vscale x 8 x i16>,
   i16,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
     <vscale x 8 x i16> %1,
     i16 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i16> %a
 }
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
   <vscale x 16 x i16>,
   <vscale x 16 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
     <vscale x 16 x i16> undef,
     <vscale x 16 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
   <vscale x 16 x i16>,
   i16,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
     <vscale x 16 x i16> %1,
     i16 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i16> %a
 }
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
   <vscale x 32 x i16>,
   <vscale x 32 x i16>,
   i16,
-  i64);
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
     <vscale x 32 x i16> undef,
     <vscale x 32 x i16> %0,
     i16 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
   <vscale x 32 x i16>,
   i16,
   <vscale x 32 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
     <vscale x 32 x i16> %1,
     i16 %2,
     <vscale x 32 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 32 x i16> %a
 }
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
   <vscale x 1 x i32>,
   <vscale x 1 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
     <vscale x 1 x i32> undef,
     <vscale x 1 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
   <vscale x 1 x i32>,
   i32,
   <vscale x 1 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
     <vscale x 1 x i32> %1,
     i32 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i32> %a
 }
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
   <vscale x 2 x i32>,
   <vscale x 2 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
     <vscale x 2 x i32> undef,
     <vscale x 2 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
   <vscale x 2 x i32>,
   i32,
   <vscale x 2 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
     <vscale x 2 x i32> %1,
     i32 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i32> %a
 }
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
   <vscale x 4 x i32>,
   <vscale x 4 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
     <vscale x 4 x i32> undef,
     <vscale x 4 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
   <vscale x 4 x i32>,
   i32,
   <vscale x 4 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
     <vscale x 4 x i32> %1,
     i32 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i32> %a
 }
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
   <vscale x 8 x i32>,
   <vscale x 8 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
     <vscale x 8 x i32> undef,
     <vscale x 8 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
   <vscale x 8 x i32>,
   i32,
   <vscale x 8 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
     <vscale x 8 x i32> %1,
     i32 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i32> %a
 }
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
   <vscale x 16 x i32>,
   <vscale x 16 x i32>,
   i32,
-  i64);
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
     <vscale x 16 x i32> undef,
     <vscale x 16 x i32> %0,
     i32 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
   <vscale x 16 x i32>,
   i32,
   <vscale x 16 x i1>,
-  i64,
-  i64);
+  iXLen,
+  iXLen)
 
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
     <vscale x 16 x i32> %1,
     i32 %2,
     <vscale x 16 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 16 x i32> %a
 }
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
   <vscale x 1 x i64>,
   <vscale x 1 x i64>,
   i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
     <vscale x 1 x i64> undef,
     <vscale x 1 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
   <vscale x 1 x i64>,
   i64,
   <vscale x 1 x i1>,
-  i64,
-  i64);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v9, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT:    vssubu.vx v8, v9, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
     <vscale x 1 x i64> %0,
     <vscale x 1 x i64> %1,
     i64 %2,
     <vscale x 1 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 1 x i64> %a
 }
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
   <vscale x 2 x i64>,
   <vscale x 2 x i64>,
   i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
     <vscale x 2 x i64> undef,
     <vscale x 2 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
   <vscale x 2 x i64>,
   i64,
   <vscale x 2 x i1>,
-  i64,
-  i64);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v10, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT:    vssubu.vx v8, v10, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
     <vscale x 2 x i64> %0,
     <vscale x 2 x i64> %1,
     i64 %2,
     <vscale x 2 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 2 x i64> %a
 }
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
   <vscale x 4 x i64>,
   <vscale x 4 x i64>,
   i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
     <vscale x 4 x i64> undef,
     <vscale x 4 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
   <vscale x 4 x i64>,
   i64,
   <vscale x 4 x i1>,
-  i64,
-  i64);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v12, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT:    vssubu.vx v8, v12, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
     <vscale x 4 x i64> %0,
     <vscale x 4 x i64> %1,
     i64 %2,
     <vscale x 4 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 4 x i64> %a
 }
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
   <vscale x 8 x i64>,
   <vscale x 8 x i64>,
   i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vssubu.vx v8, v8, a0
-; CHECK-NEXT:    ret
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
     <vscale x 8 x i64> undef,
     <vscale x 8 x i64> %0,
     i64 %1,
-    i64 %2)
+    iXLen %2)
 
   ret <vscale x 8 x i64> %a
 }
@@ -2054,22 +2140,34 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
   <vscale x 8 x i64>,
   i64,
   <vscale x 8 x i1>,
-  i64,
-  i64);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT:    vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    ret
+  iXLen,
+  iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
+; RV32-NEXT:    vssubu.vv v8, v16, v24, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    vssubu.vx v8, v16, a0, v0.t
+; RV64-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
     <vscale x 8 x i64> %0,
     <vscale x 8 x i64> %1,
     i64 %2,
     <vscale x 8 x i1> %3,
-    i64 %4, i64 1)
+    iXLen %4, iXLen 1)
 
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsuxei-rv64.ll
index dbc7e719f14a8..3572f5909400a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsuxei-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsuxei-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh,+f -verify-machineinstrs \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs \
 ; RUN:   < %s | FileCheck %s
 
 ; The intrinsics are not supported with RV32.
@@ -924,6 +924,190 @@ entry:
   ret void
 }
 
+declare void @llvm.riscv.vsuxei.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define void @intrinsic_vsuxei_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1bf16.nxv1i64(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1bf16_nxv1bf16_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1bf16.nxv1i64(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define void @intrinsic_vsuxei_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2bf16.nxv2i64(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2bf16_nxv2bf16_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2bf16.nxv2i64(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define void @intrinsic_vsuxei_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4bf16.nxv4i64(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4bf16_nxv4bf16_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4bf16.nxv4i64(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define void @intrinsic_vsuxei_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8bf16.nxv8i64(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8bf16_nxv8bf16_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8bf16.nxv8i64(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
 declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/rvv/vsuxei.ll
index 7413177918e63..69b1173d9531c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsuxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsuxei.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
 ; RUN:   -verify-machineinstrs -target-abi=lp64d | FileCheck %s
 
 declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
@@ -4466,6 +4466,236 @@ entry:
   ret void
 }
 
+declare void @llvm.riscv.vsuxei.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1bf16.nxv1i32(
+  <vscale x 1 x bfloat>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32(<vscale x 1 x bfloat> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1bf16_nxv1bf16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1bf16.nxv1i32(
+    <vscale x 1 x bfloat> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2bf16.nxv2i32(
+  <vscale x 2 x bfloat>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32(<vscale x 2 x bfloat> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2bf16_nxv2bf16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2bf16.nxv2i32(
+    <vscale x 2 x bfloat> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4bf16.nxv4i32(
+  <vscale x 4 x bfloat>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32(<vscale x 4 x bfloat> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4bf16_nxv4bf16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4bf16.nxv4i32(
+    <vscale x 4 x bfloat> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8bf16.nxv8i32(
+  <vscale x 8 x bfloat>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32(<vscale x 8 x bfloat> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8bf16_nxv8bf16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8bf16.nxv8i32(
+    <vscale x 8 x bfloat> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16bf16.nxv16i32(
+  <vscale x 16 x bfloat>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32(<vscale x 16 x bfloat> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16bf16_nxv16bf16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16bf16.nxv16i32(
+    <vscale x 16 x bfloat> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
 declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8(
   <vscale x 1 x float>,
   ptr,
diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll
index 026340ede1f90..c74bc6838ff7d 100644
--- a/llvm/test/CodeGen/RISCV/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/ucmp.ll
@@ -48,10 +48,8 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: ucmp.8.32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sltu a2, a0, a1
 ; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    sub a0, a0, a2
@@ -164,10 +162,44 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: ucmp.32.32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind {
+; RV32I-LABEL: ucmp.32.32_sext:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.32.32_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.32_zext(i32 zeroext %x, i32 zeroext %y) nounwind {
+; RV32I-LABEL: ucmp.32.32_zext:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.32.32_zext:
+; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a0, a1
 ; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    sub a0, a0, a2
@@ -179,13 +211,13 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: ucmp.32.64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    beq a1, a3, .LBB6_2
+; RV32I-NEXT:    beq a1, a3, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sltu a0, a3, a1
 ; RV32I-NEXT:    sub a0, a0, a4
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:  .LBB8_2:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
 ; RV32I-NEXT:    sub a0, a0, a4
@@ -204,15 +236,15 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: ucmp.64.64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    beq a1, a3, .LBB7_2
+; RV32I-NEXT:    beq a1, a3, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sltu a0, a3, a1
-; RV32I-NEXT:    j .LBB7_3
-; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    j .LBB9_3
+; RV32I-NEXT:  .LBB9_2:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:  .LBB9_3:
 ; RV32I-NEXT:    sub a0, a0, a4
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
index c7bf71bb06c5b..62cf06d7ee3c8 100644
--- a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -1,128 +1,459 @@
-; RUN: llc -march=sparc <%s | FileCheck %s -check-prefix=V8
-; RUN: llc -march=sparc -mattr=v9 <%s | FileCheck %s -check-prefix=V9
-; RUN: llc -mtriple=sparc64-unknown-linux <%s | FileCheck %s -check-prefix=SPARC64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=sparc %s -o - | FileCheck %s -check-prefix=V8
+; RUN: llc -march=sparc -mattr=v9 %s -o - | FileCheck %s -check-prefix=V9
+; RUN: llc -mtriple=sparc64-unknown-linux %s -o - | FileCheck %s -check-prefix=SPARC64
 
-
-define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind readnone noinline {
+define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind {
+; V8-LABEL: test_addx:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    addcc %o1, %o3, %o3
+; V8-NEXT:    addxcc %o0, %o2, %o1
+; V8-NEXT:    mov 1, %o0
+; V8-NEXT:    cmp %o1, %o4
+; V8-NEXT:    bleu .LBB0_4
+; V8-NEXT:    mov %o0, %o2
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    cmp %o3, %o5
+; V8-NEXT:    bleu .LBB0_5
+; V8-NEXT:    nop
+; V8-NEXT:  .LBB0_2: ! %entry
+; V8-NEXT:    cmp %o1, %o4
+; V8-NEXT:    bne .LBB0_6
+; V8-NEXT:    nop
+; V8-NEXT:  .LBB0_3: ! %entry
+; V8-NEXT:    retl
+; V8-NEXT:    nop
+; V8-NEXT:  .LBB0_4: ! %entry
+; V8-NEXT:    mov %g0, %o2
+; V8-NEXT:    cmp %o3, %o5
+; V8-NEXT:    bgu .LBB0_2
+; V8-NEXT:    nop
+; V8-NEXT:  .LBB0_5: ! %entry
+; V8-NEXT:    mov %g0, %o0
+; V8-NEXT:    cmp %o1, %o4
+; V8-NEXT:    be .LBB0_3
+; V8-NEXT:    nop
+; V8-NEXT:  .LBB0_6: ! %entry
+; V8-NEXT:    retl
+; V8-NEXT:    mov %o2, %o0
+;
+; V9-LABEL: test_addx:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    mov %g0, %g2
+; V9-NEXT:    mov %g0, %g3
+; V9-NEXT:    addcc %o1, %o3, %o1
+; V9-NEXT:    addxcc %o0, %o2, %o0
+; V9-NEXT:    cmp %o0, %o4
+; V9-NEXT:    movgu %icc, 1, %g2
+; V9-NEXT:    cmp %o1, %o5
+; V9-NEXT:    movgu %icc, 1, %g3
+; V9-NEXT:    cmp %o0, %o4
+; V9-NEXT:    move %icc, %g3, %g2
+; V9-NEXT:    retl
+; V9-NEXT:    mov %g2, %o0
+;
+; SPARC64-LABEL: test_addx:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    mov %g0, %o3
+; SPARC64-NEXT:    add %o0, %o1, %o0
+; SPARC64-NEXT:    cmp %o0, %o2
+; SPARC64-NEXT:    movgu %xcc, 1, %o3
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    srl %o3, 0, %o0
 entry:
-; V8: addcc
-; V8-NOT: subcc
-; V8: addx
-; V9: addcc
-; V9-NOT: subcc
-; V9: addx
-; V9: mov{{e|ne}} %icc
   %0 = add i64 %a, %b
   %1 = icmp ugt i64 %0, %c
   %2 = zext i1 %1 to i32
   ret i32 %2
 }
 
-
-define i32 @test_select_int_icc(i32 %a, i32 %b, i32 %c) nounwind readnone noinline {
+define i32 @test_select_int_icc(i32 %a, i32 %b, i32 %c) nounwind {
+; V8-LABEL: test_select_int_icc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    cmp %o0, 0
+; V8-NEXT:    be .LBB1_2
+; V8-NEXT:    mov %o1, %o0
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    mov %o2, %o0
+; V8-NEXT:  .LBB1_2: ! %entry
+; V8-NEXT:    retl
+; V8-NEXT:    nop
+;
+; V9-LABEL: test_select_int_icc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    cmp %o0, 0
+; V9-NEXT:    move %icc, %o1, %o2
+; V9-NEXT:    retl
+; V9-NEXT:    mov %o2, %o0
+;
+; SPARC64-LABEL: test_select_int_icc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    cmp %o0, 0
+; SPARC64-NEXT:    move %icc, %o1, %o2
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    mov %o2, %o0
 entry:
-; V8: test_select_int_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_int_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: mov{{e|ne}} %icc
   %0 = icmp eq i32 %a, 0
   %1 = select i1 %0, i32 %b, i32 %c
   ret i32 %1
 }
 
-
-define float @test_select_fp_icc(i32 %a, float %f1, float %f2) nounwind readnone noinline {
+define float @test_select_fp_icc(i32 %a, float %f1, float %f2) nounwind {
+; V8-LABEL: test_select_fp_icc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -104, %sp
+; V8-NEXT:    st %o2, [%sp+100]
+; V8-NEXT:    cmp %o0, 0
+; V8-NEXT:    be .LBB2_2
+; V8-NEXT:    st %o1, [%sp+96]
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    ld [%sp+100], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 104, %sp
+; V8-NEXT:  .LBB2_2:
+; V8-NEXT:    ld [%sp+96], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 104, %sp
+;
+; V9-LABEL: test_select_fp_icc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -104, %sp
+; V9-NEXT:    st %o2, [%sp+100]
+; V9-NEXT:    st %o1, [%sp+96]
+; V9-NEXT:    ld [%sp+100], %f0
+; V9-NEXT:    ld [%sp+96], %f1
+; V9-NEXT:    cmp %o0, 0
+; V9-NEXT:    fmovse %icc, %f1, %f0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 104, %sp
+;
+; SPARC64-LABEL: test_select_fp_icc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    fmovs %f5, %f0
+; SPARC64-NEXT:    cmp %o0, 0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    fmovse %icc, %f3, %f0
 entry:
-; V8: test_select_fp_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_fp_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: fmovs{{e|ne}} %icc
   %0 = icmp eq i32 %a, 0
   %1 = select i1 %0, float %f1, float %f2
   ret float %1
 }
 
-define double @test_select_dfp_icc(i32 %a, double %f1, double %f2) nounwind readnone noinline {
+define double @test_select_dfp_icc(i32 %a, double %f1, double %f2) nounwind {
+; V8-LABEL: test_select_dfp_icc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -112, %sp
+; V8-NEXT:    mov %o4, %o5
+; V8-NEXT:    mov %o2, %g3
+; V8-NEXT:    mov %o3, %o4
+; V8-NEXT:    std %o4, [%sp+96]
+; V8-NEXT:    cmp %o0, 0
+; V8-NEXT:    mov %o1, %g2
+; V8-NEXT:    be .LBB3_2
+; V8-NEXT:    std %g2, [%sp+104]
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    ldd [%sp+96], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 112, %sp
+; V8-NEXT:  .LBB3_2:
+; V8-NEXT:    ldd [%sp+104], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 112, %sp
+;
+; V9-LABEL: test_select_dfp_icc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -112, %sp
+; V9-NEXT:    mov %o4, %o5
+; V9-NEXT:    mov %o2, %g3
+; V9-NEXT:    mov %o3, %o4
+; V9-NEXT:    std %o4, [%sp+96]
+; V9-NEXT:    mov %o1, %g2
+; V9-NEXT:    std %g2, [%sp+104]
+; V9-NEXT:    ldd [%sp+96], %f0
+; V9-NEXT:    ldd [%sp+104], %f2
+; V9-NEXT:    cmp %o0, 0
+; V9-NEXT:    fmovde %icc, %f2, %f0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 112, %sp
+;
+; SPARC64-LABEL: test_select_dfp_icc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    fmovd %f4, %f0
+; SPARC64-NEXT:    cmp %o0, 0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    fmovde %icc, %f2, %f0
 entry:
-; V8: test_select_dfp_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_dfp_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: fmovd{{e|ne}} %icc
   %0 = icmp eq i32 %a, 0
   %1 = select i1 %0, double %f1, double %f2
   ret double %1
 }
 
-define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind readnone noinline {
+define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind {
+; V8-LABEL: test_select_int_fcc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -96, %sp
+; V8-NEXT:    st %o0, [%sp+92]
+; V8-NEXT:    ld [%sp+92], %f0
+; V8-NEXT:    sethi %hi(.LCPI4_0), %o0
+; V8-NEXT:    ld [%o0+%lo(.LCPI4_0)], %f1
+; V8-NEXT:    fcmps %f0, %f1
+; V8-NEXT:    nop
+; V8-NEXT:    fbne .LBB4_2
+; V8-NEXT:    mov %o1, %o0
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    mov %o2, %o0
+; V8-NEXT:  .LBB4_2: ! %entry
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 96, %sp
+;
+; V9-LABEL: test_select_int_fcc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -96, %sp
+; V9-NEXT:    st %o0, [%sp+92]
+; V9-NEXT:    ld [%sp+92], %f0
+; V9-NEXT:    sethi %hi(.LCPI4_0), %o0
+; V9-NEXT:    ld [%o0+%lo(.LCPI4_0)], %f1
+; V9-NEXT:    mov %o2, %o0
+; V9-NEXT:    fcmps %fcc0, %f0, %f1
+; V9-NEXT:    movne %fcc0, %o1, %o0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 96, %sp
+;
+; SPARC64-LABEL: test_select_int_fcc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    sethi %h44(.LCPI4_0), %o0
+; SPARC64-NEXT:    add %o0, %m44(.LCPI4_0), %o0
+; SPARC64-NEXT:    sllx %o0, 12, %o0
+; SPARC64-NEXT:    ld [%o0+%l44(.LCPI4_0)], %f0
+; SPARC64-NEXT:    mov %o2, %o0
+; SPARC64-NEXT:    fcmps %fcc0, %f1, %f0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    movne %fcc0, %o1, %o0
 entry:
-;V8-LABEL: test_select_int_fcc:
-;V8: fcmps
-;V8-NEXT: nop
-;V8: {{fbe|fbne}}
-;V9-LABEL: test_select_int_fcc:
-;V9: fcmps
-;V9-NOT: nop
-;V9-NOT: {{fbe|fbne}}
-;V9: mov{{e|ne}} %fcc0
   %0 = fcmp une float %f, 0.000000e+00
   %a.b = select i1 %0, i32 %a, i32 %b
   ret i32 %a.b
 }
 
-
-define float @test_select_fp_fcc(float %f, float %f1, float %f2) nounwind readnone noinline {
+define float @test_select_fp_fcc(float %f, float %f1, float %f2) nounwind {
+; V8-LABEL: test_select_fp_fcc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -104, %sp
+; V8-NEXT:    st %o0, [%sp+92]
+; V8-NEXT:    st %o2, [%sp+100]
+; V8-NEXT:    st %o1, [%sp+96]
+; V8-NEXT:    ld [%sp+92], %f0
+; V8-NEXT:    sethi %hi(.LCPI5_0), %o0
+; V8-NEXT:    ld [%o0+%lo(.LCPI5_0)], %f1
+; V8-NEXT:    fcmps %f0, %f1
+; V8-NEXT:    nop
+; V8-NEXT:    fbne .LBB5_2
+; V8-NEXT:    nop
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    ld [%sp+100], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 104, %sp
+; V8-NEXT:  .LBB5_2:
+; V8-NEXT:    ld [%sp+96], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 104, %sp
+;
+; V9-LABEL: test_select_fp_fcc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -104, %sp
+; V9-NEXT:    st %o0, [%sp+92]
+; V9-NEXT:    st %o2, [%sp+100]
+; V9-NEXT:    st %o1, [%sp+96]
+; V9-NEXT:    ld [%sp+92], %f1
+; V9-NEXT:    ld [%sp+100], %f0
+; V9-NEXT:    sethi %hi(.LCPI5_0), %o0
+; V9-NEXT:    ld [%o0+%lo(.LCPI5_0)], %f2
+; V9-NEXT:    ld [%sp+96], %f3
+; V9-NEXT:    fcmps %fcc0, %f1, %f2
+; V9-NEXT:    fmovsne %fcc0, %f3, %f0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 104, %sp
+;
+; SPARC64-LABEL: test_select_fp_fcc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    sethi %h44(.LCPI5_0), %o0
+; SPARC64-NEXT:    add %o0, %m44(.LCPI5_0), %o0
+; SPARC64-NEXT:    sllx %o0, 12, %o0
+; SPARC64-NEXT:    ld [%o0+%l44(.LCPI5_0)], %f2
+; SPARC64-NEXT:    fmovs %f5, %f0
+; SPARC64-NEXT:    fcmps %fcc0, %f1, %f2
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    fmovsne %fcc0, %f3, %f0
 entry:
-;V8-LABEL: test_select_fp_fcc:
-;V8: fcmps
-;V8: {{fbe|fbne}}
-;V9-LABEL: test_select_fp_fcc:
-;V9: fcmps
-;V9-NOT: {{fbe|fbne}}
-;V9: fmovs{{e|ne}} %fcc0
   %0 = fcmp une float %f, 0.000000e+00
   %1 = select i1 %0, float %f1, float %f2
   ret float %1
 }
 
-define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind readnone noinline {
+define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind {
+; V8-LABEL: test_select_dfp_fcc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -120, %sp
+; V8-NEXT:    ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V8-NEXT:    ! kill: def $o5 killed $o5 killed $o4_o5 def $o4_o5
+; V8-NEXT:    ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V8-NEXT:    ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V8-NEXT:    std %o0, [%sp+112]
+; V8-NEXT:    ! kill: def $o4 killed $o4 killed $o4_o5 def $o4_o5
+; V8-NEXT:    std %o4, [%sp+96]
+; V8-NEXT:    ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V8-NEXT:    std %o2, [%sp+104]
+; V8-NEXT:    ldd [%sp+112], %f0
+; V8-NEXT:    sethi %hi(.LCPI6_0), %o0
+; V8-NEXT:    ldd [%o0+%lo(.LCPI6_0)], %f2
+; V8-NEXT:    fcmpd %f0, %f2
+; V8-NEXT:    nop
+; V8-NEXT:    fbne .LBB6_2
+; V8-NEXT:    nop
+; V8-NEXT:  ! %bb.1: ! %entry
+; V8-NEXT:    ldd [%sp+96], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 120, %sp
+; V8-NEXT:  .LBB6_2:
+; V8-NEXT:    ldd [%sp+104], %f0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 120, %sp
+;
+; V9-LABEL: test_select_dfp_fcc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -120, %sp
+; V9-NEXT:    ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V9-NEXT:    ! kill: def $o5 killed $o5 killed $o4_o5 def $o4_o5
+; V9-NEXT:    ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V9-NEXT:    ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V9-NEXT:    std %o0, [%sp+112]
+; V9-NEXT:    ! kill: def $o4 killed $o4 killed $o4_o5 def $o4_o5
+; V9-NEXT:    std %o4, [%sp+96]
+; V9-NEXT:    ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V9-NEXT:    std %o2, [%sp+104]
+; V9-NEXT:    ldd [%sp+112], %f2
+; V9-NEXT:    ldd [%sp+96], %f0
+; V9-NEXT:    sethi %hi(.LCPI6_0), %o0
+; V9-NEXT:    ldd [%o0+%lo(.LCPI6_0)], %f4
+; V9-NEXT:    ldd [%sp+104], %f6
+; V9-NEXT:    fcmpd %fcc0, %f2, %f4
+; V9-NEXT:    fmovdne %fcc0, %f6, %f0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 120, %sp
+;
+; SPARC64-LABEL: test_select_dfp_fcc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    sethi %h44(.LCPI6_0), %o0
+; SPARC64-NEXT:    add %o0, %m44(.LCPI6_0), %o0
+; SPARC64-NEXT:    sllx %o0, 12, %o0
+; SPARC64-NEXT:    ldd [%o0+%l44(.LCPI6_0)], %f6
+; SPARC64-NEXT:    fcmpd %fcc0, %f0, %f6
+; SPARC64-NEXT:    fmovdne %fcc0, %f2, %f4
+; SPARC64-NEXT:    fmovd %f4, %f0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    nop
 entry:
-;V8-LABEL: test_select_dfp_fcc:
-;V8: fcmpd
-;V8-NEXT: nop
-;V8: {{fbne|fbe}}
-;V9-LABEL: test_select_dfp_fcc:
-;V9: fcmpd
-;V9-NOT: nop
-;V9-NOT: {{fbne|fbe}}
-;V9: fmovd{{e|ne}} %fcc0
   %0 = fcmp une double %f, 0.000000e+00
   %1 = select i1 %0, double %f1, double %f2
   ret double %1
 }
 
-define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) {
+define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) nounwind {
+; V8-LABEL: test_float_cc:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    add %sp, -112, %sp
+; V8-NEXT:    ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V8-NEXT:    ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V8-NEXT:    ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V8-NEXT:    std %o2, [%sp+96]
+; V8-NEXT:    ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V8-NEXT:    std %o0, [%sp+104]
+; V8-NEXT:    ldd [%sp+104], %f2
+; V8-NEXT:    sethi %hi(.LCPI7_0), %o0
+; V8-NEXT:    ldd [%o0+%lo(.LCPI7_0)], %f0
+; V8-NEXT:    fcmpd %f2, %f0
+; V8-NEXT:    nop
+; V8-NEXT:    fbuge .LBB7_3
+; V8-NEXT:    nop
+; V8-NEXT:  ! %bb.1: ! %loop.2
+; V8-NEXT:    ldd [%sp+96], %f2
+; V8-NEXT:    fcmpd %f2, %f0
+; V8-NEXT:    nop
+; V8-NEXT:    fbule .LBB7_3
+; V8-NEXT:    nop
+; V8-NEXT:  ! %bb.2: ! %exit.1
+; V8-NEXT:    mov 1, %o0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 112, %sp
+; V8-NEXT:  .LBB7_3: ! %loop
+; V8-NEXT:    ! =>This Inner Loop Header: Depth=1
+; V8-NEXT:    cmp %o4, 10
+; V8-NEXT:    be .LBB7_3
+; V8-NEXT:    nop
+; V8-NEXT:  ! %bb.4: ! %exit.0
+; V8-NEXT:    mov %g0, %o0
+; V8-NEXT:    retl
+; V8-NEXT:    add %sp, 112, %sp
+;
+; V9-LABEL: test_float_cc:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    add %sp, -112, %sp
+; V9-NEXT:    ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V9-NEXT:    ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V9-NEXT:    ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V9-NEXT:    std %o2, [%sp+96]
+; V9-NEXT:    ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V9-NEXT:    std %o0, [%sp+104]
+; V9-NEXT:    ldd [%sp+104], %f2
+; V9-NEXT:    sethi %hi(.LCPI7_0), %o0
+; V9-NEXT:    ldd [%o0+%lo(.LCPI7_0)], %f0
+; V9-NEXT:    fcmpd %fcc0, %f2, %f0
+; V9-NEXT:    fbuge %fcc0, .LBB7_3
+; V9-NEXT:    nop
+; V9-NEXT:  ! %bb.1: ! %loop.2
+; V9-NEXT:    ldd [%sp+96], %f2
+; V9-NEXT:    fcmpd %fcc0, %f2, %f0
+; V9-NEXT:    fbule %fcc0, .LBB7_3
+; V9-NEXT:    nop
+; V9-NEXT:  ! %bb.2: ! %exit.1
+; V9-NEXT:    mov 1, %o0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 112, %sp
+; V9-NEXT:  .LBB7_3: ! %loop
+; V9-NEXT:    ! =>This Inner Loop Header: Depth=1
+; V9-NEXT:    cmp %o4, 10
+; V9-NEXT:    be %icc, .LBB7_3
+; V9-NEXT:    nop
+; V9-NEXT:  ! %bb.4: ! %exit.0
+; V9-NEXT:    mov %g0, %o0
+; V9-NEXT:    retl
+; V9-NEXT:    add %sp, 112, %sp
+;
+; SPARC64-LABEL: test_float_cc:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    sethi %h44(.LCPI7_0), %o0
+; SPARC64-NEXT:    add %o0, %m44(.LCPI7_0), %o0
+; SPARC64-NEXT:    sllx %o0, 12, %o0
+; SPARC64-NEXT:    ldd [%o0+%l44(.LCPI7_0)], %f4
+; SPARC64-NEXT:    fcmpd %fcc0, %f0, %f4
+; SPARC64-NEXT:    fbuge %fcc0, .LBB7_3
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.1: ! %loop.2
+; SPARC64-NEXT:    fcmpd %fcc0, %f2, %f4
+; SPARC64-NEXT:    fbule %fcc0, .LBB7_3
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.2: ! %exit.1
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    mov 1, %o0
+; SPARC64-NEXT:  .LBB7_3: ! %loop
+; SPARC64-NEXT:    ! =>This Inner Loop Header: Depth=1
+; SPARC64-NEXT:    cmp %o2, 10
+; SPARC64-NEXT:    be %icc, .LBB7_3
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.4: ! %exit.0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    mov %g0, %o0
 entry:
-; V8-LABEL: test_float_cc
-; V8:       fcmpd
-; V8:       {{fbl|fbuge}} .LBB
-; V8:       fcmpd
-; V8:       {{fbule|fbg}} .LBB
-
-; V9-LABEL: test_float_cc
-; V9:       fcmpd %fcc0
-; V9:       {{fbl|fbuge}} %fcc0, .LBB
-; V9:       fcmpd %fcc0
-; V9:       {{fbule|fbg}} %fcc0, .LBB
-
    %0 = fcmp uge double %a, 0.000000e+00
    br i1 %0, label %loop, label %loop.2
 
@@ -141,39 +472,92 @@ exit.1:
    ret i32 1
 }
 
-; V8-LABEL: test_adde_sube
-; V8:       addcc
-; V8:       addxcc
-; V8:       addxcc
-; V8:       addxcc
-; V8:       subcc
-; V8:       subxcc
-; V8:       subxcc
-; V8:       subxcc
-
-
-; V9-LABEL: test_adde_sube
-; V9:       addcc
-; V9:       addxcc
-; V9:       addxcc
-; V9:       addxcc
-; V9:       subcc
-; V9:       subxcc
-; V9:       subxcc
-; V9:       subxcc
-
-; SPARC64-LABEL: test_adde_sube
-; SPARC64:       addcc
-; SPARC64:       addxcc
-; SPARC64:       addxcc
-; SPARC64:       addxcc
-; SPARC64:       subcc
-; SPARC64:       subxcc
-; SPARC64:       subxcc
-; SPARC64:       subxcc
-
-
-define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) {
+define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) nounwind {
+; V8-LABEL: test_adde_sube:
+; V8:       ! %bb.0: ! %entry
+; V8-NEXT:    save %sp, -96, %sp
+; V8-NEXT:    ldd [%i0+8], %i4
+; V8-NEXT:    ldd [%i1+8], %l0
+; V8-NEXT:    ldd [%i0], %g2
+; V8-NEXT:    ldd [%i1], %l2
+; V8-NEXT:    addcc %i5, %l1, %l5
+; V8-NEXT:    addxcc %i4, %l0, %l4
+; V8-NEXT:    addxcc %g3, %l3, %l1
+; V8-NEXT:    addxcc %g2, %l2, %l0
+; V8-NEXT:    std %l4, [%i2+8]
+; V8-NEXT:    std %l0, [%i2]
+; V8-NEXT:    !APP
+; V8-NEXT:    !NO_APP
+; V8-NEXT:    ldd [%i0+8], %l0
+; V8-NEXT:    ldd [%i0], %i0
+; V8-NEXT:    subcc %i5, %l1, %l3
+; V8-NEXT:    subxcc %i4, %l0, %l2
+; V8-NEXT:    subxcc %g3, %i1, %i5
+; V8-NEXT:    subxcc %g2, %i0, %i4
+; V8-NEXT:    std %l2, [%i3+8]
+; V8-NEXT:    std %i4, [%i3]
+; V8-NEXT:    ret
+; V8-NEXT:    restore
+;
+; V9-LABEL: test_adde_sube:
+; V9:       ! %bb.0: ! %entry
+; V9-NEXT:    save %sp, -96, %sp
+; V9-NEXT:    ldd [%i0+8], %i4
+; V9-NEXT:    ldd [%i1+8], %l0
+; V9-NEXT:    ldd [%i0], %g2
+; V9-NEXT:    ldd [%i1], %l2
+; V9-NEXT:    addcc %i5, %l1, %l5
+; V9-NEXT:    addxcc %i4, %l0, %l4
+; V9-NEXT:    addxcc %g3, %l3, %l1
+; V9-NEXT:    addxcc %g2, %l2, %l0
+; V9-NEXT:    std %l4, [%i2+8]
+; V9-NEXT:    std %l0, [%i2]
+; V9-NEXT:    !APP
+; V9-NEXT:    !NO_APP
+; V9-NEXT:    ldd [%i0+8], %l0
+; V9-NEXT:    ldd [%i0], %i0
+; V9-NEXT:    subcc %i5, %l1, %l3
+; V9-NEXT:    subxcc %i4, %l0, %l2
+; V9-NEXT:    subxcc %g3, %i1, %i5
+; V9-NEXT:    subxcc %g2, %i0, %i4
+; V9-NEXT:    std %l2, [%i3+8]
+; V9-NEXT:    std %i4, [%i3]
+; V9-NEXT:    ret
+; V9-NEXT:    restore
+;
+; SPARC64-LABEL: test_adde_sube:
+; SPARC64:         .register %g2, #scratch
+; SPARC64-NEXT:    .register %g3, #scratch
+; SPARC64-NEXT:  ! %bb.0: ! %entry
+; SPARC64-NEXT:    save %sp, -128, %sp
+; SPARC64-NEXT:    ldx [%i0+8], %i4
+; SPARC64-NEXT:    ldx [%i0], %i5
+; SPARC64-NEXT:    ldx [%i1], %g2
+; SPARC64-NEXT:    ldx [%i1+8], %i1
+; SPARC64-NEXT:    mov %g0, %g3
+; SPARC64-NEXT:    add %i5, %g2, %g2
+; SPARC64-NEXT:    add %i4, %i1, %i1
+; SPARC64-NEXT:    cmp %i1, %i4
+; SPARC64-NEXT:    movcs %xcc, 1, %g3
+; SPARC64-NEXT:    srl %g3, 0, %g3
+; SPARC64-NEXT:    add %g2, %g3, %g2
+; SPARC64-NEXT:    stx %i1, [%i2+8]
+; SPARC64-NEXT:    stx %g2, [%i2]
+; SPARC64-NEXT:    !APP
+; SPARC64-NEXT:    !NO_APP
+; SPARC64-NEXT:    ldx [%i0+8], %i1
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    ldx [%i0], %i0
+; SPARC64-NEXT:    cmp %i4, %i1
+; SPARC64-NEXT:    movcs %xcc, 1, %i2
+; SPARC64-NEXT:    srl %i2, 0, %i2
+; SPARC64-NEXT:    sub %i5, %i0, %i0
+; SPARC64-NEXT:    sub %i0, %i2, %i0
+; SPARC64-NEXT:    sub %i4, %i1, %i1
+; SPARC64-NEXT:    stx %i1, [%i3+8]
+; SPARC64-NEXT:    stx %i0, [%i3]
+; SPARC64-NEXT:    ret
+; SPARC64-NEXT:    restore
 entry:
    %0 = bitcast ptr %a to ptr
    %1 = bitcast ptr %b to ptr
diff --git a/llvm/test/CodeGen/SPARC/64cond.ll b/llvm/test/CodeGen/SPARC/64cond.ll
index 10d070055a4ec..5a90022004664 100644
--- a/llvm/test/CodeGen/SPARC/64cond.ll
+++ b/llvm/test/CodeGen/SPARC/64cond.ll
@@ -1,10 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=sparc64-pc-openbsd -disable-sparc-leaf-proc | FileCheck %s
 ; Testing 64-bit conditionals. The sparc64 triple is an alias for sparcv9.
 
-; CHECK: cmpri
-; CHECK: cmp %i1, 1
-; CHECK: be %xcc,
-define void @cmpri(ptr %p, i64 %x) {
+define void @cmpri(ptr %p, i64 %x) nounwind {
+; CHECK-LABEL: cmpri:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    cmp %i1, 1
+; CHECK-NEXT:    be %xcc, .LBB0_2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  ! %bb.1: ! %if.then
+; CHECK-NEXT:    stx %i1, [%i0]
+; CHECK-NEXT:  .LBB0_2: ! %if.end
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
 entry:
   %tobool = icmp eq i64 %x, 1
   br i1 %tobool, label %if.end, label %if.then
@@ -17,10 +26,18 @@ if.end:
   ret void
 }
 
-; CHECK: cmprr
-; CHECK: cmp %i1, %i2
-; CHECK: bgu %xcc,
-define void @cmprr(ptr %p, i64 %x, i64 %y) {
+define void @cmprr(ptr %p, i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: cmprr:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    cmp %i1, %i2
+; CHECK-NEXT:    bgu %xcc, .LBB1_2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  ! %bb.1: ! %if.then
+; CHECK-NEXT:    stx %i1, [%i0]
+; CHECK-NEXT:  .LBB1_2: ! %if.end
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
 entry:
   %tobool = icmp ugt i64 %x, %y
   br i1 %tobool, label %if.end, label %if.then
@@ -33,67 +50,87 @@ if.end:
   ret void
 }
 
-; CHECK: selecti32_xcc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %xcc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) {
+define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: selecti32_xcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    movg %xcc, %i2, %i3
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore %g0, %i3, %o0
 entry:
   %tobool = icmp sgt i64 %x, %y
   %rv = select i1 %tobool, i32 %a, i32 %b
   ret i32 %rv
 }
 
-; CHECK: selecti64_xcc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %xcc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) {
+define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_xcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    movg %xcc, %i2, %i3
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore %g0, %i3, %o0
 entry:
   %tobool = icmp sgt i64 %x, %y
   %rv = select i1 %tobool, i64 %a, i64 %b
   ret i64 %rv
 }
 
-; CHECK: selecti64_icc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %icc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) {
+define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_icc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    movg %icc, %i2, %i3
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore %g0, %i3, %o0
 entry:
   %tobool = icmp sgt i32 %x, %y
   %rv = select i1 %tobool, i64 %a, i64 %b
   ret i64 %rv
 }
 
-; CHECK: selecti64_fcc
-; CHECK: mov %i3, %i0
-; CHECK: fcmps %fcc0, %f1, %f3
-; CHECK: movul %fcc0, %i2, %i0
-; CHECK: restore
-define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) {
+define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_fcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    mov %i3, %i0
+; CHECK-NEXT:    fcmps %fcc0, %f1, %f3
+; CHECK-NEXT:    movul %fcc0, %i2, %i0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
 entry:
   %tobool = fcmp ult float %x, %y
   %rv = select i1 %tobool, i64 %a, i64 %b
   ret i64 %rv
 }
 
-; CHECK: selectf32_xcc
-; CHECK: fmovs %f7, %f0
-; CHECK: cmp %i0, %i1
-; CHECK: fmovsg %xcc, %f5, %f0
-define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) {
+define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) nounwind {
+; CHECK-LABEL: selectf32_xcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    fmovs %f7, %f0
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    fmovsg %xcc, %f5, %f0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
 entry:
   %tobool = icmp sgt i64 %x, %y
   %rv = select i1 %tobool, float %a, float %b
   ret float %rv
 }
 
-; CHECK: selectf64_xcc
-; CHECK: fmovd %f6, %f0
-; CHECK: cmp %i0, %i1
-; CHECK: fmovdg %xcc, %f4, %f0
-define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) {
+define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) nounwind {
+; CHECK-LABEL: selectf64_xcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    fmovd %f6, %f0
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    fmovdg %xcc, %f4, %f0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
 entry:
   %tobool = icmp sgt i64 %x, %y
   %rv = select i1 %tobool, double %a, double %b
@@ -101,26 +138,38 @@ entry:
 }
 
 ; The MOVXCC instruction can't use %g0 for its tied operand.
-; CHECK: select_consti64_xcc
-; CHECK: cmp
-; CHECK: movg %xcc, 123, %i{{[0-2]}}
-define i64 @select_consti64_xcc(i64 %x, i64 %y) {
+define i64 @select_consti64_xcc(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: select_consti64_xcc:
+; CHECK:       ! %bb.0: ! %entry
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    mov %g0, %i2
+; CHECK-NEXT:    cmp %i0, %i1
+; CHECK-NEXT:    movg %xcc, 123, %i2
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore %g0, %i2, %o0
 entry:
   %tobool = icmp sgt i64 %x, %y
   %rv = select i1 %tobool, i64 123, i64 0
   ret i64 %rv
 }
 
-; CHECK-LABEL: setcc_resultty
-; CHECK-DAG:       mov %g0, %o0
-; CHECK-DAG:       mov %i0, %o1
-; CHECK-DAG:       mov %g0, %o2
-; CHECK-DAG:       mov 32, %o3
-; CHECK-DAG:       call __multi3
-; CHECK:       movrnz %o0, 1, [[R:%[gilo][0-7]]]
-; CHECK:       or [[R]], %i1, %i0
-
-define i1 @setcc_resultty(i64 %a, i1 %b) {
+define i1 @setcc_resultty(i64 %a, i1 %b) nounwind {
+; CHECK-LABEL: setcc_resultty:
+; CHECK:       ! %bb.0:
+; CHECK-NEXT:    save %sp, -128, %sp
+; CHECK-NEXT:    mov %g0, %i2
+; CHECK-NEXT:    sethi 4194303, %i3
+; CHECK-NEXT:    or %i3, 1023, %i3
+; CHECK-NEXT:    sethi 131071, %i4
+; CHECK-NEXT:    or %i4, 1023, %i4
+; CHECK-NEXT:    sllx %i4, 32, %i4
+; CHECK-NEXT:    or %i4, %i3, %i3
+; CHECK-NEXT:    and %i0, %i3, %i3
+; CHECK-NEXT:    cmp %i3, %i0
+; CHECK-NEXT:    movne %xcc, 1, %i2
+; CHECK-NEXT:    or %i2, %i1, %i0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    restore
   %a0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a, i64 32)
   %a1 = extractvalue { i64, i1 } %a0, 1
   %a4 = or i1 %a1, %b
diff --git a/llvm/test/CodeGen/SPARC/fp128-split.ll b/llvm/test/CodeGen/SPARC/fp128-split.ll
index c87cfb932b781..8a127c9c28cc7 100644
--- a/llvm/test/CodeGen/SPARC/fp128-split.ll
+++ b/llvm/test/CodeGen/SPARC/fp128-split.ll
@@ -8,45 +8,33 @@
 define fp128 @testcase(fp128 %0) {
   ; CHECK-LABEL: name: testcase
   ; CHECK: bb.0.Entry:
-  ; CHECK:   liveins: $q0
-  ; CHECK:   [[COPY:%[0-9]+]]:qfpregs = COPY $q0
-  ; CHECK:   [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64
-  ; CHECK:   [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0
-  ; CHECK:   [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8
-  ; CHECK:   STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8)
-  ; CHECK:   [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64
-  ; CHECK:   STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16)
-  ; CHECK:   [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8)
-  ; CHECK:   [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16)
-  ; CHECK:   [[COPY3:%[0-9]+]]:intregs = COPY [[LDXrr]]
-  ; CHECK:   [[COPY4:%[0-9]+]]:intregs = COPY [[LDXri]]
-  ; CHECK:   [[SRLXri:%[0-9]+]]:i64regs = SRLXri [[LDXrr]], 32
-  ; CHECK:   [[COPY5:%[0-9]+]]:intregs = COPY [[SRLXri]]
-  ; CHECK:   [[SRLXri1:%[0-9]+]]:i64regs = SRLXri [[LDXri]], 32
-  ; CHECK:   [[COPY6:%[0-9]+]]:intregs = COPY [[SRLXri1]]
-  ; CHECK:   [[ADDCCri:%[0-9]+]]:intregs = ADDCCri killed [[COPY3]], -1, implicit-def $icc
-  ; CHECK:   [[ADDEri:%[0-9]+]]:intregs = ADDEri killed [[COPY5]], -1, implicit-def $icc, implicit $icc
-  ; CHECK:   [[ADDEri1:%[0-9]+]]:intregs = ADDEri killed [[COPY4]], -1, implicit-def $icc, implicit $icc
-  ; CHECK:   [[ADDEri2:%[0-9]+]]:intregs = ADDEri killed [[COPY6]], -1, implicit-def dead $icc, implicit $icc
-  ; CHECK:   [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[ADDCCri]], 0
-  ; CHECK:   [[COPY7:%[0-9]+]]:i64regs = COPY [[ADDEri]]
-  ; CHECK:   [[SLLXri:%[0-9]+]]:i64regs = SLLXri killed [[COPY7]], 32
-  ; CHECK:   [[ORrr:%[0-9]+]]:i64regs = ORrr killed [[SLLXri]], killed [[SRLri]]
-  ; CHECK:   [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0
-  ; CHECK:   [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8
-  ; CHECK:   STXrr [[ORri1]], $g0, killed [[ORrr]] :: (store (s64) into %stack.1 + 8, basealign 16)
-  ; CHECK:   [[SRLri1:%[0-9]+]]:i64regs = SRLri killed [[ADDEri1]], 0
-  ; CHECK:   [[COPY8:%[0-9]+]]:i64regs = COPY [[ADDEri2]]
-  ; CHECK:   [[SLLXri1:%[0-9]+]]:i64regs = SLLXri killed [[COPY8]], 32
-  ; CHECK:   [[ORrr1:%[0-9]+]]:i64regs = ORrr killed [[SLLXri1]], killed [[SRLri1]]
-  ; CHECK:   STXri %stack.1, 0, killed [[ORrr1]] :: (store (s64) into %stack.1, align 16)
-  ; CHECK:   [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16)
-  ; CHECK:   [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF
-  ; CHECK:   [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64
-  ; CHECK:   [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8)
-  ; CHECK:   [[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64
-  ; CHECK:   $q0 = COPY [[INSERT_SUBREG1]]
-  ; CHECK:   RETL 8, implicit $q0
+  ; CHECK-NEXT:   liveins: $q0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:qfpregs = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64
+  ; CHECK-NEXT:   [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0
+  ; CHECK-NEXT:   [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8
+  ; CHECK-NEXT:   STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64
+  ; CHECK-NEXT:   STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16)
+  ; CHECK-NEXT:   [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8)
+  ; CHECK-NEXT:   [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16)
+  ; CHECK-NEXT:   [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0
+  ; CHECK-NEXT:   [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8
+  ; CHECK-NEXT:   [[ADDri2:%[0-9]+]]:i64regs = ADDri [[LDXrr]], -1
+  ; CHECK-NEXT:   STXrr [[ORri1]], $g0, killed [[ADDri2]] :: (store (s64) into %stack.1 + 8, basealign 16)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:intregs = COPY $g0
+  ; CHECK-NEXT:   [[MOVRri:%[0-9]+]]:intregs = MOVRri [[LDXrr]], 1, [[COPY3]], 49
+  ; CHECK-NEXT:   [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[MOVRri]], 0
+  ; CHECK-NEXT:   [[SUBrr:%[0-9]+]]:i64regs = SUBrr killed [[LDXri]], killed [[SRLri]]
+  ; CHECK-NEXT:   STXri %stack.1, 0, killed [[SUBrr]] :: (store (s64) into %stack.1, align 16)
+  ; CHECK-NEXT:   [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64
+  ; CHECK-NEXT:   [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8)
+  ; CHECK-NEXT:   [[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64
+  ; CHECK-NEXT:   $q0 = COPY [[INSERT_SUBREG1]]
+  ; CHECK-NEXT:   RETL 8, implicit $q0
 Entry:
   %1 = bitcast fp128 %0 to i128
   %2 = add i128 %1, -1
diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
index ae1de443bce05..ac0b1128ca812 100644
--- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
@@ -2,14 +2,10 @@
 ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC
 ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64
 
-define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
+define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-LABEL: muloti_test:
-; SPARC:         .cfi_startproc
-; SPARC-NEXT:  ! %bb.0: ! %start
+; SPARC:       ! %bb.0: ! %start
 ; SPARC-NEXT:    save %sp, -96, %sp
-; SPARC-NEXT:    .cfi_def_cfa_register %fp
-; SPARC-NEXT:    .cfi_window_save
-; SPARC-NEXT:    .cfi_register %o7, %i7
 ; SPARC-NEXT:    ld [%fp+96], %l1
 ; SPARC-NEXT:    mov %i3, %g4
 ; SPARC-NEXT:    mov %i2, %g2
@@ -172,105 +168,97 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; SPARC-NEXT:    restore %g0, %g3, %o1
 ;
 ; SPARC64-LABEL: muloti_test:
-; SPARC64:         .cfi_startproc
-; SPARC64-NEXT:    .register %g2, #scratch
+; SPARC64:         .register %g2, #scratch
 ; SPARC64-NEXT:    .register %g3, #scratch
 ; SPARC64-NEXT:  ! %bb.0: ! %start
 ; SPARC64-NEXT:    save %sp, -176, %sp
-; SPARC64-NEXT:    .cfi_def_cfa_register %fp
-; SPARC64-NEXT:    .cfi_window_save
-; SPARC64-NEXT:    .cfi_register %o7, %i7
-; SPARC64-NEXT:    mov %i3, %i4
-; SPARC64-NEXT:    mov %i1, %i3
-; SPARC64-NEXT:    srax %i0, 63, %o2
-; SPARC64-NEXT:    mov %i2, %o0
-; SPARC64-NEXT:    mov %i4, %o1
-; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %o2, %o3
-; SPARC64-NEXT:    mov %o0, %i1
-; SPARC64-NEXT:    mov %o1, %i5
-; SPARC64-NEXT:    srax %i2, 63, %o0
-; SPARC64-NEXT:    mov %o0, %o1
-; SPARC64-NEXT:    mov %i0, %o2
-; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i3, %o3
-; SPARC64-NEXT:    srlx %i5, 32, %g2
-; SPARC64-NEXT:    srlx %o1, 32, %g3
-; SPARC64-NEXT:    srlx %i1, 32, %g4
-; SPARC64-NEXT:    srlx %o0, 32, %g5
-; SPARC64-NEXT:    addcc %o1, %i5, %l0
-; SPARC64-NEXT:    addxcc %g3, %g2, %l1
-; SPARC64-NEXT:    addxcc %o0, %i1, %l2
-; SPARC64-NEXT:    addxcc %g5, %g4, %l3
+; SPARC64-NEXT:    mov %i3, %i5
+; SPARC64-NEXT:    mov %i2, %i3
+; SPARC64-NEXT:    mov %i1, %i2
+; SPARC64-NEXT:    mov %i0, %i4
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %i1, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i4, %o3
-; SPARC64-NEXT:    mov %o0, %i5
+; SPARC64-NEXT:    mov %i5, %o3
+; SPARC64-NEXT:    mov %o0, %i0
 ; SPARC64-NEXT:    mov %o1, %i1
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i0, %o1
+; SPARC64-NEXT:    mov %i4, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i4, %o3
-; SPARC64-NEXT:    srlx %i5, 32, %i4
-; SPARC64-NEXT:    srlx %o1, 32, %g2
-; SPARC64-NEXT:    srlx %o0, 32, %g3
-; SPARC64-NEXT:    addcc %o1, %i5, %i5
-; SPARC64-NEXT:    addxcc %g2, %i4, %i4
-; SPARC64-NEXT:    addxcc %o0, 0, %l4
-; SPARC64-NEXT:    addxcc %g3, 0, %l5
+; SPARC64-NEXT:    mov %i5, %o3
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    add %o1, %i0, %i0
+; SPARC64-NEXT:    cmp %i0, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %g2
+; SPARC64-NEXT:    add %o0, %g2, %l0
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %i2, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i2, %o3
-; SPARC64-NEXT:    srlx %o1, 32, %g2
-; SPARC64-NEXT:    srlx %o0, 32, %g3
-; SPARC64-NEXT:    addcc %o1, %i5, %i3
-; SPARC64-NEXT:    addxcc %g2, %i4, %i4
-; SPARC64-NEXT:    addxcc %o0, 0, %i5
-; SPARC64-NEXT:    addxcc %g3, 0, %g2
-; SPARC64-NEXT:    addcc %l4, %i5, %i5
-; SPARC64-NEXT:    addxcc %l5, %g2, %l4
-; SPARC64-NEXT:    addxcc %g0, 0, %l5
-; SPARC64-NEXT:    addxcc %g0, 0, %l6
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    mov %g0, %g3
+; SPARC64-NEXT:    add %o1, %i0, %i0
+; SPARC64-NEXT:    cmp %i0, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %g2
+; SPARC64-NEXT:    add %o0, %g2, %g2
+; SPARC64-NEXT:    add %l0, %g2, %l1
+; SPARC64-NEXT:    cmp %l1, %l0
+; SPARC64-NEXT:    movcs %xcc, 1, %g3
+; SPARC64-NEXT:    srl %g3, 0, %l0
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i0, %o1
+; SPARC64-NEXT:    mov %i4, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    add %o0, %l0, %g3
+; SPARC64-NEXT:    add %o1, %l1, %l1
+; SPARC64-NEXT:    cmp %l1, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %g2
+; SPARC64-NEXT:    add %g3, %g2, %l2
+; SPARC64-NEXT:    srax %i4, 63, %o2
+; SPARC64-NEXT:    mov %i3, %o0
+; SPARC64-NEXT:    mov %i5, %o1
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %o2, %o3
+; SPARC64-NEXT:    mov %o0, %i5
+; SPARC64-NEXT:    mov %o1, %l0
+; SPARC64-NEXT:    srax %i3, 63, %o0
+; SPARC64-NEXT:    mov %o0, %o1
+; SPARC64-NEXT:    mov %i4, %o2
+; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i2, %o3
 ; SPARC64-NEXT:    mov %g0, %i2
-; SPARC64-NEXT:    srlx %o1, 32, %i0
-; SPARC64-NEXT:    addcc %o1, %i5, %i5
-; SPARC64-NEXT:    srlx %o0, 32, %g2
-; SPARC64-NEXT:    addxcc %i0, %l4, %i0
-; SPARC64-NEXT:    addxcc %o0, %l5, %g3
-; SPARC64-NEXT:    addxcc %g2, %l6, %g2
-; SPARC64-NEXT:    addcc %i5, %l0, %i5
-; SPARC64-NEXT:    addxcc %i0, %l1, %i0
-; SPARC64-NEXT:    addxcc %g3, %l2, %g3
-; SPARC64-NEXT:    addxcc %g2, %l3, %g2
-; SPARC64-NEXT:    srl %g3, 0, %g3
-; SPARC64-NEXT:    sllx %g2, 32, %g2
-; SPARC64-NEXT:    or %g2, %g3, %g2
-; SPARC64-NEXT:    sllx %i4, 32, %i4
-; SPARC64-NEXT:    srax %i4, 63, %g3
-; SPARC64-NEXT:    xor %g2, %g3, %g2
-; SPARC64-NEXT:    srl %i5, 0, %i5
-; SPARC64-NEXT:    sllx %i0, 32, %i0
-; SPARC64-NEXT:    or %i0, %i5, %i0
-; SPARC64-NEXT:    xor %i0, %g3, %i0
-; SPARC64-NEXT:    or %i0, %g2, %i0
-; SPARC64-NEXT:    movrnz %i0, 1, %i2
-; SPARC64-NEXT:    srl %i3, 0, %i0
-; SPARC64-NEXT:    or %i4, %i0, %i0
+; SPARC64-NEXT:    mov %g0, %i3
+; SPARC64-NEXT:    mov %g0, %i4
+; SPARC64-NEXT:    add %o0, %i5, %i5
+; SPARC64-NEXT:    add %o1, %l0, %g2
+; SPARC64-NEXT:    cmp %g2, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %i2
 ; SPARC64-NEXT:    srl %i2, 0, %i2
+; SPARC64-NEXT:    add %i5, %i2, %i2
+; SPARC64-NEXT:    add %l2, %i2, %i2
+; SPARC64-NEXT:    add %l1, %g2, %i5
+; SPARC64-NEXT:    cmp %i5, %l1
+; SPARC64-NEXT:    movcs %xcc, 1, %i3
+; SPARC64-NEXT:    srl %i3, 0, %i3
+; SPARC64-NEXT:    add %i2, %i3, %i2
+; SPARC64-NEXT:    srax %i0, 63, %i3
+; SPARC64-NEXT:    xor %i2, %i3, %i2
+; SPARC64-NEXT:    xor %i5, %i3, %i3
+; SPARC64-NEXT:    or %i3, %i2, %i2
+; SPARC64-NEXT:    movrnz %i2, 1, %i4
+; SPARC64-NEXT:    srl %i4, 0, %i2
 ; SPARC64-NEXT:    ret
 ; SPARC64-NEXT:    restore
 start:
-  %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) #2
+  %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r)
   %1 = extractvalue { i128, i1 } %0, 0
   %2 = extractvalue { i128, i1 } %0, 1
   %3 = zext i1 %2 to i8
@@ -279,9 +267,4 @@ start:
   ret { i128, i8 } %5
 }
 
-; Function Attrs: nounwind readnone speculatable
-declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1
-
-attributes #0 = { nounwind readnone uwtable }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind }
+declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128)
diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
index 9ca895fe78073..01383a00c2619 100644
--- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
@@ -2,14 +2,10 @@
 ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC
 ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64
 
-define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
+define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-LABEL: muloti_test:
-; SPARC:         .cfi_startproc
-; SPARC-NEXT:  ! %bb.0: ! %start
+; SPARC:       ! %bb.0: ! %start
 ; SPARC-NEXT:    save %sp, -96, %sp
-; SPARC-NEXT:    .cfi_def_cfa_register %fp
-; SPARC-NEXT:    .cfi_window_save
-; SPARC-NEXT:    .cfi_register %o7, %i7
 ; SPARC-NEXT:    mov %i3, %g2
 ; SPARC-NEXT:    mov %i2, %g4
 ; SPARC-NEXT:    umul %i2, %i5, %i2
@@ -160,14 +156,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; SPARC-NEXT:    restore %g0, %g2, %o1
 ;
 ; SPARC64-LABEL: muloti_test:
-; SPARC64:         .cfi_startproc
-; SPARC64-NEXT:    .register %g2, #scratch
+; SPARC64:         .register %g2, #scratch
 ; SPARC64-NEXT:    .register %g3, #scratch
 ; SPARC64-NEXT:  ! %bb.0: ! %start
 ; SPARC64-NEXT:    save %sp, -176, %sp
-; SPARC64-NEXT:    .cfi_def_cfa_register %fp
-; SPARC64-NEXT:    .cfi_window_save
-; SPARC64-NEXT:    .cfi_register %o7, %i7
 ; SPARC64-NEXT:    mov %g0, %o0
 ; SPARC64-NEXT:    mov %i2, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
@@ -208,7 +200,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; SPARC64-NEXT:    ret
 ; SPARC64-NEXT:    restore %g0, %o1, %o1
 start:
-  %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
+  %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r)
   %1 = extractvalue { i128, i1 } %0, 0
   %2 = extractvalue { i128, i1 } %0, 1
   %3 = zext i1 %2 to i8
@@ -217,9 +209,4 @@ start:
   ret { i128, i8 } %5
 }
 
-; Function Attrs: nounwind readnone speculatable
-declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128) #1
-
-attributes #0 = { nounwind readnone uwtable }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind }
+declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll
new file mode 100644
index 0000000000000..3c48782a18586
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll
@@ -0,0 +1,68 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#float_64:]] = OpTypeFloat 64
+
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#vec4_float_64:]] = OpTypeVector %[[#float_64]] 4
+
+define noundef float @frac_float(float noundef %a) {
+entry:
+; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Fract %[[#float_32_arg]]
+  %elt.frac = call float @llvm.spv.frac.f32(float %a)
+  ret float %elt.frac
+}
+
+define noundef half @frac_half(half noundef %a) {
+entry:
+; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Fract %[[#float_16_arg]]
+  %elt.frac = call half @llvm.spv.frac.f16(half %a)
+  ret half %elt.frac
+}
+
+define noundef double @frac_double(double noundef %a) {
+entry:
+; CHECK: %[[#float_64_arg:]] = OpFunctionParameter %[[#float_64]]
+; CHECK: %[[#]] = OpExtInst %[[#float_64]] %[[#op_ext_glsl]] Fract %[[#float_64_arg]]
+  %elt.frac = call double @llvm.spv.frac.f64(double %a)
+  ret double %elt.frac
+}
+
+define noundef <4 x float> @frac_float_vector(<4 x float> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Fract %[[#vec4_float_32_arg]]
+  %elt.frac = call <4 x float> @llvm.spv.frac.v4f32(<4 x float> %a)
+  ret <4 x float> %elt.frac
+}
+
+define noundef <4 x half> @frac_half_vector(<4 x half> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Fract %[[#vec4_float_16_arg]]
+  %elt.frac = call <4 x half> @llvm.spv.frac.v4f16(<4 x half> %a)
+  ret <4 x half> %elt.frac
+}
+
+define noundef <4 x double> @frac_double_vector(<4 x double> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_64_arg:]] = OpFunctionParameter %[[#vec4_float_64]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_64]] %[[#op_ext_glsl]] Fract %[[#vec4_float_64_arg]]
+  %elt.frac = call <4 x double> @llvm.spv.frac.v4f64(<4 x double> %a)
+  ret <4 x double> %elt.frac
+}
+
+declare half @llvm.spv.frac.f16(half)
+declare float @llvm.spv.frac.f32(float)
+declare double @llvm.spv.frac.f64(double)
+
+declare <4 x float> @llvm.spv.frac.v4f32(<4 x float>)
+declare <4 x half> @llvm.spv.frac.v4f16(<4 x half>)
+declare <4 x double> @llvm.spv.frac.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/SystemZ/Large/large-ada-01.py b/llvm/test/CodeGen/SystemZ/Large/large-ada-01.py
new file mode 100644
index 0000000000000..17efde5b519cb
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/Large/large-ada-01.py
@@ -0,0 +1,30 @@
+# Test code generation for retrieving function descriptors
+# from the ADA when the ADA is extremely large and forces the
+# generation of a different instruction sequence
+# RUN: %python %s | llc -mtriple=s390x-ibm-zos -O2 | FileCheck %s
+
+# CHECK: llilf	1, {{[0-9]+}}
+# CHECK-NEXT: la	1, 0(1,8)
+
+from __future__ import print_function
+
+num_calls = 35000
+
+print("define hidden signext i32 @main() {")
+print("entry:")
+
+for i in range(num_calls):
+    print("  call void @foo%d()" % i)
+
+print("  call void @bar(ptr noundef @foo)")
+print("ret i32 0")
+print("}")
+
+for i in range(num_calls):
+    print("declare void @foo%d(...)" % i)
+
+print("declare void @bar(ptr noundef)")
+print("define internal void @foo() {")
+print("entry:")
+print("  ret void")
+print("  }")
diff --git a/llvm/test/CodeGen/SystemZ/Large/large-ada-02.py b/llvm/test/CodeGen/SystemZ/Large/large-ada-02.py
new file mode 100644
index 0000000000000..45bcb276f9b91
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/Large/large-ada-02.py
@@ -0,0 +1,33 @@
+# Test code generation for retrieving function descriptors
+# from the ADA when the ADA is extremely large and forces the
+# generation of a different instruction sequence
+# RUN: %python %s | llc -mtriple=s390x-ibm-zos -O2 | FileCheck %s
+
+# CHECK: algfi	8, {{[0-9]+}}
+# CHECK: la	8, 0(8)
+
+from __future__ import print_function
+
+num_calls = 35000
+
+print("define hidden signext i32 @main() {")
+print("entry:")
+
+for i in range(num_calls):
+    print("  call void @foo%d()" % i)
+
+# This is added to force the use of register r8 to generate
+# la 8, 0(8) which generates the algfi instruction
+print('%0 = call ptr asm " LGR $0,$1\0A", "=r,{r8}"(ptr nonnull @foo)')
+print("  call void @bar(ptr noundef %0)")
+print("ret i32 0")
+print("}")
+
+for i in range(num_calls):
+    print("declare void @foo%d(...)" % i)
+
+print("declare void @bar(ptr noundef)")
+print("define internal void @foo() {")
+print("entry:")
+print("  ret void")
+print("  }")
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 8a6a30318ae58..62f5d49192ea9 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,7 +13,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
 ; CHECK-NEXT:    lhrl %r1, f+4
 ; CHECK-NEXT:    sll %r1, 8
@@ -21,59 +20,66 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    ic %r1, 6(%r2)
 ; CHECK-NEXT:    larl %r2, e
 ; CHECK-NEXT:    lb %r0, 3(%r2)
-; CHECK-NEXT:    vlvgp %v0, %r0, %r1
-; CHECK-NEXT:    vlvgp %v1, %r1, %r0
 ; CHECK-NEXT:    vlvgf %v1, %r1, 0
-; CHECK-NEXT:    vlvgf %v1, %r1, 2
-; CHECK-NEXT:    vlvgp %v2, %r1, %r1
-; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT:    vlvgf %v1, %r1, 1
+; CHECK-NEXT:    larl %r2, .LCPI0_0
+; CHECK-NEXT:    vl %v2, 0(%r2), 3
+; CHECK-NEXT:    vlvgf %v1, %r1, 3
+; CHECK-NEXT:    vlvgf %v3, %r1, 3
+; CHECK-NEXT:    vlvgf %v0, %r1, 1
+; CHECK-NEXT:    vperm %v4, %v1, %v0, %v2
+; CHECK-NEXT:    vlvgf %v0, %r1, 3
 ; CHECK-NEXT:    nilh %r1, 255
 ; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT:    vlvgf %v0, %r0, 0
-; CHECK-NEXT:    vlvgf %v0, %r0, 2
-; CHECK-NEXT:    vgbm %v3, 30583
-; CHECK-NEXT:    vn %v0, %v0, %v3
-; CHECK-NEXT:    vn %v1, %v1, %v3
-; CHECK-NEXT:    vrepf %v2, %v2, 1
-; CHECK-NEXT:    vn %v2, %v2, %v3
-; CHECK-NEXT:    vrepif %v3, 127
-; CHECK-NEXT:    vchlf %v1, %v1, %v3
-; CHECK-NEXT:    vlgvf %r13, %v1, 0
-; CHECK-NEXT:    vchlf %v2, %v2, %v3
+; CHECK-NEXT:    vperm %v0, %v3, %v0, %v2
+; CHECK-NEXT:    larl %r2, .LCPI0_1
+; CHECK-NEXT:    vl %v5, 0(%r2), 3
+; CHECK-NEXT:    vgbm %v6, 30583
+; CHECK-NEXT:    vn %v0, %v0, %v6
+; CHECK-NEXT:    vn %v4, %v4, %v6
+; CHECK-NEXT:    vperm %v1, %v1, %v1, %v5
+; CHECK-NEXT:    vn %v5, %v1, %v6
+; CHECK-NEXT:    vperm %v1, %v0, %v3, %v2
+; CHECK-NEXT:    vn %v2, %v1, %v6
+; CHECK-NEXT:    vrepif %v1, 127
+; CHECK-NEXT:    vchlf %v3, %v5, %v1
+; CHECK-NEXT:    vlgvf %r3, %v3, 1
+; CHECK-NEXT:    vlgvf %r2, %v3, 0
+; CHECK-NEXT:    risbg %r2, %r2, 48, 176, 15
+; CHECK-NEXT:    rosbg %r2, %r3, 49, 49, 14
+; CHECK-NEXT:    vlgvf %r3, %v3, 2
+; CHECK-NEXT:    rosbg %r2, %r3, 50, 50, 13
+; CHECK-NEXT:    vlgvf %r3, %v3, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 51, 51, 12
+; CHECK-NEXT:    vchlf %v3, %v4, %v1
+; CHECK-NEXT:    vlgvf %r3, %v3, 0
+; CHECK-NEXT:    rosbg %r2, %r3, 52, 52, 11
+; CHECK-NEXT:    vlgvf %r3, %v3, 1
+; CHECK-NEXT:    rosbg %r2, %r3, 53, 53, 10
+; CHECK-NEXT:    vlgvf %r3, %v3, 2
+; CHECK-NEXT:    rosbg %r2, %r3, 54, 54, 9
+; CHECK-NEXT:    vlgvf %r3, %v3, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 55, 55, 8
+; CHECK-NEXT:    vchlf %v2, %v2, %v1
+; CHECK-NEXT:    vlgvf %r3, %v2, 0
+; CHECK-NEXT:    rosbg %r2, %r3, 56, 56, 7
 ; CHECK-NEXT:    vlgvf %r3, %v2, 1
-; CHECK-NEXT:    nilf %r3, 1
-; CHECK-NEXT:    vlgvf %r4, %v2, 0
-; CHECK-NEXT:    risbg %r2, %r4, 48, 176, 15
-; CHECK-NEXT:    rosbg %r2, %r3, 32, 49, 14
-; CHECK-NEXT:    vlgvf %r5, %v2, 2
-; CHECK-NEXT:    nilf %r5, 1
-; CHECK-NEXT:    rosbg %r2, %r5, 32, 50, 13
-; CHECK-NEXT:    vlgvf %r14, %v2, 3
-; CHECK-NEXT:    nilf %r14, 1
-; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
-; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r13, %v1, 1
-; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r13, %v1, 2
-; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r13, %v1, 3
-; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r13, %v0, 0
-; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
-; CHECK-NEXT:    vlgvf %r13, %v0, 1
-; CHECK-NEXT:    rosbg %r2, %r13, 57, 57, 6
-; CHECK-NEXT:    vlgvf %r13, %v0, 2
-; CHECK-NEXT:    rosbg %r2, %r13, 58, 58, 5
-; CHECK-NEXT:    vlgvf %r13, %v0, 3
-; CHECK-NEXT:    rosbg %r2, %r13, 59, 59, 4
-; CHECK-NEXT:    nilf %r4, 1
-; CHECK-NEXT:    rosbg %r2, %r4, 32, 60, 3
-; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
-; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
-; CHECK-NEXT:    or %r2, %r14
+; CHECK-NEXT:    rosbg %r2, %r3, 57, 57, 6
+; CHECK-NEXT:    vlgvf %r3, %v2, 2
+; CHECK-NEXT:    rosbg %r2, %r3, 58, 58, 5
+; CHECK-NEXT:    vlgvf %r3, %v2, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 59, 59, 4
+; CHECK-NEXT:    vchlf %v0, %v0, %v1
+; CHECK-NEXT:    vlgvf %r3, %v0, 0
+; CHECK-NEXT:    rosbg %r2, %r3, 60, 60, 3
+; CHECK-NEXT:    vlgvf %r3, %v0, 1
+; CHECK-NEXT:    rosbg %r2, %r3, 61, 61, 2
+; CHECK-NEXT:    vlgvf %r3, %v0, 2
+; CHECK-NEXT:    rosbg %r2, %r3, 62, 62, 1
+; CHECK-NEXT:    vlgvf %r3, %v0, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 63, 63, 0
 ; CHECK-NEXT:    vlgvb %r4, %v0, 1
 ; CHECK-NEXT:    vlgvb %r3, %v0, 0
 ; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
@@ -116,7 +122,7 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    nr %r2, %r0
 ; CHECK-NEXT:    larl %r1, g
 ; CHECK-NEXT:    stc %r2, 0(%r1)
-; CHECK-NEXT:    lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT:    aghi %r15, 168
 ; CHECK-NEXT:    br %r14
 entry:
   %n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
index 6f0abd6ea5baf..16231b2d89526 100644
--- a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
@@ -153,3 +153,13 @@ define void @f7(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
   store i8 %trunc3, ptr %ptr4
   ret void
 }
+
+; Test that a truncating store with a non-simple VT can be handled.
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+  %1 = load <12 x i32>, ptr %src, align 64
+  %2 = extractelement <12 x i32> %1, i64 11
+  %3 = trunc i32 %2 to i16
+  store i16 %3, ptr %dst, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 570834fb67010..81b6a6940a7d6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -2318,43 +2318,41 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    strb.w r4, [r9, #49]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s17, s17
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s17, s22
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r0, r1, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vcmp.f32 s17, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
 ; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
+; CHECK-NEXT:    strd r0, r1, [r9, #16]
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mvnlt r3, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r0, r1, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt r3, #7
 ; CHECK-NEXT:    vcmp.f32 s17, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    strd r0, r1, [r9, #16]
+; CHECK-NEXT:    movvs r3, #0
 ; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsrl r2, r1, #28
 ; CHECK-NEXT:    strb.w r2, [r9, #24]
@@ -3687,268 +3685,257 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    sub sp, #48
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vldr d0, .LCPI40_0
-; CHECK-NEXT:    vmov r5, r7, d8
-; CHECK-NEXT:    mov r10, r0
-; CHECK-NEXT:    vmov r9, r3, d0
-; CHECK-NEXT:    str r0, [sp, #40] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    vmov r10, r9, d8
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    vmov r7, r3, d0
+; CHECK-NEXT:    str r3, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    mov r2, r7
+; CHECK-NEXT:    str r7, [sp, #44] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
 ; CHECK-NEXT:    vldr d0, .LCPI40_1
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    vmov r8, r3, d0
 ; CHECK-NEXT:    mov r2, r8
 ; CHECK-NEXT:    mov r11, r3
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    str r0, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    strd r1, r0, [sp, #8] @ 8-byte Folded Spill
 ; CHECK-NEXT:    csel r4, r2, r4, ne
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    mov r2, r10
+; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r4, #-1
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r4, #0
-; CHECK-NEXT:    str.w r4, [r10, #8]
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    ldr r4, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r9
-; CHECK-NEXT:    str.w r9, [sp, #44] @ 4-byte Spill
-; CHECK-NEXT:    mov r3, r4
+; CHECK-NEXT:    str r4, [r6, #8]
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    ldr r5, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    mov r2, r7
+; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r10, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    mov r2, r8
 ; CHECK-NEXT:    mov r3, r11
+; CHECK-NEXT:    str.w r8, [sp, #32] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    csel r6, r1, r0, ne
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    cmp.w r10, #0
+; CHECK-NEXT:    mov r2, r10
+; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    csel r7, r1, r0, ne
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r6, #-1
+; CHECK-NEXT:    movne.w r7, #-1
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r6, #0
-; CHECK-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r9
-; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    str r6, [r0, #4]
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    movne r7, #0
+; CHECK-NEXT:    str r7, [r6, #4]
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    mov r2, r8
 ; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    mov r10, r8
+; CHECK-NEXT:    str.w r11, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    csel r6, r1, r0, ne
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r2, r10
+; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    csel r7, r1, r0, ne
+; CHECK-NEXT:    mov r0, r10
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r6, #-1
-; CHECK-NEXT:    str r5, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    str r7, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    movne.w r7, #-1
+; CHECK-NEXT:    str.w r10, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    vmov r9, r8, d9
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r6, #0
-; CHECK-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT:    str r6, [r0]
-; CHECK-NEXT:    ldr r6, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r6
+; CHECK-NEXT:    movne r7, #0
+; CHECK-NEXT:    str r7, [r6]
+; CHECK-NEXT:    ldr r6, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    mov r2, r6
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
+; CHECK-NEXT:    ldr.w r10, [sp, #32] @ 4-byte Reload
 ; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    mov r2, r10
 ; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    mov r4, r10
-; CHECK-NEXT:    str.w r10, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    mov r5, r11
-; CHECK-NEXT:    str.w r11, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    mov r2, r10
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    mov r11, r0
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    mov r10, r3
-; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    cmp.w r11, #0
-; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    mvneq r10, #7
+; CHECK-NEXT:    strd r2, r3, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT:    csel r11, r1, r11, ne
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
 ; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r10, #7
+; CHECK-NEXT:    movne.w r11, #-1
+; CHECK-NEXT:    bl __aeabi_dcmpun
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r11, #0
+; CHECK-NEXT:    bl __aeabi_dcmpgt
+; CHECK-NEXT:    ldr r5, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    mov r2, r10
+; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    bl __aeabi_dcmpge
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    csel r10, r4, r0, ne
+; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    mov r3, r8
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r10, #-1
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r10, #0
 ; CHECK-NEXT:    ldr r7, [sp, #44] @ 4-byte Reload
 ; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    ldr r3, [sp, #40] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    mov r3, r6
+; CHECK-NEXT:    str.w r10, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    lsrl r10, r11, #28
 ; CHECK-NEXT:    mov r2, r7
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r11, r0
+; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    mov r2, r4
+; CHECK-NEXT:    mov r2, r6
 ; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    csel r4, r1, r0, ne
-; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    cmp.w r11, #0
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r4, #-1
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r4, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r10
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    orr.w r0, r11, r4, lsl #4
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    mov r2, r7
-; CHECK-NEXT:    mov r3, r6
-; CHECK-NEXT:    mov r5, r6
-; CHECK-NEXT:    str r0, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    strd r10, r0, [r6, #16]
 ; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    ldr.w r11, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT:    mov r3, r11
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    mov r2, r7
-; CHECK-NEXT:    mov r3, r6
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r9
-; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    csel r11, r1, r0, ne
-; CHECK-NEXT:    mov r0, r9
-; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r11, #-1
-; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    mvneq r0, #7
+; CHECK-NEXT:    cmp.w r10, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r11, #0
-; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r11
-; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    mov r0, r9
-; CHECK-NEXT:    lsrl r4, r1, #28
-; CHECK-NEXT:    mov r3, r5
-; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    bl __aeabi_dcmpgt
+; CHECK-NEXT:    movne r0, #7
 ; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    mov r2, r7
 ; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    mov r3, r6
-; CHECK-NEXT:    mov r7, r6
-; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    csel r6, r1, r0, ne
-; CHECK-NEXT:    mov r0, r9
-; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r6, #-1
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r6, #0
-; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT:    and r1, r10, #15
-; CHECK-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT:    orr.w r0, r0, r6, lsl #4
-; CHECK-NEXT:    lsrl r6, r1, #28
-; CHECK-NEXT:    strd r4, r0, [r2, #16]
-; CHECK-NEXT:    mov r8, r2
-; CHECK-NEXT:    strb r6, [r2, #24]
-; CHECK-NEXT:    ldr r5, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    movne r5, #0
+; CHECK-NEXT:    and r1, r5, #15
+; CHECK-NEXT:    mov r8, r6
+; CHECK-NEXT:    lsrl r4, r1, #28
+; CHECK-NEXT:    mov r3, r11
+; CHECK-NEXT:    strb r4, [r6, #24]
 ; CHECK-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r5, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    ldr r3, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r3, [sp, #36] @ 4-byte Reload
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r6
-; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r1, r5
+; CHECK-NEXT:    mov r2, r7
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    mvneq r0, #7
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r0, #7
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r6
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r3, r6
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r1, r5
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpun
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r4, #0
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    and r0, r4, #15
-; CHECK-NEXT:    orr.w r0, r0, r11, lsl #4
+; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
 ; CHECK-NEXT:    str.w r0, [r8, #12]
 ; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9}
@@ -5433,7 +5420,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    mov r11, r0
 ; CHECK-NEXT:    vcvtb.f32.f16 s21, s19
 ; CHECK-NEXT:    vcvtt.f32.f16 s24, s19
 ; CHECK-NEXT:    vmov r0, s21
@@ -5442,13 +5429,13 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vcvtb.f32.f16 s30, s18
 ; CHECK-NEXT:    vldr s20, .LCPI50_2
 ; CHECK-NEXT:    vmov r8, s24
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r9, s26
 ; CHECK-NEXT:    vcvtt.f32.f16 s22, s18
 ; CHECK-NEXT:    vmov r6, s28
-; CHECK-NEXT:    vmov r5, s30
+; CHECK-NEXT:    vmov r7, s30
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vldr s18, .LCPI50_3
-; CHECK-NEXT:    mov r7, r3
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    vcmp.f32 s21, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s21, s20
@@ -5464,7 +5451,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s21, s20
-; CHECK-NEXT:    str.w r2, [r9, #83]
+; CHECK-NEXT:    str.w r2, [r11, #83]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -5476,7 +5463,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #79]
+; CHECK-NEXT:    str.w r1, [r11, #79]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s21, s20
@@ -5487,11 +5474,11 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9, #75]
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    str.w r0, [r11, #75]
+; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    mov r7, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    it lt
@@ -5506,7 +5493,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str.w r2, [r9, #58]
+; CHECK-NEXT:    str.w r2, [r11, #58]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -5518,7 +5505,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #54]
+; CHECK-NEXT:    str.w r1, [r11, #54]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s30, s20
@@ -5529,7 +5516,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9, #50]
+; CHECK-NEXT:    str.w r0, [r11, #50]
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s28, s18
@@ -5548,7 +5535,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str.w r2, [r9, #33]
+; CHECK-NEXT:    str.w r2, [r11, #33]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -5560,7 +5547,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #29]
+; CHECK-NEXT:    str.w r1, [r11, #29]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s28, s20
@@ -5571,8 +5558,8 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9, #25]
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    str.w r0, [r11, #25]
+; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s26, s18
 ; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
@@ -5590,7 +5577,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str.w r2, [r9, #8]
+; CHECK-NEXT:    str.w r2, [r11, #8]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -5602,7 +5589,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #4]
+; CHECK-NEXT:    str.w r1, [r11, #4]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s26, s20
@@ -5613,7 +5600,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9]
+; CHECK-NEXT:    str.w r0, [r11]
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s24, s18
@@ -5633,69 +5620,68 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r6, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
+; CHECK-NEXT:    mvnlt r5, #7
 ; CHECK-NEXT:    vcmp.f32 s21, s20
-; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    mov r9, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
+; CHECK-NEXT:    movgt r5, #7
 ; CHECK-NEXT:    vcmp.f32 s21, s21
-; CHECK-NEXT:    mov r10, r2
+; CHECK-NEXT:    mov r8, r2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    and r0, r7, #15
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    and r0, r5, #15
 ; CHECK-NEXT:    orr.w r1, r0, r6, lsl #4
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    str.w r1, [r9, #87]
+; CHECK-NEXT:    str.w r1, [r11, #87]
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r2, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    movgt.w r10, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s18
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    movvs.w r10, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r5, #7
+; CHECK-NEXT:    mvnlt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s30
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #7
+; CHECK-NEXT:    movgt r7, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    and r0, r5, #15
-; CHECK-NEXT:    orr.w r0, r0, r8, lsl #4
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    and r0, r7, #15
+; CHECK-NEXT:    orr.w r0, r0, r10, lsl #4
 ; CHECK-NEXT:    vcvtt.f32.f16 s30, s17
-; CHECK-NEXT:    str.w r0, [r9, #62]
+; CHECK-NEXT:    str.w r0, [r11, #62]
 ; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    mov r7, r3
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r2, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vcmp.f32 s30, s30
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
@@ -5715,8 +5701,9 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
 ; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT:    str.w r0, [r9, #37]
+; CHECK-NEXT:    str.w r0, [r11, #37]
 ; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s16, s18
@@ -5732,26 +5719,26 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    vcmp.f32 s26, s18
-; CHECK-NEXT:    ldr r5, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r5, #7
+; CHECK-NEXT:    mvnlt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s26, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #7
+; CHECK-NEXT:    movgt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s26, s26
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    movvs r7, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    and r5, r5, #15
+; CHECK-NEXT:    and r7, r7, #15
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    orr.w r5, r5, r0, lsl #4
-; CHECK-NEXT:    str.w r5, [r9, #12]
+; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
+; CHECK-NEXT:    str.w r7, [r11, #12]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt.w r9, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    b.w .LBB50_3
 ; CHECK-NEXT:    .p2align 2
@@ -5766,181 +5753,176 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:  .LBB50_3:
 ; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    movgt.w r9, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s18
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r11, #0
+; CHECK-NEXT:    movvs.w r9, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movlt.w r8, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r6, r11, #28
+; CHECK-NEXT:    lsrl r6, r9, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    movgt.w r8, #-1
 ; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r10, #0
-; CHECK-NEXT:    orr.w r5, r11, r10, lsl #4
-; CHECK-NEXT:    str.w r5, [r9, #95]
-; CHECK-NEXT:    str.w r6, [r9, #91]
+; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    orr.w r7, r9, r8, lsl #4
+; CHECK-NEXT:    str.w r7, [r11, #95]
+; CHECK-NEXT:    str.w r6, [r11, #91]
 ; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r6, #7
+; CHECK-NEXT:    mvnlt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r6, #7
+; CHECK-NEXT:    movgt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    and r5, r6, #15
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    and r7, r7, #15
 ; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    lsrl r10, r5, #28
+; CHECK-NEXT:    lsrl r8, r7, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    strb.w r10, [r9, #99]
+; CHECK-NEXT:    strb.w r8, [r11, #99]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s18
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    vcmp.f32 s22, s18
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    lsrl r10, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r7
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT:    str.w r7, [r11, #70]
+; CHECK-NEXT:    str.w r10, [r11, #66]
 ; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    mvnlt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    lsrl r8, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    movgt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    orr.w r6, r5, r4, lsl #4
+; CHECK-NEXT:    movvs r7, #0
 ; CHECK-NEXT:    and r5, r7, #15
-; CHECK-NEXT:    lsrl r4, r5, #28
-; CHECK-NEXT:    str.w r6, [r9, #70]
-; CHECK-NEXT:    str.w r8, [r9, #66]
 ; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    strb.w r4, [r9, #74]
+; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    mov r5, r4
+; CHECK-NEXT:    strb.w r6, [r11, #74]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r4, #7
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #7
+; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    vcmp.f32 s30, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    ldr r4, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    ldr r7, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r4, r5, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    vcmp.f32 s30, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r4
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT:    str.w r7, [r11, #45]
+; CHECK-NEXT:    str.w r4, [r11, #41]
 ; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    ldr.w r12, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldr r6, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    mvnlt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r12, r5, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movgt r7, #7
 ; CHECK-NEXT:    vcmp.f32 s30, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    and r5, r4, #15
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    and r5, r7, #15
 ; CHECK-NEXT:    vcmp.f32 s16, s18
 ; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str.w r7, [r9, #45]
-; CHECK-NEXT:    str.w r12, [r9, #41]
-; CHECK-NEXT:    strb.w r6, [r9, #49]
+; CHECK-NEXT:    strb.w r6, [r11, #49]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s18
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r0, r1, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
 ; CHECK-NEXT:    vcmp.f32 s16, s18
-; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    strd r0, r1, [r11, #16]
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mvnlt r3, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r0, r1, #28
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt r3, #7
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    strd r0, r1, [r9, #16]
+; CHECK-NEXT:    movvs r3, #0
 ; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r9, #24]
+; CHECK-NEXT:    strb.w r2, [r11, #24]
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 2b6d0da531704..5ab184a066e49 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1879,26 +1879,16 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    vcmp.f32 s17, #0
 ; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    strb.w r6, [r8, #49]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #15
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
@@ -1906,7 +1896,15 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    vcmp.f32 s17, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    strd r0, r1, [r8, #16]
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
 ; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsrl r2, r1, #28
 ; CHECK-NEXT:    strb.w r2, [r8, #24]
@@ -2925,195 +2923,191 @@ define arm_aapcs_vfpcc <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vldr d0, .LCPI40_0
 ; CHECK-NEXT:    vmov r6, r5, d8
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    vmov r2, r9, d0
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r2, r7, d0
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    mov r10, r2
+; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    mov r9, r2
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
 ; CHECK-NEXT:    vldr d0, .LCPI40_1
-; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    mov r11, r0
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    vmov r11, r3, d0
-; CHECK-NEXT:    str r3, [sp, #44] @ 4-byte Spill
-; CHECK-NEXT:    mov r2, r11
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    str r2, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r8, r0
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    str r5, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    cmp.w r8, #0
 ; CHECK-NEXT:    strd r1, r0, [sp, #20] @ 8-byte Folded Spill
-; CHECK-NEXT:    csel r0, r2, r4, ne
+; CHECK-NEXT:    csel r0, r2, r8, ne
 ; CHECK-NEXT:    str r3, [sp, #40] @ 4-byte Spill
-; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    cmp.w r11, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #8]
+; CHECK-NEXT:    str r0, [r4, #8]
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    mov r2, r10
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    ldr r3, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r8, r0
 ; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r11, r6
+; CHECK-NEXT:    ldr r6, [sp, #44] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    mov r2, r11
-; CHECK-NEXT:    mov r7, r6
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    mov r2, r6
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r10
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    mov r3, r7
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    cmp.w r8, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #4]
-; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    str r0, [r4, #4]
+; CHECK-NEXT:    mov r0, r11
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    mov r6, r8
-; CHECK-NEXT:    strd r8, r7, [sp, #28] @ 8-byte Folded Spill
+; CHECK-NEXT:    strd r4, r11, [sp, #28] @ 8-byte Folded Spill
+; CHECK-NEXT:    str r5, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    ldr r7, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    mov r0, r11
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    mov r2, r11
-; CHECK-NEXT:    mov r5, r11
-; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    mov r3, r10
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    vmov r8, r11, d9
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r10
+; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    str r0, [r6]
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    mov r5, r9
+; CHECK-NEXT:    mov r3, r4
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    mov r2, r6
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    mov r6, r5
-; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
-; CHECK-NEXT:    mov r5, r7
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    cmp r7, #0
-; CHECK-NEXT:    strd r0, r2, [sp, #20] @ 8-byte Folded Spill
-; CHECK-NEXT:    csel r0, r3, r7, ne
-; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r0, #15
-; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    add.w r12, sp, #16
+; CHECK-NEXT:    cmp.w r10, #0
+; CHECK-NEXT:    stm.w r12, {r0, r2, r3} @ 12-byte Folded Spill
+; CHECK-NEXT:    csel r9, r1, r10, ne
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    mov r2, r10
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r2, r5
+; CHECK-NEXT:    mov r3, r4
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r9, #-1
+; CHECK-NEXT:    mov r7, r5
+; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    mov r10, r4
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    ldr r5, [sp, #44] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    mov r2, r6
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r3, r6
+; CHECK-NEXT:    str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r10
-; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r7
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    mov r2, r7
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    csel r4, r1, r0, ne
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    mov r6, r10
-; CHECK-NEXT:    str.w r10, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    str.w r9, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r4, #-1
 ; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    str r4, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    lsrl r4, r9, #28
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    ldr r7, [sp] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    mov r3, r5
-; CHECK-NEXT:    mov r2, r7
+; CHECK-NEXT:    mov r2, r5
+; CHECK-NEXT:    mov r3, r6
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mov r2, r6
-; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    csel r4, r1, r0, ne
-; CHECK-NEXT:    cmp.w r10, #0
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    csel r6, r1, r0, ne
+; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r4, #-1
-; CHECK-NEXT:    ldr r5, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r4
-; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    movne.w r6, #-1
+; CHECK-NEXT:    ldr r5, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    orr.w r0, r9, r6, lsl #4
 ; CHECK-NEXT:    mov r1, r11
-; CHECK-NEXT:    lsrl r10, r5, #28
+; CHECK-NEXT:    strd r4, r0, [r5, #16]
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    ldr.w r9, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    mov r1, r11
 ; CHECK-NEXT:    ldr.w r11, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r2, r7
-; CHECK-NEXT:    mov r9, r7
-; CHECK-NEXT:    mov r3, r11
+; CHECK-NEXT:    ldr.w r8, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r2, r11
+; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    mov r3, r10
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    orr.w r1, r5, r0, lsl #4
-; CHECK-NEXT:    strd r10, r1, [r2, #16]
-; CHECK-NEXT:    mov r8, r2
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsrl r0, r1, #28
-; CHECK-NEXT:    strb r0, [r2, #24]
+; CHECK-NEXT:    movne r0, #15
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r6, r1, #28
+; CHECK-NEXT:    strb r6, [r5, #24]
 ; CHECK-NEXT:    ldr r6, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    ldr r7, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r4, [sp, #36] @ 4-byte Reload
 ; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r1, r4
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r9
-; CHECK-NEXT:    mov r3, r11
+; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    mov r2, r11
+; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    bl __aeabi_dcmpge
 ; CHECK-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    cmp.w r10, #0
+; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r0, #15
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    and r0, r0, #15
-; CHECK-NEXT:    orr.w r0, r0, r4, lsl #4
-; CHECK-NEXT:    str.w r0, [r8, #12]
+; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
+; CHECK-NEXT:    str r0, [r5, #12]
 ; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    add sp, #4
@@ -4216,7 +4210,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    mov r8, r0
 ; CHECK-NEXT:    vcvtb.f32.f16 s30, s19
 ; CHECK-NEXT:    vcvtb.f32.f16 s28, s18
 ; CHECK-NEXT:    vmov r0, s30
@@ -4224,14 +4218,14 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    vcvtb.f32.f16 s24, s16
 ; CHECK-NEXT:    vcvtb.f32.f16 s26, s17
 ; CHECK-NEXT:    vldr s20, .LCPI50_1
-; CHECK-NEXT:    vmov r8, s22
-; CHECK-NEXT:    vmov r5, s28
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r7, s28
 ; CHECK-NEXT:    vcvtt.f32.f16 s18, s18
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov r9, s24
 ; CHECK-NEXT:    vmov r6, s26
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s30, #0
-; CHECK-NEXT:    mov r7, r3
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    it lt
@@ -4242,7 +4236,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str.w r2, [r9, #83]
+; CHECK-NEXT:    str.w r2, [r8, #83]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -4250,18 +4244,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #79]
+; CHECK-NEXT:    str.w r1, [r8, #79]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r9, #75]
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    str.w r0, [r8, #75]
+; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    mov r7, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    it lt
@@ -4272,7 +4266,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str.w r2, [r9, #58]
+; CHECK-NEXT:    str.w r2, [r8, #58]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -4280,14 +4274,14 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #54]
+; CHECK-NEXT:    str.w r1, [r8, #54]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r9, #50]
+; CHECK-NEXT:    str.w r0, [r8, #50]
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s26, #0
@@ -4302,7 +4296,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str.w r2, [r9, #33]
+; CHECK-NEXT:    str.w r2, [r8, #33]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -4310,18 +4304,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #29]
+; CHECK-NEXT:    str.w r1, [r8, #29]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s26, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r9, #25]
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    str.w r0, [r8, #25]
+; CHECK-NEXT:    mov r0, r9
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    it lt
@@ -4332,7 +4326,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    str.w r2, [r9, #8]
+; CHECK-NEXT:    str.w r2, [r8, #8]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -4340,21 +4334,21 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #4]
+; CHECK-NEXT:    str.w r1, [r8, #4]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r9]
-; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    str.w r0, [r8]
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s22, #0
 ; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -4363,177 +4357,174 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r5, #0
 ; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r0, r7, #15
-; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    movgt r5, #15
+; CHECK-NEXT:    and r0, r5, #15
+; CHECK-NEXT:    mov r9, r1
 ; CHECK-NEXT:    orr.w r1, r0, r6, lsl #4
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    mov r10, r2
-; CHECK-NEXT:    str.w r1, [r9, #87]
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    str.w r1, [r8, #87]
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    str r2, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, #0
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    movgt.w r10, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #15
-; CHECK-NEXT:    and r0, r5, #15
+; CHECK-NEXT:    movgt r7, #15
+; CHECK-NEXT:    and r0, r7, #15
 ; CHECK-NEXT:    vcvtt.f32.f16 s28, s17
-; CHECK-NEXT:    orr.w r0, r0, r8, lsl #4
-; CHECK-NEXT:    str.w r0, [r9, #62]
+; CHECK-NEXT:    orr.w r0, r0, r10, lsl #4
+; CHECK-NEXT:    str.w r0, [r8, #62]
 ; CHECK-NEXT:    vmov r0, s28
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r2, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    mov r11, r1
 ; CHECK-NEXT:    vcmp.f32 s26, #0
 ; CHECK-NEXT:    mov r1, r0
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s26, s20
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s26, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt r0, #15
 ; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
 ; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT:    str.w r0, [r9, #37]
+; CHECK-NEXT:    str.w r0, [r8, #37]
 ; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #15
-; CHECK-NEXT:    and r5, r4, #15
+; CHECK-NEXT:    movgt r7, #15
+; CHECK-NEXT:    and r7, r7, #15
 ; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    orr.w r5, r5, r0, lsl #4
+; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r5, [r9, #12]
+; CHECK-NEXT:    str.w r7, [r8, #12]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt.w r9, #0
 ; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    movgt.w r9, #-1
 ; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    lsrl r6, r11, #28
+; CHECK-NEXT:    lsrl r6, r9, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
-; CHECK-NEXT:    orr.w r5, r11, r10, lsl #4
-; CHECK-NEXT:    str.w r5, [r9, #95]
-; CHECK-NEXT:    str.w r6, [r9, #91]
+; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    orr.w r7, r9, r4, lsl #4
+; CHECK-NEXT:    str.w r7, [r8, #95]
+; CHECK-NEXT:    str.w r6, [r8, #91]
 ; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r6, #15
-; CHECK-NEXT:    and r5, r6, #15
+; CHECK-NEXT:    movgt r7, #15
+; CHECK-NEXT:    and r7, r7, #15
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    lsrl r10, r5, #28
+; CHECK-NEXT:    lsrl r4, r7, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r10, [r9, #99]
+; CHECK-NEXT:    strb.w r4, [r8, #99]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r5, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
+; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r10, r5, #28
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r6, r7
+; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT:    str.w r7, [r8, #70]
+; CHECK-NEXT:    str.w r10, [r8, #66]
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r8, r5, #28
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    orr.w r6, r5, r4, lsl #4
+; CHECK-NEXT:    movgt r7, #15
 ; CHECK-NEXT:    and r5, r7, #15
-; CHECK-NEXT:    lsrl r4, r5, #28
-; CHECK-NEXT:    str.w r6, [r9, #70]
-; CHECK-NEXT:    str.w r8, [r9, #66]
 ; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    strb.w r4, [r9, #74]
+; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    strb.w r6, [r8, #74]
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    mov r12, r7
+; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    ldr r4, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    ldr r7, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r4, r11, #28
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
 ; CHECK-NEXT:    b.w .LBB50_2
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
@@ -4541,46 +4532,34 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    .long 0x717fffff @ float 1.26765052E+30
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r12
-; CHECK-NEXT:    ldr r4, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldr r6, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    orr.w r7, r11, r6, lsl #4
+; CHECK-NEXT:    str.w r7, [r8, #45]
+; CHECK-NEXT:    str.w r4, [r8, #41]
 ; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    lsrl r4, r5, #28
+; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    and r5, r12, #15
+; CHECK-NEXT:    movgt r7, #15
+; CHECK-NEXT:    and r5, r7, #15
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str.w r7, [r9, #45]
-; CHECK-NEXT:    str.w r4, [r9, #41]
-; CHECK-NEXT:    strb.w r6, [r9, #49]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #15
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    strb.w r6, [r8, #49]
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
@@ -4588,10 +4567,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    strd r0, r1, [r9, #16]
+; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    strd r0, r1, [r8, #16]
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
 ; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r9, #24]
+; CHECK-NEXT:    strb.w r2, [r8, #24]
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
diff --git a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll
index f2c8440b177d8..55a621eaf4c9c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll
@@ -207,10 +207,8 @@ define arm_aapcs_vfpcc <1 x i64> @smax1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    sbcs.w r12, r3, r1
 ; CHECK-NEXT:    cset r12, lt
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
   %c = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b)
@@ -486,10 +484,8 @@ define arm_aapcs_vfpcc <1 x i64> @umax1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    sbcs.w r12, r3, r1
 ; CHECK-NEXT:    cset r12, lo
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
   %c = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %a, <1 x i64> %b)
@@ -772,10 +768,8 @@ define arm_aapcs_vfpcc <1 x i64> @smin1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    sbcs.w r12, r1, r3
 ; CHECK-NEXT:    cset r12, lt
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
   %c = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %a, <1 x i64> %b)
@@ -1051,10 +1045,8 @@ define arm_aapcs_vfpcc <1 x i64> @umin1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    sbcs.w r12, r1, r3
 ; CHECK-NEXT:    cset r12, lo
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    csel r1, r1, r3, ne
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
   %c = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %a, <1 x i64> %b)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index d80dd5a673e20..85317e1fe4626 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -535,22 +535,20 @@ define void @vst3_v2i8(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    ldrb r3, [r0, #1]
-; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT:    ldrb.w lr, [r0, #3]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r5, [r0, #5]
+; CHECK-NEXT:    mov r5, sp
+; CHECK-NEXT:    ldrb r3, [r0, #2]
 ; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrb.w r12, [r0, #1]
+; CHECK-NEXT:    ldrb.w lr, [r0, #3]
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    ldrb r4, [r0, #5]
 ; CHECK-NEXT:    ldrb r0, [r0, #4]
-; CHECK-NEXT:    vmov.16 q0[1], r12
 ; CHECK-NEXT:    vmov.16 q0[2], r0
 ; CHECK-NEXT:    add r0, sp, #8
-; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[3], r12
 ; CHECK-NEXT:    vmov.16 q0[4], lr
-; CHECK-NEXT:    vmov.16 q0[5], r5
-; CHECK-NEXT:    vstrb.16 q0, [r4]
+; CHECK-NEXT:    vmov.16 q0[5], r4
+; CHECK-NEXT:    vstrb.16 q0, [r5]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    ldr r2, [sp]
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index f3a65c40031af..b36904495e878 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -238,27 +238,23 @@ define void @vst4_v2i16(ptr %src, ptr %dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    ldrh r3, [r0, #2]
-; CHECK-NEXT:    ldrh r2, [r0]
-; CHECK-NEXT:    ldrh.w r12, [r0, #10]
-; CHECK-NEXT:    ldrh.w lr, [r0, #4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    ldrh r4, [r0, #12]
-; CHECK-NEXT:    ldrh r5, [r0, #6]
+; CHECK-NEXT:    ldrh r2, [r0, #4]
+; CHECK-NEXT:    ldrh r3, [r0, #8]
+; CHECK-NEXT:    ldrh.w r12, [r0, #12]
+; CHECK-NEXT:    ldrh.w lr, [r0, #2]
+; CHECK-NEXT:    ldrh r4, [r0, #6]
+; CHECK-NEXT:    ldrh r5, [r0, #10]
 ; CHECK-NEXT:    ldrh r6, [r0, #14]
-; CHECK-NEXT:    ldrh r0, [r0, #8]
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[1], lr
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.16 q1[3], r4
-; CHECK-NEXT:    vmov.16 q1[4], r3
-; CHECK-NEXT:    vmov.16 q1[5], r5
-; CHECK-NEXT:    vmov.16 q1[6], r12
-; CHECK-NEXT:    vmov.16 q1[7], r6
-; CHECK-NEXT:    vstrh.16 q1, [r1]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.16 q0[2], r3
+; CHECK-NEXT:    vmov.16 q0[3], r12
+; CHECK-NEXT:    vmov.16 q0[4], lr
+; CHECK-NEXT:    vmov.16 q0[5], r4
+; CHECK-NEXT:    vmov.16 q0[6], r5
+; CHECK-NEXT:    vmov.16 q0[7], r6
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <2 x i16>, ptr %src, align 4
@@ -475,26 +471,22 @@ define void @vst4_v2i8(ptr %src, ptr %dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    ldrb r4, [r0, #5]
-; CHECK-NEXT:    ldrb r5, [r0, #4]
-; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    ldrb r3, [r0, #1]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT:    ldrb.w lr, [r0, #3]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r0, #7]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    ldrb r0, [r0, #6]
-; CHECK-NEXT:    vmov.16 q0[1], r12
-; CHECK-NEXT:    vmov.16 q0[2], r5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.16 q0[4], r3
-; CHECK-NEXT:    vmov.16 q0[5], lr
-; CHECK-NEXT:    vmov.16 q0[6], r4
-; CHECK-NEXT:    vmov.16 q0[7], r6
+; CHECK-NEXT:    ldrb r4, [r0]
+; CHECK-NEXT:    ldrb r6, [r0, #2]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb r2, [r0, #4]
+; CHECK-NEXT:    vmov.16 q0[1], r6
+; CHECK-NEXT:    ldrb r3, [r0, #6]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    ldrb r5, [r0, #1]
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    ldrb.w r12, [r0, #5]
+; CHECK-NEXT:    ldrb.w lr, [r0, #7]
+; CHECK-NEXT:    vmov.16 q0[4], r5
+; CHECK-NEXT:    ldrb r0, [r0, #3]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.16 q0[6], r12
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    vstrb.16 q0, [r1]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll
index 12aa101cb48c4..51da557c6c49f 100644
--- a/llvm/test/CodeGen/VE/Scalar/max.ll
+++ b/llvm/test/CodeGen/VE/Scalar/max.ll
@@ -281,11 +281,13 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) {
 ; CHECK-LABEL: maxi1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    and %s0, 1, %s0
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; OPT-LABEL: maxi1:
 ; OPT:       # %bb.0:
 ; OPT-NEXT:    or %s0, %s0, %s1
+; OPT-NEXT:    and %s0, 1, %s0
 ; OPT-NEXT:    b.l.t (, %s10)
   %3 = xor i1 %1, true
   %4 = and i1 %3, %0
diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll
index da92ebafd0590..69d5ce48601f8 100644
--- a/llvm/test/CodeGen/VE/Scalar/min.ll
+++ b/llvm/test/CodeGen/VE/Scalar/min.ll
@@ -278,6 +278,7 @@ define i32 @min2u32(i32, i32) {
 define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
 ; CHECK-LABEL: mini1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    and %s2, %s1, %s0
 ; CHECK-NEXT:    cmov.w.ne %s2, %s1, %s0
 ; CHECK-NEXT:    adds.w.zx %s0, %s2, (0)1
@@ -285,6 +286,7 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
 ;
 ; OPT-LABEL: mini1:
 ; OPT:       # %bb.0:
+; OPT-NEXT:    and %s0, %s0, (32)0
 ; OPT-NEXT:    and %s2, %s1, %s0
 ; OPT-NEXT:    cmov.w.ne %s2, %s1, %s0
 ; OPT-NEXT:    adds.w.zx %s0, %s2, (0)1
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index fa78f5f9591d6..dba4138ad59cc 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -36,6 +36,14 @@ define float @extract_lane_v8f16(<8 x half> %v) {
   ret float %r
 }
 
+; CHECK-LABEL: replace_lane_v8f16:
+; CHECK:       f16x8.replace_lane $push0=, $0, 1, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @replace_lane_v8f16(<8 x half> %v, float %f) {
+  %r = call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %v, i32 1, float %f)
+  ret <8 x half> %r
+}
+
 ; CHECK-LABEL: add_v8f16:
 ; CHECK:       f16x8.add $push0=, $0, $1
 ; CHECK-NEXT:  return $pop0
diff --git a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll
new file mode 100644
index 0000000000000..8bc9c043fcf8a
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+
+; Test a subset of compiler-rt/libm libcalls expected to be emitted by the wasm backend
+
+target triple = "wasm32-unknown-unknown"
+
+declare fp128 @llvm.sin.f128(fp128)
+declare fp128 @llvm.cos.f128(fp128)
+declare fp128 @llvm.tan.f128(fp128)
+declare fp128 @llvm.asin.f128(fp128)
+declare fp128 @llvm.acos.f128(fp128)
+declare fp128 @llvm.atan.f128.i32(fp128)
+declare fp128 @llvm.sinh.f128(fp128)
+declare fp128 @llvm.cosh.f128(fp128)
+declare fp128 @llvm.tanh.f128(fp128)
+
+declare double @llvm.sin.f64(double)
+declare double @llvm.cos.f64(double)
+declare double @llvm.tan.f64(double)
+declare double @llvm.asin.f64(double)
+declare double @llvm.acos.f64(double)
+declare double @llvm.atan.f64(double)
+declare double @llvm.sinh.f64(double)
+declare double @llvm.cosh.f64(double)
+declare double @llvm.tanh.f64(double)
+
+declare float @llvm.sin.f32(float)
+declare float @llvm.cos.f32(float)
+declare float @llvm.tan.f32(float)
+declare float @llvm.asin.f32(float)
+declare float @llvm.acos.f32(float)
+declare float @llvm.atan.f32(float)
+declare float @llvm.sinh.f32(float)
+declare float @llvm.cosh.f32(float)
+declare float @llvm.tanh.f32(float)
+
+
+define fp128 @fp128libcalls(fp128 %x) {
+  ; compiler-rt call
+; CHECK-LABEL: fp128libcalls:
+; CHECK:         .functype fp128libcalls (i32, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get      $push28=, __stack_pointer
+; CHECK-NEXT:    i32.const       $push29=, 144
+; CHECK-NEXT:    i32.sub         $push73=, $pop28, $pop29
+; CHECK-NEXT:    local.tee       $push72=, 3, $pop73
+; CHECK-NEXT:    global.set      __stack_pointer, $pop72
+; CHECK-NEXT:    local.get       $push74=, 3
+; CHECK-NEXT:    i32.const       $push62=, 128
+; CHECK-NEXT:    i32.add         $push63=, $pop74, $pop62
+; CHECK-NEXT:    local.get       $push76=, 1
+; CHECK-NEXT:    local.get       $push75=, 2
+; CHECK-NEXT:    call    sinl, $pop63, $pop76, $pop75
+; CHECK-NEXT:    local.get       $push77=, 3
+; CHECK-NEXT:    i32.const       $push58=, 112
+; CHECK-NEXT:    i32.add         $push59=, $pop77, $pop58
+; CHECK-NEXT:    local.get       $push78=, 3
+; CHECK-NEXT:    i64.load        $push3=, 128($pop78)
+; CHECK-NEXT:    local.get       $push79=, 3
+; CHECK-NEXT:    i32.const       $push60=, 128
+; CHECK-NEXT:    i32.add         $push61=, $pop79, $pop60
+; CHECK-NEXT:    i32.const       $push0=, 8
+; CHECK-NEXT:    i32.add         $push1=, $pop61, $pop0
+; CHECK-NEXT:    i64.load        $push2=, 0($pop1)
+; CHECK-NEXT:    call    cosl, $pop59, $pop3, $pop2
+; CHECK-NEXT:    local.get       $push80=, 3
+; CHECK-NEXT:    i32.const       $push54=, 96
+; CHECK-NEXT:    i32.add         $push55=, $pop80, $pop54
+; CHECK-NEXT:    local.get       $push81=, 3
+; CHECK-NEXT:    i64.load        $push6=, 112($pop81)
+; CHECK-NEXT:    local.get       $push82=, 3
+; CHECK-NEXT:    i32.const       $push56=, 112
+; CHECK-NEXT:    i32.add         $push57=, $pop82, $pop56
+; CHECK-NEXT:    i32.const       $push71=, 8
+; CHECK-NEXT:    i32.add         $push4=, $pop57, $pop71
+; CHECK-NEXT:    i64.load        $push5=, 0($pop4)
+; CHECK-NEXT:    call    tanl, $pop55, $pop6, $pop5
+; CHECK-NEXT:    local.get       $push83=, 3
+; CHECK-NEXT:    i32.const       $push50=, 80
+; CHECK-NEXT:    i32.add         $push51=, $pop83, $pop50
+; CHECK-NEXT:    local.get       $push84=, 3
+; CHECK-NEXT:    i64.load        $push9=, 96($pop84)
+; CHECK-NEXT:    local.get       $push85=, 3
+; CHECK-NEXT:    i32.const       $push52=, 96
+; CHECK-NEXT:    i32.add         $push53=, $pop85, $pop52
+; CHECK-NEXT:    i32.const       $push70=, 8
+; CHECK-NEXT:    i32.add         $push7=, $pop53, $pop70
+; CHECK-NEXT:    i64.load        $push8=, 0($pop7)
+; CHECK-NEXT:    call    asinl, $pop51, $pop9, $pop8
+; CHECK-NEXT:    local.get       $push86=, 3
+; CHECK-NEXT:    i32.const       $push46=, 64
+; CHECK-NEXT:    i32.add         $push47=, $pop86, $pop46
+; CHECK-NEXT:    local.get       $push87=, 3
+; CHECK-NEXT:    i64.load        $push12=, 80($pop87)
+; CHECK-NEXT:    local.get       $push88=, 3
+; CHECK-NEXT:    i32.const       $push48=, 80
+; CHECK-NEXT:    i32.add         $push49=, $pop88, $pop48
+; CHECK-NEXT:    i32.const       $push69=, 8
+; CHECK-NEXT:    i32.add         $push10=, $pop49, $pop69
+; CHECK-NEXT:    i64.load        $push11=, 0($pop10)
+; CHECK-NEXT:    call    acosl, $pop47, $pop12, $pop11
+; CHECK-NEXT:    local.get       $push89=, 3
+; CHECK-NEXT:    i32.const       $push42=, 48
+; CHECK-NEXT:    i32.add         $push43=, $pop89, $pop42
+; CHECK-NEXT:    local.get       $push90=, 3
+; CHECK-NEXT:    i64.load        $push15=, 64($pop90)
+; CHECK-NEXT:    local.get       $push91=, 3
+; CHECK-NEXT:    i32.const       $push44=, 64
+; CHECK-NEXT:    i32.add         $push45=, $pop91, $pop44
+; CHECK-NEXT:    i32.const       $push68=, 8
+; CHECK-NEXT:    i32.add         $push13=, $pop45, $pop68
+; CHECK-NEXT:    i64.load        $push14=, 0($pop13)
+; CHECK-NEXT:    call    atanl, $pop43, $pop15, $pop14
+; CHECK-NEXT:    local.get       $push92=, 3
+; CHECK-NEXT:    i32.const       $push38=, 32
+; CHECK-NEXT:    i32.add         $push39=, $pop92, $pop38
+; CHECK-NEXT:    local.get       $push93=, 3
+; CHECK-NEXT:    i64.load        $push18=, 48($pop93)
+; CHECK-NEXT:    local.get       $push94=, 3
+; CHECK-NEXT:    i32.const       $push40=, 48
+; CHECK-NEXT:    i32.add         $push41=, $pop94, $pop40
+; CHECK-NEXT:    i32.const       $push67=, 8
+; CHECK-NEXT:    i32.add         $push16=, $pop41, $pop67
+; CHECK-NEXT:    i64.load        $push17=, 0($pop16)
+; CHECK-NEXT:    call    sinhl, $pop39, $pop18, $pop17
+; CHECK-NEXT:    local.get       $push95=, 3
+; CHECK-NEXT:    i32.const       $push34=, 16
+; CHECK-NEXT:    i32.add         $push35=, $pop95, $pop34
+; CHECK-NEXT:    local.get       $push96=, 3
+; CHECK-NEXT:    i64.load        $push21=, 32($pop96)
+; CHECK-NEXT:    local.get       $push97=, 3
+; CHECK-NEXT:    i32.const       $push36=, 32
+; CHECK-NEXT:    i32.add         $push37=, $pop97, $pop36
+; CHECK-NEXT:    i32.const       $push66=, 8
+; CHECK-NEXT:    i32.add         $push19=, $pop37, $pop66
+; CHECK-NEXT:    i64.load        $push20=, 0($pop19)
+; CHECK-NEXT:    call    coshl, $pop35, $pop21, $pop20
+; CHECK-NEXT:    local.get       $push100=, 3
+; CHECK-NEXT:    local.get       $push98=, 3
+; CHECK-NEXT:    i64.load        $push24=, 16($pop98)
+; CHECK-NEXT:    local.get       $push99=, 3
+; CHECK-NEXT:    i32.const       $push32=, 16
+; CHECK-NEXT:    i32.add         $push33=, $pop99, $pop32
+; CHECK-NEXT:    i32.const       $push65=, 8
+; CHECK-NEXT:    i32.add         $push22=, $pop33, $pop65
+; CHECK-NEXT:    i64.load        $push23=, 0($pop22)
+; CHECK-NEXT:    call    tanhl, $pop100, $pop24, $pop23
+; CHECK-NEXT:    local.get       $push102=, 0
+; CHECK-NEXT:    local.get       $push101=, 3
+; CHECK-NEXT:    i32.const       $push64=, 8
+; CHECK-NEXT:    i32.add         $push25=, $pop101, $pop64
+; CHECK-NEXT:    i64.load        $push26=, 0($pop25)
+; CHECK-NEXT:    i64.store       8($pop102), $pop26
+; CHECK-NEXT:    local.get       $push104=, 0
+; CHECK-NEXT:    local.get       $push103=, 3
+; CHECK-NEXT:    i64.load        $push27=, 0($pop103)
+; CHECK-NEXT:    i64.store       0($pop104), $pop27
+; CHECK-NEXT:    local.get       $push105=, 3
+; CHECK-NEXT:    i32.const       $push30=, 144
+; CHECK-NEXT:    i32.add         $push31=, $pop105, $pop30
+; CHECK-NEXT:    global.set      __stack_pointer, $pop31
+; CHECK-NEXT:    return
+  ; libm calls
+  %d = call fp128 @llvm.sin.f128(fp128 %x)
+  %e = call fp128 @llvm.cos.f128(fp128 %d)
+  %f = call fp128 @llvm.tan.f128(fp128 %e)
+  %g = call fp128 @llvm.asin.f128.i32(fp128 %f)
+  %h = call fp128 @llvm.acos.f128(fp128 %g)
+  %i = call fp128 @llvm.atan.f128(fp128 %h)
+  %a = call fp128 @llvm.sinh.f128(fp128 %i)
+  %b = call fp128 @llvm.cosh.f128(fp128 %a)
+  %c = call fp128 @llvm.tanh.f128(fp128 %b)
+  ret fp128 %c
+}
+
+define double @f64libcalls(double %x) {
+; CHECK-LABEL: f64libcalls:
+; CHECK:         .functype f64libcalls (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push9=, 0
+; CHECK-NEXT:    call    $push0=, sin, $pop9
+; CHECK-NEXT:    call    $push1=, cos, $pop0
+; CHECK-NEXT:    call    $push2=, tan, $pop1
+; CHECK-NEXT:    call    $push3=, asin, $pop2
+; CHECK-NEXT:    call    $push4=, acos, $pop3
+; CHECK-NEXT:    call    $push5=, atan, $pop4
+; CHECK-NEXT:    call    $push6=, sinh, $pop5
+; CHECK-NEXT:    call    $push7=, cosh, $pop6
+; CHECK-NEXT:    call    $push8=, tanh, $pop7  
+; CHECK-NEXT:    return $pop8
+
+
+ %k = call double @llvm.sin.f64(double %x)
+ %a = call double @llvm.cos.f64(double %k)
+ %b = call double @llvm.tan.f64(double %a)
+ %c = call double @llvm.asin.f64(double %b)
+ %d = call double @llvm.acos.f64(double %c)
+ %e = call double @llvm.atan.f64(double %d)
+ %f = call double @llvm.sinh.f64(double %e)
+ %g = call double @llvm.cosh.f64(double %f)
+ %h = call double @llvm.tanh.f64(double %g)
+ ret double %h
+}
+
+define float @f32libcalls(float %x) {
+; CHECK-LABEL: f32libcalls:
+; CHECK:         .functype f32libcalls (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push9=, 0
+; CHECK-NEXT:    call    $push0=, sinf, $pop9
+; CHECK-NEXT:    call    $push1=, cosf, $pop0
+; CHECK-NEXT:    call    $push2=, tanf, $pop1
+; CHECK-NEXT:    call    $push3=, asinf, $pop2
+; CHECK-NEXT:    call    $push4=, acosf, $pop3
+; CHECK-NEXT:    call    $push5=, atanf, $pop4
+; CHECK-NEXT:    call    $push6=, sinhf, $pop5
+; CHECK-NEXT:    call    $push7=, coshf, $pop6
+; CHECK-NEXT:    call    $push8=, tanhf, $pop7  
+; CHECK-NEXT:    return $pop8
+
+
+ %k = call float @llvm.sin.f32(float %x)
+ %a = call float @llvm.cos.f32(float %k)
+ %b = call float @llvm.tan.f32(float %a)
+ %c = call float @llvm.asin.f32(float %b)
+ %d = call float @llvm.acos.f32(float %c)
+ %e = call float @llvm.atan.f32(float %d)
+ %f = call float @llvm.sinh.f32(float %e)
+ %g = call float @llvm.cosh.f32(float %f)
+ %h = call float @llvm.tanh.f32(float %g)
+ ret float %h
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll
new file mode 100644
index 0000000000000..97f6d797a63bd
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll
@@ -0,0 +1,126 @@
+; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-eh -wasm-enable-sjlj -S | FileCheck %s
+
+target triple = "wasm32-unknown-emscripten"
+
+%struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] }
+@buf = internal global [1 x %struct.__jmp_buf_tag] zeroinitializer, align 16
+
+; When longjmpable calls are coverted into invokes in Wasm SjLj transformation
+; and their unwind destination is an existing catchpad or cleanuppad due to
+; maintain the scope structure, the new pred BBs created by invokes and the
+; correct incoming values should be added the existing phis in those unwind
+; destinations.
+
+; When longjmpable calls are within a cleanuppad.
+define void @longjmpable_invoke_phi0() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-LABEL: @longjmpable_invoke_phi0
+entry:
+  %val.entry = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  %0 = call i32 @setjmp(ptr @buf) #2
+  invoke void @foo()
+          to label %bb1 unwind label %ehcleanup1
+
+bb1:                                              ; preds = %entry
+  ; We use llvm.wasm.memory.size intrinsic just to get/use an i32 value. The
+  ; reason we use an intrinsic here is to make it not longjmpable. If this can
+  ; longjmp, the result will be more complicated and hard to check.
+  %val.bb1 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  invoke void @foo()
+          to label %bb2 unwind label %ehcleanup0
+
+bb2:                                              ; preds = %bb1
+  unreachable
+
+ehcleanup0:                                       ; preds = %bb1
+  %1 = cleanuppad within none []
+  call void @longjmpable() [ "funclet"(token %1) ]
+; CHECK:      ehcleanup0
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc unwind label %ehcleanup1
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %bb3 unwind label %ehcleanup1
+
+bb3:                                              ; preds = %ehcleanup0
+  %val.bb3 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  call void @longjmpable() [ "funclet"(token %1) ]
+; CHECK:      bb3:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc1 unwind label %ehcleanup1
+  cleanupret from %1 unwind label %ehcleanup1
+
+ehcleanup1:                                       ; preds = %bb3, %ehcleanup0, %entry
+  %phi = phi i32 [ %val.entry, %entry ], [ %val.bb1, %ehcleanup0 ], [ %val.bb3, %bb3 ]
+; CHECK:      ehcleanup1:
+; CHECK-NEXT:   %phi = phi i32 [ %val.entry2, %entry.split.split ], [ %val.bb1, %.noexc ], [ %val.bb3, %.noexc1 ], [ %val.bb1, %ehcleanup0 ], [ %val.bb3, %bb3 ]
+  %2 = cleanuppad within none []
+  %3 = call i32 @llvm.wasm.memory.size.i32(i32 %phi)
+  cleanupret from %2 unwind to caller
+}
+
+; When longjmpable calls are within a catchpad.
+define void @longjmpable_invoke_phi1() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-LABEL: @longjmpable_invoke_phi1
+entry:
+  %val.entry = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  %0 = call i32 @setjmp(ptr @buf) #2
+  invoke void @foo()
+          to label %bb1 unwind label %ehcleanup
+
+bb1:                                              ; preds = %entry
+  %val.bb1 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  invoke void @foo()
+          to label %bb2 unwind label %catch.dispatch
+
+bb2:                                              ; preds = %bb1
+  unreachable
+
+catch.dispatch:                                   ; preds = %bb1
+  %1 = catchswitch within none [label %catch.start] unwind label %ehcleanup
+
+catch.start:                                      ; preds = %catch.dispatch
+  %2 = catchpad within %1 [ptr null]
+  %3 = call ptr @llvm.wasm.get.exception(token %2)
+  %4 = call i32 @llvm.wasm.get.ehselector(token %2)
+  call void @longjmpable() [ "funclet"(token %2) ]
+; CHECK:      catch.start:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc unwind label %ehcleanup
+  invoke void @foo() [ "funclet"(token %2) ]
+          to label %bb3 unwind label %ehcleanup
+
+bb3:                                              ; preds = %catch.start
+  %val.bb3 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  call void @longjmpable() [ "funclet"(token %2) ]
+; CHECK:      bb3:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc1 unwind label %ehcleanup
+  invoke void @foo() [ "funclet"(token %2) ]
+          to label %bb4 unwind label %ehcleanup
+
+bb4:                                              ; preds = %bb3
+  unreachable
+
+ehcleanup:                                        ; preds = %bb3, %catch.start, %catch.dispatch, %entry
+  %phi = phi i32 [ %val.entry, %entry ], [ %val.bb1, %catch.dispatch ], [ %val.bb1, %catch.start ], [ %val.bb3, %bb3 ]
+; CHECK:      ehcleanup:
+; CHECK-NEXT:   %phi = phi i32 [ %val.entry2, %entry.split.split ], [ %val.bb1, %catch.dispatch ], [ %val.bb1, %.noexc ], [ %val.bb3, %.noexc1 ], [ %val.bb1, %catch.start ], [ %val.bb3, %bb3 ]
+  %5 = cleanuppad within none []
+  %6 = call i32 @llvm.wasm.memory.size.i32(i32 %phi)
+  cleanupret from %5 unwind to caller
+}
+
+declare i32 @setjmp(ptr)
+declare i32 @__gxx_wasm_personality_v0(...)
+declare void @foo()
+declare void @longjmpable()
+declare void @use_i32(i32)
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i32 @llvm.wasm.get.ehselector(token) #0
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.wasm.get.exception(token) #0
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare i32 @llvm.wasm.memory.size.i32(i32) #1
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(read) }
+attributes #2 = { returns_twice }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 67388b688e3bb..185c46aa5681e 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -12499,6 +12499,210 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %a
 }
 
+define <4 x float> @pmin_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmin_v4f32_fast_olt:
+; SIMD128:         .functype pmin_v4f32_fast_olt (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v4f32_fast_olt:
+; SIMD128-FAST:         .functype pmin_v4f32_fast_olt (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $1, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v4f32_fast_olt:
+; NO-SIMD128:         .functype pmin_v4f32_fast_olt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.lt $push0=, $8, $4
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.lt $push2=, $7, $3
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.lt $push4=, $6, $2
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.lt $push6=, $5, $1
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v4f32_fast_olt:
+; NO-SIMD128-FAST:         .functype pmin_v4f32_fast_olt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.lt $push0=, $5, $1
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $6, $2
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.lt $push4=, $7, $3
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $8, $4
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast olt <4 x float> %y, %x
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmin_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmin_v4f32_fast_ogt:
+; SIMD128:         .functype pmin_v4f32_fast_ogt (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v4f32_fast_ogt:
+; SIMD128-FAST:         .functype pmin_v4f32_fast_ogt (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v4f32_fast_ogt:
+; NO-SIMD128:         .functype pmin_v4f32_fast_ogt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.gt $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.gt $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.gt $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.gt $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v4f32_fast_ogt:
+; NO-SIMD128-FAST:         .functype pmin_v4f32_fast_ogt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.gt $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.gt $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.gt $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.gt $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast ogt <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmin_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmin_v4f32_fast_ole:
+; SIMD128:         .functype pmin_v4f32_fast_ole (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v4f32_fast_ole:
+; SIMD128-FAST:         .functype pmin_v4f32_fast_ole (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $1, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v4f32_fast_ole:
+; NO-SIMD128:         .functype pmin_v4f32_fast_ole (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.le $push0=, $8, $4
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.le $push2=, $7, $3
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.le $push4=, $6, $2
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.le $push6=, $5, $1
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v4f32_fast_ole:
+; NO-SIMD128-FAST:         .functype pmin_v4f32_fast_ole (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.le $push0=, $5, $1
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.le $push2=, $6, $2
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.le $push4=, $7, $3
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.le $push6=, $8, $4
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast ole <4 x float> %y, %x
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmin_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmin_v4f32_fast_oge:
+; SIMD128:         .functype pmin_v4f32_fast_oge (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v4f32_fast_oge:
+; SIMD128-FAST:         .functype pmin_v4f32_fast_oge (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v4f32_fast_oge:
+; NO-SIMD128:         .functype pmin_v4f32_fast_oge (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.ge $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.ge $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.ge $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.ge $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v4f32_fast_oge:
+; NO-SIMD128-FAST:         .functype pmin_v4f32_fast_oge (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.ge $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.ge $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.ge $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.ge $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast oge <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
 define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
 ; SIMD128-LABEL: pmin_int_v4f32:
 ; SIMD128:         .functype pmin_int_v4f32 (v128, v128) -> (v128)
@@ -12619,6 +12823,210 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %a
 }
 
+define <4 x float> @pmax_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmax_v4f32_fast_ogt:
+; SIMD128:         .functype pmax_v4f32_fast_ogt (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v4f32_fast_ogt:
+; SIMD128-FAST:         .functype pmax_v4f32_fast_ogt (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v4f32_fast_ogt:
+; NO-SIMD128:         .functype pmax_v4f32_fast_ogt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.gt $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.gt $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.gt $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.gt $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v4f32_fast_ogt:
+; NO-SIMD128-FAST:         .functype pmax_v4f32_fast_ogt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.gt $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.gt $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.gt $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.gt $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast ogt <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmax_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmax_v4f32_fast_olt:
+; SIMD128:         .functype pmax_v4f32_fast_olt (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v4f32_fast_olt:
+; SIMD128-FAST:         .functype pmax_v4f32_fast_olt (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v4f32_fast_olt:
+; NO-SIMD128:         .functype pmax_v4f32_fast_olt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.lt $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.lt $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.lt $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.lt $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v4f32_fast_olt:
+; NO-SIMD128-FAST:         .functype pmax_v4f32_fast_olt (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.lt $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.lt $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast olt <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmax_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmax_v4f32_fast_oge:
+; SIMD128:         .functype pmax_v4f32_fast_oge (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v4f32_fast_oge:
+; SIMD128-FAST:         .functype pmax_v4f32_fast_oge (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v4f32_fast_oge:
+; NO-SIMD128:         .functype pmax_v4f32_fast_oge (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.ge $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.ge $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.ge $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.ge $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v4f32_fast_oge:
+; NO-SIMD128-FAST:         .functype pmax_v4f32_fast_oge (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.ge $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.ge $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.ge $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.ge $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast oge <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %a
+}
+
+define <4 x float> @pmax_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmax_v4f32_fast_ole:
+; SIMD128:         .functype pmax_v4f32_fast_ole (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v4f32_fast_ole:
+; SIMD128-FAST:         .functype pmax_v4f32_fast_ole (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v4f32_fast_ole:
+; NO-SIMD128:         .functype pmax_v4f32_fast_ole (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.le $push0=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT:    f32.store 12($0), $pop1
+; NO-SIMD128-NEXT:    f32.le $push2=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f32.le $push4=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    f32.le $push6=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v4f32_fast_ole:
+; NO-SIMD128-FAST:         .functype pmax_v4f32_fast_ole (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.le $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.le $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.le $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    f32.le $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 12($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
+  %c = fcmp fast ole <4 x float> %x, %y
+  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+  ret <4 x float> %a
+}
+
 define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
 ; SIMD128-LABEL: pmax_int_v4f32:
 ; SIMD128:         .functype pmax_int_v4f32 (v128, v128) -> (v128)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll b/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll
new file mode 100644
index 0000000000000..f022c3e745125
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -verify-machineinstrs -mattr=+relaxed-simd | FileCheck %s
+
+; Test that setting "relaxed-simd" target feature set also implies 'simd128' in
+; AssemblerPredicate, which is used to verify instructions in AsmPrinter.
+
+target triple = "wasm32-unknown-unknown"
+
+declare <2 x i64> @llvm.wasm.relaxed.laneselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; The compiled result of this function uses LOCAL_GET_V128, which is predicated
+; on the 'simd128' feature. We should be able to compile this when only
+; 'relaxed-simd' is set, which implies 'simd128'.
+define <2 x i64> @test(<2 x i64>, <2 x i64>, <2 x i64>) #0 {
+; CHECK-LABEL: test:
+; CHECK:         .functype  test (v128, v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get  0
+; CHECK-NEXT:    local.get  1
+; CHECK-NEXT:    local.get  2
+; CHECK-NEXT:    i64x2.relaxed_laneselect
+start:
+  %_4 = tail call <2 x i64> @llvm.wasm.relaxed.laneselect.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) #3
+  ret <2 x i64> %_4
+}
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 4b1a1a8a0c5b6..1d194b640eab2 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -791,10 +791,8 @@ define double @pairwise_max_v2f64_fast(<2 x double> %arg) {
 ; SIMD128-LABEL: pairwise_max_v2f64_fast:
 ; SIMD128:         .functype pairwise_max_v2f64_fast (v128) -> (f64)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
-; SIMD128-NEXT:    f64x2.gt $push0=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    f64x2.pmax $push1=, $0, $pop0
 ; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
 ; SIMD128-NEXT:    return $pop2
   %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg)
@@ -821,15 +819,11 @@ define float @pairwise_max_v4f32_fast(<4 x float> %arg) {
 ; SIMD128-LABEL: pairwise_max_v4f32_fast:
 ; SIMD128:         .functype pairwise_max_v4f32_fast (v128) -> (f32)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
-; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
-; SIMD128-NEXT:    f32x4.gt $push0=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
-; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
-; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
-; SIMD128-NEXT:    f32x4.gt $push1=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.pmax $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.pmax $push2=, $pop4, $pop1
 ; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
 ; SIMD128-NEXT:    return $pop3
   %res = tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
@@ -868,10 +862,8 @@ define double @pairwise_min_v2f64_fast(<2 x double> %arg) {
 ; SIMD128-LABEL: pairwise_min_v2f64_fast:
 ; SIMD128:         .functype pairwise_min_v2f64_fast (v128) -> (f64)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
-; SIMD128-NEXT:    f64x2.lt $push0=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    f64x2.pmin $push1=, $0, $pop0
 ; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
 ; SIMD128-NEXT:    return $pop2
   %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg)
@@ -898,15 +890,11 @@ define float @pairwise_min_v4f32_fast(<4 x float> %arg) {
 ; SIMD128-LABEL: pairwise_min_v4f32_fast:
 ; SIMD128:         .functype pairwise_min_v4f32_fast (v128) -> (f32)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
-; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
-; SIMD128-NEXT:    f32x4.lt $push0=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
-; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
-; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
-; SIMD128-NEXT:    f32x4.lt $push1=, $0, $1
-; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.pmin $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.pmin $push2=, $pop4, $pop1
 ; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
 ; SIMD128-NEXT:    return $pop3
   %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
index 69f733461efc7..ba40c5c4627d9 100644
--- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
+++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
@@ -25,10 +25,8 @@ define void @test(<1 x i64> %c64, <1 x i64> %mask1, ptr %P) {
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
-	%tmp4 = bitcast <1 x i64> %mask1 to x86_mmx		; <x86_mmx> [#uses=1]
-	%tmp6 = bitcast <1 x i64> %c64 to x86_mmx		; <x86_mmx> [#uses=1]
-	tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, ptr %P )
+	tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %mask1, <1 x i64> %c64, ptr %P )
 	ret void
 }
 
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
diff --git a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
index 79b06ba836af2..6c586782420e1 100644
--- a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
+++ b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | FileCheck %s
 
-@R = external global x86_mmx		; <ptr> [#uses=1]
+@R = external global <1 x i64>		; <ptr> [#uses=1]
 
 define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
 ; CHECK-LABEL: foo:
@@ -14,13 +14,11 @@ define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
 ; CHECK-NEXT:    emms
 ; CHECK-NEXT:    retq
 entry:
-	%tmp4 = bitcast <1 x i64> %B to x86_mmx		; <<4 x i16>> [#uses=1]
-	%tmp6 = bitcast <1 x i64> %A to x86_mmx		; <<4 x i16>> [#uses=1]
-	%tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 )		; <x86_mmx> [#uses=1]
-	store x86_mmx %tmp7, ptr @R
+	%tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w( <1 x i64> %A, <1 x i64> %B )		; <<1 x i64>> [#uses=1]
+	store <1 x i64> %tmp7, ptr @R
 	tail call void @llvm.x86.mmx.emms( )
 	ret void
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
 declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index d439e827e8199..0c792644fc5c8 100644
--- a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -5,15 +5,15 @@ entry:
 	tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind 
 	tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
 	tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
-	%tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind 		; <x86_mmx> [#uses=1]
+	%tmp1 = tail call <1 x i64> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind 		; <<1 x i64>> [#uses=1]
 	tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
 	tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
-	%tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind 		; <i32> [#uses=1]
+	%tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef ) nounwind 		; <i32> [#uses=1]
 	tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
 	tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
-	tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind 
+	tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef, i32 undef, i32 %tmp3 ) nounwind 
 	tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
 	tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind 
-	%tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind 		; <i32> [#uses=0]
+	%tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> %tmp1 ) nounwind 		; <i32> [#uses=0]
 	ret i32 undef
 }
diff --git a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
index 594edbaad2944..4a4477823a61d 100644
--- a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
+++ b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -17,13 +17,13 @@ entry:
 	br i1 false, label %bb.nph144.split, label %bb133
 
 bb.nph144.split:		; preds = %entry
-        %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx
-        %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx
-	tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, ptr null ) nounwind
+        %tmp = bitcast <8 x i8> zeroinitializer to <1 x i64>
+        %tmp2 = bitcast <8 x i8> zeroinitializer to <1 x i64>
+	tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp, <1 x i64> %tmp2, ptr null ) nounwind
 	unreachable
 
 bb133:		; preds = %entry
 	ret void
 }
 
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index ac86279ca6667..20673a177ac31 100644
--- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -26,25 +26,44 @@ entry:
 
 ; This is how to get MMX instructions.
 
-define <2 x double> @a2(x86_mmx %x) nounwind {
+define <2 x double> @a2(<1 x i64> %x) nounwind {
 ; CHECK-LABEL: a2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    cvtpi2pd %mm0, %xmm0
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    cvtpi2pd (%esp), %xmm0
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 entry:
-  %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
+  %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %x)
   ret <2 x double> %y
 }
 
-define x86_mmx @b2(<2 x double> %x) nounwind {
+define <1 x i64> @b2(<2 x double> %x) nounwind {
 ; CHECK-LABEL: b2:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    cvttpd2pi %xmm0, %mm0
+; CHECK-NEXT:    movq %mm0, (%esp)
+; CHECK-NEXT:    movl (%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 entry:
-  %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
-  ret x86_mmx %y
+  %y = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
+  ret <1 x i64> %y
 }
 
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>)
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>)
diff --git a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
index 306aeed1ace3e..582ebb9bdcfd1 100644
--- a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
+++ b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
@@ -3,14 +3,14 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "i386-apple-macosx10.6.6"
 
-%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+%0 = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
 
 define i32 @pixman_fill_mmx(ptr nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp {
 entry:
   %conv = zext i32 %xor to i64
   %shl = shl nuw i64 %conv, 32
   %or = or i64 %shl, %conv
-  %0 = bitcast i64 %or to x86_mmx
+  %0 = bitcast i64 %or to <1 x i64>
 ; CHECK:      movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}}
 ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
 ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
@@ -18,7 +18,7 @@ entry:
 ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
 ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
 ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
-  %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0
+  %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %0) nounwind, !srcloc !0
   %asmresult = extractvalue %0 %1, 0
   %asmresult6 = extractvalue %0 %1, 1
   %asmresult7 = extractvalue %0 %1, 2
@@ -34,7 +34,7 @@ entry:
 ; CHECK-NEXT: movq {{%mm[0-7]}},
 ; CHECK-NEXT: movq {{%mm[0-7]}},
 ; CHECK-NEXT: movq {{%mm[0-7]}},
-  tail call void asm sideeffect "movq\09$1,\09  ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1
+  tail call void asm sideeffect "movq\09$1,\09  ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, <1 x i64> %0, <1 x i64> %asmresult, <1 x i64> %asmresult6, <1 x i64> %asmresult7, <1 x i64> %asmresult8, <1 x i64> %asmresult9, <1 x i64> %asmresult10, <1 x i64> %asmresult11) nounwind, !srcloc !1
   tail call void @llvm.x86.mmx.emms() nounwind
   ret i32 1
 }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/phi.ll b/llvm/test/CodeGen/X86/GlobalISel/phi.ll
deleted file mode 100644
index b29540f002598..0000000000000
--- a/llvm/test/CodeGen/X86/GlobalISel/phi.ll
+++ /dev/null
@@ -1,172 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
-
-define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
-; ALL-LABEL: test_i8:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB0_2
-; ALL-NEXT:  # %bb.1:
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    # kill: def $al killed $al killed $eax
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB0_2: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    # kill: def $al killed $al killed $eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i8 %cond
-}
-
-define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
-; ALL-LABEL: test_i16:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB1_2
-; ALL-NEXT:  # %bb.1:
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB1_2: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i16 %cond
-}
-
-define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
-; ALL-LABEL: test_i32:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB2_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB2_1: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i32 %cond
-}
-
-define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
-; ALL-LABEL: test_i64:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB3_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB3_1: # %cond.false
-; ALL-NEXT:    movq %rdx, %rax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i64 %cond
-}
-
-define float @test_float(i32 %a, float %f, float %t) {
-; ALL-LABEL: test_float:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    je .LBB4_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB4_1: # %cond.false
-; ALL-NEXT:    movaps %xmm1, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
-  ret float %cond
-}
-
-define double @test_double(i32 %a, double %f, double %t) {
-; ALL-LABEL: test_double:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    je .LBB5_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB5_1: # %cond.false
-; ALL-NEXT:    movaps %xmm1, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
-  ret double %cond
-}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 79c954f2ce540..014e54027f55e 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -25,7 +25,6 @@
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Instrument function entry/exit with calls to e.g. mcount() (post inlining)
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index 11719be4ab5cd..d1f07b9eaadcb 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -289,7 +289,6 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
@@ -325,7 +324,6 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/apx/i386-ndd.ll b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
new file mode 100644
index 0000000000000..146a99340eb66
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+cmov,+ndd < %s | FileCheck %s
+define i32 @test(i1 %cmp, i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmovnel %eax, %ecx
+; CHECK-NEXT:    movl (%ecx), %eax
+; CHECK-NEXT:    retl
+entry:
+  %cmov = select i1 %cmp, i32 %x, i32 %y
+  ret i32 %cmov
+}
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
index 1436d396bd077..084e54235330c 100644
--- a/llvm/test/CodeGen/X86/apx/setzucc.ll
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -46,3 +46,46 @@ define i64 @i64(i64 %x) nounwind {
   %if = select i1 %t0, i64 1, i64 0
   ret i64 %if
 }
+
+define i32 @flags_copy_lowering() nounwind {
+; CHECK-LABEL: flags_copy_lowering:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB4_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addl %edx, 0
+; CHECK-NEXT:    setb %sil
+; CHECK-NEXT:    adcl $0, %ecx
+; CHECK-NEXT:    testb %sil, %sil
+; CHECK-NEXT:    setzune %dl
+; CHECK-NEXT:    testb %sil, %sil
+; CHECK-NEXT:    je .LBB4_3
+; CHECK-NEXT:  # %bb.2: # %bb1
+; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB4_1
+; CHECK-NEXT:  .LBB4_3: # %bb2
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ 0, %bb ], [ %zext, %bb1 ]
+  %phi2 = phi i32 [ 0, %bb ], [ %add3, %bb1 ]
+  %load = load i32, ptr null, align 4
+  %add = add i32 %load, %phi
+  store i32 %add, ptr null, align 4
+  %icmp = icmp ugt i32 %phi, %add
+  %zext = zext i1 %icmp to i32
+  %add3 = add i32 %phi2, %zext
+  %icmp4 = icmp ult i32 %phi2, 0
+  %and = and i1 %icmp, false
+  br i1 %and, label %bb1, label %bb2
+
+bb2:
+  ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll
new file mode 100644
index 0000000000000..da16a7da48ca6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    adcq %rcx, %rsi
+; CHECK-NEXT:    setb %cl
+; CHECK-NEXT:    shrdq $1, %rsi, %rax
+; CHECK-NEXT:    movzbl %cl, %edx
+; CHECK-NEXT:    shldq $63, %rsi, %rdx
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %y, %x
+  %lshr = lshr i128 %xor, 1
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movq %rcx, %rbx
+; CHECK-NEXT:    movq %rdx, %r14
+; CHECK-NEXT:    movq %rsi, %r15
+; CHECK-NEXT:    movq %rdi, %r12
+; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    xorq %rdi, %r13
+; CHECK-NEXT:    movq %rcx, %rbp
+; CHECK-NEXT:    xorq %rsi, %rbp
+; CHECK-NEXT:    movq %r13, %rdi
+; CHECK-NEXT:    movq %rbp, %rsi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    shrdq $1, %rbp, %r13
+; CHECK-NEXT:    shrq %rbp
+; CHECK-NEXT:    movq %r13, %rdi
+; CHECK-NEXT:    movq %rbp, %rsi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    addq %r14, %r12
+; CHECK-NEXT:    adcq %rbx, %r15
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shrdq $1, %r15, %r12
+; CHECK-NEXT:    movzbl %al, %edx
+; CHECK-NEXT:    shldq $63, %r15, %rdx
+; CHECK-NEXT:    movq %r12, %rax
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %y, %x
+  call void @use(i128 %xor)
+  %lshr = lshr i128 %xor, 1
+  call void @use(i128 %lshr)
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+; This test case shouldn't combine because it's not
+; an avgflooru operation
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    notq %rsi
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    adcq %rcx, %rsi
+; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %x, -1
+  %and = and i128 %y, %x
+  %add = add i128 %xor, %and
+  ret i128 %add
+}
+
+; This negative test case shouldn't combine, i32 is already properly
+; handled in terms of legalization, compared to the i128
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i32 %y, %x
+  %lshr = lshr i32 %xor, 1
+  %and = and i32 %y, %x
+  %add = add i32 %lshr, %and
+  ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    setb %dil
+; CHECK-NEXT:    movzbl %dil, %edi
+; CHECK-NEXT:    shldq $63, %rdx, %rdi
+; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT:    setb %r9b
+; CHECK-NEXT:    movzbl %r9b, %r9d
+; CHECK-NEXT:    shldq $63, %r8, %r9
+; CHECK-NEXT:    shldq $63, %rsi, %rdx
+; CHECK-NEXT:    shldq $63, %rcx, %r8
+; CHECK-NEXT:    movq %r8, 16(%rax)
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %r9, 24(%rax)
+; CHECK-NEXT:    movq %rdi, 8(%rax)
+; CHECK-NEXT:    retq
+start:
+  %xor = xor <2 x i128> %y, %x
+  %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+  %and = and <2 x i128> %y, %x
+  %add = add <2 x i128> %lshr, %and
+  ret <2 x i128> %add
+}
diff --git a/llvm/test/CodeGen/X86/avgflooru-scalar.ll b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
index d21c9d65ea9c8..0c91a9da5720a 100644
--- a/llvm/test/CodeGen/X86/avgflooru-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
@@ -168,26 +168,14 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: test_fixed_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    shrdl $1, %edx, %ebx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    shldl $31, %eax, %edx
+; X86-NEXT:    shldl $31, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_fixed_i64:
@@ -208,26 +196,14 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: test_ext_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    shrdl $1, %edx, %ebx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    shldl $31, %eax, %edx
+; X86-NEXT:    shldl $31, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_ext_i64:
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index b442a6337e3b8..c69886df82bdf 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1011,23 +1011,19 @@ define float @broadcast_lifetime() nounwind {
   ret float %7
 }
 
-define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
+define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
 ; X86-LABEL: broadcast_x86_mmx:
 ; X86:       ## %bb.0: ## %bb
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_x86_mmx:
 ; X64:       ## %bb.0: ## %bb
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT:    vmovq %rdi, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X64-NEXT:    retq
 bb:
-  %tmp1 = bitcast x86_mmx %tmp to i64
+  %tmp1 = bitcast <1 x i64> %tmp to i64
   %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
   %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
   %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index b7516d30df5f6..9ac0503831eb7 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1449,31 +1449,24 @@ eintry:
   ret void
 }
 
-define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
+define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
 ; X86-LABEL: broadcast_x86_mmx:
 ; X86:       ## %bb.0: ## %bb
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: broadcast_x86_mmx:
 ; X64-AVX2:       ## %bb.0: ## %bb
-; X64-AVX2-NEXT:    movdq2q %xmm0, %mm0
-; X64-AVX2-NEXT:    movq %mm0, %rax
-; X64-AVX2-NEXT:    vmovq %rax, %xmm0
+; X64-AVX2-NEXT:    vmovq %rdi, %xmm0
 ; X64-AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: broadcast_x86_mmx:
 ; X64-AVX512VL:       ## %bb.0: ## %bb
-; X64-AVX512VL-NEXT:    movdq2q %xmm0, %mm0
-; X64-AVX512VL-NEXT:    movq %mm0, %rax
-; X64-AVX512VL-NEXT:    vpbroadcastq %rax, %xmm0
+; X64-AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm0
 ; X64-AVX512VL-NEXT:    retq
 bb:
-  %tmp1 = bitcast x86_mmx %tmp to i64
+  %tmp1 = bitcast <1 x i64> %tmp to i64
   %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
   %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
   %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll
index 061723a0966e2..fe48a96a51d3e 100644
--- a/llvm/test/CodeGen/X86/bitcast-mmx.ll
+++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll
@@ -17,9 +17,9 @@ define i32 @t0(i64 %x) nounwind {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast i64 %x to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18)
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 -18)
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   %6 = bitcast i64 %5 to <2 x i32>
@@ -52,9 +52,9 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = bitcast i64 %x to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n)
-  %2 = bitcast x86_mmx %1 to i64
+  %0 = bitcast i64 %x to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %n)
+  %2 = bitcast <1 x i64> %1 to i64
   ret i64 %2
 }
 
@@ -88,11 +88,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
 entry:
   %0 = insertelement <2 x i32> undef, i32 %w, i32 0
   %1 = insertelement <2 x i32> %0, i32 0, i32 1
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n)
-  %4 = bitcast i64 %x to x86_mmx
-  %5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3)
-  %6 = bitcast x86_mmx %5 to i64
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %2, i32 %n)
+  %4 = bitcast i64 %x to <1 x i64>
+  %5 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %4, <1 x i64> %3)
+  %6 = bitcast <1 x i64> %5 to i64
   ret i64 %6
 }
 
@@ -123,14 +123,14 @@ define i64 @t3(ptr %y, ptr %n) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %y, align 8
+  %0 = load <1 x i64>, ptr %y, align 8
   %1 = load i32, ptr %n, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
 
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 4f2654843728f..e256b811ee839 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -1340,7 +1340,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; GFNI-NEXT:    pushq %r14
 ; GFNI-NEXT:    pushq %rbx
 ; GFNI-NEXT:    movq %rdi, %rax
-; GFNI-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [9241421688590303745,9241421688590303745]
+; GFNI-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
 ; GFNI-NEXT:    vmovq %xmm1, %r10
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index 842aced4884f7..ac172d32c6d8b 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=0 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4
 
 @yydebug = dso_local global i32 0, align 4
 
@@ -110,6 +111,20 @@ define void @func_large() !prof !0 {
 ; CHECK3: b7
 ; CHECK3: b8
 ; CHECK3: b9
+;
+; An expected output with function size larger than the threshold -- the layout is not modified:
+;
+; CHECK4-LABEL: func_large:
+; CHECK4: b0
+; CHECK4: b1
+; CHECK4: b2
+; CHECK4: b3
+; CHECK4: b4
+; CHECK4: b5
+; CHECK4: b6
+; CHECK4: b7
+; CHECK4: b8
+; CHECK4: b9
 
 b0:
   %0 = load i32, ptr @yydebug, align 4
diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll
index 565d9ef6eb3e6..eb627f13db08f 100644
--- a/llvm/test/CodeGen/X86/combine-pmadd.ll
+++ b/llvm/test/CodeGen/X86/combine-pmadd.ll
@@ -63,6 +63,36 @@ define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>
   ret <8 x i32> %3
 }
 
+define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: combine_pmaddwd_concat_freeze:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE-NEXT:    pmaddwd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_pmaddwd_concat_freeze:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pmaddwd_concat_freeze:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    retq
+  %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+  %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+  %flo = freeze <4 x i32> %lo
+  %fhi = freeze <4 x i32> %hi
+  %res = shufflevector <4 x i32> %flo, <4 x i32> %fhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %res
+}
+
 define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: combine_pmaddwd_demandedelts:
 ; SSE:       # %bb.0:
@@ -178,6 +208,36 @@ define <16 x i16> @combine_pmaddubsw_concat(<16 x i8> %a0, <16 x i8> %a1, <16 x
   ret <16 x i16> %3
 }
 
+define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_pmaddubsw_concat_freeze:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE-NEXT:    pmaddubsw %xmm2, %xmm0
+; SSE-NEXT:    pmaddubsw %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_pmaddubsw_concat_freeze:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    retq
+  %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+  %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+  %flo = freeze <8 x i16> %lo
+  %fhi = freeze <8 x i16> %hi
+  %res = shufflevector <8 x i16> %flo, <8 x i16> %fhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %res
+}
+
 define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: combine_pmaddubsw_demandedelts:
 ; SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index e429ac0c63c2d..d5a481549f851 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -755,3 +755,64 @@ define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
   %r = udiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
+
+define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
+; SSE2-LABEL: vector_div_leading_zeros:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: vector_div_leading_zeros:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    pmuludq %xmm2, %xmm1
+; SSE41-NEXT:    pmuludq %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vector_div_leading_zeros:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vector_div_leading_zeros:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: vector_div_leading_zeros:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; XOP-NEXT:    retq
+  %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+  %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %b
+}
diff --git a/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll b/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll
deleted file mode 100644
index 930dafc119b3c..0000000000000
--- a/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc -disable-debug-info-print -o - %s | FileCheck %s
-
-; Check that debug info isn't emitted for CodeView with
-; -disable-debug-info-print.
-
-; CHECK-NOT:      CodeViewTypes
-; CHECK-NOT:      CodeViewDebugInfo
-
-source_filename = "empty"
-target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc19.0.24215"
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: FullDebug)
-!1 = !DIFile(filename: "empty", directory: "path/to")
-!2 = !{i32 2, !"CodeView", i32 1}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index aa7b77f01d5ba..1c303de55c95d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,7 +177,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $152, %esp
+; X86-NEXT:    subl $156, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -273,42 +273,44 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %edi, %ecx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    cmovnel %esi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %ebx
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -331,34 +333,34 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 144(%esp,%eax), %edx
-; X86-NEXT:    movl 148(%esp,%eax), %esi
+; X86-NEXT:    movsbl %al, %edi
+; X86-NEXT:    movl 148(%esp,%edi), %edx
+; X86-NEXT:    movl 152(%esp,%edi), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 140(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl 144(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    shrl %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    movl 136(%esp,%eax), %eax
+; X86-NEXT:    movl 140(%esp,%edi), %edx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %eax, %ebx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
@@ -374,180 +376,176 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    movb %dl, %ch
 ; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 100(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%eax), %edx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl 104(%esp,%edx), %ebx
+; X86-NEXT:    movl 100(%esp,%edx), %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 88(%esp,%eax), %ebx
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NEXT:    movl 92(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    shrl %cl, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    shldl $1, %ecx, %ebx
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl %esi, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    shldl $1, %ebp, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    shldl $1, %eax, %ebp
+; X86-NEXT:    orl %ecx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %ebp
-; X86-NEXT:    xorl %eax, %ebx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %ebp
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %ebp, 8(%ecx)
+; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
@@ -557,12 +555,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    imull %edx, %esi
@@ -586,7 +584,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $152, %esp
+; X86-NEXT:    addl $156, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/exp10-libcall-names.ll b/llvm/test/CodeGen/X86/exp10-libcall-names.ll
index 52f0f4ac15a1b..4f993cfe74a3b 100644
--- a/llvm/test/CodeGen/X86/exp10-libcall-names.ll
+++ b/llvm/test/CodeGen/X86/exp10-libcall-names.ll
@@ -8,6 +8,8 @@
 ; RUN: llc -mtriple=x86_64-apple-ios8.0 < %s | FileCheck -check-prefix=APPLE %s
 ; RUN: llc -mtriple=x86_64-apple-tvos8.0 < %s | FileCheck -check-prefix=APPLE %s
 ; RUN: llc -mtriple=x86_64-apple-xros8.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=x86_64-apple-driverkit < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=x86_64-apple-driverkit24.0 < %s | FileCheck -check-prefix=APPLE %s
 
 ; RUN: not llc -mtriple=x86_64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
 ; Check exp10/exp10f is emitted as __exp10/__exp10f on assorted systems.
@@ -22,6 +24,11 @@ define float @test_exp10_f32(float %x) {
 ; APPLE-LABEL: test_exp10_f32:
 ; APPLE:       ## %bb.0:
 ; APPLE-NEXT:    jmp ___exp10f ## TAILCALL
+;
+; MISSED-LABEL: test_exp10_f32:
+; MISSED:       ## %bb.0:
+; MISSED-NEXT:    jmp _exp10f ## TAILCALL
+
   %ret = call float @llvm.exp10.f32(float %x)
   ret float %ret
 }
@@ -34,6 +41,11 @@ define double @test_exp10_f64(double %x) {
 ; APPLE-LABEL: test_exp10_f64:
 ; APPLE:       ## %bb.0:
 ; APPLE-NEXT:    jmp ___exp10 ## TAILCALL
+;
+; MISSED-LABEL: test_exp10_f64:
+; MISSED:       ## %bb.0:
+; MISSED-NEXT:    jmp _exp10 ## TAILCALL
+;
   %ret = call double @llvm.exp10.f64(double %x)
   ret double %ret
 }
diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 559560ac20f8a..aa637e7408f22 100644
--- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -6,9 +6,9 @@
 
   define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone {
   entry:
-    %0 = bitcast <2 x i32> %a to x86_mmx
-    %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0)
-    %2 = bitcast x86_mmx %1 to <2 x i32>
+    %0 = bitcast <2 x i32> %a to <1 x i64>
+    %1 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %0, <1 x i64> %0)
+    %2 = bitcast <1 x i64> %1 to <2 x i32>
     ret <2 x i32> %2
   }
 
diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll
index 0fbc9fab05681..64bdfd6d4f863 100644
--- a/llvm/test/CodeGen/X86/fast-isel-bc.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll
@@ -4,7 +4,7 @@
 
 ; PR4684
 
-declare void @func2(x86_mmx)
+declare void @func2(<1 x i64>)
 
 ; This isn't spectacular, but it's MMX code at -O0...
 
@@ -12,7 +12,11 @@ define void @func1() nounwind {
 ; X86-LABEL: func1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000
+; X86-NEXT:    movl $2, %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %esp, %eax
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    calll _func2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -20,12 +24,11 @@ define void @func1() nounwind {
 ; X64-LABEL: func1:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movabsq $8589934592, %rdi ## imm = 0x200000000
 ; X64-NEXT:    callq _func2
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
-  %tmp0 = bitcast <2 x i32> <i32 0, i32 2> to x86_mmx
-  call void @func2(x86_mmx %tmp0)
+  %tmp0 = bitcast <2 x i32> <i32 0, i32 2> to <1 x i64>
+  call void @func2(<1 x i64> %tmp0)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
index c13fdae540d0b..3b1a8f541b490 100644
--- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -104,12 +104,12 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
 ; ALL-NEXT:    movntq %mm0, (%rsi)
 ; ALL-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a0
-  %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
-  store x86_mmx %1, ptr %a1, align 8, !nontemporal !1
+  %0 = load <1 x i64>, ptr %a0
+  %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
+  store <1 x i64> %1, ptr %a1, align 8, !nontemporal !1
   ret void
 }
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
 
 ;
 ; 128-bit Vector Stores
diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
index cfec52c0e6886..5d4e86afc8ace 100644
--- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
@@ -177,6 +177,107 @@ define float @tan(float %x) #0 {
   ret float %result
 }
 
+define float @acos(float %x) #0 {
+; CHECK-LABEL: acos:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _acos
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+define float @asin(float %x) #0 {
+; CHECK-LABEL: asin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _asin
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+define float @atan(float %x) #0 {
+; CHECK-LABEL: atan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _atan
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+define float @cosh(float %x) #0 {
+; CHECK-LABEL: cosh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _cosh
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+define float @sinh(float %x) #0 {
+; CHECK-LABEL: sinh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _sinh
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+define float @tanh(float %x) #0 {
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _tanh
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
 attributes #0 = { strictfp }
 
 declare float @llvm.experimental.constrained.ceil.f32(float, metadata)
@@ -189,3 +290,9 @@ declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/frame-order.ll b/llvm/test/CodeGen/X86/frame-order.ll
index 2857a738e8efc..dcbcb481f927c 100644
--- a/llvm/test/CodeGen/X86/frame-order.ll
+++ b/llvm/test/CodeGen/X86/frame-order.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print < %s | FileCheck %s
-; RUN: opt -passes=strip -S < %s | llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnueabi < %s | FileCheck %s
+; RUN: opt -passes=strip -S < %s | llc -mtriple=x86_64-linux-gnueabi | FileCheck %s
 
 ; This test checks if the code is generated correctly with and without debug info.
 
diff --git a/llvm/test/CodeGen/X86/fsafdo_test1.ll b/llvm/test/CodeGen/X86/fsafdo_test1.ll
index 61c0f59aba6f8..e80a7f2f354f2 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test1.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test1.ll
@@ -4,9 +4,9 @@
 ; Check that fs-afdo discriminators are generated.
 ; V01: .loc    1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3
 ; V01: .loc    1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5
-; V0: .loc    1 9 5 is_stmt 0 discriminator 11266 # foo.c:9:5
+; V0: .loc    1 9 5 discriminator 11266 # foo.c:9:5
 ; V0: .loc    1 7 3 is_stmt 1 discriminator 11266 # foo.c:7:3
-; V1: .loc    1 9 5 is_stmt 0 discriminator 514 # foo.c:9:5
+; V1: .loc    1 9 5 discriminator 514 # foo.c:9:5
 ; V1: .loc    1 7 3 is_stmt 1 discriminator 258 # foo.c:7:3
 ; Check that variable __llvm_fs_discriminator__ is generated.
 ; V01: .type   __llvm_fs_discriminator__,@object # @__llvm_fs_discriminator__
diff --git a/llvm/test/CodeGen/X86/fsafdo_test4.ll b/llvm/test/CodeGen/X86/fsafdo_test4.ll
index 6a22ea9822412..effc72b44ade8 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test4.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test4.ll
@@ -1,11 +1,16 @@
-; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck %s
-; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck %s
+; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck --implicit-check-not=.loc %s
+; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck --implicit-check-not=.loc %s
 ;
 ; Check that fs-afdo discriminators are NOT generated, as debugInfoForProfiling is false (not set).
+; CHECK: .loc    1 7 15 prologue_end discriminator 2 # foo.c:7:15
 ; CHECK: .loc    1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3
+; CHECK: .loc    1 0 3 # foo.c:0:3
 ; CHECK: .loc    1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5
-; CHECK-NOT: .loc    1 9 5 is_stmt 0 discriminator
-; CHECK-NOT: .loc    1 7 3 is_stmt 1 discriminator
+; CHECK: .loc    1 0 5 is_stmt 0 # :0:5
+; CHECK: .loc    1 9 5 discriminator 2 # foo.c:9:5
+; CHECK: .loc    1 0 5 # :0:5
+; CHECK: .loc	   1 7 3 is_stmt 1 discriminator 2 # foo.c:7:3
+; CHECK: .loc    1 14 3 # foo.c:14:3
 ; Check that variable __llvm_fs_discriminator__ is NOT generated.
 ; CHECK-NOT: __llvm_fs_discriminator__:
 
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 5857ff1162ceb..c071f64dc66cd 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -699,7 +699,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
@@ -743,7 +743,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm9
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
@@ -751,12 +751,12 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNISSE-NEXT:    pand %xmm7, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm11
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
@@ -769,7 +769,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm13
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
@@ -819,7 +819,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNIAVX1-LABEL: var_fshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm4, %xmm6
 ; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
@@ -827,25 +827,25 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm6, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm4, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm9, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm4, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm10, %xmm4, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm10 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm8, %xmm11
 ; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm11, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm11 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm11 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm11, %xmm8, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
@@ -880,33 +880,28 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ;
 ; GFNIAVX2-LABEL: var_fshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm2, %ymm5
-; GFNIAVX2-NEXT:    vpsllw $5, %ymm5, %ymm5
-; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
-; GFNIAVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -1288,7 +1283,7 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
 ; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm5 = [16909320,16909320]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
@@ -1307,10 +1302,8 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -1328,9 +1321,9 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
 define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    por %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
@@ -1347,10 +1340,8 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -1686,9 +1677,9 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-LABEL: var_fshl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX512VL-NEXT:    vpandq %zmm7, %zmm2, %zmm2
@@ -1696,7 +1687,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm8, %ymm9
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm9, %ymm9
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm3, %ymm10
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm3, %ymm3
@@ -1716,11 +1707,11 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm8, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm7
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm3, %ymm3
@@ -1805,19 +1796,19 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pand %xmm12, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm14 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm14 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm14, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
@@ -1830,7 +1821,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm0 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm0 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm0, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm9, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
@@ -1917,7 +1908,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX1-LABEL: var_fshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
@@ -1925,25 +1916,25 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm11, %xmm10
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm9
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm9, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm13, %xmm9, %xmm12
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm13
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm13, %xmm14
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm10, %xmm10
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm15
 ; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm10 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm13, %xmm14
 ; GFNIAVX1-NEXT:    vpaddb %xmm15, %xmm15, %xmm15
@@ -2025,27 +2016,27 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ;
 ; GFNIAVX2-LABEL: var_fshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm8
 ; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm9
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm9, %ymm9
 ; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm10
 ; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
 ; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm10, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
 ; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
 ; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm0, %ymm11
 ; GFNIAVX2-NEXT:    vpandn %ymm6, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm11, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm11, %ymm0, %ymm12
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
@@ -2080,18 +2071,18 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-LABEL: var_fshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX512VL-NEXT:    vpandq %zmm6, %zmm2, %zmm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm8
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm9
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm9, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm3, %ymm10
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm10, %ymm3, %ymm3
@@ -2107,12 +2098,12 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
 ; GFNIAVX512VL-NEXT:    vpxor %ymm6, %ymm7, %ymm7
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm8
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm8, %ymm3, %ymm3
@@ -2727,7 +2718,7 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    por %xmm4, %xmm0
@@ -2761,7 +2752,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
@@ -2794,7 +2785,7 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
 define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
 ; GFNISSE-NEXT:    pmovsxwq {{.*#+}} xmm9 = [258,258]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
@@ -2824,9 +2815,9 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [258,258,258,258]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index cc077410228cb..5fd4dfa7cc262 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -415,7 +415,7 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm6 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
@@ -424,17 +424,17 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm9
 ; GFNISSE-NEXT:    por %xmm0, %xmm9
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
@@ -473,26 +473,26 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotl_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
 ; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
 ; GFNIAVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
 ; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm10
@@ -519,22 +519,17 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
@@ -581,7 +576,7 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v32i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
 ; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm7 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm8
@@ -592,16 +587,16 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    psubb %xmm2, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm9
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm10
 ; GFNISSE-NEXT:    por %xmm9, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm11
@@ -640,10 +635,10 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
@@ -652,16 +647,16 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-NEXT:    vpsubb %xmm6, %xmm7, %xmm6
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
 ; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
 ; GFNIAVX1-NEXT:    vpor %xmm8, %xmm10, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
 ; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm11
@@ -689,24 +684,19 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; GFNIAVX2-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
@@ -1075,21 +1065,15 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotl_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1161999622378488840,1161999622378488840]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_rotl_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_rotl_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1161999622378488840,1161999622378488840,1161999622378488840,1161999622378488840]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_rotl_v32i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotl_v32i8:
 ; GFNIAVX512:       # %bb.0:
@@ -1103,21 +1087,15 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
 define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [4647715923615551520,4647715923615551520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_rotr_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_rotr_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4647715923615551520,4647715923615551520,4647715923615551520,4647715923615551520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_rotr_v32i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotr_v32i8:
 ; GFNIAVX512:       # %bb.0:
@@ -1137,7 +1115,7 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm10 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
@@ -1146,17 +1124,17 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm13
 ; GFNISSE-NEXT:    por %xmm0, %xmm13
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm14
@@ -1243,26 +1221,26 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotl_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm6, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm9, %xmm11
 ; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm11
 ; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm9, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
@@ -1322,21 +1300,21 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9
 ; GFNIAVX2-NEXT:    vpor %ymm7, %ymm9, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm9
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
 ; GFNIAVX2-NEXT:    vpor %ymm9, %ymm10, %ymm9
@@ -1362,22 +1340,22 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
 ; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm10
 ; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm10, %ymm9
@@ -1422,7 +1400,7 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v64i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
 ; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm12
@@ -1433,16 +1411,16 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    psubb %xmm4, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm13
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm13
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm14
 ; GFNISSE-NEXT:    por %xmm13, %xmm14
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm14
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm15
@@ -1527,10 +1505,10 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotr_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm7, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm7, %xmm8
 ; GFNIAVX1-NEXT:    vpor %xmm6, %xmm8, %xmm8
@@ -1539,16 +1517,16 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX1-NEXT:    vpsubb %xmm9, %xmm6, %xmm9
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm9, %xmm9
 ; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm7, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm10, %xmm11
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm10, %xmm12
 ; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
 ; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm10, %xmm11
 ; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm13
@@ -1611,23 +1589,23 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; GFNIAVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
 ; GFNIAVX2-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10
 ; GFNIAVX2-NEXT:    vpor %ymm8, %ymm10, %ymm8
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm8, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
 ; GFNIAVX2-NEXT:    vpor %ymm10, %ymm11, %ymm10
@@ -1654,24 +1632,24 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [258,258,258,258]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
 ; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,1,1,1]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
 ; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm11, %ymm9
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
@@ -2257,7 +2235,7 @@ define <64 x i8> @constant_rotr_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223655728169885760,9223655728169885760]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
@@ -2266,14 +2244,14 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760]
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -2290,7 +2268,7 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
 define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [290499906672525570,290499906672525570]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
@@ -2299,14 +2277,14 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570]
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index fc57b84ab9f02..6ed524e406826 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -615,7 +615,7 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
@@ -647,13 +647,13 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_shl_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
@@ -675,12 +675,10 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_shl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
@@ -717,19 +715,19 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
@@ -756,18 +754,18 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_lshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
@@ -786,16 +784,13 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_lshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
@@ -1459,16 +1454,10 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_shl_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_shl_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [258,258,258,258]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_shl_v32i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_shl_v32i8:
 ; GFNIAVX512:       # %bb.0:
@@ -1481,21 +1470,15 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_lshr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_lshr_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_lshr_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_lshr_v32i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8:
 ; GFNIAVX512:       # %bb.0:
@@ -1508,21 +1491,15 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_ashr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [290499906672558208,290499906672558208]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,64,32,16,8,4,128,128,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_ashr_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_ashr_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [290499906672558208,290499906672558208,290499906672558208,290499906672558208]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_ashr_v32i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8:
 ; GFNIAVX512:       # %bb.0:
@@ -1547,7 +1524,7 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
@@ -1609,13 +1586,13 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_shl_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [16909320,16909320]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
@@ -1658,11 +1635,11 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_shl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
@@ -1683,12 +1660,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_shl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
@@ -1728,19 +1705,19 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm10
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
@@ -1797,18 +1774,18 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_lshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
@@ -1848,15 +1825,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_lshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
@@ -1874,16 +1851,16 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_lshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
@@ -2999,7 +2976,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: splatconstant_shl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [66052,66052,66052,66052]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -3015,7 +2992,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_lshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
@@ -3031,7 +3008,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -3047,7 +3024,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_ashr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [145249953336295552,145249953336295552]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
@@ -3063,7 +3040,7 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [145249953336295552,145249953336295552,145249953336295552,145249953336295552]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll
index a7ceb4a4ee6fe..920033ba1182c 100644
--- a/llvm/test/CodeGen/X86/huge-stack.ll
+++ b/llvm/test/CodeGen/X86/huge-stack.ll
@@ -7,7 +7,7 @@ define void @foo() unnamed_addr #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
 ; CHECK-NEXT:    subq %rax, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset -122
+; CHECK-NEXT:    .cfi_def_cfa_offset 8589934470
 ; CHECK-NEXT:    movb $42, -129(%rsp)
 ; CHECK-NEXT:    movb $43, -128(%rsp)
 ; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 532b2c09a9175..cc4d4c4543a51 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -2602,24 +2602,22 @@ define i1 @issubnormal_or_nan_f(float %x) {
 define i1 @issubnormal_or_zero_or_nan_f(float %x) {
 ; X86-LABEL: issubnormal_or_zero_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    sete %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT:    setge %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fabs
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT:    setge %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
   ret i1 %class
@@ -2773,24 +2771,22 @@ define i1 @not_issubnormal_or_nan_f(float %x) {
 define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
 ; X86-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    setne %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT:    setl %al
-; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fabs
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setne %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT:    setl %al
-; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
   ret i1 %class
diff --git a/llvm/test/CodeGen/X86/isel-phi.ll b/llvm/test/CodeGen/X86/isel-phi.ll
new file mode 100644
index 0000000000000..412aeac5eed6a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-phi.ll
@@ -0,0 +1,494 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86
+; RUN: llc -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86
+; TODO: enable when x87 is supported
+; llc -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86,GLOBAL-X86
+; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,GLOBAL-X64
+
+define i1 @test_i1(i1 %a, i1 %b, i1 %c, i1 %pred0, i1 %pred1) {
+; X86-LABEL: test_i1:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2: # %cond
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_4: # %cond.false
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i1:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %cond
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    testb $1, %r8b
+; X64-NEXT:    jne .LBB0_4
+; X64-NEXT:  # %bb.3: # %cond.false
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB0_4: # %cond.end
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred0, label %cond, label %cond.end
+
+cond:
+  br i1 %pred1, label %cond.true, label %cond.false
+
+cond.true:
+  br label %cond.end
+
+cond.false:
+  br label %cond.end
+
+cond.end:
+  %res = phi i1 [ %a, %entry ], [ %b, %cond.true ], [ %c, %cond.false ]
+  ret i1 %res
+}
+
+define i8 @test_i8(i8 %f, i8 %t, i1 %pred) {
+; X86-LABEL: test_i8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB1_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_2: # %cond.false
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    je .LBB1_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB1_2: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i8 %cond
+}
+
+define i16 @test_i16(i16 %f, i16 %t, i1 %pred) {
+; X86-LABEL: test_i16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB2_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_2: # %cond.false
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    je .LBB2_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB2_2: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i16 %cond
+}
+
+define i32 @test_i32(i32 %f, i32 %t, i1 %pred) {
+; X86-LABEL: test_i32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB3_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_2: # %cond.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    jne .LBB3_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:  .LBB3_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i32 %cond
+}
+
+define i64 @test_i64(i64 %f, i64 %t, i1 %pred) {
+; X86-LABEL: test_i64:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB4_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_2: # %cond.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    jne .LBB4_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:  .LBB4_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i64 %cond
+}
+
+define float @test_float(float %f, float %t, i1 %pred) {
+; X86-LABEL: test_float:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB5_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB5_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_float:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    jne .LBB5_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:  .LBB5_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+  ret float %cond
+}
+
+define double @test_double(i32 %a, double %f, double %t, i1 %pred) {
+; X86-LABEL: test_double:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB6_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB6_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_double:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %sil
+; X64-NEXT:    jne .LBB6_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:  .LBB6_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
+  ret double %cond
+}
+
+define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 %pred0, i1 %pred1, i1 %pred2) {
+; X86-LABEL: test_ptr:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB7_6
+; X86-NEXT:  # %bb.1: # %cond.true
+; X86-NEXT:    testb $1, %cl
+; X86-NEXT:    je .LBB7_3
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_6: # %cond.false
+; X86-NEXT:    testb $1, %cl
+; X86-NEXT:    je .LBB7_10
+; X86-NEXT:  # %bb.7: # %cond.false.true
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_9
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_3: # %cond.true.false
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_5
+; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_10: # %cond.false.false
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_12
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_9: # %cond.false.true.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_5: # %cond.true.false.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_12: # %cond.false.false.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; DAG-X64-LABEL: test_ptr:
+; DAG-X64:       # %bb.0: # %entry
+; DAG-X64-NEXT:    movq %rdi, %rax
+; DAG-X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; DAG-X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; DAG-X64-NEXT:    testb $1, {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    je .LBB7_4
+; DAG-X64-NEXT:  # %bb.1: # %cond.true
+; DAG-X64-NEXT:    testb $1, %r10b
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.2: # %cond.true.false
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %rsi, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.3: # %cond.true.false.false
+; DAG-X64-NEXT:    movq %rdx, %rax
+; DAG-X64-NEXT:    retq
+; DAG-X64-NEXT:  .LBB7_4: # %cond.false
+; DAG-X64-NEXT:    testb $1, %r10b
+; DAG-X64-NEXT:    je .LBB7_7
+; DAG-X64-NEXT:  # %bb.5: # %cond.false.true
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %rcx, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.6: # %cond.false.true.false
+; DAG-X64-NEXT:    movq %r8, %rax
+; DAG-X64-NEXT:    retq
+; DAG-X64-NEXT:  .LBB7_7: # %cond.false.false
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %r9, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.8: # %cond.false.false.false
+; DAG-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; DAG-X64-NEXT:  .LBB7_9: # %cond.end
+; DAG-X64-NEXT:    retq
+;
+; GLOBAL-X64-LABEL: test_ptr:
+; GLOBAL-X64:       # %bb.0: # %entry
+; GLOBAL-X64-NEXT:    movq %rdi, %rax
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; GLOBAL-X64-NEXT:    testb $1, %r11b
+; GLOBAL-X64-NEXT:    je .LBB7_4
+; GLOBAL-X64-NEXT:  # %bb.1: # %cond.true
+; GLOBAL-X64-NEXT:    testb $1, %r10b
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.2: # %cond.true.false
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %rsi, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.3: # %cond.true.false.false
+; GLOBAL-X64-NEXT:    movq %rdx, %rax
+; GLOBAL-X64-NEXT:    retq
+; GLOBAL-X64-NEXT:  .LBB7_4: # %cond.false
+; GLOBAL-X64-NEXT:    testb $1, %r10b
+; GLOBAL-X64-NEXT:    je .LBB7_7
+; GLOBAL-X64-NEXT:  # %bb.5: # %cond.false.true
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %rcx, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.6: # %cond.false.true.false
+; GLOBAL-X64-NEXT:    movq %r8, %rax
+; GLOBAL-X64-NEXT:    retq
+; GLOBAL-X64-NEXT:  .LBB7_7: # %cond.false.false
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %r9, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.8: # %cond.false.false.false
+; GLOBAL-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; GLOBAL-X64-NEXT:  .LBB7_9: # %cond.end
+; GLOBAL-X64-NEXT:    retq
+entry:
+  br i1 %pred0, label %cond.true, label %cond.false
+
+cond.true:
+  br i1 %pred1, label %cond.end, label %cond.true.false
+
+cond.true.false:
+  br i1 %pred2, label %cond.true.false.true, label %cond.true.false.false
+
+cond.true.false.true:
+  br label %cond.end
+
+cond.true.false.false:
+  br label %cond.end
+
+cond.false:
+  br i1 %pred1, label %cond.false.true, label %cond.false.false
+
+cond.false.true:
+  br i1 %pred2, label %cond.false.true.true, label %cond.false.true.false
+
+cond.false.true.true:
+  br label %cond.end
+
+cond.false.true.false:
+  br label %cond.end
+
+cond.false.false:
+  br i1 %pred2, label %cond.false.false.true, label %cond.false.false.false
+
+cond.false.false.true:
+  br label %cond.end
+
+cond.false.false.false:
+  br label %cond.end
+
+cond.end:
+  %res = phi ptr [ %a, %cond.true ], [ %b, %cond.true.false.true ], [ %c, %cond.true.false.false ], [ %d, %cond.false.true.true ], [ %e, %cond.false.true.false ], [ %f, %cond.false.false.true ], [ %g, %cond.false.false.false ]
+  ret ptr %res
+}
+
+define x86_fp80 @test_fp80(x86_fp80 %f, x86_fp80 %t, i1 %pred) {
+; X86-LABEL: test_fp80:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB8_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB8_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; DAG-X64-LABEL: test_fp80:
+; DAG-X64:       # %bb.0: # %entry
+; DAG-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    jne .LBB8_2
+; DAG-X64-NEXT:  # %bb.1: # %cond.false
+; DAG-X64-NEXT:    fstp %st(0)
+; DAG-X64-NEXT:    fldz
+; DAG-X64-NEXT:    fxch %st(1)
+; DAG-X64-NEXT:  .LBB8_2: # %cond.end
+; DAG-X64-NEXT:    fstp %st(1)
+; DAG-X64-NEXT:    retq
+;
+; GLOBAL-X64-LABEL: test_fp80:
+; GLOBAL-X64:       # %bb.0: # %entry
+; GLOBAL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GLOBAL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    jne .LBB8_2
+; GLOBAL-X64-NEXT:  # %bb.1: # %cond.false
+; GLOBAL-X64-NEXT:    fstp %st(1)
+; GLOBAL-X64-NEXT:    fldz
+; GLOBAL-X64-NEXT:  .LBB8_2: # %cond.end
+; GLOBAL-X64-NEXT:    fstp %st(0)
+; GLOBAL-X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi x86_fp80 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret x86_fp80 %cond
+}
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f0a5e2d..f7b2538e1de2f 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -115,7 +115,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X64-NEXT:    andq $-1024, %rdi # imm = 0xFC00
 ; X64-NEXT:    andq $-1024, %rsi # imm = 0xFC00
 ; X64-NEXT:    addq %rdi, %rsi
-; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    adcl $0, %edx
 ; X64-NEXT:    shldq $54, %rsi, %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index a4dbb10e0d7a0..439d7efc2d755 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -10,30 +10,29 @@ define void @t3() nounwind  {
 ; X86-64-LABEL: t3:
 ; X86-64:       ## %bb.0:
 ; X86-64-NEXT:    movq _g_v8qi@GOTPCREL(%rip), %rax
-; X86-64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    movq (%rax), %rdi
+; X86-64-NEXT:    xorl %eax, %eax
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %tmp3 = load <8 x i8>, ptr @g_v8qi, align 8
-  %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
+  %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
   ret void
 }
 
-define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
+define void @t4(<1 x i64> %v1, <1 x i64> %v2) nounwind  {
 ; X86-64-LABEL: t4:
 ; X86-64:       ## %bb.0:
-; X86-64-NEXT:    movdq2q %xmm1, %mm0
-; X86-64-NEXT:    movdq2q %xmm0, %mm1
-; X86-64-NEXT:    movq2dq %mm1, %xmm1
-; X86-64-NEXT:    movq2dq %mm0, %xmm0
-; X86-64-NEXT:    paddb %xmm1, %xmm0
-; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    movq %rdi, %xmm0
+; X86-64-NEXT:    movq %rsi, %xmm1
+; X86-64-NEXT:    paddb %xmm0, %xmm1
+; X86-64-NEXT:    movq %xmm1, %rdi
+; X86-64-NEXT:    xorl %eax, %eax
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
-  %v1a = bitcast x86_mmx %v1 to <8 x i8>
-  %v2b = bitcast x86_mmx %v2 to <8 x i8>
+  %v1a = bitcast <1 x i64> %v1 to <8 x i8>
+  %v2b = bitcast <1 x i64> %v2 to <8 x i8>
   %tmp3 = add <8 x i8> %v1a, %v2b
-  %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
+  %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
index af116a2ac281b..d933149c5e027 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
@@ -8,26 +8,28 @@
 ; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
 ; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.
 
-@u1 = external global x86_mmx
+@u1 = external global <1 x i64>
 
-define void @t1(x86_mmx %v1) nounwind  {
+define void @t1(<1 x i64> %v1) nounwind  {
 ; X86-32-LABEL: t1:
 ; X86-32:       ## %bb.0:
-; X86-32-NEXT:    movl L_u1$non_lazy_ptr, %eax
-; X86-32-NEXT:    movq %mm0, (%eax)
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT:    movl L_u1$non_lazy_ptr, %edx
+; X86-32-NEXT:    movl %ecx, 4(%edx)
+; X86-32-NEXT:    movl %eax, (%edx)
 ; X86-32-NEXT:    retl
 ;
 ; X86-64-LABEL: t1:
 ; X86-64:       ## %bb.0:
-; X86-64-NEXT:    movdq2q %xmm0, %mm0
 ; X86-64-NEXT:    movq _u1@GOTPCREL(%rip), %rax
-; X86-64-NEXT:    movq %mm0, (%rax)
+; X86-64-NEXT:    movq %rdi, (%rax)
 ; X86-64-NEXT:    retq
-	store x86_mmx %v1, ptr @u1, align 8
+	store <1 x i64> %v1, ptr @u1, align 8
 	ret void
 }
 
-@u2 = external global x86_mmx
+@u2 = external global <1 x i64>
 
 define void @t2(<1 x i64> %v1) nounwind  {
 ; X86-32-LABEL: t2:
@@ -44,7 +46,6 @@ define void @t2(<1 x i64> %v1) nounwind  {
 ; X86-64-NEXT:    movq _u2@GOTPCREL(%rip), %rax
 ; X86-64-NEXT:    movq %rdi, (%rax)
 ; X86-64-NEXT:    retq
-        %tmp = bitcast <1 x i64> %v1 to x86_mmx
-	store x86_mmx %tmp, ptr @u2, align 8
+	store <1 x i64> %v1, ptr @u2, align 8
 	ret void
 }
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 68287a4feee47..5bb3b17fdf421 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    paddsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusb (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubb %xmm1, %xmm0
 ; X86-NEXT:    movdq2q %xmm0, %mm0
@@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    psubsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusb (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    paddsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubb %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    psubsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -88,53 +88,53 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = load x86_mmx, ptr %A
-  %tmp3 = load x86_mmx, ptr %B
-  %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
-  %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
+  %tmp1 = load <1 x i64>, ptr %A
+  %tmp3 = load <1 x i64>, ptr %B
+  %tmp1a = bitcast <1 x i64> %tmp1 to <8 x i8>
+  %tmp3a = bitcast <1 x i64> %tmp3 to <8 x i8>
   %tmp4 = add <8 x i8> %tmp1a, %tmp3a
-  %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
-  store x86_mmx %tmp4a, ptr %A
-  %tmp7 = load x86_mmx, ptr %B
-  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
-  store x86_mmx %tmp12, ptr %A
-  %tmp16 = load x86_mmx, ptr %B
-  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
-  store x86_mmx %tmp21, ptr %A
-  %tmp27 = load x86_mmx, ptr %B
-  %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
-  %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
+  %tmp4a = bitcast <8 x i8> %tmp4 to <1 x i64>
+  store <1 x i64> %tmp4a, ptr %A
+  %tmp7 = load <1 x i64>, ptr %B
+  %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %tmp4a, <1 x i64> %tmp7)
+  store <1 x i64> %tmp12, ptr %A
+  %tmp16 = load <1 x i64>, ptr %B
+  %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %tmp12, <1 x i64> %tmp16)
+  store <1 x i64> %tmp21, ptr %A
+  %tmp27 = load <1 x i64>, ptr %B
+  %tmp21a = bitcast <1 x i64> %tmp21 to <8 x i8>
+  %tmp27a = bitcast <1 x i64> %tmp27 to <8 x i8>
   %tmp28 = sub <8 x i8> %tmp21a, %tmp27a
-  %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
-  store x86_mmx %tmp28a, ptr %A
-  %tmp31 = load x86_mmx, ptr %B
-  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
-  store x86_mmx %tmp36, ptr %A
-  %tmp40 = load x86_mmx, ptr %B
-  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
-  store x86_mmx %tmp45, ptr %A
-  %tmp51 = load x86_mmx, ptr %B
-  %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
-  %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
+  %tmp28a = bitcast <8 x i8> %tmp28 to <1 x i64>
+  store <1 x i64> %tmp28a, ptr %A
+  %tmp31 = load <1 x i64>, ptr %B
+  %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %tmp28a, <1 x i64> %tmp31)
+  store <1 x i64> %tmp36, ptr %A
+  %tmp40 = load <1 x i64>, ptr %B
+  %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %tmp36, <1 x i64> %tmp40)
+  store <1 x i64> %tmp45, ptr %A
+  %tmp51 = load <1 x i64>, ptr %B
+  %tmp45a = bitcast <1 x i64> %tmp45 to <8 x i8>
+  %tmp51a = bitcast <1 x i64> %tmp51 to <8 x i8>
   %tmp52 = mul <8 x i8> %tmp45a, %tmp51a
-  %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
-  store x86_mmx %tmp52a, ptr %A
-  %tmp57 = load x86_mmx, ptr %B
-  %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
+  %tmp52a = bitcast <8 x i8> %tmp52 to <1 x i64>
+  store <1 x i64> %tmp52a, ptr %A
+  %tmp57 = load <1 x i64>, ptr %B
+  %tmp57a = bitcast <1 x i64> %tmp57 to <8 x i8>
   %tmp58 = and <8 x i8> %tmp52, %tmp57a
-  %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
-  store x86_mmx %tmp58a, ptr %A
-  %tmp63 = load x86_mmx, ptr %B
-  %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
+  %tmp58a = bitcast <8 x i8> %tmp58 to <1 x i64>
+  store <1 x i64> %tmp58a, ptr %A
+  %tmp63 = load <1 x i64>, ptr %B
+  %tmp63a = bitcast <1 x i64> %tmp63 to <8 x i8>
   %tmp64 = or <8 x i8> %tmp58, %tmp63a
-  %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
-  store x86_mmx %tmp64a, ptr %A
-  %tmp69 = load x86_mmx, ptr %B
-  %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
-  %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
+  %tmp64a = bitcast <8 x i8> %tmp64 to <1 x i64>
+  store <1 x i64> %tmp64a, ptr %A
+  %tmp69 = load <1 x i64>, ptr %B
+  %tmp69a = bitcast <1 x i64> %tmp69 to <8 x i8>
+  %tmp64b = bitcast <1 x i64> %tmp64a to <8 x i8>
   %tmp70 = xor <8 x i8> %tmp64b, %tmp69a
-  %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
-  store x86_mmx %tmp70a, ptr %A
+  %tmp70a = bitcast <8 x i8> %tmp70 to <1 x i64>
+  store <1 x i64> %tmp70a, ptr %A
   tail call void @llvm.x86.mmx.emms()
   ret void
 }
@@ -196,42 +196,42 @@ define void @test1(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = load x86_mmx, ptr %A
-  %tmp3 = load x86_mmx, ptr %B
-  %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
-  %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
+  %tmp1 = load <1 x i64>, ptr %A
+  %tmp3 = load <1 x i64>, ptr %B
+  %tmp1a = bitcast <1 x i64> %tmp1 to <2 x i32>
+  %tmp3a = bitcast <1 x i64> %tmp3 to <2 x i32>
   %tmp4 = add <2 x i32> %tmp1a, %tmp3a
-  %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
-  store x86_mmx %tmp4a, ptr %A
-  %tmp9 = load x86_mmx, ptr %B
-  %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
+  %tmp4a = bitcast <2 x i32> %tmp4 to <1 x i64>
+  store <1 x i64> %tmp4a, ptr %A
+  %tmp9 = load <1 x i64>, ptr %B
+  %tmp9a = bitcast <1 x i64> %tmp9 to <2 x i32>
   %tmp10 = sub <2 x i32> %tmp4, %tmp9a
-  %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
-  store x86_mmx %tmp10a, ptr %A
-  %tmp15 = load x86_mmx, ptr %B
-  %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
-  %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+  %tmp10a = bitcast <2 x i32> %tmp4 to <1 x i64>
+  store <1 x i64> %tmp10a, ptr %A
+  %tmp15 = load <1 x i64>, ptr %B
+  %tmp10b = bitcast <1 x i64> %tmp10a to <2 x i32>
+  %tmp15a = bitcast <1 x i64> %tmp15 to <2 x i32>
   %tmp16 = mul <2 x i32> %tmp10b, %tmp15a
-  %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
-  store x86_mmx %tmp16a, ptr %A
-  %tmp21 = load x86_mmx, ptr %B
-  %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
-  %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
+  %tmp16a = bitcast <2 x i32> %tmp16 to <1 x i64>
+  store <1 x i64> %tmp16a, ptr %A
+  %tmp21 = load <1 x i64>, ptr %B
+  %tmp16b = bitcast <1 x i64> %tmp16a to <2 x i32>
+  %tmp21a = bitcast <1 x i64> %tmp21 to <2 x i32>
   %tmp22 = and <2 x i32> %tmp16b, %tmp21a
-  %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
-  store x86_mmx %tmp22a, ptr %A
-  %tmp27 = load x86_mmx, ptr %B
-  %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
-  %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
+  %tmp22a = bitcast <2 x i32> %tmp22 to <1 x i64>
+  store <1 x i64> %tmp22a, ptr %A
+  %tmp27 = load <1 x i64>, ptr %B
+  %tmp22b = bitcast <1 x i64> %tmp22a to <2 x i32>
+  %tmp27a = bitcast <1 x i64> %tmp27 to <2 x i32>
   %tmp28 = or <2 x i32> %tmp22b, %tmp27a
-  %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
-  store x86_mmx %tmp28a, ptr %A
-  %tmp33 = load x86_mmx, ptr %B
-  %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
-  %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
+  %tmp28a = bitcast <2 x i32> %tmp28 to <1 x i64>
+  store <1 x i64> %tmp28a, ptr %A
+  %tmp33 = load <1 x i64>, ptr %B
+  %tmp28b = bitcast <1 x i64> %tmp28a to <2 x i32>
+  %tmp33a = bitcast <1 x i64> %tmp33 to <2 x i32>
   %tmp34 = xor <2 x i32> %tmp28b, %tmp33a
-  %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
-  store x86_mmx %tmp34a, ptr %A
+  %tmp34a = bitcast <2 x i32> %tmp34 to <1 x i64>
+  store <1 x i64> %tmp34a, ptr %A
   tail call void @llvm.x86.mmx.emms( )
   ret void
 }
@@ -239,8 +239,13 @@ entry:
 define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    paddw %xmm0, %xmm1
@@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    paddsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusw (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubw %xmm1, %xmm0
 ; X86-NEXT:    movdq2q %xmm0, %mm0
@@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    psubsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusw (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    pmullw %xmm0, %xmm1
 ; X86-NEXT:    movdq2q %xmm1, %mm0
@@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    pmulhw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    pmaddwd (%ecx), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movq %mm0, (%eax)
-; X86-NEXT:    movq2dq %mm0, %xmm0
-; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    andps %xmm0, %xmm1
-; X86-NEXT:    movlps %xmm1, (%eax)
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    orps %xmm1, %xmm0
-; X86-NEXT:    movlps %xmm0, (%eax)
-; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    xorps %xmm0, %xmm1
-; X86-NEXT:    movlps %xmm1, (%eax)
+; X86-NEXT:    andl 4(%ecx), %esi
+; X86-NEXT:    movd %esi, %xmm0
+; X86-NEXT:    andl (%ecx), %edx
+; X86-NEXT:    movd %edx, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    movq %xmm1, (%eax)
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    pxor %xmm0, %xmm1
+; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    emms
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
@@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    paddsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubw %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    psubsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    pmullw %xmm0, %xmm1
 ; X64-NEXT:    movdq2q %xmm1, %mm0
@@ -309,76 +322,75 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    pmulhw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    pmaddwd (%rsi), %mm0
+; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    movq %mm0, (%rdi)
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    andps %xmm0, %xmm1
-; X64-NEXT:    movlps %xmm1, (%rdi)
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    orps %xmm1, %xmm0
-; X64-NEXT:    movlps %xmm0, (%rdi)
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    xorps %xmm0, %xmm1
-; X64-NEXT:    movlps %xmm1, (%rdi)
+; X64-NEXT:    andq (%rsi), %rax
+; X64-NEXT:    movq %rax, %xmm0
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    por %xmm0, %xmm1
+; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = load x86_mmx, ptr %A
-  %tmp3 = load x86_mmx, ptr %B
-  %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
-  %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
+  %tmp1 = load <1 x i64>, ptr %A
+  %tmp3 = load <1 x i64>, ptr %B
+  %tmp1a = bitcast <1 x i64> %tmp1 to <4 x i16>
+  %tmp3a = bitcast <1 x i64> %tmp3 to <4 x i16>
   %tmp4 = add <4 x i16> %tmp1a, %tmp3a
-  %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
-  store x86_mmx %tmp4a, ptr %A
-  %tmp7 = load x86_mmx, ptr %B
-  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
-  store x86_mmx %tmp12, ptr %A
-  %tmp16 = load x86_mmx, ptr %B
-  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
-  store x86_mmx %tmp21, ptr %A
-  %tmp27 = load x86_mmx, ptr %B
-  %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
-  %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
+  %tmp4a = bitcast <4 x i16> %tmp4 to <1 x i64>
+  store <1 x i64> %tmp4a, ptr %A
+  %tmp7 = load <1 x i64>, ptr %B
+  %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %tmp4a, <1 x i64> %tmp7)
+  store <1 x i64> %tmp12, ptr %A
+  %tmp16 = load <1 x i64>, ptr %B
+  %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %tmp12, <1 x i64> %tmp16)
+  store <1 x i64> %tmp21, ptr %A
+  %tmp27 = load <1 x i64>, ptr %B
+  %tmp21a = bitcast <1 x i64> %tmp21 to <4 x i16>
+  %tmp27a = bitcast <1 x i64> %tmp27 to <4 x i16>
   %tmp28 = sub <4 x i16> %tmp21a, %tmp27a
-  %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
-  store x86_mmx %tmp28a, ptr %A
-  %tmp31 = load x86_mmx, ptr %B
-  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
-  store x86_mmx %tmp36, ptr %A
-  %tmp40 = load x86_mmx, ptr %B
-  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
-  store x86_mmx %tmp45, ptr %A
-  %tmp51 = load x86_mmx, ptr %B
-  %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
-  %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
+  %tmp28a = bitcast <4 x i16> %tmp28 to <1 x i64>
+  store <1 x i64> %tmp28a, ptr %A
+  %tmp31 = load <1 x i64>, ptr %B
+  %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %tmp28a, <1 x i64> %tmp31)
+  store <1 x i64> %tmp36, ptr %A
+  %tmp40 = load <1 x i64>, ptr %B
+  %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %tmp36, <1 x i64> %tmp40)
+  store <1 x i64> %tmp45, ptr %A
+  %tmp51 = load <1 x i64>, ptr %B
+  %tmp45a = bitcast <1 x i64> %tmp45 to <4 x i16>
+  %tmp51a = bitcast <1 x i64> %tmp51 to <4 x i16>
   %tmp52 = mul <4 x i16> %tmp45a, %tmp51a
-  %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
-  store x86_mmx %tmp52a, ptr %A
-  %tmp55 = load x86_mmx, ptr %B
-  %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
-  store x86_mmx %tmp60, ptr %A
-  %tmp64 = load x86_mmx, ptr %B
-  %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
-  %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
-  store x86_mmx %tmp70, ptr %A
-  %tmp75 = load x86_mmx, ptr %B
-  %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
-  %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
+  %tmp52a = bitcast <4 x i16> %tmp52 to <1 x i64>
+  store <1 x i64> %tmp52a, ptr %A
+  %tmp55 = load <1 x i64>, ptr %B
+  %tmp60 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %tmp52a, <1 x i64> %tmp55)
+  store <1 x i64> %tmp60, ptr %A
+  %tmp64 = load <1 x i64>, ptr %B
+  %tmp69 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %tmp60, <1 x i64> %tmp64)
+  store <1 x i64> %tmp69, ptr %A
+  %tmp75 = load <1 x i64>, ptr %B
+  %tmp70a = bitcast <1 x i64> %tmp69 to <4 x i16>
+  %tmp75a = bitcast <1 x i64> %tmp75 to <4 x i16>
   %tmp76 = and <4 x i16> %tmp70a, %tmp75a
-  %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
-  store x86_mmx %tmp76a, ptr %A
-  %tmp81 = load x86_mmx, ptr %B
-  %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
-  %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
+  %tmp76a = bitcast <4 x i16> %tmp76 to <1 x i64>
+  store <1 x i64> %tmp76a, ptr %A
+  %tmp81 = load <1 x i64>, ptr %B
+  %tmp76b = bitcast <1 x i64> %tmp76a to <4 x i16>
+  %tmp81a = bitcast <1 x i64> %tmp81 to <4 x i16>
   %tmp82 = or <4 x i16> %tmp76b, %tmp81a
-  %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
-  store x86_mmx %tmp82a, ptr %A
-  %tmp87 = load x86_mmx, ptr %B
-  %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
-  %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
+  %tmp82a = bitcast <4 x i16> %tmp82 to <1 x i64>
+  store <1 x i64> %tmp82a, ptr %A
+  %tmp87 = load <1 x i64>, ptr %B
+  %tmp82b = bitcast <1 x i64> %tmp82a to <4 x i16>
+  %tmp87a = bitcast <1 x i64> %tmp87 to <4 x i16>
   %tmp88 = xor <4 x i16> %tmp82b, %tmp87a
-  %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
-  store x86_mmx %tmp88a, ptr %A
+  %tmp88a = bitcast <4 x i16> %tmp88 to <1 x i64>
+  store <1 x i64> %tmp88a, ptr %A
   tail call void @llvm.x86.mmx.emms( )
   ret void
 }
@@ -574,10 +586,10 @@ define void @ti8a(double %a, double %b) nounwind {
 ; X64-NEXT:    movq %mm1, 0
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = bitcast double %a to x86_mmx
-  %tmp2 = bitcast double %b to x86_mmx
-  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
-  store x86_mmx %tmp3, ptr null
+  %tmp1 = bitcast double %a to <1 x i64>
+  %tmp2 = bitcast double %b to <1 x i64>
+  %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %tmp1, <1 x i64> %tmp2)
+  store <1 x i64> %tmp3, ptr null
   ret void
 }
 
@@ -597,10 +609,10 @@ define void @ti16a(double %a, double %b) nounwind {
 ; X64-NEXT:    movq %mm1, 0
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = bitcast double %a to x86_mmx
-  %tmp2 = bitcast double %b to x86_mmx
-  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
-  store x86_mmx %tmp3, ptr null
+  %tmp1 = bitcast double %a to <1 x i64>
+  %tmp2 = bitcast double %b to <1 x i64>
+  %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %tmp1, <1 x i64> %tmp2)
+  store <1 x i64> %tmp3, ptr null
   ret void
 }
 
@@ -620,10 +632,10 @@ define void @ti32a(double %a, double %b) nounwind {
 ; X64-NEXT:    movq %mm1, 0
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = bitcast double %a to x86_mmx
-  %tmp2 = bitcast double %b to x86_mmx
-  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
-  store x86_mmx %tmp3, ptr null
+  %tmp1 = bitcast double %a to <1 x i64>
+  %tmp2 = bitcast double %b to <1 x i64>
+  %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %tmp1, <1 x i64> %tmp2)
+  store <1 x i64> %tmp3, ptr null
   ret void
 }
 
@@ -643,10 +655,10 @@ define void @ti64a(double %a, double %b) nounwind {
 ; X64-NEXT:    movq %mm1, 0
 ; X64-NEXT:    retq
 entry:
-  %tmp1 = bitcast double %a to x86_mmx
-  %tmp2 = bitcast double %b to x86_mmx
-  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
-  store x86_mmx %tmp3, ptr null
+  %tmp1 = bitcast double %a to <1 x i64>
+  %tmp2 = bitcast double %b to <1 x i64>
+  %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %tmp1, <1 x i64> %tmp2)
+  store <1 x i64> %tmp3, ptr null
   ret void
 }
 
@@ -674,28 +686,28 @@ define i64 @pr43922() nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx bitcast (<2 x i32> <i32 2058005162, i32 2058005162> to x86_mmx), i32 268435456)
-  %1 = bitcast x86_mmx %0 to i64
+  %0 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> bitcast (<2 x i32> <i32 2058005162, i32 2058005162> to <1 x i64>), i32 268435456)
+  %1 = bitcast <1 x i64> %0 to i64
   ret i64 %1
 }
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32)
 
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
 
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
 
 declare void @llvm.x86.mmx.emms()
 
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>)
 
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
index 0fa7b24ff445a..fb2517f5a891b 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -1,12 +1,12 @@
 ; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s
 
-; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
+; CHECK: @foo(<1 x i64> zeroinitializer)
 
 define void @bar() {
 entry:
-  %0 = bitcast double 0.0 to x86_mmx
-  %1 = call x86_mmx @foo(x86_mmx %0)
+  %0 = bitcast double 0.0 to <1 x i64>
+  %1 = call <1 x i64> @foo(<1 x i64> %0)
   ret void
 }
 
-declare x86_mmx @foo(x86_mmx)
+declare <1 x i64> @foo(<1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll
index f914b8622fcf4..5e5be820dd5b4 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll
@@ -8,9 +8,9 @@ define i64 @t0(ptr %p) {
 ; CHECK-NEXT:    paddq %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %t = load x86_mmx, ptr %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
+  %t = load <1 x i64>, ptr %p
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t)
+  %s = bitcast <1 x i64> %u to i64
   ret i64 %s
 }
 
@@ -21,9 +21,9 @@ define i64 @t1(ptr %p) {
 ; CHECK-NEXT:    paddd %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %t = load x86_mmx, ptr %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
+  %t = load <1 x i64>, ptr %p
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %t)
+  %s = bitcast <1 x i64> %u to i64
   ret i64 %s
 }
 
@@ -34,9 +34,9 @@ define i64 @t2(ptr %p) {
 ; CHECK-NEXT:    paddw %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %t = load x86_mmx, ptr %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
+  %t = load <1 x i64>, ptr %p
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %t)
+  %s = bitcast <1 x i64> %u to i64
   ret i64 %s
 }
 
@@ -47,29 +47,27 @@ define i64 @t3(ptr %p) {
 ; CHECK-NEXT:    paddb %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %t = load x86_mmx, ptr %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
+  %t = load <1 x i64>, ptr %p
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %t)
+  %s = bitcast <1 x i64> %u to i64
   ret i64 %s
 }
 
-@R = external global x86_mmx
+@R = external global <1 x i64>
 
 define void @t4(<1 x i64> %A, <1 x i64> %B) {
 ; CHECK-LABEL: t4:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rdi, %mm0
-; CHECK-NEXT:    movq %rsi, %mm1
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
 ; CHECK-NEXT:    paddusw %mm0, %mm1
 ; CHECK-NEXT:    movq _R@GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movq %mm1, (%rax)
 ; CHECK-NEXT:    emms
 ; CHECK-NEXT:    retq
 entry:
-  %tmp2 = bitcast <1 x i64> %A to x86_mmx
-  %tmp3 = bitcast <1 x i64> %B to x86_mmx
-  %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3)
-  store x86_mmx %tmp7, ptr @R
+  %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %A, <1 x i64> %B)
+  store <1 x i64> %tmp7, ptr @R
   tail call void @llvm.x86.mmx.emms()
   ret void
 }
@@ -88,7 +86,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
   ret i64 %conv
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
 
 define <1 x i64> @t6(i64 %t) {
 ; CHECK-LABEL: t6:
@@ -98,16 +96,14 @@ define <1 x i64> @t6(i64 %t) {
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %t1 = insertelement <1 x i64> undef, i64 %t, i32 0
-  %t0 = bitcast <1 x i64> %t1 to x86_mmx
-  %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
-  %t3 = bitcast x86_mmx %t2 to <1 x i64>
-  ret <1 x i64> %t3
+  %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48)
+  ret <1 x i64> %t2
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
 declare void @llvm.x86.mmx.emms()
 
diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll
index b919c9a33ea2f..d8a010bacc683 100644
--- a/llvm/test/CodeGen/X86/mmx-build-vector.ll
+++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2    | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64
 
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
 
 ;
 ; v2i32
@@ -35,9 +35,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
   %2 = insertelement <2 x i32>    %1, i32 %a1, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -58,9 +58,9 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
   %2 = insertelement <2 x i32>    %1, i32   0, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -92,9 +92,9 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x i32> undef, i32 undef, i32 0
   %2 = insertelement <2 x i32>    %1, i32   %a1, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -119,9 +119,9 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x i32> undef, i32   0, i32 0
   %2 = insertelement <2 x i32>    %1, i32 %a1, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -153,9 +153,9 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
   %2 = insertelement <2 x i32>    %1, i32 %a0, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -194,9 +194,9 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16 %a1, i32 1
   %3 = insertelement <4 x i16>    %2, i16 %a2, i32 2
   %4 = insertelement <4 x i16>    %3, i16 %a3, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -229,9 +229,9 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16 %a1, i32 1
   %3 = insertelement <4 x i16>    %2, i16   0, i32 2
   %4 = insertelement <4 x i16>    %3, i16   0, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -254,9 +254,9 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16 undef, i32 1
   %3 = insertelement <4 x i16>    %2, i16 undef, i32 2
   %4 = insertelement <4 x i16>    %3, i16     0, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -281,9 +281,9 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16     0, i32 1
   %3 = insertelement <4 x i16>    %2, i16 undef, i32 2
   %4 = insertelement <4 x i16>    %3, i16     0, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -316,9 +316,9 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16   %a1, i32 1
   %3 = insertelement <4 x i16>    %2, i16   %a2, i32 2
   %4 = insertelement <4 x i16>    %3, i16 undef, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -353,9 +353,9 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
   %2 = insertelement <4 x i16>    %1, i16 undef, i32 1
   %3 = insertelement <4 x i16>    %2, i16   %a0, i32 2
   %4 = insertelement <4 x i16>    %3, i16   %a0, i32 3
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  store x86_mmx %6, ptr%p0
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  store <1 x i64> %6, ptr%p0
   ret void
 }
 
@@ -414,9 +414,9 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8 %a5, i32 5
   %7  = insertelement <8 x i8>    %6, i8 %a6, i32 6
   %8  = insertelement <8 x i8>    %7, i8 %a7, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -469,9 +469,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8   %a5, i32 5
   %7  = insertelement <8 x i8>    %6, i8    0,  i32 6
   %8  = insertelement <8 x i8>    %7, i8   %a7, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -522,9 +522,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8     0, i32 5
   %7  = insertelement <8 x i8>    %6, i8     0, i32 6
   %8  = insertelement <8 x i8>    %7, i8 undef, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -551,9 +551,9 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8     0, i32 5
   %7  = insertelement <8 x i8>    %6, i8     0, i32 6
   %8  = insertelement <8 x i8>    %7, i8     0, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -582,9 +582,9 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8     0, i32 5
   %7  = insertelement <8 x i8>    %6, i8     0, i32 6
   %8  = insertelement <8 x i8>    %7, i8 undef, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -626,9 +626,9 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
   %6  = insertelement <8 x i8>    %5, i8 %a0, i32 5
   %7  = insertelement <8 x i8>    %6, i8 %a0, i32 6
   %8  = insertelement <8 x i8>    %7, i8 %a0, i32 7
-  %9  = bitcast <8 x i8> %8 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
-  store x86_mmx %10, ptr%p0
+  %9  = bitcast <8 x i8> %8 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+  store <1 x i64> %10, ptr%p0
   ret void
 }
 
@@ -669,9 +669,9 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float>    %1, float %a1, i32 1
-  %3 = bitcast <2 x float> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x float> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -707,9 +707,9 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float>    %1, float 0.0, i32 1
-  %3 = bitcast <2 x float> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x float> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -742,9 +742,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x float> undef, float undef, i32 0
   %2 = insertelement <2 x float>    %1, float   %a1, i32 1
-  %3 = bitcast <2 x float> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x float> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -780,9 +780,9 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x float> undef, float 0.0, i32 0
   %2 = insertelement <2 x float>    %1, float %a1, i32 1
-  %3 = bitcast <2 x float> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x float> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
 
@@ -815,8 +815,8 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind {
 ; X64-NEXT:    retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float>    %1, float %a0, i32 1
-  %3 = bitcast <2 x float> %2 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
-  store x86_mmx %4, ptr%p0
+  %3 = bitcast <2 x float> %2 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+  store <1 x i64> %4, ptr%p0
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/mmx-coalescing.ll b/llvm/test/CodeGen/X86/mmx-coalescing.ll
index dac526fe20bbf..589f5af4bb4d6 100644
--- a/llvm/test/CodeGen/X86/mmx-coalescing.ll
+++ b/llvm/test/CodeGen/X86/mmx-coalescing.ll
@@ -42,9 +42,9 @@ entry:
   %SA2 = getelementptr inbounds %SA, ptr %pSA, i64 0, i32 4
   %v3 = load ptr, ptr %SA2, align 8
   %v4 = bitcast <1 x i64> %v0 to <4 x i16>
-  %v5 = bitcast <4 x i16> %v4 to x86_mmx
-  %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18)
-  %v7 = bitcast x86_mmx %v6 to <4 x i16>
+  %v5 = bitcast <4 x i16> %v4 to <1 x i64>
+  %v6 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v5, i8 -18)
+  %v7 = bitcast <1 x i64> %v6 to <4 x i16>
   %v8 = bitcast <4 x i16> %v7 to <1 x i64>
   %v9 = extractelement <1 x i64> %v8, i32 0
   %v10 = bitcast i64 %v9 to <2 x i32>
@@ -55,18 +55,18 @@ entry:
 if.A:
   %pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ]
   %v17 = extractelement <1 x i64> %pa, i32 0
-  %v18 = bitcast i64 %v17 to x86_mmx
-  %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2
-  %v20 = bitcast x86_mmx %v19 to i64
+  %v18 = bitcast i64 %v17 to <1 x i64>
+  %v19 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %v18, i32 %B) #2
+  %v20 = bitcast <1 x i64> %v19 to i64
   %v21 = insertelement <1 x i64> undef, i64 %v20, i32 0
   %cmp3 = icmp eq i64 %v20, 0
   br i1 %cmp3, label %if.C, label %merge
 
 if.B:
   %v34 = bitcast <1 x i64> %v8 to <4 x i16>
-  %v35 = bitcast <4 x i16> %v34 to x86_mmx
-  %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18)
-  %v37 = bitcast x86_mmx %v36 to <4 x i16>
+  %v35 = bitcast <4 x i16> %v34 to <1 x i64>
+  %v36 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v35, i8 -18)
+  %v37 = bitcast <1 x i64> %v36 to <4 x i16>
   %v38 = bitcast <4 x i16> %v37 to <1 x i64>
   br label %if.C
 
@@ -80,9 +80,9 @@ if.C:
 merge:
   %vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ]
   %v130 = bitcast <1 x i64> %vy to <4 x i16>
-  %v131 = bitcast <4 x i16> %v130 to x86_mmx
-  %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18)
-  %v133 = bitcast x86_mmx %v132 to <4 x i16>
+  %v131 = bitcast <4 x i16> %v130 to <1 x i64>
+  %v132 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v131, i8 -18)
+  %v133 = bitcast <1 x i64> %v132 to <4 x i16>
   %v134 = bitcast <4 x i16> %v133 to <1 x i64>
   %v135 = extractelement <1 x i64> %v134, i32 0
   %v136 = bitcast i64 %v135 to <2 x i32>
@@ -91,5 +91,5 @@ merge:
 }
 
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index c09c417c11c96..51a71dab37f6d 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -8,20 +8,10 @@
 define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvt_v2f64_v2i32:
@@ -33,9 +23,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
   %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
   %4 = bitcast <4 x i32> %3 to <2 x i64>
   %5 = extractelement <2 x i64> %4, i32 0
-  %6 = bitcast i64 %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
-  %8 = bitcast x86_mmx %7 to i64
+  %6 = bitcast i64 %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+  %8 = bitcast <1 x i64> %7 to i64
   %9 = insertelement <1 x i64> undef, i64 %8, i32 0
   store <1 x i64> %9, ptr %1
   ret void
@@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvtt_v2f64_v2i32:
@@ -69,9 +49,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
   %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
   %4 = bitcast <4 x i32> %3 to <2 x i64>
   %5 = extractelement <2 x i64> %4, i32 0
-  %6 = bitcast i64 %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
-  %8 = bitcast x86_mmx %7 to i64
+  %6 = bitcast i64 %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+  %8 = bitcast <1 x i64> %7 to i64
   %9 = insertelement <1 x i64> undef, i64 %8, i32 0
   store <1 x i64> %9, ptr %1
   ret void
@@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v2f64_v2i32:
@@ -103,9 +73,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
   %3 = fptosi <2 x double> %0 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to x86_mmx
-  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
-  %6 = bitcast x86_mmx %5 to i64
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %4, <1 x i64> %4)
+  %6 = bitcast <1 x i64> %5 to i64
   %7 = insertelement <1 x i64> undef, i64 %6, i32 0
   store <1 x i64> %7, ptr %1
   ret void
@@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvt_v2f32_v2i32:
@@ -139,9 +99,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
   %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
   %4 = bitcast <4 x i32> %3 to <2 x i64>
   %5 = extractelement <2 x i64> %4, i32 0
-  %6 = bitcast i64 %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
-  %8 = bitcast x86_mmx %7 to i64
+  %6 = bitcast i64 %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+  %8 = bitcast <1 x i64> %7 to i64
   %9 = insertelement <1 x i64> undef, i64 %8, i32 0
   store <1 x i64> %9, ptr %1
   ret void
@@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvtt_v2f32_v2i32:
@@ -175,9 +125,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
   %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
   %4 = bitcast <4 x i32> %3 to <2 x i64>
   %5 = extractelement <2 x i64> %4, i32 0
-  %6 = bitcast i64 %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
-  %8 = bitcast x86_mmx %7 to i64
+  %6 = bitcast i64 %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+  %8 = bitcast <1 x i64> %7 to i64
   %9 = insertelement <1 x i64> undef, i64 %8, i32 0
   store <1 x i64> %9, ptr %1
   ret void
@@ -186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v4f32_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v4f32_v4i32:
@@ -210,9 +150,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 ; X64-NEXT:    retq
   %3 = fptosi <4 x float> %0 to <4 x i32>
   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  %5 = bitcast <2 x i32> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
-  %7 = bitcast x86_mmx %6 to i64
+  %5 = bitcast <2 x i32> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+  %7 = bitcast <1 x i64> %6 to i64
   %8 = insertelement <1 x i64> undef, i64 %7, i32 0
   store <1 x i64> %8, ptr %1
   ret void
@@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v2f32_v2i32:
@@ -246,9 +176,9 @@ define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
   %3 = fptosi <4 x float> %0 to <4 x i32>
   %4 = bitcast <4 x i32> %3 to <2 x i64>
   %5 = extractelement <2 x i64> %4, i32 0
-  %6 = bitcast i64 %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
-  %8 = bitcast x86_mmx %7 to i64
+  %6 = bitcast i64 %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+  %8 = bitcast <1 x i64> %7 to i64
   %9 = insertelement <1 x i64> undef, i64 %8, i32 0
   store <1 x i64> %9, ptr %1
   ret void
@@ -280,9 +210,9 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind {
 ; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-NEXT:    retq
-  %2 = load x86_mmx, ptr %0, align 8
-  %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to i64
+  %2 = load <1 x i64>, ptr %0, align 8
+  %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to i64
   %5 = insertelement <2 x i64> undef, i64 %4, i32 0
   %6 = bitcast <2 x i64> %5 to <4 x i32>
   %7 = shufflevector <4 x i32> %6, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@@ -307,9 +237,9 @@ define <4 x float> @sitofp_v2i32_v2f32(ptr) nounwind {
 ; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
-  %2 = load x86_mmx, ptr %0, align 8
-  %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to <2 x i32>
+  %2 = load <1 x i64>, ptr %0, align 8
+  %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to <2 x i32>
   %5 = shufflevector <2 x i32> %4, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = sitofp <4 x i32> %5 to <4 x float>
   ret <4 x float> %6
@@ -339,9 +269,9 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind {
 ; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
-  %2 = load x86_mmx, ptr %0, align 8
-  %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to i64
+  %2 = load <1 x i64>, ptr %0, align 8
+  %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to i64
   %5 = insertelement <2 x i64> undef, i64 %4, i32 0
   %6 = insertelement <2 x i64> %5, i64 0, i32 1
   %7 = bitcast <2 x i64> %6 to <4 x i32>
@@ -349,7 +279,7 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind {
   ret <4 x float> %8
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 73df6be8d7989..6fe3bc4973185 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -29,13 +29,13 @@ define i64 @t0(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
 
 define i64 @t1(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t1:
@@ -64,13 +64,13 @@ define i64 @t1(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32)
 
 define i64 @t2(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t2:
@@ -99,13 +99,13 @@ define i64 @t2(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32)
 
 define i64 @t3(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t3:
@@ -134,13 +134,13 @@ define i64 @t3(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32)
 
 define i64 @t4(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t4:
@@ -169,13 +169,13 @@ define i64 @t4(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32)
 
 define i64 @t5(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t5:
@@ -204,13 +204,13 @@ define i64 @t5(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32)
 
 define i64 @t6(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t6:
@@ -239,13 +239,13 @@ define i64 @t6(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32)
 
 define i64 @t7(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: t7:
@@ -274,22 +274,27 @@ define i64 @t7(ptr %a, ptr %b) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %a, align 8
+  %0 = load <1 x i64>, ptr %a, align 8
   %1 = load i32, ptr %b, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %0, i32 %1)
-  %3 = bitcast x86_mmx %2 to i64
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %0, i32 %1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32)
 
-define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt0(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt0:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -301,28 +306,34 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt0:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddb (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
 declare void @llvm.x86.mmx.emms()
 
-define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt1(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt1:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -334,27 +345,33 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
 
-define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt2(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -366,27 +383,33 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddd (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddd (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
 
-define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt3(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt3:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -398,27 +421,33 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddq (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddq (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
 
-define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt4(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt4:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -430,27 +459,33 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt4:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddusb (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddusb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>)
 
-define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt5(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt5:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -462,27 +497,33 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt5:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddusw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddusw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
 
-define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt6(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt6:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -494,27 +535,33 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt6:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrlw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrlw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>)
 
-define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt7(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt7:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrld (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -526,27 +573,33 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt7:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrld (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrld (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>)
 
-define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt8(<1 x i64> %t, ptr %q) nounwind {
 ; X86-LABEL: tt8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -558,18 +611,19 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrlq (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrlq (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %v = load x86_mmx, ptr %q
-  %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
-  %s = bitcast x86_mmx %u to i64
+  %v = load <1 x i64>, ptr %q
+  %u = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %t, <1 x i64> %v)
+  %s = bitcast <1 x i64> %u to i64
   call void @llvm.x86.mmx.emms()
   ret i64 %s
 }
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>)
 
 define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind {
 ; X86-LABEL: test_psrlq_by_volatile_shift_amount:
@@ -599,8 +653,8 @@ entry:
   call void @llvm.lifetime.start(i64 4, ptr nonnull %0)
   store volatile i32 1, ptr %0, align 4
   %1 = load volatile i32, ptr %0, align 4
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> <i64 255> to x86_mmx), i32 %1)
-  store x86_mmx %2, ptr %t, align 8
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> <i64 255>, i32 %1)
+  store <1 x i64> %2, ptr %t, align 8
   call void @llvm.lifetime.end(i64 4, ptr nonnull %0)
   ret void
 }
@@ -609,28 +663,41 @@ declare void @llvm.lifetime.start(i64, ptr nocapture)
 declare void @llvm.lifetime.end(i64, ptr nocapture)
 
 ; Make sure we shrink this vector load and fold it.
-define x86_mmx @vec_load(ptr %x) {
+define <1 x i64> @vec_load(ptr %x) {
 ; X86-LABEL: vec_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
 ; X86-NEXT:    paddsb %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vec_load:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
 ; X64-NEXT:    paddsb %mm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
   %z = load <4 x float>, ptr %x
   %y = extractelement <4 x float> %z, i32 0
   %a = insertelement <2 x float> undef, float %y, i32 0
   %b = insertelement <2 x float> %a, float %y, i32 1
-  %c = bitcast <2 x float> %b to x86_mmx
-  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
-  ret x86_mmx %d
+  %c = bitcast <2 x float> %b to <1 x i64>
+  %d = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %c, <1 x i64> %c)
+  ret <1 x i64> %d
 }
 
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>)
 
diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
index b2c94e3aaa3a6..a6e1275875dbc 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
@@ -115,32 +115,32 @@ define double @mmx_zero(double, double, double, double) nounwind {
 ; X64-LARGE-NEXT:    paddw %mm2, %mm0
 ; X64-LARGE-NEXT:    movq2dq %mm0, %xmm0
 ; X64-LARGE-NEXT:    retq
-  %5 = bitcast double %0 to x86_mmx
-  %6 = bitcast double %1 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
-  %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
-  %9 = bitcast double %2 to x86_mmx
-  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
-  %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
-  %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
-  %13 = bitcast double %3 to x86_mmx
-  %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
-  %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
-  %16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
-  %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
-  %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
-  %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
-  %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
-  %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
-  %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
-  %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
-  %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
-  %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
-  %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
-  %27 = bitcast x86_mmx %26 to double
+  %5 = bitcast double %0 to <1 x i64>
+  %6 = bitcast double %1 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6)
+  %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
+  %9 = bitcast double %2 to <1 x i64>
+  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9)
+  %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10)
+  %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11)
+  %13 = bitcast double %3 to <1 x i64>
+  %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13)
+  %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9)
+  %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13)
+  %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10)
+  %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11)
+  %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8)
+  %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7)
+  %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
+  %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12)
+  %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15)
+  %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6)
+  %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16)
+  %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17)
+  %27 = bitcast <1 x i64> %26 to double
   ret double %27
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-inlineasm.ll b/llvm/test/CodeGen/X86/mmx-inlineasm.ll
new file mode 100644
index 0000000000000..86fca9051dab4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mmx-inlineasm.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx | FileCheck %s
+
+;; Verify that the mmx 'y' constraint works with arbitrary IR types.
+define <2 x i32> @test_mmx_asm(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_mmx_asm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdq2q %xmm0, %mm0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # %mm0 = %mm0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # %mm0 = %mm0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = tail call i64 asm sideeffect "# $0 = $1", "=y,y"(<2 x i32> %a)
+  %2 = tail call <2 x i32> asm sideeffect "# $0 = $1", "=y,y"(i64 %1)
+  ret <2 x i32> %2
+}
+
+;; And same thing with the 'Ym' constraint.
+define <2 x i32> @test_mmx_asm_Ym(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_mmx_asm_Ym:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdq2q %xmm0, %mm0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # %mm0 = %mm0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # %mm0 = %mm0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = tail call i64 asm sideeffect "# $0 = $1", "=^Ym,^Ym"(<2 x i32> %a)
+  %2 = tail call <2 x i32> asm sideeffect "# $0 = $1", "=^Ym,^Ym"(i64 %1)
+  ret <2 x i32> %2
+}
diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
index a43d9400cde6c..a7b6ed416622e 100644
--- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefixes=ALL,X64
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+avx | FileCheck %s --check-prefixes=ALL,X64
 
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test1:
@@ -32,24 +32,24 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test88:
@@ -77,24 +77,24 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test88:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test87:
@@ -122,24 +122,24 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test87:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test86:
@@ -167,24 +167,24 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test86:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test85:
@@ -212,24 +212,24 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test85:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test84:
@@ -257,24 +257,24 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test84:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test83:
@@ -302,24 +302,24 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test83:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test82:
@@ -347,24 +347,24 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test82:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test81:
@@ -392,24 +392,24 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test81:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test80:
@@ -437,24 +437,24 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test80:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test79:
@@ -482,24 +482,24 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test79:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test78:
@@ -527,24 +527,24 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test78:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test77:
@@ -572,24 +572,24 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test77:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test76:
@@ -617,24 +617,24 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test76:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packuswb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packuswb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test75:
@@ -662,24 +662,24 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test75:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packssdw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packssdw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test74:
@@ -707,24 +707,24 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test74:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packsswb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packsswb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test73:
@@ -754,15 +754,15 @@ define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test72:
@@ -792,9 +792,9 @@ define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -825,15 +825,15 @@ define i64 @test72_2(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
 
 define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test71:
@@ -859,13 +859,13 @@ define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to i64
   ret i64 %2
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test70:
@@ -895,9 +895,9 @@ define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -928,15 +928,15 @@ define i64 @test70_2(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test69:
@@ -966,15 +966,15 @@ define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
 
 define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test68:
@@ -1000,13 +1000,13 @@ define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to i64
   ret i64 %2
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test67:
@@ -1036,15 +1036,15 @@ define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test66:
@@ -1074,9 +1074,9 @@ define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -1107,15 +1107,15 @@ define i64 @test66_2(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test65:
@@ -1146,17 +1146,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test64:
@@ -1187,17 +1187,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test63:
@@ -1224,15 +1224,15 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test62:
@@ -1263,17 +1263,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test61:
@@ -1304,17 +1304,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test60:
@@ -1341,15 +1341,15 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test59:
@@ -1380,17 +1380,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test58:
@@ -1421,17 +1421,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test56:
@@ -1459,24 +1459,24 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test56:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pxor %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test55:
@@ -1504,24 +1504,24 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test55:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    por %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test54:
@@ -1549,24 +1549,24 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test54:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pandn %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pandn %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test53:
@@ -1594,24 +1594,24 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test53:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pand %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test52:
@@ -1639,18 +1639,18 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test52:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmullw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -1682,24 +1682,24 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test51:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmullw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test50:
@@ -1727,24 +1727,24 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test50:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test49:
@@ -1772,24 +1772,24 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test49:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaddwd %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test48:
@@ -1817,24 +1817,24 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test48:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubusw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubusw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test47:
@@ -1862,24 +1862,24 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test47:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubusb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubusb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test46:
@@ -1907,24 +1907,24 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test46:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test45:
@@ -1952,18 +1952,18 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test45:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubsb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubsb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -1994,17 +1994,17 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test43:
@@ -2032,24 +2032,24 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test43:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test42:
@@ -2077,24 +2077,24 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test42:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test41:
@@ -2122,24 +2122,24 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test41:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test40:
@@ -2167,24 +2167,24 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test40:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddusw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test39:
@@ -2212,24 +2212,24 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test39:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddusb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test38:
@@ -2257,24 +2257,24 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test38:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test37:
@@ -2302,24 +2302,24 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test37:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddsb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test36:
@@ -2346,15 +2346,15 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test35:
@@ -2382,24 +2382,24 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test35:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddd %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test34:
@@ -2427,24 +2427,24 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test34:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test33:
@@ -2472,24 +2472,24 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test33:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test32:
@@ -2517,22 +2517,22 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test32:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    psadbw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test31:
@@ -2560,24 +2560,24 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test31:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pminsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test30:
@@ -2605,24 +2605,24 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test30:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pminub %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test29:
@@ -2650,24 +2650,24 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test29:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaxsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test28:
@@ -2695,24 +2695,24 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test28:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaxub %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test27:
@@ -2740,24 +2740,24 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test27:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pavgw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test26:
@@ -2785,24 +2785,24 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test26:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pavgb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind
+declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind
 
 define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp {
 ; X86-LABEL: test25:
@@ -2819,12 +2819,12 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind
   ret void
 }
 
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone
 
 define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test24:
@@ -2850,12 +2850,12 @@ define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
-  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+  %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64>
+  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind
   ret i32 %1
 }
 
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
 
 define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
 ; X86-LABEL: test23:
@@ -2884,21 +2884,21 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
 ;
 ; X64-LABEL: test23:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    maskmovq %mm1, %mm0
+; X64-NEXT:    maskmovq %mm0, %mm1
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %n to <8 x i8>
   %1 = bitcast <1 x i64> %d to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind
   ret void
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test22:
@@ -2926,24 +2926,24 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test22:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhuw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
 
 define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test21:
@@ -2972,9 +2972,9 @@ define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -3005,15 +3005,15 @@ define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <2 x i32>
   %5 = extractelement <2 x i32> %4, i32 0
   ret i32 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test20:
@@ -3041,22 +3041,22 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test20:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmuludq %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
 
 define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test19:
@@ -3081,12 +3081,12 @@ define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone
   ret <2 x double> %2
 }
 
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
 
 define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test18:
@@ -3109,14 +3109,14 @@ define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast <1 x i64> %0 to <2 x i32>
   %2 = bitcast <2 x i32> %1 to <1 x i64>
   %3 = extractelement <1 x i64> %2, i32 0
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
 
 define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test17:
@@ -3139,14 +3139,14 @@ define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast <1 x i64> %0 to <2 x i32>
   %2 = bitcast <2 x i32> %1 to <1 x i64>
   %3 = extractelement <1 x i64> %2, i32 0
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
 
 define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test16:
@@ -3173,15 +3173,15 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
 
 define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test15:
@@ -3210,15 +3210,15 @@ define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
 
 define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test14:
@@ -3247,15 +3247,15 @@ define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
 
 define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X86-LABEL: test13:
@@ -3284,15 +3284,15 @@ define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %a to <8 x i8>
-  %1 = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test12:
@@ -3320,24 +3320,24 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test12:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test11:
@@ -3365,24 +3365,24 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test11:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test10:
@@ -3410,24 +3410,24 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test10:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test9:
@@ -3455,24 +3455,24 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test9:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pshufb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pshufb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test8:
@@ -3500,24 +3500,24 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhrsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test7:
@@ -3545,24 +3545,24 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test7:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pmaddubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pmaddubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test6:
@@ -3590,24 +3590,24 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test6:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test5:
@@ -3635,24 +3635,24 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test5:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test4:
@@ -3680,24 +3680,24 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test4:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test3:
@@ -3725,24 +3725,24 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ; X86-LABEL: test2:
@@ -3770,33 +3770,49 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
-; ALL-LABEL: test89:
-; ALL:       # %bb.0:
-; ALL-NEXT:    cvtpi2ps %mm0, %xmm0
-; ALL-NEXT:    ret{{[l|q]}}
-  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind {
+; X86-LABEL: test89:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    cvtpi2ps (%esp), %xmm0
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test89:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    cvtpi2ps %mm0, %xmm0
+; X64-NEXT:    retq
+  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b)
   ret <4 x float> %c
 }
 
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
 
 define void @test90() {
 ; ALL-LABEL: test90:
@@ -3836,13 +3852,11 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
-  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2)
-  %2 = bitcast x86_mmx %1 to <1 x i64>
-  ret <1 x i64> %2
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2)
+  ret <1 x i64> %1
 }
 
-declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg)
+declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg)
 
 define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind {
 ; X86-LABEL: test_mm_extract_pi16:
@@ -3867,9 +3881,8 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind {
 ; X64-NEXT:    pextrw $2, %mm0, %eax
 ; X64-NEXT:    retq
 entry:
-  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
-  %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2)
+  %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2)
   ret i32 %1
 }
 
-declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg)
+declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg)
diff --git a/llvm/test/CodeGen/X86/mmx-only.ll b/llvm/test/CodeGen/X86/mmx-only.ll
index eab67e08b9574..8a87350a79429 100644
--- a/llvm/test/CodeGen/X86/mmx-only.ll
+++ b/llvm/test/CodeGen/X86/mmx-only.ll
@@ -3,7 +3,7 @@
 
 ; Test that turning off sse doesn't turn off mmx.
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: @test88
@@ -11,10 +11,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
index fd8bd1facaf6b..6bb564c4b757e 100644
--- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -1,18 +1,18 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s
 ; This test ensures that the MXCSR is implicitly used by MMX FP instructions.
 
-define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
+define <1 x i64> @mxcsr_mmx(<4 x float> %a0) {
 ; CHECK: MMX_CVTPS2PIrr %{{[0-9]}}, implicit $mxcsr
 ; CHECK: MMX_CVTPI2PSrr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr
 ; CHECK: MMX_CVTTPS2PIrr killed %{{[0-9]}}, implicit $mxcsr
 ; CHECK: MMX_CVTPI2PDrr killed %{{[0-9]$}}
 ; CHECK: MMX_CVTPD2PIrr killed %{{[0-9]}}, implicit $mxcsr
-  %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
-  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1)
-  %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
-  %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3)
-  %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
-  ret x86_mmx %5
+  %1 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
+  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %1)
+  %3 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %2)
+  %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %3)
+  %5 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
+  ret <1 x i64> %5
 }
 
 define half @mxcsr_f16c(float %a) {
@@ -41,11 +41,11 @@ define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double
   ret <8 x double> %res
 }
 
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
-declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>)
+declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>)
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>)
 declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
 declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
 declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
diff --git a/llvm/test/CodeGen/X86/nomerge.ll b/llvm/test/CodeGen/X86/nomerge.ll
index efbadf5b6911f..f0aedcb90c4ad 100644
--- a/llvm/test/CodeGen/X86/nomerge.ll
+++ b/llvm/test/CodeGen/X86/nomerge.ll
@@ -62,4 +62,96 @@ if.end:
 
 declare dso_local void @bar()
 
+define void @nomerge_trap(i32 %i) {
+; CHECK-LABEL: nomerge_trap:
+; CHECK:      # %bb.0: # %entry
+; CHECK:      # %bb.1: # %entry
+; CHECK:      # %bb.2: # %if.then
+; CHECK-NEXT: ud2
+; CHECK-NEXT: LBB{{.*}}: # %if.then2
+; CHECK-NEXT: ud2
+; CHECK-NEXT: .LBB{{.*}}: # %if.end3
+; CHECK-NEXT: ud2
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @llvm.trap() #0
+  unreachable
+
+if.then2:
+  tail call void @llvm.trap() #0
+  unreachable
+
+if.end3:
+  tail call void @llvm.trap() #0
+  unreachable
+}
+
+declare dso_local void @llvm.trap()
+
+define void @nomerge_debugtrap(i32 %i) {
+; CHECK-LABEL: nomerge_debugtrap:
+; CHECK:      # %bb.0: # %entry
+; CHECK:      # %bb.1: # %entry
+; CHECK:      # %bb.2: # %if.then
+; CHECK-NEXT: int3
+; CHECK-NEXT: LBB{{.*}}: # %if.then2
+; CHECK-NEXT: int3
+; CHECK-NEXT: .LBB{{.*}}: # %if.end3
+; CHECK-NEXT: int3
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @llvm.debugtrap() #0
+  unreachable
+
+if.then2:
+  tail call void @llvm.debugtrap() #0
+  unreachable
+
+if.end3:
+  tail call void @llvm.debugtrap() #0
+  unreachable
+}
+
+define void @nomerge_named_debugtrap(i32 %i) {
+; CHECK-LABEL: nomerge_named_debugtrap:
+; CHECK:      # %bb.0: # %entry
+; CHECK:      # %bb.1: # %entry
+; CHECK:      # %bb.2: # %if.then
+; CHECK-NEXT: callq trap_func@PLT
+; CHECK-NEXT: LBB{{.*}}: # %if.then2
+; CHECK-NEXT: callq trap_func@PLT
+; CHECK-NEXT: .LBB{{.*}}: # %if.end3
+; CHECK-NEXT: callq trap_func@PLT
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @llvm.debugtrap() #1
+  unreachable
+
+if.then2:
+  tail call void @llvm.debugtrap() #1
+  unreachable
+
+if.end3:
+  tail call void @llvm.debugtrap() #1
+  unreachable
+}
+
+declare dso_local void @llvm.debugtrap()
+
 attributes #0 = { nomerge }
+attributes #1 = { nomerge "trap-func-name"="trap_func" }
diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll
index 8d3d0d0799c26..3b6ffacb0b230 100644
--- a/llvm/test/CodeGen/X86/nontemporal.ll
+++ b/llvm/test/CodeGen/X86/nontemporal.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+avx | FileCheck %s --check-prefixes=X86,X86-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X64,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx | FileCheck %s --check-prefixes=X64,X64-AVX
 
 define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I, ptr %loadptr) nounwind {
 ; X86-SSE-LABEL: f:
@@ -176,4 +176,28 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
   ret i32 %sum8
 }
 
+define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
+; X86-LABEL: test_mmx:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    psrlq $3, %mm0
+; X86-NEXT:    movntq %mm0, (%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mmx:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    psrlq $3, %mm0
+; X64-NEXT:    movntq %mm0, (%rsi)
+; X64-NEXT:    retq
+entry:
+  %0 = load <1 x i64>, ptr %a0
+  %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
+  store <1 x i64> %1, ptr %a1, align 8, !nontemporal !0
+  ret void
+}
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
+
 !0 = !{i32 1}
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 886c3ae10324d..9e398096bfcc5 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -230,7 +230,7 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE2-LABEL: p7_vector_urem_by_const__nonsplat_undef2:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -238,7 +238,6 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psrld $2, %xmm2
 ; SSE2-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,0,6,0,6,0,6,0]
 ; SSE2-NEXT:    psubd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -249,12 +248,11 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [715827883,715827883,715827883,715827883]
 ; SSE4-NEXT:    pmuludq %xmm2, %xmm1
 ; SSE4-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT:    psrld $2, %xmm2
 ; SSE4-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,0,6,0,6,0,6,0]
 ; SSE4-NEXT:    psubd %xmm2, %xmm0
 ; SSE4-NEXT:    pxor %xmm1, %xmm1
@@ -266,12 +264,11 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [715827883,715827883,715827883,715827883]
 ; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [6,0,6,0,6,0,6,0]
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 6812af867d8dd..762851747c415 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -51,7 +51,6 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index fe791e45eff99..2b475644a38cf 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -964,7 +964,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; AVX512F-LABEL: mul_v64i8:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
 ; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm5, %ymm4
diff --git a/llvm/test/CodeGen/X86/pr13859.ll b/llvm/test/CodeGen/X86/pr13859.ll
index 9b290e6947d58..35466478f289b 100644
--- a/llvm/test/CodeGen/X86/pr13859.ll
+++ b/llvm/test/CodeGen/X86/pr13859.ll
@@ -13,8 +13,7 @@ entry:
   %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
   %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
   %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
-  %a40 = bitcast <4 x i16> %a39 to x86_mmx
-  %a41 = bitcast x86_mmx %a40 to <1 x i64>
+  %a40 = bitcast <4 x i16> %a39 to <1 x i64>
 
   %a47 = trunc i32 %a32 to i1
   br i1 %a47, label %a48, label %a49
@@ -23,6 +22,6 @@ a48:
   unreachable
 
 a49:
-  store <1 x i64> %a41, ptr %dest, align 8 ; !!!
+  store <1 x i64> %a40, ptr %dest, align 8 ; !!!
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll
index 45587b8c69cd4..da3246a917ea3 100644
--- a/llvm/test/CodeGen/X86/pr23246.ll
+++ b/llvm/test/CodeGen/X86/pr23246.ll
@@ -6,15 +6,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; PR23246
 ; We're really only interested in doing something sane with the shuffle.
 
-define <2 x i64> @test(x86_mmx %a) #0 {
+define <2 x i64> @test(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %rdi, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
 entry:
-  %b = bitcast x86_mmx %a to <1 x i64>
-  %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+  %s = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
   ret <2 x i64> %s
 }
 
diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll
index 9a38515b65594..6b8ac918386fa 100644
--- a/llvm/test/CodeGen/X86/pr29222.ll
+++ b/llvm/test/CodeGen/X86/pr29222.ll
@@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind {
 ; X86-AVX-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-AVX-NEXT:    packsswb %mm0, %mm0
 ; X86-AVX-NEXT:    movq %mm0, (%esp)
-; X86-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vpbroadcastq (%esp), %xmm0
 ; X86-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    movl %ebp, %esp
@@ -54,15 +54,17 @@ define i32 @PR29222(i32) nounwind {
 ; X64-AVX-NEXT:    movd %edi, %mm0
 ; X64-AVX-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X64-AVX-NEXT:    packsswb %mm0, %mm0
-; X64-AVX-NEXT:    movq2dq %mm0, %xmm0
+; X64-AVX-NEXT:    movq %mm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm0
+; X64-AVX-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    retq
   %2 = insertelement <2 x i32> undef, i32 %0, i32 0
   %3 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
-  %4 = bitcast <2 x i32> %3 to x86_mmx
-  %5 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %4, x86_mmx %4)
-  %6 = bitcast x86_mmx %5 to i64
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %4, <1 x i64> %4)
+  %6 = bitcast <1 x i64> %5 to i64
   %7 = insertelement <2 x i64> undef, i64 %6, i32 0
   %8 = bitcast <2 x i64> %7 to <8 x i16>
   %9 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %8, <8 x i16> undef)
@@ -71,5 +73,5 @@ define i32 @PR29222(i32) nounwind {
   ret i32 %11
 }
 
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>)
 declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll
index b6022698edaeb..0ad35309b87bb 100644
--- a/llvm/test/CodeGen/X86/pr35982.ll
+++ b/llvm/test/CodeGen/X86/pr35982.ll
@@ -35,9 +35,9 @@ define float @PR35982_emms(<1 x i64>) nounwind {
   %2 = bitcast <1 x i64> %0 to <2 x i32>
   %3 = extractelement <2 x i32> %2, i32 0
   %4 = extractelement <1 x i64> %0, i32 0
-  %5 = bitcast i64 %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5)
-  %7 = bitcast x86_mmx %6 to <2 x i32>
+  %5 = bitcast i64 %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %5, <1 x i64> %5)
+  %7 = bitcast <1 x i64> %6 to <2 x i32>
   %8 = extractelement <2 x i32> %7, i32 0
   tail call void @llvm.x86.mmx.emms()
   %9 = sitofp i32 %3 to float
@@ -46,5 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind {
   ret float %11
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>)
 declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/pr64589.ll b/llvm/test/CodeGen/X86/pr64589.ll
index 130ef517ae28e..d93d54f4c31d0 100644
--- a/llvm/test/CodeGen/X86/pr64589.ll
+++ b/llvm/test/CodeGen/X86/pr64589.ll
@@ -7,8 +7,8 @@
 define i8 @test(ptr %p) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl (%rdi), %eax
-; CHECK-NEXT:    orb 1(%rdi), %al
+; CHECK-NEXT:    movzbl 1(%rdi), %eax
+; CHECK-NEXT:    orb (%rdi), %al
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    addb %al, %al
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr99396.ll b/llvm/test/CodeGen/X86/pr99396.ll
new file mode 100644
index 0000000000000..f534d32038c22
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr99396.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=i386-unknown-freebsd -enable-misched -relocation-model=pic | FileCheck %s
+
+@c = external local_unnamed_addr global ptr
+
+declare i32 @fn2() local_unnamed_addr
+
+declare i32 @fn3() local_unnamed_addr
+
+define noundef i32 @fn4() #0 {
+entry:
+  %tmp0 = load i32, ptr @fn4, align 4
+; CHECK: movl fn4@GOT(%ebx), %edi
+; CHECK-NEXT: movl (%edi), %edx
+  %tmp1 = load ptr, ptr @c, align 4
+; CHECK: movl c@GOT(%ebx), %eax
+; CHECK-NEXT: movl (%eax), %esi
+; CHECK-NEXT: testl %esi, %esi
+  %cmp.g = icmp eq ptr %tmp1, null
+  br i1 %cmp.g, label %if.then.g, label %if.end3.g
+
+if.then.g:                                        ; preds = %entry
+  %tmp2 = load i32, ptr inttoptr (i32 1 to ptr), align 4
+  %cmp1.g = icmp slt i32 %tmp2, 0
+  br i1 %cmp1.g, label %if.then2.g, label %if.end3.g
+
+if.then2.g:                                       ; preds = %if.then.g
+  %.g = load volatile i32, ptr null, align 2147483648
+  br label %f.exit
+
+if.end3.g:                                        ; preds = %if.then.g, %entry
+  %h.i.g = icmp eq i32 %tmp0, 0
+  br i1 %h.i.g, label %f.exit, label %while.body.g
+
+while.body.g:                                     ; preds = %if.end3.g, %if.end8.g
+  %buff.addr.019.g = phi ptr [ %incdec.ptr.g, %if.end8.g ], [ @fn4, %if.end3.g ]
+  %g.addr.018.g = phi i32 [ %dec.g, %if.end8.g ], [ %tmp0, %if.end3.g ]
+  %call4.g = tail call i32 @fn3(ptr %tmp1, ptr %buff.addr.019.g, i32 %g.addr.018.g)
+  %cmp5.g = icmp slt i32 %call4.g, 0
+  br i1 %cmp5.g, label %if.then6.g, label %if.end8.g
+
+if.then6.g:                                       ; preds = %while.body.g
+  %call7.g = tail call i32 @fn2(ptr null)
+  br label %f.exit
+
+if.end8.g:                                        ; preds = %while.body.g
+  %dec.g = add i32 %g.addr.018.g, 1
+  %incdec.ptr.g = getelementptr i32, ptr %buff.addr.019.g, i32 1
+  store i64 0, ptr %tmp1, align 4
+  %h.not.g = icmp eq i32 %dec.g, 0
+  br i1 %h.not.g, label %f.exit, label %while.body.g
+
+f.exit:                                           ; preds = %if.end8.g, %if.then6.g, %if.end3.g, %if.then2.g
+  ret i32 0
+}
+
+attributes #0 = { "frame-pointer"="all" "tune-cpu"="generic" }
diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll
index 27b7ebb8381cd..8a4308a5af64b 100644
--- a/llvm/test/CodeGen/X86/select-mmx.ll
+++ b/llvm/test/CodeGen/X86/select-mmx.ll
@@ -14,15 +14,11 @@ define i64 @test47(i64 %arg)  {
 ;
 ; X64-LABEL: test47:
 ; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB0_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    pxor %mm0, %mm0
-; X64-NEXT:    jmp .LBB0_3
-; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:    movd %eax, %mm0
-; X64-NEXT:  .LBB0_3:
+; X64-NEXT:    movl $7, %ecx
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    movq %rcx, %mm0
 ; X64-NEXT:    psllw %mm0, %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
@@ -35,17 +31,17 @@ define i64 @test47(i64 %arg)  {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    orl 12(%ebp), %eax
-; X86-NEXT:    je .LBB0_1
-; X86-NEXT:  # %bb.2:
-; X86-NEXT:    pxor %mm0, %mm0
-; X86-NEXT:    jmp .LBB0_3
-; X86-NEXT:  .LBB0_1:
 ; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    movd %eax, %mm0
-; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psllw %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -55,9 +51,9 @@ define i64 @test47(i64 %arg)  {
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %cond = icmp eq i64 %arg, 0
-  %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
-  %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
-  %retc = bitcast x86_mmx %psll to i64
+  %slct = select i1 %cond, <1 x i64> bitcast (i64 7 to <1 x i64>), <1 x i64> bitcast (i64 0 to <1 x i64>)
+  %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
+  %retc = bitcast <1 x i64> %psll to i64
   ret i64 %retc
 }
 
@@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
 ; X64-LABEL: test49:
 ; X64:       # %bb.0:
 ; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB1_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    movq %rdx, %mm0
-; X64-NEXT:    jmp .LBB1_3
-; X64-NEXT:  .LBB1_1:
+; X64-NEXT:    cmovneq %rdx, %rsi
 ; X64-NEXT:    movq %rsi, %mm0
-; X64-NEXT:  .LBB1_3:
 ; X64-NEXT:    psllw %mm0, %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
@@ -113,13 +104,13 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %cond = icmp eq i64 %arg, 0
-  %xmmx = bitcast i64 %x to x86_mmx
-  %ymmx = bitcast i64 %y to x86_mmx
-  %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx
-  %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
-  %retc = bitcast x86_mmx %psll to i64
+  %xmmx = bitcast i64 %x to <1 x i64>
+  %ymmx = bitcast i64 %y to <1 x i64>
+  %slct = select i1 %cond, <1 x i64> %xmmx, <1 x i64> %ymmx
+  %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
+  %retc = bitcast <1 x i64> %psll to i64
   ret i64 %retc
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
 
diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
index 11ca9e2a547ee..6eb99dd6c6758 100644
--- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s
 
-define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
+define <1 x i64> @stack_fold_cvtpd2pi(<2 x double> %a0) {
 ; CHECK-LABEL: stack_fold_cvtpd2pi:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9,45 +9,47 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
-  ret x86_mmx %2
+  %2 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
+define <2 x double> @stack_fold_cvtpi2pd(<1 x i64> %a0) {
 ; CHECK-LABEL: stack_fold_cvtpi2pd:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtpi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %a0) nounwind readnone
   ret <2 x double> %2
 }
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
 
-define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
+define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) {
 ; CHECK-LABEL: stack_fold_cvtpi2ps:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtpi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) nounwind readnone
   ret <4 x float> %2
 }
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
+define <1 x i64> @stack_fold_cvtps2pi(<4 x float> %a0) {
 ; CHECK-LABEL: stack_fold_cvtps2pi:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -55,15 +57,15 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
-  ret x86_mmx %2
+  %2 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
 
-define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
+define <1 x i64> @stack_fold_cvttpd2pi(<2 x double> %a0) {
 ; CHECK-LABEL: stack_fold_cvttpd2pi:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -71,15 +73,15 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
-  ret x86_mmx %2
+  %2 = call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
 
-define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
+define <1 x i64> @stack_fold_cvttps2pi(<4 x float> %a0) {
 ; CHECK-LABEL: stack_fold_cvttps2pi:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -87,18 +89,18 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
-  ret x86_mmx %2
+  %2 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
 
 ; TODO stack_fold_movd_load
 
 ; padd forces execution on mmx
-define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
+define i32 @stack_fold_movd_store(<1 x i64> %a0) nounwind {
 ; CHECK-LABEL: stack_fold_movd_store:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
@@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    paddb %mm0, %mm0
 ; CHECK-NEXT:    movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    #APP
@@ -120,8 +123,8 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
-  %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0)
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = extractelement <2 x i32> %2, i32 0
   %4 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %3
@@ -130,7 +133,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
 ; TODO stack_fold_movq_load
 
 ; padd forces execution on mmx
-define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
+define i64 @stack_fold_movq_store(<1 x i64> %a0) nounwind {
 ; CHECK-LABEL: stack_fold_movq_store:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
@@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    paddb %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
@@ -152,1171 +156,1235 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
-  %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0)
-  %2 = bitcast x86_mmx %1 to i64
+  %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
+  %2 = bitcast <1 x i64> %1 to i64
   %3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i64 %2
 }
 
-define x86_mmx @stack_fold_pabsb(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsb(<1 x i64> %a0) {
 ; CHECK-LABEL: stack_fold_pabsb:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pabsd(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsd(<1 x i64> %a0) {
 ; CHECK-LABEL: stack_fold_pabsd:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pabsw(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsw(<1 x i64> %a0) {
 ; CHECK-LABEL: stack_fold_pabsw:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %a0) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packssdw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_packssdw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packssdw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packsswb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_packsswb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packsswb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packuswb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_packuswb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packuswb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddsb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddsb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddsb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddusb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddusb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddusb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddusw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddusw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddusw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_paddw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_palignr(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_palignr:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    palignr $1, %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %a, <1 x i64> %b, i8 1) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
 
-define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pand(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pand:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pand %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pandn(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pandn:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pandn %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pavgb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pavgb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pavgb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pavgw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pavgw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pavgw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phaddd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phaddsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phaddw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phsubd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phsubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_phsubw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 ; TODO stack_fold_pinsrw
 
-define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaddubsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmaddubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaddubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaddwd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmaddwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaddwd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaxsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmaxsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaxsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaxub(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmaxub:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaxub %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pminsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pminsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pminsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pminub(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pminub:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pminub %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhrsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmulhrsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhrsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhuw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmulhuw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhuw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmulhw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmullw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmullw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmullw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmuludq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pmuludq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmuludq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_por(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_por:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    por %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psadbw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psadbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psadbw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psadbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pshufb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pshufb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pshufb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload
-; CHECK-NEXT:    pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pshufw(x86_mmx %a) {
+define <1 x i64> @stack_fold_pshufw(<1 x i64> %a) {
 ; CHECK-LABEL: stack_fold_pshufw:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    # mm0 = mem[1,0,0,0]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 1) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
 
-define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignb(<1 x i64> %a0, <1 x i64> %a1) {
 ; CHECK-LABEL: stack_fold_psignb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignd(<1 x i64> %a0, <1 x i64> %a1) {
 ; CHECK-LABEL: stack_fold_psignd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignw(<1 x i64> %a0, <1 x i64> %a1) {
 ; CHECK-LABEL: stack_fold_psignw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pslld(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pslld:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pslld %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psllq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psllq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psllq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psllw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psllw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psllw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrad(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psrad:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrad %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psraw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psraw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psraw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrld(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psrld:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrld %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrlq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psrlq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrlq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrlw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psrlw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrlw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubsb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubsb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubsb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubsw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubusb(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubusb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubusb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubusw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubusw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubusw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_psubw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhbw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpckhbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhdq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpckhdq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[1],mem[1]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhwd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpckhwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpcklbw(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpcklbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckldq(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpckldq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpcklwd(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_punpcklwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0],mm0[1],mem[1]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
 
-define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pxor(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: stack_fold_pxor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pxor %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
+  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+  ret <1 x i64> %2
 }
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
index cd5edcf2ae502..d8ce5b041042e 100644
--- a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
+++ b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
@@ -35,7 +35,7 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
 
 ; BOTH: Function: cleanup_array
-; BOTH-NEXT:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH-NEXT:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: a @ dot.c:13
 ; STRIPPED-NOT: a @ dot.c:13
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -47,7 +47,7 @@ define void @cleanup_array(ptr %0) #1 {
 }
 
 ; BOTH: Function: cleanup_result
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: res @ dot.c:21
 ; STRIPPED-NOT: res @ dot.c:21
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -59,11 +59,11 @@ define void @cleanup_result(ptr %0) #1 {
 }
 
 ; BOTH: Function: do_work
-; BOTH:  Offset: [SP+12], Type: Variable, Align: 8, Size: 4
+; BOTH:  Offset: [SP+12], Type: Fixed, Align: 8, Size: 4
 ; DEBUG: out @ dot.c:32
 ; STRIPPED-NOT: out @ dot.c:32
-; BOTH:  Offset: [SP+8], Type: Variable, Align: 4, Size: 4
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+8], Type: Fixed, Align: 4, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: A @ dot.c:32
 ; STRIPPED-NOT: A @ dot.c:32
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -125,7 +125,7 @@ define i32 @do_work(ptr %0, ptr %1, ptr %2) #2 {
 }
 
 ; BOTH: Function: gen_array
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: size @ dot.c:62
 ; STRIPPED-NOT: size @ dot.c:62
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
diff --git a/llvm/test/CodeGen/X86/subreg-fail.mir b/llvm/test/CodeGen/X86/subreg-fail.mir
new file mode 100644
index 0000000000000..c8146f099b814
--- /dev/null
+++ b/llvm/test/CodeGen/X86/subreg-fail.mir
@@ -0,0 +1,37 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple x86_64-unknown-unknown %s \
+# RUN:   -verify-coalescing -enable-subreg-liveness \
+# RUN:   --run-pass=register-coalescer -o - | FileCheck %s
+
+# Check that the register coalescer correctly handles merging live ranges over
+# SUBREG_TO_REG on X86. The -verify-coalescing option will give an error if
+# this is incorrect.
+
+---
+name:            test1
+alignment:       16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test1
+    ; CHECK: undef [[MOV32rm:%[0-9]+]].sub_32bit:gr64_nosp = MOV32rm undef %1:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`)
+    ; CHECK-NEXT: undef [[MOV32rm1:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32rm undef %4:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`)
+    ; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[MOV32rm1]], 32, implicit-def dead $eflags
+    ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64_with_sub_8bit = LEA64r [[MOV32rm1]], 1, [[MOV32rm]], 256, $noreg
+    ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64_with_sub_8bit = SHR64ri [[LEA64r]], 8, implicit-def dead $eflags
+    ; CHECK-NEXT: MOV32mr undef %10:gr64, 1, $noreg, 0, $noreg, [[LEA64r]].sub_32bit :: (volatile store (s32) into `ptr undef`)
+    ; CHECK-NEXT: RET 0, undef $eax
+    %0:gr32 = MOV32rm undef %1:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`)
+    %2:gr64_nosp = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+    %3:gr32 = MOV32rm undef %4:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`)
+    %5:gr64 = SUBREG_TO_REG 0, killed %3, %subreg.sub_32bit
+    %6:gr64 = COPY killed %5
+    %6:gr64 = SHL64ri %6, 32, implicit-def dead $eflags
+    %7:gr64 = LEA64r killed %6, 1, killed %2, 256, $noreg
+    %8:gr64 = COPY killed %7
+    %8:gr64 = SHR64ri %8, 8, implicit-def dead $eflags
+    %9:gr32 = COPY killed %8.sub_32bit
+    MOV32mr undef %10:gr64, 1, $noreg, 0, $noreg, killed %9 :: (volatile store (s32) into `ptr undef`)
+    RET 0, undef $eax
+
+...
diff --git a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
index 52d0c2b509128..629f44b52bc05 100644
--- a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
+++ b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
@@ -17,4 +17,22 @@ entry:
   ret i32 %b
 }
 
+define i32 @foo2(i32 %arg1) #1 {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $31, %edi
+; CHECK-NEXT:    movzwl -72(%rsp,%rdi,2), %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %a = extractelement <32 x i16> zeroinitializer, i32 %arg1
+  %b = zext i16 %a to i32
+  ret i32 %b
+}
+
 attributes #0 = { "no-realign-stack" "target-cpu"="skylake-avx512" }
+attributes #1 = { "no-realign-stack" "target-cpu"="skylake" }
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 6350344457d87..7f4111e65cc17 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -63,6 +63,145 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
   ret <2 x i64> %ret1
 }
 
+define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v2i64:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE3-NEXT:    pxor %xmm1, %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE3-NEXT:    pand %xmm4, %xmm3
+; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE3-NEXT:    por %xmm3, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm1
+; SSE3-NEXT:    movq %xmm1, %rax
+; SSE3-NEXT:    andl $1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movq %xmm1, %rcx
+; SSE3-NEXT:    andl $1, %ecx
+; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT:    pandn %xmm0, %xmm2
+; SSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v2i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSSE3-NEXT:    movq %xmm1, %rcx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    pandn %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
+  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
+  %idx0 = extractelement <2 x i64> %or, i64 0
+  %idx1 = extractelement <2 x i64> %or, i64 1
+  %elt0 = extractelement <2 x i64> %v, i64 %idx0
+  %elt1 = extractelement <2 x i64> %v, i64 %idx1
+  %vec0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
+  %vec1 = insertelement <2 x i64> %vec0, i64 %elt1, i64 1
+  %res = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vec1
+  ret <2 x i64> %res
+}
+
 define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v4i32:
 ; SSE3:       # %bb.0:
@@ -126,6 +265,133 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
   ret <4 x i32> %ret3
 }
 
+define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v4i32:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movd %xmm1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE3-NEXT:    movd %xmm3, %ecx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movd %xmm3, %edx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE3-NEXT:    movd %xmm1, %esi
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $3, %eax
+; SSE3-NEXT:    andl $3, %ecx
+; SSE3-NEXT:    andl $3, %edx
+; SSE3-NEXT:    andl $3, %esi
+; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
+  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
+  %idx0 = extractelement <4 x i32> %or, i64 0
+  %idx1 = extractelement <4 x i32> %or, i64 1
+  %idx2 = extractelement <4 x i32> %or, i64 2
+  %idx3 = extractelement <4 x i32> %or, i64 3
+  %elt0 = extractelement <4 x i32> %v, i32 %idx0
+  %elt1 = extractelement <4 x i32> %v, i32 %idx1
+  %elt2 = extractelement <4 x i32> %v, i32 %idx2
+  %elt3 = extractelement <4 x i32> %v, i32 %idx3
+  %vec0 = insertelement <4 x i32> poison, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+  %res = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vec3
+  ret <4 x i32> %res
+}
+
 define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v8i16:
 ; SSE3:       # %bb.0:
@@ -223,6 +489,154 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
   ret <8 x i16> %ret7
 }
 
+define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v8i16:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
+; SSE3-NEXT:    psubusw %xmm1, %xmm3
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    pcmpeqw %xmm3, %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    pextrw $0, %xmm1, %eax
+; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE3-NEXT:    pextrw $2, %xmm1, %edx
+; SSE3-NEXT:    pextrw $3, %xmm1, %esi
+; SSE3-NEXT:    pextrw $4, %xmm1, %edi
+; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
+; SSE3-NEXT:    pextrw $6, %xmm1, %r9d
+; SSE3-NEXT:    pextrw $7, %xmm1, %r10d
+; SSE3-NEXT:    movdqa %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $7, %eax
+; SSE3-NEXT:    andl $7, %ecx
+; SSE3-NEXT:    andl $7, %edx
+; SSE3-NEXT:    andl $7, %esi
+; SSE3-NEXT:    andl $7, %edi
+; SSE3-NEXT:    andl $7, %r8d
+; SSE3-NEXT:    andl $7, %r9d
+; SSE3-NEXT:    andl $7, %r10d
+; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
+; SSE3-NEXT:    movd %r10d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
+; SSE3-NEXT:    movd %r9d, %xmm2
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
+; SSE3-NEXT:    movd %r8d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
+; SSE3-NEXT:    movd %edi, %xmm3
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
+; SSE3-NEXT:    movd %esi, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
+; SSE3-NEXT:    movd %edx, %xmm2
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
+; SSE3-NEXT:    movd %ecx, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; SSE3-NEXT:    movd %eax, %xmm4
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE3-NEXT:    pandn %xmm4, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
+; SSSE3-NEXT:    psubusw %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
+; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
+; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; XOP-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqu16 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  %or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
+  %idx0 = extractelement <8 x i16> %or, i64 0
+  %idx1 = extractelement <8 x i16> %or, i64 1
+  %idx2 = extractelement <8 x i16> %or, i64 2
+  %idx3 = extractelement <8 x i16> %or, i64 3
+  %idx4 = extractelement <8 x i16> %or, i64 4
+  %idx5 = extractelement <8 x i16> %or, i64 5
+  %idx6 = extractelement <8 x i16> %or, i64 6
+  %idx7 = extractelement <8 x i16> %or, i64 7
+  %elt0 = extractelement <8 x i16> %v, i16 %idx0
+  %elt1 = extractelement <8 x i16> %v, i16 %idx1
+  %elt2 = extractelement <8 x i16> %v, i16 %idx2
+  %elt3 = extractelement <8 x i16> %v, i16 %idx3
+  %elt4 = extractelement <8 x i16> %v, i16 %idx4
+  %elt5 = extractelement <8 x i16> %v, i16 %idx5
+  %elt6 = extractelement <8 x i16> %v, i16 %idx6
+  %elt7 = extractelement <8 x i16> %v, i16 %idx7
+  %vec0 = insertelement <8 x i16> poison, i16 %elt0, i64 0
+  %vec1 = insertelement <8 x i16> %vec0, i16 %elt1, i64 1
+  %vec2 = insertelement <8 x i16> %vec1, i16 %elt2, i64 2
+  %vec3 = insertelement <8 x i16> %vec2, i16 %elt3, i64 3
+  %vec4 = insertelement <8 x i16> %vec3, i16 %elt4, i64 4
+  %vec5 = insertelement <8 x i16> %vec4, i16 %elt5, i64 5
+  %vec6 = insertelement <8 x i16> %vec5, i16 %elt6, i64 6
+  %vec7 = insertelement <8 x i16> %vec6, i16 %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vec7
+  ret <8 x i16> %res
+}
+
 define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8:
 ; SSE3:       # %bb.0:
@@ -374,6 +788,202 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
   ret <16 x i8> %ret15
 }
 
+define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v16i8:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE3-NEXT:    pmaxub %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, -40(%rsp)
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    movzbl -25(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm1
+; SSE3-NEXT:    movzbl -26(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm2
+; SSE3-NEXT:    movzbl -27(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm4
+; SSE3-NEXT:    movzbl -28(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm3
+; SSE3-NEXT:    movzbl -29(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm6
+; SSE3-NEXT:    movzbl -30(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm7
+; SSE3-NEXT:    movzbl -31(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm8
+; SSE3-NEXT:    movzbl -32(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm5
+; SSE3-NEXT:    movzbl -33(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm9
+; SSE3-NEXT:    movzbl -34(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm10
+; SSE3-NEXT:    movzbl -35(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm12
+; SSE3-NEXT:    movzbl -36(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm11
+; SSE3-NEXT:    movzbl -37(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm13
+; SSE3-NEXT:    movzbl -38(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm14
+; SSE3-NEXT:    movzbl -39(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm15
+; SSE3-NEXT:    movzbl -40(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm1
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSSE3-NEXT:    pmaxub %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    pmaxub %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqu8 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  %or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
+  %idx0 = extractelement <16 x i8> %or, i64 0
+  %idx1 = extractelement <16 x i8> %or, i64 1
+  %idx2 = extractelement <16 x i8> %or, i64 2
+  %idx3 = extractelement <16 x i8> %or, i64 3
+  %idx4 = extractelement <16 x i8> %or, i64 4
+  %idx5 = extractelement <16 x i8> %or, i64 5
+  %idx6 = extractelement <16 x i8> %or, i64 6
+  %idx7 = extractelement <16 x i8> %or, i64 7
+  %idx8 = extractelement <16 x i8> %or, i64 8
+  %idx9 = extractelement <16 x i8> %or, i64 9
+  %idxA = extractelement <16 x i8> %or, i64 10
+  %idxB = extractelement <16 x i8> %or, i64 11
+  %idxC = extractelement <16 x i8> %or, i64 12
+  %idxD = extractelement <16 x i8> %or, i64 13
+  %idxE = extractelement <16 x i8> %or, i64 14
+  %idxF = extractelement <16 x i8> %or, i64 15
+  %elt0 = extractelement <16 x i8> %v, i8 %idx0
+  %elt1 = extractelement <16 x i8> %v, i8 %idx1
+  %elt2 = extractelement <16 x i8> %v, i8 %idx2
+  %elt3 = extractelement <16 x i8> %v, i8 %idx3
+  %elt4 = extractelement <16 x i8> %v, i8 %idx4
+  %elt5 = extractelement <16 x i8> %v, i8 %idx5
+  %elt6 = extractelement <16 x i8> %v, i8 %idx6
+  %elt7 = extractelement <16 x i8> %v, i8 %idx7
+  %elt8 = extractelement <16 x i8> %v, i8 %idx8
+  %elt9 = extractelement <16 x i8> %v, i8 %idx9
+  %eltA = extractelement <16 x i8> %v, i8 %idxA
+  %eltB = extractelement <16 x i8> %v, i8 %idxB
+  %eltC = extractelement <16 x i8> %v, i8 %idxC
+  %eltD = extractelement <16 x i8> %v, i8 %idxD
+  %eltE = extractelement <16 x i8> %v, i8 %idxE
+  %eltF = extractelement <16 x i8> %v, i8 %idxF
+  %vec0 = insertelement <16 x i8> poison, i8 %elt0, i64 0
+  %vec1 = insertelement <16 x i8> %vec0, i8 %elt1, i64 1
+  %vec2 = insertelement <16 x i8> %vec1, i8 %elt2, i64 2
+  %vec3 = insertelement <16 x i8> %vec2, i8 %elt3, i64 3
+  %vec4 = insertelement <16 x i8> %vec3, i8 %elt4, i64 4
+  %vec5 = insertelement <16 x i8> %vec4, i8 %elt5, i64 5
+  %vec6 = insertelement <16 x i8> %vec5, i8 %elt6, i64 6
+  %vec7 = insertelement <16 x i8> %vec6, i8 %elt7, i64 7
+  %vec8 = insertelement <16 x i8> %vec7, i8 %elt8, i64 8
+  %vec9 = insertelement <16 x i8> %vec8, i8 %elt9, i64 9
+  %vecA = insertelement <16 x i8> %vec9, i8 %eltA, i64 10
+  %vecB = insertelement <16 x i8> %vecA, i8 %eltB, i64 11
+  %vecC = insertelement <16 x i8> %vecB, i8 %eltC, i64 12
+  %vecD = insertelement <16 x i8> %vecC, i8 %eltD, i64 13
+  %vecE = insertelement <16 x i8> %vecD, i8 %eltE, i64 14
+  %vecF = insertelement <16 x i8> %vecE, i8 %eltF, i64 15
+  %res = select <16 x i1> %cmp, <16 x i8> zeroinitializer, <16 x i8> %vecF
+  ret <16 x i8> %res
+}
+
 define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v2f64:
 ; SSE3:       # %bb.0:
@@ -424,6 +1034,143 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
   ret <2 x double> %ret1
 }
 
+define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v2f64:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE3-NEXT:    pxor %xmm1, %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE3-NEXT:    pand %xmm4, %xmm3
+; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE3-NEXT:    por %xmm3, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm1
+; SSE3-NEXT:    movq %xmm1, %rax
+; SSE3-NEXT:    andl $1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movq %xmm1, %rcx
+; SSE3-NEXT:    andl $1, %ecx
+; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; SSE3-NEXT:    pandn %xmm0, %xmm2
+; SSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v2f64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSSE3-NEXT:    movq %xmm1, %rcx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pandn %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v2f64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovapd %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
+  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
+  %idx0 = extractelement <2 x i64> %or, i64 0
+  %idx1 = extractelement <2 x i64> %or, i64 1
+  %elt0 = extractelement <2 x double> %v, i64 %idx0
+  %elt1 = extractelement <2 x double> %v, i64 %idx1
+  %vec0 = insertelement <2 x double> poison, double %elt0, i64 0
+  %vec1 = insertelement <2 x double> %vec0, double %elt1, i64 1
+  %res = select <2 x i1> %cmp, <2 x double> zeroinitializer, <2 x double> %vec1
+  ret <2 x double> %res
+}
+
 define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v4f32:
 ; SSE3:       # %bb.0:
@@ -487,6 +1234,133 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
   ret <4 x float> %ret3
 }
 
+define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v4f32:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movd %xmm1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE3-NEXT:    movd %xmm3, %ecx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movd %xmm3, %edx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE3-NEXT:    movd %xmm1, %esi
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $3, %eax
+; SSE3-NEXT:    andl $3, %ecx
+; SSE3-NEXT:    andl $3, %edx
+; SSE3-NEXT:    andl $3, %esi
+; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v4f32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v4f32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
+  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
+  %idx0 = extractelement <4 x i32> %or, i64 0
+  %idx1 = extractelement <4 x i32> %or, i64 1
+  %idx2 = extractelement <4 x i32> %or, i64 2
+  %idx3 = extractelement <4 x i32> %or, i64 3
+  %elt0 = extractelement <4 x float> %v, i32 %idx0
+  %elt1 = extractelement <4 x float> %v, i32 %idx1
+  %elt2 = extractelement <4 x float> %v, i32 %idx2
+  %elt3 = extractelement <4 x float> %v, i32 %idx3
+  %vec0 = insertelement <4 x float> poison, float %elt0, i64 0
+  %vec1 = insertelement <4 x float> %vec0, float %elt1, i64 1
+  %vec2 = insertelement <4 x float> %vec1, float %elt2, i64 2
+  %vec3 = insertelement <4 x float> %vec2, float %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
 define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
 ; SSE3:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 0baa5bdf27624..2968c97a1355c 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -78,6 +78,111 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
   ret <4 x i64> %ret3
 }
 
+define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v4i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,3]
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm4, %ymm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT:    vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k2
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
+  %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
+  %idx0 = extractelement <4 x i64> %or, i64 0
+  %idx1 = extractelement <4 x i64> %or, i64 1
+  %idx2 = extractelement <4 x i64> %or, i64 2
+  %idx3 = extractelement <4 x i64> %or, i64 3
+  %elt0 = extractelement <4 x i64> %v, i64 %idx0
+  %elt1 = extractelement <4 x i64> %v, i64 %idx1
+  %elt2 = extractelement <4 x i64> %v, i64 %idx2
+  %elt3 = extractelement <4 x i64> %v, i64 %idx3
+  %vec0 = insertelement <4 x i64> poison, i64 %elt0, i64 0
+  %vec1 = insertelement <4 x i64> %vec0, i64 %elt1, i64 1
+  %vec2 = insertelement <4 x i64> %vec1, i64 %elt2, i64 2
+  %vec3 = insertelement <4 x i64> %vec2, i64 %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x i64> zeroinitializer, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
 define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8i32:
 ; XOP:       # %bb.0:
@@ -130,6 +235,104 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
   ret <8 x i32> %ret7
 }
 
+define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v8i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm3, %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
+  %idx0 = extractelement <8 x i32> %or, i64 0
+  %idx1 = extractelement <8 x i32> %or, i64 1
+  %idx2 = extractelement <8 x i32> %or, i64 2
+  %idx3 = extractelement <8 x i32> %or, i64 3
+  %idx4 = extractelement <8 x i32> %or, i64 4
+  %idx5 = extractelement <8 x i32> %or, i64 5
+  %idx6 = extractelement <8 x i32> %or, i64 6
+  %idx7 = extractelement <8 x i32> %or, i64 7
+  %elt0 = extractelement <8 x i32> %v, i32 %idx0
+  %elt1 = extractelement <8 x i32> %v, i32 %idx1
+  %elt2 = extractelement <8 x i32> %v, i32 %idx2
+  %elt3 = extractelement <8 x i32> %v, i32 %idx3
+  %elt4 = extractelement <8 x i32> %v, i32 %idx4
+  %elt5 = extractelement <8 x i32> %v, i32 %idx5
+  %elt6 = extractelement <8 x i32> %v, i32 %idx6
+  %elt7 = extractelement <8 x i32> %v, i32 %idx7
+  %vec0 = insertelement <8 x i32> poison, i32 %elt0, i64 0
+  %vec1 = insertelement <8 x i32> %vec0, i32 %elt1, i64 1
+  %vec2 = insertelement <8 x i32> %vec1, i32 %elt2, i64 2
+  %vec3 = insertelement <8 x i32> %vec2, i32 %elt3, i64 3
+  %vec4 = insertelement <8 x i32> %vec3, i32 %elt4, i64 4
+  %vec5 = insertelement <8 x i32> %vec4, i32 %elt5, i64 5
+  %vec6 = insertelement <8 x i32> %vec5, i32 %elt6, i64 6
+  %vec7 = insertelement <8 x i32> %vec6, i32 %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x i32> zeroinitializer, <8 x i32> %vec7
+  ret <8 x i32> %res
+}
+
 define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v16i16:
 ; XOP:       # %bb.0:
@@ -262,6 +465,163 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
   ret <16 x i16> %ret15
 }
 
+define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v16i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT:    vpcomgtuw %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuw %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [256,256,256,256,256,256,256,256]
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm4 = [514,514,514,514,514,514,514,514]
+; XOP-NEXT:    vpmacsww %xmm3, %xmm4, %xmm1, %xmm5
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm1
+; XOP-NEXT:    vpperm %xmm5, %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpmaxuw %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxuw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [256,256,256,256,256,256,256,256]
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm7
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: var_shuffle_zero_v16i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLDQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512VLDQ-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpternlogq $202, %ymm3, %ymm0, %ymm1
+; AVX512VLDQ-NEXT:    vpandn %ymm1, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; VLVBMI-LABEL: var_shuffle_zero_v16i16:
+; VLVBMI:       # %bb.0:
+; VLVBMI-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT:    retq
+  %cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
+  %idx0 = extractelement <16 x i16> %or, i64 0
+  %idx1 = extractelement <16 x i16> %or, i64 1
+  %idx2 = extractelement <16 x i16> %or, i64 2
+  %idx3 = extractelement <16 x i16> %or, i64 3
+  %idx4 = extractelement <16 x i16> %or, i64 4
+  %idx5 = extractelement <16 x i16> %or, i64 5
+  %idx6 = extractelement <16 x i16> %or, i64 6
+  %idx7 = extractelement <16 x i16> %or, i64 7
+  %idx8 = extractelement <16 x i16> %or, i64 8
+  %idx9 = extractelement <16 x i16> %or, i64 9
+  %idxA = extractelement <16 x i16> %or, i64 10
+  %idxB = extractelement <16 x i16> %or, i64 11
+  %idxC = extractelement <16 x i16> %or, i64 12
+  %idxD = extractelement <16 x i16> %or, i64 13
+  %idxE = extractelement <16 x i16> %or, i64 14
+  %idxF = extractelement <16 x i16> %or, i64 15
+  %elt0 = extractelement <16 x i16> %v, i16 %idx0
+  %elt1 = extractelement <16 x i16> %v, i16 %idx1
+  %elt2 = extractelement <16 x i16> %v, i16 %idx2
+  %elt3 = extractelement <16 x i16> %v, i16 %idx3
+  %elt4 = extractelement <16 x i16> %v, i16 %idx4
+  %elt5 = extractelement <16 x i16> %v, i16 %idx5
+  %elt6 = extractelement <16 x i16> %v, i16 %idx6
+  %elt7 = extractelement <16 x i16> %v, i16 %idx7
+  %elt8 = extractelement <16 x i16> %v, i16 %idx8
+  %elt9 = extractelement <16 x i16> %v, i16 %idx9
+  %eltA = extractelement <16 x i16> %v, i16 %idxA
+  %eltB = extractelement <16 x i16> %v, i16 %idxB
+  %eltC = extractelement <16 x i16> %v, i16 %idxC
+  %eltD = extractelement <16 x i16> %v, i16 %idxD
+  %eltE = extractelement <16 x i16> %v, i16 %idxE
+  %eltF = extractelement <16 x i16> %v, i16 %idxF
+  %vec0 = insertelement <16 x i16> poison, i16 %elt0, i64 0
+  %vec1 = insertelement <16 x i16> %vec0, i16 %elt1, i64 1
+  %vec2 = insertelement <16 x i16> %vec1, i16 %elt2, i64 2
+  %vec3 = insertelement <16 x i16> %vec2, i16 %elt3, i64 3
+  %vec4 = insertelement <16 x i16> %vec3, i16 %elt4, i64 4
+  %vec5 = insertelement <16 x i16> %vec4, i16 %elt5, i64 5
+  %vec6 = insertelement <16 x i16> %vec5, i16 %elt6, i64 6
+  %vec7 = insertelement <16 x i16> %vec6, i16 %elt7, i64 7
+  %vec8 = insertelement <16 x i16> %vec7, i16 %elt8, i64 8
+  %vec9 = insertelement <16 x i16> %vec8, i16 %elt9, i64 9
+  %vecA = insertelement <16 x i16> %vec9, i16 %eltA, i64 10
+  %vecB = insertelement <16 x i16> %vecA, i16 %eltB, i64 11
+  %vecC = insertelement <16 x i16> %vecB, i16 %eltC, i64 12
+  %vecD = insertelement <16 x i16> %vecC, i16 %eltD, i64 13
+  %vecE = insertelement <16 x i16> %vecD, i16 %eltE, i64 14
+  %vecF = insertelement <16 x i16> %vecE, i16 %eltF, i64 15
+  %res = select <16 x i1> %cmp, <16 x i16> zeroinitializer, <16 x i16> %vecF
+  ret <16 x i16> %res
+}
+
 define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v32i8:
 ; XOP:       # %bb.0:
@@ -431,6 +791,202 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
   ret <32 x i8> %ret31
 }
 
+define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v32i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; XOP-NEXT:    vpcomgtub %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtub %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOP-NEXT:    vpperm %xmm3, %xmm4, %xmm0, %xmm3
+; XOP-NEXT:    vpperm %xmm1, %xmm4, %xmm0, %xmm0
+; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT:    vpmaxub %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxub %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm7
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: var_shuffle_zero_v32i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpternlogq $202, %ymm3, %ymm0, %ymm1
+; AVX512VLDQ-NEXT:    vpandn %ymm1, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
+; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k2}
+; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; VLVBMI-LABEL: var_shuffle_zero_v32i8:
+; VLVBMI:       # %bb.0:
+; VLVBMI-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT:    retq
+  %cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
+  %or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
+  %idx00 = extractelement <32 x i8> %or, i64 0
+  %idx01 = extractelement <32 x i8> %or, i64 1
+  %idx02 = extractelement <32 x i8> %or, i64 2
+  %idx03 = extractelement <32 x i8> %or, i64 3
+  %idx04 = extractelement <32 x i8> %or, i64 4
+  %idx05 = extractelement <32 x i8> %or, i64 5
+  %idx06 = extractelement <32 x i8> %or, i64 6
+  %idx07 = extractelement <32 x i8> %or, i64 7
+  %idx08 = extractelement <32 x i8> %or, i64 8
+  %idx09 = extractelement <32 x i8> %or, i64 9
+  %idx0A = extractelement <32 x i8> %or, i64 10
+  %idx0B = extractelement <32 x i8> %or, i64 11
+  %idx0C = extractelement <32 x i8> %or, i64 12
+  %idx0D = extractelement <32 x i8> %or, i64 13
+  %idx0E = extractelement <32 x i8> %or, i64 14
+  %idx0F = extractelement <32 x i8> %or, i64 15
+  %idx10 = extractelement <32 x i8> %or, i64 16
+  %idx11 = extractelement <32 x i8> %or, i64 17
+  %idx12 = extractelement <32 x i8> %or, i64 18
+  %idx13 = extractelement <32 x i8> %or, i64 19
+  %idx14 = extractelement <32 x i8> %or, i64 20
+  %idx15 = extractelement <32 x i8> %or, i64 21
+  %idx16 = extractelement <32 x i8> %or, i64 22
+  %idx17 = extractelement <32 x i8> %or, i64 23
+  %idx18 = extractelement <32 x i8> %or, i64 24
+  %idx19 = extractelement <32 x i8> %or, i64 25
+  %idx1A = extractelement <32 x i8> %or, i64 26
+  %idx1B = extractelement <32 x i8> %or, i64 27
+  %idx1C = extractelement <32 x i8> %or, i64 28
+  %idx1D = extractelement <32 x i8> %or, i64 29
+  %idx1E = extractelement <32 x i8> %or, i64 30
+  %idx1F = extractelement <32 x i8> %or, i64 31
+  %elt00 = extractelement <32 x i8> %v, i8 %idx00
+  %elt01 = extractelement <32 x i8> %v, i8 %idx01
+  %elt02 = extractelement <32 x i8> %v, i8 %idx02
+  %elt03 = extractelement <32 x i8> %v, i8 %idx03
+  %elt04 = extractelement <32 x i8> %v, i8 %idx04
+  %elt05 = extractelement <32 x i8> %v, i8 %idx05
+  %elt06 = extractelement <32 x i8> %v, i8 %idx06
+  %elt07 = extractelement <32 x i8> %v, i8 %idx07
+  %elt08 = extractelement <32 x i8> %v, i8 %idx08
+  %elt09 = extractelement <32 x i8> %v, i8 %idx09
+  %elt0A = extractelement <32 x i8> %v, i8 %idx0A
+  %elt0B = extractelement <32 x i8> %v, i8 %idx0B
+  %elt0C = extractelement <32 x i8> %v, i8 %idx0C
+  %elt0D = extractelement <32 x i8> %v, i8 %idx0D
+  %elt0E = extractelement <32 x i8> %v, i8 %idx0E
+  %elt0F = extractelement <32 x i8> %v, i8 %idx0F
+  %elt10 = extractelement <32 x i8> %v, i8 %idx10
+  %elt11 = extractelement <32 x i8> %v, i8 %idx11
+  %elt12 = extractelement <32 x i8> %v, i8 %idx12
+  %elt13 = extractelement <32 x i8> %v, i8 %idx13
+  %elt14 = extractelement <32 x i8> %v, i8 %idx14
+  %elt15 = extractelement <32 x i8> %v, i8 %idx15
+  %elt16 = extractelement <32 x i8> %v, i8 %idx16
+  %elt17 = extractelement <32 x i8> %v, i8 %idx17
+  %elt18 = extractelement <32 x i8> %v, i8 %idx18
+  %elt19 = extractelement <32 x i8> %v, i8 %idx19
+  %elt1A = extractelement <32 x i8> %v, i8 %idx1A
+  %elt1B = extractelement <32 x i8> %v, i8 %idx1B
+  %elt1C = extractelement <32 x i8> %v, i8 %idx1C
+  %elt1D = extractelement <32 x i8> %v, i8 %idx1D
+  %elt1E = extractelement <32 x i8> %v, i8 %idx1E
+  %elt1F = extractelement <32 x i8> %v, i8 %idx1F
+  %vec00 = insertelement <32 x i8> poison, i8 %elt00, i64 0
+  %vec01 = insertelement <32 x i8> %vec00, i8 %elt01, i64 1
+  %vec02 = insertelement <32 x i8> %vec01, i8 %elt02, i64 2
+  %vec03 = insertelement <32 x i8> %vec02, i8 %elt03, i64 3
+  %vec04 = insertelement <32 x i8> %vec03, i8 %elt04, i64 4
+  %vec05 = insertelement <32 x i8> %vec04, i8 %elt05, i64 5
+  %vec06 = insertelement <32 x i8> %vec05, i8 %elt06, i64 6
+  %vec07 = insertelement <32 x i8> %vec06, i8 %elt07, i64 7
+  %vec08 = insertelement <32 x i8> %vec07, i8 %elt08, i64 8
+  %vec09 = insertelement <32 x i8> %vec08, i8 %elt09, i64 9
+  %vec0A = insertelement <32 x i8> %vec09, i8 %elt0A, i64 10
+  %vec0B = insertelement <32 x i8> %vec0A, i8 %elt0B, i64 11
+  %vec0C = insertelement <32 x i8> %vec0B, i8 %elt0C, i64 12
+  %vec0D = insertelement <32 x i8> %vec0C, i8 %elt0D, i64 13
+  %vec0E = insertelement <32 x i8> %vec0D, i8 %elt0E, i64 14
+  %vec0F = insertelement <32 x i8> %vec0E, i8 %elt0F, i64 15
+  %vec10 = insertelement <32 x i8> %vec0F, i8 %elt10, i64 16
+  %vec11 = insertelement <32 x i8> %vec10, i8 %elt11, i64 17
+  %vec12 = insertelement <32 x i8> %vec11, i8 %elt12, i64 18
+  %vec13 = insertelement <32 x i8> %vec12, i8 %elt13, i64 19
+  %vec14 = insertelement <32 x i8> %vec13, i8 %elt14, i64 20
+  %vec15 = insertelement <32 x i8> %vec14, i8 %elt15, i64 21
+  %vec16 = insertelement <32 x i8> %vec15, i8 %elt16, i64 22
+  %vec17 = insertelement <32 x i8> %vec16, i8 %elt17, i64 23
+  %vec18 = insertelement <32 x i8> %vec17, i8 %elt18, i64 24
+  %vec19 = insertelement <32 x i8> %vec18, i8 %elt19, i64 25
+  %vec1A = insertelement <32 x i8> %vec19, i8 %elt1A, i64 26
+  %vec1B = insertelement <32 x i8> %vec1A, i8 %elt1B, i64 27
+  %vec1C = insertelement <32 x i8> %vec1B, i8 %elt1C, i64 28
+  %vec1D = insertelement <32 x i8> %vec1C, i8 %elt1D, i64 29
+  %vec1E = insertelement <32 x i8> %vec1D, i8 %elt1E, i64 30
+  %vec1F = insertelement <32 x i8> %vec1E, i8 %elt1F, i64 31
+  %res = select <32 x i1> %cmp, <32 x i8> zeroinitializer, <32 x i8> %vec1F
+  ret <32 x i8> %res
+}
+
 define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v4f64:
 ; XOP:       # %bb.0:
@@ -498,6 +1054,111 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
   ret <4 x double> %ret3
 }
 
+define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v4f64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,3]
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm4, %ymm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT:    vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k2
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
+  %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
+  %idx0 = extractelement <4 x i64> %or, i64 0
+  %idx1 = extractelement <4 x i64> %or, i64 1
+  %idx2 = extractelement <4 x i64> %or, i64 2
+  %idx3 = extractelement <4 x i64> %or, i64 3
+  %elt0 = extractelement <4 x double> %v, i64 %idx0
+  %elt1 = extractelement <4 x double> %v, i64 %idx1
+  %elt2 = extractelement <4 x double> %v, i64 %idx2
+  %elt3 = extractelement <4 x double> %v, i64 %idx3
+  %vec0 = insertelement <4 x double> poison, double %elt0, i64 0
+  %vec1 = insertelement <4 x double> %vec0, double %elt1, i64 1
+  %vec2 = insertelement <4 x double> %vec1, double %elt2, i64 2
+  %vec3 = insertelement <4 x double> %vec2, double %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x double> zeroinitializer, <4 x double> %vec3
+  ret <4 x double> %res
+}
+
 define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8f32:
 ; XOP:       # %bb.0:
@@ -550,6 +1211,104 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
   ret <8 x float> %ret7
 }
 
+define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v8f32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm3, %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
+  %idx0 = extractelement <8 x i32> %or, i64 0
+  %idx1 = extractelement <8 x i32> %or, i64 1
+  %idx2 = extractelement <8 x i32> %or, i64 2
+  %idx3 = extractelement <8 x i32> %or, i64 3
+  %idx4 = extractelement <8 x i32> %or, i64 4
+  %idx5 = extractelement <8 x i32> %or, i64 5
+  %idx6 = extractelement <8 x i32> %or, i64 6
+  %idx7 = extractelement <8 x i32> %or, i64 7
+  %elt0 = extractelement <8 x float> %v, i32 %idx0
+  %elt1 = extractelement <8 x float> %v, i32 %idx1
+  %elt2 = extractelement <8 x float> %v, i32 %idx2
+  %elt3 = extractelement <8 x float> %v, i32 %idx3
+  %elt4 = extractelement <8 x float> %v, i32 %idx4
+  %elt5 = extractelement <8 x float> %v, i32 %idx5
+  %elt6 = extractelement <8 x float> %v, i32 %idx6
+  %elt7 = extractelement <8 x float> %v, i32 %idx7
+  %vec0 = insertelement <8 x float> poison, float %elt0, i64 0
+  %vec1 = insertelement <8 x float> %vec0, float %elt1, i64 1
+  %vec2 = insertelement <8 x float> %vec1, float %elt2, i64 2
+  %vec3 = insertelement <8 x float> %vec2, float %elt3, i64 3
+  %vec4 = insertelement <8 x float> %vec3, float %elt4, i64 4
+  %vec5 = insertelement <8 x float> %vec4, float %elt5, i64 5
+  %vec6 = insertelement <8 x float> %vec5, float %elt6, i64 6
+  %vec7 = insertelement <8 x float> %vec6, float %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x float> zeroinitializer, <8 x float> %vec7
+  ret <8 x float> %res
+}
+
 ;
 ; PR35820 - Unequal source/destination vector sizes
 ;
diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
index 6857101d3d75b..b107b1c2749cc 100644
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -25,6 +25,54 @@ declare <5 x float> @llvm.tan.v5f32(<5 x float>)
 declare <6 x float> @llvm.tan.v6f32(<6 x float>)
 declare <3 x double> @llvm.tan.v3f64(<3 x double>)
 
+declare <1 x float> @llvm.acos.v1f32(<1 x float>)
+declare <2 x float> @llvm.acos.v2f32(<2 x float>)
+declare <3 x float> @llvm.acos.v3f32(<3 x float>)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)
+declare <5 x float> @llvm.acos.v5f32(<5 x float>)
+declare <6 x float> @llvm.acos.v6f32(<6 x float>)
+declare <3 x double> @llvm.acos.v3f64(<3 x double
+>)
+declare <1 x float> @llvm.asin.v1f32(<1 x float>)
+declare <2 x float> @llvm.asin.v2f32(<2 x float>)
+declare <3 x float> @llvm.asin.v3f32(<3 x float>)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)
+declare <5 x float> @llvm.asin.v5f32(<5 x float>)
+declare <6 x float> @llvm.asin.v6f32(<6 x float>)
+declare <3 x double> @llvm.asin.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.atan.v1f32(<1 x float>)
+declare <2 x float> @llvm.atan.v2f32(<2 x float>)
+declare <3 x float> @llvm.atan.v3f32(<3 x float>)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>)
+declare <5 x float> @llvm.atan.v5f32(<5 x float>)
+declare <6 x float> @llvm.atan.v6f32(<6 x float>)
+declare <3 x double> @llvm.atan.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.cosh.v1f32(<1 x float>)
+declare <2 x float> @llvm.cosh.v2f32(<2 x float>)
+declare <3 x float> @llvm.cosh.v3f32(<3 x float>)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)
+declare <5 x float> @llvm.cosh.v5f32(<5 x float>)
+declare <6 x float> @llvm.cosh.v6f32(<6 x float>)
+declare <3 x double> @llvm.cosh.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.sinh.v1f32(<1 x float>)
+declare <2 x float> @llvm.sinh.v2f32(<2 x float>)
+declare <3 x float> @llvm.sinh.v3f32(<3 x float>)
+declare <4 x float> @llvm.sinh.v4f32(<4 x float>)
+declare <5 x float> @llvm.sinh.v5f32(<5 x float>)
+declare <6 x float> @llvm.sinh.v6f32(<6 x float>)
+declare <3 x double> @llvm.sinh.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.tanh.v1f32(<1 x float>)
+declare <2 x float> @llvm.tanh.v2f32(<2 x float>)
+declare <3 x float> @llvm.tanh.v3f32(<3 x float>)
+declare <4 x float> @llvm.tanh.v4f32(<4 x float>)
+declare <5 x float> @llvm.tanh.v5f32(<5 x float>)
+declare <6 x float> @llvm.tanh.v6f32(<6 x float>)
+declare <3 x double> @llvm.tanh.v3f64(<3 x double>)
+
 ; Verify that all of the potential libcall candidates are handled.
 ; Some of these have custom lowering, so those cases won't have
 ; libcalls.
@@ -432,6 +480,1170 @@ define <3 x double> @tan_v3f64(<3 x double> %x) nounwind {
   ret <3 x double> %r
 }
 
+define <1 x float> @acos_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: acos_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.acos.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @acos_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: acos_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.acos.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @acos_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: acos_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.acos.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @acos_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: acos_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.acos.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @acos_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: acos_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.acos.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @acos_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: acos_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq acosf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.acos.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @acos_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: acos_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acos@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq acos@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq acos@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.acos.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
+define <1 x float> @asin_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: asin_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.asin.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @asin_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: asin_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.asin.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @asin_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: asin_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.asin.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @asin_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: asin_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.asin.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @asin_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: asin_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.asin.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @asin_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: asin_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq asinf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.asin.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @asin_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: asin_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asin@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq asin@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq asin@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.asin.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
+define <1 x float> @atan_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: atan_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.atan.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @atan_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: atan_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.atan.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @atan_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: atan_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.atan.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @atan_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: atan_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.atan.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @atan_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: atan_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.atan.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @atan_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: atan_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq atanf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.atan.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @atan_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: atan_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atan@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq atan@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq atan@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.atan.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
+define <1 x float> @cosh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.cosh.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @cosh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.cosh.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @cosh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.cosh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @cosh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.cosh.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @cosh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.cosh.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @cosh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq coshf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.cosh.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @cosh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: cosh_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq cosh@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq cosh@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq cosh@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.cosh.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
+define <1 x float> @sinh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.sinh.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @sinh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.sinh.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @sinh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.sinh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @sinh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.sinh.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @sinh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.sinh.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @sinh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq sinhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.sinh.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @sinh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: sinh_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinh@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq sinh@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq sinh@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.sinh.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
+define <1 x float> @tanh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %r = call <1 x float> @llvm.tanh.v1f32(<1 x float> %x)
+  ret <1 x float> %r
+}
+
+define <2 x float> @tanh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x float> @llvm.tanh.v2f32(<2 x float> %x)
+  ret <2 x float> %r
+}
+
+define <3 x float> @tanh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v3f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x float> @llvm.tanh.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
+define <4 x float> @tanh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.tanh.v4f32(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <5 x float> @tanh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <5 x float> @llvm.tanh.v5f32(<5 x float> %x)
+  ret <5 x float> %r
+}
+
+define <6 x float> @tanh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v6f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq tanhf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <6 x float> @llvm.tanh.v6f32(<6 x float> %x)
+  ret <6 x float> %r
+}
+
+define <3 x double> @tanh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: tanh_v3f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanh@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq tanh@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq tanh@PLT
+; CHECK-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    retq
+  %r = call <3 x double> @llvm.tanh.v3f64(<3 x double> %x)
+  ret <3 x double> %r
+}
+
 define <2 x float> @fabs_v2f32(<2 x float> %x) nounwind {
 ; CHECK-LABEL: fabs_v2f32:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index 672b4591316ce..cd375c0416881 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -20,9 +20,9 @@ define i32 @test0(ptr %v4) nounwind {
 entry:
   %v5 = load <1 x i64>, ptr %v4, align 8
   %v12 = bitcast <1 x i64> %v5 to <4 x i16>
-  %v13 = bitcast <4 x i16> %v12 to x86_mmx
-  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
-  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to <1 x i64>
+  %v14 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v13, i8 -18)
+  %v15 = bitcast <1 x i64> %v14 to <4 x i16>
   %v16 = bitcast <4 x i16> %v15 to <1 x i64>
   %v17 = extractelement <1 x i64> %v16, i32 0
   %v18 = bitcast i64 %v17 to <2 x i32>
@@ -52,12 +52,12 @@ entry:
   %0 = load i32, ptr %ptr, align 4
   %1 = insertelement <2 x i32> undef, i32 %0, i32 0
   %2 = insertelement <2 x i32> %1, i32 0, i32 1
-  %3 = bitcast <2 x i32> %2 to x86_mmx
-  %4 = bitcast x86_mmx %3 to i64
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = bitcast <1 x i64> %3 to i64
   %5 = bitcast i64 %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to x86_mmx
-  %7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24)
-  %8 = bitcast x86_mmx %7 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %6, i8 -24)
+  %8 = bitcast <1 x i64> %7 to <4 x i16>
   %9 = bitcast <4 x i16> %8 to <1 x i64>
   %10 = extractelement <1 x i64> %9, i32 0
   %11 = bitcast i64 %10 to <2 x i32>
@@ -82,9 +82,9 @@ define i32 @test2(ptr nocapture readonly %ptr) nounwind {
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
-  %0 = load x86_mmx, ptr %ptr, align 8
-  %1 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %0, i8 -24)
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %0 = load <1 x i64>, ptr %ptr, align 8
+  %1 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %0, i8 -24)
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   %5 = bitcast i64 %4 to <2 x i32>
@@ -93,40 +93,39 @@ entry:
   ret i32 %6
 }
 
-define i32 @test3(x86_mmx %a) nounwind {
+define i32 @test3(<1 x i64> %a) nounwind {
 ; X86-LABEL: test3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movd %mm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %mm0, %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
-  %tmp0 = bitcast x86_mmx %a to <2 x i32>
+  %tmp0 = bitcast <1 x i64> %a to <2 x i32>
   %tmp1 = extractelement <2 x i32> %tmp0, i32 0
   ret i32 %tmp1
 }
 
 ; Verify we don't muck with extractelts from the upper lane.
-define i32 @test4(x86_mmx %a) nounwind {
+define i32 @test4(<1 x i64> %a) nounwind {
 ; X86-LABEL: test4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movq2dq %mm0, %xmm0
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %rdi, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
-  %tmp0 = bitcast x86_mmx %a to <2 x i32>
+  %tmp0 = bitcast <1 x i64> %a to <2 x i32>
   %tmp1 = extractelement <2 x i32> %tmp0, i32 1
   ret i32 %tmp1
 }
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
 declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 34280aa647aab..176ae81e08a76 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -26,8 +26,8 @@ define void  @t1(i32 %a, ptr %P) nounwind {
  %tmp12 = shl i32 %a, 12
  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
  %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
- %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
- store x86_mmx %tmp23, ptr %P
+ %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
+ store <1 x i64> %tmp23, ptr %P
  ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index cea047453de43..67473febf28c7 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -5,21 +5,20 @@
 ; MMX insertelement is not available; these are promoted to xmm.
 ; (Without SSE they are split to two ints, and the code is much better.)
 
-define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
+define <1 x i64> @mmx_movzl(<1 x i64> %x) nounwind {
 ; X86-LABEL: mmx_movzl:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    movd %eax, %mm0
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mmx_movzl:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    retq
-  %tmp = bitcast x86_mmx %x to <2 x i32>
+  %tmp = bitcast <1 x i64> %x to <2 x i32>
   %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
   %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1
-  %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
-  ret x86_mmx %tmp9
+  %tmp9 = bitcast <2 x i32> %tmp8 to <1 x i64>
+  ret <1 x i64> %tmp9
 }
diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
index f561a2a20e194..f95b34685211d 100644
--- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -3,22 +3,22 @@
 ; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
 
 ; This is not an MMX operation; promoted to xmm.
-define x86_mmx @t0(i32 %A) nounwind {
+define <1 x i64> @t0(i32 %A) nounwind {
 ; X86-LABEL: t0:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
-; X86-NEXT:    pxor %mm0, %mm0
-; X86-NEXT:    punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t0:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    movd %edi, %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X64-NEXT:    psllq $32, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
 ; X64-NEXT:    retq
   %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
-  %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
-  ret x86_mmx %tmp4
+  %tmp4 = bitcast <2 x i32> %tmp3 to <1 x i64>
+  ret <1 x i64> %tmp4
 }
 
 define <8 x i8> @t1(i8 zeroext %x) nounwind {
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index c54a7f4642253..5dcf19013f0b7 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -875,27 +875,15 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ;
 ; GFNISSE-LABEL: test_bitreverse_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: test_bitreverse_v32i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: test_bitreverse_v32i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    retq
-;
-; GFNIAVX512-LABEL: test_bitreverse_v32i8:
-; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    retq
+; GFNIAVX-LABEL: test_bitreverse_v32i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX-NEXT:    retq
   %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %b
 }
@@ -1058,7 +1046,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
@@ -1071,21 +1059,20 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v16i16:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: test_bitreverse_v16i16:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %b
@@ -1258,7 +1245,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
@@ -1271,21 +1258,20 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v8i32:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: test_bitreverse_v8i32:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %b
@@ -1462,7 +1448,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
@@ -1475,21 +1461,20 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v4i64:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: test_bitreverse_v4i64:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   ret <4 x i64> %b
@@ -1741,7 +1726,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNISSE-LABEL: test_bitreverse_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
@@ -1757,7 +1742,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -2054,7 +2039,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
@@ -2085,7 +2070,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
@@ -2412,7 +2397,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
@@ -2443,7 +2428,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
@@ -2778,7 +2763,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
@@ -2809,7 +2794,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
 ; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index ec7dca4285a35..30202701fdb8c 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1550,26 +1550,40 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
 ;
 ; SSE42-LABEL: select_v2i8:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pxor %xmm0, %xmm1
-; SSE42-NEXT:    ptest %xmm1, %xmm1
+; SSE42-NEXT:    movzwl (%rdi), %eax
+; SSE42-NEXT:    movd %eax, %xmm0
+; SSE42-NEXT:    movzwl (%rsi), %eax
+; SSE42-NEXT:    movd %eax, %xmm1
+; SSE42-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
+; SSE42-NEXT:    movmskpd %xmm0, %eax
+; SSE42-NEXT:    cmpl $3, %eax
 ; SSE42-NEXT:    sete %al
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: select_v2i8:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vptest %xmm0, %xmm0
-; AVX1OR2-NEXT:    sete %al
+; AVX1OR2-NEXT:    movzwl (%rdi), %eax
+; AVX1OR2-NEXT:    vmovd %eax, %xmm0
+; AVX1OR2-NEXT:    movzwl (%rsi), %eax
+; AVX1OR2-NEXT:    vmovd %eax, %xmm1
+; AVX1OR2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vtestpd %xmm1, %xmm0
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512-LABEL: select_v2i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movzwl (%rdi), %eax
-; AVX512-NEXT:    cmpw (%rsi), %ax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    movzwl (%rsi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
+; AVX512-NEXT:    knotw %k0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    testb $3, %al
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    retq
   %v0 = load <2 x i8>, ptr %s0, align 1
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 951bcfa8fc1b7..2df39d69dbb75 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1433,19 +1433,25 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
 ;
 ; SSE42-LABEL: select_v2i8:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pcmpeqq %xmm0, %xmm1
-; SSE42-NEXT:    movmskpd %xmm1, %eax
+; SSE42-NEXT:    movzwl (%rdi), %eax
+; SSE42-NEXT:    movd %eax, %xmm0
+; SSE42-NEXT:    movzwl (%rsi), %eax
+; SSE42-NEXT:    movd %eax, %xmm1
+; SSE42-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
+; SSE42-NEXT:    movmskpd %xmm0, %eax
 ; SSE42-NEXT:    testl %eax, %eax
 ; SSE42-NEXT:    setne %al
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: select_v2i8:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT:    movzwl (%rdi), %eax
+; AVX1OR2-NEXT:    vmovd %eax, %xmm0
+; AVX1OR2-NEXT:    movzwl (%rsi), %eax
+; AVX1OR2-NEXT:    vmovd %eax, %xmm1
+; AVX1OR2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpmovsxbq %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vtestpd %xmm0, %xmm0
 ; AVX1OR2-NEXT:    setne %al
 ; AVX1OR2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index b33cc83ac3f79..c725dcd972cd5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -3931,49 +3931,40 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride6_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm9
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm2
 ; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm12
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm3
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm16
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
@@ -3983,178 +3974,179 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm7
-; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm11
-; AVX512-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6],ymm4[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm4[4,5,6,7]
-; AVX512-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm7[2,1,2,3]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm14
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[12],ymm0[12],ymm10[13],ymm0[13],ymm10[14],ymm0[14],ymm10[15],ymm0[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm6 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm5
+; AVX512-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm17
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-NEXT:    vpermt2d %zmm7, %zmm24, %zmm2
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[1,1,1,1]
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm2
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm5
-; AVX512-NEXT:    vmovdqa (%r8), %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6],ymm7[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm5[0,1,0,1]
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-NEXT:    vpermt2d %zmm7, %zmm24, %zmm9
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,2,2,2]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm10
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-NEXT:    vpermt2d %zmm3, %zmm5, %zmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm9[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT:    vmovdqa (%r8), %xmm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512-NEXT:    vpermt2d %zmm8, %zmm18, %zmm4
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm4 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm8
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm12
+; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm4
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm11
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm1[2,1,2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[0,1,0,1]
 ; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm4 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm6, %ymm6
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-NEXT:    vpermt2d %zmm10, %zmm5, %zmm9
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm11[0,1,2,1]
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12
+; AVX512-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm5 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
 ; AVX512-NEXT:    vpbroadcastq %xmm5, %ymm5
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm11, %ymm11
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm15, %ymm15
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm18, %zmm9
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm17, %zmm10, %zmm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm21, %zmm11
-; AVX512-NEXT:    vpternlogd $184, %zmm20, %zmm10, %zmm11
-; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm16, %zmm10
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm23, %zmm12, %zmm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm12, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm6, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm16, %zmm9, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm13
+; AVX512-NEXT:    vpternlogd $184, %zmm17, %zmm9, %zmm13
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm5, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm4
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm12, %zmm5, %zmm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm3
+; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 320(%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -4168,16 +4160,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7]
 ; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
@@ -4185,17 +4177,17 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,21,10,11,20,13,14,23]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm17, %ymm1
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm16
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm1
@@ -4206,13 +4198,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,2,0,3,10,0,10,11]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[12],ymm9[12],ymm2[13],ymm9[13],ymm2[14],ymm9[14],ymm2[15],ymm9[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm6
@@ -4225,14 +4217,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm11[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm2
+; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm12, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm12, %ymm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm14
@@ -4242,120 +4234,120 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm13
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
 ; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm21, %zmm12
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [1,0,2,2,1,0,2,2]
-; AVX512-FCP-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm23, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm8
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm12 {%k2}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm6
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm8, %ymm22, %ymm12
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,8,8,0,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm4
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm21, %zmm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm23, %ymm4
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm14, %ymm14
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm2 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm4
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm4[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %zmm14, %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512-FCP-NEXT:    vpermt2d %ymm14, %ymm22, %ymm2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm2[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm11
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,1,8,3,4,9,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm21, %ymm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm5 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm5, %ymm21
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm5
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm15
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm4, %zmm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm21, %zmm21
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[8],ymm9[8],ymm2[9],ymm9[9],ymm2[10],ymm9[10],ymm2[11],ymm9[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm7, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm18 {%k1}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm22
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm18, %ymm22
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm10
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm21, %ymm18
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm2, %zmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm8
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,0,10,10,0]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm7, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm5, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm9, %zmm0
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm4, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm18, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm2, %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm22, %zmm0, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm19, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm21, %zmm0, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm19, %zmm0, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, %zmm17, %zmm0, %zmm1
@@ -4370,261 +4362,253 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-LABEL: store_i16_stride6_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm9
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm12
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm5
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm28
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm31
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm26
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm30
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm16
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm25
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm31
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm22
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm16
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm17
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm13
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm15
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[2,2,2,2]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[12],ymm15[12],ymm7[13],ymm15[13],ymm7[14],ymm15[14],ymm7[15],ymm15[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm22
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm3
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm0[0,1,2,3],zmm9[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm18, %zmm12
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm1, %zmm12 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm12, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm18
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm5, %ymm19
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm20 = ymm5[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm5
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm8, %ymm21
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm8[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm10, %zmm3
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,1,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm16[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm9
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[8],ymm15[8],ymm7[9],ymm15[9],ymm7[10],ymm15[10],ymm7[11],ymm15[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm9
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm9, %zmm8, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm5
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm4 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm10, %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm10
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm4, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm5, %ymm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm30[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm19, %zmm26, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm22, %zmm8
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm21, %zmm7, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm25, %zmm24, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm20, %zmm9, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm10
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm23, %zmm9, %zmm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm16, %zmm11, %zmm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm21, %zmm12
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm17, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm22, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm18, %zmm4, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
@@ -4633,63 +4617,62 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm19
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm20, %zmm19
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm17, %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm23
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm17, %ymm23
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm6, %ymm19
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,0,10,10,0]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm21, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,0,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm20, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm13, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm20, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm20, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
@@ -4701,126 +4684,126 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm4, %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm4, %zmm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm22, %ymm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm9[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm14
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,0,3,10,0,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm21, %ymm6
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm14 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm6, %zmm14, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm22, %ymm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm7, %ymm7
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm8, %zmm6, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm6, %zmm9
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm6, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,8,3,4,9,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,8,8,0,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm5, %zmm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm9[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm4, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,8,8,0,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm10[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm4, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm6, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm5, %zmm6
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm18, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm18, %zmm2, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -8334,901 +8317,875 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride6_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $600, %rsp # imm = 0x258
+; AVX512-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm9
 ; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm6
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm4
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm10
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm8
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm5
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm5
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm3
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm5
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm7
+; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm7
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm16
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm18
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm20
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm11
-; AVX512-NEXT:    vpshufb %ymm7, %ymm11, %ymm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm12
+; AVX512-NEXT:    vpshufb %ymm14, %ymm12, %ymm5
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm5
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm29
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15]
-; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm23
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm7
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa %ymm7, %ymm8
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,1,2,3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm0
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm20 = ymm3[1,2,3,3,5,6,7,7]
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11]
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm13
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,3,3,5,6,7,7]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm20, %zmm25, %zmm20
-; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
-; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm15 {%k1}
-; AVX512-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm15, %ymm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm19, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k1}
+; AVX512-NEXT:    vmovdqa (%r8), %ymm7
+; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm13
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm14
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm26
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm14
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-NEXT:    vpermt2d %zmm8, %zmm23, %zmm0
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX512-NEXT:    vmovdqa (%r8), %xmm8
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm11
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11]
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm4
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm29, %zmm6
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm0
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm6 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
-; AVX512-NEXT:    vmovdqa 96(%r8), %xmm2
-; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm8
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm4
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm23, %zmm7
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm0
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
+; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-NEXT:    vpermt2d %zmm3, %zmm31, %zmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm31, %zmm3
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,0,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,0,1]
 ; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k2}
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k2}
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm22
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm0[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm6 {%k2}
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm4 {%k2}
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm28
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm5[2,1,2,3]
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm5[2,1,2,3]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm5
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa %xmm14, %xmm4
-; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm20 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,1,0,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512-NEXT:    vmovdqa %xmm11, %xmm4
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm20
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm29, %zmm6
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512-NEXT:    vpshufb %xmm4, %xmm8, %xmm9
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm2, %zmm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm9[0,1,2,3],zmm14[4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm9
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm10[2,1,2,3]
+; AVX512-NEXT:    vmovdqa32 %zmm11, %zmm9 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm15
+; AVX512-NEXT:    vpshufb %xmm4, %xmm15, %xmm12
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm19
+; AVX512-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm9
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,2,1]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm12
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa 96(%r9), %xmm13
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm14[0,1,0,1]
-; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm14 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm14[2,2,2,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k2}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm9, %ymm18
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm31, %zmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm2[2,1,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k2}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm15[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm15
+; AVX512-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm23 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm31, %zmm1
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm26 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm26, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm3, %ymm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm20, %zmm4
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm24, %zmm16
-; AVX512-NEXT:    vpternlogd $184, %zmm27, %zmm10, %zmm16
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512-NEXT:    vpternlogd $184, %zmm25, %zmm10, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm22, %zmm10, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm18, %zmm10
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm28, %zmm13, %zmm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT:    vpternlogd $184, %zmm30, %zmm13, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm3, %zmm3
-; AVX512-NEXT:    vpternlogd $184, %zmm21, %zmm13, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm1, %zmm1
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm13, %zmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
+; AVX512-NEXT:    vmovdqa 96(%r9), %xmm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm13
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm14, %ymm14
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm25, %zmm24, %zmm24
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm26, %zmm26
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm9, %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm30, %zmm17
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm18, %zmm16
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm23, %zmm18
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
+; AVX512-NEXT:    vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512-NEXT:    vpternlogd $184, %zmm20, %zmm23, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm10, %zmm5
+; AVX512-NEXT:    vpternlogd $184, %zmm19, %zmm23, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm12, %zmm8
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm22, %zmm10, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm6
+; AVX512-NEXT:    vpternlogd $184, %zmm28, %zmm10, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
+; AVX512-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm10, %zmm2
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm10, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm12, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm23, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 704(%rax)
-; AVX512-NEXT:    addq $600, %rsp # imm = 0x258
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 576(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm18, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm26, 512(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm24, 704(%rax)
+; AVX512-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1256, %rsp # imm = 0x4E8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-FCP-NEXT:    subq $1240, %rsp # imm = 0x4D8
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm6
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rsp) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm10, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm23
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm18
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm26, %ymm8
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm8
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm12
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,0,3,10,0,10,11]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11]
 ; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
 ; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm4
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm25, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm5
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm4
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2]
-; AVX512-FCP-NEXT:    # ymm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm31, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
+; AVX512-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm16, %ymm4
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,1,8,3,4,9,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,8,8,0,9]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm20
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm31, %ymm2
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm16, %ymm3
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm5
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm25, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm29, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm4
-; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm1, %zmm25
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm1
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm25[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm10
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm28, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm6
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm25
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm25, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm31, %ymm1
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm6
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm6[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm1
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm16, %ymm7
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm26 = zmm7[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm8, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm4 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm8
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm17, %zmm4
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm26, %ymm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm31, %ymm18
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm18, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm12 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm12, %zmm30
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm16, %ymm12
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,2,3],zmm30[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm12
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm14, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm30
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,0,10,10,0]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm30, %ymm4
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm0, %ymm26
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm31, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm31, %zmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm12 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm12, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm27, %ymm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm31
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,0,10,10,0]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm28, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm28
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm28
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm12, %zmm0
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm7 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
+; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm27, %ymm29
 ; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm13
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm9
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2d %ymm6, %ymm14, %ymm0
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm11
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm12, %zmm10
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512-FCP-NEXT:    vpermt2d %ymm10, %ymm27, %ymm19
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm10
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm10 {%k1}
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm8
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm7
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm10, %ymm27
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm10
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm11
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm12
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm27, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm9, %zmm6
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 448(%rax)
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 640(%rax)
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm29, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 640(%rax)
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm16, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm26, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm17, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm16, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm25, %zmm1, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm25, %zmm1, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
@@ -9244,15 +9201,15 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 704(%rax)
-; AVX512-FCP-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX512-FCP-NEXT:    addq $1240, %rsp # imm = 0x4D8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride6_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $936, %rsp # imm = 0x3A8
+; AVX512DQ-NEXT:    subq $584, %rsp # imm = 0x248
 ; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -9262,7 +9219,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
@@ -9282,22 +9239,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
@@ -9308,57 +9255,47 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm7
-; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm7
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm31
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
@@ -9368,39 +9305,30 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm19
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm28
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm18
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm21
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
@@ -9409,12 +9337,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm18
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
@@ -9425,34 +9352,32 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm4
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm11
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm12
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm9
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm9, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm6, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
@@ -9463,50 +9388,49 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm10
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm7
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm5
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm2[4],ymm14[5],ymm2[5],ymm14[6],ymm2[6],ymm14[7],ymm2[7],ymm14[12],ymm2[12],ymm14[13],ymm2[13],ymm14[14],ymm2[14],ymm14[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm7
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm10
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -9516,252 +9440,257 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm0[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm1
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm15, %zmm25, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm15, %ymm14
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm13
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm24, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm8
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6],ymm8[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm8[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm15
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm16
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm5[2,1,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm15
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm5[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm4, %zmm11
+; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm26
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm24, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm11 {%k1}
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm11, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm4, %zmm13
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm4, %zmm5
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm13 {%k1}
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm6, %zmm4, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm15[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm27
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm9
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm29
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm11
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm24, %zmm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm4[2,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm10 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm7 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm10, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm7[2,1,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm10
-; AVX512DQ-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm10
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm10
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm12
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm12[0,1,2,3],zmm11[4,5,6,7]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm5, %ymm12
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm10[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm12[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm15
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm15[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm7 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm13 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm25
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm23, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm21, %zmm17, %zmm16
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm24, %zmm14
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm5, %zmm5
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm29, %zmm18, %zmm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm19, %zmm9, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm20, %zmm9, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm22, %zmm9, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm13, %zmm10
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm29, %zmm17
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm24, %zmm22
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm23, %zmm24, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm25, %zmm13, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm26, %zmm13, %zmm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm27, %zmm13, %zmm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm13, %zmm8
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 704(%rax)
-; AVX512DQ-NEXT:    addq $936, %rsp # imm = 0x3A8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 640(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 576(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 704(%rax)
+; AVX512DQ-NEXT:    addq $584, %rsp # imm = 0x248
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1224, %rsp # imm = 0x4C8
+; AVX512DQ-FCP-NEXT:    subq $1176, %rsp # imm = 0x498
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
@@ -9771,8 +9700,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
@@ -9782,95 +9711,95 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm28
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm22
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm21
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm24, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm19, %ymm22
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,0,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm21
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,0,1,0,10,10,0]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm4, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm19, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm4, %zmm25 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm10
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm19, %ymm26
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm18, %zmm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9880,332 +9809,329 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm18, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm20, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm25, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm17, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm19, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm7, %ymm19, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm24, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm7
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm19, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm19, %ymm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm20, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm19, %ymm20
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm8
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm27, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm26, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm22, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm30, %ymm8
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm5
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm27, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm30, %ymm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm29, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
+; AVX512DQ-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm16, %ymm3
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm2 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,8,3,4,9,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm3, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm16, %ymm4
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm21
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm27, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm6, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm23, %zmm6
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm30, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm29, %zmm5
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm31, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm6, %zmm2, %zmm5 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm7, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm22, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm29, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm27, %ymm8
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm10 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm8, %zmm10, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm5, %zmm23
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm30, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm31, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm19, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm31, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm26, %ymm7
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm11, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm29, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm31, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm12
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm30, %ymm12
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm13, %zmm11, %zmm10 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm28, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm19, %zmm13
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm13 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm13, %zmm28
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm9, %ymm16, %ymm10
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm16, %ymm13
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm23[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm11[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm18, %zmm10
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm28[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm18, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm9, %zmm11, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm18, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm13, %zmm12, %zmm11 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm14
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm30, %ymm13
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm10, %zmm13, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm10, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm16, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm8, %zmm11
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm29, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm12, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm8, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm27, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm24, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm23, %zmm8, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $1224, %rsp # imm = 0x4C8
+; AVX512DQ-FCP-NEXT:    addq $1176, %rsp # imm = 0x498
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 208ee607909ed..dc362d729fcd3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -2850,8 +2850,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vporq %ymm1, %ymm4, %ymm17
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
 ; AVX512-NEXT:    vporq %ymm5, %ymm10, %ymm19
 ; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm5
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -2982,8 +2982,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
 ; AVX512-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm18
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
@@ -3112,8 +3112,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vporq %ymm1, %ymm4, %ymm17
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
 ; AVX512DQ-NEXT:    vporq %ymm5, %ymm10, %ymm19
 ; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm5
 ; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -3244,8 +3244,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm18
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
@@ -5893,1237 +5893,1257 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $632, %rsp # imm = 0x278
+; AVX512-NEXT:    subq $680, %rsp # imm = 0x2A8
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm9
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm10
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
-; AVX512-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm11, %ymm3
+; AVX512-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm17
 ; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm10
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
+; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512-NEXT:    vpshufb %ymm12, %ymm6, %ymm2
-; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
-; AVX512-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
+; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
+; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
-; AVX512-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
-; AVX512-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm15
+; AVX512-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
+; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
+; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
 ; AVX512-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm10, %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10]
-; AVX512-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
+; AVX512-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm31
-; AVX512-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm28
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512-NEXT:    vprold $16, %ymm13, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm28
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512-NEXT:    vprold $16, %ymm13, %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm29[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
-; AVX512-NEXT:    vmovdqa %ymm15, %ymm8
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm9
-; AVX512-NEXT:    vprold $16, %xmm9, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX512-NEXT:    vprold $16, %xmm5, %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm4
+; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm15
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
-; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm6
-; AVX512-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm7
-; AVX512-NEXT:    vpshufb %ymm7, %ymm6, %ymm10
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm10
-; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm13
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512-NEXT:    vprold $16, %xmm14, %xmm14
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm12
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm30[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm23[0,2,2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm2
+; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm27
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
+; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm14
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512-NEXT:    vprold $16, %ymm16, %ymm1
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm6 = mem[2,1,3,2]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,2,2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
+; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[2,1,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
+; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
+; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm28, %zmm30, %zmm29
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm30, %zmm9
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm12 = mem[0,0,1,1]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm17 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm18 = mem[2,1,3,2]
+; AVX512-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm19 = mem[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm18[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm5[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14,15]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm12, %zmm11, %zmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm31, %zmm1
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm9 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm9
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm12 = mem[2,2,2,3]
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm15 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm17 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm8 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm26[0,0,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm13
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm24[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm22[2,1,3,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm21[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm9
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm18, %zmm8
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm8
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm1
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm9, %zmm9
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm29[0,1,2,3]
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm8
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
-; AVX512-NEXT:    vpbroadcastd (%rax), %ymm9
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd (%rax), %zmm2, %zmm2
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
 ; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20
+; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3
+; AVX512-NEXT:    vpbroadcastd (%rax), %ymm4
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd (%rax), %zmm6, %zmm6
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 384(%rax)
-; AVX512-NEXT:    addq $632, %rsp # imm = 0x278
+; AVX512-NEXT:    vmovdqa64 %zmm27, 384(%rax)
+; AVX512-NEXT:    addq $680, %rsp # imm = 0x2A8
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $248, %rsp
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm10
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm9
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
 ; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm7
-; AVX512-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm15
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm8
+; AVX512-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm22
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm30
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm24
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm25
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm20
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm7
-; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm4, %zmm22
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm21
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm23
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm6
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm26
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm6
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm27
-; AVX512-FCP-NEXT:    vprold $16, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm2, %zmm25
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm3, %zmm28
+; AVX512-FCP-NEXT:    vprold $16, %ymm15, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm5, %zmm31
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm2
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm9
+; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm10
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm21, %ymm2
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm13
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7,8,9],ymm13[10],ymm3[11,12],ymm13[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm15, %ymm13
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm6
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm11
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm2, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm30
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm13
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm22, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm22, %ymm4
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm12
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm20, %zmm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm13, %ymm8
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm8
+; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm14, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm15
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm16, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
-; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm8, %zmm7
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
-; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm6
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0
-; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm7
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
+; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm3, %ymm3
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2]
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7,8],ymm0[9],ymm14[10,11],ymm0[12],ymm14[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm14
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7]
-; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm8, %zmm9
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm23, %zmm2, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm31
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm31
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm22, %zmm5
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm16, %zmm21
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm4, %zmm4
 ; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm16
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2
+; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512-FCP-NEXT:    addq $248, %rsp
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512-FCP-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $632, %rsp # imm = 0x278
+; AVX512DQ-NEXT:    subq $680, %rsp # imm = 0x2A8
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm9
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm1, %ymm15
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm10
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm11, %ymm3
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
 ; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
-; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm10
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
+; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm10, %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm31
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm28
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm28
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm29[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm18
-; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm8
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm9
-; AVX512DQ-NEXT:    vprold $16, %xmm9, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm15
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm6
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm7
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm10
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm13
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512DQ-NEXT:    vprold $16, %xmm14, %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm30[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm23[0,2,2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm2
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm27
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512DQ-NEXT:    vprold $16, %ymm16, %ymm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm6 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm28, %zmm30, %zmm29
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm30, %zmm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm12 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm17 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm18 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm19 = mem[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm18[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm5[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm12, %zmm11, %zmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm31, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm9 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm12 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm15 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm17 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm8 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm26[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm24[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm22[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm21[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm18, %zmm8
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm8
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm1
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm29[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm8
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
-; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm9
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm2, %zmm2
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm4
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3
+; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm4
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd (%rax), %zmm6, %zmm6
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rax)
-; AVX512DQ-NEXT:    addq $632, %rsp # imm = 0x278
+; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 384(%rax)
+; AVX512DQ-NEXT:    addq $680, %rsp # imm = 0x2A8
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $248, %rsp
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm8
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm30
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm24
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm25
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm4, %zmm22
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm23
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm26
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm27
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm2, %zmm25
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm3, %zmm28
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm15, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm5, %zmm31
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm9
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7,8,9],ymm13[10],ymm3[11,12],ymm13[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm15, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm6
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm11
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm2, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm30
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm22, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm22, %ymm4
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm20, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm8
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm14, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm16, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm8, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0
-; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm2, %zmm8
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7,8],ymm0[9],ymm14[10,11],ymm0[12],ymm14[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm14
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm8, %zmm9
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm23, %zmm2, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm31
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm31
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm22, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm16, %zmm21
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm4, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm16
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $248, %rsp
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -12619,2682 +12639,2656 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $2168, %rsp # imm = 0x878
-; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm9
+; AVX512-NEXT:    subq $2808, %rsp # imm = 0xAF8
+; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm6
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm13
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm9, %ymm2
-; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm16
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm7, %ymm2
-; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm17
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
+; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
+; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm18
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
 ; AVX512-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa %ymm3, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
 ; AVX512-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm6
-; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm3
+; AVX512-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT:    vpshufb %ymm13, %ymm4, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
-; AVX512-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa (%r8), %ymm3
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm22
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
-; AVX512-NEXT:    vpor %ymm0, %ymm11, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
+; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vpshufb %ymm13, %ymm11, %ymm13
-; AVX512-NEXT:    vpor %ymm12, %ymm13, %ymm10
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-NEXT:    vpshufb %ymm15, %ymm13, %ymm12
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm14
-; AVX512-NEXT:    vpor %ymm14, %ymm12, %ymm10
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm15, %ymm12
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm5
+; AVX512-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm1
+; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm0
+; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
+; AVX512-NEXT:    vpor %ymm10, %ymm9, %ymm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm31
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15]
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11]
-; AVX512-NEXT:    vpermi2q %zmm10, %zmm12, %zmm14
-; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm14
-; AVX512-NEXT:    vpshufb %ymm12, %ymm14, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm9
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
 ; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm17, %zmm7
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm10
+; AVX512-NEXT:    vprold $16, %ymm9, %ymm8
+; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm6
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero
-; AVX512-NEXT:    vpternlogq $248, %ymm26, %ymm7, %ymm8
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm12
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm3, %ymm12, %ymm9
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $248, %ymm14, %ymm8, %ymm9
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm7
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $248, %ymm11, %ymm7, %ymm6
+; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm12
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $248, %ymm11, %ymm6, %ymm9
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq $184, %ymm7, %ymm10, %ymm8
-; AVX512-NEXT:    vprold $16, %ymm12, %ymm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpternlogq $184, %ymm6, %ymm10, %ymm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm7, %ymm8, %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 96(%rax), %ymm7
-; AVX512-NEXT:    vpermd %zmm7, %zmm18, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT:    vpandnq %ymm9, %ymm26, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT:    vmovdqa 96(%rax), %ymm6
+; AVX512-NEXT:    vpermd %zmm6, %zmm18, %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm7
-; AVX512-NEXT:    vpandnq %ymm7, %ymm16, %ymm9
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX512-NEXT:    vpandn %ymm7, %ymm12, %ymm7
+; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm6
+; AVX512-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
 ; AVX512-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm9
-; AVX512-NEXT:    vpandnq %ymm9, %ymm16, %ymm9
-; AVX512-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm12
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm24
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm9
-; AVX512-NEXT:    vpshufb %ymm10, %ymm9, %ymm10
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm8
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm17
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm20
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm15
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm9
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm10
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm8
+; AVX512-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
+; AVX512-NEXT:    vmovdqa (%rax), %ymm12
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm29
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm24
+; AVX512-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm25
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
+; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm12
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm16
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
+; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm15
+; AVX512-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512-NEXT:    vpandnq %ymm12, %ymm26, %ymm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm10
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
+; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm31
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vprold $16, %ymm0, %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermd %zmm9, %zmm18, %zmm0
+; AVX512-NEXT:    vpermd %zmm15, %zmm18, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm12
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm11
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm8
-; AVX512-NEXT:    vprold $16, %ymm21, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10]
-; AVX512-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd 64(%rax), %zmm11, %zmm0
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm26, %zmm0
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vprold $16, %ymm5, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm2
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm12
+; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm4, %zmm12, %zmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm4
-; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512-NEXT:    vprold $16, %xmm3, %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm23
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm4
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm5
-; AVX512-NEXT:    vmovdqa %xmm6, %xmm7
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
-; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm14, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm1
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm8
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
+; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm1
+; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm4, %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm25
-; AVX512-NEXT:    vprold $16, %xmm4, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm2
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm17
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm6, %zmm1
-; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm8, %xmm14
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3],xmm1[4],xmm5[5,6],xmm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm5, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm19
-; AVX512-NEXT:    vprold $16, %xmm5, %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512-NEXT:    vprold $16, %xmm10, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm2
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm0, %zmm14
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm3, %zmm6, %zmm1
-; AVX512-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
+; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512-NEXT:    vpbroadcastd 68(%rax), %ymm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm19, %zmm25
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512-NEXT:    vprold $16, %xmm21, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm0, %zmm5
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512-NEXT:    vpbroadcastd (%rax), %ymm1
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm19, %zmm20
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vprold $16, %ymm2, %ymm1
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15]
-; AVX512-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512-NEXT:    vpermd (%rax), %zmm11, %zmm1
-; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm26, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm11, %zmm12, %zmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm24[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm26
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm15, %zmm26
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX512-NEXT:    vprold $16, %xmm12, %xmm15
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm23 = mem[0,2,2,3]
-; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm7 = mem[0,2,2,3]
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm17 = mem[2,1,3,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm29, %zmm4
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm0
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm28
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3],xmm4[4],xmm12[5,6],xmm4[7]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[3,3,3,3]
-; AVX512-NEXT:    vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm4 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm6 = mem[2,2,3,3]
-; AVX512-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[2,2,2,3]
-; AVX512-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm27 = mem[2,1,3,3]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm20 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm11 = mem[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm30[2,1,3,2]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm22[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm21[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,1,3,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm23
-; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm7, %zmm24
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm23, %zmm30, %zmm24
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm24, %ymm7, %ymm0
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm14
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm10, %zmm23, %zmm8
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm17, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm10
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm17, %zmm10
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqa %ymm3, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512-NEXT:    vprold $16, %ymm7, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512-NEXT:    vprold $16, %xmm10, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
+; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm27
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm21
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21
-; AVX512-NEXT:    vextracti64x4 $1, %zmm24, %ymm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm3, %ymm29
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm4, %zmm1
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = mem[2,1,3,2]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm1, %ymm29, %ymm3
+; AVX512-NEXT:    vpternlogq $184, %ymm3, %ymm28, %ymm30
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm2
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm29
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm20, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm19, %zmm3
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm25, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm5
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm5
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm2, %zmm20
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm25
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm31, %zmm3
-; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm30, %zmm3
-; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[0,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm4 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm4 = mem[2,1,3,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = mem[0,0,1,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm6 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm19 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm9 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm11 = mem[0,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm12 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = mem[2,1,3,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm14 = mem[0,0,1,1]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm15 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm16 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm11, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm11
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm11
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm12
-; AVX512-NEXT:    vpternlogq $184, %zmm11, %zmm2, %zmm26
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm4
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm16, %zmm1
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm23, %zmm28, %zmm18
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm18
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm23
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm23
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm1, %ymm2
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm30
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm11
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm14, %zmm1, %zmm25
+; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm1, %zmm20
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm2
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm22
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
+; AVX512-NEXT:    vpermd (%rax), %zmm14, %zmm14
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm14
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm5
+; AVX512-NEXT:    vpternlogq $184, %zmm22, %zmm1, %zmm14
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm2
-; AVX512-NEXT:    vpternlogq $184, %zmm1, %zmm0, %zmm28
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $184, %zmm1, %zmm4, %zmm2
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm22
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm8
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm7 = mem[2,2,2,3]
+; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
+; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm16 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm21 = mem[0,0,1,1]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm24 = mem[2,2,2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
+; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm27 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm9
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512-NEXT:    vpternlogd $184, %zmm3, %zmm29, %zmm7
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm22, %zmm3, %zmm9
+; AVX512-NEXT:    vpternlogq $184, %zmm8, %zmm3, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm28, %zmm3
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vpternlogq $184, %zmm8, %zmm1, %zmm31
+; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm1, %zmm13
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm28, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm26, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm25, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 704(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm12, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm20, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm22, 768(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm29, 832(%rax)
-; AVX512-NEXT:    addq $2168, %rsp # imm = 0x878
+; AVX512-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm14, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm23, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm31, 704(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 576(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm18, 512(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm30, 832(%rax)
+; AVX512-NEXT:    addq $2808, %rsp # imm = 0xAF8
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    subq $1496, %rsp # imm = 0x5D8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm28
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm4, %ymm18
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm13, %ymm19
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm12, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpor %ymm12, %ymm14, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm13
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm12
-; AVX512-FCP-NEXT:    vporq %ymm5, %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7,8,9],ymm3[10],ymm11[11,12],ymm3[13],ymm11[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm17, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm11
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,u,u,u,u],zero,zero
-; AVX512-FCP-NEXT:    vpternlogq $248, %ymm25, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm5
-; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $248, %ymm12, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm17, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512-FCP-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
+; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm9
+; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
+; AVX512-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm22
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
+; AVX512-FCP-NEXT:    vpternlogq $248, %ymm13, %ymm12, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $248, %ymm16, %ymm11, %ymm6
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm11
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512-FCP-NEXT:    vprold $16, %ymm10, %ymm11
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm12, %ymm10, %ymm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm0
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpandnq %ymm1, %ymm25, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
+; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpandn %ymm11, %ymm13, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm11
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm16
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm29
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm27
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm1
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
-; AVX512-FCP-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm0
-; AVX512-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm3
+; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm25
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm3
+; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm5
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm14, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm19
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm9, %ymm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm19, %zmm2
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT:    vprold $16, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm12, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm25, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm19, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm19
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm27, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm12
+; AVX512-FCP-NEXT:    vprold $16, %ymm26, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm22
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm30
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm18, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm7
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, %xmm8
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
 ; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm6, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX512-FCP-NEXT:    vprold $16, %xmm6, %xmm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
 ; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm11, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm11, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm5, %zmm9
-; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm23
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm2
+; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
+; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm21, %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm31
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm29
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vprold $16, %xmm13, %xmm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm3, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
 ; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
 ; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm20
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm4, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm9, %zmm2, %zmm20
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm26
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm21, %zmm26
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm8, %zmm2, %zmm26
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm8
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-FCP-NEXT:    vprold $16, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm27, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vprold $16, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm21, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm22
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm17, %ymm16
+; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm18, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm10, %zmm27, %zmm25
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm21, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm7
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm18
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm19
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm25, %zmm19
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm30, %ymm31, %ymm1
-; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm13, %zmm30, %zmm21
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm26, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm24[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm13
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm16
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm30, %zmm8
+; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm19
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm24, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm14
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm30
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm17, %ymm17
+; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm19
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm20
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm18, %zmm20
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm27, %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm9, %xmm8
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm25, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm30, %ymm7
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm2[3,3,3,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[2,2,2,2]
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1
-; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
-; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm16, %zmm10
-; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm31, %ymm9
-; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm18
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm30, %zmm9
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm28, %zmm7
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm7, %ymm0, %ymm29
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm0, %ymm26
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm13, %ymm1, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm16, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm8 = mem[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm8, %ymm11, %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
-; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm10, %zmm25, %zmm1
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm28, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 704(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 512(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 704(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 640(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 512(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 832(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
 ; AVX512-FCP-NEXT:    addq $1496, %rsp # imm = 0x5D8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $2168, %rsp # imm = 0x878
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm9
+; AVX512DQ-NEXT:    subq $2808, %rsp # imm = 0xAF8
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm13
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm9, %ymm2
-; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm16
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm7, %ymm2
-; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm17
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
+; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
+; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm18
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
 ; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm22
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm11, %ymm0
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm11, %ymm13
-; AVX512DQ-NEXT:    vpor %ymm12, %ymm13, %ymm10
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm13, %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm15, %ymm14
-; AVX512DQ-NEXT:    vpor %ymm14, %ymm12, %ymm10
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm15, %ymm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpor %ymm10, %ymm9, %ymm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm31
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15]
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-NEXT:    vpermi2q %zmm10, %zmm12, %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm14
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm14, %ymm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm9
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm17, %zmm7
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm10
+; AVX512DQ-NEXT:    vprold $16, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero
-; AVX512DQ-NEXT:    vpternlogq $248, %ymm26, %ymm7, %ymm8
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm12
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm12, %ymm9
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $248, %ymm14, %ymm8, %ymm9
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $248, %ymm11, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $248, %ymm11, %ymm6, %ymm9
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm7, %ymm10, %ymm8
-; AVX512DQ-NEXT:    vprold $16, %ymm12, %ymm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm6, %ymm10, %ymm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm7, %ymm8, %ymm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm7
-; AVX512DQ-NEXT:    vpermd %zmm7, %zmm18, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT:    vpandnq %ymm9, %ymm26, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm6
+; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm7
-; AVX512DQ-NEXT:    vpandnq %ymm7, %ymm16, %ymm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX512DQ-NEXT:    vpandn %ymm7, %ymm12, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm6
+; AVX512DQ-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
 ; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm9
-; AVX512DQ-NEXT:    vpandnq %ymm9, %ymm16, %ymm9
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm12
-; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm24
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm9
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm9, %ymm10
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm17
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm20
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm15
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm10
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm8
+; AVX512DQ-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm12
+; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm25
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm12
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm16
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512DQ-NEXT:    vpandnq %ymm12, %ymm26, %ymm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm10
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm31
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm15, %zmm18, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm2
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm8
-; AVX512DQ-NEXT:    vprold $16, %ymm21, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd 64(%rax), %zmm11, %zmm0
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm26, %zmm0
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vprold $16, %ymm5, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm12, %zmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-NEXT:    vprold $16, %xmm3, %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm23
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm4
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm14, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm8
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm1
+; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm25
-; AVX512DQ-NEXT:    vprold $16, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm17
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm6, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3],xmm1[4],xmm5[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm19
-; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm2
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm0, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm6, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm2
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512DQ-NEXT:    vpbroadcastd 68(%rax), %ymm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm19, %zmm25
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512DQ-NEXT:    vprold $16, %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm0, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm1
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm19, %zmm20
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vprold $16, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15]
-; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm11, %zmm1
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm4, %zmm26, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm12, %zmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm24[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm26
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm15, %zmm26
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm15
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm23 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm7 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm17 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm29, %zmm4
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm0
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm28
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3],xmm4[4],xmm12[5,6],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[3,3,3,3]
-; AVX512DQ-NEXT:    vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm4 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm6 = mem[2,2,3,3]
-; AVX512DQ-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm27 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm20 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm11 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm30[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm22[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm21[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm23
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm7, %zmm24
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm23, %zmm30, %zmm24
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm24, %ymm7, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm14
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm10, %zmm23, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm17, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm10
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm17, %zmm10
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512DQ-NEXT:    vprold $16, %ymm7, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm27
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm21
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm24, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm3, %ymm29
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm4, %zmm1
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm1, %ymm29, %ymm3
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm3, %ymm28, %ymm30
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm29
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm20, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm19, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm25, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm2, %zmm20
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm25
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm27, %zmm31, %zmm3
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm30, %zmm3
-; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm4 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm4 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm6 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm19 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm9 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm11 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm12 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm13 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm14 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm15 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm16 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm11, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm11
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm11
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm12
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm11, %zmm2, %zmm26
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm16, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm23, %zmm28, %zmm18
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm18
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm23
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm23
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512DQ-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm30
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm14, %zmm1, %zmm25
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm1, %zmm20
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm1, %zmm0, %zmm28
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm22
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
+; AVX512DQ-NEXT:    vpermd (%rax), %zmm14, %zmm14
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm22, %zmm1, %zmm14
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm1, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm22
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm7 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm16 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm21 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm24 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm27 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm29, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm22, %zmm3, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm8, %zmm3, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm28, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm8, %zmm1, %zmm31
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm1, %zmm13
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 704(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 768(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 832(%rax)
-; AVX512DQ-NEXT:    addq $2168, %rsp # imm = 0x878
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 704(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 576(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 832(%rax)
+; AVX512DQ-NEXT:    addq $2808, %rsp # imm = 0xAF8
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    subq $1496, %rsp # imm = 0x5D8
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm28
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm4, %ymm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm13
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm13, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm13
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpor %ymm12, %ymm14, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm13
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT:    vporq %ymm5, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7,8,9],ymm3[10],ymm11[11,12],ymm3[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm17, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,u,u,u,u],zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm25, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm12, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm17, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
+; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
+; AVX512DQ-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm22
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm13, %ymm12, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm16, %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm10, %ymm11
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm12, %ymm10, %ymm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm1, %ymm25, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
+; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpandn %ymm11, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm29
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm27
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm1
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm0
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm25
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm19
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm19, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm12, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm25, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm27, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm12
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm26, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm22
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm30
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, %xmm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm6, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm11, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm11, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm23
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm21, %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm29
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm13, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm3, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm4, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm9, %zmm2, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm26
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm21, %zmm26
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm8, %zmm2, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm27, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm21, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm22
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm17, %ymm16
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm18, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm10, %zmm27, %zmm25
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm21, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm7
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm18
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm19
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm25, %zmm19
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm30, %ymm31, %ymm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm13, %zmm30, %zmm21
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm26, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm24[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm16
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm30, %zmm8
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm19
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm24, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm14
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm30
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm17, %ymm17
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm19
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm18, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm27, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm25, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm30, %ymm7
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm2[3,3,3,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm16, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm31, %ymm9
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm18
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm30, %zmm9
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm28, %zmm7
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm7, %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm13, %ymm1, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm16, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm8 = mem[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm8, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm10, %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm28, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 704(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 640(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 832(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
 ; AVX512DQ-FCP-NEXT:    addq $1496, %rsp # imm = 0x5D8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index f56c43eb49df2..06d390f053c7e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -4879,12 +4879,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm7
 ; AVX512-NEXT:    vpternlogq $248, %zmm28, %zmm7, %zmm2
-; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[2,2,3,3,6,6,7,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7]
-; AVX512-NEXT:    vporq %zmm5, %zmm7, %zmm5
-; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm25[2,2,3,3,6,6,7,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[2,2,3,3,6,6,7,7]
-; AVX512-NEXT:    vporq %zmm7, %zmm9, %zmm7
+; AVX512-NEXT:    vporq %zmm24, %zmm26, %zmm5
+; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vporq %zmm25, %zmm27, %zmm7
+; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
 ; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm9, %zmm7
 ; AVX512-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm30
@@ -4918,165 +4916,165 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm15[8,u],zero,xmm15[7],zero,xmm15[9,u,11,u],zero,xmm15[10],zero,xmm15[12,u],zero
-; AVX512-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm18
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
+; AVX512-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
 ; AVX512-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
 ; AVX512-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
 ; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm24
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm23
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm25
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
+; AVX512-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm10
 ; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm27
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
 ; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9,u,11,u],zero,xmm14[10],zero,xmm14[12,u],zero
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
-; AVX512-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm11
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[26],zero,ymm12[28],zero,zero,ymm12[27],zero,ymm12[29],zero,ymm12[31],zero,zero,ymm12[30],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
 ; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm28, %ymm30
-; AVX512-FCP-NEXT:    vpandnq %ymm30, %ymm29, %ymm30
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
+; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
+; AVX512-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
 ; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm10
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm6
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm1
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm21[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm26[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vporq %zmm7, %zmm9, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm9, %zmm7
-; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm8
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm18[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm20[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm7, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq $248, %zmm12, %zmm7, %zmm2
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm25[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm23, %zmm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm16[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm9, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm30, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $248, %zmm11, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm8
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -5224,12 +5222,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm7
 ; AVX512DQ-NEXT:    vpternlogq $248, %zmm28, %zmm7, %zmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT:    vporq %zmm5, %zmm7, %zmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm25[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT:    vporq %zmm7, %zmm9, %zmm7
+; AVX512DQ-NEXT:    vporq %zmm24, %zmm26, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vporq %zmm25, %zmm27, %zmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm9, %zmm7
 ; AVX512DQ-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm30
@@ -5263,165 +5259,165 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm15[8,u],zero,xmm15[7],zero,xmm15[9,u,11,u],zero,xmm15[10],zero,xmm15[12,u],zero
-; AVX512DQ-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
+; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
 ; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
 ; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm24
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm23
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm10
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm27
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9,u,11,u],zero,xmm14[10],zero,xmm14[12,u],zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
-; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[26],zero,ymm12[28],zero,zero,ymm12[27],zero,ymm12[29],zero,ymm12[31],zero,zero,ymm12[30],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
 ; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm28, %ymm30
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm30, %ymm29, %ymm30
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm21[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm26[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm7, %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm18[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm20[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm12, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm25[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm23, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm16[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm9, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm10, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm30, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm11, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index e43aa56c96c28..60af864597f4f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -5405,304 +5405,289 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-LABEL: store_i8_stride6_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $328, %rsp # imm = 0x148
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
+; AVX512-NEXT:    subq $456, %rsp # imm = 0x1C8
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa (%r8), %xmm10
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX512-NEXT:    vmovdqa (%r9), %xmm12
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
+; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm31
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm3
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm26
-; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm7, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
-; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm29
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm30
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
-; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm31
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm27
+; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
+; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm5
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm18
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm20
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
+; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm6, %ymm0
-; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
+; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15],ymm12[24],ymm6[24],ymm12[25],ymm6[25],ymm12[26],ymm6[26],ymm12[27],ymm6[27],ymm12[28],ymm6[28],ymm12[29],ymm6[29],ymm12[30],ymm6[30],ymm12[31],ymm6[31]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
-; AVX512-NEXT:    vpshufb %ymm14, %ymm9, %ymm1
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
+; AVX512-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31]
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm17
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm13, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm14, %ymm0
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
 ; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm4, %ymm10
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
-; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512-NEXT:    vpshufb %ymm7, %ymm15, %ymm11
-; AVX512-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm25
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
-; AVX512-NEXT:    vpshufb %ymm14, %ymm11, %ymm5
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15],ymm11[24],ymm7[24],ymm11[25],ymm7[25],ymm11[26],ymm7[26],ymm11[27],ymm7[27],ymm11[28],ymm7[28],ymm11[29],ymm7[29],ymm11[30],ymm7[30],ymm11[31],ymm7[31]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm24
-; AVX512-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm23
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[16],ymm7[16],ymm11[17],ymm7[17],ymm11[18],ymm7[18],ymm11[19],ymm7[19],ymm11[20],ymm7[20],ymm11[21],ymm7[21],ymm11[22],ymm7[22],ymm11[23],ymm7[23]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm1
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
+; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm28
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpshufb %ymm15, %ymm1, %ymm10
+; AVX512-NEXT:    vpshufb %ymm15, %ymm0, %ymm15
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm15, %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm26
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm6
+; AVX512-NEXT:    vpshufb %ymm12, %ymm15, %ymm9
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm24
+; AVX512-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm23
+; AVX512-NEXT:    vmovdqa (%r9), %ymm4
+; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm25
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm10
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm29
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm8
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm11
-; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm5
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm21
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23]
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm20
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
-; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
+; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
 ; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm3
-; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm4
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
+; AVX512-NEXT:    vpshufb %ymm11, %ymm13, %ymm6
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
+; AVX512-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
+; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm14
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm8
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-NEXT:    vprold $16, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm27
-; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm0
-; AVX512-NEXT:    vpshufb %xmm14, %xmm10, %xmm1
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
+; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm8
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
 ; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm15
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm10
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm17
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm10
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm14
-; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm12
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
-; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,0,0,1]
-; AVX512-NEXT:    vprold $16, %ymm28, %ymm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1]
+; AVX512-NEXT:    vprold $16, %ymm16, %ymm8
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm6[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1]
+; AVX512-NEXT:    vprold $16, %ymm17, %ymm14
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm11, %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm6, %ymm5
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vpternlogq $184, %ymm1, %ymm0, %ymm28
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm1
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm13, %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512-NEXT:    vpternlogd $184, %zmm5, %zmm8, %zmm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1]
-; AVX512-NEXT:    vprold $16, %ymm30, %ymm9
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm13
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm31[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm22[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm9
-; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm6, %zmm9
-; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm5
-; AVX512-NEXT:    vpternlogq $184, %ymm5, %ymm6, %ymm2
-; AVX512-NEXT:    vpternlogq $184, %ymm9, %ymm0, %ymm15
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm5
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm5
-; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm8
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm8
-; AVX512-NEXT:    vextracti64x4 $1, %zmm8, %ymm2
-; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm30
-; AVX512-NEXT:    vpternlogq $184, %ymm8, %ymm6, %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm2
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm8
-; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm11
-; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm2
-; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm14
-; AVX512-NEXT:    vpternlogq $184, %ymm11, %ymm6, %ymm31
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm2
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm11
-; AVX512-NEXT:    vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm10 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm10
-; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm13
-; AVX512-NEXT:    vextracti64x4 $1, %zmm10, %ymm2
-; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm7
-; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm2
-; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm3
-; AVX512-NEXT:    vpternlogq $184, %ymm10, %ymm9, %ymm12
-; AVX512-NEXT:    vpternlogq $184, %ymm13, %ymm9, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm8, %zmm5, %zmm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm1
+; AVX512-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm12, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm6, %zmm4, %zmm2
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm3
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm6
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogq $184, %zmm6, %zmm9, %zmm0
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm6
+; AVX512-NEXT:    vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm5 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm5
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm7
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm4, %zmm0
+; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $184, %zmm7, %zmm4, %zmm5
+; AVX512-NEXT:    vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm4
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogd $184, %zmm5, %zmm7, %zmm0
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512-NEXT:    addq $328, %rsp # imm = 0x148
+; AVX512-NEXT:    addq $456, %rsp # imm = 0x1C8
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX512-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
@@ -5712,561 +5697,526 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm22
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm5
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15],ymm11[24],ymm15[24],ymm11[25],ymm15[25],ymm11[26],ymm15[26],ymm11[27],ymm15[27],ymm11[28],ymm15[28],ymm11[29],ymm15[29],ymm11[30],ymm15[30],ymm11[31],ymm15[31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm15
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm9
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm30
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm10, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm14
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm29
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm31
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[16],ymm15[16],ymm11[17],ymm15[17],ymm11[18],ymm15[18],ymm11[19],ymm15[19],ymm11[20],ymm15[20],ymm11[21],ymm15[21],ymm11[22],ymm15[22],ymm11[23],ymm15[23]
 ; AVX512-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
-; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm11
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm16, %zmm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm4
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm8
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[16],ymm9[16],ymm2[17],ymm9[17],ymm2[18],ymm9[18],ymm2[19],ymm9[19],ymm2[20],ymm9[20],ymm2[21],ymm9[21],ymm2[22],ymm9[22],ymm2[23],ymm9[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,0,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm14[0,0,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm14, %ymm2
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm11, %ymm11
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm13
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[16],mem[16],ymm5[17],mem[17],ymm5[18],mem[18],ymm5[19],mem[19],ymm5[20],mem[20],ymm5[21],mem[21],ymm5[22],mem[22],ymm5[23],mem[23]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm6
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vprold $16, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm10, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm2, %zmm5
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm3 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm3
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm22
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm6
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm6, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm24
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm24[0,1,2,3],zmm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm7 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm6, %ymm2, %ymm31
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm4
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm29[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm9
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm5
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm9, %ymm16, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm4
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm25[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm29[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm9, %zmm2
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm21[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm28[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
-; AVX512-FCP-NEXT:    addq $392, %rsp # imm = 0x188
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
+; AVX512-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $328, %rsp # imm = 0x148
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
+; AVX512DQ-NEXT:    subq $456, %rsp # imm = 0x1C8
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm10
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm31
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm7, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm30
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm31
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm27
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm5
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm18
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm20
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15],ymm12[24],ymm6[24],ymm12[25],ymm6[25],ymm12[26],ymm6[26],ymm12[27],ymm6[27],ymm12[28],ymm6[28],ymm12[29],ymm6[29],ymm12[30],ymm6[30],ymm12[31],ymm6[31]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm9, %ymm1
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm17
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm13, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm14, %ymm0
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
 ; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm10
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm15, %ymm11
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm25
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm11, %ymm5
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15],ymm11[24],ymm7[24],ymm11[25],ymm7[25],ymm11[26],ymm7[26],ymm11[27],ymm7[27],ymm11[28],ymm7[28],ymm11[29],ymm7[29],ymm11[30],ymm7[30],ymm11[31],ymm7[31]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm24
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm23
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[16],ymm7[16],ymm11[17],ymm7[17],ymm11[18],ymm7[18],ymm11[19],ymm7[19],ymm11[20],ymm7[20],ymm11[21],ymm7[21],ymm11[22],ymm7[22],ymm11[23],ymm7[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm1
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm28
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm1, %ymm10
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm0, %ymm15
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm15, %ymm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm26
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm15, %ymm9
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm24
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm23
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm25
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm29
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm8
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm8, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm20
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm0, %xmm4
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm13, %ymm6
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm8
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-NEXT:    vprold $16, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm27
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm8
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
 ; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm15
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm17
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm12
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,0,0,1]
-; AVX512DQ-NEXT:    vprold $16, %ymm28, %ymm1
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm8
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1]
+; AVX512DQ-NEXT:    vprold $16, %ymm16, %ymm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm6[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1]
+; AVX512DQ-NEXT:    vprold $16, %ymm17, %ymm14
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm11, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm6, %ymm5
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm1, %ymm0, %ymm28
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm13, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm5, %zmm8, %zmm1
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1]
-; AVX512DQ-NEXT:    vprold $16, %ymm30, %ymm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm31[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm22[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm6, %zmm9
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm9, %ymm5
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm5, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm9, %ymm0, %ymm15
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm5
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm8
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm8, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm30
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm8, %ymm6, %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm8
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm11
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm11, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm14
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm11, %ymm6, %ymm31
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm11
-; AVX512DQ-NEXT:    vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm10 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm10
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm13
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm10, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm10, %ymm9, %ymm12
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm13, %ymm9, %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm8, %zmm5, %zmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm12, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm2
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm6, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm3
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm6
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm6, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm5 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm7
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm7, %zmm4, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm5, %zmm7, %zmm0
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-NEXT:    addq $328, %rsp # imm = 0x148
+; AVX512DQ-NEXT:    addq $456, %rsp # imm = 0x1C8
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX512DQ-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
@@ -6276,258 +6226,238 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm5
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15],ymm11[24],ymm15[24],ymm11[25],ymm15[25],ymm11[26],ymm15[26],ymm11[27],ymm15[27],ymm11[28],ymm15[28],ymm11[29],ymm15[29],ymm11[30],ymm15[30],ymm11[31],ymm15[31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm10, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm29
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm31
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[16],ymm15[16],ymm11[17],ymm15[17],ymm11[18],ymm15[18],ymm11[19],ymm15[19],ymm11[20],ymm15[20],ymm11[21],ymm15[21],ymm11[22],ymm15[22],ymm11[23],ymm15[23]
 ; AVX512DQ-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm11
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm8
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[16],ymm9[16],ymm2[17],ymm9[17],ymm2[18],ymm9[18],ymm2[19],ymm9[19],ymm2[20],ymm9[20],ymm2[21],ymm9[21],ymm2[22],ymm9[22],ymm2[23],ymm9[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm14[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm14, %ymm2
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[16],mem[16],ymm5[17],mem[17],ymm5[18],mem[18],ymm5[19],mem[19],ymm5[20],mem[20],ymm5[21],mem[21],ymm5[22],mem[22],ymm5[23],mem[23]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm10, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm2, %zmm5
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm3 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm22
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm6
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm24
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm24[0,1,2,3],zmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm7 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm6, %ymm2, %ymm31
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm29[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm9
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm9, %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm4 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm25[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm29[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm21[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm28[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
-; AVX512DQ-FCP-NEXT:    addq $392, %rsp # imm = 0x188
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
+; AVX512DQ-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 8091afbbfd70c..416fbe9aa340c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -4254,7 +4254,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
 ; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm14
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u,u,u,u,u,14],zero,ymm14[u,u,u,u,u,15],zero,ymm14[u,u,u,u,u,16],zero,ymm14[u,u,u,u,u,17],zero,ymm14[u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512-NEXT:    vpor %ymm0, %ymm9, %ymm0
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
@@ -4357,7 +4357,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
 ; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
@@ -4533,7 +4533,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
 ; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm14
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u,u,u,u,u,14],zero,ymm14[u,u,u,u,u,15],zero,ymm14[u,u,u,u,u,16],zero,ymm14[u,u,u,u,u,17],zero,ymm14[u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm9, %ymm0
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
@@ -4636,7 +4636,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
@@ -8544,39 +8544,35 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-LABEL: store_i8_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $1720, %rsp # imm = 0x6B8
+; AVX512-NEXT:    subq $1384, %rsp # imm = 0x568
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm7, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm14
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm15
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %ymm15
+; AVX512-NEXT:    vmovdqa (%r8), %ymm14
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm24
-; AVX512-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %ymm8
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
 ; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
@@ -8589,7 +8585,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8604,7 +8600,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8612,7 +8608,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm25
 ; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
@@ -8631,23 +8627,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
 ; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm13
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm22
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm22
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm12
+; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm19
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm16
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm15
-; AVX512-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
+; AVX512-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
@@ -8656,379 +8652,348 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
 ; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm15
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
 ; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512-NEXT:    vmovdqa 32(%rax), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
 ; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm20
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm14
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm18
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512-NEXT:    vporq %xmm7, %xmm8, %xmm31
 ; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
 ; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
 ; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm10, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
-; AVX512-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm5
+; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
+; AVX512-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
 ; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm4
-; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm4
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm3
-; AVX512-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm28
+; AVX512-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
 ; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm6
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm18
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
 ; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm8
-; AVX512-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm9
-; AVX512-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25]
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm3
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm10, %ymm15, %ymm1
+; AVX512-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-NEXT:    vmovdqa (%r8), %xmm10
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm3
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm15
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
-; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm21
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm22
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa (%rax), %xmm11
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,5,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
+; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rax), %ymm8
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
+; AVX512-NEXT:    vmovdqa (%rax), %xmm5
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm4, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[2,3,2,3],zmm2[0,1,0,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[2,3,2,3],zmm3[0,1,0,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[2,3,2,3],zmm13[0,1,0,1]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512-NEXT:    vmovdqa %xmm6, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm7
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm3
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm7
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT:    vpshufb %ymm9, %ymm10, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-NEXT:    vpternlogq $226, %zmm12, %zmm10, %zmm17
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[2,3,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm28 = mem[2,3,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm3, %zmm28
-; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm10, %zmm28
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,2,3,2]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm10
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm2
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 16-byte Folded Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
-; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,0,0,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm22
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm6 = mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm27 = mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
+; AVX512-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm27
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vporq %zmm9, %zmm11, %zmm9
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vporq %zmm11, %zmm12, %zmm11
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vporq %zmm9, %zmm12, %zmm9
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-NEXT:    vpternlogq $184, %zmm11, %zmm12, %zmm9
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm13[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm18[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm20[2,3,2,3]
-; AVX512-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = mem[1,1,0,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm19[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm0
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm21 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vporq %zmm13, %zmm21, %zmm13
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm18 = zmm30[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vporq %zmm20, %zmm18, %zmm18
-; AVX512-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm18
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm16
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm1
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-NEXT:    vpternlogq $184, %zmm13, %zmm21, %zmm11
+; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpternlogq $226, %zmm8, %zmm21, %zmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
+; AVX512-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
+; AVX512-NEXT:    vporq %zmm26, %zmm23, %zmm17
+; AVX512-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogq $226, %zmm18, %zmm21, %zmm17
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm19
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm11
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm0
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm14
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm10
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm5[0,0,1,0,4,4,5,4]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm15
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm15
+; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm9
+; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
+; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm16
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm15, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm10, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 64(%rax)
-; AVX512-NEXT:    addq $1720, %rsp # imm = 0x6B8
+; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rax)
+; AVX512-NEXT:    addq $1384, %rsp # imm = 0x568
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -9038,10 +9003,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero,zero,ymm5[18]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm3
@@ -9051,366 +9016,351 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
 ; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm30
-; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm26
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm21
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm13
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm9
-; AVX512-FCP-NEXT:    vporq %xmm12, %xmm13, %xmm31
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm12
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm12
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm12
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm13
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX512-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
+; AVX512-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
 ; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm15
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm15, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vpor %ymm13, %ymm15, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm14
-; AVX512-FCP-NEXT:    vpor %ymm15, %ymm14, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29]
-; AVX512-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
+; AVX512-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero
-; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
+; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm11, %xmm17
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm12[13],zero,zero,zero,zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm20
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm5
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm26
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm31[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm30 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vporq %zmm31, %zmm30, %zmm30
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm30
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm30, %zmm31, %zmm14
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm18[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm13
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm7 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vporq %zmm7, %zmm29, %zmm7
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm27 = zmm27[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vporq %zmm28, %zmm27, %zmm27
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm27
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm23[2,3,2,3],zmm7[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm10[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,1,0,1,2,0,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm17, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm17
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [4,5,4,5,5,7,4,5]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm23, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm23, %ymm15
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm11, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm7
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm20, %zmm8, %zmm30
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm8, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm18 = mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm3, %zmm18
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm25[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm18
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm14 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm20
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm5
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm6 = zmm3[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm14, %zmm8, %zmm10
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm8, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm18
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm6 = zmm3[0,1,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm11
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm8
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
 ; AVX512-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $1720, %rsp # imm = 0x6B8
+; AVX512DQ-NEXT:    subq $1384, %rsp # imm = 0x568
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm7, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm14
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm15
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm14
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm24
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512DQ-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm8
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
 ; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
@@ -9423,7 +9373,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9438,7 +9388,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9446,7 +9396,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm25
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
@@ -9465,23 +9415,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm17
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm13
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm12
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm19
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm16
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm16
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
@@ -9490,379 +9440,348 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
 ; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm15
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
 ; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm20
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm14
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm18
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512DQ-NEXT:    vporq %xmm7, %xmm8, %xmm31
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
 ; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm10, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
 ; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm4
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm25
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm18
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm9
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm15, %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm10
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm3
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm15
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm21
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm22
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm11
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,5,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm5
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[2,3,2,3],zmm2[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[2,3,2,3],zmm3[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[2,3,2,3],zmm13[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm7
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm3
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm7
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm10, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm12, %zmm10, %zmm17
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm28 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm3, %zmm28
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm10, %zmm28
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,2,3,2]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm10
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm22
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm6 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm27 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm27
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vporq %zmm9, %zmm11, %zmm9
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vporq %zmm11, %zmm12, %zmm11
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vporq %zmm9, %zmm12, %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm11, %zmm12, %zmm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm13[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm18[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm20[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = mem[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm19[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm0
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm21 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vporq %zmm13, %zmm21, %zmm13
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm18 = zmm30[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vporq %zmm20, %zmm18, %zmm18
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm18
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm16
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm13, %zmm21, %zmm11
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm8, %zmm21, %zmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vporq %zmm26, %zmm23, %zmm17
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm18, %zmm21, %zmm17
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm19
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm11
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm10
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm5[0,0,1,0,4,4,5,4]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm15
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm15
+; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm16
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 64(%rax)
-; AVX512DQ-NEXT:    addq $1720, %rsp # imm = 0x6B8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rax)
+; AVX512DQ-NEXT:    addq $1384, %rsp # imm = 0x568
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -9872,10 +9791,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero,zero,ymm5[18]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
@@ -9885,327 +9804,316 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm30
-; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm21
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm9
-; AVX512DQ-FCP-NEXT:    vporq %xmm12, %xmm13, %xmm31
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm12
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm13
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
+; AVX512DQ-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm15
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm15, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm15, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT:    vpor %ymm15, %ymm14, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29]
-; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
+; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero
-; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
+; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm11, %xmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm12[13],zero,zero,zero,zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm20
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm26
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm31[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm30 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm31, %zmm30, %zmm30
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm30
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm30, %zmm31, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm18[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm13
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm7 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm7, %zmm29, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm27 = zmm27[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm28, %zmm27, %zmm27
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm23[2,3,2,3],zmm7[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm10[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,1,0,1,2,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm17, %ymm28
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm17
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [4,5,4,5,5,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm23, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm23, %ymm15
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm11, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm20, %zmm8, %zmm30
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm18 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm3, %zmm18
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm25[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm18
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm20
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = zmm3[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm14, %zmm8, %zmm10
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm8, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm18
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = zmm3[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm11
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm8
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
 ; AVX512DQ-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 1771b53a0f835..f5a6b9f59aacf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2104,16 +2104,14 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k3}
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2159,16 +2157,14 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermq %zmm5, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vpermq %zmm4, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm4, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm4, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2213,16 +2209,14 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k3}
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm4, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2268,16 +2262,14 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm5, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm5, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm4, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm4, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm4, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -8620,230 +8612,336 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 16(%r10), %xmm13
-; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm22
-; AVX512BW-NEXT:    vmovdqa64 48(%r10), %xmm19
-; AVX512BW-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm16
-; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm23
-; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm5
-; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm17
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm25
-; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm21
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm18
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm26
-; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm24
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm4
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23]
+; AVX512BW-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%r10), %xmm12
+; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512BW-NEXT:    vmovdqa 48(%r10), %xmm15
+; AVX512BW-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512BW-NEXT:    vmovdqa 16(%rax), %xmm13
+; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm18
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
+; AVX512BW-NEXT:    vpermw %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm19
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm21
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm6
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
+; AVX512BW-NEXT:    vpermw %ymm7, %ymm6, %ymm7
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm14
 ; AVX512BW-NEXT:    movl $-2004318072, %eax # imm = 0x88888888
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm28
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm30
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm27
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512BW-NEXT:    vmovdqa 48(%rcx), %xmm1
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm12, %zmm14
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm10, %ymm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm9
+; AVX512BW-NEXT:    vmovdqa64 48(%rcx), %xmm28
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm29
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm11, %ymm11
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
+; AVX512BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm20
 ; AVX512BW-NEXT:    movl $572662306, %eax # imm = 0x22222222
 ; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vpermw %zmm14, %zmm12, %zmm3 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm15
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm14
-; AVX512BW-NEXT:    vpermw %zmm14, %zmm10, %zmm14
-; AVX512BW-NEXT:    vpermw %zmm15, %zmm9, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm29, %ymm27, %ymm27
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
+; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm1 {%k2}
+; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm1 {%k3}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm14, %ymm3, %ymm14
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm22, %ymm22
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm23
+; AVX512BW-NEXT:    vmovdqu16 %zmm14, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm22, %ymm25
+; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm22
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm25
+; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm14 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
+; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm14 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm18, %ymm18
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm23
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm15, %ymm3, %ymm15
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm15, %zmm15
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm19, %ymm21
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm19
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm18, %ymm6, %ymm18
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm26
+; AVX512BW-NEXT:    vmovdqu16 %zmm15, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm29, %ymm15
-; AVX512BW-NEXT:    vmovdqa 48(%rdx), %xmm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm27, %zmm15, %zmm15
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm27, %zmm27, %zmm27
-; AVX512BW-NEXT:    vpermw %zmm27, %zmm12, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm31
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm27
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm19, %zmm21
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm20, %zmm19
-; AVX512BW-NEXT:    vpermw %zmm19, %zmm10, %zmm19
-; AVX512BW-NEXT:    vpermw %zmm21, %zmm9, %zmm19 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm24, %ymm15
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm15, %zmm15
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm21
+; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm15 {%k2}
+; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k3}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm18, %ymm3, %ymm18
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm21, %ymm6, %ymm21
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm27
+; AVX512BW-NEXT:    vmovdqu16 %zmm18, %zmm27 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm24, %ymm18
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm24, %ymm28
+; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm24
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm28
+; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm18 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm21
+; AVX512BW-NEXT:    vmovdqa32 %zmm27, %zmm18 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm17, %ymm17
+; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm27
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm16, %ymm3, %ymm16
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm16, %zmm16
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm17, %ymm25
+; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm17
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm22, %ymm6, %ymm22
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm22, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm22
+; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm20, %zmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm20 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm21
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm21 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm24, %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm24 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm28
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm30
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm25
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm22
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm23, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm23
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm23 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm26
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm23, %ymm16
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm16
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm19, %ymm11, %ymm19
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
+; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm16 {%k2}
+; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k3}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm19, %ymm3, %ymm19
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
+; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm20 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm25, %ymm19
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm19, %zmm19
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm26, %ymm25, %ymm25
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm23, %ymm11, %ymm23
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm23, %zmm23
+; AVX512BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
+; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm13, %ymm13
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm12, %ymm3, %ymm12
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vpermw %ymm13, %ymm6, %ymm13
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm13, %zmm13
+; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm13 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm12, %ymm21, %ymm12
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm12, %zmm12
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vpermw %ymm17, %ymm11, %ymm17
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm17
+; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm12 {%k2}
+; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm12 {%k3}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpermw %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpermw %ymm2, %ymm6, %ymm2
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdx), %xmm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm27 {%k2}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermw %zmm13, %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpermw %zmm5, %zmm10, %zmm5
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm13, %zmm7
-; AVX512BW-NEXT:    vpermw %zmm7, %zmm9, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm7, %ymm7
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm9, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpermw %zmm2, %zmm12, %zmm1 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm6, %zmm6
-; AVX512BW-NEXT:    vpermw %zmm6, %zmm12, %zmm2 {%k2}
-; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm27 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpermw %ymm3, %ymm11, %ymm3
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm0 {%k2}
+; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -9024,230 +9122,336 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r10), %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm22
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r10), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm26
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm24
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm4
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23]
+; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%r10), %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%r10), %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rax), %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm18
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
+; AVX512DQ-BW-NEXT:    vpermw %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm19
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm21
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
+; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm6, %ymm7
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm14
 ; AVX512DQ-BW-NEXT:    movl $-2004318072, %eax # imm = 0x88888888
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm28
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm30
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm27
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%rcx), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm12, %zmm14
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm10, %ymm1
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rcx), %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm29
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm11, %ymm11
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm20
 ; AVX512DQ-BW-NEXT:    movl $572662306, %eax # imm = 0x22222222
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-NEXT:    vpermw %zmm14, %zmm12, %zmm3 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm15
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm14
-; AVX512DQ-BW-NEXT:    vpermw %zmm14, %zmm10, %zmm14
-; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm9, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm29, %ymm27, %ymm27
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm1 {%k2}
+; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm1 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm14, %ymm3, %ymm14
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm22, %ymm22
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm14, %zmm23 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm22, %ymm25
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm22
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm20
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm25
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm14 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm14 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm18, %ymm18
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm23
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm15, %ymm3, %ymm15
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm15, %zmm15
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm19, %ymm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm19
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm18, %ymm6, %ymm18
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm26
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm15, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm29, %ymm15
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%rdx), %xmm0
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm27, %zmm15, %zmm15
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm27, %zmm27, %zmm27
-; AVX512DQ-BW-NEXT:    vpermw %zmm27, %zmm12, %zmm15 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm31
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm27
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm19, %zmm21
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm20, %zmm19
-; AVX512DQ-BW-NEXT:    vpermw %zmm19, %zmm10, %zmm19
-; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm9, %zmm19 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm24, %ymm15
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm15, %zmm15
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm21
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm15 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm18, %ymm3, %ymm18
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm6, %ymm21
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm27
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm18, %zmm27 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm24, %ymm18
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm24, %ymm28
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm24
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm18 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm21
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm27, %zmm18 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm17, %ymm17
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm27
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm16, %ymm3, %ymm16
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm16, %zmm16
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm17, %ymm25
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm17
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm22, %ymm6, %ymm22
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm22, %zmm25
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm22
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm25 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm20, %zmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm20 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm21
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm21 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm24, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm24 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm28
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm30
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm25
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm22
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm22 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm23, %ymm0
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm23
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm23 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm26
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm26 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm23, %ymm16
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm16
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm19, %ymm11, %ymm19
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm16 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm19, %ymm3, %ymm19
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm20 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm25, %ymm19
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm19, %zmm19
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm26, %ymm25, %ymm25
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm23, %ymm11, %ymm23
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm23, %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm13, %ymm13
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm12, %ymm3, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm13, %ymm6, %ymm13
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm13, %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm13 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm12, %ymm21, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm12, %zmm12
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vpermw %ymm17, %ymm11, %ymm17
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm12 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm12 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT:    vpermw %ymm0, %ymm3, %ymm0
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm6, %ymm2
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdx), %xmm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm27 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm10, %zmm0
-; AVX512DQ-BW-NEXT:    vpermw %zmm13, %zmm9, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX512DQ-BW-NEXT:    # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm10, %zmm5
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm13, %zmm7
-; AVX512DQ-BW-NEXT:    vpermw %zmm7, %zmm9, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm7, %ymm7
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm9, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vpermw %zmm2, %zmm12, %zmm1 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-BW-NEXT:    # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT:    vpermw %zmm6, %zmm12, %zmm2 {%k2}
-; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm15 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm24 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm27 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX512DQ-BW-NEXT:    vpermw %ymm3, %ymm11, %ymm3
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm0 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
index 709be6534d777..60800673ed2dd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -52,9 +52,9 @@ entry:
   %tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16>
   %tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 >
   %tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8>
-  %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
-  %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
-  tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, ptr null)
+  %tmp556 = bitcast <8 x i8> %tmp555 to <1 x i64>
+  %tmp557 = bitcast <8 x i8> zeroinitializer to <1 x i64>
+  tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp557, <1 x i64> %tmp556, ptr null)
   ret void
 }
 
@@ -115,19 +115,19 @@ define <4 x float> @pr35869() nounwind {
 ; X64-NEXT:    punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
 ; X64-NEXT:    cvtpi2ps %mm0, %xmm0
 ; X64-NEXT:    retq
-  %1 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to x86_mmx), x86_mmx bitcast (<8 x i8> zeroinitializer to x86_mmx))
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx bitcast (<4 x i16> zeroinitializer to x86_mmx), x86_mmx %1)
-  %3 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2)
-  %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, x86_mmx %3)
+  %1 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to <1 x i64>), <1 x i64> bitcast (<8 x i8> zeroinitializer to <1 x i64>))
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> bitcast (<4 x i16> zeroinitializer to <1 x i64>), <1 x i64> %1)
+  %3 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %1, <1 x i64> %2)
+  %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, <1 x i64> %3)
   %5 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  %6 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2)
-  %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, x86_mmx %6)
+  %6 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %1, <1 x i64> %2)
+  %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, <1 x i64> %6)
   ret <4 x float> %7
 }
 
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx)
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>)
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 18cb5e1b86ec3..541dfb54e96d2 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -629,71 +629,45 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
-; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
-; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
-; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
-; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
-; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
-; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
-; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
-; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
-; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
-; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
-; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
-; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
-; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
-; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
-; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
-; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
-; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
-; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
-; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
-; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
-; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
-; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
-; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
-; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
-; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
-; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
-; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
-; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
-; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
-; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
-; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
-; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
+; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
+; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
+; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
+; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
+; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
@@ -1008,71 +982,45 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
-; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
-; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
-; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
-; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
-; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
-; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
-; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
-; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
-; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
-; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
-; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
-; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
-; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
-; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
-; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
-; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
-; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
-; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
-; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
-; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
-; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
-; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
-; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
-; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
-; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
-; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
-; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
-; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
-; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
-; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
-; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
-; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
-; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
+; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
+; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
+; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
+; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
+; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/x86-64-psub.ll b/llvm/test/CodeGen/X86/x86-64-psub.ll
index 9817d798fd4bf..4c11464075ec9 100644
--- a/llvm/test/CodeGen/X86/x86-64-psub.ll
+++ b/llvm/test/CodeGen/X86/x86-64-psub.ll
@@ -32,11 +32,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
-  %3 = bitcast <8 x i8> %2 to x86_mmx
+  %3 = bitcast <8 x i8> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
-  %5 = bitcast <8 x i8> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <8 x i8>
+  %5 = bitcast <8 x i8> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <8 x i8>
   %8 = bitcast <8 x i8> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -66,11 +66,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to x86_mmx
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <4 x i16>
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <4 x i16>
   %8 = bitcast <4 x i16> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -100,11 +100,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <2 x i32>
-  %3 = bitcast <2 x i32> %2 to x86_mmx
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <2 x i32>
-  %5 = bitcast <2 x i32> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <2 x i32>
+  %5 = bitcast <2 x i32> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <2 x i32>
   %8 = bitcast <2 x i32> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -134,11 +134,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
-  %3 = bitcast <8 x i8> %2 to x86_mmx
+  %3 = bitcast <8 x i8> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
-  %5 = bitcast <8 x i8> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <8 x i8>
+  %5 = bitcast <8 x i8> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <8 x i8>
   %8 = bitcast <8 x i8> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -168,11 +168,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to x86_mmx
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <4 x i16>
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <4 x i16>
   %8 = bitcast <4 x i16> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -202,11 +202,11 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
-  %3 = bitcast <8 x i8> %2 to x86_mmx
+  %3 = bitcast <8 x i8> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
-  %5 = bitcast <8 x i8> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <8 x i8>
+  %5 = bitcast <8 x i8> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <8 x i8>
   %8 = bitcast <8 x i8> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
@@ -236,26 +236,26 @@ entry:
   %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
   %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
   %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to x86_mmx
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
-  %5 = bitcast <4 x i16> %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %3, x86_mmx %5) nounwind
-  %7 = bitcast x86_mmx %6 to <4 x i16>
+  %5 = bitcast <4 x i16> %4 to <1 x i64>
+  %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %3, <1 x i64> %5) nounwind
+  %7 = bitcast <1 x i64> %6 to <4 x i16>
   %8 = bitcast <4 x i16> %7 to <1 x i64>
   %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
   ret i64 %retval.0.extract.i15
 }
 
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll
index 55119114bd602..c2bcd4dcfeeee 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll
@@ -19,10 +19,7 @@
 ;;   return a;
 ;; }
 
-;; FIXME: Variable 'b' gets an incorrect location (value and expression) - see
-;; llvm.org/PR61981. This check just ensures that no fragment info is added to
-;; the dbg.value.
-; CHECK: #dbg_value(i32 %.sroa.0.0.extract.trunc, ![[B:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4),
+; CHECK: #dbg_value(i32 %.sroa.2.0.extract.trunc, ![[B:[0-9]+]], !DIExpression(),
 ; CHECK: #dbg_value(i32 %.sroa.0.0.extract.trunc, ![[A:[0-9]+]], !DIExpression(),
 ; CHECK: ![[A]] = !DILocalVariable(name: "a",
 ; CHECK: ![[B]] = !DILocalVariable(name: "b",
diff --git a/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll
new file mode 100644
index 0000000000000..3789084b6b712
--- /dev/null
+++ b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll
@@ -0,0 +1,275 @@
+; RUN: opt %s -passes=sroa -S | FileCheck %s --check-prefixes=COMMON,OLD
+; RUN: opt %s -passes=declare-to-assign,sroa -S | FileCheck %s --check-prefixes=COMMON,NEW
+
+;; C++17 source:
+;; struct two { int a, b; } gt;
+;; int fun1() {
+;;   auto [x, y] = gt;
+;;   return x + y;
+;; }
+;;
+;; struct four { two a, b; } gf;
+;; int fun2() {
+;;   auto [x, y] = gf;
+;;   return x.a + y.b;
+;; }
+;; Plus some hand-written IR.
+;;
+;; Check that SROA understands how to split dbg.declares and dbg.assigns with
+;; offsets into their storge (e.g., the second variable in a structured binding
+;; is stored at an offset into the shared alloca).
+;;
+;; Additional notes:
+;; We expect the same dbg.value intrinsics to come out of SROA whether assignment
+;; tracking is enabled or not. However, the order of the debug intrinsics may
+;; differ, and assignment tracking replaces some dbg.declares with dbg.assigns.
+;;
+;; Structured bindings produce an artificial variable that covers the entire
+;; alloca. Although they add clutter to the test, they've been preserved in
+;; order to increase coverage. These use the placehold name 'A' in comments and
+;; checks.
+
+%struct.two = type { i32, i32 }
+%struct.four = type { %struct.two, %struct.two }
+
+@gt = dso_local global %struct.two zeroinitializer, align 4, !dbg !0
+@gf = dso_local global %struct.four zeroinitializer, align 4, !dbg !5
+
+
+; COMMON-LABEL: @_Z4fun1v
+; COMMON-NEXT: entry
+;; 32 bit variable x (!27): value a_reg.
+;;
+;; 32 bit variable y (!28): value b_reg.
+;;
+;; 64 bit variable A (!29) bits [0,  32): value a_reg.
+;; 64 bit variable A (!29) bits [32, 64): value b_reg.
+
+; OLD-NEXT: %[[a_reg:.*]] = load i32, ptr @gt
+; OLD-NEXT: #dbg_value(i32 %[[a_reg]], ![[x0:[0-9]+]], !DIExpression(),
+; OLD-NEXT: #dbg_value(i32 %[[a_reg]], ![[A0:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+; OLD-NEXT: %[[b_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gt, i64 4)
+; OLD-NEXT: #dbg_value(i32 %[[b_reg]], ![[y0:[0-9]+]], !DIExpression(),
+; OLD-NEXT: #dbg_value(i32 %[[b_reg]], ![[A0]], !DIExpression(DW_OP_LLVM_fragment, 32, 32),
+
+; NEW-NEXT: %[[a_reg:.*]] = load i32, ptr @gt
+; NEW-NEXT: %[[b_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gt, i64 4)
+; NEW-NEXT: #dbg_value(i32 %[[b_reg]], ![[y0:[0-9]+]], !DIExpression(),
+; NEW-NEXT: #dbg_value(i32 %[[a_reg]], ![[A0:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+; NEW-NEXT: #dbg_value(i32 %[[b_reg]], ![[A0]], !DIExpression(DW_OP_LLVM_fragment, 32, 32),
+; NEW-NEXT: #dbg_value(i32 %[[a_reg]], ![[x0:[0-9]+]], !DIExpression(),
+define dso_local noundef i32 @_Z4fun1v() #0 !dbg !23 {
+entry:
+  %0 = alloca %struct.two, align 4
+    #dbg_declare(ptr %0, !27, !DIExpression(), !31)
+    #dbg_declare(ptr %0, !28, !DIExpression(DW_OP_plus_uconst, 4), !31)
+    #dbg_declare(ptr %0, !29, !DIExpression(), !31)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gt, i64 8, i1 false), !dbg !31
+  %a = getelementptr inbounds %struct.two, ptr %0, i32 0, i32 0, !dbg !31
+  %1 = load i32, ptr %a, align 4, !dbg !31
+  %b = getelementptr inbounds %struct.two, ptr %0, i32 0, i32 1, !dbg !31
+  %2 = load i32, ptr %b, align 4, !dbg !31
+  %add = add nsw i32 %1, %2, !dbg !31
+  ret i32 %add, !dbg !31
+}
+
+; COMMON-LABEL: _Z4fun2v()
+; COMMON-NEXT: entry:
+;; 64 bit variable x (!50) bits [0,  32): value aa_reg.
+;; 64 bit variable x (!50) bits [32, 64): deref ab_ba_addr
+;;
+;; 64 bit variable y (!51) bits [0,  32): deref ab_ba_addr + 32
+;; 64 bit variable y (!51) bits [32, 64): value bb_reg.
+;;
+;; 128 bit variable A (!52) bits [0,   32): value aa_reg
+;; 128 bit variable A (!52) bits [32,  64): deref ab_ba_addr
+;; 128 bit variable A (!52) bits [96, 128): value bb_reg
+;;
+;; NOTE: This 8 byte alloca contains x.b (4 bytes) and y.a (4 bytes).
+; COMMON-NEXT: %[[ab_ba_addr:.*]] = alloca [8 x i8], align 4
+; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[A1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 64),
+; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[y1:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_LLVM_fragment, 0, 32),
+; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[x1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32),
+; OLD-NEXT: %[[aa_reg:.*]] = load i32, ptr @gf, align 4
+; OLD-NEXT: #dbg_value(i32 %[[aa_reg]], ![[x1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+; OLD-NEXT: #dbg_value(i32 %[[aa_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+; OLD-NEXT: call void @llvm.memcpy{{.*}}(ptr align 4 %[[ab_ba_addr]], ptr align 4 getelementptr inbounds (i8, ptr @gf, i64 4), i64 8, i1 false)
+; OLD-NEXT: %[[bb_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 12), align 4
+; OLD-NEXT: #dbg_value(i32 %[[bb_reg]], ![[y1]], !DIExpression(DW_OP_LLVM_fragment, 32, 32),
+; OLD-NEXT: #dbg_value(i32 %[[bb_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 96, 32),
+
+; NEW-NEXT: #dbg_assign(i1 undef, ![[x1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[#]], ptr %[[ab_ba_addr]], !DIExpression(),
+; NEW-NEXT: #dbg_assign(i1 undef, ![[A1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 64), ![[#]], ptr %[[ab_ba_addr]], !DIExpression(),
+; NEW-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[y1:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_LLVM_fragment, 0, 32),
+; NEW-NEXT: %[[aa_reg:.*]] = load i32, ptr @gf, align 4
+; NEW-NEXT: llvm.memcpy{{.*}}(ptr align 4 %[[ab_ba_addr]], ptr align 4 getelementptr inbounds (i8, ptr @gf, i64 4), i64 8, i1 false){{.*}}, !DIAssignID ![[ID:[0-9]+]]
+; NEW-NEXT: %[[bb_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 12), align 4
+; NEW-NEXT: #dbg_value(i32 %[[bb_reg]], ![[y1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32),
+; NEW-NEXT: #dbg_value(i32 %[[aa_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+; NEW-NEXT: #dbg_assign(i1 undef, ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 32, 64), ![[ID]], ptr %[[ab_ba_addr]], !DIExpression(),
+; NEW-NEXT: #dbg_value(i32 %[[bb_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 96, 32),
+; NEW-NEXT: #dbg_value(i32 %[[aa_reg]], ![[x1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32),
+define dso_local noundef i32 @_Z4fun2v() #0 !dbg !48 {
+entry:
+  %0 = alloca %struct.four, align 4
+    #dbg_declare(ptr %0, !50, !DIExpression(), !54)
+    #dbg_declare(ptr %0, !51, !DIExpression(DW_OP_plus_uconst, 8), !54)
+    #dbg_declare(ptr %0, !52, !DIExpression(), !54)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gf, i64 16, i1 false), !dbg !54
+  %a = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 0, !dbg !54
+  %a1 = getelementptr inbounds %struct.two, ptr %a, i32 0, i32 0, !dbg !54
+  %1 = load i32, ptr %a1, align 4, !dbg !54
+  %b = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 1, !dbg !54
+  %b2 = getelementptr inbounds %struct.two, ptr %b, i32 0, i32 1, !dbg !54
+  %2 = load i32, ptr %b2, align 4, !dbg !54
+  %add = add nsw i32 %1, %2, !dbg !54
+  ret i32 %add, !dbg !54
+}
+
+;; Hand-written part to test what happens when variables are smaller than the
+;; new alloca slices (i.e., check offset rewriting works correctly). Note that
+;; mem2reg incorrectly preserves the offest in the DIExpression of a variable
+;; stuffed into the upper bits of a value (that is a bug), e.g. alloca+offset
+;; becomes vreg+offest. It should either convert the offest to a shift, encode
+;; the register-bit offest using DW_OP_bit_piece, or use the new
+;; DW_OP_LLVM_extract_bits_[sz]ext operation.
+; COMMON-LABEL: _Z4fun3v()
+; COMMON-NEXT: entry:
+;; 16 bit variable e (!61): value ve (upper bits)
+;;
+;; 16 bit variable f (!62): value vgf (lower bits)
+;; 16 bit variable g (!63): value vgf (upper bits)
+;;
+;; 16 bit variable h (!64): deref dead_64_128
+; COMMON-NEXT: %[[dead_64_128:.*]] = alloca %struct.two
+; COMMON-NEXT: #dbg_declare(ptr %[[dead_64_128]], ![[h:[0-9]+]], !DIExpression(),
+; COMMON-NEXT: %[[ve:.*]] = load i32, ptr @gf
+;; FIXME: mem2reg bug - offset is incorrect - see comment above.
+; COMMON-NEXT: #dbg_value(i32 %[[ve]], ![[e:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2),
+; COMMON-NEXT: %[[vfg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 4)
+; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[f:[0-9]+]], !DIExpression(),
+;; FIXME: mem2reg bug - offset is incorrect - see comment above.
+; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[g:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2),
+define dso_local noundef i32 @_Z4fun3v() #0 !dbg !55 {
+entry:
+  %0 = alloca %struct.four, align 4
+    #dbg_declare(ptr %0, !61, !DIExpression(DW_OP_plus_uconst, 2), !58)
+    #dbg_declare(ptr %0, !62, !DIExpression(DW_OP_plus_uconst, 4), !58)
+    #dbg_declare(ptr %0, !63, !DIExpression(DW_OP_plus_uconst, 6), !58)
+    #dbg_declare(ptr %0, !64, !DIExpression(DW_OP_plus_uconst, 8), !58)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gf, i64 16, i1 false), !dbg !58
+  %1 = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 0, !dbg !58
+  %2 = getelementptr inbounds %struct.two, ptr %1, i32 0, i32 1, !dbg !58
+  %3 = load i32, ptr %2, align 4, !dbg !58
+  ret i32 %3, !dbg !58
+}
+
+;; Check that DW_OP_extract_bits_[sz]ext compose with expression offsets and
+;; that new fragments are not created. DW_OP_extract_bits_[sz]ext and fragments
+;; don't compose currently (but could). There are checks that expressions with
+;; bit extracts and fragments are dropped in SROA the test
+;; in llvm/test/DebugInfo/Generic/sroa-extract-bits.ll. FIXME: Don't do that.
+;;
+;; Checks are inline for this one.
+;;
+;; %p alloca is 128 bits
+;; SROA is going to split it in half, discard the lower bits, then split
+;; the upper bits in half and discard the upper bits leaving us with
+;; bits [64, 96) of the original alloca.
+;;
+; COMMON-LABEL: fun4
+define dso_local noundef i32 @fun4(i64 %0) !dbg !65 {
+entry:
+  %p = alloca [2 x i64]
+  %1 = getelementptr inbounds [2 x i64], ptr %p, i32 0, i32 1
+  store i64 %0, ptr %1
+  ; COMMON: %p.sroa.0.8.extract.trunc = trunc i64 %0 to i32
+  ;; Simple case - the expression offset (8 bytes) matches the offset of the
+  ;; slice into the alloca, so can be discarded away entirely.
+  ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[p:[0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 32)
+    #dbg_declare(ptr %p, !67, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_LLVM_extract_bits_zext, 0, 32), !66)
+  ;; The expression offset is 6 bytes, with a bit-extract offset of 32 bits from
+  ;; there for a total offset of 80 bits. SROA is going to split the alloca in
+  ;; half (at bit 64). The new expression needs a final bit extract offset of
+  ;; 80-64=16 bits applied to the mem2reg'd value.
+  ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[q:[0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8)
+    #dbg_declare(ptr %p, !68, !DIExpression(DW_OP_plus_uconst, 6, DW_OP_LLVM_extract_bits_zext, 32, 8), !66)
+  ;; FIXME: Just as in _Z4fun3v, the offset from the new alloca (2 bytes) is
+  ;; correct but mem2reg needs to change it from an offset to a shift or
+  ;; adjust the bit-extract (e.g., add the 2 byte offset to the existing 8 bit
+  ;; offset for a 24 bit total bit-extract offset).
+  ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[r:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2, DW_OP_LLVM_extract_bits_zext, 8, 8)
+    #dbg_declare(ptr %p, !69, !DIExpression(DW_OP_plus_uconst, 10, DW_OP_LLVM_extract_bits_zext, 8, 8), !66)
+  %2 = load i32, ptr %1, align 4
+  ret i32 %2
+}
+
+; COMMON-DAG: ![[x0]] = !DILocalVariable(name: "x",
+; COMMON-DAG: ![[y0]] = !DILocalVariable(name: "y",
+; COMMON-DAG: ![[A0]] = !DILocalVariable(scope:
+
+; COMMON-DAG: ![[x1]] = !DILocalVariable(name: "x",
+; COMMON-DAG: ![[y1]] = !DILocalVariable(name: "y",
+; COMMON-DAG: ![[A1]] = !DILocalVariable(scope:
+
+; COMMON-DAG: ![[e]] = !DILocalVariable(name: "e",
+; COMMON-DAG: ![[f]] = !DILocalVariable(name: "f",
+; COMMON-DAG: ![[g]] = !DILocalVariable(name: "g",
+; COMMON-DAG: ![[h]] = !DILocalVariable(name: "h",
+
+; COMMON-DAG: ![[p]] = !DILocalVariable(name: "p"
+; COMMON-DAG: ![[q]] = !DILocalVariable(name: "q"
+; COMMON-DAG: ![[r]] = !DILocalVariable(name: "r"
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17}
+!llvm.ident = !{!22}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "gt", scope: !2, file: !3, line: 1, type: !10, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 17.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.cpp", directory: "/")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "gf", scope: !2, file: !3, line: 7, type: !7, isLocal: false, isDefinition: true)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "four", file: !3, line: 7, size: 128, flags: DIFlagTypePassByValue, elements: !8, identifier: "_ZTS4four")
+!8 = !{!9, !15}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !7, file: !3, line: 7, baseType: !10, size: 64)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "two", file: !3, line: 1, size: 64, flags: DIFlagTypePassByValue, elements: !11, identifier: "_ZTS3two")
+!11 = !{!12, !14}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !3, line: 1, baseType: !13, size: 32)
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !3, line: 1, baseType: !13, size: 32, offset: 32)
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !7, file: !3, line: 7, baseType: !10, size: 64, offset: 64)
+!16 = !{i32 7, !"Dwarf Version", i32 5}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !{!"clang version 17.0.0"}
+!23 = distinct !DISubprogram(name: "fun1", linkageName: "_Z4fun1v", scope: !3, file: !3, line: 2, type: !24, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !26)
+!24 = !DISubroutineType(types: !25)
+!25 = !{!13}
+!26 = !{!27, !28, !29}
+!27 = !DILocalVariable(name: "x", scope: !23, file: !3, line: 3, type: !13)
+!28 = !DILocalVariable(name: "y", scope: !23, file: !3, line: 3, type: !13)
+!29 = !DILocalVariable(scope: !23, file: !3, line: 3, type: !10)
+!31 = !DILocation(line: 3, column: 9, scope: !23)
+!48 = distinct !DISubprogram(name: "fun2", linkageName: "_Z4fun2v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !49)
+!49 = !{!50, !51, !52}
+!50 = !DILocalVariable(name: "x", scope: !48, file: !3, line: 9, type: !10)
+!51 = !DILocalVariable(name: "y", scope: !48, file: !3, line: 9, type: !10)
+!52 = !DILocalVariable(scope: !48, file: !3, line: 9, type: !7)
+!54 = !DILocation(line: 9, column: 9, scope: !48)
+!55 = distinct !DISubprogram(name: "fun3", linkageName: "_Z4fun3v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !56)
+!56 = !{}
+!58 = !DILocation(line: 9, column: 9, scope: !55)
+!60 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+!61 = !DILocalVariable(name: "e", scope: !55, file: !3, line: 9, type: !60)
+!62 = !DILocalVariable(name: "f", scope: !55, file: !3, line: 9, type: !60)
+!63 = !DILocalVariable(name: "g", scope: !55, file: !3, line: 9, type: !60)
+!64 = !DILocalVariable(name: "h", scope: !55, file: !3, line: 9, type: !60)
+!65 = distinct !DISubprogram(name: "fun4", linkageName: "_Z4fun4v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !56)
+!66 = !DILocation(line: 9, column: 9, scope: !65)
+!67 = !DILocalVariable(name: "p", scope: !65, file: !3, line: 9, type: !13)
+!68 = !DILocalVariable(name: "q", scope: !65, file: !3, line: 9, type: !13)
+!69 = !DILocalVariable(name: "r", scope: !65, file: !3, line: 9, type: !13)
diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir
index b54c748ac9e84..67bfd85dcb379 100644
--- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir
+++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir
@@ -17,12 +17,12 @@
 # CHECK-LABEL: bb.3.bb3:
 # CHECK:      DBG_VALUE $ecx, $noreg, !{{[0-9]+}},
 # CHECK-SAME:                 !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, 
+# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}},
 # CHECK-SAME:                 !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 0, 32)
 # CHECK-SAME:                 $ecx, $r8d
 # CHECK-NEXT: DBG_VALUE $ebx, $noreg, !{{[0-9]+}},
 # CHECK-SAME:                 !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, 
+# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}},
 # CHECK-SAME:                 !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 32, 32)
 # CHECK-SAME:                 $ebx, $r10d
 # CHECK-NEXT: XOR32rr
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
index 2f1140cf62910..6990a82c4acfe 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
@@ -7,9 +7,9 @@
 ## addresses.
 # RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM
 
-# MAPPING_SYM:      0000000000000000 t $d.0
-# MAPPING_SYM-NEXT: 000000000000000c t $d.2
-# MAPPING_SYM-NEXT: 0000000000000004 t $x.1
+# MAPPING_SYM:      0000000000000000 t $d
+# MAPPING_SYM-NEXT: 000000000000000c t $d
+# MAPPING_SYM-NEXT: 0000000000000004 t $x
 # MAPPING_SYM-NEXT: 0000000000000000 T foo
 
 # RUN: llvm-symbolizer --obj=%t 0 4 0xc | FileCheck %s -check-prefix SYMBOL
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/arm-mapping-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/arm-mapping-symbol.s
index 6e17ef29ae577..27310c09fb07c 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/arm-mapping-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/arm-mapping-symbol.s
@@ -5,19 +5,19 @@
 
 ## Verify that mapping symbols are actually present in the object at expected
 ## addresses.
-# RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_A
+# RUN: llvm-nm --special-syms %t | FileCheck %s --check-prefix=MAPPING_A --match-full-lines
 
-# MAPPING_A:      00000004 t $a.1
-# MAPPING_A-NEXT: 00000000 t $d.0
-# MAPPING_A-NEXT: 00000008 t $d.2
+# MAPPING_A:      00000004 t $a
+# MAPPING_A-NEXT: 00000000 t $d
+# MAPPING_A-NEXT: 00000008 t $d
 # MAPPING_A-NEXT: 00000000 T foo
 
 # RUN: llvm-mc -filetype=obj -triple=thumbv7-none-linux %s -o %tthumb
-# RUN: llvm-nm --special-syms %tthumb | FileCheck %s -check-prefix MAPPING_T
+# RUN: llvm-nm --special-syms %tthumb | FileCheck %s --check-prefix=MAPPING_T --match-full-lines
 
-# MAPPING_T:      00000000 t $d.0
-# MAPPING_T-NEXT: 00000006 t $d.2
-# MAPPING_T-NEXT: 00000004 t $t.1
+# MAPPING_T:      00000000 t $d
+# MAPPING_T-NEXT: 00000006 t $d
+# MAPPING_T-NEXT: 00000004 t $t
 # MAPPING_T-NEXT: 00000000 T foo
 
 # RUN: llvm-symbolizer --obj=%t 4 8 | FileCheck %s -check-prefix SYMBOL
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
index d1cab1250636d..609b5e7e01b61 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
@@ -5,11 +5,11 @@
 
 ## Verify that mapping symbols are actually present in the object at expected
 ## addresses.
-# RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM
+# RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM --match-full-lines
 
-# MAPPING_SYM:      00000000 t $d.0
-# MAPPING_SYM-NEXT: 00000008 t $d.2
-# MAPPING_SYM-NEXT: 00000004 t $t.1
+# MAPPING_SYM:      00000000 t $d
+# MAPPING_SYM-NEXT: 00000008 t $d
+# MAPPING_SYM-NEXT: 00000004 t $t
 # MAPPING_SYM-NEXT: 00000000 T foo
 
 # RUN: llvm-symbolizer --obj=%t 0 4 0xc | FileCheck %s -check-prefix SYMBOL
diff --git a/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.test b/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.test
new file mode 100644
index 0000000000000..5729155c937e7
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.test
@@ -0,0 +1,41 @@
+# This checks that .debug_aranges is always generated for the SCE debugger
+# tuning.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+RUN: llc -debugger-tune=sce -filetype=obj foo.ll -o foo.o
+RUN: llvm-dwarfdump -debug-aranges foo.o | FileCheck %s
+
+CHECK:      .debug_aranges contents:
+CHECK-NEXT: Address Range Header:
+CHECK-SAME:   length = 0x0000002c,
+
+#--- foo.c
+int foo;
+
+#--- gen
+clang --target=x86_64-linux-gnu -g -S -emit-llvm foo.c -o -
+
+#--- foo.ll
+; ModuleID = 'foo.c'
+source_filename = "foo.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@foo = dso_local global i32 0, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!6, !7, !8, !9, !10, !11, !12}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "foo.c", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "e0f2c326d820c28d30f9b0ea95178f64")
+!4 = !{!0}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !{i32 7, !"Dwarf Version", i32 5}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 8, !"PIC Level", i32 2}
+!10 = !{i32 7, !"PIE Level", i32 2}
+!11 = !{i32 7, !"uwtable", i32 2}
+!12 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/DebugInfo/X86/loop-align-debug.ll b/llvm/test/DebugInfo/X86/loop-align-debug.ll
new file mode 100644
index 0000000000000..a0302d08faa0c
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/loop-align-debug.ll
@@ -0,0 +1,55 @@
+; RUN: llc %s --filetype=obj -o %t
+; RUN: llvm-objdump -d %t | FileCheck %s --check-prefixes=OBJ
+; RUN: llvm-dwarfdump --debug-line %t | FileCheck %s --check-prefixes=DBG
+; RUN: llc %s -o - | FileCheck %s --check-prefixes=ASM
+
+; OBJ: 1:{{.*}}nop
+
+;;     Address            Line   Column File   ISA Discriminator OpIndex Flags
+; DBG: 0x0000000000000000      3      0      0   0             0       0  is_stmt
+; DBG: 0x0000000000000001      0      0      0   0             0       0
+; DBG: 0x0000000000000010      5      0      0   0             0       0  is_stmt prologue_end
+; DBG: 0x0000000000000017      5      0      0   0             0       0  is_stmt end_sequence
+
+; ASM:      .loc    0 0 0 is_stmt 0
+; ASM-NEXT: .L{{.*}}:
+; ASM-NEXT: .p2align        4, 0x90
+
+;; $ cat test.cpp
+;; void g();
+;; void f() {
+;;   [[clang::code_align(16)]]
+;;   while (1) { g(); }
+;; }
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @f() local_unnamed_addr !dbg !9 {
+entry:
+  br label %while.body, !dbg !12
+
+while.body:                                       ; preds = %entry, %while.body
+  tail call void @g(), !dbg !12
+  br label %while.body, !dbg !12, !llvm.loop !13
+}
+
+declare !dbg !16 void @g() local_unnamed_addr
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{!"clang version 19.0.0git"}
+!9 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DISubroutineType(types: !11)
+!11 = !{}
+!12 = !DILocation(line: 5, scope: !9)
+!13 = distinct !{!13, !12, !12, !14, !15}
+!14 = !{!"llvm.loop.mustprogress"}
+!15 = !{!"llvm.loop.align", i32 16}
+!16 = !DISubprogram(name: "g", scope: !1, file: !1, line: 2, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
diff --git a/llvm/test/DebugInfo/XCOFF/empty.ll b/llvm/test/DebugInfo/XCOFF/empty.ll
index c1393907169fc..445e1a63447ef 100644
--- a/llvm/test/DebugInfo/XCOFF/empty.ll
+++ b/llvm/test/DebugInfo/XCOFF/empty.ll
@@ -79,7 +79,7 @@ entry:
 ; ASM32-NEXT:                                          # -- End function
 ; ASM32-NEXT:  L..sec_end0:
 ; ASM32:               .dwsect 0x60000
-; ASM32-NEXT:  L...dwabrev:
+; ASM32-NEXT:  .dwabrev:
 ; ASM32-NEXT:          .byte   1                               # Abbreviation Code
 ; ASM32-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; ASM32-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -135,10 +135,10 @@ entry:
 ; ASM32-NEXT:          .byte   0                               # EOM(2)
 ; ASM32-NEXT:          .byte   0                               # EOM(3)
 ; ASM32:               .dwsect 0x10000
-; ASM32-NEXT:  L...dwinfo:
+; ASM32-NEXT:  .dwinfo:
 ; ASM32-NEXT:  L..cu_begin0:
 ; ASM32-NEXT:          .vbyte  2, 4                            # DWARF version number
-; ASM32-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
+; ASM32-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
 ; ASM32-NEXT:          .byte   4                               # Address Size (in bytes)
 ; ASM32-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x38 DW_TAG_compile_unit
 ; ASM32-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -166,7 +166,7 @@ entry:
 ; ASM32-NEXT:          .byte   0                               # End Of Children Mark
 ; ASM32-NEXT:  L..debug_info_end0:
 ; ASM32:               .dwsect 0x70000
-; ASM32-NEXT:  L...dwstr:
+; ASM32-NEXT:  .dwstr:
 ; ASM32-NEXT:  L..info_string0:
 ; ASM32-NEXT:          .string "clang version 12.0.0"          # string offset=0
 ; ASM32-NEXT:  L..info_string1:
@@ -179,7 +179,7 @@ entry:
 ; ASM32-NEXT:          .string "int"                           # string offset=36
 ; ASM32-NEXT:          .toc
 ; ASM32:               .dwsect 0x20000
-; ASM32-NEXT:  L...dwline:
+; ASM32-NEXT:  .dwline:
 ; ASM32-NEXT:  L..debug_line_0:
 ; ASM32-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; ASM32-NEXT:          .vbyte  2, 4
@@ -281,7 +281,7 @@ entry:
 ; ASM64-NEXT:                                          # -- End function
 ; ASM64-NEXT:  L..sec_end0:
 ; ASM64:               .dwsect 0x60000
-; ASM64-NEXT:  L...dwabrev:
+; ASM64-NEXT:  .dwabrev:
 ; ASM64-NEXT:          .byte   1                               # Abbreviation Code
 ; ASM64-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; ASM64-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -337,10 +337,10 @@ entry:
 ; ASM64-NEXT:          .byte   0                               # EOM(2)
 ; ASM64-NEXT:          .byte   0                               # EOM(3)
 ; ASM64:               .dwsect 0x10000
-; ASM64-NEXT:  L...dwinfo:
+; ASM64-NEXT:  .dwinfo:
 ; ASM64-NEXT:  L..cu_begin0:
 ; ASM64-NEXT:          .vbyte  2, 4                            # DWARF version number
-; ASM64-NEXT:          .vbyte  8, L...dwabrev                  # Offset Into Abbrev. Section
+; ASM64-NEXT:          .vbyte  8, .dwabrev                  # Offset Into Abbrev. Section
 ; ASM64-NEXT:          .byte   8                               # Address Size (in bytes)
 ; ASM64-NEXT:          .byte   1                               # Abbrev [1] 0x17:0x58 DW_TAG_compile_unit
 ; ASM64-NEXT:          .vbyte  8, L..info_string0              # DW_AT_producer
@@ -368,7 +368,7 @@ entry:
 ; ASM64-NEXT:          .byte   0                               # End Of Children Mark
 ; ASM64-NEXT:  L..debug_info_end0:
 ; ASM64:               .dwsect 0x70000
-; ASM64-NEXT:  L...dwstr:
+; ASM64-NEXT:  .dwstr:
 ; ASM64-NEXT:  L..info_string0:
 ; ASM64-NEXT:          .string "clang version 12.0.0"          # string offset=0
 ; ASM64-NEXT:  L..info_string1:
@@ -381,7 +381,7 @@ entry:
 ; ASM64-NEXT:          .string "int"                           # string offset=36
 ; ASM64-NEXT:         .toc
 ; ASM64:               .dwsect 0x20000
-; ASM64-NEXT:  L...dwline:
+; ASM64-NEXT:  .dwline:
 ; ASM64-NEXT:  L..debug_line_0:
 ; ASM64-NEXT:  .set L..line_table_start0, L..debug_line_0-12
 ; ASM64-NEXT:          .vbyte  2, 4
diff --git a/llvm/test/DebugInfo/XCOFF/explicit-section.ll b/llvm/test/DebugInfo/XCOFF/explicit-section.ll
index ed2ffb709168e..530267ec634fc 100644
--- a/llvm/test/DebugInfo/XCOFF/explicit-section.ll
+++ b/llvm/test/DebugInfo/XCOFF/explicit-section.ll
@@ -130,7 +130,7 @@ entry:
 ; CHECK-NEXT:                                          # -- End function
 ; CHECK-NEXT:  L..sec_end0:
 ; CHECK:               .dwsect 0x60000
-; CHECK-NEXT:  L...dwabrev:
+; CHECK-NEXT:  .dwabrev:
 ; CHECK-NEXT:          .byte   1                               # Abbreviation Code
 ; CHECK-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; CHECK-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -184,10 +184,10 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # EOM(2)
 ; CHECK-NEXT:          .byte   0                               # EOM(3)
 ; CHECK:               .dwsect 0x10000
-; CHECK-NEXT:  L...dwinfo:
+; CHECK-NEXT:  .dwinfo:
 ; CHECK-NEXT:  L..cu_begin0:
 ; CHECK-NEXT:          .vbyte  2, 3                            # DWARF version number
-; CHECK-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
+; CHECK-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
 ; CHECK-NEXT:          .byte   4                               # Address Size (in bytes)
 ; CHECK-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x4f DW_TAG_compile_unit
 ; CHECK-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -224,7 +224,7 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # End Of Children Mark
 ; CHECK-NEXT:  L..debug_info_end0:
 ; CHECK:               .dwsect 0x80000
-; CHECK-NEXT:  L...dwrnges:
+; CHECK-NEXT:  .dwrnges:
 ; CHECK-NEXT:  L..debug_ranges0:
 ; CHECK-NEXT:          .vbyte  4, L..func_begin0
 ; CHECK-NEXT:          .vbyte  4, L..func_end0
@@ -233,7 +233,7 @@ entry:
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK:               .dwsect 0x70000
-; CHECK-NEXT:  L...dwstr:
+; CHECK-NEXT:  .dwstr:
 ; CHECK-NEXT:  L..info_string0:
 ; CHECK-NEXT:  	.string	"clang version 13.0.0"          # string offset=0
 ; CHECK-NEXT:  L..info_string1:
@@ -248,7 +248,7 @@ entry:
 ; CHECK-NEXT:  	.string	"main"                          # string offset=39
 ; CHECK-NEXT:          .toc
 ; CHECK:               .dwsect 0x20000
-; CHECK-NEXT:  L...dwline:
+; CHECK-NEXT:  .dwline:
 ; CHECK-NEXT:  L..debug_line_0:
 ; CHECK-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; CHECK-NEXT:          .vbyte  2, 3
diff --git a/llvm/test/DebugInfo/XCOFF/function-sections.ll b/llvm/test/DebugInfo/XCOFF/function-sections.ll
index c899089102c64..5103b66db114d 100644
--- a/llvm/test/DebugInfo/XCOFF/function-sections.ll
+++ b/llvm/test/DebugInfo/XCOFF/function-sections.ll
@@ -112,7 +112,7 @@ entry:
 ; CHECK-NEXT:                                          # -- End function
 ; CHECK-NEXT:  L..sec_end0:
 ; CHECK:               .dwsect 0x60000
-; CHECK-NEXT:  L...dwabrev:
+; CHECK-NEXT:  .dwabrev:
 ; CHECK-NEXT:          .byte   1                               # Abbreviation Code
 ; CHECK-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; CHECK-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -168,10 +168,10 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # EOM(2)
 ; CHECK-NEXT:          .byte   0                               # EOM(3)
 ; CHECK:               .dwsect 0x10000
-; CHECK-NEXT:  L...dwinfo:
+; CHECK-NEXT:  .dwinfo:
 ; CHECK-NEXT:  L..cu_begin0:
 ; CHECK-NEXT:          .vbyte  2, 3                            # DWARF version number
-; CHECK-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
+; CHECK-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
 ; CHECK-NEXT:          .byte   4                               # Address Size (in bytes)
 ; CHECK-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x51 DW_TAG_compile_unit
 ; CHECK-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -210,7 +210,7 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # End Of Children Mark
 ; CHECK-NEXT:  L..debug_info_end0:
 ; CHECK:               .dwsect 0x80000
-; CHECK-NEXT:  L...dwrnges:
+; CHECK-NEXT:  .dwrnges:
 ; CHECK-NEXT:  L..debug_ranges0:
 ; CHECK-NEXT:          .vbyte  4, L..func_begin0
 ; CHECK-NEXT:          .vbyte  4, L..func_end0
@@ -219,7 +219,7 @@ entry:
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK:               .dwsect 0x70000
-; CHECK-NEXT:  L...dwstr:
+; CHECK-NEXT:  .dwstr:
 ; CHECK-NEXT:  L..info_string0:
 ; CHECK-NEXT:          .string "clang version 13.0.0"          # string offset=0
 ; CHECK-NEXT:  L..info_string1:
@@ -234,7 +234,7 @@ entry:
 ; CHECK-NEXT:          .string "bar"                           # string offset=39
 ; CHECK-NEXT:          .toc
 ; CHECK:               .dwsect 0x20000
-; CHECK-NEXT:  L...dwline:
+; CHECK-NEXT:  .dwline:
 ; CHECK-NEXT:  L..debug_line_0:
 ; CHECK-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; CHECK-NEXT:          .vbyte  2, 3
diff --git a/llvm/test/DebugInfo/debuglineinfo-path.ll b/llvm/test/DebugInfo/debuglineinfo-path.ll
index f6574a2a46f12..7e408aff6ceff 100644
--- a/llvm/test/DebugInfo/debuglineinfo-path.ll
+++ b/llvm/test/DebugInfo/debuglineinfo-path.ll
@@ -1,8 +1,9 @@
 ; Make sure that absolute source dir is detected correctly regardless of the platform.
 
 ; On powerpc llvm-nm describes win_func as a global variable, not a function. It breaks the test.
-; It is not essential to DWARF path handling code we're testing here.
-; UNSUPPORTED: target=powerpc{{.*}}
+; On PlayStation, (pre-relocated) .debug_aranges cause symbolization to fail, also breaking the test.
+; These are not essential to DWARF path handling code we're testing here.
+; UNSUPPORTED: target=powerpc{{.*}}, target=x86_64-{{.*-ps[45]}}
 ; REQUIRES: object-emission
 ; RUN: %llc_dwarf -O0 -filetype=obj -o %t < %s
 ; RUN: llvm-nm --radix=o %t | grep posix_absolute_func > %t.posix_absolute_func
diff --git a/llvm/test/DebugInfo/omit-empty.ll b/llvm/test/DebugInfo/omit-empty.ll
index 0267ec5556f11..351d3055e039b 100644
--- a/llvm/test/DebugInfo/omit-empty.ll
+++ b/llvm/test/DebugInfo/omit-empty.ll
@@ -1,4 +1,4 @@
-; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s
+; RUN: %llc_dwarf %s -filetype=obj -generate-arange-section -o - | llvm-objdump -h - | FileCheck %s
 ; REQUIRES: object-emission
 
 ; CHECK-NOT: .debug_
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_common_x_and_addr_getter.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_common_x_and_addr_getter.s
new file mode 100644
index 0000000000000..0f352f18a60f1
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_common_x_and_addr_getter.s
@@ -0,0 +1,10 @@
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_getXAddr
+	.p2align	2
+_getXAddr:
+	adrp	x0, _x@GOTPAGE
+	ldr	x0, [x0, _x@GOTPAGEOFF]
+	ret
+
+	.comm	_x,4,2
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_strong_x_and_addr_getter.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_strong_x_and_addr_getter.s
new file mode 100644
index 0000000000000..88486ff000f5c
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_strong_x_and_addr_getter.s
@@ -0,0 +1,15 @@
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_getXAddr
+	.p2align	2
+_getXAddr:
+	adrp	x0, _x@PAGE
+	add	x0, x0, _x@PAGEOFF
+	ret
+
+	.section	__DATA,__data
+	.globl	_x
+	.p2align	2, 0x0
+_x:
+	.long	42
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_common_symbol_x_multiple_defs.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_common_symbol_x_multiple_defs.s
new file mode 100644
index 0000000000000..5a8a916d30afe
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_common_symbol_x_multiple_defs.s
@@ -0,0 +1,33 @@
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=arm64-apple-darwin19 -filetype=obj -o %t/main.o %s
+# RUN: llvm-mc -triple=arm64-apple-darwin19 -filetype=obj -o %t/aux_common.o \
+# RUN:     %S/Inputs/MachO_common_x_and_addr_getter.s
+# RUN: llvm-mc -triple=arm64-apple-darwin19 -filetype=obj -o %t/aux_strong.o \
+# RUN:     %S/Inputs/MachO_strong_x_and_addr_getter.s
+# RUN: llvm-jitlink -noexec %t/main.o %t/aux_common.o
+# RUN: llvm-jitlink -noexec -check=%s %t/main.o %t/aux_strong.o
+#
+# Check that linking multiple common definitions of the same symbol (in this
+# case _x) doesn't lead to an "Unexpected definitions" error.
+#
+# rdar://132314264
+#
+# Check that strong defs override:
+# jitlink-check: *{8}_x = 42
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_main
+	.p2align	2
+_main:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	bl	_getXAddr
+	adrp	x8, _x@GOTPAGE
+	ldr	x8, [x8, _x@GOTPAGEOFF]
+	cmp	x0, x8
+	cset	w0, eq
+	ldp	x29, x30, [sp], #16
+	ret
+
+	.comm	_x,4,2
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
index cd1401ebd33a3..2d4ad30f94d8d 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
@@ -6,7 +6,7 @@
 #
 # CHECK: Creating graph symbols...
 # CHECK:      7: Creating defined graph symbol for COFF symbol "var" in (common) (index: 0)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000004, linkage: strong, scope: default, dead  -   var
+# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000004, linkage: weak, scope: default, dead  -   var
 
 	.text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s
new file mode 100644
index 0000000000000..f2bd64f02ca80
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s
@@ -0,0 +1,64 @@
+# Check that the generated __inits symbol name does not clash between objects
+# with the same base name in two different static archives. Otherwise we get a
+# duplicate symbol error.
+
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: split-file %s %t
+
+# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \
+# RUN:   -o %t/dir1/myobj.o %t/dir1/myobj.s
+# RUN: llvm-ar crs %t/libmyobj1.a %t/dir1/myobj.o
+
+# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \
+# RUN:   -o %t/dir2/myobj.o %t/dir2/myobj.s
+# RUN: llvm-ar crs %t/libmyobj2.a %t/dir2/myobj.o
+
+# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \
+# RUN:   -o %t/main.o %t/main.s
+
+# RUN: llvm-jitlink -noexec %t/main.o -lmyobj1 -lmyobj2 -L%t
+
+#--- dir1/myobj.s
+        .section        __TEXT,__text,regular,pure_instructions
+        .build_version macos, 15, 0     sdk_version 15, 0
+        .globl  _myobj1
+        .p2align        4, 0x90
+_myobj1:                                     ## @f
+        retq
+
+        .section        __DATA,__mod_init_func,mod_init_funcs
+        .p2align        3, 0x0
+        .quad   _myobj1
+
+        .subsections_via_symbols
+
+#--- dir2/myobj.s
+        .section        __TEXT,__text,regular,pure_instructions
+        .build_version macos, 15, 0     sdk_version 15, 0
+        .globl  _myobj2
+        .p2align        4, 0x90
+_myobj2:                                     ## @f
+        retq
+
+        .section        __DATA,__mod_init_func,mod_init_funcs
+        .p2align        3, 0x0
+        .quad   _myobj2
+
+        .subsections_via_symbols
+
+#--- main.s
+
+        .section  __TEXT,__text,regular,pure_instructions
+
+        .globl  _main
+        .p2align  4, 0x90
+_main:
+        pushq   %rbp
+        movq    %rsp, %rbp
+        callq   _myobj1
+        callq   _myobj2
+        xorl    %eax, %eax
+        popq    %rbp
+        retq
+
+        .subsections_via_symbols
diff --git a/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll b/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll
new file mode 100644
index 0000000000000..65b27ee2241dd
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll
@@ -0,0 +1,34 @@
+; Tests that asan skips pre-split coroutine and NoopCoro.Frame
+; RUN: opt < %s -S -passes=coro-early,asan | FileCheck %s
+
+; CHECK: %NoopCoro.Frame = type { ptr, ptr }
+; CHECK: @NoopCoro.Frame.Const = private constant %NoopCoro.Frame { ptr @__NoopCoro_ResumeDestroy, ptr @__NoopCoro_ResumeDestroy }
+; CHECK-NOT: @0 = private alias { %NoopCoro.Frame,
+
+%struct.Promise = type { %"struct.std::__n4861::coroutine_handle" }
+%"struct.std::__n4861::coroutine_handle" = type { ptr }
+
+; CHECK-LABEL: @foo(
+define ptr @foo() #0 {
+; CHECK-NEXT: entry
+; CHECK-NOT: %asan_local_stack_base
+entry:
+  %__promise = alloca %struct.Promise, align 8
+  %0 = call token @llvm.coro.id(i32 16, ptr nonnull %__promise, ptr null, ptr null)
+  %1 = call ptr @llvm.coro.noop()
+  ret ptr %1
+}
+
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
+declare ptr @llvm.coro.noop()
+
+attributes #0 = { sanitize_address presplitcoroutine }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "hand-written", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
index d5cb27d9b1ec1..0ef94235fd515 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
@@ -1,15 +1,9 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=HOT70
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=HOT99
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE
 ; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 2>&1 | FileCheck %s --check-prefix=ALL
 ; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 2>&1 | FileCheck %s --check-prefix=NONE
-
-; HOT70: remark: <unknown>:0:0: Sanitized: F=sanitized
-; HOT70: @sanitized
-; HOT70-NEXT: @__hwasan_tls
-
-; HOT99: remark: <unknown>:0:0: Skipped: F=sanitized
-; HOT99: @sanitized
-; HOT99-NEXT: %x = alloca i8, i64 4
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=NONE
 
 ; ALL: remark: <unknown>:0:0: Sanitized: F=sanitize
 ; ALL: @sanitized
diff --git a/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll b/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll
new file mode 100644
index 0000000000000..c7ff129e5b4c4
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll
@@ -0,0 +1,57 @@
+; Test basic memory profiler instrumentation with histograms.
+;
+; RUN: opt < %s -passes='function(memprof),memprof-module' -memprof-histogram -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr @memprof.module_ctor]
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @memprof.module_ctor, ptr null }]
+
+define i32 @test_load(ptr %a) {
+entry:
+  %tmp1 = load i32, ptr %a, align 4
+  ret i32 %tmp1
+}
+; CHECK-LABEL: @test_load
+; CHECK:         %[[SHADOW_OFFSET:[^ ]*]] = load i64, ptr @__memprof_shadow_memory_dynamic_address
+; CHECK-NEXT:    %[[LOAD_ADDR:[^ ]*]] = ptrtoint ptr %a to i64
+; CHECK-NEXT:    %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -8
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-NEXT:    add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT:    %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT:    %[[LOAD_SHADOW:[^ ]*]] = load i8, ptr %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT:    %[[ICMP_MAX_COUNT:[^ ]*]] = icmp ult i8 %[[LOAD_SHADOW]], -1
+; CHECK-NEXT:    br i1 %[[ICMP_MAX_COUNT]], label %[[INC_LABEL:[^ ]*]], label %[[ELSE_LABEL:[^ ]*]]
+; CHECK:         [[INC_LABEL]]:
+; CHECK-NEXT:    %[[NEW_SHADOW:[^ ]*]] = add i8 %[[LOAD_SHADOW]], 1
+; CHECK-NEXT:    store i8 %[[NEW_SHADOW]], ptr %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT:    br label %[[ELSE_LABEL]]
+; The actual load.
+; CHECK:         [[ELSE_LABEL]]:
+; CHECK-NEXT:    %tmp1 = load i32, ptr %a
+; CHECK-NEXT:    ret i32 %tmp1
+
+define void @test_store(ptr %a) {
+entry:
+  store i32 42, ptr %a, align 4
+  ret void
+}
+; CHECK-LABEL: @test_store
+; CHECK:         %[[SHADOW_OFFSET:[^ ]*]] = load i64, ptr @__memprof_shadow_memory_dynamic_address
+; CHECK-NEXT:    %[[LOAD_ADDR:[^ ]*]] = ptrtoint ptr %a to i64
+; CHECK-NEXT:    %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -8
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-NEXT:    add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT:    %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT:    %[[STORE_SHADOW:[^ ]*]] = load i8, ptr %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT:    %[[ICMP_MAX_COUNT:[^ ]*]] = icmp ult i8 %[[STORE_SHADOW]], -1
+; CHECK-NEXT:    br i1 %[[ICMP_MAX_COUNT]], label %[[INC_LABEL:[^ ]*]], label %[[ELSE_LABEL:[^ ]*]]
+; CHECK:         [[INC_LABEL]]:
+; CHECK-NEXT:    %[[NEW_SHADOW:[^ ]*]] = add i8 %[[STORE_SHADOW]], 1
+; CHECK-NEXT:    store i8 %[[NEW_SHADOW]], ptr %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT:    br label %[[ELSE_LABEL]]
+; The actual store.
+; CHECK:         [[ELSE_LABEL]]:
+; CHECK-NEXT:    store i32 42, ptr %a, align 4
+; CHECK-NEXT:    ret void
\ No newline at end of file
diff --git a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
index 20c002e0565db..fbdc7b57be428 100644
--- a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
@@ -1,9 +1,7 @@
 ; Check that MC/DC intrinsics are properly lowered
 ; RUN: opt < %s -passes=instrprof -S | FileCheck %s --check-prefixes=CHECK,BASIC
 ; RUN: opt < %s -passes=instrprof -S -instrprof-atomic-counter-update-all | FileCheck %s --check-prefixes=CHECK,ATOMIC
-; RUN: opt < %s -passes=instrprof -runtime-counter-relocation -S 2>&1 | FileCheck %s --check-prefix RELOC
-
-; RELOC: Runtime counter relocation is presently not supported for MC/DC bitmaps
+; RUN: opt < %s -passes=instrprof -S -runtime-counter-relocation | FileCheck %s --check-prefixes=CHECK,RELOC
 
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -14,10 +12,15 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define dso_local void @test(i32 noundef %A) {
 entry:
+  ; RELOC: %profbm_bias = load i64, ptr @__llvm_profile_bitmap_bias, align [[#]], !invariant.load !0
+  ; RELOC: %profc_bias = load i64, ptr @__llvm_profile_counter_bias, align [[#]]
   %A.addr = alloca i32, align 4
   %mcdc.addr = alloca i32, align 4
   call void @llvm.instrprof.cover(ptr @__profn_test, i64 99278, i32 5, i32 0)
   ; BASIC: store i8 0, ptr @__profc_test, align 1
+  ; RELOC: %[[PROFC_INTADDR:.+]] = add i64 ptrtoint (ptr @__profc_test to i64), %profc_bias
+  ; RELOC: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_INTADDR]] to ptr
+  ; RELOC: store i8 0, ptr %[[PROFC_ADDR]], align 1
 
   call void @llvm.instrprof.mcdc.parameters(ptr @__profn_test, i64 99278, i32 1)
   store i32 0, ptr %mcdc.addr, align 4
@@ -25,6 +28,7 @@ entry:
   %tobool = icmp ne i32 %0, 0
 
   call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 99278, i32 0, ptr %mcdc.addr)
+  ; RELOC:      [[PROFBM_ADDR:%.+]] = getelementptr i8, ptr @__profbm_test, i64 %profbm_bias
   ; CHECK:      %[[TEMP0:mcdc.*]] = load i32, ptr %mcdc.addr, align 4
   ; CHECK-NEXT: %[[TEMP:[0-9]+]] = add i32 %[[TEMP0]], 0
   ; CHECK-NEXT: %[[LAB4:[0-9]+]] = lshr i32 %[[TEMP]], 3
@@ -39,7 +43,9 @@ entry:
 ; CHECK: define private void @[[RMW_OR]](ptr %[[ARGPTR:.+]], i8 %[[ARGVAL:.+]])
 ; CHECK:      %[[BITS:.+]] = load i8, ptr %[[ARGPTR]], align 1
 ; BASIC-NEXT: %[[LAB11:[0-9]+]] = or i8 %[[BITS]], %[[ARGVAL]]
+; RELOC-NEXT: %[[LAB11:[0-9]+]] = or i8 %[[BITS]], %[[ARGVAL]]
 ; BASIC-NEXT: store i8 %[[LAB11]], ptr %[[ARGPTR]], align 1
+; RELOC-NEXT: store i8 %[[LAB11]], ptr %[[ARGPTR]], align 1
 ; ATOMIC-NEXT: %[[MASKED:.+]] = and i8 %[[BITS]], %[[ARGVAL]]
 ; ATOMIC-NEXT: %[[SHOULDWRITE:.+]] = icmp ne i8 %[[MASKED]], %[[ARGVAL]]
 ; ATOMIC-NEXT: br i1 %[[SHOULDWRITE]], label %[[WRITE:.+]], label %[[SKIP:.+]], !prof ![[MDPROF:[0-9]+]]
diff --git a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
index 53b1e4918e8d1..e1da23e7be31c 100644
--- a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
@@ -1,9 +1,13 @@
 ; RUN: opt < %s -S -passes=instrprof | FileCheck %s
 ; RUN: opt < %s -S -passes=instrprof -runtime-counter-relocation | FileCheck -check-prefixes=RELOC %s
+; RUN: opt < %s -S -passes=instrprof,inline,gvn -runtime-counter-relocation | FileCheck -check-prefixes=RELOC,RELOCOPT %s
+; RUN: opt < %s -S -passes=instrprof            -runtime-counter-relocation -instrprof-atomic-counter-update-all | FileCheck -check-prefixes=ATOMIC %s
+; RUN: opt < %s -S -passes=instrprof,inline,gvn -runtime-counter-relocation -instrprof-atomic-counter-update-all | FileCheck -check-prefixes=ATOMIC,ATOMICOPT %s
 
 target triple = "x86_64-unknown-linux-gnu"
 
 @__profn_foo = private constant [3 x i8] c"foo"
+@__profn_bar = private constant [3 x i8] c"bar"
 ; RELOC: $__llvm_profile_counter_bias = comdat any
 ; RELOC: @__llvm_profile_counter_bias = linkonce_odr hidden global i64 0, comdat
 
@@ -12,14 +16,34 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT: %[[PGOCOUNTINC:.+]] = add i64 %[[PGOCOUNT]], 1
 ; CHECK-NEXT: store i64 %[[PGOCOUNTINC]], ptr @__profc_foo
 ; RELOC-LABEL: define void @foo
-; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias
+; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align {{[0-9]+}}, !invariant.load !0
 ; RELOC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
 ; RELOC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
 ; RELOC-NEXT: %[[PGOCOUNT:.+]] = load i64, ptr %[[PROFC_ADDR]]
 ; RELOC-NEXT: %[[PGOCOUNTINC:.+]] = add i64 %[[PGOCOUNT]], 1
 ; RELOC-NEXT: store i64 %[[PGOCOUNTINC]], ptr %[[PROFC_ADDR]]
+; RELOCOPT-NEXT: %[[PROFC_BIAS1:.+]] = add i64 ptrtoint (ptr @__profc_bar to i64), %[[BIAS]]
+; RELOCOPT-NEXT: %[[PROFC_ADDR1:.+]] = inttoptr i64 %[[PROFC_BIAS1]] to ptr
+; RELOCOPT-NEXT: %[[PGOCOUNT1:.+]] = load i64, ptr %[[PROFC_ADDR1]]
+; RELOCOPT-NEXT: %[[PGOCOUNTINC1:.+]] = add i64 %[[PGOCOUNT1]], 1
+; RELOCOPT-NEXT: store i64 %[[PGOCOUNTINC1]], ptr %[[PROFC_ADDR1]]
+; ATOMIC-LABEL: define void @foo
+; ATOMIC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align {{[0-9]+}}, !invariant.load !0
+; ATOMIC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
+; ATOMIC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
+; ATOMIC-NEXT: %[[PGOCOUNTINC:.+]] = atomicrmw add ptr %[[PROFC_ADDR]], i64 1 monotonic
+; ATOMICOPT-NEXT: %[[PROFC_BIAS1:.+]] = add i64 ptrtoint (ptr @__profc_bar to i64), %[[BIAS]]
+; ATOMICOPT-NEXT: %[[PROFC_ADDR1:.+]] = inttoptr i64 %[[PROFC_BIAS1]] to ptr
+; ATOMICOPT-NEXT: %[[PGOCOUNTINC1:.+]] = atomicrmw add ptr %[[PROFC_ADDR1]], i64 1 monotonic
+
+define void @bar() {
+  call void @llvm.instrprof.increment(ptr @__profn_bar, i64 0, i32 1, i32 0)
+  ret void
+}
+
 define void @foo() {
   call void @llvm.instrprof.increment(ptr @__profn_foo, i64 0, i32 1, i32 0)
+  call void @bar()
   ret void
 }
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
index 1c4ca47b60c13..421f00fcbc56b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Test memory sanitizer instrumentation for Arm NEON VST instructions.
 ;
 ; RUN: opt < %s -passes=msan -S | FileCheck %s
@@ -11,24 +11,24 @@ target triple = "aarch64--linux-android9001"
 ; -----------------------------------------------------------------------------------------------------------------------------------------------
 
 define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -37,12 +37,23 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
 }
 
 define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_8b_undefA
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st2_8b_undefA(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -51,12 +62,23 @@ define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
 }
 
 define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_8b_undefB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st2_8b_undefB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -65,11 +87,22 @@ define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
 }
 
 define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_8b_undefAB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st2_8b_undefAB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -78,24 +111,21 @@ define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_
 }
 
 define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -108,13 +138,24 @@ define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sani
 }
 
 define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefA
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefA(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -123,13 +164,24 @@ define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 }
 
 define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -138,18 +190,24 @@ define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 }
 
 define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -158,12 +216,23 @@ define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 }
 
 define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefAB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefAB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -172,12 +241,23 @@ define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 }
 
 define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefAC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefAC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -186,12 +266,23 @@ define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 }
 
 define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefBC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefBC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -200,11 +291,22 @@ define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 }
 
 define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8b_undefABC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st3_8b_undefABC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -213,32 +315,26 @@ define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) noun
 }
 
 define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -247,14 +343,25 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
 }
 
 define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefA
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefA(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -263,14 +370,25 @@ define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 }
 
 define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -279,19 +397,25 @@ define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 }
 
 define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -300,22 +424,25 @@ define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 }
 
 define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -324,13 +451,24 @@ define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 }
 
 define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefAB
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefAB(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -339,13 +477,24 @@ define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefAC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefAC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -354,13 +503,24 @@ define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefBC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefBC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -369,13 +529,24 @@ define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefBD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefBD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -384,12 +555,23 @@ define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefABC
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefABC(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -398,12 +580,23 @@ define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefABD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefABD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -412,12 +605,23 @@ define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefACD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefACD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -426,12 +630,23 @@ define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefBCD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefBCD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -440,11 +655,22 @@ define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 }
 
 define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8b_undefABCD
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st4_8b_undefABCD(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -459,24 +685,24 @@ declare void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, ptr) n
 declare void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -485,24 +711,21 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
 }
 
 define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -515,32 +738,26 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
 }
 
 define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -555,24 +772,24 @@ declare void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, pt
 declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_4h
-; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -581,24 +798,21 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_4h
-; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -611,32 +825,26 @@ define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind s
 }
 
 define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_4h
-; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP4]] to i64
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -651,24 +859,24 @@ declare void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, pt
 declare void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_8h
-; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -677,24 +885,21 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_8h
-; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -707,32 +912,26 @@ define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind s
 }
 
 define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_8h
-; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -747,24 +946,24 @@ declare void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, pt
 declare void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_2s
-; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -773,24 +972,21 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2s
-; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -803,32 +999,26 @@ define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind s
 }
 
 define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2s
-; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -841,24 +1031,24 @@ declare void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, pt
 declare void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_4s
-; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -867,24 +1057,21 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_4s
-; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -897,32 +1084,26 @@ define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind s
 }
 
 define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_4s
-; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -938,24 +1119,24 @@ declare void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4
 
 ; If there's only one element, st2/3/4 don't make much sense, stick to st1.
 define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_1d
-; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -964,24 +1145,21 @@ define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_1d
-; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -994,32 +1172,26 @@ define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind s
 }
 
 define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_1d
-; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1034,24 +1206,24 @@ declare void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, pt
 declare void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly
 
 define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_2d
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
+;
+;
+; CHECK-LABEL: define void @st2_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1060,12 +1232,23 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory
 }
 
 define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_2d_undefA
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st2_2d_undefA(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1074,12 +1257,23 @@ define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
 }
 
 define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_2d_undefB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st2_2d_undefB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1088,11 +1282,22 @@ define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
 }
 
 define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st2_2d_undefAB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st2_2d_undefAB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1101,24 +1306,21 @@ define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitiz
 }
 
 define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+;
+;
+; CHECK-LABEL: define void @st3_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
@@ -1131,13 +1333,24 @@ define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind s
 }
 
 define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefA
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefA(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1146,13 +1359,24 @@ define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 }
 
 define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1161,18 +1385,24 @@ define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 }
 
 define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1181,12 +1411,23 @@ define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 }
 
 define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefAB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefAB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1195,12 +1436,23 @@ define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 }
 
 define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefAC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefAC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1209,12 +1461,23 @@ define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 }
 
 define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefBC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefBC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1223,11 +1486,22 @@ define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 }
 
 define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st3_2d_undefABC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st3_2d_undefABC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1236,32 +1510,26 @@ define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) n
 }
 
 define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
+;
+;
+; CHECK-LABEL: define void @st4_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1274,14 +1542,25 @@ declare void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, pt
 declare void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly
 
 define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefA
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefA(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1290,14 +1569,25 @@ define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 }
 
 define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1306,19 +1596,25 @@ define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 }
 
 define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1327,22 +1623,25 @@ define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 }
 
 define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1351,13 +1650,24 @@ define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 }
 
 define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefAB
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefAB(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1366,13 +1676,24 @@ define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefAC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefAC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1381,13 +1702,24 @@ define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefAD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefAD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1396,13 +1728,24 @@ define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefBC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefBC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1411,13 +1754,24 @@ define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefBD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefBD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1426,18 +1780,24 @@ define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefCD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefCD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1446,12 +1806,23 @@ define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefABC
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefABC(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1460,12 +1831,23 @@ define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefABD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefABD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1474,12 +1856,23 @@ define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefACD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefACD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1488,12 +1881,23 @@ define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefBCD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefBCD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1502,14 +1906,28 @@ define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 }
 
 define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
-; CHECK-LABEL: define void @st4_2d_undefABCD
-; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+;
+;
+; CHECK-LABEL: define void @st4_2d_undefABCD(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
   ret void
 }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
new file mode 100644
index 0000000000000..8fed5a78d6b79
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
@@ -0,0 +1,1388 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+;
+; Test memory sanitizer instrumentation for Arm NEON VST_{2,3,4} and
+; VST_1x{2,3,4} instructions, including floating-point parameters.
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Generated with:
+;     grep call clang/test/CodeGen/aarch64-neon-intrinsics.c \
+;         |  grep 'neon[.]st'                                \
+;         | sed -r 's/^\/\/ CHECK[:][ ]*//'                  \
+;         | cut -d ' ' -f 1 --complement                     \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%A/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%B/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%C/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%D/'             \
+;         | sort                                             \
+;         | uniq                                             \
+;         | while read x;                                    \
+;             do                                             \
+;                 y=`echo "$x"                               \
+;                     | sed -r 's/@llvm[.]aarch64[.]neon[.]/@/' \
+;                     | sed -r 's/[.]p0//'                      \
+;                     | tr '.' '_'`;                            \
+;                 echo "define $y sanitize_memory {"; \
+;                 echo "  call $x";                   \
+;                 echo "  ret void";                  \
+;                 echo "}";                           \
+;                 echo;                               \
+;             done
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x2_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %p)
+  ret void
+}
+
+define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x2_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %p)
+  ret void
+}
+
+define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x2_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %p)
+  ret void
+}
+
+define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x2_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
+  ret void
+}
+
+define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x3_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p)
+  ret void
+}
+
+define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x3_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p)
+  ret void
+}
+
+define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x3_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p)
+  ret void
+}
+
+define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x3_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p)
+  ret void
+}
+
+define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x4_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p)
+  ret void
+}
+
+define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x4_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p)
+  ret void
+}
+
+define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x4_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p)
+  ret void
+}
+
+define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st1x4_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p)
+  ret void
+}
+
+define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v16i8(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v2f32(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> %A, <2 x float> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v2i32(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v4f16(
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> %A, <4 x half> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %A, <4 x float> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v4i16(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v8f16(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> %A, <8 x half> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %p)
+  ret void
+}
+
+define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st2_v8i8(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %p)
+  ret void
+}
+
+define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v16i8(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v2f32(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v2i32(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v4f16(
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v4i16(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v8f16(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p)
+  ret void
+}
+
+define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st3_v8i8(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p)
+  ret void
+}
+
+define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v16i8(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v1f64(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v1i64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v2f32(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], <2 x float> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v2i32(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v2i64(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v4f16(
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], <4 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], <4 x half> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], <4 x float> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v4i16(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v8f16(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], <8 x half> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p)
+  ret void
+}
+
+define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p) sanitize_memory {
+; CHECK-LABEL: define void @st4_v8i8(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p)
+  ret void
+}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll
new file mode 100644
index 0000000000000..b7a2721e80e2b
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll
@@ -0,0 +1,1977 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+;
+; Test memory sanitizer instrumentation for Arm store with lane instructions.
+; Note: st{2,3,4}lane uses Arm NEON but st1lane does not.
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define void @st1lane_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 1
+  %tmp = extractelement <16 x i8> %A, i32 1
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 1
+  %tmp = extractelement <16 x i8> %A, i32 0
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 -1
+  %tmp = extractelement <16 x i8> %A, i32 0
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 %offset
+  %tmp = extractelement <16 x i8> %A, i32 1
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 %offset
+  %tmp = extractelement <16 x i8> %A, i32 0
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 1
+  %tmp = extractelement <8 x i16> %A, i32 1
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 1
+  %tmp = extractelement <8 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 -1
+  %tmp = extractelement <8 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 %offset
+  %tmp = extractelement <8 x i16> %A, i32 1
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 %offset
+  %tmp = extractelement <8 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 1
+  %tmp = extractelement <4 x i32> %A, i32 1
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 1
+  %tmp = extractelement <4 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 -1
+  %tmp = extractelement <4 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 %offset
+  %tmp = extractelement <4 x i32> %A, i32 1
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 %offset
+  %tmp = extractelement <4 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 1
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 1
+  %tmp = extractelement <4 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 -1
+  %tmp = extractelement <4 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 %offset
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 %offset
+  %tmp = extractelement <4 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 1
+  %tmp = extractelement <2 x i64> %A, i32 1
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 1
+  %tmp = extractelement <2 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 -1
+  %tmp = extractelement <2 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 %offset
+  %tmp = extractelement <2 x i64> %A, i32 1
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 %offset
+  %tmp = extractelement <2 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 1
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 1
+  %tmp = extractelement <2 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 -1
+  %tmp = extractelement <2 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 %offset
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 %offset
+  %tmp = extractelement <2 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 1
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 %offset
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i8, ptr %D, i64 %offset
+  %tmp = extractelement <8 x i8> %A, i32 0
+  store i8 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 1
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 1
+  %tmp = extractelement <4 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 -1
+  %tmp = extractelement <4 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 %offset
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i16, ptr %D, i64 %offset
+  %tmp = extractelement <4 x i16> %A, i32 0
+  store i16 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 1
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 1
+  %tmp = extractelement <2 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 -1
+  %tmp = extractelement <2 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 %offset
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, ptr %D, i64 %offset
+  %tmp = extractelement <2 x i32> %A, i32 0
+  store i32 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 1
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 1
+  %tmp = extractelement <2 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 -1
+  %tmp = extractelement <2 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 %offset
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr float, ptr %D, i64 %offset
+  %tmp = extractelement <2 x float> %A, i32 0
+  store float %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_1d(<1 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 1
+  %tmp = extractelement <1 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_1d(<1 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 -1
+  %tmp = extractelement <1 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_1d(<1 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i64, ptr %D, i64 %offset
+  %tmp = extractelement <1 x i64> %A, i32 0
+  store i64 %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_1d_double(<1 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 1
+  %tmp = extractelement <1 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0u_1d_double(<1 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 -1
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 -1
+  %tmp = extractelement <1 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st1lane0_ro_1d_double(<1 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr double, ptr %D, i64 %offset
+  %tmp = extractelement <1 x double> %A, i32 0
+  store double %tmp, ptr %ptr
+  ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, i64 1, ptr %D)
+  ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, i64 1, ptr %D)
+  ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, i64 1, ptr %D)
+  ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, i64 1, ptr %D)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, ptr %D)
+  ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, ptr %D)
+  ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, ptr %D)
+  ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, ptr %D)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, ptr %E)
+  ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, ptr %E)
+  ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, ptr %E)
+  ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, ptr %E)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll
new file mode 100644
index 0000000000000..52283811e3065
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll
@@ -0,0 +1,206 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; Test memory sanitizer instrumentation for Arm NEON VST instructions, with
+; origin tracking. These tests are deliberately shorter than neon_vst.ll, due
+; to the verbosity of the output.
+;
+; RUN: opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memory {
+;
+;
+; CHECK-LABEL: define void @st2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[TMP8]], 35184372088832
+; CHECK-NEXT:    [[TMP11:%.*]] = and i64 [[TMP10]], -4
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], ptr [[TMP9]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 1
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP12]], i32 2
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP12]], i32 3
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP12]], i32 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP12]], i32 5
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP12]], i32 6
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP12]], i32 7
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P)
+  ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind sanitize_memory {
+;
+;
+; CHECK-LABEL: define void @st3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+; CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP10]], 35184372088832
+; CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], -4
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], ptr [[TMP11]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i128 [[TMP18]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP8]], i32 [[TMP17]]
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP14]], i32 1
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP14]], i32 2
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP14]], i32 3
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP14]], i32 4
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP14]], i32 5
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP14]], i32 6
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i32, ptr [[TMP14]], i32 7
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP14]], i32 8
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP28]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP14]], i32 9
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[TMP14]], i32 10
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP14]], i32 11
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP31]], align 4
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP32:%.*]], label [[TMP33:%.*]], !prof [[PROF0]]
+; CHECK:       32:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       33:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) nounwind sanitize_memory {
+;
+;
+; CHECK-LABEL: define void @st4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], 35184372088832
+; CHECK-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], -4
+; CHECK-NEXT:    [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]], ptr [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP8]], i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne i128 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP10]], i32 [[TMP22]]
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP16]], i32 1
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i32, ptr [[TMP16]], i32 2
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP16]], i32 3
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP28]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP16]], i32 4
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[TMP16]], i32 5
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP16]], i32 6
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP31]], align 4
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[TMP16]], i32 7
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[TMP16]], i32 8
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP33]], align 4
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[TMP16]], i32 9
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr [[TMP16]], i32 10
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP35]], align 4
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr [[TMP16]], i32 11
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP36]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i32, ptr [[TMP16]], i32 12
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP37]], align 4
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i32, ptr [[TMP16]], i32 13
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP38]], align 4
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr i32, ptr [[TMP16]], i32 14
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i32, ptr [[TMP16]], i32 15
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP40]], align 4
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK:       41:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       42:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 39b2b6225d8b1..1d2e38eb5e63d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test1(
@@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -35,16 +35,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test88(
@@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -75,16 +75,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test87(
@@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -115,16 +115,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test86(
@@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -155,16 +155,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test85(
@@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -195,16 +195,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test84(
@@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -235,16 +235,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test83(
@@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -275,16 +275,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test82(
@@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -315,16 +315,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test81(
@@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -355,16 +355,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test80(
@@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -395,16 +395,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test79(
@@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -435,16 +435,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test78(
@@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -475,16 +475,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test77(
@@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -515,16 +515,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test76(
@@ -537,23 +537,22 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -564,16 +563,16 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test75(
@@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -613,16 +611,16 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test74(
@@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -662,16 +659,16 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test73(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test73(
@@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -699,15 +694,15 @@ define i64 @test73(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test72(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test72(
@@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -735,9 +728,9 @@ define i64 @test72(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -769,15 +760,15 @@ define i64 @test72_2(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
 
 define i64 @test71(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test71(
@@ -787,25 +778,25 @@ define i64 @test71(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT:    [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to i64
   ret i64 %2
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test70(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test70(
@@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -833,9 +822,9 @@ define i64 @test70(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -867,15 +854,15 @@ define i64 @test70_2(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test69(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test69(
@@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -903,15 +888,15 @@ define i64 @test69(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
 
 define i64 @test68(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test68(
@@ -921,25 +906,25 @@ define i64 @test68(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT:    [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to i64
   ret i64 %2
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone
 
 define i64 @test67(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test67(
@@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -967,15 +950,15 @@ define i64 @test67(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <2 x i32>
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone
 
 define i64 @test66(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test66(
@@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1003,9 +984,9 @@ define i64 @test66(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
@@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1037,15 +1016,15 @@ define i64 @test66_2(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+  %2 = bitcast <1 x i64> %1 to <4 x i16>
   %3 = bitcast <4 x i16> %2 to <1 x i64>
   %4 = extractelement <1 x i64> %3, i32 0
   ret i64 %4
 }
 
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test65(
@@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1079,17 +1059,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test64(
@@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1123,17 +1104,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test63(
@@ -1144,32 +1125,35 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
 ; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test62(
@@ -1180,20 +1164,21 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1203,17 +1188,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test61(
@@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1247,17 +1233,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test60(
@@ -1268,32 +1254,35 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
 ; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test59(
@@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1327,17 +1317,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test58(
@@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1371,17 +1362,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test56(
@@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1412,16 +1403,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test55(
@@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1452,16 +1443,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test54(
@@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1492,16 +1483,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test53(
@@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1532,16 +1523,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test52(
@@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1572,10 +1563,10 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1610,16 +1601,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test50(
@@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1650,16 +1641,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test49(
@@ -1672,38 +1663,38 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP18]]
 ;
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test48(
@@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1734,16 +1725,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test47(
@@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1774,16 +1765,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test46(
@@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1814,16 +1805,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test45(
@@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1854,10 +1845,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -1872,29 +1863,32 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test43(
@@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1925,16 +1919,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test42(
@@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1965,16 +1959,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test41(
@@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2005,16 +1999,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test40(
@@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2045,16 +2039,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test39(
@@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2085,16 +2079,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test38(
@@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2125,16 +2119,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test37(
@@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2165,16 +2159,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test36(
@@ -2185,27 +2179,30 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test35(
@@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2236,16 +2233,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test34(
@@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2276,16 +2273,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test33(
@@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2316,16 +2313,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test32(
@@ -2338,30 +2335,33 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 48
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64
+; CHECK-NEXT:    store i64 [[TMP15]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test31(
@@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2392,16 +2392,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test30(
@@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2432,16 +2432,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test29(
@@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2472,16 +2472,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test28(
@@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2512,16 +2512,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test27(
@@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2552,16 +2552,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test26(
@@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2592,16 +2592,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind
+declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind
 
 define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
 ; CHECK-LABEL: define void @test25(
@@ -2612,26 +2612,29 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       3:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       4:
-; CHECK-NEXT:    tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK:       8:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind
+  %mmx_var.i = bitcast i64 %0 to <1 x i64>
+  tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind
   ret void
 }
 
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone
 
 define i32 @test24(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i32 @test24(
@@ -2641,26 +2644,27 @@ define i32 @test24(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
-  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+  %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64>
+  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind
   ret i32 %1
 }
 
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
 
 define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 {
 ; CHECK-LABEL: define void @test23(
@@ -2674,33 +2678,35 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
+; CHECK:       12:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = bitcast <1 x i64> %n to <8 x i8>
   %1 = bitcast <1 x i64> %d to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind
+  %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+  tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind
   ret void
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test22(
@@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2731,16 +2737,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
 
 define i64 @test21(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test21(
@@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
@@ -2767,9 +2774,9 @@ define i64 @test21(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
@@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
@@ -2800,15 +2808,15 @@ define i32 @test21_2(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <2 x i32>
   %5 = extractelement <2 x i32> %4, i32 0
   ret i32 %5
 }
 
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test20(
@@ -2821,27 +2829,28 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+  %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
 
 define <2 x double> @test19(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define <2 x double> @test19(
@@ -2851,26 +2860,27 @@ define <2 x double> @test19(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]]
 ; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone
   ret <2 x double> %2
 }
 
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
 
 define i64 @test18(<2 x double> %a) #0 {
 ; CHECK-LABEL: define i64 @test18(
@@ -2885,22 +2895,22 @@ define i64 @test18(<2 x double> %a) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       3:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast <1 x i64> %0 to <2 x i32>
   %2 = bitcast <2 x i32> %1 to <1 x i64>
   %3 = extractelement <1 x i64> %2, i32 0
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
 
 define i64 @test17(<2 x double> %a) #0 {
 ; CHECK-LABEL: define i64 @test17(
@@ -2915,22 +2925,22 @@ define i64 @test17(<2 x double> %a) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       3:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast <1 x i64> %0 to <2 x i32>
   %2 = bitcast <2 x i32> %1 to <1 x i64>
   %3 = extractelement <1 x i64> %2, i32 0
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
 
 define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test16(
@@ -2941,34 +2951,38 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
+  %mmx_var = bitcast i64 %0 to <1 x i64>
   %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
-  %3 = bitcast x86_mmx %2 to i64
+  %mmx_var1 = bitcast i64 %1 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16)
+  %3 = bitcast <1 x i64> %2 to i64
   ret i64 %3
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
 
 define i64 @test15(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test15(
@@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2992,15 +3006,15 @@ define i64 @test15(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <2 x i32>
   %4 = bitcast <2 x i32> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
 
 define i64 @test14(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test14(
@@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3024,15 +3038,15 @@ define i64 @test14(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <4 x i16>
   %4 = bitcast <4 x i16> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
 
 define i64 @test13(<1 x i64> %a) #0 {
 ; CHECK-LABEL: define i64 @test13(
@@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3056,15 +3070,15 @@ define i64 @test13(<1 x i64> %a) #0 {
 ;
 entry:
   %0 = bitcast <1 x i64> %a to <8 x i8>
-  %1 = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to <1 x i64>
+  %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone
+  %3 = bitcast <1 x i64> %2 to <8 x i8>
   %4 = bitcast <8 x i8> %3 to <1 x i64>
   %5 = extractelement <1 x i64> %4, i32 0
   ret i64 %5
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test12(
@@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3095,16 +3109,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test11(
@@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3135,16 +3149,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test10(
@@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3175,16 +3189,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test9(
@@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3215,16 +3229,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test8(
@@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3255,16 +3269,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test7(
@@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -3299,16 +3313,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
   %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to <1 x i64>
+  %3 = bitcast <8 x i8> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <8 x i8>
   %6 = bitcast <8 x i8> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test6(
@@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3339,16 +3353,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test5(
@@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3379,16 +3393,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test4(
@@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3419,16 +3433,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test3(
@@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3459,16 +3473,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
   %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to <1 x i64>
+  %3 = bitcast <4 x i16> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <4 x i16>
   %6 = bitcast <4 x i16> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
 
 define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test2(
@@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3499,43 +3513,44 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
   %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = bitcast <2 x i32> %0 to <1 x i64>
+  %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+  %5 = bitcast <1 x i64> %4 to <2 x i32>
   %6 = bitcast <2 x i32> %5 to <1 x i64>
   %7 = extractelement <1 x i64> %6, i32 0
   ret i64 %7
 }
 
-define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
+define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 {
 ; ALL-LABEL: test89:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    cvtpi2ps %mm0, %xmm0
 ; ALL-NEXT:    ret{{[l|q]}}
 ; CHECK-LABEL: define <4 x float> @test89(
-; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]])
+; CHECK:       6:
+; CHECK-NEXT:    [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]])
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[C]]
 ;
-  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b)
   ret <4 x float> %c
 }
 
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
 
 define void @test90() #0 {
 ; ALL-LABEL: test90:
@@ -3562,28 +3577,24 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64>
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[A_COERCE]], i32 [[D]], i32 2)
 ; CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <1 x i64> [[TMP2]]
+; CHECK-NEXT:    ret <1 x i64> [[TMP9]]
 ;
 entry:
-  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2)
-  %2 = bitcast x86_mmx %1 to <1 x i64>
-  ret <1 x i64> %2
+  %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2)
+  ret <1 x i64> %1
 }
 
-declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg)
+declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg)
 
 define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
 ; CHECK-LABEL: define i32 @test_mm_extract_pi16(
@@ -3592,24 +3603,22 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
-; CHECK:       3:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
+; CHECK:       2:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       4:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2)
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[A_COERCE]], i32 2)
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
-  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
-  %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2)
+  %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2)
   ret i32 %1
 }
 
-declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg)
+declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg)
 
 attributes #0 = { sanitize_memory }
 ;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 57d6003b3873f..fe5cf9dcc65b2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -6,9 +6,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 
 declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory {
 entry:
@@ -24,19 +24,19 @@ entry:
 ; CHECK: ret <4 x i32>
 
 
-define x86_mmx @Test_ssse3_pmadd_ub_sw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 entry:
-  %c = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind
-  ret x86_mmx %c
+  %c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind
+  ret <1 x i64> %c
 }
 
 ; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
-; CHECK: or i64
-; CHECK: bitcast i64 {{.*}} to <4 x i16>
+; CHECK: or <1 x i64>
+; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK: bitcast <4 x i16> {{.*}} to i64
-; CHECK: ret x86_mmx
+; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK: ret <1 x i64>
 
 
 define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
@@ -53,15 +53,15 @@ define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_mem
 ; CHECK: ret <2 x i64>
 
 
-define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 entry:
-  %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind
-  ret x86_mmx %c
+  %c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind
+  ret <1 x i64> %c
 }
 
 ; CHECK-LABEL: @Test_x86_mmx_psad_bw(
-; CHECK: or i64
+; CHECK: or <1 x i64>
 ; CHECK: icmp ne i64
 ; CHECK: sext i1 {{.*}} to i64
 ; CHECK: lshr i64 {{.*}}, 48
-; CHECK: ret x86_mmx
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
index 52acbfe0a0e77..e9202700b1df7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 
 ; Single argument vector conversion.
@@ -27,12 +27,12 @@ entry:
 ; CHECK: store i32 0, {{.*}} @__msan_retval_tls
 ; CHECK: ret i32
 
-; x86_mmx packed vector conversion.
+; <1 x i64> packed vector conversion.
 
-define x86_mmx @test_cvtps2pi(<4 x float> %value) sanitize_memory {
+define <1 x i64> @test_cvtps2pi(<4 x float> %value) sanitize_memory {
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %value)
-  ret x86_mmx %0
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %value)
+  ret <1 x i64> %0
 }
 
 ; CHECK-LABEL: @test_cvtps2pi
@@ -42,9 +42,9 @@ entry:
 ; CHECK: icmp ne {{.*}}[[S]], 0
 ; CHECK: br
 ; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
-; CHECK: store i64 0, {{.*}} @__msan_retval_tls
-; CHECK: ret x86_mmx
+; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi
+; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls
+; CHECK: ret <1 x i64>
 
 ; avx512 rounding conversion.
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 4f08ea7c00afe..13f7a1612de94 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 
 define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
 entry:
@@ -41,22 +41,21 @@ entry:
 ; CHECK: ret <32 x i8>
 
 
-define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_mmx_packuswb(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 entry:
-  %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
-  ret x86_mmx %c
+  %c = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind
+  ret <1 x i64> %c
 }
 
 ; CHECK-LABEL: @Test_mmx_packuswb(
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
-; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
-; CHECK: ret x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 196285d910a6d..441dd8f64e284 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -7,7 +7,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>)
 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>)
 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>)
 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>)
@@ -19,10 +19,10 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32)
 define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) sanitize_memory {
 entry:
   %0 = bitcast i64 %x.coerce to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = bitcast i64 %y.coerce to x86_mmx
-  %3 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %1, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = bitcast i64 %y.coerce to <1 x i64>
+  %3 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %1, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to <2 x i32>
   %5 = bitcast <2 x i32> %4 to <1 x i64>
   %6 = extractelement <1 x i64> %5, i32 0
   ret i64 %6
@@ -30,11 +30,11 @@ entry:
 
 ; CHECK-LABEL: @test_mmx
 ; CHECK: = icmp ne i64 {{.*}}, 0
-; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64
-; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(
-; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64
-; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]]
-; CHECK: call x86_mmx @llvm.x86.mmx.psll.d(
+; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64
+; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64>
+; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(
+; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]]
+; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d(
 ; CHECK: ret i64
 
 
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 8110dd485d369..5da68320d91f9 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -737,6 +737,29 @@ entry:
   ret void
 }
 
+define void @freeze_vector_insert(<2 x float> %vec, i32 %idx, float %scalar) sanitize_numerical_stability {
+; CHECK-LABEL: @freeze_vector_insert(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @freeze_vector_insert to i64)
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr @__nsan_shadow_args_ptr, align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[VEC:%.*]] to <2 x double>
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr getelementptr ([16384 x i8], ptr @__nsan_shadow_args_ptr, i64 0, i64 16), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fpext float [[SCALAR:%.*]] to double
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP1]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT:    store i64 0, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[VEC]], float [[SCALAR]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP7]], i32 [[IDX]]
+; CHECK-NEXT:    [[FROZEN:%.*]] = freeze <2 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze <2 x double> [[TMP9]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %1 = insertelement <2 x float> %vec, float %scalar, i32 %idx
+  %frozen = freeze <2 x float> %1
+  ret void
+}
 
 define void @vector_shuffle(<2 x float> %0) sanitize_numerical_stability {
 ; CHECK-LABEL: @vector_shuffle(
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll
index 3fe78d8b19b0a..31f32d7bd8df5 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll
@@ -7,15 +7,24 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)
 
-define void @call_memcpy_intrinsic(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b) sanitize_numerical_stability {
-; CHECK-LABEL: @call_memcpy_intrinsic(
+define void @call_memcpy_intrinsics(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b) sanitize_numerical_stability {
+; CHECK-LABEL: @call_memcpy_intrinsics(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @__nsan_copy_values(ptr [[A:%.*]], ptr [[B:%.*]], i64 16)
+; CHECK-NEXT:    call void @__nsan_copy_4(ptr [[A:%.*]], ptr [[B:%.*]])
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], ptr nonnull align 8 dereferenceable(16) [[B]], i64 4, i1 false)
+; CHECK-NEXT:    call void @__nsan_copy_8(ptr [[A:%.*]], ptr [[B:%.*]])
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], ptr nonnull align 8 dereferenceable(16) [[B]], i64 8, i1 false)
+; CHECK-NEXT:    call void @__nsan_copy_16(ptr [[A:%.*]], ptr [[B:%.*]])
 ; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], ptr nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+; CHECK-NEXT:    call void @__nsan_copy_values(ptr [[A:%.*]], ptr [[B:%.*]], i64 15)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], ptr nonnull align 8 dereferenceable(16) [[B]], i64 15, i1 false)
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr nonnull align 8 dereferenceable(16) %a, ptr nonnull align 8 dereferenceable(16) %b, i64 4, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr nonnull align 8 dereferenceable(16) %a, ptr nonnull align 8 dereferenceable(16) %b, i64 8, i1 false)
   tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr nonnull align 8 dereferenceable(16) %a, ptr nonnull align 8 dereferenceable(16) %b, i64 16, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr nonnull align 8 dereferenceable(16) %a, ptr nonnull align 8 dereferenceable(16) %b, i64 15, i1 false)
   ret void
 }
 
@@ -32,6 +41,26 @@ entry:
   ret void
 }
 
+define void @call_memset_intrinsics(i8* nonnull align 8 dereferenceable(16) %a) sanitize_numerical_stability {
+; CHECK-LABEL: @call_memset_intrinsics(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @__nsan_set_value_unknown_4(ptr [[A:%.*]])
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], i8 0, i64 4, i1 false)
+; CHECK-NEXT:    call void @__nsan_set_value_unknown_8(ptr [[A:%.*]])
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    call void @__nsan_set_value_unknown_16(ptr [[A:%.*]])
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    call void @__nsan_set_value_unknown(ptr [[A:%.*]], i64 15)
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) [[A]], i8 0, i64 15, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) %a, i8 0, i64 4, i1 false)
+  tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) %a, i8 0, i64 8, i1 false)
+  tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) %a, i8 0, i64 16, i1 false)
+  tail call void @llvm.memset.p0.i64(ptr nonnull align 8 dereferenceable(16) %a, i8 0, i64 15, i1 false)
+  ret void
+}
 
 define void @transfer_float(float* %dst, float* %src) sanitize_numerical_stability {
 ; CHECK-LABEL: @transfer_float(
diff --git a/llvm/test/LTO/X86/print-pipeline-passes.ll b/llvm/test/LTO/X86/print-pipeline-passes.ll
new file mode 100644
index 0000000000000..b24e3869c0a63
--- /dev/null
+++ b/llvm/test/LTO/X86/print-pipeline-passes.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s >%t.bc
+; RUN: llvm-lto -print-pipeline-passes -exported-symbol=_f -o /dev/null %t.bc 2>&1 | FileCheck %s
+
+; CHECK: pipeline-passes: verify,{{.*}},verify
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+
+define void @f() {
+entry:
+  ret void
+}
diff --git a/llvm/test/MC/AArch64/CheckDataSymbol.s b/llvm/test/MC/AArch64/CheckDataSymbol.s
index baa1158984386..13d1db7da091b 100644
--- a/llvm/test/MC/AArch64/CheckDataSymbol.s
+++ b/llvm/test/MC/AArch64/CheckDataSymbol.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -filetype=obj -assemble \
 # RUN: -triple=aarch64- %s -o - \
 # RUN: | llvm-readobj -S --symbols - | FileCheck %s
-# CHECK:     Name: $d.1 ({{[1-9][0-9]+}})
+# CHECK:     Name: $d ({{[1-9][0-9]+}})
 # CHECK-NEXT:     Value: 0x4
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Local (0x0)
diff --git a/llvm/test/MC/AArch64/align-fill-byte-zero.s b/llvm/test/MC/AArch64/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..eed8750845568
--- /dev/null
+++ b/llvm/test/MC/AArch64/align-fill-byte-zero.s
@@ -0,0 +1,22 @@
+// RUN: llvm-mc -triple aarch64 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple aarch64 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM: add     x14, x14, #1
+// OBJ: 910005ce      add     x14, x14, #0x1
+  add x14, x14, 0x1
+
+// ASM: .p2align 4, 0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+  .balign 0x10, 0
+
+// ASM: add     x14, x14, #1
+// OBJ-NEXT: 910005ce      add     x14, x14, #0x1
+  add x14, x14, 0x1
diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
index 3bd8f5c19932e..057b298a0a0df 100644
--- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
+++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
@@ -3,16 +3,16 @@
 // RUN: llvm-mc -triple=aarch64 -filetype=obj %s | \
 // RUN:   llvm-readelf -S -r -x .test - | FileCheck %s --check-prefix=RELOC
 
-// RELOC: Relocation section '.rela.test' at offset 0x230 contains 8 entries:
+// RELOC: Relocation section '.rela.test' at offset {{.*}} contains 8 entries:
 // RELOC-NEXT:  Offset Info Type Symbol's Value Symbol's Name + Addend
-// RELOC-NEXT: 0000000000000000 0000000100000244 R_AARCH64_AUTH_ABS64 0000000000000000 .helper + 0
-// RELOC-NEXT: 0000000000000010 0000000800000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g1 + 0
-// RELOC-NEXT: 0000000000000020 0000000900000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g2 + 0
-// RELOC-NEXT: 0000000000000030 0000000a00000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g3 + 0
-// RELOC-NEXT: 0000000000000040 0000000b00000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g4 + 7
-// RELOC-NEXT: 0000000000000050 0000000c00000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g5 - 3
-// RELOC-NEXT: 0000000000000060 0000000200000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g 6 + 0
-// RELOC-NEXT: 0000000000000070 0000000d00000244 R_AARCH64_AUTH_ABS64 0000000000000000 _g 7 + 7
+// RELOC-NEXT: 0000000000000000 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 .helper + 0
+// RELOC-NEXT: 0000000000000010 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g1 + 0
+// RELOC-NEXT: 0000000000000020 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g2 + 0
+// RELOC-NEXT: 0000000000000030 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g3 + 0
+// RELOC-NEXT: 0000000000000040 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g4 + 7
+// RELOC-NEXT: 0000000000000050 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g5 - 3
+// RELOC-NEXT: 0000000000000060 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g 6 + 0
+// RELOC-NEXT: 0000000000000070 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g 7 + 7
 
 // RELOC: Hex dump of section '.test':
 //                VVVVVVVV addend, not needed for rela
diff --git a/llvm/test/MC/AArch64/local-bounds-single-trap.ll b/llvm/test/MC/AArch64/local-bounds-single-trap.ll
index 53a0e010537f0..6a017e24ab3cd 100644
--- a/llvm/test/MC/AArch64/local-bounds-single-trap.ll
+++ b/llvm/test/MC/AArch64/local-bounds-single-trap.ll
@@ -21,11 +21,11 @@ entry:
 ; CHECK-ASM: 	add	x10, x10, :lo12:B
 ; CHECK-ASM: 	strb	wzr, [x10, x8]
 ; CHECK-ASM: 	cmp	x9, #10
-; CHECK-ASM: 	b.hi	.LBB0_5
+; CHECK-ASM: 	b.hi	.LBB0_6
 ; CHECK-ASM: // %bb.3:
 ; CHECK-ASM: 	mov	w8, #10                         // =0xa
 ; CHECK-ASM: 	sub	x8, x8, x9
-; CHECK-ASM: 	cbz	x8, .LBB0_5
+; CHECK-ASM: 	cbz	x8, .LBB0_6
 ; CHECK-ASM: // %bb.4:
 ; CHECK-ASM: 	adrp	x8, B2
 ; CHECK-ASM: 	add	x8, x8, :lo12:B2
@@ -33,9 +33,11 @@ entry:
 ; CHECK-ASM: 	add	sp, sp, #16
 ; CHECK-ASM: 	.cfi_def_cfa_offset 0
 ; CHECK-ASM: 	ret
-; CHECK-ASM: .LBB0_5:                                // %trap3
-; CHECK-ASM: 	.cfi_restore_state
-; CHECK-ASM: 	brk	#0x1
+; CHECK-ASM: .LBB0_5:                                // %trap
+; CHECK-ASM: .cfi_restore_state
+; CHECK-ASM: brk     #0x1
+; CHECK-ASM: .LBB0_6:                                // %trap3
+; CHECK-ASM: brk     #0x1
   %i.addr = alloca i32, align 4
   %k.addr = alloca i32, align 4
   store i32 %i, ptr %i.addr, align 4
diff --git a/llvm/test/MC/AArch64/mapping-across-sections.s b/llvm/test/MC/AArch64/mapping-across-sections.s
index 053deb760dcfd..f453c86d45fb6 100644
--- a/llvm/test/MC/AArch64/mapping-across-sections.s
+++ b/llvm/test/MC/AArch64/mapping-across-sections.s
@@ -1,28 +1,40 @@
-// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
-
-        .text
-        add w0, w0, w0
-
-// .wibble should *not* inherit .text's mapping symbol. It's a completely different section.
-        .section .wibble
-        add w0, w0, w0
-
-// A setion should be able to start with a $d
-        .section .starts_data
-        .word 42
-
-// Changing back to .text should not emit a redundant $x
-        .text
-        add w0, w0, w0
-
-// With all those constraints, we want:
-//   + .text to have $x at 0 and no others
-//   + .wibble to have $x at 0
-//   + .starts_data to have $d at 0
-
-
-// CHECK:      0000000000000000 l .text        0000000000000000 $x
-// CHECK-NEXT: 0000000000000000 l .wibble      0000000000000000 $x
-// CHECK-NEXT: 0000000000000000 l .starts_data 0000000000000000 $d
-// CHECK-NOT: ${{[adtx]}}
-
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-objdump -t - | FileCheck %s --match-full-lines
+
+.section .text1,"ax"
+add w0, w0, w0
+
+.text
+add w0, w0, w0
+.word 42
+
+.pushsection .data,"aw"
+.word 42
+.popsection
+
+.text
+add w1, w1, w1
+
+.section .text1,"ax"
+add w1, w1, w1
+
+.text
+.word 42
+
+.section .rodata,"a"
+.word 42
+add w0, w0, w0
+
+.ident "clang"
+.section ".note.GNU-stack","",@progbits
+
+// CHECK:      SYMBOL TABLE:
+// CHECK-NEXT: 0000000000000000 l       .text1 0000000000000000 $x
+// CHECK-NEXT: 0000000000000000 l       .text  0000000000000000 $x
+// CHECK-NEXT: 0000000000000004 l       .text  0000000000000000 $d
+// CHECK-NEXT: 0000000000000000 l       .data  0000000000000000 $d
+// CHECK-NEXT: 0000000000000008 l       .text  0000000000000000 $x
+// CHECK-NEXT: 000000000000000c l       .text  0000000000000000 $d
+// CHECK-NEXT: 0000000000000000 l       .rodata        0000000000000000 $d
+// CHECK-NEXT: 0000000000000004 l       .rodata        0000000000000000 $x
+// CHECK-NEXT: 0000000000000000 l       .comment       0000000000000000 $d
+// CHECK-NOT:  {{.}}
diff --git a/llvm/test/MC/AArch64/mapping-within-section.s b/llvm/test/MC/AArch64/mapping-within-section.s
index 07a2d3cc0fed9..c84e3a4d2fe64 100644
--- a/llvm/test/MC/AArch64/mapping-within-section.s
+++ b/llvm/test/MC/AArch64/mapping-within-section.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s | llvm-nm --special-syms - | FileCheck %s
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s | llvm-nm --no-sort --special-syms - | FileCheck %s --match-full-lines
 
     .text
 // $x at 0x0000
@@ -18,6 +18,13 @@
 // $x at 0x0018
     add x0, x0, x0
 
-// CHECK:      0000000000000004 t $d.1
-// CHECK-NEXT: 0000000000000000 t $x.0
-// CHECK-NEXT: 0000000000000064 t $x.2
+.globl $d
+$d:
+$x:
+
+// CHECK:      0000000000000000 t $x
+// CHECK-NEXT: 0000000000000004 t $d
+// CHECK-NEXT: 0000000000000064 t $x
+// CHECK-NEXT: 0000000000000068 t $x
+// CHECK-NEXT: 0000000000000068 T $d
+// CHECK-NOT:  {{.}}
diff --git a/llvm/test/MC/AArch64/size-directive.s b/llvm/test/MC/AArch64/size-directive.s
index 0b19cda4eaa1e..d9bdf970cf68b 100644
--- a/llvm/test/MC/AArch64/size-directive.s
+++ b/llvm/test/MC/AArch64/size-directive.s
@@ -32,7 +32,7 @@ aarch64_size:
 // CHECK-OBJ-NEXT: )
 
 // SYMS:      Type   Bind   Vis     Ndx Name
-// SYMS:      NOTYPE LOCAL  DEFAULT   3 $d.0
+// SYMS:      NOTYPE LOCAL  DEFAULT   3 $d{{$}}
 // SYMS-NEXT: FUNC   GLOBAL DEFAULT   3 aarch64_size
 // SYMS-NEXT: NOTYPE GLOBAL DEFAULT UND half_word
 // SYMS-NEXT: NOTYPE GLOBAL DEFAULT UND full_word
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index f4ab5fe5b14a9..51498d3c86d7f 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -207,3 +207,7 @@ image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] noa16
 
 image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12:15] noa16
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: image address size does not match a16
+
+// missing dim
+image_msaa_load v[1:4], v[5:7], s[8:15] dmask:0xf glc
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
index 8deb16ebeb204..bd61ad3908d21 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
@@ -1,8 +1,331 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
 
-// TODO: more helpful error message for missing dim operand
+image_atomic_add v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_and v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_cmpswap v[5:6], v1, s[8:15] dmask:0x3 unorm glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_dec v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_fcmpswap v[1:2], v2, s[12:19] dmask:0x3 unorm glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_fmax v4, v32, s[96:103] dmask:0x1 glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_fmin v4, v32, s[96:103] dmask:0x1 glc
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_inc v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_or v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_smax v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_smin v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_sub v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_swap v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_umax v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_umin v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_xor v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4 v[5:8], v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_cl v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_l_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_cl_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4h v[254:255], v[254:255], ttmp[8:15], ttmp[12:15] dmask:0x4 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_l v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_l_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz v[5:8], v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz_o v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_o v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_resinfo v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
 image_load v[0:3], v0, s[0:7] dmask:0xf unorm
-// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip v[5:6], v1, s[8:15] dmask:0x3 a16
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck v[5:6], v1, s[8:15] dmask:0x1 tfe
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck_sgn v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck_sgn v5, v[1:2], s[8:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_cl_o_g16 v[5:6], v[1:6], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cd_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o_g16 v[5:6], v[1:6], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_cl_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cd_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz_o v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_o v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store v1, v2, s[12:19] dmask:0x0 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip v1, v[2:3], s[12:19] dmask:0x0 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
 
 image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D da
 // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_err_pos.s b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
index c2679db3b2acf..4db454f0ced5a 100644
--- a/llvm/test/MC/AMDGPU/gfx10_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+WavefrontSize32,-WavefrontSize64 %s 2>&1 | FileCheck %s --implicit-check-not=error: --strict-whitespace
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck %s --implicit-check-not=error: --strict-whitespace
 
 //==============================================================================
 // operands are not valid for this GPU or mode
 
 image_atomic_add v252, v2, s[8:15]
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
 // CHECK-NEXT:{{^}}image_atomic_add v252, v2, s[8:15]
 // CHECK-NEXT:{{^}}^
 
diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
index 46b4e6ffb4037..1374417ac354b 100644
--- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -3287,6 +3287,9 @@ v_subrev_u32_e64 v255, s[12:13], v1, v2
 v_subrev_u32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
+v_swap_b16 v0.l, v0.l
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
 v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
index 9dc88690d9562..9bf72a11e5eed 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
@@ -152,3 +152,251 @@ image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] noa16
 
 image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42]], s[12:15] noa16
 // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: image address size does not match a16
+
+// missing dim
+image_atomic_add v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_and v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_cmpswap v[5:6], v1, s[8:15] dmask:0x3 unorm glc
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_dec v5, v1, s[8:15] dmask:0x1 unorm glc
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_inc v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_or v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_smax v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_smin v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_sub v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_swap v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_umax v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_umin v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_xor v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4 v[5:8], v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_cl v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz_o v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4h v[254:255], v[254:255], ttmp[8:15], ttmp[12:15] dmask:0x4 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_l v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz v[5:8], v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz_o v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_o v[5:8], v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_resinfo v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip v[5:6], v1, s[8:15] dmask:0x3 a16
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck v[5:6], v1, s[8:15] dmask:0x1 tfe
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck_sgn v5, v1, s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck_sgn v5, v[1:2], s[8:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_msaa_load v[5:7], v[1:2], s[96:103] dmask:0x4 a16 tfe d16
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o_g16 v[5:6], v[1:6], s[8:15], s[12:15] dmask:0x3
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l_o v5, v[1:3], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz_o v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_o v5, v[1:2], s[8:15], s[12:15] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store v1, v2, s[12:19] dmask:0x0 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip v1, v[2:3], s[12:19] dmask:0x0 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index d95ef6f15e48d..90d5ca7f72751 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3448,6 +3448,15 @@ v_sqrt_f64 v[5:6], src_scc
 v_sqrt_f64 v[254:255], 0xaf123456
 // GFX11: encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
+v_swap_b16 v5.l, v1.h
+// GFX11: encoding: [0x81,0xcd,0x0a,0x7e]
+
+v_swap_b16 v5.h, v1.l
+// GFX11: encoding: [0x01,0xcd,0x0a,0x7f]
+
+v_swap_b16 v127.l, v127.l
+// GFX11: encoding: [0x7f,0xcd,0xfe,0x7e]
+
 v_swap_b32 v5, v1
 // GFX11: encoding: [0x01,0xcb,0x0a,0x7e]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 5b5381b752feb..ab587a524fc6e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -211,6 +211,18 @@ v_sqrt_f16_e32 v255.l, v1.l
 v_sqrt_f16_e32 v5.l, v199.l
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
+v_swap_b16_e32 v128.l, v0.l
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, s0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
new file mode 100644
index 0000000000000..a0d11c985c6b7
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
@@ -0,0 +1,257 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+// missing dim
+image_atomic_add_flt v0, v0, s[0:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_add_uint v0, v0, s[0:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_and v5, v1, s[8:15] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_cmpswap v[0:1], v0, s[0:7] dmask:0x3
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_inc_uint v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_max_flt v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_max_int v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_max_uint v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_min_flt v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_min_int v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_min_uint v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_or v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_sub_uint v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_swap v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_atomic_xor v0, v0, s[0:7] dmask:0x1 th:TH_ATOMIC_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4 v[0:3], [v4, v5], s[0:7], s[100:103] dmask:0x8 unorm
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b v[64:67], [v32, v33], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_b_cl v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c v[64:67], [v32, v33], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_b_cl v[64:67], [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_cl v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_l v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_cl v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz v[64:67], [v32, v33], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_c_lz_o v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4h v[64:67], v32, s[4:11], s[4:7] dmask:0x8 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_l v[64:67], [v32, v33], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz v[64:67], v32, s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_gather4_lz_o v[64:67], [v32, v33], s[4:11], s[4:7] dmask:0x1 a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_lod v[64:67], [v32, v33, v34], s[4:11], s[100:103] dmask:0xf
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_get_resinfo v4, v32, s[96:103] dmask:0x1 th:TH_LOAD_RT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load v0, v0, s[0:7] dmask:0x1 th:TH_STORE_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip v[252:255], [v0, v1], s[0:7] dmask:0xf th:TH_LOAD_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck v5, v1, s[8:15] dmask:0x1 th:TH_LOAD_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_pck_sgn v5, v1, s[8:15] dmask:0x1 th:TH_LOAD_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_LOAD_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_load_mip_pck_sgn v5, [v0, v1], s[8:15] dmask:0x1 th:TH_LOAD_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample v[34:36], v37, s[36:43], s[64:67] dmask:0x3 tfe
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_cl_o v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_b_o v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_cl_o v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_b_o v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_cl_o v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_g16 v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o v64, [v32, v33, v34, v[35:37]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_cl_o_g16 v64, [v32, v33, v34, v[35:37]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_g16 v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_d_o_g16 v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_l_o v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_cl_o v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_lz_o v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_c_o v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_g16 v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_cl_o_g16 v64, [v32, v33, v34, v[35:36]], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_g16 v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_d_o_g16 v64, [v32, v33, v34, v35], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_l_o v64, [v32, v33, v34], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz v64, v32, s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_lz_o v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_sample_o v64, [v32, v33], s[4:11], s[4:7] dmask:0x1
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store v[0:3], v0, s[0:7] dmask:0xf a16
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip v[252:255], [v0, v1], s[0:7] dmask:0xf th:TH_STORE_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_pck v5, v1, s[8:15] dmask:0x1 th:TH_STORE_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+image_store_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_STORE_NT
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s
new file mode 100644
index 0000000000000..8fef2ab26dfdd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s
@@ -0,0 +1,3597 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+
+v_bfrev_b32_e32 v5, v1
+// GFX12: v_bfrev_b32_e32 v5, v1                  ; encoding: [0x01,0x71,0x0a,0x7e]
+
+v_bfrev_b32 v5, v255
+// GFX12: v_bfrev_b32_e32 v5, v255                ; encoding: [0xff,0x71,0x0a,0x7e]
+
+v_bfrev_b32 v5, s1
+// GFX12: v_bfrev_b32_e32 v5, s1                  ; encoding: [0x01,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, s105
+// GFX12: v_bfrev_b32_e32 v5, s105                ; encoding: [0x69,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, vcc_lo
+// GFX12: v_bfrev_b32_e32 v5, vcc_lo              ; encoding: [0x6a,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, vcc_hi
+// GFX12: v_bfrev_b32_e32 v5, vcc_hi              ; encoding: [0x6b,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, ttmp15
+// GFX12: v_bfrev_b32_e32 v5, ttmp15              ; encoding: [0x7b,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, m0
+// GFX12: v_bfrev_b32_e32 v5, m0                  ; encoding: [0x7d,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, exec_lo
+// GFX12: v_bfrev_b32_e32 v5, exec_lo             ; encoding: [0x7e,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, exec_hi
+// GFX12: v_bfrev_b32_e32 v5, exec_hi             ; encoding: [0x7f,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, null
+// GFX12: v_bfrev_b32_e32 v5, null                ; encoding: [0x7c,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, -1
+// GFX12: v_bfrev_b32_e32 v5, -1                  ; encoding: [0xc1,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, 0.5
+// GFX12: v_bfrev_b32_e32 v5, 0.5                 ; encoding: [0xf0,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, src_scc
+// GFX12: v_bfrev_b32_e32 v5, src_scc             ; encoding: [0xfd,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v255, 0xaf123456
+// GFX12: v_bfrev_b32_e32 v255, 0xaf123456        ; encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ceil_f16 v5, v1
+// GFX12: v_ceil_f16_e32 v5, v1                   ; encoding: [0x01,0xb9,0x0a,0x7e]
+
+v_ceil_f16 v5, v127
+// GFX12: v_ceil_f16_e32 v5, v127                 ; encoding: [0x7f,0xb9,0x0a,0x7e]
+
+v_ceil_f16 v5, s1
+// GFX12: v_ceil_f16_e32 v5, s1                   ; encoding: [0x01,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, s105
+// GFX12: v_ceil_f16_e32 v5, s105                 ; encoding: [0x69,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_lo
+// GFX12: v_ceil_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_hi
+// GFX12: v_ceil_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, ttmp15
+// GFX12: v_ceil_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, m0
+// GFX12: v_ceil_f16_e32 v5, m0                   ; encoding: [0x7d,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_lo
+// GFX12: v_ceil_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_hi
+// GFX12: v_ceil_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, null
+// GFX12: v_ceil_f16_e32 v5, null                 ; encoding: [0x7c,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, -1
+// GFX12: v_ceil_f16_e32 v5, -1                   ; encoding: [0xc1,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, 0.5
+// GFX12: v_ceil_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, src_scc
+// GFX12: v_ceil_f16_e32 v5, src_scc              ; encoding: [0xfd,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v127, 0xfe0b
+// GFX12: v_ceil_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_ceil_f32 v5, v1
+// GFX12: v_ceil_f32_e32 v5, v1                   ; encoding: [0x01,0x45,0x0a,0x7e]
+
+v_ceil_f32 v5, v255
+// GFX12: v_ceil_f32_e32 v5, v255                 ; encoding: [0xff,0x45,0x0a,0x7e]
+
+v_ceil_f32 v5, s1
+// GFX12: v_ceil_f32_e32 v5, s1                   ; encoding: [0x01,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, s105
+// GFX12: v_ceil_f32_e32 v5, s105                 ; encoding: [0x69,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, vcc_lo
+// GFX12: v_ceil_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, vcc_hi
+// GFX12: v_ceil_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, ttmp15
+// GFX12: v_ceil_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, m0
+// GFX12: v_ceil_f32_e32 v5, m0                   ; encoding: [0x7d,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, exec_lo
+// GFX12: v_ceil_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, exec_hi
+// GFX12: v_ceil_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, null
+// GFX12: v_ceil_f32_e32 v5, null                 ; encoding: [0x7c,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, -1
+// GFX12: v_ceil_f32_e32 v5, -1                   ; encoding: [0xc1,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, 0.5
+// GFX12: v_ceil_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, src_scc
+// GFX12: v_ceil_f32_e32 v5, src_scc              ; encoding: [0xfd,0x44,0x0a,0x7e]
+
+v_ceil_f32 v255, 0xaf123456
+// GFX12: v_ceil_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ceil_f64 v[5:6], v[1:2]
+// GFX12: v_ceil_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x31,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], v[254:255]
+// GFX12: v_ceil_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x31,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], s[2:3]
+// GFX12: v_ceil_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], s[104:105]
+// GFX12: v_ceil_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], vcc
+// GFX12: v_ceil_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], ttmp[14:15]
+// GFX12: v_ceil_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], exec
+// GFX12: v_ceil_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], null
+// GFX12: v_ceil_f64_e32 v[5:6], null             ; encoding: [0x7c,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], -1
+// GFX12: v_ceil_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], 0.5
+// GFX12: v_ceil_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], src_scc
+// GFX12: v_ceil_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[254:255], 0xaf123456
+// GFX12: v_ceil_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cls_i32 v5, v1
+// GFX12: v_cls_i32_e32 v5, v1                    ; encoding: [0x01,0x77,0x0a,0x7e]
+
+v_cls_i32 v5, v255
+// GFX12: v_cls_i32_e32 v5, v255                  ; encoding: [0xff,0x77,0x0a,0x7e]
+
+v_cls_i32 v5, s1
+// GFX12: v_cls_i32_e32 v5, s1                    ; encoding: [0x01,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, s105
+// GFX12: v_cls_i32_e32 v5, s105                  ; encoding: [0x69,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, vcc_lo
+// GFX12: v_cls_i32_e32 v5, vcc_lo                ; encoding: [0x6a,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, vcc_hi
+// GFX12: v_cls_i32_e32 v5, vcc_hi                ; encoding: [0x6b,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, ttmp15
+// GFX12: v_cls_i32_e32 v5, ttmp15                ; encoding: [0x7b,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, m0
+// GFX12: v_cls_i32_e32 v5, m0                    ; encoding: [0x7d,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, exec_lo
+// GFX12: v_cls_i32_e32 v5, exec_lo               ; encoding: [0x7e,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, exec_hi
+// GFX12: v_cls_i32_e32 v5, exec_hi               ; encoding: [0x7f,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, null
+// GFX12: v_cls_i32_e32 v5, null                  ; encoding: [0x7c,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, -1
+// GFX12: v_cls_i32_e32 v5, -1                    ; encoding: [0xc1,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, 0.5
+// GFX12: v_cls_i32_e32 v5, 0.5                   ; encoding: [0xf0,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, src_scc
+// GFX12: v_cls_i32_e32 v5, src_scc               ; encoding: [0xfd,0x76,0x0a,0x7e]
+
+v_cls_i32 v255, 0xaf123456
+// GFX12: v_cls_i32_e32 v255, 0xaf123456          ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_clz_i32_u32 v5, v1
+// GFX12: v_clz_i32_u32_e32 v5, v1                ; encoding: [0x01,0x73,0x0a,0x7e]
+
+v_clz_i32_u32 v5, v255
+// GFX12: v_clz_i32_u32_e32 v5, v255              ; encoding: [0xff,0x73,0x0a,0x7e]
+
+v_clz_i32_u32 v5, s1
+// GFX12: v_clz_i32_u32_e32 v5, s1                ; encoding: [0x01,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, s105
+// GFX12: v_clz_i32_u32_e32 v5, s105              ; encoding: [0x69,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, vcc_lo
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, vcc_hi
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, ttmp15
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, m0
+// GFX12: v_clz_i32_u32_e32 v5, m0                ; encoding: [0x7d,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, exec_lo
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, exec_hi
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, null
+// GFX12: v_clz_i32_u32_e32 v5, null              ; encoding: [0x7c,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, -1
+// GFX12: v_clz_i32_u32_e32 v5, -1                ; encoding: [0xc1,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, 0.5
+// GFX12: v_clz_i32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, src_scc
+// GFX12: v_clz_i32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v255, 0xaf123456
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cos_f16 v5, v1
+// GFX12: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5, v127
+// GFX12: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5, s1
+// GFX12: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, s105
+// GFX12: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_lo
+// GFX12: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_hi
+// GFX12: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, ttmp15
+// GFX12: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, m0
+// GFX12: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, exec_lo
+// GFX12: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, exec_hi
+// GFX12: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, null
+// GFX12: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, -1
+// GFX12: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, 0.5
+// GFX12: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, src_scc
+// GFX12: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
+
+v_cos_f16 v127, 0xfe0b
+// GFX12: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f32 v5, v1
+// GFX12: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
+
+v_cos_f32 v5, v255
+// GFX12: v_cos_f32_e32 v5, v255                  ; encoding: [0xff,0x6d,0x0a,0x7e]
+
+v_cos_f32 v5, s1
+// GFX12: v_cos_f32_e32 v5, s1                    ; encoding: [0x01,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, s105
+// GFX12: v_cos_f32_e32 v5, s105                  ; encoding: [0x69,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, vcc_lo
+// GFX12: v_cos_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, vcc_hi
+// GFX12: v_cos_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, ttmp15
+// GFX12: v_cos_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, m0
+// GFX12: v_cos_f32_e32 v5, m0                    ; encoding: [0x7d,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, exec_lo
+// GFX12: v_cos_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, exec_hi
+// GFX12: v_cos_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, null
+// GFX12: v_cos_f32_e32 v5, null                  ; encoding: [0x7c,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, -1
+// GFX12: v_cos_f32_e32 v5, -1                    ; encoding: [0xc1,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, 0.5
+// GFX12: v_cos_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, src_scc
+// GFX12: v_cos_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6c,0x0a,0x7e]
+
+v_cos_f32 v255, 0xaf123456
+// GFX12: v_cos_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ctz_i32_b32 v5, v1
+// GFX12: v_ctz_i32_b32_e32 v5, v1                ; encoding: [0x01,0x75,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, v255
+// GFX12: v_ctz_i32_b32_e32 v5, v255              ; encoding: [0xff,0x75,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, s1
+// GFX12: v_ctz_i32_b32_e32 v5, s1                ; encoding: [0x01,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, s105
+// GFX12: v_ctz_i32_b32_e32 v5, s105              ; encoding: [0x69,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, vcc_lo
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, vcc_hi
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, ttmp15
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, m0
+// GFX12: v_ctz_i32_b32_e32 v5, m0                ; encoding: [0x7d,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, exec_lo
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, exec_hi
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, null
+// GFX12: v_ctz_i32_b32_e32 v5, null              ; encoding: [0x7c,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, -1
+// GFX12: v_ctz_i32_b32_e32 v5, -1                ; encoding: [0xc1,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, 0.5
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5               ; encoding: [0xf0,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, src_scc
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc           ; encoding: [0xfd,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v255, 0xaf123456
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_bf8_e32 v1, s3
+// GFX12: v_cvt_f32_bf8_e32 v1, s3                ; encoding: [0x03,0xda,0x02,0x7e]
+
+v_cvt_f32_bf8_e32 v1, 3
+// GFX12: v_cvt_f32_bf8_e32 v1, 3                 ; encoding: [0x83,0xda,0x02,0x7e]
+
+v_cvt_f32_bf8_e32 v1, v3
+// GFX12: v_cvt_f32_bf8_e32 v1, v3                ; encoding: [0x03,0xdb,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, s3
+// GFX12: v_cvt_f32_fp8_e32 v1, s3                ; encoding: [0x03,0xd8,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, 3
+// GFX12: v_cvt_f32_fp8_e32 v1, 3                 ; encoding: [0x83,0xd8,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, v3
+// GFX12: v_cvt_f32_fp8_e32 v1, v3                ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], s3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3         ; encoding: [0x03,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], s5
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], s5         ; encoding: [0x05,0xde,0x06,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], 3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3          ; encoding: [0x83,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], 3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], 3          ; encoding: [0x83,0xde,0x06,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], v3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3         ; encoding: [0x03,0xdf,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], v3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3         ; encoding: [0x03,0xdf,0x06,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], s3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3         ; encoding: [0x03,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], 3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3          ; encoding: [0x83,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], v3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3         ; encoding: [0x03,0xdd,0x04,0x7e]
+
+v_cvt_f16_f32 v5, v1
+// GFX12: v_cvt_f16_f32_e32 v5, v1                ; encoding: [0x01,0x15,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, v255
+// GFX12: v_cvt_f16_f32_e32 v5, v255              ; encoding: [0xff,0x15,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, s1
+// GFX12: v_cvt_f16_f32_e32 v5, s1                ; encoding: [0x01,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, s105
+// GFX12: v_cvt_f16_f32_e32 v5, s105              ; encoding: [0x69,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, vcc_lo
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, vcc_hi
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, ttmp15
+// GFX12: v_cvt_f16_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, m0
+// GFX12: v_cvt_f16_f32_e32 v5, m0                ; encoding: [0x7d,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, exec_lo
+// GFX12: v_cvt_f16_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, exec_hi
+// GFX12: v_cvt_f16_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, null
+// GFX12: v_cvt_f16_f32_e32 v5, null              ; encoding: [0x7c,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, -1
+// GFX12: v_cvt_f16_f32_e32 v5, -1                ; encoding: [0xc1,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, 0.5
+// GFX12: v_cvt_f16_f32_e32 v5, 0.5               ; encoding: [0xf0,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, src_scc
+// GFX12: v_cvt_f16_f32_e32 v5, src_scc           ; encoding: [0xfd,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v127, 0xaf123456
+// GFX12: v_cvt_f16_f32_e32 v127, 0xaf123456      ; encoding: [0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf]
+
+v_cvt_f16_i16 v5, v1
+// GFX12: v_cvt_f16_i16_e32 v5, v1                ; encoding: [0x01,0xa3,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, v127
+// GFX12: v_cvt_f16_i16_e32 v5, v127              ; encoding: [0x7f,0xa3,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, s1
+// GFX12: v_cvt_f16_i16_e32 v5, s1                ; encoding: [0x01,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, s105
+// GFX12: v_cvt_f16_i16_e32 v5, s105              ; encoding: [0x69,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_lo
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_hi
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, ttmp15
+// GFX12: v_cvt_f16_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, m0
+// GFX12: v_cvt_f16_i16_e32 v5, m0                ; encoding: [0x7d,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_lo
+// GFX12: v_cvt_f16_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_hi
+// GFX12: v_cvt_f16_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, null
+// GFX12: v_cvt_f16_i16_e32 v5, null              ; encoding: [0x7c,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, -1
+// GFX12: v_cvt_f16_i16_e32 v5, -1                ; encoding: [0xc1,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, 0.5
+// GFX12-ASM: v_cvt_f16_i16_e32 v5, 0.5               ; encoding: [0xf0,0xa2,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_i16_e32 v5, 0x3800            ; encoding: [0xff,0xa2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_f16_i16 v5, src_scc
+// GFX12: v_cvt_f16_i16_e32 v5, src_scc           ; encoding: [0xfd,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v127, 0xfe0b
+// GFX12: v_cvt_f16_i16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f16_u16 v5, v1
+// GFX12: v_cvt_f16_u16_e32 v5, v1                ; encoding: [0x01,0xa1,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, v127
+// GFX12: v_cvt_f16_u16_e32 v5, v127              ; encoding: [0x7f,0xa1,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, s1
+// GFX12: v_cvt_f16_u16_e32 v5, s1                ; encoding: [0x01,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, s105
+// GFX12: v_cvt_f16_u16_e32 v5, s105              ; encoding: [0x69,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_lo
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_hi
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, ttmp15
+// GFX12: v_cvt_f16_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, m0
+// GFX12: v_cvt_f16_u16_e32 v5, m0                ; encoding: [0x7d,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_lo
+// GFX12: v_cvt_f16_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_hi
+// GFX12: v_cvt_f16_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, null
+// GFX12: v_cvt_f16_u16_e32 v5, null              ; encoding: [0x7c,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, -1
+// GFX12: v_cvt_f16_u16_e32 v5, -1                ; encoding: [0xc1,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, 0.5
+// GFX12-ASM: v_cvt_f16_u16_e32 v5, 0.5               ; encoding: [0xf0,0xa0,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_u16_e32 v5, 0x3800            ; encoding: [0xff,0xa0,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_f16_u16 v5, src_scc
+// GFX12: v_cvt_f16_u16_e32 v5, src_scc           ; encoding: [0xfd,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v127, 0xfe0b
+// GFX12: v_cvt_f16_u16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f32_f16 v5, v1
+// GFX12: v_cvt_f32_f16_e32 v5, v1                ; encoding: [0x01,0x17,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, v127
+// GFX12: v_cvt_f32_f16_e32 v5, v127              ; encoding: [0x7f,0x17,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, s1
+// GFX12: v_cvt_f32_f16_e32 v5, s1                ; encoding: [0x01,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, s105
+// GFX12: v_cvt_f32_f16_e32 v5, s105              ; encoding: [0x69,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, vcc_lo
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, vcc_hi
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, ttmp15
+// GFX12: v_cvt_f32_f16_e32 v5, ttmp15            ; encoding: [0x7b,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, m0
+// GFX12: v_cvt_f32_f16_e32 v5, m0                ; encoding: [0x7d,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, exec_lo
+// GFX12: v_cvt_f32_f16_e32 v5, exec_lo           ; encoding: [0x7e,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, exec_hi
+// GFX12: v_cvt_f32_f16_e32 v5, exec_hi           ; encoding: [0x7f,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, null
+// GFX12: v_cvt_f32_f16_e32 v5, null              ; encoding: [0x7c,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, -1
+// GFX12: v_cvt_f32_f16_e32 v5, -1                ; encoding: [0xc1,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, 0.5
+// GFX12: v_cvt_f32_f16_e32 v5, 0.5               ; encoding: [0xf0,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, src_scc
+// GFX12: v_cvt_f32_f16_e32 v5, src_scc           ; encoding: [0xfd,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v255, 0xfe0b
+// GFX12: v_cvt_f32_f16_e32 v255, 0xfe0b          ; encoding: [0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f32_f64 v5, v[1:2]
+// GFX12: v_cvt_f32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x1f,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, v[254:255]
+// GFX12: v_cvt_f32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x1f,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, s[2:3]
+// GFX12: v_cvt_f32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, s[104:105]
+// GFX12: v_cvt_f32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, vcc
+// GFX12: v_cvt_f32_f64_e32 v5, vcc               ; encoding: [0x6a,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_f32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, exec
+// GFX12: v_cvt_f32_f64_e32 v5, exec              ; encoding: [0x7e,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, null
+// GFX12: v_cvt_f32_f64_e32 v5, null              ; encoding: [0x7c,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, -1
+// GFX12: v_cvt_f32_f64_e32 v5, -1                ; encoding: [0xc1,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, 0.5
+// GFX12: v_cvt_f32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, src_scc
+// GFX12: v_cvt_f32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v255, 0xaf123456
+// GFX12: v_cvt_f32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_i32 v5, v1
+// GFX12: v_cvt_f32_i32_e32 v5, v1                ; encoding: [0x01,0x0b,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, v255
+// GFX12: v_cvt_f32_i32_e32 v5, v255              ; encoding: [0xff,0x0b,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, s1
+// GFX12: v_cvt_f32_i32_e32 v5, s1                ; encoding: [0x01,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, s105
+// GFX12: v_cvt_f32_i32_e32 v5, s105              ; encoding: [0x69,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, vcc_lo
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, vcc_hi
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, ttmp15
+// GFX12: v_cvt_f32_i32_e32 v5, ttmp15            ; encoding: [0x7b,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, m0
+// GFX12: v_cvt_f32_i32_e32 v5, m0                ; encoding: [0x7d,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, exec_lo
+// GFX12: v_cvt_f32_i32_e32 v5, exec_lo           ; encoding: [0x7e,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, exec_hi
+// GFX12: v_cvt_f32_i32_e32 v5, exec_hi           ; encoding: [0x7f,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, null
+// GFX12: v_cvt_f32_i32_e32 v5, null              ; encoding: [0x7c,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, -1
+// GFX12: v_cvt_f32_i32_e32 v5, -1                ; encoding: [0xc1,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, 0.5
+// GFX12: v_cvt_f32_i32_e32 v5, 0.5               ; encoding: [0xf0,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, src_scc
+// GFX12: v_cvt_f32_i32_e32 v5, src_scc           ; encoding: [0xfd,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v255, 0xaf123456
+// GFX12: v_cvt_f32_i32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_u32 v5, v1
+// GFX12: v_cvt_f32_u32_e32 v5, v1                ; encoding: [0x01,0x0d,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, v255
+// GFX12: v_cvt_f32_u32_e32 v5, v255              ; encoding: [0xff,0x0d,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, s1
+// GFX12: v_cvt_f32_u32_e32 v5, s1                ; encoding: [0x01,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, s105
+// GFX12: v_cvt_f32_u32_e32 v5, s105              ; encoding: [0x69,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, vcc_lo
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, vcc_hi
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, ttmp15
+// GFX12: v_cvt_f32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, m0
+// GFX12: v_cvt_f32_u32_e32 v5, m0                ; encoding: [0x7d,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, exec_lo
+// GFX12: v_cvt_f32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, exec_hi
+// GFX12: v_cvt_f32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, null
+// GFX12: v_cvt_f32_u32_e32 v5, null              ; encoding: [0x7c,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, -1
+// GFX12: v_cvt_f32_u32_e32 v5, -1                ; encoding: [0xc1,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, 0.5
+// GFX12: v_cvt_f32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, src_scc
+// GFX12: v_cvt_f32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v255, 0xaf123456
+// GFX12: v_cvt_f32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte0 v5, v1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v1             ; encoding: [0x01,0x23,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, v255
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v255           ; encoding: [0xff,0x23,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, s1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s1             ; encoding: [0x01,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, s105
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s105           ; encoding: [0x69,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_lo         ; encoding: [0x6a,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_hi         ; encoding: [0x6b,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte0_e32 v5, ttmp15         ; encoding: [0x7b,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, m0
+// GFX12: v_cvt_f32_ubyte0_e32 v5, m0             ; encoding: [0x7d,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_lo        ; encoding: [0x7e,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_hi        ; encoding: [0x7f,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, null
+// GFX12: v_cvt_f32_ubyte0_e32 v5, null           ; encoding: [0x7c,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, -1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, -1             ; encoding: [0xc1,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, 0.5
+// GFX12: v_cvt_f32_ubyte0_e32 v5, 0.5            ; encoding: [0xf0,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, src_scc
+// GFX12: v_cvt_f32_ubyte0_e32 v5, src_scc        ; encoding: [0xfd,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte0_e32 v255, 0xaf123456   ; encoding: [0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte1 v5, v1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v1             ; encoding: [0x01,0x25,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, v255
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v255           ; encoding: [0xff,0x25,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, s1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s1             ; encoding: [0x01,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, s105
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s105           ; encoding: [0x69,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_lo         ; encoding: [0x6a,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_hi         ; encoding: [0x6b,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte1_e32 v5, ttmp15         ; encoding: [0x7b,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, m0
+// GFX12: v_cvt_f32_ubyte1_e32 v5, m0             ; encoding: [0x7d,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_lo        ; encoding: [0x7e,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_hi        ; encoding: [0x7f,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, null
+// GFX12: v_cvt_f32_ubyte1_e32 v5, null           ; encoding: [0x7c,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, -1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, -1             ; encoding: [0xc1,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, 0.5
+// GFX12: v_cvt_f32_ubyte1_e32 v5, 0.5            ; encoding: [0xf0,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, src_scc
+// GFX12: v_cvt_f32_ubyte1_e32 v5, src_scc        ; encoding: [0xfd,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte1_e32 v255, 0xaf123456   ; encoding: [0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte2 v5, v1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v1             ; encoding: [0x01,0x27,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, v255
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v255           ; encoding: [0xff,0x27,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, s1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s1             ; encoding: [0x01,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, s105
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s105           ; encoding: [0x69,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_lo         ; encoding: [0x6a,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_hi         ; encoding: [0x6b,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte2_e32 v5, ttmp15         ; encoding: [0x7b,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, m0
+// GFX12: v_cvt_f32_ubyte2_e32 v5, m0             ; encoding: [0x7d,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_lo        ; encoding: [0x7e,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_hi        ; encoding: [0x7f,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, null
+// GFX12: v_cvt_f32_ubyte2_e32 v5, null           ; encoding: [0x7c,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, -1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, -1             ; encoding: [0xc1,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, 0.5
+// GFX12: v_cvt_f32_ubyte2_e32 v5, 0.5            ; encoding: [0xf0,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, src_scc
+// GFX12: v_cvt_f32_ubyte2_e32 v5, src_scc        ; encoding: [0xfd,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte2_e32 v255, 0xaf123456   ; encoding: [0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte3 v5, v1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v1             ; encoding: [0x01,0x29,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, v255
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v255           ; encoding: [0xff,0x29,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, s1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s1             ; encoding: [0x01,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, s105
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s105           ; encoding: [0x69,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_lo         ; encoding: [0x6a,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_hi         ; encoding: [0x6b,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte3_e32 v5, ttmp15         ; encoding: [0x7b,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, m0
+// GFX12: v_cvt_f32_ubyte3_e32 v5, m0             ; encoding: [0x7d,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_lo        ; encoding: [0x7e,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_hi        ; encoding: [0x7f,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, null
+// GFX12: v_cvt_f32_ubyte3_e32 v5, null           ; encoding: [0x7c,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, -1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, -1             ; encoding: [0xc1,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, 0.5
+// GFX12: v_cvt_f32_ubyte3_e32 v5, 0.5            ; encoding: [0xf0,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, src_scc
+// GFX12: v_cvt_f32_ubyte3_e32 v5, src_scc        ; encoding: [0xfd,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte3_e32 v255, 0xaf123456   ; encoding: [0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_f32 v[5:6], v1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v1            ; encoding: [0x01,0x21,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], v255
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v255          ; encoding: [0xff,0x21,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], s1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s1            ; encoding: [0x01,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], s105
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s105          ; encoding: [0x69,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_f32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], m0
+// GFX12: v_cvt_f64_f32_e32 v[5:6], m0            ; encoding: [0x7d,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], null
+// GFX12: v_cvt_f64_f32_e32 v[5:6], null          ; encoding: [0x7c,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], -1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], -1            ; encoding: [0xc1,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], 0.5
+// GFX12: v_cvt_f64_f32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], src_scc
+// GFX12: v_cvt_f64_f32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_f32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_i32 v[5:6], v1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v1            ; encoding: [0x01,0x09,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], v255
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v255          ; encoding: [0xff,0x09,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], s1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s1            ; encoding: [0x01,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], s105
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s105          ; encoding: [0x69,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_i32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], m0
+// GFX12: v_cvt_f64_i32_e32 v[5:6], m0            ; encoding: [0x7d,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], null
+// GFX12: v_cvt_f64_i32_e32 v[5:6], null          ; encoding: [0x7c,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], -1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], -1            ; encoding: [0xc1,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], 0.5
+// GFX12: v_cvt_f64_i32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], src_scc
+// GFX12: v_cvt_f64_i32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_i32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_u32 v[5:6], v1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v1            ; encoding: [0x01,0x2d,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], v255
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v255          ; encoding: [0xff,0x2d,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], s1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s1            ; encoding: [0x01,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], s105
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s105          ; encoding: [0x69,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_u32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], m0
+// GFX12: v_cvt_f64_u32_e32 v[5:6], m0            ; encoding: [0x7d,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], null
+// GFX12: v_cvt_f64_u32_e32 v[5:6], null          ; encoding: [0x7c,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], -1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], -1            ; encoding: [0xc1,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], 0.5
+// GFX12: v_cvt_f64_u32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], src_scc
+// GFX12: v_cvt_f64_u32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_u32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_floor_i32_f32 v5, v1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1          ; encoding: [0x01,0x1b,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, v255
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255        ; encoding: [0xff,0x1b,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, s1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1          ; encoding: [0x01,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, s105
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105        ; encoding: [0x69,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, ttmp15
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, m0
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, exec_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, exec_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, null
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null        ; encoding: [0x7c,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, -1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, 0.5
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, src_scc
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_flr_i32_f32 v5, v1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1          ; encoding: [0x01,0x1b,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, v255
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255        ; encoding: [0xff,0x1b,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, s1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1          ; encoding: [0x01,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, s105
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105        ; encoding: [0x69,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, ttmp15
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, m0
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, exec_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, exec_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, null
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null        ; encoding: [0x7c,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, -1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, 0.5
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, src_scc
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i16_f16 v5, v1
+// GFX12: v_cvt_i16_f16_e32 v5, v1                ; encoding: [0x01,0xa7,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, v127
+// GFX12: v_cvt_i16_f16_e32 v5, v127              ; encoding: [0x7f,0xa7,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, s1
+// GFX12: v_cvt_i16_f16_e32 v5, s1                ; encoding: [0x01,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, s105
+// GFX12: v_cvt_i16_f16_e32 v5, s105              ; encoding: [0x69,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_lo
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_hi
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, ttmp15
+// GFX12: v_cvt_i16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, m0
+// GFX12: v_cvt_i16_f16_e32 v5, m0                ; encoding: [0x7d,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_lo
+// GFX12: v_cvt_i16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_hi
+// GFX12: v_cvt_i16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, null
+// GFX12: v_cvt_i16_f16_e32 v5, null              ; encoding: [0x7c,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, -1
+// GFX12: v_cvt_i16_f16_e32 v5, -1                ; encoding: [0xc1,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, 0.5
+// GFX12: v_cvt_i16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, src_scc
+// GFX12: v_cvt_i16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v127, 0xfe0b
+// GFX12: v_cvt_i16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_i32_f32 v5, v1
+// GFX12: v_cvt_i32_f32_e32 v5, v1                ; encoding: [0x01,0x11,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, v255
+// GFX12: v_cvt_i32_f32_e32 v5, v255              ; encoding: [0xff,0x11,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, s1
+// GFX12: v_cvt_i32_f32_e32 v5, s1                ; encoding: [0x01,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, s105
+// GFX12: v_cvt_i32_f32_e32 v5, s105              ; encoding: [0x69,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, ttmp15
+// GFX12: v_cvt_i32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, m0
+// GFX12: v_cvt_i32_f32_e32 v5, m0                ; encoding: [0x7d,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, exec_lo
+// GFX12: v_cvt_i32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, exec_hi
+// GFX12: v_cvt_i32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, null
+// GFX12: v_cvt_i32_f32_e32 v5, null              ; encoding: [0x7c,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, -1
+// GFX12: v_cvt_i32_f32_e32 v5, -1                ; encoding: [0xc1,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, 0.5
+// GFX12: v_cvt_i32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, src_scc
+// GFX12: v_cvt_i32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_i32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i32_f64 v5, v[1:2]
+// GFX12: v_cvt_i32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x07,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, v[254:255]
+// GFX12: v_cvt_i32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x07,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, s[2:3]
+// GFX12: v_cvt_i32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, s[104:105]
+// GFX12: v_cvt_i32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, vcc
+// GFX12: v_cvt_i32_f64_e32 v5, vcc               ; encoding: [0x6a,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_i32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, exec
+// GFX12: v_cvt_i32_f64_e32 v5, exec              ; encoding: [0x7e,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, null
+// GFX12: v_cvt_i32_f64_e32 v5, null              ; encoding: [0x7c,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, -1
+// GFX12: v_cvt_i32_f64_e32 v5, -1                ; encoding: [0xc1,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, 0.5
+// GFX12: v_cvt_i32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, src_scc
+// GFX12: v_cvt_i32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v255, 0xaf123456
+// GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i32_i16 v5, v1
+// GFX12: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127
+// GFX12: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, s1
+// GFX12: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, s105
+// GFX12: v_cvt_i32_i16_e32 v5, s105              ; encoding: [0x69,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, vcc_lo
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, vcc_hi
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, ttmp15
+// GFX12: v_cvt_i32_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, m0
+// GFX12: v_cvt_i32_i16_e32 v5, m0                ; encoding: [0x7d,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, exec_lo
+// GFX12: v_cvt_i32_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, exec_hi
+// GFX12: v_cvt_i32_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, null
+// GFX12: v_cvt_i32_i16_e32 v5, null              ; encoding: [0x7c,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, -1
+// GFX12: v_cvt_i32_i16_e32 v5, -1                ; encoding: [0xc1,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, 0.5
+// GFX12-ASM: v_cvt_i32_i16_e32 v5, 0.5               ; encoding: [0xf0,0xd4,0x0a,0x7e]
+// GFX12-DIS: v_cvt_i32_i16_e32 v5, 0x3800            ; encoding: [0xff,0xd4,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_i32_i16 v5, src_scc
+// GFX12: v_cvt_i32_i16_e32 v5, src_scc           ; encoding: [0xfd,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v255, 0xfe0b
+// GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_cvt_nearest_i32_f32 v5, v1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, v255
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255      ; encoding: [0xff,0x19,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, s1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1        ; encoding: [0x01,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, s105
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105      ; encoding: [0x69,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo    ; encoding: [0x6a,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi    ; encoding: [0x6b,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, ttmp15
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15    ; encoding: [0x7b,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, m0
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0        ; encoding: [0x7d,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, exec_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo   ; encoding: [0x7e,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, exec_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi   ; encoding: [0x7f,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, null
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null      ; encoding: [0x7c,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, -1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1        ; encoding: [0xc1,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, 0.5
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5       ; encoding: [0xf0,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, src_scc
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc   ; encoding: [0xfd,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_norm_i16_f16 v5, v1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v1           ; encoding: [0x01,0xc7,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, v127
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v127         ; encoding: [0x7f,0xc7,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, s1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s1           ; encoding: [0x01,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, s105
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s105         ; encoding: [0x69,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, vcc_lo
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, vcc_hi
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, ttmp15
+// GFX12: v_cvt_norm_i16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, m0
+// GFX12: v_cvt_norm_i16_f16_e32 v5, m0           ; encoding: [0x7d,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, exec_lo
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, exec_hi
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, null
+// GFX12: v_cvt_norm_i16_f16_e32 v5, null         ; encoding: [0x7c,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, -1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, -1           ; encoding: [0xc1,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, 0.5
+// GFX12: v_cvt_norm_i16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, src_scc
+// GFX12: v_cvt_norm_i16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v127, 0xfe0b
+// GFX12: v_cvt_norm_i16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_norm_u16_f16 v5, v1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v1           ; encoding: [0x01,0xc9,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, v127
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v127         ; encoding: [0x7f,0xc9,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, s1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s1           ; encoding: [0x01,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, s105
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s105         ; encoding: [0x69,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, vcc_lo
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, vcc_hi
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, ttmp15
+// GFX12: v_cvt_norm_u16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, m0
+// GFX12: v_cvt_norm_u16_f16_e32 v5, m0           ; encoding: [0x7d,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, exec_lo
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, exec_hi
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, null
+// GFX12: v_cvt_norm_u16_f16_e32 v5, null         ; encoding: [0x7c,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, -1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, -1           ; encoding: [0xc1,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, 0.5
+// GFX12: v_cvt_norm_u16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, src_scc
+// GFX12: v_cvt_norm_u16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v127, 0xfe0b
+// GFX12: v_cvt_norm_u16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_off_f32_i4 v5, v1
+// GFX12: v_cvt_off_f32_i4_e32 v5, v1             ; encoding: [0x01,0x1d,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, v255
+// GFX12: v_cvt_off_f32_i4_e32 v5, v255           ; encoding: [0xff,0x1d,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, s1
+// GFX12: v_cvt_off_f32_i4_e32 v5, s1             ; encoding: [0x01,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, s105
+// GFX12: v_cvt_off_f32_i4_e32 v5, s105           ; encoding: [0x69,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, vcc_lo
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_lo         ; encoding: [0x6a,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, vcc_hi
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_hi         ; encoding: [0x6b,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, ttmp15
+// GFX12: v_cvt_off_f32_i4_e32 v5, ttmp15         ; encoding: [0x7b,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, m0
+// GFX12: v_cvt_off_f32_i4_e32 v5, m0             ; encoding: [0x7d,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, exec_lo
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_lo        ; encoding: [0x7e,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, exec_hi
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_hi        ; encoding: [0x7f,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, null
+// GFX12: v_cvt_off_f32_i4_e32 v5, null           ; encoding: [0x7c,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, -1
+// GFX12: v_cvt_off_f32_i4_e32 v5, -1             ; encoding: [0xc1,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, 0.5
+// GFX12: v_cvt_off_f32_i4_e32 v5, 0.5            ; encoding: [0xf0,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, src_scc
+// GFX12: v_cvt_off_f32_i4_e32 v5, src_scc        ; encoding: [0xfd,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v255, 0x4f
+// GFX12: v_cvt_off_f32_i4_e32 v255, 0x4f         ; encoding: [0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00]
+
+v_cvt_rpi_i32_f32 v5, v1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, v255
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255      ; encoding: [0xff,0x19,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, s1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1        ; encoding: [0x01,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, s105
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105      ; encoding: [0x69,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo    ; encoding: [0x6a,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi    ; encoding: [0x6b,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, ttmp15
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15    ; encoding: [0x7b,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, m0
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0        ; encoding: [0x7d,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, exec_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo   ; encoding: [0x7e,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, exec_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi   ; encoding: [0x7f,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, null
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null      ; encoding: [0x7c,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, -1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1        ; encoding: [0xc1,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, 0.5
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5       ; encoding: [0xf0,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, src_scc
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc   ; encoding: [0xfd,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u16_f16 v5, v1
+// GFX12: v_cvt_u16_f16_e32 v5, v1                ; encoding: [0x01,0xa5,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, v127
+// GFX12: v_cvt_u16_f16_e32 v5, v127              ; encoding: [0x7f,0xa5,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, s1
+// GFX12: v_cvt_u16_f16_e32 v5, s1                ; encoding: [0x01,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, s105
+// GFX12: v_cvt_u16_f16_e32 v5, s105              ; encoding: [0x69,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_lo
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_hi
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, ttmp15
+// GFX12: v_cvt_u16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, m0
+// GFX12: v_cvt_u16_f16_e32 v5, m0                ; encoding: [0x7d,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_lo
+// GFX12: v_cvt_u16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_hi
+// GFX12: v_cvt_u16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, null
+// GFX12: v_cvt_u16_f16_e32 v5, null              ; encoding: [0x7c,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, -1
+// GFX12: v_cvt_u16_f16_e32 v5, -1                ; encoding: [0xc1,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, 0.5
+// GFX12: v_cvt_u16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, src_scc
+// GFX12: v_cvt_u16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v127, 0xfe0b
+// GFX12: v_cvt_u16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_u32_f32 v5, v1
+// GFX12: v_cvt_u32_f32_e32 v5, v1                ; encoding: [0x01,0x0f,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, v255
+// GFX12: v_cvt_u32_f32_e32 v5, v255              ; encoding: [0xff,0x0f,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, s1
+// GFX12: v_cvt_u32_f32_e32 v5, s1                ; encoding: [0x01,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, s105
+// GFX12: v_cvt_u32_f32_e32 v5, s105              ; encoding: [0x69,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, vcc_lo
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, vcc_hi
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, ttmp15
+// GFX12: v_cvt_u32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, m0
+// GFX12: v_cvt_u32_f32_e32 v5, m0                ; encoding: [0x7d,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, exec_lo
+// GFX12: v_cvt_u32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, exec_hi
+// GFX12: v_cvt_u32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, null
+// GFX12: v_cvt_u32_f32_e32 v5, null              ; encoding: [0x7c,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, -1
+// GFX12: v_cvt_u32_f32_e32 v5, -1                ; encoding: [0xc1,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, 0.5
+// GFX12: v_cvt_u32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, src_scc
+// GFX12: v_cvt_u32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v255, 0xaf123456
+// GFX12: v_cvt_u32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u32_f64 v5, v[1:2]
+// GFX12: v_cvt_u32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x2b,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, v[254:255]
+// GFX12: v_cvt_u32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x2b,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, s[2:3]
+// GFX12: v_cvt_u32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, s[104:105]
+// GFX12: v_cvt_u32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, vcc
+// GFX12: v_cvt_u32_f64_e32 v5, vcc               ; encoding: [0x6a,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_u32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, exec
+// GFX12: v_cvt_u32_f64_e32 v5, exec              ; encoding: [0x7e,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, null
+// GFX12: v_cvt_u32_f64_e32 v5, null              ; encoding: [0x7c,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, -1
+// GFX12: v_cvt_u32_f64_e32 v5, -1                ; encoding: [0xc1,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, 0.5
+// GFX12: v_cvt_u32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, src_scc
+// GFX12: v_cvt_u32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v255, 0xaf123456
+// GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u32_u16 v5, v1
+// GFX12: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127
+// GFX12: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, s1
+// GFX12: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, s105
+// GFX12: v_cvt_u32_u16_e32 v5, s105              ; encoding: [0x69,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, vcc_lo
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, vcc_hi
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, ttmp15
+// GFX12: v_cvt_u32_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, m0
+// GFX12: v_cvt_u32_u16_e32 v5, m0                ; encoding: [0x7d,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, exec_lo
+// GFX12: v_cvt_u32_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, exec_hi
+// GFX12: v_cvt_u32_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, null
+// GFX12: v_cvt_u32_u16_e32 v5, null              ; encoding: [0x7c,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, -1
+// GFX12: v_cvt_u32_u16_e32 v5, -1                ; encoding: [0xc1,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, 0.5
+// GFX12-ASM: v_cvt_u32_u16_e32 v5, 0.5               ; encoding: [0xf0,0xd6,0x0a,0x7e]
+// GFX12-DIS: v_cvt_u32_u16_e32 v5, 0x3800            ; encoding: [0xff,0xd6,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_u32_u16 v5, src_scc
+// GFX12: v_cvt_u32_u16_e32 v5, src_scc           ; encoding: [0xfd,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v255, 0xfe0b
+// GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_exp_f16 v5, v1
+// GFX12: v_exp_f16_e32 v5, v1                    ; encoding: [0x01,0xb1,0x0a,0x7e]
+
+v_exp_f16 v5, v127
+// GFX12: v_exp_f16_e32 v5, v127                  ; encoding: [0x7f,0xb1,0x0a,0x7e]
+
+v_exp_f16 v5, s1
+// GFX12: v_exp_f16_e32 v5, s1                    ; encoding: [0x01,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, s105
+// GFX12: v_exp_f16_e32 v5, s105                  ; encoding: [0x69,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_lo
+// GFX12: v_exp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_hi
+// GFX12: v_exp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, ttmp15
+// GFX12: v_exp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, m0
+// GFX12: v_exp_f16_e32 v5, m0                    ; encoding: [0x7d,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, exec_lo
+// GFX12: v_exp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, exec_hi
+// GFX12: v_exp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, null
+// GFX12: v_exp_f16_e32 v5, null                  ; encoding: [0x7c,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, -1
+// GFX12: v_exp_f16_e32 v5, -1                    ; encoding: [0xc1,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, 0.5
+// GFX12: v_exp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, src_scc
+// GFX12: v_exp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xb0,0x0a,0x7e]
+
+v_exp_f16 v127, 0xfe0b
+// GFX12: v_exp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_exp_f32 v5, v1
+// GFX12: v_exp_f32_e32 v5, v1                    ; encoding: [0x01,0x4b,0x0a,0x7e]
+
+v_exp_f32 v5, v255
+// GFX12: v_exp_f32_e32 v5, v255                  ; encoding: [0xff,0x4b,0x0a,0x7e]
+
+v_exp_f32 v5, s1
+// GFX12: v_exp_f32_e32 v5, s1                    ; encoding: [0x01,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, s105
+// GFX12: v_exp_f32_e32 v5, s105                  ; encoding: [0x69,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, vcc_lo
+// GFX12: v_exp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, vcc_hi
+// GFX12: v_exp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, ttmp15
+// GFX12: v_exp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, m0
+// GFX12: v_exp_f32_e32 v5, m0                    ; encoding: [0x7d,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, exec_lo
+// GFX12: v_exp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, exec_hi
+// GFX12: v_exp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, null
+// GFX12: v_exp_f32_e32 v5, null                  ; encoding: [0x7c,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, -1
+// GFX12: v_exp_f32_e32 v5, -1                    ; encoding: [0xc1,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, 0.5
+// GFX12: v_exp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, src_scc
+// GFX12: v_exp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4a,0x0a,0x7e]
+
+v_exp_f32 v255, 0xaf123456
+// GFX12: v_exp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbh_i32 v5, v1
+// GFX12: v_cls_i32_e32 v5, v1                    ; encoding: [0x01,0x77,0x0a,0x7e]
+
+v_ffbh_i32 v5, v255
+// GFX12: v_cls_i32_e32 v5, v255                  ; encoding: [0xff,0x77,0x0a,0x7e]
+
+v_ffbh_i32 v5, s1
+// GFX12: v_cls_i32_e32 v5, s1                    ; encoding: [0x01,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, s105
+// GFX12: v_cls_i32_e32 v5, s105                  ; encoding: [0x69,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, vcc_lo
+// GFX12: v_cls_i32_e32 v5, vcc_lo                ; encoding: [0x6a,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, vcc_hi
+// GFX12: v_cls_i32_e32 v5, vcc_hi                ; encoding: [0x6b,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, ttmp15
+// GFX12: v_cls_i32_e32 v5, ttmp15                ; encoding: [0x7b,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, m0
+// GFX12: v_cls_i32_e32 v5, m0                    ; encoding: [0x7d,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, exec_lo
+// GFX12: v_cls_i32_e32 v5, exec_lo               ; encoding: [0x7e,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, exec_hi
+// GFX12: v_cls_i32_e32 v5, exec_hi               ; encoding: [0x7f,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, null
+// GFX12: v_cls_i32_e32 v5, null                  ; encoding: [0x7c,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, -1
+// GFX12: v_cls_i32_e32 v5, -1                    ; encoding: [0xc1,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, 0.5
+// GFX12: v_cls_i32_e32 v5, 0.5                   ; encoding: [0xf0,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, src_scc
+// GFX12: v_cls_i32_e32 v5, src_scc               ; encoding: [0xfd,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v255, 0xaf123456
+// GFX12: v_cls_i32_e32 v255, 0xaf123456          ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbh_u32 v5, v1
+// GFX12: v_clz_i32_u32_e32 v5, v1                ; encoding: [0x01,0x73,0x0a,0x7e]
+
+v_ffbh_u32 v5, v255
+// GFX12: v_clz_i32_u32_e32 v5, v255              ; encoding: [0xff,0x73,0x0a,0x7e]
+
+v_ffbh_u32 v5, s1
+// GFX12: v_clz_i32_u32_e32 v5, s1                ; encoding: [0x01,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, s105
+// GFX12: v_clz_i32_u32_e32 v5, s105              ; encoding: [0x69,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, vcc_lo
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, vcc_hi
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, ttmp15
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, m0
+// GFX12: v_clz_i32_u32_e32 v5, m0                ; encoding: [0x7d,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, exec_lo
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, exec_hi
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, null
+// GFX12: v_clz_i32_u32_e32 v5, null              ; encoding: [0x7c,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, -1
+// GFX12: v_clz_i32_u32_e32 v5, -1                ; encoding: [0xc1,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, 0.5
+// GFX12: v_clz_i32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, src_scc
+// GFX12: v_clz_i32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v255, 0xaf123456
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbl_b32 v5, v1
+// GFX12: v_ctz_i32_b32_e32 v5, v1                ; encoding: [0x01,0x75,0x0a,0x7e]
+
+v_ffbl_b32 v5, v255
+// GFX12: v_ctz_i32_b32_e32 v5, v255              ; encoding: [0xff,0x75,0x0a,0x7e]
+
+v_ffbl_b32 v5, s1
+// GFX12: v_ctz_i32_b32_e32 v5, s1                ; encoding: [0x01,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, s105
+// GFX12: v_ctz_i32_b32_e32 v5, s105              ; encoding: [0x69,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, vcc_lo
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, vcc_hi
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, ttmp15
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, m0
+// GFX12: v_ctz_i32_b32_e32 v5, m0                ; encoding: [0x7d,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, exec_lo
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, exec_hi
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, null
+// GFX12: v_ctz_i32_b32_e32 v5, null              ; encoding: [0x7c,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, -1
+// GFX12: v_ctz_i32_b32_e32 v5, -1                ; encoding: [0xc1,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, 0.5
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5               ; encoding: [0xf0,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, src_scc
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc           ; encoding: [0xfd,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v255, 0xaf123456
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_floor_f16 v5, v1
+// GFX12: v_floor_f16_e32 v5, v1                  ; encoding: [0x01,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, v127
+// GFX12: v_floor_f16_e32 v5, v127                ; encoding: [0x7f,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, s1
+// GFX12: v_floor_f16_e32 v5, s1                  ; encoding: [0x01,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, s105
+// GFX12: v_floor_f16_e32 v5, s105                ; encoding: [0x69,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_lo
+// GFX12: v_floor_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_hi
+// GFX12: v_floor_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, ttmp15
+// GFX12: v_floor_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, m0
+// GFX12: v_floor_f16_e32 v5, m0                  ; encoding: [0x7d,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_lo
+// GFX12: v_floor_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_hi
+// GFX12: v_floor_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, null
+// GFX12: v_floor_f16_e32 v5, null                ; encoding: [0x7c,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, -1
+// GFX12: v_floor_f16_e32 v5, -1                  ; encoding: [0xc1,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, 0.5
+// GFX12: v_floor_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, src_scc
+// GFX12: v_floor_f16_e32 v5, src_scc             ; encoding: [0xfd,0xb6,0x0a,0x7e]
+
+v_floor_f16 v127, 0xfe0b
+// GFX12: v_floor_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_floor_f32 v5, v1
+// GFX12: v_floor_f32_e32 v5, v1                  ; encoding: [0x01,0x49,0x0a,0x7e]
+
+v_floor_f32 v5, v255
+// GFX12: v_floor_f32_e32 v5, v255                ; encoding: [0xff,0x49,0x0a,0x7e]
+
+v_floor_f32 v5, s1
+// GFX12: v_floor_f32_e32 v5, s1                  ; encoding: [0x01,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, s105
+// GFX12: v_floor_f32_e32 v5, s105                ; encoding: [0x69,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, vcc_lo
+// GFX12: v_floor_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, vcc_hi
+// GFX12: v_floor_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, ttmp15
+// GFX12: v_floor_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, m0
+// GFX12: v_floor_f32_e32 v5, m0                  ; encoding: [0x7d,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, exec_lo
+// GFX12: v_floor_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, exec_hi
+// GFX12: v_floor_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, null
+// GFX12: v_floor_f32_e32 v5, null                ; encoding: [0x7c,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, -1
+// GFX12: v_floor_f32_e32 v5, -1                  ; encoding: [0xc1,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, 0.5
+// GFX12: v_floor_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, src_scc
+// GFX12: v_floor_f32_e32 v5, src_scc             ; encoding: [0xfd,0x48,0x0a,0x7e]
+
+v_floor_f32 v255, 0xaf123456
+// GFX12: v_floor_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_floor_f64 v[5:6], v[1:2]
+// GFX12: v_floor_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x35,0x0a,0x7e]
+
+v_floor_f64 v[5:6], v[254:255]
+// GFX12: v_floor_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x35,0x0a,0x7e]
+
+v_floor_f64 v[5:6], s[2:3]
+// GFX12: v_floor_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], s[104:105]
+// GFX12: v_floor_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], vcc
+// GFX12: v_floor_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], ttmp[14:15]
+// GFX12: v_floor_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], exec
+// GFX12: v_floor_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], null
+// GFX12: v_floor_f64_e32 v[5:6], null            ; encoding: [0x7c,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], -1
+// GFX12: v_floor_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], 0.5
+// GFX12: v_floor_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], src_scc
+// GFX12: v_floor_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x34,0x0a,0x7e]
+
+v_floor_f64 v[254:255], 0xaf123456
+// GFX12: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_fract_f16 v5, v1
+// GFX12: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5, v127
+// GFX12: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5, s1
+// GFX12: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, s105
+// GFX12: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_lo
+// GFX12: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_hi
+// GFX12: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, ttmp15
+// GFX12: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, m0
+// GFX12: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, exec_lo
+// GFX12: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, exec_hi
+// GFX12: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, null
+// GFX12: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, -1
+// GFX12: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, 0.5
+// GFX12: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, src_scc
+// GFX12: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
+
+v_fract_f16 v127, 0xfe0b
+// GFX12: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_fract_f32 v5, v1
+// GFX12: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
+
+v_fract_f32 v5, v255
+// GFX12: v_fract_f32_e32 v5, v255                ; encoding: [0xff,0x41,0x0a,0x7e]
+
+v_fract_f32 v5, s1
+// GFX12: v_fract_f32_e32 v5, s1                  ; encoding: [0x01,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, s105
+// GFX12: v_fract_f32_e32 v5, s105                ; encoding: [0x69,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, vcc_lo
+// GFX12: v_fract_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, vcc_hi
+// GFX12: v_fract_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, ttmp15
+// GFX12: v_fract_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, m0
+// GFX12: v_fract_f32_e32 v5, m0                  ; encoding: [0x7d,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, exec_lo
+// GFX12: v_fract_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, exec_hi
+// GFX12: v_fract_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, null
+// GFX12: v_fract_f32_e32 v5, null                ; encoding: [0x7c,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, -1
+// GFX12: v_fract_f32_e32 v5, -1                  ; encoding: [0xc1,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, 0.5
+// GFX12: v_fract_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, src_scc
+// GFX12: v_fract_f32_e32 v5, src_scc             ; encoding: [0xfd,0x40,0x0a,0x7e]
+
+v_fract_f32 v255, 0xaf123456
+// GFX12: v_fract_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_fract_f64 v[5:6], v[1:2]
+// GFX12: v_fract_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x7d,0x0a,0x7e]
+
+v_fract_f64 v[5:6], v[254:255]
+// GFX12: v_fract_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x7d,0x0a,0x7e]
+
+v_fract_f64 v[5:6], s[2:3]
+// GFX12: v_fract_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], s[104:105]
+// GFX12: v_fract_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], vcc
+// GFX12: v_fract_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], ttmp[14:15]
+// GFX12: v_fract_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], exec
+// GFX12: v_fract_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], null
+// GFX12: v_fract_f64_e32 v[5:6], null            ; encoding: [0x7c,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], -1
+// GFX12: v_fract_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], 0.5
+// GFX12: v_fract_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], src_scc
+// GFX12: v_fract_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[254:255], 0xaf123456
+// GFX12: v_fract_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_exp_i16_f16 v5, v1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v1          ; encoding: [0x01,0xb5,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, v127
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v127        ; encoding: [0x7f,0xb5,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, s1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s1          ; encoding: [0x01,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, s105
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s105        ; encoding: [0x69,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_lo
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_lo      ; encoding: [0x6a,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_hi
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_hi      ; encoding: [0x6b,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, ttmp15
+// GFX12: v_frexp_exp_i16_f16_e32 v5, ttmp15      ; encoding: [0x7b,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, m0
+// GFX12: v_frexp_exp_i16_f16_e32 v5, m0          ; encoding: [0x7d,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_lo
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_lo     ; encoding: [0x7e,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_hi
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_hi     ; encoding: [0x7f,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, null
+// GFX12: v_frexp_exp_i16_f16_e32 v5, null        ; encoding: [0x7c,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, -1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, -1          ; encoding: [0xc1,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, 0.5
+// GFX12: v_frexp_exp_i16_f16_e32 v5, 0.5         ; encoding: [0xf0,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, src_scc
+// GFX12: v_frexp_exp_i16_f16_e32 v5, src_scc     ; encoding: [0xfd,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v127, 0xfe0b
+// GFX12: v_frexp_exp_i16_f16_e32 v127, 0xfe0b    ; encoding: [0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_exp_i32_f32 v5, v1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v1          ; encoding: [0x01,0x7f,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, v255
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v255        ; encoding: [0xff,0x7f,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, s1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s1          ; encoding: [0x01,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, s105
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s105        ; encoding: [0x69,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, vcc_lo
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, vcc_hi
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, ttmp15
+// GFX12: v_frexp_exp_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, m0
+// GFX12: v_frexp_exp_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, exec_lo
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, exec_hi
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, null
+// GFX12: v_frexp_exp_i32_f32_e32 v5, null        ; encoding: [0x7c,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, -1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, 0.5
+// GFX12: v_frexp_exp_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, src_scc
+// GFX12: v_frexp_exp_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v255, 0xaf123456
+// GFX12: v_frexp_exp_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_exp_i32_f64 v5, v[1:2]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[1:2]      ; encoding: [0x01,0x79,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, v[254:255]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[254:255]  ; encoding: [0xfe,0x79,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, s[2:3]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[2:3]      ; encoding: [0x02,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, s[104:105]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[104:105]  ; encoding: [0x68,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, vcc
+// GFX12: v_frexp_exp_i32_f64_e32 v5, vcc         ; encoding: [0x6a,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, ttmp[14:15]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, exec
+// GFX12: v_frexp_exp_i32_f64_e32 v5, exec        ; encoding: [0x7e,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, null
+// GFX12: v_frexp_exp_i32_f64_e32 v5, null        ; encoding: [0x7c,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, -1
+// GFX12: v_frexp_exp_i32_f64_e32 v5, -1          ; encoding: [0xc1,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, 0.5
+// GFX12: v_frexp_exp_i32_f64_e32 v5, 0.5         ; encoding: [0xf0,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, src_scc
+// GFX12: v_frexp_exp_i32_f64_e32 v5, src_scc     ; encoding: [0xfd,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v255, 0xaf123456
+// GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_mant_f16 v5, v1
+// GFX12: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, v127
+// GFX12: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, s1
+// GFX12: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, s105
+// GFX12: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_lo
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_hi
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, ttmp15
+// GFX12: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, m0
+// GFX12: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_lo
+// GFX12: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_hi
+// GFX12: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, null
+// GFX12: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, -1
+// GFX12: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, 0.5
+// GFX12: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v127, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f32 v5, v1
+// GFX12: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, v255
+// GFX12: v_frexp_mant_f32_e32 v5, v255           ; encoding: [0xff,0x81,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, s1
+// GFX12: v_frexp_mant_f32_e32 v5, s1             ; encoding: [0x01,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, s105
+// GFX12: v_frexp_mant_f32_e32 v5, s105           ; encoding: [0x69,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, vcc_lo
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_lo         ; encoding: [0x6a,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, vcc_hi
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_hi         ; encoding: [0x6b,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, ttmp15
+// GFX12: v_frexp_mant_f32_e32 v5, ttmp15         ; encoding: [0x7b,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, m0
+// GFX12: v_frexp_mant_f32_e32 v5, m0             ; encoding: [0x7d,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, exec_lo
+// GFX12: v_frexp_mant_f32_e32 v5, exec_lo        ; encoding: [0x7e,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, exec_hi
+// GFX12: v_frexp_mant_f32_e32 v5, exec_hi        ; encoding: [0x7f,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, null
+// GFX12: v_frexp_mant_f32_e32 v5, null           ; encoding: [0x7c,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, -1
+// GFX12: v_frexp_mant_f32_e32 v5, -1             ; encoding: [0xc1,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, 0.5
+// GFX12: v_frexp_mant_f32_e32 v5, 0.5            ; encoding: [0xf0,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, src_scc
+// GFX12: v_frexp_mant_f32_e32 v5, src_scc        ; encoding: [0xfd,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v255, 0xaf123456
+// GFX12: v_frexp_mant_f32_e32 v255, 0xaf123456   ; encoding: [0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_mant_f64 v[5:6], v[1:2]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[1:2]     ; encoding: [0x01,0x7b,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], v[254:255]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x7b,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], s[2:3]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[2:3]     ; encoding: [0x02,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], s[104:105]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], vcc
+// GFX12: v_frexp_mant_f64_e32 v[5:6], vcc        ; encoding: [0x6a,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], ttmp[14:15]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], exec
+// GFX12: v_frexp_mant_f64_e32 v[5:6], exec       ; encoding: [0x7e,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], null
+// GFX12: v_frexp_mant_f64_e32 v[5:6], null       ; encoding: [0x7c,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], -1
+// GFX12: v_frexp_mant_f64_e32 v[5:6], -1         ; encoding: [0xc1,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], 0.5
+// GFX12: v_frexp_mant_f64_e32 v[5:6], 0.5        ; encoding: [0xf0,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], src_scc
+// GFX12: v_frexp_mant_f64_e32 v[5:6], src_scc    ; encoding: [0xfd,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[254:255], 0xaf123456
+// GFX12: v_frexp_mant_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_log_f16 v5, v1
+// GFX12: v_log_f16_e32 v5, v1                    ; encoding: [0x01,0xaf,0x0a,0x7e]
+
+v_log_f16 v5, v127
+// GFX12: v_log_f16_e32 v5, v127                  ; encoding: [0x7f,0xaf,0x0a,0x7e]
+
+v_log_f16 v5, s1
+// GFX12: v_log_f16_e32 v5, s1                    ; encoding: [0x01,0xae,0x0a,0x7e]
+
+v_log_f16 v5, s105
+// GFX12: v_log_f16_e32 v5, s105                  ; encoding: [0x69,0xae,0x0a,0x7e]
+
+v_log_f16 v5, vcc_lo
+// GFX12: v_log_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xae,0x0a,0x7e]
+
+v_log_f16 v5, vcc_hi
+// GFX12: v_log_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xae,0x0a,0x7e]
+
+v_log_f16 v5, ttmp15
+// GFX12: v_log_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xae,0x0a,0x7e]
+
+v_log_f16 v5, m0
+// GFX12: v_log_f16_e32 v5, m0                    ; encoding: [0x7d,0xae,0x0a,0x7e]
+
+v_log_f16 v5, exec_lo
+// GFX12: v_log_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xae,0x0a,0x7e]
+
+v_log_f16 v5, exec_hi
+// GFX12: v_log_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xae,0x0a,0x7e]
+
+v_log_f16 v5, null
+// GFX12: v_log_f16_e32 v5, null                  ; encoding: [0x7c,0xae,0x0a,0x7e]
+
+v_log_f16 v5, -1
+// GFX12: v_log_f16_e32 v5, -1                    ; encoding: [0xc1,0xae,0x0a,0x7e]
+
+v_log_f16 v5, 0.5
+// GFX12: v_log_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xae,0x0a,0x7e]
+
+v_log_f16 v5, src_scc
+// GFX12: v_log_f16_e32 v5, src_scc               ; encoding: [0xfd,0xae,0x0a,0x7e]
+
+v_log_f16 v127, 0xfe0b
+// GFX12: v_log_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_log_f32 v5, v1
+// GFX12: v_log_f32_e32 v5, v1                    ; encoding: [0x01,0x4f,0x0a,0x7e]
+
+v_log_f32 v5, v255
+// GFX12: v_log_f32_e32 v5, v255                  ; encoding: [0xff,0x4f,0x0a,0x7e]
+
+v_log_f32 v5, s1
+// GFX12: v_log_f32_e32 v5, s1                    ; encoding: [0x01,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, s105
+// GFX12: v_log_f32_e32 v5, s105                  ; encoding: [0x69,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, vcc_lo
+// GFX12: v_log_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, vcc_hi
+// GFX12: v_log_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, ttmp15
+// GFX12: v_log_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, m0
+// GFX12: v_log_f32_e32 v5, m0                    ; encoding: [0x7d,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, exec_lo
+// GFX12: v_log_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, exec_hi
+// GFX12: v_log_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, null
+// GFX12: v_log_f32_e32 v5, null                  ; encoding: [0x7c,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, -1
+// GFX12: v_log_f32_e32 v5, -1                    ; encoding: [0xc1,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, 0.5
+// GFX12: v_log_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, src_scc
+// GFX12: v_log_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4e,0x0a,0x7e]
+
+v_log_f32 v255, 0xaf123456
+// GFX12: v_log_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_mov_b32 v5, v1
+// GFX12: v_mov_b32_e32 v5, v1                    ; encoding: [0x01,0x03,0x0a,0x7e]
+
+v_mov_b32 v5, v255
+// GFX12: v_mov_b32_e32 v5, v255                  ; encoding: [0xff,0x03,0x0a,0x7e]
+
+v_mov_b32 v5, s1
+// GFX12: v_mov_b32_e32 v5, s1                    ; encoding: [0x01,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, s105
+// GFX12: v_mov_b32_e32 v5, s105                  ; encoding: [0x69,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, vcc_lo
+// GFX12: v_mov_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, vcc_hi
+// GFX12: v_mov_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, ttmp15
+// GFX12: v_mov_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, m0
+// GFX12: v_mov_b32_e32 v5, m0                    ; encoding: [0x7d,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, exec_lo
+// GFX12: v_mov_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, exec_hi
+// GFX12: v_mov_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, null
+// GFX12: v_mov_b32_e32 v5, null                  ; encoding: [0x7c,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, -1
+// GFX12: v_mov_b32_e32 v5, -1                    ; encoding: [0xc1,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, 0.5
+// GFX12: v_mov_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, src_scc
+// GFX12: v_mov_b32_e32 v5, src_scc               ; encoding: [0xfd,0x02,0x0a,0x7e]
+
+v_mov_b32 v255, 0xaf123456
+// GFX12: v_mov_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_movreld_b32 v5, v1
+// GFX12: v_movreld_b32_e32 v5, v1                ; encoding: [0x01,0x85,0x0a,0x7e]
+
+v_movreld_b32 v5, v255
+// GFX12: v_movreld_b32_e32 v5, v255              ; encoding: [0xff,0x85,0x0a,0x7e]
+
+v_movreld_b32 v5, s1
+// GFX12: v_movreld_b32_e32 v5, s1                ; encoding: [0x01,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, s105
+// GFX12: v_movreld_b32_e32 v5, s105              ; encoding: [0x69,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, vcc_lo
+// GFX12: v_movreld_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, vcc_hi
+// GFX12: v_movreld_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, ttmp15
+// GFX12: v_movreld_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, m0
+// GFX12: v_movreld_b32_e32 v5, m0                ; encoding: [0x7d,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, exec_lo
+// GFX12: v_movreld_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, exec_hi
+// GFX12: v_movreld_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, null
+// GFX12: v_movreld_b32_e32 v5, null              ; encoding: [0x7c,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, -1
+// GFX12: v_movreld_b32_e32 v5, -1                ; encoding: [0xc1,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, 0.5
+// GFX12: v_movreld_b32_e32 v5, 0.5               ; encoding: [0xf0,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, src_scc
+// GFX12: v_movreld_b32_e32 v5, src_scc           ; encoding: [0xfd,0x84,0x0a,0x7e]
+
+v_movreld_b32 v255, 0xaf123456
+// GFX12: v_movreld_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_movrels_b32 v5, v1
+// GFX12: v_movrels_b32_e32 v5, v1                ; encoding: [0x01,0x87,0x0a,0x7e]
+
+v_movrels_b32 v255, v255
+// GFX12: v_movrels_b32_e32 v255, v255            ; encoding: [0xff,0x87,0xfe,0x7f]
+
+v_movrelsd_2_b32 v5, v1
+// GFX12: v_movrelsd_2_b32_e32 v5, v1             ; encoding: [0x01,0x91,0x0a,0x7e]
+
+v_movrelsd_2_b32 v255, v255
+// GFX12: v_movrelsd_2_b32_e32 v255, v255         ; encoding: [0xff,0x91,0xfe,0x7f]
+
+v_movrelsd_b32 v5, v1
+// GFX12: v_movrelsd_b32_e32 v5, v1               ; encoding: [0x01,0x89,0x0a,0x7e]
+
+v_movrelsd_b32 v255, v255
+// GFX12: v_movrelsd_b32_e32 v255, v255           ; encoding: [0xff,0x89,0xfe,0x7f]
+
+v_nop
+// GFX12: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
+
+v_not_b16 v5, v1
+// GFX12: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5, v127
+// GFX12: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
+
+v_not_b16 v5, s1
+// GFX12: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, s105
+// GFX12: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, vcc_lo
+// GFX12: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, vcc_hi
+// GFX12: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, ttmp15
+// GFX12: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, m0
+// GFX12: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, exec_lo
+// GFX12: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, exec_hi
+// GFX12: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, null
+// GFX12: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, -1
+// GFX12: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, 0.5
+// GFX12-ASM: v_not_b16_e32 v5, 0.5                   ; encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-DIS: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_not_b16 v5, src_scc
+// GFX12: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
+
+v_not_b16 v127, 0xfe0b
+// GFX12: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_not_b32 v5, v1
+// GFX12: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
+
+v_not_b32 v5, v255
+// GFX12: v_not_b32_e32 v5, v255                  ; encoding: [0xff,0x6f,0x0a,0x7e]
+
+v_not_b32 v5, s1
+// GFX12: v_not_b32_e32 v5, s1                    ; encoding: [0x01,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, s105
+// GFX12: v_not_b32_e32 v5, s105                  ; encoding: [0x69,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, vcc_lo
+// GFX12: v_not_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, vcc_hi
+// GFX12: v_not_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, ttmp15
+// GFX12: v_not_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, m0
+// GFX12: v_not_b32_e32 v5, m0                    ; encoding: [0x7d,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, exec_lo
+// GFX12: v_not_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, exec_hi
+// GFX12: v_not_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, null
+// GFX12: v_not_b32_e32 v5, null                  ; encoding: [0x7c,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, -1
+// GFX12: v_not_b32_e32 v5, -1                    ; encoding: [0xc1,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, 0.5
+// GFX12: v_not_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, src_scc
+// GFX12: v_not_b32_e32 v5, src_scc               ; encoding: [0xfd,0x6e,0x0a,0x7e]
+
+v_not_b32 v255, 0xaf123456
+// GFX12: v_not_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_permlane64_b32 v5, v1
+// GFX12: v_permlane64_b32 v5, v1                 ; encoding: [0x01,0xcf,0x0a,0x7e]
+
+v_permlane64_b32 v255, v255
+// GFX12: v_permlane64_b32 v255, v255             ; encoding: [0xff,0xcf,0xfe,0x7f]
+
+v_pipeflush
+// GFX12: v_pipeflush                             ; encoding: [0x00,0x36,0x00,0x7e]
+
+v_rcp_f16 v5, v1
+// GFX12: v_rcp_f16_e32 v5, v1                    ; encoding: [0x01,0xa9,0x0a,0x7e]
+
+v_rcp_f16 v5, v127
+// GFX12: v_rcp_f16_e32 v5, v127                  ; encoding: [0x7f,0xa9,0x0a,0x7e]
+
+v_rcp_f16 v5, s1
+// GFX12: v_rcp_f16_e32 v5, s1                    ; encoding: [0x01,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, s105
+// GFX12: v_rcp_f16_e32 v5, s105                  ; encoding: [0x69,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_lo
+// GFX12: v_rcp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_hi
+// GFX12: v_rcp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, ttmp15
+// GFX12: v_rcp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, m0
+// GFX12: v_rcp_f16_e32 v5, m0                    ; encoding: [0x7d,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_lo
+// GFX12: v_rcp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_hi
+// GFX12: v_rcp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, null
+// GFX12: v_rcp_f16_e32 v5, null                  ; encoding: [0x7c,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, -1
+// GFX12: v_rcp_f16_e32 v5, -1                    ; encoding: [0xc1,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, 0.5
+// GFX12: v_rcp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, src_scc
+// GFX12: v_rcp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v127, 0xfe0b
+// GFX12: v_rcp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rcp_f32 v5, v1
+// GFX12: v_rcp_f32_e32 v5, v1                    ; encoding: [0x01,0x55,0x0a,0x7e]
+
+v_rcp_f32 v5, v255
+// GFX12: v_rcp_f32_e32 v5, v255                  ; encoding: [0xff,0x55,0x0a,0x7e]
+
+v_rcp_f32 v5, s1
+// GFX12: v_rcp_f32_e32 v5, s1                    ; encoding: [0x01,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, s105
+// GFX12: v_rcp_f32_e32 v5, s105                  ; encoding: [0x69,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, vcc_lo
+// GFX12: v_rcp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, vcc_hi
+// GFX12: v_rcp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, ttmp15
+// GFX12: v_rcp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, m0
+// GFX12: v_rcp_f32_e32 v5, m0                    ; encoding: [0x7d,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, exec_lo
+// GFX12: v_rcp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, exec_hi
+// GFX12: v_rcp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, null
+// GFX12: v_rcp_f32_e32 v5, null                  ; encoding: [0x7c,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, -1
+// GFX12: v_rcp_f32_e32 v5, -1                    ; encoding: [0xc1,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, 0.5
+// GFX12: v_rcp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, src_scc
+// GFX12: v_rcp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x54,0x0a,0x7e]
+
+v_rcp_f32 v255, 0xaf123456
+// GFX12: v_rcp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rcp_f64 v[5:6], v[1:2]
+// GFX12: v_rcp_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x5f,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], v[254:255]
+// GFX12: v_rcp_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x5f,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], s[2:3]
+// GFX12: v_rcp_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], s[104:105]
+// GFX12: v_rcp_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], vcc
+// GFX12: v_rcp_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rcp_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], exec
+// GFX12: v_rcp_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], null
+// GFX12: v_rcp_f64_e32 v[5:6], null              ; encoding: [0x7c,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], -1
+// GFX12: v_rcp_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], 0.5
+// GFX12: v_rcp_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], src_scc
+// GFX12: v_rcp_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[254:255], 0xaf123456
+// GFX12: v_rcp_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rcp_iflag_f32 v5, v1
+// GFX12: v_rcp_iflag_f32_e32 v5, v1              ; encoding: [0x01,0x57,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, v255
+// GFX12: v_rcp_iflag_f32_e32 v5, v255            ; encoding: [0xff,0x57,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, s1
+// GFX12: v_rcp_iflag_f32_e32 v5, s1              ; encoding: [0x01,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, s105
+// GFX12: v_rcp_iflag_f32_e32 v5, s105            ; encoding: [0x69,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, vcc_lo
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_lo          ; encoding: [0x6a,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, vcc_hi
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_hi          ; encoding: [0x6b,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, ttmp15
+// GFX12: v_rcp_iflag_f32_e32 v5, ttmp15          ; encoding: [0x7b,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, m0
+// GFX12: v_rcp_iflag_f32_e32 v5, m0              ; encoding: [0x7d,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, exec_lo
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_lo         ; encoding: [0x7e,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, exec_hi
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_hi         ; encoding: [0x7f,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, null
+// GFX12: v_rcp_iflag_f32_e32 v5, null            ; encoding: [0x7c,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, -1
+// GFX12: v_rcp_iflag_f32_e32 v5, -1              ; encoding: [0xc1,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, 0.5
+// GFX12: v_rcp_iflag_f32_e32 v5, 0.5             ; encoding: [0xf0,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, src_scc
+// GFX12: v_rcp_iflag_f32_e32 v5, src_scc         ; encoding: [0xfd,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v255, 0xaf123456
+// GFX12: v_rcp_iflag_f32_e32 v255, 0xaf123456    ; encoding: [0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_readfirstlane_b32 s5, v1
+// GFX12: v_readfirstlane_b32 s5, v1              ; encoding: [0x01,0x05,0x0a,0x7e]
+
+v_readfirstlane_b32 s105, v1
+// GFX12: v_readfirstlane_b32 s105, v1            ; encoding: [0x01,0x05,0xd2,0x7e]
+
+v_readfirstlane_b32 vcc_lo, v1
+// GFX12: v_readfirstlane_b32 vcc_lo, v1          ; encoding: [0x01,0x05,0xd4,0x7e]
+
+v_readfirstlane_b32 vcc_hi, v1
+// GFX12: v_readfirstlane_b32 vcc_hi, v1          ; encoding: [0x01,0x05,0xd6,0x7e]
+
+v_readfirstlane_b32 ttmp15, v1
+// GFX12: v_readfirstlane_b32 ttmp15, v1          ; encoding: [0x01,0x05,0xf6,0x7e]
+
+v_readfirstlane_b32 null, v255
+// GFX12: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
+
+v_rndne_f16 v5, v1
+// GFX12: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5, v127
+// GFX12: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5, s1
+// GFX12: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, s105
+// GFX12: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_lo
+// GFX12: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_hi
+// GFX12: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, ttmp15
+// GFX12: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, m0
+// GFX12: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_lo
+// GFX12: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_hi
+// GFX12: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, null
+// GFX12: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, -1
+// GFX12: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, 0.5
+// GFX12: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, src_scc
+// GFX12: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v127, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f32 v5, v1
+// GFX12: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
+
+v_rndne_f32 v5, v255
+// GFX12: v_rndne_f32_e32 v5, v255                ; encoding: [0xff,0x47,0x0a,0x7e]
+
+v_rndne_f32 v5, s1
+// GFX12: v_rndne_f32_e32 v5, s1                  ; encoding: [0x01,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, s105
+// GFX12: v_rndne_f32_e32 v5, s105                ; encoding: [0x69,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, vcc_lo
+// GFX12: v_rndne_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, vcc_hi
+// GFX12: v_rndne_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, ttmp15
+// GFX12: v_rndne_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, m0
+// GFX12: v_rndne_f32_e32 v5, m0                  ; encoding: [0x7d,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, exec_lo
+// GFX12: v_rndne_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, exec_hi
+// GFX12: v_rndne_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, null
+// GFX12: v_rndne_f32_e32 v5, null                ; encoding: [0x7c,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, -1
+// GFX12: v_rndne_f32_e32 v5, -1                  ; encoding: [0xc1,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, 0.5
+// GFX12: v_rndne_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, src_scc
+// GFX12: v_rndne_f32_e32 v5, src_scc             ; encoding: [0xfd,0x46,0x0a,0x7e]
+
+v_rndne_f32 v255, 0xaf123456
+// GFX12: v_rndne_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rndne_f64 v[5:6], v[1:2]
+// GFX12: v_rndne_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x33,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], v[254:255]
+// GFX12: v_rndne_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x33,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], s[2:3]
+// GFX12: v_rndne_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], s[104:105]
+// GFX12: v_rndne_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], vcc
+// GFX12: v_rndne_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rndne_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], exec
+// GFX12: v_rndne_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], null
+// GFX12: v_rndne_f64_e32 v[5:6], null            ; encoding: [0x7c,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], -1
+// GFX12: v_rndne_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], 0.5
+// GFX12: v_rndne_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], src_scc
+// GFX12: v_rndne_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[254:255], 0xaf123456
+// GFX12: v_rndne_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rsq_f16 v5, v1
+// GFX12: v_rsq_f16_e32 v5, v1                    ; encoding: [0x01,0xad,0x0a,0x7e]
+
+v_rsq_f16 v5, v127
+// GFX12: v_rsq_f16_e32 v5, v127                  ; encoding: [0x7f,0xad,0x0a,0x7e]
+
+v_rsq_f16 v5, s1
+// GFX12: v_rsq_f16_e32 v5, s1                    ; encoding: [0x01,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, s105
+// GFX12: v_rsq_f16_e32 v5, s105                  ; encoding: [0x69,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_lo
+// GFX12: v_rsq_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_hi
+// GFX12: v_rsq_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, ttmp15
+// GFX12: v_rsq_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, m0
+// GFX12: v_rsq_f16_e32 v5, m0                    ; encoding: [0x7d,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_lo
+// GFX12: v_rsq_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_hi
+// GFX12: v_rsq_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, null
+// GFX12: v_rsq_f16_e32 v5, null                  ; encoding: [0x7c,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, -1
+// GFX12: v_rsq_f16_e32 v5, -1                    ; encoding: [0xc1,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, 0.5
+// GFX12: v_rsq_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, src_scc
+// GFX12: v_rsq_f16_e32 v5, src_scc               ; encoding: [0xfd,0xac,0x0a,0x7e]
+
+v_rsq_f16 v127, 0xfe0b
+// GFX12: v_rsq_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rsq_f32 v5, v1
+// GFX12: v_rsq_f32_e32 v5, v1                    ; encoding: [0x01,0x5d,0x0a,0x7e]
+
+v_rsq_f32 v5, v255
+// GFX12: v_rsq_f32_e32 v5, v255                  ; encoding: [0xff,0x5d,0x0a,0x7e]
+
+v_rsq_f32 v5, s1
+// GFX12: v_rsq_f32_e32 v5, s1                    ; encoding: [0x01,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, s105
+// GFX12: v_rsq_f32_e32 v5, s105                  ; encoding: [0x69,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, vcc_lo
+// GFX12: v_rsq_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, vcc_hi
+// GFX12: v_rsq_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, ttmp15
+// GFX12: v_rsq_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, m0
+// GFX12: v_rsq_f32_e32 v5, m0                    ; encoding: [0x7d,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, exec_lo
+// GFX12: v_rsq_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, exec_hi
+// GFX12: v_rsq_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, null
+// GFX12: v_rsq_f32_e32 v5, null                  ; encoding: [0x7c,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, -1
+// GFX12: v_rsq_f32_e32 v5, -1                    ; encoding: [0xc1,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, 0.5
+// GFX12: v_rsq_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, src_scc
+// GFX12: v_rsq_f32_e32 v5, src_scc               ; encoding: [0xfd,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v255, 0xaf123456
+// GFX12: v_rsq_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rsq_f64 v[5:6], v[1:2]
+// GFX12: v_rsq_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x63,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], v[254:255]
+// GFX12: v_rsq_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x63,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], s[2:3]
+// GFX12: v_rsq_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], s[104:105]
+// GFX12: v_rsq_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], vcc
+// GFX12: v_rsq_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rsq_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], exec
+// GFX12: v_rsq_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], null
+// GFX12: v_rsq_f64_e32 v[5:6], null              ; encoding: [0x7c,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], -1
+// GFX12: v_rsq_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], 0.5
+// GFX12: v_rsq_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], src_scc
+// GFX12: v_rsq_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[254:255], 0xaf123456
+// GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sat_pk_u8_i16 v5, v1
+// GFX12: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, v255
+// GFX12: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, s1
+// GFX12: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, s105
+// GFX12: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, vcc_lo
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, vcc_hi
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, ttmp15
+// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, m0
+// GFX12: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, exec_lo
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, exec_hi
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, null
+// GFX12: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, -1
+// GFX12: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, 0.5
+// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, src_scc
+// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v127, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5, v1
+// GFX12: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5, v127
+// GFX12: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5, s1
+// GFX12: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, s105
+// GFX12: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_lo
+// GFX12: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_hi
+// GFX12: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, ttmp15
+// GFX12: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, m0
+// GFX12: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, exec_lo
+// GFX12: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, exec_hi
+// GFX12: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, null
+// GFX12: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, -1
+// GFX12: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, 0.5
+// GFX12: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, src_scc
+// GFX12: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
+
+v_sin_f16 v127, 0xfe0b
+// GFX12: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f32 v5, v1
+// GFX12: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
+
+v_sin_f32 v5, v255
+// GFX12: v_sin_f32_e32 v5, v255                  ; encoding: [0xff,0x6b,0x0a,0x7e]
+
+v_sin_f32 v5, s1
+// GFX12: v_sin_f32_e32 v5, s1                    ; encoding: [0x01,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, s105
+// GFX12: v_sin_f32_e32 v5, s105                  ; encoding: [0x69,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, vcc_lo
+// GFX12: v_sin_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, vcc_hi
+// GFX12: v_sin_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, ttmp15
+// GFX12: v_sin_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, m0
+// GFX12: v_sin_f32_e32 v5, m0                    ; encoding: [0x7d,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, exec_lo
+// GFX12: v_sin_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, exec_hi
+// GFX12: v_sin_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, null
+// GFX12: v_sin_f32_e32 v5, null                  ; encoding: [0x7c,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, -1
+// GFX12: v_sin_f32_e32 v5, -1                    ; encoding: [0xc1,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, 0.5
+// GFX12: v_sin_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, src_scc
+// GFX12: v_sin_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6a,0x0a,0x7e]
+
+v_sin_f32 v255, 0xaf123456
+// GFX12: v_sin_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sqrt_f16 v5, v1
+// GFX12: v_sqrt_f16_e32 v5, v1                   ; encoding: [0x01,0xab,0x0a,0x7e]
+
+v_sqrt_f16 v5, v127
+// GFX12: v_sqrt_f16_e32 v5, v127                 ; encoding: [0x7f,0xab,0x0a,0x7e]
+
+v_sqrt_f16 v5, s1
+// GFX12: v_sqrt_f16_e32 v5, s1                   ; encoding: [0x01,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, s105
+// GFX12: v_sqrt_f16_e32 v5, s105                 ; encoding: [0x69,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_lo
+// GFX12: v_sqrt_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_hi
+// GFX12: v_sqrt_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, ttmp15
+// GFX12: v_sqrt_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, m0
+// GFX12: v_sqrt_f16_e32 v5, m0                   ; encoding: [0x7d,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_lo
+// GFX12: v_sqrt_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_hi
+// GFX12: v_sqrt_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, null
+// GFX12: v_sqrt_f16_e32 v5, null                 ; encoding: [0x7c,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, -1
+// GFX12: v_sqrt_f16_e32 v5, -1                   ; encoding: [0xc1,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, 0.5
+// GFX12: v_sqrt_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, src_scc
+// GFX12: v_sqrt_f16_e32 v5, src_scc              ; encoding: [0xfd,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v127, 0xfe0b
+// GFX12: v_sqrt_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sqrt_f32 v5, v1
+// GFX12: v_sqrt_f32_e32 v5, v1                   ; encoding: [0x01,0x67,0x0a,0x7e]
+
+v_sqrt_f32 v5, v255
+// GFX12: v_sqrt_f32_e32 v5, v255                 ; encoding: [0xff,0x67,0x0a,0x7e]
+
+v_sqrt_f32 v5, s1
+// GFX12: v_sqrt_f32_e32 v5, s1                   ; encoding: [0x01,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, s105
+// GFX12: v_sqrt_f32_e32 v5, s105                 ; encoding: [0x69,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, vcc_lo
+// GFX12: v_sqrt_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, vcc_hi
+// GFX12: v_sqrt_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, ttmp15
+// GFX12: v_sqrt_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, m0
+// GFX12: v_sqrt_f32_e32 v5, m0                   ; encoding: [0x7d,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, exec_lo
+// GFX12: v_sqrt_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, exec_hi
+// GFX12: v_sqrt_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, null
+// GFX12: v_sqrt_f32_e32 v5, null                 ; encoding: [0x7c,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, -1
+// GFX12: v_sqrt_f32_e32 v5, -1                   ; encoding: [0xc1,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, 0.5
+// GFX12: v_sqrt_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, src_scc
+// GFX12: v_sqrt_f32_e32 v5, src_scc              ; encoding: [0xfd,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v255, 0xaf123456
+// GFX12: v_sqrt_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sqrt_f64 v[5:6], v[1:2]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x69,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], v[254:255]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x69,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], s[2:3]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], s[104:105]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], vcc
+// GFX12: v_sqrt_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], ttmp[14:15]
+// GFX12: v_sqrt_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], exec
+// GFX12: v_sqrt_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], null
+// GFX12: v_sqrt_f64_e32 v[5:6], null             ; encoding: [0x7c,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], -1
+// GFX12: v_sqrt_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], 0.5
+// GFX12: v_sqrt_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], src_scc
+// GFX12: v_sqrt_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[254:255], 0xaf123456
+// GFX12: v_sqrt_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_swap_b32 v5, v1
+// GFX12: v_swap_b32 v5, v1                       ; encoding: [0x01,0xcb,0x0a,0x7e]
+
+v_swap_b32 v255, v255
+// GFX12: v_swap_b32 v255, v255                   ; encoding: [0xff,0xcb,0xfe,0x7f]
+
+v_swaprel_b32 v5, v1
+// GFX12: v_swaprel_b32 v5, v1                    ; encoding: [0x01,0xd1,0x0a,0x7e]
+
+v_swaprel_b32 v255, v255
+// GFX12: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
+
+v_trunc_f16 v5, v1
+// GFX12: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5, v127
+// GFX12: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5, s1
+// GFX12: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, s105
+// GFX12: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_lo
+// GFX12: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_hi
+// GFX12: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, ttmp15
+// GFX12: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, m0
+// GFX12: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_lo
+// GFX12: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_hi
+// GFX12: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, null
+// GFX12: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, -1
+// GFX12: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, 0.5
+// GFX12: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, src_scc
+// GFX12: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
+
+v_trunc_f16 v127, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f32 v5, v1
+// GFX12: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
+
+v_trunc_f32 v5, v255
+// GFX12: v_trunc_f32_e32 v5, v255                ; encoding: [0xff,0x43,0x0a,0x7e]
+
+v_trunc_f32 v5, s1
+// GFX12: v_trunc_f32_e32 v5, s1                  ; encoding: [0x01,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, s105
+// GFX12: v_trunc_f32_e32 v5, s105                ; encoding: [0x69,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, vcc_lo
+// GFX12: v_trunc_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, vcc_hi
+// GFX12: v_trunc_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, ttmp15
+// GFX12: v_trunc_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, m0
+// GFX12: v_trunc_f32_e32 v5, m0                  ; encoding: [0x7d,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, exec_lo
+// GFX12: v_trunc_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, exec_hi
+// GFX12: v_trunc_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, null
+// GFX12: v_trunc_f32_e32 v5, null                ; encoding: [0x7c,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, -1
+// GFX12: v_trunc_f32_e32 v5, -1                  ; encoding: [0xc1,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, 0.5
+// GFX12: v_trunc_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, src_scc
+// GFX12: v_trunc_f32_e32 v5, src_scc             ; encoding: [0xfd,0x42,0x0a,0x7e]
+
+v_trunc_f32 v255, 0xaf123456
+// GFX12: v_trunc_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_trunc_f64 v[5:6], v[1:2]
+// GFX12: v_trunc_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x2f,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], v[254:255]
+// GFX12: v_trunc_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x2f,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], s[2:3]
+// GFX12: v_trunc_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], s[104:105]
+// GFX12: v_trunc_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], vcc
+// GFX12: v_trunc_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], ttmp[14:15]
+// GFX12: v_trunc_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], exec
+// GFX12: v_trunc_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], null
+// GFX12: v_trunc_f64_e32 v[5:6], null            ; encoding: [0x7c,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], -1
+// GFX12: v_trunc_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], 0.5
+// GFX12: v_trunc_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], src_scc
+// GFX12: v_trunc_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[254:255], 0xaf123456
+// GFX12: v_trunc_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index 7021bd4dcdcc9..d2fe7b3d8a95a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -1,3590 +1,3608 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+
+// this file will be converted to true16 format when more true16 instructions are supported
 
 v_bfrev_b32_e32 v5, v1
-// GFX12: encoding: [0x01,0x71,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, v1                  ; encoding: [0x01,0x71,0x0a,0x7e]
 
 v_bfrev_b32 v5, v255
-// GFX12: encoding: [0xff,0x71,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, v255                ; encoding: [0xff,0x71,0x0a,0x7e]
 
 v_bfrev_b32 v5, s1
-// GFX12: encoding: [0x01,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, s1                  ; encoding: [0x01,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, s105
-// GFX12: encoding: [0x69,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, s105                ; encoding: [0x69,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, vcc_lo              ; encoding: [0x6a,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, vcc_hi              ; encoding: [0x6b,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, ttmp15              ; encoding: [0x7b,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, m0
-// GFX12: encoding: [0x7d,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, m0                  ; encoding: [0x7d,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, exec_lo             ; encoding: [0x7e,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, exec_hi             ; encoding: [0x7f,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, null
-// GFX12: encoding: [0x7c,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, null                ; encoding: [0x7c,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, -1
-// GFX12: encoding: [0xc1,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, -1                  ; encoding: [0xc1,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, 0.5                 ; encoding: [0xf0,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x70,0x0a,0x7e]
+// GFX12: v_bfrev_b32_e32 v5, src_scc             ; encoding: [0xfd,0x70,0x0a,0x7e]
 
 v_bfrev_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_bfrev_b32_e32 v255, 0xaf123456        ; encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ceil_f16 v5, v1
-// GFX12: encoding: [0x01,0xb9,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, v1                   ; encoding: [0x01,0xb9,0x0a,0x7e]
 
 v_ceil_f16 v5, v127
-// GFX12: encoding: [0x7f,0xb9,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, v127                 ; encoding: [0x7f,0xb9,0x0a,0x7e]
 
 v_ceil_f16 v5, s1
-// GFX12: encoding: [0x01,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, s1                   ; encoding: [0x01,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, s105
-// GFX12: encoding: [0x69,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, s105                 ; encoding: [0x69,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, m0
-// GFX12: encoding: [0x7d,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, m0                   ; encoding: [0x7d,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, null
-// GFX12: encoding: [0x7c,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, null                 ; encoding: [0x7c,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, -1
-// GFX12: encoding: [0xc1,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, -1                   ; encoding: [0xc1,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xb8,0x0a,0x7e]
+// GFX12: v_ceil_f16_e32 v5, src_scc              ; encoding: [0xfd,0xb8,0x0a,0x7e]
 
 v_ceil_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_ceil_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_ceil_f32 v5, v1
-// GFX12: encoding: [0x01,0x45,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, v1                   ; encoding: [0x01,0x45,0x0a,0x7e]
 
 v_ceil_f32 v5, v255
-// GFX12: encoding: [0xff,0x45,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, v255                 ; encoding: [0xff,0x45,0x0a,0x7e]
 
 v_ceil_f32 v5, s1
-// GFX12: encoding: [0x01,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, s1                   ; encoding: [0x01,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, s105
-// GFX12: encoding: [0x69,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, s105                 ; encoding: [0x69,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, m0
-// GFX12: encoding: [0x7d,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, m0                   ; encoding: [0x7d,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, null
-// GFX12: encoding: [0x7c,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, null                 ; encoding: [0x7c,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, -1
-// GFX12: encoding: [0xc1,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, -1                   ; encoding: [0xc1,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x44,0x0a,0x7e]
 
 v_ceil_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x44,0x0a,0x7e]
+// GFX12: v_ceil_f32_e32 v5, src_scc              ; encoding: [0xfd,0x44,0x0a,0x7e]
 
 v_ceil_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_ceil_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ceil_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x31,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x31,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x31,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x31,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], null             ; encoding: [0x7c,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x30,0x0a,0x7e]
+// GFX12: v_ceil_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x30,0x0a,0x7e]
 
 v_ceil_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_ceil_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cls_i32 v5, v1
-// GFX12: encoding: [0x01,0x77,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, v1                    ; encoding: [0x01,0x77,0x0a,0x7e]
 
 v_cls_i32 v5, v255
-// GFX12: encoding: [0xff,0x77,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, v255                  ; encoding: [0xff,0x77,0x0a,0x7e]
 
 v_cls_i32 v5, s1
-// GFX12: encoding: [0x01,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, s1                    ; encoding: [0x01,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, s105
-// GFX12: encoding: [0x69,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, s105                  ; encoding: [0x69,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, vcc_lo                ; encoding: [0x6a,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, vcc_hi                ; encoding: [0x6b,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, ttmp15                ; encoding: [0x7b,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, m0
-// GFX12: encoding: [0x7d,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, m0                    ; encoding: [0x7d,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, exec_lo               ; encoding: [0x7e,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, exec_hi               ; encoding: [0x7f,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, null
-// GFX12: encoding: [0x7c,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, null                  ; encoding: [0x7c,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, -1
-// GFX12: encoding: [0xc1,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, -1                    ; encoding: [0xc1,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, 0.5
-// GFX12: encoding: [0xf0,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, 0.5                   ; encoding: [0xf0,0x76,0x0a,0x7e]
 
 v_cls_i32 v5, src_scc
-// GFX12: encoding: [0xfd,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, src_scc               ; encoding: [0xfd,0x76,0x0a,0x7e]
 
 v_cls_i32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cls_i32_e32 v255, 0xaf123456          ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_clz_i32_u32 v5, v1
-// GFX12: encoding: [0x01,0x73,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, v1                ; encoding: [0x01,0x73,0x0a,0x7e]
 
 v_clz_i32_u32 v5, v255
-// GFX12: encoding: [0xff,0x73,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, v255              ; encoding: [0xff,0x73,0x0a,0x7e]
 
 v_clz_i32_u32 v5, s1
-// GFX12: encoding: [0x01,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, s1                ; encoding: [0x01,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, s105
-// GFX12: encoding: [0x69,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, s105              ; encoding: [0x69,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, m0
-// GFX12: encoding: [0x7d,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, m0                ; encoding: [0x7d,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, null
-// GFX12: encoding: [0x7c,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, null              ; encoding: [0x7c,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, -1
-// GFX12: encoding: [0xc1,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, -1                ; encoding: [0xc1,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, 0.5
-// GFX12: encoding: [0xf0,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v5, src_scc
-// GFX12: encoding: [0xfd,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x72,0x0a,0x7e]
 
 v_clz_i32_u32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cos_f16 v5, v1
-// GFX12: encoding: [0x01,0xc3,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
 
 v_cos_f16 v5, v127
-// GFX12: encoding: [0x7f,0xc3,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
 v_cos_f16 v5, s1
-// GFX12: encoding: [0x01,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, s105
-// GFX12: encoding: [0x69,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, m0
-// GFX12: encoding: [0x7d,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, null
-// GFX12: encoding: [0x7c,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, -1
-// GFX12: encoding: [0xc1,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
 v_cos_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xc2,0x0a,0x7e]
+// GFX12: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
 v_cos_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32 v5, v1
-// GFX12: encoding: [0x01,0x6d,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
 
 v_cos_f32 v5, v255
-// GFX12: encoding: [0xff,0x6d,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, v255                  ; encoding: [0xff,0x6d,0x0a,0x7e]
 
 v_cos_f32 v5, s1
-// GFX12: encoding: [0x01,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, s1                    ; encoding: [0x01,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, s105
-// GFX12: encoding: [0x69,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, s105                  ; encoding: [0x69,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, m0
-// GFX12: encoding: [0x7d,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, m0                    ; encoding: [0x7d,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, null
-// GFX12: encoding: [0x7c,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, null                  ; encoding: [0x7c,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, -1
-// GFX12: encoding: [0xc1,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, -1                    ; encoding: [0xc1,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6c,0x0a,0x7e]
 
 v_cos_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x6c,0x0a,0x7e]
+// GFX12: v_cos_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6c,0x0a,0x7e]
 
 v_cos_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cos_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ctz_i32_b32 v5, v1
-// GFX12: encoding: [0x01,0x75,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, v1                ; encoding: [0x01,0x75,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, v255
-// GFX12: encoding: [0xff,0x75,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, v255              ; encoding: [0xff,0x75,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, s1
-// GFX12: encoding: [0x01,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, s1                ; encoding: [0x01,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, s105
-// GFX12: encoding: [0x69,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, s105              ; encoding: [0x69,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, m0
-// GFX12: encoding: [0x7d,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, m0                ; encoding: [0x7d,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, null
-// GFX12: encoding: [0x7c,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, null              ; encoding: [0x7c,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, -1
-// GFX12: encoding: [0xc1,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, -1                ; encoding: [0xc1,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5               ; encoding: [0xf0,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc           ; encoding: [0xfd,0x74,0x0a,0x7e]
 
 v_ctz_i32_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_bf8_e32 v1, s3
-// GFX12: encoding: [0x03,0xda,0x02,0x7e]
+// GFX12: v_cvt_f32_bf8_e32 v1, s3                ; encoding: [0x03,0xda,0x02,0x7e]
 
 v_cvt_f32_bf8_e32 v1, 3
-// GFX12: encoding: [0x83,0xda,0x02,0x7e]
+// GFX12: v_cvt_f32_bf8_e32 v1, 3                 ; encoding: [0x83,0xda,0x02,0x7e]
 
 v_cvt_f32_bf8_e32 v1, v3
-// GFX12: encoding: [0x03,0xdb,0x02,0x7e]
+// GFX12: v_cvt_f32_bf8_e32 v1, v3                ; encoding: [0x03,0xdb,0x02,0x7e]
 
 v_cvt_f32_fp8_e32 v1, s3
-// GFX12: encoding: [0x03,0xd8,0x02,0x7e]
+// GFX12: v_cvt_f32_fp8_e32 v1, s3                ; encoding: [0x03,0xd8,0x02,0x7e]
 
 v_cvt_f32_fp8_e32 v1, 3
-// GFX12: encoding: [0x83,0xd8,0x02,0x7e]
+// GFX12: v_cvt_f32_fp8_e32 v1, 3                 ; encoding: [0x83,0xd8,0x02,0x7e]
 
 v_cvt_f32_fp8_e32 v1, v3
-// GFX12: encoding: [0x03,0xd9,0x02,0x7e]
+// GFX12: v_cvt_f32_fp8_e32 v1, v3                ; encoding: [0x03,0xd9,0x02,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[2:3], s3
-// GFX12: encoding: [0x03,0xde,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3         ; encoding: [0x03,0xde,0x04,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[3:4], s5
-// GFX12: encoding: [0x05,0xde,0x06,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], s5         ; encoding: [0x05,0xde,0x06,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[2:3], 3
-// GFX12: encoding: [0x83,0xde,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3          ; encoding: [0x83,0xde,0x04,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[3:4], 3
-// GFX12: encoding: [0x83,0xde,0x06,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], 3          ; encoding: [0x83,0xde,0x06,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[2:3], v3
-// GFX12: encoding: [0x03,0xdf,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3         ; encoding: [0x03,0xdf,0x04,0x7e]
 
 v_cvt_pk_f32_bf8_e32 v[3:4], v3
-// GFX12: encoding: [0x03,0xdf,0x06,0x7e]
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3         ; encoding: [0x03,0xdf,0x06,0x7e]
 
 v_cvt_pk_f32_fp8_e32 v[2:3], s3
-// GFX12: encoding: [0x03,0xdc,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3         ; encoding: [0x03,0xdc,0x04,0x7e]
 
 v_cvt_pk_f32_fp8_e32 v[2:3], 3
-// GFX12: encoding: [0x83,0xdc,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3          ; encoding: [0x83,0xdc,0x04,0x7e]
 
 v_cvt_pk_f32_fp8_e32 v[2:3], v3
-// GFX12: encoding: [0x03,0xdd,0x04,0x7e]
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3         ; encoding: [0x03,0xdd,0x04,0x7e]
 
 v_cvt_f16_f32 v5, v1
-// GFX12: encoding: [0x01,0x15,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, v1                ; encoding: [0x01,0x15,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, v255
-// GFX12: encoding: [0xff,0x15,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, v255              ; encoding: [0xff,0x15,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, s1
-// GFX12: encoding: [0x01,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, s1                ; encoding: [0x01,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, s105
-// GFX12: encoding: [0x69,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, s105              ; encoding: [0x69,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, m0
-// GFX12: encoding: [0x7d,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, m0                ; encoding: [0x7d,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, null
-// GFX12: encoding: [0x7c,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, null              ; encoding: [0x7c,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, -1
-// GFX12: encoding: [0xc1,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, -1                ; encoding: [0xc1,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, 0.5               ; encoding: [0xf0,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x14,0x0a,0x7e]
+// GFX12: v_cvt_f16_f32_e32 v5, src_scc           ; encoding: [0xfd,0x14,0x0a,0x7e]
 
 v_cvt_f16_f32 v127, 0xaf123456
-// GFX12: encoding: [0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f16_f32_e32 v127, 0xaf123456      ; encoding: [0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf]
 
 v_cvt_f16_i16 v5, v1
-// GFX12: encoding: [0x01,0xa3,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, v1                ; encoding: [0x01,0xa3,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, v127
-// GFX12: encoding: [0x7f,0xa3,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, v127              ; encoding: [0x7f,0xa3,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, s1
-// GFX12: encoding: [0x01,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, s1                ; encoding: [0x01,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, s105
-// GFX12: encoding: [0x69,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, s105              ; encoding: [0x69,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, m0
-// GFX12: encoding: [0x7d,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, m0                ; encoding: [0x7d,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, null
-// GFX12: encoding: [0x7c,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, null              ; encoding: [0x7c,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, -1
-// GFX12: encoding: [0xc1,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, -1                ; encoding: [0xc1,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v5, 0.5
-// GFX12: encoding: [0xf0,0xa2,0x0a,0x7e]
+// GFX12-ASM: v_cvt_f16_i16_e32 v5, 0.5               ; encoding: [0xf0,0xa2,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_i16_e32 v5, 0x3800            ; encoding: [0xff,0xa2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 v_cvt_f16_i16 v5, src_scc
-// GFX12: encoding: [0xfd,0xa2,0x0a,0x7e]
+// GFX12: v_cvt_f16_i16_e32 v5, src_scc           ; encoding: [0xfd,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_f16_i16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_f16_u16 v5, v1
-// GFX12: encoding: [0x01,0xa1,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, v1                ; encoding: [0x01,0xa1,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, v127
-// GFX12: encoding: [0x7f,0xa1,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, v127              ; encoding: [0x7f,0xa1,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, s1
-// GFX12: encoding: [0x01,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, s1                ; encoding: [0x01,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, s105
-// GFX12: encoding: [0x69,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, s105              ; encoding: [0x69,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, m0
-// GFX12: encoding: [0x7d,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, m0                ; encoding: [0x7d,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, null
-// GFX12: encoding: [0x7c,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, null              ; encoding: [0x7c,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, -1
-// GFX12: encoding: [0xc1,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, -1                ; encoding: [0xc1,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v5, 0.5
-// GFX12: encoding: [0xf0,0xa0,0x0a,0x7e]
+// GFX12-ASM: v_cvt_f16_u16_e32 v5, 0.5               ; encoding: [0xf0,0xa0,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_u16_e32 v5, 0x3800            ; encoding: [0xff,0xa0,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 v_cvt_f16_u16 v5, src_scc
-// GFX12: encoding: [0xfd,0xa0,0x0a,0x7e]
+// GFX12: v_cvt_f16_u16_e32 v5, src_scc           ; encoding: [0xfd,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_f16_u16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_f32_f16 v5, v1
-// GFX12: encoding: [0x01,0x17,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, v1                ; encoding: [0x01,0x17,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, v127
-// GFX12: encoding: [0x7f,0x17,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, v127              ; encoding: [0x7f,0x17,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, s1
-// GFX12: encoding: [0x01,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, s1                ; encoding: [0x01,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, s105
-// GFX12: encoding: [0x69,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, s105              ; encoding: [0x69,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, ttmp15            ; encoding: [0x7b,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, m0
-// GFX12: encoding: [0x7d,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, m0                ; encoding: [0x7d,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, exec_lo           ; encoding: [0x7e,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, exec_hi           ; encoding: [0x7f,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, null
-// GFX12: encoding: [0x7c,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, null              ; encoding: [0x7c,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, -1
-// GFX12: encoding: [0xc1,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, -1                ; encoding: [0xc1,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, 0.5               ; encoding: [0xf0,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0x16,0x0a,0x7e]
+// GFX12: v_cvt_f32_f16_e32 v5, src_scc           ; encoding: [0xfd,0x16,0x0a,0x7e]
 
 v_cvt_f32_f16 v255, 0xfe0b
-// GFX12: encoding: [0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_f32_f16_e32 v255, 0xfe0b          ; encoding: [0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cvt_f32_f64 v5, v[1:2]
-// GFX12: encoding: [0x01,0x1f,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x1f,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, v[254:255]
-// GFX12: encoding: [0xfe,0x1f,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x1f,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, s[2:3]
-// GFX12: encoding: [0x02,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, s[104:105]
-// GFX12: encoding: [0x68,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, vcc
-// GFX12: encoding: [0x6a,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, vcc               ; encoding: [0x6a,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, ttmp[14:15]
-// GFX12: encoding: [0x7a,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, exec
-// GFX12: encoding: [0x7e,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, exec              ; encoding: [0x7e,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, null
-// GFX12: encoding: [0x7c,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, null              ; encoding: [0x7c,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, -1
-// GFX12: encoding: [0xc1,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, -1                ; encoding: [0xc1,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, 0.5
-// GFX12: encoding: [0xf0,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v5, src_scc
-// GFX12: encoding: [0xfd,0x1e,0x0a,0x7e]
+// GFX12: v_cvt_f32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x1e,0x0a,0x7e]
 
 v_cvt_f32_f64 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_i32 v5, v1
-// GFX12: encoding: [0x01,0x0b,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, v1                ; encoding: [0x01,0x0b,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, v255
-// GFX12: encoding: [0xff,0x0b,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, v255              ; encoding: [0xff,0x0b,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, s1
-// GFX12: encoding: [0x01,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, s1                ; encoding: [0x01,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, s105
-// GFX12: encoding: [0x69,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, s105              ; encoding: [0x69,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, ttmp15            ; encoding: [0x7b,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, m0
-// GFX12: encoding: [0x7d,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, m0                ; encoding: [0x7d,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, exec_lo           ; encoding: [0x7e,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, exec_hi           ; encoding: [0x7f,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, null
-// GFX12: encoding: [0x7c,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, null              ; encoding: [0x7c,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, -1
-// GFX12: encoding: [0xc1,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, -1                ; encoding: [0xc1,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, 0.5
-// GFX12: encoding: [0xf0,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, 0.5               ; encoding: [0xf0,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v5, src_scc
-// GFX12: encoding: [0xfd,0x0a,0x0a,0x7e]
+// GFX12: v_cvt_f32_i32_e32 v5, src_scc           ; encoding: [0xfd,0x0a,0x0a,0x7e]
 
 v_cvt_f32_i32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_i32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_u32 v5, v1
-// GFX12: encoding: [0x01,0x0d,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, v1                ; encoding: [0x01,0x0d,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, v255
-// GFX12: encoding: [0xff,0x0d,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, v255              ; encoding: [0xff,0x0d,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, s1
-// GFX12: encoding: [0x01,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, s1                ; encoding: [0x01,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, s105
-// GFX12: encoding: [0x69,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, s105              ; encoding: [0x69,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, m0
-// GFX12: encoding: [0x7d,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, m0                ; encoding: [0x7d,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, null
-// GFX12: encoding: [0x7c,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, null              ; encoding: [0x7c,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, -1
-// GFX12: encoding: [0xc1,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, -1                ; encoding: [0xc1,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, 0.5
-// GFX12: encoding: [0xf0,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v5, src_scc
-// GFX12: encoding: [0xfd,0x0c,0x0a,0x7e]
+// GFX12: v_cvt_f32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x0c,0x0a,0x7e]
 
 v_cvt_f32_u32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_ubyte0 v5, v1
-// GFX12: encoding: [0x01,0x23,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v1             ; encoding: [0x01,0x23,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, v255
-// GFX12: encoding: [0xff,0x23,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v255           ; encoding: [0xff,0x23,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, s1
-// GFX12: encoding: [0x01,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s1             ; encoding: [0x01,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, s105
-// GFX12: encoding: [0x69,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s105           ; encoding: [0x69,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_lo         ; encoding: [0x6a,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_hi         ; encoding: [0x6b,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, ttmp15
-// GFX12: encoding: [0x7b,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, ttmp15         ; encoding: [0x7b,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, m0
-// GFX12: encoding: [0x7d,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, m0             ; encoding: [0x7d,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, exec_lo
-// GFX12: encoding: [0x7e,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_lo        ; encoding: [0x7e,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, exec_hi
-// GFX12: encoding: [0x7f,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_hi        ; encoding: [0x7f,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, null
-// GFX12: encoding: [0x7c,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, null           ; encoding: [0x7c,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, -1
-// GFX12: encoding: [0xc1,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, -1             ; encoding: [0xc1,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, 0.5
-// GFX12: encoding: [0xf0,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, 0.5            ; encoding: [0xf0,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v5, src_scc
-// GFX12: encoding: [0xfd,0x22,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte0_e32 v5, src_scc        ; encoding: [0xfd,0x22,0x0a,0x7e]
 
 v_cvt_f32_ubyte0 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_ubyte0_e32 v255, 0xaf123456   ; encoding: [0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_ubyte1 v5, v1
-// GFX12: encoding: [0x01,0x25,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v1             ; encoding: [0x01,0x25,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, v255
-// GFX12: encoding: [0xff,0x25,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v255           ; encoding: [0xff,0x25,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, s1
-// GFX12: encoding: [0x01,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s1             ; encoding: [0x01,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, s105
-// GFX12: encoding: [0x69,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s105           ; encoding: [0x69,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_lo         ; encoding: [0x6a,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_hi         ; encoding: [0x6b,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, ttmp15
-// GFX12: encoding: [0x7b,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, ttmp15         ; encoding: [0x7b,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, m0
-// GFX12: encoding: [0x7d,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, m0             ; encoding: [0x7d,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, exec_lo
-// GFX12: encoding: [0x7e,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_lo        ; encoding: [0x7e,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, exec_hi
-// GFX12: encoding: [0x7f,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_hi        ; encoding: [0x7f,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, null
-// GFX12: encoding: [0x7c,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, null           ; encoding: [0x7c,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, -1
-// GFX12: encoding: [0xc1,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, -1             ; encoding: [0xc1,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, 0.5
-// GFX12: encoding: [0xf0,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, 0.5            ; encoding: [0xf0,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v5, src_scc
-// GFX12: encoding: [0xfd,0x24,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte1_e32 v5, src_scc        ; encoding: [0xfd,0x24,0x0a,0x7e]
 
 v_cvt_f32_ubyte1 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_ubyte1_e32 v255, 0xaf123456   ; encoding: [0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_ubyte2 v5, v1
-// GFX12: encoding: [0x01,0x27,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v1             ; encoding: [0x01,0x27,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, v255
-// GFX12: encoding: [0xff,0x27,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v255           ; encoding: [0xff,0x27,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, s1
-// GFX12: encoding: [0x01,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s1             ; encoding: [0x01,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, s105
-// GFX12: encoding: [0x69,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s105           ; encoding: [0x69,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_lo         ; encoding: [0x6a,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_hi         ; encoding: [0x6b,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, ttmp15
-// GFX12: encoding: [0x7b,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, ttmp15         ; encoding: [0x7b,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, m0
-// GFX12: encoding: [0x7d,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, m0             ; encoding: [0x7d,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, exec_lo
-// GFX12: encoding: [0x7e,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_lo        ; encoding: [0x7e,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, exec_hi
-// GFX12: encoding: [0x7f,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_hi        ; encoding: [0x7f,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, null
-// GFX12: encoding: [0x7c,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, null           ; encoding: [0x7c,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, -1
-// GFX12: encoding: [0xc1,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, -1             ; encoding: [0xc1,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, 0.5
-// GFX12: encoding: [0xf0,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, 0.5            ; encoding: [0xf0,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v5, src_scc
-// GFX12: encoding: [0xfd,0x26,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte2_e32 v5, src_scc        ; encoding: [0xfd,0x26,0x0a,0x7e]
 
 v_cvt_f32_ubyte2 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_ubyte2_e32 v255, 0xaf123456   ; encoding: [0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f32_ubyte3 v5, v1
-// GFX12: encoding: [0x01,0x29,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v1             ; encoding: [0x01,0x29,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, v255
-// GFX12: encoding: [0xff,0x29,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v255           ; encoding: [0xff,0x29,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, s1
-// GFX12: encoding: [0x01,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s1             ; encoding: [0x01,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, s105
-// GFX12: encoding: [0x69,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s105           ; encoding: [0x69,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_lo         ; encoding: [0x6a,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_hi         ; encoding: [0x6b,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, ttmp15
-// GFX12: encoding: [0x7b,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, ttmp15         ; encoding: [0x7b,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, m0
-// GFX12: encoding: [0x7d,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, m0             ; encoding: [0x7d,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, exec_lo
-// GFX12: encoding: [0x7e,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_lo        ; encoding: [0x7e,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, exec_hi
-// GFX12: encoding: [0x7f,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_hi        ; encoding: [0x7f,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, null
-// GFX12: encoding: [0x7c,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, null           ; encoding: [0x7c,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, -1
-// GFX12: encoding: [0xc1,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, -1             ; encoding: [0xc1,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, 0.5
-// GFX12: encoding: [0xf0,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, 0.5            ; encoding: [0xf0,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v5, src_scc
-// GFX12: encoding: [0xfd,0x28,0x0a,0x7e]
+// GFX12: v_cvt_f32_ubyte3_e32 v5, src_scc        ; encoding: [0xfd,0x28,0x0a,0x7e]
 
 v_cvt_f32_ubyte3 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f32_ubyte3_e32 v255, 0xaf123456   ; encoding: [0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f64_f32 v[5:6], v1
-// GFX12: encoding: [0x01,0x21,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v1            ; encoding: [0x01,0x21,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], v255
-// GFX12: encoding: [0xff,0x21,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v255          ; encoding: [0xff,0x21,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], s1
-// GFX12: encoding: [0x01,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s1            ; encoding: [0x01,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], s105
-// GFX12: encoding: [0x69,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s105          ; encoding: [0x69,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], vcc_lo
-// GFX12: encoding: [0x6a,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], vcc_hi
-// GFX12: encoding: [0x6b,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], ttmp15
-// GFX12: encoding: [0x7b,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], m0
-// GFX12: encoding: [0x7d,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], m0            ; encoding: [0x7d,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], exec_lo
-// GFX12: encoding: [0x7e,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], exec_hi
-// GFX12: encoding: [0x7f,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], null
-// GFX12: encoding: [0x7c,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], null          ; encoding: [0x7c,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], -1
-// GFX12: encoding: [0xc1,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], -1            ; encoding: [0xc1,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x20,0x0a,0x7e]
+// GFX12: v_cvt_f64_f32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x20,0x0a,0x7e]
 
 v_cvt_f64_f32 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f64_f32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f64_i32 v[5:6], v1
-// GFX12: encoding: [0x01,0x09,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v1            ; encoding: [0x01,0x09,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], v255
-// GFX12: encoding: [0xff,0x09,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v255          ; encoding: [0xff,0x09,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], s1
-// GFX12: encoding: [0x01,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s1            ; encoding: [0x01,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], s105
-// GFX12: encoding: [0x69,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s105          ; encoding: [0x69,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], vcc_lo
-// GFX12: encoding: [0x6a,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], vcc_hi
-// GFX12: encoding: [0x6b,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], ttmp15
-// GFX12: encoding: [0x7b,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], m0
-// GFX12: encoding: [0x7d,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], m0            ; encoding: [0x7d,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], exec_lo
-// GFX12: encoding: [0x7e,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], exec_hi
-// GFX12: encoding: [0x7f,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], null
-// GFX12: encoding: [0x7c,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], null          ; encoding: [0x7c,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], -1
-// GFX12: encoding: [0xc1,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], -1            ; encoding: [0xc1,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x08,0x0a,0x7e]
+// GFX12: v_cvt_f64_i32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x08,0x0a,0x7e]
 
 v_cvt_f64_i32 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f64_i32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_f64_u32 v[5:6], v1
-// GFX12: encoding: [0x01,0x2d,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v1            ; encoding: [0x01,0x2d,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], v255
-// GFX12: encoding: [0xff,0x2d,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v255          ; encoding: [0xff,0x2d,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], s1
-// GFX12: encoding: [0x01,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s1            ; encoding: [0x01,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], s105
-// GFX12: encoding: [0x69,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s105          ; encoding: [0x69,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], vcc_lo
-// GFX12: encoding: [0x6a,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], vcc_hi
-// GFX12: encoding: [0x6b,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], ttmp15
-// GFX12: encoding: [0x7b,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], m0
-// GFX12: encoding: [0x7d,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], m0            ; encoding: [0x7d,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], exec_lo
-// GFX12: encoding: [0x7e,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], exec_hi
-// GFX12: encoding: [0x7f,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], null
-// GFX12: encoding: [0x7c,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], null          ; encoding: [0x7c,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], -1
-// GFX12: encoding: [0xc1,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], -1            ; encoding: [0xc1,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x2c,0x0a,0x7e]
+// GFX12: v_cvt_f64_u32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x2c,0x0a,0x7e]
 
 v_cvt_f64_u32 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_f64_u32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_floor_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x1b,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1          ; encoding: [0x01,0x1b,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x1b,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255        ; encoding: [0xff,0x1b,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1          ; encoding: [0x01,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105        ; encoding: [0x69,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null        ; encoding: [0x7c,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x1a,0x0a,0x7e]
 
 v_cvt_floor_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_flr_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x1b,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1          ; encoding: [0x01,0x1b,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x1b,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255        ; encoding: [0xff,0x1b,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1          ; encoding: [0x01,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105        ; encoding: [0x69,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null        ; encoding: [0x7c,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x1a,0x0a,0x7e]
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x1a,0x0a,0x7e]
 
 v_cvt_flr_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_i16_f16 v5, v1
-// GFX12: encoding: [0x01,0xa7,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, v1                ; encoding: [0x01,0xa7,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, v127
-// GFX12: encoding: [0x7f,0xa7,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, v127              ; encoding: [0x7f,0xa7,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, s1
-// GFX12: encoding: [0x01,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, s1                ; encoding: [0x01,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, s105
-// GFX12: encoding: [0x69,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, s105              ; encoding: [0x69,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, m0
-// GFX12: encoding: [0x7d,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, m0                ; encoding: [0x7d,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, null
-// GFX12: encoding: [0x7c,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, null              ; encoding: [0x7c,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, -1
-// GFX12: encoding: [0xc1,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, -1                ; encoding: [0xc1,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xa6,0x0a,0x7e]
+// GFX12: v_cvt_i16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa6,0x0a,0x7e]
 
 v_cvt_i16_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_i16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x11,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, v1                ; encoding: [0x01,0x11,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x11,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, v255              ; encoding: [0xff,0x11,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, s1                ; encoding: [0x01,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, s105              ; encoding: [0x69,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, m0                ; encoding: [0x7d,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, null              ; encoding: [0x7c,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, -1                ; encoding: [0xc1,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x10,0x0a,0x7e]
+// GFX12: v_cvt_i32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x10,0x0a,0x7e]
 
 v_cvt_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_i32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_i32_f64 v5, v[1:2]
-// GFX12: encoding: [0x01,0x07,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x07,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, v[254:255]
-// GFX12: encoding: [0xfe,0x07,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x07,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, s[2:3]
-// GFX12: encoding: [0x02,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, s[104:105]
-// GFX12: encoding: [0x68,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, vcc
-// GFX12: encoding: [0x6a,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, vcc               ; encoding: [0x6a,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, ttmp[14:15]
-// GFX12: encoding: [0x7a,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, exec
-// GFX12: encoding: [0x7e,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, exec              ; encoding: [0x7e,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, null
-// GFX12: encoding: [0x7c,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, null              ; encoding: [0x7c,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, -1
-// GFX12: encoding: [0xc1,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, -1                ; encoding: [0xc1,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, 0.5
-// GFX12: encoding: [0xf0,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v5, src_scc
-// GFX12: encoding: [0xfd,0x06,0x0a,0x7e]
+// GFX12: v_cvt_i32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x06,0x0a,0x7e]
 
 v_cvt_i32_f64 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_i32_i16 v5, v1
-// GFX12: encoding: [0x01,0xd5,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, v127
-// GFX12: encoding: [0x7f,0xd5,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s1
-// GFX12: encoding: [0x01,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s105
-// GFX12: encoding: [0x69,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, s105              ; encoding: [0x69,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, m0
-// GFX12: encoding: [0x7d,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, m0                ; encoding: [0x7d,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, null
-// GFX12: encoding: [0x7c,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, null              ; encoding: [0x7c,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, -1
-// GFX12: encoding: [0xc1,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, -1                ; encoding: [0xc1,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, 0.5
-// GFX12: encoding: [0xf0,0xd4,0x0a,0x7e]
+// GFX12-ASM: v_cvt_i32_i16_e32 v5, 0.5               ; encoding: [0xf0,0xd4,0x0a,0x7e]
+// GFX12-DIS: v_cvt_i32_i16_e32 v5, 0x3800            ; encoding: [0xff,0xd4,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 v_cvt_i32_i16 v5, src_scc
-// GFX12: encoding: [0xfd,0xd4,0x0a,0x7e]
+// GFX12: v_cvt_i32_i16_e32 v5, src_scc           ; encoding: [0xfd,0xd4,0x0a,0x7e]
 
 v_cvt_i32_i16 v255, 0xfe0b
-// GFX12: encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cvt_nearest_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x19,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x19,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255      ; encoding: [0xff,0x19,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1        ; encoding: [0x01,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105      ; encoding: [0x69,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo    ; encoding: [0x6a,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi    ; encoding: [0x6b,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15    ; encoding: [0x7b,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0        ; encoding: [0x7d,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo   ; encoding: [0x7e,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi   ; encoding: [0x7f,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null      ; encoding: [0x7c,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1        ; encoding: [0xc1,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5       ; encoding: [0xf0,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc   ; encoding: [0xfd,0x18,0x0a,0x7e]
 
 v_cvt_nearest_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_norm_i16_f16 v5, v1
-// GFX12: encoding: [0x01,0xc7,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v1           ; encoding: [0x01,0xc7,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, v127
-// GFX12: encoding: [0x7f,0xc7,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v127         ; encoding: [0x7f,0xc7,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, s1
-// GFX12: encoding: [0x01,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s1           ; encoding: [0x01,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, s105
-// GFX12: encoding: [0x69,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s105         ; encoding: [0x69,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, m0
-// GFX12: encoding: [0x7d,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, m0           ; encoding: [0x7d,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, null
-// GFX12: encoding: [0x7c,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, null         ; encoding: [0x7c,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, -1
-// GFX12: encoding: [0xc1,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, -1           ; encoding: [0xc1,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xc6,0x0a,0x7e]
+// GFX12: v_cvt_norm_i16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc6,0x0a,0x7e]
 
 v_cvt_norm_i16_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_norm_i16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_norm_u16_f16 v5, v1
-// GFX12: encoding: [0x01,0xc9,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v1           ; encoding: [0x01,0xc9,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, v127
-// GFX12: encoding: [0x7f,0xc9,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v127         ; encoding: [0x7f,0xc9,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, s1
-// GFX12: encoding: [0x01,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s1           ; encoding: [0x01,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, s105
-// GFX12: encoding: [0x69,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s105         ; encoding: [0x69,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, m0
-// GFX12: encoding: [0x7d,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, m0           ; encoding: [0x7d,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, null
-// GFX12: encoding: [0x7c,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, null         ; encoding: [0x7c,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, -1
-// GFX12: encoding: [0xc1,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, -1           ; encoding: [0xc1,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xc8,0x0a,0x7e]
+// GFX12: v_cvt_norm_u16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc8,0x0a,0x7e]
 
 v_cvt_norm_u16_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_norm_u16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_off_f32_i4 v5, v1
-// GFX12: encoding: [0x01,0x1d,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, v1             ; encoding: [0x01,0x1d,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, v255
-// GFX12: encoding: [0xff,0x1d,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, v255           ; encoding: [0xff,0x1d,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, s1
-// GFX12: encoding: [0x01,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, s1             ; encoding: [0x01,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, s105
-// GFX12: encoding: [0x69,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, s105           ; encoding: [0x69,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_lo         ; encoding: [0x6a,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_hi         ; encoding: [0x6b,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, ttmp15
-// GFX12: encoding: [0x7b,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, ttmp15         ; encoding: [0x7b,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, m0
-// GFX12: encoding: [0x7d,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, m0             ; encoding: [0x7d,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, exec_lo
-// GFX12: encoding: [0x7e,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_lo        ; encoding: [0x7e,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, exec_hi
-// GFX12: encoding: [0x7f,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_hi        ; encoding: [0x7f,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, null
-// GFX12: encoding: [0x7c,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, null           ; encoding: [0x7c,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, -1
-// GFX12: encoding: [0xc1,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, -1             ; encoding: [0xc1,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, 0.5
-// GFX12: encoding: [0xf0,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, 0.5            ; encoding: [0xf0,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v5, src_scc
-// GFX12: encoding: [0xfd,0x1c,0x0a,0x7e]
+// GFX12: v_cvt_off_f32_i4_e32 v5, src_scc        ; encoding: [0xfd,0x1c,0x0a,0x7e]
 
 v_cvt_off_f32_i4 v255, 0x4f
-// GFX12: encoding: [0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00]
+// GFX12: v_cvt_off_f32_i4_e32 v255, 0x4f         ; encoding: [0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00]
 
 v_cvt_rpi_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x19,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x19,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255      ; encoding: [0xff,0x19,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1        ; encoding: [0x01,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105      ; encoding: [0x69,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo    ; encoding: [0x6a,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi    ; encoding: [0x6b,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15    ; encoding: [0x7b,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0        ; encoding: [0x7d,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo   ; encoding: [0x7e,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi   ; encoding: [0x7f,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null      ; encoding: [0x7c,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1        ; encoding: [0xc1,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5       ; encoding: [0xf0,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x18,0x0a,0x7e]
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc   ; encoding: [0xfd,0x18,0x0a,0x7e]
 
 v_cvt_rpi_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_u16_f16 v5, v1
-// GFX12: encoding: [0x01,0xa5,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, v1                ; encoding: [0x01,0xa5,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, v127
-// GFX12: encoding: [0x7f,0xa5,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, v127              ; encoding: [0x7f,0xa5,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, s1
-// GFX12: encoding: [0x01,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, s1                ; encoding: [0x01,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, s105
-// GFX12: encoding: [0x69,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, s105              ; encoding: [0x69,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, m0
-// GFX12: encoding: [0x7d,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, m0                ; encoding: [0x7d,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, null
-// GFX12: encoding: [0x7c,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, null              ; encoding: [0x7c,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, -1
-// GFX12: encoding: [0xc1,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, -1                ; encoding: [0xc1,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xa4,0x0a,0x7e]
+// GFX12: v_cvt_u16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa4,0x0a,0x7e]
 
 v_cvt_u16_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_u16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_cvt_u32_f32 v5, v1
-// GFX12: encoding: [0x01,0x0f,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, v1                ; encoding: [0x01,0x0f,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, v255
-// GFX12: encoding: [0xff,0x0f,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, v255              ; encoding: [0xff,0x0f,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, s1
-// GFX12: encoding: [0x01,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, s1                ; encoding: [0x01,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, s105
-// GFX12: encoding: [0x69,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, s105              ; encoding: [0x69,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, m0                ; encoding: [0x7d,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, null
-// GFX12: encoding: [0x7c,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, null              ; encoding: [0x7c,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, -1                ; encoding: [0xc1,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x0e,0x0a,0x7e]
+// GFX12: v_cvt_u32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x0e,0x0a,0x7e]
 
 v_cvt_u32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_u32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_u32_f64 v5, v[1:2]
-// GFX12: encoding: [0x01,0x2b,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x2b,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, v[254:255]
-// GFX12: encoding: [0xfe,0x2b,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x2b,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, s[2:3]
-// GFX12: encoding: [0x02,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, s[104:105]
-// GFX12: encoding: [0x68,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, vcc
-// GFX12: encoding: [0x6a,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, vcc               ; encoding: [0x6a,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, ttmp[14:15]
-// GFX12: encoding: [0x7a,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, exec
-// GFX12: encoding: [0x7e,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, exec              ; encoding: [0x7e,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, null
-// GFX12: encoding: [0x7c,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, null              ; encoding: [0x7c,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, -1
-// GFX12: encoding: [0xc1,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, -1                ; encoding: [0xc1,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, 0.5
-// GFX12: encoding: [0xf0,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v5, src_scc
-// GFX12: encoding: [0xfd,0x2a,0x0a,0x7e]
+// GFX12: v_cvt_u32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x2a,0x0a,0x7e]
 
 v_cvt_u32_f64 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_cvt_u32_u16 v5, v1
-// GFX12: encoding: [0x01,0xd7,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, v127
-// GFX12: encoding: [0x7f,0xd7,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s1
-// GFX12: encoding: [0x01,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s105
-// GFX12: encoding: [0x69,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, s105              ; encoding: [0x69,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, m0
-// GFX12: encoding: [0x7d,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, m0                ; encoding: [0x7d,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, null
-// GFX12: encoding: [0x7c,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, null              ; encoding: [0x7c,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, -1
-// GFX12: encoding: [0xc1,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, -1                ; encoding: [0xc1,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, 0.5
-// GFX12: encoding: [0xf0,0xd6,0x0a,0x7e]
+// GFX12-ASM: v_cvt_u32_u16_e32 v5, 0.5               ; encoding: [0xf0,0xd6,0x0a,0x7e]
+// GFX12-DIS: v_cvt_u32_u16_e32 v5, 0x3800            ; encoding: [0xff,0xd6,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 v_cvt_u32_u16 v5, src_scc
-// GFX12: encoding: [0xfd,0xd6,0x0a,0x7e]
+// GFX12: v_cvt_u32_u16_e32 v5, src_scc           ; encoding: [0xfd,0xd6,0x0a,0x7e]
 
 v_cvt_u32_u16 v255, 0xfe0b
-// GFX12: encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+// GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_exp_f16 v5, v1
-// GFX12: encoding: [0x01,0xb1,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, v1                    ; encoding: [0x01,0xb1,0x0a,0x7e]
 
 v_exp_f16 v5, v127
-// GFX12: encoding: [0x7f,0xb1,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, v127                  ; encoding: [0x7f,0xb1,0x0a,0x7e]
 
 v_exp_f16 v5, s1
-// GFX12: encoding: [0x01,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, s1                    ; encoding: [0x01,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, s105
-// GFX12: encoding: [0x69,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, s105                  ; encoding: [0x69,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, m0
-// GFX12: encoding: [0x7d,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, m0                    ; encoding: [0x7d,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, null
-// GFX12: encoding: [0x7c,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, null                  ; encoding: [0x7c,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, -1
-// GFX12: encoding: [0xc1,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, -1                    ; encoding: [0xc1,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xb0,0x0a,0x7e]
 
 v_exp_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xb0,0x0a,0x7e]
+// GFX12: v_exp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xb0,0x0a,0x7e]
 
 v_exp_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_exp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_exp_f32 v5, v1
-// GFX12: encoding: [0x01,0x4b,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, v1                    ; encoding: [0x01,0x4b,0x0a,0x7e]
 
 v_exp_f32 v5, v255
-// GFX12: encoding: [0xff,0x4b,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, v255                  ; encoding: [0xff,0x4b,0x0a,0x7e]
 
 v_exp_f32 v5, s1
-// GFX12: encoding: [0x01,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, s1                    ; encoding: [0x01,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, s105
-// GFX12: encoding: [0x69,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, s105                  ; encoding: [0x69,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, m0
-// GFX12: encoding: [0x7d,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, m0                    ; encoding: [0x7d,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, null
-// GFX12: encoding: [0x7c,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, null                  ; encoding: [0x7c,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, -1
-// GFX12: encoding: [0xc1,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, -1                    ; encoding: [0xc1,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4a,0x0a,0x7e]
 
 v_exp_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x4a,0x0a,0x7e]
+// GFX12: v_exp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4a,0x0a,0x7e]
 
 v_exp_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_exp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ffbh_i32 v5, v1
-// GFX12: encoding: [0x01,0x77,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, v1                    ; encoding: [0x01,0x77,0x0a,0x7e]
 
 v_ffbh_i32 v5, v255
-// GFX12: encoding: [0xff,0x77,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, v255                  ; encoding: [0xff,0x77,0x0a,0x7e]
 
 v_ffbh_i32 v5, s1
-// GFX12: encoding: [0x01,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, s1                    ; encoding: [0x01,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, s105
-// GFX12: encoding: [0x69,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, s105                  ; encoding: [0x69,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, vcc_lo                ; encoding: [0x6a,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, vcc_hi                ; encoding: [0x6b,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, ttmp15                ; encoding: [0x7b,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, m0
-// GFX12: encoding: [0x7d,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, m0                    ; encoding: [0x7d,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, exec_lo               ; encoding: [0x7e,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, exec_hi               ; encoding: [0x7f,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, null
-// GFX12: encoding: [0x7c,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, null                  ; encoding: [0x7c,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, -1
-// GFX12: encoding: [0xc1,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, -1                    ; encoding: [0xc1,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, 0.5
-// GFX12: encoding: [0xf0,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, 0.5                   ; encoding: [0xf0,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v5, src_scc
-// GFX12: encoding: [0xfd,0x76,0x0a,0x7e]
+// GFX12: v_cls_i32_e32 v5, src_scc               ; encoding: [0xfd,0x76,0x0a,0x7e]
 
 v_ffbh_i32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_cls_i32_e32 v255, 0xaf123456          ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ffbh_u32 v5, v1
-// GFX12: encoding: [0x01,0x73,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, v1                ; encoding: [0x01,0x73,0x0a,0x7e]
 
 v_ffbh_u32 v5, v255
-// GFX12: encoding: [0xff,0x73,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, v255              ; encoding: [0xff,0x73,0x0a,0x7e]
 
 v_ffbh_u32 v5, s1
-// GFX12: encoding: [0x01,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, s1                ; encoding: [0x01,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, s105
-// GFX12: encoding: [0x69,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, s105              ; encoding: [0x69,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, m0
-// GFX12: encoding: [0x7d,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, m0                ; encoding: [0x7d,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, null
-// GFX12: encoding: [0x7c,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, null              ; encoding: [0x7c,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, -1
-// GFX12: encoding: [0xc1,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, -1                ; encoding: [0xc1,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, 0.5
-// GFX12: encoding: [0xf0,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v5, src_scc
-// GFX12: encoding: [0xfd,0x72,0x0a,0x7e]
+// GFX12: v_clz_i32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x72,0x0a,0x7e]
 
 v_ffbh_u32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_ffbl_b32 v5, v1
-// GFX12: encoding: [0x01,0x75,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, v1                ; encoding: [0x01,0x75,0x0a,0x7e]
 
 v_ffbl_b32 v5, v255
-// GFX12: encoding: [0xff,0x75,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, v255              ; encoding: [0xff,0x75,0x0a,0x7e]
 
 v_ffbl_b32 v5, s1
-// GFX12: encoding: [0x01,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, s1                ; encoding: [0x01,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, s105
-// GFX12: encoding: [0x69,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, s105              ; encoding: [0x69,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, m0
-// GFX12: encoding: [0x7d,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, m0                ; encoding: [0x7d,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, null
-// GFX12: encoding: [0x7c,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, null              ; encoding: [0x7c,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, -1
-// GFX12: encoding: [0xc1,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, -1                ; encoding: [0xc1,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5               ; encoding: [0xf0,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x74,0x0a,0x7e]
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc           ; encoding: [0xfd,0x74,0x0a,0x7e]
 
 v_ffbl_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_floor_f16 v5, v1
-// GFX12: encoding: [0x01,0xb7,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, v1                  ; encoding: [0x01,0xb7,0x0a,0x7e]
 
 v_floor_f16 v5, v127
-// GFX12: encoding: [0x7f,0xb7,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, v127                ; encoding: [0x7f,0xb7,0x0a,0x7e]
 
 v_floor_f16 v5, s1
-// GFX12: encoding: [0x01,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, s1                  ; encoding: [0x01,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, s105
-// GFX12: encoding: [0x69,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, s105                ; encoding: [0x69,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, m0
-// GFX12: encoding: [0x7d,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, m0                  ; encoding: [0x7d,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, null
-// GFX12: encoding: [0x7c,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, null                ; encoding: [0x7c,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, -1
-// GFX12: encoding: [0xc1,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, -1                  ; encoding: [0xc1,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xb6,0x0a,0x7e]
 
 v_floor_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xb6,0x0a,0x7e]
+// GFX12: v_floor_f16_e32 v5, src_scc             ; encoding: [0xfd,0xb6,0x0a,0x7e]
 
 v_floor_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_floor_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_floor_f32 v5, v1
-// GFX12: encoding: [0x01,0x49,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, v1                  ; encoding: [0x01,0x49,0x0a,0x7e]
 
 v_floor_f32 v5, v255
-// GFX12: encoding: [0xff,0x49,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, v255                ; encoding: [0xff,0x49,0x0a,0x7e]
 
 v_floor_f32 v5, s1
-// GFX12: encoding: [0x01,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, s1                  ; encoding: [0x01,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, s105
-// GFX12: encoding: [0x69,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, s105                ; encoding: [0x69,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, m0
-// GFX12: encoding: [0x7d,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, m0                  ; encoding: [0x7d,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, null
-// GFX12: encoding: [0x7c,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, null                ; encoding: [0x7c,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, -1
-// GFX12: encoding: [0xc1,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, -1                  ; encoding: [0xc1,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x48,0x0a,0x7e]
 
 v_floor_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x48,0x0a,0x7e]
+// GFX12: v_floor_f32_e32 v5, src_scc             ; encoding: [0xfd,0x48,0x0a,0x7e]
 
 v_floor_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_floor_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_floor_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x35,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x35,0x0a,0x7e]
 
 v_floor_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x35,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x35,0x0a,0x7e]
 
 v_floor_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], null            ; encoding: [0x7c,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x34,0x0a,0x7e]
 
 v_floor_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x34,0x0a,0x7e]
+// GFX12: v_floor_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x34,0x0a,0x7e]
 
 v_floor_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_fract_f16 v5, v1
-// GFX12: encoding: [0x01,0xbf,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
 
 v_fract_f16 v5, v127
-// GFX12: encoding: [0x7f,0xbf,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
 
 v_fract_f16 v5, s1
-// GFX12: encoding: [0x01,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, s105
-// GFX12: encoding: [0x69,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, m0
-// GFX12: encoding: [0x7d,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, null
-// GFX12: encoding: [0x7c,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, -1
-// GFX12: encoding: [0xc1,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
 v_fract_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xbe,0x0a,0x7e]
+// GFX12: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
 v_fract_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32 v5, v1
-// GFX12: encoding: [0x01,0x41,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
 
 v_fract_f32 v5, v255
-// GFX12: encoding: [0xff,0x41,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, v255                ; encoding: [0xff,0x41,0x0a,0x7e]
 
 v_fract_f32 v5, s1
-// GFX12: encoding: [0x01,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, s1                  ; encoding: [0x01,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, s105
-// GFX12: encoding: [0x69,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, s105                ; encoding: [0x69,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, m0
-// GFX12: encoding: [0x7d,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, m0                  ; encoding: [0x7d,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, null
-// GFX12: encoding: [0x7c,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, null                ; encoding: [0x7c,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, -1
-// GFX12: encoding: [0xc1,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, -1                  ; encoding: [0xc1,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x40,0x0a,0x7e]
 
 v_fract_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x40,0x0a,0x7e]
+// GFX12: v_fract_f32_e32 v5, src_scc             ; encoding: [0xfd,0x40,0x0a,0x7e]
 
 v_fract_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_fract_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_fract_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x7d,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x7d,0x0a,0x7e]
 
 v_fract_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x7d,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x7d,0x0a,0x7e]
 
 v_fract_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], null            ; encoding: [0x7c,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x7c,0x0a,0x7e]
+// GFX12: v_fract_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x7c,0x0a,0x7e]
 
 v_fract_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_fract_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_frexp_exp_i16_f16 v5, v1
-// GFX12: encoding: [0x01,0xb5,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v1          ; encoding: [0x01,0xb5,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, v127
-// GFX12: encoding: [0x7f,0xb5,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v127        ; encoding: [0x7f,0xb5,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, s1
-// GFX12: encoding: [0x01,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s1          ; encoding: [0x01,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, s105
-// GFX12: encoding: [0x69,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s105        ; encoding: [0x69,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_lo      ; encoding: [0x6a,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_hi      ; encoding: [0x6b,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, ttmp15      ; encoding: [0x7b,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, m0
-// GFX12: encoding: [0x7d,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, m0          ; encoding: [0x7d,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_lo     ; encoding: [0x7e,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_hi     ; encoding: [0x7f,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, null
-// GFX12: encoding: [0x7c,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, null        ; encoding: [0x7c,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, -1
-// GFX12: encoding: [0xc1,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, -1          ; encoding: [0xc1,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, 0.5         ; encoding: [0xf0,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xb4,0x0a,0x7e]
+// GFX12: v_frexp_exp_i16_f16_e32 v5, src_scc     ; encoding: [0xfd,0xb4,0x0a,0x7e]
 
 v_frexp_exp_i16_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_frexp_exp_i16_f16_e32 v127, 0xfe0b    ; encoding: [0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_frexp_exp_i32_f32 v5, v1
-// GFX12: encoding: [0x01,0x7f,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v1          ; encoding: [0x01,0x7f,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, v255
-// GFX12: encoding: [0xff,0x7f,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v255        ; encoding: [0xff,0x7f,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, s1
-// GFX12: encoding: [0x01,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s1          ; encoding: [0x01,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, s105
-// GFX12: encoding: [0x69,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s105        ; encoding: [0x69,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, m0
-// GFX12: encoding: [0x7d,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, null
-// GFX12: encoding: [0x7c,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, null        ; encoding: [0x7c,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, -1
-// GFX12: encoding: [0xc1,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x7e,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x7e,0x0a,0x7e]
 
 v_frexp_exp_i32_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_frexp_exp_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_frexp_exp_i32_f64 v5, v[1:2]
-// GFX12: encoding: [0x01,0x79,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[1:2]      ; encoding: [0x01,0x79,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, v[254:255]
-// GFX12: encoding: [0xfe,0x79,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[254:255]  ; encoding: [0xfe,0x79,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, s[2:3]
-// GFX12: encoding: [0x02,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[2:3]      ; encoding: [0x02,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, s[104:105]
-// GFX12: encoding: [0x68,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[104:105]  ; encoding: [0x68,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, vcc
-// GFX12: encoding: [0x6a,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, vcc         ; encoding: [0x6a,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, ttmp[14:15]
-// GFX12: encoding: [0x7a,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, exec
-// GFX12: encoding: [0x7e,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, exec        ; encoding: [0x7e,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, null
-// GFX12: encoding: [0x7c,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, null        ; encoding: [0x7c,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, -1
-// GFX12: encoding: [0xc1,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, -1          ; encoding: [0xc1,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, 0.5
-// GFX12: encoding: [0xf0,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, 0.5         ; encoding: [0xf0,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v5, src_scc
-// GFX12: encoding: [0xfd,0x78,0x0a,0x7e]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, src_scc     ; encoding: [0xfd,0x78,0x0a,0x7e]
 
 v_frexp_exp_i32_f64 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_frexp_mant_f16 v5, v1
-// GFX12: encoding: [0x01,0xb3,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, v127
-// GFX12: encoding: [0x7f,0xb3,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, s1
-// GFX12: encoding: [0x01,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, s105
-// GFX12: encoding: [0x69,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, m0
-// GFX12: encoding: [0x7d,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, null
-// GFX12: encoding: [0x7c,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, -1
-// GFX12: encoding: [0xc1,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xb2,0x0a,0x7e]
+// GFX12: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
 v_frexp_mant_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1
-// GFX12: encoding: [0x01,0x81,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, v255
-// GFX12: encoding: [0xff,0x81,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, v255           ; encoding: [0xff,0x81,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, s1
-// GFX12: encoding: [0x01,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, s1             ; encoding: [0x01,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, s105
-// GFX12: encoding: [0x69,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, s105           ; encoding: [0x69,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_lo         ; encoding: [0x6a,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_hi         ; encoding: [0x6b,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, ttmp15         ; encoding: [0x7b,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, m0
-// GFX12: encoding: [0x7d,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, m0             ; encoding: [0x7d,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, exec_lo        ; encoding: [0x7e,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, exec_hi        ; encoding: [0x7f,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, null
-// GFX12: encoding: [0x7c,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, null           ; encoding: [0x7c,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, -1
-// GFX12: encoding: [0xc1,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, -1             ; encoding: [0xc1,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, 0.5            ; encoding: [0xf0,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x80,0x0a,0x7e]
+// GFX12: v_frexp_mant_f32_e32 v5, src_scc        ; encoding: [0xfd,0x80,0x0a,0x7e]
 
 v_frexp_mant_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_frexp_mant_f32_e32 v255, 0xaf123456   ; encoding: [0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_frexp_mant_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x7b,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[1:2]     ; encoding: [0x01,0x7b,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x7b,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x7b,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[2:3]     ; encoding: [0x02,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], vcc        ; encoding: [0x6a,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], exec       ; encoding: [0x7e,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], null       ; encoding: [0x7c,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], -1         ; encoding: [0xc1,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], 0.5        ; encoding: [0xf0,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x7a,0x0a,0x7e]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], src_scc    ; encoding: [0xfd,0x7a,0x0a,0x7e]
 
 v_frexp_mant_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_frexp_mant_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_log_f16 v5, v1
-// GFX12: encoding: [0x01,0xaf,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, v1                    ; encoding: [0x01,0xaf,0x0a,0x7e]
 
 v_log_f16 v5, v127
-// GFX12: encoding: [0x7f,0xaf,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, v127                  ; encoding: [0x7f,0xaf,0x0a,0x7e]
 
 v_log_f16 v5, s1
-// GFX12: encoding: [0x01,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, s1                    ; encoding: [0x01,0xae,0x0a,0x7e]
 
 v_log_f16 v5, s105
-// GFX12: encoding: [0x69,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, s105                  ; encoding: [0x69,0xae,0x0a,0x7e]
 
 v_log_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xae,0x0a,0x7e]
 
 v_log_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xae,0x0a,0x7e]
 
 v_log_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xae,0x0a,0x7e]
 
 v_log_f16 v5, m0
-// GFX12: encoding: [0x7d,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, m0                    ; encoding: [0x7d,0xae,0x0a,0x7e]
 
 v_log_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xae,0x0a,0x7e]
 
 v_log_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xae,0x0a,0x7e]
 
 v_log_f16 v5, null
-// GFX12: encoding: [0x7c,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, null                  ; encoding: [0x7c,0xae,0x0a,0x7e]
 
 v_log_f16 v5, -1
-// GFX12: encoding: [0xc1,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, -1                    ; encoding: [0xc1,0xae,0x0a,0x7e]
 
 v_log_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xae,0x0a,0x7e]
 
 v_log_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xae,0x0a,0x7e]
+// GFX12: v_log_f16_e32 v5, src_scc               ; encoding: [0xfd,0xae,0x0a,0x7e]
 
 v_log_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_log_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_log_f32 v5, v1
-// GFX12: encoding: [0x01,0x4f,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, v1                    ; encoding: [0x01,0x4f,0x0a,0x7e]
 
 v_log_f32 v5, v255
-// GFX12: encoding: [0xff,0x4f,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, v255                  ; encoding: [0xff,0x4f,0x0a,0x7e]
 
 v_log_f32 v5, s1
-// GFX12: encoding: [0x01,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, s1                    ; encoding: [0x01,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, s105
-// GFX12: encoding: [0x69,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, s105                  ; encoding: [0x69,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, m0
-// GFX12: encoding: [0x7d,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, m0                    ; encoding: [0x7d,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, null
-// GFX12: encoding: [0x7c,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, null                  ; encoding: [0x7c,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, -1
-// GFX12: encoding: [0xc1,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, -1                    ; encoding: [0xc1,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4e,0x0a,0x7e]
 
 v_log_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x4e,0x0a,0x7e]
+// GFX12: v_log_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4e,0x0a,0x7e]
 
 v_log_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_log_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_mov_b32 v5, v1
-// GFX12: encoding: [0x01,0x03,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, v1                    ; encoding: [0x01,0x03,0x0a,0x7e]
 
 v_mov_b32 v5, v255
-// GFX12: encoding: [0xff,0x03,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, v255                  ; encoding: [0xff,0x03,0x0a,0x7e]
 
 v_mov_b32 v5, s1
-// GFX12: encoding: [0x01,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, s1                    ; encoding: [0x01,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, s105
-// GFX12: encoding: [0x69,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, s105                  ; encoding: [0x69,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, m0
-// GFX12: encoding: [0x7d,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, m0                    ; encoding: [0x7d,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, null
-// GFX12: encoding: [0x7c,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, null                  ; encoding: [0x7c,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, -1
-// GFX12: encoding: [0xc1,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, -1                    ; encoding: [0xc1,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x02,0x0a,0x7e]
 
 v_mov_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x02,0x0a,0x7e]
+// GFX12: v_mov_b32_e32 v5, src_scc               ; encoding: [0xfd,0x02,0x0a,0x7e]
 
 v_mov_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_mov_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_movreld_b32 v5, v1
-// GFX12: encoding: [0x01,0x85,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, v1                ; encoding: [0x01,0x85,0x0a,0x7e]
 
 v_movreld_b32 v5, v255
-// GFX12: encoding: [0xff,0x85,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, v255              ; encoding: [0xff,0x85,0x0a,0x7e]
 
 v_movreld_b32 v5, s1
-// GFX12: encoding: [0x01,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, s1                ; encoding: [0x01,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, s105
-// GFX12: encoding: [0x69,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, s105              ; encoding: [0x69,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, m0
-// GFX12: encoding: [0x7d,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, m0                ; encoding: [0x7d,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, null
-// GFX12: encoding: [0x7c,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, null              ; encoding: [0x7c,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, -1
-// GFX12: encoding: [0xc1,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, -1                ; encoding: [0xc1,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, 0.5               ; encoding: [0xf0,0x84,0x0a,0x7e]
 
 v_movreld_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x84,0x0a,0x7e]
+// GFX12: v_movreld_b32_e32 v5, src_scc           ; encoding: [0xfd,0x84,0x0a,0x7e]
 
 v_movreld_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_movreld_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_movrels_b32 v5, v1
-// GFX12: encoding: [0x01,0x87,0x0a,0x7e]
+// GFX12: v_movrels_b32_e32 v5, v1                ; encoding: [0x01,0x87,0x0a,0x7e]
 
 v_movrels_b32 v255, v255
-// GFX12: encoding: [0xff,0x87,0xfe,0x7f]
+// GFX12: v_movrels_b32_e32 v255, v255            ; encoding: [0xff,0x87,0xfe,0x7f]
 
 v_movrelsd_2_b32 v5, v1
-// GFX12: encoding: [0x01,0x91,0x0a,0x7e]
+// GFX12: v_movrelsd_2_b32_e32 v5, v1             ; encoding: [0x01,0x91,0x0a,0x7e]
 
 v_movrelsd_2_b32 v255, v255
-// GFX12: encoding: [0xff,0x91,0xfe,0x7f]
+// GFX12: v_movrelsd_2_b32_e32 v255, v255         ; encoding: [0xff,0x91,0xfe,0x7f]
 
 v_movrelsd_b32 v5, v1
-// GFX12: encoding: [0x01,0x89,0x0a,0x7e]
+// GFX12: v_movrelsd_b32_e32 v5, v1               ; encoding: [0x01,0x89,0x0a,0x7e]
 
 v_movrelsd_b32 v255, v255
-// GFX12: encoding: [0xff,0x89,0xfe,0x7f]
+// GFX12: v_movrelsd_b32_e32 v255, v255           ; encoding: [0xff,0x89,0xfe,0x7f]
 
 v_nop
-// GFX12: encoding: [0x00,0x00,0x00,0x7e]
+// GFX12: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
 
 v_not_b16 v5, v1
-// GFX12: encoding: [0x01,0xd3,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
 
 v_not_b16 v5, v127
-// GFX12: encoding: [0x7f,0xd3,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
 
 v_not_b16 v5, s1
-// GFX12: encoding: [0x01,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, s105
-// GFX12: encoding: [0x69,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, m0
-// GFX12: encoding: [0x7d,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, null
-// GFX12: encoding: [0x7c,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, -1
-// GFX12: encoding: [0xc1,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
 v_not_b16 v5, 0.5
-// GFX12: encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-ASM: v_not_b16_e32 v5, 0.5                   ; encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-DIS: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 v_not_b16 v5, src_scc
-// GFX12: encoding: [0xfd,0xd2,0x0a,0x7e]
+// GFX12: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
 v_not_b16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_not_b32 v5, v1
-// GFX12: encoding: [0x01,0x6f,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
 
 v_not_b32 v5, v255
-// GFX12: encoding: [0xff,0x6f,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, v255                  ; encoding: [0xff,0x6f,0x0a,0x7e]
 
 v_not_b32 v5, s1
-// GFX12: encoding: [0x01,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, s1                    ; encoding: [0x01,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, s105
-// GFX12: encoding: [0x69,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, s105                  ; encoding: [0x69,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, m0
-// GFX12: encoding: [0x7d,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, m0                    ; encoding: [0x7d,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, null
-// GFX12: encoding: [0x7c,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, null                  ; encoding: [0x7c,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, -1
-// GFX12: encoding: [0xc1,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, -1                    ; encoding: [0xc1,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, 0.5
-// GFX12: encoding: [0xf0,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x6e,0x0a,0x7e]
 
 v_not_b32 v5, src_scc
-// GFX12: encoding: [0xfd,0x6e,0x0a,0x7e]
+// GFX12: v_not_b32_e32 v5, src_scc               ; encoding: [0xfd,0x6e,0x0a,0x7e]
 
 v_not_b32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_not_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_permlane64_b32 v5, v1
-// GFX12: encoding: [0x01,0xcf,0x0a,0x7e]
+// GFX12: v_permlane64_b32 v5, v1                 ; encoding: [0x01,0xcf,0x0a,0x7e]
 
 v_permlane64_b32 v255, v255
-// GFX12: encoding: [0xff,0xcf,0xfe,0x7f]
+// GFX12: v_permlane64_b32 v255, v255             ; encoding: [0xff,0xcf,0xfe,0x7f]
 
 v_pipeflush
-// GFX12: encoding: [0x00,0x36,0x00,0x7e]
+// GFX12: v_pipeflush                             ; encoding: [0x00,0x36,0x00,0x7e]
 
 v_rcp_f16 v5, v1
-// GFX12: encoding: [0x01,0xa9,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, v1                    ; encoding: [0x01,0xa9,0x0a,0x7e]
 
 v_rcp_f16 v5, v127
-// GFX12: encoding: [0x7f,0xa9,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, v127                  ; encoding: [0x7f,0xa9,0x0a,0x7e]
 
 v_rcp_f16 v5, s1
-// GFX12: encoding: [0x01,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, s1                    ; encoding: [0x01,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, s105
-// GFX12: encoding: [0x69,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, s105                  ; encoding: [0x69,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, m0
-// GFX12: encoding: [0x7d,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, m0                    ; encoding: [0x7d,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, null
-// GFX12: encoding: [0x7c,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, null                  ; encoding: [0x7c,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, -1
-// GFX12: encoding: [0xc1,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, -1                    ; encoding: [0xc1,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xa8,0x0a,0x7e]
+// GFX12: v_rcp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xa8,0x0a,0x7e]
 
 v_rcp_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_rcp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_rcp_f32 v5, v1
-// GFX12: encoding: [0x01,0x55,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, v1                    ; encoding: [0x01,0x55,0x0a,0x7e]
 
 v_rcp_f32 v5, v255
-// GFX12: encoding: [0xff,0x55,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, v255                  ; encoding: [0xff,0x55,0x0a,0x7e]
 
 v_rcp_f32 v5, s1
-// GFX12: encoding: [0x01,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, s1                    ; encoding: [0x01,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, s105
-// GFX12: encoding: [0x69,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, s105                  ; encoding: [0x69,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, m0
-// GFX12: encoding: [0x7d,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, m0                    ; encoding: [0x7d,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, null
-// GFX12: encoding: [0x7c,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, null                  ; encoding: [0x7c,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, -1
-// GFX12: encoding: [0xc1,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, -1                    ; encoding: [0xc1,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x54,0x0a,0x7e]
 
 v_rcp_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x54,0x0a,0x7e]
+// GFX12: v_rcp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x54,0x0a,0x7e]
 
 v_rcp_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rcp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_rcp_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x5f,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x5f,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x5f,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x5f,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], null              ; encoding: [0x7c,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x5e,0x0a,0x7e]
+// GFX12: v_rcp_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x5e,0x0a,0x7e]
 
 v_rcp_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rcp_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_rcp_iflag_f32 v5, v1
-// GFX12: encoding: [0x01,0x57,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, v1              ; encoding: [0x01,0x57,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, v255
-// GFX12: encoding: [0xff,0x57,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, v255            ; encoding: [0xff,0x57,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, s1
-// GFX12: encoding: [0x01,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, s1              ; encoding: [0x01,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, s105
-// GFX12: encoding: [0x69,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, s105            ; encoding: [0x69,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_lo          ; encoding: [0x6a,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_hi          ; encoding: [0x6b,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, ttmp15          ; encoding: [0x7b,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, m0
-// GFX12: encoding: [0x7d,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, m0              ; encoding: [0x7d,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_lo         ; encoding: [0x7e,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_hi         ; encoding: [0x7f,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, null
-// GFX12: encoding: [0x7c,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, null            ; encoding: [0x7c,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, -1
-// GFX12: encoding: [0xc1,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, -1              ; encoding: [0xc1,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, 0.5             ; encoding: [0xf0,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x56,0x0a,0x7e]
+// GFX12: v_rcp_iflag_f32_e32 v5, src_scc         ; encoding: [0xfd,0x56,0x0a,0x7e]
 
 v_rcp_iflag_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rcp_iflag_f32_e32 v255, 0xaf123456    ; encoding: [0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_readfirstlane_b32 s5, v1
-// GFX12: encoding: [0x01,0x05,0x0a,0x7e]
+// GFX12: v_readfirstlane_b32 s5, v1              ; encoding: [0x01,0x05,0x0a,0x7e]
 
 v_readfirstlane_b32 s105, v1
-// GFX12: encoding: [0x01,0x05,0xd2,0x7e]
+// GFX12: v_readfirstlane_b32 s105, v1            ; encoding: [0x01,0x05,0xd2,0x7e]
 
 v_readfirstlane_b32 vcc_lo, v1
-// GFX12: encoding: [0x01,0x05,0xd4,0x7e]
+// GFX12: v_readfirstlane_b32 vcc_lo, v1          ; encoding: [0x01,0x05,0xd4,0x7e]
 
 v_readfirstlane_b32 vcc_hi, v1
-// GFX12: encoding: [0x01,0x05,0xd6,0x7e]
+// GFX12: v_readfirstlane_b32 vcc_hi, v1          ; encoding: [0x01,0x05,0xd6,0x7e]
 
 v_readfirstlane_b32 ttmp15, v1
-// GFX12: encoding: [0x01,0x05,0xf6,0x7e]
+// GFX12: v_readfirstlane_b32 ttmp15, v1          ; encoding: [0x01,0x05,0xf6,0x7e]
 
 v_readfirstlane_b32 null, v255
-// GFX12: encoding: [0xff,0x05,0xf8,0x7e]
+// GFX12: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
 v_rndne_f16 v5, v1
-// GFX12: encoding: [0x01,0xbd,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
 
 v_rndne_f16 v5, v127
-// GFX12: encoding: [0x7f,0xbd,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
 v_rndne_f16 v5, s1
-// GFX12: encoding: [0x01,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, s105
-// GFX12: encoding: [0x69,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, m0
-// GFX12: encoding: [0x7d,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, null
-// GFX12: encoding: [0x7c,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, -1
-// GFX12: encoding: [0xc1,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xbc,0x0a,0x7e]
+// GFX12: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
 v_rndne_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
-// GFX12: encoding: [0x01,0x47,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
 
 v_rndne_f32 v5, v255
-// GFX12: encoding: [0xff,0x47,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, v255                ; encoding: [0xff,0x47,0x0a,0x7e]
 
 v_rndne_f32 v5, s1
-// GFX12: encoding: [0x01,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, s1                  ; encoding: [0x01,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, s105
-// GFX12: encoding: [0x69,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, s105                ; encoding: [0x69,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, m0
-// GFX12: encoding: [0x7d,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, m0                  ; encoding: [0x7d,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, null
-// GFX12: encoding: [0x7c,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, null                ; encoding: [0x7c,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, -1
-// GFX12: encoding: [0xc1,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, -1                  ; encoding: [0xc1,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x46,0x0a,0x7e]
 
 v_rndne_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x46,0x0a,0x7e]
+// GFX12: v_rndne_f32_e32 v5, src_scc             ; encoding: [0xfd,0x46,0x0a,0x7e]
 
 v_rndne_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rndne_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_rndne_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x33,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x33,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x33,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x33,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], null            ; encoding: [0x7c,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x32,0x0a,0x7e]
+// GFX12: v_rndne_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x32,0x0a,0x7e]
 
 v_rndne_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rndne_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_rsq_f16 v5, v1
-// GFX12: encoding: [0x01,0xad,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, v1                    ; encoding: [0x01,0xad,0x0a,0x7e]
 
 v_rsq_f16 v5, v127
-// GFX12: encoding: [0x7f,0xad,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, v127                  ; encoding: [0x7f,0xad,0x0a,0x7e]
 
 v_rsq_f16 v5, s1
-// GFX12: encoding: [0x01,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, s1                    ; encoding: [0x01,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, s105
-// GFX12: encoding: [0x69,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, s105                  ; encoding: [0x69,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, m0
-// GFX12: encoding: [0x7d,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, m0                    ; encoding: [0x7d,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, null
-// GFX12: encoding: [0x7c,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, null                  ; encoding: [0x7c,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, -1
-// GFX12: encoding: [0xc1,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, -1                    ; encoding: [0xc1,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xac,0x0a,0x7e]
 
 v_rsq_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xac,0x0a,0x7e]
+// GFX12: v_rsq_f16_e32 v5, src_scc               ; encoding: [0xfd,0xac,0x0a,0x7e]
 
 v_rsq_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_rsq_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_rsq_f32 v5, v1
-// GFX12: encoding: [0x01,0x5d,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, v1                    ; encoding: [0x01,0x5d,0x0a,0x7e]
 
 v_rsq_f32 v5, v255
-// GFX12: encoding: [0xff,0x5d,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, v255                  ; encoding: [0xff,0x5d,0x0a,0x7e]
 
 v_rsq_f32 v5, s1
-// GFX12: encoding: [0x01,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, s1                    ; encoding: [0x01,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, s105
-// GFX12: encoding: [0x69,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, s105                  ; encoding: [0x69,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, m0
-// GFX12: encoding: [0x7d,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, m0                    ; encoding: [0x7d,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, null
-// GFX12: encoding: [0x7c,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, null                  ; encoding: [0x7c,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, -1
-// GFX12: encoding: [0xc1,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, -1                    ; encoding: [0xc1,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x5c,0x0a,0x7e]
+// GFX12: v_rsq_f32_e32 v5, src_scc               ; encoding: [0xfd,0x5c,0x0a,0x7e]
 
 v_rsq_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rsq_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_rsq_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x63,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x63,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x63,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x63,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], null              ; encoding: [0x7c,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x62,0x0a,0x7e]
+// GFX12: v_rsq_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x62,0x0a,0x7e]
 
 v_rsq_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_sat_pk_u8_i16 v5, v1
-// GFX12: encoding: [0x01,0xc5,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, v255
-// GFX12: encoding: [0xff,0xc5,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s1
-// GFX12: encoding: [0x01,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s105
-// GFX12: encoding: [0x69,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, m0
-// GFX12: encoding: [0x7d,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, null
-// GFX12: encoding: [0x7c,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, -1
-// GFX12: encoding: [0xc1,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, 0.5
-// GFX12: encoding: [0xf0,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, src_scc
-// GFX12: encoding: [0xfd,0xc4,0x0a,0x7e]
+// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_sin_f16 v5, v1
-// GFX12: encoding: [0x01,0xc1,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
 
 v_sin_f16 v5, v127
-// GFX12: encoding: [0x7f,0xc1,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
 v_sin_f16 v5, s1
-// GFX12: encoding: [0x01,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, s105
-// GFX12: encoding: [0x69,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, m0
-// GFX12: encoding: [0x7d,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, null
-// GFX12: encoding: [0x7c,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, -1
-// GFX12: encoding: [0xc1,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
 v_sin_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xc0,0x0a,0x7e]
+// GFX12: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
 v_sin_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32 v5, v1
-// GFX12: encoding: [0x01,0x6b,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
 
 v_sin_f32 v5, v255
-// GFX12: encoding: [0xff,0x6b,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, v255                  ; encoding: [0xff,0x6b,0x0a,0x7e]
 
 v_sin_f32 v5, s1
-// GFX12: encoding: [0x01,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, s1                    ; encoding: [0x01,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, s105
-// GFX12: encoding: [0x69,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, s105                  ; encoding: [0x69,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, m0
-// GFX12: encoding: [0x7d,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, m0                    ; encoding: [0x7d,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, null
-// GFX12: encoding: [0x7c,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, null                  ; encoding: [0x7c,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, -1
-// GFX12: encoding: [0xc1,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, -1                    ; encoding: [0xc1,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6a,0x0a,0x7e]
 
 v_sin_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x6a,0x0a,0x7e]
+// GFX12: v_sin_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6a,0x0a,0x7e]
 
 v_sin_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_sin_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_sqrt_f16 v5, v1
-// GFX12: encoding: [0x01,0xab,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, v1                   ; encoding: [0x01,0xab,0x0a,0x7e]
 
 v_sqrt_f16 v5, v127
-// GFX12: encoding: [0x7f,0xab,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, v127                 ; encoding: [0x7f,0xab,0x0a,0x7e]
 
 v_sqrt_f16 v5, s1
-// GFX12: encoding: [0x01,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, s1                   ; encoding: [0x01,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, s105
-// GFX12: encoding: [0x69,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, s105                 ; encoding: [0x69,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, m0
-// GFX12: encoding: [0x7d,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, m0                   ; encoding: [0x7d,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, null
-// GFX12: encoding: [0x7c,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, null                 ; encoding: [0x7c,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, -1
-// GFX12: encoding: [0xc1,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, -1                   ; encoding: [0xc1,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xaa,0x0a,0x7e]
+// GFX12: v_sqrt_f16_e32 v5, src_scc              ; encoding: [0xfd,0xaa,0x0a,0x7e]
 
 v_sqrt_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_sqrt_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_sqrt_f32 v5, v1
-// GFX12: encoding: [0x01,0x67,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, v1                   ; encoding: [0x01,0x67,0x0a,0x7e]
 
 v_sqrt_f32 v5, v255
-// GFX12: encoding: [0xff,0x67,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, v255                 ; encoding: [0xff,0x67,0x0a,0x7e]
 
 v_sqrt_f32 v5, s1
-// GFX12: encoding: [0x01,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, s1                   ; encoding: [0x01,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, s105
-// GFX12: encoding: [0x69,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, s105                 ; encoding: [0x69,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, m0
-// GFX12: encoding: [0x7d,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, m0                   ; encoding: [0x7d,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, null
-// GFX12: encoding: [0x7c,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, null                 ; encoding: [0x7c,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, -1
-// GFX12: encoding: [0xc1,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, -1                   ; encoding: [0xc1,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x66,0x0a,0x7e]
+// GFX12: v_sqrt_f32_e32 v5, src_scc              ; encoding: [0xfd,0x66,0x0a,0x7e]
 
 v_sqrt_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_sqrt_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_sqrt_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x69,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x69,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x69,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x69,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], null             ; encoding: [0x7c,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x68,0x0a,0x7e]
+// GFX12: v_sqrt_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x68,0x0a,0x7e]
 
 v_sqrt_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_sqrt_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_swap_b16 v5.l, v1.h
+// GFX12: v_swap_b16  v5.l, v1.h                  ; encoding: [0x81,0xcd,0x0a,0x7e]
+
+v_swap_b16 v5.h, v1.l
+// GFX12: v_swap_b16  v5.h, v1.l                  ; encoding: [0x01,0xcd,0x0a,0x7f]
+
+v_swap_b16 v127.l, v127.l
+// GFX12: v_swap_b16  v127.l, v127.l              ; encoding: [0x7f,0xcd,0xfe,0x7e]
 
 v_swap_b32 v5, v1
-// GFX12: encoding: [0x01,0xcb,0x0a,0x7e]
+// GFX12: v_swap_b32 v5, v1                       ; encoding: [0x01,0xcb,0x0a,0x7e]
 
 v_swap_b32 v255, v255
-// GFX12: encoding: [0xff,0xcb,0xfe,0x7f]
+// GFX12: v_swap_b32 v255, v255                   ; encoding: [0xff,0xcb,0xfe,0x7f]
 
 v_swaprel_b32 v5, v1
-// GFX12: encoding: [0x01,0xd1,0x0a,0x7e]
+// GFX12: v_swaprel_b32 v5, v1                    ; encoding: [0x01,0xd1,0x0a,0x7e]
 
 v_swaprel_b32 v255, v255
-// GFX12: encoding: [0xff,0xd1,0xfe,0x7f]
+// GFX12: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
 
 v_trunc_f16 v5, v1
-// GFX12: encoding: [0x01,0xbb,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
 
 v_trunc_f16 v5, v127
-// GFX12: encoding: [0x7f,0xbb,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
 
 v_trunc_f16 v5, s1
-// GFX12: encoding: [0x01,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, s105
-// GFX12: encoding: [0x69,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, vcc_lo
-// GFX12: encoding: [0x6a,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, vcc_hi
-// GFX12: encoding: [0x6b,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, ttmp15
-// GFX12: encoding: [0x7b,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, m0
-// GFX12: encoding: [0x7d,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, exec_lo
-// GFX12: encoding: [0x7e,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, exec_hi
-// GFX12: encoding: [0x7f,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, null
-// GFX12: encoding: [0x7c,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, -1
-// GFX12: encoding: [0xc1,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, 0.5
-// GFX12: encoding: [0xf0,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
 
 v_trunc_f16 v5, src_scc
-// GFX12: encoding: [0xfd,0xba,0x0a,0x7e]
+// GFX12: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
 
 v_trunc_f16 v127, 0xfe0b
-// GFX12: encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32 v5, v1
-// GFX12: encoding: [0x01,0x43,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
 
 v_trunc_f32 v5, v255
-// GFX12: encoding: [0xff,0x43,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, v255                ; encoding: [0xff,0x43,0x0a,0x7e]
 
 v_trunc_f32 v5, s1
-// GFX12: encoding: [0x01,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, s1                  ; encoding: [0x01,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, s105
-// GFX12: encoding: [0x69,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, s105                ; encoding: [0x69,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, vcc_lo
-// GFX12: encoding: [0x6a,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, vcc_hi
-// GFX12: encoding: [0x6b,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, ttmp15
-// GFX12: encoding: [0x7b,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, m0
-// GFX12: encoding: [0x7d,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, m0                  ; encoding: [0x7d,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, exec_lo
-// GFX12: encoding: [0x7e,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, exec_hi
-// GFX12: encoding: [0x7f,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, null
-// GFX12: encoding: [0x7c,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, null                ; encoding: [0x7c,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, -1
-// GFX12: encoding: [0xc1,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, -1                  ; encoding: [0xc1,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, 0.5
-// GFX12: encoding: [0xf0,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x42,0x0a,0x7e]
 
 v_trunc_f32 v5, src_scc
-// GFX12: encoding: [0xfd,0x42,0x0a,0x7e]
+// GFX12: v_trunc_f32_e32 v5, src_scc             ; encoding: [0xfd,0x42,0x0a,0x7e]
 
 v_trunc_f32 v255, 0xaf123456
-// GFX12: encoding: [0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_trunc_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 v_trunc_f64 v[5:6], v[1:2]
-// GFX12: encoding: [0x01,0x2f,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x2f,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], v[254:255]
-// GFX12: encoding: [0xfe,0x2f,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x2f,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], s[2:3]
-// GFX12: encoding: [0x02,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], s[104:105]
-// GFX12: encoding: [0x68,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], vcc
-// GFX12: encoding: [0x6a,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], ttmp[14:15]
-// GFX12: encoding: [0x7a,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], exec
-// GFX12: encoding: [0x7e,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], null
-// GFX12: encoding: [0x7c,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], null            ; encoding: [0x7c,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], -1
-// GFX12: encoding: [0xc1,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], 0.5
-// GFX12: encoding: [0xf0,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[5:6], src_scc
-// GFX12: encoding: [0xfd,0x2e,0x0a,0x7e]
+// GFX12: v_trunc_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x2e,0x0a,0x7e]
 
 v_trunc_f64 v[254:255], 0xaf123456
-// GFX12: encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+// GFX12: v_trunc_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s
new file mode 100644
index 0000000000000..9b181e98231ae
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s
@@ -0,0 +1,2828 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_bfrev_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_bfrev_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_bfrev_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_bfrev_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ceil_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ceil_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ceil_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ceil_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_ceil_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ceil_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ceil_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ceil_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ceil_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x44,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cls_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cls_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cls_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cls_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cls_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cls_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cls_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cls_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x76,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_clz_i32_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_clz_i32_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_clz_i32_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cos_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cos_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cos_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cos_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cos_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cos_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cos_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cos_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cos_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cos_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_ctz_i32_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ctz_i32_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ctz_i32_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc
+// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac]
+
+v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe
+// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e]
+
+v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc
+// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac]
+
+v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe
+// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e]
+
+v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_f32 v127, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x14,0xfe,0x7e,0xff,0x6f,0x35,0x30]
+
+v_cvt_f16_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_cvt_f16_u16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_u16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_u16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa0,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_cvt_f32_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_f16 v255, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x16,0xfe,0x7f,0x7f,0x6f,0x35,0x30]
+
+v_cvt_f32_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0a,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0c,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte0 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte0 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte0 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x22,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte1 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte1 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte1 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x24,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte2 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte2 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte2 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x26,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte3 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte3 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte3 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x28,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_floor_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_floor_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_floor_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_flr_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_flr_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_flr_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+
+v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_nearest_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_nearest_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x18,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_norm_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_norm_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_norm_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_norm_u16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_norm_u16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_norm_u16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_off_f32_i4 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_off_f32_i4 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_off_f32_i4 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_off_f32_i4 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1c,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_rpi_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_rpi_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_rpi_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x18,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_u16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_u32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+
+v_exp_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_exp_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_exp_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_exp_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_exp_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_exp_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_exp_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_exp_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_exp_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_exp_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_exp_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_exp_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_exp_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_exp_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_exp_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_exp_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_ffbh_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbh_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbh_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbh_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbh_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x76,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ffbh_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbh_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbh_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbh_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbh_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ffbl_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbl_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbl_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbl_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbl_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_floor_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_floor_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_floor_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_floor_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_floor_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_floor_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_floor_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_floor_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_floor_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_floor_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_floor_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_floor_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_fract_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_fract_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_fract_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_fract_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_fract_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x40,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_frexp_exp_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_exp_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_exp_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_frexp_exp_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_log_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_log_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_log_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_log_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_log_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_log_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_log_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_log_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_log_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_log_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_log_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_log_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_log_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_log_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_log_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_log_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_log_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_log_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_log_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_log_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_log_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_log_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_log_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_log_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_log_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_log_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_log_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_log_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x4e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_mov_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_mov_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_mov_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_mov_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_mov_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_mov_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_mov_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_mov_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x02,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movreld_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movreld_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movreld_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movreld_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movreld_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x84,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrels_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrels_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrels_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrels_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrels_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x86,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrelsd_2_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrelsd_2_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrelsd_2_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrelsd_2_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x90,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrelsd_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrelsd_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrelsd_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_not_b16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_not_b16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_not_b16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_not_b16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_not_b16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_not_b16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_not_b16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_not_b16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_not_b16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_not_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_not_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_not_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_not_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_not_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_not_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_not_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_not_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_not_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_not_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_not_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_not_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_not_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_rcp_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rcp_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x54,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rcp_iflag_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_iflag_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rndne_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rndne_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rndne_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rndne_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rsq_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rsq_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rsq_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rsq_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rsq_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rsq_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rsq_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rsq_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
+
+v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sin_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sin_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sin_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sin_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sin_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sin_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sin_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sin_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sin_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_sqrt_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sqrt_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sqrt_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sqrt_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sqrt_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sqrt_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sqrt_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sqrt_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_trunc_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_trunc_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x42,0xfe,0x7f,0xff,0x6f,0x35,0x30]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index d5cafcd4c3874..323439b6edd53 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -1,5 +1,7 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+// this file will be converted to true16 format when more true16 instructions are supported
 
 v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s
new file mode 100644
index 0000000000000..82acbd421bae2
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s
@@ -0,0 +1,617 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_bfrev_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_bfrev_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ceil_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_ceil_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x44,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cls_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cls_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cls_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x76,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa]
+
+v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa]
+
+v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x14,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+v_cvt_f16_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_f16_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_f32_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x16,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x16,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_f16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x16,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_cvt_f32_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte0 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x22,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte0 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x22,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte0 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x22,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte1 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x24,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte1 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x24,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte1 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x24,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte2 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x26,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte2 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x26,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte2 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x26,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte3 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x28,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte3 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x28,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte3 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x28,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_floor_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_floor_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_floor_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_flr_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_flr_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_flr_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x10,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x10,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_nearest_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x18,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_norm_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_norm_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_u16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_off_f32_i4 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_off_f32_i4 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_off_f32_i4 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_rpi_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_rpi_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_rpi_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x18,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_exp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbh_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x76,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbh_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbl_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbl_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbl_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_floor_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x40,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x7e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x7e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_log_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_log_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x4e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_mov_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x02,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_mov_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x02,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_mov_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x02,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movreld_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x84,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movreld_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x84,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movreld_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x84,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrels_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x86,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrels_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x86,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrels_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x86,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrelsd_2_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x90,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_2_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x90,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_2_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x90,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x88,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x88,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rcp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rcp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x54,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x56,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x56,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rsq_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_sqrt_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x42,0xfe,0x7f,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index 4c884018bc5a8..fa3234d8f9b2c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -1,5 +1,7 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+// this file will be converted to true16 format when more true16 instructions are supported
 
 v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s
new file mode 100644
index 0000000000000..d3aa94d373e80
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s
@@ -0,0 +1,505 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
+
+v_ceil_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v128, 0xaf123456
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v255, v255
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f32_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i32_i16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u32_u16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_exp_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_exp_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_not_b16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rcp_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rcp_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sat_pk_u8_i16_e32 v199, v5
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v128, 0xaf123456 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v255 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f32_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v128, 0xaf123456 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f32_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index 37edf627e3667..2fde8fd103750 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
 
 v_ceil_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -201,6 +201,21 @@ v_sqrt_f16_e32 v255, v1
 v_sqrt_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+v_swap_b16_e32 v128.l, v0.l
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, v255.l
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, s0
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_err.s b/llvm/test/MC/AMDGPU/gfx12_err.s
index 8b2565cb7f569..d8578d87279d1 100644
--- a/llvm/test/MC/AMDGPU/gfx12_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_err.s
@@ -118,10 +118,10 @@ s_load_b128 s[20:23], s[2:3], vcc_lo th:TH_LOAD_NT_HT
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for SMEM instruction
 
 image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT scope:SCOPE_SE th:TH_LOAD_HT
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D scope:SCOPE_SE th:TH_LOAD_HT scope:SCOPE_SE
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 s_prefetch_inst s[14:15], 0xffffff, m0, 7
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 24-bit signed offset
diff --git a/llvm/test/MC/AMDGPU/lit.local.cfg b/llvm/test/MC/AMDGPU/lit.local.cfg
index 7c492428aec76..c5853ad178a04 100644
--- a/llvm/test/MC/AMDGPU/lit.local.cfg
+++ b/llvm/test/MC/AMDGPU/lit.local.cfg
@@ -1,2 +1,4 @@
+config.substitutions.append(("%extract-encodings", "sed 's/.*encoding://p'"))
+
 if not "AMDGPU" in config.root.targets:
     config.unsupported = True
diff --git a/llvm/test/MC/AMDGPU/reloc-operands-gfx10.s b/llvm/test/MC/AMDGPU/reloc-operands-gfx10.s
new file mode 100644
index 0000000000000..b0fe71ddadb3b
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/reloc-operands-gfx10.s
@@ -0,0 +1,31 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s
+
+; test vop3 operands
+
+// GFX10: v_mad_u32_u24 v0, g0@abs32@lo, v0, 12   ; encoding: [0x00,0x00,0x43,0xd5,0xff,0x00,0x32,0x02,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 8, value: g0@abs32@lo, kind: FK_Data_4
+v_mad_u32_u24 v0, g0@abs32@lo, v0, 12
+
+// GFX10: v_mad_u32_u24 v0, v0, g0@abs32@lo, 12   ; encoding: [0x00,0x00,0x43,0xd5,0x00,0xff,0x31,0x02,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 8, value: g0@abs32@lo, kind: FK_Data_4
+v_mad_u32_u24 v0, v0, g0@abs32@lo, 12
+
+// GFX10: v_mad_u32_u24 v0, v0, 12, g0@abs32@lo   ; encoding: [0x00,0x00,0x43,0xd5,0x00,0x19,0xfd,0x03,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 8, value: g0@abs32@lo, kind: FK_Data_4
+v_mad_u32_u24 v0, v0, 12, g0@abs32@lo
+
+; test vop2 operands
+
+// GFX10: v_add_nc_u32_e32 v0, g0@abs32@lo, v1    ; encoding: [0xff,0x02,0x00,0x4a,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 4, value: g0@abs32@lo, kind: FK_Data_4
+v_add_nc_u32 v0, g0@abs32@lo, v1
+
+// GFX10: v_add_nc_u32_e64 v0, v1, g0@abs32@lo    ; encoding: [0x00,0x00,0x25,0xd5,0x01,0xff,0x01,0x00,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 8, value: g0@abs32@lo, kind: FK_Data_4
+v_add_nc_u32 v0, v1, g0@abs32@lo
+
+// test vop1 operands
+// GFX10: v_not_b32_e32 v0, g0@abs32@lo           ; encoding: [0xff,0x6e,0x00,0x7e,A,A,A,A]
+// GFX10-NEXT:                                    ;   fixup A - offset: 4, value: g0@abs32@lo, kind: FK_Data_4
+v_not_b32 v0, g0@abs32@lo
+
diff --git a/llvm/test/MC/ARM/CheckDataSymbol.s b/llvm/test/MC/ARM/CheckDataSymbol.s
index 14ea92a943a1e..ec421f51395af 100644
--- a/llvm/test/MC/ARM/CheckDataSymbol.s
+++ b/llvm/test/MC/ARM/CheckDataSymbol.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -filetype=obj -assemble \
 # RUN: -triple=arm-arm-none-eabi -mcpu=cortex-a9 %s -o - \
 # RUN: | llvm-readobj -S --symbols - | FileCheck %s
-# CHECK:     Name: $d.1 ({{[1-9][0-9]+}})
+# CHECK:     Name: $d
 # CHECK-NEXT:     Value: 0x4
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Local (0x0)
diff --git a/llvm/test/MC/ARM/Windows/invalid-relocation.s b/llvm/test/MC/ARM/Windows/invalid-relocation.s
index db41570b062a2..821de8dd55604 100644
--- a/llvm/test/MC/ARM/Windows/invalid-relocation.s
+++ b/llvm/test/MC/ARM/Windows/invalid-relocation.s
@@ -1,4 +1,4 @@
-# RUN: not --crash llvm-mc -triple thumbv7-windows -incremental-linker-compatible -filetype obj -o /dev/null 2>&1 %s \
+# RUN: not llvm-mc -triple thumbv7-windows -incremental-linker-compatible -filetype obj -o /dev/null 2>&1 %s \
 # RUN:     | FileCheck %s
 
 	.def invalid_relocation
@@ -9,5 +9,4 @@
 	.thumb_func
 	adr r0, invalid_relocation+1
 
-# CHECK: LLVM ERROR: unsupported relocation type: fixup_t2_adr_pcrel_12
-
+# CHECK: 10:2: error: unsupported relocation type
diff --git a/llvm/test/MC/ARM/align-fill-byte-zero.s b/llvm/test/MC/ARM/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..cc1e27e80e920
--- /dev/null
+++ b/llvm/test/MC/ARM/align-fill-byte-zero.s
@@ -0,0 +1,40 @@
+// RUN: llvm-mc -triple armv7a %s -o - | FileCheck %s --check-prefix=ASM-ARM
+// RUN: llvm-mc -triple armv7a -filetype obj %s -o - | \
+// RUN:   llvm-objdump --triple=armv7a -dz - | FileCheck %s --check-prefix=OBJ-ARM
+
+// RUN: llvm-mc -triple thumbv7a %s -o - | FileCheck %s --check-prefix=ASM-THUMB
+// RUN: llvm-mc -triple thumbv7a -filetype obj %s -o - | \
+// RUN:   llvm-objdump --triple=thumbv7a -dz - | FileCheck %s --check-prefix=OBJ-THUMB
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM-ARM: add     r0, r0, #1
+// OBJ-ARM: e2800001      add     r0, r0, #1
+
+// ASM-THUMB: add.w   r0, r0, #1
+// OBJ-THUMB: f100 0001      add.w     r0, r0, #0x1
+  add r0, r0, 0x1
+
+// ASM-ARM: .p2align 4, 0x0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+
+// ASM-THUMB: .p2align 4, 0x0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+  .balign 0x10, 0
+
+// ASM-ARM: add     r0, r0, #1
+// OBJ-ARM-NEXT: e2800001      add     r0, r0, #1
+
+// ASM-THUMB: add.w   r0, r0, #1
+// OBJ-THUMB-NEXT: f100 0001      add.w     r0, r0, #0x1
+  add r0, r0, 0x1
diff --git a/llvm/test/MC/ARM/data-in-code.ll b/llvm/test/MC/ARM/data-in-code.ll
index 2e107f250e05d..b755c3bb5cad4 100644
--- a/llvm/test/MC/ARM/data-in-code.ll
+++ b/llvm/test/MC/ARM/data-in-code.ll
@@ -72,7 +72,7 @@ exit:
 ;; TMB-NEXT:     Section: [[MIXED_SECT:[^ ]+]]
 
 ;; TMB:        Symbol {
-;; TMB:          Name: $d.1
+;; TMB:          Name: $d
 ;; TMB-NEXT:     Value: 0x{{[0-9A-F]+}}
 ;; TMB-NEXT:     Size: 0
 ;; TMB-NEXT:     Binding: Local
diff --git a/llvm/test/MC/ARM/directive-arm-thumb-alignment.s b/llvm/test/MC/ARM/directive-arm-thumb-alignment.s
index b90c76d2b121c..0e798f67b48aa 100644
--- a/llvm/test/MC/ARM/directive-arm-thumb-alignment.s
+++ b/llvm/test/MC/ARM/directive-arm-thumb-alignment.s
@@ -10,12 +10,12 @@
 @ CHECK:      Num:    Value  Size Type    Bind   Vis      Ndx Name
 @ CHECK-NEXT:   0: 00000000     0 NOTYPE  LOCAL  DEFAULT  UND
 @ CHECK-NEXT:   1: 00000001     0 FUNC    LOCAL  DEFAULT    2 aligned_thumb
-@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $t.0
+@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $t
 @ CHECK-NEXT:   3: 00000004     0 FUNC    LOCAL  DEFAULT    2 thumb_to_arm
-@ CHECK-NEXT:   4: 00000004     0 NOTYPE  LOCAL  DEFAULT    2 $a.1
-@ CHECK-NEXT:   5: 00000008     0 NOTYPE  LOCAL  DEFAULT    2 $d.2
+@ CHECK-NEXT:   4: 00000004     0 NOTYPE  LOCAL  DEFAULT    2 $a
+@ CHECK-NEXT:   5: 00000008     0 NOTYPE  LOCAL  DEFAULT    2 $d
 @ CHECK-NEXT:   6: 0000000b     0 FUNC    LOCAL  DEFAULT    2 unaligned_arm_to_thumb
-@ CHECK-NEXT:   7: 0000000a     0 NOTYPE  LOCAL  DEFAULT    2 $t.3
+@ CHECK-NEXT:   7: 0000000a     0 NOTYPE  LOCAL  DEFAULT    2 $t
 
 .thumb
 
diff --git a/llvm/test/MC/ARM/multi-section-mapping.s b/llvm/test/MC/ARM/multi-section-mapping.s
index 6107f262b0b8c..ed531306042aa 100644
--- a/llvm/test/MC/ARM/multi-section-mapping.s
+++ b/llvm/test/MC/ARM/multi-section-mapping.s
@@ -1,4 +1,4 @@
-@ RUN: llvm-mc -triple=armv7-linux-gnueabi -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+@ RUN: llvm-mc -triple=armv7-linux-gnueabi -filetype=obj < %s | llvm-objdump -t - | FileCheck %s --match-full-lines
 
         .text
         add r0, r0, r0
@@ -42,10 +42,10 @@
 @   + .starts_thumb to have $t at 0
 @   + .starts_data to have $d at 0
 
-@ CHECK:      00000000 l .text 00000000 $a.0
-@ CHECK-NEXT: 00000000 l .wibble 00000000 $a.1
-@ CHECK-NEXT: 00000000 l .starts_thumb 00000000 $t.2
-@ CHECK-NEXT: 00000008 l .text 00000000 $t.3
-@ CHECK-NEXT: 0000000a l .text 00000000 $d.4
+@ CHECK:      00000000 l .text 00000000 $a
+@ CHECK-NEXT: 00000000 l .wibble 00000000 $a
+@ CHECK-NEXT: 00000000 l .starts_thumb 00000000 $t
+@ CHECK-NEXT: 00000008 l .text 00000000 $t
+@ CHECK-NEXT: 0000000a l .text 00000000 $d
 @ CHECK-NOT: ${{[adt]}}
 
diff --git a/llvm/test/MC/ARM/thumb-function-address.s b/llvm/test/MC/ARM/thumb-function-address.s
index 753a049137bbf..d69dcb6724019 100644
--- a/llvm/test/MC/ARM/thumb-function-address.s
+++ b/llvm/test/MC/ARM/thumb-function-address.s
@@ -35,8 +35,8 @@ label:
 @ CHECK-NEXT: 00000000 0 NOTYPE LOCAL DEFAULT     UND
 @ CHECK-NEXT: 00000001 0 FUNC   LOCAL DEFAULT 2   func_label
 @ CHECK-NEXT: 00000001 0 FUNC   LOCAL DEFAULT 2   foo_impl
-@ CHECK-NEXT: 00000000 0 NOTYPE LOCAL DEFAULT 2   $t.0
+@ CHECK-NEXT: 00000000 0 NOTYPE LOCAL DEFAULT 2   $t
 @ CHECK-NEXT: 00000003 0 FUNC   LOCAL DEFAULT 2   foo_resolver
 @ CHECK-NEXT: 00000003 0 IFUNC  LOCAL DEFAULT 2   foo
 @ CHECK-NEXT: 00000004 0 FUNC   LOCAL DEFAULT 2   label
-@ CHECK-NEXT: 00000008 0 NOTYPE LOCAL DEFAULT 2   $a.1
+@ CHECK-NEXT: 00000008 0 NOTYPE LOCAL DEFAULT 2   $a
diff --git a/llvm/test/MC/ARM/thumb-types.s b/llvm/test/MC/ARM/thumb-types.s
index cb1b47e1fa7fb..b965cd8accf05 100644
--- a/llvm/test/MC/ARM/thumb-types.s
+++ b/llvm/test/MC/ARM/thumb-types.s
@@ -3,22 +3,22 @@
 @ CHECK:      Num:    Value  Size Type    Bind   Vis      Ndx Name
 @ CHECK-NEXT:   0: 00000000     0 NOTYPE  LOCAL  DEFAULT  UND
 @ CHECK-NEXT:   1: 00000001     0 FUNC    LOCAL  DEFAULT    2 implicit_function
-@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $t.0
+@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $t
 @ CHECK-NEXT:   3: 00000002     0 OBJECT  LOCAL  DEFAULT    2 implicit_data
-@ CHECK-NEXT:   4: 00000002     0 NOTYPE  LOCAL  DEFAULT    2 $d.1
+@ CHECK-NEXT:   4: 00000002     0 NOTYPE  LOCAL  DEFAULT    2 $d
 @ CHECK-NEXT:   5: 00000008     0 FUNC    LOCAL  DEFAULT    2 arm_function
-@ CHECK-NEXT:   6: 00000008     0 NOTYPE  LOCAL  DEFAULT    2 $a.2
+@ CHECK-NEXT:   6: 00000008     0 NOTYPE  LOCAL  DEFAULT    2 $a
 @ CHECK-NEXT:   7: 0000000c     0 NOTYPE  LOCAL  DEFAULT    2 untyped_text_label
-@ CHECK-NEXT:   8: 0000000c     0 NOTYPE  LOCAL  DEFAULT    2 $t.3
+@ CHECK-NEXT:   8: 0000000c     0 NOTYPE  LOCAL  DEFAULT    2 $t
 @ CHECK-NEXT:   9: 0000000f     0 FUNC    LOCAL  DEFAULT    2 explicit_function
-@ CHECK-NEXT:  10: 00000010     0 NOTYPE  LOCAL  DEFAULT    2 $d.4
+@ CHECK-NEXT:  10: 00000010     0 NOTYPE  LOCAL  DEFAULT    2 $d
 @ CHECK-NEXT:  11: 00000000     4 TLS     LOCAL  DEFAULT    5 tls
 @ CHECK-NEXT:  12: 00000015     0 IFUNC   LOCAL  DEFAULT    2 indirect_function
-@ CHECK-NEXT:  13: 00000014     0 NOTYPE  LOCAL  DEFAULT    2 $t.5
+@ CHECK-NEXT:  13: 00000014     0 NOTYPE  LOCAL  DEFAULT    2 $t
 @ CHECK-NEXT:  14: 00000000     0 NOTYPE  LOCAL  DEFAULT    4 untyped_data_label
-@ CHECK-NEXT:  15: 00000000     0 NOTYPE  LOCAL  DEFAULT    4 $t.6
+@ CHECK-NEXT:  15: 00000000     0 NOTYPE  LOCAL  DEFAULT    4 $t
 @ CHECK-NEXT:  16: 00000002     0 OBJECT  LOCAL  DEFAULT    4 explicit_data
-@ CHECK-NEXT:  17: 00000002     0 NOTYPE  LOCAL  DEFAULT    4 $d.7
+@ CHECK-NEXT:  17: 00000002     0 NOTYPE  LOCAL  DEFAULT    4 $d
 
 
 	.syntax unified
diff --git a/llvm/test/MC/ARM/thumb_set.s b/llvm/test/MC/ARM/thumb_set.s
index 4bb7b599aaf11..836eb0b62e0fa 100644
--- a/llvm/test/MC/ARM/thumb_set.s
+++ b/llvm/test/MC/ARM/thumb_set.s
@@ -6,12 +6,12 @@
 @ CHECK:      Num:    Value  Size Type    Bind   Vis      Ndx Name
 @ CHECK-NEXT:   0: 00000000     0 NOTYPE  LOCAL  DEFAULT  UND
 @ CHECK-NEXT:   1: 00000000     0 FUNC    LOCAL  DEFAULT    2 arm_func
-@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $a.0
+@ CHECK-NEXT:   2: 00000000     0 NOTYPE  LOCAL  DEFAULT    2 $a
 @ CHECK-NEXT:   3: 00000001     0 FUNC    LOCAL  DEFAULT    2 alias_arm_func
 @ CHECK-NEXT:   4: 00000001     0 FUNC    LOCAL  DEFAULT    2 alias_arm_func2
 @ CHECK-NEXT:   5: 00000001     0 FUNC    LOCAL  DEFAULT    2 alias_arm_func3
 @ CHECK-NEXT:   6: 00000005     0 FUNC    LOCAL  DEFAULT    2 thumb_func
-@ CHECK-NEXT:   7: 00000004     0 NOTYPE  LOCAL  DEFAULT    2 $t.1
+@ CHECK-NEXT:   7: 00000004     0 NOTYPE  LOCAL  DEFAULT    2 $t
 @ CHECK-NEXT:   8: 00000005     0 FUNC    LOCAL  DEFAULT    2 alias_thumb_func
 @ CHECK-NEXT:   9: 5eed1e55     0 FUNC    LOCAL  DEFAULT  ABS seedless
 @ CHECK-NEXT:  10: e665a1ad     0 FUNC    LOCAL  DEFAULT  ABS eggsalad
diff --git a/llvm/test/MC/AsmParser/directive_abort.s b/llvm/test/MC/AsmParser/directive_abort.s
index 86e6267a7a1eb..f4dda229a017b 100644
--- a/llvm/test/MC/AsmParser/directive_abort.s
+++ b/llvm/test/MC/AsmParser/directive_abort.s
@@ -1,6 +1,9 @@
-# RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t
-# RUN: FileCheck -input-file %t %s
+// RUN: not llvm-mc -filetype=obj -triple x86_64 %s 2>&1 -o /dev/null | FileCheck %s
 
-# CHECK: error: .abort 'please stop assembing'
-TEST0:
-	.abort       please stop assembing
+.abort
+// CHECK:      [[#@LINE-1]]:1: error: .abort detected. Assembly stopping
+// CHECK-NEXT: abort
+
+.abort "abort message"
+// CHECK:      [[#@LINE-1]]:1: error: .abort '"abort message"' detected. Assembly stopping
+// CHECK-NEXT: abort
diff --git a/llvm/test/MC/AsmParser/directive_align.s b/llvm/test/MC/AsmParser/directive_align.s
index 0430f51740861..d71f559fd03da 100644
--- a/llvm/test/MC/AsmParser/directive_align.s
+++ b/llvm/test/MC/AsmParser/directive_align.s
@@ -1,6 +1,8 @@
 # RUN: not llvm-mc -triple i386-apple-darwin9 %s 2> %t.err | FileCheck %s
 # RUN: FileCheck < %t.err %s --check-prefix=CHECK-WARN
 
+        .data
+
 # CHECK: TEST0:
 # CHECK: .p2align 1
 TEST0:  
diff --git a/llvm/test/MC/COFF/align-nops.s b/llvm/test/MC/COFF/align-nops.s
index d6906d07148b6..7264d70938cea 100644
--- a/llvm/test/MC/COFF/align-nops.s
+++ b/llvm/test/MC/COFF/align-nops.s
@@ -4,7 +4,7 @@
     .text
 f0:
     .long 0
-    .align  8, 0x90
+    .align  8
     .long 0
     .align  8
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
index f6d2a19326e1d..d6e8b7ee2f01f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W64 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
 
@@ -19,6 +19,18 @@
 # W64: [[@LINE+1]]:1: warning: invalid instruction encoding
 0xff,0x04,0x02,0xc9,0x03,0x03,0x06,0x05,0x56,0x34,0x12,0xaf
 
+# GFX11: v_swap_b16 v5.h, s1/*Invalid register, operand has 'VGPR_16_Lo128' register class*/ ; encoding: [0x01,0xcc,0x0a,0x7f]
+# GFX12: v_swap_b16 v5.h, s1/*Invalid register, operand has 'VGPR_16_Lo128' register class*/ ; encoding: [0x01,0xcc,0x0a,0x7f]
+0x01,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x3c00/*Invalid immediate*/ ; encoding: [0x00,0xcc,0x0a,0x7f]
+# GFX12: v_swap_b16 v5.h, 0x3c00/*Invalid immediate*/ ; encoding: [0x00,0xcc,0x0a,0x7f]
+0xf2,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x78563412/*Invalid immediate*/ ; encoding: [0x12,0xcc,0x0a,0x7f]
+# GFX12: v_swap_b16 v5.h, 0x78563412/*Invalid immediate*/ ; encoding: [0x12,0xcc,0x0a,0x7f]
+0xff,0xcc,0x0a,0x7f,0x12,0x34,0x56,0x78
+
 # W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
 # W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
 0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
index 5c4f5829cdaf1..1b632b56400fb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: ds_add_f32 v0, v1                       ; encoding: [0x00,0x00,0x54,0xd8,0x00,0x01,0x00,0x00]
 0x00,0x00,0x54,0xd8,0x00,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
index 83f6a3da3ad7b..8b49de5d89909 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_atc_probe 7, s[4:5], 0x64             ; encoding: [0xc2,0x01,0x88,0xf4,0x64,0x00,0x00,0xf8]
 0xc2,0x01,0x88,0xf4,0x64,0x00,0x00,0xf8
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
index 929f3d2129be9..291f348a35cc6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_abs_i32 exec_hi, s1                   ; encoding: [0x01,0x15,0xff,0xbe]
 0x01,0x15,0xff,0xbe
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop2.txt
index e57648235df62..f45eba41d26d1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop2.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_absdiff_i32 exec_hi, s1, s2           ; encoding: [0x01,0x02,0x7f,0x83]
 0x01,0x02,0x7f,0x83
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopc.txt
index af2905f14dd01..b4c67e1e7c377 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopc.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_bitcmp0_b32 exec_hi, s1               ; encoding: [0x7f,0x01,0x0c,0xbf]
 0x7f,0x01,0x0c,0xbf
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopk.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopk.txt
index f1717bd92832e..dbf615033244c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopk.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopk.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_addk_i32 exec_hi, 0x1234              ; encoding: [0x34,0x12,0xff,0xb7]
 0x34,0x12,0xff,0xb7
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopp.txt
index 3b122c834b875..51604da454d16 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sopp.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: s_barrier                               ; encoding: [0x00,0x00,0xbd,0xbf]
 0x00,0x00,0xbd,0xbf
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index b176a57d70f86..851242dd5dece 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 # GFX11: v_bfrev_b32_e32 v5, v1                  ; encoding: [0x01,0x71,0x0a,0x7e]
 0x01,0x71,0x0a,0x7e
@@ -3344,6 +3344,12 @@
 # GFX11: v_sqrt_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf
 
+# GFX11-TRUE16: v_swap_b16 v5.l, v1.h            ; encoding: [0x81,0xcd,0x0a,0x7e]
+0x81,0xcd,0x0a,0x7e
+
+# GFX11-TRUE16: v_swap_b16 v5.h, v1.l            ; encoding: [0x01,0xcd,0x0a,0x7f]
+0x01,0xcd,0x0a,0x7f
+
 # GFX11: v_swap_b32 v5, v1                       ; encoding: [0x01,0xcb,0x0a,0x7e]
 0x01,0xcb,0x0a,0x7e
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
index 710890c222650..d697addd2686c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 # GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
index 94eed731c9f88..6b8de94465156 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 # GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
index 8cb47b0a596de..84ced2fd22499 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-FAKE16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64 %s
 
 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x40]
 # W64: v_add_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x40]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
index dd41cfa2a1045..eebf0cc13cee6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
 
 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff]
 # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
index 97f9b67b4eea3..5f1d4d4b33cbd 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
 
 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05]
 # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt
index 4575418801c5c..4dae6c2cf076c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopc.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
 
 # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
index 26168974198e8..cd97006838d3e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
 
 # GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
index 5f21c9b92f6f8..3587f13b6684f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopc.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
 
 # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
index 70ece76fcc9e1..14a4c1b295210 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
 
 # GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
index f4a8b99a45f46..a5978690d4ef1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s
 
 # W32: v_cmp_class_f16_e64 s10, v1, v2           ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
 # W64: v_cmp_class_f16_e64 s[10:11], v1, v2      ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
index 555a6aa6a0fbb..3f2d1164029eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
 
 # GFX11: v_cmpx_class_f16_e64 v1, v2             ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00]
 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt
index 838e6e0d814d5..6cfd8f00e60d3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: v_dot2_f32_bf16 v5, v1, v2, v3          ; encoding: [0x05,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c]
 0x05,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
index ceca6d9fc3faa..524623ea8be3d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
 0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
index 57c96170eadce..7f26c47b1e88c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
 # GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
 0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop2.txt
index 349aedcb78a86..7c79bbb14b5f5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop2.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
 
 # GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5]     ; encoding: [0x02,0x04,0x80,0xa9]
 0x02,0x04,0x80,0xa9
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
index 1d56c7b246e88..3e323ed69216c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
 
 # GFX12: s_addk_co_i32 exec_hi, 0x1234           ; encoding: [0x34,0x12,0xff,0xb7]
 0x34,0x12,0xff,0xb7
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt
deleted file mode 100644
index 6637a88c3eee5..0000000000000
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt
+++ /dev/null
@@ -1,3338 +0,0 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
-
-# GFX12: v_bfrev_b32_e32 v5, v1                  ; encoding: [0x01,0x71,0x0a,0x7e]
-0x01,0x71,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, v255                ; encoding: [0xff,0x71,0x0a,0x7e]
-0xff,0x71,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, s1                  ; encoding: [0x01,0x70,0x0a,0x7e]
-0x01,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, s105                ; encoding: [0x69,0x70,0x0a,0x7e]
-0x69,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, vcc_lo              ; encoding: [0x6a,0x70,0x0a,0x7e]
-0x6a,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, vcc_hi              ; encoding: [0x6b,0x70,0x0a,0x7e]
-0x6b,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, ttmp15              ; encoding: [0x7b,0x70,0x0a,0x7e]
-0x7b,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, m0                  ; encoding: [0x7d,0x70,0x0a,0x7e]
-0x7d,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, exec_lo             ; encoding: [0x7e,0x70,0x0a,0x7e]
-0x7e,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, exec_hi             ; encoding: [0x7f,0x70,0x0a,0x7e]
-0x7f,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, null                ; encoding: [0x7c,0x70,0x0a,0x7e]
-0x7c,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, -1                  ; encoding: [0xc1,0x70,0x0a,0x7e]
-0xc1,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, 0.5                 ; encoding: [0xf0,0x70,0x0a,0x7e]
-0xf0,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v5, src_scc             ; encoding: [0xfd,0x70,0x0a,0x7e]
-0xfd,0x70,0x0a,0x7e
-
-# GFX12: v_bfrev_b32_e32 v255, 0xaf123456        ; encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_ceil_f16_e32 v5, v1                   ; encoding: [0x01,0xb9,0x0a,0x7e]
-0x01,0xb9,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, v127                 ; encoding: [0x7f,0xb9,0x0a,0x7e]
-0x7f,0xb9,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, s1                   ; encoding: [0x01,0xb8,0x0a,0x7e]
-0x01,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, s105                 ; encoding: [0x69,0xb8,0x0a,0x7e]
-0x69,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xb8,0x0a,0x7e]
-0x6a,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xb8,0x0a,0x7e]
-0x6b,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xb8,0x0a,0x7e]
-0x7b,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, m0                   ; encoding: [0x7d,0xb8,0x0a,0x7e]
-0x7d,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xb8,0x0a,0x7e]
-0x7e,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xb8,0x0a,0x7e]
-0x7f,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, null                 ; encoding: [0x7c,0xb8,0x0a,0x7e]
-0x7c,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, -1                   ; encoding: [0xc1,0xb8,0x0a,0x7e]
-0xc1,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xb8,0x0a,0x7e]
-0xf0,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v5, src_scc              ; encoding: [0xfd,0xb8,0x0a,0x7e]
-0xfd,0xb8,0x0a,0x7e
-
-# GFX12: v_ceil_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_ceil_f32_e32 v5, v1                   ; encoding: [0x01,0x45,0x0a,0x7e]
-0x01,0x45,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, v255                 ; encoding: [0xff,0x45,0x0a,0x7e]
-0xff,0x45,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, s1                   ; encoding: [0x01,0x44,0x0a,0x7e]
-0x01,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, s105                 ; encoding: [0x69,0x44,0x0a,0x7e]
-0x69,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x44,0x0a,0x7e]
-0x6a,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x44,0x0a,0x7e]
-0x6b,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x44,0x0a,0x7e]
-0x7b,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, m0                   ; encoding: [0x7d,0x44,0x0a,0x7e]
-0x7d,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x44,0x0a,0x7e]
-0x7e,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x44,0x0a,0x7e]
-0x7f,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, null                 ; encoding: [0x7c,0x44,0x0a,0x7e]
-0x7c,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, -1                   ; encoding: [0xc1,0x44,0x0a,0x7e]
-0xc1,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x44,0x0a,0x7e]
-0xf0,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v5, src_scc              ; encoding: [0xfd,0x44,0x0a,0x7e]
-0xfd,0x44,0x0a,0x7e
-
-# GFX12: v_ceil_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_ceil_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x31,0x0a,0x7e]
-0x01,0x31,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x31,0x0a,0x7e]
-0xfe,0x31,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x30,0x0a,0x7e]
-0x02,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x30,0x0a,0x7e]
-0x68,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x30,0x0a,0x7e]
-0x6a,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x30,0x0a,0x7e]
-0x7a,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x30,0x0a,0x7e]
-0x7e,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], null             ; encoding: [0x7c,0x30,0x0a,0x7e]
-0x7c,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x30,0x0a,0x7e]
-0xc1,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x30,0x0a,0x7e]
-0xf0,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x30,0x0a,0x7e]
-0xfd,0x30,0x0a,0x7e
-
-# GFX12: v_ceil_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cls_i32_e32 v5, v1                    ; encoding: [0x01,0x77,0x0a,0x7e]
-0x01,0x77,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, v255                  ; encoding: [0xff,0x77,0x0a,0x7e]
-0xff,0x77,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, s1                    ; encoding: [0x01,0x76,0x0a,0x7e]
-0x01,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, s105                  ; encoding: [0x69,0x76,0x0a,0x7e]
-0x69,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, vcc_lo                ; encoding: [0x6a,0x76,0x0a,0x7e]
-0x6a,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, vcc_hi                ; encoding: [0x6b,0x76,0x0a,0x7e]
-0x6b,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, ttmp15                ; encoding: [0x7b,0x76,0x0a,0x7e]
-0x7b,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, m0                    ; encoding: [0x7d,0x76,0x0a,0x7e]
-0x7d,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, exec_lo               ; encoding: [0x7e,0x76,0x0a,0x7e]
-0x7e,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, exec_hi               ; encoding: [0x7f,0x76,0x0a,0x7e]
-0x7f,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, null                  ; encoding: [0x7c,0x76,0x0a,0x7e]
-0x7c,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, -1                    ; encoding: [0xc1,0x76,0x0a,0x7e]
-0xc1,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, 0.5                   ; encoding: [0xf0,0x76,0x0a,0x7e]
-0xf0,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v5, src_scc               ; encoding: [0xfd,0x76,0x0a,0x7e]
-0xfd,0x76,0x0a,0x7e
-
-# GFX12: v_cls_i32_e32 v255, 0xaf123456          ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_clz_i32_u32_e32 v5, v1                ; encoding: [0x01,0x73,0x0a,0x7e]
-0x01,0x73,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, v255              ; encoding: [0xff,0x73,0x0a,0x7e]
-0xff,0x73,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, s1                ; encoding: [0x01,0x72,0x0a,0x7e]
-0x01,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, s105              ; encoding: [0x69,0x72,0x0a,0x7e]
-0x69,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x72,0x0a,0x7e]
-0x6a,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x72,0x0a,0x7e]
-0x6b,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x72,0x0a,0x7e]
-0x7b,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, m0                ; encoding: [0x7d,0x72,0x0a,0x7e]
-0x7d,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x72,0x0a,0x7e]
-0x7e,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x72,0x0a,0x7e]
-0x7f,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, null              ; encoding: [0x7c,0x72,0x0a,0x7e]
-0x7c,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, -1                ; encoding: [0xc1,0x72,0x0a,0x7e]
-0xc1,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x72,0x0a,0x7e]
-0xf0,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x72,0x0a,0x7e]
-0xfd,0x72,0x0a,0x7e
-
-# GFX12: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
-0x01,0xc3,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
-0x7f,0xc3,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
-0x01,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
-0x69,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
-0x6a,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
-0x6b,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
-0x7b,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
-0x7d,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
-0x7e,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
-0x7f,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
-0x7c,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
-0xc1,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
-0xf0,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
-0xfd,0xc2,0x0a,0x7e
-
-# GFX12: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
-0x01,0x6d,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, v255                  ; encoding: [0xff,0x6d,0x0a,0x7e]
-0xff,0x6d,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, s1                    ; encoding: [0x01,0x6c,0x0a,0x7e]
-0x01,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, s105                  ; encoding: [0x69,0x6c,0x0a,0x7e]
-0x69,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6c,0x0a,0x7e]
-0x6a,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6c,0x0a,0x7e]
-0x6b,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6c,0x0a,0x7e]
-0x7b,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, m0                    ; encoding: [0x7d,0x6c,0x0a,0x7e]
-0x7d,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6c,0x0a,0x7e]
-0x7e,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6c,0x0a,0x7e]
-0x7f,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, null                  ; encoding: [0x7c,0x6c,0x0a,0x7e]
-0x7c,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, -1                    ; encoding: [0xc1,0x6c,0x0a,0x7e]
-0xc1,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6c,0x0a,0x7e]
-0xf0,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6c,0x0a,0x7e]
-0xfd,0x6c,0x0a,0x7e
-
-# GFX12: v_cos_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_ctz_i32_b32_e32 v5, v1                ; encoding: [0x01,0x75,0x0a,0x7e]
-0x01,0x75,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, v255              ; encoding: [0xff,0x75,0x0a,0x7e]
-0xff,0x75,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, s1                ; encoding: [0x01,0x74,0x0a,0x7e]
-0x01,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, s105              ; encoding: [0x69,0x74,0x0a,0x7e]
-0x69,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x74,0x0a,0x7e]
-0x6a,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x74,0x0a,0x7e]
-0x6b,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x74,0x0a,0x7e]
-0x7b,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, m0                ; encoding: [0x7d,0x74,0x0a,0x7e]
-0x7d,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x74,0x0a,0x7e]
-0x7e,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x74,0x0a,0x7e]
-0x7f,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, null              ; encoding: [0x7c,0x74,0x0a,0x7e]
-0x7c,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, -1                ; encoding: [0xc1,0x74,0x0a,0x7e]
-0xc1,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, 0.5               ; encoding: [0xf0,0x74,0x0a,0x7e]
-0xf0,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v5, src_scc           ; encoding: [0xfd,0x74,0x0a,0x7e]
-0xfd,0x74,0x0a,0x7e
-
-# GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_bf8_e32 v1, s3                ; encoding: [0x03,0xda,0x02,0x7e]
-0x03,0xda,0x02,0x7e
-
-# GFX12: v_cvt_f32_bf8_e32 v1, 3                 ; encoding: [0x83,0xda,0x02,0x7e]
-0x83,0xda,0x02,0x7e
-
-# GFX12: v_cvt_f32_bf8_e32 v1, v3                ; encoding: [0x03,0xdb,0x02,0x7e]
-0x03,0xdb,0x02,0x7e
-
-# GFX12: v_cvt_f32_fp8_e32 v1, s3                ; encoding: [0x03,0xd8,0x02,0x7e]
-0x03,0xd8,0x02,0x7e
-
-# GFX12: v_cvt_f32_fp8_e32 v1, 3                 ; encoding: [0x83,0xd8,0x02,0x7e]
-0x83,0xd8,0x02,0x7e
-
-# GFX12: v_cvt_f32_fp8_e32 v1, v3                ; encoding: [0x03,0xd9,0x02,0x7e]
-0x03,0xd9,0x02,0x7e
-
-# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3         ; encoding: [0x03,0xde,0x04,0x7e]
-0x03,0xde,0x04,0x7e
-
-# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3          ; encoding: [0x83,0xde,0x04,0x7e]
-0x83,0xde,0x04,0x7e
-
-# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3         ; encoding: [0x03,0xdf,0x04,0x7e]
-0x03,0xdf,0x04,0x7e
-
-# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3         ; encoding: [0x03,0xdc,0x04,0x7e]
-0x03,0xdc,0x04,0x7e
-
-# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3          ; encoding: [0x83,0xdc,0x04,0x7e]
-0x83,0xdc,0x04,0x7e
-
-# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3         ; encoding: [0x03,0xdd,0x04,0x7e]
-0x03,0xdd,0x04,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, v1                ; encoding: [0x01,0x15,0x0a,0x7e]
-0x01,0x15,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, v255              ; encoding: [0xff,0x15,0x0a,0x7e]
-0xff,0x15,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, s1                ; encoding: [0x01,0x14,0x0a,0x7e]
-0x01,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, s105              ; encoding: [0x69,0x14,0x0a,0x7e]
-0x69,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x14,0x0a,0x7e]
-0x6a,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x14,0x0a,0x7e]
-0x6b,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x14,0x0a,0x7e]
-0x7b,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, m0                ; encoding: [0x7d,0x14,0x0a,0x7e]
-0x7d,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x14,0x0a,0x7e]
-0x7e,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x14,0x0a,0x7e]
-0x7f,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, null              ; encoding: [0x7c,0x14,0x0a,0x7e]
-0x7c,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, -1                ; encoding: [0xc1,0x14,0x0a,0x7e]
-0xc1,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, 0.5               ; encoding: [0xf0,0x14,0x0a,0x7e]
-0xf0,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v5, src_scc           ; encoding: [0xfd,0x14,0x0a,0x7e]
-0xfd,0x14,0x0a,0x7e
-
-# GFX12: v_cvt_f16_f32_e32 v127, 0xaf123456      ; encoding: [0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf]
-0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f16_i16_e32 v5, v1                ; encoding: [0x01,0xa3,0x0a,0x7e]
-0x01,0xa3,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, v127              ; encoding: [0x7f,0xa3,0x0a,0x7e]
-0x7f,0xa3,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, s1                ; encoding: [0x01,0xa2,0x0a,0x7e]
-0x01,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, s105              ; encoding: [0x69,0xa2,0x0a,0x7e]
-0x69,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa2,0x0a,0x7e]
-0x6a,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa2,0x0a,0x7e]
-0x6b,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xa2,0x0a,0x7e]
-0x7b,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, m0                ; encoding: [0x7d,0xa2,0x0a,0x7e]
-0x7d,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xa2,0x0a,0x7e]
-0x7e,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xa2,0x0a,0x7e]
-0x7f,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, null              ; encoding: [0x7c,0xa2,0x0a,0x7e]
-0x7c,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, -1                ; encoding: [0xc1,0xa2,0x0a,0x7e]
-0xc1,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, 0x3800
-0xf0,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v5, src_scc           ; encoding: [0xfd,0xa2,0x0a,0x7e]
-0xfd,0xa2,0x0a,0x7e
-
-# GFX12: v_cvt_f16_i16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_f16_u16_e32 v5, v1                ; encoding: [0x01,0xa1,0x0a,0x7e]
-0x01,0xa1,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, v127              ; encoding: [0x7f,0xa1,0x0a,0x7e]
-0x7f,0xa1,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, s1                ; encoding: [0x01,0xa0,0x0a,0x7e]
-0x01,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, s105              ; encoding: [0x69,0xa0,0x0a,0x7e]
-0x69,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa0,0x0a,0x7e]
-0x6a,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa0,0x0a,0x7e]
-0x6b,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xa0,0x0a,0x7e]
-0x7b,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, m0                ; encoding: [0x7d,0xa0,0x0a,0x7e]
-0x7d,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xa0,0x0a,0x7e]
-0x7e,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xa0,0x0a,0x7e]
-0x7f,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, null              ; encoding: [0x7c,0xa0,0x0a,0x7e]
-0x7c,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, -1                ; encoding: [0xc1,0xa0,0x0a,0x7e]
-0xc1,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, 0x3800
-0xf0,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v5, src_scc           ; encoding: [0xfd,0xa0,0x0a,0x7e]
-0xfd,0xa0,0x0a,0x7e
-
-# GFX12: v_cvt_f16_u16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_f32_f16_e32 v5, v1                ; encoding: [0x01,0x17,0x0a,0x7e]
-0x01,0x17,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, v127              ; encoding: [0x7f,0x17,0x0a,0x7e]
-0x7f,0x17,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, s1                ; encoding: [0x01,0x16,0x0a,0x7e]
-0x01,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, s105              ; encoding: [0x69,0x16,0x0a,0x7e]
-0x69,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0x16,0x0a,0x7e]
-0x6a,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0x16,0x0a,0x7e]
-0x6b,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, ttmp15            ; encoding: [0x7b,0x16,0x0a,0x7e]
-0x7b,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, m0                ; encoding: [0x7d,0x16,0x0a,0x7e]
-0x7d,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, exec_lo           ; encoding: [0x7e,0x16,0x0a,0x7e]
-0x7e,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, exec_hi           ; encoding: [0x7f,0x16,0x0a,0x7e]
-0x7f,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, null              ; encoding: [0x7c,0x16,0x0a,0x7e]
-0x7c,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, -1                ; encoding: [0xc1,0x16,0x0a,0x7e]
-0xc1,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, 0.5               ; encoding: [0xf0,0x16,0x0a,0x7e]
-0xf0,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v5, src_scc           ; encoding: [0xfd,0x16,0x0a,0x7e]
-0xfd,0x16,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f16_e32 v255, 0xfe0b          ; encoding: [0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
-0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_f32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x1f,0x0a,0x7e]
-0x01,0x1f,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x1f,0x0a,0x7e]
-0xfe,0x1f,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x1e,0x0a,0x7e]
-0x02,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x1e,0x0a,0x7e]
-0x68,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, vcc               ; encoding: [0x6a,0x1e,0x0a,0x7e]
-0x6a,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x1e,0x0a,0x7e]
-0x7a,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, exec              ; encoding: [0x7e,0x1e,0x0a,0x7e]
-0x7e,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, null              ; encoding: [0x7c,0x1e,0x0a,0x7e]
-0x7c,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, -1                ; encoding: [0xc1,0x1e,0x0a,0x7e]
-0xc1,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x1e,0x0a,0x7e]
-0xf0,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x1e,0x0a,0x7e]
-0xfd,0x1e,0x0a,0x7e
-
-# GFX12: v_cvt_f32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_i32_e32 v5, v1                ; encoding: [0x01,0x0b,0x0a,0x7e]
-0x01,0x0b,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, v255              ; encoding: [0xff,0x0b,0x0a,0x7e]
-0xff,0x0b,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, s1                ; encoding: [0x01,0x0a,0x0a,0x7e]
-0x01,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, s105              ; encoding: [0x69,0x0a,0x0a,0x7e]
-0x69,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0a,0x0a,0x7e]
-0x6a,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0a,0x0a,0x7e]
-0x6b,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, ttmp15            ; encoding: [0x7b,0x0a,0x0a,0x7e]
-0x7b,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, m0                ; encoding: [0x7d,0x0a,0x0a,0x7e]
-0x7d,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, exec_lo           ; encoding: [0x7e,0x0a,0x0a,0x7e]
-0x7e,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, exec_hi           ; encoding: [0x7f,0x0a,0x0a,0x7e]
-0x7f,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, null              ; encoding: [0x7c,0x0a,0x0a,0x7e]
-0x7c,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, -1                ; encoding: [0xc1,0x0a,0x0a,0x7e]
-0xc1,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, 0.5               ; encoding: [0xf0,0x0a,0x0a,0x7e]
-0xf0,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v5, src_scc           ; encoding: [0xfd,0x0a,0x0a,0x7e]
-0xfd,0x0a,0x0a,0x7e
-
-# GFX12: v_cvt_f32_i32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_u32_e32 v5, v1                ; encoding: [0x01,0x0d,0x0a,0x7e]
-0x01,0x0d,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, v255              ; encoding: [0xff,0x0d,0x0a,0x7e]
-0xff,0x0d,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, s1                ; encoding: [0x01,0x0c,0x0a,0x7e]
-0x01,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, s105              ; encoding: [0x69,0x0c,0x0a,0x7e]
-0x69,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0c,0x0a,0x7e]
-0x6a,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0c,0x0a,0x7e]
-0x6b,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, ttmp15            ; encoding: [0x7b,0x0c,0x0a,0x7e]
-0x7b,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, m0                ; encoding: [0x7d,0x0c,0x0a,0x7e]
-0x7d,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, exec_lo           ; encoding: [0x7e,0x0c,0x0a,0x7e]
-0x7e,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, exec_hi           ; encoding: [0x7f,0x0c,0x0a,0x7e]
-0x7f,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, null              ; encoding: [0x7c,0x0c,0x0a,0x7e]
-0x7c,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, -1                ; encoding: [0xc1,0x0c,0x0a,0x7e]
-0xc1,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, 0.5               ; encoding: [0xf0,0x0c,0x0a,0x7e]
-0xf0,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v5, src_scc           ; encoding: [0xfd,0x0c,0x0a,0x7e]
-0xfd,0x0c,0x0a,0x7e
-
-# GFX12: v_cvt_f32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, v1             ; encoding: [0x01,0x23,0x0a,0x7e]
-0x01,0x23,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, v255           ; encoding: [0xff,0x23,0x0a,0x7e]
-0xff,0x23,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, s1             ; encoding: [0x01,0x22,0x0a,0x7e]
-0x01,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, s105           ; encoding: [0x69,0x22,0x0a,0x7e]
-0x69,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_lo         ; encoding: [0x6a,0x22,0x0a,0x7e]
-0x6a,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_hi         ; encoding: [0x6b,0x22,0x0a,0x7e]
-0x6b,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, ttmp15         ; encoding: [0x7b,0x22,0x0a,0x7e]
-0x7b,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, m0             ; encoding: [0x7d,0x22,0x0a,0x7e]
-0x7d,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, exec_lo        ; encoding: [0x7e,0x22,0x0a,0x7e]
-0x7e,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, exec_hi        ; encoding: [0x7f,0x22,0x0a,0x7e]
-0x7f,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, null           ; encoding: [0x7c,0x22,0x0a,0x7e]
-0x7c,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, -1             ; encoding: [0xc1,0x22,0x0a,0x7e]
-0xc1,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, 0.5            ; encoding: [0xf0,0x22,0x0a,0x7e]
-0xf0,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v5, src_scc        ; encoding: [0xfd,0x22,0x0a,0x7e]
-0xfd,0x22,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte0_e32 v255, 0xaf123456   ; encoding: [0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, v1             ; encoding: [0x01,0x25,0x0a,0x7e]
-0x01,0x25,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, v255           ; encoding: [0xff,0x25,0x0a,0x7e]
-0xff,0x25,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, s1             ; encoding: [0x01,0x24,0x0a,0x7e]
-0x01,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, s105           ; encoding: [0x69,0x24,0x0a,0x7e]
-0x69,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_lo         ; encoding: [0x6a,0x24,0x0a,0x7e]
-0x6a,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_hi         ; encoding: [0x6b,0x24,0x0a,0x7e]
-0x6b,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, ttmp15         ; encoding: [0x7b,0x24,0x0a,0x7e]
-0x7b,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, m0             ; encoding: [0x7d,0x24,0x0a,0x7e]
-0x7d,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, exec_lo        ; encoding: [0x7e,0x24,0x0a,0x7e]
-0x7e,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, exec_hi        ; encoding: [0x7f,0x24,0x0a,0x7e]
-0x7f,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, null           ; encoding: [0x7c,0x24,0x0a,0x7e]
-0x7c,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, -1             ; encoding: [0xc1,0x24,0x0a,0x7e]
-0xc1,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, 0.5            ; encoding: [0xf0,0x24,0x0a,0x7e]
-0xf0,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v5, src_scc        ; encoding: [0xfd,0x24,0x0a,0x7e]
-0xfd,0x24,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte1_e32 v255, 0xaf123456   ; encoding: [0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, v1             ; encoding: [0x01,0x27,0x0a,0x7e]
-0x01,0x27,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, v255           ; encoding: [0xff,0x27,0x0a,0x7e]
-0xff,0x27,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, s1             ; encoding: [0x01,0x26,0x0a,0x7e]
-0x01,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, s105           ; encoding: [0x69,0x26,0x0a,0x7e]
-0x69,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_lo         ; encoding: [0x6a,0x26,0x0a,0x7e]
-0x6a,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_hi         ; encoding: [0x6b,0x26,0x0a,0x7e]
-0x6b,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, ttmp15         ; encoding: [0x7b,0x26,0x0a,0x7e]
-0x7b,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, m0             ; encoding: [0x7d,0x26,0x0a,0x7e]
-0x7d,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, exec_lo        ; encoding: [0x7e,0x26,0x0a,0x7e]
-0x7e,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, exec_hi        ; encoding: [0x7f,0x26,0x0a,0x7e]
-0x7f,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, null           ; encoding: [0x7c,0x26,0x0a,0x7e]
-0x7c,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, -1             ; encoding: [0xc1,0x26,0x0a,0x7e]
-0xc1,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, 0.5            ; encoding: [0xf0,0x26,0x0a,0x7e]
-0xf0,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v5, src_scc        ; encoding: [0xfd,0x26,0x0a,0x7e]
-0xfd,0x26,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte2_e32 v255, 0xaf123456   ; encoding: [0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, v1             ; encoding: [0x01,0x29,0x0a,0x7e]
-0x01,0x29,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, v255           ; encoding: [0xff,0x29,0x0a,0x7e]
-0xff,0x29,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, s1             ; encoding: [0x01,0x28,0x0a,0x7e]
-0x01,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, s105           ; encoding: [0x69,0x28,0x0a,0x7e]
-0x69,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_lo         ; encoding: [0x6a,0x28,0x0a,0x7e]
-0x6a,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_hi         ; encoding: [0x6b,0x28,0x0a,0x7e]
-0x6b,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, ttmp15         ; encoding: [0x7b,0x28,0x0a,0x7e]
-0x7b,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, m0             ; encoding: [0x7d,0x28,0x0a,0x7e]
-0x7d,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, exec_lo        ; encoding: [0x7e,0x28,0x0a,0x7e]
-0x7e,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, exec_hi        ; encoding: [0x7f,0x28,0x0a,0x7e]
-0x7f,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, null           ; encoding: [0x7c,0x28,0x0a,0x7e]
-0x7c,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, -1             ; encoding: [0xc1,0x28,0x0a,0x7e]
-0xc1,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, 0.5            ; encoding: [0xf0,0x28,0x0a,0x7e]
-0xf0,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v5, src_scc        ; encoding: [0xfd,0x28,0x0a,0x7e]
-0xfd,0x28,0x0a,0x7e
-
-# GFX12: v_cvt_f32_ubyte3_e32 v255, 0xaf123456   ; encoding: [0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], v1            ; encoding: [0x01,0x21,0x0a,0x7e]
-0x01,0x21,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], v255          ; encoding: [0xff,0x21,0x0a,0x7e]
-0xff,0x21,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], s1            ; encoding: [0x01,0x20,0x0a,0x7e]
-0x01,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], s105          ; encoding: [0x69,0x20,0x0a,0x7e]
-0x69,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x20,0x0a,0x7e]
-0x6a,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x20,0x0a,0x7e]
-0x6b,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x20,0x0a,0x7e]
-0x7b,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], m0            ; encoding: [0x7d,0x20,0x0a,0x7e]
-0x7d,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x20,0x0a,0x7e]
-0x7e,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x20,0x0a,0x7e]
-0x7f,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], null          ; encoding: [0x7c,0x20,0x0a,0x7e]
-0x7c,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], -1            ; encoding: [0xc1,0x20,0x0a,0x7e]
-0xc1,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x20,0x0a,0x7e]
-0xf0,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x20,0x0a,0x7e]
-0xfd,0x20,0x0a,0x7e
-
-# GFX12: v_cvt_f64_f32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], v1            ; encoding: [0x01,0x09,0x0a,0x7e]
-0x01,0x09,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], v255          ; encoding: [0xff,0x09,0x0a,0x7e]
-0xff,0x09,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], s1            ; encoding: [0x01,0x08,0x0a,0x7e]
-0x01,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], s105          ; encoding: [0x69,0x08,0x0a,0x7e]
-0x69,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x08,0x0a,0x7e]
-0x6a,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x08,0x0a,0x7e]
-0x6b,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x08,0x0a,0x7e]
-0x7b,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], m0            ; encoding: [0x7d,0x08,0x0a,0x7e]
-0x7d,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x08,0x0a,0x7e]
-0x7e,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x08,0x0a,0x7e]
-0x7f,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], null          ; encoding: [0x7c,0x08,0x0a,0x7e]
-0x7c,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], -1            ; encoding: [0xc1,0x08,0x0a,0x7e]
-0xc1,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x08,0x0a,0x7e]
-0xf0,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x08,0x0a,0x7e]
-0xfd,0x08,0x0a,0x7e
-
-# GFX12: v_cvt_f64_i32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], v1            ; encoding: [0x01,0x2d,0x0a,0x7e]
-0x01,0x2d,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], v255          ; encoding: [0xff,0x2d,0x0a,0x7e]
-0xff,0x2d,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], s1            ; encoding: [0x01,0x2c,0x0a,0x7e]
-0x01,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], s105          ; encoding: [0x69,0x2c,0x0a,0x7e]
-0x69,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_lo        ; encoding: [0x6a,0x2c,0x0a,0x7e]
-0x6a,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_hi        ; encoding: [0x6b,0x2c,0x0a,0x7e]
-0x6b,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], ttmp15        ; encoding: [0x7b,0x2c,0x0a,0x7e]
-0x7b,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], m0            ; encoding: [0x7d,0x2c,0x0a,0x7e]
-0x7d,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], exec_lo       ; encoding: [0x7e,0x2c,0x0a,0x7e]
-0x7e,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], exec_hi       ; encoding: [0x7f,0x2c,0x0a,0x7e]
-0x7f,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], null          ; encoding: [0x7c,0x2c,0x0a,0x7e]
-0x7c,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], -1            ; encoding: [0xc1,0x2c,0x0a,0x7e]
-0xc1,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], 0.5           ; encoding: [0xf0,0x2c,0x0a,0x7e]
-0xf0,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[5:6], src_scc       ; encoding: [0xfd,0x2c,0x0a,0x7e]
-0xfd,0x2c,0x0a,0x7e
-
-# GFX12: v_cvt_f64_u32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, v1          ; encoding: [0x01,0x1b,0x0a,0x7e]
-0x01,0x1b,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, v255        ; encoding: [0xff,0x1b,0x0a,0x7e]
-0xff,0x1b,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, s1          ; encoding: [0x01,0x1a,0x0a,0x7e]
-0x01,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, s105        ; encoding: [0x69,0x1a,0x0a,0x7e]
-0x69,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x1a,0x0a,0x7e]
-0x6a,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x1a,0x0a,0x7e]
-0x6b,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x1a,0x0a,0x7e]
-0x7b,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x1a,0x0a,0x7e]
-0x7d,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x1a,0x0a,0x7e]
-0x7e,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x1a,0x0a,0x7e]
-0x7f,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, null        ; encoding: [0x7c,0x1a,0x0a,0x7e]
-0x7c,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x1a,0x0a,0x7e]
-0xc1,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x1a,0x0a,0x7e]
-0xf0,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x1a,0x0a,0x7e]
-0xfd,0x1a,0x0a,0x7e
-
-# GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_i16_f16_e32 v5, v1                ; encoding: [0x01,0xa7,0x0a,0x7e]
-0x01,0xa7,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, v127              ; encoding: [0x7f,0xa7,0x0a,0x7e]
-0x7f,0xa7,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, s1                ; encoding: [0x01,0xa6,0x0a,0x7e]
-0x01,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, s105              ; encoding: [0x69,0xa6,0x0a,0x7e]
-0x69,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa6,0x0a,0x7e]
-0x6a,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa6,0x0a,0x7e]
-0x6b,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa6,0x0a,0x7e]
-0x7b,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, m0                ; encoding: [0x7d,0xa6,0x0a,0x7e]
-0x7d,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa6,0x0a,0x7e]
-0x7e,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa6,0x0a,0x7e]
-0x7f,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, null              ; encoding: [0x7c,0xa6,0x0a,0x7e]
-0x7c,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, -1                ; encoding: [0xc1,0xa6,0x0a,0x7e]
-0xc1,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa6,0x0a,0x7e]
-0xf0,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa6,0x0a,0x7e]
-0xfd,0xa6,0x0a,0x7e
-
-# GFX12: v_cvt_i16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_i32_f32_e32 v5, v1                ; encoding: [0x01,0x11,0x0a,0x7e]
-0x01,0x11,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, v255              ; encoding: [0xff,0x11,0x0a,0x7e]
-0xff,0x11,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, s1                ; encoding: [0x01,0x10,0x0a,0x7e]
-0x01,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, s105              ; encoding: [0x69,0x10,0x0a,0x7e]
-0x69,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x10,0x0a,0x7e]
-0x6a,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x10,0x0a,0x7e]
-0x6b,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x10,0x0a,0x7e]
-0x7b,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, m0                ; encoding: [0x7d,0x10,0x0a,0x7e]
-0x7d,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x10,0x0a,0x7e]
-0x7e,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x10,0x0a,0x7e]
-0x7f,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, null              ; encoding: [0x7c,0x10,0x0a,0x7e]
-0x7c,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, -1                ; encoding: [0xc1,0x10,0x0a,0x7e]
-0xc1,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x10,0x0a,0x7e]
-0xf0,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x10,0x0a,0x7e]
-0xfd,0x10,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_i32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x07,0x0a,0x7e]
-0x01,0x07,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x07,0x0a,0x7e]
-0xfe,0x07,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x06,0x0a,0x7e]
-0x02,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x06,0x0a,0x7e]
-0x68,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, vcc               ; encoding: [0x6a,0x06,0x0a,0x7e]
-0x6a,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x06,0x0a,0x7e]
-0x7a,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, exec              ; encoding: [0x7e,0x06,0x0a,0x7e]
-0x7e,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, null              ; encoding: [0x7c,0x06,0x0a,0x7e]
-0x7c,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, -1                ; encoding: [0xc1,0x06,0x0a,0x7e]
-0xc1,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x06,0x0a,0x7e]
-0xf0,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x06,0x0a,0x7e]
-0xfd,0x06,0x0a,0x7e
-
-# GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
-0x01,0xd5,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
-0x7f,0xd5,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
-0x01,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, s105              ; encoding: [0x69,0xd4,0x0a,0x7e]
-0x69,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd4,0x0a,0x7e]
-0x6a,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd4,0x0a,0x7e]
-0x6b,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, ttmp15            ; encoding: [0x7b,0xd4,0x0a,0x7e]
-0x7b,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, m0                ; encoding: [0x7d,0xd4,0x0a,0x7e]
-0x7d,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, exec_lo           ; encoding: [0x7e,0xd4,0x0a,0x7e]
-0x7e,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, exec_hi           ; encoding: [0x7f,0xd4,0x0a,0x7e]
-0x7f,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, null              ; encoding: [0x7c,0xd4,0x0a,0x7e]
-0x7c,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, -1                ; encoding: [0xc1,0xd4,0x0a,0x7e]
-0xc1,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, 0x3800
-0xf0,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v5, src_scc           ; encoding: [0xfd,0xd4,0x0a,0x7e]
-0xfd,0xd4,0x0a,0x7e
-
-# GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
-0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
-0x01,0x19,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, v255      ; encoding: [0xff,0x19,0x0a,0x7e]
-0xff,0x19,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, s1        ; encoding: [0x01,0x18,0x0a,0x7e]
-0x01,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, s105      ; encoding: [0x69,0x18,0x0a,0x7e]
-0x69,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo    ; encoding: [0x6a,0x18,0x0a,0x7e]
-0x6a,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi    ; encoding: [0x6b,0x18,0x0a,0x7e]
-0x6b,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15    ; encoding: [0x7b,0x18,0x0a,0x7e]
-0x7b,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, m0        ; encoding: [0x7d,0x18,0x0a,0x7e]
-0x7d,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo   ; encoding: [0x7e,0x18,0x0a,0x7e]
-0x7e,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi   ; encoding: [0x7f,0x18,0x0a,0x7e]
-0x7f,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, null      ; encoding: [0x7c,0x18,0x0a,0x7e]
-0x7c,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, -1        ; encoding: [0xc1,0x18,0x0a,0x7e]
-0xc1,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5       ; encoding: [0xf0,0x18,0x0a,0x7e]
-0xf0,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc   ; encoding: [0xfd,0x18,0x0a,0x7e]
-0xfd,0x18,0x0a,0x7e
-
-# GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, v1           ; encoding: [0x01,0xc7,0x0a,0x7e]
-0x01,0xc7,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, v127         ; encoding: [0x7f,0xc7,0x0a,0x7e]
-0x7f,0xc7,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, s1           ; encoding: [0x01,0xc6,0x0a,0x7e]
-0x01,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, s105         ; encoding: [0x69,0xc6,0x0a,0x7e]
-0x69,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc6,0x0a,0x7e]
-0x6a,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc6,0x0a,0x7e]
-0x6b,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc6,0x0a,0x7e]
-0x7b,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, m0           ; encoding: [0x7d,0xc6,0x0a,0x7e]
-0x7d,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc6,0x0a,0x7e]
-0x7e,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc6,0x0a,0x7e]
-0x7f,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, null         ; encoding: [0x7c,0xc6,0x0a,0x7e]
-0x7c,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, -1           ; encoding: [0xc1,0xc6,0x0a,0x7e]
-0xc1,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc6,0x0a,0x7e]
-0xf0,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc6,0x0a,0x7e]
-0xfd,0xc6,0x0a,0x7e
-
-# GFX12: v_cvt_norm_i16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, v1           ; encoding: [0x01,0xc9,0x0a,0x7e]
-0x01,0xc9,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, v127         ; encoding: [0x7f,0xc9,0x0a,0x7e]
-0x7f,0xc9,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, s1           ; encoding: [0x01,0xc8,0x0a,0x7e]
-0x01,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, s105         ; encoding: [0x69,0xc8,0x0a,0x7e]
-0x69,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_lo       ; encoding: [0x6a,0xc8,0x0a,0x7e]
-0x6a,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_hi       ; encoding: [0x6b,0xc8,0x0a,0x7e]
-0x6b,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, ttmp15       ; encoding: [0x7b,0xc8,0x0a,0x7e]
-0x7b,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, m0           ; encoding: [0x7d,0xc8,0x0a,0x7e]
-0x7d,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, exec_lo      ; encoding: [0x7e,0xc8,0x0a,0x7e]
-0x7e,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, exec_hi      ; encoding: [0x7f,0xc8,0x0a,0x7e]
-0x7f,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, null         ; encoding: [0x7c,0xc8,0x0a,0x7e]
-0x7c,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, -1           ; encoding: [0xc1,0xc8,0x0a,0x7e]
-0xc1,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, 0.5          ; encoding: [0xf0,0xc8,0x0a,0x7e]
-0xf0,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v5, src_scc      ; encoding: [0xfd,0xc8,0x0a,0x7e]
-0xfd,0xc8,0x0a,0x7e
-
-# GFX12: v_cvt_norm_u16_f16_e32 v127, 0xfe0b     ; encoding: [0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, v1             ; encoding: [0x01,0x1d,0x0a,0x7e]
-0x01,0x1d,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, v255           ; encoding: [0xff,0x1d,0x0a,0x7e]
-0xff,0x1d,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, s1             ; encoding: [0x01,0x1c,0x0a,0x7e]
-0x01,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, s105           ; encoding: [0x69,0x1c,0x0a,0x7e]
-0x69,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, vcc_lo         ; encoding: [0x6a,0x1c,0x0a,0x7e]
-0x6a,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, vcc_hi         ; encoding: [0x6b,0x1c,0x0a,0x7e]
-0x6b,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, ttmp15         ; encoding: [0x7b,0x1c,0x0a,0x7e]
-0x7b,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, m0             ; encoding: [0x7d,0x1c,0x0a,0x7e]
-0x7d,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, exec_lo        ; encoding: [0x7e,0x1c,0x0a,0x7e]
-0x7e,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, exec_hi        ; encoding: [0x7f,0x1c,0x0a,0x7e]
-0x7f,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, null           ; encoding: [0x7c,0x1c,0x0a,0x7e]
-0x7c,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, -1             ; encoding: [0xc1,0x1c,0x0a,0x7e]
-0xc1,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, 0.5            ; encoding: [0xf0,0x1c,0x0a,0x7e]
-0xf0,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v5, src_scc        ; encoding: [0xfd,0x1c,0x0a,0x7e]
-0xfd,0x1c,0x0a,0x7e
-
-# GFX12: v_cvt_off_f32_i4_e32 v255, 0x4f         ; encoding: [0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00]
-0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00
-
-# GFX12: v_cvt_u16_f16_e32 v5, v1                ; encoding: [0x01,0xa5,0x0a,0x7e]
-0x01,0xa5,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, v127              ; encoding: [0x7f,0xa5,0x0a,0x7e]
-0x7f,0xa5,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, s1                ; encoding: [0x01,0xa4,0x0a,0x7e]
-0x01,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, s105              ; encoding: [0x69,0xa4,0x0a,0x7e]
-0x69,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, vcc_lo            ; encoding: [0x6a,0xa4,0x0a,0x7e]
-0x6a,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, vcc_hi            ; encoding: [0x6b,0xa4,0x0a,0x7e]
-0x6b,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, ttmp15            ; encoding: [0x7b,0xa4,0x0a,0x7e]
-0x7b,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, m0                ; encoding: [0x7d,0xa4,0x0a,0x7e]
-0x7d,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, exec_lo           ; encoding: [0x7e,0xa4,0x0a,0x7e]
-0x7e,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, exec_hi           ; encoding: [0x7f,0xa4,0x0a,0x7e]
-0x7f,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, null              ; encoding: [0x7c,0xa4,0x0a,0x7e]
-0x7c,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, -1                ; encoding: [0xc1,0xa4,0x0a,0x7e]
-0xc1,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, 0.5               ; encoding: [0xf0,0xa4,0x0a,0x7e]
-0xf0,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v5, src_scc           ; encoding: [0xfd,0xa4,0x0a,0x7e]
-0xfd,0xa4,0x0a,0x7e
-
-# GFX12: v_cvt_u16_f16_e32 v127, 0xfe0b          ; encoding: [0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_cvt_u32_f32_e32 v5, v1                ; encoding: [0x01,0x0f,0x0a,0x7e]
-0x01,0x0f,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, v255              ; encoding: [0xff,0x0f,0x0a,0x7e]
-0xff,0x0f,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, s1                ; encoding: [0x01,0x0e,0x0a,0x7e]
-0x01,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, s105              ; encoding: [0x69,0x0e,0x0a,0x7e]
-0x69,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, vcc_lo            ; encoding: [0x6a,0x0e,0x0a,0x7e]
-0x6a,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, vcc_hi            ; encoding: [0x6b,0x0e,0x0a,0x7e]
-0x6b,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, ttmp15            ; encoding: [0x7b,0x0e,0x0a,0x7e]
-0x7b,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, m0                ; encoding: [0x7d,0x0e,0x0a,0x7e]
-0x7d,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, exec_lo           ; encoding: [0x7e,0x0e,0x0a,0x7e]
-0x7e,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, exec_hi           ; encoding: [0x7f,0x0e,0x0a,0x7e]
-0x7f,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, null              ; encoding: [0x7c,0x0e,0x0a,0x7e]
-0x7c,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, -1                ; encoding: [0xc1,0x0e,0x0a,0x7e]
-0xc1,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, 0.5               ; encoding: [0xf0,0x0e,0x0a,0x7e]
-0xf0,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v5, src_scc           ; encoding: [0xfd,0x0e,0x0a,0x7e]
-0xfd,0x0e,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f32_e32 v255, 0xaf123456      ; encoding: [0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_u32_f64_e32 v5, v[1:2]            ; encoding: [0x01,0x2b,0x0a,0x7e]
-0x01,0x2b,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, v[254:255]        ; encoding: [0xfe,0x2b,0x0a,0x7e]
-0xfe,0x2b,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, s[2:3]            ; encoding: [0x02,0x2a,0x0a,0x7e]
-0x02,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, s[104:105]        ; encoding: [0x68,0x2a,0x0a,0x7e]
-0x68,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, vcc               ; encoding: [0x6a,0x2a,0x0a,0x7e]
-0x6a,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, ttmp[14:15]       ; encoding: [0x7a,0x2a,0x0a,0x7e]
-0x7a,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, exec              ; encoding: [0x7e,0x2a,0x0a,0x7e]
-0x7e,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, null              ; encoding: [0x7c,0x2a,0x0a,0x7e]
-0x7c,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, -1                ; encoding: [0xc1,0x2a,0x0a,0x7e]
-0xc1,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, 0.5               ; encoding: [0xf0,0x2a,0x0a,0x7e]
-0xf0,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v5, src_scc           ; encoding: [0xfd,0x2a,0x0a,0x7e]
-0xfd,0x2a,0x0a,0x7e
-
-# GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
-0x01,0xd7,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
-0x7f,0xd7,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
-0x01,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, s105              ; encoding: [0x69,0xd6,0x0a,0x7e]
-0x69,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, vcc_lo            ; encoding: [0x6a,0xd6,0x0a,0x7e]
-0x6a,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, vcc_hi            ; encoding: [0x6b,0xd6,0x0a,0x7e]
-0x6b,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, ttmp15            ; encoding: [0x7b,0xd6,0x0a,0x7e]
-0x7b,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, m0                ; encoding: [0x7d,0xd6,0x0a,0x7e]
-0x7d,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, exec_lo           ; encoding: [0x7e,0xd6,0x0a,0x7e]
-0x7e,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, exec_hi           ; encoding: [0x7f,0xd6,0x0a,0x7e]
-0x7f,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, null              ; encoding: [0x7c,0xd6,0x0a,0x7e]
-0x7c,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, -1                ; encoding: [0xc1,0xd6,0x0a,0x7e]
-0xc1,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, 0x3800
-0xf0,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v5, src_scc           ; encoding: [0xfd,0xd6,0x0a,0x7e]
-0xfd,0xd6,0x0a,0x7e
-
-# GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
-0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_exp_f16_e32 v5, v1                    ; encoding: [0x01,0xb1,0x0a,0x7e]
-0x01,0xb1,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, v127                  ; encoding: [0x7f,0xb1,0x0a,0x7e]
-0x7f,0xb1,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, s1                    ; encoding: [0x01,0xb0,0x0a,0x7e]
-0x01,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, s105                  ; encoding: [0x69,0xb0,0x0a,0x7e]
-0x69,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xb0,0x0a,0x7e]
-0x6a,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xb0,0x0a,0x7e]
-0x6b,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xb0,0x0a,0x7e]
-0x7b,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, m0                    ; encoding: [0x7d,0xb0,0x0a,0x7e]
-0x7d,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xb0,0x0a,0x7e]
-0x7e,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xb0,0x0a,0x7e]
-0x7f,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, null                  ; encoding: [0x7c,0xb0,0x0a,0x7e]
-0x7c,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, -1                    ; encoding: [0xc1,0xb0,0x0a,0x7e]
-0xc1,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xb0,0x0a,0x7e]
-0xf0,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xb0,0x0a,0x7e]
-0xfd,0xb0,0x0a,0x7e
-
-# GFX12: v_exp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_exp_f32_e32 v5, v1                    ; encoding: [0x01,0x4b,0x0a,0x7e]
-0x01,0x4b,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, v255                  ; encoding: [0xff,0x4b,0x0a,0x7e]
-0xff,0x4b,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, s1                    ; encoding: [0x01,0x4a,0x0a,0x7e]
-0x01,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, s105                  ; encoding: [0x69,0x4a,0x0a,0x7e]
-0x69,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4a,0x0a,0x7e]
-0x6a,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4a,0x0a,0x7e]
-0x6b,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4a,0x0a,0x7e]
-0x7b,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, m0                    ; encoding: [0x7d,0x4a,0x0a,0x7e]
-0x7d,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4a,0x0a,0x7e]
-0x7e,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4a,0x0a,0x7e]
-0x7f,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, null                  ; encoding: [0x7c,0x4a,0x0a,0x7e]
-0x7c,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, -1                    ; encoding: [0xc1,0x4a,0x0a,0x7e]
-0xc1,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4a,0x0a,0x7e]
-0xf0,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4a,0x0a,0x7e]
-0xfd,0x4a,0x0a,0x7e
-
-# GFX12: v_exp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_floor_f16_e32 v5, v1                  ; encoding: [0x01,0xb7,0x0a,0x7e]
-0x01,0xb7,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, v127                ; encoding: [0x7f,0xb7,0x0a,0x7e]
-0x7f,0xb7,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, s1                  ; encoding: [0x01,0xb6,0x0a,0x7e]
-0x01,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, s105                ; encoding: [0x69,0xb6,0x0a,0x7e]
-0x69,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xb6,0x0a,0x7e]
-0x6a,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xb6,0x0a,0x7e]
-0x6b,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xb6,0x0a,0x7e]
-0x7b,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, m0                  ; encoding: [0x7d,0xb6,0x0a,0x7e]
-0x7d,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xb6,0x0a,0x7e]
-0x7e,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xb6,0x0a,0x7e]
-0x7f,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, null                ; encoding: [0x7c,0xb6,0x0a,0x7e]
-0x7c,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, -1                  ; encoding: [0xc1,0xb6,0x0a,0x7e]
-0xc1,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xb6,0x0a,0x7e]
-0xf0,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v5, src_scc             ; encoding: [0xfd,0xb6,0x0a,0x7e]
-0xfd,0xb6,0x0a,0x7e
-
-# GFX12: v_floor_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_floor_f32_e32 v5, v1                  ; encoding: [0x01,0x49,0x0a,0x7e]
-0x01,0x49,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, v255                ; encoding: [0xff,0x49,0x0a,0x7e]
-0xff,0x49,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, s1                  ; encoding: [0x01,0x48,0x0a,0x7e]
-0x01,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, s105                ; encoding: [0x69,0x48,0x0a,0x7e]
-0x69,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x48,0x0a,0x7e]
-0x6a,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x48,0x0a,0x7e]
-0x6b,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x48,0x0a,0x7e]
-0x7b,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, m0                  ; encoding: [0x7d,0x48,0x0a,0x7e]
-0x7d,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x48,0x0a,0x7e]
-0x7e,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x48,0x0a,0x7e]
-0x7f,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, null                ; encoding: [0x7c,0x48,0x0a,0x7e]
-0x7c,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, -1                  ; encoding: [0xc1,0x48,0x0a,0x7e]
-0xc1,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x48,0x0a,0x7e]
-0xf0,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v5, src_scc             ; encoding: [0xfd,0x48,0x0a,0x7e]
-0xfd,0x48,0x0a,0x7e
-
-# GFX12: v_floor_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_floor_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x35,0x0a,0x7e]
-0x01,0x35,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x35,0x0a,0x7e]
-0xfe,0x35,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x34,0x0a,0x7e]
-0x02,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x34,0x0a,0x7e]
-0x68,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x34,0x0a,0x7e]
-0x6a,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x34,0x0a,0x7e]
-0x7a,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x34,0x0a,0x7e]
-0x7e,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], null            ; encoding: [0x7c,0x34,0x0a,0x7e]
-0x7c,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x34,0x0a,0x7e]
-0xc1,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x34,0x0a,0x7e]
-0xf0,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x34,0x0a,0x7e]
-0xfd,0x34,0x0a,0x7e
-
-# GFX12: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
-0x01,0xbf,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
-0x7f,0xbf,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
-0x01,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
-0x69,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
-0x6a,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
-0x6b,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
-0x7b,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
-0x7d,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
-0x7e,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
-0x7f,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
-0x7c,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
-0xc1,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
-0xf0,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
-0xfd,0xbe,0x0a,0x7e
-
-# GFX12: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
-0x01,0x41,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, v255                ; encoding: [0xff,0x41,0x0a,0x7e]
-0xff,0x41,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, s1                  ; encoding: [0x01,0x40,0x0a,0x7e]
-0x01,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, s105                ; encoding: [0x69,0x40,0x0a,0x7e]
-0x69,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x40,0x0a,0x7e]
-0x6a,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x40,0x0a,0x7e]
-0x6b,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x40,0x0a,0x7e]
-0x7b,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, m0                  ; encoding: [0x7d,0x40,0x0a,0x7e]
-0x7d,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x40,0x0a,0x7e]
-0x7e,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x40,0x0a,0x7e]
-0x7f,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, null                ; encoding: [0x7c,0x40,0x0a,0x7e]
-0x7c,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, -1                  ; encoding: [0xc1,0x40,0x0a,0x7e]
-0xc1,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x40,0x0a,0x7e]
-0xf0,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v5, src_scc             ; encoding: [0xfd,0x40,0x0a,0x7e]
-0xfd,0x40,0x0a,0x7e
-
-# GFX12: v_fract_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_fract_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x7d,0x0a,0x7e]
-0x01,0x7d,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x7d,0x0a,0x7e]
-0xfe,0x7d,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x7c,0x0a,0x7e]
-0x02,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x7c,0x0a,0x7e]
-0x68,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x7c,0x0a,0x7e]
-0x6a,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x7c,0x0a,0x7e]
-0x7a,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x7c,0x0a,0x7e]
-0x7e,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], null            ; encoding: [0x7c,0x7c,0x0a,0x7e]
-0x7c,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x7c,0x0a,0x7e]
-0xc1,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x7c,0x0a,0x7e]
-0xf0,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x7c,0x0a,0x7e]
-0xfd,0x7c,0x0a,0x7e
-
-# GFX12: v_fract_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, v1          ; encoding: [0x01,0xb5,0x0a,0x7e]
-0x01,0xb5,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, v127        ; encoding: [0x7f,0xb5,0x0a,0x7e]
-0x7f,0xb5,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, s1          ; encoding: [0x01,0xb4,0x0a,0x7e]
-0x01,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, s105        ; encoding: [0x69,0xb4,0x0a,0x7e]
-0x69,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_lo      ; encoding: [0x6a,0xb4,0x0a,0x7e]
-0x6a,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_hi      ; encoding: [0x6b,0xb4,0x0a,0x7e]
-0x6b,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, ttmp15      ; encoding: [0x7b,0xb4,0x0a,0x7e]
-0x7b,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, m0          ; encoding: [0x7d,0xb4,0x0a,0x7e]
-0x7d,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, exec_lo     ; encoding: [0x7e,0xb4,0x0a,0x7e]
-0x7e,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, exec_hi     ; encoding: [0x7f,0xb4,0x0a,0x7e]
-0x7f,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, null        ; encoding: [0x7c,0xb4,0x0a,0x7e]
-0x7c,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, -1          ; encoding: [0xc1,0xb4,0x0a,0x7e]
-0xc1,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, 0.5         ; encoding: [0xf0,0xb4,0x0a,0x7e]
-0xf0,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v5, src_scc     ; encoding: [0xfd,0xb4,0x0a,0x7e]
-0xfd,0xb4,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i16_f16_e32 v127, 0xfe0b    ; encoding: [0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, v1          ; encoding: [0x01,0x7f,0x0a,0x7e]
-0x01,0x7f,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, v255        ; encoding: [0xff,0x7f,0x0a,0x7e]
-0xff,0x7f,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, s1          ; encoding: [0x01,0x7e,0x0a,0x7e]
-0x01,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, s105        ; encoding: [0x69,0x7e,0x0a,0x7e]
-0x69,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_lo      ; encoding: [0x6a,0x7e,0x0a,0x7e]
-0x6a,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_hi      ; encoding: [0x6b,0x7e,0x0a,0x7e]
-0x6b,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, ttmp15      ; encoding: [0x7b,0x7e,0x0a,0x7e]
-0x7b,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, m0          ; encoding: [0x7d,0x7e,0x0a,0x7e]
-0x7d,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, exec_lo     ; encoding: [0x7e,0x7e,0x0a,0x7e]
-0x7e,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, exec_hi     ; encoding: [0x7f,0x7e,0x0a,0x7e]
-0x7f,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, null        ; encoding: [0x7c,0x7e,0x0a,0x7e]
-0x7c,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, -1          ; encoding: [0xc1,0x7e,0x0a,0x7e]
-0xc1,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, 0.5         ; encoding: [0xf0,0x7e,0x0a,0x7e]
-0xf0,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v5, src_scc     ; encoding: [0xfd,0x7e,0x0a,0x7e]
-0xfd,0x7e,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, v[1:2]      ; encoding: [0x01,0x79,0x0a,0x7e]
-0x01,0x79,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, v[254:255]  ; encoding: [0xfe,0x79,0x0a,0x7e]
-0xfe,0x79,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, s[2:3]      ; encoding: [0x02,0x78,0x0a,0x7e]
-0x02,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, s[104:105]  ; encoding: [0x68,0x78,0x0a,0x7e]
-0x68,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, vcc         ; encoding: [0x6a,0x78,0x0a,0x7e]
-0x6a,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x78,0x0a,0x7e]
-0x7a,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, exec        ; encoding: [0x7e,0x78,0x0a,0x7e]
-0x7e,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, null        ; encoding: [0x7c,0x78,0x0a,0x7e]
-0x7c,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, -1          ; encoding: [0xc1,0x78,0x0a,0x7e]
-0xc1,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, 0.5         ; encoding: [0xf0,0x78,0x0a,0x7e]
-0xf0,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v5, src_scc     ; encoding: [0xfd,0x78,0x0a,0x7e]
-0xfd,0x78,0x0a,0x7e
-
-# GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
-0x01,0xb3,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
-0x7f,0xb3,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
-0x01,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
-0x69,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
-0x6a,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
-0x6b,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
-0x7b,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
-0x7d,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
-0x7e,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
-0x7f,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
-0x7c,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
-0xc1,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
-0xf0,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
-0xfd,0xb2,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
-0x01,0x81,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, v255           ; encoding: [0xff,0x81,0x0a,0x7e]
-0xff,0x81,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, s1             ; encoding: [0x01,0x80,0x0a,0x7e]
-0x01,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, s105           ; encoding: [0x69,0x80,0x0a,0x7e]
-0x69,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, vcc_lo         ; encoding: [0x6a,0x80,0x0a,0x7e]
-0x6a,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, vcc_hi         ; encoding: [0x6b,0x80,0x0a,0x7e]
-0x6b,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, ttmp15         ; encoding: [0x7b,0x80,0x0a,0x7e]
-0x7b,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, m0             ; encoding: [0x7d,0x80,0x0a,0x7e]
-0x7d,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, exec_lo        ; encoding: [0x7e,0x80,0x0a,0x7e]
-0x7e,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, exec_hi        ; encoding: [0x7f,0x80,0x0a,0x7e]
-0x7f,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, null           ; encoding: [0x7c,0x80,0x0a,0x7e]
-0x7c,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, -1             ; encoding: [0xc1,0x80,0x0a,0x7e]
-0xc1,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, 0.5            ; encoding: [0xf0,0x80,0x0a,0x7e]
-0xf0,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v5, src_scc        ; encoding: [0xfd,0x80,0x0a,0x7e]
-0xfd,0x80,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f32_e32 v255, 0xaf123456   ; encoding: [0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], v[1:2]     ; encoding: [0x01,0x7b,0x0a,0x7e]
-0x01,0x7b,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x7b,0x0a,0x7e]
-0xfe,0x7b,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], s[2:3]     ; encoding: [0x02,0x7a,0x0a,0x7e]
-0x02,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x7a,0x0a,0x7e]
-0x68,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], vcc        ; encoding: [0x6a,0x7a,0x0a,0x7e]
-0x6a,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x7a,0x0a,0x7e]
-0x7a,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], exec       ; encoding: [0x7e,0x7a,0x0a,0x7e]
-0x7e,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], null       ; encoding: [0x7c,0x7a,0x0a,0x7e]
-0x7c,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], -1         ; encoding: [0xc1,0x7a,0x0a,0x7e]
-0xc1,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], 0.5        ; encoding: [0xf0,0x7a,0x0a,0x7e]
-0xf0,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[5:6], src_scc    ; encoding: [0xfd,0x7a,0x0a,0x7e]
-0xfd,0x7a,0x0a,0x7e
-
-# GFX12: v_frexp_mant_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_log_f16_e32 v5, v1                    ; encoding: [0x01,0xaf,0x0a,0x7e]
-0x01,0xaf,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, v127                  ; encoding: [0x7f,0xaf,0x0a,0x7e]
-0x7f,0xaf,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, s1                    ; encoding: [0x01,0xae,0x0a,0x7e]
-0x01,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, s105                  ; encoding: [0x69,0xae,0x0a,0x7e]
-0x69,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xae,0x0a,0x7e]
-0x6a,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xae,0x0a,0x7e]
-0x6b,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xae,0x0a,0x7e]
-0x7b,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, m0                    ; encoding: [0x7d,0xae,0x0a,0x7e]
-0x7d,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xae,0x0a,0x7e]
-0x7e,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xae,0x0a,0x7e]
-0x7f,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, null                  ; encoding: [0x7c,0xae,0x0a,0x7e]
-0x7c,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, -1                    ; encoding: [0xc1,0xae,0x0a,0x7e]
-0xc1,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xae,0x0a,0x7e]
-0xf0,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v5, src_scc               ; encoding: [0xfd,0xae,0x0a,0x7e]
-0xfd,0xae,0x0a,0x7e
-
-# GFX12: v_log_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_log_f32_e32 v5, v1                    ; encoding: [0x01,0x4f,0x0a,0x7e]
-0x01,0x4f,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, v255                  ; encoding: [0xff,0x4f,0x0a,0x7e]
-0xff,0x4f,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, s1                    ; encoding: [0x01,0x4e,0x0a,0x7e]
-0x01,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, s105                  ; encoding: [0x69,0x4e,0x0a,0x7e]
-0x69,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x4e,0x0a,0x7e]
-0x6a,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x4e,0x0a,0x7e]
-0x6b,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x4e,0x0a,0x7e]
-0x7b,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, m0                    ; encoding: [0x7d,0x4e,0x0a,0x7e]
-0x7d,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x4e,0x0a,0x7e]
-0x7e,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x4e,0x0a,0x7e]
-0x7f,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, null                  ; encoding: [0x7c,0x4e,0x0a,0x7e]
-0x7c,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, -1                    ; encoding: [0xc1,0x4e,0x0a,0x7e]
-0xc1,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x4e,0x0a,0x7e]
-0xf0,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v5, src_scc               ; encoding: [0xfd,0x4e,0x0a,0x7e]
-0xfd,0x4e,0x0a,0x7e
-
-# GFX12: v_log_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_mov_b32_e32 v5, v1                    ; encoding: [0x01,0x03,0x0a,0x7e]
-0x01,0x03,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, v255                  ; encoding: [0xff,0x03,0x0a,0x7e]
-0xff,0x03,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, s1                    ; encoding: [0x01,0x02,0x0a,0x7e]
-0x01,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, s105                  ; encoding: [0x69,0x02,0x0a,0x7e]
-0x69,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x02,0x0a,0x7e]
-0x6a,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x02,0x0a,0x7e]
-0x6b,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x02,0x0a,0x7e]
-0x7b,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, m0                    ; encoding: [0x7d,0x02,0x0a,0x7e]
-0x7d,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x02,0x0a,0x7e]
-0x7e,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x02,0x0a,0x7e]
-0x7f,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, null                  ; encoding: [0x7c,0x02,0x0a,0x7e]
-0x7c,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, -1                    ; encoding: [0xc1,0x02,0x0a,0x7e]
-0xc1,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x02,0x0a,0x7e]
-0xf0,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v5, src_scc               ; encoding: [0xfd,0x02,0x0a,0x7e]
-0xfd,0x02,0x0a,0x7e
-
-# GFX12: v_mov_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_movreld_b32_e32 v5, v1                ; encoding: [0x01,0x85,0x0a,0x7e]
-0x01,0x85,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, v255              ; encoding: [0xff,0x85,0x0a,0x7e]
-0xff,0x85,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, s1                ; encoding: [0x01,0x84,0x0a,0x7e]
-0x01,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, s105              ; encoding: [0x69,0x84,0x0a,0x7e]
-0x69,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, vcc_lo            ; encoding: [0x6a,0x84,0x0a,0x7e]
-0x6a,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, vcc_hi            ; encoding: [0x6b,0x84,0x0a,0x7e]
-0x6b,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, ttmp15            ; encoding: [0x7b,0x84,0x0a,0x7e]
-0x7b,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, m0                ; encoding: [0x7d,0x84,0x0a,0x7e]
-0x7d,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, exec_lo           ; encoding: [0x7e,0x84,0x0a,0x7e]
-0x7e,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, exec_hi           ; encoding: [0x7f,0x84,0x0a,0x7e]
-0x7f,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, null              ; encoding: [0x7c,0x84,0x0a,0x7e]
-0x7c,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, -1                ; encoding: [0xc1,0x84,0x0a,0x7e]
-0xc1,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, 0.5               ; encoding: [0xf0,0x84,0x0a,0x7e]
-0xf0,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v5, src_scc           ; encoding: [0xfd,0x84,0x0a,0x7e]
-0xfd,0x84,0x0a,0x7e
-
-# GFX12: v_movreld_b32_e32 v255, 0xaf123456      ; encoding: [0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_movrels_b32_e32 v5, v1                ; encoding: [0x01,0x87,0x0a,0x7e]
-0x01,0x87,0x0a,0x7e
-
-# GFX12: v_movrels_b32_e32 v255, v255            ; encoding: [0xff,0x87,0xfe,0x7f]
-0xff,0x87,0xfe,0x7f
-
-# GFX12: v_movrelsd_2_b32_e32 v5, v1             ; encoding: [0x01,0x91,0x0a,0x7e]
-0x01,0x91,0x0a,0x7e
-
-# GFX12: v_movrelsd_2_b32_e32 v255, v255         ; encoding: [0xff,0x91,0xfe,0x7f]
-0xff,0x91,0xfe,0x7f
-
-# GFX12: v_movrelsd_b32_e32 v5, v1               ; encoding: [0x01,0x89,0x0a,0x7e]
-0x01,0x89,0x0a,0x7e
-
-# GFX12: v_movrelsd_b32_e32 v255, v255           ; encoding: [0xff,0x89,0xfe,0x7f]
-0xff,0x89,0xfe,0x7f
-
-# GFX12: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
-0x00,0x00,0x00,0x7e
-
-# GFX12: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
-0x01,0xd3,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
-0x7f,0xd3,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
-0x01,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
-0x69,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
-0x6a,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
-0x6b,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
-0x7b,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
-0x7d,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
-0x7e,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
-0x7f,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
-0x7c,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
-0xc1,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, 0x3800
-0xf0,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
-0xfd,0xd2,0x0a,0x7e
-
-# GFX12: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
-0x01,0x6f,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, v255                  ; encoding: [0xff,0x6f,0x0a,0x7e]
-0xff,0x6f,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, s1                    ; encoding: [0x01,0x6e,0x0a,0x7e]
-0x01,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, s105                  ; encoding: [0x69,0x6e,0x0a,0x7e]
-0x69,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6e,0x0a,0x7e]
-0x6a,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6e,0x0a,0x7e]
-0x6b,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, ttmp15                ; encoding: [0x7b,0x6e,0x0a,0x7e]
-0x7b,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, m0                    ; encoding: [0x7d,0x6e,0x0a,0x7e]
-0x7d,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, exec_lo               ; encoding: [0x7e,0x6e,0x0a,0x7e]
-0x7e,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, exec_hi               ; encoding: [0x7f,0x6e,0x0a,0x7e]
-0x7f,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, null                  ; encoding: [0x7c,0x6e,0x0a,0x7e]
-0x7c,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, -1                    ; encoding: [0xc1,0x6e,0x0a,0x7e]
-0xc1,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, 0.5                   ; encoding: [0xf0,0x6e,0x0a,0x7e]
-0xf0,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v5, src_scc               ; encoding: [0xfd,0x6e,0x0a,0x7e]
-0xfd,0x6e,0x0a,0x7e
-
-# GFX12: v_not_b32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_permlane64_b32 v5, v1                 ; encoding: [0x01,0xcf,0x0a,0x7e]
-0x01,0xcf,0x0a,0x7e
-
-# GFX12: v_permlane64_b32 v255, v255             ; encoding: [0xff,0xcf,0xfe,0x7f]
-0xff,0xcf,0xfe,0x7f
-
-# GFX12: v_pipeflush                             ; encoding: [0x00,0x36,0x00,0x7e]
-0x00,0x36,0x00,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, v1                    ; encoding: [0x01,0xa9,0x0a,0x7e]
-0x01,0xa9,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, v127                  ; encoding: [0x7f,0xa9,0x0a,0x7e]
-0x7f,0xa9,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, s1                    ; encoding: [0x01,0xa8,0x0a,0x7e]
-0x01,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, s105                  ; encoding: [0x69,0xa8,0x0a,0x7e]
-0x69,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xa8,0x0a,0x7e]
-0x6a,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xa8,0x0a,0x7e]
-0x6b,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xa8,0x0a,0x7e]
-0x7b,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, m0                    ; encoding: [0x7d,0xa8,0x0a,0x7e]
-0x7d,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xa8,0x0a,0x7e]
-0x7e,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xa8,0x0a,0x7e]
-0x7f,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, null                  ; encoding: [0x7c,0xa8,0x0a,0x7e]
-0x7c,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, -1                    ; encoding: [0xc1,0xa8,0x0a,0x7e]
-0xc1,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xa8,0x0a,0x7e]
-0xf0,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v5, src_scc               ; encoding: [0xfd,0xa8,0x0a,0x7e]
-0xfd,0xa8,0x0a,0x7e
-
-# GFX12: v_rcp_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_rcp_f32_e32 v5, v1                    ; encoding: [0x01,0x55,0x0a,0x7e]
-0x01,0x55,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, v255                  ; encoding: [0xff,0x55,0x0a,0x7e]
-0xff,0x55,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, s1                    ; encoding: [0x01,0x54,0x0a,0x7e]
-0x01,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, s105                  ; encoding: [0x69,0x54,0x0a,0x7e]
-0x69,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x54,0x0a,0x7e]
-0x6a,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x54,0x0a,0x7e]
-0x6b,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x54,0x0a,0x7e]
-0x7b,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, m0                    ; encoding: [0x7d,0x54,0x0a,0x7e]
-0x7d,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x54,0x0a,0x7e]
-0x7e,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x54,0x0a,0x7e]
-0x7f,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, null                  ; encoding: [0x7c,0x54,0x0a,0x7e]
-0x7c,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, -1                    ; encoding: [0xc1,0x54,0x0a,0x7e]
-0xc1,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x54,0x0a,0x7e]
-0xf0,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v5, src_scc               ; encoding: [0xfd,0x54,0x0a,0x7e]
-0xfd,0x54,0x0a,0x7e
-
-# GFX12: v_rcp_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rcp_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x5f,0x0a,0x7e]
-0x01,0x5f,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x5f,0x0a,0x7e]
-0xfe,0x5f,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x5e,0x0a,0x7e]
-0x02,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x5e,0x0a,0x7e]
-0x68,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x5e,0x0a,0x7e]
-0x6a,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x5e,0x0a,0x7e]
-0x7a,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x5e,0x0a,0x7e]
-0x7e,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], null              ; encoding: [0x7c,0x5e,0x0a,0x7e]
-0x7c,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x5e,0x0a,0x7e]
-0xc1,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x5e,0x0a,0x7e]
-0xf0,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x5e,0x0a,0x7e]
-0xfd,0x5e,0x0a,0x7e
-
-# GFX12: v_rcp_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rcp_iflag_f32_e32 v5, v1              ; encoding: [0x01,0x57,0x0a,0x7e]
-0x01,0x57,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, v255            ; encoding: [0xff,0x57,0x0a,0x7e]
-0xff,0x57,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, s1              ; encoding: [0x01,0x56,0x0a,0x7e]
-0x01,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, s105            ; encoding: [0x69,0x56,0x0a,0x7e]
-0x69,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, vcc_lo          ; encoding: [0x6a,0x56,0x0a,0x7e]
-0x6a,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, vcc_hi          ; encoding: [0x6b,0x56,0x0a,0x7e]
-0x6b,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, ttmp15          ; encoding: [0x7b,0x56,0x0a,0x7e]
-0x7b,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, m0              ; encoding: [0x7d,0x56,0x0a,0x7e]
-0x7d,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, exec_lo         ; encoding: [0x7e,0x56,0x0a,0x7e]
-0x7e,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, exec_hi         ; encoding: [0x7f,0x56,0x0a,0x7e]
-0x7f,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, null            ; encoding: [0x7c,0x56,0x0a,0x7e]
-0x7c,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, -1              ; encoding: [0xc1,0x56,0x0a,0x7e]
-0xc1,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, 0.5             ; encoding: [0xf0,0x56,0x0a,0x7e]
-0xf0,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v5, src_scc         ; encoding: [0xfd,0x56,0x0a,0x7e]
-0xfd,0x56,0x0a,0x7e
-
-# GFX12: v_rcp_iflag_f32_e32 v255, 0xaf123456    ; encoding: [0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
-0x01,0xbd,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
-0x7f,0xbd,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
-0x01,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
-0x69,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
-0x6a,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
-0x6b,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
-0x7b,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
-0x7d,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
-0x7e,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
-0x7f,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
-0x7c,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
-0xc1,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
-0xf0,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
-0xfd,0xbc,0x0a,0x7e
-
-# GFX12: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
-0x01,0x47,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, v255                ; encoding: [0xff,0x47,0x0a,0x7e]
-0xff,0x47,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, s1                  ; encoding: [0x01,0x46,0x0a,0x7e]
-0x01,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, s105                ; encoding: [0x69,0x46,0x0a,0x7e]
-0x69,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x46,0x0a,0x7e]
-0x6a,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x46,0x0a,0x7e]
-0x6b,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x46,0x0a,0x7e]
-0x7b,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, m0                  ; encoding: [0x7d,0x46,0x0a,0x7e]
-0x7d,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x46,0x0a,0x7e]
-0x7e,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x46,0x0a,0x7e]
-0x7f,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, null                ; encoding: [0x7c,0x46,0x0a,0x7e]
-0x7c,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, -1                  ; encoding: [0xc1,0x46,0x0a,0x7e]
-0xc1,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x46,0x0a,0x7e]
-0xf0,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v5, src_scc             ; encoding: [0xfd,0x46,0x0a,0x7e]
-0xfd,0x46,0x0a,0x7e
-
-# GFX12: v_rndne_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rndne_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x33,0x0a,0x7e]
-0x01,0x33,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x33,0x0a,0x7e]
-0xfe,0x33,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x32,0x0a,0x7e]
-0x02,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x32,0x0a,0x7e]
-0x68,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x32,0x0a,0x7e]
-0x6a,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x32,0x0a,0x7e]
-0x7a,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x32,0x0a,0x7e]
-0x7e,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], null            ; encoding: [0x7c,0x32,0x0a,0x7e]
-0x7c,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x32,0x0a,0x7e]
-0xc1,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x32,0x0a,0x7e]
-0xf0,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x32,0x0a,0x7e]
-0xfd,0x32,0x0a,0x7e
-
-# GFX12: v_rndne_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rsq_f16_e32 v5, v1                    ; encoding: [0x01,0xad,0x0a,0x7e]
-0x01,0xad,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, v127                  ; encoding: [0x7f,0xad,0x0a,0x7e]
-0x7f,0xad,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, s1                    ; encoding: [0x01,0xac,0x0a,0x7e]
-0x01,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, s105                  ; encoding: [0x69,0xac,0x0a,0x7e]
-0x69,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xac,0x0a,0x7e]
-0x6a,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xac,0x0a,0x7e]
-0x6b,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xac,0x0a,0x7e]
-0x7b,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, m0                    ; encoding: [0x7d,0xac,0x0a,0x7e]
-0x7d,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xac,0x0a,0x7e]
-0x7e,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xac,0x0a,0x7e]
-0x7f,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, null                  ; encoding: [0x7c,0xac,0x0a,0x7e]
-0x7c,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, -1                    ; encoding: [0xc1,0xac,0x0a,0x7e]
-0xc1,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xac,0x0a,0x7e]
-0xf0,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v5, src_scc               ; encoding: [0xfd,0xac,0x0a,0x7e]
-0xfd,0xac,0x0a,0x7e
-
-# GFX12: v_rsq_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_rsq_f32_e32 v5, v1                    ; encoding: [0x01,0x5d,0x0a,0x7e]
-0x01,0x5d,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, v255                  ; encoding: [0xff,0x5d,0x0a,0x7e]
-0xff,0x5d,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, s1                    ; encoding: [0x01,0x5c,0x0a,0x7e]
-0x01,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, s105                  ; encoding: [0x69,0x5c,0x0a,0x7e]
-0x69,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x5c,0x0a,0x7e]
-0x6a,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x5c,0x0a,0x7e]
-0x6b,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x5c,0x0a,0x7e]
-0x7b,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, m0                    ; encoding: [0x7d,0x5c,0x0a,0x7e]
-0x7d,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x5c,0x0a,0x7e]
-0x7e,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x5c,0x0a,0x7e]
-0x7f,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, null                  ; encoding: [0x7c,0x5c,0x0a,0x7e]
-0x7c,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, -1                    ; encoding: [0xc1,0x5c,0x0a,0x7e]
-0xc1,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x5c,0x0a,0x7e]
-0xf0,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v5, src_scc               ; encoding: [0xfd,0x5c,0x0a,0x7e]
-0xfd,0x5c,0x0a,0x7e
-
-# GFX12: v_rsq_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_rsq_f64_e32 v[5:6], v[1:2]            ; encoding: [0x01,0x63,0x0a,0x7e]
-0x01,0x63,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], v[254:255]        ; encoding: [0xfe,0x63,0x0a,0x7e]
-0xfe,0x63,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], s[2:3]            ; encoding: [0x02,0x62,0x0a,0x7e]
-0x02,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], s[104:105]        ; encoding: [0x68,0x62,0x0a,0x7e]
-0x68,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], vcc               ; encoding: [0x6a,0x62,0x0a,0x7e]
-0x6a,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], ttmp[14:15]       ; encoding: [0x7a,0x62,0x0a,0x7e]
-0x7a,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], exec              ; encoding: [0x7e,0x62,0x0a,0x7e]
-0x7e,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], null              ; encoding: [0x7c,0x62,0x0a,0x7e]
-0x7c,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], -1                ; encoding: [0xc1,0x62,0x0a,0x7e]
-0xc1,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], 0.5               ; encoding: [0xf0,0x62,0x0a,0x7e]
-0xf0,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[5:6], src_scc           ; encoding: [0xfd,0x62,0x0a,0x7e]
-0xfd,0x62,0x0a,0x7e
-
-# GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
-0x01,0xc5,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
-0xff,0xc5,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
-0x01,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
-0x69,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
-0x6a,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
-0x6b,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
-0x7b,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
-0x7d,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
-0x7e,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
-0x7f,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
-0x7c,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
-0xc1,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
-0xf0,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
-0xfd,0xc4,0x0a,0x7e
-
-# GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
-0x01,0xc1,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
-0x7f,0xc1,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
-0x01,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
-0x69,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
-0x6a,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
-0x6b,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
-0x7b,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
-0x7d,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
-0x7e,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
-0x7f,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
-0x7c,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
-0xc1,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
-0xf0,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
-0xfd,0xc0,0x0a,0x7e
-
-# GFX12: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
-0x01,0x6b,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, v255                  ; encoding: [0xff,0x6b,0x0a,0x7e]
-0xff,0x6b,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, s1                    ; encoding: [0x01,0x6a,0x0a,0x7e]
-0x01,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, s105                  ; encoding: [0x69,0x6a,0x0a,0x7e]
-0x69,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, vcc_lo                ; encoding: [0x6a,0x6a,0x0a,0x7e]
-0x6a,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, vcc_hi                ; encoding: [0x6b,0x6a,0x0a,0x7e]
-0x6b,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, ttmp15                ; encoding: [0x7b,0x6a,0x0a,0x7e]
-0x7b,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, m0                    ; encoding: [0x7d,0x6a,0x0a,0x7e]
-0x7d,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, exec_lo               ; encoding: [0x7e,0x6a,0x0a,0x7e]
-0x7e,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, exec_hi               ; encoding: [0x7f,0x6a,0x0a,0x7e]
-0x7f,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, null                  ; encoding: [0x7c,0x6a,0x0a,0x7e]
-0x7c,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, -1                    ; encoding: [0xc1,0x6a,0x0a,0x7e]
-0xc1,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, 0.5                   ; encoding: [0xf0,0x6a,0x0a,0x7e]
-0xf0,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v5, src_scc               ; encoding: [0xfd,0x6a,0x0a,0x7e]
-0xfd,0x6a,0x0a,0x7e
-
-# GFX12: v_sin_f32_e32 v255, 0xaf123456          ; encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_sqrt_f16_e32 v5, v1                   ; encoding: [0x01,0xab,0x0a,0x7e]
-0x01,0xab,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, v127                 ; encoding: [0x7f,0xab,0x0a,0x7e]
-0x7f,0xab,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, s1                   ; encoding: [0x01,0xaa,0x0a,0x7e]
-0x01,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, s105                 ; encoding: [0x69,0xaa,0x0a,0x7e]
-0x69,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, vcc_lo               ; encoding: [0x6a,0xaa,0x0a,0x7e]
-0x6a,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, vcc_hi               ; encoding: [0x6b,0xaa,0x0a,0x7e]
-0x6b,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, ttmp15               ; encoding: [0x7b,0xaa,0x0a,0x7e]
-0x7b,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, m0                   ; encoding: [0x7d,0xaa,0x0a,0x7e]
-0x7d,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, exec_lo              ; encoding: [0x7e,0xaa,0x0a,0x7e]
-0x7e,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, exec_hi              ; encoding: [0x7f,0xaa,0x0a,0x7e]
-0x7f,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, null                 ; encoding: [0x7c,0xaa,0x0a,0x7e]
-0x7c,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, -1                   ; encoding: [0xc1,0xaa,0x0a,0x7e]
-0xc1,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, 0.5                  ; encoding: [0xf0,0xaa,0x0a,0x7e]
-0xf0,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v5, src_scc              ; encoding: [0xfd,0xaa,0x0a,0x7e]
-0xfd,0xaa,0x0a,0x7e
-
-# GFX12: v_sqrt_f16_e32 v127, 0xfe0b             ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_sqrt_f32_e32 v5, v1                   ; encoding: [0x01,0x67,0x0a,0x7e]
-0x01,0x67,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, v255                 ; encoding: [0xff,0x67,0x0a,0x7e]
-0xff,0x67,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, s1                   ; encoding: [0x01,0x66,0x0a,0x7e]
-0x01,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, s105                 ; encoding: [0x69,0x66,0x0a,0x7e]
-0x69,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, vcc_lo               ; encoding: [0x6a,0x66,0x0a,0x7e]
-0x6a,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, vcc_hi               ; encoding: [0x6b,0x66,0x0a,0x7e]
-0x6b,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, ttmp15               ; encoding: [0x7b,0x66,0x0a,0x7e]
-0x7b,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, m0                   ; encoding: [0x7d,0x66,0x0a,0x7e]
-0x7d,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, exec_lo              ; encoding: [0x7e,0x66,0x0a,0x7e]
-0x7e,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, exec_hi              ; encoding: [0x7f,0x66,0x0a,0x7e]
-0x7f,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, null                 ; encoding: [0x7c,0x66,0x0a,0x7e]
-0x7c,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, -1                   ; encoding: [0xc1,0x66,0x0a,0x7e]
-0xc1,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, 0.5                  ; encoding: [0xf0,0x66,0x0a,0x7e]
-0xf0,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v5, src_scc              ; encoding: [0xfd,0x66,0x0a,0x7e]
-0xfd,0x66,0x0a,0x7e
-
-# GFX12: v_sqrt_f32_e32 v255, 0xaf123456         ; encoding: [0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_sqrt_f64_e32 v[5:6], v[1:2]           ; encoding: [0x01,0x69,0x0a,0x7e]
-0x01,0x69,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], v[254:255]       ; encoding: [0xfe,0x69,0x0a,0x7e]
-0xfe,0x69,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], s[2:3]           ; encoding: [0x02,0x68,0x0a,0x7e]
-0x02,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], s[104:105]       ; encoding: [0x68,0x68,0x0a,0x7e]
-0x68,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], vcc              ; encoding: [0x6a,0x68,0x0a,0x7e]
-0x6a,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], ttmp[14:15]      ; encoding: [0x7a,0x68,0x0a,0x7e]
-0x7a,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], exec             ; encoding: [0x7e,0x68,0x0a,0x7e]
-0x7e,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], null             ; encoding: [0x7c,0x68,0x0a,0x7e]
-0x7c,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], -1               ; encoding: [0xc1,0x68,0x0a,0x7e]
-0xc1,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], 0.5              ; encoding: [0xf0,0x68,0x0a,0x7e]
-0xf0,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[5:6], src_scc          ; encoding: [0xfd,0x68,0x0a,0x7e]
-0xfd,0x68,0x0a,0x7e
-
-# GFX12: v_sqrt_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_swap_b32 v5, v1                       ; encoding: [0x01,0xcb,0x0a,0x7e]
-0x01,0xcb,0x0a,0x7e
-
-# GFX12: v_swap_b32 v255, v255                   ; encoding: [0xff,0xcb,0xfe,0x7f]
-0xff,0xcb,0xfe,0x7f
-
-# GFX12: v_swaprel_b32 v5, v1                    ; encoding: [0x01,0xd1,0x0a,0x7e]
-0x01,0xd1,0x0a,0x7e
-
-# GFX12: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
-0xff,0xd1,0xfe,0x7f
-
-# GFX12: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
-0x01,0xbb,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
-0x7f,0xbb,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
-0x01,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
-0x69,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
-0x6a,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
-0x6b,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
-0x7b,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
-0x7d,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
-0x7e,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
-0x7f,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
-0x7c,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
-0xc1,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
-0xf0,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
-0xfd,0xba,0x0a,0x7e
-
-# GFX12: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
-0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-
-# GFX12: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
-0x01,0x43,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, v255                ; encoding: [0xff,0x43,0x0a,0x7e]
-0xff,0x43,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, s1                  ; encoding: [0x01,0x42,0x0a,0x7e]
-0x01,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, s105                ; encoding: [0x69,0x42,0x0a,0x7e]
-0x69,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, vcc_lo              ; encoding: [0x6a,0x42,0x0a,0x7e]
-0x6a,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, vcc_hi              ; encoding: [0x6b,0x42,0x0a,0x7e]
-0x6b,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, ttmp15              ; encoding: [0x7b,0x42,0x0a,0x7e]
-0x7b,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, m0                  ; encoding: [0x7d,0x42,0x0a,0x7e]
-0x7d,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, exec_lo             ; encoding: [0x7e,0x42,0x0a,0x7e]
-0x7e,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, exec_hi             ; encoding: [0x7f,0x42,0x0a,0x7e]
-0x7f,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, null                ; encoding: [0x7c,0x42,0x0a,0x7e]
-0x7c,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, -1                  ; encoding: [0xc1,0x42,0x0a,0x7e]
-0xc1,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, 0.5                 ; encoding: [0xf0,0x42,0x0a,0x7e]
-0xf0,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v5, src_scc             ; encoding: [0xfd,0x42,0x0a,0x7e]
-0xfd,0x42,0x0a,0x7e
-
-# GFX12: v_trunc_f32_e32 v255, 0xaf123456        ; encoding: [0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf
-
-# GFX12: v_trunc_f64_e32 v[5:6], v[1:2]          ; encoding: [0x01,0x2f,0x0a,0x7e]
-0x01,0x2f,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], v[254:255]      ; encoding: [0xfe,0x2f,0x0a,0x7e]
-0xfe,0x2f,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], s[2:3]          ; encoding: [0x02,0x2e,0x0a,0x7e]
-0x02,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], s[104:105]      ; encoding: [0x68,0x2e,0x0a,0x7e]
-0x68,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], vcc             ; encoding: [0x6a,0x2e,0x0a,0x7e]
-0x6a,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], ttmp[14:15]     ; encoding: [0x7a,0x2e,0x0a,0x7e]
-0x7a,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], exec            ; encoding: [0x7e,0x2e,0x0a,0x7e]
-0x7e,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], null            ; encoding: [0x7c,0x2e,0x0a,0x7e]
-0x7c,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], -1              ; encoding: [0xc1,0x2e,0x0a,0x7e]
-0xc1,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], 0.5             ; encoding: [0xf0,0x2e,0x0a,0x7e]
-0xf0,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[5:6], src_scc         ; encoding: [0xfd,0x2e,0x0a,0x7e]
-0xfd,0x2e,0x0a,0x7e
-
-# GFX12: v_trunc_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
-0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf
diff --git a/llvm/test/MC/ELF/AArch64/cfi.s b/llvm/test/MC/ELF/AArch64/cfi.s
index 033c8d9c04094..6bdf03cc7bb85 100644
--- a/llvm/test/MC/ELF/AArch64/cfi.s
+++ b/llvm/test/MC/ELF/AArch64/cfi.s
@@ -227,7 +227,7 @@ f37:
         .cfi_endproc
 
 // CHECK:       Section {
-// CHECK:         Name: .eh_frame (20)
+// CHECK:         Name: .eh_frame
 // CHECK-NEXT:    Type: SHT_PROGBITS (0x1)
 // CHECK-NEXT:    Flags [ (0x2)
 // CHECK-NEXT:      SHF_ALLOC (0x2)
@@ -355,7 +355,7 @@ f37:
 // CHECK-NEXT:    )
 // CHECK-NEXT:  }
 // CHECK:       Section {
-// CHECK:         Name: .rela.eh_frame (15)
+// CHECK:         Name: .rela.eh_frame
 // CHECK-NEXT:    Type: SHT_RELA (0x4)
 // CHECK-NEXT:    Flags [ (0x40)
 // CHECK-NEXT:      SHF_INFO_LINK (0x40)
diff --git a/llvm/test/MC/ELF/ARM/execute-only-section.s b/llvm/test/MC/ELF/ARM/execute-only-section.s
index 12020e030cc04..ac5e31f70dba0 100644
--- a/llvm/test/MC/ELF/ARM/execute-only-section.s
+++ b/llvm/test/MC/ELF/ARM/execute-only-section.s
@@ -18,7 +18,7 @@ foo:
 
 
 // CHECK:      Section {
-// CHECK:        Name: .text (16)
+// CHECK:        Name: .text
 // CHECK-NEXT:   Type: SHT_PROGBITS (0x1)
 // CHECK-NEXT:   Flags [ (0x20000006)
 // CHECK-NEXT:     SHF_ALLOC (0x2)
@@ -29,7 +29,7 @@ foo:
 // CHECK:      }
 
 // CHECK:      Section {
-// CHECK:        Name: .text (16)
+// CHECK:        Name: .text
 // CHECK-NEXT:   Type: SHT_PROGBITS (0x1)
 // CHECK-NEXT:   Flags [ (0x20000006)
 // CHECK-NEXT:     SHF_ALLOC (0x2)
@@ -40,6 +40,6 @@ foo:
 // CHECK:      }
 
 // CHECK: Symbol {
-// CHECK:   Name: foo (22)
+// CHECK:   Name: foo
 // CHECK:   Section: .text (0x3)
 // CHECK: }
diff --git a/llvm/test/MC/ELF/align-nops.s b/llvm/test/MC/ELF/align-nops.s
index 41a007111f406..5ce03d5a08aff 100644
--- a/llvm/test/MC/ELF/align-nops.s
+++ b/llvm/test/MC/ELF/align-nops.s
@@ -4,14 +4,14 @@
     .text
 f0:
     .long 0
-    .align  8, 0x00000090
+    .align  8
     .long 0
     .align  8
 
 // But not in another section
     .data
     .long 0
-    .align  8, 0x00000090
+    .align  8, 0x90
     .long 0
     .align  8
 
diff --git a/llvm/test/MC/ELF/layout-interdependency2.s b/llvm/test/MC/ELF/layout-interdependency2.s
new file mode 100644
index 0000000000000..24fafe5429f4e
--- /dev/null
+++ b/llvm/test/MC/ELF/layout-interdependency2.s
@@ -0,0 +1,84 @@
+## Contrived .zero directive example, simplified from the Linux kernel use case,
+## which requires multiple iterations to converge.
+## https://github.com/llvm/llvm-project/issues/100283
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+# CHECK:       80: jne 0x0 <.text>
+# CHECK-EMPTY:
+
+	.text
+.Ltmp0:
+.Ltmp1:
+	cli
+	popq	%rdi
+text1:
+	.zero	(.Ltmp2-.Ltmp3)
+	.section	"","ax",@progbits
+.Ltmp3:
+	movq	$0, %rax
+.Ltmp4:
+.Ltmp5:
+	.section	.discard.intra_function_calls,"ax",@progbits
+	.long	.Ltmp5
+	.section	"","ax",@progbits
+	callq	.Ltmp6
+	int3
+.Ltmp7:
+.Ltmp8:
+	.section	.discard.intra_function_calls,"ax",@progbits
+	.long	.Ltmp8
+	.section	"","ax",@progbits
+	callq	.Ltmp6
+	int3
+.Ltmp6:
+	addq	$0, %rsp
+	decq	%rax
+	jne	.Ltmp4
+	lfence
+	movq	$-1, %gs:pcpu_hot+6
+
+.Ltmp2:
+	.text
+text2:
+
+	.zero	(.Ltmp9-.Ltmp10)
+	.section	"","ax",@progbits
+.Ltmp10:
+	jmp	.Ltmp11
+.Ltmp9:
+	.text
+text3:
+
+.Ltmp12:
+	.zero	(.Ltmp13-.Ltmp14)
+	.section	"","ax",@progbits
+.Ltmp14:
+	callq	entry_untrain_ret
+.Ltmp13:
+	.text
+
+	.zero	(.Ltmp15-.Ltmp16)
+	.section	"","ax",@progbits
+.Ltmp16:
+	xorl	%eax, %eax
+	btsq	$63, %rax
+	movq	%rax, %gs:pcpu_hot+6
+
+.Ltmp15:
+	.text
+
+	popq	%r12
+	popq	rbp
+	jmp	__x86_return_thunk
+	movl	936(%rdi), %eax
+	cmpl	%gs:x86_spec_ctrl_current, %eax
+	je	.Ltmp0
+	movl	edx, %edx
+	wrmsr
+	jmp	.Ltmp0
+.Ltmp11:
+	movl	$72, %ecx
+	jmp	.Ltmp12
+	cmpb	$0, kvm_rebooting
+	jne	.Ltmp1
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
index 958d5cab6f2f3..08a131d4d43f9 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
@@ -99,11 +99,13 @@ jirl $a0, $a0, 0x20000
 # CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %b16) or an integer in the range [-131072, 131068]
 
 ## simm20
-pcaddi $a0, -0x80001
-# CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-524288, 524287]
 pcaddu12i $a0, 0x80000
 # CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
 
+## simm20_pcaddi
+pcaddi $a0, -0x80001
+# CHECK: :[[#@LINE-1]]:13: error: operand must be a symbol with modifier (e.g. %pcrel_20) or an integer in the range [-524288, 524287]
+
 ## simm20_lu12iw
 lu12i.w $a0, -0x80001
 # CHECK: :[[#@LINE-1]]:14: error: operand must be a symbol with modifier (e.g. %abs_hi20) or an integer in the range [-524288, 524287]
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index 87df59978c6ea..091dce200b7de 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -273,3 +273,38 @@ lu52i.d $t1, $t1, %desc64_hi12(foo)
 # RELOC: R_LARCH_TLS_DESC64_HI12 foo 0x0
 # INSTR: lu52i.d $t1, $t1, %desc64_hi12(foo)
 # FIXUP: fixup A - offset: 0, value: %desc64_hi12(foo), kind: FK_NONE
+
+lu12i.w $t1, %le_hi20_r(foo)
+# RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0
+# INSTR: lu12i.w $t1, %le_hi20_r(foo)
+# FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE
+
+add.d $t1, $t2, $tp, %le_add_r(foo)
+# RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0
+# INSTR: add.d $t1, $t2, $tp, %le_add_r(foo)
+# FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE
+
+addi.d $t1, $a2, %le_lo12_r(foo)
+# RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0
+# INSTR: addi.d $t1, $a2, %le_lo12_r(foo)
+# FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE
+
+pcaddi $t1, %pcrel_20(foo)
+# RELOC: R_LARCH_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %ld_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_LD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %ld_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %ld_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %gd_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_GD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %gd_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %gd_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %desc_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_DESC_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %desc_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pcrel_20(foo), kind: FK_NONE
diff --git a/llvm/test/MC/MachO/x86_32-optimal_nop.s b/llvm/test/MC/MachO/x86_32-optimal_nop.s
index bb784b9da9954..19655b5405152 100644
--- a/llvm/test/MC/MachO/x86_32-optimal_nop.s
+++ b/llvm/test/MC/MachO/x86_32-optimal_nop.s
@@ -5,7 +5,7 @@
         ret
         # nop
         # 0x90
-        .align 1, 0x90
+        .align 1
         ret
 # 2 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -13,14 +13,14 @@
         ret
         # xchg %ax,%ax
         # 0x66, 0x90
-        .align 2, 0x90
+        .align 2
         ret
 # 3 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
         ret
         # nopl (%[re]ax)
         # 0x0f, 0x1f, 0x00
-        .align 2, 0x90
+        .align 2
         ret
 # 4 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -30,7 +30,7 @@
         ret
         # nopl 0(%[re]ax)
         # 0x0f, 0x1f, 0x40, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 5 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -39,7 +39,7 @@
         ret
         # nopl 0(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 6 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -47,14 +47,14 @@
         ret
         # nopw 0(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 7 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
         ret
         # nopl 0L(%[re]ax)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 8 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -68,7 +68,7 @@
         ret
         # nopl 0L(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 9 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -81,7 +81,7 @@
         ret
         # nopw 0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 10 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -94,7 +94,7 @@
         ret
         # nopw %cs:0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 11 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -105,7 +105,7 @@
         ret
         # nopw %cs:0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 12 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -117,7 +117,7 @@
         # nopw 0(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 13 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -128,7 +128,7 @@
         # nopl 0L(%[re]ax)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 14 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -138,7 +138,7 @@
         # nopl 0L(%[re]ax)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 15 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -147,7 +147,7 @@
         # nopl 0L(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
         # 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 
         # Only the .text sections gets optimal nops.
diff --git a/llvm/test/MC/RISCV/align-fill-byte-zero.s b/llvm/test/MC/RISCV/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..cad881ea4e82c
--- /dev/null
+++ b/llvm/test/MC/RISCV/align-fill-byte-zero.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple riscv32 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple riscv32 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// RUN: llvm-mc -triple riscv64 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple riscv64 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM: addi     a0, a0, 1
+// OBJ: 00150513      addi     a0, a0, 0x1
+  addi a0, a0, 0x1
+
+// ASM: .p2align 4, 0x0
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+  .balign 0x10, 0
+
+// ASM: addi     a0, a0, 1
+// OBJ-NEXT: 00150513      addi     a0, a0, 0x1
+  addi a0, a0, 0x1
diff --git a/llvm/test/MC/RISCV/insn_c.s b/llvm/test/MC/RISCV/insn_c.s
index 19169e8b08c94..c63e8ab33aef9 100644
--- a/llvm/test/MC/RISCV/insn_c.s
+++ b/llvm/test/MC/RISCV/insn_c.s
@@ -31,6 +31,16 @@ target:
 # CHECK-OBJ: c.addi a0, 0xd
 .insn ci C1, 0, a0, 13
 
+# CHECK-ASM: .insn ci  1, 0, a6, 13
+# CHECK-ASM: encoding: [0x35,0x08]
+# CHECK-OBJ: c.addi a6, 0xd
+.insn ci  1, 0, a6, 13
+
+# CHECK-ASM: .insn ci  1, 0, a6, 13
+# CHECK-ASM: encoding: [0x35,0x08]
+# CHECK-OBJ: c.addi a6, 0xd
+.insn ci C1, 0, a6, 13
+
 # CHECK-ASM: .insn ciw  0, 0, a0, 13
 # CHECK-ASM: encoding: [0xa8,0x01]
 # CHECK-OBJ: c.addi4spn a0, sp, 0xc8
diff --git a/llvm/test/MC/RISCV/mapping-across-sections.s b/llvm/test/MC/RISCV/mapping-across-sections.s
index 700e86a0eb70e..11b05a616bdcf 100644
--- a/llvm/test/MC/RISCV/mapping-across-sections.s
+++ b/llvm/test/MC/RISCV/mapping-across-sections.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc -triple=riscv32 -filetype=obj < %s | llvm-readelf -Ss - | FileCheck %s
-# RUN: llvm-mc -triple=riscv64 -filetype=obj < %s | llvm-readelf -Ss - | FileCheck %s
+# RUN: llvm-mc -triple=riscv32 -filetype=obj %s | llvm-readelf -Ss - | FileCheck %s
+# RUN: llvm-mc -triple=riscv64 -filetype=obj %s | llvm-readelf -Ss - | FileCheck %s
 
         .text
         nop
@@ -28,6 +28,6 @@
 # CHECK: [[#STARTS_DATA:]]] .starts_data
 
 # CHECK:    Value  Size Type    Bind   Vis     Ndx              Name
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#TEXT]]        $x
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#WIBBLE]]      $x
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#STARTS_DATA]] $d
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#TEXT]]        $x{{$}}
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#WIBBLE]]      $x{{$}}
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#STARTS_DATA]] $d{{$}}
diff --git a/llvm/test/MC/RISCV/rv32zacas-invalid.s b/llvm/test/MC/RISCV/rv32zacas-invalid.s
index 66f939d139a12..bad2edcaaa915 100644
--- a/llvm/test/MC/RISCV/rv32zacas-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zacas-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+a,zacas < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple riscv32 -mattr=+a,+experimental-zacas < %s 2>&1 | FileCheck %s
 
 # Non-zero offsets not supported for the third operand (rs1).
 amocas.w a1, a3, 1(a5) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0
diff --git a/llvm/test/MC/RISCV/rv32zacas-valid.s b/llvm/test/MC/RISCV/rv32zacas-valid.s
index 0e76f02399483..8ba2b02542bc0 100644
--- a/llvm/test/MC/RISCV/rv32zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv32zacas-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+experimental-zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv32 -mattr=+a -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rv64zacas-invalid.s b/llvm/test/MC/RISCV/rv64zacas-invalid.s
index e6a4e4007e978..854e6fe308b0a 100644
--- a/llvm/test/MC/RISCV/rv64zacas-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zacas < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple riscv64 -mattr=+a,+experimental-zacas < %s 2>&1 | FileCheck %s
 
 # Non-zero offsets not supported for the third operand (rs1).
 amocas.w a1, a3, 1(a5) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0
diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s
index c6bf1252fa40d..d5044a0e0671d 100644
--- a/llvm/test/MC/RISCV/rv64zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-valid.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+experimental-zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv64 -mattr=+a -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
index 97afb9d6563e5..f8aa6867aedc6 100644
--- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+experimental-zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+experimental-zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+experimental-zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zabha,+experimental-zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zabha,+zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zabha,+experimental-zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zabha,+experimental-zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv32 -mattr=+a,+zabha -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/Sparc/elf-sparc-machine-type.s b/llvm/test/MC/Sparc/elf-sparc-machine-type.s
index 630812394560c..85edec5dbaea5 100644
--- a/llvm/test/MC/Sparc/elf-sparc-machine-type.s
+++ b/llvm/test/MC/Sparc/elf-sparc-machine-type.s
@@ -1,11 +1,11 @@
 ## Emit correct machine type depending on triple and cpu options.
 ## - `-triple sparc` emits an object of type EM_SPARC;
-## - `-triple sparc -mcpu=v9` emits EM_SPARC32PLUS; and
+## - `-triple sparc -mattr=+v8plus` emits EM_SPARC32PLUS; and
 ## - `-triple sparcv9` emits EM_SPARCV9.
 
-# RUN: llvm-mc -filetype=obj -triple sparc            %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARC       %s
-# RUN: llvm-mc -filetype=obj -triple sparc   -mcpu=v9 %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARC32PLUS %s
-# RUN: llvm-mc -filetype=obj -triple sparcv9          %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARCV9     %s
+# RUN: llvm-mc -filetype=obj -triple sparc                  %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARC       %s
+# RUN: llvm-mc -filetype=obj -triple sparc   -mattr=+v8plus %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARC32PLUS %s
+# RUN: llvm-mc -filetype=obj -triple sparcv9                %s -o - | llvm-readobj -h - | FileCheck --check-prefixes=SPARCV9     %s
 
 # SPARC:       Machine: EM_SPARC (0x2)
 # SPARC32PLUS: Machine: EM_SPARC32PLUS (0x12)
diff --git a/llvm/test/MC/SystemZ/insn-bad.s b/llvm/test/MC/SystemZ/insn-bad.s
index 6f94731fa0871..f81278610c73a 100644
--- a/llvm/test/MC/SystemZ/insn-bad.s
+++ b/llvm/test/MC/SystemZ/insn-bad.s
@@ -4176,12 +4176,9 @@
 #CHECK: may	%f0, %f0, -1
 #CHECK: error: invalid operand
 #CHECK: may	%f0, %f0, 4096
-#CHECK: error: invalid register pair
-#CHECK: may	%f2, %f0, 0
 
 	may	%f0, %f0, -1
 	may	%f0, %f0, 4096
-	may	%f2, %f0, 0
 
 #CHECK: error: invalid operand
 #CHECK: mayh	%f0, %f0, -1
@@ -4199,11 +4196,6 @@
 	mayl	%f0, %f0, -1
 	mayl	%f0, %f0, 4096
 
-#CHECK: error: invalid register pair
-#CHECK: mayr	%f2, %f0, %f0
-
-	mayr	%f2, %f0, %f0
-
 #CHECK: error: invalid operand
 #CHECK: mc	-1, 0
 #CHECK: error: invalid operand
diff --git a/llvm/test/MC/SystemZ/insn-good.s b/llvm/test/MC/SystemZ/insn-good.s
index 9fcb8a42cd73c..553c1b281eb4d 100644
--- a/llvm/test/MC/SystemZ/insn-good.s
+++ b/llvm/test/MC/SystemZ/insn-good.s
@@ -11526,6 +11526,7 @@
 #CHECK: may	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3a]
 #CHECK: may	%f13, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xd0,0x3a]
 #CHECK: may	%f13, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xd0,0x3a]
+#CHECK: may	%f2, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x20,0x3a]
 
 	may	%f0, %f0, 0
 	may	%f0, %f0, 4095
@@ -11536,6 +11537,7 @@
 	may	%f0, %f15, 0
 	may	%f13, %f0, 0
 	may	%f13, %f15, 0
+	may	%f2, %f0, 0
 
 #CHECK: mayh	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3c]
 #CHECK: mayh	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3c]
@@ -11611,6 +11613,7 @@
 #CHECK: mayr	%f13, %f0, %f0          # encoding: [0xb3,0x3a,0xd0,0x00]
 #CHECK: mayr	%f5, %f8, %f9           # encoding: [0xb3,0x3a,0x50,0x89]
 #CHECK: mayr	%f13, %f15, %f15        # encoding: [0xb3,0x3a,0xd0,0xff]
+#CHECK: mayr	%f2, %f0, %f0           # encoding: [0xb3,0x3a,0x20,0x00]
 
 	mayr	%f0, %f0, %f0
 	mayr	%f0, %f0, %f15
@@ -11618,6 +11621,7 @@
 	mayr	%f13, %f0, %f0
 	mayr	%f5, %f8, %f9
 	mayr	%f13, %f15, %f15
+	mayr	%f2, %f0, %f0
 
 #CHECK: mc	0, 0                    # encoding: [0xaf,0x00,0x00,0x00]
 #CHECK: mc	4095, 0                 # encoding: [0xaf,0x00,0x0f,0xff]
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 8c3483bfaad7a..7ae4d47d888cf 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -851,6 +851,9 @@ main:
     # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
     f16x8.extract_lane 1
 
+    # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01]
+    f16x8.replace_lane 1
+
     # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02]
     f16x8.add
 
diff --git a/llvm/test/MC/X86/apx/ccmp-att.s b/llvm/test/MC/X86/apx/ccmp-att.s
index 405071b0f4310..32e6f371600bc 100644
--- a/llvm/test/MC/X86/apx/ccmp-att.s
+++ b/llvm/test/MC/X86/apx/ccmp-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-402: error:
+# ERROR-COUNT-428: error:
 # ERROR-NOT: error:
 ## Condition flags
 
@@ -1217,3 +1217,84 @@
 # CHECK: ccmpoq {dfv=of,sf,zf,cf} %rax, %rbx
 # CHECK: encoding: [0x62,0xf4,0xfc,0x00,0x39,0xc3]
          ccmpoq {dFV=Cf,zF,SF,of} %rax, %rbx
+
+## "{evex} cmp*" are alias for "ccmpt* {dfv=}"
+
+# CHECK: ccmptb	{dfv=}	$123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x80,0x7c,0x80,0x7b,0x7b]
+         {evex} cmpb	$123, 123(%r8,%rax,4)
+# CHECK: ccmptw	{dfv=}	$123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmpw	$123, 123(%r8,%rax,4)
+# CHECK: ccmptw	{dfv=}	$1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x81,0x7c,0x80,0x7b,0xd2,0x04]
+         {evex} cmpw	$1234, 123(%r8,%rax,4)
+# CHECK: ccmptl	{dfv=}	$123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmpl	$123, 123(%r8,%rax,4)
+# CHECK: ccmptl	{dfv=}	$123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} cmpl	$123456, 123(%r8,%rax,4)
+# CHECK: ccmptq	{dfv=}	$123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmpq	$123, 123(%r8,%rax,4)
+# CHECK: ccmptq	{dfv=}	$123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} cmpq	$123456, 123(%r8,%rax,4)
+# CHECK: ccmptb	{dfv=}	%bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x38,0x5c,0x80,0x7b]
+         {evex} cmpb	%bl, 123(%r8,%rax,4)
+# CHECK: ccmptw	{dfv=}	%dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x39,0x54,0x80,0x7b]
+         {evex} cmpw	%dx, 123(%r8,%rax,4)
+# CHECK: ccmptl	{dfv=}	%ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x39,0x4c,0x80,0x7b]
+         {evex} cmpl	%ecx, 123(%r8,%rax,4)
+# CHECK: ccmptq	{dfv=}	%r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0x4c,0x80,0x7b]
+         {evex} cmpq	%r9, 123(%r8,%rax,4)
+# CHECK: ccmptb	{dfv=}	123(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3a,0x5c,0x80,0x7b]
+         {evex} cmpb	123(%r8,%rax,4), %bl
+# CHECK: ccmptw	{dfv=}	123(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x3b,0x54,0x80,0x7b]
+         {evex} cmpw	123(%r8,%rax,4), %dx
+# CHECK: ccmptl	{dfv=}	123(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3b,0x4c,0x80,0x7b]
+         {evex} cmpl	123(%r8,%rax,4), %ecx
+# CHECK: ccmptq	{dfv=}	123(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x3b,0x4c,0x80,0x7b]
+         {evex} cmpq	123(%r8,%rax,4), %r9
+# CHECK: ccmptb	{dfv=}	$123, %bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x80,0xfb,0x7b]
+         {evex} cmpb	$123, %bl
+# CHECK: ccmptw	{dfv=}	$123, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x83,0xfa,0x7b]
+         {evex} cmpw	$123, %dx
+# CHECK: ccmptl	{dfv=}	$123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x83,0xf9,0x7b]
+         {evex} cmpl	$123, %ecx
+# CHECK: ccmptq	{dfv=}	$123, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0xf9,0x7b]
+         {evex} cmpq	$123, %r9
+# CHECK: ccmptw	{dfv=}	$1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x81,0xfa,0xd2,0x04]
+         {evex} cmpw	$1234, %dx
+# CHECK: ccmptl	{dfv=}	$123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+         {evex} cmpl	$123456, %ecx
+# CHECK: ccmptq	{dfv=}	$123456, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+         {evex} cmpq	$123456, %r9
+# CHECK: ccmptb	{dfv=}	%bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x38,0xda]
+         {evex} cmpb	%bl, %dl
+# CHECK: ccmptw	{dfv=}	%dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x39,0xd0]
+         {evex} cmpw	%dx, %ax
+# CHECK: ccmptl	{dfv=}	%ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x39,0xca]
+         {evex} cmpl	%ecx, %edx
+# CHECK: ccmptq	{dfv=}	%r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
+         {evex} cmpq	%r9, %r15
diff --git a/llvm/test/MC/X86/apx/ccmp-intel.s b/llvm/test/MC/X86/apx/ccmp-intel.s
index 2d446b053d94e..5e64b63228cb8 100644
--- a/llvm/test/MC/X86/apx/ccmp-intel.s
+++ b/llvm/test/MC/X86/apx/ccmp-intel.s
@@ -1214,3 +1214,84 @@
 # CHECK: ccmpo {dfv=of,sf,zf,cf} rbx, rax
 # CHECK: encoding: [0x62,0xf4,0xfc,0x00,0x39,0xc3]
          ccmpo {DFv=Cf,zF,SF,of} rbx, rax
+
+## "{evex} cmp*" are alias for "ccmpt* {dfv=}"
+
+# CHECK: ccmpt	{dfv=}	byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x80,0x7c,0x80,0x7b,0x7b]
+         {evex} cmp	byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt	{dfv=}	word ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmp	word ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt	{dfv=}	word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x81,0x7c,0x80,0x7b,0xd2,0x04]
+         {evex} cmp	word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ccmpt	{dfv=}	dword ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmp	dword ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt	{dfv=}	dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} cmp	dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ccmpt	{dfv=}	qword ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+         {evex} cmp	qword ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt	{dfv=}	qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} cmp	qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ccmpt	{dfv=}	byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x38,0x5c,0x80,0x7b]
+         {evex} cmp	byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ccmpt	{dfv=}	word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x39,0x54,0x80,0x7b]
+         {evex} cmp	word ptr [r8 + 4*rax + 123], dx
+# CHECK: ccmpt	{dfv=}	dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x39,0x4c,0x80,0x7b]
+         {evex} cmp	dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ccmpt	{dfv=}	qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0x4c,0x80,0x7b]
+         {evex} cmp	qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ccmpt	{dfv=}	bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3a,0x5c,0x80,0x7b]
+         {evex} cmp	bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt	{dfv=}	dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x3b,0x54,0x80,0x7b]
+         {evex} cmp	dx, word ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt	{dfv=}	ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3b,0x4c,0x80,0x7b]
+         {evex} cmp	ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt	{dfv=}	r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x3b,0x4c,0x80,0x7b]
+         {evex} cmp	r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt	{dfv=}	bl, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x80,0xfb,0x7b]
+         {evex} cmp	bl, 123
+# CHECK: ccmpt	{dfv=}	dx, 123
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x83,0xfa,0x7b]
+         {evex} cmp	dx, 123
+# CHECK: ccmpt	{dfv=}	ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x83,0xf9,0x7b]
+         {evex} cmp	ecx, 123
+# CHECK: ccmpt	{dfv=}	r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0xf9,0x7b]
+         {evex} cmp	r9, 123
+# CHECK: ccmpt	{dfv=}	dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x81,0xfa,0xd2,0x04]
+         {evex} cmp	dx, 1234
+# CHECK: ccmpt	{dfv=}	ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+         {evex} cmp	ecx, 123456
+# CHECK: ccmpt	{dfv=}	r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+         {evex} cmp	r9, 123456
+# CHECK: ccmpt	{dfv=}	dl, bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x38,0xda]
+         {evex} cmp	dl, bl
+# CHECK: ccmpt	{dfv=}	ax, dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x39,0xd0]
+         {evex} cmp	ax, dx
+# CHECK: ccmpt	{dfv=}	edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x39,0xca]
+         {evex} cmp	edx, ecx
+# CHECK: ccmpt	{dfv=}	r15, r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
+         {evex} cmp	r15, r9
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
index b9e98adc9841b..4cb928748a1d2 100644
--- a/llvm/test/MC/X86/apx/ctest-att.s
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-256: error:
+# ERROR-COUNT-276: error:
 # ERROR-NOT: error:
 # CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
 # CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
@@ -27,6 +27,19 @@
 # CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
 # CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
          ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# Swap mr form
+# CHECK: ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestbb {dfv=of} 123(%r8,%rax,4), %bl
+# CHECK: ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestbw {dfv=of} 123(%r8,%rax,4), %dx
+# CHECK: ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbl {dfv=of} 123(%r8,%rax,4), %ecx
+# CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbq {dfv=of} 123(%r8,%rax,4), %r9
 # CHECK: ctestbb {dfv=of} $123, %bl
 # CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
          ctestbb {dfv=of} $123, %bl
@@ -771,3 +784,54 @@
 # CHECK: ctesteq {dfv=of} %r9, %r15
 # CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
          ctesteq {dfv=of} %r9, %r15
+
+## "{evex} test*" are alias for "ctestt* {dfv=}"
+
+# CHECK: ctesttb	{dfv=}	$123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         {evex} testb	$123, 123(%r8,%rax,4)
+# CHECK: ctesttw	{dfv=}	$1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         {evex} testw	$1234, 123(%r8,%rax,4)
+# CHECK: ctesttl	{dfv=}	$123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} testl	$123456, 123(%r8,%rax,4)
+# CHECK: ctesttq	{dfv=}	$123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} testq	$123456, 123(%r8,%rax,4)
+# CHECK: ctesttb	{dfv=}	%bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x84,0x5c,0x80,0x7b]
+         {evex} testb	%bl, 123(%r8,%rax,4)
+# CHECK: ctesttw	{dfv=}	%dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x85,0x54,0x80,0x7b]
+         {evex} testw	%dx, 123(%r8,%rax,4)
+# CHECK: ctesttl	{dfv=}	%ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x85,0x4c,0x80,0x7b]
+         {evex} testl	%ecx, 123(%r8,%rax,4)
+# CHECK: ctesttq	{dfv=}	%r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0x4c,0x80,0x7b]
+         {evex} testq	%r9, 123(%r8,%rax,4)
+# CHECK: ctesttb	{dfv=}	$123, %bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf6,0xc3,0x7b]
+         {evex} testb	$123, %bl
+# CHECK: ctesttw	{dfv=}	$1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0xf7,0xc2,0xd2,0x04]
+         {evex} testw	$1234, %dx
+# CHECK: ctesttl	{dfv=}	$123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         {evex} testl	$123456, %ecx
+# CHECK: ctesttq	{dfv=}	$123456, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         {evex} testq	$123456, %r9
+# CHECK: ctesttb	{dfv=}	%bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x84,0xda]
+         {evex} testb	%bl, %dl
+# CHECK: ctesttw	{dfv=}	%dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x85,0xd0]
+         {evex} testw	%dx, %ax
+# CHECK: ctesttl	{dfv=}	%ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x85,0xca]
+         {evex} testl	%ecx, %edx
+# CHECK: ctesttq	{dfv=}	%r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
+         {evex} testq	%r9, %r15
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
index 17cea489b4765..701c517e27a79 100644
--- a/llvm/test/MC/X86/apx/ctest-intel.s
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -24,6 +24,18 @@
 # CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
 # CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
          ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestb {dfv=of} bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestb {dfv=of} dx, word ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} r9, qword ptr [r8 + 4*rax + 123]
 # CHECK: ctestb {dfv=of} bl, 123
 # CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
          ctestb {dfv=of} bl, 123
@@ -768,3 +780,54 @@
 # CHECK: cteste {dfv=of} r15, r9
 # CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
          cteste {dfv=of} r15, r9
+
+## "{evex} test*" are alias for "ctestt* {dfv=}"
+
+# CHECK: ctestt	{dfv=}	byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         {evex} test	byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestt	{dfv=}	word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         {evex} test	word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestt	{dfv=}	dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} test	dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt	{dfv=}	qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         {evex} test	qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt	{dfv=}	byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x84,0x5c,0x80,0x7b]
+         {evex} test	byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestt	{dfv=}	word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x85,0x54,0x80,0x7b]
+         {evex} test	word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestt	{dfv=}	dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x85,0x4c,0x80,0x7b]
+         {evex} test	dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestt	{dfv=}	qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0x4c,0x80,0x7b]
+         {evex} test	qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestt	{dfv=}	bl, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf6,0xc3,0x7b]
+         {evex} test	bl, 123
+# CHECK: ctestt	{dfv=}	dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0xf7,0xc2,0xd2,0x04]
+         {evex} test	dx, 1234
+# CHECK: ctestt	{dfv=}	ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         {evex} test	ecx, 123456
+# CHECK: ctestt	{dfv=}	r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         {evex} test	r9, 123456
+# CHECK: ctestt	{dfv=}	dl, bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x84,0xda]
+         {evex} test	dl, bl
+# CHECK: ctestt	{dfv=}	ax, dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x85,0xd0]
+         {evex} test	ax, dx
+# CHECK: ctestt	{dfv=}	edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x85,0xca]
+         {evex} test	edx, ecx
+# CHECK: ctestt	{dfv=}	r15, r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
+         {evex} test	r15, r9
diff --git a/llvm/test/MC/X86/code16gcc-align.s b/llvm/test/MC/X86/code16gcc-align.s
index 657364a604275..107986a9ec70e 100644
--- a/llvm/test/MC/X86/code16gcc-align.s
+++ b/llvm/test/MC/X86/code16gcc-align.s
@@ -21,19 +21,19 @@
 	.text
 	.code16gcc
 	.globl	test
-	.p2align	4, 0x90
+	.p2align	4
 	.type	test,@function
 test:
 	.nops	34
 	movl	%eax, %edi
 	xorl	%ebx, %ebx
-	.p2align	4, 0x90
+	.p2align	4
 	movzbl	(%esi,%ebx), %ecx
 	calll	called
 	.nops	3
 	retl
 
-	.p2align	4, 0x90
+	.p2align	4
 	.type	called,@function
 called:
 	.nops	1
diff --git a/llvm/test/MC/X86/x86-32-coverage.s b/llvm/test/MC/X86/x86-32-coverage.s
index fbe2714aed263..5475946a9d216 100644
--- a/llvm/test/MC/X86/x86-32-coverage.s
+++ b/llvm/test/MC/X86/x86-32-coverage.s
@@ -10790,7 +10790,7 @@ btcl $4, (%eax)
           movdir64b 485498096, %ecx
 
 // CHECK: movdir64b 485498096, %cx
-// CHECK: # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x0d,0xf0,0x1c,0xf0,0x1c]
+// CHECK: # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x0e,0xf0,0x1c]
           movdir64b 485498096, %cx
 
 // CHECK: movdir64b (%edx), %eax
@@ -10877,6 +10877,10 @@ enqcmd  (%bx,%di), %di
 // CHECK: encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x81,0xc0,0x1f]
 enqcmd  8128(%bx,%di), %ax
 
+// CHECK: enqcmd 485498096, %cx
+// CHECK: encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x0e,0xf0,0x1c]
+enqcmd 485498096, %cx
+
 // CHECK: enqcmds (%bx,%di), %di
 // CHECK: encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x39]
 enqcmds (%bx,%di), %di
@@ -10885,6 +10889,10 @@ enqcmds (%bx,%di), %di
 // CHECK: encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x81,0xc0,0x1f]
 enqcmds 8128(%bx,%di), %ax
 
+// CHECK: enqcmds 485498096, %cx
+// CHECK: encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x0e,0xf0,0x1c]
+enqcmds 485498096, %cx
+
 // CHECK: serialize
 // CHECK: encoding: [0x0f,0x01,0xe8]
 serialize
diff --git a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
index 990e2810d851e..2e9da9f2b5b77 100644
--- a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
+++ b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
@@ -11,7 +11,7 @@ define void @f_Ym(i64 %m.coerce) {
 ; CHECK:         ## InlineAsm End
 
 entry:
-  %0 = tail call x86_mmx asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"() 
+  %0 = tail call <1 x i64> asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"() 
   ret void
 }
 
diff --git a/llvm/test/MachineVerifier/test_uscmp.mir b/llvm/test/MachineVerifier/test_uscmp.mir
new file mode 100644
index 0000000000000..aa686c4ec73e6
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_uscmp.mir
@@ -0,0 +1,31 @@
+# RUN: not --crash llc -verify-machineinstrs -run-pass none -mtriple=arm64 -o /dev/null %s 2>&1 | FileCheck %s
+# REQUIRES: aarch64-registered-target
+
+---
+name:            test_uscmp
+body:             |
+  bb.0:
+
+    %2:_(p0) = G_IMPLICIT_DEF
+    %3:_(p0) = G_IMPLICIT_DEF
+    ; CHECK: Generic scmp/ucmp does not support pointers as operands
+    %4:_(s1) = G_SCMP %2, %3
+
+    %12:_(s64) = G_IMPLICIT_DEF
+    %13:_(s64) = G_IMPLICIT_DEF
+    ; CHECK: Generic scmp/ucmp does not support pointers as a result
+    %14:_(p0) = G_SCMP %12, %13
+
+    %23:_(<2 x s32>) = G_IMPLICIT_DEF
+    %24:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK: Generic vector scmp/ucmp must preserve number of lanes
+    %5:_(s1) = G_UCMP  %23, %24
+
+    %15:_(s32) = G_CONSTANT i32 0
+    %16:_(s64) = G_CONSTANT i64 2
+    ; CHECK: Generic scmp/ucmp must have same input types
+    %17:_(s1) = G_SCMP %15, %16
+
+
+
+...
diff --git a/llvm/test/Object/ARM/nm-mapping-symbol.s b/llvm/test/Object/ARM/nm-mapping-symbol.s
deleted file mode 100644
index 6b41876c35e5e..0000000000000
--- a/llvm/test/Object/ARM/nm-mapping-symbol.s
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: llvm-mc %s -o %t.o -filetype=obj -triple=armv7-pc-linux
-// RUN: llvm-readobj --symbols %t.o | FileCheck %s
-// RUN: llvm-nm %t.o | FileCheck -allow-empty --check-prefix=NM %s
-
-// Test that nm doesn't print the mapping symbols
-
-// CHECK: Name: $d.0
-// NM-NOT: $d.0
-
-        .section        .foobar,"",%progbits
-        .asciz  "foo"
-        nop
diff --git a/llvm/test/Object/archive-malformed-object.test b/llvm/test/Object/archive-malformed-object.test
index a92762975bda6..021df7ed873d1 100644
--- a/llvm/test/Object/archive-malformed-object.test
+++ b/llvm/test/Object/archive-malformed-object.test
@@ -1,27 +1,51 @@
 ## Show that the archive library emits error messages when adding malformed
-## objects.
+## object files and skips symbol tables for "malformed" bitcode files, which
+## are assumed to be bitcode files generated by compilers from the future.
 
 # RUN: rm -rf %t.dir
 # RUN: split-file %s %t.dir
 # RUN: cd %t.dir
 
-## Malformed bitcode object is the first file member of archive if the symbol table is required.
+## Create a malformed bitcode object.
 # RUN: llvm-as input.ll -o input.bc
 # RUN: cp input.bc good.bc
 # RUN: %python -c "with open('input.bc', 'a') as f: f.truncate(10)"
-# RUN: not llvm-ar rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
 
-## Malformed bitcode object is the last file member of archive if the symbol table is required.
+## Malformed bitcode objects either warn or error depending on the archive format
+## (see switch in getSymbolicFile). If the archive was created with a warning,
+## we want to check that the archive map is empty. llvm-nm will fail when it
+## tries to read the malformed bitcode file, but it's supposed to print the
+## archive map first, which in this case it won't because there won't be one.
 # RUN: rm -rf bad.a
-# RUN: not llvm-ar rc bad.a good.bc input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
+# RUN: llvm-ar --format=bsd rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=WARN1
+# RUN: not llvm-nm --print-armap bad.a | count 0
+# RUN: rm -rf bad.a
+# RUN: llvm-ar --format=gnu rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=WARN1
+# RUN: not llvm-nm --print-armap bad.a | count 0
+# RUN: rm -rf bad.a
+# RUN: not llvm-ar --format=bigarchive rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
+# RUN: rm -rf bad.a
+# RUN: not llvm-ar --format=coff rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
+# RUN: rm -rf bad.a
+# RUN: not llvm-ar --format=darwin rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
+
+## Malformed bitcode object is the last file member of archive and
+## the symbol table is required. In this case we check that the
+## symbol table contains entries for the good object only.
+# RUN: rm -rf bad.a
+# RUN: llvm-ar rc bad.a good.bc input.bc 2>&1 | FileCheck %s --check-prefix=WARN1
+# RUN: not llvm-nm --print-armap bad.a | FileCheck %s --check-prefix=ARMAP
 
 ## Malformed bitcode object if the symbol table is not required for big archive.
+## For big archives we print an error instead of a warning because the AIX linker
+## presumably requires the index.
 # RUN: rm -rf bad.a
 # RUN: not llvm-ar --format=bigarchive rcS bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
 # RUN: rm -rf bad.a
 # RUN: not llvm-ar --format=bigarchive rcS bad.a good.bc input.bc 2>&1 | FileCheck %s --check-prefix=ERR1
 
 # ERR1: error: bad.a: 'input.bc': Invalid bitcode signature
+# WARN1: warning: 'input.bc': Invalid bitcode signature
 
 ## Non-bitcode malformed file.
 # RUN: yaml2obj input.yaml -o input.o
@@ -29,17 +53,23 @@
 
 # ERR2: error: bad.a: 'input.o': section header table goes past the end of the file: e_shoff = 0x9999
 
-## Don't emit an error if the symbol table is not required for formats other than the big archive format.
-# RUN: llvm-ar --format=gnu rcS good.a input.o input.bc
+## Don't emit an error or warning if the symbol table is not required for formats other than the big archive format.
+# RUN: llvm-ar --format=gnu rcS good.a input.o input.bc 2>&1 | count 0
 # RUN: llvm-ar t good.a | FileCheck %s --check-prefix=CONTENTS
 
 # CONTENTS:      input.o
 # CONTENTS-NEXT: input.bc
 
+# ARMAP: Archive map
+# ARMAP-NEXT: foo in good.bc
+# ARMAP-EMPTY:
+
 #--- input.ll
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-linux"
 
+@foo = global i32 1
+
 #--- input.yaml
 --- !ELF
 FileHeader:
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index d451d2897f673..5543472df685b 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -69,6 +69,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
+; CHECK-O23SZ-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O23SZ-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O23SZ-NEXT: Running analysis: InlineAdvisorAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: InlinerPass
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index a524c9991f1bf..f2e80814f347a 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -49,8 +49,8 @@
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print<stack-lifetime><may>,print<stack-lifetime><must>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17
 ; CHECK-17: function(print<stack-lifetime><may>,print<stack-lifetime><must>)
 
-; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
-; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch>)
+; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
+; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
 
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>,loop-vectorize<interleave-forced-only;vectorize-forced-only>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19
 ; CHECK-19: function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,loop-vectorize<interleave-forced-only;vectorize-forced-only;>)
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 42ef49f8f7c7e..ab04f80abc572 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -184,12 +184,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index e74f88c1a3bf9..cb49cbd22d60c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -183,12 +183,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 0bb26330d000a..96e8349350442 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -148,12 +148,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
index 7bbde818082ce..0dcd0c2dd50a8 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
@@ -86,12 +86,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK:      const uint8_t *GenMyCombiner::getMatchTable() const {
 // CHECK-NEXT:   constexpr static uint8_t MatchTable0[] = {
 // CHECK-NEXT:     GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2([[#LOWER:]]), GIMT_Encode2([[#UPPER:]]), /*)*//*default:*//*Label 4*/ GIMT_Encode4([[#DEFAULT:]]),
-// CHECK-NEXT:     /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(410), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT:     /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(428), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT:     /*TargetOpcode::G_FNEG*//*Label 2*/ GIMT_Encode4(440), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT:     /*TargetOpcode::G_FABS*//*Label 3*/ GIMT_Encode4(452),
+// CHECK-NEXT:     /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(418), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT:     /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(436), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT:     /*TargetOpcode::G_FNEG*//*Label 2*/ GIMT_Encode4(448), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT:     /*TargetOpcode::G_FABS*//*Label 3*/ GIMT_Encode4(460),
 // CHECK-NEXT:     // Label 0: @[[#%u, mul(UPPER-LOWER, 4) + 10]]
-// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(427), // Rule ID 2 //
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(435), // Rule ID 2 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled),
 // CHECK-NEXT:       // MIs[0] x
 // CHECK-NEXT:       // No operand predicates
@@ -101,10 +101,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner1),
 // CHECK-NEXT:       // Combiner Rule #2: TwoMatchNoApply
 // CHECK-NEXT:       GIR_EraseRootFromParent_Done,
-// CHECK-NEXT:     // Label 5: @427
+// CHECK-NEXT:     // Label 5: @435
 // CHECK-NEXT:     GIM_Reject,
-// CHECK-NEXT:     // Label 1: @428
-// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(439), // Rule ID 3 //
+// CHECK-NEXT:     // Label 1: @436
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(447), // Rule ID 3 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled),
 // CHECK-NEXT:       // MIs[0] a
 // CHECK-NEXT:       // No operand predicates
@@ -112,10 +112,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       // No operand predicates
 // CHECK-NEXT:       // Combiner Rule #3: NoMatchTwoApply
 // CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2),
-// CHECK-NEXT:     // Label 6: @439
+// CHECK-NEXT:     // Label 6: @447
 // CHECK-NEXT:     GIM_Reject,
-// CHECK-NEXT:     // Label 2: @440
-// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(451), // Rule ID 1 //
+// CHECK-NEXT:     // Label 2: @448
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(459), // Rule ID 1 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled),
 // CHECK-NEXT:       // MIs[0] a
 // CHECK-NEXT:       // No operand predicates
@@ -123,10 +123,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       // No operand predicates
 // CHECK-NEXT:       // Combiner Rule #1: TwoMatchTwoApply
 // CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1),
-// CHECK-NEXT:     // Label 7: @451
+// CHECK-NEXT:     // Label 7: @459
 // CHECK-NEXT:     GIM_Reject,
-// CHECK-NEXT:     // Label 3: @452
-// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(463), // Rule ID 0 //
+// CHECK-NEXT:     // Label 3: @460
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(471), // Rule ID 0 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
 // CHECK-NEXT:       // MIs[0] a
 // CHECK-NEXT:       // No operand predicates
@@ -134,7 +134,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       // No operand predicates
 // CHECK-NEXT:       // Combiner Rule #0: OneMatchOneApply
 // CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0),
-// CHECK-NEXT:     // Label 8: @463
+// CHECK-NEXT:     // Label 8: @471
 // CHECK-NEXT:     GIM_Reject,
 // CHECK-NEXT:     // Label 4: @[[#%u, DEFAULT]]
 // CHECK-NEXT:     GIM_Reject,
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td
index 86ae031caecb5..55429f98564af 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td
@@ -28,19 +28,48 @@ def InstTest3 : GICombineRule<
   (match (G_UNMERGE_VALUES $a, $b, $c, $d)),
   (apply [{ APPLY }])>;
 
+def VariadicTypeTestCxx : GICombineRule<
+  (defs root:$a),
+  (match (G_BUILD_VECTOR $a, GIVariadic<2, 4>:$b)),
+  (apply [{ ${b} }])>;
+
+def VariadicTypeTestReuse : GICombineRule<
+  (defs root:$a),
+  (match (G_BUILD_VECTOR $a, $c, GIVariadic<2, 4>:$b)),
+  (apply (G_MERGE_VALUES $a, $b, $c))>;
+
 def MyCombiner: GICombiner<"GenMyCombiner", [
   InstTest0,
   InstTest1,
   InstTest2,
-  InstTest3
+  InstTest3,
+  VariadicTypeTestCxx,
+  VariadicTypeTestReuse
 ]>;
 
+// CHECK:     bool GenMyCombiner::runCustomAction(unsigned ApplyID, const MatcherState &State, NewMIVector &OutMIs) const {
+// CHECK-NEXT:   Helper.getBuilder().setInstrAndDebugLoc(*State.MIs[0]);
+// CHECK-NEXT:   switch(ApplyID) {
+// CHECK-NEXT:   case GICXXCustomAction_GICombiner0:{
+// CHECK-NEXT:     // Apply Patterns
+// CHECK-NEXT:     APPLY
+// CHECK-NEXT:     return true;
+// CHECK-NEXT:   }
+// CHECK-NEXT:   case GICXXCustomAction_GICombiner1:{
+// CHECK-NEXT:     // Apply Patterns
+// CHECK-NEXT:     getRemainingOperands(*State.MIs[0], 1)
+// CHECK-NEXT:     return true;
+// CHECK-NEXT:   }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   llvm_unreachable("Unknown Apply Action");
+// CHECK-NEXT: }
+
 // CHECK:      const uint8_t *GenMyCombiner::getMatchTable() const {
 // CHECK-NEXT:   constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT:     GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2([[#LOWER:]]), GIMT_Encode2([[#UPPER:]]), /*)*//*default:*//*Label 2*/ GIMT_Encode4([[#DEFAULT:]]),
+// CHECK-NEXT:     GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(70), GIMT_Encode2(74), /*)*//*default:*//*Label 2*/ GIMT_Encode4(127),
 // CHECK-NEXT:     /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(26), GIMT_Encode4(0), GIMT_Encode4(0),
 // CHECK-NEXT:     /*TargetOpcode::G_BUILD_VECTOR*//*Label 1*/ GIMT_Encode4(55),
-// CHECK-NEXT:     // Label 0: @[[#%u, mul(UPPER-LOWER, 4) + 10]]
+// CHECK-NEXT:     // Label 0: @26
 // CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(40), // Rule ID 2 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled),
 // CHECK-NEXT:       GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
@@ -77,7 +106,35 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       // Combiner Rule #1: InstTest1
 // CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0),
 // CHECK-NEXT:     // Label 5: @69
-// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(83), // Rule ID 0 //
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(86), // Rule ID 4 //
+// CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled),
+// CHECK-NEXT:       GIM_CheckNumOperandsGE, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT:       GIM_CheckNumOperandsLE, /*MI*/0, /*Expected*/5,
+// CHECK-NEXT:       // MIs[0] a
+// CHECK-NEXT:       // No operand predicates
+// CHECK-NEXT:       // MIs[0] b
+// CHECK-NEXT:       // No operand predicates
+// CHECK-NEXT:       // Combiner Rule #4: VariadicTypeTestCxx
+// CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1),
+// CHECK-NEXT:     // Label 6: @86
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(112), // Rule ID 5 //
+// CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled),
+// CHECK-NEXT:       GIM_CheckNumOperandsGE, /*MI*/0, /*Expected*/4,
+// CHECK-NEXT:       GIM_CheckNumOperandsLE, /*MI*/0, /*Expected*/6,
+// CHECK-NEXT:       // MIs[0] a
+// CHECK-NEXT:       // No operand predicates
+// CHECK-NEXT:       // MIs[0] c
+// CHECK-NEXT:       // No operand predicates
+// CHECK-NEXT:       // MIs[0] b
+// CHECK-NEXT:       // No operand predicates
+// CHECK-NEXT:       // Combiner Rule #5: VariadicTypeTestReuse
+// CHECK-NEXT:       GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(TargetOpcode::G_MERGE_VALUES),
+// CHECK-NEXT:       GIR_RootToRootCopy, /*OpIdx*/0, // a
+// CHECK-NEXT:       GIR_CopyRemaining, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // b
+// CHECK-NEXT:       GIR_RootToRootCopy, /*OpIdx*/1, // c
+// CHECK-NEXT:       GIR_EraseRootFromParent_Done,
+// CHECK-NEXT:     // Label 7: @112
+// CHECK-NEXT:     GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(126), // Rule ID 0 //
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
 // CHECK-NEXT:       GIM_CheckNumOperands, /*MI*/0, /*Expected*/4,
 // CHECK-NEXT:       // MIs[0] a
@@ -90,10 +147,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       // No operand predicates
 // CHECK-NEXT:       // Combiner Rule #0: InstTest0
 // CHECK-NEXT:       GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0),
-// CHECK-NEXT:     // Label 6: @83
+// CHECK-NEXT:     // Label 8: @126
 // CHECK-NEXT:     GIM_Reject,
-// CHECK-NEXT:     // Label 2: @[[#%u, DEFAULT]]
+// CHECK-NEXT:     // Label 2: @127
 // CHECK-NEXT:     GIM_Reject,
-// CHECK-NEXT:     }; // Size: [[#%u, DEFAULT + 1]] bytes
+// CHECK-NEXT:     }; // Size: 128 bytes
 // CHECK-NEXT:   return MatchTable0;
 // CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/operand-types.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/operand-types.td
index 4769bed972401..9b5598661c8de 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/operand-types.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/operand-types.td
@@ -104,8 +104,32 @@ def TypeOfProp : GICombineRule<
   (apply (G_ANYEXT $x, GITypeOf<"$y">:$tmp),
          (G_ANYEXT $tmp, $y))>;
 
+// CHECK:      (CombineRule name:VariadicTypeTest id:3 root:a
+// CHECK-NEXT:   (MatchPats
+// CHECK-NEXT:     <match_root>__VariadicTypeTest_match_0:(CodeGenInstructionPattern G_UNMERGE_VALUES operands:[<def>$a, <def>$b, GIVariadic<1,0>:$z])
+// CHECK-NEXT:   )
+// CHECK-NEXT:   (ApplyPats
+// CHECK-NEXT:     <apply_root>__VariadicTypeTest_apply_0:(CodeGenInstructionPattern G_UNMERGE_VALUES operands:[<def>$a, <def>$b, GIVariadic<1,0>:$z])
+// CHECK-NEXT:   )
+// CHECK-NEXT:   (OperandTable MatchPats
+// CHECK-NEXT:     a -> __VariadicTypeTest_match_0
+// CHECK-NEXT:     b -> __VariadicTypeTest_match_0
+// CHECK-NEXT:     z -> <live-in>
+// CHECK-NEXT:   )
+// CHECK-NEXT:   (OperandTable ApplyPats
+// CHECK-NEXT:     a -> __VariadicTypeTest_apply_0
+// CHECK-NEXT:     b -> __VariadicTypeTest_apply_0
+// CHECK-NEXT:     z -> <live-in>
+// CHECK-NEXT:   )
+// CHECK-NEXT: )
+def VariadicTypeTest: GICombineRule<
+  (defs root:$a),
+  (match (G_UNMERGE_VALUES $a, $b, GIVariadic<>:$z)),
+  (apply (G_UNMERGE_VALUES $a, $b, $z))>;
+
 def MyCombiner: GICombiner<"GenMyCombiner", [
   InstTest0,
   PatFragTest0,
-  TypeOfProp
+  TypeOfProp,
+  VariadicTypeTest,
 ]>;
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/typeof-errors.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/typeof-errors.td
index ee7b8f5f3a39c..076fdb78ae8aa 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/typeof-errors.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/typeof-errors.td
@@ -21,7 +21,7 @@ def UnknownOperand : GICombineRule<
   (match (G_ZEXT $dst, $src)),
   (apply (G_ANYEXT $dst, (GITypeOf<"$unknown"> 0)))>;
 
-// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: GISpecialType is not supported in 'match' patterns
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: GITypeOf is not supported in 'match' patterns
 // CHECK: :[[@LINE+1]]:{{[0-9]+}}: note: operand 1 of '__UseInMatch_match_0' has type 'GITypeOf<$dst>'
 def UseInMatch : GICombineRule<
   (defs root:$dst),
@@ -41,7 +41,7 @@ def UseInPF: GICombineRule<
   (match (PFWithTypeOF $dst)),
   (apply (G_ANYEXT $dst, (i32 0)))>;
 
-// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: GISpecialType is not supported in 'match' patterns
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: GITypeOf is not supported in 'match' patterns
 // CHECK: :[[@LINE+1]]:{{[0-9]+}}: note: operand 1 of '__InferredUseInMatch_match_0' has type 'GITypeOf<$dst>'
 def InferredUseInMatch : GICombineRule<
   (defs root:$dst),
@@ -63,6 +63,13 @@ def TypeOfApplyTmp : GICombineRule<
   (apply (G_ANYEXT $dst, i32:$tmp),
          (G_ANYEXT $tmp, (GITypeOf<"$tmp"> 0)))>;
 
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: type 'GITypeOf<$src>' is ill-formed: 'src' is a variadic pack operand
+def TypeOfVariadic : GICombineRule<
+  (defs root:$dst),
+  (match (G_BUILD_VECTOR $dst,  $x, GIVariadic<>:$src)),
+  (apply (G_ANYEXT GITypeOf<"$src">:$tmp, $x),
+         (G_ANYEXT $dst, $tmp))>;
+
 // CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Failed to parse one or more rules
 def MyCombiner: GICombiner<"GenMyCombiner", [
   NoDollarSign,
@@ -71,5 +78,6 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
   UseInPF,
   InferredUseInMatch,
   InferenceConflict,
-  TypeOfApplyTmp
+  TypeOfApplyTmp,
+  TypeOfVariadic
 ]>;
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/variadic-errors.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/variadic-errors.td
new file mode 100644
index 0000000000000..3f8d4a4bf2e13
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/variadic-errors.td
@@ -0,0 +1,89 @@
+// RUN: not llvm-tblgen -I %p/../../../include -gen-global-isel-combiner \
+// RUN:     -combiners=MyCombiner %s 2>&1| \
+// RUN: FileCheck %s -implicit-check-not=error:
+
+include "llvm/Target/Target.td"
+include "llvm/Target/GlobalISel/Combine.td"
+
+def MyTargetISA : InstrInfo;
+def MyTarget : Target { let InstructionSet = MyTargetISA; }
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: 'G_BUILD_VECTOR': GIVariadic can only be used on the last operand
+def VariadicNotLastInList : GICombineRule<
+  (defs root:$dst),
+  (match (G_BUILD_VECTOR $dst, $a, GIVariadic<>:$b, $c)),
+  (apply (G_ANYEXT $dst, $a))>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: 'G_IMPLICIT_DEF': GIVariadic cannot be used on defs
+def VariadicAsDef : GICombineRule<
+  (defs root:$dst),
+  (match (G_IMPLICIT_DEF GIVariadic<1>:$dst)),
+  (apply [{ APPLY }])>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: conflicting types for operand 'args': 'GIVariadic<2,4>' vs 'GIVariadic<3,6>'
+def ConflictingInference : GICombineRule<
+  (defs root:$dst),
+  (match (G_BUILD_VECTOR $dst, GIVariadic<2, 4>:$args)),
+  (apply (G_MERGE_VALUES $dst, GIVariadic<3, 6>:$args))>;
+
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: cannot parse operand type: minimum number of arguments must be greater than zero in GIVariadic
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Failed to parse pattern: '(G_BUILD_VECTOR ?:$dst, anonymous_{{[0-9]+}}:$a)'
+def InvalidBounds0 : GICombineRule<
+  (defs root:$dst),
+  (match (G_BUILD_VECTOR $dst, GIVariadic<0>:$a)),
+  (apply [{ APPLY }])>;
+
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: cannot parse operand type: maximum number of arguments (1) must be zero, or greater than the minimum number of arguments (1) in GIVariadic
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Failed to parse pattern: '(G_BUILD_VECTOR ?:$dst, anonymous_{{[0-9]+}}:$a)'
+def InvalidBounds1 : GICombineRule<
+  (defs root:$dst),
+  (match (G_BUILD_VECTOR $dst, GIVariadic<1,1>:$a)),
+  (apply [{ APPLY }])>;
+
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: error: each instance of a GIVariadic operand must have a unique name within the match patterns
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: note: 'c' is used multiple times
+def VariadicTypeTestEqOp : GICombineRule<
+  (defs root:$a),
+  (match (G_MERGE_VALUES $b, $c),
+         (G_BUILD_VECTOR $a, $b, GIVariadic<2, 4>:$c)),
+  (apply (G_MERGE_VALUES $a, $c))>;
+
+// TODO: We could support this if needed
+
+// CHECK: :[[@LINE+3]]:{{[0-9]+}}: error: GISpecialType is not supported in GICombinePatFrag
+// CHECK: :[[@LINE+2]]:{{[0-9]+}}: note: operand 1 of '__PFWithVariadic_alt0_pattern_0' has type 'GIVariadic<1,0
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Could not parse GICombinePatFrag 'PFWithVariadic'
+def PFWithVariadic: GICombinePatFrag<
+    (outs $dst), (ins),
+    [(pattern (G_ANYEXT $dst, GIVariadic<>:$b))]>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Failed to parse pattern: '(PFWithVariadic ?:$dst)'
+def UseInPF: GICombineRule<
+  (defs root:$dst),
+  (match (PFWithVariadic $dst)),
+  (apply (G_ANYEXT $dst, (i32 0)))>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error:  cannot use a GIVariadic operand on non-variadic instruction 'G_MUL'
+def NotVariadicInstMatch : GICombineRule<
+  (defs root:$a),
+  (match (G_MUL $a, $c, GIVariadic<2, 4>:$b)),
+  (apply (G_MERGE_VALUES $a, $b, $c))>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error:  cannot use a GIVariadic operand on non-variadic instruction 'G_MUL'
+def NotVariadicInstApply : GICombineRule<
+  (defs root:$a),
+  (match (G_BUILD_VECTOR $a, GIVariadic<2, 4>:$b)),
+  (apply (G_MUL $a, $b, $c))>;
+
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: Failed to parse one or more rules
+def MyCombiner: GICombiner<"GenMyCombiner", [
+  VariadicNotLastInList,
+  VariadicAsDef,
+  ConflictingInference,
+  InvalidBounds0,
+  InvalidBounds1,
+  VariadicTypeTestEqOp,
+  UseInPF,
+  NotVariadicInstMatch,
+  NotVariadicInstApply
+]>;
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td
index 796f595930319..853831366fa53 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter.td
@@ -513,7 +513,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
 // R00O-NEXT:  GIM_Reject,
 // R00O:       // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
 // R00O-NEXT:  GIM_Reject,
-// R00O-NEXT:  }; // Size: 1808 bytes
+// R00O-NEXT:  }; // Size: 1816 bytes
 
 def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
                  [(set GPR32:$dst,
diff --git a/llvm/test/TableGen/RegBankFromRegClass.td b/llvm/test/TableGen/RegBankFromRegClass.td
new file mode 100644
index 0000000000000..072a436035052
--- /dev/null
+++ b/llvm/test/TableGen/RegBankFromRegClass.td
@@ -0,0 +1,45 @@
+// RUN: llvm-tblgen -gen-register-bank -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def MyTarget : Target;
+
+def R0 : Register<"r0">;
+def GR : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
+
+def F0 : Register<"f0">;
+def FR : RegisterClass<"MyTarget", [f32], 32, (add F0)>;
+
+def V0 : Register<"V0">;
+def VR : RegisterClass<"MyTarget", [v4i8, f32], 32, (add V0)>;
+
+def AllFloatR : RegisterClass<"MyTarget", [f32], 32, (add F0, V0)>;
+def AnyR : RegisterClass<"MyTarget", [i32, f32, v4i8], 32, (add R0, F0, V0)>;
+
+def GRRegBank : RegisterBank<"GRB", [GR]>;
+def FRRegBank : RegisterBank<"FRB", [FR]>;
+def VRRegBank : RegisterBank<"VRB", [VR]>;
+
+
+// CHECK:      #ifdef GET_TARGET_REGBANK_CLASS
+// CHECK:        const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const override;
+
+// CHECK:      #ifdef GET_TARGET_REGBANK_IMPL
+// CHECK:      const RegisterBank &
+// CHECK-NEXT: MyTargetGenRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const {
+// CHECK-NEXT:   constexpr uint32_t InvalidRegBankID = uint32_t(MyTarget::InvalidRegBankID) & 3;
+// CHECK-NEXT:   static const uint32_t RegClass2RegBank[1] = {
+// CHECK-NEXT:     (uint32_t(InvalidRegBankID) << 0) |
+// CHECK-NEXT:     (uint32_t(InvalidRegBankID) << 2) |
+// CHECK-NEXT:     (uint32_t(MyTarget::FRRegBankID) << 4) | // FRRegClassID
+// CHECK-NEXT:     (uint32_t(MyTarget::GRRegBankID) << 6) | // GRRegClassID
+// CHECK-NEXT:     (uint32_t(MyTarget::VRRegBankID) << 8) // VRRegClassID
+// CHECK-NEXT:   };
+// CHECK-NEXT:   const unsigned RegClassID = RC.getID();
+// CHECK-NEXT:   if (LLVM_LIKELY(RegClassID < 5)) {
+// CHECK-NEXT:     unsigned RegBankID = (RegClass2RegBank[RegClassID / 16] >> ((RegClassID % 16) * 2)) & 3;
+// CHECK-NEXT:     if (RegBankID != InvalidRegBankID)
+// CHECK-NEXT:       return getRegBank(RegBankID);
+// CHECK-NEXT:   }
+// CHECK-NEXT:   llvm_unreachable(llvm::Twine("Target needs to handle register class ID 0x").concat(llvm::Twine::utohexstr(RegClassID)).str().c_str());
+// CHECK-NEXT: }
diff --git a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
index ba60df2bbcf6e..c41a19e97e763 100644
--- a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
+++ b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
@@ -27,13 +27,13 @@ def GPRAbove127 : RegisterClass<"TestTarget", [i32], 32,
 // CHECK-NEXT: OPC_CheckChild1Integer, 0,
 // CHECK-NEXT: OPC_EmitInteger32, 0|128,2/*256*/,
 // CHECK-NEXT: OPC_MorphNodeTo1None, TARGET_VAL(TargetOpcode::COPY_TO_REGCLASS),
-// CHECK-NEXT:     MVT::i32, 2/*#Ops*/, 1, 0,
+// CHECK-NEXT:     /*MVT::i32*/7, 2/*#Ops*/, 1, 0,
 def : Pat<(i32 (add i32:$src, (i32 0))),
           (COPY_TO_REGCLASS GPRAbove127, GPR0:$src)>;
 
 // CHECK:      OPC_CheckChild1Integer, 2,
 // CHECK-NEXT: OPC_EmitStringInteger32, TestNamespace::GPR127RegClassID,
 // CHECK-NEXT: OPC_MorphNodeTo1None, TARGET_VAL(TargetOpcode::COPY_TO_REGCLASS),
-// CHECK-NEXT:     MVT::i32, 2/*#Ops*/, 1, 0,
+// CHECK-NEXT:     /*MVT::i32*/7, 2/*#Ops*/, 1, 0,
 def : Pat<(i32 (add i32:$src, (i32 1))),
           (COPY_TO_REGCLASS GPR127, GPR0:$src)>;
diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td
index 7137cf96fd3d4..c071cfd731cb5 100644
--- a/llvm/test/TableGen/riscv-target-def.td
+++ b/llvm/test/TableGen/riscv-target-def.td
@@ -12,6 +12,11 @@ class RISCVExtension<string name, int major, int minor, string desc,
   bit Experimental = false;
 }
 
+class RISCVExtensionBitmask<bits<3> groupID, int bitPos> {
+  int GroupID = groupID;
+  int BitPos = bitPos;
+}
+
 class RISCVExperimentalExtension<string name, int major, int minor, string desc,
                                  list<RISCVExtension> implies = [],
                                  string fieldname = !subst("Feature", "Has", NAME),
@@ -23,7 +28,8 @@ class RISCVExperimentalExtension<string name, int major, int minor, string desc,
 
 def FeatureStdExtI
     : RISCVExtension<"i", 2, 1,
-                     "'I' (Base Integer Instruction Set)">;
+                     "'I' (Base Integer Instruction Set)">,
+      RISCVExtensionBitmask<0, 8>;
 
 def FeatureStdExtZicsr
     : RISCVExtension<"zicsr", 2, 0,
@@ -36,7 +42,8 @@ def FeatureStdExtZifencei
 def FeatureStdExtF
     : RISCVExtension<"f", 2, 2,
                      "'F' (Single-Precision Floating-Point)",
-                     [FeatureStdExtZicsr]>;
+                     [FeatureStdExtZicsr]>,
+      RISCVExtensionBitmask<0, 5>;
 
 def FeatureStdExtZidummy
     : RISCVExperimentalExtension<"zidummy", 0, 1,
@@ -171,3 +178,10 @@ def ROCKET : RISCVTuneProcessorModel<"rocket",
 // CHECK-NEXT: TUNE_PROC(ROCKET, "rocket")
 
 // CHECK: #undef TUNE_PROC
+
+// CHECK: #ifdef GET_RISCVExtensionBitmaskTable_IMPL
+// CHECK-NEXT: static const RISCVExtensionBitmask ExtensionBitmask[]={
+// CHECK-NEXT:     {"f", 0, 5ULL},
+// CHECK-NEXT:     {"i", 0, 8ULL},
+// CHECK-NEXT: };
+// CHECK-NEXT: #endif
diff --git a/llvm/test/ThinLTO/X86/ctxprof.ll b/llvm/test/ThinLTO/X86/ctxprof.ll
new file mode 100644
index 0000000000000..4c86ec9f4c479
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/ctxprof.ll
@@ -0,0 +1,73 @@
+; Test workload based importing via -thinlto-pgo-ctx-prof
+; Use external linkage symbols so we don't depend on module paths which are
+; used when computing the GUIDs of internal linkage symbols.
+; The functionality is shared with what workload.ll tests, so here we only care
+; about testing the ctx profile is loaded and handled correctly.
+;
+; Set up
+; RUN: rm -rf %t
+; RUN: mkdir -p %t
+; RUN: split-file %s %t
+;
+; RUN: opt -module-summary %t/m1.ll -o %t/m1.bc
+; RUN: opt -module-summary %t/m2.ll -o %t/m2.bc
+; RUN: llvm-dis %t/m1.bc -o - | FileCheck %s --check-prefix=GUIDS-1
+; RUN: llvm-dis %t/m2.bc -o - | FileCheck %s --check-prefix=GUIDS-2
+;
+; GUIDS-1: name: "m1_f1"
+; GUIDS-1-SAME: guid = 6019442868614718803
+; GUIDS-2: name: "m2_f1"
+; GUIDS-2-SAME: guid = 15593096274670919754
+;
+; RUN: rm -rf %t_baseline
+; RUN: rm -rf %t_exp
+; RUN: mkdir -p %t_baseline
+; RUN: mkdir -p %t_exp
+;
+; Normal run. m1 shouldn't get m2_f1 because it's not referenced from there, and
+; m1_f1 shouldn't go to m2.
+;
+; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc \
+; RUN:  -o %t_baseline/result.o -save-temps \
+; RUN:  -r %t/m1.bc,m1_f1,plx \
+; RUN:  -r %t/m2.bc,m2_f1,plx
+; RUN: llvm-dis %t_baseline/result.o.1.3.import.bc -o - | FileCheck %s --check-prefix=NOPROF-1
+; RUN: llvm-dis %t_baseline/result.o.2.3.import.bc -o - | FileCheck %s --check-prefix=NOPROF-2
+;
+; NOPROF-1-NOT: m2_f1()
+; NOPROF-2-NOT: m1_f1()
+;
+; The run with workload definitions - same other options.
+;
+; RUN: echo '[ \
+; RUN:        {"Guid": 6019442868614718803, "Counters": [1], "Callsites": [[{"Guid": 15593096274670919754, "Counters": [1]}]]}, \
+; RUN:        {"Guid": 15593096274670919754, "Counters": [1], "Callsites": [[{"Guid": 6019442868614718803, "Counters": [1]}]]} \
+; RUN:  ]' > %t_exp/ctxprof.json
+; RUN: llvm-ctxprof-util fromJSON --input %t_exp/ctxprof.json --output %t_exp/ctxprof.bitstream
+; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc \
+; RUN:  -o %t_exp/result.o -save-temps \
+; RUN:  -thinlto-pgo-ctx-prof=%t_exp/ctxprof.bitstream \
+; RUN:  -r %t/m1.bc,m1_f1,plx \
+; RUN:  -r %t/m2.bc,m2_f1,plx
+; RUN: llvm-dis %t_exp/result.o.1.3.import.bc -o - | FileCheck %s --check-prefix=FIRST
+; RUN: llvm-dis %t_exp/result.o.2.3.import.bc -o - | FileCheck %s --check-prefix=SECOND
+;
+;
+; FIRST: m2_f1()
+; SECOND: m1_f1()
+;
+;--- m1.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define dso_local void @m1_f1() {
+  ret void
+}
+
+;--- m2.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define dso_local void @m2_f1() {
+  ret void
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
index 5fea6f669ead6..2518a30838242 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
@@ -92,16 +92,10 @@ define i32 @f64_i16(double %in) {
 }
 
 define i64 @f16_i32(half %in) {
-; CHECK-FP-LABEL: @f16_i32(
-; CHECK-FP-NEXT:    [[CONV:%.*]] = fptosi half [[IN:%.*]] to i64
-; CHECK-FP-NEXT:    [[MIN:%.*]] = call i64 @llvm.smin.i64(i64 [[CONV]], i64 2147483647)
-; CHECK-FP-NEXT:    [[MAX:%.*]] = call i64 @llvm.smax.i64(i64 [[MIN]], i64 -2147483648)
-; CHECK-FP-NEXT:    ret i64 [[MAX]]
-;
-; CHECK-FP16-LABEL: @f16_i32(
-; CHECK-FP16-NEXT:    [[TMP1:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half [[IN:%.*]])
-; CHECK-FP16-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-FP16-NEXT:    ret i64 [[TMP2]]
+; CHECK-LABEL: @f16_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half [[IN:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
   %conv = fptosi half %in to i64
   %min = call i64 @llvm.smin.i64(i64 %conv, i64 2147483647)
@@ -185,16 +179,10 @@ define <8 x i64> @v8f32_i32(<8 x float> %in) {
 }
 
 define <4 x i32> @v4f16_i16(<4 x half> %in) {
-; CHECK-FP-LABEL: @v4f16_i16(
-; CHECK-FP-NEXT:    [[CONV:%.*]] = fptosi <4 x half> [[IN:%.*]] to <4 x i32>
-; CHECK-FP-NEXT:    [[MIN:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[CONV]], <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-FP-NEXT:    [[MAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[MIN]], <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
-; CHECK-FP-NEXT:    ret <4 x i32> [[MAX]]
-;
-; CHECK-FP16-LABEL: @v4f16_i16(
-; CHECK-FP16-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[IN:%.*]])
-; CHECK-FP16-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-FP16-NEXT:    ret <4 x i32> [[TMP2]]
+; CHECK-LABEL: @v4f16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %conv = fptosi <4 x half> %in to <4 x i32>
   %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
@@ -203,16 +191,10 @@ define <4 x i32> @v4f16_i16(<4 x half> %in) {
 }
 
 define <8 x i32> @v8f16_i16(<8 x half> %in) {
-; CHECK-FP-LABEL: @v8f16_i16(
-; CHECK-FP-NEXT:    [[CONV:%.*]] = fptosi <8 x half> [[IN:%.*]] to <8 x i32>
-; CHECK-FP-NEXT:    [[MIN:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[CONV]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-FP-NEXT:    [[MAX:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[MIN]], <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
-; CHECK-FP-NEXT:    ret <8 x i32> [[MAX]]
-;
-; CHECK-FP16-LABEL: @v8f16_i16(
-; CHECK-FP16-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[IN:%.*]])
-; CHECK-FP16-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[TMP1]] to <8 x i32>
-; CHECK-FP16-NEXT:    ret <8 x i32> [[TMP2]]
+; CHECK-LABEL: @v8f16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %conv = fptosi <8 x half> %in to <8 x i32>
   %min = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
@@ -292,3 +274,6 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
 declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
 declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
 declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-FP: {{.*}}
+; CHECK-FP16: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 7290a91c9ccd2..b0d2824a64ee3 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -3345,176 +3345,936 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
 ;---------------------------------------------------------------------
 
 define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
@@ -3525,176 +4285,936 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
 ;---------------------------------------------------------------------
 
 define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 05fb224e6f145..c5bf26dc4c0e1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -3367,66 +3367,339 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
@@ -3455,66 +3728,339 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
@@ -3542,71 +4088,344 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
@@ -3635,66 +4454,339 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index af6b7e0addfb1..ee360dee79425 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -1241,88 +1241,468 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
 ;---------------------------------------------------------------------
 
 define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
@@ -1457,88 +1837,468 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
 ;---------------------------------------------------------------------
 
 define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index 69d65e6f1f379..ac5dd55002f3f 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -1263,66 +1263,339 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
@@ -1479,66 +1752,339 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
diff --git a/llvm/test/Transforms/Attributor/memory_locations_gpu.ll b/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
index 2c7a98a41f86f..c10883b54ad59 100644
--- a/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
+++ b/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
@@ -55,7 +55,8 @@ define i32 @test_const_as_call2() {
 ; CHECK-LABEL: define {{[^@]+}}@test_const_as_call2
 ; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    [[P2:%.*]] = call ptr @ptr() #[[ATTR4]]
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[P2]], align 4
+; CHECK-NEXT:    [[C2:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(4)
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr addrspace(4) [[C2]], align 4
 ; CHECK-NEXT:    ret i32 [[L2]]
 ;
   %p2 = call ptr @ptr()
@@ -84,7 +85,8 @@ define i32 @test_shared_as_call2() {
 ; CHECK-LABEL: define {{[^@]+}}@test_shared_as_call2
 ; CHECK-SAME: () #[[ATTR2]] {
 ; CHECK-NEXT:    [[P2:%.*]] = call ptr @ptr() #[[ATTR4]]
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[P2]], align 4
+; CHECK-NEXT:    [[C2:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr addrspace(3) [[C2]], align 4
 ; CHECK-NEXT:    ret i32 [[L2]]
 ;
   %p2 = call ptr @ptr()
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll b/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll
index a5fb924404611..25bec18a5dbf3 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll
@@ -1,13 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
 
-; Check that debug locations are preserved. For more info see:
-;   https://llvm.org/docs/SourceLevelDebugging.html#fixing-errors
-; RUN: opt < %s -enable-debugify -passes=correlated-propagation -S 2>&1 | \
-; RUN:   FileCheck %s -check-prefix=DEBUG
-; DEBUG: CheckModuleDebugify: PASS
-
-; CHECK-LABEL: @test1
 define void @test1(i32 %n) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[SHR:%.*]], %[[FOR_BODY:.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[A]], 5
+; CHECK-NEXT:    br label %[[FOR_COND]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond
 
@@ -17,7 +25,6 @@ for.cond:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-; CHECK: lshr i32 %a, 5
   %shr = ashr i32 %a, 5
   br label %for.cond
 
@@ -26,8 +33,21 @@ for.end:                                          ; preds = %for.cond
 }
 
 ;; Negative test to show transform doesn't happen unless n > 0.
-; CHECK-LABEL: @test2
 define void @test2(i32 %n) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[SHR:%.*]], %[[FOR_BODY:.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A]], -2
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[A]], 2
+; CHECK-NEXT:    br label %[[FOR_COND]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond
 
@@ -37,7 +57,6 @@ for.cond:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-; CHECK: ashr i32 %a, 2
   %shr = ashr i32 %a, 2
   br label %for.cond
 
@@ -46,14 +65,23 @@ for.end:                                          ; preds = %for.cond
 }
 
 ;; Non looping test case.
-; CHECK-LABEL: @test3
 define void @test3(i32 %n) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[BB:.*]], label %[[EXIT:.*]]
+; CHECK:       [[BB]]:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i32 [[N]], 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp = icmp sgt i32 %n, 0
   br i1 %cmp, label %bb, label %exit
 
 bb:
-; CHECK: lshr exact i32 %n, 4
   %shr = ashr exact i32 %n, 4
   br label %exit
 
@@ -65,14 +93,26 @@ exit:
 ; at the point of ashr, we know that the operand is always greater than 0,
 ; because of the guard before it, so we can transform it to lshr.
 declare void @llvm.experimental.guard(i1,...)
-; CHECK-LABEL: @test4
 define void @test4(i32 %n) {
+; CHECK-LABEL: define void @test4(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[SHR:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[A]], 2
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ]
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[A]], 1
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp = icmp sgt i32 %n, 0
   br i1 %cmp, label %loop, label %exit
 
 loop:
-; CHECK: lshr i32 %a, 1
   %a = phi i32 [ %n, %entry ], [ %shr, %loop ]
   %cond = icmp sgt i32 %a, 2
   call void(i1,...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
@@ -85,14 +125,27 @@ exit:
 
 ; same test as above with assume instead of guard.
 declare void @llvm.assume(i1)
-; CHECK-LABEL: @test5
 define void @test5(i32 %n) {
+; CHECK-LABEL: define void @test5(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[SHR:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[A]], 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[A]], 1
+; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp ugt i32 [[SHR]], 8
+; CHECK-NEXT:    br i1 [[LOOPCOND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp = icmp sgt i32 %n, 0
   br i1 %cmp, label %loop, label %exit
 
 loop:
-; CHECK: lshr i32 %a, 1
   %a = phi i32 [ %n, %entry ], [ %shr, %loop ]
   %cond = icmp sgt i32 %a, 4
   call void @llvm.assume(i1 %cond)
@@ -105,54 +158,72 @@ exit:
 }
 
 ; check that ashr of -1 or 0 is optimized away
-; CHECK-LABEL: @test6
 define i32 @test6(i32 %f, i32 %g) {
+; CHECK-LABEL: define i32 @test6(
+; CHECK-SAME: i32 [[F:%.*]], i32 [[G:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[F]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 2
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[F]]
+;
 entry:
   %0 = add i32 %f, 1
   %1 = icmp ult i32 %0, 2
   tail call void @llvm.assume(i1 %1)
-; CHECK: ret i32 %f
   %shr = ashr i32 %f, %g
   ret i32 %shr
 }
 
 ; same test as above with different numbers
-; CHECK-LABEL: @test7
 define i32 @test7(i32 %f, i32 %g) {
+; CHECK-LABEL: define i32 @test7(
+; CHECK-SAME: i32 [[F:%.*]], i32 [[G:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = and i32 [[F]], -2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 6
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[F]], -7
+; CHECK-NEXT:    ret i32 [[SUB]]
+;
 entry:
   %0 = and i32 %f, -2
   %1 = icmp eq i32 %0, 6
   tail call void @llvm.assume(i1 %1)
   %sub = add nsw i32 %f, -7
-; CHECK: ret i32 %sub
   %shr = ashr i32 %sub, %g
   ret i32 %shr
 }
 
 ; check that ashr of -2 or 1 is not optimized away
-; CHECK-LABEL: @test8
 define i32 @test8(i32 %f, i32 %g, i1 %s) {
+; CHECK-LABEL: define i32 @test8(
+; CHECK-SAME: i32 [[F:%.*]], i32 [[G:%.*]], i1 [[S:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 -2, [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 1, [[G]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[S]], i32 [[TMP0]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
 entry:
-; CHECK: ashr i32 -2, %f
   %0 = ashr i32 -2, %f
-; CHECK: lshr i32 1, %g
   %1 = ashr i32 1, %g
   %2 = select i1 %s, i32 %0, i32 %1
   ret i32 %2
 }
 
 define i32 @may_including_undef(i1 %c.1, i1 %c.2) {
-; CHECK-LABEL: define i32 @may_including_undef
-; CHECK-SAME: (i1 [[C_1:%.*]], i1 [[C_2:%.*]]) {
-; CHECK-NEXT:    br i1 [[C_1]], label [[TRUE_1:%.*]], label [[FALSE:%.*]]
-; CHECK:       true.1:
-; CHECK-NEXT:    br i1 [[C_2]], label [[TRUE_2:%.*]], label [[EXIT:%.*]]
-; CHECK:       true.2:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       false:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 2, [[TRUE_1]] ], [ 4, [[TRUE_2]] ], [ undef, [[FALSE]] ]
+; CHECK-LABEL: define i32 @may_including_undef(
+; CHECK-SAME: i1 [[C_1:%.*]], i1 [[C_2:%.*]]) {
+; CHECK-NEXT:    br i1 [[C_1]], label %[[TRUE_1:.*]], label %[[FALSE:.*]]
+; CHECK:       [[TRUE_1]]:
+; CHECK-NEXT:    br i1 [[C_2]], label %[[TRUE_2:.*]], label %[[EXIT:.*]]
+; CHECK:       [[TRUE_2]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 2, %[[TRUE_1]] ], [ 4, %[[TRUE_2]] ], [ undef, %[[FALSE]] ]
 ; CHECK-NEXT:    [[R:%.*]] = ashr i32 [[P]], 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
index a3b1c89634835..c529bab4ef4a7 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -1,10 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
 ; PR2581
 
 define i32 @test1(i1 %C) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[EXIT:%.*]], label [[BODY:%.*]]
+; CHECK-LABEL: define i32 @test1
+; CHECK-SAME: (i1 [[C:%.*]]) {
+; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[BODY:%.*]]
 ; CHECK:       body:
 ; CHECK-NEXT:    ret i32 11
 ; CHECK:       exit:
@@ -23,7 +24,7 @@ exit:           ; preds = %0
 ; PR4420
 declare i1 @ext()
 define i1 @test2() {
-; CHECK-LABEL: @test2(
+; CHECK-LABEL: define i1 @test2() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @ext()
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB2:%.*]]
@@ -56,9 +57,10 @@ bb3:
 ; PR4855
 @gv = internal constant i8 7
 define i8 @test3(ptr %a) nounwind {
-; CHECK-LABEL: @test3(
+; CHECK-LABEL: define i8 @test3
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[A:%.*]], @gv
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[A]], @gv
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB2:%.*]], label [[BB:%.*]]
 ; CHECK:       bb:
 ; CHECK-NEXT:    ret i8 0
@@ -80,9 +82,10 @@ bb2:
 
 ; PR1757
 define i32 @test4(i32) {
-; CHECK-LABEL: @test4(
+; CHECK-LABEL: define i32 @test4
+; CHECK-SAME: (i32 [[TMP0:%.*]]) {
 ; CHECK-NEXT:  EntryBlock:
-; CHECK-NEXT:    [[DOTDEMORGAN:%.*]] = icmp sgt i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    [[DOTDEMORGAN:%.*]] = icmp sgt i32 [[TMP0]], 2
 ; CHECK-NEXT:    br i1 [[DOTDEMORGAN]], label [[GREATERTHANTWO:%.*]], label [[LESSTHANOREQUALTOTWO:%.*]]
 ; CHECK:       GreaterThanTwo:
 ; CHECK-NEXT:    br i1 false, label [[IMPOSSIBLE:%.*]], label [[NOTTWOANDGREATERTHANTWO:%.*]]
@@ -113,14 +116,15 @@ LessThanOrEqualToTwo:
 
 declare ptr @f(ptr)
 define void @test5(ptr %x, ptr %y) {
-; CHECK-LABEL: @test5(
+; CHECK-LABEL: define void @test5
+; CHECK-SAME: (ptr [[X:%.*]], ptr [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PRE:%.*]] = icmp eq ptr [[X:%.*]], null
+; CHECK-NEXT:    [[PRE:%.*]] = icmp eq ptr [[X]], null
 ; CHECK-NEXT:    br i1 [[PRE]], label [[RETURN:%.*]], label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[F:%.*]], [[LOOP]] ], [ [[X]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[F]] = tail call ptr @f(ptr [[PHI]])
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne ptr [[F]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne ptr [[F]], [[Y]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], ptr [[F]], ptr null
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[SEL]], null
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[RETURN]], label [[LOOP]]
@@ -145,13 +149,14 @@ return:
 
 ; "false" case for CorrelatedValuePropagation
 define void @loop1(ptr %x, ptr %y) {
-; CHECK-LABEL: @loop1(
+; CHECK-LABEL: define void @loop1
+; CHECK-SAME: (ptr [[X:%.*]], ptr [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[F:%.*]], [[LOOP]] ], [ [[X]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[F]] = tail call ptr @f(ptr [[PHI]])
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne ptr [[F]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne ptr [[F]], [[Y]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], ptr [[F]], ptr null
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[SEL]], null
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[RETURN:%.*]], label [[LOOP]]
@@ -175,13 +180,14 @@ return:
 
 ; "true" case for CorrelatedValuePropagation
 define void @loop2(ptr %x, ptr %y) {
-; CHECK-LABEL: @loop2(
+; CHECK-LABEL: define void @loop2
+; CHECK-SAME: (ptr [[X:%.*]], ptr [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[F:%.*]], [[LOOP]] ], [ [[X]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[F]] = tail call ptr @f(ptr [[PHI]])
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[F]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[F]], [[Y]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], ptr null, ptr [[F]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[SEL]], null
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[RETURN:%.*]], label [[LOOP]]
@@ -204,9 +210,10 @@ return:
 }
 
 define i32 @switch1(i32 %s) {
-; CHECK-LABEL: @switch1(
+; CHECK-LABEL: define i32 @switch1
+; CHECK-SAME: (i32 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[S]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[NEGATIVE:%.*]], label [[OUT:%.*]]
 ; CHECK:       negative:
 ; CHECK-NEXT:    switch i32 [[S]], label [[OUT]] [
@@ -243,9 +250,10 @@ next:
 }
 
 define i32 @switch2(i32 %s) {
-; CHECK-LABEL: @switch2(
+; CHECK-LABEL: define i32 @switch2
+; CHECK-SAME: (i32 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[POSITIVE:%.*]], label [[OUT:%.*]]
 ; CHECK:       positive:
 ; CHECK-NEXT:    br label [[OUT]]
@@ -276,9 +284,10 @@ next:
 }
 
 define i32 @switch3(i32 %s) {
-; CHECK-LABEL: @switch3(
+; CHECK-LABEL: define i32 @switch3
+; CHECK-SAME: (i32 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[POSITIVE:%.*]], label [[OUT:%.*]]
 ; CHECK:       positive:
 ; CHECK-NEXT:    br label [[OUT]]
@@ -309,9 +318,10 @@ next:
 }
 
 define void @switch4(i32 %s) {
-; CHECK-LABEL: @switch4(
+; CHECK-LABEL: define void @switch4
+; CHECK-SAME: (i32 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[S]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[ZERO:%.*]], label [[OUT:%.*]]
 ; CHECK:       zero:
 ; CHECK-NEXT:    br label [[NEXT:%.*]]
@@ -339,9 +349,10 @@ next:
 }
 
 define void @switch_nonzero_zext(i8 %s) {
-; CHECK-LABEL: @switch_nonzero_zext(
+; CHECK-LABEL: define void @switch_nonzero_zext
+; CHECK-SAME: (i8 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[S]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[SWITCH:%.*]], label [[EXIT:%.*]]
 ; CHECK:       switch:
 ; CHECK-NEXT:    [[S_EXT:%.*]] = zext i8 [[S]] to i32
@@ -371,9 +382,10 @@ unreachable:
 }
 
 define void @switch_assume_nonzero(i32 %s) {
-; CHECK-LABEL: @switch_assume_nonzero(
+; CHECK-LABEL: define void @switch_assume_nonzero
+; CHECK-SAME: (i32 [[S:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[S:%.*]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[S]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
@@ -398,9 +410,10 @@ unreachable:
 }
 
 define void @switch_nonzero_phi(i1 %cond) {
-; CHECK-LABEL: @switch_nonzero_phi(
+; CHECK-LABEL: define void @switch_nonzero_phi
+; CHECK-SAME: (i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    br label [[SWITCH:%.*]]
 ; CHECK:       else:
@@ -438,9 +451,10 @@ unreachable:
 }
 
 define i32 @switch_range(i32 %cond) {
-; CHECK-LABEL: @switch_range(
+; CHECK-LABEL: define i32 @switch_range
+; CHECK-SAME: (i32 [[COND:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = urem i32 [[COND:%.*]], 3
+; CHECK-NEXT:    [[S:%.*]] = urem i32 [[COND]], 3
 ; CHECK-NEXT:    [[S1:%.*]] = add nuw nsw i32 [[S]], 1
 ; CHECK-NEXT:    switch i32 [[S1]], label [[DEFAULT_UNREACHABLE:%.*]] [
 ; CHECK-NEXT:      i32 1, label [[EXIT1:%.*]]
@@ -477,9 +491,10 @@ unreachable:
 ; switch condition, we should not change the default.
 
 define i32 @switch_range_not_full(i32 %cond) {
-; CHECK-LABEL: @switch_range_not_full(
+; CHECK-LABEL: define i32 @switch_range_not_full
+; CHECK-SAME: (i32 [[COND:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = urem i32 [[COND:%.*]], 3
+; CHECK-NEXT:    [[S:%.*]] = urem i32 [[COND]], 3
 ; CHECK-NEXT:    [[S1:%.*]] = add nuw nsw i32 [[S]], 1
 ; CHECK-NEXT:    switch i32 [[S1]], label [[UNREACHABLE:%.*]] [
 ; CHECK-NEXT:      i32 1, label [[EXIT1:%.*]]
@@ -511,9 +526,10 @@ unreachable:
 ; PR51531
 
 define i8 @switch_defaultdest_multipleuse(i8 %t0) {
-; CHECK-LABEL: @switch_defaultdest_multipleuse(
+; CHECK-LABEL: define i8 @switch_defaultdest_multipleuse
+; CHECK-SAME: (i8 [[T0:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[O:%.*]] = or i8 [[T0:%.*]], 1
+; CHECK-NEXT:    [[O:%.*]] = or i8 [[T0]], 1
 ; CHECK-NEXT:    [[R:%.*]] = srem i8 1, [[O]]
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       default.unreachable:
@@ -534,7 +550,8 @@ exit:
 }
 
 define i1 @arg_attribute(ptr nonnull %a) {
-; CHECK-LABEL: @arg_attribute(
+; CHECK-LABEL: define i1 @arg_attribute
+; CHECK-SAME: (ptr nonnull [[A:%.*]]) {
 ; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp eq ptr %a, null
@@ -543,7 +560,7 @@ define i1 @arg_attribute(ptr nonnull %a) {
 
 declare nonnull ptr @return_nonnull()
 define i1 @call_attribute() {
-; CHECK-LABEL: @call_attribute(
+; CHECK-LABEL: define i1 @call_attribute() {
 ; CHECK-NEXT:    [[A:%.*]] = call ptr @return_nonnull()
 ; CHECK-NEXT:    ret i1 false
 ;
@@ -553,12 +570,13 @@ define i1 @call_attribute() {
 }
 
 define i1 @umin(i32 %a, i32 %b) {
-; CHECK-LABEL: @umin(
+; CHECK-LABEL: define i1 @umin
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B:%.*]], 20
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B]], 20
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
 ; CHECK:       b_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ult i32 [[A]], [[B]]
@@ -585,12 +603,13 @@ out:
 }
 
 define i1 @smin(i32 %a, i32 %b) {
-; CHECK-LABEL: @smin(
+; CHECK-LABEL: define i1 @smin
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B:%.*]], 20
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B]], 20
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
 ; CHECK:       b_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ule i32 [[A]], [[B]]
@@ -617,12 +636,13 @@ out:
 }
 
 define i1 @smax(i32 %a, i32 %b) {
-; CHECK-LABEL: @smax(
+; CHECK-LABEL: define i1 @smax
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B:%.*]], 20
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B]], 20
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
 ; CHECK:       b_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp uge i32 [[A]], [[B]]
@@ -649,12 +669,13 @@ out:
 }
 
 define i1 @umax(i32 %a, i32 %b) {
-; CHECK-LABEL: @umax(
+; CHECK-LABEL: define i1 @umax
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B:%.*]], 20
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B]], 20
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
 ; CHECK:       b_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp uge i32 [[A]], [[B]]
@@ -681,8 +702,9 @@ out:
 }
 
 define i1 @umin_lhs_overdefined_rhs_const(i32 %a) {
-; CHECK-LABEL: @umin_lhs_overdefined_rhs_const(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 42
+; CHECK-LABEL: define i1 @umin_lhs_overdefined_rhs_const
+; CHECK-SAME: (i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A]], 42
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[A]], i32 42
 ; CHECK-NEXT:    ret i1 true
 ;
@@ -693,8 +715,9 @@ define i1 @umin_lhs_overdefined_rhs_const(i32 %a) {
 }
 
 define i1 @umin_rhs_overdefined_lhs_const(i32 %a) {
-; CHECK-LABEL: @umin_rhs_overdefined_lhs_const(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[A:%.*]], 42
+; CHECK-LABEL: define i1 @umin_rhs_overdefined_lhs_const
+; CHECK-SAME: (i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[A]], 42
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 42, i32 [[A]]
 ; CHECK-NEXT:    ret i1 true
 ;
@@ -705,10 +728,11 @@ define i1 @umin_rhs_overdefined_lhs_const(i32 %a) {
 }
 
 define i1 @umin_lhs_overdefined_rhs_range(i32 %a, i32 %b) {
-; CHECK-LABEL: @umin_lhs_overdefined_rhs_range(
-; CHECK-NEXT:    [[ASSUME:%.*]] = icmp ult i32 [[B:%.*]], 42
+; CHECK-LABEL: define i1 @umin_lhs_overdefined_rhs_range
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[ASSUME:%.*]] = icmp ult i32 [[B]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ASSUME]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], [[B]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[B]]
 ; CHECK-NEXT:    ret i1 true
 ;
@@ -721,10 +745,11 @@ define i1 @umin_lhs_overdefined_rhs_range(i32 %a, i32 %b) {
 }
 
 define i1 @umin_rhs_overdefined_lhs_range(i32 %a, i32 %b) {
-; CHECK-LABEL: @umin_rhs_overdefined_lhs_range(
-; CHECK-NEXT:    [[ASSUME:%.*]] = icmp ult i32 [[B:%.*]], 42
+; CHECK-LABEL: define i1 @umin_rhs_overdefined_lhs_range
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[ASSUME:%.*]] = icmp ult i32 [[B]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ASSUME]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[A:%.*]], [[B]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[A]], [[B]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[A]]
 ; CHECK-NEXT:    ret i1 true
 ;
@@ -737,9 +762,10 @@ define i1 @umin_rhs_overdefined_lhs_range(i32 %a, i32 %b) {
 }
 
 define i1 @clamp_low1(i32 noundef %a) {
-; CHECK-LABEL: @clamp_low1(
+; CHECK-LABEL: define i1 @clamp_low1
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp eq i32 [[A]], 5
@@ -764,9 +790,10 @@ out:
 }
 
 define i1 @clamp_low2(i32 noundef %a) {
-; CHECK-LABEL: @clamp_low2(
+; CHECK-LABEL: define i1 @clamp_low2
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
@@ -791,9 +818,10 @@ out:
 }
 
 define i1 @clamp_low3(i32 noundef %a) {
-; CHECK-LABEL: @clamp_low3(
+; CHECK-LABEL: define i1 @clamp_low3
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ugt i32 [[A]], 5
@@ -818,9 +846,10 @@ out:
 }
 
 define i1 @clamp_low4(i32 noundef %a) {
-; CHECK-LABEL: @clamp_low4(
+; CHECK-LABEL: define i1 @clamp_low4
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ule i32 [[A]], 5
@@ -845,9 +874,10 @@ out:
 }
 
 define i1 @clamp_high1(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high1(
+; CHECK-LABEL: define i1 @clamp_high1
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp eq i32 [[A]], 5
@@ -872,9 +902,10 @@ out:
 }
 
 define i1 @clamp_high1_or(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high1_or(
+; CHECK-LABEL: define i1 @clamp_high1_or
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp eq i32 [[A]], 5
@@ -899,9 +930,10 @@ out:
 }
 
 define i1 @clamp_high2(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high2(
+; CHECK-LABEL: define i1 @clamp_high2
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
@@ -927,9 +959,10 @@ out:
 
 
 define i1 @clamp_high2_or_disjoint(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high2_or_disjoint(
+; CHECK-LABEL: define i1 @clamp_high2_or_disjoint
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
@@ -955,9 +988,10 @@ out:
 
 
 define i1 @clamp_high3(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high3(
+; CHECK-LABEL: define i1 @clamp_high3
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp slt i32 [[A]], 5
@@ -982,9 +1016,10 @@ out:
 }
 
 define i1 @clamp_high4(i32 noundef %a) {
-; CHECK-LABEL: @clamp_high4(
+; CHECK-LABEL: define i1 @clamp_high4
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp sge i32 [[A]], 5
@@ -1010,9 +1045,10 @@ out:
 
 ; Just showing arbitrary constants work, not really a clamp
 define i1 @not_clamp_high(i32 noundef %a) {
-; CHECK-LABEL: @not_clamp_high(
+; CHECK-LABEL: define i1 @not_clamp_high
+; CHECK-SAME: (i32 noundef [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A]], 5
 ; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
 ; CHECK:       a_guard:
 ; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
@@ -1037,9 +1073,10 @@ out:
 }
 
 define void @abs1(i32 %a, ptr %p) {
-; CHECK-LABEL: @abs1(
+; CHECK-LABEL: define void @abs1
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], 10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], -20
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
@@ -1047,7 +1084,7 @@ define void @abs1(i32 %a, ptr %p) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
 ; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp ult i32 [[ABS]], 19
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
@@ -1082,9 +1119,10 @@ exit:
 }
 
 define void @abs2(i32 %a, ptr %p) {
-; CHECK-LABEL: @abs2(
+; CHECK-LABEL: define void @abs2
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], 10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], -20
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
@@ -1092,7 +1130,7 @@ define void @abs2(i32 %a, ptr %p) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A]], 0
 ; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[SUB]]
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp ult i32 [[ABS]], 19
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
@@ -1127,9 +1165,10 @@ exit:
 }
 
 define void @nabs1(i32 %a, ptr %p) {
-; CHECK-LABEL: @nabs1(
+; CHECK-LABEL: define void @nabs1
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], 10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], -20
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
@@ -1137,7 +1176,7 @@ define void @nabs1(i32 %a, ptr %p) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A]], 0
 ; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp sgt i32 [[NABS]], -19
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
@@ -1172,9 +1211,10 @@ exit:
 }
 
 define void @nabs2(i32 %a, ptr %p) {
-; CHECK-LABEL: @nabs2(
+; CHECK-LABEL: define void @nabs2
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], 10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], -20
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
@@ -1182,7 +1222,7 @@ define void @nabs2(i32 %a, ptr %p) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
 ; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[SUB]]
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp sgt i32 [[NABS]], -19
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
@@ -1217,9 +1257,10 @@ exit:
 }
 
 define i1 @zext_unknown(i8 %a) {
-; CHECK-LABEL: @zext_unknown(
+; CHECK-LABEL: define i1 @zext_unknown
+; CHECK-SAME: (i8 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A32:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[A32:%.*]] = zext i8 [[A]] to i32
 ; CHECK-NEXT:    ret i1 true
 ;
 entry:
@@ -1229,9 +1270,10 @@ entry:
 }
 
 define i1 @trunc_unknown(i32 %a) {
-; CHECK-LABEL: @trunc_unknown(
+; CHECK-LABEL: define i1 @trunc_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A8:%.*]] = trunc i32 [[A:%.*]] to i8
+; CHECK-NEXT:    [[A8:%.*]] = trunc i32 [[A]] to i8
 ; CHECK-NEXT:    [[A32:%.*]] = sext i8 [[A8]] to i32
 ; CHECK-NEXT:    ret i1 true
 ;
@@ -1243,12 +1285,13 @@ entry:
 }
 
 define void @trunc_icmp_ule(i32 %x, ptr %p) {
-; CHECK-LABEL: @trunc_icmp_ule(
-; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-LABEL: define void @trunc_icmp_ule
+; CHECK-SAME: (i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[X]] to i8
 ; CHECK-NEXT:    [[C:%.*]] = icmp uge i8 [[T]], 5
 ; CHECK-NEXT:    br i1 [[C]], label [[TRUE:%.*]], label [[FALSE:%.*]]
 ; CHECK:       true:
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp ugt i32 [[X]], 5
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C3:%.*]] = icmp ule i32 [[X]], 5
@@ -1294,12 +1337,13 @@ false:
 }
 
 define void @trunc_icmp_eq(i32 %x, ptr %p) {
-; CHECK-LABEL: @trunc_icmp_eq(
-; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-LABEL: define void @trunc_icmp_eq
+; CHECK-SAME: (i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[X]] to i8
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[T]], 5
 ; CHECK-NEXT:    br i1 [[C]], label [[TRUE:%.*]], label [[FALSE:%.*]]
 ; CHECK:       true:
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp ugt i32 [[X]], 5
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C3:%.*]] = icmp ule i32 [[X]], 5
@@ -1347,9 +1391,10 @@ false:
 ; TODO: missed optimization
 ; Make sure we exercise non-integer inputs to unary operators (i.e. crash check).
 define i1 @bitcast_unknown(float %a) {
-; CHECK-LABEL: @bitcast_unknown(
+; CHECK-LABEL: define i1 @bitcast_unknown
+; CHECK-SAME: (float [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A32:%.*]] = bitcast float [[A:%.*]] to i32
+; CHECK-NEXT:    [[A32:%.*]] = bitcast float [[A]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A32]], 128
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -1360,9 +1405,10 @@ entry:
 }
 
 define i1 @bitcast_unknown2(ptr %p) {
-; CHECK-LABEL: @bitcast_unknown2(
+; CHECK-LABEL: define i1 @bitcast_unknown2
+; CHECK-SAME: (ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P64:%.*]] = ptrtoint ptr [[P:%.*]] to i64
+; CHECK-NEXT:    [[P64:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i64 [[P64]], 128
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -1374,9 +1420,10 @@ entry:
 
 
 define i1 @and_unknown(i32 %a) {
-; CHECK-LABEL: @and_unknown(
+; CHECK-LABEL: define i1 @and_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 128
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A]], 128
 ; CHECK-NEXT:    ret i1 true
 ;
 entry:
@@ -1386,9 +1433,10 @@ entry:
 }
 
 define i1 @lshr_unknown(i32 %a) {
-; CHECK-LABEL: @lshr_unknown(
+; CHECK-LABEL: define i1 @lshr_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[AND:%.*]] = lshr i32 [[A:%.*]], 30
+; CHECK-NEXT:    [[AND:%.*]] = lshr i32 [[A]], 30
 ; CHECK-NEXT:    ret i1 true
 ;
 entry:
@@ -1398,9 +1446,10 @@ entry:
 }
 
 define i1 @urem_unknown(i32 %a) {
-; CHECK-LABEL: @urem_unknown(
+; CHECK-LABEL: define i1 @urem_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UREM:%.*]] = urem i32 [[A:%.*]], 30
+; CHECK-NEXT:    [[UREM:%.*]] = urem i32 [[A]], 30
 ; CHECK-NEXT:    ret i1 true
 ;
 entry:
@@ -1410,9 +1459,10 @@ entry:
 }
 
 define i1 @srem_unknown(i32 %a) {
-; CHECK-LABEL: @srem_unknown(
+; CHECK-LABEL: define i1 @srem_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SREM:%.*]] = srem i32 [[A:%.*]], 30
+; CHECK-NEXT:    [[SREM:%.*]] = srem i32 [[A]], 30
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
 ; CHECK-NEXT:    ret i1 true
@@ -1431,9 +1481,10 @@ exit2:
 }
 
 define i1 @sdiv_unknown(i32 %a) {
-; CHECK-LABEL: @sdiv_unknown(
+; CHECK-LABEL: define i1 @sdiv_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SREM:%.*]] = sdiv i32 [[A:%.*]], 123
+; CHECK-NEXT:    [[SREM:%.*]] = sdiv i32 [[A]], 123
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
 ; CHECK-NEXT:    ret i1 true
@@ -1452,9 +1503,10 @@ exit2:
 }
 
 define i1 @uadd_sat_unknown(i32 %a) {
-; CHECK-LABEL: @uadd_sat_unknown(
+; CHECK-LABEL: define i1 @uadd_sat_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[A:%.*]], i32 100)
+; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[A]], i32 100)
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[VAL]], 100
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
@@ -1474,9 +1526,10 @@ exit2:
 }
 
 define i1 @usub_sat_unknown(i32 %a) {
-; CHECK-LABEL: @usub_sat_unknown(
+; CHECK-LABEL: define i1 @usub_sat_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A:%.*]], i32 100)
+; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A]], i32 100)
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[VAL]], -101
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
@@ -1496,9 +1549,10 @@ exit2:
 }
 
 define i1 @sadd_sat_unknown(i32 %a) {
-; CHECK-LABEL: @sadd_sat_unknown(
+; CHECK-LABEL: define i1 @sadd_sat_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[A:%.*]], i32 100)
+; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[A]], i32 100)
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[VAL]], -2147483548
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
@@ -1518,9 +1572,10 @@ exit2:
 }
 
 define i1 @ssub_sat_unknown(i32 %a) {
-; CHECK-LABEL: @ssub_sat_unknown(
+; CHECK-LABEL: define i1 @ssub_sat_unknown
+; CHECK-SAME: (i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[A:%.*]], i32 100)
+; CHECK-NEXT:    [[VAL:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[A]], i32 100)
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[VAL]], 2147483547
 ; CHECK-NEXT:    br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]]
 ; CHECK:       exit1:
@@ -1540,14 +1595,15 @@ exit2:
 }
 
 define void @select_and(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_and(
+; CHECK-LABEL: define void @select_and
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
 ; CHECK:       guard:
-; CHECK-NEXT:    store i1 false, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -1571,15 +1627,16 @@ exit:
 }
 
 define void @select_and_wrong_const(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_and_wrong_const(
+; CHECK-LABEL: define void @select_and_wrong_const
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 true
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp sgt i32 [[A]], 20
-; CHECK-NEXT:    store i1 [[C1]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 [[C1]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[A]], -20
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -1604,15 +1661,16 @@ exit:
 }
 
 define void @select_and_wrong_operand(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_and_wrong_operand(
+; CHECK-LABEL: define void @select_and_wrong_operand
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[A]], 10
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP1]], i1 false, i1 [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[GUARD:%.*]], label [[EXIT:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp sgt i32 [[A]], 20
-; CHECK-NEXT:    store i1 [[C1]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 [[C1]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[A]], -20
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -1637,14 +1695,15 @@ exit:
 }
 
 define void @select_or(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_or(
+; CHECK-LABEL: define void @select_or
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], 10
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
-; CHECK-NEXT:    store i1 false, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -1668,15 +1727,16 @@ exit:
 }
 
 define void @select_or_wrong_const(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_or_wrong_const(
+; CHECK-LABEL: define void @select_or_wrong_const
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], 10
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[CMP1]], i1 false, i1 [[CMP2]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp sgt i32 [[A]], 20
-; CHECK-NEXT:    store i1 [[C1]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 [[C1]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[A]], -20
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -1701,15 +1761,16 @@ exit:
 }
 
 define void @select_or_wrong_operand(i32 %a, ptr %p) {
-; CHECK-LABEL: @select_or_wrong_operand(
+; CHECK-LABEL: define void @select_or_wrong_operand
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], -10
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A]], -10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[A]], 10
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 true
 ; CHECK-NEXT:    br i1 [[OR]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp sgt i32 [[A]], 20
-; CHECK-NEXT:    store i1 [[C1]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 [[C1]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[A]], -20
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -1734,13 +1795,14 @@ exit:
 }
 
 define void @or_union(i32 %a, ptr %p) {
-; CHECK-LABEL: @or_union(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 10
+; CHECK-LABEL: define void @or_union
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[A]], 12
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[GUARD:%.*]], label [[EXIT:%.*]]
 ; CHECK:       guard:
-; CHECK-NEXT:    store i1 false, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[A]], 10
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C3:%.*]] = icmp eq i32 [[A]], 11
@@ -1775,9 +1837,10 @@ exit:
 }
 
 define i1 @or_union_unknown_cond(i32 %a, i1 %c) {
-; CHECK-LABEL: @or_union_unknown_cond(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 10
-; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[C:%.*]]
+; CHECK-LABEL: define i1 @or_union_unknown_cond
+; CHECK-SAME: (i32 [[A:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A]], 10
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[C]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[GUARD:%.*]], label [[EXIT:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[A]], 10
@@ -1798,13 +1861,14 @@ exit:
 }
 
 define void @and_union(i32 %a, ptr %p) {
-; CHECK-LABEL: @and_union(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 10
+; CHECK-LABEL: define void @and_union
+; CHECK-SAME: (i32 [[A:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A]], 10
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[A]], 12
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
-; CHECK-NEXT:    store i1 false, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[A]], 10
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[C3:%.*]] = icmp eq i32 [[A]], 11
@@ -1839,9 +1903,10 @@ exit:
 }
 
 define i1 @and_union_unknown_cond(i32 %a, i1 %c) {
-; CHECK-LABEL: @and_union_unknown_cond(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 10
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[C:%.*]]
+; CHECK-LABEL: define i1 @and_union_unknown_cond
+; CHECK-SAME: (i32 [[A:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A]], 10
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[C]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[A]], 10
@@ -1862,14 +1927,15 @@ exit:
 }
 
 define void @select_assume(i32 %a, i32 %b, i1 %c, ptr %p) {
-; CHECK-LABEL: @select_assume(
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A:%.*]], 10
+; CHECK-LABEL: define void @select_assume
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], 10
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
-; CHECK-NEXT:    [[C2:%.*]] = icmp ult i32 [[B:%.*]], 20
+; CHECK-NEXT:    [[C2:%.*]] = icmp ult i32 [[B]], 20
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[C:%.*]], i32 [[A]], i32 [[B]]
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]]
 ; CHECK-NEXT:    [[C3:%.*]] = icmp ult i32 [[S]], 19
-; CHECK-NEXT:    store i1 [[C3]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 [[C3]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -1886,10 +1952,11 @@ define void @select_assume(i32 %a, i32 %b, i1 %c, ptr %p) {
 }
 
 define void @xor(i8 %a, ptr %p) {
-; CHECK-LABEL: @xor(
-; CHECK-NEXT:    [[A_MASK:%.*]] = and i8 [[A:%.*]], 15
+; CHECK-LABEL: define void @xor
+; CHECK-SAME: (i8 [[A:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[A_MASK:%.*]] = and i8 [[A]], 15
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[A_MASK]], -86
-; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp ugt i8 [[XOR]], -96
 ; CHECK-NEXT:    store i1 [[C2]], ptr [[P]], align 1
 ; CHECK-NEXT:    store i1 true, ptr [[P]], align 1
@@ -1911,8 +1978,9 @@ define void @xor(i8 %a, ptr %p) {
 }
 
 define i1 @xor_neg_cond(i32 %a) {
-; CHECK-LABEL: @xor_neg_cond(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 10
+; CHECK-LABEL: define i1 @xor_neg_cond
+; CHECK-SAME: (i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A]], 10
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i1 [[CMP1]], true
 ; CHECK-NEXT:    br i1 [[XOR]], label [[EXIT:%.*]], label [[GUARD:%.*]]
 ; CHECK:       guard:
@@ -1933,8 +2001,9 @@ exit:
 }
 
 define i1 @xor_approx(i32 %a) {
-; CHECK-LABEL: @xor_approx(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[A:%.*]], 2
+; CHECK-LABEL: define i1 @xor_approx
+; CHECK-SAME: (i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[A]], 2
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[A]], 5
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp ugt i32 [[A]], 7
 ; CHECK-NEXT:    [[CMP4:%.*]] = icmp ult i32 [[A]], 9
@@ -1968,8 +2037,9 @@ exit:
 }
 
 define i1 @binop_eval_order(i32 %x) {
-; CHECK-LABEL: @binop_eval_order(
-; CHECK-NEXT:    [[A:%.*]] = add nuw nsw i32 [[X:%.*]], 1
+; CHECK-LABEL: define i1 @binop_eval_order
+; CHECK-SAME: (i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = add nuw nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[B:%.*]] = add nuw nsw i32 [[A]], 1
 ; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i1 true
@@ -1981,6 +2051,46 @@ define i1 @binop_eval_order(i32 %x) {
   ret i1 %d
 }
 
+define range(i32 0, 1024) i32 @range_larger(i8 %x) {
+; CHECK-LABEL: define range(i32 0, 1024) i32 @range_larger
+; CHECK-SAME: (i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    ret i32 [[ZEXT]]
+;
+  %zext = zext i8 %x to i32
+  ret i32 %zext
+}
+
+define range(i32 0, 128) i32 @range_smaller(i8 %x) {
+; CHECK-LABEL: define range(i32 0, 128) i32 @range_smaller
+; CHECK-SAME: (i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    ret i32 [[ZEXT]]
+;
+  %zext = zext i8 %x to i32
+  ret i32 %zext
+}
+
+define range(i32 128, 512) i32 @range_intersect(i8 %x) {
+; CHECK-LABEL: define range(i32 128, 512) i32 @range_intersect
+; CHECK-SAME: (i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    ret i32 [[ZEXT]]
+;
+  %zext = zext i8 %x to i32
+  ret i32 %zext
+}
+
+define range(i32 512, 1024) i32 @range_non_overlapping(i8 %x) {
+; CHECK-LABEL: define range(i32 512, 1024) i32 @range_non_overlapping
+; CHECK-SAME: (i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    ret i32 [[ZEXT]]
+;
+  %zext = zext i8 %x to i32
+  ret i32 %zext
+}
+
 declare i32 @llvm.uadd.sat.i32(i32, i32)
 declare i32 @llvm.usub.sat.i32(i32, i32)
 declare i32 @llvm.sadd.sat.i32(i32, i32)
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
index 086043d4b7c1b..75ad9274eec85 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
@@ -224,3 +224,35 @@ define i1 @one_bit(i1 %a, i1 %b) {
   %mul = mul i1 %a, %b
   ret i1 %mul
 }
+
+define i1 @test_mul_nuw_nsw_nneg(i32 %x, i32 range(i32 3, 2147483648) %y) {
+; CHECK-LABEL: @test_mul_nuw_nsw_nneg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %mul = mul nuw nsw i32 %x, %y
+  %cmp = icmp sgt i32 %mul, -1
+  ret i1 %cmp
+}
+
+define i1 @test_mul_nuw_nsw_nneg_complex(i32 %x, i32 noundef %y, i32 %z) {
+; CHECK-LABEL: @test_mul_nuw_nsw_nneg_complex(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[Y:%.*]], 2
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[CMP2]], i32 3, i32 4
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP1]], i32 [[Y]], i32 [[SEL1]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[X]], [[SEL2]]
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp1 = icmp sgt i32 %y, 2
+  %cmp2 = icmp eq i32 %x, 0
+  %sel1 = select i1 %cmp2, i32 3, i32 4
+  %sel2 = select i1 %cmp1, i32 %y, i32 %sel1
+  %mul = mul nuw nsw i32 %x, %sel2
+  %cmp3 = icmp sgt i32 %mul, -1
+  ret i1 %cmp3
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/shl.ll b/llvm/test/Transforms/CorrelatedValuePropagation/shl.ll
index 88311219dee58..8b4dbc98425bf 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/shl.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/shl.ll
@@ -86,7 +86,7 @@ define i8 @test4(i8 %a, i8 %b) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; CHECK:       bb:
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i8 [[A:%.*]], [[B]]
-; CHECK-NEXT:    ret i8 [[SHL]]
+; CHECK-NEXT:    ret i8 -1
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i8 0
 ;
@@ -382,8 +382,7 @@ define i1 @nuw_range1(i8 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 1
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i8 [[C]], 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[SHL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %c = add nuw nsw i8 %b, 1
@@ -397,8 +396,7 @@ define i1 @nuw_range2(i8 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 3
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i8 [[C]], 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[SHL]], 2
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %c = add nuw nsw i8 %b, 3
@@ -420,3 +418,59 @@ entry:
   %cmp = icmp slt i8 %c, %shl
   ret i1 %cmp
 }
+
+define i64 @shl_nuw_nsw_test1(i32 %x) {
+; CHECK-LABEL: @shl_nuw_nsw_test1(
+; CHECK-NEXT:    [[SHL1:%.*]] = shl nuw nsw i32 1, [[X:%.*]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[SHL1]], -1
+; CHECK-NEXT:    [[EXT:%.*]] = zext nneg i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[SHL2:%.*]] = shl nuw nsw i64 [[EXT]], 2
+; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i64 [[SHL2]], 39
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ADD2]], 3
+; CHECK-NEXT:    ret i64 [[LSHR]]
+;
+  %shl1 = shl nuw nsw i32 1, %x
+  %add1 = add nsw i32 %shl1, -1
+  %ext = sext i32 %add1 to i64
+  %shl2 = shl nsw i64 %ext, 2
+  %add2 = add nsw i64 %shl2, 39
+  %lshr = lshr i64 %add2, 3
+  %and = and i64 %lshr, 4294967295
+  ret i64 %and
+}
+
+define i32 @shl_nuw_nsw_test2(i32 range(i32 -2147483248, 1) %x) {
+; CHECK-LABEL: @shl_nuw_nsw_test2(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[X:%.*]], 1
+; CHECK-NEXT:    ret i32 200
+;
+  %shl = shl nsw i32 %x, 1
+  %smax = call i32 @llvm.smax.i32(i32 %shl, i32 200)
+  ret i32 %smax
+}
+
+define i64 @shl_nuw_nsw_test3(i1 %cond, i64 range(i64 1, 0) %x, i64 range(i64 3, 0) %y) {
+; CHECK-LABEL: @shl_nuw_nsw_test3(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[X:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i64 [[Y:%.*]], i64 [[SHL]]
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+  %shl = shl nuw i64 1, %x
+  %sel = select i1 %cond, i64 %y, i64 %shl
+  %umax = call i64 @llvm.umax.i64(i64 %sel, i64 2)
+  ret i64 %umax
+}
+
+define i1 @shl_nuw_nsw_test4(i32 %x, i32 range(i32 0, 32) %k) {
+; CHECK-LABEL: @shl_nuw_nsw_test4(
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT:    [[SH_PROM:%.*]] = zext nneg i32 [[K:%.*]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i64 [[CONV]], [[SH_PROM]]
+; CHECK-NEXT:    ret i1 false
+;
+  %conv = sext i32 %x to i64
+  %sh_prom = zext nneg i32 %k to i64
+  %shl = shl nsw i64 %conv, %sh_prom
+  %cmp = icmp eq i64 %shl, -9223372036854775808
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
index 6f13263fe92be..351a2c79cdff4 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
@@ -241,8 +241,6 @@ define <2 x i16> @and_with_poison(<2 x i8> %a) {
   ret <2 x i16> %res
 }
 
-
-
 define <4 x i64> @issue_97674_getConstantOnEdge(i1 %cond) {
 ; CHECK-LABEL: define <4 x i64> @issue_97674_getConstantOnEdge(
 ; CHECK-SAME: i1 [[COND:%.*]]) {
@@ -277,3 +275,80 @@ entry:
   %folds = add <4 x i64> zeroinitializer, zeroinitializer
   ret <4 x i64> %folds
 }
+
+define <2 x i16> @phi_merge1(i1 %c, <2 x i8> %a) {
+; CHECK-LABEL: define <2 x i16> @phi_merge1(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ <i16 1, i16 2>, %[[IF]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], <i16 2, i16 3>
+; CHECK-NEXT:    ret <2 x i16> [[ADD]]
+;
+entry:
+  %zext = zext <2 x i8> %a to <2 x i16>
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <2 x i16> [ %zext, %entry ], [ <i16 1, i16 2>, %if ]
+  %add = add <2 x i16> %phi, <i16 2, i16 3>
+  ret <2 x i16> %add
+}
+
+define <2 x i16> @phi_merge2(i1 %c, <2 x i8> %a) {
+; CHECK-LABEL: define <2 x i16> @phi_merge2(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <2 x i16> [ <i16 1, i16 2>, %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], <i16 2, i16 3>
+; CHECK-NEXT:    ret <2 x i16> [[ADD]]
+;
+entry:
+  %zext = zext <2 x i8> %a to <2 x i16>
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <2 x i16> [ <i16 1, i16 2>, %entry ], [ %zext, %if ]
+  %add = add <2 x i16> %phi, <i16 2, i16 3>
+  ret <2 x i16> %add
+}
+
+;; Check if ICMP instruction is constant folded or not.
+define <2 x i1> @insertelement_fold1() {
+; CHECK-LABEL: define <2 x i1> @insertelement_fold1() {
+; CHECK-NEXT:    [[IE1:%.*]] = insertelement <2 x i32> poison, i32 10, i64 0
+; CHECK-NEXT:    [[IE2:%.*]] = insertelement <2 x i32> [[IE1]], i32 20, i64 1
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %ie1 = insertelement <2 x i32> poison, i32 10, i64 0
+  %ie2 = insertelement <2 x i32> %ie1, i32 20, i64 1
+  %icmp1 = icmp slt <2 x i32> %ie2, <i32 1024, i32 1024>
+  ret <2 x i1> %icmp1
+}
+
+;; Check if LVI is able to handle constant vector operands
+;; in InsertElementInst and CVP is able to fold ICMP instruction.
+define <2 x i1> @insertelement_fold2() {
+; CHECK-LABEL: define <2 x i1> @insertelement_fold2() {
+; CHECK-NEXT:    [[IE1:%.*]] = insertelement <2 x i32> <i32 poison, i32 20>, i32 10, i64 0
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %ie1 = insertelement <2 x i32> <i32 poison, i32 20>, i32 10, i64 0
+  %icmp1 = icmp slt <2 x i32> %ie1, <i32 1024, i32 1024>
+  ret <2 x i1> %icmp1
+}
diff --git a/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll b/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll
new file mode 100644
index 0000000000000..64a023ef45137
--- /dev/null
+++ b/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S < %s -passes=dce  | FileCheck %s
+
+; ldu/ldg intrinsics were erroneously not marked IntrWillReturn, preventing
+; them from being eliminated at IR level when dead.
+
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+
+define void @test_ldu_i8_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i8_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_i16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldu_i32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_i64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_p_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_p_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_f32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_f64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldu_v2f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_v2f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i8_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i8_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldg_i32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_p_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_p_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_f32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_f64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldg_v2f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_v2f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
index bd225def3addc..231aba5cc2f0c 100644
--- a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
@@ -4,6 +4,7 @@
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 %s
+; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=4 -S | FileCheck --check-prefix=LIMIT-4 %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -71,3 +72,47 @@ bb3:
   store i32 0, ptr %P
   ret void
 }
+
+define void @duplicate_worklist_endoffunction(ptr %ptr.0, ptr %ptr.1) {
+; LIMIT-4-LABEL: @duplicate_worklist_endoffunction(
+; LIMIT-4-NEXT:  entry:
+; LIMIT-4-NEXT:    [[STACK_0:%.*]] = alloca [768 x i8], align 16
+; LIMIT-4-NEXT:    [[VAL_0:%.*]] = load i16, ptr [[PTR_1:%.*]], align 8
+; LIMIT-4-NEXT:    [[COND:%.*]] = icmp ugt i16 [[VAL_0]], 24
+; LIMIT-4-NEXT:    br i1 [[COND]], label [[BB_1:%.*]], label [[EXIT:%.*]]
+; LIMIT-4:       bb.1:
+; LIMIT-4-NEXT:    br label [[LOOP:%.*]]
+; LIMIT-4:       loop:
+; LIMIT-4-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[BB_1]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; LIMIT-4-NEXT:    [[PTR_3:%.*]] = getelementptr i8, ptr [[STACK_0]], i64 [[IV]]
+; LIMIT-4-NEXT:    store ptr [[PTR_0:%.*]], ptr [[PTR_3]], align 2
+; LIMIT-4-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; LIMIT-4-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 10
+; LIMIT-4-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
+; LIMIT-4:       exit:
+; LIMIT-4-NEXT:    ret void
+;
+entry:
+  %stack.0 = alloca [768 x i8], align 16
+  %stack.1 = alloca [20 x i8], align 8
+  %val.0 = load i16, ptr %ptr.1, align 8
+  %cond = icmp ugt i16 %val.0, 24
+  br i1 %cond, label %bb.1, label %exit
+
+bb.1:                                             ; preds = %entry
+  %ptr.2 = getelementptr inbounds i8, ptr %ptr.1, i64 8
+  %val.1 = load i64, ptr %ptr.2, align 8
+  store i64 %val.1, ptr %stack.1, align 8
+  br label %loop
+
+loop:                                             ; preds = %loop, %bb.1
+  %iv = phi i64 [ 0, %bb.1 ], [ %iv.next, %loop ]
+  %ptr.3 = getelementptr i8, ptr %stack.0, i64 %iv
+  store ptr %ptr.0, ptr %ptr.3, align 2
+  %iv.next = add nuw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 10
+  br i1 %exitcond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
index ea6392714bf6f..10760e3b8b8b8 100644
--- a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
+++ b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
@@ -489,3 +489,184 @@ define void @test_scc_argmem_read_2(ptr %p) {
   call void @test_scc_argmem_read_1(ptr %p)
   ret void
 }
+
+define i64 @select_same_obj(i1 %c, ptr %p, i64 %x) {
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
+; FNATTRS-LABEL: define i64 @select_same_obj
+; FNATTRS-SAME: (i1 [[C:%.*]], ptr nocapture readonly [[P:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; FNATTRS-NEXT:    [[P3:%.*]] = select i1 [[C]], ptr [[P]], ptr [[P2]]
+; FNATTRS-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; FNATTRS-NEXT:    ret i64 [[R]]
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; ATTRIBUTOR-LABEL: define i64 @select_same_obj
+; ATTRIBUTOR-SAME: (i1 [[C:%.*]], ptr nocapture nofree readonly [[P:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; ATTRIBUTOR-NEXT:    [[P3:%.*]] = select i1 [[C]], ptr [[P]], ptr [[P2]]
+; ATTRIBUTOR-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; ATTRIBUTOR-NEXT:    ret i64 [[R]]
+;
+entry:
+  %p2 = getelementptr i8, ptr %p, i64 %x
+  %p3 = select i1 %c, ptr %p, ptr %p2
+  %r = load i64, ptr %p3
+  ret i64 %r
+}
+
+; FIXME: This could be `memory(argmem: read)`.
+define i64 @select_different_obj(i1 %c, ptr %p, ptr %p2) {
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
+; FNATTRS-LABEL: define i64 @select_different_obj
+; FNATTRS-SAME: (i1 [[C:%.*]], ptr nocapture readonly [[P:%.*]], ptr nocapture readonly [[P2:%.*]]) #[[ATTR3]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    [[P3:%.*]] = select i1 [[C]], ptr [[P]], ptr [[P2]]
+; FNATTRS-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; FNATTRS-NEXT:    ret i64 [[R]]
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; ATTRIBUTOR-LABEL: define i64 @select_different_obj
+; ATTRIBUTOR-SAME: (i1 [[C:%.*]], ptr nocapture nofree readonly [[P:%.*]], ptr nocapture nofree readonly [[P2:%.*]]) #[[ATTR0]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    [[P3:%.*]] = select i1 [[C]], ptr [[P]], ptr [[P2]]
+; ATTRIBUTOR-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; ATTRIBUTOR-NEXT:    ret i64 [[R]]
+;
+entry:
+  %p3 = select i1 %c, ptr %p, ptr %p2
+  %r = load i64, ptr %p3
+  ret i64 %r
+}
+
+define i64 @phi_same_obj(i1 %c, ptr %p, i64 %x) {
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
+; FNATTRS-LABEL: define i64 @phi_same_obj
+; FNATTRS-SAME: (i1 [[C:%.*]], ptr nocapture readonly [[P:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; FNATTRS-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[JOIN:%.*]]
+; FNATTRS:       if:
+; FNATTRS-NEXT:    br label [[JOIN]]
+; FNATTRS:       join:
+; FNATTRS-NEXT:    [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ]
+; FNATTRS-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; FNATTRS-NEXT:    ret i64 [[R]]
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
+; ATTRIBUTOR-LABEL: define i64 @phi_same_obj
+; ATTRIBUTOR-SAME: (i1 [[C:%.*]], ptr nocapture nofree readonly [[P:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; ATTRIBUTOR-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[JOIN:%.*]]
+; ATTRIBUTOR:       if:
+; ATTRIBUTOR-NEXT:    br label [[JOIN]]
+; ATTRIBUTOR:       join:
+; ATTRIBUTOR-NEXT:    [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ]
+; ATTRIBUTOR-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; ATTRIBUTOR-NEXT:    ret i64 [[R]]
+;
+entry:
+  %p2 = getelementptr i8, ptr %p, i64 %x
+  br i1 %c, label %if, label %join
+if:
+  br label %join
+join:
+  %p3 = phi ptr [ %p, %if ], [ %p2, %entry ]
+  %r = load i64, ptr %p3
+  ret i64 %r
+}
+
+; FIXME: This could be `memory(argmem: read)`.
+define i64 @phi_different_obj(i1 %c, ptr %p, ptr %p2) {
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
+; FNATTRS-LABEL: define i64 @phi_different_obj
+; FNATTRS-SAME: (i1 [[C:%.*]], ptr nocapture readonly [[P:%.*]], ptr nocapture readonly [[P2:%.*]]) #[[ATTR3]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[JOIN:%.*]]
+; FNATTRS:       if:
+; FNATTRS-NEXT:    br label [[JOIN]]
+; FNATTRS:       join:
+; FNATTRS-NEXT:    [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ]
+; FNATTRS-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; FNATTRS-NEXT:    ret i64 [[R]]
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
+; ATTRIBUTOR-LABEL: define i64 @phi_different_obj
+; ATTRIBUTOR-SAME: (i1 [[C:%.*]], ptr nocapture nofree readonly [[P:%.*]], ptr nocapture nofree readonly [[P2:%.*]]) #[[ATTR1]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[JOIN:%.*]]
+; ATTRIBUTOR:       if:
+; ATTRIBUTOR-NEXT:    br label [[JOIN]]
+; ATTRIBUTOR:       join:
+; ATTRIBUTOR-NEXT:    [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ]
+; ATTRIBUTOR-NEXT:    [[R:%.*]] = load i64, ptr [[P3]], align 4
+; ATTRIBUTOR-NEXT:    ret i64 [[R]]
+;
+entry:
+  br i1 %c, label %if, label %join
+if:
+  br label %join
+join:
+  %p3 = phi ptr [ %p, %if ], [ %p2, %entry ]
+  %r = load i64, ptr %p3
+  ret i64 %r
+}
+
+; An `alloca` that won't be handled by `getModRefInfoMask`
+define i64 @alloca_with_phis(i64 %arg, i1 %cmp) {
+; COMMON: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; COMMON-LABEL: define i64 @alloca_with_phis
+; COMMON-SAME: (i64 [[ARG:%.*]], i1 [[CMP:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:  bb:
+; COMMON-NEXT:    [[I:%.*]] = alloca i64, align 8
+; COMMON-NEXT:    store i64 [[ARG]], ptr [[I]], align 4
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB4:%.*]]
+; COMMON:       bb1:
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB8:%.*]], label [[BB2:%.*]]
+; COMMON:       bb2:
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB8]], label [[BB3:%.*]]
+; COMMON:       bb3:
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB8]], label [[BB6:%.*]]
+; COMMON:       bb4:
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB8]], label [[BB5:%.*]]
+; COMMON:       bb5:
+; COMMON-NEXT:    br i1 [[CMP]], label [[BB8]], label [[BB6]]
+; COMMON:       bb6:
+; COMMON-NEXT:    [[I7:%.*]] = phi ptr [ [[I]], [[BB3]] ], [ [[I]], [[BB5]] ]
+; COMMON-NEXT:    br label [[BB8]]
+; COMMON:       bb8:
+; COMMON-NEXT:    [[I9:%.*]] = phi ptr [ [[I]], [[BB1]] ], [ [[I]], [[BB2]] ], [ [[I]], [[BB3]] ], [ [[I]], [[BB4]] ], [ [[I]], [[BB5]] ], [ [[I7]], [[BB6]] ]
+; COMMON-NEXT:    [[RET:%.*]] = load i64, ptr [[I9]], align 4
+; COMMON-NEXT:    ret i64 [[RET]]
+;
+bb:
+  %i = alloca i64
+  store i64 %arg, ptr %i
+  br i1 %cmp, label %bb1, label %bb4
+
+bb1:                                              ; preds = %bb
+  br i1 %cmp, label %bb8, label %bb2
+
+bb2:                                              ; preds = %bb1
+  br i1 %cmp, label %bb8, label %bb3
+
+bb3:                                              ; preds = %bb2
+  br i1 %cmp, label %bb8, label %bb6
+
+bb4:                                              ; preds = %bb
+  br i1 %cmp, label %bb8, label %bb5
+
+bb5:                                              ; preds = %bb4
+  br i1 %cmp, label %bb8, label %bb6
+
+bb6:                                              ; preds = %bb5, %bb3
+  %i7 = phi ptr [ %i, %bb3 ], [ %i, %bb5 ]
+  br label %bb8
+
+bb8:                                              ; preds = %bb6, %bb5, %bb4, %bb3, %bb2, %bb1
+  %i9 = phi ptr [ %i, %bb1 ], [ %i, %bb2 ], [ %i, %bb3 ], [ %i, %bb4 ], [ %i, %bb5 ], [ %i7, %bb6 ]
+  %ret = load i64, ptr %i9
+  ret i64 %ret
+}
diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll
index 6402a23157729..3babdd8173a21 100644
--- a/llvm/test/Transforms/GVN/condprop.ll
+++ b/llvm/test/Transforms/GVN/condprop.ll
@@ -804,5 +804,200 @@ bb5:
   br label %bb5
 }
 
+define void @select_same_obj(i1 %c, ptr %p, i64 %x) {
+; CHECK-LABEL: @select_same_obj(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[X:%.*]]
+; CHECK-NEXT:    [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P]], ptr [[P2]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    ret void
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p2 = getelementptr i8, ptr %p, i64 %x
+  %p3 = select i1 %c, ptr %p, ptr %p2
+  %cmp = icmp eq ptr %p, %p3
+  br i1 %cmp, label %if, label %exit
+
+if:
+  call void @use_ptr(ptr %p)
+  call void @use_ptr(ptr %p3)
+  ret void
+
+exit:
+  ret void
+}
+
+define void @select_different_obj(i1 %c, ptr %p, ptr %p2) {
+; CHECK-LABEL: @select_different_obj(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
+; CHECK-NEXT:    ret void
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p3 = select i1 %c, ptr %p, ptr %p2
+  %cmp = icmp eq ptr %p, %p3
+  br i1 %cmp, label %if, label %exit
+
+if:
+  call void @use_ptr(ptr %p)
+  call void @use_ptr(ptr %p3)
+  ret void
+
+exit:
+  ret void
+}
+
+define void @select_same_obj_is_select(i1 %c, ptr %p, ptr %p2, i64 %x) {
+; CHECK-LABEL: @select_same_obj_is_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr i8, ptr [[P3]], i64 [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P3]], [[P4]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
+; CHECK-NEXT:    ret void
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p3 = select i1 %c, ptr %p, ptr %p2
+  %p4 = getelementptr i8, ptr %p3, i64 %x
+  %cmp = icmp eq ptr %p3, %p4
+  br i1 %cmp, label %if, label %exit
+
+if:
+  call void @use_ptr(ptr %p3)
+  call void @use_ptr(ptr %p4)
+  ret void
+
+exit:
+  ret void
+}
+
+define void @phi_same_obj(i1 %c, ptr %p, i64 %x) {
+; CHECK-LABEL: @phi_same_obj(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[X:%.*]]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF2:%.*]], label [[EXIT:%.*]]
+; CHECK:       if2:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    ret void
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p2 = getelementptr i8, ptr %p, i64 %x
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %p3 = phi ptr [ %p, %if ], [ %p2, %entry ]
+  %cmp = icmp eq ptr %p, %p3
+  br i1 %cmp, label %if2, label %exit
+
+if2:
+  call void @use_ptr(ptr %p)
+  call void @use_ptr(ptr %p3)
+  ret void
+
+exit:
+  ret void
+}
+
+define void @phi_different_obj(i1 %c, ptr %p, ptr %p2) {
+; CHECK-LABEL: @phi_different_obj(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[P3:%.*]] = phi ptr [ [[P:%.*]], [[IF]] ], [ [[P2:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF2:%.*]], label [[EXIT:%.*]]
+; CHECK:       if2:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
+; CHECK-NEXT:    ret void
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %p3 = phi ptr [ %p, %if ], [ %p2, %entry ]
+  %cmp = icmp eq ptr %p, %p3
+  br i1 %cmp, label %if2, label %exit
+
+if2:
+  call void @use_ptr(ptr %p)
+  call void @use_ptr(ptr %p3)
+  ret void
+
+exit:
+  ret void
+}
+
+define void @phi_same_obj_cycle(i1 %c, ptr %p, i64 %x) {
+; CHECK-LABEL: @phi_same_obj_cycle(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[P_IV:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[P_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[P_NEXT]] = getelementptr i8, ptr [[P_IV]], i64 [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P_IV]], [[P]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[LOOP_LATCH]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %p.iv = phi ptr [ %p, %entry ], [ %p.next, %loop.latch ]
+  %p.next = getelementptr i8, ptr %p.iv, i64 %x
+  %cmp = icmp eq ptr %p.iv, %p
+  br i1 %cmp, label %if, label %loop.latch
+
+if:
+  call void @use_ptr(ptr %p.iv)
+  call void @use_ptr(ptr %p)
+  br label %loop.latch
+
+loop.latch:
+  br label %loop
+}
+
 declare void @use_bool(i1)
 declare void @use_ptr(ptr)
diff --git a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll b/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll
deleted file mode 100644
index e352900e2a458..0000000000000
--- a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=globalopt < %s | FileCheck %s
-
-@m64 = internal global <1 x i64> zeroinitializer
-
-define i32 @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT:    ret i32 0
-;
-  %temp = load x86_mmx, ptr @m64
-  ret i32 0
-}
diff --git a/llvm/test/Transforms/HotColdSplit/pr40056.ll b/llvm/test/Transforms/HotColdSplit/pr40056.ll
new file mode 100644
index 0000000000000..950b62c673fbf
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/pr40056.ll
@@ -0,0 +1,72 @@
+; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s
+; Hot cold splitting should not outline:
+; 1. Basic blocks with token type instructions
+; 2. Functions with scoped EH personality
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.0"
+
+; CHECK-LABEL: define {{.*}}@with_funclet
+; CHECK-NOT: with_funclet.cold
+define void @with_funclet() personality ptr @__CxxFrameHandler3 {
+entry:
+  invoke void @fYAXXZ()
+          to label %normal unwind label %exception
+
+normal:                                           ; preds = %entry
+  ret void
+
+exception:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  call void @terminateYAXXZ() [ "funclet"(token %0) ]
+  br label %continueexception
+
+continueexception:                                ; preds = %exception
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@with_personality
+; CHECK-NOT: with_personality.cold
+define void @with_personality(i32 %cond) personality ptr @__CxxFrameHandler3 {
+entry:
+  %cond.addr = alloca i32
+  store i32 %cond, ptr %cond.addr
+  %0 = load i32, ptr %cond.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end2
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, ptr %cond.addr
+  %cmp = icmp sgt i32 %1, 10
+  br i1 %cmp, label %if.then1, label %if.else
+
+if.then1:                                         ; preds = %if.then
+  call void @sideeffect(i32 0)
+  br label %if.end
+
+if.else:                                          ; preds = %if.then
+  call void @sideeffect(i32 1)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then1
+  call void (...) @sink()
+  ret void
+
+if.end2:                                          ; preds = %entry
+  call void @sideeffect(i32 2)
+  ret void
+}
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare void @fYAXXZ()
+
+declare void @bar() #0
+
+declare void @terminateYAXXZ()
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) #0
+
+attributes #0 = { cold }
diff --git a/llvm/test/Transforms/IRCE/wide_indvar.ll b/llvm/test/Transforms/IRCE/wide_indvar.ll
index ecb13adb61fdf..b9be8ae841735 100644
--- a/llvm/test/Transforms/IRCE/wide_indvar.ll
+++ b/llvm/test/Transforms/IRCE/wide_indvar.ll
@@ -94,7 +94,7 @@ define i32 @test_increasing_slt_slt_wide_simple_postloop() {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], 100
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP0:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP0:![0-9]+]], !loop_constrainer.loop.clone [[META5:![0-9]+]]
 ;
 
 entry:
@@ -175,7 +175,7 @@ define i32 @test_increasing_slt_slt_wide_non-negative(ptr %n_ptr, ptr %m_ptr) {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -268,7 +268,7 @@ define i32 @test_increasing_slt_slt_wide_general(ptr %n_ptr, ptr %m_ptr) {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -367,7 +367,7 @@ define i32 @test_increasing_slt_slt_wide_general_preloop(ptr %n_ptr, ptr %m_ptr)
 ; CHECK-NEXT:    [[LATCH_COND_PRELOOP:%.*]] = icmp slt i32 [[NARROW_IV_PRELOOP]], [[N]]
 ; CHECK-NEXT:    [[WIDE_NARROW_IV_PRELOOP:%.*]] = sext i32 [[NARROW_IV_PRELOOP]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt i64 [[WIDE_NARROW_IV_PRELOOP]], [[EXIT_PRELOOP_AT]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR]], !llvm.loop [[LOOP10:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[TMP9]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR]], !llvm.loop [[LOOP10:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ; CHECK:       preloop.exit.selector:
 ; CHECK-NEXT:    [[IV_NEXT_PRELOOP_LCSSA:%.*]] = phi i64 [ [[IV_NEXT_PRELOOP]], [[BACKEDGE_PRELOOP]] ]
 ; CHECK-NEXT:    [[NARROW_IV_PRELOOP_LCSSA]] = phi i32 [ [[NARROW_IV_PRELOOP]], [[BACKEDGE_PRELOOP]] ]
@@ -389,7 +389,7 @@ define i32 @test_increasing_slt_slt_wide_general_preloop(ptr %n_ptr, ptr %m_ptr)
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -519,7 +519,7 @@ define i32 @test_increasing_slt_slt_wide_multiple_checks(ptr %n_ptr, ptr %m1_ptr
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -688,7 +688,7 @@ define i32 @test_increasing_ult_ult_wide_simple_postloop() {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], 100
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -769,7 +769,7 @@ define i32 @test_increasing_ult_ult_wide_non-negative(ptr %n_ptr, ptr %m_ptr) {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -859,7 +859,7 @@ define i32 @test_increasing_ult_ult_wide_general(ptr %n_ptr, ptr %m_ptr) {
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
@@ -980,7 +980,7 @@ define i32 @test_increasing_ult_ult_wide_multiple_checks(ptr %n_ptr, ptr %m1_ptr
 ; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
 ; CHECK-NEXT:    [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
 ; CHECK-NEXT:    [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT:    br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
 ;
 
 entry:
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index 456155d7e4437..3b914dc29ca41 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -717,6 +717,15 @@ declare float @modff(float, ptr)
 ; CHECK: declare x86_fp80 @modfl(x86_fp80, ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
 declare x86_fp80 @modfl(x86_fp80, ptr)
 
+; CHECK: declare double @nan(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
+declare double @nan(ptr)
+
+; CHECK: declare float @nanf(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
+declare float @nanf(ptr)
+
+; CHECK: declare x86_fp80 @nanl(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
+declare x86_fp80 @nanl(ptr)
+
 ; CHECK: declare double @nearbyint(double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
 declare double @nearbyint(double)
 
@@ -800,9 +809,27 @@ declare ptr @vec_realloc(ptr, i64)
 ; CHECK: declare noundef ptr @realpath(ptr nocapture noundef readonly, ptr noundef) [[NOFREE_NOUNWIND]]
 declare ptr @realpath(ptr, ptr)
 
+; CHECK: declare double @remainder(double, double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare double @remainder(double, double)
+
+; CHECK: declare float @remainderf(float, float) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare float @remainderf(float, float)
+
+; CHECK: declare x86_fp80 @remainderl(x86_fp80, x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare x86_fp80 @remainderl(x86_fp80, x86_fp80)
+
 ; CHECK: declare noundef i32 @remove(ptr nocapture noundef readonly) [[NOFREE_NOUNWIND]]
 declare i32 @remove(ptr)
 
+; CHECK: declare double @remquo(double, double, ptr nocapture) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare double @remquo(double, double, ptr)
+
+; CHECK: declare float @remquof(float, float, ptr nocapture) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare float @remquof(float, float, ptr)
+
+; CHECK: declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr nocapture) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
+declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr)
+
 ; CHECK: declare noundef i32 @rename(ptr nocapture noundef readonly, ptr nocapture noundef readonly) [[NOFREE_NOUNWIND]]
 declare i32 @rename(ptr, ptr)
 
@@ -938,7 +965,7 @@ declare ptr @strncpy(ptr, ptr, i64)
 ; CHECK: declare noalias ptr @strndup(ptr nocapture readonly, i64 noundef) [[INACCESSIBLEMEMORARGONLY_NOFREE_NOUNWIND_WILLRETURN_FAMILY_MALLOC]]
 declare ptr @strndup(ptr, i64)
 
-; CHECK: declare i64 @strnlen(ptr nocapture, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
+; CHECK: declare i64 @strnlen(ptr nocapture, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
 declare i64 @strnlen(ptr, i64)
 
 ; CHECK: declare ptr @strpbrk(ptr, ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
@@ -1070,6 +1097,8 @@ declare i32 @vsscanf(ptr, ptr, ptr)
 ; CHECK: declare noundef i64 @write(i32 noundef, ptr nocapture noundef readonly, i64 noundef) [[NOFREE]]
 declare i64 @write(i32, ptr, i64)
 
+; CHECK: declare void @abort() [[NOFREE_COLD:#[0-9]+]]
+declare void @abort()
 
 ; memset_pattern{4,8,16} aren't available everywhere.
 ; CHECK-DARWIN: declare void @memset_pattern4(ptr nocapture writeonly, ptr nocapture readonly, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_WILLRETURN]]
@@ -1096,6 +1125,7 @@ declare void @memset_pattern16(ptr, ptr, i64)
 ; CHECK-DAG: attributes [[ARGMEMONLY_NOFREE_NOUNWIND]] = { nofree nounwind memory(argmem: readwrite) }
 ; CHECK-DAG: attributes [[INACCESSIBLEMEMORARGMEMONLY_NOUNWIND_WILLRETURN_ALLOCKIND_REALLOC_ALLOCSIZE1_FAMILY_MALLOC]] = { mustprogress nounwind willreturn allockind("realloc") allocsize(1) memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" }
 ; CHECK-DAG: attributes [[INACCESSIBLEMEMORARGONLY_NOFREE_NOUNWIND_WILLRETURN_FAMILY_MALLOC]] = { mustprogress nofree nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" }
+; CHECK-DAG: attributes [[NOFREE_COLD]] = { cold nofree }
 
 ; CHECK-NVPTX-DAG: attributes [[NOFREE_NOUNWIND_READNONE]] = { nofree nosync nounwind memory(none) }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 59118a172a2bc..9cb79b2644865 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -5608,8 +5608,7 @@ declare float @llvm.amdgcn.trig.preop.f32(float, i32)
 
 define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_constfold_variable_undef_arg(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000000
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 %arg)
   ret double %val
@@ -5617,8 +5616,7 @@ define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
 
 define double @trig_preop_constfold_variable_poison_arg(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_constfold_variable_poison_arg(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double poison
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 %arg)
   ret double %val
@@ -5635,8 +5633,7 @@ define double @trig_preop_constfold_variable_arg_undef(double %arg) {
 
 define double @trig_preop_constfold_variable_arg_poison(double %arg) {
 ; CHECK-LABEL: @trig_preop_constfold_variable_arg_poison(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 poison)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double poison
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 poison)
   ret double %val
@@ -5653,8 +5650,7 @@ define double @trig_preop_constfold_variable_int(i32 %arg) {
 
 define double @trig_preop_qnan(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_qnan(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000000
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 %arg)
   ret double %val
@@ -5662,8 +5658,7 @@ define double @trig_preop_qnan(i32 %arg) {
 
 define double @trig_preop_snan(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_snan(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000001
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 %arg)
   ret double %val
@@ -5671,8 +5666,7 @@ define double @trig_preop_snan(i32 %arg) {
 
 define double @trig_preop_inf_0() {
 ; CHECK-LABEL: @trig_preop_inf_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0xB43DD63F5F2F8BD
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
   ret double %val
@@ -5680,8 +5674,7 @@ define double @trig_preop_inf_0() {
 
 define double @trig_preop_ninf_0() {
 ; CHECK-LABEL: @trig_preop_ninf_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0xB43DD63F5F2F8BD
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
   ret double %val
@@ -5707,10 +5700,36 @@ define double @trig_preop_variable_args(double %arg0, i32 %arg1) {
 
 define double @trig_preop_constfold() {
 ; CHECK-LABEL: @trig_preop_constfold(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 2)
+  ret double %val
+}
+
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 22)
+  ret double %val
+}
+
+; Only use src1[4:0], so segment is actually 31 for -1.
+define double @trig_preop_constfold_neg1_segment() {
+; CHECK-LABEL: @trig_preop_constfold_neg1_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -1)
+  ret double %val
+}
+
+; Only use src1[4:0], so segment is actually 0 for -32.
+define double @trig_preop_constfold_neg32_segment() {
+; CHECK-LABEL: @trig_preop_constfold_neg32_segment(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -32)
   ret double %val
 }
 
@@ -5723,84 +5742,234 @@ define double @trig_preop_constfold_strictfp() strictfp {
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__0() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__1(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 1)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissa1__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__neg1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg1(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -1)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissaX__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x0004A7F09D5F47D4, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__9999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__9999999(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 9999999)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 9999999)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__neg999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg999999(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -999999)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -999999)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x0020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x0020000000000000_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x10000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x0010000000000000, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x00094A6EE06DB14A, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x001fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x001fffffffffffff_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000fffffffffffff, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 22)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x8020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x8020000000000000_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 22)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x801fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x801fffffffffffff_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x801FFFFFFFFFFFFF, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000A6EE06DB14ACC, i32 22)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment0(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 0)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa1__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment1(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 0)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissaX__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment1(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6471B791D6398353, i32 0)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 2)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 2)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647272117E2EF7E4, i32 2)
+  ret double %val
+}
+
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 12)
+  ret double %val
+}
+
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 12)
+  ret double %val
+}
+
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647181272117E2EF, i32 12)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 0)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissa1__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 0)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissax__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissax__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B074F463F669E5F, i32 0)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment2(
+; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 2)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 2)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0A2F8BD9E839CE, i32 2)
+  ret double %val
+}
+
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 5)
+  ret double %val
+}
+
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 5)
+  ret double %val
+}
+
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x801fffffffffffff, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0A98F2F8BD9E83, i32 5)
   ret double %val
 }
 
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 63114288fc581..04bba79aada0a 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -7,12 +7,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; DemandedBits - MOVMSK zeros the upper bits of the result.
 ;
 
-define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_upper_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, 255
   ret i32 %2
 }
@@ -87,11 +87,11 @@ define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; DemandedBits - If we don't use the lower bits then we just return zero.
 ;
 
-define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_lower_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, -256
   ret i32 %2
 }
@@ -151,7 +151,7 @@ define i32 @undef_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> undef)
   ret i32 %1
 }
 
@@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() {
 ; Constant Folding (ZERO -> ZERO)
 ;
 
-define i32 @zero_x86_mmx_pmovmskb() {
-; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
-; CHECK-NEXT:    ret i32 [[TMP1]]
-;
-  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
-  ret i32 %2
-}
-
 define i32 @zero_x86_sse_movmsk_ps() {
 ; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
 ; CHECK-NEXT:    ret i32 0
@@ -271,11 +261,11 @@ define i32 @zero_x86_avx2_pmovmskb() {
 
 define i32 @fold_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> <i8 0, i8 -1, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 0> to x86_mmx))
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> <i64 18084223940296448>)
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
-  %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+  %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to <1 x i64>
+  %2 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %1)
   ret i32 %2
 }
 
@@ -447,7 +437,7 @@ define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
   ret i32 %r
 }
 
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>)
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
index b6740374c0791..f7c639f1d8e6b 100644
--- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
+++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
@@ -800,7 +800,7 @@ define i32 @sub_abs_wrong_pred(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
 ; CHECK-NEXT:    br label [[COND_END]]
 ; CHECK:       cond.end:
diff --git a/llvm/test/Transforms/InstCombine/ashr-demand.ll b/llvm/test/Transforms/InstCombine/ashr-demand.ll
index 2b5ccf0626dd7..a0e2af93b809b 100644
--- a/llvm/test/Transforms/InstCombine/ashr-demand.ll
+++ b/llvm/test/Transforms/InstCombine/ashr-demand.ll
@@ -52,3 +52,36 @@ define <2 x i32> @srem2_ashr_mask_vector_nonconstant(<2 x i32> %a0, <2 x i32> %a
   %mask = and <2 x i32> %ashr, <i32 2, i32 2>
   ret <2 x i32> %mask
 }
+
+
+; If it does not matter if we do ashr or lshr, then we canonicalize to lshr.
+
+define i16 @ashr_can_be_lshr(i32 %a) {
+; CHECK-LABEL: @ashr_can_be_lshr(
+; CHECK-NEXT:    [[ASHR:%.*]] = lshr exact i32 [[A:%.*]], 16
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i32 [[ASHR]] to i16
+; CHECK-NEXT:    ret i16 [[TRUNC]]
+;
+  %ashr = ashr exact i32 %a, 16
+  %trunc = trunc nsw i32 %ashr to i16
+  ret i16 %trunc
+}
+
+; Historically SimplifyDemandedUseBits skipped replacing ashr with lshr here
+; due to known sign bits analysis indicating that %ashr had more than 33 sign
+; bits. It does however seem weird not to always canonicalize to lshr when
+; possible, and in this case rewriting into lshr would trigger further
+; optimizations.
+define i32 @ashr_can_be_lshr_2(i32 %a) {
+; CHECK-LABEL: @ashr_can_be_lshr_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[A:%.*]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], -67108864
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %ext = zext i32 %a to i64
+  %or = or i64 %ext, 4278190080
+  %shl = shl i64 %or, 34
+  %ashr = ashr exact i64 %shl, 32
+  %trunc = trunc nsw i64 %ashr to i32
+  ret i32 %trunc
+}
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
index 38a7391a1a1e3..d4ec9e3aae679 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
 
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT:    ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
-
 ; FP source is ok.
 
 define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
@@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
 
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT:    [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT:    ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 8b8325b147263..f787b3c4cc9ac 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
 
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT:    ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
 
 ; FP source is ok.
 
@@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
 
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT:    [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT:    ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll
index 5599604b666fb..26047f2c899a3 100644
--- a/llvm/test/Transforms/InstCombine/bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast.ll
@@ -711,3 +711,171 @@ define ptr @select_bitcast_unsized_pointer(i1 %c) {
   %s = select i1 %c, ptr @f1, ptr @f2
   ret ptr %s
 }
+
+define float @copysign_idiom_constant(float %x) {
+; CHECK-LABEL: @copysign_idiom_constant(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]])
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %sign, 1065353216
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define float @copysign_idiom(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom(
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %cond = icmp sgt i32 %mag, -1
+  call void @llvm.assume(i1 %cond)
+
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %sign, %mag
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define float @copysign_idiom_commuted(float %x, i32 %magx) {
+; CHECK-LABEL: @copysign_idiom_commuted(
+; CHECK-NEXT:    [[MAG:%.*]] = add i32 [[MAGX:%.*]], -1
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[MAG]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %mag = add i32 %magx, -1 ; thwart complexity-based canonicalization
+  %cond = icmp sgt i32 %mag, -1
+  call void @llvm.assume(i1 %cond)
+
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %mag, %sign
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define float @copysign_idiom_abs(float %x, float %mag) {
+; CHECK-LABEL: @copysign_idiom_abs(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.copysign.f32(float [[MAG:%.*]], float [[X:%.*]])
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %abs = call float @llvm.fabs.f32(float %mag)
+  %absbits = bitcast float %abs to i32
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %sign, %absbits
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define double @copysign_idiom_f64(double %x, i64 %mag) {
+; CHECK-LABEL: @copysign_idiom_f64(
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[MAG:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[MAG]] to double
+; CHECK-NEXT:    [[Y:%.*]] = call double @llvm.copysign.f64(double [[TMP1]], double [[X:%.*]])
+; CHECK-NEXT:    ret double [[Y]]
+;
+  %cond = icmp sgt i64 %mag, -1
+  call void @llvm.assume(i1 %cond)
+
+  %bits = bitcast double %x to i64
+  %sign = and i64 %bits, -9223372036854775808
+  %res = or i64 %sign, %mag
+  %y = bitcast i64 %res to double
+  ret double %y
+}
+
+define <2 x float> @copysign_idiom_vec(<2 x float> %x) {
+; CHECK-LABEL: @copysign_idiom_vec(
+; CHECK-NEXT:    [[Y:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x float> [[Y]]
+;
+  %bits = bitcast <2 x float> %x to <2 x i32>
+  %sign = and <2 x i32> %bits, splat(i32 -2147483648)
+  %res = or <2 x i32> %sign, splat(i32 1065353216)
+  %y = bitcast <2 x i32> %res to <2 x float>
+  ret <2 x float> %y
+}
+
+; negative tests
+
+define float @copysign_idiom_without_nneg(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_without_nneg(
+; CHECK-NEXT:    [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT:    [[SIGN:%.*]] = and i32 [[BITS]], -2147483648
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[SIGN]], [[MAG:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %sign, %mag
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define float @copysign_idiom_not_signmask(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_not_signmask(
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT:    [[SIGN:%.*]] = and i32 [[BITS]], -2147483647
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[SIGN]], [[MAG]]
+; CHECK-NEXT:    [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %cond = icmp sgt i32 %mag, -1
+  call void @llvm.assume(i1 %cond)
+
+  %bits = bitcast float %x to i32
+  %sign = and i32 %bits, -2147483647
+  %res = or i32 %sign, %mag
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define float @copysign_idiom_constant_wrong_type1(<1 x i32> %x) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type1(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i32> [[X:%.*]], i64 0
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %bits = bitcast <1 x i32> %x to i32
+  %cond = icmp sgt i32 %bits, -1
+  call void @llvm.assume(i1 %cond)
+
+  %sign = and i32 %bits, -2147483648
+  %res = or i32 %sign, 1065353216
+  %y = bitcast i32 %res to float
+  ret float %y
+}
+
+define half @copysign_idiom_constant_wrong_type2(bfloat %x, i16 %mag) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type2(
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i16 [[MAG:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[BITS:%.*]] = bitcast bfloat [[X:%.*]] to i16
+; CHECK-NEXT:    [[SIGN:%.*]] = and i16 [[BITS]], -32768
+; CHECK-NEXT:    [[RES:%.*]] = or disjoint i16 [[SIGN]], [[MAG]]
+; CHECK-NEXT:    [[Y:%.*]] = bitcast i16 [[RES]] to half
+; CHECK-NEXT:    ret half [[Y]]
+;
+  %cond = icmp sgt i16 %mag, -1
+  call void @llvm.assume(i1 %cond)
+
+  %bits = bitcast bfloat %x to i16
+  %sign = and i16 %bits, -32768
+  %res = or i16 %sign, %mag
+  %y = bitcast i16 %res to half
+  ret half %y
+}
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
index 6cee7bb650fd3..564e82945c13f 100644
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -937,27 +937,6 @@ define float @test2c() {
   ret float extractelement (<2 x float> bitcast (double bitcast (<2 x float> <float -1.000000e+00, float -1.000000e+00> to double) to <2 x float>), i32 0)
 }
 
-define i64 @test_mmx(<2 x i32> %x) {
-; ALL-LABEL: @test_mmx(
-; ALL-NEXT:    [[C:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
-; ALL-NEXT:    ret i64 [[C]]
-;
-  %A = bitcast <2 x i32> %x to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
-define i64 @test_mmx_const(<2 x i32> %c) {
-; ALL-LABEL: @test_mmx_const(
-; ALL-NEXT:    ret i64 0
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
 ; PR12514
 define i1 @test67(i1 %a, i32 %b) {
 ; ALL-LABEL: @test67(
diff --git a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
index 7facdaf7590d3..4ef1ed0ec4976 100644
--- a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
@@ -60,7 +60,7 @@ define i8 @ctpop_imin_plus1_lshr_nz(i8 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
 ; CHECK-NEXT:    [[V:%.*]] = lshr i8 -127, [[X]]
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]])
+; CHECK-NEXT:    [[CNT:%.*]] = call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[V]])
 ; CHECK-NEXT:    ret i8 [[CNT]]
 ;
   %cmp = icmp ne i8 %x, 0
@@ -104,7 +104,7 @@ define <2 x i32> @ctpop_lshr_intmin_intmin_plus1_vec_nz(<2 x i32> %x) {
 ; CHECK-LABEL: @ctpop_lshr_intmin_intmin_plus1_vec_nz(
 ; CHECK-NEXT:    [[X1:%.*]] = or <2 x i32> [[X:%.*]], <i32 1, i32 1>
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> <i32 -2147483648, i32 -2147483647>, [[X1]]
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 0, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 1, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
 ; CHECK-NEXT:    ret <2 x i32> [[CNT]]
 ;
   %x1 = or <2 x i32> %x, <i32 1 ,i32 1>
diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll
index 83700e72de080..940bb868c4c61 100644
--- a/llvm/test/Transforms/InstCombine/ctpop.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop.ll
@@ -169,8 +169,8 @@ define <2 x i32> @_parity_of_not_poison(<2 x i32> %x) {
 
 define <2 x i32> @_parity_of_not_poison2(<2 x i32> %x) {
 ; CHECK-LABEL: @_parity_of_not_poison2(
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i32> [[CNT]], <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i32> [[TMP1]], <i32 1, i32 poison>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = xor <2 x i32> %x, <i32 -1 ,i32 -1>
@@ -485,3 +485,21 @@ define i32 @select_ctpop_zero(i32 %x) {
   %res = select i1 %cmp, i32 0, i32 %ctpop
   ret i32 %res
 }
+
+define i32 @ctpop_non_zero(i32 range(i32 1, 255) %x) {
+; CHECK-LABEL: @ctpop_non_zero(
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 9) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[CTPOP]]
+;
+  %ctpop = call i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %ctpop
+}
+
+define i32 @ctpop_non_zero_with_existing_range_attr(i32 range(i32 1, 255) %x) {
+; CHECK-LABEL: @ctpop_non_zero_with_existing_range_attr(
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 9) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[CTPOP]]
+;
+  %ctpop = call range(i32 0, 9) i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %ctpop
+}
diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll
index b88250d434280..4590bbb7abaeb 100644
--- a/llvm/test/Transforms/InstCombine/fma.ll
+++ b/llvm/test/Transforms/InstCombine/fma.ll
@@ -856,3 +856,68 @@ define <2 x float> @fma_unary_shuffle_ops_uses(<2 x float> %x, <2 x float> %y, <
   %r = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
   ret <2 x float> %r
 }
+
+define half @fma_negone(half %x, half %y) {
+; CHECK-LABEL: @fma_negone(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub half [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret half [[SUB]]
+;
+  %sub = call half @llvm.fma.f16(half %x, half -1.0, half %y)
+  ret half %sub
+}
+
+define half @fmuladd_negone(half %x, half %y) {
+; CHECK-LABEL: @fmuladd_negone(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub half [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret half [[SUB]]
+;
+  %sub = call half @llvm.fmuladd.f16(half %x, half -1.0, half %y)
+  ret half %sub
+}
+
+define half @fma_negone_fmf(half %x, half %y) {
+; CHECK-LABEL: @fma_negone_fmf(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub nnan ninf nsz half [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret half [[SUB]]
+;
+  %sub = call nnan ninf nsz half @llvm.fma.f16(half %x, half -1.0, half %y)
+  ret half %sub
+}
+
+define half @fmuladd_negone_fmf(half %x, half %y) {
+; CHECK-LABEL: @fmuladd_negone_fmf(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub nnan ninf nsz half [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret half [[SUB]]
+;
+  %sub = call nnan ninf nsz half @llvm.fmuladd.f16(half %x, half -1.0, half %y)
+  ret half %sub
+}
+
+define <2 x half> @fma_negone_vec(<2 x half> %x, <2 x half> %y) {
+; CHECK-LABEL: @fma_negone_vec(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub <2 x half> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x half> [[SUB]]
+;
+  %sub = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> splat(half -1.0), <2 x half> %y)
+  ret <2 x half> %sub
+}
+
+define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
+; CHECK-LABEL: @fma_negone_vec_partial_undef(
+; CHECK-NEXT:    [[SUB:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[X:%.*]], <2 x half> <half undef, half 0xHBC00>, <2 x half> [[Y:%.*]])
+; CHECK-NEXT:    ret <2 x half> [[SUB]]
+;
+  %sub = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> <half undef, half -1.0>, <2 x half> %y)
+  ret <2 x half> %sub
+}
+
+; negative tests
+
+define half @fma_non_negone(half %x, half %y) {
+; CHECK-LABEL: @fma_non_negone(
+; CHECK-NEXT:    [[SUB:%.*]] = call half @llvm.fma.f16(half [[X:%.*]], half 0xHBE00, half [[Y:%.*]])
+; CHECK-NEXT:    ret half [[SUB]]
+;
+  %sub = call half @llvm.fma.f16(half %x, half -1.5, half %y)
+  ret half %sub
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
index e9ec6b415d462..618f5d641dc1a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
@@ -306,7 +306,7 @@ define i32 @pow2_32_nonconst_assume(i32 %x, i32 %y) {
 
 define i32 @pow2_32_gtnonconst_assume(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pow2_32_gtnonconst_assume(
-; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
 ; CHECK-NEXT:    [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    [[YGT:%.*]] = icmp ugt i32 [[Y]], [[X:%.*]]
@@ -513,7 +513,7 @@ define i32 @maybe_pow2_32_noncont(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[YGT8:%.*]] = icmp ugt i32 [[Y:%.*]], 8
 ; CHECK-NEXT:    br i1 [[YGT8]], label [[CONT1:%.*]], label [[CONT2:%.*]]
 ; CHECK:       Cont1:
-; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
 ; CHECK-NEXT:    [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    br i1 true, label [[CONT2]], label [[FALSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll
index a143b1347ccee..3f2c31d05f3ed 100644
--- a/llvm/test/Transforms/InstCombine/ispow2.ll
+++ b/llvm/test/Transforms/InstCombine/ispow2.ll
@@ -197,7 +197,7 @@ define i1 @is_pow2_non_zero_ult_2(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ult_2(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -212,7 +212,7 @@ define i1 @is_pow2_non_zero_eq_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_eq_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -227,7 +227,7 @@ define i1 @is_pow2_non_zero_ugt_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ugt_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -242,7 +242,7 @@ define i1 @is_pow2_non_zero_ne_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ne_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/lib-call-exit.ll b/llvm/test/Transforms/InstCombine/lib-call-exit.ll
new file mode 100644
index 0000000000000..ad133af5ea366
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/lib-call-exit.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+declare void @exit(i32)
+declare void @_Exit(i32)
+
+define void @call_exit_0() {
+; CHECK-LABEL: define void @call_exit_0() {
+; CHECK-NEXT:    call void @exit(i32 0)
+; CHECK-NEXT:    ret void
+;
+  call void @exit(i32 0)
+  ret void
+}
+
+define void @call_exit_1() {
+; CHECK-LABEL: define void @call_exit_1() {
+; CHECK-NEXT:    call void @exit(i32 1) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  call void @exit(i32 1)
+  ret void
+}
+
+define void @call__Exit_m1() {
+; CHECK-LABEL: define void @call__Exit_m1() {
+; CHECK-NEXT:    call void @_Exit(i32 -1) #[[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+  call void @_Exit(i32 -1)
+  ret void
+}
+
+define void @call__Exit_N(i32 %N) {
+; CHECK-LABEL: define void @call__Exit_N(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:    call void @_Exit(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+  call void @_Exit(i32 %N)
+  ret void
+}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { cold }
+;.
diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll
index 7d53c8ee35684..6c087aa87845f 100644
--- a/llvm/test/Transforms/InstCombine/load.ll
+++ b/llvm/test/Transforms/InstCombine/load.ll
@@ -56,6 +56,18 @@ define i32 @test5(i1 %C) {
   ret i32 %Z
 }
 
+; FIXME: Constants should be allowed for this optimization.
+define i32 @test5_asan(i1 %C) sanitize_address {
+; CHECK-LABEL: @test5_asan(
+; CHECK-NEXT:    [[Y:%.*]] = select i1 [[C:%.*]], ptr @X, ptr @X2
+; CHECK-NEXT:    [[Z:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %Y = select i1 %C, ptr @X, ptr @X2		; <ptr> [#uses=1]
+  %Z = load i32, ptr %Y		; <i32> [#uses=1]
+  ret i32 %Z
+}
+
 define i32 @load_gep_null_inbounds(i64 %X) {
 ; CHECK-LABEL: @load_gep_null_inbounds(
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
diff --git a/llvm/test/Transforms/InstCombine/mem-intrinsics.ll b/llvm/test/Transforms/InstCombine/mem-intrinsics.ll
new file mode 100644
index 0000000000000..5448d3c20fb12
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/mem-intrinsics.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define void @memset_null(i64 %len) {
+; CHECK-LABEL: define void @memset_null(
+; CHECK-SAME: i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[LEN]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0.i64(ptr null, i8 0, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_null_ub() {
+; CHECK-LABEL: define void @memset_null_ub() {
+; CHECK-NEXT:    store i64 poison, ptr null, align 4294967296
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0.i64(ptr null, i8 0, i64 8, i1 false)
+  ret void
+}
+
+define void @memcpy_null_src(ptr %dst, i64 %len) {
+; CHECK-LABEL: define void @memcpy_null_src(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[LEN]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.i64(ptr %dst, ptr null, i64 %len, i1 false)
+  ret void
+}
+
+define void @memmove_null_src(ptr %dst, i64 %len) {
+; CHECK-LABEL: define void @memmove_null_src(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[LEN]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.p0.i64(ptr %dst, ptr null, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_element_atomic(i64 %len) {
+; CHECK-LABEL: define void @memset_element_atomic(
+; CHECK-SAME: i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[LEN]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 1 null, i8 0, i64 %len, i32 1)
+  ret void
+}
+
+; negative tests
+
+define void @memset_null_volatile(i64 %len) {
+; CHECK-LABEL: define void @memset_null_volatile(
+; CHECK-SAME: i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr null, i8 0, i64 [[LEN]], i1 true)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0.i64(ptr null, i8 0, i64 %len, i1 true)
+  ret void
+}
+
+define void @memset_null_is_defined(i64 %len) null_pointer_is_valid {
+; CHECK-LABEL: define void @memset_null_is_defined(
+; CHECK-SAME: i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4294967296 null, i8 0, i64 [[LEN]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0.i64(ptr null, i8 0, i64 %len, i1 false)
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/mempcpy.ll b/llvm/test/Transforms/InstCombine/mempcpy.ll
index c996758a919d9..82c34f8a864ce 100644
--- a/llvm/test/Transforms/InstCombine/mempcpy.ll
+++ b/llvm/test/Transforms/InstCombine/mempcpy.ll
@@ -55,7 +55,7 @@ define ptr @memcpy_big_const_n(ptr %d, ptr nocapture readonly %s) {
 
 define i32 @PR48810() {
 ; CHECK-LABEL: @PR48810(
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 undef, ptr align 4294967296 null, i64 undef, i1 false)
+; CHECK-NEXT:    store i1 true, ptr poison, align 1
 ; CHECK-NEXT:    ret i32 undef
 ;
   %r = call dereferenceable(1) ptr @mempcpy(ptr undef, ptr null, i64 undef)
diff --git a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
index a75a6496961b4..38fca0314ae11 100644
--- a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
+++ b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
@@ -7,8 +7,8 @@ define void @PR35618(ptr %st1, ptr %st2) {
 ; CHECK-NEXT:    [[Z1:%.*]] = alloca double, align 8
 ; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1]], align 8
 ; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt double [[LD1]], [[LD2]]
-; CHECK-NEXT:    [[TMP12_V:%.*]] = select i1 [[TMP10]], double [[LD1]], double [[LD2]]
+; CHECK-NEXT:    [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
+; CHECK-NEXT:    [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
 ; CHECK-NEXT:    ret void
@@ -17,8 +17,32 @@ define void @PR35618(ptr %st1, ptr %st2) {
   %z1 = alloca double
   %ld1 = load double, ptr %y1
   %ld2 = load double, ptr %z1
-  %tmp10 = fcmp olt double %ld1, %ld2
-  %sel = select i1 %tmp10, ptr %y1, ptr %z1
+  %tmp = fcmp olt double %ld1, %ld2
+  %sel = select i1 %tmp, ptr %y1, ptr %z1
+  %tmp12 = load i64, ptr %sel
+  store i64 %tmp12, ptr %st1
+  store i64 %tmp12, ptr %st2
+  ret void
+}
+
+define void @PR35618_asan(ptr %st1, ptr %st2) sanitize_address {
+; CHECK-LABEL: @PR35618_asan(
+; CHECK-NEXT:    [[Y1:%.*]] = alloca double, align 8
+; CHECK-NEXT:    [[Z1:%.*]] = alloca double, align 8
+; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1]], align 8
+; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
+; CHECK-NEXT:    [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
+; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %y1 = alloca double
+  %z1 = alloca double
+  %ld1 = load double, ptr %y1
+  %ld2 = load double, ptr %z1
+  %tmp = fcmp olt double %ld1, %ld2
+  %sel = select i1 %tmp, ptr %y1, ptr %z1
   %tmp12 = load i64, ptr %sel
   store i64 %tmp12, ptr %st1
   store i64 %tmp12, ptr %st2
diff --git a/llvm/test/Transforms/InstCombine/nan.ll b/llvm/test/Transforms/InstCombine/nan.ll
new file mode 100644
index 0000000000000..09ebfc715babe
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/nan.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+@empty = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@dec = private unnamed_addr constant [2 x i8] c"1\00", align 1
+@hex = private unnamed_addr constant [4 x i8] c"0xf\00", align 1
+
+define double @nan_empty() {
+; CHECK-LABEL: define double @nan_empty() {
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %res = call double @nan(ptr @empty)
+  ret double %res
+}
+
+define double @nan_dec() {
+; CHECK-LABEL: define double @nan_dec() {
+; CHECK-NEXT:    ret double 0x7FF8000000000001
+;
+  %res = call double @nan(ptr @dec)
+  ret double %res
+}
+
+define double @nan_hex() {
+; CHECK-LABEL: define double @nan_hex() {
+; CHECK-NEXT:    ret double 0x7FF800000000000F
+;
+  %res = call double @nan(ptr @hex)
+  ret double %res
+}
+
+define float @nanf_empty() {
+; CHECK-LABEL: define float @nanf_empty() {
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %res = call float @nanf(ptr @empty)
+  ret float %res
+}
+
+; nagative tests
+
+define double @nan_poison() {
+; CHECK-LABEL: define double @nan_poison() {
+; CHECK-NEXT:    [[RES:%.*]] = call double @nan(ptr poison)
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %res = call double @nan(ptr poison)
+  ret double %res
+}
+
+define double @nan_undef() {
+; CHECK-LABEL: define double @nan_undef() {
+; CHECK-NEXT:    [[RES:%.*]] = call double @nan(ptr undef)
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %res = call double @nan(ptr undef)
+  ret double %res
+}
+
+define double @nan_null() {
+; CHECK-LABEL: define double @nan_null() {
+; CHECK-NEXT:    [[RES:%.*]] = call double @nan(ptr null)
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %res = call double @nan(ptr null)
+  ret double %res
+}
+
+define double @nan_non_constant(ptr %x) {
+; CHECK-LABEL: define double @nan_non_constant(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call double @nan(ptr [[X]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %res = call double @nan(ptr %x)
+  ret double %res
+}
+
+declare float @nanf(ptr)
+declare double @nan(ptr)
diff --git a/llvm/test/Transforms/InstCombine/nanl-fp128.ll b/llvm/test/Transforms/InstCombine/nanl-fp128.ll
new file mode 100644
index 0000000000000..21ba0fb14ca20
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/nanl-fp128.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+@empty = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@dec = private unnamed_addr constant [2 x i8] c"1\00", align 1
+@hex = private unnamed_addr constant [4 x i8] c"0xf\00", align 1
+
+define fp128 @nanl_empty() {
+; CHECK-LABEL: define fp128 @nanl_empty() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000000
+;
+  %res = call fp128 @nanl(ptr @empty)
+  ret fp128 %res
+}
+
+define fp128 @nanl_dec() {
+; CHECK-LABEL: define fp128 @nanl_dec() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000017FFF800000000000
+;
+  %res = call fp128 @nanl(ptr @dec)
+  ret fp128 %res
+}
+
+define fp128 @nanl_hex() {
+; CHECK-LABEL: define fp128 @nanl_hex() {
+; CHECK-NEXT:    ret fp128 0xL000000000000000F7FFF800000000000
+;
+  %res = call fp128 @nanl(ptr @hex)
+  ret fp128 %res
+}
+
+declare fp128 @nanl(ptr)
diff --git a/llvm/test/Transforms/InstCombine/nanl-fp80.ll b/llvm/test/Transforms/InstCombine/nanl-fp80.ll
new file mode 100644
index 0000000000000..7868af3696a56
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/nanl-fp80.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+@empty = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@dec = private unnamed_addr constant [2 x i8] c"1\00", align 1
+@hex = private unnamed_addr constant [4 x i8] c"0xf\00", align 1
+
+define x86_fp80 @nanl_empty() {
+; CHECK-LABEL: define x86_fp80 @nanl_empty() {
+; CHECK-NEXT:    ret x86_fp80 0xK7FFFC000000000000000
+;
+  %res = call x86_fp80 @nanl(ptr @empty)
+  ret x86_fp80 %res
+}
+
+define x86_fp80 @nanl_dec() {
+; CHECK-LABEL: define x86_fp80 @nanl_dec() {
+; CHECK-NEXT:    ret x86_fp80 0xK7FFFC000000000000001
+;
+  %res = call x86_fp80 @nanl(ptr @dec)
+  ret x86_fp80 %res
+}
+
+define x86_fp80 @nanl_hex() {
+; CHECK-LABEL: define x86_fp80 @nanl_hex() {
+; CHECK-NEXT:    ret x86_fp80 0xK7FFFC00000000000000F
+;
+  %res = call x86_fp80 @nanl(ptr @hex)
+  ret x86_fp80 %res
+}
+
+declare x86_fp80 @nanl(ptr)
diff --git a/llvm/test/Transforms/InstCombine/nanl-ppc-fp128.ll b/llvm/test/Transforms/InstCombine/nanl-ppc-fp128.ll
new file mode 100644
index 0000000000000..7f60a379c4885
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/nanl-ppc-fp128.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+@empty = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@dec = private unnamed_addr constant [2 x i8] c"1\00", align 1
+@hex = private unnamed_addr constant [4 x i8] c"0xf\00", align 1
+
+define ppc_fp128 @nanl_empty() {
+; CHECK-LABEL: define ppc_fp128 @nanl_empty() {
+; CHECK-NEXT:    ret ppc_fp128 0xM7FF80000000000000000000000000000
+;
+  %res = call ppc_fp128 @nanl(ptr @empty)
+  ret ppc_fp128 %res
+}
+
+define ppc_fp128 @nanl_dec() {
+; CHECK-LABEL: define ppc_fp128 @nanl_dec() {
+; CHECK-NEXT:    ret ppc_fp128 0xM7FF80000000000010000000000000000
+;
+  %res = call ppc_fp128 @nanl(ptr @dec)
+  ret ppc_fp128 %res
+}
+
+define ppc_fp128 @nanl_hex() {
+; CHECK-LABEL: define ppc_fp128 @nanl_hex() {
+; CHECK-NEXT:    ret ppc_fp128 0xM7FF800000000000F0000000000000000
+;
+  %res = call ppc_fp128 @nanl(ptr @hex)
+  ret ppc_fp128 %res
+}
+
+declare ppc_fp128 @nanl(ptr)
diff --git a/llvm/test/Transforms/InstCombine/pr100298.ll b/llvm/test/Transforms/InstCombine/pr100298.ll
new file mode 100644
index 0000000000000..6cf2a71bb916e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr100298.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+; Make sure that the result of computeKnownBits for %indvar is correct.
+
+define i16 @pr100298() {
+; CHECK-LABEL: define i16 @pr100298() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_INC:.*]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ -15, %[[ENTRY]] ], [ [[MASK:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[INDVAR]], 9
+; CHECK-NEXT:    [[MASK]] = and i32 [[ADD]], 65535
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[MASK]], 5
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_INC]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i16
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[MASK]], 3
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i16 [[CONV]], 14
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP2]], i16 [[CONV]], i16 [[SHL]]
+; CHECK-NEXT:    ret i16 [[RES]]
+;
+entry:
+  br label %for.inc
+
+for.inc:
+  %indvar = phi i32 [ -15, %entry ], [ %mask, %for.inc ]
+  %add = add nsw i32 %indvar, 9
+  %mask = and i32 %add, 65535
+  %cmp1 = icmp ugt i32 %mask, 5
+  br i1 %cmp1, label %for.inc, label %for.end
+
+for.end:
+  %conv = trunc i32 %add to i16
+  %cmp2 = icmp ugt i32 %mask, 3
+  %shl = shl nuw i16 %conv, 14
+  %res = select i1 %cmp2, i16 %conv, i16 %shl
+  ret i16 %res
+}
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index 7c65a93a00436..a1d10c274c465 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -427,6 +427,23 @@ entry:
   ret i8 %load
 }
 
+define i8 @select_diff_addrspace_remove_alloca_asan(i1 %cond, ptr %p) sanitize_address {
+; CHECK-LABEL: @select_diff_addrspace_remove_alloca_asan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP2:%.*]] = select i1 [[COND:%.*]], ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g2, i64 4), ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g2, i64 6)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(1) [[GEP2]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+entry:
+  %alloca = alloca [32 x i8]
+  call void @llvm.memcpy.p0.p1.i64(ptr %alloca, ptr addrspace(1) @g2, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], ptr %alloca, i32 0, i32 2
+  %sel = select i1 %cond, ptr %alloca, ptr %gep
+  %gep2 = getelementptr inbounds i8, ptr %sel, i64 4
+  %load = load i8, ptr %gep2
+  ret i8 %load
+}
+
 declare i8 @readonly_callee(ptr readonly nocapture)
 
 ; FIXME: This should be able to fold to call i8 @readonly_callee(ptr nonnull @g1)
diff --git a/llvm/test/Transforms/InstCombine/remquo.ll b/llvm/test/Transforms/InstCombine/remquo.ll
new file mode 100644
index 0000000000000..b2499ea13179e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/remquo.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define float @remquo_f32(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -2, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_quo_sign(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_quo_sign(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 2, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret float -1.000000e+00
+;
+entry:
+  %call = call float @remquof(float 5.000000e+00, float 3.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_round(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_round(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -6, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret float 0xBFC9999900000000
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float 0x3FE99999A0000000, ptr %quo)
+  ret float %call
+}
+
+define double @remquo_f64(ptr %quo) {
+; CHECK-LABEL: define double @remquo_f64(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -5, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret double -0.000000e+00
+;
+entry:
+  %call = call double @remquo(double -5.000000e+00, double 1.000000e+00, ptr %quo)
+  ret double %call
+}
+
+; Negative tests
+
+define float @remquo_f32_inf_x(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_inf_x(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr [[QUO]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_zero_y(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_zero_y(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_nzero_y(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_nzero_y(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr [[QUO]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_nan_x(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_nan_x(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr [[QUO]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_nan_y(ptr %quo) {
+; CHECK-LABEL: define float @remquo_f32_nan_y(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr [[QUO]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr %quo)
+  ret float %call
+}
+
+define float @remquo_f32_strictfp(ptr %quo) strictfp {
+; CHECK-LABEL: define float @remquo_f32_strictfp(
+; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr [[QUO]]) #[[ATTR0]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo) strictfp
+  ret float %call
+}
+
+define float @remquo_f32_zero_y_strictfp(ptr %quo) strictfp {
+; CHECK-LABEL: define float @remquo_f32_zero_y_strictfp(
+; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]]) #[[ATTR0]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+entry:
+  %call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo) strictfp
+  ret float %call
+}
+
+declare float @remquof(float, float, ptr)
+declare double @remquo(double, double, ptr)
diff --git a/llvm/test/Transforms/InstCombine/remquol-fp128.ll b/llvm/test/Transforms/InstCombine/remquol-fp128.ll
new file mode 100644
index 0000000000000..38e0a6040a140
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/remquol-fp128.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define fp128 @remquo_fp128(ptr %quo) {
+; CHECK-LABEL: define fp128 @remquo_fp128(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -2, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret fp128 0xL00000000000000003FFF000000000000
+;
+entry:
+  %call = call fp128 @remquol(fp128 0xL0000000000000000C001400000000000, fp128 0xL00000000000000004000800000000000, ptr %quo)
+  ret fp128 %call
+}
+
+declare fp128 @remquol(fp128, fp128, ptr)
diff --git a/llvm/test/Transforms/InstCombine/remquol-fp80.ll b/llvm/test/Transforms/InstCombine/remquol-fp80.ll
new file mode 100644
index 0000000000000..fe65ee1acc902
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/remquol-fp80.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define x86_fp80 @remquo_fp80(ptr %quo) {
+; CHECK-LABEL: define x86_fp80 @remquo_fp80(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -2, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret x86_fp80 0xK3FFF8000000000000000
+;
+entry:
+  %call = call x86_fp80 @remquol(x86_fp80 0xKC001A000000000000000, x86_fp80 0xK4000C000000000000000, ptr %quo)
+  ret x86_fp80 %call
+}
+
+declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr)
diff --git a/llvm/test/Transforms/InstCombine/remquol-ppc-fp128.ll b/llvm/test/Transforms/InstCombine/remquol-ppc-fp128.ll
new file mode 100644
index 0000000000000..86dfd01f859ac
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/remquol-ppc-fp128.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define ppc_fp128 @remquo_ppc_fp128(ptr %quo) {
+; CHECK-LABEL: define ppc_fp128 @remquo_ppc_fp128(
+; CHECK-SAME: ptr [[QUO:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 -2, ptr [[QUO]], align 4
+; CHECK-NEXT:    ret ppc_fp128 0xM3FF00000000000000000000000000000
+;
+entry:
+  %call = call ppc_fp128 @remquol(ppc_fp128 0xMC0140000000000000000000000000000, ppc_fp128 0xM40080000000000000000000000000000, ptr %quo)
+  ret ppc_fp128 %call
+}
+
+declare ppc_fp128 @remquol(ppc_fp128, ppc_fp128, ptr)
diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index 1fa0c09a9e987..fb56764598e2d 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -1315,6 +1315,19 @@ define i32 @select_replace_call_speculatable(i32 %x, i32 %y) {
   ret i32 %s
 }
 
+define i32 @select_replace_call_speculatable_intrinsic(i32 %x, i32 %y) {
+; CHECK-LABEL: @select_replace_call_speculatable_intrinsic(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @llvm.smax.i32(i32 [[Y:%.*]], i32 0)
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 [[CALL]], i32 [[Y]]
+; CHECK-NEXT:    ret i32 [[S]]
+;
+  %c = icmp eq i32 %x, 0
+  %call = call i32 @llvm.smax.i32(i32 %x, i32 %y)
+  %s = select i1 %c, i32 %call, i32 %y
+  ret i32 %s
+}
+
 ; We can't replace the call arguments, as the call is not speculatable. We
 ; may end up changing side-effects or causing undefined behavior.
 define i32 @select_replace_call_non_speculatable(i32 %x, i32 %y) {
diff --git a/llvm/test/Transforms/InstCombine/select-load.ll b/llvm/test/Transforms/InstCombine/select-load.ll
new file mode 100644
index 0000000000000..36883423aea36
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-load.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+define i32 @test_plain(i1 %f) {
+; CHECK-LABEL: @test_plain(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
+; CHECK-NEXT:    [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %a = alloca i32, align 8
+  %b = alloca i32, align 8
+  %sel = select i1 %f, ptr %a, ptr %b
+  %l = load i32, ptr %sel, align 8
+  ret i32 %l
+}
+
+; Don't speculate as the condition may control which memory is valid from
+; sanitizer perspective.
+define i32 @test_asan(i1 %f) sanitize_address {
+; CHECK-LABEL: @test_asan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %a = alloca i32, align 8
+  %b = alloca i32, align 8
+  %sel = select i1 %f, ptr %a, ptr %b
+  %l = load i32, ptr %sel, align 8
+  ret i32 %l
+}
+
+
+; Don't speculate as the condition may control which memory is valid from
+; sanitizer perspective.
+define i32 @test_hwasan(i1 %f) sanitize_hwaddress {
+; CHECK-LABEL: @test_hwasan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %a = alloca i32, align 8
+  %b = alloca i32, align 8
+  %sel = select i1 %f, ptr %a, ptr %b
+  %l = load i32, ptr %sel, align 8
+  ret i32 %l
+}
+
+; Don't speculate as the condition may control which memory is valid from
+; sanitizer perspective.
+define i32 @test_tsan(i1 %f) sanitize_thread {
+; CHECK-LABEL: @test_tsan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %a = alloca i32, align 8
+  %b = alloca i32, align 8
+  %sel = select i1 %f, ptr %a, ptr %b
+  %l = load i32, ptr %sel, align 8
+  ret i32 %l
+}
+
+; Msan just propagates shadow, even if speculated load accesses uninitialized
+; value, instrumentation will select shadow of the desired value anyway.
+define i32 @test_msan(i1 %f) sanitize_memory {
+; CHECK-LABEL: @test_msan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
+; CHECK-NEXT:    [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %a = alloca i32, align 8
+  %b = alloca i32, align 8
+  %sel = select i1 %f, ptr %a, ptr %b
+  %l = load i32, ptr %sel, align 8
+  ret i32 %l
+}
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index d66ffb9a63ac1..1369be305ec13 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -4713,3 +4713,19 @@ define i8 @select_knownbits_simplify_missing_noundef(i8 %x)  {
   %res = select i1 %cmp, i8 %and, i8 0
   ret i8 %res
 }
+
+@g_ext = external global i8
+
+; Make sure we don't replace %ptr with @g_ext, which may cause the load to trigger UB.
+define i32 @pr99436(ptr align 4 dereferenceable(4) %ptr) {
+; CHECK-LABEL: @pr99436(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[PTR:%.*]], @g_ext
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[CMP]], i32 [[VAL]], i32 0
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %cmp = icmp eq ptr %ptr, @g_ext
+  %val = load i32, ptr %ptr, align 4
+  %ret = select i1 %cmp, i32 %val, i32 0
+  ret i32 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/strnlen-2.ll b/llvm/test/Transforms/InstCombine/strnlen-2.ll
index 5e95aaf8edcd9..9c042409a43f3 100644
--- a/llvm/test/Transforms/InstCombine/strnlen-2.ll
+++ b/llvm/test/Transforms/InstCombine/strnlen-2.ll
@@ -38,6 +38,21 @@ define i64 @fold_strnlen_s3_s5_1(i1 %C) {
   ret i64 %len
 }
 
+; FIXME: Constants should be allowed for this optimization.
+define i64 @fold_strnlen_s3_s5_1_asan(i1 %C) sanitize_address {
+; CHECK-LABEL: @fold_strnlen_s3_s5_1_asan(
+; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[C:%.*]], ptr @s3, ptr @s6
+; CHECK-NEXT:    [[STRNLEN_CHAR0:%.*]] = load i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    [[STRNLEN_CHAR0CMP:%.*]] = icmp ne i8 [[STRNLEN_CHAR0]], 0
+; CHECK-NEXT:    [[LEN:%.*]] = zext i1 [[STRNLEN_CHAR0CMP]] to i64
+; CHECK-NEXT:    ret i64 [[LEN]]
+;
+  %ptr = select i1 %C, ptr @s3, ptr @s6
+
+  %len = call i64 @strnlen(ptr %ptr, i64 1)
+  ret i64 %len
+}
+
 
 ; Fold strnlen (C ? s3 : s5, 3) to 3.
 
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
new file mode 100644
index 0000000000000..b52a062fc6404
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+%struct.T = type { %struct.Wrapper, %struct.Wrapper }
+%struct.Wrapper = type { i16 }
+
+define void @test1(ptr %a1, ptr %a2) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i8, ptr [[A2:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A2]], align 2, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP0]], ptr [[A1:%.*]], align 2, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds i8, ptr [[A1]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[B2]], align 2, !tbaa [[TBAA6]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b = getelementptr inbounds i8, ptr %a2, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %a1, ptr align 2 %a2, i64 2, i1 false), !tbaa !0, !tbaa.struct !6
+  %b2 = getelementptr inbounds %struct.T, ptr %a1, i32 0, i32 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %b2, ptr align 2 %b, i64 2, i1 false), !tbaa !8, !tbaa.struct !6
+  ret void
+}
+
+!0 = !{!1, !4, i64 0, i64 2}
+!1 = !{!2, i64 4, !"_ZTS1T", !4, i64 0, i64 2, !4, i64 2, i64 2}
+!2 = !{!3, i64 1, !"omnipotent char"}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!2, i64 2, !"_ZTS7Wrapper", !5, i64 0, i64 2}
+!5 = !{!2, i64 2, !"short"}
+!6 = !{i64 0, i64 2, !7}
+!7 = !{!5, !5, i64 0, i64 2}
+!8 = !{!1, !4, i64 2, i64 2}
+
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META4:![0-9]+]], i64 0, i64 2}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], i64 4, !"_ZTS1T", [[META4]], i64 0, i64 2, [[META4]], i64 2, i64 2}
+; CHECK: [[META2]] = !{[[META3:![0-9]+]], i64 1, !"omnipotent char"}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK: [[META4]] = !{[[META2]], i64 2, !"_ZTS7Wrapper", [[META5:![0-9]+]], i64 0, i64 2}
+; CHECK: [[META5]] = !{[[META2]], i64 2, !"short"}
+; CHECK: [[TBAA6]] = !{[[META1]], [[META4]], i64 2, i64 2}
+;.
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
index 996d2c0e67e16..e96452a3cebc8 100644
--- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -38,8 +38,8 @@ define ptr @test2() {
 define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) {
 ; CHECK-LABEL: @test3_multiple_fields(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4, !tbaa.struct [[TBAA_STRUCT3:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa.struct [[TBAA_STRUCT3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i64 [[TMP0]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -50,8 +50,8 @@ entry:
 define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) {
 ; CHECK-LABEL: @test4_multiple_copy_first_field(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -86,5 +86,4 @@ entry:
 ; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
 ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]}
 ; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[TBAA_STRUCT3]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
 ;.
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 49770e8d9e43f..cb308ab66b093 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -2661,3 +2661,139 @@ define i8 @sub_of_adds_2xc(i8 %x, i8 %y) {
   %r = sub i8 %xc, %yc
   ret i8 %r
 }
+
+define i32 @sub_infer_nuw_from_domcond(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}
+
+define i1 @sub_infer_nuw_from_domcond_fold1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold1(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %ext0 = zext i32 %y to i64
+  %ext1 = zext i32 %x to i64
+  %sub = sub i32 %x, %y
+  %ext2 = zext i32 %sub to i64
+  %add = add nuw nsw i64 %ext2, %ext0
+  %cmp = icmp ugt i64 %add, %ext1
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
+define i64 @sub_infer_nuw_from_domcond_fold2(i32 range(i32 0, 2147483648) %x, i32 range(i32 0, 2147483648) %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold2(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[EXT:%.*]] = zext nneg i32 [[SUB]] to i64
+; CHECK-NEXT:    ret i64 [[EXT]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i64 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  %ext = zext i32 %sub to i64
+  ret i64 %ext
+
+if.else:
+  ret i64 0
+}
+
+define i1 @sub_infer_nuw_from_domcond_fold3(i16 %xx, i32 range(i32 0, 12) %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold3(
+; CHECK-NEXT:    [[X:%.*]] = zext i16 [[XX:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+  %x = zext i16 %xx to i32
+  %cond = icmp ult i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub nsw i32 %y, %x
+  %cmp = icmp eq i32 %sub, -1
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
+; negative tests
+
+define i32 @sub_infer_nuw_from_domcond_wrong_pred(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_wrong_pred(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp sge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}
+
+define i32 @sub_infer_nuw_from_domcond_lhs_is_false(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_lhs_is_false(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.else, label %if.then
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
index bce07b0775620..c383ff7a90ded 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
@@ -12,18 +12,5 @@ define <2 x ptr> @test_gep() {
 ; CHECK-NEXT:    ret <2 x ptr> <ptr @a, ptr @a>
 ;
   %A = getelementptr [1 x %rec8], ptr @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer
-  %B = bitcast <2 x ptr> %A to <2 x ptr>
-  ret <2 x ptr> %B
-}
-
-; Testcase that verify the cast-of-cast when the outer/second cast is to a
-; vector type.
-
-define <4 x i16> @test_mmx_const() {
-; CHECK-LABEL: @test_mmx_const(
-; CHECK-NEXT:    ret <4 x i16> zeroinitializer
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <4 x i16>
-  ret <4 x i16> %B
+  ret <2 x ptr> %A
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index d4c49faf91b09..dd75560e25ced 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() {
   ret { i64, i64 } %v
 }
 
-@m64 = internal constant [2 x i64] zeroinitializer
-@idx = external global i32
-
-; This should not try to create an x86_mmx null value.
-define x86_mmx @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT:    [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8
-; CHECK-NEXT:    ret x86_mmx [[TEMP]]
-;
-  %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64))
-  ret x86_mmx %temp
-}
-
 @g_offset = external global i64
 
 @g_neg_one_vec = constant <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 0000000000000..8bda74eec34a0
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Fold ADD and remove old op if unused.
+define void @add_one_use(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_one_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  %index.next = add i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD and copy NUW if both ops have it.
+; https://alive2.llvm.org/ce/z/bPAT7Z
+define void @add_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add nuw i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nuw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NUW if only one op has it.
+define void @add_no_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nuw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NSW if one op has it.
+define void @add_no_nsw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nsw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NSW even if both ops have it.
+define void @add_no_nsw_2(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add nsw i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nsw i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold if the ops are different (even if they are both associative).
+define void @diff_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @diff_ops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = mul i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = mul i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold if the ops are not associative.
+define void @noassoc_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @noassoc_ops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = sub i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = sub i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = sub i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = sub i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold floating-point ops, even if they are associative. This would be
+; valid, but is currently disabled.
+define void @fadd(float %c1, float %c2) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = fadd fast float [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(float [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = fadd fast float [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi float [ 0., %entry ], [ %index.next, %loop ]
+  %step.add = fadd fast float %index, %c1
+  call void @use(float %step.add)
+  %index.next = fadd fast float %step.add, %c2
+  br label %loop
+}
+
+; Original reproducer, adapted from:
+;   for(long i = 0; i < n; ++i)
+;     a[i] = (i*k) * v;
+define void @test(i64 %n, i64 %k) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
+; CHECK-NEXT:    [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  %k.2 = shl nuw nsw i64 %k, 1
+  %vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
+  %.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
+  %.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
+  br label %loop
+
+loop:
+  %vec.ind = phi <2 x i64> [ %vec.init, %entry ], [ %vec.ind.next, %loop ]
+  %step.add = add <2 x i64> %vec.ind, %.splat
+  call void @use(<2 x i64> %step.add)
+  %vec.ind.next = add <2 x i64> %step.add, %.splat
+  br label %loop
+}
+
+declare void @use()
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5a12563..36e2eab6313dc 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -77,9 +77,10 @@ return:
 define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i32 1, 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond:
-; CHECK-NEXT:    [[I_ADDR_0:%.*]] = phi i32 [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[I_ADDR_0:%.*]] = phi i32 [ [[ADD_REASS:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT:    [[P_ADDR_0:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[IF_END]] ]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_ADDR_0]], [[J:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[LOOPEXIT0:%.*]]
@@ -97,7 +98,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
 ; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT:    [[ADD_REASS]] = add i32 [[I_ADDR]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
 ; CHECK:       loopexit0:
 ; CHECK-NEXT:    [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 483955c1c57a0..588696d20227f 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,4 +384,55 @@ for.exit:
   ret void
 }
 
+;; Here are two writes that should be `16 * vscale * vscale` apart, so MUL VL
+;; addressing cannot be used to offset the second write, as for example,
+;; `#4, mul vl` would only be an offset of `16 * vscale` (dropping a vscale).
+define void @vscale_squared_offset(ptr %alloc) #0 {
+; COMMON-LABEL: vscale_squared_offset:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    rdvl x9, #1
+; COMMON-NEXT:    fmov z0.s, #4.00000000
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    lsr x9, x9, #4
+; COMMON-NEXT:    fmov z1.s, #8.00000000
+; COMMON-NEXT:    cntw x10
+; COMMON-NEXT:    ptrue p0.s, vl1
+; COMMON-NEXT:    umull x9, w9, w9
+; COMMON-NEXT:    lsl x9, x9, #6
+; COMMON-NEXT:    cmp x8, x10
+; COMMON-NEXT:    b.ge .LBB6_2
+; COMMON-NEXT:  .LBB6_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    add x11, x0, x9
+; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
+; COMMON-NEXT:    add x8, x8, #1
+; COMMON-NEXT:    st1w { z1.s }, p0, [x11]
+; COMMON-NEXT:    addvl x0, x0, #1
+; COMMON-NEXT:    cmp x8, x10
+; COMMON-NEXT:    b.lt .LBB6_1
+; COMMON-NEXT:  .LBB6_2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = mul i64 %vscale, 4
+  br label %for.check
+for.check:
+  %i = phi i64 [ %next_i, %for.body ], [ 0, %entry ]
+  %is_lt = icmp slt i64 %i, %c4_vscale
+  br i1 %is_lt, label %for.body, label %for.exit
+for.body:
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 0, i64 1)
+  %upper_offset = mul i64 %i, %c4_vscale
+  %upper_ptr = getelementptr float, ptr %alloc, i64 %upper_offset
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %upper_ptr, i32 4, <vscale x 4 x i1> %mask)
+  %lower_i = add i64 %i, %c4_vscale
+  %lower_offset = mul i64 %lower_i, %c4_vscale
+  %lower_ptr = getelementptr float, ptr %alloc, i64 %lower_offset
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 8.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %lower_ptr, i32 4, <vscale x 4 x i1> %mask)
+  %next_i = add i64 %i, 1
+  br label %for.check
+for.exit:
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
deleted file mode 100644
index b460b79d0640a..0000000000000
--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define x86_mmx @f() #0 {
-; CHECK-LABEL: define x86_mmx @f
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[PHI]], 7
-; CHECK-NEXT:    [[ADD_7]] = add i32 [[PHI]], 8
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ]
-; CHECK-NEXT:    ret x86_mmx [[RET]]
-;
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %phi = phi i32 [ 1, %entry ], [ %add, %for.body ]
-  %add = add i32 %phi, 1
-  %cmp = icmp eq i32 %phi, 0
-  br i1 %cmp, label %exit, label %for.body
-
-exit:                                             ; preds = %for.body
-  %ret = phi x86_mmx [ undef, %for.body ]
-  ret x86_mmx %ret
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
new file mode 100644
index 0000000000000..bce2d6c14d866
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -0,0 +1,380 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mtriple aarch64 -mcpu=neoverse-v1 -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/94328.
+define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
+; CHECK-LABEL: define void @sdiv_feeding_gep(
+; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], i64 [[M:%.*]], i64 [[CONV6:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CONV61:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt i32 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]
+; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP19]], [[CONV61]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sub i64 [[TMP12]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = sub i64 [[TMP17]], [[TMP23]]
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP24]] to i32
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[X]], [[TMP20]]
+; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[X]], [[TMP21]]
+; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP26]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add i32 [[TMP29]], [[TMP27]]
+; CHECK-NEXT:    [[TMP32:%.*]] = sext i32 [[TMP30]] to i64
+; CHECK-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP31]] to i64
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 2
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[TMP34]], i64 [[TMP38]]
+; CHECK-NEXT:    store <vscale x 2 x double> zeroinitializer, ptr [[TMP36]], align 8
+; CHECK-NEXT:    store <vscale x 2 x double> zeroinitializer, ptr [[TMP39]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[DIV18:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT:    [[CONV20:%.*]] = trunc i64 [[DIV18]] to i32
+; CHECK-NEXT:    [[MUL30:%.*]] = mul i64 [[DIV18]], [[CONV61]]
+; CHECK-NEXT:    [[SUB31:%.*]] = sub i64 [[IV]], [[MUL30]]
+; CHECK-NEXT:    [[CONV34:%.*]] = trunc i64 [[SUB31]] to i32
+; CHECK-NEXT:    [[MUL35:%.*]] = mul i32 [[X]], [[CONV20]]
+; CHECK-NEXT:    [[ADD36:%.*]] = add i32 [[MUL35]], [[CONV34]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD36]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[DST]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv61 = zext i32 %x to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div18 = sdiv i64 %M, %conv6
+  %conv20 = trunc i64 %div18 to i32
+  %mul30 = mul i64 %div18, %conv61
+  %sub31 = sub i64 %iv, %mul30
+  %conv34 = trunc i64 %sub31 to i32
+  %mul35 = mul i32 %x, %conv20
+  %add36 = add i32 %mul35, %conv34
+  %idxprom = sext i32 %add36 to i64
+  %gep = getelementptr double, ptr %dst, i64 %idxprom
+  store double 0.000000e+00, ptr %gep, align 8
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
+; CHECK-LABEL: define void @sdiv_feeding_gep_predicated(
+; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], i64 [[M:%.*]], i64 [[CONV6:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CONV61:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP16]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP19]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[CONV6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = select <vscale x 2 x i1> [[TMP23]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP25:%.*]] = sdiv <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 2 x i64> [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]]
+; CHECK-NEXT:    [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[X]], [[TMP27]]
+; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP35]], i32 8, <vscale x 2 x i1> [[TMP23]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT:    [[TMP36:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <vscale x 2 x i1> [[TMP36]], i32 0
+; CHECK-NEXT:    br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp ule i64 [[IV]], [[M]]
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[DIV18:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT:    [[CONV20:%.*]] = trunc i64 [[DIV18]] to i32
+; CHECK-NEXT:    [[MUL30:%.*]] = mul i64 [[DIV18]], [[CONV61]]
+; CHECK-NEXT:    [[SUB31:%.*]] = sub i64 [[IV]], [[MUL30]]
+; CHECK-NEXT:    [[CONV34:%.*]] = trunc i64 [[SUB31]] to i32
+; CHECK-NEXT:    [[MUL35:%.*]] = mul i32 [[X]], [[CONV20]]
+; CHECK-NEXT:    [[ADD36:%.*]] = add i32 [[MUL35]], [[CONV34]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD36]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[DST]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP]], align 8
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv61 = zext i32 %x to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %c = icmp ule i64 %iv, %M
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  %div18 = sdiv i64 %M, %conv6
+  %conv20 = trunc i64 %div18 to i32
+  %mul30 = mul i64 %div18, %conv61
+  %sub31 = sub i64 %iv, %mul30
+  %conv34 = trunc i64 %sub31 to i32
+  %mul35 = mul i32 %x, %conv20
+  %add36 = add i32 %mul35, %conv34
+  %idxprom = sext i32 %add36 to i64
+  %gep = getelementptr double, ptr %dst, i64 %idxprom
+  store double 0.000000e+00, ptr %gep, align 8
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/80416.
+define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
+; CHECK-LABEL: define void @udiv_urem_feeding_gep(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MUL_1_I:%.*]] = mul i64 [[X]], [[X]]
+; CHECK-NEXT:    [[MUL_2_I:%.*]] = mul i64 [[MUL_1_I]], [[X]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[N]], 4294967295
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[TMP0]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP16]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP19]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
+; CHECK-NEXT:    [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT:    [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
+; CHECK-NEXT:    [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
+; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[TMP27]]
+; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], [[X]]
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP34]], [[TMP28]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shl i64 [[TMP35]], 32
+; CHECK-NEXT:    [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT:    [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
+; CHECK-NEXT:    br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[DIV_I:%.*]] = udiv i64 [[IV]], [[MUL_2_I]]
+; CHECK-NEXT:    [[REM_I:%.*]] = urem i64 [[IV]], [[MUL_2_I]]
+; CHECK-NEXT:    [[DIV_1_I:%.*]] = udiv i64 [[REM_I]], [[MUL_1_I]]
+; CHECK-NEXT:    [[REM_1_I:%.*]] = urem i64 [[REM_I]], [[MUL_1_I]]
+; CHECK-NEXT:    [[DIV_2_I:%.*]] = udiv i64 [[REM_1_I]], [[X]]
+; CHECK-NEXT:    [[REM_2_I:%.*]] = urem i64 [[REM_1_I]], [[X]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul i64 [[X]], [[DIV_I]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i64 [[MUL_I]], [[DIV_1_I]]
+; CHECK-NEXT:    [[MUL_1_I9:%.*]] = mul i64 [[ADD_I]], [[X]]
+; CHECK-NEXT:    [[ADD_1_I:%.*]] = add i64 [[MUL_1_I9]], [[DIV_2_I]]
+; CHECK-NEXT:    [[MUL_2_I11:%.*]] = mul i64 [[ADD_1_I]], [[X]]
+; CHECK-NEXT:    [[ADD_2_I:%.*]] = add i64 [[MUL_2_I11]], [[REM_2_I]]
+; CHECK-NEXT:    [[SEXT_I:%.*]] = shl i64 [[ADD_2_I]], 32
+; CHECK-NEXT:    [[CONV6_I:%.*]] = ashr i64 [[SEXT_I]], 32
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[CONV6_I]]
+; CHECK-NEXT:    store i64 [[DIV_I]], ptr [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %mul.1.i = mul i64 %x, %x
+  %mul.2.i = mul i64 %mul.1.i, %x
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.i = udiv i64 %iv, %mul.2.i
+  %rem.i = urem i64 %iv, %mul.2.i
+  %div.1.i = udiv i64 %rem.i, %mul.1.i
+  %rem.1.i = urem i64 %rem.i, %mul.1.i
+  %div.2.i = udiv i64 %rem.1.i, %x
+  %rem.2.i = urem i64 %rem.1.i, %x
+  %mul.i = mul i64 %x, %div.i
+  %add.i = add i64 %mul.i, %div.1.i
+  %mul.1.i9 = mul i64 %add.i, %x
+  %add.1.i = add i64 %mul.1.i9, %div.2.i
+  %mul.2.i11 = mul i64 %add.1.i, %x
+  %add.2.i = add i64 %mul.2.i11, %rem.2.i
+  %sext.i = shl i64 %add.2.i, 32
+  %conv6.i = ashr i64 %sext.i, 32
+  %gep = getelementptr i64, ptr %dst, i64 %conv6.i
+  store i64 %div.i, ptr %gep, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 78798725b5b9c..21af9ae801e16 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -15,21 +15,21 @@ entry:
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i8, ptr %tmp0, align 1
   %tmp3 = load i8, ptr %tmp1, align 1
-  store i8 0, ptr %tmp0, align 1
-  store i8 0, ptr %tmp1, align 1
+  store i8 %tmp2, ptr %tmp0, align 1
+  store i8 %tmp3, ptr %tmp1, align 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -46,26 +46,26 @@ entry:
 ; VF_4-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i16, ptr %tmp0, align 2
   %tmp3 = load i16, ptr %tmp1, align 2
-  store i16 0, ptr %tmp0, align 2
-  store i16 0, ptr %tmp1, align 2
+  store i16 %tmp2, ptr %tmp0, align 2
+  store i16 %tmp3, ptr %tmp1, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -82,31 +82,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i32, ptr %tmp0, align 4
   %tmp3 = load i32, ptr %tmp1, align 4
-  store i32 0, ptr %tmp0, align 4
-  store i32 0, ptr %tmp1, align 4
+  store i32 %tmp2, ptr %tmp0, align 4
+  store i32 %tmp3, ptr %tmp1, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -123,31 +123,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i64, ptr %tmp0, align 8
   %tmp3 = load i64, ptr %tmp1, align 8
-  store i64 0, ptr %tmp0, align 8
-  store i64 0, ptr %tmp1, align 8
+  store i64 %tmp2, ptr %tmp0, align 8
+  store i64 %tmp3, ptr %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -170,16 +170,16 @@ entry:
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
 ; VF_2:         Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
   %tmp1 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 6
   %tmp2 = load i64, ptr %tmp0, align 8
   %tmp3 = load i64, ptr %tmp1, align 8
-  store i64 0, ptr %tmp0, align 8
-  store i64 0, ptr %tmp1, align 8
+  store i64 %tmp2, ptr %tmp0, align 8
+  store i64 %tmp3, ptr %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
index 5116a85e4f2a1..a70eafb6078a0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
@@ -32,7 +32,6 @@ define i32 @pr70988(ptr %src, i32 %n) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi ptr [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.if4:
@@ -42,7 +41,6 @@ define i32 @pr70988(ptr %src, i32 %n) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF4]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF4]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP8]], i32 [[VEC_PHI]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI3]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 2b2c224dbefe0..95588d1176dcd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -380,41 +380,40 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; PRED-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP2]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; PRED-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PRED:       vector.body:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
 ; PRED-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
 ; PRED-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
-; PRED-NEXT:    [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
-; PRED-NEXT:    [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
-; PRED-NEXT:    [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
+; PRED-NEXT:    [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
+; PRED-NEXT:    [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
-; PRED-NEXT:    [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
-; PRED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; PRED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED:       middle.block:
-; PRED-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
+; PRED-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
 ; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
index b66bb948a47a5..0ef03c58be97a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
@@ -2,8 +2,8 @@
 ; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t
 
 ; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
-; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
+; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @alloca(ptr %vla, i64 %N) {
 ; CHECK-LABEL: @alloca(
 ; CHECK-NOT: <vscale x
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index 333bb20fd0d9a..bc6eeb470b154 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -101,9 +101,9 @@ for.end:
 }
 
 ; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_no_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -127,10 +127,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 ; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:40: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping_ite(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_no_mapping_ite
 ; CHECK-NOT: <vscale x
@@ -163,9 +163,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 ; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_fixed_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_fixed_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
index 89a513515b205..ac6907609f5eb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
@@ -435,6 +435,295 @@ for.end:
   ret void
 }
 
+declare float @llvm.acos.f32(float) nounwind readnone
+define void @acos_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @acos_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_acos_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.acos.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.acos.f64(double) nounwind readnone
+define void @acos_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @acos_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_acos_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.acos.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.asin.f32(float) nounwind readnone
+define void @asin_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @asin_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_asin_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.asin.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.asin.f64(double) nounwind readnone
+define void @asin_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @asin_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_asin_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.asin.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.atan.f32(float) nounwind readnone
+define void @atan_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @atan_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_atan_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.atan.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.atan.f64(double) nounwind readnone
+define void @atan_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @atan_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_atan_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.atan.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.cosh.f32(float) nounwind readnone
+define void @cosh_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @cosh_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_cosh_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.cosh.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.cosh.f64(double) nounwind readnone
+define void @cosh_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @cosh_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_cosh_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.cosh.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.sinh.f32(float) nounwind readnone
+define void @sinh_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @sinh_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_sinh_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.sinh.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.sinh.f64(double) nounwind readnone
+define void @sinh_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @sinh_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_sinh_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.sinh.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.tanh.f32(float) nounwind readnone
+define void @tanh_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tanh_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_tanh_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.tanh.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.tanh.f64(double) nounwind readnone
+define void @tanh_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tanh_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_tanh_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.tanh.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
 declare float @cbrtf(float) nounwind readnone
 define void @cbrtf_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @cbrtf_v4f32(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index f0ae0093b2641..a107013124a71 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|exp|log|sin|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|trunc)" --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(acos|asin|atan|cos|cosh|exp|log|sin|sinh|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|tanh|trunc)" --version 2
 
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
@@ -12,24 +12,243 @@ target triple = "aarch64-unknown-linux-gnu"
 ; are checking fixed width vectorization with NEON and scalable vectorization
 ; with SVE.
 
+declare double @llvm.acos.f64(double)
+declare float @llvm.acos.f32(float)
+
+define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acos_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acos_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acos_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acos_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.acos.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @acos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acos_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acos_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acos_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acos_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.acos.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @llvm.asin.f64(double)
+declare float @llvm.asin.f32(float)
+
+define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asin_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asin_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asin_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asin_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.asin.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @asin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asin_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asin_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asin_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asin_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.asin.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @llvm.atan.f64(double)
+declare float @llvm.atan.f32(float)
+
+define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atan_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atan_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.atan.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atan_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atan_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.atan.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @llvm.ceil.f64(double)
 declare float @llvm.ceil.f32(float)
 
 define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 ; SLEEF-NEON-LABEL: define void @ceil_f64
-; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
 ;
 ; SLEEF-SVE-LABEL: define void @ceil_f64
-; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
 ;
 ; ARMPL-NEON-LABEL: define void @ceil_f64
-; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
 ;
 ; ARMPL-SVE-LABEL: define void @ceil_f64
-; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
 ;
   entry:
@@ -231,6 +450,79 @@ define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
   ret void
 }
 
+declare double @llvm.cosh.f64(double)
+declare float @llvm.cosh.f32(float)
+
+define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cosh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cosh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cosh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cosh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.cosh.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @cosh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cosh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cosh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cosh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cosh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.cosh.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @llvm.exp.f64(double)
 declare float @llvm.exp.f32(float)
 
@@ -1399,6 +1691,79 @@ define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
   ret void
 }
 
+declare double @llvm.sinh.f64(double)
+declare float @llvm.sinh.f32(float)
+
+define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.sinh.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sinh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.sinh.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @llvm.sqrt.f64(double)
 declare float @llvm.sqrt.f32(float)
 
@@ -1545,6 +1910,79 @@ define void @tan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
   ret void
 }
 
+declare double @llvm.tanh.f64(double)
+declare float @llvm.tanh.f32(float)
+
+define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tanh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tanh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tanh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tanh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.tanh.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tanh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tanh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tanh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tanh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tanh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.tanh.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @llvm.trunc.f64(double)
 declare float @llvm.trunc.f32(float)
 
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index d08c16dcb3ed1..214d9abd712cd 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -15,21 +15,21 @@ entry:
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i8, ptr %tmp0, align 1
   %tmp3 = load i8, ptr %tmp1, align 1
-  store i8 0, ptr %tmp0, align 1
-  store i8 0, ptr %tmp1, align 1
+  store i8 %tmp2, ptr %tmp0, align 1
+  store i8 %tmp3, ptr %tmp1, align 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -46,26 +46,26 @@ entry:
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i16, ptr %tmp0, align 2
   %tmp3 = load i16, ptr %tmp1, align 2
-  store i16 0, ptr %tmp0, align 2
-  store i16 0, ptr %tmp1, align 2
+  store i16 %tmp2, ptr %tmp0, align 2
+  store i16 %tmp3, ptr %tmp1, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -82,31 +82,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i32, ptr %tmp0, align 4
   %tmp3 = load i32, ptr %tmp1, align 4
-  store i32 0, ptr %tmp0, align 4
-  store i32 0, ptr %tmp1, align 4
+  store i32 %tmp2, ptr %tmp0, align 4
+  store i32 %tmp3, ptr %tmp1, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -123,21 +123,21 @@ entry:
 ; VF_4-LABEL: Checking a loop in 'half_factor_2'
 ; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_4-NEXT:    Found an estimated cost of 40 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in 'half_factor_2'
 ; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_8-NEXT:    Found an estimated cost of 80 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %half.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %half.2, ptr %data, i64 %i, i32 1
   %tmp2 = load half, ptr %tmp0, align 2
   %tmp3 = load half, ptr %tmp1, align 2
-  store half 0., ptr %tmp0, align 2
-  store half 0., ptr %tmp1, align 2
+  store half %tmp2, ptr %tmp0, align 2
+  store half %tmp3, ptr %tmp1, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 97aa2035ded55..c7a04e3669ed6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -17,31 +17,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i8 0, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i8, ptr %tmp0, align 1
   %tmp3 = load i8, ptr %tmp1, align 1
-  store i8 0, ptr %tmp0, align 1
-  store i8 0, ptr %tmp1, align 1
+  store i8 %tmp2, ptr %tmp0, align 1
+  store i8 %tmp3, ptr %tmp1, align 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -58,31 +58,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i16 0, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i16, ptr %tmp0, align 2
   %tmp3 = load i16, ptr %tmp1, align 2
-  store i16 0, ptr %tmp0, align 2
-  store i16 0, ptr %tmp1, align 2
+  store i16 %tmp2, ptr %tmp0, align 2
+  store i16 %tmp3, ptr %tmp1, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -99,31 +99,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i32 0, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i32, ptr %tmp0, align 4
   %tmp3 = load i32, ptr %tmp1, align 4
-  store i32 0, ptr %tmp0, align 4
-  store i32 0, ptr %tmp1, align 4
+  store i32 %tmp2, ptr %tmp0, align 4
+  store i32 %tmp3, ptr %tmp1, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -140,31 +140,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 44 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 88 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 176 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 352 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 1
   %tmp2 = load i64, ptr %tmp0, align 8
   %tmp3 = load i64, ptr %tmp1, align 8
-  store i64 0, ptr %tmp0, align 8
-  store i64 0, ptr %tmp1, align 8
+  store i64 %tmp2, ptr %tmp0, align 8
+  store i64 %tmp3, ptr %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -181,31 +181,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_4:          Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 16 for VF 4 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 18 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'f16_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store half 0xH0000, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store half %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %f16.2, ptr %data, i64 %i, i32 1
   %tmp2 = load half, ptr %tmp0, align 2
   %tmp3 = load half, ptr %tmp1, align 2
-  store half 0.0, ptr %tmp0, align 2
-  store half 0.0, ptr %tmp1, align 2
+  store half %tmp2, ptr %tmp0, align 2
+  store half %tmp3, ptr %tmp1, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -222,31 +222,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_2:          Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load float, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 10 for VF 2 For instruction: store float %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load float, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp2, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store float %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load float, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load float, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp2, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store float %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'f32_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load float, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp2, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store float %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %f32.2, ptr %data, i64 %i, i32 1
   %tmp2 = load float, ptr %tmp0, align 4
   %tmp3 = load float, ptr %tmp1, align 4
-  store float 0.0, ptr %tmp0, align 4
-  store float 0.0, ptr %tmp1, align 4
+  store float %tmp2, ptr %tmp0, align 4
+  store float %tmp3, ptr %tmp1, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -263,31 +263,31 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 16 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 32 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_2'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 64 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
   %tmp1 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 1
   %tmp2 = load double, ptr %tmp0, align 8
   %tmp3 = load double, ptr %tmp1, align 8
-  store double 0.0, ptr %tmp0, align 8
-  store double 0.0, ptr %tmp1, align 8
+  store double %tmp2, ptr %tmp0, align 8
+  store double %tmp3, ptr %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -309,30 +309,30 @@ entry:
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 0, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 36 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 0, ptr %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 72 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 0, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 0, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 288 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -341,9 +341,9 @@ for.body:
   %tmp3 = load i8, ptr %tmp0, align 1
   %tmp4 = load i8, ptr %tmp1, align 1
   %tmp5 = load i8, ptr %tmp2, align 1
-  store i8 0, ptr %tmp0, align 1
-  store i8 0, ptr %tmp1, align 1
-  store i8 0, ptr %tmp2, align 1
+  store i8 %tmp3, ptr %tmp0, align 1
+  store i8 %tmp4, ptr %tmp1, align 1
+  store i8 %tmp5, ptr %tmp2, align 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -361,30 +361,30 @@ entry:
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 0, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 36 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 0, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 72 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 0, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 0, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 288 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -393,9 +393,9 @@ for.body:
   %tmp3 = load i16, ptr %tmp0, align 2
   %tmp4 = load i16, ptr %tmp1, align 2
   %tmp5 = load i16, ptr %tmp2, align 2
-  store i16 0, ptr %tmp0, align 2
-  store i16 0, ptr %tmp1, align 2
-  store i16 0, ptr %tmp2, align 2
+  store i16 %tmp3, ptr %tmp0, align 2
+  store i16 %tmp4, ptr %tmp1, align 2
+  store i16 %tmp5, ptr %tmp2, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -413,30 +413,30 @@ entry:
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 0, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 36 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i32 0, ptr %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 0, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 0, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 288 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -445,9 +445,9 @@ for.body:
   %tmp3 = load i32, ptr %tmp0, align 4
   %tmp4 = load i32, ptr %tmp1, align 4
   %tmp5 = load i32, ptr %tmp2, align 4
-  store i32 0, ptr %tmp0, align 4
-  store i32 0, ptr %tmp1, align 4
-  store i32 0, ptr %tmp2, align 4
+  store i32 %tmp3, ptr %tmp0, align 4
+  store i32 %tmp4, ptr %tmp1, align 4
+  store i32 %tmp5, ptr %tmp2, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -465,30 +465,30 @@ entry:
 ; VF_2:          Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i64 0, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 66 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_4:          Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 36 for VF 4 For instruction: store i64 0, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 132 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_8:          Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 72 for VF 8 For instruction: store i64 0, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 264 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_3'
 ; VF_16:         Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 144 for VF 16 For instruction: store i64 0, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 528 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -497,9 +497,9 @@ for.body:
   %tmp3 = load i64, ptr %tmp0, align 8
   %tmp4 = load i64, ptr %tmp1, align 8
   %tmp5 = load i64, ptr %tmp2, align 8
-  store i64 0, ptr %tmp0, align 8
-  store i64 0, ptr %tmp1, align 8
-  store i64 0, ptr %tmp2, align 8
+  store i64 %tmp3, ptr %tmp0, align 8
+  store i64 %tmp4, ptr %tmp1, align 8
+  store i64 %tmp5, ptr %tmp2, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -517,30 +517,30 @@ entry:
 ; VF_2:          Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_4:          Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store half 0xH0000, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 28 for VF 4 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_8-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_8:          Found an estimated cost of 56 for VF 8 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store half 0xH0000, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 56 for VF 8 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in 'f16_factor_3'
 ; VF_16:         Found an estimated cost of 112 for VF 16 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store half 0xH0000, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 112 for VF 16 For instruction: store half %tmp5, ptr %tmp2, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.3, ptr %data, i64 %i, i32 0
@@ -549,9 +549,9 @@ for.body:
   %tmp3 = load half, ptr %tmp0, align 2
   %tmp4 = load half, ptr %tmp1, align 2
   %tmp5 = load half, ptr %tmp2, align 2
-  store half 0.0, ptr %tmp0, align 2
-  store half 0.0, ptr %tmp1, align 2
-  store half 0.0, ptr %tmp2, align 2
+  store half %tmp3, ptr %tmp0, align 2
+  store half %tmp4, ptr %tmp1, align 2
+  store half %tmp5, ptr %tmp2, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -569,30 +569,30 @@ entry:
 ; VF_2:          Found an estimated cost of 16 for VF 2 For instruction: %tmp3 = load float, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load float, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store float %tmp5, ptr %tmp2, align 4
 ; VF_4-LABEL:  Checking a loop in 'f32_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
 ; VF_8-LABEL:  Checking a loop in 'f32_factor_3'
 ; VF_8:          Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store float %tmp5, ptr %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in 'f32_factor_3'
 ; VF_16:         Found an estimated cost of 128 for VF 16 For instruction: %tmp3 = load float, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load float, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store float %tmp5, ptr %tmp2, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.3, ptr %data, i64 %i, i32 0
@@ -601,9 +601,9 @@ for.body:
   %tmp3 = load float, ptr %tmp0, align 4
   %tmp4 = load float, ptr %tmp1, align 4
   %tmp5 = load float, ptr %tmp2, align 4
-  store float 0.0, ptr %tmp0, align 4
-  store float 0.0, ptr %tmp1, align 4
-  store float 0.0, ptr %tmp2, align 4
+  store float %tmp3, ptr %tmp0, align 4
+  store float %tmp4, ptr %tmp1, align 4
+  store float %tmp5, ptr %tmp2, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -621,30 +621,30 @@ entry:
 ; VF_2:          Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_4:          Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 36 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_8:          Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 72 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_3'
 ; VF_16:         Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 144 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -653,9 +653,9 @@ for.body:
   %tmp3 = load double, ptr %tmp0, align 8
   %tmp4 = load double, ptr %tmp1, align 8
   %tmp5 = load double, ptr %tmp2, align 8
-  store double 0.0, ptr %tmp0, align 8
-  store double 0.0, ptr %tmp1, align 8
-  store double 0.0, ptr %tmp2, align 8
+  store double %tmp3, ptr %tmp0, align 8
+  store double %tmp4, ptr %tmp1, align 8
+  store double %tmp5, ptr %tmp2, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -677,37 +677,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i8 0, ptr %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 48 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_4:         Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, ptr %tmp2, align 1
-; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store i8 0, ptr %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 96 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i8 0, ptr %tmp3, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 192 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store i8 0, ptr %tmp3, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 384 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -718,10 +718,10 @@ for.body:
   %tmp5 = load i8, ptr %tmp1, align 1
   %tmp6 = load i8, ptr %tmp2, align 1
   %tmp7 = load i8, ptr %tmp3, align 1
-  store i8 0, ptr %tmp0, align 1
-  store i8 0, ptr %tmp1, align 1
-  store i8 0, ptr %tmp2, align 1
-  store i8 0, ptr %tmp3, align 1
+  store i8 %tmp4, ptr %tmp0, align 1
+  store i8 %tmp5, ptr %tmp1, align 1
+  store i8 %tmp6, ptr %tmp2, align 1
+  store i8 %tmp7, ptr %tmp3, align 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -740,37 +740,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i16 0, ptr %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 48 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_4:          Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store i16 0, ptr %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 96 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i16 0, ptr %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 192 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store i16 0, ptr %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 384 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -781,10 +781,10 @@ for.body:
   %tmp5 = load i16, ptr %tmp1, align 2
   %tmp6 = load i16, ptr %tmp2, align 2
   %tmp7 = load i16, ptr %tmp3, align 2
-  store i16 0, ptr %tmp0, align 2
-  store i16 0, ptr %tmp1, align 2
-  store i16 0, ptr %tmp2, align 2
-  store i16 0, ptr %tmp3, align 2
+  store i16 %tmp4, ptr %tmp0, align 2
+  store i16 %tmp5, ptr %tmp1, align 2
+  store i16 %tmp6, ptr %tmp2, align 2
+  store i16 %tmp7, ptr %tmp3, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -803,37 +803,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i32 0, ptr %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 48 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_4:          Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, ptr %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store i32 0, ptr %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i32 0, ptr %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 192 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store i32 0, ptr %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 384 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -844,10 +844,10 @@ for.body:
   %tmp5 = load i32, ptr %tmp1, align 4
   %tmp6 = load i32, ptr %tmp2, align 4
   %tmp7 = load i32, ptr %tmp3, align 4
-  store i32 0, ptr %tmp0, align 4
-  store i32 0, ptr %tmp1, align 4
-  store i32 0, ptr %tmp2, align 4
-  store i32 0, ptr %tmp3, align 4
+  store i32 %tmp4, ptr %tmp0, align 4
+  store i32 %tmp5, ptr %tmp1, align 4
+  store i32 %tmp6, ptr %tmp2, align 4
+  store i32 %tmp7, ptr %tmp3, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -866,37 +866,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i64 0, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 88 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_4:          Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 48 for VF 4 For instruction: store i64 0, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 176 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_8:          Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 96 for VF 8 For instruction: store i64 0, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 352 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_4'
 ; VF_16:         Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 192 for VF 16 For instruction: store i64 0, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 704 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -907,10 +907,10 @@ for.body:
   %tmp5 = load i64, ptr %tmp1, align 8
   %tmp6 = load i64, ptr %tmp2, align 8
   %tmp7 = load i64, ptr %tmp3, align 8
-  store i64 0, ptr %tmp0, align 8
-  store i64 0, ptr %tmp1, align 8
-  store i64 0, ptr %tmp2, align 8
-  store i64 0, ptr %tmp3, align 8
+  store i64 %tmp4, ptr %tmp0, align 8
+  store i64 %tmp5, ptr %tmp1, align 8
+  store i64 %tmp6, ptr %tmp2, align 8
+  store i64 %tmp7, ptr %tmp3, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -929,37 +929,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load half, ptr %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load half, ptr %tmp3, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store half 0xH0000, ptr %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store half %tmp7, ptr %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_4'
 ; VF_4:          Found an estimated cost of 36 for VF 4 For instruction: %tmp4 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load half, ptr %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load half, ptr %tmp3, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, ptr %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 36 for VF 4 For instruction: store half %tmp7, ptr %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in 'f16_factor_4'
 ; VF_8:          Found an estimated cost of 72 for VF 8 For instruction: %tmp4 = load half, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, ptr %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, ptr %tmp3, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, ptr %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 72 for VF 8 For instruction: store half %tmp7, ptr %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in 'f16_factor_4'
 ; VF_16:         Found an estimated cost of 144 for VF 16 For instruction: %tmp4 = load half, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, ptr %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, ptr %tmp3, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store half 0xH0000, ptr %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 144 for VF 16 For instruction: store half %tmp7, ptr %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.4, ptr %data, i64 %i, i32 0
@@ -970,10 +970,10 @@ for.body:
   %tmp5 = load half, ptr %tmp1, align 2
   %tmp6 = load half, ptr %tmp2, align 2
   %tmp7 = load half, ptr %tmp3, align 2
-  store half 0.0, ptr %tmp0, align 2
-  store half 0.0, ptr %tmp1, align 2
-  store half 0.0, ptr %tmp2, align 2
-  store half 0.0, ptr %tmp3, align 2
+  store half %tmp4, ptr %tmp0, align 2
+  store half %tmp5, ptr %tmp1, align 2
+  store half %tmp6, ptr %tmp2, align 2
+  store half %tmp7, ptr %tmp3, align 2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -992,37 +992,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load float, ptr %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store float 0.000000e+00, ptr %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 20 for VF 2 For instruction: store float %tmp7, ptr %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in 'f32_factor_4'
 ; VF_4:          Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store float 0.000000e+00, ptr %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
 ; VF_8-LABEL:  Checking a loop in 'f32_factor_4'
 ; VF_8:          Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, ptr %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store float 0.000000e+00, ptr %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 80 for VF 8 For instruction: store float %tmp7, ptr %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in 'f32_factor_4'
 ; VF_16:         Found an estimated cost of 160 for VF 16 For instruction: %tmp4 = load float, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, ptr %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store float 0.000000e+00, ptr %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 160 for VF 16 For instruction: store float %tmp7, ptr %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.4, ptr %data, i64 %i, i32 0
@@ -1033,10 +1033,10 @@ for.body:
   %tmp5 = load float, ptr %tmp1, align 4
   %tmp6 = load float, ptr %tmp2, align 4
   %tmp7 = load float, ptr %tmp3, align 4
-  store float 0.0, ptr %tmp0, align 4
-  store float 0.0, ptr %tmp1, align 4
-  store float 0.0, ptr %tmp2, align 4
-  store float 0.0, ptr %tmp3, align 4
+  store float %tmp4, ptr %tmp0, align 4
+  store float %tmp5, ptr %tmp1, align 4
+  store float %tmp6, ptr %tmp2, align 4
+  store float %tmp7, ptr %tmp3, align 4
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -1055,37 +1055,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store double 0.000000e+00, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_4:          Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store double 0.000000e+00, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 48 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_8:          Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store double 0.000000e+00, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 96 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_4'
 ; VF_16:         Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 128 for VF 16 For instruction: store double 0.000000e+00, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 192 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
@@ -1096,10 +1096,10 @@ for.body:
   %tmp5 = load double, ptr %tmp1, align 8
   %tmp6 = load double, ptr %tmp2, align 8
   %tmp7 = load double, ptr %tmp3, align 8
-  store double 0.0, ptr %tmp0, align 8
-  store double 0.0, ptr %tmp1, align 8
-  store double 0.0, ptr %tmp2, align 8
-  store double 0.0, ptr %tmp3, align 8
+  store double %tmp4, ptr %tmp0, align 8
+  store double %tmp5, ptr %tmp1, align 8
+  store double %tmp6, ptr %tmp2, align 8
+  store double %tmp7, ptr %tmp3, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index 2cee1dacea37d..09eada311d219 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s
 
 ;; This is a collection of tests whose only purpose is to show changes in the
 ;; default configuration.  Please keep these tests minimal - if you're testing
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
index 7f258d57e7018..0b8a2d2f02057 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -40,8 +40,6 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT:    Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    pred.store.continue:
-; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<[[P1:%.+]]> = ir<%0>
-; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<[[P2:%.+]]> = ir<%1>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): for.body.2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
new file mode 100644
index 0000000000000..da5db810ec638
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+; Test with a dead load in the loop, from
+; https://github.com/llvm/llvm-project/issues/99701
+define void @dead_load(ptr %p, i16 %start) {
+; CHECK-LABEL: define void @dead_load(
+; CHECK-SAME: ptr [[P:%.*]], i16 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[START_EXT:%.*]] = sext i16 [[START]] to i64
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[START_EXT]], i64 111)
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[SMAX]], [[START_EXT]]
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[SMAX]], [[UMIN]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[START_EXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[UMIN]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 8 x i64> [[TMP16]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> [[DOTSPLAT]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 3, [[TMP19]]
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[P]], <vscale x 8 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP21]], i32 2, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START_EXT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT:    store i16 0, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[IV]], 111
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %start.ext = sext i16 %start to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %start.ext, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr i16, ptr %p, i64 %iv
+  store i16 0, ptr %gep, align 2
+  %l = load i16, ptr %gep, align 2
+  %iv.next = add i64 %iv, 3
+  %cmp = icmp slt i64 %iv, 111
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/100464.
+; Loop with a live-out %l and scalar epilogue required due to an interleave
+; group. As the scalar epilogue is required the live-out is fed from the scalar
+; epilogue and dead in the vector loop.
+define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
+; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 4, [[TMP13]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %idxprom = sext i32 %iv to i64
+  %gep.src = getelementptr i8, ptr %src, i64 %idxprom
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %idxprom
+  store i8 0, ptr %gep.dst, align 1
+  %iv.next = add i32 %iv, 4
+  %cmp = icmp ult i32 %iv, 1001
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  %r = phi i8 [ %l, %loop ]
+  ret i8 %r
+}
+
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index e50d7362365b8..a151232df0cd5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -39,32 +39,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
-; RV32-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; RV32-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; RV32-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[TMP5]], zeroinitializer
-; RV32-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
-; RV32-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; RV32-NEXT:    [[TMP10:%.*]] = mul i64 16, [[TMP9]]
-; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
+; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; RV32-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[TMP7]], zeroinitializer
+; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP8]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
+; RV32-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; RV32-NEXT:    [[TMP12:%.*]] = mul i64 16, [[TMP11]]
+; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
 ; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV32-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; RV32:       vector.body:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope !0
-; RV32-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[TMP13:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP13]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> poison), !alias.scope !3
-; RV32-NEXT:    [[TMP15:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
-; RV32-NEXT:    [[TMP16:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP15]]
-; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP16]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> [[TMP12]]), !alias.scope !5, !noalias !7
-; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; RV32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope [[META0:![0-9]+]]
+; RV32-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> poison), !alias.scope [[META3:![0-9]+]]
+; RV32-NEXT:    [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
+; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
+; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -121,32 +121,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
-; RV64-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; RV64-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; RV64-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[TMP5]], zeroinitializer
-; RV64-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
-; RV64-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; RV64-NEXT:    [[TMP10:%.*]] = mul i64 16, [[TMP9]]
-; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
+; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; RV64-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[TMP7]], zeroinitializer
+; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP8]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
+; RV64-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; RV64-NEXT:    [[TMP12:%.*]] = mul i64 16, [[TMP11]]
+; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
 ; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV64-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; RV64:       vector.body:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope !0
-; RV64-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[TMP13:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP13]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> poison), !alias.scope !3
-; RV64-NEXT:    [[TMP15:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
-; RV64-NEXT:    [[TMP16:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP15]]
-; RV64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV64-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP16]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> [[TMP12]]), !alias.scope !5, !noalias !7
-; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; RV64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope [[META0:![0-9]+]]
+; RV64-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> poison), !alias.scope [[META3:![0-9]+]]
+; RV64-NEXT:    [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
+; RV64-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
+; RV64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV64-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 8e9713fecf29d..9c9547c8b6322 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -18,6 +18,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an induction variable.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
@@ -54,43 +55,46 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
-; CHECK:       ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  Live-in vp<%0> = VF * UF
+; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT:  vp<%2> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
-; CHECK:       vector.ph:
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
 ; CHECK-NEXT:  Successor(s): vector loop
-; CHECK:       <x1> vector loop: {
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:    EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:    vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:    WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT:    EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:    EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
+; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%6>
+; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:      WIDEN store vp<%7>, ir<%add9>
+; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
+; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK:       middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:  scalar.ph:
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -131,10 +135,55 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Not Interleaving.
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK:       LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<%0> = VF * UF
+; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT:  vp<%2> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
+; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%13> = load vp<%6>
+; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%13>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:      WIDEN store vp<%7>, ir<%add9>
+; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
+; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
+; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ; CHECK-EMPTY:
@@ -174,6 +223,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found FP op with unsafe algebra.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
@@ -193,7 +243,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -210,37 +260,40 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
-; CHECK:       ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  Live-in vp<%0> = VF * UF
+; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT:  vp<%2> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
-; CHECK:       vector.ph:
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
 ; CHECK-NEXT:  Successor(s): vector loop
-; CHECK:       <x1> vector loop: {
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:    EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:    vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    vp<[[STEPS]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:    WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
-; CHECK-NEXT:    EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:    EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
+; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%6>
+; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
+; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:      WIDEN store vp<%7>, ir<%conv1>
+; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
+; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK:       middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
@@ -255,7 +308,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -281,16 +334,61 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 32
+; CHECK-NEXT:  LV: Loop cost is 34
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK:       LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<%0> = VF * UF
+; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT:  vp<%2> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
+; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
+; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%13> = load vp<%6>
+; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%13>, ir<1.000000e+00>
+; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:      WIDEN store vp<%7>, ir<%conv1>
+; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
+; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
+; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index bce965c723101..bb716d78ca411 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -7,24 +7,22 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP1]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP7]], i32 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP10]], ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -67,23 +65,23 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP7]], i32 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
new file mode 100644
index 0000000000000..2dd47d5c1ea8a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -0,0 +1,560 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; Dependence distance between read and write is greater than the trip
+; count of the loop.  Thus, values written are never read for any
+; valid vectorization of the loop.
+define void @test(ptr %p) {
+; IF-EVL-LABEL: @test(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP7]], 200
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 8 [[TMP12]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 8
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 200
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 8
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 8
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 200
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 8
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 8
+  %offset = add i64 %iv, 200
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 8
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Dependence distance is less than trip count, thus we must prove that
+; chosen VF guaranteed to be less than dependence distance.
+define void @test_may_clobber1(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber1(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 100
+; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 100
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber1(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 100
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; NO-VP-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 100
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 100
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_may_clobber2(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber2(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 9
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber2(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 9
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 9
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_may_clobber3(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber3(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 10
+; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 10
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber3(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 10
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; NO-VP-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; NO-VP-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 10
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 10
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Trviailly no overlap due to maximum possible value of VLEN and LMUL
+define void @trivial_due_max_vscale(ptr %p) {
+; IF-EVL-LABEL: @trivial_due_max_vscale(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP9]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP7]], 8192
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP12]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 8192
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @trivial_due_max_vscale(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 8192
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 8192
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Dependence distance could be violated via LMUL>=2 or interleaving
+define void @no_high_lmul_or_interleave(ptr %p) {
+; IF-EVL-LABEL: @no_high_lmul_or_interleave(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; IF-EVL-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
+; IF-EVL-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], 1024
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; IF-EVL-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
+; IF-EVL-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 1024
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @no_high_lmul_or_interleave(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 1024
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 1024
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 3001
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @non-power-2-storeloadforward(ptr %A) {
+; IF-EVL-LABEL: @non-power-2-storeloadforward(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IV]], -3
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = add nsw i64 [[IV]], 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; IF-EVL:       for.end:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @non-power-2-storeloadforward(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IV]], -3
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[TMP2:%.*]] = add nsw i64 [[IV]], 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; NO-VP-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; NO-VP-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
+; NO-VP-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; NO-VP:       for.end:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 16, %entry ], [ %iv.next, %for.body ]
+  %0 = add nsw i64 %iv, -3
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %0
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = add nsw i64 %iv, 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %2
+  %3 = load i32, ptr %arrayidx2, align 4
+  %add3 = add nsw i32 %3, %1
+  %arrayidx5 = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add3, ptr %arrayidx5, align 4
+  %iv.next = add i64 %iv, 1
+  %lftr.wideiv = trunc i64 %iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
new file mode 100644
index 0000000000000..3477c8d879106
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=z16 -force-target-instruction-cost=1 -S %s | FileCheck %s
+
+target triple = "systemz-unknown-linux-unknown"
+
+define void @test_scalar_steps_target_instruction_cost(ptr %dst) {
+; CHECK-LABEL: define void @test_scalar_steps_target_instruction_cost(
+; CHECK-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IV]], <i64 8, i64 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
+; CHECK:       [[PRED_STORE_IF1]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; CHECK:       [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 30, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], 22
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i64, ptr %dst, i64 %iv
+  store i64 %iv, ptr %gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 3
+  %cmp = icmp ult i64 %iv, 22
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
index 83dc618fc83ff..02f8bc61cbb62 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
@@ -12,9 +12,11 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %mul = mul nsw i64 %iv, %s
-  %gep = getelementptr inbounds double, ptr %Src, i64 %mul
-  %bct = bitcast ptr %gep to ptr
+  %gep.src = getelementptr inbounds double, ptr %Src, i64 %mul
+  %bct = bitcast ptr %gep.src to ptr
   %ld = load i64, ptr %bct
+  %gep.data = getelementptr inbounds i64, ptr %data, i64 %iv
+  store i64 %ld, ptr %gep.data
   %iv.next = add nuw nsw i64 %iv, 1
   %cmp110.us = icmp slt i64 %iv.next, %n
   br i1 %cmp110.us, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
new file mode 100644
index 0000000000000..fcf1ba072a62c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mtriple=s390x-unknown-linux -mcpu=z16 -S %s | FileCheck %s
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+
+@src = external global [8 x i32], align 4
+
+; Test case where scalar steps are used by both a VPReplicateRecipe (demands
+; all scalar lanes) and a VPInstruction that only demands the first lane.
+; Test case for https://github.com/llvm/llvm-project/issues/88849.
+define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(ptr noalias %dst, ptr noalias %src.1) {
+; CHECK-LABEL: define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC_1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[DST]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK:       [[PRED_STORE_IF1]]:
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i32 [[TMP27]], ptr [[DST]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; CHECK:       [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK:       [[PRED_STORE_IF3]]:
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    store i32 [[TMP29]], ptr [[DST]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; CHECK:       [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK:       [[PRED_STORE_IF5]]:
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; CHECK:       [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L_1]], 0
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[IV_OR:%.*]] = or disjoint i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 [[IV_OR]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    store i32 [[L_2]], ptr [[DST]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %mul.iv = mul nsw i64 %iv, 4
+  %gep.src.1 = getelementptr inbounds i8, ptr %src.1, i64 %mul.iv
+  %l.1 = load i8, ptr %gep.src.1, align 1
+  %c = icmp eq i8 %l.1, 0
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  %iv.or = or disjoint i64 %iv, 4
+  %gep.src = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 %iv.or
+  %l.2 = load i32, ptr %gep.src, align 4
+  store i32 %l.2, ptr %dst, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 4
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 1627292732b6a..3803921799858 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s --check-prefix=CHECK-AVX-VF2
 ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF8
 ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF16
 
@@ -20,6 +21,27 @@ declare float @tanf(float) #0
 declare double @llvm.tan.f64(double) #0
 declare float @llvm.tan.f32(float) #0
 
+declare float @acosf(float) #0
+declare float @llvm.acos.f32(float) #0
+
+declare double @asin(double) #0
+declare float @asinf(float) #0
+declare double @llvm.asin.f64(double) #0
+declare float @llvm.asin.f32(float) #0
+
+declare double @atan(double) #0
+declare float @atanf(float) #0
+declare double @llvm.atan.f64(double) #0
+declare float @llvm.atan.f32(float) #0
+
+declare double @cosh(double) #0
+declare float @coshf(float) #0
+declare double @llvm.cosh.f64(double) #0
+declare float @llvm.cosh.f32(float) #0
+
+declare float @tanhf(float) #0
+declare float @llvm.tanh.f32(float) #0
+
 declare double @pow(double, double) #0
 declare float @powf(float, float) #0
 declare double @llvm.pow.f64(double, double) #0
@@ -274,6 +296,10 @@ define void @tan_f64(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @tan_f64(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @tan_f64(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -328,6 +354,10 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @tan_f64_intrinsic(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @tan_f64_intrinsic(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -377,6 +407,360 @@ for.end:
   ret void
 }
 
+define void @acos_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @acos_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @acosf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @acos_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @acos_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.acos.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @asin_f64(ptr nocapture %varray) {
+; CHECK-AVX512-VF8-LABEL: @asin_f64(
+; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @asin(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @asin_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @asin_f32(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @asinf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @asin_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-AVX512-VF8-LABEL: @asin_f64_intrinsic(
+; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.asin.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @asin_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @asin_f32_intrinsic(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.asin.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @atan_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f64(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX-VF2-LABEL: @atan_f64(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
+; CHECK-AVX512-VF8-LABEL: @atan_f64(
+; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atan(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @atan_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @atan_f32(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atanf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @atan_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX-VF2-LABEL: @atan_f64_intrinsic(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
+; CHECK-AVX512-VF8-LABEL: @atan_f64_intrinsic(
+; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.atan.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @atan_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @atan_f32_intrinsic(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.atan.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cosh_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @cosh_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @coshf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cosh_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @cosh_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.cosh.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @tanh_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @tanh_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.tanh.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
 ; CHECK:    [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index bee5b397ecb47..bb35806cea20a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -76,7 +76,7 @@ attributes #0 = { "target-cpu"="knl" }
 ; CHECK:     LV: Found trip count: 3
 ; CHECK:     LV: Found uniform instruction:   {{%.*}} = icmp eq i32 {{%.*}}, 0
 ; CHECK-NOT: LV: Found uniform instruction:   {{%.*}} = load i32, ptr {{%.*}}, align 1
-; CHECK:     LV: Found not uniform being ScalarWithPredication:  {{%.*}} = load i32, ptr {{%.*}}, align 1
+; CHECK:     LV: Found not uniform due to requiring predication:  {{%.*}} = load i32, ptr {{%.*}}, align 1
 ; CHECK:     LV: Found scalar instruction:   {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
 ;
 ; FORCE-LABEL: @PR40816(
diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
index 42a9ab0ca270f..32964d650ea0f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
@@ -23,42 +23,15 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE8:.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
-; CHECK:       [[PRED_SDIV_IF]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
-; CHECK:       [[PRED_SDIV_CONTINUE]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
-; CHECK:       [[PRED_SDIV_IF3]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE4]]
-; CHECK:       [[PRED_SDIV_CONTINUE4]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE]] ], [ [[TMP11]], %[[PRED_SDIV_IF3]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]]
-; CHECK:       [[PRED_SDIV_IF5]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE6]]
-; CHECK:       [[PRED_SDIV_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP14]], %[[PRED_SDIV_IF5]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_SDIV_IF7:.*]], label %[[PRED_SDIV_CONTINUE8]]
-; CHECK:       [[PRED_SDIV_IF7]]:
 ; CHECK-NEXT:    [[TMP17:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE8]]
-; CHECK:       [[PRED_SDIV_CONTINUE8]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE6]] ], [ [[TMP17]], %[[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP9]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP9]], [[CONV61]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP17]], [[CONV61]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[TMP5]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32
 ; CHECK-NEXT:    [[TMP23:%.*]] = mul i32 [[X]], [[TMP19]]
@@ -254,125 +227,9 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[MUL_1_I:%.*]] = mul i64 [[X]], [[X]]
 ; CHECK-NEXT:    [[MUL_2_I:%.*]] = mul i64 [[MUL_1_I]], [[X]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[N]], 4294967295
-; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_UREM_CONTINUE6]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
-; CHECK:       [[PRED_UREM_IF]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i64 [[TMP7]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = urem i64 [[TMP7]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP12:%.*]] = urem i64 [[TMP10]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP13:%.*]] = udiv i64 [[TMP12]], [[X]]
-; CHECK-NEXT:    [[TMP14:%.*]] = urem i64 [[TMP12]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
-; CHECK:       [[PRED_UREM_CONTINUE]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP11]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP21]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
-; CHECK:       [[PRED_UREM_IF1]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP23:%.*]] = udiv i64 [[TMP22]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP23]], i32 1
-; CHECK-NEXT:    [[TMP25:%.*]] = urem i64 [[TMP22]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP26:%.*]] = udiv i64 [[TMP25]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP27:%.*]] = urem i64 [[TMP25]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP28:%.*]] = udiv i64 [[TMP27]], [[X]]
-; CHECK-NEXT:    [[TMP29:%.*]] = urem i64 [[TMP27]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE2]]
-; CHECK:       [[PRED_UREM_CONTINUE2]]:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_UREM_CONTINUE]] ], [ [[TMP24]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP25]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP32:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP26]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP33:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP27]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP34:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP28]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP35:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP29]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP36]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]]
-; CHECK:       [[PRED_UREM_IF3]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP38:%.*]] = udiv i64 [[TMP37]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP38]], i32 2
-; CHECK-NEXT:    [[TMP40:%.*]] = urem i64 [[TMP37]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP41:%.*]] = udiv i64 [[TMP40]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP42:%.*]] = urem i64 [[TMP40]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP43:%.*]] = udiv i64 [[TMP42]], [[X]]
-; CHECK-NEXT:    [[TMP44:%.*]] = urem i64 [[TMP42]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE4]]
-; CHECK:       [[PRED_UREM_CONTINUE4]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = phi <4 x i64> [ [[TMP30]], %[[PRED_UREM_CONTINUE2]] ], [ [[TMP39]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP46:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP40]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP41]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP48:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP42]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP49:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP43]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP50:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP44]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP51]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]]
-; CHECK:       [[PRED_UREM_IF5]]:
-; CHECK-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP53:%.*]] = udiv i64 [[TMP52]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i64> [[TMP45]], i64 [[TMP53]], i32 3
-; CHECK-NEXT:    [[TMP55:%.*]] = urem i64 [[TMP52]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP56:%.*]] = udiv i64 [[TMP55]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP57:%.*]] = urem i64 [[TMP55]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP58:%.*]] = udiv i64 [[TMP57]], [[X]]
-; CHECK-NEXT:    [[TMP59:%.*]] = urem i64 [[TMP57]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE6]]
-; CHECK:       [[PRED_UREM_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP60:%.*]] = phi <4 x i64> [ [[TMP45]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP54]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP61:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP55]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP62:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP56]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP63:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP57]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP64:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP58]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP65:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP59]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i64> [[TMP60]], i32 0
-; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[X]], [[TMP66]]
-; CHECK-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[TMP17]]
-; CHECK-NEXT:    [[TMP69:%.*]] = mul i64 [[TMP68]], [[X]]
-; CHECK-NEXT:    [[TMP70:%.*]] = add i64 [[TMP69]], [[TMP19]]
-; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP70]], [[X]]
-; CHECK-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[TMP20]]
-; CHECK-NEXT:    [[TMP73:%.*]] = shl i64 [[TMP72]], 32
-; CHECK-NEXT:    [[TMP74:%.*]] = ashr i64 [[TMP73]], 32
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP74]]
-; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr i64, ptr [[TMP75]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP60]], ptr [[TMP76]], i32 4, <4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP77]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[DIV_I:%.*]] = udiv i64 [[IV]], [[MUL_2_I]]
 ; CHECK-NEXT:    [[REM_I:%.*]] = urem i64 [[IV]], [[MUL_2_I]]
 ; CHECK-NEXT:    [[DIV_1_I:%.*]] = udiv i64 [[REM_I]], [[MUL_1_I]]
@@ -391,7 +248,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    store i64 [[DIV_I]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -432,6 +289,4 @@ exit:
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index b1f7516f3c8dc..cc1d11754b27e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -188,7 +188,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
 ; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[ARG1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 30
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 54
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG2]], i64 8
@@ -379,7 +379,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 28
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 52
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 3
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index a49090eecac41..eb2de69afbe58 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -38,7 +38,7 @@ define i32 @main(ptr %ptr) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 32
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 40
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[CONV3]], -1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
index ba777f52f532b..89821dd98c520 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Make sure that we can compile the test without crash.
-define void @barney() {
+define void @barney(ptr %dst) {
 
 ; CHECK-LABEL: @barney(
 ; CHECK:       middle.block:
@@ -43,6 +43,8 @@ bb48:                                             ; preds = %bb46
 bb50:                                             ; preds = %bb50, %bb19
   %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
   %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+  %gep = getelementptr inbounds i8, ptr %dst, i64 %tmp53
+  store i8 1, ptr %gep
   %tmp54 = add i32 %tmp52, 12
   %tmp55 = add i32 %tmp52, 13
   %tmp56 = add nuw nsw i64 %tmp53, 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 743ca20f92b49..5aac001a8b9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -19,7 +19,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
 ; CHECK-NEXT:    [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1
 ; CHECK-NEXT:    [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 60
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
new file mode 100644
index 0000000000000..46c62e1ea7741
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -0,0 +1,646 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mtriple=x86_64-apple-macosx -mcpu=skylake-avx512 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=COST %s
+; RUN: opt -p loop-vectorize -mtriple=x86_64-apple-macosx -mcpu=skylake-avx512 -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=FORCED %s
+
+define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
+; COST-LABEL: define void @switch_default_to_latch_common_dest(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN:.*]]
+; COST-NEXT:      i64 13, label %[[IF_THEN]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @switch_default_to_latch_common_dest(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN:.*]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %loop.latch [
+  i64 -12, label %if.then
+  i64 13, label %if.then
+  ]
+
+if.then:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch_all_dests_distinct(ptr %start, ptr %end) {
+; COST-LABEL: define void @switch_all_dests_distinct(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; COST-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; COST-NEXT:      i64 0, label %[[IF_THEN_3:.*]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN_1]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[IF_THEN_2]]:
+; COST-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[IF_THEN_3]]:
+; COST-NEXT:    store i64 1, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[DEFAULT]]:
+; COST-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @switch_all_dests_distinct(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; FORCED-NEXT:      i64 0, label %[[IF_THEN_3:.*]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN_1]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[IF_THEN_2]]:
+; FORCED-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[IF_THEN_3]]:
+; FORCED-NEXT:    store i64 1, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[DEFAULT]]:
+; FORCED-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %default [
+  i64 -12, label %if.then.1
+  i64 13, label %if.then.2
+  i64 0, label %if.then.3
+  ]
+
+if.then.1:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i64 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.3:
+  store i64 1, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i64 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+
+define void @switch_multiple_common_dests(ptr %start, ptr %end) {
+; COST-LABEL: define void @switch_multiple_common_dests(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; COST-NEXT:      i64 0, label %[[IF_THEN_1]]
+; COST-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; COST-NEXT:      i64 14, label %[[IF_THEN_2]]
+; COST-NEXT:      i64 15, label %[[IF_THEN_2]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN_1]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[IF_THEN_2]]:
+; COST-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[DEFAULT]]:
+; COST-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @switch_multiple_common_dests(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; FORCED-NEXT:      i64 0, label %[[IF_THEN_1]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; FORCED-NEXT:      i64 14, label %[[IF_THEN_2]]
+; FORCED-NEXT:      i64 15, label %[[IF_THEN_2]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN_1]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[IF_THEN_2]]:
+; FORCED-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[DEFAULT]]:
+; FORCED-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %default [
+  i64 -12, label %if.then.1
+  i64 0, label %if.then.1
+  i64 13, label %if.then.2
+  i64 14, label %if.then.2
+  i64 15, label %if.then.2
+  ]
+
+if.then.1:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i64 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i64 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
+; COST-LABEL: define void @switch4_default_common_dest_with_case(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; COST-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; COST-NEXT:      i64 0, label %[[DEFAULT]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN_1]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[IF_THEN_2]]:
+; COST-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[DEFAULT]]:
+; COST-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @switch4_default_common_dest_with_case(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; FORCED-NEXT:      i64 0, label %[[DEFAULT]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN_1]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[IF_THEN_2]]:
+; FORCED-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[DEFAULT]]:
+; FORCED-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %default [
+  i64 -12, label %if.then.1
+  i64 13, label %if.then.2
+  i64 0, label %default
+  ]
+
+if.then.1:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i64 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i64 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, i64 %x) {
+; COST-LABEL: define void @switch_under_br_default_common_dest_with_case(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    [[C:%.*]] = icmp ule i64 [[L]], [[X]]
+; COST-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; COST:       [[THEN]]:
+; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; COST-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; COST-NEXT:      i64 0, label %[[DEFAULT]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN_1]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[IF_THEN_2]]:
+; COST-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[DEFAULT]]:
+; COST-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @switch_under_br_default_common_dest_with_case(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    [[C:%.*]] = icmp ule i64 [[L]], [[X]]
+; FORCED-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; FORCED:       [[THEN]]:
+; FORCED-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; FORCED-NEXT:      i64 0, label %[[DEFAULT]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN_1]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[IF_THEN_2]]:
+; FORCED-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[DEFAULT]]:
+; FORCED-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  %c = icmp ule i64 %l, %x
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  switch i64 %l, label %default [
+  i64 -12, label %if.then.1
+  i64 13, label %if.then.2
+  i64 0, label %default
+  ]
+
+if.then.1:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i64 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i64 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, i64 %x) {
+; COST-LABEL: define void @br_under_switch_default_common_dest_with_case(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; COST-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; COST-NEXT:      i64 0, label %[[DEFAULT]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN_1]]:
+; COST-NEXT:    [[C:%.*]] = icmp ule i64 [[L]], [[X]]
+; COST-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[IF_THEN_2]]
+; COST:       [[THEN]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[DEFAULT]]
+; COST:       [[IF_THEN_2]]:
+; COST-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[DEFAULT]]:
+; COST-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @br_under_switch_default_common_dest_with_case(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
+; FORCED-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
+; FORCED-NEXT:      i64 13, label %[[IF_THEN_2:.*]]
+; FORCED-NEXT:      i64 0, label %[[DEFAULT]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN_1]]:
+; FORCED-NEXT:    [[C:%.*]] = icmp ule i64 [[L]], [[X]]
+; FORCED-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[IF_THEN_2]]
+; FORCED:       [[THEN]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[DEFAULT]]
+; FORCED:       [[IF_THEN_2]]:
+; FORCED-NEXT:    store i64 0, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[DEFAULT]]:
+; FORCED-NEXT:    store i64 2, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %default [
+  i64 -12, label %if.then.1
+  i64 13, label %if.then.2
+  i64 0, label %default
+  ]
+
+if.then.1:
+  %c = icmp ule i64 %l, %x
+  br i1 %c, label %then, label %if.then.2
+
+then:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %default
+
+if.then.2:
+  store i64 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i64 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @large_number_of_cases(ptr %start, ptr %end) {
+; COST-LABEL: define void @large_number_of_cases(
+; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COST:       [[LOOP_HEADER]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; COST-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
+; COST-NEXT:      i64 1, label %[[IF_THEN:.*]]
+; COST-NEXT:      i64 3, label %[[IF_THEN]]
+; COST-NEXT:      i64 11, label %[[IF_THEN]]
+; COST-NEXT:      i64 99, label %[[IF_THEN]]
+; COST-NEXT:      i64 213, label %[[IF_THEN]]
+; COST-NEXT:      i64 238, label %[[IF_THEN]]
+; COST-NEXT:      i64 513, label %[[IF_THEN]]
+; COST-NEXT:      i64 791, label %[[IF_THEN]]
+; COST-NEXT:      i64 899, label %[[IF_THEN]]
+; COST-NEXT:    ]
+; COST:       [[IF_THEN]]:
+; COST-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; COST-NEXT:    br label %[[LOOP_LATCH]]
+; COST:       [[LOOP_LATCH]]:
+; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST:       [[EXIT]]:
+; COST-NEXT:    ret void
+;
+; FORCED-LABEL: define void @large_number_of_cases(
+; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
+; FORCED-NEXT:  [[ENTRY:.*]]:
+; FORCED-NEXT:    br label %[[LOOP_HEADER:.*]]
+; FORCED:       [[LOOP_HEADER]]:
+; FORCED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; FORCED-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
+; FORCED-NEXT:      i64 1, label %[[IF_THEN:.*]]
+; FORCED-NEXT:      i64 3, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 11, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 99, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 213, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 238, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 513, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 791, label %[[IF_THEN]]
+; FORCED-NEXT:      i64 899, label %[[IF_THEN]]
+; FORCED-NEXT:    ]
+; FORCED:       [[IF_THEN]]:
+; FORCED-NEXT:    store i64 42, ptr [[PTR_IV]], align 1
+; FORCED-NEXT:    br label %[[LOOP_LATCH]]
+; FORCED:       [[LOOP_LATCH]]:
+; FORCED-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
+; FORCED-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; FORCED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; FORCED:       [[EXIT]]:
+; FORCED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i64, ptr %ptr.iv, align 1
+  switch i64 %l, label %loop.latch [
+  i64 1, label %if.then
+  i64 3, label %if.then
+  i64 11, label %if.then
+  i64 99, label %if.then
+  i64 213, label %if.then
+  i64 238, label %if.then
+  i64 513, label %if.then
+  i64 791, label %if.then
+  i64 899, label %if.then
+  ]
+
+if.then:
+  store i64 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
index 27038f3a24b66..b3a5836c6bbde 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
@@ -456,6 +456,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @asin_f32_intrinsic(
+;CHECK: vasinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.asin.f32(float) nounwind readnone
+define void @asin_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.asin.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @acos_f32(
 ;CHECK: vacosf{{.*}}<4 x float>
 ;CHECK: ret void
@@ -481,6 +506,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @acos_f32_intrinsic(
+;CHECK: vacosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.acos.f32(float) nounwind readnone
+define void @acos_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.acos.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @atan_f32(
 ;CHECK: vatanf{{.*}}<4 x float>
 ;CHECK: ret void
@@ -506,6 +556,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @atan_f32_intrinsic(
+;CHECK: vatanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.atan.f32(float) nounwind readnone
+define void @atan_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.atan.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @sinh_f32(
 ;CHECK: vsinhf{{.*}}<4 x float>
 ;CHECK: ret void
@@ -531,6 +606,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @sinh_f32_intrinsic(
+;CHECK: vsinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.sinh.f32(float) nounwind readnone
+define void @sinh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.sinh.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @cosh_f32(
 ;CHECK: vcoshf{{.*}}<4 x float>
 ;CHECK: ret void
@@ -556,6 +656,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @cosh_f32_intrinsic(
+;CHECK: vcoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.cosh.f32(float) nounwind readnone
+define void @cosh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.cosh.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @tanh_f32(
 ;CHECK: vtanhf{{.*}}<4 x float>
 ;CHECK: ret void
@@ -581,6 +706,31 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+;CHECK-LABEL: @tanh_f32_intrinsic(
+;CHECK: vtanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.tanh.f32(float) nounwind readnone
+define void @tanh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.tanh.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  store float %call, ptr %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 ;CHECK-LABEL: @asinh_f32(
 ;CHECK: vasinhf{{.*}}<4 x float>
 ;CHECK: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
index feaa5fa2fc4d3..eee1b6f35d1b7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -238,7 +238,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; CHECK:       pred.udiv.continue:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
@@ -250,7 +249,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]]
 ; CHECK-NEXT:    [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
@@ -314,7 +312,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 0
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; SINK-GATHER:       pred.udiv.continue:
-; SINK-GATHER-NEXT:    [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
 ; SINK-GATHER-NEXT:    [[TMP8:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
 ; SINK-GATHER-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1
 ; SINK-GATHER-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]]
@@ -326,7 +323,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; SINK-GATHER:       pred.udiv.continue2:
-; SINK-GATHER-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
 ; SINK-GATHER-NEXT:    [[TMP16:%.*]] = phi <8 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF1]] ]
 ; SINK-GATHER-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2
 ; SINK-GATHER-NEXT:    br i1 [[TMP17]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
@@ -338,7 +334,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; SINK-GATHER:       pred.udiv.continue4:
-; SINK-GATHER-NEXT:    [[TMP23:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE2]] ], [ [[TMP20]], [[PRED_UDIV_IF3]] ]
 ; SINK-GATHER-NEXT:    [[TMP24:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP22]], [[PRED_UDIV_IF3]] ]
 ; SINK-GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3
 ; SINK-GATHER-NEXT:    br i1 [[TMP25]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
@@ -350,7 +345,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
 ; SINK-GATHER:       pred.udiv.continue6:
-; SINK-GATHER-NEXT:    [[TMP31:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP28]], [[PRED_UDIV_IF5]] ]
 ; SINK-GATHER-NEXT:    [[TMP32:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP30]], [[PRED_UDIV_IF5]] ]
 ; SINK-GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4
 ; SINK-GATHER-NEXT:    br i1 [[TMP33]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
@@ -362,7 +356,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; SINK-GATHER:       pred.udiv.continue8:
-; SINK-GATHER-NEXT:    [[TMP39:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE6]] ], [ [[TMP36]], [[PRED_UDIV_IF7]] ]
 ; SINK-GATHER-NEXT:    [[TMP40:%.*]] = phi <8 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP38]], [[PRED_UDIV_IF7]] ]
 ; SINK-GATHER-NEXT:    [[TMP41:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5
 ; SINK-GATHER-NEXT:    br i1 [[TMP41]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
@@ -374,7 +367,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
 ; SINK-GATHER:       pred.udiv.continue10:
-; SINK-GATHER-NEXT:    [[TMP47:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP44]], [[PRED_UDIV_IF9]] ]
 ; SINK-GATHER-NEXT:    [[TMP48:%.*]] = phi <8 x i32> [ [[TMP40]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP46]], [[PRED_UDIV_IF9]] ]
 ; SINK-GATHER-NEXT:    [[TMP49:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6
 ; SINK-GATHER-NEXT:    br i1 [[TMP49]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
@@ -386,7 +378,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
 ; SINK-GATHER:       pred.udiv.continue12:
-; SINK-GATHER-NEXT:    [[TMP55:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP52]], [[PRED_UDIV_IF11]] ]
 ; SINK-GATHER-NEXT:    [[TMP56:%.*]] = phi <8 x i32> [ [[TMP48]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP54]], [[PRED_UDIV_IF11]] ]
 ; SINK-GATHER-NEXT:    [[TMP57:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7
 ; SINK-GATHER-NEXT:    br i1 [[TMP57]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14]]
@@ -398,7 +389,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
 ; SINK-GATHER:       pred.udiv.continue14:
-; SINK-GATHER-NEXT:    [[TMP63:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE12]] ], [ [[TMP60]], [[PRED_UDIV_IF13]] ]
 ; SINK-GATHER-NEXT:    [[TMP64:%.*]] = phi <8 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP62]], [[PRED_UDIV_IF13]] ]
 ; SINK-GATHER-NEXT:    [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]]
 ; SINK-GATHER-NEXT:    [[TMP66]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
index 314a133debbd7..98a942a501077 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
@@ -51,7 +51,7 @@ define void @pr97452_scalable_vf1_for_no_live_out(ptr %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[L]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    store i64 [[FOR]], ptr [[GEP_DST]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
@@ -67,7 +67,7 @@ loop:
   %gep = getelementptr inbounds i64, ptr %src, i64 %iv
   %l = load i64, ptr %gep, align 8
   %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
-  store i64 %l, ptr %gep.dst
+  store i64 %for, ptr %gep.dst
   %ec = icmp eq i64 %iv, 22
   br i1 %ec, label %exit, label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index bff730f263ab2..06fbeafba31c0 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -60,12 +60,11 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED2:%.+]]> = ir<%rem>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.2
+; CHECK-NEXT: Successor(s): loop.1
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.2:
+; CHECK-NEXT: loop.1:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -135,20 +134,19 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:  Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT:     REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep>
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%rem>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.1
+; CHECK-NEXT: Successor(s): loop.0
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.1:
+; CHECK-NEXT: loop.0:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -209,25 +207,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:   EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
-; CHECK-NEXT: Successor(s): pred.srem
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.srem: {
-; CHECK-NEXT:   pred.srem.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK]]>
-; CHECK-NEXT:   Successor(s): pred.srem.if, pred.srem.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:   pred.srem.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> (S->V)
-; CHECK-NEXT:   Successor(s): pred.srem.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:   pred.srem.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%rem>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
-; CHECK-NEXT:   WIDEN ir<%add> = add vp<[[PRED]]>, ir<%recur.next>
+; CHECK-NEXT:   WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
+; CHECK-NEXT:   WIDEN ir<%add> = add ir<%rem>, ir<%recur.next>
 ; CHECK-NEXT:   WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add>
 ; CHECK-NEXT:   EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]>, ir<%and.red.next>, ir<%and.red>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -322,8 +303,8 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK:        pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%lv.2> = load ir<%gep>
+; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%conv.lv.2> = sext ir<%lv.2>
 ; CHECK-NEXT:     REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem>
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]>
@@ -332,13 +313,11 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK:        pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED1:%.+]]> = ir<%rem>
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED2:%.+]]> = ir<%lv.2>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT:   Successor(s): loop.3
+; CHECK-NEXT:   Successor(s): loop.2
 ; CHECK-EMPTY:
-; CHECK:      loop.3:
+; CHECK:      loop.2:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -426,13 +405,11 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%rem>
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED2:%.+]]> = ir<%rem.div>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.3
+; CHECK-NEXT: Successor(s): loop.2
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.3:
+; CHECK-NEXT: loop.2:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -503,19 +480,18 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.if:
-; CHECK-NEXT:       REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:       vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>
 ; CHECK-NEXT:       REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]>
+; CHECK-NEXT:       REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:       REPLICATE store ir<%val>, ir<%gep.dst>
 ; CHECK-NEXT:     Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.continue:
-; CHECK-NEXT:       PHI-PREDICATED-INSTRUCTION vp<[[P_VAL:%.+]]> = ir<%val>
 ; CHECK-NEXT:     No successors
 ; CHECK-NEXT:   }
-; CHECK-NEXT:   Successor(s): loop.1
+; CHECK-NEXT:   Successor(s): loop.0
 ; CHECK-EMPTY:
-; CHECK-NEXT:   loop.1:
+; CHECK-NEXT:   loop.0:
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   No successors
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
index eade22f3fe11f..ecb57c539a40e 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -388,7 +388,6 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; CHECK:       pred.sdiv.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
@@ -401,7 +400,6 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP21]], i32 1
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE4]]
 ; CHECK:       pred.sdiv.continue4:
-; CHECK-NEXT:    [[TMP23:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF3]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP22]], [[PRED_SDIV_IF3]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[TMP5]], <2 x i32> [[TMP24]]
@@ -466,7 +464,6 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = sdiv i32 [[TMP8]], [[TMP14]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.sdiv.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_SDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP13]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.sdiv.if2:
@@ -474,7 +471,6 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = sdiv i32 [[TMP9]], [[TMP18]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.sdiv.continue3:
-; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_SDIV_IF2]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF2]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP12]], true
 ; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = xor i1 [[TMP13]], true
@@ -577,7 +573,6 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; CHECK:       pred.sdiv.continue:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_SDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
@@ -590,7 +585,6 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP25]], i32 1
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE4]]
 ; CHECK:       pred.sdiv.continue4:
-; CHECK-NEXT:    [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP23]], [[PRED_SDIV_IF3]] ]
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <2 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF3]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>, !dbg [[DBG35]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer, !dbg [[DBG35]]
@@ -666,7 +660,6 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
 ; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = sdiv i32 [[TMP8]], [[TMP22]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.sdiv.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_SDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP25:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP23]], [[PRED_SDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP21]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.sdiv.if2:
@@ -674,7 +667,6 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
 ; UNROLL-NO-VF-NEXT:    [[TMP27:%.*]] = sdiv i32 [[TMP9]], [[TMP26]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.sdiv.continue3:
-; UNROLL-NO-VF-NEXT:    [[TMP28:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF2]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP27]], [[PRED_SDIV_IF2]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP14]], true, !dbg [[DBG35]]
 ; UNROLL-NO-VF-NEXT:    [[TMP31:%.*]] = xor i1 [[TMP15]], true, !dbg [[DBG35]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 7ed42edfc7753..2503520c0ff9d 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -96,7 +96,6 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT:     Successor(s): pred.store.continue
 ; DBG-EMPTY:
 ; DBG-NEXT:     pred.store.continue:
-; DBG-NEXT:       PHI-PREDICATED-INSTRUCTION vp<{{.+}}> = ir<%l>
 ; DBG-NEXT:     No successors
 ; DBG-NEXT:   }
 ; DBG-NEXT:   Successor(s): cond.false.1
@@ -137,7 +136,6 @@ define void @test_scalarize_with_branch_cond(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    br label %pred.store.continue
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP4]], %pred.store.if ]
 ; CHECK-NEXT:    br i1 [[INDUCTION3]], label %pred.store.if4, label %pred.store.continue5
 ; CHECK:       pred.store.if4:
 ; CHECK-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 1
@@ -147,7 +145,6 @@ define void @test_scalarize_with_branch_cond(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    br label %pred.store.continue5
 ; CHECK:       pred.store.continue5:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, %pred.store.continue ], [ [[TMP7]], %pred.store.if4 ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP9]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index b85d68c574a82..a7c9a18127ade 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
 
 declare void @init(ptr nocapture nofree)
 
@@ -17,7 +17,7 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -43,16 +43,16 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK:       pred.load.continue2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer
-; CHECK-NEXT:    [[TMP16]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP15]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -72,7 +72,7 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i16 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -114,7 +114,7 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -140,16 +140,16 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK:       pred.load.continue2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP14]], <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP16]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -169,7 +169,7 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -198,3 +198,101 @@ latch:
 loop_exit:
   ret i32 %accum.next
 }
+
+
+; Test where offset relative to alloca is negative and we shouldn't
+; treat predicated loads as being always dereferenceable.
+define i8 @test_negative_off(i16 %len, ptr %test_base) {
+; CHECK-LABEL: @test_negative_off(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
+; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [64638 x i8]
+  call void @init(ptr %alloca)
+  br label %loop
+loop:
+  %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
+  %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i16 %iv, 1
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
+  %earlycnd = load i1, ptr %test_addr
+  br i1 %earlycnd, label %pred, label %latch
+pred:
+  %addr = getelementptr i8, ptr %alloca, i16 %iv
+  %val = load i8, ptr %addr
+  br label %latch
+latch:
+  %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
+  %accum.next = add i8 %accum, %val.phi
+  %exit = icmp ugt i16 %iv, -990
+  br i1 %exit, label %loop_exit, label %loop
+loop_exit:
+  ret i8 %accum.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index f05ec30619c5d..7c23b603b6e91 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -224,7 +224,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_STORE_IF]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; CHECK:       pred.store.if1:
@@ -234,7 +233,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    store i64 [[TMP9]], ptr [[B]], align 8
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP9]], [[PRED_STORE_IF1]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
@@ -244,7 +242,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[B]], align 8
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP14]], [[PRED_STORE_IF3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.if5:
@@ -254,7 +251,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    store i64 [[TMP19]], ptr [[B]], align 8
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE4]] ], [ [[TMP19]], [[PRED_STORE_IF5]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
@@ -295,7 +291,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF2UF2-NEXT:    store i64 [[TMP5]], ptr [[B:%.*]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VF2UF2:       pred.store.continue:
-; VF2UF2-NEXT:    [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ]
 ; VF2UF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
 ; VF2UF2:       pred.store.if2:
@@ -305,7 +300,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF2UF2-NEXT:    store i64 [[TMP10]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; VF2UF2:       pred.store.continue3:
-; VF2UF2-NEXT:    [[TMP11:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP10]], [[PRED_STORE_IF2]] ]
 ; VF2UF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; VF2UF2-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
 ; VF2UF2:       pred.store.if4:
@@ -315,7 +309,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF2UF2-NEXT:    store i64 [[TMP15]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE5]]
 ; VF2UF2:       pred.store.continue5:
-; VF2UF2-NEXT:    [[TMP16:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE3]] ], [ [[TMP15]], [[PRED_STORE_IF4]] ]
 ; VF2UF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
 ; VF2UF2:       pred.store.if6:
@@ -325,7 +318,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF2UF2-NEXT:    store i64 [[TMP20]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE7]]
 ; VF2UF2:       pred.store.continue7:
-; VF2UF2-NEXT:    [[TMP21:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE5]] ], [ [[TMP20]], [[PRED_STORE_IF6]] ]
 ; VF2UF2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; VF2UF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
 ; VF2UF2-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
@@ -368,7 +360,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF1UF4-NEXT:    store i64 [[TMP9]], ptr [[B:%.*]], align 8
 ; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VF1UF4:       pred.store.continue:
-; VF1UF4-NEXT:    [[TMP10:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_STORE_IF]] ]
 ; VF1UF4-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; VF1UF4:       pred.store.if1:
 ; VF1UF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
@@ -376,7 +367,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF1UF4-NEXT:    store i64 [[TMP12]], ptr [[B]], align 8
 ; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VF1UF4:       pred.store.continue2:
-; VF1UF4-NEXT:    [[TMP13:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP12]], [[PRED_STORE_IF1]] ]
 ; VF1UF4-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; VF1UF4:       pred.store.if3:
 ; VF1UF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
@@ -384,7 +374,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF1UF4-NEXT:    store i64 [[TMP15]], ptr [[B]], align 8
 ; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; VF1UF4:       pred.store.continue4:
-; VF1UF4-NEXT:    [[TMP16:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP15]], [[PRED_STORE_IF3]] ]
 ; VF1UF4-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; VF1UF4:       pred.store.if5:
 ; VF1UF4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
@@ -392,7 +381,6 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF1UF4-NEXT:    store i64 [[TMP18]], ptr [[B]], align 8
 ; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; VF1UF4:       pred.store.continue6:
-; VF1UF4-NEXT:    [[TMP19:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE4]] ], [ [[TMP18]], [[PRED_STORE_IF5]] ]
 ; VF1UF4-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; VF1UF4-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; VF1UF4-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll
new file mode 100644
index 0000000000000..4fe971f5afc0c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=IC1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=IC2 %s
+
+define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
+; IC1-LABEL: define void @switch4_default_common_dest_with_case(
+; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC1:       [[LOOP_HEADER]]:
+; IC1-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC1-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1
+; IC1-NEXT:    switch i8 [[L]], label %[[DEFAULT:.*]] [
+; IC1-NEXT:      i8 -12, label %[[IF_THEN_1:.*]]
+; IC1-NEXT:      i8 13, label %[[IF_THEN_2:.*]]
+; IC1-NEXT:      i8 0, label %[[DEFAULT]]
+; IC1-NEXT:    ]
+; IC1:       [[IF_THEN_1]]:
+; IC1-NEXT:    store i8 42, ptr [[PTR_IV]], align 1
+; IC1-NEXT:    br label %[[LOOP_LATCH]]
+; IC1:       [[IF_THEN_2]]:
+; IC1-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
+; IC1-NEXT:    br label %[[LOOP_LATCH]]
+; IC1:       [[DEFAULT]]:
+; IC1-NEXT:    store i8 2, ptr [[PTR_IV]], align 1
+; IC1-NEXT:    br label %[[LOOP_LATCH]]
+; IC1:       [[LOOP_LATCH]]:
+; IC1-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1
+; IC1-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; IC1-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC1:       [[EXIT]]:
+; IC1-NEXT:    ret void
+;
+; IC2-LABEL: define void @switch4_default_common_dest_with_case(
+; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC2:       [[LOOP_HEADER]]:
+; IC2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC2-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1
+; IC2-NEXT:    switch i8 [[L]], label %[[DEFAULT:.*]] [
+; IC2-NEXT:      i8 -12, label %[[IF_THEN_1:.*]]
+; IC2-NEXT:      i8 13, label %[[IF_THEN_2:.*]]
+; IC2-NEXT:      i8 0, label %[[DEFAULT]]
+; IC2-NEXT:    ]
+; IC2:       [[IF_THEN_1]]:
+; IC2-NEXT:    store i8 42, ptr [[PTR_IV]], align 1
+; IC2-NEXT:    br label %[[LOOP_LATCH]]
+; IC2:       [[IF_THEN_2]]:
+; IC2-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
+; IC2-NEXT:    br label %[[LOOP_LATCH]]
+; IC2:       [[DEFAULT]]:
+; IC2-NEXT:    store i8 2, ptr [[PTR_IV]], align 1
+; IC2-NEXT:    br label %[[LOOP_LATCH]]
+; IC2:       [[LOOP_LATCH]]:
+; IC2-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1
+; IC2-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; IC2-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC2:       [[EXIT]]:
+; IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i8, ptr %ptr.iv, align 1
+  switch i8 %l, label %default [
+  i8 -12, label %if.then.1
+  i8 13, label %if.then.2
+  i8 0, label %default
+  ]
+
+if.then.1:
+  store i8 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i8 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i8 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch_exiting(ptr %start) {
+; IC1-LABEL: define void @switch_exiting(
+; IC1-SAME: ptr [[START:%.*]]) {
+; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC1:       [[LOOP_HEADER]]:
+; IC1-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC1-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH]] [
+; IC1-NEXT:      i64 -12, label %[[IF_THEN:.*]]
+; IC1-NEXT:      i64 100, label %[[EXIT:.*]]
+; IC1-NEXT:    ]
+; IC1:       [[IF_THEN]]:
+; IC1-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]]
+; IC1-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC1-NEXT:    br label %[[LOOP_LATCH]]
+; IC1:       [[LOOP_LATCH]]:
+; IC1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC1-NEXT:    br label %[[LOOP_HEADER]]
+; IC1:       [[EXIT]]:
+; IC1-NEXT:    ret void
+;
+; IC2-LABEL: define void @switch_exiting(
+; IC2-SAME: ptr [[START:%.*]]) {
+; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC2:       [[LOOP_HEADER]]:
+; IC2-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC2-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH]] [
+; IC2-NEXT:      i64 -12, label %[[IF_THEN:.*]]
+; IC2-NEXT:      i64 100, label %[[EXIT:.*]]
+; IC2-NEXT:    ]
+; IC2:       [[IF_THEN]]:
+; IC2-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]]
+; IC2-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC2-NEXT:    br label %[[LOOP_LATCH]]
+; IC2:       [[LOOP_LATCH]]:
+; IC2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC2-NEXT:    br label %[[LOOP_HEADER]]
+; IC2:       [[EXIT]]:
+; IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  switch i64 %iv, label %loop.latch [
+  i64 -12, label %if.then
+  i64 100, label %exit
+  ]
+
+if.then:
+  %gep = getelementptr inbounds i64, ptr %start, i64 %iv
+  store i64 42, ptr %gep, align 1
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch_to_header(ptr %start) {
+; IC1-LABEL: define void @switch_to_header(
+; IC1-SAME: ptr [[START:%.*]]) {
+; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC1:       [[LOOP_HEADER]]:
+; IC1-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_HEADER_BACKEDGE:.*]] ]
+; IC1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC1-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [
+; IC1-NEXT:      i64 120, label %[[LOOP_HEADER_BACKEDGE]]
+; IC1-NEXT:      i64 100, label %[[LOOP_LATCH]]
+; IC1-NEXT:    ]
+; IC1:       [[LOOP_HEADER_BACKEDGE]]:
+; IC1-NEXT:    br label %[[LOOP_HEADER]]
+; IC1:       [[IF_THEN:.*:]]
+; IC1-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison
+; IC1-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC1-NEXT:    unreachable
+; IC1:       [[LOOP_LATCH]]:
+; IC1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; IC1-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER_BACKEDGE]]
+; IC1:       [[EXIT]]:
+; IC1-NEXT:    ret void
+;
+; IC2-LABEL: define void @switch_to_header(
+; IC2-SAME: ptr [[START:%.*]]) {
+; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC2:       [[LOOP_HEADER]]:
+; IC2-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_HEADER_BACKEDGE:.*]] ]
+; IC2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC2-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [
+; IC2-NEXT:      i64 120, label %[[LOOP_HEADER_BACKEDGE]]
+; IC2-NEXT:      i64 100, label %[[LOOP_LATCH]]
+; IC2-NEXT:    ]
+; IC2:       [[LOOP_HEADER_BACKEDGE]]:
+; IC2-NEXT:    br label %[[LOOP_HEADER]]
+; IC2:       [[IF_THEN:.*:]]
+; IC2-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison
+; IC2-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC2-NEXT:    unreachable
+; IC2:       [[LOOP_LATCH]]:
+; IC2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; IC2-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER_BACKEDGE]]
+; IC2:       [[EXIT]]:
+; IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ], [ %iv.next, %loop.header ]
+  %iv.next = add i64 %iv, 1
+  switch i64 %iv, label %loop.latch [
+  i64 120, label %loop.header
+  i64 100, label %loop.latch
+  ]
+
+if.then:
+  %gep = getelementptr inbounds i64, ptr %start, i64 %iv
+  store i64 42, ptr %gep, align 1
+  br label %loop.latch
+
+loop.latch:
+  %cmp = icmp eq i64 %iv.next, 100
+  br i1 %cmp, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @switch_all_to_default(ptr %start) {
+; IC1-LABEL: define void @switch_all_to_default(
+; IC1-SAME: ptr [[START:%.*]]) {
+; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC1:       [[LOOP_HEADER]]:
+; IC1-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC1-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH]] [
+; IC1-NEXT:      i64 120, label %[[LOOP_LATCH]]
+; IC1-NEXT:      i64 100, label %[[LOOP_LATCH]]
+; IC1-NEXT:    ]
+; IC1:       [[LOOP_LATCH]]:
+; IC1-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]]
+; IC1-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; IC1-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC1:       [[EXIT]]:
+; IC1-NEXT:    ret void
+;
+; IC2-LABEL: define void @switch_all_to_default(
+; IC2-SAME: ptr [[START:%.*]]) {
+; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; IC2:       [[LOOP_HEADER]]:
+; IC2-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IC2-NEXT:    switch i64 [[IV]], label %[[LOOP_LATCH]] [
+; IC2-NEXT:      i64 120, label %[[LOOP_LATCH]]
+; IC2-NEXT:      i64 100, label %[[LOOP_LATCH]]
+; IC2-NEXT:    ]
+; IC2:       [[LOOP_LATCH]]:
+; IC2-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]]
+; IC2-NEXT:    store i64 42, ptr [[GEP]], align 1
+; IC2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; IC2-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC2:       [[EXIT]]:
+; IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, 1
+  switch i64 %iv, label %loop.latch [
+  i64 120, label %loop.latch
+  i64 100, label %loop.latch
+  ]
+
+loop.latch:
+  %gep = getelementptr inbounds i64, ptr %start, i64 %iv
+  store i64 42, ptr %gep, align 1
+  %cmp = icmp eq i64 %iv.next, 100
+  br i1 %cmp, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
index 8983c80bf3ef4..9eb90099214e1 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
@@ -510,7 +510,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC1-NEXT:    store i32 [[TMP11]], ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC1-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK-VF4-IC1:       pred.store.continue:
-; CHECK-VF4-IC1-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_STORE_IF]] ]
 ; CHECK-VF4-IC1-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
 ; CHECK-VF4-IC1-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK-VF4-IC1:       pred.store.if3:
@@ -521,7 +520,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC1-NEXT:    store i32 [[TMP17]], ptr [[TMP15]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC1-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK-VF4-IC1:       pred.store.continue4:
-; CHECK-VF4-IC1-NEXT:    [[TMP18:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP16]], [[PRED_STORE_IF3]] ]
 ; CHECK-VF4-IC1-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
 ; CHECK-VF4-IC1-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK-VF4-IC1:       pred.store.if5:
@@ -532,7 +530,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC1-NEXT:    store i32 [[TMP23]], ptr [[TMP21]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC1-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK-VF4-IC1:       pred.store.continue6:
-; CHECK-VF4-IC1-NEXT:    [[TMP24:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE4]] ], [ [[TMP22]], [[PRED_STORE_IF5]] ]
 ; CHECK-VF4-IC1-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
 ; CHECK-VF4-IC1-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
 ; CHECK-VF4-IC1:       pred.store.if7:
@@ -543,7 +540,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC1-NEXT:    store i32 [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC1-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK-VF4-IC1:       pred.store.continue8:
-; CHECK-VF4-IC1-NEXT:    [[TMP30:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE6]] ], [ [[TMP28]], [[PRED_STORE_IF7]] ]
 ; CHECK-VF4-IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4-IC1-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4-IC1-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -636,7 +632,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP18]], ptr [[TMP16]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK-VF4-IC2:       pred.store.continue:
-; CHECK-VF4-IC2-NEXT:    [[TMP19:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_STORE_IF]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if6:
@@ -647,7 +642,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP24]], ptr [[TMP22]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE7]]
 ; CHECK-VF4-IC2:       pred.store.continue7:
-; CHECK-VF4-IC2-NEXT:    [[TMP25:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP23]], [[PRED_STORE_IF6]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if8:
@@ -658,7 +652,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP30]], ptr [[TMP28]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
 ; CHECK-VF4-IC2:       pred.store.continue9:
-; CHECK-VF4-IC2-NEXT:    [[TMP31:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE7]] ], [ [[TMP29]], [[PRED_STORE_IF8]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if10:
@@ -669,7 +662,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP36]], ptr [[TMP34]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
 ; CHECK-VF4-IC2:       pred.store.continue11:
-; CHECK-VF4-IC2-NEXT:    [[TMP37:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE9]] ], [ [[TMP35]], [[PRED_STORE_IF10]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP38]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if12:
@@ -679,7 +671,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP41]], ptr [[TMP39]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE13]]
 ; CHECK-VF4-IC2:       pred.store.continue13:
-; CHECK-VF4-IC2-NEXT:    [[TMP42:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE11]] ], [ [[TMP40]], [[PRED_STORE_IF12]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP43:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if14:
@@ -690,7 +681,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP47]], ptr [[TMP45]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE15]]
 ; CHECK-VF4-IC2:       pred.store.continue15:
-; CHECK-VF4-IC2-NEXT:    [[TMP48:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE13]] ], [ [[TMP46]], [[PRED_STORE_IF14]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP49:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP49]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
 ; CHECK-VF4-IC2:       pred.store.if16:
@@ -701,7 +691,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP53]], ptr [[TMP51]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE17]]
 ; CHECK-VF4-IC2:       pred.store.continue17:
-; CHECK-VF4-IC2-NEXT:    [[TMP54:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE15]] ], [ [[TMP52]], [[PRED_STORE_IF16]] ]
 ; CHECK-VF4-IC2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP55]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]]
 ; CHECK-VF4-IC2:       pred.store.if18:
@@ -712,7 +701,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF4-IC2-NEXT:    store i32 [[TMP59]], ptr [[TMP57]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF4-IC2-NEXT:    br label [[PRED_STORE_CONTINUE19]]
 ; CHECK-VF4-IC2:       pred.store.continue19:
-; CHECK-VF4-IC2-NEXT:    [[TMP60:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE17]] ], [ [[TMP58]], [[PRED_STORE_IF18]] ]
 ; CHECK-VF4-IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-VF4-IC2-NEXT:    [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -804,7 +792,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF1-IC2-NEXT:    store i32 [[TMP17]], ptr [[TMP15]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF1-IC2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK-VF1-IC2:       pred.store.continue:
-; CHECK-VF1-IC2-NEXT:    [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_STORE_IF]] ]
 ; CHECK-VF1-IC2-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; CHECK-VF1-IC2:       pred.store.if5:
 ; CHECK-VF1-IC2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
@@ -813,7 +800,6 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
 ; CHECK-VF1-IC2-NEXT:    store i32 [[TMP21]], ptr [[TMP19]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-VF1-IC2-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK-VF1-IC2:       pred.store.continue6:
-; CHECK-VF1-IC2-NEXT:    [[TMP22:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP20]], [[PRED_STORE_IF5]] ]
 ; CHECK-VF1-IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-VF1-IC2-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1-IC2-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 45596169da3cc..f53c7b2fc6bae 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -34,7 +34,6 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -110,7 +109,6 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -267,7 +265,6 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
new file mode 100644
index 0000000000000..6e3e55a319ec2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug -disable-output %s 2>&1 | FileCheck %s
+
+define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
+; CHECK-NOT: VPlan
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i8, ptr %ptr.iv, align 1
+  switch i8 %l, label %default [
+  i8 -12, label %if.then.1
+  i8 13, label %if.then.2
+  i8 0, label %default
+  ]
+
+if.then.1:
+  store i8 42, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+if.then.2:
+  store i8 0, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+default:
+  store i8 2, ptr %ptr.iv, align 1
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv.next, %end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 313be091f5f09..9c07281a9a8a9 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -46,7 +46,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 
 ; CHECK:      pred.store.continue:
-; CHECK-NEXT:   PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 
@@ -768,8 +767,6 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) {
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED1:%.+]]> = ir<%lv.a>
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED2:%.+]]> = ir<%div>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): loop.2
@@ -854,7 +851,6 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) {
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED2:%.+]]> = ir<%div>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): loop.2
@@ -914,7 +910,6 @@ define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) {
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%l1>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): loop.then.1
@@ -1053,7 +1048,6 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT:     Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.continue:
-; CHECK-NEXT:       PHI-PREDICATED-INSTRUCTION vp<[[P_LOAD:%.+]]> = ir<%l>
 ; CHECK-NEXT:     No successors
 ; CHECK-NEXT:   }
 ; CHECK-NEXT:   Successor(s): loop.1
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
index c196de17c4fe3..1638c1b1dd867 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
@@ -246,8 +246,8 @@ define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; CHECK-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> @llvm.matrix.multiply.v9f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }
 
-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
index c27dd0ceaea60..fa07c51630305 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
@@ -246,8 +246,8 @@ define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; CHECK-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> @llvm.matrix.multiply.v9f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }
 
-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
index d7e3f92354f5f..8ad9e2e77860a 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
@@ -67,11 +67,11 @@ define <4 x i32> @multiply_2x2(<4 x i32> %a, <4 x i32> %b) {
 ; RM-NEXT:    ret <4 x i32> [[TMP28]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
   ret <4 x i32> %c
 }
 
-declare <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
 
 define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
 ; RM-LABEL: @multiply_1x2(
@@ -111,11 +111,11 @@ define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
 ; RM-NEXT:    ret <4 x i32> [[TMP16]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
   ret <4 x i32> %c
 }
 
-declare <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32>, <2 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32)
 
 define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; RM-LABEL: @multiply_2x3(
@@ -248,8 +248,8 @@ define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; RM-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> @llvm.matrix.multiply.v9i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }
 
-declare <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v9i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
index 1fbc7a2c0b87a..243cf84b64647 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
@@ -246,8 +246,8 @@ define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; CHECK-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> @llvm.matrix.multiply.v9i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }
 
-declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v9i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32)
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
index 9a771e04999b9..abba3dd7f7cbf 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
@@ -16,6 +16,12 @@ source_filename = "memprof-tailcall.cc"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16
+
+declare void @_Z2a1v() #0
+
+declare void @_Z2a2v() #0
+
 ; Function Attrs: noinline
 ; IR-LABEL: @_Z3barv()
 define ptr @_Z3barv() local_unnamed_addr #0 {
@@ -58,6 +64,8 @@ define i32 @main() #0 {
   ;; cloned functions.
   ; IR: call ptr @_Z3foov.memprof.1()
   %call1 = tail call ptr @_Z3foov(), !callsite !7
+  %2 = load ptr, ptr @a, align 16
+  call void %2(), !callsite !10
   ret i32 0
 }
 
@@ -79,7 +87,7 @@ attributes #0 = { noinline }
 attributes #1 = { nobuiltin allocsize(0) }
 attributes #2 = { builtin allocsize(0) }
 
-!0 = !{!1, !3}
+!0 = !{!1, !3, !8}
 !1 = !{!2, !"notcold"}
 !2 = !{i64 3186456655321080972, i64 8632435727821051414}
 !3 = !{!4, !"cold"}
@@ -87,3 +95,6 @@ attributes #2 = { builtin allocsize(0) }
 !5 = !{i64 3186456655321080972}
 !6 = !{i64 8632435727821051414}
 !7 = !{i64 -3421689549917153178}
+!8 = !{!9, !"notcold"}
+!9 = !{i64 3186456655321080972, i64 1}
+!10 = !{i64 1}
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll b/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
new file mode 100644
index 0000000000000..1c8be82715f25
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
@@ -0,0 +1,82 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+$__llvm_profile_raw_version = comdat any
+$__llvm_profile_sampling = comdat any
+
+@odd = common dso_local local_unnamed_addr global i32 0, align 4
+@even = common dso_local local_unnamed_addr global i32 0, align 4
+@__llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
+@__llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
+@__llvm_profile_sampling = thread_local global i16 0, comdat
+@llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
+
+define dso_local void @bar(i32 %n) !prof !30 {
+entry:
+  %call = tail call fastcc i32 @cond(i32 %n)
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %if.else, label %if.then, !prof !31
+
+if.then:
+  %0 = load i32, i32* @odd, align 4, !tbaa !32
+  %inc = add i32 %0, 1
+  store i32 %inc, i32* @odd, align 4, !tbaa !32
+  br label %if.end
+
+if.else:
+  %1 = load i32, i32* @even, align 4, !tbaa !32
+  %inc1 = add i32 %1, 1
+  store i32 %inc1, i32* @even, align 4, !tbaa !32
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define internal fastcc i32 @cond(i32 %i) #1 !prof !30 !PGOFuncName !36 {
+entry:
+  %rem = srem i32 %i, 2
+  ret i32 %rem
+}
+
+attributes #1 = { inlinehint noinline }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
+!2 = !{i32 1, !"ProfileSummary", !3}
+!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
+!4 = !{!"ProfileFormat", !"InstrProf"}
+!5 = !{!"TotalCount", i64 500002}
+!6 = !{!"MaxCount", i64 200000}
+!7 = !{!"MaxInternalCount", i64 100000}
+!8 = !{!"MaxFunctionCount", i64 200000}
+!9 = !{!"NumCounts", i64 6}
+!10 = !{!"NumFunctions", i64 4}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 200000, i32 1}
+!14 = !{i32 100000, i64 200000, i32 1}
+!15 = !{i32 200000, i64 200000, i32 1}
+!16 = !{i32 300000, i64 200000, i32 1}
+!17 = !{i32 400000, i64 200000, i32 1}
+!18 = !{i32 500000, i64 100000, i32 4}
+!19 = !{i32 600000, i64 100000, i32 4}
+!20 = !{i32 700000, i64 100000, i32 4}
+!21 = !{i32 800000, i64 100000, i32 4}
+!22 = !{i32 900000, i64 100000, i32 4}
+!23 = !{i32 950000, i64 100000, i32 4}
+!24 = !{i32 990000, i64 100000, i32 4}
+!25 = !{i32 999000, i64 100000, i32 4}
+!26 = !{i32 999900, i64 100000, i32 4}
+!27 = !{i32 999990, i64 100000, i32 4}
+!28 = !{i32 999999, i64 1, i32 6}
+!30 = !{!"function_entry_count", i64 200000}
+!31 = !{!"branch_weights", i32 100000, i32 100000}
+!32 = !{!33, !33, i64 0}
+!33 = !{!"int", !34, i64 0}
+!34 = !{!"omnipotent char", !35, i64 0}
+!35 = !{!"Simple C/C++ TBAA"}
+!36 = !{!"cspgo_bar.c:cond"}
diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
new file mode 100644
index 0000000000000..9d083fe04015e
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof --do-counter-promotion=true --sampled-instrumentation=true --skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
+
+; SAMPLING: $__llvm_profile_sampling = comdat any
+; SAMPLING: @__llvm_profile_sampling = thread_local global i16 0, comdat
+
+define void @foo(i32 %n, i32 %N) {
+; SAMPLING-LABEL: @foo
+; SAMPLING:  %[[VV0:[0-9]+]] = load i16, ptr @__llvm_profile_sampling, align 2
+; SAMPLING:  %[[VV1:[0-9]+]] = icmp ule i16 %[[VV0]], 200
+; SAMPLING:  br i1 %[[VV1]], label {{.*}}, label {{.*}}, !prof !0
+; SAMPLING: {{.*}} = load {{.*}} @__profc_foo{{.*}} 3)
+; SAMPLING-NEXT: add
+; SAMPLING-NEXT: store {{.*}}@__profc_foo{{.*}}3)
+bb:
+  %tmp = add nsw i32 %n, 1
+  %tmp1 = add nsw i32 %n, -1
+  br label %bb2
+
+bb2:
+; PROMO: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+  %i.0 = phi i32 [ 0, %bb ], [ %tmp10, %bb9 ]
+  %tmp3 = icmp slt i32 %i.0, %tmp
+  br i1 %tmp3, label %bb4, label %bb5
+
+bb4:
+  tail call void @bar(i32 1)
+  br label %bb9
+
+bb5:
+  %tmp6 = icmp slt i32 %i.0, %tmp1
+  br i1 %tmp6, label %bb7, label %bb8
+
+bb7:
+  tail call void @bar(i32 2)
+  br label %bb9
+
+bb8:
+  tail call void @bar(i32 3)
+  br label %bb9
+
+bb9:
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V1:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V1]], ptr @__llvm_profile_sampling, align 2
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V2:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V2]], ptr @__llvm_profile_sampling, align 2
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V3:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V3]], ptr @__llvm_profile_sampling, align 2
+; PROMO: %[[LIVEOUT3:[a-z0-9]+]] = phi {{.*}}
+; PROMO-NEXT: %[[LIVEOUT2:[a-z0-9]+]] = phi {{.*}}
+; PROMO-NEXT: %[[LIVEOUT1:[a-z0-9]+]] = phi {{.*}}
+  %tmp10 = add nsw i32 %i.0, 1
+  %tmp11 = icmp slt i32 %tmp10, %N
+  br i1 %tmp11, label %bb2, label %bb12
+
+bb12:
+  ret void
+; PROMO: %[[CHECK1:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}}
+; PROMO-NEXT: add {{.*}} %[[CHECK1]], %[[LIVEOUT1]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}
+; PROMO-NEXT: %[[CHECK2:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 1)
+; PROMO-NEXT: add {{.*}} %[[CHECK2]], %[[LIVEOUT2]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}1)
+; PROMO-NEXT: %[[CHECK3:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 2)
+; PROMO-NEXT: add {{.*}} %[[CHECK3]], %[[LIVEOUT3]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2)
+; PROMO-NOT: @__profc_foo{{.*}})
+
+}
+
+declare void @bar(i32)
+
+; SAMPLING: !0 = !{!"branch_weights", i32 200, i32 65336}
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
new file mode 100644
index 0000000000000..97ad4d00c9d9c
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %S/Inputs/cspgo_bar_sample.ll -o %t2.bc
+; RUN: llvm-lto2 run -lto-cspgo-profile-file=alloc -enable-sampled-instrumentation -lto-cspgo-gen -save-temps -o %t %t1.bc %t2.bc \
+; RUN:   -r=%t1.bc,foo,pl \
+; RUN:   -r=%t1.bc,bar,l \
+; RUN:   -r=%t1.bc,main,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_filename,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_raw_version,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_sampling,pl \
+; RUN:   -r=%t2.bc,bar,pl \
+; RUN:   -r=%t2.bc,odd,pl \
+; RUN:   -r=%t2.bc,even,pl \
+; RUN:   -r=%t2.bc,__llvm_profile_filename,x \
+; RUN:   -r=%t2.bc,__llvm_profile_raw_version,x \
+; RUN:   -r=%t2.bc,__llvm_profile_sampling,
+; RUN: llvm-dis %t.1.4.opt.bc -o - | FileCheck %s --check-prefix=CSGEN
+
+; CSGEN: @__llvm_profile_sampling = thread_local global i16 0, comdat
+; CSGEN: @__profc_
+; CSGEN: @__profd_
+
+source_filename = "cspgo.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+$__llvm_profile_raw_version = comdat any
+$__llvm_profile_sampling = comdat any
+@__llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
+@__llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
+@__llvm_profile_sampling = thread_local global i16 0, comdat
+@llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
+
+define dso_local void @foo() #0 !prof !30 {
+entry:
+  br label %for.body
+
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+  tail call void @bar(i32 %i.06) #3
+  %add = or i32 %i.06, 1
+  tail call void @bar(i32 %add) #3
+  %add1 = add nuw nsw i32 %i.06, 2
+  %cmp = icmp ult i32 %add1, 200000
+  br i1 %cmp, label %for.body, label %for.end, !prof !31
+
+for.end:
+  ret void
+}
+
+; CSGEN-LABEL: @foo
+; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
+; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
+; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
+; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
+; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
+
+declare dso_local void @bar(i32)
+
+define dso_local i32 @main() !prof !30 {
+entry:
+  tail call void @foo()
+  ret i32 0
+}
+; CSGEN-LABEL: @main
+; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
+; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
+; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
+; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
+; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
+!2 = !{i32 1, !"ProfileSummary", !3}
+!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
+!4 = !{!"ProfileFormat", !"InstrProf"}
+!5 = !{!"TotalCount", i64 500002}
+!6 = !{!"MaxCount", i64 200000}
+!7 = !{!"MaxInternalCount", i64 100000}
+!8 = !{!"MaxFunctionCount", i64 200000}
+!9 = !{!"NumCounts", i64 6}
+!10 = !{!"NumFunctions", i64 4}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 200000, i32 1}
+!14 = !{i32 100000, i64 200000, i32 1}
+!15 = !{i32 200000, i64 200000, i32 1}
+!16 = !{i32 300000, i64 200000, i32 1}
+!17 = !{i32 400000, i64 200000, i32 1}
+!18 = !{i32 500000, i64 100000, i32 4}
+!19 = !{i32 600000, i64 100000, i32 4}
+!20 = !{i32 700000, i64 100000, i32 4}
+!21 = !{i32 800000, i64 100000, i32 4}
+!22 = !{i32 900000, i64 100000, i32 4}
+!23 = !{i32 950000, i64 100000, i32 4}
+!24 = !{i32 990000, i64 100000, i32 4}
+!25 = !{i32 999000, i64 100000, i32 4}
+!26 = !{i32 999900, i64 100000, i32 4}
+!27 = !{i32 999990, i64 100000, i32 4}
+!28 = !{i32 999999, i64 1, i32 6}
+!30 = !{!"function_entry_count", i64 1}
+!31 = !{!"branch_weights", i32 100000, i32 1}
+
+; CSGEN: [[PROF]] = !{!"branch_weights", i32 200, i32 65336}
+
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
new file mode 100644
index 0000000000000..dcc1e805ba6f6
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION,SAMPLE-WEIGHT
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=100 -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION100,SAMPLE-WEIGHT100
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+; SAMPLE-VAR: $__llvm_profile_sampling = comdat any
+
+@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
+@__profn_f = private constant [1 x i8] c"f"
+
+; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
+; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
+; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
+; SAMPLE-VAR: @__llvm_prf_nm = private constant {{.*}}, section "__llvm_prf_names", align 1
+; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
+; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
+
+
+define void @f() {
+; SAMPLE-CODE-LABEL: @f(
+; SAMPLE-CODE:  entry:
+; SAMPLE-CODE-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; SAMPLE-DURATION:         [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 200
+; SAMPLE-DURATION100:     [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 100
+; SAMPLE-CODE:         br i1 [[TMP1]], label %[[TMP2:.*]], label %[[TMP4:.*]], !prof !0
+; SAMPLE-CODE:       [[TMP2]]:
+; SAMPLE-CODE-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f
+; SAMPLE-CODE-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; SAMPLE-CODE-NEXT:    store i64 [[TMP3]], ptr @__profc_f
+; SAMPLE-CODE-NEXT:    br label %[[TMP4]]
+; SAMPLE-CODE:       [[TMP4]]:
+; SAMPLE-CODE-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
+; SAMPLE-CODE-NEXT:    store i16 [[TMP5]],  ptr @__llvm_profile_sampling, align 2
+; SAMPLE-CODE-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+; SAMPLE-WEIGHT: !0 = !{!"branch_weights", i32 200, i32 65336}
+; SAMPLE-WEIGHT100: !0 = !{!"branch_weights", i32 100, i32 65436}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
new file mode 100644
index 0000000000000..57d1a0cd33fbe
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof -sampled-instrumentation --sampled-instr-period=1009 --sampled-instr-burst-duration=32 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
+@__profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 32
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
+; CHECK-NEXT:    br label %[[BB4]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i16 [[TMP5]], 1009
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    store i16 [[TMP5]], ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 32, i32 978}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1009}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
new file mode 100644
index 0000000000000..1ad889524bc6a
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-period=1000019 --sampled-instr-burst-duration=3000 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
+@__profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i32 [[TMP0]], 3000
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
+; CHECK-NEXT:    br label %[[BB4]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i32 [[TMP5]], 1000019
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    store i32 0, ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    store i32 [[TMP5]], ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 3000, i32 997020}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1000019}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
new file mode 100644
index 0000000000000..8e846bbf1d982
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s --check-prefix=PERIOD1009
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 -S | FileCheck %s --check-prefix=DEFAULTPERIOD
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
+@__profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; PERIOD1009-LABEL: define void @f() {
+; PERIOD1009-NEXT:  [[ENTRY:.*:]]
+; PERIOD1009-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; PERIOD1009-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], 1009
+; PERIOD1009-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
+; PERIOD1009:       [[BB3]]:
+; PERIOD1009-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; PERIOD1009-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
+; PERIOD1009-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
+; PERIOD1009-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    br label %[[BB6:.*]]
+; PERIOD1009:       [[BB5]]:
+; PERIOD1009-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    br label %[[BB6]]
+; PERIOD1009:       [[BB6]]:
+; PERIOD1009-NEXT:    ret void
+;
+; DEFAULTPERIOD-LABEL: define void @f() {
+; DEFAULTPERIOD-NEXT:  [[ENTRY:.*:]]
+; DEFAULTPERIOD-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; DEFAULTPERIOD-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], -1
+; DEFAULTPERIOD-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
+; DEFAULTPERIOD:       [[BB3]]:
+; DEFAULTPERIOD-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; DEFAULTPERIOD-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
+; DEFAULTPERIOD-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
+; DEFAULTPERIOD-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    br label %[[BB6:.*]]
+; DEFAULTPERIOD:       [[BB5]]:
+; DEFAULTPERIOD-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    br label %[[BB6]]
+; DEFAULTPERIOD:       [[BB6]]:
+; DEFAULTPERIOD-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; PERIOD1009: [[PROF0]] = !{!"branch_weights", i32 1, i32 1009}
+;.
+; DEFAULTPERIOD: [[PROF0]] = !{!"branch_weights", i32 1, i32 65535}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
new file mode 100644
index 0000000000000..22511c018dca2
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -0,0 +1,484 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+; Check unrolling / SLP vectorization where the order of lanes is important for
+; producing efficient shuffles. The shuffles should be regular and cheap for
+; AArch64. [0 2 4 6] and [1 3 5 7] will produce uzp1/uzp2 instruction. The
+; v16i32 shuffles will be legalized to individual v4i32.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+; Function Attrs: nounwind uwtable
+define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 {
+; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering
+; CHECK-SAME: (ptr nocapture noundef readonly [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[IP1]] to i64
+; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64
+; CHECK-NEXT:    [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]]
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
+; CHECK-NEXT:    [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[RDD119]], 1
+; CHECK-NEXT:    ret i32 [[SHR120]]
+;
+entry:
+  %p1.addr = alloca ptr, align 8
+  %ip1.addr = alloca i32, align 4
+  %p2.addr = alloca ptr, align 8
+  %ip2.addr = alloca i32, align 4
+  %emp = alloca [4 x [4 x i32]], align 4
+  %r0 = alloca i32, align 4
+  %r1 = alloca i32, align 4
+  %r2 = alloca i32, align 4
+  %r3 = alloca i32, align 4
+  %sum = alloca i32, align 4
+  %i = alloca i32, align 4
+  %e0 = alloca i32, align 4
+  %e1 = alloca i32, align 4
+  %e2 = alloca i32, align 4
+  %e3 = alloca i32, align 4
+  %i65 = alloca i32, align 4
+  %e071 = alloca i32, align 4
+  %e179 = alloca i32, align 4
+  %e287 = alloca i32, align 4
+  %e395 = alloca i32, align 4
+  store ptr %p1, ptr %p1.addr, align 8, !tbaa !4
+  store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8
+  store ptr %p2, ptr %p2.addr, align 8, !tbaa !4
+  store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 64, ptr %emp) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r1) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r2) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r3) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #2
+  store i32 0, ptr %sum, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2
+  store i32 0, ptr %i, align 4, !tbaa !8
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4, !tbaa !8
+  %cmp = icmp slt i32 %0, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx = getelementptr inbounds i8, ptr %1, i64 0
+  %2 = load i8, ptr %rrrayidx, align 1, !tbaa !10
+  %conv = zext i8 %2 to i32
+  %3 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx1 = getelementptr inbounds i8, ptr %3, i64 0
+  %4 = load i8, ptr %rrrayidx1, align 1, !tbaa !10
+  %conv2 = zext i8 %4 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %5 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx3 = getelementptr inbounds i8, ptr %5, i64 4
+  %6 = load i8, ptr %rrrayidx3, align 1, !tbaa !10
+  %conv4 = zext i8 %6 to i32
+  %7 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx5 = getelementptr inbounds i8, ptr %7, i64 4
+  %8 = load i8, ptr %rrrayidx5, align 1, !tbaa !10
+  %conv6 = zext i8 %8 to i32
+  %sub7 = sub nsw i32 %conv4, %conv6
+  %shl = shl i32 %sub7, 16
+  %rdd = add nsw i32 %sub, %shl
+  store i32 %rdd, ptr %r0, align 4, !tbaa !8
+  %9 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx8 = getelementptr inbounds i8, ptr %9, i64 1
+  %10 = load i8, ptr %rrrayidx8, align 1, !tbaa !10
+  %conv9 = zext i8 %10 to i32
+  %11 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx10 = getelementptr inbounds i8, ptr %11, i64 1
+  %12 = load i8, ptr %rrrayidx10, align 1, !tbaa !10
+  %conv11 = zext i8 %12 to i32
+  %sub12 = sub nsw i32 %conv9, %conv11
+  %13 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx13 = getelementptr inbounds i8, ptr %13, i64 5
+  %14 = load i8, ptr %rrrayidx13, align 1, !tbaa !10
+  %conv14 = zext i8 %14 to i32
+  %15 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx15 = getelementptr inbounds i8, ptr %15, i64 5
+  %16 = load i8, ptr %rrrayidx15, align 1, !tbaa !10
+  %conv16 = zext i8 %16 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %shl18 = shl i32 %sub17, 16
+  %rdd19 = add nsw i32 %sub12, %shl18
+  store i32 %rdd19, ptr %r1, align 4, !tbaa !8
+  %17 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx20 = getelementptr inbounds i8, ptr %17, i64 2
+  %18 = load i8, ptr %rrrayidx20, align 1, !tbaa !10
+  %conv21 = zext i8 %18 to i32
+  %19 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx22 = getelementptr inbounds i8, ptr %19, i64 2
+  %20 = load i8, ptr %rrrayidx22, align 1, !tbaa !10
+  %conv23 = zext i8 %20 to i32
+  %sub24 = sub nsw i32 %conv21, %conv23
+  %21 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx25 = getelementptr inbounds i8, ptr %21, i64 6
+  %22 = load i8, ptr %rrrayidx25, align 1, !tbaa !10
+  %conv26 = zext i8 %22 to i32
+  %23 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx27 = getelementptr inbounds i8, ptr %23, i64 6
+  %24 = load i8, ptr %rrrayidx27, align 1, !tbaa !10
+  %conv28 = zext i8 %24 to i32
+  %sub29 = sub nsw i32 %conv26, %conv28
+  %shl30 = shl i32 %sub29, 16
+  %rdd31 = add nsw i32 %sub24, %shl30
+  store i32 %rdd31, ptr %r2, align 4, !tbaa !8
+  %25 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx32 = getelementptr inbounds i8, ptr %25, i64 3
+  %26 = load i8, ptr %rrrayidx32, align 1, !tbaa !10
+  %conv33 = zext i8 %26 to i32
+  %27 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx34 = getelementptr inbounds i8, ptr %27, i64 3
+  %28 = load i8, ptr %rrrayidx34, align 1, !tbaa !10
+  %conv35 = zext i8 %28 to i32
+  %sub36 = sub nsw i32 %conv33, %conv35
+  %29 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx37 = getelementptr inbounds i8, ptr %29, i64 7
+  %30 = load i8, ptr %rrrayidx37, align 1, !tbaa !10
+  %conv38 = zext i8 %30 to i32
+  %31 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx39 = getelementptr inbounds i8, ptr %31, i64 7
+  %32 = load i8, ptr %rrrayidx39, align 1, !tbaa !10
+  %conv40 = zext i8 %32 to i32
+  %sub41 = sub nsw i32 %conv38, %conv40
+  %shl42 = shl i32 %sub41, 16
+  %rdd43 = add nsw i32 %sub36, %shl42
+  store i32 %rdd43, ptr %r3, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e0) #2
+  %33 = load i32, ptr %r0, align 4, !tbaa !8
+  %34 = load i32, ptr %r1, align 4, !tbaa !8
+  %rdd44 = add i32 %33, %34
+  store i32 %rdd44, ptr %e0, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e1) #2
+  %35 = load i32, ptr %r0, align 4, !tbaa !8
+  %36 = load i32, ptr %r1, align 4, !tbaa !8
+  %sub45 = sub i32 %35, %36
+  store i32 %sub45, ptr %e1, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e2) #2
+  %37 = load i32, ptr %r2, align 4, !tbaa !8
+  %38 = load i32, ptr %r3, align 4, !tbaa !8
+  %rdd46 = add i32 %37, %38
+  store i32 %rdd46, ptr %e2, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e3) #2
+  %39 = load i32, ptr %r2, align 4, !tbaa !8
+  %40 = load i32, ptr %r3, align 4, !tbaa !8
+  %sub47 = sub i32 %39, %40
+  store i32 %sub47, ptr %e3, align 4, !tbaa !8
+  %41 = load i32, ptr %e0, align 4, !tbaa !8
+  %42 = load i32, ptr %e2, align 4, !tbaa !8
+  %rdd48 = add nsw i32 %41, %42
+  %43 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom = sext i32 %43 to i64
+  %rrrayidx49 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom
+  %rrrayidx50 = getelementptr inbounds [4 x i32], ptr %rrrayidx49, i64 0, i64 0
+  store i32 %rdd48, ptr %rrrayidx50, align 4, !tbaa !8
+  %44 = load i32, ptr %e0, align 4, !tbaa !8
+  %45 = load i32, ptr %e2, align 4, !tbaa !8
+  %sub51 = sub nsw i32 %44, %45
+  %46 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom52 = sext i32 %46 to i64
+  %rrrayidx53 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom52
+  %rrrayidx54 = getelementptr inbounds [4 x i32], ptr %rrrayidx53, i64 0, i64 2
+  store i32 %sub51, ptr %rrrayidx54, align 4, !tbaa !8
+  %47 = load i32, ptr %e1, align 4, !tbaa !8
+  %48 = load i32, ptr %e3, align 4, !tbaa !8
+  %rdd55 = add nsw i32 %47, %48
+  %49 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom56 = sext i32 %49 to i64
+  %rrrayidx57 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom56
+  %rrrayidx58 = getelementptr inbounds [4 x i32], ptr %rrrayidx57, i64 0, i64 1
+  store i32 %rdd55, ptr %rrrayidx58, align 4, !tbaa !8
+  %50 = load i32, ptr %e1, align 4, !tbaa !8
+  %51 = load i32, ptr %e3, align 4, !tbaa !8
+  %sub59 = sub nsw i32 %50, %51
+  %52 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom60 = sext i32 %52 to i64
+  %rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60
+  %rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3
+  store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e3) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e2) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e0) #2
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %53 = load i32, ptr %i, align 4, !tbaa !8
+  %inc = add nsw i32 %53, 1
+  store i32 %inc, ptr %i, align 4, !tbaa !8
+  %54 = load i32, ptr %ip1.addr, align 4, !tbaa !8
+  %55 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %idx.ext = sext i32 %54 to i64
+  %rdd.ptr = getelementptr inbounds i8, ptr %55, i64 %idx.ext
+  store ptr %rdd.ptr, ptr %p1.addr, align 8, !tbaa !4
+  %56 = load i32, ptr %ip2.addr, align 4, !tbaa !8
+  %57 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %idx.ext63 = sext i32 %56 to i64
+  %rdd.ptr64 = getelementptr inbounds i8, ptr %57, i64 %idx.ext63
+  store ptr %rdd.ptr64, ptr %p2.addr, align 8, !tbaa !4
+  br label %for.cond, !llvm.loop !11
+
+for.end:                                          ; preds = %for.cond.cleanup
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i65) #2
+  store i32 0, ptr %i65, align 4, !tbaa !8
+  br label %for.cond66
+
+for.cond66:                                       ; preds = %for.inc114, %for.end
+  %58 = load i32, ptr %i65, align 4, !tbaa !8
+  %cmp67 = icmp slt i32 %58, 4
+  br i1 %cmp67, label %for.body70, label %for.cond.cleanup69
+
+for.cond.cleanup69:                               ; preds = %for.cond66
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i65) #2
+  br label %for.end116
+
+for.body70:                                       ; preds = %for.cond66
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e071) #2
+  %rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
+  %59 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom73 = sext i32 %59 to i64
+  %rrrayidx74 = getelementptr inbounds [4 x i32], ptr %rrrayidx72, i64 0, i64 %idxprom73
+  %60 = load i32, ptr %rrrayidx74, align 4, !tbaa !8
+  %rrrayidx75 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1
+  %61 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom76 = sext i32 %61 to i64
+  %rrrayidx77 = getelementptr inbounds [4 x i32], ptr %rrrayidx75, i64 0, i64 %idxprom76
+  %62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8
+  %rdd78 = add i32 %60, %62
+  store i32 %rdd78, ptr %e071, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e179) #2
+  %rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
+  %63 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom81 = sext i32 %63 to i64
+  %rrrayidx82 = getelementptr inbounds [4 x i32], ptr %rrrayidx80, i64 0, i64 %idxprom81
+  %64 = load i32, ptr %rrrayidx82, align 4, !tbaa !8
+  %rrrayidx83 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1
+  %65 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom84 = sext i32 %65 to i64
+  %rrrayidx85 = getelementptr inbounds [4 x i32], ptr %rrrayidx83, i64 0, i64 %idxprom84
+  %66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8
+  %sub86 = sub i32 %64, %66
+  store i32 %sub86, ptr %e179, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e287) #2
+  %rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
+  %67 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom89 = sext i32 %67 to i64
+  %rrrayidx90 = getelementptr inbounds [4 x i32], ptr %rrrayidx88, i64 0, i64 %idxprom89
+  %68 = load i32, ptr %rrrayidx90, align 4, !tbaa !8
+  %rrrayidx91 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3
+  %69 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom92 = sext i32 %69 to i64
+  %rrrayidx93 = getelementptr inbounds [4 x i32], ptr %rrrayidx91, i64 0, i64 %idxprom92
+  %70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8
+  %rdd94 = add i32 %68, %70
+  store i32 %rdd94, ptr %e287, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e395) #2
+  %rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
+  %71 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom97 = sext i32 %71 to i64
+  %rrrayidx98 = getelementptr inbounds [4 x i32], ptr %rrrayidx96, i64 0, i64 %idxprom97
+  %72 = load i32, ptr %rrrayidx98, align 4, !tbaa !8
+  %rrrayidx99 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3
+  %73 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom100 = sext i32 %73 to i64
+  %rrrayidx101 = getelementptr inbounds [4 x i32], ptr %rrrayidx99, i64 0, i64 %idxprom100
+  %74 = load i32, ptr %rrrayidx101, align 4, !tbaa !8
+  %sub102 = sub i32 %72, %74
+  store i32 %sub102, ptr %e395, align 4, !tbaa !8
+  %75 = load i32, ptr %e071, align 4, !tbaa !8
+  %76 = load i32, ptr %e287, align 4, !tbaa !8
+  %rdd103 = add nsw i32 %75, %76
+  store i32 %rdd103, ptr %r0, align 4, !tbaa !8
+  %77 = load i32, ptr %e071, align 4, !tbaa !8
+  %78 = load i32, ptr %e287, align 4, !tbaa !8
+  %sub104 = sub nsw i32 %77, %78
+  store i32 %sub104, ptr %r2, align 4, !tbaa !8
+  %79 = load i32, ptr %e179, align 4, !tbaa !8
+  %80 = load i32, ptr %e395, align 4, !tbaa !8
+  %rdd105 = add nsw i32 %79, %80
+  store i32 %rdd105, ptr %r1, align 4, !tbaa !8
+  %81 = load i32, ptr %e179, align 4, !tbaa !8
+  %82 = load i32, ptr %e395, align 4, !tbaa !8
+  %sub106 = sub nsw i32 %81, %82
+  store i32 %sub106, ptr %r3, align 4, !tbaa !8
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e395) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e287) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e179) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e071) #2
+  %83 = load i32, ptr %r0, align 4, !tbaa !8
+  %call = call i32 @twoabs(i32 noundef %83)
+  %84 = load i32, ptr %r1, align 4, !tbaa !8
+  %call107 = call i32 @twoabs(i32 noundef %84)
+  %rdd108 = add i32 %call, %call107
+  %85 = load i32, ptr %r2, align 4, !tbaa !8
+  %call109 = call i32 @twoabs(i32 noundef %85)
+  %rdd110 = add i32 %rdd108, %call109
+  %86 = load i32, ptr %r3, align 4, !tbaa !8
+  %call111 = call i32 @twoabs(i32 noundef %86)
+  %rdd112 = add i32 %rdd110, %call111
+  %87 = load i32, ptr %sum, align 4, !tbaa !8
+  %rdd113 = add i32 %87, %rdd112
+  store i32 %rdd113, ptr %sum, align 4, !tbaa !8
+  br label %for.inc114
+
+for.inc114:                                       ; preds = %for.body70
+  %88 = load i32, ptr %i65, align 4, !tbaa !8
+  %inc115 = add nsw i32 %88, 1
+  store i32 %inc115, ptr %i65, align 4, !tbaa !8
+  br label %for.cond66, !llvm.loop !13
+
+for.end116:                                       ; preds = %for.cond.cleanup69
+  %89 = load i32, ptr %sum, align 4, !tbaa !8
+  %conv117 = trunc i32 %89 to i16
+  %conv118 = zext i16 %conv117 to i32
+  %90 = load i32, ptr %sum, align 4, !tbaa !8
+  %shr = lshr i32 %90, 16
+  %rdd119 = add i32 %conv118, %shr
+  %shr120 = lshr i32 %rdd119, 1
+  call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r3) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r2) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r0) #2
+  call void @llvm.lifetime.end.p0(i64 64, ptr %emp) #2
+  ret i32 %shr120
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define internal i32 @twoabs(i32 noundef %r) #0 {
+entry:
+  %r.addr = alloca i32, align 4
+  %s = alloca i32, align 4
+  store i32 %r, ptr %r.addr, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %s) #2
+  %0 = load i32, ptr %r.addr, align 4, !tbaa !8
+  %shr = lshr i32 %0, 15
+  %rnd = and i32 %shr, 65537
+  %mul = mul i32 %rnd, 65535
+  store i32 %mul, ptr %s, align 4, !tbaa !8
+  %1 = load i32, ptr %r.addr, align 4, !tbaa !8
+  %2 = load i32, ptr %s, align 4, !tbaa !8
+  %rdd = add i32 %1, %2
+  %3 = load i32, ptr %s, align 4, !tbaa !8
+  %xor = xor i32 %rdd, %3
+  call void @llvm.lifetime.end.p0(i64 4, ptr %s) #2
+  ret i32 %xor
+}
+
+attributes #0 = { nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nounwind }
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !6, i64 0}
+!10 = !{!6, !6, i64 0}
+!11 = distinct !{!11, !12}
+!12 = !{!"llvm.loop.mustprogress"}
+!13 = distinct !{!13, !12}
diff --git a/llvm/test/Transforms/PhaseOrdering/varargs.ll b/llvm/test/Transforms/PhaseOrdering/varargs.ll
new file mode 100644
index 0000000000000..c66a982a06a6d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/varargs.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -S -passes='lto<O2>' < %s | FileCheck %s
+target triple = "amdgcn-amd-amdhsa"
+
+; REQUIRES: amdgpu-registered-target
+
+; We use the ExpandVariadics pass to lower variadic functions so they can be
+; inlined.
+
+define i32 @foo() {
+; CHECK-LABEL: define i32 @foo(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret i32 6
+;
+entry:
+  %call = tail call i32 (i32, ...) @vararg(i32 poison, i32 noundef 1, i32 noundef 2, i32 noundef 3)
+  ret i32 %call
+}
+
+define internal i32 @vararg(i32 %first, ...) {
+entry:
+  %vlist = alloca ptr, align 8
+  call void @llvm.va_start.p0(ptr %vlist)
+  %vlist.promoted = load ptr, ptr %vlist, align 8
+  %argp.a = getelementptr inbounds i8, ptr %vlist.promoted, i64 4
+  store ptr %argp.a, ptr %vlist, align 8
+  %a = load i32, ptr %vlist.promoted, align 4
+  %argp.b = getelementptr inbounds i8, ptr %vlist.promoted, i64 8
+  store ptr %argp.b, ptr %vlist, align 8
+  %b = load i32, ptr %argp.a, align 4
+  %sum = add nsw i32 %b, %a
+  %argp.c = getelementptr inbounds i8, ptr %vlist.promoted, i64 12
+  store ptr %argp.c, ptr %vlist, align 8
+  %c = load i32, ptr %argp.b, align 4
+  %ret = add nsw i32 %c, %sum
+  call void @llvm.va_end.p0(ptr %vlist)
+  ret i32 %ret
+}
+
+declare void @llvm.va_start.p0(ptr)
+
+declare void @llvm.va_end.p0(ptr)
diff --git a/llvm/test/Transforms/SCCP/crash.ll b/llvm/test/Transforms/SCCP/crash.ll
index 8f8ad44db437b..47d9329f6f03d 100644
--- a/llvm/test/Transforms/SCCP/crash.ll
+++ b/llvm/test/Transforms/SCCP/crash.ll
@@ -28,7 +28,7 @@ define i32 @test2([4 x i32] %A) {
   ret i32 %B
 }
 
-define x86_mmx @test3() {
-  %load = load x86_mmx, ptr null
-  ret x86_mmx %load
+define <1 x i64> @test3() {
+  %load = load <1 x i64>, ptr null
+  ret <1 x i64> %load
 }
diff --git a/llvm/test/Transforms/SCCP/float-denormal-simplification.ll b/llvm/test/Transforms/SCCP/float-denormal-simplification.ll
new file mode 100644
index 0000000000000..fec9883aabddd
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/float-denormal-simplification.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=sccp -S %s | FileCheck %s
+
+define float @test_ieee() #0 {
+; CHECK-LABEL: @test_ieee(
+; CHECK-NEXT:    ret float 0x36F4000000000000
+;
+  %1 = fmul float 2.802596928649634e-44, 2.000000e+00
+  ret float %1
+}
+
+define float @test_preserve_sign() #1 {
+; CHECK-LABEL: @test_preserve_sign(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %1 = fmul float 2.802596928649634e-44, 2.000000e+00
+  ret float %1
+}
+
+attributes #0 = {"denormal-fp-math"="ieee,ieee"}
+attributes #1 = {"denormal-fp-math"="preserve-sign,preserve-sign"}
diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
index d605611ee0de6..ff57569d12788 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
@@ -17,11 +17,11 @@
 ; CHECK-NOT: <badref>
 ; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
 ; CHECK-NEXT:   [1]  <<exit node>> {4294967295,4294967295} [0]
+; CHECK-NEXT:     [2] %for.cond34 {4294967295,4294967295} [1]
+; CHECK-NEXT:       [3] %for.cond16 {4294967295,4294967295} [2]
 ; CHECK-NEXT:     [2] %for.body {4294967295,4294967295} [1]
 ; CHECK-NEXT:     [2] %if.end4 {4294967295,4294967295} [1]
 ; CHECK-NEXT:       [3] %entry {4294967295,4294967295} [2]
-; CHECK-NEXT:     [2] %for.cond34 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %for.cond16 {4294967295,4294967295} [2]
 ; CHECK-NEXT: Roots: %for.cond34 %for.body
 ; CHECK-NEXT: PostDominatorTree for function: bar
 ; CHECK-NOT: <badref>
diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll
index 21d97c41388aa..9264a6eaefb85 100644
--- a/llvm/test/Transforms/SCCP/phis.ll
+++ b/llvm/test/Transforms/SCCP/phis.ll
@@ -1,9 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=sccp -S | FileCheck %s
 
 define i1 @float.1(i1 %cmp) {
-; CHECK-LABEL: define i1 @float.1(i1 %cmp) {
-
-; CHECK-LABEL: end:
+; CHECK-LABEL: define i1 @float.1(
+; CHECK-SAME: i1 [[CMP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]]
+; CHECK:       [[IF_TRUE]]:
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
 ; CHECK-NEXT:    ret i1 true
 ;
 entry:
@@ -19,12 +24,16 @@ end:
 }
 
 define i1 @float.2(i1 %cmp) {
-; CHECK-LABEL: define i1 @float.2(i1 %cmp) {
-
-; CHECK-LABEL: end:
-; CHECK-NEXT:    %p = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %if.true ]
-; CHECK-NEXT:    %c = fcmp ueq float %p, 1.000000e+00
-; CHECK-NEXT:    ret i1 %c
+; CHECK-LABEL: define i1 @float.2(
+; CHECK-SAME: i1 [[CMP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]]
+; CHECK:       [[IF_TRUE]]:
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[P:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ 2.000000e+00, %[[IF_TRUE]] ]
+; CHECK-NEXT:    [[C:%.*]] = fcmp ueq float [[P]], 1.000000e+00
+; CHECK-NEXT:    ret i1 [[C]]
 ;
 entry:
   br i1 %cmp, label %if.true, label %end
@@ -39,13 +48,18 @@ end:
 }
 
 define i1 @float.3(float %f, i1 %cmp) {
-; CHECK-LABEL: define i1 @float.3(float %f, i1 %cmp)
-
-; CHECK-LABEL: end:
-; CHECK-NEXT:    %p = phi float [ 1.000000e+00, %entry ], [ %f, %if.true ]
-; CHECK-NEXT:    %c = fcmp ueq float %p, 1.000000e+00
-; CHECK-NEXT:    ret i1 %c
+; CHECK-LABEL: define i1 @float.3(
+; CHECK-SAME: float [[F:%.*]], i1 [[CMP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]]
+; CHECK:       [[IF_TRUE]]:
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[P:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[F]], %[[IF_TRUE]] ]
+; CHECK-NEXT:    [[C:%.*]] = fcmp ueq float [[P]], 1.000000e+00
+; CHECK-NEXT:    ret i1 [[C]]
 ;
+
 entry:
   br i1 %cmp, label %if.true, label %end
 
@@ -60,11 +74,16 @@ end:
 
 
 define i1 @float.4_unreachable(float %f, i1 %cmp) {
-; CHECK-LABEL: define i1 @float.4_unreachable(float %f, i1 %cmp)
-
-; CHECK-LABEL: end:
+; CHECK-LABEL: define i1 @float.4_unreachable(
+; CHECK-SAME: float [[F:%.*]], i1 [[CMP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]]
+; CHECK:       [[IF_TRUE]]:
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
 ; CHECK-NEXT:    ret i1 false
 ;
+
 entry:
   br i1 %cmp, label %if.true, label %end
 
@@ -79,3 +98,77 @@ end:
   %c = fcmp une float %p, 1.0
   ret i1 %c
 }
+
+define <2 x i16> @phi_vector_merge1(i1 %c, <2 x i8> %a) {
+; CHECK-LABEL: define <2 x i16> @phi_vector_merge1(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ <i16 1, i16 2>, %[[IF]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], <i16 2, i16 3>
+; CHECK-NEXT:    ret <2 x i16> [[ADD]]
+;
+entry:
+  %zext = zext <2 x i8> %a to <2 x i16>
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <2 x i16> [ %zext, %entry ], [ <i16 1, i16 2>, %if ]
+  %add = add <2 x i16> %phi, <i16 2, i16 3>
+  ret <2 x i16> %add
+}
+
+define <2 x i16> @phi_vector_merge2(i1 %c, <2 x i8> %a) {
+; CHECK-LABEL: define <2 x i16> @phi_vector_merge2(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <2 x i16> [ <i16 1, i16 2>, %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], <i16 2, i16 3>
+; CHECK-NEXT:    ret <2 x i16> [[ADD]]
+;
+entry:
+  %zext = zext <2 x i8> %a to <2 x i16>
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <2 x i16> [ <i16 1, i16 2>, %entry ], [ %zext, %if ]
+  %add = add <2 x i16> %phi, <i16 2, i16 3>
+  ret <2 x i16> %add
+}
+
+define <2 x float> @phi_vector_merge_float(i1 %c) {
+; CHECK-LABEL: define <2 x float> @phi_vector_merge_float(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <2 x float> [ <float 2.000000e+00, float 1.000000e+00>, %[[ENTRY]] ], [ <float 1.000000e+00, float 2.000000e+00>, %[[IF]] ]
+; CHECK-NEXT:    ret <2 x float> [[PHI]]
+;
+entry:
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <2 x float> [ <float 2.0, float 1.0>, %entry ], [ <float 1.0, float 2.0>, %if ]
+  ret <2 x float> %phi
+}
diff --git a/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll b/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll
new file mode 100644
index 0000000000000..790a794bfc23e
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll
@@ -0,0 +1,71 @@
+; Test that the debug information is propagated correctly to the new instructions
+; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
+
+define double @sdiv_ashr_sitofp_dbg_pres(i7 %y) !dbg !5 {
+; CHECK-LABEL: define double @sdiv_ashr_sitofp_dbg_pres(
+; CHECK:    [[SDIV:%.*]] = udiv i8 42, [[ZEXT1:%.*]], !dbg [[DBG9:![0-9]+]]
+; CHECK:    [[ASHR:%.*]] = lshr i8 42, [[SDIV]], !dbg [[DBG10:![0-9]+]]
+; CHECK:    [[SITOFP:%.*]] = uitofp nneg i16 [[ZEXT2:%.*]] to double, !dbg [[DBG12:![0-9]+]]
+;
+  %zext1 = zext i7 %y to i8, !dbg !8
+  %sdiv = sdiv i8 42, %zext1, !dbg !9
+  %ashr = ashr i8 42, %sdiv, !dbg !10
+  %zext2 = zext i8 %ashr to i16, !dbg !11
+  %sitofp = sitofp i16 %zext2 to double, !dbg !12
+  ret double %sitofp, !dbg !13
+}
+
+define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) !dbg !14 {
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi(
+; CHECK:       switch:
+; CHECK-NEXT:    br label %[[SWITCH_DEFAULT:.*]], !dbg [[DBG16:![0-9]+]]
+;
+entry:
+  br i1 %c, label %switch, label %end, !dbg !15
+
+switch:                                           ; preds = %entry
+  switch i32 -1, label %switch.default [
+  i32 0, label %end
+  i32 1, label %end
+  ], !dbg !16
+
+switch.default:                                   ; preds = %switch
+  ret i32 -1, !dbg !17
+
+end:                                              ; preds = %switch, %switch, %entry
+  %phi = phi i32 [ %x, %entry ], [ 1, %switch ], [ 1, %switch ], !dbg !18
+  ret i32 %phi, !dbg !19
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+;.
+; CHECK: [[DBG9]] = !DILocation(line: 2
+; CHECK: [[DBG10]] = !DILocation(line: 3
+; CHECK: [[DBG12]] = !DILocation(line: 5
+; CHECK: [[DBG16]] = !DILocation(line: 8
+;.
+
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "sccp.ll", directory: "/")
+!2 = !{i32 11}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "sdiv_ashr_sitofp_dbg_pres", linkageName: "sdiv_ashr_sitofp_dbg_pres", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
+!14 = distinct !DISubprogram(name: "test_duplicate_successors_phi", linkageName: "test_duplicate_successors_phi", scope: null, file: !1, line: 7, type: !6, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = !DILocation(line: 7, column: 1, scope: !14)
+!16 = !DILocation(line: 8, column: 1, scope: !14)
+!17 = !DILocation(line: 9, column: 1, scope: !14)
+!18 = !DILocation(line: 10, column: 1, scope: !14)
+!19 = !DILocation(line: 11, column: 1, scope: !14)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index 807d2468d4271..d79aed89b0be7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -428,14 +428,14 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
@@ -1231,29 +1231,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
@@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 18, i32 22, i32 26, i32 30, i32 1, i32 5, i32 9, i32 13, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
 ; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
new file mode 100644
index 0000000000000..f44705a925d5a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @uadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @usub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @usub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.umax.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umax.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @sadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @sadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.smin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.smin.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @ssub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @ssub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.smax.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.smax.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @uadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.umin.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.umin.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @usub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.umax.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.umax.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @sadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.smin.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.smin.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @ssub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.smax.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.smax.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v3i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
+;
+; GFX8-LABEL: @uadd_sat_v3i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
+;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
+;
+bb:
+  %arg0.0 = extractelement <3 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <3 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <3 x i16> %arg0, i64 2
+  %arg1.0 = extractelement <3 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <3 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <3 x i16> %arg1, i64 2
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.umin.i16(i16 %arg0.2, i16 %arg1.2)
+  %ins.0 = insertelement <3 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <3 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <3 x i16> %ins.1, i16 %add.2, i64 2
+  ret <3 x i16> %ins.2
+}
+
+define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v4i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
+; GFX7-NEXT:    ret <4 x i16> [[INS_3]]
+;
+; GFX8-LABEL: @uadd_sat_v4i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i16> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i16> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i16> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i16> %arg1, i64 3
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.umin.i16(i16 %arg0.2, i16 %arg1.2)
+  %add.3 = call i16 @llvm.umin.i16(i16 %arg0.3, i16 %arg1.3)
+  %ins.0 = insertelement <4 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <4 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <4 x i16> %ins.1, i16 %add.2, i64 2
+  %ins.3 = insertelement <4 x i16> %ins.2, i16 %add.3, i64 3
+  ret <4 x i16> %ins.3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
index 7152115220b00..429640ea7ff16 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -7,10 +7,10 @@
 ; Simple 3-pair chain with loads and stores
 define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) {
 ; GCN-LABEL: @test1_as_3_3_3_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
-; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
-; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
+; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -29,10 +29,10 @@ define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspa
 
 define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {
 ; GCN-LABEL: @test1_as_3_0_0(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
-; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
-; GCN-NEXT:    store <2 x half> [[TMP5]], ptr [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
+; GCN-NEXT:    store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -51,10 +51,10 @@ define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {
 
 define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) {
 ; GCN-LABEL: @test1_as_0_0_3_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
-; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
-; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
+; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr %a, align 2
@@ -73,11 +73,11 @@ define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3)
 
 define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
 ; GCN-LABEL: @test1_fma_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
-; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
-; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
-; GCN-NEXT:    store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
+; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -99,10 +99,10 @@ define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3)
 
 define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) {
 ; GCN-LABEL: @mul_scalar_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
-; GCN-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer
-; GCN-NEXT:    [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]]
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
+; GCN-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer
+; GCN-NEXT:    [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]]
 ; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
@@ -119,9 +119,9 @@ define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, p
 
 define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
 ; GCN-LABEL: @fabs_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
-; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
+; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -137,12 +137,12 @@ define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c)
 
 define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
 ; GCN-LABEL: @test1_fabs_fma_v2f16(
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
-; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
-; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
-; GCN-NEXT:    [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
-; GCN-NEXT:    store <2 x half> [[TMP8]], ptr addrspace(3) [[D:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
+; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
+; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -172,12 +172,12 @@ define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr
 ; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
 ; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1
 ; GCN-NEXT:    [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2
-; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
-; GCN-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
-; GCN-NEXT:    [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1
-; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]])
-; GCN-NEXT:    store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
+; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1
+; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]])
+; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
@@ -200,22 +200,11 @@ define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr
 }
 
 define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
-; GFX9-LABEL: @canonicalize_v2f16(
-; GFX9-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
-; GFX9-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]])
-; GFX9-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
-; GFX9-NEXT:    ret void
-;
-; VI-LABEL: @canonicalize_v2f16(
-; VI-NEXT:    [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2
-; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
-; VI-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1
-; VI-NEXT:    [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2
-; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
-; VI-NEXT:    store half [[CANONICALIZE0]], ptr addrspace(3) [[C:%.*]], align 2
-; VI-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1
-; VI-NEXT:    store half [[CANONICALIZE1]], ptr addrspace(3) [[ARRAYIDX5]], align 2
-; VI-NEXT:    ret void
+; GCN-LABEL: @canonicalize_v2f16(
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]])
+; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
+; GCN-NEXT:    ret void
 ;
   %i0 = load half, ptr addrspace(3) %a, align 2
   %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
@@ -234,3 +223,6 @@ declare half @llvm.canonicalize.f16(half) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
+; VI: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
new file mode 100644
index 0000000000000..c79fa9c84d1c3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -0,0 +1,360 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; FIXME: Should not vectorize on gfx8
+
+; GCN-LABEL: @fadd_combine_v2f16
+; GCN: fadd <2 x half>
+define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = fadd half %tmp3, 1.000000e+00
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = fadd half %tmp7, 1.000000e+00
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fsub_combine_v2f16
+; GCN: fsub <2 x half>
+define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = fsub half %tmp3, 1.000000e+00
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = fsub half %tmp7, 1.000000e+00
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fmul_combine_v2f16
+; GCN: fmul <2 x half>
+define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = fmul half %tmp3, 1.000000e+00
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = fmul half %tmp7, 1.000000e+00
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @fdiv_combine_v2f16
+; GCN: fdiv <2 x half>
+define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = fdiv half %tmp3, 1.000000e+00
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = fdiv half %tmp7, 1.000000e+00
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @frem_combine_v2f16
+; GCN: frem <2 x half>
+define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = frem half %tmp3, 1.000000e+00
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = frem half %tmp7, 1.000000e+00
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fma_combine_v2f16
+; GCN: call <2 x half> @llvm.fma.v2f16
+define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fmuladd_combine_v2f16
+; GCN: call <2 x half> @llvm.fmuladd.v2f16
+define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @minnum_combine_v2f16
+; GFX8: call half @llvm.minnum.f16(
+; GFX8: call half @llvm.minnum.f16(
+
+; GFX9: call <2 x half> @llvm.minnum.v2f16
+define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @maxnum_combine_v2f16
+; GFX8: call half @llvm.maxnum.f16(
+; GFX8: call half @llvm.maxnum.f16(
+
+; GFX9: call <2 x half> @llvm.maxnum.v2f16
+define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should vectorize
+; GCN-LABEL: @minimum_combine_v2f16
+; GCN: call half @llvm.minimum.f16(
+; GCN: call half @llvm.minimum.f16(
+define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @maximum_combine_v2f16
+; GCN: call half @llvm.maximum.f16(
+; GCN: call half @llvm.maximum.f16(
+define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @canonicalize_combine_v2f16
+; GCN: call <2 x half> @llvm.canonicalize.v2f16(
+define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @fabs_combine_v2f16
+; GCN: call <2 x half> @llvm.fabs.v2f16(
+define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.fabs.f16(half %tmp3)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.fabs.f16(half %tmp7)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @fneg_combine_v2f16
+; GCN: fneg <2 x half>
+define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = fneg half %tmp3
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = fneg half %tmp7
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; GCN-LABEL: @copysign_combine_v2f16
+; GCN: call <2 x half> @llvm.copysign.v2f16(
+define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should always vectorize
+; GCN-LABEL: @copysign_combine_v4f16
+; GCN: call <2 x half> @llvm.copysign.v2f16(
+
+; GFX8: call half @llvm.copysign.f16(
+; GFX8: call half @llvm.copysign.f16(
+
+; GFX9: call <2 x half> @llvm.copysign.v2f16(
+define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+
+  %tmp9 = add nuw nsw i64 %tmp1, 2
+  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
+  %tmp11 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign)
+  store half %tmp12, ptr addrspace(1) %tmp10, align 2
+
+  %tmp13 = add nuw nsw i64 %tmp1, 3
+  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
+  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
+  %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign)
+  store half %tmp16, ptr addrspace(1) %tmp14, align 2
+  ret void
+}
+
+; GCN-LABEL: @canonicalize_combine_v4f16
+; GCN: call <2 x half> @llvm.canonicalize.v2f16(
+; GCN: call <2 x half> @llvm.canonicalize.v2f16(
+define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+
+  %tmp9 = add nuw nsw i64 %tmp1, 2
+  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
+  %tmp11 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp12 = call half @llvm.canonicalize.f16(half %tmp11)
+  store half %tmp12, ptr addrspace(1) %tmp10, align 2
+
+  %tmp13 = add nuw nsw i64 %tmp1, 3
+  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
+  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
+  %tmp16 = call half @llvm.canonicalize.f16(half %tmp15)
+  store half %tmp16, ptr addrspace(1) %tmp14, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-bv-multi-uses.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-bv-multi-uses.ll
new file mode 100644
index 0000000000000..3ca5733a6bad0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-bv-multi-uses.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr="+v" < %s -slp-threshold=-10 | FileCheck %s
+
+define i32 @test(i64 %v1, i64 %v2) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i64 [[V1:%.*]], i64 [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[V2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP1]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <2 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+entry:
+  %t1 = trunc i64 %v1 to i32
+  %t2 = trunc i64 %v2 to i32
+  %lshr1 = lshr i64 %v1, 32
+  %lshr2 = lshr i64 %v2, 32
+  %t3 = trunc i64 %lshr1 to i32
+  %t4 = trunc i64 %lshr2 to i32
+  %add1 = add i32 %t1, %t3
+  %add2 = add i32 %t2, %t4
+  %mul = mul i32 %add1, %add2
+  ret i32 %mul
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
new file mode 100644
index 0000000000000..81837bdf99eaf
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux-gnu -mcpu=z16 -slp-threshold=-10 < %s | FileCheck %s
+
+define i1 @test(i64 %0, i64 %1, ptr %2) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: i64 [[TMP0:%.*]], i64 [[TMP1:%.*]], ptr [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[GEP44:%.*]] = getelementptr i8, ptr null, i64 [[TMP0]]
+; CHECK-NEXT:    [[GEP45:%.*]] = getelementptr i8, ptr null, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP48:%.*]] = getelementptr i8, ptr null, i64 [[TMP0]]
+; CHECK-NEXT:    [[GEP49:%.*]] = getelementptr i8, ptr null, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[GEP44]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x ptr> [[TMP3]], ptr [[GEP48]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[GEP45]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[GEP49]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <2 x ptr> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x ptr> [[TMP4]], <2 x ptr> [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x ptr> [[TMP9]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <2 x ptr> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT:    [[RES:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+entry:
+  %gep44 = getelementptr i8, ptr null, i64 %0
+  %gep45 = getelementptr i8, ptr null, i64 %1
+  %4 = icmp ult ptr %gep44, %gep45
+  %umin = select i1 %4, ptr %gep44, ptr %gep45
+  %gep48 = getelementptr i8, ptr null, i64 %0
+  %gep49 = getelementptr i8, ptr null, i64 %1
+  %5 = icmp ult ptr %gep48, %gep49
+  %umin50 = select i1 %5, ptr %gep48, ptr %gep49
+  %b095 = icmp ult ptr %umin, %2
+  %b196 = icmp ult ptr %umin50, %2
+  %res = and i1 %b095, %b196
+  ret i1 %res
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 8c4903dbc92bb..634870e3c780a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -26,105 +26,70 @@ declare i16 @llvm.sadd.sat.i16(i16, i16)
 declare i8  @llvm.sadd.sat.i8 (i8 , i8 )
 
 define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @add_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v8i64(
-; SLM-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
 ; SLM-NEXT:    ret void
 ;
-; AVX1-LABEL: @add_v8i64(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
-; AVX1-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @add_v8i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
-; AVX2-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @add_v8i64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
+; AVX-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @add_v8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
@@ -198,25 +163,6 @@ define void @add_v16i32() {
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v16i32(
-; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-; SLM-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
@@ -322,25 +268,6 @@ define void @add_v32i16() {
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v32i16(
-; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-; SLM-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
@@ -510,25 +437,6 @@ define void @add_v64i8() {
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v64i8(
-; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], ptr @c8, align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
 ; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
@@ -805,3 +713,6 @@ define void @add_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1: {{.*}}
+; AVX2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index cb8d45b1a21a2..bd10ecdf330ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -26,40 +26,59 @@ declare i16 @llvm.uadd.sat.i16(i16, i16)
 declare i8  @llvm.uadd.sat.i8 (i8 , i8 )
 
 define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @add_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
+;
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
index 65e2a011cc9a1..35d46e4c85be0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -61,70 +61,16 @@ define void @sub_v8i64() {
 ; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v8i64(
-; SLM-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT:    ret void
-;
-; AVX1-LABEL: @sub_v8i64(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
-; AVX1-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @sub_v8i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
-; AVX2-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
+; AVX-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sub_v8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
@@ -198,25 +144,6 @@ define void @sub_v16i32() {
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v16i32(
-; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-; SLM-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
@@ -322,25 +249,6 @@ define void @sub_v32i16() {
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v32i16(
-; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-; SLM-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
@@ -510,25 +418,6 @@ define void @sub_v64i8() {
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v64i8(
-; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], ptr @c8, align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
 ; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
@@ -805,3 +694,8 @@ define void @sub_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1: {{.*}}
+; AVX2: {{.*}}
+; SLM: {{.*}}
+; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
index 18df499c6646e..7777bf433136e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -26,40 +26,59 @@ declare i16 @llvm.usub.sat.i16(i16, i16)
 declare i8  @llvm.usub.sat.i8 (i8 , i8 )
 
 define void @sub_v8i64() {
-; SSE-LABEL: @sub_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @sub_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
+;
+; SLM-LABEL: @sub_v8i64(
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
index 7476ddb0fb873..19ca68fc9cb2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -4,68 +4,6 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define void @test1(x86_mmx %a, x86_mmx %b, ptr %ptr) {
-; Ensure we can handle x86_mmx values which are primitive and can be bitcast
-; with integer types but can't be put into a vector.
-;
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT:    [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT:    [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT:    [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i32 1
-; CHECK-NEXT:    store i64 [[A_AND]], ptr [[PTR]], align 8
-; CHECK-NEXT:    store i64 [[B_AND]], ptr [[GEP]], align 8
-; CHECK-NEXT:    ret void
-;
-entry:
-  %a.cast = bitcast x86_mmx %a to i64
-  %b.cast = bitcast x86_mmx %b to i64
-  %a.and = and i64 %a.cast, 42
-  %b.and = and i64 %b.cast, 42
-  %gep = getelementptr i64, ptr %ptr, i32 1
-  store i64 %a.and, ptr %ptr
-  store i64 %b.and, ptr %gep
-  ret void
-}
-
-define void @test2(x86_mmx %a, x86_mmx %b) {
-; Same as @test1 but using phi-input vectorization instead of store
-; vectorization.
-;
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT:    [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT:    [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT:    [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ]
-; CHECK-NEXT:    tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]])
-; CHECK-NEXT:    ret void
-;
-entry:
-  br i1 undef, label %if.then, label %exit
-
-if.then:
-  %a.cast = bitcast x86_mmx %a to i64
-  %b.cast = bitcast x86_mmx %b to i64
-  %a.and = and i64 %a.cast, 42
-  %b.and = and i64 %b.cast, 42
-  br label %exit
-
-exit:
-  %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
-  %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
-  tail call void @f(i64 %a.phi, i64 %b.phi)
-  ret void
-}
-
 define i8 @test3(ptr %addr) {
 ; Check that we do not vectorize types that are padded to a bigger ones.
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
index 31ad629160c8d..3d3d00f4a0b3f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
@@ -13,11 +13,11 @@ define void @test() {
 ; CHECK-NEXT:    [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <8 x i64> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i64> [[TMP6]], <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP1]], <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP8]], 2
-; CHECK-NEXT:    [[OP_RDX33:%.*]] = add i64 [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[OP_RDX25]] = add i64 [[OP_RDX33]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; CHECK-NEXT:    [[OP_RDX16:%.*]] = add i64 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[OP_RDX25]] = add i64 [[OP_RDX16]], [[TMP3]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
new file mode 100644
index 0000000000000..f1a5709d07f02
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB43:.*]]
+; CHECK:       [[BB20:.*]]:
+; CHECK-NEXT:    br label %[[BB105:.*]]
+; CHECK:       [[BB43]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP1:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NEXT:    br i1 false, label %[[BB105]], label %[[BB51]]
+; CHECK:       [[BB51]]:
+; CHECK-NEXT:    [[TMP1]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ]
+; CHECK-NEXT:    br label %[[BB43]]
+; CHECK:       [[BB54]]:
+; CHECK-NEXT:    br label %[[BB51]]
+; CHECK:       [[BB105]]:
+; CHECK-NEXT:    [[PHI106:%.*]] = phi ptr addrspace(1) [ null, %[[BB20]] ], [ null, %[[BB43]] ]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %0 = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <2 x i32> <i32 1, i32 0>
+  %1 = extractelement <2 x ptr addrspace(1)> %0, i32 0
+  %2 = extractelement <2 x ptr addrspace(1)> %0, i32 1
+  br label %bb43
+
+bb20:
+  br label %bb105
+
+bb43:
+  %phi441 = phi ptr addrspace(1) [ %4, %bb51 ], [ %2, %bb ]
+  %phi452 = phi ptr addrspace(1) [ %5, %bb51 ], [ %1, %bb ]
+  br i1 false, label %bb105, label %bb51
+
+bb51:
+  %3 = phi <2 x ptr addrspace(1)> [ poison, %bb54 ], [ zeroinitializer, %bb43 ]
+  %4 = extractelement <2 x ptr addrspace(1)> %3, i32 0
+  %5 = extractelement <2 x ptr addrspace(1)> %3, i32 1
+  br label %bb43
+
+bb54:
+  br label %bb51
+
+bb105:
+  %phi106 = phi ptr addrspace(1) [ %1, %bb20 ], [ null, %bb43 ]
+  ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
index 58c108b81a7ce..e66cce1b58287 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -1,33 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="-avx512pf,+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
 
 define i32 @foo(i32 %a) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
+; CHECK-NEXT:    [[LOCAL:%.*]] = sub nsw i32 0, 0
 ; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
+; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]]
-; CHECK-NEXT:    ret i32 [[OP_RDX9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
+; CHECK-NEXT:    ret i32 [[OP_RDX1]]
 ; CHECK:       bb5:
 ; CHECK-NEXT:    br label [[BB4:%.*]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index de06daac7a75d..fa022ad69af79 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -977,64 +977,54 @@ define i32 @maxi8_wrong_parent(i32) {
 ; SSE4-LABEL: @maxi8_wrong_parent(
 ; SSE4-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
 ; SSE4-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; SSE4-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
 ; SSE4-NEXT:    br label [[PP:%.*]]
 ; SSE4:       pp:
-; SSE4-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SSE4-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; SSE4-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; SSE4-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; SSE4-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; SSE4-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE4-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE4-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE4-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
+; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
+; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
 ; SSE4-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
 ; SSE4-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; SSE4-NEXT:    ret i32 [[OP_RDX5]]
+; SSE4-NEXT:    [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]
+; SSE4-NEXT:    [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[TMP3]]
+; SSE4-NEXT:    ret i32 [[OP_RDX7]]
 ;
 ; AVX-LABEL: @maxi8_wrong_parent(
 ; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
 ; AVX-NEXT:    br label [[PP:%.*]]
 ; AVX:       pp:
-; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
 ; AVX-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
 ; AVX-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; AVX-NEXT:    ret i32 [[OP_RDX5]]
+; AVX-NEXT:    [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]
+; AVX-NEXT:    [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[TMP3]]
+; AVX-NEXT:    ret i32 [[OP_RDX7]]
 ;
 ; THRESH-LABEL: @maxi8_wrong_parent(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
-; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
 ; THRESH-NEXT:    br label [[PP:%.*]]
 ; THRESH:       pp:
-; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
-; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
-; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1
-; THRESH-NEXT:    [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]]
-; THRESH-NEXT:    ret i32 [[OP_RDX5]]
+; THRESH-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]])
+; THRESH-NEXT:    ret i32 [[TMP10]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll
new file mode 100644
index 0000000000000..429bf13b2b87a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer < %s | FileCheck %s
+
+target triple = "x86_64-redhat-linux-gnu"
+
+; The load+store sequence inside bb10 should not get vectorized. Previously,
+; we incorrectly determined that the pointers do not alias, because a cache
+; entry based indirectly on a disproven NoAlias assumption was not cleared
+; from the BatchAA cache.
+define void @test(ptr %p1, i64 %arg1, i64 %arg2) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P1:%.*]], i64 [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
+; CHECK-NEXT:  [[_PREHEADER48_PREHEADER_1:.*]]:
+; CHECK-NEXT:    br label %[[_LOOPEXIT49_1:.*]]
+; CHECK:       [[_LOOPEXIT49_1]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr [ [[I21:%.*]], %[[BB20:.*]] ], [ [[P1]], %[[_PREHEADER48_PREHEADER_1]] ]
+; CHECK-NEXT:    br i1 false, label %[[BB22:.*]], label %[[DOTPREHEADER48_PREHEADER_1:.*]]
+; CHECK:       [[DEAD:.*]]:
+; CHECK-NEXT:    br label %[[DOTPREHEADER48_PREHEADER_1]]
+; CHECK:       [[_PREHEADER48_PREHEADER_2:.*:]]
+; CHECK-NEXT:    [[I5:%.*]] = phi ptr [ [[I]], %[[DEAD]] ], [ [[I]], %[[_LOOPEXIT49_1]] ]
+; CHECK-NEXT:    br label %[[DOTLOOPEXIT49_1:.*]]
+; CHECK:       [[DEAD1:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[DOTLOOPEXIT49_1]], label %[[BB20]]
+; CHECK:       [[_LOOPEXIT49_2:.*:]]
+; CHECK-NEXT:    [[I6:%.*]] = phi ptr [ [[I5]], %[[DEAD1]] ], [ [[I5]], %[[DOTPREHEADER48_PREHEADER_1]] ]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[ARG1]]
+; CHECK-NEXT:    br label %[[BB10:.*]]
+; CHECK:       [[DEAD2:.*]]:
+; CHECK-NEXT:    br label %[[BB10]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[I11:%.*]] = phi ptr [ [[I7]], %[[DOTLOOPEXIT49_1]] ], [ null, %[[DEAD2]] ]
+; CHECK-NEXT:    [[I16:%.*]] = getelementptr i8, ptr [[I11]], i64 8
+; CHECK-NEXT:    [[I17:%.*]] = load i64, ptr [[I16]], align 1
+; CHECK-NEXT:    store i64 [[I17]], ptr [[I6]], align 1
+; CHECK-NEXT:    [[I18:%.*]] = getelementptr i8, ptr [[I6]], i64 8
+; CHECK-NEXT:    [[I19:%.*]] = load i64, ptr [[I11]], align 1
+; CHECK-NEXT:    store i64 [[I19]], ptr [[I18]], align 1
+; CHECK-NEXT:    br label %[[BB20]]
+; CHECK:       [[BB20]]:
+; CHECK-NEXT:    [[I21]] = phi ptr [ [[I5]], %[[DEAD1]] ], [ [[I6]], %[[BB10]] ]
+; CHECK-NEXT:    br label %[[_LOOPEXIT49_1]]
+; CHECK:       [[BB22]]:
+; CHECK-NEXT:    [[I23:%.*]] = getelementptr i8, ptr [[I]], i64 [[ARG2]]
+; CHECK-NEXT:    [[I25:%.*]] = getelementptr i8, ptr [[I23]], i64 8
+; CHECK-NEXT:    br label %[[BB26:.*]]
+; CHECK:       [[BB26]]:
+; CHECK-NEXT:    [[I27:%.*]] = phi ptr [ null, %[[BB26]] ], [ [[I25]], %[[BB22]] ]
+; CHECK-NEXT:    store i64 0, ptr [[I27]], align 1
+; CHECK-NEXT:    [[I28:%.*]] = getelementptr i8, ptr [[I27]], i64 8
+; CHECK-NEXT:    [[I29:%.*]] = load i64, ptr [[I23]], align 1
+; CHECK-NEXT:    store i64 0, ptr [[I28]], align 1
+; CHECK-NEXT:    br label %[[BB26]]
+;
+entry:
+  br label %loop1
+
+loop1:                                            ; preds = %bb20, %entry
+  %i = phi ptr [ %i21, %bb20 ], [ %p1, %entry ]
+  br i1 false, label %bb22, label %.preheader48.preheader.1
+
+dead:                                             ; No predecessors!
+  br label %.preheader48.preheader.1
+
+.preheader48.preheader.1:                         ; preds = %dead, %loop1
+  %i5 = phi ptr [ %i, %dead ], [ %i, %loop1 ]
+  br label %.loopexit49.1
+
+dead1:                                    ; No predecessors!
+  br i1 false, label %.loopexit49.1, label %bb20
+
+.loopexit49.1:                                    ; preds = %dead1, %.preheader48.preheader.1
+  %i6 = phi ptr [ %i5, %dead1 ], [ %i5, %.preheader48.preheader.1 ]
+  %i7 = getelementptr i8, ptr %i6, i64 %arg1
+  br label %bb10
+
+dead2:                                            ; No predecessors!
+  br label %bb10
+
+bb10:                                             ; preds = %dead2, %.loopexit49.1
+  %i11 = phi ptr [ %i7, %.loopexit49.1 ], [ null, %dead2 ]
+  %i16 = getelementptr i8, ptr %i11, i64 8
+  %i17 = load i64, ptr %i16, align 1
+  store i64 %i17, ptr %i6, align 1
+  %i18 = getelementptr i8, ptr %i6, i64 8
+  %i19 = load i64, ptr %i11, align 1
+  store i64 %i19, ptr %i18, align 1
+  br label %bb20
+
+bb20:                                             ; preds = %bb10, %dead1
+  %i21 = phi ptr [ %i5, %dead1 ], [ %i6, %bb10 ]
+  br label %loop1
+
+bb22:                                             ; preds = %loop1
+  %i23 = getelementptr i8, ptr %i, i64 %arg2
+  %i25 = getelementptr i8, ptr %i23, i64 8
+  br label %bb26
+
+bb26:                                             ; preds = %bb26, %bb22
+  %i27 = phi ptr [ null, %bb26 ], [ %i25, %bb22 ]
+  store i64 0, ptr %i27, align 1
+  %i28 = getelementptr i8, ptr %i27, i64 8
+  %i29 = load i64, ptr %i23, align 1
+  store i64 0, ptr %i28, align 1
+  br label %bb26
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea52357e04f..31f0e065cf77d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,19 @@ define i16 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
 ; CHECK-NEXT:    br label [[WHILE:%.*]]
 ; CHECK:       while:
-; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
+; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT:    [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
+; CHECK-NEXT:    [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
 ; CHECK-NEXT:    br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
index f032d4b6ecd45..e8abccea6b0f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
@@ -7,15 +7,13 @@ define void @tes() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
 ; CHECK-NEXT:    br label [[TMP1:%.*]]
 ; CHECK:       1:
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 false, i1 false, i1 false
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP2]], i1 [[OP_RDX]], i1 false
-; CHECK-NEXT:    br i1 [[OP_RDX1]], label [[TMP5:%.*]], label [[TMP6:%.*]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[OP_RDX]], label [[TMP6:%.*]], label [[TMP5:%.*]]
+; CHECK:       4:
 ; CHECK-NEXT:    ret void
-; CHECK:       6:
+; CHECK:       5:
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 838a75dcd29e0..c25e07cfd451f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -436,10 +436,9 @@ define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
 define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-LABEL: @logical_and_icmp_extra_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C:%.*]], i1 [[C]], i1 false
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[S3]], i1 [[TMP3]], i1 false
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -465,10 +464,9 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-LABEL: @logical_or_icmp_extra_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[C]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[S3]], i1 true, i1 [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C:%.*]]
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-fix-99411.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-fix-99411.ll
new file mode 100644
index 0000000000000..655e4a1aa5483
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-fix-99411.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple x86_64-unknown-linux-gnu -passes=slp-vectorizer -S %s | FileCheck %s
+
+define void @e() {
+; CHECK-LABEL: @e(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <1 x i64> zeroinitializer, i64 0
+  %bf.value = and i64 %0, 0
+  %bf.set = or i64 0, %bf.value
+  store i64 %bf.set, ptr getelementptr inbounds (i8, ptr null, i64 8), align 8
+  %bf.value2 = and i64 0, 0
+  %bf.set4 = or i64 0, %bf.value2
+  store i64 %bf.set4, ptr null, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 8f1d7a11e1509..69ecf1852aedd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, ptr [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
+; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
 ; CHECK-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
+; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
 ; CHECK-NEXT:    [[T17:%.*]] = load i32, ptr [[T16]], align 4
 ; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
-; CHECK-NEXT:    [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
+; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll
index 4b37b100763a9..a6e1061189980 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec.ll
@@ -38,3 +38,53 @@ entry:
   store <4 x i32> %add.i65, ptr %arrayidx42, align 4
   ret void
 }
+
+define void @test2(ptr %in, ptr %out) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[IN:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP0]], <16 x i16> [[TMP0]])
+; CHECK-NEXT:    store <16 x i16> [[TMP1]], ptr [[OUT:%.*]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = getelementptr i16, ptr %in, i64 8
+  %1 = load <8 x i16>, ptr %in, align 2
+  %2 = load <8 x i16>, ptr %0, align 2
+  %3 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %1, <8 x i16> %1)
+  %4 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %2, <8 x i16> %2)
+  %5 = getelementptr i16, ptr %out, i64 8
+  store <8 x i16> %3, ptr %out, align 2
+  store <8 x i16> %4, ptr %5, align 2
+  ret void
+}
+
+define void @test3(ptr %x, ptr %y, ptr %z) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> [[TMP0]], ptr [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x ptr> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[Y]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]]
+; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr [[Z:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = getelementptr inbounds i32, ptr %x, i64 4
+  %1 = getelementptr inbounds i32, ptr %y, i64 4
+  %2 = load <4 x i32>, ptr %x, align 4
+  %3 = load <4 x i32>, ptr %0, align 4
+  %4 = load <4 x i32>, ptr %y, align 4
+  %5 = load <4 x i32>, ptr %1, align 4
+  %6 = icmp eq ptr %x, null
+  %7 = icmp eq ptr %y, null
+  %8 = select i1 %6, <4 x i32> %2, <4 x i32> %4
+  %9 = select i1 %7, <4 x i32> %3, <4 x i32> %5
+  %10 = getelementptr inbounds i32, ptr %z, i64 4
+  store <4 x i32> %8, ptr %z, align 4
+  store <4 x i32> %9, ptr %10, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 7c8b27c9de9c0..616617b079828 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -344,6 +344,40 @@ entry:
   ret i32 %loaded
 }
 
+; We should not unconditionally load with sanitizers.
+define i32 @test9_asan(i32 %b, ptr %ptr) sanitize_address {
+; Same as @test8 but for a select rather than a PHI node.
+;
+; CHECK-PRESERVE-CFG-LABEL: @test9_asan(
+; CHECK-PRESERVE-CFG-NEXT:  entry:
+; CHECK-PRESERVE-CFG-NEXT:    [[F:%.*]] = alloca float, align 4
+; CHECK-PRESERVE-CFG-NEXT:    store i32 0, ptr [[PTR:%.*]], align 4
+; CHECK-PRESERVE-CFG-NEXT:    [[TEST:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-PRESERVE-CFG-NEXT:    [[SELECT:%.*]] = select i1 [[TEST]], ptr [[F]], ptr [[PTR]]
+; CHECK-PRESERVE-CFG-NEXT:    [[LOADED:%.*]] = load i32, ptr [[SELECT]], align 4
+; CHECK-PRESERVE-CFG-NEXT:    ret i32 [[LOADED]]
+;
+; CHECK-MODIFY-CFG-LABEL: @test9_asan(
+; CHECK-MODIFY-CFG-NEXT:  entry:
+; CHECK-MODIFY-CFG-NEXT:    store i32 0, ptr [[PTR:%.*]], align 4
+; CHECK-MODIFY-CFG-NEXT:    [[TEST:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-MODIFY-CFG-NEXT:    [[LOADED_ELSE_VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-MODIFY-CFG-NEXT:    br i1 [[TEST]], label [[ENTRY_THEN:%.*]], label [[ENTRY_CONT:%.*]]
+; CHECK-MODIFY-CFG:       entry.then:
+; CHECK-MODIFY-CFG-NEXT:    br label [[ENTRY_CONT]]
+; CHECK-MODIFY-CFG:       entry.cont:
+; CHECK-MODIFY-CFG-NEXT:    [[LOADED:%.*]] = phi i32 [ undef, [[ENTRY_THEN]] ], [ [[LOADED_ELSE_VAL]], [[ENTRY:%.*]] ]
+; CHECK-MODIFY-CFG-NEXT:    ret i32 [[LOADED]]
+;
+entry:
+  %f = alloca float
+  store i32 0, ptr %ptr
+  %test = icmp ne i32 %b, 0
+  %select = select i1 %test, ptr %f, ptr %ptr
+  %loaded = load i32, ptr %select, align 4
+  ret i32 %loaded
+}
+
 define float @test10(i32 %b, ptr %ptr) {
 ; Don't try to promote allocas which are not elligible for it even after
 ; rewriting due to the necessity of inserting bitcasts when speculating a PHI
diff --git a/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll b/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
index a51c757e48bd3..76e00a95e2caf 100644
--- a/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
+++ b/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
@@ -16,8 +16,8 @@ define void @f2(i1 %c1) {
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[G_0_SROA_SPECULATE_LOAD_CLEANUP:%.*]] = load i16, ptr @a, align 1
 ; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT:    i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT:    i32 2, label [[LBL1]]
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    br label [[LBL1]]
@@ -52,6 +52,52 @@ cleanup7:                                         ; preds = %cleanup
   ret void
 }
 
+define void @f2_hwasan(i1 %c1) sanitize_hwaddress {
+; CHECK-LABEL: @f2_hwasan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[E:%.*]] = alloca i16, align 1
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[CLEANUP:%.*]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
+; CHECK-NEXT:    ]
+; CHECK:       if.else:
+; CHECK-NEXT:    br label [[LBL1]]
+; CHECK:       lbl1:
+; CHECK-NEXT:    [[G_0:%.*]] = phi ptr [ @a, [[CLEANUP]] ], [ @a, [[CLEANUP]] ], [ [[E]], [[IF_ELSE]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[G_0]], align 1
+; CHECK-NEXT:    unreachable
+; CHECK:       cleanup7:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %e = alloca i16, align 1
+  br i1 %c1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.then
+  switch i32 2, label %cleanup7 [
+  i32 0, label %lbl1
+  i32 2, label %lbl1
+  ]
+
+if.else:                                          ; preds = %entry
+  br label %lbl1
+
+lbl1:                                             ; preds = %if.else, %cleanup, %cleanup
+  %g.0 = phi ptr [ @a, %cleanup ], [ @a, %cleanup ], [ %e, %if.else ]
+  %0 = load i16, ptr %g.0, align 1
+  unreachable
+
+cleanup7:                                         ; preds = %cleanup
+  ret void
+}
+
 define void @f3(i1 %c1) {
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  entry:
@@ -61,8 +107,8 @@ define void @f3(i1 %c1) {
 ; CHECK-NEXT:    br label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT:    i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT:    i32 2, label [[LBL1]]
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    br label [[LBL1]]
@@ -112,8 +158,8 @@ define void @f4(i1 %c1) {
 ; CHECK-NEXT:    br label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT:    i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT:    i32 2, label [[LBL1]]
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    br label [[LBL1]]
@@ -165,8 +211,8 @@ define void @f5(i1 %c1, i1 %c2) {
 ; CHECK-NEXT:    br label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT:    i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT:    i32 2, label [[LBL1]]
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    br label [[LBL1]]
@@ -216,8 +262,8 @@ define void @f6(i1 %c1) {
 ; CHECK-NEXT:    br label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT:    i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT:    i32 2, label [[LBL1]]
+; CHECK-NEXT:      i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT:      i32 2, label [[LBL1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    br label [[LBL1]]
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index 1bf1ad7ee934a..4eb6a7107dad3 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -17,9 +17,9 @@ define void @foo() {
 ; CHECK-NEXT:    [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
 ; CHECK-NEXT:    [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
-; CHECK-NEXT:    store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0)
+; CHECK-NEXT:    store <1 x i64> [[TMP2]], ptr @A, align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -29,13 +29,13 @@ entry:
   %call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
   %0 = load <32 x i8>, ptr %call.i, align 32
   store <32 x i8> %0, ptr %ref.tmp, align 32
-  %1 = load x86_mmx, ptr %ref.tmp, align 32
-  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
-  store x86_mmx %2, ptr @A, align 8
+  %1 = load <1 x i64>, ptr %ref.tmp, align 32
+  %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 0)
+  store <1 x i64> %2, ptr @A, align 8
   ret void
 }
 
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8 immarg)
 
 declare dso_local void @value_create(ptr sret(%struct.Value) align 32)
 
diff --git a/llvm/test/Transforms/SROA/select-load.ll b/llvm/test/Transforms/SROA/select-load.ll
index 7df72419b0ce5..9de765071b535 100644
--- a/llvm/test/Transforms/SROA/select-load.ll
+++ b/llvm/test/Transforms/SROA/select-load.ll
@@ -36,7 +36,7 @@ entry:
 %st.args = type { i32, ptr }
 
 ; A bitcasted load and a direct load of select.
-define void @test_multiple_loads_select(i1 %cmp){
+define void @test_multiple_loads_select(i1 %cmp) {
 ; CHECK-LABEL: @test_multiple_loads_select(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADDR_I8_SROA_SPECULATED:%.*]] = select i1 [[CMP:%.*]], ptr undef, ptr undef
@@ -57,6 +57,51 @@ entry:
   ret void
 }
 
+; Sanitizer will break optimization.
+define void @test_multiple_loads_select_asan(i1 %cmp) sanitize_address {
+; CHECK-PRESERVE-CFG-LABEL: @test_multiple_loads_select_asan(
+; CHECK-PRESERVE-CFG-NEXT:  entry:
+; CHECK-PRESERVE-CFG-NEXT:    [[ARGS_SROA_0:%.*]] = alloca ptr, align 8
+; CHECK-PRESERVE-CFG-NEXT:    [[ARGS_SROA_1:%.*]] = alloca ptr, align 8
+; CHECK-PRESERVE-CFG-NEXT:    [[SEL_SROA_SEL:%.*]] = select i1 [[CMP:%.*]], ptr [[ARGS_SROA_1]], ptr [[ARGS_SROA_0]]
+; CHECK-PRESERVE-CFG-NEXT:    [[ADDR_I8:%.*]] = load ptr, ptr [[SEL_SROA_SEL]], align 8
+; CHECK-PRESERVE-CFG-NEXT:    call void @foo_i8(ptr [[ADDR_I8]])
+; CHECK-PRESERVE-CFG-NEXT:    [[ADDR_I32:%.*]] = load ptr, ptr [[SEL_SROA_SEL]], align 8
+; CHECK-PRESERVE-CFG-NEXT:    call void @foo_i32(ptr [[ADDR_I32]])
+; CHECK-PRESERVE-CFG-NEXT:    ret void
+;
+; CHECK-MODIFY-CFG-LABEL: @test_multiple_loads_select_asan(
+; CHECK-MODIFY-CFG-NEXT:  entry:
+; CHECK-MODIFY-CFG-NEXT:    br i1 [[CMP:%.*]], label [[ENTRY_THEN:%.*]], label [[ENTRY_ELSE:%.*]]
+; CHECK-MODIFY-CFG:       entry.then:
+; CHECK-MODIFY-CFG-NEXT:    br label [[ENTRY_CONT:%.*]]
+; CHECK-MODIFY-CFG:       entry.else:
+; CHECK-MODIFY-CFG-NEXT:    br label [[ENTRY_CONT]]
+; CHECK-MODIFY-CFG:       entry.cont:
+; CHECK-MODIFY-CFG-NEXT:    [[ADDR_I8:%.*]] = phi ptr [ undef, [[ENTRY_THEN]] ], [ undef, [[ENTRY_ELSE]] ]
+; CHECK-MODIFY-CFG-NEXT:    call void @foo_i8(ptr [[ADDR_I8]])
+; CHECK-MODIFY-CFG-NEXT:    br i1 [[CMP]], label [[ENTRY_CONT_THEN:%.*]], label [[ENTRY_CONT_ELSE:%.*]]
+; CHECK-MODIFY-CFG:       entry.cont.then:
+; CHECK-MODIFY-CFG-NEXT:    br label [[ENTRY_CONT_CONT:%.*]]
+; CHECK-MODIFY-CFG:       entry.cont.else:
+; CHECK-MODIFY-CFG-NEXT:    br label [[ENTRY_CONT_CONT]]
+; CHECK-MODIFY-CFG:       entry.cont.cont:
+; CHECK-MODIFY-CFG-NEXT:    [[ADDR_I32:%.*]] = phi ptr [ undef, [[ENTRY_CONT_THEN]] ], [ undef, [[ENTRY_CONT_ELSE]] ]
+; CHECK-MODIFY-CFG-NEXT:    call void @foo_i32(ptr [[ADDR_I32]])
+; CHECK-MODIFY-CFG-NEXT:    ret void
+;
+entry:
+  %args = alloca [2 x %st.args], align 16
+  %arr1 = getelementptr inbounds [2 x %st.args], ptr %args, i64 0, i64 1
+  %sel = select i1 %cmp, ptr %arr1, ptr %args
+  %addr = getelementptr inbounds %st.args, ptr %sel, i64 0, i32 1
+  %addr.i8 = load ptr, ptr %addr, align 8
+  call void @foo_i8(ptr %addr.i8)
+  %addr.i32 = load ptr, ptr %addr, align 8
+  call void @foo_i32 (ptr %addr.i32)
+  ret void
+}
+
 declare void @foo_i8(ptr)
 declare void @foo_i32(ptr)
 
@@ -414,13 +459,13 @@ define void @load_of_select_with_noundef_nonnull(ptr %buffer, i1 %b) {
 ; CHECK-PRESERVE-CFG-LABEL: @load_of_select_with_noundef_nonnull(
 ; CHECK-PRESERVE-CFG-NEXT:    [[UB_PTR:%.*]] = alloca ptr, align 8
 ; CHECK-PRESERVE-CFG-NEXT:    [[SELECT_PTR:%.*]] = select i1 [[B:%.*]], ptr [[BUFFER:%.*]], ptr [[UB_PTR]]
-; CHECK-PRESERVE-CFG-NEXT:    [[LOAD_PTR:%.*]] = load ptr, ptr [[SELECT_PTR]], align 8, !nonnull !1, !noundef !1
+; CHECK-PRESERVE-CFG-NEXT:    [[LOAD_PTR:%.*]] = load ptr, ptr [[SELECT_PTR]], align 8, !nonnull [[META1:![0-9]+]], !noundef [[META1]]
 ; CHECK-PRESERVE-CFG-NEXT:    ret void
 ;
 ; CHECK-MODIFY-CFG-LABEL: @load_of_select_with_noundef_nonnull(
 ; CHECK-MODIFY-CFG-NEXT:    br i1 [[B:%.*]], label [[DOTTHEN:%.*]], label [[DOTCONT:%.*]]
 ; CHECK-MODIFY-CFG:       .then:
-; CHECK-MODIFY-CFG-NEXT:    [[LOAD_PTR_THEN_VAL:%.*]] = load ptr, ptr [[BUFFER:%.*]], align 8, !nonnull !2, !noundef !2
+; CHECK-MODIFY-CFG-NEXT:    [[LOAD_PTR_THEN_VAL:%.*]] = load ptr, ptr [[BUFFER:%.*]], align 8, !nonnull [[META2:![0-9]+]], !noundef [[META2]]
 ; CHECK-MODIFY-CFG-NEXT:    br label [[DOTCONT]]
 ; CHECK-MODIFY-CFG:       .cont:
 ; CHECK-MODIFY-CFG-NEXT:    [[LOAD_PTR:%.*]] = phi ptr [ [[LOAD_PTR_THEN_VAL]], [[DOTTHEN]] ], [ undef, [[TMP0:%.*]] ]
diff --git a/llvm/test/Transforms/SROA/tbaa-struct.ll b/llvm/test/Transforms/SROA/tbaa-struct.ll
deleted file mode 100644
index 29892cb84d8ef..0000000000000
--- a/llvm/test/Transforms/SROA/tbaa-struct.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -S -passes='sroa<preserve-cfg>' %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
-; RUN: opt -S -passes='sroa<modify-cfg>' %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
-
-; SROA should keep `!tbaa.struct` metadata
-
-%vector = type { float, float }
-declare void @llvm.memcpy.p0.p0.i64(ptr writeonly, ptr readonly, i64, i1 immarg)
-declare <2 x float> @foo(ptr %0)
-
-define void @bar(ptr %y2) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[X14:%.*]] = call <2 x float> @foo(ptr [[Y2:%.*]])
-; CHECK-NEXT:    store <2 x float> [[X14]], ptr [[Y2]], align 4, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
-; CHECK-NEXT:    ret void
-;
-  %x7 = alloca %vector
-  %x14 = call <2 x float> @foo(ptr %y2)
-  store <2 x float> %x14, ptr %x7
-  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %y2, ptr align 4 %x7, i64 8, i1 false), !tbaa.struct !10
-  ret void
-}
-
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C++ TBAA"}
-!7 = !{!"vector", !8, i64 0, !8, i64 4}
-!8 = !{!"float", !4, i64 0}
-!10 = !{i64 0, i64 4, !11, i64 4, i64 4, !11}
-!11 = !{!8, !8, i64 0}
-;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-;.
-; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 4, !1, i64 4, i64 4, !1}
-; CHECK: [[META1:![0-9]+]] = !{!2, !2, i64 0}
-; CHECK: [[META2:![0-9]+]] = !{!"float", !3, i64 0}
-; CHECK: [[META3:![0-9]+]] = !{!"omnipotent char", !4, i64 0}
-; CHECK: [[META4:![0-9]+]] = !{!"Simple C++ TBAA"}
-;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-MODIFY-CFG: {{.*}}
-; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll
index 0fcd787fef976..6a1a23728d3c7 100644
--- a/llvm/test/Transforms/SROA/tbaa-struct3.ll
+++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll
@@ -56,7 +56,7 @@ define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceabl
 ; CHECK-NEXT:    [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B]] to i32
 ; CHECK-NEXT:    [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-; CHECK-NEXT:    store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -77,8 +77,8 @@ define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24)
 ; CHECK-NEXT:    store i31 [[A]], ptr [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4
 ; CHECK-NEXT:    store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4
-; CHECK-NEXT:    [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]]
-; CHECK-NEXT:    store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT5]]
+; CHECK-NEXT:    [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT:    store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT4]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -98,7 +98,7 @@ define void @store_vector_part_first(ptr %y2, float %f) {
 ; CHECK-LABEL: define void @store_vector_part_first(
 ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; CHECK-NEXT:    [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 8
 ; CHECK-NEXT:    store float [[F]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -118,7 +118,7 @@ define void @store_vector_part_second(ptr %y2, float %f) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
 ; CHECK-NEXT:    store float [[F]], ptr [[Y2]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 4
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    ret void
 ;
   %x7 = alloca { float, float, float, float }
@@ -134,7 +134,7 @@ define void @store_vector_single(ptr %y2, float %f) {
 ; CHECK-LABEL: define void @store_vector_single(
 ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    ret void
 ;
   %x7 = alloca { float, float }
@@ -161,7 +161,7 @@ define void @memset(ptr %dst, ptr align 8 %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 1 [[A_SROA_4_0_SRC_SROA_IDX]], i32 10, i1 false)
 ; CHECK-NEXT:    store i16 1, ptr [[A_SROA_3]], align 2
 ; CHECK-NEXT:    [[A_SROA_0_1_A_1_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_1_A_1_SROA_IDX2]], i8 42, i32 6, i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_1_A_1_SROA_IDX2]], i8 42, i32 6, i1 false)
 ; CHECK-NEXT:    store i16 10794, ptr [[A_SROA_3]], align 2, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 7, i1 true)
 ; CHECK-NEXT:    [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 7
@@ -199,8 +199,8 @@ define void @memset2(ptr %dst, ptr align 8 %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 2 [[A_SROA_4_0_SRC_SROA_IDX]], i32 90, i1 false)
 ; CHECK-NEXT:    store i8 1, ptr [[A_SROA_3]], align 1
 ; CHECK-NEXT:    [[A_SROA_0_202_A_202_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 202
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[TBAA6]]
-; CHECK-NEXT:    store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[TBAA6]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[TBAA5]]
+; CHECK-NEXT:    store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 209, i1 true)
 ; CHECK-NEXT:    [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 209
 ; CHECK-NEXT:    [[A_SROA_3_0_A_SROA_3_0_COPYLOAD1:%.*]] = load volatile i8, ptr [[A_SROA_3]], align 1
@@ -240,7 +240,7 @@ define void @slice_store_v2i8_1(ptr %dst, ptr %dst.2, ptr %src) {
 ; CHECK-NEXT:    [[A_SROA_2_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6
 ; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_COPYLOAD:%.*]] = load <2 x i8>, ptr [[A_SROA_2_0_SRC_SROA_IDX]], align 2
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_2_SROA_0_0_COPYLOAD]], ptr [[A_SROA_2_SROA_0]], align 4
-; CHECK-NEXT:    store <2 x i8> bitcast (<1 x i16> <i16 123> to <2 x i8>), ptr [[A_SROA_2_SROA_0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store <2 x i8> bitcast (<1 x i16> <i16 123> to <2 x i8>), ptr [[A_SROA_2_SROA_0]], align 4
 ; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_A_SROA_2_SROA_0_0_A_SROA_2_6_V_4:%.*]] = load <2 x i8>, ptr [[A_SROA_2_SROA_0]], align 4
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_2_SROA_0_0_A_SROA_2_SROA_0_0_A_SROA_2_6_V_4]], ptr [[DST_2]], align 2
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 6, i1 true)
@@ -279,8 +279,8 @@ define void @slice_store_v2i8_2(ptr %dst, ptr %dst.2, ptr %src) {
 ; CHECK-NEXT:    store i8 [[A_SROA_0_SROA_4_1_COPYLOAD]], ptr [[A_SROA_0_SROA_4]], align 1
 ; CHECK-NEXT:    [[A_SROA_4_1_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 1 [[A_SROA_4_1_SRC_SROA_IDX]], i32 5, i1 false)
-; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[A_SROA_0_SROA_1]], align 2, !tbaa.struct [[TBAA_STRUCT9:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[A_SROA_0_SROA_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[A_SROA_0_SROA_1]], align 2
+; CHECK-NEXT:    store i8 0, ptr [[A_SROA_0_SROA_4]], align 1
 ; CHECK-NEXT:    [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_A_SROA_0_1_V_4:%.*]] = load <2 x i8>, ptr [[A_SROA_0_SROA_1]], align 2
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_A_SROA_0_1_V_4]], ptr [[DST_2]], align 2
 ; CHECK-NEXT:    [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_COPYLOAD3:%.*]] = load volatile <2 x i8>, ptr [[A_SROA_0_SROA_1]], align 2
@@ -317,7 +317,7 @@ define double @tbaa_struct_load(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP_SROA_3_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
 ; CHECK-NEXT:    [[TMP_SROA_3_0_COPYLOAD:%.*]] = load i64, ptr [[TMP_SROA_3_0_SRC_SROA_IDX]], align 8
 ; CHECK-NEXT:    store i64 [[TMP_SROA_3_0_COPYLOAD]], ptr [[TMP_SROA_3]], align 8
-; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1:%.*]] = load volatile double, ptr [[TMP_SROA_0]], align 8
 ; CHECK-NEXT:    store volatile double [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 8
 ; CHECK-NEXT:    [[TMP_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 8
@@ -356,7 +356,7 @@ define i32 @shorten_integer_store_multiple_fields(ptr %dst, ptr %dst.2, ptr %src
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i32, ptr [[A_SROA_0]], align 4
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[A_SROA_0]], align 4
 ; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD]], ptr [[DST]], align 1
@@ -375,7 +375,7 @@ define <2 x i16> @shorten_vector_store_multiple_fields(ptr %dst, ptr %dst.2, ptr
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8, !tbaa.struct [[TBAA_STRUCT5]]
+; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 8 [[A_SROA_0]], i32 4, i1 true)
 ; CHECK-NEXT:    ret <2 x i16> [[A_SROA_0_0_A_SROA_0_0_L]]
@@ -393,7 +393,7 @@ define <2 x i16> @shorten_vector_store_single_fields(ptr %dst, ptr %dst.2, ptr %
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8, !tbaa.struct [[TBAA_STRUCT10:![0-9]+]]
+; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 8 [[A_SROA_0]], i32 4, i1 true)
 ; CHECK-NEXT:    ret <2 x i16> [[A_SROA_0_0_A_SROA_0_0_L]]
@@ -429,11 +429,11 @@ define i32 @split_load_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[A3_SROA_5_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
 ; CHECK-NEXT:    [[A3_SROA_5_0_COPYLOAD:%.*]] = load i8, ptr [[A3_SROA_5_0_SRC_SROA_IDX]], align 1
 ; CHECK-NEXT:    store i8 [[A3_SROA_5_0_COPYLOAD]], ptr [[A3_SROA_5]], align 1
-; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_0_INSERT:%.*]] = insertvalue { i16, float, i8 } poison, i16 [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD]], 0
-; CHECK-NEXT:    [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_1_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_0_INSERT]], float [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD]], 1
-; CHECK-NEXT:    [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_2_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_1_INSERT]], i8 [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD]], 2
 ; CHECK-NEXT:    [[UNWRAP2:%.*]] = extractvalue { i16, float, i8 } [[LOAD4_FCA_2_INSERT]], 1
 ; CHECK-NEXT:    [[VALCAST2:%.*]] = bitcast float [[UNWRAP2]] to i32
@@ -492,11 +492,11 @@ define i32 @split_store_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[I_2:%.*]] = insertvalue { i16, float, i8 } [[I_1]], float 3.000000e+00, 1
 ; CHECK-NEXT:    [[I_3:%.*]] = insertvalue { i16, float, i8 } [[I_2]], i8 99, 2
 ; CHECK-NEXT:    [[I_3_FCA_0_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 0
-; CHECK-NEXT:    store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[I_3_FCA_1_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 1
-; CHECK-NEXT:    store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[I_3_FCA_2_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 2
-; CHECK-NEXT:    store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1:%.*]] = load volatile i16, ptr [[A3_SROA_0]], align 8
 ; CHECK-NEXT:    store volatile i16 [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 1
 ; CHECK-NEXT:    [[A3_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 2
@@ -552,11 +552,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias
 ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]], i64 0}
 ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
 ; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
-; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
-; CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-; CHECK: [[META7]] = !{!"v2f32", [[META2]], i64 0}
-; CHECK: [[TBAA_STRUCT8]] = !{i64 0, i64 2, [[TBAA0]], i64 2, i64 6, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT9]] = !{i64 0, i64 3, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT10]] = !{i64 0, i64 4, [[TBAA6]]}
+; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"v2f32", [[META2]], i64 0}
 ;.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index c915b9a5e59ac..16e9e5eeb6143 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -397,3 +397,27 @@ entry:
   %ptr2 = getelementptr inbounds %struct0, ptr %ptr, i65 1, i32 3, i64 %idx, i32 1
   ret ptr %ptr2
 }
+
+; Do not extract large constant offset that cannot be folded in to PTX
+; addressing mode
+define void @large_offset(ptr %out, i32 %in) {
+; CHECK-LABEL: define void @large_offset(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[IN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[TMP0]], 536870912
+; CHECK-NEXT:    [[IDX:%.*]] = zext nneg i32 [[ADD]] to i64
+; CHECK-NEXT:    [[GETELEM:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDX]]
+; CHECK-NEXT:    store i32 [[IN]], ptr [[GETELEM]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %add = add nuw nsw i32 %0, 536870912
+  %idx = zext nneg i32 %add to i64
+  %getElem = getelementptr inbounds i32, ptr %out, i64 %idx
+  store i32 %in, ptr %getElem, align 4
+  ret void
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/preserving-debugloc-trivial-terminators.ll b/llvm/test/Transforms/SimpleLoopUnswitch/preserving-debugloc-trivial-terminators.ll
new file mode 100644
index 0000000000000..a17430f2c8012
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/preserving-debugloc-trivial-terminators.ll
@@ -0,0 +1,95 @@
+; RUN: opt -passes='loop(simple-loop-unswitch)' -S < %s | FileCheck %s
+; RUN: opt -passes='loop-mssa(simple-loop-unswitch)' -S < %s | FileCheck %s
+
+; Check that SimpleLoopUnswitch's unswitchTrivialBranch() and unswitchTrivialSwitch()
+; propagates debug locations to the new terminators replacing the old ones.
+
+define i32 @test1(ptr %var, i1 %cond1, i1 %cond2) !dbg !5 {
+; CHECK-LABEL: define i32 @test1(
+; CHECK:       loop_begin:
+; CHECK-NEXT:    br label %[[CONTINUE:.*]], !dbg [[DBG8:![0-9]+]]
+;
+entry:
+  br label %loop_begin, !dbg !8
+
+loop_begin:                                       ; preds = %do_something, %entry
+  br i1 %cond1, label %continue, label %loop_exit, !dbg !9
+
+continue:                                         ; preds = %loop_begin
+  %var_val = load i32, ptr %var, align 4, !dbg !10
+  br i1 %cond2, label %do_something, label %loop_exit, !dbg !11
+
+do_something:                                     ; preds = %continue
+  call void @some_func(), !dbg !12
+  br label %loop_begin, !dbg !13
+
+loop_exit:                                        ; preds = %continue, %loop_begin
+  ret i32 0, !dbg !14
+}
+
+define i32 @test7(i32 %cond1, i32 %x, i32 %y) !dbg !15 {
+; CHECK-LABEL: define i32 @test7(
+; CHECK-SAME: i32 [[COND1:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND1]], label %[[ENTRY_SPLIT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[LOOP_EXIT:.*]]
+; CHECK-NEXT:      i32 1, label %[[LOOP_EXIT]]
+; CHECK-NEXT:    ], !dbg [[DBG16:![0-9]+]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    br label %[[LATCH:.*]], !dbg [[DBG16]]
+;
+entry:
+  br label %loop_begin, !dbg !16
+
+loop_begin:                                       ; preds = %latch, %entry
+  switch i32 %cond1, label %latch [
+  i32 0, label %loop_exit
+  i32 1, label %loop_exit
+  ], !dbg !17
+
+latch:                                            ; preds = %loop_begin
+  call void @some_func(), !dbg !18
+  br label %loop_begin, !dbg !19
+
+loop_exit:                                        ; preds = %loop_begin, %loop_begin
+  %result1 = phi i32 [ %x, %loop_begin ], [ %x, %loop_begin ], !dbg !20
+  %result2 = phi i32 [ %y, %loop_begin ], [ %y, %loop_begin ], !dbg !21
+  %result = add i32 %result1, %result2, !dbg !22
+  ret i32 %result, !dbg !23
+}
+
+; Function Attrs: noreturn
+declare void @some_func()
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+; CHECK: [[DBG8]] = !DILocation(line: 2,
+
+; CHECK: [[DBG16]] = !DILocation(line: 9,
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test2.ll", directory: "/")
+!2 = !{i32 15}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test1", linkageName: "test1", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
+!14 = !DILocation(line: 7, column: 1, scope: !5)
+!15 = distinct !DISubprogram(name: "test7", linkageName: "test7", scope: null, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 8, column: 1, scope: !15)
+!17 = !DILocation(line: 9, column: 1, scope: !15)
+!18 = !DILocation(line: 10, column: 1, scope: !15)
+!19 = !DILocation(line: 11, column: 1, scope: !15)
+!20 = !DILocation(line: 12, column: 1, scope: !15)
+!21 = !DILocation(line: 13, column: 1, scope: !15)
+!22 = !DILocation(line: 14, column: 1, scope: !15)
+!23 = !DILocation(line: 15, column: 1, scope: !15)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll
new file mode 100644
index 0000000000000..82566d47b0328
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; Two-entry phi nodes with unpredictable conditions may get increased budget for folding.
+; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
+; RUN: opt < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-NOFOLD %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-FOLD %s
+
+define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
+; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
+; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NOFOLD-NEXT:  [[BB:.*]]:
+; CHECK-NOFOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
+; CHECK-NOFOLD-NEXT:    br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]]
+; CHECK-NOFOLD:       [[BB3]]:
+; CHECK-NOFOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
+; CHECK-NOFOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
+; CHECK-NOFOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
+; CHECK-NOFOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
+; CHECK-NOFOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
+; CHECK-NOFOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
+; CHECK-NOFOLD-NEXT:    [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]])
+; CHECK-NOFOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
+; CHECK-NOFOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
+; CHECK-NOFOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
+; CHECK-NOFOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
+; CHECK-NOFOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
+; CHECK-NOFOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
+; CHECK-NOFOLD-NEXT:    br label %[[BB20]]
+; CHECK-NOFOLD:       [[BB20]]:
+; CHECK-NOFOLD-NEXT:    [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NOFOLD-NEXT:    [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NOFOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
+; CHECK-NOFOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
+; CHECK-NOFOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
+;
+; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
+; CHECK-FOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-FOLD-NEXT:  [[BB:.*:]]
+; CHECK-FOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
+; CHECK-FOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
+; CHECK-FOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
+; CHECK-FOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
+; CHECK-FOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
+; CHECK-FOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
+; CHECK-FOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
+; CHECK-FOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
+; CHECK-FOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
+; CHECK-FOLD-NEXT:    [[I12:%.*]] = tail call fast float @llvm.sqrt.f32(float [[I11]])
+; CHECK-FOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
+; CHECK-FOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
+; CHECK-FOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
+; CHECK-FOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
+; CHECK-FOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
+; CHECK-FOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
+; CHECK-FOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
+; CHECK-FOLD-NEXT:    [[I21:%.*]] = select nsz i1 [[I]], <2 x float> [[I17]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]]
+; CHECK-FOLD-NEXT:    [[I22:%.*]] = select nsz i1 [[I]], <2 x float> [[I19]], <2 x float> zeroinitializer, !unpredictable [[META0]]
+; CHECK-FOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
+; CHECK-FOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
+; CHECK-FOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
+;
+bb:
+  %i = fcmp fast ogt float %arg, 0x3F747AE140000000
+  br i1 %i, label %bb3, label %bb20, !unpredictable !0
+
+bb3:                                              ; preds = %bb
+  %i4 = extractelement <2 x float> %arg1, i64 0
+  %i5 = fmul fast float %i4, %i4
+  %i6 = extractelement <2 x float> %arg1, i64 1
+  %i7 = fmul fast float %i6, %i6
+  %i8 = fadd fast float %i7, %i5
+  %i9 = extractelement <2 x float> %arg2, i64 0
+  %i10 = fmul fast float %i9, %i9
+  %i11 = fadd fast float %i8, %i10
+  %i12 = tail call fast noundef float @llvm.sqrt.f32(float %i11)
+  %i13 = fdiv fast float 0x3FEFD70A40000000, %i12
+  %i14 = fmul fast float %i13, %i4
+  %i15 = insertelement <2 x float> poison, float %i14, i64 0
+  %i16 = fmul fast float %i13, %i6
+  %i17 = insertelement <2 x float> %i15, float %i16, i64 1
+  %i18 = fmul fast float %i13, %i9
+  %i19 = insertelement <2 x float> %arg2, float %i18, i64 0
+  br label %bb20
+
+bb20:                                             ; preds = %bb3, %bb
+  %i21 = phi nsz <2 x float> [ %i17, %bb3 ], [ zeroinitializer, %bb ]
+  %i22 = phi nsz <2 x float> [ %i19, %bb3 ], [ zeroinitializer, %bb ]
+  %i23 = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %i21, 0
+  %i24 = insertvalue { <2 x float>, <2 x float> } %i23, <2 x float> %i22, 1
+  ret { <2 x float>, <2 x float> } %i24
+}
+
+declare float @llvm.sqrt.f32(float)
+
+attributes #0 = { nounwind }
+
+!0 = !{}
diff --git a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll
new file mode 100644
index 0000000000000..722ccc519a764
--- /dev/null
+++ b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes='unify-loop-exits' -S | FileCheck %s
+
+define fastcc void @undef_phi(i64 %i5247, i1 %i4530, i1 %i4936.not) {
+; CHECK-LABEL: define fastcc void @undef_phi(
+; CHECK-SAME: i64 [[I5247:%.*]], i1 [[I4530:%.*]], i1 [[I4936_NOT:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    br label %[[MBB3932:.*]]
+; CHECK:       [[MBB3932]]:
+; CHECK-NEXT:    br label %[[MBB4454:.*]]
+; CHECK:       [[MBB4321:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I5247]] to i32
+; CHECK-NEXT:    [[I5290:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[I5290]], label %[[MBB3932]], label %[[LOOP_EXIT_GUARD:.*]]
+; CHECK:       [[MBB4454]]:
+; CHECK-NEXT:    br i1 [[I4530]], label %[[MBB4535:.*]], label %[[LOOP_EXIT_GUARD1:.*]]
+; CHECK:       [[MBB4531:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MBB4535]]:
+; CHECK-NEXT:    br i1 [[I4936_NOT]], label %[[LOOP_EXIT_GUARD1]], label %[[MBB4454]]
+; CHECK:       [[MBB5291:.*]]:
+; CHECK-NEXT:    [[I5293:%.*]] = insertvalue [2 x i32] zeroinitializer, i32 [[DOTMOVED:%.*]], 1
+; CHECK-NEXT:    store volatile [2 x i32] [[I5293]], ptr addrspace(5) null, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       [[LOOP_EXIT_GUARD]]:
+; CHECK-NEXT:    [[DOTMOVED]] = phi i32 [ [[TMP0]], %[[MBB4321]] ], [ undef, %[[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    [[GUARD_MBB4531:%.*]] = phi i1 [ false, %[[MBB4321]] ], [ [[GUARD_MBB4531_MOVED:%.*]], %[[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    br i1 [[GUARD_MBB4531]], label %[[MBB4531]], label %[[MBB5291]]
+; CHECK:       [[LOOP_EXIT_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_MBB4531_MOVED]] = phi i1 [ true, %[[MBB4454]] ], [ undef, %[[MBB4535]] ]
+; CHECK-NEXT:    [[GUARD_LOOP_EXIT_GUARD:%.*]] = phi i1 [ true, %[[MBB4454]] ], [ false, %[[MBB4535]] ]
+; CHECK-NEXT:    br i1 [[GUARD_LOOP_EXIT_GUARD]], label %[[LOOP_EXIT_GUARD]], label %[[MBB4321]]
+;
+mbb:
+  br label %mbb3932
+
+mbb3932:                                           ; preds = %mbb4321, %mbb
+  br label %mbb4454
+
+mbb4321:                                           ; preds = %mbb4535
+  %0 = trunc i64 %i5247 to i32
+  %i5290 = icmp eq i32 %0, 0
+  br i1 %i5290, label %mbb3932, label %mbb5291
+
+mbb4454:                                           ; preds = %mbb4535, %mbb3932
+  br i1 %i4530, label %mbb4535, label %mbb4531
+
+mbb4531:                                           ; preds = %mbb4454
+  ret void
+
+mbb4535:                                           ; preds = %mbb4454
+  br i1 %i4936.not, label %mbb4321, label %mbb4454
+
+mbb5291:                                           ; preds = %mbb4321
+  %i5293 = insertvalue [2 x i32] zeroinitializer, i32 %0, 1
+  store volatile [2 x i32] %i5293, ptr addrspace(5) null, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index b49f3c9f3eeb2..775ad4c5ecc36 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -22,13 +22,11 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) {
 
 define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test1_reduce(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 6, i32 9, i32 12, i32 15, i32 21, i32 24, i32 27, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 23, i32 26, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 4, i32 7, i32 10, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 8, i32 11, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 27, i32 12, i32 29, i32 30, i32 15>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 4, i32 19, i32 8, i32 7, i32 18, i32 11, i32 10, i32 17, i32 14, i32 13, i32 16>
+; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -741,70 +739,70 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
-; CHECK-NEXT:    [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
-; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]]
-; CHECK-NEXT:    [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP78:%.*]] = and <16 x i32> [[TMP77]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]]
-; CHECK-NEXT:    [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP82]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP82]], 16
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
+; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]]
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP60:%.*]] = add nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll b/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll
new file mode 100644
index 0000000000000..f04bcc90e5c35
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck  %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck  %s
+
+;
+; Fold reduce(cast(X)) -> trunc(cast(X)) if more cost efficient
+;
+
+define i32 @reduce_add_trunc_v8i64_to_v8i32(<8 x i64> %a0)  {
+; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+define i16 @reduce_add_trunc_v8i64_to_v8i16(<8 x i64> %a0)  {
+; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i16>
+  %red = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tr)
+  ret i16 %red
+}
+
+define i8 @reduce_add_trunc_v8i64_to_v8i8(<8 x i64> %a0)  {
+; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i8>
+  %red = tail call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %tr)
+  ret i8 %red
+}
+
+define i8 @reduce_or_trunc_v8i32_i8(<8 x i32> %a0)  {
+; CHECK-LABEL: @reduce_or_trunc_v8i32_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i32 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <8 x i32> %a0 to <8 x i8>
+  %red = tail call i8 @llvm.vector.reduce.or.v8i32(<8 x i8> %tr)
+  ret i8 %red
+}
+
+define i8 @reduce_xor_trunc_v16i64_i8(<16 x i64> %a0)  {
+; CHECK-LABEL: @reduce_xor_trunc_v16i64_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <16 x i64> %a0 to <16 x i8>
+  %red = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %tr)
+  ret i8 %red
+}
+
+define i16 @reduce_mul_trunc_v8i64_i16(<8 x i64> %a0)  {
+; CHECK-LABEL: @reduce_mul_trunc_v8i64_i16(
+; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RED:%.*]] = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[TR]])
+; CHECK-NEXT:    ret i16 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i16>
+  %red = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %tr)
+  ret i16 %red
+}
+
+define i32 @reduce_or_sext_v8i8_to_v8i32(<8 x i8> %a0)  {
+; CHECK-LABEL: @reduce_or_sext_v8i8_to_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = sext <8 x i8> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+define i32 @reduce_or_sext_v8i16_to_v8i32(<8 x i16> %a0)  {
+; CHECK-LABEL: @reduce_or_sext_v8i16_to_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = sext <8 x i16> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+define i32 @reduce_or_zext_v8i8_to_v8i32(<8 x i8> %a0)  {
+; CHECK-LABEL: @reduce_or_zext_v8i8_to_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = zext <8 x i8> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+define i32 @reduce_or_zext_v8i16_to_v8i32(<8 x i16> %a0)  {
+; CHECK-LABEL: @reduce_or_zext_v8i16_to_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = zext <8 x i16> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+; Negative case - narrowing the reduce (to i8) is illegal.
+; TODO: We could narrow to i16 instead.
+define i32 @reduce_add_trunc_v8i8_to_v8i32(<8 x i8> %a0)  {
+; CHECK-LABEL: @reduce_add_trunc_v8i8_to_v8i32(
+; CHECK-NEXT:    [[TR:%.*]] = zext <8 x i8> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TR]])
+; CHECK-NEXT:    ret i32 [[RED]]
+;
+  %tr = zext <8 x i8> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+
+
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
index 03fbb5e5a4674..97608174b524d 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
@@ -1130,14 +1130,23 @@ define <vscale x 1 x i64> @umax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
 }
 
 define <vscale x 1 x float> @fadd_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; ALL-LABEL: @fadd_nxv1f32_allonesmask(
-; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
+; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fadd float [[Y:%.*]], 4.200000e+01
+; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
+; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
+;
+; NO-VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
+; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1164,14 +1173,23 @@ define <vscale x 1 x float> @fadd_nxv1f32_anymask(<vscale x 1 x float> %x, float
 }
 
 define <vscale x 1 x float> @fsub_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; ALL-LABEL: @fsub_nxv1f32_allonesmask(
-; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
+; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fsub float [[Y:%.*]], 4.200000e+01
+; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
+; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
+;
+; NO-VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
+; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1198,14 +1216,23 @@ define <vscale x 1 x float> @fsub_nxv1f32_anymask(<vscale x 1 x float> %x, float
 }
 
 define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; ALL-LABEL: @fdiv_nxv1f32_allonesmask(
-; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
+; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
+; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
+; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
+;
+; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
+; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1275,14 +1302,23 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
 }
 
 define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask_knownvl(<vscale x 1 x float> %x, float %y) {
-; ALL-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
-; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
-; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
-; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
+; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
+; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
+; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 4)
+; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
+;
+; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
+; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
+; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
+; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index f883282f40c6b..c4aba63568e2f 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -626,6 +626,58 @@ define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceab
   ret <8 x i16> %r
 }
 
+; Negative sanitizer tests.
+
+define <4 x i32> @load_i32_insert_v4i32_asan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_address  {
+; CHECK-LABEL: @load_i32_insert_v4i32_asan(
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i32 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %s = load i32, ptr %p, align 4
+  %r = insertelement <4 x i32> poison, i32 %s, i32 0
+  ret <4 x i32> %r
+}
+
+define <4 x float> @load_v2f32_extract_insert_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_hwaddress  {
+; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_hwasan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %l = load <2 x float>, ptr %p, align 4
+  %s = extractelement <2 x float> %l, i32 0
+  %r = insertelement <4 x float> poison, float %s, i32 0
+  ret <4 x float> %r
+}
+
+define <4 x float> @load_v2f32_extract_insert_v4f32_tsan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_thread  {
+; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_tsan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %l = load <2 x float>, ptr %p, align 4
+  %s = extractelement <2 x float> %l, i32 0
+  %r = insertelement <4 x float> poison, float %s, i32 0
+  ret <4 x float> %r
+}
+
+; Double negative msan tests, it's OK with the optimization.
+
+define <2 x float> @load_f32_insert_v2f32_msan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_memory  {
+; CHECK-LABEL: @load_f32_insert_v2f32_msan(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %s = load float, ptr %p, align 4
+  %r = insertelement <2 x float> poison, float %s, i32 0
+  ret <2 x float> %r
+}
+
 ; PR30986 - split vector loads for scalarized operations
 define <2 x i64> @PR30986(ptr %0) {
 ; CHECK-LABEL: @PR30986(
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index a53abab8b7d14..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -382,3 +382,83 @@ define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 deref
   %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   ret <4 x i32> %s
 }
+
+; Negative-negative tests with msan, which should be OK with widening.
+
+define <4 x float> @load_v1f32_v4f32_msan(ptr dereferenceable(16) %p) sanitize_memory  {
+; CHECK-LABEL: @load_v1f32_v4f32_msan(
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %l = load <1 x float>, ptr %p, align 16
+  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %s
+}
+
+; Negative tests with sanitizers.
+
+define <4 x float> @load_v1f32_v4f32_asan(ptr dereferenceable(16) %p) sanitize_address  {
+; CHECK-LABEL: @load_v1f32_v4f32_asan(
+; CHECK-NEXT:    [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %l = load <1 x float>, ptr %p, align 16
+  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %s
+}
+
+define <4 x float> @load_v2f32_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2f32_v4f32_hwasan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %l = load <2 x float>, ptr %p, align 1
+  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x float> %s
+}
+
+define <4 x float> @load_v3f32_v4f32_tsan(ptr dereferenceable(16) %p) sanitize_thread  {
+; CHECK-LABEL: @load_v3f32_v4f32_tsan(
+; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %l = load <3 x float>, ptr %p, align 1
+  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  ret <4 x float> %s
+}
+
+define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2f32_v8f32_hwasan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <8 x float> [[S]]
+;
+  %l = load <2 x float>, ptr %p, align 1
+  %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %s
+}
+
+define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
+; CHECK-LABEL: @load_v2i32_v4i32_asan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %l = load <2 x i32>, ptr %p, align 1
+  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(ptr dereferenceable(16) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %l = load <2 x i32>, ptr %p, align 1
+  %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ret <4 x i32> %s
+}
diff --git a/llvm/test/Transforms/lower-builtin-allow-check.ll b/llvm/test/Transforms/lower-builtin-allow-check.ll
index 05d940a46716c..bcd9722d2b289 100644
--- a/llvm/test/Transforms/lower-builtin-allow-check.ll
+++ b/llvm/test/Transforms/lower-builtin-allow-check.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes='function(lower-allow-check)' -S | FileCheck %s --check-prefixes=NOPROFILE
 ; RUN: opt < %s -passes='function(lower-allow-check)' -lower-allow-check-random-rate=0 -S | FileCheck %s --check-prefixes=NONE
+; RUN: opt < %s -passes='function(lower-allow-check)' -lower-allow-check-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
 ; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
 ; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
+; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-random-rate=0 -lower-allow-check-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=NONE99
+; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-random-rate=1 -lower-allow-check-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=ALL70
 
 target triple = "x86_64-pc-linux-gnu"
 
@@ -36,6 +39,19 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; NONE-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; NONE-NEXT:    ret i32 [[TMP5]]
 ;
+; ALL-LABEL: define dso_local noundef i32 @simple(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; ALL-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL:       3:
+; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT:    unreachable
+; ALL:       4:
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT:    ret i32 [[TMP5]]
+;
 ; HOT99-LABEL: define dso_local noundef i32 @simple(
 ; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
 ; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -61,6 +77,32 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; HOT70:       4:
 ; HOT70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; HOT70-NEXT:    ret i32 [[TMP5]]
+;
+; NONE99-LABEL: define dso_local noundef i32 @simple(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; NONE99-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99:       3:
+; NONE99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT:    unreachable
+; NONE99:       4:
+; NONE99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT:    ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @simple(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; ALL70-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70:       3:
+; ALL70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT:    unreachable
+; ALL70:       4:
+; ALL70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
   %allow = call i1 @llvm.allow.ubsan.check(i8 22)
@@ -105,6 +147,19 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; NONE-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; NONE-NEXT:    ret i32 [[TMP5]]
 ;
+; ALL-LABEL: define dso_local noundef i32 @hot(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; ALL-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL:       3:
+; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT:    unreachable
+; ALL:       4:
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT:    ret i32 [[TMP5]]
+;
 ; HOT99-LABEL: define dso_local noundef i32 @hot(
 ; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
 ; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -130,6 +185,32 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; HOT70:       4:
 ; HOT70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; HOT70-NEXT:    ret i32 [[TMP5]]
+;
+; NONE99-LABEL: define dso_local noundef i32 @hot(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; NONE99-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99:       3:
+; NONE99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT:    unreachable
+; NONE99:       4:
+; NONE99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT:    ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @hot(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; ALL70-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70:       3:
+; ALL70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT:    unreachable
+; ALL70:       4:
+; ALL70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
   %allow = call i1 @llvm.allow.ubsan.check(i8 22)
@@ -173,6 +254,19 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; NONE-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; NONE-NEXT:    ret i32 [[TMP5]]
 ;
+; ALL-LABEL: define dso_local noundef i32 @veryHot(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; ALL-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL:       3:
+; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT:    unreachable
+; ALL:       4:
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT:    ret i32 [[TMP5]]
+;
 ; HOT99-LABEL: define dso_local noundef i32 @veryHot(
 ; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
 ; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -198,6 +292,32 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; HOT70:       4:
 ; HOT70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; HOT70-NEXT:    ret i32 [[TMP5]]
+;
+; NONE99-LABEL: define dso_local noundef i32 @veryHot(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; NONE99-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99:       3:
+; NONE99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT:    unreachable
+; NONE99:       4:
+; NONE99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT:    ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @veryHot(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; ALL70-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL70-NEXT:    [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70:       3:
+; ALL70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT:    unreachable
+; ALL70:       4:
+; ALL70-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
   %allow = call i1 @llvm.allow.ubsan.check(i8 22)
@@ -254,6 +374,25 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; NONE-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; NONE-NEXT:    ret i32 [[TMP10]]
 ;
+; ALL-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; ALL-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; ALL-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; ALL:       4:
+; ALL-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL:       6:
+; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT:    unreachable
+; ALL:       7:
+; ALL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL-NEXT:    br label [[TMP9]]
+; ALL:       9:
+; ALL-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL-NEXT:    ret i32 [[TMP10]]
+;
 ; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
 ; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
 ; HOT99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -291,6 +430,44 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; HOT70:       9:
 ; HOT70-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; HOT70-NEXT:    ret i32 [[TMP10]]
+;
+; NONE99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; NONE99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; NONE99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; NONE99-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; NONE99:       4:
+; NONE99-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; NONE99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NONE99:       6:
+; NONE99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT:    unreachable
+; NONE99:       7:
+; NONE99-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; NONE99-NEXT:    br label [[TMP9]]
+; NONE99:       9:
+; NONE99-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; NONE99-NEXT:    ret i32 [[TMP10]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; ALL70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; ALL70-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL70-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; ALL70:       4:
+; ALL70-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL70:       6:
+; ALL70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT:    unreachable
+; ALL70:       7:
+; ALL70-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL70-NEXT:    br label [[TMP9]]
+; ALL70:       9:
+; ALL70-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL70-NEXT:    ret i32 [[TMP10]]
 ;
   %3 = icmp eq i32 %0, 0
   br i1 %3, label %9, label %4, !prof !38
@@ -354,6 +531,25 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; NONE-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; NONE-NEXT:    ret i32 [[TMP10]]
 ;
+; ALL-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; ALL-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; ALL-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; ALL:       4:
+; ALL-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL:       6:
+; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT:    unreachable
+; ALL:       7:
+; ALL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL-NEXT:    br label [[TMP9]]
+; ALL:       9:
+; ALL-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL-NEXT:    ret i32 [[TMP10]]
+;
 ; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
 ; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
 ; HOT99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -391,6 +587,44 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; HOT70:       9:
 ; HOT70-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; HOT70-NEXT:    ret i32 [[TMP10]]
+;
+; NONE99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; NONE99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; NONE99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; NONE99-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; NONE99:       4:
+; NONE99-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; NONE99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NONE99:       6:
+; NONE99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT:    unreachable
+; NONE99:       7:
+; NONE99-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; NONE99-NEXT:    br label [[TMP9]]
+; NONE99:       9:
+; NONE99-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; NONE99-NEXT:    ret i32 [[TMP10]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; ALL70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; ALL70-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL70-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; ALL70:       4:
+; ALL70-NEXT:    [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT:    [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL70:       6:
+; ALL70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT:    unreachable
+; ALL70:       7:
+; ALL70-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL70-NEXT:    br label [[TMP9]]
+; ALL70:       9:
+; ALL70-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL70-NEXT:    ret i32 [[TMP10]]
 ;
   %3 = icmp eq i32 %0, 0
   br i1 %3, label %9, label %4, !prof !37
@@ -450,6 +684,11 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; NONE: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
 ; NONE: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
 ;.
+; ALL: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; ALL: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
 ; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
 ; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
 ; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
@@ -460,3 +699,13 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; HOT70: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
 ; HOT70: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
 ;.
+; NONE99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; NONE99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; NONE99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; NONE99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
+; ALL70: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; ALL70: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; ALL70: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; ALL70: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index fe70ba082cb4c..f835b98b24345 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -3,12 +3,12 @@
 ; CHECK: atomic store operand must have integer, pointer, or floating point type!
 ; CHECK: atomic load operand must have integer, pointer, or floating point type!
 
-define void @foo(ptr %P, x86_mmx %v) {
-  store atomic x86_mmx %v, ptr %P unordered, align 8
+define void @foo(ptr %P, <1 x i64> %v) {
+  store atomic <1 x i64> %v, ptr %P unordered, align 8
   ret void
 }
 
-define x86_mmx @bar(ptr %P) {
-  %v = load atomic x86_mmx, ptr %P unordered, align 8
-  ret x86_mmx %v
+define <1 x i64> @bar(ptr %P) {
+  %v = load atomic <1 x i64>, ptr %P unordered, align 8
+  ret <1 x i64> %v
 }
diff --git a/llvm/test/Verifier/byval-size-limit.ll b/llvm/test/Verifier/byval-size-limit.ll
new file mode 100644
index 0000000000000..3eb462b063636
--- /dev/null
+++ b/llvm/test/Verifier/byval-size-limit.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: huge 'byval' arguments are unsupported
+define void @f(ptr byval([2147483648 x i16])) { ret void }
diff --git a/llvm/test/Verifier/param-align.ll b/llvm/test/Verifier/param-align.ll
index bfd01cbc9faa5..caa8f9ac41ea5 100644
--- a/llvm/test/Verifier/param-align.ll
+++ b/llvm/test/Verifier/param-align.ll
@@ -2,19 +2,19 @@
 
 ; Large vector for intrinsics is valid
 ; CHECK-NOT: llvm.fshr
-define dso_local <8192 x i32> @test_intrin(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) {
+define dso_local <2147483648 x i32> @test_intrin(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) {
 entry:
-  %b = call <8192 x i32> @llvm.fshr.v8192i32(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt)
-  ret <8192 x i32> %b
+  %b = call <2147483648 x i32> @llvm.fshr.v8192i32(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt)
+  ret <2147483648 x i32> %b
 }
-declare <8192 x i32> @llvm.fshr.v8192i32 (<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt)
+declare <2147483648 x i32> @llvm.fshr.v8192i32 (<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt)
 
 ; CHECK: Incorrect alignment of argument passed to called function!
 ; CHECK: bar
-define dso_local void @foo(<8192 x float> noundef %vec) {
+define dso_local void @foo(<2147483648 x float> noundef %vec) {
 entry:
-  call void @bar(<8192 x float> %vec)
+  call void @bar(<2147483648 x float> %vec)
   ret void
 }
 
-declare dso_local void @bar(<8192 x float>)
+declare dso_local void @bar(<2147483648 x float>)
diff --git a/llvm/test/Verifier/param-attr-align.ll b/llvm/test/Verifier/param-attr-align.ll
index 038bfa3494f89..700efe5376841 100644
--- a/llvm/test/Verifier/param-attr-align.ll
+++ b/llvm/test/Verifier/param-attr-align.ll
@@ -1,9 +1,9 @@
 ; RUN: not llvm-as < %s 2>&1 | FileCheck %s
 
-; CHECK: Attribute 'align' exceed the max size 2^14
+; CHECK: huge alignments are not supported yet
 define dso_local void @foo(ptr %p) {
 entry:
-  call void @bar(ptr noundef byval(<8 x float>) align 32768 %p)
+  call void @bar(ptr noundef byval(<8 x float>) align 8589934592 %p)
   ret void
 }
 
diff --git a/llvm/test/Verifier/param-ret-align.ll b/llvm/test/Verifier/param-ret-align.ll
index dd302c38b53d2..98cbb4ee88a89 100644
--- a/llvm/test/Verifier/param-ret-align.ll
+++ b/llvm/test/Verifier/param-ret-align.ll
@@ -2,19 +2,19 @@
 
 ; Large vector for intrinsics is valid
 ; CHECK-NOT: llvm.fshr
-define dso_local <8192 x i32> @test_intrin(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) {
+define dso_local <2147483648 x i32> @test_intrin(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) {
 entry:
-  %b = call <8192 x i32> @llvm.fshr.v8192i32(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt)
-  ret <8192 x i32> %b
+  %b = call <2147483648 x i32> @llvm.fshr.v8192i32(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt)
+  ret <2147483648 x i32> %b
 }
-declare <8192 x i32> @llvm.fshr.v8192i32 (<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt)
+declare <2147483648 x i32> @llvm.fshr.v2147483648i32 (<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt)
 
 ; CHECK: Incorrect alignment of return type to called function!
 ; CHECK: bar
 define dso_local void @foo() {
 entry:
-  call <8192 x float> @bar()
+  call <2147483648 x float> @bar()
   ret void
 }
 
-declare dso_local <8192 x float> @bar()
+declare dso_local <2147483648 x float> @bar()
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 1d35fdaa55bbe..ee0c75fc67890 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -182,6 +182,7 @@ def get_asan_rtlib():
         "llvm-bitcode-strip",
         "llvm-config",
         "llvm-cov",
+        "llvm-ctxprof-util",
         "llvm-cxxdump",
         "llvm-cvtres",
         "llvm-debuginfod-find",
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 38b8ba12f0662..bdba243634689 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -16,7 +16,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT:    t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
 ; CHECK-NEXT:    t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
 ; CHECK-NEXT:    t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
-; CHECK-NEXT:    t19: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
+; CHECK-NEXT:    t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
 ; CHECK-EMPTY:
   %loc = alloca i64, addrspace(5)
   %j = load i64, ptr addrspace(5) %loc
@@ -33,8 +33,8 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 ; CHECK-NEXT:    t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
 ; CHECK-NEXT:    t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
 ; CHECK-NEXT:    t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t16: ch,glue = CopyToReg # D:1 t14, Register:i32 $vgpr1, t22, t14:1
-; CHECK-NEXT:    t17: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
+; CHECK-NEXT:    t16: ch,glue = CopyToReg t14, Register:i32 $vgpr1, t22, t14:1
+; CHECK-NEXT:    t17: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
 ; CHECK-EMPTY:
   %loc = alloca i32, addrspace(5)
   %j = load i32, ptr addrspace(5) %loc
@@ -54,8 +54,8 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 ; CHECK-NEXT:    t25: i32 = V_AND_B32_e64 # D:1 t20, t24
 ; CHECK-NEXT:    t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
 ; CHECK-NEXT:    t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT:    t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT:    t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT:    t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
 ; CHECK-EMPTY:
   %loc = alloca i16, addrspace(5)
   %j = load i16, ptr addrspace(5) %loc
@@ -75,8 +75,8 @@ define i64 @i8_test(i8 %i) nounwind readnone {
 ; CHECK-NEXT:    t25: i32 = V_AND_B32_e64 # D:1 t20, t24
 ; CHECK-NEXT:    t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
 ; CHECK-NEXT:    t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT:    t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT:    t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT:    t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
 ; CHECK-EMPTY:
   %loc = alloca i8, addrspace(5)
   %j = load i8, ptr addrspace(5) %loc
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
new file mode 100644
index 0000000000000..35c169002386e
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
@@ -0,0 +1 @@
+[{]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
new file mode 100644
index 0000000000000..fe51488c7066f
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
@@ -0,0 +1 @@
+[]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
new file mode 100644
index 0000000000000..b47e0ee1a04ba
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
@@ -0,0 +1,8 @@
+[{
+  "Guid": 123,
+  "Counters": [1, 2],
+  "Callsites":
+  [
+    {"Guid": 1}
+  ]
+}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
new file mode 100644
index 0000000000000..95cdd45a5a0f7
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
@@ -0,0 +1,5 @@
+[
+  {
+    "Guid": 1231
+  }
+]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
new file mode 100644
index 0000000000000..93d51406d63fb
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
@@ -0,0 +1 @@
+[{}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
new file mode 100644
index 0000000000000..0967ef424bce6
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
@@ -0,0 +1 @@
+{}
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json
new file mode 100644
index 0000000000000..d4a6da4142110
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json
@@ -0,0 +1,47 @@
+[
+  {
+    "Guid": 1000,
+    "Counters": [
+      1,
+      2,
+      3
+    ],
+    "Callsites": [
+      [],
+      [
+        {
+          "Guid": 2000,
+          "Counters": [
+            4,
+            5
+          ]
+        },
+        {
+          "Guid": 18446744073709551613,
+          "Counters": [
+            6,
+            7,
+            8
+          ]
+        }
+      ],
+      [
+        {
+          "Guid": 3000,
+          "Counters": [
+            40,
+            50
+          ]
+        }
+      ]
+    ]
+  },
+  {
+    "Guid": 18446744073709551612,
+    "Counters": [
+      5,
+      9,
+      10
+    ]
+  }
+]
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
new file mode 100644
index 0000000000000..08c83c9f907fb
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
@@ -0,0 +1,24 @@
+; REQUIRES: x86_64-linux
+
+; RUN: not llvm-ctxprof-util nofile.json 2>&1 | FileCheck %s --check-prefix=NO_CMD
+; RUN: not llvm-ctxprof-util invalidCmd --input nofile.json 2>&1 | FileCheck %s --check-prefix=INVALID_CMD
+; RUN: not llvm-ctxprof-util fromJSON nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FLAG
+; RUN: not llvm-ctxprof-util fromJSON --input nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FILE
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/bad.json 2>&1 | FileCheck %s --check-prefix=BAD_JSON
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-vector.json 2>&1 | FileCheck %s --check-prefix=NO_VECTOR
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-ctx.json 2>&1 | FileCheck %s --check-prefix=NO_CTX
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-counters.json 2>&1 | FileCheck %s --check-prefix=NO_COUNTERS
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-bad-subctx.json 2>&1 | FileCheck %s --check-prefix=BAD_SUBCTX
+; RUN: rm -rf %t
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json --output %t/output.bitstream 2>&1 | FileCheck %s --check-prefix=NO_DIR
+
+; NO_CMD: Unknown subcommand 'nofile.json'
+; INVALID_CMD: Unknown subcommand 'invalidCmd'
+; NO_FLAG: Unknown command line argument 'nofile.json'. 
+; NO_FILE: 'nofile.json': No such file or directory
+; BAD_JSON: Expected object key
+; NO_VECTOR: expected array
+; NO_CTX: missing value at (root)[0].Guid
+; NO_COUNTERS: missing value at (root)[0].Counters
+; BAD_SUBCTX: expected array at (root)[0].Callsites[0]
+; NO_DIR: failed to open output
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
new file mode 100644
index 0000000000000..5a21bffa59022
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
@@ -0,0 +1,48 @@
+; REQUIRES: x86_64-linux
+
+; RUN: mkdir -p %t
+; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/empty.json -output %t/empty.bitstream
+; RUN: llvm-bcanalyzer --dump %t/empty.bitstream | FileCheck %s --check-prefix=EMPTY
+
+; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json -output %t/valid.bitstream
+
+; For the valid case, check against a reference output.
+; Note that uint64_t are printed as signed values by llvm-bcanalyzer:
+;  * 18446744073709551613 in json is -3 in the output
+;  * 18446744073709551612 in json is -4 in the output
+; Also we have no callee/context at index 0, 2 callsites for index 1, and one for
+; index 2.
+; RUN: llvm-bcanalyzer --dump %t/valid.bitstream | FileCheck %s --check-prefix=VALID
+
+; EMPTY: <BLOCKINFO_BLOCK/>
+; EMPTY-NEXT: <Metadata NumWords=1 BlockCodeSize=2>
+; EMPTY-NEXT:   <Version op0=1/>
+; EMPTY-NEXT: </Metadata>
+
+; VALID:      <BLOCKINFO_BLOCK/>
+; VALID-NEXT: <Metadata NumWords=30 BlockCodeSize=2>
+; VALID-NEXT:   <Version op0=1/>
+; VALID-NEXT:   <Context NumWords=20 BlockCodeSize=2>
+; VALID-NEXT:     <GUID op0=1000/>
+; VALID-NEXT:     <Counters op0=1 op1=2 op2=3/>
+; VALID-NEXT:     <Context NumWords=5 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=-3/>
+; VALID-NEXT:       <CalleeIndex op0=1/>
+; VALID-NEXT:       <Counters op0=6 op1=7 op2=8/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:     <Context NumWords=3 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=2000/>
+; VALID-NEXT:       <CalleeIndex op0=1/>
+; VALID-NEXT:       <Counters op0=4 op1=5/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:     <Context NumWords=3 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=3000/>
+; VALID-NEXT:       <CalleeIndex op0=2/>
+; VALID-NEXT:       <Counters op0=40 op1=50/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:   </Context>
+; VALID-NEXT:   <Context NumWords=4 BlockCodeSize=2>
+; VALID-NEXT:     <GUID op0=-4/>
+; VALID-NEXT:     <Counters op0=5 op1=9 op2=10/>
+; VALID-NEXT:   </Context>
+; VALID-NEXT: </Metadata>
diff --git a/llvm/test/tools/llvm-dlltool/arm64ec.test b/llvm/test/tools/llvm-dlltool/arm64ec.test
index b03b4eaf7b2d0..4ed0f4a4387d6 100644
--- a/llvm/test/tools/llvm-dlltool/arm64ec.test
+++ b/llvm/test/tools/llvm-dlltool/arm64ec.test
@@ -44,6 +44,29 @@ RUN: llvm-nm --print-armap test3.lib | FileCheck --check-prefix=ARMAP2 %s
 RUN: not llvm-dlltool -m arm64 -d test.def -N test2.def -l test4.lib 2>&1 | FileCheck --check-prefix=ERR %s
 ERR: native .def file is supported only on arm64ec target
 
+RUN: llvm-dlltool -m arm64ec -d test3.def -l test3.lib
+RUN: llvm-readobj test3.lib | FileCheck --check-prefix=ALIAS %s
+
+ALIAS:      File: test.dll
+ALIAS-NEXT: Format: COFF-import-file-ARM64EC
+ALIAS-NEXT: Type: code
+ALIAS-NEXT: Name type: export as
+ALIAS-NEXT: Export name: efunc
+ALIAS-NEXT: Symbol: __imp_func
+ALIAS-NEXT: Symbol: func
+ALIAS-NEXT: Symbol: __imp_aux_func
+ALIAS-NEXT: Symbol: #func
+ALIAS-EMPTY:
+ALIAS-NEXT: File: test.dll
+ALIAS-NEXT: Format: COFF-import-file-ARM64EC
+ALIAS-NEXT: Type: code
+ALIAS-NEXT: Name type: export as
+ALIAS-NEXT: Export name: efunc
+ALIAS-NEXT: Symbol: __imp_efunc
+ALIAS-NEXT: Symbol: efunc
+ALIAS-NEXT: Symbol: __imp_aux_efunc
+ALIAS-NEXT: Symbol: #efunc
+
 #--- test.def
 LIBRARY test.dll
 EXPORTS
@@ -53,3 +76,9 @@ EXPORTS
 LIBRARY test.dll
 EXPORTS
     otherfunc
+
+#--- test3.def
+LIBRARY test.dll
+EXPORTS
+    func == efunc
+    efunc
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-abbrev-forms.s b/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-abbrev-forms.s
index cba6cb6ecd11e..bc3dcfadc31bd 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-abbrev-forms.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-abbrev-forms.s
@@ -9,7 +9,7 @@
 # CHECK: NameIndex @ 0x0: Abbreviation 0x1: DW_IDX_die_offset uses an unknown form: DW_FORM_unknown_1fff.
 # CHECK: warning: NameIndex @ 0x0: Abbreviation 0x3 references an unknown tag: DW_TAG_unknown_8080.
 # CHECK: error: NameIndex @ 0x0: Abbreviation 0x5 has no DW_IDX_die_offset attribute.
-# CHECK: error: NameIndex @ 0x55: Indexing multiple compile units and abbreviation 0x1 has no DW_IDX_compile_unit attribute.
+# CHECK: error: NameIndex @ 0x55: Indexing multiple compile units and abbreviation 0x1 has no DW_IDX_compile_unit or DW_IDX_type_unit attribute.
 
 	.section	.debug_str,"MS",@progbits,1
 .Lstring_producer:
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s
index 5611963a585f6..aad748a301e6b 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s
@@ -18,15 +18,12 @@
 # doesn't really need templates - two local variables would've sufficed
 # (anything that references the type units) but I was working on something else
 # and this seemed minimal enough.
-# A gcc-style type signature reference was also inserted.
 
 
 # CHECK: DW_TAG_template_type_parameter
 # CHECK:   DW_AT_type ({{.*}} "t1")
 # CHECK: DW_TAG_template_type_parameter
 # CHECK:   DW_AT_type ({{.*}} "t2")
-# CHECK: DW_TAG_template_type_parameter
-# CHECK:   DW_AT_type (0xc6694e51369161f2 "t1")
 
 	.text
 	.file	"test.cpp"
@@ -273,13 +270,6 @@ _Z2f1IJ2t12t2EEvv:                      # @_Z2f1IJ2t12t2EEvv
 	.byte	11                              # DW_FORM_data1
 	.byte	0                               # EOM(1)
 	.byte	0                               # EOM(2)
-	.byte	12                              # Abbreviation Code
-	.byte	47                              # DW_TAG_template_type_parameter
-	.byte	0                               # DW_CHILDREN_no
-	.byte	73                              # DW_AT_type
-	.byte	32                              # DW_FORM_ref_sig8
-	.byte	0                               # EOM(1)
-	.byte	0                               # EOM(2)
 	.byte	0                               # EOM(3)
 	.section	.debug_info,"",@progbits
 .Lcu_begin0:
@@ -323,23 +313,18 @@ _Z2f1IJ2t12t2EEvv:                      # @_Z2f1IJ2t12t2EEvv
 	.byte	6                               # Abbrev [6] 0x46:0xd DW_TAG_GNU_template_parameter_pack
 	.byte	5                               # DW_AT_name
 	.byte	7                               # Abbrev [7] 0x48:0x5 DW_TAG_template_type_parameter
-	.long	.Lt1_decl-.Lcu_begin0           # DW_AT_type
+	.long	88                              # DW_AT_type
 	.byte	7                               # Abbrev [7] 0x4d:0x5 DW_TAG_template_type_parameter
-	# Simulate DWARF emitted by GCC where the signature is directly in the type attribute.
-	.long	.Lt2_decl-.Lcu_begin0           # DW_AT_type
-	.byte	12                              # Abbrev [12] DW_TAG_template_type_parameter
-	.quad	-4149699470930386446            # DW_AT_type
+	.long	97                              # DW_AT_type
 	.byte	0                               # End Of Children Mark
 	.byte	0                               # End Of Children Mark
 	.byte	8                               # Abbrev [8] 0x54:0x4 DW_TAG_base_type
 	.byte	4                               # DW_AT_name
 	.byte	5                               # DW_AT_encoding
 	.byte	4                               # DW_AT_byte_size
-  .Lt1_decl:
 	.byte	9                               # Abbrev [9] 0x58:0x9 DW_TAG_structure_type
                                         # DW_AT_declaration
 	.quad	-4149699470930386446            # DW_AT_signature
-  .Lt2_decl:
 	.byte	9                               # Abbrev [9] 0x61:0x9 DW_TAG_structure_type
                                         # DW_AT_declaration
 	.quad	5649318945901130368             # DW_AT_signature
diff --git a/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-die.yaml b/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-die.yaml
new file mode 100644
index 0000000000000..f85d97a555d80
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-die.yaml
@@ -0,0 +1,116 @@
+# RUN: yaml2obj %s -o - | not llvm-dwarfdump -verify - | FileCheck %s
+
+# Verifying local type units. Local type units and compile units are identical
+# except that the DWARF Unit offset for the type unit is specified using a
+# DW_IDX_type_unit. This is a single test that tests if we get an error for a
+# local type unit. We only need to verify that errors work for type units and
+# we can rely on the test for compile units to verify all other errors that can
+# happen in the .debug_names entries.
+
+# CHECK: Verifying .debug_names...
+# CHECK: error: Name Index @ 0x0: Entry @ 0x7f references a non-existing DIE @ 0x25.
+# CHECK: error: Aggregated error counts:
+# CHECK: error: NameIndex references nonexistent DIE occurred 1 time(s).
+# Errors detected.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_PHDR
+    Flags:           [ PF_R ]
+    VAddr:           0x200040
+    Align:           0x8
+    Offset:          0x40
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame
+    LastSec:         .eh_frame
+    VAddr:           0x200000
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .text
+    LastSec:         .text
+    VAddr:           0x201160
+    Align:           0x1000
+    Offset:          0x160
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x0
+    Offset:          0x0
+Sections:
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x200120
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C0708900100001C0000001C000000201000000F00000000410E108602430D064A0C070800000000000000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x201160
+    AddressAlign:    0x10
+    Content:         554889E5C745FC0000000031C05DC3
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         260000000500020800000000F23532F3E4235D67230000000121000000000008000000020506010001004800000005000108000000000300210001080000000000000002000F0000000800000004000F00000001560300033E0000000502917B05000442000000000604050407F23532F3E4235D6700
+  - Name:            .debug_abbrev
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         0141011305101772170000021300360B03250B0B3A0B3B0B0000031101252513050325721710171B25111B120673170000042E01111B1206401803253A0B3B0B49133F190000053400021803253A0B3B0B4913000006240003253E0B0B0B00000713003C196920000000
+  - Name:            .debug_str_offsets
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         20000000050000000400000096000000A80000009F000000A400000000000000D1000000
+  - Name:            .debug_names
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x4
+    Content:         900000000500000001000000010000000000000003000000030000001B000000080000004C4C564D303730302A000000000000000100000002000000030000008973880B6A7F9A7C3080880BD10000009F000000A400000000000000070000000D0000000113020B031304190000022E03130419000003240313041900000001002500000000022300000000033E000000000000
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         46616365626F6F6B20636C616E672076657273696F6E2031352E302E30202868747470733A2F2F6769742E696E7465726E616C2E7466626E772E6E65742F7265706F732F6769742F726F2F6F736D6574612F65787465726E616C2F6C6C766D2D70726F6A656374203864356561396432616431633161356139303862623632343663303261626162323235643562633829004C696E6B65723A204C4C442031392E302E300000
+  - Name:            .debug_line
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         590000000500080037000000010101FB0E0D00010101010000000100000101011F010900000003011F020F051E01000000000014E7C4DFD187393499D4332D94A1154D040000090260112000000000001405030AAE060B2E0202000101
+  - Name:            .debug_line_str
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         6D61696E2E637070002F55736572732F67636C6179746F6E2F446F63756D656E74732F7372632F7665726966792D74757300
+Symbols:
+  - Name:            main.cpp
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x201160
+    Size:            0xF
+DWARF:
+  debug_str:
+    - foo
+    - 'Facebook clang version 15.0.0 (https://git.internal.tfbnw.net/repos/git/ro/osmeta/external/llvm-project 8d5ea9d2ad1c1a5a908bb6246c02abab225d5bc8)'
+    - main.cpp
+    - main
+    - int
+    - '/Users/gclayton/Documents/src/verify-tus'
+    - Foo
+  debug_addr:
+    - Length:          0xC
+      Version:         0x5
+      AddressSize:     0x8
+      Entries:
+        - Address:         0x201160
+...
diff --git a/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-tu-index.yaml b/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-tu-index.yaml
new file mode 100644
index 0000000000000..899c4616c8e2d
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/debug-names-local-tu-bad-tu-index.yaml
@@ -0,0 +1,114 @@
+# RUN: yaml2obj %s -o - | not llvm-dwarfdump -verify - | FileCheck %s
+
+# Verifying local type units. Local type units and compile units are identical
+# except that the DWARF Unit offset for the type unit is specified using a
+# DW_IDX_type_unit. This is a single test that tests that the type unit index
+# in the DW_IDX_type_unit value is valid.
+
+# CHECK: Verifying .debug_names...
+# CHECK: error: Name Index @ 0x0: Entry @ 0x7f contains an invalid TU index (1).
+# CHECK: error: Aggregated error counts:
+# CHECK: error: Name Index entry contains invalid TU index occurred 1 time(s).
+# Errors detected.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_PHDR
+    Flags:           [ PF_R ]
+    VAddr:           0x200040
+    Align:           0x8
+    Offset:          0x40
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame
+    LastSec:         .eh_frame
+    VAddr:           0x200000
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .text
+    LastSec:         .text
+    VAddr:           0x201160
+    Align:           0x1000
+    Offset:          0x160
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x0
+    Offset:          0x0
+Sections:
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x200120
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C0708900100001C0000001C000000201000000F00000000410E108602430D064A0C070800000000000000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x201160
+    AddressAlign:    0x10
+    Content:         554889E5C745FC0000000031C05DC3
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         260000000500020800000000F23532F3E4235D67230000000121000000000008000000020506010001004800000005000108000000000300210001080000000000000002000F0000000800000004000F00000001560300033E0000000502917B05000442000000000604050407F23532F3E4235D6700
+  - Name:            .debug_abbrev
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         0141011305101772170000021300360B03250B0B3A0B3B0B0000031101252513050325721710171B25111B120673170000042E01111B1206401803253A0B3B0B49133F190000053400021803253A0B3B0B4913000006240003253E0B0B0B00000713003C196920000000
+  - Name:            .debug_str_offsets
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         20000000050000000400000096000000A80000009F000000A400000000000000D1000000
+  - Name:            .debug_names
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x4
+    Content:         900000000500000001000000010000000000000003000000030000001B000000080000004C4C564D303730302A000000000000000100000002000000030000008973880B6A7F9A7C3080880BD10000009F000000A400000000000000070000000D0000000113020B031304190000022E03130419000003240313041900000001012500000000022300000000033E000000000000
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         46616365626F6F6B20636C616E672076657273696F6E2031352E302E30202868747470733A2F2F6769742E696E7465726E616C2E7466626E772E6E65742F7265706F732F6769742F726F2F6F736D6574612F65787465726E616C2F6C6C766D2D70726F6A656374203864356561396432616431633161356139303862623632343663303261626162323235643562633829004C696E6B65723A204C4C442031392E302E300000
+  - Name:            .debug_line
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         590000000500080037000000010101FB0E0D00010101010000000100000101011F010900000003011F020F051E01000000000014E7C4DFD187393499D4332D94A1154D040000090260112000000000001405030AAE060B2E0202000101
+  - Name:            .debug_line_str
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         6D61696E2E637070002F55736572732F67636C6179746F6E2F446F63756D656E74732F7372632F7665726966792D74757300
+Symbols:
+  - Name:            main.cpp
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x201160
+    Size:            0xF
+DWARF:
+  debug_str:
+    - foo
+    - 'Facebook clang version 15.0.0 (https://git.internal.tfbnw.net/repos/git/ro/osmeta/external/llvm-project 8d5ea9d2ad1c1a5a908bb6246c02abab225d5bc8)'
+    - main.cpp
+    - main
+    - int
+    - '/Users/gclayton/Documents/src/verify-tus'
+    - Foo
+  debug_addr:
+    - Length:          0xC
+      Version:         0x5
+      AddressSize:     0x8
+      Entries:
+        - Address:         0x201160
+...
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
index 904454a547077..df0053a1dcb9b 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubd	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%mm0, %mm2
+# CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        psubq	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubsb	%xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd       (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     172.00 75.83  118.33 17.00  100.83 67.00  67.00
+# CHECK-NEXT:  -     172.00 75.83  117.33 17.00  101.83 67.00  67.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -908,8 +908,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubb	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubd	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsb	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
index 71902fe3625c8..54ff0134bc8c4 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
@@ -49,13 +49,13 @@ pxor   %xmm2, %xmm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      35
-# CHECK-NEXT: Total Cycles:      39
+# CHECK-NEXT: Total Cycles:      37
 # CHECK-NEXT: Total uOps:        35
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.90
-# CHECK-NEXT: IPC:               0.90
-# CHECK-NEXT: Block RThroughput: 11.0
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.95
+# CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -79,7 +79,7 @@ pxor   %xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        pcmpgtw	%xmm2, %xmm2
 # CHECK-NEXT:  1      3     1.00                        psubb	%mm2, %mm2
 # CHECK-NEXT:  1      3     1.00                        psubd	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%mm2, %mm2
 # CHECK-NEXT:  1      3     1.00                        psubw	%mm2, %mm2
 # CHECK-NEXT:  1      0     0.25                        psubb	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        psubd	%xmm2, %xmm2
@@ -118,7 +118,7 @@ pxor   %xmm2, %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     2.00   12.00   -     6.00    -      -
+# CHECK-NEXT:  -      -     3.00   11.00   -     6.00    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -135,7 +135,7 @@ pxor   %xmm2, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubb	%mm2, %mm2
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubq	%mm2, %mm2
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubw	%mm2, %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
@@ -155,48 +155,48 @@ pxor   %xmm2, %xmm2
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pandn	%xmm2, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pxor	%mm2, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pxor	%mm2, %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          0123456
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DR   .    .    .    .    .    .    .  .   subl	%eax, %eax
-# CHECK-NEXT: [0,1]     DR   .    .    .    .    .    .    .  .   subq	%rax, %rax
-# CHECK-NEXT: [0,2]     DR   .    .    .    .    .    .    .  .   xorl	%eax, %eax
-# CHECK-NEXT: [0,3]     DR   .    .    .    .    .    .    .  .   xorq	%rax, %rax
-# CHECK-NEXT: [0,4]     .DeeeER   .    .    .    .    .    .  .   pcmpgtb	%mm2, %mm2
-# CHECK-NEXT: [0,5]     .D===eeeER.    .    .    .    .    .  .   pcmpgtd	%mm2, %mm2
-# CHECK-NEXT: [0,6]     .D======eeeER  .    .    .    .    .  .   pcmpgtw	%mm2, %mm2
-# CHECK-NEXT: [0,7]     .D----------R  .    .    .    .    .  .   pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT: [0,8]     . D---------R  .    .    .    .    .  .   pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT: [0,9]     . D---------R  .    .    .    .    .  .   pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT: [0,10]    . D---------R  .    .    .    .    .  .   pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT: [0,11]    . D========eeeER    .    .    .    .  .   psubb	%mm2, %mm2
-# CHECK-NEXT: [0,12]    .  D==========eeeER .    .    .    .  .   psubd	%mm2, %mm2
-# CHECK-NEXT: [0,13]    .  D=============eeeER   .    .    .  .   psubq	%mm2, %mm2
-# CHECK-NEXT: [0,14]    .  D================eeeER.    .    .  .   psubw	%mm2, %mm2
-# CHECK-NEXT: [0,15]    .  D--------------------R.    .    .  .   psubb	%xmm2, %xmm2
-# CHECK-NEXT: [0,16]    .   D-------------------R.    .    .  .   psubd	%xmm2, %xmm2
-# CHECK-NEXT: [0,17]    .   D-------------------R.    .    .  .   psubq	%xmm2, %xmm2
-# CHECK-NEXT: [0,18]    .   D-------------------R.    .    .  .   psubw	%xmm2, %xmm2
-# CHECK-NEXT: [0,19]    .   D==================eeeER  .    .  .   psubsb	%mm2, %mm2
-# CHECK-NEXT: [0,20]    .    D====================eeeER    .  .   psubsw	%mm2, %mm2
-# CHECK-NEXT: [0,21]    .    DeE----------------------R    .  .   psubsb	%xmm2, %xmm2
-# CHECK-NEXT: [0,22]    .    D=eE---------------------R    .  .   psubsw	%xmm2, %xmm2
-# CHECK-NEXT: [0,23]    .    D=======================eeeER .  .   psubusb	%mm2, %mm2
-# CHECK-NEXT: [0,24]    .    .D=========================eeeER .   psubusw	%mm2, %mm2
-# CHECK-NEXT: [0,25]    .    .D=eE--------------------------R .   psubusb	%xmm2, %xmm2
-# CHECK-NEXT: [0,26]    .    .D==eE-------------------------R .   psubusw	%xmm2, %xmm2
-# CHECK-NEXT: [0,27]    .    .D==eE-------------------------R .   andnps	%xmm0, %xmm0
-# CHECK-NEXT: [0,28]    .    . D==eE------------------------R .   andnpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,29]    .    . D===========================eER.   pandn	%mm2, %mm2
-# CHECK-NEXT: [0,30]    .    . D==eE-------------------------R.   pandn	%xmm2, %xmm2
-# CHECK-NEXT: [0,31]    .    . D-----------------------------R.   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [0,32]    .    .  D----------------------------R.   xorpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,33]    .    .  D===========================eER   pxor	%mm2, %mm2
-# CHECK-NEXT: [0,34]    .    .  D-----------------------------R   pxor	%xmm2, %xmm2
+# CHECK:      [0,0]     DR   .    .    .    .    .    .    ..   subl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .    .    .    .    .    .    ..   subq	%rax, %rax
+# CHECK-NEXT: [0,2]     DR   .    .    .    .    .    .    ..   xorl	%eax, %eax
+# CHECK-NEXT: [0,3]     DR   .    .    .    .    .    .    ..   xorq	%rax, %rax
+# CHECK-NEXT: [0,4]     .DeeeER   .    .    .    .    .    ..   pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: [0,5]     .D===eeeER.    .    .    .    .    ..   pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: [0,6]     .D======eeeER  .    .    .    .    ..   pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: [0,7]     .D----------R  .    .    .    .    ..   pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: [0,8]     . D---------R  .    .    .    .    ..   pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: [0,9]     . D---------R  .    .    .    .    ..   pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: [0,10]    . D---------R  .    .    .    .    ..   pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: [0,11]    . D========eeeER    .    .    .    ..   psubb	%mm2, %mm2
+# CHECK-NEXT: [0,12]    .  D==========eeeER .    .    .    ..   psubd	%mm2, %mm2
+# CHECK-NEXT: [0,13]    .  D=============eER.    .    .    ..   psubq	%mm2, %mm2
+# CHECK-NEXT: [0,14]    .  D==============eeeER  .    .    ..   psubw	%mm2, %mm2
+# CHECK-NEXT: [0,15]    .  D------------------R  .    .    ..   psubb	%xmm2, %xmm2
+# CHECK-NEXT: [0,16]    .   D-----------------R  .    .    ..   psubd	%xmm2, %xmm2
+# CHECK-NEXT: [0,17]    .   D-----------------R  .    .    ..   psubq	%xmm2, %xmm2
+# CHECK-NEXT: [0,18]    .   D-----------------R  .    .    ..   psubw	%xmm2, %xmm2
+# CHECK-NEXT: [0,19]    .   D================eeeER    .    ..   psubsb	%mm2, %mm2
+# CHECK-NEXT: [0,20]    .    D==================eeeER .    ..   psubsw	%mm2, %mm2
+# CHECK-NEXT: [0,21]    .    DeE--------------------R .    ..   psubsb	%xmm2, %xmm2
+# CHECK-NEXT: [0,22]    .    D=eE-------------------R .    ..   psubsw	%xmm2, %xmm2
+# CHECK-NEXT: [0,23]    .    D=====================eeeER   ..   psubusb	%mm2, %mm2
+# CHECK-NEXT: [0,24]    .    .D=======================eeeER..   psubusw	%mm2, %mm2
+# CHECK-NEXT: [0,25]    .    .D=eE------------------------R..   psubusb	%xmm2, %xmm2
+# CHECK-NEXT: [0,26]    .    .D==eE-----------------------R..   psubusw	%xmm2, %xmm2
+# CHECK-NEXT: [0,27]    .    .D==eE-----------------------R..   andnps	%xmm0, %xmm0
+# CHECK-NEXT: [0,28]    .    . D==eE----------------------R..   andnpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,29]    .    . D=========================eER.   pandn	%mm2, %mm2
+# CHECK-NEXT: [0,30]    .    . D==eE-----------------------R.   pandn	%xmm2, %xmm2
+# CHECK-NEXT: [0,31]    .    . D---------------------------R.   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,32]    .    .  D--------------------------R.   xorpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,33]    .    .  D=========================eER   pxor	%mm2, %mm2
+# CHECK-NEXT: [0,34]    .    .  D---------------------------R   pxor	%xmm2, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -219,25 +219,25 @@ pxor   %xmm2, %xmm2
 # CHECK-NEXT: 11.    1     9.0    0.0    0.0       psubb	%mm2, %mm2
 # CHECK-NEXT: 12.    1     11.0   0.0    0.0       psubd	%mm2, %mm2
 # CHECK-NEXT: 13.    1     14.0   0.0    0.0       psubq	%mm2, %mm2
-# CHECK-NEXT: 14.    1     17.0   0.0    0.0       psubw	%mm2, %mm2
-# CHECK-NEXT: 15.    1     0.0    0.0    20.0      psubb	%xmm2, %xmm2
-# CHECK-NEXT: 16.    1     0.0    0.0    19.0      psubd	%xmm2, %xmm2
-# CHECK-NEXT: 17.    1     0.0    0.0    19.0      psubq	%xmm2, %xmm2
-# CHECK-NEXT: 18.    1     0.0    0.0    19.0      psubw	%xmm2, %xmm2
-# CHECK-NEXT: 19.    1     19.0   0.0    0.0       psubsb	%mm2, %mm2
-# CHECK-NEXT: 20.    1     21.0   0.0    0.0       psubsw	%mm2, %mm2
-# CHECK-NEXT: 21.    1     1.0    1.0    22.0      psubsb	%xmm2, %xmm2
-# CHECK-NEXT: 22.    1     2.0    0.0    21.0      psubsw	%xmm2, %xmm2
-# CHECK-NEXT: 23.    1     24.0   0.0    0.0       psubusb	%mm2, %mm2
-# CHECK-NEXT: 24.    1     26.0   0.0    0.0       psubusw	%mm2, %mm2
-# CHECK-NEXT: 25.    1     2.0    0.0    26.0      psubusb	%xmm2, %xmm2
-# CHECK-NEXT: 26.    1     3.0    0.0    25.0      psubusw	%xmm2, %xmm2
-# CHECK-NEXT: 27.    1     3.0    3.0    25.0      andnps	%xmm0, %xmm0
-# CHECK-NEXT: 28.    1     3.0    3.0    24.0      andnpd	%xmm1, %xmm1
-# CHECK-NEXT: 29.    1     28.0   0.0    0.0       pandn	%mm2, %mm2
-# CHECK-NEXT: 30.    1     3.0    0.0    25.0      pandn	%xmm2, %xmm2
-# CHECK-NEXT: 31.    1     0.0    0.0    29.0      xorps	%xmm0, %xmm0
-# CHECK-NEXT: 32.    1     0.0    0.0    28.0      xorpd	%xmm1, %xmm1
-# CHECK-NEXT: 33.    1     28.0   0.0    0.0       pxor	%mm2, %mm2
-# CHECK-NEXT: 34.    1     0.0    0.0    29.0      pxor	%xmm2, %xmm2
-# CHECK-NEXT:        1     6.5    0.2    10.5      <total>
+# CHECK-NEXT: 14.    1     15.0   0.0    0.0       psubw	%mm2, %mm2
+# CHECK-NEXT: 15.    1     0.0    0.0    18.0      psubb	%xmm2, %xmm2
+# CHECK-NEXT: 16.    1     0.0    0.0    17.0      psubd	%xmm2, %xmm2
+# CHECK-NEXT: 17.    1     0.0    0.0    17.0      psubq	%xmm2, %xmm2
+# CHECK-NEXT: 18.    1     0.0    0.0    17.0      psubw	%xmm2, %xmm2
+# CHECK-NEXT: 19.    1     17.0   0.0    0.0       psubsb	%mm2, %mm2
+# CHECK-NEXT: 20.    1     19.0   0.0    0.0       psubsw	%mm2, %mm2
+# CHECK-NEXT: 21.    1     1.0    1.0    20.0      psubsb	%xmm2, %xmm2
+# CHECK-NEXT: 22.    1     2.0    0.0    19.0      psubsw	%xmm2, %xmm2
+# CHECK-NEXT: 23.    1     22.0   0.0    0.0       psubusb	%mm2, %mm2
+# CHECK-NEXT: 24.    1     24.0   0.0    0.0       psubusw	%mm2, %mm2
+# CHECK-NEXT: 25.    1     2.0    0.0    24.0      psubusb	%xmm2, %xmm2
+# CHECK-NEXT: 26.    1     3.0    0.0    23.0      psubusw	%xmm2, %xmm2
+# CHECK-NEXT: 27.    1     3.0    3.0    23.0      andnps	%xmm0, %xmm0
+# CHECK-NEXT: 28.    1     3.0    3.0    22.0      andnpd	%xmm1, %xmm1
+# CHECK-NEXT: 29.    1     26.0   0.0    0.0       pandn	%mm2, %mm2
+# CHECK-NEXT: 30.    1     3.0    0.0    23.0      pandn	%xmm2, %xmm2
+# CHECK-NEXT: 31.    1     0.0    0.0    27.0      xorps	%xmm0, %xmm0
+# CHECK-NEXT: 32.    1     0.0    0.0    26.0      xorpd	%xmm1, %xmm1
+# CHECK-NEXT: 33.    1     26.0   0.0    0.0       pxor	%mm2, %mm2
+# CHECK-NEXT: 34.    1     0.0    0.0    27.0      pxor	%xmm2, %xmm2
+# CHECK-NEXT:        1     6.1    0.2    9.7       <total>
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
index 69491f06d7914..53b9d22313bf5 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor        (%rax), %mm2
 # CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %mm2
 # CHECK-NEXT:  1      1     1.00                        movq	%mm0, %rcx
 # CHECK-NEXT:  2      1     1.00           *            movq	%mm0, (%rax)
-# CHECK-NEXT:  3      3     2.00                        packsswb	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packsswb	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packsswb	(%rax), %mm2
-# CHECK-NEXT:  3      3     2.00                        packssdw	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packssdw	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packssdw	(%rax), %mm2
-# CHECK-NEXT:  3      3     2.00                        packuswb	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packuswb	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packuswb	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        paddb	%mm0, %mm2
 # CHECK-NEXT:  2      6     0.50    *                   paddb	(%rax), %mm2
@@ -284,7 +284,7 @@ pxor        (%rax), %mm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     46.67  35.17  23.67  23.67  2.00   57.17  1.00   0.67
+# CHECK-NEXT:  -      -     45.92  34.42  23.67  23.67  2.00   56.42  0.25   0.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -297,11 +297,11 @@ pxor        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     movq	(%rax), %mm2
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     movq	%mm0, %rcx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   movq	%mm0, (%rax)
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packsswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packsswb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packsswb	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packssdw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packssdw	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packssdw	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packuswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packuswb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packuswb	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     0.50    -      -     paddb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     0.50    -      -     paddb	(%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
index 904454a547077..df0053a1dcb9b 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubd	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%mm0, %mm2
+# CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        psubq	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubsb	%xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd       (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     172.00 75.83  118.33 17.00  100.83 67.00  67.00
+# CHECK-NEXT:  -     172.00 75.83  117.33 17.00  101.83 67.00  67.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -908,8 +908,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubb	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubd	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsb	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
index 5094dd1937fcc..01f516ad30410 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor        (%rax), %mm2
 # CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %mm2
 # CHECK-NEXT:  1      1     1.00                        movq	%mm0, %rcx
 # CHECK-NEXT:  2      1     1.00           *            movq	%mm0, (%rax)
-# CHECK-NEXT:  3      3     2.00                        packsswb	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packsswb	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packsswb	(%rax), %mm2
-# CHECK-NEXT:  3      3     2.00                        packssdw	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packssdw	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packssdw	(%rax), %mm2
-# CHECK-NEXT:  3      3     2.00                        packuswb	%mm0, %mm2
+# CHECK-NEXT:  2      3     2.00                        packuswb	%mm0, %mm2
 # CHECK-NEXT:  3      7     2.00    *                   packuswb	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        paddb	%mm0, %mm2
 # CHECK-NEXT:  2      6     0.50    *                   paddb	(%rax), %mm2
@@ -284,7 +284,7 @@ pxor        (%rax), %mm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     46.67  35.17  23.67  23.67  2.00   57.17  1.00   0.67
+# CHECK-NEXT:  -      -     45.92  34.42  23.67  23.67  2.00   56.42  0.25   0.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -297,11 +297,11 @@ pxor        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     movq	(%rax), %mm2
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     movq	%mm0, %rcx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   movq	%mm0, (%rax)
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packsswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packsswb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packsswb	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packssdw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packssdw	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packssdw	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     2.25   0.25    -     packuswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     packuswb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     packuswb	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     0.50    -      -     paddb	%mm0, %mm2
 # CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     0.50    -      -     paddb	(%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
index c3b8b7389df4c..e2cfd02bc76c8 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubd	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%mm0, %mm2
+# CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        psubq	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psubsb	%xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd       (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     172.00 75.83  118.33 17.00  100.83 67.00  67.00
+# CHECK-NEXT:  -     172.00 75.83  117.33 17.00  101.83 67.00  67.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -908,8 +908,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubb	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubd	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsb	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
index 4a034cc2ee191..bdca772b4956c 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
@@ -83,12 +83,12 @@ vpxor  %xmm3, %xmm3, %xmm5
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      63
-# CHECK-NEXT: Total Cycles:      27
+# CHECK-NEXT: Total Cycles:      25
 # CHECK-NEXT: Total uOps:        63
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    2.33
-# CHECK-NEXT: IPC:               2.33
+# CHECK-NEXT: uOps Per Cycle:    2.52
+# CHECK-NEXT: IPC:               2.52
 # CHECK-NEXT: Block RThroughput: 15.8
 
 # CHECK:      Instruction Info:
@@ -121,7 +121,7 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      3     1.00                        psubb	%mm2, %mm2
 # CHECK-NEXT:  1      3     1.00                        psubd	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%mm2, %mm2
 # CHECK-NEXT:  1      3     1.00                        psubw	%mm2, %mm2
 # CHECK-NEXT:  1      0     0.25                        psubb	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        psubd	%xmm2, %xmm2
@@ -250,71 +250,71 @@ vpxor  %xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DR   .    .    .    .    ..   subl	%eax, %eax
-# CHECK-NEXT: [0,1]     DR   .    .    .    .    ..   subq	%rax, %rax
-# CHECK-NEXT: [0,2]     DR   .    .    .    .    ..   xorl	%eax, %eax
-# CHECK-NEXT: [0,3]     DR   .    .    .    .    ..   xorq	%rax, %rax
-# CHECK-NEXT: [0,4]     .DeeeER   .    .    .    ..   pcmpgtb	%mm2, %mm2
-# CHECK-NEXT: [0,5]     .D===eeeER.    .    .    ..   pcmpgtd	%mm2, %mm2
-# CHECK-NEXT: [0,6]     .D======eeeER  .    .    ..   pcmpgtw	%mm2, %mm2
-# CHECK-NEXT: [0,7]     .D----------R  .    .    ..   pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT: [0,8]     . D---------R  .    .    ..   pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT: [0,9]     . D---------R  .    .    ..   pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT: [0,10]    . D---------R  .    .    ..   pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT: [0,11]    . D---------R  .    .    ..   vpcmpgtb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,12]    .  D--------R  .    .    ..   vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,13]    .  D--------R  .    .    ..   vpcmpgtq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,14]    .  D--------R  .    .    ..   vpcmpgtw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,15]    .  D--------R  .    .    ..   vpcmpgtb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,16]    .   D-------R  .    .    ..   vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,17]    .   D-------R  .    .    ..   vpcmpgtq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,18]    .   D-------R  .    .    ..   vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,19]    .   D======eeeER    .    ..   psubb	%mm2, %mm2
-# CHECK-NEXT: [0,20]    .    D========eeeER .    ..   psubd	%mm2, %mm2
-# CHECK-NEXT: [0,21]    .    D===========eeeER   ..   psubq	%mm2, %mm2
-# CHECK-NEXT: [0,22]    .    D==============eeeER..   psubw	%mm2, %mm2
-# CHECK-NEXT: [0,23]    .    D------------------R..   psubb	%xmm2, %xmm2
-# CHECK-NEXT: [0,24]    .    .D-----------------R..   psubd	%xmm2, %xmm2
-# CHECK-NEXT: [0,25]    .    .D-----------------R..   psubq	%xmm2, %xmm2
-# CHECK-NEXT: [0,26]    .    .D-----------------R..   psubw	%xmm2, %xmm2
-# CHECK-NEXT: [0,27]    .    .D-----------------R..   vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,28]    .    . D----------------R..   vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,29]    .    . D----------------R..   vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,30]    .    . D----------------R..   vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,31]    .    . D----------------R..   vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,32]    .    .  D---------------R..   vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,33]    .    .  D---------------R..   vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,34]    .    .  D---------------R..   vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,35]    .    .  DeE-------------R..   andnps	%xmm0, %xmm0
-# CHECK-NEXT: [0,36]    .    .   DeE------------R..   andnpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,37]    .    .   D=eE-----------R..   vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,38]    .    .   D===eE---------R..   vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,39]    .    .   D==eE----------R..   vandnps	%ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,40]    .    .    D===eE--------R..   vandnpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,41]    .    .    D============eER.   pandn	%mm2, %mm2
-# CHECK-NEXT: [0,42]    .    .    D==eE----------R.   pandn	%xmm2, %xmm2
-# CHECK-NEXT: [0,43]    .    .    DeE------------R.   vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,44]    .    .    .D===eE--------R.   vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT: [0,45]    .    .    .D====eE-------R.   vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT: [0,46]    .    .    .DeE-----------R.   vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,47]    .    .    .D=====eE------R.   vandnps	%ymm2, %ymm2, %ymm5
-# CHECK-NEXT: [0,48]    .    .    . D=====eE-----R.   vandnpd	%ymm1, %ymm1, %ymm5
-# CHECK-NEXT: [0,49]    .    .    . D------------R.   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [0,50]    .    .    . D------------R.   xorpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,51]    .    .    . D------------R.   vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,52]    .    .    .  D-----------R.   vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,53]    .    .    .  D-----------R.   vxorps	%ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,54]    .    .    .  D-----------R.   vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,55]    .    .    .  D==========eER   pxor	%mm2, %mm2
-# CHECK-NEXT: [0,56]    .    .    .   D-----------R   pxor	%xmm2, %xmm2
-# CHECK-NEXT: [0,57]    .    .    .   D-----------R   vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,58]    .    .    .   D-----------R   vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT: [0,59]    .    .    .   D-----------R   vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT: [0,60]    .    .    .    D----------R   vxorps	%ymm4, %ymm4, %ymm5
-# CHECK-NEXT: [0,61]    .    .    .    D----------R   vxorpd	%ymm1, %ymm1, %ymm3
-# CHECK-NEXT: [0,62]    .    .    .    D----------R   vpxor	%xmm3, %xmm3, %xmm5
+# CHECK:      [0,0]     DR   .    .    .    .   .   subl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .    .    .    .   .   subq	%rax, %rax
+# CHECK-NEXT: [0,2]     DR   .    .    .    .   .   xorl	%eax, %eax
+# CHECK-NEXT: [0,3]     DR   .    .    .    .   .   xorq	%rax, %rax
+# CHECK-NEXT: [0,4]     .DeeeER   .    .    .   .   pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: [0,5]     .D===eeeER.    .    .   .   pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: [0,6]     .D======eeeER  .    .   .   pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: [0,7]     .D----------R  .    .   .   pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: [0,8]     . D---------R  .    .   .   pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: [0,9]     . D---------R  .    .   .   pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: [0,10]    . D---------R  .    .   .   pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: [0,11]    . D---------R  .    .   .   vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12]    .  D--------R  .    .   .   vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13]    .  D--------R  .    .   .   vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14]    .  D--------R  .    .   .   vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15]    .  D--------R  .    .   .   vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16]    .   D-------R  .    .   .   vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17]    .   D-------R  .    .   .   vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18]    .   D-------R  .    .   .   vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19]    .   D======eeeER    .   .   psubb	%mm2, %mm2
+# CHECK-NEXT: [0,20]    .    D========eeeER .   .   psubd	%mm2, %mm2
+# CHECK-NEXT: [0,21]    .    D===========eER.   .   psubq	%mm2, %mm2
+# CHECK-NEXT: [0,22]    .    D============eeeER .   psubw	%mm2, %mm2
+# CHECK-NEXT: [0,23]    .    D----------------R .   psubb	%xmm2, %xmm2
+# CHECK-NEXT: [0,24]    .    .D---------------R .   psubd	%xmm2, %xmm2
+# CHECK-NEXT: [0,25]    .    .D---------------R .   psubq	%xmm2, %xmm2
+# CHECK-NEXT: [0,26]    .    .D---------------R .   psubw	%xmm2, %xmm2
+# CHECK-NEXT: [0,27]    .    .D---------------R .   vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28]    .    . D--------------R .   vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29]    .    . D--------------R .   vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30]    .    . D--------------R .   vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31]    .    . D--------------R .   vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32]    .    .  D-------------R .   vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33]    .    .  D-------------R .   vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34]    .    .  D-------------R .   vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35]    .    .  DeE-----------R .   andnps	%xmm0, %xmm0
+# CHECK-NEXT: [0,36]    .    .   DeE----------R .   andnpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,37]    .    .   D=eE---------R .   vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,38]    .    .   D===eE-------R .   vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,39]    .    .   D==eE--------R .   vandnps	%ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,40]    .    .    D===eE------R .   vandnpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,41]    .    .    D==========eER.   pandn	%mm2, %mm2
+# CHECK-NEXT: [0,42]    .    .    D==eE--------R.   pandn	%xmm2, %xmm2
+# CHECK-NEXT: [0,43]    .    .    DeE----------R.   vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,44]    .    .    .D===eE------R.   vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,45]    .    .    .D====eE-----R.   vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,46]    .    .    .DeE---------R.   vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,47]    .    .    .D=====eE----R.   vandnps	%ymm2, %ymm2, %ymm5
+# CHECK-NEXT: [0,48]    .    .    . D=====eE---R.   vandnpd	%ymm1, %ymm1, %ymm5
+# CHECK-NEXT: [0,49]    .    .    . D----------R.   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,50]    .    .    . D----------R.   xorpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,51]    .    .    . D----------R.   vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,52]    .    .    .  D---------R.   vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,53]    .    .    .  D---------R.   vxorps	%ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,54]    .    .    .  D---------R.   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,55]    .    .    .  D========eER   pxor	%mm2, %mm2
+# CHECK-NEXT: [0,56]    .    .    .   D---------R   pxor	%xmm2, %xmm2
+# CHECK-NEXT: [0,57]    .    .    .   D---------R   vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58]    .    .    .   D---------R   vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,59]    .    .    .   D---------R   vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,60]    .    .    .    D--------R   vxorps	%ymm4, %ymm4, %ymm5
+# CHECK-NEXT: [0,61]    .    .    .    D--------R   vxorpd	%ymm1, %ymm1, %ymm3
+# CHECK-NEXT: [0,62]    .    .    .    D--------R   vpxor	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -345,45 +345,45 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT: 19.    1     7.0    0.0    0.0       psubb	%mm2, %mm2
 # CHECK-NEXT: 20.    1     9.0    0.0    0.0       psubd	%mm2, %mm2
 # CHECK-NEXT: 21.    1     12.0   0.0    0.0       psubq	%mm2, %mm2
-# CHECK-NEXT: 22.    1     15.0   0.0    0.0       psubw	%mm2, %mm2
-# CHECK-NEXT: 23.    1     0.0    0.0    18.0      psubb	%xmm2, %xmm2
-# CHECK-NEXT: 24.    1     0.0    0.0    17.0      psubd	%xmm2, %xmm2
-# CHECK-NEXT: 25.    1     0.0    0.0    17.0      psubq	%xmm2, %xmm2
-# CHECK-NEXT: 26.    1     0.0    0.0    17.0      psubw	%xmm2, %xmm2
-# CHECK-NEXT: 27.    1     0.0    0.0    17.0      vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 28.    1     0.0    0.0    16.0      vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 29.    1     0.0    0.0    16.0      vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 30.    1     0.0    0.0    16.0      vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 31.    1     0.0    0.0    16.0      vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 32.    1     0.0    0.0    15.0      vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 33.    1     0.0    0.0    15.0      vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 34.    1     0.0    0.0    15.0      vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 35.    1     1.0    1.0    13.0      andnps	%xmm0, %xmm0
-# CHECK-NEXT: 36.    1     1.0    1.0    12.0      andnpd	%xmm1, %xmm1
-# CHECK-NEXT: 37.    1     2.0    2.0    11.0      vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 38.    1     4.0    2.0    9.0       vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 39.    1     3.0    0.0    10.0      vandnps	%ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 40.    1     4.0    0.0    8.0       vandnpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 41.    1     13.0   0.0    0.0       pandn	%mm2, %mm2
-# CHECK-NEXT: 42.    1     3.0    0.0    10.0      pandn	%xmm2, %xmm2
-# CHECK-NEXT: 43.    1     1.0    1.0    12.0      vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 44.    1     4.0    1.0    8.0       vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT: 45.    1     5.0    1.0    7.0       vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT: 46.    1     1.0    0.0    11.0      vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 47.    1     6.0    3.0    6.0       vandnps	%ymm2, %ymm2, %ymm5
-# CHECK-NEXT: 48.    1     6.0    3.0    5.0       vandnpd	%ymm1, %ymm1, %ymm5
-# CHECK-NEXT: 49.    1     0.0    0.0    12.0      xorps	%xmm0, %xmm0
-# CHECK-NEXT: 50.    1     0.0    0.0    12.0      xorpd	%xmm1, %xmm1
-# CHECK-NEXT: 51.    1     0.0    0.0    12.0      vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 52.    1     0.0    0.0    11.0      vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 53.    1     0.0    0.0    11.0      vxorps	%ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 54.    1     0.0    0.0    11.0      vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 55.    1     11.0   0.0    0.0       pxor	%mm2, %mm2
-# CHECK-NEXT: 56.    1     0.0    0.0    11.0      pxor	%xmm2, %xmm2
-# CHECK-NEXT: 57.    1     0.0    0.0    11.0      vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 58.    1     0.0    0.0    11.0      vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT: 59.    1     0.0    0.0    11.0      vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT: 60.    1     0.0    0.0    10.0      vxorps	%ymm4, %ymm4, %ymm5
-# CHECK-NEXT: 61.    1     0.0    0.0    10.0      vxorpd	%ymm1, %ymm1, %ymm3
-# CHECK-NEXT: 62.    1     0.0    0.0    10.0      vpxor	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:        1     1.9    0.3    8.9       <total>
+# CHECK-NEXT: 22.    1     13.0   0.0    0.0       psubw	%mm2, %mm2
+# CHECK-NEXT: 23.    1     0.0    0.0    16.0      psubb	%xmm2, %xmm2
+# CHECK-NEXT: 24.    1     0.0    0.0    15.0      psubd	%xmm2, %xmm2
+# CHECK-NEXT: 25.    1     0.0    0.0    15.0      psubq	%xmm2, %xmm2
+# CHECK-NEXT: 26.    1     0.0    0.0    15.0      psubw	%xmm2, %xmm2
+# CHECK-NEXT: 27.    1     0.0    0.0    15.0      vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28.    1     0.0    0.0    14.0      vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29.    1     0.0    0.0    14.0      vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30.    1     0.0    0.0    14.0      vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31.    1     0.0    0.0    14.0      vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32.    1     0.0    0.0    13.0      vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33.    1     0.0    0.0    13.0      vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34.    1     0.0    0.0    13.0      vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35.    1     1.0    1.0    11.0      andnps	%xmm0, %xmm0
+# CHECK-NEXT: 36.    1     1.0    1.0    10.0      andnpd	%xmm1, %xmm1
+# CHECK-NEXT: 37.    1     2.0    2.0    9.0       vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 38.    1     4.0    2.0    7.0       vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 39.    1     3.0    0.0    8.0       vandnps	%ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 40.    1     4.0    0.0    6.0       vandnpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 41.    1     11.0   0.0    0.0       pandn	%mm2, %mm2
+# CHECK-NEXT: 42.    1     3.0    0.0    8.0       pandn	%xmm2, %xmm2
+# CHECK-NEXT: 43.    1     1.0    1.0    10.0      vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 44.    1     4.0    1.0    6.0       vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 45.    1     5.0    1.0    5.0       vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 46.    1     1.0    0.0    9.0       vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 47.    1     6.0    3.0    4.0       vandnps	%ymm2, %ymm2, %ymm5
+# CHECK-NEXT: 48.    1     6.0    3.0    3.0       vandnpd	%ymm1, %ymm1, %ymm5
+# CHECK-NEXT: 49.    1     0.0    0.0    10.0      xorps	%xmm0, %xmm0
+# CHECK-NEXT: 50.    1     0.0    0.0    10.0      xorpd	%xmm1, %xmm1
+# CHECK-NEXT: 51.    1     0.0    0.0    10.0      vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 52.    1     0.0    0.0    9.0       vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 53.    1     0.0    0.0    9.0       vxorps	%ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 54.    1     0.0    0.0    9.0       vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 55.    1     9.0    0.0    0.0       pxor	%mm2, %mm2
+# CHECK-NEXT: 56.    1     0.0    0.0    9.0       pxor	%xmm2, %xmm2
+# CHECK-NEXT: 57.    1     0.0    0.0    9.0       vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58.    1     0.0    0.0    9.0       vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 59.    1     0.0    0.0    9.0       vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 60.    1     0.0    0.0    8.0       vxorps	%ymm4, %ymm4, %ymm5
+# CHECK-NEXT: 61.    1     0.0    0.0    8.0       vxorpd	%ymm1, %ymm1, %ymm3
+# CHECK-NEXT: 62.    1     0.0    0.0    8.0       vpxor	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:        1     1.8    0.3    7.7       <total>
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index ef5a9e34a932b..2711bc4579b00 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1461,8 +1461,8 @@ vzeroupper
 # CHECK-NEXT:  4      9     2.00    *                   vphaddsw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphaddw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     0.50                        vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   vphminposuw	(%rax), %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphsubd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   vphsubd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphsubsw	%xmm0, %xmm1, %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     126.00 339.58 199.58 173.83 173.83 38.00  326.58 6.25   11.33
+# CHECK-NEXT:  -     126.00 340.58 198.58 173.83 173.83 38.00  326.58 6.25   11.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2171,8 +2171,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -     2.00    -      -     vphaddsw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     2.33    -      -     vphaddw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     2.33    -      -     vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     vphminposuw	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     2.33    -      -     vphsubd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     2.33    -      -     vphsubd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -     2.00    -      -     vphsubsw	%xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 1d8d67fd323f1..0eec7e4cf58e8 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -189,8 +189,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  2      3     1.00                        pextrq	$1, %xmm0, %rcx
 # CHECK-NEXT:  3      2     1.00           *            pextrq	$1, %xmm0, (%rax)
 # CHECK-NEXT:  3      2     1.00           *            pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      4     0.50                        phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   phminposuw	(%rax), %xmm2
 # CHECK-NEXT:  2      2     2.00                        pinsrb	$1, %eax, %xmm1
 # CHECK-NEXT:  2      6     1.00    *                   pinsrb	$1, (%rax), %xmm1
 # CHECK-NEXT:  2      2     2.00                        pinsrd	$1, %eax, %xmm1
@@ -266,7 +266,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     37.83  31.33  23.67  23.67  5.00   63.33  0.50   1.67
+# CHECK-NEXT:  -      -     38.83  30.33  23.67  23.67  5.00   63.33  0.50   1.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -304,8 +304,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00    -      -     pextrq	$1, %xmm0, %rcx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   pextrq	$1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     phminposuw	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     pinsrb	$1, %eax, %xmm1
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pinsrb	$1, (%rax), %xmm1
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     pinsrd	$1, %eax, %xmm1
diff --git a/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test b/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test
new file mode 100644
index 0000000000000..b17b1492690af
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test
@@ -0,0 +1,199 @@
+## This test tests the behavior of --change-section-address option.
+
+# RUN: yaml2obj -DTYPE=REL %s -o %ti1
+
+## Basic check that the option processes wildcards and changes the address as expected.
+# RUN: llvm-objcopy --change-section-address *+0x20 %ti1 %to1
+# RUN: llvm-readelf --section-headers %to1 | FileCheck %s --check-prefix=CHECK-ADD-ALL
+
+## Check that --change-section-address alias --adjust-section-vma produces the same output as the test above.
+# RUN: llvm-objcopy --adjust-section-vma *+0x20 %ti1 %to2
+# RUN: cmp %to1 %to2
+
+## Check that negative adjustment reduces the address by the specified value.
+# RUN: llvm-objcopy --change-section-address .anotherone-0x30 %ti1 %to3
+# RUN: llvm-readelf --section-headers %to3 | FileCheck %s --check-prefix=CHECK-SUB-SECTION
+
+## Check that a wildcard pattern works and only the specified sections are updated.
+# RUN: llvm-objcopy --change-section-address .text*+0x20 %ti1 %to4
+# RUN: llvm-readelf --section-headers %to4 | FileCheck %s --check-prefix=CHECK-ADD-PATTERN
+
+## Check that regex pattern can be used with --change-section-address.
+# RUN: llvm-objcopy --regex --change-section-address .text.+0x20 %ti1 %to5
+# RUN: llvm-readelf --section-headers %to5 | FileCheck %s --check-prefix=CHECK-ADD-PATTERN
+
+## Check that a section address can be set to a specific value.
+# RUN: llvm-objcopy --change-section-address .text*=0x10 %ti1 %to6
+# RUN: llvm-readelf --section-headers %to6 | FileCheck %s --check-prefix=CHECK-SET-PATTERN
+
+## Check setting that a section address can be set to the maximum possible value (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2=0xffffffffffffffff %ti1 %to7
+# RUN: llvm-readelf --section-headers %to7 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that a section address can be adjusted to the maximum possible value (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2+0xfffffffffffffdff %ti1 %to8
+# RUN: llvm-readelf --section-headers %to8 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that the section address can be adjusted to the minimum possible value (0).
+# RUN: llvm-objcopy --change-section-address .text2-0x200 %ti1 %to9
+# RUN: llvm-readelf --section-headers %to9 | FileCheck %s --check-prefix=CHECK-ZERO
+
+## Check that a section address can be adjusted by a maximum possible positive offset (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2=0 %ti1 %to10
+# RUN: llvm-objcopy --change-section-address .text2+0xffffffffffffffff %to10 %to11
+# RUN: llvm-readelf --section-headers %to11 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that a section address can be adjusted by a maximum possible negative offset (UINT64_MIN).
+# RUN: llvm-objcopy --change-section-address .text2=0xffffffffffffffff %ti1 %to12
+# RUN: llvm-objcopy --change-section-address .text2-0xffffffffffffffff %to12 %to13
+# RUN: llvm-readelf --section-headers %to13 | FileCheck %s --check-prefix=CHECK-ZERO
+
+## Check two independent changes.
+# RUN: llvm-objcopy --change-section-address .text1=0x110 --change-section-address .text2=0x210 %ti1 %to14
+# RUN: llvm-readelf --section-headers %to14 | FileCheck %s --check-prefix=CHECK-INDEPENDENT
+
+## Check two overlapping changes.
+# RUN: llvm-objcopy --change-section-address .anotherone-0x30 --change-section-address .anotherone+0x20 %ti1 %to15
+# RUN: llvm-readelf --section-headers %to15 | FileCheck %s --check-prefix=CHECK-USE-LAST
+
+## Check unused option.
+# RUN: llvm-objcopy --change-section-address .anotherone=0x455 --change-section-address *+0x20 %ti1 %to16
+# RUN: llvm-readelf --section-headers %to16 | FileCheck %s --check-prefix=CHECK-NOTSUPERSET-SET
+
+## Check partial overlap (.anotherone overlaps).
+# RUN: llvm-objcopy --change-section-address *+0x20 --change-section-address .anotherone=0x455 %ti1 %to17
+# RUN: llvm-readelf --section-headers %to17 | FileCheck %s --check-prefix=CHECK-SUPERSET-SET
+
+## Check more complex partial overlap (P1: .anotherone, .text2, P2: .text1, text2) (.text2 overlaps).
+# RUN: llvm-objcopy --regex  --change-section-address  ".(text2|anotherone)+0x20" --change-section-address .text.*+0x30  %ti1 %to18
+# RUN: llvm-readelf --section-headers %to18 | FileCheck %s --check-prefix=CHECK-PARTIAL-OVERLAP
+
+# CHECK-ADD-ALL:          [Nr] Name              Type            Address
+# CHECK-ADD-ALL:               .text1
+# CHECK-ADD-ALL-SAME:                                            0000000000000120
+# CHECK-ADD-ALL:               .text2
+# CHECK-ADD-ALL-SAME:                                            0000000000000220
+# CHECK-ADD-ALL:               .anotherone
+# CHECK-ADD-ALL-SAME:                                            0000000000000320
+# CHECK-ADD-ALL:               =a-b+c++d
+# CHECK-ADD-ALL-SAME:                                            0000000000000420
+# CHECK-ADD-ALL:               .strtab
+# CHECK-ADD_ALL-SAME:                                            0000000000000020
+# CHECK-ADD-ALL:               .shstrtab
+# CHECK-ADD-ALL-SAME:                                            0000000000000020
+
+# CHECK-SUB-SECTION:           .text1
+# CHECK-SUB-SECTION-SAME:                                        0000000000000100
+# CHECK-SUB-SECTION:           .text2
+# CHECK-SUB-SECTION-SAME:                                        0000000000000200
+# CHECK-SUB-SECTION:           .anotherone
+# CHECK-SUB-SECTION-SAME:                                        00000000000002d0
+
+# CHECK-ADD-PATTERN:           .text1
+# CHECK-ADD-PATTERN-SAME:                                        0000000000000120
+# CHECK-ADD-PATTERN:           .text2
+# CHECK-ADD-PATTERN-SAME:                                        0000000000000220
+# CHECK-ADD-PATTERN:           .anotherone
+# CHECK-ADD-PATTERN-SAME:                                        0000000000000300
+
+# CHECK-SET-PATTERN:           .text1
+# CHECK-SET-PATTERN-SAME:                                        0000000000000010
+# CHECK-SET-PATTERN:           .text2
+# CHECK-SET-PATTERN-SAME:                                        0000000000000010
+# CHECK-SET-PATTERN:           .anotherone
+# CHECK-SET-PATTERN-SAME:                                        0000000000000300
+
+# CHECK-MAX:                   .text2
+# CHECK-MAX-SAME:                                                ffffffffffffffff
+# CHECK-ZERO:                  .text2
+# CHECK-ZERO-SAME:                                               0000000000000000
+
+# CHECK-INDEPENDENT:           .text1
+# CHECK-INDEPENDENT-SAME:                                        0000000000000110
+# CHECK-INDEPENDENT:           .text2
+# CHECK-INDEPENDENT-SAME:                                        0000000000000210
+
+# CHECK-USE-LAST:              .anotherone
+# CHECK-USE-LAST-SAME:                                           0000000000000320
+
+# CHECK-NOTSUPERSET-SET:       .text1
+# CHECK-NOTSUPERSET-SET-SAME:                                    0000000000000120
+# CHECK-NOTSUPERSET-SET:       .text2
+# CHECK-NOTSUPERSET-SET-SAME:                                    0000000000000220
+# CHECK-NOTSUPERSET-SET:       .anotherone
+# CHECK-NOTSUPERSET-SET-SAME:                                    0000000000000320
+
+# CHECK-SUPERSET-SET:          .text1
+# CHECK-SUPERSET-SET-SAME:                                       0000000000000120
+# CHECK-SUPERSET-SET:          .text2
+# CHECK-SUPERSET-SET-SAME:                                       0000000000000220
+# CHECK-SUPERSET-SET:          .anotherone
+# CHECK-SUPERSET-SET-SAME:                                       0000000000000455
+
+# CHECK-PARTIAL-OVERLAP:        .text1
+# CHECK-PARTIAL-OVERLAP-SAME:                                    0000000000000130
+# CHECK-PARTIAL-OVERLAP:        .text2
+# CHECK-PARTIAL-OVERLAP-SAME:                                    0000000000000230
+# CHECK-PARTIAL-OVERLAP:        .anotherone
+# CHECK-PARTIAL-OVERLAP-SAME:                                    0000000000000320
+
+## Check overflow by 1.
+# RUN: not llvm-objcopy --change-section-address .anotherone+0xfffffffffffffd00 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-OVERFLOW
+## Check underflow by 1.
+# RUN: not llvm-objcopy --change-section-address .text2-0x201 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-UNDERFLOW
+## Check error when argument value is invalid as a whole.
+# RUN: not llvm-objcopy --change-section-address 0 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-IVALID-VAL
+## Check error when the value is invalid in the argument value.
+# RUN: not llvm-objcopy --change-section-address .anotherone+0c50 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-NOT-INTEGER
+## Check error when the value does not fit in uint64_t.
+# RUN  not llvm-objcopy --change-section-address .text1=0x10000000000000000 %ti1 %to 2>&1 | FileCheck %s --chack-prefix=ERR-NOT-INTEGER
+## Check error when the section pattern is missing.
+# RUN: not llvm-objcopy --change-section-address =0x10 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-SECTION
+## Check error when the negative adjustment value is missing.
+# RUN: not llvm-objcopy --change-section-address .text1- %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-MINUS
+## Check error when the positive adjustment value is missing.
+# RUN: not llvm-objcopy --change-section-address .text1+ %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-PLUS
+## Check error when the value to set the address to is missing.
+# RUN: not llvm-objcopy --change-section-address .text1= %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-EQUAL
+## Check error when the provided regex is invalid.
+# RUN: not llvm-objcopy --regex --change-section-address "ab**-0x20" %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MATCHER-FAILURE
+
+# ERR-OVERFLOW: address 0x300 cannot be increased by 0xfffffffffffffd00. The result would overflow
+# ERR-UNDERFLOW: address 0x200 cannot be decreased by 0x201. The result would underflow
+# ERR-IVALID-VAL: error: bad format for --change-section-address: argument value 0 is invalid. See --help
+# ERR-NOT-INTEGER: error: bad format for --change-section-address: value after + is 0c50 when it should be a 64-bit integer
+# ERR-MISSING-SECTION: error: bad format for --change-section-address: missing section pattern to apply address change to
+# ERR-MISSING-VALUE-MINUS: error: bad format for --change-section-address: missing value of offset after '-'
+# ERR-MISSING-VALUE-PLUS: error: bad format for --change-section-address: missing value of offset after '+'
+# ERR-MISSING-VALUE-EQUAL: error: bad format for --change-section-address: missing address value after '='
+# ERR-MATCHER-FAILURE: error: cannot compile regular expression 'ab**': repetition-operator operand invalid
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_[[TYPE]]
+Sections:
+  - Name:          .text1
+    Type:          SHT_PROGBITS
+    Size:          0x100
+    Address:       0x100
+  - Name:          .text2
+    Type:          SHT_PROGBITS
+    Size:          0x100
+    Address:       0x200
+  - Name:          .anotherone
+    Type:          SHT_PROGBITS
+    Size:          0x100
+    Address:       0x300
+  - Name:          =a-b+c++d
+    Type:          SHT_PROGBITS
+    Size:          0x100
+    Address:       0x400
+
+# RUN: yaml2obj -DTYPE=EXEC %s -o %ti2
+
+## Input file is not ET_REL
+# RUN: not llvm-objcopy --change-section-address *+0x20 %ti2 2>&1 | FileCheck %s --check-prefix=ERR-FILE-TYPE
+
+# ERR-FILE-TYPE: cannot change section address in a non-relocatable file
diff --git a/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s b/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s
new file mode 100644
index 0000000000000..a52ed56d680e3
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s
@@ -0,0 +1,24 @@
+# REQUIRES: bpf-registered-target
+
+## Verify generation of 'Lxx' labels for local jump targets, when
+## --symbolize-operands option is specified.
+
+# RUN: llvm-mc -triple=bpfel %s -filetype=obj -o %t
+# RUN: llvm-objdump -d --symbolize-operands --no-show-raw-insn --no-leading-addr %t | \
+# RUN:   FileCheck %s
+        .text
+main:
+        if r1 > 42 goto +2
+        r1 -= 10
+        goto -3
+        r0 = 0
+        exit
+
+# CHECK:      <main>:
+# CHECK-NEXT: <L1>:
+# CHECK-NEXT: 	if r1 > 0x2a goto +0x2 <L0>
+# CHECK-NEXT: 	r1 -= 0xa
+# CHECK-NEXT: 	goto -0x3 <main>
+# CHECK-NEXT: <L0>:
+# CHECK-NEXT: 	r0 = 0x0
+# CHECK-NEXT: 	exit
diff --git a/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test b/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
index cf24952a2d8d4..011e67cda8c32 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
+++ b/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
@@ -4,11 +4,11 @@
 
 # CHECK:       Disassembly of section .mysection:
 # CHECK-EMPTY:
-# ALL-NEXT:    <$x.0>:
+# ALL-NEXT:    <$x>:
 # CHECK-NEXT:  <_start>:
 # CHECK-NEXT:         0:       10000021        adr     x1, 0x4
 # CHECK-EMPTY:
-# ALL-NEXT:    <$d.1>:
+# ALL-NEXT:    <$d>:
 # CHECK-NEXT:  <msg>:
 # CHECK-NEXT:         4:       48 65 6c 6c     .word
 # CHECK-NEXT:         8:       6f 2c 20 77     .word
@@ -18,10 +18,10 @@
 # CHECK-NEXT:  Disassembly of section .myothersection:
 # CHECK-EMPTY:
 # NOALL-NEXT:  <.myothersection>:
-# ALL-NEXT:    <$x.2>:
+# ALL-NEXT:    <$x>:
 # CHECK-NEXT:         0:       90000001        adrp    x1, 0x0
 # CHECK-EMPTY:
-# ALL-NEXT:    <$d.3>:
+# ALL-NEXT:    <$d>:
 # CHECK-NEXT:  <mystr>:
 # CHECK-NEXT:         4:       62 6c 61 68     .word
 # CHECK-NEXT:         8:       9a              .byte   0x9a
diff --git a/llvm/test/tools/llvm-objdump/multiple-symbols.s b/llvm/test/tools/llvm-objdump/multiple-symbols.s
index 24c169e32147b..1b13f099ae98c 100644
--- a/llvm/test/tools/llvm-objdump/multiple-symbols.s
+++ b/llvm/test/tools/llvm-objdump/multiple-symbols.s
@@ -26,13 +26,13 @@
 
 @ HEAD:          Disassembly of section .text:
 @ HEAD-EMPTY:
-@ AMAP-NEXT:     00000000 <$a.0>:
+@ AMAP-NEXT:     00000000 <$a>:
 @ AAAA-NEXT:     00000000 <aaaa>:
 @ BBBB-NEXT:     00000000 <bbbb>:
 @ AABB-NEXT:            0: e0800080      add     r0, r0, r0, lsl #1
 @ AABB-NEXT:            4: e12fff1e      bx      lr
 @ BOTH-EMPTY:    
-@ TMAP-NEXT:     00000008 <$t.1>:
+@ TMAP-NEXT:     00000008 <$t>:
 @ CCCC-NEXT:     00000008 <cccc>:
 @ DDDD-NEXT:     00000008 <dddd>:
 @ CCDD-NEXT:            8: eb00 0080     add.w   r0, r0, r0, lsl #2
diff --git a/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfbin b/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfbin
new file mode 100755
index 0000000000000..7a1543041f805
Binary files /dev/null and b/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfbin differ
diff --git a/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfscript b/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfscript
new file mode 100644
index 0000000000000..3d29d444d56bb
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfscript
@@ -0,0 +1,39 @@
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/3//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/29//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//- 
+br_misp_retired.all_branches:upp:            4012fa 0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/24//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//- 
+br_misp_retired.all_branches:upp:            4012fa 0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/24//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/24//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/6//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/21//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/14//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/2//- 
+br_misp_retired.all_branches:upp:            4012fa 0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/2//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//- 
+br_misp_retired.all_branches:upp:            4012fa 0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/27//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/29//-  0x401310/0x4012f0/P/-/-/24//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/19//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/29//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//- 
+br_misp_retired.all_branches:upp:            4012fa 0x401310/0x4012f0/P/-/-/13//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//- 
+br_misp_retired.all_branches:upp:            4012fa 0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/20//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/34//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//- 
+  br_inst_retired.near_taken:upp:            4012fa 0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/17//-  0x401310/0x4012f0/P/-/-/9//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//- 
+br_misp_retired.all_branches:upp:            4012fa 0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/29//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/6//-  0x401310/0x4012f0/P/-/-/24//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/28//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/15//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/9//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/17//-  0x4012fa/0x4012ff/M/-/-/1//- 
+br_misp_retired.all_branches:upp:            4012fa 0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/19//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/10//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/7//-  0x4012fa/0x4012ff/M/-/-/1//- 
+br_misp_retired.all_branches:upp:            4012fa 0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/21//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/24//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/2//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/5//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/1//-  0x401310/0x4012f0/P/-/-/1//- 
+  br_inst_retired.near_taken:upp:            401310 0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/3//-  0x401310/0x4012f0/P/-/-/23//-  0x4012fa/0x4012ff/M/-/-/4//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/2//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/27//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/25//-  0x401310/0x4012f0/P/-/-/4//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/3//-  0x401310/0x4012f0/P/-/-/1//-  0x4012fa/0x4012ff/P/-/-/1//-  0x401310/0x4012f0/P/-/-/28//-  0x401310/0x4012f0/P/-/-/22//-  0x4012fa/0x4012ff/M/-/-/2//-  0x401310/0x4012f0/P/-/-/26//-  0x401310/0x4012f0/P/-/-/28//- 
diff --git a/llvm/test/tools/llvm-profgen/Inputs/ip-duplication.perfscript b/llvm/test/tools/llvm-profgen/Inputs/ip-duplication.perfscript
new file mode 100644
index 0000000000000..f0d4efcbe668e
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/ip-duplication.perfscript
@@ -0,0 +1,2 @@
+           4006b7 0x4006b7/0x40068b/P/-/-/1  0x4006c8/0x4006b0/P/-/-/1  0x400689/0x4006b9/P/-/-/1  0x40066d/0x400686/P/-/-/2  0x4007a6/0x400650/P/-/-/9  0x4007ca/0x400790/P/-/-/8  0x4007d7/0x4007bd/P/-/-/1  0x400792/0x4007d7/P/-/-/1  0x4007b8/0x400790/P/-/-/2  0x4006a2/0x4007a8/P/-/-/3
+           40065d 40065d/0x40068f/M/-/-/1
diff --git a/llvm/test/tools/llvm-profgen/Inputs/noprobe-skid.perfscript b/llvm/test/tools/llvm-profgen/Inputs/noprobe-skid.perfscript
new file mode 100644
index 0000000000000..7c70a18452e57
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/noprobe-skid.perfscript
@@ -0,0 +1,5 @@
+// Invalid perf line
+           40062f 0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/6  0x40062f/0x4005b0/P/-/-/16  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/6  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005c8/0x4005dc/P/-/-/8  0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/10  0x40062f/0x4005b0/P/-/-/14  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/7  0x40062f/0x4005b0/P/-/-/8  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005c8/0x4005dc/P/-/-/7  0x40062f/0x4005b0/P/-/-/15  0x400645/0x4005ff/P/-/-/1
+           4005d7 0x4005d7/0x4005e5/P/-/-/8  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/7  0x40062f/0x4005b0/P/-/-/11  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/8  0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/5  0x40062f/0x4005b0/P/-/-/11  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/7  0x40062f/0x4005b0/P/-/-/10  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/8  0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/13  0x40062f/0x4005b0/P/-/-/9
+           4005c8 0x4005c8/0x4005dc/P/-/-/11  0x40062f/0x4005b0/P/-/-/8  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/5  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/12  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/7  0x40062f/0x4005b0/P/-/-/10  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/8  0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/12  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/8  0x40062f/0x4005b0/P/-/-/8
+           4005c5 0x4005c8/0x4005dc/P/-/-/11  0x40062f/0x4005b0/P/-/-/8  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/5  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/12  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/7  0x40062f/0x4005b0/P/-/-/10  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/8  0x40062f/0x4005b0/P/-/-/9  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/1  0x4005d7/0x4005e5/P/-/-/12  0x40062f/0x4005b0/P/-/-/6  0x400645/0x4005ff/P/-/-/1  0x400637/0x400645/P/-/-/1  0x4005e9/0x400634/P/-/-/2  0x4005c8/0x4005dc/P/-/-/8  0x40062f/0x4005b0/P/-/-/8
diff --git a/llvm/test/tools/llvm-profgen/event-filtering.test b/llvm/test/tools/llvm-profgen/event-filtering.test
new file mode 100644
index 0000000000000..ea486a8fa2f7b
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/event-filtering.test
@@ -0,0 +1,78 @@
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
+
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
+
+// Check that we can use perf event filtering to generate multiple types of
+// source-level profiles from a single perf profile. In this case, we generate
+// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
+// and a branch mispredict profile using br_misp_retired.all_branches sample
+// IPs.
+
+// The source example below is based on perfKernelCpp/cmov_3, except a
+// misleading builtin is used to persuade the compiler not to use cmov, which
+// induces branch mispredicts.
+
+// CHECK: sel_arr:20229:0
+// CHECK:  3.1: 627
+// CHECK:  3.2: 627
+// CHECK:  4: 615
+// CHECK:  5: 627
+
+// UNPRED: sel_arr:18:0
+// UNPRED:  3.1: 0
+// UNPRED:  3.2: 0
+// UNPRED:  4: 9
+// UNPRED:  5: 0
+
+// CHECK-RAW-PROFILE:      3
+// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:303
+// CHECK-RAW-PROFILE-NEXT: 2f0-310:312
+// CHECK-RAW-PROFILE-NEXT: 2ff-310:315
+
+// UNPRED-RAW-PROFILE:      1
+// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9
+
+// original code:
+// clang -O2 -gline-tables-only -fdebug-info-for-profiling lit.c
+#include <stdlib.h>
+
+#define N 20000
+#define ITERS 10000
+
+static int *m_s1, *m_s2, *m_s3, *m_dst;
+
+void init(void) {
+    m_s1 = malloc(sizeof(int)*N);
+    m_s2 = malloc(sizeof(int)*N);
+    m_s3 = malloc(sizeof(int)*N);
+    m_dst = malloc(sizeof(int)*N);
+    srand(42);
+
+    for (int i = 0; i < N; i++) {
+        m_s1[i] = rand() % N;
+        m_s2[i] = 0;
+        m_s3[i] = 1;
+    }
+}
+
+void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) {
+#pragma nounroll
+#pragma clang loop vectorize(disable) interleave(disable)
+    for (int i = 0; i < N; i++) {
+        int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i];
+        dst[i] = *p;
+    }
+}
+
+int main(void) {
+  init();
+  for(int i=0; i<ITERS; ++i)
+    sel_arr(m_dst, m_s1, m_s2, m_s3);
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-profgen/iponly-nodupfactor.test b/llvm/test/tools/llvm-profgen/iponly-nodupfactor.test
new file mode 100644
index 0000000000000..006b1c42f3234
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/iponly-nodupfactor.test
@@ -0,0 +1,22 @@
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/ip-duplication.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --use-offset=0 --leading-ip-only
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK
+
+; Test that we don't over-count samples for duplicated source code when
+; building an IP-based profile.
+
+; The inline-noprobe2.perfbin binary is used for this test because one of the
+; partition_pivot_last+3.1 debug locations has a duplication factor of 2
+; encoded into its discriminator. In IP-sample mode, a hit in one instruction
+; in the duplicated code does not imply a hit to the other duplicates.
+
+; The perfscript input includes 1 sample at a location with duplication factor
+; of 2, and another sample at the same source location but with no duplication
+; factor. These should be summed without duplication factors. Ensure we record
+; a count of 1+1=2 (and not 2+1=3) for the 3.1 location.
+
+;CHECK-LABEL: partition_pivot_last
+;CHECK-NEXT:  1: 0
+;CHECK-NEXT:  2: 0
+;CHECK-NEXT:  3: 0
+;CHECK-NEXT:  3.1: 2
+
diff --git a/llvm/test/tools/llvm-profgen/iponly.test b/llvm/test/tools/llvm-profgen/iponly.test
new file mode 100644
index 0000000000000..2e81798d7e6fe
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/iponly.test
@@ -0,0 +1,58 @@
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/noprobe-skid.perfscript --binary=%S/Inputs/noprobe.perfbin --output=%t --skip-symbolization --leading-ip-only
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/noprobe-skid.perfscript --binary=%S/Inputs/noprobe.perfbin --output=%t --leading-ip-only
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK
+
+; Here we check the ability to ignore LBRs, which is useful for generating
+; profiles where only the precise PMU sample IP is of interest. In general the
+; IPs need not identify a branch. In this case there are exactly 4 samples, so
+; we see only these 4 locations as "hot" and none of the LBR history.
+; Compare with noinline-noprobe.test, which includes LBR history.
+
+; Note that there are two different IPs (5c5 and 5c8) contributing to line
+; offset 1 in bar. This tests that sample counts corresponding to the same
+; debug location are summed into that location in the profile rather than the
+; maximum being taken, as happens with basic block execution count profiles.
+
+;CHECK: bar:14:0
+;CHECK:  0: 0
+;CHECK:  1: 2
+;CHECK:  2: 1
+;CHECK:  4: 0
+;CHECK:  5: 0
+;CHECK: foo:5:0
+;CHECK:  0: 0
+;CHECK:  1: 0
+;CHECK:  2: 0
+;CHECK:  3: 1
+;CHECK:  4: 0
+;CHECK:  5: 0
+
+CHECK-RAW-PROFILE:      4
+CHECK-RAW-PROFILE-NEXT: 5c5-5c5:1
+CHECK-RAW-PROFILE-NEXT: 5c8-5c8:1
+CHECK-RAW-PROFILE-NEXT: 5d7-5d7:1
+CHECK-RAW-PROFILE-NEXT: 62f-62f:1
+
+; original code:
+; clang -O3 -g -fdebug-info-for-profiling test.c -fno-inline -o a.out
+#include <stdio.h>
+
+int bar(int x, int y) {
+  if (x % 3) {
+    return x - y;
+  }
+  return x + y;
+}
+
+void foo() {
+  int s, i = 0;
+  while (i++ < 4000 * 4000)
+    if (i % 91) s = bar(i, s); else s += 30;
+  printf("sum is %d\n", s);
+}
+
+int main() {
+  foo();
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-profgen/period-scaling.test b/llvm/test/tools/llvm-profgen/period-scaling.test
new file mode 100644
index 0000000000000..a44c9a78caeaf
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/period-scaling.test
@@ -0,0 +1,84 @@
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
+
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
+
+// Check that we can use perf event filtering to generate multiple types of
+// source-level profiles from a single perf profile. In this case, we generate
+// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
+// and a branch mispredict profile using br_misp_retired.all_branches sample
+// IPs.
+
+// Check that we can use --sample-period to compute LBR and IP-based profiles
+// which have comparable and absolute magnitudes. For example, in this case the
+// branch of interest (at source line offset 4) is in a loop body which is
+// executed ~20M times in total, and it's mispredicted about 9M times, yielding
+// a mispredict rate of roughly 0.45.
+
+// The source example below is based on perfKernelCpp/cmov_3, except a
+// misleading builtin is used to persuade the compiler not to use cmov, which
+// induces branch mispredicts.
+
+// CHECK: sel_arr:652547082:0
+// CHECK:  3.1: 20225766
+// CHECK:  3.2: 20225766
+// CHECK:  4: 19838670
+// CHECK:  5: 20225766
+
+// UNPRED: sel_arr:18000054:0
+// UNPRED:  3.1: 0
+// UNPRED:  3.2: 0
+// UNPRED:  4: 9000027
+// UNPRED:  5: 0
+
+// CHECK-RAW-PROFILE:      3
+// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:9774174
+// CHECK-RAW-PROFILE-NEXT: 2f0-310:10064496
+// CHECK-RAW-PROFILE-NEXT: 2ff-310:10161270
+
+// UNPRED-RAW-PROFILE:      1
+// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9000027
+
+// original code:
+// icx -fprofile-sample-generate lit.c
+#include <stdlib.h>
+
+#define N 20000
+#define ITERS 10000
+
+static int *m_s1, *m_s2, *m_s3, *m_dst;
+
+void init(void) {
+    m_s1 = malloc(sizeof(int)*N);
+    m_s2 = malloc(sizeof(int)*N);
+    m_s3 = malloc(sizeof(int)*N);
+    m_dst = malloc(sizeof(int)*N);
+    srand(42);
+
+    for (int i = 0; i < N; i++) {
+        m_s1[i] = rand() % N;
+        m_s2[i] = 0;
+        m_s3[i] = 1;
+    }
+}
+
+void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) {
+#pragma nounroll
+#pragma clang loop vectorize(disable) interleave(disable)
+    for (int i = 0; i < N; i++) {
+        int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i];
+        dst[i] = *p;
+    }
+}
+
+int main(void) {
+  init();
+  for(int i=0; i<ITERS; ++i)
+    sel_arr(m_dst, m_s1, m_s2, m_s3);
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
index 95c23007b1a05..81f2c9c55b54d 100644
--- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
+++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
@@ -34,21 +34,21 @@
 #
 # CHECK: << Total TLI yes SDK no:  8
 # CHECK: >> Total TLI no  SDK yes: 0
-# CHECK: == Total TLI yes SDK yes: 239
+# CHECK: == Total TLI yes SDK yes: 248
 #
 # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*)
 # WRONG_DETAIL: >> TLI no  SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int)
 # WRONG_DETAIL-COUNT-8: << TLI yes SDK no : {{.*}}__hot_cold_t
 # WRONG_SUMMARY: << Total TLI yes SDK no:  9{{$}}
 # WRONG_SUMMARY: >> Total TLI no  SDK yes: 1{{$}}
-# WRONG_SUMMARY: == Total TLI yes SDK yes: 238
+# WRONG_SUMMARY: == Total TLI yes SDK yes: 247
 #
 ## The -COUNT suffix doesn't care if there are too many matches, so check
 ## the exact count first; the two directives should add up to that.
 ## Yes, this means additions to TLI will fail this test, but the argument
 ## to -COUNT can't be an expression.
-# AVAIL: TLI knows 480 symbols, 247 available
-# AVAIL-COUNT-247: {{^}} available
+# AVAIL: TLI knows 489 symbols, 256 available
+# AVAIL-COUNT-256: {{^}} available
 # AVAIL-NOT:       {{^}} available
 # UNAVAIL-COUNT-233: not available
 # UNAVAIL-NOT:       not available
@@ -267,6 +267,18 @@ DynamicSymbols:
     Type:            STT_FUNC
     Section:         .text
     Binding:         STB_GLOBAL
+  - Name:            abort
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            exit
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            _Exit
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
   - Name:            atof
     Type:            STT_FUNC
     Section:         .text
@@ -691,6 +703,18 @@ DynamicSymbols:
     Type:            STT_FUNC
     Section:         .text
     Binding:         STB_GLOBAL
+  - Name:            nan
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            nanf
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            nanl
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
   - Name:            nearbyint
     Type:            STT_FUNC
     Section:         .text
@@ -763,6 +787,18 @@ DynamicSymbols:
     Type:            STT_FUNC
     Section:         .text
     Binding:         STB_GLOBAL
+  - Name:            remquo
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            remquof
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+  - Name:            remquol
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
   - Name:            rewind
     Type:            STT_FUNC
     Section:         .text
diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt
index 4ee364b605551..7cf6990774959 100644
--- a/llvm/tools/CMakeLists.txt
+++ b/llvm/tools/CMakeLists.txt
@@ -32,6 +32,7 @@ add_llvm_tool_subdirectory(lto)
 add_llvm_tool_subdirectory(gold)
 add_llvm_tool_subdirectory(llvm-ar)
 add_llvm_tool_subdirectory(llvm-config)
+add_llvm_tool_subdirectory(llvm-ctxprof-util)
 add_llvm_tool_subdirectory(llvm-lto)
 add_llvm_tool_subdirectory(llvm-profdata)
 
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 31c089e6344d0..c8088da49a278 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -45,10 +45,6 @@
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
-namespace llvm {
-extern cl::opt<bool> PrintPipelinePasses;
-} // namespace llvm
-
 using namespace llvm;
 
 static cl::opt<std::string>
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index c5ae051c0a301..1e78637bf47ca 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -153,8 +153,6 @@ struct TypeCloner {
         return LLVMMetadataTypeInContext(Ctx);
       case LLVMX86_AMXTypeKind:
         return LLVMX86AMXTypeInContext(Ctx);
-      case LLVMX86_MMXTypeKind:
-        return LLVMX86MMXTypeInContext(Ctx);
       case LLVMTokenTypeKind:
         return LLVMTokenTypeInContext(Ctx);
       case LLVMTargetExtTypeKind: {
diff --git a/llvm/tools/llvm-ctxprof-util/CMakeLists.txt b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
new file mode 100644
index 0000000000000..4814c9941da5a
--- /dev/null
+++ b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  Object
+  ProfileData
+  Support
+  )
+
+add_llvm_tool(llvm-ctxprof-util
+llvm-ctxprof-util.cpp
+
+  DEPENDS
+  intrinsics_gen
+  )
diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
new file mode 100644
index 0000000000000..3bb7681e33a87
--- /dev/null
+++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
@@ -0,0 +1,150 @@
+//===--- PGOCtxProfJSONReader.h - JSON format  ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// JSON format for the contextual profile for testing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/ProfileData/CtxInstrContextNode.h"
+#include "llvm/ProfileData/PGOCtxProfWriter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::SubCommand FromJSON("fromJSON", "Convert from json");
+
+static cl::opt<std::string> InputFilename(
+    "input", cl::value_desc("input"), cl::init("-"),
+    cl::desc(
+        "Input file. The format is an array of contexts.\n"
+        "Each context is a dictionary with the following keys:\n"
+        "'Guid', mandatory. The value is a 64-bit integer.\n"
+        "'Counters', mandatory. An array of 32-bit ints. These are the "
+        "counter values.\n"
+        "'Contexts', optional. An array containing arrays of contexts. The "
+        "context array at a position 'i' is the set of callees at that "
+        "callsite index. Use an empty array to indicate no callees."),
+    cl::sub(FromJSON));
+
+static cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                           cl::init("-"),
+                                           cl::desc("Output file"),
+                                           cl::sub(FromJSON));
+
+namespace {
+// A structural representation of the JSON input.
+struct DeserializableCtx {
+  GlobalValue::GUID Guid = 0;
+  std::vector<uint64_t> Counters;
+  std::vector<std::vector<DeserializableCtx>> Callsites;
+};
+
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const std::vector<DeserializableCtx> &DCList);
+
+// Convert a DeserializableCtx into a ContextNode, potentially linking it to
+// its sibling (e.g. callee at same callsite) "Next".
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const DeserializableCtx &DC,
+           ctx_profile::ContextNode *Next = nullptr) {
+  auto AllocSize = ctx_profile::ContextNode::getAllocSize(DC.Counters.size(),
+                                                          DC.Callsites.size());
+  auto *Mem = Nodes.emplace_back(std::make_unique<char[]>(AllocSize)).get();
+  std::memset(Mem, 0, AllocSize);
+  auto *Ret = new (Mem) ctx_profile::ContextNode(DC.Guid, DC.Counters.size(),
+                                                 DC.Callsites.size(), Next);
+  std::memcpy(Ret->counters(), DC.Counters.data(),
+              sizeof(uint64_t) * DC.Counters.size());
+  for (const auto &[I, DCList] : llvm::enumerate(DC.Callsites))
+    Ret->subContexts()[I] = createNode(Nodes, DCList);
+  return Ret;
+}
+
+// Convert a list of DeserializableCtx into a linked list of ContextNodes.
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const std::vector<DeserializableCtx> &DCList) {
+  ctx_profile::ContextNode *List = nullptr;
+  for (const auto &DC : DCList)
+    List = createNode(Nodes, DC, List);
+  return List;
+}
+} // namespace
+
+namespace llvm {
+namespace json {
+// Hook into the JSON deserialization.
+bool fromJSON(const Value &E, DeserializableCtx &R, Path P) {
+  json::ObjectMapper Mapper(E, P);
+  return Mapper && Mapper.map("Guid", R.Guid) &&
+         Mapper.map("Counters", R.Counters) &&
+         Mapper.mapOptional("Callsites", R.Callsites);
+}
+} // namespace json
+} // namespace llvm
+
+// Save the bitstream profile from the JSON representation.
+Error convertFromJSON() {
+  auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (!BufOrError)
+    return createFileError(InputFilename, BufOrError.getError());
+  auto P = json::parse(BufOrError.get()->getBuffer());
+  if (!P)
+    return P.takeError();
+
+  std::vector<DeserializableCtx> DCList;
+  json::Path::Root R("");
+  if (!fromJSON(*P, DCList, R))
+    return R.getError();
+  // Nodes provides memory backing for the ContextualNodes.
+  std::vector<std::unique_ptr<char[]>> Nodes;
+  std::error_code EC;
+  raw_fd_stream Out(OutputFilename, EC);
+  if (EC)
+    return createStringError(EC, "failed to open output");
+  PGOCtxProfileWriter Writer(Out);
+  for (const auto &DC : DCList) {
+    auto *TopList = createNode(Nodes, DC);
+    if (!TopList)
+      return createStringError(
+          "Unexpected error converting internal structure to ctx profile");
+    Writer.write(*TopList);
+  }
+  if (EC)
+    return createStringError(EC, "failed to write output");
+  return Error::success();
+}
+
+int main(int argc, const char **argv) {
+  cl::ParseCommandLineOptions(argc, argv, "LLVM Contextual Profile Utils\n");
+  ExitOnError ExitOnErr("llvm-ctxprof-util: ");
+  if (FromJSON) {
+    if (auto E = convertFromJSON()) {
+      handleAllErrors(std::move(E), [&](const ErrorInfoBase &E) {
+        E.log(errs());
+        errs() << "\n";
+      });
+      return 1;
+    }
+    return 0;
+  }
+  cl::PrintHelpMessage();
+  return 1;
+}
diff --git a/llvm/tools/llvm-driver/llvm-driver.cpp b/llvm/tools/llvm-driver/llvm-driver.cpp
index 53a8b9357e378..14ce162faee46 100644
--- a/llvm/tools/llvm-driver/llvm-driver.cpp
+++ b/llvm/tools/llvm-driver/llvm-driver.cpp
@@ -33,7 +33,7 @@ static void printHelpMessage() {
                << subcommands
                << "\n  Type \"llvm <subcommand> --help\" to get more help on a "
                   "specific subcommand\n\n"
-               << "OPTIONS:\n\n  --help - Display this message";
+               << "OPTIONS:\n\n  --help - Display this message\n";
 }
 
 static int findTool(int Argc, char **Argv, const char *Argv0) {
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index 18f4f1a0eb9fb..60a89cb13c57a 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -266,9 +266,8 @@ int llvm_dwp_main(int argc, char **argv, const llvm::ToolContext &) {
 
   std::unique_ptr<MCStreamer> MS(TheTarget->createMCObjectStreamer(
       *ErrOrTriple, MC, std::unique_ptr<MCAsmBackend>(MAB),
-      MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(MCE), *MSTI,
-      MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-      /*DWARFMustBeAtTheEnd*/ false));
+      MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(MCE),
+      *MSTI));
   if (!MS)
     return error("no object streamer for target " + TripleName, Context);
 
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index ed53f8fabb175..adee869967d98 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -466,9 +466,20 @@ class SubProcessFunctionExecutorImpl
 // segfaults in the program. Unregister the rseq region so that we can safely
 // unmap it later
 #ifdef GLIBC_INITS_RSEQ
+    unsigned int RseqStructSize = __rseq_size;
+
+    // Glibc v2.40 (the change is also expected to be backported to v2.35)
+    // changes the definition of __rseq_size to be the usable area of the struct
+    // rather than the actual size of the struct. v2.35 uses only 20 bytes of
+    // the 32 byte struct. For now, it should be safe to assume that if the
+    // usable size is less than 32, the actual size of the struct will be 32
+    // bytes given alignment requirements.
+    if (__rseq_size < 32)
+      RseqStructSize = 32;
+
     long RseqDisableOutput =
         syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
-                __rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+                RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
     if (RseqDisableOutput != 0)
       exit(ChildProcessExitCodeE::RSeqDisableFailed);
 #endif // GLIBC_INITS_RSEQ
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
index 431d99c72b808..08da04cc62d9c 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
@@ -260,8 +260,7 @@ Expected<std::vector<BenchmarkCode>> readSnippets(const LLVMState &State,
           *TM.getMCAsmInfo(), *TM.getMCInstrInfo(), *TM.getMCRegisterInfo()));
   // The following call will take care of calling Streamer.setTargetStreamer.
   TM.getTarget().createAsmTargetStreamer(Streamer, InstPrinterOStream,
-                                         InstPrinter.get(),
-                                         TM.Options.MCOptions.AsmVerbose);
+                                         InstPrinter.get());
   if (!Streamer.getTargetStreamer())
     return make_error<Failure>("cannot create target asm streamer");
 
diff --git a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
index 6ec19a367d58b..615a39ee9bb4d 100644
--- a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
+++ b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
@@ -131,10 +131,6 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
 
 
 int AssembleOneInput(const uint8_t *Data, size_t Size) {
-  const bool ShowInst = false;
-  const bool AsmVerbose = false;
-  const bool UseDwarfDirectory = true;
-
   Triple TheTriple(Triple::normalize(TripleName));
 
   SourceMgr SrcMgr;
@@ -204,9 +200,8 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) {
   std::unique_ptr<MCStreamer> Str;
 
   if (FileType == OFT_AssemblyFile) {
-    Str.reset(TheTarget->createAsmStreamer(Ctx, std::move(FOut), AsmVerbose,
-                                           UseDwarfDirectory, IP, std::move(CE),
-                                           std::move(MAB), ShowInst));
+    Str.reset(TheTarget->createAsmStreamer(Ctx, std::move(FOut), IP,
+                                           std::move(CE), std::move(MAB)));
   } else {
     assert(FileType == OFT_ObjectFile && "Invalid file type!");
 
@@ -233,9 +228,8 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) {
     MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
     Str.reset(TheTarget->createMCObjectStreamer(
         TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB),
-        MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(CE), *STI,
-        MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ false));
+        MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(CE),
+        *STI));
   }
   const int Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI,
       *MCII, MCOptions);
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index de999a48d5753..3a44ddf17974a 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -356,6 +356,9 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n");
   MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
   MCOptions.CompressDebugSections = CompressDebugSections.getValue();
+  MCOptions.ShowMCInst = ShowInst;
+  MCOptions.AsmVerbose = true;
+  MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
 
   setDwarfDebugFlags(argc, argv);
   setDwarfDebugProducer();
@@ -534,10 +537,8 @@ int main(int argc, char **argv) {
     std::unique_ptr<MCAsmBackend> MAB(
         TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
     auto FOut = std::make_unique<formatted_raw_ostream>(*OS);
-    Str.reset(
-        TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true,
-                                     /*useDwarfDirectory*/ true, IP,
-                                     std::move(CE), std::move(MAB), ShowInst));
+    Str.reset(TheTarget->createAsmStreamer(Ctx, std::move(FOut), IP,
+                                           std::move(CE), std::move(MAB)));
 
   } else if (FileType == OFT_Null) {
     Str.reset(TheTarget->createNullStreamer(Ctx));
@@ -555,9 +556,7 @@ int main(int argc, char **argv) {
         TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB),
         DwoOut ? MAB->createDwoObjectWriter(*OS, DwoOut->os())
                : MAB->createObjectWriter(*OS),
-        std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll,
-        MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ false));
+        std::unique_ptr<MCCodeEmitter>(CE), *STI));
     if (NoExecStack)
       Str->initSections(true, *STI);
   }
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
index 863766cd777d0..d9b8c6edb0ec9 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -41,8 +41,7 @@ Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions(
   // doesn't show up in the llvm-mca output.
   raw_ostream &OSRef = nulls();
   formatted_raw_ostream FOSRef(OSRef);
-  TheTarget.createAsmTargetStreamer(*Str, FOSRef, IP.get(),
-                                    /*IsVerboseAsm=*/true);
+  TheTarget.createAsmTargetStreamer(*Str, FOSRef, IP.get());
 
   // Create a MCAsmParser and setup the lexer to recognize llvm-mca ASM
   // comments.
diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
index 24643bd4296be..db69109e2d1fa 100644
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -264,6 +264,8 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
   MCOptions.AssemblyLanguage = "masm";
   MCOptions.MCFatalWarnings = InputArgs.hasArg(OPT_fatal_warnings);
   MCOptions.MCSaveTempLabels = InputArgs.hasArg(OPT_save_temp_labels);
+  MCOptions.ShowMCInst = InputArgs.hasArg(OPT_show_inst);
+  MCOptions.AsmVerbose = true;
 
   Triple TheTriple = GetTriple(ProgName, InputArgs);
   std::string Error;
@@ -385,10 +387,8 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
     std::unique_ptr<MCAsmBackend> MAB(
         TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
     auto FOut = std::make_unique<formatted_raw_ostream>(*OS);
-    Str.reset(TheTarget->createAsmStreamer(
-        Ctx, std::move(FOut), /*asmverbose*/ true,
-        /*useDwarfDirectory*/ true, IP, std::move(CE), std::move(MAB),
-        InputArgs.hasArg(OPT_show_inst)));
+    Str.reset(TheTarget->createAsmStreamer(Ctx, std::move(FOut), IP,
+                                           std::move(CE), std::move(MAB)));
 
   } else if (FileType == "null") {
     Str.reset(TheTarget->createNullStreamer(Ctx));
@@ -402,9 +402,8 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
     MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
     Str.reset(TheTarget->createMCObjectStreamer(
         TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB),
-        MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(CE), *STI,
-        MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-        /*DWARFMustBeAtTheEnd*/ false));
+        MAB->createObjectWriter(*OS), std::unique_ptr<MCCodeEmitter>(CE),
+        *STI));
   } else {
     llvm_unreachable("Invalid file type!");
   }
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index a54feb7644bda..d82ecc8b3d36e 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -584,6 +584,68 @@ static Expected<int64_t> parseChangeSectionLMA(StringRef ArgValue,
   return *LMAValue;
 }
 
+static Expected<SectionPatternAddressUpdate>
+parseChangeSectionAddr(StringRef ArgValue, StringRef OptionName,
+                       MatchStyle SectionMatchStyle,
+                       function_ref<Error(Error)> ErrorCallback) {
+  SectionPatternAddressUpdate PatternUpdate;
+
+  size_t LastSymbolIndex = ArgValue.find_last_of("+-=");
+  if (LastSymbolIndex == StringRef::npos)
+    return createStringError(errc::invalid_argument,
+                             "bad format for " + OptionName +
+                                 ": argument value " + ArgValue +
+                                 " is invalid. See --help");
+  char UpdateSymbol = ArgValue[LastSymbolIndex];
+
+  StringRef SectionPattern = ArgValue.slice(0, LastSymbolIndex);
+  if (SectionPattern.empty())
+    return createStringError(
+        errc::invalid_argument,
+        "bad format for " + OptionName +
+            ": missing section pattern to apply address change to");
+  if (Error E = PatternUpdate.SectionPattern.addMatcher(NameOrPattern::create(
+          SectionPattern, SectionMatchStyle, ErrorCallback)))
+    return std::move(E);
+
+  StringRef Value = ArgValue.slice(LastSymbolIndex + 1, StringRef::npos);
+  if (Value.empty()) {
+    switch (UpdateSymbol) {
+    case '+':
+    case '-':
+      return createStringError(errc::invalid_argument,
+                               "bad format for " + OptionName +
+                                   ": missing value of offset after '" +
+                                   std::string({UpdateSymbol}) + "'");
+
+    case '=':
+      return createStringError(errc::invalid_argument,
+                               "bad format for " + OptionName +
+                                   ": missing address value after '='");
+    }
+  }
+  auto AddrValue = getAsInteger<uint64_t>(Value);
+  if (!AddrValue)
+    return createStringError(AddrValue.getError(),
+                             "bad format for " + OptionName + ": value after " +
+                                 std::string({UpdateSymbol}) + " is " + Value +
+                                 " when it should be a 64-bit integer");
+
+  switch (UpdateSymbol) {
+  case '+':
+    PatternUpdate.Update.Kind = AdjustKind::Add;
+    break;
+  case '-':
+    PatternUpdate.Update.Kind = AdjustKind::Subtract;
+    break;
+  case '=':
+    PatternUpdate.Update.Kind = AdjustKind::Set;
+  }
+
+  PatternUpdate.Update.Value = *AddrValue;
+  return PatternUpdate;
+}
+
 // parseObjcopyOptions returns the config and sets the input arguments. If a
 // help flag is set then parseObjcopyOptions will print the help messege and
 // exit.
@@ -874,6 +936,15 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
     Config.ChangeSectionLMAValAll = *LMAValue;
   }
 
+  for (auto *Arg : InputArgs.filtered(OBJCOPY_change_section_address)) {
+    Expected<SectionPatternAddressUpdate> AddressUpdate =
+        parseChangeSectionAddr(Arg->getValue(), Arg->getSpelling(),
+                               SectionMatchStyle, ErrorCallback);
+    if (!AddressUpdate)
+      return AddressUpdate.takeError();
+    Config.ChangeSectionAddress.push_back(*AddressUpdate);
+  }
+
   for (auto *Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
     if (!StringRef(Arg->getValue()).contains('='))
       return createStringError(errc::invalid_argument,
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index b26c497ca3997..434b5ff92324e 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -267,6 +267,13 @@ defm change_section_lma
     : Eq<"change-section-lma", "Shift LMA of non-zero-sized sections in the program header by <val>">,
       MetaVarName<"*{+|-}val">;
 
+defm change_section_address
+    : Eq<"change-section-address", "Set the address of the <section> to, or adjust it by, <val>">,
+      MetaVarName<"sectionpattern{=|+|-}val">;
+def adjust_section_vma : JoinedOrSeparate<["--"], "adjust-section-vma">,
+                   Alias<change_section_address>,
+                   HelpText<"Alias for --change-section-address">;
+
 defm add_symbol
     : Eq<"add-symbol", "Add new symbol <name> to .symtab. Accepted flags: "
          "global, local, weak, default, hidden, protected, file, section, object, "
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index d1240025625ca..768a976cd9202 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -1484,9 +1484,11 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA,
                           const MCSubtargetInfo *STI, uint64_t SectionAddr,
                           uint64_t Start, uint64_t End,
                           std::unordered_map<uint64_t, std::string> &Labels) {
-  // So far only supports PowerPC and X86.
+  // Supported by certain targets.
   const bool isPPC = STI->getTargetTriple().isPPC();
-  if (!isPPC && !STI->getTargetTriple().isX86())
+  const bool isX86 = STI->getTargetTriple().isX86();
+  const bool isBPF = STI->getTargetTriple().isBPF();
+  if (!isPPC && !isX86 && !isBPF)
     return;
 
   if (MIA)
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 111c546f5329f..4041271cc0a82 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -41,6 +41,21 @@ static cl::opt<bool>
                                 "and produce context-insensitive profile."));
 cl::opt<bool> ShowDetailedWarning("show-detailed-warning",
                                   cl::desc("Show detailed warning message."));
+cl::opt<bool>
+    LeadingIPOnly("leading-ip-only",
+                  cl::desc("Form a profile based only on sample IPs"));
+
+static cl::list<std::string> PerfEventFilter(
+    "perf-event",
+    cl::desc("Ignore samples not matching the given event names"));
+static cl::alias
+    PerfEventFilterPlural("perf-events", cl::CommaSeparated,
+                          cl::desc("Comma-delimited version of -perf-event"),
+                          cl::aliasopt(PerfEventFilter));
+
+static cl::opt<uint64_t>
+    SamplePeriod("sample-period", cl::init(1),
+                 cl::desc("The sampling period (-c) used for perf data"));
 
 extern cl::opt<std::string> PerfTraceFilename;
 extern cl::opt<bool> ShowDisassemblyOnly;
@@ -404,13 +419,18 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID,
     }
   }
 
+  // If filtering by events was requested, additionally request the "event"
+  // field.
+  const std::string FieldList =
+      PerfEventFilter.empty() ? "ip,brstack" : "event,ip,brstack";
+
   // Run perf script again to retrieve events for PIDs collected above
   SmallVector<StringRef, 8> ScriptSampleArgs;
   ScriptSampleArgs.push_back(PerfPath);
   ScriptSampleArgs.push_back("script");
   ScriptSampleArgs.push_back("--show-mmap-events");
   ScriptSampleArgs.push_back("-F");
-  ScriptSampleArgs.push_back("ip,brstack");
+  ScriptSampleArgs.push_back(FieldList);
   ScriptSampleArgs.push_back("-i");
   ScriptSampleArgs.push_back(PerfData);
   if (!PIDs.empty()) {
@@ -575,14 +595,54 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
 
   // Skip the leading instruction pointer.
   size_t Index = 0;
+
+  StringRef EventName;
+  // Skip a perf event name. This may or may not exist.
+  if (Records.size() > Index && Records[Index].ends_with(":")) {
+    EventName = Records[Index].ltrim().rtrim(':');
+    Index++;
+
+    if (PerfEventFilter.empty()) {
+      WithColor::warning() << "No --perf-event filter was specified, but an "
+                              "\"event\" field was found in line "
+                           << TraceIt.getLineNumber() << ": "
+                           << TraceIt.getCurrentLine() << "\n";
+    } else if (std::find(PerfEventFilter.begin(), PerfEventFilter.end(),
+                         EventName) == PerfEventFilter.end()) {
+      TraceIt.advance();
+      return false;
+    }
+
+  } else if (!PerfEventFilter.empty()) {
+    WithColor::warning() << "A --perf-event filter was specified, but no "
+                            "\"event\" field found in line "
+                         << TraceIt.getLineNumber() << ": "
+                         << TraceIt.getCurrentLine() << "\n";
+  }
+
   uint64_t LeadingAddr;
-  if (!Records.empty() && !Records[0].contains('/')) {
-    if (Records[0].getAsInteger(16, LeadingAddr)) {
+  if (Records.size() > Index && !Records[Index].contains('/')) {
+    if (Records[Index].getAsInteger(16, LeadingAddr)) {
       WarnInvalidLBR(TraceIt);
       TraceIt.advance();
       return false;
     }
-    Index = 1;
+    Index++;
+  }
+
+  // We assume that if we saw an event name we also saw a leading addr.
+  // In other words, LeadingAddr is set if Index is 1 or 2.
+  if (LeadingIPOnly && Index > 0) {
+    // Form a profile only from the sample IP. Do not assume an LBR stack
+    // follows, and ignore it if it does.
+    uint64_t SampleIP = Binary->canonicalizeVirtualAddress(LeadingAddr);
+    bool SampleIPIsInternal = Binary->addressIsCode(SampleIP);
+    if (SampleIPIsInternal) {
+      // Form a half LBR entry where the sample IP is the destination.
+      LBRStack.emplace_back(LBREntry(SampleIP, SampleIP));
+    }
+    TraceIt.advance();
+    return !LBRStack.empty();
   }
 
   // Now extract LBR samples - note that we do not reverse the
@@ -902,6 +962,20 @@ void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample,
                                              uint64_t Repeat) {
   SampleCounter &Counter = SampleCounters.begin()->second;
   uint64_t EndAddress = 0;
+
+  if (LeadingIPOnly) {
+    assert(Sample->LBRStack.size() == 1 &&
+           "Expected only half LBR entries for ip-only mode");
+    const LBREntry &LBR = *(Sample->LBRStack.begin());
+    uint64_t SourceAddress = LBR.Source;
+    uint64_t TargetAddress = LBR.Target;
+    if (SourceAddress == TargetAddress &&
+        Binary->addressIsCode(TargetAddress)) {
+      Counter.recordRangeCount(SourceAddress, TargetAddress, Repeat);
+    }
+    return;
+  }
+
   for (const LBREntry &LBR : Sample->LBRStack) {
     uint64_t SourceAddress = LBR.Source;
     uint64_t TargetAddress = LBR.Target;
@@ -930,6 +1004,16 @@ void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
   if (extractLBRStack(TraceIt, Sample->LBRStack)) {
     warnIfMissingMMap();
     // Record LBR only samples by aggregation
+    // If a sampling period is given we can adjust the magnitude of sample
+    // counts to estimate the absolute magnitute.
+    if (SamplePeriod.getNumOccurrences()) {
+      Count *= SamplePeriod;
+      // If counts are LBR-based, as opposed to IP-based, then the magnitude is
+      // now amplified by roughly the LBR stack size. By adjusting this down, we
+      // can produce LBR-based and IP-based profiles with comparable magnitudes.
+      if (!LeadingIPOnly && Sample->LBRStack.size() > 1)
+        Count /= (Sample->LBRStack.size() - 1);
+    }
     AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
   }
 }
@@ -1062,6 +1146,18 @@ bool PerfScriptReader::isLBRSample(StringRef Line) {
   Line.trim().split(Records, " ", 2, false);
   if (Records.size() < 2)
     return false;
+  // Check if there is an event name before the leading IP.
+  // If there is, it will be in Records[0]. To skip it, we'll re-split on
+  // Records[1], which should contain the rest of the line.
+  if (Records[0].contains(":")) {
+    // If so, consume the event name and continue processing the rest of the
+    // line.
+    StringRef IPAndLBR = Records[1].ltrim();
+    Records.clear();
+    IPAndLBR.split(Records, " ", 2, false);
+    if (Records.size() < 2)
+      return false;
+  }
   if (Records[1].starts_with("0x") && Records[1].contains('/'))
     return true;
   return false;
@@ -1152,6 +1248,18 @@ void PerfScriptReader::warnInvalidRange() {
     const PerfSample *Sample = Item.first.getPtr();
     uint64_t Count = Item.second;
     uint64_t EndAddress = 0;
+
+    if (LeadingIPOnly) {
+      assert(Sample->LBRStack.size() == 1 &&
+             "Expected only half LBR entries for ip-only mode");
+      const LBREntry &LBR = *(Sample->LBRStack.begin());
+      if (LBR.Source == LBR.Target && LBR.Source != ExternalAddr) {
+        // This is an leading-addr-only profile.
+        Ranges[{LBR.Source, LBR.Source}] += Count;
+      }
+      continue;
+    }
+
     for (const LBREntry &LBR : Sample->LBRStack) {
       uint64_t SourceAddress = LBR.Source;
       uint64_t StartAddress = LBR.Target;
@@ -1199,11 +1307,15 @@ void PerfScriptReader::warnInvalidRange() {
         !Binary->addressIsCode(EndAddress))
       continue;
 
-    if (!Binary->addressIsCode(StartAddress) ||
-        !Binary->addressIsTransfer(EndAddress)) {
-      InstNotBoundary += I.second;
-      WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
-    }
+    // IP samples can indicate activity on individual instructions rather than
+    // basic blocks/edges. In this mode, don't warn if sampled IPs aren't
+    // branches.
+    if (!LeadingIPOnly)
+      if (!Binary->addressIsCode(StartAddress) ||
+          !Binary->addressIsTransfer(EndAddress)) {
+        InstNotBoundary += I.second;
+        WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
+      }
 
     auto *FRange = Binary->findFuncRange(StartAddress);
     if (!FRange) {
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 53a25b279b432..175556c2220e6 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -104,6 +104,8 @@ cl::opt<bool> InferMissingFrames(
         "Infer missing call frames due to compiler tail call elimination."),
     llvm::cl::Optional);
 
+extern cl::opt<bool> LeadingIPOnly;
+
 using namespace llvm;
 using namespace sampleprof;
 
@@ -388,18 +390,25 @@ void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
   // Use the maximum count of samples with same line location
   uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
 
-  // Use duplication factor to compensated for loop unroll/vectorization.
-  // Note that this is only needed when we're taking MAX of the counts at
-  // the location instead of SUM.
-  Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
-
-  ErrorOr<uint64_t> R =
-      FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
-
-  uint64_t PreviousCount = R ? R.get() : 0;
-  if (PreviousCount <= Count) {
+  if (LeadingIPOnly) {
+    // When computing an IP-based profile we take the SUM of counts at the
+    // location instead of applying duplication factors and taking the MAX.
     FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
-                                   Count - PreviousCount);
+                                   Count);
+  } else {
+    // Otherwise, use duplication factor to compensate for loop
+    // unroll/vectorization. Note that this is only needed when we're taking
+    // MAX of the counts at the location instead of SUM.
+    Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
+
+    ErrorOr<uint64_t> R = FunctionProfile.findSamplesAt(
+        LeafLoc.Location.LineOffset, Discriminator);
+
+    uint64_t PreviousCount = R ? R.get() : 0;
+    if (PreviousCount <= Count) {
+      FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
+                                     Count - PreviousCount);
+    }
   }
 }
 
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index 78e6f72d7032d..3d36d80f82315 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -232,7 +232,7 @@ static std::unique_ptr<MachineFunction> cloneMF(MachineFunction *SrcMF,
                                                 MachineModuleInfo &DestMMI) {
   auto DstMF = std::make_unique<MachineFunction>(
       SrcMF->getFunction(), SrcMF->getTarget(), SrcMF->getSubtarget(),
-      SrcMF->getFunctionNumber(), DestMMI);
+      SrcMF->getContext(), SrcMF->getFunctionNumber());
   DenseMap<MachineBasicBlock *, MachineBasicBlock *> Src2DstMBB;
 
   auto *SrcMRI = &SrcMF->getRegInfo();
@@ -432,7 +432,7 @@ void ReducerWorkItem::print(raw_ostream &ROS, void *p) const {
     printMIR(ROS, *M);
     for (Function &F : *M) {
       if (auto *MF = MMI->getMachineFunction(F))
-        printMIR(ROS, *MF);
+        printMIR(ROS, *MMI, *MF);
     }
   } else {
     M->print(ROS, /*AssemblyAnnotationWriter=*/nullptr,
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index 7cc512f0a0b1e..75aa1af46c32a 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -168,8 +168,6 @@ struct Modifier {
         Ty = Type::getX86_FP80Ty(Context);
       else if (Arg == "ppc_fp128")
         Ty = Type::getPPC_FP128Ty(Context);
-      else if (Arg == "x86_mmx")
-        Ty = Type::getX86_MMXTy(Context);
       else if (Arg.starts_with("i")) {
         unsigned N = 0;
         Arg.drop_front().getAsInteger(10, N);
@@ -289,11 +287,7 @@ struct Modifier {
   /// Pick a random vector type.
   Type *pickVectorType(VectorType *VTy = nullptr) {
 
-    // Vectors of x86mmx are illegal; keep trying till we get something else.
-    Type *Ty;
-    do {
-      Ty = pickScalarType();
-    } while (Ty->isX86_MMXTy());
+    Type *Ty = pickScalarType();
 
     if (VTy)
       return VectorType::get(Ty, VTy->getElementCount());
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 374698083763b..91ec905e58f12 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -227,10 +227,6 @@ static cl::opt<bool> DisableLoopUnrolling(
     "disable-loop-unrolling",
     cl::desc("Disable loop unrolling in all relevant passes"), cl::init(false));
 
-namespace llvm {
-extern cl::opt<bool> PrintPipelinePasses;
-} // namespace llvm
-
 template <typename PassManagerT>
 bool tryParsePipelineText(PassBuilder &PB,
                           const cl::opt<std::string> &PipelineOpt) {
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index d50bdf4a65dcb..be675bb7fe5a5 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) {
       {&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1},
       {&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
       {&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
+      {&APFloat::Float8E3M4(), false, true, {0, 0}, 1},
+      {&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1},
       {&APFloat::FloatTF32(), false, true, {0, 0}, 1},
       {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
       {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
@@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) {
   EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
 }
 
+TEST(APFloatTest, Float8E3M4ToDouble) {
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+  APFloat PosZeroToDouble(PosZero.convertToDouble());
+  EXPECT_TRUE(PosZeroToDouble.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+  APFloat NegZeroToDouble(NegZero.convertToDouble());
+  EXPECT_TRUE(NegZeroToDouble.isNegZero());
+
+  APFloat One(APFloat::Float8E3M4(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float8E3M4(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(15.5F, PosLargest.convertToDouble());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-15.5F, NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble());
+
+  APFloat PosSmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E3M4(), false);
+  EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+  EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble());
+  APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+  EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+  EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble());
+
+  APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble());
+  APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble());
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+  EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
+}
+
 TEST(APFloatTest, FloatTF32ToDouble) {
   APFloat One(APFloat::FloatTF32(), "1.0");
   EXPECT_EQ(1.0, One.convertToDouble());
@@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) {
   EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
 }
 
+TEST(APFloatTest, Float8E3M4ToFloat) {
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+  APFloat One(APFloat::Float8E3M4(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float8E3M4(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(15.5F, PosLargest.convertToFloat());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-15.5F, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(0x1.p-2, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-0x1.p-2, NegSmallest.convertToFloat());
+
+  APFloat PosSmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E3M4(), false);
+  EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+  EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToFloat());
+  APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+  EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+  EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToFloat());
+
+  APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat());
+  APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat());
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+  EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
+}
+
 TEST(APFloatTest, FloatTF32ToFloat) {
   APFloat PosZero = APFloat::getZero(APFloat::FloatTF32());
   APFloat PosZeroToFloat(PosZero.convertToFloat());
diff --git a/llvm/unittests/ADT/SetOperationsTest.cpp b/llvm/unittests/ADT/SetOperationsTest.cpp
index b3d931cbfd479..f99f5c9b2af10 100644
--- a/llvm/unittests/ADT/SetOperationsTest.cpp
+++ b/llvm/unittests/ADT/SetOperationsTest.cpp
@@ -8,6 +8,8 @@
 
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
@@ -16,56 +18,50 @@
 using namespace llvm;
 
 using testing::IsEmpty;
+using testing::UnorderedElementsAre;
 
 namespace {
 
 TEST(SetOperationsTest, SetUnion) {
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {5, 6, 7, 8};
-  // Set1 should be the union of input sets Set1 and Set2.
-  std::set<int> ExpectedSet1 = {1, 2, 3, 4, 5, 6, 7, 8};
-  // Set2 should not be touched.
-  std::set<int> ExpectedSet2 = Set2;
 
   set_union(Set1, Set2);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 should be the union of input sets Set1 and Set2.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4, 5, 6, 7, 8));
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6, 7, 8));
 
   Set1.clear();
   Set2 = {1, 2};
+
+  set_union(Set1, Set2);
   // Set1 should be the union of input sets Set1 and Set2, which in this case
   // will be Set2.
-  ExpectedSet1 = Set2;
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2));
   // Set2 should not be touched.
-  ExpectedSet2 = Set2;
-
-  set_union(Set1, Set2);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  EXPECT_THAT(Set2, UnorderedElementsAre(1, 2));
 }
 
 TEST(SetOperationsTest, SetIntersect) {
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {3, 4, 5, 6};
-  // Set1 should be the intersection of sets Set1 and Set2.
-  std::set<int> ExpectedSet1 = {3, 4};
-  // Set2 should not be touched.
-  std::set<int> ExpectedSet2 = Set2;
 
   set_intersect(Set1, Set2);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 should be the intersection of sets Set1 and Set2.
+  EXPECT_THAT(Set1, UnorderedElementsAre(3, 4));
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(3, 4, 5, 6));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {5, 6};
-  // Set2 should not be touched.
-  ExpectedSet2 = Set2;
 
   set_intersect(Set1, Set2);
   // Set1 should be the intersection of sets Set1 and Set2, which
   // is empty as they are non-overlapping.
   EXPECT_THAT(Set1, IsEmpty());
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6));
 
   // Check that set_intersect works on SetVector via remove_if.
   SmallSetVector<int, 4> SV;
@@ -82,35 +78,27 @@ TEST(SetOperationsTest, SetIntersection) {
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {3, 4, 5, 6};
   std::set<int> Result;
-  // Result should be the intersection of sets Set1 and Set2.
-  std::set<int> ExpectedResult = {3, 4};
-  // Set1 and Set2 should not be touched.
-  std::set<int> ExpectedSet1 = Set1;
-  std::set<int> ExpectedSet2 = Set2;
 
   Result = set_intersection(Set1, Set2);
-  EXPECT_EQ(ExpectedResult, Result);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Result should be the intersection of sets Set1 and Set2.
+  EXPECT_THAT(Result, UnorderedElementsAre(3, 4));
+  // Set1 and Set2 should not be touched.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(Set2, UnorderedElementsAre(3, 4, 5, 6));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {5, 6};
-  // Set1 and Set2 should not be touched.
-  ExpectedSet1 = Set1;
-  ExpectedSet2 = Set2;
 
   Result = set_intersection(Set1, Set2);
   // Result should be the intersection of sets Set1 and Set2, which
   // is empty as they are non-overlapping.
   EXPECT_THAT(Result, IsEmpty());
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 and Set2 should not be touched.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6));
 
   Set1 = {5, 6};
   Set2 = {1, 2, 3, 4};
-  // Set1 and Set2 should not be touched.
-  ExpectedSet1 = Set1;
-  ExpectedSet2 = Set2;
 
   Result = set_intersection(Set1, Set2);
   // Result should be the intersection of sets Set1 and Set2, which
@@ -118,85 +106,105 @@ TEST(SetOperationsTest, SetIntersection) {
   // reversed, since the code takes a different path depending on which input
   // set is smaller.
   EXPECT_THAT(Result, IsEmpty());
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 and Set2 should not be touched.
+  EXPECT_THAT(Set1, UnorderedElementsAre(5, 6));
+  EXPECT_THAT(Set2, UnorderedElementsAre(1, 2, 3, 4));
 }
 
 TEST(SetOperationsTest, SetDifference) {
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {3, 4, 5, 6};
   std::set<int> Result;
-  // Result should be Set1 - Set2, leaving only {1, 2}.
-  std::set<int> ExpectedResult = {1, 2};
-  // Set1 and Set2 should not be touched.
-  std::set<int> ExpectedSet1 = Set1;
-  std::set<int> ExpectedSet2 = Set2;
 
   Result = set_difference(Set1, Set2);
-  EXPECT_EQ(ExpectedResult, Result);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Result should be Set1 - Set2, leaving only {1, 2}.
+  EXPECT_THAT(Result, UnorderedElementsAre(1, 2));
+  // Set1 and Set2 should not be touched.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(Set2, UnorderedElementsAre(3, 4, 5, 6));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {1, 2, 3, 4};
-  // Set1 and Set2 should not be touched.
-  ExpectedSet1 = Set1;
-  ExpectedSet2 = Set2;
 
   Result = set_difference(Set1, Set2);
   // Result should be Set1 - Set2, which should be empty.
   EXPECT_THAT(Result, IsEmpty());
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 and Set2 should not be touched.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(Set2, UnorderedElementsAre(1, 2, 3, 4));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {5, 6};
+
+  Result = set_difference(Set1, Set2);
   // Result should be Set1 - Set2, which should be Set1 as they are
   // non-overlapping.
-  ExpectedResult = Set1;
+  EXPECT_THAT(Result, UnorderedElementsAre(1, 2, 3, 4));
   // Set1 and Set2 should not be touched.
-  ExpectedSet1 = Set1;
-  ExpectedSet2 = Set2;
-
-  Result = set_difference(Set1, Set2);
-  EXPECT_EQ(ExpectedResult, Result);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6));
 }
 
 TEST(SetOperationsTest, SetSubtract) {
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {3, 4, 5, 6};
-  // Set1 should get Set1 - Set2, leaving only {1, 2}.
-  std::set<int> ExpectedSet1 = {1, 2};
-  // Set2 should not be touched.
-  std::set<int> ExpectedSet2 = Set2;
 
   set_subtract(Set1, Set2);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set1 should get Set1 - Set2, leaving only {1, 2}.
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2));
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(3, 4, 5, 6));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {1, 2, 3, 4};
-  // Set2 should not be touched.
-  ExpectedSet2 = Set2;
 
   set_subtract(Set1, Set2);
   // Set1 should get Set1 - Set2, which should be empty.
   EXPECT_THAT(Set1, IsEmpty());
-  EXPECT_EQ(ExpectedSet2, Set2);
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(1, 2, 3, 4));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {5, 6};
+
+  set_subtract(Set1, Set2);
   // Set1 should get Set1 - Set2, which should be Set1 as they are
   // non-overlapping.
-  ExpectedSet1 = Set1;
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
   // Set2 should not be touched.
-  ExpectedSet2 = Set2;
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6));
+}
+
+TEST(SetOperationsTest, SetSubtractSmallPtrSet) {
+  int A[4];
+
+  // Set1.size() < Set2.size()
+  SmallPtrSet<int *, 4> Set1 = {&A[0], &A[1]};
+  SmallPtrSet<int *, 4> Set2 = {&A[1], &A[2], &A[3]};
+  set_subtract(Set1, Set2);
+  EXPECT_THAT(Set1, UnorderedElementsAre(&A[0]));
+
+  // Set1.size() > Set2.size()
+  Set1 = {&A[0], &A[1], &A[2]};
+  Set2 = {&A[0], &A[2]};
+  set_subtract(Set1, Set2);
+  EXPECT_THAT(Set1, UnorderedElementsAre(&A[1]));
+}
+
+TEST(SetOperationsTest, SetSubtractSmallVector) {
+  int A[4];
+
+  // Set1.size() < Set2.size()
+  SmallPtrSet<int *, 4> Set1 = {&A[0], &A[1]};
+  SmallVector<int *> Set2 = {&A[1], &A[2], &A[3]};
+  set_subtract(Set1, Set2);
+  EXPECT_THAT(Set1, UnorderedElementsAre(&A[0]));
 
+  // Set1.size() > Set2.size()
+  Set1 = {&A[0], &A[1], &A[2]};
+  Set2 = {&A[0], &A[2]};
   set_subtract(Set1, Set2);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
+  EXPECT_THAT(Set1, UnorderedElementsAre(&A[1]));
 }
 
 TEST(SetOperationsTest, SetSubtractRemovedRemaining) {
@@ -204,56 +212,47 @@ TEST(SetOperationsTest, SetSubtractRemovedRemaining) {
 
   std::set<int> Set1 = {1, 2, 3, 4};
   std::set<int> Set2 = {3, 4, 5, 6};
+
+  set_subtract(Set1, Set2, Removed, Remaining);
   // Set1 should get Set1 - Set2, leaving only {1, 2}.
-  std::set<int> ExpectedSet1 = {1, 2};
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2));
   // Set2 should not be touched.
-  std::set<int> ExpectedSet2 = Set2;
+  EXPECT_THAT(Set2, UnorderedElementsAre(3, 4, 5, 6));
   // We should get back that {3, 4} from Set2 were removed from Set1, and {5, 6}
   // were not removed from Set1.
-  std::set<int> ExpectedRemoved = {3, 4};
-  std::set<int> ExpectedRemaining = {5, 6};
-
-  set_subtract(Set1, Set2, Removed, Remaining);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
-  EXPECT_EQ(ExpectedRemoved, Removed);
-  EXPECT_EQ(ExpectedRemaining, Remaining);
+  EXPECT_THAT(Removed, UnorderedElementsAre(3, 4));
+  EXPECT_THAT(Remaining, UnorderedElementsAre(5, 6));
 
   Set1 = {1, 2, 3, 4};
   Set2 = {1, 2, 3, 4};
   Removed.clear();
   Remaining.clear();
-  // Set2 should not be touched.
-  ExpectedSet2 = Set2;
-  // Set should get back that all of Set2 was removed from Set1, and nothing
-  // left in Set2 was not removed from Set1.
-  ExpectedRemoved = Set2;
 
   set_subtract(Set1, Set2, Removed, Remaining);
   // Set1 should get Set1 - Set2, which should be empty.
   EXPECT_THAT(Set1, IsEmpty());
-  EXPECT_EQ(ExpectedSet2, Set2);
-  EXPECT_EQ(ExpectedRemoved, Removed);
+  // Set2 should not be touched.
+  EXPECT_THAT(Set2, UnorderedElementsAre(1, 2, 3, 4));
+  // Set should get back that all of Set2 was removed from Set1, and nothing
+  // left in Set2 was not removed from Set1.
+  EXPECT_THAT(Removed, UnorderedElementsAre(1, 2, 3, 4));
   EXPECT_THAT(Remaining, IsEmpty());
 
   Set1 = {1, 2, 3, 4};
   Set2 = {5, 6};
   Removed.clear();
   Remaining.clear();
+
+  set_subtract(Set1, Set2, Removed, Remaining);
   // Set1 should get Set1 - Set2, which should be Set1 as they are
   // non-overlapping.
-  ExpectedSet1 = {1, 2, 3, 4};
+  EXPECT_THAT(Set1, UnorderedElementsAre(1, 2, 3, 4));
   // Set2 should not be touched.
-  ExpectedSet2 = Set2;
+  EXPECT_THAT(Set2, UnorderedElementsAre(5, 6));
+  EXPECT_THAT(Removed, IsEmpty());
   // Set should get back that none of Set2 was removed from Set1, and all
   // of Set2 was not removed from Set1.
-  ExpectedRemaining = Set2;
-
-  set_subtract(Set1, Set2, Removed, Remaining);
-  EXPECT_EQ(ExpectedSet1, Set1);
-  EXPECT_EQ(ExpectedSet2, Set2);
-  EXPECT_THAT(Removed, IsEmpty());
-  EXPECT_EQ(ExpectedRemaining, Remaining);
+  EXPECT_THAT(Remaining, UnorderedElementsAre(5, 6));
 }
 
 TEST(SetOperationsTest, SetIsSubset) {
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index b1aeaa6e71fd4..3cba630867a83 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -24,6 +24,7 @@ set(ANALYSIS_TEST_SOURCES
   ConstraintSystemTest.cpp
   DDGTest.cpp
   DomTreeUpdaterTest.cpp
+  DXILResourceTest.cpp
   GlobalsModRefTest.cpp
   FunctionPropertiesAnalysisTest.cpp
   InlineCostTest.cpp
diff --git a/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
similarity index 80%
rename from llvm/unittests/Transforms/Utils/DXILResourceTest.cpp
rename to llvm/unittests/Analysis/DXILResourceTest.cpp
index 1e494cbb7da92..7bbb417097882 100644
--- a/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/DXILResource.h"
+#include "llvm/Analysis/DXILResource.h"
 #include "llvm/IR/Constants.h"
 #include "gtest/gtest.h"
 
@@ -113,9 +113,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // ByteAddressBuffer Buffer0;
   Value *Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.ByteAddressBuffer"));
-  ResourceInfo Resource =
-      ResourceInfo::RawBuffer(Symbol, "Buffer0", ResourceBinding{0, 0, 1},
-                              /*UniqueID=*/0);
+  ResourceInfo Resource = ResourceInfo::RawBuffer(Symbol, "Buffer0");
+  Resource.bind(0, 0, 0, 1);
   std::pair<uint32_t, uint32_t> Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000bU);
   EXPECT_EQ(Props.second, 0U);
@@ -125,9 +124,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // RWByteAddressBuffer BufferOut : register(u3, space2);
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.RWByteAddressBuffer"));
-  Resource = ResourceInfo::RWRawBuffer(
-      Symbol, "BufferOut", ResourceBinding{2, 3, 1}, /*UniqueID=*/1,
-      /*GloballyCoherent=*/false, /*IsROV=*/false);
+  Resource =
+      ResourceInfo::RWRawBuffer(Symbol, "BufferOut",
+                                /*GloballyCoherent=*/false, /*IsROV=*/false);
+  Resource.bind(1, 2, 3, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000100bU);
   EXPECT_EQ(Props.second, 0U);
@@ -141,9 +141,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
       StructType::create(Context, {Int32Ty, FloatTy, DoubleTy}, "BufType0");
   Symbol = UndefValue::get(StructType::create(
       Context, {BufType0}, "class.StructuredBuffer<BufType>"));
-  Resource = ResourceInfo::StructuredBuffer(
-      Symbol, "Buffer0", ResourceBinding{0, 0, 1}, /*UniqueID=*/0,
-      /*Stride=*/16, Align(8));
+  Resource = ResourceInfo::StructuredBuffer(Symbol, "Buffer0",
+                                            /*Stride=*/16, Align(8));
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000030cU);
   EXPECT_EQ(Props.second, 0x00000010U);
@@ -151,13 +151,26 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   EXPECT_MDEQ(
       MD, TestMD.get(0, Symbol, "Buffer0", 0, 0, 1, 12, 0, TestMD.get(1, 16)));
 
+  // StructuredBuffer<float3> Buffer1 : register(t1);
+  Symbol = UndefValue::get(StructType::create(
+      Context, {Floatx3Ty}, "class.StructuredBuffer<vector<float, 3> >"));
+  Resource = ResourceInfo::StructuredBuffer(Symbol, "Buffer1",
+                                            /*Stride=*/12, {});
+  Resource.bind(1, 0, 1, 1);
+  Props = Resource.getAnnotateProps();
+  EXPECT_EQ(Props.first, 0x0000000cU);
+  EXPECT_EQ(Props.second, 0x0000000cU);
+  MD = Resource.getAsMetadata(Context);
+  EXPECT_MDEQ(
+      MD, TestMD.get(1, Symbol, "Buffer1", 0, 1, 1, 12, 0, TestMD.get(1, 12)));
+
   // Texture2D<float4> ColorMapTexture : register(t2);
   Symbol = UndefValue::get(StructType::create(
       Context, {Floatx4Ty}, "class.Texture2D<vector<float, 4> >"));
   Resource =
-      ResourceInfo::SRV(Symbol, "ColorMapTexture", ResourceBinding{0, 2, 1},
-                        /*UniqueID=*/2, dxil::ElementType::F32,
+      ResourceInfo::SRV(Symbol, "ColorMapTexture", dxil::ElementType::F32,
                         /*ElementCount=*/4, dxil::ResourceKind::Texture2D);
+  Resource.bind(2, 0, 2, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00000002U);
   EXPECT_EQ(Props.second, 0x00000409U);
@@ -169,9 +182,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(
       StructType::create(Context, {FloatTy}, "class.Texture2DMS<float, 8>"));
   Resource =
-      ResourceInfo::Texture2DMS(Symbol, "DepthBuffer", ResourceBinding{0, 0, 1},
-                                /*UniqueID=*/0, dxil::ElementType::F32,
+      ResourceInfo::Texture2DMS(Symbol, "DepthBuffer", dxil::ElementType::F32,
                                 /*ElementCount=*/1, /*SampleCount=*/8);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00000003U);
   EXPECT_EQ(Props.second, 0x00080109U);
@@ -182,9 +195,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // FeedbackTexture2D<SAMPLER_FEEDBACK_MIN_MIP> feedbackMinMip;
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "class.FeedbackTexture2D<0>"));
-  Resource = ResourceInfo::FeedbackTexture2D(
-      Symbol, "feedbackMinMip", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, SamplerFeedbackType::MinMip);
+  Resource = ResourceInfo::FeedbackTexture2D(Symbol, "feedbackMinMip",
+                                             SamplerFeedbackType::MinMip);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001011U);
   EXPECT_EQ(Props.second, 0U);
@@ -196,8 +209,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32Ty}, "class.FeedbackTexture2DArray<1>"));
   Resource = ResourceInfo::FeedbackTexture2DArray(
-      Symbol, "feedbackMipRegion", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, SamplerFeedbackType::MipRegionUsed);
+      Symbol, "feedbackMipRegion", SamplerFeedbackType::MipRegionUsed);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001012U);
   EXPECT_EQ(Props.second, 0x00000001U);
@@ -208,11 +221,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // globallycoherent RWTexture2D<int2> OutputTexture : register(u0, space2);
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32x2Ty}, "class.RWTexture2D<vector<int, 2> >"));
-  Resource =
-      ResourceInfo::UAV(Symbol, "OutputTexture", ResourceBinding{2, 0, 1},
-                        /*UniqueID=*/0, dxil::ElementType::I32,
-                        /*ElementCount=*/2, /*GloballyCoherent=*/1, /*IsROV=*/0,
-                        dxil::ResourceKind::Texture2D);
+  Resource = ResourceInfo::UAV(Symbol, "OutputTexture", dxil::ElementType::I32,
+                               /*ElementCount=*/2, /*GloballyCoherent=*/1,
+                               /*IsROV=*/0, dxil::ResourceKind::Texture2D);
+  Resource.bind(0, 2, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00005002U);
   EXPECT_EQ(Props.second, 0x00000204U);
@@ -224,10 +236,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(
       StructType::create(Context, {Floatx4Ty},
                          "class.RasterizerOrderedBuffer<vector<float, 4> >"));
-  Resource = ResourceInfo::UAV(Symbol, "ROB", ResourceBinding{0, 0, 1},
-                               /*UniqueID=*/0, dxil::ElementType::F32,
+  Resource = ResourceInfo::UAV(Symbol, "ROB", dxil::ElementType::F32,
                                /*ElementCount=*/4, /*GloballyCoherent=*/0,
                                /*IsROV=*/1, dxil::ResourceKind::TypedBuffer);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000300aU);
   EXPECT_EQ(Props.second, 0x00000409U);
@@ -240,10 +252,11 @@ TEST(DXILResource, AnnotationsAndMetadata) {
       Context, {Floatx3Ty, FloatTy, Int32Ty}, "ParticleMotion");
   Symbol = UndefValue::get(StructType::create(
       Context, {BufType1}, "class.StructuredBuffer<ParticleMotion>"));
-  Resource = ResourceInfo::RWStructuredBuffer(
-      Symbol, "g_OutputBuffer", ResourceBinding{0, 2, 1},
-      /*UniqueID=*/0, /*Stride=*/20, Align(4), /*GloballyCoherent=*/false,
-      /*IsROV=*/false, /*HasCounter=*/true);
+  Resource =
+      ResourceInfo::RWStructuredBuffer(Symbol, "g_OutputBuffer", /*Stride=*/20,
+                                       Align(4), /*GloballyCoherent=*/false,
+                                       /*IsROV=*/false, /*HasCounter=*/true);
+  Resource.bind(0, 0, 2, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000920cU);
   EXPECT_EQ(Props.second, 0x00000014U);
@@ -255,9 +268,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32Ty}, "class.RWTexture2DMSArray<unsigned int, 8>"));
   Resource = ResourceInfo::RWTexture2DMSArray(
-      Symbol, "g_rw_t2dmsa", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, dxil::ElementType::U32, /*ElementCount=*/1,
+      Symbol, "g_rw_t2dmsa", dxil::ElementType::U32, /*ElementCount=*/1,
       /*SampleCount=*/8, /*GloballyCoherent=*/false);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001008U);
   EXPECT_EQ(Props.second, 0x00080105U);
@@ -268,8 +281,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // cbuffer cb0 { float4 g_X; float4 g_Y; }
   Symbol = UndefValue::get(
       StructType::create(Context, {Floatx4Ty, Floatx4Ty}, "cb0"));
-  Resource = ResourceInfo::CBuffer(Symbol, "cb0", ResourceBinding{0, 0, 1},
-                                   /*UniqueID=*/0, /*Size=*/32);
+  Resource = ResourceInfo::CBuffer(Symbol, "cb0", /*Size=*/32);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000dU);
   EXPECT_EQ(Props.second, 0x00000020U);
@@ -279,9 +292,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // SamplerState ColorMapSampler : register(s0);
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.SamplerState"));
-  Resource =
-      ResourceInfo::Sampler(Symbol, "ColorMapSampler", ResourceBinding{0, 0, 1},
-                            /*UniqueID=*/0, dxil::SamplerType::Default);
+  Resource = ResourceInfo::Sampler(Symbol, "ColorMapSampler",
+                                   dxil::SamplerType::Default);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000eU);
   EXPECT_EQ(Props.second, 0U);
@@ -290,9 +303,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
               TestMD.get(0, Symbol, "ColorMapSampler", 0, 0, 1, 0, nullptr));
 
   // SamplerComparisonState ShadowSampler {...};
-  Resource =
-      ResourceInfo::Sampler(Symbol, "CmpSampler", ResourceBinding{0, 0, 1},
-                            /*UniqueID=*/0, dxil::SamplerType::Comparison);
+  Resource = ResourceInfo::Sampler(Symbol, "CmpSampler",
+                                   dxil::SamplerType::Comparison);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000800eU);
   EXPECT_EQ(Props.second, 0U);
diff --git a/llvm/unittests/Analysis/LoadsTest.cpp b/llvm/unittests/Analysis/LoadsTest.cpp
index 5da3feaf762f3..13377a8082096 100644
--- a/llvm/unittests/Analysis/LoadsTest.cpp
+++ b/llvm/unittests/Analysis/LoadsTest.cpp
@@ -7,8 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -114,3 +119,85 @@ define void @f(i32* %p1, i32* %p2, i64 %i) {
   EXPECT_TRUE(canReplacePointersInUseIfEqual(PtrToIntUse, P2, DL));
   EXPECT_TRUE(canReplacePointersInUseIfEqual(IcmpUse, P2, DL));
 }
+
+TEST(LoadsTest, IsDerefReadOnlyLoop) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+define i64 @f1() {
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+define i64 @f2(ptr %p1) {
+entry:
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+)IR");
+  auto *GV1 = M->getNamedValue("f1");
+  auto *GV2 = M->getNamedValue("f2");
+  ASSERT_TRUE(GV1 && GV2);
+  auto *F1 = dyn_cast<Function>(GV1);
+  auto *F2 = dyn_cast<Function>(GV2);
+  ASSERT_TRUE(F1 && F2);
+
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI(TLII);
+
+  auto IsDerefReadOnlyLoop = [&TLI](Function *F) -> bool {
+    AssumptionCache AC(*F);
+    DominatorTree DT(*F);
+    LoopInfo LI(DT);
+    ScalarEvolution SE(*F, TLI, AC, DT, LI);
+
+    Function::iterator FI = F->begin();
+    // First basic block is entry - skip it.
+    BasicBlock *Header = &*(++FI);
+    assert(Header->getName() == "loop");
+    Loop *L = LI.getLoopFor(Header);
+
+    return isDereferenceableReadOnlyLoop(L, &SE, &DT, &AC);
+  };
+
+  ASSERT_TRUE(IsDerefReadOnlyLoop(F1));
+  ASSERT_FALSE(IsDerefReadOnlyLoop(F2));
+}
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index a6a5ffda3cb70..76e6095636305 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -1625,4 +1625,40 @@ TEST_F(ScalarEvolutionsTest, ForgetValueWithOverflowInst) {
   });
 }
 
+TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering) {
+  // Regression test for a case where caching of equivalent values caused the
+  // comparator to get inconsistent.
+  LLVMContext C;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyString(R"(
+    define i32 @foo(i32 %arg0) {
+      %1 = add i32 %arg0, 1
+      %2 = add i32 %arg0, 1
+      %3 = xor i32 %2, %1
+      %4 = add i32 %3, %2
+      %5 = add i32 %arg0, 1
+      %6 = xor i32 %5, %arg0
+      %7 = add i32 %arg0, %6
+      %8 = add i32 %5, %7
+      %9 = xor i32 %8, %7
+      %10 = add i32 %9, %8
+      %11 = xor i32 %10, %9
+      %12 = add i32 %11, %10
+      %13 = xor i32 %12, %11
+      %14 = add i32 %12, %13
+      %15 = add i32 %14, %4
+      ret i32 %15
+    })",
+                                                  Err, C);
+
+  ASSERT_TRUE(M && "Could not parse module?");
+  ASSERT_TRUE(!verifyModule(*M) && "Must have been well formed!");
+
+  runWithSE(*M, "foo", [](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+    // When _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG, this will
+    // crash if the comparator has the specific caching bug.
+    SE.getSCEV(F.getEntryBlock().getTerminator()->getOperand(0));
+  });
+}
+
 }  // end namespace llvm
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 1fe94e2aae059..d344ebe676799 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -255,6 +255,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
       "declare double @modf(double, double*)\n"
       "declare float @modff(float, float*)\n"
       "declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
+      "declare double @nan(ptr)\n"
+      "declare float @nanf(ptr)\n"
+      "declare x86_fp80 @nanl(ptr)\n"
       "declare double @nearbyint(double)\n"
       "declare float @nearbyintf(float)\n"
       "declare x86_fp80 @nearbyintl(x86_fp80)\n"
@@ -281,6 +284,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
       "declare float @remainderf(float, float)\n"
       "declare x86_fp80 @remainderl(x86_fp80, x86_fp80)\n"
       "declare i32 @remove(i8*)\n"
+      "declare double @remquo(double, double, ptr)\n"
+      "declare float @remquof(float, float, ptr)\n"
+      "declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr)\n"
       "declare i32 @rename(i8*, i8*)\n"
       "declare void @rewind(%struct*)\n"
       "declare double @rint(double)\n"
@@ -497,6 +503,10 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
 
       "declare i32 @atexit(void ()*)\n"
 
+      "declare void @abort()\n"
+      "declare void @exit(i32)\n"
+      "declare void @_Exit(i32)\n"
+
       "declare i32 @__nvvm_reflect(i8*)\n"
 
       "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
@@ -660,4 +670,4 @@ class TLITestAarch64 : public ::testing::Test {
 TEST_F(TLITestAarch64, TestFrem) {
   EXPECT_EQ(getScalarName(Instruction::FRem, Type::getDoubleTy(Ctx)), "fmod");
   EXPECT_EQ(getScalarName(Instruction::FRem, Type::getFloatTy(Ctx)), "fmodf");
-}
\ No newline at end of file
+}
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index cc574b5887efc..79e27c7ced61f 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -61,14 +61,15 @@ class AArch64SelectionDAGTest : public testing::Test {
 
     MachineModuleInfo MMI(TM.get());
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F), 0,
-                                      MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
       report_fatal_error("DAG?");
     OptimizationRemarkEmitter ORE(F);
-    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+              nullptr);
   }
 
   TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 306a97c3149cc..d5365d9c79492 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -55,6 +55,7 @@ class InstrRefLDVTest : public testing::Test {
   DIBasicType *LongInt;
   DIExpression *EmptyExpr;
   LiveDebugValues::OverlapMap Overlaps;
+  LiveDebugValues::DebugVariableMap DVMap;
 
   DebugLoc OutermostLoc, InBlockLoc, NotNestedBlockLoc, InlinedLoc;
 
@@ -93,7 +94,7 @@ class InstrRefLDVTest : public testing::Test {
     const TargetSubtargetInfo &STI = *Machine->getSubtargetImpl(*F);
 
     MF = std::make_unique<MachineFunction>(*F, (LLVMTargetMachine &)*Machine,
-                                           STI, FunctionNum, *MMI);
+                                           STI, MMI->getContext(), FunctionNum);
 
     // Create metadata: CU, subprogram, some blocks and an inline function
     // scope.
@@ -176,7 +177,7 @@ class InstrRefLDVTest : public testing::Test {
 
   void addVTracker() {
     ASSERT_TRUE(LDV);
-    VTracker = std::make_unique<VLocTracker>(Overlaps, EmptyExpr);
+    VTracker = std::make_unique<VLocTracker>(DVMap, Overlaps, EmptyExpr);
     LDV->VTracker = &*VTracker;
   }
 
@@ -215,7 +216,7 @@ class InstrRefLDVTest : public testing::Test {
   }
 
   void buildVLocValueMap(const DILocation *DILoc,
-                    const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+                    const SmallSet<DebugVariableID, 4> &VarsWeCareAbout,
                     SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks,
                     InstrRefBasedLDV::LiveInsT &Output, FuncValueTable &MOutLocs,
                     FuncValueTable &MInLocs,
@@ -2632,10 +2633,11 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) {
   MInLocs[0][0] = MOutLocs[0][0] = LiveInRsp;
 
   DebugVariable Var(FuncVariable, std::nullopt, nullptr);
+  DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc);
   DbgValueProperties EmptyProps(EmptyExpr, false, false);
 
-  SmallSet<DebugVariable, 4> AllVars;
-  AllVars.insert(Var);
+  SmallSet<DebugVariableID, 4> AllVars;
+  AllVars.insert(VarID);
 
   // Mild hack: rather than constructing machine instructions in each block
   // and creating lexical scopes across them, instead just tell
@@ -2645,7 +2647,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) {
   AssignBlocks.insert(MBB0);
 
   SmallVector<VLocTracker, 1> VLocs;
-  VLocs.resize(1, VLocTracker(Overlaps, EmptyExpr));
+  VLocs.resize(1, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr));
 
   InstrRefBasedLDV::LiveInsT Output;
 
@@ -2657,7 +2659,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) {
 
   // If we put an assignment in the transfer function, that should... well,
   // do nothing, because we don't store the live-outs.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output.size(), 0ul);
@@ -2694,10 +2696,11 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   initValueArray(MOutLocs, 4, 2);
 
   DebugVariable Var(FuncVariable, std::nullopt, nullptr);
+  DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc);
   DbgValueProperties EmptyProps(EmptyExpr, false, false);
 
-  SmallSet<DebugVariable, 4> AllVars;
-  AllVars.insert(Var);
+  SmallSet<DebugVariableID, 4> AllVars;
+  AllVars.insert(VarID);
 
   // Mild hack: rather than constructing machine instructions in each block
   // and creating lexical scopes across them, instead just tell
@@ -2710,7 +2713,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   AssignBlocks.insert(MBB3);
 
   SmallVector<VLocTracker, 1> VLocs;
-  VLocs.resize(4, VLocTracker(Overlaps, EmptyExpr));
+  VLocs.resize(4, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr));
 
   InstrRefBasedLDV::LiveInsT Output;
 
@@ -2736,7 +2739,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
 
   // An assignment in the end block should also not affect other blocks; or
   // produce any live-ins.
-  VLocs[3].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[3].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2748,7 +2751,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   // Assignments in either of the side-of-diamond blocks should also not be
   // propagated anywhere.
   VLocs[3].Vars.clear();
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2758,7 +2761,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   VLocs[2].Vars.clear();
   ClearOutputs();
 
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2770,7 +2773,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
 
   // However: putting an assignment in the first block should propagate variable
   // values through to all other blocks, as it dominates.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2790,7 +2793,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   // should still be propagated, as buildVLocValueMap shouldn't care about
   // what's in the registers (except for PHIs).
   // values through to all other blocks, as it dominates.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2808,8 +2811,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
 
   // We should get a live-in to the merging block, if there are two assigns of
   // the same value in either side of the diamond.
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2824,8 +2827,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
 
   // If we assign a value in the entry block, then 'undef' on a branch, we
   // shouldn't have a live-in in the merge block.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(EmptyProps, DbgValue::Undef)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(EmptyProps, DbgValue::Undef)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2843,8 +2846,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   // Having different values joining into the merge block should mean we have
   // no live-in in that block. Block ones LiveInRax value doesn't appear as a
   // live-in anywhere, it's block internal.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2862,8 +2865,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
   // But on the other hand, if there's a location in the register file where
   // those two values can be joined, do so.
   MOutLocs[1][0] = LiveInRax;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2915,14 +2918,15 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   initValueArray(MOutLocs, 3, 2);
 
   DebugVariable Var(FuncVariable, std::nullopt, nullptr);
+  DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc);
   DbgValueProperties EmptyProps(EmptyExpr, false, false);
   DIExpression *TwoOpExpr =
       DIExpression::get(Ctx, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_LLVM_arg,
                               1, dwarf::DW_OP_plus});
   DbgValueProperties VariadicProps(TwoOpExpr, false, true);
 
-  SmallSet<DebugVariable, 4> AllVars;
-  AllVars.insert(Var);
+  SmallSet<DebugVariableID, 4> AllVars;
+  AllVars.insert(VarID);
 
   SmallPtrSet<MachineBasicBlock *, 4> AssignBlocks;
   AssignBlocks.insert(MBB0);
@@ -2930,7 +2934,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   AssignBlocks.insert(MBB2);
 
   SmallVector<VLocTracker, 3> VLocs;
-  VLocs.resize(3, VLocTracker(Overlaps, EmptyExpr));
+  VLocs.resize(3, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr));
 
   InstrRefBasedLDV::LiveInsT Output;
 
@@ -2947,7 +2951,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   Output.resize(3);
 
   // Easy starter: a dominating assign should propagate to all blocks.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2963,7 +2967,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
 
   // A variadic assignment should behave the same.
   DbgOpID Locs0[] = {LiveInRspID, LiveInRaxID};
-  VLocs[0].Vars.insert({Var, DbgValue(Locs0, VariadicProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(Locs0, VariadicProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs,
                     MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2979,8 +2983,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   VLocs[1].Vars.clear();
 
   // Put an undef assignment in the loop. Should get no live-in value.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(EmptyProps, DbgValue::Undef)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(EmptyProps, DbgValue::Undef)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -2991,8 +2995,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   VLocs[1].Vars.clear();
 
   // Assignment of the same value should naturally join.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3008,8 +3012,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
 
   // Assignment of different values shouldn't join with no machine PHI vals.
   // Will be live-in to exit block as it's dominated.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3025,8 +3029,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   // with unrelated assign in loop block again.
   MInLocs[1][0] = RspPHIInBlk1;
   MOutLocs[1][0] = RspDefInBlk1;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3042,8 +3046,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   // find the appropriate PHI.
   MInLocs[1][0] = RspPHIInBlk1;
   MOutLocs[1][0] = RspDefInBlk1;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3063,8 +3067,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   MOutLocs[1][0] = LiveInRsp;
   MInLocs[1][1] = RaxPHIInBlk1;
   MOutLocs[1][1] = RspDefInBlk1;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3085,8 +3089,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   MOutLocs[1][0] = RspDefInBlk1;
   MInLocs[1][1] = RaxPHIInBlk1;
   MOutLocs[1][1] = RspDefInBlk1;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3119,8 +3123,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   MOutLocs[1][0] = RspPHIInBlk1;
   MInLocs[1][1] = LiveInRax;
   MOutLocs[1][1] = LiveInRax;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(RspPHIInBlk1ID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(RspPHIInBlk1ID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3138,8 +3142,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
   // because there's a def in it.
   MInLocs[1][0] = LiveInRsp;
   MOutLocs[1][0] = LiveInRsp;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3193,10 +3197,11 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   initValueArray(MOutLocs, 5, 2);
 
   DebugVariable Var(FuncVariable, std::nullopt, nullptr);
+  DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc);
   DbgValueProperties EmptyProps(EmptyExpr, false, false);
 
-  SmallSet<DebugVariable, 4> AllVars;
-  AllVars.insert(Var);
+  SmallSet<DebugVariableID, 4> AllVars;
+  AllVars.insert(VarID);
 
   SmallPtrSet<MachineBasicBlock *, 5> AssignBlocks;
   AssignBlocks.insert(MBB0);
@@ -3206,7 +3211,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   AssignBlocks.insert(MBB4);
 
   SmallVector<VLocTracker, 5> VLocs;
-  VLocs.resize(5, VLocTracker(Overlaps, EmptyExpr));
+  VLocs.resize(5, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr));
 
   InstrRefBasedLDV::LiveInsT Output;
 
@@ -3223,7 +3228,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   Output.resize(5);
 
   // A dominating assign should propagate to all blocks.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3244,8 +3249,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
 
   // Test that an assign in the inner loop causes unresolved PHIs at the heads
   // of both loops, and no output location. Dominated blocks do get values.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3263,7 +3268,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
 
   // Same test, but with no assignment in block 0. We should still get values
   // in dominated blocks.
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3280,8 +3285,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
 
   // Similarly, assignments in the outer loop gives location to dominated
   // blocks, but no PHI locations are found at the outer loop head.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[3].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[3].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3295,8 +3300,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   VLocs[0].Vars.clear();
   VLocs[3].Vars.clear();
 
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3317,8 +3322,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   // With an assignment of the same value in the inner loop, we should work out
   // that all PHIs can be eliminated and the same value is live-through the
   // whole function.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3348,8 +3353,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   // one. Even though RspPHIInBlk2 isn't available later in the function, we
   // should still produce a live-in value. The fact it's unavailable is a
   // different concern.
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
@@ -3374,8 +3379,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
   MOutLocs[2][0] = RspDefInBlk2;
   MInLocs[3][0] = RspDefInBlk2;
   MOutLocs[3][0] = RspDefInBlk2;
-  VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)});
-  VLocs[2].Vars.insert({Var, DbgValue(RspDefInBlk2ID, EmptyProps)});
+  VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)});
+  VLocs[2].Vars.insert({VarID, DbgValue(RspDefInBlk2ID, EmptyProps)});
   buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output,
                     MOutLocs, MInLocs, VLocs);
   EXPECT_EQ(Output[0].size(), 0ul);
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 1997e80522975..d464a16f636cf 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -135,6 +135,7 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
   MachineModuleInfo MMI(TM);
   const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(*F);
 
-  return std::make_unique<MachineFunction>(*F, *TM, STI, FunctionNum, MMI);
+  return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
+                                           FunctionNum);
 }
 
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 88c4dbc2eec78..c89e5a45ab014 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -72,13 +72,14 @@ class SelectionDAGAddressAnalysisTest : public testing::Test {
     MachineModuleInfo MMI(TM.get());
 
     MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
-                                           0, MMI);
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
       report_fatal_error("DAG?");
     OptimizationRemarkEmitter ORE(F);
-    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+              nullptr);
   }
 
   TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index a3d5e5f94b610..074247e6e7d18 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -71,13 +71,14 @@ class SelectionDAGPatternMatchTest : public testing::Test {
     MachineModuleInfo MMI(TM.get());
 
     MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
-                                           0, MMI);
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
       report_fatal_error("DAG?");
     OptimizationRemarkEmitter ORE(F);
-    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+              nullptr);
   }
 
   TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
@@ -130,6 +131,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
   SDValue ICMP_EQ01 = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETEQ);
   SDValue ICMP_EQ10 = DAG->getSetCC(DL, MVT::i1, Op1, Op0, ISD::SETEQ);
 
+  auto Int1VT = EVT::getIntegerVT(Context, 1);
+  SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Int1VT);
+  SDValue T = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 4, Int1VT);
+  SDValue F = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 5, Int1VT);
+  SDValue Select = DAG->getSelect(DL, MVT::i1, Cond, T, F);
+
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+  SDValue V1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 6, VInt32VT);
+  SDValue V2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 7, VInt32VT);
+  SDValue VSelect = DAG->getNode(ISD::VSELECT, DL, VInt32VT, Cond, V1, V2);
+
   using namespace SDPatternMatch;
   ISD::CondCode CC;
   EXPECT_TRUE(sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(),
@@ -152,6 +164,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
                                             m_SpecificCondCode(ISD::SETEQ))));
   EXPECT_TRUE(sd_match(ICMP_EQ10, m_c_SetCC(m_Specific(Op0), m_Specific(Op1),
                                             m_SpecificCondCode(ISD::SETEQ))));
+
+  EXPECT_TRUE(sd_match(
+      Select, m_Select(m_Specific(Cond), m_Specific(T), m_Specific(F))));
+  EXPECT_FALSE(sd_match(
+      Select, m_Select(m_Specific(Cond), m_Specific(F), m_Specific(T))));
+  EXPECT_FALSE(sd_match(ICMP_EQ01, m_Select(m_Specific(Op0), m_Specific(Op1),
+                                            m_SpecificCondCode(ISD::SETEQ))));
+  EXPECT_TRUE(sd_match(
+      VSelect, m_VSelect(m_Specific(Cond), m_Specific(V1), m_Specific(V2))));
+  EXPECT_FALSE(sd_match(
+      Select, m_VSelect(m_Specific(Cond), m_Specific(V1), m_Specific(V2))));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
@@ -227,6 +250,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue Neg = DAG->getNegative(Op0, DL, Int32VT);
   SDValue Not = DAG->getNOT(DL, Op0, Int32VT);
 
+  SDValue VScale = DAG->getVScale(DL, Int32VT, APInt::getMaxValue(32));
+
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
   EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value())));
@@ -237,6 +262,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value())));
   EXPECT_FALSE(sd_match(Sub, m_Neg(m_Value())));
   EXPECT_FALSE(sd_match(Neg, m_Not(m_Value())));
+  EXPECT_TRUE(sd_match(VScale, m_VScale(m_Value())));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCopyBytesTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCopyBytesTest.cpp
index 43fdf5d3d6f31..ec9c0dddcbc0c 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCopyBytesTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCopyBytesTest.cpp
@@ -112,10 +112,7 @@ DWARFExpressionCopyBytesTest::createStreamer(raw_pwrite_stream &OS) {
   std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(OS);
   Res.Streamer.reset(TheTarget->createMCObjectStreamer(
       Triple(TripleName), *Res.Ctx, std::unique_ptr<MCAsmBackend>(MAB),
-      std::move(OW), std::unique_ptr<MCCodeEmitter>(MCE), *STI,
-      /* RelaxAll */ false,
-      /* IncrementalLinkerCompatible */ false,
-      /* DWARFMustBeAtTheEnd */ false));
+      std::move(OW), std::unique_ptr<MCCodeEmitter>(MCE), *STI));
   return Res;
 }
 
diff --git a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
index ad5e51b7efb83..2cbd4cc55df95 100644
--- a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
@@ -503,8 +503,7 @@ llvm::Error dwarfgen::Generator::init(Triple TheTriple, uint16_t V) {
   MS = TheTarget->createMCObjectStreamer(
       TheTriple, *MC, std::unique_ptr<MCAsmBackend>(MAB),
       MAB->createObjectWriter(*Stream), std::unique_ptr<MCCodeEmitter>(MCE),
-      *MSTI, MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
-      /*DWARFMustBeAtTheEnd*/ false);
+      *MSTI);
   if (!MS)
     return make_error<StringError>("no object streamer for target " +
                                        TripleName,
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index af431658c9b7d..cc868e7587dc6 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -28,6 +28,7 @@ add_llvm_unittest(OrcJITTests
   LookupAndRecordAddrsTest.cpp
   MachOPlatformTest.cpp
   MapperJITLinkMemoryManagerTest.cpp
+  MemoryFlagsTest.cpp
   MemoryMapperTest.cpp
   ObjectFormatsTest.cpp
   ObjectLinkingLayerTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp
new file mode 100644
index 0000000000000..270ea2f442b9e
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp
@@ -0,0 +1,55 @@
+//===------- MemoryFlagsTest.cpp - Test MemoryFlags and related APIs ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h"
+#include "gtest/gtest.h"
+
+#include <future>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+TEST(MemProtTest, Basics) {
+  MemProt MPNone = MemProt::None;
+
+  EXPECT_EQ(MPNone & MemProt::Read, MemProt::None);
+  EXPECT_EQ(MPNone & MemProt::Write, MemProt::None);
+  EXPECT_EQ(MPNone & MemProt::Exec, MemProt::None);
+
+  EXPECT_EQ(MPNone | MemProt::Read, MemProt::Read);
+  EXPECT_EQ(MPNone | MemProt::Write, MemProt::Write);
+  EXPECT_EQ(MPNone | MemProt::Exec, MemProt::Exec);
+
+  MemProt MPAll = MemProt::Read | MemProt::Write | MemProt::Exec;
+  EXPECT_EQ(MPAll & MemProt::Read, MemProt::Read);
+  EXPECT_EQ(MPAll & MemProt::Write, MemProt::Write);
+  EXPECT_EQ(MPAll & MemProt::Exec, MemProt::Exec);
+}
+
+TEST(AllocGroupSmallMap, EmptyMap) {
+  AllocGroupSmallMap<bool> EM;
+  EXPECT_TRUE(EM.empty());
+  EXPECT_EQ(EM.size(), 0u);
+}
+
+TEST(AllocGroupSmallMap, NonEmptyMap) {
+  AllocGroupSmallMap<unsigned> NEM;
+  NEM[MemProt::Read] = 42;
+
+  EXPECT_FALSE(NEM.empty());
+  EXPECT_EQ(NEM.size(), 1U);
+  EXPECT_EQ(NEM[MemProt::Read], 42U);
+  EXPECT_EQ(NEM.find(MemProt::Read), NEM.begin());
+  EXPECT_EQ(NEM.find(MemProt::Read | MemProt::Write), NEM.end());
+
+  NEM[MemProt::Read | MemProt::Write] = 7;
+  EXPECT_EQ(NEM.size(), 2U);
+  EXPECT_EQ(NEM.begin()->second, 42U);
+  EXPECT_EQ((NEM.begin() + 1)->second, 7U);
+}
+
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
index c70341b5a86d2..f9541131e4b23 100644
--- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -1100,4 +1100,21 @@ TEST_F(OpenMPDecompositionTest, Nowait1) {
   ASSERT_EQ(Dir1, "parallel");      // (23)
   ASSERT_EQ(Dir2, "for");           // (23)
 }
+
+// ---
+
+// Check that "simd linear(x)" does not fail despite the implied "firstprivate"
+// (which "simd" does not allow).
+TEST_F(OpenMPDecompositionTest, Misc1) {
+  omp::Object x{"x"};
+  omp::List<omp::Clause> Clauses{
+      {OMPC_linear,
+       omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_simd, Clauses);
+  ASSERT_EQ(Dec.output.size(), 1u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  ASSERT_EQ(Dir0, "simd linear(, , , (x)) lastprivate(, (x))");
+}
 } // namespace
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 8653bbd3d38fd..cb4c289f409a1 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6902,8 +6902,8 @@ TEST_F(OpenMPIRBuilderTest, EmitOffloadingArraysArguments) {
   Info.RTArgs.MappersArray =
       ConstantPointerNull::get(Array4VoidPtrTy->getPointerTo());
   Info.NumberOfPtrs = 4;
-
-  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info, false, false);
+  Info.EmitDebug = false;
+  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info, false);
 
   EXPECT_NE(RTArgs.BasePointersArray, nullptr);
   EXPECT_NE(RTArgs.PointersArray, nullptr);
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index 0181e2ce6ac92..1705f3e6af977 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -1070,6 +1070,24 @@ TEST_F(ConstantRangeTest, MultiplyWithNoWrap) {
                 .multiplyWithNoWrap(ConstantRange(APInt(8, 100), APInt(8, 121)),
                                     OBO::NoSignedWrap),
             ConstantRange::getEmpty(8));
+  EXPECT_TRUE(ConstantRange::getFull(8)
+                  .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 128)),
+                                      OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+                  .isAllNonNegative());
+  EXPECT_TRUE(ConstantRange(APInt(8, 2), APInt(8, 128))
+                  .multiplyWithNoWrap(ConstantRange::getFull(8),
+                                      OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+                  .isAllNonNegative());
+  EXPECT_FALSE(
+      ConstantRange::getFull(8)
+          .multiplyWithNoWrap(ConstantRange(APInt(8, 1), APInt(8, 128)),
+                              OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+          .isAllNonNegative());
+  EXPECT_FALSE(
+      ConstantRange::getFull(8)
+          .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 128)),
+                              OBO::NoSignedWrap)
+          .isAllNonNegative());
 
   TestBinaryOpExhaustive(
       [](const ConstantRange &CR1, const ConstantRange &CR2) {
@@ -1484,6 +1502,48 @@ TEST_F(ConstantRangeTest, Shl) {
       });
 }
 
+TEST_F(ConstantRangeTest, ShlWithNoWrap) {
+  using OBO = OverflowingBinaryOperator;
+  TestBinaryOpExhaustive(
+      [](const ConstantRange &CR1, const ConstantRange &CR2) {
+        return CR1.shlWithNoWrap(CR2, OBO::NoUnsignedWrap);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        bool IsOverflow;
+        APInt Res = N1.ushl_ov(N2, IsOverflow);
+        if (IsOverflow)
+          return std::nullopt;
+        return Res;
+      },
+      PreferSmallest, CheckCorrectnessOnly);
+  TestBinaryOpExhaustive(
+      [](const ConstantRange &CR1, const ConstantRange &CR2) {
+        return CR1.shlWithNoWrap(CR2, OBO::NoSignedWrap);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        bool IsOverflow;
+        APInt Res = N1.sshl_ov(N2, IsOverflow);
+        if (IsOverflow)
+          return std::nullopt;
+        return Res;
+      },
+      PreferSmallest, CheckCorrectnessOnly);
+  TestBinaryOpExhaustive(
+      [](const ConstantRange &CR1, const ConstantRange &CR2) {
+        return CR1.shlWithNoWrap(CR2, OBO::NoUnsignedWrap | OBO::NoSignedWrap);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        bool IsOverflow1, IsOverflow2;
+        APInt Res1 = N1.ushl_ov(N2, IsOverflow1);
+        APInt Res2 = N1.sshl_ov(N2, IsOverflow2);
+        if (IsOverflow1 || IsOverflow2)
+          return std::nullopt;
+        assert(Res1 == Res2 && "Left shift results differ?");
+        return Res1;
+      },
+      PreferSmallest, CheckCorrectnessOnly);
+}
+
 TEST_F(ConstantRangeTest, Lshr) {
   EXPECT_EQ(Full.lshr(Full), Full);
   EXPECT_EQ(Full.lshr(Empty), Empty);
diff --git a/llvm/unittests/IR/DominatorTreeTest.cpp b/llvm/unittests/IR/DominatorTreeTest.cpp
index 44bde74ad350f..555348c65a63d 100644
--- a/llvm/unittests/IR/DominatorTreeTest.cpp
+++ b/llvm/unittests/IR/DominatorTreeTest.cpp
@@ -607,11 +607,10 @@ TEST(DominatorTree, DeletingEdgesIntroducesInfiniteLoop2) {
         SwitchC->removeCase(SwitchC->case_begin());
         DT->deleteEdge(C, C2);
         PDT->deleteEdge(C, C2);
-        C2->removeFromParent();
 
         EXPECT_EQ(DT->getNode(C2), nullptr);
         PDT->eraseNode(C2);
-        delete C2;
+        C2->eraseFromParent();
 
         EXPECT_TRUE(DT->verify());
         EXPECT_TRUE(PDT->verify());
diff --git a/llvm/unittests/IR/FunctionTest.cpp b/llvm/unittests/IR/FunctionTest.cpp
index 9aaff3ea33830..402667931fbc5 100644
--- a/llvm/unittests/IR/FunctionTest.cpp
+++ b/llvm/unittests/IR/FunctionTest.cpp
@@ -487,6 +487,98 @@ TEST(FunctionTest, EraseBBs) {
   EXPECT_EQ(F->size(), 0u);
 }
 
+TEST(FunctionTest, BasicBlockNumbers) {
+  LLVMContext Context;
+  Type *VoidType = Type::getVoidTy(Context);
+  FunctionType *FuncType = FunctionType::get(VoidType, false);
+  std::unique_ptr<Function> Func(
+      Function::Create(FuncType, GlobalValue::ExternalLinkage));
+
+  EXPECT_EQ(Func->getBlockNumberEpoch(), 0u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 0u);
+
+  BasicBlock *BB1 = BasicBlock::Create(Context, "bb1", Func.get());
+  EXPECT_EQ(BB1->getNumber(), 0u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 1u);
+  BasicBlock *BB2 = BasicBlock::Create(Context, "bb2", Func.get());
+  EXPECT_EQ(BB2->getNumber(), 1u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 2u);
+  BasicBlock *BB3 = BasicBlock::Create(Context, "bb3", Func.get());
+  EXPECT_EQ(BB3->getNumber(), 2u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 3u);
+
+  BB2->eraseFromParent();
+  // Erasing doesn't trigger renumbering
+  EXPECT_EQ(BB1->getNumber(), 0u);
+  EXPECT_EQ(BB3->getNumber(), 2u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 3u);
+  // ... and number are assigned monotonically increasing
+  BasicBlock *BB4 = BasicBlock::Create(Context, "bb4", Func.get());
+  EXPECT_EQ(BB4->getNumber(), 3u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+  // ... even if inserted not at the end
+  BasicBlock *BB5 = BasicBlock::Create(Context, "bb5", Func.get(), BB1);
+  EXPECT_EQ(BB5->getNumber(), 4u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 5u);
+
+  // Func is now: bb5, bb1, bb3, bb4
+  // Renumbering assigns numbers in their order in the function
+  EXPECT_EQ(Func->getBlockNumberEpoch(), 0u);
+  Func->renumberBlocks();
+  EXPECT_EQ(Func->getBlockNumberEpoch(), 1u);
+  EXPECT_EQ(BB5->getNumber(), 0u);
+  EXPECT_EQ(BB1->getNumber(), 1u);
+  EXPECT_EQ(BB3->getNumber(), 2u);
+  EXPECT_EQ(BB4->getNumber(), 3u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+
+  // Moving a block inside the function doesn't change numbers
+  BB1->moveBefore(BB5);
+  EXPECT_EQ(BB5->getNumber(), 0u);
+  EXPECT_EQ(BB1->getNumber(), 1u);
+  EXPECT_EQ(BB3->getNumber(), 2u);
+  EXPECT_EQ(BB4->getNumber(), 3u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+
+  // Removing a block and adding it back assigns a new number, because the
+  // block was temporarily without a parent.
+  BB4->removeFromParent();
+  BB4->insertInto(Func.get());
+  EXPECT_EQ(BB5->getNumber(), 0u);
+  EXPECT_EQ(BB1->getNumber(), 1u);
+  EXPECT_EQ(BB3->getNumber(), 2u);
+  EXPECT_EQ(BB4->getNumber(), 4u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 5u);
+
+  std::unique_ptr<Function> Func2(
+      Function::Create(FuncType, GlobalValue::ExternalLinkage));
+  BasicBlock *BB6 = BasicBlock::Create(Context, "bb6", Func2.get());
+  EXPECT_EQ(BB6->getNumber(), 0u);
+  EXPECT_EQ(Func2->getMaxBlockNumber(), 1u);
+  // Moving a block to a different function assigns a new number
+  BB3->removeFromParent();
+  BB3->insertInto(Func2.get(), BB6);
+  EXPECT_EQ(BB3->getParent(), Func2.get());
+  EXPECT_EQ(BB3->getNumber(), 1u);
+  EXPECT_EQ(Func2->getMaxBlockNumber(), 2u);
+
+  EXPECT_EQ(Func2->getBlockNumberEpoch(), 0u);
+  Func2->renumberBlocks();
+  EXPECT_EQ(Func2->getBlockNumberEpoch(), 1u);
+  EXPECT_EQ(BB3->getNumber(), 0u);
+  EXPECT_EQ(BB6->getNumber(), 1u);
+  EXPECT_EQ(Func2->getMaxBlockNumber(), 2u);
+
+  // splice works as expected and assigns new numbers
+  Func->splice(Func->end(), Func2.get());
+  EXPECT_EQ(BB5->getNumber(), 0u);
+  EXPECT_EQ(BB1->getNumber(), 1u);
+  EXPECT_EQ(BB4->getNumber(), 4u);
+  EXPECT_EQ(BB3->getNumber(), 5u);
+  EXPECT_EQ(BB6->getNumber(), 6u);
+  EXPECT_EQ(Func->getMaxBlockNumber(), 7u);
+}
+
 TEST(FunctionTest, UWTable) {
   LLVMContext Ctx;
   std::unique_ptr<Module> M = parseIR(Ctx, R"(
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 220da429215c7..8944ec65615e5 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) {
   Type *Int64Ty = Type::getInt64Ty(C);
   Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8);
   Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8);
-  Type *X86MMXTy = Type::getX86_MMXTy(C);
 
   Type *HalfTy = Type::getHalfTy(C);
   Type *FloatTy = Type::getFloatTy(C);
@@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) {
   EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true));
   EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true));
 
-  EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy));
-  EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty));
-  EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy));
   EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty));
   EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty));
 
@@ -1744,7 +1740,7 @@ TEST(InstructionsTest, AllocaInst) {
         %A = alloca i32, i32 1
         %B = alloca i32, i32 4
         %C = alloca i32, i32 %n
-        %D = alloca <8 x double>
+        %D = alloca double
         %E = alloca <vscale x 8 x double>
         %F = alloca [2 x half]
         %G = alloca [2 x [3 x i128]]
@@ -1770,7 +1766,8 @@ TEST(InstructionsTest, AllocaInst) {
   EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128));
   EXPECT_FALSE(C.getAllocationSizeInBits(DL));
-  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512));
+  EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64));
+  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64));
   EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512));
   EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768));
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index d3cc213ebde5d..b960d475bf60c 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -2239,7 +2239,7 @@ typedef ::testing::Types<std::tuple<Value*, Instruction*>,
     MutableConstTestTypes;
 TYPED_TEST_SUITE(MutableConstTest, MutableConstTestTypes, );
 
-TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_ICmp) {
+TYPED_TEST(MutableConstTest, ICmp) {
   auto &IRB = PatternMatchTest::IRB;
 
   typedef std::tuple_element_t<0, TypeParam> ValueType;
@@ -2321,9 +2321,17 @@ TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_ICmp) {
   EXPECT_FALSE(m_SpecificCmp(ICmpInst::getInversePredicate(Pred),
                              m_Value(MatchL), m_Value(MatchR))
                    .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+
+  EXPECT_TRUE(m_c_SpecificICmp(Pred, m_Specific(L), m_Specific(R))
+                  .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+  EXPECT_TRUE(m_c_SpecificICmp(ICmpInst::getSwappedPredicate(Pred),
+                               m_Specific(R), m_Specific(L))
+                  .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+  EXPECT_FALSE(m_c_SpecificICmp(Pred, m_Specific(R), m_Specific(L))
+                   .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
 }
 
-TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_FCmp) {
+TYPED_TEST(MutableConstTest, FCmp) {
   auto &IRB = PatternMatchTest::IRB;
 
   typedef std::tuple_element_t<0, TypeParam> ValueType;
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index eab2850ca4e1e..cf0a10d1f2e95 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -367,6 +367,59 @@ TEST_F(VPIntrinsicTest, IntrinsicIDRoundTrip) {
   ASSERT_NE(FullTripCounts, 0u);
 }
 
+/// Check that going from intrinsic to VP intrinsic and back results in the same
+/// intrinsic.
+TEST_F(VPIntrinsicTest, IntrinsicToVPRoundTrip) {
+  bool IsFullTrip = false;
+  Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic + 1;
+  for (; IntrinsicID < Intrinsic::num_intrinsics; IntrinsicID++) {
+    Intrinsic::ID VPID = VPIntrinsic::getForIntrinsic(IntrinsicID);
+    // No equivalent VP intrinsic available.
+    if (VPID == Intrinsic::not_intrinsic)
+      continue;
+
+    // Return itself if passed intrinsic ID is VP intrinsic.
+    if (VPIntrinsic::isVPIntrinsic(IntrinsicID)) {
+      ASSERT_EQ(IntrinsicID, VPID);
+      continue;
+    }
+
+    std::optional<Intrinsic::ID> RoundTripIntrinsicID =
+        VPIntrinsic::getFunctionalIntrinsicIDForVP(VPID);
+    // No equivalent non-predicated intrinsic available.
+    if (!RoundTripIntrinsicID)
+      continue;
+
+    ASSERT_EQ(*RoundTripIntrinsicID, IntrinsicID);
+    IsFullTrip = true;
+  }
+  ASSERT_TRUE(IsFullTrip);
+}
+
+/// Check that going from VP intrinsic to equivalent non-predicated intrinsic
+/// and back results in the same intrinsic.
+TEST_F(VPIntrinsicTest, VPToNonPredIntrinsicRoundTrip) {
+  std::unique_ptr<Module> M = createVPDeclarationModule();
+  assert(M);
+
+  bool IsFullTrip = false;
+  for (const auto &VPDecl : *M) {
+    auto VPID = VPDecl.getIntrinsicID();
+    std::optional<Intrinsic::ID> NonPredID =
+        VPIntrinsic::getFunctionalIntrinsicIDForVP(VPID);
+
+    // No equivalent non-predicated intrinsic available
+    if (!NonPredID)
+      continue;
+
+    Intrinsic::ID RoundTripVPID = VPIntrinsic::getForIntrinsic(*NonPredID);
+
+    ASSERT_EQ(RoundTripVPID, VPID);
+    IsFullTrip = true;
+  }
+  ASSERT_TRUE(IsFullTrip);
+}
+
 /// Check that VPIntrinsic::getDeclarationForParams works.
 TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) {
   std::unique_ptr<Module> M = createVPDeclarationModule();
diff --git a/llvm/unittests/MC/DwarfLineTableHeaders.cpp b/llvm/unittests/MC/DwarfLineTableHeaders.cpp
index d8a657ed5048e..1fad1ba6ce638 100644
--- a/llvm/unittests/MC/DwarfLineTableHeaders.cpp
+++ b/llvm/unittests/MC/DwarfLineTableHeaders.cpp
@@ -83,10 +83,7 @@ class DwarfLineTableHeaders : public ::testing::Test {
     std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(OS);
     Res.Streamer.reset(TheTarget->createMCObjectStreamer(
         Triple(TripleName), *Res.Ctx, std::unique_ptr<MCAsmBackend>(MAB),
-        std::move(OW), std::unique_ptr<MCCodeEmitter>(MCE), *STI,
-        /* RelaxAll */ false,
-        /* IncrementalLinkerCompatible */ false,
-        /* DWARFMustBeAtTheEnd */ false));
+        std::move(OW), std::unique_ptr<MCCodeEmitter>(MCE), *STI));
     return Res;
   }
 
diff --git a/llvm/unittests/MIR/MachineMetadata.cpp b/llvm/unittests/MIR/MachineMetadata.cpp
index 364ab187c2858..9b1c3ef1c465a 100644
--- a/llvm/unittests/MIR/MachineMetadata.cpp
+++ b/llvm/unittests/MIR/MachineMetadata.cpp
@@ -253,7 +253,7 @@ body:             |
   auto *NewMMO = MF->getMachineMemOperand(OldMMO, AAInfo);
   MI.setMemRefs(*MF, NewMMO);
 
-  MachineModuleSlotTracker MST(MF);
+  MachineModuleSlotTracker MST(MMI, MF);
   // Print that MI with new machine metadata, which slot numbers should be
   // assigned.
   EXPECT_EQ("%1:gpr32 = LDRWui %0, 0 :: (load (s32) from %ir.p, "
@@ -277,7 +277,7 @@ body:             |
   EXPECT_EQ(Collected, Generated);
 
   // FileCheck the output from MIR printer.
-  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, *MF); });
+  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, MMI, *MF); });
   std::string CheckString = R"(
 CHECK: machineMetadataNodes:
 CHECK-DAG: ![[MMDOMAIN:[0-9]+]] = distinct !{!{{[0-9]+}}, !"domain"}
@@ -403,7 +403,7 @@ body:             |
   auto *NewMMO = MF->getMachineMemOperand(OldMMO, AAInfo);
   MI.setMemRefs(*MF, NewMMO);
 
-  MachineModuleSlotTracker MST(MF);
+  MachineModuleSlotTracker MST(MMI, MF);
   // Print that MI with new machine metadata, which slot numbers should be
   // assigned.
   EXPECT_EQ("%1:gr32 = MOV32rm %0, 1, $noreg, 0, $noreg :: (load (s32) from %ir.p, "
@@ -427,7 +427,7 @@ body:             |
   EXPECT_EQ(Collected, Generated);
 
   // FileCheck the output from MIR printer.
-  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, *MF); });
+  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, MMI, *MF); });
   std::string CheckString = R"(
 CHECK: machineMetadataNodes:
 CHECK-DAG: ![[MMDOMAIN:[0-9]+]] = distinct !{!{{[0-9]+}}, !"domain"}
@@ -501,7 +501,7 @@ body:             |
   auto *NewMMO = MF->getMachineMemOperand(OldMMO, AAInfo);
   MI.setMemRefs(*MF, NewMMO);
 
-  MachineModuleSlotTracker MST(MF);
+  MachineModuleSlotTracker MST(MMI, MF);
   // Print that MI with new machine metadata, which slot numbers should be
   // assigned.
   EXPECT_EQ(
@@ -526,7 +526,7 @@ body:             |
   EXPECT_EQ(Collected, Generated);
 
   // FileCheck the output from MIR printer.
-  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, *MF); });
+  std::string Output = print([&](raw_ostream &OS) { printMIR(OS, MMI, *MF); });
   std::string CheckString = R"(
 CHECK: machineMetadataNodes:
 CHECK-DAG: ![[MMDOMAIN:[0-9]+]] = distinct !{!{{[0-9]+}}, !"domain"}
diff --git a/llvm/unittests/ProfileData/CMakeLists.txt b/llvm/unittests/ProfileData/CMakeLists.txt
index c92642ded8282..0a7f7da085950 100644
--- a/llvm/unittests/ProfileData/CMakeLists.txt
+++ b/llvm/unittests/ProfileData/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  BitReader
   Core
   Coverage
   ProfileData
diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp
index 6c6798ded00b5..476f293780d84 100644
--- a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp
+++ b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Bitstream/BitstreamReader.h"
+#include "llvm/Bitcode/BitcodeAnalyzer.h"
 #include "llvm/ProfileData/CtxInstrContextNode.h"
 #include "llvm/ProfileData/PGOCtxProfReader.h"
 #include "llvm/ProfileData/PGOCtxProfWriter.h"
@@ -106,8 +106,20 @@ TEST_F(PGOCtxProfRWTest, RoundTrip) {
         MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+
+    // Check it's analyzable by the BCAnalyzer
+    BitcodeAnalyzer BA((*MB)->getBuffer());
+    std::string AnalyzerDump;
+    raw_string_ostream OS(AnalyzerDump);
+    BCDumpOptions Opts(OS);
+
+    // As in, expect no error.
+    EXPECT_FALSE(BA.analyze(Opts));
+    EXPECT_TRUE(AnalyzerDump.find("<Metadata BlockID") != std::string::npos);
+    EXPECT_TRUE(AnalyzerDump.find("<Context BlockID") != std::string::npos);
+    EXPECT_TRUE(AnalyzerDump.find("<CalleeIndex codeid") != std::string::npos);
+
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     ASSERT_TRUE(!!Expected);
     auto &Ctxes = *Expected;
@@ -143,8 +155,7 @@ TEST_F(PGOCtxProfRWTest, InvalidCounters) {
     auto MB = MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     EXPECT_FALSE(Expected);
     consumeError(Expected.takeError());
@@ -152,16 +163,14 @@ TEST_F(PGOCtxProfRWTest, InvalidCounters) {
 }
 
 TEST_F(PGOCtxProfRWTest, Empty) {
-  BitstreamCursor Cursor("");
-  PGOCtxProfileReader Reader(Cursor);
+  PGOCtxProfileReader Reader("");
   auto Expected = Reader.loadContexts();
   EXPECT_FALSE(Expected);
   consumeError(Expected.takeError());
 }
 
 TEST_F(PGOCtxProfRWTest, Invalid) {
-  BitstreamCursor Cursor("Surely this is not valid");
-  PGOCtxProfileReader Reader(Cursor);
+  PGOCtxProfileReader Reader("Surely this is not valid");
   auto Expected = Reader.loadContexts();
   EXPECT_FALSE(Expected);
   consumeError(Expected.takeError());
@@ -182,8 +191,8 @@ TEST_F(PGOCtxProfRWTest, ValidButEmpty) {
     auto MB = MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     EXPECT_TRUE(!!Expected);
     EXPECT_TRUE(Expected->empty());
@@ -204,8 +213,8 @@ TEST_F(PGOCtxProfRWTest, WrongVersion) {
     auto MB = MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     EXPECT_FALSE(Expected);
     consumeError(Expected.takeError());
@@ -228,8 +237,7 @@ TEST_F(PGOCtxProfRWTest, DuplicateRoots) {
     auto MB = MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     EXPECT_FALSE(Expected);
     consumeError(Expected.takeError());
@@ -255,8 +263,7 @@ TEST_F(PGOCtxProfRWTest, DuplicateTargets) {
     auto MB = MemoryBuffer::getFile(ProfileFile.path());
     ASSERT_TRUE(!!MB);
     ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
+    PGOCtxProfileReader Reader((*MB)->getBuffer());
     auto Expected = Reader.loadContexts();
     EXPECT_FALSE(Expected);
     consumeError(Expected.takeError());
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ec68ed0afeb2f..bc4e97c48bffe 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -9,6 +9,7 @@
 #include "llvm/SandboxIR/SandboxIR.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Module.h"
@@ -90,7 +91,7 @@ define void @foo(i32 %v1) {
   EXPECT_FALSE(isa<sandboxir::Instruction>(Const0));
   EXPECT_TRUE(isa<sandboxir::Instruction>(OpaqueI));
 
-  EXPECT_FALSE(isa<sandboxir::User>(F));
+  EXPECT_TRUE(isa<sandboxir::User>(F));
   EXPECT_FALSE(isa<sandboxir::User>(Arg0));
   EXPECT_FALSE(isa<sandboxir::User>(BB));
   EXPECT_TRUE(isa<sandboxir::User>(AddI));
@@ -132,7 +133,7 @@ define i32 @foo(i32 %v0, i32 %v1) {
   auto *Arg1 = F.getArg(1);
   auto It = BB.begin();
   auto *I0 = &*It++;
-  auto *Ret = &*It++;
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
   SmallVector<sandboxir::Argument *> Args{Arg0, Arg1};
   unsigned OpIdx = 0;
@@ -180,8 +181,8 @@ define i32 @foo(i32 %v0, i32 %v1) {
   BS << "\n";
   I0->getOperandUse(0).dump(BS);
   EXPECT_EQ(Buff, R"IR(
-Def:  i32 %v0 ; SB1. (Argument)
-User:   %add0 = add i32 %v0, %v1 ; SB4. (Opaque)
+Def:  i32 %v0 ; SB2. (Argument)
+User:   %add0 = add i32 %v0, %v1 ; SB5. (Opaque)
 OperandNo: 0
 )IR");
 #endif // NDEBUG
@@ -245,7 +246,7 @@ define i32 @foo(i32 %arg0, i32 %arg1) {
   auto *I0 = &*It++;
   auto *I1 = &*It++;
   auto *I2 = &*It++;
-  auto *Ret = &*It++;
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
   bool Replaced;
   // Try to replace an operand that doesn't match.
@@ -398,10 +399,10 @@ define void @foo(i32 %arg0, i32 %arg1) {
     EXPECT_EQ(Buff, R"IR(
 void @foo(i32 %arg0, i32 %arg1) {
 bb0:
-  br label %bb1 ; SB3. (Opaque)
+  br label %bb1 ; SB4. (Br)
 
 bb1:
-  ret void ; SB5. (Opaque)
+  ret void ; SB6. (Ret)
 }
 )IR");
   }
@@ -466,7 +467,7 @@ define void @foo(i32 %v1) {
     BB0.dump(BS);
     EXPECT_EQ(Buff, R"IR(
 bb0:
-  br label %bb1 ; SB2. (Opaque)
+  br label %bb1 ; SB3. (Br)
 )IR");
   }
 #endif // NDEBUG
@@ -488,7 +489,7 @@ define void @foo(i8 %v1) {
   auto It = BB->begin();
   auto *I0 = &*It++;
   auto *I1 = &*It++;
-  auto *Ret = &*It++;
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
   // Check getPrevNode().
   EXPECT_EQ(Ret->getPrevNode(), I1);
@@ -508,7 +509,7 @@ define void @foo(i8 %v1) {
   // Check getOpcode().
   EXPECT_EQ(I0->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
   EXPECT_EQ(I1->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
-  EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
+  EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Ret);
 
   // Check moveBefore(I).
   I1->moveBefore(I0);
@@ -560,3 +561,1457 @@ define void @foo(i8 %v1) {
   EXPECT_EQ(I0->getNumUses(), 0u);
   EXPECT_EQ(I0->getNextNode(), Ret);
 }
+
+TEST_F(SandboxIRTest, SelectInst) {
+  parseIR(C, R"IR(
+define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
+  %sel = select i1 %c0, i8 %v0, i8 %v1
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Cond0 = F->getArg(0);
+  auto *V0 = F->getArg(1);
+  auto *V1 = F->getArg(2);
+  auto *Cond1 = F->getArg(3);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Select = cast<sandboxir::SelectInst>(&*It++);
+  auto *Ret = &*It++;
+
+  // Check getCondition().
+  EXPECT_EQ(Select->getCondition(), Cond0);
+  // Check getTrueValue().
+  EXPECT_EQ(Select->getTrueValue(), V0);
+  // Check getFalseValue().
+  EXPECT_EQ(Select->getFalseValue(), V1);
+  // Check setCondition().
+  Select->setCondition(Cond1);
+  EXPECT_EQ(Select->getCondition(), Cond1);
+  // Check setTrueValue().
+  Select->setTrueValue(V1);
+  EXPECT_EQ(Select->getTrueValue(), V1);
+  // Check setFalseValue().
+  Select->setFalseValue(V0);
+  EXPECT_EQ(Select->getFalseValue(), V0);
+
+  {
+    // Check SelectInst::create() InsertBefore.
+    auto *NewSel = cast<sandboxir::SelectInst>(sandboxir::SelectInst::create(
+        Cond0, V0, V1, /*InsertBefore=*/Ret, Ctx));
+    EXPECT_EQ(NewSel->getCondition(), Cond0);
+    EXPECT_EQ(NewSel->getTrueValue(), V0);
+    EXPECT_EQ(NewSel->getFalseValue(), V1);
+    EXPECT_EQ(NewSel->getNextNode(), Ret);
+  }
+  {
+    // Check SelectInst::create() InsertAtEnd.
+    auto *NewSel = cast<sandboxir::SelectInst>(
+        sandboxir::SelectInst::create(Cond0, V0, V1, /*InsertAtEnd=*/BB, Ctx));
+    EXPECT_EQ(NewSel->getCondition(), Cond0);
+    EXPECT_EQ(NewSel->getTrueValue(), V0);
+    EXPECT_EQ(NewSel->getFalseValue(), V1);
+    EXPECT_EQ(NewSel->getPrevNode(), Ret);
+  }
+  {
+    // Check SelectInst::create() Folded.
+    auto *False =
+        sandboxir::Constant::createInt(llvm::Type::getInt1Ty(C), 0, Ctx,
+                                       /*IsSigned=*/false);
+    auto *FortyTwo =
+        sandboxir::Constant::createInt(llvm::Type::getInt1Ty(C), 42, Ctx,
+                                       /*IsSigned=*/false);
+    auto *NewSel =
+        sandboxir::SelectInst::create(False, FortyTwo, FortyTwo, Ret, Ctx);
+    EXPECT_TRUE(isa<sandboxir::Constant>(NewSel));
+    EXPECT_EQ(NewSel, FortyTwo);
+  }
+}
+
+TEST_F(SandboxIRTest, BranchInst) {
+  parseIR(C, R"IR(
+define void @foo(i1 %cond0, i1 %cond2) {
+ bb0:
+   br i1 %cond0, label %bb1, label %bb2
+ bb1:
+   ret void
+ bb2:
+   ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Cond0 = F->getArg(0);
+  auto *Cond1 = F->getArg(1);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb1")));
+  auto *Ret1 = BB1->getTerminator();
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb2")));
+  auto *Ret2 = BB2->getTerminator();
+  auto It = BB0->begin();
+  auto *Br0 = cast<sandboxir::BranchInst>(&*It++);
+  // Check isUnconditional().
+  EXPECT_FALSE(Br0->isUnconditional());
+  // Check isConditional().
+  EXPECT_TRUE(Br0->isConditional());
+  // Check getCondition().
+  EXPECT_EQ(Br0->getCondition(), Cond0);
+  // Check setCondition().
+  Br0->setCondition(Cond1);
+  EXPECT_EQ(Br0->getCondition(), Cond1);
+  // Check getNumSuccessors().
+  EXPECT_EQ(Br0->getNumSuccessors(), 2u);
+  // Check getSuccessor().
+  EXPECT_EQ(Br0->getSuccessor(0), BB1);
+  EXPECT_EQ(Br0->getSuccessor(1), BB2);
+  // Check swapSuccessors().
+  Br0->swapSuccessors();
+  EXPECT_EQ(Br0->getSuccessor(0), BB2);
+  EXPECT_EQ(Br0->getSuccessor(1), BB1);
+  // Check successors().
+  EXPECT_EQ(range_size(Br0->successors()), 2u);
+  unsigned SuccIdx = 0;
+  SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
+  for (sandboxir::BasicBlock *Succ : Br0->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  {
+    // Check unconditional BranchInst::create() InsertBefore.
+    auto *Br = sandboxir::BranchInst::create(BB1, /*InsertBefore=*/Ret1, Ctx);
+    EXPECT_FALSE(Br->isConditional());
+    EXPECT_TRUE(Br->isUnconditional());
+#ifndef NDEBUG
+    EXPECT_DEATH(Br->getCondition(), ".*condition.*");
+#endif // NDEBUG
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getNextNode(), Ret1);
+  }
+  {
+    // Check unconditional BranchInst::create() InsertAtEnd.
+    auto *Br = sandboxir::BranchInst::create(BB1, /*InsertAtEnd=*/BB1, Ctx);
+    EXPECT_FALSE(Br->isConditional());
+    EXPECT_TRUE(Br->isUnconditional());
+#ifndef NDEBUG
+    EXPECT_DEATH(Br->getCondition(), ".*condition.*");
+#endif // NDEBUG
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getPrevNode(), Ret1);
+  }
+  {
+    // Check conditional BranchInst::create() InsertBefore.
+    auto *Br = sandboxir::BranchInst::create(BB1, BB2, Cond0,
+                                             /*InsertBefore=*/Ret1, Ctx);
+    EXPECT_TRUE(Br->isConditional());
+    EXPECT_EQ(Br->getCondition(), Cond0);
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getNextNode(), Ret1);
+  }
+  {
+    // Check conditional BranchInst::create() InsertAtEnd.
+    auto *Br = sandboxir::BranchInst::create(BB1, BB2, Cond0,
+                                             /*InsertAtEnd=*/BB2, Ctx);
+    EXPECT_TRUE(Br->isConditional());
+    EXPECT_EQ(Br->getCondition(), Cond0);
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getPrevNode(), Ret2);
+  }
+}
+
+TEST_F(SandboxIRTest, LoadInst) {
+  parseIR(C, R"IR(
+define void @foo(ptr %arg0, ptr %arg1) {
+  %ld = load i8, ptr %arg0, align 64
+  %vld = load volatile i8, ptr %arg0, align 64
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Arg0 = F->getArg(0);
+  auto *Arg1 = F->getArg(1);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Ld = cast<sandboxir::LoadInst>(&*It++);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(Ld));
+  auto *VLd = cast<sandboxir::LoadInst>(&*It++);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  // Check isVolatile()
+  EXPECT_FALSE(Ld->isVolatile());
+  // Check isVolatile()
+  EXPECT_TRUE(VLd->isVolatile());
+  // Check getPointerOperand()
+  EXPECT_EQ(Ld->getPointerOperand(), Arg0);
+  // Check getAlign()
+  EXPECT_EQ(Ld->getAlign(), 64);
+  // Check create(InsertBefore)
+  sandboxir::LoadInst *NewLd =
+      sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8),
+                                  /*InsertBefore=*/Ret, Ctx, "NewLd");
+  EXPECT_FALSE(NewLd->isVolatile());
+  EXPECT_EQ(NewLd->getType(), Ld->getType());
+  EXPECT_EQ(NewLd->getPointerOperand(), Arg1);
+  EXPECT_EQ(NewLd->getAlign(), 8);
+  EXPECT_EQ(NewLd->getName(), "NewLd");
+  // Check create(InsertBefore, IsVolatile=true)
+  sandboxir::LoadInst *NewVLd =
+      sandboxir::LoadInst::create(VLd->getType(), Arg1, Align(8),
+                                  /*InsertBefore=*/Ret,
+                                  /*IsVolatile=*/true, Ctx, "NewVLd");
+
+  EXPECT_TRUE(NewVLd->isVolatile());
+  EXPECT_EQ(NewVLd->getName(), "NewVLd");
+  // Check create(InsertAtEnd)
+  sandboxir::LoadInst *NewLdEnd =
+      sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8),
+                                  /*InsertAtEnd=*/BB, Ctx, "NewLdEnd");
+  EXPECT_FALSE(NewLdEnd->isVolatile());
+  EXPECT_EQ(NewLdEnd->getName(), "NewLdEnd");
+  EXPECT_EQ(NewLdEnd->getType(), Ld->getType());
+  EXPECT_EQ(NewLdEnd->getPointerOperand(), Arg1);
+  EXPECT_EQ(NewLdEnd->getAlign(), 8);
+  EXPECT_EQ(NewLdEnd->getParent(), BB);
+  EXPECT_EQ(NewLdEnd->getNextNode(), nullptr);
+  // Check create(InsertAtEnd, IsVolatile=true)
+  sandboxir::LoadInst *NewVLdEnd =
+      sandboxir::LoadInst::create(VLd->getType(), Arg1, Align(8),
+                                  /*InsertAtEnd=*/BB,
+                                  /*IsVolatile=*/true, Ctx, "NewVLdEnd");
+  EXPECT_TRUE(NewVLdEnd->isVolatile());
+  EXPECT_EQ(NewVLdEnd->getName(), "NewVLdEnd");
+  EXPECT_EQ(NewVLdEnd->getType(), VLd->getType());
+  EXPECT_EQ(NewVLdEnd->getPointerOperand(), Arg1);
+  EXPECT_EQ(NewVLdEnd->getAlign(), 8);
+  EXPECT_EQ(NewVLdEnd->getParent(), BB);
+  EXPECT_EQ(NewVLdEnd->getNextNode(), nullptr);
+}
+
+TEST_F(SandboxIRTest, StoreInst) {
+  parseIR(C, R"IR(
+define void @foo(i8 %val, ptr %ptr) {
+  store i8 %val, ptr %ptr, align 64
+  store volatile i8 %val, ptr %ptr, align 64
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Val = F->getArg(0);
+  auto *Ptr = F->getArg(1);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *St = cast<sandboxir::StoreInst>(&*It++);
+  auto *VSt = cast<sandboxir::StoreInst>(&*It++);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  // Check that the StoreInst has been created correctly.
+  EXPECT_FALSE(St->isVolatile());
+  EXPECT_TRUE(VSt->isVolatile());
+  // Check getPointerOperand()
+  EXPECT_EQ(St->getValueOperand(), Val);
+  EXPECT_EQ(St->getPointerOperand(), Ptr);
+  // Check getAlign()
+  EXPECT_EQ(St->getAlign(), 64);
+  // Check create(InsertBefore)
+  sandboxir::StoreInst *NewSt =
+      sandboxir::StoreInst::create(Val, Ptr, Align(8),
+                                   /*InsertBefore=*/Ret, Ctx);
+  EXPECT_FALSE(NewSt->isVolatile());
+  EXPECT_EQ(NewSt->getType(), St->getType());
+  EXPECT_EQ(NewSt->getValueOperand(), Val);
+  EXPECT_EQ(NewSt->getPointerOperand(), Ptr);
+  EXPECT_EQ(NewSt->getAlign(), 8);
+  EXPECT_EQ(NewSt->getNextNode(), Ret);
+  // Check create(InsertBefore, IsVolatile=true)
+  sandboxir::StoreInst *NewVSt =
+      sandboxir::StoreInst::create(Val, Ptr, Align(8),
+                                   /*InsertBefore=*/Ret,
+                                   /*IsVolatile=*/true, Ctx);
+  EXPECT_TRUE(NewVSt->isVolatile());
+  EXPECT_EQ(NewVSt->getType(), VSt->getType());
+  EXPECT_EQ(NewVSt->getValueOperand(), Val);
+  EXPECT_EQ(NewVSt->getPointerOperand(), Ptr);
+  EXPECT_EQ(NewVSt->getAlign(), 8);
+  EXPECT_EQ(NewVSt->getNextNode(), Ret);
+  // Check create(InsertAtEnd)
+  sandboxir::StoreInst *NewStEnd =
+      sandboxir::StoreInst::create(Val, Ptr, Align(8),
+                                   /*InsertAtEnd=*/BB, Ctx);
+  EXPECT_FALSE(NewStEnd->isVolatile());
+  EXPECT_EQ(NewStEnd->getType(), St->getType());
+  EXPECT_EQ(NewStEnd->getValueOperand(), Val);
+  EXPECT_EQ(NewStEnd->getPointerOperand(), Ptr);
+  EXPECT_EQ(NewStEnd->getAlign(), 8);
+  EXPECT_EQ(NewStEnd->getParent(), BB);
+  EXPECT_EQ(NewStEnd->getNextNode(), nullptr);
+  // Check create(InsertAtEnd, IsVolatile=true)
+  sandboxir::StoreInst *NewVStEnd =
+      sandboxir::StoreInst::create(Val, Ptr, Align(8),
+                                   /*InsertAtEnd=*/BB,
+                                   /*IsVolatile=*/true, Ctx);
+  EXPECT_TRUE(NewVStEnd->isVolatile());
+  EXPECT_EQ(NewVStEnd->getType(), VSt->getType());
+  EXPECT_EQ(NewVStEnd->getValueOperand(), Val);
+  EXPECT_EQ(NewVStEnd->getPointerOperand(), Ptr);
+  EXPECT_EQ(NewVStEnd->getAlign(), 8);
+  EXPECT_EQ(NewVStEnd->getParent(), BB);
+  EXPECT_EQ(NewVStEnd->getNextNode(), nullptr);
+}
+
+TEST_F(SandboxIRTest, ReturnInst) {
+  parseIR(C, R"IR(
+define i8 @foo(i8 %val) {
+  %add = add i8 %val, 42
+  ret i8 %val
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Val = F->getArg(0);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  It++;
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  // Check that the ReturnInst has been created correctly.
+  // Check getReturnValue().
+  EXPECT_EQ(Ret->getReturnValue(), Val);
+
+  // Check create(InsertBefore) a void ReturnInst.
+  auto *NewRet1 = cast<sandboxir::ReturnInst>(
+      sandboxir::ReturnInst::create(nullptr, /*InsertBefore=*/Ret, Ctx));
+  EXPECT_EQ(NewRet1->getReturnValue(), nullptr);
+  // Check create(InsertBefore) a non-void ReturnInst.
+  auto *NewRet2 = cast<sandboxir::ReturnInst>(
+      sandboxir::ReturnInst::create(Val, /*InsertBefore=*/Ret, Ctx));
+  EXPECT_EQ(NewRet2->getReturnValue(), Val);
+
+  // Check create(InsertAtEnd) a void ReturnInst.
+  auto *NewRet3 = cast<sandboxir::ReturnInst>(
+      sandboxir::ReturnInst::create(nullptr, /*InsertAtEnd=*/BB, Ctx));
+  EXPECT_EQ(NewRet3->getReturnValue(), nullptr);
+  // Check create(InsertAtEnd) a non-void ReturnInst.
+  auto *NewRet4 = cast<sandboxir::ReturnInst>(
+      sandboxir::ReturnInst::create(Val, /*InsertAtEnd=*/BB, Ctx));
+  EXPECT_EQ(NewRet4->getReturnValue(), Val);
+}
+
+TEST_F(SandboxIRTest, CallBase) {
+  parseIR(C, R"IR(
+declare void @bar1(i8)
+declare void @bar2()
+declare void @bar3()
+declare void @variadic(ptr, ...)
+
+define i8 @foo(i8 %arg0, i32 %arg1, ptr %indirectFoo) {
+  %call = call i8 @foo(i8 %arg0, i32 %arg1)
+  call void @bar1(i8 %arg0)
+  call void @bar2()
+  call void %indirectFoo()
+  call void @bar2() noreturn
+  tail call fastcc void @bar2()
+  call void (ptr, ...) @variadic(ptr %indirectFoo, i32 1)
+  ret i8 %call
+}
+)IR");
+  llvm::Function &LLVMF = *M->getFunction("foo");
+  unsigned ArgIdx = 0;
+  llvm::Argument *LLVMArg0 = LLVMF.getArg(ArgIdx++);
+  llvm::Argument *LLVMArg1 = LLVMF.getArg(ArgIdx++);
+  llvm::BasicBlock *LLVMBB = &*LLVMF.begin();
+  SmallVector<llvm::CallBase *, 8> LLVMCalls;
+  auto LLVMIt = LLVMBB->begin();
+  while (isa<llvm::CallBase>(&*LLVMIt))
+    LLVMCalls.push_back(cast<llvm::CallBase>(&*LLVMIt++));
+
+  sandboxir::Context Ctx(C);
+  sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
+
+  for (llvm::CallBase *LLVMCall : LLVMCalls) {
+    // Check classof(Instruction *).
+    auto *Call = cast<sandboxir::CallBase>(Ctx.getValue(LLVMCall));
+    // Check classof(Value *).
+    EXPECT_TRUE(isa<sandboxir::CallBase>((sandboxir::Value *)Call));
+    // Check getFunctionType().
+    EXPECT_EQ(Call->getFunctionType(), LLVMCall->getFunctionType());
+    // Check data_ops().
+    EXPECT_EQ(range_size(Call->data_ops()), range_size(LLVMCall->data_ops()));
+    auto DataOpIt = Call->data_operands_begin();
+    for (llvm::Use &LLVMUse : LLVMCall->data_ops()) {
+      Value *LLVMOp = LLVMUse.get();
+      sandboxir::Use Use = *DataOpIt++;
+      EXPECT_EQ(Ctx.getValue(LLVMOp), Use.get());
+      // Check isDataOperand().
+      EXPECT_EQ(Call->isDataOperand(Use), LLVMCall->isDataOperand(&LLVMUse));
+      // Check getDataOperandNo().
+      EXPECT_EQ(Call->getDataOperandNo(Use),
+                LLVMCall->getDataOperandNo(&LLVMUse));
+      // Check isArgOperand().
+      EXPECT_EQ(Call->isArgOperand(Use), LLVMCall->isArgOperand(&LLVMUse));
+      // Check isCallee().
+      EXPECT_EQ(Call->isCallee(Use), LLVMCall->isCallee(&LLVMUse));
+    }
+    // Check data_operands_empty().
+    EXPECT_EQ(Call->data_operands_empty(), LLVMCall->data_operands_empty());
+    // Check data_operands_size().
+    EXPECT_EQ(Call->data_operands_size(), LLVMCall->data_operands_size());
+    // Check getNumTotalBundleOperands().
+    EXPECT_EQ(Call->getNumTotalBundleOperands(),
+              LLVMCall->getNumTotalBundleOperands());
+    // Check args().
+    EXPECT_EQ(range_size(Call->args()), range_size(LLVMCall->args()));
+    auto ArgIt = Call->arg_begin();
+    for (llvm::Use &LLVMUse : LLVMCall->args()) {
+      Value *LLVMArg = LLVMUse.get();
+      sandboxir::Use Use = *ArgIt++;
+      EXPECT_EQ(Ctx.getValue(LLVMArg), Use.get());
+    }
+    // Check arg_empty().
+    EXPECT_EQ(Call->arg_empty(), LLVMCall->arg_empty());
+    // Check arg_size().
+    EXPECT_EQ(Call->arg_size(), LLVMCall->arg_size());
+    for (unsigned ArgIdx = 0, E = Call->arg_size(); ArgIdx != E; ++ArgIdx) {
+      // Check getArgOperand().
+      EXPECT_EQ(Call->getArgOperand(ArgIdx),
+                Ctx.getValue(LLVMCall->getArgOperand(ArgIdx)));
+      // Check getArgOperandUse().
+      sandboxir::Use Use = Call->getArgOperandUse(ArgIdx);
+      llvm::Use &LLVMUse = LLVMCall->getArgOperandUse(ArgIdx);
+      EXPECT_EQ(Use.get(), Ctx.getValue(LLVMUse.get()));
+      // Check getArgOperandNo().
+      EXPECT_EQ(Call->getArgOperandNo(Use),
+                LLVMCall->getArgOperandNo(&LLVMUse));
+    }
+    // Check hasArgument().
+    SmallVector<llvm::Value *> TestArgs(
+        {LLVMArg0, LLVMArg1, &LLVMF, LLVMBB, LLVMCall});
+    for (llvm::Value *LLVMV : TestArgs) {
+      sandboxir::Value *V = Ctx.getValue(LLVMV);
+      EXPECT_EQ(Call->hasArgument(V), LLVMCall->hasArgument(LLVMV));
+    }
+    // Check getCalledOperand().
+    EXPECT_EQ(Call->getCalledOperand(),
+              Ctx.getValue(LLVMCall->getCalledOperand()));
+    // Check getCalledOperandUse().
+    EXPECT_EQ(Call->getCalledOperandUse().get(),
+              Ctx.getValue(LLVMCall->getCalledOperandUse()));
+    // Check getCalledFunction().
+    if (LLVMCall->getCalledFunction() == nullptr)
+      EXPECT_EQ(Call->getCalledFunction(), nullptr);
+    else {
+      auto *LLVMCF = cast<llvm::Function>(LLVMCall->getCalledFunction());
+      (void)LLVMCF;
+      EXPECT_EQ(Call->getCalledFunction(),
+                cast<sandboxir::Function>(
+                    Ctx.getValue(LLVMCall->getCalledFunction())));
+    }
+    // Check isIndirectCall().
+    EXPECT_EQ(Call->isIndirectCall(), LLVMCall->isIndirectCall());
+    // Check getCaller().
+    EXPECT_EQ(Call->getCaller(), Ctx.getValue(LLVMCall->getCaller()));
+    // Check isMustTailCall().
+    EXPECT_EQ(Call->isMustTailCall(), LLVMCall->isMustTailCall());
+    // Check isTailCall().
+    EXPECT_EQ(Call->isTailCall(), LLVMCall->isTailCall());
+    // Check getIntrinsicID().
+    EXPECT_EQ(Call->getIntrinsicID(), LLVMCall->getIntrinsicID());
+    // Check getCallingConv().
+    EXPECT_EQ(Call->getCallingConv(), LLVMCall->getCallingConv());
+    // Check isInlineAsm().
+    EXPECT_EQ(Call->isInlineAsm(), LLVMCall->isInlineAsm());
+  }
+
+  auto *Arg0 = F.getArg(0);
+  auto *Arg1 = F.getArg(1);
+  auto *BB = &*F.begin();
+  auto It = BB->begin();
+  auto *Call0 = cast<sandboxir::CallBase>(&*It++);
+  [[maybe_unused]] auto *Call1 = cast<sandboxir::CallBase>(&*It++);
+  auto *Call2 = cast<sandboxir::CallBase>(&*It++);
+  // Check setArgOperand
+  Call0->setArgOperand(0, Arg1);
+  EXPECT_EQ(Call0->getArgOperand(0), Arg1);
+  Call0->setArgOperand(0, Arg0);
+  EXPECT_EQ(Call0->getArgOperand(0), Arg0);
+
+  auto *Bar3F = Ctx.createFunction(M->getFunction("bar3"));
+
+  // Check setCalledOperand
+  auto *SvOp = Call0->getCalledOperand();
+  Call0->setCalledOperand(Bar3F);
+  EXPECT_EQ(Call0->getCalledOperand(), Bar3F);
+  Call0->setCalledOperand(SvOp);
+  // Check setCalledFunction
+  Call2->setCalledFunction(Bar3F);
+  EXPECT_EQ(Call2->getCalledFunction(), Bar3F);
+}
+
+TEST_F(SandboxIRTest, CallInst) {
+  parseIR(C, R"IR(
+define i8 @foo(i8 %arg) {
+  %call = call i8 @foo(i8 %arg)
+  ret i8 %call
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg0 = F.getArg(ArgIdx++);
+  auto *BB = &*F.begin();
+  auto It = BB->begin();
+  auto *Call = cast<sandboxir::CallInst>(&*It++);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  EXPECT_EQ(Call->getNumOperands(), 2u);
+  EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Ret);
+  FunctionType *FTy = F.getFunctionType();
+  SmallVector<sandboxir::Value *, 1> Args;
+  Args.push_back(Arg0);
+  {
+    // Check create() WhereIt.
+    auto *Call = cast<sandboxir::CallInst>(sandboxir::CallInst::create(
+        FTy, &F, Args, /*WhereIt=*/Ret->getIterator(), BB, Ctx));
+    EXPECT_EQ(Call->getNextNode(), Ret);
+    EXPECT_EQ(Call->getCalledFunction(), &F);
+    EXPECT_EQ(range_size(Call->args()), 1u);
+    EXPECT_EQ(Call->getArgOperand(0), Arg0);
+  }
+  {
+    // Check create() InsertBefore.
+    auto *Call = cast<sandboxir::CallInst>(
+        sandboxir::CallInst::create(FTy, &F, Args, /*InsertBefore=*/Ret, Ctx));
+    EXPECT_EQ(Call->getNextNode(), Ret);
+    EXPECT_EQ(Call->getCalledFunction(), &F);
+    EXPECT_EQ(range_size(Call->args()), 1u);
+    EXPECT_EQ(Call->getArgOperand(0), Arg0);
+  }
+  {
+    // Check create() InsertAtEnd.
+    auto *Call = cast<sandboxir::CallInst>(
+        sandboxir::CallInst::create(FTy, &F, Args, /*InsertAtEnd=*/BB, Ctx));
+    EXPECT_EQ(Call->getPrevNode(), Ret);
+    EXPECT_EQ(Call->getCalledFunction(), &F);
+    EXPECT_EQ(range_size(Call->args()), 1u);
+    EXPECT_EQ(Call->getArgOperand(0), Arg0);
+  }
+}
+
+TEST_F(SandboxIRTest, InvokeInst) {
+  parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+   invoke i8 @foo(i8 %arg) to label %normal_bb
+                       unwind label %exception_bb
+ normal_bb:
+   ret void
+ exception_bb:
+   %lpad = landingpad { ptr, i32}
+           cleanup
+   ret void
+ other_bb:
+   ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto *Arg = F.getArg(0);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *NormalBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "normal_bb")));
+  auto *ExceptionBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "exception_bb")));
+  auto *LandingPad = &*ExceptionBB->begin();
+  auto *OtherBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+  auto It = BB0->begin();
+  // Check classof(Instruction *).
+  auto *Invoke = cast<sandboxir::InvokeInst>(&*It++);
+
+  // Check getNormalDest().
+  EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+  // Check getUnwindDest().
+  EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+  // Check getSuccessor().
+  EXPECT_EQ(Invoke->getSuccessor(0), NormalBB);
+  EXPECT_EQ(Invoke->getSuccessor(1), ExceptionBB);
+  // Check setNormalDest().
+  Invoke->setNormalDest(OtherBB);
+  EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+  EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+  // Check setUnwindDest().
+  Invoke->setUnwindDest(OtherBB);
+  EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+  EXPECT_EQ(Invoke->getUnwindDest(), OtherBB);
+  // Check setSuccessor().
+  Invoke->setSuccessor(0, NormalBB);
+  EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+  Invoke->setSuccessor(1, ExceptionBB);
+  EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+  // Check getLandingPadInst().
+  EXPECT_EQ(Invoke->getLandingPadInst(), LandingPad);
+
+  {
+    // Check create() WhereIt, WhereBB.
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *InsertBefore = &*BB0->begin();
+    auto *NewInvoke = cast<sandboxir::InvokeInst>(sandboxir::InvokeInst::create(
+        F.getFunctionType(), &F, NormalBB, ExceptionBB, Args,
+        /*WhereIt=*/InsertBefore->getIterator(), /*WhereBB=*/BB0, Ctx));
+    EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+    EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+    EXPECT_EQ(NewInvoke->getNextNode(), InsertBefore);
+  }
+  {
+    // Check create() InsertBefore.
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *InsertBefore = &*BB0->begin();
+    auto *NewInvoke = cast<sandboxir::InvokeInst>(
+        sandboxir::InvokeInst::create(F.getFunctionType(), &F, NormalBB,
+                                      ExceptionBB, Args, InsertBefore, Ctx));
+    EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+    EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+    EXPECT_EQ(NewInvoke->getNextNode(), InsertBefore);
+  }
+  {
+    // Check create() InsertAtEnd.
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *NewInvoke = cast<sandboxir::InvokeInst>(sandboxir::InvokeInst::create(
+        F.getFunctionType(), &F, NormalBB, ExceptionBB, Args,
+        /*InsertAtEnd=*/BB0, Ctx));
+    EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+    EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+    EXPECT_EQ(NewInvoke->getParent(), BB0);
+    EXPECT_EQ(NewInvoke->getNextNode(), nullptr);
+  }
+}
+
+TEST_F(SandboxIRTest, CallBrInst) {
+  parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+   callbr void asm "", ""()
+               to label %bb1 [label %bb2]
+ bb1:
+   ret void
+ bb2:
+   ret void
+ other_bb:
+   ret void
+ bb3:
+   callbr void @foo(i8 %arg)
+               to label %bb1 [label %bb2]
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  auto *LLVMBB0 = getBasicBlockByName(LLVMF, "bb0");
+  auto *LLVMCallBr = cast<llvm::CallBrInst>(&*LLVMBB0->begin());
+  sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto *Arg = F.getArg(0);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+  auto *BB3 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb3")));
+  auto *OtherBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+  auto It = BB0->begin();
+  // Check classof(Instruction *).
+  auto *CallBr0 = cast<sandboxir::CallBrInst>(&*It++);
+
+  It = BB3->begin();
+  auto *CallBr1 = cast<sandboxir::CallBrInst>(&*It++);
+  for (sandboxir::CallBrInst *CallBr : {CallBr0, CallBr1}) {
+    // Check getNumIndirectDests().
+    EXPECT_EQ(CallBr->getNumIndirectDests(), 1u);
+    // Check getIndirectDestLabel().
+    EXPECT_EQ(CallBr->getIndirectDestLabel(0),
+              Ctx.getValue(LLVMCallBr->getIndirectDestLabel(0)));
+    // Check getIndirectDestLabelUse().
+    EXPECT_EQ(CallBr->getIndirectDestLabelUse(0),
+              Ctx.getValue(LLVMCallBr->getIndirectDestLabelUse(0)));
+    // Check getDefaultDest().
+    EXPECT_EQ(CallBr->getDefaultDest(),
+              Ctx.getValue(LLVMCallBr->getDefaultDest()));
+    // Check getIndirectDest().
+    EXPECT_EQ(CallBr->getIndirectDest(0),
+              Ctx.getValue(LLVMCallBr->getIndirectDest(0)));
+    // Check getIndirectDests().
+    auto Dests = CallBr->getIndirectDests();
+    EXPECT_EQ(Dests.size(), LLVMCallBr->getIndirectDests().size());
+    EXPECT_EQ(Dests[0], Ctx.getValue(LLVMCallBr->getIndirectDests()[0]));
+    // Check getNumSuccessors().
+    EXPECT_EQ(CallBr->getNumSuccessors(), LLVMCallBr->getNumSuccessors());
+    // Check getSuccessor().
+    for (unsigned SuccIdx = 0, E = CallBr->getNumSuccessors(); SuccIdx != E;
+         ++SuccIdx)
+      EXPECT_EQ(CallBr->getSuccessor(SuccIdx),
+                Ctx.getValue(LLVMCallBr->getSuccessor(SuccIdx)));
+    // Check setDefaultDest().
+    auto *SvDefaultDest = CallBr->getDefaultDest();
+    CallBr->setDefaultDest(OtherBB);
+    EXPECT_EQ(CallBr->getDefaultDest(), OtherBB);
+    CallBr->setDefaultDest(SvDefaultDest);
+    // Check setIndirectDest().
+    auto *SvIndirectDest = CallBr->getIndirectDest(0);
+    CallBr->setIndirectDest(0, OtherBB);
+    EXPECT_EQ(CallBr->getIndirectDest(0), OtherBB);
+    CallBr->setIndirectDest(0, SvIndirectDest);
+  }
+
+  {
+    // Check create() WhereIt, WhereBB.
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *NewCallBr = cast<sandboxir::CallBrInst>(sandboxir::CallBrInst::create(
+        F.getFunctionType(), &F, BB1, {BB2}, Args, /*WhereIt=*/BB0->end(),
+        /*WhereBB=*/BB0, Ctx));
+    EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+    EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+    EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+    EXPECT_EQ(NewCallBr->getNextNode(), nullptr);
+    EXPECT_EQ(NewCallBr->getParent(), BB0);
+  }
+  {
+    // Check create() InsertBefore
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *InsertBefore = &*BB0->rbegin();
+    auto *NewCallBr = cast<sandboxir::CallBrInst>(sandboxir::CallBrInst::create(
+        F.getFunctionType(), &F, BB1, {BB2}, Args, InsertBefore, Ctx));
+    EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+    EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+    EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+    EXPECT_EQ(NewCallBr->getNextNode(), InsertBefore);
+  }
+  {
+    // Check create() InsertAtEnd.
+    SmallVector<sandboxir::Value *> Args({Arg});
+    auto *NewCallBr = cast<sandboxir::CallBrInst>(
+        sandboxir::CallBrInst::create(F.getFunctionType(), &F, BB1, {BB2}, Args,
+                                      /*InsertAtEnd=*/BB0, Ctx));
+    EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+    EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+    EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+    EXPECT_EQ(NewCallBr->getNextNode(), nullptr);
+    EXPECT_EQ(NewCallBr->getParent(), BB0);
+  }
+}
+
+TEST_F(SandboxIRTest, GetElementPtrInstruction) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr, <2 x ptr> %ptrs) {
+  %gep0 = getelementptr i8, ptr %ptr, i32 0
+  %gep1 = getelementptr nusw i8, ptr %ptr, i32 0
+  %gep2 = getelementptr nuw i8, ptr %ptr, i32 0
+  %gep3 = getelementptr inbounds {i32, {i32, i8}}, ptr %ptr, i32 1, i32 0
+  %gep4 = getelementptr inbounds {i8, i8, {i32, i16}}, <2 x ptr> %ptrs, i32 2, <2 x i32> <i32 0, i32 0>
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  BasicBlock *LLVMBB = &*LLVMF.begin();
+  auto LLVMIt = LLVMBB->begin();
+  SmallVector<llvm::GetElementPtrInst *, 4> LLVMGEPs;
+  while (isa<llvm::GetElementPtrInst>(&*LLVMIt))
+    LLVMGEPs.push_back(cast<llvm::GetElementPtrInst>(&*LLVMIt++));
+  auto *LLVMRet = cast<llvm::ReturnInst>(&*LLVMIt++);
+  sandboxir::Context Ctx(C);
+  [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF);
+
+  for (llvm::GetElementPtrInst *LLVMGEP : LLVMGEPs) {
+    // Check classof().
+    auto *GEP = cast<sandboxir::GetElementPtrInst>(Ctx.getValue(LLVMGEP));
+    // Check getSourceElementType().
+    EXPECT_EQ(GEP->getSourceElementType(), LLVMGEP->getSourceElementType());
+    // Check getResultElementType().
+    EXPECT_EQ(GEP->getResultElementType(), LLVMGEP->getResultElementType());
+    // Check getAddressSpace().
+    EXPECT_EQ(GEP->getAddressSpace(), LLVMGEP->getAddressSpace());
+    // Check indices().
+    EXPECT_EQ(range_size(GEP->indices()), range_size(LLVMGEP->indices()));
+    auto IdxIt = GEP->idx_begin();
+    for (llvm::Value *LLVMIdxV : LLVMGEP->indices()) {
+      sandboxir::Value *IdxV = *IdxIt++;
+      EXPECT_EQ(IdxV, Ctx.getValue(LLVMIdxV));
+    }
+    // Check getPointerOperand().
+    EXPECT_EQ(GEP->getPointerOperand(),
+              Ctx.getValue(LLVMGEP->getPointerOperand()));
+    // Check getPointerOperandIndex().
+    EXPECT_EQ(GEP->getPointerOperandIndex(), LLVMGEP->getPointerOperandIndex());
+    // Check getPointerOperandType().
+    EXPECT_EQ(GEP->getPointerOperandType(), LLVMGEP->getPointerOperandType());
+    // Check getPointerAddressSpace().
+    EXPECT_EQ(GEP->getPointerAddressSpace(), LLVMGEP->getPointerAddressSpace());
+    // Check getNumIndices().
+    EXPECT_EQ(GEP->getNumIndices(), LLVMGEP->getNumIndices());
+    // Check hasIndices().
+    EXPECT_EQ(GEP->hasIndices(), LLVMGEP->hasIndices());
+    // Check hasAllConstantIndices().
+    EXPECT_EQ(GEP->hasAllConstantIndices(), LLVMGEP->hasAllConstantIndices());
+    // Check getNoWrapFlags().
+    EXPECT_EQ(GEP->getNoWrapFlags(), LLVMGEP->getNoWrapFlags());
+    // Check isInBounds().
+    EXPECT_EQ(GEP->isInBounds(), LLVMGEP->isInBounds());
+    // Check hasNoUnsignedWrap().
+    EXPECT_EQ(GEP->hasNoUnsignedWrap(), LLVMGEP->hasNoUnsignedWrap());
+    // Check accumulateConstantOffset().
+    DataLayout DL(M.get());
+    APInt Offset1 =
+        APInt::getZero(DL.getIndexSizeInBits(GEP->getPointerAddressSpace()));
+    APInt Offset2 =
+        APInt::getZero(DL.getIndexSizeInBits(GEP->getPointerAddressSpace()));
+    EXPECT_EQ(GEP->accumulateConstantOffset(DL, Offset1),
+              LLVMGEP->accumulateConstantOffset(DL, Offset2));
+    EXPECT_EQ(Offset1, Offset2);
+  }
+
+  auto *BB = &*F.begin();
+  auto *GEP0 = cast<sandboxir::GetElementPtrInst>(&*BB->begin());
+  auto *Ret = cast<sandboxir::ReturnInst>(Ctx.getValue(LLVMRet));
+  SmallVector<sandboxir::Value *> Indices(GEP0->indices());
+
+  // Check create() WhereIt, WhereBB.
+  auto *NewGEP0 =
+      cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+          GEP0->getType(), GEP0->getPointerOperand(), Indices,
+          /*WhereIt=*/Ret->getIterator(), /*WhereBB=*/Ret->getParent(), Ctx,
+          "NewGEP0"));
+  EXPECT_EQ(NewGEP0->getName(), "NewGEP0");
+  EXPECT_EQ(NewGEP0->getType(), GEP0->getType());
+  EXPECT_EQ(NewGEP0->getPointerOperand(), GEP0->getPointerOperand());
+  EXPECT_EQ(range_size(NewGEP0->indices()), range_size(GEP0->indices()));
+  for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+            OldIt = GEP0->idx_begin();
+       NewIt != NewItE; ++NewIt) {
+    sandboxir::Value *NewIdxV = *NewIt;
+    sandboxir::Value *OldIdxV = *OldIt;
+    EXPECT_EQ(NewIdxV, OldIdxV);
+  }
+  EXPECT_EQ(NewGEP0->getNextNode(), Ret);
+
+  // Check create() InsertBefore.
+  auto *NewGEP1 =
+      cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+          GEP0->getType(), GEP0->getPointerOperand(), Indices,
+          /*InsertBefore=*/Ret, Ctx, "NewGEP1"));
+  EXPECT_EQ(NewGEP1->getName(), "NewGEP1");
+  EXPECT_EQ(NewGEP1->getType(), GEP0->getType());
+  EXPECT_EQ(NewGEP1->getPointerOperand(), GEP0->getPointerOperand());
+  EXPECT_EQ(range_size(NewGEP1->indices()), range_size(GEP0->indices()));
+  for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+            OldIt = GEP0->idx_begin();
+       NewIt != NewItE; ++NewIt) {
+    sandboxir::Value *NewIdxV = *NewIt;
+    sandboxir::Value *OldIdxV = *OldIt;
+    EXPECT_EQ(NewIdxV, OldIdxV);
+  }
+  EXPECT_EQ(NewGEP1->getNextNode(), Ret);
+
+  // Check create() InsertAtEnd.
+  auto *NewGEP2 =
+      cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+          GEP0->getType(), GEP0->getPointerOperand(), Indices,
+          /*InsertAtEnd=*/BB, Ctx, "NewGEP2"));
+  EXPECT_EQ(NewGEP2->getName(), "NewGEP2");
+  EXPECT_EQ(NewGEP2->getType(), GEP0->getType());
+  EXPECT_EQ(NewGEP2->getPointerOperand(), GEP0->getPointerOperand());
+  EXPECT_EQ(range_size(NewGEP2->indices()), range_size(GEP0->indices()));
+  for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+            OldIt = GEP0->idx_begin();
+       NewIt != NewItE; ++NewIt) {
+    sandboxir::Value *NewIdxV = *NewIt;
+    sandboxir::Value *OldIdxV = *OldIt;
+    EXPECT_EQ(NewIdxV, OldIdxV);
+  }
+  EXPECT_EQ(NewGEP2->getPrevNode(), Ret);
+  EXPECT_EQ(NewGEP2->getNextNode(), nullptr);
+}
+
+TEST_F(SandboxIRTest, CastInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg, float %farg, double %darg, ptr %ptr) {
+  %zext = zext i32 %arg to i64
+  %sext = sext i32 %arg to i64
+  %fptoui = fptoui float %farg to i32
+  %fptosi = fptosi float %farg to i32
+  %fpext = fpext float %farg to double
+  %ptrtoint = ptrtoint ptr %ptr to i32
+  %inttoptr = inttoptr i32 %arg to ptr
+  %sitofp = sitofp i32 %arg to float
+  %uitofp = uitofp i32 %arg to float
+  %trunc = trunc i32 %arg to i16
+  %fptrunc = fptrunc double %darg to float
+  %bitcast = bitcast i32 %arg to float
+  %addrspacecast = addrspacecast ptr %ptr to ptr addrspace(1)
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg = F->getArg(ArgIdx++);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+
+  Type *Ti64 = Type::getInt64Ty(C);
+  Type *Ti32 = Type::getInt32Ty(C);
+  Type *Ti16 = Type::getInt16Ty(C);
+  Type *Tdouble = Type::getDoubleTy(C);
+  Type *Tfloat = Type::getFloatTy(C);
+  Type *Tptr = Tfloat->getPointerTo();
+  Type *Tptr1 = Tfloat->getPointerTo(1);
+
+  // Check classof(), getOpcode(), getSrcTy(), getDstTy()
+  auto *ZExt = cast<sandboxir::CastInst>(&*It++);
+  auto *ZExtI = cast<sandboxir::ZExtInst>(ZExt);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(ZExtI));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(ZExtI));
+  EXPECT_EQ(ZExt->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+  EXPECT_EQ(ZExt->getSrcTy(), Ti32);
+  EXPECT_EQ(ZExt->getDestTy(), Ti64);
+
+  auto *SExt = cast<sandboxir::CastInst>(&*It++);
+  auto *SExtI = cast<sandboxir::SExtInst>(SExt);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(SExt));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(SExtI));
+  EXPECT_EQ(SExt->getOpcode(), sandboxir::Instruction::Opcode::SExt);
+  EXPECT_EQ(SExt->getSrcTy(), Ti32);
+  EXPECT_EQ(SExt->getDestTy(), Ti64);
+
+  auto *FPToUI = cast<sandboxir::CastInst>(&*It++);
+  auto *FPToUII = cast<sandboxir::FPToUIInst>(FPToUI);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPToUI));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPToUII));
+  EXPECT_EQ(FPToUI->getOpcode(), sandboxir::Instruction::Opcode::FPToUI);
+  EXPECT_EQ(FPToUI->getSrcTy(), Tfloat);
+  EXPECT_EQ(FPToUI->getDestTy(), Ti32);
+
+  auto *FPToSI = cast<sandboxir::CastInst>(&*It++);
+  auto *FPToSII = cast<sandboxir::FPToSIInst>(FPToSI);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPToSI));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPToSII));
+  EXPECT_EQ(FPToSI->getOpcode(), sandboxir::Instruction::Opcode::FPToSI);
+  EXPECT_EQ(FPToSI->getSrcTy(), Tfloat);
+  EXPECT_EQ(FPToSI->getDestTy(), Ti32);
+
+  auto *FPExt = cast<sandboxir::CastInst>(&*It++);
+  auto *FPExtI = cast<sandboxir::FPExtInst>(FPExt);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPExt));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPExtI));
+  EXPECT_EQ(FPExt->getOpcode(), sandboxir::Instruction::Opcode::FPExt);
+  EXPECT_EQ(FPExt->getSrcTy(), Tfloat);
+  EXPECT_EQ(FPExt->getDestTy(), Tdouble);
+
+  auto *PtrToInt = cast<sandboxir::CastInst>(&*It++);
+  auto *PtrToIntI = cast<sandboxir::PtrToIntInst>(PtrToInt);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(PtrToInt));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(PtrToIntI));
+  EXPECT_EQ(PtrToInt->getOpcode(), sandboxir::Instruction::Opcode::PtrToInt);
+  EXPECT_EQ(PtrToInt->getSrcTy(), Tptr);
+  EXPECT_EQ(PtrToInt->getDestTy(), Ti32);
+
+  auto *IntToPtr = cast<sandboxir::CastInst>(&*It++);
+  auto *IntToPtrI = cast<sandboxir::IntToPtrInst>(IntToPtr);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(IntToPtr));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(IntToPtrI));
+  EXPECT_EQ(IntToPtr->getOpcode(), sandboxir::Instruction::Opcode::IntToPtr);
+  EXPECT_EQ(IntToPtr->getSrcTy(), Ti32);
+  EXPECT_EQ(IntToPtr->getDestTy(), Tptr);
+
+  auto *SIToFP = cast<sandboxir::CastInst>(&*It++);
+  auto *SIToFPI = cast<sandboxir::SIToFPInst>(SIToFP);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(SIToFP));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(SIToFPI));
+  EXPECT_EQ(SIToFP->getOpcode(), sandboxir::Instruction::Opcode::SIToFP);
+  EXPECT_EQ(SIToFP->getSrcTy(), Ti32);
+  EXPECT_EQ(SIToFP->getDestTy(), Tfloat);
+
+  auto *UIToFP = cast<sandboxir::CastInst>(&*It++);
+  auto *UIToFPI = cast<sandboxir::UIToFPInst>(UIToFP);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(UIToFP));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(UIToFPI));
+  EXPECT_EQ(UIToFP->getOpcode(), sandboxir::Instruction::Opcode::UIToFP);
+  EXPECT_EQ(UIToFP->getSrcTy(), Ti32);
+  EXPECT_EQ(UIToFP->getDestTy(), Tfloat);
+
+  auto *Trunc = cast<sandboxir::CastInst>(&*It++);
+  auto *TruncI = cast<sandboxir::TruncInst>(Trunc);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(Trunc));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(TruncI));
+  EXPECT_EQ(Trunc->getOpcode(), sandboxir::Instruction::Opcode::Trunc);
+  EXPECT_EQ(Trunc->getSrcTy(), Ti32);
+  EXPECT_EQ(Trunc->getDestTy(), Ti16);
+
+  auto *FPTrunc = cast<sandboxir::CastInst>(&*It++);
+  auto *FPTruncI = cast<sandboxir::FPTruncInst>(FPTrunc);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPTrunc));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(FPTruncI));
+  EXPECT_EQ(FPTrunc->getOpcode(), sandboxir::Instruction::Opcode::FPTrunc);
+  EXPECT_EQ(FPTrunc->getSrcTy(), Tdouble);
+  EXPECT_EQ(FPTrunc->getDestTy(), Tfloat);
+
+  auto *BitCast = cast<sandboxir::CastInst>(&*It++);
+  auto *BitCastI = cast<sandboxir::BitCastInst>(BitCast);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(BitCast));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(BitCastI));
+  EXPECT_EQ(BitCast->getOpcode(), sandboxir::Instruction::Opcode::BitCast);
+  EXPECT_EQ(BitCast->getSrcTy(), Ti32);
+  EXPECT_EQ(BitCast->getDestTy(), Tfloat);
+
+  auto *AddrSpaceCast = cast<sandboxir::CastInst>(&*It++);
+  auto *AddrSpaceCastI = cast<sandboxir::AddrSpaceCastInst>(AddrSpaceCast);
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(AddrSpaceCast));
+  EXPECT_TRUE(isa<sandboxir::UnaryInstruction>(AddrSpaceCastI));
+  EXPECT_EQ(AddrSpaceCast->getOpcode(),
+            sandboxir::Instruction::Opcode::AddrSpaceCast);
+  EXPECT_EQ(AddrSpaceCast->getSrcTy(), Tptr);
+  EXPECT_EQ(AddrSpaceCast->getDestTy(), Tptr1);
+
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  {
+    // Check create() WhereIt, WhereBB
+    auto *NewI = cast<sandboxir::CastInst>(sandboxir::CastInst::create(
+        Ti64, sandboxir::Instruction::Opcode::SExt, Arg, /*WhereIt=*/BB->end(),
+        /*WhereBB=*/BB, Ctx, "SExt"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::SExt);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), Ti64);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), nullptr);
+    EXPECT_EQ(NewI->getPrevNode(), Ret);
+  }
+
+  {
+    // Check create() InsertBefore.
+    auto *NewI = cast<sandboxir::CastInst>(
+        sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::ZExt,
+                                    Arg, /*InsertBefore=*/Ret, Ctx, "ZExt"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), Ti64);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), Ret);
+  }
+  {
+    // Check create() InsertAtEnd.
+    auto *NewI = cast<sandboxir::CastInst>(
+        sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::ZExt,
+                                    Arg, /*InsertAtEnd=*/BB, Ctx, "ZExt"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), Ti64);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), nullptr);
+    EXPECT_EQ(NewI->getParent(), BB);
+  }
+
+  {
+#ifndef NDEBUG
+    // Check that passing a non-cast opcode crashes.
+    EXPECT_DEATH(
+        sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::Store,
+                                    Arg, /*InsertBefore=*/Ret, Ctx, "Bad"),
+        ".*Opcode.*");
+#endif // NDEBUG
+  }
+}
+
+/// CastInst's subclasses are very similar so we can use a common test function
+/// for them.
+template <typename SubclassT, sandboxir::Instruction::Opcode OpcodeT>
+void testCastInst(llvm::Module &M, Type *SrcTy, Type *DstTy) {
+  Function &LLVMF = *M.getFunction("foo");
+  sandboxir::Context Ctx(M.getContext());
+  sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg = F->getArg(ArgIdx++);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+
+  auto *CI = cast<SubclassT>(&*It++);
+  EXPECT_EQ(CI->getOpcode(), OpcodeT);
+  EXPECT_EQ(CI->getSrcTy(), SrcTy);
+  EXPECT_EQ(CI->getDestTy(), DstTy);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  {
+    // Check create() WhereIt, WhereBB
+    auto *NewI =
+        cast<SubclassT>(SubclassT::create(Arg, DstTy, /*WhereIt=*/BB->end(),
+                                          /*WhereBB=*/BB, Ctx, "NewCI"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), DstTy);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), nullptr);
+    EXPECT_EQ(NewI->getPrevNode(), Ret);
+    // Check instr name.
+    EXPECT_EQ(NewI->getName(), "NewCI");
+  }
+  {
+    // Check create() InsertBefore.
+    auto *NewI =
+        cast<SubclassT>(SubclassT::create(Arg, DstTy,
+                                          /*InsertBefore=*/Ret, Ctx, "NewCI"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), DstTy);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), Ret);
+  }
+  {
+    // Check create() InsertAtEnd.
+    auto *NewI =
+        cast<SubclassT>(SubclassT::create(Arg, DstTy,
+                                          /*InsertAtEnd=*/BB, Ctx, "NewCI"));
+    // Check getOpcode().
+    EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+    // Check getSrcTy().
+    EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+    // Check getDestTy().
+    EXPECT_EQ(NewI->getDestTy(), DstTy);
+    // Check instr position.
+    EXPECT_EQ(NewI->getNextNode(), nullptr);
+    EXPECT_EQ(NewI->getParent(), BB);
+  }
+}
+
+TEST_F(SandboxIRTest, TruncInst) {
+  parseIR(C, R"IR(
+define void @foo(i64 %arg) {
+  %trunc = trunc i64 %arg to i32
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::TruncInst, sandboxir::Instruction::Opcode::Trunc>(
+      *M,
+      /*SrcTy=*/Type::getInt64Ty(C), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, ZExtInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %zext = zext i32 %arg to i64
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::ZExtInst, sandboxir::Instruction::Opcode::ZExt>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/Type::getInt64Ty(C));
+}
+
+TEST_F(SandboxIRTest, SExtInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %sext = sext i32 %arg to i64
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::SExtInst, sandboxir::Instruction::Opcode::SExt>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/Type::getInt64Ty(C));
+}
+
+TEST_F(SandboxIRTest, FPTruncInst) {
+  parseIR(C, R"IR(
+define void @foo(double %arg) {
+  %fptrunc = fptrunc double %arg to float
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::FPTruncInst, sandboxir::Instruction::Opcode::FPTrunc>(
+      *M,
+      /*SrcTy=*/Type::getDoubleTy(C), /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, FPExtInst) {
+  parseIR(C, R"IR(
+define void @foo(float %arg) {
+  %fpext = fpext float %arg to double
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::FPExtInst, sandboxir::Instruction::Opcode::FPExt>(
+      *M,
+      /*SrcTy=*/Type::getFloatTy(C), /*DstTy=*/Type::getDoubleTy(C));
+}
+
+TEST_F(SandboxIRTest, UIToFPInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %uitofp = uitofp i32 %arg to float
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::UIToFPInst, sandboxir::Instruction::Opcode::UIToFP>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, SIToFPInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %sitofp = sitofp i32 %arg to float
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::SIToFPInst, sandboxir::Instruction::Opcode::SIToFP>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C),
+      /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, FPToUIInst) {
+  parseIR(C, R"IR(
+define void @foo(float %arg) {
+  %fptoui = fptoui float %arg to i32
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::FPToUIInst, sandboxir::Instruction::Opcode::FPToUI>(
+
+      *M, /*SrcTy=*/Type::getFloatTy(C), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, FPToSIInst) {
+  parseIR(C, R"IR(
+define void @foo(float %arg) {
+  %fptosi = fptosi float %arg to i32
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::FPToSIInst, sandboxir::Instruction::Opcode::FPToSI>(
+      *M, /*SrcTy=*/Type::getFloatTy(C), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, IntToPtrInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %inttoptr = inttoptr i32 %arg to ptr
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::IntToPtrInst,
+               sandboxir::Instruction::Opcode::IntToPtr>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/PointerType::get(C, 0));
+}
+
+TEST_F(SandboxIRTest, PtrToIntInst) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr) {
+  %ptrtoint = ptrtoint ptr %ptr to i32
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::PtrToIntInst,
+               sandboxir::Instruction::Opcode::PtrToInt>(
+      *M, /*SrcTy=*/PointerType::get(C, 0), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, BitCastInst) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+  %bitcast = bitcast i32 %arg to float
+  ret void
+}
+)IR");
+  testCastInst<sandboxir::BitCastInst, sandboxir::Instruction::Opcode::BitCast>(
+      *M,
+      /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, AddrSpaceCastInst) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr) {
+  %addrspacecast = addrspacecast ptr %ptr to ptr addrspace(1)
+  ret void
+}
+)IR");
+  Type *Tptr0 = PointerType::get(C, 0);
+  Type *Tptr1 = PointerType::get(C, 1);
+  testCastInst<sandboxir::AddrSpaceCastInst,
+               sandboxir::Instruction::Opcode::AddrSpaceCast>(*M,
+                                                              /*SrcTy=*/Tptr0,
+                                                              /*DstTy=*/Tptr1);
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg = F->getArg(ArgIdx++);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+
+  auto *AddrSpaceCast = cast<sandboxir::AddrSpaceCastInst>(&*It++);
+  EXPECT_EQ(AddrSpaceCast->getOpcode(),
+            sandboxir::Instruction::Opcode::AddrSpaceCast);
+  EXPECT_EQ(AddrSpaceCast->getPointerOperand(), Arg);
+  EXPECT_EQ(sandboxir::AddrSpaceCastInst::getPointerOperandIndex(), 0u);
+  EXPECT_EQ(AddrSpaceCast->getSrcAddressSpace(),
+            cast<PointerType>(Tptr0)->getPointerAddressSpace());
+  EXPECT_EQ(AddrSpaceCast->getDestAddressSpace(),
+            cast<PointerType>(Tptr1)->getPointerAddressSpace());
+}
+
+TEST_F(SandboxIRTest, PHINode) {
+  parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+bb1:
+  br label %bb2
+
+bb2:
+  %phi = phi i32 [ %arg, %bb1 ], [ 0, %bb2 ]
+  br label %bb2
+
+bb3:
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  auto *LLVMBB1 = getBasicBlockByName(LLVMF, "bb1");
+  auto *LLVMBB2 = getBasicBlockByName(LLVMF, "bb2");
+  auto *LLVMBB3 = getBasicBlockByName(LLVMF, "bb3");
+  auto LLVMIt = LLVMBB2->begin();
+  auto *LLVMPHI = cast<llvm::PHINode>(&*LLVMIt++);
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+  auto *Arg = F->getArg(0);
+  auto *BB1 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB1));
+  auto *BB2 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB2));
+  auto *BB3 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB3));
+  auto It = BB2->begin();
+  // Check classof().
+  auto *PHI = cast<sandboxir::PHINode>(&*It++);
+  auto *Br = cast<sandboxir::BranchInst>(&*It++);
+  // Check blocks().
+  EXPECT_EQ(range_size(PHI->blocks()), range_size(LLVMPHI->blocks()));
+  auto BlockIt = PHI->block_begin();
+  for (llvm::BasicBlock *LLVMBB : LLVMPHI->blocks()) {
+    sandboxir::BasicBlock *BB = *BlockIt++;
+    EXPECT_EQ(BB, Ctx.getValue(LLVMBB));
+  }
+  // Check incoming_values().
+  EXPECT_EQ(range_size(PHI->incoming_values()),
+            range_size(LLVMPHI->incoming_values()));
+  auto IncIt = PHI->incoming_values().begin();
+  for (llvm::Value *LLVMV : LLVMPHI->incoming_values()) {
+    sandboxir::Value *IncV = *IncIt++;
+    EXPECT_EQ(IncV, Ctx.getValue(LLVMV));
+  }
+  // Check getNumIncomingValues().
+  EXPECT_EQ(PHI->getNumIncomingValues(), LLVMPHI->getNumIncomingValues());
+  // Check getIncomingValue().
+  EXPECT_EQ(PHI->getIncomingValue(0),
+            Ctx.getValue(LLVMPHI->getIncomingValue(0)));
+  EXPECT_EQ(PHI->getIncomingValue(1),
+            Ctx.getValue(LLVMPHI->getIncomingValue(1)));
+  // Check setIncomingValue().
+  auto *OrigV = PHI->getIncomingValue(0);
+  PHI->setIncomingValue(0, PHI);
+  EXPECT_EQ(PHI->getIncomingValue(0), PHI);
+  PHI->setIncomingValue(0, OrigV);
+  // Check getOperandNumForIncomingValue().
+  EXPECT_EQ(sandboxir::PHINode::getOperandNumForIncomingValue(0),
+            llvm::PHINode::getOperandNumForIncomingValue(0));
+  // Check getIncomingValueNumForOperand().
+  EXPECT_EQ(sandboxir::PHINode::getIncomingValueNumForOperand(0),
+            llvm::PHINode::getIncomingValueNumForOperand(0));
+  // Check getIncomingBlock(unsigned).
+  EXPECT_EQ(PHI->getIncomingBlock(0),
+            Ctx.getValue(LLVMPHI->getIncomingBlock(0)));
+  // Check getIncomingBlock(Use).
+  llvm::Use &LLVMUse = LLVMPHI->getOperandUse(0);
+  sandboxir::Use Use = PHI->getOperandUse(0);
+  EXPECT_EQ(PHI->getIncomingBlock(Use),
+            Ctx.getValue(LLVMPHI->getIncomingBlock(LLVMUse)));
+  // Check setIncomingBlock().
+  sandboxir::BasicBlock *OrigBB = PHI->getIncomingBlock(0);
+  EXPECT_NE(OrigBB, BB2);
+  PHI->setIncomingBlock(0, BB2);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB2);
+  PHI->setIncomingBlock(0, OrigBB);
+  EXPECT_EQ(PHI->getIncomingBlock(0), OrigBB);
+  // Check addIncoming().
+  unsigned OrigNumIncoming = PHI->getNumIncomingValues();
+  PHI->addIncoming(Arg, BB3);
+  EXPECT_EQ(PHI->getNumIncomingValues(), LLVMPHI->getNumIncomingValues());
+  EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming + 1);
+  EXPECT_EQ(PHI->getIncomingValue(OrigNumIncoming), Arg);
+  EXPECT_EQ(PHI->getIncomingBlock(OrigNumIncoming), BB3);
+  // Check removeIncomingValue(unsigned).
+  PHI->removeIncomingValue(OrigNumIncoming);
+  EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming);
+  // Check removeIncomingValue(BasicBlock *).
+  PHI->addIncoming(Arg, BB3);
+  PHI->removeIncomingValue(BB3);
+  EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming);
+  // Check getBasicBlockIndex().
+  EXPECT_EQ(PHI->getBasicBlockIndex(BB1), LLVMPHI->getBasicBlockIndex(LLVMBB1));
+  // Check getIncomingValueForBlock().
+  EXPECT_EQ(PHI->getIncomingValueForBlock(BB1),
+            Ctx.getValue(LLVMPHI->getIncomingValueForBlock(LLVMBB1)));
+  // Check hasConstantValue().
+  llvm::Value *ConstV = LLVMPHI->hasConstantValue();
+  EXPECT_EQ(PHI->hasConstantValue(),
+            ConstV != nullptr ? Ctx.getValue(ConstV) : nullptr);
+  // Check hasConstantOrUndefValue().
+  EXPECT_EQ(PHI->hasConstantOrUndefValue(), LLVMPHI->hasConstantOrUndefValue());
+  // Check isComplete().
+  EXPECT_EQ(PHI->isComplete(), LLVMPHI->isComplete());
+
+  // Check create().
+  auto *NewPHI = cast<sandboxir::PHINode>(
+      sandboxir::PHINode::create(PHI->getType(), 0, Br, Ctx, "NewPHI"));
+  EXPECT_EQ(NewPHI->getType(), PHI->getType());
+  EXPECT_EQ(NewPHI->getNextNode(), Br);
+  EXPECT_EQ(NewPHI->getName(), "NewPHI");
+  EXPECT_EQ(NewPHI->getNumIncomingValues(), 0u);
+  for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+    sandboxir::BasicBlock *IncBB = PHI->getIncomingBlock(Idx);
+    NewPHI->addIncoming(V, IncBB);
+  }
+  EXPECT_EQ(NewPHI->getNumIncomingValues(), PHI->getNumIncomingValues());
+}
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index f090dc521c32b..d016c7793a52c 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -69,6 +69,77 @@ define void @foo(ptr %ptr) {
   EXPECT_EQ(Ld->getOperand(0), Gep0);
 }
 
+TEST_F(TrackerTest, SetUse) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr, i8 %arg) {
+  %ld = load i8, ptr %ptr
+  %add = add i8 %ld, %arg
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg0 = F->getArg(ArgIdx++);
+  auto *BB = &*F->begin();
+  auto &Tracker = Ctx.getTracker();
+  Tracker.save();
+  auto It = BB->begin();
+  auto *Ld = &*It++;
+  auto *Add = &*It++;
+
+  Ctx.save();
+  sandboxir::Use Use = Add->getOperandUse(0);
+  Use.set(Arg0);
+  EXPECT_EQ(Add->getOperand(0), Arg0);
+  Ctx.revert();
+  EXPECT_EQ(Add->getOperand(0), Ld);
+}
+
+TEST_F(TrackerTest, SwapOperands) {
+  parseIR(C, R"IR(
+define void @foo(i1 %cond) {
+ bb0:
+   br i1 %cond, label %bb1, label %bb2
+ bb1:
+   ret void
+ bb2:
+   ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  Ctx.createFunction(&LLVMF);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+  auto &Tracker = Ctx.getTracker();
+  Tracker.save();
+  auto It = BB0->begin();
+  auto *Br = cast<sandboxir::BranchInst>(&*It++);
+
+  unsigned SuccIdx = 0;
+  SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+  for (auto *Succ : Br->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  // This calls User::swapOperandsInternal() internally.
+  Br->swapSuccessors();
+
+  SuccIdx = 0;
+  for (auto *Succ : reverse(Br->successors()))
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  Ctx.getTracker().revert();
+  SuccIdx = 0;
+  for (auto *Succ : Br->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+}
+
 TEST_F(TrackerTest, RUWIf_RAUW_RUOW) {
   parseIR(C, R"IR(
 define void @foo(ptr %ptr) {
@@ -146,3 +217,494 @@ define void @foo(ptr %ptr) {
   Ctx.accept();
   EXPECT_EQ(St0->getOperand(0), Ld1);
 }
+
+// TODO: Test multi-instruction patterns.
+TEST_F(TrackerTest, EraseFromParent) {
+  parseIR(C, R"IR(
+define void @foo(i32 %v1) {
+  %add0 = add i32 %v1, %v1
+  %add1 = add i32 %add0, %v1
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto *F = Ctx.createFunction(&LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  sandboxir::Instruction *Add0 = &*It++;
+  sandboxir::Instruction *Add1 = &*It++;
+  sandboxir::Instruction *Ret = &*It++;
+
+  Ctx.save();
+  // Check erase.
+  Add1->eraseFromParent();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  EXPECT_EQ(Add0->getNumUses(), 0u);
+
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  EXPECT_EQ(Add1->getOperand(0), Add0);
+
+  // Same for the last instruction in the block.
+  Ctx.save();
+  Ret->eraseFromParent();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(It, BB->end());
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+}
+
+// TODO: Test multi-instruction patterns.
+TEST_F(TrackerTest, RemoveFromParent) {
+  parseIR(C, R"IR(
+define i32 @foo(i32 %arg) {
+  %add0 = add i32 %arg, %arg
+  %add1 = add i32 %add0, %arg
+  ret i32 %add1
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto *F = Ctx.createFunction(&LLVMF);
+  auto *Arg = F->getArg(0);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  sandboxir::Instruction *Add0 = &*It++;
+  sandboxir::Instruction *Add1 = &*It++;
+  sandboxir::Instruction *Ret = &*It++;
+
+  Ctx.save();
+  // Check removeFromParent().
+  Add1->removeFromParent();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  // Removed instruction still be connected to operands and users.
+  EXPECT_EQ(Add1->getOperand(0), Add0);
+  EXPECT_EQ(Add1->getOperand(1), Arg);
+  EXPECT_EQ(Add0->getNumUses(), 1u);
+
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  EXPECT_EQ(Add1->getOperand(0), Add0);
+
+  // Same for the last instruction in the block.
+  Ctx.save();
+  Ret->removeFromParent();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(It, BB->end());
+  EXPECT_EQ(Ret->getOperand(0), Add1);
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+}
+
+// TODO: Test multi-instruction patterns.
+TEST_F(TrackerTest, MoveInstr) {
+  parseIR(C, R"IR(
+define i32 @foo(i32 %arg) {
+  %add0 = add i32 %arg, %arg
+  %add1 = add i32 %add0, %arg
+  ret i32 %add1
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto *F = Ctx.createFunction(&LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  sandboxir::Instruction *Add0 = &*It++;
+  sandboxir::Instruction *Add1 = &*It++;
+  sandboxir::Instruction *Ret = &*It++;
+
+  // Check moveBefore(Instruction *) with tracking enabled.
+  Ctx.save();
+  Add1->moveBefore(Add0);
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+
+  // Same for the last instruction in the block.
+  Ctx.save();
+  Ret->moveBefore(Add0);
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(It, BB->end());
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+
+  // Check moveBefore(BasicBlock &, BasicBlock::iterator) with tracking enabled.
+  Ctx.save();
+  Add1->moveBefore(*BB, Add0->getIterator());
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+
+  // Same for the last instruction in the block.
+  Ctx.save();
+  Ret->moveBefore(*BB, Add0->getIterator());
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(It, BB->end());
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+
+  // Check moveAfter(Instruction *) with tracking enabled.
+  Ctx.save();
+  Add0->moveAfter(Add1);
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+
+  // Same for the last instruction in the block.
+  Ctx.save();
+  Ret->moveAfter(Add0);
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(It, BB->end());
+  // Check revert().
+  Ctx.revert();
+  It = BB->begin();
+  EXPECT_EQ(&*It++, Add0);
+  EXPECT_EQ(&*It++, Add1);
+  EXPECT_EQ(&*It++, Ret);
+  EXPECT_EQ(It, BB->end());
+}
+
+TEST_F(TrackerTest, CallBaseSetters) {
+  parseIR(C, R"IR(
+declare void @bar1(i8)
+declare void @bar2(i8)
+
+define void @foo(i8 %arg0, i8 %arg1) {
+  call void @bar1(i8 %arg0)
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto *F = Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg0 = F->getArg(ArgIdx++);
+  auto *Arg1 = F->getArg(ArgIdx++);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Call = cast<sandboxir::CallBase>(&*It++);
+  [[maybe_unused]] auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  // Check setArgOperand().
+  Ctx.save();
+  Call->setArgOperand(0, Arg1);
+  EXPECT_EQ(Call->getArgOperand(0), Arg1);
+  Ctx.revert();
+  EXPECT_EQ(Call->getArgOperand(0), Arg0);
+
+  auto *Bar1F = Call->getCalledFunction();
+  auto *Bar2F = Ctx.createFunction(M->getFunction("bar2"));
+
+  // Check setCalledOperand().
+  Ctx.save();
+  Call->setCalledOperand(Bar2F);
+  EXPECT_EQ(Call->getCalledOperand(), Bar2F);
+  Ctx.revert();
+  EXPECT_EQ(Call->getCalledOperand(), Bar1F);
+
+  // Check setCalledFunction().
+  Ctx.save();
+  Call->setCalledFunction(Bar2F);
+  EXPECT_EQ(Call->getCalledFunction(), Bar2F);
+  Ctx.revert();
+  EXPECT_EQ(Call->getCalledFunction(), Bar1F);
+}
+
+TEST_F(TrackerTest, InvokeSetters) {
+  parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+   invoke i8 @foo(i8 %arg) to label %normal_bb
+                       unwind label %exception_bb
+ normal_bb:
+   ret void
+ exception_bb:
+   ret void
+ other_bb:
+   ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *NormalBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "normal_bb")));
+  auto *ExceptionBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "exception_bb")));
+  auto *OtherBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+  auto It = BB0->begin();
+  auto *Invoke = cast<sandboxir::InvokeInst>(&*It++);
+
+  // Check setNormalDest().
+  Ctx.save();
+  Invoke->setNormalDest(OtherBB);
+  EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+
+  // Check setUnwindDest().
+  Ctx.save();
+  Invoke->setUnwindDest(OtherBB);
+  EXPECT_EQ(Invoke->getUnwindDest(), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+
+  // Check setSuccessor().
+  Ctx.save();
+  Invoke->setSuccessor(0, OtherBB);
+  EXPECT_EQ(Invoke->getSuccessor(0), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(Invoke->getSuccessor(0), NormalBB);
+
+  Ctx.save();
+  Invoke->setSuccessor(1, OtherBB);
+  EXPECT_EQ(Invoke->getSuccessor(1), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(Invoke->getSuccessor(1), ExceptionBB);
+}
+
+TEST_F(TrackerTest, CallBrSetters) {
+  parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+   callbr void @foo(i8 %arg)
+               to label %bb1 [label %bb2]
+ bb1:
+   ret void
+ bb2:
+   ret void
+ other_bb:
+   ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *OtherBB = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+  auto It = BB0->begin();
+  auto *CallBr = cast<sandboxir::CallBrInst>(&*It++);
+  // Check setDefaultDest().
+  Ctx.save();
+  auto *OrigDefaultDest = CallBr->getDefaultDest();
+  CallBr->setDefaultDest(OtherBB);
+  EXPECT_EQ(CallBr->getDefaultDest(), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(CallBr->getDefaultDest(), OrigDefaultDest);
+
+  // Check setIndirectDest().
+  Ctx.save();
+  auto *OrigIndirectDest = CallBr->getIndirectDest(0);
+  CallBr->setIndirectDest(0, OtherBB);
+  EXPECT_EQ(CallBr->getIndirectDest(0), OtherBB);
+  Ctx.revert();
+  EXPECT_EQ(CallBr->getIndirectDest(0), OrigIndirectDest);
+}
+
+TEST_F(TrackerTest, PHINodeSetters) {
+  parseIR(C, R"IR(
+define void @foo(i8 %arg0, i8 %arg1, i8 %arg2) {
+bb0:
+  br label %bb2
+
+bb1:
+  %phi = phi i8 [ %arg0, %bb0 ], [ %arg1, %bb1 ]
+  br label %bb1
+
+bb2:
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+  unsigned ArgIdx = 0;
+  auto *Arg0 = F.getArg(ArgIdx++);
+  auto *Arg1 = F.getArg(ArgIdx++);
+  auto *Arg2 = F.getArg(ArgIdx++);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+  auto *PHI = cast<sandboxir::PHINode>(&*BB1->begin());
+
+  // Check setIncomingValue().
+  Ctx.save();
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  PHI->setIncomingValue(0, Arg2);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg2);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check setIncomingBlock().
+  Ctx.save();
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  PHI->setIncomingBlock(0, BB2);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB2);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check addIncoming().
+  Ctx.save();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  PHI->addIncoming(Arg1, BB2);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 3u);
+  EXPECT_EQ(PHI->getIncomingBlock(2), BB2);
+  EXPECT_EQ(PHI->getIncomingValue(2), Arg1);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check removeIncomingValue(1).
+  Ctx.save();
+  PHI->removeIncomingValue(1);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check removeIncomingValue(0).
+  Ctx.save();
+  PHI->removeIncomingValue(0u);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg1);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check removeIncomingValue() remove all.
+  Ctx.save();
+  PHI->removeIncomingValue(0u);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg1);
+  PHI->removeIncomingValue(0u);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 0u);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check removeIncomingValue(BasicBlock *).
+  Ctx.save();
+  PHI->removeIncomingValue(BB1);
+  EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  Ctx.revert();
+  EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+  EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+  EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+  EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+  EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+}
diff --git a/llvm/unittests/Support/AllocatorTest.cpp b/llvm/unittests/Support/AllocatorTest.cpp
index b361b0aa72529..1069e436d0a16 100644
--- a/llvm/unittests/Support/AllocatorTest.cpp
+++ b/llvm/unittests/Support/AllocatorTest.cpp
@@ -208,6 +208,27 @@ TEST(AllocatorTest, TestSlowerSlabGrowthDelay) {
   EXPECT_EQ(SlabSize * GrowthDelay + SlabSize * 2, Alloc.getTotalMemory());
 }
 
+TEST(AllocatorTest, TestIdentifyObject) {
+  BumpPtrAllocator Alloc;
+
+  uint64_t *a = (uint64_t *)Alloc.Allocate(sizeof(uint64_t), alignof(uint64_t));
+  std::optional<int64_t> maybe_a_belongs = Alloc.identifyObject(a);
+  EXPECT_TRUE(maybe_a_belongs.has_value());
+  EXPECT_TRUE(*maybe_a_belongs >= 0);
+
+  uint64_t *b = nullptr;
+  std::optional<int64_t> maybe_b_belongs = Alloc.identifyObject(b);
+  EXPECT_FALSE(maybe_b_belongs);
+
+  // The default slab size is 4096 (or 512 uint64_t values). Custom slabs are
+  // allocated when the requested size is larger than the slab size.
+  uint64_t *c =
+      (uint64_t *)Alloc.Allocate(sizeof(uint64_t) * 1024, alignof(uint64_t));
+  std::optional<int64_t> maybe_c_belongs = Alloc.identifyObject(c);
+  EXPECT_TRUE(maybe_c_belongs.has_value());
+  EXPECT_TRUE(*maybe_c_belongs < 0);
+}
+
 // Mock slab allocator that returns slabs aligned on 4096 bytes.  There is no
 // easy portable way to do this, so this is kind of a hack.
 class MockSlabAllocator {
diff --git a/llvm/unittests/Support/ErrorTest.cpp b/llvm/unittests/Support/ErrorTest.cpp
index bd098a4988dc5..4cd9896c3b08e 100644
--- a/llvm/unittests/Support/ErrorTest.cpp
+++ b/llvm/unittests/Support/ErrorTest.cpp
@@ -930,6 +930,8 @@ TEST(Error, C_API) {
     });
   EXPECT_TRUE(GotCSE) << "Failed to round-trip ErrorList via C API";
   EXPECT_TRUE(GotCE) << "Failed to round-trip ErrorList via C API";
+
+  LLVMCantFail(wrap(Error::success()));
 }
 
 TEST(Error, FileErrorTest) {
diff --git a/llvm/unittests/Support/raw_socket_stream_test.cpp b/llvm/unittests/Support/raw_socket_stream_test.cpp
index c4e8cfbbe7e6a..348fb4bb3e089 100644
--- a/llvm/unittests/Support/raw_socket_stream_test.cpp
+++ b/llvm/unittests/Support/raw_socket_stream_test.cpp
@@ -62,17 +62,50 @@ TEST(raw_socket_streamTest, CLIENT_TO_SERVER_AND_SERVER_TO_CLIENT) {
   ssize_t BytesRead = Server.read(Bytes, 8);
 
   std::string string(Bytes, 8);
+  ASSERT_EQ(Server.has_error(), false);
 
   ASSERT_EQ(8, BytesRead);
   ASSERT_EQ("01234567", string);
 }
 
-TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) {
+TEST(raw_socket_streamTest, READ_WITH_TIMEOUT) {
   if (!hasUnixSocketSupport())
     GTEST_SKIP();
 
   SmallString<100> SocketPath;
-  llvm::sys::fs::createUniquePath("timout_provided.sock", SocketPath, true);
+  llvm::sys::fs::createUniquePath("read_with_timeout.sock", SocketPath, true);
+
+  // Make sure socket file does not exist. May still be there from the last test
+  std::remove(SocketPath.c_str());
+
+  Expected<ListeningSocket> MaybeServerListener =
+      ListeningSocket::createUnix(SocketPath);
+  ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded());
+  ListeningSocket ServerListener = std::move(*MaybeServerListener);
+
+  Expected<std::unique_ptr<raw_socket_stream>> MaybeClient =
+      raw_socket_stream::createConnectedUnix(SocketPath);
+  ASSERT_THAT_EXPECTED(MaybeClient, llvm::Succeeded());
+
+  Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
+      ServerListener.accept();
+  ASSERT_THAT_EXPECTED(MaybeServer, llvm::Succeeded());
+  raw_socket_stream &Server = **MaybeServer;
+
+  char Bytes[8];
+  ssize_t BytesRead = Server.read(Bytes, 8, std::chrono::milliseconds(100));
+  ASSERT_EQ(BytesRead, -1);
+  ASSERT_EQ(Server.has_error(), true);
+  ASSERT_EQ(Server.error(), std::errc::timed_out);
+  Server.clear_error();
+}
+
+TEST(raw_socket_streamTest, ACCEPT_WITH_TIMEOUT) {
+  if (!hasUnixSocketSupport())
+    GTEST_SKIP();
+
+  SmallString<100> SocketPath;
+  llvm::sys::fs::createUniquePath("accept_with_timeout.sock", SocketPath, true);
 
   // Make sure socket file does not exist. May still be there from the last test
   std::remove(SocketPath.c_str());
@@ -82,19 +115,19 @@ TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) {
   ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded());
   ListeningSocket ServerListener = std::move(*MaybeServerListener);
 
-  std::chrono::milliseconds Timeout = std::chrono::milliseconds(100);
   Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
-      ServerListener.accept(Timeout);
+      ServerListener.accept(std::chrono::milliseconds(100));
   ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
             std::errc::timed_out);
 }
 
-TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
+TEST(raw_socket_streamTest, ACCEPT_WITH_SHUTDOWN) {
   if (!hasUnixSocketSupport())
     GTEST_SKIP();
 
   SmallString<100> SocketPath;
-  llvm::sys::fs::createUniquePath("fd_closed.sock", SocketPath, true);
+  llvm::sys::fs::createUniquePath("accept_with_shutdown.sock", SocketPath,
+                                  true);
 
   // Make sure socket file does not exist. May still be there from the last test
   std::remove(SocketPath.c_str());
diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index ece288aaf06a0..5ac4edae5f0df 100644
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -29,7 +29,8 @@ TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) {
   auto *F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &Mod);
 
   MachineModuleInfo MMI(TM.get());
-  auto MF = std::make_unique<MachineFunction>(*F, *TM, ST, 42, MMI);
+  auto MF =
+      std::make_unique<MachineFunction>(*F, *TM, ST, MMI.getContext(), 42);
   auto *BB = MF->CreateMachineBasicBlock();
   MF->push_back(BB);
 
diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
index 7d4f3836088ab..86aa4753a2123 100644
--- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
+++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
@@ -62,7 +62,7 @@ class PALMetadata : public testing::Test {
                                         TM->getTargetCPU(),
                                         TM->getTargetFeatureString(), *TM);
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 1);
   }
 };
 
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 62cfda435a047..fe711619c6320 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -63,7 +63,7 @@ class RISCVInstrInfoTest : public testing::TestWithParam<const char *> {
         TM->getTargetFeatureString(),
         TM->getTargetTriple().isArch64Bit() ? "lp64" : "ilp32", 0, 0, *TM);
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 42, *MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 42);
   }
 };
 
diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp
index 61921a99e1711..f8dd1d3a60a00 100644
--- a/llvm/unittests/TargetParser/Host.cpp
+++ b/llvm/unittests/TargetParser/Host.cpp
@@ -536,6 +536,7 @@ TEST(HostTest, AIXHostCPUDetect) {
                        .Case("POWER 8\n", "pwr8")
                        .Case("POWER 9\n", "pwr9")
                        .Case("POWER 10\n", "pwr10")
+                       .Case("POWER 11\n", "pwr11")
                        .Default("unknown");
 
   StringRef HostCPU = sys::getHostCPUName();
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index e0a848351d06f..bede4e64696c5 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -892,7 +892,6 @@ R"(All available -march extensions for RISC-V
     za64rs               1.0
     zaamo                1.0
     zabha                1.0
-    zacas                1.0
     zalrsc               1.0
     zama16b              1.0
     zawrs                1.0
@@ -1028,6 +1027,7 @@ R"(All available -march extensions for RISC-V
 Experimental extensions
     zicfilp              1.0       This is a long dummy description
     zicfiss              1.0
+    zacas                1.0
     zalasr               0.1
     smmpm                1.0
     smnpm                1.0
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index f93dc3671197a..0aecfc64da208 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -1169,6 +1169,12 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(Triple::Serenity, T.getOS());
   EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
 
+  T = Triple("aarch64-unknown-linux-pauthtest");
+  EXPECT_EQ(Triple::aarch64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::Linux, T.getOS());
+  EXPECT_EQ(Triple::PAuthTest, T.getEnvironment());
+
   T = Triple("huh");
   EXPECT_EQ(Triple::UnknownArch, T.getArch());
 }
diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt
index 8a14a5b8e249e..35055baa05ee9 100644
--- a/llvm/unittests/Transforms/Utils/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt
@@ -18,7 +18,6 @@ add_llvm_unittest(UtilsTests
   CodeLayoutTest.cpp
   CodeMoverUtilsTest.cpp
   DebugifyTest.cpp
-  DXILResourceTest.cpp
   FunctionComparatorTest.cpp
   IntegerDivisionTest.cpp
   LocalTest.cpp
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index d9d6789134d88..9cf9060458bc9 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
                              &VecOp, false);
     VPValue EVL;
-    VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+    VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
     EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
     EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
     EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
@@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
   VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
                            &VecOp, false);
   VPValue EVL;
-  VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+  VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
   EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
   VPRecipeBase *BaseR = &EVLRecipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 1f4d45d81fd33..d8df7427836c9 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -48,15 +48,12 @@ class CodeGenDAGPatterns;
 using TreePatternNodePtr = IntrusiveRefCntPtr<TreePatternNode>;
 
 /// This represents a set of MVTs. Since the underlying type for the MVT
-/// is uint8_t, there are at most 256 values. To reduce the number of memory
+/// is uint16_t, there are at most 65536 values. To reduce the number of memory
 /// allocations and deallocations, represent the set as a sequence of bits.
 /// To reduce the allocations even further, make MachineValueTypeSet own
 /// the storage and use std::array as the bit container.
 struct MachineValueTypeSet {
-  static_assert(std::is_same<std::underlying_type_t<MVT::SimpleValueType>,
-                             uint8_t>::value,
-                "Change uint8_t here to the SimpleValueType's type");
-  static unsigned constexpr Capacity = std::numeric_limits<uint8_t>::max() + 1;
+  static unsigned constexpr Capacity = 512;
   using WordType = uint64_t;
   static unsigned constexpr WordWidth = CHAR_BIT * sizeof(WordType);
   static unsigned constexpr NumWords = Capacity / WordWidth;
@@ -84,9 +81,11 @@ struct MachineValueTypeSet {
   }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   unsigned count(MVT T) const {
+    assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
     return (Words[T.SimpleTy / WordWidth] >> (T.SimpleTy % WordWidth)) & 1;
   }
   std::pair<MachineValueTypeSet &, bool> insert(MVT T) {
+    assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
     bool V = count(T.SimpleTy);
     Words[T.SimpleTy / WordWidth] |= WordType(1) << (T.SimpleTy % WordWidth);
     return {*this, V};
@@ -98,6 +97,7 @@ struct MachineValueTypeSet {
   }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   void erase(MVT T) {
+    assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
     Words[T.SimpleTy / WordWidth] &= ~(WordType(1) << (T.SimpleTy % WordWidth));
   }
 
@@ -193,8 +193,6 @@ struct TypeSetByHwMode : public InfoByHwMode<MachineValueTypeSet> {
   TypeSetByHwMode &operator=(const TypeSetByHwMode &) = default;
   TypeSetByHwMode(MVT::SimpleValueType VT)
       : TypeSetByHwMode(ValueTypeByHwMode(VT)) {}
-  TypeSetByHwMode(ValueTypeByHwMode VT)
-      : TypeSetByHwMode(ArrayRef<ValueTypeByHwMode>(&VT, 1)) {}
   TypeSetByHwMode(ArrayRef<ValueTypeByHwMode> VTList);
 
   SetType &getOrCreate(unsigned Mode) { return Map[Mode]; }
@@ -264,7 +262,8 @@ struct TypeInfer {
   bool MergeInTypeInfo(TypeSetByHwMode &Out, MVT::SimpleValueType InVT) const {
     return MergeInTypeInfo(Out, TypeSetByHwMode(InVT));
   }
-  bool MergeInTypeInfo(TypeSetByHwMode &Out, ValueTypeByHwMode InVT) const {
+  bool MergeInTypeInfo(TypeSetByHwMode &Out,
+                       const ValueTypeByHwMode &InVT) const {
     return MergeInTypeInfo(Out, TypeSetByHwMode(InVT));
   }
 
@@ -841,7 +840,8 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
                       TreePattern &TP);
   bool UpdateNodeType(unsigned ResNo, MVT::SimpleValueType InTy,
                       TreePattern &TP);
-  bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, TreePattern &TP);
+  bool UpdateNodeType(unsigned ResNo, const ValueTypeByHwMode &InTy,
+                      TreePattern &TP);
 
   // Update node type with types inferred from an instruction operand or result
   // def from the ins/outs lists.
@@ -996,7 +996,7 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
 }
 
 inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
-                                            ValueTypeByHwMode InTy,
+                                            const ValueTypeByHwMode &InTy,
                                             TreePattern &TP) {
   TypeSetByHwMode VTS(InTy);
   TP.getInfer().expandOverloads(VTS);
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
index 3298965ab41d7..71bbeaaba9a91 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
@@ -218,7 +218,7 @@ void CheckChild2CondCodeMatcher::printImpl(raw_ostream &OS,
 }
 
 void CheckValueTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
-  OS.indent(indent) << "CheckValueType MVT::" << TypeName << '\n';
+  OS.indent(indent) << "CheckValueType " << getEnumName(VT) << '\n';
 }
 
 void CheckComplexPatMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
@@ -409,7 +409,7 @@ bool CheckChildIntegerMatcher::isContradictoryImpl(const Matcher *M) const {
 
 bool CheckValueTypeMatcher::isContradictoryImpl(const Matcher *M) const {
   if (const CheckValueTypeMatcher *CVT = dyn_cast<CheckValueTypeMatcher>(M))
-    return CVT->getTypeName() != getTypeName();
+    return CVT->getVT() != getVT();
   return false;
 }
 
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h
index d4fe513e2e967..1d78b93310f1c 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h
@@ -684,13 +684,13 @@ class CheckChild2CondCodeMatcher : public Matcher {
 /// CheckValueTypeMatcher - This checks to see if the current node is a
 /// VTSDNode with the specified type, if not it fails to match.
 class CheckValueTypeMatcher : public Matcher {
-  StringRef TypeName;
+  MVT::SimpleValueType VT;
 
 public:
-  CheckValueTypeMatcher(StringRef type_name)
-      : Matcher(CheckValueType), TypeName(type_name) {}
+  CheckValueTypeMatcher(MVT::SimpleValueType SimpleVT)
+      : Matcher(CheckValueType), VT(SimpleVT) {}
 
-  StringRef getTypeName() const { return TypeName; }
+  MVT::SimpleValueType getVT() const { return VT; }
 
   static bool classof(const Matcher *N) {
     return N->getKind() == CheckValueType;
@@ -699,7 +699,7 @@ class CheckValueTypeMatcher : public Matcher {
 private:
   void printImpl(raw_ostream &OS, unsigned indent) const override;
   bool isEqualImpl(const Matcher *M) const override {
-    return cast<CheckValueTypeMatcher>(M)->TypeName == TypeName;
+    return cast<CheckValueTypeMatcher>(M)->VT == VT;
   }
   bool isContradictoryImpl(const Matcher *M) const override;
 };
diff --git a/llvm/utils/TableGen/Common/GlobalISel/CodeExpander.h b/llvm/utils/TableGen/Common/GlobalISel/CodeExpander.h
index 0b1e6ceab52c2..345da613f8435 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/CodeExpander.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/CodeExpander.h
@@ -32,7 +32,7 @@ class raw_ostream;
 class CodeExpander {
   StringRef Code;
   const CodeExpansions &Expansions;
-  const ArrayRef<SMLoc> &Loc;
+  ArrayRef<SMLoc> Loc;
   bool ShowExpansions;
   StringRef Indent;
 
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index e011e78c67b15..bcba295442ad5 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -687,10 +687,6 @@ StringRef RuleMatcher::getOpcode() const {
   return Matchers.front()->getOpcode();
 }
 
-unsigned RuleMatcher::getNumOperands() const {
-  return Matchers.front()->getNumOperands();
-}
-
 LLTCodeGen RuleMatcher::getFirstConditionAsRootType() {
   InstructionMatcher &InsnMatcher = *Matchers.front();
   if (!InsnMatcher.predicates_empty())
@@ -1344,6 +1340,7 @@ std::string OperandMatcher::getOperandExpr(unsigned InsnVarID) const {
 unsigned OperandMatcher::getInsnVarID() const { return Insn.getInsnVarID(); }
 
 TempTypeIdx OperandMatcher::getTempTypeIdx(RuleMatcher &Rule) {
+  assert(!IsVariadic && "Cannot use this on variadic operands!");
   if (TTIdx >= 0) {
     // Temp type index not assigned yet, so assign one and add the necessary
     // predicate.
@@ -1506,8 +1503,20 @@ StringRef InstructionOpcodeMatcher::getOperandType(unsigned OpIdx) const {
 
 void InstructionNumOperandsMatcher::emitPredicateOpcodes(
     MatchTable &Table, RuleMatcher &Rule) const {
-  Table << MatchTable::Opcode("GIM_CheckNumOperands")
-        << MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
+  StringRef Opc;
+  switch (CK) {
+  case CheckKind::Eq:
+    Opc = "GIM_CheckNumOperands";
+    break;
+  case CheckKind::GE:
+    Opc = "GIM_CheckNumOperandsGE";
+    break;
+  case CheckKind::LE:
+    Opc = "GIM_CheckNumOperandsLE";
+    break;
+  }
+  Table << MatchTable::Opcode(Opc) << MatchTable::Comment("MI")
+        << MatchTable::ULEB128Value(InsnVarID)
         << MatchTable::Comment("Expected")
         << MatchTable::ULEB128Value(NumOperands) << MatchTable::LineBreak;
 }
@@ -1695,12 +1704,15 @@ void MIFlagsInstructionPredicateMatcher::emitPredicateOpcodes(
 
 OperandMatcher &
 InstructionMatcher::addOperand(unsigned OpIdx, const std::string &SymbolicName,
-                               unsigned AllocatedTemporariesBaseID) {
-  Operands.emplace_back(new OperandMatcher(*this, OpIdx, SymbolicName,
-                                           AllocatedTemporariesBaseID));
+                               unsigned AllocatedTemporariesBaseID,
+                               bool IsVariadic) {
+  assert(Operands.empty() ||
+         !Operands.back()->isVariadic() &&
+             "Cannot add more operands after a variadic operand");
+  Operands.emplace_back(new OperandMatcher(
+      *this, OpIdx, SymbolicName, AllocatedTemporariesBaseID, IsVariadic));
   if (!SymbolicName.empty())
     Rule.defineOperand(SymbolicName, *Operands.back());
-
   return *Operands.back();
 }
 
@@ -1726,9 +1738,10 @@ OperandMatcher &InstructionMatcher::addPhysRegInput(Record *Reg, unsigned OpIdx,
 
 void InstructionMatcher::emitPredicateOpcodes(MatchTable &Table,
                                               RuleMatcher &Rule) {
-  if (NumOperandsCheck)
-    InstructionNumOperandsMatcher(InsnVarID, getNumOperands())
+  if (canAddNumOperandsCheck()) {
+    InstructionNumOperandsMatcher(InsnVarID, getNumOperandMatchers())
         .emitPredicateOpcodes(Table, Rule);
+  }
 
   // First emit all instruction level predicates need to be verified before we
   // can verify operands.
@@ -1793,11 +1806,13 @@ void InstructionMatcher::optimize() {
 
   Stash.push_back(predicates_pop_front());
   if (Stash.back().get() == &OpcMatcher) {
-    if (NumOperandsCheck && OpcMatcher.isVariadicNumOperands() &&
-        getNumOperands() != 0)
-      Stash.emplace_back(
-          new InstructionNumOperandsMatcher(InsnVarID, getNumOperands()));
-    NumOperandsCheck = false;
+    // FIXME: Is this even needed still? Why the isVariadicNumOperands check?
+    if (canAddNumOperandsCheck() && OpcMatcher.isVariadicNumOperands() &&
+        getNumOperandMatchers() != 0) {
+      Stash.emplace_back(new InstructionNumOperandsMatcher(
+          InsnVarID, getNumOperandMatchers()));
+    }
+    AllowNumOpsCheck = false;
 
     for (auto &OM : Operands)
       for (auto &OP : OM->predicates())
@@ -1862,11 +1877,13 @@ OperandRenderer::~OperandRenderer() {}
 
 void CopyRenderer::emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule,
                                      unsigned NewInsnID, unsigned OldInsnID,
-                                     unsigned OpIdx, StringRef Name) {
-  if (NewInsnID == 0 && OldInsnID == 0) {
+                                     unsigned OpIdx, StringRef Name,
+                                     bool ForVariadic) {
+  if (!ForVariadic && NewInsnID == 0 && OldInsnID == 0) {
     Table << MatchTable::Opcode("GIR_RootToRootCopy");
   } else {
-    Table << MatchTable::Opcode("GIR_Copy") << MatchTable::Comment("NewInsnID")
+    Table << MatchTable::Opcode(ForVariadic ? "GIR_CopyRemaining" : "GIR_Copy")
+          << MatchTable::Comment("NewInsnID")
           << MatchTable::ULEB128Value(NewInsnID)
           << MatchTable::Comment("OldInsnID")
           << MatchTable::ULEB128Value(OldInsnID);
@@ -1880,8 +1897,9 @@ void CopyRenderer::emitRenderOpcodes(MatchTable &Table,
                                      RuleMatcher &Rule) const {
   const OperandMatcher &Operand = Rule.getOperandMatcher(SymbolicName);
   unsigned OldInsnVarID = Rule.getInsnVarID(Operand.getInstructionMatcher());
+
   emitRenderOpcodes(Table, Rule, NewInsnID, OldInsnVarID, Operand.getOpIdx(),
-                    SymbolicName);
+                    SymbolicName, Operand.isVariadic());
 }
 
 //===- CopyPhysRegRenderer ------------------------------------------------===//
@@ -2127,10 +2145,10 @@ void CustomOperandRenderer::emitRenderOpcodes(MatchTable &Table,
 
 bool BuildMIAction::canMutate(RuleMatcher &Rule,
                               const InstructionMatcher *Insn) const {
-  if (!Insn)
+  if (!Insn || Insn->hasVariadicMatcher())
     return false;
 
-  if (OperandRenderers.size() != Insn->getNumOperands())
+  if (OperandRenderers.size() != Insn->getNumOperandMatchers())
     return false;
 
   for (const auto &Renderer : enumerate(OperandRenderers)) {
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 9e345dceddf52..e3eb1633a0b29 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -641,6 +641,10 @@ class RuleMatcher : public Matcher {
     return make_range(actions_begin(), actions_end());
   }
 
+  bool hasOperand(StringRef SymbolicName) const {
+    return DefinedOperands.contains(SymbolicName);
+  }
+
   void defineOperand(StringRef SymbolicName, OperandMatcher &OM);
 
   void definePhysRegOperand(Record *Reg, OperandMatcher &OM);
@@ -678,7 +682,6 @@ class RuleMatcher : public Matcher {
   const PredicateMatcher &getFirstCondition() const override;
   LLTCodeGen getFirstConditionAsRootType();
   bool hasFirstCondition() const override;
-  unsigned getNumOperands() const;
   StringRef getOpcode() const;
 
   // FIXME: Remove this as soon as possible
@@ -1255,12 +1258,16 @@ class OperandMatcher : public PredicateListMatcher<OperandPredicateMatcher> {
 
   TempTypeIdx TTIdx = 0;
 
+  // TODO: has many implications, figure them all out
+  bool IsVariadic = false;
+
 public:
   OperandMatcher(InstructionMatcher &Insn, unsigned OpIdx,
                  const std::string &SymbolicName,
-                 unsigned AllocatedTemporariesBaseID)
+                 unsigned AllocatedTemporariesBaseID, bool IsVariadic = false)
       : Insn(Insn), OpIdx(OpIdx), SymbolicName(SymbolicName),
-        AllocatedTemporariesBaseID(AllocatedTemporariesBaseID) {}
+        AllocatedTemporariesBaseID(AllocatedTemporariesBaseID),
+        IsVariadic(IsVariadic) {}
 
   bool hasSymbolicName() const { return !SymbolicName.empty(); }
   StringRef getSymbolicName() const { return SymbolicName; }
@@ -1272,7 +1279,8 @@ class OperandMatcher : public PredicateListMatcher<OperandPredicateMatcher> {
   /// Construct a new operand predicate and add it to the matcher.
   template <class Kind, class... Args>
   std::optional<Kind *> addPredicate(Args &&...args) {
-    if (isSameAsAnotherOperand())
+    // TODO: Should variadic ops support predicates?
+    if (isSameAsAnotherOperand() || IsVariadic)
       return std::nullopt;
     Predicates.emplace_back(std::make_unique<Kind>(
         getInsnVarID(), getOpIdx(), std::forward<Args>(args)...));
@@ -1282,6 +1290,8 @@ class OperandMatcher : public PredicateListMatcher<OperandPredicateMatcher> {
   unsigned getOpIdx() const { return OpIdx; }
   unsigned getInsnVarID() const;
 
+  bool isVariadic() const { return IsVariadic; }
+
   /// If this OperandMatcher has not been assigned a TempTypeIdx yet, assigns it
   /// one and adds a `RecordRegisterType` predicate to this matcher. If one has
   /// already been assigned, simply returns it.
@@ -1405,20 +1415,28 @@ class InstructionOpcodeMatcher : public InstructionPredicateMatcher {
 };
 
 class InstructionNumOperandsMatcher final : public InstructionPredicateMatcher {
+public:
+  enum class CheckKind { Eq, LE, GE };
+
+private:
   unsigned NumOperands = 0;
+  CheckKind CK;
 
 public:
-  InstructionNumOperandsMatcher(unsigned InsnVarID, unsigned NumOperands)
+  InstructionNumOperandsMatcher(unsigned InsnVarID, unsigned NumOperands,
+                                CheckKind CK = CheckKind::Eq)
       : InstructionPredicateMatcher(IPM_NumOperands, InsnVarID),
-        NumOperands(NumOperands) {}
+        NumOperands(NumOperands), CK(CK) {}
 
   static bool classof(const PredicateMatcher *P) {
     return P->getKind() == IPM_NumOperands;
   }
 
   bool isIdentical(const PredicateMatcher &B) const override {
-    return InstructionPredicateMatcher::isIdentical(B) &&
-           NumOperands == cast<InstructionNumOperandsMatcher>(&B)->NumOperands;
+    if (!InstructionPredicateMatcher::isIdentical(B))
+      return false;
+    const auto &Other = *cast<InstructionNumOperandsMatcher>(&B);
+    return NumOperands == Other.NumOperands && CK == Other.CK;
   }
 
   void emitPredicateOpcodes(MatchTable &Table,
@@ -1729,20 +1747,32 @@ class InstructionMatcher final : public PredicateListMatcher<PredicateMatcher> {
   /// The operands to match. All rendered operands must be present even if the
   /// condition is always true.
   OperandVec Operands;
-  bool NumOperandsCheck = true;
 
   std::string SymbolicName;
   unsigned InsnVarID;
+  bool AllowNumOpsCheck;
 
   /// PhysRegInputs - List list has an entry for each explicitly specified
   /// physreg input to the pattern.  The first elt is the Register node, the
   /// second is the recorded slot number the input pattern match saved it in.
   SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs;
 
+  bool canAddNumOperandsCheck() const {
+    // Add if it's allowed, and:
+    //    - We don't have a variadic operand
+    //    - We don't already have such a check.
+    return AllowNumOpsCheck && !hasVariadicMatcher() &&
+           none_of(Predicates, [&](const auto &P) {
+             return P->getKind() ==
+                    InstructionPredicateMatcher::IPM_NumOperands;
+           });
+  }
+
 public:
   InstructionMatcher(RuleMatcher &Rule, StringRef SymbolicName,
-                     bool NumOpsCheck = true)
-      : Rule(Rule), NumOperandsCheck(NumOpsCheck), SymbolicName(SymbolicName) {
+                     bool AllowNumOpsCheck = true)
+      : Rule(Rule), SymbolicName(SymbolicName),
+        AllowNumOpsCheck(AllowNumOpsCheck) {
     // We create a new instruction matcher.
     // Get a new ID for that instruction.
     InsnVarID = Rule.implicitlyDefineInsnVar(*this);
@@ -1762,7 +1792,8 @@ class InstructionMatcher final : public PredicateListMatcher<PredicateMatcher> {
 
   /// Add an operand to the matcher.
   OperandMatcher &addOperand(unsigned OpIdx, const std::string &SymbolicName,
-                             unsigned AllocatedTemporariesBaseID);
+                             unsigned AllocatedTemporariesBaseID,
+                             bool IsVariadic = false);
   OperandMatcher &getOperand(unsigned OpIdx);
   OperandMatcher &addPhysRegInput(Record *Reg, unsigned OpIdx,
                                   unsigned TempOpIdx);
@@ -1772,7 +1803,12 @@ class InstructionMatcher final : public PredicateListMatcher<PredicateMatcher> {
   }
 
   StringRef getSymbolicName() const { return SymbolicName; }
-  unsigned getNumOperands() const { return Operands.size(); }
+
+  unsigned getNumOperandMatchers() const { return Operands.size(); }
+  bool hasVariadicMatcher() const {
+    return !Operands.empty() && Operands.back()->isVariadic();
+  }
+
   OperandVec::iterator operands_begin() { return Operands.begin(); }
   OperandVec::iterator operands_end() { return Operands.end(); }
   iterator_range<OperandVec::iterator> operands() {
@@ -1834,9 +1870,10 @@ class InstructionOperandMatcher : public OperandPredicateMatcher {
 public:
   InstructionOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
                             RuleMatcher &Rule, StringRef SymbolicName,
-                            bool NumOpsCheck = true)
+                            bool AllowNumOpsCheck = true)
       : OperandPredicateMatcher(OPM_Instruction, InsnVarID, OpIdx),
-        InsnMatcher(new InstructionMatcher(Rule, SymbolicName, NumOpsCheck)),
+        InsnMatcher(
+            new InstructionMatcher(Rule, SymbolicName, AllowNumOpsCheck)),
         Flags(Rule.getGISelFlags()) {}
 
   static bool classof(const PredicateMatcher *P) {
@@ -1917,7 +1954,8 @@ class CopyRenderer : public OperandRenderer {
 
   static void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule,
                                 unsigned NewInsnID, unsigned OldInsnID,
-                                unsigned OpIdx, StringRef Name);
+                                unsigned OpIdx, StringRef Name,
+                                bool ForVariadic = false);
 
   void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override;
 };
diff --git a/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp b/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp
index 28a9bcc2568c3..52f7b0fcbd624 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp
@@ -46,6 +46,33 @@ std::optional<PatternType> PatternType::get(ArrayRef<SMLoc> DiagLoc,
     return PT;
   }
 
+  if (R->isSubClassOf(VariadicClassName)) {
+    const int64_t Min = R->getValueAsInt("MinArgs");
+    const int64_t Max = R->getValueAsInt("MaxArgs");
+
+    if (Min == 0) {
+      PrintError(
+          DiagLoc,
+          DiagCtx +
+              ": minimum number of arguments must be greater than zero in " +
+              VariadicClassName);
+      return std::nullopt;
+    }
+
+    if (Max <= Min && Max != 0) {
+      PrintError(DiagLoc, DiagCtx + ": maximum number of arguments (" +
+                              Twine(Max) +
+                              ") must be zero, or greater "
+                              "than the minimum number of arguments (" +
+                              Twine(Min) + ") in " + VariadicClassName);
+      return std::nullopt;
+    }
+
+    PatternType PT(PT_VariadicPack);
+    PT.Data.VPTI = {unsigned(Min), unsigned(Max)};
+    return PT;
+  }
+
   PrintError(DiagLoc, DiagCtx + ": unknown type '" + R->getName() + "'");
   return std::nullopt;
 }
@@ -66,6 +93,11 @@ const Record *PatternType::getLLTRecord() const {
   return Data.Def;
 }
 
+VariadicPackTypeInfo PatternType::getVariadicPackTypeInfo() const {
+  assert(isVariadicPack());
+  return Data.VPTI;
+}
+
 bool PatternType::operator==(const PatternType &Other) const {
   if (Kind != Other.Kind)
     return false;
@@ -77,6 +109,8 @@ bool PatternType::operator==(const PatternType &Other) const {
     return Data.Def == Other.Data.Def;
   case PT_TypeOf:
     return Data.Str == Other.Data.Str;
+  case PT_VariadicPack:
+    return Data.VPTI == Other.Data.VPTI;
   }
 
   llvm_unreachable("Unknown Type Kind");
@@ -90,6 +124,10 @@ std::string PatternType::str() const {
     return Data.Def->getName().str();
   case PT_TypeOf:
     return (TypeOfClassName + "<$" + getTypeOfOpName() + ">").str();
+  case PT_VariadicPack:
+    return (VariadicClassName + "<" + Twine(Data.VPTI.Min) + "," +
+            Twine(Data.VPTI.Max) + ">")
+        .str();
   }
 
   llvm_unreachable("Unknown type!");
@@ -525,6 +563,7 @@ bool PatFrag::checkSemantics() {
       case Pattern::K_CXX:
         continue;
       case Pattern::K_CodeGenInstruction:
+        // TODO: Allow VarArgs?
         if (cast<CodeGenInstructionPattern>(Pat.get())->diagnoseAllSpecialTypes(
                 Def.getLoc(), PatternType::SpecialTyClassName +
                                   " is not supported in " + ClassName))
diff --git a/llvm/utils/TableGen/Common/GlobalISel/Patterns.h b/llvm/utils/TableGen/Common/GlobalISel/Patterns.h
index 76d018bdbd71c..2d25ce37ed76c 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/Patterns.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/Patterns.h
@@ -45,21 +45,36 @@ class RuleMatcher;
 
 //===- PatternType --------------------------------------------------------===//
 
+struct VariadicPackTypeInfo {
+  VariadicPackTypeInfo(unsigned Min, unsigned Max) : Min(Min), Max(Max) {
+    assert(Min >= 1 && (Max >= Min || Max == 0));
+  }
+
+  bool operator==(const VariadicPackTypeInfo &Other) const {
+    return Min == Other.Min && Max == Other.Max;
+  }
+
+  unsigned Min;
+  unsigned Max;
+};
+
 /// Represent the type of a Pattern Operand.
 ///
 /// Types have two form:
 ///   - LLTs, which are straightforward.
-///   - Special types, e.g. GITypeOf
+///   - Special types, e.g. GITypeOf, Variadic arguments list.
 class PatternType {
 public:
   static constexpr StringLiteral SpecialTyClassName = "GISpecialType";
   static constexpr StringLiteral TypeOfClassName = "GITypeOf";
+  static constexpr StringLiteral VariadicClassName = "GIVariadic";
 
   enum PTKind : uint8_t {
     PT_None,
 
     PT_ValueType,
     PT_TypeOf,
+    PT_VariadicPack,
   };
 
   PatternType() : Kind(PT_None), Data() {}
@@ -70,11 +85,15 @@ class PatternType {
 
   bool isNone() const { return Kind == PT_None; }
   bool isLLT() const { return Kind == PT_ValueType; }
-  bool isSpecial() const { return isTypeOf(); }
+  bool isSpecial() const { return isTypeOf() || isVariadicPack(); }
   bool isTypeOf() const { return Kind == PT_TypeOf; }
+  bool isVariadicPack() const { return Kind == PT_VariadicPack; }
+
+  PTKind getKind() const { return Kind; }
 
   StringRef getTypeOfOpName() const;
   const Record *getLLTRecord() const;
+  VariadicPackTypeInfo getVariadicPackTypeInfo() const;
 
   explicit operator bool() const { return !isNone(); }
 
@@ -95,6 +114,9 @@ class PatternType {
 
     /// PT_TypeOf -> Operand name (without the '$')
     StringRef Str;
+
+    /// PT_VariadicPack -> min-max number of operands allowed.
+    VariadicPackTypeInfo VPTI;
   } Data;
 };
 
@@ -313,6 +335,8 @@ class InstructionPattern : public Pattern {
   InstructionOperand &getOperand(unsigned K) { return Operands[K]; }
   const InstructionOperand &getOperand(unsigned K) const { return Operands[K]; }
 
+  const InstructionOperand &operands_back() const { return Operands.back(); }
+
   /// When this InstructionPattern is used as the match root, returns the
   /// operands that must be redefined in the 'apply' pattern for the rule to be
   /// valid.
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index ff508d6487333..1b93e3d5e3b70 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -339,7 +339,8 @@ unsigned MatcherTableEmitter::SizeMatcher(Matcher *N, raw_ostream &OS) {
         Size += 2; // Count the child's opcode.
       } else {
         Child = cast<SwitchTypeMatcher>(N)->getCaseMatcher(i);
-        ++Size; // Count the child's type.
+        Size += GetVBRSize(cast<SwitchTypeMatcher>(N)->getCaseType(
+            i)); // Count the child's type.
       }
       const unsigned ChildSize = SizeMatcherList(Child, OS);
       assert(ChildSize != 0 && "Matcher cannot have child of size 0");
@@ -599,7 +600,8 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
         IdxSize = 2; // size of opcode in table is 2 bytes.
       } else {
         Child = cast<SwitchTypeMatcher>(N)->getCaseMatcher(i);
-        IdxSize = 1; // size of type in table is 1 byte.
+        IdxSize = GetVBRSize(cast<SwitchTypeMatcher>(N)->getCaseType(
+            i)); // size of type in table is sizeof(VBR(MVT)) byte.
       }
 
       if (i != 0) {
@@ -615,8 +617,13 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       CurrentIdx += EmitVBRValue(ChildSize, OS) + IdxSize;
       if (const SwitchOpcodeMatcher *SOM = dyn_cast<SwitchOpcodeMatcher>(N))
         OS << "TARGET_VAL(" << SOM->getCaseOpcode(i).getEnumName() << "),";
-      else
-        OS << getEnumName(cast<SwitchTypeMatcher>(N)->getCaseType(i)) << ',';
+      else {
+        if (!OmitComments)
+          OS << "/*" << getEnumName(cast<SwitchTypeMatcher>(N)->getCaseType(i))
+             << "*/";
+        EmitVBRValue(cast<SwitchTypeMatcher>(N)->getCaseType(i),
+                     OS);
+      }
       if (!OmitComments)
         OS << "// ->" << CurrentIdx + ChildSize;
       OS << '\n';
@@ -639,7 +646,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
     return CurrentIdx - StartIdx + 1;
   }
 
-  case Matcher::CheckType:
+  case Matcher::CheckType: {
     if (cast<CheckTypeMatcher>(N)->getResNo() == 0) {
       MVT::SimpleValueType VT = cast<CheckTypeMatcher>(N)->getType();
       switch (VT) {
@@ -648,13 +655,21 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
         OS << "OPC_CheckTypeI" << MVT(VT).getSizeInBits() << ",\n";
         return 1;
       default:
-        OS << "OPC_CheckType, " << getEnumName(VT) << ",\n";
-        return 2;
+        OS << "OPC_CheckType, ";
+        if (!OmitComments)
+          OS << "/*" << getEnumName(VT) << "*/";
+        unsigned NumBytes = EmitVBRValue(VT, OS);
+        OS << "\n";
+        return NumBytes + 1;
       }
     }
-    OS << "OPC_CheckTypeRes, " << cast<CheckTypeMatcher>(N)->getResNo() << ", "
-       << getEnumName(cast<CheckTypeMatcher>(N)->getType()) << ",\n";
-    return 3;
+    OS << "OPC_CheckTypeRes, " << cast<CheckTypeMatcher>(N)->getResNo() << ", ";
+    if (!OmitComments)
+      OS << "/*" << getEnumName(cast<CheckTypeMatcher>(N)->getType()) << "*/";
+    unsigned NumBytes = EmitVBRValue(cast<CheckTypeMatcher>(N)->getType(), OS);
+    OS << "\n";
+    return NumBytes + 2;
+  }
 
   case Matcher::CheckChildType: {
     MVT::SimpleValueType VT = cast<CheckChildTypeMatcher>(N)->getType();
@@ -666,8 +681,12 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       return 1;
     default:
       OS << "OPC_CheckChild" << cast<CheckChildTypeMatcher>(N)->getChildNo()
-         << "Type, " << getEnumName(VT) << ",\n";
-      return 2;
+         << "Type, ";
+      if (!OmitComments)
+        OS << "/*" << getEnumName(VT) << "*/";
+      unsigned NumBytes = EmitVBRValue(VT, OS);
+      OS << "\n";
+      return NumBytes + 1;
     }
   }
 
@@ -696,10 +715,16 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
        << cast<CheckChild2CondCodeMatcher>(N)->getCondCodeName() << ",\n";
     return 2;
 
-  case Matcher::CheckValueType:
-    OS << "OPC_CheckValueType, MVT::"
-       << cast<CheckValueTypeMatcher>(N)->getTypeName() << ",\n";
-    return 2;
+  case Matcher::CheckValueType: {
+    OS << "OPC_CheckValueType, ";
+    if (!OmitComments)
+      OS << "/*" << getEnumName(cast<CheckValueTypeMatcher>(N)->getVT())
+         << "*/";
+    unsigned NumBytes =
+        EmitVBRValue(cast<CheckValueTypeMatcher>(N)->getVT(), OS);
+    OS << "\n";
+    return NumBytes + 1;
+  }
 
   case Matcher::CheckComplexPat: {
     const CheckComplexPatMatcher *CCPM = cast<CheckComplexPatMatcher>(N);
@@ -766,8 +791,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       OS << "OPC_EmitInteger" << MVT(VT).getSizeInBits() << ", ";
       break;
     default:
-      OpBytes = 2;
-      OS << "OPC_EmitInteger, " << getEnumName(VT) << ", ";
+      OS << "OPC_EmitInteger, ";
+      if (!OmitComments)
+        OS << "/*" << getEnumName(VT) << "*/";
+      OpBytes = EmitVBRValue(VT, OS) + 1;
       break;
     }
     unsigned Bytes = OpBytes + EmitSignedVBRValue(Val, OS);
@@ -785,8 +812,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       OS << "OPC_EmitStringInteger" << MVT(VT).getSizeInBits() << ", ";
       break;
     default:
-      OpBytes = 2;
-      OS << "OPC_EmitStringInteger, " << getEnumName(VT) << ", ";
+      OS << "OPC_EmitStringInteger, ";
+      if (!OmitComments)
+        OS << "/*" << getEnumName(VT) << "*/";
+      OpBytes = EmitVBRValue(VT, OS) + 1;
       break;
     }
     OS << Val << ",\n";
@@ -797,14 +826,17 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
     const EmitRegisterMatcher *Matcher = cast<EmitRegisterMatcher>(N);
     const CodeGenRegister *Reg = Matcher->getReg();
     MVT::SimpleValueType VT = Matcher->getVT();
+    unsigned OpBytes;
     // If the enum value of the register is larger than one byte can handle,
     // use EmitRegister2.
     if (Reg && Reg->EnumValue > 255) {
-      OS << "OPC_EmitRegister2, " << getEnumName(VT) << ", ";
+      OS << "OPC_EmitRegister2, ";
+      if (!OmitComments)
+        OS << "/*" << getEnumName(VT) << "*/";
+      OpBytes = EmitVBRValue(VT, OS);
       OS << "TARGET_VAL(" << getQualifiedName(Reg->TheDef) << "),\n";
-      return 4;
+      return OpBytes + 3;
     }
-    unsigned OpBytes;
     switch (VT) {
     case MVT::i32:
     case MVT::i64:
@@ -812,8 +844,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       OS << "OPC_EmitRegisterI" << MVT(VT).getSizeInBits() << ", ";
       break;
     default:
-      OpBytes = 2;
-      OS << "OPC_EmitRegister, " << getEnumName(VT) << ", ";
+      OS << "OPC_EmitRegister, ";
+      if (!OmitComments)
+        OS << "/*" << getEnumName(VT) << "*/";
+      OpBytes = EmitVBRValue(VT, OS) + 1;
       break;
     }
     if (Reg) {
@@ -958,8 +992,12 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
         OS << "/*#VTs*/";
       OS << ", ";
     }
-    for (unsigned i = 0, e = EN->getNumVTs(); i != e; ++i)
-      OS << getEnumName(EN->getVT(i)) << ", ";
+    unsigned NumTypeBytes = 0;
+    for (unsigned i = 0, e = EN->getNumVTs(); i != e; ++i) {
+      if (!OmitComments)
+        OS << "/*" << getEnumName(EN->getVT(i)) << "*/";
+      NumTypeBytes += EmitVBRValue(EN->getVT(i), OS);
+    }
 
     OS << EN->getNumOperands();
     if (!OmitComments)
@@ -992,7 +1030,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
     } else
       OS << '\n';
 
-    return 4 + !CompressVTs + !CompressNodeInfo + EN->getNumVTs() +
+    return 4 + !CompressVTs + !CompressNodeInfo + NumTypeBytes +
            NumOperandBytes + NumCoveredBytes;
   }
   case Matcher::CompleteMatch: {
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 99babdf07316f..db3fe8d2993f2 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -235,7 +235,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) {
     if (N.hasName())
       return;
     // An unnamed ValueType as in (sext_inreg GPR:$foo, i8).
-    return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName()));
+    return AddMatcher(new CheckValueTypeMatcher(llvm::getValueType(LeafRec)));
   }
 
   if ( // Handle register references.  Nothing to do here, they always match.
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index 0439df8067ede..8e54ead5ac484 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -16,48 +16,37 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGenTypes/MachineValueType.h"
 #include "llvm/Support/DXILABI.h"
+#include "llvm/Support/VersionTuple.h"
+#include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
+
 #include <string>
+#include <vector>
 
 using namespace llvm;
 using namespace llvm::dxil;
 
 namespace {
 
-struct DXILShaderModel {
-  int Major = 0;
-  int Minor = 0;
-};
-
 struct DXILOperationDesc {
   std::string OpName; // name of DXIL operation
   int OpCode;         // ID of DXIL operation
   StringRef OpClass;  // name of the opcode class
   StringRef Doc;      // the documentation description of this instruction
-  SmallVector<Record *> OpTypes; // Vector of operand type records -
-                                 // return type is at index 0
-  SmallVector<std::string>
-      OpAttributes;     // operation attribute represented as strings
-  StringRef Intrinsic;  // The llvm intrinsic map to OpName. Default is "" which
-                        // means no map exists
-  bool IsDeriv = false; // whether this is some kind of derivative
-  bool IsGradient = false; // whether this requires a gradient calculation
-  bool IsFeedback = false; // whether this is a sampler feedback op
-  bool IsWave =
-      false; // whether this requires in-wave, cross-lane functionality
-  bool RequiresUniformInputs = false; // whether this operation requires that
-                                      // all of its inputs are uniform across
-                                      // the wave
+  // Vector of operand type records - return type is at index 0
+  SmallVector<Record *> OpTypes;
+  SmallVector<Record *> OverloadRecs;
+  SmallVector<Record *> StageRecs;
+  SmallVector<Record *> AttrRecs;
+  StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which
+                       // means no map exists
   SmallVector<StringRef, 4>
       ShaderStages; // shader stages to which this applies, empty for all.
-  DXILShaderModel ShaderModel;           // minimum shader model required
-  DXILShaderModel ShaderModelTranslated; // minimum shader model required with
-                                         // translation by linker
   int OverloadParamIndex;             // Index of parameter with overload type.
                                       //   -1 : no overload types
   SmallVector<StringRef, 4> counters; // counters for this inst.
@@ -91,18 +80,32 @@ static ParameterKind getParameterKind(const Record *R) {
     return ParameterKind::I32;
   case MVT::fAny:
   case MVT::iAny:
+  case MVT::Any:
     return ParameterKind::Overload;
-  case MVT::Other:
-    // Handle DXIL-specific overload types
-    if (R->getValueAsInt("isHalfOrFloat") || R->getValueAsInt("isI16OrI32")) {
-      return ParameterKind::Overload;
-    }
-    [[fallthrough]];
   default:
-    llvm_unreachable("Support for specified DXIL Type not yet implemented");
+    llvm_unreachable(
+        "Support for specified parameter type not yet implemented");
   }
 }
 
+/// In-place sort TableGen records of class with a field
+///    Version dxil_version
+/// in the ascending version order.
+static void AscendingSortByVersion(std::vector<Record *> &Recs) {
+  std::sort(Recs.begin(), Recs.end(), [](Record *RecA, Record *RecB) {
+    unsigned RecAMaj =
+        RecA->getValueAsDef("dxil_version")->getValueAsInt("Major");
+    unsigned RecAMin =
+        RecA->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+    unsigned RecBMaj =
+        RecB->getValueAsDef("dxil_version")->getValueAsInt("Major");
+    unsigned RecBMin =
+        RecB->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+
+    return (VersionTuple(RecAMaj, RecAMin) < VersionTuple(RecBMaj, RecBMin));
+  });
+}
+
 /// Construct an object using the DXIL Operation records specified
 /// in DXIL.td. This serves as the single source of reference of
 /// the information extracted from the specified Record R, for
@@ -113,9 +116,15 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
   OpCode = R->getValueAsInt("OpCode");
 
   Doc = R->getValueAsString("Doc");
+  SmallVector<Record *> ParamTypeRecs;
+
+  ParamTypeRecs.push_back(R->getValueAsDef("result"));
 
-  auto TypeRecs = R->getValueAsListOfDefs("OpTypes");
-  unsigned TypeRecsSize = TypeRecs.size();
+  std::vector<Record *> ArgTys = R->getValueAsListOfDefs("arguments");
+  for (auto Ty : ArgTys) {
+    ParamTypeRecs.push_back(Ty);
+  }
+  size_t ParamTypeRecsSize = ParamTypeRecs.size();
   // Populate OpTypes with return type and parameter types
 
   // Parameter indices of overloaded parameters.
@@ -124,30 +133,23 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
   // the comment before the definition of class LLVMMatchType in
   // llvm/IR/Intrinsics.td
   SmallVector<int> OverloadParamIndices;
-  for (unsigned i = 0; i < TypeRecsSize; i++) {
-    auto TR = TypeRecs[i];
+  for (unsigned i = 0; i < ParamTypeRecsSize; i++) {
+    auto TR = ParamTypeRecs[i];
     // Track operation parameter indices of any overload types
     auto isAny = TR->getValueAsInt("isAny");
     if (isAny == 1) {
-      // TODO: At present it is expected that all overload types in a DXIL Op
-      // are of the same type. Hence, OverloadParamIndices will have only one
-      // element. This implies we do not need a vector. However, until more
-      // (all?) DXIL Ops are added in DXIL.td, a vector is being used to flag
-      // cases this assumption would not hold.
+      // All overload types in a DXIL Op are required to be of the same type.
       if (!OverloadParamIndices.empty()) {
-        bool knownType = true;
+        [[maybe_unused]] bool knownType = true;
         // Ensure that the same overload type registered earlier is being used
         for (auto Idx : OverloadParamIndices) {
-          if (TR != TypeRecs[Idx]) {
+          if (TR != ParamTypeRecs[Idx]) {
             knownType = false;
             break;
           }
         }
-        if (!knownType) {
-          report_fatal_error("Specification of multiple differing overload "
-                             "parameter types not yet supported",
-                             false);
-        }
+        assert(knownType && "Specification of multiple differing overload "
+                            "parameter types not yet supported");
       } else {
         OverloadParamIndices.push_back(i);
       }
@@ -160,7 +162,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
       // Get the parameter index of anonymous type, TR, references
       auto OLParamIndex = TR->getValueAsInt("Number");
       // Resolve and insert the type to that at OLParamIndex
-      OpTypes.emplace_back(TypeRecs[OLParamIndex]);
+      OpTypes.emplace_back(ParamTypeRecs[OLParamIndex]);
     } else {
       // A non-anonymous type. Just record it in OpTypes
       OpTypes.emplace_back(TR);
@@ -170,28 +172,62 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
   // Set the index of the overload parameter, if any.
   OverloadParamIndex = -1; // default; indicating none
   if (!OverloadParamIndices.empty()) {
-    if (OverloadParamIndices.size() > 1)
-      report_fatal_error("Multiple overload type specification not supported",
-                         false);
+    assert(OverloadParamIndices.size() == 1 &&
+           "Multiple overload type specification not supported");
     OverloadParamIndex = OverloadParamIndices[0];
   }
+
+  // Get overload records
+  std::vector<Record *> Recs = R->getValueAsListOfDefs("overloads");
+
+  // Sort records in ascending order of DXIL version
+  AscendingSortByVersion(Recs);
+
+  for (Record *CR : Recs) {
+    OverloadRecs.push_back(CR);
+  }
+
+  // Get stage records
+  Recs = R->getValueAsListOfDefs("stages");
+
+  if (Recs.empty()) {
+    PrintFatalError(R, Twine("Atleast one specification of valid stage for ") +
+                           OpName + " is required");
+  }
+
+  // Sort records in ascending order of DXIL version
+  AscendingSortByVersion(Recs);
+
+  for (Record *CR : Recs) {
+    StageRecs.push_back(CR);
+  }
+
+  // Get attribute records
+  Recs = R->getValueAsListOfDefs("attributes");
+
+  // Sort records in ascending order of DXIL version
+  AscendingSortByVersion(Recs);
+
+  for (Record *CR : Recs) {
+    AttrRecs.push_back(CR);
+  }
+
   // Get the operation class
   OpClass = R->getValueAsDef("OpClass")->getName();
 
-  if (R->getValue("LLVMIntrinsic")) {
-    auto *IntrinsicDef = R->getValueAsDef("LLVMIntrinsic");
-    auto DefName = IntrinsicDef->getName();
-    assert(DefName.starts_with("int_") && "invalid intrinsic name");
-    // Remove the int_ from intrinsic name.
-    Intrinsic = DefName.substr(4);
-    // TODO: For now, assume that attributes of DXIL Operation are the same as
-    // that of the intrinsic. Deviations are expected to be encoded in TableGen
-    // record specification and handled accordingly here. Support to be added
-    // as needed.
-    auto IntrPropList = IntrinsicDef->getValueAsListInit("IntrProperties");
-    auto IntrPropListSize = IntrPropList->size();
-    for (unsigned i = 0; i < IntrPropListSize; i++) {
-      OpAttributes.emplace_back(IntrPropList->getElement(i)->getAsString());
+  if (!OpClass.str().compare("UnknownOpClass")) {
+    PrintFatalError(R, Twine("Unspecified DXIL OpClass for DXIL operation - ") +
+                           OpName);
+  }
+
+  const RecordVal *RV = R->getValue("LLVMIntrinsic");
+  if (RV && RV->getValue()) {
+    if (DefInit *DI = dyn_cast<DefInit>(RV->getValue())) {
+      auto *IntrinsicDef = DI->getDef();
+      auto DefName = IntrinsicDef->getName();
+      assert(DefName.starts_with("int_") && "invalid intrinsic name");
+      // Remove the int_ from intrinsic name.
+      Intrinsic = DefName.substr(4);
     }
   }
 }
@@ -239,10 +275,8 @@ static std::string getParameterKindStr(ParameterKind Kind) {
 /// \return std::string string representation of OverloadKind
 
 static std::string getOverloadKindStr(const Record *R) {
-  auto VTRec = R->getValueAsDef("VT");
+  Record *VTRec = R->getValueAsDef("VT");
   switch (getValueType(VTRec)) {
-  case MVT::isVoid:
-    return "OverloadKind::VOID";
   case MVT::f16:
     return "OverloadKind::HALF";
   case MVT::f32:
@@ -259,57 +293,163 @@ static std::string getOverloadKindStr(const Record *R) {
     return "OverloadKind::I32";
   case MVT::i64:
     return "OverloadKind::I64";
-  case MVT::iAny:
-    return "OverloadKind::I16 | OverloadKind::I32 | OverloadKind::I64";
-  case MVT::fAny:
-    return "OverloadKind::HALF | OverloadKind::FLOAT | OverloadKind::DOUBLE";
-  case MVT::Other:
-    // Handle DXIL-specific overload types
-    {
-      if (R->getValueAsInt("isHalfOrFloat")) {
-        return "OverloadKind::HALF | OverloadKind::FLOAT";
-      } else if (R->getValueAsInt("isI16OrI32")) {
-        return "OverloadKind::I16 | OverloadKind::I32";
-      }
-    }
-    [[fallthrough]];
   default:
-    llvm_unreachable(
-        "Support for specified parameter OverloadKind not yet implemented");
+    llvm_unreachable("Support for specified fixed type option for overload "
+                     "type not supported");
   }
 }
 
-/// Emit Enums of DXIL Ops
-/// \param A vector of DXIL Ops
-/// \param Output stream
-static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
-                          raw_ostream &OS) {
-  // Sort by OpCode
-  llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
-    return A.OpCode < B.OpCode;
-  });
+/// Return a string representation of valid overload information denoted
+// by input records
+//
+/// \param Recs A vector of records of TableGen Overload records
+/// \return std::string string representation of overload mask string
+///         predicated by DXIL Version. E.g.,
+//          {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getOverloadMaskString(const SmallVector<Record *> Recs) {
+  std::string MaskString = "";
+  std::string Prefix = "";
+  MaskString.append("{");
+  // If no overload information records were specified, assume the operation
+  // a) to be supported in DXIL Version 1.0 and later
+  // b) has no overload types
+  if (Recs.empty()) {
+    MaskString.append("{{1, 0}, OverloadKind::UNDEFINED}}");
+  } else {
+    for (auto Rec : Recs) {
+      unsigned Major =
+          Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+      unsigned Minor =
+          Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+      MaskString.append(Prefix)
+          .append("{{")
+          .append(std::to_string(Major))
+          .append(", ")
+          .append(std::to_string(Minor).append("}, "));
+
+      std::string PipePrefix = "";
+      auto Tys = Rec->getValueAsListOfDefs("overload_types");
+      if (Tys.empty()) {
+        MaskString.append("OverloadKind::UNDEFINED");
+      }
+      for (const auto *Ty : Tys) {
+        MaskString.append(PipePrefix).append(getOverloadKindStr(Ty));
+        PipePrefix = " | ";
+      }
 
-  OS << "// Enumeration for operations specified by DXIL\n";
-  OS << "enum class OpCode : unsigned {\n";
+      MaskString.append("}");
+      Prefix = ", ";
+    }
+    MaskString.append("}");
+  }
+  return MaskString;
+}
 
-  for (auto &Op : Ops) {
-    // Name = ID, // Doc
-    OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n";
+/// Return a string representation of valid shader stag information denoted
+// by input records
+//
+/// \param Recs A vector of records of TableGen Stages records
+/// \return std::string string representation of stages mask string
+///         predicated by DXIL Version. E.g.,
+//          {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getStageMaskString(const SmallVector<Record *> Recs) {
+  std::string MaskString = "";
+  std::string Prefix = "";
+  MaskString.append("{");
+  // Atleast one stage information record is expected to be specified.
+  if (Recs.empty()) {
+    PrintFatalError("Atleast one specification of valid stages for "
+                    "operation must be specified");
   }
 
-  OS << "\n};\n\n";
+  for (auto Rec : Recs) {
+    unsigned Major = Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+    unsigned Minor = Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+    MaskString.append(Prefix)
+        .append("{{")
+        .append(std::to_string(Major))
+        .append(", ")
+        .append(std::to_string(Minor).append("}, "));
+
+    std::string PipePrefix = "";
+    auto Stages = Rec->getValueAsListOfDefs("shader_stages");
+    if (Stages.empty()) {
+      PrintFatalError("No valid stages for operation specified");
+    }
+    for (const auto *S : Stages) {
+      MaskString.append(PipePrefix).append("ShaderKind::").append(S->getName());
+      PipePrefix = " | ";
+    }
 
-  OS << "// Groups for DXIL operations with equivalent function templates\n";
-  OS << "enum class OpCodeClass : unsigned {\n";
-  // Build an OpClass set to print
-  SmallSet<StringRef, 2> OpClassSet;
-  for (auto &Op : Ops) {
-    OpClassSet.insert(Op.OpClass);
+    MaskString.append("}");
+    Prefix = ", ";
   }
-  for (auto &C : OpClassSet) {
-    OS << C << ",\n";
+  MaskString.append("}");
+  return MaskString;
+}
+
+/// Return a string representation of valid attribute information denoted
+// by input records
+//
+/// \param Recs A vector of records of TableGen Attribute records
+/// \return std::string string representation of stages mask string
+///         predicated by DXIL Version. E.g.,
+//          {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getAttributeMaskString(const SmallVector<Record *> Recs) {
+  std::string MaskString = "";
+  std::string Prefix = "";
+  MaskString.append("{");
+
+  for (auto Rec : Recs) {
+    unsigned Major = Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+    unsigned Minor = Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+    MaskString.append(Prefix)
+        .append("{{")
+        .append(std::to_string(Major))
+        .append(", ")
+        .append(std::to_string(Minor).append("}, "));
+
+    std::string PipePrefix = "";
+    auto Attrs = Rec->getValueAsListOfDefs("op_attrs");
+    if (Attrs.empty()) {
+      MaskString.append("Attribute::None");
+    } else {
+      for (const auto *Attr : Attrs) {
+        MaskString.append(PipePrefix)
+            .append("Attribute::")
+            .append(Attr->getName());
+        PipePrefix = " | ";
+      }
+    }
+
+    MaskString.append("}");
+    Prefix = ", ";
   }
-  OS << "\n};\n\n";
+  MaskString.append("}");
+  return MaskString;
+}
+
+/// Emit a mapping of DXIL opcode to opname
+static void emitDXILOpCodes(std::vector<DXILOperationDesc> &Ops,
+                            raw_ostream &OS) {
+  OS << "#ifdef DXIL_OPCODE\n";
+  for (const DXILOperationDesc &Op : Ops)
+    OS << "DXIL_OPCODE(" << Op.OpCode << ", " << Op.OpName << ")\n";
+  OS << "#undef DXIL_OPCODE\n";
+  OS << "\n";
+  OS << "#endif\n\n";
+}
+
+/// Emit a list of DXIL op classes
+static void emitDXILOpClasses(RecordKeeper &Records,
+                              raw_ostream &OS) {
+  OS << "#ifdef DXIL_OPCLASS\n";
+  std::vector<Record *> OpClasses =
+      Records.getAllDerivedDefinitions("DXILOpClass");
+  for (Record *OpClass : OpClasses)
+    OS << "DXIL_OPCLASS(" << OpClass->getName() << ")\n";
+  OS << "#undef DXIL_OPCLASS\n";
+  OS << "#endif\n\n";
 }
 
 /// Emit map of DXIL operation to LLVM or DirectX intrinsic
@@ -317,37 +457,17 @@ static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
 /// \param Output stream
 static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
                                  raw_ostream &OS) {
+  OS << "#ifdef DXIL_OP_INTRINSIC\n";
   OS << "\n";
-  // FIXME: use array instead of SmallDenseMap.
-  OS << "static const SmallDenseMap<Intrinsic::ID, dxil::OpCode> LowerMap = "
-        "{\n";
-  for (auto &Op : Ops) {
+  for (const auto &Op : Ops) {
     if (Op.Intrinsic.empty())
       continue;
-    // {Intrinsic::sin, dxil::OpCode::Sin},
-    OS << "  { Intrinsic::" << Op.Intrinsic << ", dxil::OpCode::" << Op.OpName
-       << "},\n";
+    OS << "DXIL_OP_INTRINSIC(dxil::OpCode::" << Op.OpName
+       << ", Intrinsic::" << Op.Intrinsic << ")\n";
   }
-  OS << "};\n";
   OS << "\n";
-}
-
-/// Convert operation attribute string to Attribute enum
-///
-/// \param Attr string reference
-/// \return std::string Attribute enum string
-
-static std::string emitDXILOperationAttr(SmallVector<std::string> Attrs) {
-  for (auto Attr : Attrs) {
-    // TODO: For now just recognize IntrNoMem and IntrReadMem as valid and
-    //  ignore others.
-    if (Attr == "IntrNoMem") {
-      return "Attribute::ReadNone";
-    } else if (Attr == "IntrReadMem") {
-      return "Attribute::ReadOnly";
-    }
-  }
-  return "Attribute::None";
+  OS << "#undef DXIL_OP_INTRINSIC\n";
+  OS << "#endif\n\n";
 }
 
 /// Emit DXIL operation table
@@ -355,11 +475,6 @@ static std::string emitDXILOperationAttr(SmallVector<std::string> Attrs) {
 /// \param Output stream
 static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
                                    raw_ostream &OS) {
-  // Sort by OpCode.
-  llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
-    return A.OpCode < B.OpCode;
-  });
-
   // Collect Names.
   SequenceToOffsetTable<std::string> OpClassStrings;
   SequenceToOffsetTable<std::string> OpStrings;
@@ -388,15 +503,13 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   OpClassStrings.layout();
   Parameters.layout();
 
-  // Emit the DXIL operation table.
-  //{dxil::OpCode::Sin, OpCodeNameIndex, OpCodeClass::unary,
-  // OpCodeClassNameIndex,
-  // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0,
-  // 3, ParameterTableOffset},
+  // Emit access function getOpcodeProperty() that embeds DXIL Operation table
+  // with entries of type struct OpcodeProperty.
   OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode Op) "
         "{\n";
 
   OS << "  static const OpCodeProperty OpCodeProps[] = {\n";
+  std::string Prefix = "";
   for (auto &Op : Ops) {
     // Consider Op.OverloadParamIndex as the overload parameter index, by
     // default
@@ -408,13 +521,15 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
     if (OLParamIdx < 0) {
       OLParamIdx = (Op.OpTypes.size() > 1) ? 1 : 0;
     }
-    OS << "  { dxil::OpCode::" << Op.OpName << ", " << OpStrings.get(Op.OpName)
-       << ", OpCodeClass::" << Op.OpClass << ", "
+    OS << Prefix << "  { dxil::OpCode::" << Op.OpName << ", "
+       << OpStrings.get(Op.OpName) << ", OpCodeClass::" << Op.OpClass << ", "
        << OpClassStrings.get(Op.OpClass.data()) << ", "
-       << getOverloadKindStr(Op.OpTypes[OLParamIdx]) << ", "
-       << emitDXILOperationAttr(Op.OpAttributes) << ", "
-       << Op.OverloadParamIndex << ", " << Op.OpTypes.size() - 1 << ", "
-       << Parameters.get(ParameterMap[Op.OpClass]) << " },\n";
+       << getOverloadMaskString(Op.OverloadRecs) << ", "
+       << getStageMaskString(Op.StageRecs) << ", "
+       << getAttributeMaskString(Op.AttrRecs) << ", " << Op.OverloadParamIndex
+       << ", " << Op.OpTypes.size() - 1 << ", "
+       << Parameters.get(ParameterMap[Op.OpClass]) << " }";
+    Prefix = ",\n";
   }
   OS << "  };\n";
 
@@ -466,7 +581,43 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   OS << "  };\n\n";
   OS << "  unsigned Index = Prop.ParameterTableOffset;\n";
   OS << "  return DXILOpParameterKindTable + Index;\n";
-  OS << "}\n ";
+  OS << "}\n\n";
+}
+
+static void emitDXILOperationTableDataStructs(RecordKeeper &Records,
+                                              raw_ostream &OS) {
+  // Get Shader stage records
+  std::vector<Record *> ShaderKindRecs =
+      Records.getAllDerivedDefinitions("DXILShaderStage");
+  // Sort records by name
+  llvm::sort(ShaderKindRecs,
+             [](Record *A, Record *B) { return A->getName() < B->getName(); });
+
+  OS << "// Valid shader kinds\n\n";
+  // Choose the type of enum ShaderKind based on the number of stages declared.
+  // This gives the flexibility to just add add new stage records in DXIL.td, if
+  // needed, with no need to change this backend code.
+  size_t ShaderKindCount = ShaderKindRecs.size();
+  uint64_t ShaderKindTySz = PowerOf2Ceil(ShaderKindRecs.size() + 1);
+  OS << "enum ShaderKind : uint" << ShaderKindTySz << "_t {\n";
+  const std::string allStages("all_stages");
+  const std::string removed("removed");
+  int shiftVal = 1;
+  for (auto R : ShaderKindRecs) {
+    auto Name = R->getName();
+    if (Name.compare(removed) == 0) {
+      OS << "  " << Name
+         << " =  0,  // Pseudo-stage indicating op not supported in any "
+            "stage\n";
+    } else if (Name.compare(allStages) == 0) {
+      OS << "  " << Name << " =  0x"
+         << utohexstr(((1 << ShaderKindCount) - 1), false, 0)
+         << ", // Pseudo-stage indicating op is supported in all stages\n";
+    } else if (Name.compare(allStages)) {
+      OS << "  " << Name << " = 1 << " << std::to_string(shiftVal++) << ",\n";
+    }
+  }
+  OS << "}; // enum ShaderKind\n\n";
 }
 
 /// Entry function call that invokes the functionality of this TableGen backend
@@ -475,21 +626,31 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
 static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) {
   OS << "// Generated code, do not edit.\n";
   OS << "\n";
-  // Get all DXIL Ops to intrinsic mapping records
-  std::vector<Record *> OpIntrMaps =
-      Records.getAllDerivedDefinitions("DXILOpMapping");
+  // Get all DXIL Ops property records
+  std::vector<Record *> OpIntrProps =
+      Records.getAllDerivedDefinitions("DXILOp");
   std::vector<DXILOperationDesc> DXILOps;
-  for (auto *Record : OpIntrMaps) {
+  for (auto *Record : OpIntrProps) {
     DXILOps.emplace_back(DXILOperationDesc(Record));
   }
-  OS << "#ifdef DXIL_OP_ENUM\n";
-  emitDXILEnums(DXILOps, OS);
-  OS << "#endif\n\n";
-  OS << "#ifdef DXIL_OP_INTRINSIC_MAP\n";
+  // Sort by opcode.
+  llvm::sort(DXILOps, [](DXILOperationDesc &A, DXILOperationDesc &B) {
+    return A.OpCode < B.OpCode;
+  });
+  int PrevOp = -1;
+  for (DXILOperationDesc &Desc : DXILOps) {
+    if (Desc.OpCode == PrevOp)
+      PrintFatalError(Twine("Duplicate opcode: ") + Twine(Desc.OpCode));
+    PrevOp = Desc.OpCode;
+  }
+
+  emitDXILOpCodes(DXILOps, OS);
+  emitDXILOpClasses(Records, OS);
   emitDXILIntrinsicMap(DXILOps, OS);
-  OS << "#endif\n\n";
-  OS << "#ifdef DXIL_OP_OPERATION_TABLE\n";
+  OS << "#ifdef DXIL_OP_OPERATION_TABLE\n\n";
+  emitDXILOperationTableDataStructs(Records, OS);
   emitDXILOperationTable(DXILOps, OS);
+  OS << "#undef DXIL_OP_OPERATION_TABLE\n";
   OS << "#endif\n\n";
 }
 
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index e8fbaed0f50e8..0f8f1cce81700 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -99,8 +99,14 @@ void declareInstExpansion(CodeExpansions &CE, const BuildMIAction &A,
 
 void declareOperandExpansion(CodeExpansions &CE, const OperandMatcher &OM,
                              StringRef Name) {
-  CE.declare(Name, "State.MIs[" + to_string(OM.getInsnVarID()) +
-                       "]->getOperand(" + to_string(OM.getOpIdx()) + ")");
+  if (OM.isVariadic()) {
+    CE.declare(Name, "getRemainingOperands(*State.MIs[" +
+                         to_string(OM.getInsnVarID()) + "], " +
+                         to_string(OM.getOpIdx()) + ")");
+  } else {
+    CE.declare(Name, "State.MIs[" + to_string(OM.getInsnVarID()) +
+                         "]->getOperand(" + to_string(OM.getOpIdx()) + ")");
+  }
 }
 
 void declareTempRegExpansion(CodeExpansions &CE, unsigned TempRegID,
@@ -128,18 +134,6 @@ LLTCodeGen getLLTCodeGen(const PatternType &PT) {
   return *MVTToLLT(getValueType(PT.getLLTRecord()));
 }
 
-LLTCodeGenOrTempType getLLTCodeGenOrTempType(const PatternType &PT,
-                                             RuleMatcher &RM) {
-  assert(!PT.isNone());
-
-  if (PT.isLLT())
-    return getLLTCodeGen(PT);
-
-  assert(PT.isTypeOf());
-  auto &OM = RM.getOperandMatcher(PT.getTypeOfOpName());
-  return OM.getTempTypeIdx(RM);
-}
-
 //===- PrettyStackTrace Helpers  ------------------------------------------===//
 
 class PrettyStackTraceParse : public PrettyStackTraceEntry {
@@ -668,6 +662,9 @@ class CombineRuleBuilder {
     return CGT.getInstruction(RuleDef.getRecords().getDef("G_CONSTANT"));
   }
 
+  std::optional<LLTCodeGenOrTempType>
+  getLLTCodeGenOrTempType(const PatternType &PT, RuleMatcher &RM);
+
   void PrintError(Twine Msg) const { ::PrintError(&RuleDef, Msg); }
   void PrintWarning(Twine Msg) const { ::PrintWarning(RuleDef.getLoc(), Msg); }
   void PrintNote(Twine Msg) const { ::PrintNote(RuleDef.getLoc(), Msg); }
@@ -960,6 +957,24 @@ void CombineRuleBuilder::verify() const {
 }
 #endif
 
+std::optional<LLTCodeGenOrTempType>
+CombineRuleBuilder::getLLTCodeGenOrTempType(const PatternType &PT,
+                                            RuleMatcher &RM) {
+  assert(!PT.isNone());
+
+  if (PT.isLLT())
+    return getLLTCodeGen(PT);
+
+  assert(PT.isTypeOf());
+  auto &OM = RM.getOperandMatcher(PT.getTypeOfOpName());
+  if (OM.isVariadic()) {
+    PrintError("type '" + PT.str() + "' is ill-formed: '" +
+               OM.getSymbolicName() + "' is a variadic pack operand");
+    return std::nullopt;
+  }
+  return OM.getTempTypeIdx(RM);
+}
+
 void CombineRuleBuilder::print(raw_ostream &OS,
                                const PatternAlternatives &Alts) const {
   SmallVector<std::string, 1> Strings(
@@ -1079,11 +1094,18 @@ bool CombineRuleBuilder::typecheckPatterns() {
   // match patterns.
   for (auto &Pat : values(MatchPats)) {
     if (auto *IP = dyn_cast<InstructionPattern>(Pat.get())) {
-      if (IP->diagnoseAllSpecialTypes(
-              RuleDef.getLoc(), PatternType::SpecialTyClassName +
-                                    " is not supported in 'match' patterns")) {
-        return false;
+      bool HasDiag = false;
+      for (const auto &[Idx, Op] : enumerate(IP->operands())) {
+        if (Op.getType().isTypeOf()) {
+          PrintError(PatternType::TypeOfClassName +
+                     " is not supported in 'match' patterns");
+          PrintNote("operand " + Twine(Idx) + " of '" + IP->getName() +
+                    "' has type '" + Op.getType().str() + "'");
+          HasDiag = true;
+        }
       }
+      if (HasDiag)
+        return false;
     }
   }
   return true;
@@ -1145,6 +1167,39 @@ bool CombineRuleBuilder::buildPermutationsToEmit() {
 bool CombineRuleBuilder::checkSemantics() {
   assert(MatchRoot && "Cannot call this before findRoots()");
 
+  const auto CheckVariadicOperands = [&](const InstructionPattern &IP,
+                                         bool IsMatch) {
+    bool HasVariadic = false;
+    for (auto &Op : IP.operands()) {
+      if (!Op.getType().isVariadicPack())
+        continue;
+
+      HasVariadic = true;
+
+      if (IsMatch && &Op != &IP.operands_back()) {
+        PrintError("'" + IP.getInstName() +
+                   "': " + PatternType::VariadicClassName +
+                   " can only be used on the last operand");
+        return false;
+      }
+
+      if (Op.isDef()) {
+        PrintError("'" + IP.getInstName() + "': " +
+                   PatternType::VariadicClassName + " cannot be used on defs");
+        return false;
+      }
+    }
+
+    if (HasVariadic && !IP.isVariadic()) {
+      PrintError("cannot use a " + PatternType::VariadicClassName +
+                 " operand on non-variadic instruction '" + IP.getInstName() +
+                 "'");
+      return false;
+    }
+
+    return true;
+  };
+
   bool UsesWipMatchOpcode = false;
   for (const auto &Match : MatchPats) {
     const auto *Pat = Match.second.get();
@@ -1155,17 +1210,23 @@ bool CombineRuleBuilder::checkSemantics() {
       continue;
     }
 
-    // MIFlags in match cannot use the following syntax: (MIFlags $mi)
-    if (const auto *CGP = dyn_cast<CodeGenInstructionPattern>(Pat)) {
-      if (auto *FI = CGP->getMIFlagsInfo()) {
-        if (!FI->copy_flags().empty()) {
-          PrintError(
-              "'match' patterns cannot refer to flags from other instructions");
-          PrintNote("MIFlags in '" + CGP->getName() +
-                    "' refer to: " + join(FI->copy_flags(), ", "));
-          return false;
+    if (const auto IP = dyn_cast<InstructionPattern>(Pat)) {
+      if (!CheckVariadicOperands(*IP, /*IsMatch=*/true))
+        return false;
+
+      // MIFlags in match cannot use the following syntax: (MIFlags $mi)
+      if (const auto *CGP = dyn_cast<CodeGenInstructionPattern>(Pat)) {
+        if (auto *FI = CGP->getMIFlagsInfo()) {
+          if (!FI->copy_flags().empty()) {
+            PrintError("'match' patterns cannot refer to flags from other "
+                       "instructions");
+            PrintNote("MIFlags in '" + CGP->getName() +
+                      "' refer to: " + join(FI->copy_flags(), ", "));
+            return false;
+          }
         }
       }
+      continue;
     }
 
     const auto *AOP = dyn_cast<AnyOpcodePattern>(Pat);
@@ -1197,6 +1258,9 @@ bool CombineRuleBuilder::checkSemantics() {
     if (!IP)
       continue;
 
+    if (!CheckVariadicOperands(*IP, /*IsMatch=*/false))
+      return false;
+
     if (UsesWipMatchOpcode) {
       PrintError("cannot use wip_match_opcode in combination with apply "
                  "instruction patterns!");
@@ -1839,7 +1903,7 @@ bool CombineRuleBuilder::emitCXXMatchApply(CodeExpansions &CE, RuleMatcher &M,
   for (auto &Pat : ApplyPats) {
     auto *CXXPat = cast<CXXPattern>(Pat.second.get());
     CodeExpander Expander(CXXPat->getRawCode(), CE, RuleDef.getLoc(),
-                          /*ShowExpansions=*/ false);
+                          /*ShowExpansions=*/false);
     OS << LS;
     Expander.emit(OS);
   }
@@ -1939,8 +2003,8 @@ bool CombineRuleBuilder::emitInstructionApplyPattern(
       continue;
     }
 
-    // Determine what we're dealing with. Are we replace a matched instruction?
-    // Creating a new one?
+    // Determine what we're dealing with. Are we replacing a matched
+    // instruction? Creating a new one?
     auto OpLookupRes = MatchOpTable.lookup(OpName);
     if (OpLookupRes.Found) {
       if (OpLookupRes.isLiveIn()) {
@@ -1986,8 +2050,11 @@ bool CombineRuleBuilder::emitInstructionApplyPattern(
       declareTempRegExpansion(CE, TempRegID, OpName);
       // Always insert the action at the beginning, otherwise we may end up
       // using the temp reg before it's available.
-      M.insertAction<MakeTempRegisterAction>(
-          M.actions_begin(), getLLTCodeGenOrTempType(Ty, M), TempRegID);
+      auto Result = getLLTCodeGenOrTempType(Ty, M);
+      if (!Result)
+        return false;
+      M.insertAction<MakeTempRegisterAction>(M.actions_begin(), *Result,
+                                             TempRegID);
     }
 
     DstMI.addRenderer<TempRegRenderer>(TempRegID, /*IsDef=*/true);
@@ -2044,16 +2111,18 @@ bool CombineRuleBuilder::emitCodeGenInstructionApplyImmOperand(
   }
 
   auto ImmTy = getLLTCodeGenOrTempType(Ty, M);
+  if (!ImmTy)
+    return false;
 
   if (isGConstant) {
-    DstMI.addRenderer<ImmRenderer>(O.getImmValue(), ImmTy);
+    DstMI.addRenderer<ImmRenderer>(O.getImmValue(), *ImmTy);
     return true;
   }
 
   unsigned TempRegID = M.allocateTempRegID();
   // Ensure MakeTempReg & the BuildConstantAction occur at the beginning.
   auto InsertIt = M.insertAction<MakeTempRegisterAction>(M.actions_begin(),
-                                                         ImmTy, TempRegID);
+                                                         *ImmTy, TempRegID);
   M.insertAction<BuildConstantAction>(++InsertIt, TempRegID, O.getImmValue());
   DstMI.addRenderer<TempRegRenderer>(TempRegID);
   return true;
@@ -2159,6 +2228,8 @@ bool CombineRuleBuilder::emitCodeGenInstructionMatchPattern(
     assert(RemappedO.isNamedOperand() == OriginalO.isNamedOperand() &&
            "Cannot remap an unnamed operand to a named one!");
 
+    const auto Ty = RemappedO.getType();
+
     const auto OpName =
         RemappedO.isNamedOperand() ? RemappedO.getOperandName().str() : "";
 
@@ -2170,11 +2241,41 @@ bool CombineRuleBuilder::emitCodeGenInstructionMatchPattern(
     //    RealIdx = expected index in the MachineInstr.
     const unsigned RealIdx =
         (P.isIntrinsic() && !OriginalO.isDef()) ? (Idx + 1) : Idx;
+
+    if (Ty.isVariadicPack() && M.hasOperand(OpName)) {
+      // TODO: We could add some CheckIsSameOperand opcode variant that checks
+      // all operands. We could also just emit a C++ code snippet lazily to do
+      // the check since it's probably fairly rare that we need to do it.
+      //
+      // I'm just not sure it's worth the effort at this stage.
+      PrintError("each instance of a " + PatternType::VariadicClassName +
+                 " operand must have a unique name within the match patterns");
+      PrintNote("'" + OpName + "' is used multiple times");
+      return false;
+    }
+
     OperandMatcher &OM =
-        IM.addOperand(RealIdx, OpName, AllocatedTemporariesBaseID++);
+        IM.addOperand(RealIdx, OpName, AllocatedTemporariesBaseID++,
+                      /*IsVariadic=*/Ty.isVariadicPack());
     if (!OpName.empty())
       declareOperandExpansion(CE, OM, OriginalO.getOperandName());
 
+    if (Ty.isVariadicPack()) {
+      // In the presence of variadics, the InstructionMatcher won't insert a
+      // InstructionNumOperandsMatcher implicitly, so we have to emit our own.
+      assert((Idx + 1) == P.operands_size() &&
+             "VariadicPack isn't last operand!");
+      auto VPTI = Ty.getVariadicPackTypeInfo();
+      assert(VPTI.Min > 0 && (VPTI.Max == 0 || VPTI.Max > VPTI.Min));
+      IM.addPredicate<InstructionNumOperandsMatcher>(
+          RealIdx + VPTI.Min, InstructionNumOperandsMatcher::CheckKind::GE);
+      if (VPTI.Max) {
+        IM.addPredicate<InstructionNumOperandsMatcher>(
+            RealIdx + VPTI.Max, InstructionNumOperandsMatcher::CheckKind::LE);
+      }
+      break;
+    }
+
     // Handle immediates.
     if (RemappedO.hasImmValue()) {
       if (isLiteralImm(P, Idx))
@@ -2190,16 +2291,14 @@ bool CombineRuleBuilder::emitCodeGenInstructionMatchPattern(
     // for that Operand. "OM" here is always a new OperandMatcher.
     //
     // Always emit a check for unnamed operands.
-    if (OpName.empty() ||
-        !M.getOperandMatcher(OpName).contains<LLTOperandMatcher>()) {
-      if (const auto Ty = RemappedO.getType()) {
-        // TODO: We could support GITypeOf here on the condition that the
-        // OperandMatcher exists already. Though it's clunky to make this work
-        // and isn't all that useful so it's just rejected in typecheckPatterns
-        // at this time.
-        assert(Ty.isLLT() && "Only LLTs are supported in match patterns!");
-        OM.addPredicate<LLTOperandMatcher>(getLLTCodeGen(Ty));
-      }
+    if (Ty && (OpName.empty() ||
+               !M.getOperandMatcher(OpName).contains<LLTOperandMatcher>())) {
+      // TODO: We could support GITypeOf here on the condition that the
+      // OperandMatcher exists already. Though it's clunky to make this work
+      // and isn't all that useful so it's just rejected in typecheckPatterns
+      // at this time.
+      assert(Ty.isLLT());
+      OM.addPredicate<LLTOperandMatcher>(getLLTCodeGen(Ty));
     }
 
     // Stop here if the operand is a def, or if it had no name.
@@ -2558,8 +2657,10 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules) {
                                                const Matcher *B) {
     auto *L = static_cast<const RuleMatcher *>(A);
     auto *R = static_cast<const RuleMatcher *>(B);
-    return std::make_tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) <
-           std::make_tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands());
+    return std::make_tuple(OpcodeOrder[L->getOpcode()],
+                           L->insnmatchers_front().getNumOperandMatchers()) <
+           std::make_tuple(OpcodeOrder[R->getOpcode()],
+                           R->insnmatchers_front().getNumOperandMatchers());
   });
 
   for (Matcher *Rule : InputRules)
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index c29cb4edec181..a491a049e7c81 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -1162,7 +1162,7 @@ Error GlobalISelEmitter::importChildMatcher(
       OperandMatcher &OM =
           InsnOperand.getInsnMatcher().addOperand(0, "", TempOpIdx);
       if (auto Error =
-              OM.addTypeCheckPredicate(VTy, false /* OperandIsAPointer */))
+              OM.addTypeCheckPredicate(TypeSetByHwMode(VTy), false /* OperandIsAPointer */))
         return failedImport(toString(std::move(Error)) +
                             " for result of Src pattern operator");
 
@@ -2248,8 +2248,10 @@ GlobalISelEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules,
                                                const Matcher *B) {
     auto *L = static_cast<const RuleMatcher *>(A);
     auto *R = static_cast<const RuleMatcher *>(B);
-    return std::tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) <
-           std::tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands());
+    return std::tuple(OpcodeOrder[L->getOpcode()],
+                      L->insnmatchers_front().getNumOperandMatchers()) <
+           std::tuple(OpcodeOrder[R->getOpcode()],
+                      R->insnmatchers_front().getNumOperandMatchers());
   });
 
   for (Matcher *Rule : InputRules)
diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index 01cfd4a1d9829..7c3abefd96f5d 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -227,22 +227,20 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
   // Emit file header.
   emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
 
-  o << "bool " << Target.getName() + "AsmPrinter"
-    << "::\n"
-    << "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
-    << "                            const MachineInstr *MI) {\n";
+  o << "bool " << Target.getName() + "AsmPrinter::\n"
+    << "lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst) {\n";
 
   if (!Expansions.empty()) {
-    o << "  switch (MI->getOpcode()) {\n"
+    o << "  Inst.clear();\n"
+      << "  switch (MI->getOpcode()) {\n"
       << "  default: return false;\n";
     for (auto &Expansion : Expansions) {
       CodeGenInstruction &Source = Expansion.Source;
       CodeGenInstruction &Dest = Expansion.Dest;
       o << "  case " << Source.Namespace << "::" << Source.TheDef->getName()
         << ": {\n"
-        << "    MCInst TmpInst;\n"
         << "    MCOperand MCOp;\n"
-        << "    TmpInst.setOpcode(" << Dest.Namespace
+        << "    Inst.setOpcode(" << Dest.Namespace
         << "::" << Dest.TheDef->getName() << ");\n";
 
       // Copy the operands from the source instruction.
@@ -260,15 +258,15 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
                          .MIOperandNo +
                      i
               << "), MCOp);\n"
-              << "    TmpInst.addOperand(MCOp);\n";
+              << "    Inst.addOperand(MCOp);\n";
             break;
           case OpData::Imm:
-            o << "    TmpInst.addOperand(MCOperand::createImm("
+            o << "    Inst.addOperand(MCOperand::createImm("
               << Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n";
             break;
           case OpData::Reg: {
             Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
-            o << "    TmpInst.addOperand(MCOperand::createReg(";
+            o << "    Inst.addOperand(MCOperand::createReg(";
             // "zero_reg" is special.
             if (Reg->getName() == "zero_reg")
               o << "0";
@@ -287,10 +285,9 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
         o << "    for (unsigned i = " << MIOpNo
           << ", e = MI->getNumOperands(); i != e; ++i)\n"
           << "      if (lowerOperand(MI->getOperand(i), MCOp))\n"
-          << "        TmpInst.addOperand(MCOp);\n";
+          << "        Inst.addOperand(MCOp);\n";
       }
-      o << "    EmitToStreamer(OutStreamer, TmpInst);\n"
-        << "    break;\n"
+      o << "    break;\n"
         << "  }\n";
     }
     o << "  }\n  return true;";
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 04e9e0fa48db0..910488a14b985 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/RISCVISAUtils.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -209,10 +210,46 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
   OS << "\n#undef TUNE_PROC\n";
 }
 
+static void emitRISCVExtensionBitmask(RecordKeeper &RK, raw_ostream &OS) {
+
+  std::vector<Record *> Extensions =
+      RK.getAllDerivedDefinitionsIfDefined("RISCVExtensionBitmask");
+  llvm::sort(Extensions, [](const Record *Rec1, const Record *Rec2) {
+    return getExtensionName(Rec1) < getExtensionName(Rec2);
+  });
+
+#ifndef NDEBUG
+  llvm::DenseSet<std::pair<uint64_t, uint64_t>> Seen;
+#endif
+
+  OS << "#ifdef GET_RISCVExtensionBitmaskTable_IMPL\n";
+  OS << "static const RISCVExtensionBitmask ExtensionBitmask[]={\n";
+  for (const Record *Rec : Extensions) {
+    unsigned GroupIDVal = Rec->getValueAsInt("GroupID");
+    unsigned BitPosVal = Rec->getValueAsInt("BitPos");
+
+    StringRef ExtName = Rec->getValueAsString("Name");
+    ExtName.consume_front("experimental-");
+
+#ifndef NDEBUG
+    assert(Seen.insert(std::make_pair(GroupIDVal, BitPosVal)).second &&
+           "duplicated bitmask");
+#endif
+
+    OS << "    {"
+       << "\"" << ExtName << "\""
+       << ", " << GroupIDVal << ", " << BitPosVal << "ULL"
+       << "},\n";
+  }
+  OS << "};\n";
+  OS << "#endif\n";
+}
+
 static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
   emitRISCVExtensions(RK, OS);
   emitRISCVProfiles(RK, OS);
   emitRISCVProcs(RK, OS);
+  emitRISCVExtensionBitmask(RK, OS);
 }
 
 static TableGen::Emitter::Opt X("gen-riscv-target-def", EmitRISCVTargetDef,
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 5546e727af384..6872f16df4724 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -16,6 +16,7 @@
 #include "Common/InfoByHwMode.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -151,6 +152,9 @@ void RegisterBankEmitter::emitBaseClassDefinition(
   OS << "private:\n"
      << "  static const RegisterBank *RegBanks[];\n"
      << "  static const unsigned Sizes[];\n\n"
+     << "public:\n"
+     << "  const RegisterBank &getRegBankFromRegClass(const "
+        "TargetRegisterClass &RC, LLT Ty) const override;\n"
      << "protected:\n"
      << "  " << TargetName << "GenRegisterBankInfo(unsigned HwMode = 0);\n"
      << "\n";
@@ -287,8 +291,94 @@ void RegisterBankEmitter::emitBaseClassImplementation(
      << "  for (auto RB : enumerate(RegBanks))\n"
      << "    assert(RB.index() == RB.value()->getID() && \"Index != ID\");\n"
      << "#endif // NDEBUG\n"
-     << "}\n"
-     << "} // end namespace llvm\n";
+     << "}\n";
+
+  uint32_t NumRegBanks = Banks.size();
+  uint32_t BitSize = NextPowerOf2(Log2_32(NumRegBanks));
+  uint32_t ElemsPerWord = 32 / BitSize;
+  uint32_t BitMask = (1 << BitSize) - 1;
+  bool HasAmbigousOrMissingEntry = false;
+  struct Entry {
+    std::string RCIdName;
+    std::string RBIdName;
+  };
+  SmallVector<Entry, 0> Entries;
+  for (const auto &Bank : Banks) {
+    for (const auto *RC : Bank.register_classes()) {
+      if (RC->EnumValue >= Entries.size())
+        Entries.resize(RC->EnumValue + 1);
+      Entry &E = Entries[RC->EnumValue];
+      E.RCIdName = RC->getIdName();
+      if (!E.RBIdName.empty()) {
+        HasAmbigousOrMissingEntry = true;
+        E.RBIdName = "InvalidRegBankID";
+      } else {
+        E.RBIdName = (TargetName + "::" + Bank.getEnumeratorName()).str();
+      }
+    }
+  }
+  for (auto &E : Entries) {
+    if (E.RBIdName.empty()) {
+      HasAmbigousOrMissingEntry = true;
+      E.RBIdName = "InvalidRegBankID";
+    }
+  }
+  OS << "const RegisterBank &\n"
+     << TargetName
+     << "GenRegisterBankInfo::getRegBankFromRegClass"
+        "(const TargetRegisterClass &RC, LLT) const {\n";
+  if (HasAmbigousOrMissingEntry) {
+    OS << "  constexpr uint32_t InvalidRegBankID = uint32_t("
+       << TargetName + "::InvalidRegBankID) & " << BitMask << ";\n";
+  }
+  unsigned TableSize =
+      Entries.size() / ElemsPerWord + ((Entries.size() % ElemsPerWord) > 0);
+  OS << "  static const uint32_t RegClass2RegBank[" << TableSize << "] = {\n";
+  uint32_t Shift = 32 - BitSize;
+  bool First = true;
+  std::string TrailingComment;
+  for (auto &E : Entries) {
+    Shift += BitSize;
+    if (Shift == 32) {
+      Shift = 0;
+      if (First)
+        First = false;
+      else
+        OS << ',' << TrailingComment << '\n';
+    } else {
+      OS << " |" << TrailingComment << '\n';
+    }
+    OS << "    ("
+       << (E.RBIdName.empty()
+               ? "InvalidRegBankID"
+               : Twine("uint32_t(").concat(E.RBIdName).concat(")").str())
+       << " << " << Shift << ')';
+    if (!E.RCIdName.empty())
+      TrailingComment = " // " + E.RCIdName;
+    else
+      TrailingComment = "";
+  }
+  OS << TrailingComment
+     << "\n  };\n"
+        "  const unsigned RegClassID = RC.getID();\n"
+        "  if (LLVM_LIKELY(RegClassID < "
+     << Entries.size()
+     << ")) {\n"
+        "    unsigned RegBankID = (RegClass2RegBank[RegClassID / "
+     << ElemsPerWord << "] >> ((RegClassID % " << ElemsPerWord << ") * "
+     << BitSize << ")) & " << BitMask << ";\n";
+  if (HasAmbigousOrMissingEntry) {
+    OS << "    if (RegBankID != InvalidRegBankID)\n"
+          "      return getRegBank(RegBankID);\n";
+  } else
+    OS << "    return getRegBank(RegBankID);\n";
+  OS << "  }\n"
+        "  llvm_unreachable(llvm::Twine(\"Target needs to handle register "
+        "class ID "
+        "0x\").concat(llvm::Twine::utohexstr(RegClassID)).str().c_str());\n"
+        "}\n";
+
+  OS << "} // end namespace llvm\n";
 }
 
 void RegisterBankEmitter::run(raw_ostream &OS) {
diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp
index 851a525cf48c0..eb58148e4e364 100644
--- a/llvm/utils/TableGen/VTEmitter.cpp
+++ b/llvm/utils/TableGen/VTEmitter.cpp
@@ -79,12 +79,12 @@ static void VTtoGetLLVMTyString(raw_ostream &OS, const Record *VT) {
 void VTEmitter::run(raw_ostream &OS) {
   emitSourceFileHeader("ValueTypes Source Fragment", OS, Records);
 
-  std::array<const Record *, 256> VTsByNumber = {};
+  std::vector<const Record *> VTsByNumber{512};
   auto ValueTypes = Records.getAllDerivedDefinitions("ValueType");
   for (auto *VT : ValueTypes) {
     auto Number = VT->getValueAsInt("Value");
     assert(0 <= Number && Number < (int)VTsByNumber.size() &&
-           "ValueType should be uint8_t");
+           "ValueType should be uint16_t");
     assert(!VTsByNumber[Number] && "Duplicate ValueType");
     VTsByNumber[Number] = VT;
   }
diff --git a/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
index 701df4d0c8597..750f73630b7d5 100644
--- a/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
@@ -18,4 +18,8 @@ static_library("RuntimeLibs") {
     "InstrumentationRuntimeLibrary.cpp",
     "RuntimeLibrary.cpp",
   ]
+
+  # FIXME: Hopefully change this, see
+  # https://github.com/llvm/llvm-project/pull/97130/files#r1691863361
+  defines = [ "CMAKE_INSTALL_FULL_LIBDIR=\"\"" ]
 }
diff --git a/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
index 6a70a90815c8c..8517ff43be47b 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
@@ -1,7 +1,24 @@
+import("//llvm/utils/gn/build/write_vcsrevision.gni")
+
+# Configure the VCSVersion.inc file
+config("write_vcsrevision_config") {
+  # To pick up the generated inc file.
+  include_dirs = [ target_gen_dir ]
+  visibility = [ ":write_vcsversion" ]
+}
+
+write_vcsrevision("write_vcsversion") {
+  visibility = [ ":Utils" ]
+  header = "$target_gen_dir/VCSVersion.inc"
+  names = [ "LLDB" ]
+  public_configs = [ ":write_vcsrevision_config" ]
+}
+
 static_library("Utils") {
   output_name = "LLVMBOLTUtils"
   configs += [ "//llvm/utils/gn/build:bolt_code" ]
   deps = [
+    ":write_vcsversion",
     "//llvm/lib/Support",
     "//llvm/utils/gn/build/libs/pthread",
   ]
diff --git a/llvm/utils/gn/secondary/bolt/test/BUILD.gn b/llvm/utils/gn/secondary/bolt/test/BUILD.gn
index d8ef560624897..17072a8d47737 100644
--- a/llvm/utils/gn/secondary/bolt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/test/BUILD.gn
@@ -45,6 +45,8 @@ write_lit_config("lit_site_cfg") {
     "BOLT_ENABLE_RUNTIME=0",  # FIXME: enable runtime
     "BOLT_TARGETS_TO_BUILD=$bolt_targets_to_build_string",
     "GNU_LD_EXECUTABLE=",  # FIXME: set sometimes?
+    "LIBBOLT_RT_HUGIFY=",
+    "LIBBOLT_RT_INSTR=",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_USE_SANITIZER=",
     "Python3_EXECUTABLE=$python_path",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn
index b64355fded628..e3126f9f05741 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn
@@ -59,6 +59,7 @@ static_library("clangd") {
     "//clang/lib/Tooling/Core",
     "//clang/lib/Tooling/Inclusions",
     "//clang/lib/Tooling/Inclusions/Stdlib",
+    "//clang/lib/Tooling/DependencyScanning",
     "//clang/lib/Tooling/Refactoring",
     "//clang/lib/Tooling/Syntax",
     "//llvm/lib/Support",
@@ -114,12 +115,14 @@ static_library("clangd") {
     "IncludeFixer.cpp",
     "InlayHints.cpp",
     "JSONTransport.cpp",
+    "ModulesBuilder.cpp",
     "ParsedAST.cpp",
     "PathMapping.cpp",
     "Preamble.cpp",
     "Protocol.cpp",
     "Quality.cpp",
     "RIFF.cpp",
+    "ScanningProjectModules.cpp",
     "Selection.cpp",
     "SemanticHighlighting.cpp",
     "SemanticSelection.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
index 8ba0d0f0ded83..9d42409f1973f 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
@@ -84,6 +84,7 @@ group("test") {
     "//llvm/utils/FileCheck",
     "//llvm/utils/llvm-lit",
     "//llvm/utils/not",
+    "//llvm/utils/split-file",
   ]
   if (clangd_build_xpc) {
     deps += [
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
index 90b231b5796d2..b46bac97b41fa 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
@@ -94,6 +94,7 @@ unittest("ClangdTests") {
     "ParsedASTTests.cpp",
     "PathMappingTests.cpp",
     "PreambleTests.cpp",
+    "PrerequisiteModulesTest.cpp",
     "PrintASTTests.cpp",
     "ProjectAwareIndexTests.cpp",
     "QualityTests.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
index 839878dfd28f7..28f250ad3b7ba 100644
--- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
@@ -98,7 +98,7 @@ static_library("CodeGen") {
     "MacroPPCallbacks.cpp",
     "MicrosoftCXXABI.cpp",
     "ModuleBuilder.cpp",
-    "ObjectFilePCHContainerOperations.cpp",
+    "ObjectFilePCHContainerWriter.cpp",
     "PatternInit.cpp",
     "SanitizerMetadata.cpp",
     "SwiftCallingConv.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index ed903559cac84..83a10d5b18bc8 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -60,6 +60,7 @@ static_library("Sema") {
     "SemaAvailability.cpp",
     "SemaBPF.cpp",
     "SemaBase.cpp",
+    "SemaBoundsSafety.cpp",
     "SemaCUDA.cpp",
     "SemaCXXScopeSpec.cpp",
     "SemaCast.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Serialization/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Serialization/BUILD.gn
index 1fdcaf2673c20..8d8b0178cecce 100644
--- a/llvm/utils/gn/secondary/clang/lib/Serialization/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Serialization/BUILD.gn
@@ -28,6 +28,7 @@ static_library("Serialization") {
     "ModuleFile.cpp",
     "ModuleFileExtension.cpp",
     "ModuleManager.cpp",
+    "ObjectFilePCHContainerReader.cpp",
     "PCHContainerOperations.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn
new file mode 100644
index 0000000000000..24a0e89b804bf
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn
@@ -0,0 +1,31 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("NVLinkOpts") {
+  args = [ "-gen-opt-parser-defs" ]
+  td_file = "NVLinkOpts.td"
+}
+
+executable("clang-nvlink-wrapper") {
+  configs += [ "//llvm/utils/gn/build:clang_code" ]
+  deps = [
+    ":NVLinkOpts",
+    "//clang/lib/Basic",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/IRReader",
+    "//llvm/lib/LTO",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Passes",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Target:TargetsToBuild",
+    "//llvm/lib/TargetParser",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [ "ClangNVLinkWrapper.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
index bc50bdcfecd5b..6f00fcac4cd54 100644
--- a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
@@ -58,6 +58,7 @@ driver_executable("clang") {
     "//clang/lib/FrontendTool",
     "//clang/lib/Headers",
     "//clang/tools/clang-linker-wrapper",
+    "//clang/tools/clang-nvlink-wrapper",
     "//clang/tools/clang-offload-bundler",
     "//clang/tools/clang-offload-packager",
     "//llvm/include/llvm/Config:llvm-config",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
index 258fb61038df5..8e1d09c9c2ee9 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
@@ -59,6 +59,7 @@ unittest("ToolingTests") {
     "RecursiveASTVisitorTests/Concept.cpp",
     "RecursiveASTVisitorTests/ConstructExpr.cpp",
     "RecursiveASTVisitorTests/DeclRefExpr.cpp",
+    "RecursiveASTVisitorTests/DeductionGuide.cpp",
     "RecursiveASTVisitorTests/ImplicitCtor.cpp",
     "RecursiveASTVisitorTests/ImplicitCtorInitializer.cpp",
     "RecursiveASTVisitorTests/InitListExprPostOrder.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index 3005a2a8a1852..1e2c0cdc86306 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -571,6 +571,7 @@ static_library("builtins") {
 
   if (current_cpu == "riscv" || current_cpu == "riscv64") {
     sources += [
+      "cpu_model/riscv.c",
       "riscv/fp_mode.c",
       "riscv/restore.S",
       "riscv/save.S",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index b976e9745fbef..cc759d2337516 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -168,6 +168,7 @@ if (current_toolchain == default_toolchain) {
       "__algorithm/ranges_find_first_of.h",
       "__algorithm/ranges_find_if.h",
       "__algorithm/ranges_find_if_not.h",
+      "__algorithm/ranges_find_last.h",
       "__algorithm/ranges_for_each.h",
       "__algorithm/ranges_for_each_n.h",
       "__algorithm/ranges_generate.h",
@@ -579,6 +580,7 @@ if (current_toolchain == default_toolchain) {
       "__math/remainder.h",
       "__math/roots.h",
       "__math/rounding_functions.h",
+      "__math/special_functions.h",
       "__math/traits.h",
       "__math/trigonometric_functions.h",
       "__mbstate_t.h",
@@ -604,6 +606,8 @@ if (current_toolchain == default_toolchain) {
       "__memory/concepts.h",
       "__memory/construct_at.h",
       "__memory/destruct_n.h",
+      "__memory/inout_ptr.h",
+      "__memory/out_ptr.h",
       "__memory/pointer_traits.h",
       "__memory/ranges_construct_at.h",
       "__memory/ranges_uninitialized_algorithms.h",
@@ -863,7 +867,6 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_referenceable.h",
       "__type_traits/is_same.h",
       "__type_traits/is_scalar.h",
-      "__type_traits/is_scoped_enum.h",
       "__type_traits/is_signed.h",
       "__type_traits/is_signed_integer.h",
       "__type_traits/is_specialization.h",
diff --git a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
index 66f762e0a7c26..db608e3cc7449 100644
--- a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
+++ b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
@@ -27,6 +27,7 @@ static_library("MachO") {
     "Arch/ARM64Common.cpp",
     "Arch/ARM64_32.cpp",
     "Arch/X86_64.cpp",
+    "BPSectionOrderer.cpp",
     "ConcatOutputSection.cpp",
     "Driver.cpp",
     "DriverUtils.cpp",
@@ -44,6 +45,7 @@ static_library("MachO") {
     "OutputSegment.cpp",
     "Relocations.cpp",
     "SectionPriorities.cpp",
+    "Sections.cpp",
     "SymbolTable.cpp",
     "Symbols.cpp",
     "SyntheticSections.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
index f0bf6a8f3dbaf..148f2cbf72044 100644
--- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
@@ -86,6 +86,7 @@ target(liblldb_type, "liblldb") {
     "SBQueue.cpp",
     "SBQueueItem.cpp",
     "SBReproducer.cpp",
+    "SBSaveCoreOptions.cpp",
     "SBScriptObject.cpp",
     "SBSection.cpp",
     "SBSourceManager.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
index 7825396b402d5..c6c6ef994c28d 100644
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Interpreter") {
   deps = [
     ":InterpreterProperties",
     ":InterpreterPropertiesEnum",
+    "Interfaces",
     "//lldb/source/Commands",
     "//lldb/source/Core",
     "//lldb/source/DataFormatters",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
new file mode 100644
index 0000000000000..2e70c54ea9564
--- /dev/null
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
@@ -0,0 +1,9 @@
+static_library("Interfaces") {
+  output_name = "lldbInterpreterInterfaces"
+  configs += [ "//llvm/utils/gn/build:lldb_code" ]
+  deps = [
+    "//lldb/source/Utility",
+    "//llvm/lib/Support",
+  ]
+  sources = [ "ScriptedInterfaceUsages.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/lldb/source/Symbol/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Symbol/BUILD.gn
index db769f2b4372e..3b1df82a6b2d2 100644
--- a/llvm/utils/gn/secondary/lldb/source/Symbol/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Symbol/BUILD.gn
@@ -29,6 +29,7 @@ static_library("Symbol") {
     "ObjectContainer.cpp",
     "ObjectFile.cpp",
     "PostfixExpression.cpp",
+    "SaveCoreOptions.cpp",
     "Symbol.cpp",
     "SymbolContext.cpp",
     "SymbolFile.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
index 4f03e01c39c1e..e47a5f7db5528 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
@@ -39,6 +39,7 @@ static_library("Analysis") {
     "CycleAnalysis.cpp",
     "DDG.cpp",
     "DDGPrinter.cpp",
+    "DXILResource.cpp",
     "Delinearization.cpp",
     "DemandedBits.cpp",
     "DependenceAnalysis.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
index fba81184d9920..f04feb2b7737c 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -17,6 +17,7 @@ static_library("GlobalISel") {
     "CallLowering.cpp",
     "Combiner.cpp",
     "CombinerHelper.cpp",
+    "CombinerHelperCasts.cpp",
     "CombinerHelperVectorOps.cpp",
     "GIMatchTableExecutor.cpp",
     "GISelChangeObserver.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
index 4650741f3a2f2..7ff3faf63bedc 100644
--- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
@@ -4,5 +4,8 @@ static_library("SandboxIR") {
     "//llvm/lib/IR",
     "//llvm/lib/Support",
   ]
-  sources = [ "SandboxIR.cpp" ]
+  sources = [
+    "SandboxIR.cpp",
+    "Tracker.cpp",
+  ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index ab97507311a47..9fb898445ed6f 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -134,6 +134,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUAnnotateKernelFeatures.cpp",
     "AMDGPUAnnotateUniformValues.cpp",
     "AMDGPUArgumentUsageInfo.cpp",
+    "AMDGPUAsanInstrumentation.cpp",
     "AMDGPUAsmPrinter.cpp",
     "AMDGPUAtomicOptimizer.cpp",
     "AMDGPUAttributor.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
index 31919badac7be..3dbc803d0d483 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
@@ -13,6 +13,7 @@ static_library("TargetParser") {
     "CSKYTargetParser.cpp",
     "Host.cpp",
     "LoongArchTargetParser.cpp",
+    "PPCTargetParser.cpp",
     "RISCVISAInfo.cpp",
     "RISCVTargetParser.cpp",
     "SubtargetFeature.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
index 6ea9c8e4f4748..58acabf85d296 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -26,7 +26,6 @@ static_library("Utils") {
     "CodeMoverUtils.cpp",
     "CountVisits.cpp",
     "CtorUtils.cpp",
-    "DXILResource.cpp",
     "DXILUpgrade.cpp",
     "Debugify.cpp",
     "DemoteRegToStack.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 945fb0e922c9e..a2b0603ce2e16 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -266,6 +266,7 @@ group("test") {
     "//llvm/tools/llvm-cat",
     "//llvm/tools/llvm-cfi-verify",
     "//llvm/tools/llvm-cov",
+    "//llvm/tools/llvm-ctxprof-util",
     "//llvm/tools/llvm-cvtres",
     "//llvm/tools/llvm-cxxdump",
     "//llvm/tools/llvm-cxxfilt",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
new file mode 100644
index 0000000000000..76f28a7a39115
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
@@ -0,0 +1,9 @@
+executable("llvm-ctxprof-util") {
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Object",
+    "//llvm/lib/ProfileData",
+    "//llvm/lib/Support",
+  ]
+  sources = [ "llvm-ctxprof-util.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
index 339994d35af94..4749ef0a733da 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
@@ -24,6 +24,7 @@ unittest("AnalysisTests") {
     "CaptureTrackingTest.cpp",
     "ConstraintSystemTest.cpp",
     "DDGTest.cpp",
+    "DXILResourceTest.cpp",
     "DomTreeUpdaterTest.cpp",
     "FunctionPropertiesAnalysisTest.cpp",
     "GlobalsModRefTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
index 3e05b9b1f8c3f..357e5d91ee19e 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
@@ -26,6 +26,7 @@ unittest("OrcJITTests") {
     "LookupAndRecordAddrsTest.cpp",
     "MachOPlatformTest.cpp",
     "MapperJITLinkMemoryManagerTest.cpp",
+    "MemoryFlagsTest.cpp",
     "MemoryMapperTest.cpp",
     "ObjectFormatsTest.cpp",
     "ObjectLinkingLayerTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn
index d7164f2797626..2d246eccb872e 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn
@@ -6,5 +6,8 @@ unittest("SandboxIRTests") {
     "//llvm/lib/IR",
     "//llvm/lib/SandboxIR",
   ]
-  sources = [ "SandboxIRTest.cpp" ]
+  sources = [
+    "SandboxIRTest.cpp",
+    "TrackerTest.cpp",
+  ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
index 4a924dab02908..380ed71a2bc01 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
@@ -19,7 +19,6 @@ unittest("UtilsTests") {
     "CodeExtractorTest.cpp",
     "CodeLayoutTest.cpp",
     "CodeMoverUtilsTest.cpp",
-    "DXILResourceTest.cpp",
     "DebugifyTest.cpp",
     "FunctionComparatorTest.cpp",
     "IntegerDivisionTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index 7c02ed396db5f..e0107cdeae76d 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
-llvm_version_major = 19
+llvm_version_major = 20
 llvm_version_minor = 0
 llvm_version_patch = 0
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index a5a1ff66bf417..ea56d192fb394 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Daniel Dunbar"
 __email__ = "daniel@minormatter.com"
-__versioninfo__ = (19, 0, 0)
+__versioninfo__ = (20, 0, 0)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
 __all__ = []
diff --git a/llvm/utils/lit/lit/builtin_commands/cat.py b/llvm/utils/lit/lit/builtin_commands/cat.py
index 6fb2152ef9332..d6dbe889cc2b1 100644
--- a/llvm/utils/lit/lit/builtin_commands/cat.py
+++ b/llvm/utils/lit/lit/builtin_commands/cat.py
@@ -10,7 +10,7 @@
 def convertToCaretAndMNotation(data):
     newdata = StringIO()
     if isinstance(data, str):
-        data = bytearray(data)
+        data = bytearray(data.encode())
 
     for intval in data:
         if intval == 9 or intval == 10:
diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
index b9122d07afd8a..ed78256ee414b 100644
--- a/llvm/utils/lit/lit/cl_arguments.py
+++ b/llvm/utils/lit/lit/cl_arguments.py
@@ -154,11 +154,6 @@ def parse_args():
         action="append",
         default=[],
     )
-    execution_group.add_argument(
-        "--time-tests",
-        help="Track elapsed wall time for each test",
-        action="store_true",
-    )
     execution_group.add_argument(
         "--no-execute",
         dest="noExecute",
@@ -209,6 +204,17 @@ def parse_args():
         action="store_true",
         help="Exit with status zero even if some tests fail",
     )
+    execution_test_time_group = execution_group.add_mutually_exclusive_group()
+    execution_test_time_group.add_argument(
+        "--skip-test-time-recording",
+        help="Do not track elapsed wall time for each test",
+        action="store_true",
+    )
+    execution_test_time_group.add_argument(
+        "--time-tests",
+        help="Track elapsed wall time for each test printed in a histogram",
+        action="store_true",
+    )
 
     selection_group = parser.add_argument_group("Test Selection")
     selection_group.add_argument(
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
index db9f24f748d9e..24ba804f0c363 100755
--- a/llvm/utils/lit/lit/main.py
+++ b/llvm/utils/lit/lit/main.py
@@ -124,7 +124,8 @@ def main(builtin_params={}):
     run_tests(selected_tests, lit_config, opts, len(discovered_tests))
     elapsed = time.time() - start
 
-    record_test_times(selected_tests, lit_config)
+    if not opts.skip_test_time_recording:
+        record_test_times(selected_tests, lit_config)
 
     selected_tests, discovered_tests = GoogleTest.post_process_shard_results(
         selected_tests, discovered_tests
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
index e4997b1250ed3..26150c413dc03 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
@@ -1,10 +1,10 @@
 # Check that internal env can call internal env.
 
 # RUN: env env %{python} print_environment.py \
-# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY %s
+# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s
 #
-# CHECK-2-EMPTY: BAR = 2
-# CHECK-2-EMPTY: FOO = 1
+# CHECK-2-EMPTY-ARGS: BAR = 2
+# CHECK-2-EMPTY-ARGS: FOO = 1
 
 # RUN: env FOO=2 env BAR=1 %{python} print_environment.py \
 # RUN: | FileCheck -check-prefix=CHECK-2-VAL %s
diff --git a/llvm/utils/lit/tests/Inputs/time-tests/a.txt b/llvm/utils/lit/tests/Inputs/time-tests/a.txt
new file mode 100644
index 0000000000000..b80b60b7a2794
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/time-tests/a.txt
@@ -0,0 +1 @@
+# RUN: true
diff --git a/llvm/utils/lit/tests/Inputs/time-tests/lit.cfg b/llvm/utils/lit/tests/Inputs/time-tests/lit.cfg
new file mode 100644
index 0000000000000..e6ae41833874a
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/time-tests/lit.cfg
@@ -0,0 +1,7 @@
+import lit.formats
+
+config.name = "time-tests"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
diff --git a/llvm/utils/lit/tests/time-tests.py b/llvm/utils/lit/tests/time-tests.py
new file mode 100644
index 0000000000000..20b83a64330f0
--- /dev/null
+++ b/llvm/utils/lit/tests/time-tests.py
@@ -0,0 +1,15 @@
+## Check that --skip-test-time-recording skips .lit_test_times.txt recording.
+
+# RUN: %{lit-no-order-opt} --skip-test-time-recording %{inputs}/time-tests
+# RUN: not ls %{inputs}/time-tests/.lit_test_times.txt
+
+## Check that --time-tests generates a printed histogram.
+
+# RUN: %{lit-no-order-opt} --time-tests %{inputs}/time-tests > %t.out
+# RUN: FileCheck < %t.out %s
+# RUN: rm %{inputs}/time-tests/.lit_test_times.txt
+
+# CHECK:      Tests Times:
+# CHECK-NEXT: --------------------------------------------------------------------------
+# CHECK-NEXT: [    Range    ] :: [               Percentage               ] :: [Count]
+# CHECK-NEXT: --------------------------------------------------------------------------
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
index c5b208cfba360..d3369abae70b9 100644
--- a/llvm/utils/mlgo-utils/mlgo/__init__.py
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -4,7 +4,7 @@
 
 from datetime import timezone, datetime
 
-__versioninfo__ = (19, 0, 0)
+__versioninfo__ = (20, 0, 0)
 __version__ = (
     ".".join(str(v) for v in __versioninfo__)
     + "dev"
diff --git a/llvm/utils/release/bump-version.py b/llvm/utils/release/bump-version.py
index abff67ae926ac..5db62e88fec1d 100755
--- a/llvm/utils/release/bump-version.py
+++ b/llvm/utils/release/bump-version.py
@@ -12,6 +12,9 @@
 
 
 class Processor:
+    def __init__(self, args):
+        self.args = args
+
     def process_line(self, line: str) -> str:
         raise NotImplementedError()
 
@@ -23,6 +26,13 @@ def process_file(self, fpath: Path, version: packaging.version.Version) -> None:
             version.micro,
             version.pre,
         )
+
+        if self.args.rc:
+            self.suffix = f"-rc{self.args.rc}"
+
+        if self.args.git:
+            self.suffix = "git"
+
         data = fpath.read_text()
         new_data = []
 
@@ -64,7 +74,7 @@ def process_line(self, line: str) -> str:
             if self.suffix:
                 nline = re.sub(
                     r"set\(LLVM_VERSION_SUFFIX(.*)\)",
-                    f"set(LLVM_VERSION_SUFFIX -{self.suffix[0]}{self.suffix[1]})",
+                    f"set(LLVM_VERSION_SUFFIX {self.suffix})",
                     line,
                 )
             else:
@@ -144,6 +154,7 @@ def process_line(self, line: str) -> str:
     )
     parser.add_argument("version", help="Version to bump to, e.g. 15.0.1", default=None)
     parser.add_argument("--rc", default=None, type=int, help="RC version")
+    parser.add_argument("--git", action="store_true", help="Git version")
     parser.add_argument(
         "-s",
         "--source-root",
@@ -153,9 +164,10 @@ def process_line(self, line: str) -> str:
 
     args = parser.parse_args()
 
+    if args.rc and args.git:
+        raise RuntimeError("Can't specify --git and --rc at the same time!")
+
     verstr = args.version
-    if args.rc:
-        verstr += f"-rc{args.rc}"
 
     # parse the version string with distutils.
     # note that -rc will end up as version.pre here
@@ -170,20 +182,25 @@ def process_line(self, line: str) -> str:
 
     files_to_update = (
         # Main CMakeLists.
-        (source_root / "llvm" / "CMakeLists.txt", CMakeProcessor()),
+        (source_root / "cmake" / "Modules" / "LLVMVersion.cmake", CMakeProcessor(args)),
         # Lit configuration
         (
             "llvm/utils/lit/lit/__init__.py",
-            LitProcessor(),
+            LitProcessor(args),
+        ),
+        # mlgo-utils configuration
+        (
+            "llvm/utils/mlgo-utils/mlgo/__init__.py",
+            LitProcessor(args),
         ),
         # GN build system
         (
             "llvm/utils/gn/secondary/llvm/version.gni",
-            GNIProcessor(),
+            GNIProcessor(args),
         ),
         (
             "libcxx/include/__config",
-            LibCXXProcessor(),
+            LibCXXProcessor(args),
         ),
     )
 
diff --git a/mlir/docs/Canonicalization.md b/mlir/docs/Canonicalization.md
index d1cba572af212..03fd174229afe 100644
--- a/mlir/docs/Canonicalization.md
+++ b/mlir/docs/Canonicalization.md
@@ -33,6 +33,10 @@ together.
 
 Some important things to think about w.r.t. canonicalization patterns:
 
+*   The goal of canonicalization is to make subsequent analyses and
+    optimizations more effective. Therefore, performance improvements are not
+    necessary for canonicalization.
+
 *   Pass pipelines should not rely on the canonicalizer pass for correctness.
     They should work correctly with all instances of the canonicalization pass
     removed.
@@ -51,6 +55,39 @@ Some important things to think about w.r.t. canonicalization patterns:
 *   It is always good to eliminate operations entirely when possible, e.g. by
     folding known identities (like "x + 0 = x").
 
+*   Pattens with expensive running time (i.e. have O(n) complexity) or
+    complicated cost models don't belong to canonicalization: since the
+    algorithm is executed iteratively until fixed-point we want patterns that
+    execute quickly (in particular their matching phase).
+
+*   Canonicalize shouldn't lose the semantic of original operation: the original
+    information should always be recoverable from the transformed IR.
+
+For example, a pattern that transform
+
+```
+  %transpose = linalg.transpose
+      ins(%input : tensor<1x2x3xf32>)
+      outs(%init1 : tensor<2x1x3xf32>)
+      dimensions = [1, 0, 2]
+  %out = linalg.transpose
+      ins(%tranpose: tensor<2x1x3xf32>)
+      outs(%init2 : tensor<3x1x2xf32>)
+      permutation = [2, 1, 0]
+```
+
+to
+
+```
+  %out= linalg.transpose
+      ins(%input : tensor<1x2x3xf32>)
+      outs(%init2: tensor<3x1x2xf32>)
+      permutation = [2, 0, 1]
+```
+
+is a good canonicalization pattern because it removes a redundant operation,
+making other analysis optimizations and more efficient.
+
 ## Globally Applied Rules
 
 These transformations are applied to all levels of IR:
@@ -189,7 +226,7 @@ each of the operands, returning the corresponding constant attribute. These
 operands are those that implement the `ConstantLike` trait. If any of the
 operands are non-constant, a null `Attribute` value is provided instead. For
 example, if MyOp provides three operands [`a`, `b`, `c`], but only `b` is
-constant then `adaptor` will return Attribute() for `getA()` and `getC()`, 
+constant then `adaptor` will return Attribute() for `getA()` and `getC()`,
 and b-value for `getB()`.
 
 Also above, is the use of `OpFoldResult`. This class represents the possible
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 01fadef5c3dbe..c252d9caf98eb 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -101,6 +101,9 @@ their semantics via a special [TableGen backend][TableGenBackend]:
 *   The `AttrConstraint` class hierarchy: They are used to specify the
     constraints over attributes. A notable subclass hierarchy is `Attr`, which
     stands for constraints for attributes whose values are of common types.
+*   The `Property` class hierarchy: They are used to specify non-attribute-backed
+    properties that are inherent to operations. This will be expanded to a
+    `PropertyConstraint` class or something similar in the future.
 
 An operation is defined by specializing the `Op` class with concrete contents
 for all the fields it requires. For example, `tf.AvgPool` is defined as
@@ -172,9 +175,9 @@ understanding the operation.
 
 ### Operation arguments
 
-There are two kinds of arguments: operands and attributes. Operands are runtime
-values produced by other ops; while attributes are compile-time known constant
-values, including two categories:
+There are three kinds of arguments: operands, attributes, and properties.
+Operands are runtime values produced by other ops; while attributes and properties
+are compile-time known constant values, including two categories:
 
 1.  Natural attributes: these attributes affect the behavior of the operations
     (e.g., padding for convolution);
@@ -187,8 +190,11 @@ values, including two categories:
     even though they are not materialized, it should be possible to store as an
     attribute.
 
-Both operands and attributes are specified inside the `dag`-typed `arguments`,
-led by `ins`:
+Properties are similar to attributes, except that they are not stored within
+the MLIR context but are stored inline with the operation.
+
+Operands, attributes, and properties are specified inside the `dag`-typed
+`arguments`, led by `ins`:
 
 ```tablegen
 let arguments = (ins
@@ -196,13 +202,15 @@ let arguments = (ins
   ...
   <attr-constraint>:$<attr-name>,
   ...
+  <property-constraint>:$<property-name>,
 );
 ```
 
 Here `<type-constraint>` is a TableGen `def` from the `TypeConstraint` class
 hierarchy. Similarly, `<attr-constraint>` is a TableGen `def` from the
-`AttrConstraint` class hierarchy. See [Constraints](#constraints) for more
-information.
+`AttrConstraint` class hierarchy and `<property-constraint>` is a subclass
+of `Property` (though a `PropertyConstraint` hierarchy is planned).
+See [Constraints](#constraints) for more information.
 
 There is no requirements on the relative order of operands and attributes; they
 can mix freely. The relative order of operands themselves matters. From each
@@ -324,6 +332,18 @@ Right now, the following primitive constraints are supported:
 
 TODO: Design and implement more primitive constraints
 
+#### Optional and default-valued properties
+
+To declare a property with a default value, use `DefaultValuedProperty<..., "...">`.
+If the property's storage data type is different from its interface type,
+for example, in the case of array properties (which are stored as `SmallVector`s
+but use `ArrayRef` as an interface type), add the storage-type equivalent
+of the default value as the third argument.
+
+To declare an optional property, use `OptionalProperty<...>`.
+This wraps the underlying property in an `std::optional` and gives it a
+default value of `std::nullopt`.
+
 #### Combining constraints
 
 `AllAttrOf` is provided to allow combination of multiple constraints which
@@ -429,6 +449,8 @@ def MyOp : ... {
     I32Attr:$i32_attr,
     F32Attr:$f32_attr,
     ...
+    I32Property:$i32_prop,
+    ...
   );
 
   let results = (outs
@@ -453,7 +475,8 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
 static void build(OpBuilder &odsBuilder, OperationState &odsState,
                   Type i32_result, Type f32_result, ...,
                   Value i32_operand, Value f32_operand, ...,
-                  IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+                  IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+                  int32_t i32_prop);
 
 // Each result-type/operand/attribute has a separate parameter. The parameters
 // for attributes are raw values unwrapped with mlir::Attribute instances.
@@ -462,13 +485,15 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
 static void build(OpBuilder &odsBuilder, OperationState &odsState,
                   Type i32_result, Type f32_result, ...,
                   Value i32_operand, Value f32_operand, ...,
-                  APInt i32_attr, StringRef f32_attr, ...);
+                  APInt i32_attr, StringRef f32_attr, ...,
+                  int32_t i32_prop, ...);
 
 // Each operand/attribute has a separate parameter but result type is aggregate.
 static void build(OpBuilder &odsBuilder, OperationState &odsState,
                   TypeRange resultTypes,
                   Value i32_operand, Value f32_operand, ...,
-                  IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+                  IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+                  int32_t i32_prop, ...);
 
 // All operands/attributes have aggregate parameters.
 // Generated if return type can be inferred.
@@ -921,8 +946,10 @@ optional-group: `(` then-elements `)` (`:` `(` else-elements `)`)? `?`
 The elements of an optional group have the following requirements:
 
 *   The first element of `then-elements` must either be a attribute, literal,
-    operand, or region.
+    operand, property, or region.
     -   This is because the first element must be optionally parsable.
+    -   If a property is used, it must have an `optionalParser` defined and have a
+        default value.
 *   Exactly one argument variable or type directive within either
     `then-elements` or `else-elements` must be marked as the anchor of the
     group.
@@ -984,6 +1011,8 @@ foo.op is_read_only
 foo.op
 ```
 
+The same logic applies to a `UnitProperty`.
+
 ##### Optional "else" Group
 
 Optional groups also have support for an "else" group of elements. These are
@@ -1026,6 +1055,8 @@ to:
 1.  All operand and result types must appear within the format using the various
     `type` directives, either individually or with the `operands` or `results`
     directives.
+1.  Unless all non-attribute properties appear in the format, the `prop-dict`
+    directive must be present.
 1.  The `attr-dict` directive must always be present.
 1.  Must not contain overlapping information; e.g. multiple instances of
     'attr-dict', types, operands, etc.
diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md
index 82bc61dd8c3ad..a6d59bdecbdd8 100644
--- a/mlir/docs/Diagnostics.md
+++ b/mlir/docs/Diagnostics.md
@@ -77,7 +77,7 @@ InFlightDiagnostic Operation::emitOpError();
 ## Diagnostic
 
 A `Diagnostic` in MLIR contains all of the necessary information for reporting a
-message to the user. A `Diagnostic` essentially boils down to three main
+message to the user. A `Diagnostic` essentially boils down to four main
 components:
 
 *   [Source Location](#source-locations)
@@ -85,6 +85,11 @@ components:
     -   Error, Note, Remark, Warning
 *   Diagnostic Arguments
     -   The diagnostic arguments are used when constructing the output message.
+*   Metadata
+    -   Some additional information attached that can be used to identify 
+        this diagnostic other than source location and severity level 
+        (e.g. for diagnostic handlers to do some filtering). 
+        Metadata is not part of the output message.
 
 ### Appending arguments
 
@@ -143,6 +148,11 @@ op->emitError("...").attachNote(noteLoc) << "...";
 op->emitError("...").attachNote() << "...";
 ```
 
+### Managing Metadata
+Metadata is a mutable vector of DiagnosticArguments. 
+It can be accessed and modified as a vector. 
+
+
 ## InFlight Diagnostic
 
 Now that [Diagnostics](#diagnostic) have been explained, we introduce the
diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md
index bc0f484108fac..fadc81b567b4e 100644
--- a/mlir/docs/Dialects/LLVM.md
+++ b/mlir/docs/Dialects/LLVM.md
@@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type.
 The following non-parametric types derived from the LLVM IR are available in the
 LLVM dialect:
 
--   `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86
-    machine.
 -   `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two
     64 bits).
 -   `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an
diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
new file mode 100644
index 0000000000000..e787f348bd228
--- /dev/null
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -0,0 +1,294 @@
+# Using `mlir-opt`
+
+`mlir-opt` is a command-line entry point for running passes and lowerings on MLIR code.
+This tutorial will explain how to use `mlir-opt`, show some examples of its usage,
+and mention some useful tips for working with it.
+
+Prerequisites:
+
+- [Building MLIR from source](/getting_started/)
+- [MLIR Language Reference](/docs/LangRef/)
+
+[TOC]
+
+## `mlir-opt` basics
+
+The `mlir-opt` tool loads a textual IR or bytecode into an in-memory structure,
+and optionally executes a sequence of passes
+before serializing back the IR (textual form by default).
+It is intended as a testing and debugging utility.
+
+After building the MLIR project,
+the `mlir-opt` binary (located in `build/bin`)
+is the entry point for running passes and lowerings,
+as well as emitting debug and diagnostic data.
+
+Running `mlir-opt` with no flags will consume textual or bytecode IR
+from the standard input, parse and run verifiers on it,
+and write the textual format back to the standard output.
+This is a good way to test if an input MLIR is well-formed.
+
+`mlir-opt --help` shows a complete list of flags
+(there are nearly 1000).
+Each pass has its own flag,
+though it is recommended to use `--pass-pipeline`
+to run passes rather than bare flags.
+
+## Running a pass
+
+Next we run [`convert-to-llvm`](/docs/Passes/#-convert-to-llvm),
+which converts all supported dialects to the `llvm` dialect,
+on the following IR:
+
+```mlir
+// mlir/test/Examples/mlir-opt/ctlz.mlir
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = math.ctlz %arg0 : i32
+    func.return %0 : i32
+  }
+}
+```
+
+After building MLIR, and from the `llvm-project` base directory, run
+
+```bash
+build/bin/mlir-opt --pass-pipeline="builtin.module(convert-math-to-llvm)" mlir/test/Examples/mlir-opt/ctlz.mlir
+```
+
+which produces
+
+```mlir
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = "llvm.intr.ctlz"(%arg0) <{is_zero_poison = false}> : (i32) -> i32
+    return %0 : i32
+  }
+}
+```
+
+Note that `llvm` here is MLIR's `llvm` dialect,
+which would still need to be processed through `mlir-translate`
+to generate LLVM-IR.
+
+## Running a pass with options
+
+Next we will show how to run a pass that takes configuration options.
+Consider the following IR containing loops with poor cache locality.
+
+```mlir
+// mlir/test/Examples/mlir-opt/loop_fusion.mlir
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
+```
+
+Running this with the [`affine-loop-fusion`](/docs/Passes/#-affine-loop-fusion) pass
+produces a fused loop.
+
+```bash
+build/bin/mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion)" mlir/test/Examples/mlir-opt/loop_fusion.mlir
+```
+
+```mlir
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %alloc = memref.alloc() : memref<1xf32>
+    %alloc_0 = memref.alloc() : memref<1xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %alloc[0] : memref<1xf32>
+      affine.store %cst, %alloc_0[0] : memref<1xf32>
+      %0 = affine.load %alloc_0[0] : memref<1xf32>
+      %1 = arith.mulf %0, %0 : f32
+      affine.store %1, %arg1[%arg2] : memref<10xf32>
+      %2 = affine.load %alloc[0] : memref<1xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
+```
+
+This pass has options that allow the user to configure its behavior.
+For example, the `fusion-compute-tolerance` option
+is described as the "fractional increase in additional computation tolerated while fusing."
+If this value is set to zero on the command line,
+the pass will not fuse the loops.
+
+```bash
+build/bin/mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion{fusion-compute-tolerance=0})" \
+mlir/test/Examples/mlir-opt/loop_fusion.mlir
+```
+
+```mlir
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %alloc = memref.alloc() : memref<10xf32>
+    %alloc_0 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %alloc[%arg2] : memref<10xf32>
+      affine.store %cst, %alloc_0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %0 = affine.load %alloc[%arg2] : memref<10xf32>
+      %1 = arith.addf %0, %0 : f32
+      affine.store %1, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %0 = affine.load %alloc_0[%arg2] : memref<10xf32>
+      %1 = arith.mulf %0, %0 : f32
+      affine.store %1, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
+```
+
+Options passed to a pass
+are specified via the syntax `{option1=value1 option2=value2 ...}`,
+i.e., use space-separated `key=value` pairs for each option.
+
+## Building a pass pipeline on the command line
+
+The `--pass-pipeline` flag supports combining multiple passes into a pipeline.
+So far we have used the trivial pipeline with a single pass
+that is "anchored" on the top-level `builtin.module` op.
+[Pass anchoring](/docs/PassManagement/#oppassmanager)
+is a way for passes to specify
+that they only run on particular ops.
+While many passes are anchored on `builtin.module`,
+if you try to run a pass that is anchored on some other op
+inside `--pass-pipeline="builtin.module(pass-name)"`,
+it will not run.
+
+Multiple passes can be chained together
+by providing the pass names in a comma-separated list
+in the `--pass-pipeline` string,
+e.g.,
+`--pass-pipeline="builtin.module(pass1,pass2)"`.
+The passes will be run sequentially.
+
+To use passes that have nontrivial anchoring,
+the appropriate level of nesting must be specified
+in the pass pipeline.
+For example, consider the following IR which has the same redundant code,
+but in two different levels of nesting.
+
+```mlir
+module {
+  module {
+    func.func @func1(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      func.return %2 : i32
+    }
+  }
+
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
+}
+```
+
+The following pipeline runs `cse` (common subexpression elimination)
+but only on the `func.func` inside the two `builtin.module` ops.
+
+```bash
+build/bin/mlir-opt mlir/test/Examples/mlir-opt/ctlz.mlir --pass-pipeline='
+    builtin.module(
+        builtin.module(
+            func.func(cse,canonicalize),
+            convert-to-llvm
+        )
+    )'
+```
+
+The output leaves the `gpu.module` alone
+
+```mlir
+module {
+  module {
+    llvm.func @func1(%arg0: i32) -> i32 {
+      %0 = llvm.add %arg0, %arg0 : i32
+      %1 = llvm.add %0, %0 : i32
+      llvm.return %1 : i32
+    }
+  }
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
+}
+```
+
+Specifying a pass pipeline with nested anchoring
+is also beneficial for performance reasons:
+passes with anchoring can run on IR subsets in parallel,
+which provides better threaded runtime and cache locality
+within threads.
+For example,
+even if a pass is not restricted to anchor on `func.func`,
+running `builtin.module(func.func(cse, canonicalize))`
+is more efficient than `builtin.module(cse, canonicalize)`.
+
+For a spec of the pass-pipeline textual description language,
+see [the docs](/docs/PassManagement/#textual-pass-pipeline-specification).
+For more general information on pass management, see [Pass Infrastructure](/docs/PassManagement/#).
+
+## Useful CLI flags
+
+- `--debug` prints all debug information produced by `LLVM_DEBUG` calls.
+- `--debug-only="my-tag"` prints only the debug information produced by `LLVM_DEBUG`
+  in files that have the macro `#define DEBUG_TYPE "my-tag"`.
+  This often allows you to print only debug information associated with a specific pass.
+    - `"greedy-rewriter"` only prints debug information
+      for patterns applied with the greedy rewriter engine.
+    - `"dialect-conversion"` only prints debug information
+      for the dialect conversion framework.
+ - `--emit-bytecode` emits MLIR in the bytecode format.
+ - `--mlir-pass-statistics` print statistics about the passes run.
+    These are generated via [pass statistics](/docs/PassManagement/#pass-statistics).
+ - `--mlir-print-ir-after-all` prints the IR after each pass.
+    - See also `--mlir-print-ir-after-change`, `--mlir-print-ir-after-failure`,
+      and analogous versions of these flags with `before` instead of `after`.
+    - When using `print-ir` flags, adding `--mlir-print-ir-tree-dir` writes the
+      IRs to files in a directory tree, making them easier to inspect versus a
+      large dump to the terminal.
+ - `--mlir-timing` displays execution times of each pass.
+
+## Further readering
+
+- [List of passes](/docs/Passes/)
+- [List of dialects](/docs/Dialects/)
diff --git a/mlir/include/mlir-c/BuiltinTypes.h b/mlir/include/mlir-c/BuiltinTypes.h
index 99c5e3f46b04c..2212087b9898f 100644
--- a/mlir/include/mlir-c/BuiltinTypes.h
+++ b/mlir/include/mlir-c/BuiltinTypes.h
@@ -89,6 +89,16 @@ MLIR_CAPI_EXPORTED bool mlirTypeIsAFloat8E5M2(MlirType type);
 /// context.
 MLIR_CAPI_EXPORTED MlirType mlirFloat8E5M2TypeGet(MlirContext ctx);
 
+/// Returns the typeID of an Float8E4M3 type.
+MLIR_CAPI_EXPORTED MlirTypeID mlirFloat8E4M3TypeGetTypeID(void);
+
+/// Checks whether the given type is an f8E4M3 type.
+MLIR_CAPI_EXPORTED bool mlirTypeIsAFloat8E4M3(MlirType type);
+
+/// Creates an f8E4M3 type in the given context. The type is owned by the
+/// context.
+MLIR_CAPI_EXPORTED MlirType mlirFloat8E4M3TypeGet(MlirContext ctx);
+
 /// Returns the typeID of an Float8E4M3FN type.
 MLIR_CAPI_EXPORTED MlirTypeID mlirFloat8E4M3FNTypeGetTypeID(void);
 
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index 902b45444d6c4..631b564618320 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -309,7 +309,7 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDILexicalBlockFileAttrGet(
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDILocalVariableAttrGet(
     MlirContext ctx, MlirAttribute scope, MlirAttribute name,
     MlirAttribute diFile, unsigned int line, unsigned int arg,
-    unsigned int alignInBits, MlirAttribute diType);
+    unsigned int alignInBits, MlirAttribute diType, int64_t flags);
 
 /// Creates a LLVM DISubprogramAttr attribute.
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDISubprogramAttrGet(
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 748646e605827..b5bb2f42f2961 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -47,7 +47,10 @@ def ConvertToSPIRVPass : Pass<"convert-to-spirv"> {
   let options = [
     Option<"runSignatureConversion", "run-signature-conversion", "bool",
     /*default=*/"true",
-    "Run function signature conversion to convert vector types">
+    "Run function signature conversion to convert vector types">,
+    Option<"runVectorUnrolling", "run-vector-unrolling", "bool",
+    /*default=*/"true",
+    "Run vector unrolling to convert vector types in function bodies">
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 3f27e1541cf38..aa2b4543927a7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -433,6 +433,46 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
   let assemblyFormat = "attr-dict";
 }
 
+def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
+    "The possible options for scheduling barriers",
+    [
+      I32BitEnumAttrCaseNone<"none">,
+      I32BitEnumAttrCaseBit<"non_mem_non_sideffect", 0>,
+      I32BitEnumAttrCaseBit<"valu", 1>,
+      I32BitEnumAttrCaseBit<"salu", 2>,
+      I32BitEnumAttrCaseBit<"mfma_wmma",  3>,
+      I32BitEnumAttrCaseBit<"all_vmem",  4>,
+      I32BitEnumAttrCaseBit<"vmem_read",  5>,
+      I32BitEnumAttrCaseBit<"vmem_write", 6>,
+      I32BitEnumAttrCaseBit<"all_ds", 7>,
+      I32BitEnumAttrCaseBit<"ds_read", 8>,
+      I32BitEnumAttrCaseBit<"ds_write", 9>,
+      I32BitEnumAttrCaseBit<"transcendental", 10>
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
+  "sched_barrier_opt">{
+   let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_SchedBarrierOp :
+  AMDGPU_Op<"sched_barrier">,
+  Arguments<(ins  AMDGPU_SchedBarrierOpOptAttr:$opts)>
+  {
+  let summary = "Barrier that limits the backend scheduler of instruction movement";
+  let description = [{
+    `amdgpu.sched_barrier` serves as a barrier that could be
+    configured to restrict movements of instructions through it as
+    defined by sched_barrier_opts.
+  }];
+  let assemblyFormat = [{
+    `allow` `=` $opts attr-dict
+  }];
+}
+
 def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
     "The possible permutations of the lanes storing B available in an MFMA",
     [
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 7b92b930fb5f5..ed3c21d952a01 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -109,6 +109,16 @@ bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim,
 // the support.
 bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts);
 
+/// Checks whether hyper-rectangular loop tiling of the nest represented by
+/// `loops` is valid. The validity condition is from Irigoin and Triolet,
+/// which states that two tiles cannot depend on each other. We simplify such
+/// condition to just checking whether there is any negative dependence
+/// direction, since we have the prior knowledge that the tiling results will be
+/// hyper-rectangles, which are scheduled in the lexicographically increasing
+/// order on the vector of loop indices. This function will return failure when
+/// any dependence component is negative along any of `loops`.
+bool isTilingValid(ArrayRef<AffineForOp> loops);
+
 } // namespace affine
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
index dfd64f995546a..921234daad1f1 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -202,7 +202,8 @@ def VectorLegalization
     "func::FuncDialect",
     "arm_sme::ArmSMEDialect",
     "vector::VectorDialect",
-    "arith::ArithDialect"
+    "arith::ArithDialect",
+    "index::IndexDialect"
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index 181bc5a2d07fa..3bdbfb0fedb9c 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -50,7 +50,7 @@ def AssertOp : CF_Op<"assert",
     Example:
 
     ```mlir
-    assert %b, "Expected ... to be true"
+    cf.assert %b, "Expected ... to be true"
     ```
   }];
 
@@ -118,7 +118,7 @@ def CondBranchOp : CF_Op<"cond_br",
      Pure, Terminator]> {
   let summary = "conditional branch operation";
   let description = [{
-    The `cond_br` terminator operation represents a conditional branch on a
+    The `cf.cond_br` terminator operation represents a conditional branch on a
     boolean (1-bit integer) value. If the bit is set, then the first destination
     is jumped to; if it is false, the second destination is chosen. The count
     and types of operands must align with the arguments in the corresponding
@@ -136,7 +136,7 @@ def CondBranchOp : CF_Op<"cond_br",
     ```mlir
     func.func @select(%a: i32, %b: i32, %flag: i1) -> i32 {
       // Both targets are the same, operands differ
-      cond_br %flag, ^bb1(%a : i32), ^bb1(%b : i32)
+      cf.cond_br %flag, ^bb1(%a : i32), ^bb1(%b : i32)
 
     ^bb1(%x : i32) :
       return %x : i32
@@ -233,7 +233,7 @@ def SwitchOp : CF_Op<"switch",
      Pure, Terminator]> {
   let summary = "switch operation";
   let description = [{
-    The `switch` terminator operation represents a switch on a signless integer
+    The `cf.switch` terminator operation represents a switch on a signless integer
     value. If the flag matches one of the specified cases, then the
     corresponding destination is jumped to. If the flag does not match any of
     the cases, the default destination is jumped to. The count and types of
@@ -242,7 +242,7 @@ def SwitchOp : CF_Op<"switch",
     Example:
 
     ```mlir
-    switch %flag : i32, [
+    cf.switch %flag : i32, [
       default: ^bb1(%a : i32),
       42: ^bb1(%b : i32),
       43: ^bb3(%c : i32)
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index 626b0576efd9c..f49dbf1a681d9 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -58,7 +58,7 @@ def FloatIntegerIndexOrOpaqueType : AnyTypeOf<[EmitCFloatType, IntegerIndexOrOpa
 def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> {
   let summary = "Addition operation";
   let description = [{
-    With the `add` operation the arithmetic operator + (addition) can
+    With the `emitc.add` operation the arithmetic operator + (addition) can
     be applied.
 
     Example:
@@ -81,7 +81,7 @@ def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> {
 def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> {
   let summary = "Apply operation";
   let description = [{
-    With the `apply` operation the operators & (address of) and * (contents of)
+    With the `emitc.apply` operation the operators & (address of) and * (contents of)
     can be applied to a single operand.
 
     Example:
@@ -110,7 +110,7 @@ def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> {
 def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", [CExpression]> {
   let summary = "Bitwise and operation";
   let description = [{
-    With the `bitwise_and` operation the bitwise operator & (and) can
+    With the `emitc.bitwise_and` operation the bitwise operator & (and) can
     be applied.
 
     Example:
@@ -129,7 +129,7 @@ def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift",
     [CExpression]> {
   let summary = "Bitwise left shift operation";
   let description = [{
-    With the `bitwise_left_shift` operation the bitwise operator <<
+    With the `emitc.bitwise_left_shift` operation the bitwise operator <<
     (left shift) can be applied.
 
     Example:
@@ -147,7 +147,7 @@ def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift",
 def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> {
   let summary = "Bitwise not operation";
   let description = [{
-    With the `bitwise_not` operation the bitwise operator ~ (not) can
+    With the `emitc.bitwise_not` operation the bitwise operator ~ (not) can
     be applied.
 
     Example:
@@ -165,7 +165,7 @@ def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> {
 def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", [CExpression]> {
   let summary = "Bitwise or operation";
   let description = [{
-    With the `bitwise_or` operation the bitwise operator | (or)
+    With the `emitc.bitwise_or` operation the bitwise operator | (or)
     can be applied.
 
     Example:
@@ -184,7 +184,7 @@ def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift",
     [CExpression]> {
   let summary = "Bitwise right shift operation";
   let description = [{
-    With the `bitwise_right_shift` operation the bitwise operator >>
+    With the `emitc.bitwise_right_shift` operation the bitwise operator >>
     (right shift) can be applied.
 
     Example:
@@ -202,7 +202,7 @@ def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift",
 def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> {
   let summary = "Bitwise xor operation";
   let description = [{
-    With the `bitwise_xor` operation the bitwise operator ^ (xor)
+    With the `emitc.bitwise_xor` operation the bitwise operator ^ (xor)
     can be applied.
 
     Example:
@@ -220,7 +220,7 @@ def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> {
 def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> {
   let summary = "Opaque call operation";
   let description = [{
-    The `call_opaque` operation represents a C++ function call. The callee
+    The `emitc.call_opaque` operation represents a C++ function call. The callee
     can be an arbitrary non-empty string. The call allows specifying order
     of operands and attributes in the call as follows:
 
@@ -269,7 +269,7 @@ def EmitC_CastOp : EmitC_Op<"cast",
      SameOperandsAndResultShape]> {
   let summary = "Cast operation";
   let description = [{
-    The `cast` operation performs an explicit type conversion and is emitted
+    The `emitc.cast` operation performs an explicit type conversion and is emitted
     as a C-style cast expression. It can be applied to integer, float, index
     and EmitC types.
 
@@ -293,7 +293,7 @@ def EmitC_CastOp : EmitC_Op<"cast",
 def EmitC_CmpOp : EmitC_BinaryOp<"cmp", [CExpression]> {
   let summary = "Comparison operation";
   let description = [{
-    With the `cmp` operation the comparison operators ==, !=, <, <=, >, >=, <=> 
+    With the `emitc.cmp` operation the comparison operators ==, !=, <, <=, >, >=, <=> 
     can be applied.
 
     Its first argument is an attribute that defines the comparison operator:
@@ -334,10 +334,10 @@ def EmitC_CmpOp : EmitC_BinaryOp<"cmp", [CExpression]> {
 def EmitC_ConstantOp : EmitC_Op<"constant", [ConstantLike]> {
   let summary = "Constant operation";
   let description = [{
-    The `constant` operation produces an SSA value equal to some constant
+    The `emitc.constant` operation produces an SSA value equal to some constant
     specified by an attribute. This can be used to form simple integer and
     floating point constants, as well as more exotic things like tensor
-    constants. The `constant` operation also supports the EmitC opaque
+    constants. The `emitc.constant` operation also supports the EmitC opaque
     attribute and the EmitC opaque type. Since folding is supported,
     it should not be used with pointers.
 
@@ -363,7 +363,7 @@ def EmitC_ConstantOp : EmitC_Op<"constant", [ConstantLike]> {
 def EmitC_DivOp : EmitC_BinaryOp<"div", [CExpression]> {
   let summary = "Division operation";
   let description = [{
-    With the `div` operation the arithmetic operator / (division) can
+    With the `emitc.div` operation the arithmetic operator / (division) can
     be applied.
 
     Example:
@@ -389,7 +389,7 @@ def EmitC_ExpressionOp : EmitC_Op<"expression",
        NoRegionArguments]> {
   let summary = "Expression operation";
   let description = [{
-    The `expression` operation returns a single SSA value which is yielded by
+    The `emitc.expression` operation returns a single SSA value which is yielded by
     its single-basic-block region. The operation doesn't take any arguments.
 
     As the operation is to be emitted as a C expression, the operations within
@@ -593,8 +593,8 @@ def EmitC_DeclareFuncOp : EmitC_Op<"declare_func", [
 ]> {
   let summary = "An operation to declare a function";
   let description = [{
-    The `declare_func` operation allows to insert a function declaration for an
-    `emitc.func` at a specific position. The operation only requires the `callee`
+    The `emitc.declare_func` operation allows to insert a function declaration for an
+    `emitc.func` at a specific position. The operation only requires the "callee"
     of the `emitc.func` to be specified as an attribute.
 
     Example:
@@ -733,7 +733,7 @@ def EmitC_IncludeOp
     : EmitC_Op<"include", [HasParent<"ModuleOp">]> {
   let summary = "Include operation";
   let description = [{
-    The `include` operation allows to define a source file inclusion via the
+    The `emitc.include` operation allows to define a source file inclusion via the
     `#include` directive.
 
     Example:
@@ -762,7 +762,7 @@ def EmitC_IncludeOp
 def EmitC_LiteralOp : EmitC_Op<"literal", [Pure]> {
   let summary = "Literal operation";
   let description = [{
-    The `literal` operation produces an SSA value equal to some constant
+    The `emitc.literal` operation produces an SSA value equal to some constant
     specified by an attribute.
   }];
 
@@ -776,7 +776,7 @@ def EmitC_LiteralOp : EmitC_Op<"literal", [Pure]> {
 def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> {
   let summary = "Logical and operation";
   let description = [{
-    With the `logical_and` operation the logical operator && (and) can
+    With the `emitc.logical_and` operation the logical operator && (and) can
     be applied.
 
     Example:
@@ -797,7 +797,7 @@ def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> {
 def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> {
   let summary = "Logical not operation";
   let description = [{
-    With the `logical_not` operation the logical operator ! (negation) can
+    With the `emitc.logical_not` operation the logical operator ! (negation) can
     be applied.
 
     Example:
@@ -818,7 +818,7 @@ def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> {
 def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> {
   let summary = "Logical or operation";
   let description = [{
-    With the `logical_or` operation the logical operator || (inclusive or)
+    With the `emitc.logical_or` operation the logical operator || (inclusive or)
     can be applied.
 
     Example:
@@ -839,7 +839,7 @@ def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> {
 def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> {
   let summary = "Multiplication operation";
   let description = [{
-    With the `mul` operation the arithmetic operator * (multiplication) can
+    With the `emitc.mul` operation the arithmetic operator * (multiplication) can
     be applied.
 
     Example:
@@ -863,7 +863,7 @@ def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> {
 def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> {
   let summary = "Remainder operation";
   let description = [{
-    With the `rem` operation the arithmetic operator % (remainder) can
+    With the `emitc.rem` operation the arithmetic operator % (remainder) can
     be applied.
 
     Example:
@@ -885,7 +885,7 @@ def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> {
 def EmitC_SubOp : EmitC_BinaryOp<"sub", [CExpression]> {
   let summary = "Subtraction operation";
   let description = [{
-    With the `sub` operation the arithmetic operator - (subtraction) can
+    With the `emitc.sub` operation the arithmetic operator - (subtraction) can
     be applied.
 
     Example:
@@ -911,7 +911,7 @@ def EmitC_SubOp : EmitC_BinaryOp<"sub", [CExpression]> {
 def EmitC_MemberOp : EmitC_Op<"member"> {
   let summary = "Member operation";
   let description = [{
-    With the `member` operation the member access operator `.` can be
+    With the `emitc.member` operation the member access operator `.` can be
     applied.
 
     Example:
@@ -932,7 +932,7 @@ def EmitC_MemberOp : EmitC_Op<"member"> {
 def EmitC_MemberOfPtrOp : EmitC_Op<"member_of_ptr"> {
   let summary = "Member of pointer operation";
   let description = [{
-    With the `member_of_ptr` operation the member access operator `->`
+    With the `emitc.member_of_ptr` operation the member access operator `->`
     can be applied.
 
     Example:
@@ -954,7 +954,7 @@ def EmitC_ConditionalOp : EmitC_Op<"conditional",
     [AllTypesMatch<["true_value", "false_value", "result"]>, CExpression]> {
   let summary = "Conditional (ternary) operation";
   let description = [{
-    With the `conditional` operation the ternary conditional operator can
+    With the `emitc.conditional` operation the ternary conditional operator can
     be applied.
 
     Example:
@@ -983,13 +983,13 @@ def EmitC_ConditionalOp : EmitC_Op<"conditional",
 def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> {
   let summary = "Unary minus operation";
   let description = [{
-    With the `unary_minus` operation the unary operator - (minus) can be
+    With the `emitc.unary_minus` operation the unary operator - (minus) can be
     applied.
 
     Example:
 
     ```mlir
-    %0 = emitc.unary_plus %arg0 : (i32) -> i32
+    %0 = emitc.unary_minus %arg0 : (i32) -> i32
     ```
     ```c++
     // Code emitted for the operation above.
@@ -1001,7 +1001,7 @@ def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> {
 def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", [CExpression]> {
   let summary = "Unary plus operation";
   let description = [{
-    With the `unary_plus` operation the unary operator + (plus) can be
+    With the `emitc.unary_plus` operation the unary operator + (plus) can be
     applied.
 
     Example:
@@ -1019,13 +1019,13 @@ def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", [CExpression]> {
 def EmitC_VariableOp : EmitC_Op<"variable", []> {
   let summary = "Variable operation";
   let description = [{
-    The `variable` operation produces an SSA value equal to some value
+    The `emitc.variable` operation produces an SSA value equal to some value
     specified by an attribute. This can be used to form simple integer and
     floating point variables, as well as more exotic things like tensor
-    variables. The `variable` operation also supports the EmitC opaque
+    variables. The `emitc.variable` operation also supports the EmitC opaque
     attribute and the EmitC opaque type. If further supports the EmitC
     pointer type, whereas folding is not supported.
-    The `variable` is emitted as a C/C++ local variable.
+    The `emitc.variable` is emitted as a C/C++ local variable.
 
     Example:
 
@@ -1129,7 +1129,7 @@ def EmitC_GetGlobalOp : EmitC_Op<"get_global",
 def EmitC_VerbatimOp : EmitC_Op<"verbatim"> {
   let summary = "Verbatim operation";
   let description = [{
-    The `verbatim` operation produces no results and the value is emitted as is
+    The `emitc.verbatim` operation produces no results and the value is emitted as is
     followed by a line break  ('\n' character) during translation.
 
     Note: Use with caution. This operation can have arbitrary effects on the
@@ -1163,7 +1163,7 @@ def EmitC_VerbatimOp : EmitC_Op<"verbatim"> {
 def EmitC_AssignOp : EmitC_Op<"assign", []> {
   let summary = "Assign operation";
   let description = [{
-    The `assign` operation stores an SSA value to the location designated by an
+    The `emitc.assign` operation stores an SSA value to the location designated by an
     EmitC variable. This operation doesn't return any value. The assigned value
     must be of the same type as the variable being assigned. The operation is
     emitted as a C/C++ '=' operator.
@@ -1191,11 +1191,11 @@ def EmitC_YieldOp : EmitC_Op<"yield",
       [Pure, Terminator, ParentOneOf<["ExpressionOp", "IfOp", "ForOp"]>]> {
   let summary = "block termination operation";
   let description = [{
-    "yield" terminates its parent EmitC op's region, optionally yielding
+    The `emitc.yield` terminates its parent EmitC op's region, optionally yielding
     an SSA value. The semantics of how the values are yielded is defined by the
     parent operation.
-    If "yield" has an operand, the operand must match the parent operation's
-    result. If the parent operation defines no values, then the "emitc.yield"
+    If `emitc.yield` has an operand, the operand must match the parent operation's
+    result. If the parent operation defines no values, then the `emitc.yield`
     may be left out in the custom syntax and the builders will insert one
     implicitly. Otherwise, it has to be present in the syntax to indicate which
     value is yielded.
@@ -1216,7 +1216,7 @@ def EmitC_IfOp : EmitC_Op<"if",
     RecursiveMemoryEffects, NoRegionArguments]> {
   let summary = "if-then-else operation";
   let description = [{
-    The `if` operation represents an if-then-else construct for
+    The `emitc.if` operation represents an if-then-else construct for
     conditionally executing two regions of code. The operand to an if operation
     is a boolean value. For example:
 
@@ -1268,7 +1268,7 @@ def EmitC_IfOp : EmitC_Op<"if",
 def EmitC_SubscriptOp : EmitC_Op<"subscript", []> {
   let summary = "Subscript operation";
   let description = [{
-    With the `subscript` operation the subscript operator `[]` can be applied
+    With the `emitc.subscript` operation the subscript operator `[]` can be applied
     to variables or arguments of array, pointer and opaque type.
 
     Example:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index aefc02bce40fb..695a962bcab9b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -532,17 +532,18 @@ def LLVM_DILocalVariableAttr : LLVM_Attr<"DILocalVariable", "di_local_variable",
     OptionalParameter<"unsigned">:$line,
     OptionalParameter<"unsigned">:$arg,
     OptionalParameter<"unsigned">:$alignInBits,
-    OptionalParameter<"DITypeAttr">:$type
+    OptionalParameter<"DITypeAttr">:$type,
+    OptionalParameter<"DIFlags", "DIFlags::Zero">:$flags
   );
   let builders = [
     AttrBuilderWithInferredContext<(ins
       "DIScopeAttr":$scope, "StringRef":$name, "DIFileAttr":$file,
       "unsigned":$line, "unsigned":$arg, "unsigned":$alignInBits,
-      "DITypeAttr":$type
+      "DITypeAttr":$type, "DIFlags":$flags
     ), [{
       MLIRContext *ctx = scope.getContext();
       return $_get(ctx, scope, StringAttr::get(ctx, name), file, line,
-                   arg, alignInBits, type);
+                   arg, alignInBits, type, flags);
     }]>
   ];
   let assemblyFormat = "`<` struct(params) `>`";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 06656c791c594..260d42185b57f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -59,22 +59,9 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
                                    list<Trait> traits = []> :
     LLVM_ArithmeticOpBase<AnySignlessInteger, mnemonic, instName,
     !listconcat([DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)> {
-  dag iofArg = (ins EnumProperty<"IntegerOverflowFlags">:$overflowFlags);
+  dag iofArg = (ins EnumProperty<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
   let arguments = !con(commonArgs, iofArg);
 
-  let builders = [
-    OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs,
-                   "IntegerOverflowFlags":$overflowFlags), [{
-      $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
-      build($_builder, $_state, type, lhs, rhs);
-    }]>,
-    OpBuilder<(ins "Value":$lhs, "Value":$rhs,
-                   "IntegerOverflowFlags":$overflowFlags), [{
-      $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
-      build($_builder, $_state, lhs, rhs);
-    }]>
-  ];
-
   string mlirBuilder = [{
     auto op = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs);
     moduleImport.setIntegerOverflowFlags(inst, op);
@@ -560,14 +547,14 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
                       DeclareOpInterfaceMethods<BranchWeightOpInterface>,
                       Terminator]> {
   let arguments = (ins
-                   OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$callee_type,
+                   OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
                    OptionalAttr<FlatSymbolRefAttr>:$callee,
                    Variadic<LLVM_Type>:$callee_operands,
                    Variadic<LLVM_Type>:$normalDestOperands,
                    Variadic<LLVM_Type>:$unwindDestOperands,
                    OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
                    DefaultValuedAttr<CConv, "CConv::C">:$CConv);
-  let results = (outs Variadic<LLVM_Type>);
+  let results = (outs Optional<LLVM_Type>:$result);
   let successors = (successor AnySuccessor:$normalDest,
                               AnySuccessor:$unwindDest);
 
@@ -617,11 +604,12 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
     start with a function name (`@`-prefixed) and indirect calls start with an
     SSA value (`%`-prefixed). The direct callee, if present, is stored as a
     function attribute `callee`. For indirect calls, the callee is of `!llvm.ptr` type
-    and is stored as the first value in `callee_operands`. If the callee is a variadic
-    function, then the `callee_type` attribute must carry the function type. The
-    trailing type list contains the optional indirect callee type and the MLIR
-    function type, which differs from the LLVM function type that uses a explicit
-    void type to model functions that do not return a value.
+    and is stored as the first value in `callee_operands`. If and only if the
+    callee is a variadic function, the `var_callee_type` attribute must carry
+    the variadic LLVM function type. The trailing type list contains the
+    optional indirect callee type and the MLIR function type, which differs from
+    the LLVM function type that uses an explicit void type to model functions
+    that do not return a value.
 
     Examples:
 
@@ -644,14 +632,19 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
     ```
   }];
 
-  dag args = (ins OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$callee_type,
+  dag args = (ins OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
                   OptionalAttr<FlatSymbolRefAttr>:$callee,
                   Variadic<LLVM_Type>:$callee_operands,
                   DefaultValuedAttr<LLVM_FastmathFlagsAttr,
                                    "{}">:$fastmathFlags,
                   OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
                   DefaultValuedAttr<CConv, "CConv::C">:$CConv,
-                  DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind);
+                  DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind,
+                  OptionalAttr<LLVM_MemoryEffectsAttr>:$memory_effects,
+                  OptionalAttr<UnitAttr>:$convergent,
+                  OptionalAttr<UnitAttr>:$no_unwind,
+                  OptionalAttr<UnitAttr>:$will_return
+                  );
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
@@ -1433,7 +1426,7 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [
     OptionalAttr<DictArrayAttr>:$arg_attrs,
     OptionalAttr<DictArrayAttr>:$res_attrs,
     OptionalAttr<I64Attr>:$function_entry_count,
-    OptionalAttr<LLVM_MemoryEffectsAttr>:$memory,
+    OptionalAttr<LLVM_MemoryEffectsAttr>:$memory_effects,
     DefaultValuedAttr<Visibility, "mlir::LLVM::Visibility::Default">:$visibility_,
     OptionalAttr<UnitAttr>:$arm_streaming,
     OptionalAttr<UnitAttr>:$arm_locally_streaming,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index 93733ccd4929a..1befdfa74f67c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -67,7 +67,6 @@ namespace LLVM {
 
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128");
-DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
index 3af642752724c..db25c9b241734 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
@@ -30,6 +30,10 @@ class GenericOp;
 class LinalgOp;
 } // namespace linalg
 
+namespace scf {
+struct SCFTilingResult;
+} // namespace scf
+
 namespace tensor {
 class InsertSliceOp;
 class PackOp;
@@ -60,7 +64,7 @@ tileToForallOpImpl(RewriterBase &rewriter, transform::TransformState &state,
                    ArrayRef<OpFoldResult> mixedNumThreads,
                    ArrayRef<OpFoldResult> mixedTileSizes,
                    std::optional<ArrayAttr> mapping,
-                   linalg::ForallTilingResult &tilingResult);
+                   scf::SCFTilingResult &tilingResult);
 
 } // namespace transform
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0c7a8edff222f..477ef7bfafb18 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -866,29 +866,6 @@ FailureOr<ContinuousTileSizeSpecification>
 computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
                            unsigned dimension, OpFoldResult targetSize,
                            bool emitAssertions);
-/// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying
-/// tiling by `numThreads`.
-/// If non-empty, the `mapping` is added as an attribute to the
-/// resulting `scf.forall`.
-/// Zero tile sizes indicate that the dimension is not tiled, and can be
-/// thought of as tiling by the full size of data. It is the user's
-/// responsibility to ensure that `numThreads` is a valid tiling specification
-/// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case).
-struct ForallTilingResult {
-  Operation *tileOp;
-  Operation *tiledOp;
-};
-FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder,
-                                             TilingInterface op,
-                                             ArrayRef<OpFoldResult> numThreads,
-                                             std::optional<ArrayAttr> mapping);
-
-/// Same as `tileToForallOp`, but calculate the number of threads
-/// required using the given tileSizes.
-FailureOr<ForallTilingResult>
-tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
-                             ArrayRef<OpFoldResult> tileSizes,
-                             std::optional<ArrayAttr> mapping);
 
 /// Transformation information returned after reduction tiling.
 struct ForallReductionTilingResult {
@@ -1750,10 +1727,12 @@ void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m,
 void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns);
 
 /// Adds patterns that reduce the rank of named contraction ops that have
-/// unit dimensions in the operand(s) by converting to a sequence of `collapse_shape`,
-/// `<corresponding linalg named op>`, `expand_shape` (if on tensors).  For example a
-/// `linalg.batch_matmul` with unit batch size will convert to `linalg.matmul`
-/// and a `linalg.matvec` with with unit spatial dim in lhs will convert to a `linalg.dot`.
+/// unit dimensions in the operand(s) by converting to a sequence of
+/// `collapse_shape`,
+/// `<corresponding linalg named op>`, `expand_shape` (if on tensors).  For
+/// example a `linalg.batch_matmul` with unit batch size will convert to
+/// `linalg.matmul` and a `linalg.matvec` with with unit spatial dim in lhs will
+/// convert to a `linalg.dot`.
 void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns);
 
 } // namespace linalg
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index dda8f31e688fe..1f52f6b91617c 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -20,6 +20,7 @@
 #ifndef NVGPU
 #define NVGPU
 
+include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/OpBase.td"
@@ -109,10 +110,22 @@ def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
   let cppNamespace = "::mlir::nvgpu";
 }
 
+def RcpApprox : I32EnumAttrCase<"APPROX", 0, "approx">;
+def RcpRN     : I32EnumAttrCase<"RN", 1, "rn">;
+def RcpRZ     : I32EnumAttrCase<"RZ", 2, "rz">;
+def RcpRM     : I32EnumAttrCase<"RM", 3, "rm">;
+def RcpRP     : I32EnumAttrCase<"RP", 4, "rp">;
+def RcpRoundingMode   : I32EnumAttr<"RcpRoundingMode", "Rounding mode of rcp",
+  [RcpApprox, RcpRN, RcpRZ, RcpRM, RcpRP]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::nvgpu";
+}
+
 def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
 def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
 def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
 def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
 
 //===----------------------------------------------------------------------===//
 // NVGPU Type Definitions
@@ -802,4 +815,24 @@ def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulat
   let hasVerifier = 1;
 }
 
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+                                   SameOperandsAndResultType]> {
+  let summary = "The reciprocal calculation for vector types";
+  let description = [{
+    Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
+
+    Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
+
+    The input and output must be of the same vector type and shape.
+  }];
+  let arguments = (ins VectorOf<[F32]>:$in,
+                       DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+                       UnitAttr:$ftz);
+  let results = (outs VectorOf<[F32]>:$out);
+  let assemblyFormat = [{
+    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}` 
+    attr-dict `:` type($out)
+  }];
+  let hasVerifier = 1;
+}
 #endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index 19070f6f062a0..aad2ac6f4dd2b 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -17,6 +17,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 #include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
diff --git a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
index 2aa4b1828e0ad..66b1e89a9881d 100644
--- a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
@@ -21,9 +21,10 @@ mlir_tablegen(OpenACCOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=a
 add_public_tablegen_target(MLIROpenACCAttributesIncGen)
 add_dependencies(mlir-headers MLIROpenACCAttributesIncGen)
 
+add_mlir_interface(OpenACCOpsInterfaces)
+
 set(LLVM_TARGET_DEFINITIONS OpenACCTypeInterfaces.td)
 mlir_tablegen(OpenACCTypeInterfaces.h.inc -gen-type-interface-decls)
 mlir_tablegen(OpenACCTypeInterfaces.cpp.inc -gen-type-interface-defs)
 add_public_tablegen_target(MLIROpenACCTypeInterfacesIncGen)
 add_dependencies(mlir-headers MLIROpenACCTypeInterfacesIncGen)
-
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index 8239367fdd3e7..ca96ce62ae404 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -22,6 +22,7 @@
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc"
 #include "mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc"
+#include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc"
 #include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.h.inc"
 #include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 148bed62aa8f2..d9f38259c0ace 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -22,6 +22,7 @@ include "mlir/IR/OpBase.td"
 include "mlir/IR/SymbolInterfaces.td"
 include "mlir/Dialect/OpenACC/OpenACCBase.td"
 include "mlir/Dialect/OpenACC/OpenACCOpsTypes.td"
+include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td"
 include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td"
 include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td"
 include "mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.td"
@@ -1067,7 +1068,9 @@ def OpenACC_ReductionRecipeOp : OpenACC_Op<"reduction.recipe",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_ParallelOp : OpenACC_Op<"parallel",
-    [AttrSizedOperandSegments, RecursiveMemoryEffects,
+    [AttrSizedOperandSegments, AutomaticAllocationScope,
+     RecursiveMemoryEffects,
+     DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
      MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
                     MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "parallel construct";
@@ -1231,7 +1234,9 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_SerialOp : OpenACC_Op<"serial",
-    [AttrSizedOperandSegments, RecursiveMemoryEffects,
+    [AttrSizedOperandSegments, AutomaticAllocationScope,
+     RecursiveMemoryEffects,
+     DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
      MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
                     MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "serial construct";
@@ -1347,7 +1352,9 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_KernelsOp : OpenACC_Op<"kernels",
-    [AttrSizedOperandSegments, RecursiveMemoryEffects,
+    [AttrSizedOperandSegments, AutomaticAllocationScope,
+     RecursiveMemoryEffects,
+     DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
      MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
                     MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "kernels construct";
@@ -1737,9 +1744,11 @@ def OpenACC_HostDataOp : OpenACC_Op<"host_data",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_LoopOp : OpenACC_Op<"loop",
-    [AttrSizedOperandSegments, RecursiveMemoryEffects,
-     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>,
-     DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
+    [AttrSizedOperandSegments, AutomaticAllocationScope,
+     RecursiveMemoryEffects,
+     DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
+     DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
   let summary = "loop construct";
 
   let description = [{
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
new file mode 100644
index 0000000000000..6fb9a950489f8
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -0,0 +1,29 @@
+//===-- OpenACCOpsInterfaces.td - OpenACC type interfaces ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENACC_OPS_INTERFACES
+#define OPENACC_OPS_INTERFACES
+
+include "mlir/IR/OpBase.td"
+
+def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for compute and loop construct operations.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Get alloca block", "::mlir::Block*", "getAllocaBlock",
+      (ins), [{
+        return &$_op.getRegion().front();
+      }]>,
+  ];
+}
+
+#endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index d3422f6e48b06..dd349d1392e7b 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -4,14 +4,14 @@ add_public_tablegen_target(omp_common_td)
 
 set(LLVM_TARGET_DEFINITIONS OpenMPOps.td)
 
-# Run the OpenMP verifier tablegen pseudo-backend while preventing the produced
-# dummy output from being added as a dependency to any tablegen targets defined
-# below.
-set(TABLEGEN_OUTPUT_TMP ${TABLEGEN_OUTPUT})
+# The OpenMP verifier tablegen pseudo-backend does not produce any output, but
+# mlir_tablegen expects an output file name to be passed. An empty "no-output"
+# file is created by the statement below.
+#
+# This output will be added to the list of dependencies of the
+# MLIROpenMPOpsIncGen target below, which results in triggering this
+# verification pass every time OpenMPOps.td is modified and recompiled.
 mlir_tablegen(no-output -verify-openmp-ops)
-file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/no-output ${CMAKE_CURRENT_BINARY_DIR}/no-output.d)
-set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT_TMP})
-unset(TABLEGEN_OUTPUT_TMP)
 
 mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp)
 mlir_tablegen(OpenMPOpsDialect.cpp.inc -gen-dialect-defs -dialect=omp)
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
index 0eefe06055b7d..38e4d8f245e4f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
@@ -32,37 +32,37 @@ namespace omp {
 
 struct AlignedClauseOps {
   llvm::SmallVector<Value> alignedVars;
-  llvm::SmallVector<Attribute> alignmentAttrs;
+  llvm::SmallVector<Attribute> alignments;
 };
 
 struct AllocateClauseOps {
-  llvm::SmallVector<Value> allocatorVars, allocateVars;
+  llvm::SmallVector<Value> allocateVars, allocatorVars;
 };
 
 struct CancelDirectiveNameClauseOps {
-  ClauseCancellationConstructTypeAttr cancelDirectiveNameAttr;
-};
-
-struct CollapseClauseOps {
-  llvm::SmallVector<Value> loopLBVar, loopUBVar, loopStepVar;
+  ClauseCancellationConstructTypeAttr cancelDirective;
 };
 
 struct CopyprivateClauseOps {
   llvm::SmallVector<Value> copyprivateVars;
-  llvm::SmallVector<Attribute> copyprivateFuncs;
+  llvm::SmallVector<Attribute> copyprivateSyms;
 };
 
 struct CriticalNameClauseOps {
-  StringAttr criticalNameAttr;
+  /// This field has a generic name because it's mirroring the `sym_name`
+  /// argument of the `OpenMP_CriticalNameClause` tablegen definition. That one
+  /// can't be renamed to anything more specific because the `sym_name` name is
+  /// a requirement of the `Symbol` MLIR trait associated with that clause.
+  StringAttr symName;
 };
 
 struct DependClauseOps {
-  llvm::SmallVector<Attribute> dependTypeAttrs;
+  llvm::SmallVector<Attribute> dependKinds;
   llvm::SmallVector<Value> dependVars;
 };
 
 struct DeviceClauseOps {
-  Value deviceVar;
+  Value device;
 };
 
 struct DeviceTypeClauseOps {
@@ -71,26 +71,26 @@ struct DeviceTypeClauseOps {
 };
 
 struct DistScheduleClauseOps {
-  UnitAttr distScheduleStaticAttr;
-  Value distScheduleChunkSizeVar;
+  UnitAttr distScheduleStatic;
+  Value distScheduleChunkSize;
 };
 
 struct DoacrossClauseOps {
-  llvm::SmallVector<Value> doacrossVectorVars;
-  ClauseDependAttr doacrossDependTypeAttr;
-  IntegerAttr doacrossNumLoopsAttr;
+  ClauseDependAttr doacrossDependType;
+  IntegerAttr doacrossNumLoops;
+  llvm::SmallVector<Value> doacrossDependVars;
 };
 
 struct FilterClauseOps {
-  Value filteredThreadIdVar;
+  Value filteredThreadId;
 };
 
 struct FinalClauseOps {
-  Value finalVar;
+  Value final;
 };
 
 struct GrainsizeClauseOps {
-  Value grainsizeVar;
+  Value grainsize;
 };
 
 struct HasDeviceAddrClauseOps {
@@ -98,7 +98,7 @@ struct HasDeviceAddrClauseOps {
 };
 
 struct HintClauseOps {
-  IntegerAttr hintAttr;
+  IntegerAttr hint;
 };
 
 struct IfClauseOps {
@@ -107,8 +107,8 @@ struct IfClauseOps {
 
 struct InReductionClauseOps {
   llvm::SmallVector<Value> inReductionVars;
-  llvm::SmallVector<bool> inReductionVarsByRef;
-  llvm::SmallVector<Attribute> inReductionDeclSymbols;
+  llvm::SmallVector<bool> inReductionByref;
+  llvm::SmallVector<Attribute> inReductionSyms;
 };
 
 struct IsDevicePtrClauseOps {
@@ -120,7 +120,8 @@ struct LinearClauseOps {
 };
 
 struct LoopRelatedOps {
-  UnitAttr loopInclusiveAttr;
+  llvm::SmallVector<Value> loopLowerBounds, loopUpperBounds, loopSteps;
+  UnitAttr loopInclusive;
 };
 
 struct MapClauseOps {
@@ -128,11 +129,11 @@ struct MapClauseOps {
 };
 
 struct MergeableClauseOps {
-  UnitAttr mergeableAttr;
+  UnitAttr mergeable;
 };
 
 struct NogroupClauseOps {
-  UnitAttr nogroupAttr;
+  UnitAttr nogroup;
 };
 
 struct NontemporalClauseOps {
@@ -140,36 +141,36 @@ struct NontemporalClauseOps {
 };
 
 struct NowaitClauseOps {
-  UnitAttr nowaitAttr;
+  UnitAttr nowait;
 };
 
 struct NumTasksClauseOps {
-  Value numTasksVar;
+  Value numTasks;
 };
 
 struct NumTeamsClauseOps {
-  Value numTeamsLowerVar, numTeamsUpperVar;
+  Value numTeamsLower, numTeamsUpper;
 };
 
 struct NumThreadsClauseOps {
-  Value numThreadsVar;
+  Value numThreads;
 };
 
 struct OrderClauseOps {
-  ClauseOrderKindAttr orderAttr;
-  OrderModifierAttr orderModAttr;
+  ClauseOrderKindAttr order;
+  OrderModifierAttr orderMod;
 };
 
 struct OrderedClauseOps {
-  IntegerAttr orderedAttr;
+  IntegerAttr ordered;
 };
 
 struct ParallelizationLevelClauseOps {
-  UnitAttr parLevelSimdAttr;
+  UnitAttr parLevelSimd;
 };
 
 struct PriorityClauseOps {
-  Value priorityVar;
+  Value priority;
 };
 
 struct PrivateClauseOps {
@@ -179,46 +180,46 @@ struct PrivateClauseOps {
   llvm::SmallVector<Value> privateVars;
   // The list of symbols referring to delayed privatizer ops (i.e. `omp.private`
   // ops).
-  llvm::SmallVector<Attribute> privatizers;
+  llvm::SmallVector<Attribute> privateSyms;
 };
 
 struct ProcBindClauseOps {
-  ClauseProcBindKindAttr procBindKindAttr;
+  ClauseProcBindKindAttr procBindKind;
 };
 
 struct ReductionClauseOps {
   llvm::SmallVector<Value> reductionVars;
-  llvm::SmallVector<bool> reductionVarsByRef;
-  llvm::SmallVector<Attribute> reductionDeclSymbols;
+  llvm::SmallVector<bool> reductionByref;
+  llvm::SmallVector<Attribute> reductionSyms;
 };
 
 struct SafelenClauseOps {
-  IntegerAttr safelenAttr;
+  IntegerAttr safelen;
 };
 
 struct ScheduleClauseOps {
-  ClauseScheduleKindAttr scheduleValAttr;
-  ScheduleModifierAttr scheduleModAttr;
-  Value scheduleChunkVar;
-  UnitAttr scheduleSimdAttr;
+  ClauseScheduleKindAttr scheduleKind;
+  Value scheduleChunk;
+  ScheduleModifierAttr scheduleMod;
+  UnitAttr scheduleSimd;
 };
 
 struct SimdlenClauseOps {
-  IntegerAttr simdlenAttr;
+  IntegerAttr simdlen;
 };
 
 struct TaskReductionClauseOps {
   llvm::SmallVector<Value> taskReductionVars;
-  llvm::SmallVector<bool> taskReductionVarsByRef;
-  llvm::SmallVector<Attribute> taskReductionDeclSymbols;
+  llvm::SmallVector<bool> taskReductionByref;
+  llvm::SmallVector<Attribute> taskReductionSyms;
 };
 
 struct ThreadLimitClauseOps {
-  Value threadLimitVar;
+  Value threadLimit;
 };
 
 struct UntiedClauseOps {
-  UnitAttr untiedAttr;
+  UnitAttr untied;
 };
 
 struct UseDeviceAddrClauseOps {
@@ -241,82 +242,81 @@ template <typename... Mixins>
 struct Clauses : public Mixins... {};
 } // namespace detail
 
-using CancelClauseOps =
+using CancelOperands =
     detail::Clauses<CancelDirectiveNameClauseOps, IfClauseOps>;
 
-using CancellationPointClauseOps =
-    detail::Clauses<CancelDirectiveNameClauseOps>;
+using CancellationPointOperands = detail::Clauses<CancelDirectiveNameClauseOps>;
 
-using CriticalClauseOps = detail::Clauses<CriticalNameClauseOps, HintClauseOps>;
+using CriticalDeclareOperands =
+    detail::Clauses<CriticalNameClauseOps, HintClauseOps>;
 
 // TODO `indirect` clause.
-using DeclareTargetClauseOps = detail::Clauses<DeviceTypeClauseOps>;
+using DeclareTargetOperands = detail::Clauses<DeviceTypeClauseOps>;
 
-using DistributeClauseOps =
+using DistributeOperands =
     detail::Clauses<AllocateClauseOps, DistScheduleClauseOps, OrderClauseOps,
                     PrivateClauseOps>;
 
-using LoopNestClauseOps = detail::Clauses<CollapseClauseOps, LoopRelatedOps>;
+using LoopNestOperands = detail::Clauses<LoopRelatedOps>;
 
-using MaskedClauseOps = detail::Clauses<FilterClauseOps>;
+using MaskedOperands = detail::Clauses<FilterClauseOps>;
 
-using OrderedOpClauseOps = detail::Clauses<DoacrossClauseOps>;
+using OrderedOperands = detail::Clauses<DoacrossClauseOps>;
 
-using OrderedRegionClauseOps = detail::Clauses<ParallelizationLevelClauseOps>;
+using OrderedRegionOperands = detail::Clauses<ParallelizationLevelClauseOps>;
 
-using ParallelClauseOps =
+using ParallelOperands =
     detail::Clauses<AllocateClauseOps, IfClauseOps, NumThreadsClauseOps,
                     PrivateClauseOps, ProcBindClauseOps, ReductionClauseOps>;
 
-using SectionsClauseOps = detail::Clauses<AllocateClauseOps, NowaitClauseOps,
-                                          PrivateClauseOps, ReductionClauseOps>;
+using SectionsOperands = detail::Clauses<AllocateClauseOps, NowaitClauseOps,
+                                         PrivateClauseOps, ReductionClauseOps>;
 
-// TODO `linear` clause.
-using SimdClauseOps =
-    detail::Clauses<AlignedClauseOps, IfClauseOps, NontemporalClauseOps,
-                    OrderClauseOps, PrivateClauseOps, ReductionClauseOps,
-                    SafelenClauseOps, SimdlenClauseOps>;
+using SimdOperands =
+    detail::Clauses<AlignedClauseOps, IfClauseOps, LinearClauseOps,
+                    NontemporalClauseOps, OrderClauseOps, PrivateClauseOps,
+                    ReductionClauseOps, SafelenClauseOps, SimdlenClauseOps>;
 
-using SingleClauseOps = detail::Clauses<AllocateClauseOps, CopyprivateClauseOps,
-                                        NowaitClauseOps, PrivateClauseOps>;
+using SingleOperands = detail::Clauses<AllocateClauseOps, CopyprivateClauseOps,
+                                       NowaitClauseOps, PrivateClauseOps>;
 
 // TODO `defaultmap`, `uses_allocators` clauses.
-using TargetClauseOps =
+using TargetOperands =
     detail::Clauses<AllocateClauseOps, DependClauseOps, DeviceClauseOps,
                     HasDeviceAddrClauseOps, IfClauseOps, InReductionClauseOps,
                     IsDevicePtrClauseOps, MapClauseOps, NowaitClauseOps,
                     PrivateClauseOps, ThreadLimitClauseOps>;
 
-using TargetDataClauseOps =
+using TargetDataOperands =
     detail::Clauses<DeviceClauseOps, IfClauseOps, MapClauseOps,
                     UseDeviceAddrClauseOps, UseDevicePtrClauseOps>;
 
-using TargetEnterExitUpdateDataClauseOps =
+using TargetEnterExitUpdateDataOperands =
     detail::Clauses<DependClauseOps, DeviceClauseOps, IfClauseOps, MapClauseOps,
                     NowaitClauseOps>;
 
 // TODO `affinity`, `detach` clauses.
-using TaskClauseOps =
+using TaskOperands =
     detail::Clauses<AllocateClauseOps, DependClauseOps, FinalClauseOps,
                     IfClauseOps, InReductionClauseOps, MergeableClauseOps,
                     PriorityClauseOps, PrivateClauseOps, UntiedClauseOps>;
 
-using TaskgroupClauseOps =
+using TaskgroupOperands =
     detail::Clauses<AllocateClauseOps, TaskReductionClauseOps>;
 
-using TaskloopClauseOps =
+using TaskloopOperands =
     detail::Clauses<AllocateClauseOps, FinalClauseOps, GrainsizeClauseOps,
                     IfClauseOps, InReductionClauseOps, MergeableClauseOps,
                     NogroupClauseOps, NumTasksClauseOps, PriorityClauseOps,
                     PrivateClauseOps, ReductionClauseOps, UntiedClauseOps>;
 
-using TaskwaitClauseOps = detail::Clauses<DependClauseOps, NowaitClauseOps>;
+using TaskwaitOperands = detail::Clauses<DependClauseOps, NowaitClauseOps>;
 
-using TeamsClauseOps =
+using TeamsOperands =
     detail::Clauses<AllocateClauseOps, IfClauseOps, NumTeamsClauseOps,
                     PrivateClauseOps, ReductionClauseOps, ThreadLimitClauseOps>;
 
-using WsloopClauseOps =
+using WsloopOperands =
     detail::Clauses<AllocateClauseOps, LinearClauseOps, NowaitClauseOps,
                     OrderClauseOps, OrderedClauseOps, PrivateClauseOps,
                     ReductionClauseOps, ScheduleClauseOps>;
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
index 99150bc5dff39..e703c323edbc8 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
@@ -20,6 +20,7 @@
 #define OPENMP_CLAUSES
 
 include "mlir/Dialect/OpenMP/OpenMPOpBase.td"
+include "mlir/IR/SymbolInterfaces.td"
 
 //===----------------------------------------------------------------------===//
 // V5.2: [5.11] `aligned` clause
@@ -32,18 +33,18 @@ class OpenMP_AlignedClauseSkip<
                     description, extraClassDeclaration> {
   let arguments = (ins
     Variadic<OpenMP_PointerLikeType>:$aligned_vars,
-    OptionalAttr<I64ArrayAttr>:$alignment_values
+    OptionalAttr<I64ArrayAttr>:$alignments
   );
 
   let assemblyFormat = [{
     `aligned` `(` custom<AlignedClause>($aligned_vars, type($aligned_vars),
-                                        $alignment_values) `)`
+                                        $alignments) `)`
   }];
 
   let description = [{
-    The `alignment_values` attribute additionally specifies alignment of each
-    corresponding aligned operand. Note that `aligned_vars` and
-    `alignment_values` should contain the same number of elements.
+    The `alignments` attribute additionally specifies alignment of each
+    corresponding aligned operand. Note that `aligned_vars` and `alignments`
+    must contain the same number of elements.
   }];
 }
 
@@ -60,22 +61,22 @@ class OpenMP_AllocateClauseSkip<
                     description, extraClassDeclaration> {
   let arguments = (ins
     Variadic<AnyType>:$allocate_vars,
-    Variadic<AnyType>:$allocators_vars
+    Variadic<AnyType>:$allocator_vars
   );
 
   let extraClassDeclaration = [{
     unsigned getNumAllocateVars() { return getAllocateVars().size(); }
-    unsigned getNumAllocatorsVars() { return getAllocatorsVars().size(); }
+    unsigned getNumAllocatorsVars() { return getAllocatorVars().size(); }
   }];
 
   let assemblyFormat = [{
     `allocate` `(`
       custom<AllocateAndAllocator>($allocate_vars, type($allocate_vars),
-                                   $allocators_vars, type($allocators_vars)) `)`
+                                   $allocator_vars, type($allocator_vars)) `)`
   }];
 
   let description = [{
-    The `allocators_vars` and `allocate_vars` parameters are a variadic list of
+    The `allocator_vars` and `allocate_vars` parameters are a variadic list of
     values that specify the memory allocator to be used to obtain storage for
     private values.
   }];
@@ -93,12 +94,12 @@ class OpenMP_CancelDirectiveNameClauseSkip<
   > : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    CancellationConstructTypeAttr:$cancellation_construct_type_val
+    CancellationConstructTypeAttr:$cancel_directive
   );
 
   let assemblyFormat = [{
     `cancellation_construct_type` `(`
-      custom<ClauseAttr>($cancellation_construct_type_val) `)`
+      custom<ClauseAttr>($cancel_directive) `)`
   }];
 
   // TODO: Add description.
@@ -106,36 +107,6 @@ class OpenMP_CancelDirectiveNameClauseSkip<
 
 def OpenMP_CancelDirectiveNameClause : OpenMP_CancelDirectiveNameClauseSkip<>;
 
-//===----------------------------------------------------------------------===//
-// V5.2: [4.4.3] `collapse` clause
-//===----------------------------------------------------------------------===//
-
-class OpenMP_CollapseClauseSkip<
-    bit traits = false, bit arguments = false, bit assemblyFormat = false,
-    bit description = false, bit extraClassDeclaration = false
-  > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
-                    description, extraClassDeclaration> {
-  let traits = [
-    AllTypesMatch<["lowerBound", "upperBound", "step"]>
-  ];
-
-  let arguments = (ins
-    Variadic<IntLikeType>:$lowerBound,
-    Variadic<IntLikeType>:$upperBound,
-    Variadic<IntLikeType>:$step
-  );
-
-  let extraClassDeclaration = [{
-    /// Returns the number of loops in the loop nest.
-    unsigned getNumLoops() { return getLowerBound().size(); }
-  }];
-
-  // Description and formatting integrated in the `omp.loop_nest` operation,
-  // which is the only one currently accepting this clause.
-}
-
-def OpenMP_CollapseClause : OpenMP_CollapseClauseSkip<>;
-
 //===----------------------------------------------------------------------===//
 // V5.2: [5.7.2] `copyprivate` clause
 //===----------------------------------------------------------------------===//
@@ -147,13 +118,13 @@ class OpenMP_CopyprivateClauseSkip<
                     description, extraClassDeclaration> {
   let arguments = (ins
     Variadic<OpenMP_PointerLikeType>:$copyprivate_vars,
-    OptionalAttr<SymbolRefArrayAttr>:$copyprivate_funcs
+    OptionalAttr<SymbolRefArrayAttr>:$copyprivate_syms
   );
 
   let assemblyFormat = [{
     `copyprivate` `(`
-      custom<CopyPrivateVarList>($copyprivate_vars, type($copyprivate_vars),
-                                 $copyprivate_funcs) `)`
+      custom<Copyprivate>($copyprivate_vars, type($copyprivate_vars),
+                          $copyprivate_syms) `)`
   }];
 
   let description = [{
@@ -174,6 +145,10 @@ class OpenMP_CriticalNameClauseSkip<
     bit description = false, bit extraClassDeclaration = false
   > : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
+  let traits = [
+    Symbol
+  ];
+
   let arguments = (ins
     SymbolNameAttr:$sym_name
   );
@@ -197,18 +172,19 @@ class OpenMP_DependClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<TaskDependArrayAttr>:$depends,
+    OptionalAttr<TaskDependArrayAttr>:$depend_kinds,
     Variadic<OpenMP_PointerLikeType>:$depend_vars
   );
 
   let assemblyFormat = [{
     `depend` `(`
-      custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
+      custom<DependVarList>($depend_vars, type($depend_vars), $depend_kinds) `)`
   }];
 
   let description = [{
-    The `depends` and `depend_vars` arguments are variadic lists of values that
-    specify the dependencies of this particular task in relation to other tasks.
+    The `depend_kinds` and `depend_vars` arguments are variadic lists of values
+    that specify the dependencies of this particular task in relation to other
+    tasks.
   }];
 }
 
@@ -250,19 +226,20 @@ class OpenMP_DistScheduleClauseSkip<
                     description, extraClassDeclaration> {
   let arguments = (ins
     UnitAttr:$dist_schedule_static,
-    Optional<IntLikeType>:$chunk_size
+    Optional<IntLikeType>:$dist_schedule_chunk_size
   );
 
   let assemblyFormat = [{
     `dist_schedule_static` $dist_schedule_static
-    | `chunk_size` `(` $chunk_size `:` type($chunk_size) `)`
+    | `dist_schedule_chunk_size` `(` $dist_schedule_chunk_size `:`
+      type($dist_schedule_chunk_size) `)`
   }];
 
   let description = [{
     The `dist_schedule_static` attribute specifies the schedule for this loop,
     determining how the loop is distributed across the various teams. The
-    optional `chunk_size` associated with this determines further controls this
-    distribution.
+    optional `dist_schedule_chunk_size` associated with this determines further
+    controls this distribution.
   }];
 }
 
@@ -278,24 +255,25 @@ class OpenMP_DoacrossClauseSkip<
   > : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<ClauseDependAttr>:$depend_type_val,
-    ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$num_loops_val,
-    Variadic<AnyType>:$depend_vec_vars
+    OptionalAttr<ClauseDependAttr>:$doacross_depend_type,
+    ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$doacross_num_loops,
+    Variadic<AnyType>:$doacross_depend_vars
   );
 
   let assemblyFormat = [{
-    ( `depend_type` `` $depend_type_val^ )?
-    ( `depend_vec` `(` $depend_vec_vars^ `:` type($depend_vec_vars) `)` )?
+    ( `depend_type` `` $doacross_depend_type^ )?
+    ( `depend_vec` `(` $doacross_depend_vars^ `:` type($doacross_depend_vars)
+                   `)` )?
   }];
 
   let description = [{
-    The `depend_type_val` attribute refers to either the DEPEND(SOURCE) clause
-    or the DEPEND(SINK: vec) clause.
+    The `doacross_depend_type` attribute refers to either the DEPEND(SOURCE)
+    clause or the DEPEND(SINK: vec) clause.
 
-    The `num_loops_val` attribute specifies the number of loops in the doacross
-    nest.
+    The `doacross_num_loops` attribute specifies the number of loops in the
+    doacross nest.
 
-    The `depend_vec_vars` is a variadic list of operands that specifies the
+    The `doacross_depend_vars` is a variadic list of operands that specifies the
     index of the loop iterator in the doacross nest for the DEPEND(SOURCE)
     clause or the index of the element of "vec" for the DEPEND(SINK: vec)
     clause. It contains the operands in multiple "vec" when multiple
@@ -305,6 +283,34 @@ class OpenMP_DoacrossClauseSkip<
 
 def OpenMP_DoacrossClause : OpenMP_DoacrossClauseSkip<>;
 
+//===----------------------------------------------------------------------===//
+// V5.2: [10.5.1] `filter` clause
+//===----------------------------------------------------------------------===//
+
+class OpenMP_FilterClauseSkip<
+    bit traits = false, bit arguments = false, bit assemblyFormat = false,
+    bit description = false, bit extraClassDeclaration = false
+  > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
+                    description, extraClassDeclaration> {
+  let arguments = (ins
+    Optional<IntLikeType>:$filtered_thread_id
+  );
+
+  let assemblyFormat = [{
+    `filter` `(` $filtered_thread_id `:` type($filtered_thread_id) `)`
+  }];
+
+  let description = [{
+    If `filter` is specified, the masked construct masks the execution of
+    the region to only the thread id filtered. Other threads executing the
+    parallel region are not expected to execute the region specified within
+    the `masked` directive. If `filter` is not specified, master thread is
+    expected to execute the region enclosed within `masked` directive.
+  }];
+}
+
+def OpenMP_FilterClause : OpenMP_FilterClauseSkip<>;
+
 //===----------------------------------------------------------------------===//
 // V5.2: [12.3] `final` clause
 //===----------------------------------------------------------------------===//
@@ -315,11 +321,11 @@ class OpenMP_FinalClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Optional<I1>:$final_expr
+    Optional<I1>:$final
   );
 
   let assemblyFormat = [{
-    `final` `(` $final_expr `)`
+    `final` `(` $final `)`
   }];
 
   let description = [{
@@ -343,11 +349,11 @@ class OpenMP_GrainsizeClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Optional<IntLikeType>:$grain_size
+    Optional<IntLikeType>:$grainsize
   );
 
   let assemblyFormat = [{
-    `grain_size` `(` $grain_size `:` type($grain_size) `)`
+    `grainsize` `(` $grainsize `:` type($grainsize) `)`
   }];
 
   let description = [{
@@ -370,17 +376,18 @@ class OpenMP_HasDeviceAddrClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Variadic<OpenMP_PointerLikeType>:$has_device_addr
+    Variadic<OpenMP_PointerLikeType>:$has_device_addr_vars
   );
 
   let assemblyFormat = [{
-    `has_device_addr` `(` $has_device_addr `:` type($has_device_addr) `)`
+    `has_device_addr` `(` $has_device_addr_vars `:` type($has_device_addr_vars)
+                      `)`
   }];
 
   let description = [{
-    The optional `has_device_addr` indicates that list items already have device
-    addresses, so they may be directly accessed from the target device. This
-    includes array sections.
+    The optional `has_device_addr_vars` indicates that list items already have
+    device addresses, so they may be directly accessed from the target device.
+    This includes array sections.
   }];
 }
 
@@ -396,11 +403,11 @@ class OpenMP_HintClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    DefaultValuedOptionalAttr<I64Attr, "0">:$hint_val
+    DefaultValuedOptionalAttr<I64Attr, "0">:$hint
   );
 
   let assemblyFormat = [{
-    `hint` `(` custom<SynchronizationHint>($hint_val) `)`
+    `hint` `(` custom<SynchronizationHint>($hint) `)`
   }];
 
   let description = [{
@@ -449,14 +456,14 @@ class OpenMP_InReductionClauseSkip<
 
   let arguments = (ins
     Variadic<OpenMP_PointerLikeType>:$in_reduction_vars,
-    OptionalAttr<DenseBoolArrayAttr>:$in_reduction_vars_byref,
-    OptionalAttr<SymbolRefArrayAttr>:$in_reductions
+    OptionalAttr<DenseBoolArrayAttr>:$in_reduction_byref,
+    OptionalAttr<SymbolRefArrayAttr>:$in_reduction_syms
   );
 
   let assemblyFormat = [{
     `in_reduction` `(`
       custom<ReductionVarList>($in_reduction_vars, type($in_reduction_vars),
-                               $in_reduction_vars_byref, $in_reductions) `)`
+                               $in_reduction_byref, $in_reduction_syms) `)`
   }];
 
   let extraClassDeclaration = [{
@@ -482,15 +489,15 @@ class OpenMP_IsDevicePtrClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Variadic<OpenMP_PointerLikeType>:$is_device_ptr
+    Variadic<OpenMP_PointerLikeType>:$is_device_ptr_vars
   );
 
   let assemblyFormat = [{
-    `is_device_ptr` `(` $is_device_ptr `:` type($is_device_ptr) `)`
+    `is_device_ptr` `(` $is_device_ptr_vars `:` type($is_device_ptr_vars) `)`
   }];
 
   let description = [{
-    The optional `is_device_ptr` indicates list items are device pointers.
+    The optional `is_device_ptr_vars` indicates list items are device pointers.
   }];
 }
 
@@ -526,6 +533,38 @@ class OpenMP_LinearClauseSkip<
 
 def OpenMP_LinearClause : OpenMP_LinearClauseSkip<>;
 
+//===----------------------------------------------------------------------===//
+// Not in the spec: Clause-like structure to hold loop related information.
+//===----------------------------------------------------------------------===//
+
+class OpenMP_LoopRelatedClauseSkip<
+    bit traits = false, bit arguments = false, bit assemblyFormat = false,
+    bit description = false, bit extraClassDeclaration = false
+  > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
+                    description, extraClassDeclaration> {
+  let traits = [
+    AllTypesMatch<
+      ["loop_lower_bounds", "loop_upper_bounds", "loop_steps"]>
+  ];
+  
+  let arguments = (ins
+    Variadic<IntLikeType>:$loop_lower_bounds,
+    Variadic<IntLikeType>:$loop_upper_bounds,
+    Variadic<IntLikeType>:$loop_steps,
+    UnitAttr:$loop_inclusive
+  );
+
+  let extraClassDeclaration = [{
+    /// Returns the number of loops in the loop nest.
+    unsigned getNumLoops() { return getLoopLowerBounds().size(); }
+  }];
+
+  // Description and formatting integrated in the `omp.loop_nest` operation,
+  // which is the only one currently accepting this clause.
+}
+
+def OpenMP_LoopRelatedClause : OpenMP_LoopRelatedClauseSkip<>;
+
 //===----------------------------------------------------------------------===//
 // V5.2: [5.8.3] `map` clause
 //===----------------------------------------------------------------------===//
@@ -540,16 +579,16 @@ class OpenMP_MapClauseSkip<
   ];
 
   let arguments = (ins
-    Variadic<OpenMP_PointerLikeType>:$map_operands
+    Variadic<OpenMP_PointerLikeType>:$map_vars
   );
 
   let assemblyFormat = [{
-    `map_entries` `(` custom<MapEntries>($map_operands, type($map_operands)) `)`
+    `map_entries` `(` custom<MapEntries>($map_vars, type($map_vars)) `)`
   }];
 
   let description = [{
-    The optional `map_operands` maps data from the current task's data
-    environment to the device data environment.
+    The optional `map_vars` maps data from the current task's data environment
+    to the device data environment.
   }];
 }
 
@@ -565,11 +604,11 @@ class OpenMP_MemoryOrderClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<MemoryOrderKindAttr>:$memory_order_val
+    OptionalAttr<MemoryOrderKindAttr>:$memory_order
   );
 
   let assemblyFormat = [{
-    `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
+    `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
   }];
 
   let description = [{
@@ -751,16 +790,16 @@ class OpenMP_NumThreadsClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Optional<IntLikeType>:$num_threads_var
+    Optional<IntLikeType>:$num_threads
   );
 
   let assemblyFormat = [{
-    `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
+    `num_threads` `(` $num_threads `:` type($num_threads) `)`
   }];
 
   let description = [{
-    The optional `num_threads_var` parameter specifies the number of threads
-    which should be used to execute the parallel region.
+    The optional `num_threads` parameter specifies the number of threads which
+    should be used to execute the parallel region.
   }];
 }
 
@@ -776,12 +815,12 @@ class OpenMP_OrderClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<OrderKindAttr>:$order_val,
+    OptionalAttr<OrderKindAttr>:$order,
     OptionalAttr<OrderModifierAttr>:$order_mod
   );
 
   let assemblyFormat = [{
-    `order` `(` custom<OrderClause>($order_val, $order_mod) `)`
+    `order` `(` custom<OrderClause>($order, $order_mod) `)`
   }];
 
   let description = [{
@@ -803,15 +842,15 @@ class OpenMP_OrderedClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val
+    ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered
   );
 
   let assemblyFormat = [{
-    `ordered` `(` $ordered_val `)`
+    `ordered` `(` $ordered `)`
   }];
 
   let description = [{
-    The optional `ordered_val` attribute specifies how many loops are associated
+    The optional `ordered` attribute specifies how many loops are associated
     with the worksharing-loop construct. The value of zero refers to the ordered
     clause specified without parameter.
   }];
@@ -829,17 +868,17 @@ class OpenMP_ParallelizationLevelClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    UnitAttr:$simd
+    UnitAttr:$par_level_simd
   );
 
   let assemblyFormat = [{
-    `simd` $simd
+    `par_level_simd` $par_level_simd
   }];
 
   let description = [{
-    The `simd` attribute corresponds to the simd clause specified. If it is not
-    present, it behaves as if the threads clause is specified or no clause is
-    specified.
+    The `par_level_simd` attribute corresponds to the simd clause specified. If
+    it is not present, it behaves as if the threads clause is specified or no
+    clause is specified.
   }];
 }
 
@@ -886,12 +925,12 @@ class OpenMP_PrivateClauseSkip<
                     description, extraClassDeclaration> {
   let arguments = (ins
     Variadic<AnyType>:$private_vars,
-    OptionalAttr<SymbolRefArrayAttr>:$privatizers
+    OptionalAttr<SymbolRefArrayAttr>:$private_syms
   );
 
   let assemblyFormat = [{
     `private` `(`
-      custom<PrivateList>($private_vars, type($private_vars), $privatizers) `)`
+      custom<PrivateList>($private_vars, type($private_vars), $private_syms) `)`
   }];
 
   // TODO: Add description.
@@ -909,15 +948,15 @@ class OpenMP_ProcBindClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<ProcBindKindAttr>:$proc_bind_val
+    OptionalAttr<ProcBindKindAttr>:$proc_bind_kind
   );
 
   let assemblyFormat = [{
-    `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+    `proc_bind` `(` custom<ClauseAttr>($proc_bind_kind) `)`
   }];
 
   let description = [{
-    The optional `proc_bind_val` attribute controls the thread affinity for the
+    The optional `proc_bind_kind` attribute controls the thread affinity for the
     execution of the parallel region.
   }];
 }
@@ -939,14 +978,14 @@ class OpenMP_ReductionClauseSkip<
 
   let arguments = (ins
     Variadic<OpenMP_PointerLikeType>:$reduction_vars,
-    OptionalAttr<DenseBoolArrayAttr>:$reduction_vars_byref,
-    OptionalAttr<SymbolRefArrayAttr>:$reductions
+    OptionalAttr<DenseBoolArrayAttr>:$reduction_byref,
+    OptionalAttr<SymbolRefArrayAttr>:$reduction_syms
   );
 
   let assemblyFormat = [{
     `reduction` `(`
       custom<ReductionVarList>($reduction_vars, type($reduction_vars),
-                               $reduction_vars_byref, $reductions) `)`
+                               $reduction_byref, $reduction_syms) `)`
   }];
 
   let extraClassDeclaration = [{
@@ -958,10 +997,10 @@ class OpenMP_ReductionClauseSkip<
   let description = [{
     Reductions can be performed by specifying reduction accumulator variables in
     `reduction_vars`, symbols referring to reduction declarations in the
-    `reductions` attribute, and whether the reduction variable should be passed
-    into the reduction region by value or by reference in
-    `reduction_vars_byref`. Each reduction is identified by the accumulator it
-    uses and accumulators must not be repeated in the same reduction. A private
+    `reduction_syms` attribute, and whether the reduction variable should be
+    passed into the reduction region by value or by reference in
+    `reduction_byref`. Each reduction is identified by the accumulator it uses
+    and accumulators must not be repeated in the same reduction. A private
     variable corresponding to the accumulator is used in place of the
     accumulator inside the body of the operation. The reduction declaration
     specifies how to combine the values from each iteration, section, team,
@@ -1008,22 +1047,22 @@ class OpenMP_ScheduleClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    OptionalAttr<ScheduleKindAttr>:$schedule_val,
-    Optional<AnyType>:$schedule_chunk_var,
-    OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
-    UnitAttr:$simd_modifier
+    OptionalAttr<ScheduleKindAttr>:$schedule_kind,
+    Optional<AnyType>:$schedule_chunk,
+    OptionalAttr<ScheduleModifierAttr>:$schedule_mod,
+    UnitAttr:$schedule_simd
   );
 
   let assemblyFormat = [{
     `schedule` `(`
-      custom<ScheduleClause>($schedule_val, $schedule_modifier, $simd_modifier,
-                             $schedule_chunk_var, type($schedule_chunk_var)) `)`
+      custom<ScheduleClause>($schedule_kind, $schedule_mod, $schedule_simd,
+                             $schedule_chunk, type($schedule_chunk)) `)`
   }];
 
   let description = [{
-    The optional `schedule_val` attribute specifies the loop schedule for this
+    The optional `schedule_kind` attribute specifies the loop schedule for this
     loop, determining how the loop is distributed across the parallel threads.
-    The optional `schedule_chunk_var` associated with this determines further
+    The optional `schedule_chunk` associated with this determines further
     controls this distribution.
   }];
 }
@@ -1070,14 +1109,14 @@ class OpenMP_TaskReductionClauseSkip<
 
   let arguments = (ins
     Variadic<OpenMP_PointerLikeType>:$task_reduction_vars,
-    OptionalAttr<DenseBoolArrayAttr>:$task_reduction_vars_byref,
-    OptionalAttr<SymbolRefArrayAttr>:$task_reductions
+    OptionalAttr<DenseBoolArrayAttr>:$task_reduction_byref,
+    OptionalAttr<SymbolRefArrayAttr>:$task_reduction_syms
   );
 
   let assemblyFormat = [{
     `task_reduction` `(`
       custom<ReductionVarList>($task_reduction_vars, type($task_reduction_vars),
-                               $task_reduction_vars_byref, $task_reductions) `)`
+                               $task_reduction_byref, $task_reduction_syms) `)`
   }];
 
   let description = [{
@@ -1087,9 +1126,9 @@ class OpenMP_TaskReductionClauseSkip<
     participating in the reduction. After the end of the region, the original
     list item contains the result of the reduction. Similarly to the `reduction`
     clause, accumulator variables must be passed in `task_reduction_vars`,
-    symbols referring to reduction declarations in the `task_reductions`
+    symbols referring to reduction declarations in the `task_reduction_syms`
     attribute, and whether the reduction variable should be passed into the
-    reduction region by value or by reference in `task_reduction_vars_byref`.
+    reduction region by value or by reference in `task_reduction_byref`.
   }];
 
   let extraClassDeclaration = [{
@@ -1148,7 +1187,7 @@ class OpenMP_UntiedClauseSkip<
     If the `untied` clause is present on a task construct, any thread in the
     team can resume the task region after a suspension. The `untied` clause is
     ignored if a `final` clause is present on the same task construct and the
-    `final_expr` evaluates to `true`, or if a task is an included task.
+    `final` expression evaluates to `true`, or if a task is an included task.
   }];
 }
 
@@ -1164,16 +1203,16 @@ class OpenMP_UseDeviceAddrClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Variadic<OpenMP_PointerLikeType>:$use_device_addr
+    Variadic<OpenMP_PointerLikeType>:$use_device_addr_vars
   );
 
   let assemblyFormat = [{
-    `use_device_addr` `(` $use_device_addr `:` type($use_device_addr) `)`
+    `use_device_addr` `(` $use_device_addr_vars `:` type($use_device_addr_vars) `)`
   }];
 
   let description = [{
-    The optional `use_device_addr` specifies the address of the objects in the
-    device data environment.
+    The optional `use_device_addr_vars` specifies the address of the objects in
+    the device data environment.
   }];
 }
 
@@ -1189,47 +1228,19 @@ class OpenMP_UseDevicePtrClauseSkip<
   > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
                     description, extraClassDeclaration> {
   let arguments = (ins
-    Variadic<OpenMP_PointerLikeType>:$use_device_ptr
+    Variadic<OpenMP_PointerLikeType>:$use_device_ptr_vars
   );
 
   let assemblyFormat = [{
-    `use_device_ptr` `(` $use_device_ptr `:` type($use_device_ptr) `)`
+    `use_device_ptr` `(` $use_device_ptr_vars `:` type($use_device_ptr_vars) `)`
   }];
 
   let description = [{
-    The optional `use_device_ptr` specifies the device pointers to the
+    The optional `use_device_ptr_vars` specifies the device pointers to the
     corresponding list items in the device data environment.
   }];
 }
 
 def OpenMP_UseDevicePtrClause : OpenMP_UseDevicePtrClauseSkip<>;
 
-//===----------------------------------------------------------------------===//
-// V5.2: [10.5.1] `filter` clause
-//===----------------------------------------------------------------------===//
-
-class OpenMP_FilterClauseSkip<
-    bit traits = false, bit arguments = false, bit assemblyFormat = false,
-    bit description = false, bit extraClassDeclaration = false
-  > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
-                    description, extraClassDeclaration> {
-  let arguments = (ins
-    Optional<IntLikeType>:$filtered_thread_id
-  );
-
-  let assemblyFormat = [{
-    `filter` `(` $filtered_thread_id `:` type($filtered_thread_id) `)`
-  }];
-
-  let description = [{
-    If `filter` is specified, the masked construct masks the execution of
-    the region to only the thread id filtered. Other threads executing the
-    parallel region are not expected to execute the region specified within
-    the `masked` directive. If `filter` is not specified, master thread is
-    expected to execute the region enclosed within `masked` directive.
-  }];
-}
-
-def OpenMP_FilterClause : OpenMP_FilterClauseSkip<>;
-
 #endif // OPENMP_CLAUSES
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 69fd1f1f0130f..68f92e6952694 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -132,28 +132,26 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [
     DeclareOpInterfaceMethods<OutlineableOpenMPOpInterface>,
     RecursiveMemoryEffects
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
+    OpenMP_AllocateClauseSkip<assemblyFormat = true>,
     OpenMP_IfClauseSkip<assemblyFormat = true>,
     OpenMP_NumThreadsClauseSkip<assemblyFormat = true>,
-    OpenMP_AllocateClauseSkip<assemblyFormat = true>,
-    OpenMP_ReductionClauseSkip<assemblyFormat = true>,
+    OpenMP_PrivateClauseSkip<assemblyFormat = true>,
     OpenMP_ProcBindClauseSkip<assemblyFormat = true>,
-    OpenMP_PrivateClauseSkip<assemblyFormat = true>
+    OpenMP_ReductionClauseSkip<assemblyFormat = true>
   ], singleRegion = true> {
   let summary = "parallel construct";
   let description = [{
     The parallel construct includes a region of code which is to be executed
     by a team of threads.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the parallel
-    region runs as normal, if it is 0 then the parallel region is executed with
-    one thread.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the parallel region runs
+    as normal, if it is 0 then the parallel region is executed with one thread.
   }] # clausesDescription;
 
   let builders = [
     OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
-    OpBuilder<(ins CArg<"const ParallelClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const ParallelOperands &">:$clauses)>
   ];
 
   // TODO: Use default assembly format inherited from OpenMP_Op once printing
@@ -163,16 +161,16 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [
   let assemblyFormat = [{
     oilist(
           `if` `(` $if_expr `)`
-          | `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
+          | `num_threads` `(` $num_threads `:` type($num_threads) `)`
           | `allocate` `(`
               custom<AllocateAndAllocator>(
                 $allocate_vars, type($allocate_vars),
-                $allocators_vars, type($allocators_vars)
+                $allocator_vars, type($allocator_vars)
               ) `)`
-          | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `proc_bind` `(` custom<ClauseAttr>($proc_bind_kind) `)`
     ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
-                             $reduction_vars_byref, $reductions, $private_vars,
-                             type($private_vars), $privatizers) attr-dict
+                             $reduction_byref, $reduction_syms, $private_vars,
+                             type($private_vars), $private_syms) attr-dict
   }];
 
   let hasVerifier = 1;
@@ -196,10 +194,8 @@ def TerminatorOp : OpenMP_Op<"terminator", [Terminator, Pure]> {
 def TeamsOp : OpenMP_Op<"teams", traits = [
     AttrSizedOperandSegments, RecursiveMemoryEffects
   ], clauses = [
-    // TODO: Complete clause list (private).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_NumTeamsClause, OpenMP_IfClause, OpenMP_ThreadLimitClause,
-    OpenMP_AllocateClause, OpenMP_ReductionClause
+    OpenMP_AllocateClause, OpenMP_IfClause, OpenMP_NumTeamsClause,
+    OpenMP_PrivateClause, OpenMP_ReductionClause, OpenMP_ThreadLimitClause
   ], singleRegion = true> {
   let summary = "teams construct";
   let description = [{
@@ -212,7 +208,7 @@ def TeamsOp : OpenMP_Op<"teams", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TeamsClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TeamsOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -239,9 +235,8 @@ def SectionOp : OpenMP_Op<"section", [HasParent<"SectionsOp">],
 def SectionsOp : OpenMP_Op<"sections", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Complete clause list (private).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_ReductionClause, OpenMP_AllocateClause, OpenMP_NowaitClause
+    OpenMP_AllocateClause, OpenMP_NowaitClause, OpenMP_PrivateClause,
+    OpenMP_ReductionClause
   ], singleRegion = true> {
   let summary = "sections construct";
   let description = [{
@@ -258,7 +253,7 @@ def SectionsOp : OpenMP_Op<"sections", traits = [
   let regions = (region SizedRegion<1>:$region);
 
   let builders = [
-    OpBuilder<(ins CArg<"const SectionsClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const SectionsOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -272,8 +267,8 @@ def SectionsOp : OpenMP_Op<"sections", traits = [
 def SingleOp : OpenMP_Op<"single", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Complete clause list (private).
-    OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause
+    OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause,
+    OpenMP_PrivateClause
   ], singleRegion = true> {
   let summary = "single directive";
   let description = [{
@@ -285,7 +280,7 @@ def SingleOp : OpenMP_Op<"single", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const SingleClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const SingleOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -298,7 +293,7 @@ def SingleOp : OpenMP_Op<"single", traits = [
 def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
     RecursiveMemoryEffects, SameVariadicOperandSize
   ], clauses = [
-    OpenMP_CollapseClause
+    OpenMP_LoopRelatedClause
   ], singleRegion = true> {
   let summary = "rectangular loop nest";
   let description = [{
@@ -307,14 +302,14 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
     lower and upper bounds, as well as a step variable, must be defined.
 
     The lower and upper bounds specify a half-open range: the range includes the
-    lower bound but does not include the upper bound. If the `inclusive`
+    lower bound but does not include the upper bound. If the `loop_inclusive`
     attribute is specified then the upper bound is also included.
 
     The body region can contain any number of blocks. The region is terminated
     by an `omp.yield` instruction without operands. The induction variables,
     represented as entry block arguments to the loop nest operation's single
-    region, match the types of the `lowerBound`, `upperBound` and `step`
-    arguments.
+    region, match the types of the `loop_lower_bounds`, `loop_upper_bounds` and
+    `loop_steps` arguments.
 
     ```mlir
     omp.loop_nest (%i1, %i2) : i32 = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) {
@@ -336,10 +331,8 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
     non-perfectly nested loops.
   }];
 
-  let arguments = !con(clausesArgs, (ins UnitAttr:$inclusive));
-
   let builders = [
-    OpBuilder<(ins CArg<"const LoopNestClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const LoopNestOperands &">:$clauses)>
   ];
 
   let extraClassDeclaration = [{
@@ -366,14 +359,14 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
     AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
     RecursiveMemoryEffects, SingleBlock
   ], clauses = [
-    // TODO: Complete clause list (allocate, private).
-    // TODO: Sort clauses alphabetically.
+    OpenMP_AllocateClauseSkip<assemblyFormat = true>,
     OpenMP_LinearClauseSkip<assemblyFormat = true>,
-    OpenMP_ReductionClauseSkip<assemblyFormat = true>,
-    OpenMP_ScheduleClauseSkip<assemblyFormat = true>,
     OpenMP_NowaitClauseSkip<assemblyFormat = true>,
+    OpenMP_OrderClauseSkip<assemblyFormat = true>,
     OpenMP_OrderedClauseSkip<assemblyFormat = true>,
-    OpenMP_OrderClauseSkip<assemblyFormat = true>
+    OpenMP_PrivateClauseSkip<assemblyFormat = true>,
+    OpenMP_ReductionClauseSkip<assemblyFormat = true>,
+    OpenMP_ScheduleClauseSkip<assemblyFormat = true>
   ], singleRegion = true> {
   let summary = "worksharing-loop construct";
   let description = [{
@@ -402,7 +395,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
 
   let builders = [
     OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
-    OpBuilder<(ins CArg<"const WsloopClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const WsloopOperands &">:$clauses)>
   ];
 
   // TODO: Use default assembly format inherited from OpenMP_Op once printing
@@ -415,13 +408,20 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
                                    $linear_step_vars) `)`
           |`schedule` `(`
               custom<ScheduleClause>(
-                $schedule_val, $schedule_modifier, $simd_modifier,
-                $schedule_chunk_var, type($schedule_chunk_var)) `)`
+                $schedule_kind, $schedule_mod, $schedule_simd,
+                $schedule_chunk, type($schedule_chunk)) `)`
           |`nowait` $nowait
-          |`ordered` `(` $ordered_val `)`
-          |`order` `(` custom<OrderClause>($order_val, $order_mod) `)`
+          |`ordered` `(` $ordered `)`
+          |`order` `(` custom<OrderClause>($order, $order_mod) `)`
+          |`allocate` `(`
+                custom<AllocateAndAllocator>(
+                  $allocate_vars, type($allocate_vars), $allocator_vars,
+                  type($allocator_vars)) `)`
+          |`private` `(`
+                custom<PrivateList>(
+                  $private_vars, type($private_vars), $private_syms) `)`
     ) custom<Wsloop>($region, $reduction_vars, type($reduction_vars),
-                     $reduction_vars_byref, $reductions) attr-dict
+                     $reduction_byref, $reduction_syms) attr-dict
   }];
 
   let hasVerifier = 1;
@@ -435,9 +435,9 @@ def SimdOp : OpenMP_Op<"simd", traits = [
     AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
     RecursiveMemoryEffects, SingleBlock
   ], clauses = [
-    // TODO: Complete clause list (linear, private, reduction).
-    OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_NontemporalClause,
-    OpenMP_OrderClause, OpenMP_SafelenClause, OpenMP_SimdlenClause
+    OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_LinearClause,
+    OpenMP_NontemporalClause, OpenMP_OrderClause, OpenMP_PrivateClause,
+    OpenMP_ReductionClause, OpenMP_SafelenClause, OpenMP_SimdlenClause
   ], singleRegion = true> {
   let summary = "simd construct";
   let description = [{
@@ -468,7 +468,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const SimdClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const SimdOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -502,9 +502,8 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
     AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
     RecursiveMemoryEffects, SingleBlock
   ], clauses = [
-    // TODO: Complete clause list (private).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_DistScheduleClause, OpenMP_AllocateClause, OpenMP_OrderClause
+    OpenMP_AllocateClause, OpenMP_DistScheduleClause, OpenMP_OrderClause,
+    OpenMP_PrivateClause
   ], singleRegion = true> {
   let summary = "distribute construct";
   let description = [{
@@ -541,7 +540,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const DistributeClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const DistributeOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -555,11 +554,10 @@ def TaskOp : OpenMP_Op<"task", traits = [
     AttrSizedOperandSegments, AutomaticAllocationScope,
     OutlineableOpenMPOpInterface
   ], clauses = [
-    // TODO: Complete clause list (affinity, detach, private).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause,
-    OpenMP_MergeableClause, OpenMP_InReductionClause,
-    OpenMP_PriorityClause, OpenMP_DependClause, OpenMP_AllocateClause
+    // TODO: Complete clause list (affinity, detach).
+    OpenMP_AllocateClause, OpenMP_DependClause, OpenMP_FinalClause,
+    OpenMP_IfClause, OpenMP_InReductionClause, OpenMP_MergeableClause,
+    OpenMP_PriorityClause, OpenMP_PrivateClause, OpenMP_UntiedClause
   ], singleRegion = true> {
   let summary = "task construct";
   let description = [{
@@ -576,12 +574,12 @@ def TaskOp : OpenMP_Op<"task", traits = [
 
     The `in_reduction` clause specifies that this particular task (among all the
     tasks in current taskgroup, if any) participates in a reduction.
-    `in_reduction_vars_byref` indicates whether each reduction variable should
+    `in_reduction_byref` indicates whether each reduction variable should
     be passed by value or by reference.
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TaskClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TaskOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -592,14 +590,12 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
     DeclareOpInterfaceMethods<LoopWrapperInterface>, RecursiveMemoryEffects,
     SingleBlock
   ], clauses = [
-    // TODO: Complete clause list (private).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause,
-    OpenMP_MergeableClause,
-    OpenMP_InReductionClauseSkip<extraClassDeclaration = true>,
+    OpenMP_AllocateClause, OpenMP_FinalClause, OpenMP_GrainsizeClause,
+    OpenMP_IfClause, OpenMP_InReductionClauseSkip<extraClassDeclaration = true>,
+    OpenMP_MergeableClause, OpenMP_NogroupClause, OpenMP_NumTasksClause,
+    OpenMP_PriorityClause, OpenMP_PrivateClause,
     OpenMP_ReductionClauseSkip<extraClassDeclaration = true>,
-    OpenMP_PriorityClause, OpenMP_AllocateClause, OpenMP_GrainsizeClause,
-    OpenMP_NumTasksClause, OpenMP_NogroupClause
+    OpenMP_UntiedClause
   ], singleRegion = true> {
   let summary = "taskloop construct";
   let description = [{
@@ -639,7 +635,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
     items is present. Thus, the generated tasks are participants of a reduction
     previously defined by a reduction scoping clause. In this case, accumulator
     variables are specified in `in_reduction_vars`, symbols referring to
-    reduction declarations in `in_reductions` and `in_reduction_vars_byref`
+    reduction declarations in `in_reduction_syms` and `in_reduction_byref`
     indicate for each reduction variable whether it should be passed by value or
     by reference.
 
@@ -654,7 +650,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
   }];
 
   let builders = [
-    OpBuilder<(ins CArg<"const TaskloopClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TaskloopOperands &">:$clauses)>
   ];
 
   let extraClassDeclaration = [{
@@ -670,8 +666,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
 def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [
     AttrSizedOperandSegments, AutomaticAllocationScope
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
-    OpenMP_TaskReductionClause, OpenMP_AllocateClause
+    OpenMP_AllocateClause, OpenMP_TaskReductionClause
   ], singleRegion = true> {
   let summary = "taskgroup construct";
   let description = [{
@@ -688,7 +683,7 @@ def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TaskgroupClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TaskgroupOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -951,9 +946,8 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
 def TargetDataOp: OpenMP_Op<"target_data", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_UseDevicePtrClause,
-    OpenMP_UseDeviceAddrClause, OpenMP_MapClause
+    OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+    OpenMP_UseDeviceAddrClause, OpenMP_UseDevicePtrClause
   ], singleRegion = true> {
   let summary = "target data construct";
   let description = [{
@@ -965,14 +959,13 @@ def TargetDataOp: OpenMP_Op<"target_data", traits = [
     to and from the offloading device when multiple target regions are using
     the same data.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the target
-    region runs on a device, if it is 0 then the target region is executed
-    on the host device.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the target region runs on
+    a device, if it is 0 then the target region is executed on the host device.
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TargetDataClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TargetDataOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -985,9 +978,8 @@ def TargetDataOp: OpenMP_Op<"target_data", traits = [
 def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
-    OpenMP_NowaitClause, OpenMP_MapClause
+    OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+    OpenMP_NowaitClause
   ]> {
   let summary = "target enter data construct";
   let description = [{
@@ -995,14 +987,13 @@ def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
     a device data environment. The target enter data directive is a
     stand-alone directive.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the target
-    region runs on a device, if it is 0 then the target region is executed on
-    the host device.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the target region runs on
+    a device, if it is 0 then the target region is executed on the host device.
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1015,9 +1006,8 @@ def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
 def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
-    OpenMP_NowaitClause, OpenMP_MapClause
+    OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+    OpenMP_NowaitClause
   ]> {
   let summary = "target exit data construct";
   let description = [{
@@ -1025,14 +1015,13 @@ def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
     device data environment. The target exit data directive is
     a stand-alone directive.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the target
-    region runs on a device, if it is 0 then the target region is executed
-    on the host device.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the target region runs on
+    a device, if it is 0 then the target region is executed on the host device.
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1045,9 +1034,8 @@ def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
 def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
     AttrSizedOperandSegments
   ], clauses = [
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
-    OpenMP_NowaitClause, OpenMP_MapClause
+    OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+    OpenMP_NowaitClause
   ]> {
   let summary = "target update construct";
   let description = [{
@@ -1056,10 +1044,9 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
     specified motion clauses. The target update construct is a stand-alone
     directive.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the target
-    region runs on a device, if it is 0 then the target region is executed
-    on the host device.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the target region runs on
+    a device, if it is 0 then the target region is executed on the host device.
 
     We use `MapInfoOp` to model the motion clauses and their modifiers. Even
     though the spec differentiates between map-types & map-type-modifiers vs.
@@ -1070,7 +1057,7 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1083,26 +1070,24 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
 def TargetOp : OpenMP_Op<"target", traits = [
     AttrSizedOperandSegments, IsolatedFromAbove, OutlineableOpenMPOpInterface
   ], clauses = [
-    // TODO: Complete clause list (allocate, defaultmap, in_reduction,
-    // uses_allocators).
-    // TODO: Sort clauses alphabetically.
-    OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_ThreadLimitClause,
-    OpenMP_DependClause, OpenMP_NowaitClause, OpenMP_IsDevicePtrClause,
-    OpenMP_HasDeviceAddrClause, OpenMP_MapClause, OpenMP_PrivateClause
+    // TODO: Complete clause list (defaultmap, uses_allocators).
+    OpenMP_AllocateClause, OpenMP_DependClause, OpenMP_DeviceClause,
+    OpenMP_HasDeviceAddrClause, OpenMP_IfClause, OpenMP_InReductionClause,
+    OpenMP_IsDevicePtrClause, OpenMP_MapClause, OpenMP_NowaitClause,
+    OpenMP_PrivateClause, OpenMP_ThreadLimitClause
   ], singleRegion = true> {
   let summary = "target construct";
   let description = [{
     The target construct includes a region of code which is to be executed
     on a device.
 
-    The optional `if_expr` parameter specifies a boolean result of a
-    conditional check. If this value is 1 or is not provided then the target
-    region runs on a device, if it is 0 then the target region is executed on the
-    host device.
+    The optional `if_expr` parameter specifies a boolean result of a conditional
+    check. If this value is 1 or is not provided then the target region runs on
+    a device, if it is 0 then the target region is executed on the host device.
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TargetClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TargetOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1125,9 +1110,7 @@ def MasterOp : OpenMP_Op<"master", singleRegion = true> {
 //===----------------------------------------------------------------------===//
 // 2.17.1 critical Construct
 //===----------------------------------------------------------------------===//
-def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [
-    Symbol
-  ], clauses = [
+def CriticalDeclareOp : OpenMP_Op<"critical.declare", clauses = [
     OpenMP_CriticalNameClause, OpenMP_HintClause
   ]> {
   let summary = "declares a named critical section.";
@@ -1136,7 +1119,7 @@ def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const CriticalClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const CriticalDeclareOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1189,7 +1172,7 @@ def OrderedOp : OpenMP_Op<"ordered", clauses = [OpenMP_DoacrossClause]> {
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const OrderedOpClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const OrderedOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1206,7 +1189,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const OrderedRegionClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const OrderedRegionOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1217,7 +1200,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [
 //===----------------------------------------------------------------------===//
 
 def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [
-    // TODO: Complete clause list (depend, nowait).
+    OpenMP_DependClause, OpenMP_NowaitClause
   ]> {
   let summary = "taskwait construct";
   let description = [{
@@ -1226,11 +1209,8 @@ def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const TaskwaitClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const TaskwaitOperands &">:$clauses)>
   ];
-
-  // TODO: Remove overriden `assemblyFormat` once a clause is added.
-  let assemblyFormat = "attr-dict";
 }
 
 //===----------------------------------------------------------------------===//
@@ -1264,8 +1244,8 @@ def AtomicReadOp : OpenMP_Op<"atomic.read", traits = [
   // Override clause-based assemblyFormat.
   let assemblyFormat = [{
     $v `=` $x
-    oilist( `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
-          | `hint` `(` custom<SynchronizationHint>($hint_val) `)`)
+    oilist( `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
+          | `hint` `(` custom<SynchronizationHint>($hint) `)`)
     `:` type($x) `,` $element_type attr-dict
   }];
 
@@ -1308,8 +1288,8 @@ def AtomicWriteOp : OpenMP_Op<"atomic.write", traits = [
   // Override clause-based assemblyFormat.
   let assemblyFormat = [{
     $x `=` $expr
-    oilist( `hint` `(` custom<SynchronizationHint>($hint_val) `)`
-          | `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`)
+    oilist( `hint` `(` custom<SynchronizationHint>($hint) `)`
+          | `memory_order` `(` custom<ClauseAttr>($memory_order) `)`)
     `:` type($x) `,` type($expr)
     attr-dict
   }];
@@ -1371,8 +1351,8 @@ def AtomicUpdateOp : OpenMP_Op<"atomic.update", traits = [
 
   // Override clause-based assemblyFormat.
   let assemblyFormat = [{
-    oilist( `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
-          | `hint` `(` custom<SynchronizationHint>($hint_val) `)`)
+    oilist( `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
+          | `hint` `(` custom<SynchronizationHint>($hint) `)`)
     $x `:` type($x) $region attr-dict
   }];
 
@@ -1505,7 +1485,7 @@ def CancelOp : OpenMP_Op<"cancel", clauses = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const CancelClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const CancelOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1525,7 +1505,7 @@ def CancellationPointOp : OpenMP_Op<"cancellation_point", clauses = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const CancellationPointClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const CancellationPointOperands &">:$clauses)>
   ];
 
   let hasVerifier = 1;
@@ -1605,7 +1585,7 @@ def MaskedOp : OpenMP_Op<"masked", clauses = [
   }] # clausesDescription;
 
   let builders = [
-    OpBuilder<(ins CArg<"const MaskedClauseOps &">:$clauses)>
+    OpBuilder<(ins CArg<"const MaskedOperands &">:$clauses)>
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index 385aa8b1b016a..45d30a41bd29b 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -41,14 +41,14 @@ def MapClauseOwningOpInterface : OpInterface<"MapClauseOwningOpInterface"> {
   let cppNamespace = "::mlir::omp";
 
   let methods = [
-    InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapOperands",
+    InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapVars",
       (ins), [{
-        return $_op.getMapOperands();
+        return $_op.getMapVars();
       }]>,
       InterfaceMethod<"Get mutable map operands", "::mlir::MutableOperandRange",
-                      "getMapOperandsMutable",
+                      "getMapVarsMutable",
       (ins), [{
-        return $_op.getMapOperandsMutable();
+        return $_op.getMapVarsMutable();
       }]>,
   ];
 }
diff --git a/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
index df07b8d5a63d9..82938962fb939 100644
--- a/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_mlir_dialect(PtrOps ptr)
-add_mlir_doc(PtrOps PtrOps Dialects/ -gen-op-doc)
+add_mlir_doc(PtrOps PtrOps Dialects/ -gen-dialect-doc -dialect=ptr)
 
 set(LLVM_TARGET_DEFINITIONS PtrOps.td)
 mlir_tablegen(PtrOpsAttrs.h.inc -gen-attrdef-decls -attrdefs-dialect=ptr)
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
index fdf2570626980..5e66774d2f143 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
@@ -85,6 +85,9 @@ void populateSCFForLoopCanonicalizationPatterns(RewritePatternSet &patterns);
 ///  * `after` block containing arith.addi
 void populateUpliftWhileToForPatterns(RewritePatternSet &patterns);
 
+/// Populate patterns to rotate `scf.while` ops, constructing `do-while` loops
+/// from `while` loops.
+void populateSCFRotateWhileLoopPatterns(RewritePatternSet &patterns);
 } // namespace scf
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index d68ca11207376..1f21af6d6a29a 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -32,9 +32,11 @@ using SCFTileSizeComputationFunction =
 
 /// Options to use to control tiling.
 struct SCFTilingOptions {
-  /// Computation function that returns the tile sizes for each operation.
-  /// Delayed construction of constant tile sizes should occur to interoperate
-  /// with folding.
+  /// Computation function that returns the tile sizes to use for each loop.
+  /// Returning a tile size of zero implies no tiling for that loop. If the
+  /// size of the returned vector is smaller than the number of loops, the inner
+  /// loops are not tiled. If the size of the returned vector is larger, then
+  /// the vector is truncated to number of loops.
   SCFTileSizeComputationFunction tileSizeComputationFunction = nullptr;
 
   SCFTilingOptions &
@@ -45,7 +47,27 @@ struct SCFTilingOptions {
   /// Convenience function to set the `tileSizeComputationFunction` to a
   /// function that computes tile sizes at the point they are needed. Allows
   /// proper interaction with folding.
-  SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> ts);
+  SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> tileSizes);
+
+  /// Computation function that returns the number of threads to use for
+  /// each loop. Returning a num threads of zero implies no tiling for that
+  /// loop. If the size of the returned vector is smaller than the number of
+  /// loops, the inner loops are not tiled. If the size of the returned vector
+  /// is larger, then the vector is truncated to number of loops. Note: This
+  /// option is only supported with loopType set to `LoopType::ForallOp`. If the
+  /// tile size function is not specified while the num threads computation is,
+  /// then the tile size is determined automatically to map at most one tile per
+  /// thread.
+  SCFTileSizeComputationFunction numThreadsComputationFunction = nullptr;
+
+  SCFTilingOptions &
+  setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun) {
+    numThreadsComputationFunction = std::move(fun);
+    return *this;
+  }
+  /// Convenience function to set the `numThreadsComputationFunction` to a
+  /// function that computes num threads at the point they are needed.
+  SCFTilingOptions &setNumThreads(ArrayRef<OpFoldResult> numThreads);
 
   /// The interchange vector to reorder the tiled loops.
   SmallVector<int64_t> interchangeVector = {};
@@ -67,9 +89,8 @@ struct SCFTilingOptions {
   /// when using loop constructs that dont support such a mapping (like
   /// `scf.for`)
   SmallVector<Attribute> mappingVector = {};
-  SCFTilingOptions &setMapping(ArrayRef<DeviceMappingAttrInterface> mapping) {
-    mappingVector = llvm::map_to_vector(
-        mapping, [](auto attr) -> Attribute { return attr; });
+  SCFTilingOptions &setMapping(ArrayRef<Attribute> mapping) {
+    mappingVector = llvm::to_vector(mapping);
     return *this;
   }
 };
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
index 71835cd178930..ea2f457c4e889 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
@@ -228,6 +228,11 @@ FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
 ///   } else {
 ///     scf.yield %pre_val : i64
 ///   }
+///
+/// Failure mechanism is not implemented for this function, so it currently
+/// always returns a `WhileOp` operation: a new one if the transformation took
+/// place or the input `whileOp` if the loop was already in a `do-while` form
+/// and `forceCreateCheck` is `false`.
 FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp,
                                                 RewriterBase &rewriter,
                                                 bool forceCreateCheck = false);
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index b7d6e99b5fdcc..4001ba3fc84c9 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -195,6 +195,14 @@ scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target,
 scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source,
                                           RewriterBase &rewriter);
 
+/// Normalize an `scf.forall` operation. Returns `failure()`if normalization
+/// fails.
+// On `success()` returns the
+/// newly created operation with all uses of the original operation replaced
+/// with results of the new operation.
+FailureOr<scf::ForallOp> normalizeForallOp(RewriterBase &rewriter,
+                                           scf::ForallOp forallOp);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_SCF_UTILS_UTILS_H_
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index 6ec97e17c5dcc..b38978272c5bd 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -3249,6 +3249,19 @@ def SPIRV_FC_OptNoneINTEL : I32BitEnumAttrCaseBit<"OptNoneINTEL", 16> {
   ];
 }
 
+def SPIRV_FPRM_RTE : I32EnumAttrCase<"RTE", 0>;
+def SPIRV_FPRM_RTZ : I32EnumAttrCase<"RTZ", 1>;
+def SPIRV_FPRM_RTP : I32EnumAttrCase<"RTP", 2>;
+def SPIRV_FPRM_RTN : I32EnumAttrCase<"RTN", 3>;
+
+// TODO: Enforce SPIR-V spec validation rule for Shader capability: only permit
+//       FPRoundingMode on a value stored to certain storage classes?
+//       (The OpenCL environment also has FPRoundingMode rules, but different.)
+def SPIRV_FPRoundingModeAttr :
+    SPIRV_I32EnumAttr<"FPRoundingMode", "valid SPIR-V FPRoundingMode", "fp_rounding_mode", [
+      SPIRV_FPRM_RTE, SPIRV_FPRM_RTZ, SPIRV_FPRM_RTP, SPIRV_FPRM_RTN
+    ]>;
+
 def SPIRV_FunctionControlAttr :
     SPIRV_BitEnumAttr<"FunctionControl", "valid SPIR-V FunctionControl", "function_control", [
       SPIRV_FC_None, SPIRV_FC_Inline, SPIRV_FC_DontInline, SPIRV_FC_Pure, SPIRV_FC_Const,
diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
index 9ad3d5fc85dd3..f54c93a09e727 100644
--- a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
+++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
@@ -19,8 +19,10 @@
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/LogicalResult.h"
 
 namespace mlir {
 
@@ -189,6 +191,25 @@ Value getVulkanElementPtr(const SPIRVTypeConverter &typeConverter,
                           MemRefType baseType, Value basePtr,
                           ValueRange indices, Location loc, OpBuilder &builder);
 
+// Find the largest factor of size among {2,3,4} for the lowest dimension of
+// the target shape.
+int getComputeVectorSize(int64_t size);
+
+// GetNativeVectorShape implementation for reduction ops.
+SmallVector<int64_t> getNativeVectorShapeImpl(vector::ReductionOp op);
+
+// GetNativeVectorShape implementation for transpose ops.
+SmallVector<int64_t> getNativeVectorShapeImpl(vector::TransposeOp op);
+
+// For general ops.
+std::optional<SmallVector<int64_t>> getNativeVectorShape(Operation *op);
+
+// Unroll vectors in function signatures to native size.
+LogicalResult unrollVectorsInSignatures(Operation *op);
+
+// Unroll vectors in function bodies to native size.
+LogicalResult unrollVectorsInFuncBodies(Operation *op);
+
 } // namespace spirv
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 68ca036121520..388efd1c454b1 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -61,37 +61,62 @@ struct COOSegment {
 /// A simple wrapper to encode a bitset of (at most 64) levels, currently used
 /// by `sparse_tensor.iterate` operation for the set of levels on which the
 /// coordinates should be loaded.
-class LevelSet {
-  uint64_t bits = 0;
+class I64BitSet {
+  uint64_t storage = 0;
 
 public:
-  LevelSet() = default;
-  explicit LevelSet(uint64_t bits) : bits(bits) {}
-  operator uint64_t() const { return bits; }
+  using const_set_bits_iterator = llvm::const_set_bits_iterator_impl<I64BitSet>;
+  const_set_bits_iterator begin() const {
+    return const_set_bits_iterator(*this);
+  }
+  const_set_bits_iterator end() const {
+    return const_set_bits_iterator(*this, -1);
+  }
+  iterator_range<const_set_bits_iterator> bits() const {
+    return make_range(begin(), end());
+  }
+
+  I64BitSet() = default;
+  explicit I64BitSet(uint64_t bits) : storage(bits) {}
+  operator uint64_t() const { return storage; }
 
-  LevelSet &set(unsigned i) {
+  I64BitSet &set(unsigned i) {
     assert(i < 64);
-    bits |= static_cast<uint64_t>(0x01u) << i;
+    storage |= static_cast<uint64_t>(0x01u) << i;
     return *this;
   }
 
-  LevelSet &operator|=(LevelSet lhs) {
-    bits |= static_cast<uint64_t>(lhs);
+  I64BitSet &operator|=(I64BitSet lhs) {
+    storage |= static_cast<uint64_t>(lhs);
     return *this;
   }
 
-  LevelSet &lshift(unsigned offset) {
-    bits = bits << offset;
+  I64BitSet &lshift(unsigned offset) {
+    storage = storage << offset;
     return *this;
   }
 
+  // Needed by `llvm::const_set_bits_iterator_impl`.
+  int find_first() const { return min(); }
+  int find_next(unsigned prev) const {
+    if (prev >= max())
+      return -1;
+
+    uint64_t b = storage >> (prev + 1);
+    if (b == 0)
+      return -1;
+
+    return llvm::countr_zero(b) + prev + 1;
+  }
+
   bool operator[](unsigned i) const {
     assert(i < 64);
-    return (bits & (1 << i)) != 0;
+    return (storage & (1 << i)) != 0;
   }
-  unsigned max() const { return 64 - llvm::countl_zero(bits); }
-  unsigned count() const { return llvm::popcount(bits); }
-  bool empty() const { return bits == 0; }
+  unsigned min() const { return llvm::countr_zero(storage); }
+  unsigned max() const { return 64 - llvm::countl_zero(storage); }
+  unsigned count() const { return llvm::popcount(storage); }
+  bool empty() const { return storage == 0; }
 };
 
 } // namespace sparse_tensor
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index 69b212cce4ceb..cb6c1b63e4e4b 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -24,16 +24,17 @@ class SparseTensor_Attr<string name,
 // sparse tensor levels.
 //===----------------------------------------------------------------------===//
 
-def LevelSetAttr :
-    TypedAttrBase<
-      I64, "IntegerAttr",
+def I64BitSetAttr : TypedAttrBase<I64, "IntegerAttr",
       And<[CPred<"::llvm::isa<::mlir::IntegerAttr>($_self)">,
            CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getType().isInteger(64)">]>,
       "LevelSet attribute"> {
-  let returnType = [{::mlir::sparse_tensor::LevelSet}];
-  let convertFromStorage = [{::mlir::sparse_tensor::LevelSet($_self.getValue().getZExtValue())}];
+  let returnType = [{::mlir::sparse_tensor::I64BitSet}];
+  let convertFromStorage = [{::mlir::sparse_tensor::I64BitSet($_self.getValue().getZExtValue())}];
 }
 
+def I64BitSetArrayAttr :
+    TypedArrayAttrBase<I64BitSetAttr, "I64BitSet array attribute">;
+
 //===----------------------------------------------------------------------===//
 // These attributes are just like `IndexAttr` except that they clarify whether
 // the index refers to a dimension (an axis of the semantic tensor) or a level
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index f31df080d7811..6e17f804993e2 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1306,7 +1306,7 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu
 
 def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator,
     ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp",
-                 "ForeachOp", "IterateOp"]>]> {
+                 "ForeachOp", "IterateOp", "CoIterateOp"]>]> {
   let summary = "Yield from sparse_tensor set-like operations";
   let description = [{
       Yields a value from within a `binary`, `unary`, `reduce`,
@@ -1531,6 +1531,31 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
   let hasVerifier = 1;
 }
 
+def ExtractValOp : SparseTensor_Op<"extract_value", [
+    Pure,
+    TypesMatchWith<"result type matches element type of tensor",
+                   "tensor", "result",
+                   "::llvm::cast<TensorType>($_self).getElementType()">]> {
+  let summary = "Extracts a value from a sparse tensor using an iterator.";
+  let description = [{
+      The `sparse_tensor.extract_value` operation extracts the value
+      pointed to by a sparse iterator from a sparse tensor.
+
+      Example:
+
+      ```mlir
+      %val = sparse_tensor.extract_value %sp at %it
+           : tensor<?x?xf32, #CSR>, !sparse_tensor.iterator<#CSR, lvl = 1>
+      ```
+  }];
+
+  let arguments = (ins AnySparseTensor:$tensor, AnySparseIterator:$iterator);
+  let results = (outs AnyType:$result);
+
+  let assemblyFormat = "$tensor `at` $iterator attr-dict `:` type($tensor)`,` qualified(type($iterator))";
+  let hasVerifier = 1;
+}
+
 def IterateOp : SparseTensor_Op<"iterate",
     [RecursiveMemoryEffects, RecursivelySpeculatable,
      DeclareOpInterfaceMethods<LoopLikeOpInterface,
@@ -1604,14 +1629,14 @@ def IterateOp : SparseTensor_Op<"iterate",
 
   let arguments = (ins AnySparseIterSpace:$iterSpace,
                        Variadic<AnyType>:$initArgs,
-                       LevelSetAttr:$crdUsedLvls);
+                       I64BitSetAttr:$crdUsedLvls);
   let results = (outs Variadic<AnyType>:$results);
   let regions = (region SizedRegion<1>:$region);
 
   let skipDefaultBuilders = 1;
   let builders = [
     OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs)>,
-    OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs, "LevelSet" :$crdUsedLvls)>
+    OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs, "I64BitSet" :$crdUsedLvls)>
   ];
 
   let extraClassDeclaration = [{
@@ -1644,6 +1669,127 @@ def IterateOp : SparseTensor_Op<"iterate",
   let hasCustomAssemblyFormat = 1;
 }
 
+def SparseTensor_CoIterateOp : SparseTensor_Op<"coiterate",
+    [AttrSizedOperandSegments,
+     SingleBlockImplicitTerminator<"sparse_tensor::YieldOp">,
+     RecursiveMemoryEffects]> {
+  let summary = "Co-iterates over a set of sparse iteration spaces";
+  let description = [{
+      The `sparse_tensor.coiterate` operation represents a loop (nest) over
+      a set of iteration spaces. The operation can have multiple regions,
+      with each of them defining a case to compute a result at the current iterations.
+      The case condition is defined solely based on the pattern of specified iterators.
+      For example:
+      ```mlir
+      %ret = sparse_tensor.coiterate (%sp1, %sp2) at(%coord) iter_args(%arg = %init)
+           : (!sparse_tensor.iter_space<#CSR, lvls = 0>,
+              !sparse_tensor.iter_space<#COO, lvls = 0>)
+           -> index
+      case %it1, _ {
+        // %coord is specifed in space %sp1 but *NOT* specified in space %sp2.
+      }
+      case %it1, %it2 {
+        // %coord is specifed in *BOTH* spaces %sp1 and %sp2.
+      }
+      ```
+
+      `sparse_tensor.coiterate` can also operate on loop-carried variables.
+      It returns the final value for each loop-carried variable after loop termination.
+      The initial values of the variables are passed as additional SSA operands
+      to the iterator SSA value and used coordinate SSA values.
+      Each operation region has variadic arguments for specified (used), one argument
+      for each loop-carried variable, representing the value of the variable
+      at the current iteration, followed by a list of arguments for iterators.
+      The body region must contain exactly one block that terminates with
+      `sparse_tensor.yield`.
+
+      The results of an `sparse_tensor.coiterate` hold the final values after
+      the last iteration. If the `sparse_tensor.coiterate` defines any values,
+      a yield must be explicitly present in every region defined in the operation.
+      The number and types of the `sparse_tensor.coiterate` results must match
+      the initial values in the iter_args binding and the yield operands.
+
+
+      A `sparse_tensor.coiterate` example that does elementwise addition between two
+      sparse vectors.
+
+
+      ```mlir
+      %ret = sparse_tensor.coiterate (%sp1, %sp2) at(%coord) iter_args(%arg = %init)
+           : (!sparse_tensor.iter_space<#CSR, lvls = 0>,
+              !sparse_tensor.iter_space<#CSR, lvls = 0>)
+           -> tensor<?xindex, #CSR>
+      case %it1, _ {
+         // v = v1 + 0 = v1
+         %v1 = sparse_tensor.extract_value %t1 at %it1 : index
+         %yield = sparse_tensor.insert %v1 into %arg[%coord]
+         sparse_tensor.yield %yield
+      }
+      case _, %it2 {
+         // v = v2 + 0 = v2
+         %v2 = sparse_tensor.extract_value %t2 at %it2 : index
+         %yield = sparse_tensor.insert %v1 into %arg[%coord]
+         sparse_tensor.yield %yield
+      }
+      case %it1, %it2 {
+         // v = v1 + v2
+         %v1 = sparse_tensor.extract_value %t1 at %it1 : index
+         %v2 = sparse_tensor.extract_value %t2 at %it2 : index
+         %v = arith.addi %v1, %v2 : index
+         %yield = sparse_tensor.insert %v into %arg[%coord]
+         sparse_tensor.yield %yield
+      }
+      ```
+  }];
+
+  let arguments = (ins Variadic<AnySparseIterSpace>:$iterSpaces,
+                       Variadic<AnyType>:$initArgs,
+                       I64BitSetAttr:$crdUsedLvls,
+                       I64BitSetArrayAttr:$cases);
+  let results = (outs Variadic<AnyType>:$results);
+  let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions);
+
+  let extraClassDeclaration = [{
+    unsigned getSpaceDim() {
+      return llvm::cast<::mlir::sparse_tensor::IterSpaceType>(
+                 getIterSpaces().front().getType())
+          .getSpaceDim();
+    }
+    I64BitSet getRegionDefinedSpace(unsigned regionIdx) {
+      return I64BitSet(llvm::cast<IntegerAttr>(getCases()[regionIdx])
+                           .getValue().getZExtValue());
+    }
+    auto getRegionDefinedSpaces() {
+      return llvm::map_range(getCases().getValue(), [](Attribute attr) {
+        return I64BitSet(llvm::cast<IntegerAttr>(attr).getValue().getZExtValue());
+      });
+    }
+
+    // The block arguments starts with referenced coordinates, follows by
+    // user-provided iteration arguments and ends with iterators.
+    Block::BlockArgListType getCrds(unsigned regionIdx) {
+      return getRegion(regionIdx).getArguments()
+          .take_front(getCrdUsedLvls().count());
+    }
+    unsigned getNumRegionIterArgs(unsigned regionIdx) {
+      return getInitArgs().size();
+    }
+    Block::BlockArgListType getRegionIterArgs(unsigned regionIdx) {
+      return getRegion(regionIdx).getArguments()
+          .slice(getCrdUsedLvls().count(), getNumRegionIterArgs(regionIdx));
+    }
+    Block::BlockArgListType getRegionIterators(unsigned regionIdx) {
+      return getRegion(regionIdx).getArguments()
+          .take_back(getRegionDefinedSpace(regionIdx).count());
+    }
+    ValueRange getYieldedValues(unsigned regionIdx);
+  }];
+
+  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Sparse Tensor Debugging and Test-Only Operations.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 7a661c663e010..b946fc8875860 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -649,11 +649,12 @@ def ForeachOp : TransformDialectOp<"foreach",
   }];
 
   let arguments = (ins Variadic<Transform_AnyHandleOrParamType>:$targets,
-                       UnitAttr:$zip_shortest);
+                       UnitAttr:$with_zip_shortest);
   let results = (outs Variadic<Transform_AnyHandleOrParamType>:$results);
   let regions = (region SizedRegion<1>:$body);
   let assemblyFormat =
-    "$targets attr-dict `:` type($targets) (`->` type($results)^)? $body";
+    "$targets oilist(`with_zip_shortest` $with_zip_shortest) `:` "
+    "type($targets) (`->` type($results)^)? $body attr-dict";
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
index b774359552aa5..7849782e5442b 100644
--- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
@@ -243,6 +243,14 @@ SmallVector<int64_t>
 computePermutationVector(int64_t permSize, ArrayRef<int64_t> positions,
                          ArrayRef<int64_t> desiredPositions);
 
+/// Returns a permutation vector that drop the input dims in
+/// dropPositions from inputPerm.
+///
+/// For example, inputPerm = {2, 4, 0, 1, 3} and dropPositions= {1, 2} would
+/// result in a {2, 0, 1} permutation vector.
+SmallVector<int64_t> dropDims(ArrayRef<int64_t> inputPerm,
+                              ArrayRef<int64_t> dropPositions);
+
 /// Helper to return a subset of `arrayAttr` as a vector of int64_t.
 // TODO: Port everything relevant to DenseArrayAttr and drop this util.
 SmallVector<int64_t> getI64SubArray(ArrayAttr arrayAttr, unsigned dropFront = 0,
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 2019eb5a9fd7f..434ff3956c250 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -300,9 +300,9 @@ def Vector_MultiDimReductionOp :
 
     ```mlir
     %1 = vector.multi_reduction <add>, %0, %acc0 [1, 3] :
-      vector<4x8x16x32xf32> into vector<4x16xf32>
+      vector<4x8x16x32xf32> to vector<4x16xf32>
     %2 = vector.multi_reduction <add>, %1, %acc1 [0, 1] :
-      vector<4x16xf32> into f32
+      vector<4x16xf32> to f32
     ```
   }];
   let builders = [
@@ -421,7 +421,7 @@ def Vector_ShuffleOp :
                  TCresVTEtIsSameAsOpBase<0, 1>>,
      InferTypeOpAdaptor]>,
      Arguments<(ins AnyFixedVector:$v1, AnyFixedVector:$v2,
-                    I64ArrayAttr:$mask)>,
+                    DenseI64ArrayAttr:$mask)>,
      Results<(outs AnyVector:$vector)> {
   let summary = "shuffle operation";
   let description = [{
@@ -459,11 +459,7 @@ def Vector_ShuffleOp :
                : vector<f32>, vector<f32>           ; yields vector<2xf32>
     ```
   }];
-  let builders = [
-    OpBuilder<(ins "Value":$v1, "Value":$v2, "ArrayRef<int64_t>")>
-  ];
-  let hasFolder = 1;
-  let hasCanonicalizer = 1;
+
   let extraClassDeclaration = [{
     VectorType getV1VectorType() {
       return ::llvm::cast<VectorType>(getV1().getType());
@@ -475,7 +471,10 @@ def Vector_ShuffleOp :
       return ::llvm::cast<VectorType>(getVector().getType());
     }
   }];
+
   let assemblyFormat = "operands $mask attr-dict `:` type(operands)";
+
+  let hasFolder = 1;
   let hasVerifier = 1;
   let hasCanonicalizer = 1;
 }
@@ -2443,7 +2442,7 @@ def Vector_TypeCastOp :
 
 def Vector_ConstantMaskOp :
   Vector_Op<"constant_mask", [Pure]>,
-    Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>,
+    Arguments<(ins DenseI64ArrayAttr:$mask_dim_sizes)>,
     Results<(outs VectorOfAnyRankOf<[I1]>)> {
   let summary = "creates a constant vector mask";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 9c83acc76e77a..40e04b76593a0 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 #define MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -101,6 +102,24 @@ bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
 std::optional<StaticTileOffsetRange>
 createUnrollIterator(VectorType vType, int64_t targetRank = 1);
 
+/// Returns a functor (int64_t -> Value) which returns a constant vscale
+/// multiple.
+///
+/// Example:
+/// ```c++
+/// auto createVscaleMultiple = makeVscaleConstantBuilder(rewriter, loc);
+/// auto c4Vscale = createVscaleMultiple(4); // 4 * vector.vscale
+/// ```
+inline auto makeVscaleConstantBuilder(PatternRewriter &rewriter, Location loc) {
+  Value vscale = nullptr;
+  return [loc, vscale, &rewriter](int64_t multiplier) mutable {
+    if (!vscale)
+      vscale = rewriter.create<vector::VectorScaleOp>(loc);
+    return rewriter.create<arith::MulIOp>(
+        loc, vscale, rewriter.create<arith::ConstantIndexOp>(loc, multiplier));
+  };
+}
+
 /// A wrapper for getMixedSizes for vector.transfer_read and
 /// vector.transfer_write Ops (for source and destination, respectively).
 ///
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 0d5fa719d0dee..1c4d329fbf0d8 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -61,6 +61,7 @@ class Builder {
 
   // Types.
   FloatType getFloat8E5M2Type();
+  FloatType getFloat8E4M3Type();
   FloatType getFloat8E4M3FNType();
   FloatType getFloat8E5M2FNUZType();
   FloatType getFloat8E4M3FNUZType();
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h
index 5579b138668d2..4250be90ba7fb 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.h
+++ b/mlir/include/mlir/IR/BuiltinTypes.h
@@ -39,6 +39,11 @@ struct IntegerTypeStorage;
 struct TupleTypeStorage;
 } // namespace detail
 
+/// Type trait indicating that the type has value semantics.
+template <typename ConcreteType>
+class ValueSemantics
+    : public TypeTrait::TraitBase<ConcreteType, ValueSemantics> {};
+
 //===----------------------------------------------------------------------===//
 // FloatType
 //===----------------------------------------------------------------------===//
@@ -56,6 +61,7 @@ class FloatType : public Type {
   static FloatType getF80(MLIRContext *ctx);
   static FloatType getF128(MLIRContext *ctx);
   static FloatType getFloat8E5M2(MLIRContext *ctx);
+  static FloatType getFloat8E4M3(MLIRContext *ctx);
   static FloatType getFloat8E4M3FN(MLIRContext *ctx);
   static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx);
   static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx);
@@ -405,16 +411,20 @@ inline bool BaseMemRefType::isValidElementType(Type type) {
 }
 
 inline bool FloatType::classof(Type type) {
-  return llvm::isa<Float8E5M2Type, Float8E4M3FNType, Float8E5M2FNUZType,
-                   Float8E4M3FNUZType, Float8E4M3B11FNUZType, BFloat16Type,
-                   Float16Type, FloatTF32Type, Float32Type, Float64Type,
-                   Float80Type, Float128Type>(type);
+  return llvm::isa<
+      Float8E5M2Type, Float8E4M3Type, Float8E4M3FNType, Float8E5M2FNUZType,
+      Float8E4M3FNUZType, Float8E4M3B11FNUZType, BFloat16Type, Float16Type,
+      FloatTF32Type, Float32Type, Float64Type, Float80Type, Float128Type>(type);
 }
 
 inline FloatType FloatType::getFloat8E5M2(MLIRContext *ctx) {
   return Float8E5M2Type::get(ctx);
 }
 
+inline FloatType FloatType::getFloat8E4M3(MLIRContext *ctx) {
+  return Float8E4M3Type::get(ctx);
+}
+
 inline FloatType FloatType::getFloat8E4M3FN(MLIRContext *ctx) {
   return Float8E4M3FNType::get(ctx);
 }
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 4cade83dd3c32..176a167a3ca31 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -30,6 +30,15 @@ class Builtin_Type<string name, string typeMnemonic, list<Trait> traits = [],
   let typeName = "builtin." # typeMnemonic;
 }
 
+//===----------------------------------------------------------------------===//
+// Traits
+//===----------------------------------------------------------------------===//
+
+/// Type trait indicating that the type has value semantics.
+def ValueSemantics : NativeTypeTrait<"ValueSemantics"> {
+  let cppNamespace = "::mlir";
+}
+
 //===----------------------------------------------------------------------===//
 // ComplexType
 //===----------------------------------------------------------------------===//
@@ -97,6 +106,25 @@ def Builtin_Float8E5M2 : Builtin_FloatType<"Float8E5M2", "f8E5M2"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Float8E4M3Type
+
+def Builtin_Float8E4M3 : Builtin_FloatType<"Float8E4M3", "f8E4M3"> {
+  let summary = "8-bit floating point with 3 bit mantissa";
+  let description = [{
+    An 8-bit floating point type with 1 sign bit, 4 bits exponent and 3 bits
+    mantissa. This is not a standard type as defined by IEEE-754, but it
+    follows similar conventions with the following characteristics:
+
+      * bit encoding: S1E4M3
+      * exponent bias: 7
+      * infinities: supported with exponent set to all 1s and mantissa 0s
+      * NaNs: supported with exponent bits set to all 1s and mantissa of
+        (001, 010, 011, 100, 101, 110, 111)
+      * denormals when exponent is 0
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Float8E4M3FNType
 
@@ -745,7 +773,7 @@ def Builtin_Opaque : Builtin_Type<"Opaque", "opaque"> {
 //===----------------------------------------------------------------------===//
 
 def Builtin_RankedTensor : Builtin_Type<"RankedTensor", "tensor", [
-    ShapedTypeInterface
+    ShapedTypeInterface, ValueSemantics
   ], "TensorType"> {
   let summary = "Multi-dimensional array with a fixed number of dimensions";
   let description = [{
@@ -1001,7 +1029,7 @@ def Builtin_UnrankedMemRef : Builtin_Type<"UnrankedMemRef", "unranked_memref", [
 //===----------------------------------------------------------------------===//
 
 def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [
-    ShapedTypeInterface
+    ShapedTypeInterface, ValueSemantics
   ], "TensorType"> {
   let summary = "Multi-dimensional array with unknown dimensions";
   let description = [{
@@ -1049,7 +1077,8 @@ def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [
 // VectorType
 //===----------------------------------------------------------------------===//
 
-def Builtin_Vector : Builtin_Type<"Vector", "vector", [ShapedTypeInterface], "Type"> {
+def Builtin_Vector : Builtin_Type<"Vector", "vector",
+    [ShapedTypeInterface, ValueSemantics], "Type"> {
   let summary = "Multi-dimensional SIMD vector type";
   let description = [{
     Syntax:
@@ -1139,6 +1168,11 @@ def Builtin_Vector : Builtin_Type<"Vector", "vector", [ShapedTypeInterface], "Ty
       return !llvm::is_contained(getScalableDims(), false);
     }
 
+    /// Get the number of scalable dimensions.
+    int64_t getNumScalableDims() const {
+      return llvm::count(getScalableDims(), true);
+    }
+
     /// Get or create a new VectorType with the same shape as `this` and an
     /// element type of bitwidth scaled by `scale`.
     /// Return null if the scaled element type cannot be represented.
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index af4f13dc09360..c23d2d87e080f 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -89,6 +89,9 @@ def HasStaticShapePred :
 // Whether a type is a TupleType.
 def IsTupleTypePred : CPred<"::llvm::isa<::mlir::TupleType>($_self)">;
 
+// Whether a type has a ValueSemantics trait.
+def HasValueSemanticsPred : CPred<"$_self.hasTrait<::mlir::ValueSemantics>()">;
+
 //===----------------------------------------------------------------------===//
 // Type definitions
 //===----------------------------------------------------------------------===//
@@ -331,6 +334,8 @@ def F8E4M3FN : Type<CPred<"$_self.isFloat8E4M3FN()">, "f8E4M3FN type">,
                BuildableType<"$_builder.getFloat8E4M3FNType()">;
 def F8E5M2 : Type<CPred<"$_self.isFloat8E5M2()">, "f8E5M2 type">,
              BuildableType<"$_builder.getFloat8E5M2Type()">;
+def F8E4M3 : Type<CPred<"$_self.isFloat8E4M3()">, "f8E4M3 type">,
+             BuildableType<"$_builder.getFloat8E4M3Type()">;
 def F8E4M3FNUZ : Type<CPred<"$_self.isFloat8E4M3FNUZ()">, "f8E4M3FNUZ type">,
                  BuildableType<"$_builder.getFloat8E4M3FNUZType()">;
 def F8E4M3B11FNUZ : Type<CPred<"$_self.isFloat8E4M3B11FNUZ()">, "f8E4M3B11FNUZ type">,
@@ -403,6 +408,11 @@ class HasRankGreaterOrEqualPred<int rank> : And<[
     CPred<[{::llvm::cast<::mlir::ShapedType>($_self).getRank() >= }] # rank>
 ]>;
 
+// Container with value semantics.
+class ValueSemanticsContainerOf<list<Type> allowedTypes> :
+  ShapedContainerType<allowedTypes, HasValueSemanticsPred,
+  "container with value semantics">;
+
 // Vector types.
 
 class VectorOf<list<Type> allowedTypes> :
@@ -842,10 +852,18 @@ class NestedTupleOf<list<Type> allowedTypes> :
 // Common type constraints
 //===----------------------------------------------------------------------===//
 // Type constraint for types that are "like" some type or set of types T, that is
-// they're either a T, a vector of Ts, or a tensor of Ts
+// they're either a T, a vector of Ts, or a tensor of Ts.
 class TypeOrContainer<Type allowedType, string name> : TypeConstraint<Or<[
-  allowedType.predicate, VectorOf<[allowedType]>.predicate,
-  TensorOf<[allowedType]>.predicate]>,
+  allowedType.predicate,
+  ValueSemanticsContainerOf<[allowedType]>.predicate]>,
+  name>;
+
+// Type constraint for types that are "like" some type or set of types T, that is
+// they're either a T or a mapable container of Ts.
+class TypeOrValueSemanticsContainer<Type allowedType, string name>
+    : TypeConstraint<Or<[
+  allowedType.predicate,
+  ValueSemanticsContainerOf<[allowedType]>.predicate]>,
   name>;
 
 // Temporary constraint to allow gradual transition to supporting 0-D vectors.
@@ -864,8 +882,8 @@ def BoolLikeOfAnyRank : TypeOrContainerOfAnyRank<I1, "bool-like">;
 
 // Type constraint for signless-integer-like types: signless integers, indices,
 // vectors of signless integers or indices, tensors of signless integers.
-def SignlessIntegerLike : TypeOrContainer<AnySignlessIntegerOrIndex,
-    "signless-integer-like">;
+def SignlessIntegerLike : TypeOrValueSemanticsContainer<
+    AnySignlessIntegerOrIndex, "signless-integer-like">;
 
 def SignlessIntegerLikeOfAnyRank : TypeOrContainerOfAnyRank<
     AnySignlessIntegerOrIndex,
diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h
index bb2e1bb183e9b..cb30bb3f59688 100644
--- a/mlir/include/mlir/IR/Diagnostics.h
+++ b/mlir/include/mlir/IR/Diagnostics.h
@@ -271,6 +271,9 @@ class Diagnostic {
     return failure();
   }
 
+  /// Returns the current list of diagnostic metadata.
+  SmallVectorImpl<DiagnosticArgument> &getMetadata() { return metadata; }
+
 private:
   Diagnostic(const Diagnostic &rhs) = delete;
   Diagnostic &operator=(const Diagnostic &rhs) = delete;
@@ -290,6 +293,9 @@ class Diagnostic {
 
   /// A list of attached notes.
   NoteVector notes;
+
+  /// A list of metadata attached to this Diagnostic.
+  SmallVector<DiagnosticArgument, 0> metadata;
 };
 
 inline raw_ostream &operator<<(raw_ostream &os, const Diagnostic &diag) {
diff --git a/mlir/include/mlir/IR/ODSSupport.h b/mlir/include/mlir/IR/ODSSupport.h
index 70e3f986431e2..25d6f3da6a861 100644
--- a/mlir/include/mlir/IR/ODSSupport.h
+++ b/mlir/include/mlir/IR/ODSSupport.h
@@ -33,6 +33,37 @@ convertFromAttribute(int64_t &storage, Attribute attr,
 /// Convert the provided int64_t to an IntegerAttr attribute.
 Attribute convertToAttribute(MLIRContext *ctx, int64_t storage);
 
+/// Convert an IntegerAttr attribute to an int32_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(int32_t &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided int32_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, int32_t storage);
+
+/// Extract the string from `attr` into `storage`. If `attr` is not a
+/// `StringAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(std::string &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given string into a StringAttr. Note that this takes a reference
+/// to the storage of a string property, which is an std::string.
+Attribute convertToAttribute(MLIRContext *ctx, const std::string &storage);
+
+/// Extract the boolean from `attr` into `storage`. If `attr` is not a
+/// `BoolAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(bool &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given string into a BooleanAttr.
+Attribute convertToAttribute(MLIRContext *ctx, bool storage);
+
 /// Convert a DenseI64ArrayAttr to the provided storage. It is expected that the
 /// storage has the same size as the array. An error is returned if the
 /// attribute isn't a DenseI64ArrayAttr or it does not have the same size. If
@@ -49,9 +80,24 @@ LogicalResult
 convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
                      function_ref<InFlightDiagnostic()> emitError);
 
+/// Convert a DenseI64ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI64ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert a DenseI32ArrayAttr to the provided storage, which will be
+/// cleared before writing. It is expected that the storage has the same size as
+/// the array. An error is returned and emitted to the optional `emitError`
+/// function if the attribute isn't a DenseI32ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
 /// Convert the provided ArrayRef<int64_t> to a DenseI64ArrayAttr attribute.
 Attribute convertToAttribute(MLIRContext *ctx, ArrayRef<int64_t> storage);
 
 } // namespace mlir
 
-#endif // MLIR_IR_ODSSUPPORT_H
\ No newline at end of file
+#endif // MLIR_IR_ODSSUPPORT_H
diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h
index fa435cb3155ed..ae412c7227f8e 100644
--- a/mlir/include/mlir/IR/OpImplementation.h
+++ b/mlir/include/mlir/IR/OpImplementation.h
@@ -714,16 +714,27 @@ class AsmParser {
     return *parseResult;
   }
 
+  /// Parse a decimal integer value from the stream.
+  template <typename IntT>
+  ParseResult parseDecimalInteger(IntT &result) {
+    auto loc = getCurrentLocation();
+    OptionalParseResult parseResult = parseOptionalDecimalInteger(result);
+    if (!parseResult.has_value())
+      return emitError(loc, "expected decimal integer value");
+    return *parseResult;
+  }
+
   /// Parse an optional integer value from the stream.
   virtual OptionalParseResult parseOptionalInteger(APInt &result) = 0;
+  virtual OptionalParseResult parseOptionalDecimalInteger(APInt &result) = 0;
 
-  template <typename IntT>
-  OptionalParseResult parseOptionalInteger(IntT &result) {
+ private:
+  template <typename IntT, typename ParseFn>
+  OptionalParseResult parseOptionalIntegerAndCheck(IntT &result,
+                                                   ParseFn &&parseFn) {
     auto loc = getCurrentLocation();
-
-    // Parse the unsigned variant.
     APInt uintResult;
-    OptionalParseResult parseResult = parseOptionalInteger(uintResult);
+    OptionalParseResult parseResult = parseFn(uintResult);
     if (!parseResult.has_value() || failed(*parseResult))
       return parseResult;
 
@@ -737,6 +748,20 @@ class AsmParser {
     return success();
   }
 
+ public:
+  template <typename IntT>
+  OptionalParseResult parseOptionalInteger(IntT &result) {
+    return parseOptionalIntegerAndCheck(
+        result, [&](APInt &result) { return parseOptionalInteger(result); });
+  }
+
+  template <typename IntT>
+  OptionalParseResult parseOptionalDecimalInteger(IntT &result) {
+    return parseOptionalIntegerAndCheck(result, [&](APInt &result) {
+      return parseOptionalDecimalInteger(result);
+    });
+  }
+
   /// These are the supported delimiters around operand lists and region
   /// argument lists, used by parseOperandList.
   enum class Delimiter {
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 0babdbbfa05bc..0becf7d009835 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -29,7 +29,6 @@ class Property<string storageTypeParam = "", string desc = ""> {
   //
   // Format:
   // - `$_storage` will contain the property in the storage type.
-  // - `$_ctxt` will contain an `MLIRContext *`.
   code convertFromStorage = "$_storage";
 
   // The call expression to build a property storage from the interface type.
@@ -40,24 +39,26 @@ class Property<string storageTypeParam = "", string desc = ""> {
   code assignToStorage = "$_storage = $_value";
 
   // The call expression to convert from the storage type to an attribute.
+  // The resulting attribute must be non-null in non-error cases.
   //
   // Format:
   // - `$_storage` is the storage type value.
   // - `$_ctxt` is a `MLIRContext *`.
   //
-  // The expression must result in an Attribute.
+  // The expression must return an `Attribute` and will be used as a function body.
   code convertToAttribute = [{
-    convertToAttribute($_ctxt, $_storage)
+    return convertToAttribute($_ctxt, $_storage);
   }];
 
   // The call expression to convert from an Attribute to the storage type.
   //
   // Format:
-  // - `$_storage` is the storage type value.
+  // - `$_storage` is a reference to a value of the storage type.
   // - `$_attr` is the attribute.
   // - `$_diag` is a callback to get a Diagnostic to emit error.
   //
-  // The expression must return a LogicalResult
+  // The expression must return a LogicalResult and will be used as a function body
+  // or in other similar contexts.
   code convertFromAttribute = [{
     return convertFromAttribute($_storage, $_attr, $_diag);
   }];
@@ -68,18 +69,68 @@ class Property<string storageTypeParam = "", string desc = ""> {
   // - `$_storage` is the variable to hash.
   //
   // The expression should define a llvm::hash_code.
-  code hashProperty = [{
-    llvm::hash_value($_storage);
+  // If unspecified, defaults to `llvm::hash_value($_storage)`.
+  // The default is not specified in tablegen because many combinators, like
+  // ArrayProperty, can fall back to more efficient implementations of
+  // `hashProperty` when their underlying elements have trivial hashing.
+  code hashProperty = "";
+
+  // The body of the parser for a value of this property.
+  // Format:
+  // - `$_parser` is the OpAsmParser.
+  // - `$_storage` is the location into which the value is to be placed if it is
+  //  present.
+  // - `$_ctxt` is a `MLIRContext *`
+  //
+  // This defines the body of a function (typically a lambda) that returns a
+  // ParseResult. There is an implicit `return success()` at the end of the parser
+  // code.
+  //
+  // When this code executes, `$_storage` will be initialized to the property's
+  // default value (if any, accounting for the storage type override).
+  code parser = [{
+    auto value = ::mlir::FieldParser<}] # storageType # [{>::parse($_parser);
+    if (::mlir::failed(value))
+      return ::mlir::failure();
+    $_storage = std::move(*value);
   }];
 
+  // The body of the parser for a value of this property as the anchor of an optional
+  // group. This should parse the property if possible and do nothing if a value of
+  // the relevant type is not next in the parse stream.
+  // You are not required to define this parser if it cannot be meaningfully
+  // implemented.
+  // This has the same context and substitutions as `parser` except that it is
+  // required to return an OptionalParseResult.
+  //
+  // If the optional parser doesn't parse anything, it should not set
+  // $_storage, since the parser doesn't know if the default value has been
+  // overwritten.
+  code optionalParser = "";
+
+  // The printer for a value of this property.
+  // Format:
+  // - `$_storage` is the storage data.
+  // - `$_printer` is the OpAsmPrinter instance.
+  // - `$_ctxt` is a `MLIRContext *`
+  //
+  // This may be called in an expression context, so variable declarations must
+  // be placed within a new scope.
+  //
+  // The printer for a property should always print a non-empty value - default value
+  // printing elision happens outside the context of this printing expression.
+  code printer = "$_printer << $_storage";
+
   // The call expression to emit the storage type to bytecode.
   //
   // Format:
   // - `$_storage` is the storage type value.
   // - `$_writer` is a `DialectBytecodeWriter`.
   // - `$_ctxt` is a `MLIRContext *`.
+  //
+  // This will become the body af a function returning void.
   code writeToMlirBytecode = [{
-    writeToMlirBytecode($_writer, $_storage)
+    writeToMlirBytecode($_writer, $_storage);
   }];
 
   // The call expression to read the storage type from bytecode.
@@ -88,13 +139,31 @@ class Property<string storageTypeParam = "", string desc = ""> {
   // - `$_storage` is the storage type value.
   // - `$_reader` is a `DialectBytecodeReader`.
   // - `$_ctxt` is a `MLIRContext *`.
+  //
+  // This will become the body of a function returning LogicalResult.
+  // There is an implicit `return success()` at the end of this function.
+  //
+  // When this code executes, `$_storage` will be initialized to the property's
+  // default value (if any, accounting for the storage type override).
   code readFromMlirBytecode = [{
     if (::mlir::failed(readFromMlirBytecode($_reader, $_storage)))
       return ::mlir::failure();
   }];
 
-  // Default value for the property.
-  string defaultValue = ?;
+  // Base definition for the property. (Will be) used for `OptionalProperty` and
+  // such cases, analogously to `baseAttr`.
+  Property baseProperty = ?;
+
+  // Default value for the property within its storage. This should be an expression
+  // of type `interfaceType` and should be comparable with other types of that
+  // interface typ with `==`. The empty string means there is no default value.
+  string defaultValue = "";
+
+  // If set, the default value the storage of the property should be initilized to.
+  // This is only needed when the storage and interface types of the property
+  // are distinct (ex. SmallVector for storage vs. ArrayRef for interfacing), as it
+  // will fall back to `defaultValue` when unspecified.
+  string storageTypeValueOverride = "";
 }
 
 /// Implementation of the Property class's `readFromMlirBytecode` field using
@@ -133,12 +202,16 @@ defvar writeMlirBytecodeWithConvertToAttribute = [{
 // Primitive property kinds
 
 // Any kind of integer stored as properties.
-class IntProperty<string storageTypeParam = "", string desc = ""> :
+class IntProperty<string storageTypeParam, string desc = ""> :
     Property<storageTypeParam, desc> {
-  code writeToMlirBytecode = [{
+  let summary = !if(!empty(desc), storageTypeParam, desc);
+  let optionalParser = [{
+    return $_parser.parseOptionalInteger($_storage);
+  }];
+  let writeToMlirBytecode = [{
     $_writer.writeVarInt($_storage);
   }];
-  code readFromMlirBytecode = [{
+  let readFromMlirBytecode = [{
     uint64_t val;
     if (failed($_reader.readVarInt(val)))
       return ::mlir::failure();
@@ -146,24 +219,472 @@ class IntProperty<string storageTypeParam = "", string desc = ""> :
   }];
 }
 
-class ArrayProperty<string storageTypeParam = "", int n, string desc = ""> :
-  Property<storageTypeParam # "[" # n # "]", desc> {
-  let interfaceType = "::llvm::ArrayRef<" # storageTypeParam # ">";
-  let convertFromStorage = "$_storage";
-  let assignToStorage = "::llvm::copy($_value, $_storage)";
-}
+def I32Property : IntProperty<"int32_t">;
+def I64Property : IntProperty<"int64_t">;
 
-class EnumProperty<string storageTypeParam, string desc = ""> :
+class EnumProperty<string storageTypeParam, string desc = "", string default = ""> :
     Property<storageTypeParam, desc> {
-  code writeToMlirBytecode = [{
+  // TODO: take advantage of EnumAttrInfo and the like to make this share nice
+  // parsing code with EnumAttr.
+  let writeToMlirBytecode = [{
     $_writer.writeVarInt(static_cast<uint64_t>($_storage));
   }];
-  code readFromMlirBytecode = [{
+  let readFromMlirBytecode = [{
     uint64_t val;
     if (failed($_reader.readVarInt(val)))
       return ::mlir::failure();
     $_storage = static_cast<}] # storageTypeParam # [{>(val);
   }];
+  let defaultValue = default;
 }
 
+def StringProperty : Property<"std::string", "string"> {
+  let interfaceType = "::llvm::StringRef";
+  let convertFromStorage = "::llvm::StringRef{$_storage}";
+  let assignToStorage = "$_storage = $_value.str()";
+  let optionalParser = [{
+    if (::mlir::failed($_parser.parseOptionalString(&$_storage)))
+      return std::nullopt;
+  }];
+  let printer = "$_printer.printString($_storage)";
+  let readFromMlirBytecode = [{
+    StringRef val;
+    if (::mlir::failed($_reader.readString(val)))
+      return ::mlir::failure();
+    $_storage = val.str();
+  }];
+  let writeToMlirBytecode = [{
+    $_writer.writeOwnedString($_storage);
+  }];
+}
+
+def BoolProperty : IntProperty<"bool", "boolean"> {
+  let printer = [{ $_printer << ($_storage ? "true" : "false") }];
+  let readFromMlirBytecode = [{
+    return $_reader.readBool($_storage);
+  }];
+  let writeToMlirBytecode = [{
+    $_writer.writeOwnedBool($_storage);
+  }];
+}
+
+def UnitProperty : Property<"bool", "unit property"> {
+  let summary = "unit property";
+  let description = [{
+    A property whose presence or abscence is used as a flag.
+
+    This is stored as a boolean that defaults to false, and is named UnitProperty
+    by analogy with UnitAttr, which has the more comprehensive rationale and
+    explains the less typical syntax.
+
+    Note that this attribute does have a syntax for the false case to allow for its
+    use in contexts where default values shouldn't be elided.
+  }];
+  let defaultValue = "false";
+
+  let convertToAttribute = [{
+    if ($_storage)
+      return ::mlir::UnitAttr::get($_ctxt);
+    else
+      return ::mlir::BoolAttr::get($_ctxt, false);
+  }];
+  let convertFromAttribute = [{
+    if (::llvm::isa<::mlir::UnitAttr>($_attr)) {
+      $_storage = true;
+      return ::mlir::success();
+    }
+    if (auto boolAttr = ::llvm::dyn_cast<::mlir::BoolAttr>($_attr)) {
+      $_storage = boolAttr.getValue();
+      return ::mlir::success();
+    }
+    return ::mlir::failure();
+  }];
+
+  let parser = [{
+    ::llvm::StringRef keyword;
+    if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+        {"unit", "unit_absent"})))
+      return $_parser.emitError($_parser.getCurrentLocation(),
+        "expected 'unit' or 'unit_absent'");
+    $_storage = (keyword == "unit");
+  }];
+
+  let optionalParser = [{
+    ::llvm::StringRef keyword;
+    if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+        {"unit", "unit_absent"})))
+      return std::nullopt;
+    $_storage = (keyword == "unit");
+  }];
+
+  let printer = [{
+    $_printer << ($_storage ? "unit" : "unit_absent")
+  }];
+
+  let writeToMlirBytecode = [{
+    $_writer.writeOwnedBool($_storage);
+  }];
+  let readFromMlirBytecode = [{
+    if (::mlir::failed($_reader.readBool($_storage)))
+      return ::mlir::failure();
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// Primitive property combinators
+
+/// Create a variable named `name` of `prop`'s storage type that is initialized
+/// to the correct default value, if there is one.
+class _makePropStorage<Property prop, string name> {
+  code ret = prop.storageType # " " # name
+      # !cond(!not(!empty(prop.storageTypeValueOverride)) : " = " # prop.storageTypeValueOverride,
+        !not(!empty(prop.defaultValue)) : " = " # prop.defaultValue,
+        true : "") # ";";
+}
+
+/// The generic class for arrays of some other property, which is stored as a
+/// `SmallVector` of that property. This uses an `ArrayAttr` as its attribute form
+/// though subclasses can override this, as is the case with IntArrayAttr below.
+/// Those wishing to use a non-default number of SmallVector elements should
+/// subclass `ArrayProperty`.
+class ArrayProperty<Property elem = Property<>, string desc = ""> :
+  Property<"::llvm::SmallVector<" # elem.storageType # ">", desc> {
+  let summary = "array of " # elem.summary;
+  let interfaceType = "::llvm::ArrayRef<" # elem.storageType # ">";
+  let convertFromStorage = "::llvm::ArrayRef<" # elem.storageType # ">{$_storage}";
+  let assignToStorage = "$_storage.assign($_value.begin(), $_value.end())";
+
+  let convertFromAttribute = [{
+    auto arrayAttr = ::llvm::dyn_cast_if_present<::mlir::ArrayAttr>($_attr);
+    if (!arrayAttr)
+      return $_diag() << "expected array attribute";
+    for (::mlir::Attribute elemAttr : arrayAttr) {
+      }] # _makePropStorage<elem, "elemVal">.ret # [{
+      auto elemRes = [&](Attribute propAttr, }] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+        }] # !subst("$_attr", "propAttr",
+          !subst("$_storage", "propStorage", elem.convertFromAttribute)) # [{
+      }(elemAttr, elemVal);
+      if (::mlir::failed(elemRes))
+        return ::mlir::failure();
+      $_storage.push_back(std::move(elemVal));
+    }
+    return ::mlir::success();
+  }];
+
+  let convertToAttribute = [{
+    SmallVector<Attribute> elems;
+    for (const auto& elemVal : $_storage) {
+      auto elemAttr = [&](const }] # elem.storageType #[{& propStorage) -> ::mlir::Attribute {
+        }] # !subst("$_storage", "propStorage", elem.convertToAttribute) # [{
+      }(elemVal);
+      elems.push_back(elemAttr);
+    }
+    return ::mlir::ArrayAttr::get($_ctxt, elems);
+  }];
+
+  defvar theParserBegin = [{
+    auto& storage = $_storage;
+    auto parseElemFn = [&]() -> ::mlir::ParseResult {
+      }] # _makePropStorage<elem, "elemVal">.ret # [{
+      auto elemParse = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::ParseResult {
+        }] # !subst("$_storage", "propStorage", elem.parser) # [{
+        return ::mlir::success();
+       }(elemVal);
+      if (::mlir::failed(elemParse))
+        return ::mlir::failure();
+      storage.push_back(std::move(elemVal));
+      return ::mlir::success();
+    };
+    }];
+  let parser = theParserBegin # [{
+    return $_parser.parseCommaSeparatedList(
+      ::mlir::OpAsmParser::Delimiter::Square, parseElemFn);
+  }];
+  // Hack around the lack of a peek method
+  let optionalParser = theParserBegin # [{
+    auto oldLoc = $_parser.getCurrentLocation();
+    auto parseResult = $_parser.parseCommaSeparatedList(
+      ::mlir::OpAsmParser::Delimiter::OptionalSquare, parseElemFn);
+    if (::mlir::failed(parseResult))
+      return ::mlir::failure();
+    auto newLoc = $_parser.getCurrentLocation();
+    if (oldLoc == newLoc)
+      return std::nullopt;
+    return ::mlir::success();
+  }];
+
+  let printer = [{ [&](){
+    $_printer << "[";
+    auto elemPrinter = [&](const }] # elem.storageType # [{& elemVal) {
+      }] # !subst("$_storage", "elemVal", elem.printer) #[{;
+    };
+    ::llvm::interleaveComma($_storage, $_printer, elemPrinter);
+    $_printer << "]";
+  }()}];
+
+  let readFromMlirBytecode = [{
+    uint64_t length;
+    if (::mlir::failed($_reader.readVarInt(length)))
+      return ::mlir::failure();
+    $_storage.reserve(length);
+    for (uint64_t i = 0; i < length; ++i) {
+      }]# _makePropStorage<elem, "elemVal">.ret # [{
+      auto elemRead = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+        }] # !subst("$_storage", "propStorage", elem.readFromMlirBytecode) # [{;
+        return ::mlir::success();
+      }(elemVal);
+      if (::mlir::failed(elemRead))
+        return ::mlir::failure();
+      $_storage.push_back(std::move(elemVal));
+    }
+  }];
+
+  let writeToMlirBytecode = [{
+    $_writer.writeVarInt($_storage.size());
+    for (const auto& elemVal : $_storage) {
+      [&]() {
+        }] # !subst("$_storage", "elemVal", elem.writeToMlirBytecode) #[{;
+      }();
+    }
+  }];
+
+  // There's no hash_value for SmallVector<T>, so we construct the ArrayRef ourselves.
+  // In the non-trivial case, we define a mapped range to get internal hash
+  // codes.
+  let hashProperty = !if(!empty(elem.hashProperty),
+    [{::llvm::hash_value(::llvm::ArrayRef<}] # elem.storageType # [{>{$_storage})}],
+    [{[&]() -> ::llvm::hash_code {
+        auto getElemHash = [](const auto& propStorage) -> ::llvm::hash_code {
+          return }] # !subst("$_storage", "propStorage", elem.hashProperty) # [{;
+        };
+        auto mapped = ::llvm::map_range($_storage, getElemHash);
+        return ::llvm::hash_combine_range(mapped.begin(), mapped.end());
+      }()
+    }]);
+}
+
+class IntArrayProperty<string storageTypeParam = "", string desc = ""> :
+    ArrayProperty<IntProperty<storageTypeParam, desc>> {
+  // Bring back the trivial conversions we don't get in the general case.
+  let convertFromAttribute = [{
+    return convertFromAttribute($_storage, $_attr, $_diag);
+  }];
+  let convertToAttribute = [{
+    return convertToAttribute($_ctxt, $_storage);
+  }];
+}
+
+/// Class for giving a property a default value.
+/// This doesn't change anything about the property other than giving it a default
+/// which can be used by ODS to elide printing.
+class DefaultValuedProperty<Property p, string default = "", string storageDefault = ""> : Property<p.storageType, p.summary> {
+  let defaultValue = default;
+  let storageTypeValueOverride = storageDefault;
+  let baseProperty = p;
+  // Keep up to date with `Property` above.
+  let summary = p.summary;
+  let description = p.description;
+  let storageType = p.storageType;
+  let interfaceType = p.interfaceType;
+  let convertFromStorage = p.convertFromStorage;
+  let assignToStorage = p.assignToStorage;
+  let convertToAttribute = p.convertToAttribute;
+  let convertFromAttribute = p.convertFromAttribute;
+  let hashProperty = p.hashProperty;
+  let parser = p.parser;
+  let optionalParser = p.optionalParser;
+  let printer = p.printer;
+  let readFromMlirBytecode = p.readFromMlirBytecode;
+  let writeToMlirBytecode = p.writeToMlirBytecode;
+}
+
+/// An optional property, stored as an std::optional<p.storageType>
+/// interfaced with as an std::optional<p.interfaceType>..
+/// The syntax is `none` (or empty string if elided) for an absent value or
+/// `some<[underlying property]>` when a value is set.
+///
+/// As a special exception, if the underlying property has an optional parser and
+/// no default value (ex. an integer property), the printer will skip the `some`
+/// bracketing and delegate to the optional parser. In that case, the syntax is the
+/// syntax of the underlying property, or the keyword `none` in the rare cases that
+/// it is needed. This behavior can be disabled by setting `canDelegateParsing` to 0.
+class OptionalProperty<Property p, bit canDelegateParsing = 1>
+    : Property<"std::optional<" # p.storageType # ">", "optional " # p.summary> {
+
+  // In the cases where the underlying attribute is plain old data that's passed by
+  // value, the conversion code is trivial.
+  defvar hasTrivialStorage = !and(!eq(p.convertFromStorage, "$_storage"),
+    !eq(p.assignToStorage, "$_storage = $_value"),
+    !eq(p.storageType, p.interfaceType));
+
+  defvar delegatesParsing = !and(!empty(p.defaultValue),
+    !not(!empty(p.optionalParser)), canDelegateParsing);
+
+  let interfaceType = "std::optional<" # p.interfaceType # ">";
+  let defaultValue = "std::nullopt";
+
+  let convertFromStorage = !if(hasTrivialStorage,
+    p.convertFromStorage,
+    [{($_storage.has_value() ? std::optional<}] # p.interfaceType # ">{"
+      # !subst("$_storage", "(*($_storage))", p.convertFromStorage)
+      # [{} : std::nullopt)}]);
+  let assignToStorage = !if(hasTrivialStorage,
+    p.assignToStorage,
+    [{[&]() {
+      if (!$_value.has_value()) {
+        $_storage = std::nullopt;
+        return;
+      }
+      }] # _makePropStorage<p, "presentVal">.ret # [{
+      [&](}] # p.storageType # [{& propStorage) {
+        }] # !subst("$_storage", "propStorage",
+          !subst("$_value", "(*($_value))", p.assignToStorage)) # [{;
+      }(presentVal);
+      $_storage = std::move(presentVal);
+    }()}]);
+
+  let convertFromAttribute = [{
+    auto arrayAttr = ::llvm::dyn_cast<::mlir::ArrayAttr>($_attr);
+    if (!arrayAttr)
+      return $_diag() << "expected optional properties to materialize as arrays";
+    if (arrayAttr.size() > 1)
+      return $_diag() << "expected optional properties to become 0- or 1-element arrays";
+    if (arrayAttr.empty()) {
+      $_storage = std::nullopt;
+      return ::mlir::success();
+    }
+    ::mlir::Attribute presentAttr = arrayAttr[0];
+    }] # _makePropStorage<p, "presentVal">.ret # [{
+    auto presentRes = [&](Attribute propAttr, }] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+      }] # !subst("$_storage", "propStorage",
+          !subst("$_attr", "propAttr", p.convertFromAttribute)) # [{
+    }(presentAttr, presentVal);
+    if (::mlir::failed(presentRes))
+      return ::mlir::failure();
+    $_storage = std::move(presentVal);
+    return ::mlir::success();
+  }];
+
+  let convertToAttribute = [{
+    if (!$_storage.has_value()) {
+      return ::mlir::ArrayAttr::get($_ctxt, {});
+    }
+    auto attr = [&]() -> ::mlir::Attribute {
+      }] # !subst("$_storage", "(*($_storage))", p.convertToAttribute) # [{
+    }();
+    return ::mlir::ArrayAttr::get($_ctxt, {attr});
+  }];
+
+  defvar delegatedParserBegin = [{
+    if (::mlir::succeeded($_parser.parseOptionalKeyword("none"))) {
+      $_storage = std::nullopt;
+      return ::mlir::success();
+    }
+    }] #_makePropStorage<p, "presentVal">.ret # [{
+    auto delegParseResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::OptionalParseResult {
+    }] # !subst("$_storage", "propStorage", p.optionalParser) # [{
+        return ::mlir::success();
+    }(presentVal);
+    if (!delegParseResult.has_value()) {
+  }];
+
+  defvar delegatedParserEnd = [{
+    }
+    if (delegParseResult.has_value() && ::mlir::failed(*delegParseResult))
+      return ::mlir::failure();
+    $_storage = std::move(presentVal);
+    return ::mlir::success();
+  }];
+  // If we're being explicitly called for our parser, we're expecting to have been
+  // printede into a context where the default value isn't elided. Therefore,
+  // not-present from the underlying parser is a failure.
+  defvar delegatedParser = delegatedParserBegin # [{
+    return ::mlir::failure();
+  }] # delegatedParserEnd;
+  defvar delegatedOptionalParser = delegatedParserBegin # [{
+      return std::nullopt;
+  }] # delegatedParserEnd;
+
+  defvar generalParserBegin = [{
+    ::llvm::StringRef keyword;
+    if (::mlir::failed($_parser.parseOptionalKeyword(&keyword, {"none", "some"}))) {
+  }];
+  defvar generalParserEnd = [{
+    }
+    if (keyword == "none") {
+      $_storage = std::nullopt;
+      return ::mlir::success();
+    }
+    if (::mlir::failed($_parser.parseLess()))
+      return ::mlir::failure();
+    }] # _makePropStorage<p, "presentVal">.ret # [{
+    auto presentParse = [&](}] # p.storageType # [{& propStorage) -> ::mlir::ParseResult {
+      }] # !subst("$_storage", "propStorage", p.parser) # [{
+      return ::mlir::success();
+    }(presentVal);
+    if (presentParse || $_parser.parseGreater())
+      return ::mlir::failure();
+    $_storage = std::move(presentVal);
+  }];
+  defvar generalParser = generalParserBegin # [{
+    return $_parser.emitError($_parser.getCurrentLocation(), "expected 'none' or 'some<prop>'");
+  }] # generalParserEnd;
+  defvar generalOptionalParser = generalParserBegin # [{
+    return std::nullopt;
+  }] # generalParserEnd;
+
+  let parser = !if(delegatesParsing, delegatedParser, generalParser);
+  let optionalParser = !if(delegatesParsing,
+    delegatedOptionalParser, generalOptionalParser);
+
+  defvar delegatedPrinter = [{
+    [&]() {
+      if (!$_storage.has_value()) {
+        $_printer << "none";
+        return;
+      }
+      }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+    }()}];
+  defvar generalPrinter = [{
+      [&]() {
+        if (!$_storage.has_value()) {
+          $_printer << "none";
+          return;
+        }
+        $_printer << "some<";
+        }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+        $_printer << ">";
+      }()}];
+  let printer = !if(delegatesParsing, delegatedPrinter, generalPrinter);
+
+  let readFromMlirBytecode = [{
+    bool isPresent = false;
+    if (::mlir::failed($_reader.readBool(isPresent)))
+      return ::mlir::failure();
+    if (!isPresent) {
+      $_storage = std::nullopt;
+      return ::mlir::success();
+    }
+    }] # _makePropStorage<p, "presentVal">.ret # [{
+    auto presentResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+      }] # !subst("$_storage", "propStorage", p.readFromMlirBytecode) # [{;
+      return ::mlir::success();
+    }(presentVal);
+    if (::mlir::failed(presentResult))
+      return ::mlir::failure();
+    $_storage = std::move(presentVal);
+  }];
+  let writeToMlirBytecode = [{
+    $_writer.writeOwnedBool($_storage.has_value());
+    if (!$_storage.has_value())
+      return;
+  }] # !subst("$_storage", "(*($_storage))", p.writeToMlirBytecode);
+
+  let hashProperty = !if(!empty(p.hashProperty), p.hashProperty,
+    [{ ::llvm::hash_value($_storage.has_value() ? std::optional<::llvm::hash_code>{}] #
+      !subst("$_storage", "(*($_storage))", p.hashProperty) #[{} : std::nullopt) }]);
+  assert !or(!not(delegatesParsing), !eq(defaultValue, "std::nullopt")),
+    "For delegated parsing to be used, the default value must be nullopt. " #
+    "To use a non-trivial default, set the canDelegateParsing argument to 0";
+}
 #endif // PROPERTIES
diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h
index 65824531fdc90..a32de33114e40 100644
--- a/mlir/include/mlir/IR/Types.h
+++ b/mlir/include/mlir/IR/Types.h
@@ -126,6 +126,7 @@ class Type {
   // derived types should use isa/dyn_cast.
   bool isIndex() const;
   bool isFloat8E5M2() const;
+  bool isFloat8E4M3() const;
   bool isFloat8E4M3FN() const;
   bool isFloat8E5M2FNUZ() const;
   bool isFloat8E4M3FNUZ() const;
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index cc5853c044e97..768291a3a7267 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -384,7 +384,7 @@ class Operator {
   SmallVector<NamedAttribute, 4> attributes;
 
   /// The properties of the op.
-  SmallVector<NamedProperty> properties;
+  SmallVector<NamedProperty, 4> properties;
 
   /// The arguments of the op (operands and native attributes).
   SmallVector<Argument, 4> arguments;
diff --git a/mlir/include/mlir/TableGen/Property.h b/mlir/include/mlir/TableGen/Property.h
index d0d6f4940c7c0..702e6756e6a95 100644
--- a/mlir/include/mlir/TableGen/Property.h
+++ b/mlir/include/mlir/TableGen/Property.h
@@ -35,12 +35,20 @@ class Property {
 public:
   explicit Property(const llvm::Record *record);
   explicit Property(const llvm::DefInit *init);
-  Property(StringRef storageType, StringRef interfaceType,
-           StringRef convertFromStorageCall, StringRef assignToStorageCall,
-           StringRef convertToAttributeCall, StringRef convertFromAttributeCall,
+  Property(StringRef summary, StringRef description, StringRef storageType,
+           StringRef interfaceType, StringRef convertFromStorageCall,
+           StringRef assignToStorageCall, StringRef convertToAttributeCall,
+           StringRef convertFromAttributeCall, StringRef parserCall,
+           StringRef optionalParserCall, StringRef printerCall,
            StringRef readFromMlirBytecodeCall,
            StringRef writeToMlirBytecodeCall, StringRef hashPropertyCall,
-           StringRef defaultValue);
+           StringRef defaultValue, StringRef storageTypeValueOverride);
+
+  // Returns the summary (for error messages) of this property's type.
+  StringRef getSummary() const { return summary; }
+
+  // Returns the description of this property.
+  StringRef getDescription() const { return description; }
 
   // Returns the storage type.
   StringRef getStorageType() const { return storageType; }
@@ -66,6 +74,19 @@ class Property {
     return convertFromAttributeCall;
   }
 
+  // Returns the method call which parses this property from textual MLIR.
+  StringRef getParserCall() const { return parserCall; }
+
+  // Returns true if this property has defined an optional parser.
+  bool hasOptionalParser() const { return !optionalParserCall.empty(); }
+
+  // Returns the method call which optionally parses this property from textual
+  // MLIR.
+  StringRef getOptionalParserCall() const { return optionalParserCall; }
+
+  // Returns the method call which prints this property to textual MLIR.
+  StringRef getPrinterCall() const { return printerCall; }
+
   // Returns the method call which reads this property from
   // bytecode and assign it to the storage.
   StringRef getReadFromMlirBytecodeCall() const {
@@ -87,6 +108,24 @@ class Property {
   // Returns the default value for this Property.
   StringRef getDefaultValue() const { return defaultValue; }
 
+  // Returns whether this Property has a default storage-type value that is
+  // distinct from its default interface-type value.
+  bool hasStorageTypeValueOverride() const {
+    return !storageTypeValueOverride.empty();
+  }
+
+  StringRef getStorageTypeValueOverride() const {
+    return storageTypeValueOverride;
+  }
+
+  // Returns this property's TableGen def-name.
+  StringRef getPropertyDefName() const;
+
+  // Returns the base-level property that this Property constraint is based on
+  // or the Property itself otherwise. (Note: there are currently no
+  // property constraints, this function is added for future-proofing)
+  Property getBaseProperty() const;
+
   // Returns the TableGen definition this Property was constructed from.
   const llvm::Record &getDef() const { return *def; }
 
@@ -95,16 +134,22 @@ class Property {
   const llvm::Record *def;
 
   // Elements describing a Property, in general fetched from the record.
+  StringRef summary;
+  StringRef description;
   StringRef storageType;
   StringRef interfaceType;
   StringRef convertFromStorageCall;
   StringRef assignToStorageCall;
   StringRef convertToAttributeCall;
   StringRef convertFromAttributeCall;
+  StringRef parserCall;
+  StringRef optionalParserCall;
+  StringRef printerCall;
   StringRef readFromMlirBytecodeCall;
   StringRef writeToMlirBytecodeCall;
   StringRef hashPropertyCall;
   StringRef defaultValue;
+  StringRef storageTypeValueOverride;
 };
 
 // A struct wrapping an op property and its name together
diff --git a/mlir/lib/AsmParser/AsmParserImpl.h b/mlir/lib/AsmParser/AsmParserImpl.h
index 8f22be80865bf..b12687833e3fd 100644
--- a/mlir/lib/AsmParser/AsmParserImpl.h
+++ b/mlir/lib/AsmParser/AsmParserImpl.h
@@ -322,6 +322,11 @@ class AsmParserImpl : public BaseT {
     return parser.parseOptionalInteger(result);
   }
 
+  /// Parse an optional integer value from the stream.
+  OptionalParseResult parseOptionalDecimalInteger(APInt &result) override {
+    return parser.parseOptionalDecimalInteger(result);
+  }
+
   /// Parse a list of comma-separated items with an optional delimiter.  If a
   /// delimiter is provided, then an empty list is allowed.  If not, then at
   /// least one element will be parsed.
diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp
index 7181f13d3c8bb..2e4c4a36d46b9 100644
--- a/mlir/lib/AsmParser/Parser.cpp
+++ b/mlir/lib/AsmParser/Parser.cpp
@@ -41,6 +41,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Alignment.h"
@@ -307,6 +308,45 @@ OptionalParseResult Parser::parseOptionalInteger(APInt &result) {
   return success();
 }
 
+/// Parse an optional integer value only in decimal format from the stream.
+OptionalParseResult Parser::parseOptionalDecimalInteger(APInt &result) {
+  Token curToken = getToken();
+  if (curToken.isNot(Token::integer, Token::minus)) {
+    return std::nullopt;
+  }
+
+  bool negative = consumeIf(Token::minus);
+  Token curTok = getToken();
+  if (parseToken(Token::integer, "expected integer value")) {
+    return failure();
+  }
+
+  StringRef spelling = curTok.getSpelling();
+  // If the integer is in hexadecimal return only the 0. The lexer has already
+  // moved past the entire hexidecimal encoded integer so we reset the lex
+  // pointer to just past the 0 we actualy want to consume.
+  if (spelling[0] == '0' && spelling.size() > 1 &&
+      llvm::toLower(spelling[1]) == 'x') {
+    result = 0;
+    state.lex.resetPointer(spelling.data() + 1);
+    consumeToken();
+    return success();
+  }
+
+  if (spelling.getAsInteger(10, result))
+    return emitError(curTok.getLoc(), "integer value too large");
+
+  // Make sure we have a zero at the top so we return the right signedness.
+  if (result.isNegative())
+    result = result.zext(result.getBitWidth() + 1);
+
+  // Process the negative sign if present.
+  if (negative)
+    result.negate();
+
+  return success();
+}
+
 /// Parse a floating point value from an integer literal token.
 ParseResult Parser::parseFloatFromIntegerLiteral(
     std::optional<APFloat> &result, const Token &tok, bool isNegative,
diff --git a/mlir/lib/AsmParser/Parser.h b/mlir/lib/AsmParser/Parser.h
index b959e67b8e258..4caab499e1a0e 100644
--- a/mlir/lib/AsmParser/Parser.h
+++ b/mlir/lib/AsmParser/Parser.h
@@ -144,6 +144,9 @@ class Parser {
   /// Parse an optional integer value from the stream.
   OptionalParseResult parseOptionalInteger(APInt &result);
 
+  /// Parse an optional integer value only in decimal format from the stream.
+  OptionalParseResult parseOptionalDecimalInteger(APInt &result);
+
   /// Parse a floating point value from an integer literal token.
   ParseResult parseFloatFromIntegerLiteral(std::optional<APFloat> &result,
                                            const Token &tok, bool isNegative,
diff --git a/mlir/lib/AsmParser/TokenKinds.def b/mlir/lib/AsmParser/TokenKinds.def
index 297e074594530..eb3154c6da42e 100644
--- a/mlir/lib/AsmParser/TokenKinds.def
+++ b/mlir/lib/AsmParser/TokenKinds.def
@@ -95,6 +95,7 @@ TOK_KEYWORD(f32)
 TOK_KEYWORD(f64)
 TOK_KEYWORD(f80)
 TOK_KEYWORD(f8E5M2)
+TOK_KEYWORD(f8E4M3)
 TOK_KEYWORD(f8E4M3FN)
 TOK_KEYWORD(f8E5M2FNUZ)
 TOK_KEYWORD(f8E4M3FNUZ)
diff --git a/mlir/lib/AsmParser/TypeParser.cpp b/mlir/lib/AsmParser/TypeParser.cpp
index 0b46c96bbc04d..467b5f0844ab3 100644
--- a/mlir/lib/AsmParser/TypeParser.cpp
+++ b/mlir/lib/AsmParser/TypeParser.cpp
@@ -40,6 +40,7 @@ OptionalParseResult Parser::parseOptionalType(Type &type) {
   case Token::kw_vector:
   case Token::inttype:
   case Token::kw_f8E5M2:
+  case Token::kw_f8E4M3:
   case Token::kw_f8E4M3FN:
   case Token::kw_f8E5M2FNUZ:
   case Token::kw_f8E4M3FNUZ:
@@ -304,6 +305,9 @@ Type Parser::parseNonFunctionType() {
   case Token::kw_f8E5M2:
     consumeToken(Token::kw_f8E5M2);
     return builder.getFloat8E5M2Type();
+  case Token::kw_f8E4M3:
+    consumeToken(Token::kw_f8E4M3);
+    return builder.getFloat8E4M3Type();
   case Token::kw_f8E4M3FN:
     consumeToken(Token::kw_f8E4M3FN);
     return builder.getFloat8E4M3FNType();
diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp
index e1e4eb999b3aa..5e0aebc03e2c1 100644
--- a/mlir/lib/Bindings/Python/IRTypes.cpp
+++ b/mlir/lib/Bindings/Python/IRTypes.cpp
@@ -143,7 +143,7 @@ class PyFloat8E4M3FNType
   }
 };
 
-/// Floating Point Type subclass - Float8M5E2Type.
+/// Floating Point Type subclass - Float8E5M2Type.
 class PyFloat8E5M2Type : public PyConcreteType<PyFloat8E5M2Type, PyFloatType> {
 public:
   static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFloat8E5M2;
@@ -163,6 +163,26 @@ class PyFloat8E5M2Type : public PyConcreteType<PyFloat8E5M2Type, PyFloatType> {
   }
 };
 
+/// Floating Point Type subclass - Float8E4M3Type.
+class PyFloat8E4M3Type : public PyConcreteType<PyFloat8E4M3Type, PyFloatType> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFloat8E4M3;
+  static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+      mlirFloat8E4M3TypeGetTypeID;
+  static constexpr const char *pyClassName = "Float8E4M3Type";
+  using PyConcreteType::PyConcreteType;
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static(
+        "get",
+        [](DefaultingPyMlirContext context) {
+          MlirType t = mlirFloat8E4M3TypeGet(context->get());
+          return PyFloat8E4M3Type(context->getRef(), t);
+        },
+        py::arg("context") = py::none(), "Create a float8_e4m3 type.");
+  }
+};
+
 /// Floating Point Type subclass - Float8E4M3FNUZ.
 class PyFloat8E4M3FNUZType
     : public PyConcreteType<PyFloat8E4M3FNUZType, PyFloatType> {
@@ -840,6 +860,7 @@ void mlir::python::populateIRTypes(py::module &m) {
   PyIndexType::bind(m);
   PyFloat8E4M3FNType::bind(m);
   PyFloat8E5M2Type::bind(m);
+  PyFloat8E4M3Type::bind(m);
   PyFloat8E4M3FNUZType::bind(m);
   PyFloat8E4M3B11FNUZType::bind(m);
   PyFloat8E5M2FNUZType::bind(m);
diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index 1064896a1f714..a57b2178aec70 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -375,7 +375,7 @@ class StringSectionReader {
 
   /// Parse a shared string from the string section. The shared string is
   /// encoded using an index to a corresponding string in the string section.
-  LogicalResult parseString(EncodingReader &reader, StringRef &result) {
+  LogicalResult parseString(EncodingReader &reader, StringRef &result) const {
     return parseEntry(reader, strings, result, "string");
   }
 
@@ -383,7 +383,7 @@ class StringSectionReader {
   /// encoded using an index to a corresponding string in the string section.
   /// This variant parses a flag compressed with the index.
   LogicalResult parseStringWithFlag(EncodingReader &reader, StringRef &result,
-                                    bool &flag) {
+                                    bool &flag) const {
     uint64_t entryIdx;
     if (failed(reader.parseVarIntWithFlag(entryIdx, flag)))
       return failure();
@@ -393,7 +393,7 @@ class StringSectionReader {
   /// Parse a shared string from the string section. The shared string is
   /// encoded using an index to a corresponding string in the string section.
   LogicalResult parseStringAtIndex(EncodingReader &reader, uint64_t index,
-                                   StringRef &result) {
+                                   StringRef &result) const {
     return resolveEntry(reader, strings, index, result, "string");
   }
 
@@ -544,7 +544,7 @@ class ResourceSectionReader {
 
   /// Parse a dialect resource handle from the resource section.
   LogicalResult parseResourceHandle(EncodingReader &reader,
-                                    AsmDialectResourceHandle &result) {
+                                    AsmDialectResourceHandle &result) const {
     return parseEntry(reader, dialectResources, result, "resource handle");
   }
 
@@ -800,8 +800,8 @@ class AttrTypeReader {
   using TypeEntry = Entry<Type>;
 
 public:
-  AttrTypeReader(StringSectionReader &stringReader,
-                 ResourceSectionReader &resourceReader,
+  AttrTypeReader(const StringSectionReader &stringReader,
+                 const ResourceSectionReader &resourceReader,
                  const llvm::StringMap<BytecodeDialect *> &dialectsMap,
                  uint64_t &bytecodeVersion, Location fileLoc,
                  const ParserConfig &config)
@@ -881,11 +881,11 @@ class AttrTypeReader {
 
   /// The string section reader used to resolve string references when parsing
   /// custom encoded attribute/type entries.
-  StringSectionReader &stringReader;
+  const StringSectionReader &stringReader;
 
   /// The resource section reader used to resolve resource references when
   /// parsing custom encoded attribute/type entries.
-  ResourceSectionReader &resourceReader;
+  const ResourceSectionReader &resourceReader;
 
   /// The map of the loaded dialects used to retrieve dialect information, such
   /// as the dialect version.
@@ -908,8 +908,8 @@ class AttrTypeReader {
 class DialectReader : public DialectBytecodeReader {
 public:
   DialectReader(AttrTypeReader &attrTypeReader,
-                StringSectionReader &stringReader,
-                ResourceSectionReader &resourceReader,
+                const StringSectionReader &stringReader,
+                const ResourceSectionReader &resourceReader,
                 const llvm::StringMap<BytecodeDialect *> &dialectsMap,
                 EncodingReader &reader, uint64_t &bytecodeVersion)
       : attrTypeReader(attrTypeReader), stringReader(stringReader),
@@ -1042,8 +1042,8 @@ class DialectReader : public DialectBytecodeReader {
 
 private:
   AttrTypeReader &attrTypeReader;
-  StringSectionReader &stringReader;
-  ResourceSectionReader &resourceReader;
+  const StringSectionReader &stringReader;
+  const ResourceSectionReader &resourceReader;
   const llvm::StringMap<BytecodeDialect *> &dialectsMap;
   EncodingReader &reader;
   uint64_t &bytecodeVersion;
@@ -1082,7 +1082,7 @@ class PropertiesSectionReader {
   }
 
   LogicalResult read(Location fileLoc, DialectReader &dialectReader,
-                     OperationName *opName, OperationState &opState) {
+                     OperationName *opName, OperationState &opState) const {
     uint64_t propertiesIdx;
     if (failed(dialectReader.readVarInt(propertiesIdx)))
       return failure();
diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
index 449d7549eb724..0e96aa97abeba 100644
--- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
+++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -145,7 +146,9 @@ class EncodingEmitter {
   //===--------------------------------------------------------------------===//
 
   /// Backpatch a byte in the result buffer at the given offset.
-  void patchByte(uint64_t offset, uint8_t value) {
+  void patchByte(uint64_t offset, uint8_t value, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs() << "patchByte(" << offset << ',' << uint64_t(value)
+                            << ")\t" << desc << '\n');
     assert(offset < size() && offset >= prevResultSize &&
            "cannot patch previously emitted data");
     currentResult[offset - prevResultSize] = value;
@@ -153,7 +156,9 @@ class EncodingEmitter {
 
   /// Emit the provided blob of data, which is owned by the caller and is
   /// guaranteed to not die before the end of the bytecode process.
-  void emitOwnedBlob(ArrayRef<uint8_t> data) {
+  void emitOwnedBlob(ArrayRef<uint8_t> data, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitOwnedBlob(" << data.size() << "b)\t" << desc << '\n');
     // Push the current buffer before adding the provided data.
     appendResult(std::move(currentResult));
     appendOwnedResult(data);
@@ -163,17 +168,19 @@ class EncodingEmitter {
   /// owned by the caller and is guaranteed to not die before the end of the
   /// bytecode process. The alignment value is also encoded, making it available
   /// on load.
-  void emitOwnedBlobAndAlignment(ArrayRef<uint8_t> data, uint32_t alignment) {
-    emitVarInt(alignment);
-    emitVarInt(data.size());
+  void emitOwnedBlobAndAlignment(ArrayRef<uint8_t> data, uint32_t alignment,
+                                 StringLiteral desc) {
+    emitVarInt(alignment, desc);
+    emitVarInt(data.size(), desc);
 
     alignTo(alignment);
-    emitOwnedBlob(data);
+    emitOwnedBlob(data, desc);
   }
-  void emitOwnedBlobAndAlignment(ArrayRef<char> data, uint32_t alignment) {
+  void emitOwnedBlobAndAlignment(ArrayRef<char> data, uint32_t alignment,
+                                 StringLiteral desc) {
     ArrayRef<uint8_t> castedData(reinterpret_cast<const uint8_t *>(data.data()),
                                  data.size());
-    emitOwnedBlobAndAlignment(castedData, alignment);
+    emitOwnedBlobAndAlignment(castedData, alignment, desc);
   }
 
   /// Align the emitter to the given alignment.
@@ -187,7 +194,7 @@ class EncodingEmitter {
     size_t curOffset = size();
     size_t paddingSize = llvm::alignTo(curOffset, alignment) - curOffset;
     while (paddingSize--)
-      emitByte(bytecode::kAlignmentByte);
+      emitByte(bytecode::kAlignmentByte, "alignment byte");
 
     // Keep track of the maximum required alignment.
     requiredAlignment = std::max(requiredAlignment, alignment);
@@ -198,12 +205,16 @@ class EncodingEmitter {
 
   /// Emit a single byte.
   template <typename T>
-  void emitByte(T byte) {
+  void emitByte(T byte, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitByte(" << uint64_t(byte) << ")\t" << desc << '\n');
     currentResult.push_back(static_cast<uint8_t>(byte));
   }
 
   /// Emit a range of bytes.
-  void emitBytes(ArrayRef<uint8_t> bytes) {
+  void emitBytes(ArrayRef<uint8_t> bytes, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitBytes(" << bytes.size() << "b)\t" << desc << '\n');
     llvm::append_range(currentResult, bytes);
   }
 
@@ -214,40 +225,43 @@ class EncodingEmitter {
   /// All remaining bits in the first byte, along with all of the bits in
   /// additional bytes, provide the value of the integer encoded in
   /// little-endian order.
-  void emitVarInt(uint64_t value) {
+  void emitVarInt(uint64_t value, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs() << "emitVarInt(" << value << ")\t" << desc << '\n');
+
     // In the most common case, the value can be represented in a single byte.
     // Given how hot this case is, explicitly handle that here.
     if ((value >> 7) == 0)
-      return emitByte((value << 1) | 0x1);
-    emitMultiByteVarInt(value);
+      return emitByte((value << 1) | 0x1, desc);
+    emitMultiByteVarInt(value, desc);
   }
 
   /// Emit a signed variable length integer. Signed varints are encoded using
   /// a varint with zigzag encoding, meaning that we use the low bit of the
   /// value to indicate the sign of the value. This allows for more efficient
   /// encoding of negative values by limiting the number of active bits
-  void emitSignedVarInt(uint64_t value) {
-    emitVarInt((value << 1) ^ (uint64_t)((int64_t)value >> 63));
+  void emitSignedVarInt(uint64_t value, StringLiteral desc) {
+    emitVarInt((value << 1) ^ (uint64_t)((int64_t)value >> 63), desc);
   }
 
   /// Emit a variable length integer whose low bit is used to encode the
   /// provided flag, i.e. encoded as: (value << 1) | (flag ? 1 : 0).
-  void emitVarIntWithFlag(uint64_t value, bool flag) {
-    emitVarInt((value << 1) | (flag ? 1 : 0));
+  void emitVarIntWithFlag(uint64_t value, bool flag, StringLiteral desc) {
+    emitVarInt((value << 1) | (flag ? 1 : 0), desc);
   }
 
   //===--------------------------------------------------------------------===//
   // String Emission
 
   /// Emit the given string as a nul terminated string.
-  void emitNulTerminatedString(StringRef str) {
-    emitString(str);
-    emitByte(0);
+  void emitNulTerminatedString(StringRef str, StringLiteral desc) {
+    emitString(str, desc);
+    emitByte(0, "null terminator");
   }
 
   /// Emit the given string without a nul terminator.
-  void emitString(StringRef str) {
-    emitBytes({reinterpret_cast<const uint8_t *>(str.data()), str.size()});
+  void emitString(StringRef str, StringLiteral desc) {
+    emitBytes({reinterpret_cast<const uint8_t *>(str.data()), str.size()},
+              desc);
   }
 
   //===--------------------------------------------------------------------===//
@@ -260,14 +274,14 @@ class EncodingEmitter {
     // indicate whether the section alignment is present, so save an offset to
     // it.
     uint64_t codeOffset = currentResult.size();
-    emitByte(code);
-    emitVarInt(emitter.size());
+    emitByte(code, "section code");
+    emitVarInt(emitter.size(), "section size");
 
     // Integrate the alignment of the section into this emitter if necessary.
     unsigned emitterAlign = emitter.requiredAlignment;
     if (emitterAlign > 1) {
       if (size() & (emitterAlign - 1)) {
-        emitVarInt(emitterAlign);
+        emitVarInt(emitterAlign, "section alignment");
         alignTo(emitterAlign);
 
         // Indicate that we needed to align the section, the high bit of the
@@ -295,7 +309,8 @@ class EncodingEmitter {
   /// fallback when the number of bytes needed to encode the value is greater
   /// than 1. We mark it noinline here so that the single byte hot path isn't
   /// pessimized.
-  LLVM_ATTRIBUTE_NOINLINE void emitMultiByteVarInt(uint64_t value);
+  LLVM_ATTRIBUTE_NOINLINE void emitMultiByteVarInt(uint64_t value,
+                                                   StringLiteral desc);
 
   /// Append a new result buffer to the current contents.
   void appendResult(std::vector<uint8_t> &&result) {
@@ -345,15 +360,15 @@ class StringSectionBuilder {
 
   /// Write the current set of strings to the given emitter.
   void write(EncodingEmitter &emitter) {
-    emitter.emitVarInt(strings.size());
+    emitter.emitVarInt(strings.size(), "string section size");
 
     // Emit the sizes in reverse order, so that we don't need to backpatch an
     // offset to the string data or have a separate section.
     for (const auto &it : llvm::reverse(strings))
-      emitter.emitVarInt(it.first.size() + 1);
+      emitter.emitVarInt(it.first.size() + 1, "string size");
     // Emit the string data itself.
     for (const auto &it : strings)
-      emitter.emitNulTerminatedString(it.first.val());
+      emitter.emitNulTerminatedString(it.first.val(), "string");
   }
 
 private:
@@ -380,32 +395,35 @@ class DialectWriter : public DialectBytecodeWriter {
   //===--------------------------------------------------------------------===//
 
   void writeAttribute(Attribute attr) override {
-    emitter.emitVarInt(numberingState.getNumber(attr));
+    emitter.emitVarInt(numberingState.getNumber(attr), "dialect attr");
   }
   void writeOptionalAttribute(Attribute attr) override {
     if (!attr) {
-      emitter.emitVarInt(0);
+      emitter.emitVarInt(0, "dialect optional attr none");
       return;
     }
-    emitter.emitVarIntWithFlag(numberingState.getNumber(attr), true);
+    emitter.emitVarIntWithFlag(numberingState.getNumber(attr), true,
+                               "dialect optional attr");
   }
 
   void writeType(Type type) override {
-    emitter.emitVarInt(numberingState.getNumber(type));
+    emitter.emitVarInt(numberingState.getNumber(type), "dialect type");
   }
 
   void writeResourceHandle(const AsmDialectResourceHandle &resource) override {
-    emitter.emitVarInt(numberingState.getNumber(resource));
+    emitter.emitVarInt(numberingState.getNumber(resource), "dialect resource");
   }
 
   //===--------------------------------------------------------------------===//
   // Primitives
   //===--------------------------------------------------------------------===//
 
-  void writeVarInt(uint64_t value) override { emitter.emitVarInt(value); }
+  void writeVarInt(uint64_t value) override {
+    emitter.emitVarInt(value, "dialect writer");
+  }
 
   void writeSignedVarInt(int64_t value) override {
-    emitter.emitSignedVarInt(value);
+    emitter.emitSignedVarInt(value, "dialect writer");
   }
 
   void writeAPIntWithKnownWidth(const APInt &value) override {
@@ -414,21 +432,21 @@ class DialectWriter : public DialectBytecodeWriter {
     // If the value is a single byte, just emit it directly without going
     // through a varint.
     if (bitWidth <= 8)
-      return emitter.emitByte(value.getLimitedValue());
+      return emitter.emitByte(value.getLimitedValue(), "dialect APInt");
 
     // If the value fits within a single varint, emit it directly.
     if (bitWidth <= 64)
-      return emitter.emitSignedVarInt(value.getLimitedValue());
+      return emitter.emitSignedVarInt(value.getLimitedValue(), "dialect APInt");
 
     // Otherwise, we need to encode a variable number of active words. We use
     // active words instead of the number of total words under the observation
     // that smaller values will be more common.
     unsigned numActiveWords = value.getActiveWords();
-    emitter.emitVarInt(numActiveWords);
+    emitter.emitVarInt(numActiveWords, "dialect APInt word count");
 
     const uint64_t *rawValueData = value.getRawData();
     for (unsigned i = 0; i < numActiveWords; ++i)
-      emitter.emitSignedVarInt(rawValueData[i]);
+      emitter.emitSignedVarInt(rawValueData[i], "dialect APInt word");
   }
 
   void writeAPFloatWithKnownSemantics(const APFloat &value) override {
@@ -436,16 +454,20 @@ class DialectWriter : public DialectBytecodeWriter {
   }
 
   void writeOwnedString(StringRef str) override {
-    emitter.emitVarInt(stringSection.insert(str));
+    emitter.emitVarInt(stringSection.insert(str), "dialect string");
   }
 
   void writeOwnedBlob(ArrayRef<char> blob) override {
-    emitter.emitVarInt(blob.size());
-    emitter.emitOwnedBlob(ArrayRef<uint8_t>(
-        reinterpret_cast<const uint8_t *>(blob.data()), blob.size()));
+    emitter.emitVarInt(blob.size(), "dialect blob");
+    emitter.emitOwnedBlob(
+        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(blob.data()),
+                          blob.size()),
+        "dialect blob");
   }
 
-  void writeOwnedBool(bool value) override { emitter.emitByte(value); }
+  void writeOwnedBool(bool value) override {
+    emitter.emitByte(value, "dialect bool");
+  }
 
   int64_t getBytecodeVersion() const override { return bytecodeVersion; }
 
@@ -486,7 +508,7 @@ class PropertiesSectionBuilder {
       if (!prop)
         return std::nullopt;
       EncodingEmitter sizeEmitter;
-      sizeEmitter.emitVarInt(numberingState.getNumber(prop));
+      sizeEmitter.emitVarInt(numberingState.getNumber(prop), "properties size");
       scratch.clear();
       llvm::raw_svector_ostream os(scratch);
       sizeEmitter.writeTo(os);
@@ -507,16 +529,17 @@ class PropertiesSectionBuilder {
 
   /// Write the current set of properties to the given emitter.
   void write(EncodingEmitter &emitter) {
-    emitter.emitVarInt(propertiesStorage.size());
+    emitter.emitVarInt(propertiesStorage.size(), "properties size");
     if (propertiesStorage.empty())
       return;
     for (const auto &storage : propertiesStorage) {
       if (storage.empty()) {
-        emitter.emitBytes(ArrayRef<uint8_t>());
+        emitter.emitBytes(ArrayRef<uint8_t>(), "empty properties");
         continue;
       }
       emitter.emitBytes(ArrayRef(reinterpret_cast<const uint8_t *>(&storage[0]),
-                                 storage.size()));
+                                 storage.size()),
+                        "property");
     }
   }
 
@@ -532,7 +555,7 @@ class PropertiesSectionBuilder {
     SmallVector<char> sizeScratch;
     {
       EncodingEmitter sizeEmitter;
-      sizeEmitter.emitVarInt(rawProperties.size());
+      sizeEmitter.emitVarInt(rawProperties.size(), "properties");
       llvm::raw_svector_ostream os(sizeScratch);
       sizeEmitter.writeTo(os);
     }
@@ -576,7 +599,8 @@ class RawEmitterOstream : public raw_ostream {
 
 private:
   void write_impl(const char *ptr, size_t size) override {
-    emitter.emitBytes({reinterpret_cast<const uint8_t *>(ptr), size});
+    emitter.emitBytes({reinterpret_cast<const uint8_t *>(ptr), size},
+                      "raw emitter");
   }
   uint64_t current_pos() const override { return emitter.size(); }
 
@@ -591,7 +615,7 @@ void EncodingEmitter::writeTo(raw_ostream &os) const {
   os.write((const char *)currentResult.data(), currentResult.size());
 }
 
-void EncodingEmitter::emitMultiByteVarInt(uint64_t value) {
+void EncodingEmitter::emitMultiByteVarInt(uint64_t value, StringLiteral desc) {
   // Compute the number of bytes needed to encode the value. Each byte can hold
   // up to 7-bits of data. We only check up to the number of bits we can encode
   // in the first byte (8).
@@ -601,16 +625,16 @@ void EncodingEmitter::emitMultiByteVarInt(uint64_t value) {
       uint64_t encodedValue = (value << 1) | 0x1;
       encodedValue <<= (numBytes - 1);
       llvm::support::ulittle64_t encodedValueLE(encodedValue);
-      emitBytes({reinterpret_cast<uint8_t *>(&encodedValueLE), numBytes});
+      emitBytes({reinterpret_cast<uint8_t *>(&encodedValueLE), numBytes}, desc);
       return;
     }
   }
 
   // If the value is too large to encode in a single byte, emit a special all
   // zero marker byte and splat the value directly.
-  emitByte(0);
+  emitByte(0, desc);
   llvm::support::ulittle64_t valueLE(value);
-  emitBytes({reinterpret_cast<uint8_t *>(&valueLE), sizeof(valueLE)});
+  emitBytes({reinterpret_cast<uint8_t *>(&valueLE), sizeof(valueLE)}, desc);
 }
 
 //===----------------------------------------------------------------------===//
@@ -696,7 +720,7 @@ LogicalResult BytecodeWriter::write(Operation *rootOp, raw_ostream &os) {
 
   // Emit the bytecode file header. This is how we identify the output as a
   // bytecode file.
-  emitter.emitString("ML\xefR");
+  emitter.emitString("ML\xefR", "bytecode header");
 
   // Emit the bytecode version.
   if (config.bytecodeVersion < bytecode::kMinSupportedVersion ||
@@ -706,10 +730,10 @@ LogicalResult BytecodeWriter::write(Operation *rootOp, raw_ostream &os) {
            << ", must be in range ["
            << static_cast<int64_t>(bytecode::kMinSupportedVersion) << ", "
            << static_cast<int64_t>(bytecode::kVersion) << ']';
-  emitter.emitVarInt(config.bytecodeVersion);
+  emitter.emitVarInt(config.bytecodeVersion, "bytecode version");
 
   // Emit the producer.
-  emitter.emitNulTerminatedString(config.producer);
+  emitter.emitNulTerminatedString(config.producer, "bytecode producer");
 
   // Emit the dialect section.
   writeDialectSection(emitter);
@@ -760,8 +784,8 @@ static void writeDialectGrouping(EncodingEmitter &emitter, EntriesT &&entries,
     });
 
     // Emit the dialect and number of elements.
-    emitter.emitVarInt(currentDialect->number);
-    emitter.emitVarInt(std::distance(groupStart, it));
+    emitter.emitVarInt(currentDialect->number, "dialect number");
+    emitter.emitVarInt(std::distance(groupStart, it), "dialect offset");
 
     // Emit the entries within the group.
     for (auto &entry : llvm::make_range(groupStart, it))
@@ -774,13 +798,13 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
 
   // Emit the referenced dialects.
   auto dialects = numberingState.getDialects();
-  dialectEmitter.emitVarInt(llvm::size(dialects));
+  dialectEmitter.emitVarInt(llvm::size(dialects), "dialects count");
   for (DialectNumbering &dialect : dialects) {
     // Write the string section and get the ID.
     size_t nameID = stringSection.insert(dialect.name);
 
     if (config.bytecodeVersion < bytecode::kDialectVersioning) {
-      dialectEmitter.emitVarInt(nameID);
+      dialectEmitter.emitVarInt(nameID, "dialect name ID");
       continue;
     }
 
@@ -798,22 +822,25 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
     // this in the dialect ID, so if there is no version, we don't write the
     // section.
     size_t versionAvailable = versionEmitter.size() > 0;
-    dialectEmitter.emitVarIntWithFlag(nameID, versionAvailable);
+    dialectEmitter.emitVarIntWithFlag(nameID, versionAvailable,
+                                      "dialect version");
     if (versionAvailable)
       dialectEmitter.emitSection(bytecode::Section::kDialectVersions,
                                  std::move(versionEmitter));
   }
 
   if (config.bytecodeVersion >= bytecode::kElideUnknownBlockArgLocation)
-    dialectEmitter.emitVarInt(size(numberingState.getOpNames()));
+    dialectEmitter.emitVarInt(size(numberingState.getOpNames()),
+                              "op names count");
 
   // Emit the referenced operation names grouped by dialect.
   auto emitOpName = [&](OpNameNumbering &name) {
     size_t stringId = stringSection.insert(name.name.stripDialect());
     if (config.bytecodeVersion < bytecode::kNativePropertiesEncoding)
-      dialectEmitter.emitVarInt(stringId);
+      dialectEmitter.emitVarInt(stringId, "dialect op name");
     else
-      dialectEmitter.emitVarIntWithFlag(stringId, name.name.isRegistered());
+      dialectEmitter.emitVarIntWithFlag(stringId, name.name.isRegistered(),
+                                        "dialect op name");
   };
   writeDialectGrouping(dialectEmitter, numberingState.getOpNames(), emitOpName);
 
@@ -826,8 +853,10 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
 void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) {
   EncodingEmitter attrTypeEmitter;
   EncodingEmitter offsetEmitter;
-  offsetEmitter.emitVarInt(llvm::size(numberingState.getAttributes()));
-  offsetEmitter.emitVarInt(llvm::size(numberingState.getTypes()));
+  offsetEmitter.emitVarInt(llvm::size(numberingState.getAttributes()),
+                           "attributes count");
+  offsetEmitter.emitVarInt(llvm::size(numberingState.getTypes()),
+                           "types count");
 
   // A functor used to emit an attribute or type entry.
   uint64_t prevOffset = 0;
@@ -836,7 +865,7 @@ void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) {
 
     auto emitAttrOrTypeRawImpl = [&]() -> void {
       RawEmitterOstream(attrTypeEmitter) << entryValue;
-      attrTypeEmitter.emitByte(0);
+      attrTypeEmitter.emitByte(0, "attr/type separator");
     };
     auto emitAttrOrTypeImpl = [&]() -> bool {
       // TODO: We don't currently support custom encoded mutable types and
@@ -882,7 +911,8 @@ void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) {
 
     // Record the offset of this entry.
     uint64_t curOffset = attrTypeEmitter.size();
-    offsetEmitter.emitVarIntWithFlag(curOffset - prevOffset, hasCustomEncoding);
+    offsetEmitter.emitVarIntWithFlag(curOffset - prevOffset, hasCustomEncoding,
+                                     "attr/type offset");
     prevOffset = curOffset;
   };
 
@@ -910,30 +940,33 @@ LogicalResult BytecodeWriter::writeBlock(EncodingEmitter &emitter,
   // use the low bit of the operation count to indicate if the block has
   // arguments.
   unsigned numOps = numberingState.getOperationCount(block);
-  emitter.emitVarIntWithFlag(numOps, hasArgs);
+  emitter.emitVarIntWithFlag(numOps, hasArgs, "block num ops");
 
   // Emit the arguments of the block.
   if (hasArgs) {
-    emitter.emitVarInt(args.size());
+    emitter.emitVarInt(args.size(), "block args count");
     for (BlockArgument arg : args) {
       Location argLoc = arg.getLoc();
       if (config.bytecodeVersion >= bytecode::kElideUnknownBlockArgLocation) {
         emitter.emitVarIntWithFlag(numberingState.getNumber(arg.getType()),
-                                   !isa<UnknownLoc>(argLoc));
+                                   !isa<UnknownLoc>(argLoc), "block arg type");
         if (!isa<UnknownLoc>(argLoc))
-          emitter.emitVarInt(numberingState.getNumber(argLoc));
+          emitter.emitVarInt(numberingState.getNumber(argLoc),
+                             "block arg location");
       } else {
-        emitter.emitVarInt(numberingState.getNumber(arg.getType()));
-        emitter.emitVarInt(numberingState.getNumber(argLoc));
+        emitter.emitVarInt(numberingState.getNumber(arg.getType()),
+                           "block arg type");
+        emitter.emitVarInt(numberingState.getNumber(argLoc),
+                           "block arg location");
       }
     }
     if (config.bytecodeVersion >= bytecode::kUseListOrdering) {
       uint64_t maskOffset = emitter.size();
       uint8_t encodingMask = 0;
-      emitter.emitByte(0);
+      emitter.emitByte(0, "use-list separator");
       writeUseListOrders(emitter, encodingMask, args);
       if (encodingMask)
-        emitter.patchByte(maskOffset, encodingMask);
+        emitter.patchByte(maskOffset, encodingMask, "block patch encoding");
     }
   }
 
@@ -945,17 +978,17 @@ LogicalResult BytecodeWriter::writeBlock(EncodingEmitter &emitter,
 }
 
 LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) {
-  emitter.emitVarInt(numberingState.getNumber(op->getName()));
+  emitter.emitVarInt(numberingState.getNumber(op->getName()), "op name ID");
 
   // Emit a mask for the operation components. We need to fill this in later
   // (when we actually know what needs to be emitted), so emit a placeholder for
   // now.
   uint64_t maskOffset = emitter.size();
   uint8_t opEncodingMask = 0;
-  emitter.emitByte(0);
+  emitter.emitByte(0, "op separator");
 
   // Emit the location for this operation.
-  emitter.emitVarInt(numberingState.getNumber(op->getLoc()));
+  emitter.emitVarInt(numberingState.getNumber(op->getLoc()), "op location");
 
   // Emit the attributes of this operation.
   DictionaryAttr attrs = op->getDiscardableAttrDictionary();
@@ -969,7 +1002,7 @@ LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) {
   }
   if (!attrs.empty()) {
     opEncodingMask |= bytecode::OpEncodingMask::kHasAttrs;
-    emitter.emitVarInt(numberingState.getNumber(attrs));
+    emitter.emitVarInt(numberingState.getNumber(attrs), "op attrs count");
   }
 
   // Emit the properties of this operation, for now we still support deployment
@@ -978,32 +1011,32 @@ LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) {
     std::optional<ssize_t> propertiesId = propertiesSection.emit(op);
     if (propertiesId.has_value()) {
       opEncodingMask |= bytecode::OpEncodingMask::kHasProperties;
-      emitter.emitVarInt(*propertiesId);
+      emitter.emitVarInt(*propertiesId, "op properties ID");
     }
   }
 
   // Emit the result types of the operation.
   if (unsigned numResults = op->getNumResults()) {
     opEncodingMask |= bytecode::OpEncodingMask::kHasResults;
-    emitter.emitVarInt(numResults);
+    emitter.emitVarInt(numResults, "op results count");
     for (Type type : op->getResultTypes())
-      emitter.emitVarInt(numberingState.getNumber(type));
+      emitter.emitVarInt(numberingState.getNumber(type), "op result type");
   }
 
   // Emit the operands of the operation.
   if (unsigned numOperands = op->getNumOperands()) {
     opEncodingMask |= bytecode::OpEncodingMask::kHasOperands;
-    emitter.emitVarInt(numOperands);
+    emitter.emitVarInt(numOperands, "op operands count");
     for (Value operand : op->getOperands())
-      emitter.emitVarInt(numberingState.getNumber(operand));
+      emitter.emitVarInt(numberingState.getNumber(operand), "op operand types");
   }
 
   // Emit the successors of the operation.
   if (unsigned numSuccessors = op->getNumSuccessors()) {
     opEncodingMask |= bytecode::OpEncodingMask::kHasSuccessors;
-    emitter.emitVarInt(numSuccessors);
+    emitter.emitVarInt(numSuccessors, "op successors count");
     for (Block *successor : op->getSuccessors())
-      emitter.emitVarInt(numberingState.getNumber(successor));
+      emitter.emitVarInt(numberingState.getNumber(successor), "op successor");
   }
 
   // Emit the use-list orders to bytecode, so we can reconstruct the same order
@@ -1017,7 +1050,7 @@ LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) {
     opEncodingMask |= bytecode::OpEncodingMask::kHasInlineRegions;
 
   // Update the mask for the operation.
-  emitter.patchByte(maskOffset, opEncodingMask);
+  emitter.patchByte(maskOffset, opEncodingMask, "op encoding mask");
 
   // With the mask emitted, we can now emit the regions of the operation. We do
   // this after mask emission to avoid offset complications that may arise by
@@ -1025,7 +1058,8 @@ LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) {
   // op encoding mask is more annoying).
   if (numRegions) {
     bool isIsolatedFromAbove = numberingState.isIsolatedFromAbove(op);
-    emitter.emitVarIntWithFlag(numRegions, isIsolatedFromAbove);
+    emitter.emitVarIntWithFlag(numRegions, isIsolatedFromAbove,
+                               "op regions count");
 
     // If the region is not isolated from above, or we are emitting bytecode
     // targeting version <kLazyLoading, we don't use a section.
@@ -1096,8 +1130,9 @@ void BytecodeWriter::writeUseListOrders(EncodingEmitter &emitter,
   opEncodingMask |= bytecode::OpEncodingMask::kHasUseListOrders;
   // Emit the number of results that have a custom use-list order if the number
   // of results is greater than one.
-  if (range.size() != 1)
-    emitter.emitVarInt(map.size());
+  if (range.size() != 1) {
+    emitter.emitVarInt(map.size(), "custom use-list size");
+  }
 
   for (const auto &item : map) {
     auto resultIdx = item.getFirst();
@@ -1113,20 +1148,22 @@ void BytecodeWriter::writeUseListOrders(EncodingEmitter &emitter,
 
     // For single result, we don't need to store the result index.
     if (range.size() != 1)
-      emitter.emitVarInt(resultIdx);
+      emitter.emitVarInt(resultIdx, "use-list result index");
 
     if (indexPairEncoding) {
-      emitter.emitVarIntWithFlag(shuffledElements * 2, indexPairEncoding);
+      emitter.emitVarIntWithFlag(shuffledElements * 2, indexPairEncoding,
+                                 "use-list index pair size");
       for (auto pair : llvm::enumerate(useListOrder)) {
         if (pair.index() != pair.value()) {
-          emitter.emitVarInt(pair.value());
-          emitter.emitVarInt(pair.index());
+          emitter.emitVarInt(pair.value(), "use-list index pair first");
+          emitter.emitVarInt(pair.index(), "use-list index pair second");
         }
       }
     } else {
-      emitter.emitVarIntWithFlag(useListOrder.size(), indexPairEncoding);
+      emitter.emitVarIntWithFlag(useListOrder.size(), indexPairEncoding,
+                                 "use-list size");
       for (const auto &index : useListOrder)
-        emitter.emitVarInt(index);
+        emitter.emitVarInt(index, "use-list order");
     }
   }
 }
@@ -1136,15 +1173,15 @@ LogicalResult BytecodeWriter::writeRegion(EncodingEmitter &emitter,
   // If the region is empty, we only need to emit the number of blocks (which is
   // zero).
   if (region->empty()) {
-    emitter.emitVarInt(/*numBlocks*/ 0);
+    emitter.emitVarInt(/*numBlocks*/ 0, "region block count empty");
     return success();
   }
 
   // Emit the number of blocks and values within the region.
   unsigned numBlocks, numValues;
   std::tie(numBlocks, numValues) = numberingState.getBlockValueCount(region);
-  emitter.emitVarInt(numBlocks);
-  emitter.emitVarInt(numValues);
+  emitter.emitVarInt(numBlocks, "region block count");
+  emitter.emitVarInt(numValues, "region value count");
 
   // Emit the blocks within the region.
   for (Block &block : *region)
@@ -1160,7 +1197,7 @@ LogicalResult BytecodeWriter::writeIRSection(EncodingEmitter &emitter,
   // Write the IR section the same way as a block with no arguments. Note that
   // the low-bit of the operation count for a block is used to indicate if the
   // block has arguments, which in this case is always false.
-  irEmitter.emitVarIntWithFlag(/*numOps*/ 1, /*hasArgs*/ false);
+  irEmitter.emitVarIntWithFlag(/*numOps*/ 1, /*hasArgs*/ false, "ir section");
 
   // Emit the operations.
   if (failed(writeOp(irEmitter, op)))
@@ -1189,17 +1226,17 @@ class ResourceBuilder : public AsmResourceBuilder {
   void buildBlob(StringRef key, ArrayRef<char> data,
                  uint32_t dataAlignment) final {
     if (!shouldElideData)
-      emitter.emitOwnedBlobAndAlignment(data, dataAlignment);
+      emitter.emitOwnedBlobAndAlignment(data, dataAlignment, "resource blob");
     postProcessFn(key, AsmResourceEntryKind::Blob);
   }
   void buildBool(StringRef key, bool data) final {
     if (!shouldElideData)
-      emitter.emitByte(data);
+      emitter.emitByte(data, "resource bool");
     postProcessFn(key, AsmResourceEntryKind::Bool);
   }
   void buildString(StringRef key, StringRef data) final {
     if (!shouldElideData)
-      emitter.emitVarInt(stringSection.insert(data));
+      emitter.emitVarInt(stringSection.insert(data), "resource string");
     postProcessFn(key, AsmResourceEntryKind::String);
   }
 
@@ -1229,12 +1266,14 @@ void BytecodeWriter::writeResourceSection(Operation *op,
 
   // Functor used to emit a resource group defined by 'key'.
   auto emitResourceGroup = [&](uint64_t key) {
-    resourceOffsetEmitter.emitVarInt(key);
-    resourceOffsetEmitter.emitVarInt(curResourceEntries.size());
+    resourceOffsetEmitter.emitVarInt(key, "resource group key");
+    resourceOffsetEmitter.emitVarInt(curResourceEntries.size(),
+                                     "resource group size");
     for (auto [key, kind, size] : curResourceEntries) {
-      resourceOffsetEmitter.emitVarInt(stringSection.insert(key));
-      resourceOffsetEmitter.emitVarInt(size);
-      resourceOffsetEmitter.emitByte(kind);
+      resourceOffsetEmitter.emitVarInt(stringSection.insert(key),
+                                       "resource key");
+      resourceOffsetEmitter.emitVarInt(size, "resource size");
+      resourceOffsetEmitter.emitByte(kind, "resource kind");
     }
   };
 
@@ -1244,7 +1283,8 @@ void BytecodeWriter::writeResourceSection(Operation *op,
                                config.shouldElideResourceData);
 
   // Emit the external resource entries.
-  resourceOffsetEmitter.emitVarInt(config.externalResourcePrinters.size());
+  resourceOffsetEmitter.emitVarInt(config.externalResourcePrinters.size(),
+                                   "external resource printer count");
   for (const auto &printer : config.externalResourcePrinters) {
     curResourceEntries.clear();
     printer->buildResources(op, entryBuilder);
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 754c94511524d..03e2f2be2156a 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -266,15 +266,14 @@ MlirAttribute mlirLLVMDILexicalBlockFileAttrGet(MlirContext ctx,
       cast<DIFileAttr>(unwrap(file)), discriminator));
 }
 
-MlirAttribute
-mlirLLVMDILocalVariableAttrGet(MlirContext ctx, MlirAttribute scope,
-                               MlirAttribute name, MlirAttribute diFile,
-                               unsigned int line, unsigned int arg,
-                               unsigned int alignInBits, MlirAttribute diType) {
+MlirAttribute mlirLLVMDILocalVariableAttrGet(
+    MlirContext ctx, MlirAttribute scope, MlirAttribute name,
+    MlirAttribute diFile, unsigned int line, unsigned int arg,
+    unsigned int alignInBits, MlirAttribute diType, int64_t flags) {
   return wrap(DILocalVariableAttr::get(
       unwrap(ctx), cast<DIScopeAttr>(unwrap(scope)),
       cast<StringAttr>(unwrap(name)), cast<DIFileAttr>(unwrap(diFile)), line,
-      arg, alignInBits, cast<DITypeAttr>(unwrap(diType))));
+      arg, alignInBits, cast<DITypeAttr>(unwrap(diType)), DIFlags(flags)));
 }
 
 MlirAttribute mlirLLVMDISubroutineTypeAttrGet(MlirContext ctx,
diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
index 19171d64d4094..f2a0ab33c0224 100644
--- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp
+++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
@@ -107,6 +107,7 @@ MlirSparseTensorLevelType mlirSparseTensorEncodingAttrBuildLvlType(
     unsigned size, unsigned n, unsigned m) {
 
   std::vector<LevelPropNonDefault> props;
+  props.reserve(size);
   for (unsigned i = 0; i < size; i++)
     props.push_back(static_cast<LevelPropNonDefault>(properties[i]));
 
diff --git a/mlir/lib/CAPI/IR/BuiltinTypes.cpp b/mlir/lib/CAPI/IR/BuiltinTypes.cpp
index 01bb71d9228b4..d507027357c26 100644
--- a/mlir/lib/CAPI/IR/BuiltinTypes.cpp
+++ b/mlir/lib/CAPI/IR/BuiltinTypes.cpp
@@ -97,6 +97,18 @@ MlirType mlirFloat8E5M2TypeGet(MlirContext ctx) {
   return wrap(FloatType::getFloat8E5M2(unwrap(ctx)));
 }
 
+MlirTypeID mlirFloat8E4M3TypeGetTypeID() {
+  return wrap(Float8E4M3Type::getTypeID());
+}
+
+bool mlirTypeIsAFloat8E4M3(MlirType type) {
+  return unwrap(type).isFloat8E4M3();
+}
+
+MlirType mlirFloat8E4M3TypeGet(MlirContext ctx) {
+  return wrap(FloatType::getFloat8E4M3(unwrap(ctx)));
+}
+
 MlirTypeID mlirFloat8E4M3FNTypeGetTypeID() {
   return wrap(Float8E4M3FNType::getTypeID());
 }
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 033e66c6118f3..b808738804030 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -321,6 +321,22 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
     return success();
   }
 };
+
+struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
+  SchedBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<SchedBarrierOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(SchedBarrierOp op, SchedBarrierOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<ROCDL::SchedBarrier>(op,
+                                                     (uint32_t)op.getOpts());
+    return success();
+  }
+};
+
 } // namespace
 
 /// If `input` is a vector of bytes, concatentate those bytes in little-endian
@@ -879,8 +895,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicUminOp>,
            RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
-           LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
-           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+           LDSBarrierOpLowering, SchedBarrierOpLowering, MFMAOpLowering,
+           WMMAOpLowering, ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering>(converter, chipset);
 }
 
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index db93f186fbd55..50384d9a08e5d 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -421,6 +421,38 @@ class ArithOpConversion final : public OpConversionPattern<ArithOp> {
   }
 };
 
+template <class ArithOp, class EmitCOp>
+class BinaryUIOpConversion final : public OpConversionPattern<ArithOp> {
+public:
+  using OpConversionPattern<ArithOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(ArithOp uiBinOp, typename ArithOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type newRetTy = this->getTypeConverter()->convertType(uiBinOp.getType());
+    if (!newRetTy)
+      return rewriter.notifyMatchFailure(uiBinOp,
+                                         "converting result type failed");
+    if (!isa<IntegerType>(newRetTy)) {
+      return rewriter.notifyMatchFailure(uiBinOp, "expected integer type");
+    }
+    Type unsignedType =
+        adaptIntegralTypeSignedness(newRetTy, /*needsUnsigned=*/true);
+    if (!unsignedType)
+      return rewriter.notifyMatchFailure(uiBinOp,
+                                         "converting result type failed");
+    Value lhsAdapted = adaptValueType(uiBinOp.getLhs(), rewriter, unsignedType);
+    Value rhsAdapted = adaptValueType(uiBinOp.getRhs(), rewriter, unsignedType);
+
+    auto newDivOp =
+        rewriter.create<EmitCOp>(uiBinOp.getLoc(), unsignedType,
+                                 ArrayRef<Value>{lhsAdapted, rhsAdapted});
+    Value resultAdapted = adaptValueType(newDivOp, rewriter, newRetTy);
+    rewriter.replaceOp(uiBinOp, resultAdapted);
+    return success();
+  }
+};
+
 template <typename ArithOp, typename EmitCOp>
 class IntegerOpConversion final : public OpConversionPattern<ArithOp> {
 public:
@@ -722,6 +754,8 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
     ArithOpConversion<arith::MulFOp, emitc::MulOp>,
     ArithOpConversion<arith::RemSIOp, emitc::RemOp>,
     ArithOpConversion<arith::SubFOp, emitc::SubOp>,
+    BinaryUIOpConversion<arith::DivUIOp, emitc::DivOp>,
+    BinaryUIOpConversion<arith::RemUIOp, emitc::RemOp>,
     IntegerOpConversion<arith::AddIOp, emitc::AddOp>,
     IntegerOpConversion<arith::MulIOp, emitc::MulOp>,
     IntegerOpConversion<arith::SubIOp, emitc::SubOp>,
diff --git a/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
index f7b090acf33af..c9d962d2de23f 100644
--- a/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
+++ b/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
@@ -27,6 +27,7 @@ add_mlir_conversion_library(MLIRConvertToSPIRVPass
   MLIRTransforms
   MLIRTransformUtils
   MLIRUBToSPIRV
+  MLIRVectorDialect
   MLIRVectorToSPIRV
   MLIRVectorTransforms
   )
diff --git a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
index 003a5feea9e9b..4694a147e1e94 100644
--- a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
@@ -17,6 +17,8 @@
 #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
@@ -42,19 +44,16 @@ struct ConvertToSPIRVPass final
   using ConvertToSPIRVPassBase::ConvertToSPIRVPassBase;
 
   void runOnOperation() override {
-    MLIRContext *context = &getContext();
     Operation *op = getOperation();
+    MLIRContext *context = &getContext();
 
-    if (runSignatureConversion) {
-      // Unroll vectors in function signatures to native vector size.
-      RewritePatternSet patterns(context);
-      populateFuncOpVectorRewritePatterns(patterns);
-      populateReturnOpVectorRewritePatterns(patterns);
-      GreedyRewriteConfig config;
-      config.strictMode = GreedyRewriteStrictness::ExistingOps;
-      if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
-        return signalPassFailure();
-    }
+    // Unroll vectors in function signatures to native size.
+    if (runSignatureConversion && failed(spirv::unrollVectorsInSignatures(op)))
+      return signalPassFailure();
+
+    // Unroll vectors in function bodies to native size.
+    if (runVectorUnrolling && failed(spirv::unrollVectorsInFuncBodies(op)))
+      return signalPassFailure();
 
     spirv::TargetEnvAttr targetAttr = spirv::lookupTargetEnvOrDefault(op);
     std::unique_ptr<ConversionTarget> target =
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 059acb217709c..c1f6d8bc5b361 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -325,7 +325,7 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
         rewriter.getContext(),
         {LLVM::ModRefInfo::NoModRef, LLVM::ModRefInfo::NoModRef,
          LLVM::ModRefInfo::NoModRef});
-    newFuncOp.setMemoryAttr(memoryAttr);
+    newFuncOp.setMemoryEffectsAttr(memoryAttr);
   }
 
   // Propagate argument/result attributes to all converted arguments/result
diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index ebce2d77310ae..6be5548fdb60e 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -9,15 +9,16 @@
 #define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_
 
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Builders.h"
 
 namespace mlir {
 
-/// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func`
-/// depending on the element type that Op operates upon. The function
-/// declaration is added in case it was not added before.
+/// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func` or
+/// `f32ApproxFunc` depending on the element type and the fastMathFlag of that
+/// Op. The function declaration is added in case it was not added before.
 ///
 /// If the input values are of f16 type, the value is first casted to f32, the
 /// function called and then the result casted back.
@@ -27,13 +28,22 @@ namespace mlir {
 ///
 /// will be transformed into
 ///   llvm.call @__nv_expf(%arg_f32) : (f32) -> f32
+///
+/// If the fastMathFlag attribute of SourceOp is `afn` or `fast`, this Op lowers
+/// to the approximate calculation function.
+///
+/// Also example with NVVM:
+///   %exp_f32 = math.exp %arg_f32 fastmath<afn> : f32
+///
+/// will be transformed into
+///   llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32
 template <typename SourceOp>
 struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
 public:
   explicit OpToFuncCallLowering(LLVMTypeConverter &lowering, StringRef f32Func,
-                                StringRef f64Func)
+                                StringRef f64Func, StringRef f32ApproxFunc)
       : ConvertOpToLLVMPattern<SourceOp>(lowering), f32Func(f32Func),
-        f64Func(f64Func) {}
+        f64Func(f64Func), f32ApproxFunc(f32ApproxFunc) {}
 
   LogicalResult
   matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
@@ -55,7 +65,8 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     Type resultType = castedOperands.front().getType();
     Type funcType = getFunctionType(resultType, castedOperands);
     StringRef funcName =
-        getFunctionName(cast<LLVM::LLVMFunctionType>(funcType).getReturnType());
+        getFunctionName(cast<LLVM::LLVMFunctionType>(funcType).getReturnType(),
+                        op.getFastmath());
     if (funcName.empty())
       return failure();
 
@@ -90,9 +101,14 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     return LLVM::LLVMFunctionType::get(resultType, operandTypes);
   }
 
-  StringRef getFunctionName(Type type) const {
-    if (isa<Float32Type>(type))
-      return f32Func;
+  StringRef getFunctionName(Type type, arith::FastMathFlags flag) const {
+    if (isa<Float32Type>(type)) {
+      if (((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) &&
+          !f32ApproxFunc.empty())
+        return f32ApproxFunc;
+      else
+        return f32Func;
+    }
     if (isa<Float64Type>(type))
       return f64Func;
     return "";
@@ -113,6 +129,7 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
 
   const std::string f32Func;
   const std::string f64Func;
+  const std::string f32ApproxFunc;
 };
 
 } // namespace mlir
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index ebeb8f803d71d..27d63b5f8948d 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -43,8 +43,8 @@ namespace mlir {
 static LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable,
                                               StringRef name,
                                               ArrayRef<Type> paramTypes,
-                                              Type resultType,
-                                              bool isConvergent = false) {
+                                              Type resultType, bool isMemNone,
+                                              bool isConvergent) {
   auto func = dyn_cast_or_null<LLVM::LLVMFuncOp>(
       SymbolTable::lookupSymbolIn(symbolTable, name));
   if (!func) {
@@ -53,6 +53,18 @@ static LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable,
         symbolTable->getLoc(), name,
         LLVM::LLVMFunctionType::get(resultType, paramTypes));
     func.setCConv(LLVM::cconv::CConv::SPIR_FUNC);
+    func.setNoUnwind(true);
+    func.setWillReturn(true);
+
+    if (isMemNone) {
+      // no externally observable effects
+      constexpr auto noModRef = mlir::LLVM::ModRefInfo::NoModRef;
+      auto memAttr = b.getAttr<LLVM::MemoryEffectsAttr>(
+          /*other=*/noModRef,
+          /*argMem=*/noModRef, /*inaccessibleMem=*/noModRef);
+      func.setMemoryEffectsAttr(memAttr);
+    }
+
     func.setConvergent(isConvergent);
   }
   return func;
@@ -64,6 +76,10 @@ static LLVM::CallOp createSPIRVBuiltinCall(Location loc,
                                            ValueRange args) {
   auto call = rewriter.create<LLVM::CallOp>(loc, func, args);
   call.setCConv(func.getCConv());
+  call.setConvergentAttr(func.getConvergentAttr());
+  call.setNoUnwindAttr(func.getNoUnwindAttr());
+  call.setWillReturnAttr(func.getWillReturnAttr());
+  call.setMemoryEffectsAttr(func.getMemoryEffectsAttr());
   return call;
 }
 
@@ -91,8 +107,9 @@ struct GPUBarrierConversion final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
     assert(moduleOp && "Expecting module");
     Type flagTy = rewriter.getI32Type();
     Type voidTy = rewriter.getType<LLVM::LLVMVoidType>();
-    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(
-        moduleOp, funcName, flagTy, voidTy, /*isConvergent=*/true);
+    LLVM::LLVMFuncOp func =
+        lookupOrCreateSPIRVFn(moduleOp, funcName, flagTy, voidTy,
+                              /*isMemNone=*/false, /*isConvergent=*/true);
 
     // Value used by SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE`.
     // See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`.
@@ -134,8 +151,9 @@ struct LaunchConfigConversion : ConvertToLLVMPattern {
     assert(moduleOp && "Expecting module");
     Type dimTy = rewriter.getI32Type();
     Type indexTy = getTypeConverter()->getIndexType();
-    LLVM::LLVMFuncOp func =
-        lookupOrCreateSPIRVFn(moduleOp, funcName, dimTy, indexTy);
+    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(moduleOp, funcName, dimTy,
+                                                  indexTy, /*isMemNone=*/true,
+                                                  /*isConvergent=*/false);
 
     Location loc = op->getLoc();
     gpu::Dimension dim = getDimension(op);
@@ -268,9 +286,9 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     Type valueType = adaptor.getValue().getType();
     Type offsetType = adaptor.getOffset().getType();
     Type resultType = valueType;
-    LLVM::LLVMFuncOp func =
-        lookupOrCreateSPIRVFn(moduleOp, funcName, {valueType, offsetType},
-                              resultType, /*isConvergent=*/true);
+    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(
+        moduleOp, funcName, {valueType, offsetType}, resultType,
+        /*isMemNone=*/false, /*isConvergent=*/true);
 
     Location loc = op->getLoc();
     std::array<Value, 2> args{adaptor.getValue(), adaptor.getOffset()};
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index fea8a0ddc7f06..faa97caacb885 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -309,10 +309,11 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
   target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
   target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
   target.addIllegalDialect<gpu::GPUDialect>();
-  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
-                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
-                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
-                      LLVM::SqrtOp>();
+  target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
+                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FMAOp,
+                      LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
+                      LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp,
+                      LLVM::SinOp, LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
@@ -321,9 +322,11 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
 template <typename OpTy>
 static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func) {
+                               StringRef f64Func,
+                               StringRef f32ApproxFunc = "") {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f32ApproxFunc);
 }
 
 void mlir::populateGpuSubgroupReduceOpLoweringPattern(
@@ -370,42 +373,68 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
       StringAttr::get(&converter.getContext(),
                       NVVM::NVVMDialect::getMaxntidAttrName()));
 
+  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
+                                    "__nv_fmod");
   populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                    "__nv_fabs");
+  populateOpPatterns<math::AcosOp>(converter, patterns, "__nv_acosf",
+                                   "__nv_acos");
+  populateOpPatterns<math::AcoshOp>(converter, patterns, "__nv_acoshf",
+                                    "__nv_acosh");
+  populateOpPatterns<math::AsinOp>(converter, patterns, "__nv_asinf",
+                                   "__nv_asin");
+  populateOpPatterns<math::AsinhOp>(converter, patterns, "__nv_asinhf",
+                                    "__nv_asinh");
   populateOpPatterns<math::AtanOp>(converter, patterns, "__nv_atanf",
                                    "__nv_atan");
   populateOpPatterns<math::Atan2Op>(converter, patterns, "__nv_atan2f",
                                     "__nv_atan2");
+  populateOpPatterns<math::AtanhOp>(converter, patterns, "__nv_atanhf",
+                                    "__nv_atanh");
   populateOpPatterns<math::CbrtOp>(converter, patterns, "__nv_cbrtf",
                                    "__nv_cbrt");
   populateOpPatterns<math::CeilOp>(converter, patterns, "__nv_ceilf",
                                    "__nv_ceil");
-  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos");
+  populateOpPatterns<math::CopySignOp>(converter, patterns, "__nv_copysignf",
+                                       "__nv_copysign");
+  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos",
+                                  "__nv_fast_cosf");
+  populateOpPatterns<math::CoshOp>(converter, patterns, "__nv_coshf",
+                                   "__nv_cosh");
   populateOpPatterns<math::ErfOp>(converter, patterns, "__nv_erff", "__nv_erf");
-  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp");
+  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp",
+                                  "__nv_fast_expf");
   populateOpPatterns<math::Exp2Op>(converter, patterns, "__nv_exp2f",
                                    "__nv_exp2");
   populateOpPatterns<math::ExpM1Op>(converter, patterns, "__nv_expm1f",
                                     "__nv_expm1");
   populateOpPatterns<math::FloorOp>(converter, patterns, "__nv_floorf",
                                     "__nv_floor");
-  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
-                                    "__nv_fmod");
-  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log");
+  populateOpPatterns<math::FmaOp>(converter, patterns, "__nv_fmaf", "__nv_fma");
+  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log",
+                                  "__nv_fast_logf");
+  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
+                                    "__nv_log10", "__nv_fast_log10f");
   populateOpPatterns<math::Log1pOp>(converter, patterns, "__nv_log1pf",
                                     "__nv_log1p");
-  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
-                                    "__nv_log10");
   populateOpPatterns<math::Log2Op>(converter, patterns, "__nv_log2f",
-                                   "__nv_log2");
-  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf",
-                                   "__nv_pow");
+                                   "__nv_log2", "__nv_fast_log2f");
+  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf", "__nv_pow",
+                                   "__nv_fast_powf");
+  populateOpPatterns<math::RoundOp>(converter, patterns, "__nv_roundf",
+                                    "__nv_round");
+  populateOpPatterns<math::RoundEvenOp>(converter, patterns, "__nv_rintf",
+                                        "__nv_rint");
   populateOpPatterns<math::RsqrtOp>(converter, patterns, "__nv_rsqrtf",
                                     "__nv_rsqrt");
-  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin");
+  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin",
+                                  "__nv_fast_sinf");
+  populateOpPatterns<math::SinhOp>(converter, patterns, "__nv_sinhf",
+                                   "__nv_sinh");
   populateOpPatterns<math::SqrtOp>(converter, patterns, "__nv_sqrtf",
                                    "__nv_sqrt");
+  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan",
+                                  "__nv_fast_tanf");
   populateOpPatterns<math::TanhOp>(converter, patterns, "__nv_tanhf",
                                    "__nv_tanh");
-  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
 }
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index 32d02d5e438bd..d5df960928afb 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -247,8 +247,9 @@ Type LLVMTypeConverter::convertIntegerType(IntegerType type) const {
 }
 
 Type LLVMTypeConverter::convertFloatType(FloatType type) const {
-  if (type.isFloat8E5M2() || type.isFloat8E4M3FN() || type.isFloat8E5M2FNUZ() ||
-      type.isFloat8E4M3FNUZ() || type.isFloat8E4M3B11FNUZ())
+  if (type.isFloat8E5M2() || type.isFloat8E4M3() || type.isFloat8E4M3FN() ||
+      type.isFloat8E5M2FNUZ() || type.isFloat8E4M3FNUZ() ||
+      type.isFloat8E4M3B11FNUZ())
     return IntegerType::get(&getContext(), type.getWidth());
   return type;
 }
diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
index 03c7ce5dac0d1..7de6971ba2ee7 100644
--- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
+++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
@@ -38,9 +38,11 @@ using namespace mlir;
 template <typename OpTy>
 static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func) {
+                               StringRef f64Func,
+                               StringRef f32ApproxFunc = "") {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f32ApproxFunc);
 }
 
 void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 11d29754aa760..cf984ca3293c0 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -1666,6 +1667,40 @@ struct NVGPUTmaPrefetchOpLowering
   }
 };
 
+struct NVGPURcpOpLowering : public ConvertOpToLLVMPattern<nvgpu::RcpOp> {
+  using ConvertOpToLLVMPattern<nvgpu::RcpOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::RcpOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+    auto i64Ty = b.getI64Type();
+    auto f32Ty = b.getF32Type();
+    VectorType inTy = op.getIn().getType();
+    // apply rcp.approx.ftz.f on each element in vector.
+    auto convert1DVec = [&](Type llvm1DVectorTy, Value inVec) {
+      Value ret1DVec = b.create<LLVM::UndefOp>(llvm1DVectorTy);
+      int numElems = llvm::cast<VectorType>(llvm1DVectorTy).getNumElements();
+      for (int i = 0; i < numElems; i++) {
+        Value idx = b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(i));
+        Value elem = b.create<LLVM::ExtractElementOp>(inVec, idx);
+        Value dst = b.create<NVVM::RcpApproxFtzF32Op>(f32Ty, elem);
+        ret1DVec = b.create<LLVM::InsertElementOp>(ret1DVec, dst, idx);
+      }
+      return ret1DVec;
+    };
+    if (inTy.getRank() == 1) {
+      rewriter.replaceOp(op, convert1DVec(inTy, adaptor.getIn()));
+      return success();
+    }
+    return LLVM::detail::handleMultidimensionalVectors(
+        op.getOperation(), adaptor.getOperands(), *(this->getTypeConverter()),
+        [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
+          OpAdaptor adaptor(operands);
+          return convert1DVec(llvm1DVectorTy, adaptor.getIn());
+        },
+        rewriter);
+  }
+};
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -1688,5 +1723,5 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
       NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
       MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
       NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
-      NVGPUMmaSparseSyncLowering>(converter);
+      NVGPUMmaSparseSyncLowering, NVGPURcpOpLowering>(converter);
 }
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index fdc4c7be1ca5c..6d250237e0a33 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -366,7 +366,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
     // Declare reductions.
     // TODO: consider checking it here is already a compatible reduction
     // declaration and use it instead of redeclaring.
-    SmallVector<Attribute> reductionDeclSymbols;
+    SmallVector<Attribute> reductionSyms;
     SmallVector<omp::DeclareReductionOp> ompReductionDecls;
     auto reduce = cast<scf::ReduceOp>(parallelOp.getBody()->getTerminator());
     for (int64_t i = 0, e = parallelOp.getNumReductions(); i < e; ++i) {
@@ -374,7 +374,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
       ompReductionDecls.push_back(decl);
       if (!decl)
         return failure();
-      reductionDeclSymbols.push_back(
+      reductionSyms.push_back(
           SymbolRefAttr::get(rewriter.getContext(), decl.getSymName()));
     }
 
@@ -444,16 +444,16 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
     // Create the parallel wrapper.
     auto ompParallel = rewriter.create<omp::ParallelOp>(
         loc,
-        /* if_expr = */ Value{},
-        /* num_threads_var = */ numThreadsVar,
         /* allocate_vars = */ llvm::SmallVector<Value>{},
-        /* allocators_vars = */ llvm::SmallVector<Value>{},
-        /* reduction_vars = */ llvm::SmallVector<Value>{},
-        /* reduction_vars_isbyref = */ DenseBoolArrayAttr{},
-        /* reductions = */ ArrayAttr{},
-        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
+        /* allocator_vars = */ llvm::SmallVector<Value>{},
+        /* if_expr = */ Value{},
+        /* num_threads = */ numThreadsVar,
         /* private_vars = */ ValueRange(),
-        /* privatizers = */ nullptr);
+        /* private_syms = */ nullptr,
+        /* proc_bind_kind = */ omp::ClauseProcBindKindAttr{},
+        /* reduction_vars = */ llvm::SmallVector<Value>{},
+        /* reduction_byref = */ DenseBoolArrayAttr{},
+        /* reduction_syms = */ ArrayAttr{});
     {
 
       OpBuilder::InsertionGuard guard(rewriter);
@@ -465,15 +465,15 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
         // Create worksharing loop wrapper.
         auto wsloopOp = rewriter.create<omp::WsloopOp>(parallelOp.getLoc());
         if (!reductionVariables.empty()) {
-          wsloopOp.setReductionsAttr(
-              ArrayAttr::get(rewriter.getContext(), reductionDeclSymbols));
+          wsloopOp.setReductionSymsAttr(
+              ArrayAttr::get(rewriter.getContext(), reductionSyms));
           wsloopOp.getReductionVarsMutable().append(reductionVariables);
-          llvm::SmallVector<bool> byRefVec;
+          llvm::SmallVector<bool> reductionByRef;
           // false because these reductions always reduce scalars and so do
           // not need to pass by reference
-          byRefVec.resize(reductionVariables.size(), false);
-          wsloopOp.setReductionVarsByref(
-              DenseBoolArrayAttr::get(rewriter.getContext(), byRefVec));
+          reductionByRef.resize(reductionVariables.size(), false);
+          wsloopOp.setReductionByref(
+              DenseBoolArrayAttr::get(rewriter.getContext(), reductionByRef));
         }
         rewriter.create<omp::TerminatorOp>(loc); // omp.parallel terminator.
 
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 1442f2ad72255..ba259d4b84fce 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -298,6 +298,14 @@ static Value createLinalgBodyCalculationForElementwiseOp(
   if (isa<tosa::ExpOp>(op) && isa<FloatType>(elementTy))
     return rewriter.create<mlir::math::ExpOp>(loc, resultTypes, args);
 
+  // tosa::SinOp
+  if (isa<tosa::SinOp>(op) && isa<FloatType>(elementTy))
+    return rewriter.create<mlir::math::SinOp>(loc, resultTypes, args);
+
+  // tosa::CosOp
+  if (isa<tosa::CosOp>(op) && isa<FloatType>(elementTy))
+    return rewriter.create<mlir::math::CosOp>(loc, resultTypes, args);
+
   // tosa::TanhOp
   if (isa<tosa::TanhOp>(op) && isa<FloatType>(elementTy))
     return rewriter.create<mlir::math::TanhOp>(loc, resultTypes, args);
@@ -2598,6 +2606,8 @@ void mlir::tosa::populateTosaToLinalgConversionPatterns(
       PointwiseConverter<tosa::LogOp>,
       PointwiseConverter<tosa::ExpOp>,
       PointwiseConverter<tosa::AbsOp>,
+      PointwiseConverter<tosa::SinOp>,
+      PointwiseConverter<tosa::CosOp>,
       PointwiseConverter<tosa::TanhOp>,
       PointwiseConverter<tosa::ErfOp>,
       PointwiseConverter<tosa::BitwiseAndOp>,
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index f6b1c42dcd24c..53e18a2e9d299 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -994,7 +994,7 @@ class VectorShuffleOpConversion
     auto v2Type = shuffleOp.getV2VectorType();
     auto vectorType = shuffleOp.getResultVectorType();
     Type llvmType = typeConverter->convertType(vectorType);
-    auto maskArrayAttr = shuffleOp.getMask();
+    ArrayRef<int64_t> mask = shuffleOp.getMask();
 
     // Bail if result type cannot be lowered.
     if (!llvmType)
@@ -1015,7 +1015,7 @@ class VectorShuffleOpConversion
     if (rank <= 1 && v1Type == v2Type) {
       Value llvmShuffleOp = rewriter.create<LLVM::ShuffleVectorOp>(
           loc, adaptor.getV1(), adaptor.getV2(),
-          LLVM::convertArrayToIndices<int32_t>(maskArrayAttr));
+          llvm::to_vector_of<int32_t>(mask));
       rewriter.replaceOp(shuffleOp, llvmShuffleOp);
       return success();
     }
@@ -1029,8 +1029,7 @@ class VectorShuffleOpConversion
       eltType = cast<VectorType>(llvmType).getElementType();
     Value insert = rewriter.create<LLVM::UndefOp>(loc, llvmType);
     int64_t insPos = 0;
-    for (const auto &en : llvm::enumerate(maskArrayAttr)) {
-      int64_t extPos = cast<IntegerAttr>(en.value()).getInt();
+    for (int64_t extPos : mask) {
       Value value = adaptor.getV1();
       if (extPos >= v1Dim) {
         extPos -= v1Dim;
diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
index a4390447532a5..21b8858989839 100644
--- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
+++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
@@ -144,8 +144,8 @@ struct VectorBroadcastConvert final
 
     SmallVector<Value, 4> source(castOp.getResultVectorType().getNumElements(),
                                  adaptor.getSource());
-    rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(
-        castOp, castOp.getResultVectorType(), source);
+    rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(castOp, resultType,
+                                                             source);
     return success();
   }
 };
@@ -527,10 +527,7 @@ struct VectorShuffleOpConvert final
       return rewriter.notifyMatchFailure(shuffleOp,
                                          "unsupported result vector type");
 
-    SmallVector<int32_t, 4> mask = llvm::map_to_vector<4>(
-        shuffleOp.getMask(), [](Attribute attr) -> int32_t {
-          return cast<IntegerAttr>(attr).getValue().getZExtValue();
-        });
+    auto mask = llvm::to_vector_of<int32_t>(shuffleOp.getMask());
 
     VectorType oldV1Type = shuffleOp.getV1VectorType();
     VectorType oldV2Type = shuffleOp.getV2VectorType();
@@ -906,6 +903,43 @@ struct VectorReductionToFPDotProd final
   }
 };
 
+struct VectorStepOpConvert final : OpConversionPattern<vector::StepOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::StepOp stepOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    const auto &typeConverter = *getTypeConverter<SPIRVTypeConverter>();
+    Type dstType = typeConverter.convertType(stepOp.getType());
+    if (!dstType)
+      return failure();
+
+    Location loc = stepOp.getLoc();
+    int64_t numElements = stepOp.getType().getNumElements();
+    auto intType =
+        rewriter.getIntegerType(typeConverter.getIndexTypeBitwidth());
+
+    // Input vectors of size 1 are converted to scalars by the type converter.
+    // We just create a constant in this case.
+    if (numElements == 1) {
+      Value zero = spirv::ConstantOp::getZero(intType, loc, rewriter);
+      rewriter.replaceOp(stepOp, zero);
+      return success();
+    }
+
+    SmallVector<Value> source;
+    source.reserve(numElements);
+    for (int64_t i = 0; i < numElements; ++i) {
+      Attribute intAttr = rewriter.getIntegerAttr(intType, i);
+      Value constOp = rewriter.create<spirv::ConstantOp>(loc, intType, intAttr);
+      source.push_back(constOp);
+    }
+    rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(stepOp, dstType,
+                                                             source);
+    return success();
+  }
+};
+
 } // namespace
 #define CL_INT_MAX_MIN_OPS                                                     \
   spirv::CLUMaxOp, spirv::CLUMinOp, spirv::CLSMaxOp, spirv::CLSMinOp
@@ -929,8 +963,9 @@ void mlir::populateVectorToSPIRVPatterns(SPIRVTypeConverter &typeConverter,
       VectorReductionFloatMinMax<GL_FLOAT_MAX_MIN_OPS>, VectorShapeCast,
       VectorInsertStridedSliceOpConvert, VectorShuffleOpConvert,
       VectorInterleaveOpConvert, VectorDeinterleaveOpConvert,
-      VectorSplatPattern, VectorLoadOpConverter, VectorStoreOpConverter>(
-      typeConverter, patterns.getContext(), PatternBenefit(1));
+      VectorSplatPattern, VectorLoadOpConverter, VectorStoreOpConverter,
+      VectorStepOpConvert>(typeConverter, patterns.getContext(),
+                           PatternBenefit(1));
 
   // Make sure that the more specialized dot product pattern has higher benefit
   // than the generic one that extracts all elements.
diff --git a/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt b/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
index 61e49b0da8b2d..3a1996349dbed 100644
--- a/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
@@ -8,6 +8,9 @@ add_mlir_dialect_library(MLIRAffineAnalysis
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
 
+  DEPENDS
+  MLIRFuncOpsIncGen
+
   LINK_LIBS PUBLIC
   MLIRAffineDialect
   MLIRAnalysis
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 82ba8fc5ccbc1..411b5efb36cab 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Debug.h"
 #include <numeric>
 #include <optional>
 #include <type_traits>
@@ -30,6 +31,8 @@
 using namespace mlir;
 using namespace mlir::affine;
 
+#define DEBUG_TYPE "affine-loop-analysis"
+
 /// Returns the trip count of the loop as an affine expression if the latter is
 /// expressible as an affine expression, and nullptr otherwise. The trip count
 /// expression is simplified before returning. This method only utilizes map
@@ -390,3 +393,58 @@ bool mlir::affine::isOpwiseShiftValid(AffineForOp forOp,
   }
   return true;
 }
+
+bool mlir::affine::isTilingValid(ArrayRef<AffineForOp> loops) {
+  assert(!loops.empty() && "no original loops provided");
+
+  // We first find out all dependences we intend to check.
+  SmallVector<Operation *, 8> loadAndStoreOps;
+  loops[0]->walk([&](Operation *op) {
+    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
+      loadAndStoreOps.push_back(op);
+  });
+
+  unsigned numOps = loadAndStoreOps.size();
+  unsigned numLoops = loops.size();
+  for (unsigned d = 1; d <= numLoops + 1; ++d) {
+    for (unsigned i = 0; i < numOps; ++i) {
+      Operation *srcOp = loadAndStoreOps[i];
+      MemRefAccess srcAccess(srcOp);
+      for (unsigned j = 0; j < numOps; ++j) {
+        Operation *dstOp = loadAndStoreOps[j];
+        MemRefAccess dstAccess(dstOp);
+
+        SmallVector<DependenceComponent, 2> depComps;
+        DependenceResult result = checkMemrefAccessDependence(
+            srcAccess, dstAccess, d, /*dependenceConstraints=*/nullptr,
+            &depComps);
+
+        // Skip if there is no dependence in this case.
+        if (!hasDependence(result))
+          continue;
+
+        // Check whether there is any negative direction vector in the
+        // dependence components found above, which means that dependence is
+        // violated by the default hyper-rect tiling method.
+        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
+                                   "for dependence at depth: "
+                                << Twine(d) << " between:\n";);
+        LLVM_DEBUG(srcAccess.opInst->dump());
+        LLVM_DEBUG(dstAccess.opInst->dump());
+        for (const DependenceComponent &depComp : depComps) {
+          if (depComp.lb.has_value() && depComp.ub.has_value() &&
+              *depComp.lb < *depComp.ub && *depComp.ub < 0) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Dependence component lb = " << Twine(*depComp.lb)
+                       << " ub = " << Twine(*depComp.ub)
+                       << " is negative  at depth: " << Twine(d)
+                       << " and thus violates the legality rule.\n");
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
index 2650a06d198ea..c8400dfe8cd5c 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
@@ -93,69 +93,6 @@ static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
   }
 }
 
-/// Checks whether hyper-rectangular loop tiling of the nest represented by
-/// `origLoops` is valid. The validity condition is from Irigoin and Triolet,
-/// which states that two tiles cannot depend on each other. We simplify such
-/// condition to just checking whether there is any negative dependence
-/// direction, since we have the prior knowledge that the tiling results will be
-/// hyper-rectangles, which are scheduled in the lexicographically increasing
-/// order on the vector of loop indices. This function will return failure when
-/// any dependence component is negative along any of `origLoops`.
-static bool checkTilingLegality(MutableArrayRef<AffineForOp> origLoops) {
-  assert(!origLoops.empty() && "no original loops provided");
-
-  // We first find out all dependences we intend to check.
-  SmallVector<Operation *, 8> loadAndStoreOps;
-  origLoops[0]->walk([&](Operation *op) {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
-      loadAndStoreOps.push_back(op);
-  });
-
-  unsigned numOps = loadAndStoreOps.size();
-  unsigned numLoops = origLoops.size();
-  for (unsigned d = 1; d <= numLoops + 1; ++d) {
-    for (unsigned i = 0; i < numOps; ++i) {
-      Operation *srcOp = loadAndStoreOps[i];
-      MemRefAccess srcAccess(srcOp);
-      for (unsigned j = 0; j < numOps; ++j) {
-        Operation *dstOp = loadAndStoreOps[j];
-        MemRefAccess dstAccess(dstOp);
-
-        SmallVector<DependenceComponent, 2> depComps;
-        DependenceResult result = checkMemrefAccessDependence(
-            srcAccess, dstAccess, d, /*dependenceConstraints=*/nullptr,
-            &depComps);
-
-        // Skip if there is no dependence in this case.
-        if (!hasDependence(result))
-          continue;
-
-        // Check whether there is any negative direction vector in the
-        // dependence components found above, which means that dependence is
-        // violated by the default hyper-rect tiling method.
-        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
-                                   "for dependence at depth: "
-                                << Twine(d) << " between:\n";);
-        LLVM_DEBUG(srcAccess.opInst->dump(););
-        LLVM_DEBUG(dstAccess.opInst->dump(););
-        for (const DependenceComponent &depComp : depComps) {
-          if (depComp.lb.has_value() && depComp.ub.has_value() &&
-              *depComp.lb < *depComp.ub && *depComp.ub < 0) {
-            LLVM_DEBUG(llvm::dbgs()
-                       << "Dependence component lb = " << Twine(*depComp.lb)
-                       << " ub = " << Twine(*depComp.ub)
-                       << " is negative  at depth: " << Twine(d)
-                       << " and thus violates the legality rule.\n");
-            return false;
-          }
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
 // Returns tile sizes to use. Checks CL options; if none are specified, sets it
 // based on a simple model that looks at the memory footprint and determines
 // tile sizes assuming identity accesses / 1:1 tile size proportional footprint
@@ -242,8 +179,8 @@ void LoopTiling::runOnOperation() {
 
   // Tile each band.
   for (auto &band : bands) {
-    if (!checkTilingLegality(band)) {
-      band.front().emitRemark("tiling code is illegal due to dependences");
+    if (!isTilingValid(band)) {
+      band.front().emitRemark("tiling nest is invalid due to dependences");
       continue;
     }
 
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 605737542e9fc..f09d93f3ba444 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -12,10 +12,8 @@
 
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -2082,8 +2080,8 @@ static LogicalResult generateCopy(
 
   auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
 
-  Value dmaStride = nullptr;
-  Value numEltPerDmaStride = nullptr;
+  Value dmaStride;
+  Value numEltPerDmaStride;
   if (copyOptions.generateDma) {
     SmallVector<StrideInfo, 4> dmaStrideInfos;
     getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index aa5eb95a3d22e..641b7d7e2d13b 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1739,6 +1739,8 @@ OpFoldResult arith::BitcastOp::fold(FoldAdaptor adaptor) {
   APInt bits = llvm::isa<FloatAttr>(operand)
                    ? llvm::cast<FloatAttr>(operand).getValue().bitcastToAPInt()
                    : llvm::cast<IntegerAttr>(operand).getValue();
+  assert(resType.getIntOrFloatBitWidth() == bits.getBitWidth() &&
+         "trying to fold on broken IR: operands have incompatible types");
 
   if (auto resFloatType = llvm::dyn_cast<FloatType>(resType))
     return FloatAttr::get(resType,
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
index 8e1cb474feee7..a362c8500aa5b 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
@@ -56,6 +56,7 @@ static std::optional<FloatType> parseFloatType(MLIRContext *ctx,
   Builder b(ctx);
   return llvm::StringSwitch<std::optional<FloatType>>(name)
       .Case("f8E5M2", b.getFloat8E5M2Type())
+      .Case("f8E4M3", b.getFloat8E4M3Type())
       .Case("f8E4M3FN", b.getFloat8E4M3FNType())
       .Case("f8E5M2FNUZ", b.getFloat8E5M2FNUZType())
       .Case("f8E4M3FNUZ", b.getFloat8E4M3FNUZType())
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
index 600f2ecdb51bc..8f9b5080e82db 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@ add_mlir_dialect_library(MLIRArmSMETransforms
   MLIRFuncDialect
   MLIRLLVMCommonConversion
   MLIRVectorDialect
+  MLIRIndexDialect
   MLIRSCFDialect
   MLIRSCFTransforms
   MLIRFuncTransforms
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
index 39292c4533d69..1e711678dc9ab 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
@@ -479,7 +479,7 @@ struct SwapVectorExtractOfArithExtend
       return rewriter.notifyMatchFailure(extractOp,
                                          "extracted type is not a vector type");
 
-    auto numScalableDims = llvm::count(resultType.getScalableDims(), true);
+    auto numScalableDims = resultType.getNumScalableDims();
     if (numScalableDims != 1)
       return rewriter.notifyMatchFailure(
           extractOp, "extracted type is not a 1-D scalable vector type");
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index 733e758b43907..3a2042d23e534 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -153,10 +153,18 @@ class TileAllocator {
     return failure();
   }
 
+  /// Acquires a specific tile ID. Asserts the tile is initially free.
+  void acquireTileId(ArmSMETileType tileType, unsigned tileId) {
+    TileMask tileMask = getMasks(tileType)[tileId];
+    assert((tilesInUse & tileMask) == TileMask::kNone &&
+           "cannot acquire allocated tile!");
+    tilesInUse |= tileMask;
+  }
+
   /// Releases a previously allocated tile ID.
   void releaseTileId(ArmSMETileType tileType, unsigned tileId) {
     TileMask tileMask = getMasks(tileType)[tileId];
-    assert((tilesInUse & tileMask) != TileMask::kNone &&
+    assert((tilesInUse & tileMask) == tileMask &&
            "cannot release unallocated tile!");
     tilesInUse ^= tileMask;
   }
@@ -289,6 +297,11 @@ struct LiveRange {
         .valid();
   }
 
+  /// Returns true if this range is active at `point` in the program.
+  bool overlaps(uint64_t point) const {
+    return ranges->lookup(point) == kValidLiveRange;
+  }
+
   /// Unions this live range with `otherRange`, aborts if the ranges overlap.
   void unionWith(LiveRange const &otherRange) {
     for (auto it = otherRange.ranges->begin(); it != otherRange.ranges->end();
@@ -488,76 +501,140 @@ coalesceTileLiveRanges(DenseMap<Value, LiveRange> &initialLiveRanges) {
   return std::move(coalescedLiveRanges);
 }
 
-/// Choose a live range to spill (via some heuristics). This picks either an
-/// active live range from `activeRanges` or the new live range `newRange`.
-LiveRange *chooseSpillUsingHeuristics(ArrayRef<LiveRange *> activeRanges,
-                                      LiveRange *newRange) {
+/// Choose a live range to spill (via some heuristics). This picks either a live
+/// range from `overlappingRanges`, or the new live range `newRange`.
+template <typename OverlappingRangesIterator>
+LiveRange *
+chooseSpillUsingHeuristics(OverlappingRangesIterator overlappingRanges,
+                           LiveRange *newRange) {
   // Heuristic: Spill trivially copyable operations (usually free).
-  auto isTrivialSpill = [&](LiveRange *allocatedRange) {
-    return isTileTypeGreaterOrEqual(allocatedRange->getTileType(),
+  auto isTrivialSpill = [&](LiveRange &allocatedRange) {
+    return isTileTypeGreaterOrEqual(allocatedRange.getTileType(),
                                     newRange->getTileType()) &&
-           allocatedRange->values.size() == 1 &&
+           allocatedRange.values.size() == 1 &&
            isTriviallyCloneableTileOp(
-               allocatedRange->values[0]
-                   .getDefiningOp<ArmSMETileOpInterface>());
+               allocatedRange.values[0].getDefiningOp<ArmSMETileOpInterface>());
   };
-  if (isTrivialSpill(newRange))
+  if (isTrivialSpill(*newRange))
     return newRange;
-  auto trivialSpill = llvm::find_if(activeRanges, isTrivialSpill);
-  if (trivialSpill != activeRanges.end())
-    return *trivialSpill;
+  auto trivialSpill = llvm::find_if(overlappingRanges, isTrivialSpill);
+  if (trivialSpill != overlappingRanges.end())
+    return &*trivialSpill;
 
   // Heuristic: Spill the range that ends last (with a compatible tile type).
-  auto isSmallerTileTypeOrEndsEarlier = [](LiveRange *a, LiveRange *b) {
-    return !isTileTypeGreaterOrEqual(a->getTileType(), b->getTileType()) ||
-           a->end() < b->end();
+  auto isSmallerTileTypeOrEndsEarlier = [](LiveRange &a, LiveRange &b) {
+    return !isTileTypeGreaterOrEqual(a.getTileType(), b.getTileType()) ||
+           a.end() < b.end();
   };
-  LiveRange *lastActiveLiveRange = *std::max_element(
-      activeRanges.begin(), activeRanges.end(), isSmallerTileTypeOrEndsEarlier);
-  if (!isSmallerTileTypeOrEndsEarlier(lastActiveLiveRange, newRange))
-    return lastActiveLiveRange;
+  LiveRange &latestEndingLiveRange =
+      *std::max_element(overlappingRanges.begin(), overlappingRanges.end(),
+                        isSmallerTileTypeOrEndsEarlier);
+  if (!isSmallerTileTypeOrEndsEarlier(latestEndingLiveRange, *newRange))
+    return &latestEndingLiveRange;
   return newRange;
 }
 
 /// Greedily allocate tile IDs to live ranges. Spill using simple heuristics.
-/// Note: This does not attempt to fill holes in active live ranges.
 void allocateTilesToLiveRanges(
     ArrayRef<LiveRange *> liveRangesSortedByStartPoint) {
   TileAllocator tileAllocator;
+  // `activeRanges` = Live ranges that need to be in a tile at the
+  // `currentPoint` in the program.
   SetVector<LiveRange *> activeRanges;
+  // `inactiveRanges` = Live ranges that _do not_ need to be in a tile
+  // at the `currentPoint` in the program but could become active again later.
+  // An inactive section of a live range can be seen as a 'hole' in the live
+  // range, where it is possible to reuse the live range's tile ID _before_ it
+  // has ended. By identifying 'holes', the allocator can reuse tiles more
+  // often, which helps avoid costly tile spills.
+  SetVector<LiveRange *> inactiveRanges;
   for (LiveRange *nextRange : liveRangesSortedByStartPoint) {
-    // Release tile IDs from live ranges that have ended.
+    auto currentPoint = nextRange->start();
+    // 1. Update the `activeRanges` at `currentPoint`.
     activeRanges.remove_if([&](LiveRange *activeRange) {
-      if (activeRange->end() <= nextRange->start()) {
+      // Check for live ranges that have expired.
+      if (activeRange->end() <= currentPoint) {
         tileAllocator.releaseTileId(activeRange->getTileType(),
                                     *activeRange->tileId);
         return true;
       }
+      // Check for live ranges that have become inactive.
+      if (!activeRange->overlaps(currentPoint)) {
+        tileAllocator.releaseTileId(activeRange->getTileType(),
+                                    *activeRange->tileId);
+        inactiveRanges.insert(activeRange);
+        return true;
+      }
       return false;
     });
+    // 2. Update the `inactiveRanges` at `currentPoint`.
+    inactiveRanges.remove_if([&](LiveRange *inactiveRange) {
+      // Check for live ranges that have expired.
+      if (inactiveRange->end() <= currentPoint) {
+        return true;
+      }
+      // Check for live ranges that have become active.
+      if (inactiveRange->overlaps(currentPoint)) {
+        tileAllocator.acquireTileId(inactiveRange->getTileType(),
+                                    *inactiveRange->tileId);
+        activeRanges.insert(inactiveRange);
+        return true;
+      }
+      return false;
+    });
+
+    // 3. Collect inactive live ranges that overlap with the new live range.
+    // Note: The overlap checks in steps 1 and 2 only look at the `currentPoint`
+    // whereas this checks if there is an overlap at any future point too.
+    SmallVector<LiveRange *> overlappingInactiveRanges;
+    for (LiveRange *inactiveRange : inactiveRanges) {
+      if (inactiveRange->overlaps(*nextRange)) {
+        // We need to reserve the tile IDs of overlapping inactive ranges to
+        // prevent two (overlapping) live ranges from getting the same tile ID.
+        tileAllocator.acquireTileId(inactiveRange->getTileType(),
+                                    *inactiveRange->tileId);
+        overlappingInactiveRanges.push_back(inactiveRange);
+      }
+    }
 
-    // Allocate a tile ID to `nextRange`.
+    // 4. Allocate a tile ID to `nextRange`.
     auto rangeTileType = nextRange->getTileType();
     auto tileId = tileAllocator.allocateTileId(rangeTileType);
     if (succeeded(tileId)) {
       nextRange->tileId = *tileId;
     } else {
+      // Create an iterator over all overlapping live ranges.
+      auto allOverlappingRanges = llvm::concat<LiveRange>(
+          llvm::make_pointee_range(activeRanges.getArrayRef()),
+          llvm::make_pointee_range(overlappingInactiveRanges));
+      // Choose an overlapping live range to spill.
       LiveRange *rangeToSpill =
-          chooseSpillUsingHeuristics(activeRanges.getArrayRef(), nextRange);
+          chooseSpillUsingHeuristics(allOverlappingRanges, nextRange);
       if (rangeToSpill != nextRange) {
-        // Spill an active live range (so release its tile ID first).
+        // Spill an (in)active live range (so release its tile ID first).
         tileAllocator.releaseTileId(rangeToSpill->getTileType(),
                                     *rangeToSpill->tileId);
-        activeRanges.remove(rangeToSpill);
         // This will always succeed after a spill (of an active live range).
         nextRange->tileId = *tileAllocator.allocateTileId(rangeTileType);
+        // Remove the live range from the active/inactive sets.
+        if (!activeRanges.remove(rangeToSpill)) {
+          bool removed = inactiveRanges.remove(rangeToSpill);
+          assert(removed && "expected a range to be removed!");
+          (void)removed;
+        }
       }
       rangeToSpill->tileId = tileAllocator.allocateInMemoryTileId();
     }
 
-    // Insert the live range into the active ranges.
+    // 5. Insert the live range into the active ranges.
     if (nextRange->tileId < kInMemoryTileIdBase)
       activeRanges.insert(nextRange);
+
+    // 6. Release tiles reserved for inactive live ranges (in step 3).
+    for (LiveRange *range : overlappingInactiveRanges) {
+      if (*range->tileId < kInMemoryTileIdBase)
+        tileAllocator.releaseTileId(range->getTileType(), *range->tileId);
+    }
   }
 }
 
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 96dad6518fec8..53df7af00aee8 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -18,10 +18,13 @@
 #include "mlir/Dialect/ArmSME/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/OneToNFuncConversions.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 
 #define DEBUG_TYPE "arm-sme-vector-legalization"
@@ -140,11 +143,11 @@ Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
 auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
                          VectorType smeTileType,
                          bool transposeIndices = false) {
-  assert(isMultipleOfSMETileVectorType(type) &&
-         "`type` not multiple of SME tiles");
   return llvm::map_range(
-      StaticTileOffsetRange(type.getShape(), {smeTileType.getDimSize(0),
-                                              smeTileType.getDimSize(1)}),
+      StaticTileOffsetRange(
+          type.getShape(),
+          {std::min(type.getDimSize(0), smeTileType.getDimSize(0)),
+           std::min(type.getDimSize(1), smeTileType.getDimSize(1))}),
       [=](auto indices) {
         int row = int(indices[0]);
         int col = int(indices[1]);
@@ -440,12 +443,8 @@ struct LegalizeMultiTileTransferWriteAsStoreLoop
                                          kMatchFailureUnsupportedMaskOp);
 
     auto loc = writeOp.getLoc();
-    auto vscale = rewriter.create<vector::VectorScaleOp>(loc);
-    auto createVscaleMultiple = [&](int64_t multiplier) {
-      return rewriter.create<arith::MulIOp>(
-          loc, vscale,
-          rewriter.create<arith::ConstantIndexOp>(loc, multiplier));
-    };
+    auto createVscaleMultiple =
+        vector::makeVscaleConstantBuilder(rewriter, loc);
 
     // Get SME tile and slice types.
     auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
@@ -549,7 +548,7 @@ struct FoldExtractFromVectorOfSMELikeCreateMasks
       return rewriter.notifyMatchFailure(extractOp,
                                          "extracted type is not a vector type");
 
-    auto numScalable = llvm::count(extractedMaskType.getScalableDims(), true);
+    auto numScalable = extractedMaskType.getNumScalableDims();
     if (numScalable != 2)
       return rewriter.notifyMatchFailure(
           extractOp, "expected extracted type to be an SME-like mask");
@@ -775,6 +774,237 @@ struct ConvertIllegalShapeCastOpsToTransposes
   }
 };
 
+/// Returns an iterator over the dims (inc scalability) of a VectorType.
+static auto getDims(VectorType vType) {
+  return llvm::zip_equal(vType.getShape(), vType.getScalableDims());
+}
+
+/// Helper to drop (fixed-size) unit dims from a VectorType.
+static VectorType dropUnitDims(VectorType vType) {
+  SmallVector<bool> scalableFlags;
+  SmallVector<int64_t> dimSizes;
+  for (auto dim : getDims(vType)) {
+    if (dim == std::make_tuple(1, false))
+      continue;
+    auto [size, scalableFlag] = dim;
+    dimSizes.push_back(size);
+    scalableFlags.push_back(scalableFlag);
+  }
+  return VectorType::get(dimSizes, vType.getElementType(), scalableFlags);
+}
+
+/// A pattern to swap shape_cast(tranpose) with transpose(shape_cast) if the
+/// shape_cast only drops unit dimensions.
+///
+/// This simplifies the transpose making it possible for other legalization
+/// rewrites to handle it.
+///
+/// Example:
+///
+///  BEFORE:
+///  ```mlir
+///  %0 = vector.transpose %vector, [3, 0, 1, 2]
+///         : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+///  %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+///  ```
+///
+///  AFTER:
+///  ```mlir
+///  %0 = vector.shape_cast %arg0 : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+///  %1 = vector.transpose %0, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+///  ```
+struct SwapShapeCastOfTranspose : public OpRewritePattern<vector::ShapeCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
+                                PatternRewriter &rewriter) const override {
+    auto transposeOp =
+        shapeCastOp.getSource().getDefiningOp<vector::TransposeOp>();
+    if (!transposeOp)
+      return rewriter.notifyMatchFailure(shapeCastOp, "not TransposeOp");
+
+    auto resultType = shapeCastOp.getResultVectorType();
+    if (resultType.getRank() <= 1)
+      return rewriter.notifyMatchFailure(shapeCastOp, "result rank too low");
+
+    if (resultType != dropUnitDims(shapeCastOp.getSourceVectorType()))
+      return rewriter.notifyMatchFailure(
+          shapeCastOp, "ShapeCastOp changes non-unit dimension(s)");
+
+    auto transposeSourceVectorType = transposeOp.getSourceVectorType();
+    auto transposeSourceDims =
+        llvm::to_vector(getDims(transposeSourceVectorType));
+
+    // Construct a map from dimIdx -> number of dims dropped before dimIdx.
+    SmallVector<int64_t> droppedDimsBefore(transposeSourceVectorType.getRank());
+    int64_t droppedDims = 0;
+    for (auto [i, dim] : llvm::enumerate(transposeSourceDims)) {
+      droppedDimsBefore[i] = droppedDims;
+      if (dim == std::make_tuple(1, false))
+        ++droppedDims;
+    }
+
+    // Drop unit dims from transpose permutation.
+    auto perm = transposeOp.getPermutation();
+    SmallVector<int64_t> newPerm;
+    for (int64_t idx : perm) {
+      if (transposeSourceDims[idx] == std::make_tuple(1, false))
+        continue;
+      newPerm.push_back(idx - droppedDimsBefore[idx]);
+    }
+
+    auto loc = shapeCastOp.getLoc();
+    auto newShapeCastOp = rewriter.create<vector::ShapeCastOp>(
+        loc, dropUnitDims(transposeSourceVectorType), transposeOp.getVector());
+    rewriter.replaceOpWithNewOp<vector::TransposeOp>(shapeCastOp,
+                                                     newShapeCastOp, newPerm);
+    return success();
+  }
+};
+
+/// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use
+/// the ZA state. This workaround rewrite to support these transposes when ZA is
+/// available.
+///
+/// Example:
+///
+///  BEFORE:
+///  ```mlir
+///  %transpose = vector.transpose %vec, [1, 0]
+///     : vector<2x[4]xf32> to vector<[4]x2xf32>
+///  vector.transfer_write %transpose, %dest[%y, %x]
+///     : vector<[4]x2xf32>,  memref<?x?xf32>
+///  ```
+///
+///  AFTER:
+///  ```mlir
+///   %0 = arm_sme.get_tile : vector<[4]x[4]xf32>
+///   %1 = vector.extract %vec[0] : vector<[4]xf32> from vector<2x[4]xf32>
+///   %2 = vector.insert %1, %0 [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
+///   %3 = vector.extract %vec[1] : vector<[4]xf32> from vector<2x[4]xf32>
+///   %4 = vector.insert %3, %2 [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+///   %c4_vscale = arith.muli %vscale, %c4 : index
+///   %mask = vector.create_mask %c4_vscale, %c2 : vector<[4]x[4]xi1>
+///   vector.transfer_write %4, %dest[%y, %x], %mask
+///      {permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
+///      : vector<[4]x[4]xf32>, memref<?x?xf32>
+///  ```
+///
+/// Values larger than a single tile are supported via decomposition.
+struct LowerIllegalTransposeStoreViaZA
+    : public OpRewritePattern<vector::TransferWriteOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
+                                PatternRewriter &rewriter) const override {
+    if (!isSupportedMaskOp(writeOp.getMask()))
+      return rewriter.notifyMatchFailure(writeOp,
+                                         kMatchFailureUnsupportedMaskOp);
+
+    auto permutationMap = writeOp.getPermutationMap();
+    if (!permutationMap.isIdentity())
+      return rewriter.notifyMatchFailure(writeOp,
+                                         kMatchFailureNonPermutationMap);
+
+    auto transposeOp = writeOp.getVector().getDefiningOp<vector::TransposeOp>();
+    if (!transposeOp)
+      return failure();
+
+    auto sourceType = transposeOp.getSourceVectorType();
+    auto resultType = transposeOp.getResultVectorType();
+
+    if (resultType.getRank() != 2)
+      return rewriter.notifyMatchFailure(transposeOp, "TransposeOp not rank 2");
+
+    if (!isLegalVectorType(sourceType) || isLegalVectorType(resultType))
+      return rewriter.notifyMatchFailure(
+          transposeOp, "not illegal/unsupported SVE transpose");
+
+    auto smeTileType = getSMETileTypeForElement(resultType.getElementType());
+    VectorType smeSliceType = VectorType::Builder(smeTileType).dropDim(0);
+
+    if (sourceType.getDimSize(0) <= 1 ||
+        sourceType.getDimSize(1) % smeSliceType.getDimSize(0) != 0)
+      return rewriter.notifyMatchFailure(writeOp, "unsupported source shape");
+
+    auto loc = writeOp.getLoc();
+    auto createVscaleMultiple =
+        vector::makeVscaleConstantBuilder(rewriter, loc);
+
+    auto transposeMap = AffineMapAttr::get(
+        AffineMap::getPermutationMap(ArrayRef<int64_t>{1, 0}, getContext()));
+
+    // Note: We need to use `get_tile` as there's no vector-level `undef`.
+    Value undefTile = rewriter.create<arm_sme::GetTileOp>(loc, smeTileType);
+    Value destTensorOrMemref = writeOp.getSource();
+    auto numSlicesPerTile =
+        std::min(sourceType.getDimSize(0), smeTileType.getDimSize(0));
+    auto numSlices =
+        rewriter.create<arith::ConstantIndexOp>(loc, numSlicesPerTile);
+    for (auto [index, smeTile] : llvm::enumerate(
+             decomposeToSMETiles(rewriter, sourceType, smeTileType))) {
+      // 1. _Deliberately_ drop a scalable dimension and insert a fixed number
+      // of slices from the source type into the SME tile. Without checking
+      // vscale (and emitting multiple implementations) we can't make use of the
+      // rows of the tile after 1*vscale rows.
+      Value tile = undefTile;
+      for (int d = 0; d < numSlicesPerTile; ++d) {
+        Value vector = rewriter.create<vector::ExtractOp>(
+            loc, transposeOp.getVector(),
+            rewriter.getIndexAttr(d + smeTile.row));
+        if (vector.getType() != smeSliceType) {
+          vector = rewriter.create<vector::ScalableExtractOp>(
+              loc, smeSliceType, vector, smeTile.col);
+        }
+        tile = rewriter.create<vector::InsertOp>(loc, vector, tile, d);
+      }
+
+      // 2. Transpose the tile position.
+      auto transposedRow = createVscaleMultiple(smeTile.col);
+      auto transposedCol =
+          rewriter.create<arith::ConstantIndexOp>(loc, smeTile.row);
+
+      // 3. Compute mask for tile store.
+      Value maskRows;
+      Value maskCols;
+      if (auto mask = writeOp.getMask()) {
+        auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
+        maskRows = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(0),
+                                                  transposedRow);
+        maskCols = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(1),
+                                                  transposedCol);
+        maskCols = rewriter.create<index::MinSOp>(loc, maskCols, numSlices);
+      } else {
+        maskRows = createVscaleMultiple(smeTileType.getDimSize(0));
+        maskCols = numSlices;
+      }
+      auto subMask = rewriter.create<vector::CreateMaskOp>(
+          loc, smeTileType.clone(rewriter.getI1Type()),
+          ValueRange{maskRows, maskCols});
+
+      // 4. Emit a transposed tile write.
+      auto writeIndices = writeOp.getIndices();
+      Value destRow =
+          rewriter.create<arith::AddIOp>(loc, transposedRow, writeIndices[0]);
+      Value destCol =
+          rewriter.create<arith::AddIOp>(loc, transposedCol, writeIndices[1]);
+      auto smeWrite = rewriter.create<vector::TransferWriteOp>(
+          loc, tile, destTensorOrMemref, ValueRange{destRow, destCol},
+          transposeMap, subMask, writeOp.getInBounds());
+
+      if (writeOp.hasPureTensorSemantics())
+        destTensorOrMemref = smeWrite.getResult();
+    }
+
+    if (writeOp.hasPureTensorSemantics())
+      rewriter.replaceOp(writeOp, destTensorOrMemref);
+    else
+      rewriter.eraseOp(writeOp);
+
+    return success();
+  }
+};
+
 struct VectorLegalizationPass
     : public arm_sme::impl::VectorLegalizationBase<VectorLegalizationPass> {
   void runOnOperation() override {
@@ -796,7 +1026,9 @@ struct VectorLegalizationPass
 
     patterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
                  LiftIllegalVectorTransposeToMemory,
-                 ConvertIllegalShapeCastOpsToTransposes>(context);
+                 ConvertIllegalShapeCastOpsToTransposes,
+                 SwapShapeCastOfTranspose, LowerIllegalTransposeStoreViaZA>(
+        context);
     // Note: These two patterns are added with a high benefit to ensure:
     //  - Masked outer products are handled before unmasked ones
     //  - Multi-tile writes are lowered as a store loop (if possible)
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
index 5227b22653eef..954485cfede3d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
@@ -463,15 +463,10 @@ struct BufferDeallocationSimplificationPass
                  SplitDeallocWhenNotAliasingAnyOther,
                  RetainedMemrefAliasingAlwaysDeallocatedMemref>(&getContext(),
                                                                 analysis);
-    // We don't want that the block structure changes invalidating the
-    // `BufferOriginAnalysis` so we apply the rewrites witha `Normal` level of
-    // region simplification
-    GreedyRewriteConfig config;
-    config.enableRegionSimplification = GreedySimplifyRegionLevel::Normal;
     populateDeallocOpCanonicalizationPatterns(patterns, &getContext());
 
-    if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
-                                            config)))
+    if (failed(
+            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
       signalPassFailure();
   }
 };
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
index 016ec2be62dce..d86bdb20a66bb 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
@@ -71,6 +71,14 @@ LogicalResult
 mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) {
   IRRewriter rewriter(module.getContext());
 
+  DenseMap<func::FuncOp, DenseSet<func::CallOp>> callerMap;
+  // Collect the mapping of functions to their call sites.
+  module.walk([&](func::CallOp callOp) {
+    if (func::FuncOp calledFunc = getCalledFunction(callOp)) {
+      callerMap[calledFunc].insert(callOp);
+    }
+  });
+
   for (auto funcOp : module.getOps<func::FuncOp>()) {
     if (funcOp.isExternal())
       continue;
@@ -109,10 +117,7 @@ mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) {
     returnOp.getOperandsMutable().assign(newReturnValues);
 
     // Update function calls.
-    module.walk([&](func::CallOp callOp) {
-      if (getCalledFunction(callOp) != funcOp)
-        return WalkResult::skip();
-
+    for (func::CallOp callOp : callerMap[funcOp]) {
       rewriter.setInsertionPoint(callOp);
       auto newCallOp = rewriter.create<func::CallOp>(callOp.getLoc(), funcOp,
                                                      callOp.getOperands());
@@ -136,8 +141,7 @@ mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) {
         newResults.push_back(replacement);
       }
       rewriter.replaceOp(callOp, newResults);
-      return WalkResult::advance();
-    });
+    }
   }
 
   return success();
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 9372caf6e32a7..90610118a45cd 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -948,6 +948,11 @@ static SmallVector<Type, 1> getCallOpResultTypes(LLVMFunctionType calleeType) {
   return results;
 }
 
+/// Gets the variadic callee type for a LLVMFunctionType.
+static TypeAttr getCallOpVarCalleeType(LLVMFunctionType calleeType) {
+  return calleeType.isVarArg() ? TypeAttr::get(calleeType) : nullptr;
+}
+
 /// Constructs a LLVMFunctionType from MLIR `results` and `args`.
 static LLVMFunctionType getLLVMFuncType(MLIRContext *context, TypeRange results,
                                         ValueRange args) {
@@ -974,9 +979,11 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
                    FlatSymbolRefAttr callee, ValueRange args) {
   assert(callee && "expected non-null callee in direct call builder");
   build(builder, state, results,
-        TypeAttr::get(getLLVMFuncType(builder.getContext(), results, args)),
-        callee, args, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
+        /*var_callee_type=*/nullptr, callee, args, /*fastmathFlags=*/nullptr,
+        /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
+        /*memory_effects=*/nullptr,
+        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -997,18 +1004,24 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
                    LLVMFunctionType calleeType, FlatSymbolRefAttr callee,
                    ValueRange args) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        TypeAttr::get(calleeType), callee, args, /*fastmathFlags=*/nullptr,
+        getCallOpVarCalleeType(calleeType), callee, args,
+        /*fastmathFlags=*/nullptr,
         /*branch_weights=*/nullptr, /*CConv=*/nullptr,
-        /*TailCallKind=*/nullptr, /*access_groups=*/nullptr,
+        /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
+        /*convergent=*/nullptr,
+        /*no_unwind=*/nullptr, /*will_return=*/nullptr,
+        /*access_groups=*/nullptr,
         /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
 
 void CallOp::build(OpBuilder &builder, OperationState &state,
                    LLVMFunctionType calleeType, ValueRange args) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        TypeAttr::get(calleeType), /*callee=*/nullptr, args,
+        getCallOpVarCalleeType(calleeType),
+        /*callee=*/nullptr, args,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
-        /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
+        /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
+        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1017,9 +1030,10 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
                    ValueRange args) {
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
-        TypeAttr::get(calleeType), SymbolRefAttr::get(func), args,
+        getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), args,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
-        /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
+        /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
+        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1076,9 +1090,49 @@ static LogicalResult verifyCallOpDebugInfo(CallOp callOp, LLVMFuncOp callee) {
   return success();
 }
 
+/// Verify that the parameter and return types of the variadic callee type match
+/// the `callOp` argument and result types.
+template <typename OpTy>
+LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
+  std::optional<LLVMFunctionType> varCalleeType = callOp.getVarCalleeType();
+  if (!varCalleeType)
+    return success();
+
+  // Verify the variadic callee type is a variadic function type.
+  if (!varCalleeType->isVarArg())
+    return callOp.emitOpError(
+        "expected var_callee_type to be a variadic function type");
+
+  // Verify the variadic callee type has at most as many parameters as the call
+  // has argument operands.
+  if (varCalleeType->getNumParams() > callOp.getArgOperands().size())
+    return callOp.emitOpError("expected var_callee_type to have at most ")
+           << callOp.getArgOperands().size() << " parameters";
+
+  // Verify the variadic callee type matches the call argument types.
+  for (auto [paramType, operand] :
+       llvm::zip(varCalleeType->getParams(), callOp.getArgOperands()))
+    if (paramType != operand.getType())
+      return callOp.emitOpError()
+             << "var_callee_type parameter type mismatch: " << paramType
+             << " != " << operand.getType();
+
+  // Verify the variadic callee type matches the call result type.
+  if (!callOp.getNumResults()) {
+    if (!isa<LLVMVoidType>(varCalleeType->getReturnType()))
+      return callOp.emitOpError("expected var_callee_type to return void");
+  } else {
+    if (callOp.getResult().getType() != varCalleeType->getReturnType())
+      return callOp.emitOpError("var_callee_type return type mismatch: ")
+             << varCalleeType->getReturnType()
+             << " != " << callOp.getResult().getType();
+  }
+  return success();
+}
+
 LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
-  if (getNumResults() > 1)
-    return emitOpError("must have 0 or 1 result");
+  if (failed(verifyCallOpVarCalleeType(*this)))
+    return failure();
 
   // Type for the callee, we'll get it differently depending if it is a direct
   // or indirect call.
@@ -1120,8 +1174,8 @@ LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   if (!funcType)
     return emitOpError("callee does not have a functional type: ") << fnType;
 
-  if (funcType.isVarArg() && !getCalleeType())
-    return emitOpError() << "missing callee type attribute for vararg call";
+  if (funcType.isVarArg() && !getVarCalleeType())
+    return emitOpError() << "missing var_callee_type attribute for vararg call";
 
   // Verify that the operand and result types match the callee.
 
@@ -1168,21 +1222,13 @@ void CallOp::print(OpAsmPrinter &p) {
   auto callee = getCallee();
   bool isDirect = callee.has_value();
 
-  LLVMFunctionType calleeType;
-  bool isVarArg = false;
-
-  if (std::optional<LLVMFunctionType> optionalCalleeType = getCalleeType()) {
-    calleeType = *optionalCalleeType;
-    isVarArg = calleeType.isVarArg();
-  }
-
   p << ' ';
 
   // Print calling convention.
   if (getCConv() != LLVM::CConv::C)
     p << stringifyCConv(getCConv()) << ' ';
 
-  if(getTailCallKind() != LLVM::TailCallKind::None)
+  if (getTailCallKind() != LLVM::TailCallKind::None)
     p << tailcallkind::stringifyTailCallKind(getTailCallKind()) << ' ';
 
   // Print the direct callee if present as a function attribute, or an indirect
@@ -1195,12 +1241,13 @@ void CallOp::print(OpAsmPrinter &p) {
   auto args = getOperands().drop_front(isDirect ? 0 : 1);
   p << '(' << args << ')';
 
-  if (isVarArg)
-    p << " vararg(" << calleeType << ")";
+  // Print the variadic callee type if the call is variadic.
+  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
+    p << " vararg(" << *varCalleeType << ")";
 
   p.printOptionalAttrDict(processFMFAttr((*this)->getAttrs()),
-                          {getCConvAttrName(), "callee", "callee_type",
-                           getTailCallKindAttrName()});
+                          {getCalleeAttrName(), getTailCallKindAttrName(),
+                           getVarCalleeTypeAttrName(), getCConvAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1270,11 +1317,11 @@ static ParseResult parseOptionalCallFuncPtr(
 
 // <operation> ::= `llvm.call` (cconv)? (tailcallkind)? (function-id | ssa-use)
 //                             `(` ssa-use-list `)`
-//                             ( `vararg(` var-arg-func-type `)` )?
+//                             ( `vararg(` var-callee-type `)` )?
 //                             attribute-dict? `:` (type `,`)? function-type
 ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
   SymbolRefAttr funcAttr;
-  TypeAttr calleeType;
+  TypeAttr varCalleeType;
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
 
   // Default to C Calling Convention if no keyword is provided.
@@ -1305,8 +1352,12 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
 
   bool isVarArg = parser.parseOptionalKeyword("vararg").succeeded();
   if (isVarArg) {
+    StringAttr varCalleeTypeAttrName =
+        CallOp::getVarCalleeTypeAttrName(result.name);
     if (parser.parseLParen().failed() ||
-        parser.parseAttribute(calleeType, "callee_type", result.attributes)
+        parser
+            .parseAttribute(varCalleeType, varCalleeTypeAttrName,
+                            result.attributes)
             .failed() ||
         parser.parseRParen().failed())
       return failure();
@@ -1320,8 +1371,8 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
 }
 
 LLVMFunctionType CallOp::getCalleeFunctionType() {
-  if (getCalleeType())
-    return *getCalleeType();
+  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
+    return *varCalleeType;
   return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands());
 }
 
@@ -1334,8 +1385,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
                      Block *unwind, ValueRange unwindOps) {
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
-        TypeAttr::get(calleeType), SymbolRefAttr::get(func), ops, normalOps,
-        unwindOps, nullptr, nullptr, normal, unwind);
+        getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
+        normalOps, unwindOps, nullptr, nullptr, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1343,8 +1394,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
                      ValueRange normalOps, Block *unwind,
                      ValueRange unwindOps) {
   build(builder, state, tys,
-        TypeAttr::get(getLLVMFuncType(builder.getContext(), tys, ops)), callee,
-        ops, normalOps, unwindOps, nullptr, nullptr, normal, unwind);
+        /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
+        nullptr, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1352,8 +1403,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
                      ValueRange ops, Block *normal, ValueRange normalOps,
                      Block *unwind, ValueRange unwindOps) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        TypeAttr::get(calleeType), callee, ops, normalOps, unwindOps, nullptr,
-        nullptr, normal, unwind);
+        getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
+        nullptr, nullptr, normal, unwind);
 }
 
 SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1390,8 +1441,8 @@ MutableOperandRange InvokeOp::getArgOperandsMutable() {
 }
 
 LogicalResult InvokeOp::verify() {
-  if (getNumResults() > 1)
-    return emitOpError("must have 0 or 1 result");
+  if (failed(verifyCallOpVarCalleeType(*this)))
+    return failure();
 
   Block *unwindDest = getUnwindDest();
   if (unwindDest->empty())
@@ -1409,14 +1460,6 @@ void InvokeOp::print(OpAsmPrinter &p) {
   auto callee = getCallee();
   bool isDirect = callee.has_value();
 
-  LLVMFunctionType calleeType;
-  bool isVarArg = false;
-
-  if (std::optional<LLVMFunctionType> optionalCalleeType = getCalleeType()) {
-    calleeType = *optionalCalleeType;
-    isVarArg = calleeType.isVarArg();
-  }
-
   p << ' ';
 
   // Print calling convention.
@@ -1435,12 +1478,13 @@ void InvokeOp::print(OpAsmPrinter &p) {
   p << " unwind ";
   p.printSuccessorAndUseList(getUnwindDest(), getUnwindDestOperands());
 
-  if (isVarArg)
-    p << " vararg(" << calleeType << ")";
+  // Print the variadic callee type if the invoke is variadic.
+  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
+    p << " vararg(" << *varCalleeType << ")";
 
   p.printOptionalAttrDict((*this)->getAttrs(),
-                          {InvokeOp::getOperandSegmentSizeAttr(), "callee",
-                           "callee_type", InvokeOp::getCConvAttrName()});
+                          {getCalleeAttrName(), getOperandSegmentSizeAttr(),
+                           getCConvAttrName(), getVarCalleeTypeAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1453,12 +1497,12 @@ void InvokeOp::print(OpAsmPrinter &p) {
 //                  `(` ssa-use-list `)`
 //                  `to` bb-id (`[` ssa-use-and-type-list `]`)?
 //                  `unwind` bb-id (`[` ssa-use-and-type-list `]`)?
-//                  ( `vararg(` var-arg-func-type `)` )?
+//                  ( `vararg(` var-callee-type `)` )?
 //                  attribute-dict? `:` (type `,`)? function-type
 ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
   SmallVector<OpAsmParser::UnresolvedOperand, 8> operands;
   SymbolRefAttr funcAttr;
-  TypeAttr calleeType;
+  TypeAttr varCalleeType;
   Block *normalDest, *unwindDest;
   SmallVector<Value, 4> normalOperands, unwindOperands;
   Builder &builder = parser.getBuilder();
@@ -1488,8 +1532,12 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
 
   bool isVarArg = parser.parseOptionalKeyword("vararg").succeeded();
   if (isVarArg) {
+    StringAttr varCalleeTypeAttrName =
+        InvokeOp::getVarCalleeTypeAttrName(result.name);
     if (parser.parseLParen().failed() ||
-        parser.parseAttribute(calleeType, "callee_type", result.attributes)
+        parser
+            .parseAttribute(varCalleeType, varCalleeTypeAttrName,
+                            result.attributes)
             .failed() ||
         parser.parseRParen().failed())
       return failure();
@@ -1515,8 +1563,8 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
 }
 
 LLVMFunctionType InvokeOp::getCalleeFunctionType() {
-  if (getCalleeType())
-    return *getCalleeType();
+  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
+    return *varCalleeType;
   return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands());
 }
 
@@ -3132,7 +3180,6 @@ void LLVMDialect::initialize() {
   // clang-format off
   addTypes<LLVMVoidType,
            LLVMPPCFP128Type,
-           LLVMX86MMXType,
            LLVMTokenType,
            LLVMLabelType,
            LLVMMetadataType,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
index ddf36ce6c715c..137c1962b100a 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
@@ -658,7 +658,7 @@ static Value handleByValArgument(OpBuilder &builder, Operation *callable,
                                  Value argument, Type elementType,
                                  uint64_t requestedAlignment) {
   auto func = cast<LLVM::LLVMFuncOp>(callable);
-  LLVM::MemoryEffectsAttr memoryEffects = func.getMemoryAttr();
+  LLVM::MemoryEffectsAttr memoryEffects = func.getMemoryEffectsAttr();
   // If there is no memory effects attribute, assume that the function is
   // not read-only.
   bool isReadOnly = memoryEffects &&
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
index 9810d4d643677..c4708d826f2b3 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
@@ -35,7 +35,6 @@ static StringRef getTypeKeyword(Type type) {
   return TypeSwitch<Type, StringRef>(type)
       .Case<LLVMVoidType>([&](Type) { return "void"; })
       .Case<LLVMPPCFP128Type>([&](Type) { return "ppc_fp128"; })
-      .Case<LLVMX86MMXType>([&](Type) { return "x86_mmx"; })
       .Case<LLVMTokenType>([&](Type) { return "token"; })
       .Case<LLVMLabelType>([&](Type) { return "label"; })
       .Case<LLVMMetadataType>([&](Type) { return "metadata"; })
@@ -309,7 +308,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) {
   return StringSwitch<function_ref<Type()>>(key)
       .Case("void", [&] { return LLVMVoidType::get(ctx); })
       .Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); })
-      .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); })
       .Case("token", [&] { return LLVMTokenType::get(ctx); })
       .Case("label", [&] { return LLVMLabelType::get(ctx); })
       .Case("metadata", [&] { return LLVMMetadataType::get(ctx); })
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index cf3f38b710130..dc7aef8ef7f85 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -780,8 +780,7 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) {
       LLVMFixedVectorType,
       LLVMScalableVectorType,
       LLVMTargetExtType,
-      LLVMVoidType,
-      LLVMX86MMXType
+      LLVMVoidType
     >(type)) {
     // clang-format on
     return true;
@@ -843,8 +842,7 @@ static bool isCompatibleImpl(Type type, DenseSet<Type> &compatibleTypes) {
             LLVMMetadataType,
             LLVMPPCFP128Type,
             LLVMTokenType,
-            LLVMVoidType,
-            LLVMX86MMXType
+            LLVMVoidType
           >([](Type) { return true; })
           // clang-format on
           .Default([](Type) { return false; });
@@ -986,8 +984,7 @@ llvm::TypeSize mlir::LLVM::getPrimitiveTypeSizeInBits(Type type) {
       .Case<BFloat16Type, Float16Type>(
           [](Type) { return llvm::TypeSize::getFixed(16); })
       .Case<Float32Type>([](Type) { return llvm::TypeSize::getFixed(32); })
-      .Case<Float64Type, LLVMX86MMXType>(
-          [](Type) { return llvm::TypeSize::getFixed(64); })
+      .Case<Float64Type>([](Type) { return llvm::TypeSize::getFixed(64); })
       .Case<Float80Type>([](Type) { return llvm::TypeSize::getFixed(80); })
       .Case<Float128Type>([](Type) { return llvm::TypeSize::getFixed(128); })
       .Case<IntegerType>([](IntegerType intTy) {
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index cefaad9b22653..99b625d99fec2 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -879,15 +879,66 @@ struct FoldFillWithTranspose : OpRewritePattern<linalg::TransposeOp> {
   }
 };
 
+/// Fold a concat with all elements being fills of the same value
+/// into a fill of the concat result shape.
+struct FoldConcatsOfFill : public OpRewritePattern<tensor::ConcatOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ConcatOp concatOp,
+                                PatternRewriter &rewriter) const override {
+    auto concatOperands = concatOp.getInputs();
+    if (concatOperands.empty()) {
+      return failure();
+    }
+
+    auto firstFillOp = concatOperands.front().getDefiningOp<linalg::FillOp>();
+    if (!firstFillOp) {
+      return failure();
+    }
+    // Prefetch the fill value.
+    OpFoldResult firstFillVal =
+        getAsOpFoldResult(firstFillOp.getDpsInputOperand(0)->get());
+    // Collect all the outs values for the fill operations.
+    SmallVector<Value> allOuts;
+    allOuts.push_back(firstFillOp.getDpsInitOperand(0)->get());
+
+    auto isDefinedByCompatibleFillOp = [&](Value v) -> bool {
+      auto fillOp = v.getDefiningOp<linalg::FillOp>();
+      if (!fillOp) {
+        return false;
+      }
+
+      OpFoldResult fillVal =
+          getAsOpFoldResult(fillOp.getDpsInputOperand(0)->get());
+      if (fillVal != firstFillVal)
+        return false;
+
+      allOuts.push_back(fillOp.getDpsInitOperand(0)->get());
+      return true;
+    };
+    if (!llvm::all_of(concatOperands.drop_front(),
+                      isDefinedByCompatibleFillOp)) {
+      return rewriter.notifyMatchFailure(
+          concatOp, "not all operands are defined by a compatible fill op");
+    }
+
+    Value outsConcat = rewriter.create<tensor::ConcatOp>(
+        concatOp.getLoc(), concatOp.getDim(), allOuts);
+    rewriter.replaceOpWithNewOp<linalg::FillOp>(
+        concatOp, firstFillOp.getDpsInputOperand(0)->get(), outsConcat);
+    return success();
+  }
+};
+
 } // namespace
 
 void FillOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                          MLIRContext *context) {
-  results
-      .add<FoldFillWithCopy, FoldFillWithTensorExtract, FoldFillWithPack,
-           FoldFillWithPad, FoldFillWithTensorReshape<tensor::CollapseShapeOp>,
-           FoldFillWithTensorReshape<tensor::ExpandShapeOp>,
-           FoldInsertPadIntoFill, FoldFillWithTranspose>(context);
+  results.add<FoldConcatsOfFill, FoldFillWithCopy, FoldFillWithTensorExtract,
+              FoldFillWithPack, FoldFillWithPad,
+              FoldFillWithTensorReshape<tensor::CollapseShapeOp>,
+              FoldFillWithTensorReshape<tensor::ExpandShapeOp>,
+              FoldInsertPadIntoFill, FoldFillWithTranspose>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1895,9 +1946,68 @@ struct FoldTransposeWithTranspose : OpRewritePattern<linalg::TransposeOp> {
   }
 };
 
+/// This pattern canonicalize transpose by swapping the order of
+/// broadcast and transpose:
+///   transpose(broadcast(input)) -> broadcast(transpose(input))
+struct SwapTransposeWithBroadcast : OpRewritePattern<linalg::TransposeOp> {
+  using OpRewritePattern<linalg::TransposeOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(linalg::TransposeOp transposeOp,
+                                PatternRewriter &rewriter) const override {
+    Value input = transposeOp.getInput();
+    BroadcastOp broadcastOp = input.getDefiningOp<BroadcastOp>();
+    if (!input.hasOneUse() || !broadcastOp)
+      return failure();
+
+    ArrayRef<int64_t> dimensions = broadcastOp.getDimensions();
+    ArrayRef<int64_t> perms = transposeOp.getPermutation();
+
+    // Get new perms and new dimensions.
+    SmallVector<int64_t> resultPerms = dropDims(perms, dimensions);
+    SmallVector<int64_t> invertPerm = invertPermutationVector(perms);
+    SmallVector<int64_t> resultDimensions;
+    unsigned dimensionSize = dimensions.size();
+    for (unsigned i = 0; i < dimensionSize; ++i)
+      resultDimensions.push_back(invertPerm[dimensions[i]]);
+
+    // Create transpose result.
+    Value broadcastInput = broadcastOp.getInput();
+    Location loc = transposeOp.getLoc();
+    MLIRContext *ctx = transposeOp.getContext();
+    SmallVector<OpFoldResult> dims;
+    auto broadcastInputTy =
+        mlir::cast<RankedTensorType>(broadcastInput.getType());
+    unsigned inputRank = broadcastInputTy.getRank();
+    for (unsigned i = 0; i < inputRank; ++i) {
+      if (broadcastInputTy.isDynamicDim(i)) {
+        dims.push_back(rewriter.create<tensor::DimOp>(loc, broadcastInput, i)
+                           ->getResult(0));
+      } else {
+        dims.push_back(IntegerAttr::get(IndexType::get(ctx),
+                                        broadcastInputTy.getDimSize(i)));
+      }
+    }
+    SmallVector<OpFoldResult> transposeResultShapes =
+        applyPermutation(dims, resultPerms);
+    Value transposeInit = rewriter.create<tensor::EmptyOp>(
+        transposeOp.getLoc(), transposeResultShapes,
+        broadcastInputTy.getElementType());
+
+    // Create broadcast(transpose(input)).
+    Value transposeResult =
+        rewriter
+            .create<TransposeOp>(loc, broadcastOp.getInput(), transposeInit,
+                                 resultPerms)
+            ->getResult(0);
+    rewriter.replaceOpWithNewOp<BroadcastOp>(
+        transposeOp, transposeResult, transposeOp.getInit(), resultDimensions);
+    return success();
+  }
+};
+
 void TransposeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                               MLIRContext *context) {
-  results.add<FoldTransposeWithTranspose>(context);
+  results.add<FoldTransposeWithTranspose, SwapTransposeWithBroadcast>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index b611347b8de2e..9baf358a95503 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -3151,12 +3152,100 @@ void transform::TileUsingForallOp::build(OpBuilder &builder,
         /*mapping=*/mapping);
 }
 
+/// Given `lbs`, `ubs` and `steps` of loops, return (for each loop), the
+/// normalized upper bound.
+static SmallVector<OpFoldResult>
+normalizeUpperBounds(RewriterBase &rewriter, Location loc,
+                     ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
+                     ArrayRef<OpFoldResult> steps) {
+  AffineExpr s0, s1, s2;
+  bindSymbols(rewriter.getContext(), s0, s1, s2);
+  AffineExpr normalizedUbExpr = (s1 - s0).ceilDiv(s2);
+  SmallVector<OpFoldResult> normalizedUbs;
+  for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
+    OpFoldResult normalizedUb = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, normalizedUbExpr, {lb, ub, step});
+    normalizedUbs.push_back(normalizedUb);
+  }
+  return normalizedUbs;
+}
+
+/// When a loop is normalized, the uses of the induction variable within the
+/// loop need to replaced with `original_lb + old_iv * original_step`.
+static SmallVector<Value> denormalizeIndVar(RewriterBase &rewriter,
+                                            Location loc, ValueRange ivs,
+                                            ArrayRef<OpFoldResult> lbs,
+                                            ArrayRef<OpFoldResult> steps) {
+  AffineExpr s0, s1;
+  AffineExpr d0;
+  bindSymbols(rewriter.getContext(), s0, s1);
+  bindDims(rewriter.getContext(), d0);
+  AffineExpr denormExpr = s0 + d0 * s1;
+  SmallVector<Value> denormalizedIvs;
+
+  for (auto [iv, lb, step] : llvm::zip_equal(ivs, lbs, steps)) {
+    OpFoldResult denormValue = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, denormExpr, ArrayRef<OpFoldResult>{iv, lb, step});
+    denormalizedIvs.push_back(
+        getValueOrCreateConstantIndexOp(rewriter, loc, denormValue));
+  }
+  return denormalizedIvs;
+}
+
+/// Given a `scf.forall` loop return a loop op with the loop bounds
+/// normalized.
+/// TODO: Replace this with a general utility to normalize `scf.forall`.
+/// At the time of writing, this wasnt done since adding this to `scf`
+/// dialect would disallow using of `affine.apply` operations due
+/// to cyclic dependencies. To avoid churn in lit tests
+/// with the change this was added with, defer that to a follow up.
+static scf::ForallOp normalizeForallLoopOp(RewriterBase &rewriter,
+                                           scf::ForallOp loop) {
+  SmallVector<OpFoldResult> lbs = loop.getMixedLowerBound();
+  SmallVector<OpFoldResult> ubs = loop.getMixedUpperBound();
+  SmallVector<OpFoldResult> steps = loop.getMixedStep();
+
+  if (llvm::all_of(
+          lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
+      llvm::all_of(
+          steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
+    return loop;
+  }
+
+  Location loc = loop.getLoc();
+  SmallVector<OpFoldResult> normalizedUbs =
+      normalizeUpperBounds(rewriter, loc, lbs, ubs, steps);
+  SmallVector<OpFoldResult> normalizedLbs(normalizedUbs.size(),
+                                          rewriter.getIndexAttr(0));
+  SmallVector<OpFoldResult> normalizedSteps(normalizedUbs.size(),
+                                            rewriter.getIndexAttr(1));
+
+  auto normalizedForallOp = rewriter.create<scf::ForallOp>(
+      loc, normalizedLbs, normalizedUbs, normalizedSteps, loop.getOutputs(),
+      loop.getMapping(), [](OpBuilder &, Location, ValueRange) {});
+
+  auto normalizedLoopIvs = normalizedForallOp.getInductionVars();
+  OpBuilder::InsertionGuard g(rewriter);
+  Block *normalizedLoopBlock = normalizedForallOp.getBody();
+  rewriter.setInsertionPointToStart(normalizedLoopBlock);
+
+  SmallVector<Value> argValues =
+      denormalizeIndVar(rewriter, loc, normalizedLoopIvs, lbs, steps);
+  argValues.append(normalizedForallOp.getRegionIterArgs().begin(),
+                   normalizedForallOp.getRegionIterArgs().end());
+  Block *origLoopBlock = loop.getBody();
+  rewriter.mergeBlocks(origLoopBlock, normalizedLoopBlock, argValues);
+
+  rewriter.replaceOp(loop, normalizedForallOp);
+  return normalizedForallOp;
+}
+
 DiagnosedSilenceableFailure transform::tileToForallOpImpl(
     RewriterBase &rewriter, transform::TransformState &state,
     TransformOpInterface transformOp, Operation *target,
     ArrayRef<OpFoldResult> mixedNumThreads,
     ArrayRef<OpFoldResult> mixedTileSizes, std::optional<ArrayAttr> mapping,
-    linalg::ForallTilingResult &tilingResult) {
+    scf::SCFTilingResult &tilingResult) {
   // Transform all targets one by one.
   auto tileableOp = dyn_cast<TilingInterface>(target);
   if (!tileableOp) {
@@ -3167,20 +3256,35 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl(
     return diag;
   }
   rewriter.setInsertionPoint(tileableOp);
-  FailureOr<linalg::ForallTilingResult> maybeTilingResult = failure();
+  scf::SCFTilingOptions options;
+  options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
   if (!mixedNumThreads.empty()) {
-    maybeTilingResult =
-        linalg::tileToForallOp(rewriter, tileableOp, mixedNumThreads, mapping);
+    options.setNumThreads(mixedNumThreads);
   } else {
-    maybeTilingResult = linalg::tileToForallOpUsingTileSizes(
-        rewriter, tileableOp, mixedTileSizes, mapping);
+    options.setTileSizes(mixedTileSizes);
   }
+  if (mapping) {
+    options.setMapping(mapping.value().getValue());
+  }
+  FailureOr<scf::SCFTilingResult> maybeTilingResult =
+      scf::tileUsingSCF(rewriter, tileableOp, options);
 
   if (failed(maybeTilingResult))
     return transformOp.emitDefaultSilenceableFailure(tileableOp);
-  rewriter.replaceOp(tileableOp, maybeTilingResult->tileOp->getResults());
+
+  rewriter.replaceOp(tileableOp, maybeTilingResult->replacements);
 
   tilingResult = *maybeTilingResult;
+
+  if (mixedNumThreads.empty()) {
+    auto generatedForallOp = cast<scf::ForallOp>(tilingResult.loops.front());
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(generatedForallOp);
+    scf::ForallOp normalizedForallOp =
+        normalizeForallLoopOp(rewriter, generatedForallOp);
+    tilingResult.loops.front() = normalizedForallOp;
+  }
+
   return DiagnosedSilenceableFailure::success();
 }
 
@@ -3214,14 +3318,14 @@ DiagnosedSilenceableFailure transform::TileUsingForallOp::apply(
     return status;
 
   for (Operation *target : state.getPayloadOps(getTarget())) {
-    linalg::ForallTilingResult tilingResult;
+    scf::SCFTilingResult tilingResult;
     DiagnosedSilenceableFailure diag = tileToForallOpImpl(
         rewriter, state, transformOp, target, mixedNumThreads, mixedTileSizes,
         getMapping(), tilingResult);
     if (!diag.succeeded())
       return diag;
-    tileOps.push_back(tilingResult.tileOp);
-    tiledOps.push_back(tilingResult.tiledOp);
+    tileOps.push_back(tilingResult.loops.front());
+    tiledOps.append(tilingResult.tiledOps);
   }
 
   transformResults.set(cast<OpResult>(getForallOp()), tileOps);
@@ -3699,7 +3803,7 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne(
 
   // OpBuilder only used to compute attributes.
   OpBuilder b(getContext());
-  linalg::ForallTilingResult tilingResult;
+  scf::SCFTilingResult tilingResult;
   DiagnosedSilenceableFailure diag = tileToForallOpImpl(
       /*rewriter=*/rewriter,
       /*state=*/state,
@@ -3712,8 +3816,9 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne(
   if (!diag.succeeded())
     return diag;
 
-  results.push_back(tilingResult.tileOp);
-  results.push_back(tilingResult.tiledOp);
+  results.push_back(tilingResult.loops.front());
+  for (auto op : tilingResult.tiledOps)
+    results.push_back(op);
   return DiagnosedSilenceableFailure::success();
 }
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 8ef8651646829..fb6ab2055e7dd 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -435,188 +435,6 @@ static void calculateTileOffsetsAndSizes(
   }
 }
 
-/// Returns a vector of bools representing if, for each axis, `op` can be tiled
-/// without incurring in a race condition and thus it is thread-safe to do the
-/// tiling. This is checked by iterating over numThreads and ensuring that the
-/// corresponding iterator type is "parallel". If it is not, then we know that
-/// such dimension is unsafe to tile.
-SmallVector<bool> safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp,
-                                     ArrayRef<OpFoldResult> numThreads) {
-  auto iterators = linalgOp.getIteratorTypesArray();
-  SmallVector<bool> safeToTile(numThreads.size(), true);
-
-  for (unsigned i = 0, e = numThreads.size(); i != e; i++) {
-    if (auto attr = llvm::dyn_cast_if_present<Attribute>(numThreads[i])) {
-      if (cast<IntegerAttr>(attr).getValue().getSExtValue() > 1) {
-        safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
-      }
-    } else {
-      safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
-    }
-  }
-  return safeToTile;
-}
-
-/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The
-/// tiling is specified by the number of tiles/threads `numThreads` and the
-/// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is
-/// not specified, then  it is derived from `numThreads` as `ceilDiv(dimSize[i],
-/// numThreads[i])`. If non-empty, the `mapping` is added as an
-/// attribute to the resulting `scf.forall`. A zero tile sizes indicate
-/// that the dimension is not tiled, and can be thought of as tiling by the full
-/// size of data.
-/// It is the user's responsibility to ensure that `numThreads` is a valid
-/// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the
-/// Linalg case). If the dimension is not parallelizable, a warning is issued to
-/// notify the user that the generated code is not safe to parallelize. If
-/// `omitTileOffsetBoundsCheck` is true, then the function will assume that
-/// `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
-static FailureOr<ForallTilingResult> tileToForallOpImpl(
-    RewriterBase &b, TilingInterface op, ArrayRef<OpFoldResult> numThreads,
-    std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
-    std::optional<ArrayAttr> mapping, bool omitTileOffsetBoundsCheck) {
-  Location loc = op->getLoc();
-  OpBuilder::InsertionGuard g(b);
-
-  SmallVector<Range> loopRanges = op.getIterationDomain(b);
-  if (loopRanges.empty())
-    return op->emitOpError("expected non-empty loop ranges");
-  auto hasStrideOne = [](Range r) { return !isConstantIntValue(r.stride, 1); };
-  if (llvm::any_of(loopRanges, hasStrideOne))
-    return op->emitOpError("only stride-1 supported atm");
-
-  // Gather destination tensors.
-  SmallVector<Value> dest;
-  if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))
-    return op->emitOpError("failed to get destination tensors");
-
-  SmallVector<OpFoldResult> nonZeroNumThreads =
-      llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
-        return !isConstantIntValue(ofr, 0);
-      }));
-  SmallVector<Value> materializedNonZeroNumThreads =
-      llvm::to_vector(llvm::map_range(nonZeroNumThreads, [&](OpFoldResult ofr) {
-        return getValueOrCreateConstantIndexOp(b, loc, ofr);
-      }));
-
-  LinalgOp linalgOp = dyn_cast<LinalgOp>(op.getOperation());
-  if (linalgOp) {
-    // Check if tiling is thread safe and print a warning if not.
-    SmallVector<bool> tilingSafety =
-        safeToTileToForall(b.getContext(), linalgOp, numThreads);
-    for (size_t i = 0; i < tilingSafety.size(); i++)
-      if (!tilingSafety[i])
-        op.emitWarning() << "tiling is not thread safe at axis #" << i;
-  }
-
-  // 1. Create the ForallOp. We don't use the lambda body-builder
-  // version because we require the use of RewriterBase in the body, so we
-  // manually move the insertion point to the body below.
-  scf::ForallOp forallOp = b.create<scf::ForallOp>(
-      loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping);
-
-  // 2. Fill out the ForallOp body.
-  SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
-  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges,
-                               omitTileOffsetBoundsCheck, nominalTileSizes,
-                               tiledOffsets, tiledSizes);
-
-  // 3. Clone the tileable op and update its destination operands to use the
-  // output bbArgs of the ForallOp.
-  ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
-  Operation *tiledOp = nullptr;
-  SmallVector<Value> tiledValues;
-  {
-    // 3.a. RAII guard, inserting within forallOp, before terminator.
-    OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(forallOp.getTerminator());
-    Operation *clonedOp = b.clone(*op.getOperation());
-    auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
-    if (destinationStyleOp) {
-      for (OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) {
-        // Swap tensor inits with the corresponding block argument of the
-        // scf.forall op. Memref inits remain as is.
-        if (isa<TensorType>(outOperand.get().getType())) {
-          auto *it = llvm::find(dest, outOperand.get());
-          assert(it != dest.end() && "could not find destination tensor");
-          unsigned destNum = std::distance(dest.begin(), it);
-          outOperand.set(destBbArgs[destNum]);
-        }
-      }
-    }
-
-    // 4. Tile the cloned op and delete the clone.
-    FailureOr<TilingResult> tilingResult =
-        cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
-                                                               tiledSizes);
-    if (failed(tilingResult))
-      return clonedOp->emitError("Failed to tile op: ");
-    if (tilingResult->tiledOps.size() != 1) {
-      return clonedOp->emitError("expected a single produced tiled op, got ")
-             << tilingResult->tiledOps.size();
-    }
-
-    b.eraseOp(clonedOp);
-    tiledOp = tilingResult->tiledOps.front();
-    tiledValues = tilingResult->tiledValues;
-  }
-
-  // 5. Parallel insert back into the result tensor.
-  for (auto it : llvm::zip(llvm::seq(unsigned(0), unsigned(dest.size())),
-                           tiledValues, destBbArgs)) {
-    // 5.a. Partial subset information is inserted just before the terminator.
-    OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(forallOp.getTerminator());
-
-    SmallVector<OpFoldResult> resultOffsets, resultSizes;
-    if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets,
-                                        tiledSizes, resultOffsets,
-                                        resultSizes)))
-      return op->emitOpError("output offsets couldn't be calculated");
-    SmallVector<OpFoldResult> strides(resultSizes.size(), b.getIndexAttr(1));
-
-    // 5.b. Parallel insertions are inserted at the end of the combining
-    // terminator.
-    b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
-    b.create<tensor::ParallelInsertSliceOp>(loc, std::get<1>(it),
-                                            std::get<2>(it), resultOffsets,
-                                            resultSizes, strides);
-  }
-  return ForallTilingResult{forallOp, tiledOp};
-}
-
-FailureOr<ForallTilingResult>
-linalg::tileToForallOp(RewriterBase &b, TilingInterface op,
-                       ArrayRef<OpFoldResult> numThreads,
-                       std::optional<ArrayAttr> mapping) {
-  return tileToForallOpImpl(b, op, numThreads,
-                            /*nominalTileSizes=*/std::nullopt, mapping,
-                            /*omitTileOffsetBoundsCheck=*/false);
-}
-
-FailureOr<ForallTilingResult>
-linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op,
-                                     ArrayRef<OpFoldResult> tileSizes,
-                                     std::optional<ArrayAttr> mapping) {
-  SmallVector<Range> loopRanges = op.getIterationDomain(b);
-  unsigned nLoops = loopRanges.size();
-  SmallVector<OpFoldResult> numThreads;
-  numThreads.reserve(nLoops);
-  AffineExpr s0, s1;
-  bindSymbols(b.getContext(), s0, s1);
-  AffineExpr divExpr = s0.ceilDiv(s1);
-  for (const auto &it : llvm::zip(tileSizes, loopRanges)) {
-    OpFoldResult numTiles = std::get<0>(it);
-    if (!isConstantIntValue(numTiles, 0))
-      numTiles = makeComposedFoldedAffineApply(
-          b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
-    numThreads.push_back(numTiles);
-  }
-  return tileToForallOpImpl(b, op, numThreads,
-                            /*nominalTileSizes=*/tileSizes, mapping,
-                            /*omitTileOffsetBoundsCheck=*/true);
-}
-
 template <typename LoopTy>
 static FailureOr<TiledLinalgOp>
 tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 68ee915cca3f4..3d0d6abf702d7 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -522,9 +522,11 @@ mlir::linalg::getCombinerOpKind(Operation *combinerOp) {
       .Case<arith::MaxSIOp>([&](auto op) { return CombiningKind::MAXSI; })
       .Case<arith::MaxUIOp>([&](auto op) { return CombiningKind::MAXUI; })
       .Case<arith::MaximumFOp>([&](auto op) { return CombiningKind::MAXIMUMF; })
+      .Case<arith::MaxNumFOp>([&](auto op) { return CombiningKind::MAXNUMF; })
       .Case<arith::MinSIOp>([&](auto op) { return CombiningKind::MINSI; })
       .Case<arith::MinUIOp>([&](auto op) { return CombiningKind::MINUI; })
       .Case<arith::MinimumFOp>([&](auto op) { return CombiningKind::MINIMUMF; })
+      .Case<arith::MinNumFOp>([&](auto op) { return CombiningKind::MINNUMF; })
       .Case<arith::MulIOp, arith::MulFOp>(
           [&](auto op) { return CombiningKind::MUL; })
       .Case<arith::OrIOp>([&](auto op) { return CombiningKind::OR; })
@@ -586,6 +588,14 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
       llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
 }
 
+/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one
+/// reduction iterator.
+static bool hasReductionIterator(LinalgOp &op) {
+  return isa<linalg::ReduceOp>(op) ||
+         (isa<linalg::GenericOp>(op) &&
+          llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
+}
+
 /// Build a vector.transfer_write of `value` into `outputOperand` at indices set
 /// to all `0`; where `outputOperand` is an output operand of the LinalgOp
 /// currently being vectorized. If `dest` has null rank, build an memref.store.
@@ -1069,19 +1079,20 @@ vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state,
   //   * for vector indices (e.g. `vector<1x1x4xindex>`) - extract the bottom
   //    (0th) element and use that.
   SmallVector<Value> transferReadIdxs;
-  auto resTrailingDim = resultType.getShape().back();
   auto zero = rewriter.create<arith::ConstantOp>(
       loc, rewriter.getI32Type(), rewriter.getZeroAttr(rewriter.getI32Type()));
   for (size_t i = 0; i < extractOp.getIndices().size(); i++) {
-    auto idx = bvm.lookup(extractOp.getIndices()[i]);
+    Value idx = bvm.lookup(extractOp.getIndices()[i]);
     if (idx.getType().isIndex()) {
       transferReadIdxs.push_back(idx);
       continue;
     }
 
     auto indexAs1dVector = rewriter.create<vector::ShapeCastOp>(
-        loc, VectorType::get({resTrailingDim}, rewriter.getIndexType()),
-        bvm.lookup(extractOp.getIndices()[i]));
+        loc,
+        VectorType::get(resultType.getShape().back(), rewriter.getIndexType(),
+                        resultType.getScalableDims().back()),
+        idx);
     transferReadIdxs.push_back(
         rewriter.create<vector::ExtractElementOp>(loc, indexAs1dVector, zero));
   }
@@ -1787,6 +1798,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
   if (isa<ConvolutionOpInterface>(op.getOperation()))
     return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
 
+  if (hasReductionIterator(op))
+    return reductionPreconditions(op);
+
   // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
   // linalg.copy ops and ops that implement ContractionOpInterface for now.
   if (!isElementwise(op) &&
@@ -1945,7 +1959,8 @@ vectorizePadOpPrecondition(tensor::PadOp padOp,
   return success();
 }
 
-/// Preconditions for scalable vectors.
+/// Preconditions for scalable vectors. This is quite restrictive - it models
+/// the fact that in practice we would only make selected dimensions scalable.
 static LogicalResult
 vectorizeScalableVectorPrecondition(Operation *op,
                                     ArrayRef<int64_t> inputVectorSizes,
@@ -1953,18 +1968,98 @@ vectorizeScalableVectorPrecondition(Operation *op,
   assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
          "Number of input vector sizes and scalable dims doesn't match");
 
-  if (inputVectorSizes.empty())
-    return success();
+  size_t numOfScalableDims =
+      llvm::count_if(inputScalableVecDims, [](bool flag) { return flag; });
 
-  bool isScalable = inputScalableVecDims.back();
-  if (!isScalable)
+  if (numOfScalableDims == 0)
     return success();
 
-  // Only element-wise and 1d depthwise conv ops supported in the presence of
-  // scalable dims.
   auto linalgOp = dyn_cast<LinalgOp>(op);
-  return success(linalgOp && (isElementwise(linalgOp) ||
-                              isa<linalg::DepthwiseConv1DNwcWcOp>(op)));
+
+  // Cond 1: There's been no need for scalable vectorisation of
+  // non-linalg Ops so far
+  if (!linalgOp)
+    return failure();
+
+  // Cond 2: There's been no need for more than 2 scalable dims so far
+  if (numOfScalableDims > 2)
+    return failure();
+
+  // Cond 3: Look at the configuration in `inputScalableVecDims` and verify that
+  // it matches one of the supported cases:
+  //  1. exactly 1 dim is scalable and that's the _last_ parallel dim
+  //  2. exactly 2 dims are scalable and those are the _last two adjacent_
+  //     parallel dims
+  //  3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
+  // The 2nd restriction above means that only Matmul-like Ops are supported
+  // when 2 dims are scalable, e.g. :
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+
+  // Find the first scalable flag
+  bool seenParalell = false;
+  auto iterators = linalgOp.getIteratorTypesArray();
+  SmallVector<bool> scalableFlags(inputScalableVecDims);
+  while (!scalableFlags.back()) {
+    seenParalell |= (iterators.back() == utils::IteratorType::parallel);
+
+    iterators.pop_back();
+    scalableFlags.pop_back();
+  }
+
+  switch (iterators.back()) {
+  case utils::IteratorType::reduction: {
+    // Check 3. above is met.
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    if (isa<linalg::MatmulOp>(op) || isa<linalg::MatmulTransposeAOp>(op)) {
+      LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
+           "is not supported\n");
+      return failure();
+    }
+    break;
+  }
+  case utils::IteratorType::parallel: {
+    // Check 1. and 2. above are met.
+    if (seenParalell) {
+      LDBG("Inner parallel dim not requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    break;
+  }
+  }
+
+  // If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
+  // supported for which expect the folowing config:
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+  if (numOfScalableDims == 2) {
+    // Disallow below case which breaks 3. above:
+    //    * iterators = [..., parallel, reduction]
+    //    * scalable flags = [..., true, true]
+    if (iterators.back() == utils::IteratorType::reduction) {
+      LDBG("Higher dim than the trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    scalableFlags.pop_back();
+    iterators.pop_back();
+
+    if (!scalableFlags.back() ||
+        (iterators.back() != utils::IteratorType::parallel))
+      return failure();
+  }
+
+  // Cond 4: Only the following ops are supported in the
+  // presence of scalable vectors
+  return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
+                 isa<linalg::MatmulTransposeAOp>(op) ||
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                 isa<linalg::MatvecOp>(op) || hasReductionIterator(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index f4fae68da63b3..d6fe22158d002 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -861,7 +861,34 @@ AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
     return builder.create<arith::MulFOp>(a, b);
   };
 
-  Value s = mul(operand, operand);
+  auto sub = [&](Value a, Value b) -> Value {
+    return builder.create<arith::SubFOp>(a, b);
+  };
+
+  auto abs = [&](Value a) -> Value { return builder.create<math::AbsFOp>(a); };
+
+  auto sqrt = [&](Value a) -> Value { return builder.create<math::SqrtOp>(a); };
+
+  auto scopy = [&](Value a, Value b) -> Value {
+    return builder.create<math::CopySignOp>(a, b);
+  };
+
+  auto sel = [&](Value a, Value b, Value c) -> Value {
+    return builder.create<arith::SelectOp>(a, b, c);
+  };
+
+  Value abso = abs(operand);
+  Value aa = mul(operand, operand);
+  Value opp = sqrt(sub(bcast(floatCst(builder, 1.0, elementType)), aa));
+
+  Value gt =
+      builder.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, aa,
+                                    bcast(floatCst(builder, 0.5, elementType)));
+
+  Value x = sel(gt, opp, abso);
+
+  // Asin(x) approximation for x = [-9/16, 9/16]:
+  Value s = mul(x, x);
   Value q = mul(s, s);
   Value r = bcast(floatCst(builder, 5.5579749017470502e-2, elementType));
   Value t = bcast(floatCst(builder, -6.2027913464120114e-2, elementType));
@@ -878,8 +905,12 @@ AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
   t = fma(t, q, bcast(floatCst(builder, 7.4999999991367292e-2, elementType)));
   r = fma(r, s, t);
   r = fma(r, s, bcast(floatCst(builder, 1.6666666666670193e-1, elementType)));
-  t = mul(operand, s);
-  r = fma(r, t, operand);
+  t = mul(x, s);
+  r = fma(r, t, x);
+
+  Value rsub = sub(bcast(floatCst(builder, 1.57079632679, elementType)), r);
+  r = sel(gt, rsub, r);
+  r = scopy(r, operand);
 
   rewriter.replaceOp(op, r);
   return success();
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 26f831f10a4e4..de9bbcbace692 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -644,6 +644,21 @@ LogicalResult WarpgroupMmaInitAccumulatorOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// RcpOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult RcpOp::verify() {
+  RcpRoundingModeAttr rounding = getRoundingAttr();
+  bool ftz = getFtz();
+  // Currently, only `rcp_approx` and `ftz` is supported.
+  if (rounding.getValue() != RcpRoundingMode::APPROX || !ftz) {
+    return emitOpError() << "has a limitation. " << rounding
+                         << " or non-ftz is not supported yet.";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
index 387a6903b9d8b..ed7425bd52525 100644
--- a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_dialect_library(MLIROpenACCDialect
   MLIROpenACCEnumsIncGen
   MLIROpenACCAttributesIncGen
   MLIROpenACCMPOpsInterfacesIncGen
+  MLIROpenACCOpsInterfacesIncGen
   MLIROpenACCTypeInterfacesIncGen
 
   LINK_LIBS PUBLIC
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index c3c6dffd5ae49..b4da50443c6cc 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -24,6 +24,7 @@ using namespace acc;
 
 #include "mlir/Dialect/OpenACC/OpenACCOpsDialect.cpp.inc"
 #include "mlir/Dialect/OpenACC/OpenACCOpsEnums.cpp.inc"
+#include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc"
 #include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.cpp.inc"
 #include "mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.cpp.inc"
 
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 2a29bd1c9cfa9..41ba7f8f53d36 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIROpenACCTransforms
   MLIROpenACCEnumsIncGen
   MLIROpenACCAttributesIncGen
   MLIROpenACCMPOpsInterfacesIncGen
+  MLIROpenACCOpsInterfacesIncGen
   MLIROpenACCTypeInterfacesIncGen
 
   LINK_LIBS PUBLIC
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index f5ec5a476ad8f..11780f84697b1 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -117,39 +117,39 @@ void OpenMPDialect::initialize() {
 /// ssa-id-and-type ::= ssa-id `:` type
 static ParseResult parseAllocateAndAllocator(
     OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandsAllocate,
-    SmallVectorImpl<Type> &typesAllocate,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandsAllocator,
-    SmallVectorImpl<Type> &typesAllocator) {
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &allocateVars,
+    SmallVectorImpl<Type> &allocateTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &allocatorVars,
+    SmallVectorImpl<Type> &allocatorTypes) {
 
   return parser.parseCommaSeparatedList([&]() {
     OpAsmParser::UnresolvedOperand operand;
     Type type;
     if (parser.parseOperand(operand) || parser.parseColonType(type))
       return failure();
-    operandsAllocator.push_back(operand);
-    typesAllocator.push_back(type);
+    allocatorVars.push_back(operand);
+    allocatorTypes.push_back(type);
     if (parser.parseArrow())
       return failure();
     if (parser.parseOperand(operand) || parser.parseColonType(type))
       return failure();
 
-    operandsAllocate.push_back(operand);
-    typesAllocate.push_back(type);
+    allocateVars.push_back(operand);
+    allocateTypes.push_back(type);
     return success();
   });
 }
 
 /// Print allocate clause
 static void printAllocateAndAllocator(OpAsmPrinter &p, Operation *op,
-                                      OperandRange varsAllocate,
-                                      TypeRange typesAllocate,
-                                      OperandRange varsAllocator,
-                                      TypeRange typesAllocator) {
-  for (unsigned i = 0; i < varsAllocate.size(); ++i) {
-    std::string separator = i == varsAllocate.size() - 1 ? "" : ", ";
-    p << varsAllocator[i] << " : " << typesAllocator[i] << " -> ";
-    p << varsAllocate[i] << " : " << typesAllocate[i] << separator;
+                                      OperandRange allocateVars,
+                                      TypeRange allocateTypes,
+                                      OperandRange allocatorVars,
+                                      TypeRange allocatorTypes) {
+  for (unsigned i = 0; i < allocateVars.size(); ++i) {
+    std::string separator = i == allocateVars.size() - 1 ? "" : ", ";
+    p << allocatorVars[i] << " : " << allocatorTypes[i] << " -> ";
+    p << allocateVars[i] << " : " << allocateTypes[i] << separator;
   }
 }
 
@@ -183,11 +183,11 @@ void printClauseAttr(OpAsmPrinter &p, Operation *op, ClauseAttr attr) {
 /// linear ::= `linear` `(` linear-list `)`
 /// linear-list := linear-val | linear-val linear-list
 /// linear-val := ssa-id-and-type `=` ssa-id-and-type
-static ParseResult
-parseLinearClause(OpAsmParser &parser,
-                  SmallVectorImpl<OpAsmParser::UnresolvedOperand> &vars,
-                  SmallVectorImpl<Type> &types,
-                  SmallVectorImpl<OpAsmParser::UnresolvedOperand> &stepVars) {
+static ParseResult parseLinearClause(
+    OpAsmParser &parser,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &linearVars,
+    SmallVectorImpl<Type> &linearTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &linearStepVars) {
   return parser.parseCommaSeparatedList([&]() {
     OpAsmParser::UnresolvedOperand var;
     Type type;
@@ -196,16 +196,16 @@ parseLinearClause(OpAsmParser &parser,
         parser.parseOperand(stepVar) || parser.parseColonType(type))
       return failure();
 
-    vars.push_back(var);
-    types.push_back(type);
-    stepVars.push_back(stepVar);
+    linearVars.push_back(var);
+    linearTypes.push_back(type);
+    linearStepVars.push_back(stepVar);
     return success();
   });
 }
 
 /// Print Linear Clause
 static void printLinearClause(OpAsmPrinter &p, Operation *op,
-                              ValueRange linearVars, TypeRange linearVarTypes,
+                              ValueRange linearVars, TypeRange linearTypes,
                               ValueRange linearStepVars) {
   size_t linearVarsSize = linearVars.size();
   for (unsigned i = 0; i < linearVarsSize; ++i) {
@@ -221,12 +221,12 @@ static void printLinearClause(OpAsmPrinter &p, Operation *op,
 // Verifier for Nontemporal Clause
 //===----------------------------------------------------------------------===//
 
-static LogicalResult
-verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) {
+static LogicalResult verifyNontemporalClause(Operation *op,
+                                             OperandRange nontemporalVars) {
 
   // Check if each var is unique - OpenMP 5.0 -> 2.9.3.1 section
   DenseSet<Value> nontemporalItems;
-  for (const auto &it : nontemporalVariables)
+  for (const auto &it : nontemporalVars)
     if (!nontemporalItems.insert(it).second)
       return op->emitOpError() << "nontemporal variable used more than once";
 
@@ -236,32 +236,32 @@ verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) {
 //===----------------------------------------------------------------------===//
 // Parser, verifier and printer for Aligned Clause
 //===----------------------------------------------------------------------===//
-static LogicalResult
-verifyAlignedClause(Operation *op, std::optional<ArrayAttr> alignmentValues,
-                    OperandRange alignedVariables) {
+static LogicalResult verifyAlignedClause(Operation *op,
+                                         std::optional<ArrayAttr> alignments,
+                                         OperandRange alignedVars) {
   // Check if number of alignment values equals to number of aligned variables
-  if (!alignedVariables.empty()) {
-    if (!alignmentValues || alignmentValues->size() != alignedVariables.size())
+  if (!alignedVars.empty()) {
+    if (!alignments || alignments->size() != alignedVars.size())
       return op->emitOpError()
              << "expected as many alignment values as aligned variables";
   } else {
-    if (alignmentValues)
+    if (alignments)
       return op->emitOpError() << "unexpected alignment values attribute";
     return success();
   }
 
   // Check if each var is aligned only once - OpenMP 4.5 -> 2.8.1 section
   DenseSet<Value> alignedItems;
-  for (auto it : alignedVariables)
+  for (auto it : alignedVars)
     if (!alignedItems.insert(it).second)
       return op->emitOpError() << "aligned variable used more than once";
 
-  if (!alignmentValues)
+  if (!alignments)
     return success();
 
   // Check if all alignment values are positive - OpenMP 4.5 -> 2.8.1 section
-  for (unsigned i = 0; i < (*alignmentValues).size(); ++i) {
-    if (auto intAttr = llvm::dyn_cast<IntegerAttr>((*alignmentValues)[i])) {
+  for (unsigned i = 0; i < (*alignments).size(); ++i) {
+    if (auto intAttr = llvm::dyn_cast<IntegerAttr>((*alignments)[i])) {
       if (intAttr.getValue().sle(0))
         return op->emitOpError() << "alignment should be greater than 0";
     } else {
@@ -275,14 +275,15 @@ verifyAlignedClause(Operation *op, std::optional<ArrayAttr> alignmentValues,
 /// aligned ::= `aligned` `(` aligned-list `)`
 /// aligned-list := aligned-val | aligned-val aligned-list
 /// aligned-val := ssa-id-and-type `->` alignment
-static ParseResult parseAlignedClause(
-    OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &alignedItems,
-    SmallVectorImpl<Type> &types, ArrayAttr &alignmentValues) {
+static ParseResult
+parseAlignedClause(OpAsmParser &parser,
+                   SmallVectorImpl<OpAsmParser::UnresolvedOperand> &alignedVars,
+                   SmallVectorImpl<Type> &alignedTypes,
+                   ArrayAttr &alignmentsAttr) {
   SmallVector<Attribute> alignmentVec;
   if (failed(parser.parseCommaSeparatedList([&]() {
-        if (parser.parseOperand(alignedItems.emplace_back()) ||
-            parser.parseColonType(types.emplace_back()) ||
+        if (parser.parseOperand(alignedVars.emplace_back()) ||
+            parser.parseColonType(alignedTypes.emplace_back()) ||
             parser.parseArrow() ||
             parser.parseAttribute(alignmentVec.emplace_back())) {
           return failure();
@@ -291,20 +292,19 @@ static ParseResult parseAlignedClause(
       })))
     return failure();
   SmallVector<Attribute> alignments(alignmentVec.begin(), alignmentVec.end());
-  alignmentValues = ArrayAttr::get(parser.getContext(), alignments);
+  alignmentsAttr = ArrayAttr::get(parser.getContext(), alignments);
   return success();
 }
 
 /// Print Aligned Clause
 static void printAlignedClause(OpAsmPrinter &p, Operation *op,
-                               ValueRange alignedVars,
-                               TypeRange alignedVarTypes,
-                               std::optional<ArrayAttr> alignmentValues) {
+                               ValueRange alignedVars, TypeRange alignedTypes,
+                               std::optional<ArrayAttr> alignments) {
   for (unsigned i = 0; i < alignedVars.size(); ++i) {
     if (i != 0)
       p << ", ";
     p << alignedVars[i] << " : " << alignedVars[i].getType();
-    p << " -> " << (*alignmentValues)[i];
+    p << " -> " << (*alignments)[i];
   }
 }
 
@@ -353,10 +353,11 @@ verifyScheduleModifiers(OpAsmParser &parser,
 /// sched-wo-chunk ::=  `auto` | `runtime`
 /// sched-modifier ::=  sched-mod-val | sched-mod-val `,` sched-mod-val
 /// sched-mod-val ::=  `monotonic` | `nonmonotonic` | `simd` | `none`
-static ParseResult parseScheduleClause(
-    OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
-    ScheduleModifierAttr &scheduleModifier, UnitAttr &simdModifier,
-    std::optional<OpAsmParser::UnresolvedOperand> &chunkSize, Type &chunkType) {
+static ParseResult
+parseScheduleClause(OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
+                    ScheduleModifierAttr &scheduleMod, UnitAttr &scheduleSimd,
+                    std::optional<OpAsmParser::UnresolvedOperand> &chunkSize,
+                    Type &chunkType) {
   StringRef keyword;
   if (parser.parseKeyword(&keyword))
     return failure();
@@ -399,14 +400,14 @@ static ParseResult parseScheduleClause(
     SMLoc loc = parser.getCurrentLocation();
     if (std::optional<ScheduleModifier> mod =
             symbolizeScheduleModifier(modifiers[0])) {
-      scheduleModifier = ScheduleModifierAttr::get(parser.getContext(), *mod);
+      scheduleMod = ScheduleModifierAttr::get(parser.getContext(), *mod);
     } else {
       return parser.emitError(loc, "invalid schedule modifier");
     }
     // Only SIMD attribute is allowed here!
     if (modifiers.size() > 1) {
       assert(symbolizeScheduleModifier(modifiers[1]) == ScheduleModifier::simd);
-      simdModifier = UnitAttr::get(parser.getBuilder().getContext());
+      scheduleSimd = UnitAttr::get(parser.getBuilder().getContext());
     }
   }
 
@@ -415,16 +416,16 @@ static ParseResult parseScheduleClause(
 
 /// Print schedule clause
 static void printScheduleClause(OpAsmPrinter &p, Operation *op,
-                                ClauseScheduleKindAttr schedAttr,
-                                ScheduleModifierAttr modifier, UnitAttr simd,
-                                Value scheduleChunkVar,
+                                ClauseScheduleKindAttr scheduleKind,
+                                ScheduleModifierAttr scheduleMod,
+                                UnitAttr scheduleSimd, Value scheduleChunk,
                                 Type scheduleChunkType) {
-  p << stringifyClauseScheduleKind(schedAttr.getValue());
-  if (scheduleChunkVar)
-    p << " = " << scheduleChunkVar << " : " << scheduleChunkVar.getType();
-  if (modifier)
-    p << ", " << stringifyScheduleModifier(modifier.getValue());
-  if (simd)
+  p << stringifyClauseScheduleKind(scheduleKind.getValue());
+  if (scheduleChunk)
+    p << " = " << scheduleChunk << " : " << scheduleChunk.getType();
+  if (scheduleMod)
+    p << ", " << stringifyScheduleModifier(scheduleMod.getValue());
+  if (scheduleSimd)
     p << ", simd";
 }
 
@@ -435,15 +436,15 @@ static void printScheduleClause(OpAsmPrinter &p, Operation *op,
 // order ::= `order` `(` [order-modiﬁer ':'] concurrent `)`
 // order-modiﬁer ::= reproducible | unconstrained
 static ParseResult parseOrderClause(OpAsmParser &parser,
-                                    ClauseOrderKindAttr &kindAttr,
-                                    OrderModifierAttr &modifierAttr) {
+                                    ClauseOrderKindAttr &order,
+                                    OrderModifierAttr &orderMod) {
   StringRef enumStr;
   SMLoc loc = parser.getCurrentLocation();
   if (parser.parseKeyword(&enumStr))
     return failure();
   if (std::optional<OrderModifier> enumValue =
           symbolizeOrderModifier(enumStr)) {
-    modifierAttr = OrderModifierAttr::get(parser.getContext(), *enumValue);
+    orderMod = OrderModifierAttr::get(parser.getContext(), *enumValue);
     if (parser.parseOptionalColon())
       return failure();
     loc = parser.getCurrentLocation();
@@ -452,19 +453,19 @@ static ParseResult parseOrderClause(OpAsmParser &parser,
   }
   if (std::optional<ClauseOrderKind> enumValue =
           symbolizeClauseOrderKind(enumStr)) {
-    kindAttr = ClauseOrderKindAttr::get(parser.getContext(), *enumValue);
+    order = ClauseOrderKindAttr::get(parser.getContext(), *enumValue);
     return success();
   }
   return parser.emitError(loc, "invalid clause value: '") << enumStr << "'";
 }
 
 static void printOrderClause(OpAsmPrinter &p, Operation *op,
-                             ClauseOrderKindAttr kindAttr,
-                             OrderModifierAttr modifierAttr) {
-  if (modifierAttr)
-    p << stringifyOrderModifier(modifierAttr.getValue()) << ":";
-  if (kindAttr)
-    p << stringifyClauseOrderKind(kindAttr.getValue());
+                             ClauseOrderKindAttr order,
+                             OrderModifierAttr orderMod) {
+  if (orderMod)
+    p << stringifyOrderModifier(orderMod.getValue()) << ":";
+  if (order)
+    p << stringifyClauseOrderKind(order.getValue());
 }
 
 //===----------------------------------------------------------------------===//
@@ -474,8 +475,7 @@ static void printOrderClause(OpAsmPrinter &p, Operation *op,
 static ParseResult parseClauseWithRegionArgs(
     OpAsmParser &parser, Region &region,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-    SmallVectorImpl<Type> &types, DenseBoolArrayAttr &isByRef,
-    ArrayAttr &symbols,
+    SmallVectorImpl<Type> &types, DenseBoolArrayAttr &byref, ArrayAttr &symbols,
     SmallVectorImpl<OpAsmParser::Argument> &regionPrivateArgs) {
   SmallVector<SymbolRefAttr> reductionVec;
   SmallVector<bool> isByRefVec;
@@ -494,7 +494,7 @@ static ParseResult parseClauseWithRegionArgs(
             return success();
           })))
     return failure();
-  isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
+  byref = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
 
   auto *argsBegin = regionPrivateArgs.begin();
   MutableArrayRef argsSubrange(argsBegin + regionArgOffset,
@@ -510,13 +510,13 @@ static ParseResult parseClauseWithRegionArgs(
 static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
                                       ValueRange argsSubrange,
                                       StringRef clauseName, ValueRange operands,
-                                      TypeRange types, DenseBoolArrayAttr byRef,
+                                      TypeRange types, DenseBoolArrayAttr byref,
                                       ArrayAttr symbols) {
   if (!clauseName.empty())
     p << clauseName << "(";
 
   llvm::interleaveComma(llvm::zip_equal(symbols, operands, argsSubrange, types,
-                                        byRef.asArrayRef()),
+                                        byref.asArrayRef()),
                         p, [&p](auto t) {
                           auto [sym, op, arg, type, isByRef] = t;
                           p << (isByRef ? "byref " : "") << sym << " " << op
@@ -529,28 +529,27 @@ static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
 
 static ParseResult parseParallelRegion(
     OpAsmParser &parser, Region &region,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVarOperands,
-    SmallVectorImpl<Type> &reductionVarTypes,
-    DenseBoolArrayAttr &reductionByRef, ArrayAttr &reductionSymbols,
-    llvm::SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVarOperands,
-    llvm::SmallVectorImpl<Type> &privateVarsTypes,
-    ArrayAttr &privatizerSymbols) {
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVars,
+    SmallVectorImpl<Type> &reductionTypes, DenseBoolArrayAttr &reductionByref,
+    ArrayAttr &reductionSyms,
+    llvm::SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVars,
+    llvm::SmallVectorImpl<Type> &privateTypes, ArrayAttr &privateSyms) {
   llvm::SmallVector<OpAsmParser::Argument> regionPrivateArgs;
 
   if (succeeded(parser.parseOptionalKeyword("reduction"))) {
-    if (failed(parseClauseWithRegionArgs(parser, region, reductionVarOperands,
-                                         reductionVarTypes, reductionByRef,
-                                         reductionSymbols, regionPrivateArgs)))
+    if (failed(parseClauseWithRegionArgs(parser, region, reductionVars,
+                                         reductionTypes, reductionByref,
+                                         reductionSyms, regionPrivateArgs)))
       return failure();
   }
 
   if (succeeded(parser.parseOptionalKeyword("private"))) {
-    auto privateByRef = DenseBoolArrayAttr::get(parser.getContext(), {});
-    if (failed(parseClauseWithRegionArgs(parser, region, privateVarOperands,
-                                         privateVarsTypes, privateByRef,
-                                         privatizerSymbols, regionPrivateArgs)))
+    auto privateByref = DenseBoolArrayAttr::get(parser.getContext(), {});
+    if (failed(parseClauseWithRegionArgs(parser, region, privateVars,
+                                         privateTypes, privateByref,
+                                         privateSyms, regionPrivateArgs)))
       return failure();
-    if (llvm::any_of(privateByRef.asArrayRef(),
+    if (llvm::any_of(privateByref.asArrayRef(),
                      [](bool byref) { return byref; })) {
       parser.emitError(parser.getCurrentLocation(),
                        "private clause cannot have byref attributes");
@@ -562,35 +561,30 @@ static ParseResult parseParallelRegion(
 }
 
 static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
-                                ValueRange reductionVarOperands,
-                                TypeRange reductionVarTypes,
-                                DenseBoolArrayAttr reductionVarIsByRef,
-                                ArrayAttr reductionSymbols,
-                                ValueRange privateVarOperands,
-                                TypeRange privateVarTypes,
-                                ArrayAttr privatizerSymbols) {
-  if (reductionSymbols) {
+                                ValueRange reductionVars,
+                                TypeRange reductionTypes,
+                                DenseBoolArrayAttr reductionByref,
+                                ArrayAttr reductionSyms, ValueRange privateVars,
+                                TypeRange privateTypes, ArrayAttr privateSyms) {
+  if (reductionSyms) {
     auto *argsBegin = region.front().getArguments().begin();
-    MutableArrayRef argsSubrange(argsBegin,
-                                 argsBegin + reductionVarTypes.size());
-    printClauseWithRegionArgs(p, op, argsSubrange, "reduction",
-                              reductionVarOperands, reductionVarTypes,
-                              reductionVarIsByRef, reductionSymbols);
+    MutableArrayRef argsSubrange(argsBegin, argsBegin + reductionTypes.size());
+    printClauseWithRegionArgs(p, op, argsSubrange, "reduction", reductionVars,
+                              reductionTypes, reductionByref, reductionSyms);
   }
 
-  if (privatizerSymbols) {
+  if (privateSyms) {
     auto *argsBegin = region.front().getArguments().begin();
-    MutableArrayRef argsSubrange(argsBegin + reductionVarOperands.size(),
-                                 argsBegin + reductionVarOperands.size() +
-                                     privateVarTypes.size());
+    MutableArrayRef argsSubrange(argsBegin + reductionVars.size(),
+                                 argsBegin + reductionVars.size() +
+                                     privateTypes.size());
     mlir::SmallVector<bool> isByRefVec;
-    isByRefVec.resize(privateVarTypes.size(), false);
+    isByRefVec.resize(privateTypes.size(), false);
     DenseBoolArrayAttr isByRef =
         makeDenseBoolArrayAttr(op->getContext(), isByRefVec);
 
-    printClauseWithRegionArgs(p, op, argsSubrange, "private",
-                              privateVarOperands, privateVarTypes, isByRef,
-                              privatizerSymbols);
+    printClauseWithRegionArgs(p, op, argsSubrange, "private", privateVars,
+                              privateTypes, isByRef, privateSyms);
   }
 
   p.printRegion(region, /*printEntryBlockArgs=*/false);
@@ -599,41 +593,41 @@ static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
 /// reduction-entry-list ::= reduction-entry
 ///                        | reduction-entry-list `,` reduction-entry
 /// reduction-entry ::= (`byref`)? symbol-ref `->` ssa-id `:` type
-static ParseResult
-parseReductionVarList(OpAsmParser &parser,
-                      SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-                      SmallVectorImpl<Type> &types, DenseBoolArrayAttr &isByRef,
-                      ArrayAttr &reductionSymbols) {
+static ParseResult parseReductionVarList(
+    OpAsmParser &parser,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVars,
+    SmallVectorImpl<Type> &reductionTypes, DenseBoolArrayAttr &reductionByref,
+    ArrayAttr &reductionSyms) {
   SmallVector<SymbolRefAttr> reductionVec;
   SmallVector<bool> isByRefVec;
   if (failed(parser.parseCommaSeparatedList([&]() {
         ParseResult optionalByref = parser.parseOptionalKeyword("byref");
         if (parser.parseAttribute(reductionVec.emplace_back()) ||
             parser.parseArrow() ||
-            parser.parseOperand(operands.emplace_back()) ||
-            parser.parseColonType(types.emplace_back()))
+            parser.parseOperand(reductionVars.emplace_back()) ||
+            parser.parseColonType(reductionTypes.emplace_back()))
           return failure();
         isByRefVec.push_back(optionalByref.succeeded());
         return success();
       })))
     return failure();
-  isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
+  reductionByref = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
   SmallVector<Attribute> reductions(reductionVec.begin(), reductionVec.end());
-  reductionSymbols = ArrayAttr::get(parser.getContext(), reductions);
+  reductionSyms = ArrayAttr::get(parser.getContext(), reductions);
   return success();
 }
 
 /// Print Reduction clause
-static void printReductionVarList(OpAsmPrinter &p, Operation *op,
-                                  OperandRange reductionVars,
-                                  TypeRange reductionTypes,
-                                  std::optional<DenseBoolArrayAttr> isByRef,
-                                  std::optional<ArrayAttr> reductions) {
+static void
+printReductionVarList(OpAsmPrinter &p, Operation *op,
+                      OperandRange reductionVars, TypeRange reductionTypes,
+                      std::optional<DenseBoolArrayAttr> reductionByref,
+                      std::optional<ArrayAttr> reductionSyms) {
   auto getByRef = [&](unsigned i) -> const char * {
-    if (!isByRef || !*isByRef)
+    if (!reductionByref || !*reductionByref)
       return "";
-    assert(isByRef->empty() || i < isByRef->size());
-    if (!isByRef->empty() && (*isByRef)[i])
+    assert(reductionByref->empty() || i < reductionByref->size());
+    if (!reductionByref->empty() && (*reductionByref)[i])
       return "byref ";
     return "";
   };
@@ -641,26 +635,26 @@ static void printReductionVarList(OpAsmPrinter &p, Operation *op,
   for (unsigned i = 0, e = reductionVars.size(); i < e; ++i) {
     if (i != 0)
       p << ", ";
-    p << getByRef(i) << (*reductions)[i] << " -> " << reductionVars[i] << " : "
-      << reductionVars[i].getType();
+    p << getByRef(i) << (*reductionSyms)[i] << " -> " << reductionVars[i]
+      << " : " << reductionVars[i].getType();
   }
 }
 
 /// Verifies Reduction Clause
 static LogicalResult
-verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
+verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductionSyms,
                        OperandRange reductionVars,
-                       std::optional<ArrayRef<bool>> byRef) {
+                       std::optional<ArrayRef<bool>> reductionByref) {
   if (!reductionVars.empty()) {
-    if (!reductions || reductions->size() != reductionVars.size())
+    if (!reductionSyms || reductionSyms->size() != reductionVars.size())
       return op->emitOpError()
              << "expected as many reduction symbol references "
                 "as reduction variables";
-    if (byRef && byRef->size() != reductionVars.size())
+    if (reductionByref && reductionByref->size() != reductionVars.size())
       return op->emitError() << "expected as many reduction variable by "
                                 "reference attributes as reduction variables";
   } else {
-    if (reductions)
+    if (reductionSyms)
       return op->emitOpError() << "unexpected reduction symbol references";
     return success();
   }
@@ -668,7 +662,7 @@ verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
   // TODO: The followings should be done in
   // SymbolUserOpInterface::verifySymbolUses.
   DenseSet<Value> accumulators;
-  for (auto args : llvm::zip(reductionVars, *reductions)) {
+  for (auto args : llvm::zip(reductionVars, *reductionSyms)) {
     Value accum = std::get<0>(args);
 
     if (!accumulators.insert(accum).second)
@@ -693,41 +687,40 @@ verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
 }
 
 //===----------------------------------------------------------------------===//
-// Parser, printer and verifier for CopyPrivateVarList
+// Parser, printer and verifier for Copyprivate
 //===----------------------------------------------------------------------===//
 
 /// copyprivate-entry-list ::= copyprivate-entry
 ///                          | copyprivate-entry-list `,` copyprivate-entry
 /// copyprivate-entry ::= ssa-id `->` symbol-ref `:` type
-static ParseResult parseCopyPrivateVarList(
+static ParseResult parseCopyprivate(
     OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-    SmallVectorImpl<Type> &types, ArrayAttr &copyPrivateSymbols) {
-  SmallVector<SymbolRefAttr> copyPrivateFuncsVec;
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &copyprivateVars,
+    SmallVectorImpl<Type> &copyprivateTypes, ArrayAttr &copyprivateSyms) {
+  SmallVector<SymbolRefAttr> symsVec;
   if (failed(parser.parseCommaSeparatedList([&]() {
-        if (parser.parseOperand(operands.emplace_back()) ||
+        if (parser.parseOperand(copyprivateVars.emplace_back()) ||
             parser.parseArrow() ||
-            parser.parseAttribute(copyPrivateFuncsVec.emplace_back()) ||
-            parser.parseColonType(types.emplace_back()))
+            parser.parseAttribute(symsVec.emplace_back()) ||
+            parser.parseColonType(copyprivateTypes.emplace_back()))
           return failure();
         return success();
       })))
     return failure();
-  SmallVector<Attribute> copyPrivateFuncs(copyPrivateFuncsVec.begin(),
-                                          copyPrivateFuncsVec.end());
-  copyPrivateSymbols = ArrayAttr::get(parser.getContext(), copyPrivateFuncs);
+  SmallVector<Attribute> syms(symsVec.begin(), symsVec.end());
+  copyprivateSyms = ArrayAttr::get(parser.getContext(), syms);
   return success();
 }
 
-/// Print CopyPrivate clause
-static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op,
-                                    OperandRange copyPrivateVars,
-                                    TypeRange copyPrivateTypes,
-                                    std::optional<ArrayAttr> copyPrivateFuncs) {
-  if (!copyPrivateFuncs.has_value())
+/// Print Copyprivate clause
+static void printCopyprivate(OpAsmPrinter &p, Operation *op,
+                             OperandRange copyprivateVars,
+                             TypeRange copyprivateTypes,
+                             std::optional<ArrayAttr> copyprivateSyms) {
+  if (!copyprivateSyms.has_value())
     return;
   llvm::interleaveComma(
-      llvm::zip(copyPrivateVars, *copyPrivateFuncs, copyPrivateTypes), p,
+      llvm::zip(copyprivateVars, *copyprivateSyms, copyprivateTypes), p,
       [&](const auto &args) {
         p << std::get<0>(args) << " -> " << std::get<1>(args) << " : "
           << std::get<2>(args);
@@ -736,22 +729,22 @@ static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op,
 
 /// Verifies CopyPrivate Clause
 static LogicalResult
-verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
-                         std::optional<ArrayAttr> copyPrivateFuncs) {
-  size_t copyPrivateFuncsSize =
-      copyPrivateFuncs.has_value() ? copyPrivateFuncs->size() : 0;
-  if (copyPrivateFuncsSize != copyPrivateVars.size())
-    return op->emitOpError() << "inconsistent number of copyPrivate vars (= "
-                             << copyPrivateVars.size()
-                             << ") and functions (= " << copyPrivateFuncsSize
+verifyCopyprivateVarList(Operation *op, OperandRange copyprivateVars,
+                         std::optional<ArrayAttr> copyprivateSyms) {
+  size_t copyprivateSymsSize =
+      copyprivateSyms.has_value() ? copyprivateSyms->size() : 0;
+  if (copyprivateSymsSize != copyprivateVars.size())
+    return op->emitOpError() << "inconsistent number of copyprivate vars (= "
+                             << copyprivateVars.size()
+                             << ") and functions (= " << copyprivateSymsSize
                              << "), both must be equal";
-  if (!copyPrivateFuncs.has_value())
+  if (!copyprivateSyms.has_value())
     return success();
 
-  for (auto copyPrivateVarAndFunc :
-       llvm::zip(copyPrivateVars, *copyPrivateFuncs)) {
+  for (auto copyprivateVarAndSym :
+       llvm::zip(copyprivateVars, *copyprivateSyms)) {
     auto symbolRef =
-        llvm::cast<SymbolRefAttr>(std::get<1>(copyPrivateVarAndFunc));
+        llvm::cast<SymbolRefAttr>(std::get<1>(copyprivateVarAndSym));
     std::optional<std::variant<mlir::func::FuncOp, mlir::LLVM::LLVMFuncOp>>
         funcOp;
     if (mlir::func::FuncOp mlirFuncOp =
@@ -785,7 +778,7 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
       return op->emitOpError() << "expected copy function " << symbolRef
                                << " arguments to have the same type";
 
-    Type varType = std::get<0>(copyPrivateVarAndFunc).getType();
+    Type varType = std::get<0>(copyprivateVarAndSym).getType();
     if (argTy != varType)
       return op->emitOpError()
              << "expected copy function arguments' type (" << argTy
@@ -805,39 +798,39 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
 /// depend-entry ::= depend-kind `->` ssa-id `:` type
 static ParseResult
 parseDependVarList(OpAsmParser &parser,
-                   SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-                   SmallVectorImpl<Type> &types, ArrayAttr &dependsArray) {
-  SmallVector<ClauseTaskDependAttr> dependVec;
+                   SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dependVars,
+                   SmallVectorImpl<Type> &dependTypes, ArrayAttr &dependKinds) {
+  SmallVector<ClauseTaskDependAttr> kindsVec;
   if (failed(parser.parseCommaSeparatedList([&]() {
         StringRef keyword;
         if (parser.parseKeyword(&keyword) || parser.parseArrow() ||
-            parser.parseOperand(operands.emplace_back()) ||
-            parser.parseColonType(types.emplace_back()))
+            parser.parseOperand(dependVars.emplace_back()) ||
+            parser.parseColonType(dependTypes.emplace_back()))
           return failure();
         if (std::optional<ClauseTaskDepend> keywordDepend =
                 (symbolizeClauseTaskDepend(keyword)))
-          dependVec.emplace_back(
+          kindsVec.emplace_back(
               ClauseTaskDependAttr::get(parser.getContext(), *keywordDepend));
         else
           return failure();
         return success();
       })))
     return failure();
-  SmallVector<Attribute> depends(dependVec.begin(), dependVec.end());
-  dependsArray = ArrayAttr::get(parser.getContext(), depends);
+  SmallVector<Attribute> kinds(kindsVec.begin(), kindsVec.end());
+  dependKinds = ArrayAttr::get(parser.getContext(), kinds);
   return success();
 }
 
 /// Print Depend clause
 static void printDependVarList(OpAsmPrinter &p, Operation *op,
                                OperandRange dependVars, TypeRange dependTypes,
-                               std::optional<ArrayAttr> depends) {
+                               std::optional<ArrayAttr> dependKinds) {
 
-  for (unsigned i = 0, e = depends->size(); i < e; ++i) {
+  for (unsigned i = 0, e = dependKinds->size(); i < e; ++i) {
     if (i != 0)
       p << ", ";
     p << stringifyClauseTaskDepend(
-             llvm::cast<mlir::omp::ClauseTaskDependAttr>((*depends)[i])
+             llvm::cast<mlir::omp::ClauseTaskDependAttr>((*dependKinds)[i])
                  .getValue())
       << " -> " << dependVars[i] << " : " << dependTypes[i];
   }
@@ -845,14 +838,14 @@ static void printDependVarList(OpAsmPrinter &p, Operation *op,
 
 /// Verifies Depend clause
 static LogicalResult verifyDependVarList(Operation *op,
-                                         std::optional<ArrayAttr> depends,
+                                         std::optional<ArrayAttr> dependKinds,
                                          OperandRange dependVars) {
   if (!dependVars.empty()) {
-    if (!depends || depends->size() != dependVars.size())
+    if (!dependKinds || dependKinds->size() != dependVars.size())
       return op->emitOpError() << "expected as many depend values"
                                   " as depend variables";
   } else {
-    if (depends && !depends->empty())
+    if (dependKinds && !dependKinds->empty())
       return op->emitOpError() << "unexpected depend values";
     return success();
   }
@@ -1144,8 +1137,8 @@ static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op,
 
 static ParseResult
 parseMapEntries(OpAsmParser &parser,
-                SmallVectorImpl<OpAsmParser::UnresolvedOperand> &mapOperands,
-                SmallVectorImpl<Type> &mapOperandTypes) {
+                SmallVectorImpl<OpAsmParser::UnresolvedOperand> &mapVars,
+                SmallVectorImpl<Type> &mapTypes) {
   OpAsmParser::UnresolvedOperand arg;
   OpAsmParser::UnresolvedOperand blockArg;
   Type argType;
@@ -1154,14 +1147,14 @@ parseMapEntries(OpAsmParser &parser,
       return failure();
     if (succeeded(parser.parseOptionalArrow()) && parser.parseOperand(blockArg))
       return failure();
-    mapOperands.push_back(arg);
+    mapVars.push_back(arg);
     return success();
   };
 
   auto parseTypes = [&]() -> ParseResult {
     if (parser.parseType(argType))
       return failure();
-    mapOperandTypes.push_back(argType);
+    mapTypes.push_back(argType);
     return success();
   };
 
@@ -1178,48 +1171,47 @@ parseMapEntries(OpAsmParser &parser,
 }
 
 static void printMapEntries(OpAsmPrinter &p, Operation *op,
-                            OperandRange mapOperands,
-                            TypeRange mapOperandTypes) {
+                            OperandRange mapVars, TypeRange mapTypes) {
   // Get pointer to the region if this is an omp.target, because printing map
   // clauses for that operation has to also show the correspondence of each
   // variable to the corresponding block argument.
   Block *entryBlock = isa<TargetOp>(op) ? &op->getRegion(0).front() : nullptr;
   unsigned argIndex = 0;
 
-  for (const auto &mapOp : mapOperands) {
+  for (const auto &mapOp : mapVars) {
     p << mapOp;
     if (entryBlock) {
       const auto &blockArg = entryBlock->getArgument(argIndex);
       p << " -> " << blockArg;
     }
     argIndex++;
-    if (argIndex < mapOperands.size())
+    if (argIndex < mapVars.size())
       p << ", ";
   }
   p << " : ";
 
   argIndex = 0;
-  for (const auto &mapType : mapOperandTypes) {
+  for (const auto &mapType : mapTypes) {
     p << mapType;
     argIndex++;
-    if (argIndex < mapOperands.size())
+    if (argIndex < mapVars.size())
       p << ", ";
   }
 }
 
-static ParseResult parsePrivateList(
-    OpAsmParser &parser,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateOperands,
-    SmallVectorImpl<Type> &privateOperandTypes, ArrayAttr &privatizerSymbols) {
+static ParseResult
+parsePrivateList(OpAsmParser &parser,
+                 SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVars,
+                 SmallVectorImpl<Type> &privateTypes, ArrayAttr &privateSyms) {
   SmallVector<SymbolRefAttr> privateSymRefs;
   SmallVector<OpAsmParser::Argument> regionPrivateArgs;
 
   if (failed(parser.parseCommaSeparatedList([&]() {
         if (parser.parseAttribute(privateSymRefs.emplace_back()) ||
-            parser.parseOperand(privateOperands.emplace_back()) ||
+            parser.parseOperand(privateVars.emplace_back()) ||
             parser.parseArrow() ||
             parser.parseArgument(regionPrivateArgs.emplace_back()) ||
-            parser.parseColonType(privateOperandTypes.emplace_back()))
+            parser.parseColonType(privateTypes.emplace_back()))
           return failure();
         return success();
       })))
@@ -1227,32 +1219,31 @@ static ParseResult parsePrivateList(
 
   SmallVector<Attribute> privateSymAttrs(privateSymRefs.begin(),
                                          privateSymRefs.end());
-  privatizerSymbols = ArrayAttr::get(parser.getContext(), privateSymAttrs);
+  privateSyms = ArrayAttr::get(parser.getContext(), privateSymAttrs);
 
   return success();
 }
 
 static void printPrivateList(OpAsmPrinter &p, Operation *op,
-                             ValueRange privateVarOperands,
-                             TypeRange privateVarTypes,
-                             ArrayAttr privatizerSymbols) {
+                             ValueRange privateVars, TypeRange privateTypes,
+                             ArrayAttr privateSyms) {
   // TODO: Remove target-specific logic from this function.
   auto targetOp = mlir::dyn_cast<mlir::omp::TargetOp>(op);
   assert(targetOp);
 
   auto &region = op->getRegion(0);
   auto *argsBegin = region.front().getArguments().begin();
-  MutableArrayRef argsSubrange(argsBegin + targetOp.getMapOperands().size(),
-                               argsBegin + targetOp.getMapOperands().size() +
-                                   privateVarTypes.size());
+  MutableArrayRef argsSubrange(argsBegin + targetOp.getMapVars().size(),
+                               argsBegin + targetOp.getMapVars().size() +
+                                   privateTypes.size());
   mlir::SmallVector<bool> isByRefVec;
-  isByRefVec.resize(privateVarTypes.size(), false);
+  isByRefVec.resize(privateTypes.size(), false);
   DenseBoolArrayAttr isByRef =
       DenseBoolArrayAttr::get(op->getContext(), isByRefVec);
 
-  printClauseWithRegionArgs(
-      p, op, argsSubrange, /*clauseName=*/llvm::StringRef{}, privateVarOperands,
-      privateVarTypes, isByRef, privatizerSymbols);
+  printClauseWithRegionArgs(p, op, argsSubrange,
+                            /*clauseName=*/llvm::StringRef{}, privateVars,
+                            privateTypes, isByRef, privateSyms);
 }
 
 static void printCaptureType(OpAsmPrinter &p, Operation *op,
@@ -1271,32 +1262,32 @@ static void printCaptureType(OpAsmPrinter &p, Operation *op,
 }
 
 static ParseResult parseCaptureType(OpAsmParser &parser,
-                                    VariableCaptureKindAttr &mapCapture) {
+                                    VariableCaptureKindAttr &mapCaptureType) {
   StringRef mapCaptureKey;
   if (parser.parseKeyword(&mapCaptureKey))
     return failure();
 
   if (mapCaptureKey == "This")
-    mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+    mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
         parser.getContext(), mlir::omp::VariableCaptureKind::This);
   if (mapCaptureKey == "ByRef")
-    mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+    mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
         parser.getContext(), mlir::omp::VariableCaptureKind::ByRef);
   if (mapCaptureKey == "ByCopy")
-    mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+    mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
         parser.getContext(), mlir::omp::VariableCaptureKind::ByCopy);
   if (mapCaptureKey == "VLAType")
-    mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+    mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
         parser.getContext(), mlir::omp::VariableCaptureKind::VLAType);
 
   return success();
 }
 
-static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
+static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) {
   llvm::DenseSet<mlir::TypedValue<mlir::omp::PointerLikeType>> updateToVars;
   llvm::DenseSet<mlir::TypedValue<mlir::omp::PointerLikeType>> updateFromVars;
 
-  for (auto mapOp : mapOperands) {
+  for (auto mapOp : mapVars) {
     if (!mapOp.getDefiningOp())
       emitError(op->getLoc(), "missing map operation");
 
@@ -1378,19 +1369,20 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
 //===----------------------------------------------------------------------===//
 
 void TargetDataOp::build(OpBuilder &builder, OperationState &state,
-                         const TargetDataClauseOps &clauses) {
-  TargetDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
-                      clauses.useDevicePtrVars, clauses.useDeviceAddrVars,
-                      clauses.mapVars);
+                         const TargetDataOperands &clauses) {
+  TargetDataOp::build(builder, state, clauses.device, clauses.ifVar,
+                      clauses.mapVars, clauses.useDeviceAddrVars,
+                      clauses.useDevicePtrVars);
 }
 
 LogicalResult TargetDataOp::verify() {
-  if (getMapOperands().empty() && getUseDevicePtr().empty() &&
-      getUseDeviceAddr().empty()) {
-    return ::emitError(this->getLoc(), "At least one of map, useDevicePtr, or "
-                                       "useDeviceAddr operand must be present");
+  if (getMapVars().empty() && getUseDevicePtrVars().empty() &&
+      getUseDeviceAddrVars().empty()) {
+    return ::emitError(this->getLoc(),
+                       "At least one of map, use_device_ptr_vars, or "
+                       "use_device_addr_vars operand must be present");
   }
-  return verifyMapClause(*this, getMapOperands());
+  return verifyMapClause(*this, getMapVars());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1399,40 +1391,39 @@ LogicalResult TargetDataOp::verify() {
 
 void TargetEnterDataOp::build(
     OpBuilder &builder, OperationState &state,
-    const TargetEnterExitUpdateDataClauseOps &clauses) {
+    const TargetEnterExitUpdateDataOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  TargetEnterDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
-                           makeArrayAttr(ctx, clauses.dependTypeAttrs),
-                           clauses.dependVars, clauses.nowaitAttr,
-                           clauses.mapVars);
+  TargetEnterDataOp::build(builder, state,
+                           makeArrayAttr(ctx, clauses.dependKinds),
+                           clauses.dependVars, clauses.device, clauses.ifVar,
+                           clauses.mapVars, clauses.nowait);
 }
 
 LogicalResult TargetEnterDataOp::verify() {
   LogicalResult verifyDependVars =
-      verifyDependVarList(*this, getDepends(), getDependVars());
+      verifyDependVarList(*this, getDependKinds(), getDependVars());
   return failed(verifyDependVars) ? verifyDependVars
-                                  : verifyMapClause(*this, getMapOperands());
+                                  : verifyMapClause(*this, getMapVars());
 }
 
 //===----------------------------------------------------------------------===//
 // TargetExitDataOp
 //===----------------------------------------------------------------------===//
 
-void TargetExitDataOp::build(
-    OpBuilder &builder, OperationState &state,
-    const TargetEnterExitUpdateDataClauseOps &clauses) {
+void TargetExitDataOp::build(OpBuilder &builder, OperationState &state,
+                             const TargetEnterExitUpdateDataOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  TargetExitDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
-                          makeArrayAttr(ctx, clauses.dependTypeAttrs),
-                          clauses.dependVars, clauses.nowaitAttr,
-                          clauses.mapVars);
+  TargetExitDataOp::build(builder, state,
+                          makeArrayAttr(ctx, clauses.dependKinds),
+                          clauses.dependVars, clauses.device, clauses.ifVar,
+                          clauses.mapVars, clauses.nowait);
 }
 
 LogicalResult TargetExitDataOp::verify() {
   LogicalResult verifyDependVars =
-      verifyDependVarList(*this, getDepends(), getDependVars());
+      verifyDependVarList(*this, getDependKinds(), getDependVars());
   return failed(verifyDependVars) ? verifyDependVars
-                                  : verifyMapClause(*this, getMapOperands());
+                                  : verifyMapClause(*this, getMapVars());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1440,19 +1431,18 @@ LogicalResult TargetExitDataOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void TargetUpdateOp::build(OpBuilder &builder, OperationState &state,
-                           const TargetEnterExitUpdateDataClauseOps &clauses) {
+                           const TargetEnterExitUpdateDataOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  TargetUpdateOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
-                        makeArrayAttr(ctx, clauses.dependTypeAttrs),
-                        clauses.dependVars, clauses.nowaitAttr,
-                        clauses.mapVars);
+  TargetUpdateOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds),
+                        clauses.dependVars, clauses.device, clauses.ifVar,
+                        clauses.mapVars, clauses.nowait);
 }
 
 LogicalResult TargetUpdateOp::verify() {
   LogicalResult verifyDependVars =
-      verifyDependVarList(*this, getDepends(), getDependVars());
+      verifyDependVarList(*this, getDependKinds(), getDependVars());
   return failed(verifyDependVars) ? verifyDependVars
-                                  : verifyMapClause(*this, getMapOperands());
+                                  : verifyMapClause(*this, getMapVars());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1460,24 +1450,24 @@ LogicalResult TargetUpdateOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void TargetOp::build(OpBuilder &builder, OperationState &state,
-                     const TargetClauseOps &clauses) {
+                     const TargetOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars,
-  // inReduceVarByRef, inReductionDeclSymbols, reductionVars, reduceVarByRef,
-  // reductionDeclSymbols.
-  TargetOp::build(
-      builder, state, clauses.ifVar, clauses.deviceVar, clauses.threadLimitVar,
-      makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars,
-      clauses.nowaitAttr, clauses.isDevicePtrVars, clauses.hasDeviceAddrVars,
-      clauses.mapVars, clauses.privateVars,
-      makeArrayAttr(ctx, clauses.privatizers));
+  // inReductionByref, inReductionSyms.
+  TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{},
+                  makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars,
+                  clauses.device, clauses.hasDeviceAddrVars, clauses.ifVar,
+                  /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr,
+                  /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars,
+                  clauses.mapVars, clauses.nowait, clauses.privateVars,
+                  makeArrayAttr(ctx, clauses.privateSyms), clauses.threadLimit);
 }
 
 LogicalResult TargetOp::verify() {
   LogicalResult verifyDependVars =
-      verifyDependVarList(*this, getDepends(), getDependVars());
+      verifyDependVarList(*this, getDependKinds(), getDependVars());
   return failed(verifyDependVars) ? verifyDependVars
-                                  : verifyMapClause(*this, getMapOperands());
+                                  : verifyMapClause(*this, getMapVars());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1486,56 +1476,53 @@ LogicalResult TargetOp::verify() {
 
 void ParallelOp::build(OpBuilder &builder, OperationState &state,
                        ArrayRef<NamedAttribute> attributes) {
-  ParallelOp::build(
-      builder, state, /*if_expr=*/nullptr, /*num_threads_var=*/nullptr,
-      /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
-      /*reduction_vars=*/ValueRange(), /*reduction_vars_byref=*/nullptr,
-      /*reductions=*/nullptr, /*proc_bind_val=*/nullptr,
-      /*private_vars=*/ValueRange(), /*privatizers=*/nullptr);
+  ParallelOp::build(builder, state, /*allocate_vars=*/ValueRange(),
+                    /*allocator_vars=*/ValueRange(), /*if_expr=*/nullptr,
+                    /*num_threads=*/nullptr, /*private_vars=*/ValueRange(),
+                    /*private_syms=*/nullptr, /*proc_bind_kind=*/nullptr,
+                    /*reduction_vars=*/ValueRange(),
+                    /*reduction_byref=*/nullptr, /*reduction_syms=*/nullptr);
   state.addAttributes(attributes);
 }
 
 void ParallelOp::build(OpBuilder &builder, OperationState &state,
-                       const ParallelClauseOps &clauses) {
+                       const ParallelOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
 
-  ParallelOp::build(builder, state, clauses.ifVar, clauses.numThreadsVar,
-                    clauses.allocateVars, clauses.allocatorVars,
-                    clauses.reductionVars,
-                    makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
-                    makeArrayAttr(ctx, clauses.reductionDeclSymbols),
-                    clauses.procBindKindAttr, clauses.privateVars,
-                    makeArrayAttr(ctx, clauses.privatizers));
+  ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+                    clauses.ifVar, clauses.numThreads, clauses.privateVars,
+                    makeArrayAttr(ctx, clauses.privateSyms),
+                    clauses.procBindKind, clauses.reductionVars,
+                    makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                    makeArrayAttr(ctx, clauses.reductionSyms));
 }
 
 template <typename OpType>
 static LogicalResult verifyPrivateVarList(OpType &op) {
   auto privateVars = op.getPrivateVars();
-  auto privatizers = op.getPrivatizersAttr();
+  auto privateSyms = op.getPrivateSymsAttr();
 
-  if (privateVars.empty() && (privatizers == nullptr || privatizers.empty()))
+  if (privateVars.empty() && (privateSyms == nullptr || privateSyms.empty()))
     return success();
 
   auto numPrivateVars = privateVars.size();
-  auto numPrivatizers = (privatizers == nullptr) ? 0 : privatizers.size();
+  auto numPrivateSyms = (privateSyms == nullptr) ? 0 : privateSyms.size();
 
-  if (numPrivateVars != numPrivatizers)
+  if (numPrivateVars != numPrivateSyms)
     return op.emitError() << "inconsistent number of private variables and "
                              "privatizer op symbols, private vars: "
                           << numPrivateVars
-                          << " vs. privatizer op symbols: " << numPrivatizers;
+                          << " vs. privatizer op symbols: " << numPrivateSyms;
 
-  for (auto privateVarInfo : llvm::zip_equal(privateVars, privatizers)) {
+  for (auto privateVarInfo : llvm::zip_equal(privateVars, privateSyms)) {
     Type varType = std::get<0>(privateVarInfo).getType();
-    SymbolRefAttr privatizerSym =
-        cast<SymbolRefAttr>(std::get<1>(privateVarInfo));
+    SymbolRefAttr privateSym = cast<SymbolRefAttr>(std::get<1>(privateVarInfo));
     PrivateClauseOp privatizerOp =
-        SymbolTable::lookupNearestSymbolFrom<PrivateClauseOp>(op,
-                                                              privatizerSym);
+        SymbolTable::lookupNearestSymbolFrom<PrivateClauseOp>(op, privateSym);
 
     if (privatizerOp == nullptr)
       return op.emitError() << "failed to lookup privatizer op with symbol: '"
-                            << privatizerSym << "'";
+                            << privateSym << "'";
 
     Type privatizerType = privatizerOp.getType();
 
@@ -1570,15 +1557,15 @@ LogicalResult ParallelOp::verify() {
     }
   }
 
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
   if (failed(verifyPrivateVarList(*this)))
     return failure();
 
-  return verifyReductionVarList(*this, getReductions(), getReductionVars(),
-                                getReductionVarsByref());
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1593,15 +1580,16 @@ static bool opInGlobalImplicitParallelRegion(Operation *op) {
 }
 
 void TeamsOp::build(OpBuilder &builder, OperationState &state,
-                    const TeamsClauseOps &clauses) {
+                    const TeamsOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers.
-  TeamsOp::build(builder, state, clauses.numTeamsLowerVar,
-                 clauses.numTeamsUpperVar, clauses.ifVar,
-                 clauses.threadLimitVar, clauses.allocateVars,
-                 clauses.allocatorVars, clauses.reductionVars,
-                 makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
-                 makeArrayAttr(ctx, clauses.reductionDeclSymbols));
+  // TODO Store clauses in op: privateVars, privateSyms.
+  TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+                 clauses.ifVar, clauses.numTeamsLower, clauses.numTeamsUpper,
+                 /*private_vars=*/{},
+                 /*private_syms=*/nullptr, clauses.reductionVars,
+                 makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                 makeArrayAttr(ctx, clauses.reductionSyms),
+                 clauses.threadLimit);
 }
 
 LogicalResult TeamsOp::verify() {
@@ -1628,12 +1616,12 @@ LogicalResult TeamsOp::verify() {
   }
 
   // Check for allocate clause restrictions
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
-  return verifyReductionVarList(*this, getReductions(), getReductionVars(),
-                                getReductionVarsByref());
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1641,23 +1629,23 @@ LogicalResult TeamsOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void SectionsOp::build(OpBuilder &builder, OperationState &state,
-                       const SectionsClauseOps &clauses) {
+                       const SectionsOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers.
-  SectionsOp::build(builder, state, clauses.reductionVars,
-                    makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
-                    makeArrayAttr(ctx, clauses.reductionDeclSymbols),
-                    clauses.allocateVars, clauses.allocatorVars,
-                    clauses.nowaitAttr);
+  // TODO Store clauses in op: privateVars, privateSyms.
+  SectionsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+                    clauses.nowait, /*private_vars=*/{},
+                    /*private_syms=*/nullptr, clauses.reductionVars,
+                    makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                    makeArrayAttr(ctx, clauses.reductionSyms));
 }
 
 LogicalResult SectionsOp::verify() {
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
-  return verifyReductionVarList(*this, getReductions(), getReductionVars(),
-                                getReductionVarsByref());
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
 }
 
 LogicalResult SectionsOp::verifyRegions() {
@@ -1676,23 +1664,23 @@ LogicalResult SectionsOp::verifyRegions() {
 //===----------------------------------------------------------------------===//
 
 void SingleOp::build(OpBuilder &builder, OperationState &state,
-                     const SingleClauseOps &clauses) {
+                     const SingleOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers.
+  // TODO Store clauses in op: privateVars, privateSyms.
   SingleOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
                   clauses.copyprivateVars,
-                  makeArrayAttr(ctx, clauses.copyprivateFuncs),
-                  clauses.nowaitAttr);
+                  makeArrayAttr(ctx, clauses.copyprivateSyms), clauses.nowait,
+                  /*private_vars=*/{}, /*private_syms=*/nullptr);
 }
 
 LogicalResult SingleOp::verify() {
   // Check for allocate clause restrictions
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
-  return verifyCopyPrivateVarList(*this, getCopyprivateVars(),
-                                  getCopyprivateFuncs());
+  return verifyCopyprivateVarList(*this, getCopyprivateVars(),
+                                  getCopyprivateSyms());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1729,30 +1717,31 @@ void printWsloop(OpAsmPrinter &p, Operation *op, Region &region,
 
 void WsloopOp::build(OpBuilder &builder, OperationState &state,
                      ArrayRef<NamedAttribute> attributes) {
-  build(builder, state, /*linear_vars=*/ValueRange(),
-        /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
-        /*reduction_vars_byref=*/nullptr,
-        /*reductions=*/nullptr, /*schedule_val=*/nullptr,
-        /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
-        /*simd_modifier=*/false, /*nowait=*/false,
-        /*ordered_val=*/nullptr, /*order_val=*/nullptr,
-        /*order_modifier=*/nullptr);
+  build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{},
+        /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(),
+        /*nowait=*/false, /*order=*/nullptr, /*order_mod=*/nullptr,
+        /*ordered=*/nullptr, /*private_vars=*/{}, /*private_syms=*/nullptr,
+        /*reduction_vars=*/ValueRange(), /*reduction_byref=*/nullptr,
+        /*reduction_syms=*/nullptr, /*schedule_kind=*/nullptr,
+        /*schedule_chunk=*/nullptr, /*schedule_mod=*/nullptr,
+        /*schedule_simd=*/false);
   state.addAttributes(attributes);
 }
 
 void WsloopOp::build(OpBuilder &builder, OperationState &state,
-                     const WsloopClauseOps &clauses) {
+                     const WsloopOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO: Store clauses in op: allocateVars, allocatorVars, privateVars,
-  // privatizers.
-  WsloopOp::build(builder, state, clauses.linearVars, clauses.linearStepVars,
-                  clauses.reductionVars,
-                  makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
-                  makeArrayAttr(ctx, clauses.reductionDeclSymbols),
-                  clauses.scheduleValAttr, clauses.scheduleChunkVar,
-                  clauses.scheduleModAttr, clauses.scheduleSimdAttr,
-                  clauses.nowaitAttr, clauses.orderedAttr, clauses.orderAttr,
-                  clauses.orderModAttr);
+  // privateSyms.
+  WsloopOp::build(
+      builder, state,
+      /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.linearVars,
+      clauses.linearStepVars, clauses.nowait, clauses.order, clauses.orderMod,
+      clauses.ordered, /*private_vars=*/{}, /*private_syms=*/nullptr,
+      clauses.reductionVars,
+      makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+      makeArrayAttr(ctx, clauses.reductionSyms), clauses.scheduleKind,
+      clauses.scheduleChunk, clauses.scheduleMod, clauses.scheduleSimd);
 }
 
 LogicalResult WsloopOp::verify() {
@@ -1766,8 +1755,8 @@ LogicalResult WsloopOp::verify() {
       return emitError() << "only supported nested wrapper is 'omp.simd'";
   }
 
-  return verifyReductionVarList(*this, getReductions(), getReductionVars(),
-                                getReductionVarsByref());
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1775,14 +1764,17 @@ LogicalResult WsloopOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void SimdOp::build(OpBuilder &builder, OperationState &state,
-                   const SimdClauseOps &clauses) {
+                   const SimdOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers, reductionVars,
-  // reduceVarByRef, reductionDeclSymbols.
+  // TODO Store clauses in op: linearVars, linearStepVars, privateVars,
+  // privateSyms, reductionVars, reductionByref, reductionSyms.
   SimdOp::build(builder, state, clauses.alignedVars,
-                makeArrayAttr(ctx, clauses.alignmentAttrs), clauses.ifVar,
-                clauses.nontemporalVars, clauses.orderAttr,
-                clauses.orderModAttr, clauses.safelenAttr, clauses.simdlenAttr);
+                makeArrayAttr(ctx, clauses.alignments), clauses.ifVar,
+                /*linear_vars=*/{}, /*linear_step_vars=*/{},
+                clauses.nontemporalVars, clauses.order, clauses.orderMod,
+                /*private_vars=*/{}, /*private_syms=*/nullptr,
+                /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
+                /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
 }
 
 LogicalResult SimdOp::verify() {
@@ -1792,8 +1784,7 @@ LogicalResult SimdOp::verify() {
            << "simdlen clause and safelen clause are both present, but the "
               "simdlen value is not less than or equal to safelen value";
 
-  if (verifyAlignedClause(*this, getAlignmentValues(), getAlignedVars())
-          .failed())
+  if (verifyAlignedClause(*this, getAlignments(), getAlignedVars()).failed())
     return failure();
 
   if (verifyNontemporalClause(*this, getNontemporalVars()).failed())
@@ -1813,20 +1804,20 @@ LogicalResult SimdOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void DistributeOp::build(OpBuilder &builder, OperationState &state,
-                         const DistributeClauseOps &clauses) {
-  // TODO Store clauses in op: privateVars, privatizers.
-  DistributeOp::build(builder, state, clauses.distScheduleStaticAttr,
-                      clauses.distScheduleChunkSizeVar, clauses.allocateVars,
-                      clauses.allocatorVars, clauses.orderAttr,
-                      clauses.orderModAttr);
+                         const DistributeOperands &clauses) {
+  // TODO Store clauses in op: privateVars, privateSyms.
+  DistributeOp::build(
+      builder, state, clauses.allocateVars, clauses.allocatorVars,
+      clauses.distScheduleStatic, clauses.distScheduleChunkSize, clauses.order,
+      clauses.orderMod, /*private_vars=*/{}, /*private_syms=*/nullptr);
 }
 
 LogicalResult DistributeOp::verify() {
-  if (this->getChunkSize() && !this->getDistScheduleStatic())
+  if (this->getDistScheduleChunkSize() && !this->getDistScheduleStatic())
     return emitOpError() << "chunk size set without "
                             "dist_schedule_static being present";
 
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
@@ -1942,26 +1933,26 @@ LogicalResult DeclareReductionOp::verifyRegions() {
 //===----------------------------------------------------------------------===//
 
 void TaskOp::build(OpBuilder &builder, OperationState &state,
-                   const TaskClauseOps &clauses) {
+                   const TaskOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers.
-  TaskOp::build(
-      builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr,
-      clauses.mergeableAttr, clauses.inReductionVars,
-      makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef),
-      makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.priorityVar,
-      makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars,
-      clauses.allocateVars, clauses.allocatorVars);
+  // TODO Store clauses in op: privateVars, privateSyms.
+  TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+                makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars,
+                clauses.final, clauses.ifVar, clauses.inReductionVars,
+                makeDenseBoolArrayAttr(ctx, clauses.inReductionByref),
+                makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable,
+                clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr,
+                clauses.untied);
 }
 
 LogicalResult TaskOp::verify() {
   LogicalResult verifyDependVars =
-      verifyDependVarList(*this, getDepends(), getDependVars());
+      verifyDependVarList(*this, getDependKinds(), getDependVars());
   return failed(verifyDependVars)
              ? verifyDependVars
-             : verifyReductionVarList(*this, getInReductions(),
+             : verifyReductionVarList(*this, getInReductionSyms(),
                                       getInReductionVars(),
-                                      getInReductionVarsByref());
+                                      getInReductionByref());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1969,19 +1960,18 @@ LogicalResult TaskOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void TaskgroupOp::build(OpBuilder &builder, OperationState &state,
-                        const TaskgroupClauseOps &clauses) {
+                        const TaskgroupOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  TaskgroupOp::build(
-      builder, state, clauses.taskReductionVars,
-      makeDenseBoolArrayAttr(ctx, clauses.taskReductionVarsByRef),
-      makeArrayAttr(ctx, clauses.taskReductionDeclSymbols),
-      clauses.allocateVars, clauses.allocatorVars);
+  TaskgroupOp::build(builder, state, clauses.allocateVars,
+                     clauses.allocatorVars, clauses.taskReductionVars,
+                     makeDenseBoolArrayAttr(ctx, clauses.taskReductionByref),
+                     makeArrayAttr(ctx, clauses.taskReductionSyms));
 }
 
 LogicalResult TaskgroupOp::verify() {
-  return verifyReductionVarList(*this, getTaskReductions(),
+  return verifyReductionVarList(*this, getTaskReductionSyms(),
                                 getTaskReductionVars(),
-                                getTaskReductionVarsByref());
+                                getTaskReductionByref());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1989,18 +1979,18 @@ LogicalResult TaskgroupOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void TaskloopOp::build(OpBuilder &builder, OperationState &state,
-                       const TaskloopClauseOps &clauses) {
+                       const TaskloopOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
-  // TODO Store clauses in op: privateVars, privatizers.
+  // TODO Store clauses in op: privateVars, privateSyms.
   TaskloopOp::build(
-      builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr,
-      clauses.mergeableAttr, clauses.inReductionVars,
-      makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef),
-      makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars,
-      makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
-      makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar,
-      clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar,
-      clauses.numTasksVar, clauses.nogroupAttr);
+      builder, state, clauses.allocateVars, clauses.allocatorVars,
+      clauses.final, clauses.grainsize, clauses.ifVar, clauses.inReductionVars,
+      makeDenseBoolArrayAttr(ctx, clauses.inReductionByref),
+      makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable,
+      clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{},
+      /*private_syms=*/nullptr, clauses.reductionVars,
+      makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+      makeArrayAttr(ctx, clauses.reductionSyms), clauses.untied);
 }
 
 SmallVector<Value> TaskloopOp::getAllReductionVars() {
@@ -2012,14 +2002,14 @@ SmallVector<Value> TaskloopOp::getAllReductionVars() {
 }
 
 LogicalResult TaskloopOp::verify() {
-  if (getAllocateVars().size() != getAllocatorsVars().size())
+  if (getAllocateVars().size() != getAllocatorVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
-  if (failed(verifyReductionVarList(*this, getReductions(), getReductionVars(),
-                                    getReductionVarsByref())) ||
-      failed(verifyReductionVarList(*this, getInReductions(),
+  if (failed(verifyReductionVarList(*this, getReductionSyms(),
+                                    getReductionVars(), getReductionByref())) ||
+      failed(verifyReductionVarList(*this, getInReductionSyms(),
                                     getInReductionVars(),
-                                    getInReductionVarsByref())))
+                                    getInReductionByref())))
     return failure();
 
   if (!getReductionVars().empty() && getNogroup())
@@ -2031,7 +2021,7 @@ LogicalResult TaskloopOp::verify() {
                        "and an in_reduction clause");
   }
 
-  if (getGrainSize() && getNumTasks()) {
+  if (getGrainsize() && getNumTasks()) {
     return emitError(
         "the grainsize clause and num_tasks clause are mutually exclusive and "
         "may not appear on the same taskloop directive");
@@ -2072,7 +2062,7 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) {
 
   // Parse "inclusive" flag.
   if (succeeded(parser.parseOptionalKeyword("inclusive")))
-    result.addAttribute("inclusive",
+    result.addAttribute("loop_inclusive",
                         UnitAttr::get(parser.getBuilder().getContext()));
 
   // Parse step values.
@@ -2099,28 +2089,29 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) {
 void LoopNestOp::print(OpAsmPrinter &p) {
   Region &region = getRegion();
   auto args = region.getArguments();
-  p << " (" << args << ") : " << args[0].getType() << " = (" << getLowerBound()
-    << ") to (" << getUpperBound() << ") ";
-  if (getInclusive())
+  p << " (" << args << ") : " << args[0].getType() << " = ("
+    << getLoopLowerBounds() << ") to (" << getLoopUpperBounds() << ") ";
+  if (getLoopInclusive())
     p << "inclusive ";
-  p << "step (" << getStep() << ") ";
+  p << "step (" << getLoopSteps() << ") ";
   p.printRegion(region, /*printEntryBlockArgs=*/false);
 }
 
 void LoopNestOp::build(OpBuilder &builder, OperationState &state,
-                       const LoopNestClauseOps &clauses) {
-  LoopNestOp::build(builder, state, clauses.loopLBVar, clauses.loopUBVar,
-                    clauses.loopStepVar, clauses.loopInclusiveAttr);
+                       const LoopNestOperands &clauses) {
+  LoopNestOp::build(builder, state, clauses.loopLowerBounds,
+                    clauses.loopUpperBounds, clauses.loopSteps,
+                    clauses.loopInclusive);
 }
 
 LogicalResult LoopNestOp::verify() {
-  if (getLowerBound().empty())
+  if (getLoopLowerBounds().empty())
     return emitOpError() << "must represent at least one loop";
 
-  if (getLowerBound().size() != getIVs().size())
+  if (getLoopLowerBounds().size() != getIVs().size())
     return emitOpError() << "number of range arguments and IVs do not match";
 
-  for (auto [lb, iv] : llvm::zip_equal(getLowerBound(), getIVs())) {
+  for (auto [lb, iv] : llvm::zip_equal(getLoopLowerBounds(), getIVs())) {
     if (lb.getType() != iv.getType())
       return emitOpError()
              << "range argument type does not match corresponding IV type";
@@ -2152,13 +2143,12 @@ void LoopNestOp::gatherWrappers(
 //===----------------------------------------------------------------------===//
 
 void CriticalDeclareOp::build(OpBuilder &builder, OperationState &state,
-                              const CriticalClauseOps &clauses) {
-  CriticalDeclareOp::build(builder, state, clauses.criticalNameAttr,
-                           clauses.hintAttr);
+                              const CriticalDeclareOperands &clauses) {
+  CriticalDeclareOp::build(builder, state, clauses.symName, clauses.hint);
 }
 
 LogicalResult CriticalDeclareOp::verify() {
-  return verifySynchronizationHint(*this, getHintVal());
+  return verifySynchronizationHint(*this, getHint());
 }
 
 LogicalResult CriticalOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
@@ -2193,7 +2183,7 @@ static LogicalResult verifyOrderedParent(Operation &op) {
 
   Operation *wrapper = loopOp->getParentOp();
   if (auto wsloopOp = dyn_cast<WsloopOp>(wrapper)) {
-    IntegerAttr orderedAttr = wsloopOp.getOrderedValAttr();
+    IntegerAttr orderedAttr = wsloopOp.getOrderedAttr();
     if (!orderedAttr)
       return op.emitOpError() << "the enclosing worksharing-loop region must "
                                  "have an ordered clause";
@@ -2213,9 +2203,9 @@ static LogicalResult verifyOrderedParent(Operation &op) {
 }
 
 void OrderedOp::build(OpBuilder &builder, OperationState &state,
-                      const OrderedOpClauseOps &clauses) {
-  OrderedOp::build(builder, state, clauses.doacrossDependTypeAttr,
-                   clauses.doacrossNumLoopsAttr, clauses.doacrossVectorVars);
+                      const OrderedOperands &clauses) {
+  OrderedOp::build(builder, state, clauses.doacrossDependType,
+                   clauses.doacrossNumLoops, clauses.doacrossDependVars);
 }
 
 LogicalResult OrderedOp::verify() {
@@ -2223,7 +2213,7 @@ LogicalResult OrderedOp::verify() {
     return failure();
 
   auto wrapper = (*this)->getParentOfType<WsloopOp>();
-  if (!wrapper || *wrapper.getOrderedVal() != *getNumLoopsVal())
+  if (!wrapper || *wrapper.getOrdered() != *getDoacrossNumLoops())
     return emitOpError() << "number of variables in depend clause does not "
                          << "match number of iteration variables in the "
                          << "doacross loop";
@@ -2232,13 +2222,13 @@ LogicalResult OrderedOp::verify() {
 }
 
 void OrderedRegionOp::build(OpBuilder &builder, OperationState &state,
-                            const OrderedRegionClauseOps &clauses) {
-  OrderedRegionOp::build(builder, state, clauses.parLevelSimdAttr);
+                            const OrderedRegionOperands &clauses) {
+  OrderedRegionOp::build(builder, state, clauses.parLevelSimd);
 }
 
 LogicalResult OrderedRegionOp::verify() {
   // TODO: The code generation for ordered simd directive is not supported yet.
-  if (getSimd())
+  if (getParLevelSimd())
     return failure();
 
   return verifyOrderedParent(**this);
@@ -2249,9 +2239,10 @@ LogicalResult OrderedRegionOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void TaskwaitOp::build(OpBuilder &builder, OperationState &state,
-                       const TaskwaitClauseOps &clauses) {
-  // TODO Store clauses in op: dependTypeAttrs, dependVars, nowaitAttr.
-  TaskwaitOp::build(builder, state);
+                       const TaskwaitOperands &clauses) {
+  // TODO Store clauses in op: dependKinds, dependVars, nowait.
+  TaskwaitOp::build(builder, state, /*depend_kinds=*/nullptr,
+                    /*depend_vars=*/{}, /*nowait=*/nullptr);
 }
 
 //===----------------------------------------------------------------------===//
@@ -2262,14 +2253,14 @@ LogicalResult AtomicReadOp::verify() {
   if (verifyCommon().failed())
     return mlir::failure();
 
-  if (auto mo = getMemoryOrderVal()) {
+  if (auto mo = getMemoryOrder()) {
     if (*mo == ClauseMemoryOrderKind::Acq_rel ||
         *mo == ClauseMemoryOrderKind::Release) {
       return emitError(
           "memory-order must not be acq_rel or release for atomic reads");
     }
   }
-  return verifySynchronizationHint(*this, getHintVal());
+  return verifySynchronizationHint(*this, getHint());
 }
 
 //===----------------------------------------------------------------------===//
@@ -2280,14 +2271,14 @@ LogicalResult AtomicWriteOp::verify() {
   if (verifyCommon().failed())
     return mlir::failure();
 
-  if (auto mo = getMemoryOrderVal()) {
+  if (auto mo = getMemoryOrder()) {
     if (*mo == ClauseMemoryOrderKind::Acq_rel ||
         *mo == ClauseMemoryOrderKind::Acquire) {
       return emitError(
           "memory-order must not be acq_rel or acquire for atomic writes");
     }
   }
-  return verifySynchronizationHint(*this, getHintVal());
+  return verifySynchronizationHint(*this, getHint());
 }
 
 //===----------------------------------------------------------------------===//
@@ -2301,9 +2292,8 @@ LogicalResult AtomicUpdateOp::canonicalize(AtomicUpdateOp op,
     return success();
   }
   if (Value writeVal = op.getWriteOpVal()) {
-    rewriter.replaceOpWithNewOp<AtomicWriteOp>(op, op.getX(), writeVal,
-                                               op.getHintValAttr(),
-                                               op.getMemoryOrderValAttr());
+    rewriter.replaceOpWithNewOp<AtomicWriteOp>(
+        op, op.getX(), writeVal, op.getHintAttr(), op.getMemoryOrderAttr());
     return success();
   }
   return failure();
@@ -2313,7 +2303,7 @@ LogicalResult AtomicUpdateOp::verify() {
   if (verifyCommon().failed())
     return mlir::failure();
 
-  if (auto mo = getMemoryOrderVal()) {
+  if (auto mo = getMemoryOrder()) {
     if (*mo == ClauseMemoryOrderKind::Acq_rel ||
         *mo == ClauseMemoryOrderKind::Acquire) {
       return emitError(
@@ -2321,7 +2311,7 @@ LogicalResult AtomicUpdateOp::verify() {
     }
   }
 
-  return verifySynchronizationHint(*this, getHintVal());
+  return verifySynchronizationHint(*this, getHint());
 }
 
 LogicalResult AtomicUpdateOp::verifyRegions() { return verifyRegionsCommon(); }
@@ -2349,19 +2339,19 @@ AtomicUpdateOp AtomicCaptureOp::getAtomicUpdateOp() {
 }
 
 LogicalResult AtomicCaptureOp::verify() {
-  return verifySynchronizationHint(*this, getHintVal());
+  return verifySynchronizationHint(*this, getHint());
 }
 
 LogicalResult AtomicCaptureOp::verifyRegions() {
   if (verifyRegionsCommon().failed())
     return mlir::failure();
 
-  if (getFirstOp()->getAttr("hint_val") || getSecondOp()->getAttr("hint_val"))
+  if (getFirstOp()->getAttr("hint") || getSecondOp()->getAttr("hint"))
     return emitOpError(
         "operations inside capture region must not have hint clause");
 
-  if (getFirstOp()->getAttr("memory_order_val") ||
-      getSecondOp()->getAttr("memory_order_val"))
+  if (getFirstOp()->getAttr("memory_order") ||
+      getSecondOp()->getAttr("memory_order"))
     return emitOpError(
         "operations inside capture region must not have memory_order clause");
   return success();
@@ -2372,13 +2362,12 @@ LogicalResult AtomicCaptureOp::verifyRegions() {
 //===----------------------------------------------------------------------===//
 
 void CancelOp::build(OpBuilder &builder, OperationState &state,
-                     const CancelClauseOps &clauses) {
-  CancelOp::build(builder, state, clauses.cancelDirectiveNameAttr,
-                  clauses.ifVar);
+                     const CancelOperands &clauses) {
+  CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar);
 }
 
 LogicalResult CancelOp::verify() {
-  ClauseCancellationConstructType cct = getCancellationConstructTypeVal();
+  ClauseCancellationConstructType cct = getCancelDirective();
   Operation *parentOp = (*this)->getParentOp();
 
   if (!parentOp) {
@@ -2404,7 +2393,7 @@ LogicalResult CancelOp::verify() {
       return emitError() << "A worksharing construct that is canceled "
                          << "must not have a nowait clause";
     }
-    if (wsloopOp.getOrderedValAttr()) {
+    if (wsloopOp.getOrderedAttr()) {
       return emitError() << "A worksharing construct that is canceled "
                          << "must not have an ordered clause";
     }
@@ -2429,12 +2418,12 @@ LogicalResult CancelOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void CancellationPointOp::build(OpBuilder &builder, OperationState &state,
-                                const CancellationPointClauseOps &clauses) {
-  CancellationPointOp::build(builder, state, clauses.cancelDirectiveNameAttr);
+                                const CancellationPointOperands &clauses) {
+  CancellationPointOp::build(builder, state, clauses.cancelDirective);
 }
 
 LogicalResult CancellationPointOp::verify() {
-  ClauseCancellationConstructType cct = getCancellationConstructTypeVal();
+  ClauseCancellationConstructType cct = getCancelDirective();
   Operation *parentOp = (*this)->getParentOp();
 
   if (!parentOp) {
@@ -2574,8 +2563,8 @@ LogicalResult PrivateClauseOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void MaskedOp::build(OpBuilder &builder, OperationState &state,
-                     const MaskedClauseOps &clauses) {
-  MaskedOp::build(builder, state, clauses.filteredThreadIdVar);
+                     const MaskedOperands &clauses) {
+  MaskedOp::build(builder, state, clauses.filteredThreadId);
 }
 
 #define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index d363ffe941fce..8c73515c608f5 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
   ParallelLoopCollapsing.cpp
   ParallelLoopFusion.cpp
   ParallelLoopTiling.cpp
+  RotateWhileLoop.cpp
   StructuralTypeConversions.cpp
   TileUsingInterface.cpp
   WrapInZeroTripCheck.cpp
diff --git a/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp b/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp
new file mode 100644
index 0000000000000..8707ec91328dc
--- /dev/null
+++ b/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp
@@ -0,0 +1,44 @@
+//===- RotateWhileLoop.cpp - scf.while loop rotation ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Rotates `scf.while` loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+using namespace mlir;
+
+namespace {
+struct RotateWhileLoopPattern : OpRewritePattern<scf::WhileOp> {
+  using OpRewritePattern<scf::WhileOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(scf::WhileOp whileOp,
+                                PatternRewriter &rewriter) const final {
+    // Setting this option would lead to infinite recursion on a greedy driver
+    // as 'do-while' loops wouldn't be skipped.
+    constexpr bool forceCreateCheck = false;
+    FailureOr<scf::WhileOp> result =
+        scf::wrapWhileLoopInZeroTripCheck(whileOp, rewriter, forceCreateCheck);
+    // scf::wrapWhileLoopInZeroTripCheck hasn't yet implemented a failure
+    // mechanism. 'do-while' loops are simply returned unmodified. In order to
+    // stop recursion, we check input and output operations differ.
+    return success(succeeded(result) && *result != whileOp);
+  }
+};
+} // namespace
+
+namespace mlir {
+namespace scf {
+void populateSCFRotateWhileLoopPatterns(RewritePatternSet &patterns) {
+  patterns.add<RotateWhileLoopPattern>(patterns.getContext());
+}
+} // namespace scf
+} // namespace mlir
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index a1392813d6de3..e404c01010a32 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -42,6 +42,16 @@ scf::SCFTilingOptions::setTileSizes(ArrayRef<OpFoldResult> ts) {
   return *this;
 }
 
+scf::SCFTilingOptions &
+scf::SCFTilingOptions::setNumThreads(ArrayRef<OpFoldResult> nt) {
+  assert(!numThreadsComputationFunction && "num tiles already set");
+  auto numThreads = llvm::to_vector(nt);
+  numThreadsComputationFunction = [numThreads](OpBuilder &b, Operation *op) {
+    return numThreads;
+  };
+  return *this;
+}
+
 /// Helper method to adjust the interchange vector to match the iteration
 /// domain.
 static SmallVector<int64_t>
@@ -61,7 +71,120 @@ fillInterchangeVector(ArrayRef<int64_t> interchangeVector,
 // tileUsingSCF implementation.
 //===----------------------------------------------------------------------===//
 
-// Check if `stride` evenly divides the trip count `size - offset`.
+/// Verify the tile size options are set in a consistent manner.
+static LogicalResult
+verifyTileSizeOptions(RewriterBase &rewriter, Location loc,
+                      const scf::SCFTilingOptions &options) {
+  // Specifying number of threads is only supported on `scf.forall` op.
+  if (options.numThreadsComputationFunction &&
+      options.loopType != scf::SCFTilingOptions::LoopType::ForallOp) {
+    return rewriter.notifyMatchFailure(
+        loc, "number of threads can only by specified when loop type is "
+             "set to use `scf.forall`");
+  }
+
+  // If specified, check that the interchange vector is a permutation.
+  if (!options.interchangeVector.empty()) {
+    if (!isPermutationVector(options.interchangeVector)) {
+      return rewriter.notifyMatchFailure(
+          loc, "invalid interchange vector, not a permutation of the entire "
+               "iteration space");
+    }
+  }
+  return success();
+}
+
+/// Method to instantiate the tile sizes and/or number of threads specified
+/// by the user.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
+getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op,
+                              ArrayRef<Range> iterationDomain,
+                              const scf::SCFTilingOptions &options) {
+  OpFoldResult zero = rewriter.getIndexAttr(0);
+  SmallVector<OpFoldResult> tileSizes, numThreads;
+  size_t numLoops = iterationDomain.size();
+
+  // Check whether the number of tiles to use is specified.
+  if (options.numThreadsComputationFunction) {
+    numThreads = options.numThreadsComputationFunction(rewriter, op);
+    numThreads.resize(numLoops, zero);
+
+    // If the number of tiles is also specified, use that.
+    if (options.tileSizeComputationFunction) {
+      tileSizes = options.tileSizeComputationFunction(rewriter, op);
+      tileSizes.resize(numLoops, zero);
+      return {tileSizes, numThreads};
+    }
+
+    // Compute the tile sizes from the iteration domain and number
+    // of tiles as follows
+    // - niters = ceilDiv(ub - lb, step)
+    // - tileSize = ceilDiv(niters, numThreads)
+    AffineExpr s0, s1, s2;
+    bindSymbols(rewriter.getContext(), s0, s1, s2);
+    // TODO: The step here is assumed to be 1.
+    AffineExpr numItersExpr = (s1 - s0);
+    AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s2);
+    tileSizes.resize(numLoops, zero);
+    for (auto [index, range, nt] :
+         llvm::enumerate(iterationDomain, numThreads)) {
+      if (isConstantIntValue(nt, 0))
+        continue;
+
+      tileSizes[index] = affine::makeComposedFoldedAffineApply(
+          rewriter, op.getLoc(), tileSizeExpr, {range.offset, range.size, nt});
+    }
+    tileSizes.resize(numLoops, zero);
+    return {tileSizes, numThreads};
+  }
+
+  // Enforce the convention that "tiling by zero"
+  // skips tiling a particular dimension. This convention is significantly
+  // simpler to handle instead of adjusting affine maps to account for missing
+  // dimensions.
+  assert(options.tileSizeComputationFunction &&
+         "expected tile sizes to be specified");
+  tileSizes = options.tileSizeComputationFunction(rewriter, op);
+  tileSizes.resize(numLoops, zero);
+
+  return {tileSizes, numThreads};
+}
+
+/// Checks if any of the tiled loops are not parallel.
+static void checkSafeToTileToForall(TilingInterface op,
+                                    ArrayRef<OpFoldResult> tileSizes,
+                                    ArrayRef<OpFoldResult> numThreads) {
+  auto iterators = op.getLoopIteratorTypes();
+  assert(iterators.size() == tileSizes.size() &&
+         "expected as many tile size values as number of loops");
+  assert((numThreads.empty() || (numThreads.size() == iterators.size())) &&
+         "when specified, expected number of threads to use for each loop");
+
+  for (auto [index, iterator, tileSize] :
+       llvm::enumerate(iterators, tileSizes)) {
+    // If num threads is specified, check that it is greater than one only for
+    // parallel dimensions.
+    if (!numThreads.empty()) {
+      if (std::optional<int64_t> constNumThreads =
+              getConstantIntValue(numThreads[index])) {
+        if (constNumThreads.value() > 1 &&
+            iterator != utils::IteratorType::parallel) {
+          op.emitWarning() << "tiling is not thread safe at axis #" << index;
+        }
+      }
+      continue;
+    }
+
+    if (std::optional<int64_t> constTileSize = getConstantIntValue(tileSize)) {
+      if (constTileSize.value() > 0 &&
+          iterator != utils::IteratorType::parallel) {
+        op.emitWarning() << "tiling is not thread safe at axis #" << index;
+      }
+    }
+  }
+}
+
+/// Check if `stride` evenly divides the trip count `size - offset`.
 static bool tileDividesIterationDomain(Range loopRange) {
   std::optional<int64_t> offsetAsInt = getConstantIntValue(loopRange.offset);
   if (!offsetAsInt)
@@ -75,10 +198,10 @@ static bool tileDividesIterationDomain(Range loopRange) {
   return ((sizeAsInt.value() - offsetAsInt.value()) % strideAsInt.value() == 0);
 }
 
-/// Returns the bounded tile size given the current `iv`, `loopRange` and
-/// `tileSize`, i.e., `min(tileSize, range.end() - iv)`.
+/// Returns the bounded tile size given the current `offset`, `loopRange` and
+/// `tileSize`, i.e., `min(tileSize, range.end() - offset)`.
 static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
-                                       Range loopRange, Value iv,
+                                       Range loopRange, OpFoldResult offset,
                                        OpFoldResult tileSize) {
   std::optional<int64_t> ts = getConstantIntValue(tileSize);
   if (ts && ts.value() == 1)
@@ -94,10 +217,132 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
   AffineExpr s0, s1, d0;
   bindDims(b.getContext(), d0);
   bindSymbols(b.getContext(), s0, s1);
-  AffineMap minMap = AffineMap::get(1, 2, {s0, s1 - d0}, b.getContext());
+  AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext());
   Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size);
   return affine::makeComposedFoldedAffineMin(
-      b, loc, minMap, SmallVector<OpFoldResult>{iv, tileSize, size});
+      b, loc, minMap, SmallVector<OpFoldResult>{offset, size, tileSize});
+}
+
+/// Returns true if the maximum tile offset `tileSize * numThreads-1` is less
+/// than `iterationSize`.
+static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
+                                           OpFoldResult numThreads,
+                                           OpFoldResult iterationSize) {
+  std::optional<int64_t> tileSizeConst = getConstantIntValue(tileSize);
+  std::optional<int64_t> numThreadsConst = getConstantIntValue(numThreads);
+  std::optional<int64_t> iterSizeConst = getConstantIntValue(iterationSize);
+  if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
+    return false;
+  return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
+}
+
+/// Compute the `OpFoldResult`s that represents the multi-dimensional
+/// `offset`s and `size`s of the tile of the iteration space that the
+/// innermost loop body of the generated tiled loops corresponds to.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
+getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
+                      ArrayRef<Range> iterationDomain,
+                      ArrayRef<OpFoldResult> tileSizes,
+                      ArrayRef<OpFoldResult> numThreads) {
+  SmallVector<OpFoldResult> offsets, sizes;
+  int materializedLoopNum = 0;
+
+  if (!numThreads.empty()) {
+    AffineExpr d0, d1, s0, s1;
+    AffineExpr offsetExpr, residualTileSizeExpr;
+    bindDims(rewriter.getContext(), d0, d1);
+    bindSymbols(rewriter.getContext(), s0, s1);
+    offsetExpr = d0 + d1 * s0;
+    residualTileSizeExpr = s1 - (d0 + d1 * s0);
+
+    for (auto [nt, tileSize, loopRange] :
+         llvm::zip_equal(numThreads, tileSizes, iterationDomain)) {
+
+      // Non-tiled cases, set the offset and size to the
+      // `loopRange.offset/size`.
+      if (isConstantIntValue(nt, 0)) {
+        offsets.push_back(loopRange.offset);
+        sizes.push_back(loopRange.size);
+        continue;
+      }
+
+      Value iv = ivs[materializedLoopNum++];
+      OpFoldResult offset = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, offsetExpr,
+          ArrayRef<OpFoldResult>{loopRange.offset, iv, tileSize});
+      OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, residualTileSizeExpr,
+          {loopRange.offset, nt, tileSize, loopRange.size});
+
+      OpFoldResult size = tileSize;
+      if (!isConstantIntValue(residualTileSize, 0)) {
+        OpFoldResult sizeMinusOffsetPerThread =
+            affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
+                                                  {offset, loopRange.size});
+        size = affine::makeComposedFoldedAffineMin(
+            rewriter, loc,
+            AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()),
+            {sizeMinusOffsetPerThread, tileSize});
+      }
+
+      // Consider the case where the original loop was `[0, 100)`.
+      // If number of threads are `7`, the tile size would be computed as
+      // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6)
+      // - `offset = 0 + 6 * 15 = 105`
+      // - `tileSize = min(15, 100 - 105) = -5`
+      // To avoid negative tile sizes, we need to do a further
+      // `nonNegativeTileSize = affine.max(0, tileSize)`.
+      // This `max` can be avoided if
+      //  `offset + tileSize * (numThreads - 1) < (ub - lb)`
+      if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) {
+        AffineMap maxMap =
+            AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
+        size = affine::makeComposedFoldedAffineMax(
+            rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size});
+      }
+
+      offsets.push_back(offset);
+      sizes.push_back(size);
+    }
+    return {offsets, sizes};
+  } else {
+    for (auto [tileSize, loopRange] :
+         llvm::zip_equal(tileSizes, iterationDomain)) {
+
+      // Non-tiled cases, set the offset and size to the
+      // `loopRange.offset/size`.
+      if (isConstantIntValue(tileSize, 0)) {
+        offsets.push_back(loopRange.offset);
+        sizes.push_back(loopRange.size);
+        continue;
+      }
+
+      Value iv = ivs[materializedLoopNum++];
+      OpFoldResult offset = getAsOpFoldResult(iv);
+      offsets.push_back(offset);
+      OpFoldResult size =
+          getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize);
+      sizes.push_back(size);
+    }
+    return {offsets, sizes};
+  }
+}
+
+/// Function to return the bounds of the loops to be generated.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>,
+                  SmallVector<OpFoldResult>>
+getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
+              ArrayRef<OpFoldResult> tileSizes) {
+  SmallVector<OpFoldResult> lbs, ubs, steps;
+  for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
+    // No loop if the tile size is 0.
+    if (isConstantIntValue(tileSize, 0))
+      continue;
+    lbs.push_back(loopRange.offset);
+    ubs.push_back(loopRange.size);
+    steps.push_back(tileSize);
+  }
+  return {lbs, ubs, steps};
 }
 
 /// A function that allows returning additional yielded values during
@@ -152,17 +397,19 @@ static LogicalResult generateLoopNestUsingForOp(
   assert(loopRanges.size() == tileSizes.size() &&
          "expected as many tile sizes as loop ranges");
   OpBuilder::InsertionGuard guard(rewriter);
-  SmallVector<Value> ivs;
 
-  for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
-    // No loops if tile size is zero. Set offset and size to the loop
-    // offset and size.
-    if (isConstantIntValue(tileSize, 0))
-      continue;
+  SmallVector<OpFoldResult> lbs, ubs, steps;
+  std::tie(lbs, ubs, steps) =
+      getLoopBounds(rewriter, loc, loopRanges, tileSizes);
+  SmallVector<Value> lbVals =
+      getValueOrCreateConstantIndexOp(rewriter, loc, lbs);
+  SmallVector<Value> ubVals =
+      getValueOrCreateConstantIndexOp(rewriter, loc, ubs);
+  SmallVector<Value> stepVals =
+      getValueOrCreateConstantIndexOp(rewriter, loc, steps);
 
-    Value lb = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.offset);
-    Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.size);
-    Value step = getValueOrCreateConstantIndexOp(rewriter, loc, tileSize);
+  SmallVector<Value> ivs;
+  for (auto [lb, ub, step] : llvm::zip_equal(lbVals, ubVals, stepVals)) {
     auto loop =
         rewriter.create<scf::ForOp>(loc, lb, ub, step, destinationTensors,
                                     [](OpBuilder &bodyBuilder, Location bodyLoc,
@@ -224,10 +471,9 @@ static LogicalResult generateLoopNestUsingForOp(
 ///    populated.
 static LogicalResult generateLoopNestUsingForallOp(
     RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
-    ArrayRef<OpFoldResult> tileSizes, ArrayRef<Attribute> mappingVector,
-    ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn,
-    SmallVector<LoopLikeOpInterface> &loops) {
-  SmallVector<OpFoldResult> lbs, ubs, steps;
+    ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads,
+    ArrayRef<Attribute> mappingVector, ValueRange destinationTensors,
+    YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
   assert(!loopRanges.empty() && "unexpected empty loop ranges");
   assert(loopRanges.size() == tileSizes.size() &&
          "expected as many tile sizes as loop ranges");
@@ -235,21 +481,30 @@ static LogicalResult generateLoopNestUsingForallOp(
   SmallVector<OpFoldResult> offsets(loopRanges.size()),
       sizes(loopRanges.size());
 
-  for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, loopRanges)) {
-    if (isConstantIntValue(tileSize, 0))
-      continue;
-    lbs.push_back(loopRange.offset);
-    ubs.push_back(loopRange.size);
-    steps.push_back(tileSize);
-  }
-  assert(!lbs.empty() && "Expected at least one loop range");
-
   std::optional<ArrayAttr> mappingAttr;
   if (!mappingVector.empty())
     mappingAttr = rewriter.getArrayAttr(mappingVector);
 
-  auto forallOp = rewriter.create<scf::ForallOp>(
-      loc, lbs, ubs, steps, destinationTensors, mappingAttr);
+  scf::ForallOp forallOp;
+  bool useNumThreads = !numThreads.empty();
+
+  if (useNumThreads) {
+    // Prune the zero numthreads.
+    SmallVector<OpFoldResult> nonZeroNumThreads;
+    for (auto nt : numThreads) {
+      if (isConstantIntValue(nt, 0))
+        continue;
+      nonZeroNumThreads.push_back(nt);
+    }
+    forallOp = rewriter.create<scf::ForallOp>(loc, nonZeroNumThreads,
+                                              destinationTensors, mappingAttr);
+  } else {
+    SmallVector<OpFoldResult> lbs, ubs, steps;
+    std::tie(lbs, ubs, steps) =
+        getLoopBounds(rewriter, loc, loopRanges, tileSizes);
+    forallOp = rewriter.create<scf::ForallOp>(loc, lbs, ubs, steps,
+                                              destinationTensors, mappingAttr);
+  }
   loops.push_back(forallOp);
 
   rewriter.setInsertionPoint(forallOp.getTerminator());
@@ -286,13 +541,11 @@ static LogicalResult generateLoopNestUsingForallOp(
 ///    loop.
 /// - `loops` is an in-out parameter into which the generated loops are
 ///    populated.
-static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
-                                      const scf::SCFTilingOptions &options,
-                                      ArrayRef<Range> loopRanges,
-                                      ArrayRef<OpFoldResult> tileSizes,
-                                      ValueRange destinationTensors,
-                                      YieldTiledValuesFn tiledBodyFn,
-                                      SmallVector<LoopLikeOpInterface> &loops) {
+static LogicalResult generateLoopNest(
+    RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options,
+    ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes,
+    ArrayRef<OpFoldResult> numThreads, ValueRange destinationTensors,
+    YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
   // If the tile sizes are all zero, no loops are generated. Just call the
   // callback function to handle untiled case.
   if (llvm::all_of(tileSizes, isZeroIndex)) {
@@ -307,7 +560,7 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
   }
   if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
     return generateLoopNestUsingForallOp(
-        rewriter, loc, loopRanges, tileSizes, options.mappingVector,
+        rewriter, loc, loopRanges, tileSizes, numThreads, options.mappingVector,
         destinationTensors, tiledBodyFn, loops);
   }
   return rewriter.notifyMatchFailure(loc, "unhandled loop type");
@@ -531,27 +784,25 @@ static LogicalResult addInitOperandsToLoopNest(
 FailureOr<scf::SCFTilingResult>
 mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
                         const scf::SCFTilingOptions &options) {
+  if (failed(verifyTileSizeOptions(rewriter, op.getLoc(), options))) {
+    return failure();
+  }
+
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPointAfter(op);
 
-  if (!options.tileSizeComputationFunction) {
-    return rewriter.notifyMatchFailure(
-        op, "missing tile size computation function");
-  }
-
   // 1. Get the range of the loops that are represented by the operation.
   SmallVector<Range> iterationDomain = op.getIterationDomain(rewriter);
-  size_t numLoops = iterationDomain.size();
 
-  // 2. Materialize the tile sizes. Enforce the convention that "tiling by zero"
-  // skips tiling a particular dimension. This convention is significantly
-  // simpler to handle instead of adjusting affine maps to account for missing
-  // dimensions.
-  SmallVector<OpFoldResult> tileSizes =
-      options.tileSizeComputationFunction(rewriter, op);
-  if (tileSizes.size() < iterationDomain.size()) {
-    auto zero = rewriter.getIndexAttr(0);
-    tileSizes.append(numLoops - tileSizes.size(), zero);
+  // 2. Materialize the tile sizes and/or number of threads;
+  SmallVector<OpFoldResult> tileSizes, numThreads;
+  std::tie(tileSizes, numThreads) =
+      getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options);
+
+  // Check if it is safe to tile. This is hold over from previous iterations
+  // of tile to for-all. Consider dropping it.
+  if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
+    checkSafeToTileToForall(op, tileSizes, numThreads);
   }
 
   // 3. If there is an interchange specified, permute the iteration domain and
@@ -560,16 +811,13 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   if (!options.interchangeVector.empty()) {
     interchangeVector = fillInterchangeVector(options.interchangeVector,
                                               iterationDomain.size());
-  }
-  if (!interchangeVector.empty()) {
-    if (!isPermutationVector(interchangeVector)) {
-      return rewriter.notifyMatchFailure(
-          op, "invalid intechange vector, not a permutation of the entire "
-              "iteration space");
-    }
+    assert(isPermutationVector(interchangeVector) &&
+           "expected interchange vector to be a permutation");
 
     applyPermutationToVector(iterationDomain, interchangeVector);
     applyPermutationToVector(tileSizes, interchangeVector);
+    if (!numThreads.empty())
+      applyPermutationToVector(numThreads, interchangeVector);
   }
 
   FailureOr<TilingResult> tilingResult;
@@ -583,21 +831,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
       -> LogicalResult {
     // 4a. Compute the `offsets` and `sizes` to use for tiling.
     SmallVector<OpFoldResult> offsets, sizes;
-    {
-      int materializedLoopNum = 0;
-      for (auto [tileSize, loopRange] :
-           llvm::zip_equal(tileSizes, iterationDomain)) {
-        if (isConstantIntValue(tileSize, 0)) {
-          offsets.push_back(loopRange.offset);
-          sizes.push_back(loopRange.size);
-          continue;
-        }
-        Value iv = ivs[materializedLoopNum++];
-        offsets.push_back(iv);
-        sizes.push_back(
-            getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize));
-      }
-    }
+    std::tie(offsets, sizes) = getTileOffsetAndSizes(
+        rewriter, loc, ivs, iterationDomain, tileSizes, numThreads);
 
     // 4b. If interchange was provided, apply inverse of the interchange
     //     to get back the offsets/sizes in the order to be specified.
@@ -665,7 +900,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   // 7. Generate the tiled loops nest using the callback defined above.
   SmallVector<LoopLikeOpInterface> loops;
   if (failed(generateLoopNest(rewriter, op.getLoc(), options, iterationDomain,
-                              tileSizes, destinationTensors,
+                              tileSizes, numThreads, destinationTensors,
                               innerYieldTiledValuesFn, loops)))
     return op.emitOpError("failed to generate tiling loops");
   assert(succeeded(tilingResult) &&
@@ -781,6 +1016,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
   scf::SCFTilingOptions options;
   options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
   if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector,
+                              /*numThreads=*/ArrayRef<OpFoldResult>{},
                               initTensors, innerYieldTiledValuesFn, loops)))
     return b.notifyMatchFailure(op, "failed to tile for parallel reduction");
 
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index c0ee9d2afe91c..9df6e24de178f 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -294,8 +294,8 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
 }
 
 /// Returns the trip count of `forOp` if its' low bound, high bound and step are
-/// constants, or optional otherwise. Trip count is computed as ceilDiv(highBound
-/// - lowBound, step).
+/// constants, or optional otherwise. Trip count is computed as
+/// ceilDiv(highBound - lowBound, step).
 static std::optional<int64_t> getConstantTripCount(scf::ForOp forOp) {
   std::optional<int64_t> lbCstOp = getConstantIntValue(forOp.getLowerBound());
   std::optional<int64_t> ubCstOp = getConstantIntValue(forOp.getUpperBound());
@@ -1363,3 +1363,37 @@ scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
 
   return fusedLoop;
 }
+
+FailureOr<scf::ForallOp> mlir::normalizeForallOp(RewriterBase &rewriter,
+                                                 scf::ForallOp forallOp) {
+  SmallVector<OpFoldResult> lbs = forallOp.getMixedLowerBound();
+  SmallVector<OpFoldResult> ubs = forallOp.getMixedUpperBound();
+  SmallVector<OpFoldResult> steps = forallOp.getMixedStep();
+
+  if (llvm::all_of(
+          lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
+      llvm::all_of(
+          steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
+    return forallOp;
+  }
+
+  SmallVector<OpFoldResult> newLbs, newUbs, newSteps;
+  for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
+    Range normalizedLoopParams =
+        emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step);
+    newLbs.push_back(normalizedLoopParams.offset);
+    newUbs.push_back(normalizedLoopParams.size);
+    newSteps.push_back(normalizedLoopParams.stride);
+  }
+
+  auto normalizedForallOp = rewriter.create<scf::ForallOp>(
+      forallOp.getLoc(), newLbs, newUbs, newSteps, forallOp.getOutputs(),
+      forallOp.getMapping(), [](OpBuilder &, Location, ValueRange) {});
+
+  rewriter.inlineRegionBefore(forallOp.getBodyRegion(),
+                              normalizedForallOp.getBodyRegion(),
+                              normalizedForallOp.getBodyRegion().begin());
+
+  rewriter.replaceAllOpUsesWith(forallOp, normalizedForallOp);
+  return success();
+}
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
index 4de9b4729e720..68e0206e30a59 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_dialect_library(MLIRSPIRVConversion
   MLIRDialectUtils
   MLIRFuncDialect
   MLIRIR
+  MLIRPass
   MLIRSPIRVDialect
   MLIRSupport
   MLIRTransformUtils
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index e3a09ef1ff684..d833ec9309baa 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -20,17 +20,21 @@
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
 
 #include <functional>
@@ -40,18 +44,12 @@
 
 using namespace mlir;
 
+namespace {
+
 //===----------------------------------------------------------------------===//
 // Utility functions
 //===----------------------------------------------------------------------===//
 
-static int getComputeVectorSize(int64_t size) {
-  for (int i : {4, 3, 2}) {
-    if (size % i == 0)
-      return i;
-  }
-  return 1;
-}
-
 static std::optional<SmallVector<int64_t>> getTargetShape(VectorType vecType) {
   LLVM_DEBUG(llvm::dbgs() << "Get target shape\n");
   if (vecType.isScalable()) {
@@ -60,8 +58,8 @@ static std::optional<SmallVector<int64_t>> getTargetShape(VectorType vecType) {
     return std::nullopt;
   }
   SmallVector<int64_t> unrollShape = llvm::to_vector<4>(vecType.getShape());
-  std::optional<SmallVector<int64_t>> targetShape =
-      SmallVector<int64_t>(1, getComputeVectorSize(vecType.getShape().back()));
+  std::optional<SmallVector<int64_t>> targetShape = SmallVector<int64_t>(
+      1, mlir::spirv::getComputeVectorSize(vecType.getShape().back()));
   if (!targetShape) {
     LLVM_DEBUG(llvm::dbgs() << "--no unrolling target shape defined\n");
     return std::nullopt;
@@ -171,18 +169,6 @@ static spirv::ScalarType getIndexType(MLIRContext *ctx,
       IntegerType::get(ctx, options.use64bitIndex ? 64 : 32));
 }
 
-Type SPIRVTypeConverter::getIndexType() const {
-  return ::getIndexType(getContext(), options);
-}
-
-MLIRContext *SPIRVTypeConverter::getContext() const {
-  return targetEnv.getAttr().getContext();
-}
-
-bool SPIRVTypeConverter::allows(spirv::Capability capability) const {
-  return targetEnv.allows(capability);
-}
-
 // TODO: This is a utility function that should probably be exposed by the
 // SPIR-V dialect. Keeping it local till the use case arises.
 static std::optional<int64_t>
@@ -673,9 +659,9 @@ static Type convertMemrefType(const spirv::TargetEnv &targetEnv,
 /// This function is meant to handle the **compute** side; so it does not
 /// involve storage classes in its logic. The storage side is expected to be
 /// handled by MemRef conversion logic.
-std::optional<Value> castToSourceType(const spirv::TargetEnv &targetEnv,
-                                      OpBuilder &builder, Type type,
-                                      ValueRange inputs, Location loc) {
+static std::optional<Value> castToSourceType(const spirv::TargetEnv &targetEnv,
+                                             OpBuilder &builder, Type type,
+                                             ValueRange inputs, Location loc) {
   // We can only cast one value in SPIR-V.
   if (inputs.size() != 1) {
     auto castOp = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
@@ -731,140 +717,185 @@ std::optional<Value> castToSourceType(const spirv::TargetEnv &targetEnv,
 }
 
 //===----------------------------------------------------------------------===//
-// SPIRVTypeConverter
+// Builtin Variables
 //===----------------------------------------------------------------------===//
 
-SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr,
-                                       const SPIRVConversionOptions &options)
-    : targetEnv(targetAttr), options(options) {
-  // Add conversions. The order matters here: later ones will be tried earlier.
+static spirv::GlobalVariableOp getBuiltinVariable(Block &body,
+                                                  spirv::BuiltIn builtin) {
+  // Look through all global variables in the given `body` block and check if
+  // there is a spirv.GlobalVariable that has the same `builtin` attribute.
+  for (auto varOp : body.getOps<spirv::GlobalVariableOp>()) {
+    if (auto builtinAttr = varOp->getAttrOfType<StringAttr>(
+            spirv::SPIRVDialect::getAttributeName(
+                spirv::Decoration::BuiltIn))) {
+      auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue());
+      if (varBuiltIn && *varBuiltIn == builtin) {
+        return varOp;
+      }
+    }
+  }
+  return nullptr;
+}
 
-  // Allow all SPIR-V dialect specific types. This assumes all builtin types
-  // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType)
-  // were tried before.
-  //
-  // TODO: This assumes that the SPIR-V types are valid to use in the given
-  // target environment, which should be the case if the whole pipeline is
-  // driven by the same target environment. Still, we probably still want to
-  // validate and convert to be safe.
-  addConversion([](spirv::SPIRVType type) { return type; });
+/// Gets name of global variable for a builtin.
+std::string getBuiltinVarName(spirv::BuiltIn builtin, StringRef prefix,
+                              StringRef suffix) {
+  return Twine(prefix).concat(stringifyBuiltIn(builtin)).concat(suffix).str();
+}
 
-  addConversion([this](IndexType /*indexType*/) { return getIndexType(); });
+/// Gets or inserts a global variable for a builtin within `body` block.
+static spirv::GlobalVariableOp
+getOrInsertBuiltinVariable(Block &body, Location loc, spirv::BuiltIn builtin,
+                           Type integerType, OpBuilder &builder,
+                           StringRef prefix, StringRef suffix) {
+  if (auto varOp = getBuiltinVariable(body, builtin))
+    return varOp;
 
-  addConversion([this](IntegerType intType) -> std::optional<Type> {
-    if (auto scalarType = dyn_cast<spirv::ScalarType>(intType))
-      return convertScalarType(this->targetEnv, this->options, scalarType);
-    if (intType.getWidth() < 8)
-      return convertSubByteIntegerType(this->options, intType);
-    return Type();
-  });
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(&body);
 
-  addConversion([this](FloatType floatType) -> std::optional<Type> {
-    if (auto scalarType = dyn_cast<spirv::ScalarType>(floatType))
-      return convertScalarType(this->targetEnv, this->options, scalarType);
-    return Type();
-  });
+  spirv::GlobalVariableOp newVarOp;
+  switch (builtin) {
+  case spirv::BuiltIn::NumWorkgroups:
+  case spirv::BuiltIn::WorkgroupSize:
+  case spirv::BuiltIn::WorkgroupId:
+  case spirv::BuiltIn::LocalInvocationId:
+  case spirv::BuiltIn::GlobalInvocationId: {
+    auto ptrType = spirv::PointerType::get(VectorType::get({3}, integerType),
+                                           spirv::StorageClass::Input);
+    std::string name = getBuiltinVarName(builtin, prefix, suffix);
+    newVarOp =
+        builder.create<spirv::GlobalVariableOp>(loc, ptrType, name, builtin);
+    break;
+  }
+  case spirv::BuiltIn::SubgroupId:
+  case spirv::BuiltIn::NumSubgroups:
+  case spirv::BuiltIn::SubgroupSize: {
+    auto ptrType =
+        spirv::PointerType::get(integerType, spirv::StorageClass::Input);
+    std::string name = getBuiltinVarName(builtin, prefix, suffix);
+    newVarOp =
+        builder.create<spirv::GlobalVariableOp>(loc, ptrType, name, builtin);
+    break;
+  }
+  default:
+    emitError(loc, "unimplemented builtin variable generation for ")
+        << stringifyBuiltIn(builtin);
+  }
+  return newVarOp;
+}
 
-  addConversion([this](ComplexType complexType) {
-    return convertComplexType(this->targetEnv, this->options, complexType);
-  });
+//===----------------------------------------------------------------------===//
+// Push constant storage
+//===----------------------------------------------------------------------===//
 
-  addConversion([this](VectorType vectorType) {
-    return convertVectorType(this->targetEnv, this->options, vectorType);
-  });
+/// Returns the pointer type for the push constant storage containing
+/// `elementCount` 32-bit integer values.
+static spirv::PointerType getPushConstantStorageType(unsigned elementCount,
+                                                     Builder &builder,
+                                                     Type indexType) {
+  auto arrayType = spirv::ArrayType::get(indexType, elementCount,
+                                         /*stride=*/4);
+  auto structType = spirv::StructType::get({arrayType}, /*offsetInfo=*/0);
+  return spirv::PointerType::get(structType, spirv::StorageClass::PushConstant);
+}
 
-  addConversion([this](TensorType tensorType) {
-    return convertTensorType(this->targetEnv, this->options, tensorType);
-  });
+/// Returns the push constant varible containing `elementCount` 32-bit integer
+/// values in `body`. Returns null op if such an op does not exit.
+static spirv::GlobalVariableOp getPushConstantVariable(Block &body,
+                                                       unsigned elementCount) {
+  for (auto varOp : body.getOps<spirv::GlobalVariableOp>()) {
+    auto ptrType = dyn_cast<spirv::PointerType>(varOp.getType());
+    if (!ptrType)
+      continue;
 
-  addConversion([this](MemRefType memRefType) {
-    return convertMemrefType(this->targetEnv, this->options, memRefType);
-  });
+    // Note that Vulkan requires "There must be no more than one push constant
+    // block statically used per shader entry point." So we should always reuse
+    // the existing one.
+    if (ptrType.getStorageClass() == spirv::StorageClass::PushConstant) {
+      auto numElements = cast<spirv::ArrayType>(
+                             cast<spirv::StructType>(ptrType.getPointeeType())
+                                 .getElementType(0))
+                             .getNumElements();
+      if (numElements == elementCount)
+        return varOp;
+    }
+  }
+  return nullptr;
+}
 
-  // Register some last line of defense casting logic.
-  addSourceMaterialization(
-      [this](OpBuilder &builder, Type type, ValueRange inputs, Location loc) {
-        return castToSourceType(this->targetEnv, builder, type, inputs, loc);
-      });
-  addTargetMaterialization([](OpBuilder &builder, Type type, ValueRange inputs,
-                              Location loc) {
-    auto cast = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
-    return std::optional<Value>(cast.getResult(0));
-  });
+/// Gets or inserts a global variable for push constant storage containing
+/// `elementCount` 32-bit integer values in `block`.
+static spirv::GlobalVariableOp
+getOrInsertPushConstantVariable(Location loc, Block &block,
+                                unsigned elementCount, OpBuilder &b,
+                                Type indexType) {
+  if (auto varOp = getPushConstantVariable(block, elementCount))
+    return varOp;
+
+  auto builder = OpBuilder::atBlockBegin(&block, b.getListener());
+  auto type = getPushConstantStorageType(elementCount, builder, indexType);
+  const char *name = "__push_constant_var__";
+  return builder.create<spirv::GlobalVariableOp>(loc, type, name,
+                                                 /*initializer=*/nullptr);
 }
 
 //===----------------------------------------------------------------------===//
 // func::FuncOp Conversion Patterns
 //===----------------------------------------------------------------------===//
 
-namespace {
 /// A pattern for rewriting function signature to convert arguments of functions
 /// to be of valid SPIR-V types.
-class FuncOpConversion final : public OpConversionPattern<func::FuncOp> {
-public:
+struct FuncOpConversion final : OpConversionPattern<func::FuncOp> {
   using OpConversionPattern<func::FuncOp>::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-} // namespace
-
-LogicalResult
-FuncOpConversion::matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
-                                  ConversionPatternRewriter &rewriter) const {
-  auto fnType = funcOp.getFunctionType();
-  if (fnType.getNumResults() > 1)
-    return failure();
-
-  TypeConverter::SignatureConversion signatureConverter(fnType.getNumInputs());
-  for (const auto &argType : enumerate(fnType.getInputs())) {
-    auto convertedType = getTypeConverter()->convertType(argType.value());
-    if (!convertedType)
-      return failure();
-    signatureConverter.addInputs(argType.index(), convertedType);
-  }
-
-  Type resultType;
-  if (fnType.getNumResults() == 1) {
-    resultType = getTypeConverter()->convertType(fnType.getResult(0));
-    if (!resultType)
+                  ConversionPatternRewriter &rewriter) const override {
+    FunctionType fnType = funcOp.getFunctionType();
+    if (fnType.getNumResults() > 1)
       return failure();
-  }
 
-  // Create the converted spirv.func op.
-  auto newFuncOp = rewriter.create<spirv::FuncOp>(
-      funcOp.getLoc(), funcOp.getName(),
-      rewriter.getFunctionType(signatureConverter.getConvertedTypes(),
-                               resultType ? TypeRange(resultType)
-                                          : TypeRange()));
-
-  // Copy over all attributes other than the function name and type.
-  for (const auto &namedAttr : funcOp->getAttrs()) {
-    if (namedAttr.getName() != funcOp.getFunctionTypeAttrName() &&
-        namedAttr.getName() != SymbolTable::getSymbolAttrName())
-      newFuncOp->setAttr(namedAttr.getName(), namedAttr.getValue());
-  }
+    TypeConverter::SignatureConversion signatureConverter(
+        fnType.getNumInputs());
+    for (const auto &argType : enumerate(fnType.getInputs())) {
+      auto convertedType = getTypeConverter()->convertType(argType.value());
+      if (!convertedType)
+        return failure();
+      signatureConverter.addInputs(argType.index(), convertedType);
+    }
 
-  rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
-                              newFuncOp.end());
-  if (failed(rewriter.convertRegionTypes(
-          &newFuncOp.getBody(), *getTypeConverter(), &signatureConverter)))
-    return failure();
-  rewriter.eraseOp(funcOp);
-  return success();
-}
+    Type resultType;
+    if (fnType.getNumResults() == 1) {
+      resultType = getTypeConverter()->convertType(fnType.getResult(0));
+      if (!resultType)
+        return failure();
+    }
 
-void mlir::populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter,
-                                              RewritePatternSet &patterns) {
-  patterns.add<FuncOpConversion>(typeConverter, patterns.getContext());
-}
+    // Create the converted spirv.func op.
+    auto newFuncOp = rewriter.create<spirv::FuncOp>(
+        funcOp.getLoc(), funcOp.getName(),
+        rewriter.getFunctionType(signatureConverter.getConvertedTypes(),
+                                 resultType ? TypeRange(resultType)
+                                            : TypeRange()));
+
+    // Copy over all attributes other than the function name and type.
+    for (const auto &namedAttr : funcOp->getAttrs()) {
+      if (namedAttr.getName() != funcOp.getFunctionTypeAttrName() &&
+          namedAttr.getName() != SymbolTable::getSymbolAttrName())
+        newFuncOp->setAttr(namedAttr.getName(), namedAttr.getValue());
+    }
 
-//===----------------------------------------------------------------------===//
-// func::FuncOp Conversion Patterns
-//===----------------------------------------------------------------------===//
+    rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
+                                newFuncOp.end());
+    if (failed(rewriter.convertRegionTypes(
+            &newFuncOp.getBody(), *getTypeConverter(), &signatureConverter)))
+      return failure();
+    rewriter.eraseOp(funcOp);
+    return success();
+  }
+};
 
-namespace {
 /// A pattern for rewriting function signature to convert vector arguments of
 /// functions to be of valid types
 struct FuncOpVectorUnroll final : OpRewritePattern<func::FuncOp> {
@@ -1015,17 +1046,11 @@ struct FuncOpVectorUnroll final : OpRewritePattern<func::FuncOp> {
     return success();
   }
 };
-} // namespace
-
-void mlir::populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns) {
-  patterns.add<FuncOpVectorUnroll>(patterns.getContext());
-}
 
 //===----------------------------------------------------------------------===//
 // func::ReturnOp Conversion Patterns
 //===----------------------------------------------------------------------===//
 
-namespace {
 /// A pattern for rewriting function signature and the return op to convert
 /// vectors to be of valid types.
 struct ReturnOpVectorUnroll final : OpRewritePattern<func::ReturnOp> {
@@ -1069,13 +1094,20 @@ struct ReturnOpVectorUnroll final : OpRewritePattern<func::ReturnOp> {
       // the original operand of illegal type.
       auto originalShape =
           llvm::to_vector_of<int64_t, 4>(origVecType.getShape());
-      SmallVector<int64_t> strides(targetShape->size(), 1);
+      SmallVector<int64_t> strides(originalShape.size(), 1);
+      SmallVector<int64_t> extractShape(originalShape.size(), 1);
+      extractShape.back() = targetShape->back();
       SmallVector<Type> newTypes;
       Value returnValue = returnOp.getOperand(origResultNo);
       for (SmallVector<int64_t> offsets :
            StaticTileOffsetRange(originalShape, *targetShape)) {
         Value result = rewriter.create<vector::ExtractStridedSliceOp>(
-            loc, returnValue, offsets, *targetShape, strides);
+            loc, returnValue, offsets, extractShape, strides);
+        if (originalShape.size() > 1) {
+          SmallVector<int64_t> extractIndices(originalShape.size() - 1, 0);
+          result =
+              rewriter.create<vector::ExtractOp>(loc, result, extractIndices);
+        }
         newOperands.push_back(result);
         newTypes.push_back(unrolledType);
       }
@@ -1097,81 +1129,13 @@ struct ReturnOpVectorUnroll final : OpRewritePattern<func::ReturnOp> {
     return success();
   }
 };
-} // namespace
 
-void mlir::populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns) {
-  patterns.add<ReturnOpVectorUnroll>(patterns.getContext());
-}
+} // namespace
 
 //===----------------------------------------------------------------------===//
-// Builtin Variables
+// Public function for builtin variables
 //===----------------------------------------------------------------------===//
 
-static spirv::GlobalVariableOp getBuiltinVariable(Block &body,
-                                                  spirv::BuiltIn builtin) {
-  // Look through all global variables in the given `body` block and check if
-  // there is a spirv.GlobalVariable that has the same `builtin` attribute.
-  for (auto varOp : body.getOps<spirv::GlobalVariableOp>()) {
-    if (auto builtinAttr = varOp->getAttrOfType<StringAttr>(
-            spirv::SPIRVDialect::getAttributeName(
-                spirv::Decoration::BuiltIn))) {
-      auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue());
-      if (varBuiltIn && *varBuiltIn == builtin) {
-        return varOp;
-      }
-    }
-  }
-  return nullptr;
-}
-
-/// Gets name of global variable for a builtin.
-static std::string getBuiltinVarName(spirv::BuiltIn builtin, StringRef prefix,
-                                     StringRef suffix) {
-  return Twine(prefix).concat(stringifyBuiltIn(builtin)).concat(suffix).str();
-}
-
-/// Gets or inserts a global variable for a builtin within `body` block.
-static spirv::GlobalVariableOp
-getOrInsertBuiltinVariable(Block &body, Location loc, spirv::BuiltIn builtin,
-                           Type integerType, OpBuilder &builder,
-                           StringRef prefix, StringRef suffix) {
-  if (auto varOp = getBuiltinVariable(body, builtin))
-    return varOp;
-
-  OpBuilder::InsertionGuard guard(builder);
-  builder.setInsertionPointToStart(&body);
-
-  spirv::GlobalVariableOp newVarOp;
-  switch (builtin) {
-  case spirv::BuiltIn::NumWorkgroups:
-  case spirv::BuiltIn::WorkgroupSize:
-  case spirv::BuiltIn::WorkgroupId:
-  case spirv::BuiltIn::LocalInvocationId:
-  case spirv::BuiltIn::GlobalInvocationId: {
-    auto ptrType = spirv::PointerType::get(VectorType::get({3}, integerType),
-                                           spirv::StorageClass::Input);
-    std::string name = getBuiltinVarName(builtin, prefix, suffix);
-    newVarOp =
-        builder.create<spirv::GlobalVariableOp>(loc, ptrType, name, builtin);
-    break;
-  }
-  case spirv::BuiltIn::SubgroupId:
-  case spirv::BuiltIn::NumSubgroups:
-  case spirv::BuiltIn::SubgroupSize: {
-    auto ptrType =
-        spirv::PointerType::get(integerType, spirv::StorageClass::Input);
-    std::string name = getBuiltinVarName(builtin, prefix, suffix);
-    newVarOp =
-        builder.create<spirv::GlobalVariableOp>(loc, ptrType, name, builtin);
-    break;
-  }
-  default:
-    emitError(loc, "unimplemented builtin variable generation for ")
-        << stringifyBuiltIn(builtin);
-  }
-  return newVarOp;
-}
-
 Value mlir::spirv::getBuiltinVariableValue(Operation *op,
                                            spirv::BuiltIn builtin,
                                            Type integerType, OpBuilder &builder,
@@ -1190,60 +1154,9 @@ Value mlir::spirv::getBuiltinVariableValue(Operation *op,
 }
 
 //===----------------------------------------------------------------------===//
-// Push constant storage
+// Public function for pushing constant storage
 //===----------------------------------------------------------------------===//
 
-/// Returns the pointer type for the push constant storage containing
-/// `elementCount` 32-bit integer values.
-static spirv::PointerType getPushConstantStorageType(unsigned elementCount,
-                                                     Builder &builder,
-                                                     Type indexType) {
-  auto arrayType = spirv::ArrayType::get(indexType, elementCount,
-                                         /*stride=*/4);
-  auto structType = spirv::StructType::get({arrayType}, /*offsetInfo=*/0);
-  return spirv::PointerType::get(structType, spirv::StorageClass::PushConstant);
-}
-
-/// Returns the push constant varible containing `elementCount` 32-bit integer
-/// values in `body`. Returns null op if such an op does not exit.
-static spirv::GlobalVariableOp getPushConstantVariable(Block &body,
-                                                       unsigned elementCount) {
-  for (auto varOp : body.getOps<spirv::GlobalVariableOp>()) {
-    auto ptrType = dyn_cast<spirv::PointerType>(varOp.getType());
-    if (!ptrType)
-      continue;
-
-    // Note that Vulkan requires "There must be no more than one push constant
-    // block statically used per shader entry point." So we should always reuse
-    // the existing one.
-    if (ptrType.getStorageClass() == spirv::StorageClass::PushConstant) {
-      auto numElements = cast<spirv::ArrayType>(
-                             cast<spirv::StructType>(ptrType.getPointeeType())
-                                 .getElementType(0))
-                             .getNumElements();
-      if (numElements == elementCount)
-        return varOp;
-    }
-  }
-  return nullptr;
-}
-
-/// Gets or inserts a global variable for push constant storage containing
-/// `elementCount` 32-bit integer values in `block`.
-static spirv::GlobalVariableOp
-getOrInsertPushConstantVariable(Location loc, Block &block,
-                                unsigned elementCount, OpBuilder &b,
-                                Type indexType) {
-  if (auto varOp = getPushConstantVariable(block, elementCount))
-    return varOp;
-
-  auto builder = OpBuilder::atBlockBegin(&block, b.getListener());
-  auto type = getPushConstantStorageType(elementCount, builder, indexType);
-  const char *name = "__push_constant_var__";
-  return builder.create<spirv::GlobalVariableOp>(loc, type, name,
-                                                 /*initializer=*/nullptr);
-}
-
 Value spirv::getPushConstantValue(Operation *op, unsigned elementCount,
                                   unsigned offset, Type integerType,
                                   OpBuilder &builder) {
@@ -1267,7 +1180,7 @@ Value spirv::getPushConstantValue(Operation *op, unsigned elementCount,
 }
 
 //===----------------------------------------------------------------------===//
-// Index calculation
+// Public functions for index calculation
 //===----------------------------------------------------------------------===//
 
 Value mlir::spirv::linearizeIndex(ValueRange indices, ArrayRef<int64_t> strides,
@@ -1375,6 +1288,193 @@ Value mlir::spirv::getElementPtr(const SPIRVTypeConverter &typeConverter,
                              builder);
 }
 
+//===----------------------------------------------------------------------===//
+// Public functions for vector unrolling
+//===----------------------------------------------------------------------===//
+
+int mlir::spirv::getComputeVectorSize(int64_t size) {
+  for (int i : {4, 3, 2}) {
+    if (size % i == 0)
+      return i;
+  }
+  return 1;
+}
+
+SmallVector<int64_t>
+mlir::spirv::getNativeVectorShapeImpl(vector::ReductionOp op) {
+  VectorType srcVectorType = op.getSourceVectorType();
+  assert(srcVectorType.getRank() == 1); // Guaranteed by semantics
+  int64_t vectorSize =
+      mlir::spirv::getComputeVectorSize(srcVectorType.getDimSize(0));
+  return {vectorSize};
+}
+
+SmallVector<int64_t>
+mlir::spirv::getNativeVectorShapeImpl(vector::TransposeOp op) {
+  VectorType vectorType = op.getResultVectorType();
+  SmallVector<int64_t> nativeSize(vectorType.getRank(), 1);
+  nativeSize.back() =
+      mlir::spirv::getComputeVectorSize(vectorType.getShape().back());
+  return nativeSize;
+}
+
+std::optional<SmallVector<int64_t>>
+mlir::spirv::getNativeVectorShape(Operation *op) {
+  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
+    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
+      SmallVector<int64_t> nativeSize(vecType.getRank(), 1);
+      nativeSize.back() =
+          mlir::spirv::getComputeVectorSize(vecType.getShape().back());
+      return nativeSize;
+    }
+  }
+
+  return TypeSwitch<Operation *, std::optional<SmallVector<int64_t>>>(op)
+      .Case<vector::ReductionOp, vector::TransposeOp>(
+          [](auto typedOp) { return getNativeVectorShapeImpl(typedOp); })
+      .Default([](Operation *) { return std::nullopt; });
+}
+
+LogicalResult mlir::spirv::unrollVectorsInSignatures(Operation *op) {
+  MLIRContext *context = op->getContext();
+  RewritePatternSet patterns(context);
+  populateFuncOpVectorRewritePatterns(patterns);
+  populateReturnOpVectorRewritePatterns(patterns);
+  // We only want to apply signature conversion once to the existing func ops.
+  // Without specifying strictMode, the greedy pattern rewriter will keep
+  // looking for newly created func ops.
+  GreedyRewriteConfig config;
+  config.strictMode = GreedyRewriteStrictness::ExistingOps;
+  return applyPatternsAndFoldGreedily(op, std::move(patterns), config);
+}
+
+LogicalResult mlir::spirv::unrollVectorsInFuncBodies(Operation *op) {
+  MLIRContext *context = op->getContext();
+
+  // Unroll vectors in function bodies to native vector size.
+  {
+    RewritePatternSet patterns(context);
+    auto options = vector::UnrollVectorOptions().setNativeShapeFn(
+        [](auto op) { return mlir::spirv::getNativeVectorShape(op); });
+    populateVectorUnrollPatterns(patterns, options);
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+
+  // Convert transpose ops into extract and insert pairs, in preparation of
+  // further transformations to canonicalize/cancel.
+  {
+    RewritePatternSet patterns(context);
+    auto options = vector::VectorTransformsOptions().setVectorTransposeLowering(
+        vector::VectorTransposeLowering::EltWise);
+    vector::populateVectorTransposeLoweringPatterns(patterns, options);
+    vector::populateVectorShapeCastLoweringPatterns(patterns);
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+
+  // Run canonicalization to cast away leading size-1 dimensions.
+  {
+    RewritePatternSet patterns(context);
+
+    // We need to pull in casting way leading one dims.
+    vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
+    vector::ReductionOp::getCanonicalizationPatterns(patterns, context);
+    vector::TransposeOp::getCanonicalizationPatterns(patterns, context);
+
+    // Decompose different rank insert_strided_slice and n-D
+    // extract_slided_slice.
+    vector::populateVectorInsertExtractStridedSliceDecompositionPatterns(
+        patterns);
+    vector::InsertOp::getCanonicalizationPatterns(patterns, context);
+    vector::ExtractOp::getCanonicalizationPatterns(patterns, context);
+
+    // Trimming leading unit dims may generate broadcast/shape_cast ops. Clean
+    // them up.
+    vector::BroadcastOp::getCanonicalizationPatterns(patterns, context);
+    vector::ShapeCastOp::getCanonicalizationPatterns(patterns, context);
+
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// SPIR-V TypeConverter
+//===----------------------------------------------------------------------===//
+
+SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr,
+                                       const SPIRVConversionOptions &options)
+    : targetEnv(targetAttr), options(options) {
+  // Add conversions. The order matters here: later ones will be tried earlier.
+
+  // Allow all SPIR-V dialect specific types. This assumes all builtin types
+  // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType)
+  // were tried before.
+  //
+  // TODO: This assumes that the SPIR-V types are valid to use in the given
+  // target environment, which should be the case if the whole pipeline is
+  // driven by the same target environment. Still, we probably still want to
+  // validate and convert to be safe.
+  addConversion([](spirv::SPIRVType type) { return type; });
+
+  addConversion([this](IndexType /*indexType*/) { return getIndexType(); });
+
+  addConversion([this](IntegerType intType) -> std::optional<Type> {
+    if (auto scalarType = dyn_cast<spirv::ScalarType>(intType))
+      return convertScalarType(this->targetEnv, this->options, scalarType);
+    if (intType.getWidth() < 8)
+      return convertSubByteIntegerType(this->options, intType);
+    return Type();
+  });
+
+  addConversion([this](FloatType floatType) -> std::optional<Type> {
+    if (auto scalarType = dyn_cast<spirv::ScalarType>(floatType))
+      return convertScalarType(this->targetEnv, this->options, scalarType);
+    return Type();
+  });
+
+  addConversion([this](ComplexType complexType) {
+    return convertComplexType(this->targetEnv, this->options, complexType);
+  });
+
+  addConversion([this](VectorType vectorType) {
+    return convertVectorType(this->targetEnv, this->options, vectorType);
+  });
+
+  addConversion([this](TensorType tensorType) {
+    return convertTensorType(this->targetEnv, this->options, tensorType);
+  });
+
+  addConversion([this](MemRefType memRefType) {
+    return convertMemrefType(this->targetEnv, this->options, memRefType);
+  });
+
+  // Register some last line of defense casting logic.
+  addSourceMaterialization(
+      [this](OpBuilder &builder, Type type, ValueRange inputs, Location loc) {
+        return castToSourceType(this->targetEnv, builder, type, inputs, loc);
+      });
+  addTargetMaterialization([](OpBuilder &builder, Type type, ValueRange inputs,
+                              Location loc) {
+    auto cast = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
+    return std::optional<Value>(cast.getResult(0));
+  });
+}
+
+Type SPIRVTypeConverter::getIndexType() const {
+  return ::getIndexType(getContext(), options);
+}
+
+MLIRContext *SPIRVTypeConverter::getContext() const {
+  return targetEnv.getAttr().getContext();
+}
+
+bool SPIRVTypeConverter::allows(spirv::Capability capability) const {
+  return targetEnv.allows(capability);
+}
+
 //===----------------------------------------------------------------------===//
 // SPIR-V ConversionTarget
 //===----------------------------------------------------------------------===//
@@ -1468,3 +1568,20 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) {
 
   return true;
 }
+
+//===----------------------------------------------------------------------===//
+// Public functions for populating patterns
+//===----------------------------------------------------------------------===//
+
+void mlir::populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter,
+                                              RewritePatternSet &patterns) {
+  patterns.add<FuncOpConversion>(typeConverter, patterns.getContext());
+}
+
+void mlir::populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns) {
+  patterns.add<FuncOpVectorUnroll>(patterns.getContext());
+}
+
+void mlir::populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns) {
+  patterns.add<ReturnOpVectorUnroll>(patterns.getContext());
+}
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 58c3f4c334577..8eb8e579954fa 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -1702,18 +1702,36 @@ struct ShapeOfOpToConstShapeOp : public OpRewritePattern<shape::ShapeOfOp> {
   }
 };
 
-struct ShapeOfWithTensor : public OpRewritePattern<shape::ShapeOfOp> {
+// Canonicalize
+//
+// %0 = tensor.reshape %input(%shape) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+// %1 = shape.shape_of %0 : tensor<*xf32> -> tensor<?xindex>
+//
+// to
+//
+// %0 = tensor.reshape %input(%shape) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+// %1 = %shape
+//
+struct ShapeOfFromReshape : public OpRewritePattern<shape::ShapeOfOp> {
   using OpRewritePattern<shape::ShapeOfOp>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(shape::ShapeOfOp op,
                                 PatternRewriter &rewriter) const override {
-    if (!llvm::isa<ShapedType>(op.getArg().getType()))
-      return failure();
-    if (llvm::isa<ShapedType>(op.getType()))
-      return failure();
-
-    rewriter.replaceOpWithNewOp<shape::ShapeOfOp>(op.getOperation(),
-                                                  op.getArg());
+    auto tensorReshapeOp = op.getArg().getDefiningOp<tensor::ReshapeOp>();
+    if (!tensorReshapeOp)
+      return rewriter.notifyMatchFailure(op, "producer is not tensor.reshape");
+    if (!isa<TensorType>(op.getType()))
+      return rewriter.notifyMatchFailure(op, "result is not a tensor");
+
+    // Operand 'shape' of 'tensor.reshape' may now be used as the result of
+    // 'shape.shape_of'. While its type is guaranteed to be compatible in well-
+    // formed IR, it may not be identical (dynamically vs statically shaped),
+    // in which case it needs to be cast first.
+    Value shape = tensorReshapeOp.getShape();
+    if (op.getType() != shape.getType())
+      shape = rewriter.create<tensor::CastOp>(op.getLoc(), op.getType(), shape);
+
+    rewriter.replaceOp(op, shape);
     return success();
   }
 };
@@ -1753,7 +1771,7 @@ struct ShapeOfCastExtentTensor : public OpRewritePattern<tensor::CastOp> {
 
 void ShapeOfOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                             MLIRContext *context) {
-  patterns.add<ShapeOfCastExtentTensor, ShapeOfWithTensor,
+  patterns.add<ShapeOfCastExtentTensor, ShapeOfFromReshape,
                ExtractFromShapeOfExtentTensor, ShapeOfOpToConstShapeOp>(
       context);
 }
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 616e91ae04055..1135ea32fe1ab 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -2131,10 +2131,82 @@ static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo,
   printLevelRange(p, lo, hi);
 }
 
+/// Parses a list of `optional` defined list in the form of
+/// "(%val0, _, %val1, ...)", where `_` is used to annotate that the
+/// corresponding value is not defined (e.g., to represent an undefined
+/// coordinate in the sparse iteration space).
+static ParseResult parseOptionalDefinedList(
+    OpAsmParser &parser, OperationState &state, I64BitSet &definedSet,
+    SmallVectorImpl<OpAsmParser::Argument> &definedArgs,
+    unsigned maxCnt = std::numeric_limits<unsigned>::max(),
+    OpAsmParser::Delimiter delimiter = OpAsmParser::Delimiter::Paren) {
+  unsigned cnt = 0;
+  ParseResult crdList =
+      parser.parseCommaSeparatedList(delimiter, [&]() -> ParseResult {
+        if (parser.parseOptionalKeyword("_")) {
+          if (parser.parseArgument(definedArgs.emplace_back()))
+            return failure();
+          definedSet.set(cnt);
+        }
+        cnt += 1;
+        return success();
+      });
+
+  if (cnt > maxCnt)
+    return parser.emitError(parser.getNameLoc(),
+                            "parsed more value than expected.");
+
+  if (failed(crdList)) {
+    return parser.emitError(
+        parser.getNameLoc(),
+        "expecting SSA value or \"_\" for level coordinates");
+  }
+  assert(definedArgs.size() == definedSet.count());
+  return success();
+}
+
+static void printOptionalDefinedList(OpAsmPrinter &p, unsigned size,
+                                     Block::BlockArgListType blocksArgs,
+                                     I64BitSet definedSet) {
+  if (definedSet.empty())
+    return;
+
+  for (unsigned i = 0; i < size; i++) {
+    if (definedSet[i]) {
+      p << blocksArgs.front();
+      blocksArgs = blocksArgs.drop_front();
+    } else {
+      p << "_";
+    }
+    if (i != size - 1)
+      p << ", ";
+  }
+  assert(blocksArgs.empty());
+}
+
+static ParseResult
+parseUsedCoordList(OpAsmParser &parser, OperationState &state,
+                   SmallVectorImpl<OpAsmParser::Argument> &coords) {
+  // Parse "at(%crd0, _, ...)"
+  I64BitSet crdUsedLvlSet;
+  if (succeeded(parser.parseOptionalKeyword("at")) &&
+      failed(parseOptionalDefinedList(parser, state, crdUsedLvlSet, coords)))
+    return failure();
+
+  // Always use IndexType for the coordinate.
+  for (auto &coord : coords)
+    coord.type = parser.getBuilder().getIndexType();
+
+  // Set the CrdUsedLvl bitset.
+  state.addAttribute("crdUsedLvls",
+                     parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet));
+  return success();
+}
+
 static ParseResult
-parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
-                     SmallVectorImpl<OpAsmParser::Argument> &iterators,
-                     SmallVectorImpl<OpAsmParser::Argument> &iterArgs) {
+parseSparseIterateLoop(OpAsmParser &parser, OperationState &state,
+                       SmallVectorImpl<OpAsmParser::Argument> &iterators,
+                       SmallVectorImpl<OpAsmParser::Argument> &blockArgs) {
   SmallVector<OpAsmParser::UnresolvedOperand> spaces;
   SmallVector<OpAsmParser::UnresolvedOperand> initArgs;
 
@@ -2148,37 +2220,14 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
         parser.getNameLoc(),
         "mismatch in number of sparse iterators and sparse spaces");
 
-  // Parse "at(%crd0, _, ...)"
-  LevelSet crdUsedLvlSet;
-  bool hasUsedCrds = succeeded(parser.parseOptionalKeyword("at"));
-  unsigned lvlCrdCnt = 0;
-  if (hasUsedCrds) {
-    ParseResult crdList = parser.parseCommaSeparatedList(
-        OpAsmParser::Delimiter::Paren, [&]() -> ParseResult {
-          if (parser.parseOptionalKeyword("_")) {
-            if (parser.parseArgument(iterArgs.emplace_back()))
-              return failure();
-            // Always use IndexType for the coordinate.
-            crdUsedLvlSet.set(lvlCrdCnt);
-            iterArgs.back().type = parser.getBuilder().getIndexType();
-          }
-          lvlCrdCnt += 1;
-          return success();
-        });
-    if (failed(crdList)) {
-      return parser.emitError(
-          parser.getNameLoc(),
-          "expecting SSA value or \"_\" for level coordinates");
-    }
-  }
-  // Set the CrdUsedLvl bitset.
-  state.addAttribute("crdUsedLvls",
-                     parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet));
+  if (failed(parseUsedCoordList(parser, state, blockArgs)))
+    return failure();
+  size_t numCrds = blockArgs.size();
 
   // Parse "iter_args(%arg = %init, ...)"
   bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args"));
   if (hasIterArgs)
-    if (parser.parseAssignmentList(iterArgs, initArgs))
+    if (parser.parseAssignmentList(blockArgs, initArgs))
       return failure();
 
   SmallVector<Type> iterSpaceTps;
@@ -2196,10 +2245,6 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
       return parser.emitError(parser.getNameLoc(),
                               "expected sparse_tensor.iter_space type for "
                               "iteration space operands");
-    if (hasUsedCrds && spaceTp.getSpaceDim() != lvlCrdCnt)
-      return parser.emitError(parser.getNameLoc(),
-                              "mismatch in number of iteration space dimension "
-                              "and specified coordinates");
     it.type = spaceTp.getIteratorType();
   }
 
@@ -2213,9 +2258,68 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
     return failure();
 
   if (hasIterArgs) {
-    unsigned numCrds = crdUsedLvlSet.count();
     // Strip off leading args that used for coordinates.
-    MutableArrayRef args = MutableArrayRef(iterArgs).drop_front(numCrds);
+    MutableArrayRef args = MutableArrayRef(blockArgs).drop_front(numCrds);
+    if (args.size() != initArgs.size() || args.size() != state.types.size()) {
+      return parser.emitError(
+          parser.getNameLoc(),
+          "mismatch in number of iteration arguments and return values");
+    }
+
+    for (auto [it, init, tp] : llvm::zip_equal(args, initArgs, state.types)) {
+      it.type = tp;
+      if (parser.resolveOperand(init, tp, state.operands))
+        return failure();
+    }
+  }
+  return success();
+}
+
+static ParseResult
+parseSparseCoIterateLoop(OpAsmParser &parser, OperationState &state,
+                         SmallVectorImpl<Value> &spacesVals,
+                         SmallVectorImpl<OpAsmParser::Argument> &blockArgs) {
+
+  // Parse "(%spaces, ...)"
+  SmallVector<OpAsmParser::UnresolvedOperand> spaces;
+  if (parser.parseOperandList(spaces, OpAsmParser::Delimiter::Paren))
+    return failure();
+
+  if (failed(parseUsedCoordList(parser, state, blockArgs)))
+    return failure();
+  size_t numCrds = blockArgs.size();
+
+  // Parse "iter_args(%arg = %init, ...)"
+  SmallVector<OpAsmParser::UnresolvedOperand> initArgs;
+  bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args"));
+  if (hasIterArgs)
+    if (parser.parseAssignmentList(blockArgs, initArgs))
+      return failure();
+
+  SmallVector<Type> iterSpaceTps;
+  // parse ": (sparse_tensor.iter_space, ...) -> ret"
+  if (parser.parseColon() || parser.parseLParen() ||
+      parser.parseTypeList(iterSpaceTps) || parser.parseRParen())
+    return failure();
+
+  if (iterSpaceTps.size() != spaces.size())
+    return parser.emitError(parser.getNameLoc(),
+                            "mismatch in number of iteration space operands "
+                            "and iteration space types");
+
+  if (hasIterArgs)
+    if (parser.parseArrowTypeList(state.types))
+      return failure();
+
+  // Resolves input sparse iteration spaces.
+  if (parser.resolveOperands(spaces, iterSpaceTps, parser.getNameLoc(),
+                             spacesVals))
+    return failure();
+  state.operands.append(spacesVals);
+
+  if (hasIterArgs) {
+    // Strip off leading args that used for coordinates.
+    MutableArrayRef args = MutableArrayRef(blockArgs).drop_front(numCrds);
     if (args.size() != initArgs.size() || args.size() != state.types.size()) {
       return parser.emitError(
           parser.getNameLoc(),
@@ -2267,12 +2371,25 @@ LogicalResult ExtractIterSpaceOp::verify() {
   return success();
 }
 
+LogicalResult ExtractValOp::verify() {
+  auto stt = getSparseTensorType(getTensor());
+  auto itTp = getIterator().getType();
+
+  if (stt.getEncoding() != itTp.getEncoding())
+    return emitOpError("mismatch in tensor encoding and iterator encoding.");
+
+  if (stt.getLvlRank() != itTp.getHiLvl())
+    return emitOpError("must use last-level iterator to extract values. ");
+
+  return success();
+}
+
 struct RemoveUnusedLvlCrds : public OpRewritePattern<IterateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(IterateOp iterateOp,
                                 PatternRewriter &rewriter) const override {
-    LevelSet newUsedLvls(0);
+    I64BitSet newUsedLvls(0);
     llvm::BitVector toRemove(iterateOp.getBody()->getNumArguments());
     for (unsigned i = 0, e = iterateOp.getSpaceDim(); i < e; i++) {
       if (auto crd = iterateOp.getLvlCrd(i)) {
@@ -2304,13 +2421,13 @@ void IterateOp::build(OpBuilder &builder, OperationState &odsState,
                       Value iterSpace, ValueRange initArgs) {
   unsigned rank = llvm::cast<IterSpaceType>(iterSpace.getType()).getSpaceDim();
   // All ones.
-  LevelSet set((1 << rank) - 1);
+  I64BitSet set((1 << rank) - 1);
   return build(builder, odsState, iterSpace, initArgs, set);
 }
 
 void IterateOp::build(OpBuilder &builder, OperationState &odsState,
                       Value iterSpace, ValueRange initArgs,
-                      LevelSet crdUsedLvls) {
+                      I64BitSet crdUsedLvls) {
   OpBuilder::InsertionGuard guard(builder);
 
   odsState.addOperands(iterSpace);
@@ -2340,7 +2457,7 @@ ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) {
   OpAsmParser::UnresolvedOperand iterSpace;
 
   SmallVector<OpAsmParser::Argument> iters, iterArgs;
-  if (parseSparseSpaceLoop(parser, result, iters, iterArgs))
+  if (parseSparseIterateLoop(parser, result, iters, iterArgs))
     return failure();
   if (iters.size() != 1)
     return parser.emitError(parser.getNameLoc(),
@@ -2380,51 +2497,39 @@ static void printInitializationList(OpAsmPrinter &p,
   p << ")";
 }
 
-static void printUsedCrdsList(OpAsmPrinter &p, unsigned spaceDim,
-                              Block::BlockArgListType blocksArgs,
-                              LevelSet crdUsedLvls) {
-  if (crdUsedLvls.empty())
-    return;
-
-  p << " at(";
-  for (unsigned i = 0; i < spaceDim; i++) {
-    if (crdUsedLvls[i]) {
-      p << blocksArgs.front();
-      blocksArgs = blocksArgs.drop_front();
-    } else {
-      p << "_";
-    }
-    if (i != spaceDim - 1)
-      p << ", ";
+template <typename SparseLoopOp>
+static LogicalResult verifySparseLoopOp(SparseLoopOp op) {
+  if (op.getInitArgs().size() != op.getNumResults()) {
+    return op.emitOpError(
+        "mismatch in number of loop-carried values and defined values");
   }
-  assert(blocksArgs.empty());
-  p << ")";
+  if (op.getCrdUsedLvls().max() > op.getSpaceDim())
+    return op.emitOpError("required out-of-bound coordinates");
+
+  return success();
 }
 
+LogicalResult IterateOp::verify() { return verifySparseLoopOp(*this); }
+LogicalResult CoIterateOp::verify() { return verifySparseLoopOp(*this); }
+
 void IterateOp::print(OpAsmPrinter &p) {
   p << " " << getIterator() << " in " << getIterSpace();
-  printUsedCrdsList(p, getSpaceDim(), getCrds(), getCrdUsedLvls());
+  if (!getCrdUsedLvls().empty()) {
+    p << " at(";
+    printOptionalDefinedList(p, getSpaceDim(), getCrds(), getCrdUsedLvls());
+    p << ")";
+  }
   printInitializationList(p, getRegionIterArgs(), getInitArgs(), " iter_args");
 
   p << " : " << getIterSpace().getType() << " ";
   if (!getInitArgs().empty())
-    p << "-> (" << getInitArgs().getTypes() << ") ";
+    p.printArrowTypeList(getInitArgs().getTypes());
 
+  p << " ";
   p.printRegion(getRegion(), /*printEntryBlockArgs=*/false,
                 /*printBlockTerminators=*/!getInitArgs().empty());
 }
 
-LogicalResult IterateOp::verify() {
-  if (getInitArgs().size() != getNumResults()) {
-    return emitOpError(
-        "mismatch in number of loop-carried values and defined values");
-  }
-  if (getCrdUsedLvls().max() > getSpaceDim())
-    return emitOpError("required out-of-bound coordinates");
-
-  return success();
-}
-
 LogicalResult IterateOp::verifyRegions() {
   if (getIterator().getType() != getIterSpace().getType().getIteratorType())
     return emitOpError("mismatch in iterator and iteration space type");
@@ -2482,13 +2587,136 @@ OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) {
 
 void IterateOp::getSuccessorRegions(RegionBranchPoint point,
                                     SmallVectorImpl<RegionSuccessor> &regions) {
-  // Both the operation itself and the region may be branching into the body or
-  // back into the operation itself.
+  // Both the operation itself and the region may be branching into the body
+  // or back into the operation itself.
   regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
   // It is possible for loop not to enter the body.
   regions.push_back(RegionSuccessor(getResults()));
 }
 
+ParseResult CoIterateOp::parse(OpAsmParser &parser, OperationState &result) {
+
+  SmallVector<Value> spaces;
+  // The block argument list of each regions, it is arranged in the order of
+  // ([used coordinate list], [loop iterations args], [sparse iterator list]).
+  SmallVector<OpAsmParser::Argument> blockArgs;
+  if (parseSparseCoIterateLoop(parser, result, spaces, blockArgs))
+    return failure();
+
+  result.addAttribute("operandSegmentSizes",
+                      parser.getBuilder().getDenseI32ArrayAttr(
+                          {static_cast<int32_t>(spaces.size()),
+                           static_cast<int32_t>(result.types.size())}));
+
+  SmallVector<Attribute> cases;
+  while (succeeded(parser.parseOptionalKeyword("case"))) {
+    // Parse one region per case.
+    I64BitSet definedItSet;
+    SmallVector<OpAsmParser::Argument> definedIts;
+    if (parseOptionalDefinedList(parser, result, definedItSet, definedIts,
+                                 spaces.size(), OpAsmParser::Delimiter::None))
+      return failure();
+
+    cases.push_back(parser.getBuilder().getI64IntegerAttr(definedItSet));
+
+    for (auto [i, definedIdx] : llvm::enumerate(definedItSet.bits())) {
+      // Resolve the iterator type based on the iteration space type.
+      auto spaceTp = llvm::cast<IterSpaceType>(spaces[definedIdx].getType());
+      definedIts[i].type = spaceTp.getIteratorType();
+    }
+    definedIts.insert(definedIts.begin(), blockArgs.begin(), blockArgs.end());
+    Region *body = result.addRegion();
+    if (parser.parseRegion(*body, definedIts))
+      return failure();
+
+    CoIterateOp::ensureTerminator(*body, parser.getBuilder(), result.location);
+  }
+
+  result.addAttribute("cases", ArrayAttr::get(parser.getContext(), cases));
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  return success();
+}
+
+void CoIterateOp::print(OpAsmPrinter &p) {
+  p << " (";
+  llvm::interleaveComma(getIterSpaces(), p, [&](auto s) { p << s; });
+  p << ")";
+
+  if (!getCrdUsedLvls().empty()) {
+    p << " at(";
+    printOptionalDefinedList(p, getSpaceDim(), getCrds(0), getCrdUsedLvls());
+    p << ")";
+  }
+
+  printInitializationList(p, getRegionIterArgs(0), getInitArgs(), " iter_args");
+
+  p << " : (" << getIterSpaces().getTypes() << ")";
+  if (!getInitArgs().empty())
+    p.printArrowTypeList(getInitArgs().getTypes());
+
+  for (unsigned idx = 0, e = getRegions().size(); idx < e; idx++) {
+    p.printNewline();
+    p << "case ";
+    printOptionalDefinedList(p, getIterSpaces().size(), getRegionIterators(idx),
+                             getRegionDefinedSpace(idx));
+    p << " ";
+    p.printRegion(getRegion(idx), /*printEntryBlockArgs=*/false,
+                  /*printBlockTerminators=*/!getInitArgs().empty());
+  }
+}
+
+ValueRange CoIterateOp::getYieldedValues(unsigned regionIdx) {
+  return cast<sparse_tensor::YieldOp>(
+             getRegion(regionIdx).getBlocks().front().getTerminator())
+      .getResults();
+}
+
+LogicalResult CoIterateOp::verifyRegions() {
+  for (unsigned r = 0, e = getNumRegions(); r < e; r++) {
+    if (getNumRegionIterArgs(r) != getNumResults())
+      return emitOpError(
+          "mismatch in number of basic block args and defined values");
+
+    auto initArgs = getInitArgs();
+    auto iterArgs = getRegionIterArgs(r);
+    auto yieldVals = getYieldedValues(r);
+    auto opResults = getResults();
+    if (!llvm::all_equal({initArgs.size(), iterArgs.size(), yieldVals.size(),
+                          opResults.size()})) {
+      return emitOpError()
+             << "number mismatch between iter args and results on " << r
+             << "th region";
+    }
+
+    for (auto [i, init, iter, yield, ret] :
+         llvm::enumerate(initArgs, iterArgs, yieldVals, opResults)) {
+      if (init.getType() != ret.getType())
+        return emitOpError()
+               << "types mismatch between " << i
+               << "th iter operand and defined value on " << r << "th region";
+      if (iter.getType() != ret.getType())
+        return emitOpError() << "types mismatch between " << i
+                             << "th iter region arg and defined value on " << r
+                             << "th region";
+      if (yield.getType() != ret.getType())
+        return emitOpError()
+               << "types mismatch between " << i
+               << "th yield value and defined value on " << r << "th region";
+    }
+  }
+
+  auto cases = getRegionDefinedSpaces();
+  llvm::SmallSetVector<uint64_t, 8> set(cases.begin(), cases.end());
+  if (set.size() != getNumRegions())
+    return emitOpError("contains duplicated cases.");
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Sparse Tensor Dialect Setups.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
index 1d614b7b29361..b1451dee738ac 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
@@ -2,6 +2,7 @@
 #include "Utils/CodegenUtils.h"
 #include "Utils/SparseTensorIterator.h"
 
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
@@ -10,8 +11,8 @@
 using namespace mlir;
 using namespace mlir::sparse_tensor;
 
-void convertLevelType(SparseTensorEncodingAttr enc, Level lvl,
-                      SmallVectorImpl<Type> &fields) {
+static void convertLevelType(SparseTensorEncodingAttr enc, Level lvl,
+                             SmallVectorImpl<Type> &fields) {
   // Position and coordinate buffer in the sparse structure.
   if (enc.getLvlType(lvl).isWithPosLT())
     fields.push_back(enc.getPosMemRefType());
@@ -71,6 +72,21 @@ class ExtractIterSpaceConverter
   }
 };
 
+/// Sparse codegen rule for number of entries operator.
+class ExtractValOpConverter : public OneToNOpConversionPattern<ExtractValOp> {
+public:
+  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ExtractValOp op, OpAdaptor adaptor,
+                  OneToNPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Value pos = adaptor.getIterator().back();
+    Value valBuf = rewriter.create<ToValuesOp>(loc, op.getTensor());
+    rewriter.replaceOpWithNewOp<memref::LoadOp>(op, valBuf, pos);
+    return success();
+  }
+};
+
 class SparseIterateOpConverter : public OneToNOpConversionPattern<IterateOp> {
 public:
   using OneToNOpConversionPattern::OneToNOpConversionPattern;
@@ -193,6 +209,6 @@ void mlir::populateLowerSparseIterationToSCFPatterns(
     TypeConverter &converter, RewritePatternSet &patterns) {
 
   IterateOp::getCanonicalizationPatterns(patterns, patterns.getContext());
-  patterns.add<ExtractIterSpaceConverter, SparseIterateOpConverter>(
-      converter, patterns.getContext());
+  patterns.add<ExtractIterSpaceConverter, ExtractValOpConverter,
+               SparseIterateOpConverter>(converter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
index 924046fcd9961..f85c4761a8d52 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
@@ -141,10 +141,10 @@ void collapseSparseSpace(MutableArrayRef<CollapseSpaceInfo> toCollapse) {
   auto cloned = llvm::cast<IterateOp>(builder.clone(*innermost, mapper));
   builder.setInsertionPointToStart(cloned.getBody());
 
-  LevelSet crdUsedLvls;
+  I64BitSet crdUsedLvls;
   unsigned shift = 0, argIdx = 1;
   for (auto info : toCollapse.drop_back()) {
-    LevelSet set = info.loop.getCrdUsedLvls();
+    I64BitSet set = info.loop.getCrdUsedLvls();
     crdUsedLvls |= set.lshift(shift);
     shift += info.loop.getSpaceDim();
     for (BlockArgument crd : info.loop.getCrds()) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index c612a52aa8d50..08fc104fcbeea 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -357,6 +357,9 @@ static Value genSubscript(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
     const auto pos = env.emitter().getValPosits(tid);
     assert(!pos.empty());
     args.append(pos);
+    // Simply returns the tensor to extract value using iterators.
+    if (env.options().sparseEmitStrategy == SparseEmitStrategy::kSparseIterator)
+      return t->get();
   } else {
     // For dense tensors we push all level's coordinates onto `args`.
     const Level lvlRank = stt.getLvlRank();
@@ -512,9 +515,16 @@ static Value genTensorLoad(CodegenEnv &env, OpBuilder &builder, ExprId exp) {
       return genInsertionLoadReduce(env, builder, t);
     return genInsertionLoad(env, builder, t);
   }
+
   // Actual load.
   SmallVector<Value> args;
   Value ptr = genSubscript(env, builder, t, args);
+  if (llvm::isa<TensorType>(ptr.getType())) {
+    assert(env.options().sparseEmitStrategy ==
+               SparseEmitStrategy::kSparseIterator &&
+           args.size() == 1);
+    return builder.create<ExtractValOp>(loc, ptr, args.front());
+  }
   return builder.create<memref::LoadOp>(loc, ptr, args);
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 2a884b10e36b0..f3e73e4692c1f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -221,6 +221,11 @@ class LoopEmitter {
   /// Getters.
   ///
   SmallVector<Value> getValPosits(TensorId tid) const {
+    // Returns the iterator if we are generating sparse (co)iterate-based loops.
+    if (emitStrategy == SparseEmitStrategy::kSparseIterator)
+      return {spIterVals[tid].back()};
+
+    // Returns {[batch coords], last-level position}.
     SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds();
     Value lastLvlPos = iters[tid].back().back()->getCurPosition().front();
     batchCrds.push_back(lastLvlPos);
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 0e840da9530ed..0751ffc419cbf 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1585,13 +1585,25 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
           getResult().getType()))
     return reshapedSource;
 
+  // If the producer of operand 'source' is another 'tensor.reshape' op, use the
+  // producer's input instead as the original tensor to reshape. This could
+  // render such producer dead code.
+  if (auto reshapeOpProducer = getSource().getDefiningOp<ReshapeOp>()) {
+    getSourceMutable().assign(reshapeOpProducer.getSource());
+    return getResult();
+  }
+
   auto source = getSource();
   auto sourceTy = dyn_cast<RankedTensorType>(source.getType());
   auto resultTy = dyn_cast<RankedTensorType>(getType());
-
   if (!sourceTy || !resultTy || sourceTy != resultTy)
     return {};
 
+  // If the source and result are both 1D tensors and have the same type, the
+  // reshape has no effect, even if the tensor is dynamically shaped.
+  if (sourceTy.getRank() == 1)
+    return source;
+
   if (auto fromElements = getShape().getDefiningOp<tensor::FromElementsOp>()) {
     auto elements = fromElements.getElements();
     bool dynamicNoop =
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 866ab0d2228f7..da9a93feac4d6 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -764,7 +764,8 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) {
           llvm::cast<IntegerType>(outETy).getIntOrFloatBitWidth(), unsign);
       auto floatVal = operand.getSplatValue<APFloat>();
       bool exact;
-      floatVal.convertToInteger(intVal, llvm::RoundingMode::TowardZero, &exact);
+      floatVal.convertToInteger(intVal, llvm::RoundingMode::NearestTiesToEven,
+                                &exact);
       return SplatElementsAttr::get(outTy, intVal);
     }
 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 3d6a9a18d9f41..c4238080533be 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -1396,11 +1396,11 @@ transform::ForeachOp::apply(transform::TransformRewriter &rewriter,
   SmallVector<SmallVector<MappedValue>> payloads;
   detail::prepareValueMappings(payloads, getTargets(), state);
   size_t numIterations = payloads.empty() ? 0 : payloads.front().size();
-  bool isZipShortest = getZipShortest();
+  bool withZipShortest = getWithZipShortest();
 
   // In case of `zip_shortest`, set the number of iterations to the
   // smallest payload in the targets.
-  if (isZipShortest) {
+  if (withZipShortest) {
     numIterations =
         llvm::min_element(payloads, [&](const SmallVector<MappedValue> &A,
                                         const SmallVector<MappedValue> &B) {
@@ -1414,7 +1414,7 @@ transform::ForeachOp::apply(transform::TransformRewriter &rewriter,
   // As we will be "zipping" over them, check all payloads have the same size.
   // `zip_shortest` adjusts all payloads to the same size, so skip this check
   // when true.
-  for (size_t argIdx = 1; !isZipShortest && argIdx < payloads.size();
+  for (size_t argIdx = 1; !withZipShortest && argIdx < payloads.size();
        argIdx++) {
     if (payloads[argIdx].size() != numIterations) {
       return emitSilenceableError()
diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index aba225be720c3..108839a4d90e9 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -252,6 +252,32 @@ mlir::computePermutationVector(int64_t permSize, ArrayRef<int64_t> positions,
   return res;
 }
 
+SmallVector<int64_t> mlir::dropDims(ArrayRef<int64_t> inputPerm,
+                                    ArrayRef<int64_t> dropPositions) {
+  assert(inputPerm.size() >= dropPositions.size() &&
+         "expect inputPerm size large than position to drop");
+  SmallVector<int64_t> res;
+  unsigned permSize = inputPerm.size();
+  for (unsigned inputIndex = 0; inputIndex < permSize; ++inputIndex) {
+    int64_t targetIndex = inputPerm[inputIndex];
+    bool shouldDrop = false;
+    unsigned dropSize = dropPositions.size();
+    for (unsigned dropIndex = 0; dropIndex < dropSize; dropIndex++) {
+      if (dropPositions[dropIndex] == inputPerm[inputIndex]) {
+        shouldDrop = true;
+        break;
+      }
+      if (dropPositions[dropIndex] < inputPerm[inputIndex]) {
+        targetIndex--;
+      }
+    }
+    if (!shouldDrop) {
+      res.push_back(targetIndex);
+    }
+  }
+  return res;
+}
+
 SmallVector<int64_t> mlir::getI64SubArray(ArrayAttr arrayAttr,
                                           unsigned dropFront,
                                           unsigned dropBack) {
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index df3a59ed80ad4..5047bd925d4c5 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -88,15 +88,14 @@ static MaskFormat getMaskFormat(Value mask) {
     // Inspect constant mask index. If the index exceeds the
     // dimension size, all bits are set. If the index is zero
     // or less, no bits are set.
-    ArrayAttr masks = m.getMaskDimSizes();
+    ArrayRef<int64_t> masks = m.getMaskDimSizes();
     auto shape = m.getType().getShape();
     bool allTrue = true;
     bool allFalse = true;
     for (auto [maskIdx, dimSize] : llvm::zip_equal(masks, shape)) {
-      int64_t i = llvm::cast<IntegerAttr>(maskIdx).getInt();
-      if (i < dimSize)
+      if (maskIdx < dimSize)
         allTrue = false;
-      if (i > 0)
+      if (maskIdx > 0)
         allFalse = false;
     }
     if (allTrue)
@@ -2465,11 +2464,6 @@ void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results,
 // ShuffleOp
 //===----------------------------------------------------------------------===//
 
-void ShuffleOp::build(OpBuilder &builder, OperationState &result, Value v1,
-                      Value v2, ArrayRef<int64_t> mask) {
-  build(builder, result, v1, v2, getVectorSubscriptAttr(builder, mask));
-}
-
 LogicalResult ShuffleOp::verify() {
   VectorType resultType = getResultVectorType();
   VectorType v1Type = getV1VectorType();
@@ -2492,8 +2486,8 @@ LogicalResult ShuffleOp::verify() {
       return emitOpError("dimension mismatch");
   }
   // Verify mask length.
-  auto maskAttr = getMask().getValue();
-  int64_t maskLength = maskAttr.size();
+  ArrayRef<int64_t> mask = getMask();
+  int64_t maskLength = mask.size();
   if (maskLength <= 0)
     return emitOpError("invalid mask length");
   if (maskLength != resultType.getDimSize(0))
@@ -2501,10 +2495,9 @@ LogicalResult ShuffleOp::verify() {
   // Verify all indices.
   int64_t indexSize = (v1Type.getRank() == 0 ? 1 : v1Type.getDimSize(0)) +
                       (v2Type.getRank() == 0 ? 1 : v2Type.getDimSize(0));
-  for (const auto &en : llvm::enumerate(maskAttr)) {
-    auto attr = llvm::dyn_cast<IntegerAttr>(en.value());
-    if (!attr || attr.getInt() < 0 || attr.getInt() >= indexSize)
-      return emitOpError("mask index #") << (en.index() + 1) << " out of range";
+  for (auto [idx, maskPos] : llvm::enumerate(mask)) {
+    if (maskPos < 0 || maskPos >= indexSize)
+      return emitOpError("mask index #") << (idx + 1) << " out of range";
   }
   return success();
 }
@@ -2528,13 +2521,12 @@ ShuffleOp::inferReturnTypes(MLIRContext *, std::optional<Location>,
   return success();
 }
 
-static bool isStepIndexArray(ArrayAttr idxArr, uint64_t begin, size_t width) {
-  uint64_t expected = begin;
-  return idxArr.size() == width &&
-         llvm::all_of(idxArr.getAsValueRange<IntegerAttr>(),
-                      [&expected](auto attr) {
-                        return attr.getZExtValue() == expected++;
-                      });
+template <typename T>
+static bool isStepIndexArray(ArrayRef<T> idxArr, uint64_t begin, size_t width) {
+  T expected = begin;
+  return idxArr.size() == width && llvm::all_of(idxArr, [&expected](T value) {
+           return value == expected++;
+         });
 }
 
 OpFoldResult vector::ShuffleOp::fold(FoldAdaptor adaptor) {
@@ -2569,8 +2561,7 @@ OpFoldResult vector::ShuffleOp::fold(FoldAdaptor adaptor) {
   SmallVector<Attribute> results;
   auto lhsElements = llvm::cast<DenseElementsAttr>(lhs).getValues<Attribute>();
   auto rhsElements = llvm::cast<DenseElementsAttr>(rhs).getValues<Attribute>();
-  for (const auto &index : this->getMask().getAsValueRange<IntegerAttr>()) {
-    int64_t i = index.getZExtValue();
+  for (int64_t i : this->getMask()) {
     if (i >= lhsSize) {
       results.push_back(rhsElements[i - lhsSize]);
     } else {
@@ -2591,13 +2582,13 @@ struct Canonicalize0DShuffleOp : public OpRewritePattern<ShuffleOp> {
   LogicalResult matchAndRewrite(ShuffleOp shuffleOp,
                                 PatternRewriter &rewriter) const override {
     VectorType v1VectorType = shuffleOp.getV1VectorType();
-    ArrayAttr mask = shuffleOp.getMask();
+    ArrayRef<int64_t> mask = shuffleOp.getMask();
     if (v1VectorType.getRank() > 0)
       return failure();
     if (mask.size() != 1)
       return failure();
     VectorType resType = VectorType::Builder(v1VectorType).setShape({1});
-    if (llvm::cast<IntegerAttr>(mask[0]).getInt() == 0)
+    if (mask[0] == 0)
       rewriter.replaceOpWithNewOp<vector::BroadcastOp>(shuffleOp, resType,
                                                        shuffleOp.getV1());
     else
@@ -2652,11 +2643,11 @@ class ShuffleInterleave : public OpRewritePattern<ShuffleOp> {
           op, "ShuffleOp types don't match an interleave");
     }
 
-    ArrayAttr shuffleMask = op.getMask();
+    ArrayRef<int64_t> shuffleMask = op.getMask();
     int64_t resultVectorSize = resultType.getNumElements();
     for (int i = 0, e = resultVectorSize / 2; i < e; ++i) {
-      int64_t maskValueA = cast<IntegerAttr>(shuffleMask[i * 2]).getInt();
-      int64_t maskValueB = cast<IntegerAttr>(shuffleMask[(i * 2) + 1]).getInt();
+      int64_t maskValueA = shuffleMask[i * 2];
+      int64_t maskValueB = shuffleMask[(i * 2) + 1];
       if (maskValueA != i || maskValueB != (resultVectorSize / 2) + i)
         return rewriter.notifyMatchFailure(op,
                                            "ShuffleOp mask not interleaving");
@@ -3593,8 +3584,7 @@ class StridedSliceConstantMaskFolder final
     if (extractStridedSliceOp.hasNonUnitStrides())
       return failure();
     // Gather constant mask dimension sizes.
-    SmallVector<int64_t, 4> maskDimSizes;
-    populateFromInt64AttrArray(constantMaskOp.getMaskDimSizes(), maskDimSizes);
+    ArrayRef<int64_t> maskDimSizes = constantMaskOp.getMaskDimSizes();
     // Gather strided slice offsets and sizes.
     SmallVector<int64_t, 4> sliceOffsets;
     populateFromInt64AttrArray(extractStridedSliceOp.getOffsets(),
@@ -3625,7 +3615,7 @@ class StridedSliceConstantMaskFolder final
     // region.
     rewriter.replaceOpWithNewOp<ConstantMaskOp>(
         extractStridedSliceOp, extractStridedSliceOp.getResult().getType(),
-        vector::getVectorSubscriptAttr(rewriter, sliceMaskDimSizes));
+        sliceMaskDimSizes);
     return success();
   }
 };
@@ -5238,6 +5228,16 @@ static LogicalResult verifyVectorShapeCast(Operation *op,
     if (!isValidShapeCast(resultShape, sourceShape))
       return op->emitOpError("invalid shape cast");
   }
+
+  // Check that (non-)scalability is preserved
+  int64_t sourceNScalableDims = sourceVectorType.getNumScalableDims();
+  int64_t resultNScalableDims = resultVectorType.getNumScalableDims();
+  if (sourceNScalableDims != resultNScalableDims)
+    return op->emitOpError("different number of scalable dims at source (")
+           << sourceNScalableDims << ") and result (" << resultNScalableDims
+           << ")";
+  sourceVectorType.getNumDynamicDims();
+
   return success();
 }
 
@@ -5400,21 +5400,19 @@ class ShapeCastCreateMaskFolderTrailingOneDim final
     }
 
     if (constantMaskOp) {
-      auto maskDimSizes = constantMaskOp.getMaskDimSizes().getValue();
+      auto maskDimSizes = constantMaskOp.getMaskDimSizes();
       auto numMaskOperands = maskDimSizes.size();
 
       // Check every mask dim size to see whether it can be dropped
       for (size_t i = numMaskOperands - 1; i >= numMaskOperands - numDimsToDrop;
            --i) {
-        if (cast<IntegerAttr>(maskDimSizes[i]).getValue() != 1)
+        if (maskDimSizes[i] != 1)
           return failure();
       }
 
       auto newMaskOperands = maskDimSizes.drop_back(numDimsToDrop);
-      ArrayAttr newMaskOperandsAttr = rewriter.getArrayAttr(newMaskOperands);
-
       rewriter.replaceOpWithNewOp<vector::ConstantMaskOp>(shapeOp, shapeOpResTy,
-                                                          newMaskOperandsAttr);
+                                                          newMaskOperands);
       return success();
     }
 
@@ -5794,12 +5792,10 @@ class FoldTransposeCreateMask final : public OpRewritePattern<TransposeOp> {
 
     // ConstantMaskOp case.
     auto maskDimSizes = constantMaskOp.getMaskDimSizes();
-    SmallVector<Attribute> newMaskDimSizes(maskDimSizes.getValue());
-    applyPermutationToVector(newMaskDimSizes, permutation);
+    auto newMaskDimSizes = applyPermutation(maskDimSizes, permutation);
 
     rewriter.replaceOpWithNewOp<vector::ConstantMaskOp>(
-        transpOp, transpOp.getResultVectorType(),
-        ArrayAttr::get(transpOp.getContext(), newMaskDimSizes));
+        transpOp, transpOp.getResultVectorType(), newMaskDimSizes);
     return success();
   }
 };
@@ -5822,7 +5818,7 @@ LogicalResult ConstantMaskOp::verify() {
   if (resultType.getRank() == 0) {
     if (getMaskDimSizes().size() != 1)
       return emitError("array attr must have length 1 for 0-D vectors");
-    auto dim = llvm::cast<IntegerAttr>(getMaskDimSizes()[0]).getInt();
+    auto dim = getMaskDimSizes()[0];
     if (dim != 0 && dim != 1)
       return emitError("mask dim size must be either 0 or 1 for 0-D vectors");
     return success();
@@ -5836,9 +5832,8 @@ LogicalResult ConstantMaskOp::verify() {
   // result dimension size.
   auto resultShape = resultType.getShape();
   auto resultScalableDims = resultType.getScalableDims();
-  SmallVector<int64_t, 4> maskDimSizes;
-  for (const auto [index, intAttr] : llvm::enumerate(getMaskDimSizes())) {
-    int64_t maskDimSize = llvm::cast<IntegerAttr>(intAttr).getInt();
+  ArrayRef<int64_t> maskDimSizes = getMaskDimSizes();
+  for (const auto [index, maskDimSize] : llvm::enumerate(maskDimSizes)) {
     if (maskDimSize < 0 || maskDimSize > resultShape[index])
       return emitOpError(
           "array attr of size out of bounds of vector result dimension size");
@@ -5846,7 +5841,6 @@ LogicalResult ConstantMaskOp::verify() {
         maskDimSize != resultShape[index])
       return emitOpError(
           "only supports 'none set' or 'all set' scalable dimensions");
-    maskDimSizes.push_back(maskDimSize);
   }
   // Verify that if one mask dim size is zero, they all should be zero (because
   // the mask region is a conjunction of each mask dimension interval).
@@ -5863,11 +5857,10 @@ bool ConstantMaskOp::isAllOnesMask() {
   // Check the corner case of 0-D vectors first.
   if (resultType.getRank() == 0) {
     assert(getMaskDimSizes().size() == 1 && "invalid sizes for zero rank mask");
-    return llvm::cast<IntegerAttr>(getMaskDimSizes()[0]).getInt() == 1;
+    return getMaskDimSizes()[0] == 1;
   }
-  for (const auto [resultSize, intAttr] :
+  for (const auto [resultSize, maskDimSize] :
        llvm::zip_equal(resultType.getShape(), getMaskDimSizes())) {
-    int64_t maskDimSize = llvm::cast<IntegerAttr>(intAttr).getInt();
     if (maskDimSize < resultSize)
       return false;
   }
@@ -5997,9 +5990,8 @@ class CreateMaskFolder final : public OpRewritePattern<CreateMaskOp> {
     }
 
     // Replace 'createMaskOp' with ConstantMaskOp.
-    rewriter.replaceOpWithNewOp<ConstantMaskOp>(
-        createMaskOp, retTy,
-        vector::getVectorSubscriptAttr(rewriter, maskDimSizes));
+    rewriter.replaceOpWithNewOp<ConstantMaskOp>(createMaskOp, retTy,
+                                                maskDimSizes);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
index dfeb7bc53adad..bfc05c71f5340 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
@@ -111,7 +111,7 @@ class ConstantMaskOpLowering : public OpRewritePattern<vector::ConstantMaskOp> {
     if (rank == 0) {
       assert(dimSizes.size() == 1 &&
              "Expected exactly one dim size for a 0-D vector");
-      bool value = cast<IntegerAttr>(dimSizes[0]).getInt() == 1;
+      bool value = dimSizes.front() == 1;
       rewriter.replaceOpWithNewOp<arith::ConstantOp>(
           op, dstType,
           DenseIntElementsAttr::get(VectorType::get({}, rewriter.getI1Type()),
@@ -119,7 +119,7 @@ class ConstantMaskOpLowering : public OpRewritePattern<vector::ConstantMaskOp> {
       return success();
     }
 
-    int64_t trueDimSize = cast<IntegerAttr>(dimSizes[0]).getInt();
+    int64_t trueDimSize = dimSizes.front();
 
     if (rank == 1) {
       if (trueDimSize == 0 || trueDimSize == dstType.getDimSize(0)) {
@@ -147,7 +147,7 @@ class ConstantMaskOpLowering : public OpRewritePattern<vector::ConstantMaskOp> {
 
     VectorType lowType = VectorType::Builder(dstType).dropDim(0);
     Value trueVal = rewriter.create<vector::ConstantMaskOp>(
-        loc, lowType, rewriter.getArrayAttr(dimSizes.getValue().drop_front()));
+        loc, lowType, dimSizes.drop_front());
     Value result = rewriter.create<arith::ConstantOp>(
         loc, dstType, rewriter.getZeroAttr(dstType));
     for (int64_t d = 0; d < trueDimSize; d++)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
index 7ed3dea42b771..42ac717b44c4b 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
@@ -550,9 +550,7 @@ struct CastAwayConstantMaskLeadingOneDim
       return failure();
 
     int64_t dropDim = oldType.getRank() - newType.getRank();
-    SmallVector<int64_t> dimSizes;
-    for (auto attr : mask.getMaskDimSizes())
-      dimSizes.push_back(llvm::cast<IntegerAttr>(attr).getInt());
+    ArrayRef<int64_t> dimSizes = mask.getMaskDimSizes();
 
     // If any of the dropped unit dims has a size of `0`, the entire mask is a
     // zero mask, else the unit dim has no effect on the mask.
@@ -563,7 +561,7 @@ struct CastAwayConstantMaskLeadingOneDim
     newDimSizes.append(dimSizes.begin() + dropDim + 1, dimSizes.end());
 
     auto newMask = rewriter.create<vector::ConstantMaskOp>(
-        mask.getLoc(), newType, rewriter.getI64ArrayAttr(newDimSizes));
+        mask.getLoc(), newType, newDimSizes);
     rewriter.replaceOpWithNewOp<vector::BroadcastOp>(mask, oldType, newMask);
     return success();
   }
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index ac2a4d3abcc68..d3296ee38c249 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -83,17 +83,14 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
     newMask = rewriter.create<vector::CreateMaskOp>(loc, newMaskType,
                                                     newMaskOperands);
   } else if (constantMaskOp) {
-    ArrayRef<Attribute> maskDimSizes =
-        constantMaskOp.getMaskDimSizes().getValue();
+    ArrayRef<int64_t> maskDimSizes = constantMaskOp.getMaskDimSizes();
     size_t numMaskOperands = maskDimSizes.size();
-    auto origIndex =
-        cast<IntegerAttr>(maskDimSizes[numMaskOperands - 1]).getInt();
-    IntegerAttr maskIndexAttr =
-        rewriter.getI64IntegerAttr((origIndex + scale - 1) / scale);
-    SmallVector<Attribute> newMaskDimSizes(maskDimSizes.drop_back());
-    newMaskDimSizes.push_back(maskIndexAttr);
-    newMask = rewriter.create<vector::ConstantMaskOp>(
-        loc, newMaskType, rewriter.getArrayAttr(newMaskDimSizes));
+    int64_t origIndex = maskDimSizes[numMaskOperands - 1];
+    int64_t maskIndex = (origIndex + scale - 1) / scale;
+    SmallVector<int64_t> newMaskDimSizes(maskDimSizes.drop_back());
+    newMaskDimSizes.push_back(maskIndex);
+    newMask = rewriter.create<vector::ConstantMaskOp>(loc, newMaskType,
+                                                      newMaskDimSizes);
   }
 
   while (!extractOps.empty()) {
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
index 37216cea7b615..ec2ef3fc7501c 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
@@ -225,8 +225,7 @@ class Convert1DExtractStridedSliceIntoShuffle
          off += stride)
       offsets.push_back(off);
     rewriter.replaceOpWithNewOp<ShuffleOp>(op, dstType, op.getVector(),
-                                           op.getVector(),
-                                           rewriter.getI64ArrayAttr(offsets));
+                                           op.getVector(), offsets);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 4a3ae1b850517..868397f2daaae 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -232,8 +232,7 @@ struct LinearizeVectorExtractStridedSlice final
     }
     // Perform a shuffle to extract the kD vector.
     rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
-        extractOp, dstType, srcVector, srcVector,
-        rewriter.getI64ArrayAttr(indices));
+        extractOp, dstType, srcVector, srcVector, indices);
     return success();
   }
 
@@ -298,20 +297,17 @@ struct LinearizeVectorShuffle final
     // that needs to be shuffled to the destination vector. If shuffleSliceLen >
     // 1 we need to shuffle the slices (consecutive shuffleSliceLen number of
     // elements) instead of scalars.
-    ArrayAttr mask = shuffleOp.getMask();
+    ArrayRef<int64_t> mask = shuffleOp.getMask();
     int64_t totalSizeOfShuffledElmnts = mask.size() * shuffleSliceLen;
     llvm::SmallVector<int64_t, 2> indices(totalSizeOfShuffledElmnts);
-    for (auto [i, value] :
-         llvm::enumerate(mask.getAsValueRange<IntegerAttr>())) {
-
-      int64_t v = value.getZExtValue();
+    for (auto [i, value] : llvm::enumerate(mask)) {
       std::iota(indices.begin() + shuffleSliceLen * i,
                 indices.begin() + shuffleSliceLen * (i + 1),
-                shuffleSliceLen * v);
+                shuffleSliceLen * value);
     }
 
-    rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
-        shuffleOp, dstType, vec1, vec2, rewriter.getI64ArrayAttr(indices));
+    rewriter.replaceOpWithNewOp<vector::ShuffleOp>(shuffleOp, dstType, vec1,
+                                                   vec2, indices);
     return success();
   }
 
@@ -368,8 +364,7 @@ struct LinearizeVectorExtract final
     llvm::SmallVector<int64_t, 2> indices(size);
     std::iota(indices.begin(), indices.end(), linearizedOffset);
     rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
-        extractOp, dstTy, adaptor.getVector(), adaptor.getVector(),
-        rewriter.getI64ArrayAttr(indices));
+        extractOp, dstTy, adaptor.getVector(), adaptor.getVector(), indices);
 
     return success();
   }
@@ -452,8 +447,7 @@ struct LinearizeVectorInsert final
                                            // [offset+srcNumElements, end)
 
     rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
-        insertOp, dstTy, adaptor.getDest(), adaptor.getSource(),
-        rewriter.getI64ArrayAttr(indices));
+        insertOp, dstTy, adaptor.getDest(), adaptor.getSource(), indices);
 
     return success();
   }
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 2686277bba59d..6777e589795c8 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -1321,11 +1321,8 @@ class DropInnerMostUnitDimsTransferRead
         cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
             srcType.getShape().drop_back(dimsToDrop), srcType, offsets, sizes,
             strides));
-    ArrayAttr inBoundsAttr =
-        readOp.getInBounds()
-            ? rewriter.getArrayAttr(
-                  readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop))
-            : ArrayAttr();
+    ArrayAttr inBoundsAttr = rewriter.getArrayAttr(
+        readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop));
     Value rankedReducedView = rewriter.create<memref::SubViewOp>(
         loc, resultMemrefType, readOp.getSource(), offsets, sizes, strides);
     auto permMap = getTransferMinorIdentityMap(
@@ -1415,11 +1412,8 @@ class DropInnerMostUnitDimsTransferWrite
         cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
             srcType.getShape().drop_back(dimsToDrop), srcType, offsets, sizes,
             strides));
-    ArrayAttr inBoundsAttr =
-        writeOp.getInBounds()
-            ? rewriter.getArrayAttr(
-                  writeOp.getInBoundsAttr().getValue().drop_back(dimsToDrop))
-            : ArrayAttr();
+    ArrayAttr inBoundsAttr = rewriter.getArrayAttr(
+        writeOp.getInBoundsAttr().getValue().drop_back(dimsToDrop));
 
     Value rankedReducedView = rewriter.create<memref::SubViewOp>(
         loc, resultMemrefType, writeOp.getSource(), offsets, sizes, strides);
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 4ed5a8bac20d1..e590d8c43c44b 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -323,8 +323,7 @@ SmallVector<OpFoldResult> vector::getMixedSizesXfer(bool hasTensorSemantics,
 }
 
 bool vector::isLinearizableVector(VectorType type) {
-  auto numScalableDims = llvm::count(type.getScalableDims(), true);
-  return (type.getRank() > 1) && (numScalableDims <= 1);
+  return (type.getRank() > 1) && (type.getNumScalableDims() <= 1);
 }
 
 Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index 75cc01ee9a146..7978b35e7147d 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <cmath>
 #include <cstdint>
 #include <limits>
 #include <utility>
@@ -257,7 +258,7 @@ int64_t AffineExpr::getLargestKnownDivisor() const {
     if (rhs && rhs.getValue() != 0) {
       int64_t lhsDiv = binExpr.getLHS().getLargestKnownDivisor();
       if (lhsDiv % rhs.getValue() == 0)
-        return lhsDiv / rhs.getValue();
+        return std::abs(lhsDiv / rhs.getValue());
     }
     return 1;
   }
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 13eb18036eeec..e5b1291afce2b 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -2576,6 +2576,7 @@ void AsmPrinter::Impl::printTypeImpl(Type type) {
       })
       .Case<IndexType>([&](Type) { os << "index"; })
       .Case<Float8E5M2Type>([&](Type) { os << "f8E5M2"; })
+      .Case<Float8E4M3Type>([&](Type) { os << "f8E4M3"; })
       .Case<Float8E4M3FNType>([&](Type) { os << "f8E4M3FN"; })
       .Case<Float8E5M2FNUZType>([&](Type) { os << "f8E5M2FNUZ"; })
       .Case<Float8E4M3FNUZType>([&](Type) { os << "f8E4M3FNUZ"; })
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index d49f69a7b7ae6..d0eb2d8fbae9d 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -38,6 +38,10 @@ FloatType Builder::getFloat8E5M2Type() {
   return FloatType::getFloat8E5M2(context);
 }
 
+FloatType Builder::getFloat8E4M3Type() {
+  return FloatType::getFloat8E4M3(context);
+}
+
 FloatType Builder::getFloat8E4M3FNType() {
   return FloatType::getFloat8E4M3FN(context);
 }
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index 179797cb943a1..faa944937e007 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -87,8 +87,9 @@ IntegerType IntegerType::scaleElementBitwidth(unsigned scale) {
 //===----------------------------------------------------------------------===//
 
 unsigned FloatType::getWidth() {
-  if (llvm::isa<Float8E5M2Type, Float8E4M3FNType, Float8E5M2FNUZType,
-          Float8E4M3FNUZType, Float8E4M3B11FNUZType>(*this))
+  if (llvm::isa<Float8E5M2Type, Float8E4M3Type, Float8E4M3FNType,
+                Float8E5M2FNUZType, Float8E4M3FNUZType, Float8E4M3B11FNUZType>(
+          *this))
     return 8;
   if (llvm::isa<Float16Type, BFloat16Type>(*this))
     return 16;
@@ -107,6 +108,8 @@ unsigned FloatType::getWidth() {
 const llvm::fltSemantics &FloatType::getFloatSemantics() {
   if (llvm::isa<Float8E5M2Type>(*this))
     return APFloat::Float8E5M2();
+  if (llvm::isa<Float8E4M3Type>(*this))
+    return APFloat::Float8E4M3();
   if (llvm::isa<Float8E4M3FNType>(*this))
     return APFloat::Float8E4M3FN();
   if (llvm::isa<Float8E5M2FNUZType>(*this))
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 214b354c5347e..12336701c9ca0 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -222,6 +222,7 @@ class MLIRContextImpl {
 
   /// Cached Type Instances.
   Float8E5M2Type f8E5M2Ty;
+  Float8E4M3Type f8E4M3Ty;
   Float8E4M3FNType f8E4M3FNTy;
   Float8E5M2FNUZType f8E5M2FNUZTy;
   Float8E4M3FNUZType f8E4M3FNUZTy;
@@ -312,6 +313,7 @@ MLIRContext::MLIRContext(const DialectRegistry &registry, Threading setting)
   //// Types.
   /// Floating-point Types.
   impl->f8E5M2Ty = TypeUniquer::get<Float8E5M2Type>(this);
+  impl->f8E4M3Ty = TypeUniquer::get<Float8E4M3Type>(this);
   impl->f8E4M3FNTy = TypeUniquer::get<Float8E4M3FNType>(this);
   impl->f8E5M2FNUZTy = TypeUniquer::get<Float8E5M2FNUZType>(this);
   impl->f8E4M3FNUZTy = TypeUniquer::get<Float8E4M3FNUZType>(this);
@@ -1012,6 +1014,9 @@ StorageUniquer &MLIRContext::getTypeUniquer() { return getImpl().typeUniquer; }
 Float8E5M2Type Float8E5M2Type::get(MLIRContext *context) {
   return context->getImpl().f8E5M2Ty;
 }
+Float8E4M3Type Float8E4M3Type::get(MLIRContext *context) {
+  return context->getImpl().f8E4M3Ty;
+}
 Float8E4M3FNType Float8E4M3FNType::get(MLIRContext *context) {
   return context->getImpl().f8E4M3FNTy;
 }
diff --git a/mlir/lib/IR/ODSSupport.cpp b/mlir/lib/IR/ODSSupport.cpp
index 6e968d62e61c7..d56c75ede9849 100644
--- a/mlir/lib/IR/ODSSupport.cpp
+++ b/mlir/lib/IR/ODSSupport.cpp
@@ -33,6 +33,50 @@ Attribute mlir::convertToAttribute(MLIRContext *ctx, int64_t storage) {
   return IntegerAttr::get(IntegerType::get(ctx, 64), storage);
 }
 
+LogicalResult
+mlir::convertFromAttribute(int32_t &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  auto valueAttr = dyn_cast<IntegerAttr>(attr);
+  if (!valueAttr) {
+    emitError() << "expected IntegerAttr for key `value`";
+    return failure();
+  }
+  storage = valueAttr.getValue().getSExtValue();
+  return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, int32_t storage) {
+  return IntegerAttr::get(IntegerType::get(ctx, 32), storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(std::string &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  auto valueAttr = dyn_cast<StringAttr>(attr);
+  if (!valueAttr)
+    return emitError()
+           << "expected string property to come from string attribute";
+  storage = valueAttr.getValue().str();
+  return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx,
+                                   const std::string &storage) {
+  return StringAttr::get(ctx, storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(bool &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  auto valueAttr = dyn_cast<BoolAttr>(attr);
+  if (!valueAttr)
+    return emitError()
+           << "expected string property to come from string attribute";
+  storage = valueAttr.getValue();
+  return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, bool storage) {
+  return BoolAttr::get(ctx, storage);
+}
+
 template <typename DenseArrayTy, typename T>
 LogicalResult
 convertDenseArrayFromAttr(MutableArrayRef<T> storage, Attribute attr,
@@ -64,6 +108,33 @@ mlir::convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
                                                       "DenseI32ArrayAttr");
 }
 
+template <typename DenseArrayTy, typename T>
+LogicalResult
+convertDenseArrayFromAttr(SmallVectorImpl<T> &storage, Attribute attr,
+                          function_ref<InFlightDiagnostic()> emitError,
+                          StringRef denseArrayTyStr) {
+  auto valueAttr = dyn_cast<DenseArrayTy>(attr);
+  if (!valueAttr) {
+    emitError() << "expected " << denseArrayTyStr << " for key `value`";
+    return failure();
+  }
+  storage.resize_for_overwrite(valueAttr.size());
+  llvm::copy(valueAttr.asArrayRef(), storage.begin());
+  return success();
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  return convertDenseArrayFromAttr<DenseI64ArrayAttr>(storage, attr, emitError,
+                                                      "DenseI64ArrayAttr");
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  return convertDenseArrayFromAttr<DenseI32ArrayAttr>(storage, attr, emitError,
+                                                      "DenseI32ArrayAttr");
+}
+
 Attribute mlir::convertToAttribute(MLIRContext *ctx,
                                    ArrayRef<int64_t> storage) {
   return DenseI64ArrayAttr::get(ctx, storage);
diff --git a/mlir/lib/IR/Types.cpp b/mlir/lib/IR/Types.cpp
index 1d1ba6df4db2f..e8cd28bf9e85d 100644
--- a/mlir/lib/IR/Types.cpp
+++ b/mlir/lib/IR/Types.cpp
@@ -35,6 +35,7 @@ Type AbstractType::replaceImmediateSubElements(Type type,
 MLIRContext *Type::getContext() const { return getDialect().getContext(); }
 
 bool Type::isFloat8E5M2() const { return llvm::isa<Float8E5M2Type>(*this); }
+bool Type::isFloat8E4M3() const { return llvm::isa<Float8E4M3Type>(*this); }
 bool Type::isFloat8E4M3FN() const { return llvm::isa<Float8E4M3FNType>(*this); }
 bool Type::isFloat8E5M2FNUZ() const {
   return llvm::isa<Float8E5M2FNUZType>(*this);
diff --git a/mlir/lib/IR/Verifier.cpp b/mlir/lib/IR/Verifier.cpp
index a09b47ee981c9..5d81e9b99814e 100644
--- a/mlir/lib/IR/Verifier.cpp
+++ b/mlir/lib/IR/Verifier.cpp
@@ -268,7 +268,7 @@ LogicalResult OperationVerifier::verifyOnExit(Operation &op) {
 /// verifyBlockPostChildren.
 LogicalResult OperationVerifier::verifyOperation(Operation &op) {
   SmallVector<WorkItem> worklist{{&op}};
-  DenseSet<WorkItem> seen;
+  SmallPtrSet<WorkItem, 8> seen;
   while (!worklist.empty()) {
     WorkItem top = worklist.back();
 
diff --git a/mlir/lib/Pass/PassManagerOptions.cpp b/mlir/lib/Pass/PassManagerOptions.cpp
index 706a21a23ee3e..dd119a75f4069 100644
--- a/mlir/lib/Pass/PassManagerOptions.cpp
+++ b/mlir/lib/Pass/PassManagerOptions.cpp
@@ -61,7 +61,8 @@ struct PassManagerOptions {
   llvm::cl::opt<std::string> printTreeDir{
       "mlir-print-ir-tree-dir",
       llvm::cl::desc("When printing the IR before/after a pass, print file "
-                     "tree rooted at this directory")};
+                     "tree rooted at this directory. Use in conjunction with "
+                     "mlir-print-ir-* flags")};
 
   /// Add an IR printing instrumentation if enabled by any 'print-ir' flags.
   void addPrinterInstrumentation(PassManager &pm);
diff --git a/mlir/lib/TableGen/Property.cpp b/mlir/lib/TableGen/Property.cpp
index e61d2fd2480fd..b86b87df91c60 100644
--- a/mlir/lib/TableGen/Property.cpp
+++ b/mlir/lib/TableGen/Property.cpp
@@ -33,16 +33,23 @@ static StringRef getValueAsString(const Init *init) {
 }
 
 Property::Property(const Record *def)
-    : Property(getValueAsString(def->getValueInit("storageType")),
-               getValueAsString(def->getValueInit("interfaceType")),
-               getValueAsString(def->getValueInit("convertFromStorage")),
-               getValueAsString(def->getValueInit("assignToStorage")),
-               getValueAsString(def->getValueInit("convertToAttribute")),
-               getValueAsString(def->getValueInit("convertFromAttribute")),
-               getValueAsString(def->getValueInit("readFromMlirBytecode")),
-               getValueAsString(def->getValueInit("writeToMlirBytecode")),
-               getValueAsString(def->getValueInit("hashProperty")),
-               getValueAsString(def->getValueInit("defaultValue"))) {
+    : Property(
+          getValueAsString(def->getValueInit("summary")),
+          getValueAsString(def->getValueInit("description")),
+          getValueAsString(def->getValueInit("storageType")),
+          getValueAsString(def->getValueInit("interfaceType")),
+          getValueAsString(def->getValueInit("convertFromStorage")),
+          getValueAsString(def->getValueInit("assignToStorage")),
+          getValueAsString(def->getValueInit("convertToAttribute")),
+          getValueAsString(def->getValueInit("convertFromAttribute")),
+          getValueAsString(def->getValueInit("parser")),
+          getValueAsString(def->getValueInit("optionalParser")),
+          getValueAsString(def->getValueInit("printer")),
+          getValueAsString(def->getValueInit("readFromMlirBytecode")),
+          getValueAsString(def->getValueInit("writeToMlirBytecode")),
+          getValueAsString(def->getValueInit("hashProperty")),
+          getValueAsString(def->getValueInit("defaultValue")),
+          getValueAsString(def->getValueInit("storageTypeValueOverride"))) {
   this->def = def;
   assert((def->isSubClassOf("Property") || def->isSubClassOf("Attr")) &&
          "must be subclass of TableGen 'Property' class");
@@ -50,22 +57,44 @@ Property::Property(const Record *def)
 
 Property::Property(const DefInit *init) : Property(init->getDef()) {}
 
-Property::Property(StringRef storageType, StringRef interfaceType,
+Property::Property(StringRef summary, StringRef description,
+                   StringRef storageType, StringRef interfaceType,
                    StringRef convertFromStorageCall,
                    StringRef assignToStorageCall,
                    StringRef convertToAttributeCall,
-                   StringRef convertFromAttributeCall,
+                   StringRef convertFromAttributeCall, StringRef parserCall,
+                   StringRef optionalParserCall, StringRef printerCall,
                    StringRef readFromMlirBytecodeCall,
                    StringRef writeToMlirBytecodeCall,
-                   StringRef hashPropertyCall, StringRef defaultValue)
-    : storageType(storageType), interfaceType(interfaceType),
+                   StringRef hashPropertyCall, StringRef defaultValue,
+                   StringRef storageTypeValueOverride)
+    : summary(summary), description(description), storageType(storageType),
+      interfaceType(interfaceType),
       convertFromStorageCall(convertFromStorageCall),
       assignToStorageCall(assignToStorageCall),
       convertToAttributeCall(convertToAttributeCall),
       convertFromAttributeCall(convertFromAttributeCall),
+      parserCall(parserCall), optionalParserCall(optionalParserCall),
+      printerCall(printerCall),
       readFromMlirBytecodeCall(readFromMlirBytecodeCall),
       writeToMlirBytecodeCall(writeToMlirBytecodeCall),
-      hashPropertyCall(hashPropertyCall), defaultValue(defaultValue) {
+      hashPropertyCall(hashPropertyCall), defaultValue(defaultValue),
+      storageTypeValueOverride(storageTypeValueOverride) {
   if (storageType.empty())
     storageType = "Property";
 }
+
+StringRef Property::getPropertyDefName() const {
+  if (def->isAnonymous()) {
+    return getBaseProperty().def->getName();
+  }
+  return def->getName();
+}
+
+Property Property::getBaseProperty() const {
+  if (const auto *defInit =
+          llvm::dyn_cast<llvm::DefInit>(def->getValueInit("baseProperty"))) {
+    return Property(defInit).getBaseProperty();
+  }
+  return *this;
+}
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index 70d6bcd76285a..4d23f987eb05e 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -324,8 +324,7 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) {
   mcStreamer.reset(target->createMCObjectStreamer(
       triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
       mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
-      *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
-      /*DWARFMustBeAtTheEnd*/ false));
+      *sti));
 
   std::unique_ptr<llvm::MCAsmParser> parser(
       createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
index f104b72209c39..1817c1271b43e 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
@@ -180,7 +180,8 @@ DILocalVariableAttr DebugImporter::translateImpl(llvm::DILocalVariable *node) {
   return DILocalVariableAttr::get(
       context, scope, getStringAttrOrNull(node->getRawName()),
       translate(node->getFile()), node->getLine(), node->getArg(),
-      node->getAlignInBits(), translate(node->getType()));
+      node->getAlignInBits(), translate(node->getType()),
+      symbolizeDIFlags(node->getFlags()).value_or(DIFlags::Zero));
 }
 
 DIVariableAttr DebugImporter::translateImpl(llvm::DIVariable *node) {
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index 35800e993d89d..95b37e47d0461 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -228,8 +228,8 @@ DebugTranslation::translateImpl(DILocalVariableAttr attr) {
   return llvm::DILocalVariable::get(
       llvmCtx, translate(attr.getScope()), getMDStringOrNull(attr.getName()),
       translate(attr.getFile()), attr.getLine(), translate(attr.getType()),
-      attr.getArg(),
-      /*Flags=*/llvm::DINode::FlagZero, attr.getAlignInBits(),
+      attr.getArg(), static_cast<llvm::DINode::DIFlags>(attr.getFlags()),
+      attr.getAlignInBits(),
       /*Annotations=*/nullptr);
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index 3d6dd1247b413..bdb15a290209b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -219,6 +219,25 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder,
     }
     call->setCallingConv(convertCConvToLLVM(callOp.getCConv()));
     call->setTailCallKind(convertTailCallKindToLLVM(callOp.getTailCallKind()));
+    if (callOp.getConvergentAttr())
+      call->addFnAttr(llvm::Attribute::Convergent);
+    if (callOp.getNoUnwindAttr())
+      call->addFnAttr(llvm::Attribute::NoUnwind);
+    if (callOp.getWillReturnAttr())
+      call->addFnAttr(llvm::Attribute::WillReturn);
+
+    if (MemoryEffectsAttr memAttr = callOp.getMemoryEffectsAttr()) {
+      llvm::MemoryEffects memEffects =
+          llvm::MemoryEffects(llvm::MemoryEffects::Location::ArgMem,
+                              convertModRefInfoToLLVM(memAttr.getArgMem())) |
+          llvm::MemoryEffects(
+              llvm::MemoryEffects::Location::InaccessibleMem,
+              convertModRefInfoToLLVM(memAttr.getInaccessibleMem())) |
+          llvm::MemoryEffects(llvm::MemoryEffects::Location::Other,
+                              convertModRefInfoToLLVM(memAttr.getOther()));
+      call->setMemoryEffects(memEffects);
+    }
+
     moduleTranslation.setAccessGroupsMetadata(callOp, call);
     moduleTranslation.setAliasScopeMetadata(callOp, call);
     moduleTranslation.setTBAAMetadata(callOp, call);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f7cb52236e1a9..ddee117838697 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -362,9 +362,9 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
     auto criticalDeclareOp =
         SymbolTable::lookupNearestSymbolFrom<omp::CriticalDeclareOp>(criticalOp,
                                                                      symbolRef);
-    hint = llvm::ConstantInt::get(
-        llvm::Type::getInt32Ty(llvmContext),
-        static_cast<int>(criticalDeclareOp.getHintVal()));
+    hint =
+        llvm::ConstantInt::get(llvm::Type::getInt32Ty(llvmContext),
+                               static_cast<int>(criticalDeclareOp.getHint()));
   }
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createCritical(
       ompLoc, bodyGenCB, finiCB, criticalOp.getName().value_or(""), hint));
@@ -376,7 +376,7 @@ template <typename T>
 static void
 collectReductionDecls(T loop,
                       SmallVectorImpl<omp::DeclareReductionOp> &reductions) {
-  std::optional<ArrayAttr> attr = loop.getReductions();
+  std::optional<ArrayAttr> attr = loop.getReductionSyms();
   if (!attr)
     return;
 
@@ -534,11 +534,11 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation) {
   auto orderedOp = cast<omp::OrderedOp>(opInst);
 
-  omp::ClauseDepend dependType = *orderedOp.getDependTypeVal();
+  omp::ClauseDepend dependType = *orderedOp.getDoacrossDependType();
   bool isDependSource = dependType == omp::ClauseDepend::dependsource;
-  unsigned numLoops = *orderedOp.getNumLoopsVal();
+  unsigned numLoops = *orderedOp.getDoacrossNumLoops();
   SmallVector<llvm::Value *> vecValues =
-      moduleTranslation.lookupValues(orderedOp.getDependVecVars());
+      moduleTranslation.lookupValues(orderedOp.getDoacrossDependVars());
 
   size_t indexVecValues = 0;
   while (indexVecValues < vecValues.size()) {
@@ -566,7 +566,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
   auto orderedRegionOp = cast<omp::OrderedRegionOp>(opInst);
 
   // TODO: The code generation for ordered simd directive is not supported yet.
-  if (orderedRegionOp.getSimd())
+  if (orderedRegionOp.getParLevelSimd())
     return failure();
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
@@ -588,7 +588,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   builder.restoreIP(
       moduleTranslation.getOpenMPBuilder()->createOrderedThreadsSimd(
-          ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getSimd()));
+          ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getParLevelSimd()));
   return bodyGenStatus;
 }
 
@@ -837,11 +837,11 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   // TODO: Support the following clauses: private, firstprivate, lastprivate,
   // allocate
   if (!sectionsOp.getAllocateVars().empty() ||
-      !sectionsOp.getAllocatorsVars().empty())
-    return emitError(sectionsOp.getLoc())
-           << "allocate clause is not supported for sections construct";
+      !sectionsOp.getAllocatorVars().empty() ||
+      !sectionsOp.getPrivateVars().empty() || sectionsOp.getPrivateSyms())
+    return opInst.emitError("unhandled clauses for translation to LLVM IR");
 
-  llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionVarsByref());
+  llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionByref());
   assert(isByRef.size() == sectionsOp.getNumReductionVars());
 
   SmallVector<omp::DeclareReductionOp> reductionDecls;
@@ -945,6 +945,9 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   LogicalResult bodyGenStatus = success();
+  if (!singleOp.getPrivateVars().empty() || singleOp.getPrivateSyms())
+    return singleOp.emitError("unhandled clauses for translation to LLVM IR");
+
   auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
     builder.restoreIP(codegenIP);
     convertOmpOpRegions(singleOp.getRegion(), "omp.single.region", builder,
@@ -954,7 +957,7 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
 
   // Handle copyprivate
   Operation::operand_range cpVars = singleOp.getCopyprivateVars();
-  std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateFuncs();
+  std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateSyms();
   llvm::SmallVector<llvm::Value *> llvmCPVars;
   llvm::SmallVector<llvm::Function *> llvmCPFuncs;
   for (size_t i = 0, e = cpVars.size(); i < e; ++i) {
@@ -976,7 +979,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
                 LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   LogicalResult bodyGenStatus = success();
-  if (!op.getAllocatorsVars().empty() || op.getReductions())
+  if (!op.getAllocatorVars().empty() || op.getReductionSyms() ||
+      !op.getPrivateVars().empty() || op.getPrivateSyms())
     return op.emitError("unhandled clauses for translation to LLVM IR");
 
   auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
@@ -1000,8 +1004,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
     threadLimit = moduleTranslation.lookupValue(threadLimitVar);
 
   llvm::Value *ifExpr = nullptr;
-  if (Value ifExprVar = op.getIfExpr())
-    ifExpr = moduleTranslation.lookupValue(ifExprVar);
+  if (Value ifVar = op.getIfExpr())
+    ifExpr = moduleTranslation.lookupValue(ifVar);
 
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTeams(
@@ -1009,6 +1013,32 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
   return bodyGenStatus;
 }
 
+static void
+buildDependData(std::optional<ArrayAttr> dependKinds, OperandRange dependVars,
+                LLVM::ModuleTranslation &moduleTranslation,
+                SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
+  if (dependVars.empty())
+    return;
+  for (auto dep : llvm::zip(dependVars, dependKinds->getValue())) {
+    llvm::omp::RTLDependenceKindTy type;
+    switch (
+        cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+    case mlir::omp::ClauseTaskDepend::taskdependin:
+      type = llvm::omp::RTLDependenceKindTy::DepIn;
+      break;
+    // The OpenMP runtime requires that the codegen for 'depend' clause for
+    // 'out' dependency kind must be the same as codegen for 'depend' clause
+    // with 'inout' dependency.
+    case mlir::omp::ClauseTaskDepend::taskdependout:
+    case mlir::omp::ClauseTaskDepend::taskdependinout:
+      type = llvm::omp::RTLDependenceKindTy::DepInOut;
+      break;
+    };
+    llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+    llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+    dds.emplace_back(dd);
+  }
+}
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -1016,8 +1046,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   LogicalResult bodyGenStatus = success();
   if (taskOp.getUntiedAttr() || taskOp.getMergeableAttr() ||
-      taskOp.getInReductions() || taskOp.getPriority() ||
-      !taskOp.getAllocateVars().empty()) {
+      taskOp.getInReductionSyms() || taskOp.getPriority() ||
+      !taskOp.getAllocateVars().empty() || !taskOp.getPrivateVars().empty() ||
+      taskOp.getPrivateSyms()) {
     return taskOp.emitError("unhandled clauses for translation to LLVM IR");
   }
   auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
@@ -1032,35 +1063,15 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   };
 
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  if (!taskOp.getDependVars().empty() && taskOp.getDepends()) {
-    for (auto dep :
-         llvm::zip(taskOp.getDependVars(), taskOp.getDepends()->getValue())) {
-      llvm::omp::RTLDependenceKindTy type;
-      switch (
-          cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
-      case mlir::omp::ClauseTaskDepend::taskdependin:
-        type = llvm::omp::RTLDependenceKindTy::DepIn;
-        break;
-      // The OpenMP runtime requires that the codegen for 'depend' clause for
-      // 'out' dependency kind must be the same as codegen for 'depend' clause
-      // with 'inout' dependency.
-      case mlir::omp::ClauseTaskDepend::taskdependout:
-      case mlir::omp::ClauseTaskDepend::taskdependinout:
-        type = llvm::omp::RTLDependenceKindTy::DepInOut;
-        break;
-      };
-      llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
-      llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
-      dds.emplace_back(dd);
-    }
-  }
+  buildDependData(taskOp.getDependKinds(), taskOp.getDependVars(),
+                  moduleTranslation, dds);
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTask(
       ompLoc, allocaIP, bodyCB, !taskOp.getUntied(),
-      moduleTranslation.lookupValue(taskOp.getFinalExpr()),
+      moduleTranslation.lookupValue(taskOp.getFinal()),
       moduleTranslation.lookupValue(taskOp.getIfExpr()), dds));
   return bodyGenStatus;
 }
@@ -1085,30 +1096,47 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
       ompLoc, allocaIP, bodyCB));
   return bodyGenStatus;
 }
+
+static LogicalResult
+convertOmpTaskwaitOp(omp::TaskwaitOp twOp, llvm::IRBuilderBase &builder,
+                     LLVM::ModuleTranslation &moduleTranslation) {
+  if (!twOp.getDependVars().empty() || twOp.getDependKinds() ||
+      twOp.getNowait())
+    return twOp.emitError("unhandled clauses for translation to LLVM IR");
+
+  moduleTranslation.getOpenMPBuilder()->createTaskwait(builder.saveIP());
+  return success();
+}
+
 /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
   auto wsloopOp = cast<omp::WsloopOp>(opInst);
+  if (!wsloopOp.getAllocateVars().empty() ||
+      !wsloopOp.getAllocatorVars().empty() ||
+      !wsloopOp.getPrivateVars().empty() || wsloopOp.getPrivateSyms())
+    return opInst.emitError("unhandled clauses for translation to LLVM IR");
+
   // FIXME: Here any other nested wrappers (e.g. omp.simd) are skipped, so
   // codegen for composite constructs like 'DO/FOR SIMD' will be the same as for
   // 'DO/FOR'.
   auto loopOp = cast<omp::LoopNestOp>(wsloopOp.getWrappedLoop());
 
-  llvm::ArrayRef<bool> isByRef = getIsByRef(wsloopOp.getReductionVarsByref());
+  llvm::ArrayRef<bool> isByRef = getIsByRef(wsloopOp.getReductionByref());
   assert(isByRef.size() == wsloopOp.getNumReductionVars());
 
   // Static is the default.
   auto schedule =
-      wsloopOp.getScheduleVal().value_or(omp::ClauseScheduleKind::Static);
+      wsloopOp.getScheduleKind().value_or(omp::ClauseScheduleKind::Static);
 
   // Find the loop configuration.
-  llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[0]);
+  llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]);
   llvm::Type *ivType = step->getType();
   llvm::Value *chunk = nullptr;
-  if (wsloopOp.getScheduleChunkVar()) {
+  if (wsloopOp.getScheduleChunk()) {
     llvm::Value *chunkVar =
-        moduleTranslation.lookupValue(wsloopOp.getScheduleChunkVar());
+        moduleTranslation.lookupValue(wsloopOp.getScheduleChunk());
     chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
   }
 
@@ -1172,10 +1200,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) {
     llvm::Value *lowerBound =
-        moduleTranslation.lookupValue(loopOp.getLowerBound()[i]);
+        moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]);
     llvm::Value *upperBound =
-        moduleTranslation.lookupValue(loopOp.getUpperBound()[i]);
-    llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]);
+        moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]);
+    llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]);
 
     // Make sure loop trip count are emitted in the preheader of the outermost
     // loop at the latest so that they are all available for the new collapsed
@@ -1188,7 +1216,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
     }
     loopInfos.push_back(ompBuilder->createCanonicalLoop(
         loc, bodyGen, lowerBound, upperBound, step,
-        /*IsSigned=*/true, loopOp.getInclusive(), computeIP));
+        /*IsSigned=*/true, loopOp.getLoopInclusive(), computeIP));
 
     if (failed(bodyGenStatus))
       return failure();
@@ -1203,16 +1231,15 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
 
   // TODO: Handle doacross loops when the ordered clause has a parameter.
-  bool isOrdered = wsloopOp.getOrderedVal().has_value();
-  std::optional<omp::ScheduleModifier> scheduleModifier =
-      wsloopOp.getScheduleModifier();
-  bool isSimd = wsloopOp.getSimdModifier();
+  bool isOrdered = wsloopOp.getOrdered().has_value();
+  std::optional<omp::ScheduleModifier> scheduleMod = wsloopOp.getScheduleMod();
+  bool isSimd = wsloopOp.getScheduleSimd();
 
   ompBuilder->applyWorkshareLoop(
       ompLoc.DL, loopInfo, allocaIP, !wsloopOp.getNowait(),
       convertToScheduleKind(schedule), chunk, isSimd,
-      scheduleModifier == omp::ScheduleModifier::monotonic,
-      scheduleModifier == omp::ScheduleModifier::nonmonotonic, isOrdered);
+      scheduleMod == omp::ScheduleModifier::monotonic,
+      scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
 
   // Continue building IR after the loop. Note that the LoopInfo returned by
   // `collapseLoops` points inside the outermost loop and is intended for
@@ -1269,7 +1296,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   OmpParallelOpConversionManager raii(opInst);
-  ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionVarsByref());
+  ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref());
   assert(isByRef.size() == opInst.getNumReductionVars());
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
@@ -1414,11 +1441,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     auto [privVar, privatizerClone] =
         [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
       if (!opInst.getPrivateVars().empty()) {
-        auto privVars = opInst.getPrivateVars();
-        auto privatizers = opInst.getPrivatizers();
+        auto privateVars = opInst.getPrivateVars();
+        auto privateSyms = opInst.getPrivateSyms();
 
         for (auto [privVar, privatizerAttr] :
-             llvm::zip_equal(privVars, *privatizers)) {
+             llvm::zip_equal(privateVars, *privateSyms)) {
           // Find the MLIR private variable corresponding to the LLVM value
           // being privatized.
           llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
@@ -1558,13 +1585,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   };
 
   llvm::Value *ifCond = nullptr;
-  if (auto ifExprVar = opInst.getIfExpr())
-    ifCond = moduleTranslation.lookupValue(ifExprVar);
+  if (auto ifVar = opInst.getIfExpr())
+    ifCond = moduleTranslation.lookupValue(ifVar);
   llvm::Value *numThreads = nullptr;
-  if (auto numThreadsVar = opInst.getNumThreadsVar())
+  if (auto numThreadsVar = opInst.getNumThreads())
     numThreads = moduleTranslation.lookupValue(numThreadsVar);
   auto pbKind = llvm::omp::OMP_PROC_BIND_default;
-  if (auto bind = opInst.getProcBindVal())
+  if (auto bind = opInst.getProcBindKind())
     pbKind = getProcBindKind(*bind);
   // TODO: Is the Parallel construct cancellable?
   bool isCancellable = false;
@@ -1602,6 +1629,12 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   auto simdOp = cast<omp::SimdOp>(opInst);
   auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop());
 
+  if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() ||
+      !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() ||
+      !simdOp.getReductionVars().empty() || simdOp.getReductionByref() ||
+      simdOp.getReductionSyms())
+    return opInst.emitError("unhandled clauses for translation to LLVM IR");
+
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
   // Generator of the canonical loop body.
@@ -1637,10 +1670,10 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) {
     llvm::Value *lowerBound =
-        moduleTranslation.lookupValue(loopOp.getLowerBound()[i]);
+        moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]);
     llvm::Value *upperBound =
-        moduleTranslation.lookupValue(loopOp.getUpperBound()[i]);
-    llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]);
+        moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]);
+    llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]);
 
     // Make sure loop trip count are emitted in the preheader of the outermost
     // loop at the latest so that they are all available for the new collapsed
@@ -1674,7 +1707,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
     safelen = builder.getInt64(safelenVar.value());
 
   llvm::MapVector<llvm::Value *, llvm::Value *> alignedVars;
-  llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrderVal());
+  llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder());
   ompBuilder->applySimd(loopInfo, alignedVars,
                         simdOp.getIfExpr()
                             ? moduleTranslation.lookupValue(simdOp.getIfExpr())
@@ -1716,7 +1749,7 @@ convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder,
 
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
-  llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrderVal());
+  llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrder());
   llvm::Value *x = moduleTranslation.lookupValue(readOp.getX());
   llvm::Value *v = moduleTranslation.lookupValue(readOp.getV());
 
@@ -1737,7 +1770,7 @@ convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
-  llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrderVal());
+  llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrder());
   llvm::Value *expr = moduleTranslation.lookupValue(writeOp.getExpr());
   llvm::Value *dest = moduleTranslation.lookupValue(writeOp.getX());
   llvm::Type *ty = moduleTranslation.convertType(writeOp.getExpr().getType());
@@ -1805,7 +1838,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst,
                                                       /*isVolatile=*/false};
 
   llvm::AtomicOrdering atomicOrdering =
-      convertAtomicOrdering(opInst.getMemoryOrderVal());
+      convertAtomicOrdering(opInst.getMemoryOrder());
 
   // Generate update code.
   LogicalResult updateGenStatus = success();
@@ -1897,7 +1930,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp,
                                                       /*isVolatile=*/false};
 
   llvm::AtomicOrdering atomicOrdering =
-      convertAtomicOrdering(atomicCaptureOp.getMemoryOrderVal());
+      convertAtomicOrdering(atomicCaptureOp.getMemoryOrder());
 
   LogicalResult updateGenStatus = success();
   auto updateFn = [&](llvm::Value *atomicx,
@@ -2160,12 +2193,11 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
   return builder.getInt64(dl.getTypeSizeInBits(type) / 8);
 }
 
-void collectMapDataFromMapOperands(MapInfoData &mapData,
-                                   llvm::SmallVectorImpl<Value> &mapOperands,
-                                   LLVM::ModuleTranslation &moduleTranslation,
-                                   DataLayout &dl,
-                                   llvm::IRBuilderBase &builder) {
-  for (mlir::Value mapValue : mapOperands) {
+void collectMapDataFromMapVars(MapInfoData &mapData,
+                               llvm::SmallVectorImpl<Value> &mapVars,
+                               LLVM::ModuleTranslation &moduleTranslation,
+                               DataLayout &dl, llvm::IRBuilderBase &builder) {
+  for (mlir::Value mapValue : mapVars) {
     if (auto mapOp = mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(
             mapValue.getDefiningOp())) {
       mlir::Value offloadPtr =
@@ -2205,7 +2237,7 @@ void collectMapDataFromMapOperands(MapInfoData &mapData,
       // TODO: May require some further additions to support nested record
       // types, i.e. member maps that can have member maps.
       mapData.IsAMember.push_back(false);
-      for (mlir::Value mapValue : mapOperands) {
+      for (mlir::Value mapValue : mapVars) {
         if (auto map = mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(
                 mapValue.getDefiningOp())) {
           for (auto member : map.getMembers()) {
@@ -2683,8 +2715,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
                         DataLayout &dl,
                         llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo,
                         MapInfoData &mapData,
-                        const SmallVector<Value> &devPtrOperands = {},
-                        const SmallVector<Value> &devAddrOperands = {},
+                        const SmallVector<Value> &useDevicePtrVars = {},
+                        const SmallVector<Value> &useDeviceAddrVars = {},
                         bool isTargetParams = false) {
   // We wish to modify some of the methods in which arguments are
   // passed based on their capture type by the target region, this can
@@ -2742,13 +2774,13 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
     return false;
   };
 
-  auto addDevInfos = [&, fail](auto devOperands, auto devOpType) -> void {
-    for (const auto &devOp : devOperands) {
+  auto addDevInfos = [&, fail](auto useDeviceVars, auto devOpType) -> void {
+    for (const auto &useDeviceVar : useDeviceVars) {
       // TODO: Only LLVMPointerTypes are handled.
-      if (!isa<LLVM::LLVMPointerType>(devOp.getType()))
+      if (!isa<LLVM::LLVMPointerType>(useDeviceVar.getType()))
         return fail();
 
-      llvm::Value *mapOpValue = moduleTranslation.lookupValue(devOp);
+      llvm::Value *mapOpValue = moduleTranslation.lookupValue(useDeviceVar);
 
       // Check if map info is already present for this entry.
       unsigned infoIndex;
@@ -2761,7 +2793,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
         combinedInfo.Pointers.emplace_back(mapOpValue);
         combinedInfo.DevicePointers.emplace_back(devOpType);
         combinedInfo.Names.emplace_back(
-            LLVM::createMappingInformation(devOp.getLoc(), *ompBuilder));
+            LLVM::createMappingInformation(useDeviceVar.getLoc(), *ompBuilder));
         combinedInfo.Types.emplace_back(
             llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM);
         combinedInfo.Sizes.emplace_back(builder.getInt64(0));
@@ -2769,8 +2801,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
     }
   };
 
-  addDevInfos(devPtrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer);
-  addDevInfos(devAddrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Address);
+  addDevInfos(useDevicePtrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer);
+  addDevInfos(useDeviceAddrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Address);
 }
 
 static LogicalResult
@@ -2778,9 +2810,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
                      LLVM::ModuleTranslation &moduleTranslation) {
   llvm::Value *ifCond = nullptr;
   int64_t deviceID = llvm::omp::OMP_DEVICEID_UNDEF;
-  SmallVector<Value> mapOperands;
-  SmallVector<Value> useDevPtrOperands;
-  SmallVector<Value> useDevAddrOperands;
+  SmallVector<Value> mapVars;
+  SmallVector<Value> useDevicePtrVars;
+  SmallVector<Value> useDeviceAddrVars;
   llvm::omp::RuntimeFunction RTLFn;
   DataLayout DL = DataLayout(op->getParentOfType<ModuleOp>());
 
@@ -2789,8 +2821,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
   LogicalResult result =
       llvm::TypeSwitch<Operation *, LogicalResult>(op)
           .Case([&](omp::TargetDataOp dataOp) {
-            if (auto ifExprVar = dataOp.getIfExpr())
-              ifCond = moduleTranslation.lookupValue(ifExprVar);
+            if (auto ifVar = dataOp.getIfExpr())
+              ifCond = moduleTranslation.lookupValue(ifVar);
 
             if (auto devId = dataOp.getDevice())
               if (auto constOp =
@@ -2798,9 +2830,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
                 if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
                   deviceID = intAttr.getInt();
 
-            mapOperands = dataOp.getMapOperands();
-            useDevPtrOperands = dataOp.getUseDevicePtr();
-            useDevAddrOperands = dataOp.getUseDeviceAddr();
+            mapVars = dataOp.getMapVars();
+            useDevicePtrVars = dataOp.getUseDevicePtrVars();
+            useDeviceAddrVars = dataOp.getUseDeviceAddrVars();
             return success();
           })
           .Case([&](omp::TargetEnterDataOp enterDataOp) {
@@ -2808,8 +2840,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
               return (LogicalResult)(enterDataOp.emitError(
                   "`nowait` is not supported yet"));
 
-            if (auto ifExprVar = enterDataOp.getIfExpr())
-              ifCond = moduleTranslation.lookupValue(ifExprVar);
+            if (auto ifVar = enterDataOp.getIfExpr())
+              ifCond = moduleTranslation.lookupValue(ifVar);
 
             if (auto devId = enterDataOp.getDevice())
               if (auto constOp =
@@ -2817,7 +2849,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
                 if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
                   deviceID = intAttr.getInt();
             RTLFn = llvm::omp::OMPRTL___tgt_target_data_begin_mapper;
-            mapOperands = enterDataOp.getMapOperands();
+            mapVars = enterDataOp.getMapVars();
             return success();
           })
           .Case([&](omp::TargetExitDataOp exitDataOp) {
@@ -2825,8 +2857,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
               return (LogicalResult)(exitDataOp.emitError(
                   "`nowait` is not supported yet"));
 
-            if (auto ifExprVar = exitDataOp.getIfExpr())
-              ifCond = moduleTranslation.lookupValue(ifExprVar);
+            if (auto ifVar = exitDataOp.getIfExpr())
+              ifCond = moduleTranslation.lookupValue(ifVar);
 
             if (auto devId = exitDataOp.getDevice())
               if (auto constOp =
@@ -2835,7 +2867,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
                   deviceID = intAttr.getInt();
 
             RTLFn = llvm::omp::OMPRTL___tgt_target_data_end_mapper;
-            mapOperands = exitDataOp.getMapOperands();
+            mapVars = exitDataOp.getMapVars();
             return success();
           })
           .Case([&](omp::TargetUpdateOp updateDataOp) {
@@ -2843,8 +2875,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
               return (LogicalResult)(updateDataOp.emitError(
                   "`nowait` is not supported yet"));
 
-            if (auto ifExprVar = updateDataOp.getIfExpr())
-              ifCond = moduleTranslation.lookupValue(ifExprVar);
+            if (auto ifVar = updateDataOp.getIfExpr())
+              ifCond = moduleTranslation.lookupValue(ifVar);
 
             if (auto devId = updateDataOp.getDevice())
               if (auto constOp =
@@ -2853,7 +2885,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
                   deviceID = intAttr.getInt();
 
             RTLFn = llvm::omp::OMPRTL___tgt_target_data_update_mapper;
-            mapOperands = updateDataOp.getMapOperands();
+            mapVars = updateDataOp.getMapVars();
             return success();
           })
           .Default([&](Operation *op) {
@@ -2867,8 +2899,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
 
   MapInfoData mapData;
-  collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, DL,
-                                builder);
+  collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, DL, builder);
 
   // Fill up the arrays with all the mapped variables.
   llvm::OpenMPIRBuilder::MapInfosTy combinedInfo;
@@ -2877,7 +2908,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
     builder.restoreIP(codeGenIP);
     if (auto dataOp = dyn_cast<omp::TargetDataOp>(op)) {
       genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData,
-                  useDevPtrOperands, useDevAddrOperands);
+                  useDevicePtrVars, useDeviceAddrVars);
     } else {
       genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData);
     }
@@ -2899,7 +2930,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
       if (!info.DevicePtrInfoMap.empty()) {
         builder.restoreIP(codeGenIP);
         unsigned argIndex = 0;
-        for (auto &devPtrOp : useDevPtrOperands) {
+        for (auto &devPtrOp : useDevicePtrVars) {
           llvm::Value *mapOpValue = moduleTranslation.lookupValue(devPtrOp);
           const auto &arg = region.front().getArgument(argIndex);
           moduleTranslation.mapValue(arg,
@@ -2907,7 +2938,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
           argIndex++;
         }
 
-        for (auto &devAddrOp : useDevAddrOperands) {
+        for (auto &devAddrOp : useDeviceAddrVars) {
           llvm::Value *mapOpValue = moduleTranslation.lookupValue(devAddrOp);
           const auto &arg = region.front().getArgument(argIndex);
           auto *LI = builder.CreateLoad(
@@ -3032,6 +3063,18 @@ static bool targetOpSupported(Operation &opInst) {
     return false;
   }
 
+  if (!targetOp.getAllocateVars().empty() ||
+      !targetOp.getAllocatorVars().empty()) {
+    opInst.emitError("Allocate clause not yet supported");
+    return false;
+  }
+
+  if (!targetOp.getInReductionVars().empty() ||
+      targetOp.getInReductionByref() || targetOp.getInReductionSyms()) {
+    opInst.emitError("In reduction clause not yet supported");
+    return false;
+  }
+
   return true;
 }
 
@@ -3194,7 +3237,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   auto targetOp = cast<omp::TargetOp>(opInst);
   auto &targetRegion = targetOp.getRegion();
   DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
-  SmallVector<Value> mapOperands = targetOp.getMapOperands();
+  SmallVector<Value> mapVars = targetOp.getMapVars();
   llvm::Function *llvmOutlinedFn = nullptr;
 
   LogicalResult bodyGenStatus = success();
@@ -3219,7 +3262,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
     builder.restoreIP(codeGenIP);
     unsigned argIndex = 0;
-    for (auto &mapOp : mapOperands) {
+    for (auto &mapOp : mapVars) {
       auto mapInfoOp =
           mlir::dyn_cast<mlir::omp::MapInfoOp>(mapOp.getDefiningOp());
       llvm::Value *mapOpValue =
@@ -3249,8 +3292,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       findAllocaInsertPoint(builder, moduleTranslation);
 
   MapInfoData mapData;
-  collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, dl,
-                                builder);
+  collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, dl, builder);
 
   llvm::OpenMPIRBuilder::MapInfosTy combinedInfos;
   auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP)
@@ -3282,7 +3324,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   };
 
   llvm::SmallVector<llvm::Value *, 4> kernelInput;
-  for (size_t i = 0; i < mapOperands.size(); ++i) {
+  for (size_t i = 0; i < mapVars.size(); ++i) {
     // declare target arguments are not passed to kernels as arguments
     // TODO: We currently do not handle cases where a member is explicitly
     // passed in as an argument, this will likley need to be handled in
@@ -3292,10 +3334,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     if (!mapData.IsDeclareTarget[i] && !mapData.IsAMember[i])
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
+  SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
+  buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(),
+                  moduleTranslation, dds);
 
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
-      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB));
+      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
+      dds));
 
   // Remap access operations to declare target reference pointers for the
   // device, essentially generating extra loadop's as necessary
@@ -3428,10 +3474,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
         ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
         return success();
       })
-      .Case([&](omp::TaskwaitOp) {
-        ompBuilder->createTaskwait(builder.saveIP());
-        return success();
-      })
       .Case([&](omp::TaskyieldOp) {
         ompBuilder->createTaskyield(builder.saveIP());
         return success();
@@ -3499,6 +3541,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
       .Case([&](omp::TaskgroupOp op) {
         return convertOmpTaskgroupOp(op, builder, moduleTranslation);
       })
+      .Case([&](omp::TaskwaitOp op) {
+        return convertOmpTaskwaitOp(op, builder, moduleTranslation);
+      })
       .Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
             omp::CriticalDeclareOp>([](auto op) {
         // `yield` and `terminator` can be just omitted. The block structure
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 16007592175f7..8b40b7b2df6c7 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1471,6 +1471,28 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
     callOp.setTailCallKind(
         convertTailCallKindFromLLVM(callInst->getTailCallKind()));
     setFastmathFlagsAttr(inst, callOp);
+
+    // Handle function attributes.
+    if (callInst->hasFnAttr(llvm::Attribute::Convergent))
+      callOp.setConvergent(true);
+    if (callInst->hasFnAttr(llvm::Attribute::NoUnwind))
+      callOp.setNoUnwind(true);
+    if (callInst->hasFnAttr(llvm::Attribute::WillReturn))
+      callOp.setWillReturn(true);
+
+    llvm::MemoryEffects memEffects = callInst->getMemoryEffects();
+    ModRefInfo othermem = convertModRefInfoFromLLVM(
+        memEffects.getModRef(llvm::MemoryEffects::Location::Other));
+    ModRefInfo argMem = convertModRefInfoFromLLVM(
+        memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem));
+    ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM(
+        memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem));
+    auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem, argMem,
+                                          inaccessibleMem);
+    // Only set the attribute when it does not match the default value.
+    if (!memAttr.isReadWrite())
+      callOp.setMemoryEffectsAttr(memAttr);
+
     if (!callInst->getType()->isVoidTy())
       mapValue(inst, callOp.getResult());
     else
@@ -1661,7 +1683,7 @@ static void processMemoryEffects(llvm::Function *func, LLVMFuncOp funcOp) {
   // Only set the attr when it does not match the default value.
   if (memAttr.isReadWrite())
     return;
-  funcOp.setMemoryAttr(memAttr);
+  funcOp.setMemoryEffectsAttr(memAttr);
 }
 
 // List of LLVM IR attributes that map to an explicit attribute on the MLIR
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index fc3fb0b5334c1..3016d1846e00f 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1412,10 +1412,10 @@ LogicalResult ModuleTranslation::convertDialectAttributes(
 /// `llvmFunc`.
 static void convertFunctionMemoryAttributes(LLVMFuncOp func,
                                             llvm::Function *llvmFunc) {
-  if (!func.getMemory())
+  if (!func.getMemoryEffects())
     return;
 
-  MemoryEffectsAttr memEffects = func.getMemoryAttr();
+  MemoryEffectsAttr memEffects = func.getMemoryEffectsAttr();
 
   // Add memory effects incrementally.
   llvm::MemoryEffects newMemEffects =
diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
index a4db958207756..db184ae8e6e83 100644
--- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
@@ -65,8 +65,6 @@ class TypeFromLLVMIRTranslatorImpl {
       return Float80Type::get(&context);
     if (type->isPPC_FP128Ty())
       return LLVM::LLVMPPCFP128Type::get(&context);
-    if (type->isX86_MMXTy())
-      return LLVM::LLVMX86MMXType::get(&context);
     if (type->isLabelTy())
       return LLVM::LLVMLabelType::get(&context);
     if (type->isMetadataTy())
diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
index 6d8b415ff09dc..6591502723880 100644
--- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
@@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl {
             .Case([this](LLVM::LLVMPPCFP128Type) {
               return llvm::Type::getPPC_FP128Ty(context);
             })
-            .Case([this](LLVM::LLVMX86MMXType) {
-              return llvm::Type::getX86_MMXTy(context);
-            })
             .Case([this](LLVM::LLVMTokenType) {
               return llvm::Type::getTokenTy(context);
             })
diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index d7a308548cf4d..12980879b20ab 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -250,6 +250,15 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
         symbol, FPFastMathModeAttr::get(opBuilder.getContext(),
                                         static_cast<FPFastMathMode>(words[2])));
     break;
+  case spirv::Decoration::FPRoundingMode:
+    if (words.size() != 3) {
+      return emitError(unknownLoc, "OpDecorate with ")
+             << decorationName << " needs a single integer literal";
+    }
+    decorations[words[0]].set(
+        symbol, FPRoundingModeAttr::get(opBuilder.getContext(),
+                                        static_cast<FPRoundingMode>(words[2])));
+    break;
   case spirv::Decoration::DescriptorSet:
   case spirv::Decoration::Binding:
     if (words.size() != 3) {
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 4c4fef177317e..714a3edfb5657 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -214,6 +214,9 @@ static std::string getDecorationName(StringRef attrName) {
   // expected FPFastMathMode.
   if (attrName == "fp_fast_math_mode")
     return "FPFastMathMode";
+  // similar here
+  if (attrName == "fp_rounding_mode")
+    return "FPRoundingMode";
 
   return llvm::convertToCamelFromSnakeCase(attrName, /*capitalizeFirst=*/true);
 }
@@ -242,6 +245,13 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID,
     }
     return emitError(loc, "expected FPFastMathModeAttr attribute for ")
            << stringifyDecoration(decoration);
+  case spirv::Decoration::FPRoundingMode:
+    if (auto intAttr = dyn_cast<FPRoundingModeAttr>(attr)) {
+      args.push_back(static_cast<uint32_t>(intAttr.getValue()));
+      break;
+    }
+    return emitError(loc, "expected FPRoundingModeAttr attribute for ")
+           << stringifyDecoration(decoration);
   case spirv::Decoration::Binding:
   case spirv::Decoration::DescriptorSet:
   case spirv::Decoration::Location:
diff --git a/mlir/lib/Transforms/OpStats.cpp b/mlir/lib/Transforms/OpStats.cpp
index 6a71e1f02edc9..6746ed52396af 100644
--- a/mlir/lib/Transforms/OpStats.cpp
+++ b/mlir/lib/Transforms/OpStats.cpp
@@ -55,6 +55,7 @@ void PrintOpStatsPass::runOnOperation() {
     printSummaryInJSON();
   } else
     printSummary();
+  markAllAnalysesPreserved();
 }
 
 void PrintOpStatsPass::printSummary() {
diff --git a/mlir/lib/Transforms/PrintIR.cpp b/mlir/lib/Transforms/PrintIR.cpp
index cc42c7e2f1db0..3c55f920dfd61 100644
--- a/mlir/lib/Transforms/PrintIR.cpp
+++ b/mlir/lib/Transforms/PrintIR.cpp
@@ -25,6 +25,7 @@ struct PrintIRPass : public impl::PrintIRPassBase<PrintIRPass> {
       llvm::dbgs() << " " << this->label;
     llvm::dbgs() << " //----- //\n";
     getOperation()->dump();
+    markAllAnalysesPreserved();
   }
 };
 
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 1e0afee2373a9..f26aa0a1516a6 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -293,10 +293,9 @@ class CreateBlockRewrite : public BlockRewrite {
 /// original location.
 class EraseBlockRewrite : public BlockRewrite {
 public:
-  EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block,
-                    Region *region, Block *insertBeforeBlock)
-      : BlockRewrite(Kind::EraseBlock, rewriterImpl, block), region(region),
-        insertBeforeBlock(insertBeforeBlock) {}
+  EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block)
+      : BlockRewrite(Kind::EraseBlock, rewriterImpl, block),
+        region(block->getParent()), insertBeforeBlock(block->getNextNode()) {}
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::EraseBlock;
@@ -432,34 +431,14 @@ class MoveBlockRewrite : public BlockRewrite {
   Block *insertBeforeBlock;
 };
 
-/// This structure contains the information pertaining to an argument that has
-/// been converted.
-struct ConvertedArgInfo {
-  ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize,
-                   Value castValue = nullptr)
-      : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {}
-
-  /// The start index of in the new argument list that contains arguments that
-  /// replace the original.
-  unsigned newArgIdx;
-
-  /// The number of arguments that replaced the original argument.
-  unsigned newArgSize;
-
-  /// The cast value that was created to cast from the new arguments to the
-  /// old. This only used if 'newArgSize' > 1.
-  Value castValue;
-};
-
 /// Block type conversion. This rewrite is partially reflected in the IR.
 class BlockTypeConversionRewrite : public BlockRewrite {
 public:
-  BlockTypeConversionRewrite(
-      ConversionPatternRewriterImpl &rewriterImpl, Block *block,
-      Block *origBlock, SmallVector<std::optional<ConvertedArgInfo>, 1> argInfo,
-      const TypeConverter *converter)
+  BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl,
+                             Block *block, Block *origBlock,
+                             const TypeConverter *converter)
       : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block),
-        origBlock(origBlock), argInfo(argInfo), converter(converter) {}
+        origBlock(origBlock), converter(converter) {}
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::BlockTypeConversion;
@@ -479,10 +458,6 @@ class BlockTypeConversionRewrite : public BlockRewrite {
   /// The original block that was requested to have its signature converted.
   Block *origBlock;
 
-  /// The conversion information for each of the arguments. The information is
-  /// std::nullopt if the argument was dropped during conversion.
-  SmallVector<std::optional<ConvertedArgInfo>, 1> argInfo;
-
   /// The type converter used to convert the arguments.
   const TypeConverter *converter;
 };
@@ -691,12 +666,16 @@ class CreateOperationRewrite : public OperationRewrite {
 /// The type of materialization.
 enum MaterializationKind {
   /// This materialization materializes a conversion for an illegal block
-  /// argument type, to a legal one.
+  /// argument type, to the original one.
   Argument,
 
   /// This materialization materializes a conversion from an illegal type to a
   /// legal one.
-  Target
+  Target,
+
+  /// This materialization materializes a conversion from a legal type back to
+  /// an illegal one.
+  Source
 };
 
 /// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast"
@@ -736,7 +715,7 @@ class UnresolvedMaterializationRewrite : public OperationRewrite {
 private:
   /// The corresponding type converter to use when resolving this
   /// materialization, and the kind of this materialization.
-  llvm::PointerIntPair<const TypeConverter *, 1, MaterializationKind>
+  llvm::PointerIntPair<const TypeConverter *, 2, MaterializationKind>
       converterAndKind;
 };
 } // namespace
@@ -855,11 +834,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
                                        ValueRange inputs, Type outputType,
                                        const TypeConverter *converter);
 
-  Value buildUnresolvedArgumentMaterialization(Block *block, Location loc,
-                                               ValueRange inputs,
-                                               Type outputType,
-                                               const TypeConverter *converter);
-
   Value buildUnresolvedTargetMaterialization(Location loc, Value input,
                                              Type outputType,
                                              const TypeConverter *converter);
@@ -989,28 +963,6 @@ void BlockTypeConversionRewrite::commit(RewriterBase &rewriter) {
           dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
     for (Operation *op : block->getUsers())
       listener->notifyOperationModified(op);
-
-  // Process the remapping for each of the original arguments.
-  for (auto [origArg, info] :
-       llvm::zip_equal(origBlock->getArguments(), argInfo)) {
-    // Handle the case of a 1->0 value mapping.
-    if (!info) {
-      if (Value newArg =
-              rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType()))
-        rewriter.replaceAllUsesWith(origArg, newArg);
-      continue;
-    }
-
-    // Otherwise this is a 1->1+ value mapping.
-    Value castValue = info->castValue;
-    assert(info->newArgSize >= 1 && castValue && "expected 1->1+ mapping");
-
-    // If the argument is still used, replace it with the generated cast.
-    if (!origArg.use_empty()) {
-      rewriter.replaceAllUsesWith(origArg, rewriterImpl.mapping.lookupOrDefault(
-                                               castValue, origArg.getType()));
-    }
-  }
 }
 
 void BlockTypeConversionRewrite::rollback() {
@@ -1035,14 +987,12 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions(
       continue;
 
     Value replacementValue = rewriterImpl.mapping.lookupOrDefault(origArg);
-    bool isDroppedArg = replacementValue == origArg;
-    if (!isDroppedArg)
-      builder.setInsertionPointAfterValue(replacementValue);
+    assert(replacementValue && "replacement value not found");
     Value newArg;
     if (converter) {
+      builder.setInsertionPointAfterValue(replacementValue);
       newArg = converter->materializeSourceConversion(
-          builder, origArg.getLoc(), origArg.getType(),
-          isDroppedArg ? ValueRange() : ValueRange(replacementValue));
+          builder, origArg.getLoc(), origArg.getType(), replacementValue);
       assert((!newArg || newArg.getType() == origArg.getType()) &&
              "materialization hook did not provide a value of the expected "
              "type");
@@ -1053,8 +1003,6 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions(
           << "failed to materialize conversion for block argument #"
           << it.index() << " that remained live after conversion, type was "
           << origArg.getType();
-      if (!isDroppedArg)
-        diag << ", with target type " << replacementValue.getType();
       diag.attachNote(liveUser->getLoc())
           << "see existing live user here: " << *liveUser;
       return failure();
@@ -1340,73 +1288,68 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
   // Replace all uses of the old block with the new block.
   block->replaceAllUsesWith(newBlock);
 
-  // Remap each of the original arguments as determined by the signature
-  // conversion.
-  SmallVector<std::optional<ConvertedArgInfo>, 1> argInfo;
-  argInfo.resize(origArgCount);
-
   for (unsigned i = 0; i != origArgCount; ++i) {
-    auto inputMap = signatureConversion.getInputMapping(i);
-    if (!inputMap)
-      continue;
     BlockArgument origArg = block->getArgument(i);
+    Type origArgType = origArg.getType();
+
+    std::optional<TypeConverter::SignatureConversion::InputMapping> inputMap =
+        signatureConversion.getInputMapping(i);
+    if (!inputMap) {
+      // This block argument was dropped and no replacement value was provided.
+      // Materialize a replacement value "out of thin air".
+      Value repl = buildUnresolvedMaterialization(
+          MaterializationKind::Source, newBlock, newBlock->begin(),
+          origArg.getLoc(), /*inputs=*/ValueRange(),
+          /*outputType=*/origArgType, converter);
+      mapping.map(origArg, repl);
+      appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
+      continue;
+    }
 
-    // If inputMap->replacementValue is not nullptr, then the argument is
-    // dropped and a replacement value is provided to be the remappedValue.
-    if (inputMap->replacementValue) {
+    if (Value repl = inputMap->replacementValue) {
+      // This block argument was dropped and a replacement value was provided.
       assert(inputMap->size == 0 &&
              "invalid to provide a replacement value when the argument isn't "
              "dropped");
-      mapping.map(origArg, inputMap->replacementValue);
+      mapping.map(origArg, repl);
       appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
       continue;
     }
 
-    // Otherwise, this is a 1->1+ mapping.
+    // This is a 1->1+ mapping. 1->N mappings are not fully supported in the
+    // dialect conversion. Therefore, we need an argument materialization to
+    // turn the replacement block arguments into a single SSA value that can be
+    // used as a replacement.
     auto replArgs =
         newBlock->getArguments().slice(inputMap->inputNo, inputMap->size);
-    Value newArg;
+    Value argMat = buildUnresolvedMaterialization(
+        MaterializationKind::Argument, newBlock, newBlock->begin(),
+        origArg.getLoc(), /*inputs=*/replArgs, origArgType, converter);
+    mapping.map(origArg, argMat);
+    appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
 
-    // If this is a 1->1 mapping and the types of new and replacement arguments
-    // match (i.e. it's an identity map), then the argument is mapped to its
-    // original type.
-    // FIXME: We simply pass through the replacement argument if there wasn't a
-    // converter, which isn't great as it allows implicit type conversions to
-    // appear. We should properly restructure this code to handle cases where a
-    // converter isn't provided and also to properly handle the case where an
-    // argument materialization is actually a temporary source materialization
-    // (e.g. in the case of 1->N).
-    if (replArgs.size() == 1 &&
-        (!converter || replArgs[0].getType() == origArg.getType())) {
-      newArg = replArgs.front();
-      mapping.map(origArg, newArg);
-    } else {
-      // Build argument materialization: new block arguments -> old block
-      // argument type.
-      Value argMat = buildUnresolvedArgumentMaterialization(
-          newBlock, origArg.getLoc(), replArgs, origArg.getType(), converter);
-      mapping.map(origArg, argMat);
-
-      // Build target materialization: old block argument type -> legal type.
-      // Note: This function returns an "empty" type if no valid conversion to
-      // a legal type exists. In that case, we continue the conversion with the
-      // original block argument type.
-      Type legalOutputType = converter->convertType(origArg.getType());
-      if (legalOutputType && legalOutputType != origArg.getType()) {
-        newArg = buildUnresolvedTargetMaterialization(
-            origArg.getLoc(), argMat, legalOutputType, converter);
-        mapping.map(argMat, newArg);
-      } else {
-        newArg = argMat;
-      }
+    Type legalOutputType;
+    if (converter) {
+      legalOutputType = converter->convertType(origArgType);
+    } else if (replArgs.size() == 1) {
+      // When there is no type converter, assume that the new block argument
+      // types are legal. This is reasonable to assume because they were
+      // specified by the user.
+      // FIXME: This won't work for 1->N conversions because multiple output
+      // types are not supported in parts of the dialect conversion. In such a
+      // case, we currently use the original block argument type (produced by
+      // the argument materialization).
+      legalOutputType = replArgs[0].getType();
+    }
+    if (legalOutputType && legalOutputType != origArgType) {
+      Value targetMat = buildUnresolvedTargetMaterialization(
+          origArg.getLoc(), argMat, legalOutputType, converter);
+      mapping.map(argMat, targetMat);
     }
-
     appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
-    argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg);
   }
 
-  appendRewrite<BlockTypeConversionRewrite>(newBlock, block, argInfo,
-                                            converter);
+  appendRewrite<BlockTypeConversionRewrite>(newBlock, block, converter);
 
   // Erase the old block. (It is just unlinked for now and will be erased during
   // cleanup.)
@@ -1431,19 +1374,13 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization(
 
   // Create an unresolved materialization. We use a new OpBuilder to avoid
   // tracking the materialization like we do for other operations.
-  OpBuilder builder(insertBlock, insertPt);
+  OpBuilder builder(outputType.getContext());
+  builder.setInsertionPoint(insertBlock, insertPt);
   auto convertOp =
       builder.create<UnrealizedConversionCastOp>(loc, outputType, inputs);
   appendRewrite<UnresolvedMaterializationRewrite>(convertOp, converter, kind);
   return convertOp.getResult(0);
 }
-Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization(
-    Block *block, Location loc, ValueRange inputs, Type outputType,
-    const TypeConverter *converter) {
-  return buildUnresolvedMaterialization(MaterializationKind::Argument, block,
-                                        block->begin(), loc, inputs, outputType,
-                                        converter);
-}
 Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization(
     Location loc, Value input, Type outputType,
     const TypeConverter *converter) {
@@ -1507,9 +1444,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op,
 }
 
 void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
-  Region *region = block->getParent();
-  Block *origNextBlock = block->getNextNode();
-  appendRewrite<EraseBlockRewrite>(block, region, origNextBlock);
+  appendRewrite<EraseBlockRewrite>(block);
 }
 
 void ConversionPatternRewriterImpl::notifyBlockInserted(
@@ -2862,6 +2797,10 @@ static LogicalResult legalizeUnresolvedMaterialization(
       newMaterialization = converter->materializeTargetConversion(
           rewriter, op->getLoc(), outputType, inputOperands);
       break;
+    case MaterializationKind::Source:
+      newMaterialization = converter->materializeSourceConversion(
+          rewriter, op->getLoc(), outputType, inputOperands);
+      break;
     }
     if (newMaterialization) {
       assert(newMaterialization.getType() == outputType &&
@@ -2874,8 +2813,8 @@ static LogicalResult legalizeUnresolvedMaterialization(
 
   InFlightDiagnostic diag = op->emitError()
                             << "failed to legalize unresolved materialization "
-                               "from "
-                            << inputOperands.getTypes() << " to " << outputType
+                               "from ("
+                            << inputOperands.getTypes() << ") to " << outputType
                             << " that remained live after conversion";
   if (Operation *liveUser = findLiveUser(op->getUsers())) {
     diag.attachNote(liveUser->getLoc())
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 946d65cef4186..4c0f15bafbaba 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/IR/Block.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
@@ -17,15 +16,11 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Support/LogicalResult.h"
 
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
 
 #include <deque>
-#include <iterator>
 
 using namespace mlir;
 
@@ -679,91 +674,6 @@ static bool ableToUpdatePredOperands(Block *block) {
   return true;
 }
 
-/// Prunes the redundant list of arguments. E.g., if we are passing an argument
-/// list like [x, y, z, x] this would return [x, y, z] and it would update the
-/// `block` (to whom the argument are passed to) accordingly.
-static SmallVector<SmallVector<Value, 8>, 2> pruneRedundantArguments(
-    const SmallVector<SmallVector<Value, 8>, 2> &newArguments,
-    RewriterBase &rewriter, Block *block) {
-
-  SmallVector<SmallVector<Value, 8>, 2> newArgumentsPruned(
-      newArguments.size(), SmallVector<Value, 8>());
-
-  if (newArguments.empty())
-    return newArguments;
-
-  // `newArguments` is a 2D array of size `numLists` x `numArgs`
-  unsigned numLists = newArguments.size();
-  unsigned numArgs = newArguments[0].size();
-
-  // Map that for each arg index contains the index that we can use in place of
-  // the original index. E.g., if we have newArgs = [x, y, z, x], we will have
-  // idxToReplacement[3] = 0
-  llvm::DenseMap<unsigned, unsigned> idxToReplacement;
-
-  // This is a useful data structure to track the first appearance of a Value
-  // on a given list of arguments
-  DenseMap<Value, unsigned> firstValueToIdx;
-  for (unsigned j = 0; j < numArgs; ++j) {
-    Value newArg = newArguments[0][j];
-    if (!firstValueToIdx.contains(newArg))
-      firstValueToIdx[newArg] = j;
-  }
-
-  // Go through the first list of arguments (list 0).
-  for (unsigned j = 0; j < numArgs; ++j) {
-    bool shouldReplaceJ = false;
-    unsigned replacement = 0;
-    // Look back to see if there are possible redundancies in list 0. Please
-    // note that we are using a map to annotate when an argument was seen first
-    // to avoid a O(N^2) algorithm. This has the drawback that if we have two
-    // lists like:
-    // list0: [%a, %a, %a]
-    // list1: [%c, %b, %b]
-    // We cannot simplify it, because firstVlaueToIdx[%a] = 0, but we cannot
-    // point list1[1](==%b) or list1[2](==%b) to list1[0](==%c).  However, since
-    // the number of arguments can be potentially unbounded we cannot afford a
-    // O(N^2) algorithm (to search to all the possible pairs) and we need to
-    // accept the trade-off.
-    unsigned k = firstValueToIdx[newArguments[0][j]];
-    if (k != j) {
-      shouldReplaceJ = true;
-      replacement = k;
-      // If a possible redundancy is found, then scan the other lists: we
-      // can prune the arguments if and only if they are redundant in every
-      // list.
-      for (unsigned i = 1; i < numLists; ++i)
-        shouldReplaceJ =
-            shouldReplaceJ && (newArguments[i][k] == newArguments[i][j]);
-    }
-    // Save the replacement.
-    if (shouldReplaceJ)
-      idxToReplacement[j] = replacement;
-  }
-
-  // Populate the pruned argument list.
-  for (unsigned i = 0; i < numLists; ++i)
-    for (unsigned j = 0; j < numArgs; ++j)
-      if (!idxToReplacement.contains(j))
-        newArgumentsPruned[i].push_back(newArguments[i][j]);
-
-  // Replace the block's redundant arguments.
-  SmallVector<unsigned> toErase;
-  for (auto [idx, arg] : llvm::enumerate(block->getArguments())) {
-    if (idxToReplacement.contains(idx)) {
-      Value oldArg = block->getArgument(idx);
-      Value newArg = block->getArgument(idxToReplacement[idx]);
-      rewriter.replaceAllUsesWith(oldArg, newArg);
-      toErase.push_back(idx);
-    }
-  }
-
-  // Erase the block's redundant arguments.
-  for (unsigned idxToErase : llvm::reverse(toErase))
-    block->eraseArgument(idxToErase);
-  return newArgumentsPruned;
-}
-
 LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) {
   // Don't consider clusters that don't have blocks to merge.
   if (blocksToMerge.empty())
@@ -812,10 +722,6 @@ LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) {
         }
       }
     }
-
-    // Prune redundant arguments and update the leader block argument list
-    newArguments = pruneRedundantArguments(newArguments, rewriter, leaderBlock);
-
     // Update the predecessors for each of the blocks.
     auto updatePredecessors = [&](Block *block, unsigned clusterIndex) {
       for (auto predIt = block->pred_begin(), predE = block->pred_end();
@@ -912,108 +818,6 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
   return success(anyChanged);
 }
 
-static LogicalResult dropRedundantArguments(RewriterBase &rewriter,
-                                            Block &block) {
-  SmallVector<size_t> argsToErase;
-
-  // Go through the arguments of the block.
-  for (auto [argIdx, blockOperand] : llvm::enumerate(block.getArguments())) {
-    bool sameArg = true;
-    Value commonValue;
-
-    // Go through the block predecessor and flag if they pass to the block
-    // different values for the same argument.
-    for (auto predIt = block.pred_begin(), predE = block.pred_end();
-         predIt != predE; ++predIt) {
-      auto branch = dyn_cast<BranchOpInterface>((*predIt)->getTerminator());
-      if (!branch) {
-        sameArg = false;
-        break;
-      }
-      unsigned succIndex = predIt.getSuccessorIndex();
-      SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex);
-      auto branchOperands = succOperands.getForwardedOperands();
-      if (!commonValue) {
-        commonValue = branchOperands[argIdx];
-      } else {
-        if (branchOperands[argIdx] != commonValue) {
-          sameArg = false;
-          break;
-        }
-      }
-    }
-
-    // If they are passing the same value, drop the argument.
-    if (commonValue && sameArg) {
-      argsToErase.push_back(argIdx);
-
-      // Remove the argument from the block.
-      rewriter.replaceAllUsesWith(blockOperand, commonValue);
-    }
-  }
-
-  // Remove the arguments.
-  for (auto argIdx : llvm::reverse(argsToErase)) {
-    block.eraseArgument(argIdx);
-
-    // Remove the argument from the branch ops.
-    for (auto predIt = block.pred_begin(), predE = block.pred_end();
-         predIt != predE; ++predIt) {
-      auto branch = cast<BranchOpInterface>((*predIt)->getTerminator());
-      unsigned succIndex = predIt.getSuccessorIndex();
-      SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex);
-      succOperands.erase(argIdx);
-    }
-  }
-  return success(!argsToErase.empty());
-}
-
-/// This optimization drops redundant argument to blocks. I.e., if a given
-/// argument to a block receives the same value from each of the block
-/// predecessors, we can remove the argument from the block and use directly the
-/// original value. This is a simple example:
-///
-/// %cond = llvm.call @rand() : () -> i1
-/// %val0 = llvm.mlir.constant(1 : i64) : i64
-/// %val1 = llvm.mlir.constant(2 : i64) : i64
-/// %val2 = llvm.mlir.constant(3 : i64) : i64
-/// llvm.cond_br %cond, ^bb1(%val0 : i64, %val1 : i64), ^bb2(%val0 : i64, %val2
-/// : i64)
-///
-/// ^bb1(%arg0 : i64, %arg1 : i64):
-///    llvm.call @foo(%arg0, %arg1)
-///
-/// The previous IR can be rewritten as:
-/// %cond = llvm.call @rand() : () -> i1
-/// %val0 = llvm.mlir.constant(1 : i64) : i64
-/// %val1 = llvm.mlir.constant(2 : i64) : i64
-/// %val2 = llvm.mlir.constant(3 : i64) : i64
-/// llvm.cond_br %cond, ^bb1(%val1 : i64), ^bb2(%val2 : i64)
-///
-/// ^bb1(%arg0 : i64):
-///    llvm.call @foo(%val0, %arg0)
-///
-static LogicalResult dropRedundantArguments(RewriterBase &rewriter,
-                                            MutableArrayRef<Region> regions) {
-  llvm::SmallSetVector<Region *, 1> worklist;
-  for (Region &region : regions)
-    worklist.insert(&region);
-  bool anyChanged = false;
-  while (!worklist.empty()) {
-    Region *region = worklist.pop_back_val();
-
-    // Add any nested regions to the worklist.
-    for (Block &block : *region) {
-      anyChanged = succeeded(dropRedundantArguments(rewriter, block));
-
-      for (Operation &op : block)
-        for (Region &nestedRegion : op.getRegions())
-          worklist.insert(&nestedRegion);
-    }
-  }
-  return success(anyChanged);
-}
-
 //===----------------------------------------------------------------------===//
 // Region Simplification
 //===----------------------------------------------------------------------===//
@@ -1028,12 +832,8 @@ LogicalResult mlir::simplifyRegions(RewriterBase &rewriter,
   bool eliminatedBlocks = succeeded(eraseUnreachableBlocks(rewriter, regions));
   bool eliminatedOpsOrArgs = succeeded(runRegionDCE(rewriter, regions));
   bool mergedIdenticalBlocks = false;
-  bool droppedRedundantArguments = false;
-  if (mergeBlocks) {
+  if (mergeBlocks)
     mergedIdenticalBlocks = succeeded(mergeIdenticalBlocks(rewriter, regions));
-    droppedRedundantArguments =
-        succeeded(dropRedundantArguments(rewriter, regions));
-  }
   return success(eliminatedBlocks || eliminatedOpsOrArgs ||
-                 mergedIdenticalBlocks || droppedRedundantArguments);
+                 mergedIdenticalBlocks);
 }
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index b3c0a06c96fea..82e9863ab40bf 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -93,6 +93,7 @@ class PrintOpPass : public impl::ViewOpGraphBase<PrintOpPass> {
       processOperation(getOperation());
       emitAllEdgeStmts();
     });
+    markAllAnalysesPreserved();
   }
 
   /// Create a CFG graph for a region. Used in `Region::viewGraph`.
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 317e688076304..224e77a3f46be 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -123,6 +123,7 @@ __all__ = [
     "Float8E4M3B11FNUZType",
     "Float8E4M3FNType",
     "Float8E4M3FNUZType",
+    "Float8E4M3Type",
     "Float8E5M2FNUZType",
     "Float8E5M2Type",
     "FloatAttr",
@@ -1575,6 +1576,19 @@ class Float8E4M3FNUZType(FloatType):
     @property
     def typeid(self) -> TypeID: ...
 
+class Float8E4M3Type(FloatType):
+    static_typeid: ClassVar[TypeID]
+    @staticmethod
+    def get(context: Optional[Context] = None) -> Float8E4M3Type:
+        """
+        Create a float8_e4m3 type.
+        """
+    @staticmethod
+    def isinstance(other: Type) -> bool: ...
+    def __init__(self, cast_from_type: Type) -> None: ...
+    @property
+    def typeid(self) -> TypeID: ...
+
 class Float8E5M2FNUZType(FloatType):
     static_typeid: ClassVar[TypeID]
     @staticmethod
diff --git a/mlir/python/mlir/extras/types.py b/mlir/python/mlir/extras/types.py
index b93c46b172a9a..fde9909a8f9d6 100644
--- a/mlir/python/mlir/extras/types.py
+++ b/mlir/python/mlir/extras/types.py
@@ -14,6 +14,7 @@
     F64Type,
     Float8E4M3B11FNUZType,
     Float8E4M3FNType,
+    Float8E4M3Type,
     Float8E5M2Type,
     FunctionType,
     IndexType,
@@ -68,6 +69,7 @@ def ui(width):
 bf16 = lambda: BF16Type.get()
 
 f8E5M2 = lambda: Float8E5M2Type.get()
+f8E4M3 = lambda: Float8E4M3Type.get()
 f8E4M3FN = lambda: Float8E4M3FNType.get()
 f8E4M3B11FNUZ = lambda: Float8E4M3B11FNUZType.get()
 
diff --git a/mlir/python/mlir/runtime/np_to_memref.py b/mlir/python/mlir/runtime/np_to_memref.py
index f6b706f9bc8ae..882b2751921bf 100644
--- a/mlir/python/mlir/runtime/np_to_memref.py
+++ b/mlir/python/mlir/runtime/np_to_memref.py
@@ -7,6 +7,12 @@
 import numpy as np
 import ctypes
 
+try:
+    import ml_dtypes
+except ModuleNotFoundError:
+    # The third-party ml_dtypes provides some optional low precision data-types for NumPy.
+    ml_dtypes = None
+
 
 class C128(ctypes.Structure):
     """A ctype representation for MLIR's Double Complex."""
@@ -26,6 +32,12 @@ class F16(ctypes.Structure):
     _fields_ = [("f16", ctypes.c_int16)]
 
 
+class BF16(ctypes.Structure):
+    """A ctype representation for MLIR's BFloat16."""
+
+    _fields_ = [("bf16", ctypes.c_int16)]
+
+
 # https://stackoverflow.com/questions/26921836/correct-way-to-test-for-numpy-dtype
 def as_ctype(dtp):
     """Converts dtype to ctype."""
@@ -35,6 +47,8 @@ def as_ctype(dtp):
         return C64
     if dtp == np.dtype(np.float16):
         return F16
+    if ml_dtypes is not None and dtp == ml_dtypes.bfloat16:
+        return BF16
     return np.ctypeslib.as_ctypes_type(dtp)
 
 
@@ -46,6 +60,11 @@ def to_numpy(array):
         return array.view("complex64")
     if array.dtype == F16:
         return array.view("float16")
+    assert not (
+        array.dtype == BF16 and ml_dtypes is None
+    ), f"bfloat16 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n"
+    if array.dtype == BF16:
+        return array.view("bfloat16")
     return array
 
 
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index acd6dbb25edaf..6ec63e43adf89 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -1,3 +1,4 @@
 numpy>=1.19.5, <=1.26
 pybind11>=2.9.0, <=2.10.3
-PyYAML>=5.3.1, <=6.0.1
\ No newline at end of file
+PyYAML>=5.3.1, <=6.0.1
+ml_dtypes   # provides several NumPy dtype extensions, including the bf16
\ No newline at end of file
diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c
index 082ddc53e6282..d3054aa6a0d93 100644
--- a/mlir/test/CAPI/llvm.c
+++ b/mlir/test/CAPI/llvm.c
@@ -294,7 +294,7 @@ static void testDebugInfoAttributes(MlirContext ctx) {
 
   // CHECK: #llvm.di_local_variable<{{.*}}>
   MlirAttribute local_var = mlirLLVMDILocalVariableAttrGet(
-      ctx, compile_unit, foo, file, 1, 0, 8, di_type);
+      ctx, compile_unit, foo, file, 1, 0, 8, di_type, 0);
   mlirAttributeDump(local_var);
   // CHECK: #llvm.di_derived_type<{{.*}}>
   // CHECK-NOT: dwarfAddressSpace
diff --git a/mlir/test/CAPI/rewrite.c b/mlir/test/CAPI/rewrite.c
index a8b593eabb781..b33d225767046 100644
--- a/mlir/test/CAPI/rewrite.c
+++ b/mlir/test/CAPI/rewrite.c
@@ -68,6 +68,8 @@ void testInsertionPoint(MlirContext ctx) {
   // Get insertion blocks
   MlirBlock block1 = mlirRewriterBaseGetBlock(rewriter);
   MlirBlock block2 = mlirRewriterBaseGetInsertionBlock(rewriter);
+  (void)block1;
+  (void)block2;
   assert(body.ptr == block1.ptr);
   assert(body.ptr == block2.ptr);
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index bb1cedaa276b3..717667c22af80 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -226,3 +226,34 @@ func.func @lds_barrier() {
   amdgpu.lds_barrier
   func.return
 }
+
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+  // CHECK: rocdl.sched.barrier 0
+  amdgpu.sched_barrier allow = <none>
+  // CHECK: rocdl.sched.barrier 1
+  amdgpu.sched_barrier allow = <non_mem_non_sideffect>
+  // CHECK: rocdl.sched.barrier 2
+  amdgpu.sched_barrier allow = <valu>
+  // CHECK: rocdl.sched.barrier 4
+  amdgpu.sched_barrier allow = <salu>
+  // CHECK: rocdl.sched.barrier 8
+  amdgpu.sched_barrier allow = <mfma_wmma>
+  // CHECK: rocdl.sched.barrier 16
+  amdgpu.sched_barrier allow = <all_vmem>
+  // CHECK: rocdl.sched.barrier 32
+  amdgpu.sched_barrier allow = <vmem_read>
+  // CHECK: rocdl.sched.barrier 64
+  amdgpu.sched_barrier allow = <vmem_write>
+  // CHECK: rocdl.sched.barrier 128
+  amdgpu.sched_barrier allow = <all_ds>
+  // CHECK: rocdl.sched.barrier 256
+  amdgpu.sched_barrier allow = <ds_read>
+  // CHECK: rocdl.sched.barrier 512
+  amdgpu.sched_barrier allow = <ds_write>
+  // CHECK: rocdl.sched.barrier 1024
+  amdgpu.sched_barrier allow = <transcendental>
+  // CHECK: rocdl.sched.barrier 18
+  amdgpu.sched_barrier allow = <valu|all_vmem>
+  func.return
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
index 766ad4039335e..ef0e71ee8673b 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
@@ -134,3 +134,19 @@ func.func @arith_shrui_i1(%arg0: i1, %arg1: i1) {
   %shrui = arith.shrui %arg0, %arg1 : i1
   return
 }
+
+// -----
+
+func.func @arith_divui_vector(%arg0: vector<5xi32>, %arg1: vector<5xi32>) -> vector<5xi32> {
+  // expected-error @+1 {{failed to legalize operation 'arith.divui'}}
+  %divui = arith.divui %arg0, %arg1 : vector<5xi32>
+  return %divui: vector<5xi32>
+}
+
+// -----
+
+func.func @arith_remui_vector(%arg0: vector<5xi32>, %arg1: vector<5xi32>) -> vector<5xi32> {
+  // expected-error @+1 {{failed to legalize operation 'arith.remui'}}
+  %divui = arith.remui %arg0, %arg1 : vector<5xi32>
+  return %divui: vector<5xi32>
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index 858ccd1171445..afd1198ede0f7 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -717,3 +717,21 @@ func.func @arith_index_castui(%arg0: i32) -> i32 {
 
   return %int : i32
 }
+
+// -----
+
+func.func @arith_divui_remui(%arg0: i32, %arg1: i32) -> i32 {
+  // CHECK-LABEL: arith_divui_remui
+  // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32, %[[Arg1:[^ ]*]]: i32)
+  // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+  // CHECK: %[[Conv1:.*]] = emitc.cast %[[Arg1]] : i32 to ui32
+  // CHECK: %[[Div:.*]] = emitc.div %[[Conv0]], %[[Conv1]] : (ui32, ui32) -> ui32
+  %div = arith.divui %arg0, %arg1 : i32
+
+  // CHECK: %[[Conv2:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+  // CHECK: %[[Conv3:.*]] = emitc.cast %[[Arg1]] : i32 to ui32
+  // CHECK: %[[Rem:.*]] = emitc.rem %[[Conv2]], %[[Conv3]] : (ui32, ui32) -> ui32
+  %rem = arith.remui %arg0, %arg1 : i32
+
+  return %div : i32
+}
diff --git a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
index 1a844a7cd018b..6418e931f7460 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 //===----------------------------------------------------------------------===//
 // arithmetic ops
diff --git a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
index 02b938be775a3..311174bef15ed 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @combined
 // CHECK: %[[C0_F32:.*]] = spirv.Constant 0.000000e+00 : f32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
index 347d282f9ee0c..c018ccb924983 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
@@ -66,6 +66,28 @@ func.func @simple_vector_8(%arg0 : vector<8xi32>) -> vector<8xi32> {
 
 // -----
 
+// CHECK-LABEL: @simple_vector_2d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
+func.func @simple_vector_2d(%arg0 : vector<4x4xi32>) -> vector<4x4xi32> {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<4x4xi32>
+  // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [1, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT1]] {offsets = [2, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [3, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT0_1:.*]] = vector.extract %[[EXTRACT0]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT1_1:.*]] = vector.extract %[[EXTRACT1]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT2_1:.*]] = vector.extract %[[EXTRACT2]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT3_1:.*]] = vector.extract %[[EXTRACT3]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: return %[[EXTRACT0_1]], %[[EXTRACT1_1]], %[[EXTRACT2_1]], %[[EXTRACT3_1]] : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
+  return %arg0 : vector<4x4xi32>
+}
+
+// -----
+
 // CHECK-LABEL: @vector_6and8
 // CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
 func.func @vector_6and8(%arg0 : vector<6xi32>, %arg1 : vector<8xi32>) -> (vector<6xi32>, vector<8xi32>) {
@@ -113,6 +135,28 @@ func.func @scalar_vector(%arg0 : vector<8xi32>, %arg1 : vector<3xi32>, %arg2 : i
 
 // -----
 
+// CHECK-LABEL: @vector_2dand1d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>, %[[ARG3:.+]]: vector<3xi32>, %[[ARG4:.+]]: vector<4xi32>)
+func.func @vector_2dand1d(%arg0 : vector<2x6xi32>, %arg1 : vector<4xi32>) -> (vector<2x6xi32>, vector<4xi32>) {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<2x6xi32>
+  // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0, 0], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [0, 3], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT1]] {offsets = [1, 0], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [1, 3], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[EXTRACT0:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 0], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT0_1:.*]]  = vector.extract %[[EXTRACT0]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT1:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 3], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT1_1:.*]]  = vector.extract %[[EXTRACT1]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT2:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 0], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT2_1:.*]]  = vector.extract %[[EXTRACT2]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT3:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 3], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT3_1:.*]]  = vector.extract %[[EXTRACT3]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: return %[[EXTRACT0_1]], %[[EXTRACT1_1]], %[[EXTRACT2_1]], %[[EXTRACT3_1]], %[[ARG4]] : vector<3xi32>, vector<3xi32>, vector<3xi32>, vector<3xi32>, vector<4xi32>
+  return %arg0, %arg1 : vector<2x6xi32>, vector<4xi32>
+}
+
+// -----
+
 // CHECK-LABEL: @reduction
 // CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>, %[[ARG4:.+]]: i32)
 func.func @reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>, %arg2 : i32) -> (i32) {
diff --git a/mlir/test/Conversion/ConvertToSPIRV/index.mlir b/mlir/test/Conversion/ConvertToSPIRV/index.mlir
index e1cb18aac5d01..f4b116849fa93 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/index.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/index.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-to-spirv="run-signature-conversion=false" | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @basic
 func.func @basic(%a: index, %b: index) {
diff --git a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
index 58ec6ac61f6ac..246464928b81c 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @if_yield
 // CHECK: %[[VAR:.*]] = spirv.Variable : !spirv.ptr<f32, Function>
diff --git a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
index c5e0e6603d94a..00556140c3018 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @return_scalar
 // CHECK-SAME: %[[ARG0:.*]]: i32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
index a83bfb6f405a0..f34ca01c94f00 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @ub
 // CHECK: %[[UNDEF:.*]] = spirv.Undef : i32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir
new file mode 100644
index 0000000000000..043f9422d8790
--- /dev/null
+++ b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir
@@ -0,0 +1,102 @@
+// RUN: mlir-opt -test-spirv-vector-unrolling -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: @vaddi
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>, %[[ARG3:.+]]: vector<3xi32>)
+func.func @vaddi(%arg0 : vector<6xi32>, %arg1 : vector<6xi32>) -> (vector<6xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<3xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<3xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]] : vector<3xi32>, vector<3xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<6xi32>
+  return %0 : vector<6xi32>
+}
+
+// CHECK-LABEL: @vaddi_2d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<2xi32>, %[[ARG1:.+]]: vector<2xi32>, %[[ARG2:.+]]: vector<2xi32>, %[[ARG3:.+]]: vector<2xi32>)
+func.func @vaddi_2d(%arg0 : vector<2x2xi32>, %arg1 : vector<2x2xi32>) -> (vector<2x2xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<2xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<2xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]] : vector<2xi32>, vector<2xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<2x2xi32>
+  return %0 : vector<2x2xi32>
+}
+
+// CHECK-LABEL: @vaddi_2d_8
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>, %[[ARG4:.+]]: vector<4xi32>, %[[ARG5:.+]]: vector<4xi32>, %[[ARG6:.+]]: vector<4xi32>, %[[ARG7:.+]]: vector<4xi32>)
+func.func @vaddi_2d_8(%arg0 : vector<2x8xi32>, %arg1 : vector<2x8xi32>) -> (vector<2x8xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG4]] : vector<4xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG5]] : vector<4xi32>
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[ARG2]], %[[ARG6]] : vector<4xi32>
+  // CHECK: %[[ADD3:.*]] = arith.addi %[[ARG3]], %[[ARG7]] : vector<4xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]], %[[ADD2]], %[[ADD3]] : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<2x8xi32>
+  return %0 : vector<2x8xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @reduction_5
+// CHECK-SAME: (%[[ARG0:.+]]: vector<1xi32>, %[[ARG1:.+]]: vector<1xi32>, %[[ARG2:.+]]: vector<1xi32>, %[[ARG3:.+]]: vector<1xi32>, %[[ARG4:.+]]: vector<1xi32>)
+func.func @reduction_5(%arg0 : vector<5xi32>) -> (i32) {
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : i32 from vector<1xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract %[[ARG1]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[EXTRACT0]], %[[EXTRACT1]] : i32
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract %[[ARG2]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[EXTRACT2]] : i32
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract %[[ARG3]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[ADD1]], %[[EXTRACT3]] : i32
+  // CHECK: %[[EXTRACT4:.*]] = vector.extract %[[ARG4]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[EXTRACT4]] : i32
+  // CHECK: return %[[ADD3]] : i32
+  %0 = vector.reduction <add>, %arg0 : vector<5xi32> into i32
+  return %0 : i32
+}
+
+// CHECK-LABEL: @reduction_8
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>)
+func.func @reduction_8(%arg0 : vector<8xi32>) -> (i32) {
+  // CHECK: %[[REDUCTION0:.*]] = vector.reduction <add>, %[[ARG0]] : vector<4xi32> into i32
+  // CHECK: %[[REDUCTION1:.*]] = vector.reduction <add>, %[[ARG1]] : vector<4xi32> into i32
+  // CHECK: %[[ADD:.*]] = arith.addi %[[REDUCTION0]], %[[REDUCTION1]] : i32
+  // CHECK: return %[[ADD]] : i32
+  %0 = vector.reduction <add>, %arg0 : vector<8xi32> into i32
+  return %0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @vaddi_reduction
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
+func.func @vaddi_reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>) -> (i32) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<4xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<4xi32>
+  // CHECK: %[[REDUCTION0:.*]] = vector.reduction <add>, %[[ADD0]] : vector<4xi32> into i32
+  // CHECK: %[[REDUCTION1:.*]] = vector.reduction <add>, %[[ADD1]] : vector<4xi32> into i32
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[REDUCTION0]], %[[REDUCTION1]] : i32
+  // CHECK: return %[[ADD2]] : i32
+  %0 = arith.addi %arg0, %arg1 : vector<8xi32>
+  %1 = vector.reduction <add>, %0 : vector<8xi32> into i32
+  return %1 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @transpose
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>)
+func.func @transpose(%arg0 : vector<2x3xi32>) -> (vector<3x2xi32>) {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<2xi32>
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT0:.*]]= vector.insert %[[EXTRACT0]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract %[[ARG1]][0] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert %[[EXTRACT1]], %[[INSERT0]][1] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract %[[ARG0]][1] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert %[[EXTRACT2]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract %[[ARG1]][1] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert %[[EXTRACT3]], %[[INSERT2]] [1] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT4:.*]] = vector.extract %[[ARG0]][2] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT4:.*]] = vector.insert %[[EXTRACT4]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT5:.*]] = vector.extract %[[ARG1]][2] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT5:.*]] = vector.insert %[[EXTRACT5]], %[[INSERT4]] [1] : i32 into vector<2xi32>
+  // CHECK: return %[[INSERT1]], %[[INSERT3]], %[[INSERT5]] : vector<2xi32>, vector<2xi32>, vector<2xi32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<2x3xi32> to vector<3x2xi32>
+  return %0 : vector<3x2xi32>
+}
diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
index c63dd030f4747..e369eadca5730 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @extract
 //  CHECK-SAME: %[[ARG:.+]]: vector<2xf32>
diff --git a/mlir/test/Conversion/FuncToLLVM/convert-funcs.mlir b/mlir/test/Conversion/FuncToLLVM/convert-funcs.mlir
index dbbda8e1513a8..755c4cf42689c 100644
--- a/mlir/test/Conversion/FuncToLLVM/convert-funcs.mlir
+++ b/mlir/test/Conversion/FuncToLLVM/convert-funcs.mlir
@@ -32,7 +32,7 @@ func.func @pass_through(%arg0: () -> ()) -> (() -> ()) {
 func.func private @llvmlinkage(i32) attributes { "llvm.linkage" = #llvm.linkage<extern_weak> }
 
 // CHECK-LABEL: llvm.func @llvmreadnone(i32)
-// CHECK-SAME: memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+// CHECK-SAME: memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
 func.func private @llvmreadnone(i32) attributes { llvm.readnone }
 
 // CHECK-LABEL: llvm.func @body(i32)
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index 1b0f89a9a573e..bd7e5d139b001 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -4,30 +4,70 @@
 // RUN: | FileCheck --check-prefixes=CHECK-32,CHECK %s
 
 gpu.module @builtins {
-  // CHECK-64:    llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z12get_local_idj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z13get_global_idj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64
-  // CHECK-32:    llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z12get_local_idj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z13get_global_idj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z12get_group_idj(i32) -> i32
+  // CHECK-64:        llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z12get_local_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z12get_local_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z13get_global_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z13get_global_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z12get_group_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_block_id
   func.func @gpu_block_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_x = gpu.block_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_y = gpu.block_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_z = gpu.block_id z
     return %block_id_x, %block_id_y, %block_id_z : index, index, index
   }
@@ -35,16 +75,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_global_id
   func.func @gpu_global_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_x = gpu.global_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_y = gpu.global_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_z = gpu.global_id z
     return %global_id_x, %global_id_y, %global_id_z : index, index, index
   }
@@ -52,16 +107,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_block_dim
   func.func @gpu_block_dim() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_x = gpu.block_dim x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_y = gpu.block_dim y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_z = gpu.block_dim z
     return %block_dim_x, %block_dim_y, %block_dim_z : index, index, index
   }
@@ -69,16 +139,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_thread_id
   func.func @gpu_thread_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_x = gpu.thread_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_y = gpu.thread_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_z = gpu.thread_id z
     return %thread_id_x, %thread_id_y, %thread_id_z : index, index, index
   }
@@ -86,16 +171,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_grid_dim
   func.func @gpu_grid_dim() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_x = gpu.grid_dim x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_y = gpu.grid_dim y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_z = gpu.grid_dim z
     return %grid_dim_x, %grid_dim_y, %grid_dim_z : index, index, index
   }
@@ -104,12 +204,22 @@ gpu.module @builtins {
 // -----
 
 gpu.module @barriers {
-  // CHECK:       llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z7barrierj(i32) attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_barrier
   func.func @gpu_barrier() {
     // CHECK:         [[FLAGS:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) : (i32) -> ()
+    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i32) -> ()
     gpu.barrier
     return
   }
@@ -120,10 +230,30 @@ gpu.module @barriers {
 // Check `gpu.shuffle` conversion with default subgroup size.
 
 gpu.module @shuffles {
-  // CHECK:       llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_shuffles
   // CHECK-SAME:              (%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: i32, %[[VAL_6:.*]]: f64, %[[VAL_7:.*]]: i32)
@@ -132,13 +262,33 @@ gpu.module @shuffles {
                           %val2: f32, %delta_up: i32,
                           %val3: f64, %delta_down: i32) {
     %width = arith.constant 32 : i32
-    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) : (i32, i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i32, i32) -> i32
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) : (i64, i32) -> i64
+    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i64, i32) -> i64
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) : (f32, i32) -> f32
+    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects= #llvm.memory_effects
+    // CHECK-SAME:    } : (f32, i32) -> f32
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) : (f64, i32) -> f64
+    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects= #llvm.memory_effects
+    // CHECK-SAME:    } : (f64, i32) -> f64
     // CHECK:         llvm.mlir.constant(true) : i1
     %shuffleResult0, %valid0 = gpu.shuffle idx %val0, %id, %width : i32
     %shuffleResult1, %valid1 = gpu.shuffle xor %val1, %mask, %width : i64
@@ -155,10 +305,30 @@ gpu.module @shuffles {
 gpu.module @shuffles attributes {
   spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Kernel, Addresses, GroupNonUniformShuffle, Int64], []>, #spirv.resource_limits<subgroup_size = 16>>
 } {
-  // CHECK:       llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_shuffles
   // CHECK-SAME:              (%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: i32, %[[VAL_6:.*]]: f64, %[[VAL_7:.*]]: i32)
@@ -167,13 +337,13 @@ gpu.module @shuffles attributes {
                           %val2: f32, %delta_up: i32,
                           %val3: f64, %delta_down: i32) {
     %width = arith.constant 16 : i32
-    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) : (i32, i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) : (i64, i32) -> i64
+    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) : (f32, i32) -> f32
+    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) : (f64, i32) -> f64
+    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]])
     // CHECK:         llvm.mlir.constant(true) : i1
     %shuffleResult0, %valid0 = gpu.shuffle idx %val0, %id, %width : i32
     %shuffleResult1, %valid1 = gpu.shuffle xor %val1, %mask, %width : i64
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index d914790c05fe0..c23b11e46b24c 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -254,13 +254,16 @@ gpu.module @test_module_9 {
 gpu.module @test_module_10 {
   // CHECK: llvm.func @__nv_cosf(f32) -> f32
   // CHECK: llvm.func @__nv_cos(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_cosf(f32) -> f32
   // CHECK-LABEL: func @gpu_cos
-  func.func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.cos %arg_f32 : f32
     // CHECK: llvm.call @__nv_cosf(%{{.*}}) : (f32) -> f32
     %result64 = math.cos %arg_f64 : f64
     // CHECK: llvm.call @__nv_cos(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.cos %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_cosf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -268,13 +271,16 @@ gpu.module @test_module_10 {
 gpu.module @test_module_11 {
   // CHECK: llvm.func @__nv_expf(f32) -> f32
   // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_expf(f32) -> f32
   // CHECK-LABEL: func @gpu_exp
-  func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.exp %arg_f32 : f32
     // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
     %result64 = math.exp %arg_f64 : f64
     // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.exp %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_expf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -297,13 +303,16 @@ gpu.module @test_module_12 {
 gpu.module @test_module_13 {
   // CHECK: llvm.func @__nv_logf(f32) -> f32
   // CHECK: llvm.func @__nv_log(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_logf(f32) -> f32
   // CHECK-LABEL: func @gpu_log
-  func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log %arg_f32 : f32
     // CHECK: llvm.call @__nv_logf(%{{.*}}) : (f32) -> f32
     %result64 = math.log %arg_f64 : f64
     // CHECK: llvm.call @__nv_log(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_logf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -312,13 +321,16 @@ gpu.module @test_module_13 {
 gpu.module @test_module_14 {
   // CHECK: llvm.func @__nv_log10f(f32) -> f32
   // CHECK: llvm.func @__nv_log10(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_log10f(f32) -> f32
   // CHECK-LABEL: func @gpu_log10
-  func.func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log10 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log10f(%{{.*}}) : (f32) -> f32
     %result64 = math.log10 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log10(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log10 %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_log10f(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -342,13 +354,16 @@ gpu.module @test_module_15 {
 gpu.module @test_module_16 {
   // CHECK: llvm.func @__nv_log2f(f32) -> f32
   // CHECK: llvm.func @__nv_log2(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_log2f(f32) -> f32
   // CHECK-LABEL: func @gpu_log2
-  func.func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log2 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log2f(%{{.*}}) : (f32) -> f32
     %result64 = math.log2 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log2(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log2 %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_log2f(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -357,13 +372,16 @@ gpu.module @test_module_16 {
 gpu.module @test_module_17 {
   // CHECK: llvm.func @__nv_sinf(f32) -> f32
   // CHECK: llvm.func @__nv_sin(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_sinf(f32) -> f32
   // CHECK-LABEL: func @gpu_sin
-  func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.sin %arg_f32 : f32
     // CHECK: llvm.call @__nv_sinf(%{{.*}}) : (f32) -> f32
     %result64 = math.sin %arg_f64 : f64
     // CHECK: llvm.call @__nv_sin(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.sin %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_sinf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -372,8 +390,9 @@ gpu.module @test_module_17 {
 gpu.module @test_module_18 {
   // CHECK: llvm.func @__nv_tanf(f32) -> f32
   // CHECK: llvm.func @__nv_tan(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_tanf(f32) -> f32
   // CHECK-LABEL: func @gpu_tan
-  func.func @gpu_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+  func.func @gpu_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64, f32) {
     %result16 = math.tan %arg_f16 : f16
     // CHECK: llvm.fpext %{{.*}} : f16 to f32
     // CHECK-NEXT: llvm.call @__nv_tanf(%{{.*}}) : (f32) -> f32
@@ -382,7 +401,9 @@ gpu.module @test_module_18 {
     // CHECK: llvm.call @__nv_tanf(%{{.*}}) : (f32) -> f32
     %result64 = math.tan %arg_f64 : f64
     // CHECK: llvm.call @__nv_tan(%{{.*}}) : (f64) -> f64
-    func.return %result16, %result32, %result64 : f16, f32, f64
+    %result32Fast = math.tan %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_tanf(%{{.*}}) : (f32) -> f32
+    func.return %result16, %result32, %result64, %result32Fast : f16, f32, f64, f32
   }
 }
 
@@ -494,13 +515,16 @@ gpu.module @test_module_24 {
   // CHECK: test.symbol_scope
   // CHECK: llvm.func @__nv_expf(f32) -> f32
   // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_expf(f32) -> f32
   // CHECK-LABEL: func @gpu_exp
-    func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
       %result32 = math.exp %arg_f32 : f32
       // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
       %result64 = math.exp %arg_f64 : f64
       // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
-      func.return %result32, %result64 : f32, f64
+      %result32Fast = math.exp %arg_f32 fastmath<afn> : f32
+      // CHECK: llvm.call @__nv_fast_expf(%{{.*}}) : (f32) -> f32
+      func.return %result32, %result64, %result32Fast : f32, f64, f32
     }
     "test.finish" () : () -> ()
   }) : () -> ()
@@ -526,13 +550,16 @@ gpu.module @test_module_25 {
 gpu.module @test_module_26 {
   // CHECK: llvm.func @__nv_powf(f32, f32) -> f32
   // CHECK: llvm.func @__nv_pow(f64, f64) -> f64
+  // CHECK: llvm.func @__nv_fast_powf(f32, f32) -> f32
   // CHECK-LABEL: func @gpu_pow
-  func.func @gpu_pow(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_pow(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.powf %arg_f32, %arg_f32 : f32
     // CHECK: llvm.call @__nv_powf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
     %result64 = math.powf %arg_f64, %arg_f64 : f64
     // CHECK: llvm.call @__nv_pow(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.powf %arg_f32, %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_powf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -701,6 +728,194 @@ gpu.module @test_module_34 {
   }
 }
 
+gpu.module @test_module_35 {
+  // CHECK: llvm.func @__nv_acosf(f32) -> f32
+  // CHECK: llvm.func @__nv_acos(f64) -> f64
+  // CHECK-LABEL: func @gpu_acos
+  func.func @gpu_acos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.acos %arg_f32 : f32
+    // CHECK: llvm.call @__nv_acosf(%{{.*}}) : (f32) -> f32
+    %result64 = math.acos %arg_f64 : f64
+    // CHECK: llvm.call @__nv_acos(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_36 {
+  // CHECK: llvm.func @__nv_acoshf(f32) -> f32
+  // CHECK: llvm.func @__nv_acosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_acosh
+  func.func @gpu_acosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.acosh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_acoshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.acosh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_acosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_37 {
+  // CHECK: llvm.func @__nv_asinf(f32) -> f32
+  // CHECK: llvm.func @__nv_asin(f64) -> f64
+  // CHECK-LABEL: func @gpu_asin
+  func.func @gpu_asin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.asin %arg_f32 : f32
+    // CHECK: llvm.call @__nv_asinf(%{{.*}}) : (f32) -> f32
+    %result64 = math.asin %arg_f64 : f64
+    // CHECK: llvm.call @__nv_asin(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_38 {
+  // CHECK: llvm.func @__nv_asinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_asinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_asinh
+  func.func @gpu_asinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.asinh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_asinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.asinh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_asinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_39 {
+  // CHECK: llvm.func @__nv_atanhf(f32) -> f32
+  // CHECK: llvm.func @__nv_atanh(f64) -> f64
+  // CHECK-LABEL: func @gpu_atanh
+  func.func @gpu_atanh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64)
+      -> (f16, f32, f64) {
+    %result16 = math.atanh %arg_f16 : f16
+    // CHECK: llvm.fpext %{{.*}} : f16 to f32
+    // CHECK-NEXT: llvm.call @__nv_atanhf(%{{.*}}) : (f32) -> f32
+    // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to f16
+    %result32 = math.atanh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_atanhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.atanh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_atanh(%{{.*}}) : (f64) -> f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
+  }
+}
+
+gpu.module @test_module_40 {
+  // CHECK: llvm.func @__nv_copysignf(f32, f32) -> f32
+  // CHECK: llvm.func @__nv_copysign(f64, f64) -> f64
+  // CHECK-LABEL: func @gpu_copysign
+  func.func @gpu_copysign(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.copysign %arg_f32, %arg_f32 : f32
+    // CHECK: llvm.call @__nv_copysignf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
+    %result64 = math.copysign %arg_f64, %arg_f64 : f64
+    // CHECK: llvm.call @__nv_copysign(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_41 {
+  // CHECK: llvm.func @__nv_coshf(f32) -> f32
+  // CHECK: llvm.func @__nv_cosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_cosh
+  func.func @gpu_cosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.cosh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_coshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.cosh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_cosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_42 {
+  // CHECK: llvm.func @__nv_fmaf(f32, f32, f32) -> f32
+  // CHECK: llvm.func @__nv_fma(f64, f64, f64) -> f64
+  // CHECK-LABEL: func @gpu_fma
+  func.func @gpu_fma(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.fma %arg_f32, %arg_f32, %arg_f32 : f32
+    // CHECK: llvm.call @__nv_fmaf(%{{.*}}, %{{.*}}, %{{.*}}) : (f32, f32, f32) -> f32
+    %result64 = math.fma %arg_f64, %arg_f64, %arg_f64 : f64
+    // CHECK: llvm.call @__nv_fma(%{{.*}}, %{{.*}}, %{{.*}}) : (f64, f64, f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_43 {
+  // CHECK: llvm.func @__nv_roundf(f32) -> f32
+  // CHECK: llvm.func @__nv_round(f64) -> f64
+  // CHECK-LABEL: func @gpu_round
+  func.func @gpu_round(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.round %arg_f32 : f32
+    // CHECK: llvm.call @__nv_roundf(%{{.*}}) : (f32) -> f32
+    %result64 = math.round %arg_f64 : f64
+    // CHECK: llvm.call @__nv_round(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_44 {
+  // CHECK: llvm.func @__nv_rintf(f32) -> f32
+  // CHECK: llvm.func @__nv_rint(f64) -> f64
+  // CHECK-LABEL: func @gpu_roundeven
+  func.func @gpu_roundeven(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.roundeven %arg_f32 : f32
+    // CHECK: llvm.call @__nv_rintf(%{{.*}}) : (f32) -> f32
+    %result64 = math.roundeven %arg_f64 : f64
+    // CHECK: llvm.call @__nv_rint(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_45 {
+  // CHECK: llvm.func @__nv_sinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_sinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_sinh
+  func.func @gpu_sinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.sinh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_sinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.sinh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_sinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_46 {
+  // CHECK: llvm.func @__nv_coshf(f32) -> f32
+  // CHECK: llvm.func @__nv_cosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_cosh_with_fastmath
+  func.func @gpu_cosh_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.cosh %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_coshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.cosh %arg_f64 fastmath<afn> : f64
+    // CHECK: llvm.call @__nv_cosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_47 {
+  // CHECK: llvm.func @__nv_sinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_sinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_sinh_with_fastmath
+  func.func @gpu_sinh_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.sinh %arg_f32 fastmath<contract> : f32
+    // CHECK: llvm.call @__nv_sinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.sinh %arg_f64 fastmath<none> : f64
+    // CHECK: llvm.call @__nv_sinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_48 {
+  // CHECK: llvm.func @__nv_expf(f32) -> f32
+  // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK-LABEL: func @gpu_exp_with_fastmath
+  func.func @gpu_exp_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
+    %result32 = math.exp %arg_f32 fastmath<reassoc>: f32
+    // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
+    %result64 = math.exp %arg_f64 fastmath<contract>: f64
+    // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
+    %result32Fast = math.exp %arg_f32 fastmath<ninf> : f32
+    // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
+  }
+}
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
@@ -729,9 +944,9 @@ module attributes {transform.with_named_sequence} {
       legal_dialects = ["llvm", "memref", "nvvm", "test"],
       legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
       illegal_dialects = ["gpu"],
-      illegal_ops = ["llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
-                    "llvm.ffloor", "llvm.log", "llvm.log10", "llvm.log2","llvm.pow",
-                    "llvm.sin", "llvm.sqrt"],
+      illegal_ops = ["llvm.copysign", "llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
+                    "llvm.ffloor", "llvm.fma", "llvm.frem", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
+                    "llvm.roundeven", "llvm.round", "llvm.sin", "llvm.sqrt"],
       partial_conversion
     } : !transform.any_op
     transform.yield
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 86a552c03a473..156a8a468d5b4 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -1339,3 +1339,18 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// CHECK-LABEL: @rcp_approx_ftz_f32
+// CHECK-SAME:  %[[IN:.*]]: vector<32x16xf32>
+func.func @rcp_approx_ftz_f32(%in: vector<32x16xf32>) {
+  // CHECK: %[[IN_LLVM:.*]] = builtin.unrealized_conversion_cast %[[IN]] : vector<32x16xf32> to !llvm.array<32 x vector<16xf32>>
+  // CHECK: %[[IN1DVEC:.*]] = llvm.extractvalue %[[IN_LLVM]][0] : !llvm.array<32 x vector<16xf32>>
+  // CHECK: %[[OUT1DVEC:.*]] = llvm.mlir.undef : vector<16xf32>
+  // CHECK: %[[IDX_0:.+]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK: %[[ELEM_0:.*]] = llvm.extractelement %[[IN1DVEC]][%[[IDX_0]] : i64]
+  // CHECK: %[[ELEM_RCP0:.*]] = nvvm.rcp.approx.ftz.f %[[ELEM_0]] : f32
+  // CHECK: llvm.insertelement %[[ELEM_RCP0]], %[[OUT1DVEC]][%[[IDX_0]] : i64] : vector<16xf32>
+  // CHECK-COUNT-511: nvvm.rcp.approx.ftz.f
+  %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<32x16xf32>
+  return
+}
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index 4c9e09970279a..d81487daf34f6 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -253,7 +253,7 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) {
 // CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(64 : i32) : i32
 // CHECK:           %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
 // CHECK:           %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG_1]] : !llvm.ptr, i32)   map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""}
-// CHECK:           omp.target   thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) {
+// CHECK:           omp.target map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) thread_limit(%[[VAL_0]] : i32) {
 // CHECK:           ^bb0(%[[BB_ARG0]]: !llvm.ptr, %[[BB_ARG1]]: !llvm.ptr):
 // CHECK:             %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32
 // CHECK:             llvm.store %[[VAL_1]], %[[BB_ARG1]] : i32, !llvm.ptr
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 5187d79fd4c0b..0e35f8ea9d0cd 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -545,6 +545,14 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
   // CHECK: math.erf
   %24 = tosa.erf %0 : (tensor<1xf32>) -> tensor<1xf32>
 
+  // CHECK: linalg.generic
+  // CHECK: math.sin
+  %25 = tosa.sin %arg0 : (tensor<1xf32>) -> tensor<1xf32>
+
+  // CHECK: linalg.generic
+  // CHECK: math.cos
+  %26 = tosa.cos %arg0 : (tensor<1xf32>) -> tensor<1xf32>
+
   return
 }
 
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index 667aad7645c51..dd0ed77470a25 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -150,6 +150,19 @@ func.func @broadcast(%arg0 : f32) -> (vector<4xf32>, vector<2xf32>) {
 
 // -----
 
+// CHECK-LABEL: @broadcast_index
+//  CHECK-SAME: %[[ARG0:.*]]: index
+//       CHECK:   %[[CAST0:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : index to i32
+//       CHECK:   %[[CONSTRUCT:.*]] = spirv.CompositeConstruct %[[CAST0]], %[[CAST0]], %[[CAST0]], %[[CAST0]] : (i32, i32, i32, i32) -> vector<4xi32>
+//       CHECK:   %[[CAST1:.*]] = builtin.unrealized_conversion_cast %[[CONSTRUCT]] : vector<4xi32> to vector<4xindex>
+//       CHECK:   return %[[CAST1]] : vector<4xindex>
+func.func @broadcast_index(%a: index) -> vector<4xindex> {
+  %0 = vector.broadcast %a : index to vector<4xindex>
+  return %0 : vector<4xindex>
+}
+
+// -----
+
 // CHECK-LABEL: @extract
 //  CHECK-SAME: %[[ARG:.+]]: vector<2xf32>
 //       CHECK:   spirv.CompositeExtract %[[ARG]][0 : i32] : vector<2xf32>
@@ -781,6 +794,32 @@ func.func @shape_cast_size1_vector(%arg0 : vector<f32>) -> vector<1xf32> {
 
 // -----
 
+// CHECK-LABEL: @step()
+//       CHECK:   %[[CST0:.*]] = spirv.Constant 0 : i32
+//       CHECK:   %[[CST1:.*]] = spirv.Constant 1 : i32
+//       CHECK:   %[[CST2:.*]] = spirv.Constant 2 : i32
+//       CHECK:   %[[CST3:.*]] = spirv.Constant 3 : i32
+//       CHECK:   %[[CONSTRUCT:.*]] = spirv.CompositeConstruct %[[CST0]], %[[CST1]], %[[CST2]], %[[CST3]] : (i32, i32, i32, i32) -> vector<4xi32>
+//       CHECK:   %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[CONSTRUCT]] : vector<4xi32> to vector<4xindex>
+//       CHECK:   return %[[CAST]] : vector<4xindex>
+func.func @step() -> vector<4xindex> {
+  %0 = vector.step : vector<4xindex>
+  return %0 : vector<4xindex>
+}
+
+// -----
+
+// CHECK-LABEL: @step_size1()
+//       CHECK:   %[[CST0:.*]] = spirv.Constant 0 : i32
+//       CHECK:   %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[CST0]] : i32 to vector<1xindex>
+//       CHECK:   return %[[CAST]] : vector<1xindex>
+func.func @step_size1() -> vector<1xindex> {
+  %0 = vector.step : vector<1xindex>
+  return %0 : vector<1xindex>
+}
+
+// -----
+
 module attributes {
   spirv.target_env = #spirv.target_env<
     #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 744a096d757e0..9457a1b9e4498 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -109,6 +109,15 @@ func.func @lds_barrier() {
   func.return
 }
 
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+  // CHECK: amdgpu.sched_barrier allow = <none>
+  amdgpu.sched_barrier allow = <none>
+  // CHECK: amdgpu.sched_barrier allow = <valu|all_vmem>
+  amdgpu.sched_barrier allow = <valu|all_vmem>
+  func.return
+}
+
 // CHECK-LABEL: func @mfma
 func.func @mfma(%arg0 : f32, %arg1 : vector<32xf32>) -> vector<32xf32> {
   // CHECK: amdgpu.mfma
diff --git a/mlir/test/Dialect/Affine/loop-tiling-validity.mlir b/mlir/test/Dialect/Affine/loop-tiling-validity.mlir
index d1b80520ca7fe..e2c3832f695cc 100644
--- a/mlir/test/Dialect/Affine/loop-tiling-validity.mlir
+++ b/mlir/test/Dialect/Affine/loop-tiling-validity.mlir
@@ -34,7 +34,7 @@ func.func @illegal_loop_with_diag_dependence() {
   %A = memref.alloc() : memref<64x64xf32>
 
   affine.for %i = 0 to 64 {
-    // expected-remark@above {{tiling code is illegal due to dependences}}
+    // expected-remark@above {{tiling nest is invalid due to dependences}}
     affine.for %j = 0 to 64 {
       %0 = affine.load %A[%j, %i] : memref<64x64xf32>
       %1 = affine.load %A[%i, %j - 1] : memref<64x64xf32>
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
index 9c22b29ac22e7..2e1f3d1ee10a9 100644
--- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
@@ -430,3 +430,181 @@ func.func @cond_branch_with_backedge(%slice: vector<[4]xf32>) {
   // Live here: %finalTileA, %finalTileB, %finalTileC, %finalTileD
   return
 }
+
+// -----
+
+//  CHECK-LIVE-RANGE-LABEL: @fill_holes_in_tile_liveness
+//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+//        CHECK-LIVE-RANGE: ^bb0:
+//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT: E  cf.cond_br
+//   CHECK-LIVE-RANGE-NEXT: ^bb1:
+//   CHECK-LIVE-RANGE-NEXT:  S arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
+//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
+//   CHECK-LIVE-RANGE-NEXT:    cf.br
+//   CHECK-LIVE-RANGE-NEXT: ^bb2:
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: E  test.some_use
+//   CHECK-LIVE-RANGE-NEXT:    cf.br
+
+// Here there's a 'hole' in the liveness of %tileA (in bb1) where another value
+// can reuse the tile ID assigned to %tileA. The liveness for %tileB is
+// entirely within the 'hole' in %tileA's live range, so %tileB should get the
+// same tile ID as %tileA.
+
+// CHECK-LABEL: @fill_holes_in_tile_liveness
+func.func @fill_holes_in_tile_liveness(%cond: i1) {
+  // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A:.*]] : i32}
+  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
+  cf.cond_br %cond, ^bb2, ^bb1
+^bb1:
+  // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A]] : i32}
+  %tileB = arm_sme.get_tile : vector<[4]x[4]xf32>
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> ()
+  cf.br ^bb3
+^bb2:
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
+  cf.br ^bb3
+^bb3:
+  return
+}
+
+// -----
+
+//  CHECK-LIVE-RANGE-LABEL: @holes_in_tile_liveness_inactive_overlaps
+//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+//        CHECK-LIVE-RANGE: ^bb0:
+//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT: E  cf.cond_br
+//   CHECK-LIVE-RANGE-NEXT: ^bb1:
+//   CHECK-LIVE-RANGE-NEXT:  S arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
+//   CHECK-LIVE-RANGE-NEXT:  | test.some_use
+//   CHECK-LIVE-RANGE-NEXT:  | arm_sme.copy_tile
+//   CHECK-LIVE-RANGE-NEXT:  E cf.br
+//   CHECK-LIVE-RANGE-NEXT: ^bb2:
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT: E| test.some_use
+//   CHECK-LIVE-RANGE-NEXT:  | arm_sme.copy_tile
+//   CHECK-LIVE-RANGE-NEXT:  E cf.br
+//   CHECK-LIVE-RANGE-NEXT: ^bb3:
+//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
+//   CHECK-LIVE-RANGE-NEXT:    func.return
+
+// This tests an edge case in inactive live ranges. The first live range is
+// inactive at the start of ^bb1. If the tile allocator did not check if the
+// second live range overlapped the first it would wrongly re-use tile ID 0
+// (as the first live range is inactive so tile ID 0 is free). This would mean
+// in ^bb2 two overlapping live ranges would have the same tile ID (bad!).
+
+// CHECK-LABEL: @holes_in_tile_liveness_inactive_overlaps
+func.func @holes_in_tile_liveness_inactive_overlaps(%cond: i1) {
+  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
+  cf.cond_br %cond, ^bb2, ^bb1
+^bb1:
+  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
+  %tileB = arm_sme.get_tile : vector<[4]x[4]xf32>
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> ()
+  cf.br ^bb3(%tileB: vector<[4]x[4]xf32>)
+^bb2:
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
+  %tileC = arm_sme.get_tile : vector<[4]x[4]xf32>
+  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
+  cf.br ^bb3(%tileC: vector<[4]x[4]xf32>)
+^bb3(%tile: vector<[4]x[4]xf32>):
+  "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// This is the same as the previous example, but changes the tile types to
+// vector<[16]x[16]xi8>. This means in bb1 the allocator will need to spill the
+// first live range (which is inactive).
+
+// Note: The live ranges are the same as the previous example (so are not checked).
+
+// CHECK-LABEL: @spill_inactive_live_range
+func.func @spill_inactive_live_range(%cond: i1) {
+  // CHECK: arm_sme.get_tile {tile_id = 16 : i32}
+  %tileA = arm_sme.get_tile : vector<[16]x[16]xi8>
+  cf.cond_br %cond, ^bb2, ^bb1
+^bb1:
+  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+  %tileB = arm_sme.get_tile : vector<[16]x[16]xi8>
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> ()
+  cf.br ^bb3(%tileB: vector<[16]x[16]xi8>)
+^bb2:
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+  %tileC = arm_sme.get_tile : vector<[16]x[16]xi8>
+  "test.some_use"(%tileA) : (vector<[16]x[16]xi8>) -> ()
+  cf.br ^bb3(%tileC: vector<[16]x[16]xi8>)
+^bb3(%tile: vector<[16]x[16]xi8>):
+  "test.some_use"(%tile) : (vector<[16]x[16]xi8>) -> ()
+  return
+}
+
+// -----
+
+//  CHECK-LIVE-RANGE-LABEL: @reactivate_inactive_live_range
+//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+//        CHECK-LIVE-RANGE: ^bb0:
+//   CHECK-LIVE-RANGE-NEXT: S   arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT: E   cf.cond_br
+//   CHECK-LIVE-RANGE-NEXT: ^bb1:
+//   CHECK-LIVE-RANGE-NEXT:  S  arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT:  |  test.dummy
+//   CHECK-LIVE-RANGE-NEXT:  E  test.some_use
+//   CHECK-LIVE-RANGE-NEXT:     cf.br
+//   CHECK-LIVE-RANGE-NEXT: ^bb2:
+//   CHECK-LIVE-RANGE-NEXT: | S arm_sme.get_tile
+//   CHECK-LIVE-RANGE-NEXT: | | test.dummy
+//   CHECK-LIVE-RANGE-NEXT: | | test.dummy
+//   CHECK-LIVE-RANGE-NEXT: | E test.some_use
+//   CHECK-LIVE-RANGE-NEXT: E   test.some_use
+//   CHECK-LIVE-RANGE-NEXT:     cf.br
+
+// Here the live range for %tileA becomes inactive in bb1 (so %tileB gets tile
+// ID 0 too). Then in bb2 the live range for tileA is reactivated as it overlaps
+// with the start of %tileC's live range (which means %tileC gets tile ID 1).
+
+func.func @reactivate_inactive_live_range(%cond: i1) {
+  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
+  cf.cond_br %cond, ^bb2, ^bb1
+^bb1:
+  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+  %tileB = arm_sme.get_tile : vector<[16]x[16]xi8>
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> ()
+  cf.br ^bb3
+^bb2:
+  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
+  %tileC = arm_sme.get_tile : vector<[4]x[4]xf32>
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tileC) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
+  cf.br ^bb3
+^bb3:
+  return
+}
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
index 71d80bc16ea12..adc02adb6e974 100644
--- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -544,3 +544,131 @@ func.func @multi_tile_splat() -> vector<[8]x[8]xi32>
   %0 = arith.constant dense<42> : vector<[8]x[8]xi32>
   return %0 : vector<[8]x[8]xi32>
 }
+
+// -----
+
+// CHECK: #[[$TRANSPOSE_MAP_0:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+
+// CHECK-LABEL: @transpose_store_scalable_via_za(
+// CHECK-SAME:                                   %[[VEC:.*]]: vector<2x[4]xf32>
+// CHECK-SAME:                                   %[[DEST:.*]]: memref<?x?xf32>,
+// CHECK-SAME:                                   %[[I:.*]]: index,
+// CHECK-SAME:                                   %[[J:.*]]: index)
+func.func @transpose_store_scalable_via_za(%vec: vector<2x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+  // CHECK-NEXT: %[[INIT:.*]] = arm_sme.get_tile : vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[V0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+  // CHECK-NEXT: %[[R0:.*]] = vector.insert %[[V0]], %[[INIT]] [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[V1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+  // CHECK-NEXT: %[[RES:.*]] = vector.insert %[[V1]], %[[R0]] [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[VSCALE:.*]] = vector.vscale
+  // CHECK-NEXT: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+  // CHECK-NEXT: %[[MASK:.*]] = vector.create_mask %[[C4_VSCALE]], %[[C2]] : vector<[4]x[4]xi1>
+  // CHECK-NEXT: vector.transfer_write %[[RES]], %[[DEST]][%[[I]], %[[J]]], %[[MASK]] {in_bounds = [true, true], permutation_map = #[[$TRANSPOSE_MAP_0]]} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_masked(
+// CHECK-SAME:                                          %[[A:[a-z0-9]+]]: index,
+// CHECK-SAME:                                          %[[B:[a-z0-9]+]]: index)
+func.func @transpose_store_scalable_via_za_masked(%vec: vector<2x[4]xf32>, %dest: memref<?x?xf32>, %a: index, %b: index) {
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MIN:.*]] = index.mins %[[B]], %[[C2]]
+  // CHECK: %[[MASK:.*]] = vector.create_mask %[[A]], %[[MIN]] : vector<[4]x[4]xi1>
+  // CHECK: vector.transfer_write {{.*}} %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %c0 = arith.constant 0 : index
+  %mask = vector.create_mask %a, %b : vector<[4]x2xi1>
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  vector.transfer_write %tr, %dest[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_multi_tile(
+// CHECK-SAME:                                              %[[VEC:.*]]: vector<8x[4]xf32>
+// CHECK-SAME:                                              %[[DEST:.*]]: memref<?x?xf32>,
+// CHECK-SAME:                                              %[[I:.*]]: index,
+// CHECK-SAME:                                              %[[J:.*]]: index)
+func.func @transpose_store_scalable_via_za_multi_tile(%vec: vector<8x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // CHECK: %[[C4:.*]] = arith.constant 4 : index
+
+  // <skip 3x other extract+insert chain>
+  // CHECK: %[[V3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<8x[4]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[V3]], %{{.*}} [3] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[VSCALE:.*]] = vector.vscale
+  // CHECK: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+  // CHECK: %[[MASK:.*]] = vector.create_mask %c4_vscale, %c4 : vector<[4]x[4]xi1>
+  // CHECK: vector.transfer_write %[[TILE_0]], %[[DEST]][%[[I]], %[[J]]], %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+
+  // <skip 3x other extract+insert chain>
+  // CHECK: %[[V7:.*]] = vector.extract %arg0[7] : vector<[4]xf32> from vector<8x[4]xf32>
+  // CHECK: %[[TILE_1:.*]] = vector.insert %[[V7]], %{{.*}} [3] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[J_OFFSET:.*]] = arith.addi %[[J]], %[[C4]] : index
+  // CHECK: vector.transfer_write %[[TILE_1]], %[[DEST]][%[[I]], %[[J_OFFSET]]], %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %tr = vector.transpose %vec, [1, 0] : vector<8x[4]xf32> to vector<[4]x8xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x8xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_multi_tile_wide
+func.func @transpose_store_scalable_via_za_multi_tile_wide(%vec: vector<2x[8]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // <check extracts from lower 4 x vscale of %vec>
+  // CHECK: vector.scalable.extract
+  // CHECK: %[[ROW_2_LOWER:.*]] = vector.scalable.extract %{{.*}}[0] : vector<[4]xf32> from vector<[8]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[ROW_2_LOWER]], %{{.*}}[1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: vector.transfer_write %[[TILE_0]], %{{.*}}[%[[I:.[a-z0-9]+]], %[[J:[a-z0-9]+]]]
+
+  // <check extracts from upper 4 x vscale of %vec>
+  // CHECK: vector.scalable.extract
+  // CHECK: %[[ROW_2_UPPER:.*]] = vector.scalable.extract %{{.*}}[4] : vector<[4]xf32> from vector<[8]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[ROW_2_UPPER]], %{{.*}}[1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[I_OFFSET:.*]] = arith.addi %c4_vscale, %[[I]] : index
+  // CHECK: vector.transfer_write %[[TILE_0]], %{{.*}}[%[[I_OFFSET]], %[[J]]]
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[8]xf32> to vector<[8]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[8]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_transpose_store_scalable_via_za__bad_source_shape
+// CHECK-NOT: arm_sme.get_tile
+func.func @negative_transpose_store_scalable_via_za__bad_source_shape(%vec: vector<2x[7]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[7]xf32> to vector<[7]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[7]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose(
+// CHECK-SAME:                                %[[VEC:.*]]: vector<1x1x4x[4]xf32>)
+func.func @swap_shape_cast_of_transpose(%vector: vector<1x1x4x[4]xf32>) -> vector<[4]x4xf32> {
+  // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+  // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+  // CHECK: return %[[TRANSPOSE]]
+  %0 = vector.transpose %vector, [3, 0, 1, 2] : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+  %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+  return %1 : vector<[4]x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose_units_dims_before_and_after(
+// CHECK-SAME:                                %[[VEC:.*]]: vector<1x1x1x4x[4]x1xf32>)
+func.func @swap_shape_cast_of_transpose_units_dims_before_and_after(%vector: vector<1x1x1x4x[4]x1xf32>) -> vector<[4]x4xf32> {
+  // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x1x4x[4]x1xf32> to vector<4x[4]xf32>
+  // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+  // CHECK: return %[[TRANSPOSE]]
+  %0 = vector.transpose %vector, [4, 1, 0, 2, 3, 5] : vector<1x1x1x4x[4]x1xf32> to vector<[4]x1x1x1x4x1xf32>
+  %1 = vector.shape_cast %0 : vector<[4]x1x1x1x4x1xf32> to vector<[4]x4xf32>
+  return %1 : vector<[4]x4xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
index 8e14990502143..5e8104f83cc4d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
@@ -178,7 +178,7 @@ func.func @condBranchDynamicTypeNested(
 //  CHECK-NEXT: ^bb1
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb6([[ARG1]], %false{{[0-9_]*}} :
+//       CHECK: cf.br ^bb5([[ARG1]], %false{{[0-9_]*}} :
 //       CHECK: ^bb2([[IDX:%.*]]:{{.*}})
 //       CHECK: [[ALLOC1:%.*]] = memref.alloc([[IDX]])
 //  CHECK-NEXT: test.buffer_based
@@ -186,24 +186,20 @@ func.func @condBranchDynamicTypeNested(
 //  CHECK-NEXT: [[OWN:%.+]] = arith.select [[ARG0]], [[ARG0]], [[NOT_ARG0]]
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.cond_br{{.*}}, ^bb3, ^bb4
+//       CHECK: cf.cond_br{{.*}}, ^bb3, ^bb3
 //  CHECK-NEXT: ^bb3:
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]]
-//  CHECK-NEXT: ^bb4:
+//       CHECK: cf.br ^bb4([[ALLOC1]], [[OWN]]
+//  CHECK-NEXT: ^bb4([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}})
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]]
-//  CHECK-NEXT: ^bb5([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}})
-//   CHECK-NOT: bufferization.dealloc
-//   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb6([[ALLOC2]], [[COND1]]
-//  CHECK-NEXT: ^bb6([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}})
+//       CHECK: cf.br ^bb5([[ALLOC2]], [[COND1]]
+//  CHECK-NEXT: ^bb5([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}})
 //  CHECK-NEXT: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC4]]
 //  CHECK-NEXT: [[OWN:%.+]]:2 = bufferization.dealloc ([[BASE]] :{{.*}}) if ([[COND2]]) retain ([[ALLOC4]], [[ARG2]] :
-//       CHECK: cf.br ^bb7([[ALLOC4]], [[OWN]]#0
-//  CHECK-NEXT: ^bb7([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}})
+//       CHECK: cf.br ^bb6([[ALLOC4]], [[OWN]]#0
+//  CHECK-NEXT: ^bb6([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}})
 //       CHECK: test.copy
 //       CHECK: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC5]]
 //  CHECK-NEXT: bufferization.dealloc ([[BASE]] : {{.*}}) if ([[COND3]])
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index 04de7659bcf17..f9551e311df59 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -422,7 +422,7 @@ llvm.func @test_byval(%ptr : !llvm.ptr) {
 
 // -----
 
-llvm.func @with_byval_arg(%ptr : !llvm.ptr { llvm.byval = f64 }) attributes {memory = #llvm.memory_effects<other = readwrite, argMem = read, inaccessibleMem = readwrite>} {
+llvm.func @with_byval_arg(%ptr : !llvm.ptr { llvm.byval = f64 }) attributes {memory_effects = #llvm.memory_effects<other = readwrite, argMem = read, inaccessibleMem = readwrite>} {
   llvm.return
 }
 
@@ -436,7 +436,7 @@ llvm.func @test_byval_read_only(%ptr : !llvm.ptr) {
 
 // -----
 
-llvm.func @with_byval_arg(%ptr : !llvm.ptr { llvm.byval = f64 }) attributes {memory = #llvm.memory_effects<other = readwrite, argMem = write, inaccessibleMem = readwrite>} {
+llvm.func @with_byval_arg(%ptr : !llvm.ptr { llvm.byval = f64 }) attributes {memory_effects = #llvm.memory_effects<other = readwrite, argMem = write, inaccessibleMem = readwrite>} {
   llvm.return
 }
 
@@ -451,7 +451,7 @@ llvm.func @test_byval_write_only(%ptr : !llvm.ptr) {
 
 // -----
 
-llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
+llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
   llvm.return
 }
 
@@ -472,7 +472,7 @@ llvm.func @test_byval_input_aligned(%unaligned : !llvm.ptr, %aligned : !llvm.ptr
 
 llvm.func @func_that_uses_ptr(%ptr : !llvm.ptr)
 
-llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
+llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
   llvm.call @func_that_uses_ptr(%ptr) : (!llvm.ptr) -> ()
   llvm.return
 }
@@ -496,7 +496,7 @@ module attributes {
 
 llvm.func @func_that_uses_ptr(%ptr : !llvm.ptr)
 
-llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
+llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
   llvm.call @func_that_uses_ptr(%ptr) : (!llvm.ptr) -> ()
   llvm.return
 }
@@ -524,7 +524,7 @@ module attributes {
 
 llvm.func @func_that_uses_ptr(%ptr : !llvm.ptr)
 
-llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
+llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
   llvm.call @func_that_uses_ptr(%ptr) : (!llvm.ptr) -> ()
   llvm.return
 }
@@ -550,7 +550,7 @@ llvm.func @test_alignment_exceeded_anyway() {
 llvm.mlir.global private @unaligned_global(42 : i64) : i64
 llvm.mlir.global private @aligned_global(42 : i64) { alignment = 64 } : i64
 
-llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
+llvm.func @aligned_byval_arg(%ptr : !llvm.ptr { llvm.byval = i16, llvm.align = 16 }) attributes {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = read>} {
   llvm.return
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index 39f8e70b9fb7b..fe288dab973f5 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1415,10 +1415,163 @@ func.func @invalid_zext_target_type_two(%arg: vector<1xi32>)  {
 
 // -----
 
+llvm.func @non_variadic(%arg: i32)
+
+llvm.func @invalid_var_callee_type(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to be a variadic function type}}
+  llvm.call @non_variadic(%arg) vararg(!llvm.func<void (i32)>) : (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_num_parameters(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
+  llvm.call @variadic(%arg) vararg(!llvm.func<void (i32, i64, ...)>) : (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @invalid_var_callee_type_num_parameters_indirect(%callee : !llvm.ptr, %arg: i32)  {
+  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
+  llvm.call %callee(%arg) vararg(!llvm.func<void (i32, i64, ...)>) : !llvm.ptr, (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_parameter_type_mismatch(%arg: i32)  {
+  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
+  llvm.call @variadic(%arg) vararg(!llvm.func<void (i64, ...)>) : (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @invalid_var_callee_type_parameter_type_mismatch_indirect(%callee : !llvm.ptr, %arg: i32)  {
+  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
+  llvm.call %callee(%arg) vararg(!llvm.func<void (i64, ...)>) : !llvm.ptr, (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_non_void(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to return void}}
+  llvm.call @variadic(%arg) vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> ()
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...) -> i32
+
+llvm.func @invalid_var_callee_type_return_type_mismatch(%arg: i32)  {
+  // expected-error@below {{var_callee_type return type mismatch: 'i8' != 'i32'}}
+  %0 = llvm.call @variadic(%arg) vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> (i32)
+  llvm.return
+}
+
+// -----
+
+llvm.func @non_variadic(%arg: i32)
+
+llvm.func @invalid_var_callee_type(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to be a variadic function type}}
+  llvm.invoke @non_variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32)>) : (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_num_parameters(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
+  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32, i64, ...)>) : (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @invalid_var_callee_type_num_parameters_indirect(%callee : !llvm.ptr, %arg: i32)  {
+  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
+  llvm.invoke %callee(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32, i64, ...)>) : !llvm.ptr, (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_parameter_type_mismatch(%arg: i32)  {
+  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
+  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i64, ...)>) : (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @invalid_var_callee_type_parameter_type_mismatch_indirect(%callee : !llvm.ptr, %arg: i32)  {
+  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
+  llvm.invoke %callee(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i64, ...)>) : !llvm.ptr, (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...)
+
+llvm.func @invalid_var_callee_type_non_void(%arg: i32)  {
+  // expected-error@below {{expected var_callee_type to return void}}
+  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> ()
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
+llvm.func @variadic(%arg: i32, ...) -> i32
+
+llvm.func @invalid_var_callee_type_return_type_mismatch(%arg: i32)  {
+  // expected-error@below {{var_callee_type return type mismatch: 'i8' != 'i32'}}
+  %0 = llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> (i32)
+^bb1:
+  llvm.return
+^bb2:
+  llvm.return
+}
+
+// -----
+
 llvm.func @variadic(...)
 
 llvm.func @invalid_variadic_call(%arg: i32)  {
-  // expected-error@+1 {{missing callee type attribute for vararg call}}
+  // expected-error@+1 {{missing var_callee_type attribute for vararg call}}
   "llvm.call"(%arg) <{callee = @variadic}> : (i32) -> ()
   llvm.return
 }
@@ -1428,7 +1581,7 @@ llvm.func @invalid_variadic_call(%arg: i32)  {
 llvm.func @variadic(...)
 
 llvm.func @invalid_variadic_call(%arg: i32)  {
-  // expected-error@+1 {{missing callee type attribute for vararg call}}
+  // expected-error@+1 {{missing var_callee_type attribute for vararg call}}
   "llvm.call"(%arg) <{callee = @variadic}> : (i32) -> ()
   llvm.return
 }
@@ -1445,14 +1598,14 @@ llvm.func @foo(%arg: !llvm.ptr) {
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{to use im2col mode, the tensor has to be at least 3-dimensional}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
   return
 }
 // -----
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{im2col offsets must be 2 less than number of coordinates}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
   return
 }
 
@@ -1460,7 +1613,7 @@ func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{expects coordinates between 1 to 5 dimension}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[]: !llvm.ptr<3>, !llvm.ptr  
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[]: !llvm.ptr<3>, !llvm.ptr
   return
 }
 
@@ -1469,7 +1622,7 @@ func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{expects coordinates between 1 to 5 dimension}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd0,%crd1,%crd2,%crd3]: !llvm.ptr<3>, !llvm.ptr  
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd0,%crd1,%crd2,%crd3]: !llvm.ptr<3>, !llvm.ptr
   return
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index ca9748a2b8b7b..ff16bb0f857dd 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -1,5 +1,10 @@
 // RUN: mlir-opt %s | mlir-opt | FileCheck %s
 
+
+// CHECK-LABEL: func @baz
+// something to call
+llvm.func @baz()
+
 // CHECK-LABEL: func @ops
 // CHECK-SAME: (%[[I32:.*]]: i32, %[[FLOAT:.*]]: f32, %[[PTR1:.*]]: !llvm.ptr, %[[PTR2:.*]]: !llvm.ptr, %[[BOOL:.*]]: i1, %[[VPTR1:.*]]: !llvm.vec<2 x ptr>)
 func.func @ops(%arg0: i32, %arg1: f32,
@@ -93,6 +98,19 @@ func.func @ops(%arg0: i32, %arg1: f32,
   llvm.call %variadic_func(%arg0, %arg0) vararg(!llvm.func<void (i32, ...)>) : !llvm.ptr, (i32, i32) -> ()
   llvm.call %variadic_func(%arg0, %arg0) vararg(!llvm.func<void (i32, ...)>) {fastmathFlags = #llvm.fastmath<fast>} : !llvm.ptr, (i32, i32) -> ()
 
+// Function call attributes
+// CHECK: llvm.call @baz() {convergent} : () -> ()
+  llvm.call @baz() {convergent} : () -> ()
+
+// CHECK: llvm.call @baz() {no_unwind} : () -> ()
+  llvm.call @baz() {no_unwind} : () -> ()
+
+// CHECK: llvm.call @baz() {will_return} : () -> ()
+  llvm.call @baz() {will_return} : () -> ()
+
+// CHECK: llvm.call @baz() {memory = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = write>} : () -> ()
+  llvm.call @baz() {memory = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = write>} : () -> ()
+
 // Terminator operations and their successors.
 //
 // CHECK: llvm.br ^[[BB1:.*]]
diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir
index 2dd292408fa60..42d370a5477c2 100644
--- a/mlir/test/Dialect/LLVMIR/types.mlir
+++ b/mlir/test/Dialect/LLVMIR/types.mlir
@@ -6,8 +6,6 @@ func.func @primitive() {
   "some.op"() : () -> !llvm.void
   // CHECK: !llvm.ppc_fp128
   "some.op"() : () -> !llvm.ppc_fp128
-  // CHECK: !llvm.x86_mmx
-  "some.op"() : () -> !llvm.x86_mmx
   // CHECK: !llvm.token
   "some.op"() : () -> !llvm.token
   // CHECK: !llvm.label
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 928030a81dc02..b1212b8863a5d 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1017,7 +1017,7 @@ func.func @broadcast_same_shape(%input: tensor<2x3xf32>, %init: tensor<2x3xf32>)
   return %0 : tensor<2x3xf32>
 }
 
-// ----
+// -----
 
 func.func @transpose_1d(%input: tensor<16xf32>,
                         %init: tensor<16xf32>) -> tensor<16xf32> {
@@ -1096,3 +1096,103 @@ func.func @transpose_transpose_fold(%input: tensor<5x4x3xf32>,
   func.return %transpose2 : tensor<3x4x5xf32>
 }
 
+// -----
+
+func.func @broadcast_transpose_fold(%input: tensor<2x4x5xf32>,
+                                    %init1: tensor<1x2x3x4x5x6xf32>,
+                                    %init2: tensor<1x6x2x3x5x4xf32>) -> tensor<1x6x2x3x5x4xf32> {
+  // CHECK-LABEL: @broadcast_transpose_fold
+  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<2x4x5xf32>
+  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<1x2x3x4x5x6xf32>
+  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<1x6x2x3x5x4xf32>
+  //       CHECK:   %[[TMP_INIT:.+]] = tensor.empty() : tensor<2x5x4xf32>
+  //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[INPUT]] : tensor<2x4x5xf32>) outs(%[[TMP_INIT]] : tensor<2x5x4xf32>) permutation = [0, 2, 1]
+  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[TRANSPOSE]] : tensor<2x5x4xf32>) outs(%[[INIT2]] : tensor<1x6x2x3x5x4xf32>) dimensions = [0, 3, 1]
+  //       CHECK:   return %[[BROADCAST]] : tensor<1x6x2x3x5x4xf32>
+  %broadcast = linalg.broadcast
+      ins(%input : tensor<2x4x5xf32>)
+      outs(%init1 : tensor<1x2x3x4x5x6xf32>)
+      dimensions = [0, 2, 5]
+  %transpose = linalg.transpose
+      ins(%broadcast : tensor<1x2x3x4x5x6xf32>)
+      outs(%init2 : tensor<1x6x2x3x5x4xf32>)
+      permutation = [0, 5, 1, 2, 4, 3]
+  func.return %transpose : tensor<1x6x2x3x5x4xf32>
+}
+
+// -----
+
+func.func @broadcast_transpose_fold_dynamic(%input: tensor<?x?x5xf32>,
+                                            %init1: tensor<1x?x3x?x5x6xf32>,
+                                            %init2: tensor<1x3x?x6x5x?xf32>) -> tensor<1x3x?x6x5x?xf32> {
+  // CHECK-LABEL: @broadcast_transpose_fold_dynamic
+  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<?x?x5xf32>
+  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<1x?x3x?x5x6xf32>
+  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<1x3x?x6x5x?xf32>
+  //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+  //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+  //       CHECK:   %[[DIM0:.+]] = tensor.dim %[[INPUT]], %[[C0]] : tensor<?x?x5xf32>
+  //       CHECK:   %[[DIM1:.+]] = tensor.dim %[[INPUT]], %[[C1]] : tensor<?x?x5xf32>
+  //       CHECK:   %[[TMP_INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]]) : tensor<?x5x?xf32>
+  //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[INPUT]] : tensor<?x?x5xf32>) outs(%[[TMP_INIT]] : tensor<?x5x?xf32>) permutation = [1, 2, 0]
+  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[TRANSPOSE]] : tensor<?x5x?xf32>) outs(%[[INIT2]] : tensor<1x3x?x6x5x?xf32>) dimensions = [0, 1, 3]
+  //       CHECK:   return %[[BROADCAST]] : tensor<1x3x?x6x5x?xf32>
+  %broadcast = linalg.broadcast
+      ins(%input : tensor<?x?x5xf32>)
+      outs(%init1 : tensor<1x?x3x?x5x6xf32>)
+      dimensions = [0, 2, 5]
+  %transpose = linalg.transpose
+      ins(%broadcast : tensor<1x?x3x?x5x6xf32>)
+      outs(%init2 : tensor<1x3x?x6x5x?xf32>)
+      permutation = [0, 2, 3, 5, 4, 1]
+  func.return %transpose : tensor<1x3x?x6x5x?xf32>
+}
+
+// -----
+
+func.func @broadcast_transpose_fold_2dim(%input: tensor<2xf32>,
+                                         %init1: tensor<2x4xf32>,
+                                         %init2: tensor<4x2xf32>) -> tensor<4x2xf32> {
+  // CHECK-LABEL: @broadcast_transpose_fold_2dim
+  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<2xf32>
+  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<2x4xf32>
+  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<4x2xf32>
+  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[INPUT]] : tensor<2xf32>) outs(%[[INIT2]] : tensor<4x2xf32>) dimensions = [0]
+  //       CHECK:   return %[[BROADCAST]] : tensor<4x2xf32>
+  %broadcast = linalg.broadcast
+      ins(%input : tensor<2xf32>)
+      outs(%init1 : tensor<2x4xf32>)
+      dimensions = [1]
+  %transpose = linalg.transpose
+      ins(%broadcast : tensor<2x4xf32>)
+      outs(%init2 : tensor<4x2xf32>)
+      permutation = [1, 0]
+  func.return %transpose : tensor<4x2xf32>
+}
+
+// -----
+
+func.func @concats_of_fill(
+    %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index)
+    -> tensor<5x?x?xf32>
+{
+  %cst0 = arith.constant 0.0 : f32
+  %cst1 = arith.constant 0.0 : f32
+  %0 = tensor.empty(%arg0, %arg1) : tensor<5x?x?xf32>
+  %1 = linalg.fill ins(%cst0 : f32) outs(%0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+  %2 = tensor.empty(%arg2, %arg3) : tensor<5x?x?xf32>
+  %3 = linalg.fill ins(%cst1 : f32) outs(%2 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+  %4 = tensor.concat dim(1) %1, %3 : (tensor<5x?x?xf32>, tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+  return %4 : tensor<5x?x?xf32>
+}
+//       CHECK: func @concats_of_fill(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index)
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.0
+//   CHECK-DAG:   %[[EMPTY0:.+]] = tensor.empty(%[[ARG0]], %[[ARG1]])
+//   CHECK-DAG:   %[[EMPTY1:.+]] = tensor.empty(%[[ARG2]], %[[ARG3]])
+//       CHECK:   %[[CONCAT:.+]] = tensor.concat dim(1) %[[EMPTY0]], %[[EMPTY1]]
+//       CHECK:   %[[FILL:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[CONCAT]] :
+//       CHECK:   return %[[FILL]]
diff --git a/mlir/test/Dialect/Linalg/continuous-tiling-full.mlir b/mlir/test/Dialect/Linalg/continuous-tiling-full.mlir
index b61c727f9cffc..7410ff593d01a 100644
--- a/mlir/test/Dialect/Linalg/continuous-tiling-full.mlir
+++ b/mlir/test/Dialect/Linalg/continuous-tiling-full.mlir
@@ -127,7 +127,7 @@ module attributes {transform.with_named_sequence} {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %tile_sizes, %chunk_sizes = transform.structured.continuous_tile_sizes %0 { dimension = 0, target_size = 9 } : (!transform.any_op) -> !transform.any_op
     %linalg_splits, %empty = transform.structured.split %0 after %chunk_sizes { dimension = 0, multiway } : !transform.any_op, !transform.any_op
-    transform.foreach %linalg_splits, %tile_sizes {zip_shortest} : !transform.any_op, !transform.any_op {
+    transform.foreach %linalg_splits, %tile_sizes with_zip_shortest : !transform.any_op, !transform.any_op {
     ^bb1(%linalg_split: !transform.any_op, %tile_size: !transform.any_op):
       %tiled_linalg_split, %dim0_loop = transform.structured.tile_using_for %linalg_split tile_sizes [%tile_size] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
diff --git a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
index 50a2d6bf532aa..d1a89226fdb58 100644
--- a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
@@ -15,7 +15,7 @@ func.func @main(%arg0: tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @main
 // CHECK-SAME:       (%[[ARG0:.+]]: tensor<f32>) -> tensor<f32>
 // CHECK:   %[[EXTRACTED:.+]] = tensor.extract %[[ARG0]][] : tensor<f32>
-// CHECK: cf.br ^{{.*}}
-// CHECK: ^{{.*}}:
-// CHECK:   %[[ELEMENTS:.+]] = tensor.from_elements %[[EXTRACTED]] : tensor<f32>
+// CHECK: cf.br ^{{.*}}(%[[EXTRACTED]] : f32)
+// CHECK: ^{{.*}}(%[[ARG1:.+]]: f32):
+// CHECK:   %[[ELEMENTS:.+]] = tensor.from_elements %[[ARG1]] : tensor<f32>
 // CHECK:   return %[[ELEMENTS]] : tensor<f32>
diff --git a/mlir/test/Dialect/Linalg/detensorize_if.mlir b/mlir/test/Dialect/Linalg/detensorize_if.mlir
index c728ad21d2209..8d17763c04b6c 100644
--- a/mlir/test/Dialect/Linalg/detensorize_if.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_if.mlir
@@ -42,15 +42,18 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<0>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br
-// CHECK-NEXT:   ^[[bb1:.*]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3
-// CHECK-NEXT:   ^[[bb2]]
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
 
 // -----
@@ -103,17 +106,20 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<0>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br ^[[bb1:.*]]
-// CHECK-NEXT:   ^[[bb1:.*]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3
-// CHECK-NEXT:   ^[[bb2]]:
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]:
-// CHECK-NEXT:     cf.br ^[[bb4:.*]]
-// CHECK-NEXT:   ^[[bb4]]:
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     cf.br ^[[bb4:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb4]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
 
 // -----
@@ -165,13 +171,16 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<10>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br ^[[bb1:.*]]
-// CHECK-NEXT:   ^[[bb1]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb2
-// CHECK-NEXT:   ^[[bb2]]
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb2(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
diff --git a/mlir/test/Dialect/Linalg/detensorize_while.mlir b/mlir/test/Dialect/Linalg/detensorize_while.mlir
index 580a97d3a851b..aa30900f76a33 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while.mlir
@@ -46,11 +46,11 @@ func.func @main(%farg0: tensor<i32>, %farg1: tensor<i32>) -> tensor<i32> attribu
 // DET-ALL:         cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // DET-ALL:       ^[[bb1]](%{{.*}}: i32)
 // DET-ALL:         arith.cmpi slt, {{.*}}
-// DET-ALL:         cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-ALL:       ^[[bb2]]
+// DET-ALL:         cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-ALL:       ^[[bb2]](%{{.*}}: i32)
 // DET-ALL:         arith.addi {{.*}}
 // DET-ALL:         cf.br ^[[bb1]](%{{.*}} : i32)
-// DET-ALL:       ^[[bb3]]:
+// DET-ALL:       ^[[bb3]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements {{.*}}
 // DET-ALL:         return %{{.*}} : tensor<i32>
 
@@ -62,10 +62,10 @@ func.func @main(%farg0: tensor<i32>, %farg1: tensor<i32>) -> tensor<i32> attribu
 // DET-CF:         cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // DET-CF:       ^[[bb1]](%{{.*}}: i32)
 // DET-CF:         arith.cmpi slt, {{.*}}
-// DET-CF:         cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-CF:       ^[[bb2]]:
+// DET-CF:         cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-CF:       ^[[bb2]](%{{.*}}: i32)
 // DET-CF:         arith.addi {{.*}}
 // DET-CF:         cf.br ^[[bb1]](%{{.*}} : i32)
-// DET-CF:       ^[[bb3]]:
+// DET-CF:       ^[[bb3]](%{{.*}}: i32)
 // DET-CF:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-CF:         return %{{.*}} : tensor<i32>
diff --git a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
index 414d9b94cbf53..955c7be5ef4c8 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
@@ -74,8 +74,8 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-ALL:         } -> tensor<i32>
 // DET-ALL:         tensor.extract %{{.*}}[] : tensor<i32>
 // DET-ALL:         cmpi slt, %{{.*}}, %{{.*}} : i32
-// DET-ALL:         cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-ALL:       ^[[bb2]]:
+// DET-ALL:         cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-ALL:       ^[[bb2]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-ALL:         tensor.empty() : tensor<10xi32>
 // DET-ALL:         linalg.generic {{{.*}}} ins(%{{.*}} : tensor<i32>) outs(%{{.*}} : tensor<10xi32>) {
@@ -83,7 +83,7 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-ALL:           linalg.yield %{{.*}} : i32
 // DET-ALL:         } -> tensor<10xi32>
 // DET-ALL:         cf.br ^[[bb1]](%{{.*}} : tensor<10xi32>)
-// DET-ALL:       ^[[bb3]]
+// DET-ALL:       ^[[bb3]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-ALL:         return %{{.*}} : tensor<i32>
 // DET-ALL:       }
@@ -95,10 +95,10 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-CF:         %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor<10xi32>) outs(%{{.*}} : tensor<i32>) {
 // DET-CF:         tensor.extract %{{.*}}[] : tensor<i32>
 // DET-CF:         cmpi slt, %{{.*}}, %{{.*}} : i32
-// DET-CF:         cf.cond_br %{{.*}}, ^bb2, ^bb3
-// DET-CF:       ^bb2:
+// DET-CF:         cf.cond_br %{{.*}}, ^bb2(%{{.*}} : tensor<i32>), ^bb3(%{{.*}} : tensor<i32>)
+// DET-CF:       ^bb2(%{{.*}}: tensor<i32>)
 // DET-CF:         %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor<i32>) outs(%{{.*}} : tensor<10xi32>) {
 // DET-CF:         cf.br ^bb1(%{{.*}} : tensor<10xi32>)
-// DET-CF:       ^bb3:
+// DET-CF:       ^bb3(%{{.*}}: tensor<i32>)
 // DET-CF:         return %{{.*}} : tensor<i32>
 // DET-CF:       }
diff --git a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
index 913e78272db79..6d8d5fe71fca5 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
@@ -49,8 +49,8 @@ func.func @main() -> () attributes {} {
 // CHECK-NEXT:    cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // CHECK-NEXT:  ^[[bb1]](%{{.*}}: i32)
 // CHECK-NEXT:    %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}}
-// CHECK-NEXT:    cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// CHECK-NEXT:  ^[[bb2]]
+// CHECK-NEXT:    cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]]
+// CHECK-NEXT:  ^[[bb2]](%{{.*}}: i32)
 // CHECK-NEXT:    %{{.*}} = arith.addi %{{.*}}, %{{.*}}
 // CHECK-NEXT:    cf.br ^[[bb1]](%{{.*}} : i32)
 // CHECK-NEXT:  ^[[bb3]]:
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index 89183813c080b..8f13c69070457 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -119,7 +119,7 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-//  CHECK-DAG:  #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+//  CHECK-DAG:  #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
 
 //      CHECK:  fold_extract_slice
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x128xf32>
diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
index 8545dfd25eccf..778d5bb8b9c84 100644
--- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
@@ -177,7 +177,6 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-
 // -----
 
 // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
@@ -194,13 +193,13 @@ module attributes {transform.with_named_sequence} {
 func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
   //      CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 :
   //      CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
-  //      CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
-  //      CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+  //      CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]]
+  //      CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
   //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
-  //      CHECK:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
-  //      CHECK:   %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
-  //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
-  //      CHECK:   %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
+  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
+  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
+  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
   //      CHECK:   tensor.extract_slice %[[A]]
   //      CHECK:   tensor.extract_slice %[[B]]
   //      CHECK:   tensor.extract_slice %[[C_BLK]]
@@ -220,7 +219,6 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
 // -----
 
 // Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.
@@ -235,11 +233,11 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor
 func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
   //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
-  //      CHECK:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
+  //  CHECK-DAG:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
+  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
+  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
   //  CHECK-NOT:   affine.max
   //  CHECK-NOT:   affine.min
-  //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
-  //      CHECK:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
   //      CHECK:   %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
   //      CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
   //      CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
@@ -342,7 +340,6 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)>
-// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)>
 // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)>
 // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)>
 
@@ -355,8 +352,7 @@ module attributes {transform.with_named_sequence} {
                                          %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
                                          -> (tensor<100xf32>, tensor<100xf32>) {
 //      CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
-//      CHECK:   %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]])
-//      CHECK:   %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
+//      CHECK:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV0]])
 //  CHECK-NOT:   affine.min
 //  CHECK-NOT:   affine.max
 //      CHECK:   %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]])
@@ -467,16 +463,16 @@ module attributes {transform.with_named_sequence} {
 func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
   //      CHECK: %[[c1:.*]] = arith.constant 1 : index
   //      CHECK: %[[c0:.*]] = arith.constant 0 : index
-  //      CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
-  //      CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
-  //      CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
-  //      CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
-  //      CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
+  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
+  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
+  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
+  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+  //  CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
   //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
-  //      CHECK:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
-  //      CHECK:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
-  //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
-  //      CHECK:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
+  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
+  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
+  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
   //      CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
   //      CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
   //      CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
@@ -535,16 +531,16 @@ module attributes {transform.with_named_sequence} {
 func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
   //      CHECK: %[[c1:.*]] = arith.constant 1 : index
   //      CHECK: %[[c0:.*]] = arith.constant 0 : index
-  //      CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
-  //      CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
-  //      CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
-  //      CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
-  //      CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
+  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
+  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
+  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
+  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+  //  CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
   //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
-  //      CHECK:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
-  //      CHECK:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
-  //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
-  //      CHECK:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
+  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
+  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
+  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
   //      CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
   //      CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
   //      CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
index 955ea6b0ebbbd..7bac850d0b7fe 100644
--- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics --cse %s | FileCheck %s
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -178,14 +178,13 @@ module {
 
 // CHECK-LABEL:   func.func @scalable_tile(
 // CHECK-SAME:      %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>,
-// CHECK:           %[[C4:.*]] = arith.constant 0 : index
-// CHECK:           %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C4]] : tensor<?xf32>
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C0]] : tensor<?xf32>
 // CHECK:           %[[VEC_SIZE:.*]] = arith.constant 4 : index
 // CHECK:           %[[VS:.*]] = vector.vscale
 // CHECK:           %[[STEP:.*]] = arith.muli %[[VEC_SIZE]], %[[VS]] : index
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
 // CHECK:           scf.for %[[IV:.*]] = %[[C0]] to %[[DIM]] step %[[STEP]] iter_args(%[[VAL:.*]] = %[[ARG_2]]) -> (tensor<?xf32>) {
-// CHECK:             %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%[[IV]])[%[[STEP]], %[[DIM]]]
+// CHECK:             %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>(%[[IV]])[%[[DIM]], %[[STEP]]]
 // CHECK:             %[[SLICE_ARG0:.*]] = tensor.extract_slice %[[ARG_0]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
 // CHECK:             %[[SLICE_ARG1:.*]] = tensor.extract_slice %[[ARG_1]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
 // CHECK:             %[[SLICE_ARG2:.*]] = tensor.extract_slice %[[VAL]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
@@ -202,20 +201,14 @@ module {
 // -----
 
 // CHECK-LABEL:   func.func @scalable_and_fixed_length_tile
-// CHECK:           %[[C4:.*]] = arith.constant 4 : index
-// CHECK:           %[[VS:.*]] = vector.vscale
-// CHECK:           %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C128:.*]] = arith.constant 128 : index
-// CHECK:           %[[STEP_0:.*]] = arith.constant 4 : index
-// CHECK:           scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[STEP_0]]
-// CHECK:             %[[C0_1:.*]] = arith.constant 0 : index
-// CHECK:             %[[C128_1:.*]] = arith.constant 128 : index
-// CHECK:             %[[STEP_1:.*]] = arith.constant 4 : index
-// CHECK:             scf.for %[[VAL_16:.*]] = %[[C0_1]] to %[[C128_1]] step %[[STEP_1]]
-// CHECK:               %[[C0_2:.*]] = arith.constant 0 : index
-// CHECK:               %[[C128_2:.*]] = arith.constant 128 : index
-// CHECK:               scf.for %{{.*}} = %[[C0_2]] to %[[C128_2]] step %[[STEP_2]]
+//   CHECK-DAG:     %[[C4:.*]] = arith.constant 4 : index
+//   CHECK-DAG:     %[[VS:.*]] = vector.vscale
+//   CHECK-DAG:     %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index
+//   CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:     %[[C128:.*]] = arith.constant 128 : index
+//       CHECK:     scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[C4]]
+//       CHECK:       scf.for %[[VAL_16:.*]] = %[[C0]] to %[[C128]] step %[[C4]]
+//       CHECK:         scf.for %{{.*}} = %[[C0]] to %[[C128]] step %[[STEP_2]]
 
 func.func @scalable_and_fixed_length_tile(
   %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index 4423ee6ea6a51..4ee3088cc3778 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,168 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
+                                                   %arg1: tensor<f32>) -> tensor<f32> {
+
+  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
+  (%in: f32, %init: f32) {
+    %0 = arith.addf %in, %init : f32
+    linalg.yield %0 : f32
+  }
+  return %0 : tensor<f32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_reduction_scalable_1d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK:          %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:          %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
+// CHECK:          %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:          %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
+// CHECK:          %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK:          %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
+// CHECK:          %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector<f32>
+// CHECK:          %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
+// CHECK:          %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
+func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
+                                                   %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%arg1 : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_reduction_scalable_2d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK:    %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
+// CHECK:    %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
+// CHECK:    %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
+// CHECK:    %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
+                                                           %arg1: tensor<?xf32>,
+                                                           %arg2: tensor<?xf32>) {
+  linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
+                 outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
+  return
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK:    %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
+// CHECK:    %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
+// CHECK:    %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
+// CHECK:    %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
+// CHECK:    %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32>
+// CHECK:    %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
+                                                                 %arg1: tensor<?xf32>,
+                                                                 %arg2: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
+    outs(%arg2 : tensor<?xf32>) {
+    ^bb(%mat: f32, %vec: f32, %res: f32) :
+      %0 = arith.mulf %mat, %vec : f32
+      %1 = arith.addf %res, %0 : f32
+      linalg.yield %1 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK:    %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
+// CHECK:    %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
+// CHECK:    %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
+// CHECK:    %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
+// CHECK:    %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK:    %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32>
+// CHECK:    %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
+// CHECK:    %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK:    %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index 5d3c07c8e23c1..e9f8e08ca0c6b 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -110,7 +110,7 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-  // -----
+// -----
 
 func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor<?xf32>, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> {
   %pad = arith.constant 0.000000e+00 : f32
@@ -126,3 +126,130 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
+                                              %acc: tensor<?xf32>) -> tensor<?xf32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
+  (%in: f32, %init: f32) {
+    %0 = arith.addf %in, %init : f32
+    linalg.yield %0 : f32
+  }
+  return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_generic_reduction_scalable_leading_dim(%input: tensor<?x?xf32>,
+                                                         %acc: tensor<?xf32>) -> tensor<?xf32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d1)>],
+                        iterator_types = ["reduction", "parallel"] }
+    ins(%input : tensor<?x?xf32>)
+    outs(%acc : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_matvec_scalable_two_dims(%A: memref<?x?xf32>, %B: memref<?xf32>, %C: memref<?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.matvec ins(%A, %B: memref<?x?xf32>, memref<?xf32>)
+                outs(%C: memref<?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul vector_sizes [[4], [4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%C: memref<?x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul vector_sizes [[8], 16, 4] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_matmul_scalable_trailing_reduction_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%C: memref<?x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul vector_sizes [8, 16, [4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_generic_matmul_scalable_two_trailing_dims(%A: tensor<?x64xf32>, %B: tensor<64x?xf32>,
+                                                            %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
+                                         affine_map<(d0, d1, d2) -> (d2, d1)>,
+                                         affine_map<(d0, d1, d2) -> (d0, d1)>],
+                        iterator_types = ["parallel", "parallel", "reduction"] }
+    ins(%A, %B : tensor<?x64xf32>, tensor<64x?xf32>)
+    outs(%C: tensor<?x?xf32>) {
+    ^bb(%in1: f32, %in2: f32, %out: f32) :
+      %0 = arith.mulf %in1, %in2 : f32
+      %1 = arith.addf %0, %out : f32
+      linalg.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [2, [4], [4]] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index d7ff1ded9d933..3404b73102e6a 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -1240,8 +1240,8 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL:   func @red_max_2d(
-func.func @red_max_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL:   func @red_maximumf_2d(
+func.func @red_maximumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
   // CHECK: %[[CMINF:.+]] = arith.constant dense<-3.402820e+38> : vector<4xf32>
   // CHECK: tensor.empty() : tensor<4xf32>
   // CHECK: vector.multi_reduction <maximumf>, {{.*}}, %[[CMINF]] [1] : vector<4x4xf32> to vector<4xf32>
@@ -1272,8 +1272,40 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL:   func @red_min_2d(
-func.func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL:   func @red_maxnumf_2d(
+func.func @red_maxnumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+  // CHECK: %[[CMINF:.+]] = arith.constant dense<-3.402820e+38> : vector<4xf32>
+  // CHECK: tensor.empty() : tensor<4xf32>
+  // CHECK: vector.multi_reduction <maxnumf>, {{.*}}, %[[CMINF]] [1] : vector<4x4xf32> to vector<4xf32>
+  // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
+  %ident = arith.constant -3.40282e+38 : f32
+  %init = tensor.empty() : tensor<4xf32>
+  %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+  %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                          affine_map<(d0, d1) -> (d0)>],
+                         iterator_types = ["parallel", "reduction"]}
+                         ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
+  ^bb0(%in0: f32, %out0: f32):
+    %max = arith.maxnumf %in0, %out0 : f32
+    linalg.yield %max : f32
+  } -> tensor<4xf32>
+  return %red : tensor<4xf32>
+}
+
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %5 = transform.structured.vectorize_children_and_apply_patterns %4 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL:   func @red_minimumf_2d(
+func.func @red_minimumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
   // CHECK: %[[CMAXF:.+]] = arith.constant dense<3.402820e+38> : vector<4xf32>
   // CHECK: tensor.empty() : tensor<4xf32>
   // CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
@@ -1294,6 +1326,39 @@ func.func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
 }
 
 
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %5 = transform.structured.vectorize_children_and_apply_patterns %4 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL:   func @red_minnumf_2d(
+func.func @red_minnumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+  // CHECK: %[[CMAXF:.+]] = arith.constant dense<3.402820e+38> : vector<4xf32>
+  // CHECK: tensor.empty() : tensor<4xf32>
+  // CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
+  // CHECK: vector.multi_reduction <minnumf>, {{.*}}, %[[CMAXF]] [1] : vector<4x4xf32> to vector<4xf32>
+  // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
+  %maxf32 = arith.constant 3.40282e+38 : f32
+  %init = tensor.empty() : tensor<4xf32>
+  %fill = linalg.fill ins(%maxf32 : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+  %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                          affine_map<(d0, d1) -> (d0)>],
+                         iterator_types = ["parallel", "reduction"]}
+                         ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
+  ^bb0(%in0: f32, %out0: f32):
+    %min = arith.minnumf %out0, %in0 : f32
+    linalg.yield %min : f32
+  } -> tensor<4xf32>
+  return %red : tensor<4xf32>
+}
+
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
index f042753780013..964565620fd01 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
@@ -1,29 +1,52 @@
 // RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
 
-func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x3xf32>) -> tensor<1x3xf32> {
+func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+    %src: tensor<80x16xf32>,
+    %output : tensor<1x3xf32>,
+    %idx: index) -> tensor<1x3xf32> {
+
   %c79 = arith.constant 79 : index
   %1 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
     iterator_types = ["parallel", "parallel"]
-  } outs(%extracted_slice : tensor<1x3xf32>) {
+  } outs(%output : tensor<1x3xf32>) {
   ^bb0(%out: f32):
     %2 = linalg.index 1 : index
-    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
-    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %idx)
+    %extracted = tensor.extract %src[%c79, %3] : tensor<80x16xf32>
     linalg.yield %extracted : f32
   } -> tensor<1x3xf32>
   return %1 : tensor<1x3xf32>
 }
 
 // CHECK-LABEL:   func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous
-// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 3 : index
-// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_5]] : vector<1x4xi1>
-// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
-// CHECK:           %[[VAL_11:.*]] = vector.broadcast {{.*}} : index to vector<4xindex>
-// CHECK:           %[[VAL_12:.*]] = arith.addi {{.*}} : vector<4xindex>
-// CHECK:           %[[VAL_20:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
-// CHECK:           %[[VAL_22:.*]] = vector.mask %[[VAL_8]] { vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
+// CHECK-SAME:      %[[SRC:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x3xf32>,
+// CHECK-SAME:      %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
+
+/// Create the mask
+// CHECK-DAG:       %[[DIM_0:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[DIM_1:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C79:.*]] = arith.constant 79 : index
+// CHECK:           %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x4xi1>
+
+/// TODO: This transfer_read is redundant - remove
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
+
+/// Caluclate the index vector
+// CHECK:           %[[STEP:.*]] = vector.step : vector<4xindex>
+// CHECK:           %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<4xindex>
+// CHECK:           %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<4xindex>
+// CHECK:           %[[C0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<4xindex> to vector<4xindex>
+
+/// Extract the starting point from the index vector
+// CHECK:           %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<4xindex>
+
+// Final read and write
+// CHECK:           %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
+// CHECK:           %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -33,7 +56,69 @@ module attributes {transform.with_named_sequence} {
    }
 }
 
- // -----
+// -----
+
+// Identical to the above, but with scalable vectors.
+
+func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable(
+    %src: tensor<80x16xf32>,
+    %output : tensor<1x3xf32>,
+    %idx: index) -> tensor<1x3xf32> {
+
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%output : tensor<1x3xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %idx)
+    %extracted = tensor.extract %src[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x3xf32>
+
+  return %1 : tensor<1x3xf32>
+}
+
+// CHECK-LABEL:   func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable
+// CHECK-SAME:      %[[SRC:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x3xf32>,
+// CHECK-SAME:      %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
+
+/// Create the mask
+// CHECK-DAG:       %[[DIM_0:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[DIM_1:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C79:.*]] = arith.constant 79 : index
+// CHECK:           %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x[4]xi1>
+
+/// TODO: This transfer_read is redundant - remove
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+
+/// Caluclate the index vector
+// CHECK:           %[[STEP:.*]] = vector.step : vector<[4]xindex>
+// CHECK:           %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<[4]xindex>
+// CHECK:           %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<[4]xindex>
+// CHECK:           %[[C0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<[4]xindex> to vector<[4]xindex>
+
+/// Extract the starting point from the index vector
+// CHECK:           %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<[4]xindex>
+
+// Final read and write
+// CHECK:           %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+// CHECK:           %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x[4]xf32>, tensor<1x3xf32> } : vector<1x[4]xi1> -> tensor<1x3xf32>
+
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, [4]] {vectorize_nd_extract} : !transform.any_op
+     transform.yield
+   }
+}
+
+// -----
 
 func.func @masked_dynamic_vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<?x?xf32>, %arg0: index, %extracted_slice : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c79 = arith.constant 79 : index
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index c3aed35153241..f7db1140794e5 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -336,3 +336,21 @@ func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc,  %buffer2: m
   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
   return
 }
+// -----
+
+func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
+  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
+  %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
+  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
+  %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
+  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
+  %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32>
+}
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 9977dd57e3023..1d1d93f097758 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -420,8 +420,8 @@ func.func @omp_simd_aligned_mismatch(%arg0 : index, %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {alignment_values = [128],
-      operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+  }) {alignments = [128],
+      operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -435,7 +435,7 @@ func.func @omp_simd_aligned_negative(%arg0 : index, %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {alignment_values = [-1, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+  }) {alignments = [-1, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -449,7 +449,7 @@ func.func @omp_simd_unexpected_alignment(%arg0 : index, %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {alignment_values = [1, 128]} : () -> ()
+  }) {alignments = [1, 128]} : () -> ()
   return
 }
 
@@ -463,7 +463,7 @@ func.func @omp_simd_aligned_float(%arg0 : index, %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {alignment_values = [1.5, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+  }) {alignments = [1.5, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -477,7 +477,7 @@ func.func @omp_simd_aligned_the_same_var(%arg0 : index, %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+  }) {alignments = [1, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -491,7 +491,7 @@ func.func @omp_simd_nontemporal_the_same_var(%arg0 : index,  %arg1 : index,
     omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
       omp.yield
     }
-  }) {operandSegmentSizes = array<i32: 0, 0, 2>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -839,7 +839,7 @@ func.func @omp_ordered_region3(%x : i32) -> () {
 
 func.func @omp_ordered1(%vec0 : i64) -> () {
   // expected-error @below {{op must be nested inside of a loop}}
-  omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+  omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
   return
 }
 
@@ -849,7 +849,7 @@ func.func @omp_ordered2(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
   omp.distribute {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // expected-error @below {{op must be nested inside of a worksharing, simd or worksharing simd loop}}
-      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
       omp.yield
     }
     omp.terminator
@@ -863,7 +863,7 @@ func.func @omp_ordered3(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
   omp.wsloop {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // expected-error @below {{the enclosing worksharing-loop region must have an ordered clause}}
-      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
       omp.yield
     }
     omp.terminator
@@ -877,7 +877,7 @@ func.func @omp_ordered4(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
   omp.wsloop ordered(0) {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // expected-error @below {{the enclosing loop's ordered clause must have a parameter present}}
-      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
       omp.yield
     }
     omp.terminator
@@ -891,7 +891,7 @@ func.func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec
   omp.wsloop ordered(1) {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // expected-error @below {{number of variables in depend clause does not match number of iteration variables in the doacross loop}}
-      omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64}
+      omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64}
       omp.yield
     }
     omp.terminator
@@ -1394,7 +1394,7 @@ func.func @omp_teams_allocate(%data_var : memref<i32>) {
     // expected-error @below {{expected equal sizes for allocate and allocator variables}}
     "omp.teams" (%data_var) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 0,0,0,0,1,0,0>} : (memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 1,0,0,0,0,0,0,0>} : (memref<i32>) -> ()
     omp.terminator
   }
   return
@@ -1407,7 +1407,7 @@ func.func @omp_teams_num_teams1(%lb : i32) {
     // expected-error @below {{expected num_teams upper bound to be defined if the lower bound is defined}}
     "omp.teams" (%lb) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 1,0,0,0,0,0,0>} : (i32) -> ()
+    }) {operandSegmentSizes = array<i32: 0,0,0,1,0,0,0,0>} : (i32) -> ()
     omp.terminator
   }
   return
@@ -1432,7 +1432,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
   "omp.sections" (%data_var) ({
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,1,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,0,0,0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1442,7 +1442,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected as many reduction symbol references as reduction variables}}
   "omp.sections" (%data_var) ({
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,0,0,1>} : (memref<i32>) -> ()
   return
 }
 
@@ -1557,17 +1557,17 @@ func.func @omp_single(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
   "omp.single" (%data_var) ({
     omp.barrier
-  }) {operandSegmentSizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,0,0,0>} : (memref<i32>) -> ()
   return
 }
 
 // -----
 
 func.func @omp_single_copyprivate(%data_var : memref<i32>) -> () {
-  // expected-error @below {{inconsistent number of copyPrivate vars (= 1) and functions (= 0), both must be equal}}
+  // expected-error @below {{inconsistent number of copyprivate vars (= 1) and functions (= 0), both must be equal}}
   "omp.single" (%data_var) ({
     omp.barrier
-  }) {operandSegmentSizes = array<i32: 0,0,1>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,0,1,0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1623,7 +1623,7 @@ func.func @omp_task_depend(%data_var: memref<i32>) {
   // expected-error @below {{op expected as many depend values as depend variables}}
     "omp.task"(%data_var) ({
       "omp.terminator"() : () -> ()
-    }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
+    }) {depend_kinds = [], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
    "func.return"() : () -> ()
 }
 
@@ -1820,7 +1820,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
-  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1, 0, 0, 0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1834,7 +1834,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
-  }) {operandSegmentSizes = array<i32: 0, 0, 0, 2, 0, 0, 0, 0, 0>, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>, reduction_syms = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> ()
   return
 }
 
@@ -1847,7 +1847,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
-  }) {operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0, 0, 0>, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>, reduction_syms = [@add_f32, @add_f32]} : (!llvm.ptr) -> ()
   return
 }
 
@@ -1861,7 +1861,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
-  }) {in_reductions = [@add_f32], operandSegmentSizes = array<i32: 0, 0, 2, 0, 0, 0, 0, 0, 0>} : (!llvm.ptr, !llvm.ptr) -> ()
+  }) {in_reduction_syms = [@add_f32], operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>} : (!llvm.ptr, !llvm.ptr) -> ()
   return
 }
 
@@ -1874,7 +1874,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
-  }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0, 0>} : (!llvm.ptr) -> ()
+  }) {in_reduction_syms = [@add_f32, @add_f32], operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>} : (!llvm.ptr) -> ()
   return
 }
 
@@ -1934,7 +1934,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
 func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   %testi64 = "test.i64"() : () -> (i64)
   // expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}}
-  omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) {
+  omp.taskloop grainsize(%testi64: i64) num_tasks(%testi64: i64) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       omp.yield
     }
@@ -2001,7 +2001,7 @@ func.func @omp_target_data(%map1: memref<?xi32>) {
 // -----
 
 func.func @omp_target_data() {
-  // expected-error @below {{At least one of map, useDevicePtr, or useDeviceAddr operand must be present}}
+  // expected-error @below {{At least one of map, use_device_ptr_vars, or use_device_addr_vars operand must be present}}
   omp.target_data {}
   return
 }
@@ -2020,7 +2020,7 @@ func.func @omp_target_enter_data(%map1: memref<?xi32>) {
 func.func @omp_target_enter_data_depend(%a: memref<?xi32>) {
   %0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
   // expected-error @below {{op expected as many depend values as depend variables}}
-  omp.target_enter_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+  omp.target_enter_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
   return
 }
 
@@ -2038,7 +2038,7 @@ func.func @omp_target_exit_data(%map1: memref<?xi32>) {
 func.func @omp_target_exit_data_depend(%a: memref<?xi32>) {
   %0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(from) capture(ByRef) -> memref<?xi32>
   // expected-error @below {{op expected as many depend values as depend variables}}
-  omp.target_exit_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+  omp.target_exit_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
   return
 }
 
@@ -2119,7 +2119,7 @@ llvm.mlir.global internal @_QFsubEx() : i32
 func.func @omp_target_update_data_depend(%a: memref<?xi32>) {
   %0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
   // expected-error @below {{op expected as many depend values as depend variables}}
-  omp.target_update map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+  omp.target_update map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
   return
 }
 
@@ -2129,7 +2129,7 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
   // expected-error @below {{op expected as many depend values as depend variables}}
     "omp.target"(%data_var) ({
       "omp.terminator"() : () -> ()
-    }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0, 0>} : (memref<i32>) -> ()
+    }) {depend_kinds = [], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
    "func.return"() : () -> ()
 }
 
@@ -2137,7 +2137,7 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
 
 func.func @omp_distribute_schedule(%chunk_size : i32) -> () {
   // expected-error @below {{op chunk size set without dist_schedule_static being present}}
-  "omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
+  "omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 0, 0, 1, 0>}> ({
       "omp.terminator"() : () -> ()
     }) : (i32) -> ()
 }
@@ -2146,7 +2146,7 @@ func.func @omp_distribute_schedule(%chunk_size : i32) -> () {
 
 func.func @omp_distribute_allocate(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
-  "omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 0, 1, 0>}> ({
+  "omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>}> ({
       "omp.terminator"() : () -> ()
     }) : (memref<i32>) -> ()
 }
@@ -2340,7 +2340,7 @@ func.func @undefined_privatizer(%arg0: index) {
 // -----
 func.func @undefined_privatizer(%arg0: !llvm.ptr) {
   // expected-error @below {{inconsistent number of private variables and privatizer op symbols, private vars: 1 vs. privatizer op symbols: 2}}
-  "omp.parallel"(%arg0) <{operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1>, privatizers = [@x.privatizer, @y.privatizer]}> ({
+  "omp.parallel"(%arg0) <{operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0>, private_syms = [@x.privatizer, @y.privatizer]}> ({
     ^bb0(%arg2: !llvm.ptr):
       omp.terminator
     }) : (!llvm.ptr) -> ()
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index d6f4a810c4a80..d2924998f41b8 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -67,37 +67,37 @@ func.func @omp_terminator() -> () {
 
 func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i32, %idx : index) -> () {
   // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
-  "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var) ({
+  "omp.parallel" (%data_var, %data_var, %if_cond, %num_threads) ({
 
   // test without if condition
   // CHECK: omp.parallel num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
-    "omp.parallel"(%num_threads, %data_var, %data_var) ({
+    "omp.parallel"(%data_var, %data_var, %num_threads) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 0,1,1,1,0,0>} : (i32, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 1,1,0,1,0,0>} : (memref<i32>, memref<i32>, i32) -> ()
 
   // CHECK: omp.barrier
     omp.barrier
 
   // test without num_threads
   // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
-    "omp.parallel"(%if_cond, %data_var, %data_var) ({
+    "omp.parallel"(%data_var, %data_var, %if_cond) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 1,0,1,1,0,0>} : (i1, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0>} : (memref<i32>, memref<i32>, i1) -> ()
 
   // test without allocate
   // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32)
     "omp.parallel"(%if_cond, %num_threads) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 1,1,0,0,0,0>} : (i1, i32) -> ()
+    }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0>} : (i1, i32) -> ()
 
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,1,1,1,0,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,1,1,1,0,0>, proc_bind_kind = #omp<procbindkind spread>} : (memref<i32>, memref<i32>, i1, i32) -> ()
 
   // test with multiple parameters for single variadic argument
   // CHECK: omp.parallel allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
   "omp.parallel" (%data_var, %data_var) ({
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,1,0,0,0,0>} : (memref<i32>, memref<i32>) -> ()
 
   // CHECK: omp.distribute
   omp.distribute {
@@ -184,7 +184,7 @@ func.func @omp_loop_nest(%lb : index, %ub : index, %step : index) -> () {
     "omp.loop_nest" (%lb, %ub, %step) ({
     ^bb0(%iv: index):
       omp.yield
-    }) {inclusive} : (index, index, index) -> ()
+    }) {loop_inclusive} : (index, index, index) -> ()
     omp.terminator
   }
 
@@ -382,7 +382,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
       omp.yield
     }
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,0,0,0>, ordered_val = 1} :
+  }) {operandSegmentSizes = array<i32: 0,0,0,0,0,0,0>, ordered = 1} :
     () -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static) {
@@ -392,7 +392,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
       omp.yield
     }
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,1,0,0>, schedule_val = #omp<schedulekind static>} :
+  }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0,0>, schedule_kind = #omp<schedulekind static>} :
     (memref<i32>, i32) -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>, %{{.*}} = %{{.*}} : memref<i32>) schedule(static) {
@@ -402,7 +402,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
       omp.yield
     }
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 2,2,0,0>, schedule_val = #omp<schedulekind static>} :
+  }) {operandSegmentSizes = array<i32: 0,0,2,2,0,0,0>, schedule_kind = #omp<schedulekind static>} :
     (memref<i32>, memref<i32>, i32, i32) -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}) ordered(2) {
@@ -412,7 +412,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
       omp.yield
     }
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,1,0,1>, schedule_val = #omp<schedulekind dynamic>, ordered_val = 2} :
+  }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0,1>, schedule_kind = #omp<schedulekind dynamic>, ordered = 2} :
     (memref<i32>, i32, i32) -> ()
 
   // CHECK: omp.wsloop schedule(auto) nowait {
@@ -422,7 +422,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
       omp.yield
     }
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,0,0,0>, nowait, schedule_val = #omp<schedulekind auto>} :
+  }) {operandSegmentSizes = array<i32: 0,0,0,0,0,0,0>, nowait, schedule_kind = #omp<schedulekind auto>} :
     () -> ()
 
   // CHECK: omp.wsloop {
@@ -574,8 +574,8 @@ func.func @omp_simd_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index,
       "omp.yield"() : () -> ()
     }) : (index, index, index) -> ()
     "omp.terminator"() : () -> ()
-  }) {alignment_values = [32, 128],
-      operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+  }) {alignments = [32, 128],
+      operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -589,8 +589,8 @@ func.func @omp_simd_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index,
       "omp.yield"() : () -> ()
     }) : (index, index, index) -> ()
     "omp.terminator"() : () -> ()
-  }) {alignment_values = [32],
-      operandSegmentSizes = array<i32: 1, 0, 0>} : (memref<i32>) -> ()
+  }) {alignments = [32],
+      operandSegmentSizes = array<i32: 1, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
   return
 }
 
@@ -605,7 +605,7 @@ func.func @omp_simd_nontemporal_list(%arg0 : index, %arg1 : index,
       "omp.yield"() : () -> ()
     }) : (index, index, index) -> ()
     "omp.terminator"() : () -> ()
-  }) {operandSegmentSizes = array<i32: 0, 0, 2>} : (memref<i32>, memref<i64>) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 2, 0, 0>} : (memref<i32>, memref<i64>) -> ()
   return
 }
 
@@ -620,7 +620,7 @@ func.func @omp_simd_nontemporal_single(%arg0 : index, %arg1 : index,
       "omp.yield"() : () -> ()
     }) : (index, index, index) -> ()
     "omp.terminator"() : () -> ()
-  }) {operandSegmentSizes = array<i32: 0, 0, 1>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
   return
 }
 
@@ -752,8 +752,8 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref<i32>, %arg0 : i3
     }
     omp.terminator
   }
-  // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32)
-  omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) {
+  // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%{{.+}} : i32)
+  omp.distribute dist_schedule_static dist_schedule_chunk_size(%chunk_size : i32) {
     omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
       omp.yield
     }
@@ -805,16 +805,16 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref<i32>, %arg0 : i3
 func.func @omp_target(%if_cond : i1, %device : si32,  %num_threads : i32, %device_ptr: memref<i32>, %device_addr: memref<?xi32>, %map1: memref<?xi32>, %map2: memref<?xi32>) -> () {
 
     // Test with optional operands; if_expr, device, thread_limit, private, firstprivate and nowait.
-    // CHECK: omp.target if({{.*}}) device({{.*}}) thread_limit({{.*}}) nowait
-    "omp.target"(%if_cond, %device, %num_threads) ({
+    // CHECK: omp.target device({{.*}}) if({{.*}}) nowait thread_limit({{.*}})
+    "omp.target"(%device, %if_cond, %num_threads) ({
        // CHECK: omp.terminator
        omp.terminator
-    }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0,0,0,0,0>} : ( i1, si32, i32 ) -> ()
+    }) {nowait, operandSegmentSizes = array<i32: 0,0,0,1,0,1,0,0,0,0,1>} : ( si32, i1, i32 ) -> ()
 
     // Test with optional map clause.
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
     // CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
-    // CHECK: omp.target is_device_ptr(%[[VAL_4:.*]] : memref<i32>) has_device_addr(%[[VAL_5:.*]] : memref<?xi32>) map_entries(%[[MAP_A]] -> {{.*}}, %[[MAP_B]] -> {{.*}} : memref<?xi32>, memref<?xi32>) {
+    // CHECK: omp.target has_device_addr(%[[VAL_5:.*]] : memref<?xi32>) is_device_ptr(%[[VAL_4:.*]] : memref<i32>) map_entries(%[[MAP_A]] -> {{.*}}, %[[MAP_B]] -> {{.*}} : memref<?xi32>, memref<?xi32>) {
     %mapv1 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>)   map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
     %mapv2 = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
     omp.target map_entries(%mapv1 -> %arg0, %mapv2 -> %arg1 : memref<?xi32>, memref<?xi32>) is_device_ptr(%device_ptr : memref<i32>) has_device_addr(%device_addr : memref<?xi32>) {
@@ -838,12 +838,12 @@ func.func @omp_target(%if_cond : i1, %device : si32,  %num_threads : i32, %devic
 
 func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i32>, %device_addr: memref<?xi32>, %map1: memref<?xi32>, %map2: memref<?xi32>) -> () {
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(always, from) capture(ByRef) -> memref<?xi32> {name = ""}
-    // CHECK: omp.target_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) map_entries(%[[MAP_A]] : memref<?xi32>)
+    // CHECK: omp.target_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>)
     %mapv1 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>)   map_clauses(always, from) capture(ByRef) -> memref<?xi32> {name = ""}
     omp.target_data if(%if_cond) device(%device : si32) map_entries(%mapv1 : memref<?xi32>){}
 
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(close, present, to) capture(ByRef) -> memref<?xi32> {name = ""}
-    // CHECK: omp.target_data use_device_ptr(%[[VAL_3:.*]] : memref<i32>) use_device_addr(%[[VAL_4:.*]] : memref<?xi32>) map_entries(%[[MAP_A]] : memref<?xi32>)
+    // CHECK: omp.target_data map_entries(%[[MAP_A]] : memref<?xi32>) use_device_addr(%[[VAL_4:.*]] : memref<?xi32>) use_device_ptr(%[[VAL_3:.*]] : memref<i32>)
     %mapv2 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>)   map_clauses(close, present, to) capture(ByRef) -> memref<?xi32> {name = ""}
     omp.target_data use_device_ptr(%device_ptr : memref<i32>) use_device_addr(%device_addr : memref<?xi32>) map_entries(%mapv2 : memref<?xi32>) {}
 
@@ -855,12 +855,12 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i
     omp.target_data map_entries(%mapv3, %mapv4 : memref<?xi32>, memref<?xi32>) {}
 
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
-    // CHECK: omp.target_enter_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref<?xi32>)
+    // CHECK: omp.target_enter_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>) nowait
     %mapv5 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
     omp.target_enter_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv5 : memref<?xi32>)
 
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
-    // CHECK: omp.target_exit_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref<?xi32>)
+    // CHECK: omp.target_exit_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>) nowait
     %mapv6 = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
     omp.target_exit_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv6 : memref<?xi32>)
 
@@ -869,12 +869,12 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i
 
 // CHECK-LABEL: omp_target_pretty
 func.func @omp_target_pretty(%if_cond : i1, %device : si32,  %num_threads : i32) -> () {
-    // CHECK: omp.target if({{.*}}) device({{.*}})
+    // CHECK: omp.target device({{.*}}) if({{.*}})
     omp.target if(%if_cond) device(%device : si32) {
       omp.terminator
     }
 
-    // CHECK: omp.target if({{.*}}) device({{.*}}) nowait
+    // CHECK: omp.target device({{.*}}) if({{.*}}) nowait
     omp.target if(%if_cond) device(%device : si32) thread_limit(%num_threads : i32) nowait {
       omp.terminator
     }
@@ -1294,11 +1294,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32,
   omp.wsloop ordered(1) {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // Only one DEPEND(SINK: vec) clause
-      // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64}
-      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+      // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
 
-      // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64}
-      omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+      // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64}
+      omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
 
       omp.yield
     }
@@ -1308,11 +1308,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32,
   omp.wsloop ordered(2) {
     omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
       // Multiple DEPEND(SINK: vec) clauses
-      // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64}
-      omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+      // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
 
-      // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {num_loops_val = 2 : i64}
-      omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64}
+      // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {doacross_num_loops = 2 : i64}
+      omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64}
 
       omp.yield
     }
@@ -1874,13 +1874,13 @@ func.func @omp_sectionsop(%data_var1 : memref<i32>, %data_var2 : memref<i32>,
   "omp.sections" (%data_var1, %data_var1) ({
     // CHECK: omp.terminator
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,1,1>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,1,0,0>} : (memref<i32>, memref<i32>) -> ()
 
     // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr)
   "omp.sections" (%redn_var) ({
     // CHECK: omp.terminator
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,0,0>, reduction_vars_byref = array<i1: false>, reductions=[@add_f32]} : (!llvm.ptr) -> ()
+  }) {operandSegmentSizes = array<i32: 0,0,0,1>, reduction_byref = array<i1: false>, reduction_syms=[@add_f32]} : (!llvm.ptr) -> ()
 
   // CHECK: omp.sections nowait {
   omp.sections nowait {
@@ -2098,14 +2098,14 @@ func.func @omp_task(%bool_var: i1, %i64_var: i64, %i32_var: i32, %data_var: memr
   }
 
   // Checking multiple clauses
-  // CHECK: omp.task if(%[[bool_var]]) final(%[[bool_var]]) untied
-  omp.task if(%bool_var) final(%bool_var) untied
+  // CHECK: omp.task allocate(%[[data_var]] : memref<i32> -> %[[data_var]] : memref<i32>)
+  omp.task allocate(%data_var : memref<i32> -> %data_var : memref<i32>)
+      // CHECK-SAME: final(%[[bool_var]]) if(%[[bool_var]])
+      final(%bool_var) if(%bool_var) 
       // CHECK-SAME: in_reduction(@add_f32 -> %[[redn_var1]] : !llvm.ptr, byref @add_f32 -> %[[redn_var2]] : !llvm.ptr)
       in_reduction(@add_f32 -> %0 : !llvm.ptr, byref @add_f32 -> %1 : !llvm.ptr)
-      // CHECK-SAME: priority(%[[i32_var]] : i32)
-      priority(%i32_var : i32)
-      // CHECK-SAME: allocate(%[[data_var]] : memref<i32> -> %[[data_var]] : memref<i32>)
-      allocate(%data_var : memref<i32> -> %data_var : memref<i32>) {
+      // CHECK-SAME: priority(%[[i32_var]] : i32) untied
+      priority(%i32_var : i32) untied {
     // CHECK: "test.foo"() : () -> ()
     "test.foo"() : () -> ()
     // CHECK: omp.terminator
@@ -2136,7 +2136,7 @@ func.func @omp_target_depend(%arg0: memref<i32>, %arg1: memref<i32>) {
   omp.target depend(taskdependin -> %arg0 : memref<i32>, taskdependin -> %arg1 : memref<i32>, taskdependinout -> %arg0 : memref<i32>) {
     // CHECK: omp.terminator
     omp.terminator
-  } {operandSegmentSizes = array<i32: 0,0,0,3,0>}
+  } {operandSegmentSizes = array<i32: 0,0,0,3,0,0,0,0>}
   return
 }
 
@@ -2281,7 +2281,7 @@ func.func @omp_taskgroup_multiple_tasks() -> () {
 func.func @omp_taskgroup_clauses() -> () {
   %testmemref = "test.memref"() : () -> (memref<i32>)
   %testf32 = "test.f32"() : () -> (!llvm.ptr)
-  // CHECK: omp.taskgroup task_reduction(@add_f32 -> %{{.+}}: !llvm.ptr) allocate(%{{.+}}: memref<i32> -> %{{.+}}: memref<i32>)
+  // CHECK: omp.taskgroup allocate(%{{.+}}: memref<i32> -> %{{.+}}: memref<i32>) task_reduction(@add_f32 -> %{{.+}}: !llvm.ptr)
   omp.taskgroup allocate(%testmemref : memref<i32> -> %testmemref : memref<i32>) task_reduction(@add_f32 -> %testf32 : !llvm.ptr) {
     // CHECK: omp.task
     omp.task {
@@ -2421,8 +2421,8 @@ func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () {
   }
 
   %testi64 = "test.i64"() : () -> (i64)
-  // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) {
-  omp.taskloop grain_size(%testi64: i64) {
+  // CHECK: omp.taskloop grainsize(%{{[^:]+}}: i64) {
+  omp.taskloop grainsize(%testi64: i64) {
     omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
       // CHECK: omp.yield
       omp.yield
@@ -2577,7 +2577,7 @@ func.func @omp_target_update_data (%if_cond : i1, %device : si32, %map1: memref<
 
     %mapv_to = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>) map_clauses(present, to) capture(ByRef) -> memref<?xi32> {name = ""}
 
-    // CHECK: omp.target_update if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%{{.*}}, %{{.*}} : memref<?xi32>, memref<?xi32>)
+    // CHECK: omp.target_update device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%{{.*}}, %{{.*}} : memref<?xi32>, memref<?xi32>) nowait
     omp.target_update if(%if_cond) device(%device : si32) nowait map_entries(%mapv_from , %mapv_to : memref<?xi32>, memref<?xi32>)
     return
 }
@@ -2614,7 +2614,7 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memre
   }
 
   // Then map that over to the target
-  // CHECK: omp.target_enter_data depend(taskdependin -> [[ARG0]] : memref<?xi32>) nowait map_entries([[MAP0]], [[MAP2]] : memref<?xi32>, memref<?xi32>)
+  // CHECK: omp.target_enter_data depend(taskdependin -> [[ARG0]] : memref<?xi32>) map_entries([[MAP0]], [[MAP2]] : memref<?xi32>, memref<?xi32>) nowait
   omp.target_enter_data depend(taskdependin ->  %a: memref<?xi32>) nowait map_entries(%map_a, %map_c: memref<?xi32>, memref<?xi32>)
 
   // Compute 'b' on the target and copy it back
@@ -2631,7 +2631,7 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memre
   }
 
   // Copy the updated 'a' onto the target
-  // CHECK: omp.target_update depend(taskdependin -> [[ARG0]] : memref<?xi32>) nowait map_entries([[MAP0]] : memref<?xi32>)
+  // CHECK: omp.target_update depend(taskdependin -> [[ARG0]] : memref<?xi32>) map_entries([[MAP0]] : memref<?xi32>) nowait
   omp.target_update depend(taskdependin -> %a : memref<?xi32>) nowait map_entries(%map_a :  memref<?xi32>)
 
   // Compute 'c' on the target and copy it back
diff --git a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
index 8954839c3c93e..43a4693ca3d2b 100644
--- a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
+++ b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
@@ -20,7 +20,7 @@ func.func @wrap_while_loop_in_zero_trip_check(%bound : i32) -> i32 {
 // CHECK-SAME:      %[[BOUND:.*]]: i32) -> i32 {
 // CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : i32
 // CHECK-DAG:     %[[C5:.*]] = arith.constant 5 : i32
-// CHECK-DAG:     %[[PRE_COND:.*]] = arith.cmpi slt, %[[C0]], %[[BOUND]] : i32
+// CHECK-DAG:     %[[PRE_COND:.*]] = arith.cmpi sgt, %[[BOUND]], %[[C0]] : i32
 // CHECK-DAG:     %[[PRE_INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32
 // CHECK:         %[[IF:.*]]:2 = scf.if %[[PRE_COND]] -> (i32, i32) {
 // CHECK:           %[[WHILE:.*]]:2 = scf.while (
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 40b137f1fa36e..5b98a7790debf 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -1361,6 +1361,45 @@ func.func @broadcast_as_from_extent_tensor(%a : tensor<?xindex>) -> !shape.shape
 
 // -----
 
+// CHECK-LABEL: func @shape_of_from_reshape
+// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32>
+// CHECK-SAME: %[[SHAPE:.*]]: tensor<?xindex>
+func.func @shape_of_from_reshape(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>) -> tensor<?xindex> {
+  // CHECK: return %[[SHAPE]] : tensor<?xindex>
+  %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  %1 = shape.shape_of %0 : tensor<*xf32> -> tensor<?xindex>
+  return %1 : tensor<?xindex>
+}
+
+// -----
+
+// CHECK-LABEL: func @shape_of_from_reshape_compatible_types
+// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32>
+// CHECK-SAME: %[[SHAPE:.*]]: tensor<5xindex>
+func.func @shape_of_from_reshape_compatible_types(%arg0: tensor<*xf32>, %arg1: tensor<5xindex>) -> tensor<?xindex> {
+  // CHECK: %[[CAST_SHAPE:.*]] = tensor.cast %[[SHAPE]] : tensor<5xindex> to tensor<?xindex>
+  // CHECK: return %[[CAST_SHAPE]] : tensor<?xindex>
+  %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<5xindex>) -> tensor<*xf32>
+  %1 = shape.shape_of %0 : tensor<*xf32> -> tensor<?xindex>
+  return %1 : tensor<?xindex>
+}
+
+// -----
+
+// CHECK-LABEL: func @shape_of_from_reshape_nofold
+// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32>
+// CHECK-SAME: %[[SHAPE:.*]]: tensor<?xindex>
+func.func @shape_of_from_reshape_nofold(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>) -> !shape.shape {
+  // CHECK: %[[RESHAPED:.*]] = tensor.reshape %[[INPUT]](%[[SHAPE]]) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  // CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[RESHAPED]] : tensor<*xf32> -> !shape.shape
+  // CHECK: return %[[SHAPE_OF]] : !shape.shape
+  %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  %1 = shape.shape_of %0 : tensor<*xf32> -> !shape.shape
+  return %1 : !shape.shape
+}
+
+// -----
+
 // CHECK-LABEL: @cast_extent_tensor
 // CHECK-SAME: (%[[ARG:.*]]: tensor<?x?x?xf32>) -> tensor<?xindex>
 func.func @cast_extent_tensor(%arg : tensor<?x?x?xf32>) -> tensor<?xindex> {
diff --git a/mlir/test/Dialect/Shape/unranked-tensor-lowering.mlir b/mlir/test/Dialect/Shape/unranked-tensor-lowering.mlir
new file mode 100644
index 0000000000000..c1fcbd1ad045f
--- /dev/null
+++ b/mlir/test/Dialect/Shape/unranked-tensor-lowering.mlir
@@ -0,0 +1,90 @@
+// RUN: mlir-opt -split-input-file -canonicalize -cse %s | FileCheck %s
+
+// This test verifies the simplification of IR patterns that emerge when
+// lowering high-level element-wise ops with unranked tensor inputs. Consider
+// the following function incrementing and doubling the value of an input
+// unranked tensor using ops in a hypothetical high-level dialect called 'hl':
+//
+//  func.func @f(%input: tensor<*xf32>) -> tensor<*xf32> {
+//    %0 = hl.inc %input : tensor<*xf32>
+//    %1 = hl.double %0 : tensor<*xf32>
+//    return %1 : tensor<*xf32>
+//  }
+//
+// A possible strategy to lower 'hl.inc' consists in reshaping its operand into
+// a 1D tensor, creating a 1D tensor splat with the same total size as the input
+// operand and with value 1.0, adding both 1D tensors using 'arith.addf', and
+// reshaping the result back into the original input shape. A similar process
+// applies for 'hl.double', except with a tensor splat with value 2.0 and an
+// 'arith.mulf' op. The body of the function in the test below contains the full
+// sequence.
+//
+// Since such lowering process would operate on individual 'hl' ops in a
+// context-oblivious manner, the emitted code produces a redundant IR pattern
+// where the result of 'arith.addf' is reshaped into an unranked tensor, just
+// for it to be immediately reshaped back into the 1D tensor consumed by
+// 'arith.mulf'. This entails the overhead of re-computing the unranked tensor
+// shape ('shape.shape_of') and size ('shape.num_elements').
+//
+// This test verifies that the consecutive application of a canonicalization and
+// a CSE pass successfully simplifies this emerging pattern, leading to a
+// version of the code in which the result of the emitted 'arith.addf' op
+// associated with 'hl.inc' is directly consumed by the 'arith.mulf' op
+// associated with 'hl.double', as observed in the FileCheck directives. The
+// main rewrite patterns at play are 'shape.shape_of' canonicalization,
+// 'tensor.reshape' canonicalization, and 'shape.num_elements' subexpression
+// elimination.
+//
+
+// CHECK-LABEL: @unranked_tensor_lowering
+// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32>
+
+// CHECK-DAG: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG: %[[TWO:.*]] = arith.constant 2.000000e+00 : f32
+
+// CHECK: %[[INPUT_SHAPE:.*]] = shape.shape_of %[[INPUT]] : tensor<*xf32> -> tensor<?xindex>
+// CHECK: %[[INPUT_SIZE:.*]] = shape.num_elements %[[INPUT_SHAPE]] : tensor<?xindex> -> index
+// CHECK: %[[INPUT_COLLAPSED_SHAPE:.*]] = tensor.from_elements %[[INPUT_SIZE]] : tensor<1xindex>
+// CHECK: %[[INPUT_COLLAPSED:.*]] = tensor.reshape %[[INPUT]](%[[INPUT_COLLAPSED_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
+
+// CHECK: %[[ONE_SPLAT:.*]] = tensor.splat %[[ONE]]{{\[}}%[[INPUT_SIZE]]] : tensor<?xf32>
+// CHECK: %[[SUM_COLLAPSED:.*]] = arith.addf %[[INPUT_COLLAPSED]], %[[ONE_SPLAT]] : tensor<?xf32>
+
+// CHECK: %[[TWO_SPLAT:.*]] = tensor.splat %[[TWO]]{{\[}}%[[INPUT_SIZE]]] : tensor<?xf32>
+// CHECK: %[[PRODUCT_COLLAPSED:.*]] = arith.mulf %[[SUM_COLLAPSED]], %[[TWO_SPLAT]] : tensor<?xf32>
+
+// CHECK: %[[PRODUCT:.*]] = tensor.reshape %[[PRODUCT_COLLAPSED]](%[[INPUT_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
+// CHECK: return %[[PRODUCT]] : tensor<*xf32>
+
+func.func @unranked_tensor_lowering(%input: tensor<*xf32>) -> tensor<*xf32> {
+
+  // Collapse input
+  %input_shape = shape.shape_of %input : tensor<*xf32> -> tensor<?xindex>
+  %input_size = shape.num_elements %input_shape : tensor<?xindex> -> index
+  %input_collapsed_shape = tensor.from_elements %input_size : tensor<1xindex>
+  %input_collapsed = tensor.reshape %input(%input_collapsed_shape) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
+
+  // Second operand for sum
+  %one = arith.constant 1.0 : f32
+  %one_splat = tensor.splat %one[%input_size] : tensor<?xf32>
+
+  // Compute sum and expand it
+  %sum_collapsed = arith.addf %input_collapsed, %one_splat : tensor<?xf32>
+  %sum = tensor.reshape %sum_collapsed(%input_shape) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
+
+  // Collapse sum
+  %sum_shape = shape.shape_of %sum : tensor<*xf32> -> tensor<?xindex>
+  %sum_size = shape.num_elements %sum_shape : tensor<?xindex> -> index
+  %sum_collapsed_shape = tensor.from_elements %sum_size : tensor<1xindex>
+  %sum_collapsed_0 = tensor.reshape %sum(%sum_collapsed_shape) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
+
+  // Second operand for product
+  %two = arith.constant 2.0 : f32
+  %two_splat = tensor.splat %two[%sum_size] : tensor<?xf32>
+
+  // Compute product and expand it
+  %product_collapsed = arith.mulf %sum_collapsed_0, %two_splat : tensor<?xf32>
+  %product = tensor.reshape %product_collapsed(%sum_shape) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
+
+  return %product : tensor<*xf32>
+}
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index eb0dc01be25b9..737b736ba795f 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -1099,6 +1099,42 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
   return
 }
 
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+#CSR = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : dense,
+    j : compressed
+  )
+}>
+
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, lvls = 1>) -> f32 {
+  // expected-error@+1 {{'sparse_tensor.extract_value' op mismatch in tensor encoding and iterator encoding.}}
+  %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 1>
+  return %f : f32
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) -> f32 {
+  // expected-error@+1 {{'sparse_tensor.extract_value' op must use last-level iterator to extract values.}}
+  %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+  return %f : f32
+}
 
 // -----
 
@@ -1155,3 +1191,78 @@ func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -
   }
   return %r1 : index
 }
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+                              %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+  %init = arith.constant 0 : index
+  // expected-error @+1 {{'sparse_tensor.coiterate' op contains duplicated cases.}}
+  %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+       : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+       -> index
+  case %it1, _ {
+    sparse_tensor.yield %arg : index
+  }
+  case %it1, _ {
+    sparse_tensor.yield %arg : index
+  }
+  return %ret : index
+}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+                              %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+  %init = arith.constant 0 : index
+  // expected-error @+1 {{'sparse_tensor.coiterate' op types mismatch between 0th yield value and defined value on 0th region}}
+  %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+       : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+       -> index
+  case %it1, _ {
+    %i = arith.constant 1 : i32
+    sparse_tensor.yield %i : i32
+  }
+  return %ret : index
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+                              %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+  %init = arith.constant 0 : index
+  // expected-error @+1 {{'sparse_tensor.coiterate' op required out-of-bound coordinates}}
+  %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord1, %coord2) iter_args(%arg = %init)
+       : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+       -> index
+  case %it1, _ {
+    %i = arith.constant 1 : i32
+    sparse_tensor.yield %i : i32
+  }
+  return %ret : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index bce0b41a99828..ab861a2019dfa 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -739,6 +739,27 @@ func.func @sparse_has_runtime() -> i1 {
   return %has_runtime : i1
 }
 
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+// CHECK-LABEL:   func.func @sparse_extract_value(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<4x8xf32, #sparse>,
+// CHECK-SAME:      %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse, lvls = 1>) -> f32 {
+// CHECK:           %[[VAL_2:.*]] = sparse_tensor.extract_value %[[VAL_0]] at %[[VAL_1]] : tensor<4x8xf32, #sparse>, !sparse_tensor.iterator<#sparse, lvls = 1>
+// CHECK:           return %[[VAL_2]] : f32
+// CHECK:         }
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 1>) -> f32 {
+  %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 1>
+  return %f : f32
+}
+
+
 // -----
 
 #COO = #sparse_tensor.encoding<{
@@ -780,7 +801,7 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
 // CHECK-SAME:      %[[VAL_1:.*]]: index,
 // CHECK-SAME:      %[[VAL_2:.*]]: index) -> index {
 // CHECK:           %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 : tensor<4x8xf32, #sparse{{[0-9]*}}>
-// CHECK:           %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> (index) {
+// CHECK:           %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> index {
 // CHECK:             sparse_tensor.yield %[[VAL_7]] : index
 // CHECK:           }
 // CHECK:           return %[[VAL_4]] : index
@@ -792,3 +813,36 @@ func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -
   }
   return %r1 : index
 }
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+
+// CHECK-LABEL:   func.func @sparse_coiteration(
+// CHECK-SAME:      %[[SP1:.*]]: !sparse_tensor.iter_space<#sparse, lvls = 0>,
+// CHECK-SAME:      %[[SP2:.*]]: !sparse_tensor.iter_space<#sparse, lvls = 1>) -> index {
+// CHECK:           %[[INIT:.*]] = arith.constant 0 : index
+// CHECK:           %[[RET:.*]] = sparse_tensor.coiterate (%[[SP1]], %[[SP2]]) at(%[[COORD:.*]]) iter_args(%[[ARG:.*]] = %[[INIT]])
+// CHECK:           case %[[VAL_6:.*]], _ {
+// CHECK:             sparse_tensor.yield %[[ARG]] : index
+// CHECK:           }
+// CHECK:           return %[[RET]] : index
+// CHECK:         }
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+                              %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+  %init = arith.constant 0 : index
+  %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+       : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+       -> index
+  case %it1, _ {
+    sparse_tensor.yield %arg : index
+  }
+  return %ret : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
index f5bbea0d340fb..268b3940418b7 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --sparse-space-collapse --lower-sparse-iteration-to-scf | FileCheck %s
+// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --cse --sparse-space-collapse --lower-sparse-iteration-to-scf --loop-invariant-code-motion | FileCheck %s
 
 
 #COO = #sparse_tensor.encoding<{
@@ -7,8 +7,7 @@
     d1 : singleton(nonunique, soa),
     d2 : singleton(nonunique, soa),
     d3 : singleton(soa)
-  ),
-  explicitVal = 1 : i32
+  )
 }>
 
 // CHECK-LABEL:   func.func @sqsum(
@@ -17,7 +16,10 @@
 // CHECK-DAG:       %[[POS_BUF:.*]] = sparse_tensor.positions %{{.*}} {level = 0 : index} : tensor<?x?x?x?xi32, #sparse> to memref<?xindex>
 // CHECK:           %[[POS_LO:.*]] = memref.load %[[POS_BUF]]{{\[}}%[[C0]]] : memref<?xindex>
 // CHECK:           %[[POS_HI:.*]] = memref.load %[[POS_BUF]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK:           %[[VAL_BUF:.*]] = sparse_tensor.values %{{.*}} : tensor<?x?x?x?xi32, #sparse> to memref<?xi32>
 // CHECK:           %[[SQ_SUM:.*]] = scf.for %[[POS:.*]] = %[[POS_LO]] to %[[POS_HI]] step %[[C1]] {{.*}} {
+// CHECK:             %[[VAL:.*]] = memref.load %[[VAL_BUF]]{{\[}}%[[POS]]] : memref<?xi32>
+// CHECK:             %[[MUL:.*]] = arith.muli %[[VAL]], %[[VAL]] : i32
 // CHECK:             %[[SUM:.*]] = arith.addi
 // CHECK:             scf.yield %[[SUM]] : i32
 // CHECK:           }
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index baa205b9f42c6..4b8efde78cc23 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -847,6 +847,33 @@ func.func @fold_reshape_constant_splat(%shape : tensor<1xi32>) -> tensor<4xf32>
 
 // -----
 
+// CHECK-LABEL: func @fold_reshape_chain
+//  CHECK-SAME: %[[INPUT:[a-zA-Z0-9_]+]]: tensor<*xf32>
+//  CHECK-SAME: %[[SHAPE_0:[a-zA-Z0-9_]+]]: tensor<?xindex>
+//  CHECK-SAME: %[[SHAPE_1:[a-zA-Z0-9_]+]]: tensor<?xindex>
+//  CHECK-SAME: %[[SHAPE_2:[a-zA-Z0-9_]+]]: tensor<?xindex>
+//       CHECK: %[[RESULT:.*]] = tensor.reshape %[[INPUT]](%[[SHAPE_2]])
+//       CHECK: return %[[RESULT]]
+func.func @fold_reshape_chain(%input: tensor<*xf32>, %shape_0: tensor<?xindex>, %shape_1: tensor<?xindex>, %shape_2: tensor<?xindex>) -> tensor<*xf32> {
+  %0 = tensor.reshape %input(%shape_0) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  %1 = tensor.reshape %0(%shape_1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  %2 = tensor.reshape %1(%shape_2) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  return %2 : tensor<*xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fold_reshape_1d
+//  CHECK-SAME: %[[INPUT:[a-zA-Z0-9_]+]]: tensor<?xf32>
+//  CHECK-SAME: %[[SHAPE:[a-zA-Z0-9_]+]]: tensor<1xindex>
+//       CHECK: return %[[INPUT]]
+func.func @fold_reshape_1d(%input: tensor<?xf32>, %shape: tensor<1xindex>) -> tensor<?xf32> {
+  %0 = tensor.reshape %input(%shape) : (tensor<?xf32>, tensor<1xindex>) -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @fold_extract_constant_splat
 //   CHECK-NOT: tensor.extract_slice
 //       CHECK: arith.constant dense<42> : tensor<4x4xi32>
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index ad82c9f8858e6..8e19f87dbf4aa 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -566,6 +566,17 @@ func.func @cast_float_to_int() -> tensor<i16> {
 
 // -----
 
+// CHECK: func.func @cast_float_to_int_round
+func.func @cast_float_to_int_round() -> tensor<i16> {
+  %splat = "tosa.const"() {value = dense<-3.5> : tensor<f32>} : () -> tensor<f32>
+  // CHECK: %[[SPLAT:.+]] = "tosa.const"() <{value = dense<-4> : tensor<i16>}
+  %cast = tosa.cast %splat : (tensor<f32>) -> tensor<i16>
+  // CHECK: return %[[SPLAT]]
+  return %cast : tensor<i16>
+}
+
+// -----
+
 // CHECK: func.func @cast_int_to_int_trunc
 func.func @cast_int_to_int_trunc() -> tensor<i16> {
   %splat = "tosa.const"() {value = dense<-1> : tensor<i32>} : () -> tensor<i32>
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index 208982a3e0e7b..00914c1d1baf6 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1182,6 +1182,20 @@ func.func @shape_cast_invalid_rank_expansion(%arg0 : vector<15x2xf32>) {
 
 // -----
 
+func.func @shape_cast_scalability_flag_is_dropped(%arg0 : vector<15x[2]xf32>) {
+  // expected-error@+1 {{different number of scalable dims at source (1) and result (0)}}
+  %0 = vector.shape_cast %arg0 : vector<15x[2]xf32> to vector<30xf32>
+}
+
+// -----
+
+func.func @shape_cast_scalability_flag_is_dropped(%arg0 : vector<2x[15]x[2]xf32>) {
+  // expected-error@+1 {{different number of scalable dims at source (2) and result (1)}}
+  %0 = vector.shape_cast %arg0 : vector<2x[15]x[2]xf32> to vector<30x[2]xf32>
+}
+
+// -----
+
 func.func @bitcast_not_vector(%arg0 : vector<5x1x3x2xf32>) {
   // expected-error@+1 {{'vector.bitcast' invalid kind of type specified}}
   %0 = vector.bitcast %arg0 : vector<5x1x3x2xf32> to f32
diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
index f70d23a193229..6e93923608cbf 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
@@ -249,11 +249,11 @@ func.func @vector_multi_reduction_parallel_middle(%arg0: vector<3x4x5xf32>, %acc
 //  CHECK-SAME:   %[[INPUT:.+]]: vector<3x4x5xf32>, %[[ACC:.+]]: vector<4xf32>
 //       CHECK: vector.transpose %[[INPUT]], [1, 0, 2] : vector<3x4x5xf32> to vector<4x3x5xf32>
 
-func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
+func.func private @vector_multi_reduction_non_scalable_dim(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
   %0 = vector.multi_reduction <add>, %A, %B [2] : vector<8x[4]x2xf32> to vector<8x[4]xf32>
   return %0 : vector<8x[4]xf32>
 }
-// CHECK-LABEL:   func.func private @scalable_dims(
+// CHECK-LABEL:   func.func private @vector_multi_reduction_non_scalable_dim(
 // CHECK-SAME:                                     %[[VAL_0:.*]]: vector<8x[4]x2xf32>,
 // CHECK-SAME:                                     %[[VAL_1:.*]]: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
 // CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<[32]xf32>
@@ -282,12 +282,12 @@ func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>
 // CHECK:           return %[[VAL_163]] : vector<8x[4]xf32>
 
 // Check that OneDimMultiReductionToTwoDim handles scalable dim
-func.func @scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) -> f32 {
+func.func @vector_multi_reduction_scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) -> f32 {
     %0 = vector.mask %C { vector.multi_reduction <add>, %A, %B [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
     return %0 : f32
 }
 
-// CHECK-LABEL:  func.func @scalable_dim_1d(
+// CHECK-LABEL:  func.func @vector_multi_reduction_scalable_dim_1d(
 // CHECK-SAME:                                      %[[ARG_0:.*]]: vector<[4]xf32>,
 // CHECK-SAME:                                      %[[ARG_1:.*]]: f32,
 // CHECK-SAME:                                      %[[ARG_2:.*]]: vector<[4]xi1>) -> f32 {
@@ -298,6 +298,30 @@ func.func @scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) ->
 // CHECK:          %[[VAL_4:.*]] = vector.extract %[[VAL_3]][0] : f32 from vector<1xf32>
 // CHECK:          return %[[VAL_4]] : f32
 
+func.func @vector_multi_reduction_scalable_dim_2d(%A: vector<2x[4]xf32>, %B: vector<2xf32>, %C: vector<2x[4]xi1>) -> vector<2xf32> {
+    %0 = vector.mask %C { vector.multi_reduction <add>, %A, %B [1] : vector<2x[4]xf32> to vector<2xf32> } : vector<2x[4]xi1> -> vector<2xf32>
+    return %0 : vector<2xf32>
+}
+
+// CHECK-LABEL:  func.func @vector_multi_reduction_scalable_dim_2d(
+// CHECK-SAME:                                      %[[ARG_0:.*]]: vector<2x[4]xf32>,
+// CHECK-SAME:                                      %[[ARG_1:.*]]: vector<2xf32>,
+// CHECK-SAME:                                      %[[ARG_2:.*]]: vector<2x[4]xi1>) -> vector<2xf32> {
+// CHECK-DAG:      %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C0_2xf32:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK:          %[[ARG0_0:.*]] = vector.extract %[[ARG_0]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:          %[[ARG1_0:.*]] = vector.extract %[[ARG_1]][0] : f32 from vector<2xf32>
+// CHECK:          %[[ARG2_0:.*]] = vector.extract %[[ARG_2]][0] : vector<[4]xi1> from vector<2x[4]xi1>
+// CHECK:          %[[REDUCE_0:.*]] = vector.mask %[[ARG2_0]] { vector.reduction <add>, %[[ARG0_0]], %[[ARG1_0]] : vector<[4]xf32> into f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[INSERT_0:.*]] = vector.insertelement %[[REDUCE_0]], %[[C0_2xf32]][%[[C0_idx]] : index] : vector<2xf32>
+// CHECK:          %[[ARG0_1:.*]] = vector.extract %[[ARG_0]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:          %[[ARG1_1:.*]] = vector.extract %[[ARG_1]][1] : f32 from vector<2xf32>
+// CHECK:          %[[ARG2_1:.*]] = vector.extract %[[ARG_2]][1] : vector<[4]xi1> from vector<2x[4]xi1>
+// CHECK:          %[[REDUCE_1:.*]] = vector.mask %[[ARG2_1]] { vector.reduction <add>, %[[ARG0_1]], %[[ARG1_1]] : vector<[4]xf32> into f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[INSERT_1:.*]] = vector.insertelement %[[REDUCE_1]], %[[INSERT_0]][%[[C1_idx]] : index] : vector<2xf32>
+// CHECK:          return %[[INSERT_1]] : vector<2xf32>
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
index 3f36756fe95d7..cd56c1bf9695b 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
@@ -2,6 +2,7 @@
 
 //-----------------------------------------------------------------------------
 // 1. vector.transfer_read
+// [Pattern: DropInnerMostUnitDimsTransferRead]
 //-----------------------------------------------------------------------------
 
 func.func @contiguous_inner_most(%src: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
@@ -306,6 +307,7 @@ func.func @negative_non_unit_strides(%src: memref<512x16x1xf32, strided<[8192, 1
 
 //-----------------------------------------------------------------------------
 // 2. vector.transfer_write
+// [Pattern: DropInnerMostUnitDimsTransferWrite]
 //-----------------------------------------------------------------------------
 
 func.func @contiguous_inner_most(%dest: memref<1x512x16x1x1xf32>, %v: vector<1x16x16x1x1xf32>, %i: index) {
diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
index 303f841e8a828..31c4775696b31 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt %s -test-vector-transfer-flatten-patterns -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -test-vector-transfer-flatten-patterns=target-vector-bitwidth=128 -split-input-file | FileCheck %s --check-prefix=CHECK-128B
 
+// TODO: Align naming and format with e.g. vector-transfer-permutation-lowering.mlir
+
 ///----------------------------------------------------------------------------------------
 /// vector.transfer_read
 /// [Pattern: FlattenContiguousRowMajorTransferReadPattern]
@@ -110,12 +112,12 @@ func.func @transfer_read_dims_mismatch_non_zero_indices(
 
 func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
     %arg : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>,
-    %idx0 : index,
-    %idx1 : index) -> vector<2x2xf32> {
+    %idx_1 : index,
+    %idx_2 : index) -> vector<2x2xf32> {
 
   %c0 = arith.constant 0 : index
   %cst_1 = arith.constant 0.000000e+00 : f32
-  %8 = vector.transfer_read %arg[%c0, %idx0, %idx1, %c0], %cst_1 {in_bounds = [true, true]} :
+  %8 = vector.transfer_read %arg[%c0, %idx_1, %idx_2, %c0], %cst_1 {in_bounds = [true, true]} :
     memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, vector<2x2xf32>
   return %8 : vector<2x2xf32>
 }
@@ -123,7 +125,8 @@ func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
 // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)>
 
 // CHECK-LABEL:  func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
-// CHECK:         %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>>
+// CHECK:         %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]]
+// CHECK-SAME:      : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>>
 // CHECK:         %[[APPLY:.*]] = affine.apply #[[$MAP]]()
 
 // CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
@@ -131,10 +134,42 @@ func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
 
 // -----
 
-// The input memref has a dynamic trailing shape and hence is not flattened.
-// TODO: This case could be supported via memref.dim
+// The leading dynamic shapes don't affect whether this example is flattenable
+// or not. Indeed, those dynamic shapes are not candidates for flattening anyway.
+
+func.func @transfer_read_leading_dynamic_dims(
+    %arg : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>,
+    %idx_1 : index,
+    %idx_2 : index) -> vector<8x4xi8> {
+
+  %c0_i8 = arith.constant 0 : i8
+  %c0 = arith.constant 0 : index
+  %result = vector.transfer_read %arg[%idx_1, %idx_2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} :
+    memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, vector<8x4xi8>
+  return %result : vector<8x4xi8>
+}
+
+// CHECK-LABEL: func @transfer_read_leading_dynamic_dims
+// CHECK-SAME:    %[[ARG0:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index
+// CHECK:         %[[C0_I8:.+]] = arith.constant 0 : i8
+// CHECK:         %[[C0:.+]] = arith.constant 0 : index
+// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG0]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
+// CHECK:         %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]]
+// CHECK-SAME:    [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]]
+// CHECK-SAME:    {in_bounds = [true]}
+// CHECK-SAME:      : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
+// CHECK:         %[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8>
+// CHECK:         return %[[VEC2D]] : vector<8x4xi8>
 
-func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
+// CHECK-128B-LABEL: func @transfer_read_leading_dynamic_dims
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
+// One of the dims to be flattened is dynamic - not supported ATM.
+
+func.func @negative_transfer_read_dynamic_dim_to_flatten(
     %idx_1: index,
     %idx_2: index,
     %m_in: memref<1x?x4x6xi32>) -> vector<1x2x6xi32> {
@@ -146,11 +181,11 @@ func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
   return %v : vector<1x2x6xi32>
 }
 
-// CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
+// CHECK-LABEL: func.func @negative_transfer_read_dynamic_dim_to_flatten
 // CHECK-NOT: memref.collapse_shape
 // CHECK-NOT: vector.shape_cast
 
-// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
+// CHECK-128B-LABEL: func @negative_transfer_read_dynamic_dim_to_flatten
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
@@ -326,11 +361,11 @@ func.func @transfer_write_dims_mismatch_non_zero_indices(
 func.func @transfer_write_dims_mismatch_non_contiguous_non_zero_indices(
     %value : vector<2x2xf32>,
     %subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>,
-    %idx0 : index,
-    %idx1 : index) {
+    %idx_1 : index,
+    %idx_2 : index) {
 
   %c0 = arith.constant 0 : index
-  vector.transfer_write %value, %subview[%c0, %idx0, %idx1, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>
+  vector.transfer_write %value, %subview[%c0, %idx_1, %idx_2, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>
   return
 }
 
@@ -345,10 +380,40 @@ func.func @transfer_write_dims_mismatch_non_contiguous_non_zero_indices(
 
 // -----
 
-// The input memref has a dynamic trailing shape and hence is not flattened.
-// TODO: This case could be supported via memref.dim
+// The leading dynamic shapes don't affect whether this example is flattenable
+// or not. Indeed, those dynamic shapes are not candidates for flattening anyway.
+
+func.func @transfer_write_leading_dynamic_dims(
+    %vec : vector<8x4xi8>,
+    %arg : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>,
+    %idx_1 : index,
+    %idx_2 : index) {
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %arg[%idx_1, %idx_2, %c0, %c0] {in_bounds = [true, true]} :
+    vector<8x4xi8>, memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>
+  return
+}
+
+// CHECK-LABEL: func @transfer_write_leading_dynamic_dims
+// CHECK-SAME:    %[[ARG0:.+]]: vector<8x4xi8>, %[[ARG1:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG2:.+]]: index, %[[ARG3:.+]]: index
+// CHECK:         %[[C0:.+]] = arith.constant 0 : index
+// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
+// CHECK:         %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8>
+// CHECK:         vector.transfer_write %[[VEC1D]], %[[COLLAPSED]]
+// CHECK-SAME:      [%[[ARG2]], %[[ARG3]], %[[C0]]]
+// CHECK-SAME:      {in_bounds = [true]}
+// CHECK-SAME:      : vector<32xi8>, memref<?x?x32xi8, {{.+}}>
+
+// CHECK-128B-LABEL: func @transfer_write_leading_dynamic_dims
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
+// One of the dims to be flattened is dynamic - not supported ATM.
 
-func.func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
+func.func @negative_transfer_write_dynamic_to_flatten(
     %idx_1: index,
     %idx_2: index,
     %vec : vector<1x2x6xi32>,
@@ -361,11 +426,11 @@ func.func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
   return
 }
 
-// CHECK-LABEL: func.func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
+// CHECK-LABEL: func.func @negative_transfer_write_dynamic_to_flatten
 // CHECK-NOT: memref.collapse_shape
 // CHECK-NOT: vector.shape_cast
 
-// CHECK-128B-LABEL: func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
+// CHECK-128B-LABEL: func @negative_transfer_write_dynamic_to_flatten
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
@@ -434,56 +499,10 @@ func.func @transfer_write_non_contiguous_src(
 // -----
 
 ///----------------------------------------------------------------------------------------
-/// TODO: Categorize + re-format
+/// [Pattern: DropUnitDimFromElementwiseOps]
+/// TODO: Move to a dedicated file - there's no "flattening" in the following tests
 ///----------------------------------------------------------------------------------------
 
-func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, %arg1 : index, %arg2 : index) -> vector<8x4xi8> {
-    %c0_i8 = arith.constant 0 : i8
-    %c0 = arith.constant 0 : index
-    %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, vector<8x4xi8>
-    return %result : vector<8x4xi8>
-}
-
-// CHECK-LABEL: func @transfer_read_flattenable_with_dynamic_dims_and_indices
-// CHECK-SAME:    %[[ARG0:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index
-// CHECK:       %[[C0_I8:.+]] = arith.constant 0 : i8
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG0]] {{\[}}[0], [1], [2, 3]{{\]}}
-// CHECK-SAME:    : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
-// CHECK:       %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]]
-// CHECK-SAME:    [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]]
-// CHECK-SAME:    {in_bounds = [true]}
-// CHECK-SAME:    : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
-// CHECK:       %[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8>
-// CHECK:       return %[[VEC2D]] : vector<8x4xi8>
-
-// CHECK-128B-LABEL: func @transfer_read_flattenable_with_dynamic_dims_and_indices(
-//       CHECK-128B:   memref.collapse_shape
-
-// -----
-
-func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vector<8x4xi8>, %dst : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, %arg1 : index, %arg2 : index) {
-    %c0 = arith.constant 0 : index
-    vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true]} : vector<8x4xi8>, memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>
-    return
-}
-
-// CHECK-LABEL: func @transfer_write_flattenable_with_dynamic_dims_and_indices
-// CHECK-SAME:    %[[ARG0:.+]]: vector<8x4xi8>, %[[ARG1:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG2:.+]]: index, %[[ARG3:.+]]: index
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
-// CHECK-SAME:    : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
-// CHECK:       %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8>
-// CHECK:       vector.transfer_write %[[VEC1D]], %[[COLLAPSED]]
-// CHECK-SAME:    [%[[ARG2]], %[[ARG3]], %[[C0]]]
-// CHECK-SAME:    {in_bounds = [true]}
-// CHECK-SAME:    : vector<32xi8>, memref<?x?x32xi8, {{.+}}>
-
-// CHECK-128B-LABEL: func @transfer_write_flattenable_with_dynamic_dims_and_indices(
-//       CHECK-128B:   memref.collapse_shape
-
-// -----
-
 func.func @fold_unit_dim_add_basic(%arg0 : vector<1x8xi32>) -> vector<1x8xi32> {
    %add = arith.addi %arg0, %arg0 : vector<1x8xi32>
    return %add : vector<1x8xi32>
diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
index b9dcb2b55e52e..15000d706adfc 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
@@ -12,18 +12,18 @@
 ///     _is_ a minor identity
 
 // CHECK-LABEL:   func.func @xfer_write_transposing_permutation_map
-// CHECK-SAME:       %[[ARG_0:.*]]: vector<4x8xi16>,
-// CHECK-SAME:       %[[MEM:.*]]: memref<2x2x8x4xi16>) {
-// CHECK:           %[[TR:.*]] = vector.transpose %[[ARG_0]], [1, 0] : vector<4x8xi16> to vector<8x4xi16>
+// CHECK-SAME:      %[[VEC:.*]]: vector<4x8xi16>,
+// CHECK-SAME:      %[[MEM:.*]]: memref<2x2x8x4xi16>) {
+// CHECK:           %[[TR:.*]] = vector.transpose %[[VEC]], [1, 0] : vector<4x8xi16> to vector<8x4xi16>
 // CHECK:           vector.transfer_write
 // CHECK-NOT:       permutation_map
 // CHECK-SAME:      %[[TR]], %[[MEM]]{{.*}} {in_bounds = [true, true]} : vector<8x4xi16>, memref<2x2x8x4xi16>
 func.func @xfer_write_transposing_permutation_map(
-    %arg0: vector<4x8xi16>,
+    %vec: vector<4x8xi16>,
     %mem: memref<2x2x8x4xi16>) {
 
   %c0 = arith.constant 0 : index
-  vector.transfer_write %arg0, %mem[%c0, %c0, %c0, %c0] {
+  vector.transfer_write %vec, %mem[%c0, %c0, %c0, %c0] {
     in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
   } : vector<4x8xi16>, memref<2x2x8x4xi16>
@@ -31,21 +31,46 @@ func.func @xfer_write_transposing_permutation_map(
   return
 }
 
+// Even with out-of-bounds, it is safe to apply this pattern
+// CHECK-LABEL:   func.func @xfer_write_transposing_permutation_map_out_of_bounds
+// CHECK-SAME:      %[[VEC:.*]]: vector<4x8xi16>,
+// CHECK-SAME:      %[[MEM:.*]]: memref<2x2x?x?xi16>) {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[TR:.*]] = vector.transpose %[[VEC]], [1, 0] : vector<4x8xi16> to vector<8x4xi16>
+// Expect the in_bounds attribute to be preserved. Since we don't print it when
+// all flags are "false", it should not appear in the output.
+// CHECK-NOT:       in_bounds
+// CHECK:           vector.transfer_write
+// CHECK-NOT:       permutation_map
+// CHECK-SAME:      %[[TR]], %[[MEM]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] : vector<8x4xi16>, memref<2x2x?x?xi16>
+func.func @xfer_write_transposing_permutation_map_out_of_bounds(
+    %vec: vector<4x8xi16>,
+    %mem: memref<2x2x?x?xi16>) {
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %mem[%c0, %c0, %c0, %c0] {
+    in_bounds = [false, false],
+    permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
+  } : vector<4x8xi16>, memref<2x2x?x?xi16>
+
+  return
+}
+
 // CHECK-LABEL:   func.func @xfer_write_transposing_permutation_map_with_mask_scalable
-// CHECK-SAME:      %[[ARG_0:.*]]: vector<4x[8]xi16>,
+// CHECK-SAME:      %[[VEC:.*]]: vector<4x[8]xi16>,
 // CHECK-SAME:      %[[MEM:.*]]: memref<2x2x?x4xi16>,
 // CHECK-SAME:      %[[MASK:.*]]: vector<[8]x4xi1>) {
-// CHECK:           %[[TR:.*]] = vector.transpose %[[ARG_0]], [1, 0] : vector<4x[8]xi16> to vector<[8]x4xi16>
+// CHECK:           %[[TR:.*]] = vector.transpose %[[VEC]], [1, 0] : vector<4x[8]xi16> to vector<[8]x4xi16>
 // CHECK:           vector.transfer_write
 // CHECK-NOT:       permutation_map
 // CHECK-SAME:      %[[TR]], %[[MEM]]{{.*}}, %[[MASK]] {in_bounds = [true, true]} : vector<[8]x4xi16>, memref<2x2x?x4xi16>
 func.func @xfer_write_transposing_permutation_map_with_mask_scalable(
-    %arg0: vector<4x[8]xi16>,
+    %vec: vector<4x[8]xi16>,
     %mem: memref<2x2x?x4xi16>,
     %mask: vector<[8]x4xi1>) {
 
   %c0 = arith.constant 0 : index
-  vector.transfer_write %arg0, %mem[%c0, %c0, %c0, %c0], %mask {
+  vector.transfer_write %vec, %mem[%c0, %c0, %c0, %c0], %mask {
     in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
   } : vector<4x[8]xi16>, memref<2x2x?x4xi16>
@@ -57,13 +82,13 @@ func.func @xfer_write_transposing_permutation_map_with_mask_scalable(
 // CHECK-LABEL:   func.func @xfer_write_transposing_permutation_map_masked
 // CHECK-NOT: vector.transpose
 func.func @xfer_write_transposing_permutation_map_masked(
-    %arg0: vector<4x8xi16>,
+    %vec: vector<4x8xi16>,
     %mem: memref<2x2x8x4xi16>,
     %mask: vector<8x4xi1>) {
 
   %c0 = arith.constant 0 : index
   vector.mask %mask {
-    vector.transfer_write %arg0, %mem[%c0, %c0, %c0, %c0] {
+    vector.transfer_write %vec, %mem[%c0, %c0, %c0, %c0] {
       in_bounds = [true, true],
       permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
     } : vector<4x8xi16>, memref<2x2x8x4xi16>
@@ -83,85 +108,144 @@ func.func @xfer_write_transposing_permutation_map_masked(
 ///   * vector.broadcast + vector.transpose + vector.transfer_write with a map
 ///     which _is_ a permutation of a minor identity
 
-// CHECK-LABEL: func @permutation_with_mask_xfer_write_fixed_width(
-//       CHECK:   %[[vec:.*]] = arith.constant dense<-2.000000e+00> : vector<7x1xf32>
-//       CHECK:   %[[mask:.*]] = arith.constant dense<[true, false, true, false, true, true, true]> : vector<7xi1>
-//       CHECK:   %[[b:.*]] = vector.broadcast %[[mask]] : vector<7xi1> to vector<1x7xi1>
-//       CHECK:   %[[tp:.*]] = vector.transpose %[[b]], [1, 0] : vector<1x7xi1> to vector<7x1xi1>
-//       CHECK:   vector.transfer_write %[[vec]], %{{.*}}[%{{.*}}, %{{.*}}], %[[tp]] {in_bounds = [false, true]} : vector<7x1xf32>, memref<?x?xf32>
-func.func @permutation_with_mask_xfer_write_fixed_width(%mem : memref<?x?xf32>, %base1 : index,
-                                                   %base2 : index) {
-
-  %fn1 = arith.constant -2.0 : f32
-  %vf0 = vector.splat %fn1 : vector<7xf32>
-  %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1>
-  vector.transfer_write %vf0, %mem[%base1, %base2], %mask
-    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false]}
-    : vector<7xf32>, memref<?x?xf32>
+// CHECK-LABEL:   func.func @xfer_write_non_transposing_permutation_map(
+// CHECK-SAME:      %[[MEM:.*]]: memref<?x?xf32>,
+// CHECK-SAME:      %[[VEC:.*]]: vector<7xf32>,
+// CHECK-SAME:      %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index) {
+// CHECK:           %[[BC:.*]] = vector.broadcast %[[VEC]] : vector<7xf32> to vector<1x7xf32>
+// CHECK:           %[[TR:.*]] = vector.transpose %[[BC]], [1, 0] : vector<1x7xf32> to vector<7x1xf32>
+// CHECK:           vector.transfer_write %[[TR]], %[[MEM]]{{\[}}%[[IDX_1]], %[[IDX_2]]] {in_bounds = [false, true]} : vector<7x1xf32>, memref<?x?xf32>
+func.func @xfer_write_non_transposing_permutation_map(
+    %mem : memref<?x?xf32>,
+    %vec : vector<7xf32>,
+    %idx_1 : index,
+    %idx_2 : index) {
+
+  vector.transfer_write %vec, %mem[%idx_1, %idx_2] {
+    permutation_map = affine_map<(d0, d1) -> (d0)>
+  } : vector<7xf32>, memref<?x?xf32>
+
+  return
+}
+
+// Even with out-of-bounds, it is safe to apply this pattern
+// CHECK-LABEL:   func.func @xfer_write_non_transposing_permutation_map_with_mask_out_of_bounds(
+// CHECK-SAME:      %[[MEM:.*]]: memref<?x?xf32>,
+// CHECK-SAME:      %[[VEC:.*]]: vector<7xf32>,
+// CHECK-SAME:      %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index,
+// CHECK-SAME:      %[[MASK:.*]]: vector<7xi1>) {
+// CHECK:           %[[BC_VEC:.*]] = vector.broadcast %[[VEC]] : vector<7xf32> to vector<1x7xf32>
+// CHECK:           %[[BC_MASK:.*]] = vector.broadcast %[[MASK]] : vector<7xi1> to vector<1x7xi1>
+// CHECK:           %[[TR_MASK:.*]] = vector.transpose %[[BC_MASK]], [1, 0] : vector<1x7xi1> to vector<7x1xi1>
+// CHECK:           %[[TR_VEC:.*]] = vector.transpose %[[BC_VEC]], [1, 0] : vector<1x7xf32> to vector<7x1xf32>
+// CHECK:           vector.transfer_write %[[TR_VEC]], %[[MEM]]{{\[}}%[[IDX_1]], %[[IDX_2]]], %[[TR_MASK]] {in_bounds = [false, true]} : vector<7x1xf32>, memref<?x?xf32>
+func.func @xfer_write_non_transposing_permutation_map_with_mask_out_of_bounds(
+    %mem : memref<?x?xf32>,
+    %vec : vector<7xf32>,
+    %idx_1 : index,
+    %idx_2 : index,
+    %mask : vector<7xi1>) {
+
+  vector.transfer_write %vec, %mem[%idx_1, %idx_2], %mask {
+    permutation_map = affine_map<(d0, d1) -> (d0)>,
+    in_bounds = [false]
+  } : vector<7xf32>, memref<?x?xf32>
+
   return
 }
 
 // CHECK:           func.func @permutation_with_mask_xfer_write_scalable(
-// CHECK-SAME:        %[[ARG_0:.*]]: vector<4x[8]xi16>,
-// CHECK-SAME:        %[[ARG_1:.*]]: memref<1x4x?x1xi16>,
+// CHECK-SAME:        %[[VEC:.*]]: vector<4x[8]xi16>,
+// CHECK-SAME:        %[[MEM:.*]]: memref<1x4x?x1xi16>,
 // CHECK-SAME:        %[[MASK:.*]]: vector<4x[8]xi1>) {
 // CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[BCAST_1:.*]] = vector.broadcast %[[ARG_0]] : vector<4x[8]xi16> to vector<1x4x[8]xi16>
-// CHECK:             %[[BCAST_2:.*]] = vector.broadcast %[[MASK]] : vector<4x[8]xi1> to vector<1x4x[8]xi1>
-// CHECK:             %[[TRANSPOSE_1:.*]] =  vector.transpose %[[BCAST_2]], [1, 2, 0] : vector<1x4x[8]xi1> to vector<4x[8]x1xi1>
-// CHECK:             %[[TRANSPOSE_2:.*]] =  vector.transpose %[[BCAST_1]], [1, 2, 0] : vector<1x4x[8]xi16> to vector<4x[8]x1xi16>
-// CHECK:             vector.transfer_write %[[TRANSPOSE_2]], %[[ARG_1]]{{.*}}, %[[TRANSPOSE_1]] {in_bounds = [true, true, true]} : vector<4x[8]x1xi16>, memref<1x4x?x1xi16>
-func.func @permutation_with_mask_xfer_write_scalable(%arg0: vector<4x[8]xi16>, %mem: memref<1x4x?x1xi16>, %mask:  vector<4x[8]xi1>){
-     %c0 = arith.constant 0 : index
-      vector.transfer_write %arg0, %mem[%c0, %c0, %c0, %c0], %mask {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-} : vector<4x[8]xi16>, memref<1x4x?x1xi16>
-
-    return
+// CHECK:             %[[BC_1:.*]] = vector.broadcast %[[VEC]] : vector<4x[8]xi16> to vector<1x4x[8]xi16>
+// CHECK:             %[[BC_2:.*]] = vector.broadcast %[[MASK]] : vector<4x[8]xi1> to vector<1x4x[8]xi1>
+// CHECK:             %[[TRANSPOSE_1:.*]] =  vector.transpose %[[BC_2]], [1, 2, 0] : vector<1x4x[8]xi1> to vector<4x[8]x1xi1>
+// CHECK:             %[[TRANSPOSE_2:.*]] =  vector.transpose %[[BC_1]], [1, 2, 0] : vector<1x4x[8]xi16> to vector<4x[8]x1xi16>
+// CHECK:             vector.transfer_write %[[TRANSPOSE_2]], %[[MEM]]{{.*}}, %[[TRANSPOSE_1]] {in_bounds = [true, true, true]} : vector<4x[8]x1xi16>, memref<1x4x?x1xi16>
+func.func @permutation_with_mask_xfer_write_scalable(
+    %vec: vector<4x[8]xi16>,
+    %mem: memref<1x4x?x1xi16>,
+    %mask: vector<4x[8]xi1>){
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %mem[%c0, %c0, %c0, %c0], %mask {
+    in_bounds = [true, true],
+    permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+  } : vector<4x[8]xi16>, memref<1x4x?x1xi16>
+
+  return
 }
 
 // transfer_write in MaskOp case not supported.
 // CHECK-LABEL: func @masked_permutation_xfer_write_fixed_width
-//  CHECK-SAME:        %[[ARG_0:.*]]: tensor<?x?xf32>,
-//  CHECK-SAME:        %[[ARG_1:.*]]: vector<16xf32>,
-//  CHECK-SAME:        %[[IDX:.*]]: index,
-//  CHECK-SAME:        %[[MASK:.*]]: vector<16xi1>
+//  CHECK-SAME:   %[[DEST:.*]]: tensor<?x?xf32>,
+//  CHECK-SAME:   %[[VEC:.*]]: vector<16xf32>,
+//  CHECK-SAME:   %[[IDX:.*]]: index,
+//  CHECK-SAME:   %[[MASK:.*]]: vector<16xi1>
 //   CHECK-NOT:   vector.transpose
-//       CHECK:   %[[RES:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[ARG_1]], %[[ARG_0]]{{.*}} vector<16xf32>, tensor<?x?xf32> } : vector<16xi1> -> tensor<?x?xf32>
-func.func @masked_permutation_xfer_write_fixed_width(%t: tensor<?x?xf32>, %val: vector<16xf32>, %idx: index, %mask: vector<16xi1>) -> tensor<?x?xf32> {
-  %r = vector.mask %mask { vector.transfer_write %val, %t[%idx, %idx] {permutation_map = affine_map<(d0, d1) -> (d0)>} : vector<16xf32>, tensor<?x?xf32> } : vector<16xi1> -> tensor<?x?xf32>
-  return %r : tensor<?x?xf32>
+//       CHECK:   vector.mask %[[MASK]] { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} vector<16xf32>, tensor<?x?xf32> } : vector<16xi1> -> tensor<?x?xf32>
+func.func @masked_permutation_xfer_write_fixed_width(
+    %dest: tensor<?x?xf32>,
+    %vec: vector<16xf32>,
+    %idx: index,
+    %mask: vector<16xi1>) -> tensor<?x?xf32> {
+
+  %res = vector.mask %mask {
+    vector.transfer_write %vec, %dest[%idx, %idx] {
+      permutation_map = affine_map<(d0, d1) -> (d0)>
+    } : vector<16xf32>, tensor<?x?xf32>
+  } : vector<16xi1> -> tensor<?x?xf32>
+
+  return %res : tensor<?x?xf32>
 }
 
 // CHECK-LABEL: func.func @masked_permutation_xfer_write_scalable(
-//  CHECK-SAME:        %[[ARG_0:.*]]: vector<4x[8]xi16>,
-//  CHECK-SAME:        %[[ARG_1:.*]]: tensor<?x?x?x?xf32>,
-//  CHECK-SAME:        %[[MASK:.*]]: vector<4x[8]xi1>)
-//  CHECK-SAME:        -> tensor<?x?x?x?xf32> {
+//  CHECK-SAME:   %[[VEC:.*]]: vector<4x[8]xi16>,
+//  CHECK-SAME:   %[[DEST:.*]]: tensor<?x?x?x?xf32>,
+//  CHECK-SAME:   %[[MASK:.*]]: vector<4x[8]xi1>)
+//  CHECK-SAME:   -> tensor<?x?x?x?xf32> {
 //   CHECK-NOT:   vector.transpose
-//       CHECK:   %[[R:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[ARG_0]], %[[ARG_1]]{{.*}} : vector<4x[8]xi16>, tensor<?x?x?x?xf32> } : vector<4x[8]xi1> -> tensor<?x?x?x?xf32>
-func.func @masked_permutation_xfer_write_scalable(%arg0: vector<4x[8]xi16>, %t: tensor<?x?x?x?xf32>, %mask:  vector<4x[8]xi1>) -> tensor<?x?x?x?xf32> {
-     %c0 = arith.constant 0 : index
-     %r = vector.mask %mask { vector.transfer_write %arg0, %t[%c0, %c0, %c0, %c0] {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-} : vector<4x[8]xi16>, tensor<?x?x?x?xf32> } : vector<4x[8]xi1> -> tensor<?x?x?x?xf32>
+//       CHECK:   vector.mask %[[MASK]] { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<4x[8]xi16>, tensor<?x?x?x?xf32> } : vector<4x[8]xi1> -> tensor<?x?x?x?xf32>
+func.func @masked_permutation_xfer_write_scalable(
+    %vec: vector<4x[8]xi16>,
+    %dest: tensor<?x?x?x?xf32>,
+    %mask:  vector<4x[8]xi1>) -> tensor<?x?x?x?xf32> {
 
-    return %r : tensor<?x?x?x?xf32>
+  %c0 = arith.constant 0 : index
+  %res = vector.mask %mask {
+    vector.transfer_write %vec, %dest[%c0, %c0, %c0, %c0] {
+      in_bounds = [true, true],
+      permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+    } : vector<4x[8]xi16>, tensor<?x?x?x?xf32>
+  } : vector<4x[8]xi1> -> tensor<?x?x?x?xf32>
+
+  return %res : tensor<?x?x?x?xf32>
 }
 
 // transfer_write in MaskOp case not supported.
 // CHECK-LABEL: func @masked_non_permutation_xfer_write_fixed_width
-//  CHECK-SAME:      %[[ARG0:.*]]: tensor<?x?x?x?xf32>
-//  CHECK-SAME:      %[[ARG1:.*]]: vector<14x8x16xf32>
-//  CHECK-SAME:      %[[IDX:.*]]: index) -> tensor<?x?x?x?xf32>
+//  CHECK-SAME:   %[[DEST:.*]]: tensor<?x?x?x?xf32>
+//  CHECK-SAME:   %[[VEC:.*]]: vector<14x8x16xf32>
+//  CHECK-SAME:   %[[IDX:.*]]: index) -> tensor<?x?x?x?xf32>
 //   CHECK-NOT:   vector.broadcast
-//       CHECK:   %[[masked1:.*]] = vector.mask %0 { vector.transfer_write %[[ARG1]], %[[ARG0]]{{.*}} : vector<14x8x16xf32>, tensor<?x?x?x?xf32> } : vector<14x8x16xi1> -> tensor<?x?x?x?xf32>
+//       CHECK:   vector.mask %0 { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<14x8x16xf32>, tensor<?x?x?x?xf32> } : vector<14x8x16xi1> -> tensor<?x?x?x?xf32>
 func.func @masked_non_permutation_xfer_write_fixed_width(
-    %arg0 : tensor<?x?x?x?xf32>,
-    %v1 : vector<14x8x16xf32>, %dim : index) -> tensor<?x?x?x?xf32> {
+    %dest : tensor<?x?x?x?xf32>,
+    %vec : vector<14x8x16xf32>,
+    %dim : index) -> tensor<?x?x?x?xf32> {
+
   %c0 = arith.constant 0 : index
   %mask = vector.create_mask %dim, %dim, %dim : vector<14x8x16xi1>
-  %0 = vector.mask %mask { vector.transfer_write %v1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>} : vector<14x8x16xf32>, tensor<?x?x?x?xf32> } : vector<14x8x16xi1> -> tensor<?x?x?x?xf32>
-
-  return %0 : tensor<?x?x?x?xf32>
+  %res = vector.mask %mask {
+    vector.transfer_write %vec, %dest[%c0, %c0, %c0, %c0] {
+      in_bounds = [false, false, true],
+      permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+    } : vector<14x8x16xf32>, tensor<?x?x?x?xf32>
+  } : vector<14x8x16xi1> -> tensor<?x?x?x?xf32>
+
+  return %res : tensor<?x?x?x?xf32>
 }
 
 ///----------------------------------------------------------------------------------------
@@ -174,80 +258,105 @@ func.func @masked_non_permutation_xfer_write_fixed_width(
 ///     vector.transpose op
 
 // CHECK-LABEL:   func.func @permutation_with_mask_xfer_read_fixed_width(
-// CHECK-SAME:      %[[ARG_0:.*]]: memref<?x?xf32>,
+// CHECK-SAME:      %[[MEM:.*]]: memref<?x?xf32>,
 // CHECK-SAME:      %[[IDX_1:.*]]: index,
 // CHECK-SAME:      %[[IDX_2:.*]]: index) -> vector<8x4x2xf32> {
 // CHECK:           %[[C0:.*]] = arith.constant 0 : index
 // CHECK:           %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[MASK:.*]] = vector.create_mask %[[IDX_2]], %[[IDX_1]] : vector<2x4xi1>
-// CHECK:           %[[T_READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x4xf32>
+// CHECK:           %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[C0]], %[[C0]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x4xf32>
 // CHECK:           %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x4xf32> to vector<8x2x4xf32>
 // CHECK:           %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x4xf32> to vector<8x4x2xf32>
 // CHECK:           return %[[TRANSPOSE]] : vector<8x4x2xf32>
-func.func @permutation_with_mask_xfer_read_fixed_width(%mem: memref<?x?xf32>, %dim_1: index, %dim_2: index) -> (vector<8x4x2xf32>) {
+func.func @permutation_with_mask_xfer_read_fixed_width(
+    %mem: memref<?x?xf32>,
+    %dim_1: index,
+    %dim_2: index) -> (vector<8x4x2xf32>) {
 
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
 
   %mask = vector.create_mask %dim_2, %dim_1 : vector<2x4xi1>
-  %1 = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask
-    {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>}
-    : memref<?x?xf32>, vector<8x4x2xf32>
-  return %1 : vector<8x4x2xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask {
+    in_bounds = [true, true, true],
+    permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>
+  } : memref<?x?xf32>, vector<8x4x2xf32>
+
+  return %res : vector<8x4x2xf32>
 }
 
 // CHECK-LABEL:   func.func @permutation_with_mask_xfer_read_scalable(
-// CHECK-SAME:      %[[ARG_0:.*]]: memref<?x?xf32>,
+// CHECK-SAME:      %[[MEM:.*]]: memref<?x?xf32>,
 // CHECK-SAME:      %[[IDX_1:.*]]: index,
 // CHECK-SAME:      %[[IDX_2:.*]]: index) -> vector<8x[4]x2xf32> {
 // CHECK:           %[[C0:.*]] = arith.constant 0 : index
 // CHECK:           %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[MASK:.*]] = vector.create_mask %[[IDX_2]], %[[IDX_1]] : vector<2x[4]xi1>
-// CHECK:           %[[T_READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x[4]xf32>
+// CHECK:           %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[C0]], %[[C0]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x[4]xf32>
 // CHECK:           %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x[4]xf32> to vector<8x2x[4]xf32>
 // CHECK:           %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x[4]xf32> to vector<8x[4]x2xf32>
 // CHECK:           return %[[TRANSPOSE]] : vector<8x[4]x2xf32>
-func.func @permutation_with_mask_xfer_read_scalable(%mem: memref<?x?xf32>, %dim_1: index, %dim_2: index) -> (vector<8x[4]x2xf32>) {
+func.func @permutation_with_mask_xfer_read_scalable(
+    %mem: memref<?x?xf32>,
+    %dim_1: index,
+    %dim_2: index) -> (vector<8x[4]x2xf32>) {
 
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
 
   %mask = vector.create_mask %dim_2, %dim_1 : vector<2x[4]xi1>
-  %1 = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask
-    {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>}
-    : memref<?x?xf32>, vector<8x[4]x2xf32>
-  return %1 : vector<8x[4]x2xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask {
+    in_bounds = [true, true, true],
+    permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>
+  } : memref<?x?xf32>, vector<8x[4]x2xf32>
+
+  return %res : vector<8x[4]x2xf32>
 }
 
 // transfer_read in MaskOp case not supported.
 // CHECK-LABEL: func @masked_permutation_xfer_read_fixed_width
-//  CHECK-SAME:        %[[ARG_0:.*]]: tensor<?x1xf32>,
-//  CHECK-SAME:        %[[ARG_1:.*]]: vector<4x1xi1>
+//  CHECK-SAME:   %[[DEST:.*]]: tensor<?x1xf32>,
+//  CHECK-SAME:   %[[MASK:.*]]: vector<4x1xi1>
 //   CHECK-NOT:   vector.transpose
-//       CHECK:   vector.mask %[[ARG_1]] { vector.transfer_read %[[ARG_0]]{{.*}}: tensor<?x1xf32>, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32>
-func.func @masked_permutation_xfer_read_fixed_width(%arg0: tensor<?x1xf32>, %mask : vector<4x1xi1>) {
+//       CHECK:   vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}}: tensor<?x1xf32>, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32>
+func.func @masked_permutation_xfer_read_fixed_width(
+    %dest: tensor<?x1xf32>,
+    %mask : vector<4x1xi1>) {
+
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %3 = vector.mask %mask { vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [false, true, false], permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)>} : tensor<?x1xf32>, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32>
-  call @test.some_use(%3) : (vector<1x4x4xf32>) -> ()
+  %3 = vector.mask %mask {
+    vector.transfer_read %dest[%c0, %c0], %cst {
+      in_bounds = [false, true, false],
+      permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)>
+    } : tensor<?x1xf32>, vector<1x4x4xf32>
+  } : vector<4x1xi1> -> vector<1x4x4xf32>
+
+  "test.some_use"(%3) : (vector<1x4x4xf32>) -> ()
+
   return
 }
-func.func private @test.some_use(vector<1x4x4xf32>)
 
 // CHECK-LABEL:  func.func @masked_permutation_xfer_read_scalable(
-//  CHECK-SAME:      %[[ARG_0:.*]]: tensor<?x?xf32>,
-//  CHECK-SAME:      %[[MASK:.*]]: vector<2x[4]xi1>) -> vector<8x[4]x2xf32> {
+//  CHECK-SAME:    %[[DEST:.*]]: tensor<?x?xf32>,
+//  CHECK-SAME:    %[[MASK:.*]]: vector<2x[4]xi1>) -> vector<8x[4]x2xf32> {
 //   CHECK-NOT:    vector.transpose
-//       CHECK:    %[[T_READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]]{{.*}} : tensor<?x?xf32>, vector<8x[4]x2xf32> } : vector<2x[4]xi1> -> vector<8x[4]x2xf32>
-func.func @masked_permutation_xfer_read_scalable(%t: tensor<?x?xf32>, %mask : vector<2x[4]xi1>) -> vector<8x[4]x2xf32> {
+//       CHECK:    %[[T_READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}} : tensor<?x?xf32>, vector<8x[4]x2xf32> } : vector<2x[4]xi1> -> vector<8x[4]x2xf32>
+func.func @masked_permutation_xfer_read_scalable(
+  %dest: tensor<?x?xf32>,
+  %mask : vector<2x[4]xi1>) -> vector<8x[4]x2xf32> {
 
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
 
-  %1 = vector.mask %mask { vector.transfer_read %t[%c0, %c0], %cst_0
-    {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>}
-    : tensor<?x?xf32>, vector<8x[4]x2xf32> } :vector<2x[4]xi1> -> vector<8x[4]x2xf32>
-  return %1 : vector<8x[4]x2xf32>
+  %res = vector.mask %mask {
+    vector.transfer_read %dest[%c0, %c0], %cst_0 {
+      in_bounds = [true, true, true],
+      permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>
+    } : tensor<?x?xf32>, vector<8x[4]x2xf32>
+  } :vector<2x[4]xi1> -> vector<8x[4]x2xf32>
+
+  return %res : vector<8x[4]x2xf32>
 }
 
 module attributes {transform.with_named_sequence} {
@@ -270,33 +379,46 @@ module attributes {transform.with_named_sequence} {
 
 //       CHECK:   #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>
 //       CHECK:   func.func @transfer_read_reduce_rank_scalable(
-//  CHECK-SAME:       %[[ARG_0:.*]]: memref<?x?x?x?xf32>) -> vector<8x[4]x2x3xf32> {
+//  CHECK-SAME:     %[[MEM:.*]]: memref<?x?x?x?xf32>) -> vector<8x[4]x2x3xf32> {
 //       CHECK:     %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:     %[[TFR:.*]] = vector.transfer_read %arg0[%[[C0]], %[[C0]], %[[C0]], %[[C0]]]{{.*}} permutation_map = #[[MAP]]} : memref<?x?x?x?xf32>, vector<[4]x2x3xf32>
-//       CHECK:     %[[BC:.*]] = vector.broadcast %[[TFR]] : vector<[4]x2x3xf32> to vector<8x[4]x2x3xf32>
+//       CHECK:     %[[T_READ:.*]] = vector.transfer_read %[[MEM]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]]{{.*}} permutation_map = #[[MAP]]} : memref<?x?x?x?xf32>, vector<[4]x2x3xf32>
+//       CHECK:     %[[BC:.*]] = vector.broadcast %[[T_READ]] : vector<[4]x2x3xf32> to vector<8x[4]x2x3xf32>
 //       CHECK:     return %[[BC]] : vector<8x[4]x2x3xf32>
-func.func @transfer_read_reduce_rank_scalable(%mem: memref<?x?x?x?xf32>) -> vector<8x[4]x2x3xf32> {
+func.func @transfer_read_reduce_rank_scalable(
+    %mem: memref<?x?x?x?xf32>) -> vector<8x[4]x2x3xf32> {
+
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
-  %1 = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst_0
-    {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>}
-    : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32>
-  return %1 : vector<8x[4]x2x3xf32>
+
+  %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst_0 {
+    in_bounds = [true, true, true, true],
+    permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
+  } : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32>
+
+  return %res : vector<8x[4]x2x3xf32>
 }
 
 // Masked case not supported.
 // CHECK-LABEL:   func.func @masked_transfer_read_reduce_rank(
-//  CHECK-SAME:       %[[ARG_0:.*]]: memref<?x?x?x?xf32>,
-//  CHECK-SAME:       %[[DIM:.*]]: index) -> vector<8x[4]x2x3xf32> {
+//  CHECK-SAME:     %[[MEM:.*]]: memref<?x?x?x?xf32>,
+//  CHECK-SAME:     %[[DIM:.*]]: index) -> vector<8x[4]x2x3xf32> {
 //   CHECK-NOT:     vector.broadcast
-//       CHECK:     %[[MASK:.*]] = vector.mask %0 { vector.transfer_read %arg0{{.*}} : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
-func.func @masked_transfer_read_reduce_rank(%mem: memref<?x?x?x?xf32>, %dim: index) -> vector<8x[4]x2x3xf32> {
+//       CHECK:     %[[MASK:.*]] = vector.mask %0 { vector.transfer_read %[[MEM]]{{.*}} : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
+func.func @masked_transfer_read_reduce_rank(
+    %mem: memref<?x?x?x?xf32>,
+    %dim: index) -> vector<8x[4]x2x3xf32> {
+
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
   %mask = vector.create_mask %dim, %dim: vector<[4]x3xi1>
-  %res = vector.mask %mask { vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst_0
-    {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>}
-    : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
+
+  %res = vector.mask %mask {
+    vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst_0 {
+      in_bounds = [true, true, true, true],
+      permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
+    } : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32>
+  } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
+
   return %res : vector<8x[4]x2x3xf32>
 }
 
diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
new file mode 100644
index 0000000000000..6e35010ca1c0b
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -0,0 +1,10 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(convert-to-llvm)" %s | FileCheck %s
+
+// CHECK-LABEL: @main
+// CHECK: llvm.intr.ctlz
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = math.ctlz %arg0 : i32
+    func.return %0 : i32
+  }
+}
diff --git a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
new file mode 100644
index 0000000000000..5ba1f1f869688
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt --pass-pipeline='builtin.module(builtin.module(func.func(cse,canonicalize),convert-to-llvm))' %s | FileCheck %s
+
+// CHECK-LABEL: llvm.func @func1
+// CHECK-NEXT: llvm.add
+// CHECK-NEXT: llvm.add
+// CHECK-NEXT: llvm.return
+module {
+  module {
+    func.func @func1(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      func.return %2 : i32
+    }
+  }
+
+  // CHECK-LABEL: @gpu_module
+  // CHECK-LABEL: gpu.func @func2
+  // CHECK-COUNT-3: arith.addi
+  // CHECK-NEXT: gpu.return
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
+}
+
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion.mlir b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
new file mode 100644
index 0000000000000..eec0d3fa57093
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
@@ -0,0 +1,27 @@
+// This file is left in-tree despite having no assertions so it can be
+// referenced by the tutorial text.
+
+// RUN: mlir-opt %s
+
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
new file mode 100644
index 0000000000000..e5c86b84e43e5
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion)" %s | FileCheck %s
+
+
+// CHECK-LABEL: @producer_consumer_fusion
+// CHECK-COUNT-1: affine.for
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
new file mode 100644
index 0000000000000..b5c06f83cfba3
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
@@ -0,0 +1,26 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion{fusion-compute-tolerance=0})" %s | FileCheck %s
+
+// CHECK-LABEL: @producer_consumer_fusion
+// CHECK-COUNT-3: affine.for
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
diff --git a/mlir/test/IR/attribute.mlir b/mlir/test/IR/attribute.mlir
index 291d5832fce79..362e98134ee4a 100644
--- a/mlir/test/IR/attribute.mlir
+++ b/mlir/test/IR/attribute.mlir
@@ -40,6 +40,10 @@ func.func @float_attrs_pass() {
     // CHECK: float_attr = 2.000000e+00 : f8E5M2
     float_attr = 2. : f8E5M2
   } : () -> ()
+  "test.float_attrs"() {
+    // CHECK: float_attr = 2.000000e+00 : f8E4M3
+    float_attr = 2. : f8E4M3
+  } : () -> ()
   "test.float_attrs"() {
     // CHECK: float_attr = 2.000000e+00 : f8E4M3FN
     float_attr = 2. : f8E4M3FN
diff --git a/mlir/test/IR/diagnostic-handler-metadata.mlir b/mlir/test/IR/diagnostic-handler-metadata.mlir
new file mode 100644
index 0000000000000..c089185cc3570
--- /dev/null
+++ b/mlir/test/IR/diagnostic-handler-metadata.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-diagnostic-metadata))" -verify-diagnostics -o - 2>&1 | FileCheck %s
+// COM: This test verifies that diagnostic handler can filter the diagnostic based on its metadata
+// COM: whether to emit the errors.
+
+// CHECK-LABEL: Test 'test'
+func.func @test() {
+  // expected-error @+1 {{test diagnostic metadata}}
+  "test.emit_error"() {
+    // CHECK: attr = "emit_error"
+    attr = "emit_error"
+  } : () -> ()
+
+  "test.do_not_emit_error"() {
+    // CHECK: attr = "do_not_emit_error"
+    attr = "do_not_emit_error"
+  } : () -> ()
+
+  return
+}
diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir
index 01ea856b03168..418b81dcbb034 100644
--- a/mlir/test/IR/properties.mlir
+++ b/mlir/test/IR/properties.mlir
@@ -2,10 +2,10 @@
 // # RUN: mlir-opt %s -mlir-print-op-generic -split-input-file  | mlir-opt -mlir-print-op-generic | FileCheck %s --check-prefix=GENERIC
 
 // CHECK:   test.with_properties
-// CHECK-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>{{$}}
+// CHECK-SAME: a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]{{$}}
 // GENERIC:   "test.with_properties"()
-// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}> : () -> ()
-test.with_properties <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo", c = "bar", flag = true}> : () -> ()
+test.with_properties a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
 
 // CHECK:   test.with_nice_properties
 // CHECK-SAME:    "foo bar" is -3{{$}}
@@ -34,18 +34,48 @@ test.using_property_in_custom [1, 4, 20]
 // GENERIC-SAME: }>
 test.using_property_ref_in_custom 1 + 4 = 5
 
-// CHECK:   test.with_default_valued_properties {{$}}
+// CHECK:   test.with_default_valued_properties na{{$}}
 // GENERIC: "test.with_default_valued_properties"()
-// GENERIC-SAME:  <{a = 0 : i32}>
-test.with_default_valued_properties <{a = 0 : i32}>
+// GENERIC-SAME: <{a = 0 : i32, b = "", c = -1 : i32, unit = false}> : () -> ()
+test.with_default_valued_properties 0 "" -1 unit_absent
+
+// CHECK:   test.with_default_valued_properties 1 "foo" 0 unit{{$}}
+// GENERIC: "test.with_default_valued_properties"()
+// GENERIC-SAME: <{a = 1 : i32, b = "foo", c = 0 : i32, unit}> : () -> ()
+test.with_default_valued_properties 1 "foo" 0 unit
 
 // CHECK:   test.with_optional_properties
-// CHECK-SAME:  <{b = 0 : i32}>
+// CHECK-SAME: simple = 0
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME:  <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0]}> : () -> ()
+test.with_optional_properties simple = 0
+
+// CHECK:   test.with_optional_properties{{$}}
 // GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME:  <{b = 0 : i32}>
-test.with_optional_properties <{b = 0 : i32}>
+// GENERIC-SAME: simple = []
+test.with_optional_properties
 
-// CHECK:   test.with_optional_properties {{$}}
+// CHECK:    test.with_optional_properties
+// CHECK-SAME: anAttr = 0 simple = 1 nonTrivialStorage = "foo" hasDefault = some<0> nested = some<1>  longSyntax = some<"bar"> hasUnit maybeUnit = some<unit>
 // GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME:  : () -> ()
+// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1]}> : () -> ()
 test.with_optional_properties
+  anAttr = 0
+  simple = 1
+  nonTrivialStorage = "foo"
+  hasDefault = some<0>
+  nested = some<1>
+  longSyntax = some<"bar">
+  hasUnit
+  maybeUnit = some<unit>
+
+// CHECK:    test.with_optional_properties
+// CHECK-SAME: nested = some<none>
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: nested = {{\[}}[]]
+test.with_optional_properties nested = some<none>
+
+// CHECK:    test.with_array_properties
+// CHECK-SAME: ints = [1, 2] strings = ["a", "b"] nested = {{\[}}[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent]
+// GENERIC: "test.with_array_properties"()
+test.with_array_properties ints = [1, 2] strings = ["a", "b"] nested = [[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent] [] thats_has_default
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir
index 1e046706379cd..49cfd7e496746 100644
--- a/mlir/test/IR/traits.mlir
+++ b/mlir/test/IR/traits.mlir
@@ -502,6 +502,25 @@ func.func @succeededOilistTrivial() {
 
 // -----
 
+// CHECK-LABEL: @succeededOilistTrivialProperties
+func.func @succeededOilistTrivialProperties() {
+  // CHECK: test.oilist_with_keywords_only_properties keyword
+  test.oilist_with_keywords_only_properties keyword
+  // CHECK: test.oilist_with_keywords_only_properties otherKeyword
+  test.oilist_with_keywords_only_properties otherKeyword
+  // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+  test.oilist_with_keywords_only_properties keyword otherKeyword
+  // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+  test.oilist_with_keywords_only_properties otherKeyword keyword
+  // CHECK: test.oilist_with_keywords_only_properties thirdKeyword
+  test.oilist_with_keywords_only_properties thirdKeyword
+  // CHECK: test.oilist_with_keywords_only_properties keyword thirdKeyword
+  test.oilist_with_keywords_only_properties keyword thirdKeyword
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @succeededOilistSimple
 func.func @succeededOilistSimple(%arg0 : i32, %arg1 : i32, %arg2 : i32) {
   // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
new file mode 100644
index 0000000000000..7cdb35918c4c0
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -0,0 +1,175 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_1d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
+// DEFINE:    -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-F32
+
+// REDEFINE: %{entry_point} = reduce_1d_i32
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-I32
+
+// REDEFINE: %{entry_point} = generic_reduce_1d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-F32
+
+func.func @reduce_1d_f32() {
+  // 1-D Tensor
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+  %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+  // Reduce
+  %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+
+  // Print and verify the output
+  // REDUCE-F32-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // REDUCE-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+  // REDUCE-F32-NEXT: [3141.6]
+
+  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // REDUCE-F32-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @reduce_1d_i32() {
+  // 1-D Tensor
+  %N = arith.constant 1000 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xi32>
+  %C_alloc = bufferization.alloc_tensor() : tensor<i32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3 : i32
+  %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?xi32>) -> tensor<?xi32>
+  %C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor<i32>
+
+  // Reduce
+  %C_out = linalg.reduce ins(%A_in : tensor<?xi32>) outs(%C_in: tensor<i32>) dimensions = [0]
+    (%in: i32, %init: i32) {
+      %0 = arith.addi %in, %init : i32
+      linalg.yield %0 : i32
+    }
+
+  // Print and verify the output
+  // REDUCE-I32-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // REDUCE-I32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+  // REDUCE-I32-NEXT: [3000]
+
+  %xf = tensor.cast %C_out : tensor<i32> to tensor<*xi32>
+  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+  // REDUCE-I32-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @generic_reduce_1d_f32() {
+  // 1-D Tensor
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+  %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+  // Reduce
+  %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
+                                             affine_map<(d0) -> ()>],
+                            iterator_types = ["reduction"] }
+    ins(%A_in : tensor<?xf32>)
+    outs(%C_in : tensor<f32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<f32>
+
+  // Print and verify the output
+  // GENERIC-F32-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // GENERIC-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+  // GENERIC-F32-NEXT: [3141.6]
+
+  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // GENERIC-F32-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  // A sequence that will tile and vectorise a Reduce Op
+  transform.named_sequence @tile_and_vectorize_reduce(%func
+    : !transform.op<"func.func"> {transform.readonly}) {
+
+    // Step 0: Get a handle to the reduce Op
+    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+      : (!transform.op<"func.func">) -> !transform.any_op
+
+    // Step 1: Tile
+    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize
+    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op
+
+    // Step 3: Lower vector.multi_reduction
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_masked_transfers
+      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+    } : !transform.op<"func.func">
+
+    transform.yield
+  }
+
+  // A sequence that goes over all functions in tis module and applies
+  // "tile_and_vectorize_reduce"
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %funcs = transform.structured.match ops{["func.func"]} in %module
+        : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.foreach %funcs : !transform.op<"func.func"> {
+      ^bb2(%func : !transform.op<"func.func">):
+        transform.include @tile_and_vectorize_reduce failures(propagate)
+        (%func) : (!transform.op<"func.func">) -> ()
+    }
+    transform.yield
+  }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
new file mode 100644
index 0000000000000..bcfe12e374b4e
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
@@ -0,0 +1,180 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_2d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
+// DEFINE:    -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+
+// REDEFINE: %{entry_point} = generic_reduce_2d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+
+func.func @reduce_2d_f32() {
+  // 2-D Tensor
+  %M = arith.constant 16 : index
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+  // Reduce
+  %C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+
+  // Print and verify the output
+  // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+  // REDUCE-NEXT: [3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6]
+
+  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @generic_reduce_2d_f32() {
+  // 2-D Tensor
+  %M = arith.constant 16 : index
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+  // Reduce
+  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                             affine_map<(d0, d1) -> (d0)>],
+                            iterator_types = ["parallel", "reduction"] }
+    ins(%A_in : tensor<?x?xf32>)
+    outs(%C_in : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+
+  // Print and verify the output
+  // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+  // GENERIC-NEXT: [3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6]
+
+  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @generic_reduce_2d_i32() {
+  // 2-D Tensor
+  %M = arith.constant 16 : index
+  %N = arith.constant 1000 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
+  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xi32>
+
+  // Initialise the tensors
+  %pi = arith.constant 3 : i32
+  %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
+  %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?xi32>) -> tensor<?xi32>
+
+  // Reduce
+  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                             affine_map<(d0, d1) -> (d0)>],
+                            iterator_types = ["parallel", "reduction"] }
+    ins(%A_in : tensor<?x?xi32>)
+    outs(%C_in : tensor<?xi32>) {
+    ^bb(%in: i32, %out: i32) :
+      %0 = arith.addi %in, %out : i32
+      linalg.yield %0 : i32
+    } -> tensor<?xi32>
+
+  // Print and verify the output
+  // GENERIC-I32-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // GENERIC-I32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+  // GENERIC-I32-NEXT: [3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000,  3000]
+
+  %xf = tensor.cast %C_out : tensor<?xi32> to tensor<*xi32>
+  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+  // GENERIC-I32-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+
+module attributes {transform.with_named_sequence} {
+  // A sequence that will tile and vectorise a Reduce Op
+  transform.named_sequence @tile_and_vectorize_reduce(%func
+    : !transform.op<"func.func"> {transform.readonly}) {
+
+    // Step 0: Get a handle to the reduce Op
+    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+      : (!transform.op<"func.func">) -> !transform.any_op
+
+    // Step 1: Tile
+    %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize
+    transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op
+
+    // Step 3: Lower vector.multi_reduction
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_masked_transfers
+      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+    } : !transform.op<"func.func">
+
+    transform.yield
+  }
+
+  // A sequence that goes over all functions in tis module and applies
+  // "tile_and_vectorize_reduce"
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %funcs = transform.structured.match ops{["func.func"]} in %module
+        : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.foreach %funcs : !transform.op<"func.func"> {
+      ^bb2(%func : !transform.op<"func.func">):
+        transform.include @tile_and_vectorize_reduce failures(propagate)
+        (%func) : (!transform.op<"func.func">) -> ()
+    }
+    transform.yield
+  }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
index 11ab30a7d237c..d1aed593f4545 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -428,7 +428,7 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
 //       CHECK: func @matmul_sequence_fusion(
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
diff --git a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
index 7d247aefcf6b1..ccf8e37c094f4 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
@@ -31,8 +31,8 @@ module attributes {transform.with_named_sequence} {
 //   CHECK-DAG:   %[[DIM_IN1:.+]] = tensor.dim %[[IN]], %[[C1]]
 //   CHECK-DAG:   %[[DIM1:.+]] = affine.apply #[[MAP1]]()[%[[DIM_IN1]]]
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
 //       CHECK:   %[[RESULT:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[DIM0]] step %[[C2]]
-//       CHECK:     %[[C3:.+]] = arith.constant 3 : index
 //       CHECK:     scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
 //       CHECK:       %[[SWAP_RESULT:.*]] = scf.if
 //       CHECK:         tensor.generate
@@ -62,8 +62,8 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//   CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 8)>
-//   CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 7)>
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 + 8)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 + 7)>
 //       CHECK: func @dynamic_2d_pad_tensor_inner_tiling(
 //  CHECK-SAME:     %[[IN:.*]]: tensor<?x?xf32>
 //   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
@@ -107,9 +107,9 @@ module attributes {transform.with_named_sequence} {
 //   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C15:.*]] = arith.constant 15 : index
 //   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
 //       CHECK:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]]
-//   CHECK-DAG:     %[[C16:.*]] = arith.constant 16 : index
-//   CHECK-DAG:     %[[C3:.*]] = arith.constant 3 : index
 //       CHECK:     scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
 //       CHECK:       %[[SWAP_RESULT:.*]] = scf.if
 //       CHECK:         tensor.generate
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
index 488a52e8e3e91..8eb1311170c66 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
@@ -16,21 +16,21 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
 //      CHECK-LABEL: func.func @simple_matmul(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
 //  CHECK-DAG:   %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
 //  CHECK-DAG:   %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
 //  CHECK-DAG:   %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
 //      CHECK:   %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
 // CHECK-SAME:       iter_args(%[[INIT0:.+]] = %[[ARG2]])
-//  CHECK-DAG:     %[[C20:.+]] = arith.constant 20 : index
 //      CHECK:     %[[INNER:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
 // CHECK-SAME:         iter_args(%[[INIT1:.+]] = %[[INIT0]])
 //  CHECK-DAG:       %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
@@ -68,23 +68,23 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
 //      CHECK-LABEL: func.func @simple_matmul_memref(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: memref<?x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: memref<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
 //  CHECK-DAG:   %[[M:.+]] = memref.dim %[[ARG0]], %[[C0]]
 //  CHECK-DAG:   %[[K:.+]] = memref.dim %[[ARG0]], %[[C1]]
 //  CHECK-DAG:   %[[N:.+]] = memref.dim %[[ARG1]], %[[C1]]
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
-//  CHECK-DAG:     %[[C20:.+]] = arith.constant 20 : index
 //      CHECK:     scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
-//  CHECK-DAG:       %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:       scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
 //  CHECK-DAG:         %[[TS_M:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
 //  CHECK-DAG:         %[[TS_N:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[N]]]
@@ -127,18 +127,18 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//   CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)>
+//   CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)>
 // CHECK-LABEL: func.func @multi_result(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>)
-//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
-//   CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 //   CHECK-DAG:   %[[INIT0:.+]] = tensor.empty()
 //   CHECK-DAG:   %[[INIT1:.+]] = tensor.empty()
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
+//   CHECK-DAG:   %[[C300:.+]] = arith.constant 300 : index
+//   CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
 //       CHECK:   %[[OUTER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C10]]
 //  CHECK-SAME:       iter_args(%[[ARG1:[a-zA-Z0-9]+]] = %[[INIT0]], %[[ARG2:[a-zA-Z0-9]+]] = %[[INIT1]])
-//   CHECK-DAG:     %[[C300:.+]] = arith.constant 300 : index
-//   CHECK-DAG:     %[[C20:.+]] = arith.constant 20 : index
 //       CHECK:     %[[INNER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[C300]] step %[[C20]]
 //  CHECK-SAME:         iter_args(%[[ARG3:[a-zA-Z0-9]+]] = %[[ARG1]], %[[ARG4:[a-zA-Z0-9]+]] = %[[ARG2]])
 //   CHECK-DAG:       %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])
@@ -180,9 +180,9 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
 //  CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
 //  CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
 //      CHECK-LABEL: func.func @conv2D(
@@ -193,7 +193,6 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //  CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
-//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
 //  CHECK-DAG:   %[[N:.+]] = tensor.dim %[[INPUT]], %[[C0]]
 //  CHECK-DAG:   %[[C:.+]] = tensor.dim %[[INPUT]], %[[C3]]
 //  CHECK-DAG:   %[[P:.+]] = tensor.dim %[[FILTER]], %[[C0]]
@@ -201,12 +200,13 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-DAG:   %[[F:.+]] = tensor.dim %[[FILTER]], %[[C3]]
 //  CHECK-DAG:   %[[R:.+]] = tensor.dim %[[INIT]], %[[C1]]
 //  CHECK-DAG:   %[[S:.+]] = tensor.dim %[[INIT]], %[[C2]]
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[P]] step %[[C10]]
 // CHECK-SAME:       iter_args(%[[INIT0:.+]] = %[[INIT]])
-//  CHECK-DAG:     %[[C20:.+]] = arith.constant 20 : index
 //      CHECK:     scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[Q]] step %[[C20]]
 // CHECK-SAME:         iter_args(%[[INIT1:.+]] = %[[INIT0]])
-//  CHECK-DAG:       %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:       scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[C]] step %[[C30]]
 // CHECK-SAME:           iter_args(%[[INIT2:.+]] = %[[INIT1]])
 //  CHECK-DAG:         %[[TS_P:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[P]]]
@@ -287,25 +287,25 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
+//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
 //      CHECK-LABEL: func.func @interchange_matmul(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
 //  CHECK-DAG:   %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
 //  CHECK-DAG:   %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
 //  CHECK-DAG:   %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:   %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
 // CHECK-SAME:       iter_args(%[[INIT0:.+]] = %[[ARG2]])
-//  CHECK-DAG:     %[[C30:.+]] = arith.constant 30 : index
 //      CHECK:     %[[INNER1:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
 // CHECK-SAME:         iter_args(%[[INIT1:.+]] = %[[INIT0]])
-//  CHECK-DAG:       %[[C10:.+]] = arith.constant 10 : index
 //      CHECK:       %[[INNER2:[a-zA-Z0-9]+]] = scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
 // CHECK-SAME:           iter_args(%[[INIT2:.+]] = %[[INIT1]])
 //  CHECK-DAG:         %[[TS_N:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[N]]]
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
index c5aff744b57ee..53dd0c6a2425c 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
@@ -17,8 +17,8 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
 //      CHECK: func.func @simple_matmul(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
@@ -65,8 +65,8 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
 //      CHECK-LABEL: func.func @simple_matmul_memref(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: memref<?x?xf32>
@@ -117,7 +117,7 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)>
 //      CHECK-LABEL: func.func @multi_result(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>)
 //  CHECK-DAG:   %[[INIT0:.+]] = tensor.empty()
@@ -161,9 +161,9 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
 //  CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
 //  CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
 //      CHECK-LABEL: func.func @conv2D(
@@ -264,8 +264,8 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+//  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+//  CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
 //      CHECK-LABEL: func.func @interchange_matmul(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll
index a194ecbf2eb20..03c3855a9a324 100644
--- a/mlir/test/Target/LLVMIR/Import/debug-info.ll
+++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll
@@ -781,7 +781,7 @@ define void @string_type(ptr %arg1) {
 !6 = !DIStringType(name: "character(*)", stringLength: !4, size: 32, align: 8, stringLengthExpression: !8, stringLocationExpression: !7)
 !7 = !DIExpression(DW_OP_push_object_address, DW_OP_deref)
 !8 = !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 8)
-!9 = !DILocalVariable(scope: !5, name: "str", file: !2, type: !6);
+!9 = !DILocalVariable(scope: !5, name: "str", file: !2, type: !6, flags: 64);
 !10 = !DILocation(line: 1, column: 2, scope: !5)
 
 ; CHECK: #[[VAR:.+]] = #llvm.di_local_variable<{{.*}}name = "string_size"{{.*}}>
@@ -791,3 +791,4 @@ define void @string_type(ptr %arg1) {
 ; CHECK-SAME: stringLength = #[[VAR]]
 ; CHECK-SAME: stringLengthExp = <[DW_OP_push_object_address, DW_OP_plus_uconst(8)]>
 ; CHECK-SAME: stringLocationExp = <[DW_OP_push_object_address, DW_OP_deref]>>
+; CHECK: #di_local_variable1 = #llvm.di_local_variable<scope = #di_subprogram, name = "str", file = #di_file, type = #di_string_type, flags = Artificial>
diff --git a/mlir/test/Target/LLVMIR/Import/function-attributes.ll b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
index 6c38979a0a719..912f448657baa 100644
--- a/mlir/test/Target/LLVMIR/Import/function-attributes.ll
+++ b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
@@ -13,14 +13,14 @@ define internal spir_func void @spir_func_internal() {
 ; // -----
 
 ; CHECK-LABEL: @func_readnone
-; CHECK-SAME:  attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
+; CHECK-SAME:  attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
 ; CHECK:   llvm.return
 define void @func_readnone() readnone {
   ret void
 }
 
 ; CHECK-LABEL: @func_readnone_indirect
-; CHECK-SAME:  attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
+; CHECK-SAME:  attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
 declare void @func_readnone_indirect() #0
 attributes #0 = { readnone }
 
@@ -152,7 +152,7 @@ define void @entry_count() !prof !1 {
 ; // -----
 
 ; CHECK-LABEL: @func_memory
-; CHECK-SAME:  attributes {memory = #llvm.memory_effects<other = readwrite, argMem = none, inaccessibleMem = readwrite>}
+; CHECK-SAME:  attributes {memory_effects = #llvm.memory_effects<other = readwrite, argMem = none, inaccessibleMem = readwrite>}
 ; CHECK:   llvm.return
 define void @func_memory() memory(readwrite, argmem: none) {
   ret void
diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll
index 005aafb20a510..3b1dcee1e85c7 100644
--- a/mlir/test/Target/LLVMIR/Import/instructions.ll
+++ b/mlir/test/Target/LLVMIR/Import/instructions.ll
@@ -528,6 +528,64 @@ define void @varargs_call(i32 %0) {
 
 ; // -----
 
+; CHECK: llvm.func @f()
+declare void @f()
+
+; CHECK-LABEL: @call_convergent
+define void @call_convergent() {
+; CHECK: llvm.call @f() {convergent}
+  call void @f() convergent
+  ret void
+}
+
+; // -----
+
+; CHECK: llvm.func @f()
+declare void @f()
+
+; CHECK-LABEL: @call_no_unwind
+define void @call_no_unwind() {
+; CHECK: llvm.call @f() {no_unwind}
+  call void @f() nounwind
+  ret void
+}
+
+; // -----
+
+; CHECK: llvm.func @f()
+declare void @f()
+
+; CHECK-LABEL: @call_will_return
+define void @call_will_return() {
+; CHECK: llvm.call @f() {will_return}
+  call void @f() willreturn
+  ret void
+}
+
+; // -----
+
+; CHECK: llvm.func @f()
+declare void @f()
+
+; CHECK-LABEL: @call_memory_effects
+define void @call_memory_effects() {
+; CHECK: llvm.call @f() {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
+  call void @f() memory(none)
+; CHECK: llvm.call @f() {memory_effects = #llvm.memory_effects<other = none, argMem = write, inaccessibleMem = read>}
+  call void @f() memory(none, argmem: write, inaccessiblemem: read)
+; CHECK: llvm.call @f() {memory_effects = #llvm.memory_effects<other = write, argMem = none, inaccessibleMem = write>}
+  call void @f() memory(write, argmem: none)
+; CHECK: llvm.call @f() {memory_effects = #llvm.memory_effects<other = readwrite, argMem = readwrite, inaccessibleMem = read>}
+  call void @f() memory(readwrite, inaccessiblemem: read)
+; CHECK: llvm.call @f()
+; CHECK-NOT: #llvm.memory_effects
+; CHECK-SAME: : () -> ()
+  call void @f() memory(readwrite)
+  ret void
+}
+
+; // -----
+
 %sub_struct = type { i32, i8 }
 %my_struct = type { %sub_struct, [4 x i32] }
 
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index f89ef5af87f4b..1a9a8561de00d 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -603,7 +603,7 @@ llvm.func @subranges(%arg: !llvm.ptr) {
  file = #file, isOptimized = false, emissionKind = Full>
 #sp = #llvm.di_subprogram<compileUnit = #cu, scope = #file, name = "test",
  file = #file, subprogramFlags = Definition>
-#var = #llvm.di_local_variable<scope = #sp, name = "string_size", type = #bt>
+#var = #llvm.di_local_variable<scope = #sp, name = "string_size", type = #bt, flags = Artificial>
 #ty = #llvm.di_string_type<tag = DW_TAG_string_type, name = "character(*)",
  sizeInBits = 32, alignInBits = 8, stringLength = #var,
  stringLengthExp = <[DW_OP_push_object_address, DW_OP_plus_uconst(8)]>,
@@ -620,4 +620,4 @@ llvm.func @string_ty(%arg0: !llvm.ptr) {
 #loc2 = loc(fused<#sp>[#loc1])
 
 // CHECK-DAG: !DIStringType(name: "character(*)", stringLength: ![[VAR:[0-9]+]], stringLengthExpression: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 8), stringLocationExpression: !DIExpression(DW_OP_push_object_address, DW_OP_deref), size: 32, align: 8)
-// CHECK-DAG: ![[VAR]] = !DILocalVariable(name: "string_size"{{.*}})
+// CHECK-DAG: ![[VAR]] = !DILocalVariable(name: "string_size"{{.*}} flags: DIFlagArtificial)
diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir
index c85fa0101c00d..3e533211b0d0c 100644
--- a/mlir/test/Target/LLVMIR/llvmir-types.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir
@@ -20,8 +20,6 @@ llvm.func @return_fp128() -> f128
 llvm.func @return_x86_fp80() -> f80
 // CHECK: declare ppc_fp128 @return_ppc_fp128()
 llvm.func @return_ppc_fp128() -> !llvm.ppc_fp128
-// CHECK: declare x86_mmx @return_x86_mmx()
-llvm.func @return_x86_mmx() -> !llvm.x86_mmx
 
 //
 // Functions.
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 132a8eb668eba..db54d131299c6 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -39,6 +39,9 @@ llvm.mlir.global internal constant @string_const("foobar") : !llvm.array<6 x i8>
 // CHECK: @int_global_undef = internal global i64 undef
 llvm.mlir.global internal @int_global_undef() : i64
 
+// CHECK: @f8E4M3_global_as_i8 = internal global i8 60
+llvm.mlir.global internal @f8E4M3_global_as_i8(1.5 : f8E4M3) : i8
+
 // CHECK: @f8E4M3FN_global_as_i8 = internal global i8 60
 llvm.mlir.global internal @f8E4M3FN_global_as_i8(1.5 : f8E4M3FN) : i8
 
@@ -2273,7 +2276,7 @@ llvm.func @readonly_function(%arg0: !llvm.ptr {llvm.readonly})
 
 // CHECK: declare void @arg_mem_none_func() #[[ATTR:[0-9]+]]
 llvm.func @arg_mem_none_func() attributes {
-  memory = #llvm.memory_effects<other = readwrite, argMem = none, inaccessibleMem = readwrite>}
+  memory_effects = #llvm.memory_effects<other = readwrite, argMem = none, inaccessibleMem = readwrite>}
 
 // CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none) }
 
@@ -2281,7 +2284,7 @@ llvm.func @arg_mem_none_func() attributes {
 
 // CHECK: declare void @readwrite_func() #[[ATTR:[0-9]+]]
 llvm.func @readwrite_func() attributes {
-  memory = #llvm.memory_effects<other = readwrite, argMem = readwrite, inaccessibleMem = readwrite>}
+  memory_effects = #llvm.memory_effects<other = readwrite, argMem = readwrite, inaccessibleMem = readwrite>}
 
 // CHECK: attributes #[[ATTR]] = { memory(readwrite) }
 
@@ -2474,3 +2477,75 @@ llvm.func @willreturn() attributes { will_return } {
 
 // CHECK: #[[ATTRS]]
 // CHECK-SAME: willreturn
+
+// -----
+
+llvm.func @f()
+
+// CHECK-LABEL: @convergent_call
+// CHECK: call void @f() #[[ATTRS:[0-9]+]]
+llvm.func @convergent_call() {
+  llvm.call @f() {convergent} : () -> ()
+  llvm.return
+}
+
+// CHECK: #[[ATTRS]]
+// CHECK-SAME: convergent
+
+// -----
+
+llvm.func @f()
+
+// CHECK-LABEL: @nounwind_call
+// CHECK: call void @f() #[[ATTRS:[0-9]+]]
+llvm.func @nounwind_call() {
+  llvm.call @f() {no_unwind} : () -> ()
+  llvm.return
+}
+
+// CHECK: #[[ATTRS]]
+// CHECK-SAME: nounwind
+
+// -----
+
+llvm.func @f()
+
+// CHECK-LABEL: @willreturn_call
+// CHECK: call void @f() #[[ATTRS:[0-9]+]]
+llvm.func @willreturn_call() {
+  llvm.call @f() {will_return} : () -> ()
+  llvm.return
+}
+
+// CHECK: #[[ATTRS]]
+// CHECK-SAME: willreturn
+
+// -----
+
+llvm.func @fa()
+llvm.func @fb()
+llvm.func @fc()
+llvm.func @fd()
+
+// CHECK-LABEL: @mem_effects_call
+// CHECK: call void @fa() #[[ATTRS_0:[0-9]+]]
+// CHECK: call void @fb() #[[ATTRS_1:[0-9]+]]
+// CHECK: call void @fc() #[[ATTRS_2:[0-9]+]]
+// CHECK: call void @fd() #[[ATTRS_3:[0-9]+]]
+llvm.func @mem_effects_call() {
+  llvm.call @fa() {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>} : () -> ()
+  llvm.call @fb() {memory_effects = #llvm.memory_effects<other = read, argMem = none, inaccessibleMem = write>} : () -> ()
+  llvm.call @fc() {memory_effects = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = write>} : () -> ()
+  llvm.call @fd() {memory_effects = #llvm.memory_effects<other = readwrite, argMem = read, inaccessibleMem = readwrite>} : () -> ()
+  llvm.return
+
+}
+
+// CHECK: #[[ATTRS_0]]
+// CHECK-SAME: memory(none)
+// CHECK: #[[ATTRS_1]]
+// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write)
+// CHECK: #[[ATTRS_2]]
+// CHECK-SAME: memory(read, inaccessiblemem: write)
+// CHECK: #[[ATTRS_3]]
+// CHECK-SAME: memory(readwrite, argmem: read)
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
new file mode 100644
index 0000000000000..c386342005e5e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+  llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
+    %0 = llvm.mlir.constant(39 : index) : i64
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %2 = llvm.mlir.constant(1 : index) : i64
+    %3 = llvm.mlir.constant(40 : index) : i64
+    %4 = llvm.mlir.addressof @_QFEa : !llvm.ptr
+    %5 = llvm.mlir.addressof @_QFEb : !llvm.ptr
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    %8 = llvm.mlir.addressof @_QFEn : !llvm.ptr
+    omp.task {
+      %14 = llvm.mlir.constant(1 : i64) : i64
+      %15 = llvm.alloca %14 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+      %16 = llvm.load %8 : !llvm.ptr -> i32
+      %17 = llvm.sext %16 : i32 to i64
+      %18 = llvm.trunc %2 : i64 to i32
+      llvm.br ^bb1(%18, %17 : i32, i64)
+    ^bb1(%19: i32, %20: i64):  // 2 preds: ^bb0, ^bb2
+      %21 = llvm.icmp "sgt" %20, %1 : i64
+      llvm.cond_br %21, ^bb2, ^bb3
+    ^bb2:  // pred: ^bb1
+      llvm.store %19, %15 : i32, !llvm.ptr
+      %22 = llvm.load %15 : !llvm.ptr -> i32
+      %23 = llvm.sext %22 : i32 to i64
+      %24 = llvm.mlir.constant(1 : i64) : i64
+      %25 = llvm.mlir.constant(0 : i64) : i64
+      %26 = llvm.sub %23, %24 overflow<nsw> : i64
+      %27 = llvm.mul %26, %24 overflow<nsw> : i64
+      %28 = llvm.mul %27, %24 overflow<nsw> : i64
+      %29 = llvm.add %28, %25 overflow<nsw> : i64
+      %30 = llvm.mul %24, %3 overflow<nsw> : i64
+      %31 = llvm.getelementptr %4[%29] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      llvm.store %22, %31 : i32, !llvm.ptr
+      %32 = llvm.load %15 : !llvm.ptr -> i32
+      %33 = llvm.add %32, %18 : i32
+      %34 = llvm.sub %20, %2 : i64
+      llvm.br ^bb1(%33, %34 : i32, i64)
+    ^bb3:  // pred: ^bb1
+      llvm.store %19, %15 : i32, !llvm.ptr
+      omp.terminator
+    }
+    %9 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) extent(%3 : i64) stride(%2 : i64) start_idx(%2 : i64)
+    %10 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(to) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "a"}
+    %11 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(from) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "b"}
+    %12 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %13 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+    omp.target map_entries(%10 -> %arg0, %11 -> %arg1, %12 -> %arg2, %13 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) depend(taskdependin -> %4 : !llvm.ptr) {
+    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr):
+      %14 = llvm.mlir.constant(0 : index) : i64
+      %15 = llvm.mlir.constant(10 : i32) : i32
+      %16 = llvm.mlir.constant(1 : index) : i64
+      %17 = llvm.mlir.constant(40 : index) : i64
+      %18 = llvm.load %arg3 : !llvm.ptr -> i32
+      %19 = llvm.sext %18 : i32 to i64
+      %20 = llvm.trunc %16 : i64 to i32
+      llvm.br ^bb1(%20, %19 : i32, i64)
+    ^bb1(%21: i32, %22: i64):  // 2 preds: ^bb0, ^bb2
+      %23 = llvm.icmp "sgt" %22, %14 : i64
+      llvm.cond_br %23, ^bb2, ^bb3
+    ^bb2:  // pred: ^bb1
+      llvm.store %21, %arg2 : i32, !llvm.ptr
+      %24 = llvm.load %arg2 : !llvm.ptr -> i32
+      %25 = llvm.sext %24 : i32 to i64
+      %26 = llvm.mlir.constant(1 : i64) : i64
+      %27 = llvm.mlir.constant(0 : i64) : i64
+      %28 = llvm.sub %25, %26 overflow<nsw> : i64
+      %29 = llvm.mul %28, %26 overflow<nsw> : i64
+      %30 = llvm.mul %29, %26 overflow<nsw> : i64
+      %31 = llvm.add %30, %27 overflow<nsw> : i64
+      %32 = llvm.mul %26, %17 overflow<nsw> : i64
+      %33 = llvm.getelementptr %arg0[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      %34 = llvm.load %33 : !llvm.ptr -> i32
+      %35 = llvm.add %34, %15 : i32
+      %36 = llvm.mlir.constant(1 : i64) : i64
+      %37 = llvm.mlir.constant(0 : i64) : i64
+      %38 = llvm.sub %25, %36 overflow<nsw> : i64
+      %39 = llvm.mul %38, %36 overflow<nsw> : i64
+      %40 = llvm.mul %39, %36 overflow<nsw> : i64
+      %41 = llvm.add %40, %37 overflow<nsw> : i64
+      %42 = llvm.mul %36, %17 overflow<nsw> : i64
+      %43 = llvm.getelementptr %arg1[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      llvm.store %35, %43 : i32, !llvm.ptr
+      %44 = llvm.load %arg2 : !llvm.ptr -> i32
+      %45 = llvm.add %44, %20 : i32
+      %46 = llvm.sub %22, %16 : i64
+      llvm.br ^bb1(%45, %46 : i32, i64)
+    ^bb3:  // pred: ^bb1
+      llvm.store %21, %arg2 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEb() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEc() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEn() {addr_space = 0 : i32} : i32 {
+    %0 = llvm.mlir.constant(40 : i32) : i32
+    llvm.return %0 : i32
+  }
+  llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_FortranAProgramEndStatement() attributes {sym_visibility = "private"}
+  llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.zero : !llvm.ptr
+    llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
+    llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+    llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+    llvm.return %0 : i32
+  }
+
+// %strucArg holds pointers to shared data.
+// CHECK: define void @_QQmain() {
+// CHECK-DAG: %[[STRUCTARG:.+]] = alloca { ptr, ptr, ptr }, align 8
+// CHECK-DAG:  %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK: %[[DEP_INFO:.+]]  = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
+// CHECK: %[[PTR0:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 0
+// CHECK: store i64 ptrtoint (ptr @_QFEa to i64), ptr %[[PTR0]], align 4
+// CHECK: %[[PTR1:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 1
+// CHECK: store i64 8, ptr %[[PTR1]], align 4
+// CHECK: %[[PTR2:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 2
+// CHECK: store i8 1, ptr %[[PTR2]], align 1
+
+// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
+// CHECK: call void @__kmpc_omp_wait_deps({{.+}}, i32 1, ptr %[[DEP_ARRAY]], i32 0, ptr null)
+// CHECK: call void @__kmpc_omp_task_begin_if0({{.+}}, ptr  %[[TASKDATA]])
+// CHECK: call void @.omp_target_task_proxy_func({{.+}}, ptr %[[TASKDATA]])
+// CHECK: call void @__kmpc_omp_task_complete_if0({{.+}}, ptr %[[TASKDATA]])
+	      
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 04c2e9fa091bd..acebb20674406 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -148,12 +148,12 @@ llvm.func @test_omp_parallel_num_threads_3() -> () {
 // CHECK: define internal void @[[OMP_OUTLINED_FN_NUM_THREADS_3_1]]
   // CHECK: call void @__kmpc_barrier
 
-// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_VAR_1:.*]])
+// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_EXPR_1:.*]])
 llvm.func @test_omp_parallel_if_1(%arg0: i32) -> () {
 
   %0 = llvm.mlir.constant(0 : index) : i32
   %1 = llvm.icmp "slt" %arg0, %0 : i32
-// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_VAR_1]], 0
+// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_EXPR_1]], 0
 
 
 // CHECK: %[[GTN_IF_1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[SI_VAR_IF_1:.*]])
@@ -1330,14 +1330,14 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
       // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
       // CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
       // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB3]], i32 [[OMP_THREAD2]], ptr [[TMP2]])
-      omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64}
 
       // CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
       // CHECK: store i64 [[ARG0]], ptr [[TMP3]], align 8
       // CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
       // CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB5:[0-9]+]])
       // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB5]], i32 [[OMP_THREAD4]], ptr [[TMP4]])
-      omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+      omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64}
 
       omp.yield
     }
@@ -1360,7 +1360,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
       // CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
       // CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7]])
       // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD8]], ptr [[TMP10]])
-      omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+      omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
 
       // CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
       // CHECK: store i64 [[ARG0]], ptr [[TMP11]], align 8
@@ -1369,7 +1369,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
       // CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
       // CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB9:[0-9]+]])
       // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB9]], i32 [[OMP_THREAD10]], ptr [[TMP13]])
-      omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64}
+      omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {doacross_num_loops = 2 : i64}
 
       omp.yield
     }
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index 5924377581db6..e76f7e4d40f7a 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -234,3 +234,39 @@ omp.declare_reduction @reducer.part : !llvm.ptr init {
 ^bb0(%arg0: !llvm.ptr):
   omp.yield
 }
+
+// -----
+
+llvm.func @_QPequivalence() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.array<4 x i8> : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(0 : index) : i64
+  %3 = llvm.getelementptr %1[0, %2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<4 x i8>
+  omp.parallel private(@_QFequivalenceEx_firstprivate_ptr_f32 %3 -> %arg0 : !llvm.ptr) {
+    %4 = llvm.mlir.constant(3.140000e+00 : f32) : f32
+    llvm.store %4, %arg0 : f32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+omp.private {type = firstprivate} @_QFequivalenceEx_firstprivate_ptr_f32 : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x f32 {bindc_name = "x", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+} copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.load %arg0 : !llvm.ptr -> f32
+  llvm.store %0, %arg1 : f32, !llvm.ptr
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+// CHECK: define internal void @_QPequivalence..omp_par
+// CHECK-NOT: define {{.*}} @{{.*}}
+// CHECK:   %[[PRIV_ALLOC:.*]] = alloca float, i64 1, align 4
+// CHECK:   %[[HOST_VAL:.*]] = load float, ptr %{{.*}}, align 4
+// Test that we initialize the firstprivate variable.
+// CHECK:   store float %[[HOST_VAL]], ptr %[[PRIV_ALLOC]], align 4
+// Test that we inlined the body of the parallel region.
+// CHECK:   store float 0x{{.*}}, ptr %[[PRIV_ALLOC]], align 4
diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir
index 195773735431e..0a29290b6a6fa 100644
--- a/mlir/test/Target/SPIRV/decorations.mlir
+++ b/mlir/test/Target/SPIRV/decorations.mlir
@@ -97,3 +97,13 @@ spirv.func @fmul_decorations(%arg: f32) -> f32 "None" {
   spirv.ReturnValue %0 : f32
 }
 }
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel, Float16], []> {
+spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" {
+  // CHECK: spirv.FConvert %arg0 {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16
+  %0 = spirv.FConvert %arg {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16
+  spirv.ReturnValue %0 : f16
+}
+}
diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir
index 92cfde817cf7f..3b8b1fce0575a 100644
--- a/mlir/test/Transforms/canonicalize-block-merge.mlir
+++ b/mlir/test/Transforms/canonicalize-block-merge.mlir
@@ -87,7 +87,7 @@ func.func @mismatch_operands_matching_arguments(%cond : i1, %arg0 : i32, %arg1 :
 
 // CHECK-LABEL: func @mismatch_argument_uses(
 func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32, i32) {
-  // CHECK: return {{.*}}, {{.*}}
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg1 : i32), ^bb2(%arg0 : i32)
 
@@ -101,7 +101,7 @@ func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32,
 
 // CHECK-LABEL: func @mismatch_argument_types(
 func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) {
-  // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2(%arg1 : i16)
 
@@ -115,7 +115,7 @@ func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) {
 
 // CHECK-LABEL: func @mismatch_argument_count(
 func.func @mismatch_argument_count(%cond : i1, %arg0 : i32) {
-  // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2
 
diff --git a/mlir/test/Transforms/canonicalize-dce.mlir b/mlir/test/Transforms/canonicalize-dce.mlir
index 84631947970de..ac034d567a26a 100644
--- a/mlir/test/Transforms/canonicalize-dce.mlir
+++ b/mlir/test/Transforms/canonicalize-dce.mlir
@@ -137,10 +137,10 @@ func.func @f(%arg0: f32) {
 // Test case: Test the mechanics of deleting multiple block arguments.
 
 // CHECK:      func @f(%arg0: tensor<1xf32>, %arg1: tensor<2xf32>, %arg2: tensor<3xf32>, %arg3: tensor<4xf32>, %arg4: tensor<5xf32>)
-// CHECK-NEXT:   "test.br"()[^bb1]
-// CHECK-NEXT: ^bb1:
-// CHECK-NEXT:   "foo.print"(%arg1)
-// CHECK-NEXT:   "foo.print"(%arg3)
+// CHECK-NEXT:   "test.br"(%arg1, %arg3)[^bb1] : (tensor<2xf32>, tensor<4xf32>)
+// CHECK-NEXT: ^bb1([[VAL0:%.+]]: tensor<2xf32>, [[VAL1:%.+]]: tensor<4xf32>):
+// CHECK-NEXT:   "foo.print"([[VAL0]])
+// CHECK-NEXT:   "foo.print"([[VAL1]])
 // CHECK-NEXT:   return
 
 
diff --git a/mlir/test/Transforms/make-isolated-from-above.mlir b/mlir/test/Transforms/make-isolated-from-above.mlir
index a9d4325944fd9..58f6cfbc5dd65 100644
--- a/mlir/test/Transforms/make-isolated-from-above.mlir
+++ b/mlir/test/Transforms/make-isolated-from-above.mlir
@@ -78,9 +78,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]]
 //       CHECK:   test.isolated_one_region_op %[[ARG2]], %[[C0]], %[[C1]], %[[D0]], %[[D1]]
 //  CHECK-NEXT:     ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index, %[[B3:[a-zA-Z0-9]+]]: index, %[[B4:[a-zA-Z0-9]+]]: index)
-//  CHECK-NEXT:       cf.br ^bb1
-//       CHECK:     ^bb1:
-//       CHECK:       "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B0]])
+//  CHECK-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CHECK:     ^bb1(%[[B5:.+]]: index)
+//       CHECK:       "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]])
 
 // CLONE1-LABEL: func @make_isolated_from_above_multiple_blocks(
 //  CLONE1-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
@@ -95,9 +95,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //  CLONE1-NEXT:     ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index)
 //   CLONE1-DAG:       %[[C0_0:.+]] = arith.constant 0 : index
 //   CLONE1-DAG:       %[[C1_0:.+]] = arith.constant 1 : index
-//  CLONE1-NEXT:       cf.br ^bb1
-//       CLONE1:     ^bb1:
-//       CLONE1:       "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B0]])
+//  CLONE1-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CLONE1:     ^bb1(%[[B3:.+]]: index)
+//       CLONE1:       "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B3]])
 
 // CLONE2-LABEL: func @make_isolated_from_above_multiple_blocks(
 //  CLONE2-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
@@ -110,6 +110,6 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //   CLONE2-DAG:       %[[EMPTY:.+]] = tensor.empty(%[[B1]], %[[B2]])
 //   CLONE2-DAG:       %[[D0:.+]] = tensor.dim %[[EMPTY]], %[[C0]]
 //   CLONE2-DAG:       %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]]
-//  CLONE2-NEXT:       cf.br ^bb1
-//       CLONE2:     ^bb1:
-//       CLONE2:       "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B0]])
+//  CLONE2-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CLONE2:     ^bb1(%[[B3:.+]]: index)
+//       CLONE2:       "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B3]])
diff --git a/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir b/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir
deleted file mode 100644
index e821dcd0c2064..0000000000000
--- a/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir
+++ /dev/null
@@ -1,162 +0,0 @@
- // RUN: mlir-opt -pass-pipeline='builtin.module(llvm.func(canonicalize{region-simplify=aggressive}))' %s | FileCheck %s
-
-llvm.func @foo(%arg0: i64)
-
-llvm.func @rand() -> i1
-
-// CHECK-LABEL: func @large_merge_block(
-llvm.func @large_merge_block(%arg0: i64) {
-  // CHECK:  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK:  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK:  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-  // CHECK:  %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64
-  // CHECK:  %[[C4:.*]] = llvm.mlir.constant(4 : i64) : i64
-
-  // CHECK:  llvm.cond_br %5, ^bb1(%[[C1]], %[[C3]], %[[C4]], %[[C2]] : i64, i64, i64, i64), ^bb1(%[[C4]], %[[C2]], %[[C1]], %[[C3]] : i64, i64, i64, i64)
-  // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64, %[[arg3:.*]]: i64):
-  // CHECK:    llvm.cond_br %{{.*}}, ^bb2(%[[arg0]] : i64), ^bb2(%[[arg3]] : i64)
-  // CHECK: ^bb{{.*}}(%11: i64):
-  // CHECK:    llvm.br ^bb{{.*}}
-  // CHECK: ^bb{{.*}}:
-  // CHECK:   llvm.call
-  // CHECK:   llvm.cond_br {{.*}}, ^bb{{.*}}(%[[arg1]] : i64), ^bb{{.*}}(%[[arg2]] : i64)
-  // CHECK: ^bb{{.*}}:
-  // CHECK:   llvm.call
-  // CHECK    llvm.br ^bb{{.*}}
-
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %2 = llvm.mlir.constant(2 : i64) : i64
-  %3 = llvm.mlir.constant(3 : i64) : i64
-  %4 = llvm.mlir.constant(4 : i64) : i64
-  %10 = llvm.icmp "eq" %arg0, %0 : i64
-  llvm.cond_br %10, ^bb1, ^bb14
-^bb1:  // pred: ^bb0
-  %11 = llvm.call @rand() : () -> i1
-  llvm.cond_br %11, ^bb2, ^bb3
-^bb2:  // pred: ^bb1
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.br ^bb4
-^bb3:  // pred: ^bb1
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb4
-^bb4:  // 2 preds: ^bb2, ^bb3
-  %14 = llvm.call @rand() : () -> i1
-  llvm.cond_br %14, ^bb5, ^bb6
-^bb5:  // pred: ^bb4
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb13
-^bb6:  // pred: ^bb4
-  llvm.call @foo(%4) : (i64) -> ()
-  llvm.br ^bb13
-^bb13:  // 2 preds: ^bb11, ^bb12
-  llvm.br ^bb27
-^bb14:  // pred: ^bb0
-  %23 = llvm.call @rand() : () -> i1
-  llvm.cond_br %23, ^bb15, ^bb16
-^bb15:  // pred: ^bb14
-  llvm.call @foo(%4) : (i64) -> ()
-  llvm.br ^bb17
-^bb16:  // pred: ^bb14
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb17
-^bb17:  // 2 preds: ^bb15, ^bb16
-  %26 = llvm.call @rand() : () -> i1
-  llvm.cond_br %26, ^bb18, ^bb19
-^bb18:  // pred: ^bb17
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb26
-^bb19:  // pred: ^bb17
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.br ^bb26
-^bb26:  // 2 preds: ^bb24, ^bb25
-  llvm.br ^bb27
-^bb27:  // 2 preds: ^bb13, ^bb26
-  llvm.return
-}
-
-llvm.func @redundant_args0(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %2 = llvm.mlir.constant(1 : i64) : i64
-  %3 = llvm.mlir.constant(2 : i64) : i64
-  // CHECK  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64), ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64)
-  // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64)
-^bb1:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}
-
-llvm.func @redundant_args1(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %2 = llvm.mlir.constant(1 : i64) : i64
-  %3 = llvm.mlir.constant(2 : i64) : i64
-  // CHECK  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64), ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64)
-  // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64)
-^bb1:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}
-
-llvm.func @redundant_args_complex(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %2 = llvm.mlir.constant(2 : i64) : i64
-  %3 = llvm.mlir.constant(3 : i64) : i64
-  // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-  // CHECK: %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C2]], %[[C1]], %[[C3]] : i64, i64, i64), ^bb{{.*}}(%[[C0]], %[[C3]], %[[C2]] : i64, i64, i64)
-  // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64):
-  // CHECK: llvm.call @foo(%[[arg0]])
-  // CHECK: llvm.call @foo(%[[arg0]])
-  // CHECK: llvm.call @foo(%[[arg1]])
-  // CHECK: llvm.call @foo(%[[C2]])
-  // CHECK: llvm.call @foo(%[[arg2]])
-
-^bb1:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}
diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir
index b35cda8e724f6..d0563fed8e5d9 100644
--- a/mlir/test/Transforms/test-legalize-type-conversion.mlir
+++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir
@@ -2,9 +2,8 @@
 
 
 func.func @test_invalid_arg_materialization(
-  // expected-error@below {{failed to materialize conversion for block argument #0 that remained live after conversion, type was 'i16'}}
+  // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}}
   %arg0: i16) {
-  // expected-note@below {{see existing live user here}}
   "foo.return"(%arg0) : (i16) -> ()
 }
 
@@ -104,9 +103,8 @@ func.func @test_block_argument_not_converted() {
 // Make sure argument type changes aren't implicitly forwarded.
 func.func @test_signature_conversion_no_converter() {
   "test.signature_conversion_no_converter"() ({
-  // expected-error@below {{failed to materialize conversion for block argument #0 that remained live after conversion}}
+  // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}}
   ^bb0(%arg0: f32):
-    // expected-note@below {{see existing live user here}}
     "test.type_consumer"(%arg0) : (f32) -> ()
     "test.return"(%arg0) : (f32) -> ()
   }) : () -> ()
@@ -129,3 +127,18 @@ llvm.func @unsupported_func_op_interface() {
   // CHECK: llvm.return
   llvm.return
 }
+
+// -----
+
+// CHECK-LABEL: func @test_signature_conversion_no_converter()
+func.func @test_signature_conversion_no_converter() {
+  // CHECK: "test.signature_conversion_no_converter"() ({
+  // CHECK: ^{{.*}}(%[[arg0:.*]]: f64):
+  "test.signature_conversion_no_converter"() ({
+  ^bb0(%arg0: f32):
+    // CHECK: "test.legal_op_d"(%[[arg0]]) : (f64) -> ()
+    "test.replace_with_legal_op"(%arg0) : (f32) -> ()
+    "test.return"() : () -> ()
+  }) : () -> ()
+  return
+}
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 65c947198e06e..a789ab9a82e19 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -408,10 +408,10 @@ func.func @test_move_op_before_rollback() {
 
 // CHECK-LABEL: func @test_properties_rollback()
 func.func @test_properties_rollback() {
-  // CHECK: test.with_properties <{a = 32 : i64,
+  // CHECK: test.with_properties a = 32,
   // expected-remark @below{{op 'test.with_properties' is not legalizable}}
   test.with_properties
-      <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+      a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
       {modify_inplace}
   "test.return"() : () -> ()
 }
@@ -437,3 +437,18 @@ func.func @fold_legalization() -> i32 {
   %1 = "test.op_in_place_self_fold"() : () -> (i32)
   "test.return"(%1) : (i32) -> ()
 }
+
+// -----
+
+// CHECK-LABEL: func @convert_detached_signature()
+//       CHECK:   "test.legal_op_with_region"() ({
+//       CHECK:   ^bb0(%arg0: f64):
+//       CHECK:     "test.return"() : () -> ()
+//       CHECK:   }) : () -> ()
+func.func @convert_detached_signature() {
+  "test.detached_signature_conversion"() ({
+  ^bb0(%arg0: i64):
+    "test.return"() : () -> ()
+  }) : () -> ()
+  "test.return"() : () -> ()
+}
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
index 69b5787f7e851..aeade52c7ade5 100644
--- a/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestConvertToSPIRV
   TestSPIRVFuncSignatureConversion.cpp
+  TestSPIRVVectorUnrolling.cpp
 
   EXCLUDE_FROM_LIBMLIR
 
@@ -13,4 +14,5 @@ add_mlir_library(MLIRTestConvertToSPIRV
   MLIRTransformUtils
   MLIRTransforms
   MLIRVectorDialect
+  MLIRVectorTransforms
   )
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
index ec67f85f6f27b..4a792336caba4 100644
--- a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
@@ -37,13 +37,8 @@ struct TestSPIRVFuncSignatureConversion final
   }
 
   void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    populateFuncOpVectorRewritePatterns(patterns);
-    populateReturnOpVectorRewritePatterns(patterns);
-    GreedyRewriteConfig config;
-    config.strictMode = GreedyRewriteStrictness::ExistingOps;
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
-                                       config);
+    Operation *op = getOperation();
+    (void)spirv::unrollVectorsInSignatures(op);
   }
 };
 
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp
new file mode 100644
index 0000000000000..0bad43d5214b1
--- /dev/null
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp
@@ -0,0 +1,50 @@
+//===- TestSPIRVVectorUnrolling.cpp - Test signature conversion -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace {
+
+struct TestSPIRVVectorUnrolling final
+    : PassWrapper<TestSPIRVVectorUnrolling, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSPIRVVectorUnrolling)
+
+  StringRef getArgument() const final { return "test-spirv-vector-unrolling"; }
+
+  StringRef getDescription() const final {
+    return "Test patterns that unroll vectors to types supported by SPIR-V";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<spirv::SPIRVDialect, vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    (void)spirv::unrollVectorsInSignatures(op);
+    (void)spirv::unrollVectorsInFuncBodies(op);
+  }
+};
+
+} // namespace
+
+namespace test {
+void registerTestSPIRVVectorUnrolling() {
+  PassRegistration<TestSPIRVVectorUnrolling>();
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
index 10206dd7cedf6..7e51d67702b05 100644
--- a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
@@ -1,4 +1,4 @@
-//===- TestWrapInZeroTripCheck.cpp -- Passes to test SCF zero-trip-check --===//
+//===- TestSCFWrapInZeroTripCheck.cpp -- Pass to test SCF zero-trip-check -===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -13,9 +13,11 @@
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace mlir;
 
@@ -46,13 +48,19 @@ struct TestWrapWhileLoopInZeroTripCheckPass
     func::FuncOp func = getOperation();
     MLIRContext *context = &getContext();
     IRRewriter rewriter(context);
-    func.walk([&](scf::WhileOp op) {
-      FailureOr<scf::WhileOp> result =
-          scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck);
-      // Ignore not implemented failure in tests. The expected output should
-      // catch problems (e.g. transformation doesn't happen).
-      (void)result;
-    });
+    if (forceCreateCheck) {
+      func.walk([&](scf::WhileOp op) {
+        FailureOr<scf::WhileOp> result =
+            scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck);
+        // Ignore not implemented failure in tests. The expected output should
+        // catch problems (e.g. transformation doesn't happen).
+        (void)result;
+      });
+    } else {
+      RewritePatternSet patterns(context);
+      scf::populateSCFRotateWhileLoopPatterns(patterns);
+      (void)applyPatternsAndFoldGreedily(func, std::move(patterns));
+    }
   }
 
   Option<bool> forceCreateCheck{
diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
index a0a1cd30ed8ae..8f109f8ce5e6d 100644
--- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
@@ -81,6 +81,15 @@ def AttrWithTrait : Test_Attr<"AttrWithTrait", [TestAttrTrait]> {
   let mnemonic = "attr_with_trait";
 }
 
+// An attribute of a list of decimal formatted integers in similar format to shapes.
+def TestDecimalShapeAttr : Test_Attr<"TestDecimalShape"> {
+  let mnemonic = "decimal_shape";
+
+  let parameters = (ins ArrayRefParameter<"int64_t">:$shape);
+
+  let hasCustomAssemblyFormat = 1;
+}
+
 // Test support for ElementsAttrInterface.
 def TestI64ElementsAttr : Test_Attr<"TestI64Elements", [ElementsAttrInterface]> {
   let mnemonic = "i64_elements";
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
index b66dfbfcf0895..e09ea10906164 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
@@ -13,9 +13,12 @@
 
 #include "TestAttributes.h"
 #include "TestDialect.h"
+#include "TestTypes.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/Types.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/Hashing.h"
@@ -63,6 +66,39 @@ void CompoundAAttr::print(AsmPrinter &printer) const {
 // CompoundAAttr
 //===----------------------------------------------------------------------===//
 
+Attribute TestDecimalShapeAttr::parse(AsmParser &parser, Type type) {
+  if (parser.parseLess()){
+    return Attribute();
+  }
+  SmallVector<int64_t> shape;
+  if (parser.parseOptionalGreater()) {
+    auto parseDecimal = [&]() {
+      shape.emplace_back();
+      auto parseResult = parser.parseOptionalDecimalInteger(shape.back());
+      if (!parseResult.has_value() || failed(*parseResult)) {
+        parser.emitError(parser.getCurrentLocation()) << "expected an integer";
+        return failure();
+      }
+      return success();
+    };
+    if (failed(parseDecimal())) {
+      return Attribute();
+    }
+    while (failed(parser.parseOptionalGreater())) {
+      if (failed(parser.parseXInDimensionList()) || failed(parseDecimal())) {
+        return Attribute();
+      }
+    }
+  }
+  return get(parser.getContext(), shape);
+}
+
+void TestDecimalShapeAttr::print(AsmPrinter &printer) const {
+  printer << "<";
+  llvm::interleave(getShape(), printer, "x");
+  printer << ">";
+}
+
 Attribute TestI64ElementsAttr::parse(AsmParser &parser, Type type) {
   SmallVector<uint64_t> elements;
   if (parser.parseLess() || parser.parseLSquare())
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
index 6e75dd3932281..9ed1b3a47be36 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
@@ -297,11 +297,17 @@ void test::printSwitchCases(OpAsmPrinter &p, Operation *op,
 // CustomUsingPropertyInCustom
 //===----------------------------------------------------------------------===//
 
-bool test::parseUsingPropertyInCustom(OpAsmParser &parser, int64_t value[3]) {
-  return parser.parseLSquare() || parser.parseInteger(value[0]) ||
-         parser.parseComma() || parser.parseInteger(value[1]) ||
-         parser.parseComma() || parser.parseInteger(value[2]) ||
-         parser.parseRSquare();
+bool test::parseUsingPropertyInCustom(OpAsmParser &parser,
+                                      SmallVector<int64_t> &value) {
+  auto elemParser = [&]() {
+    int64_t v = 0;
+    if (failed(parser.parseInteger(v)))
+      return failure();
+    value.push_back(v);
+    return success();
+  };
+  return failed(parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Square,
+                                               elemParser));
 }
 
 void test::printUsingPropertyInCustom(OpAsmPrinter &printer, Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.h b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
index 7e9cd834278e3..6d4df7d82ffa5 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.h
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
@@ -160,7 +160,8 @@ void printSwitchCases(mlir::OpAsmPrinter &p, mlir::Operation *op,
 // CustomUsingPropertyInCustom
 //===----------------------------------------------------------------------===//
 
-bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser, int64_t value[3]);
+bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser,
+                                llvm::SmallVector<int64_t> &value);
 
 void printUsingPropertyInCustom(mlir::OpAsmPrinter &printer,
                                 mlir::Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 9450764fcb1d5..2b55bff3538d3 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1884,6 +1884,7 @@ def LegalOpA : TEST_Op<"legal_op_a">,
 def LegalOpB : TEST_Op<"legal_op_b">, Results<(outs I32)>;
 def LegalOpC : TEST_Op<"legal_op_c">,
   Arguments<(ins I32)>, Results<(outs I32)>;
+def LegalOpD : TEST_Op<"legal_op_d">, Arguments<(ins AnyType)>;
 
 // Check that the conversion infrastructure can properly undo the creation of
 // operations where an operation was created before its parent, in this case,
@@ -2947,11 +2948,18 @@ def TestVersionedOpC : TEST_Op<"versionedC"> {
 
 // Op with a properties struct defined inline.
 def TestOpWithProperties : TEST_Op<"with_properties"> {
-  let assemblyFormat = "prop-dict attr-dict";
+  let assemblyFormat = [{
+    `a` `=` $a `,`
+    `b` `=` $b `,`
+    `c` `=` $c `,`
+    `flag` `=` $flag `,`
+    `array` `=` $array attr-dict}];
   let arguments = (ins
-    IntProperty<"int64_t">:$a,
+    I64Property:$a,
     StrAttr:$b, // Attributes can directly be used here.
-    ArrayProperty<"int64_t", 4>:$array // example of an array
+    StringProperty:$c,
+    BoolProperty:$flag,
+    IntArrayProperty<"int64_t">:$array // example of an array
   );
 }
 
@@ -2974,7 +2982,7 @@ def TestOpWithPropertiesAndInferredType
 
 // Demonstrate how to wrap an existing C++ class named MyPropStruct.
 def MyStructProperty : Property<"MyPropStruct"> {
-  let convertToAttribute = "$_storage.asAttribute($_ctxt)";
+  let convertToAttribute = "return $_storage.asAttribute($_ctxt);";
   let convertFromAttribute = "return MyPropStruct::setFromAttr($_storage, $_attr, $_diag);";
   let hashProperty = "$_storage.hash();";
 }
@@ -2988,14 +2996,14 @@ def TestOpWithWrappedProperties : TEST_Op<"with_wrapped_properties"> {
 
 def TestOpUsingPropertyInCustom : TEST_Op<"using_property_in_custom"> {
   let assemblyFormat = "custom<UsingPropertyInCustom>($prop) attr-dict";
-  let arguments = (ins ArrayProperty<"int64_t", 3>:$prop);
+  let arguments = (ins IntArrayProperty<"int64_t">:$prop);
 }
 
 def TestOpUsingPropertyInCustomAndOther
   : TEST_Op<"using_property_in_custom_and_other"> {
   let assemblyFormat = "custom<UsingPropertyInCustom>($prop) prop-dict attr-dict";
   let arguments = (ins
-    ArrayProperty<"int64_t", 3>:$prop,
+    IntArrayProperty<"int64_t">:$prop,
     IntProperty<"int64_t">:$other
   );
 }
@@ -3021,7 +3029,7 @@ def TestOpUsingIntPropertyWithWorseBytecode
 
 def PropertiesWithCustomPrint : Property<"PropertiesWithCustomPrint"> {
   let convertToAttribute = [{
-    getPropertiesAsAttribute($_ctxt, $_storage)
+    return getPropertiesAsAttribute($_ctxt, $_storage);
   }];
   let convertFromAttribute = [{
     return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3085,7 +3093,7 @@ def TestOpWithNiceProperties : TEST_Op<"with_nice_properties"> {
 
 def VersionedProperties : Property<"VersionedProperties"> {
   let convertToAttribute = [{
-      getPropertiesAsAttribute($_ctxt, $_storage)
+    return getPropertiesAsAttribute($_ctxt, $_storage);
   }];
   let convertFromAttribute = [{
     return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3131,13 +3139,65 @@ def TestOpWithVersionedProperties : TEST_Op<"with_versioned_properties"> {
 }
 
 def TestOpWithDefaultValuedProperties : TEST_Op<"with_default_valued_properties"> {
-  let assemblyFormat = "prop-dict attr-dict";
-  let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a);
+  let assemblyFormat = [{
+    ($a^) : (`na`)?
+    ($b^)?
+    ($c^)?
+    ($unit^)?
+    attr-dict
+  }];
+  let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a,
+    DefaultValuedProperty<StringProperty, "\"\"">:$b,
+    DefaultValuedProperty<IntProperty<"int32_t">, "-1">:$c,
+    UnitProperty:$unit);
 }
 
 def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
-  let assemblyFormat = "prop-dict attr-dict";
-  let arguments = (ins OptionalAttr<I32Attr>:$a, OptionalAttr<I32Attr>:$b);
+  let assemblyFormat = [{
+    (`anAttr` `=` $anAttr^)?
+    (`simple` `=` $simple^)?
+    (`nonTrivialStorage` `=` $nonTrivialStorage^)?
+    (`hasDefault` `=` $hasDefault^)?
+    (`nested` `=` $nested^)?
+    (`longSyntax` `=` $longSyntax^)?
+    (`hasUnit` $hasUnit^)?
+    (`maybeUnit` `=` $maybeUnit^)?
+    attr-dict
+  }];
+  let arguments = (ins
+    OptionalAttr<I32Attr>:$anAttr,
+    OptionalProperty<I64Property>:$simple,
+    OptionalProperty<StringProperty>:$nonTrivialStorage,
+    // Confirm that properties with default values now default to nullopt and have
+    // the long syntax.
+    OptionalProperty<DefaultValuedProperty<I64Property, "0">>:$hasDefault,
+    OptionalProperty<OptionalProperty<I64Property>>:$nested,
+    OptionalProperty<StringProperty, 0>:$longSyntax,
+    UnitProperty:$hasUnit,
+    OptionalProperty<UnitProperty>:$maybeUnit);
+}
+
+def TestOpWithArrayProperties : TEST_Op<"with_array_properties"> {
+  let assemblyFormat = [{
+    `ints` `=` $ints
+    `strings` `=` $strings
+    `nested` `=` $nested
+    `opt` `=` $opt
+    `explicitOptions` `=` $explicitOptions
+    `explicitUnits` `=` $explicitUnits
+    ($hasDefault^ `thats_has_default`)?
+    attr-dict
+  }];
+  let arguments = (ins
+    ArrayProperty<I64Property>:$ints,
+    ArrayProperty<StringProperty>:$strings,
+    ArrayProperty<ArrayProperty<I32Property>>:$nested,
+    OptionalProperty<ArrayProperty<I32Property>>:$opt,
+    ArrayProperty<OptionalProperty<I64Property>>:$explicitOptions,
+    ArrayProperty<UnitProperty>:$explicitUnits,
+    DefaultValuedProperty<ArrayProperty<I64Property>,
+      "::llvm::ArrayRef<int64_t>{}", "::llvm::SmallVector<int64_t>{}">:$hasDefault
+  );
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
index 3129085058fd9..795b9da955632 100644
--- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
+++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
@@ -86,6 +86,17 @@ def OIListTrivial : TEST_Op<"oilist_with_keywords_only"> {
   }];
 }
 
+// Ops related to OIList primitive
+def OIListTrivialProperties : TEST_Op<"oilist_with_keywords_only_properties"> {
+  let arguments = (ins UnitProperty:$keyword, UnitProperty:$otherKeyword,
+                       UnitProperty:$diffNameUnitPropertyKeyword);
+  let assemblyFormat = [{
+    oilist( `keyword` $keyword
+          | `otherKeyword` $otherKeyword
+          | `thirdKeyword` $diffNameUnitPropertyKeyword) attr-dict
+  }];
+}
+
 def OIListSimple : TEST_Op<"oilist_with_simple_args", [AttrSizedOperandSegments]> {
   let arguments = (ins Optional<AnyType>:$arg0,
                        Optional<AnyType>:$arg1,
@@ -392,6 +403,17 @@ def FormatOptionalUnitAttrNoElide
   let assemblyFormat = "($is_optional^)? attr-dict";
 }
 
+def FormatOptionalUnitProperty : TEST_Op<"format_optional_unit_property"> {
+  let arguments = (ins UnitProperty:$is_optional);
+  let assemblyFormat = "(`is_optional` $is_optional^)? attr-dict";
+}
+
+def FormatOptionalUnitPropertyNoElide
+    : TEST_Op<"format_optional_unit_property_no_elide"> {
+  let arguments = (ins UnitProperty:$is_optional);
+  let assemblyFormat = "($is_optional^)? attr-dict";
+}
+
 def FormatOptionalEnumAttr : TEST_Op<"format_optional_enum_attr"> {
   let arguments = (ins OptionalAttr<SomeI64Enum>:$attr);
   let assemblyFormat = "($attr^)? attr-dict";
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index a14a5da341098..91dfb2faa80a1 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -685,6 +685,33 @@ namespace {
 //===----------------------------------------------------------------------===//
 // Region-Block Rewrite Testing
 
+/// This pattern applies a signature conversion to a block inside a detached
+/// region.
+struct TestDetachedSignatureConversion : public ConversionPattern {
+  TestDetachedSignatureConversion(MLIRContext *ctx)
+      : ConversionPattern("test.detached_signature_conversion", /*benefit=*/1,
+                          ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    if (op->getNumRegions() != 1)
+      return failure();
+    OperationState state(op->getLoc(), "test.legal_op_with_region", operands,
+                         op->getResultTypes(), {}, BlockRange());
+    Region *newRegion = state.addRegion();
+    rewriter.inlineRegionBefore(op->getRegion(0), *newRegion,
+                                newRegion->begin());
+    TypeConverter::SignatureConversion result(newRegion->getNumArguments());
+    for (unsigned i = 0, e = newRegion->getNumArguments(); i < e; ++i)
+      result.addInputs(i, rewriter.getF64Type());
+    rewriter.applySignatureConversion(&newRegion->front(), result);
+    Operation *newOp = rewriter.create(state);
+    rewriter.replaceOp(op, newOp->getResults());
+    return success();
+  }
+};
+
 /// This pattern is a simple pattern that inlines the first region of a given
 /// operation into the parent region.
 struct TestRegionRewriteBlockMovement : public ConversionPattern {
@@ -1112,16 +1139,16 @@ struct TestLegalizePatternDriver
     TestTypeConverter converter;
     mlir::RewritePatternSet patterns(&getContext());
     populateWithGenerated(patterns);
-    patterns
-        .add<TestRegionRewriteBlockMovement, TestRegionRewriteUndo,
-             TestCreateBlock, TestCreateIllegalBlock, TestUndoBlockArgReplace,
-             TestUndoBlockErase, TestPassthroughInvalidOp, TestSplitReturnType,
-             TestChangeProducerTypeI32ToF32, TestChangeProducerTypeF32ToF64,
-             TestChangeProducerTypeF32ToInvalid, TestUpdateConsumerType,
-             TestNonRootReplacement, TestBoundedRecursiveRewrite,
-             TestNestedOpCreationUndoRewrite, TestReplaceEraseOp,
-             TestCreateUnregisteredOp, TestUndoMoveOpBefore,
-             TestUndoPropertiesModification>(&getContext());
+    patterns.add<
+        TestRegionRewriteBlockMovement, TestDetachedSignatureConversion,
+        TestRegionRewriteUndo, TestCreateBlock, TestCreateIllegalBlock,
+        TestUndoBlockArgReplace, TestUndoBlockErase, TestPassthroughInvalidOp,
+        TestSplitReturnType, TestChangeProducerTypeI32ToF32,
+        TestChangeProducerTypeF32ToF64, TestChangeProducerTypeF32ToInvalid,
+        TestUpdateConsumerType, TestNonRootReplacement,
+        TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite,
+        TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore,
+        TestUndoPropertiesModification>(&getContext());
     patterns.add<TestDropOpSignatureConversion>(&getContext(), converter);
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1132,6 +1159,8 @@ struct TestLegalizePatternDriver
     target.addLegalOp<ModuleOp>();
     target.addLegalOp<LegalOpA, LegalOpB, LegalOpC, TestCastOp, TestValidOp,
                       TerminatorOp, OneRegionOp>();
+    target.addLegalOp(
+        OperationName("test.legal_op_with_region", &getContext()));
     target
         .addIllegalOp<ILLegalOpF, TestRegionBuilderOp, TestOpWithRegionFold>();
     target.addDynamicallyLegalOp<TestReturnOp>([](TestReturnOp op) {
@@ -1551,6 +1580,17 @@ struct TestTypeConversionAnotherProducer
   }
 };
 
+struct TestReplaceWithLegalOp : public ConversionPattern {
+  TestReplaceWithLegalOp(MLIRContext *ctx)
+      : ConversionPattern("test.replace_with_legal_op", /*benefit=*/1, ctx) {}
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    rewriter.replaceOpWithNewOp<LegalOpD>(op, operands[0]);
+    return success();
+  }
+};
+
 struct TestTypeConversionDriver
     : public PassWrapper<TestTypeConversionDriver, OperationPass<>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTypeConversionDriver)
@@ -1642,6 +1682,7 @@ struct TestTypeConversionDriver
 
     // Initialize the conversion target.
     mlir::ConversionTarget target(getContext());
+    target.addLegalOp<LegalOpD>();
     target.addDynamicallyLegalOp<TestTypeProducerOp>([](TestTypeProducerOp op) {
       auto recursiveType = dyn_cast<test::TestRecursiveType>(op.getType());
       return op.getType().isF64() || op.getType().isInteger(64) ||
@@ -1667,7 +1708,8 @@ struct TestTypeConversionDriver
                  TestSignatureConversionUndo,
                  TestTestSignatureConversionNoConverter>(converter,
                                                          &getContext());
-    patterns.add<TestTypeConversionAnotherProducer>(&getContext());
+    patterns.add<TestTypeConversionAnotherProducer, TestReplaceWithLegalOp>(
+        &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
 
diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt
index faaa3bb8db24c..01297ad0a1148 100644
--- a/mlir/test/lib/IR/CMakeLists.txt
+++ b/mlir/test/lib/IR/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_library(MLIRTestIR
   TestBuiltinDistinctAttributes.cpp
   TestClone.cpp
   TestDiagnostics.cpp
+  TestDiagnosticsMetadata.cpp
   TestDominance.cpp
   TestFunc.cpp
   TestInterfaces.cpp
diff --git a/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp b/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp
new file mode 100644
index 0000000000000..5cb0193baa171
--- /dev/null
+++ b/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp
@@ -0,0 +1,65 @@
+//===- TestDiagnosticsMetadata.cpp - Test Diagnostic Metatdata ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains test passes for constructing and resolving dominance
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/Support/SourceMgr.h"
+
+using namespace mlir;
+
+namespace {
+struct TestDiagnosticMetadataPass
+    : public PassWrapper<TestDiagnosticMetadataPass,
+                         InterfacePass<SymbolOpInterface>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestDiagnosticMetadataPass)
+
+  StringRef getArgument() const final { return "test-diagnostic-metadata"; }
+  StringRef getDescription() const final { return "Test diagnostic metadata."; }
+  TestDiagnosticMetadataPass() = default;
+  TestDiagnosticMetadataPass(const TestDiagnosticMetadataPass &) {}
+
+  void runOnOperation() override {
+    llvm::errs() << "Test '" << getOperation().getName() << "'\n";
+
+    // Build a diagnostic handler that has filtering capabilities.
+    ScopedDiagnosticHandler handler(&getContext(), [](mlir::Diagnostic &diag) {
+      return mlir::success(
+          llvm::none_of(diag.getMetadata(), [](mlir::DiagnosticArgument &arg) {
+            return arg.getKind() == mlir::DiagnosticArgument::
+                                        DiagnosticArgumentKind::String &&
+                   arg.getAsString().contains("hello");
+          }));
+    });
+
+    // Emit a diagnostic for every operation with a valid loc.
+    getOperation()->walk([&](Operation *op) {
+      if (StringAttr strAttr = op->getAttrOfType<StringAttr>("attr")) {
+        if (strAttr.getValue() == "emit_error")
+          emitError(op->getLoc(), "test diagnostic metadata")
+              .getUnderlyingDiagnostic()
+              ->getMetadata()
+              .push_back(DiagnosticArgument("hello"));
+      }
+    });
+  }
+};
+
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestDiagnosticsMetadataPass() {
+  PassRegistration<TestDiagnosticMetadataPass>{};
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
index 8f206d9077272..a99441cd7147b 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
@@ -234,11 +234,7 @@ applyTileToAll(RewriterBase &rewriter, Operation *transformOp,
     scf::SCFTilingOptions tilingOptions;
     tilingOptions.setTileSizes(tileSizes).setInterchange(interchange);
     if (mapping) {
-      auto mappingAttrs =
-          llvm::map_to_vector(mapping.value(), [](Attribute attr) {
-            return cast<DeviceMappingAttrInterface>(attr);
-          });
-      tilingOptions.setMapping(mappingAttrs);
+      tilingOptions.setMapping(mapping.value().getValue());
     }
     tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
 
diff --git a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
index 370c5baa0adef..b8861198d596b 100644
--- a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
@@ -493,6 +493,10 @@ func.func @asin() {
   %cst3 = arith.constant -0.25 : f32
   call @asin_f32(%cst3) : (f32) -> ()
 
+  // CHECK: -1.1197
+  %cst4 = arith.constant -0.90 : f32
+  call @asin_f32(%cst4) : (f32) -> ()
+
   // CHECK: 0.25268, 0.384397, 0.597406
   %vec_x = arith.constant dense<[0.25, 0.375, 0.5625]> : vector<3xf32>
   call @asin_3xf32(%vec_x) : (vector<3xf32>) -> ()
diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir
index 46d272649caed..03288ae8bd3d7 100644
--- a/mlir/test/mlir-tblgen/op-format.mlir
+++ b/mlir/test/mlir-tblgen/op-format.mlir
@@ -195,6 +195,16 @@ test.format_optional_unit_attribute
 // CHECK: test.format_optional_unit_attribute_no_elide unit
 test.format_optional_unit_attribute_no_elide unit
 
+// CHECK: test.format_optional_unit_property is_optional
+test.format_optional_unit_property is_optional
+
+// CHECK: test.format_optional_unit_property
+// CHECK-NOT: is_optional
+test.format_optional_unit_property
+
+// CHECK: test.format_optional_unit_property_no_elide unit
+test.format_optional_unit_property_no_elide unit
+
 // CHECK: test.format_optional_enum_attr case5
 test.format_optional_enum_attr case5
 
diff --git a/mlir/test/mlir-tblgen/op-format.td b/mlir/test/mlir-tblgen/op-format.td
index 4a19ffb3dfcc6..8af4341952f04 100644
--- a/mlir/test/mlir-tblgen/op-format.td
+++ b/mlir/test/mlir-tblgen/op-format.td
@@ -73,7 +73,7 @@ def OptionalGroupA : TestFormat_Op<[{
 // CHECK-NEXT: result.addAttribute("a", parser.getBuilder().getUnitAttr())
 // CHECK: parser.parseKeyword("bar")
 // CHECK-LABEL: OptionalGroupB::print
-// CHECK: if (!getAAttr())
+// CHECK: if (!(getAAttr() && getAAttr() != ((false) ? ::mlir::OpBuilder((*this)->getContext()).getUnitAttr() : nullptr)))
 // CHECK-NEXT: odsPrinter << ' ' << "foo"
 // CHECK-NEXT: else
 // CHECK-NEXT: odsPrinter << ' ' << "bar"
@@ -84,7 +84,7 @@ def OptionalGroupB : TestFormat_Op<[{
 // Optional group anchored on a default-valued attribute:
 // CHECK-LABEL: OptionalGroupC::parse
 
-//       CHECK: if (getAAttr() && getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
+//       CHECK: if (getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
 //  CHECK-NEXT:   odsPrinter << ' ';
 //  CHECK-NEXT:   odsPrinter.printAttributeWithoutType(getAAttr());
 //  CHECK-NEXT: }
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
index 7b0ee6b2a1bd8..918583c9f4ed4 100644
--- a/mlir/test/mlir-tblgen/op-properties.td
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -1,8 +1,10 @@
-// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s
+// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-op-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEFS
 
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/EnumAttr.td"
 include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
 
 def Test_Dialect : Dialect {
   let name = "test";
@@ -15,7 +17,115 @@ def OpWithAttr : NS_Op<"op_with_attr">{
   let arguments = (ins AnyAttr:$attr, OptionalAttr<AnyAttr>:$optional);
 }
 
-// CHECK: void setAttrAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().attr = attr
-// CHECK: void setOptionalAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().optional = attr
+// Test required and optional properties
+// ---
+
+def DefaultI64Array : IntArrayProperty<"int64_t"> {
+  let defaultValue = "::llvm::ArrayRef<int64_t>{}";
+  let storageTypeValueOverride = "::llvm::SmallVector<int64_t>{}";
+}
+
+def OpWithProps : NS_Op<"op_with_props"> {
+  let arguments = (ins
+    BoolProperty:$flag,
+    StringProperty:$string,
+    ArrayProperty<StringProperty>:$strings,
+    DefaultValuedProperty<I32Property, "0">:$default_int,
+    OptionalProperty<I64Property>:$optional,
+    DefaultI64Array:$intArray
+  );
+}
+
+/// Check that optional arguments to builders only go at the end.
+def OpWithSomeOptionalProperties : NS_Op<"op_with_some_optional_props"> {
+  let arguments = (ins
+    OptionalProperty<I64Property>:$mustSpecify,
+    I64Property:$required,
+    OptionalProperty<StringProperty>:$canOmit,
+    DefaultValuedProperty<I64Property, "-1">:$canOmit2
+  );
+}
+
+/// Check that the ambiguous attribute protection correctly stops optional properties
+/// from getting default argument values in builders.
+def OpWithOptionalPropsAndAttrs :
+    NS_Op<"with_some_optional_props_and_atts"> {
+  let arguments = (ins
+    OptionalProperty<BoolProperty>:$mustSpecify,
+    OptionalAttr<BoolAttr>:$ambiguous,
+    OptionalAttr<I32Attr>:$canOmit,
+    OptionalProperty<I32Property>:$canOmitProp
+  );
+}
+
+// DECL: void setAttrAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().attr = attr
+// DECL: void setOptionalAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().optional = attr
+
+// -----
+
+// DECL-LABEL: class OpWithOptionalPropsAndAttrs :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<bool> mustSpecify,
+// DECL-SAME: /*optional*/::mlir::BoolAttr ambiguous,
+// DECL-SAME: /*optional*/::mlir::IntegerAttr canOmit,
+// DECL-SAME: /*optional*/std::optional<int32_t> canOmitProp = std::nullopt);
+
+// -----
+
+// COM: Ensure the struct is set up how we expect
+// DECL-LABEL: class OpWithPropsGenericAdaptorBase
+// DECL: using flagTy = bool;
+// DECL-NEXT: flagTy flag;
+// DECL-NEXT: bool getFlag()
+// DECL-NEXT: propStorage = this->flag
+// DECL-NEXT: return propStorage;
+// DECL: void setFlag(bool propValue)
+// DECL-NEXT: propStorage = this->flag;
+// DECL-NEXT: propStorage = propValue;
+// DECL: using stringTy = std::string;
+// DECL: llvm::StringRef getString()
+// DECL: auto &propStorage = this->string;
+// DECL-NEXT: return ::llvm::StringRef{propStorage};
+// DECL: using stringsTy = ::llvm::SmallVector<std::string>
+// DECL: ::llvm::ArrayRef<std::string> getStrings()
+// DECL: using default_intTy = int32_t;
+// DECL: default_intTy default_int = 0;
+// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{};
+// DECL: ::llvm::ArrayRef<int64_t> getIntArray()
+// DECL: return ::llvm::ArrayRef<int64_t>{propStorage}
+// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue)
+// DECL: propStorage.assign
+// DECL-LABEL: class OpWithProps :
+// DECL: setString(::llvm::StringRef newString)
+// DECL-NEXT: getProperties().setString(newString)
+
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: bool flag,
+// DECL-SAME: ::llvm::StringRef string,
+// DECL-SAME: ::llvm::ArrayRef<std::string> strings,
+// DECL-SAME: /*optional*/int32_t default_int = 0,
+// DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt,
+// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{});
+
+// DEFS-LABEL: OpWithProps::computePropertiesHash
+// DEFS: hash_intArray
+// DEFS-NEXT: return ::llvm::hash_value(::llvm::ArrayRef<int64_t>{propStorage})
+// DEFS: ::llvm::hash_value(prop.optional)
+// DEFS: hash_intArray(prop.intArray)
+
+// -----
+
+// DECL-LABEL: class OpWithSomeOptionalProperties :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<int64_t> mustSpecify,
+// DECL-SAME: int64_t required,
+// DECL-SAME: /*optional*/std::optional<::llvm::StringRef> canOmit = std::nullopt,
+// DECL-SAME: /*optional*/int64_t canOmit2 = -1);
diff --git a/mlir/test/mlir-tblgen/testdialect-attrdefs.mlir b/mlir/test/mlir-tblgen/testdialect-attrdefs.mlir
index ee92ea06a208c..89ad3594eebd8 100644
--- a/mlir/test/mlir-tblgen/testdialect-attrdefs.mlir
+++ b/mlir/test/mlir-tblgen/testdialect-attrdefs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s | mlir-opt -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL: func private @compoundA()
 // CHECK-SAME: #test.cmpnd_a<1, !test.smpla, [5, 6]>
@@ -19,3 +19,28 @@ func.func private @qualifiedAttr() attributes {foo = #test.cmpnd_nested_outer_qu
 func.func private @overriddenAttr() attributes {
   foo = #test.override_builder<5>
 }
+
+// CHECK-LABEL: @decimalIntegerShapeEmpty
+// CHECK-SAME: foo = #test.decimal_shape<>
+func.func private @decimalIntegerShapeEmpty() attributes {
+  foo = #test.decimal_shape<>
+}
+
+// CHECK-LABEL: @decimalIntegerShape
+// CHECK-SAME: foo = #test.decimal_shape<5>
+func.func private @decimalIntegerShape() attributes {
+  foo = #test.decimal_shape<5>
+}
+
+// CHECK-LABEL: @decimalIntegerShapeMultiple
+// CHECK-SAME: foo = #test.decimal_shape<0x3x7>
+func.func private @decimalIntegerShapeMultiple() attributes {
+  foo = #test.decimal_shape<0x3x7>
+}
+
+// -----
+
+func.func private @hexdecimalInteger() attributes {
+// expected-error @below {{expected an integer}}
+  sdg = #test.decimal_shape<1x0xb>
+}
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index e8b47007a8907..8125bf3fb8fc9 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -5,6 +5,7 @@
 from mlir.passmanager import *
 from mlir.execution_engine import *
 from mlir.runtime import *
+from ml_dtypes import bfloat16
 
 
 # Log everything to stderr and flush so that we have a unified stream to match
@@ -521,6 +522,45 @@ def testComplexUnrankedMemrefAdd():
 run(testComplexUnrankedMemrefAdd)
 
 
+# Test bf16 memrefs
+# CHECK-LABEL: TEST: testBF16Memref
+def testBF16Memref():
+    with Context():
+        module = Module.parse(
+            """
+    module  {
+      func.func @main(%arg0: memref<1xbf16>,
+                      %arg1: memref<1xbf16>) attributes { llvm.emit_c_interface } {
+        %0 = arith.constant 0 : index
+        %1 = memref.load %arg0[%0] : memref<1xbf16>
+        memref.store %1, %arg1[%0] : memref<1xbf16>
+        return
+      }
+    } """
+        )
+
+        arg1 = np.array([0.5]).astype(bfloat16)
+        arg2 = np.array([0.0]).astype(bfloat16)
+
+        arg1_memref_ptr = ctypes.pointer(
+            ctypes.pointer(get_ranked_memref_descriptor(arg1))
+        )
+        arg2_memref_ptr = ctypes.pointer(
+            ctypes.pointer(get_ranked_memref_descriptor(arg2))
+        )
+
+        execution_engine = ExecutionEngine(lowerToLLVM(module))
+        execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr)
+
+        # test to-numpy utility
+        # CHECK: [0.5]
+        npout = ranked_memref_to_numpy(arg2_memref_ptr[0])
+        log(npout)
+
+
+run(testBF16Memref)
+
+
 #  Test addition of two 2d_memref
 # CHECK-LABEL: TEST: testDynamicMemrefAdd2D
 def testDynamicMemrefAdd2D():
diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py
index cfe377c703717..3178f58cf2e74 100644
--- a/mlir/test/python/ir/builtin_types.py
+++ b/mlir/test/python/ir/builtin_types.py
@@ -113,6 +113,8 @@ def testTypeIsInstance():
 def testFloatTypeSubclasses():
     ctx = Context()
     # CHECK: True
+    print(isinstance(Type.parse("f8E4M3", ctx), FloatType))
+    # CHECK: True
     print(isinstance(Type.parse("f8E4M3FN", ctx), FloatType))
     # CHECK: True
     print(isinstance(Type.parse("f8E5M2", ctx), FloatType))
@@ -229,6 +231,8 @@ def testIndexType():
 @run
 def testFloatType():
     with Context():
+        # CHECK: float: f8E4M3
+        print("float:", Float8E4M3Type.get())
         # CHECK: float: f8E4M3FN
         print("float:", Float8E4M3FNType.get())
         # CHECK: float: f8E5M2
@@ -601,6 +605,7 @@ def testTypeIDs():
         types = [
             (IntegerType, IntegerType.get_signless(16)),
             (IndexType, IndexType.get()),
+            (Float8E4M3Type, Float8E4M3Type.get()),
             (Float8E4M3FNType, Float8E4M3FNType.get()),
             (Float8E5M2Type, Float8E5M2Type.get()),
             (Float8E4M3FNUZType, Float8E4M3FNUZType.get()),
@@ -624,6 +629,7 @@ def testTypeIDs():
 
         # CHECK: IntegerType(i16)
         # CHECK: IndexType(index)
+        # CHECK: Float8E4M3Type(f8E4M3)
         # CHECK: Float8E4M3FNType(f8E4M3FN)
         # CHECK: Float8E5M2Type(f8E5M2)
         # CHECK: Float8E4M3FNUZType(f8E4M3FNUZ)
@@ -704,6 +710,9 @@ def print_downcasted(typ):
         # CHECK: Float8E4M3B11FNUZType
         # CHECK: Float8E4M3B11FNUZType(f8E4M3B11FNUZ)
         print_downcasted(Float8E4M3B11FNUZType.get())
+        # CHECK: Float8E4M3Type
+        # CHECK: Float8E4M3Type(f8E4M3)
+        print_downcasted(Float8E4M3Type.get())
         # CHECK: Float8E4M3FNType
         # CHECK: Float8E4M3FNType(f8E4M3FN)
         print_downcasted(Float8E4M3FNType.get())
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 149f9d59961b8..1842fa158e75a 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -92,6 +92,7 @@ void registerTestDataLayoutQuery();
 void registerTestDeadCodeAnalysisPass();
 void registerTestDecomposeCallGraphTypes();
 void registerTestDiagnosticsPass();
+void registerTestDiagnosticsMetadataPass();
 void registerTestDominancePass();
 void registerTestDynamicPipelinePass();
 void registerTestEmulateNarrowTypePass();
@@ -142,6 +143,7 @@ void registerTestSCFWrapInZeroTripCheckPasses();
 void registerTestShapeMappingPass();
 void registerTestSliceAnalysisPass();
 void registerTestSPIRVFuncSignatureConversion();
+void registerTestSPIRVVectorUnrolling();
 void registerTestTensorCopyInsertionPass();
 void registerTestTensorTransforms();
 void registerTestTopologicalSortAnalysisPass();
@@ -225,6 +227,7 @@ void registerTestPasses() {
   mlir::test::registerTestDeadCodeAnalysisPass();
   mlir::test::registerTestDecomposeCallGraphTypes();
   mlir::test::registerTestDiagnosticsPass();
+  mlir::test::registerTestDiagnosticsMetadataPass();
   mlir::test::registerTestDominancePass();
   mlir::test::registerTestDynamicPipelinePass();
   mlir::test::registerTestEmulateNarrowTypePass();
@@ -275,6 +278,7 @@ void registerTestPasses() {
   mlir::test::registerTestShapeMappingPass();
   mlir::test::registerTestSliceAnalysisPass();
   mlir::test::registerTestSPIRVFuncSignatureConversion();
+  mlir::test::registerTestSPIRVVectorUnrolling();
   mlir::test::registerTestTensorCopyInsertionPass();
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTopologicalSortAnalysisPass();
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0fc750c7bbc88..a2ceefb34db45 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -155,6 +155,36 @@ static const char *const valueRangeReturnCode = R"(
            std::next({0}, valueRange.first + valueRange.second)};
 )";
 
+/// Parse operand/result segment_size property.
+/// {0}: Number of elements in the segment array
+static const char *const parseTextualSegmentSizeFormat = R"(
+  size_t i = 0;
+  auto parseElem = [&]() -> ::mlir::ParseResult {
+    if (i >= {0})
+      return $_parser.emitError($_parser.getCurrentLocation(),
+        "expected `]` after {0} segment sizes");
+    if (failed($_parser.parseInteger($_storage[i])))
+      return ::mlir::failure();
+    i += 1;
+    return ::mlir::success();
+  };
+  if (failed($_parser.parseCommaSeparatedList(
+      ::mlir::AsmParser::Delimeter::Square, parseElem)))
+    return failure();
+  if (i < {0})
+    return $_parser.emitError($_parser.getCurrentLocation(),
+      "expected {0} segment sizes, found only ") << i;
+  return success();
+)";
+
+static const char *const printTextualSegmentSize = R"(
+  [&]() {
+    $_printer << '[';
+    ::llvm::interleaveComma($_storage, $_printer);
+    $_printer << ']';
+  }()
+)";
+
 /// Read operand/result segment_size from bytecode.
 static const char *const readBytecodeSegmentSizeNative = R"(
   if ($_reader.getBytecodeVersion() >= /*kNativePropertiesODSSegmentSize=*/6)
@@ -422,8 +452,10 @@ class OpOrAdaptorHelper {
   // Property
   std::optional<NamedProperty> operandSegmentsSize;
   std::string operandSegmentsSizeStorage;
+  std::string operandSegmentsSizeParser;
   std::optional<NamedProperty> resultSegmentsSize;
   std::string resultSegmentsSizeStorage;
+  std::string resultSegmentsSizeParser;
 
   // Indices to store the position in the emission order of the operand/result
   // segment sizes attribute if emitted as part of the properties for legacy
@@ -448,31 +480,40 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
         {namedAttr.name, AttributeMetadata{namedAttr.name, !isOptional, attr}});
   }
 
-  auto makeProperty = [&](StringRef storageType) {
+  auto makeProperty = [&](StringRef storageType, StringRef parserCall) {
     return Property(
+        /*summary=*/"",
+        /*description=*/"",
         /*storageType=*/storageType,
         /*interfaceType=*/"::llvm::ArrayRef<int32_t>",
         /*convertFromStorageCall=*/"$_storage",
         /*assignToStorageCall=*/
         "::llvm::copy($_value, $_storage.begin())",
         /*convertToAttributeCall=*/
-        "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)",
+        "return ::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage);",
         /*convertFromAttributeCall=*/
         "return convertFromAttribute($_storage, $_attr, $_diag);",
+        /*parserCall=*/parserCall,
+        /*optionalParserCall=*/"",
+        /*printerCall=*/printTextualSegmentSize,
         /*readFromMlirBytecodeCall=*/readBytecodeSegmentSizeNative,
         /*writeToMlirBytecodeCall=*/writeBytecodeSegmentSizeNative,
         /*hashPropertyCall=*/
         "::llvm::hash_combine_range(std::begin($_storage), "
         "std::end($_storage));",
-        /*StringRef defaultValue=*/"");
+        /*StringRef defaultValue=*/"",
+        /*storageTypeValueOverride=*/"");
   };
   // Include key attributes from several traits as implicitly registered.
   if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
     if (op.getDialect().usePropertiesForAttributes()) {
       operandSegmentsSizeStorage =
           llvm::formatv("std::array<int32_t, {0}>", op.getNumOperands());
-      operandSegmentsSize = {"operandSegmentSizes",
-                             makeProperty(operandSegmentsSizeStorage)};
+      operandSegmentsSizeParser =
+          llvm::formatv(parseTextualSegmentSizeFormat, op.getNumOperands());
+      operandSegmentsSize = {
+          "operandSegmentSizes",
+          makeProperty(operandSegmentsSizeStorage, operandSegmentsSizeParser)};
     } else {
       attrMetadata.insert(
           {operandSegmentAttrName, AttributeMetadata{operandSegmentAttrName,
@@ -484,8 +525,11 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
     if (op.getDialect().usePropertiesForAttributes()) {
       resultSegmentsSizeStorage =
           llvm::formatv("std::array<int32_t, {0}>", op.getNumResults());
-      resultSegmentsSize = {"resultSegmentSizes",
-                            makeProperty(resultSegmentsSizeStorage)};
+      resultSegmentsSizeParser =
+          llvm::formatv(parseTextualSegmentSizeFormat, op.getNumResults());
+      resultSegmentsSize = {
+          "resultSegmentSizes",
+          makeProperty(resultSegmentsSizeStorage, resultSegmentsSizeParser)};
     } else {
       attrMetadata.insert(
           {resultSegmentAttrName,
@@ -572,6 +616,12 @@ class OpEmitter {
   void
   genPropertiesSupportForBytecode(ArrayRef<ConstArgument> attrOrProperties);
 
+  // Generates getters for the properties.
+  void genPropGetters();
+
+  // Generates seters for the properties.
+  void genPropSetters();
+
   // Generates getters for the attributes.
   void genAttrGetters();
 
@@ -1041,6 +1091,8 @@ OpEmitter::OpEmitter(const Operator &op,
   genNamedRegionGetters();
   genNamedSuccessorGetters();
   genPropertiesSupport();
+  genPropGetters();
+  genPropSetters();
   genAttrGetters();
   genAttrSetters();
   genOptionalAttrRemovers();
@@ -1198,6 +1250,16 @@ void OpEmitter::genAttrNameGetters() {
   }
 }
 
+// Emit the getter for a named property.
+// It is templated to be shared between the Op and the adaptor class.
+template <typename OpClassOrAdaptor>
+static void emitPropGetter(OpClassOrAdaptor &opClass, const Operator &op,
+                           StringRef name, const Property &prop) {
+  auto *method = opClass.addInlineMethod(prop.getInterfaceType(), name);
+  ERROR_IF_PRUNED(method, name, op);
+  method->body() << formatv("  return getProperties().{0}();", name);
+}
+
 // Emit the getter for an attribute with the return type specified.
 // It is templated to be shared between the Op and the adaptor class.
 template <typename OpClassOrAdaptor>
@@ -1313,7 +1375,7 @@ void OpEmitter::genPropertiesSupport() {
     )decl";
   const char *propFromAttrFmt = R"decl(
       auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
-               ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+               ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
         {0}
       };
       {2};
@@ -1358,7 +1420,10 @@ void OpEmitter::genPropertiesSupport() {
                                           .addSubst("_storage", propertyStorage)
                                           .addSubst("_diag", propertyDiag)),
                                name, getAttr);
-      if (prop.hasDefaultValue()) {
+      if (prop.hasStorageTypeValueOverride()) {
+        setPropMethod << formatv(attrGetDefaultFmt, name,
+                                 prop.getStorageTypeValueOverride());
+      } else if (prop.hasDefaultValue()) {
         setPropMethod << formatv(attrGetDefaultFmt, name,
                                  prop.getDefaultValue());
       } else {
@@ -1409,8 +1474,10 @@ void OpEmitter::genPropertiesSupport() {
   const char *propToAttrFmt = R"decl(
     {
       const auto &propStorage = prop.{0};
-      attrs.push_back(odsBuilder.getNamedAttr("{0}",
-                                              {1}));
+      auto attr = [&]() -> ::mlir::Attribute {{
+        {1}
+      }();
+      attrs.push_back(odsBuilder.getNamedAttr("{0}", attr));
     }
 )decl";
   for (const auto &attrOrProp : attrOrProperties) {
@@ -1458,9 +1525,12 @@ void OpEmitter::genPropertiesSupport() {
       StringRef name = namedProperty->name;
       auto &prop = namedProperty->prop;
       FmtContext fctx;
-      hashMethod << formatv(propHashFmt, name,
-                            tgfmt(prop.getHashPropertyCall(),
-                                  &fctx.addSubst("_storage", propertyStorage)));
+      if (!prop.getHashPropertyCall().empty()) {
+        hashMethod << formatv(
+            propHashFmt, name,
+            tgfmt(prop.getHashPropertyCall(),
+                  &fctx.addSubst("_storage", propertyStorage)));
+      }
     }
   }
   hashMethod << "  return llvm::hash_combine(";
@@ -1468,8 +1538,13 @@ void OpEmitter::genPropertiesSupport() {
       attrOrProperties, hashMethod, [&](const ConstArgument &attrOrProp) {
         if (const auto *namedProperty =
                 llvm::dyn_cast_if_present<const NamedProperty *>(attrOrProp)) {
-          hashMethod << "\n    hash_" << namedProperty->name << "(prop."
-                     << namedProperty->name << ")";
+          if (!namedProperty->prop.getHashPropertyCall().empty()) {
+            hashMethod << "\n    hash_" << namedProperty->name << "(prop."
+                       << namedProperty->name << ")";
+          } else {
+            hashMethod << "\n    ::llvm::hash_value(prop."
+                       << namedProperty->name << ")";
+          }
           return;
         }
         const auto *namedAttr =
@@ -1524,8 +1599,9 @@ void OpEmitter::genPropertiesSupport() {
                      "\"{0}\") return ",
                      resultSegmentAttrName);
     }
-    getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx)
-                          << ";\n";
+    getInherentAttrMethod << "[&]() -> ::mlir::Attribute { "
+                          << tgfmt(prop.getConvertToAttributeCall(), &fctx)
+                          << " }();\n";
 
     if (name == operandSegmentAttrName) {
       setInherentAttrMethod
@@ -1549,13 +1625,15 @@ void OpEmitter::genPropertiesSupport() {
 )decl",
                                      name);
     if (name == operandSegmentAttrName) {
-      populateInherentAttrsMethod
-          << formatv("  attrs.append(\"{0}\", {1});\n", operandSegmentAttrName,
-                     tgfmt(prop.getConvertToAttributeCall(), &fctx));
+      populateInherentAttrsMethod << formatv(
+          "  attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+          operandSegmentAttrName,
+          tgfmt(prop.getConvertToAttributeCall(), &fctx));
     } else {
-      populateInherentAttrsMethod
-          << formatv("  attrs.append(\"{0}\", {1});\n", resultSegmentAttrName,
-                     tgfmt(prop.getConvertToAttributeCall(), &fctx));
+      populateInherentAttrsMethod << formatv(
+          "  attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+          resultSegmentAttrName,
+          tgfmt(prop.getConvertToAttributeCall(), &fctx));
     }
   }
   getInherentAttrMethod << "  return std::nullopt;\n";
@@ -1701,6 +1779,26 @@ void OpEmitter::genPropertiesSupportForBytecode(
   readPropertiesMethod << "  return ::mlir::success();";
 }
 
+void OpEmitter::genPropGetters() {
+  for (const NamedProperty &prop : op.getProperties()) {
+    std::string name = op.getGetterName(prop.name);
+    emitPropGetter(opClass, op, name, prop.prop);
+  }
+}
+
+void OpEmitter::genPropSetters() {
+  for (const NamedProperty &prop : op.getProperties()) {
+    std::string name = op.getSetterName(prop.name);
+    std::string argName = "new" + convertToCamelFromSnakeCase(
+                                      prop.name, /*capitalizeFirst=*/true);
+    auto *method = opClass.addInlineMethod(
+        "void", name, MethodParameter(prop.prop.getInterfaceType(), argName));
+    if (!method)
+      return;
+    method->body() << formatv("  getProperties().{0}({1});", name, argName);
+  }
+}
+
 void OpEmitter::genAttrGetters() {
   FmtContext fctx;
   fctx.withBuilder("::mlir::Builder((*this)->getContext())");
@@ -2957,6 +3055,12 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
   }
 
   // Add parameters for all arguments (operands and attributes).
+  // Track "attr-like" (property and attribute) optional values separate from
+  // attributes themselves so that the disambiguation code can look at the first
+  // attribute specifically when determining where to trim the optional-value
+  // list to avoid ambiguity while preserving the ability of all-property ops to
+  // use default parameters.
+  int defaultValuedAttrLikeStartIndex = op.getNumArgs();
   int defaultValuedAttrStartIndex = op.getNumArgs();
   // Successors and variadic regions go at the end of the parameter list, so no
   // default arguments are possible.
@@ -2967,6 +3071,15 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
     for (int i = op.getNumArgs() - 1; i >= 0; --i) {
       auto *namedAttr =
           llvm::dyn_cast_if_present<tblgen::NamedAttribute *>(op.getArg(i));
+      auto *namedProperty =
+          llvm::dyn_cast_if_present<tblgen::NamedProperty *>(op.getArg(i));
+      if (namedProperty) {
+        Property prop = namedProperty->prop;
+        if (!prop.hasDefaultValue())
+          break;
+        defaultValuedAttrLikeStartIndex = i;
+        continue;
+      }
       if (!namedAttr)
         break;
 
@@ -2986,6 +3099,7 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
       if (retType == "::llvm::APInt" || retType == "::llvm::APFloat")
         break;
 
+      defaultValuedAttrLikeStartIndex = i;
       defaultValuedAttrStartIndex = i;
     }
   }
@@ -3001,8 +3115,10 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
     if ((attrParamKind == AttrParamKind::WrappedAttr &&
          canUseUnwrappedRawValue(attr)) ||
         (attrParamKind == AttrParamKind::UnwrappedValue &&
-         !canUseUnwrappedRawValue(attr)))
+         !canUseUnwrappedRawValue(attr))) {
       ++defaultValuedAttrStartIndex;
+      defaultValuedAttrLikeStartIndex = defaultValuedAttrStartIndex;
+    }
   }
 
   /// Collect any inferred attributes.
@@ -3029,8 +3145,16 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
                              operand->isOptional());
       continue;
     }
-    if (llvm::isa_and_present<NamedProperty *>(arg)) {
-      // TODO
+    if (auto *propArg = llvm::dyn_cast_if_present<NamedProperty *>(arg)) {
+      const Property &prop = propArg->prop;
+      StringRef type = prop.getInterfaceType();
+      std::string defaultValue;
+      if (prop.hasDefaultValue() && i >= defaultValuedAttrLikeStartIndex) {
+        defaultValue = prop.getDefaultValue();
+      }
+      bool isOptional = prop.hasDefaultValue();
+      paramList.emplace_back(type, propArg->name, StringRef(defaultValue),
+                             isOptional);
       continue;
     }
     const NamedAttribute &namedAttr = *arg.get<NamedAttribute *>();
@@ -3157,6 +3281,15 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
     }
   }
 
+  // Push all properties to the result.
+  for (const auto &namedProp : op.getProperties()) {
+    // Use the setter from the Properties struct since the conversion from the
+    // interface type (used in the builder argument) to the storage type (used
+    // in the state) is not necessarily trivial.
+    std::string setterName = op.getSetterName(namedProp.name);
+    body << formatv("  {0}.getOrAddProperties<Properties>().{1}({2});\n",
+                    builderOpState, setterName, namedProp.name);
+  }
   // Push all attributes to the result.
   for (const auto &namedAttr : op.getAttributes()) {
     auto &attr = namedAttr.attr;
@@ -3996,17 +4129,19 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
         // Generate the data member using the storage type.
         os << "    using " << name << "Ty = " << prop.getStorageType() << ";\n"
            << "    " << name << "Ty " << name;
-        if (prop.hasDefaultValue())
+        if (prop.hasStorageTypeValueOverride())
+          os << " = " << prop.getStorageTypeValueOverride();
+        else if (prop.hasDefaultValue())
           os << " = " << prop.getDefaultValue();
         comparatorOs << "        rhs." << name << " == this->" << name
                      << " &&\n";
         // Emit accessors using the interface type.
         const char *accessorFmt = R"decl(;
-    {0} get{1}() {
+    {0} get{1}() const {
       auto &propStorage = this->{2};
       return {3};
     }
-    void set{1}(const {0} &propValue) {
+    void set{1}({0} propValue) {
       auto &propStorage = this->{2};
       {4};
     }
@@ -4274,6 +4409,11 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
     ERROR_IF_PRUNED(m, "Adaptor::getAttributes", op);
     m->body() << "  return odsAttrs;";
   }
+  for (auto &namedProp : op.getProperties()) {
+    std::string name = op.getGetterName(namedProp.name);
+    emitPropGetter(genericAdaptorBase, op, name, namedProp.prop);
+  }
+
   for (auto &namedAttr : op.getAttributes()) {
     const auto &name = namedAttr.name;
     const auto &attr = namedAttr.attr;
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index a97d8760842a9..27ad79a5c1efe 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -45,7 +45,7 @@ class OpVariableElement : public VariableElementBase<VariableKind> {
   OpVariableElement(const VarT *var) : var(var) {}
 
   /// Get the variable.
-  const VarT *getVar() { return var; }
+  const VarT *getVar() const { return var; }
 
 protected:
   /// The op variable, e.g. a type or attribute constraint.
@@ -64,11 +64,6 @@ struct AttributeVariable
     return attrType ? attrType->getBuilderCall() : std::nullopt;
   }
 
-  /// Return if this attribute refers to a UnitAttr.
-  bool isUnitAttr() const {
-    return var->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
-  }
-
   /// Indicate if this attribute is printed "qualified" (that is it is
   /// prefixed with the `#dialect.mnemonic`).
   bool shouldBeQualified() { return shouldBeQualifiedFlag; }
@@ -98,6 +93,42 @@ using SuccessorVariable =
 /// This class represents a variable that refers to a property argument.
 using PropertyVariable =
     OpVariableElement<NamedProperty, VariableElement::Property>;
+
+/// LLVM RTTI helper for attribute-like variables, that is, attributes or
+/// properties. This allows for common handling of attributes and properties in
+/// parts of the code that are oblivious to whether something is stored as an
+/// attribute or a property.
+struct AttributeLikeVariable : public VariableElement {
+  enum { AttributeLike = 1 << 0 };
+
+  static bool classof(const VariableElement *ve) {
+    return ve->getKind() == VariableElement::Attribute ||
+           ve->getKind() == VariableElement::Property;
+  }
+
+  static bool classof(const FormatElement *fe) {
+    return isa<VariableElement>(fe) && classof(cast<VariableElement>(fe));
+  }
+
+  /// Returns true if the variable is a UnitAttr or a UnitProperty.
+  bool isUnit() const {
+    if (const auto *attr = dyn_cast<AttributeVariable>(this))
+      return attr->getVar()->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
+    if (const auto *prop = dyn_cast<PropertyVariable>(this)) {
+      return prop->getVar()->prop.getBaseProperty().getPropertyDefName() ==
+             "UnitProperty";
+    }
+    llvm_unreachable("Type that wasn't listed in classof()");
+  }
+
+  StringRef getName() const {
+    if (const auto *attr = dyn_cast<AttributeVariable>(this))
+      return attr->getVar()->name;
+    if (const auto *prop = dyn_cast<PropertyVariable>(this))
+      return prop->getVar()->name;
+    llvm_unreachable("Type that wasn't listed in classof()");
+  }
+};
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -214,11 +245,11 @@ class OIListElement : public DirectiveElementBase<DirectiveElement::OIList> {
 
   /// If the parsing element is a single UnitAttr element, then it returns the
   /// attribute variable. Otherwise, returns nullptr.
-  AttributeVariable *
-  getUnitAttrParsingElement(ArrayRef<FormatElement *> pelement) {
+  AttributeLikeVariable *
+  getUnitVariableParsingElement(ArrayRef<FormatElement *> pelement) {
     if (pelement.size() == 1) {
-      auto *attrElem = dyn_cast<AttributeVariable>(pelement[0]);
-      if (attrElem && attrElem->isUnitAttr())
+      auto *attrElem = dyn_cast<AttributeLikeVariable>(pelement[0]);
+      if (attrElem && attrElem->isUnit())
         return attrElem;
     }
     return nullptr;
@@ -488,6 +519,36 @@ const char *const enumAttrParserCode = R"(
   }
 )";
 
+/// The code snippet used to generate a parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+/// {3}: The description of the expected property for the error message.
+const char *const propertyParserCode = R"(
+  auto {0}PropLoc = parser.getCurrentLocation();
+  auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::ParseResult {{
+    {2}
+    return ::mlir::success();
+  }(result.getOrAddProperties<{1}::Properties>().{0});
+  if (failed({0}PropParseResult)) {{
+    return parser.emitError({0}PropLoc, "invalid value for property {0}, expected {3}");
+  }
+)";
+
+/// The code snippet used to generate a parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+const char *const optionalPropertyParserCode = R"(
+  auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::OptionalParseResult {{
+    {2}
+    return ::mlir::success();
+  }(result.getOrAddProperties<{1}::Properties>().{0});
+  if ({0}PropParseResult.has_value() && failed(*{0}PropParseResult)) {{
+    return ::mlir::failure();
+  }
+)";
+
 /// The code snippet used to generate a parser call for an operand.
 ///
 /// {0}: The name of the operand.
@@ -796,9 +857,9 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
 
     // If the anchor is a unit attribute, it won't be parsed directly so elide
     // it.
-    auto *anchor = dyn_cast<AttributeVariable>(optional->getAnchor());
+    auto *anchor = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
     FormatElement *elidedAnchorElement = nullptr;
-    if (anchor && anchor != elements.front() && anchor->isUnitAttr())
+    if (anchor && anchor != elements.front() && anchor->isUnit())
       elidedAnchorElement = anchor;
     for (FormatElement *childElement : elements)
       if (childElement != elidedAnchorElement)
@@ -808,7 +869,7 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
 
   } else if (auto *oilist = dyn_cast<OIListElement>(element)) {
     for (ArrayRef<FormatElement *> pelement : oilist->getParsingElements()) {
-      if (!oilist->getUnitAttrParsingElement(pelement))
+      if (!oilist->getUnitVariableParsingElement(pelement))
         for (FormatElement *element : pelement)
           genElementParserStorage(element, op, body);
     }
@@ -1049,7 +1110,6 @@ static void genCustomDirectiveParser(CustomDirective *dir, MethodBody &body,
         body << llvm::formatv("    result.addAttribute(\"{0}\", {0}Attr);\n",
                               var->name);
       }
-
     } else if (auto *operand = dyn_cast<OperandVariable>(param)) {
       const NamedTypeConstraint *var = operand->getVar();
       if (var->isOptional()) {
@@ -1137,6 +1197,29 @@ static void genEnumAttrParser(const NamedAttribute *var, MethodBody &body,
                   validCaseKeywordsStr, errorMessage, attrAssignment);
 }
 
+// Generate the parser for a property.
+static void genPropertyParser(PropertyVariable *propVar, MethodBody &body,
+                              StringRef opCppClassName,
+                              bool requireParse = true) {
+  StringRef name = propVar->getVar()->name;
+  const Property &prop = propVar->getVar()->prop;
+  bool parseOptionally =
+      prop.hasDefaultValue() && !requireParse && prop.hasOptionalParser();
+  FmtContext fmtContext;
+  fmtContext.addSubst("_parser", "parser");
+  fmtContext.addSubst("_ctxt", "parser.getContext()");
+  fmtContext.addSubst("_storage", "propStorage");
+
+  if (parseOptionally) {
+    body << formatv(optionalPropertyParserCode, name, opCppClassName,
+                    tgfmt(prop.getOptionalParserCall(), &fmtContext));
+  } else {
+    body << formatv(propertyParserCode, name, opCppClassName,
+                    tgfmt(prop.getParserCall(), &fmtContext),
+                    prop.getSummary());
+  }
+}
+
 // Generate the parser for an attribute.
 static void genAttrParser(AttributeVariable *attr, MethodBody &body,
                           FmtContext &attrTypeCtx, bool parseAsOptional,
@@ -1213,14 +1296,16 @@ if (!dict) {
 }
 )decl";
 
-  // TODO: properties might be optional as well.
+  // {0}: fromAttribute call
+  // {1}: property name
+  // {2}: isRequired
   const char *propFromAttrFmt = R"decl(
 auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
-         ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+         ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
   {0};
 };
 auto attr = dict.get("{1}");
-if (!attr) {{
+if (!attr && {2}) {{
   emitError() << "expected key entry for {1} in DictionaryAttr to set "
              "Properties.";
   return ::mlir::failure();
@@ -1238,13 +1323,14 @@ if (::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
 
     StringRef name = namedProperty.name;
     const Property &prop = namedProperty.prop;
+    bool isRequired = !prop.hasDefaultValue();
     FmtContext fctx;
     body << formatv(propFromAttrFmt,
                     tgfmt(prop.getConvertFromAttributeCall(),
                           &fctx.addSubst("_attr", "propAttr")
                                .addSubst("_storage", "propStorage")
                                .addSubst("_diag", "emitError")),
-                    name);
+                    name, isRequired);
   }
 
   // Generate the setter for any attribute not parsed elsewhere.
@@ -1331,20 +1417,24 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
       // If the anchor is a unit attribute, we don't need to print it. When
       // parsing, we will add this attribute if this group is present.
       FormatElement *elidedAnchorElement = nullptr;
-      auto *anchorAttr = dyn_cast<AttributeVariable>(optional->getAnchor());
-      if (anchorAttr && anchorAttr != firstElement &&
-          anchorAttr->isUnitAttr()) {
-        elidedAnchorElement = anchorAttr;
+      auto *anchorVar = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
+      if (anchorVar && anchorVar != firstElement && anchorVar->isUnit()) {
+        elidedAnchorElement = anchorVar;
 
         if (!thenGroup == optional->isInverted()) {
-          // Add the anchor unit attribute to the operation state.
-          if (useProperties) {
+          // Add the anchor unit attribute or property to the operation state
+          // or set the property to true.
+          if (isa<PropertyVariable>(anchorVar)) {
+            body << formatv(
+                "    result.getOrAddProperties<{1}::Properties>().{0} = true;",
+                anchorVar->getName(), opCppClassName);
+          } else if (useProperties) {
             body << formatv(
                 "    result.getOrAddProperties<{1}::Properties>().{0} = "
                 "parser.getBuilder().getUnitAttr();",
-                anchorAttr->getVar()->name, opCppClassName);
+                anchorVar->getName(), opCppClassName);
           } else {
-            body << "    result.addAttribute(\"" << anchorAttr->getVar()->name
+            body << "    result.addAttribute(\"" << anchorVar->getName()
                  << "\", parser.getBuilder().getUnitAttr());\n";
           }
         }
@@ -1368,6 +1458,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
       genAttrParser(attrVar, body, attrTypeCtx, /*parseAsOptional=*/true,
                     useProperties, opCppClassName);
       body << "  if (" << attrVar->getVar()->name << "Attr) {\n";
+    } else if (auto *propVar = dyn_cast<PropertyVariable>(firstElement)) {
+      genPropertyParser(propVar, body, opCppClassName, /*requireParse=*/false);
+      body << llvm::formatv("if ({0}PropParseResult.has_value() && "
+                            "succeeded(*{0}PropParseResult)) ",
+                            propVar->getVar()->name)
+           << " {\n";
     } else if (auto *literal = dyn_cast<LiteralElement>(firstElement)) {
       body << "  if (::mlir::succeeded(parser.parseOptional";
       genLiteralParser(literal->getSpelling(), body);
@@ -1430,15 +1526,19 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
       body << ")) {\n";
       StringRef lelementName = lelement->getSpelling();
       body << formatv(oilistParserCode, lelementName);
-      if (AttributeVariable *unitAttrElem =
-              oilist->getUnitAttrParsingElement(pelement)) {
-        if (useProperties) {
+      if (AttributeLikeVariable *unitVarElem =
+              oilist->getUnitVariableParsingElement(pelement)) {
+        if (isa<PropertyVariable>(unitVarElem)) {
+          body << formatv(
+              "    result.getOrAddProperties<{1}::Properties>().{0} = true;",
+              unitVarElem->getName(), opCppClassName);
+        } else if (useProperties) {
           body << formatv(
               "    result.getOrAddProperties<{1}::Properties>().{0} = "
               "parser.getBuilder().getUnitAttr();",
-              unitAttrElem->getVar()->name, opCppClassName);
+              unitVarElem->getName(), opCppClassName);
         } else {
-          body << "  result.addAttribute(\"" << unitAttrElem->getVar()->name
+          body << "  result.addAttribute(\"" << unitVarElem->getName()
                << "\", UnitAttr::get(parser.getContext()));\n";
         }
       } else {
@@ -1468,6 +1568,8 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
         (genCtx == GenContext::Normal && attr->getVar()->attr.isOptional());
     genAttrParser(attr, body, attrTypeCtx, parseAsOptional, useProperties,
                   opCppClassName);
+  } else if (auto *prop = dyn_cast<PropertyVariable>(element)) {
+    genPropertyParser(prop, body, opCppClassName);
 
   } else if (auto *operand = dyn_cast<OperandVariable>(element)) {
     ArgumentLengthKind lengthKind = getArgumentLengthKind(operand->getVar());
@@ -1876,6 +1978,38 @@ const char *enumAttrBeginPrinterCode = R"(
     auto caseValueStr = {1}(caseValue);
 )";
 
+/// Generate a check that an optional or default-valued attribute or property
+/// has a non-default value. For these purposes, the default value of an
+/// optional attribute is its presence, even if the attribute itself has a
+/// default value.
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+                                    AttributeVariable &attrElement) {
+  Attribute attr = attrElement.getVar()->attr;
+  std::string getter = op.getGetterName(attrElement.getVar()->name);
+  bool optionalAndDefault = attr.isOptional() && attr.hasDefaultValue();
+  if (optionalAndDefault)
+    body << "(";
+  if (attr.isOptional())
+    body << getter << "Attr()";
+  if (optionalAndDefault)
+    body << " && ";
+  if (attr.hasDefaultValue()) {
+    FmtContext fctx;
+    fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
+    body << getter << "Attr() != "
+         << tgfmt(attr.getConstBuilderTemplate(), &fctx,
+                  attr.getDefaultValue());
+  }
+  if (optionalAndDefault)
+    body << ")";
+}
+
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+                                    PropertyVariable &propElement) {
+  body << op.getGetterName(propElement.getVar()->name)
+       << "() != " << propElement.getVar()->prop.getDefaultValue();
+}
+
 /// Generate the printer for the 'prop-dict' directive.
 static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
                                MethodBody &body) {
@@ -1904,6 +2038,15 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
       body << "  }\n";
     }
   }
+  // Similarly, elide default-valued properties.
+  for (const NamedProperty &prop : op.getProperties()) {
+    if (prop.prop.hasDefaultValue()) {
+      body << "  if (" << op.getGetterName(prop.name)
+           << "() == " << prop.prop.getDefaultValue() << ") {";
+      body << "    elidedProps.push_back(\"" << prop.name << "\");\n";
+      body << "  }\n";
+    }
+  }
 
   body << "  _odsPrinter << \" \";\n"
        << "  printProperties(this->getContext(), _odsPrinter, "
@@ -2031,7 +2174,6 @@ static void genCustomDirectiveParameterPrinter(FormatElement *element,
 
   } else if (auto *property = dyn_cast<PropertyVariable>(element)) {
     FmtContext ctx;
-    ctx.addSubst("_ctxt", "getContext()");
     const NamedProperty *namedProperty = property->getVar();
     ctx.addSubst("_storage", "getProperties()." + namedProperty->name);
     body << tgfmt(namedProperty->prop.getConvertFromStorageCall(), &ctx);
@@ -2154,16 +2296,6 @@ static void genEnumAttrPrinter(const NamedAttribute *var, const Operator &op,
           "  }\n";
 }
 
-/// Generate a check that a DefaultValuedAttr has a value that is non-default.
-static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
-                                    AttributeVariable &attrElement) {
-  FmtContext fctx;
-  Attribute attr = attrElement.getVar()->attr;
-  fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
-  body << " && " << op.getGetterName(attrElement.getVar()->name) << "Attr() != "
-       << tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue());
-}
-
 /// Generate the check for the anchor of an optional group.
 static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
                                           const Operator &op,
@@ -2190,17 +2322,12 @@ static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
         genOptionalGroupPrinterAnchor(element->getInputs(), op, body);
       })
       .Case([&](AttributeVariable *element) {
-        Attribute attr = element->getVar()->attr;
-        body << op.getGetterName(element->getVar()->name) << "Attr()";
-        if (attr.isOptional())
-          return; // done
-        if (attr.hasDefaultValue()) {
-          // Consider a default-valued attribute as present if it's not the
-          // default value.
-          genNonDefaultValueCheck(body, op, *element);
-          return;
-        }
-        llvm_unreachable("attribute must be optional or default-valued");
+        // Consider a default-valued attribute as present if it's not the
+        // default value and an optional one present if it is set.
+        genNonDefaultValueCheck(body, op, *element);
+      })
+      .Case([&](PropertyVariable *element) {
+        genNonDefaultValueCheck(body, op, *element);
       })
       .Case([&](CustomDirective *ele) {
         body << '(';
@@ -2276,10 +2403,10 @@ void OperationFormat::genElementPrinter(FormatElement *element,
     ArrayRef<FormatElement *> thenElements = optional->getThenElements();
     ArrayRef<FormatElement *> elseElements = optional->getElseElements();
     FormatElement *elidedAnchorElement = nullptr;
-    auto *anchorAttr = dyn_cast<AttributeVariable>(anchor);
+    auto *anchorAttr = dyn_cast<AttributeLikeVariable>(anchor);
     if (anchorAttr && anchorAttr != thenElements.front() &&
         (elseElements.empty() || anchorAttr != elseElements.front()) &&
-        anchorAttr->isUnitAttr()) {
+        anchorAttr->isUnit()) {
       elidedAnchorElement = anchorAttr;
     }
     auto genElementPrinters = [&](ArrayRef<FormatElement *> elements) {
@@ -2319,13 +2446,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
       for (VariableElement *var : vars) {
         TypeSwitch<FormatElement *>(var)
             .Case([&](AttributeVariable *attrEle) {
-              body << " || (" << op.getGetterName(attrEle->getVar()->name)
-                   << "Attr()";
-              Attribute attr = attrEle->getVar()->attr;
-              if (attr.hasDefaultValue()) {
-                // Don't print default-valued attributes.
-                genNonDefaultValueCheck(body, op, *attrEle);
-              }
+              body << " || (";
+              genNonDefaultValueCheck(body, op, *attrEle);
+              body << ")";
+            })
+            .Case([&](PropertyVariable *propEle) {
+              body << " || (";
+              genNonDefaultValueCheck(body, op, *propEle);
               body << ")";
             })
             .Case([&](OperandVariable *ele) {
@@ -2352,7 +2479,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
       body << ") {\n";
       genLiteralPrinter(lelement->getSpelling(), body, shouldEmitSpace,
                         lastWasPunctuation);
-      if (oilist->getUnitAttrParsingElement(pelement) == nullptr) {
+      if (oilist->getUnitVariableParsingElement(pelement) == nullptr) {
         for (FormatElement *element : pelement)
           genElementPrinter(element, body, op, shouldEmitSpace,
                             lastWasPunctuation);
@@ -2369,7 +2496,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
     return;
   }
 
-  // Emit the attribute dictionary.
+  // Emit the property dictionary.
   if (isa<PropDictDirective>(element)) {
     genPropDictPrinter(*this, op, body);
     lastWasPunctuation = false;
@@ -2408,6 +2535,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
     else
       body << "_odsPrinter.printStrippedAttrOrType("
            << op.getGetterName(var->name) << "Attr());\n";
+  } else if (auto *property = dyn_cast<PropertyVariable>(element)) {
+    const NamedProperty *var = property->getVar();
+    FmtContext fmtContext;
+    fmtContext.addSubst("_printer", "_odsPrinter");
+    fmtContext.addSubst("_ctxt", "getContext()");
+    fmtContext.addSubst("_storage", "getProperties()." + var->name);
+    body << tgfmt(var->prop.getPrinterCall(), &fmtContext) << ";\n";
   } else if (auto *operand = dyn_cast<OperandVariable>(element)) {
     if (operand->getVar()->isVariadicOfVariadic()) {
       body << "  ::llvm::interleaveComma("
@@ -2737,6 +2871,10 @@ static bool isOptionallyParsed(FormatElement *el) {
     Attribute attr = attrVar->getVar()->attr;
     return attr.isOptional() || attr.hasDefaultValue();
   }
+  if (auto *propVar = dyn_cast<PropertyVariable>(el)) {
+    const Property &prop = propVar->getVar()->prop;
+    return prop.hasDefaultValue() && prop.hasOptionalParser();
+  }
   if (auto *operandVar = dyn_cast<OperandVariable>(el)) {
     const NamedTypeConstraint *operand = operandVar->getVar();
     return operand->isOptional() || operand->isVariadic() ||
@@ -3141,10 +3279,9 @@ OpFormatParser::parseVariableImpl(SMLoc loc, StringRef name, Context ctx) {
   }
 
   if (const NamedProperty *property = findArg(op.getProperties(), name)) {
-    if (ctx != CustomDirectiveContext && ctx != RefDirectiveContext)
+    if (ctx == TypeDirectiveContext)
       return emitError(
-          loc, "properties currently only supported in `custom` directive");
-
+          loc, "properties cannot be used as children to a `type` directive");
     if (ctx == RefDirectiveContext) {
       if (!seenProperties.count(property))
         return emitError(loc, "property '" + name +
@@ -3428,6 +3565,15 @@ LogicalResult OpFormatParser::verifyOIListParsingElement(FormatElement *element,
                                       "an oilist parsing group");
               return success();
             })
+            // Only optional properties can be within an oilist parsing group.
+            .Case([&](PropertyVariable *propEle) {
+              if (!propEle->getVar()->prop.hasDefaultValue())
+                return emitError(
+                    loc,
+                    "only default-valued or optional properties can be used in "
+                    "an olist parsing group");
+              return success();
+            })
             // Only optional-like(i.e. variadic) operands can be within an
             // oilist parsing group.
             .Case([&](OperandVariable *ele) {
@@ -3557,6 +3703,16 @@ LogicalResult OpFormatParser::verifyOptionalGroupElement(SMLoc loc,
                                 "can be used to anchor an optional group");
         return success();
       })
+      // All properties can be within the optional group, but only optional
+      // properties can be the anchor.
+      .Case([&](PropertyVariable *propEle) {
+        Property prop = propEle->getVar()->prop;
+        if (isAnchor && !(prop.hasDefaultValue() && prop.hasOptionalParser()))
+          return emitError(loc, "only properties with default values "
+                                "that can be optionally parsed "
+                                "can be used to anchor an optional group");
+        return success();
+      })
       // Only optional-like(i.e. variadic) operands can be within an optional
       // group.
       .Case([&](OperandVariable *ele) {
diff --git a/mlir/unittests/IR/AffineExprTest.cpp b/mlir/unittests/IR/AffineExprTest.cpp
index 75c893334943d..dc78bbac85f3c 100644
--- a/mlir/unittests/IR/AffineExprTest.cpp
+++ b/mlir/unittests/IR/AffineExprTest.cpp
@@ -106,3 +106,9 @@ TEST(AffineExprTest, modSimplificationRegression) {
   auto sum = d0 + d0.floorDiv(3).floorDiv(-3);
   ASSERT_EQ(sum.getKind(), AffineExprKind::Add);
 }
+
+TEST(AffineExprTest, divisorOfNegativeFloorDiv) {
+  MLIRContext ctx;
+  OpBuilder b(&ctx);
+  ASSERT_EQ(b.getAffineDimExpr(0).floorDiv(-1).getLargestKnownDivisor(), 1);
+}
diff --git a/mlir/unittests/IR/ShapedTypeTest.cpp b/mlir/unittests/IR/ShapedTypeTest.cpp
index 61264bc523648..7a5b0722a03ba 100644
--- a/mlir/unittests/IR/ShapedTypeTest.cpp
+++ b/mlir/unittests/IR/ShapedTypeTest.cpp
@@ -11,6 +11,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/DialectInterface.h"
+#include "mlir/Support/LLVM.h"
 #include "llvm/ADT/SmallVector.h"
 #include "gtest/gtest.h"
 #include <cstdint>
@@ -226,4 +227,61 @@ TEST(ShapedTypeTest, RankedTensorTypeBuilder) {
   }
 }
 
+/// Simple wrapper class to enable "isa querying" and simple accessing of
+/// encoding.
+class TensorWithString : public RankedTensorType {
+public:
+  using RankedTensorType::RankedTensorType;
+
+  static TensorWithString get(ArrayRef<int64_t> shape, Type elementType,
+                              StringRef name) {
+    return mlir::cast<TensorWithString>(RankedTensorType::get(
+        shape, elementType, StringAttr::get(elementType.getContext(), name)));
+  }
+
+  StringRef getName() const {
+    if (Attribute enc = getEncoding())
+      return mlir::cast<StringAttr>(enc).getValue();
+    return {};
+  }
+
+  static bool classof(Type type) {
+    if (auto rt = mlir::dyn_cast_or_null<RankedTensorType>(type))
+      return mlir::isa_and_present<StringAttr>(rt.getEncoding());
+    return false;
+  }
+};
+
+TEST(ShapedTypeTest, RankedTensorTypeView) {
+  MLIRContext context;
+  Type f32 = FloatType::getF32(&context);
+
+  Type noEncodingRankedTensorType = RankedTensorType::get({10, 20}, f32);
+
+  UnitAttr unitAttr = UnitAttr::get(&context);
+  Type unitEncodingRankedTensorType =
+      RankedTensorType::get({10, 20}, f32, unitAttr);
+
+  StringAttr stringAttr = StringAttr::get(&context, "app");
+  Type stringEncodingRankedTensorType =
+      RankedTensorType::get({10, 20}, f32, stringAttr);
+
+  EXPECT_FALSE(mlir::isa<TensorWithString>(noEncodingRankedTensorType));
+  EXPECT_FALSE(mlir::isa<TensorWithString>(unitEncodingRankedTensorType));
+  ASSERT_TRUE(mlir::isa<TensorWithString>(stringEncodingRankedTensorType));
+
+  // Cast to TensorWithString view.
+  auto view = mlir::cast<TensorWithString>(stringEncodingRankedTensorType);
+  ASSERT_TRUE(mlir::isa<TensorWithString>(view));
+  EXPECT_EQ(view.getName(), "app");
+  // Verify one could cast view type back to base type.
+  ASSERT_TRUE(mlir::isa<RankedTensorType>(view));
+
+  Type viewCreated = TensorWithString::get({10, 20}, f32, "bob");
+  ASSERT_TRUE(mlir::isa<TensorWithString>(viewCreated));
+  ASSERT_TRUE(mlir::isa<RankedTensorType>(viewCreated));
+  view = mlir::cast<TensorWithString>(viewCreated);
+  EXPECT_EQ(view.getName(), "bob");
+}
+
 } // namespace
diff --git a/mlir/utils/lldb-scripts/mlirDataFormatters.py b/mlir/utils/lldb-scripts/mlirDataFormatters.py
index 3282b4a32d503..ed0ee431fd7d8 100644
--- a/mlir/utils/lldb-scripts/mlirDataFormatters.py
+++ b/mlir/utils/lldb-scripts/mlirDataFormatters.py
@@ -51,6 +51,7 @@ def build_ptr_str_from_addr(addrValue: lldb.SBValue, type: lldb.SBType):
     "mlir::FusedLoc": '"loc(fused<...>[...])"',
     "mlir::UnknownLoc": '"loc(unknown)"',
     "mlir::Float8E5M2Type": '"f8E5M2"',
+    "mlir::Float8E4M3Type": '"f8E4M3"',
     "mlir::Float8E4M3FNType": '"f8E4M3FN"',
     "mlir::Float8E5M2FNUZType": '"f8E5M2FNUZ"',
     "mlir::Float8E4M3FNUZType": '"f8E4M3FNUZ"',
diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py
index 426bfca1b4f88..78c1022428d8a 100755
--- a/mlir/utils/spirv/gen_spirv_dialect.py
+++ b/mlir/utils/spirv/gen_spirv_dialect.py
@@ -536,7 +536,10 @@ def gen_instr_coverage_report(path, instructions):
 
     content = content.split(AUTOGEN_OPCODE_SECTION_MARKER)
 
-    existing_opcodes = [k[11:] for k in re.findall("def SPIRV_OC_\w+", content[1])]
+    prefix = "def SPIRV_OC_"
+    existing_opcodes = [
+        k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1])
+    ]
     existing_instructions = list(
         filter(lambda inst: (inst["opname"] in existing_opcodes), instructions)
     )
@@ -637,7 +640,12 @@ def update_td_enum_attrs(path, operand_kinds, filter_list):
     assert len(content) == 3
 
     # Extend filter list with existing enum definitions
-    existing_kinds = [k[8:-4] for k in re.findall("def SPIRV_\w+Attr", content[1])]
+    prefix = "def SPIRV_"
+    suffix = "Attr"
+    existing_kinds = [
+        k[len(prefix) : -len(suffix)]
+        for k in re.findall(prefix + "\w+" + suffix, content[1])
+    ]
     filter_list.extend(existing_kinds)
 
     capability_mapping = get_capability_mapping(operand_kinds)
@@ -959,12 +967,20 @@ def extract_td_op_info(op_def):
       - A dict containing potential manually specified sections
     """
     # Get opname
-    opname = [o[8:-2] for o in re.findall("def SPIRV_\w+Op", op_def)]
+    prefix = "def SPIRV_"
+    suffix = "Op"
+    opname = [
+        o[len(prefix) : -len(suffix)]
+        for o in re.findall(prefix + "\w+" + suffix, op_def)
+    ]
     assert len(opname) == 1, "more than one ops in the same section!"
     opname = opname[0]
 
     # Get instruction category
-    inst_category = [o[4:] for o in re.findall("SPIRV_\w+Op", op_def.split(":", 1)[1])]
+    prefix = "SPIRV_"
+    inst_category = [
+        o[len(prefix) :] for o in re.findall(prefix + "\w+Op", op_def.split(":", 1)[1])
+    ]
     assert len(inst_category) <= 1, "more than one ops in the same section!"
     inst_category = inst_category[0] if len(inst_category) == 1 else "Op"
 
diff --git a/mlir/utils/tree-sitter-mlir/grammar.js b/mlir/utils/tree-sitter-mlir/grammar.js
index 0e7075b124b55..a657874f894b7 100644
--- a/mlir/utils/tree-sitter-mlir/grammar.js
+++ b/mlir/utils/tree-sitter-mlir/grammar.js
@@ -230,7 +230,8 @@ const common = {
   integer_type : $ =>
       token(seq(choice('si', 'ui', 'i'), /[1-9]/, repeat(/[0-9]/))),
   float_type : $ => token(
-      choice('f16', 'f32', 'f64', 'f80', 'f128', 'bf16', 'f8E4M3FN', 'f8E5M2')),
+      choice('f16', 'f32', 'f64', 'f80', 'f128', 'bf16', 'f8E4M3FN', 'f8E4M3',
+             'f8E5M2')),
   index_type : $ => token('index'),
   none_type : $ => token('none'),
   complex_type : $ => seq(token('complex'), '<', $._prim_type, '>'),
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index dde86af783af9..59a795cc62e0e 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -18,7 +18,6 @@ extern "C" {
 
 int memcmp(const void *lhs, const void *rhs, size_t count);
 void memset(void *dst, int C, size_t count);
-
 int printf(const char *format, ...);
 }
 
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 4e16591cc6c51..5a2c84c7ee38a 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -26,10 +26,13 @@ using namespace ompx;
 extern "C" {
 void __assert_assume(bool condition) { __builtin_assume(condition); }
 
+#ifndef OMPTARGET_HAS_LIBC
 [[gnu::weak]] void __assert_fail(const char *expr, const char *file,
                                  unsigned line, const char *function) {
   __assert_fail_internal(expr, nullptr, file, line, function);
 }
+#endif
+
 void __assert_fail_internal(const char *expr, const char *msg, const char *file,
                             unsigned line, const char *function) {
   if (msg) {
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index 4bca5d29643fe..291ceb023a69c 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -11,44 +11,33 @@
 #pragma omp begin declare target device_type(nohost)
 
 namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t);
+int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
 }
 
+#ifndef OMPTARGET_HAS_LIBC
+namespace impl {
 #pragma omp begin declare variant match(                                       \
         device = {arch(nvptx, nvptx64)},                                       \
             implementation = {extension(match_any)})
-extern "C" int32_t vprintf(const char *, void *);
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return vprintf(Format, Arguments);
+extern "C" int vprintf(const char *format, ...);
+int omp_vprintf(const char *Format, __builtin_va_list vlist) {
+  return vprintf(Format, vlist);
 }
-} // namespace impl
 #pragma omp end declare variant
 
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-#ifdef OMPTARGET_HAS_LIBC
-// TODO: Remove this handling once we have varargs support.
-extern "C" struct FILE *stdout;
-extern "C" int32_t rpc_fprintf(FILE *, const char *, void *, uint64_t);
-
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
-  return rpc_fprintf(stdout, Format, Arguments, Size);
-}
+int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
+#pragma omp end declare variant
 } // namespace impl
-#else
-// We do not have a vprintf implementation for AMD GPU so we use a stub.
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return -1;
+
+extern "C" int printf(const char *Format, ...) {
+  __builtin_va_list vlist;
+  __builtin_va_start(vlist, Format);
+  return impl::omp_vprintf(Format, vlist);
 }
-} // namespace impl
-#endif
-#pragma omp end declare variant
+#endif // OMPTARGET_HAS_LIBC
 
 extern "C" {
-
 [[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
   auto *L = reinterpret_cast<const unsigned char *>(lhs);
   auto *R = reinterpret_cast<const unsigned char *>(rhs);
@@ -65,11 +54,6 @@ extern "C" {
   for (size_t I = 0; I < count; ++I)
     dstc[I] = C;
 }
-
-/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
-  return impl::omp_vprintf(Format, Arguments, Size);
-}
 }
 
 #pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 031a5ced25518..15b991f202539 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -166,9 +166,6 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);
 
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
-  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
@@ -178,6 +175,10 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     return;
   }
 
+  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
+  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
+
   // We do *not* create a new data environment because all threads in the team
   // that are active are now running this parallel region. They share the
   // TeamState, which has an increase level-var and potentially active-level
diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake
index 11eafeb764260..6609d6301d0f9 100644
--- a/offload/cmake/OpenMPTesting.cmake
+++ b/offload/cmake/OpenMPTesting.cmake
@@ -37,6 +37,17 @@ function(find_standalone_test_dependencies)
     return()
   endif()
 
+  find_program(OFFLOAD_DEVICE_INFO_EXECUTABLE
+    NAMES llvm-offload-device-info
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OFFLOAD_DEVICE_INFO_EXECUTABLE)
+    message(STATUS "Cannot find 'llvm-offload-device-info'.")
+    message(STATUS "Please put 'not' in your PATH, set OFFLOAD_DEVICE_INFO_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+
   find_program(OPENMP_NOT_EXECUTABLE
     NAMES not
     PATHS ${OPENMP_LLVM_TOOLS_DIR})
@@ -71,6 +82,7 @@ else()
     set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck)
   endif()
   set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not)
+  set(OFFLOAD_DEVICE_INFO_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-offload-device-info)
 endif()
 
 # Macro to extract information about compiler from file. (no own scope)
diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h
index fce2adca49c11..94b6d1090b5a8 100644
--- a/offload/include/PluginManager.h
+++ b/offload/include/PluginManager.h
@@ -113,8 +113,14 @@ struct PluginManager {
     return Devices.getExclusiveAccessor();
   }
 
-  // Initialize all plugins.
-  void initAllPlugins();
+  /// Initialize \p Plugin. Returns true on success.
+  bool initializePlugin(GenericPluginTy &Plugin);
+
+  /// Initialize device \p DeviceNo of \p Plugin. Returns true on success.
+  bool initializeDevice(GenericPluginTy &Plugin, int32_t DeviceId);
+
+  /// Eagerly initialize all plugins and their devices.
+  void initializeAllDevices();
 
   /// Iterator range for all plugins (in use or not, but always valid).
   auto plugins() { return llvm::make_pointee_range(Plugins); }
diff --git a/offload/include/Shared/EnvironmentVar.h b/offload/include/Shared/EnvironmentVar.h
index 4cbdad695a0ee..82f434e91a85b 100644
--- a/offload/include/Shared/EnvironmentVar.h
+++ b/offload/include/Shared/EnvironmentVar.h
@@ -28,6 +28,7 @@ struct StringParser {
 /// Class for reading and checking environment variables. Currently working with
 /// integer, floats, std::string and bool types.
 template <typename Ty> class Envar {
+  llvm::StringRef Name;
   Ty Data;
   bool IsPresent;
   bool Initialized;
@@ -53,7 +54,7 @@ template <typename Ty> class Envar {
   /// take the value read from the environment variable, or the default if it
   /// was not set or not correct. This constructor is not fallible.
   Envar(llvm::StringRef Name, Ty Default = Ty())
-      : Data(Default), IsPresent(false), Initialized(true) {
+      : Name(Name), Data(Default), IsPresent(false), Initialized(true) {
 
     if (const char *EnvStr = getenv(Name.data())) {
       // Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ template <typename Ty> class Envar {
   /// Get the definitive value.
   operator Ty() const { return get(); }
 
+  /// Return the environment variable name.
+  llvm::StringRef getName() const { return Name; }
+
   /// Indicate whether the environment variable was defined and valid.
   bool isPresent() const { return IsPresent; }
 
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 64a1d3308aed0..5d9fb5d7dc7cd 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -31,6 +31,7 @@ typedef enum {
   HSA_STATUS_ERROR = 0x1000,
   HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
   HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+  HSA_STATUS_ERROR_EXCEPTION = 0x1016,
 } hsa_status_t;
 
 hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e6643d3260eb4..604683370cd27 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -13,13 +13,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <deque>
+#include <functional>
 #include <mutex>
 #include <string>
 #include <system_error>
 #include <unistd.h>
 #include <unordered_map>
 
+#include "ErrorReporting.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -43,6 +46,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 
 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
   }
 
   /// Callack that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);
 
   /// The HSA queue.
   hsa_queue_t *Queue;
@@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
     return true;
   }
 
+  const AMDGPUQueueTy *getQueue() const { return Queue; }
+
   /// Record the state of the stream on an event.
   Error recordEvent(AMDGPUEventTy &Event) const;
 
@@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
         NextQueue(0), Agent(HSAAgent) {}
 
@@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;
 
     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,7 +1664,7 @@ struct AMDGPUStreamManagerTy final
     }
 
     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];
@@ -1668,6 +1672,9 @@ struct AMDGPUStreamManagerTy final
     return Plugin::success();
   }
 
+  /// The device associated with this stream.
+  GenericDeviceTy &Device;
+
   /// Envar for controlling the tracking of busy HSA queues.
   BoolEnvar OMPX_QueueTracking;
 
@@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     Initialized = true;
 
     // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    Status = hsa_amd_register_system_event_handler(eventHandler, this);
     if (auto Err = Plugin::check(
             Status, "Error in hsa_amd_register_system_event_handler: %s"))
       return std::move(Err);
@@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
 private:
   /// Event handler that will be called by ROCr if an event is detected.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+                                   void *PluginPtr) {
     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
       return HSA_STATUS_SUCCESS;
 
@@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     uint32_t Node = -1;
     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
 
+    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+    for (uint32_t I = 0, E = Plugin.getNumDevices();
+         Node != uint32_t(-1) && I < E; ++I) {
+      AMDGPUDeviceTy &AMDGPUDevice =
+          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+      auto KernelTraceInfoRecord =
+          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+      uint32_t DeviceNode = -1;
+      if (auto Err =
+              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+        consumeError(std::move(Err));
+        continue;
+      }
+      if (DeviceNode != Node)
+        continue;
+
+      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+    }
+
     // Abort the execution since we do not recover from this error.
     FATAL_MESSAGE(1,
                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3508,28 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }
 
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+    auto KernelTraceInfoRecord =
+        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+        [=](__tgt_async_info &AsyncInfo) {
+          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+          if (!Stream || !Stream->getQueue())
+            return false;
+          return Stream->getQueue()->Queue == Source;
+        };
+    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+                                      AsyncInfoWrapperMatcher);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
new file mode 100644
index 0000000000000..e557b32c2c24f
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -0,0 +1,318 @@
+//===- ErrorReporting.h - Helper to provide nice error messages ----- c++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+
+#include "PluginInterface.h"
+#include "Shared/EnvironmentVar.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <optional>
+#include <string>
+#include <unistd.h>
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+class ErrorReporter {
+
+  enum ColorTy {
+    Yellow = int(HighlightColor::Address),
+    Green = int(HighlightColor::String),
+    DarkBlue = int(HighlightColor::Tag),
+    Cyan = int(HighlightColor::Attribute),
+    DarkPurple = int(HighlightColor::Enumerator),
+    DarkRed = int(HighlightColor::Macro),
+    BoldRed = int(HighlightColor::Error),
+    BoldLightPurple = int(HighlightColor::Warning),
+    BoldDarkGrey = int(HighlightColor::Note),
+    BoldLightBlue = int(HighlightColor::Remark),
+  };
+
+  /// The banner printed at the beginning of an error report.
+  static constexpr auto ErrorBanner = "OFFLOAD ERROR: ";
+
+  /// Return the device id as string, or n/a if not available.
+  static std::string getDeviceIdStr(GenericDeviceTy *Device) {
+    return Device ? std::to_string(Device->getDeviceId()) : "n/a";
+  }
+
+  /// Return a nice name for an TargetAllocTy.
+  static StringRef getAllocTyName(TargetAllocTy Kind) {
+    switch (Kind) {
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      return "device memory";
+    case TARGET_ALLOC_HOST:
+      return "pinned host memory";
+    case TARGET_ALLOC_SHARED:
+      return "managed memory";
+      break;
+    }
+    llvm_unreachable("Unknown target alloc kind");
+  }
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgcc-compat"
+#pragma clang diagnostic ignored "-Wformat-security"
+  /// Print \p Format, instantiated with \p Args to stderr.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
+                                                      ArgsTy &&...Args) {
+    raw_fd_ostream OS(STDERR_FILENO, false);
+    OS << llvm::format(Format, Args...);
+  }
+
+  /// Print \p Format, instantiated with \p Args to stderr, but colored.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 2, 3)]] static void
+  print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
+    raw_fd_ostream OS(STDERR_FILENO, false);
+    WithColor(OS, HighlightColor(Color)) << llvm::format(Format, Args...);
+  }
+
+  /// Print \p Format, instantiated with \p Args to stderr, but colored and with
+  /// a banner.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
+                                                            ArgsTy &&...Args) {
+    print(BoldRed, "%s", ErrorBanner);
+    print(BoldRed, Format, Args...);
+    print("\n");
+  }
+#pragma clang diagnostic pop
+
+  static void reportError(const char *Str) { reportError("%s", Str); }
+  static void print(const char *Str) { print("%s", Str); }
+  static void print(StringRef Str) { print("%s", Str.str().c_str()); }
+  static void print(ColorTy Color, const char *Str) { print(Color, "%s", Str); }
+  static void print(ColorTy Color, StringRef Str) {
+    print(Color, "%s", Str.str().c_str());
+  }
+
+  /// Pretty print a stack trace.
+  static void reportStackTrace(StringRef StackTrace) {
+    if (StackTrace.empty())
+      return;
+
+    SmallVector<StringRef> Lines, Parts;
+    StackTrace.split(Lines, "\n", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+    int Start = Lines.empty() || !Lines[0].contains("PrintStackTrace") ? 0 : 1;
+    unsigned NumDigits =
+        (int)(floor(log10(Lines.size() - Start - /*0*/ 1)) + 1);
+    for (int I = Start, E = Lines.size(); I < E; ++I) {
+      auto Line = Lines[I];
+      Parts.clear();
+      Line = Line.drop_while([](char C) { return std::isspace(C); });
+      Line.split(Parts, " ", /*MaxSplit=*/2);
+      if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
+        print("%s\n", Line.str().c_str());
+        continue;
+      }
+      unsigned FrameIdx = std::stoi(Parts[0].drop_front(1).str());
+      if (Start)
+        FrameIdx -= 1;
+      print(DarkPurple, "    %s", Parts[0].take_front().str().c_str());
+      print(Green, "%*u", NumDigits, FrameIdx);
+      print(BoldLightBlue, " %s", Parts[1].str().c_str());
+      print(" %s\n", Parts[2].str().c_str());
+    }
+    print("\n");
+  }
+
+  /// Report information about an allocation associated with \p ATI.
+  static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
+    if (!ATI)
+      return;
+
+    if (!ATI->DeallocationTrace.empty()) {
+      print(BoldLightPurple, "Last deallocation:\n");
+      reportStackTrace(ATI->DeallocationTrace);
+    }
+
+    if (ATI->HostPtr)
+      print(BoldLightPurple,
+            "Last allocation of size %lu for host pointer %p:\n", ATI->Size,
+            ATI->HostPtr);
+    else
+      print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
+    reportStackTrace(ATI->AllocationTrace);
+    if (!ATI->LastAllocationInfo)
+      return;
+
+    unsigned I = 0;
+    print(BoldLightPurple, "Prior allocations with the same base pointer:");
+    while (ATI->LastAllocationInfo) {
+      print("\n");
+      ATI = ATI->LastAllocationInfo;
+      print(BoldLightPurple, " #%u Prior deallocation of size %lu:\n", I,
+            ATI->Size);
+      reportStackTrace(ATI->DeallocationTrace);
+      if (ATI->HostPtr)
+        print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
+              I, ATI->HostPtr);
+      else
+        print(BoldLightPurple, " #%u Prior allocation:\n", I);
+      reportStackTrace(ATI->AllocationTrace);
+      ++I;
+    }
+  }
+
+  /// End the execution of the program.
+  static void abortExecution() { abort(); }
+
+public:
+#define DEALLOCATION_ERROR(Format, ...)                                        \
+  reportError(Format, __VA_ARGS__);                                            \
+  reportStackTrace(StackTrace);                                                \
+  reportAllocationInfo(ATI);                                                   \
+  abortExecution();
+
+  static void reportDeallocationOfNonAllocatedPtr(void *DevicePtr,
+                                                  TargetAllocTy Kind,
+                                                  AllocationTraceInfoTy *ATI,
+                                                  std::string &StackTrace) {
+    DEALLOCATION_ERROR("deallocation of non-allocated %s: %p",
+                       getAllocTyName(Kind).data(), DevicePtr);
+  }
+
+  static void reportDeallocationOfDeallocatedPtr(void *DevicePtr,
+                                                 TargetAllocTy Kind,
+                                                 AllocationTraceInfoTy *ATI,
+                                                 std::string &StackTrace) {
+    DEALLOCATION_ERROR("double-free of %s: %p", getAllocTyName(Kind).data(),
+                       DevicePtr);
+  }
+
+  static void reportDeallocationOfWrongPtrKind(void *DevicePtr,
+                                               TargetAllocTy Kind,
+                                               AllocationTraceInfoTy *ATI,
+                                               std::string &StackTrace) {
+    DEALLOCATION_ERROR("deallocation requires %s but allocation was %s: %p",
+                       getAllocTyName(Kind).data(),
+                       getAllocTyName(ATI->Kind).data(), DevicePtr);
+#undef DEALLOCATION_ERROR
+  }
+
+  /// Report that a kernel encountered a trap instruction.
+  static void reportTrapInKernel(
+      GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
+      std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
+    assert(AsyncInfoWrapperMatcher && "A matcher is required");
+
+    uint32_t Idx = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      // Skip kernels issued in other queues.
+      if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+        continue;
+      Idx = I;
+      break;
+    }
+
+    auto KTI = KTIR.getKernelTraceInfo(Idx);
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
+      auto PrettyKernelName =
+          llvm::omp::prettifyFunctionName(KTI.Kernel->getName());
+      reportError("Kernel '%s'", PrettyKernelName.c_str());
+    }
+    reportError("execution interrupted by hardware trap instruction");
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
+      if (!KTI.LaunchTrace.empty())
+        reportStackTrace(KTI.LaunchTrace);
+      else
+        print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
+              Device.OMPX_TrackNumKernelLaunches.getName().data());
+    }
+    abort();
+  }
+
+  /// Report the kernel traces taken from \p KTIR, up to
+  /// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
+  static void reportKernelTraces(GenericDeviceTy &Device,
+                                 KernelTraceInfoRecordTy &KTIR) {
+    uint32_t NumKTIs = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      ++NumKTIs;
+    }
+    if (NumKTIs == 0) {
+      print(BoldRed, "No kernel launches known\n");
+      return;
+    }
+
+    uint32_t TracesToShow =
+        std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
+    if (TracesToShow == 0) {
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Display only launched kernel:\n");
+      else
+        print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
+    } else {
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Display kernel launch trace:\n");
+      else
+        print(BoldLightPurple,
+              "Display %u of the %u last kernel launch traces:\n", TracesToShow,
+              NumKTIs);
+    }
+
+    for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
+      auto KTI = KTIR.getKernelTraceInfo(Idx);
+      auto PrettyKernelName =
+          llvm::omp::prettifyFunctionName(KTI.Kernel->getName());
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Kernel '%s'\n", PrettyKernelName.c_str());
+      else
+        print(BoldLightPurple, "Kernel %d: '%s'\n", I,
+              PrettyKernelName.c_str());
+      reportStackTrace(KTI.LaunchTrace);
+      ++I;
+    }
+
+    if (NumKTIs != 1) {
+      print(Yellow,
+            "Use '%s=<num>' to adjust the number of shown stack traces (%u "
+            "now, up to %zu)\n",
+            Device.OMPX_TrackNumKernelLaunches.getName().data(),
+            Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
+    }
+    // TODO: Let users know how to serialize kernels
+  }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 973add0ba1000..81823338fe211 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
 #include <shared_mutex>
 #include <vector>
 
+#include "ExclusiveAccess.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -382,6 +383,73 @@ struct GenericKernelTy {
   bool IsBareKernel = false;
 };
 
+/// Information about an allocation, when it has been allocated, and when/if it
+/// has been deallocated, for error reporting purposes.
+struct AllocationTraceInfoTy {
+
+  /// The stack trace of the allocation itself.
+  std::string AllocationTrace;
+
+  /// The stack trace of the deallocation, or empty.
+  std::string DeallocationTrace;
+
+  /// The allocated device pointer.
+  void *DevicePtr = nullptr;
+
+  /// The corresponding host pointer (can be null).
+  void *HostPtr = nullptr;
+
+  /// The size of the allocation.
+  uint64_t Size = 0;
+
+  /// The kind of the allocation.
+  TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
+
+  /// Information about the last allocation at this address, if any.
+  AllocationTraceInfoTy *LastAllocationInfo = nullptr;
+
+  /// Lock to keep accesses race free.
+  std::mutex Lock;
+};
+
+/// Information about an allocation, when it has been allocated, and when/if it
+/// has been deallocated, for error reporting purposes.
+struct KernelTraceInfoTy {
+
+  /// The launched kernel.
+  GenericKernelTy *Kernel;
+
+  /// The stack trace of the launch itself.
+  std::string LaunchTrace;
+
+  /// The async info the kernel was launched in.
+  __tgt_async_info *AsyncInfo;
+};
+
+struct KernelTraceInfoRecordTy {
+  KernelTraceInfoRecordTy() { KTIs.fill({}); }
+
+  /// Return the (maximal) record size.
+  auto size() const { return KTIs.size(); }
+
+  /// Create a new kernel trace info and add it into the record.
+  void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
+               __tgt_async_info *AsyncInfo) {
+    KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
+    Idx = (Idx + 1) % size();
+  }
+
+  /// Return the \p I'th last kernel trace info.
+  auto getKernelTraceInfo(int32_t I) const {
+    // Note that kernel trace infos "grow forward", so lookup is backwards.
+    return KTIs[(Idx - I - 1 + size()) % size()];
+  }
+
+private:
+  std::array<KernelTraceInfoTy, 8> KTIs;
+  unsigned Idx = 0;
+};
+
 /// Class representing a map of host pinned allocations. We track these pinned
 /// allocations, so memory tranfers invloving these buffers can be optimized.
 class PinnedAllocationMapTy {
@@ -866,6 +934,18 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
 
+  /// Map to record when allocations have been performed, and when they have
+  /// been deallocated, both for error reporting purposes.
+  ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
+
+  /// Map to record kernel have been launchedl, for error reporting purposes.
+  ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
+
+  /// Environment variable to determine if stack traces for kernel launches are
+  /// tracked.
+  UInt32Envar OMPX_TrackNumKernelLaunches =
+      UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -916,6 +996,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_InitialNumStreams;
   UInt32Envar OMPX_InitialNumEvents;
 
+  /// Environment variable to determine if stack traces for allocations and
+  /// deallocations are tracked.
+  BoolEnvar OMPX_TrackAllocationTraces =
+      BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
+
   /// Array of images loaded into the device. Images are automatically
   /// deallocated by the allocator.
   llvm::SmallVector<DeviceImageTy *> LoadedImages;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 118265973f327..c3ecbcc62f71f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -14,6 +14,7 @@
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
 
+#include "ErrorReporting.h"
 #include "GlobalHandler.h"
 #include "JIT.h"
 #include "Utils/ELF.h"
@@ -30,6 +31,8 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
 
 #include <cstdint>
 #include <limits>
@@ -1337,6 +1340,25 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
     if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
       return std::move(Err);
 
+  // Keep track of the allocation stack if we track allocation traces.
+  if (OMPX_TrackAllocationTraces) {
+    std::string StackTrace;
+    llvm::raw_string_ostream OS(StackTrace);
+    llvm::sys::PrintStackTrace(OS);
+
+    AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
+    ATI->AllocationTrace = std::move(StackTrace);
+    ATI->DevicePtr = Alloc;
+    ATI->HostPtr = HostPtr;
+    ATI->Size = Size;
+    ATI->Kind = Kind;
+
+    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+    auto *&MapATI = (*AllocationTraceMap)[Alloc];
+    ATI->LastAllocationInfo = MapATI;
+    MapATI = ATI;
+  }
+
   return Alloc;
 }
 
@@ -1345,6 +1367,37 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
   if (Plugin.getRecordReplay().isRecordingOrReplaying())
     return Plugin::success();
 
+  // Keep track of the deallocation stack if we track allocation traces.
+  if (OMPX_TrackAllocationTraces) {
+    AllocationTraceInfoTy *ATI = nullptr;
+    {
+      auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+      ATI = (*AllocationTraceMap)[TgtPtr];
+    }
+
+    std::string StackTrace;
+    llvm::raw_string_ostream OS(StackTrace);
+    llvm::sys::PrintStackTrace(OS);
+
+    if (!ATI)
+      ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
+                                                         StackTrace);
+
+    // ATI is not null, thus we can lock it to inspect and modify it further.
+    std::lock_guard<std::mutex> LG(ATI->Lock);
+    if (!ATI->DeallocationTrace.empty())
+      ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
+                                                        StackTrace);
+
+    if (ATI->Kind != Kind)
+      ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
+                                                      StackTrace);
+
+    ATI->DeallocationTrace = StackTrace;
+
+#undef DEALLOCATION_ERROR
+  }
+
   int Res;
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
@@ -1415,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
+  {
+    std::string StackTrace;
+    if (OMPX_TrackNumKernelLaunches) {
+      llvm::raw_string_ostream OS(StackTrace);
+      llvm::sys::PrintStackTrace(OS);
+    }
+
+    auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
+    (*KernelTraceInfoRecord)
+        .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
+  }
+
   auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                   AsyncInfoWrapper);
 
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index efa5cdab33ec9..344069b6fcdcf 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -28,6 +28,7 @@ add_llvm_library(omptarget
   ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
 
   LINK_COMPONENTS
+  FrontendOpenMP
   Support
   Object
 
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 5e8f91792a551..c6117782fbab6 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -13,6 +13,7 @@
 #include "PluginManager.h"
 #include "Shared/Debug.h"
 #include "Shared/Profile.h"
+#include "device.h"
 
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -60,15 +61,61 @@ void PluginManager::deinit() {
   DP("RTLs unloaded!\n");
 }
 
-void PluginManager::initAllPlugins() {
-  for (auto &R : plugins()) {
-    if (auto Err = R.init()) {
-      [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-      DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+bool PluginManager::initializePlugin(GenericPluginTy &Plugin) {
+  if (Plugin.is_initialized())
+    return true;
+
+  if (auto Err = Plugin.init()) {
+    [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+    DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+    return false;
+  }
+
+  DP("Registered plugin %s with %d visible device(s)\n", Plugin.getName(),
+     Plugin.number_of_devices());
+  return true;
+}
+
+bool PluginManager::initializeDevice(GenericPluginTy &Plugin,
+                                     int32_t DeviceId) {
+  if (Plugin.is_device_initialized(DeviceId))
+    return true;
+
+  // Initialize the device information for the RTL we are about to use.
+  auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
+
+  int32_t UserId = ExclusiveDevicesAccessor->size();
+
+  // Set the device identifier offset in the plugin.
+#ifdef OMPT_SUPPORT
+  Plugin.set_device_identifier(UserId, DeviceId);
+#endif
+
+  auto Device = std::make_unique<DeviceTy>(&Plugin, UserId, DeviceId);
+  if (auto Err = Device->init()) {
+    [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+    DP("Failed to init device %d: %s\n", DeviceId, InfoMsg.c_str());
+    return false;
+  }
+
+  ExclusiveDevicesAccessor->push_back(std::move(Device));
+
+  // We need to map between the plugin's device identifier and the one
+  // that OpenMP will use.
+  PM->DeviceIds[std::make_pair(&Plugin, DeviceId)] = UserId;
+
+  return true;
+}
+
+void PluginManager::initializeAllDevices() {
+  for (auto &Plugin : plugins()) {
+    if (!initializePlugin(Plugin))
       continue;
+
+    for (int32_t DeviceId = 0; DeviceId < Plugin.number_of_devices();
+         ++DeviceId) {
+      initializeDevice(Plugin, DeviceId);
     }
-    DP("Registered plugin %s with %d visible device(s)\n", R.getName(),
-       R.number_of_devices());
   }
 }
 
@@ -94,19 +141,12 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
-    for (auto &R : PM->plugins()) {
+    for (auto &R : plugins()) {
       if (!R.is_plugin_compatible(Img))
         continue;
 
-      if (!R.is_initialized()) {
-        if (auto Err = R.init()) {
-          [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-          DP("Failed to init plugin: %s\n", InfoMsg.c_str());
-          continue;
-        }
-        DP("Registered plugin %s with %d visible device(s)\n", R.getName(),
-           R.number_of_devices());
-      }
+      if (!initializePlugin(R))
+        continue;
 
       if (!R.number_of_devices()) {
         DP("Skipping plugin %s with no visible devices\n", R.getName());
@@ -120,30 +160,8 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
         DP("Image " DPxMOD " is compatible with RTL %s device %d!\n",
            DPxPTR(Img->ImageStart), R.getName(), DeviceId);
 
-        if (!R.is_device_initialized(DeviceId)) {
-          // Initialize the device information for the RTL we are about to use.
-          auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-
-          int32_t UserId = ExclusiveDevicesAccessor->size();
-
-          // Set the device identifier offset in the plugin.
-#ifdef OMPT_SUPPORT
-          R.set_device_identifier(UserId, DeviceId);
-#endif
-
-          auto Device = std::make_unique<DeviceTy>(&R, UserId, DeviceId);
-          if (auto Err = Device->init()) {
-            [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-            DP("Failed to init device %d: %s\n", DeviceId, InfoMsg.c_str());
-            continue;
-          }
-
-          ExclusiveDevicesAccessor->push_back(std::move(Device));
-
-          // We need to map between the plugin's device identifier and the one
-          // that OpenMP will use.
-          PM->DeviceIds[std::make_pair(&R, DeviceId)] = UserId;
-        }
+        if (!initializeDevice(R, DeviceId))
+          continue;
 
         // Initialize (if necessary) translation table for this library.
         PM->TrlTblMtx.lock();
@@ -219,7 +237,7 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image. We only need to scan RTLs that are already being used.
-    for (auto &R : PM->plugins()) {
+    for (auto &R : plugins()) {
       if (R.is_initialized())
         continue;
 
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 763b051cc6d77..b73e92f17ddb0 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -57,7 +57,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
 /// Initialize all available devices without registering any image
 EXTERN void __tgt_init_all_rtls() {
   assert(PM && "Runtime not initialized");
-  PM->initAllPlugins();
+  PM->initializeAllDevices();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 9bca8529c5ee3..3b627d257a069 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -462,7 +462,9 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
 
   if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
-    FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
+    FATAL_MESSAGE(DeviceNum, "%s",
+                  "Failed to deallocate device ptr. Set "
+                  "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");
 
   DP("omp_target_free deallocated device ptr\n");
 }
diff --git a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
index dd4d59257714b..0b4d9d6ea9d46 100644
--- a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
+++ b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
@@ -2,7 +2,6 @@
 // RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
 // RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
 // REQUIRES: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
index 844c25dc9e025..656c3a20aaf82 100644
--- a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
+++ b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
@@ -2,7 +2,6 @@
 // RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
 // RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
 // REQUIRES: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa
 
 #include "omp_dynamic_shared_memory_mixed.inc"
 // CHECK: PASS
diff --git a/offload/test/libc/assert.c b/offload/test/libc/assert.c
index c2c5055c47051..bf155b6f463bf 100644
--- a/offload/test/libc/assert.c
+++ b/offload/test/libc/assert.c
@@ -2,9 +2,6 @@
 // RUN:   %fcheck-generic --check-prefix=CHECK
 
 // REQUIRES: libc
-
-// NVPTX without LTO uses the implementation in OpenMP currently.
-// UNSUPPORTED: nvptx64-nvidia-cuda
 // REQUIRES: gpu
 
 #include <assert.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index c508f03310f78..d0a2b72347e2c 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -183,13 +183,6 @@ def remove_suffix_if_present(name):
         return name
 
 def add_libraries(source):
-    if config.libomptarget_has_libc:
-        if config.libomptarget_current_target.startswith('nvptx'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
-        elif config.libomptarget_current_target.startswith('amdgcn'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
     return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
 
 # Add platform targets
@@ -419,3 +412,5 @@ config.substitutions.append(("%flags_clang", config.test_flags_clang))
 config.substitutions.append(("%flags_flang", config.test_flags_flang))
 config.substitutions.append(("%flags", config.test_flags))
 config.substitutions.append(("%not", config.libomptarget_not))
+config.substitutions.append(("%offload-device-info",
+                             config.offload_device_info))
diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in
index 43751970cac27..62ada1d81721d 100644
--- a/offload/test/lit.site.cfg.in
+++ b/offload/test/lit.site.cfg.in
@@ -23,6 +23,7 @@ config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()
 config.libomptarget_current_target = "@CURRENT_TARGET@"
 config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
 config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
+config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@"
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c
index 237e1585455c5..35ecf55aa8c53 100644
--- a/offload/test/offloading/bug51781.c
+++ b/offload/test/offloading/bug51781.c
@@ -31,7 +31,6 @@
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 //
 // CUSTOM: Rewriting generic-mode kernel with a customized state machine.
-// XFAIL: amdgcn-amd-amdhsa
 
 #if ADD_REDUCTION
 #define REDUCTION(...) reduction(__VA_ARGS__)
diff --git a/offload/test/offloading/bug51982.c b/offload/test/offloading/bug51982.c
index c037d215a8a79..91ce4a264e238 100644
--- a/offload/test/offloading/bug51982.c
+++ b/offload/test/offloading/bug51982.c
@@ -1,7 +1,6 @@
 // RUN: %libomptarget-compile-generic -O1 && %libomptarget-run-generic
 // -O1 to run openmp-opt
 // RUN: %libomptarget-compileopt-generic -O1 && %libomptarget-run-generic
-// XFAIL: amdgcn-amd-amdhsa
 
 int main(void) {
   long int aa = 0;
diff --git a/offload/test/offloading/default_thread_limit.c b/offload/test/offloading/default_thread_limit.c
index c195d9ed3fbb7..ad8d08fdef299 100644
--- a/offload/test/offloading/default_thread_limit.c
+++ b/offload/test/offloading/default_thread_limit.c
@@ -2,6 +2,9 @@
 // RUN: %libomptarget-compile-generic
 // RUN: env LIBOMPTARGET_INFO=16 \
 // RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+// RUN: %libomptarget-compile-generic -g
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
 
 // REQUIRES: amdgpu
 
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
new file mode 100644
index 0000000000000..928eb671c9706
--- /dev/null
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -0,0 +1,69 @@
+! Offloading test checking the use of the depend clause on
+! the target construct
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  integer :: a = 0
+  INTERFACE
+     FUNCTION omp_get_device_num() BIND(C)
+       USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
+       integer :: omp_get_device_num
+     END FUNCTION omp_get_device_num
+  END INTERFACE
+
+  call foo(5, a)
+  print*, "======= FORTRAN Test passed! ======="
+  print*, "foo(5) returned ", a, ", expected 6\n"
+
+  !       stop 0
+  contains
+    subroutine foo(N, r)
+      integer, intent(in) :: N
+      integer, intent(out) :: r
+      integer :: z, i, accumulator
+      z = 1
+      accumulator = 0
+      ! Spawn 3 threads
+      !$omp parallel num_threads(3)
+
+      ! A single thread will then create two tasks - one is the 'producer' and
+      ! potentially slower task that updates 'z' to 'N'. The second is an
+      ! offloaded target task that increments 'z'. If the depend clauses work
+      ! properly, the target task should wait for the 'producer' task to
+      ! complete before incrementing 'z'. We use 'omp single' here because the
+      ! depend clause establishes dependencies between sibling tasks only.
+      ! This is the easiest way of creating two sibling tasks.
+      !$omp single
+      !$omp task depend(out: z) shared(z)
+      do i=1, 32766
+         ! dumb loop nest to slow down the update of 'z'.
+         ! Adding a function call slows down the producer to the point
+         ! that removing the depend clause from the target construct below
+         ! frequently results in the wrong answer.
+         accumulator = accumulator + omp_get_device_num()
+      end do
+      z = N
+      !$omp end task
+
+      ! z is 5 now. Increment z to 6.
+      !$omp target map(tofrom: z) depend(in:z)
+      z = z + 1
+      !$omp end target
+      !$omp end single
+      !$omp end parallel
+      ! Use 'accumulator' so it is not optimized away by the compiler.
+      print *, accumulator
+      r = z
+    end subroutine foo
+
+!CHECK: ======= FORTRAN Test passed! =======
+!CHECK: foo(5) returned 6 , expected 6
+end program main
diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c
new file mode 100644
index 0000000000000..ca7310e34fc9d
--- /dev/null
+++ b/offload/test/sanitizer/double_free.c
@@ -0,0 +1,68 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  void *Ptr1 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr1, 0);
+  void *Ptr2 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr2, 0);
+  void *Ptr3 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr3, 0);
+  omp_target_free(Ptr2, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK:   dataDelete
+// CHECK:   omp_target_free
+// NDEBG:   main
+// DEBUG:   main {{.*}}double_free.c:25
+//
+// CHECK: Last deallocation:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:24
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:23
+//
+// CHECK: Prior allocations with the same base pointer:
+// CHECK: #0 Prior deallocation of size 8:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:22
+//
+// CHECK: #0 Prior allocation:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior deallocation of size 8:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior allocation:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:19
diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c
new file mode 100644
index 0000000000000..3b4f2d5c51571
--- /dev/null
+++ b/offload/test/sanitizer/double_free_racy.c
@@ -0,0 +1,33 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  void *Ptr1 = omp_target_alloc(8, 0);
+#pragma omp parallel num_threads(4)
+  omp_target_free(Ptr1, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK   dataDelete
+// CHECK:  omp_target_free
+//
+// CHECK: Last deallocation:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
diff --git a/offload/test/sanitizer/free_host_ptr.c b/offload/test/sanitizer/free_host_ptr.c
new file mode 100644
index 0000000000000..1ec68635499a9
--- /dev/null
+++ b/offload/test/sanitizer/free_host_ptr.c
@@ -0,0 +1,25 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int X;
+  omp_target_free(&X, 0);
+}
+
+// CHECK:  OFFLOAD ERROR: deallocation of non-allocated device memory: 0x
+// CHECK:   dataDelete
+// NDEBG:   main
+// DEBUG:   main {{.*}}free_host_ptr.c:20
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c
new file mode 100644
index 0000000000000..0c178541db117
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
+
+int main(void) {
+  void *P = llvm_omp_target_alloc_host(8, 0);
+  omp_target_free(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires device memory but allocation was pinned host memory: 0x 
+// CHECK:  dataDelete 
+// CHECK:  omp_target_free
+// NDEBG: main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.c:22
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  llvm_omp_target_alloc_host
+// NDEBG:  main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.c:21
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
new file mode 100644
index 0000000000000..87a52c5d4baf2
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
@@ -0,0 +1,38 @@
+// clang-format off
+// RUN: %libomptarget-compileoptxx-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileoptxx-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
+}
+
+int main(void) {
+  void *P = llvm_omp_target_alloc_shared(8, 0);
+  llvm_omp_target_free_host(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires pinned host memory but allocation was managed memory: 0x
+// CHECK:  dataDelete 
+// CHECK:  llvm_omp_target_free_host
+// NDEBG: main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.cpp:25
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  llvm_omp_target_alloc_shared
+// NDEBG:  main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.cpp:24
diff --git a/offload/test/sanitizer/kernel_crash.c b/offload/test/sanitizer/kernel_crash.c
new file mode 100644
index 0000000000000..c69219d97d3d0
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash.c
@@ -0,0 +1,47 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+    *A = 42;
+  }
+#pragma omp target
+  {
+  }
+}
+// TRACE: Display 1 of the 3 last kernel launch traces
+// TRACE: Kernel 0: {{.*}} (__omp_offloading_{{.*}}_main_l30)
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash.c:30
+//
+// CHECK: Display last 3 kernels launched:
+// CHECK: Kernel 0: {{.*}} (__omp_offloading_{{.*}}_main_l30)
+// CHECK: Kernel 1: {{.*}} (__omp_offloading_{{.*}}_main_l27)
+// CHECK: Kernel 2: {{.*}} (__omp_offloading_{{.*}}_main_l24)
diff --git a/offload/test/sanitizer/kernel_crash_async.c b/offload/test/sanitizer/kernel_crash_async.c
new file mode 100644
index 0000000000000..6a0461b0045b2
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_async.c
@@ -0,0 +1,40 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target nowait
+  {
+  }
+#pragma omp target nowait
+  {
+  }
+#pragma omp target nowait
+  {
+    *A = 42;
+  }
+#pragma omp taskwait
+}
+
+// TRACE: Kernel {{.*}} (__omp_offloading_{{.*}}_main_l30)
+// TRACE:     launchKernel
+//
+// CHECK: Kernel {{[0-9]}}: {{.*}} (__omp_offloading_{{.*}}_main_l30)
diff --git a/offload/test/sanitizer/kernel_crash_many.c b/offload/test/sanitizer/kernel_crash_many.c
new file mode 100644
index 0000000000000..25986e0a459c1
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_many.c
@@ -0,0 +1,73 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target
+    {
+    }
+  }
+#pragma omp target
+  {
+    *A = 42;
+  }
+}
+// CHECK: Display 8 of the 8 last kernel launch traces
+// CHECK: Kernel 0: {{.*}} (__omp_offloading_{{.*}}_main_l27)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:27
+//
+// CHECK: Kernel 1: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 2: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 3: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 4: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 5: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 6: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 7: {{.*}} (__omp_offloading_{{.*}}_main_l23)
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK-NOT: Kernel {{[[0-9]]+}}:
diff --git a/offload/test/sanitizer/kernel_crash_single.c b/offload/test/sanitizer/kernel_crash_single.c
new file mode 100644
index 0000000000000..075c3de7ffabb
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_single.c
@@ -0,0 +1,36 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target
+  {
+    *A = 42;
+  }
+}
+// TRACE: Display kernel launch trace
+// TRACE: Kernel {{.*}} (__omp_offloading_{{.*}}_main_l24)
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_single.c:24
+//
+// CHECK: Display only launched kernel:
+// CHECK: Kernel {{.*}} (__omp_offloading_{{.*}}_main_l24)
diff --git a/offload/test/sanitizer/kernel_trap.c b/offload/test/sanitizer/kernel_trap.c
new file mode 100644
index 0000000000000..db243001c9056
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap.c
@@ -0,0 +1,44 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+    __builtin_trap();
+  }
+#pragma omp target
+  {
+  }
+}
+// clang-format off
+// CHECK: OFFLOAD ERROR: Kernel 'omp target in main @ 30 (__omp_offloading_{{.*}}_main_l30)'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_trap.c:
+// clang-format on
diff --git a/offload/test/sanitizer/kernel_trap.cpp b/offload/test/sanitizer/kernel_trap.cpp
new file mode 100644
index 0000000000000..899b608d579a7
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap.cpp
@@ -0,0 +1,52 @@
+
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compilexx-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+struct S {};
+
+template <typename T> void cxx_function_name(int I, T *) {
+
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+  }
+#pragma omp target
+  {
+    __builtin_trap();
+  }
+#pragma omp target
+  {
+  }
+}
+
+int main(void) {
+  struct S s;
+  cxx_function_name(1, &s);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: Kernel 'omp target in void cxx_function_name<S>(int, S*) @ [[LINE:[0-9]+]] (__omp_offloading_{{.*}}__Z17cxx_function_nameI1SEviPT__l[[LINE]])'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// NDEBG:     cxx_function_name<S>(int, S*)
+// NDEBG:     main
+// DEBUG:     cxx_function_name<S>(int, S*) {{.*}}kernel_trap.cpp:
+// DEBUG:     main {{.*}}kernel_trap.cpp:
+// clang-format on
diff --git a/offload/test/sanitizer/kernel_trap_async.c b/offload/test/sanitizer/kernel_trap_async.c
new file mode 100644
index 0000000000000..ee0d772fef9b8
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_async.c
@@ -0,0 +1,42 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target nowait
+  {
+  }
+#pragma omp target nowait
+  {
+  }
+#pragma omp target nowait
+  {
+    __builtin_trap();
+  }
+#pragma omp taskwait
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: Kernel {{.*}} (__omp_offloading_{{.*}}_main_l30)
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// DEBUG:     kernel_trap_async.c:
+// clang-format on
diff --git a/offload/test/sanitizer/kernel_trap_many.c b/offload/test/sanitizer/kernel_trap_many.c
new file mode 100644
index 0000000000000..b3bdad9f07b4a
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_many.c
@@ -0,0 +1,36 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target
+    {
+    }
+  }
+#pragma omp target
+  {
+    __builtin_trap();
+  }
+}
+// TRACE: OFFLOAD ERROR: Kernel {{.*}} (__omp_offloading_{{.*}}_main_l27)
+// TRACE: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_trap_many.c:
diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c
new file mode 100644
index 0000000000000..6f497309df2f2
--- /dev/null
+++ b/offload/test/tools/llvm-omp-device-info.c
@@ -0,0 +1,6 @@
+// RUN: %offload-device-info | %fcheck-generic
+//
+// Just check any device was found and something is printed
+//
+// CHECK: Found {{[1-9].*}} devices:
+// CHECK: Device 0:
diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt
index a39de8833bf1d..3787c12f940a6 100644
--- a/offload/tools/deviceinfo/CMakeLists.txt
+++ b/offload/tools/deviceinfo/CMakeLists.txt
@@ -1,13 +1,13 @@
-message(STATUS "Building the llvm-omp-device-info tool")
+message(STATUS "Building the llvm-offload-device-info tool")
 
-add_openmp_tool(llvm-omp-device-info llvm-omp-device-info.cpp)
+add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp)
 
-llvm_update_compile_flags(llvm-omp-device-info)
+llvm_update_compile_flags(llvm-offload-device-info)
 
-target_include_directories(llvm-omp-device-info PRIVATE
+target_include_directories(llvm-offload-device-info PRIVATE
   ${LIBOMPTARGET_INCLUDE_DIR}
 )
-target_link_libraries(llvm-omp-device-info PRIVATE
+target_link_libraries(llvm-offload-device-info PRIVATE
   omp
   omptarget
 )
diff --git a/offload/tools/deviceinfo/llvm-omp-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
similarity index 73%
rename from offload/tools/deviceinfo/llvm-omp-device-info.cpp
rename to offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 7dd22aec9828f..2228fbf3ec173 100644
--- a/offload/tools/deviceinfo/llvm-omp-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -1,4 +1,4 @@
-//===- llvm-omp-device-info.cpp - Obtain device info as seen from OpenMP --===//
+//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a command line utility that, by using Libomptarget, and the device
-// plugins, list devices information as seen from the OpenMP Runtime.
+// This is a command line utility that, by using LLVM/Offload, and the device
+// plugins, list devices information as seen by the runtime.
 //
 //===----------------------------------------------------------------------===//
 
@@ -19,8 +19,9 @@ int main(int argc, char **argv) {
   __tgt_register_lib(&EmptyDesc);
   __tgt_init_all_rtls();
 
+  printf("Found %d devices:\n", omp_get_num_devices());
   for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) {
-    printf("Device (%d):\n", Dev);
+    printf("  Device %d:\n", Dev);
     if (!__tgt_print_device_info(Dev))
       printf("    print_device_info not implemented\n");
     printf("\n");
diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst
index df944c3be68de..d4a4b1a99f781 100644
--- a/openmp/docs/ReleaseNotes.rst
+++ b/openmp/docs/ReleaseNotes.rst
@@ -1,5 +1,5 @@
 ===========================
-OpenMP 19.0.0 Release Notes
+OpenMP 20.0.0 Release Notes
 ===========================
 
 
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 98dd984fd4b0c..ed002c8cf0f80 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -743,6 +743,8 @@ variables is defined below.
     * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
     * ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
+    * ``OFFLOAD_TRACK_ALLOCATION_TRACES=[TRUE/FALSE] (default FALSE)``
+    * ``OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<Num> (default 0)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -1170,6 +1172,18 @@ This environment variable can be used to control how the OpenMP runtime assigns
 blocks to loops with high trip counts. By default we reuse existing blocks
 rather than spawning new blocks.
 
+OFFLOAD_TRACK_ALLOCATION_TRACES
+"""""""""""""""""""""""""""""""
+
+This environment variable determines if the stack traces of allocations and
+deallocations are tracked to aid in error reporting, e.g., in case of
+double-free.
+
+OFFLOAD_TRACK_KERNEL_LAUNCH_TRACES
+""""""""""""""""""""""""""""""""""
+
+This environment variable determines how manytstack traces of kernel launches
+are tracked to aid in error reporting, e.g., what asynchronous kernel failed.
 
 .. _libomptarget_plugin:
 
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index f34e55555545f..ab13ac4f43596 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -203,7 +203,26 @@ int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
   const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
   int depth = __kmp_topology->get_depth();
   for (int level = 0; level < depth; ++level) {
-    if (ahwthread->ids[level] < bhwthread->ids[level])
+    // Reverse sort (higher efficiencies earlier in list) cores by core
+    // efficiency if available.
+    if (__kmp_is_hybrid_cpu() &&
+        __kmp_topology->get_type(level) == KMP_HW_CORE &&
+        ahwthread->attrs.is_core_eff_valid() &&
+        bhwthread->attrs.is_core_eff_valid()) {
+      if (ahwthread->attrs.get_core_eff() < bhwthread->attrs.get_core_eff())
+        return 1;
+      if (ahwthread->attrs.get_core_eff() > bhwthread->attrs.get_core_eff())
+        return -1;
+    }
+    if (ahwthread->ids[level] == bhwthread->ids[level])
+      continue;
+    // If the hardware id is unknown for this level, then place hardware thread
+    // further down in the sorted list as it should take last priority
+    if (ahwthread->ids[level] == UNKNOWN_ID)
+      return 1;
+    else if (bhwthread->ids[level] == UNKNOWN_ID)
+      return -1;
+    else if (ahwthread->ids[level] < bhwthread->ids[level])
       return -1;
     else if (ahwthread->ids[level] > bhwthread->ids[level])
       return 1;
@@ -246,7 +265,7 @@ void kmp_hw_thread_t::print() const {
   int depth = __kmp_topology->get_depth();
   printf("%4d ", os_id);
   for (int i = 0; i < depth; ++i) {
-    printf("%4d ", ids[i]);
+    printf("%4d (%d) ", ids[i], sub_ids[i]);
   }
   if (attrs) {
     if (attrs.is_core_type_valid())
@@ -264,7 +283,7 @@ void kmp_hw_thread_t::print() const {
 
 // Add a layer to the topology based on the ids. Assume the topology
 // is perfectly nested (i.e., so no object has more than one parent)
-void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) {
   // Figure out where the layer should go by comparing the ids of the current
   // layers with the new ids
   int target_layer;
@@ -325,7 +344,7 @@ void kmp_topology_t::_insert_windows_proc_groups() {
     ids[i] = __kmp_get_proc_group(mask);
   }
   KMP_CPU_FREE(mask);
-  _insert_layer(KMP_HW_PROC_GROUP, ids);
+  insert_layer(KMP_HW_PROC_GROUP, ids);
   __kmp_free(ids);
 
   // sort topology after adding proc groups
@@ -465,10 +484,13 @@ void kmp_topology_t::_gather_enumeration_information() {
       int id = hw_thread.ids[layer];
       if (id != previous_id[layer]) {
         // Add an additional increment to each count
-        for (int l = layer; l < depth; ++l)
-          count[l]++;
+        for (int l = layer; l < depth; ++l) {
+          if (hw_thread.ids[l] != kmp_hw_thread_t::UNKNOWN_ID)
+            count[l]++;
+        }
         // Keep track of topology layer ratio statistics
-        max[layer]++;
+        if (hw_thread.ids[layer] != kmp_hw_thread_t::UNKNOWN_ID)
+          max[layer]++;
         for (int l = layer + 1; l < depth; ++l) {
           if (max[l] > ratio[l])
             ratio[l] = max[l];
@@ -833,6 +855,8 @@ void kmp_topology_t::print(const char *env_var) const {
   for (int i = 0; i < num_hw_threads; i++) {
     __kmp_str_buf_clear(&buf);
     for (int level = 0; level < depth; ++level) {
+      if (hw_threads[i].ids[level] == kmp_hw_thread_t::UNKNOWN_ID)
+        continue;
       kmp_hw_t type = types[level];
       __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
       __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
@@ -1354,7 +1378,8 @@ bool kmp_topology_t::filter_hw_subset() {
           sub_id = abs_sub_ids[level];
         else
           sub_id = hw_thread.sub_ids[level];
-        if (sub_id < offset ||
+        if (hw_thread.ids[level] == kmp_hw_thread_t::UNKNOWN_ID ||
+            sub_id < offset ||
             (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
           should_be_filtered = true;
           break;
@@ -1904,6 +1929,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
       hw_thread.clear();
       hw_thread.ids[index] = pu->logical_index;
       hw_thread.os_id = pu->os_index;
+      hw_thread.original_idx = hw_thread_index;
       // If multiple core types, then set that attribute for the hardware thread
 #if HWLOC_API_VERSION >= 0x00020400
       if (cpukinds) {
@@ -2018,6 +2044,7 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
     hw_thread.clear();
     hw_thread.os_id = i;
+    hw_thread.original_idx = avail_ct;
     hw_thread.ids[0] = i;
     hw_thread.ids[1] = 0;
     hw_thread.ids[2] = 0;
@@ -2063,11 +2090,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
       continue;
     }
-    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
+    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
     hw_thread.clear();
     hw_thread.os_id = i;
+    hw_thread.original_idx = avail_ct;
     hw_thread.ids[0] = i / BITS_PER_GROUP;
     hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
+    avail_ct++;
   }
   return true;
 }
@@ -2123,15 +2152,43 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
   return 0;
 }
 
-class kmp_cache_info_t {
+class cpuid_cache_info_t {
 public:
   struct info_t {
-    unsigned level, mask;
+    unsigned level = 0;
+    unsigned mask = 0;
+    bool operator==(const info_t &rhs) const {
+      return level == rhs.level && mask == rhs.mask;
+    }
+    bool operator!=(const info_t &rhs) const { return !operator==(rhs); }
   };
-  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
+  cpuid_cache_info_t() : depth(0) {
+    table[MAX_CACHE_LEVEL].level = 0;
+    table[MAX_CACHE_LEVEL].mask = 0;
+  }
   size_t get_depth() const { return depth; }
   info_t &operator[](size_t index) { return table[index]; }
   const info_t &operator[](size_t index) const { return table[index]; }
+  bool operator==(const cpuid_cache_info_t &rhs) const {
+    if (rhs.depth != depth)
+      return false;
+    for (size_t i = 0; i < depth; ++i)
+      if (table[i] != rhs.table[i])
+        return false;
+    return true;
+  }
+  bool operator!=(const cpuid_cache_info_t &rhs) const {
+    return !operator==(rhs);
+  }
+  // Get cache information assocaited with L1, L2, L3 cache, etc.
+  // If level does not exist, then return the "NULL" level (level 0)
+  const info_t &get_level(unsigned level) const {
+    for (size_t i = 0; i < depth; ++i) {
+      if (table[i].level == level)
+        return table[i];
+    }
+    return table[MAX_CACHE_LEVEL];
+  }
 
   static kmp_hw_t get_topology_type(unsigned level) {
     KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
@@ -2145,13 +2202,6 @@ class kmp_cache_info_t {
     }
     return KMP_HW_UNKNOWN;
   }
-
-private:
-  static const int MAX_CACHE_LEVEL = 3;
-
-  size_t depth;
-  info_t table[MAX_CACHE_LEVEL];
-
   void get_leaf4_levels() {
     unsigned level = 0;
     while (depth < MAX_CACHE_LEVEL) {
@@ -2176,6 +2226,11 @@ class kmp_cache_info_t {
       level++;
     }
   }
+  static const int MAX_CACHE_LEVEL = 3;
+
+private:
+  size_t depth;
+  info_t table[MAX_CACHE_LEVEL + 1];
 };
 
 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
@@ -2483,6 +2538,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
       hw_thread.ids[idx++] = threadInfo[i].threadId;
     }
     hw_thread.os_id = os;
+    hw_thread.original_idx = i;
   }
 
   __kmp_free(threadInfo);
@@ -2543,10 +2599,8 @@ enum {
   INTEL_LEVEL_TYPE_DIE = 5,
   INTEL_LEVEL_TYPE_LAST = 6,
 };
-
-struct cpuid_level_info_t {
-  unsigned level_type, mask, mask_width, nitems, cache_mask;
-};
+KMP_BUILD_ASSERT(INTEL_LEVEL_TYPE_LAST < sizeof(unsigned) * CHAR_BIT);
+#define KMP_LEAF_1F_KNOWN_LEVELS ((1u << INTEL_LEVEL_TYPE_LAST) - 1u)
 
 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
   switch (intel_type) {
@@ -2566,16 +2620,77 @@ static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
   return KMP_HW_UNKNOWN;
 }
 
-// This function takes the topology leaf, a levels array to store the levels
-// detected and a bitmap of the known levels.
-// Returns the number of levels in the topology
-static unsigned
-__kmp_x2apicid_get_levels(int leaf,
-                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
-                          kmp_uint64 known_levels) {
+static int __kmp_topology_type_2_intel_type(kmp_hw_t type) {
+  switch (type) {
+  case KMP_HW_SOCKET:
+    return INTEL_LEVEL_TYPE_INVALID;
+  case KMP_HW_THREAD:
+    return INTEL_LEVEL_TYPE_SMT;
+  case KMP_HW_CORE:
+    return INTEL_LEVEL_TYPE_CORE;
+  case KMP_HW_TILE:
+    return INTEL_LEVEL_TYPE_TILE;
+  case KMP_HW_MODULE:
+    return INTEL_LEVEL_TYPE_MODULE;
+  case KMP_HW_DIE:
+    return INTEL_LEVEL_TYPE_DIE;
+  }
+  return INTEL_LEVEL_TYPE_INVALID;
+}
+
+struct cpuid_level_info_t {
+  unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
+
+class cpuid_topo_desc_t {
+  unsigned desc = 0;
+
+public:
+  void clear() { desc = 0; }
+  bool contains(int intel_type) const {
+    KMP_DEBUG_ASSERT(intel_type >= 0 && intel_type < INTEL_LEVEL_TYPE_LAST);
+    if ((1u << intel_type) & desc)
+      return true;
+    return false;
+  }
+  bool contains_topology_type(kmp_hw_t type) const {
+    KMP_DEBUG_ASSERT(type >= 0 && type < KMP_HW_LAST);
+    int intel_type = __kmp_topology_type_2_intel_type(type);
+    return contains(intel_type);
+  }
+  bool contains(cpuid_topo_desc_t rhs) const {
+    return ((desc | rhs.desc) == desc);
+  }
+  void add(int intel_type) { desc |= (1u << intel_type); }
+  void add(cpuid_topo_desc_t rhs) { desc |= rhs.desc; }
+};
+
+struct cpuid_proc_info_t {
+  // Topology info
+  int os_id;
+  unsigned apic_id;
+  unsigned depth;
+  // Hybrid info
+  unsigned native_model_id;
+  int efficiency;
+  kmp_hw_core_type_t type;
+  cpuid_topo_desc_t description;
+
+  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+};
+
+// This function takes the topology leaf, an info pointer to store the levels
+// detected, and writable descriptors for the total topology.
+// Returns whether total types, depth, or description were modified.
+static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info,
+                                      kmp_hw_t total_types[KMP_HW_LAST],
+                                      int *total_depth,
+                                      cpuid_topo_desc_t *total_description) {
   unsigned level, levels_index;
   unsigned level_type, mask_width, nitems;
   kmp_cpuid buf;
+  cpuid_level_info_t(&levels)[INTEL_LEVEL_TYPE_LAST] = info->levels;
+  bool retval = false;
 
   // New algorithm has known topology layers act as highest unknown topology
   // layers when unknown topology layers exist.
@@ -2590,10 +2705,12 @@ __kmp_x2apicid_get_levels(int leaf,
     level_type = __kmp_extract_bits<8, 15>(buf.ecx);
     mask_width = __kmp_extract_bits<0, 4>(buf.eax);
     nitems = __kmp_extract_bits<0, 15>(buf.ebx);
-    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
-      return 0;
+    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) {
+      info->depth = 0;
+      return retval;
+    }
 
-    if (known_levels & (1ull << level_type)) {
+    if (KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)) {
       // Add a new level to the topology
       KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
       levels[levels_index].level_type = level_type;
@@ -2609,6 +2726,22 @@ __kmp_x2apicid_get_levels(int leaf,
     }
     level++;
   } while (level_type != INTEL_LEVEL_TYPE_INVALID);
+  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+  info->description.clear();
+  info->depth = levels_index;
+
+  // If types, depth, and total_description are uninitialized,
+  // then initialize them now
+  if (*total_depth == 0) {
+    *total_depth = info->depth;
+    total_description->clear();
+    for (int i = *total_depth - 1, j = 0; i >= 0; --i, ++j) {
+      total_types[j] =
+          __kmp_intel_type_2_topology_type(info->levels[i].level_type);
+      total_description->add(info->levels[i].level_type);
+    }
+    retval = true;
+  }
 
   // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
   if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
@@ -2626,38 +2759,61 @@ __kmp_x2apicid_get_levels(int leaf,
       levels[i].mask = (-1) << levels[i - 1].mask_width;
       levels[i].cache_mask = 0;
     }
+    info->description.add(info->levels[i].level_type);
+  }
+
+  // If this processor has level type not on other processors, then make
+  // sure to include it in total types, depth, and description.
+  // One assumption here is that the first type, i.e. socket, is known.
+  // Another assumption is that types array is always large enough to fit any
+  // new layers since its length is KMP_HW_LAST.
+  if (!total_description->contains(info->description)) {
+    for (int i = info->depth - 1, j = 0; i >= 0; --i, ++j) {
+      // If this level is known already, then skip it.
+      if (total_description->contains(levels[i].level_type))
+        continue;
+      // Unknown level, insert before last known level
+      kmp_hw_t curr_type =
+          __kmp_intel_type_2_topology_type(levels[i].level_type);
+      KMP_ASSERT(j != 0 && "Bad APIC Id information");
+      // Move over all known levels to make room for new level
+      for (int k = info->depth - 1; k >= j; --k) {
+        KMP_DEBUG_ASSERT(k + 1 < KMP_HW_LAST);
+        total_types[k + 1] = total_types[k];
+      }
+      // Insert new level
+      total_types[j] = curr_type;
+      (*total_depth)++;
+    }
+    total_description->add(info->description);
+    retval = true;
   }
-  return levels_index;
+  return retval;
 }
 
 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
 
-  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
   kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
-  unsigned levels_index;
   kmp_cpuid buf;
-  kmp_uint64 known_levels;
-  int topology_leaf, highest_leaf, apic_id;
+  int topology_leaf, highest_leaf;
   int num_leaves;
+  int depth = 0;
+  cpuid_topo_desc_t total_description;
   static int leaves[] = {0, 0};
 
-  kmp_i18n_id_t leaf_message_id;
+  // If affinity is disabled, __kmp_avail_proc may be zero
+  int ninfos = (__kmp_avail_proc > 0 ? __kmp_avail_proc : 1);
+  cpuid_proc_info_t *proc_info = (cpuid_proc_info_t *)__kmp_allocate(
+      (sizeof(cpuid_proc_info_t) + sizeof(cpuid_cache_info_t)) * ninfos);
+  cpuid_cache_info_t *cache_info = (cpuid_cache_info_t *)(proc_info + ninfos);
 
-  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
+  kmp_i18n_id_t leaf_message_id;
 
   *msg_id = kmp_i18n_null;
   if (__kmp_affinity.flags.verbose) {
     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
   }
 
-  // Figure out the known topology levels
-  known_levels = 0ull;
-  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
-    if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
-      known_levels |= (1ull << i);
-    }
-  }
-
   // Get the highest cpuid leaf supported
   __kmp_x86_cpuid(0, 0, &buf);
   highest_leaf = buf.eax;
@@ -2691,16 +2847,18 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
     if (buf.ebx == 0)
       continue;
     topology_leaf = leaf;
-    levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
-    if (levels_index == 0)
+    __kmp_x2apicid_get_levels(leaf, &proc_info[0], types, &depth,
+                              &total_description);
+    if (depth == 0)
       continue;
     break;
   }
-  if (topology_leaf == -1 || levels_index == 0) {
+  if (topology_leaf == -1 || depth == 0) {
     *msg_id = leaf_message_id;
+    __kmp_free(proc_info);
     return false;
   }
-  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+  KMP_ASSERT(depth <= INTEL_LEVEL_TYPE_LAST);
 
   // The algorithm used starts by setting the affinity to each available thread
   // and retrieving info from the cpuid instruction, so if we are not capable of
@@ -2711,42 +2869,19 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
     // Hack to try and infer the machine topology using only the data
     // available from cpuid on the current thread, and __kmp_xproc.
     KMP_ASSERT(__kmp_affinity.type == affinity_none);
-    for (unsigned i = 0; i < levels_index; ++i) {
-      if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
-        __kmp_nThreadsPerCore = levels[i].nitems;
-      } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
-        nCoresPerPkg = levels[i].nitems;
+    for (int i = 0; i < depth; ++i) {
+      if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+        __kmp_nThreadsPerCore = proc_info[0].levels[i].nitems;
+      } else if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+        nCoresPerPkg = proc_info[0].levels[i].nitems;
       }
     }
     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    __kmp_free(proc_info);
     return true;
   }
 
-  // Allocate the data structure to be returned.
-  int depth = levels_index;
-  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
-    types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
-  __kmp_topology =
-      kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
-
-  // Insert equivalent cache types if they exist
-  kmp_cache_info_t cache_info;
-  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
-    const kmp_cache_info_t::info_t &info = cache_info[i];
-    unsigned cache_mask = info.mask;
-    unsigned cache_level = info.level;
-    for (unsigned j = 0; j < levels_index; ++j) {
-      unsigned hw_cache_mask = levels[j].cache_mask;
-      kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
-      if (hw_cache_mask == cache_mask && j < levels_index - 1) {
-        kmp_hw_t type =
-            __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
-        __kmp_topology->set_equivalent_type(cache_type, type);
-      }
-    }
-  }
-
   // From here on, we can assume that it is safe to call
   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
   // __kmp_affinity.type = affinity_none.
@@ -2758,56 +2893,167 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
   // to it, and obtaining the pertinent information using the cpuid instr.
   unsigned int proc;
   int hw_thread_index = 0;
-  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
-    cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
-    unsigned my_levels_index;
+  bool uniform_caches = true;
 
+  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
     // Skip this proc if it is not included in the machine model.
     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
       continue;
     }
     KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
 
+    // Gather topology information
     __kmp_affinity_dispatch->bind_thread(proc);
-
-    // New algorithm
     __kmp_x86_cpuid(topology_leaf, 0, &buf);
-    apic_id = buf.edx;
-    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
-    my_levels_index =
-        __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
-    if (my_levels_index == 0 || my_levels_index != levels_index) {
+    proc_info[hw_thread_index].os_id = proc;
+    proc_info[hw_thread_index].apic_id = buf.edx;
+    __kmp_x2apicid_get_levels(topology_leaf, &proc_info[hw_thread_index], types,
+                              &depth, &total_description);
+    if (proc_info[hw_thread_index].depth == 0) {
       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+      __kmp_free(proc_info);
       return false;
     }
-    hw_thread.clear();
-    hw_thread.os_id = proc;
-    // Put in topology information
-    for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
-      hw_thread.ids[idx] = apic_id & my_levels[j].mask;
-      if (j > 0) {
-        hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
-      }
-    }
+    // Gather cache information and insert afterwards
+    cache_info[hw_thread_index].get_leaf4_levels();
+    if (uniform_caches && hw_thread_index > 0)
+      if (cache_info[0] != cache_info[hw_thread_index])
+        uniform_caches = false;
     // Hybrid information
     if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
-      kmp_hw_core_type_t type;
-      unsigned native_model_id;
-      int efficiency;
-      __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
-      hw_thread.attrs.set_core_type(type);
-      hw_thread.attrs.set_core_eff(efficiency);
+      __kmp_get_hybrid_info(&proc_info[hw_thread_index].type,
+                            &proc_info[hw_thread_index].efficiency,
+                            &proc_info[hw_thread_index].native_model_id);
     }
     hw_thread_index++;
   }
   KMP_ASSERT(hw_thread_index > 0);
+  previous_affinity.restore();
+
+  // Allocate the data structure to be returned.
+  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
+
+  // Create topology Ids and hybrid types in __kmp_topology
+  for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+    kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+    hw_thread.clear();
+    hw_thread.os_id = proc_info[i].os_id;
+    hw_thread.original_idx = i;
+    unsigned apic_id = proc_info[i].apic_id;
+    // Put in topology information
+    for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) {
+      if (!(proc_info[i].description.contains_topology_type(
+              __kmp_topology->get_type(j)))) {
+        hw_thread.ids[idx] = kmp_hw_thread_t::UNKNOWN_ID;
+      } else {
+        hw_thread.ids[idx] = apic_id & proc_info[i].levels[j].mask;
+        if (j > 0) {
+          hw_thread.ids[idx] >>= proc_info[i].levels[j - 1].mask_width;
+        }
+      }
+    }
+    hw_thread.attrs.set_core_type(proc_info[i].type);
+    hw_thread.attrs.set_core_eff(proc_info[i].efficiency);
+  }
+
   __kmp_topology->sort_ids();
+
+  // Change Ids to logical Ids
+  for (int j = 0; j < depth - 1; ++j) {
+    int new_id = 0;
+    int prev_id = __kmp_topology->at(0).ids[j];
+    int curr_id = __kmp_topology->at(0).ids[j + 1];
+    __kmp_topology->at(0).ids[j + 1] = new_id;
+    for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+      kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+      if (hw_thread.ids[j] == prev_id && hw_thread.ids[j + 1] == curr_id) {
+        hw_thread.ids[j + 1] = new_id;
+      } else if (hw_thread.ids[j] == prev_id &&
+                 hw_thread.ids[j + 1] != curr_id) {
+        curr_id = hw_thread.ids[j + 1];
+        hw_thread.ids[j + 1] = ++new_id;
+      } else {
+        prev_id = hw_thread.ids[j];
+        curr_id = hw_thread.ids[j + 1];
+        hw_thread.ids[j + 1] = ++new_id;
+      }
+    }
+  }
+
+  // First check for easy cache placement. This occurs when caches are
+  // equivalent to a layer in the CPUID leaf 0xb or 0x1f topology.
+  if (uniform_caches) {
+    for (size_t i = 0; i < cache_info[0].get_depth(); ++i) {
+      unsigned cache_mask = cache_info[0][i].mask;
+      unsigned cache_level = cache_info[0][i].level;
+      KMP_ASSERT(cache_level <= cpuid_cache_info_t::MAX_CACHE_LEVEL);
+      kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(cache_level);
+      __kmp_topology->set_equivalent_type(cache_type, cache_type);
+      for (int j = 0; j < depth; ++j) {
+        unsigned hw_cache_mask = proc_info[0].levels[j].cache_mask;
+        if (hw_cache_mask == cache_mask && j < depth - 1) {
+          kmp_hw_t type = __kmp_intel_type_2_topology_type(
+              proc_info[0].levels[j + 1].level_type);
+          __kmp_topology->set_equivalent_type(cache_type, type);
+        }
+      }
+    }
+  } else {
+    // If caches are non-uniform, then record which caches exist.
+    for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+      for (size_t j = 0; j < cache_info[i].get_depth(); ++j) {
+        unsigned cache_level = cache_info[i][j].level;
+        kmp_hw_t cache_type =
+            cpuid_cache_info_t::get_topology_type(cache_level);
+        if (__kmp_topology->get_equivalent_type(cache_type) == KMP_HW_UNKNOWN)
+          __kmp_topology->set_equivalent_type(cache_type, cache_type);
+      }
+    }
+  }
+
+  // See if any cache level needs to be added manually through cache Ids
+  bool unresolved_cache_levels = false;
+  for (unsigned level = 1; level <= cpuid_cache_info_t::MAX_CACHE_LEVEL;
+       ++level) {
+    kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(level);
+    // This also filters out caches which may not be in the topology
+    // since the equivalent type might be KMP_HW_UNKNOWN.
+    if (__kmp_topology->get_equivalent_type(cache_type) == cache_type) {
+      unresolved_cache_levels = true;
+      break;
+    }
+  }
+
+  // Insert unresolved cache layers into machine topology using cache Ids
+  if (unresolved_cache_levels) {
+    int num_hw_threads = __kmp_topology->get_num_hw_threads();
+    int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+    for (unsigned l = 1; l <= cpuid_cache_info_t::MAX_CACHE_LEVEL; ++l) {
+      kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(l);
+      if (__kmp_topology->get_equivalent_type(cache_type) != cache_type)
+        continue;
+      for (int i = 0; i < num_hw_threads; ++i) {
+        int original_idx = __kmp_topology->at(i).original_idx;
+        ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+        const cpuid_cache_info_t::info_t &info =
+            cache_info[original_idx].get_level(l);
+        // if cache level not in topology for this processor, then skip
+        if (info.level == 0)
+          continue;
+        ids[i] = info.mask & proc_info[original_idx].apic_id;
+      }
+      __kmp_topology->insert_layer(cache_type, ids);
+    }
+  }
+
   if (!__kmp_topology->check_ids()) {
     kmp_topology_t::deallocate(__kmp_topology);
     __kmp_topology = nullptr;
     *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+    __kmp_free(proc_info);
     return false;
   }
+  __kmp_free(proc_info);
   return true;
 }
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -3571,6 +3817,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
     hw_thread.clear();
     hw_thread.os_id = os;
+    hw_thread.original_idx = i;
 
     idx = 0;
     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
@@ -3594,6 +3841,32 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
   __kmp_free(counts);
   CLEANUP_THREAD_INFO;
   __kmp_topology->sort_ids();
+
+  int tlevel = __kmp_topology->get_level(KMP_HW_THREAD);
+  if (tlevel > 0) {
+    // If the thread level does not have ids, then put them in.
+    if (__kmp_topology->at(0).ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) {
+      __kmp_topology->at(0).ids[tlevel] = 0;
+    }
+    for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+      kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+      if (hw_thread.ids[tlevel] != kmp_hw_thread_t::UNKNOWN_ID)
+        continue;
+      kmp_hw_thread_t &prev_hw_thread = __kmp_topology->at(i - 1);
+      // Check if socket, core, anything above thread level changed.
+      // If the ids did change, then restart thread id at 0
+      // Otherwise, set thread id to prev thread's id + 1
+      for (int j = 0; j < tlevel; ++j) {
+        if (hw_thread.ids[j] != prev_hw_thread.ids[j]) {
+          hw_thread.ids[tlevel] = 0;
+          break;
+        }
+      }
+      if (hw_thread.ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID)
+        hw_thread.ids[tlevel] = prev_hw_thread.ids[tlevel] + 1;
+    }
+  }
+
   if (!__kmp_topology->check_ids()) {
     kmp_topology_t::deallocate(__kmp_topology);
     __kmp_topology = nullptr;
@@ -3620,8 +3893,8 @@ static void __kmp_create_os_id_masks(unsigned *numUnique,
   KMP_ASSERT(depth);
 
   i = find_next(-1);
-  // If could not find HW thread location with attributes, then return and
-  // fallback to increment find_next and disregard core attributes.
+  // If could not find HW thread location that satisfies find_next conditions,
+  // then return and fallback to increment find_next.
   if (i >= numAddrs)
     return;
 
@@ -4739,16 +5012,33 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
     }
   }
   // If core attributes did not work, or none were specified,
-  // then make OS Id mask table using typical incremental way.
+  // then make OS Id mask table using typical incremental way with
+  // checking for validity of each id at granularity level specified.
+  if (!affinity.os_id_masks) {
+    int gran = affinity.gran_levels;
+    int gran_level = depth - 1 - affinity.gran_levels;
+    if (gran >= 0 && gran_level >= 0 && gran_level < depth) {
+      __kmp_create_os_id_masks(
+          &numUnique, affinity, [depth, numAddrs, &affinity](int idx) {
+            KMP_ASSERT(idx >= -1);
+            int gran = affinity.gran_levels;
+            int gran_level = depth - 1 - affinity.gran_levels;
+            for (int i = idx + 1; i < numAddrs; ++i)
+              if ((gran >= depth) ||
+                  (gran < depth && __kmp_topology->at(i).ids[gran_level] !=
+                                       kmp_hw_thread_t::UNKNOWN_ID))
+                return i;
+            return numAddrs;
+          });
+    }
+  }
+  // Final attempt to make OS Id mask table using typical incremental way.
   if (!affinity.os_id_masks) {
     __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
       KMP_ASSERT(idx >= -1);
       return idx + 1;
     });
   }
-  if (affinity.gran_levels == 0) {
-    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
-  }
 
   switch (affinity.type) {
 
@@ -4894,6 +5184,8 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
         int osId = __kmp_topology->at(i).os_id;
 
         kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
+        if (KMP_CPU_ISEMPTY(src))
+          continue;
         kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
         KMP_CPU_COPY(dest, src);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 3dc2c84d53f77..ed24b6faf2f7e 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -846,6 +846,7 @@ class kmp_hw_thread_t {
   int sub_ids[KMP_HW_LAST];
   bool leader;
   int os_id;
+  int original_idx;
   kmp_hw_attr_t attrs;
 
   void print() const;
@@ -905,9 +906,6 @@ class kmp_topology_t {
   // Compact value used during sort_compact()
   int compact;
 
-  // Insert a new topology layer after allocation
-  void _insert_layer(kmp_hw_t type, const int *ids);
-
 #if KMP_GROUP_AFFINITY
   // Insert topology information about Windows Processor groups
   void _insert_windows_proc_groups();
@@ -967,6 +965,10 @@ class kmp_topology_t {
     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
           kmp_hw_thread_t::compare_ids);
   }
+
+  // Insert a new topology layer after allocation
+  void insert_layer(kmp_hw_t type, const int *ids);
+
   // Check if the hardware ids are unique, if they are
   // return true, otherwise return false
   bool check_ids() const;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 936afb91ac2f8..03ce0dd752af1 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3245,7 +3245,7 @@ static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
   threads_data = task_team->tt.tt_threads_data;
   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
   KMP_DEBUG_ASSERT(victim_tid >= 0);
-  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_nproc);
+  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
 
   victim_td = &threads_data[victim_tid];
   victim_thr = victim_td->td.td_thr;
diff --git a/openmp/runtime/test/tasking/issue-94260-1.cpp b/openmp/runtime/test/tasking/issue-94260-1.cpp
new file mode 100644
index 0000000000000..6f2bf2ae0a859
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-94260-1.cpp
@@ -0,0 +1,66 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// The number of times to run each test
+#define NTIMES 2
+
+// Every thread creates a single "increment" task
+void test_tasks() {
+  for (int i = 0; i < 100; ++i)
+#pragma omp task
+  {
+    int tid = omp_get_thread_num();
+  }
+}
+
+// Testing single level of parallelism with increment tasks
+void test_base(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("    test_base(%d)\n", nthreads);
+#endif
+#pragma omp parallel num_threads(nthreads)
+  { test_tasks(); }
+}
+
+// Testing nested parallel with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("   test_nest(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+  {
+    for (int i = 0; i < 100; ++i)
+#pragma omp task
+    {
+      int tid = omp_get_thread_num();
+    }
+    test_base(second);
+  }
+}
+
+template <typename... Args>
+void run_ntimes(int n, void (*func)(Args...), Args... args) {
+  for (int i = 0; i < n; ++i) {
+    func(args...);
+  }
+}
+
+int main() {
+  omp_set_max_active_levels(5);
+
+  for (int i = 0; i < 100; ++i) {
+    run_ntimes(NTIMES, test_nest, 4, 3);
+    run_ntimes(NTIMES, test_nest, 2, 1);
+  }
+
+  printf("PASS\n");
+  return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/tasking/issue-94260-2.c b/openmp/runtime/test/tasking/issue-94260-2.c
new file mode 100644
index 0000000000000..c2f43f91f70f9
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-94260-2.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int test_omp_parallel_num_threads() {
+  int num_failed;
+  int threads;
+  int nthreads;
+  int max_threads = 0;
+
+  num_failed = 0;
+#pragma omp task
+  {}
+
+/* first we check how many threads are available */
+#pragma omp parallel
+  {
+#pragma omp task
+    {}
+#pragma omp master
+    max_threads = omp_get_num_threads();
+  }
+
+  /* we increase the number of threads from one to maximum:*/
+  for (threads = 1; threads <= max_threads; threads++) {
+    nthreads = 0;
+#pragma omp parallel reduction(+ : num_failed) num_threads(threads)
+    {
+#pragma omp task
+      {}
+      num_failed = num_failed + !(threads == omp_get_num_threads());
+#pragma omp atomic
+      nthreads += 1;
+    }
+    num_failed = num_failed + !(nthreads == threads);
+  }
+  return (!num_failed);
+}
+
+int main() {
+  int i;
+  int num_failed = 0;
+
+  for (i = 0; i < 100; i++) {
+    if (!test_omp_parallel_num_threads()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/openmp/runtime/test/transform/interchange/foreach.cpp b/openmp/runtime/test/transform/interchange/foreach.cpp
new file mode 100644
index 0000000000000..17eaed6029d01
--- /dev/null
+++ b/openmp/runtime/test/transform/interchange/foreach.cpp
@@ -0,0 +1,216 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+
+    void print(const char *msg) const { owner->print(msg); }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+#pragma omp interchange
+  for (Reporter c{"C"}; auto &&v : Reporter("A"))
+    for (Reporter d{"D"}; auto &&w : Reporter("B"))
+      printf("v=%d w=%d\n", v, w);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0 w=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1 w=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2 w=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0 w=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1 w=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2 w=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0 w=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1 w=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2 w=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/interchange/intfor.c b/openmp/runtime/test/transform/interchange/intfor.c
new file mode 100644
index 0000000000000..b4842f0f82913
--- /dev/null
+++ b/openmp/runtime/test/transform/interchange/intfor.c
@@ -0,0 +1,38 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+  printf("do\n");
+#pragma omp interchange
+  for (int i = 7; i < 17; i += 3)
+    for (int j = 8; j < 18; j += 3)
+      printf("i=%d j=%d\n", i, j);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: i=7 j=8
+// CHECK-NEXT: i=10 j=8
+// CHECK-NEXT: i=13 j=8
+// CHECK-NEXT: i=16 j=8
+// CHECK-NEXT: i=7 j=11
+// CHECK-NEXT: i=10 j=11
+// CHECK-NEXT: i=13 j=11
+// CHECK-NEXT: i=16 j=11
+// CHECK-NEXT: i=7 j=14
+// CHECK-NEXT: i=10 j=14
+// CHECK-NEXT: i=13 j=14
+// CHECK-NEXT: i=16 j=14
+// CHECK-NEXT: i=7 j=17
+// CHECK-NEXT: i=10 j=17
+// CHECK-NEXT: i=13 j=17
+// CHECK-NEXT: i=16 j=17
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/interchange/iterfor.cpp b/openmp/runtime/test/transform/interchange/iterfor.cpp
new file mode 100644
index 0000000000000..51219a07402e3
--- /dev/null
+++ b/openmp/runtime/test/transform/interchange/iterfor.cpp
@@ -0,0 +1,222 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    bool operator!=(const Iterator &that) const {
+      owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+  Reporter A("A"), B("B");
+#pragma omp interchange
+  for (auto it = A.begin(); it != A.end(); ++it)
+    for (auto jt = B.begin(); jt != B.end(); ++jt)
+      printf("i=%d j=%d\n", *it, *jt);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [A] dtor
diff --git a/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000000000..54399d25a3f15
--- /dev/null
+++ b/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,340 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+  for (int i = 0; i < 2; ++i)
+#pragma omp interchange
+    for (Reporter c{"C"}; auto &&v : Reporter("A"))
+      for (Reporter d{"D"}; auto &&w : Reporter("B"))
+        for (int k = 0; k < 2; ++k)
+          printf("i=%d v=%d w=%d k=%d\n", i, v, w, k);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=0 v=0 w=0 k=0
+// CHECK-NEXT: i=0 v=0 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=0 v=1 w=0 k=0
+// CHECK-NEXT: i=0 v=1 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=0 v=2 w=0 k=0
+// CHECK-NEXT: i=0 v=2 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=0 v=0 w=1 k=0
+// CHECK-NEXT: i=0 v=0 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=0 v=1 w=1 k=0
+// CHECK-NEXT: i=0 v=1 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=0 v=2 w=1 k=0
+// CHECK-NEXT: i=0 v=2 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=0 v=0 w=2 k=0
+// CHECK-NEXT: i=0 v=0 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=0 v=1 w=2 k=0
+// CHECK-NEXT: i=0 v=1 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=0 v=2 w=2 k=0
+// CHECK-NEXT: i=0 v=2 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=1 v=0 w=0 k=0
+// CHECK-NEXT: i=1 v=0 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=1 v=1 w=0 k=0
+// CHECK-NEXT: i=1 v=1 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=1 v=2 w=0 k=0
+// CHECK-NEXT: i=1 v=2 w=0 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=1 v=0 w=1 k=0
+// CHECK-NEXT: i=1 v=0 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=1 v=1 w=1 k=0
+// CHECK-NEXT: i=1 v=1 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=1 v=2 w=1 k=0
+// CHECK-NEXT: i=1 v=2 w=1 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: i=1 v=0 w=2 k=0
+// CHECK-NEXT: i=1 v=0 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: i=1 v=1 w=2 k=0
+// CHECK-NEXT: i=1 v=1 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: i=1 v=2 w=2 k=0
+// CHECK-NEXT: i=1 v=2 w=2 k=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-intfor.cpp
new file mode 100644
index 0000000000000..a4520afdb0f9c
--- /dev/null
+++ b/openmp/runtime/test/transform/interchange/parallel-wsloop-collapse-intfor.cpp
@@ -0,0 +1,106 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(4) num_threads(1)
+  for (int i = 0; i < 3; ++i)
+#pragma omp interchange
+    for (int j = 0; j < 3; ++j)
+      for (int k = 0; k < 3; ++k)
+        for (int l = 0; l < 3; ++l)
+          printf("i=%d j=%d k=%d l=%d\n", i, j, k, l);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: i=0 j=0 k=0 l=0
+// CHECK-NEXT: i=0 j=0 k=0 l=1
+// CHECK-NEXT: i=0 j=0 k=0 l=2
+// CHECK-NEXT: i=0 j=1 k=0 l=0
+// CHECK-NEXT: i=0 j=1 k=0 l=1
+// CHECK-NEXT: i=0 j=1 k=0 l=2
+// CHECK-NEXT: i=0 j=2 k=0 l=0
+// CHECK-NEXT: i=0 j=2 k=0 l=1
+// CHECK-NEXT: i=0 j=2 k=0 l=2
+// CHECK-NEXT: i=0 j=0 k=1 l=0
+// CHECK-NEXT: i=0 j=0 k=1 l=1
+// CHECK-NEXT: i=0 j=0 k=1 l=2
+// CHECK-NEXT: i=0 j=1 k=1 l=0
+// CHECK-NEXT: i=0 j=1 k=1 l=1
+// CHECK-NEXT: i=0 j=1 k=1 l=2
+// CHECK-NEXT: i=0 j=2 k=1 l=0
+// CHECK-NEXT: i=0 j=2 k=1 l=1
+// CHECK-NEXT: i=0 j=2 k=1 l=2
+// CHECK-NEXT: i=0 j=0 k=2 l=0
+// CHECK-NEXT: i=0 j=0 k=2 l=1
+// CHECK-NEXT: i=0 j=0 k=2 l=2
+// CHECK-NEXT: i=0 j=1 k=2 l=0
+// CHECK-NEXT: i=0 j=1 k=2 l=1
+// CHECK-NEXT: i=0 j=1 k=2 l=2
+// CHECK-NEXT: i=0 j=2 k=2 l=0
+// CHECK-NEXT: i=0 j=2 k=2 l=1
+// CHECK-NEXT: i=0 j=2 k=2 l=2
+// CHECK-NEXT: i=1 j=0 k=0 l=0
+// CHECK-NEXT: i=1 j=0 k=0 l=1
+// CHECK-NEXT: i=1 j=0 k=0 l=2
+// CHECK-NEXT: i=1 j=1 k=0 l=0
+// CHECK-NEXT: i=1 j=1 k=0 l=1
+// CHECK-NEXT: i=1 j=1 k=0 l=2
+// CHECK-NEXT: i=1 j=2 k=0 l=0
+// CHECK-NEXT: i=1 j=2 k=0 l=1
+// CHECK-NEXT: i=1 j=2 k=0 l=2
+// CHECK-NEXT: i=1 j=0 k=1 l=0
+// CHECK-NEXT: i=1 j=0 k=1 l=1
+// CHECK-NEXT: i=1 j=0 k=1 l=2
+// CHECK-NEXT: i=1 j=1 k=1 l=0
+// CHECK-NEXT: i=1 j=1 k=1 l=1
+// CHECK-NEXT: i=1 j=1 k=1 l=2
+// CHECK-NEXT: i=1 j=2 k=1 l=0
+// CHECK-NEXT: i=1 j=2 k=1 l=1
+// CHECK-NEXT: i=1 j=2 k=1 l=2
+// CHECK-NEXT: i=1 j=0 k=2 l=0
+// CHECK-NEXT: i=1 j=0 k=2 l=1
+// CHECK-NEXT: i=1 j=0 k=2 l=2
+// CHECK-NEXT: i=1 j=1 k=2 l=0
+// CHECK-NEXT: i=1 j=1 k=2 l=1
+// CHECK-NEXT: i=1 j=1 k=2 l=2
+// CHECK-NEXT: i=1 j=2 k=2 l=0
+// CHECK-NEXT: i=1 j=2 k=2 l=1
+// CHECK-NEXT: i=1 j=2 k=2 l=2
+// CHECK-NEXT: i=2 j=0 k=0 l=0
+// CHECK-NEXT: i=2 j=0 k=0 l=1
+// CHECK-NEXT: i=2 j=0 k=0 l=2
+// CHECK-NEXT: i=2 j=1 k=0 l=0
+// CHECK-NEXT: i=2 j=1 k=0 l=1
+// CHECK-NEXT: i=2 j=1 k=0 l=2
+// CHECK-NEXT: i=2 j=2 k=0 l=0
+// CHECK-NEXT: i=2 j=2 k=0 l=1
+// CHECK-NEXT: i=2 j=2 k=0 l=2
+// CHECK-NEXT: i=2 j=0 k=1 l=0
+// CHECK-NEXT: i=2 j=0 k=1 l=1
+// CHECK-NEXT: i=2 j=0 k=1 l=2
+// CHECK-NEXT: i=2 j=1 k=1 l=0
+// CHECK-NEXT: i=2 j=1 k=1 l=1
+// CHECK-NEXT: i=2 j=1 k=1 l=2
+// CHECK-NEXT: i=2 j=2 k=1 l=0
+// CHECK-NEXT: i=2 j=2 k=1 l=1
+// CHECK-NEXT: i=2 j=2 k=1 l=2
+// CHECK-NEXT: i=2 j=0 k=2 l=0
+// CHECK-NEXT: i=2 j=0 k=2 l=1
+// CHECK-NEXT: i=2 j=0 k=2 l=2
+// CHECK-NEXT: i=2 j=1 k=2 l=0
+// CHECK-NEXT: i=2 j=1 k=2 l=1
+// CHECK-NEXT: i=2 j=1 k=2 l=2
+// CHECK-NEXT: i=2 j=2 k=2 l=0
+// CHECK-NEXT: i=2 j=2 k=2 l=1
+// CHECK-NEXT: i=2 j=2 k=2 l=2
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/reverse/foreach.cpp b/openmp/runtime/test/transform/reverse/foreach.cpp
new file mode 100644
index 0000000000000..0784e3c0057c9
--- /dev/null
+++ b/openmp/runtime/test/transform/reverse/foreach.cpp
@@ -0,0 +1,162 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+
+    void print(const char *msg) const { owner->print(msg); }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+#pragma omp reverse
+  for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+    printf("v=%d\n", v);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/reverse/intfor.c b/openmp/runtime/test/transform/reverse/intfor.c
new file mode 100644
index 0000000000000..a526a8d493b3d
--- /dev/null
+++ b/openmp/runtime/test/transform/reverse/intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+  printf("do\n");
+#pragma omp reverse
+  for (int i = 7; i < 19; i += 3)
+    printf("i=%d\n", i);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: i=16
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=7
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/reverse/iterfor.cpp b/openmp/runtime/test/transform/reverse/iterfor.cpp
new file mode 100644
index 0000000000000..ba1086dbd76a5
--- /dev/null
+++ b/openmp/runtime/test/transform/reverse/iterfor.cpp
@@ -0,0 +1,164 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    bool operator!=(const Iterator &that) const {
+      owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+      return this->pos != that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+  Reporter range("range");
+#pragma omp reverse
+  for (auto it = range.begin(); it != range.end(); ++it)
+    printf("v=%d\n", *it);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
diff --git a/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000000000..240ef59bd6b4b
--- /dev/null
+++ b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,285 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool operator==(const Iterator &that) const {
+      owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+      return this->pos == that.pos;
+    }
+
+    Iterator &operator++() {
+      owner->print("iterator prefix ++");
+      pos -= 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      owner->print("iterator postfix ++");
+      auto result = *this;
+      pos -= 1;
+      return result;
+    }
+
+    int operator*() const {
+      int result = 2 - pos;
+      owner->print("iterator deref: %i", result);
+      return result;
+    }
+
+    size_t operator-(const Iterator &that) const {
+      int result = (2 - this->pos) - (2 - that.pos);
+      owner->print("iterator distance: %d", result);
+      return result;
+    }
+
+    Iterator operator+(int steps) const {
+      owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+      return Iterator(owner, pos - steps);
+    }
+
+    void print(const char *msg) const { owner->print(msg); }
+  };
+
+  Iterator begin() const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+  for (int i = 0; i < 3; ++i)
+#pragma omp reverse
+    for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+      for (int k = 0; k < 3; ++k)
+        printf("i=%d j=%d k=%d\n", i, v, k);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 j=2 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 j=2 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 j=2 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 j=1 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 j=1 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 j=1 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 j=0 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 j=0 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 j=0 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 j=2 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 j=2 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 j=2 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 j=1 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 j=1 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 j=1 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 j=0 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 j=0 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 j=0 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 j=2 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 j=2 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 j=2 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 j=1 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 j=1 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 j=1 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 j=0 k=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 j=0 k=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 j=0 k=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp
new file mode 100644
index 0000000000000..ae545b863d86c
--- /dev/null
+++ b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp
@@ -0,0 +1,51 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+  for (int i = 0; i < 3; ++i)
+#pragma omp reverse
+    for (int j = 0; j < 3; ++j)
+      for (int k = 0; k < 3; ++k)
+        printf("i=%d j=%d k=%d\n", i, j, k);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: i=0 j=2 k=0
+// CHECK-NEXT: i=0 j=2 k=1
+// CHECK-NEXT: i=0 j=2 k=2
+// CHECK-NEXT: i=0 j=1 k=0
+// CHECK-NEXT: i=0 j=1 k=1
+// CHECK-NEXT: i=0 j=1 k=2
+// CHECK-NEXT: i=0 j=0 k=0
+// CHECK-NEXT: i=0 j=0 k=1
+// CHECK-NEXT: i=0 j=0 k=2
+// CHECK-NEXT: i=1 j=2 k=0
+// CHECK-NEXT: i=1 j=2 k=1
+// CHECK-NEXT: i=1 j=2 k=2
+// CHECK-NEXT: i=1 j=1 k=0
+// CHECK-NEXT: i=1 j=1 k=1
+// CHECK-NEXT: i=1 j=1 k=2
+// CHECK-NEXT: i=1 j=0 k=0
+// CHECK-NEXT: i=1 j=0 k=1
+// CHECK-NEXT: i=1 j=0 k=2
+// CHECK-NEXT: i=2 j=2 k=0
+// CHECK-NEXT: i=2 j=2 k=1
+// CHECK-NEXT: i=2 j=2 k=2
+// CHECK-NEXT: i=2 j=1 k=0
+// CHECK-NEXT: i=2 j=1 k=1
+// CHECK-NEXT: i=2 j=1 k=2
+// CHECK-NEXT: i=2 j=0 k=0
+// CHECK-NEXT: i=2 j=0 k=1
+// CHECK-NEXT: i=2 j=0 k=2
+// CHECK-NEXT: done
diff --git a/polly/include/polly/ScopBuilder.h b/polly/include/polly/ScopBuilder.h
index 635c23ca7f972..e589a7f0b05d6 100644
--- a/polly/include/polly/ScopBuilder.h
+++ b/polly/include/polly/ScopBuilder.h
@@ -663,19 +663,6 @@ class ScopBuilder final {
   ///         nullptr if it cannot be hoisted at all.
   isl::set getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes);
 
-  /// Collect loads which might form a reduction chain with @p StoreMA.
-  ///
-  /// Check if the stored value for @p StoreMA is a binary operator with one or
-  /// two loads as operands. If the binary operand is commutative & associative,
-  /// used only once (by @p StoreMA) and its load operands are also used only
-  /// once, we have found a possible reduction chain. It starts at an operand
-  /// load and includes the binary operator and @p StoreMA.
-  ///
-  /// Note: We allow only one use to ensure the load and binary operator cannot
-  ///       escape this block or into any other store except @p StoreMA.
-  void collectCandidateReductionLoads(MemoryAccess *StoreMA,
-                                      SmallVectorImpl<MemoryAccess *> &Loads);
-
   /// Build the access relation of all memory accesses of @p Stmt.
   void buildAccessRelations(ScopStmt &Stmt);
 
diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
index 1e0692ff40110..974de817e72db 100644
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -470,6 +470,8 @@ class MemoryAccess final {
     RT_BOR,  ///< Bitwise Or
     RT_BXOR, ///< Bitwise XOr
     RT_BAND, ///< Bitwise And
+
+    RT_BOTTOM, ///< Pseudo type for the data flow analysis
   };
 
   using SubscriptsTy = SmallVector<const SCEV *, 4>;
@@ -1139,6 +1141,7 @@ class ScopStmt final {
   friend class ScopBuilder;
 
 public:
+  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   /// Create the ScopStmt from a BasicBlock.
   ScopStmt(Scop &parent, BasicBlock &bb, StringRef Name, Loop *SurroundingLoop,
            std::vector<Instruction *> Instructions);
@@ -1206,7 +1209,6 @@ class ScopStmt final {
   /// The memory accesses of this statement.
   ///
   /// The only side effects of a statement are its memory accesses.
-  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   MemoryAccessVec MemAccs;
 
   /// Mapping from instructions to (scalar) memory accesses.
diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index d594823410f5f..c05fc1a347c25 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -2481,8 +2481,8 @@ void ScopBuilder::collectSurroundingLoops(ScopStmt &Stmt) {
 }
 
 /// Return the reduction type for a given binary operator.
-static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
-                                                    const Instruction *Load) {
+static MemoryAccess::ReductionType
+getReductionType(const BinaryOperator *BinOp) {
   if (!BinOp)
     return MemoryAccess::RT_NONE;
   switch (BinOp->getOpcode()) {
@@ -2511,6 +2511,17 @@ static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
   }
 }
 
+/// @brief Combine two reduction types
+static MemoryAccess::ReductionType
+combineReductionType(MemoryAccess::ReductionType RT0,
+                     MemoryAccess::ReductionType RT1) {
+  if (RT0 == MemoryAccess::RT_BOTTOM)
+    return RT1;
+  if (RT0 == RT1)
+    return RT1;
+  return MemoryAccess::RT_NONE;
+}
+
 ///  True if @p AllAccs intersects with @p MemAccs execpt @p LoadMA and @p
 ///  StoreMA
 bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA,
@@ -2571,47 +2582,205 @@ bool checkCandidatePairAccesses(MemoryAccess *LoadMA, MemoryAccess *StoreMA,
     AllAccsRel = AllAccsRel.intersect_domain(Domain);
     isl::set AllAccs = AllAccsRel.range();
     Valid = !hasIntersectingAccesses(AllAccs, LoadMA, StoreMA, Domain, MemAccs);
-
     POLLY_DEBUG(dbgs() << " == The accessed memory is " << (Valid ? "not " : "")
                        << "accessed by other instructions!\n");
   }
+
   return Valid;
 }
 
 void ScopBuilder::checkForReductions(ScopStmt &Stmt) {
-  SmallVector<MemoryAccess *, 2> Loads;
-  SmallVector<std::pair<MemoryAccess *, MemoryAccess *>, 4> Candidates;
+  // Perform a data flow analysis on the current scop statement to propagate the
+  // uses of loaded values. Then check and mark the memory accesses which are
+  // part of reduction like chains.
+  // During the data flow analysis we use the State variable to keep track of
+  // the used "load-instructions" for each instruction in the scop statement.
+  // This includes the LLVM-IR of the load and the "number of uses" (or the
+  // number of paths in the operand tree which end in this load).
+  using StatePairTy = std::pair<unsigned, MemoryAccess::ReductionType>;
+  using FlowInSetTy = MapVector<const LoadInst *, StatePairTy>;
+  using StateTy = MapVector<const Instruction *, FlowInSetTy>;
+  StateTy State;
+
+  // Invalid loads are loads which have uses we can't track properly in the
+  // state map. This includes loads which:
+  //   o do not form a reduction when they flow into a memory location:
+  //     (e.g., A[i] = B[i] * 3 and  A[i] = A[i] * A[i] + A[i])
+  //   o are used by a non binary operator or one which is not commutative
+  //     and associative (e.g., A[i] = A[i] % 3)
+  //   o might change the control flow            (e.g., if (A[i]))
+  //   o are used in indirect memory accesses     (e.g., A[B[i]])
+  //   o are used outside the current scop statement
+  SmallPtrSet<const Instruction *, 8> InvalidLoads;
+  SmallVector<BasicBlock *, 8> ScopBlocks;
+  BasicBlock *BB = Stmt.getBasicBlock();
+  if (BB)
+    ScopBlocks.push_back(BB);
+  else
+    for (BasicBlock *Block : Stmt.getRegion()->blocks())
+      ScopBlocks.push_back(Block);
+  // Run the data flow analysis for all values in the scop statement
+  for (BasicBlock *Block : ScopBlocks) {
+    for (Instruction &Inst : *Block) {
+      if ((Stmt.getParent())->getStmtFor(&Inst) != &Stmt)
+        continue;
+      bool UsedOutsideStmt = any_of(Inst.users(), [&Stmt](User *U) {
+        return (Stmt.getParent())->getStmtFor(cast<Instruction>(U)) != &Stmt;
+      });
+      //  Treat loads and stores special
+      if (auto *Load = dyn_cast<LoadInst>(&Inst)) {
+        // Invalidate all loads used which feed into the address of this load.
+        if (auto *Ptr = dyn_cast<Instruction>(Load->getPointerOperand())) {
+          const auto &It = State.find(Ptr);
+          if (It != State.end())
+            for (const auto &FlowInSetElem : It->second)
+              InvalidLoads.insert(FlowInSetElem.first);
+        }
 
-  // First collect candidate load-store reduction chains by iterating over all
-  // stores and collecting possible reduction loads.
-  for (MemoryAccess *StoreMA : Stmt) {
-    if (StoreMA->isRead())
-      continue;
+        // If this load is used outside this stmt, invalidate it.
+        if (UsedOutsideStmt)
+          InvalidLoads.insert(Load);
+
+        // And indicate that this load uses itself once but without specifying
+        // any reduction operator.
+        State[Load].insert(
+            std::make_pair(Load, std::make_pair(1, MemoryAccess::RT_BOTTOM)));
+        continue;
+      }
+
+      if (auto *Store = dyn_cast<StoreInst>(&Inst)) {
+        // Invalidate all loads which feed into the address of this store.
+        if (const Instruction *Ptr =
+                dyn_cast<Instruction>(Store->getPointerOperand())) {
+          const auto &It = State.find(Ptr);
+          if (It != State.end())
+            for (const auto &FlowInSetElem : It->second)
+              InvalidLoads.insert(FlowInSetElem.first);
+        }
+
+        // Propagate the uses of the value operand to the store
+        if (auto *ValueInst = dyn_cast<Instruction>(Store->getValueOperand()))
+          State.insert(std::make_pair(Store, State[ValueInst]));
+        continue;
+      }
+
+      // Non load and store instructions are either binary operators or they
+      // will invalidate all used loads.
+      auto *BinOp = dyn_cast<BinaryOperator>(&Inst);
+      MemoryAccess::ReductionType CurRedType = getReductionType(BinOp);
+      POLLY_DEBUG(dbgs() << "CurInst: " << Inst << " RT: " << CurRedType
+                         << "\n");
+
+      // Iterate over all operands and propagate their input loads to
+      // instruction.
+      FlowInSetTy &InstInFlowSet = State[&Inst];
+      for (Use &Op : Inst.operands()) {
+        auto *OpInst = dyn_cast<Instruction>(Op);
+        if (!OpInst)
+          continue;
+
+        POLLY_DEBUG(dbgs().indent(4) << "Op Inst: " << *OpInst << "\n");
+        const StateTy::iterator &OpInFlowSetIt = State.find(OpInst);
+        if (OpInFlowSetIt == State.end())
+          continue;
+
+        // Iterate over all the input loads of the operand and combine them
+        // with the input loads of current instruction.
+        FlowInSetTy &OpInFlowSet = OpInFlowSetIt->second;
+        for (auto &OpInFlowPair : OpInFlowSet) {
+          unsigned OpFlowIn = OpInFlowPair.second.first;
+          unsigned InstFlowIn = InstInFlowSet[OpInFlowPair.first].first;
+
+          MemoryAccess::ReductionType OpRedType = OpInFlowPair.second.second;
+          MemoryAccess::ReductionType InstRedType =
+              InstInFlowSet[OpInFlowPair.first].second;
+
+          MemoryAccess::ReductionType NewRedType =
+              combineReductionType(OpRedType, CurRedType);
+          if (InstFlowIn)
+            NewRedType = combineReductionType(NewRedType, InstRedType);
+
+          POLLY_DEBUG(dbgs().indent(8) << "OpRedType: " << OpRedType << "\n");
+          POLLY_DEBUG(dbgs().indent(8) << "NewRedType: " << NewRedType << "\n");
+          InstInFlowSet[OpInFlowPair.first] =
+              std::make_pair(OpFlowIn + InstFlowIn, NewRedType);
+        }
+      }
 
-    Loads.clear();
-    collectCandidateReductionLoads(StoreMA, Loads);
-    for (MemoryAccess *LoadMA : Loads)
-      Candidates.push_back(std::make_pair(LoadMA, StoreMA));
+      // If this operation is used outside the stmt, invalidate all the loads
+      // which feed into it.
+      if (UsedOutsideStmt)
+        for (const auto &FlowInSetElem : InstInFlowSet)
+          InvalidLoads.insert(FlowInSetElem.first);
+    }
   }
 
-  // Then check each possible candidate pair.
-  for (const auto &CandidatePair : Candidates) {
-    MemoryAccess *LoadMA = CandidatePair.first;
-    MemoryAccess *StoreMA = CandidatePair.second;
-    bool Valid = checkCandidatePairAccesses(LoadMA, StoreMA, Stmt.getDomain(),
-                                            Stmt.MemAccs);
-    if (!Valid)
+  // All used loads are propagated through the whole basic block; now try to
+  // find valid reduction-like candidate pairs. These load-store pairs fulfill
+  // all reduction like properties with regards to only this load-store chain.
+  // We later have to check if the loaded value was invalidated by an
+  // instruction not in that chain.
+  using MemAccPair = std::pair<MemoryAccess *, MemoryAccess *>;
+  DenseMap<MemAccPair, MemoryAccess::ReductionType> ValidCandidates;
+
+  // Iterate over all write memory accesses and check the loads flowing into
+  // it for reduction candidate pairs.
+  for (MemoryAccess *WriteMA : Stmt.MemAccs) {
+    if (WriteMA->isRead())
+      continue;
+    StoreInst *St = dyn_cast<StoreInst>(WriteMA->getAccessInstruction());
+    if (!St)
       continue;
+    assert(!St->isVolatile());
+
+    FlowInSetTy &MaInFlowSet = State[WriteMA->getAccessInstruction()];
+    for (auto &MaInFlowSetElem : MaInFlowSet) {
+      MemoryAccess *ReadMA = &Stmt.getArrayAccessFor(MaInFlowSetElem.first);
+      assert(ReadMA && "Couldn't find memory access for incoming load!");
 
-    const LoadInst *Load =
-        dyn_cast<const LoadInst>(CandidatePair.first->getAccessInstruction());
-    MemoryAccess::ReductionType RT =
-        getReductionType(dyn_cast<BinaryOperator>(Load->user_back()), Load);
+      POLLY_DEBUG(dbgs() << "'" << *ReadMA->getAccessInstruction()
+                         << "'\n\tflows into\n'"
+                         << *WriteMA->getAccessInstruction() << "'\n\t #"
+                         << MaInFlowSetElem.second.first << " times & RT: "
+                         << MaInFlowSetElem.second.second << "\n");
 
-    // If no overlapping access was found we mark the load and store as
-    // reduction like.
-    LoadMA->markAsReductionLike(RT);
-    StoreMA->markAsReductionLike(RT);
+      MemoryAccess::ReductionType RT = MaInFlowSetElem.second.second;
+      unsigned NumAllowableInFlow = 1;
+
+      // We allow the load to flow in exactly once for binary reductions
+      bool Valid = (MaInFlowSetElem.second.first == NumAllowableInFlow);
+
+      // Check if we saw a valid chain of binary operators.
+      Valid = Valid && RT != MemoryAccess::RT_BOTTOM;
+      Valid = Valid && RT != MemoryAccess::RT_NONE;
+
+      // Then check if the memory accesses allow a reduction.
+      Valid = Valid && checkCandidatePairAccesses(
+                           ReadMA, WriteMA, Stmt.getDomain(), Stmt.MemAccs);
+
+      // Finally, mark the pair as a candidate or the load as a invalid one.
+      if (Valid)
+        ValidCandidates[std::make_pair(ReadMA, WriteMA)] = RT;
+      else
+        InvalidLoads.insert(ReadMA->getAccessInstruction());
+    }
+  }
+
+  // In the last step mark the memory accesses of candidate pairs as reduction
+  // like if the load wasn't marked invalid in the previous step.
+  for (auto &CandidatePair : ValidCandidates) {
+    MemoryAccess *LoadMA = CandidatePair.first.first;
+    if (InvalidLoads.count(LoadMA->getAccessInstruction()))
+      continue;
+    POLLY_DEBUG(
+        dbgs() << " Load :: "
+               << *((CandidatePair.first.first)->getAccessInstruction())
+               << "\n Store :: "
+               << *((CandidatePair.first.second)->getAccessInstruction())
+               << "\n are marked as reduction like\n");
+    MemoryAccess::ReductionType RT = CandidatePair.second;
+    CandidatePair.first.first->markAsReductionLike(RT);
+    CandidatePair.first.second->markAsReductionLike(RT);
   }
 }
 
@@ -2770,7 +2939,7 @@ isl::set ScopBuilder::getNonHoistableCtx(MemoryAccess *Access,
 
   auto &DL = scop->getFunction().getDataLayout();
   if (isSafeToLoadUnconditionally(LI->getPointerOperand(), LI->getType(),
-                                  LI->getAlign(), DL)) {
+                                  LI->getAlign(), DL, nullptr)) {
     SafeToLoad = isl::set::universe(AccessRelation.get_space().range());
   } else if (BB != LI->getParent()) {
     // Skip accesses in non-affine subregions as they might not be executed
@@ -2965,52 +3134,6 @@ void ScopBuilder::addInvariantLoads(ScopStmt &Stmt,
   }
 }
 
-void ScopBuilder::collectCandidateReductionLoads(
-    MemoryAccess *StoreMA, SmallVectorImpl<MemoryAccess *> &Loads) {
-  ScopStmt *Stmt = StoreMA->getStatement();
-
-  auto *Store = dyn_cast<StoreInst>(StoreMA->getAccessInstruction());
-  if (!Store)
-    return;
-
-  // Skip if there is not one binary operator between the load and the store
-  auto *BinOp = dyn_cast<BinaryOperator>(Store->getValueOperand());
-  if (!BinOp)
-    return;
-
-  // Skip if the binary operators has multiple uses
-  if (BinOp->getNumUses() != 1)
-    return;
-
-  // Skip if the opcode of the binary operator is not commutative/associative
-  if (!BinOp->isCommutative() || !BinOp->isAssociative())
-    return;
-
-  // Skip if the binary operator is outside the current SCoP
-  if (BinOp->getParent() != Store->getParent())
-    return;
-
-  // Skip if it is a multiplicative reduction and we disabled them
-  if (DisableMultiplicativeReductions &&
-      (BinOp->getOpcode() == Instruction::Mul ||
-       BinOp->getOpcode() == Instruction::FMul))
-    return;
-
-  // Check the binary operator operands for a candidate load
-  auto *PossibleLoad0 = dyn_cast<LoadInst>(BinOp->getOperand(0));
-  auto *PossibleLoad1 = dyn_cast<LoadInst>(BinOp->getOperand(1));
-  if (!PossibleLoad0 && !PossibleLoad1)
-    return;
-
-  // A load is only a candidate if it cannot escape (thus has only this use)
-  if (PossibleLoad0 && PossibleLoad0->getNumUses() == 1)
-    if (PossibleLoad0->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad0));
-  if (PossibleLoad1 && PossibleLoad1->getNumUses() == 1)
-    if (PossibleLoad1->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad1));
-}
-
 /// Find the canonical scop array info object for a set of invariant load
 /// hoisted loads. The canonical array is the one that corresponds to the
 /// first load in the list of accesses which is used as base pointer of a
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp
index eab7bd83e6a4e..79db3965de023 100644
--- a/polly/lib/Analysis/ScopDetection.cpp
+++ b/polly/lib/Analysis/ScopDetection.cpp
@@ -490,7 +490,8 @@ bool ScopDetection::onlyValidRequiredInvariantLoads(
 
     for (auto NonAffineRegion : Context.NonAffineSubRegionSet) {
       if (isSafeToLoadUnconditionally(Load->getPointerOperand(),
-                                      Load->getType(), Load->getAlign(), DL))
+                                      Load->getType(), Load->getAlign(), DL,
+                                      nullptr))
         continue;
 
       if (NonAffineRegion->contains(Load) &&
diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp
index fa35fae84aceb..044d3573793f0 100644
--- a/polly/lib/Analysis/ScopInfo.cpp
+++ b/polly/lib/Analysis/ScopInfo.cpp
@@ -533,6 +533,9 @@ MemoryAccess::getReductionOperatorStr(MemoryAccess::ReductionType RT) {
   case MemoryAccess::RT_NONE:
     llvm_unreachable("Requested a reduction operator string for a memory "
                      "access which isn't a reduction");
+  case MemoryAccess::RT_BOTTOM:
+    llvm_unreachable("Requested a reduction operator string for a internal "
+                     "reduction type!");
   case MemoryAccess::RT_ADD:
     return "+";
   case MemoryAccess::RT_MUL:
@@ -915,10 +918,15 @@ isl::id MemoryAccess::getId() const { return Id; }
 
 raw_ostream &polly::operator<<(raw_ostream &OS,
                                MemoryAccess::ReductionType RT) {
-  if (RT == MemoryAccess::RT_NONE)
+  switch (RT) {
+  case MemoryAccess::RT_NONE:
+  case MemoryAccess::RT_BOTTOM:
     OS << "NONE";
-  else
+    break;
+  default:
     OS << MemoryAccess::getReductionOperatorStr(RT);
+    break;
+  }
   return OS;
 }
 
diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000000000..3b4bd9ef04b5a
--- /dev/null
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction dependences:
+; CHECK:   [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(ptr noalias %A, ptr noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds ptr, ptr %INDICES, i32 %i.0
+  %tmp = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds ptr, ptr %A, i32 %tmp
+  %tmp1 = load double, ptr %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
new file mode 100644
index 0000000000000..d126d3d833ee1
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify if two independent reductions in same loop is detected
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+;
+; CHECK: Stmt_for_body_b
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+;
+; int red(int *A, int *B, int *sum, int * prod, int n) {
+;   for (int i = 0; i < n; ++i) {
+;     *sum += A[i];
+;     *prod += B[i];
+;   }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local i32 @red(ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef %sum1, ptr nocapture noundef %sum2, i32 noundef %n) local_unnamed_addr #0 {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 undef
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx
+  %1 = load i32, ptr %sum1
+  %add = add nsw i32 %1, %0
+  store i32 %add, ptr %sum1
+  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2
+  %3 = load i32, ptr %sum2
+  %add3 = add nsw i32 %3, %2
+  store i32 %add3, ptr %sum2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
new file mode 100644
index 0000000000000..92a071ea1c372
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; void f(int N, int * restrict sums, int * restrict escape) {
+;   int i, j;
+;   for (i = 0; i < 1024; i++) {
+;     sums[i] += 5;
+;     escape[i] = sums[i];
+;   }
+; }
+;
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: escape
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32 %N, i32* noalias %sums, i32* noalias %escape) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc ]
+  %exitcond1 = icmp ne i32 %i.0, 1024
+  br i1 %exitcond1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32, i32* %sums, i32 0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 5
+  store i32 %add, i32* %arrayidx, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %escape, i32 %i.0
+  store i32 %add, i32* %arrayidx6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc8 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
new file mode 100644
index 0000000000000..4f7d3681e0a0b
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -0,0 +1,52 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify if reduction spread across multiple blocks in a single scop statement are detected
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+;
+; void f(int*__restrict A, int*__restrict B, int *sum) {
+;   for (int i = 0; i < 4444; ++i) {
+;     if (B[i])
+;       *sum += A[i];
+;   }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @f(ptr noalias nocapture noundef readonly %A, ptr noalias nocapture noundef readonly %B, ptr nocapture noundef %sum) local_unnamed_addr #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc
+  ret void
+
+for.body:                                         ; preds = %entry.split, %for.inc
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2
+  %2 = load i32, ptr %sum
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %sum
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4444
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000000000..7acac4b150f40
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
new file mode 100644
index 0000000000000..331953991d86c
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Validate that the accesses to INDICES[i] is not part of a reduction.
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++) {
+;        A[INDICES[i]] += N;
+;        INDICES[i] += N;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  %add3 = add nsw i32 %tmp, %N
+  store i32 %add3, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
new file mode 100644
index 0000000000000..62ae1fef187b6
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_A
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK-NOT: MemRef_A
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]);
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  store i32 %add13, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
new file mode 100644
index 0000000000000..7ca46fa9535ac
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -0,0 +1,58 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Sum is added twice in the statement. Hence no reduction.
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]) + *sum;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  %tmp7 = load i32, i32* %sum, align 4
+  %add14 = add nsw i32 %add13, %tmp7
+  store i32 %add14, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
new file mode 100644
index 0000000000000..b77c72a291744
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -0,0 +1,37 @@
+; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Should not be identified as reduction as there are different operations
+; involved on sum (multiplication followed by addition)
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum) {
+;      for (int i = 0; i < 1024; i++) {
+;        *sum = (*sum * 5) + 25;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp = load i32, i32* %sum, align 4
+  %tmp1 = mul i32 %tmp, 5
+  %mul = add i32 %tmp1, 25
+  store i32 %mul, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/pstl/docs/ReleaseNotes.rst b/pstl/docs/ReleaseNotes.rst
index 342b2eafc2f51..941b029ecd674 100644
--- a/pstl/docs/ReleaseNotes.rst
+++ b/pstl/docs/ReleaseNotes.rst
@@ -1,5 +1,5 @@
 =======================================
-PSTL 19.0.0 (In-Progress) Release Notes
+PSTL 20.0.0 (In-Progress) Release Notes
 =======================================
 
 .. contents::
diff --git a/sycl-jit/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-jit/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 3637930d72f8f..3cdee88821a94 100644
--- a/sycl-jit/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-jit/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -10,7 +10,7 @@
 
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/SYCLLowerIR/TargetHelpers.h"
+#include "llvm/TargetParser/Triple.h"
 #include <llvm/ADT/StringRef.h>
 #include <llvm/Support/CommandLine.h>
 #include <llvm/Support/Debug.h>
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index ee355ade71876..4655a8756e0b9 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -57,8 +57,8 @@ if(MSVC)
 
   # Add PDB debug information
   list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
-  include(LLVMCheckLinkerFlag)
-  llvm_check_linker_flag(CXX "/DEBUG" LINKER_SUPPORTS_DEBUG)
+  include(CheckLinkerFlag)
+  check_linker_flag(CXX "/DEBUG" LINKER_SUPPORTS_DEBUG)
   if(LINKER_SUPPORTS_DEBUG)
     # sccache is not compatible with /Zi flag
     if (CMAKE_CXX_COMPILER_LAUNCHER STREQUAL "sccache")
@@ -80,7 +80,7 @@ if(MSVC)
     add_link_options("/DEBUG")
 
     # Enable unreferenced removal and ICF in Release mode.
-    llvm_check_linker_flag(CXX "/OPT:REF /OPT:ICF" LINKER_SUPPORTS_OPTS)
+    check_linker_flag(CXX "/OPT:REF /OPT:ICF" LINKER_SUPPORTS_OPTS)
     if (LINKER_SUPPORTS_OPTS AND uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE")
       add_link_options("/OPT:REF" "/OPT:ICF")
     endif()
diff --git a/sycl/cmake/modules/SYCLUtils.cmake b/sycl/cmake/modules/SYCLUtils.cmake
index 4f5265fb68790..f44b7c9d5ba7e 100644
--- a/sycl/cmake/modules/SYCLUtils.cmake
+++ b/sycl/cmake/modules/SYCLUtils.cmake
@@ -1,5 +1,5 @@
 list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
-include(LLVMCheckLinkerFlag)
+include(CheckLinkerFlag)
 
 # add_stripped_pdb(TARGET_NAME)
 #
@@ -7,7 +7,7 @@ include(LLVMCheckLinkerFlag)
 # file as ${ARG_TARGET_NAME}.pdb in bin folder.
 # NOTE: LLD does not currently support /PDBSTRIPPED so the PDB file is optional.
 macro(add_stripped_pdb ARG_TARGET_NAME)
-  llvm_check_linker_flag(CXX "/PDBSTRIPPED:${ARG_TARGET_NAME}.stripped.pdb"
+  check_linker_flag(CXX "/PDBSTRIPPED:${ARG_TARGET_NAME}.stripped.pdb"
                          LINKER_SUPPORTS_PDBSTRIPPED)
   if(LINKER_SUPPORTS_PDBSTRIPPED)
     target_link_options(${ARG_TARGET_NAME}
diff --git a/sycl/test-e2e/Config/kernel_from_file.cpp b/sycl/test-e2e/Config/kernel_from_file.cpp
index defbbac5601c8..8450d6eae2573 100644
--- a/sycl/test-e2e/Config/kernel_from_file.cpp
+++ b/sycl/test-e2e/Config/kernel_from_file.cpp
@@ -6,7 +6,7 @@
 // FIXME separate compilation requires -fno-sycl-dead-args-optimization
 // As we are doing a separate device compilation here, we need to explicitly
 // add the device lib instrumentation (itt_compiler_wrapper)
-// RUN: %clangxx -Wno-error=ignored-attributes -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 -fsycl-device-only -fno-sycl-dead-args-optimization -Xclang -fsycl-int-header=%t.h -c %s -o %t.bc -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
+// RUN: %clangxx -Wno-error=ignored-attributes -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 -fsycl-device-only -fno-sycl-dead-args-optimization -Xclang -fsycl-int-header=%t.h %s -o %t.bc -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
 // >> ---- unbundle compiler wrapper and sanitizer device objects
 // RUN: clang-offload-bundler -type=o -targets=sycl-spir64-unknown-unknown -input=%sycl_static_libs_dir/libsycl-itt-compiler-wrappers%obj_ext -output=%t_compiler_wrappers.bc -unbundle
 // RUN: %if linux %{ clang-offload-bundler -type=o -targets=sycl-spir64-unknown-unknown -input=%sycl_static_libs_dir/libsycl-sanitizer%obj_ext -output=%t_sanitizer.bc -unbundle %}
diff --git a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
index f7c9aad155b43..c085d744cb075 100644
--- a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
+++ b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} %debug_option -Ofast -o %t.out
+// RUN: %{build} %debug_option -ffp-model=fast -o %t.out
 // RUN: %{build} %debug_option -Os -o %t.out
 // RUN: %{build} %debug_option -Oz -o %t.out
 // RUN: %{build} %debug_option -Og -o %t.out
diff --git a/sycl/test-e2e/SeparateCompile/test.cpp b/sycl/test-e2e/SeparateCompile/test.cpp
index 27e165865fd37..5ee8774293895 100644
--- a/sycl/test-e2e/SeparateCompile/test.cpp
+++ b/sycl/test-e2e/SeparateCompile/test.cpp
@@ -6,7 +6,7 @@
 // FIXME separate compilation requires -fno-sycl-dead-args-optimization
 // >> ---- compile src1
 // >> device compilation...
-// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT -fno-sycl-dead-args-optimization -fsycl-device-only -Xclang -fsycl-int-header=sycl_ihdr_a.h %s -c -o a_kernel.bc -Wno-sycl-strict
+// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT -fno-sycl-dead-args-optimization -fsycl-device-only -Xclang -fsycl-int-header=sycl_ihdr_a.h %s -o a_kernel.bc -Wno-sycl-strict
 // >> host compilation...
 // RUN: %clangxx -Wno-error=ignored-attributes -Wno-error=unused-command-line-argument -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 %include_option sycl_ihdr_a.h %debug_option -c %s -o a.o %sycl_options -fno-sycl-dead-args-optimization -Wno-sycl-strict
 //
diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE
index 298b64fd56291..7baca11eed3d3 100644
--- a/utils/bazel/WORKSPACE
+++ b/utils/bazel/WORKSPACE
@@ -2,7 +2,6 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
 
@@ -27,6 +26,14 @@ load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure")
 
 llvm_configure(name = "llvm-project")
 
+maybe(
+    http_archive,
+    name = "rules_python",
+    sha256 = "778aaeab3e6cfd56d681c89f5c10d7ad6bf8d2f1a72de9de55b23081b2d31618",
+    strip_prefix = "rules_python-0.34.0",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.34.0/rules_python-0.34.0.tar.gz",
+)
+
 maybe(
     http_archive,
     name = "llvm_zlib",
@@ -76,10 +83,11 @@ apple_support_dependencies()
 # Note: that building from source requires `m4` to be installed on the host machine.
 # This is a known issue: https://github.com/bazelbuild/rules_foreign_cc/issues/755.
 
-git_repository(
+http_archive(
     name = "rules_foreign_cc",
-    remote = "https://github.com/bazelbuild/rules_foreign_cc.git",
-    tag = "0.9.0",
+    sha256 = "4b33d62cf109bcccf286b30ed7121129cc34cf4f4ed9d8a11f38d9108f40ba74",
+    strip_prefix = "rules_foreign_cc-0.11.1",
+    url = "https://github.com/bazelbuild/rules_foreign_cc/releases/download/0.11.1/rules_foreign_cc-0.11.1.tar.gz",
 )
 
 load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
@@ -112,11 +120,12 @@ maybe(
 )
 
 maybe(
-    new_git_repository,
+    http_archive,
     name = "pfm",
     build_file = "@llvm-raw//utils/bazel/third_party_build:pfm.BUILD",
-    remote = "https://git.code.sf.net/p/perfmon2/libpfm4",
-    tag = "v4.12.1",
+    sha256 = "d18b97764c755528c1051d376e33545d0eb60c6ebf85680436813fa5b04cc3d1",
+    strip_prefix = "libpfm-4.13.0",
+    urls = ["https://versaweb.dl.sourceforge.net/project/perfmon2/libpfm4/libpfm-4.13.0.tar.gz"],
 )
 
 maybe(
@@ -129,3 +138,21 @@ maybe(
         "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
     ],
 )
+
+maybe(
+    http_archive,
+    name = "pybind11",
+    build_file = "@llvm-raw//utils/bazel/third_party_build:pybind.BUILD",
+    sha256 = "201966a61dc826f1b1879a24a3317a1ec9214a918c8eb035be2f30c3e9cfbdcb",
+    strip_prefix = "pybind11-2.10.3",
+    url = "https://github.com/pybind/pybind11/archive/v2.10.3.zip",
+)
+
+load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains")
+
+py_repositories()
+
+python_register_toolchains(
+    name = "python_3_12",
+    python_version = "3.12",
+)
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index a1c089a89db87..907fe106166b3 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -10,6 +10,12 @@ package(
 
 licenses(["notice"])
 
+genrule(
+    name = "generate_vcs_revision",
+    outs = ["include/VCSVersion.inc"],
+    cmd = "echo '#undef BOLT_REVISION' >> $@\n",
+)
+
 cc_binary(
     name = "llvm-bolt-heatmap",
     srcs = glob([
@@ -114,6 +120,7 @@ cc_library(
     textual_hdrs = glob([
         "include/bolt/RuntimeLibs/*.h",
     ]) + ["include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc"],
+    defines=["CMAKE_INSTALL_FULL_LIBDIR=\\\"\\\""],
     deps = [
         ":Core",
         ":Passes",
@@ -289,7 +296,9 @@ cc_library(
     srcs = glob([
         "lib/Utils/*.cpp",
     ]),
-    hdrs = glob([
+    hdrs = [
+        "include/VCSVersion.inc",
+    ] + glob([
         "include/bolt/Utils/*.h",
     ]),
     includes = ["include"],
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 2d7ce8702a5d9..5aa6b0142b44e 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -4,10 +4,10 @@
 
 load(
     "//:vars.bzl",
-    "LLVM_VERSION",
     "LLVM_VERSION_MAJOR",
     "LLVM_VERSION_MINOR",
     "LLVM_VERSION_PATCH",
+    "PACKAGE_VERSION",
 )
 load("//:workspace_root.bzl", "workspace_root")
 load("//llvm:binary_alias.bzl", "binary_alias")
@@ -553,12 +553,12 @@ genrule(
         "echo '#define CLANG_VERSION_MAJOR_STRING \"{major}\"' >> $@\n" +
         "echo '#define CLANG_VERSION_MINOR {minor}' >> $@\n" +
         "echo '#define CLANG_VERSION_PATCHLEVEL {patch}' >> $@\n" +
-        "echo '#define CLANG_VERSION_STRING \"{vers}git\"' >> $@\n"
+        "echo '#define CLANG_VERSION_STRING \"{vers}\"' >> $@\n"
     ).format(
         major = LLVM_VERSION_MAJOR,
         minor = LLVM_VERSION_MINOR,
         patch = LLVM_VERSION_PATCH,
-        vers = LLVM_VERSION,
+        vers = PACKAGE_VERSION,
     ),
 )
 
@@ -2095,6 +2095,7 @@ cc_library(
         "//llvm:BitstreamReader",
         "//llvm:BitstreamWriter",
         "//llvm:FrontendOpenMP",
+        "//llvm:Object",
         "//llvm:Support",
         "//llvm:TargetParser",
     ],
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f0c25e658fd18..253b89216a88f 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -675,6 +675,7 @@ libc_support_library(
     deps = [
         ":__support_common",
         ":__support_cpp_type_traits",
+        ":__support_fputil_dyadic_float",
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_macros_optimization",
@@ -2020,6 +2021,7 @@ libc_math_function(name = "hypot")
 libc_math_function(
     name = "hypotf",
     additional_deps = [
+        ":__support_fputil_double_double",
         ":__support_fputil_sqrt",
     ],
 )
@@ -3025,17 +3027,6 @@ libc_function(
     ],
 )
 
-libc_function(
-    name = "getpid",
-    srcs = ["src/unistd/linux/getpid.cpp"],
-    hdrs = ["src/unistd/getpid.h"],
-    deps = [
-        ":__support_common",
-        ":__support_osutil_syscall",
-        ":errno",
-    ],
-)
-
 libc_function(
     name = "getppid",
     srcs = ["src/unistd/linux/getppid.cpp"],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index a6f9d4f2fdac2..4575c40bc76c5 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -104,16 +104,19 @@ libc_support_library(
     name = "qsort_test_helper",
     hdrs = ["SortingTest.h"],
     deps = [
+        "//libc:__support_macros_config",
         "//libc:qsort_util",
         "//libc/test/UnitTest:LibcUnitTest",
     ],
 )
+
 libc_test(
     name = "qsort_test",
     srcs = ["qsort_test.cpp"],
     libc_function_deps = ["//libc:qsort"],
     deps = [":qsort_test_helper"],
 )
+
 libc_test(
     name = "quick_sort_test",
     srcs = ["quick_sort_test.cpp"],
@@ -122,6 +125,7 @@ libc_test(
         "//libc:qsort_util",
     ],
 )
+
 libc_test(
     name = "heap_sort_test",
     srcs = ["heap_sort_test.cpp"],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
index 4549fa2a193fc..66d8ddb0a67eb 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
@@ -261,13 +261,6 @@ libc_test(
 #     ],
 # )
 
-libc_test(
-    name = "getpid_test",
-    srcs = ["getpid_test.cpp"],
-    libc_function_deps = [
-        "//libc:getpid",
-    ],
-)
 
 libc_test(
     name = "getppid_test",
diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 2ccd6fcae635d..db580495dfd91 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -235,6 +235,7 @@ cc_library(
         "//llvm:ObjCARC",
         "//llvm:Object",
         "//llvm:Option",
+        "//llvm:ProfileData",
         "//llvm:Support",
         "//llvm:TargetParser",
         "//llvm:TextAPI",
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index cc573864e29b1..b4889bbd46add 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -432,6 +432,7 @@ cc_library(
         ":Version",
         "//clang:codegen",
         "//clang:frontend",
+        "//clang:serialization",
         "//llvm:Support",
     ],
 )
@@ -707,7 +708,9 @@ cc_library(
 
 cc_library(
     name = "Headers",
-    hdrs = glob(["include/lldb/lldb-*.h"]),
+    hdrs = glob(["include/lldb/lldb-*.h"]) + [
+        "include/lldb/Symbol/SaveCoreOptions.h",
+    ],
     strip_include_prefix = "include",
 )
 
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index ae17746c72882..c58236a4a3ed2 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -944,17 +944,7 @@ cc_library(
     srcs = glob([
         "lib/IR/*.cpp",
         "lib/IR/*.h",
-    ]) + [
-        # To avoid a dependency cycle.
-        "include/llvm/Analysis/IVDescriptors.h",
-        "include/llvm/CodeGen/GenVT.inc",
-    ] + glob(
-        # To avoid a dependency cycle.
-        [
-            "include/llvm/CodeGen/**/*.h",
-            "include/llvm/CodeGenTypes/**/*.h",
-        ],
-    ),
+    ]),
     hdrs = glob(
         [
             "include/llvm/*.h",
@@ -1609,6 +1599,7 @@ cc_library(
         ":Analysis",
         ":BitReader",
         ":Core",
+        ":Demangle",
         ":FrontendOffloading",
         ":MC",
         ":Scalar",
diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl
index 2e3bff53ead9d..9de966688eda5 100644
--- a/utils/bazel/llvm-project-overlay/llvm/config.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl
@@ -6,10 +6,10 @@
 
 load(
     "//:vars.bzl",
-    "LLVM_VERSION",
     "LLVM_VERSION_MAJOR",
     "LLVM_VERSION_MINOR",
     "LLVM_VERSION_PATCH",
+    "PACKAGE_VERSION",
 )
 
 def native_arch_defines(arch, triple):
@@ -108,7 +108,7 @@ llvm_config_defines = os_defines + builtin_thread_pointer + select({
     "LLVM_VERSION_MAJOR={}".format(LLVM_VERSION_MAJOR),
     "LLVM_VERSION_MINOR={}".format(LLVM_VERSION_MINOR),
     "LLVM_VERSION_PATCH={}".format(LLVM_VERSION_PATCH),
-    r'LLVM_VERSION_STRING=\"{}git\"'.format(LLVM_VERSION),
+    r'LLVM_VERSION_STRING=\"{}\"'.format(PACKAGE_VERSION),
     # These shouldn't be needed by the C++11 standard, but are for some
     # platforms (e.g. glibc < 2.18. See
     # https://sourceware.org/bugzilla/show_bug.cgi?id=15366). These are also
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index dfc0f870177bb..8c53f7c816ebe 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -620,7 +620,7 @@ cc_test(
         allow_empty = False,
     ),
     deps = [
-        "//llvm:BitstreamReader",
+        "//llvm:BitReader",
         "//llvm:Core",
         "//llvm:Coverage",
         "//llvm:DebugInfo",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index fe67286422f15..8493823114012 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -918,24 +918,14 @@ exports_files(
     glob(["lib/Bindings/Python/**/*.cpp"]),
 )
 
-# In the targets related to Python bindings, the projects @pybind11 and
-# @local_config_python are defined by @pybind11_bazel. The latter contains
-# python headers, and can be configured in an out-of-tree bazel project via
-#
-#   load("@pybind11_bazel//:python_configure.bzl", "python_configure")
-#   python_configure(name = "local_config_python")
-#
-# For more up-to-date instructions, see
-# https://github.com/pybind/pybind11_bazel
-#
-# Some out-of-tree projects alias @python_runtime//:headers to
-# @local_config_python//:python_headers.
-
-MLIR_BINDINGS_PYTHON_HEADERS = [
-    "lib/Bindings/Python/*.h",
-    "include/mlir-c/Bindings/Python/*.h",
-    "include/mlir/Bindings/Python/*.h",
-]
+filegroup(
+    name = "MLIRBindingsPythonHeaderFiles",
+    srcs = glob([
+        "lib/Bindings/Python/*.h",
+        "include/mlir-c/Bindings/Python/*.h",
+        "include/mlir/Bindings/Python/*.h",
+    ]),
+)
 
 cc_library(
     name = "MLIRBindingsPythonHeaders",
@@ -943,16 +933,12 @@ cc_library(
         "include",
         "lib/Bindings/Python",
     ],
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
-    textual_hdrs = glob(MLIR_BINDINGS_PYTHON_HEADERS),
+    textual_hdrs = [":MLIRBindingsPythonHeaderFiles"],
     deps = [
         ":CAPIIRHeaders",
         ":CAPITransformsHeaders",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
@@ -962,16 +948,12 @@ cc_library(
         "include",
         "lib/Bindings/Python",
     ],
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
-    textual_hdrs = glob(MLIR_BINDINGS_PYTHON_HEADERS),
+    textual_hdrs = [":MLIRBindingsPythonHeaderFiles"],
     deps = [
         ":CAPIIR",
         ":CAPITransforms",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
@@ -986,26 +968,25 @@ PYBIND11_FEATURES = [
     "-use_header_modules",
 ]
 
-MLIR_PYTHON_BINDINGS_SOURCES = [
-    "lib/Bindings/Python/IRAffine.cpp",
-    "lib/Bindings/Python/IRAttributes.cpp",
-    "lib/Bindings/Python/IRCore.cpp",
-    "lib/Bindings/Python/IRInterfaces.cpp",
-    "lib/Bindings/Python/IRModule.cpp",
-    "lib/Bindings/Python/IRTypes.cpp",
-    "lib/Bindings/Python/Pass.cpp",
-    "lib/Bindings/Python/Rewrite.cpp",
-]
+filegroup(
+    name = "MLIRBindingsPythonSourceFiles",
+    srcs = [
+        "lib/Bindings/Python/IRAffine.cpp",
+        "lib/Bindings/Python/IRAttributes.cpp",
+        "lib/Bindings/Python/IRCore.cpp",
+        "lib/Bindings/Python/IRInterfaces.cpp",
+        "lib/Bindings/Python/IRModule.cpp",
+        "lib/Bindings/Python/IRTypes.cpp",
+        "lib/Bindings/Python/Pass.cpp",
+        "lib/Bindings/Python/Rewrite.cpp",
+    ],
+)
 
 cc_library(
     name = "MLIRBindingsPythonCore",
-    srcs = MLIR_PYTHON_BINDINGS_SOURCES,
+    srcs = [":MLIRBindingsPythonSourceFiles"],
     copts = PYBIND11_COPTS,
     features = PYBIND11_FEATURES,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIAsync",
         ":CAPIDebug",
@@ -1015,20 +996,16 @@ cc_library(
         ":Support",
         ":config",
         "//llvm:Support",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
 cc_library(
     name = "MLIRBindingsPythonCoreNoCAPI",
-    srcs = MLIR_PYTHON_BINDINGS_SOURCES,
+    srcs = [":MLIRBindingsPythonSourceFiles"],
     copts = PYBIND11_COPTS,
     features = PYBIND11_FEATURES,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIAsyncHeaders",
         ":CAPIDebugHeaders",
@@ -1037,8 +1014,8 @@ cc_library(
         ":Support",
         ":config",
         "//llvm:Support",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
@@ -1046,10 +1023,6 @@ cc_library(
 # MLIRBindingsPythonCoreNoCAPI.
 cc_library(
     name = "MLIRBindingsPythonCAPIObjects",
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIAsyncObjects",
         ":CAPIDebugObjects",
@@ -1068,10 +1041,6 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":MLIRBindingsPythonCore",
         ":MLIRBindingsPythonHeadersAndDeps",
@@ -1085,10 +1054,6 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIIR",
         ":CAPILinalg",
@@ -1103,10 +1068,6 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",
-    ],
     deps = [
         ":CAPIIR",
         ":CAPILLVM",
@@ -1122,10 +1083,6 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIIR",
         ":CAPIQuant",
@@ -1141,10 +1098,6 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIIR",
         ":CAPISparseTensor",
@@ -1161,15 +1114,11 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPIExecutionEngine",
         ":MLIRBindingsPythonHeadersAndDeps",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
@@ -1181,15 +1130,11 @@ cc_binary(
     features = PYBIND11_FEATURES,
     linkshared = 1,
     linkstatic = 0,
-    tags = [
-        "manual",  # External dependency
-        "nobuildkite",  # TODO(gcmn): Add support for this target
-    ],
     deps = [
         ":CAPILinalg",
         ":MLIRBindingsPythonHeadersAndDeps",
-        "@local_config_python//:python_headers",
         "@pybind11",
+        "@rules_python//python/cc:current_py_cc_headers",
     ],
 )
 
@@ -2197,6 +2142,7 @@ cc_library(
         ":FuncTransforms",
         ":FunctionInterfaces",
         ":IR",
+        ":IndexDialect",
         ":LLVMCommonConversion",
         ":LLVMDialect",
         ":MemRefDialect",
@@ -2204,6 +2150,7 @@ cc_library(
         ":SCFDialect",
         ":SCFTransforms",
         ":TransformUtils",
+        ":VectorUtils",
         "//llvm:Support",
     ],
 )
@@ -3442,6 +3389,7 @@ td_library(
     srcs = ["include/mlir/Dialect/NVGPU/IR/NVGPU.td"],
     includes = ["include"],
     deps = [
+        ":InferTypeOpInterfaceTdFiles",
         ":SideEffectInterfacesTdFiles",
     ],
 )
@@ -3530,6 +3478,7 @@ cc_library(
         ":BytecodeOpInterface",
         ":GPUDialect",
         ":IR",
+        ":InferTypeOpInterface",
         ":LLVMDialect",
         ":NVGPUIncGen",
         ":SideEffectInterfaces",
@@ -5799,6 +5748,7 @@ cc_library(
         "lib/Conversion/GPUCommon/OpToFuncCallLowering.h",
     ],
     deps = [
+        ":ArithDialect",
         ":GPUDialect",
         ":IR",
         ":LLVMCommonConversion",
@@ -7213,6 +7163,7 @@ cc_library(
         ":DialectUtils",
         ":FuncDialect",
         ":IR",
+        ":Pass",
         ":SPIRVDialect",
         ":Support",
         ":TransformUtils",
@@ -8374,6 +8325,7 @@ cc_library(
         ":TransformUtils",
         ":Transforms",
         ":UBToSPIRV",
+        ":VectorDialect",
         ":VectorToSPIRV",
         ":VectorTransforms",
     ],
@@ -10129,6 +10081,7 @@ td_library(
         "include/mlir/Dialect/OpenACC/AccCommon.td",
         "include/mlir/Dialect/OpenACC/OpenACCBase.td",
         "include/mlir/Dialect/OpenACC/OpenACCOps.td",
+        "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td",
         "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td",
         "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td",
         "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td",
@@ -10142,6 +10095,23 @@ td_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "OpenACCOpsInterfacesIncGen",
+    tbl_outs = [
+        (
+            ["-gen-op-interface-decls"],
+            "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc",
+        ),
+        (
+            ["-gen-op-interface-defs"],
+            "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td",
+    deps = [":OpenAccOpsTdFiles"],
+)
+
 gentbl_cc_library(
     name = "OpenACCMPOpsInterfacesIncGen",
     tbl_outs = [
@@ -10285,6 +10255,7 @@ cc_library(
         ":MemRefDialect",
         ":OpenACCMPOpsInterfacesIncGen",
         ":OpenACCOpsIncGen",
+        ":OpenACCOpsInterfacesIncGen",
         ":OpenACCTypeInterfacesIncGen",
         ":OpenACCTypesIncGen",
         ":SideEffectInterfaces",
@@ -11242,6 +11213,7 @@ cc_library(
         ":AffineDialect",
         ":Analysis",
         ":ArithDialect",
+	":ArithUtils",
         ":AsmParser",
         ":BufferizationDialect",
         ":BufferizationTransforms",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index a1d2b20a106e6..34beb758a12dd 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -668,6 +668,7 @@ cc_library(
         "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
+        "//mlir:VectorTransforms",
     ],
 )
 
diff --git a/utils/bazel/third_party_build/pybind.BUILD b/utils/bazel/third_party_build/pybind.BUILD
new file mode 100644
index 0000000000000..d9fb431dbabb5
--- /dev/null
+++ b/utils/bazel/third_party_build/pybind.BUILD
@@ -0,0 +1,15 @@
+cc_library(
+    name = "pybind11",
+    hdrs = glob(
+        include = ["include/pybind11/**/*.h"],
+        exclude = [
+            # Deprecated file that just emits a warning
+            "include/pybind11/common.h",
+        ],
+    ),
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@rules_python//python/cc:current_py_cc_headers",
+    ],
+)